Untitled

 avatar
unknown
plain_text
a month ago
826 B
3
Indexable
import pandas as pd

def clean_reviews(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the e-commerce review table.

    The input frame is not modified. The steps are:

    1. Standardise ``Kota`` (title case, surrounding whitespace removed).
    2. Parse ``Tanggal`` into datetimes, reading ambiguous dates day-first.
    3. Drop fully duplicated rows.
    4. Fill missing ``Rating`` values with the column median.
    5. Clip ``Rating`` into the valid range 1..5.

    Args:
        df: Frame with at least the columns ``Kota``, ``Rating`` and
            ``Tanggal``.

    Returns:
        A new DataFrame with the cleaning steps applied.

    Raises:
        KeyError: If one of the required columns is missing.
        ValueError: If a ``Tanggal`` value cannot be parsed by
            ``pd.to_datetime``.
    """
    cleaned = df.copy()

    # Normalise the text and date columns BEFORE de-duplicating: otherwise
    # rows that differ only in casing, stray whitespace or date spelling
    # (e.g. "jakarta" vs " Jakarta ") are not recognised as duplicates.
    cleaned['Kota'] = cleaned['Kota'].str.title().str.strip()
    cleaned['Tanggal'] = pd.to_datetime(cleaned['Tanggal'], dayfirst=True)

    cleaned = cleaned.drop_duplicates()

    # Impute on the de-duplicated data so repeated rows do not skew the
    # median, then force every rating into the valid 1..5 range.
    rating_median = cleaned['Rating'].median()
    cleaned['Rating'] = cleaned['Rating'].fillna(rating_median).clip(1, 5)

    return cleaned


def main() -> None:
    """Load the raw reviews, print a quick health report, clean and save."""
    # 1. Load the dataset
    df = pd.read_csv('ulasan_ecommerce.csv')

    # 2. Initial inspection of the raw data
    print(df.shape)               # Dataset dimensions
    print(df.isnull().sum())      # Missing values per column
    print(df.duplicated().sum())  # Number of exact duplicate rows

    # 3. Clean: standardise, de-duplicate, impute and clip
    df = clean_reviews(df)

    # 4. Save the cleaned data
    df.to_csv('ulasan_clean.csv', index=False)
    print('Data cleaning selesai!')


if __name__ == '__main__':
    main()
Editor is loading...
Leave a Comment