Untitled
unknown
plain_text
a month ago
826 B
4
Indexable
import pandas as pd
def clean_reviews(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the e-commerce review table.

    The input frame is not modified. Steps, in order:

    1. Standardize ``Kota`` (trim surrounding whitespace, title-case).
    2. Parse ``Tanggal`` into datetimes, reading ambiguous dates day-first.
    3. Drop fully duplicated rows.
    4. Fill missing ``Rating`` values with the column median.
    5. Clip ``Rating`` into the valid range 1..5.

    Args:
        df: Raw review data with at least the columns ``Kota``, ``Rating``
            and ``Tanggal``.

    Returns:
        A new DataFrame with the cleaning steps applied.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    cleaned = df.copy()

    # Standardize text and dates BEFORE de-duplicating: rows that differ only
    # by casing/whitespace in Kota (e.g. 'jakarta ' vs 'Jakarta') are the same
    # review and must collapse into one.
    cleaned['Kota'] = cleaned['Kota'].str.strip().str.title()
    cleaned['Tanggal'] = pd.to_datetime(cleaned['Tanggal'], dayfirst=True)

    # Remove duplicates now that equivalent rows compare equal.
    cleaned = cleaned.drop_duplicates()

    # Impute missing ratings with the median of the de-duplicated data, so
    # repeated rows do not weigh on the statistic.
    cleaned['Rating'] = cleaned['Rating'].fillna(cleaned['Rating'].median())

    # Handle Rating outliers (valid values: 1 through 5).
    cleaned['Rating'] = cleaned['Rating'].clip(1, 5)

    return cleaned


def main(
    input_path: str = 'ulasan_ecommerce.csv',
    output_path: str = 'ulasan_clean.csv',
) -> None:
    """Load the raw reviews, print a quick audit, clean them and save the result.

    Args:
        input_path: CSV file containing the raw reviews.
        output_path: Destination CSV for the cleaned data (written without
            the index column).
    """
    # 1. Load the dataset
    df = pd.read_csv(input_path)

    # 2. Initial inspection of the raw data
    print(df.shape)  # Dataset dimensions
    print(df.isnull().sum())  # Missing values per column
    print(df.duplicated().sum())  # Number of exact duplicate rows

    # 3. Clean (standardize, de-duplicate, impute, clip)
    df = clean_reviews(df)

    # 4. Save the cleaned data
    df.to_csv(output_path, index=False)
    print('Data cleaning selesai!')


if __name__ == '__main__':
    main()
Leave a Comment