# datavalid
# unknown
# python
# 2 years ago
# 1.5 kB
# 9
# Indexable
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
class CSVPreprocessor:
    """Load a CSV file and run a validate / impute / encode / scale pipeline.

    Attributes:
        df: The data read by ``pd.read_csv``. ``impute_missing`` replaces it
            with the value returned by ``SimpleImputer.fit_transform``.
        cat_transformed: Result of ``OneHotEncoder.fit_transform``; set by
            ``encode_categorical``.
        normalized: Result of ``StandardScaler.fit_transform``; set by
            ``normalize``.
    """

    def __init__(self, csv_file):
        """Read ``csv_file`` into ``self.df`` with ``pd.read_csv``."""
        self.df = pd.read_csv(csv_file)

    def validate_data(self):
        """Check that the data is complete, numeric and cleanly named.

        The checks run in the order listed and the first failure is raised.
        Prints a confirmation message when every check passes.

        Raises:
            ValueError: If any cell is null, if any column has a non-numeric
                dtype, or if any column name contains a space.
        """
        if self.df.isnull().values.any():
            raise ValueError("Null values found")

        # Ask pandas whether each dtype is numeric instead of comparing it
        # with the builtin int/float: that comparison only matches the
        # platform-default widths and rejects int32, float32, unsigned and
        # nullable-integer columns. Booleans stay rejected, as before.
        is_bool = pd.api.types.is_bool_dtype
        is_numeric = pd.api.types.is_numeric_dtype
        non_numeric = [
            str(col)
            for col, dtype in self.df.dtypes.items()
            if is_bool(dtype) or not is_numeric(dtype)
        ]
        if non_numeric:
            raise ValueError(
                "Non numeric columns found: " + ", ".join(non_numeric)
            )

        # str() keeps the check and the join safe for non-string labels.
        invalid_names = [
            str(col) for col in self.df.columns if " " in str(col)
        ]
        if invalid_names:
            raise ValueError(
                "Invalid column names found: " + ", ".join(invalid_names)
            )

        print("CSV data validation passed!")

    def impute_missing(self):
        """Fill missing values using the per-column mean strategy.

        ``self.df`` is replaced by the return value of
        ``SimpleImputer.fit_transform``, so it is no longer the DataFrame
        that was loaded from disk once this method has run.
        """
        imputer = SimpleImputer(strategy="mean")
        self.df = imputer.fit_transform(self.df)

    def encode_categorical(self, cat_cols):
        """One-hot encode the columns at the given positions.

        Works whether ``self.df`` is still a DataFrame or has already been
        replaced by an array in ``impute_missing``.

        Args:
            cat_cols: Integer column positions to encode.

        The encoded result is stored in ``self.cat_transformed``.
        """
        # DataFrame.iloc does not accept a tuple of positions for one axis.
        positions = list(cat_cols) if isinstance(cat_cols, tuple) else cat_cols
        if isinstance(self.df, pd.DataFrame):
            # Plain [] on a DataFrame selects by label, not (row, column).
            selected = self.df.iloc[:, positions]
        else:
            selected = self.df[:, positions]
        self.cat_transformed = OneHotEncoder().fit_transform(selected)

    def normalize(self):
        """Scale ``self.df`` with ``StandardScaler``.

        The scaled result is stored in ``self.normalized``; ``self.df`` is
        left unchanged.
        """
        scaler = StandardScaler()
        self.normalized = scaler.fit_transform(self.df)

    def preprocess(self, cat_cols=(0, 1, 5)):
        """Run validation, imputation, encoding and scaling in that order.

        Args:
            cat_cols: Integer column positions passed to
                ``encode_categorical``. Defaults to ``(0, 1, 5)``.

        Raises:
            ValueError: Propagated from ``validate_data``.

        NOTE(review): ``validate_data`` rejects null values and non-numeric
        columns before imputation and encoding run, so within this pipeline
        the imputer has nothing left to fill -- confirm that this ordering
        is intended.
        """
        self.validate_data()
        self.impute_missing()
        self.encode_categorical(list(cat_cols))
        self.normalize()
if __name__ == "__main__":
    # Run the full pipeline on data.csv and print both outputs. The guard
    # keeps an import of this module from reading the file as a side effect.
    processor = CSVPreprocessor('data.csv')
    processor.preprocess()
    print(processor.normalized)
    print(processor.cat_transformed)