datavalid
unknown
python
2 years ago
1.5 kB
4
Indexable
import pandas as pd
from pandas.api.types import is_float_dtype, is_integer_dtype
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler


class CSVPreprocessor:
    """Load a CSV file and run validation, imputation, encoding and scaling.

    Attributes filled in as the individual steps run:

    * ``df`` -- the loaded data. It is a DataFrame after construction and is
      replaced by the imputer's output once ``impute_missing`` has run.
    * ``cat_transformed`` -- result of ``encode_categorical`` (``None`` until
      that step has run).
    * ``normalized`` -- result of ``normalize`` (``None`` until that step has
      run).
    """

    # Positional column indices that ``preprocess`` one-hot encodes when the
    # caller does not supply its own selection.
    DEFAULT_CAT_COLS = (0, 1, 5)

    def __init__(self, csv_file):
        """Read ``csv_file`` into ``self.df`` with ``pandas.read_csv``."""
        self.df = pd.read_csv(csv_file)
        self.cat_transformed = None
        self.normalized = None

    def validate_data(self):
        """Check that the data is complete, numeric and has usable names.

        Prints a confirmation message when every check passes.

        Raises:
            ValueError: if any cell is null, if any column is not of an
                integer or floating-point dtype, or if any column name
                contains a space.
        """
        if self.df.isnull().values.any():
            raise ValueError("Null values found")

        # Use the pandas dtype predicates instead of comparing against the
        # Python ``int``/``float`` types: the latter only matches the
        # platform default widths and rejects e.g. int32 or float32 columns.
        non_numeric = [
            str(col)
            for col, dtype in self.df.dtypes.items()
            if not (is_integer_dtype(dtype) or is_float_dtype(dtype))
        ]
        if non_numeric:
            raise ValueError(
                "Non numeric columns found: " + ", ".join(non_numeric)
            )

        # Column labels are not guaranteed to be strings, so normalise them
        # before searching for spaces or joining them into the message.
        invalid_names = [
            str(col) for col in self.df.columns if ' ' in str(col)
        ]
        if invalid_names:
            raise ValueError(
                "Invalid column names found: " + ", ".join(invalid_names)
            )

        print("CSV data validation passed!")

    def impute_missing(self):
        """Fill missing values with the mean of their column.

        ``self.df`` is replaced by the output of
        ``SimpleImputer.fit_transform``, so it is no longer the original
        DataFrame after this call.
        """
        imputer = SimpleImputer(strategy='mean')
        self.df = imputer.fit_transform(self.df)

    def encode_categorical(self, cat_cols):
        """One-hot encode the columns at the positions in ``cat_cols``.

        Works whether ``self.df`` is still a DataFrame or has already been
        replaced by an array in ``impute_missing``.

        Args:
            cat_cols: iterable of positional (0-based) column indices.

        The encoder output is stored in ``self.cat_transformed``.
        """
        positions = list(cat_cols)
        if isinstance(self.df, pd.DataFrame):
            # A DataFrame cannot be indexed with ``[:, positions]``.
            selected = self.df.iloc[:, positions]
        else:
            selected = self.df[:, positions]
        self.cat_transformed = OneHotEncoder().fit_transform(selected)

    def normalize(self):
        """Standardize every column of ``self.df`` with ``StandardScaler``.

        The scaled data is stored in ``self.normalized``.
        """
        scaler = StandardScaler()
        self.normalized = scaler.fit_transform(self.df)

    def preprocess(self, cat_cols=None):
        """Run validation, imputation, encoding and normalization in order.

        Args:
            cat_cols: positional column indices to one-hot encode. Defaults
                to ``DEFAULT_CAT_COLS`` (columns 0, 1 and 5).

        Raises:
            ValueError: propagated from ``validate_data``.
        """
        if cat_cols is None:
            cat_cols = self.DEFAULT_CAT_COLS
        self.validate_data()
        self.impute_missing()
        self.encode_categorical(cat_cols)
        self.normalize()


def main():
    """Preprocess ``data.csv`` and print the scaled and encoded results."""
    processor = CSVPreprocessor('data.csv')
    processor.preprocess()
    print(processor.normalized)
    print(processor.cat_transformed)


# Guarded so that importing this module neither reads a file nor prints.
if __name__ == "__main__":
    main()
Editor is loading...