datavalid

mail@pastecode.io avatar
unknown
python
a year ago
1.5 kB
2
Indexable
Never
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

class CSVPreprocessor:
    """Load a CSV file and run validation, imputation, encoding and scaling.

    Attributes:
        df: Data read from the CSV file; kept as a ``pandas.DataFrame``
            by every step of the pipeline.
        cat_transformed: One-hot encoding of the selected columns, set by
            ``encode_categorical``.
        normalized: Standardized copy of ``df``, set by ``normalize``.
    """

    def __init__(self, csv_file):
        """Read ``csv_file`` into ``self.df`` with ``pandas.read_csv``."""
        self.df = pd.read_csv(csv_file)

    def validate_data(self):
        """Check ``self.df`` for nulls, non-numeric columns and bad names.

        A column counts as numeric when its dtype is any integer or float
        dtype (any width); bool and object columns are rejected. A column
        name is invalid when it contains a space.

        Raises:
            ValueError: If any value is null, if any column is not
                integer/float typed, or if any column name contains a
                space. The message lists the offending columns.
        """
        if self.df.isnull().values.any():
            raise ValueError("Null values found")

        is_int = pd.api.types.is_integer_dtype
        is_float = pd.api.types.is_float_dtype
        # Iterate over ``dtypes`` rather than ``self.df[col]`` so duplicate
        # column labels are handled; ``str(col)`` keeps non-string labels
        # from breaking the message join.
        non_numeric = [
            str(col)
            for col, dtype in self.df.dtypes.items()
            if not (is_int(dtype) or is_float(dtype))
        ]
        if non_numeric:
            raise ValueError(
                "Non numeric columns found: " + ", ".join(non_numeric)
            )

        invalid_names = [
            str(col) for col in self.df.columns if ' ' in str(col)
        ]
        if invalid_names:
            raise ValueError(
                "Invalid column names found: " + ", ".join(invalid_names)
            )

        print("CSV data validation passed!")

    def impute_missing(self):
        """Replace missing values in ``self.df`` with the column mean.

        The result is stored back as a DataFrame (same index, surviving
        column labels) so later steps can keep using DataFrame APIs.
        """
        imputer = SimpleImputer(strategy='mean')
        imputed = imputer.fit_transform(self.df)
        # SimpleImputer discards features whose fitted statistic is NaN
        # (columns that are entirely missing), so only keep the labels of
        # the columns that are still present in the output.
        kept = self.df.columns[pd.notna(imputer.statistics_)]
        self.df = pd.DataFrame(imputed, columns=kept, index=self.df.index)

    def encode_categorical(self, cat_cols):
        """One-hot encode the columns at positions ``cat_cols``.

        Args:
            cat_cols: Iterable of integer column positions to encode.

        The encoder output is stored in ``self.cat_transformed``;
        ``self.df`` is left unchanged.
        """
        positions = list(cat_cols)
        if isinstance(self.df, pd.DataFrame):
            selected = self.df.iloc[:, positions]
        else:
            # Fallback for callers that assigned a plain 2-D array to
            # ``self.df``.
            selected = self.df[:, positions]
        enc = OneHotEncoder()
        self.cat_transformed = enc.fit_transform(selected)

    def normalize(self):
        """Standardize every column of ``self.df`` into ``self.normalized``.

        NOTE: all columns are scaled, including any that were passed to
        ``encode_categorical``.
        """
        scaler = StandardScaler()
        self.normalized = scaler.fit_transform(self.df)

    def preprocess(self, cat_cols=(0, 1, 5)):
        """Run validation, imputation, encoding and normalization in order.

        Args:
            cat_cols: Column positions to one-hot encode. Defaults to
                ``(0, 1, 5)``.

        NOTE(review): validation runs first and rejects null values, so
        the imputation step only ever sees complete data in this
        pipeline — confirm whether imputation was meant to run first.
        """
        self.validate_data()
        self.impute_missing()
        self.encode_categorical(list(cat_cols))
        self.normalize()
        
# Script driver: run the whole pipeline on data.csv, then show the scaled
# data followed by the one-hot encoded columns, each on its own line(s).
processor = CSVPreprocessor('data.csv')
processor.preprocess()

print(processor.normalized, processor.cat_transformed, sep="\n")