Untitled
unknown
python
2 years ago
1.4 kB
2
Indexable
import pandas as pd
from sklearn import preprocessing  # noqa: F401  (only needed by the disabled scaling step)

from datasets.Dataset import Dataset


def _binarize(column, positive_label, negative_label):
    """Return *column* with the two labels replaced by 1.0 and 0.0.

    Values equal to neither label are passed through unchanged, so an
    unexpected category stays visible instead of being silently dropped.
    Building a new Series (rather than writing floats into the existing text
    column) lets pandas infer ``float64`` when every value was encoded.
    """
    codes = {positive_label: 1.0, negative_label: 0.0}
    return column.map(lambda value: codes.get(value, value))


class Adult(Dataset):
    """Adult income dataset, using ``gender`` as the sensitive attribute."""

    def __init__(self):
        name = "adult"
        sensitive_attribute = "gender"
        super().__init__(name, sensitive_attribute)

    def get_data(self, _, __):
        """Load, shuffle and numerically encode the Adult CSV.

        Both positional arguments are ignored.
        NOTE(review): presumably they exist to match the ``Dataset`` base-class
        signature -- confirm against the base class.

        Returns:
            A ``(X, y)`` tuple. ``X`` is the DataFrame of all columns except
            ``income``; ``y`` is the ``income`` Series encoded as 1.0 for
            ``">50K"`` and 0.0 for ``"<=50K"``. Rows are in a random order
            (no seed is fixed), and the shuffled index is kept.
        """
        df = pd.read_csv('./datasets/adult/adult.csv').sample(frac=1)

        # Sensitive attribute: "Male" -> 1.0, "Female" -> 0.0.
        sensitive = self.sensitive_attribute
        df[sensitive] = _binarize(df[sensitive], "Male", "Female")

        # Target: ">50K" -> 1.0, "<=50K" -> 0.0.
        df["income"] = _binarize(df["income"], ">50K", "<=50K")

        # Replace every remaining categorical column with its integer
        # category code.
        cat_columns = [
            "workclass",
            "education",
            "marital-status",
            "occupation",
            "relationship",
            "race",
            "native-country",
        ]
        df[cat_columns] = df[cat_columns].apply(
            lambda col: col.astype('category').cat.codes
        )

        # NOTE: min-max scaling of all columns (sklearn MinMaxScaler) used to
        # be sketched here but is disabled; features are returned unscaled.

        # pop() removes the target from df in place, so what is left of df is
        # exactly the feature matrix -- no extra copy/drop is needed.
        y = df.pop("income")
        X = df
        return X, y
Editor is loading...