Untitled
unknown
python
3 years ago
1.4 kB
5
Indexable
import pandas as pd
from sklearn import preprocessing
from datasets.Dataset import Dataset
class Adult(Dataset):
    """Adult income dataset, using ``gender`` as the sensitive attribute.

    The data is read from ``./datasets/adult/adult.csv`` (resolved against
    the current working directory) each time :meth:`get_data` is called.
    """

    def __init__(self):
        """Register the dataset as ``"adult"`` with ``"gender"`` as the
        sensitive attribute by forwarding both to the ``Dataset`` base class.
        """
        super().__init__("adult", "gender")

    @staticmethod
    def _encode_binary(column, positive_label, negative_label):
        """Return *column* with the two labels replaced by 1.0 and 0.0.

        Values equal to neither label are passed through unchanged, so an
        unexpected label never raises here. Because the result is a freshly
        built Series, pandas infers its dtype: it is numeric when every value
        was encoded and stays ``object`` only if foreign labels remain.

        Args:
            column: Series holding the raw labels.
            positive_label: Value to encode as ``1.0``.
            negative_label: Value to encode as ``0.0``.

        Returns:
            A new Series with the same index as *column*.
        """
        codes = {positive_label: 1.0, negative_label: 0.0}
        return column.map(lambda value: codes.get(value, value))

    def get_data(self, _, __):
        """Load, shuffle and encode the Adult data.

        Both positional arguments are accepted and ignored.

        Returns:
            A ``(X, y)`` tuple. ``X`` is a DataFrame of every column except
            ``income``; ``y`` is the ``income`` Series encoded as ``1.0`` for
            ``">50K"`` and ``0.0`` for ``"<=50K"``. In ``X`` the sensitive
            attribute column is encoded as ``1.0`` for ``"Male"`` and ``0.0``
            for ``"Female"``, and the categorical columns hold integer
            category codes.
        """
        # sample(frac=1) shuffles all rows; no random_state is passed, so the
        # row order differs from call to call.
        df = pd.read_csv('./datasets/adult/adult.csv').sample(frac=1)

        # NOTE(review): self.sensitive_attribute is presumably set by
        # Dataset.__init__ from the value passed in __init__ -- confirm.
        sensitive = self.sensitive_attribute
        df[sensitive] = self._encode_binary(df[sensitive], "Male", "Female")
        df["income"] = self._encode_binary(df["income"], ">50K", "<=50K")

        cat_columns = [
            "workclass",
            "education",
            "marital-status",
            "occupation",
            "relationship",
            "race",
            "native-country",
        ]
        # Replace each categorical column by its integer category code.
        for column in cat_columns:
            df[column] = df[column].astype('category').cat.codes

        # NOTE: features are returned unscaled; no normalisation is applied.

        # pop() removes the label column in place, leaving only the features,
        # so no extra copy of the frame is needed.
        y = df.pop("income")
        return df, y
Editor is loading...