Untitled

 avatar
unknown
python
2 years ago
1.4 kB
2
Indexable
import pandas as pd
from sklearn import preprocessing

from datasets.Dataset import Dataset


class Adult(Dataset):
    """Loader for the Adult income data with ``gender`` as sensitive attribute.

    ``get_data`` reads the CSV at ``_CSV_PATH`` (resolved relative to the
    current working directory), encodes the sensitive attribute and the
    ``income`` target as 1.0 / 0.0, replaces the categorical feature columns
    with integer category codes, and returns the features and target
    separately.
    """

    _CSV_PATH = './datasets/adult/adult.csv'
    _TARGET_COLUMN = "income"
    _CATEGORICAL_COLUMNS = (
        "workclass",
        "education",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "native-country",
    )

    def __init__(self):
        """Register the dataset under the name ``"adult"``.

        The name and the sensitive attribute (``"gender"``) are forwarded
        positionally to ``Dataset.__init__``.
        """
        super().__init__("adult", "gender")

    @staticmethod
    def _encode_binary(df, column, positive_label, negative_label):
        """Encode ``column`` of ``df`` in place: positive -> 1.0, negative -> 0.0.

        When every row holds one of the two labels the column becomes a
        ``float64`` column. Otherwise the rows that match are encoded and all
        remaining rows keep their original value, so unexpected labels stay
        visible to the caller instead of being silently coerced.
        """
        values = df[column]
        is_positive = values == positive_label
        is_negative = values == negative_label

        if (is_positive | is_negative).all():
            # Every row is one of the two labels, so "not positive" means
            # "negative" and the boolean mask itself is the encoding.
            df[column] = is_positive.astype(float)
            return

        # Build the result as a new object Series rather than writing floats
        # into the original string column in place.
        encoded = values.astype(object)
        encoded[is_positive] = 1.0
        encoded[is_negative] = 0.0
        df[column] = encoded

    def get_data(self, _, __):
        """Load, shuffle and encode the Adult data.

        Args:
            _: Ignored.
            __: Ignored.
                NOTE(review): both placeholders are presumably required by
                the ``Dataset.get_data`` interface; confirm against the base
                class.

        Returns:
            A tuple ``(X, y)``. ``X`` is a DataFrame holding every column
            except ``income`` (the sensitive attribute is included); ``y`` is
            the encoded ``income`` Series. Rows are in a random order that is
            not seeded here, and the index labels from the CSV are retained.
        """
        # sample(frac=1) returns all rows in shuffled order.
        df = pd.read_csv(self._CSV_PATH).sample(frac=1)

        # self.sensitive_attribute is expected to be "gender" here.
        # NOTE(review): presumably stored by Dataset.__init__; confirm.
        self._encode_binary(df, self.sensitive_attribute, "Male", "Female")
        self._encode_binary(df, self._TARGET_COLUMN, ">50K", "<=50K")

        # Replace each categorical column with its integer category code
        # (pandas uses -1 for missing values).
        for column in self._CATEGORICAL_COLUMNS:
            df[column] = df[column].astype('category').cat.codes

        # NOTE: features are returned unscaled; a MinMaxScaler step that used
        # to be sketched here was disabled.

        # pop() removes the target from df, leaving only the features behind.
        y = df.pop(self._TARGET_COLUMN)
        X = df

        return X, y
Editor is loading...