# hidden_discrimination

import pandas as pd
import numpy as np
import statsmodels.formula.api as smf


def _simulate_wage_data(n, rng):
    """Build the synthetic data set analysed by ``run``.

    Args:
        n: Number of rows to generate. Rows ``0 .. n // 2 - 1`` are male
            (``female == 0``); the remaining rows are female.
        rng: Object exposing ``uniform(low, high, size)`` -- either the
            ``numpy.random`` module or a ``numpy.random.Generator``.

    Returns:
        A DataFrame with columns ``female``, ``education_cost``,
        ``goteducation``, ``wage`` and ``log_wage``.
    """
    female = (np.arange(n) >= n // 2).astype(int)
    education_cost = rng.uniform(0, 1, n)

    # Because of discrimination, females need to meet a higher "threshold"
    # to get education (0.7 instead of 0.5).
    threshold = np.where(female == 1, 0.7, 0.5)
    goteducation = (education_cost >= threshold).astype(int)

    # Wage depends on education only: 10 with education, 5 without.
    wage = np.where(goteducation == 1, 10, 5)

    return pd.DataFrame({
        'female': female,
        'education_cost': education_cost,
        'goteducation': goteducation,
        'wage': wage,
        'log_wage': np.log(wage),
    })


def run(*, n=200, seed=None):
    """Simulate wages and print two OLS regressions of log wage.

    The first model regresses ``log_wage`` on ``female`` alone; the second
    adds ``goteducation`` as a regressor. By construction ``log_wage`` is
    fully determined by ``goteducation``, and ``female`` only affects the
    threshold at which education is obtained.

    Args:
        n: Sample size. The first ``n // 2`` rows are male and the rest
            female. Defaults to 200 (100 males, 100 females).
        seed: Seed for the random draw of ``education_cost``. ``None``
            (the default) draws from NumPy's global random state; any
            other value is passed to ``numpy.random.default_rng`` so the
            run is reproducible.

    Raises:
        ValueError: If ``n`` is smaller than 2, in which case the sample
            cannot contain both a male and a female row.
    """
    if n < 2:
        raise ValueError("n must be at least 2 to include both groups")

    # Keep using the global RNG when no seed is given so that callers who
    # seed it via np.random.seed() continue to get the same draws.
    rng = np.random if seed is None else np.random.default_rng(seed)
    df = _simulate_wage_data(n, rng)

    # Run regressions
    formulas = ('log_wage ~ female', 'log_wage ~ female + goteducation')
    fitted = [smf.ols(formula=formula, data=df).fit() for formula in formulas]

    for result in fitted:
        print(result.summary())


# Script entry point: build the simulated data and print both regression
# summaries.
# NOTE(review): this call is not guarded by `if __name__ == "__main__":`,
# so it also executes whenever this file is imported -- confirm that is
# intended.
run()
# Editor is loading...  (paste-site UI residue, not part of the script)