# Untitled
#
# avatar
# unknown
# python
# 6 days ago
# 899 B
# 6
# Indexable
import pandas as pd
import numpy as np

def is_binary_column(series):
    """Return True if the non-missing values of *series* look binary.

    A column counts as binary when, after dropping missing entries, it has
    at least one and at most two distinct values, and each of them either
    compares equal to 0 or 1 (this covers ``True``/``False`` and
    ``0.0``/``1.0``) or is the string ``'0'`` or ``'1'``.

    Args:
        series: A pandas Series (any object exposing ``dropna().unique()``).

    Returns:
        bool: True when the observed values are binary; False otherwise,
        including when the column has no non-missing values at all.
    """
    unique_vals = series.dropna().unique()
    # A column with no observed values must not be reported as binary:
    # without this guard ``all()`` over an empty sequence is vacuously True.
    if len(unique_vals) == 0:
        return False
    if len(unique_vals) > 2:
        return False
    # True/False compare equal to 1/0, so the numeric entries cover booleans.
    return all(val in (0, 1, '0', '1') for val in unique_vals)

def missing_percentage(series):
    """Return the share of missing entries in *series* as a percentage.

    Args:
        series: A pandas Series (any object exposing ``isna()``).

    Returns:
        The mean of the missing-value mask multiplied by 100.
    """
    null_mask = series.isna()
    # The mean of a boolean mask is the fraction of True entries.
    fraction_missing = null_mask.mean()
    return fraction_missing * 100

def screen_binary_columns(df, max_missing_percent=20):
    """Select binary columns whose missing-value share is acceptable.

    Args:
        df: DataFrame whose columns are screened, in column order.
        max_missing_percent: Highest missing percentage (inclusive) a
            binary column may have and still be kept.

    Returns:
        dict mapping each kept column label to its missing percentage,
        as computed by ``missing_percentage``.
    """
    screened = {}
    for name in df.columns:
        column = df[name]
        # Only columns accepted by is_binary_column are considered at all.
        if not is_binary_column(column):
            continue
        pct_missing = missing_percentage(column)
        if pct_missing <= max_missing_percent:
            screened[name] = pct_missing
    return screened

# NOTE: `df` is not defined in this snippet; it must already exist as a
# DataFrame in the surrounding session before this part runs.
suitable_binary_columns = screen_binary_columns(df)

# Report every retained column together with its missing-value percentage.
print("Binary Columns Suitable for Logistic Regression:")
for col in suitable_binary_columns:
    missing_pct = suitable_binary_columns[col]
    print(f"{col}: {missing_pct:.2f}% missing")
# Leave a Comment