Untitled
import pandas as pd
import numpy as np

# Values accepted as a binary encoding.  Booleans and 0.0/1.0 floats are
# covered implicitly because they compare equal to 0 and 1.
_BINARY_VALUES = (0, 1, "0", "1")


def is_binary_column(series):
    """Return True if *series* holds only binary-encoded values.

    Missing values are ignored.  A column qualifies when it has one or two
    distinct non-null values and each of them equals 0, 1, "0" or "1"
    (which also admits True/False and 0.0/1.0).

    Args:
        series: The pandas Series (one DataFrame column) to inspect.

    Returns:
        bool: True for a binary column, False otherwise.  A column with no
        non-null values is not considered binary.
    """
    unique_vals = series.dropna().unique()
    # Without the emptiness check an all-missing column would pass
    # vacuously, since all() of an empty iterable is True.
    if len(unique_vals) == 0 or len(unique_vals) > 2:
        return False
    return all(val in _BINARY_VALUES for val in unique_vals)


def missing_percentage(series):
    """Return the percentage (0-100) of missing values in *series*.

    Args:
        series: The pandas Series to inspect.

    Returns:
        The share of entries for which ``isna()`` is true, times 100.
    """
    return series.isna().mean() * 100


def screen_binary_columns(df, max_missing_percent=20):
    """Find binary columns whose missing-value share is acceptable.

    Args:
        df: The DataFrame whose columns are screened.
        max_missing_percent: Highest tolerated percentage of missing values
            (inclusive) for a column to be kept.

    Returns:
        dict: Maps each retained column label to its missing percentage,
        in the DataFrame's column order.
    """
    suitable_cols = {}
    for col, series in df.items():
        if not is_binary_column(series):
            continue
        pct = missing_percentage(series)
        if pct <= max_missing_percent:
            suitable_cols[col] = pct
    return suitable_cols


# Assuming your DataFrame is named 'df'.  The report runs only when such a
# name already exists in this namespace (e.g. a notebook session), so the
# helpers above can be imported without raising NameError.
_frame = globals().get("df")
if _frame is not None:
    suitable_binary_columns = screen_binary_columns(_frame)
    print("Binary Columns Suitable for Logistic Regression:")
    for col, missing_pct in suitable_binary_columns.items():
        print(f"{col}: {missing_pct:.2f}% missing")
Leave a Comment