Untitled
unknown
plain_text
a month ago
2.2 kB
4
Indexable
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    average_precision_score,
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb

# Number of quantile bins used for the continuous stratification features.
N_QUANTILE_BINS = 5
# How many of the most frequent values to keep per categorical column;
# everything else is folded into "Other".
TOP_N_CATEGORIES = 10
# A stratum needs at least this many rows to be kept as its own label.
# NOTE(review): 5 mirrors the threshold in the original script, which its
# comment tied to `n_splits` -- confirm it matches the fold count used later.
MIN_STRATUM_SIZE = 5

# Work on a copy so the caller's frame is never modified.
df = final_train_data.copy()

# Bin continuous variables so they can take part in the stratification label.
# `labels=False` yields integer bin codes; `duplicates='drop'` tolerates
# repeated quantile edges (fewer than N_QUANTILE_BINS bins may result).
df['TransactionAmt_bin'] = pd.qcut(
    df['TransactionAmt'], q=N_QUANTILE_BINS, labels=False, duplicates='drop'
)
df['TransactionDT_bin'] = pd.qcut(
    df['TransactionDT'], q=N_QUANTILE_BINS, labels=False, duplicates='drop'
)

# Derive time features from TransactionDT, which is treated as seconds here.
df["TransactionDT_days"] = df["TransactionDT"] // (24 * 60 * 60)  # whole days
df["TransactionDT_hours"] = (df["TransactionDT"] // 3600) % 24  # hour of day

# Categorical columns whose rare values are grouped.
categorical_cols = [
    'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5',
    'DeviceType', 'DeviceInfo', 'P_emaildomain', 'R_emaildomain',
]

# Reduce category explosion: keep the most frequent values, bucket the rest.
for col in categorical_cols:
    df[col] = df[col].astype(str)  # missing values become the string "nan"
    top_categories = df[col].value_counts().index[:TOP_N_CATEGORIES]
    # Vectorised equivalent of a per-row membership test.
    df[col] = df[col].where(df[col].isin(top_categories), "Other")

# Per-class row count. Kept as a column because the original script created
# it; it is intentionally NOT used to rewrite `isFraud` any more -- flipping
# fraud rows to non-fraud corrupted the target and did nothing for strata.
df['fraud_count'] = df.groupby('isFraud')['isFraud'].transform('count')

# Build the stratification label from the target plus key features.
# Every component is cast to str so that a column that was not normalised
# above (`card6`) cannot turn the whole label into NaN or raise on `+`.
stratify_cols = [
    'isFraud',             # target balance
    'TransactionAmt_bin',  # transaction amount range
    'TransactionDT_bin',   # transaction time range
    'ProductCD',
    'card1',
    'card2',
    'card3',
    'card4',
    'card5',
    'card6',
    'DeviceType',
]
stratify_label = df[stratify_cols[0]].astype(str)
for col in stratify_cols[1:]:
    stratify_label = stratify_label + "_" + df[col].astype(str)
df['stratify_label'] = stratify_label

# Fold strata that are too small to be spread across folds into one coarse
# bucket per target class, so the target balance is still respected.
# Best effort: the coarse bucket itself can still be small if a class has
# very few rare-stratum rows.
stratum_sizes = df['stratify_label'].map(df['stratify_label'].value_counts())
is_rare_stratum = stratum_sizes < MIN_STRATUM_SIZE
df.loc[is_rare_stratum, 'stratify_label'] = (
    df.loc[is_rare_stratum, 'isFraud'].astype(str) + "_rare"
)
Editor is loading...
Leave a Comment