# Untitled
# unknown
# plain_text
# 9 months ago
# 2.2 kB
# 7
# Indexable
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
# Work on a private copy so final_train_data itself is never mutated.
df = final_train_data.copy()

# Quantile-bin the continuous columns (up to 5 bins, integer bin codes;
# duplicate bin edges are dropped) so they can take part in stratification.
for _continuous_col in ('TransactionAmt', 'TransactionDT'):
    df[_continuous_col + '_bin'] = pd.qcut(
        df[_continuous_col],
        q=5,
        labels=False,
        duplicates='drop',
    )

# Derive coarse time features from TransactionDT, which is treated here as an
# offset in seconds (presumably from a dataset reference time - confirm).
df["TransactionDT_days"] = df["TransactionDT"].floordiv(24 * 60 * 60)  # whole days elapsed
df["TransactionDT_hours"] = df["TransactionDT"].floordiv(3600).mod(24)  # hour of day, 0-23
# Categorical columns whose rare levels are collapsed before stratification
categorical_cols = ['ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5',
                    'DeviceType', 'DeviceInfo', 'P_emaildomain', 'R_emaildomain']

# Reduce category explosion: keep the 10 most frequent values of each column
# and fold everything else into a single "Other" level.
for col in categorical_cols:
    # Cast to str so numeric codes and text compare consistently. Missing
    # values are stringified too (typically "nan"), so they count as an
    # ordinary category and may occupy one of the top-10 slots.
    df[col] = df[col].astype(str)
    top_categories = df[col].value_counts().index[:10]
    # Vectorised replacement for a per-row membership test: keep values that
    # are in the top list, substitute "Other" everywhere else.
    df[col] = df[col].where(df[col].isin(top_categories), "Other")
# Attach to every row the size of its own isFraud class.
# NOTE(review): fraud_count is derived purely from the target and is left on
# df; make sure it is excluded from any feature matrix built later.
df['fraud_count'] = df.groupby('isFraud')['isFraud'].transform('count')

# If a class has fewer than 5 rows (the threshold presumably mirrors the
# intended number of CV splits - confirm), relabel its fraud rows (1) as 0 so
# stratified splitting does not face an under-populated class.
_rare_class_rows = df['fraud_count'] < 5
df.loc[_rare_class_rows, 'isFraud'] = df.loc[_rare_class_rows, 'isFraud'].replace({1: 0})
# Create a stratification label combining key features.
# Order matters only for the textual form of the label: fraud flag,
# amount bin, time bin, product code, card1..card6, device type.
stratify_cols = [
    'isFraud',             # Fraud balance
    'TransactionAmt_bin',  # Transaction amount range
    'TransactionDT_bin',   # Transaction time range
    'ProductCD',           # Product type
    'card1', 'card2', 'card3', 'card4', 'card5', 'card6',  # Card attributes
    'DeviceType',          # Device variety
]
# Cast every component to str before joining. card6 is not part of
# categorical_cols, so it has not been stringified earlier; without the cast a
# numeric card6 raises TypeError and a missing card6 turns the whole label
# into NaN for that row.
label_parts = [df[name].astype(str) for name in stratify_cols]
df['stratify_label'] = label_parts[0].str.cat(label_parts[1:], sep="_")
# Leave a Comment