# NTU: Digital biomarkers for depression screening with machine learning modelling: cross-sectional study
# Prediction modelling: classification with machine learning in CONTRASTED SUB-SAMPLES

import warnings
warnings.filterwarnings("ignore")

import os

import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import cohen_kappa_score, make_scorer, recall_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import xgboost as xgb
import matplotlib.pyplot as plt


def calculate_spearman(df, p):
    """Return the predictors whose Spearman correlation with the global
    'phq.avg' column is significant at level p."""
    columns = df.columns
    significant = pd.DataFrame(columns=['pred', 'r', 'p'])
    for i in range(df.shape[1]):
        # Note: correlates each predictor against the global `fit` DataFrame
        coef, p_ = spearmanr(df[df.columns[i]], fit['phq.avg'], nan_policy='omit')
        if p_ < p:
            significant.loc[i] = [columns[i], coef, p_]
    return significant.reset_index(drop=True)


# Load the dataset
ds_folder = 'dataset_276_22h'
ds_file = 'dataset_final_276_22h.csv'
os.chdir('/fsx/home-duyphung/pilev2/temp_use/digital_biomarkers/data/' + str(ds_folder) + '/')
fit = pd.read_csv(ds_file)

# Predictors (excluding depression/outcome columns)
preds = list(fit.columns)[8:]

# Determine outcome: depressed if PHQ score >= 10 at either time point
fit['depression.10'] = np.where((fit['phq_score'] >= 10) | (fit['phq_score_t2'] >= 10),
                                'depressed', 'healthy')

# Contrasted sub-sample (downsampling)
fit_10 = fit[fit['depression.10'] == 'depressed']  # depressed group

# Healthy control group: PHQ score < 5 at both time points, but not 0 at both
fit_0 = fit[(fit['phq_score'] < 5) & (fit['phq_score_t2'] < 5)
            & ((fit['phq_score'] > 0) | (fit['phq_score_t2'] > 0))]
fit_00 = fit_0  # .sample(n=40, random_state=1)  # healthy group

# Join samples and encode the binary outcome
fit_a = pd.concat([fit_10, fit_00]).reset_index(drop=True)
fit_a['depression'] = np.where((fit_a['phq_score'] >= 10) | (fit_a['phq_score_t2'] >= 10),
                               1, 0)  # 1: depressed, 0: healthy

# Keep only predictors significantly correlated with PHQ (Spearman, p < 0.01)
res = calculate_spearman(fit[preds], p=0.01)
pred1_1 = list(res['pred'])

ds = fit_a
pr = pred1_1
# X = ds[pr].to_numpy()
y = ds['depression'].to_numpy()
ds = ds[pr]
# ds = ds.drop(columns=['depression.10', 'depression', 'pid', 'phq_3cat', 'phq_3cat_t2',
#                       'phq_score', 'phq_score_t2', 'phq.avg'])

import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm

# Load the pre-built training table
os.chdir("/fsx/home-duyphung/pilev2/temp_use/digital_biomarkers/Statistical Analysis")
df = pd.read_csv("training_data.csv")
df = df.drop(columns=['Unnamed: 0'])
df['depression'] = (df['depression'] == "depressed").astype(int)

feature_cols = [f for f in df.columns.tolist() if f != "depression"]
target_col = "depression"
print(feature_cols)

full_metrics = False
# 4-fold CV repeated 25 times -> 100 train/test splits
rskf = RepeatedStratifiedKFold(n_splits=4, n_repeats=25)
is_lgb = False
is_xgb = True

for threshold in np.arange(0.1, 0.9, 0.1):
    average_f1 = 0
    average_acc = 0
    threshold = 0.5  # loop variable is overridden, so only threshold 0.5 is actually evaluated
    from sklearn.model_selection import ShuffleSplit, StratifiedKFold

    # XGBoost hyper-parameters
    params = {'colsample_bytree': 1, 'gamma': 0.3, 'learning_rate': 0.3,
              'max_depth': 1, 'min_child_weight': 2, 'n_estimators': 100,
              'subsample': 0.8}

    for train_index, test_index in tqdm(rskf.split(df, df[target_col]), total=100):
        clf = xgb.XGBClassifier(
            eval_metric='auc',
            objective='binary:logistic',
            booster='dart',
            **params
        )
        X = df[feature_cols].to_numpy()
        y = df[target_col].to_numpy()
        clf.fit(X[train_index], y[train_index])
        # Predict class probabilities on the held-out fold
        y_pred_xgb = clf.predict_proba(X[test_index])

        clf2 = lgb.LGBMClassifier(
            objective='binary',
            boosting_type='dart',
        )
        clf2.fit(X[train_index], y[train_index])
        y_pred_lgb = clf2.predict_proba(X[test_index])

        clf3 = RandomForestClassifier(
            n_estimators=100,
            max_depth=3
        )
        clf3.fit(X[train_index], y[train_index])
        y_pred_rf = clf3.predict_proba(X[test_index])

        # Linear ensemble of the three probability estimates
        # (weighted version kept for reference; the unweighted mean below is what is used)
        # y_pred = 0.5 * y_pred_xgb + 0.3 * y_pred_lgb + 0.2 * y_pred_rf
        y_pred = (y_pred_xgb + y_pred_lgb + y_pred_rf) / 3
        y_pred = y_pred[:, 1]        # probability of the positive (depressed) class
        y_pred = y_pred > threshold  # binarise at the decision threshold

        average_f1 += f1_score(y[test_index], y_pred)
        average_acc += accuracy_score(y[test_index], y_pred)

    print("Threshold: ", threshold)
    print("Average F1: ", average_f1 / 100)        # 100 = 4 folds x 25 repeats
    print("Average Accuracy: ", average_acc / 100)
    break  # only the first (overridden) threshold is evaluated
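# --------------------------------------------------------------------------
# Hedged sketch (not part of the original script): the loop above overrides
# the threshold and breaks after one pass, so only threshold = 0.5 is ever
# evaluated. The snippet below shows how a full sweep over decision
# thresholds could look with the same mean-probability ensemble of XGBoost,
# LightGBM, and a random forest. It uses a synthetic dataset
# (make_classification) so it runs standalone; the sample size, class
# weights, CV settings, and hyper-parameters are illustrative assumptions
# only, not values from the study.

import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import RepeatedStratifiedKFold

X_demo, y_demo = make_classification(n_samples=200, n_features=20,
                                     weights=[0.6, 0.4], random_state=1)
cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=5, random_state=1)
n_splits_total = cv.get_n_splits()

for thr in np.arange(0.1, 0.9, 0.1):
    f1_sum, acc_sum = 0.0, 0.0
    for tr, te in cv.split(X_demo, y_demo):
        models = [
            xgb.XGBClassifier(objective='binary:logistic', eval_metric='auc'),
            lgb.LGBMClassifier(objective='binary'),
            RandomForestClassifier(n_estimators=100, max_depth=3),
        ]
        # Average the positive-class probabilities of the three models
        proba = np.mean(
            [m.fit(X_demo[tr], y_demo[tr]).predict_proba(X_demo[te])[:, 1]
             for m in models],
            axis=0,
        )
        pred = proba > thr
        f1_sum += f1_score(y_demo[te], pred)
        acc_sum += accuracy_score(y_demo[te], pred)
    print(f"threshold={thr:.1f}  "
          f"F1={f1_sum / n_splits_total:.3f}  "
          f"accuracy={acc_sum / n_splits_total:.3f}")

# Lower thresholds favour recall (fewer missed depressed cases), higher
# thresholds favour precision; sweeping them makes that trade-off explicit
# instead of fixing the cut-off at 0.5.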