# NTU: Digital biomarkers for depression screening with machine learning modelling: cross-sectional study
# Prediction modelling: classification with machine learning in CONTRASTED SUB-SAMPLES
import warnings
warnings.filterwarnings("ignore")
import os
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import cohen_kappa_score, make_scorer, recall_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
def calculate_spearman(df, target, p):
    """Return the predictors whose Spearman correlation with `target` is significant at level p."""
    columns = df.columns
    significant = pd.DataFrame(columns=['pred', 'r', 'p'])
    for i in range(df.shape[1]):
        coef, p_ = spearmanr(df[columns[i]], target, nan_policy='omit')
        if p_ < p:
            significant.loc[i] = [columns[i], coef, p_]
    return significant.reset_index(drop=True)
ds_folder = 'dataset_276_22h'
ds_file = 'dataset_final_276_22h.csv'
os.chdir('/fsx/home-duyphung/pilev2/temp_use/digital_biomarkers/data/' + str(ds_folder) + '/')
fit = pd.read_csv(ds_file)
# predictors: all columns after the first 8 (id / PHQ outcome columns)
preds = list(fit.columns)[8:]
# outcome: depressed if the PHQ score is >= 10 at either time point
fit['depression.10'] = np.where((fit['phq_score'] >= 10) | (fit['phq_score_t2'] >= 10), 'depressed', 'healthy')
# contrasted subsample: depressed group vs. clearly healthy controls
fit_10 = fit[fit['depression.10'] == 'depressed']  # depressed group
# healthy control group: PHQ score < 5 at both time points and non-zero at least once
fit_0 = fit[(fit['phq_score'] < 5) & (fit['phq_score_t2'] < 5) & ((fit['phq_score'] > 0) | (fit['phq_score_t2'] > 0))]
fit_00 = fit_0  # .sample(n=40, random_state=1) to downsample the healthy group
# join the two groups and code the binary outcome
fit_a = pd.concat([fit_10, fit_00]).reset_index(drop=True)
fit_a['depression'] = np.where((fit_a['phq_score'] >= 10) | (fit_a['phq_score_t2'] >= 10), 1, 0)  # 1: depressed, 0: healthy
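# quick sanity check of the group sizes in the contrasted subsample
# (added here for clarity; not part of the original paste)
print(fit_a['depression'].value_counts())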
# keep only predictors with a significant Spearman correlation with the average PHQ score
res = calculate_spearman(fit[preds], fit['phq.avg'], p=0.01)
pred1_1 = list(res['pred'])
ds = fit_a
pr = pred1_1
y = ds['depression'].to_numpy()  # binary outcome
ds = ds[pr]  # feature matrix restricted to the significant predictors
# ds = ds.drop(columns=['depression.10', 'depression', 'pid', 'phq_3cat', 'phq_3cat_t2', 'phq_score', 'phq_score_t2', 'phq.avg'])
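# NOTE: the modelling step below reads 'training_data.csv', but the line that produced it
# is not part of this paste. A minimal sketch of that missing step, assuming the file
# holds the significant predictors of the contrasted subsample plus the string-labelled
# outcome (which matches how it is decoded below):
training_data = ds.copy()
training_data['depression'] = np.where(y == 1, 'depressed', 'healthy')
# default index=True is what produces the 'Unnamed: 0' column dropped below
training_data.to_csv('/fsx/home-duyphung/pilev2/temp_use/digital_biomarkers/Statistical Analysis/training_data.csv')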
import lightgbm as lgb
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm
os.chdir("/fsx/home-duyphung/pilev2/temp_use/digital_biomarkers/Statistical Analysis")
df = pd.read_csv("training_data.csv")
df = df.drop(columns=['Unnamed: 0'])
df['depression'] = df['depression'] == "depressed"
df['depression'] = df['depression'].astype(int)
feature_cols = [f for f in df.columns.tolist() if f != "depression"]
target_col = "depression"
print(feature_cols)
# repeated stratified 4-fold CV, repeated 25 times (100 folds in total)
rskf = RepeatedStratifiedKFold(n_splits=4, n_repeats=25)
# Repeated stratified CV: average the predicted class probabilities of XGBoost, LightGBM
# and a random forest, then report F1 and accuracy of the ensemble at each decision threshold.
X = df[feature_cols].to_numpy()
y = df[target_col].to_numpy()

# XGBoost hyper-parameters
params = {'colsample_bytree': 1, 'gamma': 0.3, 'learning_rate': 0.3, 'max_depth': 1,
          'min_child_weight': 2, 'n_estimators': 100, 'subsample': 0.8}

fold_probas = []   # ensemble probability of the positive class, per held-out fold
fold_targets = []  # true labels of the held-out fold
for train_index, test_index in tqdm(rskf.split(df, df[target_col]), total=rskf.get_n_splits()):
    # XGBoost
    clf = xgb.XGBClassifier(
        eval_metric='auc',
        objective='binary:logistic',
        booster='dart',
        **params
    )
    clf.fit(X[train_index], y[train_index])
    y_pred_xgb = clf.predict_proba(X[test_index])
    # LightGBM
    clf2 = lgb.LGBMClassifier(
        objective='binary',
        boosting_type='dart',
    )
    clf2.fit(X[train_index], y[train_index])
    y_pred_lgb = clf2.predict_proba(X[test_index])
    # random forest
    clf3 = RandomForestClassifier(
        n_estimators=100,
        max_depth=3
    )
    clf3.fit(X[train_index], y[train_index])
    y_pred_rf = clf3.predict_proba(X[test_index])
    # ensemble: unweighted average of the three probability estimates
    # (a weighted 0.5/0.3/0.2 combination was also tried)
    y_pred = (y_pred_xgb + y_pred_lgb + y_pred_rf) / 3
    fold_probas.append(y_pred[:, 1])
    fold_targets.append(y[test_index])

# evaluate the averaged ensemble at each decision threshold
for threshold in np.arange(0.1, 0.9, 0.1):
    average_f1 = np.mean([f1_score(t, (p > threshold).astype(int)) for t, p in zip(fold_targets, fold_probas)])
    average_acc = np.mean([accuracy_score(t, (p > threshold).astype(int)) for t, p in zip(fold_targets, fold_probas)])
    print("Threshold: ", round(threshold, 1))
    print("Average F1: ", average_f1)
    print("Average Accuracy: ", average_acc)
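# The cohen_kappa_score and sklearn.metrics imports at the top of the script are never
# used in this paste; additional agreement / ROC metrics were presumably reported
# elsewhere. A minimal sketch (added here, not from the original) of how they could be
# computed over the collected fold predictions, using a fixed 0.5 threshold for kappa:
avg_kappa = np.mean([cohen_kappa_score(t, (p > 0.5).astype(int)) for t, p in zip(fold_targets, fold_probas)])
avg_auc = np.mean([metrics.roc_auc_score(t, p) for t, p in zip(fold_targets, fold_probas)])
print("Average Cohen's kappa (threshold 0.5): ", avg_kappa)
print("Average ROC AUC: ", avg_auc)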