Untitled

mail@pastecode.io avatar
unknown
plain_text
2 years ago
4.8 kB
2
Indexable
Never
# NTU: Digital biomarkers for depression screening with machine learning modelling: cross-sectional study
# Prediction modelling: classification with machine learning in CONTRASTED SUB-SAMPLES
import warnings
warnings.filterwarnings("ignore")
import os
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import cohen_kappa_score, make_scorer, recall_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import matplotlib.pyplot as plt 

def calculate_spearman(df, p, target=None):
    """Return the columns of *df* significantly rank-correlated with the outcome.

    Computes Spearman's rank correlation between every column of *df* and
    *target*, keeping only columns whose p-value is strictly below *p*.

    Parameters
    ----------
    df : pd.DataFrame
        Candidate predictor columns.
    p : float
        Significance threshold (exclusive) on the correlation p-value.
    target : array-like, optional
        Outcome variable. Defaults to the module-level ``fit['phq.avg']``
        to stay backward-compatible with existing two-argument callers
        (the original implementation read that global implicitly).

    Returns
    -------
    pd.DataFrame
        One row per significant predictor, columns ``['pred', 'r', 'p']``,
        with a fresh 0..n-1 index.
    """
    if target is None:
        # Preserve the original implicit dependency on the global `fit`.
        target = fit['phq.avg']

    rows = []
    for col in df.columns:
        # nan_policy='omit' drops pairs containing NaN, as in the original.
        coef, p_val = spearmanr(df[col], target, nan_policy='omit')
        if p_val < p:
            rows.append({'pred': col, 'r': coef, 'p': p_val})

    return pd.DataFrame(rows, columns=['pred', 'r', 'p'])


# --- Load the final feature dataset and build the contrasted sub-sample -----
ds_folder = 'dataset_276_22h'
ds_file = 'dataset_final_276_22h.csv'
# NOTE(review): chdir to an absolute, machine-specific path — fragile; every
# subsequent relative read depends on this working directory.
os.chdir('/fsx/home-duyphung/pilev2/temp_use/digital_biomarkers/data/' + str(ds_folder) + '/') 

# `fit` is module-global: calculate_spearman() reads fit['phq.avg'] from it.
fit = pd.read_csv(ds_file)

# predictors
# Columns from index 8 onward are treated as candidate predictors; the first
# 8 are presumably id/PHQ outcome fields — confirm against the CSV schema.
preds = list(fit.columns)[8:] # excluding depression

# determine outcome2
# Labelled 'depressed' if PHQ-9 >= 10 at EITHER time point (t1 or t2).
fit['depression.10'] = np.where((fit['phq_score'] >= 10) | (fit['phq_score_t2'] >= 10), 'depressed', 'healthy')

# contrasted subsample (downsampling)
fit_10 = fit[fit['depression.10'] == 'depressed'] # depressed group

# healthy control groups
# Controls: PHQ < 5 at both time points, but not 0 at both (strictly
# positive at at least one), giving a "contrasted" healthy group.
fit_0 = fit[(fit['phq_score'] < 5) & (fit['phq_score_t2'] < 5) & ((fit['phq_score'] > 0) | (fit['phq_score_t2'] > 0))]

# healthy control group
# NOTE(review): downsampling to n=40 is disabled (commented out) — the full
# control group is used as-is.
fit_00 = fit_0#.sample(n=40, random_state=1) #healthy group

# joining samples
fit_a = pd.concat([fit_10, fit_00]).reset_index(drop=True)

# Binary target recomputed on the joined sample with the same >= 10 rule.
fit_a['depression'] = np.where((fit_a['phq_score'] >= 10) | (fit_a['phq_score_t2'] >= 10), 1, 0) # 1:depressed, 0:healthy 

# get significant predictors
# Spearman screen against fit['phq.avg'] (read as a global inside the
# function) at p < 0.01; keeps only the significant predictor names.
res = calculate_spearman(fit[preds], p=0.01)
pred1_1 = list(res['pred'])

ds = fit_a
pr = pred1_1
# X = ds[pr].to_numpy()
y = ds['depression'].to_numpy()
ds = ds[pr]
#ds = ds.drop(columns=['depression.10', 'depression', 'pid', 'phq_3cat', 'phq_3cat_t2', 'phq_score', 'phq_score_t2', 'phq.avg'])

# --- Load the pre-built training table used by the CV loop below ------------
# NOTE(review): pandas, f1_score/accuracy_score/make_scorer are re-imported
# here (already imported at the top); GridSearchCV is imported but unused.
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, make_scorer
from tqdm import tqdm

# NOTE(review): this switches to a DIFFERENT dataset ("training_data.csv")
# and working directory; the `ds`/`y` prepared above are silently unused
# from this point on — the CV loop operates on `df` only.
os.chdir("/fsx/home-duyphung/pilev2/temp_use/digital_biomarkers/Statistical Analysis")
df = pd.read_csv("training_data.csv")
# Drop the index column left over from a previous to_csv without index=False.
df = df.drop(columns=['Unnamed: 0'])
# Encode the string label as 0/1 ("depressed" -> 1, anything else -> 0).
df['depression'] = df['depression'] == "depressed"
df['depression'] = df['depression'].astype(int)
feature_cols = [f for f in df.columns.tolist() if f != "depression"]
target_col = "depression"
print(feature_cols)

# --- Cross-validated 3-model ensemble, evaluated across decision thresholds --
# Bug fixes vs. the original: the sweep was dead code — `threshold` was
# overwritten to 0.5 inside the loop and an unconditional `break` ended it
# after one iteration; a 0.5/0.3/0.2 weighted-ensemble line was computed and
# immediately overwritten by the unweighted average; metrics were divided by
# a hard-coded 100 instead of the actual split count.
full_metrics = False
rskf = RepeatedStratifiedKFold(n_splits=4, n_repeats=25)
is_lgb = False
is_xgb = True

# Feature/target matrices are invariant across folds — build them once.
X = df[feature_cols].to_numpy()
y = df[target_col].to_numpy()
n_splits_total = rskf.get_n_splits()  # 4 folds x 25 repeats = 100

# XGBoost hyper-parameters (presumably from a prior grid search — confirm).
params = {'colsample_bytree': 1, 'gamma': 0.3, 'learning_rate': 0.3,
          'max_depth': 1, 'min_child_weight': 2, 'n_estimators': 100,
          'subsample': 0.8}

for threshold in np.arange(0.1, 0.9, 0.1):
    average_f1 = 0.0
    average_acc = 0.0
    for train_index, test_index in tqdm(rskf.split(df, df[target_col]), total=n_splits_total):
        # Ensemble member 1: XGBoost with DART boosting.
        clf = xgb.XGBClassifier(
            eval_metric='auc',
            objective='binary:logistic',
            booster='dart',
            **params
        )
        clf.fit(X[train_index], y[train_index])
        y_pred_xgb = clf.predict_proba(X[test_index])

        # Ensemble member 2: LightGBM with DART boosting (default params).
        clf2 = lgb.LGBMClassifier(
            objective='binary',
            boosting_type='dart',
        )
        clf2.fit(X[train_index], y[train_index])
        y_pred_lgb = clf2.predict_proba(X[test_index])

        # Ensemble member 3: shallow random forest.
        clf3 = RandomForestClassifier(
            n_estimators=100,
            max_depth=3
        )
        clf3.fit(X[train_index], y[train_index])
        y_pred_rf = clf3.predict_proba(X[test_index])

        # Unweighted average of the three probability estimates, then
        # threshold the positive-class column into hard labels.
        y_pred = (y_pred_xgb + y_pred_lgb + y_pred_rf) / 3
        y_pred = y_pred[:, 1] > threshold

        average_f1 += f1_score(y[test_index], y_pred)
        average_acc += accuracy_score(y[test_index], y_pred)

    print("Threshold: ", threshold)
    print("Average F1: ", average_f1 / n_splits_total)
    print("Average Accuracy: ", average_acc / n_splits_total)