Untitled
unknown
python
a year ago
6.3 kB
15
Indexable
""" Pipeline code for training and evaluating the sentiment classifier. We use the Deepmoji architecture here, see https://github.com/bfelbo/DeepMoji for detail. """ import re import codecs import random import numpy as np import sys import json import argparse from sklearn.metrics import confusion_matrix, f1_score, classification_report sys.path.append("DeepMoji/deepmoji/") from sentence_tokenizer import SentenceTokenizer from model_def import deepmoji_architecture, load_specific_weights from finetuning import load_benchmark, finetune from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score from sklearn.model_selection import train_test_split MAX_LEN = 150 # def load_data(filename): # f = codecs.open(filename, "r", "utf-8") # data_pair = [] # for line in f: # line = line.strip().split("\t") # data_pair.append((line[0], line[1])) # return data_pair # def prepare_5fold(data_pair): # sind = 0 # eind = 0 # random.shuffle(data_pair) # fold_size = int(len(data_pair) / 5) # for fold in range(0, 5): # sind = eind # eind = sind + fold_size # train_pair = data_pair[0:sind] + data_pair[eind:len(data_pair)] # test_pair = data_pair[sind:eind] # yield (train_pair, test_pair) import pandas as pd def load_data(train_file, test_file): train_df = pd.read_csv(train_file) val_df = pd.read_csv(test_file) return train_df, val_df if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--model", type=str, required=True, choices=["SEntiMoji", "SEntiMoji-T", "SEntiMoji-G"], help="name of pretrained representation model") parser.add_argument("--task", type=str.lower, required=True, choices=["sentiment", "emotion"], help="specify task (sentiment or emotion)") parser.add_argument("--benchmark_dataset_name", type=str, required=False, choices=["Jira", "GitHub", "StackOverflow", "CodeReview", "JavaLib"], help="name of benchmark dataset") parser.add_argument("--emotion_type", required=False, default=None, choices=["Anger", "Love", "Fear", "Sadness", "Joy", "Surprise"], help="specify emotion dataset") parser.add_argument("--use_own_dataset", action='store_true', help="whether use your own dataset or not") parser.add_argument("--own_dataset_dir", type=str, required=False, default=None, help="directory of your train data file") parser.add_argument("--own_dataset_file", type=str, required=False, default=None, help="file name of your train data file") args = parser.parse_args() print("args:") d = args.__dict__ for key,value in d.items(): print("%s = %s"%(key,value)) # parse arguments model_path = "../../model/representation_model/model_%s.hdf5" % args.model vocab_path = "vocabulary/vocabulary_%s.json" % args.model # load vocabulary with open(vocab_path, "r") as f_vocab: vocabulary = json.load(f_vocab) try: label2index_path = "label2index/emotion/label2index_2class.json" # Load the training and validation data train_file = "/home/imranm3/emotion_classification/datasets/SOEmotion-Train.csv" test_file = "/home/imranm3/emotion_classification/datasets/SOEmotion-Test.csv" train_df, val_df = load_data(train_file, test_file) except RuntimeError as e: print("Error:", repr(e)) st = SentenceTokenizer(vocabulary, MAX_LEN) fold = 1 # no fold for item in range(fold): # prepare training, validation, testing set train_text = train_df["Text"] train_label = train_df[args.emotion_type] test_text = val_df["Text"] test_label = val_df[args.emotion_type] train_X, _, _ = st.tokenize_sentences(train_text) test_X, _, _ = st.tokenize_sentences(test_text) train_y = np.array(train_label) test_y = np.array(test_label) nb_classes = 2 nb_tokens = len(vocabulary) # model model = deepmoji_architecture(nb_classes=nb_classes, nb_tokens=nb_tokens, maxlen=MAX_LEN, embed_dropout_rate=0.25, final_dropout_rate=0.5, embed_l2=1E-6) model.summary() # load pretrained representation model load_specific_weights(model, model_path, nb_tokens, MAX_LEN, exclude_names=["softmax"]) # train model model, acc = finetune(model, [train_X, test_X, test_X], [train_y, test_y, test_y], nb_classes, 100, method="chain-thaw", verbose=2) pred_y_prob = model.predict(test_X) if nb_classes == 2: pred_y = [0 if p < 0.5 else 1 for p in pred_y_prob] else: pred_y = np.argmax(pred_y_prob, axis=1) # evaluation print("*****************************************") print("Fold %d" % fold) accuracy = accuracy_score(test_y, pred_y) print("Accuracy: %.3f" % accuracy) precision = precision_score(test_y, pred_y, average=None) recall = recall_score(test_y, pred_y, average=None) f1score = f1_score(test_y, pred_y, average=None) for index in range(0, nb_classes): print("label: %s" % index) print("Precision: %.3f, Recall: %.3f, F1 score: %.3f" % (precision[index], recall[index], f1score[index])) print("*****************************************") print(confusion_matrix(test_y, pred_y)) print(classification_report(test_y, pred_y)) # save predict result if not args.use_own_dataset: if args.task == "sentiment": save_name = "result_%s_%s_fold%d.txt" % (args.model, args.benchmark_dataset_name, fold) elif args.task == "emotion": save_name = "result_%s_%s_%s_fold%d.txt" % (args.model, args.benchmark_dataset_name, args.emotion_type, fold) else: save_name = "result_fold%d.txt" % fold with open(save_name, "w") as f: for i in range(0, len(test_text)): f.write("%s\t%s\t%s\r\n" % (test_text[i], pred_y[i], test_label[i])) fold += 1
Editor is loading...
Leave a Comment