Untitled
plain_text
22 days ago
5.6 kB
1
Indexable
Never
"""Training script: fits TF-IDF / count vectorizers and builds a GloVe-based
contextual lookup table for the 'person_who_resolved' column, then dumps all
artifacts with joblib into the configured model folder.
"""
import datetime
import io
import os
import pdb
import pickle
import shutil
import sys
import time

import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from load_data import *
from feature_engineering import *
from param_config import config

# Path to the pre-trained GloVe vectors (50-dimensional, text format).
GLOVE_PATH = "glove_50d.txt"


def train_data(cleaned_data, column):
    """Fit TF-IDF and count vectorizers on cleaned_data['person_who_resolved'].

    Parameters
    ----------
    cleaned_data : pandas.DataFrame
        Preprocessed data; must contain a 'person_who_resolved' column.
    column : pandas.Series
        Target column — currently only printed for traceability; the fit is
        hard-wired to cleaned_data['person_who_resolved'].

    Returns
    -------
    tuple
        (data_train_tfidf, data_train_count, tfidf_vectorizer, count_vectorizer)
        — dense document-term DataFrames plus the fitted vectorizers.
    """
    print("Target Column : ", column)

    # NOTE(review): the original constructed a vectorizer with
    # stop_words='english' and immediately replaced it with a default one;
    # the default (no stop-word filtering) is kept here to preserve the
    # trained vocabulary — confirm which was intended.
    tfidf_vectorizer = TfidfVectorizer()
    # fit_transform both learns the vocabulary and builds the matrix; the
    # original called fit() and then fit_transform(), fitting twice for the
    # same result.
    sparse_tfidf = tfidf_vectorizer.fit_transform(cleaned_data['person_who_resolved'])
    # toarray() yields a plain ndarray (todense() returns the deprecated
    # np.matrix type).
    data_train_tfidf = pd.DataFrame(
        sparse_tfidf.toarray(),
        columns=tfidf_vectorizer.get_feature_names_out(),
    )
    print("Data Train TF-IDF : ", data_train_tfidf)

    count_vectorizer = CountVectorizer()
    sparse_count = count_vectorizer.fit_transform(cleaned_data['person_who_resolved'])
    data_train_count = pd.DataFrame(
        sparse_count.toarray(),
        columns=count_vectorizer.get_feature_names_out(),
    )
    print("Data Train Count : ", data_train_count)

    return data_train_tfidf, data_train_count, tfidf_vectorizer, count_vectorizer


def load_glove_embeddings(path=GLOVE_PATH):
    """Load a GloVe text file into a {token: float32 ndarray} dict.

    Each line of the file is expected to be: token followed by the vector
    components, whitespace-separated.
    """
    embeddings = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            values = line.strip().split()
            if not values:  # skip blank lines defensively
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype="float32")
    return embeddings


def train_data_context(unique_train_data_word, embeddings_dict):
    """Map each word to its GloVe vector, skipping out-of-vocabulary words.

    Parameters
    ----------
    unique_train_data_word : iterable of str
        Candidate words (unique nouns extracted from the training data).
    embeddings_dict : dict
        token -> numpy vector mapping (see load_glove_embeddings).

    Returns
    -------
    pandas.DataFrame
        Columns 'unique_train_data_word_embed' (word) and
        'unique_train_data_matrix' (embedding as a Python list).
    """
    unique_train_data = {}
    for word in unique_train_data_word:
        try:
            unique_train_data[word] = embeddings_dict[word].tolist()
        except KeyError:
            # Word has no GloVe embedding — drop it (the original used a
            # bare except, which also hid unrelated errors).
            continue
    Matching_data = pd.DataFrame(
        unique_train_data.items(),
        columns=['unique_train_data_word_embed', 'unique_train_data_matrix'],
    )
    return Matching_data


if __name__ == "__main__":
    print("********************************")
    print("------------STARTED-------------")
    print("********************************")
    start_date = str(datetime.datetime.now())
    print("Start Date : ", start_date)

    # Archive the previous run's artifacts, if any exist.  OSError covers
    # missing files/folders (FileNotFoundError) and copy failures; the
    # original bare except also swallowed programming errors.
    files = ['tfidf.joblib', 'tf_count.joblib', 'tfidf_vector.joblib',
             'tf_countvector.joblib', 'raw_data.joblib']
    try:
        for f in files:
            shutil.copy(config.model_folder_name + f, config.archive_path)
    except OSError:
        print('No Data Found in Model Folder, Running for 1st time')

    # Loading data from DB/file and preprocessing.  The original also ran
    # this pair at import time, loading the data twice per run.
    df_train = load_data_from_file()
    df_act = data_pre_processing(df_train)
    print("Feature Engineering Done")

    # Training part — create the TF-IDF / count matrices.
    new_target_col = df_act['person_who_resolved']
    df_train_tfidf, df_train_tf_count, tfidf_vector, count_vector = train_data(df_act, new_target_col)
    print('Training Done for NLP Based TFIDF')

    print('---------------------------------')
    print('contexual Training Started -----')
    print('---------------------------------')
    embeddings_dict = load_glove_embeddings()
    df_act_context = df_train
    print("DF ACT Context before : ", df_act_context.shape)
    df_act_context = data_pre_processing_context(df_act_context)
    print("DF ACT Context After : ", df_act_context.shape)

    # Extract nouns per row, then collect the unique vocabulary to embed.
    df_act_context['Noun'] = df_act_context.apply(
        lambda row: noun_extraction(row['person_who_resolved']), axis=1)
    print("DF ACT Context : ", df_act_context)
    unique_train_data_word = unique_word_list(df_act_context, 'Noun')
    print(unique_train_data_word)
    Matching_data = train_data_context(unique_train_data_word, embeddings_dict)
    print("Matching Data : ", Matching_data.head())
    print('Training Done for contexual Search')

    # Model dumping for contextual search.
    dump(Matching_data, config.model_folder_name + config.model_matching_data_train)
    dump(embeddings_dict, config.model_folder_name + config.glove_vector_dict)
    dump(df_act_context, config.model_folder_name + config.context_data)
    print('Models successfully dumped in respetive folder for contexual search')

    # Model dumping for TF-IDF.
    dump(df_train_tfidf, config.model_folder_name + config.model_tfidf)
    dump(df_train_tf_count, config.model_folder_name + config.model_tf_count)
    dump(tfidf_vector, config.model_folder_name + config.model_tfidf_vector)
    dump(count_vector, config.model_folder_name + config.model_tf_count_vector)
    dump(df_act, config.model_folder_name + config.raw_data)
    print('Models successfully dumped in respetive folder')