import time
import datetime

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from load_data import *
from feature_engineering import *


def train_data(cleaned_data, column):
    """Fit TF-IDF and count vectorizers on one text column and return the
    dense document-term matrices alongside the fitted vectorizers."""
    # The original constructed each vectorizer twice (with and without
    # stop_words='english') and fit it twice; a single fit_transform with
    # stop-word removal is equivalent to the apparent intent.
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    sparse_matrix = tfidf_vectorizer.fit_transform(cleaned_data[column])
    data_train_tfidf = pd.DataFrame(sparse_matrix.toarray(),
                                    columns=tfidf_vectorizer.get_feature_names_out())

    count_vectorizer = CountVectorizer(stop_words='english')
    sparse_matrix = count_vectorizer.fit_transform(cleaned_data[column])
    data_train_count = pd.DataFrame(sparse_matrix.toarray(),
                                    columns=count_vectorizer.get_feature_names_out())

    return data_train_tfidf, data_train_count, tfidf_vectorizer, count_vectorizer


# Load the 50-dimensional GloVe embeddings into a word -> vector dict.
embeddings_dict = {}
with open("glove_50d.txt", "r", encoding="utf-8") as f:
    for line in f:
        values = line.strip().split()
        token = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        embeddings_dict[token] = vector


def train_data_context(unique_train_data_word, embeddings_dict):
    """Look up the GloVe vector for every unique training word, skipping
    out-of-vocabulary words."""
    unique_train_data = {}
    for word in unique_train_data_word:
        try:
            unique_train_data[word] = embeddings_dict[word].tolist()
        except KeyError:
            continue
    Matching_data = pd.DataFrame(unique_train_data.items(),
                                 columns=['unique_train_data_word_embed',
                                          'unique_train_data_matrix'])
    return Matching_data


if __name__ == "__main__":
    print("********************************")
    print("------------STARTED-------------")
    print("********************************")
    start_time = time.time()
    start_date = str(datetime.datetime.now())

    # Loading data from DB/file
    df_train = load_data_from_file()

    # Data preprocessing
    df_act = data_pre_processing(df_train)
    print("Feature Engineering Done")
    print(df_act.head())

    # Training part: build the TF-IDF / count matrices.
    # The original concatenated 'clean_text_' with the column's values,
    # producing a Series; the intended target appears to be the name of the
    # cleaned column produced by data_pre_processing.
    new_target_col = 'clean_text_person_who_resolved'
    df_train_tfidf, df_train_tf_count, tfidf_vector, count_vector = train_data(df_act, new_target_col)
    print('Training Done for NLP Based TFIDF')
    print('---------------------------------')
    print('Contextual Training Started -----')
    print('---------------------------------')
    print(df_train.head())

    df_act_context = df_train
    print("DF ACT Context before : ", df_act_context.shape)
    df_act_context['Noun'] = data_pre_processing_context(df_act_context)
    print("DF ACT Context After : ", df_act_context.shape)

    # Training part: extract nouns per row, then build the embedding table.
    df_act_context['Noun'] = df_act_context.apply(
        lambda row: noun_extraction(row[new_target_col]), axis=1)
    unique_train_data_word = unique_word_list(df_act_context, df_act_context['Noun'])
    print(unique_train_data_word)

    # The original call omitted the embeddings_dict argument.
    Matching_data = train_data_context(unique_train_data_word, embeddings_dict)
    print(Matching_data.head())
    print('Training Done for Contextual Search')
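
# ---------------------------------------------------------------------------
# Usage sketch (an assumption, not part of the original pipeline): the script
# fits the vectorizers and builds Matching_data but never queries them, and
# cosine_similarity is imported without being called. The two helpers below
# show one plausible way the trained artifacts could serve a lookup; the
# names match_new_ticket and embed_query_nouns are hypothetical, and in a
# real module they would sit above the __main__ guard.
# ---------------------------------------------------------------------------
def match_new_ticket(query_text, fitted_tfidf, data_train_tfidf, top_n=5):
    """Return indices of the top_n training rows most similar to query_text."""
    # Vectorize the query with the already-fitted vocabulary.
    query_vec = fitted_tfidf.transform([query_text])
    # Cosine similarity of the query against every training document.
    scores = cosine_similarity(query_vec, data_train_tfidf.values).ravel()
    # Highest-scoring rows first.
    return scores.argsort()[::-1][:top_n]


def embed_query_nouns(nouns, embeddings_dict):
    """Average the GloVe vectors of the query nouns, skipping OOV words."""
    vectors = [embeddings_dict[w] for w in nouns if w in embeddings_dict]
    return np.mean(vectors, axis=0) if vectors else None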