Untitled
unknown
plain_text
2 years ago
3.9 kB
9
Indexable
import pandas as pd
import pickle
import time
import datetime
from joblib import dump, load
import shutil, os
import pdb
import io
import pandas as pd
#from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#from string import digits
from sklearn.feature_extraction.text import CountVectorizer
import sys
from load_data import *
from feature_engineering import *
# Load the raw dataset and preprocess it as soon as this module is imported.
# NOTE(review): load_data_from_file / data_pre_processing are presumably
# supplied by the wildcard imports of load_data / feature_engineering above --
# confirm. The __main__ section further down calls both helpers again rather
# than reusing `data` / `cleaned_data`.
data = load_data_from_file()
cleaned_data = data_pre_processing(data)
def _fit_to_frame(vectorizer, texts):
    """Fit *vectorizer* on *texts* and return the document-term matrix as a DataFrame."""
    # fit_transform learns the vocabulary and produces the matrix in one pass,
    # so a separate fit() call beforehand would only repeat the same work.
    sparse_matrix = vectorizer.fit_transform(texts)
    return pd.DataFrame(
        sparse_matrix.toarray(),
        columns=vectorizer.get_feature_names_out(),
    )


def train_data(cleaned_data, column):
    """Build TF-IDF and raw-count document-term matrices for one text column.

    Args:
        cleaned_data: Table-like object indexed as ``cleaned_data[column]`` to
            obtain the documents to vectorize.
        column: Key used to select the text documents from ``cleaned_data``.

    Returns:
        A 4-tuple ``(data_train_tfidf, data_train_count, tfidf_vectorizer,
        count_vectorizer)``: the two dense document-term DataFrames (one
        column per vocabulary term) followed by the fitted vectorizers.
    """
    texts = cleaned_data[column]

    # NOTE(review): the previous code built each vectorizer with
    # stop_words='english' and immediately replaced it with a default one, so
    # no stop-word filtering was ever applied. That effective behaviour is
    # kept here -- confirm whether stop-word removal was actually intended.
    tfidf_vectorizer = TfidfVectorizer()
    data_train_tfidf = _fit_to_frame(tfidf_vectorizer, texts)

    count_vectorizer = CountVectorizer()
    data_train_count = _fit_to_frame(count_vectorizer, texts)

    return data_train_tfidf, data_train_count, tfidf_vectorizer, count_vectorizer
# np.asarray is used below; import numpy explicitly instead of relying on one
# of the wildcard imports at the top of the file to expose an `np` alias.
import numpy as np

# Token -> embedding vector lookup, filled from "glove_50d.txt" at import time.
# Each non-blank line is parsed as: <token> <float> <float> ...
embeddings_dict = {}
with open("glove_50d.txt", "r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line (e.g. a trailing empty line): there is no token to
            # read, and values[0] would raise IndexError.
            continue
        token = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        embeddings_dict[token] = vector
import pandas as pd
def train_data_context(unique_train_data_word, embeddings_dict):
    """Pair each known word with its embedding vector in a two-column DataFrame.

    Args:
        unique_train_data_word: Iterable of words to look up.
        embeddings_dict: Mapping from word to its embedding vector (anything
            with a ``tolist()`` method, or a plain sequence of numbers).

    Returns:
        A DataFrame with columns ``unique_train_data_word_embed`` (the word)
        and ``unique_train_data_matrix`` (its vector as a Python list). Words
        without an embedding are skipped; repeated words appear once.
    """
    unique_train_data = {}
    for word in unique_train_data_word:
        try:
            vector = embeddings_dict[word]
        except (KeyError, TypeError):
            # KeyError: the word has no embedding. TypeError: the entry is
            # unhashable and so can never be a key. Both mean "no embedding";
            # any other error is a real problem and is allowed to propagate.
            continue
        # Accept plain sequences as well as array-like vectors.
        unique_train_data[word] = (
            vector.tolist() if hasattr(vector, "tolist") else list(vector)
        )

    matching_data = pd.DataFrame(
        list(unique_train_data.items()),
        columns=['unique_train_data_word_embed', 'unique_train_data_matrix'],
    )
    return matching_data
if __name__ == "__main__":
    # Script entry point: runs the TF-IDF/count training first, then the
    # embedding-based ("contextual") training, printing progress along the way.
    print("********************************")
    print("------------STARTED-------------")
    print("********************************")
    # Start markers for the run; nothing in the visible code reads them back.
    start_time=time.time()
    start_date=str(datetime.datetime.now())
    #Loading Data from DB/File
    df_train=load_data_from_file()
    #Data Preprocessing
    df_act=data_pre_processing(df_train)
    print("Feature Engineering Done")
    print(df_act.head())
    #Training Part and creating the Matrix
    # NOTE(review): if df_act['person_who_resolved'] is a pandas Series, this
    # concatenation is element-wise and yields a Series of strings rather than
    # one column name -- confirm this is the selector train_data should get.
    new_target_col = 'clean_text_'+ df_act['person_who_resolved']
    df_train_tfidf, df_train_tf_count,tfidf_vector,count_vector = train_data(df_act,new_target_col)
    print('Training Done for NLP Based TFIDF')
    print('---------------------------------')
    print('contexual Training Started -----')
    print('---------------------------------')
    print(df_train.head())
    # Plain assignment, not a copy: the 'Noun' column added below is written
    # to the same object that df_train refers to.
    df_act_context = df_train
    print("DF ACT Context before : ",df_act_context.shape)
    df_act_context['Noun']=data_pre_processing_context(df_act_context)
    print("DF ACT Context After : ",df_act_context.shape)
    #Training Part and creating the Matrix
    new_target_col = 'clean_text_'+ df_act_context['person_who_resolved']
    # Overwrites the 'Noun' column assigned a few lines above.
    df_act_context['Noun']= df_act_context.apply(lambda row: noun_extraction(row[new_target_col]), axis=1)
    unique_train_data_word = unique_word_list(df_act_context, df_act_context['Noun'])
    print(unique_train_data_word)
    # train_data_context requires the embedding lookup as its second argument;
    # calling it with only the word list raises TypeError.
    Matching_data = train_data_context(unique_train_data_word, embeddings_dict)
    print(Matching_data.head())
    print('Training Done for contexual Search')
Editor is loading...