Untitled
unknown
plain_text
2 years ago
3.9 kB
10
Indexable
# Standard library
import datetime
import io
import os
import pdb
import pickle
import shutil
import sys
import time

# Third-party
import numpy as np  # required: `np.asarray` is used when loading GloVe vectors
import pandas as pd
import pandas as pd  # NOTE(review): duplicate import kept from original
from joblib import dump, load
#from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#from string import digits

# Local
from load_data import *
from feature_engineering import *
data = load_data_from_file()
cleaned_data = data_pre_processing(data)
def train_data(cleaned_data, column):
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer = tfidf_vectorizer.fit(cleaned_data[column])
sparse_matrix = tfidf_vectorizer.fit_transform(cleaned_data[column])
doc_term_matrix = sparse_matrix.todense()
data_train_tfidf = pd.DataFrame(doc_term_matrix,
columns=tfidf_vectorizer.get_feature_names_out())
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer = CountVectorizer()
count_vectorizer = count_vectorizer.fit(cleaned_data[column])
sparse_matrix = count_vectorizer.fit_transform(cleaned_data[column])
doc_term_matrix = sparse_matrix.todense()
data_train_count = pd.DataFrame(doc_term_matrix,
columns=count_vectorizer.get_feature_names_out())
return data_train_tfidf, data_train_count, tfidf_vectorizer, count_vectorizer
embeddings_dict = {}
with open("glove_50d.txt", "r", encoding="utf-8") as f:
for line in f:
values = line.strip().split()
token = values[0]
vector = np.asarray(values[1:], dtype="float32")
embeddings_dict[token] = vector
import pandas as pd
def train_data_context(unique_train_data_word, embeddings_dict):
unique_train_data = {}
for word in unique_train_data_word:
try:
unique_train_data.update({word: embeddings_dict[word].tolist()})
except:
continue
Matching_data = pd.DataFrame(unique_train_data.items(), columns=['unique_train_data_word_embed', 'unique_train_data_matrix'])
return Matching_data
if __name__ == "__main__":
print("********************************")
print("------------STARTED-------------")
print("********************************")
start_time=time.time()
start_date=str(datetime.datetime.now())
#Loading Data from DB/File
df_train=load_data_from_file()
#Data Preprocessing
df_act=data_pre_processing(df_train)
print("Feature Engineering Done")
print(df_act.head())
#Training Part and creating the Matrix
new_target_col = 'clean_text_'+ df_act['person_who_resolved']
df_train_tfidf, df_train_tf_count,tfidf_vector,count_vector = train_data(df_act,new_target_col)
print('Training Done for NLP Based TFIDF')
print('---------------------------------')
print('contexual Training Started -----')
print('---------------------------------')
print(df_train.head())
df_act_context = df_train
print("DF ACT Context before : ",df_act_context.shape)
df_act_context['Noun']=data_pre_processing_context(df_act_context)
print("DF ACT Context After : ",df_act_context.shape)
#Training Part and creating the Matrix
new_target_col = 'clean_text_'+ df_act_context['person_who_resolved']
df_act_context['Noun']= df_act_context.apply(lambda row: noun_extraction(row[new_target_col]), axis=1)
unique_train_data_word = unique_word_list(df_act_context, df_act_context['Noun'])
print(unique_train_data_word)
Matching_data = train_data_context(unique_train_data_word)
print(Matching_data.head())
print('Training Done for contexual Search')
Editor is loading...