import time
import datetime

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from load_data import *
from feature_engineering import *

# Data loading and preprocessing are driven from the __main__ block below,
# so nothing runs at import time.


def train_data(cleaned_data, column):
    # Fit a TF-IDF vectorizer (English stop words removed) and densify the
    # document-term matrix into a DataFrame keyed by vocabulary terms.
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    sparse_matrix = tfidf_vectorizer.fit_transform(cleaned_data[column])
    data_train_tfidf = pd.DataFrame(sparse_matrix.toarray(),
                                    columns=tfidf_vectorizer.get_feature_names_out())

    # Raw term counts over the same corpus, for use alongside the TF-IDF features.
    count_vectorizer = CountVectorizer(stop_words='english')
    sparse_matrix = count_vectorizer.fit_transform(cleaned_data[column])
    data_train_count = pd.DataFrame(sparse_matrix.toarray(),
                                    columns=count_vectorizer.get_feature_names_out())

    return data_train_tfidf, data_train_count, tfidf_vectorizer, count_vectorizer
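
# Usage sketch, assuming a hypothetical preprocessed DataFrame 'cleaned_df' with a
# column 'clean_text_short_description' produced by feature_engineering:
#   tfidf_df, count_df, tfidf_vec, count_vec = train_data(cleaned_df, 'clean_text_short_description')
#   query_vec = tfidf_vec.transform(['printer not responding'])  # reuse the fitted vocabulary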


# Load pre-trained 50-dimensional GloVe embeddings into a word -> vector dict.
embeddings_dict = {}
with open("glove_50d.txt", "r", encoding="utf-8") as f:
    for line in f:
        values = line.strip().split()
        token = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        embeddings_dict[token] = vector
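
# Sanity-check sketch: each value is a 50-dimensional float32 array, and
# out-of-vocabulary tokens are simply absent from the dict.
#   vec = embeddings_dict.get('server')   # None if 'server' is not in the GloVe file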

def train_data_context(unique_train_data_word, embeddings_dict):
    # Keep only words that have a GloVe vector; silently skip OOV words.
    unique_train_data = {}
    for word in unique_train_data_word:
        if word in embeddings_dict:
            unique_train_data[word] = embeddings_dict[word].tolist()
    Matching_data = pd.DataFrame(unique_train_data.items(),
                                 columns=['unique_train_data_word_embed', 'unique_train_data_matrix'])
    return Matching_data
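
# The returned frame has one row per in-vocabulary word:
# 'unique_train_data_word_embed' holds the word, 'unique_train_data_matrix' its 50-d vector.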


if __name__ == "__main__":

    print("********************************")
    print("------------STARTED-------------")
    print("********************************")

    start_time = time.time()
    start_date = str(datetime.datetime.now())
    
    # Loading data from DB/file
    df_train = load_data_from_file()

    # Data preprocessing / feature engineering
    df_act = data_pre_processing(df_train)

    print("Feature Engineering Done")
    print(df_act.head())
    
    # Training part: create the document-term matrices.
    # Assumes the preprocessing step emits a 'clean_text_person_who_resolved' column.
    new_target_col = 'clean_text_person_who_resolved'
    df_train_tfidf, df_train_tf_count, tfidf_vector, count_vector = train_data(df_act, new_target_col)
    print('Training Done for NLP Based TFIDF')
    
    print('---------------------------------')
    print('Contextual Training Started -----')
    print('---------------------------------')
    
    print(df_train.head())
    # Work on a copy so the raw frame is not mutated by the context features.
    df_act_context = df_train.copy()
    print("DF ACT Context before : ", df_act_context.shape)
    df_act_context['Noun'] = data_pre_processing_context(df_act_context)
    print("DF ACT Context After : ", df_act_context.shape)
    
    # Extract nouns from the same cleaned text column and collect the vocabulary.
    new_target_col = 'clean_text_person_who_resolved'
    df_act_context['Noun'] = df_act_context.apply(lambda row: noun_extraction(row[new_target_col]), axis=1)
    unique_train_data_word = unique_word_list(df_act_context, df_act_context['Noun'])
    print(unique_train_data_word)
    
    Matching_data = train_data_context(unique_train_data_word, embeddings_dict)
    print(Matching_data.head())
    print('Training Done for Contextual Search')
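
    # Report wall-clock runtime; start_time and start_date were captured above.
    print('---------------------------------')
    print('Started at : ', start_date)
    print('Total time taken (s) : ', round(time.time() - start_time, 2))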