"""Training script: fits TF-IDF and count vectorizers on the
'person_who_resolved' column, builds GloVe-based matching data for
contextual search, and dumps the fitted artifacts with joblib."""
import datetime
import io
import os
import pdb
import pickle
import shutil
import sys
import time

import numpy as np
import pandas as pd
from joblib import dump, load
#from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#from string import digits

from load_data import *
from feature_engineering import *
from param_config import config

# Module-import-time load and preprocess.
# NOTE(review): the __main__ block below recomputes both of these and
# `data` / `cleaned_data` are never read afterwards — this duplicates a
# potentially expensive load on every import. Confirm whether it can go.
data = load_data_from_file()
cleaned_data = data_pre_processing(data)


def train_data(cleaned_data, column):
    """Fit TF-IDF and count vectorizers on the 'person_who_resolved' column.

    Parameters
    ----------
    cleaned_data : pandas.DataFrame
        Pre-processed data; must contain a 'person_who_resolved' column.
    column : object
        Target column identifier; only logged (kept for interface
        compatibility with existing callers).

    Returns
    -------
    tuple
        (tfidf doc-term DataFrame, count doc-term DataFrame,
         fitted TfidfVectorizer, fitted CountVectorizer)
    """
    print("Target Column : ", column)

    corpus = cleaned_data['person_who_resolved']

    # Fit once with fit_transform. The original constructed each vectorizer
    # twice (the stop_words='english' instance was immediately discarded) and
    # then called fit followed by fit_transform, fitting the same data twice.
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
    # .toarray() instead of .todense(): returns an ndarray rather than the
    # deprecated np.matrix, with identical values.
    data_train_tfidf = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=tfidf_vectorizer.get_feature_names_out())
    print("Data Train TF-IDF : ", data_train_tfidf)

    count_vectorizer = CountVectorizer()
    count_matrix = count_vectorizer.fit_transform(corpus)
    data_train_count = pd.DataFrame(
        count_matrix.toarray(),
        columns=count_vectorizer.get_feature_names_out())
    print("Data Train Count : ", data_train_count)

    return data_train_tfidf, data_train_count, tfidf_vectorizer, count_vectorizer


# Load the GloVe vectors into a word -> float32 vector lookup table.
embeddings_dict = {}
with open("glove_50d.txt", "r", encoding="utf-8") as glove_file:
    for raw_line in glove_file:
        parts = raw_line.strip().split()
        # First field is the token; the remaining fields are its coefficients.
        embeddings_dict[parts[0]] = np.asarray(parts[1:], dtype="float32")


def train_data_context(unique_train_data_word, embeddings_dict):
    """Map each training word to its embedding, skipping OOV words.

    Parameters
    ----------
    unique_train_data_word : iterable of str
        Words extracted from the training data.
    embeddings_dict : dict[str, numpy.ndarray]
        Word -> embedding vector lookup table.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'unique_train_data_word_embed' (the word) and
        'unique_train_data_matrix' (its embedding as a list). Words not in
        the vocabulary are silently dropped, as before.
    """
    # Explicit membership test replaces the bare `except: continue`, which
    # silently swallowed *every* exception, not just missing keys.
    unique_train_data = {
        word: embeddings_dict[word].tolist()
        for word in unique_train_data_word
        if word in embeddings_dict
    }
    Matching_data = pd.DataFrame(
        unique_train_data.items(),
        columns=['unique_train_data_word_embed', 'unique_train_data_matrix'])
    return Matching_data


if __name__ == "__main__":

    print("********************************")
    print("------------STARTED-------------")
    print("********************************")

    start_time = time.time()
    start_date = str(datetime.datetime.now())
    print("Start Date : ", start_date)

    # Archive any previously dumped models before this run overwrites them.
    # Narrowed from a bare `except:` so that real failures (bad config names,
    # permission problems) surface instead of being misreported as a first run.
    try:
        files = ['tfidf.joblib', 'tf_count.joblib', 'tfidf_vector.joblib',
                 'tf_countvector.joblib', 'raw_data.joblib']
        for f in files:
            shutil.copy(config.model_folder_name + f, config.archive_path)
    except OSError:
        print('No Data Found in Model Folder, Running for 1st time')

    # Loading Data from DB/File
    df_train = load_data_from_file()

    # Data Preprocessing
    df_act = data_pre_processing(df_train)

    print("Feature Engineering Done")

    # Training part: fit vectorizers and build the doc-term matrices.
    # NOTE(review): train_data only logs its second argument; passing the
    # whole Series here mirrors the original call — confirm intent.
    new_target_col = df_act['person_who_resolved']
    df_train_tfidf, df_train_tf_count, tfidf_vector, count_vector = train_data(df_act, new_target_col)
    print('Training Done for NLP Based TFIDF')

    print('---------------------------------')
    print('contexual Training Started -----')
    print('---------------------------------')

    df_act_context = df_train
    print("DF ACT Context before : ", df_act_context.shape)
    df_act_context = data_pre_processing_context(df_act_context)
    print("DF ACT Context After : ", df_act_context.shape)

    # Contextual training: extract nouns from each record, then map the
    # unique nouns to their GloVe embeddings.
    new_target_col = df_act_context['person_who_resolved']
    df_act_context['Noun'] = df_act_context.apply(
        lambda row: noun_extraction(row['person_who_resolved']), axis=1)
    print("DF ACT Context : ", df_act_context)
    unique_train_data_word = unique_word_list(df_act_context, 'Noun')
    print(unique_train_data_word)

    Matching_data = train_data_context(unique_train_data_word, embeddings_dict)
    print("Matching Data : ", Matching_data.head())
    print('Training Done for contexual Search')

    # Model dumping for contextual search
    dump(Matching_data, config.model_folder_name + config.model_matching_data_train)
    dump(embeddings_dict, config.model_folder_name + config.glove_vector_dict)
    dump(df_act_context, config.model_folder_name + config.context_data)
    print('Models successfully dumped in respetive folder for contexual search')

    # Model dumping for TFIDF
    dump(df_train_tfidf, config.model_folder_name + config.model_tfidf)
    dump(df_train_tf_count, config.model_folder_name + config.model_tf_count)
    dump(tfidf_vector, config.model_folder_name + config.model_tfidf_vector)
    dump(count_vector, config.model_folder_name + config.model_tf_count_vector)
    dump(df_act, config.model_folder_name + config.raw_data)

    print('Models successfully dumped in respetive folder')
    