import pandas as pd
import numpy as np
import pickle
import time
import datetime
from joblib import dump, load
import shutil, os
import pdb
import io

#from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#from string import digits
from sklearn.feature_extraction.text import CountVectorizer

import sys

from load_data import *
from feature_engineering import *
from param_config import config

data = load_data_from_file()
cleaned_data = data_pre_processing(data)


def train_data(cleaned_data):
    # Combine input text with the 'person_who_resolved' column
    print("Cleaned Data : ",cleaned_data)
    cleaned_data['input_text'] = (
        cleaned_data['ticket_category'] + ' ' + cleaned_data['ticket_type'] + ' '
        + cleaned_data['ticket_item'] + ' ' + cleaned_data['ticket_summary'] + ' '
        + cleaned_data['ticket_desc'] + ' ' + cleaned_data['ticket_severity'] + ' '
        + cleaned_data['person_who_resolved']
    )

    # Create a TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    
    # Fit and transform the combined text data
    tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_data['input_text'])
    
    # Create a Count Vectorizer
    count_vectorizer = CountVectorizer(stop_words='english')
    
    # Fit and transform the combined text data
    count_matrix = count_vectorizer.fit_transform(cleaned_data['input_text'])
    
    # Convert TF-IDF and Count Vectorization results to dataframes
    data_train_tfidf = pd.DataFrame(tfidf_matrix.todense(), 
                      columns=tfidf_vectorizer.get_feature_names_out())
    
    data_train_count = pd.DataFrame(count_matrix.todense(), 
                      columns=count_vectorizer.get_feature_names_out())
    
    return data_train_tfidf, data_train_count, tfidf_vectorizer, count_vectorizer


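# Load pre-trained 50-d GloVe word vectors into a dict mapping token -> numpy array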
embeddings_dict = {}
with open("glove_50d.txt", "r", encoding="utf-8") as f:
    for line in f:
        values = line.strip().split()
        token = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        embeddings_dict[token] = vector


def train_data_context(unique_train_data_word, embeddings_dict):
    unique_train_data = {}
    for word in unique_train_data_word:
        try:
            unique_train_data.update({word: embeddings_dict[word].tolist()})
        except KeyError:
            # word has no GloVe embedding; skip it
            continue
    Matching_data = pd.DataFrame(unique_train_data.items(), columns=['unique_train_data_word_embed', 'unique_train_data_matrix'])
    return Matching_data


if __name__ == "__main__":


    print("********************************")
    print("------------STARTED-------------")
    print("********************************")

    start_time=time.time()
    start_date=str(datetime.datetime.now())
    print("Start Date : ",start_date)
    
    try:
        files = ['tfidf.joblib', 'tf_count.joblib', 'tfidf_vector.joblib','tf_countvector.joblib','raw_data.joblib']
        for f in files:
            shutil.copy(config.model_folder_name+f, config.archive_path)
    except OSError:
        print('No data found in the model folder, running for the first time')
    
    #Loading Data from DB/File
    df_train=load_data_from_file()
    
    #Data Preprocessing
    df_act=data_pre_processing(df_train)
    
    print("Feature Engineering Done")
    #print("DF Actual : ",df_act.head())
    
    #Training Part and creating the Matrix
    new_target_col = df_act['person_who_resolved']
    #print("New Target COlumn : ",new_target_col.head())
    df_train_tfidf, df_train_tf_count,tfidf_vector,count_vector = train_data(df_act)
    print('Training Done for NLP Based TFIDF')
    
    print('---------------------------------')
    print('Contextual Training Started -----')
    print('---------------------------------')
    
    #print(df_train.head())
    df_act_context = df_train
    #print("DF Act Context : ",df_act_context)
    print("DF ACT Context before : ",df_act_context.shape)
    df_act_context=data_pre_processing_context(df_act_context)  ## Changes made here
    print("DF ACT Context After : ",df_act_context.shape)
    #print("DF Act COntext After : ",df_act_context.head())
    
    #Training Part and creating the Matrix
    new_target_col = df_act_context['person_who_resolved']
    df_act_context['Noun'] = df_act_context.apply(lambda row: noun_extraction(row['person_who_resolved']), axis=1)
    print("DF ACT Context : ",df_act_context)
    unique_train_data_word = unique_word_list(df_act_context, 'Noun')
    print(unique_train_data_word)
    
    Matching_data = train_data_context(unique_train_data_word,embeddings_dict)
    print("Matching Data : ",Matching_data.head())
    print('Training Done for Contextual Search')
    
    ### Model dumping for contextual search
    dump(Matching_data, config.model_folder_name + config.model_matching_data_train)
    dump(embeddings_dict, config.model_folder_name + config.glove_vector_dict)
    dump(df_act_context, config.model_folder_name + config.context_data)
    print('Models successfully dumped in respective folder for contextual search')
    
    ### Model dumping for TF-IDF
    dump(df_train_tfidf, config.model_folder_name + config.model_tfidf)
    dump(df_train_tf_count, config.model_folder_name + config.model_tf_count)
    dump(tfidf_vector, config.model_folder_name + config.model_tfidf_vector)
    dump(count_vector, config.model_folder_name + config.model_tf_count_vector)
    dump(df_act, config.model_folder_name + config.raw_data)
    
    print('Models successfully dumped in respective folder')
    

Error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [1], in <cell line: 75>()
    103 new_target_col = df_act['person_who_resolved']
    104 #print("New Target COlumn : ",new_target_col.head())
--> 105 df_train_tfidf, df_train_tf_count,tfidf_vector,count_vector = train_data(df_act)
    106 print('Training Done for NLP Based TFIDF')
    108 print('---------------------------------')

Input In [1], in train_data(cleaned_data)
     34 tfidf_vectorizer = TfidfVectorizer(stop_words='english')
     36 # Fit and transform the combined text data
---> 37 tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_data['input_text'])
     39 # Create a Count Vectorizer
     40 count_vectorizer = CountVectorizer(stop_words='english')

File /Analytics/venv/CAPEANALYTICS/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:2133, in TfidfVectorizer.fit_transform(self, raw_documents, y)
   2126 self._check_params()
   2127 self._tfidf = TfidfTransformer(
   2128     norm=self.norm,
   2129     use_idf=self.use_idf,
   2130     smooth_idf=self.smooth_idf,
   2131     sublinear_tf=self.sublinear_tf,
   2132 )
-> 2133 X = super().fit_transform(raw_documents)
   2134 self._tfidf.fit(X)
   2135 # X is already a transformed view of raw_documents so
   2136 # we set copy to False

File /Analytics/venv/CAPEANALYTICS/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:1388, in CountVectorizer.fit_transform(self, raw_documents, y)
   1380             warnings.warn(
   1381                 "Upper case characters found in"
   1382                 " vocabulary while 'lowercase'"
   1383                 " is True. These entries will not"
   1384                 " be matched with any documents"
   1385             )
   1386             break
-> 1388 vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
   1390 if self.binary:
   1391     X.data.fill(1)

File /Analytics/venv/CAPEANALYTICS/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:1275, in CountVectorizer._count_vocab(self, raw_documents, fixed_vocab)
   1273 for doc in raw_documents:
   1274     feature_counter = {}
-> 1275     for feature in analyze(doc):
   1276         try:
   1277             feature_idx = vocabulary[feature]

File /Analytics/venv/CAPEANALYTICS/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:106, in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
     84 """Chain together an optional series of text processing steps to go from
     85 a single document to ngrams, with or without tokenizing or preprocessing.
     86 
   (...)
    102     A sequence of tokens, possibly with pairs, triples, etc.
    103 """
    105 if decoder is not None:
--> 106     doc = decoder(doc)
    107 if analyzer is not None:
    108     doc = analyzer(doc)

File /Analytics/venv/CAPEANALYTICS/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:239, in _VectorizerMixin.decode(self, doc)
    236     doc = doc.decode(self.encoding, self.decode_error)
    238 if doc is np.nan:
--> 239     raise ValueError(
    240         "np.nan is an invalid document, expected byte or unicode string."
    241     )
    243 return doc

ValueError: np.nan is an invalid document, expected byte or unicode string.
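
The traceback indicates that at least one document handed to fit_transform is NaN. In pandas, concatenating a string Series with a missing value yields NaN, so a single NaN in any of the seven columns makes the whole input_text for that row NaN. A quick check (column names as in train_data above, cleaned_data being the preprocessed frame) would be something like:

cols = ['ticket_category', 'ticket_type', 'ticket_item', 'ticket_summary',
        'ticket_desc', 'ticket_severity', 'person_who_resolved']
print(cleaned_data[cols].isna().sum())   # missing values per column that feeds input_text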


Main goal: I have created a concatenated string column (input_text) containing all the input columns as well as the target column person_who_resolved. I need to build TF-IDF vectors on this column, train on them, and later use those vectors to compute cosine similarity against incoming ticket data so that the matching person_who_resolved can be returned as the output.
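
A minimal sketch of that flow, assuming the same column names, that cleaned_data is the preprocessed frame from data_pre_processing above, and that missing values are simply replaced with empty strings before concatenation (dropping the affected rows would be the other option). The predict_resolver helper and the example query string are hypothetical, only meant to illustrate the cosine-similarity lookup:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

text_cols = ['ticket_category', 'ticket_type', 'ticket_item', 'ticket_summary',
             'ticket_desc', 'ticket_severity', 'person_who_resolved']

# Replace NaN with '' so the row-wise concatenation can never produce NaN.
cleaned_data[text_cols] = cleaned_data[text_cols].fillna('').astype(str)
cleaned_data['input_text'] = cleaned_data[text_cols].agg(' '.join, axis=1)

# Fit TF-IDF on the combined training text.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_data['input_text'])

def predict_resolver(new_ticket_text):
    """Return person_who_resolved of the most similar training ticket."""
    query_vec = tfidf_vectorizer.transform([new_ticket_text])
    sims = cosine_similarity(query_vec, tfidf_matrix).ravel()
    return cleaned_data['person_who_resolved'].iloc[sims.argmax()]

# Hypothetical new ticket (input fields only, no resolver):
# print(predict_resolver('network hardware outage building A high severity'))

At query time the new ticket contains only the input fields, so the person_who_resolved terms in the training vectors simply contribute nothing to the query vector.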