import pandas as pd
import numpy as np
import pickle
import time
import datetime
from joblib import dump, load
import shutil, os
import pdb
import io

#from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#from string import digits
from sklearn.feature_extraction.text import CountVectorizer

import sys

from load_data import *
from feature_engineering import *
from param_config import config

data = load_data_from_file()
cleaned_data = data_pre_processing(data)


def train_data(cleaned_data):
    # Combine input text with the 'person_who_resolved' column
    print("Cleaned Data : ",cleaned_data)
    cleaned_data['input_text'] = cleaned_data['ticket_category'] + ' ' + cleaned_data['ticket_type'] + ' ' + cleaned_data['ticket_item'] + ' ' + cleaned_data['ticket_summary'] + ' ' + cleaned_data['ticket_desc'] + ' ' + cleaned_data['ticket_severity'] + ' ' + cleaned_data['person_who_resolved']
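    # Note: if any of these columns contains NaN for a given row, the whole
    # concatenated string becomes NaN, which TfidfVectorizer will reject later.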

    # Create a TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    
    # Fit and transform the combined text data
    tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_data['input_text'])
    
    # Create a Count Vectorizer
    count_vectorizer = CountVectorizer(stop_words='english')
    
    # Fit and transform the combined text data
    count_matrix = count_vectorizer.fit_transform(cleaned_data['input_text'])
    
    # Convert TF-IDF and Count Vectorization results to dataframes
    data_train_tfidf = pd.DataFrame(tfidf_matrix.todense(), 
                      columns=tfidf_vectorizer.get_feature_names_out())
    
    data_train_count = pd.DataFrame(count_matrix.todense(), 
                      columns=count_vectorizer.get_feature_names_out())
    
    return data_train_tfidf, data_train_count, tfidf_vectorizer, count_vectorizer


# Load GloVe 50-d embeddings into a dict mapping token -> 50-dim float32 vector.
embeddings_dict = {}
with open("glove_50d.txt", "r", encoding="utf-8") as f:
    for line in f:
        values = line.strip().split()
        token = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        embeddings_dict[token] = vector


def train_data_context(unique_train_data_word, embeddings_dict):
    unique_train_data = {}
    for word in unique_train_data_word:
        try:
            unique_train_data.update({word: embeddings_dict[word].tolist()})
        except KeyError:
            # Skip words that have no GloVe embedding.
            continue
    Matching_data = pd.DataFrame(unique_train_data.items(), columns=['unique_train_data_word_embed', 'unique_train_data_matrix'])
    return Matching_data


if __name__ == "__main__":


    print("********************************")
    print("------------STARTED-------------")
    print("********************************")

    start_time=time.time()
    start_date=str(datetime.datetime.now())
    print("Start Date : ",start_date)
    
    try:
        files = ['tfidf.joblib', 'tf_count.joblib', 'tfidf_vector.joblib','tf_countvector.joblib','raw_data.joblib']
        for f in files:
            shutil.copy(config.model_folder_name+f, config.archive_path)
    except Exception:
        print('No Data Found in Model Folder, Running for 1st time')
    
    #Loading Data from DB/File
    df_train=load_data_from_file()
    
    #Data Preprocessing
    df_act=data_pre_processing(df_train)
    
    print("Feature Engineering Done")
    #print("DF Actual : ",df_act.head())
    
    #Training Part and creating the Matrix
    new_target_col = df_act['person_who_resolved']
    #print("New Target COlumn : ",new_target_col.head())
    df_train_tfidf, df_train_tf_count,tfidf_vector,count_vector = train_data(df_act)
    print('Training Done for NLP Based TFIDF')
    
    print('---------------------------------')
    print('Contextual Training Started -----')
    print('---------------------------------')
    
    #print(df_train.head())
    df_act_context = df_train
    #print("DF Act Context : ",df_act_context)
    print("DF ACT Context before : ",df_act_context.shape)
    df_act_context=data_pre_processing_context(df_act_context)  ## Changes made here
    print("DF ACT Context After : ",df_act_context.shape)
    #print("DF Act COntext After : ",df_act_context.head())
    
    #Training Part and creating the Matrix
    new_target_col = df_act_context['person_who_resolved']
    df_act_context['Noun'] = df_act_context.apply(lambda row: noun_extraction(row['person_who_resolved']), axis=1)
    print("DF ACT Context : ",df_act_context)
    unique_train_data_word = unique_word_list(df_act_context, 'Noun')
    print(unique_train_data_word)
    
    Matching_data = train_data_context(unique_train_data_word,embeddings_dict)
    print("Matching Data : ",Matching_data.head())
    print('Training Done for Contextual Search')
    
    ### Model dumping for contextual search
    dump(Matching_data, config.model_folder_name + config.model_matching_data_train)
    dump(embeddings_dict, config.model_folder_name + config.glove_vector_dict)
    dump(df_act_context, config.model_folder_name + config.context_data)
    print('Models successfully dumped in respective folder for contextual search')
    
    ### Model dumping for TF-IDF
    dump(df_train_tfidf, config.model_folder_name + config.model_tfidf)
    dump(df_train_tf_count, config.model_folder_name + config.model_tf_count)
    dump(tfidf_vector, config.model_folder_name + config.model_tfidf_vector)
    dump(count_vector, config.model_folder_name + config.model_tf_count_vector)
    dump(df_act, config.model_folder_name + config.raw_data)
    
    print('Models successfully dumped in respective folder')
    

Error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [1], in <cell line: 75>()
    103 new_target_col = df_act['person_who_resolved']
    104 #print("New Target COlumn : ",new_target_col.head())
--> 105 df_train_tfidf, df_train_tf_count,tfidf_vector,count_vector = train_data(df_act)
    106 print('Training Done for NLP Based TFIDF')
    108 print('---------------------------------')

Input In [1], in train_data(cleaned_data)
     34 tfidf_vectorizer = TfidfVectorizer(stop_words='english')
     36 # Fit and transform the combined text data
---> 37 tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_data['input_text'])
     39 # Create a Count Vectorizer
     40 count_vectorizer = CountVectorizer(stop_words='english')

File /Analytics/venv/CAPEANALYTICS/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:2133, in TfidfVectorizer.fit_transform(self, raw_documents, y)
   2126 self._check_params()
   2127 self._tfidf = TfidfTransformer(
   2128     norm=self.norm,
   2129     use_idf=self.use_idf,
   2130     smooth_idf=self.smooth_idf,
   2131     sublinear_tf=self.sublinear_tf,
   2132 )
-> 2133 X = super().fit_transform(raw_documents)
   2134 self._tfidf.fit(X)
   2135 # X is already a transformed view of raw_documents so
   2136 # we set copy to False

File /Analytics/venv/CAPEANALYTICS/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:1388, in CountVectorizer.fit_transform(self, raw_documents, y)
   1380             warnings.warn(
   1381                 "Upper case characters found in"
   1382                 " vocabulary while 'lowercase'"
   1383                 " is True. These entries will not"
   1384                 " be matched with any documents"
   1385             )
   1386             break
-> 1388 vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
   1390 if self.binary:
   1391     X.data.fill(1)

File /Analytics/venv/CAPEANALYTICS/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:1275, in CountVectorizer._count_vocab(self, raw_documents, fixed_vocab)
   1273 for doc in raw_documents:
   1274     feature_counter = {}
-> 1275     for feature in analyze(doc):
   1276         try:
   1277             feature_idx = vocabulary[feature]

File /Analytics/venv/CAPEANALYTICS/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:106, in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
     84 """Chain together an optional series of text processing steps to go from
     85 a single document to ngrams, with or without tokenizing or preprocessing.
     86 
   (...)
    102     A sequence of tokens, possibly with pairs, triples, etc.
    103 """
    105 if decoder is not None:
--> 106     doc = decoder(doc)
    107 if analyzer is not None:
    108     doc = analyzer(doc)

File /Analytics/venv/CAPEANALYTICS/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:239, in _VectorizerMixin.decode(self, doc)
    236     doc = doc.decode(self.encoding, self.decode_error)
    238 if doc is np.nan:
--> 239     raise ValueError(
    240         "np.nan is an invalid document, expected byte or unicode string."
    241     )
    243 return doc

ValueError: np.nan is an invalid document, expected byte or unicode string.
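
The ValueError comes from NaN documents: when the columns are concatenated with +, any row with a missing value in even one of the columns ends up with input_text = NaN, and TfidfVectorizer refuses to vectorize NaN. A minimal sketch of one way to guard against this, assuming the same column names used in train_data above (the helper name build_input_text is hypothetical):

def build_input_text(df):
    # Columns that feed the combined document; names taken from train_data above.
    text_cols = ['ticket_category', 'ticket_type', 'ticket_item', 'ticket_summary',
                 'ticket_desc', 'ticket_severity', 'person_who_resolved']
    # Replace missing values with '' and force everything to str so no row of
    # input_text ends up as NaN.
    filled = df[text_cols].fillna('').astype(str)
    return filled.agg(' '.join, axis=1).str.strip()

# Inside train_data(), the concatenation line would then become:
# cleaned_data['input_text'] = build_input_text(cleaned_data)

Alternatively, rows with missing values could be dropped beforehand with cleaned_data.dropna(subset=text_cols), depending on whether partially filled tickets should still be indexed.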


The main goal: I have created a concatenated string column (input_text) containing all the input columns as well as the target column person_who_resolved. I need to build TF-IDF vectors on this column and train the model; later, these TF-IDF vectors should be used to compute cosine similarity between the stored vectors and incoming input data, and return the predicted person_who_resolved.
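
For the lookup step described above, a minimal sketch of the prediction side, assuming the fitted tfidf_vectorizer and the training TF-IDF matrix (the sparse tfidf_matrix or the dense df_train_tfidf dumped above) are available; the function name predict_resolver is hypothetical:

from sklearn.metrics.pairwise import cosine_similarity  # already imported at the top of the script

def predict_resolver(new_ticket_text, tfidf_vectorizer, train_tfidf, cleaned_data, top_n=3):
    # Vectorize the incoming ticket with the vectorizer fitted at training time.
    query_vec = tfidf_vectorizer.transform([new_ticket_text])
    # Cosine similarity between the query and every training document.
    sims = cosine_similarity(query_vec, train_tfidf).ravel()
    # Indices of the most similar historical tickets, highest similarity first.
    best = sims.argsort()[::-1][:top_n]
    return cleaned_data['person_who_resolved'].iloc[best].tolist()

One caveat with this design: person_who_resolved is part of input_text at training time but will not be present in an incoming ticket, so the query and the stored documents are built from slightly different text; it may be cleaner to build input_text from the ticket fields only.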