import pandas as pd
import numpy as np  # needed for np.asarray in the GloVe loader below (was missing)
import pickle
import time
import datetime
from joblib import dump, load
import shutil, os
import pdb
import io
#from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#from string import digits
from sklearn.feature_extraction.text import CountVectorizer
import sys

from load_data import *
from feature_engineering import *
from param_config import config

data = load_data_from_file()
cleaned_data = data_pre_processing(data)

def train_data(cleaned_data):
    # Combine input text with the 'person_who_resolved' column
    print("Cleaned Data : ", cleaned_data)
    cleaned_data['input_text'] = cleaned_data['ticket_category'] + ' ' + cleaned_data['ticket_type'] + ' ' \
        + cleaned_data['ticket_item'] + ' ' + cleaned_data['ticket_summary'] + ' ' + cleaned_data['ticket_desc'] \
        + ' ' + cleaned_data['ticket_severity'] + ' ' + cleaned_data['person_who_resolved']

    # Create a TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    # Fit and transform the combined text data
    tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_data['input_text'])

    # Create a Count Vectorizer
    count_vectorizer = CountVectorizer(stop_words='english')
    # Fit and transform the combined text data
    count_matrix = count_vectorizer.fit_transform(cleaned_data['input_text'])

    # Convert TF-IDF and Count Vectorization results to dataframes
    data_train_tfidf = pd.DataFrame(tfidf_matrix.todense(), columns=tfidf_vectorizer.get_feature_names_out())
    data_train_count = pd.DataFrame(count_matrix.todense(), columns=count_vectorizer.get_feature_names_out())

    return data_train_tfidf, data_train_count, tfidf_vectorizer, count_vectorizer

# Load pre-trained GloVe vectors into a token -> vector dict
embeddings_dict = {}
with open("glove_50d.txt", "r", encoding="utf-8") as f:
    for line in f:
        values = line.strip().split()
        token = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        embeddings_dict[token] = vector

def train_data_context(unique_train_data_word, embeddings_dict):
    unique_train_data = {}
    for word in unique_train_data_word:
        try:
            unique_train_data.update({word: embeddings_dict[word].tolist()})
        except KeyError:
            # Skip words with no GloVe embedding
            continue
    Matching_data = pd.DataFrame(unique_train_data.items(),
                                 columns=['unique_train_data_word_embed', 'unique_train_data_matrix'])
    return Matching_data

if __name__ == "__main__":
    print("********************************")
    print("------------STARTED-------------")
    print("********************************")
    start_time = time.time()
    start_date = str(datetime.datetime.now())
    print("Start Date : ", start_date)

    # Archive the previous model artifacts, if any
    try:
        files = ['tfidf.joblib', 'tf_count.joblib', 'tfidf_vector.joblib', 'tf_countvector.joblib', 'raw_data.joblib']
        for f in files:
            shutil.copy(config.model_folder_name + f, config.archive_path)
    except Exception:
        print('No Data Found in Model Folder, Running for 1st time')

    # Loading Data from DB/File
    df_train = load_data_from_file()
    # Data Preprocessing
    df_act = data_pre_processing(df_train)
    print("Feature Engineering Done")
    #print("DF Actual : ", df_act.head())

    # Training part: create the matrices
    new_target_col = df_act['person_who_resolved']
    #print("New Target Column : ", new_target_col.head())
    df_train_tfidf, df_train_tf_count, tfidf_vector, count_vector = train_data(df_act)
    print('Training Done for NLP Based TFIDF')

    print('---------------------------------')
    print('Contextual Training Started -----')
    print('---------------------------------')
    #print(df_train.head())
    df_act_context = df_train
    #print("DF Act Context : ", df_act_context)
    print("DF ACT Context before : ", df_act_context.shape)
    df_act_context = data_pre_processing_context(df_act_context)  ## Changes made here
    print("DF ACT Context After : ", df_act_context.shape)
    #print("DF Act Context After : ", df_act_context.head())

    # Training part: create the matching data
    new_target_col = df_act_context['person_who_resolved']
    df_act_context['Noun'] = df_act_context.apply(lambda row: noun_extraction(row['person_who_resolved']), axis=1)
    print("DF ACT Context : ", df_act_context)
    unique_train_data_word = unique_word_list(df_act_context, 'Noun')
    print(unique_train_data_word)
    Matching_data = train_data_context(unique_train_data_word, embeddings_dict)
    print("Matching Data : ", Matching_data.head())
    print('Training Done for Contextual Search')

    ### Model dumping for contextual search
    dump(Matching_data, config.model_folder_name + config.model_matching_data_train)
    dump(embeddings_dict, config.model_folder_name + config.glove_vector_dict)
    dump(df_act_context, config.model_folder_name + config.context_data)
    print('Models successfully dumped in respective folder for contextual search')

    ### Model dumping for TFIDF
    dump(df_train_tfidf, config.model_folder_name + config.model_tfidf)
    dump(df_train_tf_count, config.model_folder_name + config.model_tf_count)
    dump(tfidf_vector, config.model_folder_name + config.model_tfidf_vector)
    dump(count_vector, config.model_folder_name + config.model_tf_count_vector)
    dump(df_act, config.model_folder_name + config.raw_data)
    print('Models successfully dumped in respective folder')
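For reference, train_data is self-contained apart from pandas/scikit-learn, so it can be exercised on a tiny frame. The two-row example below is hypothetical (column values invented for illustration); it just shows the expected return shapes:

import pandas as pd

toy = pd.DataFrame({
    'ticket_category':     ['network', 'database'],
    'ticket_type':         ['incident', 'request'],
    'ticket_item':         ['vpn', 'backup'],
    'ticket_summary':      ['vpn drops', 'restore backup'],
    'ticket_desc':         ['vpn connection drops hourly', 'need last backup restored'],
    'ticket_severity':     ['high', 'low'],
    'person_who_resolved': ['alice', 'bob'],
})

tfidf_df, count_df, tfidf_vec, count_vec = train_data(toy)
print(tfidf_df.shape, count_df.shape)  # (2, vocab_size) for both document-term matrices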
Error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [1], in <cell line: 75>()
    103 new_target_col = df_act['person_who_resolved']
    104 #print("New Target COlumn : ",new_target_col.head())
--> 105 df_train_tfidf, df_train_tf_count,tfidf_vector,count_vector = train_data(df_act)
    106 print('Training Done for NLP Based TFIDF')
    108 print('---------------------------------')

Input In [1], in train_data(cleaned_data)
     34 tfidf_vectorizer = TfidfVectorizer(stop_words='english')
     36 # Fit and transform the combined text data
---> 37 tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_data['input_text'])
     39 # Create a Count Vectorizer
     40 count_vectorizer = CountVectorizer(stop_words='english')

File /Analytics/venv/CAPEANALYTICS/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:2133, in TfidfVectorizer.fit_transform(self, raw_documents, y)
   2126 self._check_params()
   2127 self._tfidf = TfidfTransformer(
   2128     norm=self.norm,
   2129     use_idf=self.use_idf,
   2130     smooth_idf=self.smooth_idf,
   2131     sublinear_tf=self.sublinear_tf,
   2132 )
-> 2133 X = super().fit_transform(raw_documents)
   2134 self._tfidf.fit(X)
   2135 # X is already a transformed view of raw_documents so
   2136 # we set copy to False

File /Analytics/venv/CAPEANALYTICS/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:1388, in CountVectorizer.fit_transform(self, raw_documents, y)
   1380 warnings.warn(
   1381     "Upper case characters found in"
   1382     " vocabulary while 'lowercase'"
   1383     " is True. These entries will not"
   1384     " be matched with any documents"
   1385 )
   1386 break
-> 1388 vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
   1390 if self.binary:
   1391     X.data.fill(1)

File /Analytics/venv/CAPEANALYTICS/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:1275, in CountVectorizer._count_vocab(self, raw_documents, fixed_vocab)
   1273 for doc in raw_documents:
   1274     feature_counter = {}
-> 1275     for feature in analyze(doc):
   1276         try:
   1277             feature_idx = vocabulary[feature]

File /Analytics/venv/CAPEANALYTICS/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:106, in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
     84 """Chain together an optional series of text processing steps to go from
     85 a single document to ngrams, with or without tokenizing or preprocessing.
    (...)
    102 A sequence of tokens, possibly with pairs, triples, etc.
    103 """
    105 if decoder is not None:
--> 106     doc = decoder(doc)
    107 if analyzer is not None:
    108     doc = analyzer(doc)

File /Analytics/venv/CAPEANALYTICS/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:239, in _VectorizerMixin.decode(self, doc)
    236     doc = doc.decode(self.encoding, self.decode_error)
    238 if doc is np.nan:
--> 239     raise ValueError(
    240         "np.nan is an invalid document, expected byte or unicode string."
    241     )
    243 return doc

ValueError: np.nan is an invalid document, expected byte or unicode string.
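What the traceback says: at least one entry of cleaned_data['input_text'] reaching fit_transform is np.nan rather than a string. pandas propagates NaN through the + concatenation in train_data, so a single missing value in any of the seven source columns turns the whole input_text row into NaN. A minimal guard, assuming empty strings are an acceptable stand-in for missing ticket fields (applied at the top of train_data, or inside data_pre_processing):

text_cols = ['ticket_category', 'ticket_type', 'ticket_item', 'ticket_summary',
             'ticket_desc', 'ticket_severity', 'person_who_resolved']
# NaN -> '' and coerce everything to str, so the concatenation can never yield NaN
cleaned_data[text_cols] = cleaned_data[text_cols].fillna('').astype(str)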
Main goal: I have created a concatenated string column containing all the input columns as well as the target column person_who_resolved. I need to build a TF-IDF matrix on this column and train the model; later, the same TF-IDF vectors should be used to calculate cosine similarity between incoming input data and the trained vectors, and the output should be the matching person_who_resolved.
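For that retrieval step, a minimal sketch of the prediction side, assuming the artifacts dumped by the script above and that incoming tickets go through the same preprocessing. predict_resolver is a hypothetical helper name; note the query is built from the input columns only, since a new ticket has no resolver yet (the fitted vectorizer simply never sees those tokens in the query):

from joblib import load
from sklearn.metrics.pairwise import cosine_similarity
from param_config import config

# Artifacts dumped during training
tfidf_vector   = load(config.model_folder_name + config.model_tfidf_vector)  # fitted TfidfVectorizer
df_train_tfidf = load(config.model_folder_name + config.model_tfidf)         # training document-term DataFrame
df_act         = load(config.model_folder_name + config.raw_data)            # preprocessed training rows

def predict_resolver(new_ticket_text):
    # Vectorize the incoming ticket with the fitted vectorizer (transform, not fit_transform)
    query_vec = tfidf_vector.transform([new_ticket_text])
    # Cosine similarity of the query against every training row
    sims = cosine_similarity(query_vec, df_train_tfidf.values)
    # The resolver of the closest historical ticket is the prediction
    best_idx = sims.argmax()
    return df_act['person_who_resolved'].iloc[best_idx]

print(predict_resolver('network incident vpn vpn drops vpn connection drops hourly high'))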