import pandas as pd
import numpy as np
import pickle
import time
import datetime
from joblib import dump, load
import shutil, os
import pdb
import io
#from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#from string import digits
from sklearn.feature_extraction.text import CountVectorizer
import sys
from load_data import *
from feature_engineering import *
from param_config import config

data = load_data_from_file()
cleaned_data = data_pre_processing(data)
def train_data(cleaned_data):
    # Combine the input text columns with the 'person_who_resolved' column
    print("Cleaned Data : ", cleaned_data)
    cleaned_data['input_text'] = (cleaned_data['ticket_category'] + ' ' + cleaned_data['ticket_type'] + ' '
                                  + cleaned_data['ticket_item'] + ' ' + cleaned_data['ticket_summary'] + ' '
                                  + cleaned_data['ticket_desc'] + ' ' + cleaned_data['ticket_severity'] + ' '
                                  + cleaned_data['person_who_resolved'])

    # Create a TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    # Fit and transform the combined text data
    tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_data['input_text'])

    # Create a Count Vectorizer
    count_vectorizer = CountVectorizer(stop_words='english')
    # Fit and transform the combined text data
    count_matrix = count_vectorizer.fit_transform(cleaned_data['input_text'])

    # Convert the TF-IDF and Count Vectorization results to dataframes
    data_train_tfidf = pd.DataFrame(tfidf_matrix.todense(),
                                    columns=tfidf_vectorizer.get_feature_names_out())
    data_train_count = pd.DataFrame(count_matrix.todense(),
                                    columns=count_vectorizer.get_feature_names_out())

    return data_train_tfidf, data_train_count, tfidf_vectorizer, count_vectorizer
# Load the 50-dimensional GloVe embeddings into a word -> vector dictionary
embeddings_dict = {}
with open("glove_50d.txt", "r", encoding="utf-8") as f:
    for line in f:
        values = line.strip().split()
        token = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        embeddings_dict[token] = vector
def train_data_context(unique_train_data_word, embeddings_dict):
    # Map each unique training word to its GloVe embedding, skipping out-of-vocabulary words
    unique_train_data = {}
    for word in unique_train_data_word:
        try:
            unique_train_data.update({word: embeddings_dict[word].tolist()})
        except KeyError:
            continue
    Matching_data = pd.DataFrame(unique_train_data.items(),
                                 columns=['unique_train_data_word_embed', 'unique_train_data_matrix'])
    return Matching_data
if __name__ == "__main__":
print("********************************")
print("------------STARTED-------------")
print("********************************")
start_time=time.time()
start_date=str(datetime.datetime.now())
print("Start Date : ",start_date)
try:
files = ['tfidf.joblib', 'tf_count.joblib', 'tfidf_vector.joblib','tf_countvector.joblib','raw_data.joblib']
for f in files:
shutil.copy(config.model_folder_name+f, config.archive_path)
except:
print('No Data Found in Model Folder, Running for 1st time')
#Loading Data from DB/File
df_train=load_data_from_file()
#Data Preprocessing
df_act=data_pre_processing(df_train)
print("Feature Engineering Done")
#print("DF Actual : ",df_act.head())
#Training Part and creating the Matrix
new_target_col = df_act['person_who_resolved']
#print("New Target COlumn : ",new_target_col.head())
df_train_tfidf, df_train_tf_count,tfidf_vector,count_vector = train_data(df_act)
print('Training Done for NLP Based TFIDF')
print('---------------------------------')
print('contexual Training Started -----')
print('---------------------------------')
#print(df_train.head())
df_act_context = df_train
#print("DF Act Context : ",df_act_context)
print("DF ACT Context before : ",df_act_context.shape)
df_act_context=data_pre_processing_context(df_act_context) ## Changes made here
print("DF ACT Context After : ",df_act_context.shape)
#print("DF Act COntext After : ",df_act_context.head())
#Training Part and creating the Matrix
new_target_col = df_act_context['person_who_resolved']
df_act_context['Noun'] = df_act_context.apply(lambda row: noun_extraction(row['person_who_resolved']), axis=1)
print("DF ACT Context : ",df_act_context)
unique_train_data_word = unique_word_list(df_act_context, 'Noun')
print(unique_train_data_word)
Matching_data = train_data_context(unique_train_data_word,embeddings_dict)
print("Matching Data : ",Matching_data.head())
print('Training Done for contexual Search')
###Mode Dumping for Contexual search
dump(Matching_data, config.model_folder_name + config.model_matching_data_train)
dump(embeddings_dict, config.model_folder_name + config.glove_vector_dict)
dump(df_act_context, config.model_folder_name + config.context_data)
print('Models successfully dumped in respetive folder for contexual search')
###Mode Dumping for TFIDF
dump(df_train_tfidf, config.model_folder_name + config.model_tfidf)
dump(df_train_tf_count, config.model_folder_name + config.model_tf_count)
dump(tfidf_vector, config.model_folder_name + config.model_tfidf_vector)
dump(count_vector, config.model_folder_name + config.model_tf_count_vector)
dump(df_act, config.model_folder_name + config.raw_data)
print('Models successfully dumped in respetive folder')
Error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [1], in <cell line: 75>()
103 new_target_col = df_act['person_who_resolved']
104 #print("New Target COlumn : ",new_target_col.head())
--> 105 df_train_tfidf, df_train_tf_count,tfidf_vector,count_vector = train_data(df_act)
106 print('Training Done for NLP Based TFIDF')
108 print('---------------------------------')
Input In [1], in train_data(cleaned_data)
34 tfidf_vectorizer = TfidfVectorizer(stop_words='english')
36 # Fit and transform the combined text data
---> 37 tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_data['input_text'])
39 # Create a Count Vectorizer
40 count_vectorizer = CountVectorizer(stop_words='english')
File /Analytics/venv/CAPEANALYTICS/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:2133, in TfidfVectorizer.fit_transform(self, raw_documents, y)
2126 self._check_params()
2127 self._tfidf = TfidfTransformer(
2128 norm=self.norm,
2129 use_idf=self.use_idf,
2130 smooth_idf=self.smooth_idf,
2131 sublinear_tf=self.sublinear_tf,
2132 )
-> 2133 X = super().fit_transform(raw_documents)
2134 self._tfidf.fit(X)
2135 # X is already a transformed view of raw_documents so
2136 # we set copy to False
File /Analytics/venv/CAPEANALYTICS/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:1388, in CountVectorizer.fit_transform(self, raw_documents, y)
1380 warnings.warn(
1381 "Upper case characters found in"
1382 " vocabulary while 'lowercase'"
1383 " is True. These entries will not"
1384 " be matched with any documents"
1385 )
1386 break
-> 1388 vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
1390 if self.binary:
1391 X.data.fill(1)
File /Analytics/venv/CAPEANALYTICS/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:1275, in CountVectorizer._count_vocab(self, raw_documents, fixed_vocab)
1273 for doc in raw_documents:
1274 feature_counter = {}
-> 1275 for feature in analyze(doc):
1276 try:
1277 feature_idx = vocabulary[feature]
File /Analytics/venv/CAPEANALYTICS/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:106, in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
84 """Chain together an optional series of text processing steps to go from
85 a single document to ngrams, with or without tokenizing or preprocessing.
86
(...)
102 A sequence of tokens, possibly with pairs, triples, etc.
103 """
105 if decoder is not None:
--> 106 doc = decoder(doc)
107 if analyzer is not None:
108 doc = analyzer(doc)
File /Analytics/venv/CAPEANALYTICS/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:239, in _VectorizerMixin.decode(self, doc)
236 doc = doc.decode(self.encoding, self.decode_error)
238 if doc is np.nan:
--> 239 raise ValueError(
240 "np.nan is an invalid document, expected byte or unicode string."
241 )
243 return doc
ValueError: np.nan is an invalid document, expected byte or unicode string.
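The traceback points to the root cause: TfidfVectorizer.fit_transform received np.nan instead of a string, because the column-wise concatenation that builds input_text yields NaN whenever any single source column is missing for a row. A minimal sketch of one possible guard, assuming the ticket columns may contain missing values (the helper name build_input_text and the TEXT_COLS constant are illustrative, not part of the existing code):

import pandas as pd

# Columns that get concatenated into 'input_text' (same list used inside train_data)
TEXT_COLS = ['ticket_category', 'ticket_type', 'ticket_item', 'ticket_summary',
             'ticket_desc', 'ticket_severity', 'person_who_resolved']

def build_input_text(df: pd.DataFrame) -> pd.Series:
    # Replace NaN with empty strings and cast everything to str,
    # so the vectorizers only ever see plain unicode strings
    return df[TEXT_COLS].fillna('').astype(str).apply(' '.join, axis=1)

Alternatively, incomplete rows could simply be dropped with cleaned_data.dropna(subset=TEXT_COLS) before calling train_data, if discarding such tickets is acceptable.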
The main goal: I have created a concatenated string column containing all input columns as well as the target column person_who_resolved. I need to build TF-IDF vectors on this column and train the model, and later these TF-IDF vectors should be used to compute cosine similarity between that matrix and new input data, returning the matching person_who_resolved.
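For that retrieval step, here is a minimal sketch of the cosine-similarity lookup, assuming the dumped tfidf_vector and df_act (which still carries the input_text column created during training) are loaded back from the model folder; the function name predict_resolver and the top_n parameter are illustrative only:

from joblib import load
from sklearn.metrics.pairwise import cosine_similarity
from param_config import config

def predict_resolver(new_ticket_text, top_n=1):
    # Load the artefacts produced by the training script
    tfidf_vector = load(config.model_folder_name + config.model_tfidf_vector)
    df_act = load(config.model_folder_name + config.raw_data)

    # Re-encode the historical tickets and the new ticket with the fitted vocabulary
    train_matrix = tfidf_vector.transform(df_act['input_text'])
    query_vec = tfidf_vector.transform([new_ticket_text])

    # Rank historical tickets by cosine similarity and return the resolver(s) of the closest match(es)
    scores = cosine_similarity(query_vec, train_matrix).ravel()
    best_idx = scores.argsort()[::-1][:top_n]
    return df_act['person_who_resolved'].iloc[best_idx].tolist()

Note that input_text currently includes person_who_resolved itself; for a new ticket that value is unknown, so it may be worth excluding the target column from the query-side text.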