Untitled
unknown
plain_text
21 days ago
8.2 kB
1
Indexable
Never
# Ticket-to-user recommendation: blends TF-IDF and raw term-count cosine
# similarity against previously seen tickets of the same tenant.

import pandas as pd
from nltk.corpus import stopwords
import string
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from string import digits
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pdb
import re
from fuzzywuzzy import fuzz
from feature_engineering import *
from param_config import config
from model_loading import loading_model
import logging

_logger = logging.getLogger(__name__)

# Minimum cosine similarity a stored ticket needs to be kept as a match.
_SCORE_THRESHOLD = 0.50

# Model artifacts are loaded once, at import time, and shared by every call.
models = loading_model()
(
    tfidf_matrix,
    tf_count_matrix,
    tfidf_vector,
    count_vector,
    df_act,
    Matching_data,
    embedding_dict,
    df_act_context,
) = models.load_models()


def extract_combined_info(text):
    """Group numeric tokens with the capitalised tail that follows them.

    The text is split on whitespace. Numeric tokens are collected until a
    token starting with an uppercase letter is met; at that point one entry
    ``[*numbers, "<rest of the text from that token>"]`` is emitted and the
    collection restarts.

    Args:
        text: Whitespace-separated string to scan.

    Returns:
        A list of lists; empty when no number is followed by a capitalised
        token.
    """
    words = text.split()
    info = []
    current_info = []
    # enumerate() gives the position of *this* token; list.index() would
    # return the first occurrence and mis-slice when a word is repeated.
    for position, word in enumerate(words):
        if word.isnumeric():
            current_info.append(word)
        elif word[0].isupper() and current_info:
            current_name = ' '.join(words[position:])
            info.append(current_info + [current_name])
            current_info = []
    return info


def process_text(text):
    """Return *text* without its leading run of digits and the whitespace after it."""
    return re.sub(r'^\d+\s*', '', text)


def extract_role_name(text):
    """Return every word run found between a number and a following capital letter.

    The result is the list produced by ``re.findall`` and may be empty.
    """
    return re.findall(r'\d+\s+([\w\s]+?)\s+[A-Z]', text)


def user_recommendation(input_tenant_id, input_ticket_category, input_ticket_type,
                        input_ticket_item, input_ticket_summary, input_ticket_desc):
    """Recommend users for a ticket and return them as parsed records.

    Each raw recommendation string is stripped of its leading number, then
    parsed into ``[*numbers, name_tail, role_name_or_None]``. Strings that
    yield no parsed entry are skipped.

    Returns:
        A list of parsed records. Any failure is logged and an empty list is
        returned instead, so callers always get a list.
    """
    try:
        print("Input Tenant ID from API : ", input_tenant_id)
        raw_recommendations = user_recommendation_tfidf(
            input_tenant_id, input_ticket_category, input_ticket_type,
            input_ticket_item, input_ticket_summary, input_ticket_desc)
        print("TFIDF Prediction Done", raw_recommendations)

        user_recommendation_info = []
        for raw in raw_recommendations:
            res = process_text(raw)
            info = extract_combined_info(res)
            if not info:
                continue
            role_name = extract_role_name(res)
            # Only the first parsed entry is used; the role (if any) is appended to it.
            info[0].append(role_name[0] if role_name else None)
            user_recommendation_info.append(info[0])
        return user_recommendation_info
    except Exception:
        # Best-effort API: keep returning an empty list, but leave a trace
        # instead of silently hiding the failure.
        _logger.exception("user_recommendation failed; returning no recommendations")
        return []


def user_recommendation_tfidf(input_tenant_id, input_ticket_category, input_ticket_type,
                              input_ticket_item, input_ticket_summary, input_ticket_desc):
    """Return the top recommendation values for a ticket.

    The six inputs are joined into one string and preprocessed; the first
    all-digit token of the preprocessed text is used as the tenant id. TF-IDF
    and term-count similarity matches are each weighted by 0.5, concatenated,
    sorted by weighted score and truncated to ``config.max_reccom`` rows.

    Returns:
        The ``config.target_column`` values of the best rows, best first;
        an empty list when nothing matched.

    Raises:
        IndexError: if the preprocessed text contains no all-digit token.
    """
    data_to_be_processed = ' '.join(str(field) for field in (
        input_tenant_id, input_ticket_category, input_ticket_type,
        input_ticket_item, input_ticket_summary, input_ticket_desc))

    input_processed_text = input_data_preprocessing(data_to_be_processed)
    print("Input processed Text : ", input_processed_text)

    print("Getting Tenant ID from input_processed_text")
    numeric_tokens = [int(tok) for tok in input_processed_text.split() if tok.isdigit()]
    if not numeric_tokens:
        raise IndexError("no numeric tenant id token found in the preprocessed input")
    tenant_id = numeric_tokens[0]
    print("Tenant ID from Input processed text : ", tenant_id)

    tfidf_pred, _ = input_evalution(
        input_processed_text, tenant_id, tfidf_matrix, tfidf_vector, df_act)
    tf_count_pred, _ = input_evalution_count(
        input_processed_text, tenant_id, tf_count_matrix, count_vector, df_act)

    # Both signals contribute equally to the blended ranking.
    tfidf_pred['score_new'] = tfidf_pred['score'] * 0.5
    tf_count_pred['score_new'] = tf_count_pred['score'] * 0.5
    tfidf_pred['flag'] = 'tfidf'
    tf_count_pred['flag'] = 'tf_count'

    overall_result = pd.concat([tfidf_pred, tf_count_pred])

    # Defined up front so an empty result returns [] instead of failing.
    user_recommendation_list = []
    if len(overall_result) > 0:
        overall_result = overall_result.sort_values(by='score_new', ascending=False)
        overall_result = overall_result.head(config.max_reccom)
        user_recommendation_list = overall_result[config.target_column].tolist()
    print("USer recommendation List from event_prediction_tfidf function : ", user_recommendation_list)
    return user_recommendation_list


def _score_tenant_tickets(text, tenant_id, df_train_mtrx, vectorizer, df_act, check_label):
    """Score stored tickets of one tenant against *text* and keep the close ones.

    Rows of ``df_train_mtrx`` whose ``str(tenant_id)`` column is positive are
    compared to the vectorised *text* with cosine similarity. Rows scoring
    above ``_SCORE_THRESHOLD`` are looked up in ``df_act`` by ``ticket_id``
    (the matrix index is used as the ticket id).

    Returns:
        ``(df_eval, df_tst)``: the matching ``df_act`` rows with a ``score``
        column, and the dense query vector as a one-row DataFrame.

    Raises:
        KeyError: if ``df_train_mtrx`` has no ``str(tenant_id)`` column.
    """
    # NaN cells are zeroed so cosine_similarity does not reject the matrix.
    tenant_rows = df_train_mtrx[df_train_mtrx[str(tenant_id)] > 0].fillna(0)

    query_dense = vectorizer.transform([text]).todense()
    print("X : ", query_dense)
    df_tst = pd.DataFrame(query_dense)

    if tenant_rows.empty:
        # cosine_similarity refuses zero-row input; report "no matches" instead.
        no_matches = df_act.iloc[0:0].copy()
        no_matches['score'] = pd.Series(dtype=float)
        return no_matches, df_tst

    similarity = cosine_similarity(tenant_rows, df_tst)
    df_chk = pd.DataFrame({
        'ticket_id': tenant_rows.index.to_numpy(),
        'score': similarity[:, 0],
    })
    print(check_label, df_chk.head())

    hits = df_chk[df_chk['score'] > _SCORE_THRESHOLD]
    print("Score : ", hits['score'].tolist())
    print("Indexes : ", hits.index)

    # Attach scores by ticket id, not by position: df_act is not guaranteed
    # to share the matrix's row order or to hold each ticket exactly once.
    score_by_ticket = hits.drop_duplicates('ticket_id').set_index('ticket_id')['score']
    df_eval = df_act[df_act['ticket_id'].isin(hits['ticket_id'])].copy()
    df_eval['score'] = df_eval['ticket_id'].map(score_by_ticket)
    return df_eval, df_tst


def input_evalution(input_processed_text, tenant_id, df_train_mtrx, tfidf_vector, df_act):
    """Find the tenant's stored tickets similar to the input using TF-IDF vectors.

    Returns:
        ``(df_eval, df_tst)``: matching ``df_act`` rows with their ``score``,
        and the dense TF-IDF vector of the input as a DataFrame.
    """
    print("Into Input Evaluation function")
    print("Text : ", input_processed_text)
    print("Tenant ID Inside INput Evaluation : ", tenant_id)
    return _score_tenant_tickets(
        input_processed_text, tenant_id, df_train_mtrx, tfidf_vector, df_act,
        "DF CHeck Input Evaluation: ")


def input_evalution_count(text, tenant_id, df_train_mtrx, count_vector, df_act):
    """Find the tenant's stored tickets similar to the input using term counts.

    Returns:
        ``(df_eval, df_tst)``: matching ``df_act`` rows with their ``score``,
        and the dense count vector of the input as a DataFrame.
    """
    print("Into Input Evaluation Count function")
    print("Text : ", text)
    print("Tenant ID inside INput EValuation Count fn : ", tenant_id)
    return _score_tenant_tickets(
        text, tenant_id, df_train_mtrx, count_vector, df_act,
        "DF CHeck Input Evaluation Count: ")