Untitled
unknown
plain_text
2 years ago
7.1 kB
3
Indexable
There are two similar functions like this- Function 1- def input_evalution(input_processed_text,tenant_id, df_train_mtrx,tfidf_vector,df_act): print("Into Input Evaluation function") text=input_processed_text print("Text : ",text) tfidf_vector=tfidf_vector tenant_id = tenant_id print("Tenant ID Inside INput Evaluation : ", tenant_id) #print("TFIDF Vector : ",tfidf_vector) df_train_mtrx=df_train_mtrx #print("DF Train Matrix : ",df_train_mtrx) df_train_mtrx_filtered = df_train_mtrx[df_train_mtrx['32'] > 0] #print("DF Train Matrix Filtered : ",df_train_mtrx_filtered) input_tfidf=tfidf_vector.transform([text]) print("Input TF IDF : ",input_tfidf) x=input_tfidf.todense() print("X : ",x) df_tst = pd.DataFrame(x) #print("Df Test Input Evaluation : ",df_tst) ## Replacing Nan values in matrix with 0 df_train_mtrx_filtered_nan=np.isnan(df_train_mtrx_filtered) #print("DF Train MAtrix Nan : ",df_train_mtrx_nan) df_train_mtrx_filtered[df_train_mtrx_filtered_nan] = 0 scr=cosine_similarity(df_train_mtrx_filtered, df_tst) #print("Cosine Similarity : ",scr) df_chk = pd.DataFrame() df_chk['ticket_id']=df_train_mtrx_filtered.index df_chk['score']=scr #print("DF CHeck : ",df_chk.head()) # Filter 'df_chk' to keep rows where the 'score' is greater than 0.50 score = df_chk[df_chk['score'] > 0.50]['score'].tolist() print("Score : ", score) # Get the indexes where the score is above the threshold indexes = df_chk[df_chk['score'] > 0.50].index print("Indexes : ",indexes) # Retrieve values from the 'df_train_mtrx' DataFrame based on the indexes df_eval = df_act[df_act['ticket_id'].isin(df_chk[df_chk['score']>0.50]['ticket_id'])] #print("DF eval : ", df_eval.head()) return df_eval, df_tst ANd Function2- def input_evalution_count(text, tenant_id,df_train_mtrx,count_vector,df_act): print("Into Input Evaluation Count function") text=text print("Text : ",text) tenant_id = tenant_id print("Tenant ID inside INput EValuation Count fn : ",tenant_id) df_train_mtrx=df_train_mtrx #print("DF Train Matrix : ",df_train_mtrx) df_train_mtrx_filtered = df_train_mtrx[df_train_mtrx['32'] > 0] ## Replacing Nan values in matrix with 0 df_train_mtrx_filtered_nan=np.isnan(df_train_mtrx_filtered) ## Transforming into COunt Vector input_count=count_vector.transform([text]) print("Input Count : ",input_count) x=input_count.todense() print("X : ",x) df_tst = pd.DataFrame(x) #print("DF Test in evaluation count : ",df_tst) ## Calculating cosine similarity scr=cosine_similarity(df_train_mtrx_filtered, df_tst) print("Cosine Similarity inside Input evaluation : ",scr) df_chk = pd.DataFrame() df_chk['ticket_id']=df_train_mtrx_filtered.index df_chk['score']=scr # Filter 'df_chk' to keep rows where the 'score' is greater than 0.50 score = df_chk[df_chk['score'] > 0.50]['score'].tolist() print("Score : ", score) # Get the indexes where the score is above the threshold indexes = df_chk[df_chk['score'] > 0.50].index print("Indexes : ",indexes) # Retrieve values from the 'df_train_mtrx' DataFrame based on the indexes df_eval = df_act[df_act['ticket_id'].isin(df_chk[df_chk['score']>0.50]['ticket_id'])] #df_eval = df_train_mtrx.iloc[indexes] df_eval['score'] = score print("DF eval inside Input Evaluation: ", df_eval.head()) return df_eval, df_tst If you carefully see, both have just one difference that is in 1st function TF IDF is used and in 2nd FUnction Count is used. ANd using these functions in event_prediction_tf_idf def event_prediction_tfidf(input_tenant_id,input_ticket_category, input_ticket_type, input_ticket_item, input_ticket_summary, input_ticket_desc): #pdb.set_trace() global tfidf_matrix,tf_count_matrix,tfidf_vector,count_vector,df_act ## First join 5 parameters andd then call input_data_preprocessing data_to_be_processed=str(input_tenant_id) + ' ' + str(input_ticket_category) +' ' + str(input_ticket_type) +' ' +str(input_ticket_item) + ' ' + str(input_ticket_summary) + ' ' +str(input_ticket_desc) ## Input Data Preprocessing input_processed_text = input_data_preprocessing(data_to_be_processed) ## 5 different fields print("Input processed Text : ",input_processed_text) print("Getting Tenant ID from input_processed_text") tenant_id = [int(i) for i in input_processed_text.split() if i.isdigit()][0] print("Tenant ID from Input processed text : ",tenant_id) ##TFIDF Prediction tfidf_pred,input_tfidfmatrx = input_evalution(input_processed_text,tenant_id,tfidf_matrix,tfidf_vector,df_act) print("TF IDF Pred : ",tfidf_pred) #print("Input TFIDF Matrix : ",input_tfidfmatrx) ##TF_count Prediction tf_count_pred,input_tfcountmatrx = input_evalution_count(input_processed_text,tenant_id,tf_count_matrix,count_vector,df_act) print("TF Count Pred : ",tf_count_pred) #print("INput Count Matrix : ",input_tfcountmatrx) tfidf_pred['score_new'] = tfidf_pred['score']*0.5 tf_count_pred['score_new'] = tf_count_pred['score']*0.5 tfidf_pred['flag'] = 'tfidf' tf_count_pred['flag'] = 'tf_count' overall_result = pd.concat([tfidf_pred,tf_count_pred]) print("Overall Result : ",overall_result) if len(overall_result)>0: overall_result = overall_result.sort_values(by='score_new',ascending=False) overall_result = overall_result.head(config.max_reccom) #print("Overall Result : ",overall_result) user_recommendation_list = overall_result[config.target_column].tolist() print("USer recommendation List from event_prediction_tfidf function : ",user_recommendation_list) return user_recommendation_list we are doing overall_result = pd.concat([tfidf_pred,tf_count_pred]) wherein we are trying to concatenate tf_idf pred and countpred and get overall result, but there is one issue, Actually while running , this input evaluation count , A score column is already there is matrix as below- This is the difference between tf-idf pred and tf count pred as below TF IDF Pred : tenant_id ticket_id ticket_category ticket_type ticket_item ticket_summary ticket_desc ... Noun 2670 32 2670 application hcm - opm talent profile application data error elow employee was hired.. [Roshni, Gangrade] TF Count Pred : tenant_id ticket_id ticket_category ticket_type ticket_item ticket_summary .... Noun score 1437 32 1437 application hcm - opm administration application data error [Roshni, Gangrade] 0.62994 Due to this difference it is not able to concat this tf_idf pred and tf_count pred here in overall_result = pd.concat([tfidf_pred,tf_count_pred]) in event_prediction_tfidf function. Can you help fix this issue.
Editor is loading...