Untitled
unknown
plain_text
2 years ago
7.1 kB
7
Indexable
There are two similar functions like this-
Function 1-
def input_evalution(input_processed_text,tenant_id, df_train_mtrx,tfidf_vector,df_act):
print("Into Input Evaluation function")
text=input_processed_text
print("Text : ",text)
tfidf_vector=tfidf_vector
tenant_id = tenant_id
print("Tenant ID Inside INput Evaluation : ", tenant_id)
#print("TFIDF Vector : ",tfidf_vector)
df_train_mtrx=df_train_mtrx
#print("DF Train Matrix : ",df_train_mtrx)
df_train_mtrx_filtered = df_train_mtrx[df_train_mtrx['32'] > 0]
#print("DF Train Matrix Filtered : ",df_train_mtrx_filtered)
input_tfidf=tfidf_vector.transform([text])
print("Input TF IDF : ",input_tfidf)
x=input_tfidf.todense()
print("X : ",x)
df_tst = pd.DataFrame(x)
#print("Df Test Input Evaluation : ",df_tst)
## Replacing Nan values in matrix with 0
df_train_mtrx_filtered_nan=np.isnan(df_train_mtrx_filtered)
#print("DF Train MAtrix Nan : ",df_train_mtrx_nan)
df_train_mtrx_filtered[df_train_mtrx_filtered_nan] = 0
scr=cosine_similarity(df_train_mtrx_filtered, df_tst)
#print("Cosine Similarity : ",scr)
df_chk = pd.DataFrame()
df_chk['ticket_id']=df_train_mtrx_filtered.index
df_chk['score']=scr
#print("DF CHeck : ",df_chk.head())
# Filter 'df_chk' to keep rows where the 'score' is greater than 0.50
score = df_chk[df_chk['score'] > 0.50]['score'].tolist()
print("Score : ", score)
# Get the indexes where the score is above the threshold
indexes = df_chk[df_chk['score'] > 0.50].index
print("Indexes : ",indexes)
# Retrieve values from the 'df_train_mtrx' DataFrame based on the indexes
df_eval = df_act[df_act['ticket_id'].isin(df_chk[df_chk['score']>0.50]['ticket_id'])]
#print("DF eval : ", df_eval.head())
return df_eval, df_tst
ANd Function2-
def input_evalution_count(text, tenant_id,df_train_mtrx,count_vector,df_act):
print("Into Input Evaluation Count function")
text=text
print("Text : ",text)
tenant_id = tenant_id
print("Tenant ID inside INput EValuation Count fn : ",tenant_id)
df_train_mtrx=df_train_mtrx
#print("DF Train Matrix : ",df_train_mtrx)
df_train_mtrx_filtered = df_train_mtrx[df_train_mtrx['32'] > 0]
## Replacing Nan values in matrix with 0
df_train_mtrx_filtered_nan=np.isnan(df_train_mtrx_filtered)
## Transforming into COunt Vector
input_count=count_vector.transform([text])
print("Input Count : ",input_count)
x=input_count.todense()
print("X : ",x)
df_tst = pd.DataFrame(x)
#print("DF Test in evaluation count : ",df_tst)
## Calculating cosine similarity
scr=cosine_similarity(df_train_mtrx_filtered, df_tst)
print("Cosine Similarity inside Input evaluation : ",scr)
df_chk = pd.DataFrame()
df_chk['ticket_id']=df_train_mtrx_filtered.index
df_chk['score']=scr
# Filter 'df_chk' to keep rows where the 'score' is greater than 0.50
score = df_chk[df_chk['score'] > 0.50]['score'].tolist()
print("Score : ", score)
# Get the indexes where the score is above the threshold
indexes = df_chk[df_chk['score'] > 0.50].index
print("Indexes : ",indexes)
# Retrieve values from the 'df_train_mtrx' DataFrame based on the indexes
df_eval = df_act[df_act['ticket_id'].isin(df_chk[df_chk['score']>0.50]['ticket_id'])]
#df_eval = df_train_mtrx.iloc[indexes]
df_eval['score'] = score
print("DF eval inside Input Evaluation: ", df_eval.head())
return df_eval, df_tst
If you carefully see, both have just one difference that is in 1st function TF IDF is used and in 2nd FUnction Count is used.
ANd using these functions in event_prediction_tf_idf
def event_prediction_tfidf(input_tenant_id,input_ticket_category, input_ticket_type, input_ticket_item, input_ticket_summary, input_ticket_desc):
#pdb.set_trace()
global tfidf_matrix,tf_count_matrix,tfidf_vector,count_vector,df_act
## First join 5 parameters andd then call input_data_preprocessing
data_to_be_processed=str(input_tenant_id) + ' ' + str(input_ticket_category) +' ' + str(input_ticket_type) +' ' +str(input_ticket_item) + ' ' + str(input_ticket_summary) + ' ' +str(input_ticket_desc)
## Input Data Preprocessing
input_processed_text = input_data_preprocessing(data_to_be_processed) ## 5 different fields
print("Input processed Text : ",input_processed_text)
print("Getting Tenant ID from input_processed_text")
tenant_id = [int(i) for i in input_processed_text.split() if i.isdigit()][0]
print("Tenant ID from Input processed text : ",tenant_id)
##TFIDF Prediction
tfidf_pred,input_tfidfmatrx = input_evalution(input_processed_text,tenant_id,tfidf_matrix,tfidf_vector,df_act)
print("TF IDF Pred : ",tfidf_pred)
#print("Input TFIDF Matrix : ",input_tfidfmatrx)
##TF_count Prediction
tf_count_pred,input_tfcountmatrx = input_evalution_count(input_processed_text,tenant_id,tf_count_matrix,count_vector,df_act)
print("TF Count Pred : ",tf_count_pred)
#print("INput Count Matrix : ",input_tfcountmatrx)
tfidf_pred['score_new'] = tfidf_pred['score']*0.5
tf_count_pred['score_new'] = tf_count_pred['score']*0.5
tfidf_pred['flag'] = 'tfidf'
tf_count_pred['flag'] = 'tf_count'
overall_result = pd.concat([tfidf_pred,tf_count_pred])
print("Overall Result : ",overall_result)
if len(overall_result)>0:
overall_result = overall_result.sort_values(by='score_new',ascending=False)
overall_result = overall_result.head(config.max_reccom)
#print("Overall Result : ",overall_result)
user_recommendation_list = overall_result[config.target_column].tolist()
print("USer recommendation List from event_prediction_tfidf function : ",user_recommendation_list)
return user_recommendation_list
we are doing overall_result = pd.concat([tfidf_pred,tf_count_pred]) wherein we are trying to concatenate tf_idf pred and countpred and get overall result, but there is one issue,
Actually while running , this input evaluation count , A score column is already there is matrix as below-
This is the difference between tf-idf pred and tf count pred as below
TF IDF Pred : tenant_id ticket_id ticket_category ticket_type ticket_item ticket_summary ticket_desc ... Noun
2670 32 2670 application hcm - opm talent profile application data error elow employee was hired.. [Roshni, Gangrade]
TF Count Pred : tenant_id ticket_id ticket_category ticket_type ticket_item ticket_summary .... Noun score
1437 32 1437 application hcm - opm administration application data error [Roshni, Gangrade] 0.62994
Due to this difference it is not able to concat this tf_idf pred and tf_count pred here in
overall_result = pd.concat([tfidf_pred,tf_count_pred]) in event_prediction_tfidf function.
Can you help fix this issue.Editor is loading...