Untitled

mail@pastecode.io avatar
unknown
plain_text
7 months ago
7.1 kB
1
Indexable
Never
There are two similar functions like this-

Function 1-

def input_evalution(input_processed_text,tenant_id, df_train_mtrx,tfidf_vector,df_act):
    print("Into Input Evaluation function")
    text=input_processed_text
    print("Text : ",text)
    tfidf_vector=tfidf_vector
    tenant_id = tenant_id
    print("Tenant ID Inside INput Evaluation : ", tenant_id)
    #print("TFIDF Vector : ",tfidf_vector)
    df_train_mtrx=df_train_mtrx
    #print("DF Train Matrix : ",df_train_mtrx)
    df_train_mtrx_filtered = df_train_mtrx[df_train_mtrx['32'] > 0]
    #print("DF Train Matrix Filtered : ",df_train_mtrx_filtered)

    input_tfidf=tfidf_vector.transform([text])
    print("Input TF IDF : ",input_tfidf)
    x=input_tfidf.todense()
    print("X : ",x)
    df_tst = pd.DataFrame(x)
    #print("Df Test Input Evaluation : ",df_tst)
    ## Replacing Nan values in matrix with 0
    df_train_mtrx_filtered_nan=np.isnan(df_train_mtrx_filtered)
    #print("DF Train MAtrix Nan : ",df_train_mtrx_nan)
    df_train_mtrx_filtered[df_train_mtrx_filtered_nan] = 0
    scr=cosine_similarity(df_train_mtrx_filtered, df_tst)
    #print("Cosine Similarity : ",scr)
    df_chk = pd.DataFrame()
    df_chk['ticket_id']=df_train_mtrx_filtered.index
    df_chk['score']=scr
    #print("DF CHeck : ",df_chk.head())
    
    # Filter 'df_chk' to keep rows where the 'score' is greater than 0.50
    score = df_chk[df_chk['score'] > 0.50]['score'].tolist()
    print("Score : ", score)
    
    # Get the indexes where the score is above the threshold
    indexes = df_chk[df_chk['score'] > 0.50].index
    print("Indexes : ",indexes)
    
    # Retrieve values from the 'df_train_mtrx' DataFrame based on the indexes
    df_eval = df_act[df_act['ticket_id'].isin(df_chk[df_chk['score']>0.50]['ticket_id'])]

    #print("DF eval : ", df_eval.head())

    return df_eval, df_tst


ANd Function2-

def input_evalution_count(text, tenant_id,df_train_mtrx,count_vector,df_act):
    print("Into Input Evaluation Count function")
    text=text
    print("Text : ",text)
    tenant_id = tenant_id
    print("Tenant ID inside INput EValuation Count fn : ",tenant_id)
 
    df_train_mtrx=df_train_mtrx
    #print("DF Train Matrix : ",df_train_mtrx)
    df_train_mtrx_filtered = df_train_mtrx[df_train_mtrx['32'] > 0]
    
    ## Replacing Nan values in matrix with 0
    df_train_mtrx_filtered_nan=np.isnan(df_train_mtrx_filtered)
    ## Transforming into COunt Vector
    input_count=count_vector.transform([text])
    print("Input Count : ",input_count)
    x=input_count.todense()
    print("X : ",x)
    df_tst = pd.DataFrame(x)
    #print("DF Test in evaluation count : ",df_tst)
    ## Calculating cosine similarity
    scr=cosine_similarity(df_train_mtrx_filtered, df_tst)
    print("Cosine Similarity inside Input evaluation : ",scr)
    df_chk = pd.DataFrame()
    df_chk['ticket_id']=df_train_mtrx_filtered.index
    df_chk['score']=scr
    
    # Filter 'df_chk' to keep rows where the 'score' is greater than 0.50
    score = df_chk[df_chk['score'] > 0.50]['score'].tolist()
    print("Score : ", score)
    
    # Get the indexes where the score is above the threshold
    indexes = df_chk[df_chk['score'] > 0.50].index
    print("Indexes : ",indexes)
    
    # Retrieve values from the 'df_train_mtrx' DataFrame based on the indexes
    df_eval = df_act[df_act['ticket_id'].isin(df_chk[df_chk['score']>0.50]['ticket_id'])]
    #df_eval = df_train_mtrx.iloc[indexes]
    df_eval['score'] = score
    print("DF eval inside Input Evaluation: ", df_eval.head())

    return df_eval, df_tst


If you carefully see, both have just one difference that is in 1st function TF IDF is used and in 2nd FUnction Count is used.

ANd using these functions in event_prediction_tf_idf

def event_prediction_tfidf(input_tenant_id,input_ticket_category, input_ticket_type, input_ticket_item, input_ticket_summary, input_ticket_desc):
    #pdb.set_trace()
    
    global tfidf_matrix,tf_count_matrix,tfidf_vector,count_vector,df_act
    ## First join 5 parameters andd then call input_data_preprocessing
    data_to_be_processed=str(input_tenant_id) + ' ' + str(input_ticket_category) +' ' + str(input_ticket_type) +' ' +str(input_ticket_item) + ' ' + str(input_ticket_summary) + ' ' +str(input_ticket_desc)

    ## Input Data Preprocessing
    input_processed_text = input_data_preprocessing(data_to_be_processed) ## 5 different fields
    
    print("Input processed Text : ",input_processed_text)
    
    print("Getting Tenant ID from input_processed_text")
    tenant_id = [int(i) for i in input_processed_text.split() if i.isdigit()][0]
    print("Tenant ID from Input processed text : ",tenant_id)
      
    ##TFIDF Prediction
    tfidf_pred,input_tfidfmatrx = input_evalution(input_processed_text,tenant_id,tfidf_matrix,tfidf_vector,df_act)
    print("TF IDF Pred : ",tfidf_pred)
    #print("Input TFIDF Matrix : ",input_tfidfmatrx)
    
    ##TF_count Prediction
    tf_count_pred,input_tfcountmatrx = input_evalution_count(input_processed_text,tenant_id,tf_count_matrix,count_vector,df_act)
    print("TF Count Pred : ",tf_count_pred)
    #print("INput Count Matrix : ",input_tfcountmatrx)
    
    
    tfidf_pred['score_new'] = tfidf_pred['score']*0.5
    tf_count_pred['score_new'] = tf_count_pred['score']*0.5
    
    tfidf_pred['flag'] = 'tfidf'
    tf_count_pred['flag'] = 'tf_count'
    
    overall_result = pd.concat([tfidf_pred,tf_count_pred])
    print("Overall Result : ",overall_result)
    if len(overall_result)>0:
    
        overall_result = overall_result.sort_values(by='score_new',ascending=False)
        
        overall_result = overall_result.head(config.max_reccom)
        #print("Overall Result : ",overall_result)
        
        user_recommendation_list = overall_result[config.target_column].tolist()
        print("USer recommendation List from event_prediction_tfidf function : ",user_recommendation_list)

    return user_recommendation_list


we are doing overall_result = pd.concat([tfidf_pred,tf_count_pred]) wherein we are trying to concatenate tf_idf pred and countpred and get overall result, but there is one issue,

Actually while running , this input evaluation count , A score column is already there is matrix as below-

This is the difference between tf-idf pred and tf count pred as below
TF IDF Pred :       tenant_id  ticket_id ticket_category ticket_type     ticket_item  	ticket_summary			ticket_desc				... Noun	
2670        			32       2670     application   hcm - opm  talent profile   	application data error	elow employee was hired..	[Roshni, Gangrade]


TF Count Pred :       tenant_id  ticket_id ticket_category ticket_type       ticket_item  ticket_summary		....		Noun	score
1437        			32       1437     application   hcm - opm    	administration   application data error 			[Roshni, Gangrade]		0.62994

Due to this difference it is not able to concat this tf_idf pred and tf_count pred here in 
overall_result = pd.concat([tfidf_pred,tf_count_pred]) in event_prediction_tfidf function.

Can you help fix this issue.