Untitled

mail@pastecode.io avatar
unknown
plain_text
a year ago
11 kB
2
Indexable
This is the code-

import pandas as pd
from nltk.corpus import stopwords
import string
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from string import digits
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pdb

from fuzzywuzzy import fuzz


from feature_engineering import *
from param_config import config
from model_loading import loading_model

models = loading_model()

tfidf_matrix,tf_count_matrix,tfidf_vector,count_vector,df_act,Matching_data,embedding_dict,df_act_context = models.load_models()
#print("Initial TFIDF MAtrix : ",tfidf_matrix)
#print(" Initial TF Count Matrix ",tf_count_matrix)
#print("Inital TFIDF Vector",tfidf_vector)
#print("Initial Count Vector ",count_vector)
#print("DF ACT",df_act.head())
#print("Initial Embedding Dict",embedding_dict)
#print("DF ACT Context ",df_act_context.head())


def event_prediction(input_ticket_category, input_ticket_type, input_ticket_item, input_ticket_summary, input_ticket_desc):
    try:
        user_recommendation_list_tfidf = event_prediction_tfidf(input_ticket_category, input_ticket_type, input_ticket_item, input_ticket_summary, input_ticket_desc)
        print("TFIDF Prediction Done",user_recommendation_list_tfidf)
        user_recommendation_list_context = event_prediction_context(input_ticket_category, input_ticket_type, input_ticket_item, input_ticket_summary, input_ticket_desc)
        print("Contexual Prediction Done")
        
        # Combine the recommendations from both methods
        user_recommendation_list = user_recommendation_list_tfidf ## Changes done here
        
        return user_recommendation_list
    
    except:
        user_recommendation_list = []
        return user_recommendation_list

    

def event_prediction_tfidf(input_ticket_category, input_ticket_type, input_ticket_item, input_ticket_summary, input_ticket_desc):
    #pdb.set_trace()
    
    global tfidf_matrix,tf_count_matrix,tfidf_vector,count_vector,df_act
    ## First join 5 parameters andd then call input_data_preprocessing
    data_to_be_processed=str(input_ticket_category) +' ' + str(input_ticket_type) +' ' +str(input_ticket_item) + ' ' + str(input_ticket_summary) + ' ' +str(input_ticket_desc)

    ## Input Data Preprocessing
    input_processed_text = input_data_preprocessing(data_to_be_processed) ## 5 different fields
    
    print("Input processed Text : ",input_processed_text)
    #pdb.set_trace()
      
    ##TFIDF Prediction
    tfidf_pred,input_tfidfmatrx = input_evalution(input_processed_text,tfidf_matrix,tfidf_vector,df_act)
    #print("TF IDF Pred : ",tfidf_pred)
    #print("Input TFIDF Matrix : ",input_tfidfmatrx)
    
    ##TF_count Prediction
    tf_count_pred,input_tfcountmatrx = input_evalution_count(input_processed_text,tf_count_matrix,count_vector,df_act)
    #pdb.set_trace()
    
    
    tfidf_pred['score_new'] = tfidf_pred['score']*0.5
    tf_count_pred['score_new'] = tf_count_pred['score']*0.5
    
    tfidf_pred['flag'] = 'tfidf'
    tf_count_pred['flag'] = 'tf_count'
    
    overall_result = pd.concat([tfidf_pred,tf_count_pred])
    #print("Overall Result : ",overall_result)
    if len(overall_result)>0:
    
        overall_result = overall_result.sort_values(by='score_new',ascending=False)
        #print("Sorted Overall Result : ",overall_result)
        
        #overall_result['fuzz_valid_score'] = overall_result.apply(lambda row: fuzz_score(input_processed_text, row['concatenated_string']), axis=1)
        
        # Continue with your filtering and sorting logic
        #overall_result = overall_result[(overall_result['fuzz_valid_score'] > config.fuzzy_threshold) | (overall_result['score_new'] >= config.tf_threshold)]
        overall_result = overall_result.head(config.max_reccom)
        #print("Overall Result : ",overall_result)
        
        user_recommendation_list = overall_result[config.target_column].tolist()
        print("USer recommendation List : ",user_recommendation_list)
        
        # Check if recommendations are found
        '''
        if not overall_result.empty:
            # Concatenate your final result lists
            user_recommendation_list = overall_result[config.target_column].tolist()
            print("USer recommendation List : ",user_recommendation_list)
        else:
            # No recommendations found, return empty lists or a message indicating no recommendations
            user_recommendation_list = []
            
        '''

    return user_recommendation_list


def input_evalution(input_processed_text, df_train_mtrx,tfidf_vector,df_act):
    print("Into Input Evaluation function")
    text=input_processed_text
    print("Text : ",text)
    tfidf_vector=tfidf_vector
    print("TFIDF Vector : ",tfidf_vector)
    df_train_mtrx=df_train_mtrx
    #print("DF Train Matrix : ",df_train_mtrx)
    
    input_tfidf=tfidf_vector.transform([text])
    print("Input TF IDF : ",input_tfidf)
    x=input_tfidf.todense()
    print("X : ",x)
    df_tst = pd.DataFrame(x)
    #print("Df Test Input Evaluation : ",df_tst)
    ## Replacing Nan values in matrix with 0
    df_train_mtrx_nan=np.isnan(df_train_mtrx)
    #print("DF Train MAtrix Nan : ",df_train_mtrx_nan)
    df_train_mtrx[df_train_mtrx_nan] = 0
    ## Appending df_tst to df_train
    #df_train_mtrx = df_train_mtrx.append(df_tst)
    #print("DF Train Matrix after appending : ",df_train_mtrx)
    ## Calculating Cosine Similarity-->issue
    scr=cosine_similarity(df_train_mtrx, df_tst)
    print("Cosine Similarity : ",scr)
    df_chk = pd.DataFrame()
    df_chk['ticket_id']=df_train_mtrx.index
    df_chk['score']=scr
    print("DF CHeck : ",df_chk.head())
    
    # Filter 'df_chk' to keep rows where the 'score' is greater than 0.25
    score = df_chk[df_chk['score'] > 0.45]['score'].tolist()
    print("Score : ", score)
    
    # Get the indexes where the score is above the threshold
    indexes = df_chk[df_chk['score'] > 0.45].index
    print("Indexes : ",indexes)
    
    # Retrieve values from the 'df_train_mtrx' DataFrame based on the indexes
    df_eval = df_train_mtrx.iloc[indexes]
    df_eval['score'] = score
    #print("DF eval : ", df_eval.head())

    return df_eval, df_tst


def input_evalution_count(text, df_train_mtrx,count_vector,df_act):
    print("Into Input Evaluation Count function")
    text=text
    print("Text : ",text)
    input_count=count_vector.transform([text])
    print("Input Count : ",input_count)
    x=input_count.todense()
    print("X : ",x)
    df_tst = pd.DataFrame(x)
    #print("DF Test in evaluation count : ",df_tst)
    
    ## Replacing Nan values in matrix with 0
    df_train_mtrx_nan=np.isnan(df_train_mtrx)
    #print("DF Train MAtrix Nan : ",df_train_mtrx_nan)
    df_train_mtrx[df_train_mtrx_nan] = 0
    # Appending input data to train dataset 
    #df_train_mtrx = df_train_mtrx.append(df_tst.head())
    #print("DF Train Matrix after appending : ",df_train_mtrx)
    ## Calculating cosine similarity
    scr=cosine_similarity(df_train_mtrx, df_tst)
    print("Cosine Similarity inside Input evaluation : ",scr)
    df_chk = pd.DataFrame()
    df_chk['ticket_id']=df_train_mtrx.index
    df_chk['score']=scr
    
    # Filter 'df_chk' to keep rows where the 'score' is greater than 0.25
    score = df_chk[df_chk['score'] > 0.50]['score'].tolist()
    print("Score : ", score)
    
    # Get the indexes where the score is above the threshold
    indexes = df_chk[df_chk['score'] > 0.50].index
    print("Indexes : ",indexes)
    
    # Retrieve values from the 'df_train_mtrx' DataFrame based on the indexes
    df_eval = df_train_mtrx.iloc[indexes]
    df_eval['score'] = score
    #print("DF eval : ", df_eval.head())

    return df_eval, df_tst


##Testing this -

# Sample input data
input_ticket_category = 'Process'
input_ticket_type = 'HRO  Payroll'
input_ticket_item = 'Benefits and Payments'
input_ticket_summary = 'Incorrect Result'
input_ticket_desc = 'Dear Sir, As per the attached screen shots...'
input_ticket_severity = '4 - Default'

# Call the event_prediction function
user_recommendations = event_prediction(
    input_ticket_category,
    input_ticket_type,
    input_ticket_item,
    input_ticket_summary,
    input_ticket_desc,
)

# Print the user recommendations
print("User Recommendations:", user_recommendations)

# Add debug information to check if the functions are being called
#print("Debug Info - input_evalution:", input_evalution(input_processed_text, tfidf_matrix, tfidf_vector, df_act))
#print("Debug Info - input_evalution_count:", input_evalution_count(input_processed_text, tf_count_matrix, count_vector, df_act))

And we are getting output as below-

loading models Matrix ................
loading model Config................
loading Actual Data...................
loading unique noun in train data with vector value for context search ................
Input processed Text :  process hro payroll benefits payments incorrect result dear sir, per attached screen shots
Into Input Evaluation function
Text :  process hro payroll benefits payments incorrect result dear sir, per attached screen shots
TFIDF Vector :  TfidfVectorizer(stop_words='english')
Input TF IDF :    (0, 6601)	0.36862564889350036
  (0, 6534)	0.3496942376519081
  (0, 6282)	0.19397247510394874
  (0, 6005)	0.35528650950739105
  (0, 5358)	0.24346070031010753
  (0, 5005)	0.2336641199819507
  (0, 4999)	0.36153813756029163
  (0, 3300)	0.2182457922117022
  (0, 3060)	0.3496942376519081
  (0, 1767)	0.18937766196959577
  (0, 780)	0.3103140646757555
  (0, 564)	0.17696718472159018
X :  [[0. 0. 0. ... 0. 0. 0.]]
Cosine Similarity :  [[0.46867088]
 [0.2368872 ]
 [0.        ]
 ...
 [0.01469617]
 [0.        ]
 [0.04955105]]
DF CHeck :     ticket_id     score
0          0  0.468671
1          1  0.236887
2          2  0.000000
3          3  0.119194
4          4  0.222561
Score :  [0.4686708758445488, 0.4789154257385584]
Indexes :  Int64Index([0, 2294], dtype='int64')
Into Input Evaluation Count function
Text :  process hro payroll benefits payments incorrect result dear sir, per attached screen shots
Input Count :    (0, 564)	1
  (0, 780)	1
  (0, 1767)	1
  (0, 3060)	1
  (0, 3300)	1
  (0, 4999)	1
  (0, 5005)	1
  (0, 5358)	1
  (0, 6005)	1
  (0, 6282)	1
  (0, 6534)	1
  (0, 6601)	1
X :  [[0 0 0 ... 0 0 0]]

Cosine Similarity inside Input evaluation :  [[0.53674504]
 [0.30815782]
 [0.        ]
 ...
 [0.02254174]
 [0.        ]
 [0.08006408]]
Score :  [0.5367450401216933, 0.5051814855409227]
Indexes :  Int64Index([0, 2294], dtype='int64')
User Recommendations: []

We want to know why User Recommendations: [] is coming blank, also it is giving indexes at [0,2294]  .

How to check what is there at indexes [0,2294].