Untitled
plain_text
2 months ago
5.0 kB
1
Indexable
Never
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity import pandas as pd import numpy as np from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import train_test_split from feature_engineering import * import pdb vectorizer = TfidfVectorizer() def get_top_persons_who_resolved(pred_data): distance_metric='cosine' print("pred_data: ", pred_data) row=feature_engineering(pred_data) print("row: ", row) ticket_data= ticket_data = pd.concat(map(pd.read_csv, ['/Analytics/venv/Jup/CAPE_ServicePlus_UC/ServicePlusIncidentData_Post_01-01-2019_Till_07-07-2019.csv', '/Analytics/venv/Jup/CAPE_ServicePlus_UC/ServicePlusTicket_Data_Till-2019-01-01.csv']), ignore_index=True) df=feature_engineering(ticket_data) #print("Dataframe",df) # Sample training data with text features train_data = df.drop(columns=['person_who_resolved','owner_user_id','role_name']) print("Train data",train_data) output_df =df[['person_who_resolved','owner_user_id','role_name']] print("Output Columns",output_df) # New data for similarity calculation new_data = row print("New Data",new_data) # Create TF-IDF vectorizer and fit on training data #vectorizer = TfidfVectorizer() X = vectorizer.fit_transform(train_data) print("Train data vectorized",X) # Transform new data using the same vectorizer X_new = vectorizer.transform(new_data) print("API data vectorized",X_new) # Calculate cosine similarity between new data and training data similarity_matrix = cosine_similarity(X_new, X) print("Similarity Matrix:",similarity_matrix) # Find the most similar training data indices for each new data point similar_row_indices = np.where(similarity_matrix > 0.7)[0] print('Similar Row Indices',similar_row_indices) # Get the corresponding output TF-IDF vectors for new data predicted_output_data=output_df.iloc[similar_row_indices] print("Predicted Output",predicted_output_data) return predicted_output_data Can you tell me what is this part of code doing- # Create TF-IDF vectorizer and fit on training data #vectorizer = TfidfVectorizer() X = vectorizer.fit_transform(train_data) print("Train data vectorized",X) # Transform new data using the same vectorizer X_new = vectorizer.transform(new_data) print("API data vectorized",X_new) # Calculate cosine similarity between new data and training data similarity_matrix = cosine_similarity(X_new, X) print("Similarity Matrix:",similarity_matrix) and this is the output I am getting- Train data vectorized (0, 51) 1.0 (1, 31) 1.0 (2, 56) 1.0 (3, 47) 1.0 (4, 65) 1.0 (5, 52) 1.0 (6, 63) 1.0 (7, 59) 1.0 (8, 45) 1.0 (9, 54) 1.0 (10, 46) 1.0 (11, 57) 1.0 (12, 24) 1.0 (13, 25) 1.0 (14, 49) 1.0 (15, 35) 1.0 (16, 33) 1.0 (17, 60) 1.0 (18, 55) 1.0 (19, 61) 1.0 (20, 48) 1.0 (21, 64) 1.0 (22, 28) 1.0 (23, 50) 1.0 (24, 62) 1.0 : : (41, 40) 1.0 (42, 1) 1.0 (43, 58) 1.0 (44, 11) 1.0 (45, 14) 1.0 (46, 29) 1.0 (47, 13) 1.0 (48, 16) 1.0 (49, 43) 1.0 (50, 9) 1.0 (51, 27) 1.0 (52, 26) 1.0 (53, 44) 1.0 (54, 18) 1.0 (55, 12) 1.0 (56, 0) 1.0 (57, 4) 1.0 (58, 36) 1.0 (59, 42) 1.0 (60, 19) 1.0 (61, 30) 1.0 (62, 41) 1.0 (63, 34) 1.0 (64, 32) 1.0 (65, 37) 1.0 API data vectorized (0, 47) 1.0 (1, 65) 1.0 (2, 52) 1.0 (3, 63) 1.0 (4, 59) 1.0 (5, 33) 1.0 Similarity Matrix: [[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]] Similar Row Indices [0 1 2 3 4 5] Can you explain this output as well