Untitled
unknown
plain_text
a year ago
2.4 kB
2
Indexable
Never
import logging
import pdb

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
#from feature_engineering import *

_logger = logging.getLogger(__name__)

# Historical ticket exports that form the training corpus.
_DEFAULT_TICKET_DATA_PATHS = (
    '/Analytics/venv/Jup/CAPE_ServicePlus_UC/ServicePlusIncidentData_Post_01-01-2019_Till_07-07-2019.csv',
    '/Analytics/venv/Jup/CAPE_ServicePlus_UC/ServicePlusTicket_Data_Till-2019-01-01.csv',
)

# Ticket columns whose values are concatenated into one text document per row.
_FEATURE_COLUMNS = (
    'ticket_category',
    'ticket_type',
    'ticket_item',
    'ticket_summary',
    'ticket_severity',
    'resolution_sla_violated',
    'created_date',
)

# Columns returned for every historical ticket judged similar.
_OUTPUT_COLUMNS = ('person_who_resolved', 'owner_user_id', 'role_name')

# A historical ticket is "similar" when its cosine similarity exceeds this.
_SIMILARITY_THRESHOLD = 0.35

# Shared vectorizer; it is re-fitted on the training corpus on every call.
vectorizer = TfidfVectorizer()


def _rows_to_documents(frame, columns):
    """Return one whitespace-joined string per row of *frame* over *columns*.

    Columns absent from *frame* and missing values contribute an empty string.
    """
    text = frame.reindex(columns=list(columns)).astype(object)
    text = text.where(text.notna(), "").astype(str)
    return [" ".join(row) for row in text.itertuples(index=False, name=None)]


def get_top_persons_who_resolved(
    pred_data,
    *,
    ticket_paths=None,
    similarity_threshold=_SIMILARITY_THRESHOLD,
):
    """Return resolver details of historical tickets similar to *pred_data*.

    The historical tickets are loaded from CSV, each ticket (historical and
    new) is flattened into one text document built from its feature columns,
    the documents are TF-IDF vectorized, and every historical ticket whose
    cosine similarity to at least one new ticket is strictly greater than
    ``similarity_threshold`` is selected.

    Args:
        pred_data: The new ticket(s). A ``pandas.DataFrame`` with one row per
            ticket, or a mapping / Series of scalar fields describing a single
            ticket (NOTE(review): the non-DataFrame form is an assumption
            about what the API caller sends -- confirm). Must contain
            ``created_date``. The argument is not modified.
        ticket_paths: Iterable of CSV paths holding the historical tickets.
            Defaults to the two ServicePlus exports.
        similarity_threshold: Cosine-similarity cut-off; defaults to 0.35.

    Returns:
        A ``pandas.DataFrame`` with the columns ``person_who_resolved``,
        ``owner_user_id`` and ``role_name`` for the matching historical
        tickets, in their original row order. Empty when nothing matches or
        when either side has no rows.

    Raises:
        KeyError: If ``created_date`` is missing from *pred_data*, or a
            required column is missing from the historical data.
    """
    # Work on a private copy so the caller's object is never mutated.
    if isinstance(pred_data, pd.DataFrame):
        new_data = pred_data.copy()
    else:
        new_data = pd.DataFrame([dict(pred_data)])
    # Parse dates on both sides so they render identically in the documents.
    new_data['created_date'] = pd.to_datetime(new_data['created_date'])
    _logger.debug("New Data %s", new_data)

    paths = _DEFAULT_TICKET_DATA_PATHS if ticket_paths is None else ticket_paths
    ticket_data = pd.concat(map(pd.read_csv, paths), ignore_index=True)
    ticket_data['created_date'] = pd.to_datetime(ticket_data['created_date'])
    _logger.debug("Dataframe %s", ticket_data)

    # Plain column selection so a missing column fails loudly with KeyError.
    train_data = ticket_data[list(_FEATURE_COLUMNS)]
    output_df = ticket_data[list(_OUTPUT_COLUMNS)]
    _logger.debug("Train data %s", train_data)
    _logger.debug("Output Columns %s", output_df)

    if train_data.empty or new_data.empty:
        return output_df.iloc[[]]

    # TfidfVectorizer expects an iterable of strings. Passing a DataFrame
    # would iterate its column labels, so build one document per row instead.
    train_documents = _rows_to_documents(train_data, _FEATURE_COLUMNS)
    new_documents = _rows_to_documents(new_data, _FEATURE_COLUMNS)

    X = vectorizer.fit_transform(train_documents)
    _logger.debug("Train data vectorized %s", X)
    X_new = vectorizer.transform(new_documents)
    _logger.debug("API data vectorized %s", X_new)

    # Shape is (n_new_tickets, n_training_tickets).
    similarity_matrix = cosine_similarity(X_new, X)
    _logger.debug("Similarity Matrix: %s", similarity_matrix)

    # Training tickets live on axis 1; keep each one that is similar to at
    # least one new ticket, without duplicates and in ascending row order.
    is_similar = (similarity_matrix > similarity_threshold).any(axis=0)
    similar_row_indices = np.flatnonzero(is_similar)
    _logger.debug("Similar Row Indices %s", similar_row_indices)

    predicted_output_data = output_df.iloc[similar_row_indices]
    _logger.debug("Predicted Output %s", predicted_output_data)
    return predicted_output_data