Untitled

mail@pastecode.io avatarunknown
plain_text
2 months ago
5.0 kB
1
Indexable
Never
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from feature_engineering import *
import pdb

vectorizer = TfidfVectorizer()

def get_top_persons_who_resolved(pred_data):
    distance_metric='cosine'
    print("pred_data: ", pred_data)
    row=feature_engineering(pred_data)
    print("row: ", row)
    
    ticket_data= ticket_data = pd.concat(map(pd.read_csv, ['/Analytics/venv/Jup/CAPE_ServicePlus_UC/ServicePlusIncidentData_Post_01-01-2019_Till_07-07-2019.csv', '/Analytics/venv/Jup/CAPE_ServicePlus_UC/ServicePlusTicket_Data_Till-2019-01-01.csv']), ignore_index=True)
    df=feature_engineering(ticket_data)
    #print("Dataframe",df)
    
    # Sample training data with text features
    train_data = df.drop(columns=['person_who_resolved','owner_user_id','role_name'])
    print("Train data",train_data)

    output_df =df[['person_who_resolved','owner_user_id','role_name']]
    print("Output Columns",output_df)

    # New data for similarity calculation
    new_data = row
    print("New Data",new_data)

    # Create TF-IDF vectorizer and fit on training data
    #vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(train_data)
    print("Train data vectorized",X)
    
    # Transform new data using the same vectorizer
    X_new = vectorizer.transform(new_data)
    print("API data vectorized",X_new)

    # Calculate cosine similarity between new data and training data
    similarity_matrix = cosine_similarity(X_new, X)
    print("Similarity Matrix:",similarity_matrix)

    # Find the most similar training data indices for each new data point
    similar_row_indices = np.where(similarity_matrix > 0.7)[0]
    print('Similar Row Indices',similar_row_indices)
    
    # Get the corresponding output TF-IDF vectors for new data
    predicted_output_data=output_df.iloc[similar_row_indices]
    print("Predicted Output",predicted_output_data)
    
    return predicted_output_data
    
    Can you tell me what is this part of code doing-

 # Create TF-IDF vectorizer and fit on training data
    #vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(train_data)
    print("Train data vectorized",X)
    
    # Transform new data using the same vectorizer
    X_new = vectorizer.transform(new_data)
    print("API data vectorized",X_new)

    # Calculate cosine similarity between new data and training data
    similarity_matrix = cosine_similarity(X_new, X)
    print("Similarity Matrix:",similarity_matrix)


and this is the output I am getting-

Train data vectorized   (0, 51)	1.0
  (1, 31)	1.0
  (2, 56)	1.0
  (3, 47)	1.0
  (4, 65)	1.0
  (5, 52)	1.0
  (6, 63)	1.0
  (7, 59)	1.0
  (8, 45)	1.0
  (9, 54)	1.0
  (10, 46)	1.0
  (11, 57)	1.0
  (12, 24)	1.0
  (13, 25)	1.0
  (14, 49)	1.0
  (15, 35)	1.0
  (16, 33)	1.0
  (17, 60)	1.0
  (18, 55)	1.0
  (19, 61)	1.0
  (20, 48)	1.0
  (21, 64)	1.0
  (22, 28)	1.0
  (23, 50)	1.0
  (24, 62)	1.0
  :	:
  (41, 40)	1.0
  (42, 1)	1.0
  (43, 58)	1.0
  (44, 11)	1.0
  (45, 14)	1.0
  (46, 29)	1.0
  (47, 13)	1.0
  (48, 16)	1.0
  (49, 43)	1.0
  (50, 9)	1.0
  (51, 27)	1.0
  (52, 26)	1.0
  (53, 44)	1.0
  (54, 18)	1.0
  (55, 12)	1.0
  (56, 0)	1.0
  (57, 4)	1.0
  (58, 36)	1.0
  (59, 42)	1.0
  (60, 19)	1.0
  (61, 30)	1.0
  (62, 41)	1.0
  (63, 34)	1.0
  (64, 32)	1.0
  (65, 37)	1.0
API data vectorized   (0, 47)	1.0
  (1, 65)	1.0
  (2, 52)	1.0
  (3, 63)	1.0
  (4, 59)	1.0
  (5, 33)	1.0
Similarity Matrix: [[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
Similar Row Indices [0 1 2 3 4 5]

Can you explain this output as well