# Paste-site header (title "Untitled", type plain_text, 4.0 kB, posted 2 years ago) - not part of the program.
from sklearn.metrics import pairwise_distances
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from feature_engineering import *
# Build the engineered ticket table; `df` and `processed_data` are two names
# for the same object returned by feature_engineering().
df = processed_data = feature_engineering()

# Keep the row labels of the table around (links feature rows back to their index).
index = df.index.values

# Encode role names as integers, and store the round-tripped (decoded) names
# next to them so results can be reported in readable form.
label_enc = LabelEncoder()
df['role_name_encoded'] = label_enc.fit_transform(df['role_name'])
df['role_name_decoded'] = label_enc.inverse_transform(df['role_name_encoded'])
def get_top_5_person_who_resolved(df, row, distance_metric='cosine', top_n=5):
    """Return the resolvers of the tickets in ``df`` closest to ``row``.

    Every row of ``df`` is compared with ``row`` over a fixed set of feature
    columns, and the ``top_n`` rows with the smallest distance are reported,
    closest first.

    Args:
        df: DataFrame of candidate tickets. Must contain the feature columns
            listed in ``feature_columns`` below plus ``person_who_resolved``,
            ``owner_user_id`` and ``role_name_decoded``.
        row: The query ticket; indexable by the same column names as ``df``.
        distance_metric: One of ``'cosine'``, ``'euclidean'`` or
            ``'manhattan'``; passed through to ``pairwise_distances``.
        top_n: Number of closest tickets to report. Defaults to 5; if ``df``
            has fewer rows, all of them are reported.

    Returns:
        A 2-tuple ``(recommendations, actual)`` where ``recommendations`` is a
        list of ``(person_who_resolved, owner_user_id, role_name_decoded)``
        tuples for the closest tickets, and ``actual`` is the same triple
        taken from ``row`` itself.

    Raises:
        ValueError: If ``distance_metric`` is not one of the supported names.
    """
    # Reject unsupported metrics before doing any work.
    if distance_metric not in ('cosine', 'euclidean', 'manhattan'):
        raise ValueError('Invalid distance metric')

    # Single definition of the feature space shared by the query and candidates.
    # NOTE(review): these columns are assumed to be numeric already - confirm
    # against feature_engineering().
    feature_columns = [
        'ticket_category', 'ticket_type', 'ticket_item', 'ticket_summary',
        'ticket_severity', 'resolution_sla_violated', 'reopen_count',
        'owner_user_id', 'role_name_encoded', 'ticket_resolution_time',
    ]

    # pairwise_distances expects 2-D input, so the query becomes a 1 x n matrix;
    # [0] then extracts the single row of distances to every candidate.
    query_vector = np.array(row[feature_columns].tolist()).reshape(1, -1)
    distances = pairwise_distances(
        query_vector, df[feature_columns], metric=distance_metric
    )[0]

    # Positions of the top_n smallest distances, closest first.
    closest_indices = np.argsort(distances)[:top_n]
    closest = df.iloc[closest_indices]

    recommendations = list(zip(
        closest['person_who_resolved'],
        closest['owner_user_id'],
        closest['role_name_decoded'],
    ))
    actual = (
        row['person_who_resolved'],
        row['owner_user_id'],
        row['role_name_decoded'],
    )
    return recommendations, actual
# Split the data into training and testing sets (fixed seed so the split is repeatable)
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
# Take an explicit copy: new columns are added below, and assigning onto the
# frame returned by the split can raise pandas' SettingWithCopyWarning.
test_data = test_data.copy()

# Apply the function to each row of the test data to get the recommendations;
# each result is a (recommendations, actual) pair, unzipped into two columns.
results = test_data.apply(lambda row: get_top_5_person_who_resolved(train_data, row), axis=1)
test_data['recommendations'], test_data['actual_person_who_resolved'] = zip(*results)

# Remove duplicate recommendations while keeping the closest-first order
# (dict.fromkeys preserves insertion order; a set would scramble the ranking).
test_data['recommendations'] = test_data['recommendations'].apply(lambda x: list(dict.fromkeys(x)))
test_data.head()