Untitled
unknown
plain_text
3 years ago
2.0 kB
8
Indexable
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Work on the preprocessed ticket data; `data` must already be bound by
# earlier code (this is an alias, not a copy).
df = data
print(df.shape)

# Build one text document per ticket by joining the descriptive columns with
# spaces; missing values become empty strings so the join never sees NaN.
X_text_cols = ['ticket_category', 'ticket_type', 'ticket_item']
X = (
    df[X_text_cols]
    .fillna('')
    .apply(' '.join, axis=1)
)

# Fit the TF-IDF vocabulary on those documents and vectorize them in one step.
vectorizer_x = TfidfVectorizer(stop_words='english', max_features=5000)
X_tfidf = vectorizer_x.fit_transform(X)
print(X_tfidf.shape)

# Keep the dataframe's index labels alongside the matrix rows.
index = df.index.values
print("Index", index)
def impute_assigned_to(row, threshold=0.35):
    """Return an assignee for *row*, borrowing one from a similar ticket if missing.

    When ``row['assigned_to']`` is already set it is returned unchanged.
    Otherwise the row's category/type/item text is vectorized with the
    module-level ``vectorizer_x`` and compared (cosine similarity) against
    every row of the module-level ``X_tfidf`` matrix. The assignee of the
    best-matching ticket is returned when its similarity reaches *threshold*.

    Args:
        row: Mapping-like ticket record (e.g. a DataFrame row) exposing
            ``assigned_to``, ``ticket_category``, ``ticket_type`` and
            ``ticket_item``.
        threshold: Minimum cosine similarity (inclusive) required to accept
            the best match. Defaults to 0.35.

    Returns:
        The existing or borrowed ``assigned_to`` value, or the string
        ``"No solution found"`` when no acceptable match exists.
    """
    # Guard clause: nothing to impute when the ticket already has an assignee.
    if not pd.isnull(row['assigned_to']):
        return row['assigned_to']

    # Build the query text the same way the corpus was built: missing fields
    # become empty strings, so a NaN field cannot break the join.
    text_fields = ('ticket_category', 'ticket_type', 'ticket_item')
    input_data = ' '.join(
        '' if pd.isnull(row[field]) else str(row[field])
        for field in text_fields
    )

    # Only tickets that already have an assignee can donate one. Without this
    # filter the best match is typically the query row itself (it is part of
    # the fitted matrix when this is applied over `df`), which would hand
    # back NaN.
    has_assignee = df['assigned_to'].notna().to_numpy()
    if not has_assignee.any():
        return "No solution found"

    input_vector_x = vectorizer_x.transform([input_data])
    sim_scores_x = cosine_similarity(input_vector_x, X_tfidf).ravel()

    # TF-IDF cosine similarities are non-negative, so -1 reliably removes the
    # unassigned rows from the argmax.
    candidate_scores = np.where(has_assignee, sim_scores_x, -1.0)
    best_position = int(candidate_scores.argmax())

    if candidate_scores[best_position] >= threshold:
        # Matrix rows line up with dataframe rows by position, so index
        # positionally rather than feeding index labels to `iloc`.
        return df['assigned_to'].iloc[best_position]
    return "No solution found"
# Run the imputer once per row (axis=1 passes each row to the function), then
# overwrite the column with the values it returned.
imputed_assignees = df.apply(impute_assigned_to, axis=1)
df['assigned_to'] = imputed_assignees

# Show the first rows as a quick check of the result.
print(df.head())
Editor is loading...