# Paste metadata (not part of the script): Untitled | unknown | plain_text | 2 years ago | 3.2 kB | 8 | Indexable
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Load the preprocessed data into a pandas dataframe.
# NOTE(review): Query_req_data is not defined in this part of the file; it is
# presumably a DataFrame created earlier -- confirm.
df = Query_req_data
print(df.shape)

# Build the X side: one text document per ticket, made by joining the
# descriptive columns. Missing cells become '' and every cell is cast to str
# so that ' '.join never sees NaN or a non-string value.
X_text_cols = ['ticket_category', 'ticket_type', 'ticket_summary', 'ticket_desc']
X = df[X_text_cols].fillna('').astype(str).apply(' '.join, axis=1)
vectorizer_x = TfidfVectorizer(max_features=5000, stop_words='english')
X_tfidf = vectorizer_x.fit_transform(X)
print(X_tfidf.shape)

# Build the Y side: one document per ticket solution. The same missing-value
# handling as X is applied; TfidfVectorizer cannot process NaN documents.
Y = df['ticket_solution'].fillna('').astype(str)
vectorizer_y = TfidfVectorizer(max_features=5000, stop_words='english')
Y_tfidf = vectorizer_y.fit_transform(Y)
print(Y_tfidf.shape)

# Row i of X_tfidf and row i of Y_tfidf both come from the i-th row of df.
# `index` holds the DataFrame labels for those rows (labels, not positions).
index = df.index.values
print("Index",index)
def get_top_3_solutions(ticket_category, ticket_type, ticket_summary, ticket_desc,
                        *, min_similarity=0.5, top_n=3):
    """Suggest solutions for a new ticket based on the most similar known ticket.

    The four text fields are joined into one document and vectorized with
    ``vectorizer_x``. The known ticket with the highest cosine similarity in
    ``X_tfidf`` is located; if its score reaches ``min_similarity``, the
    solutions whose TF-IDF vectors in ``Y_tfidf`` are closest to that ticket's
    solution are returned.

    Args:
        ticket_category: Category text of the new ticket.
        ticket_type: Type text of the new ticket.
        ticket_summary: Summary text of the new ticket.
        ticket_desc: Description text of the new ticket.
        min_similarity: Minimum cosine similarity required to accept the
            best-matching ticket (default 0.5).
        top_n: Maximum number of solutions to return (default 3).

    Returns:
        A list of up to ``top_n`` values from ``df['ticket_solution']``,
        ordered by descending similarity, or ``["No solution found"]`` when
        the best match is below ``min_similarity``.
    """
    # Concatenate the input data into a single string. None is treated as an
    # empty field so a missing value does not make the join raise.
    fields = (ticket_category, ticket_type, ticket_summary, ticket_desc)
    input_data = ' '.join('' if part is None else str(part) for part in fields)

    # Score the query against every known ticket.
    input_vector_x = vectorizer_x.transform([input_data])
    sim_scores_x = cosine_similarity(input_vector_x, X_tfidf).flatten()

    # argmax yields a row *position* in X_tfidf. X_tfidf and Y_tfidf are built
    # row-for-row from df, so the same position addresses all three.
    best_pos = int(sim_scores_x.argmax())
    similarity_score = sim_scores_x[best_pos]
    print("Cosine similarity score: ", similarity_score)

    if similarity_score < min_similarity:
        return ["No solution found"]

    # Report the DataFrame label for information only; it must not be used as
    # a row position, because df's index need not be 0..n-1.
    print("Ticket index", df.index[best_pos])

    # Rank all solutions by similarity to the matched ticket's solution.
    solution_vector = Y_tfidf[best_pos]
    sim_scores_y = cosine_similarity(solution_vector, Y_tfidf).flatten()
    top_positions = sim_scores_y.argsort()[::-1][:top_n]

    # Positional lookup (iloc) matches the positional indices from argsort.
    solutions = df['ticket_solution']
    return [solutions.iloc[pos] for pos in top_positions]
# Example usage: query the recommender with a sample ticket and print the
# suggested solutions, or the "No solution found" sentinel list when the best
# match is not similar enough.
ticket_category = 'Application'
ticket_type = 'HCM - Recruitment'
ticket_summary = 'clarification'
ticket_desc = 'follow issue TAS application form '
top_solutions = get_top_3_solutions(ticket_category, ticket_type, ticket_summary, ticket_desc)
if top_solutions != ["No solution found"]:
    print("Top 3 ticket solutions:")
    for solution in top_solutions:
        print(solution)
else:
    print(top_solutions)