Untitled
unknown
plain_text
2 years ago
2.9 kB
1
Indexable
Never
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Minimum cosine similarity between the query and its best-matching ticket
# for that ticket to be trusted as a match.
SIMILARITY_THRESHOLD = 0.5
# Sentinel message returned (wrapped in a list) when no ticket is similar enough.
NO_SOLUTION_MESSAGE = "No solution found"

# Load the preprocessed data into a pandas dataframe.
# NOTE(review): Incident_Data is not defined in the visible source; it must
# already exist in the running session (e.g. an earlier notebook cell).
df = Incident_Data
print(df.shape)

# Create the X tf-idf matrix: one document per ticket, built by joining the
# descriptive text columns. Missing values become empty strings and every
# value is coerced to str so that ' '.join cannot fail on non-string cells.
X_text_cols = ['ticket_category', 'ticket_type', 'ticket_summary', 'ticket_desc']
X = df[X_text_cols].fillna('').astype(str).apply(' '.join, axis=1)
vectorizer_x = TfidfVectorizer(max_features=5000, stop_words='english')
X_tfidf = vectorizer_x.fit_transform(X)
print(X_tfidf.shape)

# Create the Y tf-idf matrix from the solutions. Missing solutions are
# replaced by empty strings because TfidfVectorizer rejects NaN documents.
Y = df['ticket_solution'].fillna('').astype(str)
vectorizer_y = TfidfVectorizer(max_features=5000, stop_words='english')
Y_tfidf = vectorizer_y.fit_transform(Y)
print(Y_tfidf.shape)

# Index labels of df, in row order. Row i of X_tfidf / Y_tfidf corresponds to
# the i-th row of df (position), whose label is index[i].
index = df.index.values
print("Index", index)


def get_top_3_solutions(input_data):
    """Return the solutions of the three tickets closest to the best match.

    The query text is vectorised with ``vectorizer_x`` and compared against
    every ticket in ``X_tfidf``. If the best match reaches
    ``SIMILARITY_THRESHOLD``, that ticket's solution vector is compared
    against all rows of ``Y_tfidf`` and the three most similar solutions
    (the matched ticket's own solution included) are returned.

    Args:
        input_data: Free-text description of the incident to look up.

    Returns:
        A list of up to three solution strings ordered by decreasing
        similarity, or ``["No solution found"]`` when the best match scores
        below the threshold.
    """
    # Get the tf-idf vector for the input data.
    input_vector_x = vectorizer_x.transform([input_data])

    # Cosine similarity between the query and every ticket description.
    sim_scores_x = cosine_similarity(input_vector_x, X_tfidf).flatten()

    # Row position (not index label) of the most similar ticket.
    best_pos = sim_scores_x.argmax()
    similarity_score = sim_scores_x[best_pos]
    print("Cosine similarity score: ", similarity_score)

    if similarity_score < SIMILARITY_THRESHOLD:
        return [NO_SOLUTION_MESSAGE]

    # The label is reported for information only; all lookups below are
    # positional, so they stay correct even when df has a non-default index.
    print("Ticket index", index[best_pos])

    # Compare the matched ticket's solution with every other solution.
    ticket_vector_y = Y_tfidf[best_pos]
    sim_scores_y = cosine_similarity(ticket_vector_y, Y_tfidf).flatten()

    # Row positions of the three most similar solutions, best first.
    top_positions = sim_scores_y.argsort()[::-1][:3]
    return [Y.iloc[pos] for pos in top_positions]


# Example usage
input_data = "Please reset password"
top_3_solutions = get_top_3_solutions(input_data)
if top_3_solutions != [NO_SOLUTION_MESSAGE]:
    print("Top 3 ticket solutions:")
    for solution in top_3_solutions:
        print(solution)
else:
    print(top_3_solutions)