Untitled
unknown
plain_text
2 years ago
2.0 kB
4
Indexable
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Minimum cosine similarity for a neighbour's assignee to be reused.
SIMILARITY_THRESHOLD = 0.35

# Load the preprocessed data into a pandas dataframe.
# NOTE(review): `data` is not defined in this snippet; it is presumably a
# DataFrame created earlier. `df` is the same object, so the imputation at the
# bottom of the script also updates `data`.
df = data
print(df.shape)

# Build one text document per ticket from the descriptive columns.
# Missing values become empty strings so the join never sees a float NaN.
X_text_cols = ['ticket_category', 'ticket_type', 'ticket_item']
X = df[X_text_cols].fillna('').astype(str).apply(' '.join, axis=1)

vectorizer_x = TfidfVectorizer(max_features=5000, stop_words='english')
X_tfidf = vectorizer_x.fit_transform(X)
print(X_tfidf.shape)

# Index labels of the dataframe, aligned row-for-row with X_tfidf.
index = df.index.values
print("Index", index)

# Only tickets that already have an assignee can act as donors. Without this
# restriction a ticket's best match is frequently itself (similarity 1.0) or
# another unassigned ticket, and the "imputed" value would still be NaN.
# Positions are positional offsets (not index labels), so they stay valid for
# any dataframe index.
known_positions = np.flatnonzero(df['assigned_to'].notna().values)
known_tfidf = X_tfidf[known_positions]
known_assignees = df['assigned_to'].values[known_positions]


def impute_assigned_to(row):
    """Return the assignee for ``row``, imputing it when it is missing.

    If ``row['assigned_to']`` is present it is returned unchanged. Otherwise
    the row's text columns are vectorised with the fitted TF-IDF vectorizer
    and compared, by cosine similarity, against every ticket that already has
    an assignee. The assignee of the most similar ticket is returned when the
    similarity is at least ``SIMILARITY_THRESHOLD``.

    Args:
        row: A row of ``df`` (as passed by ``df.apply(..., axis=1)``)
            containing ``assigned_to`` and the columns in ``X_text_cols``.

    Returns:
        The existing or imputed assignee, or the string
        ``"No solution found"`` when no labelled ticket is similar enough.
    """
    if not pd.isnull(row['assigned_to']):
        return row['assigned_to']

    # Nothing to learn from if no ticket has an assignee.
    if known_tfidf.shape[0] == 0:
        return "No solution found"

    # Build the query text exactly the way the corpus documents were built,
    # so NaN text fields do not raise a TypeError in str.join.
    input_data = ' '.join(row[X_text_cols].fillna('').astype(str))
    input_vector_x = vectorizer_x.transform([input_data])

    # Similarity of this ticket to every labelled ticket.
    sim_scores_x = cosine_similarity(input_vector_x, known_tfidf).flatten()
    most_similar_ticket_idx = sim_scores_x.argmax()
    similarity_score = sim_scores_x[most_similar_ticket_idx]

    if similarity_score >= SIMILARITY_THRESHOLD:
        return known_assignees[most_similar_ticket_idx]
    return "No solution found"


# Impute the assigned_to values for rows with NaN values
df['assigned_to'] = df.apply(impute_assigned_to, axis=1)

# Display the updated dataframe
print(df.head())
Editor is loading...