# Untitled

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Work on the preprocessed ticket data (`data` must already be defined)
df = data
print(df.shape)

# Build one text document per ticket from its descriptive columns;
# missing values are blanked first so they contribute no tokens
X_text_cols = ['ticket_category', 'ticket_type', 'ticket_item']
filled_text_cols = df[X_text_cols].fillna('')
X = filled_text_cols.apply(' '.join, axis=1)

# Fit the TF-IDF vocabulary on those documents and vectorise them
vectorizer_x = TfidfVectorizer(stop_words='english', max_features=5000)
X_tfidf = vectorizer_x.fit_transform(X)
print(X_tfidf.shape)

# Keep the dataframe's index labels; row i of X_tfidf was built from the
# dataframe row whose label is index[i]
index = df.index.values
print("Index", index)

def impute_assigned_to(row, threshold=0.35):
    """Return an ``assigned_to`` value for *row*, imputing it when missing.

    If the row already has an ``assigned_to`` value it is returned unchanged.
    Otherwise the row's text fields are vectorised with the module-level
    ``vectorizer_x`` and compared (cosine similarity) against ``X_tfidf``.
    Only tickets that already have an assignee are considered as matches,
    so a row can never "match" itself or another unassigned ticket.

    Relies on the module-level ``df``, ``vectorizer_x`` and ``X_tfidf``;
    ``X_tfidf`` row *i* must correspond to ``df`` row position *i*.

    Args:
        row: A mapping/Series with ``assigned_to``, ``ticket_category``,
            ``ticket_type`` and ``ticket_item`` entries.
        threshold: Minimum cosine similarity required to accept the best
            match. Defaults to 0.35.

    Returns:
        The existing ``assigned_to`` value, the assignee of the most similar
        assigned ticket, or the string ``"No solution found"`` when no
        assigned ticket reaches the threshold.
    """
    # Nothing to impute when the ticket already has an assignee
    if not pd.isnull(row['assigned_to']):
        return row['assigned_to']

    # Build the query text the same way the corpus was built: missing
    # fields become empty strings instead of breaking str.join with a NaN
    text_fields = ('ticket_category', 'ticket_type', 'ticket_item')
    input_data = ' '.join(
        '' if pd.isnull(row[col]) else str(row[col]) for col in text_fields
    )

    # Similarity of this ticket against every ticket in the corpus
    input_vector_x = vectorizer_x.transform([input_data])
    sim_scores_x = cosine_similarity(input_vector_x, X_tfidf).flatten()

    # Only tickets with a known assignee are useful donors; without this
    # mask the best match is usually the query row itself (score 1.0),
    # whose assigned_to is NaN
    has_assignee = df['assigned_to'].notna().to_numpy()
    if not has_assignee.any():
        return "No solution found"
    candidate_scores = np.where(has_assignee, sim_scores_x, -1.0)

    most_similar_ticket_idx = candidate_scores.argmax()
    similarity_score = candidate_scores[most_similar_ticket_idx]

    # Reject weak matches
    if similarity_score < threshold:
        return "No solution found"

    # argmax yields a row *position*, so index positionally; going through
    # index labels with iloc breaks on any non-default index
    return df.iloc[most_similar_ticket_idx]['assigned_to']

# Run the imputation row by row, then write the results back in one step
imputed_assignees = df.apply(impute_assigned_to, axis=1)
df['assigned_to'] = imputed_assignees

# Show the first rows of the updated dataframe
print(df.head())
# Editor is loading...