Untitled

mail@pastecode.io avatar
unknown
plain_text
2 years ago
2.9 kB
1
Indexable
Never
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the preprocessed data into a pandas dataframe.
# NOTE(review): `Incident_Data` is not defined in this file; it is presumably
# a DataFrame created earlier in the session/notebook -- confirm before
# running this as a standalone script.
df = Incident_Data
print(df.shape)

# Create the X tf-idf vector: the descriptive text columns of every ticket are
# merged into one document per row. Missing cells become empty strings and
# every cell is cast to str so that ' '.join never meets a NaN or a number.
X_text_cols = ['ticket_category', 'ticket_type', 'ticket_summary', 'ticket_desc']
X = df[X_text_cols].fillna('').astype(str).apply(' '.join, axis=1)
vectorizer_x = TfidfVectorizer(max_features=5000, stop_words='english')
X_tfidf = vectorizer_x.fit_transform(X)
print(X_tfidf.shape)

# Create the Y tf-idf vector from the solution text. Missing solutions are
# replaced with '' (as is done for the X columns) because the vectorizer
# rejects NaN documents; such rows simply become all-zero vectors.
Y = df['ticket_solution'].fillna('').astype(str)
vectorizer_y = TfidfVectorizer(max_features=5000, stop_words='english')
Y_tfidf = vectorizer_y.fit_transform(Y)
print(Y_tfidf.shape)

# Row i of X_tfidf and row i of Y_tfidf both come from the i-th row of df;
# index[i] is the dataframe label of that row (a label, not a position).
index = df.index.values
print("Index",index)

def get_top_3_solutions(input_data, min_similarity=0.5):
    """Return the solutions of the three tickets most related to a query.

    The query is vectorized with ``vectorizer_x`` and compared against every
    row of ``X_tfidf``. If the best match reaches ``min_similarity``, the
    solution vector of that ticket is compared against every row of
    ``Y_tfidf`` and the ``ticket_solution`` values of the three most similar
    rows are returned, highest similarity first.

    Args:
        input_data: Free-text description of the problem to look up.
        min_similarity: Minimum cosine similarity (inclusive) the best
            matching ticket must reach for a result to be returned.

    Returns:
        A list of up to three ``ticket_solution`` values, or the
        single-element list ``["No solution found"]`` when no ticket is
        similar enough.

    Side effects:
        Prints the best cosine similarity score and, on a match, the
        dataframe index label of the best matching ticket.
    """
    no_match = ["No solution found"]

    # Get the tf-idf vector for the input data
    input_vector_x = vectorizer_x.transform([input_data])

    # Calculate the cosine similarity between the input vector and X_tfidf
    sim_scores_x = cosine_similarity(input_vector_x, X_tfidf).flatten()
    if sim_scores_x.size == 0:
        # Nothing to compare against; argmax would raise on an empty array.
        return no_match

    # Positional row number of the most similar ticket. Rows of X_tfidf,
    # Y_tfidf and df share the same order, so this position is valid for all
    # three -- it must NOT be translated into a dataframe label before use.
    best_pos = int(sim_scores_x.argmax())
    similarity_score = sim_scores_x[best_pos]
    print("Cosine similarity score: ", similarity_score)

    # Written as "not >=" so a NaN score is treated as "no match".
    if not similarity_score >= min_similarity:
        return no_match

    # The label is shown for diagnostics only; lookups below stay positional.
    print("Ticket index",index[best_pos])

    # Defensive check kept from the original: the position must address a
    # row of Y_tfidf.
    if best_pos >= Y_tfidf.shape[0]:
        return no_match

    # Compare the matched ticket's solution vector with every solution
    ticket_vector_y = Y_tfidf[best_pos]
    sim_scores_y = cosine_similarity(ticket_vector_y, Y_tfidf).flatten()

    # Positions of the 3 highest scores, best first
    top_positions = sim_scores_y.argsort()[::-1][:3]

    # Positional lookup, so the result is right for any dataframe index
    return df['ticket_solution'].iloc[top_positions].tolist()

    
    
    
# Example usage: look up solutions for a sample query and print the outcome
input_data = "Please reset password"
top_3_solutions = get_top_3_solutions(input_data)

# get_top_3_solutions reports "no match" as a single-element sentinel list,
# which is printed as-is; otherwise each solution goes on its own line.
if top_3_solutions == ["No solution found"]:
    print(top_3_solutions)
else:
    print("Top 3 ticket solutions:")
    for solution in top_3_solutions:
        print(solution)