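# Paste metadata copied from the pastecode.io page header (not part of the program):
# Untitled
#
# mail@pastecode.io avatar
# unknown
# plain_text
# a year ago
# 2.4 kB
# 2
# Indexable
# Never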
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
#from feature_engineering import *
import pdb

# Module-level TF-IDF vectorizer. get_top_persons_who_resolved() calls
# fit_transform() on it on every invocation, so its fitted state always
# reflects the most recent call.
vectorizer = TfidfVectorizer()

# Default ticket-history CSV exports that are concatenated into the corpus.
_DEFAULT_TICKET_CSVS = (
    '/Analytics/venv/Jup/CAPE_ServicePlus_UC/ServicePlusIncidentData_Post_01-01-2019_Till_07-07-2019.csv',
    '/Analytics/venv/Jup/CAPE_ServicePlus_UC/ServicePlusTicket_Data_Till-2019-01-01.csv',
)

# Ticket attributes joined into one text document per ticket for TF-IDF.
_FEATURE_COLUMNS = ['ticket_category', 'ticket_type', 'ticket_item', 'ticket_summary',
                    'ticket_severity', 'resolution_sla_violated', 'created_date']

# Columns returned for every historical ticket judged similar.
_OUTPUT_COLUMNS = ['person_who_resolved', 'owner_user_id', 'role_name']


def _as_frame(pred_data):
    """Return a new DataFrame built from *pred_data* without modifying it."""
    if isinstance(pred_data, pd.DataFrame):
        return pred_data.copy()
    if isinstance(pred_data, pd.Series):
        # A Series is treated as one ticket: its index becomes the columns.
        return pred_data.to_frame().T
    try:
        return pd.DataFrame(pred_data)
    except ValueError:
        # pandas rejects a mapping of plain scalars; treat it as a single row.
        return pd.DataFrame([pred_data])


def _rows_to_text(frame, columns):
    """Join *columns* of every row of *frame* into one space-separated string.

    Columns absent from *frame* and missing cells contribute an empty string,
    so they add no tokens to the document.
    """
    cells = frame.reindex(columns=columns)
    cells = cells.astype(object).where(cells.notna(), '').astype(str)
    return [' '.join(values) for values in cells.to_numpy()]


def get_top_persons_who_resolved(pred_data, *, threshold=0.35, data_paths=None):
    """Return resolver details of historical tickets similar to *pred_data*.

    The historical tickets are read from CSV, every ticket is turned into a
    single text document (the feature columns joined with spaces), and the
    module-level ``vectorizer`` is fitted on those documents. The incoming
    ticket(s) are vectorized the same way and compared to every historical
    ticket with cosine similarity.

    Args:
        pred_data: The incoming ticket(s). A DataFrame (one row per ticket),
            a Series, or a mapping of column name to value(s). It must
            provide ``created_date``; any other feature column that is
            missing is treated as empty text. The argument is not modified.
        threshold: A historical ticket is returned when its cosine
            similarity to at least one incoming ticket is strictly greater
            than this value.
        data_paths: Iterable of CSV paths holding the historical tickets.
            Defaults to the two ServicePlus exports in
            ``_DEFAULT_TICKET_CSVS``.

    Returns:
        DataFrame with the columns ``person_who_resolved``,
        ``owner_user_id`` and ``role_name`` for every matching historical
        ticket, ordered from most to least similar. Empty when nothing
        exceeds the threshold or when *pred_data* has no rows.

    Raises:
        KeyError: If ``created_date`` is missing from *pred_data*, or a
            feature/output column is missing from the historical data.
    """
    print("pred_data: ", pred_data)
    # Work on a copy so the caller's object keeps its original created_date.
    new_data = _as_frame(pred_data)
    new_data['created_date'] = pd.to_datetime(new_data['created_date'])
    print("row: ", new_data)

    paths = _DEFAULT_TICKET_CSVS if data_paths is None else data_paths
    ticket_data = pd.concat(map(pd.read_csv, paths), ignore_index=True)
    ticket_data['created_date'] = pd.to_datetime(ticket_data['created_date'])
    print("Dataframe ", ticket_data)

    train_data = ticket_data[_FEATURE_COLUMNS]
    print("Train data", train_data)

    output_df = ticket_data[_OUTPUT_COLUMNS]
    print("Output Columns", output_df)

    print("New Data", new_data)

    # The vectorizer expects an iterable of strings. Passing a DataFrame
    # would iterate its column labels, so build one document per row instead.
    X = vectorizer.fit_transform(_rows_to_text(train_data, _FEATURE_COLUMNS))
    print("Train data vectorized", X)

    if len(new_data) == 0:
        # Nothing to compare against; return an empty result with the
        # expected columns.
        return output_df.iloc[[]]

    X_new = vectorizer.transform(_rows_to_text(new_data, _FEATURE_COLUMNS))
    print("API data vectorized", X_new)

    # Shape is (incoming tickets, historical tickets).
    similarity_matrix = cosine_similarity(X_new, X)
    print("Similarity Matrix:", similarity_matrix)

    # Historical tickets live on axis 1. Score each one by its best match
    # against any incoming ticket so every row is returned at most once.
    best_score = similarity_matrix.max(axis=0)
    similar_row_indices = np.flatnonzero(best_score > threshold)
    ranking = np.argsort(-best_score[similar_row_indices], kind='stable')
    similar_row_indices = similar_row_indices[ranking]
    print('Similar Row Indices', similar_row_indices)

    predicted_output_data = output_df.iloc[similar_row_indices]
    print("Predicted Output", predicted_output_data)

    return predicted_output_data