Untitled

mail@pastecode.io avatarunknown
plain_text
a month ago
1.7 kB
1
Indexable
Never
def data_pre_processing(data):
    """Prepare a raw ticket DataFrame for downstream use.

    Steps, in order:

    1. Parse ``created_date`` and ``ticket_resolution_date`` as datetimes.
    2. Add ``ticket_resolution_time``: resolution minus creation, expressed
       in hours as a float.
    3. Drop the two date columns.
    4. Lowercase every object-dtype column.
    5. Clean ``ticket_desc`` with ``preprocess_text``.

    Args:
        data: pandas DataFrame containing at least the columns
            ``created_date``, ``ticket_resolution_date`` and ``ticket_desc``.

    Returns:
        The same DataFrame object that was passed in; it is modified in
        place (columns added, dropped and overwritten).

    Raises:
        KeyError: if one of the required columns is missing.
    """
    # Parse the 'created_date' and 'ticket_resolution_date' columns so they
    # can be subtracted from each other.
    created = pd.to_datetime(data['created_date'])
    resolved = pd.to_datetime(data['ticket_resolution_date'])

    # Subtract whole columns rather than applying a lambda row by row: it is
    # much faster and also behaves sensibly on a frame with zero rows.
    # to_timedelta normalises the difference to a timedelta dtype so the
    # `.dt` accessor is available even if the subtraction yielded an
    # object-dtype result.
    elapsed = pd.to_timedelta(resolved - created)
    data['ticket_resolution_time'] = elapsed.dt.total_seconds() / 3600

    # The raw timestamps are no longer needed once the duration exists.
    data.drop(columns=['ticket_resolution_date', 'created_date'], inplace=True)

    # Convert all text (object-dtype) columns to lowercase.
    text_columns = [col for col in data.columns if data[col].dtype == 'O']
    for col in text_columns:
        data[col] = data[col].str.lower()

    # Lowercasing happens first, so preprocess_text sees lowercase input.
    data['ticket_desc'] = data['ticket_desc'].apply(preprocess_text)
    return data

def preprocess_text(text):
    """Clean a single text value.

    String input is passed through a fixed sequence of neattext (``nfx``)
    cleaners -- user handles, puncts, punctuations, numbers, special
    characters -- and then through ``remove_stopwords``.

    Args:
        text: the value to clean.

    Returns:
        The cleaned string, or ``text`` itself, untouched, when it is not
        a ``str``.
    """
    if isinstance(text, str):
        # neattext cleaning steps; the order is significant and fixed.
        cleaners = (
            nfx.remove_userhandles,
            nfx.remove_puncts,
            nfx.remove_punctuations,
            nfx.remove_numbers,
            nfx.remove_special_characters,
        )
        for clean in cleaners:
            text = clean(text)

        # Stopword removal uses the module's own helper, not neattext.
        text = remove_stopwords(text)

    return text

def remove_stopwords(text):
    """Drop stopwords from a whitespace-separated string.

    The text is split on whitespace; a word is discarded when its
    lowercase form is found in ``stopwords`` (a name resolved from the
    surrounding module, not defined in this function). Surviving words
    keep their original casing and are re-joined with single spaces.

    Args:
        text: string to filter.

    Returns:
        The filtered string; empty when no word survives.
    """
    return ' '.join(
        token for token in text.split() if token.lower() not in stopwords
    )

def input_data_preprocess(text):
    """Return ``text`` unchanged.

    Placeholder: the intent noted by the author is to reuse the same
    preprocessing function for input data as well, but no transformation
    is applied here yet.

    Args:
        text: the input value.

    Returns:
        The very same value that was passed in.
    """
    # TODO: apply the shared preprocessing function to the input data.
    return text