Untitled

mail@pastecode.io avatar
unknown
plain_text
10 months ago
2.5 kB
2
Indexable
Never
def data_pre_processing(data):
    
    ## Convert all text columns to lowercase
    text_columns=[col for col in data.columns if data[col].dtype=='O' and col !='person_who_resolved']
    print("Text Columns inside Data Preprocessing : ",text_columns)
    data[text_columns]=data[text_columns].apply(lambda x : x.str.lower())
    
    ## Modifying ticket_severity
    data['ticket_severity']=data['ticket_severity'].str.split('-').str[1]
      
    ## Concatenating first name and last name in person_who_resolved
    data['person_who_resolved'] = data['person_who_resolved'].apply(concatenate_names)
    
    ## Converting owner_user_id column as str
    data['owner_user_id']=data['owner_user_id'].astype(str)
    print("Dtype of owner user id : ",data['owner_user_id'].dtype)
    
    ## Making a new column
    data['concatenated_string']=data['ticket_category'] + ' ' +data['ticket_type'] + ' ' +data['ticket_item'] + ' ' + data['ticket_summary'] + ' ' +data['ticket_desc'] + ' ' + data['owner_user_id'] + ' ' + data['role_name'] + data['person_who_resolved']
    
    data['concatenated_string'] = data['concatenated_string'].apply(preprocess_text)
    #print(data)
    return data

def preprocess_text(text):
    if not isinstance(text, str):
        return text
    
    # Apply various text cleaning functions using neattext
    text = nfx.remove_userhandles(text)
    text = nfx.remove_puncts(text)
    text = nfx.remove_punctuations(text)
    text = nfx.remove_phone_numbers(text)
    
    text = nfx.remove_special_characters(text)
    
    # Remove stopwords using a custom function
    text = remove_stopwords(text)
    
    return text


If you see the above concatenated string - we are concatenating owner_user_id as well which contains numbers like-

112     260
1414    244
670     196
983     162
1308    108
       ... 
1750      1
1813      1
202       1
833       1
1119      1
Name: owner_user_id, Length: 93

Now the thing is I want this owner_user_id into training as I want it in prediction, but also want to remove the other junk numbers like in above text-

process hro time management leave application data entry issues dear siremp 11002310mss able deletechange leave dated 17082020 screen shot along error attached herewith 104 l2 supportTanvir Mirkar

11002310 and 17082020 are junk numbers and dont need them in training, but want 104 as its owner_user_id.

How to do this.