Untitled
plain_text
15 days ago
2.5 kB
1
Indexable
Never
# Translation table that turns every ASCII digit into a space, so that digits
# glued to letters (e.g. "11002310mss") are removed without merging words.
_DIGITS_TO_SPACES = str.maketrans('0123456789', ' ' * 10)


def data_pre_processing(data):
    """Normalise the ticket columns and build the ``concatenated_string`` feature.

    The frame is modified in place and also returned.

    Steps:
      * lowercase every object-dtype column except ``person_who_resolved``;
      * keep only the second ``-``-separated piece of ``ticket_severity``;
      * pass ``person_who_resolved`` through ``concatenate_names``;
      * cast ``owner_user_id`` to ``str``;
      * build ``concatenated_string`` from the ticket text columns, the owner
        id, the role name and the resolver name.

    Only the free-text ticket columns have their digits stripped, so junk
    numbers such as employee codes or dates disappear while ``owner_user_id``
    is appended afterwards and therefore survives untouched.

    Args:
        data: DataFrame holding the columns ``ticket_severity``,
            ``person_who_resolved``, ``owner_user_id``, ``ticket_category``,
            ``ticket_type``, ``ticket_item``, ``ticket_summary``,
            ``ticket_desc`` and ``role_name``.

    Returns:
        The same DataFrame with the columns above updated and a
        ``concatenated_string`` column added.
    """
    ## Convert all text columns to lowercase
    text_columns = [
        col for col in data.columns
        if data[col].dtype == 'O' and col != 'person_who_resolved'
    ]
    print("Text Columns inside Data Preprocessing : ", text_columns)
    data[text_columns] = data[text_columns].apply(lambda x: x.str.lower())

    ## Modifying ticket_severity
    data['ticket_severity'] = data['ticket_severity'].str.split('-').str[1]

    ## Concatenating first name and last name in person_who_resolved
    data['person_who_resolved'] = data['person_who_resolved'].apply(concatenate_names)

    ## Converting owner_user_id column as str
    data['owner_user_id'] = data['owner_user_id'].astype(str)
    print("Dtype of owner user id : ", data['owner_user_id'].dtype)

    ## Making a new column
    # Free-text part: cleaned and then stripped of every digit, because any
    # number in here (employee codes, dates, ...) is noise for training.
    free_text = (
        data['ticket_category'] + ' '
        + data['ticket_type'] + ' '
        + data['ticket_item'] + ' '
        + data['ticket_summary'] + ' '
        + data['ticket_desc']
    )
    free_text = free_text.apply(preprocess_text).apply(remove_junk_numbers)

    # Role and resolver name get the same text cleaning as before but keep
    # their digits (e.g. "l2"). The separating space between the two was
    # missing originally, which fused them into one token.
    role_and_resolver = (
        data['role_name'] + ' ' + data['person_who_resolved']
    ).apply(preprocess_text)

    # owner_user_id is appended after cleaning so it can never be altered by
    # the text-cleaning steps or by the digit removal.
    # NOTE(review): a missing value in any text column still makes the whole
    # row's concatenated_string missing, as in the original - confirm that
    # this is intended.
    data['concatenated_string'] = (
        free_text + ' ' + data['owner_user_id'] + ' ' + role_and_resolver
    )
    #print(data)
    return data


def remove_junk_numbers(text):
    """Remove every ASCII digit from *text* and collapse the whitespace.

    Digits attached to letters are removed as well, so ``"11002310mss"``
    becomes ``"mss"``. Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    return ' '.join(text.translate(_DIGITS_TO_SPACES).split())


def preprocess_text(text):
    """Clean *text* with neattext and drop stopwords.

    Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text

    # Apply various text cleaning functions using neattext
    text = nfx.remove_userhandles(text)
    text = nfx.remove_puncts(text)
    text = nfx.remove_punctuations(text)
    text = nfx.remove_phone_numbers(text)
    text = nfx.remove_special_characters(text)

    # Remove stopwords using a custom function
    text = remove_stopwords(text)

    return text


# If you see the above concatenated string - we are concatenating
# owner_user_id as well which contains numbers like-
# 112 260 1414 244 670 196 983 162 1308 108 ...
1750 1 1813 1 202 1 833 1 1119 1 Name: owner_user_id, Length: 93. Now, I want to keep owner_user_id in the training text because I also need it at prediction time, but I want to remove the other junk numbers. For example, in this text: "process hro time management leave application data entry issues dear siremp 11002310mss able deletechange leave dated 17082020 screen shot along error attached herewith 104 l2 supportTanvir Mirkar", 11002310 and 17082020 are junk numbers that I don't need in training, but I want to keep 104 because it is the owner_user_id. How can I do this?