Untitled
plain_text
a month ago
3.1 kB
1
Indexable
Never
def data_pre_processing(df, column):
    """Add an upper-cased, cleaned copy of ``df[column]`` to ``df``.

    The cleaned text is written to a new column named
    ``'clean_text_' + str(column)``. Cleaning steps, in order:

    1. upper-case the text,
    2. drop whitespace-separated tokens found in the module-level
       ``stopwords`` collection,
    3. remove every run of digits,
    4. remove every ``'.'`` character.

    Args:
        df: DataFrame holding the text; it is modified in place.
        column: Label of the column to clean.

    Returns:
        The same ``df`` object, with the cleaned column added.
    """
    target = 'clean_text_' + str(column)

    # ---- Upper case ---- #
    df[target] = df[column].str.upper()

    # Punctuation removal (str.translate over string.punctuation) is
    # currently disabled; only '.' is stripped, at the end.

    # ---- Remove stop words ---- #
    # NOTE(review): tokens are compared after upper-casing, so this assumes
    # the entries of `stopwords` are upper-case too -- confirm.
    stop_set = set(stopwords)
    df[target] = df[target].apply(
        lambda text: " ".join(
            word for word in str(text).split() if word not in stop_set
        )
    )

    # ---- Remove numeric values ---- #
    # regex=True is explicit: the pandas default for `regex` differs between
    # versions, and with regex=False the pattern would be matched literally.
    df[target] = df[target].str.replace(r'\d+', '', regex=True)

    # ---- Remove the '.' character ---- #
    # regex=False so '.' is a literal dot, never "any character".
    df[target] = df[target].str.replace('.', '', regex=False)
    return df


def input_data_preprocessing(text):
    """Clean a single string with the same steps as ``data_pre_processing``.

    Upper-cases ``text``, drops tokens found in the module-level
    ``stopwords`` collection, removes digit characters and removes ``'.'``.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = text.upper()  # upper case

    # Punctuation removal is currently disabled.

    # Stop-word removal; build the set once for O(1) membership tests.
    stop_set = set(stopwords)
    text = ' '.join(word for word in text.split() if word not in stop_set)

    text = ''.join(ch for ch in text if not ch.isdigit())  # digit removal
    text = text.replace('.', '')
    return text


# ---- 2nd block: context pre-processing (lower-case variant) ---- #

def data_pre_processing_context(df, column):
    """Add a lower-cased, cleaned copy of ``df[column]`` to ``df``.

    The cleaned text is written to a new column named
    ``'clean_text_' + str(column)``. Cleaning steps, in order:

    1. lower-case the text,
    2. replace ``'_'`` and ``'-'`` with a space,
    3. drop whitespace-separated tokens found in the module-level
       ``stopwords`` collection,
    4. remove every run of digits,
    5. remove every ``'.'`` character.

    Args:
        df: DataFrame holding the text; it is modified in place.
        column: Label of the column to clean.

    Returns:
        The same ``df`` object, with the cleaned column added (returned for
        consistency with ``data_pre_processing``).
    """
    target = 'clean_text_' + str(column)

    # ---- Lower case ---- #
    df[target] = df[column].str.lower()

    # ---- Replace '_' and '-' with a space ---- #
    df[target] = df[target].str.replace('_', ' ', regex=False)
    df[target] = df[target].str.replace('-', ' ', regex=False)

    # Punctuation removal (str.translate over string.punctuation) is
    # currently disabled; only '.' is stripped, at the end.

    # ---- Remove stop words ---- #
    # NOTE(review): tokens are compared after lower-casing, so this assumes
    # the entries of `stopwords` are lower-case too -- confirm.
    stop_set = set(stopwords)
    df[target] = df[target].apply(
        lambda text: " ".join(
            word for word in str(text).split() if word not in stop_set
        )
    )

    # ---- Remove numeric values ---- #
    # regex=True is explicit: the pandas default for `regex` differs between
    # versions, and with regex=False the pattern would be matched literally.
    df[target] = df[target].str.replace(r'\d+', '', regex=True)

    # ---- Remove the '.' character ---- #
    # regex=False so '.' is a literal dot, never "any character".
    df[target] = df[target].str.replace('.', '', regex=False)
    return df


def input_data_preprocessing_context(text):
    """Clean a single string with the steps of ``data_pre_processing_context``.

    Lower-cases ``text``, replaces ``'_'`` and ``'-'`` with spaces, drops
    tokens found in the module-level ``stopwords`` collection, removes digit
    characters and removes ``'.'``.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = text.lower()  # lower case
    text = text.replace('_', ' ')
    text = text.replace('-', ' ')

    # Punctuation removal is currently disabled.

    # Stop-word removal; build the set once for O(1) membership tests.
    stop_set = set(stopwords)
    text = ' '.join(word for word in text.split() if word not in stop_set)

    text = ''.join(ch for ch in text if not ch.isdigit())  # digit removal
    text = text.replace('.', '')
    return text