Untitled
unknown
plain_text
2 years ago
1.3 kB
9
Indexable
def data_pre_processing(df, column):
    """Add a cleaned, upper-cased copy of a text column to ``df``.

    The result is stored in a new column named
    ``'clean_text_' + str(column)``. Cleaning steps, in order:

    1. upper-case the text,
    2. drop whitespace-separated tokens that are stopwords,
    3. strip runs of digits,
    4. strip ``'.'`` characters.

    Stopwords are filtered before digits and dots are stripped, so a token
    such as ``"THE."`` is not treated as the stopword ``"THE"``.

    Args:
        df: pandas DataFrame holding the text; it is modified in place.
        column: label of the column to clean.

    Returns:
        The same ``df`` object, now carrying the cleaned column.
    """
    target = 'clean_text_' + str(column)

    # ---- Upper case ----
    df[target] = df[column].str.upper()

    # ---- Remove stop words ----
    # ``stopwords`` is a module-level iterable defined elsewhere in this file.
    # The text was just upper-cased, so the stopwords are upper-cased as well;
    # otherwise lower-case entries could never match.
    stop_set = {str(word).upper() for word in stopwords}
    # str() keeps the lambda from failing on non-string cells (e.g. missing
    # values), which end up stringified.
    df[target] = df[target].apply(
        lambda text: " ".join(
            token for token in str(text).split() if token not in stop_set
        )
    )

    # ---- Remove numeric values ----
    # regex=True is explicit: pandas >= 2.0 defaults to literal matching,
    # which would leave every digit in place.
    df[target] = df[target].str.replace(r'\d+', '', regex=True)

    # ---- Remove the '.' character ----
    # regex=False so '.' is a literal dot, never the regex wildcard.
    df[target] = df[target].str.replace('.', '', regex=False)

    return df
def input_data_preprocessing(text):
    """Clean a single string with the same steps as ``data_pre_processing``.

    Steps, in order: upper-case, drop whitespace-separated stopword tokens,
    remove every digit character, remove every ``'.'`` character.

    Args:
        text: the raw input string.

    Returns:
        The cleaned string. Whitespace left behind by removed digits is not
        collapsed.
    """
    # ``stopwords`` is a module-level iterable defined elsewhere in this file.
    # Build the lookup set once, upper-cased to match the upper-cased text.
    stop_set = {str(word).upper() for word in stopwords}

    text = text.upper()
    # Stopword removal (split/join also normalises runs of whitespace).
    text = ' '.join(word for word in text.split() if word not in stop_set)
    # Digit removal.
    text = ''.join(ch for ch in text if not ch.isdigit())
    # Drop literal '.' characters.
    return text.replace('.', '')