Untitled
unknown
plain_text
2 years ago
3.1 kB
15
Indexable
def data_pre_processing(df, column, stop_words=None):
    """Add an upper-cased, cleaned copy of ``df[column]`` to ``df``.

    The result is written to ``df['clean_text_<column>']``; ``df`` is
    modified in place and also returned.

    Cleaning steps, in order:
      1. upper-case the text,
      2. drop whitespace-separated tokens found in the stop-word collection,
      3. remove every run of digits,
      4. remove every '.' character.

    Args:
        df: pandas DataFrame that holds the text column.
        column: name of the column to clean.
        stop_words: optional iterable of tokens to drop. When omitted, the
            module-level ``stopwords`` collection is used.

    Returns:
        The same ``df`` object, with the cleaned column added.
    """
    target = 'clean_text_' + str(column)

    # NOTE(review): matching is case-sensitive and runs after upper-casing,
    # so only upper-case stop words can match -- confirm the module-level
    # ``stopwords`` collection is upper case.
    stop_set = frozenset(stopwords if stop_words is None else stop_words)

    def _drop_stop_words(text):
        # str() keeps the original behaviour for non-string cells (e.g. NaN),
        # which are stringified rather than raising.
        return " ".join(
            word for word in str(text).split() if word not in stop_set
        )

    cleaned = df[column].str.upper().apply(_drop_stop_words)

    # regex must be explicit: pandas >= 2.0 treats the pattern as a literal
    # by default, which silently left all digits in place.
    cleaned = cleaned.str.replace(r'\d+', '', regex=True)

    # Literal dot, not the regex wildcard. General punctuation stripping was
    # disabled in the original code; only '.' is removed.
    cleaned = cleaned.str.replace('.', '', regex=False)

    df[target] = cleaned
    return df
def input_data_preprocessing(text, stop_words=None):
    """Clean a single string the same way ``data_pre_processing`` cleans a column.

    Steps, in order: upper-case, drop whitespace-separated tokens found in
    the stop-word collection, then remove every digit and every '.'
    character. Whitespace left behind by removed digits is not collapsed.

    Args:
        text: the string to clean.
        stop_words: optional iterable of tokens to drop. When omitted, the
            module-level ``stopwords`` collection is used.

    Returns:
        The cleaned string.
    """
    # Build the lookup once so each token test is a constant-time set probe.
    # NOTE(review): matching is case-sensitive and runs after upper-casing,
    # so only upper-case stop words can match -- confirm the module-level
    # ``stopwords`` collection is upper case.
    stop_set = frozenset(stopwords if stop_words is None else stop_words)

    kept = ' '.join(
        word for word in text.upper().split() if word not in stop_set
    )

    # Single pass that drops digits and dots together.
    return ''.join(ch for ch in kept if not ch.isdigit() and ch != '.')
# ---- 2nd block: lower-case "context" variants of the preprocessing helpers ----
def data_pre_processing_context(df, column, stop_words=None):
    """Add a lower-cased, cleaned copy of ``df[column]`` to ``df``.

    The result is written to ``df['clean_text_<column>']``; ``df`` is
    modified in place and also returned.

    Cleaning steps, in order:
      1. lower-case the text,
      2. replace '_' and '-' with a space,
      3. drop whitespace-separated tokens found in the stop-word collection,
      4. remove every run of digits,
      5. remove every '.' character.

    Args:
        df: pandas DataFrame that holds the text column.
        column: name of the column to clean.
        stop_words: optional iterable of tokens to drop. When omitted, the
            module-level ``stopwords`` collection is used.

    Returns:
        The same ``df`` object, with the cleaned column added.
    """
    target = 'clean_text_' + str(column)

    # NOTE(review): matching is case-sensitive and runs after lower-casing,
    # so only lower-case stop words can match -- confirm the module-level
    # ``stopwords`` collection is lower case.
    stop_set = frozenset(stopwords if stop_words is None else stop_words)

    def _drop_stop_words(text):
        # str() keeps the original behaviour for non-string cells (e.g. NaN),
        # which are stringified rather than raising.
        return " ".join(
            word for word in str(text).split() if word not in stop_set
        )

    cleaned = df[column].str.lower()

    # Separators become spaces *before* tokenising so that the pieces of
    # "snake_case" / "hyphen-ated" words are checked against the stop words.
    cleaned = cleaned.str.replace('_', ' ', regex=False)
    cleaned = cleaned.str.replace('-', ' ', regex=False)

    cleaned = cleaned.apply(_drop_stop_words)

    # regex must be explicit: pandas >= 2.0 treats the pattern as a literal
    # by default, which silently left all digits in place.
    cleaned = cleaned.str.replace(r'\d+', '', regex=True)

    # Literal dot, not the regex wildcard. General punctuation stripping was
    # disabled in the original code; only '.' is removed.
    cleaned = cleaned.str.replace('.', '', regex=False)

    df[target] = cleaned
    # Returned for parity with data_pre_processing; the original fell off the
    # end and returned None.
    return df
def input_data_preprocessing_context(text, stop_words=None):
    """Clean a single string the same way ``data_pre_processing_context`` cleans a column.

    Steps, in order: lower-case, replace '_' and '-' with a space, drop
    whitespace-separated tokens found in the stop-word collection, then
    remove every digit and every '.' character. Whitespace left behind by
    removed digits is not collapsed.

    Args:
        text: the string to clean.
        stop_words: optional iterable of tokens to drop. When omitted, the
            module-level ``stopwords`` collection is used.

    Returns:
        The cleaned string.
    """
    # Build the lookup once so each token test is a constant-time set probe.
    # NOTE(review): matching is case-sensitive and runs after lower-casing,
    # so only lower-case stop words can match -- confirm the module-level
    # ``stopwords`` collection is lower case.
    stop_set = frozenset(stopwords if stop_words is None else stop_words)

    # Separators become spaces before tokenising so the pieces of
    # "snake_case" / "hyphen-ated" words are checked individually.
    lowered = text.lower().translate(str.maketrans('_-', '  '))

    kept = ' '.join(word for word in lowered.split() if word not in stop_set)

    # Single pass that drops digits and dots together.
    return ''.join(ch for ch in kept if not ch.isdigit() and ch != '.')
Editor is loading...