Untitled

mail@pastecode.io avatarunknown
plain_text
a month ago
3.1 kB
1
Indexable
Never
def data_pre_processing(df, column, *, stop_words=None):
    """Add an upper-cased, cleaned copy of ``df[column]`` to ``df``.

    The result is stored in a new column named ``clean_text_<column>``.
    Cleaning steps, in order:

    1. upper-case the text;
    2. drop whitespace-separated tokens found in the stop-word set;
    3. delete every run of digits;
    4. delete every ``.`` character.

    Other punctuation is left untouched.

    Args:
        df: pandas DataFrame; it is modified in place.
        column: label of the text column to clean.
        stop_words: optional iterable of stop words. When omitted, the
            module-level ``stopwords`` is used. Tokens are compared after
            upper-casing, so entries must be upper-case to match.

    Returns:
        The same ``df`` object, with the cleaned column added.
    """
    target = 'clean_text_' + str(column)
    stop_set = set(stopwords if stop_words is None else stop_words)

    cleaned = df[column].str.upper()
    # str() keeps the split/join from failing on non-string cells
    # (a NaN cell becomes the text "nan").
    cleaned = cleaned.apply(
        lambda text: " ".join(
            word for word in str(text).split() if word not in stop_set
        )
    )
    # The regex flag is spelled out because the pandas default changed to
    # False in 2.0; without it '\d+' is matched literally and digits survive.
    cleaned = cleaned.str.replace(r'\d+', '', regex=True)
    # '.' must be a literal dot, never the regex wildcard.
    cleaned = cleaned.str.replace('.', '', regex=False)

    df[target] = cleaned
    return df

def input_data_preprocessing(text, *, stop_words=None):
    """Return an upper-cased, cleaned version of a single string.

    Cleaning steps, in order:

    1. upper-case the text;
    2. drop whitespace-separated tokens found in the stop-word set and
       re-join the rest with single spaces;
    3. delete every digit character;
    4. delete every ``.`` character.

    Digits are removed after the tokens are re-joined, so a token made only
    of digits leaves the spaces around it in place.

    Args:
        text: the string to clean.
        stop_words: optional iterable of stop words. When omitted, the
            module-level ``stopwords`` is used. Tokens are compared after
            upper-casing, so entries must be upper-case to match.

    Returns:
        The cleaned string.
    """
    source = stopwords if stop_words is None else stop_words
    # Hash-based lookup per token; an existing set is reused rather than copied.
    stop_set = source if isinstance(source, (set, frozenset)) else set(source)

    kept = [word for word in text.upper().split() if word not in stop_set]
    without_digits = ''.join(ch for ch in ' '.join(kept) if not ch.isdigit())
    return without_digits.replace('.', '')


# ---- 2ND BLOCK ----

def data_pre_processing_context(df, column, *, stop_words=None):
    """Add a lower-cased, cleaned copy of ``df[column]`` to ``df``.

    The result is stored in a new column named ``clean_text_<column>``.
    Cleaning steps, in order:

    1. lower-case the text;
    2. replace ``_`` and ``-`` with a space;
    3. drop whitespace-separated tokens found in the stop-word set;
    4. delete every run of digits;
    5. delete every ``.`` character.

    Other punctuation is left untouched.

    Args:
        df: pandas DataFrame; it is modified in place.
        column: label of the text column to clean.
        stop_words: optional iterable of stop words. When omitted, the
            module-level ``stopwords`` is used. Tokens are compared after
            lower-casing, so entries must be lower-case to match.

    Returns:
        The same ``df`` object, with the cleaned column added.
    """
    target = 'clean_text_' + str(column)
    stop_set = set(stopwords if stop_words is None else stop_words)

    cleaned = df[column].str.lower()
    # Separators become spaces first so the joined parts are split into
    # individual tokens for the stop-word filter below.
    cleaned = cleaned.str.replace('_', ' ', regex=False)
    cleaned = cleaned.str.replace('-', ' ', regex=False)
    # str() keeps the split/join from failing on non-string cells
    # (a NaN cell becomes the text "nan").
    cleaned = cleaned.apply(
        lambda text: " ".join(
            word for word in str(text).split() if word not in stop_set
        )
    )
    # The regex flag is spelled out because the pandas default changed to
    # False in 2.0; without it '\d+' is matched literally and digits survive.
    cleaned = cleaned.str.replace(r'\d+', '', regex=True)
    # '.' must be a literal dot, never the regex wildcard.
    cleaned = cleaned.str.replace('.', '', regex=False)

    df[target] = cleaned
    # Return the frame so the function can be used like data_pre_processing.
    return df




def input_data_preprocessing_context(text, *, stop_words=None):
    """Return a lower-cased, cleaned version of a single string.

    Cleaning steps, in order:

    1. lower-case the text;
    2. replace ``_`` and ``-`` with a space;
    3. drop whitespace-separated tokens found in the stop-word set and
       re-join the rest with single spaces;
    4. delete every digit character;
    5. delete every ``.`` character.

    Digits are removed after the tokens are re-joined, so a token made only
    of digits leaves the spaces around it in place.

    Args:
        text: the string to clean.
        stop_words: optional iterable of stop words. When omitted, the
            module-level ``stopwords`` is used. Tokens are compared after
            lower-casing, so entries must be lower-case to match.

    Returns:
        The cleaned string.
    """
    source = stopwords if stop_words is None else stop_words
    # Hash-based lookup per token; an existing set is reused rather than copied.
    stop_set = source if isinstance(source, (set, frozenset)) else set(source)

    spaced = text.lower().replace('_', ' ').replace('-', ' ')
    kept = [word for word in spaced.split() if word not in stop_set]
    without_digits = ''.join(ch for ch in ' '.join(kept) if not ch.isdigit())
    return without_digits.replace('.', '')