
# Pasted from pastecode.io (mail@pastecode.io avatar), a year ago, 6.3 kB
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pickle
import pandas as pd
import datetime
import dateutil.parser
import pdb

import pandas as pd
from nltk.corpus import stopwords
import string
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from string import digits
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import nltk

from Axis.Code.param_config import config

def drop_null_value_rows(df, drop_cols):
    """Return ``df`` without the rows that have a null in any of ``drop_cols``.

    Args:
        df: pandas DataFrame to filter.
        drop_cols: Column label(s) to check for missing values.

    Returns:
        The result of ``df.dropna`` restricted to ``drop_cols``; ``df`` itself
        is not modified.
    """
    # axis=0 drops rows; how='any' means a single null in drop_cols is enough.
    kept_rows = df.dropna(axis=0, how='any', subset=drop_cols)
    return kept_rows

def data_pre_processing(df, column):
    """Add an upper-cased, cleaned copy of ``df[column]`` to ``df``.

    The cleaned text is stored in a new column named ``clean_text_<column>``.
    Cleaning steps, in order: upper-case, drop stop words, remove runs of
    digits, remove ``.`` characters.  Other punctuation is left in place.

    Args:
        df: pandas DataFrame holding the text column; it is modified in place.
        column: Name of the text column to clean.

    Returns:
        The same ``df`` object, now carrying the ``clean_text_<column>`` column.
    """
    target = 'clean_text_' + str(column)

    # The text is upper-cased but the module-level stop-word list is
    # lower-case, so compare against an upper-cased copy of the list;
    # otherwise no stop word can ever match.
    stop_words_upper = {word.upper() for word in stopwords}

    cleaned = df[column].str.upper()
    # str(text) keeps missing values from crashing split(); they end up as
    # the literal text 'nan', as before.
    cleaned = cleaned.apply(
        lambda text: " ".join(
            word for word in str(text).split() if word not in stop_words_upper
        )
    )
    # `regex` is passed explicitly: pandas changed its default from True to
    # False in 2.0, which would otherwise leave the digits untouched.
    cleaned = cleaned.str.replace(r'\d+', '', regex=True)
    # '.' is meant literally here, not as the regex wildcard.
    cleaned = cleaned.str.replace('.', '', regex=False)

    df[target] = cleaned
    return df

def input_data_preprocessing(text):
    """Clean a single input string: upper-case, drop stop words, digits and dots.

    Steps, in order: upper-case the text, drop whitespace-separated words that
    are stop words, remove every digit character, remove every ``.``.  Other
    punctuation is left in place.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, upper-cased string.
    """
    # The text is upper-cased but the module-level stop-word list is
    # lower-case, so compare against an upper-cased copy of the list;
    # otherwise no stop word can ever match.
    stop_words_upper = {word.upper() for word in stopwords}

    text = text.upper()
    text = ' '.join(word for word in text.split() if word not in stop_words_upper)
    text = ''.join(char for char in text if not char.isdigit())
    return text.replace('.', '')

# Hard-coded list of lower-case English stop words, used by the preprocessing
# functions in this module to filter words out of the text.
# NOTE(review): this assignment rebinds the name `stopwords` that the file also
# imports from nltk.corpus; from here on `stopwords` is this plain list.
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

def data_pre_processing_context(df, column):
    """Add a lower-cased, cleaned copy of ``df[column]`` to ``df``.

    The cleaned text is stored in a new column named ``clean_text_<column>``.
    Cleaning steps, in order: lower-case, turn ``_`` and ``-`` into spaces,
    drop stop words, remove runs of digits, remove ``.`` characters.  Other
    punctuation is left in place.

    Args:
        df: pandas DataFrame holding the text column; it is modified in place.
        column: Name of the text column to clean.

    Returns:
        The same ``df`` object, now carrying the ``clean_text_<column>``
        column.  (Returned so that ``df = data_pre_processing_context(...)``
        works the same way as it does for ``data_pre_processing``.)
    """
    target = 'clean_text_' + str(column)
    stop_words = set(stopwords)

    cleaned = df[column].str.lower()
    # '_' and '-' become spaces first so the parts they joined are treated
    # as separate words by the stop-word filter below.
    cleaned = cleaned.str.replace('_', ' ', regex=False)
    cleaned = cleaned.str.replace('-', ' ', regex=False)
    # str(text) keeps missing values from crashing split(); they end up as
    # the literal text 'nan', as before.
    cleaned = cleaned.apply(
        lambda text: " ".join(
            word for word in str(text).split() if word not in stop_words
        )
    )
    # `regex` is passed explicitly: pandas changed its default from True to
    # False in 2.0, which would otherwise leave the digits untouched.
    cleaned = cleaned.str.replace(r'\d+', '', regex=True)
    # '.' is meant literally here, not as the regex wildcard.
    cleaned = cleaned.str.replace('.', '', regex=False)

    df[target] = cleaned
    return df

def input_data_preprocessing_context(text):
    """Clean a single input string in lower case.

    Steps, in order: lower-case the text, turn ``_`` and ``-`` into spaces,
    drop whitespace-separated words found in the module-level ``stopwords``
    list, remove every digit character, remove every ``.``.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, lower-cased string.
    """
    # Separators become spaces first so the pieces are filtered as words.
    spaced = text.lower().replace('_', ' ').replace('-', ' ')

    kept_words = []
    for token in spaced.split():
        if token in stopwords:
            continue
        kept_words.append(token)
    without_stopwords = ' '.join(kept_words)

    without_digits = ''.join(
        char for char in without_stopwords if not char.isdigit()
    )
    return without_digits.replace('.', '')

def noun_extraction(text):
    """Return the tokens of ``text`` that NLTK tags as nouns.

    The text is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; a token is kept when its tag begins with ``NN``.

    Args:
        text: The string to analyse.

    Returns:
        A list of the matching tokens, in the order they appear.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    return [token for token, tag in tagged_tokens if tag.startswith('NN')]

def unique_word_list(df, column):
    """Return the distinct whitespace-separated words found in ``df[column]``.

    Args:
        df: Container indexable by ``column`` (e.g. a pandas DataFrame).
        column: Key of the column to scan.  Each entry of ``df[column]`` must
            be an iterable of strings, such as a list of words.

    Returns:
        A list of the unique words, in arbitrary (set) order.  Empty when the
        column has no entries.
    """
    unique_words = set()
    for words in df[column]:
        # Collect straight into the set instead of growing one big string
        # (quadratic).  Join-then-split keeps the original tokenisation: an
        # entry that itself contains whitespace still yields several words.
        unique_words.update(' '.join(words).split())
    return list(unique_words)

def dict_eval(lst, embeddings_dict):
    """Build a dict of the items in ``lst`` that have an entry in ``embeddings_dict``.

    NOTE(review): the loop body of this function was missing from the file
    (the ``for`` had no body and ``dicts`` was never defined, which is a
    syntax error).  This body is reconstructed from how
    ``input_noun_keylist_context`` uses the result (it calls ``.keys()`` on
    it) -- confirm against the original implementation, in particular how
    items absent from ``embeddings_dict`` should be handled.

    Args:
        lst: Iterable of keys to look up.
        embeddings_dict: Mapping to look the keys up in.

    Returns:
        A new dict mapping each item of ``lst`` found in ``embeddings_dict``
        to its value there; items not found are skipped.
    """
    dicts = {}
    for i in lst:
        if i in embeddings_dict:
            dicts[i] = embeddings_dict[i]
    return dicts

def input_noun_keylist_context(Input):
    """Preprocess ``Input``, extract its nouns and look them up via ``dict_eval``.

    The text is cleaned with ``input_data_preprocessing``, its nouns are taken
    with ``noun_extraction``, and those nouns are passed to ``dict_eval``
    together with ``embeddings_dict``.

    NOTE(review): ``embeddings_dict`` is read as a module-level name that is
    not defined in the visible part of this file -- confirm it is set before
    this function is called.

    Args:
        Input: The raw input string.

    Returns:
        A ``(keylist, input_dict)`` tuple: the dict returned by ``dict_eval``
        and a list of its keys.
    """
    cleaned_text = input_data_preprocessing(Input)
    extracted_nouns = noun_extraction(cleaned_text)
    input_dict = dict_eval(extracted_nouns, embeddings_dict)
    return list(input_dict.keys()), input_dict