import datetime
import pickle

import dateutil.parser
import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder

from Axis.Code.param_config import config
def drop_null_value_rows(df, drop_cols):
    """Drop rows that have a null in any of the given columns."""
    df = df.dropna(subset=drop_cols, how='any', axis=0)
    return df
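# A minimal usage sketch (hypothetical column names, not from the pipeline config):
#   df = pd.DataFrame({'narration': ['upi payment', None], 'amount': [100, 200]})
#   df = drop_null_value_rows(df, ['narration'])   # keeps only the first row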
def data_pre_processing(df, column):
    """Clean the text in `column` into a new 'clean_text_<column>' column."""
    # ---- Upper case ---- #
    df['clean_text_' + str(column)] = df[column].str.upper()
    # Punctuation removal was tried and intentionally disabled; punctuation is retained.
    # ---- Remove stop words (case-insensitive, since the text was upper-cased) ---- #
    STOPWORDS = set(stopwords)
    df['clean_text_' + str(column)] = df['clean_text_' + str(column)].apply(
        lambda text: " ".join(word for word in str(text).split() if word.lower() not in STOPWORDS))
    # ---- Remove numeric values ---- #
    df['clean_text_' + str(column)] = df['clean_text_' + str(column)].str.replace(r'\d+', '', regex=True)
    # ---- Remove the '.' character (literal match, not regex) ---- #
    df['clean_text_' + str(column)] = df['clean_text_' + str(column)].str.replace('.', '', regex=False)
    return df
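# Example of the resulting column, using a hypothetical 'narration' column:
#   df = pd.DataFrame({'narration': ['Paid 500 to Mr. Sharma']})
#   df = data_pre_processing(df, 'narration')
#   df['clean_text_narration'][0]  ->  'PAID  MR SHARMA'
# (the double space is left behind where the number was removed)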
def input_data_preprocessing(text):
    """Apply the same cleaning as data_pre_processing to a single string."""
    text = text.upper()  # upper case
    # stop word removal (case-insensitive, since the text was upper-cased)
    text = ' '.join(word for word in text.split() if word.lower() not in stopwords)
    text = ''.join(ch for ch in text if not ch.isdigit())  # digit removal
    text = text.replace('.', '')  # remove the '.' character
    return text
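# A quick sanity check (illustrative input only):
#   input_data_preprocessing('Paid 500 to Mr. Sharma')  ->  'PAID  MR SHARMA'
# The single-string and DataFrame variants must stay in sync so that query text is
# cleaned exactly like the training text.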
# Hard-coded English stop-word list (equivalent to nltk stopwords.words('english')),
# inlined so the module does not require the NLTK stopwords corpus at runtime.
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
def data_pre_processing_context(df, column):
    """Clean the text in `column` for context matching (lower-cased variant)."""
    # ---- Lower case ---- #
    df['clean_text_' + str(column)] = df[column].str.lower()
    # ---- Replace '_' and '-' with spaces ---- #
    df['clean_text_' + str(column)] = df['clean_text_' + str(column)].str.replace('_', ' ', regex=False)
    df['clean_text_' + str(column)] = df['clean_text_' + str(column)].str.replace('-', ' ', regex=False)
    # Punctuation removal was tried and intentionally disabled; punctuation is retained.
    # ---- Remove stop words ---- #
    STOPWORDS = set(stopwords)
    df['clean_text_' + str(column)] = df['clean_text_' + str(column)].apply(
        lambda text: " ".join(word for word in str(text).split() if word not in STOPWORDS))
    # ---- Remove numeric values ---- #
    df['clean_text_' + str(column)] = df['clean_text_' + str(column)].str.replace(r'\d+', '', regex=True)
    # ---- Remove the '.' character (literal match, not regex) ---- #
    df['clean_text_' + str(column)] = df['clean_text_' + str(column)].str.replace('.', '', regex=False)
    return df
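# Example, assuming a hypothetical column of snake/kebab-case identifiers:
#   df = pd.DataFrame({'table_name': ['Acct-Holder_Details']})
#   df = data_pre_processing_context(df, 'table_name')
#   df['clean_text_table_name'][0]  ->  'acct holder details'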
def input_data_preprocessing_context(text):
    """Apply the same cleaning as data_pre_processing_context to a single string."""
    text = text.lower()  # lower case
    text = text.replace('_', ' ')
    text = text.replace('-', ' ')
    # stop word removal
    text = ' '.join(word for word in text.split() if word not in stopwords)
    text = ''.join(ch for ch in text if not ch.isdigit())  # digit removal
    text = text.replace('.', '')  # remove the '.' character
    return text
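# Sanity check (illustrative input only):
#   input_data_preprocessing_context('Acct-Holder_Details')  ->  'acct holder details'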
def noun_extraction(text):
    """Return the nouns in `text` (POS tags starting with 'NN': NN, NNS, NNP, NNPS).

    Requires the NLTK 'punkt' tokenizer and 'averaged_perceptron_tagger' models.
    """
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = nltk.word_tokenize(text)
    nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)]
    return nouns
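# Example (after nltk.download('punkt') and nltk.download('averaged_perceptron_tagger')):
#   noun_extraction('customer paid the invoice amount')
#   typically yields ['customer', 'invoice', 'amount'], though tagger output can vary.
# Note: the pipeline feeds this upper-cased text, which the tagger tends to label as
# proper nouns (NNP); those still pass the 'NN' prefix check above.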
def unique_word_list(df, column):
    """Return the unique words across a column whose cells are lists of tokens."""
    unique_words = set()
    for tokens in df[column]:
        unique_words.update(' '.join(tokens).split())
    return list(unique_words)
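# Example, assuming a hypothetical column of token lists (e.g. noun_extraction output):
#   df = pd.DataFrame({'nouns': [['loan', 'account'], ['account', 'balance']]})
#   sorted(unique_word_list(df, 'nouns'))  ->  ['account', 'balance', 'loan']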
def dict_eval(lst, embeddings_dict):
    """Map each word in `lst` to its embedding vector, skipping out-of-vocabulary words."""
    dicts = {}
    for word in lst:
        try:
            dicts[word] = embeddings_dict[word].tolist()
        except KeyError:
            continue  # skip missing words instead of discarding everything collected so far
    return dicts
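# Example with a toy embeddings dict (the real one is loaded elsewhere, e.g. from GloVe):
#   emb = {'loan': np.array([0.1, 0.2])}
#   dict_eval(['loan', 'unknownword'], emb)  ->  {'loan': [0.1, 0.2]}
# Out-of-vocabulary words are silently skipped.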
def input_noun_keylist_context(Input):
    """Clean the input, extract its nouns, and look them up in the embeddings.

    Relies on a module-level `embeddings_dict` (word -> vector) being loaded elsewhere.
    """
    Input = input_data_preprocessing(Input)
    nouns = noun_extraction(Input)
    input_dict = dict_eval(nouns, embeddings_dict)
    keylist = list(input_dict.keys())
    return keylist, input_dict
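# A minimal end-to-end sketch, assuming `embeddings_dict` has been loaded at module
# level before this is called; the pickle path / config key below are hypothetical:
#   with open(config['embeddings_path'], 'rb') as f:
#       embeddings_dict = pickle.load(f)   # dict of word -> np.ndarray
#   keylist, input_dict = input_noun_keylist_context('Paid 500 to Mr. Sharma for loan')
#   keylist holds the cleaned (upper-cased) nouns that have embeddings, so the
#   embeddings dict must be keyed consistently with the upper-cased tokens.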