Untitled
unknown
plain_text
a year ago
6.3 kB
2
Indexable
Never
import datetime
import pickle
import pdb
import string
from string import digits

import dateutil.parser
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords  # shadowed below by the hard-coded module-level list
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder

from Axis.Code.param_config import config


def drop_null_value_rows(df, drop_cols):
    """Drop every row where any of *drop_cols* is null.

    Parameters
    ----------
    df : pandas.DataFrame
    drop_cols : list of column names that must all be non-null.

    Returns
    -------
    pandas.DataFrame with the offending rows removed.
    """
    df = df.dropna(subset=drop_cols, how='any', axis=0)
    return df


def data_pre_processing(df, column):
    """Add a cleaned copy of *column* as 'clean_text_<column>'.

    Cleaning steps: upper-case the text, drop stop words, strip digits.
    Returns the (mutated) frame.
    """
    out_col = 'clean_text_' + str(column)
    # ---- Upper case ------#
    df[out_col] = df[column].str.upper()
    # ------ Remove stop words ----#
    # NOTE(review): the text was just upper-cased but the stopword list is
    # lower case, so this filter matches nothing in practice — confirm intended.
    STOPWORDS = set(stopwords)
    df[out_col] = df[out_col].apply(
        lambda text: " ".join(word for word in str(text).split() if word not in STOPWORDS)
    )
    # ------- Remove numeric values ----- #
    # raw string + regex=True: pandas >= 2.0 treats str.replace patterns as
    # literal by default, which silently broke the digit removal.
    df[out_col] = df[out_col].str.replace(r'\d+', '', regex=True)
    return df


def input_data_preprocessing(text):
    """Clean one query string the same way data_pre_processing cleans a column.

    Upper-cases, removes stop words, strips digits and '.' characters.
    """
    text = text.upper()  # upper case
    # Build the set once: O(1) membership per token instead of scanning the list.
    # NOTE(review): stopwords are lower case while the text is upper case — confirm intended.
    stop = set(stopwords)
    text = ' '.join(word for word in text.split() if word not in stop)  # stopword removal
    text = ''.join(ch for ch in text if not ch.isdigit())  # digit removal
    text = text.replace('.', '')
    return text


# Hard-coded copy of NLTK's English stopword list (avoids a corpus download at runtime).
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
def data_pre_processing_context(df, column):
    """Add a cleaned copy of *column* as 'clean_text_<column>' (context variant).

    Cleaning steps: lower-case the text, turn '_'/'-' into spaces, drop stop
    words, strip digits.  Returns the (mutated) frame.
    """
    out_col = 'clean_text_' + str(column)
    # ---- Lower case ------#
    df[out_col] = df[column].str.lower()
    # --- Replace '_', '-' with spaces ------#
    df[out_col] = df[out_col].str.replace('_', ' ')
    df[out_col] = df[out_col].str.replace('-', ' ')
    # ------ Remove stop words ----#
    STOPWORDS = set(stopwords)
    df[out_col] = df[out_col].apply(
        lambda text: " ".join(word for word in str(text).split() if word not in STOPWORDS)
    )
    # ------- Remove numeric values ----- #
    # raw string + regex=True: pandas >= 2.0 treats str.replace patterns as literal by default.
    df[out_col] = df[out_col].str.replace(r'\d+', '', regex=True)
    # BUG FIX: the original fell off the end and returned None; every other
    # *_pre_processing helper in this module returns the mutated frame.
    return df


def input_data_preprocessing_context(text):
    """Clean one query string for context matching (lower-case variant)."""
    text = text.lower()  # lower case
    text = text.replace('_', ' ')
    text = text.replace('-', ' ')
    stop = set(stopwords)  # set membership: O(1) per token instead of a list scan
    text = ' '.join(word for word in text.split() if word not in stop)  # stopword removal
    text = ''.join(ch for ch in text if not ch.isdigit())  # digit removal
    text = text.replace('.', '')
    return text


def noun_extraction(text):
    """Return the tokens of *text* whose POS tag starts with 'NN' (nouns)."""
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = nltk.word_tokenize(text)
    nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)]
    return nouns


def unique_word_list(df, column):
    """Return the unique words across *column*, where each cell is an iterable of words.

    Rewritten to accumulate into a set directly instead of building one giant
    string with quadratic '+' concatenation and splitting it at the end.
    """
    words = set()
    for cell in df[column]:
        words.update(' '.join(cell).split())
    return list(words)


def dict_eval(lst, embeddings_dict):
    """Map each word in *lst* to its embedding vector as a plain list.

    Words with no entry in *embeddings_dict* are skipped.  BUG FIX: the
    original's bare `except: dicts = {}` reset the whole accumulator on the
    first missing word, silently discarding every embedding gathered before it.
    """
    dicts = {}
    for word in lst:
        try:
            dicts[word] = embeddings_dict[word].tolist()
        except KeyError:  # no embedding for this word — skip it, keep the rest
            continue
    return dicts


def input_noun_keylist_context(Input):
    """Clean *Input*, extract its nouns, and look them up in the module-level
    embeddings_dict (assumed loaded elsewhere in the package — TODO confirm)."""
    Input = input_data_preprocessing(Input)
    nouns = noun_extraction(Input)
    input_dict = dict_eval(nouns, embeddings_dict)
    keylist = list(input_dict.keys())
    return keylist, input_dict