import pandas as pd
import pickle
import time
import datetime
from joblib import dump, load
import shutil, os
import pdb
import io
import pandas as pd
#from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#from string import digits
from sklearn.feature_extraction.text import CountVectorizer
import sys
from load_data import *
from feature_engineering import *
def _fit_to_frame(vectorizer, documents):
    """Fit *vectorizer* on *documents* and return the dense document-term frame.

    The vectorizer is fitted in place by a single ``fit_transform`` call.
    Columns of the returned DataFrame are the vectorizer's vocabulary terms.
    """
    sparse_matrix = vectorizer.fit_transform(documents)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    # toarray() yields a plain ndarray instead of the discouraged np.matrix
    # returned by todense(); the resulting DataFrame values are the same.
    return pd.DataFrame(sparse_matrix.toarray(), columns=feature_names)


def train_data(cleaned_data, column):
    """Build TF-IDF and raw-count document-term matrices for one text column.

    Args:
        cleaned_data: Object indexable by ``column`` that yields an iterable
            of text documents (presumably a pandas DataFrame -- confirm
            against the caller).
        column: Key of the text column to vectorize.

    Returns:
        A 4-tuple ``(data_train_tfidf, data_train_count, tfidf_vectorizer,
        count_vectorizer)``: the two dense document-term DataFrames followed
        by the fitted vectorizers that produced them.
    """
    documents = cleaned_data[column]

    # NOTE: the original code first built each vectorizer with
    # stop_words='english' and then immediately replaced it with a default
    # one, so no stop-word filtering was ever applied. That effective
    # behaviour is kept here; pass stop_words='english' if filtering is
    # actually wanted.
    tfidf_vectorizer = TfidfVectorizer()
    count_vectorizer = CountVectorizer()

    data_train_tfidf = _fit_to_frame(tfidf_vectorizer, documents)
    data_train_count = _fit_to_frame(count_vectorizer, documents)

    return data_train_tfidf, data_train_count, tfidf_vectorizer, count_vectorizer