Untitled

mail@pastecode.io avatar
unknown
plain_text
a year ago
1.5 kB
2
Indexable
Never
import pandas as pd
import pickle
import time
import datetime
from joblib import dump, load
import shutil, os
import pdb
import io


import pandas as pd
#from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#from string import digits
from sklearn.feature_extraction.text import CountVectorizer

import sys

from load_data import *
from feature_engineering import *


def _feature_names(vectorizer):
    """Return the fitted vocabulary terms of *vectorizer* in column order."""
    # scikit-learn 1.0 added get_feature_names_out() and 1.2 removed the old
    # get_feature_names(); prefer the new name and fall back for old installs.
    getter = getattr(vectorizer, "get_feature_names_out", None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return getter()


def _fit_to_frame(vectorizer, documents):
    """Fit *vectorizer* on *documents* and return the dense term matrix as a DataFrame."""
    # fit_transform() both learns the vocabulary and encodes the documents, so
    # a separate fit() call beforehand would only repeat the same work.
    term_matrix = vectorizer.fit_transform(documents)
    # toarray() yields a plain ndarray (todense() yields the legacy np.matrix).
    return pd.DataFrame(term_matrix.toarray(), columns=_feature_names(vectorizer))


def train_data(cleaned_data, column, *, stop_words=None):
    """Fit TF-IDF and raw-count vectorizers on one text column.

    Args:
        cleaned_data: Object indexable by ``column`` that yields the text
            documents to vectorize (e.g. a pandas DataFrame).
        column: Key of the text column inside ``cleaned_data``.
        stop_words: Forwarded to both vectorizers' ``stop_words`` argument.
            Defaults to ``None`` (no stop-word filtering), which matches the
            previous effective behaviour.
            NOTE(review): the earlier code built ``stop_words='english'``
            vectorizers and immediately replaced them with default ones; if
            English stop-word removal was intended, pass
            ``stop_words='english'`` -- confirm with the callers.

    Returns:
        A 4-tuple ``(data_train_tfidf, data_train_count, tfidf_vectorizer,
        count_vectorizer)``: the two document-term DataFrames (one row per
        document in input order with a default integer index, one column per
        vocabulary term) followed by the two fitted vectorizers.
    """
    documents = cleaned_data[column]

    tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words)
    count_vectorizer = CountVectorizer(stop_words=stop_words)

    data_train_tfidf = _fit_to_frame(tfidf_vectorizer, documents)
    data_train_count = _fit_to_frame(count_vectorizer, documents)

    return data_train_tfidf, data_train_count, tfidf_vectorizer, count_vectorizer