Untitled

 avatar
unknown
plain_text
2 years ago
1.5 kB
10
Indexable
import pandas as pd
import pickle
import time
import datetime
from joblib import dump, load
import shutil, os
import pdb
import io


import pandas as pd
#from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#from string import digits
from sklearn.feature_extraction.text import CountVectorizer

import sys

from load_data import *
from feature_engineering import *


def _feature_names(vectorizer):
    """Return the fitted vocabulary terms of *vectorizer* in column order.

    ``get_feature_names`` was deprecated in scikit-learn 1.0 and removed in
    1.2 in favour of ``get_feature_names_out``; prefer the new method and
    fall back to the old one so the code runs on either side of the change.
    """
    getter = getattr(vectorizer, "get_feature_names_out", None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return getter()


def _fit_to_frame(vectorizer, documents):
    """Fit *vectorizer* on *documents* and return the dense matrix as a DataFrame.

    The vectorizer is fitted in place (a single ``fit_transform`` call), and
    the resulting document-term matrix is densified with one column per
    vocabulary term. The frame uses a default integer index.
    """
    sparse_matrix = vectorizer.fit_transform(documents)
    # ``toarray`` yields a plain ndarray; ``todense`` would return the legacy
    # ``numpy.matrix`` type. Values are identical either way.
    return pd.DataFrame(sparse_matrix.toarray(),
                        columns=_feature_names(vectorizer))


def train_data(cleaned_data, column):
    """Fit TF-IDF and raw-count vectorizers on one text column.

    Args:
        cleaned_data: Container indexed as ``cleaned_data[column]`` to obtain
            the documents (presumably a pandas DataFrame of pre-cleaned
            text -- confirm against the caller).
        column: Key of the text column to vectorize.

    Returns:
        A 4-tuple ``(data_train_tfidf, data_train_count, tfidf_vectorizer,
        count_vectorizer)``: the dense TF-IDF and term-count matrices as
        DataFrames (one column per vocabulary term), followed by the two
        fitted vectorizers.

    Note:
        Both vectorizers use scikit-learn's default settings, i.e. no
        stop-word filtering. Earlier revisions briefly constructed
        ``stop_words='english'`` instances but overwrote them before use, so
        the default configuration is the behaviour callers have been getting.
        Densifying the matrices can be memory-hungry for large vocabularies.
    """
    documents = cleaned_data[column]

    tfidf_vectorizer = TfidfVectorizer()
    data_train_tfidf = _fit_to_frame(tfidf_vectorizer, documents)

    count_vectorizer = CountVectorizer()
    data_train_count = _fit_to_frame(count_vectorizer, documents)

    return data_train_tfidf, data_train_count, tfidf_vectorizer, count_vectorizer
Editor is loading...