Untitled
unknown
plain_text
a year ago
1.5 kB
2
Indexable
Never
import datetime
import io
import os
import pdb
import pickle
import shutil
import sys
import time

import pandas as pd
from joblib import dump, load
#from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#from string import digits

# Project-local star imports are kept last and in their original order so
# that any names they (re)define still take precedence exactly as before.
from load_data import *
from feature_engineering import *


def _feature_names(vectorizer):
    """Return the fitted vocabulary terms of *vectorizer* in column order.

    ``get_feature_names_out`` is preferred when the vectorizer provides it;
    ``get_feature_names`` is the fallback for older scikit-learn releases
    that predate the newer accessor.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


def _vectorize_to_frame(vectorizer, documents):
    """Fit *vectorizer* on *documents* and return the dense matrix as a DataFrame.

    The vectorizer is fitted in place; the returned frame has one row per
    document and one column per vocabulary term.
    """
    # fit_transform both learns the vocabulary and encodes the documents, so
    # a separate fit() call beforehand would only repeat the same work.
    sparse_matrix = vectorizer.fit_transform(documents)
    # toarray() yields a plain ndarray rather than the legacy numpy.matrix
    # returned by todense(); the resulting DataFrame contents are the same.
    return pd.DataFrame(
        sparse_matrix.toarray(),
        columns=_feature_names(vectorizer),
    )


def train_data(cleaned_data, column):
    """Build TF-IDF and raw-count document-term tables for one text column.

    Both vectorizers are created with their default settings (no stop-word
    list is passed), fitted on ``cleaned_data[column]``, and returned along
    with the dense document-term tables they produce.

    Args:
        cleaned_data: Table-like object supporting ``cleaned_data[column]``
            (presumably a pandas DataFrame of pre-cleaned text -- confirm
            against the caller).
        column: Key of the column holding the text documents.

    Returns:
        A 4-tuple ``(data_train_tfidf, data_train_count, tfidf_vectorizer,
        count_vectorizer)`` where the first two items are DataFrames with one
        row per document and one column per vocabulary term, and the last two
        are the fitted vectorizers.
    """
    documents = cleaned_data[column]

    tfidf_vectorizer = TfidfVectorizer()
    data_train_tfidf = _vectorize_to_frame(tfidf_vectorizer, documents)

    count_vectorizer = CountVectorizer()
    data_train_count = _vectorize_to_frame(count_vectorizer, documents)

    return data_train_tfidf, data_train_count, tfidf_vectorizer, count_vectorizer