Untitled

mail@pastecode.io avatarunknown
plain_text
a month ago
978 B
1
Indexable
Never
def _vectorize_to_frame(vectorizer, documents):
    """Fit ``vectorizer`` on ``documents`` and return the dense document-term DataFrame."""
    # fit_transform() learns the vocabulary and transforms in one pass, so a
    # separate fit() call beforehand would only repeat the work.
    sparse_matrix = vectorizer.fit_transform(documents)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on older releases that lack it.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names

    # toarray() yields a plain ndarray instead of the legacy np.matrix that
    # todense() returns; the resulting DataFrame values are the same.
    return pd.DataFrame(sparse_matrix.toarray(), columns=get_names())


def train_data(cleaned_data, column, *, stop_words=None):
    """Build TF-IDF and raw-count document-term tables for one text column.

    Two independent vectorizers are fitted on ``cleaned_data[column]``: a
    ``TfidfVectorizer`` and a ``CountVectorizer``. Each resulting sparse
    matrix is converted to a dense ``DataFrame`` with one column per
    vocabulary term and a default integer row index.

    Args:
        cleaned_data: Container indexable by ``column`` that yields an
            iterable of text documents (presumably a pandas DataFrame --
            confirm against callers).
        column: Key of the text column to vectorize.
        stop_words: Forwarded to both vectorizers. The default ``None``
            keeps every token, matching the previous effective behaviour;
            pass ``'english'`` to use scikit-learn's built-in list.

    Returns:
        A 4-tuple ``(data_train_tfidf, data_train_count, tfidf_vectorizer,
        count_vectorizer)``: the two dense DataFrames followed by the two
        fitted vectorizers.

    Note:
        The matrices are fully densified, so memory use grows with
        ``n_documents * vocabulary_size``.
    """
    documents = cleaned_data[column]

    tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words)
    data_train_tfidf = _vectorize_to_frame(tfidf_vectorizer, documents)

    count_vectorizer = CountVectorizer(stop_words=stop_words)
    data_train_count = _vectorize_to_frame(count_vectorizer, documents)

    return data_train_tfidf, data_train_count, tfidf_vectorizer, count_vectorizer