tf*idf Model
unknown
python
3 years ago
859 B
3
Indexable
Never
import re

import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# The stop-word list and the WordNet data must be available locally before
# `nltk.corpus.stopwords` and the lemmatizer are used below.
nltk.download('stopwords')
nltk.download('wordnet')

stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

# Matches every character that is not an ASCII letter.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z]')


def preprocess_text(curr_doc_cont: str) -> str:
    """Normalize one document into a space-separated string of lemmas.

    Steps, in order: replace every non-ASCII-letter character with a space,
    lowercase, split on whitespace, drop tokens found in the module-level
    ``stopwords`` list, lemmatize what remains with the module-level
    ``lemmatizer``, and join the result with single spaces.

    Args:
        curr_doc_cont: Raw text of the document.

    Returns:
        The cleaned text; an empty string if no token survives filtering.
    """
    # Build the lookup set once per call instead of once per token.
    stop_set = set(stopwords)
    letters_only = _NON_LETTER_RE.sub(' ', curr_doc_cont)
    tokens = letters_only.lower().split()
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_set
    )


tf_idf = TfidfVectorizer()

# NOTE: `query_docs` and `review` are not defined in this snippet; they must
# be provided by the surrounding code. Presumably `query_docs` is an iterable
# of document strings and `review` is a single document already passed
# through `preprocess_text` -- confirm against the caller.

# training dataset
# Fit the tf*idf model on the training documents only. `fit_transform` already
# returns the training data represented with the fitted model, so no separate
# `transform(query_docs)` call is needed afterwards.
X_train_tf = tf_idf.fit_transform(query_docs)
# Represent the test data with the tf*idf model learned from the training set.
X_test_tf = tf_idf.transform([review])