# tf*idf Model
# unknown
# python
# 4 years ago
# 859 B
# 8
# Indexable
import re

import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Fetch the NLTK corpora this script relies on: the stop-word lists and the
# WordNet data used by the lemmatizer.
for _nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(_nltk_resource)
def preprocess_text(curr_doc_cont):
    """Normalize a raw document into a space-separated string of lemmas.

    Every character outside ``a-z``/``A-Z`` is replaced by a space, the text
    is lower-cased and split on whitespace, words found in the module-level
    ``stopwords`` collection are dropped, and each remaining word is passed
    through the module-level ``lemmatizer``.

    Args:
        curr_doc_cont: Raw document text.

    Returns:
        The surviving lemmatized words joined by single spaces (an empty
        string when no word survives).
    """
    # Build the lookup set once per call rather than once per word, so each
    # membership test below is a constant-time set lookup.
    stop_words = set(stopwords)

    letters_only = re.sub(r'[^a-zA-Z]', ' ', curr_doc_cont)
    words = letters_only.lower().split()
    kept = [
        lemmatizer.lemmatize(word)
        for word in words
        if word not in stop_words
    ]
    return ' '.join(kept)
# Resources shared by preprocess_text: the English stop-word list and a
# WordNet lemmatizer instance.
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

tf_idf = TfidfVectorizer()

# Training dataset: the tf*idf model is fitted only on the training documents.
# fit_transform also returns those documents represented with the fitted
# model, so a separate transform(query_docs) pass is not needed.
# NOTE(review): query_docs is not defined in this snippet - presumably the
# collection of training documents; confirm it exists before this point.
X_train_tf = tf_idf.fit_transform(query_docs)

# Test data: represent it with the tf*idf model fitted on the training set.
# NOTE(review): `review` is only a local variable inside preprocess_text in
# this snippet; presumably this should be the preprocessed test document
# (e.g. the result of preprocess_text on the raw test text) - confirm.
X_test_tf = tf_idf.transform([review])