# tf*idf Model
#
# Source: pastecode.io paste (author unknown, Python, 859 B).
import re

import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Download the NLTK resources this script names: 'stopwords' and 'wordnet'.
for _resource in ('stopwords', 'wordnet'):
    nltk.download(_resource)

def preprocess_text(curr_doc_cont):
    """Normalise a raw document into a space-joined string of lemmatised tokens.

    Every character outside ``a-z``/``A-Z`` is replaced by a space, the text
    is lower-cased and split on whitespace, tokens present in the module-level
    ``stopwords`` collection are dropped, and each remaining token is passed
    through the module-level ``lemmatizer`` before being re-joined with single
    spaces.

    Args:
        curr_doc_cont: Raw text of one document.

    Returns:
        The cleaned document as a single string (empty if no token survives).
    """
    # Build the lookup set once per call instead of once per token.
    stop_set = set(stopwords)
    letters_only = re.sub(r'[^a-zA-Z]', ' ', curr_doc_cont)
    tokens = letters_only.lower().split()
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_set
    )
    
 
  
# Stop-word list and lemmatizer that preprocess_text reads as module globals.
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

tf_idf = TfidfVectorizer()
# Training dataset: fit_transform generates the tf*idf model from the training
# data only and returns that data's tf*idf representation in the same call, so
# a second transform(query_docs) pass is not needed.
# NOTE(review): query_docs is not defined in the visible code; presumably a
# collection of (preprocessed) training documents. Confirm.
X_train_tf = tf_idf.fit_transform(query_docs)

# Test data: represent it with the tf*idf model fitted above.
# NOTE(review): `review` is only a local variable of preprocess_text in the
# visible code; presumably a preprocessed test document is expected here.
# Confirm.
X_test_tf = tf_idf.transform([review])