"""Train review-sentiment classifiers and pickle the selected model.

Pipeline, in order:

1. Load the first 30000 rows of ``reviews.csv``.
2. Map the numeric ``Rating`` column to a coarse ``Desc`` label.
3. Normalise the ``Review`` text (lowercase, strip symbols, drop stop
   words, stem).
4. Fit RandomForest and MultinomialNB on word-count and character
   TF-IDF features and compare their test accuracy.
5. Pickle the count-based MultinomialNB, its CountVectorizer and the
   LabelEncoder to ``model.pkl``, ``transformer.pkl`` and ``le.pkl``.
"""
import pickle
import re

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from wordcloud import WordCloud


def transform_ratings(rating):
    """Map a numeric rating to a sentiment label.

    Returns "Good" for 4 or 5, "Neutral" for 3, "Bad" for 1 or 2, and
    None for any other value.
    """
    if rating in (4, 5):
        return "Good"
    if rating == 3:
        return "Neutral"
    if rating in (1, 2):
        return "Bad"
    return None


# ---------------------------------------------------------------------------
# Load data
# ---------------------------------------------------------------------------
data = pd.read_csv("reviews.csv")
# Rows without review text are dropped because the string join and the text
# cleaning below cannot handle missing values. The explicit copy makes the
# column assignments below operate on an independent frame rather than on a
# slice of the frame that was read.
data = data.head(30000).dropna(subset=["Review"]).copy()

# sns.countplot(x="Rating", data=data, palette="pastel")

data["Desc"] = data["Rating"].apply(transform_ratings)
data["length"] = data["Review"].str.len()

# sns.countplot(x="Desc", data=data, palette="pastel")
# sns.scatterplot(x=data['length'][data.length < 500],
#                 y=data['Total_thumbsup'][data.Total_thumbsup < 800], hue=data['Desc'])

# The word cloud is built from the raw (not yet cleaned) review text.
reviews = " ".join(data.loc[:, 'Review'])
wordcloud = WordCloud(width=800, height=800, background_color='white',
                      min_font_size=10).generate(reviews)
# plt.imshow(wordcloud)

# ---------------------------------------------------------------------------
# Text cleaning
# ---------------------------------------------------------------------------
# Matches every run of characters that is not an ASCII letter, digit, space
# or hyphen.
_NON_ALNUM_RE = re.compile(r'[^a-z A-Z 0-9-]+')
# A set gives constant-time membership tests for the per-token filter.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


def clean_review(text):
    """Return *text* lowercased, stripped of symbols and stop words, and stemmed.

    The steps run in this order: lowercase and collapse whitespace, delete
    characters matched by ``_NON_ALNUM_RE``, drop stop words, stem each
    remaining token. Symbols are removed before the stop-word filter, so
    the filter sees tokens without apostrophes or other punctuation.
    """
    lowered = " ".join(text.lower().split())
    tokens = _NON_ALNUM_RE.sub('', lowered).split()
    return " ".join(stemmer.stem(tok) for tok in tokens if tok not in stop_words)


data["Review"] = data["Review"].apply(clean_review)

# ---------------------------------------------------------------------------
# Train / test split and feature extraction
# ---------------------------------------------------------------------------
X = data["Review"]
y = data["Desc"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=40
)

# The encoder and both vectorizers are fitted on the training split only.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

cv = CountVectorizer()
X_train_count = cv.fit_transform(X_train)
X_test_count = cv.transform(X_test)

tfidf = TfidfVectorizer(analyzer='char', ngram_range=(2, 3))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# ---------------------------------------------------------------------------
# Models: each classifier is scored on the held-out test split
# ---------------------------------------------------------------------------
rfl = RandomForestClassifier()
rfl.fit(X_train_count, y_train)
rf_cv = rfl.score(X_test_count, y_test)

rf2 = RandomForestClassifier()
rf2.fit(X_train_tfidf, y_train)
rf_tfidf = rf2.score(X_test_tfidf, y_test)

mnb1 = MultinomialNB()
mnb1.fit(X_train_count, y_train)
mnb_cv = mnb1.score(X_test_count, y_test)

mnb2 = MultinomialNB()
mnb2.fit(X_train_tfidf, y_train)
mnb_tfidf = mnb2.score(X_test_tfidf, y_test)

model = {
    'Model': ['RF-Count', 'RF-TFIDF', 'MNB-Count', 'MNB-TFIDF'],
    'Score': [rf_cv, rf_tfidf, mnb_cv, mnb_tfidf],
}
model_df = pd.DataFrame(model)
sns.barplot(data=model_df, y="Model", x="Score")

disp = ConfusionMatrixDisplay.from_estimator(mnb1, X_test_count, y_test)
disp.ax_.set_title("Confusion Matrix M NaiveBayes-Count")

# ---------------------------------------------------------------------------
# Persist the count-based Naive Bayes model together with the fitted
# vectorizer and label encoder it was trained with.
# ---------------------------------------------------------------------------
with open('model.pkl', 'wb') as f:
    pickle.dump(mnb1, f)

with open('transformer.pkl', 'wb') as f:
    pickle.dump(cv, f)

with open('le.pkl', 'wb') as f:
    pickle.dump(le, f)
