Sentiment Classification of App Reviews with scikit-learn
import re
import pickle

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import ConfusionMatrixDisplay

# The NLTK stopword list must be available locally before it can be used below
nltk.download('stopwords', quiet=True)

# Load the reviews and keep the first 30,000 rows
data = pd.read_csv("reviews.csv")
data = data.head(30000)
data.tail()

# Check for missing values and drop rows without review text,
# so the string operations below do not fail
data.isnull().sum()
data = data.dropna(subset=["Review"])

# sns.countplot(x="Rating", data=data, palette="pastel")

# Map the 1-5 star rating onto three sentiment classes
def transform_ratings(rating):
    if rating in (4, 5):
        return "Good"
    if rating == 3:
        return "Neutral"
    if rating in (1, 2):
        return "Bad"

data["Desc"] = data["Rating"].apply(transform_ratings)
data["length"] = data["Review"].str.len()

# sns.countplot(x="Desc", data=data, palette="pastel")
data.head()

# sns.scatterplot(x=data['length'][data.length < 500],
#                 y=data['Total_thumbsup'][data.Total_thumbsup < 800],
#                 hue=data['Desc'])

# Word cloud over all review text
reviews = " ".join(data.loc[:, 'Review'])
wordcloud = WordCloud(width=800, height=800, background_color='white',
                      min_font_size=10).generate(reviews)
# plt.imshow(wordcloud)

# Lowercase every word
data.loc[:, 'Review'] = data.loc[:, 'Review'].apply(
    lambda text: " ".join(word.lower() for word in text.split()))

# Remove characters that are not alphanumeric (spaces and hyphens are kept)
data.loc[:, 'Review'] = data.loc[:, 'Review'].apply(
    lambda text: re.sub('[^a-z A-Z 0-9-]+', '', text))

# Remove English stopwords
stop_words = stopwords.words('english')
data.loc[:, 'Review'] = data.loc[:, 'Review'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_words))

# Reduce each word to its stem
stemmer = PorterStemmer()
data.loc[:, 'Review'] = data.loc[:, 'Review'].apply(
    lambda text: " ".join(stemmer.stem(word) for word in text.split()))

data.head()["Review"]

# Train/test split on the cleaned text and the sentiment label
X = data["Review"]
y = data["Desc"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=40)

# Encode the string labels (Bad/Good/Neutral) as integers
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

# Bag-of-words features
cv = CountVectorizer()
cv.fit(X_train)
X_train_count = cv.transform(X_train)
X_test_count = cv.transform(X_test)

# TF-IDF features over character 2- and 3-grams
tfidf = TfidfVectorizer(analyzer='char', ngram_range=(2, 3))
tfidf.fit(X_train)
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Random forest on both feature sets
rf1 = RandomForestClassifier()
rf1.fit(X_train_count, y_train)
rf_cv = rf1.score(X_test_count, y_test)

rf2 = RandomForestClassifier()
rf2.fit(X_train_tfidf, y_train)
rf_tfidf = rf2.score(X_test_tfidf, y_test)

# Multinomial naive Bayes on both feature sets
mnb1 = MultinomialNB()
mnb1.fit(X_train_count, y_train)
mnb_cv = mnb1.score(X_test_count, y_test)

mnb2 = MultinomialNB()
mnb2.fit(X_train_tfidf, y_train)
mnb_tfidf = mnb2.score(X_test_tfidf, y_test)

# Compare the four model/feature combinations
model = {'Model': ['RF-Count', 'RF-TFIDF', 'MNB-Count', 'MNB-TFIDF'],
         'Score': [rf_cv, rf_tfidf, mnb_cv, mnb_tfidf]}
model_df = pd.DataFrame(model)
sns.barplot(data=model_df, y="Model", x="Score")

# Confusion matrix for the naive Bayes model on count features
disp = ConfusionMatrixDisplay.from_estimator(mnb1, X_test_count, y_test)
disp.ax_.set_title("Confusion Matrix - MultinomialNB (Count)")

# Persist the model, the vectorizer, and the label encoder for later use
with open('model.pkl', 'wb') as f:
    pickle.dump(mnb1, f)
with open('transformer.pkl', 'wb') as f:
    pickle.dump(cv, f)
with open('le.pkl', 'wb') as f:
    pickle.dump(le, f)
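Because the naive Bayes model, the fitted CountVectorizer, and the LabelEncoder are pickled at the end, they can be reloaded later to score new reviews. The sketch below is a minimal example of how that might look, assuming the pickle files produced above are on disk and the same preprocessing steps (lowercasing, punctuation removal, stopword removal, stemming) are reapplied to incoming text; the clean and predict_sentiment helper names are hypothetical, not part of the original script.

import re
import pickle

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Load the artifacts saved by the training script
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)
with open('transformer.pkl', 'rb') as f:
    cv = pickle.load(f)
with open('le.pkl', 'rb') as f:
    le = pickle.load(f)

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean(text):
    # Mirror the training-time preprocessing: lowercase, strip punctuation,
    # drop stopwords, and stem each remaining word
    text = " ".join(word.lower() for word in text.split())
    text = re.sub('[^a-z A-Z 0-9-]+', '', text)
    words = [word for word in text.split() if word not in stop_words]
    return " ".join(stemmer.stem(word) for word in words)

def predict_sentiment(text):
    # Vectorize with the fitted CountVectorizer, predict with the pickled
    # MultinomialNB, and map the integer label back to Good/Neutral/Bad
    features = cv.transform([clean(text)])
    label = model.predict(features)[0]
    return le.inverse_transform([label])[0]

print(predict_sentiment("This app is great, works exactly as described"))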