import pandas as pd
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re
import pickle
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import ConfusionMatrixDisplay
# Load the reviews dataset and keep the first 30,000 rows
data = pd.read_csv("reviews.csv")
data = data.head(30000)
data.tail()
data.isnull().sum()
# sns.countplot(x="Rating", data=data, palette="pastel")
def transform_ratings(rating):
    # Map numeric star ratings to sentiment labels
    if rating in (4, 5):
        return "Good"
    if rating == 3:
        return "Neutral"
    if rating in (1, 2):
        return "Bad"
data["Desc"] = data["Rating"].apply(transform_ratings)
data["length"] = data["Review"].str.len()
# sns.countplot(x="Desc", data=data, palette="pastel")
data.head()
# sns.scatterplot(x=data['length'][data.length < 500],
# y=data['Total_thumbsup'][data.Total_thumbsup < 800], hue=data['Desc'])
# Build a word cloud from the concatenated review text
reviews = " ".join(data.loc[:, 'Review'])
wordcloud = WordCloud(width=800, height=800, background_color='white', min_font_size=10).generate(reviews)
# plt.imshow(wordcloud)
# Lowercase every review
data.loc[:, 'Review'] = data.loc[:, 'Review'].apply(lambda review: " ".join(word.lower() for word in review.split()))
# Remove characters that are not alphanumeric (spaces and hyphens are kept)
data.loc[:, 'Review'] = data.loc[:, 'Review'].apply(lambda review: re.sub('[^a-z A-Z 0-9-]+', '', review))
# Remove English stopwords (requires the NLTK stopwords corpus, e.g. nltk.download('stopwords'))
stop_words = stopwords.words('english')
data.loc[:, 'Review'] = data.loc[:, 'Review'].apply(lambda review: " ".join(word for word in review.split() if word not in stop_words))
# Reduce each word to its stem with the Porter stemmer
stemmer = PorterStemmer()
data.loc[:, 'Review'] = data.loc[:, 'Review'].apply(lambda review: " ".join(stemmer.stem(word) for word in review.split()))
data.head()["Review"]
X = data["Review"]
y = data["Desc"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=40)
# Encode the string labels ("Bad"/"Good"/"Neutral") as integers
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)
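# Illustrative check (not in the original script): LabelEncoder orders classes
# alphabetically, so le.classes_ shows which integer maps to which label.
# print(le.classes_)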
# Bag-of-words count features
cv = CountVectorizer()
cv.fit(X_train)
X_train_count = cv.transform(X_train)
X_test_count = cv.transform(X_test)
# TF-IDF features over character 2- and 3-grams
tfidf = TfidfVectorizer(analyzer='char', ngram_range=(2, 3))
tfidf.fit(X_train)
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)
# Random forest on count features
rfl = RandomForestClassifier()
rfl.fit(X_train_count, y_train)
rf_cv = rfl.score(X_test_count, y_test)
# Random forest on TF-IDF features
rf2 = RandomForestClassifier()
rf2.fit(X_train_tfidf, y_train)
rf_tfidf = rf2.score(X_test_tfidf, y_test)
# Multinomial Naive Bayes on count features
mnb1 = MultinomialNB()
mnb1.fit(X_train_count, y_train)
mnb_cv = mnb1.score(X_test_count, y_test)
# Multinomial Naive Bayes on TF-IDF features
mnb2 = MultinomialNB()
mnb2.fit(X_train_tfidf, y_train)
mnb_tfidf = mnb2.score(X_test_tfidf, y_test)
# Compare test accuracy across the four model/feature combinations
model = {'Model': ['RF-Count', 'RF-TFIDF', 'MNB-Count', 'MNB-TFIDF'], 'Score': [rf_cv, rf_tfidf, mnb_cv, mnb_tfidf]}
model_df = pd.DataFrame(model)
sns.barplot(data=model_df, y="Model", x="Score")
disp = ConfusionMatrixDisplay.from_estimator(mnb1, X_test_count, y_test)
disp.ax_.set_title("Confusion Matrix MultinomialNB-Count")
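# Optional sketch (not part of the original script): per-class precision/recall/F1
# for the count-based Naive Bayes model, decoded back to the string labels.
from sklearn.metrics import classification_report
print(classification_report(y_test, mnb1.predict(X_test_count), target_names=le.classes_))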
# Save the trained model, count vectorizer, and label encoder for later use
with open('model.pkl', 'wb') as f:
    pickle.dump(mnb1, f)
with open('transformer.pkl', 'wb') as f:
    pickle.dump(cv, f)
with open('le.pkl', 'wb') as f:
    pickle.dump(le, f)
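# Minimal inference sketch (an assumption, not in the original script): reload the
# pickled artifacts and classify a new review. new_review is a hypothetical example,
# and the preprocessing mirrors the steps applied to the training data above.
with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
with open('transformer.pkl', 'rb') as f:
    loaded_cv = pickle.load(f)
with open('le.pkl', 'rb') as f:
    loaded_le = pickle.load(f)
new_review = "this app keeps crashing and drains my battery"
new_review = re.sub('[^a-z A-Z 0-9-]+', '', new_review.lower())
new_review = " ".join(stemmer.stem(w) for w in new_review.split() if w not in stop_words)
print(loaded_le.inverse_transform(loaded_model.predict(loaded_cv.transform([new_review]))))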