# Untitled
#
# avatar
# user_9363972
# python
# a year ago
# 3.6 kB
# 3
# Indexable
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud 
import matplotlib.pyplot as plt 
import re 
import pickle 
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import ConfusionMatrixDisplay


# Load the review data set. Only the first MAX_ROWS rows are used, so ask
# read_csv to stop there instead of parsing the whole file and discarding the
# remainder afterwards with head().
MAX_ROWS = 30000
data = pd.read_csv("reviews.csv", nrows=MAX_ROWS)

# Notebook-style inspection: as bare expressions these display nothing when
# the file is run as a plain script.
data.tail()
data.isnull().sum()

# Rows with a missing review text or rating cannot be processed by the string
# operations and label mapping applied later in this file, so drop them up
# front. This is a no-op when nothing is missing.
data = data.dropna(subset=["Review", "Rating"])
# sns.countplot(x="Rating", data=data, palette="pastel")


def transform_ratings(rating):
    """Map a star rating onto a coarse sentiment label.

    Returns "Good" for 5 or 4, "Neutral" for 3 and "Bad" for 2 or 1. Any
    other value matches no branch and yields None.
    """
    if rating in (5, 4):
        return "Good"
    elif rating == 3:
        return "Neutral"
    elif rating in (2, 1):
        return "Bad"
    return None

# Add two derived columns: the sentiment label computed from the star rating
# and the character count of each review.
data = data.assign(
    Desc=data["Rating"].apply(transform_ratings),
    length=data["Review"].str.len(),
)

# sns.countplot(x="Desc", data=data, palette="pastel")
data.head()

# sns.scatterplot(x=data['length'][data.length < 500],
#                 y=data['Total_thumbsup'][data.Total_thumbsup < 800], hue=data['Desc'])

# Concatenate every review into one string and build a word cloud from it.
reviews = " ".join(data["Review"])
wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='white',
    min_font_size=10,
)
wordcloud = wordcloud.generate(reviews)
# plt.imshow(wordcloud)

# Matches every run of characters that is not a letter, digit, space or
# hyphen. Compiled once because it is applied to every review.
_DISALLOWED_CHARS = re.compile('[^a-z A-Z 0-9-]+')

stop_words = stopwords.words('english')
# Set copy of the stop-word list: membership is tested once per word of every
# review, and a set lookup avoids scanning the whole list each time.
_stop_word_set = frozenset(stop_words)

stemmer = PorterStemmer()


def _clean_review(text):
    """Normalise one review string before vectorisation.

    Steps, in order: lower-case every word and collapse whitespace to single
    spaces, delete characters other than letters, digits, spaces and hyphens,
    drop English stop words, then replace each remaining word with its Porter
    stem.

    NOTE(review): punctuation is removed before stop-word filtering, so a
    contraction such as "don't" becomes "dont" and presumably no longer
    matches the stop-word list - confirm whether that is intended.
    """
    text = " ".join(token.lower() for token in text.split())
    text = _DISALLOWED_CHARS.sub('', text)
    return " ".join(
        stemmer.stem(token)
        for token in text.split()
        if token not in _stop_word_set
    )


# One pass over the column instead of four separate apply() calls; the
# resulting text is the same.
data.loc[:, 'Review'] = data.loc[:, 'Review'].apply(_clean_review)
data.head()["Review"]

# Features are the cleaned review text; targets are the sentiment labels.
X, y = data["Review"], data["Desc"]

# Hold out a quarter of the rows for evaluation; the fixed seed keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=40,
    test_size=0.25,
)

# Encode the string labels as integers, learning the mapping from the
# training targets only.
le = LabelEncoder().fit(y_train)
y_train, y_test = le.transform(y_train), le.transform(y_test)

# Word-count features, with the vocabulary learned from the training text.
cv = CountVectorizer().fit(X_train)
X_train_count = cv.transform(X_train)
X_test_count = cv.transform(X_test)

# Character-level TF-IDF over 2- and 3-character n-grams, also fitted on the
# training text only.
tfidf = TfidfVectorizer(analyzer='char', ngram_range=(2, 3)).fit(X_train)
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Train each classifier on each feature representation and record its score
# on the held-out set.
#
# The random forests are seeded with the same value as the train/test split:
# an unseeded forest produces a different score on every run, which would
# make the model comparison built from these scores unrepeatable.

# Random forest on word counts.
rfl = RandomForestClassifier(random_state=40)
rfl.fit(X_train_count, y_train)
rf_cv = rfl.score(X_test_count, y_test)

# Random forest on character n-gram TF-IDF.
rf2 = RandomForestClassifier(random_state=40)
rf2.fit(X_train_tfidf, y_train)
rf_tfidf = rf2.score(X_test_tfidf, y_test)

# Multinomial naive Bayes on word counts.
mnb1 = MultinomialNB()
mnb1.fit(X_train_count, y_train)
mnb_cv = mnb1.score(X_test_count, y_test)

# Multinomial naive Bayes on character n-gram TF-IDF.
mnb2 = MultinomialNB()
mnb2.fit(X_train_tfidf, y_train)
mnb_tfidf = mnb2.score(X_test_tfidf, y_test)


# Collect the four held-out scores into a frame and chart them side by side.
model = {
    'Model': ['RF-Count', 'RF-TFIDF', 'MNB-Count', 'MNB-TFIDF'],
    'Score': [rf_cv, rf_tfidf, mnb_cv, mnb_tfidf],
}
model_df = pd.DataFrame(model)

sns.barplot(data=model_df, y="Model", x="Score")

# Confusion matrix for the count-based naive Bayes model. The classifier was
# trained on integer-encoded labels, so by default the axes would show bare
# class indices; map them back to the original label strings. Passing
# `labels` explicitly keeps the matrix aligned with those names even if some
# class is absent from the test split.
disp = ConfusionMatrixDisplay.from_estimator(
    mnb1,
    X_test_count,
    y_test,
    labels=mnb1.classes_,
    display_labels=le.inverse_transform(mnb1.classes_),
)
disp.ax_.set_title("Confusion Matrix M NaiveBayes-Count")

# Persist the count-based naive Bayes model together with the vectorizer and
# label encoder that were fitted alongside it, one pickle file per object.
for _filename, _artifact in (
    ('model.pkl', mnb1),
    ('transformer.pkl', cv),
    ('le.pkl', le),
):
    with open(_filename, 'wb') as _handle:
        pickle.dump(_artifact, _handle)
# Leave a Comment