import pandas as pd
import seaborn as sns
from wordcloud import WordCloud 
import matplotlib.pyplot as plt 
import re 
import pickle 
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import ConfusionMatrixDisplay

# Load the review dataset and keep only its first 30000 rows.
data = pd.read_csv("reviews.csv").head(30000)

# sns.countplot(x="Rating", data=data, palette="pastel")

def transform_ratings(rating):
    """Map a star rating onto a coarse sentiment label.

    Returns "Good" for 4 or 5, "Neutral" for 3, "Bad" for 1 or 2, and
    None for any other value.
    """
    if rating in (4, 5):
        return "Good"
    if rating in (1, 2):
        return "Bad"
    return "Neutral" if rating == 3 else None

# Add a sentiment label derived from the rating, then the character length
# of each review.
data["Desc"] = data["Rating"].map(transform_ratings)
data["length"] = data["Review"].str.len()

# sns.countplot(x="Desc", data=data, palette="pastel")

# sns.scatterplot(x=data['length'][data.length < 500],
#                 y=data['Total_thumbsup'][data.Total_thumbsup < 800], hue=data['Desc'])

# Concatenate the raw (not yet cleaned) review texts into one string and
# build a word cloud from it.
reviews = " ".join(data["Review"])
wordcloud = WordCloud(
    background_color='white',
    width=800,
    height=800,
    min_font_size=10,
).generate(reviews)
# plt.imshow(wordcloud)

# Clean the review text in four passes: lowercase, strip punctuation,
# drop stopwords, stem.

# Lowercase every token; split()/join() also collapses runs of whitespace
# into single spaces.
data.loc[:, 'Review'] = data.loc[:, 'Review'].apply(
    lambda review: " ".join(token.lower() for token in review.split()))

# Delete every character that is not an ASCII letter, digit, space or hyphen.
# The pattern is compiled once rather than being looked up for each review.
non_alnum = re.compile('[^a-z A-Z 0-9-]+')
data.loc[:, 'Review'] = data.loc[:, 'Review'].apply(
    lambda review: non_alnum.sub('', review))

# Drop English stopwords. Membership is tested against a frozenset so each
# lookup is O(1) instead of a linear scan of the stopword list per token.
# NOTE(review): punctuation is stripped before this step, so contractions
# such as "don't" have already become "dont" and may no longer match the
# stopword list -- confirm this ordering is intended.
stop_words = stopwords.words('english')
stop_word_set = frozenset(stop_words)
data.loc[:, 'Review'] = data.loc[:, 'Review'].apply(
    lambda review: " ".join(
        token for token in review.split() if token not in stop_word_set))

# Reduce each remaining token to its Porter stem.
stemmer = PorterStemmer()
data.loc[:, 'Review'] = data.loc[:, 'Review'].apply(
    lambda review: " ".join(stemmer.stem(token) for token in review.split()))

# Features are the cleaned review texts; targets are the sentiment labels.
X, y = data["Review"], data["Desc"]

# Hold out 25% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=40, test_size=0.25)

# Encode the string sentiment labels as integers. The encoder must be fitted
# before it can transform: calling transform() on a fresh LabelEncoder raises
# NotFittedError. It is fitted on the training labels and the same mapping is
# then applied to the test labels.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Bag-of-words counts. The vocabulary is learned from the training reviews
# only (fit_transform) and then reused unchanged for the test reviews;
# calling transform() on an unfitted CountVectorizer raises NotFittedError.
cv = CountVectorizer()
X_train_count = cv.fit_transform(X_train)
X_test_count = cv.transform(X_test)

# Character-level TF-IDF over 2- and 3-grams. Vocabulary and IDF weights are
# learned from the training reviews only (fit_transform) and then applied to
# the test reviews; calling transform() on an unfitted TfidfVectorizer raises
# NotFittedError.
tfidf = TfidfVectorizer(analyzer='char', ngram_range=(2, 3))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Fit each classifier on each feature representation and record its score
# on the held-out split.

# Random forest on count features.
rfl = RandomForestClassifier().fit(X_train_count, y_train)
rf_cv = rfl.score(X_test_count, y_test)

# Random forest on TF-IDF features.
rf2 = RandomForestClassifier().fit(X_train_tfidf, y_train)
rf_tfidf = rf2.score(X_test_tfidf, y_test)

# Multinomial Naive Bayes on count features.
mnb1 = MultinomialNB().fit(X_train_count, y_train)
mnb_cv = mnb1.score(X_test_count, y_test)

# Multinomial Naive Bayes on TF-IDF features.
mnb2 = MultinomialNB().fit(X_train_tfidf, y_train)
mnb_tfidf = mnb2.score(X_test_tfidf, y_test)

# Gather the four test scores into one table and plot them side by side.
model = {
    'Model': ['RF-Count', 'RF-TFIDF', 'MNB-Count', 'MNB-TFIDF'],
    'Score': [rf_cv, rf_tfidf, mnb_cv, mnb_tfidf],
}
model_df = pd.DataFrame(data=model)

sns.barplot(x="Score", y="Model", data=model_df)

# Confusion matrix for the count-based Naive Bayes model on the test split.
disp = ConfusionMatrixDisplay.from_estimator(
    estimator=mnb1,
    X=X_test_count,
    y=y_test,
)
disp.ax_.set_title("Confusion Matrix M NaiveBayes-Count")

# Pickle the count-based Naive Bayes model together with the count
# vectorizer and label encoder it was trained with.
for path, artifact in (
    ('model.pkl', mnb1),
    ('transformer.pkl', cv),
    ('le.pkl', le),
):
    with open(path, 'wb') as f:
        pickle.dump(artifact, f)
