Untitled

mail@pastecode.io avatar
unknown
plain_text
a year ago
2.4 kB
3
Indexable
Never
# Load up from the feather file
import pandas as pd
import numpy as np

from pathlib import Path
from google.colab import drive
# Mount Google Drive so the project folder is reachable from the Colab runtime.
drive.mount('/content/drive', force_remount=True)

# Per-user project roots: keep exactly one ROOT assignment uncommented.
# Nick
# ROOT = "/content/drive/MyDrive/Final Project/"
# Omar
# ROOT = "/content/drive/MyDrive/MTH 9796/Final Project/"
# Aneesh
ROOT = "/content/drive/MyDrive/Colab Notebooks/Final Project/"

# Hersh, pass in your root here (uncomment and fill in; a bare `ROOT =` is a
# SyntaxError and stops the whole script from running)
# ROOT = "/content/drive/MyDrive/<your folder>/Final Project/"
root = Path(ROOT)
# Fail early with a clear message instead of a confusing read error later on.
if not root.exists():
    raise FileNotFoundError(f"{ROOT} does not exist, make sure to set the right path")

# Load the sentiment scores and restore the stored 'index' column as the
# DataFrame index.
export_path = root / 'data/sentiment_analysis/'
sentiment_scores_df = pd.read_feather(export_path / 'sentiment_scores2.feather')
sentiment_scores_df = sentiment_scores_df.set_index('index')

# Tweet-Roberta model
%pip install transformers
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# TweetEval task name; it selects both the checkpoint and the label mapping.
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Download the label mapping: tab-separated rows with the label name in the
# second column.
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
# Rows with fewer than two fields (e.g. a trailing blank line) are skipped.
labels = [row[1] for row in csvreader if len(row) > 1]

model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# save_pretrained(MODEL) writes a local directory named after the hub id.
# On a re-run, from_pretrained(MODEL) resolves to that directory, so the
# tokenizer files must be saved there too or AutoTokenizer fails to load.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

def process_tweet(row):
    """Score one tweet with the Twitter-RoBERTa sentiment model.

    Args:
        row: Record indexable by column name (e.g. a DataFrame row) whose
            'cleaned_content' entry holds the tweet text.

    Returns:
        The expected sentiment value: softmax probabilities of the three
        model outputs weighted by -1, 0 and +1, so the result lies in
        [-1, 1]. Presumably the classes are negative / neutral / positive,
        per the downloaded label mapping -- confirm against `labels`.
    """
    text = row['cleaned_content']
    # Truncate explicitly: without it, text longer than the model's position
    # limit (512 tokens for RoBERTa-base) makes the forward pass fail.
    encoded_input = tokenizer(
        text, return_tensors='pt', truncation=True, max_length=512
    )
    output = model(**encoded_input)
    # output[0] holds the logits; [0] selects the single sequence in the batch.
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    score = (scores[0] * -1 + scores[1] * 0 + scores[2] * 1)  # Expectation
    return score

# Score every tweet row by row; process_tweet returns one scalar per row.
sentiment_scores_df['twitter_roberta_score'] = sentiment_scores_df.apply(process_tweet, axis=1, result_type="expand")

# `plt` is used below but is not imported anywhere in the visible script;
# importing it here avoids a NameError (harmless if an earlier cell did it).
import matplotlib.pyplot as plt

# Histogram of the scores, leaving out rows whose score is exactly 0.
roberta_scores = sentiment_scores_df['twitter_roberta_score']
plt.hist(roberta_scores.loc[roberta_scores != 0], density=False, bins=50)
plt.ylabel('Number of tweets')
plt.xlabel('Twitter-Roberta score')
plt.show()


# Save the feather file
# reset_index() turns the index back into a regular 'index' column before
# writing, mirroring the set_index('index') applied after loading.
export_path = root / 'data/sentiment_analysis/'
sentiment_scores_df.reset_index().to_feather(export_path / 'sentiment_scores_twitter_roberta.feather')