Untitled
unknown
plain_text
a year ago
2.4 kB
3
Indexable
Never
# Load the previously computed sentiment scores from the feather file on Drive,
# score every tweet with the Twitter-RoBERTa sentiment model, and store the
# result in a new `twitter_roberta_score` column.
import importlib
import subprocess
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

# Per-user project roots: leave exactly one ROOT assignment active.
# Nick
# ROOT = "/content/drive/MyDrive/Final Project/"
# Omar
# ROOT = "/content/drive/MyDrive/MTH 9796/Final Project/"
# Aneesh
ROOT = "/content/drive/MyDrive/Colab Notebooks/Final Project/"
# Hersh, pass in your root here
# NOTE(review): ROOT is rebound to a Path here (same object as `root`); this
# may originally have been a commented-out placeholder -- confirm.
ROOT = root = Path(ROOT)
assert root.exists(), f"{ROOT} does not exist, make sure to set the right path"

export_path = root / 'data/sentiment_analysis/'
sentiment_scores_df = pd.read_feather(export_path / 'sentiment_scores2.feather')
sentiment_scores_df = sentiment_scores_df.set_index('index')

# Tweet-Roberta model
# Plain-Python replacement for `%pip install transformers`: installs only when
# the package is missing and also works outside IPython.
try:
    import transformers  # noqa: F401
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers"])
    importlib.invalidate_caches()

import csv
import urllib.request

import torch
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer

task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping
labels = []
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
# Each mapping row is "<id>\t<label>"; rows without a second field are skipped.
labels = [row[1] for row in csvreader if len(row) > 1]

model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# Saving the model creates a local directory named like the hub id. Save the
# tokenizer next to it so that a re-run, which then resolves MODEL to that
# local directory, can still load the tokenizer.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)


def process_tweet(row):
    """Return the expected sentiment of one tweet.

    Args:
        row: A DataFrame row (or any mapping) with a 'cleaned_content' entry
            holding the tweet text.

    Returns:
        The expectation of the class weights (-1, 0, +1) under the model's
        softmax probabilities, i.e. a value between -1 and 1.
    """
    text = row['cleaned_content']
    # Cap the sequence length explicitly: RoBERTa-base cannot embed more than
    # 512 tokens, so an over-long text would otherwise fail in the forward pass.
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(**encoded_input)
    scores = softmax(output.logits[0].detach().numpy())
    # Assumes class order 0=negative, 1=neutral, 2=positive (check `labels`).
    score = (scores[0] * -1 + scores[1] * 0 + scores[2] * 1)  # Expectation
    return score


sentiment_scores_df['twitter_roberta_score'] = sentiment_scores_df.apply(
    process_tweet, axis=1, result_type="expand"
)
# Histogram of the Twitter-RoBERTa scores, leaving out tweets whose score is
# exactly 0.
# NOTE(review): `plt` is not imported anywhere in the visible source --
# presumably `import matplotlib.pyplot as plt` ran in an earlier cell; confirm.
roberta_scores = sentiment_scores_df['twitter_roberta_score']
nonzero_scores = roberta_scores.loc[roberta_scores != 0]
plt.hist(nonzero_scores, density=False, bins=50)
plt.ylabel('Number of tweets')
plt.xlabel('Twitter-Roberta score')
plt.show()

# Save the feather file
# The index is moved back into a regular column before writing, so it is
# stored in the file alongside the scores.
export_path = root / 'data/sentiment_analysis/'
output_file = export_path / 'sentiment_scores_twitter_roberta.feather'
sentiment_scores_df.reset_index().to_feather(output_file)