# Paste-site header captured with the code (not part of the program):
# Untitled
# unknown
# plain_text
# a year ago
# 12 kB
# 9
# Indexable
import os
import nltk
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential
from nltk.tokenize import sent_tokenize
from translation import translate
from base import AbstractSentimentAnalyzer
import time
from dotenv import load_dotenv
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK data this module relies on. This runs at import time.
#   - "stopwords" and "wordnet" back the text-cleaning step (stop-word
#     removal and lemmatization).
#   - "punkt" / "punkt_tab" back sent_tokenize, which is used to chunk long
#     texts; without them that path raises LookupError. Older NLTK releases
#     look for "punkt", newer ones for "punkt_tab", so both are requested;
#     nltk.download reports an unknown id and returns False rather than raising.
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")
nltk.download("punkt_tab")
class SentimentAnalyzer(AbstractSentimentAnalyzer):
    """Sentiment analyzer backed by the Azure Text Analytics service."""

    def __init__(self, language_key: str, language_endpoint: str):
        """
        Summary:
            Stores the Azure Text Analytics credentials. The client itself is
            created on every call to extract_sentiment.
        Args:
            language_key: str, the Azure Text Analytics key
            language_endpoint: str, the Azure Text Analytics endpoint
        Returns:
            None
        """
        self.language_key = language_key
        self.language_endpoint = language_endpoint

    @staticmethod
    def _clean_text(text: str) -> str:
        """
        Summary:
            Lower-cases the text, collapses all whitespace to single spaces,
            drops English stop words and lemmatizes the remaining words.
        Args:
            text: str, the (English) text to clean
        Returns:
            str, the cleaned text with words separated by single spaces
        """
        # A set gives O(1) membership tests instead of scanning a list per word.
        stop_words = set(nltk.corpus.stopwords.words("english"))
        lemmatizer = nltk.stem.WordNetLemmatizer()
        # str.split() with no argument splits on any whitespace run, so
        # newlines, carriage returns and tabs need no separate handling.
        # NOTE(review): NLTK's English stop-word list contains negations such
        # as "not" and "no"; removing them may change the detected sentiment.
        # Confirm that this is intended.
        kept_words = [word for word in text.lower().split() if word not in stop_words]
        return " ".join(lemmatizer.lemmatize(word) for word in kept_words)

    @staticmethod
    def _chunk_sentences(sentences: list, chunk_size: int) -> list:
        """
        Summary:
            Greedily packs sentences into chunks shorter than chunk_size
            characters (joining spaces included).
        Args:
            sentences: list of str, the sentences in document order
            chunk_size: int, the exclusive upper bound on a chunk's length
        Returns:
            list of str, the chunks in document order; no chunk is empty and
            no input text is dropped
        """
        # A "sentence" that is itself as long as a chunk (e.g. a transcript
        # with no punctuation) is broken into words so it can still be packed.
        units = []
        for sentence in sentences:
            if len(sentence) < chunk_size:
                units.append(sentence)
            else:
                units.extend(sentence.split())

        chunks, current, current_size = [], [], 0
        for unit in units:
            # Length the chunk would have once this unit is joined on.
            grown_size = (current_size + 1 + len(unit)) if current else len(unit)
            if current and grown_size >= chunk_size:
                chunks.append(" ".join(current))
                current, grown_size = [], len(unit)
            current.append(unit)
            current_size = grown_size
        # Flush the trailing chunk; without this the end of the text is lost.
        if current:
            chunks.append(" ".join(current))
        return chunks

    @staticmethod
    def _confident_sentences(sentences, threshold: float) -> tuple:
        """
        Summary:
            Selects the sentences labelled positive or negative whose matching
            confidence score is greater than threshold.
        Args:
            sentences: iterable of sentence results from analyze_sentiment
            threshold: float, the minimum (exclusive) confidence score
        Returns:
            tuple (positives, negatives) of lists of dicts holding the
            sentence text and its "positive_score" / "negative_score"
        """
        positives, negatives = [], []
        for sentence in sentences:
            scores = sentence.confidence_scores
            if sentence.sentiment == "positive" and scores.positive > threshold:
                positives.append(
                    {"sentence": sentence.text, "positive_score": scores.positive}
                )
            elif sentence.sentiment == "negative" and scores.negative > threshold:
                negatives.append(
                    {"sentence": sentence.text, "negative_score": scores.negative}
                )
        return positives, negatives

    @staticmethod
    def _format_top_sentences(candidates: list, score_key: str, limit: int = 5) -> str:
        """
        Summary:
            Keeps the `limit` highest-scoring sentences, translates each one
            from English to Greek and renders them one per line.
        Args:
            candidates: list of dicts with a "sentence" entry and a score_key entry
            score_key: str, "positive_score" or "negative_score"
            limit: int, how many sentences to keep
        Returns:
            str, lines of the form "<translated sentence> <score>\\n"
        """
        ranked = sorted(candidates, key=lambda item: item[score_key], reverse=True)[:limit]
        return "".join(
            translate(text=item["sentence"], detected_lang="en", target_lang="el")
            + " "
            + str(item[score_key])
            + "\n"
            for item in ranked
        )

    def extract_sentiment(self, text: str, input_language: str) -> dict:
        """
        Summary:
            Calculate the sentiment analysis of the transcription. The text is
            translated to English, cleaned and sent to Azure Text Analytics.
            Texts longer than 5000 characters after cleaning are analyzed in
            chunks and the per-chunk scores are averaged.
        Args:
            text: str, the transcription to analyze
            input_language: str, the language of `text`; one of "en", "el"
        Returns:
            result: dict with "sentiment", "top_positive_sentences" and
            "top_negative_sentences" (Greek text, one "<sentence> <score>"
            per line), plus the confidence scores:
              - chunked texts: "sentiment_confidence_scores", a dict with
                "positive" / "neutral" / "negative" averages rounded to 2
                decimals
              - other texts: "sentiment_positive_confidence_score",
                "sentiment_neutral_confidence_score" and
                "sentiment_negative_confidence_score"
        Raises:
            ValueError: if input_language is not supported
        """
        available_languages = ["en", "el"]
        if input_language not in available_languages:
            raise ValueError(
                "The input language is not supported. Please use one of the following languages: "
                + ", ".join(available_languages)
            )

        # Load the Azure Text Analytics client
        credential = AzureKeyCredential(self.language_key)
        text_analytics_client = TextAnalyticsClient(
            endpoint=self.language_endpoint, credential=credential
        )

        # NOTE(review): translate() is called even when input_language is
        # already "en"; presumably a no-op in that case. Confirm against the
        # translation helper.
        text = translate(text=text, target_lang="en", detected_lang=input_language)
        text = self._clean_text(text)

        MAX_LENGTH = 5000  # longest cleaned text analyzed as a single document
        CHUNK_SIZE = 1500  # upper bound on the length of each chunk
        THRESHOLD = 0.3  # minimum confidence for a sentence to be reported

        result = {}
        top_positive_sentences, top_negative_sentences = [], []

        if len(text) > MAX_LENGTH:
            # Split the text into sentences and pack them into chunks
            chunks = self._chunk_sentences(sent_tokenize(text), CHUNK_SIZE)

            # Accumulate the document-level scores of every chunk
            positive, neutral, negative = 0, 0, 0
            sentiments = []
            for chunk in chunks:
                sentiment = text_analytics_client.analyze_sentiment(documents=[chunk])[0]
                sentiments.append(sentiment.sentiment)
                positive += sentiment.confidence_scores.positive
                neutral += sentiment.confidence_scores.neutral
                negative += sentiment.confidence_scores.negative
                chunk_positive, chunk_negative = self._confident_sentences(
                    sentiment.sentences, THRESHOLD
                )
                top_positive_sentences.extend(chunk_positive)
                top_negative_sentences.extend(chunk_negative)

            # Majority label; on a tie the label that occurs first wins, which
            # keeps the result deterministic.
            result["sentiment"] = max(sentiments, key=sentiments.count)
            result["sentiment_confidence_scores"] = {
                "positive": round(positive / len(sentiments), 2),
                "neutral": round(neutral / len(sentiments), 2),
                "negative": round(negative / len(sentiments), 2),
            }
        else:
            # Calculate the sentiment of the whole text
            sentiment_analysis = text_analytics_client.analyze_sentiment(
                documents=[text]
            )[0]
            top_positive_sentences, top_negative_sentences = self._confident_sentences(
                sentiment_analysis.sentences, THRESHOLD
            )
            result["sentiment"] = sentiment_analysis.sentiment
            result["sentiment_positive_confidence_score"] = (
                sentiment_analysis.confidence_scores.positive
            )
            result["sentiment_neutral_confidence_score"] = (
                sentiment_analysis.confidence_scores.neutral
            )
            result["sentiment_negative_confidence_score"] = (
                sentiment_analysis.confidence_scores.negative
            )

        # The two lists are formatted independently, so an empty list on one
        # side does not suppress the sentences found on the other.
        result["top_positive_sentences"] = self._format_top_sentences(
            top_positive_sentences, "positive_score"
        )
        result["top_negative_sentences"] = self._format_top_sentences(
            top_negative_sentences, "negative_score"
        )
        return result
def main():
    """
    Summary:
        Runs the sentiment analysis on every .txt transcript in
        ./new_labeled_transcripts/, once on the Greek text and once on its
        English translation, and prints both results and the total run time.
    Returns:
        None
    Raises:
        SystemExit: if the Azure credentials are not configured
    """
    start = time.time()

    # Load the environment variables
    load_dotenv()
    key = os.getenv("AZURE_LANGUAGE_KEY")
    endpoint = os.getenv("AZURE_LANGUAGE_ENDPOINT")
    # Fail here with a clear message instead of deep inside the first
    # analysis call.
    if not key or not endpoint:
        raise SystemExit(
            "AZURE_LANGUAGE_KEY and AZURE_LANGUAGE_ENDPOINT must be set "
            "(in the environment or in a .env file)."
        )

    # Create a SentimentAnalyzer object
    analyzer = SentimentAnalyzer(key, endpoint)

    transcripts_folder = "./new_labeled_transcripts/"
    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for name in sorted(os.listdir(transcripts_folder)):
        if not name.endswith(".txt"):
            continue
        filename = os.path.join(transcripts_folder, name)

        # Read the transcript file (supposed to be in Greek); the with-block
        # closes the file.
        with open(filename, "r", encoding="utf8") as f:
            greek_transcript = f.read()

        # Extract the sentiment from the transcript (Greek version)
        greek_result = analyzer.extract_sentiment(greek_transcript, "el")

        # Translate the transcript to English and extract the sentiment
        # (English version)
        english_transcript = translate(
            text=greek_transcript, target_lang="en", detected_lang="el"
        )
        # NOTE(review): presumably a pause to stay under the services' rate
        # limits. Confirm before changing.
        time.sleep(10)
        english_result = analyzer.extract_sentiment(english_transcript, "en")

        # Print the results
        print(filename)
        print('Greek:\n', greek_result)
        print('\nEnglish:\n', english_result)
        print("\n\n")

    print("\n\nExecution Time:", int(time.time() - start), "seconds")
# Run the batch analysis only when the file is executed as a script.
if __name__ == "__main__":
    main()
# Leave a Comment  (paste-site page text captured with the code, not part of the program)