Untitled
unknown
plain_text
a year ago
12 kB
5
Indexable
import os
import time

import nltk
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential
from dotenv import load_dotenv
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize

from base import AbstractSentimentAnalyzer
from translation import translate

nltk.download("stopwords")
nltk.download("wordnet")
# NOTE(review): sent_tokenize relies on NLTK's Punkt tokenizer data, which is
# not downloaded here -- confirm it is installed in the target environment.


class SentimentAnalyzer(AbstractSentimentAnalyzer):
    """
    Summary:
        Sentiment analysis of transcripts with Azure Text Analytics.
        The text is translated to English, cleaned, optionally split into
        chunks, and sent to the service.
    """

    # Languages accepted by extract_sentiment.
    SUPPORTED_LANGUAGES = ("en", "el")
    # Cleaned texts longer than this many characters are analysed in chunks.
    MAX_LENGTH = 5000
    # Target size (in characters) of each chunk.
    CHUNK_SIZE = 1500
    # Minimum confidence score for a sentence to be reported.
    THRESHOLD = 0.3
    # Number of positive / negative sentences kept in the result.
    TOP_SENTENCES = 5

    def __init__(self, language_key: str, language_endpoint: str):
        """
        Summary:
            Stores the Azure Text Analytics credentials. The client itself
            is created on each call to extract_sentiment.
        Args:
            language_key: str, the Azure Text Analytics key
            language_endpoint: str, the Azure Text Analytics endpoint
        Returns:
            None
        """
        self.language_key = language_key
        self.language_endpoint = language_endpoint

    def extract_sentiment(self, text: str, input_language: str) -> dict:
        """
        Summary:
            Calculate the sentiment analysis of the transcription
        Args:
            text: str, the transcription to analyze
            input_language: str, language code of the transcription;
                must be one of SUPPORTED_LANGUAGES
        Returns:
            result: dict with the keys "sentiment", "top_positive_sentences"
                and "top_negative_sentences". For texts of at most MAX_LENGTH
                characters (after cleaning) the scores are stored under
                "sentiment_positive_confidence_score",
                "sentiment_neutral_confidence_score" and
                "sentiment_negative_confidence_score". For longer texts the
                per-chunk scores are averaged, rounded to 2 decimals and
                stored as a dict under "sentiment_confidence_scores".
        Raises:
            ValueError: if input_language is not supported
        """
        # Load the Azure Text Analytics client
        credential = AzureKeyCredential(self.language_key)
        text_analytics_client = TextAnalyticsClient(
            endpoint=self.language_endpoint, credential=credential
        )

        # The text always goes through translate(), also when it is already
        # English (unchanged from the previous behaviour).
        if input_language not in self.SUPPORTED_LANGUAGES:
            raise ValueError(
                "The input language is not supported. "
                "Please use one of the following languages: "
                + ", ".join(self.SUPPORTED_LANGUAGES)
            )
        text = translate(text=text, target_lang="en", detected_lang=input_language)

        text = self._clean_text(text)

        # Long texts are split on sentence boundaries and analysed per chunk.
        chunked = len(text) > self.MAX_LENGTH
        if chunked:
            pieces = self._chunk_sentences(sent_tokenize(text), self.CHUNK_SIZE)
        else:
            pieces = [text]

        documents = [
            text_analytics_client.analyze_sentiment(documents=[piece])[0]
            for piece in pieces
        ]
        positives, negatives = self._collect_top_sentences(documents)

        result = {}
        if chunked:
            labels = [document.sentiment for document in documents]
            count = len(documents)
            # Majority label; ties resolve to the label that appears first,
            # which keeps the outcome deterministic.
            result["sentiment"] = max(labels, key=labels.count)
            result["sentiment_confidence_scores"] = {
                "positive": round(
                    sum(d.confidence_scores.positive for d in documents) / count, 2
                ),
                "neutral": round(
                    sum(d.confidence_scores.neutral for d in documents) / count, 2
                ),
                "negative": round(
                    sum(d.confidence_scores.negative for d in documents) / count, 2
                ),
            }
        else:
            document = documents[0]
            result["sentiment"] = document.sentiment
            result["sentiment_positive_confidence_score"] = (
                document.confidence_scores.positive
            )
            result["sentiment_neutral_confidence_score"] = (
                document.confidence_scores.neutral
            )
            result["sentiment_negative_confidence_score"] = (
                document.confidence_scores.negative
            )

        # The two lists are formatted independently so that a short (or
        # empty) list on one side does not truncate the other.
        result["top_positive_sentences"] = self._format_top_sentences(positives)
        result["top_negative_sentences"] = self._format_top_sentences(negatives)
        return result

    @staticmethod
    def _clean_text(text: str) -> str:
        """Lower-case the text, drop English stop words and lemmatize each word."""
        # NOTE(review): NLTK's English stop-word list appears to include
        # negations such as "not" / "no" -- confirm that removing them before
        # sentiment analysis is intended.
        stop_words = set(stopwords.words("english"))
        lemmatizer = WordNetLemmatizer()
        # split() breaks on any whitespace run (newlines, tabs, ...), so
        # re-joining with single spaces also normalises the whitespace.
        return " ".join(
            lemmatizer.lemmatize(word)
            for word in text.lower().split()
            if word not in stop_words
        )

    @staticmethod
    def _chunk_sentences(sentences: list, chunk_size: int) -> list:
        """Group consecutive sentences into space-joined chunks below chunk_size."""
        chunks, current, current_size = [], [], 0
        for sentence in sentences:
            # Start a new chunk when this sentence would reach the limit.
            # An empty chunk is never emitted; a single sentence longer than
            # chunk_size therefore becomes a chunk of its own.
            if current and current_size + len(sentence) >= chunk_size:
                chunks.append(" ".join(current))
                current, current_size = [], 0
            current.append(sentence)
            current_size += len(sentence)
        # Flush the trailing sentences, which would otherwise be lost.
        if current:
            chunks.append(" ".join(current))
        return chunks

    def _collect_top_sentences(self, documents: list) -> tuple:
        """Return (positives, negatives): (text, score) pairs scoring above THRESHOLD."""
        positives, negatives = [], []
        for document in documents:
            for sentence in document.sentences:
                scores = sentence.confidence_scores
                if sentence.sentiment == "positive" and scores.positive > self.THRESHOLD:
                    positives.append((sentence.text, scores.positive))
                elif sentence.sentiment == "negative" and scores.negative > self.THRESHOLD:
                    negatives.append((sentence.text, scores.negative))
        return positives, negatives

    def _format_top_sentences(self, scored_sentences: list) -> str:
        """Format the highest-scoring sentences as '<sentence> <score>' lines."""
        best = sorted(scored_sentences, key=lambda pair: pair[1], reverse=True)
        lines = []
        for sentence, score in best[: self.TOP_SENTENCES]:
            # Sentences are reported in Greek regardless of the input language.
            translated = translate(text=sentence, detected_lang="en", target_lang="el")
            lines.append(translated + " " + str(score) + "\n")
        return "".join(lines)


def main():
    """
    Summary:
        Runs the sentiment analysis on every .txt transcript found in
        ./new_labeled_transcripts/, once on the Greek text and once on its
        English translation, and prints both results.
    Raises:
        RuntimeError: if the Azure environment variables are not set
    """
    start = time.time()

    # Load the environment variables
    load_dotenv()
    key = os.getenv("AZURE_LANGUAGE_KEY")
    endpoint = os.getenv("AZURE_LANGUAGE_ENDPOINT")
    if not key or not endpoint:
        raise RuntimeError(
            "AZURE_LANGUAGE_KEY and AZURE_LANGUAGE_ENDPOINT must be set"
        )

    # Create a SentimentAnalyzer object
    analyzer = SentimentAnalyzer(key, endpoint)

    transcripts_folder = "./new_labeled_transcripts/"
    # Sorted so that the files are processed in a reproducible order.
    for name in sorted(os.listdir(transcripts_folder)):
        filename = os.path.join(transcripts_folder, name)
        if not filename.endswith(".txt"):
            continue

        # Read the transcript file (supposed to be in greek language)
        with open(filename, "r", encoding='utf8') as f:
            greek_transcript = f.read()

        # Extract the sentiment from the transcript (Greek Version)
        greek_result = analyzer.extract_sentiment(greek_transcript, "el")

        # Translate the transcript to English and extract the sentiment (English Version)
        english_transcript = translate(greek_transcript, "en", "el")
        # NOTE(review): presumably a pause to stay under a service rate
        # limit -- confirm the reason and the required duration.
        time.sleep(10)
        english_result = analyzer.extract_sentiment(english_transcript, "en")

        # Print the results
        print(filename)
        print('Greek:\n', greek_result)
        print('\nEnglish:\n', english_result)
        print("\n\n")

    print("\n\nExecution Time:", int(time.time() - start), "seconds")


if __name__ == "__main__":
    main()
Editor is loading...
Leave a Comment