# Sentiment analysis of transcription files using Azure AI Language (Text Analytics),
# with NLTK-based text cleaning and Greek<->English translation of results.
import os
import nltk
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential
from nltk.tokenize import sent_tokenize
from translation import translate
from base import AbstractSentimentAnalyzer
import time
from dotenv import load_dotenv

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by SentimentAnalyzer (cached after the first run).
nltk.download("stopwords")
nltk.download("wordnet")
# "punkt" provides the sentence tokenizer models required by sent_tokenize;
# without it the chunking path raises a LookupError at runtime.
nltk.download("punkt")


class SentimentAnalyzer(AbstractSentimentAnalyzer):
    """Sentiment analysis of transcriptions via Azure AI Language (Text Analytics).

    Texts longer than MAX_LENGTH characters are split into sentence chunks of at
    most CHUNK_SIZE characters, each chunk is analyzed separately, and the
    per-chunk results are aggregated.
    """

    # Documents longer than this are chunked before being sent to Azure.
    MAX_LENGTH = 5000
    # Target size (in characters) of each sentence chunk.
    CHUNK_SIZE = 1500
    # Minimum per-sentence confidence for a sentence to be reported as a "top" sentence.
    THRESHOLD = 0.3
    # How many top positive/negative sentences to keep.
    TOP_N = 5

    def __init__(self, language_key: str, language_endpoint: str):
        """
        Summary:
        Initializes the SentimentAnalyzer object with the Azure Text Analytics credentials

        Args:
        language_key: str, the Azure Text Analytics key
        language_endpoint: str, the Azure Text Analytics endpoint

        Returns:
        None
        """
        self.language_key = language_key
        self.language_endpoint = language_endpoint

    @staticmethod
    def _clean_text(text: str) -> str:
        """Lowercase, remove English stopwords, and lemmatize the text."""
        # Do not shadow the imported `stopwords` module with a local name.
        stop_words = set(nltk.corpus.stopwords.words("english"))
        lemmatizer = nltk.stem.WordNetLemmatizer()
        # str.split() already splits on any whitespace (\n, \r, \t, spaces),
        # so no explicit replace() chain is needed.
        words = [word for word in text.lower().split() if word not in stop_words]
        return " ".join(lemmatizer.lemmatize(word) for word in words)

    @classmethod
    def _chunk_sentences(cls, text: str) -> list:
        """Split the text into sentence chunks of at most CHUNK_SIZE characters."""
        chunks, current, current_len = [], [], 0
        for sentence in sent_tokenize(text):
            if current_len + len(sentence) < cls.CHUNK_SIZE:
                current.append(sentence)
                current_len += len(sentence)
            else:
                # Guard against appending an empty first chunk when a single
                # sentence alone exceeds CHUNK_SIZE.
                if current:
                    chunks.append(" ".join(current))
                current = [sentence]
                current_len = len(sentence)
        # Bug fix: the original dropped the trailing chunk entirely.
        if current:
            chunks.append(" ".join(current))
        return chunks

    @classmethod
    def _collect_top_sentences(cls, sentences) -> tuple:
        """Collect sentences whose positive/negative confidence exceeds THRESHOLD.

        Returns (positives, negatives) as lists of dicts with the sentence text
        and its score.
        """
        positives, negatives = [], []
        for sentence in sentences:
            positive_score = sentence.confidence_scores.positive
            negative_score = sentence.confidence_scores.negative
            if sentence.sentiment == "positive" and positive_score > cls.THRESHOLD:
                positives.append(
                    {"sentence": sentence.text, "positive_score": positive_score}
                )
            elif sentence.sentiment == "negative" and negative_score > cls.THRESHOLD:
                negatives.append(
                    {"sentence": sentence.text, "negative_score": negative_score}
                )
        return positives, negatives

    @classmethod
    def _format_top_sentences(cls, top_sentences: list, score_key: str) -> str:
        """Sort by score, keep the TOP_N best, translate to Greek, and format.

        Bug fix: the original zipped the positive and negative lists together,
        truncating to the shorter list and dropping sentences from the longer one.
        Each list is now formatted independently.
        """
        best = sorted(top_sentences, key=lambda s: s[score_key], reverse=True)[
            : cls.TOP_N
        ]
        output = ""
        for item in best:
            translated = translate(
                text=item["sentence"], detected_lang="en", target_lang="el"
            )
            output += translated + " " + str(item[score_key]) + "\n"
        return output

    def extract_sentiment(self, text: str, input_language: str) -> dict:
        """
        Summary:
        Calculate the sentiment analysis of the transcription file

        Args:
        text: str, the transcription to analyze
        input_language: str, the language of the transcription ("en" or "el");
            non-English text is translated to English before analysis

        Returns:
        result: dict, the sentiment analysis of the transcription file and the confidence scores

        Raises:
        ValueError: if input_language is not one of the supported languages
        """
        result = {}

        # Load the Azure Text Analytics client
        credential = AzureKeyCredential(self.language_key)
        text_analytics_client = TextAnalyticsClient(
            endpoint=self.language_endpoint, credential=credential
        )

        # Validate the language, then normalize the text to English.
        available_languages = ["en", "el"]
        if input_language not in available_languages:
            raise ValueError("The input language is not supported. Please use one of the following languages: " + ", ".join(available_languages))
        text = translate(text=text, target_lang="en", detected_lang=input_language)

        # Text cleaning (lowercase, stopword removal, lemmatization).
        text = self._clean_text(text)

        if len(text) > self.MAX_LENGTH:
            # Analyze the text chunk by chunk and aggregate the scores.
            chunks = self._chunk_sentences(text)
            positive = neutral = negative = 0.0
            sentiments, top_positive, top_negative = [], [], []
            for chunk in chunks:
                analysis = text_analytics_client.analyze_sentiment(documents=[chunk])[0]
                sentiments.append(analysis.sentiment)
                positive += analysis.confidence_scores.positive
                neutral += analysis.confidence_scores.neutral
                negative += analysis.confidence_scores.negative

                chunk_pos, chunk_neg = self._collect_top_sentences(analysis.sentences)
                top_positive.extend(chunk_pos)
                top_negative.extend(chunk_neg)

            # Majority vote over the per-chunk labels; average the scores.
            result["sentiment"] = max(set(sentiments), key=sentiments.count)
            result["sentiment_confidence_scores"] = {
                "positive": round(positive / len(sentiments), 2),
                "neutral": round(neutral / len(sentiments), 2),
                "negative": round(negative / len(sentiments), 2),
            }
            result["top_positive_sentences"] = self._format_top_sentences(
                top_positive, "positive_score"
            )
            result["top_negative_sentences"] = self._format_top_sentences(
                top_negative, "negative_score"
            )
        else:
            # The text fits in a single request; analyze it as one document.
            analysis = text_analytics_client.analyze_sentiment(documents=[text])[0]
            top_positive, top_negative = self._collect_top_sentences(analysis.sentences)

            result["sentiment"] = analysis.sentiment
            result["sentiment_positive_confidence_score"] = (
                analysis.confidence_scores.positive
            )
            result["sentiment_neutral_confidence_score"] = (
                analysis.confidence_scores.neutral
            )
            result["sentiment_negative_confidence_score"] = (
                analysis.confidence_scores.negative
            )
            result["top_positive_sentences"] = self._format_top_sentences(
                top_positive, "positive_score"
            )
            result["top_negative_sentences"] = self._format_top_sentences(
                top_negative, "negative_score"
            )

        return result


def main():
    """Run sentiment analysis over every .txt transcript in the labeled folder.

    For each transcript (assumed Greek) it extracts sentiment twice: once from
    the original Greek text and once from an English translation, then prints
    both results side by side.
    """
    start = time.time()

    # Load the Azure credentials from the environment (.env file).
    load_dotenv()
    key = os.getenv("AZURE_LANGUAGE_KEY")
    endpoint = os.getenv("AZURE_LANGUAGE_ENDPOINT")

    analyzer = SentimentAnalyzer(key, endpoint)

    transcripts_folder = "./new_labeled_transcripts/"
    for name in os.listdir(transcripts_folder):
        # Guard clause: only process transcript text files.
        if not name.endswith(".txt"):
            continue
        path = os.path.join(transcripts_folder, name)

        # Read the transcript file (expected to be in Greek).
        # The with-statement closes the file; no explicit close() needed.
        with open(path, "r", encoding="utf8") as f:
            greek_transcript = f.read()

        # Extract the sentiment from the original Greek transcript.
        greek_result = analyzer.extract_sentiment(greek_transcript, "el")

        # Translate to English and extract the sentiment again.
        english_transcript = translate(greek_transcript, "en", "el")
        # Pause between service calls to avoid rate limiting.
        time.sleep(10)
        english_result = analyzer.extract_sentiment(english_transcript, "en")

        # Print the results for this transcript.
        print(path)
        print('Greek:\n', greek_result)
        print('\nEnglish:\n', english_result)
        print("\n\n")

    print("\n\nExecution Time:", int(time.time() - start), "seconds")


if __name__ == "__main__":
    main()
# (end of file)