# mail@pastecode.io avatar
# 2 months ago
# 6.7 kB
import gensim
from gensim import corpora
from gensim.models import CoherenceModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.util import ngrams
from nltk import word_tokenize
import string
import pyLDAvis
import pyLDAvis.gensim_models
import networkx as nx
import nltk

class ldaModel:
    """Preprocess free-text comments, fit an LDA topic model and link comments to topics.

    Typical use is ``ldaModel().get_comments_with_topics(comments, place_name, cat_remove)``,
    which returns a graph connecting each comment to the top words of its
    dominant topic. After that call the fitted model, corpus and dictionary
    are available as ``self.lda_model``, ``self.corpus`` and ``self.dictionary``.
    """

    def __init__(self):
        """Build the stop-word set, tokenizer and lemmatizer used by preprocessing."""
        # LIWC-style category names to remove.
        # NOTE(review): preprocess_fda compares these against WordNet
        # ``Synset.pos()`` values, which look like single-letter tags
        # ('n', 'v', 'a', 's', 'r'); if so, no entry here can ever match and
        # the category filter is a no-op. Confirm the intended category source.
        self.cat_remove = ['funct', 'article', 'preps', 'conj', 'present', 'past', 'auxverb', 'relativ', 'pronoun',
                           'auxverb', 'quant', 'conj', 'ipron', 'article', 'time', 'past', 'cogmech']

        # Tokenization
        self.tokenizer = RegexpTokenizer(r'\w+')

        # Read the NLTK stop-word list once; it is reused below and by the
        # negation step in preprocess_fda.
        english_stop = stopwords.words('english')
        self._base_stop = frozenset(english_stop)

        self.words_ly = [word for word in nltk.corpus.words.words() if word.endswith('ly')]
        self.tagged_words = nltk.pos_tag(nltk.word_tokenize(" ".join(english_stop)))
        self.adverbs = [word for word, pos in self.tagged_words if pos == 'RB']
        self.additional_words = {'room', 'hotel', 'like', 'recommended', 'nice', 'price', 'people', 'go', 'really',
                                  'excellent', 'table', 'horrible', 'best', 'bad', 'euro', 'great', 'many', 'excellent',
                                  'went', 'give', 'un', 'met', 'del', 'de', 'la', 'place', 'lol', 'good', 'ever', 'us',
                                  'would', '2024', '2023', 'also', 'back', 'last', 'almost', 'around', 'I', 'am', 'us',
                                  'one', 'much', 'today', 'better', 'is', 'are', 'will', 'a', 'the', 'and', 'in', 'on',
                                  'at', 'to', 'for', 'with', 'it', 'of', 'that', 'this', 'as', 'by', 'from', 'was', 'were',
                                  'an', 'be', 'first', 'day', 'video', 'find', 'watch', 'time', 'episode', 'seen', 'even',
                                  'every', 'gets', 'season', 'made', 'dont', 'new', 'highly', 'homemade', 'lot', 'pretty',
                                  'barrio', 'recommend', 'menu', 'service', 'location', 'food', 'selection', 'atmosphere',
                                  'fast', 'always', 'big', 'option', 'get', 'share', 'group', 'makes', 'some', 'entertaining',
                                  'competition', 'well', 'decorated', 'pretty', 'going', 'here', 'very', 'make', 'you',}
        # Everything filtered out before lemmatization: NLTK stop words, the
        # domain words above, the numbers 0-99, '-ly' words and adverbs.
        self.en_stop = set(english_stop + list(self.additional_words) + [str(num) for num in range(100)]
                           + self.words_ly + self.adverbs)
        self.lemma = WordNetLemmatizer()

        # Populated by get_comments_with_topics
        self.lda_model = None
        self.corpus = None
        self.dictionary = None

    @staticmethod
    def _apply_negation(tokens, stop_words):
        """Return ``tokens`` with 'not' folded into a following stop word.

        A 'not' token is never emitted itself. If the token right after it is
        in ``stop_words`` it is emitted as ``"not_<token>"``; any other token is
        emitted unchanged. Either way the pending negation is cleared.
        """
        handled = []
        negation = False
        for token in tokens:
            if token == 'not':
                negation = True
                continue
            if negation and token in stop_words:
                handled.append("not_" + token)  # Prefix negated words with "not_"
            else:
                handled.append(token)
            negation = False
        return handled

    def preprocess_fda(self, text: str) -> list:
        """Turn one comment into a list of lemmatized 1-, 2- and 3-grams.

        Steps: lowercase and tokenize, drop stop words and punctuation-only
        tokens, lemmatize, apply the WordNet category filter, fold negations,
        lemmatize again, then build space-joined n-grams.

        Args:
            text: Raw comment text.

        Returns:
            List of n-gram strings (unigrams first, then bigrams, then trigrams).
        """
        # Replace paragraph breaks with a space so the words on either side
        # are not fused into one token.
        text = text.replace('\n\n', ' ')
        tokens = word_tokenize(text.lower())

        punctuation = set(string.punctuation)
        lemmatize = self.lemma.lemmatize
        filtered_tokens = [
            lemmatize(word)
            for word in tokens
            # Drop stop words and tokens made up solely of punctuation
            # (covers multi-character tokens such as "..." as well).
            if word not in self.en_stop and not all(ch in punctuation for ch in word)
        ]

        # Remove words whose first WordNet synset falls in an excluded category.
        # Words unknown to WordNet cannot belong to a category and are kept.
        # NOTE(review): see the note on self.cat_remove - this comparison
        # probably never excludes anything as written.
        tokens_filtered = []
        for token in filtered_tokens:
            synsets = wn.synsets(token)
            if not synsets or synsets[0].pos() not in self.cat_remove:
                tokens_filtered.append(token)

        # Handle negations.
        # NOTE(review): stop words were already removed above via self.en_stop,
        # so if NLTK's English list contains 'not' this step never fires.
        # Confirm whether negations should be exempt from stop-word removal.
        tokens_neg_handled = self._apply_negation(tokens_filtered, self._base_stop)

        # Second lemmatization pass, kept from the original pipeline.
        tokens_lemmatized = [lemmatize(token) for token in tokens_neg_handled]

        # Generate n-grams: unigrams, bigrams and trigrams
        n_grams = []
        for n in range(1, 4):
            n_grams.extend(' '.join(gram) for gram in ngrams(tokens_lemmatized, n))

        return n_grams

    def _fit_best_model(self, corpus, dictionary, texts):
        """Train LDA models with 1-5 topics and return the one with the best c_v coherence.

        The returned model is the exact instance that was scored, not a
        retrained one, so its topics match the coherence value that selected it.
        """
        candidates = {}
        coherence_values = []
        for num_topics in range(1, 6):
            model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)
            coherence_model = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
            candidates[num_topics] = model
            coherence_values.append((num_topics, coherence_model.get_coherence()))

        # A NaN score (value != value) would make max() order-dependent, so
        # prefer comparable scores and fall back to the full list otherwise.
        comparable = [pair for pair in coherence_values if pair[1] == pair[1]] or coherence_values
        optimal_num_topics = max(comparable, key=lambda pair: pair[1])[0]
        return candidates[optimal_num_topics]

    @staticmethod
    def _build_graph(comments, extracted_topics):
        """Return a graph with one node per comment, one per topic word, and edges between them."""
        G = nx.Graph()
        for idx, (comment, topics) in enumerate(zip(comments, extracted_topics)):
            comment_node = f'Comment_{idx}'
            G.add_node(comment_node, label=comment, type='comment')
            for topic in topics:
                G.add_node(topic, type='topic')
                G.add_edge(comment_node, topic)
        return G

    def get_comments_with_topics(self, comments, place_name, cat_remove):
        """Fit an LDA model on ``comments`` and link each comment to its dominant topic.

        Args:
            comments: Sequence of raw comment strings.
            place_name: Value echoed back under the ``"place_name"`` key.
            cat_remove: Accepted for interface compatibility; not used here
                (preprocessing reads ``self.cat_remove``).

        Returns:
            Tuple ``(graph, info)``. ``graph`` has a ``Comment_<idx>`` node per
            comment connected to the top three words of that comment's most
            probable topic. ``info`` is ``{"place_name": place_name, "topics":
            [...]}``, with one list of topic words per comment (empty when no
            topic could be assigned).
        """
        # The module-level name is still assigned for any external code that
        # reads it; the instance attribute is the preferred access path.
        global lda_model

        processed_comments = [self.preprocess_fda(comment) for comment in comments]

        # Create dictionary and corpus
        dictionary = corpora.Dictionary(processed_comments)
        corpus = [dictionary.doc2bow(comment) for comment in processed_comments]
        self.dictionary = dictionary
        self.corpus = corpus

        if len(dictionary) == 0:
            # Nothing survived preprocessing, so there is no vocabulary to
            # model. Return comments without topics instead of failing.
            extracted_topics = [[] for _ in comments]
        else:
            model = self._fit_best_model(corpus, dictionary, processed_comments)
            self.lda_model = model
            lda_model = model

            # Label each topic by its three most probable words
            topic_words = {
                topic_idx: [word for word, _ in model.show_topic(topic_idx, topn=3)]
                for topic_idx in range(model.num_topics)
            }

            # Pick the most probable topic for each comment, using the same
            # model the labels were taken from.
            extracted_topics = []
            for bow in corpus:
                topic_probabilities = model.get_document_topics(bow)
                if not topic_probabilities:
                    extracted_topics.append([])
                    continue
                topic_idx, _ = max(topic_probabilities, key=lambda pair: pair[1])
                extracted_topics.append(list(topic_words[topic_idx]))

        G = self._build_graph(comments, extracted_topics)

        return G, {"place_name": place_name, "topics": extracted_topics}
# Leave a Comment