Untitled
import gensim
from gensim import corpora
from gensim.models import CoherenceModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.util import ngrams
from nltk import word_tokenize
import string
import pyLDAvis
import pyLDAvis.gensim_models
import networkx as nx
import nltk


class ldaModel:
    def __init__(self):
        # LIWC categories to remove
        self.cat_remove = ['funct', 'article', 'preps', 'conj', 'present', 'past',
                           'auxverb', 'relativ', 'pronoun', 'quant', 'ipron',
                           'time', 'cogmech']

        # Tokenization and stopword resources
        self.tokenizer = RegexpTokenizer(r'\w+')
        self.words_ly = [word for word in nltk.corpus.words.words() if word.endswith('ly')]
        self.tagged_words = nltk.pos_tag(nltk.word_tokenize(" ".join(stopwords.words('english'))))
        self.adverbs = [word for word, pos in self.tagged_words if pos == 'RB']
        self.additional_words = {
            'room', 'hotel', 'like', 'recommended', 'nice', 'price', 'people', 'go',
            'really', 'excellent', 'table', 'horrible', 'best', 'bad', 'euro', 'great',
            'many', 'went', 'give', 'un', 'met', 'del', 'de', 'la', 'place', 'lol',
            'good', 'ever', 'us', 'would', '2024', '2023', 'also', 'back', 'last',
            'almost', 'around', 'I', 'am', 'one', 'much', 'today', 'better', 'is',
            'are', 'will', 'a', 'the', 'and', 'in', 'on', 'at', 'to', 'for', 'with',
            'it', 'of', 'that', 'this', 'as', 'by', 'from', 'was', 'were', 'an', 'be',
            'first', 'day', 'video', 'find', 'watch', 'time', 'episode', 'seen',
            'even', 'every', 'gets', 'season', 'made', 'dont', 'new', 'highly',
            'homemade', 'lot', 'pretty', 'barrio', 'recommend', 'menu', 'service',
            'location', 'food', 'selection', 'atmosphere', 'fast', 'always', 'big',
            'option', 'get', 'share', 'group', 'makes', 'some', 'entertaining',
            'competition', 'well', 'decorated', 'going', 'here', 'very', 'make', 'you',
        }
        self.en_stop = set(
            stopwords.words('english')
            + list(self.additional_words)
            + [str(num) for num in range(100)]
            + self.words_ly
            + self.adverbs
        )
        self.lemma = WordNetLemmatizer()

        # Model state, filled in by get_comments_with_topics
        self.lda_model = None
        self.corpus = None
        self.dictionary = None

    def preprocess_fda(self, text):
        # Tokenize, then drop stopwords and punctuation and lemmatize
        text = text.replace('\n\n', '')
        tokens = word_tokenize(text.lower())
        filtered_tokens = [self.lemma.lemmatize(word) for word in tokens
                           if word not in self.en_stop and word not in string.punctuation]

        # Keep tokens whose first WordNet synset has a part of speech outside cat_remove
        # (WordNet pos tags are 'n', 'v', 'a', 's', 'r', so in practice this keeps
        # every token that has at least one synset)
        tokens_filtered = []
        for token in filtered_tokens:
            synsets = wn.synsets(token)
            if synsets:
                synset = synsets[0]  # Take the first synset
                if synset.pos() not in self.cat_remove:
                    tokens_filtered.append(token)

        # Handle negations: prefix the word following "not" with "not_"
        # (only applies when "not" survives the stopword filter above)
        tokens_neg_handled = []
        negation = False
        for token in tokens_filtered:
            if token == 'not':
                negation = True
            elif negation:
                tokens_neg_handled.append("not_" + token)
                negation = False
            else:
                tokens_neg_handled.append(token)
                negation = False

        # Lemmatization
        tokens_lemmatized = [self.lemma.lemmatize(token) for token in tokens_neg_handled]

        # Generate n-grams (unigrams, bigrams, and trigrams)
        n_grams = []
        for n in range(1, 4):
            n_grams.extend([' '.join(gram) for gram in ngrams(tokens_lemmatized, n)])
        return n_grams

    def get_comments_with_topics(self, comments, place_name, cat_remove):
        # cat_remove is kept in the signature for compatibility; preprocessing uses self.cat_remove
        processed_comments = [self.preprocess_fda(comment) for comment in comments]

        # Create dictionary and corpus
        dictionary = corpora.Dictionary(processed_comments)
        corpus = [dictionary.doc2bow(comment) for comment in processed_comments]

        # Evaluate model coherence for 1 to 5 topics
        coherence_values = []
        for num_topics in range(1, 6):
            candidate = gensim.models.ldamodel.LdaModel(
                corpus, num_topics=num_topics, id2word=dictionary, passes=15)
            coherence_model_lda = CoherenceModel(
                model=candidate, texts=processed_comments,
                dictionary=dictionary, coherence='c_v')
            coherence_values.append((num_topics, coherence_model_lda.get_coherence()))

        # Select the optimal number of topics based on coherence scores
        optimal_num_topics = max(coherence_values, key=lambda x: x[1])[0]

        # Fit the final LDA model with the optimal number of topics and keep it on the instance
        lda_model_result = gensim.models.ldamodel.LdaModel(
            corpus, num_topics=optimal_num_topics, id2word=dictionary, passes=15)
        self.lda_model = lda_model_result
        self.corpus = corpus
        self.dictionary = dictionary

        # Label each topic with its most probable words
        topic_labels = {}
        for topic_idx in range(optimal_num_topics):
            top_words = lda_model_result.show_topic(topic_idx, topn=3)  # Adjust topn as needed
            topic_labels[topic_idx] = ', '.join([word for word, _ in top_words])

        # Assign each comment its most probable topic
        extracted_topics = []
        for idx, comment in enumerate(comments):
            topic_probabilities = lda_model_result.get_document_topics(corpus[idx])
            topic_idx, _ = max(topic_probabilities, key=lambda x: x[1])
            topic_label = topic_labels[topic_idx]
            extracted_topics.append(topic_label.split(", "))

        # Build a graph linking each comment to its topic words
        G = nx.Graph()
        for idx, (comment, topics) in enumerate(zip(comments, extracted_topics)):
            G.add_node(f'Comment_{idx}', label=comment, type='comment')
            for topic in topics:
                G.add_node(topic, type='topic')
                G.add_edge(f'Comment_{idx}', topic)

        return G, {"place_name": place_name, "topics": extracted_topics}
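For context, here is a minimal usage sketch that is not part of the original snippet. It assumes the NLTK data packages the class relies on (punkt, stopwords, wordnet, words, averaged_perceptron_tagger) have already been downloaded, and the review comments and place name below are invented purely for illustration.

if __name__ == "__main__":
    # Hypothetical review comments for a single place; replace with real data.
    sample_comments = [
        "The paella was fresh and the waiter explained every dish on offer.",
        "We waited forty minutes for a cold starter and the bill was wrong.",
        "Quiet terrace with a view of the cathedral, perfect for a long dinner.",
        "Huge portions of grilled octopus and a solid list of local wines.",
        "Loud music, sticky floors, and the sangria tasted watered down.",
        "Friendly staff helped us pick desserts and brought extra bread twice.",
    ]

    model = ldaModel()
    graph, summary = model.get_comments_with_topics(
        sample_comments, place_name="Casa Ejemplo", cat_remove=None)

    # Print the topic words assigned to each comment and the size of the graph
    print(summary["place_name"])
    for idx, topics in enumerate(summary["topics"]):
        print(f"Comment_{idx} -> {topics}")
    print("Graph:", graph.number_of_nodes(), "nodes,", graph.number_of_edges(), "edges")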