import gensim
from gensim import corpora
from gensim.models import CoherenceModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.util import ngrams
from nltk import word_tokenize
import string
import pyLDAvis
import pyLDAvis.gensim_models
import networkx as nx
import nltk
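# The NLTK corpora used below must be available locally. A typical one-time
# setup (an assumed step, not part of the original script) would be:
#   nltk.download('punkt')                        # word_tokenize
#   nltk.download('stopwords')                    # stopwords.words('english')
#   nltk.download('wordnet')                      # wn.synsets, WordNetLemmatizer
#   nltk.download('omw-1.4')                      # WordNet data on newer NLTK
#   nltk.download('words')                        # nltk.corpus.words
#   nltk.download('averaged_perceptron_tagger')   # nltk.pos_tag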
class ldaModel:
    def __init__(self):
        # LIWC categories to remove (duplicate entries dropped)
        self.cat_remove = ['funct', 'article', 'preps', 'conj', 'present', 'past',
                           'auxverb', 'relativ', 'pronoun', 'quant', 'ipron',
                           'time', 'cogmech']
        # Tokenization
        self.tokenizer = RegexpTokenizer(r'\w+')
        # Treat every '-ly' word in the NLTK word list as a stopword
        self.words_ly = [word for word in nltk.corpus.words.words() if word.endswith('ly')]
        # POS-tag the English stopword list to collect its adverbs
        self.tagged_words = nltk.pos_tag(nltk.word_tokenize(" ".join(stopwords.words('english'))))
        self.adverbs = [word for word, pos in self.tagged_words if pos == 'RB']
        # Domain-specific stopwords for the review corpus (duplicates removed;
        # 'I' lowercased so it matches the lowercased tokens)
        self.additional_words = {'room', 'hotel', 'like', 'recommended', 'nice', 'price', 'people', 'go',
                                 'really', 'excellent', 'table', 'horrible', 'best', 'bad', 'euro', 'great',
                                 'many', 'went', 'give', 'un', 'met', 'del', 'de', 'la', 'place', 'lol',
                                 'good', 'ever', 'us', 'would', '2024', '2023', 'also', 'back', 'last',
                                 'almost', 'around', 'i', 'am', 'one', 'much', 'today', 'better', 'is',
                                 'are', 'will', 'a', 'the', 'and', 'in', 'on', 'at', 'to', 'for', 'with',
                                 'it', 'of', 'that', 'this', 'as', 'by', 'from', 'was', 'were', 'an', 'be',
                                 'first', 'day', 'video', 'find', 'watch', 'time', 'episode', 'seen',
                                 'even', 'every', 'gets', 'season', 'made', 'dont', 'new', 'highly',
                                 'homemade', 'lot', 'pretty', 'barrio', 'recommend', 'menu', 'service',
                                 'location', 'food', 'selection', 'atmosphere', 'fast', 'always', 'big',
                                 'option', 'get', 'share', 'group', 'makes', 'some', 'entertaining',
                                 'competition', 'well', 'decorated', 'going', 'here', 'very', 'make', 'you'}
        self.en_stop = set(stopwords.words('english') + list(self.additional_words)
                           + [str(num) for num in range(100)]
                           + self.words_ly + self.adverbs)
        # Keep 'not' so the negation handling in preprocess_fda can see it;
        # it is an English stopword and would otherwise be filtered out first.
        self.en_stop.discard('not')
        self.lemma = WordNetLemmatizer()
        # Model state, populated by get_comments_with_topics
        self.lda_model = None
        self.corpus = None
        self.dictionary = None
    def preprocess_fda(self, text):
        # Tokenize (join paragraphs with a space so words are not glued together)
        text = text.replace('\n\n', ' ')
        tokens = word_tokenize(text.lower())
        # Drop stopwords and punctuation tokens
        filtered_tokens = [word for word in tokens
                           if word not in self.en_stop and word not in string.punctuation]
        # Remove function-word-like tokens. NOTE: synset.pos() returns WordNet
        # tags ('n', 'v', 'a', 's', 'r'), not LIWC category names, so testing it
        # against self.cat_remove can never match; the verb/adverb-style LIWC
        # categories are approximated here with WordNet POS tags (an assumption).
        # Tokens with no synsets at all are dropped, which also removes most
        # remaining function words.
        wn_pos_remove = {'v', 'r'}
        tokens_filtered = []
        for token in filtered_tokens:
            if token == 'not':
                tokens_filtered.append(token)  # keep for negation handling below
                continue
            synsets = wn.synsets(token)
            if synsets and synsets[0].pos() not in wn_pos_remove:
                tokens_filtered.append(token)
        # Handle negations: prefix the token that follows 'not' with "not_"
        tokens_neg_handled = []
        negation = False
        for token in tokens_filtered:
            if token == 'not':
                negation = True
            elif negation:
                tokens_neg_handled.append("not_" + token)
                negation = False
            else:
                tokens_neg_handled.append(token)
        # Lemmatize once, with the shared lemmatizer
        tokens_lemmatized = [self.lemma.lemmatize(token) for token in tokens_neg_handled]
        # Generate unigrams, bigrams, and trigrams
        n_grams = []
        for n in range(1, 4):
            n_grams.extend([' '.join(gram) for gram in ngrams(tokens_lemmatized, n)])
        return n_grams
    def get_comments_with_topics(self, comments, place_name):
        processed_comments = [self.preprocess_fda(comment) for comment in comments]
        # Create dictionary and corpus
        dictionary = corpora.Dictionary(processed_comments)
        corpus = [dictionary.doc2bow(comment) for comment in processed_comments]
        # Evaluate c_v coherence for 1 to 5 topics
        coherence_values = []
        for num_topics in range(1, 6):
            candidate = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics,
                                                        id2word=dictionary, passes=15)
            coherence_model_lda = CoherenceModel(model=candidate, texts=processed_comments,
                                                 dictionary=dictionary, coherence='c_v')
            coherence_values.append((num_topics, coherence_model_lda.get_coherence()))
        # Select the number of topics with the highest coherence score
        optimal_num_topics = max(coherence_values, key=lambda x: x[1])[0]
        # Fit the final LDA model with the optimal number of topics
        lda_model_result = gensim.models.ldamodel.LdaModel(corpus, num_topics=optimal_num_topics,
                                                           id2word=dictionary, passes=15)
        # Keep the fitted model, corpus, and dictionary on the instance
        self.lda_model = lda_model_result
        self.corpus = corpus
        self.dictionary = dictionary
        # Label each topic with its three most probable words
        topic_labels = {}
        for topic_idx in range(optimal_num_topics):
            top_words = lda_model_result.show_topic(topic_idx, topn=3)  # adjust topn as needed
            topic_labels[topic_idx] = ', '.join([word for word, _ in top_words])
        # Assign each comment its most probable topic under the final model
        extracted_topics = []
        for idx, comment in enumerate(comments):
            topic_probabilities = lda_model_result.get_document_topics(corpus[idx],
                                                                       minimum_probability=0.0)
            topic_idx, _ = max(topic_probabilities, key=lambda x: x[1])
            extracted_topics.append(topic_labels[topic_idx].split(", "))
        # Build a bipartite comment-topic graph
        G = nx.Graph()
        for idx, (comment, topics) in enumerate(zip(comments, extracted_topics)):
            G.add_node(f'Comment_{idx}', label=comment, type='comment')
            for topic in topics:
                G.add_node(topic, type='topic')
                G.add_edge(f'Comment_{idx}', topic)
        return G, {"place_name": place_name, "topics": extracted_topics}
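

# A minimal usage sketch (not part of the original script): the comments and
# place name below are illustrative, and real use would pass a much larger
# set of reviews so the coherence scan has something meaningful to score.
if __name__ == '__main__':
    model = ldaModel()
    sample_comments = [
        "The paella was amazing and the staff were friendly.",
        "Not worth the price, the wait was far too long.",
        "Lovely terrace, great tapas, would visit again.",
    ]
    graph, summary = model.get_comments_with_topics(sample_comments, "El Barrio")
    print(summary["place_name"], summary["topics"])
    print(f"{graph.number_of_nodes()} nodes, {graph.number_of_edges()} edges")

    # pyLDAvis is imported above but never used; a sketch of how the fitted
    # model could be visualized with it (assumes an HTML output is wanted):
    #   vis = pyLDAvis.gensim_models.prepare(model.lda_model, model.corpus, model.dictionary)
    #   pyLDAvis.save_html(vis, 'lda_topics.html')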