# Notebook export in the Jupytext "percent" layout: each `# %%` line starts a
# new cell, and the `%%time` cell magics are kept as comments so that the file
# is also valid plain Python.

# %%
# %%time
import re

import numpy as np
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# Compiled once at cell level instead of once per row inside clean_title().
_PLACEHOLDER_RE = re.compile(r'\[deleted\]|\[removed\]')
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')


def clean_title(title: str) -> str:
    """Return *title* without placeholder markers and special characters.

    The literal markers ``[deleted]`` and ``[removed]`` are dropped first,
    then every character that is not an ASCII letter, a digit or whitespace
    is removed, and finally leading/trailing whitespace is stripped.

    Args:
        title: Raw post title.

    Returns:
        The cleaned title; may be an empty string.
    """
    title = _PLACEHOLDER_RE.sub('', title)
    title = _NON_ALNUM_RE.sub('', title)
    return title.strip()


# Rows with a missing title are dropped first so that clean_title() is only
# applied to the remaining values; `all_posts` is modified in place.
all_posts.dropna(subset=['title'], inplace=True)
all_posts['title'] = all_posts['title'].apply(clean_title)

# %%
# %%time
import os

# Switch off Hugging Face tokenizers parallelism for this session.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import random

import nltk
import numpy as np
from bertopic import BERTopic
from hdbscan import HDBSCAN
# Aliased so the corpus reader and the `stopwords` list built below do not
# share a single name.
from nltk.corpus import stopwords as nltk_stopwords
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

# Dimensionality reduction. This is the only component in the pipeline that is
# given a seed (random_state=41).
umap_model = UMAP(
    n_neighbors=3,
    n_components=3,
    min_dist=0.05,
    metric='cosine',
    low_memory=False,
    random_state=41,
)

# Clustering. No seed is passed: HDBSCAN does not accept a random_state
# argument.
hdbscan_model = HDBSCAN(
    min_cluster_size=300,
    min_samples=10,
    gen_min_span_tree=True,
    prediction_data=True,
)

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# English stop words plus a few URL/HTML leftovers.
stopwords = [*nltk_stopwords.words('english'), 'http', 'https', 'amp', 'com']

# Topic representation is built from unigrams and bigrams, ignoring the stop
# words above.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=stopwords)

# Sentence embedding model used to embed the titles.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Assemble the BERTopic pipeline from the components defined above.
model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    top_n_words=10,
    language='english',
    calculate_probabilities=True,
    verbose=True,
    nr_topics=10,
)

# Fit the model on the titles of `neg_posts`. BERTopic documents its input as
# a list of strings, so a plain list is passed rather than a pandas Series.
# NOTE(review): `neg_posts` is defined outside this section; the cleaning
# above is applied to `all_posts` only - confirm `neg_posts` titles are clean.
topics, probs = model.fit_transform(neg_posts['title'].tolist())

freq2 = model.get_topic_info()
freq2

# %%
# %%time
model.visualize_barchart(top_n_topics=10)
