Untitled
unknown
plain_text
a year ago
2.4 kB
14
Indexable
%%time
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import re
import numpy as np
# Drop rows whose 'title' is missing, modifying all_posts in place.
# clean_title() passes each title to re.sub, which requires a string.
all_posts.dropna(subset=['title'], inplace=True)
def clean_title(title):
    """Return *title* stripped of placeholder markers and special characters.

    The literal markers ``[deleted]`` and ``[removed]`` are removed first.
    Every character that is not an ASCII letter, a digit or whitespace is
    then deleted (this includes punctuation and non-ASCII letters), and
    leading/trailing whitespace is trimmed. Whitespace inside the title is
    left untouched, so removing a marker from the middle of a title can
    leave two consecutive spaces.

    Args:
        title: The raw title; must be a ``str``.

    Returns:
        The cleaned title, which may be the empty string.
    """
    # Remove the markers before the generic pass: once the square brackets
    # are stripped, "[deleted]" would otherwise survive as the word "deleted".
    title = re.sub(r'\[deleted\]|\[removed\]', '', title)
    # Delete (do not replace with a space) everything outside [a-zA-Z0-9\s].
    title = re.sub(r'[^a-zA-Z0-9\s]', '', title)
    return title.strip()
# Apply clean_title to every value of the 'title' column and write the
# result back to the same column (plain column assignment, not .loc).
all_posts['title'] = all_posts['title'].apply(clean_title)
%%time
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import random
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from bertopic import BERTopic
import numpy as np
# Dimensionality reduction. UMAP is the only component below that receives
# a random_state.
umap_model = UMAP(
    n_neighbors=3,
    n_components=3,
    min_dist=0.05,
    metric='cosine',
    low_memory=False,
    random_state=41,
)

# Clustering. HDBSCAN is built without a fixed random state because it does
# not accept a random_state argument.
hdbscan_model = HDBSCAN(
    min_cluster_size=300,
    min_samples=10,
    gen_min_span_tree=True,
    prediction_data=True,
)

# Make sure the NLTK stopword corpus is available locally.
import nltk
nltk.download('stopwords')

# English stopwords plus a few URL/HTML leftovers.
# NOTE: this rebinds the name `stopwords` from the NLTK corpus reader imported
# at the top of the cell to a plain list; the name is kept because later code
# may rely on it. Re-import `nltk.corpus.stopwords` before calling `.words()`
# on it again.
stopwords = list(stopwords.words('english')) + ['http', 'https', 'amp', 'com']

# Bag-of-words model (unigrams and bigrams) used for the topic
# representations. CountVectorizer takes no random state.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=stopwords)

# Sentence embedding model.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Assemble the BERTopic pipeline from the components above. BERTopic itself
# is not given a random state here; only the UMAP step is seeded.
model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    top_n_words=10,
    language='english',
    calculate_probabilities=True,
    verbose=True,
    nr_topics=10,
)

# Fit BERTopic on the titles. BERTopic documents its input as a list of
# strings, so the column is materialised as a plain list instead of being
# passed as a pandas Series (whose index may be non-contiguous if neg_posts
# is a filtered frame -- TODO confirm how neg_posts is built).
topics, probs = model.fit_transform(list(neg_posts['title']))
freq2 = model.get_topic_info()
freq2
%%time
# Bar chart for the top 10 topics of the fitted model
# (a previous run used top_n_topics=12).
model.visualize_barchart(top_n_topics=10)
Leave a Comment