Untitled

mail@pastecode.io avatar
unknown
plain_text
5 months ago
2.4 kB
6
Indexable
%%time
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import re
import numpy as np

# Drop every row whose 'title' is missing, mutating all_posts in place, so the
# string cleaning applied to this column further down never receives a null.
all_posts.dropna(subset=['title'], inplace=True)
# Function to clean titles
def clean_title(title):
    """Return *title* with placeholder markers and special characters removed.

    Two substitutions run in order: the literal markers ``[deleted]`` and
    ``[removed]`` are dropped first, then every character that is not an
    ASCII letter, a digit, or whitespace. Leading and trailing whitespace is
    trimmed from the result; interior whitespace is left untouched, so a
    symbol removed from between two spaces leaves both spaces behind.
    """
    patterns = (
        r'\[deleted\]|\[removed\]',  # placeholder markers
        r'[^a-zA-Z0-9\s]',  # anything but ASCII letters, digits, whitespace
    )
    cleaned = title
    for pattern in patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.strip()


# Overwrite the 'title' column with its cleaned version by applying
# clean_title to every value (plain column assignment; .loc is not used here).
all_posts['title'] = all_posts['title'].apply(clean_title)


%%time
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import random
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from bertopic import BERTopic
import numpy as np

# Dimensionality reduction: UMAP configured with an explicit random_state so
# the same seed is used on every run.
umap_model = UMAP(
    n_neighbors=3,
    n_components=3,
    min_dist=0.05,
    metric='cosine',
    low_memory=False,
    random_state=41,
)

# Clustering: HDBSCAN is built without a seed because it does not accept a
# random_state argument.
hdbscan_model = HDBSCAN(
    min_cluster_size=300,
    min_samples=10,
    gen_min_span_tree=True,
    prediction_data=True,
)

# Download NLTK stopwords if not already downloaded
import nltk
nltk.download('stopwords')

# English stopwords plus four extra tokens to ignore. The list gets its own
# name instead of rebinding `stopwords`: that name is the NLTK corpus reader
# imported at the top of this cell, and overwriting it with a list would make
# any later `stopwords.words(...)` call fail.
custom_stopwords = list(stopwords.words('english')) + ['http', 'https', 'amp', 'com']

# Bag-of-words vectorizer over unigrams and bigrams with the stopwords
# filtered out. CountVectorizer takes no random_state, so there is nothing to
# seed here.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=custom_stopwords)

# Sentence-embedding model used to encode the documents.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Assemble the topic model from the components built earlier in this cell.
# BERTopic itself is given no seed here; the only pinned seed is the
# random_state set on the UMAP model.
model = BERTopic(
    # Pipeline components
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    # Topic representation and reduction settings
    language='english',
    top_n_words=10,
    nr_topics=10,
    # Runtime behaviour
    calculate_probabilities=True,
    verbose=True,
)

# Fit BERTopic model to the data. The titles are passed as a plain list of
# strings, the input type fit_transform documents, so the model does not
# receive a pandas Series carrying whatever row index neg_posts has.
topics, probs = model.fit_transform(neg_posts['title'].tolist())


# Topic overview returned by get_topic_info; the bare name on the last line
# displays it when this runs as a notebook cell.
freq2 = model.get_topic_info()
freq2

%%time
# Plot BERTopic's per-topic bar charts, restricted via top_n_topics to 10
# topics (a variant with top_n_topics = 12 was tried previously).
model.visualize_barchart(top_n_topics = 10)
Leave a Comment