Untitled
unknown
plain_text
a year ago
2.1 kB
5
Indexable
# Build a cleaned character vector from the raw comment column.
# The column is named 'Comment' for YouTube data; use 'comment' for sources
# that name it in lower case.
# Normalisation happens first (URLs, HTML, non-ASCII characters, elongated
# words, internet slang, contractions); digits and punctuation are stripped
# last so the earlier steps still see the original tokens.
clean_text <- yt_data$Comment |>
  replace_url() |>
  replace_html() |>
  replace_non_ascii() |>
  replace_word_elongation() |>
  replace_internet_slang() |>
  replace_contraction() |>
  removeNumbers() |>
  removePunctuation()
# Convert clean_text vector into a document corpus (collection of documents)
text_corpus <- VCorpus(VectorSource(clean_text))

# Spot-check two documents before further pre-processing
text_corpus[[1]]$content
text_corpus[[5]]$content

# Perform further pre-processing: lower-case, drop SMART stop words and
# collapse runs of whitespace. One pass is enough: every step is idempotent,
# so running the same pipeline a second time only costs time.
text_corpus <- text_corpus |>
  tm_map(content_transformer(tolower)) |>
  tm_map(removeWords, stopwords(kind = "SMART")) |>
  # tm_map(stemDocument) |> # optional
  tm_map(stripWhitespace)

# Spot-check the same two documents after pre-processing
text_corpus[[1]]$content
text_corpus[[5]]$content
# Transform corpus into a Document Term Matrix and remove 0 entries, i.e.
# documents that have no terms left after cleaning.
yt_doc_term_matrix <- DocumentTermMatrix(text_corpus)

# `$i` holds the row (document) index of every non-zero cell, so its unique
# values are exactly the non-empty documents. Sorting keeps the documents in
# their original corpus order.
yt_non_zero_entries <- sort(unique(yt_doc_term_matrix$i))
yt_dtm <- yt_doc_term_matrix[yt_non_zero_entries, ]
# Optional: checkpoint the matrix, then remove the large intermediate objects
# and run garbage collection for faster processing.
# Only the intermediates built by this script are removed; wiping the whole
# workspace would also destroy yt_data and anything else already loaded.
# To resume in a fresh session, run load("yt_doc_term_matrix.RData").
save(yt_dtm, file = "yt_doc_term_matrix.RData")
rm(list = intersect(
  c("clean_text", "text_corpus", "yt_doc_term_matrix", "yt_non_zero_entries"),
  ls()
))
gc()
# Create LDA model with k topics.
# Fitting starts from a random initialisation, so a fixed seed is passed to
# make the discovered topics reproducible between runs.
lda_model <- LDA(yt_dtm, k = 6, control = list(seed = 1234))

# Generate topic probabilities for each word
# 'beta' shows the probability that this word was generated by that topic
found_topics <- tidy(lda_model, matrix = "beta")

# The data viewer only makes sense in an interactive session; skip it when
# the script is run non-interactively (e.g. via Rscript).
if (interactive()) {
  View(found_topics)
}
# Keep the 10 highest-beta terms within each topic, ordered by topic and then
# by descending beta.
top_terms <- found_topics |>
  group_by(topic) |>
  slice_max(beta, n = 10) |>
  ungroup() |>
  arrange(topic, desc(beta))

# One bar-chart panel per topic. reorder_within() / scale_y_reordered() sort
# the terms by beta independently inside every facet.
top_terms |>
  mutate(term = reorder_within(term, beta, topic)) |>
  ggplot(aes(x = beta, y = term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()
Editor is loading...
Leave a Comment