Untitled
unknown
plain_text
10 months ago
2.1 kB
3
Indexable
# Topic modelling of YouTube comments:
#   clean text -> corpus -> document-term matrix -> LDA -> top-terms plot.
# NOTE(review): assumes `yt_data` and the packages that provide the functions
# used below (textclean, tm, topicmodels, tidytext, dplyr, ggplot2) have been
# loaded earlier in the file -- TODO confirm.

# Clean the raw comment text ----
# The column is named 'Comment' for YouTube data (change from 'comment').
clean_text <- yt_data$Comment |>
  replace_url() |>
  replace_html() |>
  replace_non_ascii() |>
  replace_word_elongation() |>
  replace_internet_slang() |>
  replace_contraction() |>
  removeNumbers() |>
  removePunctuation()

# Convert clean_text vector into a document corpus (collection of documents)
text_corpus <- VCorpus(VectorSource(clean_text))

# Spot-check two documents before further pre-processing
text_corpus[[1]]$content
text_corpus[[5]]$content

# Perform further pre-processing ----
# Lower-case first so that stop-word matching is case-insensitive; removing
# words leaves gaps, so whitespace is collapsed last. This pipeline is run
# once: a second identical pass would not change the corpus.
text_corpus <- text_corpus |>
  tm_map(content_transformer(tolower)) |>
  tm_map(removeWords, stopwords(kind = "SMART")) |>
  # tm_map(stemDocument) |> # optional
  tm_map(stripWhitespace)

# Spot-check the same two documents after pre-processing
text_corpus[[1]]$content
text_corpus[[5]]$content

# Transform corpus into a Document Term Matrix and remove 0 entries ----
# `$i` holds the row (document) index of every non-zero cell, so its unique
# values are exactly the documents that still contain at least one term.
# LDA() cannot be fitted on documents with no terms.
yt_doc_term_matrix <- DocumentTermMatrix(text_corpus)
yt_non_zero_entries <- unique(yt_doc_term_matrix$i)
yt_dtm <- yt_doc_term_matrix[yt_non_zero_entries, ]

# Optional: free memory before model fitting ----
# Persist the matrix so a later session can load("yt_doc_term_matrix.RData")
# instead of rebuilding it. Only the intermediates created above are removed;
# the rest of the workspace (including `yt_data`) is left untouched.
save(yt_dtm, file = "yt_doc_term_matrix.RData")
rm(clean_text, text_corpus, yt_doc_term_matrix, yt_non_zero_entries)
gc()

# Create LDA model with k topics ----
# A fixed seed makes the fitted topics reproducible between runs.
n_topics <- 6
lda_model <- LDA(yt_dtm, k = n_topics, control = list(seed = 1234L))

# Generate topic probabilities for each word
# 'beta' shows the probability that this word was generated by that topic
found_topics <- tidy(lda_model, matrix = "beta")

# The data viewer is only available in interactive sessions
if (interactive()) {
  View(found_topics)
}

# Visualise the top 10 terms per topic ----
top_terms <- found_topics |>
  group_by(topic) |>
  slice_max(beta, n = 10) |>
  ungroup() |>
  arrange(topic, desc(beta))

# reorder_within() + scale_y_reordered() order the terms by beta separately
# inside each facet rather than globally.
top_terms |>
  mutate(term = reorder_within(term, beta, topic)) |>
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()
Editor is loading...
Leave a Comment