Untitled

mail@pastecode.io avatar
unknown
plain_text
a month ago
2.1 kB
2
Indexable
Never
# Clean the raw comment text before building the corpus.
# The text is read from the 'Comment' column (the YouTube column name);
# other data sets may name it 'comment', so adjust to match the data in use.
clean_text <- yt_data$Comment |>
  # text normalisation: URLs, HTML, non-ASCII characters, elongated words,
  # internet slang and contractions
  replace_url() |>
  replace_html() |>
  replace_non_ascii() |>
  replace_word_elongation() |>
  replace_internet_slang() |>
  replace_contraction() |>
  # finally strip digits and punctuation
  removeNumbers() |>
  removePunctuation()


# Turn the cleaned character vector into a corpus, one document per element
text_corpus <- clean_text |>
  VectorSource() |>
  VCorpus()

# Spot-check the content of the first and fifth documents
text_corpus[[1]]$content
text_corpus[[5]]$content


# Perform further pre-processing on the corpus, in a single pass:
#   1. lower-case every document
#   2. drop the SMART stop-word list
#   3. collapse the runs of whitespace left behind by the removals
# The pass is deliberately not repeated: running the same three steps again
# on already-processed text only costs time.
text_corpus <- text_corpus |>
  tm_map(content_transformer(tolower)) |>
  tm_map(removeWords, stopwords(kind = "SMART")) |>
  # tm_map(stemDocument) |> # optional
  tm_map(stripWhitespace)

# Spot-check the same two documents after pre-processing
text_corpus[[1]]$content
text_corpus[[5]]$content


# Build the document-term matrix from the corpus, then keep only the rows
# (documents) that are referenced by at least one stored entry, so that
# documents left empty by the cleaning steps are dropped.
yt_doc_term_matrix <- text_corpus |>
  DocumentTermMatrix()

# Row indices that appear among the stored entries of the matrix
yt_non_zero_entries <- unique(yt_doc_term_matrix$i)

yt_dtm <- yt_doc_term_matrix[yt_non_zero_entries, ]


# Optional: checkpoint the matrix and free memory before fitting the model

# Keep a copy on disk so a later session can resume from here with
# load("yt_doc_term_matrix.RData") instead of rebuilding the matrix.
save(yt_dtm, file = "yt_doc_term_matrix.RData")

# Remove only the intermediates created by this script. Clearing the whole
# workspace (rm(list = ls(all.names = TRUE))) would also delete unrelated
# objects and hidden ones such as .Random.seed. intersect() skips names that
# do not exist, so rm() does not warn if a step above was not run.
# Add "yt_data" to the vector if the raw data should be released as well.
rm(list = intersect(
  c("clean_text", "text_corpus", "yt_doc_term_matrix", "yt_non_zero_entries"),
  ls()
))
gc()

# yt_dtm is still in memory at this point, so it does not need to be reloaded.


# Create LDA model with k topics.
# Model fitting starts from a random initialisation, so the seed is fixed via
# `control` to make the fitted topics reproducible between runs; change the
# value to explore alternative solutions.
lda_model <- LDA(yt_dtm, k = 6, control = list(seed = 1234))


# Generate topic probabilities for each word
# 'beta' shows the probability that this word was generated by that topic

found_topics <- tidy(lda_model, matrix = "beta")

# View() opens a data viewer, so only call it in an interactive session;
# this lets the script also be run non-interactively (e.g. via Rscript).
if (interactive()) {
  View(found_topics)
}


# Visualise the top 10 terms per topic

# Within each topic keep the 10 highest-beta terms, then order the result
# by topic and by descending beta.
top_terms <- found_topics |>
  group_by(topic) |>
  slice_max(order_by = beta, n = 10) |>
  ungroup() |>
  arrange(topic, desc(beta))

# Horizontal bar chart with one panel per topic. reorder_within() and
# scale_y_reordered() are used together so that terms are ordered by beta
# inside each panel.
top_terms |>
  mutate(term = reorder_within(term, beta, topic)) |>
  ggplot(aes(x = beta, y = term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()
Leave a Comment