library(tidyverse)
library(quanteda)
library(lubridate)
library(quanteda.textmodels)
library(lexicon)
library(stm)

# Structural topic model over RfC comment text.
#
# Pipeline: read the comment CSV -> build a quanteda corpus -> tokenize ->
# drop stopwords -> lemmatize -> convert to stm input -> load (or fit) a
# K-topic STM with comment timestamp as prevalence covariate -> print the
# comments that load most heavily on topic 1.

# Load data ----

rfc_df <- read.csv(
  "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0220_ve_rfcs_text.csv"
)
# seq_len() rather than 1:nrow() so an empty file yields an empty id vector.
rfc_df$doc_id <- seq_len(nrow(rfc_df))

# Some cleaning around the timestamp of the comment: strip the literal
# "(UTC)" marker, then parse as "hour:minute, day month year".
rfc_df$posix_timestamp <- parse_date_time(
  gsub("\\(UTC\\)", "", rfc_df$date_created),
  orders = "HM, dmy",
  tz = "UTC"
)

# Corpus and tokens ----

rfc_corp <- corpus(
  rfc_df$comment_text,
  docvars = rfc_df,
  docnames = rfc_df$doc_id
)

rfc_tokens <- tokens(
  rfc_corp,
  what = "word",
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE,
  include_docvars = TRUE
)

# Stopwords ----

# Remove not only English stopwords but some bespoke ones too; author names
# are included to take out references to other authors. Names are
# de-duplicated and NA / empty entries dropped so they are not used as
# removal patterns.
author_names <- as.character(rfc_df$Author)
author_names <- unique(author_names[!is.na(author_names) & nzchar(author_names)])
additional_stopwords <- c("talk", "user", "n", "utc", author_names)
custom_stopwords <- c(stopwords("english"), additional_stopwords)

# Document-feature matrix ----

rfc_dfm <- rfc_tokens |>
  dfm() |>
  dfm_select(
    pattern = custom_stopwords,
    selection = "remove",
    valuetype = "fixed"
  )

# Map each token to its lemma using the lexicon::hash_lemmas table.
rfc_dfm_lemmatized <- dfm_replace(
  rfc_dfm,
  pattern = lexicon::hash_lemmas$token,
  replacement = lexicon::hash_lemmas$lemma
)

rfc_feature_counts <- colSums(rfc_dfm_lemmatized)

# Store the document name as a docvar so it is carried into the stm metadata
# and can be used below to match model rows back to corpus texts.
docvars(rfc_dfm_lemmatized)$doc_id <- docnames(rfc_dfm_lemmatized)
# saveRDS(rfc_dfm_lemmatized, file = "text_analysis/case1/030125_rfc_dfm.rds")

rfc_dfm_stm <- convert(
  rfc_dfm_lemmatized,
  to = "stm",
  docvars = docvars(rfc_dfm_lemmatized)
)

# Run the model ----

K <- 5
seed <- 9021000
model_path <- "text_analysis/case1/030125_ve_rfc_stm.rds"

# `model` must exist before it is plotted / labelled below. Reuse the saved
# fit when there is one; otherwise fit and save it.
if (file.exists(model_path)) {
  model <- readRDS(model_path)
} else {
  model <- stm(
    documents = rfc_dfm_stm$documents,
    vocab = rfc_dfm_stm$vocab,
    K = K,
    seed = seed,
    # Use the converted metadata, not rfc_df: the conversion may drop empty
    # documents, and the covariate rows have to line up with `documents`.
    data = rfc_dfm_stm$meta,
    prevalence = ~posix_timestamp,
    verbose = TRUE
  )
  saveRDS(model, file = model_path)
}

# Guard against a stale saved model fit on a different set of documents;
# the theta rows are bound column-wise to the document texts below.
stopifnot(nrow(model$theta) == length(rfc_dfm_stm$documents))

plot(model)

labelTopics(model, topics = c(5, 4, 2, 3, 1), n = 10)

# Inspect top documents ----

# One row per modelled document: raw text, timestamp and topic proportions.
# model$theta has no column names, so data.frame() names them X1, X2, ...
results <- data.frame(
  text = as.character(
    corpus_subset(rfc_corp, docnames(rfc_corp) %in% rfc_dfm_stm$meta$doc_id)
  ),
  date = rfc_dfm_stm$meta$posix_timestamp,
  model$theta
)

# Print the five comments with the highest topic-1 proportion, one per line.
topic1_order <- order(-results$X1)
for (rank in seq_len(min(5L, nrow(results)))) {
  cat(results[topic1_order, "text"][rank], "\n", sep = "")
}