1
0
mw-lifecycle-analysis/text_analysis/case1/case1_stm.R
2025-03-01 17:08:16 -08:00

70 lines
2.5 KiB
R

library(tidyverse)
library(quanteda)
library(lubridate)
library(quanteda.textmodels)
library(lexicon)
library(stm)
# Load the RfC comment export; the columns used further down in this script
# are comment_text, date_created and Author.
rfc_df <- read.csv("/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0220_ve_rfcs_text.csv")
# Sequential document ids. seq_len() (rather than 1:nrow()) stays correct when
# the file has zero rows, where 1:0 would produce c(1, 0) and fail to assign.
rfc_df$doc_id <- seq_len(nrow(rfc_df))
# some cleaning around the timestamp of the comment made: strip the literal
# "(UTC)" marker, then parse the remainder as hour-minute, day-month-year
rfc_df$posix_timestamp <- parse_date_time(
  gsub("\\(UTC\\)", "", rfc_df$date_created),
  orders = "HM, dmy",
  tz = "UTC"
)
# Build the corpus from the comment text: documents are named by doc_id and
# every column of rfc_df travels along as a document variable.
rfc_corp <- corpus(
  rfc_df$comment_text,
  docnames = rfc_df$doc_id,
  docvars = rfc_df
)

# Word-level tokenisation that discards punctuation, symbols, numbers, URLs
# and separators, keeping the document variables attached to the tokens.
rfc_tokens <- rfc_corp |>
  tokens(
    what = "word",
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE,
    remove_url = TRUE,
    remove_separators = TRUE,
    include_docvars = TRUE
  )
# Stopword list: the standard English stopwords plus some bespoke tokens and
# every value of the Author column, so that references to other authors are
# taken out of the documents.
additional_stopwords <- c("talk", "user", "n", "utc", rfc_df$Author)
custom_stopwords <- c(stopwords("english"), additional_stopwords)

# Document-feature matrix with the stopwords removed by exact (fixed) match.
rfc_dfm <- dfm_select(
  dfm(rfc_tokens),
  pattern = custom_stopwords,
  selection = "remove",
  valuetype = "fixed"
)
# Map each feature onto its lemma using the lexicon::hash_lemmas lookup table.
rfc_dfm_lemmatized <- dfm_replace(
  rfc_dfm,
  pattern = lexicon::hash_lemmas[["token"]],
  replacement = lexicon::hash_lemmas[["lemma"]]
)
# Store the document names as a docvar so they are carried into the stm meta.
docvars(rfc_dfm_lemmatized)$doc_id <- docnames(rfc_dfm_lemmatized)
# Total count of every lemmatised feature across all documents.
rfc_feature_counts <- colSums(rfc_dfm_lemmatized)
#saveRDS(rfc_dfm_lemmatized, file="text_analysis/case1/030125_rfc_dfm.rds")

# Convert to the documents / vocab / meta structure that stm consumes.
rfc_dfm_stm <- convert(
  rfc_dfm_lemmatized,
  to = "stm",
  docvars = docvars(rfc_dfm_lemmatized)
)
#run the model
K <- 5
seed <- 9021000
model_path <- "text_analysis/case1/030125_ve_rfc_stm.rds"

# Reuse the previously saved fit when it exists; otherwise fit and save it.
# Without this, `model` was never defined in a fresh session because the stm()
# call had been commented out. Delete the .rds file to force a refit.
if (file.exists(model_path)) {
  model <- readRDS(model_path)
} else {
  model <- stm(
    documents = rfc_dfm_stm$documents,
    vocab = rfc_dfm_stm$vocab,
    K = K,
    seed = seed,
    # Use the converted meta rather than rfc_df: the covariate rows must line
    # up one-to-one with rfc_dfm_stm$documents, and the conversion may have
    # dropped documents that rfc_df still contains.
    # NOTE(review): rows whose posix_timestamp failed to parse (NA) would
    # presumably need to be removed before fitting - confirm against the data.
    data = rfc_dfm_stm$meta,
    prevalence = ~posix_timestamp,
    verbose = TRUE
  )
  saveRDS(model, file = model_path)
}

plot(model)
# Top 10 words per topic, listed in the order 5, 4, 2, 3, 1.
labelTopics(model, topics = c(5, 4, 2, 3, 1), n = 10)
# One row per modelled document: its text, its timestamp and its topic
# proportions (model$theta, which data.frame() names X1, X2, ...). The corpus
# is restricted to the doc_ids present in the stm meta so the rows line up.
results <- data.frame(
  text = as.character(
    corpus_subset(rfc_corp, docnames(rfc_corp) %in% rfc_dfm_stm$meta$doc_id)
  ),
  date = rfc_dfm_stm$meta$posix_timestamp,
  model$theta,
  stringsAsFactors = FALSE
)

# Print the (up to) five documents with the highest topic-1 proportion.
# The ordering is computed once, and each document goes on its own line
# instead of being run together by back-to-back cat() calls.
top_topic1 <- head(results[order(-results$X1), "text"], 5)
writeLines(top_topic1)