# File listing metadata: 72 lines, 2.6 KiB, R
library(tidyverse)
library(quanteda)
library(lubridate)
library(quanteda.textmodels)
library(lexicon)
library(stm)

# Load the RfC comment table. The rest of the script reads the columns
# `comment_text`, `date_created` and `Author` from it.
rfc_df <- read.csv(
  "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0220_ve_rfcs_text.csv"
)

# Sequential row id, later used as the document name in the corpus.
# seq_len() returns an empty vector for a zero-row table, whereas 1:nrow()
# would produce c(1, 0) and make the assignment fail.
rfc_df$doc_id <- seq_len(nrow(rfc_df))

# some cleaning around the timestamp of the comment made:
# drop the literal "(UTC)" marker, then parse what is left as a UTC date-time
rfc_df$posix_timestamp <- parse_date_time(
  gsub("\\(UTC\\)", "", rfc_df$date_created),
  orders = "HM, dmy",
  tz = "UTC"
)
# Build the corpus from the comment text. Every column of rfc_df is attached
# as a document variable and each document is named after its doc_id.
rfc_corp <- corpus(
  rfc_df$comment_text,
  docvars = rfc_df,
  docnames = rfc_df$doc_id
)

# Word-level tokenization that drops punctuation, symbols, numbers, URLs and
# separators while keeping the document variables on the tokens object.
rfc_tokens <- rfc_corp |>
  tokens(
    what = "word",
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE,
    remove_url = TRUE,
    remove_separators = TRUE,
    include_docvars = TRUE
  )

# removing not only english stopwords but some bespoke ones too

# take out references to other authors: every author name becomes a stopword.
# The Author column repeats once per comment, so coerce to character (guards
# against a factor column turning into integer codes inside c()), drop missing
# values and keep each name only once.
# NOTE(review): names containing spaces presumably cannot match a single-word
# feature with valuetype = "fixed" -- confirm whether those need tokenizing.
author_names <- unique(as.character(rfc_df$Author))
author_names <- author_names[!is.na(author_names)]

additional_stopwords <- c("talk", "user", "n", "utc", author_names)
custom_stopwords <- c(stopwords("english"), additional_stopwords)

# Document-feature matrix with all custom stopwords removed (exact matches).
rfc_dfm <- rfc_tokens |>
  dfm() |>
  dfm_select(
    pattern = custom_stopwords,
    selection = "remove",
    valuetype = "fixed"
  )

# Map each feature onto its lemma using the token -> lemma table from the
# lexicon package.
rfc_dfm_lemmatized <- dfm_replace(
  rfc_dfm,
  pattern = lexicon::hash_lemmas$token,
  replacement = lexicon::hash_lemmas$lemma
)

# Total count of every feature across all documents.
rfc_feature_counts <- colSums(rfc_dfm_lemmatized)

# Store the document names as a document variable so they are carried into
# the stm metadata below.
docvars(rfc_dfm_lemmatized)$doc_id <- docnames(rfc_dfm_lemmatized)
#saveRDS(rfc_dfm_lemmatized, file="text_analysis/case1/030125_rfc_dfm.rds")

# Convert to the documents / vocab / meta structure used by the stm package.
rfc_dfm_stm <- convert(
  rfc_dfm_lemmatized,
  to = "stm",
  docvars = docvars(rfc_dfm_lemmatized)
)

# run the model
K <- 5            # number of topics
seed <- 9021000   # seed passed to stm()

# The fit itself is kept commented out; the saved model is loaded from disk
# instead. To refit, uncomment the stm() and saveRDS() calls below.
# NOTE(review): convert(to = "stm") can drop documents that have no features
# left; if that happens rfc_df no longer lines up row-for-row with
# rfc_dfm_stm$documents, and rfc_dfm_stm$meta is probably the safer `data`
# argument -- confirm before refitting.
#model <- stm(documents = rfc_dfm_stm$documents,
#             vocab = rfc_dfm_stm$vocab,
#             K = K,
#             seed = seed,
#             data = rfc_df,
#             prevalence = ~posix_timestamp,
#             verbose = TRUE)
#saveRDS(model, file = "text_analysis/case1/030125_ve_rfc_stm.rds")

# Load the model BEFORE plotting: with the fit commented out, `model` does not
# exist until this line runs, so plot(model) must come after it.
model <- readRDS(file = "text_analysis/case1/030125_ve_rfc_stm.rds")
plot(model)

# Show 10 label words per topic, listing the topics in the order 5, 4, 2, 3, 1.
labelTopics(model, topics = c(5, 4, 2, 3, 1), n = 10)

# Per-document topic proportions and their mean across documents.
theta <- model$theta
expected_topic_proportions <- colMeans(theta)

# One row per document kept in the stm input: its text, its timestamp and its
# topic proportions (the unnamed theta columns become X1, X2, ...).
results <- data.frame(
  text = corpus_subset(
    rfc_corp,
    docnames(rfc_corp) %in% rfc_dfm_stm$meta$doc_id
  ),
  date = rfc_dfm_stm$meta$posix_timestamp,
  model$theta
)

# Print the documents with the highest topic-1 proportion. The ranking is
# computed once, head() caps it at the number of available rows (so no NA is
# printed when there are fewer than five), and a blank line separates texts.
top_topic1_rows <- head(order(-results$X1), 5)
cat(results[top_topic1_rows, "text"], sep = "\n\n")