# mw-lifecycle-analysis/text_analysis/phab_topic_trends.R
library(tidyverse)
library(quanteda)
library(lubridate)
library(quanteda.textmodels)
library(lexicon)
library(stm)
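# score Phabricator VisualEditor comments against a structural topic model (STM)
# fitted on the RFC corpus, then plot weekly topic trends by WMF affiliation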
phab_df <- read.csv("/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv")
phab_df$doc_id = 1:nrow(phab_df)
phab_df$utc_date <- as.POSIXct(phab_df$date_created, origin = "1970-01-01", tz = "UTC")
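# restrict to the study window (the epoch bounds are 2012-11-01 through 2013-10-31 UTC)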
phab_df <- phab_df |>
  filter(date_created > 1351728001 & date_created < 1383263999)
phab_corp = corpus(phab_df$comment_text,
                   docvars = phab_df,
                   docnames = phab_df$doc_id)
phab_tokens <- tokens(phab_corp,
                      what = "word",
                      remove_punct = TRUE,
                      remove_symbols = TRUE,
                      remove_numbers = FALSE,
                      remove_url = TRUE,
                      remove_separators = TRUE,
                      include_docvars = TRUE)
# removing not only English stopwords but some bespoke ones too
additional_stopwords <- c("and")
# (references to other authors could also be taken out here)
custom_stopwords <- c(stopwords("english"), additional_stopwords)
phab_dfm <- phab_tokens |>
  dfm() |>
  dfm_select(pattern = custom_stopwords,
             selection = "remove",
             valuetype = "fixed")
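# lemmatize the dfm features with the lexicon package's token-to-lemma lookup table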
phab_dfm_lemmatized = dfm_replace(phab_dfm,
                                  pattern = lexicon::hash_lemmas$token,
                                  replacement = lexicon::hash_lemmas$lemma)
phab_feature_counts <- colSums(phab_dfm_lemmatized)
docvars(phab_dfm_lemmatized)$doc_id = docnames(phab_dfm_lemmatized)
# read in the lemmatized dfm that was built from the RFC data
rfc_dfm_lemmatized <- readRDS("text_analysis/case1/030125_rfc_dfm.rds")
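# align the Phabricator dfm to the RFC dfm's vocabulary: features outside the RFC
# vocabulary are dropped and missing features are padded with zero counts, so the
# fitted STM can score these documents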
new_phab_dfm_lemmatized <- dfm_match(phab_dfm_lemmatized,
                                     features = colnames(rfc_dfm_lemmatized))
phab_dfm_stm = convert(new_phab_dfm_lemmatized, to = "stm",
                       docvars = docvars(phab_dfm_lemmatized))
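# note: converting to "stm" format drops documents that became empty after the
# vocabulary matching, so the retained doc_ids are carried along in $meta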
#loading in the STM that was fitted over the RFC data
stm_model <- readRDS("text_analysis/case1/030125_ve_rfc_stm.rds")
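# the default plot.STM summary shows expected topic proportions with top words per topic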
plot(stm_model)
#fit it over the new data
new_topic_scores <- fitNewDocuments(stm_model, phab_dfm_stm$documents)
#gives us 32058 comment scores to work with
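# optional sanity check (a sketch, not part of the original pipeline): theta should
# have one row per retained comment and one column per topic; labelTopics() prints
# the fitted model's top words so the X* columns below can be interpreted
dim(new_topic_scores$theta)
labelTopics(stm_model, n = 7)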
results = data.frame(text = corpus_subset(phab_corp, docnames(phab_corp) %in% phab_dfm_stm$meta$doc_id),
                     date = phab_dfm_stm$meta$utc_date,
                     affil = phab_dfm_stm$meta$WMFaffil,
                     new_topic_scores$theta)
# the issue is, of course, that these topics were not fit to these documents,
# but a topic model must describe every document in terms of its topics
# (with proportions summing to 1), so it will ill-fit the phabricator comments
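# aggregate to weekly medians of each topic proportion (the theta columns X1..Xk),
# split by WMF affiliation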
grouped_results <- results |>
  mutate(week = floor_date(as.POSIXct(date), "week")) |>
  group_by(week, affil) |>
  summarise(across(starts_with("X"), \(x) median(x, na.rm = TRUE)))
plot <- grouped_results |>
  ggplot(aes(x = week,
             y = X5,
             color = affil)) +
  geom_line()
plot
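# to save the figure to disk (a sketch; the output path is an assumption):
# ggsave("text_analysis/case1/phab_topic_trends.png", plot, width = 10, height = 6)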