# mw-lifecycle-analysis/text_analysis/phab_topic_trends.R
library(tidyverse)
library(quanteda)
library(lubridate)
library(quanteda.textmodels)
library(lexicon)
library(stm)
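# score Phabricator VisualEditor comments against a structural topic model (STM)
# fitted on the RFC corpus, then plot weekly topic trends by WMF affiliation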
phab_df <- read.csv("/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv")
phab_df$doc_id = 1:nrow(phab_df)
phab_df$utc_date <- as.POSIXct(phab_df$date_created, origin = "1970-01-01", tz = "UTC")
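# restrict to the study window (the epoch bounds are 2012-11-01 through 2013-10-31 UTC)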
phab_df <- phab_df |>
  filter(date_created > 1351728001 & date_created < 1383263999)
phab_corp = corpus(phab_df$comment_text,
                   docvars = phab_df,
                   docnames = phab_df$doc_id)
phab_tokens <- tokens(phab_corp,
                      what = "word",
                      remove_punct = TRUE,
                      remove_symbols = TRUE,
                      remove_numbers = FALSE,
                      remove_url = TRUE,
                      remove_separators = TRUE,
                      include_docvars = TRUE)
# removing not only English stopwords but some bespoke ones too
additional_stopwords <- c("and")
# (references to other authors could also be taken out here)
custom_stopwords <- c(stopwords("english"), additional_stopwords)
phab_dfm <- phab_tokens |>
  dfm() |>
  dfm_select(pattern = custom_stopwords,
             selection = "remove",
             valuetype = "fixed")
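# lemmatize the dfm features with the lexicon package's token-to-lemma lookup table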
phab_dfm_lemmatized = dfm_replace(phab_dfm,
                                  pattern = lexicon::hash_lemmas$token,
                                  replacement = lexicon::hash_lemmas$lemma)
phab_feature_counts <- colSums(phab_dfm_lemmatized)
docvars(phab_dfm_lemmatized)$doc_id = docnames(phab_dfm_lemmatized)
# read in the lemmatized dfm that was built from the RFC data
rfc_dfm_lemmatized <- readRDS("text_analysis/case1/030125_rfc_dfm.rds")
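# align the Phabricator dfm to the RFC dfm's vocabulary: features outside the RFC
# vocabulary are dropped and missing features are padded with zero counts, so the
# fitted STM can score these documents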
new_phab_dfm_lemmatized <- dfm_match(phab_dfm_lemmatized,
                                     features = colnames(rfc_dfm_lemmatized))
phab_dfm_stm = convert(new_phab_dfm_lemmatized, to = "stm",
                       docvars = docvars(phab_dfm_lemmatized))
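# note: converting to "stm" format drops documents that became empty after the
# vocabulary matching, so the retained doc_ids are carried along in $meta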
#loading in the STM that was fitted over the RFC data
stm_model <- readRDS("text_analysis/case1/030125_ve_rfc_stm.rds")
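# the default plot.STM summary shows expected topic proportions with top words per topic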
plot(stm_model)
#fit it over the new data
new_topic_scores <- fitNewDocuments(stm_model, phab_dfm_stm$documents)
#gives us 32058 comment scores to work with
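# optional sanity check (a sketch, not part of the original pipeline): theta should
# have one row per retained comment and one column per topic; labelTopics() prints
# the fitted model's top words so the X* columns below can be interpreted
dim(new_topic_scores$theta)
labelTopics(stm_model, n = 7)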
results = data.frame(text = corpus_subset(phab_corp, docnames(phab_corp) %in% phab_dfm_stm$meta$doc_id),
                     date = phab_dfm_stm$meta$utc_date,
                     affil = phab_dfm_stm$meta$WMFaffil,
                     new_topic_scores$theta)
# the issue is, of course, that these topics were not fit to these documents,
# but a topic model must describe every document in terms of its topics
# (with proportions summing to 1), so it will ill-fit the phabricator comments
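# aggregate to weekly medians of each topic proportion (the theta columns X1..Xk),
# split by WMF affiliation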
grouped_results <- results |>
  mutate(week = floor_date(as.POSIXct(date), "week")) |>
  group_by(week, affil) |>
  summarise(across(starts_with("X"), \(x) median(x, na.rm = TRUE)))
plot <- grouped_results |>
  ggplot(aes(x = week,
             y = X5,
             color = affil)) +
  geom_line()
plot
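# to save the figure to disk (a sketch; the output path is an assumption):
# ggsave("text_analysis/case1/phab_topic_trends.png", plot, width = 10, height = 6)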