# Score VisualEditor Phabricator comments with a structural topic model (STM)
# that was previously fitted on RFC text, then plot weekly median topic
# proportions by WMF affiliation.
library(tidyverse)
library(quanteda)
library(lubridate)
library(quanteda.textmodels)
library(lexicon)
library(stm)

# Load the comment dump and attach a stable per-row document id plus a
# human-readable POSIXct timestamp derived from the Unix epoch seconds.
phab_df <- read.csv("/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv")
phab_df$doc_id <- seq_len(nrow(phab_df))
phab_df$utc_date <- as.POSIXct(phab_df$date_created, origin = "1970-01-01", tz = "UTC")

# Keep comments in the study window:
# 1351728001 = 2012-11-01 00:00:01 UTC, 1383263999 = 2013-10-31 23:59:59 UTC.
phab_df <- phab_df |>
  filter(date_created > 1351728001, date_created < 1383263999)

phab_corp <- corpus(
  phab_df$comment_text,
  docvars = phab_df,
  docnames = phab_df$doc_id
)

phab_tokens <- tokens(
  phab_corp,
  what = "word",
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = FALSE,
  remove_url = TRUE,
  remove_separators = TRUE,
  include_docvars = TRUE
)

# Remove not only English stopwords but some bespoke ones too
# (e.g. references to other authors).
additional_stopwords <- c("and")
custom_stopwords <- c(stopwords("english"), additional_stopwords)

phab_dfm <- phab_tokens |>
  dfm() |>
  dfm_select(
    pattern = custom_stopwords,
    selection = "remove",
    valuetype = "fixed"
  )

# Lemmatize features so the vocabulary lines up with the lemmatized RFC dfm.
phab_dfm_lemmatized <- dfm_replace(
  phab_dfm,
  pattern = lexicon::hash_lemmas$token,
  replacement = lexicon::hash_lemmas$lemma
)
phab_feature_counts <- colSums(phab_dfm_lemmatized)
docvars(phab_dfm_lemmatized)$doc_id <- docnames(phab_dfm_lemmatized)

# Read in the saved RFC dfm and restrict the Phabricator dfm to its feature
# set, so that feature indices match the vocabulary the STM was fit on.
rfc_dfm_lemmatized <- readRDS("text_analysis/case1/030125_rfc_dfm.rds")
new_phab_dfm_lemmatized <- dfm_match(
  phab_dfm_lemmatized,
  features = colnames(rfc_dfm_lemmatized)
)
phab_dfm_stm <- convert(
  new_phab_dfm_lemmatized,
  to = "stm",
  docvars = docvars(phab_dfm_lemmatized)
)

# Load the STM that was fitted over the RFC data and score the new documents.
stm_model <- readRDS("text_analysis/case1/030125_ve_rfc_stm.rds")
plot(stm_model)

# Fit the pre-trained model over the new data (gives us 32058 comment scores).
new_topic_scores <- fitNewDocuments(stm_model, phab_dfm_stm$documents)

# The issue is, of course, that these topics are not fit to the documents,
# but topic models must describe documents in terms of proportions summing
# to 1, so the model will ill-fit the Phabricator comments.
results <- data.frame(
  text = corpus_subset(
    phab_corp,
    docnames(phab_corp) %in% phab_dfm_stm$meta$doc_id
  ),
  date = phab_dfm_stm$meta$utc_date,
  affil = phab_dfm_stm$meta$WMFaffil,
  new_topic_scores$theta
)

# Weekly median of each topic-proportion column (X1, X2, ...) per affiliation.
# NOTE: across(..., f, na.rm = TRUE) passed extra args through `...`, which is
# deprecated in dplyr >= 1.1 — use an anonymous function instead.
grouped_results <- results |>
  mutate(week = floor_date(as.POSIXct(date), "week")) |>
  group_by(week, affil) |>
  summarise(
    across(starts_with("X"), \(x) median(x, na.rm = TRUE)),
    .groups = "drop"
  )

# Renamed from `plot` to avoid shadowing base::plot(), which is called above.
topic_trend_plot <- grouped_results |>
  ggplot(aes(x = week, y = X1, color = affil)) +
  geom_line()
topic_trend_plot