library(tidyverse)
library(quanteda)
library(lubridate)
library(quanteda.textmodels)
library(lexicon)
library(stm)

phab_df <- read.csv("/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv")
phab_df$doc_id <- 1:nrow(phab_df)

phab_df$utc_date <- as.POSIXct(phab_df$date_created, origin = "1970-01-01", tz = "UTC")
# keep comments from 2012-11-01 through 2013-10-31 (Unix-epoch seconds, UTC)
phab_df <- phab_df |>
  filter(date_created > 1351728001 & date_created < 1383263999)

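# quick check (an addition to the original script): the parsed UTC dates
# should span 2012-11-01 through 2013-10-31
range(phab_df$utc_date)
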
phab_corp <- corpus(phab_df$comment_text,
                    docvars = phab_df,
                    docnames = phab_df$doc_id)

phab_tokens <- tokens(phab_corp,
                      what = "word",
                      remove_punct = TRUE,
                      remove_symbols = TRUE,
                      remove_numbers = FALSE,
                      remove_url = TRUE,
                      remove_separators = TRUE,
                      include_docvars = TRUE)

# remove not only English stopwords but some bespoke ones too
# (e.g. to take out references to other authors)
additional_stopwords <- c("and")
custom_stopwords <- c(stopwords("english"), additional_stopwords)

phab_dfm <- phab_tokens |>
  dfm() |>
  dfm_select(pattern = custom_stopwords,
             selection = "remove",
             valuetype = "fixed")

phab_dfm_lemmatized <- dfm_replace(phab_dfm,
                                   pattern = lexicon::hash_lemmas$token,
                                   replacement = lexicon::hash_lemmas$lemma)

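# sanity check (an addition to the original script): eyeball the most
# frequent lemmas to confirm stopword removal and lemmatization look right
topfeatures(phab_dfm_lemmatized, 20)
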
phab_feature_counts <- colSums(phab_dfm_lemmatized)
docvars(phab_dfm_lemmatized)$doc_id <- docnames(phab_dfm_lemmatized)

# read in the RFC document-feature matrix and align the Phabricator
# vocabulary to its feature set
rfc_dfm_lemmatized <- readRDS("text_analysis/case1/030125_rfc_dfm.rds")
new_phab_dfm_lemmatized <- dfm_match(phab_dfm_lemmatized, features = colnames(rfc_dfm_lemmatized))

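# coverage check (an addition to the original script): what share of the
# Phabricator vocabulary survives matching to the RFC feature set?
shared_features <- intersect(featnames(phab_dfm_lemmatized), featnames(rfc_dfm_lemmatized))
length(shared_features) / nfeat(phab_dfm_lemmatized)
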
phab_dfm_stm <- convert(new_phab_dfm_lemmatized, to = "stm",
                        docvars = docvars(phab_dfm_lemmatized))

# load the STM that was fitted over the RFC data
stm_model <- readRDS("text_analysis/case1/030125_ve_rfc_stm.rds")
plot(stm_model)
# fit it over the new data
new_topic_scores <- fitNewDocuments(stm_model, phab_dfm_stm$documents)
# gives us 32058 comment scores to work with

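# sanity check (an addition to the original script): fitNewDocuments() returns
# one row of topic proportions per document, and each row sums to 1
stopifnot(nrow(new_topic_scores$theta) == length(phab_dfm_stm$documents))
summary(rowSums(new_topic_scores$theta))
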
results <- data.frame(text = corpus_subset(phab_corp, docnames(phab_corp) %in% phab_dfm_stm$meta$doc_id),
                      date = phab_dfm_stm$meta$utc_date,
                      affil = phab_dfm_stm$meta$WMFaffil,
                      new_topic_scores$theta)
# the issue is, of course, that these topics were not fit to these documents;
# a topic model must describe every document with topic proportions that sum
# to 1, so it will be an ill fit to the Phabricator comments

grouped_results <- results |>
  mutate(week = floor_date(as.POSIXct(date), "week")) |>
  group_by(week, affil) |>
  summarise(across(starts_with("X"), \(x) median(x, na.rm = TRUE)))

topic_plot <- grouped_results |>
  ggplot(aes(x = week,
             y = X1,
             color = affil)) +
  geom_line()
topic_plot
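# to save the figure to disk (an addition; the output path is illustrative)
ggsave("text_analysis/case1/phab_topic_trends.png", topic_plot, width = 8, height = 5)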