
updating some of the text analysis

Matthew Gaughan 2025-03-01 17:08:16 -08:00
parent 98fcf85e48
commit 7858612d60
13 changed files with 316 additions and 28 deletions

View File

@@ -0,0 +1,72 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "2dd04d34-25c0-470f-973d-1325ce0df797",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The history saving thread hit an unexpected error (OperationalError('disk I/O error')).History will not be written to the database.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from transformers import AutoTokenizer, AutoModelForSequenceClassification\n",
"from transformers import Trainer, TrainingArguments\n",
"from torch.utils.data import Dataset\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix\n",
"import torch\n",
"import json"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3504b633-4999-47d0-a6eb-ce7916206ced",
"metadata": {},
"outputs": [],
"source": [
"model_name = \"distilbert-base-uncased\"\n",
"model = AutoModelForSequenceClassification.from_pretrained(model_name, \n",
" num_labels=1) \n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.21"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -17,3 +17,6 @@ cd ~
ls
ls .local
rm -r -f .local
+cd rfc_df <- read.csv("/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0220_ve_rfcs_text.csv")
+cd /gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/
+ls

Binary file not shown (new image, 46 KiB).

Binary file not shown (new image, 95 KiB).

Binary file not shown (new image, 93 KiB).

View File

@@ -8,8 +8,9 @@ widetest_df <- read.csv(widetest_fp, header = TRUE) |> mutate(rd_event = "wide-testing")
event_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_extensions_ve_weekly_commit_count_data.csv"
event_df <- read.csv(event_fp, header = TRUE) |> mutate(rd_event = "default")
-input_df <- bind_rows(entest_df, widetest_df, event_df)
-#input_df <- bind_rows(entest_df, widetest_df)
+#input_df <- bind_rows(entest_df, widetest_df, event_df)
+#dropping the event (2013-07-01) from the modeling
+input_df <- bind_rows(entest_df, widetest_df)
input_df <- input_df |>
mutate(nonbot_commit_count = commit_count - bot_commit_count)|>
@@ -39,13 +40,13 @@ library(rdd)
intermediate_long_df <- intermediate_long_df |>
drop_na()
-var(intermediate_long_df$commit_share) # 1253.343
-mean(intermediate_long_df$commit_share) # 44.92381
-median(intermediate_long_df$commit_share) # 39.5
+var(intermediate_long_df$lengthened_commit_count) # 1253.343
+mean(intermediate_long_df$lengthened_commit_count) # 44.92381
+median(intermediate_long_df$lengthened_commit_count) # 39.5
get_optimal_bandwidth <- function(df){
bw <- tryCatch({
-IKbandwidth(df$relative_week, df$commit_share, cutpoint = 0, verbose = FALSE, kernel = "triangular")
+IKbandwidth(df$relative_week, df$lengthened_commit_count, cutpoint = 0, verbose = FALSE, kernel = "triangular")
}, error = function(e) {
NA
})
@@ -54,12 +55,12 @@ get_optimal_bandwidth <- function(df){
optimal_bandwidth <- get_optimal_bandwidth(intermediate_long_df)
-window_num <- 18
+window_num <- 4
final_long_df <- intermediate_long_df |>
filter(relative_week >= (- window_num) & relative_week <= (window_num))
library(fitdistrplus)
-descdist(long_df$lengthened_commit_count, discrete=FALSE)
+descdist(final_long_df$lengthened_commit_count, discrete=FALSE)
#start_values <- list(shape1 = 1, shape2 = 1)
#fit <- MASS::fitdistr(as.numeric(long_df$lengthened_commit_count), "negative binomial")
print(fit)
@@ -70,7 +71,7 @@ mlm <- glmer.nb(lengthened_commit_count ~ before_after*relative_week +
(before_after*relative_week|rd_event),
control=glmerControl(optimizer="bobyqa",
optCtrl=list(maxfun=2e5)), nAGQ=0,
-data=long_df)
+data=final_long_df)
#mlm <- lmer(lengthened_commit_count ~ before_after*relative_week+
# (before_after*relative_week|commit_type) +
# (before_after*relative_week|rd_event) ,data=long_df)
@@ -95,7 +96,8 @@ icc(wikimedia_share_lmer)
other_long_df <- final_long_df |>
filter(commit_type == "other_commit_count")
-other_share_lmer <- lm(commit_share ~ before_after*relative_week,
+other_share_lmer <- lmer(commit_share ~ before_after*relative_week +
+(1| rd_event),
data=other_long_df)
summary(other_share_lmer)
icc(other_share_lmer)
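# A possible refinement, not part of this commit: tie the modeling window
# to the Imbens-Kalyanaraman bandwidth computed above instead of hard-coding
# window_num <- 4; the 4-week fallback for when IKbandwidth() fails is an
# assumption introduced here, not a choice made in this file.
optimal_bandwidth <- get_optimal_bandwidth(intermediate_long_df)
window_num <- if (is.na(optimal_bandwidth)) 4 else ceiling(optimal_bandwidth)
final_long_df <- intermediate_long_df |>
filter(relative_week >= (- window_num) & relative_week <= (window_num))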

View File

@@ -0,0 +1,82 @@
library(tidyverse)
entest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/en-testing_0217_extensions_ve_weekly_commit_count_data.csv"
entest_df <- read.csv(entest_fp, header = TRUE) |> mutate(rd_event = "en-testing")
widetest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/wide-testing_0217_extensions_ve_weekly_commit_count_data.csv"
widetest_df <- read.csv(widetest_fp, header = TRUE) |> mutate(rd_event = "wide-testing")
event_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_extensions_ve_weekly_commit_count_data.csv"
event_df <- read.csv(event_fp, header = TRUE) |> mutate(rd_event = "default")
#input_df <- bind_rows(entest_df, widetest_df, event_df)
#dropping the event (2013-07-01) from the modeling
input_df <- bind_rows(entest_df, widetest_df)
input_df <- input_df |>
mutate(nonbot_commit_count = commit_count - bot_commit_count)|>
mutate(other_commit_count = nonbot_commit_count - mediawiki_dev_commit_count - wikia_commit_count - wikimedia_commit_count) |>
mutate(wikimedia_commit_count = wikimedia_commit_count + mediawiki_dev_commit_count + wikia_commit_count) |>
dplyr::select(-mediawiki_dev_commit_count) |>
dplyr::select(-wikia_commit_count)
#get into mlm format
long_df <- input_df |>
pivot_longer(cols = c(other_commit_count, wikimedia_commit_count),
names_to = "commit_type",
values_to = "lengthened_commit_count")
intermediate_long_df <- long_df |>
mutate(commit_share = lengthened_commit_count / (nonbot_commit_count)) |>
mutate(log_commits = log1p(lengthened_commit_count))|>
mutate(scaled_long_commits = lengthened_commit_count / 10)
intermediate_long_df <- intermediate_long_df |>
drop_na()
window_num <- 4
final_long_df <- intermediate_long_df |>
filter(relative_week >= (- window_num) & relative_week <= (window_num))
commit_plot <- final_long_df |>
ggplot(aes(x=relative_week,
y=lengthened_commit_count,
color=commit_type,
linetype = rd_event)) +
geom_line() +
geom_point() +
labs(x = "Relative Week", y = "Nonbot Commits", linetype = "Testing Event", color="Commit Author Affiliation") +
scale_linetype_discrete(labels = c("enwiki testing (2012-12-12)", "wide testing (2013-04-25)")) +
scale_color_discrete(labels = c("Unaffiliated", "Organizationally Affiliated")) +
ggtitle("VisualEditor Nonbot Commit Count Around Opt-In Testing Events (by Affiliation)") +
theme_bw() +
theme(legend.position = "top")
commit_plot
total_commit_plot <- final_long_df |>
filter(commit_type == "other_commit_count")|>
ggplot(aes(x=relative_week,
y=nonbot_commit_count,
linetype = rd_event)) +
geom_line() +
geom_point() +
labs(x = "Relative Week", y = "Nonbot Commit Count", linetype = "Testing Event") +
scale_linetype_discrete(labels = c("enwiki testing (2012-12-12)", "wide testing (2013-04-25)")) +
ggtitle("VisualEditor Nonbot Commit Count Around Opt-In Testing Events") +
theme_bw() +
theme(legend.position = "top")
total_commit_plot
commit_share_plot <- final_long_df |>
ggplot(aes(x=relative_week,
y=commit_share,
color=commit_type,
linetype = rd_event)) +
geom_line() +
geom_point() +
labs(x = "Relative Week", y = "Share of Nonbot Commits", linetype = "Testing Event", color="Commit Author Affiliation") +
scale_linetype_discrete(labels = c("enwiki testing (2012-12-12)", "wide testing (2013-04-25)")) +
scale_color_discrete(labels = c("Unaffiliated", "Organizationally Affiliated")) +
ggtitle("VisualEditor Nonbot Commit Share Around Opt-In Testing Events") +
theme_bw() +
theme(legend.position = "top")
commit_share_plot

View File

@@ -1,18 +0,0 @@
1. SSH tunnel from your workstation using the following command:
ssh -N -L 8787:n3439:56635 mjilg@klone.hyak.uw.edu
and point your web browser to http://localhost:8787
2. log in to RStudio Server using the following credentials:
user: mjilg
password: gYARjsVrF/GA3VDv3MYl
When done using RStudio Server, terminate the job by:
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
2. Issue the following command on the login node:
scancel -f 24451895
slurmstepd: error: *** JOB 24451895 ON n3439 CANCELLED AT 2025-02-26T10:32:49 ***

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,69 @@
library(tidyverse)
library(quanteda)
library(lubridate)
library(quanteda.textmodels)
library(lexicon)
library(stm)
rfc_df <- read.csv("/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0220_ve_rfcs_text.csv")
rfc_df$doc_id = 1:nrow(rfc_df)
#some cleaning around the timestamp of the comment made
rfc_df$posix_timestamp = parse_date_time(gsub("\\(UTC\\)", "", rfc_df$date_created), orders = "HM, dmy", tz = "UTC")
rfc_corp = corpus(rfc_df$comment_text,
docvars = rfc_df,
docnames = rfc_df$doc_id)
rfc_tokens <- tokens(rfc_corp,
what = "word",
remove_punct = TRUE,
remove_symbols = TRUE,
remove_numbers = TRUE,
remove_url = TRUE,
remove_separators = TRUE,
include_docvars = TRUE)
#removing not only english stopwords but some bespoke ones too
additional_stopwords <- c(c("talk", "user", "n", "utc"), rfc_df$Author)
#take out references to other authors
custom_stopwords <- c(stopwords("english"), additional_stopwords)
rfc_dfm <- rfc_tokens|>
dfm() |>
dfm_select(pattern = custom_stopwords,
selection = c("remove"),
valuetype = c("fixed"))
rfc_dfm_lemmatized = dfm_replace(rfc_dfm,
pattern = lexicon::hash_lemmas$token,
replacement = lexicon::hash_lemmas$lemma)
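#optional sanity check, not in the original script: inspect the most
#frequent lemmas after stopword removal to confirm the replacement worked
topfeatures(rfc_dfm_lemmatized, 20)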
rfc_feature_counts <- colSums(rfc_dfm_lemmatized)
docvars(rfc_dfm_lemmatized)$doc_id = docnames(rfc_dfm_lemmatized)
#saveRDS(rfc_dfm_lemmatized, file="text_analysis/case1/030125_rfc_dfm.rds")
rfc_dfm_stm = convert(rfc_dfm_lemmatized, to = "stm",
docvars = docvars(rfc_dfm_lemmatized))
#run the model
K = 5
seed = 9021000
model = stm(documents = rfc_dfm_stm$documents,
vocab = rfc_dfm_stm$vocab,
K = K,
seed = seed,
data = rfc_dfm_stm$meta, #metadata from the stm conversion stays aligned with the documents
prevalence=~posix_timestamp,
verbose = TRUE)
plot(model)
saveRDS(model, file = "text_analysis/case1/030125_ve_rfc_stm.rds")
labelTopics(model, topics = c(5, 4, 2, 3, 1), n = 10)
results = data.frame(text = corpus_subset(rfc_corp, docnames(rfc_corp) %in% rfc_dfm_stm$meta$doc_id), date = rfc_dfm_stm$meta$posix_timestamp, model$theta)
cat(results[order(-results$X1),"text"][1])
cat(results[order(-results$X1),"text"][2])
cat(results[order(-results$X1),"text"][3])
cat(results[order(-results$X1),"text"][4])
cat(results[order(-results$X1),"text"][5])
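# A hedged follow-up, not part of this commit: since the model was fit with
# prevalence=~posix_timestamp, stm::estimateEffect can summarize how topic
# prevalence moves over time; effect_meta and time_num are helper names
# introduced here, and the timestamp is coerced to numeric for the formula.
effect_meta <- rfc_dfm_stm$meta
effect_meta$time_num <- as.numeric(effect_meta$posix_timestamp)
prep <- estimateEffect(1:K ~ time_num, model, metadata = effect_meta, uncertainty = "Global")
summary(prep)
plot(prep, covariate = "time_num", method = "continuous", topics = 1:K)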

View File

@@ -0,0 +1,78 @@
library(tidyverse)
library(quanteda)
library(lubridate)
library(quanteda.textmodels)
library(lexicon)
library(stm)
phab_df <- read.csv("/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv")
phab_df$doc_id = 1:nrow(phab_df)
phab_df$utc_date <- as.POSIXct(phab_df$date_created, origin = "1970-01-01", tz = "UTC")
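#the epoch bounds below keep comments from 2012-11-01 through 2013-10-31 (UTC)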
phab_df <- phab_df|>
filter(date_created > 1351728001 & date_created < 1383263999)
phab_corp = corpus(phab_df$comment_text,
docvars = phab_df,
docnames = phab_df$doc_id)
phab_tokens <- tokens(phab_corp,
what = "word",
remove_punct = TRUE,
remove_symbols = TRUE,
remove_numbers = FALSE,
remove_url = TRUE,
remove_separators = TRUE,
include_docvars = TRUE)
#removing not only english stopwords but some bespoke ones too
additional_stopwords <- c("and")
custom_stopwords <- c(stopwords("english"), additional_stopwords)
phab_dfm <- phab_tokens|>
dfm() |>
dfm_select(pattern = custom_stopwords,
selection = c("remove"),
valuetype = c("fixed"))
phab_dfm_lemmatized = dfm_replace(phab_dfm,
pattern = lexicon::hash_lemmas$token,
replacement = lexicon::hash_lemmas$lemma)
phab_feature_counts <- colSums(phab_dfm_lemmatized)
docvars(phab_dfm_lemmatized)$doc_id = docnames(phab_dfm_lemmatized)
#read in the RDS
rfc_dfm_lemmatized <- readRDS("text_analysis/case1/030125_rfc_dfm.rds")
new_phab_dfm_lemmatized <- dfm_match(phab_dfm_lemmatized, features = colnames(rfc_dfm_lemmatized))
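#dfm_match() projects the Phabricator dfm onto the RFC model's vocabulary:
#fitNewDocuments() below expects documents over the exact feature set the
#STM was trained on, so tokens unseen in the RFC data are dropped here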
phab_dfm_stm = convert(new_phab_dfm_lemmatized, to = "stm",
docvars = docvars(new_phab_dfm_lemmatized))
#loading in the STM that was fitted over the RFC data
stm_model <- readRDS("text_analysis/case1/030125_ve_rfc_stm.rds")
plot(stm_model)
#fit it over the new data
new_topic_scores <- fitNewDocuments(stm_model, phab_dfm_stm$documents)
#gives us 32058 comment scores to work with
results = data.frame(text = corpus_subset(phab_corp, docnames(phab_corp) %in% phab_dfm_stm$meta$doc_id),
date = phab_dfm_stm$meta$utc_date,
affil=phab_dfm_stm$meta$WMFaffil,
new_topic_scores$theta)
# the issue is, of course, that these topics were not fit to these documents,
# but a topic model must describe every document as topic proportions that sum to 1,
# so it will ill-fit the phabricator comments
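# a quick check, not in the original script, that makes the caveat above
# concrete: each row of theta sums to 1 no matter how poorly the RFC-trained
# topics describe a given phabricator comment
summary(rowSums(new_topic_scores$theta))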
grouped_results <- results |>
mutate(week = floor_date(as.POSIXct(date), "week")) |>
group_by(week, affil) |>
summarise(across(starts_with("X"), \(x) median(x, na.rm = TRUE)))
plot <- grouped_results |>
ggplot(aes(x=week,
y=X5,
color=affil)) +
geom_line()
plot