diff --git a/.ipynb_checkpoints/BERT-hw-checkpoint.ipynb b/.ipynb_checkpoints/BERT-hw-checkpoint.ipynb
new file mode 100644
index 0000000..9409b57
--- /dev/null
+++ b/.ipynb_checkpoints/BERT-hw-checkpoint.ipynb
@@ -0,0 +1,72 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2dd04d34-25c0-470f-973d-1325ce0df797",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The history saving thread hit an unexpected error (OperationalError('disk I/O error')).History will not be written to the database.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n",
+    "from transformers import Trainer, TrainingArguments\n",
+    "from torch.utils.data import Dataset\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix\n",
+    "import torch\n",
+    "import json"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3504b633-4999-47d0-a6eb-ce7916206ced",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_name = \"distilbert-base-uncased\"\n",
+    "model = AutoModelForSequenceClassification.from_pretrained(model_name, \n",
+    "                                                           num_labels=1) \n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_name)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.21"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/.sh_history b/.sh_history
index f09a142..af118fb 100644
--- a/.sh_history
+++ b/.sh_history
@@ -17,3 +17,6 @@ cd ~
 ls
 ls .local
 rm -r -f .local
+cd rfc_df <- read.csv("/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0220_ve_rfcs_text.csv")
+cd /gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/
+ls
diff --git a/commit_analysis/case1/0301-ve-testing-commit-plot.png b/commit_analysis/case1/0301-ve-testing-commit-plot.png
new file mode 100644
index 0000000..2a647ef
Binary files /dev/null and b/commit_analysis/case1/0301-ve-testing-commit-plot.png differ
diff --git a/commit_analysis/case1/030125_ve-testing-share-ba.png b/commit_analysis/case1/030125_ve-testing-share-ba.png
new file mode 100644
index 0000000..1cf23ff
Binary files /dev/null and b/commit_analysis/case1/030125_ve-testing-share-ba.png differ
diff --git a/commit_analysis/case1/030125_ve_testing_commits_ba_plot.png b/commit_analysis/case1/030125_ve_testing_commits_ba_plot.png
new file mode 100644
index 0000000..2e69997
Binary files /dev/null and b/commit_analysis/case1/030125_ve_testing_commits_ba_plot.png differ
diff --git a/commit_analysis/matched_rdd_models.R b/commit_analysis/matched_rdd_models.R
index 9f11398..e6e5544 100644
--- a/commit_analysis/matched_rdd_models.R
+++ b/commit_analysis/matched_rdd_models.R
@@ -8,8 +8,9 @@ widetest_df <- read.csv(widetest_fp, header = TRUE) |> mutate(rd_event = "wide-t
 event_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_extensions_ve_weekly_commit_count_data.csv"
 event_df <- read.csv(event_fp, header = TRUE) |> mutate(rd_event = "default")
 
-input_df <- bind_rows(entest_df, widetest_df, event_df)
-#input_df <- bind_rows(entest_df, widetest_df)
+#input_df <- bind_rows(entest_df, widetest_df, event_df)
+#dropping the event (2013-07-01) from the modeling
+input_df <- bind_rows(entest_df, widetest_df)
 
 input_df <- input_df |>
   mutate(nonbot_commit_count = commit_count - bot_commit_count)|>
@@ -39,13 +40,13 @@ library(rdd)
 intermediate_long_df <- intermediate_long_df |>
   drop_na()
 
-var(intermediate_long_df$commit_share) # 1253.343
-mean(intermediate_long_df$commit_share) # 44.92381
-median(intermediate_long_df$commit_share) # 39.5
+var(intermediate_long_df$lengthened_commit_count) # 1253.343
+mean(intermediate_long_df$lengthened_commit_count) # 44.92381
+median(intermediate_long_df$lengthened_commit_count) # 39.5
 
 get_optimal_bandwidth <- function(df){
   bw <- tryCatch({
-    IKbandwidth(df$relative_week, df$commit_share, cutpoint = 0, verbose = FALSE, kernel = "triangular")
+    IKbandwidth(df$relative_week, df$lengthened_commit_count, cutpoint = 0, verbose = FALSE, kernel = "triangular")
   }, error = function(e) {
     NA
   })
@@ -54,12 +55,12 @@ get_optimal_bandwidth <- function(df){
 
 optimal_bandwidth <- get_optimal_bandwidth(intermediate_long_df)
 
-window_num <- 18
+window_num <- 4
 final_long_df <- intermediate_long_df |>
   filter(relative_week >= (- window_num) & relative_week <= (window_num))
 
 library(fitdistrplus)
-descdist(long_df$lengthened_commit_count, discrete=FALSE)
+descdist(final_long_df$lengthened_commit_count, discrete=FALSE)
 #start_values <- list(shape1 = 1, shape2 = 1)
 #fit <- MASS::fitdistr(as.numeric(long_df$lengthened_commit_count), "negative binomial")
 print(fit)
@@ -70,7 +71,7 @@ mlm <- glmer.nb(lengthened_commit_count ~ before_after*relative_week +
                 (before_after*relative_week|rd_event),
                 control=glmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5)),
                 nAGQ=0,
-                data=long_df)
+                data=final_long_df)
 #mlm <- lmer(lengthened_commit_count ~ before_after*relative_week+
 #  (before_after*relative_week|commit_type) +
 #  (before_after*relative_week|rd_event) ,data=long_df)
@@ -95,7 +96,8 @@ icc(wikimedia_share_lmer)
 
 other_long_df <- final_long_df |>
   filter(commit_type == "other_commit_count")
-other_share_lmer <- lm(commit_share ~ before_after*relative_week,
+other_share_lmer <- lmer(commit_share ~ before_after*relative_week +
+                           (1| rd_event),
                          data=other_long_df)
 summary(other_share_lmer)
 icc(other_share_lmer)
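A quick check on the refit negative binomial model above is to read its discontinuity terms on the incidence-rate-ratio scale. This is a minimal sketch, not part of matched_rdd_models.R; it assumes the mlm fit and the lme4 session from that script, and the exact coefficient names depend on how before_after is coded:

library(lme4)
# fixed effects of the glmer.nb fit; exponentiating gives incidence rate ratios
exp(fixef(mlm))
# Wald confidence intervals for the fixed effects, also on the IRR scale
exp(confint(mlm, parm = "beta_", method = "Wald"))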
diff --git a/commit_analysis/testing-share-plotting.R b/commit_analysis/testing-share-plotting.R
new file mode 100644
index 0000000..08c6d17
--- /dev/null
+++ b/commit_analysis/testing-share-plotting.R
@@ -0,0 +1,82 @@
+library(tidyverse)
+entest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/en-testing_0217_extensions_ve_weekly_commit_count_data.csv"
+entest_df <- read.csv(entest_fp, header = TRUE) |> mutate(rd_event = "en-testing")
+
+widetest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/wide-testing_0217_extensions_ve_weekly_commit_count_data.csv"
+widetest_df <- read.csv(widetest_fp, header = TRUE) |> mutate(rd_event = "wide-testing")
+
+event_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_extensions_ve_weekly_commit_count_data.csv"
+event_df <- read.csv(event_fp, header = TRUE) |> mutate(rd_event = "default")
+
+#input_df <- bind_rows(entest_df, widetest_df, event_df)
+#dropping the event (2013-07-01) from the modeling
+input_df <- bind_rows(entest_df, widetest_df)
+
+input_df <- input_df |>
+  mutate(nonbot_commit_count = commit_count - bot_commit_count)|>
+  mutate(other_commit_count = nonbot_commit_count - mediawiki_dev_commit_count - wikia_commit_count - wikimedia_commit_count) |>
+  mutate(wikimedia_commit_count = wikimedia_commit_count + mediawiki_dev_commit_count + wikia_commit_count) |>
+  dplyr::select(-mediawiki_dev_commit_count) |>
+  dplyr::select(-wikia_commit_count)
+
+#get into mlm format
+long_df <- input_df |>
+  pivot_longer(cols = c(other_commit_count, wikimedia_commit_count),
+               names_to = "commit_type",
+               values_to = "lengthened_commit_count")
+
+intermediate_long_df <- long_df |>
+  mutate(commit_share = lengthened_commit_count / (nonbot_commit_count)) |>
+  mutate(log_commits = log1p(lengthened_commit_count))|>
+  mutate(scaled_long_commits = lengthened_commit_count / 10)
+
+intermediate_long_df <- intermediate_long_df |>
+  drop_na()
+
+window_num <- 4
+final_long_df <- intermediate_long_df |>
+  filter(relative_week >= (- window_num) & relative_week <= (window_num))
+
+commit_plot <- final_long_df |>
+  ggplot(aes(x=relative_week,
+             y=lengthened_commit_count,
+             color=commit_type,
+             linetype = rd_event)) +
+  geom_line() +
+  geom_point() +
+  labs(x = "Relative Week", y = "Nonbot Commits", linetype = "Testing Event", color="Commit Author Affiliation") +
+  scale_linetype_discrete(labels = c("enwiki testing (2012-12-12)", "wide testing (2013-04-25)")) +
+  scale_color_discrete(labels = c("Unaffiliated", "Organizationally Affiliated")) +
+  ggtitle("VisualEditor Nonbot Commit Count Around Opt-In Testing Events (by Affiliation)") +
+  theme_bw() +
+  theme(legend.position = "top")
+commit_plot
+
+total_commit_plot <- final_long_df |>
+  filter(commit_type == "other_commit_count")|>
+  ggplot(aes(x=relative_week,
+             y=nonbot_commit_count,
+             linetype = rd_event)) +
+  geom_line() +
+  geom_point() +
+  labs(x = "Relative Week", y = "Nonbot Commit Count", linetype = "Testing Event") +
+  scale_linetype_discrete(labels = c("enwiki testing (2012-12-12)", "wide testing (2013-04-25)")) +
+  ggtitle("VisualEditor Nonbot Commit Count Around Opt-In Testing Events") +
+  theme_bw() +
+  theme(legend.position = "top")
+total_commit_plot
+
+commit_share_plot <- final_long_df |>
+  ggplot(aes(x=relative_week,
+             y=commit_share,
+             color=commit_type,
+             linetype = rd_event)) +
+  geom_line() +
+  geom_point() +
+  labs(x = "Relative Week", y = "Share of Nonbot Commits", linetype = "Testing Event", color="Commit Author Affiliation") +
+  scale_linetype_discrete(labels = c("enwiki testing (2012-12-12)", "wide testing (2013-04-25)")) +
+  scale_color_discrete(labels = c("Unaffiliated", "Organizationally Affiliated")) +
+  ggtitle("VisualEditor Nonbot Commit Share Around Opt-In Testing Events") +
+  theme_bw() +
+  theme(legend.position = "top")
+commit_share_plot
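testing-share-plotting.R renders the three plot objects but never writes them to disk; the PNGs added under commit_analysis/case1/ earlier in this commit were presumably exported separately. A sketch of one way to do that with ggsave, assuming the repository root as the working directory; the pairing of object to filename and the dimensions are assumptions:

# assumed mapping of plot objects to the PNGs added in this commit
ggsave("commit_analysis/case1/0301-ve-testing-commit-plot.png", plot = commit_plot, width = 9, height = 5, dpi = 300)
ggsave("commit_analysis/case1/030125_ve_testing_commits_ba_plot.png", plot = total_commit_plot, width = 9, height = 5, dpi = 300)
ggsave("commit_analysis/case1/030125_ve-testing-share-ba.png", plot = commit_share_plot, width = 9, height = 5, dpi = 300)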
diff --git a/mgaughan-rstudio-server_24451895.out b/mgaughan-rstudio-server_24451895.out
deleted file mode 100644
index d86b0de..0000000
--- a/mgaughan-rstudio-server_24451895.out
+++ /dev/null
@@ -1,18 +0,0 @@
-1. SSH tunnel from your workstation using the following command:
-
-   ssh -N -L 8787:n3439:56635 mjilg@klone.hyak.uw.edu
-
-   and point your web browser to http://localhost:8787
-
-2. log in to RStudio Server using the following credentials:
-
-   user: mjilg
-   password: gYARjsVrF/GA3VDv3MYl
-
-When done using RStudio Server, terminate the job by:
-
-1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
-2. Issue the following command on the login node:
-
-      scancel -f 24451895
-slurmstepd: error: *** JOB 24451895 ON n3439 CANCELLED AT 2025-02-26T10:32:49 ***
diff --git a/text_analysis/case1/030125_rfc_dfm.rds b/text_analysis/case1/030125_rfc_dfm.rds
new file mode 100644
index 0000000..6dd8c86
Binary files /dev/null and b/text_analysis/case1/030125_rfc_dfm.rds differ
diff --git a/text_analysis/case1/030125_ve_rfc_stm.rds b/text_analysis/case1/030125_ve_rfc_stm.rds
new file mode 100644
index 0000000..fc7da6e
Binary files /dev/null and b/text_analysis/case1/030125_ve_rfc_stm.rds differ
diff --git a/text_analysis/case1/case1_stm.R b/text_analysis/case1/case1_stm.R
new file mode 100644
index 0000000..41b878a
--- /dev/null
+++ b/text_analysis/case1/case1_stm.R
@@ -0,0 +1,69 @@
+library(tidyverse)
+library(quanteda)
+library(lubridate)
+library(quanteda.textmodels)
+library(lexicon)
+library(stm)
+
+rfc_df <- read.csv("/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0220_ve_rfcs_text.csv")
+rfc_df$doc_id = 1:nrow(rfc_df)
+#some cleaning around the timestamp of the comment made
+rfc_df$posix_timestamp = parse_date_time(gsub("\\(UTC\\)", "", rfc_df$date_created), orders = "HM, dmy", tz = "UTC")
+rfc_corp = corpus(rfc_df$comment_text,
+                  docvars = rfc_df,
+                  docnames = rfc_df$doc_id)
+
+rfc_tokens <- tokens(rfc_corp,
+                     what = "word",
+                     remove_punct = TRUE,
+                     remove_symbols = TRUE,
+                     remove_numbers = TRUE,
+                     remove_url = TRUE,
+                     remove_separators = TRUE,
+                     include_docvars = TRUE)
+
+#removing not only english stopwords but some bespoke ones too
+additional_stopwords <- c(c("talk", "user", "n", "utc"), rfc_df$Author)
+#take out references to other authors
+custom_stopwords <- c(stopwords("english"), additional_stopwords)
+
+rfc_dfm <- rfc_tokens|>
+  dfm() |>
+  dfm_select(pattern = custom_stopwords,
+             selection = c("remove"),
+             valuetype = c("fixed"))
+
+rfc_dfm_lemmatized = dfm_replace(rfc_dfm,
+                                 pattern = lexicon::hash_lemmas$token,
+                                 replacement = lexicon::hash_lemmas$lemma)
+
+rfc_feature_counts <- colSums(rfc_dfm_lemmatized)
+docvars(rfc_dfm_lemmatized)$doc_id = docnames(rfc_dfm_lemmatized)
+#saveRDS(rfc_dfm_lemmatized, file="text_analysis/case1/030125_rfc_dfm.rds")
+rfc_dfm_stm = convert(rfc_dfm_lemmatized, to = "stm",
+                      docvars = docvars(rfc_dfm_lemmatized))
+
+#run the model
+K = 5
+seed = 9021000
+
+#model = stm(documents = rfc_dfm_stm$documents,
+#            vocab = rfc_dfm_stm$vocab,
+#            K = K,
+#            seed = seed,
+#            data = rfc_df,
+#            prevalence=~posix_timestamp,
+#            verbose = TRUE)
+plot(model)
+saveRDS(model, file = "text_analysis/case1/030125_ve_rfc_stm.rds")
+
+labelTopics(model, topics = c(5, 4, 2, 3, 1), n = 10)
+
+
+results = data.frame(text = corpus_subset(rfc_corp, docnames(rfc_corp) %in% rfc_dfm_stm$meta$doc_id), date = rfc_dfm_stm$meta$posix_timestamp, model$theta)
+
+cat(results[order(-results$X1),"text"][1])
+cat(results[order(-results$X1),"text"][2])
+cat(results[order(-results$X1),"text"][3])
+cat(results[order(-results$X1),"text"][4])
+cat(results[order(-results$X1),"text"][5])
diff --git a/text_analysis/ve_dependency.ipynb b/text_analysis/case1/ve_dependency.ipynb
similarity index 100%
rename from text_analysis/ve_dependency.ipynb
rename to text_analysis/case1/ve_dependency.ipynb
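As committed, case1_stm.R leaves the stm() call commented out but still runs plot(model), saveRDS(model, ...), labelTopics(model, ...), and builds results from model$theta, so a clean top-to-bottom run has no model object. A minimal sketch of the fit those lines assume, reusing the K, seed, and prevalence formula from the commented-out block (refitting is slow on the full RFC corpus):

# refit the 5-topic structural topic model described in the commented-out block,
# using the rfc_dfm_stm documents/vocab and docvars prepared above
model <- stm(documents = rfc_dfm_stm$documents,
             vocab = rfc_dfm_stm$vocab,
             K = K,
             seed = seed,
             data = rfc_df,
             prevalence = ~posix_timestamp,
             verbose = TRUE)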
diff --git a/text_analysis/phab_topic_trends.R b/text_analysis/phab_topic_trends.R
new file mode 100644
index 0000000..a2d8a50
--- /dev/null
+++ b/text_analysis/phab_topic_trends.R
@@ -0,0 +1,78 @@
+library(tidyverse)
+library(quanteda)
+library(lubridate)
+library(quanteda.textmodels)
+library(lexicon)
+library(stm)
+
+phab_df <- read.csv("/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv")
+phab_df$doc_id = 1:nrow(phab_df)
+
+phab_df$utc_date <- as.POSIXct(phab_df$date_created, origin = "1970-01-01", tz = "UTC")
+phab_df <- phab_df|>
+  filter(date_created > 1351728001 & date_created < 1383263999)
+
+phab_corp = corpus(phab_df$comment_text,
+                   docvars = phab_df,
+                   docnames = phab_df$doc_id)
+
+phab_tokens <- tokens(phab_corp,
+                      what = "word",
+                      remove_punct = TRUE,
+                      remove_symbols = TRUE,
+                      remove_numbers = FALSE,
+                      remove_url = TRUE,
+                      remove_separators = TRUE,
+                      include_docvars = TRUE)
+
+#removing not only english stopwords but some bespoke ones too
+additional_stopwords <- c("and")
+#take out references to other authors
+custom_stopwords <- c(stopwords("english"), additional_stopwords)
+
+phab_dfm <- phab_tokens|>
+  dfm() |>
+  dfm_select(pattern = custom_stopwords,
+             selection = c("remove"),
+             valuetype = c("fixed"))
+
+phab_dfm_lemmatized = dfm_replace(phab_dfm,
+                                  pattern = lexicon::hash_lemmas$token,
+                                  replacement = lexicon::hash_lemmas$lemma)
+
+phab_feature_counts <- colSums(phab_dfm_lemmatized)
+docvars(phab_dfm_lemmatized)$doc_id = docnames(phab_dfm_lemmatized)
+
+#read in the RDS
+rfc_dfm_lemmatized <- readRDS("text_analysis/case1/030125_rfc_dfm.rds")
+new_phab_dfm_lemmatized <- dfm_match(phab_dfm_lemmatized, features = colnames(rfc_dfm_lemmatized))
+
+phab_dfm_stm = convert(new_phab_dfm_lemmatized, to = "stm",
+                       docvars = docvars(phab_dfm_lemmatized))
+
+#loading in the STM that was fitted over the RFC data
+stm_model <- readRDS("text_analysis/case1/030125_ve_rfc_stm.rds")
+plot(stm_model)
+#fit it over the new data
+new_topic_scores <- fitNewDocuments(stm_model, phab_dfm_stm$documents)
+#gives us 32058 comment scores to work with
+
+results = data.frame(text = corpus_subset(phab_corp, docnames(phab_corp) %in% phab_dfm_stm$meta$doc_id),
+                     date = phab_dfm_stm$meta$utc_date,
+                     affil=phab_dfm_stm$meta$WMFaffil,
+                     new_topic_scores$theta)
+# the issue is, of course, that these topics are not fit to the documents
+# but topic models must describe documents in terms of 1
+# so it will ill-fit to the phabricator comments
+
+grouped_results <- results |>
+  mutate(week = floor_date(as.POSIXct(date), "week")) |>
+  group_by(week, affil) |>
+  summarise(across(starts_with("X"), median, na.rm = TRUE))
+
+plot <- grouped_results |>
+  ggplot(aes(x=week,
+             y=X5,
+             color=affil)) +
+  geom_line()
+plot
\ No newline at end of file
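phab_topic_trends.R plots only topic X5 over time. If the other topic trajectories are also of interest, one option is to pivot the weekly medians long and facet by topic. This is a sketch building on the grouped_results object and the tidyverse session already loaded in that script, not part of the committed file:

# reshape the per-week, per-affiliation topic medians into long form
topic_trends <- grouped_results |>
  pivot_longer(cols = starts_with("X"),
               names_to = "topic",
               values_to = "median_theta")

# one panel per topic, colored by WMF affiliation as in the original plot
topic_trends |>
  ggplot(aes(x = week, y = median_theta, color = affil)) +
  geom_line() +
  facet_wrap(~topic) +
  labs(x = "Week", y = "Median topic proportion", color = "WMF affiliation") +
  theme_bw()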