updating some of the text analysis

parent 98fcf85e48
commit 7858612d60
.ipynb_checkpoints/BERT-hw-checkpoint.ipynb (new file, 72 lines)
@@ -0,0 +1,72 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2dd04d34-25c0-470f-973d-1325ce0df797",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The history saving thread hit an unexpected error (OperationalError('disk I/O error')).History will not be written to the database.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n",
+    "from transformers import Trainer, TrainingArguments\n",
+    "from torch.utils.data import Dataset\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix\n",
+    "import torch\n",
+    "import json"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3504b633-4999-47d0-a6eb-ce7916206ced",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_name = \"distilbert-base-uncased\"\n",
+    "model = AutoModelForSequenceClassification.from_pretrained(model_name, \n",
+    "                                                           num_labels=1) \n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_name)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.21"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
@@ -17,3 +17,6 @@ cd ~
 ls
 ls .local
 rm -r -f .local
+cd rfc_df <- read.csv("/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0220_ve_rfcs_text.csv")
+cd /gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/
+ls
commit_analysis/case1/0301-ve-testing-commit-plot.png (new binary file; image, 46 KiB)
commit_analysis/case1/030125_ve-testing-share-ba.png (new binary file; image, 95 KiB)
commit_analysis/case1/030125_ve_testing_commits_ba_plot.png (new binary file; image, 93 KiB)
@@ -8,8 +8,9 @@ widetest_df <- read.csv(widetest_fp, header = TRUE) |> mutate(rd_event = "wide-t
 event_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_extensions_ve_weekly_commit_count_data.csv"
 event_df <- read.csv(event_fp, header = TRUE) |> mutate(rd_event = "default")
 
-input_df <- bind_rows(entest_df, widetest_df, event_df)
-#input_df <- bind_rows(entest_df, widetest_df)
+#input_df <- bind_rows(entest_df, widetest_df, event_df)
+#dropping the event (2013-07-01) from the modeling
+input_df <- bind_rows(entest_df, widetest_df)
 
 input_df <- input_df |>
   mutate(nonbot_commit_count = commit_count - bot_commit_count)|>
@@ -39,13 +40,13 @@ library(rdd)
 intermediate_long_df <- intermediate_long_df |>
   drop_na()
 
-var(intermediate_long_df$commit_share) # 1253.343
-mean(intermediate_long_df$commit_share) # 44.92381
-median(intermediate_long_df$commit_share) # 39.5
+var(intermediate_long_df$lengthened_commit_count) # 1253.343
+mean(intermediate_long_df$lengthened_commit_count) # 44.92381
+median(intermediate_long_df$lengthened_commit_count) # 39.5
 
 get_optimal_bandwidth <- function(df){
   bw <- tryCatch({
-    IKbandwidth(df$relative_week, df$commit_share, cutpoint = 0, verbose = FALSE, kernel = "triangular")
+    IKbandwidth(df$relative_week, df$lengthened_commit_count, cutpoint = 0, verbose = FALSE, kernel = "triangular")
   }, error = function(e) {
     NA
   })
@@ -54,12 +55,12 @@ get_optimal_bandwidth <- function(df){
 optimal_bandwidth <- get_optimal_bandwidth(intermediate_long_df)
 
 
-window_num <- 18
+window_num <- 4
 final_long_df <- intermediate_long_df |>
   filter(relative_week >= (- window_num) & relative_week <= (window_num))
 
 library(fitdistrplus)
-descdist(long_df$lengthened_commit_count, discrete=FALSE)
+descdist(final_long_df$lengthened_commit_count, discrete=FALSE)
 #start_values <- list(shape1 = 1, shape2 = 1)
 #fit <- MASS::fitdistr(as.numeric(long_df$lengthened_commit_count), "negative binomial")
 print(fit)
@@ -70,7 +71,7 @@ mlm <- glmer.nb(lengthened_commit_count ~ before_after*relative_week +
                   (before_after*relative_week|rd_event),
                 control=glmerControl(optimizer="bobyqa",
                                      optCtrl=list(maxfun=2e5)), nAGQ=0,
-                data=long_df)
+                data=final_long_df)
 #mlm <- lmer(lengthened_commit_count ~ before_after*relative_week+
 #                    (before_after*relative_week|commit_type) + 
 #                    (before_after*relative_week|rd_event) ,data=long_df)
@@ -95,7 +96,8 @@ icc(wikimedia_share_lmer)
 
 other_long_df <- final_long_df |>
   filter(commit_type == "other_commit_count")
-other_share_lmer <- lm(commit_share ~ before_after*relative_week,
+other_share_lmer <- lmer(commit_share ~ before_after*relative_week +
+                         (1| rd_event),
                            data=other_long_df)
 summary(other_share_lmer)
 icc(other_share_lmer)
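The last hunk above swaps the pooled lm() for a partially pooled lmer() with a random intercept per rd_event. A minimal sketch (editor's, not part of the commit) of how the two specifications could be compared, assuming lme4 for lmer() and performance as the source of icc():

library(lme4)
library(performance)

# pooled fit, as in the old version of the script
pooled <- lm(commit_share ~ before_after * relative_week, data = other_long_df)
# partially pooled fit; REML = FALSE gives a maximum-likelihood fit so the
# AIC values below are comparable with the lm() fit
partial <- lmer(commit_share ~ before_after * relative_week + (1 | rd_event),
                data = other_long_df, REML = FALSE)

AIC(pooled, partial)  # lower AIC favors the random-intercept specification
icc(partial)          # share of variance attributable to rd_event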
commit_analysis/testing-share-plotting.R (new file, 82 lines)
@@ -0,0 +1,82 @@
+library(tidyverse)
+entest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/en-testing_0217_extensions_ve_weekly_commit_count_data.csv"
+entest_df <- read.csv(entest_fp, header = TRUE) |> mutate(rd_event = "en-testing")
+
+widetest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/wide-testing_0217_extensions_ve_weekly_commit_count_data.csv"
+widetest_df <- read.csv(widetest_fp, header = TRUE) |> mutate(rd_event = "wide-testing")
+
+event_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_extensions_ve_weekly_commit_count_data.csv"
+event_df <- read.csv(event_fp, header = TRUE) |> mutate(rd_event = "default")
+
+#input_df <- bind_rows(entest_df, widetest_df, event_df)
+#dropping the event (2013-07-01) from the modeling
+input_df <- bind_rows(entest_df, widetest_df)
+
+input_df <- input_df |>
+  mutate(nonbot_commit_count = commit_count - bot_commit_count)|>
+  mutate(other_commit_count = nonbot_commit_count - mediawiki_dev_commit_count - wikia_commit_count - wikimedia_commit_count) |>
+  mutate(wikimedia_commit_count = wikimedia_commit_count + mediawiki_dev_commit_count + wikia_commit_count) |>
+  dplyr::select(-mediawiki_dev_commit_count) |>
+  dplyr::select(-wikia_commit_count)
+
+#get into mlm format
+long_df <- input_df |>
+  pivot_longer(cols = c(other_commit_count, wikimedia_commit_count),
+               names_to = "commit_type",
+               values_to = "lengthened_commit_count")
+
+intermediate_long_df <- long_df |>
+  mutate(commit_share = lengthened_commit_count / (nonbot_commit_count)) |>
+  mutate(log_commits = log1p(lengthened_commit_count))|>
+  mutate(scaled_long_commits = lengthened_commit_count / 10)
+
+intermediate_long_df <- intermediate_long_df |>
+  drop_na()
+
+window_num <- 4
+final_long_df <- intermediate_long_df |>
+  filter(relative_week >= (- window_num) & relative_week <= (window_num))
+
+commit_plot <- final_long_df |>
+  ggplot(aes(x=relative_week,
+              y=lengthened_commit_count,
+              color=commit_type,
+              linetype = rd_event)) +
+  geom_line() +
+  geom_point() +
+  labs(x = "Relative Week", y = "Nonbot Commits", linetype = "Testing Event", color="Commit Author Affiliation") +
+  scale_linetype_discrete(labels = c("enwiki testing (2012-12-12)", "wide testing (2013-04-25)")) +
+  scale_color_discrete(labels = c("Unaffiliated", "Organizationally Affiliated")) +
+  ggtitle("VisualEditor Nonbot Commit Count Around Opt-In Testing Events (by Affiliation)") +
+  theme_bw() +
+  theme(legend.position = "top")
+commit_plot
+
+total_commit_plot <- final_long_df |>
+  filter(commit_type == "other_commit_count")|>
+  ggplot(aes(x=relative_week,
+             y=nonbot_commit_count,
+             linetype = rd_event)) +
+  geom_line() +
+  geom_point() +
+  labs(x = "Relative Week", y = "Nonbot Commit Count", linetype = "Testing Event") +
+  scale_linetype_discrete(labels = c("enwiki testing (2012-12-12)", "wide testing (2013-04-25)")) +
+  ggtitle("VisualEditor Nonbot Commit Count Around Opt-In Testing Events") +
+  theme_bw() +
+  theme(legend.position = "top")
+total_commit_plot
+
+commit_share_plot <- final_long_df |>
+  ggplot(aes(x=relative_week,
+             y=commit_share,
+             color=commit_type,
+             linetype = rd_event)) +
+  geom_line() +
+  geom_point() +
+  labs(x = "Relative Week", y = "Share of Nonbot Commits", linetype = "Testing Event", color="Commit Author Affiliation") +
+  scale_linetype_discrete(labels = c("enwiki testing (2012-12-12)", "wide testing (2013-04-25)")) +
+  scale_color_discrete(labels = c("Unaffiliated", "Organizationally Affiliated")) +
+  ggtitle("VisualEditor Nonbot Commit Share Around Opt-In Testing Events") +
+  theme_bw() +
+  theme(legend.position = "top")
+commit_share_plot
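The script prints its three plot objects but never writes them to disk, while this commit adds three PNGs under commit_analysis/case1/. A hypothetical sketch of the save step (file names are taken from the commit; the plot-to-file pairing, dimensions, and dpi are editor's assumptions):

# hypothetical ggsave() calls; width/height/dpi and the pairing are guesses
ggsave("commit_analysis/case1/0301-ve-testing-commit-plot.png",
       plot = total_commit_plot, width = 9, height = 5, dpi = 300)
ggsave("commit_analysis/case1/030125_ve_testing_commits_ba_plot.png",
       plot = commit_plot, width = 9, height = 5, dpi = 300)
ggsave("commit_analysis/case1/030125_ve-testing-share-ba.png",
       plot = commit_share_plot, width = 9, height = 5, dpi = 300)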
@@ -1,18 +0,0 @@
-1. SSH tunnel from your workstation using the following command:
-
-   ssh -N -L 8787:n3439:56635 mjilg@klone.hyak.uw.edu
-
-   and point your web browser to http://localhost:8787
-
-2. log in to RStudio Server using the following credentials:
-
-   user: mjilg
-   password: gYARjsVrF/GA3VDv3MYl
-
-When done using RStudio Server, terminate the job by:
-
-1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
-2. Issue the following command on the login node:
-
-      scancel -f 24451895
-slurmstepd: error: *** JOB 24451895 ON n3439 CANCELLED AT 2025-02-26T10:32:49 ***
text_analysis/case1/030125_rfc_dfm.rds (new binary file)
text_analysis/case1/030125_ve_rfc_stm.rds (new binary file)
text_analysis/case1/case1_stm.R (new file, 69 lines)
@@ -0,0 +1,69 @@
+library(tidyverse)
+library(quanteda)
+library(lubridate)
+library(quanteda.textmodels)
+library(lexicon)
+library(stm)
+
+rfc_df <- read.csv("/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0220_ve_rfcs_text.csv")
+rfc_df$doc_id = 1:nrow(rfc_df)
+#some cleaning around the timestamp of the comment made
+rfc_df$posix_timestamp = parse_date_time(gsub("\\(UTC\\)", "", rfc_df$date_created), orders = "HM, dmy", tz = "UTC")
+rfc_corp = corpus(rfc_df$comment_text,
+                  docvars = rfc_df,
+                  docnames = rfc_df$doc_id)
+
+rfc_tokens <- tokens(rfc_corp,
+                     what = "word",
+                     remove_punct = TRUE,
+                     remove_symbols = TRUE,
+                     remove_numbers = TRUE,
+                     remove_url = TRUE,
+                     remove_separators = TRUE,
+                     include_docvars = TRUE)
+
+#removing not only english stopwords but some bespoke ones too
+additional_stopwords <- c(c("talk", "user", "n", "utc"), rfc_df$Author)
+#take out references to other authors
+custom_stopwords <- c(stopwords("english"), additional_stopwords)
+
+rfc_dfm <- rfc_tokens|>
+  dfm() |>
+  dfm_select(pattern = custom_stopwords,
+             selection = c("remove"),
+             valuetype = c("fixed"))
+
+rfc_dfm_lemmatized = dfm_replace(rfc_dfm,
+                                 pattern = lexicon::hash_lemmas$token,
+                                 replacement = lexicon::hash_lemmas$lemma)
+
+rfc_feature_counts <- colSums(rfc_dfm_lemmatized)
+docvars(rfc_dfm_lemmatized)$doc_id = docnames(rfc_dfm_lemmatized)
+#saveRDS(rfc_dfm_lemmatized, file="text_analysis/case1/030125_rfc_dfm.rds")
+rfc_dfm_stm = convert(rfc_dfm_lemmatized, to = "stm",
+                      docvars = docvars(rfc_dfm_lemmatized))
+
+#run the model
+K = 5
+seed = 9021000
+
+#model = stm(documents = rfc_dfm_stm$documents,
+#            vocab = rfc_dfm_stm$vocab,
+#            K = K,
+#            seed = seed,
+#            data = rfc_df,
+#            prevalence=~posix_timestamp,
+#            verbose = TRUE)
+plot(model)
+saveRDS(model, file = "text_analysis/case1/030125_ve_rfc_stm.rds")
+
+labelTopics(model, topics = c(5, 4, 2, 3, 1), n = 10)
+
+
+results = data.frame(text = corpus_subset(rfc_corp, docnames(rfc_corp) %in% rfc_dfm_stm$meta$doc_id), date =  rfc_dfm_stm$meta$posix_timestamp, model$theta)
+
+cat(results[order(-results$X1),"text"][1])
+cat(results[order(-results$X1),"text"][2])
+cat(results[order(-results$X1),"text"][3])
+cat(results[order(-results$X1),"text"][4])
+cat(results[order(-results$X1),"text"][5])
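Note that the stm() call in case1_stm.R is committed commented out, so plot(model) and saveRDS(model, ...) rely on a model object left over from an earlier interactive run, and K = 5 is fixed by hand. A hedged sketch (editor's, not in the commit) of how candidate values of K could instead be screened with stm::searchK() on the same objects:

# searchK refits the model at each candidate K and reports held-out
# likelihood, residuals, and semantic coherence per K
k_search <- searchK(rfc_dfm_stm$documents, rfc_dfm_stm$vocab,
                    K = c(3, 5, 7, 10),
                    heldout.seed = seed)
plot(k_search)  # compare diagnostics across the candidate K values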
text_analysis/phab_topic_trends.R (new file, 78 lines)
@@ -0,0 +1,78 @@
+library(tidyverse)
+library(quanteda)
+library(lubridate)
+library(quanteda.textmodels)
+library(lexicon)
+library(stm)
+
+phab_df <- read.csv("/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv")
+phab_df$doc_id = 1:nrow(phab_df)
+
+phab_df$utc_date <- as.POSIXct(phab_df$date_created, origin = "1970-01-01", tz = "UTC")
+phab_df <- phab_df|>
+  filter(date_created > 1351728001 & date_created < 1383263999)
+
+phab_corp = corpus(phab_df$comment_text,
+                  docvars = phab_df,
+                  docnames = phab_df$doc_id)
+
+phab_tokens <- tokens(phab_corp,
+                     what = "word",
+                     remove_punct = TRUE,
+                     remove_symbols = TRUE,
+                     remove_numbers = FALSE,
+                     remove_url = TRUE,
+                     remove_separators = TRUE,
+                     include_docvars = TRUE)
+
+#removing not only english stopwords but some bespoke ones too
+additional_stopwords <- c("and")
+#take out references to other authors
+custom_stopwords <- c(stopwords("english"), additional_stopwords)
+
+phab_dfm <- phab_tokens|>
+  dfm() |>
+  dfm_select(pattern = custom_stopwords,
+             selection = c("remove"),
+             valuetype = c("fixed"))
+
+phab_dfm_lemmatized = dfm_replace(phab_dfm,
+                                 pattern = lexicon::hash_lemmas$token,
+                                 replacement = lexicon::hash_lemmas$lemma)
+
+phab_feature_counts <- colSums(phab_dfm_lemmatized)
+docvars(phab_dfm_lemmatized)$doc_id = docnames(phab_dfm_lemmatized)
+
+#read in the RDS
+rfc_dfm_lemmatized <- readRDS("text_analysis/case1/030125_rfc_dfm.rds")
+new_phab_dfm_lemmatized <- dfm_match(phab_dfm_lemmatized, features = colnames(rfc_dfm_lemmatized))
+
+phab_dfm_stm = convert(new_phab_dfm_lemmatized, to = "stm",
+                      docvars = docvars(phab_dfm_lemmatized))
+
+#loading in the STM that was fitted over the RFC data
+stm_model <- readRDS("text_analysis/case1/030125_ve_rfc_stm.rds")
+plot(stm_model)
+#fit it over the new data
+new_topic_scores <- fitNewDocuments(stm_model, phab_dfm_stm$documents)
+#gives us 32058 comment scores to work with
+
+results = data.frame(text = corpus_subset(phab_corp, docnames(phab_corp) %in% phab_dfm_stm$meta$doc_id),
+                     date = phab_dfm_stm$meta$utc_date,
+                     affil=phab_dfm_stm$meta$WMFaffil,
+                     new_topic_scores$theta)
+# the issue is, of course, that these topics are not fit to the documents
+# but topic models must describe documents in terms of 1
+# so it will ill-fit to the phabricator comments
+
+grouped_results <- results |>
+  mutate(week = floor_date(as.POSIXct(date), "week")) |>
+  group_by(week, affil) |>
+  summarise(across(starts_with("X"), median, na.rm = TRUE))
+
+plot <- grouped_results |>
+  ggplot(aes(x=week,
+             y=X5,
+             color=affil)) +
+  geom_line()
+plot
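The final plot tracks only topic 5 (X5) by affiliation. A small extension sketch (editor's, not in the commit) that reshapes grouped_results so all five topic trajectories are faceted at once:

# pivot the per-week medians from wide (X1..X5) to long, then facet by topic
all_topics_plot <- grouped_results |>
  pivot_longer(cols = starts_with("X"),
               names_to = "topic", values_to = "median_theta") |>
  ggplot(aes(x = week, y = median_theta, color = affil)) +
  geom_line() +
  facet_wrap(~ topic) +
  labs(x = "Week", y = "Median topic proportion", color = "WMF affiliation") +
  theme_bw()
all_topics_plot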