updating some of the text analysis
This commit is contained in:
parent 98fcf85e48
commit 7858612d60
72  .ipynb_checkpoints/BERT-hw-checkpoint.ipynb  Normal file
@@ -0,0 +1,72 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2dd04d34-25c0-470f-973d-1325ce0df797",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The history saving thread hit an unexpected error (OperationalError('disk I/O error')).History will not be written to the database.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n",
    "from transformers import Trainer, TrainingArguments\n",
    "from torch.utils.data import Dataset\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix\n",
    "import torch\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3504b633-4999-47d0-a6eb-ce7916206ced",
   "metadata": {},
   "outputs": [],
   "source": [
    "model_name = \"distilbert-base-uncased\"\n",
    "model = AutoModelForSequenceClassification.from_pretrained(model_name, \n",
    "                                                           num_labels=1) \n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.21"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
@@ -17,3 +17,6 @@ cd ~
 ls
 ls .local
 rm -r -f .local
+cd rfc_df <- read.csv("/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0220_ve_rfcs_text.csv")
+cd /gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/
+ls
BIN  commit_analysis/case1/0301-ve-testing-commit-plot.png  Normal file
Binary file not shown. (After: 46 KiB)
BIN  commit_analysis/case1/030125_ve-testing-share-ba.png  Normal file
Binary file not shown. (After: 95 KiB)
BIN  commit_analysis/case1/030125_ve_testing_commits_ba_plot.png  Normal file
Binary file not shown. (After: 93 KiB)
@@ -8,8 +8,9 @@ widetest_df <- read.csv(widetest_fp, header = TRUE) |> mutate(rd_event = "wide-t
 event_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_extensions_ve_weekly_commit_count_data.csv"
 event_df <- read.csv(event_fp, header = TRUE) |> mutate(rd_event = "default")
 
-input_df <- bind_rows(entest_df, widetest_df, event_df)
-#input_df <- bind_rows(entest_df, widetest_df)
+#input_df <- bind_rows(entest_df, widetest_df, event_df)
+#dropping the event (2013-07-01) from the modeling
+input_df <- bind_rows(entest_df, widetest_df)
 
 input_df <- input_df |>
   mutate(nonbot_commit_count = commit_count - bot_commit_count)|>
@@ -39,13 +40,13 @@ library(rdd)
 intermediate_long_df <- intermediate_long_df |>
   drop_na()
 
-var(intermediate_long_df$commit_share) # 1253.343
-mean(intermediate_long_df$commit_share) # 44.92381
-median(intermediate_long_df$commit_share) # 39.5
+var(intermediate_long_df$lengthened_commit_count) # 1253.343
+mean(intermediate_long_df$lengthened_commit_count) # 44.92381
+median(intermediate_long_df$lengthened_commit_count) # 39.5
 
 get_optimal_bandwidth <- function(df){
   bw <- tryCatch({
-    IKbandwidth(df$relative_week, df$commit_share, cutpoint = 0, verbose = FALSE, kernel = "triangular")
+    IKbandwidth(df$relative_week, df$lengthened_commit_count, cutpoint = 0, verbose = FALSE, kernel = "triangular")
   }, error = function(e) {
     NA
   })
@@ -54,12 +55,12 @@ get_optimal_bandwidth <- function(df){
 optimal_bandwidth <- get_optimal_bandwidth(intermediate_long_df)
 
 
-window_num <- 18
+window_num <- 4
 final_long_df <- intermediate_long_df |>
   filter(relative_week >= (- window_num) & relative_week <= (window_num))
 
 library(fitdistrplus)
-descdist(long_df$lengthened_commit_count, discrete=FALSE)
+descdist(final_long_df$lengthened_commit_count, discrete=FALSE)
 #start_values <- list(shape1 = 1, shape2 = 1)
 #fit <- MASS::fitdistr(as.numeric(long_df$lengthened_commit_count), "negative binomial")
 print(fit)
@@ -70,7 +71,7 @@ mlm <- glmer.nb(lengthened_commit_count ~ before_after*relative_week +
                   (before_after*relative_week|rd_event),
                 control=glmerControl(optimizer="bobyqa",
                                      optCtrl=list(maxfun=2e5)), nAGQ=0,
-                data=long_df)
+                data=final_long_df)
 #mlm <- lmer(lengthened_commit_count ~ before_after*relative_week+
 #                    (before_after*relative_week|commit_type) + 
 #                    (before_after*relative_week|rd_event) ,data=long_df)
@@ -95,7 +96,8 @@ icc(wikimedia_share_lmer)
 
 other_long_df <- final_long_df |>
   filter(commit_type == "other_commit_count")
-other_share_lmer <- lm(commit_share ~ before_after*relative_week,
+other_share_lmer <- lmer(commit_share ~ before_after*relative_week +
+                         (1| rd_event),
                            data=other_long_df)
 summary(other_share_lmer)
 icc(other_share_lmer)
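These hunks swap the running variable handed to rdd::IKbandwidth from commit_share to lengthened_commit_count and shrink the hand-picked window from 18 to 4 weeks, even though an optimal bandwidth is computed just above. A minimal sketch of letting that bandwidth drive the window instead (hypothetical; reuses the script's optimal_bandwidth and intermediate_long_df):

# hypothetical: derive the window from the Imbens-Kalyanaraman bandwidth,
# falling back to the hand-picked 4-week cut when it cannot be computed
window_num <- if (is.na(optimal_bandwidth)) 4 else ceiling(optimal_bandwidth)
final_long_df <- intermediate_long_df |>
  filter(abs(relative_week) <= window_num)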
82  commit_analysis/testing-share-plotting.R  Normal file
@@ -0,0 +1,82 @@
library(tidyverse)
entest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/en-testing_0217_extensions_ve_weekly_commit_count_data.csv"
entest_df <- read.csv(entest_fp, header = TRUE) |> mutate(rd_event = "en-testing")

widetest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/wide-testing_0217_extensions_ve_weekly_commit_count_data.csv"
widetest_df <- read.csv(widetest_fp, header = TRUE) |> mutate(rd_event = "wide-testing")

event_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_extensions_ve_weekly_commit_count_data.csv"
event_df <- read.csv(event_fp, header = TRUE) |> mutate(rd_event = "default")

#input_df <- bind_rows(entest_df, widetest_df, event_df)
#dropping the event (2013-07-01) from the modeling
input_df <- bind_rows(entest_df, widetest_df)

input_df <- input_df |>
  mutate(nonbot_commit_count = commit_count - bot_commit_count)|>
  mutate(other_commit_count = nonbot_commit_count - mediawiki_dev_commit_count - wikia_commit_count - wikimedia_commit_count) |>
  mutate(wikimedia_commit_count = wikimedia_commit_count + mediawiki_dev_commit_count + wikia_commit_count) |>
  dplyr::select(-mediawiki_dev_commit_count) |>
  dplyr::select(-wikia_commit_count)

#get into mlm format
long_df <- input_df |>
  pivot_longer(cols = c(other_commit_count, wikimedia_commit_count),
               names_to = "commit_type",
               values_to = "lengthened_commit_count")

intermediate_long_df <- long_df |>
  mutate(commit_share = lengthened_commit_count / (nonbot_commit_count)) |>
  mutate(log_commits = log1p(lengthened_commit_count))|>
  mutate(scaled_long_commits = lengthened_commit_count / 10)

intermediate_long_df <- intermediate_long_df |>
  drop_na()

window_num <- 4
final_long_df <- intermediate_long_df |>
  filter(relative_week >= (- window_num) & relative_week <= (window_num))

commit_plot <- final_long_df |>
  ggplot(aes(x=relative_week,
             y=lengthened_commit_count,
             color=commit_type,
             linetype = rd_event)) +
  geom_line() +
  geom_point() +
  labs(x = "Relative Week", y = "Nonbot Commits", linetype = "Testing Event", color="Commit Author Affiliation") +
  scale_linetype_discrete(labels = c("enwiki testing (2012-12-12)", "wide testing (2013-04-25)")) +
  scale_color_discrete(labels = c("Unaffiliated", "Organizationally Affiliated")) +
  ggtitle("VisualEditor Nonbot Commit Count Around Opt-In Testing Events (by Affiliation)") +
  theme_bw() +
  theme(legend.position = "top")
commit_plot

total_commit_plot <- final_long_df |>
  filter(commit_type == "other_commit_count")|>
  ggplot(aes(x=relative_week,
             y=nonbot_commit_count,
             linetype = rd_event)) +
  geom_line() +
  geom_point() +
  labs(x = "Relative Week", y = "Nonbot Commit Count", linetype = "Testing Event") +
  scale_linetype_discrete(labels = c("enwiki testing (2012-12-12)", "wide testing (2013-04-25)")) +
  ggtitle("VisualEditor Nonbot Commit Count Around Opt-In Testing Events") +
  theme_bw() +
  theme(legend.position = "top")
total_commit_plot

commit_share_plot <- final_long_df |>
  ggplot(aes(x=relative_week,
             y=commit_share,
             color=commit_type,
             linetype = rd_event)) +
  geom_line() +
  geom_point() +
  labs(x = "Relative Week", y = "Share of Nonbot Commits", linetype = "Testing Event", color="Commit Author Affiliation") +
  scale_linetype_discrete(labels = c("enwiki testing (2012-12-12)", "wide testing (2013-04-25)")) +
  scale_color_discrete(labels = c("Unaffiliated", "Organizationally Affiliated")) +
  ggtitle("VisualEditor Nonbot Commit Share Around Opt-In Testing Events") +
  theme_bw() +
  theme(legend.position = "top")
commit_share_plot
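The three plots above are rendered but never written to disk, while this commit also adds three PNGs under commit_analysis/case1/. A sketch of how they were presumably saved (hypothetical: the plot-to-file mapping and the dimensions are guesses based only on the file names):

# hypothetical ggsave calls matching the images added in this commit
ggsave("commit_analysis/case1/0301-ve-testing-commit-plot.png", total_commit_plot,
       width = 9, height = 5, dpi = 300)
ggsave("commit_analysis/case1/030125_ve-testing-share-ba.png", commit_share_plot,
       width = 9, height = 5, dpi = 300)
ggsave("commit_analysis/case1/030125_ve_testing_commits_ba_plot.png", commit_plot,
       width = 9, height = 5, dpi = 300)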
@@ -1,18 +0,0 @@
1. SSH tunnel from your workstation using the following command:

   ssh -N -L 8787:n3439:56635 mjilg@klone.hyak.uw.edu

   and point your web browser to http://localhost:8787

2. log in to RStudio Server using the following credentials:

   user: mjilg
   password: gYARjsVrF/GA3VDv3MYl

When done using RStudio Server, terminate the job by:

1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
2. Issue the following command on the login node:

      scancel -f 24451895
slurmstepd: error: *** JOB 24451895 ON n3439 CANCELLED AT 2025-02-26T10:32:49 ***
BIN  text_analysis/case1/030125_rfc_dfm.rds  Normal file
Binary file not shown.
BIN  text_analysis/case1/030125_ve_rfc_stm.rds  Normal file
Binary file not shown.
69  text_analysis/case1/case1_stm.R  Normal file
@@ -0,0 +1,69 @@
library(tidyverse)
library(quanteda)
library(lubridate)
library(quanteda.textmodels)
library(lexicon)
library(stm)

rfc_df <- read.csv("/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0220_ve_rfcs_text.csv")
rfc_df$doc_id = 1:nrow(rfc_df)
#some cleaning around the timestamp of the comment
rfc_df$posix_timestamp = parse_date_time(gsub("\\(UTC\\)", "", rfc_df$date_created), orders = "HM, dmy", tz = "UTC")
rfc_corp = corpus(rfc_df$comment_text,
                  docvars = rfc_df,
                  docnames = rfc_df$doc_id)

rfc_tokens <- tokens(rfc_corp,
                     what = "word",
                     remove_punct = TRUE,
                     remove_symbols = TRUE,
                     remove_numbers = TRUE,
                     remove_url = TRUE,
                     remove_separators = TRUE,
                     include_docvars = TRUE)

#remove not only English stopwords but some bespoke ones too;
#author names are included to take out references to other editors
additional_stopwords <- c("talk", "user", "n", "utc", rfc_df$Author)
custom_stopwords <- c(stopwords("english"), additional_stopwords)

rfc_dfm <- rfc_tokens |>
  dfm() |>
  dfm_select(pattern = custom_stopwords,
             selection = "remove",
             valuetype = "fixed")

rfc_dfm_lemmatized = dfm_replace(rfc_dfm,
                                 pattern = lexicon::hash_lemmas$token,
                                 replacement = lexicon::hash_lemmas$lemma)

rfc_feature_counts <- colSums(rfc_dfm_lemmatized)
docvars(rfc_dfm_lemmatized)$doc_id = docnames(rfc_dfm_lemmatized)
#saveRDS(rfc_dfm_lemmatized, file="text_analysis/case1/030125_rfc_dfm.rds")
rfc_dfm_stm = convert(rfc_dfm_lemmatized, to = "stm",
                      docvars = docvars(rfc_dfm_lemmatized))

#run the model; the fit is needed below (plot, saveRDS, labelTopics), so the
#call is live, and the prevalence covariate comes from the stm-converted
#metadata so it stays aligned with the documents
K = 5
seed = 9021000

model = stm(documents = rfc_dfm_stm$documents,
            vocab = rfc_dfm_stm$vocab,
            K = K,
            seed = seed,
            data = rfc_dfm_stm$meta,
            prevalence = ~posix_timestamp,
            verbose = TRUE)
plot(model)
saveRDS(model, file = "text_analysis/case1/030125_ve_rfc_stm.rds")

labelTopics(model, topics = c(5, 4, 2, 3, 1), n = 10)

#print the five comments that load most heavily on topic 1
results = data.frame(text = corpus_subset(rfc_corp, docnames(rfc_corp) %in% rfc_dfm_stm$meta$doc_id),
                     date = rfc_dfm_stm$meta$posix_timestamp,
                     model$theta)

cat(results[order(-results$X1), "text"][1])
cat(results[order(-results$X1), "text"][2])
cat(results[order(-results$X1), "text"][3])
cat(results[order(-results$X1), "text"][4])
cat(results[order(-results$X1), "text"][5])
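The script fixes K = 5 with no selection diagnostics shown. If that choice needed checking, stm's own search helper could be run first; a minimal sketch (assumes the rfc_dfm_stm object built above; the candidate K values are arbitrary):

# hypothetical: compare a few candidate topic counts before settling on K = 5
k_search <- searchK(documents = rfc_dfm_stm$documents,
                    vocab = rfc_dfm_stm$vocab,
                    K = c(3, 5, 7, 10),
                    data = rfc_dfm_stm$meta,
                    verbose = FALSE)
plot(k_search)  # held-out likelihood, residuals, and semantic coherence by K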
78  text_analysis/phab_topic_trends.R  Normal file
@@ -0,0 +1,78 @@
library(tidyverse)
library(quanteda)
library(lubridate)
library(quanteda.textmodels)
library(lexicon)
library(stm)

phab_df <- read.csv("/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv")
phab_df$doc_id = 1:nrow(phab_df)

#keep comments from 2012-11-01 through 2013-10-31 (bounds in Unix seconds)
phab_df$utc_date <- as.POSIXct(phab_df$date_created, origin = "1970-01-01", tz = "UTC")
phab_df <- phab_df |>
  filter(date_created > 1351728001 & date_created < 1383263999)

phab_corp = corpus(phab_df$comment_text,
                   docvars = phab_df,
                   docnames = phab_df$doc_id)

phab_tokens <- tokens(phab_corp,
                      what = "word",
                      remove_punct = TRUE,
                      remove_symbols = TRUE,
                      remove_numbers = FALSE,
                      remove_url = TRUE,
                      remove_separators = TRUE,
                      include_docvars = TRUE)

#remove English stopwords plus a bespoke one
additional_stopwords <- c("and")
custom_stopwords <- c(stopwords("english"), additional_stopwords)

phab_dfm <- phab_tokens |>
  dfm() |>
  dfm_select(pattern = custom_stopwords,
             selection = "remove",
             valuetype = "fixed")

phab_dfm_lemmatized = dfm_replace(phab_dfm,
                                  pattern = lexicon::hash_lemmas$token,
                                  replacement = lexicon::hash_lemmas$lemma)

phab_feature_counts <- colSums(phab_dfm_lemmatized)
docvars(phab_dfm_lemmatized)$doc_id = docnames(phab_dfm_lemmatized)

#read in the RFC dfm and align the Phabricator features to its vocabulary
rfc_dfm_lemmatized <- readRDS("text_analysis/case1/030125_rfc_dfm.rds")
new_phab_dfm_lemmatized <- dfm_match(phab_dfm_lemmatized, features = colnames(rfc_dfm_lemmatized))

phab_dfm_stm = convert(new_phab_dfm_lemmatized, to = "stm",
                       docvars = docvars(phab_dfm_lemmatized))

#loading in the STM that was fitted over the RFC data
stm_model <- readRDS("text_analysis/case1/030125_ve_rfc_stm.rds")
plot(stm_model)
#fit it over the new data
new_topic_scores <- fitNewDocuments(stm_model, phab_dfm_stm$documents)
#gives us 32058 comment scores to work with

results = data.frame(text = corpus_subset(phab_corp, docnames(phab_corp) %in% phab_dfm_stm$meta$doc_id),
                     date = phab_dfm_stm$meta$utc_date,
                     affil = phab_dfm_stm$meta$WMFaffil,
                     new_topic_scores$theta)
# the issue is, of course, that these topics were not fit to these documents;
# a topic model must describe every document as topic proportions summing to 1,
# so it will ill-fit the Phabricator comments

grouped_results <- results |>
  mutate(week = floor_date(as.POSIXct(date), "week")) |>
  group_by(week, affil) |>
  summarise(across(starts_with("X"), ~ median(.x, na.rm = TRUE)), .groups = "drop")

plot <- grouped_results |>
  ggplot(aes(x = week,
             y = X5,
             color = affil)) +
  geom_line()
plot
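The closing comments flag the core caveat: fitNewDocuments() must express every Phabricator comment as proportions over the five RFC topics, and each row of theta sums to 1 whether or not any topic fits. A quick sanity check along those lines (a sketch; reuses new_topic_scores from the script, and the 0.3 cutoff is an arbitrary illustration):

# every theta row is a probability distribution over the K = 5 RFC topics
stopifnot(all(abs(rowSums(new_topic_scores$theta) - 1) < 1e-6))
# near-uniform rows (max share barely above 1/5) flag ill-fitting comments
flat_rows <- apply(new_topic_scores$theta, 1, max) < 0.3
mean(flat_rows)  # share of comments that no RFC topic describes well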