updating some of the text analysis
commit 7858612d60
parent 98fcf85e48
.ipynb_checkpoints/BERT-hw-checkpoint.ipynb | 72 (new file)
@@ -0,0 +1,72 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2dd04d34-25c0-470f-973d-1325ce0df797",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The history saving thread hit an unexpected error (OperationalError('disk I/O error')).History will not be written to the database.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/gscratch/scrubbed/mjilg/jupyter-notebook/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n",
    "from transformers import Trainer, TrainingArguments\n",
    "from torch.utils.data import Dataset\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix\n",
    "import torch\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3504b633-4999-47d0-a6eb-ce7916206ced",
   "metadata": {},
   "outputs": [],
   "source": [
    "model_name = \"distilbert-base-uncased\"\n",
    "model = AutoModelForSequenceClassification.from_pretrained(model_name, \n",
    "                                                           num_labels=1) \n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.21"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
@@ -17,3 +17,6 @@ cd ~
 ls
 ls .local
 rm -r -f .local
+cd rfc_df <- read.csv("/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0220_ve_rfcs_text.csv")
+cd /gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/
+ls
commit_analysis/case1/0301-ve-testing-commit-plot.png | BIN (new file, 46 KiB, binary not shown)
commit_analysis/case1/030125_ve-testing-share-ba.png | BIN (new file, 95 KiB, binary not shown)
commit_analysis/case1/030125_ve_testing_commits_ba_plot.png | BIN (new file, 93 KiB, binary not shown)
@@ -8,8 +8,9 @@ widetest_df <- read.csv(widetest_fp, header = TRUE) |> mutate(rd_event = "wide-t
 event_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_extensions_ve_weekly_commit_count_data.csv"
 event_df <- read.csv(event_fp, header = TRUE) |> mutate(rd_event = "default")
 
-input_df <- bind_rows(entest_df, widetest_df, event_df)
-#input_df <- bind_rows(entest_df, widetest_df)
+#input_df <- bind_rows(entest_df, widetest_df, event_df)
+#dropping the event (2013-07-01) from the modeling
+input_df <- bind_rows(entest_df, widetest_df)
 
 input_df <- input_df |>
   mutate(nonbot_commit_count = commit_count - bot_commit_count)|>
@@ -39,13 +40,13 @@ library(rdd)
 intermediate_long_df <- intermediate_long_df |>
   drop_na()
 
-var(intermediate_long_df$commit_share) # 1253.343
-mean(intermediate_long_df$commit_share) # 44.92381
-median(intermediate_long_df$commit_share) # 39.5
+var(intermediate_long_df$lengthened_commit_count) # 1253.343
+mean(intermediate_long_df$lengthened_commit_count) # 44.92381
+median(intermediate_long_df$lengthened_commit_count) # 39.5
 
 get_optimal_bandwidth <- function(df){
   bw <- tryCatch({
-    IKbandwidth(df$relative_week, df$commit_share, cutpoint = 0, verbose = FALSE, kernel = "triangular")
+    IKbandwidth(df$relative_week, df$lengthened_commit_count, cutpoint = 0, verbose = FALSE, kernel = "triangular")
   }, error = function(e) {
     NA
   })
@@ -54,12 +55,12 @@ get_optimal_bandwidth <- function(df){
 optimal_bandwidth <- get_optimal_bandwidth(intermediate_long_df)
 
 
-window_num <- 18
+window_num <- 4
 final_long_df <- intermediate_long_df |>
   filter(relative_week >= (- window_num) & relative_week <= (window_num))
 
 library(fitdistrplus)
-descdist(long_df$lengthened_commit_count, discrete=FALSE)
+descdist(final_long_df$lengthened_commit_count, discrete=FALSE)
 #start_values <- list(shape1 = 1, shape2 = 1)
 #fit <- MASS::fitdistr(as.numeric(long_df$lengthened_commit_count), "negative binomial")
 print(fit)
@@ -70,7 +71,7 @@ mlm <- glmer.nb(lengthened_commit_count ~ before_after*relative_week +
                 (before_after*relative_week|rd_event),
                 control=glmerControl(optimizer="bobyqa",
                                      optCtrl=list(maxfun=2e5)), nAGQ=0,
-                data=long_df)
+                data=final_long_df)
 #mlm <- lmer(lengthened_commit_count ~ before_after*relative_week+
 #            (before_after*relative_week|commit_type) +
 #            (before_after*relative_week|rd_event) ,data=long_df)
@@ -95,7 +96,8 @@ icc(wikimedia_share_lmer)
 
 other_long_df <- final_long_df |>
   filter(commit_type == "other_commit_count")
-other_share_lmer <- lm(commit_share ~ before_after*relative_week,
+other_share_lmer <- lmer(commit_share ~ before_after*relative_week +
+                           (1| rd_event),
                          data=other_long_df)
 summary(other_share_lmer)
 icc(other_share_lmer)
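Note on the hunks above: the script computes an Imbens-Kalyanaraman optimal bandwidth but then narrows the estimation window with a hard-coded window_num <- 4. A minimal sketch of how the two could be tied together, assuming optimal_bandwidth is on the relative-week scale; the ceiling() call and the fallback of 4 are illustrative choices, not the committed code:

# Sketch only: derive the RDD window from the IK bandwidth,
# falling back to the hard-coded 4 weeks when the estimate is NA.
window_num <- if (!is.na(optimal_bandwidth)) ceiling(optimal_bandwidth) else 4
final_long_df <- intermediate_long_df |>
  filter(relative_week >= (- window_num) & relative_week <= (window_num))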
commit_analysis/testing-share-plotting.R | 82 (new file)
@@ -0,0 +1,82 @@
library(tidyverse)
entest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/en-testing_0217_extensions_ve_weekly_commit_count_data.csv"
entest_df <- read.csv(entest_fp, header = TRUE) |> mutate(rd_event = "en-testing")

widetest_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/wide-testing_0217_extensions_ve_weekly_commit_count_data.csv"
widetest_df <- read.csv(widetest_fp, header = TRUE) |> mutate(rd_event = "wide-testing")

event_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0217_extensions_ve_weekly_commit_count_data.csv"
event_df <- read.csv(event_fp, header = TRUE) |> mutate(rd_event = "default")

#input_df <- bind_rows(entest_df, widetest_df, event_df)
#dropping the event (2013-07-01) from the modeling
input_df <- bind_rows(entest_df, widetest_df)

input_df <- input_df |>
  mutate(nonbot_commit_count = commit_count - bot_commit_count)|>
  mutate(other_commit_count = nonbot_commit_count - mediawiki_dev_commit_count - wikia_commit_count - wikimedia_commit_count) |>
  mutate(wikimedia_commit_count = wikimedia_commit_count + mediawiki_dev_commit_count + wikia_commit_count) |>
  dplyr::select(-mediawiki_dev_commit_count) |>
  dplyr::select(-wikia_commit_count)

#get into mlm format
long_df <- input_df |>
  pivot_longer(cols = c(other_commit_count, wikimedia_commit_count),
               names_to = "commit_type",
               values_to = "lengthened_commit_count")

intermediate_long_df <- long_df |>
  mutate(commit_share = lengthened_commit_count / (nonbot_commit_count)) |>
  mutate(log_commits = log1p(lengthened_commit_count))|>
  mutate(scaled_long_commits = lengthened_commit_count / 10)

intermediate_long_df <- intermediate_long_df |>
  drop_na()

window_num <- 4
final_long_df <- intermediate_long_df |>
  filter(relative_week >= (- window_num) & relative_week <= (window_num))

commit_plot <- final_long_df |>
  ggplot(aes(x=relative_week,
             y=lengthened_commit_count,
             color=commit_type,
             linetype = rd_event)) +
  geom_line() +
  geom_point() +
  labs(x = "Relative Week", y = "Nonbot Commits", linetype = "Testing Event", color="Commit Author Affiliation") +
  scale_linetype_discrete(labels = c("enwiki testing (2012-12-12)", "wide testing (2013-04-25)")) +
  scale_color_discrete(labels = c("Unaffiliated", "Organizationally Affiliated")) +
  ggtitle("VisualEditor Nonbot Commit Count Around Opt-In Testing Events (by Affiliation)") +
  theme_bw() +
  theme(legend.position = "top")
commit_plot

total_commit_plot <- final_long_df |>
  filter(commit_type == "other_commit_count")|>
  ggplot(aes(x=relative_week,
             y=nonbot_commit_count,
             linetype = rd_event)) +
  geom_line() +
  geom_point() +
  labs(x = "Relative Week", y = "Nonbot Commit Count", linetype = "Testing Event") +
  scale_linetype_discrete(labels = c("enwiki testing (2012-12-12)", "wide testing (2013-04-25)")) +
  ggtitle("VisualEditor Nonbot Commit Count Around Opt-In Testing Events") +
  theme_bw() +
  theme(legend.position = "top")
total_commit_plot

commit_share_plot <- final_long_df |>
  ggplot(aes(x=relative_week,
             y=commit_share,
             color=commit_type,
             linetype = rd_event)) +
  geom_line() +
  geom_point() +
  labs(x = "Relative Week", y = "Share of Nonbot Commits", linetype = "Testing Event", color="Commit Author Affiliation") +
  scale_linetype_discrete(labels = c("enwiki testing (2012-12-12)", "wide testing (2013-04-25)")) +
  scale_color_discrete(labels = c("Unaffiliated", "Organizationally Affiliated")) +
  ggtitle("VisualEditor Nonbot Commit Share Around Opt-In Testing Events") +
  theme_bw() +
  theme(legend.position = "top")
commit_share_plot
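The three PNGs added under commit_analysis/case1/ line up with the three plot objects built in this script; the save step is not part of the committed file. It was presumably something like the following, where the pairing of file names to plot objects is a guess from the names alone:

# Hypothetical save step, not in the committed script; the
# filename-to-plot mapping below is assumed, not confirmed.
ggsave("commit_analysis/case1/030125_ve_testing_commits_ba_plot.png", commit_plot)
ggsave("commit_analysis/case1/0301-ve-testing-commit-plot.png", total_commit_plot)
ggsave("commit_analysis/case1/030125_ve-testing-share-ba.png", commit_share_plot)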
@@ -1,18 +0,0 @@
-1. SSH tunnel from your workstation using the following command:
-
-ssh -N -L 8787:n3439:56635 mjilg@klone.hyak.uw.edu
-
-and point your web browser to http://localhost:8787
-
-2. log in to RStudio Server using the following credentials:
-
-user: mjilg
-password: gYARjsVrF/GA3VDv3MYl
-
-When done using RStudio Server, terminate the job by:
-
-1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
-2. Issue the following command on the login node:
-
-scancel -f 24451895
-slurmstepd: error: *** JOB 24451895 ON n3439 CANCELLED AT 2025-02-26T10:32:49 ***
text_analysis/case1/030125_rfc_dfm.rds | BIN (new file, binary not shown)
text_analysis/case1/030125_ve_rfc_stm.rds | BIN (new file, binary not shown)
text_analysis/case1/case1_stm.R | 69 (new file)
@@ -0,0 +1,69 @@
library(tidyverse)
library(quanteda)
library(lubridate)
library(quanteda.textmodels)
library(lexicon)
library(stm)

rfc_df <- read.csv("/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0220_ve_rfcs_text.csv")
rfc_df$doc_id = 1:nrow(rfc_df)
#some cleaning around the timestamp of the comment made
rfc_df$posix_timestamp = parse_date_time(gsub("\\(UTC\\)", "", rfc_df$date_created), orders = "HM, dmy", tz = "UTC")
rfc_corp = corpus(rfc_df$comment_text,
                  docvars = rfc_df,
                  docnames = rfc_df$doc_id)

rfc_tokens <- tokens(rfc_corp,
                     what = "word",
                     remove_punct = TRUE,
                     remove_symbols = TRUE,
                     remove_numbers = TRUE,
                     remove_url = TRUE,
                     remove_separators = TRUE,
                     include_docvars = TRUE)

#removing not only english stopwords but some bespoke ones too
additional_stopwords <- c(c("talk", "user", "n", "utc"), rfc_df$Author)
#take out references to other authors
custom_stopwords <- c(stopwords("english"), additional_stopwords)

rfc_dfm <- rfc_tokens|>
  dfm() |>
  dfm_select(pattern = custom_stopwords,
             selection = c("remove"),
             valuetype = c("fixed"))

rfc_dfm_lemmatized = dfm_replace(rfc_dfm,
                                 pattern = lexicon::hash_lemmas$token,
                                 replacement = lexicon::hash_lemmas$lemma)

rfc_feature_counts <- colSums(rfc_dfm_lemmatized)
docvars(rfc_dfm_lemmatized)$doc_id = docnames(rfc_dfm_lemmatized)
#saveRDS(rfc_dfm_lemmatized, file="text_analysis/case1/030125_rfc_dfm.rds")
rfc_dfm_stm = convert(rfc_dfm_lemmatized, to = "stm",
                      docvars = docvars(rfc_dfm_lemmatized))

#run the model
K = 5
seed = 9021000

#model = stm(documents = rfc_dfm_stm$documents,
#            vocab = rfc_dfm_stm$vocab,
#            K = K,
#            seed = seed,
#            data = rfc_df,
#            prevalence=~posix_timestamp,
#            verbose = TRUE)
plot(model)
saveRDS(model, file = "text_analysis/case1/030125_ve_rfc_stm.rds")

labelTopics(model, topics = c(5, 4, 2, 3, 1), n = 10)


results = data.frame(text = corpus_subset(rfc_corp, docnames(rfc_corp) %in% rfc_dfm_stm$meta$doc_id), date = rfc_dfm_stm$meta$posix_timestamp, model$theta)

cat(results[order(-results$X1),"text"][1])
cat(results[order(-results$X1),"text"][2])
cat(results[order(-results$X1),"text"][3])
cat(results[order(-results$X1),"text"][4])
cat(results[order(-results$X1),"text"][5])
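As committed, the stm() call is commented out, so plot(model), saveRDS(model, ...), labelTopics(model, ...), and the results data frame all depend on a model object left over from an earlier session. For the script to run top to bottom, the fit would need to be uncommented:

# The committed call, uncommented; K = 5 and seed = 9021000 as above.
model = stm(documents = rfc_dfm_stm$documents,
            vocab = rfc_dfm_stm$vocab,
            K = K,
            seed = seed,
            data = rfc_df,
            prevalence=~posix_timestamp,
            verbose = TRUE)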
text_analysis/phab_topic_trends.R | 78 (new file)
@@ -0,0 +1,78 @@
library(tidyverse)
library(quanteda)
library(lubridate)
library(quanteda.textmodels)
library(lexicon)
library(stm)

phab_df <- read.csv("/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv")
phab_df$doc_id = 1:nrow(phab_df)

phab_df$utc_date <- as.POSIXct(phab_df$date_created, origin = "1970-01-01", tz = "UTC")
phab_df <- phab_df|>
  filter(date_created > 1351728001 & date_created < 1383263999)

phab_corp = corpus(phab_df$comment_text,
                   docvars = phab_df,
                   docnames = phab_df$doc_id)

phab_tokens <- tokens(phab_corp,
                      what = "word",
                      remove_punct = TRUE,
                      remove_symbols = TRUE,
                      remove_numbers = FALSE,
                      remove_url = TRUE,
                      remove_separators = TRUE,
                      include_docvars = TRUE)

#removing not only english stopwords but some bespoke ones too
additional_stopwords <- c("and")
#take out references to other authors
custom_stopwords <- c(stopwords("english"), additional_stopwords)

phab_dfm <- phab_tokens|>
  dfm() |>
  dfm_select(pattern = custom_stopwords,
             selection = c("remove"),
             valuetype = c("fixed"))

phab_dfm_lemmatized = dfm_replace(phab_dfm,
                                  pattern = lexicon::hash_lemmas$token,
                                  replacement = lexicon::hash_lemmas$lemma)

phab_feature_counts <- colSums(phab_dfm_lemmatized)
docvars(phab_dfm_lemmatized)$doc_id = docnames(phab_dfm_lemmatized)

#read in the RDS
rfc_dfm_lemmatized <- readRDS("text_analysis/case1/030125_rfc_dfm.rds")
new_phab_dfm_lemmatized <- dfm_match(phab_dfm_lemmatized, features = colnames(rfc_dfm_lemmatized))

phab_dfm_stm = convert(new_phab_dfm_lemmatized, to = "stm",
                       docvars = docvars(phab_dfm_lemmatized))

#loading in the STM that was fitted over the RFC data
stm_model <- readRDS("text_analysis/case1/030125_ve_rfc_stm.rds")
plot(stm_model)
#fit it over the new data
new_topic_scores <- fitNewDocuments(stm_model, phab_dfm_stm$documents)
#gives us 32058 comment scores to work with

results = data.frame(text = corpus_subset(phab_corp, docnames(phab_corp) %in% phab_dfm_stm$meta$doc_id),
                     date = phab_dfm_stm$meta$utc_date,
                     affil=phab_dfm_stm$meta$WMFaffil,
                     new_topic_scores$theta)
# the issue is, of course, that these topics are not fit to the documents
# but topic models must describe documents in terms of 1
# so it will ill-fit to the phabricator comments

grouped_results <- results |>
  mutate(week = floor_date(as.POSIXct(date), "week")) |>
  group_by(week, affil) |>
  summarise(across(starts_with("X"), median, na.rm = TRUE))

plot <- grouped_results |>
  ggplot(aes(x=week,
             y=X5,
             color=affil)) +
  geom_line()
plot
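The final plot tracks only topic X5 (median weekly theta, split by WMF affiliation). A small sketch for inspecting all five topics at once, reshaping the grouped medians and faceting by topic; this is illustrative, not part of the commit:

# Sketch only: plot every topic's weekly median theta, faceted by topic.
all_topics_plot <- grouped_results |>
  pivot_longer(cols = starts_with("X"),
               names_to = "topic",
               values_to = "median_theta") |>
  ggplot(aes(x = week, y = median_theta, color = affil)) +
  geom_line() +
  facet_wrap(~topic)
all_topics_plot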