1
0

Updating with new analysis and new information for the write-up

This commit is contained in:
Matthew Gaughan 2025-12-01 13:44:49 -08:00
parent a0545ad8de
commit 86ee932c67
5 changed files with 67 additions and 13 deletions

View File

@ -0,0 +1,8 @@
library(tidyverse)
# Read the batched/categorized CSV into a data frame, then keep only the rows
# whose `source` column equals "c3".
main_csv <- "~/analysis_data/102125_constituent_dfs/110525_olmo_batched_categorized.csv"
main_df <- read.csv(file = main_csv, header = TRUE)
c3_df <- dplyr::filter(main_df, source == "c3")

View File

@ -10,10 +10,10 @@ library(purrr)
# get the categorical variables encoded as integers, then wrapped as factors
# figure out power at 200, 400, 500, 750, and 1000
# joining sentences with their ... (this comment is cut off in the source;
# presumably "labels" — TODO confirm)
# NOTE(review): labeled_csv is assigned twice in a row, so only the second path
# is the one read below. The first assignment looks like the pre-change side of
# this diff hunk shown without its +/- marker — confirm before deleting it.
labeled_csv <-"~/p2/quest/092325_biberplus_complete_labels.csv"
labeled_csv <-"~/analysis_data/100625_constituent_dfs/092325_biberplus_complete_labels.csv"
# Load the labeled rows; header = TRUE takes column names from the first line.
labeled_df <- read.csv(labeled_csv, header = TRUE)
# NOTE(review): same pattern — main_csv is overwritten immediately, so only the
# 100625_constituent_dfs path is actually read.
main_csv <- "~/analysis_data/constituent_dfs/071425_master_discussion_data.csv"
main_csv <- "~/analysis_data/100625_constituent_dfs/071425_master_discussion_data.csv"
main_df <- read.csv(main_csv, header = TRUE)
# Combinations of (date_created, id, comment_text, AuthorPHID, TaskPHID) that
# occur more than once in labeled_df; `n` is the row count per combination.
dupes_labeled <- labeled_df %>% count(date_created, id, comment_text, AuthorPHID, TaskPHID) %>% filter(n > 1)

View File

Can't render this file because it is too large.

View File

@ -1,9 +1,55 @@
library(tidyverse)
#library(dsl)
library(dplyr)
# NOTE(review): dsl_csv is assigned twice back to back; only the second path
# (111725_DSL_frame.csv) is read. The first line appears to be the pre-change
# side of this diff hunk with its +/- marker stripped — confirm.
dsl_csv <-"~/dsl/110925_DSL_df_adac.csv"
dsl_csv <-"~/dsl/111725_DSL_frame.csv"
# Load the frame used by the summaries and plots below; header = TRUE takes
# column names from the first line of the file.
dsl_df <- read.csv(dsl_csv, header = TRUE)
# One row per (week_index, source, isAuthorWMF) combination found in dsl_df:
#   tasks_made                         rows with a non-missing resolution_outcome
#   count_resolution_outcome           sum of dsl_score
#   author_closer_sum                  rows where author_closer is TRUE
#   median_olmo_EP_prop_adac           median of olmo_EP_prop_adac
#   median_olmo_TSOL_prop_adac         median of olmo_TSOL_prop_adac
#   median_comments_before_resolution  median of n_comments_before
# NOTE(review): none of the sums/medians pass na.rm, so a single NA in a column
# makes that cell NA — confirm that is the intended behaviour.
weekly_summary <- dsl_df |>
  group_by(week_index, source, isAuthorWMF) |>
  summarise(
    tasks_made = sum(!is.na(resolution_outcome)),
    count_resolution_outcome = sum(dsl_score),
    author_closer_sum = sum(author_closer == TRUE),
    median_olmo_EP_prop_adac = median(olmo_EP_prop_adac),
    median_olmo_TSOL_prop_adac = median(olmo_TSOL_prop_adac),
    median_comments_before_resolution = median(n_comments_before),
    # Return an ungrouped tibble: without this, the result silently stays
    # grouped by week_index and source and summarise() prints a grouping
    # message on every run.
    .groups = "drop"
  )
# Weekly task counts as dodged bars (filled by isAuthorWMF), one facet row per
# source, with vertical reference lines at fixed week offsets in each panel and
# a dashed line at week 0 in every panel.
#
# The per-source reference lines live in one small table and are drawn by a
# single layer. Previously each line was its own geom_vline() fed a filtered
# copy of weekly_summary with a constant xintercept, which draws the same line
# once per row of that copy.
# NOTE(review): the offsets and line types are copied verbatim from the
# original layers; what each offset marks is not stated in this file.
week_markers <- data.frame(
  source = c("c1", "c1", "c1", "c2", "c2", "c3", "c3"),
  week = c(-29, -9, -4, -99, -4, -97, -3),
  style = c("dotted", "dotted", "3313", "dotted", "3313", "dotted", "3313")
)
# Only keep markers for sources that are present in the summary, so no empty
# facet panel is created for a source without data.
week_markers <- week_markers[week_markers$source %in% weekly_summary$source, ]

ggplot(
  weekly_summary,
  aes(
    x = week_index,
    y = tasks_made,
    fill = isAuthorWMF
  )
) +
  facet_grid(source ~ ., scales = "free_y") +
  geom_col(position = position_dodge(width = 0.9), width = 0.8) +
  geom_vline(
    data = week_markers,
    aes(xintercept = week, linetype = style),
    color = "black", linewidth = 0.5
  ) +
  # The `style` column already holds literal line types, so use them as-is.
  scale_linetype_identity() +
  geom_vline(
    xintercept = 0,
    linetype = "dashed", color = "black", linewidth = 0.5
  ) +
  theme_minimal() +
  scale_fill_viridis_d()
outcome_summary <- dsl_df |>
group_by(source, isAuthorWMF)|>
@ -23,13 +69,15 @@ library(ggdist)
# ggplot built from dsl_df, finished with a viridis colour scale and
# theme_minimal().
# NOTE(review): as shown, this call does not parse — inside aes() the line
# `fill=source` is followed directly by `x=week_index` with no comma, and x and
# color are each given twice. The hunk appears to show pre-change and
# post-change lines together with their +/- markers stripped; confirm which
# side is current before running.
ggplot(
dsl_df,
aes(
# presumably the pre-change mapping — TODO confirm
x=n_comments_before,
color=source,
fill=source
# presumably the post-change mapping — TODO confirm
x=week_index,
y=olmo_EP_prop_adac,
color=isAuthorWMF
)
) +
# presumably the pre-change layers (facet by isAuthorWMF, ggdist half-eye)
facet_grid(~isAuthorWMF) +
stat_halfeye() +
# presumably the post-change layers (facet rows by source, points + smoother)
facet_grid(source ~ .) +
geom_point() +
geom_smooth() +
scale_color_viridis_d() +
theme_minimal()
dsl_df <- dsl_df |>

View File

@ -1,6 +1,6 @@
library(tidyverse)
# NOTE(review): dsl_csv is assigned twice back to back; only the second path
# (111725_DSL_frame.csv) is read. The first line appears to be the pre-change
# side of this diff hunk with its +/- marker stripped — confirm.
dsl_csv <-"~/dsl/102725_DSL_df_adac.csv"
dsl_csv <-"~/dsl/111725_DSL_frame.csv"
# Load the frame used by the survival model; header = TRUE takes column names
# from the first line of the file.
dsl_df <- read.csv(dsl_csv, header = TRUE)
#https://stats.oarc.ucla.edu/wp-content/uploads/2025/02/survival_r_full.html
@ -8,10 +8,8 @@ library(survival)
library(broom)
# 168 = 24 * 7, so this presumably converts TTR from hours to weeks — confirm
# the unit of TTR.
dsl_df$ttr_weeks <- dsl_df$TTR / 168
# Surv() is called with a time only; no event/status argument is passed.
# NOTE(review): survival::Surv then treats every row as an observed event (no
# censoring) — confirm that is intended.
trial.survival <- Surv(dsl_df$ttr_weeks)
# NOTE(review): trial.model is fitted twice in a row and the second fit
# overwrites the first. The first formula looks like the pre-change side of
# this diff hunk shown without its +/- marker; if its columns are missing from
# the current CSV, this first call will error before the second is reached.
trial.model <- coxph(trial.survival ~ isAuthorWMF +
median_PC3_adac + week_index +
median_gerrit_loc_delta + median_gerrit_reviewers + source +
phase + author_closer, data=dsl_df)
# Cox model on n_comments_before and week_index plus the interaction of
# isAuthorWMF and source, both coerced to factors (`*` also includes their
# main effects).
trial.model <- coxph(trial.survival ~ n_comments_before
+ week_index + as.factor(isAuthorWMF) * as.factor(source), data=dsl_df)
# Print the fitted Cox model, then collect its coefficient table with
# broom::tidy(): exponentiate = TRUE reports exp(coef) instead of the raw
# coefficients, and conf.int = TRUE adds confidence-interval columns.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
summary(trial.model)
trial.tab <- tidy(trial.model, exponentiate = TRUE, conf.int = TRUE)