updating with new analysis/new information for write up
This commit is contained in:
parent
a0545ad8de
commit
86ee932c67
8
analysis_data/120125_data_verification.R
Normal file
8
analysis_data/120125_data_verification.R
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
library(tidyverse)
|
||||||
|
|
||||||
|
main_csv <- "~/analysis_data/102125_constituent_dfs/110525_olmo_batched_categorized.csv"
|
||||||
|
main_df <- read.csv(main_csv, header = TRUE)
|
||||||
|
|
||||||
|
c3_df <- main_df |>
|
||||||
|
filter(source=="c3")
|
||||||
|
|
||||||
@ -10,10 +10,10 @@ library(purrr)
|
|||||||
# get the categorical variables encoded as integers, then wrapped as factors
|
# get the categorical variables encoded as integers, then wrapped as factors
|
||||||
# figure out power at 200, 400, 500, 750, and 1000
|
# figure out power at 200, 400, 500, 750, and 1000
|
||||||
#joining sentences with their
|
#joining sentences with their
|
||||||
labeled_csv <-"~/p2/quest/092325_biberplus_complete_labels.csv"
|
labeled_csv <-"~/analysis_data/100625_constituent_dfs/092325_biberplus_complete_labels.csv"
|
||||||
labeled_df <- read.csv(labeled_csv, header = TRUE)
|
labeled_df <- read.csv(labeled_csv, header = TRUE)
|
||||||
|
|
||||||
main_csv <- "~/analysis_data/constituent_dfs/071425_master_discussion_data.csv"
|
main_csv <- "~/analysis_data/100625_constituent_dfs/071425_master_discussion_data.csv"
|
||||||
main_df <- read.csv(main_csv, header = TRUE)
|
main_df <- read.csv(main_csv, header = TRUE)
|
||||||
|
|
||||||
dupes_labeled <- labeled_df %>% count(date_created, id, comment_text, AuthorPHID, TaskPHID) %>% filter(n > 1)
|
dupes_labeled <- labeled_df %>% count(date_created, id, comment_text, AuthorPHID, TaskPHID) %>% filter(n > 1)
|
||||||
|
|||||||
|
Can't render this file because it is too large.
|
@ -1,9 +1,55 @@
|
|||||||
library(tidyverse)
|
library(tidyverse)
|
||||||
#library(dsl)
|
#library(dsl)
|
||||||
library(dplyr)
|
library(dplyr)
|
||||||
dsl_csv <-"~/dsl/110925_DSL_df_adac.csv"
|
dsl_csv <-"~/dsl/111725_DSL_frame.csv"
|
||||||
dsl_df <- read.csv(dsl_csv, header = TRUE)
|
dsl_df <- read.csv(dsl_csv, header = TRUE)
|
||||||
|
|
||||||
|
weekly_summary <- dsl_df |>
|
||||||
|
group_by(week_index, source, isAuthorWMF)|>
|
||||||
|
summarise(
|
||||||
|
tasks_made = sum(!is.na(resolution_outcome)),
|
||||||
|
count_resolution_outcome = sum(dsl_score),
|
||||||
|
author_closer_sum = sum(author_closer == TRUE),
|
||||||
|
median_olmo_EP_prop_adac = median(olmo_EP_prop_adac),
|
||||||
|
median_olmo_TSOL_prop_adac = median(olmo_TSOL_prop_adac),
|
||||||
|
median_comments_before_resolution = median(n_comments_before)
|
||||||
|
)
|
||||||
|
|
||||||
|
ggplot(
|
||||||
|
weekly_summary,
|
||||||
|
aes(
|
||||||
|
x=week_index,
|
||||||
|
y=tasks_made,
|
||||||
|
fill=isAuthorWMF
|
||||||
|
)
|
||||||
|
) +
|
||||||
|
facet_grid(source ~ ., scales = "free_y") +
|
||||||
|
geom_col(position = position_dodge(width = 0.9), width = 0.8) +
|
||||||
|
geom_vline(data = weekly_summary |> filter(source == "c1"),
|
||||||
|
aes(xintercept = -29),
|
||||||
|
linetype = "dotted", color = "black", linewidth = 0.5) +
|
||||||
|
geom_vline(data = weekly_summary |> filter(source == "c1"),
|
||||||
|
aes(xintercept = -9),
|
||||||
|
linetype = "dotted", color = "black", linewidth = 0.5) +
|
||||||
|
geom_vline(data = weekly_summary |> filter(source == "c1"),
|
||||||
|
aes(xintercept = -4),
|
||||||
|
linetype = "3313", color = "black", linewidth = 0.5) +
|
||||||
|
geom_vline(data = weekly_summary |> filter(source == "c2"),
|
||||||
|
aes(xintercept = -99),
|
||||||
|
linetype = "dotted", color = "black", linewidth = 0.5) +
|
||||||
|
geom_vline(data = weekly_summary |> filter(source == "c2"),
|
||||||
|
aes(xintercept = -4),
|
||||||
|
linetype = "3313", color = "black", linewidth = 0.5) +
|
||||||
|
geom_vline(data = weekly_summary |> filter(source == "c3"),
|
||||||
|
aes(xintercept = -97),
|
||||||
|
linetype = "dotted", color = "black", linewidth = 0.5) +
|
||||||
|
geom_vline(data = weekly_summary |> filter(source == "c3"),
|
||||||
|
aes(xintercept = -3),
|
||||||
|
linetype = "3313", color = "black", linewidth = 0.5) +
|
||||||
|
geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = 0.5) +
|
||||||
|
theme_minimal() +
|
||||||
|
scale_fill_viridis_d()
|
||||||
|
|
||||||
|
|
||||||
outcome_summary <- dsl_df |>
|
outcome_summary <- dsl_df |>
|
||||||
group_by(source, isAuthorWMF)|>
|
group_by(source, isAuthorWMF)|>
|
||||||
@ -23,13 +69,15 @@ library(ggdist)
|
|||||||
ggplot(
|
ggplot(
|
||||||
dsl_df,
|
dsl_df,
|
||||||
aes(
|
aes(
|
||||||
x=n_comments_before,
|
x=week_index,
|
||||||
color=source,
|
y=olmo_EP_prop_adac,
|
||||||
fill=source
|
color=isAuthorWMF
|
||||||
)
|
)
|
||||||
) +
|
) +
|
||||||
facet_grid(~isAuthorWMF) +
|
facet_grid(source ~ .) +
|
||||||
stat_halfeye() +
|
geom_point() +
|
||||||
|
geom_smooth() +
|
||||||
|
scale_color_viridis_d() +
|
||||||
theme_minimal()
|
theme_minimal()
|
||||||
|
|
||||||
dsl_df <- dsl_df |>
|
dsl_df <- dsl_df |>
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
library(tidyverse)
|
library(tidyverse)
|
||||||
|
|
||||||
dsl_csv <-"~/dsl/102725_DSL_df_adac.csv"
|
dsl_csv <-"~/dsl/111725_DSL_frame.csv"
|
||||||
dsl_df <- read.csv(dsl_csv, header = TRUE)
|
dsl_df <- read.csv(dsl_csv, header = TRUE)
|
||||||
#https://stats.oarc.ucla.edu/wp-content/uploads/2025/02/survival_r_full.html
|
#https://stats.oarc.ucla.edu/wp-content/uploads/2025/02/survival_r_full.html
|
||||||
|
|
||||||
@ -8,10 +8,8 @@ library(survival)
|
|||||||
library(broom)
|
library(broom)
|
||||||
dsl_df$ttr_weeks <- dsl_df$TTR / 168
|
dsl_df$ttr_weeks <- dsl_df$TTR / 168
|
||||||
trial.survival <- Surv(dsl_df$ttr_weeks)
|
trial.survival <- Surv(dsl_df$ttr_weeks)
|
||||||
trial.model <- coxph(trial.survival ~ isAuthorWMF +
|
trial.model <- coxph(trial.survival ~ n_comments_before
|
||||||
median_PC3_adac + week_index +
|
+ week_index + as.factor(isAuthorWMF) * as.factor(source), data=dsl_df)
|
||||||
median_gerrit_loc_delta + median_gerrit_reviewers + source +
|
|
||||||
phase + author_closer, data=dsl_df)
|
|
||||||
summary(trial.model)
|
summary(trial.model)
|
||||||
trial.tab <- tidy(trial.model, exponentiate=T, conf.int=T)
|
trial.tab <- tidy(trial.model, exponentiate=T, conf.int=T)
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user