1
0
mw-lifecycle-analysis/analysis_data/scratch.R
2025-12-08 20:21:58 -08:00

87 lines
2.5 KiB
R

library(tidyverse)
# Paths to the two input CSV files.
main_csv <- "~/analysis_data/120725_unified.csv"
dsl_csv <- "~/dsl/120725_DSL_frame.csv"

# Load both files with base read.csv(); the first row of each is the header.
main_df <- read.csv(file = main_csv, header = TRUE)
dsl_df <- read.csv(file = dsl_csv, header = TRUE)
# Mean and standard deviation of TTR (converted to weeks) for rows of dsl_df
# whose priority is "Needs Triage" and whose week_index is -4 or later, broken
# down by source and isAuthorWMF. The result is an ungrouped summary table.
# NOTE(review): this window has no upper bound, whereas the period windows
# used elsewhere in this script stop at week_index 4 -- confirm it is intended.
needs_triage <- dsl_df |>
  filter(week_index >= -4, priority == "Needs Triage") |>
  # 168 = 24 * 7; assumes TTR is recorded in hours -- TODO confirm.
  mutate(ttr_weeks = TTR / 168) |>
  group_by(source, isAuthorWMF) |>
  summarise(
    # na.rm = TRUE so a single missing TTR does not turn the whole group's
    # statistic into NA (matches how `changes` computes the same statistics).
    mean_ttr_weeks = mean(ttr_weeks, na.rm = TRUE),
    sd_ttr_weeks = sd(ttr_weeks, na.rm = TRUE),
    .groups = "drop"
  )
# Task count plus mean / standard deviation of TTR (converted to weeks) for the
# three listed priorities, split by source, period and priority. Rows whose
# week_index falls outside both windows are discarded. The result is an
# ungrouped summary table.
# NOTE(review): each window covers nine integer week_index values although the
# labels say "8 weeks", and the "after" window starts at week_index -4 --
# confirm the boundaries and the labels agree with the intended design.
changes <- dsl_df |>
  filter(priority %in% c("Needs Triage", "Unbreak Now!", "High")) |>
  mutate(
    period = case_when(
      week_index >= -4 & week_index <= 4 ~ "8 weeks after announcement",
      week_index >= -13 & week_index <= -5 ~
        "8 weeks before deployment announcement",
      # Typed NA: a bare logical NA alongside character outputs is rejected by
      # dplyr releases before 1.1.0.
      TRUE ~ NA_character_
    ),
    # 168 = 24 * 7; assumes TTR is recorded in hours -- TODO confirm.
    ttr_weeks = TTR / 168
  ) |>
  filter(!is.na(period)) |>
  group_by(source, period, priority) |>
  summarise(
    count = n(),
    mean_ttr_weeks = mean(ttr_weeks, na.rm = TRUE),
    sd_ttr_weeks = sd(ttr_weeks, na.rm = TRUE),
    .groups = "drop"
  )
# New contributors ----
# One row per (source, AuthorPHID): the number of "task_description" rows that
# author has in main_df and the smallest week_index among them.
first_task <- main_df |>
  filter(comment_type == "task_description") |>
  group_by(source, AuthorPHID) |>
  summarise(
    task_count = n(),
    first_task_week = min(week_index),
    # Return a plain, ungrouped table so later steps do not silently inherit
    # the leftover `source` grouping.
    .groups = "drop"
  )
# Attach each author's per-source first-task week to every "task_description"
# row, then flag the rows whose week_index equals that week. The flag is
# week-level: every task an author files in their first week is marked, not
# only the earliest one.
tasks_flagged <- left_join(
  filter(main_df, comment_type == "task_description"),
  first_task,
  by = c("source", "AuthorPHID")
) |>
  mutate(is_first_time_author = week_index == first_task_week)
# Per (week_index, source) tallies: total task rows, how many of them are
# flagged as first-time-author tasks, and the flagged share of the total.
summary_df <- tasks_flagged |>
  group_by(week_index, source) |>
  summarise(
    total_tasks = n(),
    first_time_tasks = sum(is_first_time_author),
    proportion_first_time = first_time_tasks / total_tasks
  ) |>
  ungroup()
# Total first-time-author tasks per source within two week_index windows:
# "prior" covers -13 through -5 and "recent" covers -4 through 4. Weeks outside
# both windows are left out of the totals.
period_counts <- summary_df |>
  mutate(
    period = case_when(
      week_index >= -13 & week_index <= -5 ~ "prior",
      week_index >= -4 & week_index <= 4 ~ "recent",
      TRUE ~ NA_character_
    )
  ) |>
  filter(!is.na(period)) |>
  group_by(source, period) |>
  summarise(
    period_first_time_tasks = sum(first_time_tasks),
    .groups = "drop"
  )
# Column chart of first-time-author task counts against week_index, drawn as
# one facet row per source with independent y scales. The source codes c1-c3
# are replaced by display names in the facet strips.
ggplot(
  data = summary_df,
  mapping = aes(x = week_index, y = first_time_tasks)
) +
  geom_col() +
  facet_grid(
    rows = vars(source),
    scales = "free_y",
    labeller = labeller(
      source = c(
        "c1" = "VisualEditor",
        "c2" = "HTTPS-login",
        "c3" = "HTTP-deprecation"
      )
    )
  ) +
  labs(
    title = "Task count by Author's first task date",
    x = "Date of first task",
    y = "Number of tasks created"
  ) +
  theme_minimal()