# mw-lifecycle-analysis/analysis_data/sampling_strat_check.R

library(tidyverse)
library(stringr)
library(tidyr)
library(dplyr)
library(purrr)

# Read the unified CSV export and split it by comment type:
#   * unified_task_df    - rows whose comment_type is "task_description"
#   * unified_comment_df - every other row
# Rows with a missing comment_type end up in neither frame, because
# filter() drops rows where the condition evaluates to NA.
pre_unified_csv <- "~/analysis_data/100625_unified_w_affil.csv"
pre_unified_df <- read.csv(file = pre_unified_csv, header = TRUE)

unified_task_df <- filter(pre_unified_df, comment_type == "task_description")
unified_comment_df <- filter(pre_unified_df, comment_type != "task_description")

library(scales)

# Tabulate task descriptions by source, phase and WMF-authorship flag, and
# report each cell as a share of all task rows (rounded to 3 decimals).
# The result is only printed, not stored.
mutate(
  count(unified_task_df, source, phase, isAuthorWMF),
  pct = round(n / sum(n), digits = 3)
)

# Median number of (non-task-description) comment rows per author.
median_n <- local({
  rows_per_author <- count(unified_comment_df, AuthorPHID)
  median(rows_per_author[["n"]], na.rm = TRUE)
})
median_n

# Median number of rows per task in the full unified data. Because this uses
# pre_unified_df, each task's count includes its task-description rows as well
# as its comments.
median_comments <- local({
  rows_per_task <- count(pre_unified_df, TaskPHID)
  median(rows_per_task[["n"]], na.rm = TRUE)
})
median_comments

# Load the human-label CSV, then build one row per labelled item:
#   1. missing labels become the literal string "NA" and whitespace is
#      squished;
#   2. all labels sharing the same identifying columns are collected into a
#      list-column `human_labels`;
#   3. the unified data is joined back on (id, TaskPHID, AuthorPHID).
# NOTE(review): comment_type, source and phase exist on both sides of the
# join but are not join keys, so they come out suffixed (.x / .y) in the
# result. Other grouped columns may be affected too - confirm whether the
# duplicates are wanted.
human_csv <- "~/analysis_data/102025_human_labels.csv"
human_df <- read.csv(file = human_csv, header = TRUE)

cleaned_human_df <- human_df |>
  mutate(human_label = str_squish(replace_na(human_label, "NA"))) |>
  group_by(
    id, TaskPHID, AuthorPHID, comment_text, task_title,
    comment_type, priority, source, phase
  ) |>
  summarise(human_labels = list(human_label), .groups = "drop") |>
  left_join(pre_unified_df, by = c("id", "TaskPHID", "AuthorPHID"))


# Median number of human-labelled comments per author.
#
# `human_comment_df` was referenced here without ever being defined, which
# makes the script stop with "object 'human_comment_df' not found". It is
# built here to mirror `unified_comment_df`: task descriptions are dropped and
# one row is kept per labelled comment (human_df may hold several label rows
# for the same comment). It is derived from `human_df` rather than
# `cleaned_human_df` because the join above leaves the latter with suffixed
# `comment_type.x` / `comment_type.y` columns instead of `comment_type`.
# NOTE(review): assumes (id, TaskPHID, AuthorPHID) identifies a comment -
# confirm against the labelling export.
human_comment_df <- human_df |>
  filter(comment_type != "task_description") |>
  distinct(id, TaskPHID, AuthorPHID)

# NOTE: this deliberately keeps the original variable name, so it overwrites
# the `median_comments` value computed earlier in the script.
median_comments <- human_comment_df |>
  count(AuthorPHID) |>
  pull(n) |>
  median(na.rm = TRUE)
median_comments