# 1
# 0
# mw-lifecycle-analysis/analysis_data/sampling_strat_check.R
# 2025-10-21 15:19:13 -07:00
#
# 50 lines
# 1.3 KiB
# R

library(tidyverse)
library(stringr)
library(tidyr)
library(dplyr)
library(purrr)
# ---- Unified data ----
# Read the unified CSV once, then split its rows on `comment_type`.
pre_unified_csv <- "~/analysis_data/100625_unified_w_affil.csv"
pre_unified_df <- read.csv(file = pre_unified_csv, header = TRUE)

# Rows whose comment_type is exactly "task_description".
unified_task_df <- filter(
  pre_unified_df,
  comment_type == "task_description"
)

# Every other row. filter() keeps only rows where the comparison is TRUE, so
# rows with a missing comment_type end up in neither subset.
unified_comment_df <- filter(
  pre_unified_df,
  comment_type != "task_description"
)
library(scales)
# Task-description rows tallied by source x phase x isAuthorWMF. `pct` is each
# cell's share of all tallied rows, rounded to three decimals. The result is
# not assigned; it is only shown when the script is run with auto-printing.
mutate(
  count(unified_task_df, source, phase, isAuthorWMF),
  pct = round(n / sum(n), digits = 3)
)
# Median number of rows per AuthorPHID among the non-task-description rows.
median_n <- median(
  count(unified_comment_df, AuthorPHID)[["n"]],
  na.rm = TRUE
)
median_n
# Median number of rows per TaskPHID in the full unified table. The count is
# taken on `pre_unified_df`, i.e. before the task_description / comment split,
# so both kinds of row contribute to each task's total.
median_comments <- median(
  count(pre_unified_df, TaskPHID)[["n"]],
  na.rm = TRUE
)
median_comments
# ---- Human-labelled sample ----
# Load the hand-labelled rows, collapse them to one row per labelled item,
# attach the unified columns, and compute the median number of labelled
# comments per author.
human_csv <- "~/analysis_data/102025_human_labels.csv"
human_df <- read.csv(human_csv, header = TRUE)

# Collapse to one row per combination of the identifying columns, collecting
# every `human_label` for that combination into a list-column. Missing labels
# are first replaced by the literal string "NA" so they stay visible in the
# list, then whitespace is normalised with str_squish().
cleaned_human_df <- human_df |>
  mutate(
    human_label = replace_na(human_label, "NA"),
    human_label = str_squish(human_label)
  ) |>
  group_by(
    id, TaskPHID, AuthorPHID, comment_text, task_title,
    comment_type, priority, source, phase
  ) |>
  summarise(human_labels = list(human_label), .groups = "drop")

# Fix: the script previously piped `human_comment_df` into count() without
# ever defining it, so the final statistic failed with "object not found".
# It is defined here the same way `unified_comment_df` is defined for the
# unified data: every row that is not a task description.
# NOTE(review): this reading of "human_comment_df" is inferred from the name
# and from the unified_comment_df definition -- confirm it is the intended
# subset.
# It is derived before the join below because, after the join, `comment_type`
# exists on both sides and is only available under a suffixed name.
human_comment_df <- cleaned_human_df |>
  filter(comment_type != "task_description")

# Attach the unified columns. Columns present on both sides but not listed in
# `by` (comment_type, source and phase at least) come back with dplyr's
# default ".x" / ".y" suffixes.
cleaned_human_df <- cleaned_human_df |>
  left_join(pre_unified_df, by = c("id", "TaskPHID", "AuthorPHID"))

# Median number of labelled comments per author in the human-labelled sample.
# NOTE: this reuses the name `median_comments`, replacing the per-task value
# computed earlier in the script.
median_comments <- human_comment_df |>
  count(AuthorPHID) |>
  pull(n) |>
  median(na.rm = TRUE)
median_comments