# mw-lifecycle-analysis/dsl/dsl_aggregation.R

library(tidyverse)

# Load the unified dataset that every aggregation below reads from.
unified_csv <- "~/analysis_data/102725_unified.csv"
unified_df <- read.csv(file = unified_csv, header = TRUE)

# 1. aggregate to the task level
#   1a. create human info proportions (ADAC/general)
#   1b. create OLMO info proportions (ADAC/general)
#   1c.
# Labels accepted as valid categories; anything else is treated as invalid
# wherever this vector is consulted.
valid_categories <- c(
  "EXPECTED BEHAVIOR",
  "MOTIVATION",
  "OBSERVED BUG BEHAVIOR",
  "BUG REPRODUCTION",
  "INVESTIGATION AND EXPLORATION",
  "SOLUTION DISCUSSION",
  "CONTRIBUTION AND COMMITMENT",
  "TASK PROGRESS",
  "TESTING",
  "FUTURE PLAN",
  "POTENTIAL NEW ISSUES AND REQUESTS",
  "SOLUTION USAGE",
  "WORKAROUNDS",
  "ISSUE CONTENT MANAGEMENT",
  "ACTION ON ISSUE",
  "SOCIAL CONVERSATION"
)

# ---- Label-proportion helpers ----

# Label sets behind the four proportion measures computed per task.
label_groups <- list(
  BE = c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"),
  SOL = c("SOLUTION DISCUSSION", "SOLUTION USAGE"),
  VR = c(
    "OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
    "SOLUTION DISCUSSION", "SOLUTION USAGE",
    "INVESTIGATION AND EXPLORATION"
  ),
  BI = c("BUG REPRODUCTION", "INVESTIGATION AND EXPLORATION")
)

# Proportion of `labels` that fall in `categories`, using only the elements
# where `keep` is TRUE.
#
# labels     character vector of tags (one element per tag).
# categories character vector of labels that count towards the proportion.
# keep       logical vector (or scalar) selecting the tags to consider.
#
# Returns a single double, or NA_real_ when no tag is selected.
# Missing labels and missing `keep` values are excluded rather than propagated:
# indexing with an NA logical would insert NA elements that `%in%` then counts
# as non-matches, silently inflating the denominator.
label_prop <- function(labels, categories, keep = TRUE) {
  use <- !is.na(labels) & !is.na(keep) & keep
  tags <- labels[use]
  if (length(tags) == 0L) {
    return(NA_real_)
  }
  mean(tags %in% categories)
}

# Collapse a one-row-per-tag data frame to one row per TaskPHID.
#
# df     data frame with columns TaskPHID, ADAC and the label column.
# labels unquoted name of the label column in `df`.
# prefix string used to name the outputs, e.g. "human" -> human_BE_prop.
#
# Returns TaskPHID plus twelve columns: <prefix>_{BE,SOL,VR,BI}_prop over all
# tags, then the same four with suffix `_adac` (rows where ADAC == 1) and
# `_no_adac` (rows where ADAC == 0).
summarise_label_props <- function(df, labels, prefix) {
  df |>
    group_by(TaskPHID) |>
    summarise(
      # All tags of the task
      "{prefix}_BE_prop" := label_prop({{ labels }}, label_groups$BE),
      "{prefix}_SOL_prop" := label_prop({{ labels }}, label_groups$SOL),
      "{prefix}_VR_prop" := label_prop({{ labels }}, label_groups$VR),
      "{prefix}_BI_prop" := label_prop({{ labels }}, label_groups$BI),
      # Tags on rows with ADAC == 1
      "{prefix}_BE_prop_adac" :=
        label_prop({{ labels }}, label_groups$BE, ADAC == 1),
      "{prefix}_SOL_prop_adac" :=
        label_prop({{ labels }}, label_groups$SOL, ADAC == 1),
      "{prefix}_VR_prop_adac" :=
        label_prop({{ labels }}, label_groups$VR, ADAC == 1),
      "{prefix}_BI_prop_adac" :=
        label_prop({{ labels }}, label_groups$BI, ADAC == 1),
      # Tags on rows with ADAC == 0
      "{prefix}_BE_prop_no_adac" :=
        label_prop({{ labels }}, label_groups$BE, ADAC == 0),
      "{prefix}_SOL_prop_no_adac" :=
        label_prop({{ labels }}, label_groups$SOL, ADAC == 0),
      "{prefix}_VR_prop_no_adac" :=
        label_prop({{ labels }}, label_groups$VR, ADAC == 0),
      "{prefix}_BI_prop_no_adac" :=
        label_prop({{ labels }}, label_groups$BI, ADAC == 0),
      .groups = "drop"
    )
}

# ---- 1a. Human label proportions ----

# Turn one `human_labels` cell into a character vector of labels.
# Cells that look like a deparsed R vector (`c(...)`) are evaluated; any other
# cell is taken as a single label.
parse_human_labels <- function(x) {
  if (str_detect(x, "^\\s*c\\(")) {
    # NOTE(review): this executes text read from the CSV as R code. Only safe
    # while the input file is trusted; consider a regex-based parser instead.
    eval(parse(text = x))
  } else {
    x
  }
}

human_list_unified_df <- unified_df |>
  filter(!is.na(human_labels)) |>
  mutate(list_human_labels = map(human_labels, parse_human_labels)) |>
  unnest(list_human_labels, keep_empty = TRUE) |>
  # Drops missing labels, the literal string "NA", and blank labels. Blank
  # labels are removed so that rows without a label do not count as a tag,
  # matching the handling in the OLMO pipeline below.
  filter(list_human_labels != "NA", list_human_labels != "") |>
  summarise_label_props(list_human_labels, "human")

# ---- 1b. OLMO label proportions ----

olmo_list_unified_df <- unified_df |>
  mutate(
    olmo_sentence_labels = tidyr::replace_na(olmo_sentence_labels, ""),
    # Pull out the text between single or double quotes. The pattern can also
    # capture the separator text between two adjacent quoted labels; those
    # fragments become empty once non-letters are stripped and are dropped.
    list_olmo_labels = str_extract_all(
      olmo_sentence_labels,
      "(?<=')[^']+(?=')|(?<=\")[^\"]+(?=\")"
    )
  ) |>
  unnest(list_olmo_labels, keep_empty = TRUE) |>
  filter(
    !is.na(list_olmo_labels),
    list_olmo_labels != "",
    list_olmo_labels != "NA"
  ) |>
  # Keep only letters and spaces, then collapse repeated whitespace.
  mutate(
    list_olmo_labels = str_squish(
      str_replace_all(list_olmo_labels, "[^\\p{L} ]+", "")
    )
  ) |>
  filter(list_olmo_labels != "") |>
  mutate(
    # Map known variants onto the spelling used in `valid_categories`.
    # "WORKAROUND" is normalised to "WORKAROUNDS" (the valid spelling) so it is
    # no longer reclassified as invalid by the check that follows.
    olmo_label = case_when(
      list_olmo_labels == "WORKAROUND" ~ "WORKAROUNDS",
      list_olmo_labels == "BUG REPORT" ~ "BUG REPRODUCTION",
      TRUE ~ list_olmo_labels
    ),
    # Unknown labels stay in the data (and in the denominators) under a
    # single placeholder value.
    olmo_label = if_else(
      olmo_label %in% valid_categories,
      olmo_label,
      "INVALID LABEL"
    )
  ) |>
  summarise_label_props(olmo_label, "olmo")

# Task-level metadata, taken from the rows whose comment_type is
# "task_description".
descriptions <- unified_df |>
  filter(comment_type == "task_description") |>
  select(
    TaskPHID, task_title, date_created, date_closed, isAuthorWMF,
    source, phase, week_index, author_closer, resolution_outcome
  )

# Per-task medians, then the description fields and both sets of label
# proportions attached by TaskPHID.
# The gerrit medians ignore missing values; the PC medians do not, so a missing
# PC value in a task yields NA for that task's median.
task_level_variables <- unified_df |>
  group_by(TaskPHID) |>
  summarise(
    median_gerrit_loc_delta = median(
      gerrit_code_insertions + gerrit_code_deletions,
      na.rm = TRUE
    ),
    median_gerrit_reviewers = median(gerrit_reviewer_count, na.rm = TRUE),
    median_PC3 = median(PC3),
    median_PC3_adac = median(PC3[ADAC == 1]),
    median_PC3_no_adac = median(PC3[ADAC == 0]),
    median_PC1 = median(PC1),
    median_PC1_adac = median(PC1[ADAC == 1]),
    median_PC1_no_adac = median(PC1[ADAC == 0]),
    median_PC4 = median(PC4),
    median_PC4_adac = median(PC4[ADAC == 1]),
    median_PC4_no_adac = median(PC4[ADAC == 0])
  ) |>
  left_join(descriptions, by = "TaskPHID") |>
  left_join(olmo_list_unified_df, by = "TaskPHID") |>
  left_join(human_list_unified_df, by = "TaskPHID")
# 2. Assign a sampling probability to each task.
# Tasks selected in the first round of sampling and removed for the second
# round need to be identified; they are taken to be the TaskPHIDs present in
# the earlier human-labels file.
large_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102025_human_labels.csv"
large_human_labels_df <- read.csv(file = large_human_labels_csv, header = TRUE)
first_sample_tasks <- unique(as.character(large_human_labels_df$TaskPHID))

# Probabilities follow the DSL specification sheet.
task_level_variables <- task_level_variables |>
  mutate(
    # Helper flag; removed again once the probabilities are assigned.
    isFirstSample = TaskPHID %in% first_sample_tasks,
    sampling_prob = case_when(
      source == "c2" ~ 0.086,
      source == "c3" ~ 0.055,
      source == "c1" &
        (phase == 3 & isAuthorWMF == TRUE & !isFirstSample) ~ 0.045,
      source == "c1" &
        !(phase == 3 & isAuthorWMF == TRUE & !isFirstSample) ~ 0.021
    ),
    # 1 when resolution_outcome equals "TRUE", 0 otherwise (NA stays NA).
    dsl_score = ifelse(resolution_outcome == "TRUE", 1, 0),
    # Closed minus created, divided by 3600.
    # NOTE(review): this is hours only if both dates are numeric seconds - confirm.
    TTR = (date_closed - date_created) / 3600
  ) |>
  select(-isFirstSample)
# 3. Check the validity of the aggregate variables.
# Printed for inspection: mean sampling probability and outcome counts.
mean(task_level_variables[["sampling_prob"]])
table(task_level_variables[["resolution_outcome"]])

# Bivariate plots.
# Distribution of week_index by resolution outcome, one panel per source.
ggplot(
  data = task_level_variables,
  mapping = aes(
    x = as.factor(source),
    y = week_index,
    fill = resolution_outcome
  )
) +
  geom_boxplot(position = position_dodge(width = 0.9), alpha = 0.7) +
  facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "Boxplot of week_index against Resolution Outcome",
    x = "Case",
    y = "Week Index",
    fill = "Resolution Outcome"
  )

# Scatter of the per-task median PC3 over ADAC == 1 rows against TTR, filled
# by isAuthorWMF, one panel per source.
# The x aesthetic must use the column name exactly as created upstream
# (`median_PC3_adac`, lower-case suffix); column names are case-sensitive.
ggplot(
  task_level_variables,
  aes(
    x = median_PC3_adac,
    y = TTR,
    fill = isAuthorWMF
  )
) +
  facet_grid(~source, scales = "fixed") +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  xlim(-20, 20) +
  # 1440 = 60 * 24, matching the "up to 60 days" axis label if TTR is in hours.
  ylim(0, 1440) +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "Median PC3 Value in ADAC Comments",
    x = "Median PC3 Value",
    y = "Time to Resolution (up to 60 days)"
  )
# 4. Save the task-level table as CSV (path is relative to the working
# directory), without row names.
write.csv(
  x = task_level_variables,
  file = "102725_DSL_df_adac.csv",
  row.names = FALSE
)