# mw-lifecycle-analysis/dsl/dsl_aggregation.R

library(tidyverse)

# Load the unified analysis table from disk.
unified_csv <- "~/analysis_data/110925_unified.csv"
unified_df <- read.csv(file = unified_csv, header = TRUE)

# 1. aggregate to the task level
#   1a. create human info proportions (ADAC/general)
#   1b. create OLMO info proportions (ADAC/general)
#   1c.
# Label categories accepted from the OLMO output; any other OLMO label is
# recoded to "INVALID LABEL" before aggregation.
valid_categories <- c(
  "EXPECTED BEHAVIOR",
  "MOTIVATION",
  "OBSERVED BUG BEHAVIOR",
  "BUG REPRODUCTION",
  "INVESTIGATION AND EXPLORATION",
  "SOLUTION DISCUSSION",
  "CONTRIBUTION AND COMMITMENT",
  "TASK PROGRESS",
  "TESTING",
  "FUTURE PLAN",
  "POTENTIAL NEW ISSUES AND REQUESTS",
  "SOLUTION USAGE",
  "WORKAROUNDS",
  "ISSUE CONTENT MANAGEMENT",
  "ACTION ON ISSUE",
  "SOCIAL CONVERSATION"
)
library(dplyr)
library(purrr)
library(stringr)
# Label families used for the task-level proportions. Each entry maps the
# family code that appears in the output column names to its categories.
label_families <- list(
  EP = c(
    "OBSERVED BUG BEHAVIOR", "BUG REPRODUCTION",
    "INVESTIGATION AND EXPLORATION"
  ),
  TSOL = c(
    "SOLUTION DISCUSSION", "SOLUTION USAGE", "TESTING", "WORKAROUNDS",
    "TASK PROGRESS"
  ),
  DIO = c("MOTIVATION", "EXPECTED BEHAVIOR"),
  RK = c("ACTION ON ISSUE", "ISSUE CONTENT MANAGEMENT")
)

# Share of the labels selected by `keep` that belong to `categories`.
# Returns NA when no label is selected, or when the count of selected labels
# is itself NA (i.e. `keep` is NA for a non-missing label).
label_share <- function(labels, categories,
                        keep = rep(TRUE, length(labels))) {
  n_selected <- sum(!is.na(labels) & keep)
  if (is.na(n_selected) || n_selected == 0L) {
    return(NA_real_)
  }
  mean(labels[keep] %in% categories)
}

# One-row tibble holding `<col_prefix>_<family>_prop`, `..._prop_adac` and
# `..._prop_no_adac` for every family: all rows first, then ADAC == 1 rows,
# then ADAC == 0 rows.
label_props <- function(labels, adac, col_prefix, families = label_families) {
  row_sets <- list(rep(TRUE, length(labels)), adac == 1, adac == 0)
  suffixes <- c("", "_adac", "_no_adac")
  props <- list()
  for (i in seq_along(row_sets)) {
    for (family in names(families)) {
      col_name <- paste0(col_prefix, "_", family, "_prop", suffixes[[i]])
      props[[col_name]] <- label_share(
        labels, families[[family]], row_sets[[i]]
      )
    }
  }
  tibble::as_tibble(props)
}

# Collapse a one-row-per-tag table to one row per TaskPHID with the label
# proportions computed from the column named by `label_col`.
summarise_label_props <- function(df, label_col, col_prefix) {
  df |>
    group_by(TaskPHID) |>
    summarise(
      label_props(.data[[label_col]], ADAC, col_prefix),
      .groups = "drop"
    )
}

# Split one `human_labels` cell into its labels. Cells that look like an R
# `c(...)` literal are read by pulling out the quoted strings; anything else
# is kept as a single label.
# NOTE(review): this replaces eval(parse(text = ...)), which executed the cell
# contents as R code. Labels containing quote characters are not supported.
parse_human_labels <- function(x) {
  if (!str_detect(x, "^\\s*c\\(")) {
    return(x)
  }
  quoted <- str_extract_all(x, "\"[^\"]*\"|'[^']*'")[[1]]
  str_sub(quoted, 2L, -2L)
}

# 1a. human label proportions per task (overall / ADAC / no ADAC)
human_list_unified_df <- unified_df |>
  filter(!is.na(human_labels)) |>
  mutate(list_human_labels = map(human_labels, parse_human_labels)) |>
  tidyr::unnest(list_human_labels, keep_empty = TRUE) |>
  # Drop missing, empty and literal "NA" entries so that only real tags
  # enter the denominators (same rule as the OLMO labels below).
  filter(
    !is.na(list_human_labels),
    list_human_labels != "",
    list_human_labels != "NA"
  ) |>
  summarise_label_props("list_human_labels", "human")

# 1b. OLMO label proportions per task (overall / ADAC / no ADAC)
olmo_list_unified_df <- unified_df |>
  mutate(
    olmo_sentence_labels = tidyr::replace_na(olmo_sentence_labels, ""),
    # Everything between a pair of quotes is extracted; the separators that
    # sit between two labels are matched too and are removed by the cleaning
    # step below.
    list_olmo_labels = str_extract_all(
      olmo_sentence_labels,
      "(?<=')[^']+(?=')|(?<=\")[^\"]+(?=\")"
    )
  ) |>
  tidyr::unnest(list_olmo_labels, keep_empty = TRUE) |>
  filter(
    !is.na(list_olmo_labels),
    list_olmo_labels != "",
    list_olmo_labels != "NA"
  ) |>
  # Keep letters and single spaces only.
  mutate(
    list_olmo_labels = str_squish(
      str_replace_all(list_olmo_labels, "[^\\p{L} ]+", "")
    )
  ) |>
  filter(list_olmo_labels != "") |>
  mutate(
    # Normalise spelling variants to the names used in `valid_categories`.
    # "WORKAROUND" must become "WORKAROUNDS": the singular form is not a
    # valid category and would otherwise be recoded to "INVALID LABEL" and
    # never counted in the TSOL proportions.
    olmo_label = case_when(
      list_olmo_labels %in% c("WORKAROUNDS", "WORKAROUND") ~ "WORKAROUNDS",
      list_olmo_labels %in% c("BUG REPORT", "BUG REPRODUCTION") ~
        "BUG REPRODUCTION",
      TRUE ~ list_olmo_labels
    ),
    olmo_label = if_else(
      olmo_label %in% valid_categories,
      olmo_label,
      "INVALID LABEL"
    )
  ) |>
  summarise_label_props("olmo_label", "olmo")

# Task-level summaries of the remaining variables; the label proportions are
# joined onto this table afterwards.
# Note: the PC1/PC3/PC4 medians and `n_comments_before` are computed without
# na.rm, so a missing value within a task yields NA for that task.
task_level_variables <- unified_df |>
  mutate(gerrit_loc_delta = gerrit_code_insertions + gerrit_code_deletions) |>
  group_by(TaskPHID) |>
  summarise(
    median_gerrit_loc_delta = median(gerrit_loc_delta, na.rm = TRUE),
    median_gerrit_reviewers = median(gerrit_reviewer_count, na.rm = TRUE),
    median_PC3 = median(PC3),
    median_PC3_adac = median(PC3[ADAC == 1]),
    median_PC3_no_adac = median(PC3[ADAC == 0]),
    median_PC1 = median(PC1),
    median_PC1_adac = median(PC1[ADAC == 1]),
    median_PC1_no_adac = median(PC1[ADAC == 0]),
    median_PC4 = median(PC4),
    median_PC4_adac = median(PC4[ADAC == 1]),
    median_PC4_no_adac = median(PC4[ADAC == 0]),
    # rows with a non-missing id
    n_comments = sum(!is.na(id)),
    n_comments_before = sum(before_close),
    .groups = "drop"
  )

# Task metadata, taken from the rows whose comment_type is "task_description".
descriptions <- unified_df |>
  filter(comment_type == "task_description") |>
  select(all_of(c(
    "TaskPHID", "task_title", "date_created", "date_closed", "isAuthorWMF",
    "source", "phase", "week_index", "author_closer", "resolution_outcome",
    "priority"
  )))

# Attach the task metadata, then the OLMO and the human label proportions.
# Left joins keep every task present in the aggregate table.
task_level_variables <- task_level_variables |>
  left_join(descriptions, by = "TaskPHID") |>
  left_join(olmo_list_unified_df, by = "TaskPHID") |>
  left_join(human_list_unified_df, by = "TaskPHID")
# 2. assign sampling prob for different tasks
# Tasks selected in the first round of sampling were removed for the second
# round, so they have to be identified first.
large_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102025_human_labels.csv"
large_human_labels_df <- read.csv(file = large_human_labels_csv, header = TRUE)
first_sample_tasks <- unique(as.character(large_human_labels_df$TaskPHID))

# Probabilities are taken from the DSL specification sheet.
task_level_variables <- task_level_variables |>
  mutate(
    # temporary flag, dropped again below
    isFirstSample = TaskPHID %in% first_sample_tasks,
    sampling_prob = case_when(
      source == "c2" ~ 0.086,
      source == "c3" ~ 0.055,
      source == "c1" &
        (phase == 3 & isAuthorWMF == TRUE & isFirstSample == FALSE) ~ 0.045,
      source == "c1" &
        !(phase == 3 & isAuthorWMF == TRUE & isFirstSample == FALSE) ~ 0.021,
    ),
    # 1 when resolution_outcome equals "TRUE", otherwise 0
    dsl_score = ifelse(resolution_outcome == "TRUE", 1, 0),
    # NOTE(review): presumably both dates are in seconds, making TTR hours;
    # confirm against the upstream data.
    TTR = (date_closed - date_created) / 3600
  ) |>
  select(-isFirstSample)
# 3. check validity of different aggregate variables
# mean sampling probability, then counts per resolution outcome
mean(task_level_variables[["sampling_prob"]])
table(task_level_variables[["resolution_outcome"]])
# look at bivariate plots
# week_index per source, one facet per source, filled by resolution outcome
ggplot(
  task_level_variables,
  aes(x = as.factor(source), y = week_index, fill = resolution_outcome)
) +
  geom_boxplot(position = position_dodge(width = 0.9), alpha = 0.7) +
  facet_grid(. ~ source, scales = "fixed") +
  labs(
    title = "Boxplot of week_index against Resolution Outcome",
    x = "Case",
    y = "Week Index",
    fill = "Resolution Outcome"
  ) +
  scale_fill_viridis_d() +
  theme_minimal()


# olmo_RK_prop per source, one box (and fill colour) per source
ggplot(
  task_level_variables,
  aes(x = as.factor(source), y = olmo_RK_prop, fill = as.factor(source))
) +
  geom_boxplot(position = position_dodge(width = 0.9), alpha = 0.7) +
  theme_minimal() +
  scale_fill_viridis_d()

# 4. save
# Written relative to the current working directory, without row names.
write.csv(
  x = task_level_variables,
  file = "111725_DSL_frame.csv",
  row.names = FALSE
)