updated DSL data aggregation

2025-10-27 10:28:08 -07:00 · 2025-10-27 10:28:08 -07:00 · ab1cb3efea
commit ab1cb3efea
parent e955b4f50f
5 changed files with 381118 additions and 13 deletions
--- a/analysis_data/102725_unified.csv
+++ b/analysis_data/102725_unified.csv
--- a/analysis_data/data_verification_3.R
+++ b/analysis_data/data_verification_3.R
@ -4,7 +4,7 @@ library(tidyr)
 library(dplyr)
 library(purrr)
-main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
+main_csv <- "~/analysis_data/stale_unifieds/100625_unified_w_affil.csv"
 main_df <- read.csv(main_csv, header = TRUE) 
 #filter out existing olmo stuff
@ -36,9 +36,11 @@ main_df <- main_df |>
  mutate(created = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")) |>
  left_join(desc_info, by = "TaskPHID") |>
  mutate(
-    ADAC = as.integer(!is.na(task_desc_author) &
+    ADAC = as.integer(
-                        AuthorPHID == task_desc_author &
+      !is.na(task_desc_author) &
-                        created < task_desc_dateClosed)
+        AuthorPHID == task_desc_author &
        (is.na(task_desc_dateClosed) | created < task_desc_dateClosed)
    )
  )
 # add dictionary values 
 modal_verb_list <- c("will", "may", "can", "shall", "must", 
@ -130,6 +132,6 @@ pulling <- unified_df |>
 pulling <- unified_df |>
  filter(id == "23366" | id == "20846" | id == "20847")
-write.csv(unified_df, "102425_unified.csv", row.names = FALSE)
+write.csv(unified_df, "102725_unified.csv", row.names = FALSE)
--- a/dsl/102725_DSL_df.csv
+++ b/dsl/102725_DSL_df.csv
--- a/dsl/archived_dsl_data/dsl_data_transform.R
+++ b/dsl/archived_dsl_data/dsl_data_transform.R
@ -3,8 +3,8 @@ library(tidyverse)
 # load in the human labels and for each task filer, @ the task level
 # GET the proportion of Observed bug behavior  + Expected Behavior
 # GET the proportion of Solution Discussion  + Solution Usage 
-human_csv <-"~/dsl/092225_info_matt_labels.csv"
+unified_csv <-"~/analysis_data/102425_unified.csv"
-human_df <- read.csv(human_csv, header = TRUE) 
+unified_df <- read.csv(unified_csv, header = TRUE) 
 #task_authors <- human_df %>%
 #  filter(comment_type == "task_description") %>%
@ -19,8 +19,8 @@ human_df <- read.csv(human_csv, header = TRUE)
 #      mean(rows_by_author$label %in% c("Observed bug behavior", "Expected behavior"))
 #    }
 #  ) %>%
-#  ungroup()
+#  ungrou
-human_result <- human_df %>%
+unified_df<- unified_df %>%
  group_by(TaskPHID) %>%
  summarise(
    human_BE_prop = mean(human_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")),
--- a/dsl/dsl_aggregation.R
+++ b/dsl/dsl_aggregation.R
@ -1,16 +1,250 @@
 library(tidyverse)
-unified_csv <-"~/analysis_data/102125_unified.csv"
+unified_csv <-"~/analysis_data/102725_unified.csv"
-unified_df <- read.csv(human_csv, header = TRUE) 
+unified_df <- read.csv(unified_csv, header = TRUE) 
 # 1. aggregate to the task level 
 #   1a. create human info proportions (ADAC/general)
 #   1b. create OLMO info proportions (ADAC/general)
 #   1c. 
 # Closed set of information-type labels accepted by this script. It is used
 # further down to flag normalised OLMO labels that fall outside the set.
 valid_categories <- c(
   "EXPECTED BEHAVIOR",
   "MOTIVATION",
   "OBSERVED BUG BEHAVIOR",
   "BUG REPRODUCTION",
   "INVESTIGATION AND EXPLORATION",
   "SOLUTION DISCUSSION",
   "CONTRIBUTION AND COMMITMENT",
   "TASK PROGRESS",
   "TESTING",
   "FUTURE PLAN",
   "POTENTIAL NEW ISSUES AND REQUESTS",
   "SOLUTION USAGE",
   "WORKAROUNDS",
   "ISSUE CONTENT MANAGEMENT",
   "ACTION ON ISSUE",
   "SOCIAL CONVERSATION"
 )
 # Task-level proportions of human-assigned information-type labels.
 #
 # Input:  unified_df, using columns TaskPHID, human_labels and ADAC.
 # Output: one row per TaskPHID that has at least one usable human label, with
 #   human_{BE,SOL,VR,BI}_prop       share of all label tags in each family
 #   human_{BE,SOL,VR,BI}_prop_adac  same share, restricted to ADAC == 1 rows
 #                                   (NA when the task has no such rows)
 #
 # `human_labels` holds either one plain label or a deparsed character vector
 # such as c("A", "B"). The vector form is split with a regex rather than
 # eval(parse()), so the contents of a CSV cell are never executed as code.
 human_list_unified_df <- unified_df |>
  filter(!is.na(human_labels)) |>
  mutate(list_human_labels = map(human_labels, \(raw_labels) {
    if (str_detect(raw_labels, "^\\s*c\\(")) {
      # Grab every quoted item, then strip the surrounding quote characters.
      quoted <- str_extract_all(raw_labels, "\"[^\"]*\"|'[^']*'")[[1]]
      str_sub(quoted, 2L, -2L)
    } else {
      raw_labels
    }
  })) |>
  # One row per (comment, label); cells that yield no label become NA here.
  unnest(list_human_labels, keep_empty = TRUE) |>
  filter(!is.na(list_human_labels), list_human_labels != "NA") |>
  mutate(
    # %in% never returns NA, so a missing ADAC counts as "not ADAC" instead of
    # turning every ADAC proportion of the task into NA.
    is_adac = ADAC %in% 1,
    is_BE = list_human_labels %in% c("OBSERVED BUG BEHAVIOR",
                                     "EXPECTED BEHAVIOR"),
    is_SOL = list_human_labels %in% c("SOLUTION DISCUSSION",
                                      "SOLUTION USAGE"),
    is_VR = list_human_labels %in% c("OBSERVED BUG BEHAVIOR",
                                     "EXPECTED BEHAVIOR",
                                     "SOLUTION DISCUSSION",
                                     "SOLUTION USAGE",
                                     "INVESTIGATION AND EXPLORATION"),
    is_BI = list_human_labels %in% c("BUG REPRODUCTION",
                                     "INVESTIGATION AND EXPLORATION")
  ) |>
  group_by(TaskPHID) |>
  summarise(
    # Overall proportions (all comments). Every group has at least one tag
    # after the filter above, so no empty-group guard is needed.
    human_BE_prop = mean(is_BE),
    human_SOL_prop = mean(is_SOL),
    human_VR_prop = mean(is_VR),
    human_BI_prop = mean(is_BI),
    # ADAC == 1 proportions; NA when the task has no ADAC-tagged rows.
    human_BE_prop_adac = if (any(is_adac)) mean(is_BE[is_adac]) else NA_real_,
    human_SOL_prop_adac = if (any(is_adac)) mean(is_SOL[is_adac]) else NA_real_,
    human_VR_prop_adac = if (any(is_adac)) mean(is_VR[is_adac]) else NA_real_,
    human_BI_prop_adac = if (any(is_adac)) mean(is_BI[is_adac]) else NA_real_,
    .groups = "drop"
  )
 # Task-level proportions of OLMO-assigned information-type labels.
 #
 # Input:  unified_df (TaskPHID, olmo_sentence_labels, ADAC) and the
 #         valid_categories vector defined earlier in this script.
 # Output: one row per TaskPHID that has at least one usable OLMO label, with
 #   olmo_{BE,SOL,VR,BI}_prop       share of all label tags in each family
 #   olmo_{BE,SOL,VR,BI}_prop_adac  same share, restricted to ADAC == 1 rows
 #                                  (NA when the task has no such rows)
 olmo_list_unified_df <- unified_df |>
  mutate(olmo_sentence_labels = tidyr::replace_na(olmo_sentence_labels, "")) |>
  # Extract the text between matching quote characters. The lookaround pattern
  # can also capture the separators between quoted items (e.g. ", "); those
  # are emptied by the letter-only clean-up below and then filtered out.
  mutate(list_olmo_labels = str_extract_all(
    olmo_sentence_labels,
    "(?<=')[^']+(?=')|(?<=\")[^\"]+(?=\")"
  )) |>
  unnest(list_olmo_labels, keep_empty = TRUE) |>
  filter(
    !is.na(list_olmo_labels),
    list_olmo_labels != "",
    list_olmo_labels != "NA"
  ) |>
  # Keep letters and spaces only, then collapse repeated whitespace.
  mutate(
    list_olmo_labels = str_squish(
      str_replace_all(list_olmo_labels, "[^\\p{L} ]+", "")
    )
  ) |>
  filter(list_olmo_labels != "") |>
  mutate(
    # Normalise known variants onto the spelling used in valid_categories.
    # The canonical form is "WORKAROUNDS": mapping to "WORKAROUND" made every
    # workaround label fail the validity check and become "INVALID LABEL".
    olmo_label = case_when(
      list_olmo_labels %in% c("WORKAROUNDS", "WORKAROUND") ~ "WORKAROUNDS",
      list_olmo_labels %in% c("BUG REPORT", "BUG REPRODUCTION") ~
        "BUG REPRODUCTION",
      list_olmo_labels %in% valid_categories ~ list_olmo_labels,
      TRUE ~ "INVALID LABEL"
    ),
    # %in% never returns NA, so a missing ADAC counts as "not ADAC" instead of
    # turning every ADAC proportion of the task into NA.
    is_adac = ADAC %in% 1,
    is_BE = olmo_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"),
    is_SOL = olmo_label %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"),
    is_VR = olmo_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
                              "SOLUTION DISCUSSION", "SOLUTION USAGE",
                              "INVESTIGATION AND EXPLORATION"),
    is_BI = olmo_label %in% c("BUG REPRODUCTION",
                              "INVESTIGATION AND EXPLORATION")
  ) |>
  group_by(TaskPHID) |>
  summarise(
    # Overall proportions (all comments). Invalid labels stay in the
    # denominator. Every group has at least one tag after the filters above.
    olmo_BE_prop = mean(is_BE),
    olmo_SOL_prop = mean(is_SOL),
    olmo_VR_prop = mean(is_VR),
    olmo_BI_prop = mean(is_BI),
    # ADAC == 1 proportions; NA when the task has no ADAC-tagged rows.
    olmo_BE_prop_adac = if (any(is_adac)) mean(is_BE[is_adac]) else NA_real_,
    olmo_SOL_prop_adac = if (any(is_adac)) mean(is_SOL[is_adac]) else NA_real_,
    olmo_VR_prop_adac = if (any(is_adac)) mean(is_VR[is_adac]) else NA_real_,
    olmo_BI_prop_adac = if (any(is_adac)) mean(is_BI[is_adac]) else NA_real_,
    .groups = "drop"
  )
 # aggregate other Task-level variables and then join
 # Per-task medians of the comment-level numeric variables.
 # All four medians drop NAs, so a single missing value no longer turns the
 # whole task's median into NA (previously only the gerrit medians did this).
 task_level_variables <- unified_df |>
  group_by(TaskPHID) |>
  summarise(
    median_gerrit_loc_delta = median(
      gerrit_code_insertions + gerrit_code_deletions,
      na.rm = TRUE
    ),
    median_gerrit_reviewers = median(gerrit_reviewer_count, na.rm = TRUE),
    median_PC3 = median(PC3, na.rm = TRUE),
    # ADAC %in% 1 (not ADAC == 1): indexing with an NA mask would insert NA
    # elements. The result is NA when the task has no ADAC == 1 rows.
    median_PC3_ADAC = median(PC3[ADAC %in% 1], na.rm = TRUE),
    .groups = "drop"
  )
 # Task metadata is taken from the rows whose comment_type is
 # "task_description".
 descriptions <- unified_df |>
  filter(comment_type == "task_description") |>
  select(
    TaskPHID, task_title, date_created, date_closed, isAuthorWMF,
    source, phase, week_index, author_closer, resolution_outcome
  )
 # Attach, in order: task metadata, OLMO label proportions, human label
 # proportions. Left joins keep every task from the medians table.
 task_level_variables <- task_level_variables |>
  left_join(descriptions, by = "TaskPHID") |>
  left_join(olmo_list_unified_df, by = "TaskPHID") |>
  left_join(human_list_unified_df, by = "TaskPHID")
 # 2. assign sampling prob for different tasks
 # Need to identify the tasks selected in the first round of sampling, which were removed from the pool for the second round of sampling.
 # Task IDs present in the earlier human-label file; used below to flag tasks
 # that belong to the first sample.
 large_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102025_human_labels.csv"
 large_human_labels_df <- read.csv(large_human_labels_csv, header = TRUE)
 first_sample_tasks <- large_human_labels_df$TaskPHID |>
  as.character() |>
  unique()
 # refer to DSL specification sheet 
-
+task_level_variables <- task_level_variables |>
  mutate(
    # Helper flag, dropped again at the end of this pipeline.
    isFirstSample = TaskPHID %in% first_sample_tasks,
    # Sampling probability per source. There is no default branch, so rows
    # matching none of the conditions (including NA comparisons) get NA.
    sampling_prob = case_when(
      source == "c2" ~ 0.086,
      source == "c3" ~ 0.055,
      source == "c1" &
        (phase == 3 & isAuthorWMF == TRUE & isFirstSample == FALSE) ~ 0.045,
      source == "c1" &
        !(phase == 3 & isAuthorWMF == TRUE & isFirstSample == FALSE) ~ 0.021
    ),
    # 1 when resolution_outcome equals "TRUE", otherwise 0.
    dsl_score = ifelse(resolution_outcome == "TRUE", 1, 0),
    # Presumably the dates are epoch seconds, making TTR hours - TODO confirm.
    TTR = (date_closed - date_created) / 3600
  ) |>
  select(-isFirstSample)
 # 3. check validity of different aggregate variables 
 # Mean sampling probability across tasks. mean() is called without na.rm,
 # so the result is NA if any task has an NA sampling_prob.
 mean(task_level_variables$sampling_prob)
 # Number of tasks per resolution_outcome value (table() omits NA by default).
 table(task_level_variables$resolution_outcome)
 # look at bivariate plots 
 # Week index by source, split by resolution outcome. Source is both the
 # x variable and the facet variable.
 task_level_variables |>
  ggplot(aes(x = as.factor(source), y = week_index, fill = resolution_outcome)) +
  facet_grid(. ~ source, scales = "fixed") +
  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
  scale_fill_viridis_d() +
  labs(
    title = "Boxplot of week_index against Resolution Outcome",
    x = "Case",
    y = "Week Index",
    fill = "Resolution Outcome"
  ) +
  theme_minimal()
-# 4. save
+task_level_variables |>
  # Median PC3 over ADAC comments against TTR, one panel per source.
  ggplot(aes(x = median_PC3_ADAC, y = TTR, fill = isAuthorWMF)) +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  facet_grid(~source, scales = "fixed") +
  # xlim()/ylim() remove points outside the limits instead of zooming.
  # 1440 matches the "60 days" axis label if TTR is in hours - TODO confirm.
  xlim(-20, 20) +
  ylim(0, 1440) +
  scale_fill_viridis_d() +
  labs(
    title = "Median PC3 Value in ADAC Comments",
    x = "Median PC3 Value",
    y = "Time to Resolution (up to 60 days)"
  ) +
  theme_minimal()
 # 4. save
 # Relative path: the CSV is written into the current working directory,
 # without a row-name column.
 write.csv(task_level_variables, "102725_DSL_df.csv", row.names = FALSE)