updated DSL data aggregation

2025-10-27 10:28:08 -07:00 · 2025-10-27 10:28:08 -07:00 · ab1cb3efea
commit ab1cb3efea
parent e955b4f50f
5 changed files with 381118 additions and 13 deletions
--- a/analysis_data/102725_unified.csv
+++ b/analysis_data/102725_unified.csv
--- a/analysis_data/data_verification_3.R
+++ b/analysis_data/data_verification_3.R
@ -4,7 +4,7 @@ library(tidyr)
 library(dplyr)
 library(purrr)

-main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
+main_csv <- "~/analysis_data/stale_unifieds/100625_unified_w_affil.csv"
 main_df <- read.csv(main_csv, header = TRUE) 

 #filter out existing olmo stuff
@ -36,9 +36,11 @@ main_df <- main_df |>
  mutate(created = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")) |>
  left_join(desc_info, by = "TaskPHID") |>
  mutate(
-    ADAC = as.integer(!is.na(task_desc_author) &
-                        AuthorPHID == task_desc_author &
-                        created < task_desc_dateClosed)
+    ADAC = as.integer(
+      !is.na(task_desc_author) &
+        AuthorPHID == task_desc_author &
+        (is.na(task_desc_dateClosed) | created < task_desc_dateClosed)
+    )
  )
 # add dictionary values 
 modal_verb_list <- c("will", "may", "can", "shall", "must", 
@ -130,6 +132,6 @@ pulling <- unified_df |>
 pulling <- unified_df |>
  filter(id == "23366" | id == "20846" | id == "20847")

-write.csv(unified_df, "102425_unified.csv", row.names = FALSE)
+write.csv(unified_df, "102725_unified.csv", row.names = FALSE)


--- a/dsl/102725_DSL_df.csv
+++ b/dsl/102725_DSL_df.csv
--- a/dsl/archived_dsl_data/dsl_data_transform.R
+++ b/dsl/archived_dsl_data/dsl_data_transform.R
@ -3,8 +3,8 @@ library(tidyverse)
 # load in the human labels and for each task filer, @ the task level
 # GET the proportion of Observed bug behavior  + Expected Behavior
 # GET the proportion of Solution Discussion  + Solution Usage 
-human_csv <-"~/dsl/092225_info_matt_labels.csv"
-human_df <- read.csv(human_csv, header = TRUE) 
+unified_csv <-"~/analysis_data/102425_unified.csv"
+unified_df <- read.csv(unified_csv, header = TRUE) 

 #task_authors <- human_df %>%
 #  filter(comment_type == "task_description") %>%
@ -19,8 +19,8 @@ human_df <- read.csv(human_csv, header = TRUE)
 #      mean(rows_by_author$label %in% c("Observed bug behavior", "Expected behavior"))
 #    }
 #  ) %>%
-#  ungroup()
-human_result <- human_df %>%
+#  ungroup()
+unified_df<- unified_df %>%
  group_by(TaskPHID) %>%
  summarise(
    human_BE_prop = mean(human_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")),
--- a/dsl/dsl_aggregation.R
+++ b/dsl/dsl_aggregation.R
@ -1,16 +1,250 @@
 library(tidyverse)

-unified_csv <-"~/analysis_data/102125_unified.csv"
-unified_df <- read.csv(human_csv, header = TRUE) 
+unified_csv <-"~/analysis_data/102725_unified.csv"
+unified_df <- read.csv(unified_csv, header = TRUE) 

 # 1. aggregate to the task level 
 #   1a. create human info proportions (ADAC/general)
 #   1b. create OLMO info proportions (ADAC/general)
 #   1c. 
+# Label vocabulary. OLMO labels that are not in this set are recoded to
+# "INVALID LABEL" further down.
+valid_categories <- c(
+  "EXPECTED BEHAVIOR",
+  "MOTIVATION",
+  "OBSERVED BUG BEHAVIOR",
+  "BUG REPRODUCTION",
+  "INVESTIGATION AND EXPLORATION",
+  "SOLUTION DISCUSSION",
+  "CONTRIBUTION AND COMMITMENT",
+  "TASK PROGRESS",
+  "TESTING",
+  "FUTURE PLAN",
+  "POTENTIAL NEW ISSUES AND REQUESTS",
+  "SOLUTION USAGE",
+  "WORKAROUNDS",
+  "ISSUE CONTENT MANAGEMENT",
+  "ACTION ON ISSUE",
+  "SOCIAL CONVERSATION"
+)

+# Task-level shares of human-assigned labels.
+# Rows without human labels are dropped, each row's label string is expanded
+# to one row per label, and the labels are summarised per TaskPHID: once over
+# all labelled rows and once over the rows with ADAC == 1.
+human_list_unified_df <- unified_df |>
+  # NA label strings are removed here, so no NA handling is needed below.
+  filter(!is.na(human_labels)) |>
+  mutate(list_human_labels = map(human_labels, ~ {
+    if (str_detect(.x, '^\\s*c\\(')) {
+      # Strings that look like a deparsed vector, e.g. c("A", "B"), are turned
+      # back into a character vector.
+      # NOTE(review): eval(parse()) executes whatever text is in the CSV cell.
+      # Confirm this column only ever holds label vectors written by our own
+      # pipeline, or switch to a non-evaluating parser.
+      eval(parse(text = .x))
+    } else {
+      .x
+    }
+  })) |>
+  unnest(list_human_labels, keep_empty = TRUE) |>
+  # Keep real labels only: drops the literal string "NA", blank strings (which
+  # would otherwise count as a tag in every denominator) and, because filter()
+  # discards rows whose condition is NA, missing labels as well.
+  filter(list_human_labels != "NA", list_human_labels != "") |>
+  group_by(TaskPHID) |>
+  summarise(
+    # Overall shares (all labelled rows of the task)
+    n_tags = sum(!is.na(list_human_labels)),
+    human_BE_prop = if_else(
+      n_tags == 0L,
+      NA_real_,
+      mean(list_human_labels %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
+    ),
+    human_SOL_prop = if_else(
+      n_tags == 0L,
+      NA_real_,
+      mean(list_human_labels %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
+    ),
+    human_VR_prop = if_else(
+      n_tags == 0L,
+      NA_real_,
+      mean(list_human_labels %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
+                                    "SOLUTION DISCUSSION", "SOLUTION USAGE",
+                                    "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
+    ),
+    human_BI_prop = if_else(
+      n_tags == 0L,
+      NA_real_,
+      mean(list_human_labels %in% c("BUG REPRODUCTION",
+                                    "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
+    ),
+
+    # ADAC == 1 shares.
+    # NOTE(review): if ADAC is NA on any row of a task, n_tags_adac is NA and
+    # all *_adac shares of that task come out NA. Confirm that is intended.
+    n_tags_adac = sum(!is.na(list_human_labels) & ADAC == 1),
+    human_BE_prop_adac = if_else(
+      n_tags_adac == 0L,
+      NA_real_,
+      mean(list_human_labels[ADAC == 1] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
+    ),
+    human_SOL_prop_adac = if_else(
+      n_tags_adac == 0L,
+      NA_real_,
+      mean(list_human_labels[ADAC == 1] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
+    ),
+    human_VR_prop_adac = if_else(
+      n_tags_adac == 0L,
+      NA_real_,
+      mean(list_human_labels[ADAC == 1] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
+                                               "SOLUTION DISCUSSION", "SOLUTION USAGE",
+                                               "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
+    ),
+    human_BI_prop_adac = if_else(
+      n_tags_adac == 0L,
+      NA_real_,
+      mean(list_human_labels[ADAC == 1] %in% c("BUG REPRODUCTION",
+                                               "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
+    ),
+    .groups = "drop"
+  ) |>
+  # The tag counts were only needed as guards for the shares.
+  select(-n_tags, -n_tags_adac)
+
+
+# Task-level shares of OLMO sentence labels.
+# The label string of each row is split into individual labels, the labels are
+# cleaned and mapped onto the vocabulary in valid_categories, and the result is
+# summarised per TaskPHID: once over all labels and once over ADAC == 1 rows.
+olmo_list_unified_df <- unified_df %>%
+  mutate(olmo_sentence_labels = tidyr::replace_na(olmo_sentence_labels, ""))|>
+  # Pull out everything that sits between a pair of single or double quotes.
+  # The lookarounds do not consume the quotes, so the separator text between
+  # two quoted items can match as well; the letters-only cleanup and the
+  # empty-string filter below remove those fragments.
+  mutate(list_olmo_labels = str_extract_all(
+    olmo_sentence_labels,
+    "(?<=')[^']+(?=')|(?<=\")[^\"]+(?=\")"
+  )) %>%
+  unnest(list_olmo_labels, keep_empty = TRUE) |>
+  filter(!is.na(list_olmo_labels), list_olmo_labels != "", list_olmo_labels != "NA") %>%
+  # Keep letters and spaces only, then collapse repeated whitespace.
+  mutate(list_olmo_labels = str_squish(str_replace_all(list_olmo_labels, "[^\\p{L} ]+", ""))) %>%
+  filter(list_olmo_labels != "") %>%
+  # Normalise to the spelling used in valid_categories ("WORKAROUNDS").
+  # Mapping to the singular form made every workaround label fail the
+  # vocabulary check below and end up as "INVALID LABEL".
+  mutate(olmo_label = ifelse(list_olmo_labels %in% c("WORKAROUNDS", "WORKAROUND"),
+                             "WORKAROUNDS",
+                             list_olmo_labels))|>
+  mutate(olmo_label = ifelse(olmo_label %in% c("BUG REPORT", "BUG REPRODUCTION"),
+                             "BUG REPRODUCTION",
+                             olmo_label))|>
+  # Anything still outside the vocabulary is kept as a tag (it counts in the
+  # denominators) but is marked invalid.
+  mutate(olmo_label = ifelse(!(olmo_label %in% valid_categories),
+                             "INVALID LABEL",
+                             olmo_label))|>
+  group_by(TaskPHID)|>
+  summarise(
+    # Overall shares (all labels of the task)
+    n_tags = sum(!is.na(olmo_label)),
+    olmo_BE_prop = if_else(
+      n_tags == 0L,
+      NA_real_,
+      mean(olmo_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
+    ),
+    olmo_SOL_prop = if_else(
+      n_tags == 0L,
+      NA_real_,
+      mean(olmo_label %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
+    ),
+    olmo_VR_prop = if_else(
+      n_tags == 0L,
+      NA_real_,
+      mean(olmo_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
+                             "SOLUTION DISCUSSION", "SOLUTION USAGE",
+                             "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
+    ),
+    olmo_BI_prop = if_else(
+      n_tags == 0L,
+      NA_real_,
+      mean(olmo_label %in% c("BUG REPRODUCTION",
+                             "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
+    ),
+    # ADAC == 1 shares
+    n_tags_adac = sum(!is.na(olmo_label) & ADAC == 1),
+    olmo_BE_prop_adac = if_else(
+      n_tags_adac == 0L,
+      NA_real_,
+      mean(olmo_label[ADAC == 1] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
+    ),
+    olmo_SOL_prop_adac = if_else(
+      n_tags_adac == 0L,
+      NA_real_,
+      mean(olmo_label[ADAC == 1] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
+    ),
+    olmo_VR_prop_adac = if_else(
+      n_tags_adac == 0L,
+      NA_real_,
+      mean(olmo_label[ADAC == 1] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
+                                        "SOLUTION DISCUSSION", "SOLUTION USAGE",
+                                        "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
+    ),
+    olmo_BI_prop_adac = if_else(
+      n_tags_adac == 0L,
+      NA_real_,
+      mean(olmo_label[ADAC == 1] %in% c("BUG REPRODUCTION",
+                                        "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
+    ),
+    .groups = "drop"
+  ) |>
+  # The tag counts were only needed as guards for the shares.
+  select(-n_tags, -n_tags_adac)
+
+# Aggregate the other task-level variables; the description fields and the
+# label shares are joined onto this table below.
+task_level_variables <- unified_df |>
+  group_by(TaskPHID) |>
+  summarise(
+    median_gerrit_loc_delta = median(gerrit_code_insertions + gerrit_code_deletions, na.rm = TRUE),
+    median_gerrit_reviewers = median(gerrit_reviewer_count, na.rm = TRUE),
+    # na.rm = TRUE, as for the gerrit medians: without it a single missing PC3
+    # value (or a missing ADAC flag, which indexes an NA into the subset) turns
+    # the median of the whole task into NA.
+    median_PC3 = median(PC3, na.rm = TRUE),
+    median_PC3_ADAC = median(PC3[ADAC == 1], na.rm = TRUE),
+    .groups = "drop"
+  )
+
+# Task metadata is taken from the task_description rows.
+descriptions <- unified_df |>
+  filter(comment_type == "task_description") |>
+  select(
+    TaskPHID, task_title, date_created, date_closed, isAuthorWMF,
+    source, phase, week_index, author_closer, resolution_outcome
+  )
+
+# Attach the metadata, then the OLMO shares, then the human shares; every
+# join is keyed on TaskPHID.
+task_level_variables <- task_level_variables |>
+  left_join(descriptions, by = "TaskPHID") |>
+  left_join(olmo_list_unified_df, by = "TaskPHID") |>
+  left_join(human_list_unified_df, by = "TaskPHID")
 # 2. assign sampling prob for different tasks
+# need to ID those selected in the first round of sampling that were removed for the second round of sampling
+large_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102025_human_labels.csv"
+large_human_labels_df <- read.csv(large_human_labels_csv, header = TRUE)
+first_sample_tasks <- unique(as.character(large_human_labels_df$TaskPHID))
 # refer to DSL specification sheet 
-
+# Add the sampling probability, a 0/1 score from resolution_outcome, and TTR
+# (date_closed - date_created, divided by 3600).
+task_level_variables <- task_level_variables |>
+  mutate(
+    # Temporary flag: phase 3, WMF author, and not among the first-sample tasks.
+    isPhase3WMFNotFirst = phase == 3 & isAuthorWMF == TRUE &
+      !(TaskPHID %in% first_sample_tasks),
+    sampling_prob = case_when(
+      source == "c2" ~ 0.086,
+      source == "c3" ~ 0.055,
+      source == "c1" & isPhase3WMFNotFirst ~ 0.045,
+      source == "c1" & !isPhase3WMFNotFirst ~ 0.021
+    ),
+    dsl_score = ifelse(resolution_outcome == "TRUE", 1, 0),
+    TTR = (date_closed - date_created) / 3600,
+    # Remove the temporary flag again.
+    isPhase3WMFNotFirst = NULL
+  )
 # 3. check validity of different aggregate variables 
+mean(task_level_variables$sampling_prob)
+table(task_level_variables$resolution_outcome)
+# look at bivariate plots 
+# Boxplots of week_index, filled by resolution_outcome, one panel per source.
+ggplot(
+  data = task_level_variables,
+  mapping = aes(x = as.factor(source), y = week_index, fill = resolution_outcome)
+) +
+  geom_boxplot(position = position_dodge(width = 0.9), alpha = 0.7) +
+  facet_grid(cols = vars(source), scales = "fixed") +
+  scale_fill_viridis_d() +
+  theme_minimal() +
+  labs(
+    title = "Boxplot of week_index against Resolution Outcome",
+    x = "Case",
+    y = "Week Index",
+    fill = "Resolution Outcome"
+  )

-# 4. save
+# Scatter of TTR against the median PC3 of ADAC comments, filled by
+# isAuthorWMF, one panel per source. Points outside the axis limits are not
+# drawn; the y limit of 1440 equals 60 days if TTR is in hours.
+ggplot(
+  data = task_level_variables,
+  mapping = aes(x = median_PC3_ADAC, y = TTR, fill = isAuthorWMF)
+) +
+  facet_grid(cols = vars(source), scales = "fixed") +
+  geom_point(shape = 21, size = 2, alpha = 0.3) +
+  xlim(-20, 20) +
+  ylim(0, 1440) +
+  scale_fill_viridis_d() +
+  theme_minimal() +
+  labs(
+    title = "Median PC3 Value in ADAC Comments",
+    x = "Median PC3 Value",
+    y = "Time to Resolution (up to 60 days)"
+  )
+# 4. save
+write.csv(task_level_variables, "102725_DSL_df.csv", row.names = FALSE)