1
0

updated DSL data aggregation

This commit is contained in:
Matthew Gaughan 2025-10-27 10:28:08 -07:00
parent e955b4f50f
commit ab1cb3efea
5 changed files with 381118 additions and 13 deletions

File diff suppressed because one or more lines are too long

View File

@ -4,7 +4,7 @@ library(tidyr)
library(dplyr)
library(purrr)
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
main_csv <- "~/analysis_data/stale_unifieds/100625_unified_w_affil.csv"
main_df <- read.csv(main_csv, header = TRUE)
#filter out existing olmo stuff
@ -36,9 +36,11 @@ main_df <- main_df |>
mutate(created = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")) |>
left_join(desc_info, by = "TaskPHID") |>
mutate(
ADAC = as.integer(!is.na(task_desc_author) &
AuthorPHID == task_desc_author &
created < task_desc_dateClosed)
ADAC = as.integer(
!is.na(task_desc_author) &
AuthorPHID == task_desc_author &
(is.na(task_desc_dateClosed) | created < task_desc_dateClosed)
)
)
# add dictionary values
modal_verb_list <- c("will", "may", "can", "shall", "must",
@ -130,6 +132,6 @@ pulling <- unified_df |>
pulling <- unified_df |>
filter(id == "23366" | id == "20846" | id == "20847")
write.csv(unified_df, "102425_unified.csv", row.names = FALSE)
write.csv(unified_df, "102725_unified.csv", row.names = FALSE)

3130
dsl/102725_DSL_df.csv Normal file

File diff suppressed because it is too large Load Diff

View File

@ -3,8 +3,8 @@ library(tidyverse)
# load in the human labels and for each task filer, @ the task level
# GET the proportion of Observed bug behavior + Expected Behavior
# GET the proportion of Solution Discussion + Solution Usage
human_csv <-"~/dsl/092225_info_matt_labels.csv"
human_df <- read.csv(human_csv, header = TRUE)
unified_csv <-"~/analysis_data/102425_unified.csv"
unified_df <- read.csv(unified_csv, header = TRUE)
#task_authors <- human_df %>%
# filter(comment_type == "task_description") %>%
@ -19,8 +19,8 @@ human_df <- read.csv(human_csv, header = TRUE)
# mean(rows_by_author$label %in% c("Observed bug behavior", "Expected behavior"))
# }
# ) %>%
# ungroup()
human_result <- human_df %>%
# ungroup()
unified_df<- unified_df %>%
group_by(TaskPHID) %>%
summarise(
human_BE_prop = mean(human_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")),

View File

@ -1,16 +1,250 @@
library(tidyverse)
unified_csv <-"~/analysis_data/102125_unified.csv"
unified_df <- read.csv(human_csv, header = TRUE)
unified_csv <-"~/analysis_data/102725_unified.csv"
unified_df <- read.csv(unified_csv, header = TRUE)
# 1. aggregate to the task level
# 1a. create human info proportions (ADAC/general)
# 1b. create OLMO info proportions (ADAC/general)
# 1c. aggregate the remaining task-level variables and join everything
# Closed vocabulary of label categories. Model-assigned labels that do not
# appear in this vector are recoded as invalid further down the script.
valid_categories <- c(
  "EXPECTED BEHAVIOR",
  "MOTIVATION",
  "OBSERVED BUG BEHAVIOR",
  "BUG REPRODUCTION",
  "INVESTIGATION AND EXPLORATION",
  "SOLUTION DISCUSSION",
  "CONTRIBUTION AND COMMITMENT",
  "TASK PROGRESS",
  "TESTING",
  "FUTURE PLAN",
  "POTENTIAL NEW ISSUES AND REQUESTS",
  "SOLUTION USAGE",
  "WORKAROUNDS",
  "ISSUE CONTENT MANAGEMENT",
  "ACTION ON ISSUE",
  "SOCIAL CONVERSATION"
)
# Split one `human_labels` cell into its individual labels.
#
# A cell is either a single label or the text of an R character vector such as
# c("A", "B"). The quoted items are pulled out with a regex instead of being
# evaluated as R code, so a malformed or hostile CSV cell cannot execute
# anything. Unquoted items (e.g. a bare NA) are simply not extracted.
#
# Returns a character vector (possibly empty), or NA_character_ for an NA cell.
parse_label_cell <- function(cell) {
  if (is.na(cell)) {
    return(NA_character_)
  }
  if (str_detect(cell, "^\\s*c\\(")) {
    return(str_match_all(cell, "[\"']([^\"']*)[\"']")[[1]][, 2])
  }
  cell
}

# Task-level proportions of human-assigned labels, one row per TaskPHID.
#
# Each proportion is (labels in the named set) / (all labels on the task).
# The *_adac columns restrict both numerator and denominator to rows with
# ADAC == 1 and are NA when the task has no such labels.
human_list_unified_df <- unified_df |>
  # only rows that were human-labelled at all
  filter(!is.na(human_labels)) |>
  mutate(list_human_labels = map(human_labels, parse_label_cell)) |>
  # one row per (comment, label); cells with no labels become a single NA row
  unnest(list_human_labels, keep_empty = TRUE) |>
  # drops the literal string "NA" and, because filter() discards NA
  # comparisons, real NA labels as well
  filter(list_human_labels != "NA") |>
  group_by(TaskPHID) |>
  summarise(
    # Overall proportions (all comments)
    n_tags = sum(!is.na(list_human_labels)),
    human_BE_prop = if_else(
      n_tags == 0L,
      NA_real_,
      mean(list_human_labels %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"),
           na.rm = TRUE)
    ),
    human_SOL_prop = if_else(
      n_tags == 0L,
      NA_real_,
      mean(list_human_labels %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"),
           na.rm = TRUE)
    ),
    human_VR_prop = if_else(
      n_tags == 0L,
      NA_real_,
      mean(list_human_labels %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
                                    "SOLUTION DISCUSSION", "SOLUTION USAGE",
                                    "INVESTIGATION AND EXPLORATION"),
           na.rm = TRUE)
    ),
    human_BI_prop = if_else(
      n_tags == 0L,
      NA_real_,
      mean(list_human_labels %in% c("BUG REPRODUCTION",
                                    "INVESTIGATION AND EXPLORATION"),
           na.rm = TRUE)
    ),
    # ADAC==1 proportions
    n_tags_adac = sum(!is.na(list_human_labels) & ADAC == 1),
    human_BE_prop_adac = if_else(
      n_tags_adac == 0L,
      NA_real_,
      mean(list_human_labels[ADAC == 1] %in%
             c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"),
           na.rm = TRUE)
    ),
    human_SOL_prop_adac = if_else(
      n_tags_adac == 0L,
      NA_real_,
      mean(list_human_labels[ADAC == 1] %in%
             c("SOLUTION DISCUSSION", "SOLUTION USAGE"),
           na.rm = TRUE)
    ),
    human_VR_prop_adac = if_else(
      n_tags_adac == 0L,
      NA_real_,
      mean(list_human_labels[ADAC == 1] %in%
             c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
               "SOLUTION DISCUSSION", "SOLUTION USAGE",
               "INVESTIGATION AND EXPLORATION"),
           na.rm = TRUE)
    ),
    human_BI_prop_adac = if_else(
      n_tags_adac == 0L,
      NA_real_,
      mean(list_human_labels[ADAC == 1] %in%
             c("BUG REPRODUCTION", "INVESTIGATION AND EXPLORATION"),
           na.rm = TRUE)
    ),
    .groups = "drop"
  ) |>
  # the counts were only needed to guard the divisions above
  select(-n_tags, -n_tags_adac)
# Task-level proportions of OLMo-assigned sentence labels, one row per
# TaskPHID.
#
# Each proportion is (labels in the named set) / (all labels on the task,
# including those recoded to "INVALID LABEL"). The *_adac columns restrict
# both numerator and denominator to rows with ADAC == 1 and are NA when the
# task has no such labels.
olmo_list_unified_df <- unified_df |>
  mutate(olmo_sentence_labels = tidyr::replace_na(olmo_sentence_labels, "")) |>
  # Pull out every run of text that sits between two quote characters. With
  # lookarounds the quotes are not consumed, so the separator between two
  # quoted items (e.g. ", ") is matched as well; it is removed by the
  # letters-only cleanup and empty-string filter below.
  mutate(list_olmo_labels = str_extract_all(
    olmo_sentence_labels,
    "(?<=')[^']+(?=')|(?<=\")[^\"]+(?=\")"
  )) |>
  unnest(list_olmo_labels, keep_empty = TRUE) |>
  filter(
    !is.na(list_olmo_labels),
    list_olmo_labels != "",
    list_olmo_labels != "NA"
  ) |>
  # keep letters and spaces only, then collapse whitespace
  mutate(list_olmo_labels = str_squish(
    str_replace_all(list_olmo_labels, "[^\\p{L} ]+", "")
  )) |>
  filter(list_olmo_labels != "") |>
  mutate(
    # Fold known spelling variants onto the spelling used in
    # `valid_categories`. The canonical form must be "WORKAROUNDS" (plural):
    # the singular is not in `valid_categories` and would be recoded to
    # "INVALID LABEL" by the next step.
    olmo_label = case_when(
      list_olmo_labels %in% c("WORKAROUNDS", "WORKAROUND") ~ "WORKAROUNDS",
      list_olmo_labels %in% c("BUG REPORT", "BUG REPRODUCTION") ~
        "BUG REPRODUCTION",
      TRUE ~ list_olmo_labels
    ),
    olmo_label = if_else(
      olmo_label %in% valid_categories,
      olmo_label,
      "INVALID LABEL"
    )
  ) |>
  group_by(TaskPHID) |>
  summarise(
    # Overall proportions (all comments)
    n_tags = sum(!is.na(olmo_label)),
    olmo_BE_prop = if_else(
      n_tags == 0L,
      NA_real_,
      mean(olmo_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"),
           na.rm = TRUE)
    ),
    olmo_SOL_prop = if_else(
      n_tags == 0L,
      NA_real_,
      mean(olmo_label %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"),
           na.rm = TRUE)
    ),
    olmo_VR_prop = if_else(
      n_tags == 0L,
      NA_real_,
      mean(olmo_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
                             "SOLUTION DISCUSSION", "SOLUTION USAGE",
                             "INVESTIGATION AND EXPLORATION"),
           na.rm = TRUE)
    ),
    olmo_BI_prop = if_else(
      n_tags == 0L,
      NA_real_,
      mean(olmo_label %in% c("BUG REPRODUCTION",
                             "INVESTIGATION AND EXPLORATION"),
           na.rm = TRUE)
    ),
    # ADAC==1 proportions
    n_tags_adac = sum(!is.na(olmo_label) & ADAC == 1),
    olmo_BE_prop_adac = if_else(
      n_tags_adac == 0L,
      NA_real_,
      mean(olmo_label[ADAC == 1] %in%
             c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"),
           na.rm = TRUE)
    ),
    olmo_SOL_prop_adac = if_else(
      n_tags_adac == 0L,
      NA_real_,
      mean(olmo_label[ADAC == 1] %in%
             c("SOLUTION DISCUSSION", "SOLUTION USAGE"),
           na.rm = TRUE)
    ),
    olmo_VR_prop_adac = if_else(
      n_tags_adac == 0L,
      NA_real_,
      mean(olmo_label[ADAC == 1] %in%
             c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
               "SOLUTION DISCUSSION", "SOLUTION USAGE",
               "INVESTIGATION AND EXPLORATION"),
           na.rm = TRUE)
    ),
    olmo_BI_prop_adac = if_else(
      n_tags_adac == 0L,
      NA_real_,
      mean(olmo_label[ADAC == 1] %in%
             c("BUG REPRODUCTION", "INVESTIGATION AND EXPLORATION"),
           na.rm = TRUE)
    ),
    .groups = "drop"
  ) |>
  # the counts were only needed to guard the divisions above
  select(-n_tags, -n_tags_adac)
# Task-level table: per-task medians plus task metadata and label proportions.

# Metadata is taken from each task's description row.
descriptions <- unified_df |>
  filter(comment_type == "task_description") |>
  select(
    TaskPHID, task_title, date_created, date_closed, isAuthorWMF,
    source, phase, week_index, author_closer, resolution_outcome
  )

# Medians over all rows of a task, then attach metadata and both sets of
# label proportions. Every join is keyed on TaskPHID and keeps all tasks.
task_level_variables <- unified_df |>
  group_by(TaskPHID) |>
  summarise(
    median_gerrit_loc_delta = median(
      gerrit_code_insertions + gerrit_code_deletions,
      na.rm = TRUE
    ),
    median_gerrit_reviewers = median(gerrit_reviewer_count, na.rm = TRUE),
    # the PC3 medians do not drop NAs
    median_PC3 = median(PC3),
    median_PC3_ADAC = median(PC3[ADAC == 1])
  ) |>
  left_join(descriptions, by = "TaskPHID") |>
  left_join(olmo_list_unified_df, by = "TaskPHID") |>
  left_join(human_list_unified_df, by = "TaskPHID")
# 2. assign sampling prob for different tasks
# Tasks present in the large human-label file were drawn in the first round
# of sampling; they have to be identified because they were removed before
# the second round.
large_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102025_human_labels.csv"
large_human_labels_df <- read.csv(large_human_labels_csv, header = TRUE)
first_sample_tasks <- unique(as.character(large_human_labels_df$TaskPHID))

# Probabilities come from the DSL specification sheet. Tasks whose `source`
# matches none of the cases get NA.
task_level_variables <- task_level_variables |>
  mutate(
    isFirstSample = TaskPHID %in% first_sample_tasks,
    # c1 tasks meeting all three conditions get the higher rate
    high_rate_stratum = phase == 3 & isAuthorWMF == TRUE & isFirstSample == FALSE,
    sampling_prob = case_when(
      source == "c2" ~ 0.086,
      source == "c3" ~ 0.055,
      source == "c1" & high_rate_stratum ~ 0.045,
      source == "c1" & !high_rate_stratum ~ 0.021
    )
  ) |>
  # helper columns are not part of the output
  select(-isFirstSample, -high_rate_stratum) |>
  mutate(
    # 1 when the task is recorded as resolved, 0 otherwise
    dsl_score = ifelse(resolution_outcome == "TRUE", 1, 0),
    # presumably hours, assuming both dates are epoch seconds -- TODO confirm
    TTR = (date_closed - date_created) / 3600
  )
# 3. check validity of different aggregate variables
# Mean sampling probability over all tasks. No na.rm here, so a single task
# with an NA sampling_prob (a `source` not covered by the case_when above)
# makes the result NA.
mean(task_level_variables$sampling_prob)
# Number of tasks per resolution outcome.
table(task_level_variables$resolution_outcome)
# look at bivariate plots
# Boxplots of week_index, one panel per source, split by resolution outcome.
ggplot(task_level_variables, aes(
x = as.factor(source),
y = week_index,
fill = resolution_outcome
)) +
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
scale_fill_viridis_d() +
theme_minimal() +
labs(
title = "Boxplot of week_index against Resolution Outcome",
x = "Case",
y = "Week Index",
fill = "Resolution Outcome"
)
# Scatter of median PC3 (ADAC comments only) against time to resolution,
# one panel per source, filled by isAuthorWMF.
ggplot(task_level_variables, aes(
x = median_PC3_ADAC,
y = TTR,
fill = isAuthorWMF
)) +
facet_grid(~source, scales="fixed") +
geom_point(shape = 21, alpha=0.3, size=2) +
# restrict the plotted range; 1440 on the y axis corresponds to the
# "60 days" in the axis label if TTR is in hours (1440 / 24 = 60)
xlim(-20, 20) +
ylim(0, 1440) +
scale_fill_viridis_d() +
theme_minimal() +
labs(
title = "Median PC3 Value in ADAC Comments",
x = "Median PC3 Value",
y = "Time to Resolution (up to 60 days)",
)
# 4. save
# Writes the task-level table to the current working directory.
write.csv(task_level_variables, "102725_DSL_df.csv", row.names = FALSE)