307 lines
12 KiB
R
307 lines
12 KiB
R
library(tidyverse)
|
|
|
|
# Location of the unified comment-level dataset, and the data frame loaded
# from it (one header row expected).
unified_csv <- "~/analysis_data/102725_unified.csv"
unified_df <- read.csv(file = unified_csv, header = TRUE)
|
|
|
|
# 1. aggregate to the task level
|
|
# 1a. create human info proportions (ADAC/general)
|
|
# 1b. create OLMO info proportions (ADAC/general)
|
|
# 1c.
|
|
# Closed set of label names accepted downstream; OLMO labels outside this set
# are recoded as "INVALID LABEL".
valid_categories <- c(
  "EXPECTED BEHAVIOR",
  "MOTIVATION",
  "OBSERVED BUG BEHAVIOR",
  "BUG REPRODUCTION",
  "INVESTIGATION AND EXPLORATION",
  "SOLUTION DISCUSSION",
  "CONTRIBUTION AND COMMITMENT",
  "TASK PROGRESS",
  "TESTING",
  "FUTURE PLAN",
  "POTENTIAL NEW ISSUES AND REQUESTS",
  "SOLUTION USAGE",
  "WORKAROUNDS",
  "ISSUE CONTENT MANAGEMENT",
  "ACTION ON ISSUE",
  "SOCIAL CONVERSATION"
)
|
|
|
|
# ---- Per-task label proportions (human and OLMO) ----
#
# Both label sources are reduced to the same 12 columns per TaskPHID:
#   <prefix>_{BE,SOL,VR,BI}_prop          share among all labels of the task
#   <prefix>_{BE,SOL,VR,BI}_prop_adac     share among labels on ADAC == 1 rows
#   <prefix>_{BE,SOL,VR,BI}_prop_no_adac  share among labels on ADAC == 0 rows
# A proportion is NA when the corresponding subset holds no labels.

# Label families whose share is reported. VR is the union of the BE and SOL
# families plus "INVESTIGATION AND EXPLORATION".
label_groups <- list(
  BE = c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"),
  SOL = c("SOLUTION DISCUSSION", "SOLUTION USAGE"),
  VR = c(
    "OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
    "SOLUTION DISCUSSION", "SOLUTION USAGE",
    "INVESTIGATION AND EXPLORATION"
  ),
  BI = c("BUG REPRODUCTION", "INVESTIGATION AND EXPLORATION")
)

# Turn one `human_labels` cell into a character vector of labels.
#
# Cells that look like a deparsed R vector (`c("A", "B")`) are split into
# their quoted elements; any other cell is treated as a single label.
# The text is parsed with a regex rather than eval(parse()) so that content
# read from the CSV is never executed as R code.
parse_human_labels <- function(raw) {
  if (is.na(raw)) {
    return(NA_character_)
  }
  if (!str_detect(raw, "^\\s*c\\(")) {
    return(raw)
  }
  quoted <- str_extract_all(raw, "\"[^\"]*\"|'[^']*'")[[1]]
  # Strip the surrounding quote characters from each match.
  str_sub(quoted, 2L, -2L)
}

# One-row tibble with the share of `labels` falling in each label family.
#
# labels: character vector of labels (NA entries are ignored).
# prefix: column-name prefix, e.g. "human" or "olmo".
# suffix: column-name suffix, e.g. "_adac"; empty for the overall share.
# Returns NA_real_ for every family when `labels` holds no non-NA entries.
label_group_shares <- function(labels, prefix, suffix = "") {
  labels <- labels[!is.na(labels)]
  shares <- if (length(labels) == 0L) {
    rep(NA_real_, length(label_groups))
  } else {
    vapply(
      label_groups,
      function(members) mean(labels %in% members),
      numeric(1)
    )
  }
  names(shares) <- paste0(prefix, "_", names(label_groups), "_prop", suffix)
  as_tibble_row(shares)
}

# Aggregate a one-row-per-label data frame to one row per TaskPHID.
#
# labelled_df: data frame with columns TaskPHID, ADAC and `label_col`.
# label_col:   name (string) of the column holding one label per row.
# prefix:      prefix for the generated proportion columns.
# Rows with a missing ADAC contribute to the overall shares only: which()
# keeps them out of both the ADAC == 1 and the ADAC == 0 subsets instead of
# turning the whole task's split proportions into NA.
summarise_label_props <- function(labelled_df, label_col, prefix) {
  labelled_df |>
    group_by(TaskPHID) |>
    summarise(
      label_group_shares(.data[[label_col]], prefix),
      label_group_shares(
        .data[[label_col]][which(ADAC == 1)], prefix, "_adac"
      ),
      label_group_shares(
        .data[[label_col]][which(ADAC == 0)], prefix, "_no_adac"
      ),
      .groups = "drop"
    )
}

# Human labels: one row per (comment, label), then per-task proportions.
human_list_unified_df <- unified_df |>
  filter(!is.na(human_labels)) |>
  mutate(
    list_human_labels = map(as.character(human_labels), parse_human_labels)
  ) |>
  unnest(list_human_labels, keep_empty = TRUE) |>
  # Drop placeholders that are not real tags: missing values, the literal
  # string "NA", and empty strings (the OLMO pipeline drops these as well).
  filter(
    !is.na(list_human_labels),
    !list_human_labels %in% c("", "NA")
  ) |>
  summarise_label_props("list_human_labels", "human")

# OLMO labels: extract quoted tokens, clean them, map known aliases onto the
# canonical names in `valid_categories`, then per-task proportions.
olmo_list_unified_df <- unified_df |>
  mutate(
    olmo_sentence_labels = tidyr::replace_na(olmo_sentence_labels, ""),
    # The lookaround pattern also captures the separators between quoted
    # tokens (e.g. ", "); the letter-only cleanup below reduces those to ""
    # and they are filtered out.
    list_olmo_labels = str_extract_all(
      olmo_sentence_labels,
      "(?<=')[^']+(?=')|(?<=\")[^\"]+(?=\")"
    )
  ) |>
  unnest(list_olmo_labels, keep_empty = TRUE) |>
  filter(
    !is.na(list_olmo_labels),
    list_olmo_labels != "",
    list_olmo_labels != "NA"
  ) |>
  mutate(
    list_olmo_labels = str_squish(
      str_replace_all(list_olmo_labels, "[^\\p{L} ]+", "")
    )
  ) |>
  filter(list_olmo_labels != "") |>
  mutate(
    # Aliases are mapped to the spelling used in `valid_categories`
    # ("WORKAROUNDS", plural) so they are not rejected as invalid below.
    olmo_label = case_when(
      list_olmo_labels == "WORKAROUND" ~ "WORKAROUNDS",
      list_olmo_labels == "BUG REPORT" ~ "BUG REPRODUCTION",
      TRUE ~ list_olmo_labels
    ),
    olmo_label = if_else(
      olmo_label %in% valid_categories,
      olmo_label,
      "INVALID LABEL"
    )
  ) |>
  summarise_label_props("olmo_label", "olmo")
|
|
|
|
# aggregate other Task-level variables and then join
|
|
# Per-task medians: Gerrit change size and reviewer count (NAs ignored), and
# PC1 / PC3 / PC4 over all rows, ADAC == 1 rows and ADAC == 0 rows.
task_level_variables <- unified_df |>
  mutate(
    gerrit_loc_delta = gerrit_code_insertions + gerrit_code_deletions,
    in_adac = ADAC == 1,
    not_in_adac = ADAC == 0
  ) |>
  group_by(TaskPHID) |>
  summarise(
    median_gerrit_loc_delta = median(gerrit_loc_delta, na.rm = TRUE),
    median_gerrit_reviewers = median(gerrit_reviewer_count, na.rm = TRUE),
    median_PC3 = median(PC3),
    median_PC3_adac = median(PC3[in_adac]),
    median_PC3_no_adac = median(PC3[not_in_adac]),
    median_PC1 = median(PC1),
    median_PC1_adac = median(PC1[in_adac]),
    median_PC1_no_adac = median(PC1[not_in_adac]),
    median_PC4 = median(PC4),
    median_PC4_adac = median(PC4[in_adac]),
    median_PC4_no_adac = median(PC4[not_in_adac])
  )
|
|
|
|
# Task metadata, taken from the rows whose comment_type is "task_description".
descriptions <- unified_df |>
  filter(comment_type %in% "task_description") |>
  select(
    all_of(c(
      "TaskPHID", "task_title", "date_created", "date_closed", "isAuthorWMF",
      "source", "phase", "week_index", "author_closer", "resolution_outcome"
    ))
  )
|
|
|
|
# Attach task metadata, then the OLMO and the human label proportions, to the
# per-task medians. All joins are left joins keyed on TaskPHID.
task_level_variables <- task_level_variables |>
  left_join(descriptions, by = "TaskPHID") |>
  left_join(olmo_list_unified_df, by = "TaskPHID") |>
  left_join(human_list_unified_df, by = "TaskPHID")
|
|
# 2. assign sampling prob for different tasks
|
|
# Identify the tasks that were selected in the first round of sampling and then removed for the second round of sampling.
|
|
# Task IDs (as strings, de-duplicated) that appear in the larger human-label
# file.
large_human_labels_csv <-
  "~/analysis_data/102125_constituent_dfs/102025_human_labels.csv"
large_human_labels_df <- read.csv(file = large_human_labels_csv, header = TRUE)
first_sample_tasks <- large_human_labels_df$TaskPHID |>
  as.character() |>
  unique()
|
|
# refer to DSL specification sheet
|
|
# Add the per-task sampling probability, a 0/1 outcome score and the time to
# resolution.
task_level_variables <- task_level_variables |>
  mutate(
    isFirstSample = TaskPHID %in% first_sample_tasks,
    # Stratum flag: phase 3, WMF-authored, and not part of the first sample.
    in_second_stratum = phase == 3 & isAuthorWMF == TRUE & !isFirstSample,
    sampling_prob = case_when(
      source == "c2" ~ 0.086,
      source == "c3" ~ 0.055,
      source == "c1" & in_second_stratum ~ 0.045,
      source == "c1" & !in_second_stratum ~ 0.021
    ),
    dsl_score = ifelse(resolution_outcome == "TRUE", 1, 0),
    # Difference of the two date columns divided by 3600
    # (NOTE(review): hours if the dates are epoch seconds; confirm units).
    TTR = (date_closed - date_created) / 3600
  ) |>
  # Drop the helper columns so only the three new variables remain.
  select(-isFirstSample, -in_second_stratum)
|
|
# 3. check validity of different aggregate variables
|
|
# Quick checks: average sampling probability and counts per resolution outcome.
mean(task_level_variables[["sampling_prob"]])
table(task_level_variables[["resolution_outcome"]])
|
|
# look at bivariate plots
|
|
# Boxplots of week_index per source, split by resolution outcome, with one
# facet column per source.
ggplot(
  data = task_level_variables,
  mapping = aes(
    x = as.factor(source),
    y = week_index,
    fill = resolution_outcome
  )
) +
  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
  facet_grid(cols = vars(source), scales = "fixed") +
  labs(
    title = "Boxplot of week_index against Resolution Outcome",
    x = "Case",
    y = "Week Index",
    fill = "Resolution Outcome"
  ) +
  scale_fill_viridis_d() +
  theme_minimal()
|
|
|
|
# Scatter plot of the per-task median PC3 over ADAC == 1 rows against TTR,
# filled by isAuthorWMF, with one facet per source.
# The x aesthetic uses `median_PC3_adac`, which is the column name created by
# the per-task summarise; the previous `median_PC3_ADAC` spelling does not
# exist in `task_level_variables` and made the plot fail.
ggplot(task_level_variables, aes(
  x = median_PC3_adac,
  y = TTR,
  fill = isAuthorWMF
)) +
  facet_grid(~source, scales = "fixed") +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  # Points outside these limits are dropped from the plot.
  # NOTE(review): 1440 matches 60 days only if TTR is in hours; confirm.
  xlim(-20, 20) +
  ylim(0, 1440) +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "Median PC3 Value in ADAC Comments",
    x = "Median PC3 Value",
    y = "Time to Resolution (up to 60 days)"
  )
|
|
# 4. save
|
|
# Write the task-level table to the working directory, without row names.
write.csv(
  x = task_level_variables,
  file = "102725_DSL_df_adac.csv",
  row.names = FALSE
)
|