updated DSL data aggregation
This commit is contained in:
parent
e955b4f50f
commit
ab1cb3efea
377739
analysis_data/102725_unified.csv
Normal file
377739
analysis_data/102725_unified.csv
Normal file
File diff suppressed because one or more lines are too long
@ -4,7 +4,7 @@ library(tidyr)
|
||||
library(dplyr)
|
||||
library(purrr)
|
||||
|
||||
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
|
||||
main_csv <- "~/analysis_data/stale_unifieds/100625_unified_w_affil.csv"
|
||||
main_df <- read.csv(main_csv, header = TRUE)
|
||||
|
||||
#filter out existing olmo stuff
|
||||
@ -36,9 +36,11 @@ main_df <- main_df |>
|
||||
mutate(created = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")) |>
|
||||
left_join(desc_info, by = "TaskPHID") |>
|
||||
mutate(
|
||||
ADAC = as.integer(!is.na(task_desc_author) &
|
||||
AuthorPHID == task_desc_author &
|
||||
created < task_desc_dateClosed)
|
||||
ADAC = as.integer(
|
||||
!is.na(task_desc_author) &
|
||||
AuthorPHID == task_desc_author &
|
||||
(is.na(task_desc_dateClosed) | created < task_desc_dateClosed)
|
||||
)
|
||||
)
|
||||
# add dictionary values
|
||||
modal_verb_list <- c("will", "may", "can", "shall", "must",
|
||||
@ -130,6 +132,6 @@ pulling <- unified_df |>
|
||||
pulling <- unified_df |>
|
||||
filter(id == "23366" | id == "20846" | id == "20847")
|
||||
|
||||
write.csv(unified_df, "102425_unified.csv", row.names = FALSE)
|
||||
write.csv(unified_df, "102725_unified.csv", row.names = FALSE)
|
||||
|
||||
|
||||
|
||||
3130
dsl/102725_DSL_df.csv
Normal file
3130
dsl/102725_DSL_df.csv
Normal file
File diff suppressed because it is too large
Load Diff
@ -3,8 +3,8 @@ library(tidyverse)
|
||||
# load in the human labels and for each task filer, @ the task level
|
||||
# GET the proportion of Observed bug behavior + Expected Behavior
|
||||
# GET the proportion of Solution Discussion + Solution Usage
|
||||
human_csv <-"~/dsl/092225_info_matt_labels.csv"
|
||||
human_df <- read.csv(human_csv, header = TRUE)
|
||||
unified_csv <-"~/analysis_data/102425_unified.csv"
|
||||
unified_df <- read.csv(unified_csv, header = TRUE)
|
||||
|
||||
#task_authors <- human_df %>%
|
||||
# filter(comment_type == "task_description") %>%
|
||||
@ -19,8 +19,8 @@ human_df <- read.csv(human_csv, header = TRUE)
|
||||
# mean(rows_by_author$label %in% c("Observed bug behavior", "Expected behavior"))
|
||||
# }
|
||||
# ) %>%
|
||||
# ungroup()
|
||||
human_result <- human_df %>%
|
||||
# ungrou
|
||||
unified_df<- unified_df %>%
|
||||
group_by(TaskPHID) %>%
|
||||
summarise(
|
||||
human_BE_prop = mean(human_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")),
|
||||
|
||||
@ -1,16 +1,250 @@
|
||||
library(tidyverse)
|
||||
|
||||
unified_csv <-"~/analysis_data/102125_unified.csv"
|
||||
unified_df <- read.csv(human_csv, header = TRUE)
|
||||
# ---- Load the unified, comment-level data ----
unified_csv <- "~/analysis_data/102725_unified.csv"
unified_df <- read.csv(file = unified_csv, header = TRUE)

# Outline of this script
#   1. aggregate to the task level
#      1a. human info proportions (ADAC / general)
#      1b. OLMO info proportions (ADAC / general)
#      1c. other task-level variables, joined with 1a and 1b

# Label names accepted as valid categories; the OLMO aggregation below marks
# any label outside this set as "INVALID LABEL".
valid_categories <- c(
  "EXPECTED BEHAVIOR",
  "MOTIVATION",
  "OBSERVED BUG BEHAVIOR",
  "BUG REPRODUCTION",
  "INVESTIGATION AND EXPLORATION",
  "SOLUTION DISCUSSION",
  "CONTRIBUTION AND COMMITMENT",
  "TASK PROGRESS",
  "TESTING",
  "FUTURE PLAN",
  "POTENTIAL NEW ISSUES AND REQUESTS",
  "SOLUTION USAGE",
  "WORKAROUNDS",
  "ISSUE CONTENT MANAGEMENT",
  "ACTION ON ISSUE",
  "SOCIAL CONVERSATION"
)
|
||||
|
||||
# Task-level shares of human-assigned information-type labels.
#
# Reads `unified_df` (one row per comment) and uses its `human_labels`,
# `TaskPHID` and `ADAC` columns. A `human_labels` cell is either a single
# label or a deparsed R character vector such as c("A", "B").
#
# Result: one row per TaskPHID with
#   human_BE_prop / human_SOL_prop / human_VR_prop / human_BI_prop
#     share of that task's human tags falling in each label group, and
#   human_*_prop_adac
#     the same shares restricted to rows where ADAC == 1.
# A share is NA when there are no tags to average over, or when ADAC is
# missing for any tag of the task (comparison with NA cannot be resolved).
#
# Wrapped in local() so the helpers do not leak into the global environment.
human_list_unified_df <- local({
  be_labels <- c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")
  sol_labels <- c("SOLUTION DISCUSSION", "SOLUTION USAGE")
  vr_labels <- c(be_labels, sol_labels, "INVESTIGATION AND EXPLORATION")
  bi_labels <- c("BUG REPRODUCTION", "INVESTIGATION AND EXPLORATION")

  # Split one `human_labels` cell into individual labels.
  # Cells that look like `c(...)` are parsed by pulling out the quoted tokens
  # rather than by eval(parse(text = ...)), which would execute whatever text
  # happens to be stored in the CSV. Any other cell is taken as one label.
  split_label_cell <- function(cell) {
    if (is.na(cell)) {
      return(NA_character_)
    }
    if (!str_detect(cell, "^\\s*c\\(")) {
      return(cell)
    }
    quoted <- str_extract_all(cell, "\"[^\"]*\"|'[^']*'")[[1]]
    str_sub(quoted, 2L, -2L) # strip the surrounding quote characters
  }

  # Share of `labels` (optionally only those flagged by `keep`) that fall in
  # `targets`. Returns NA instead of NaN when nothing is selected.
  label_share <- function(labels, targets, keep = rep(TRUE, length(labels))) {
    if (anyNA(keep) || !any(keep)) {
      return(NA_real_)
    }
    mean(labels[keep] %in% targets)
  }

  unified_df |>
    filter(!is.na(human_labels)) |>
    mutate(list_human_labels = map(human_labels, split_label_cell)) |>
    # keep_empty = TRUE turns an empty label vector into an NA row, which the
    # next filter removes together with literal "NA" and blank labels.
    unnest(list_human_labels, keep_empty = TRUE) |>
    filter(
      !is.na(list_human_labels),
      list_human_labels != "NA",
      list_human_labels != ""
    ) |>
    group_by(TaskPHID) |>
    summarise(
      # Overall proportions (all labelled comments)
      human_BE_prop = label_share(list_human_labels, be_labels),
      human_SOL_prop = label_share(list_human_labels, sol_labels),
      human_VR_prop = label_share(list_human_labels, vr_labels),
      human_BI_prop = label_share(list_human_labels, bi_labels),
      # ADAC == 1 proportions
      human_BE_prop_adac = label_share(list_human_labels, be_labels, ADAC == 1),
      human_SOL_prop_adac = label_share(list_human_labels, sol_labels, ADAC == 1),
      human_VR_prop_adac = label_share(list_human_labels, vr_labels, ADAC == 1),
      human_BI_prop_adac = label_share(list_human_labels, bi_labels, ADAC == 1),
      .groups = "drop"
    )
})
|
||||
|
||||
|
||||
# Task-level shares of OLMO-assigned sentence labels.
#
# Reads `unified_df` (one row per comment) and uses its
# `olmo_sentence_labels`, `TaskPHID` and `ADAC` columns, plus the
# `valid_categories` vector defined earlier in this script.
#
# Result: one row per TaskPHID with
#   olmo_BE_prop / olmo_SOL_prop / olmo_VR_prop / olmo_BI_prop
#     share of that task's OLMO tags falling in each label group, and
#   olmo_*_prop_adac
#     the same shares restricted to rows where ADAC == 1.
# Labels outside `valid_categories` are kept as "INVALID LABEL", so they
# still count in the denominator. A share is NA when there are no tags to
# average over, or when ADAC is missing for any tag of the task.
#
# Wrapped in local() so the helpers do not leak into the global environment.
olmo_list_unified_df <- local({
  be_labels <- c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")
  sol_labels <- c("SOLUTION DISCUSSION", "SOLUTION USAGE")
  vr_labels <- c(be_labels, sol_labels, "INVESTIGATION AND EXPLORATION")
  bi_labels <- c("BUG REPRODUCTION", "INVESTIGATION AND EXPLORATION")

  # Share of `labels` (optionally only those flagged by `keep`) that fall in
  # `targets`. Returns NA instead of NaN when nothing is selected.
  label_share <- function(labels, targets, keep = rep(TRUE, length(labels))) {
    if (anyNA(keep) || !any(keep)) {
      return(NA_real_)
    }
    mean(labels[keep] %in% targets)
  }

  unified_df |>
    mutate(
      olmo_sentence_labels = tidyr::replace_na(olmo_sentence_labels, ""),
      # Pull out everything that sits between a pair of quote characters.
      # The pattern can also capture the separator between two quoted labels;
      # the punctuation strip and blank filter below discard those pieces.
      list_olmo_labels = str_extract_all(
        olmo_sentence_labels,
        "(?<=')[^']+(?=')|(?<=\")[^\"]+(?=\")"
      )
    ) |>
    unnest(list_olmo_labels, keep_empty = TRUE) |>
    filter(
      !is.na(list_olmo_labels),
      list_olmo_labels != "",
      list_olmo_labels != "NA"
    ) |>
    # Keep letters and spaces only, then collapse repeated whitespace.
    mutate(
      list_olmo_labels = str_squish(
        str_replace_all(list_olmo_labels, "[^\\p{L} ]+", "")
      )
    ) |>
    filter(list_olmo_labels != "") |>
    mutate(
      # Normalise spelling variants to the names used in `valid_categories`.
      # The canonical spelling there is "WORKAROUNDS"; mapping to "WORKAROUND"
      # would make every workaround label fail the validity check below.
      olmo_label = case_when(
        list_olmo_labels == "WORKAROUND" ~ "WORKAROUNDS",
        list_olmo_labels == "BUG REPORT" ~ "BUG REPRODUCTION",
        TRUE ~ list_olmo_labels
      ),
      olmo_label = if_else(
        olmo_label %in% valid_categories,
        olmo_label,
        "INVALID LABEL"
      )
    ) |>
    group_by(TaskPHID) |>
    summarise(
      # Overall proportions (all comments)
      olmo_BE_prop = label_share(olmo_label, be_labels),
      olmo_SOL_prop = label_share(olmo_label, sol_labels),
      olmo_VR_prop = label_share(olmo_label, vr_labels),
      olmo_BI_prop = label_share(olmo_label, bi_labels),
      # ADAC == 1 proportions
      olmo_BE_prop_adac = label_share(olmo_label, be_labels, ADAC == 1),
      olmo_SOL_prop_adac = label_share(olmo_label, sol_labels, ADAC == 1),
      olmo_VR_prop_adac = label_share(olmo_label, vr_labels, ADAC == 1),
      olmo_BI_prop_adac = label_share(olmo_label, bi_labels, ADAC == 1),
      .groups = "drop"
    )
})
|
||||
|
||||
# ---- 1c. Other task-level variables, then join everything by TaskPHID ----

# Per-task medians over the comment rows. Only the two gerrit medians drop
# missing values; the PC3 medians are NA if any contributing PC3 is NA.
task_level_variables <- unified_df |>
  mutate(gerrit_loc_delta = gerrit_code_insertions + gerrit_code_deletions) |>
  group_by(TaskPHID) |>
  summarise(
    median_gerrit_loc_delta = median(gerrit_loc_delta, na.rm = TRUE),
    median_gerrit_reviewers = median(gerrit_reviewer_count, na.rm = TRUE),
    median_PC3 = median(PC3),
    median_PC3_ADAC = median(PC3[ADAC == 1])
  )

# Task metadata is taken from the rows that hold the task description.
descriptions <- unified_df |>
  filter(comment_type == "task_description") |>
  select(
    TaskPHID,
    task_title,
    date_created,
    date_closed,
    isAuthorWMF,
    source,
    phase,
    week_index,
    author_closer,
    resolution_outcome
  )

# Attach the metadata, then the OLMO and human label proportions.
task_level_variables <- task_level_variables |>
  left_join(descriptions, by = "TaskPHID") |>
  left_join(olmo_list_unified_df, by = "TaskPHID") |>
  left_join(human_list_unified_df, by = "TaskPHID")
|
||||
# ---- 2. Assign a sampling probability to each task ----
# Tasks selected in the first round of sampling that were removed for the
# second round have to be identified; the first-round task ids are read from
# the human-labels file below. Probabilities follow the DSL specification
# sheet.
large_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102025_human_labels.csv"
large_human_labels_df <- read.csv(file = large_human_labels_csv, header = TRUE)
first_sample_tasks <- unique(as.character(large_human_labels_df$TaskPHID))

task_level_variables <- task_level_variables |>
  mutate(
    # Helper flag, dropped again at the end of this pipeline.
    isFirstSample = TaskPHID %in% first_sample_tasks,
    sampling_prob = case_when(
      source == "c2" ~ 0.086,
      source == "c3" ~ 0.055,
      # c1: phase-3 WMF-authored tasks that were not in the first sample ...
      source == "c1" &
        (phase == 3 & isAuthorWMF == TRUE & !isFirstSample) ~ 0.045,
      # ... and every other c1 task (De Morgan form of the negated condition).
      source == "c1" &
        (phase != 3 | isAuthorWMF != TRUE | isFirstSample) ~ 0.021
    ),
    # 1 when the task's resolution_outcome is "TRUE", otherwise 0.
    dsl_score = ifelse(resolution_outcome == "TRUE", 1, 0),
    # Creation-to-close difference divided by 3600
    # (hours, assuming both timestamps are in seconds -- TODO confirm).
    TTR = (date_closed - date_created) / 3600
  ) |>
  select(-isFirstSample)
|
||||
# ---- 3. Check validity of the aggregate variables ----
mean(task_level_variables[["sampling_prob"]])
table(task_level_variables[["resolution_outcome"]])

# Bivariate look: week_index by resolution outcome, one panel per source.
ggplot(
  data = task_level_variables,
  mapping = aes(
    x = as.factor(source),
    y = week_index,
    fill = resolution_outcome
  )
) +
  geom_boxplot(position = position_dodge(width = 0.9), alpha = 0.7) +
  facet_grid(cols = vars(source), scales = "fixed") +
  scale_fill_viridis_d() +
  labs(
    title = "Boxplot of week_index against Resolution Outcome",
    x = "Case",
    y = "Week Index",
    fill = "Resolution Outcome"
  ) +
  theme_minimal()
|
||||
|
||||
# Scatter of the median PC3 value in ADAC comments against time to
# resolution, one panel per source, coloured by isAuthorWMF.
# The axis limits restrict the view to PC3 in [-20, 20] and TTR in [0, 1440]
# (1440 hours = 60 days, if TTR is in hours -- TODO confirm).
ggplot(
  data = task_level_variables,
  mapping = aes(x = median_PC3_ADAC, y = TTR, fill = isAuthorWMF)
) +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  facet_grid(cols = vars(source), scales = "fixed") +
  lims(x = c(-20, 20), y = c(0, 1440)) +
  scale_fill_viridis_d() +
  labs(
    title = "Median PC3 Value in ADAC Comments",
    x = "Median PC3 Value",
    y = "Time to Resolution (up to 60 days)"
  ) +
  theme_minimal()
|
||||
# ---- 4. Save the task-level data frame ----
write.csv(
  x = task_level_variables,
  file = "102725_DSL_df.csv",
  row.names = FALSE
)
|
||||
Loading…
Reference in New Issue
Block a user