updated DSL data aggregation
This commit is contained in:
parent
e955b4f50f
commit
ab1cb3efea
377739
analysis_data/102725_unified.csv
Normal file
377739
analysis_data/102725_unified.csv
Normal file
File diff suppressed because one or more lines are too long
@ -4,7 +4,7 @@ library(tidyr)
|
|||||||
library(dplyr)
|
library(dplyr)
|
||||||
library(purrr)
|
library(purrr)
|
||||||
|
|
||||||
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
|
main_csv <- "~/analysis_data/stale_unifieds/100625_unified_w_affil.csv"
|
||||||
main_df <- read.csv(main_csv, header = TRUE)
|
main_df <- read.csv(main_csv, header = TRUE)
|
||||||
|
|
||||||
#filter out existing olmo stuff
|
#filter out existing olmo stuff
|
||||||
@ -36,9 +36,11 @@ main_df <- main_df |>
|
|||||||
mutate(created = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")) |>
|
mutate(created = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")) |>
|
||||||
left_join(desc_info, by = "TaskPHID") |>
|
left_join(desc_info, by = "TaskPHID") |>
|
||||||
mutate(
|
mutate(
|
||||||
ADAC = as.integer(!is.na(task_desc_author) &
|
ADAC = as.integer(
|
||||||
AuthorPHID == task_desc_author &
|
!is.na(task_desc_author) &
|
||||||
created < task_desc_dateClosed)
|
AuthorPHID == task_desc_author &
|
||||||
|
(is.na(task_desc_dateClosed) | created < task_desc_dateClosed)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
# add dictionary values
|
# add dictionary values
|
||||||
modal_verb_list <- c("will", "may", "can", "shall", "must",
|
modal_verb_list <- c("will", "may", "can", "shall", "must",
|
||||||
@ -130,6 +132,6 @@ pulling <- unified_df |>
|
|||||||
pulling <- unified_df |>
|
pulling <- unified_df |>
|
||||||
filter(id == "23366" | id == "20846" | id == "20847")
|
filter(id == "23366" | id == "20846" | id == "20847")
|
||||||
|
|
||||||
write.csv(unified_df, "102425_unified.csv", row.names = FALSE)
|
write.csv(unified_df, "102725_unified.csv", row.names = FALSE)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
3130
dsl/102725_DSL_df.csv
Normal file
3130
dsl/102725_DSL_df.csv
Normal file
File diff suppressed because it is too large
Load Diff
@ -3,8 +3,8 @@ library(tidyverse)
|
|||||||
# load in the human labels and for each task filer, @ the task level
|
# load in the human labels and for each task filer, @ the task level
|
||||||
# GET the proportion of Observed bug behavior + Expected Behavior
|
# GET the proportion of Observed bug behavior + Expected Behavior
|
||||||
# GET the proportion of Solution Discussion + Solution Usage
|
# GET the proportion of Solution Discussion + Solution Usage
|
||||||
human_csv <-"~/dsl/092225_info_matt_labels.csv"
|
unified_csv <-"~/analysis_data/102425_unified.csv"
|
||||||
human_df <- read.csv(human_csv, header = TRUE)
|
unified_df <- read.csv(unified_csv, header = TRUE)
|
||||||
|
|
||||||
#task_authors <- human_df %>%
|
#task_authors <- human_df %>%
|
||||||
# filter(comment_type == "task_description") %>%
|
# filter(comment_type == "task_description") %>%
|
||||||
@ -19,8 +19,8 @@ human_df <- read.csv(human_csv, header = TRUE)
|
|||||||
# mean(rows_by_author$label %in% c("Observed bug behavior", "Expected behavior"))
|
# mean(rows_by_author$label %in% c("Observed bug behavior", "Expected behavior"))
|
||||||
# }
|
# }
|
||||||
# ) %>%
|
# ) %>%
|
||||||
# ungroup()
|
# ungrou
|
||||||
human_result <- human_df %>%
|
unified_df<- unified_df %>%
|
||||||
group_by(TaskPHID) %>%
|
group_by(TaskPHID) %>%
|
||||||
summarise(
|
summarise(
|
||||||
human_BE_prop = mean(human_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")),
|
human_BE_prop = mean(human_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")),
|
||||||
|
|||||||
@ -1,16 +1,250 @@
|
|||||||
library(tidyverse)
|
library(tidyverse)
|
||||||
|
|
||||||
unified_csv <-"~/analysis_data/102125_unified.csv"
|
unified_csv <-"~/analysis_data/102725_unified.csv"
|
||||||
unified_df <- read.csv(human_csv, header = TRUE)
|
unified_df <- read.csv(unified_csv, header = TRUE)
|
||||||
|
|
||||||
# 1. aggregate to the task level
|
# 1. aggregate to the task level
|
||||||
# 1a. create human info proportions (ADAC/general)
|
# 1a. create human info proportions (ADAC/general)
|
||||||
# 1b. create OLMO info proportions (ADAC/general)
|
# 1b. create OLMO info proportions (ADAC/general)
|
||||||
# 1c.
|
# 1c.
|
||||||
|
valid_categories <- c('EXPECTED BEHAVIOR', 'MOTIVATION','OBSERVED BUG BEHAVIOR',
|
||||||
|
'BUG REPRODUCTION', 'INVESTIGATION AND EXPLORATION', 'SOLUTION DISCUSSION',
|
||||||
|
'CONTRIBUTION AND COMMITMENT', 'TASK PROGRESS', 'TESTING', 'FUTURE PLAN',
|
||||||
|
'POTENTIAL NEW ISSUES AND REQUESTS', 'SOLUTION USAGE',
|
||||||
|
'WORKAROUNDS', 'ISSUE CONTENT MANAGEMENT', 'ACTION ON ISSUE',
|
||||||
|
'SOCIAL CONVERSATION')
|
||||||
|
|
||||||
|
human_list_unified_df <- unified_df %>%
|
||||||
|
filter(!is.na(human_labels)) |>
|
||||||
|
mutate(human_labels = tidyr::replace_na(human_labels, "")) |>
|
||||||
|
mutate(list_human_labels = map(human_labels, ~ {
|
||||||
|
if (is.na(.x)) {
|
||||||
|
NA_character_
|
||||||
|
} else if (str_detect(.x, '^\\s*c\\(')) {
|
||||||
|
eval(parse(text = .x))
|
||||||
|
} else {
|
||||||
|
.x
|
||||||
|
}
|
||||||
|
})) %>%
|
||||||
|
unnest(list_human_labels, keep_empty = TRUE) |>
|
||||||
|
filter(list_human_labels != "NA") |>
|
||||||
|
group_by(TaskPHID) |>
|
||||||
|
summarise(
|
||||||
|
# Overall proportions (all comments)
|
||||||
|
n_tags = sum(!is.na(list_human_labels)),
|
||||||
|
human_BE_prop = if_else(
|
||||||
|
n_tags == 0L,
|
||||||
|
NA_real_,
|
||||||
|
mean(list_human_labels %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||||
|
),
|
||||||
|
human_SOL_prop = if_else(
|
||||||
|
n_tags == 0L,
|
||||||
|
NA_real_,
|
||||||
|
mean(list_human_labels %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
|
||||||
|
),
|
||||||
|
human_VR_prop = if_else(
|
||||||
|
n_tags == 0L,
|
||||||
|
NA_real_,
|
||||||
|
mean(list_human_labels %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
|
||||||
|
"SOLUTION DISCUSSION", "SOLUTION USAGE",
|
||||||
|
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||||
|
),
|
||||||
|
human_BI_prop = if_else(
|
||||||
|
n_tags == 0L,
|
||||||
|
NA_real_,
|
||||||
|
mean(list_human_labels %in% c("BUG REPRODUCTION",
|
||||||
|
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||||
|
),
|
||||||
|
|
||||||
|
# ADAC==1 proportions
|
||||||
|
n_tags_adac = sum(!is.na(list_human_labels) & ADAC == 1),
|
||||||
|
human_BE_prop_adac = if_else(
|
||||||
|
n_tags_adac == 0L,
|
||||||
|
NA_real_,
|
||||||
|
mean(list_human_labels[ADAC == 1] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||||
|
),
|
||||||
|
human_SOL_prop_adac = if_else(
|
||||||
|
n_tags_adac == 0L,
|
||||||
|
NA_real_,
|
||||||
|
mean(list_human_labels[ADAC == 1] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
|
||||||
|
),
|
||||||
|
human_VR_prop_adac = if_else(
|
||||||
|
n_tags_adac == 0L,
|
||||||
|
NA_real_,
|
||||||
|
mean(list_human_labels[ADAC == 1] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
|
||||||
|
"SOLUTION DISCUSSION", "SOLUTION USAGE",
|
||||||
|
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||||
|
),
|
||||||
|
human_BI_prop_adac = if_else(
|
||||||
|
n_tags_adac == 0L,
|
||||||
|
NA_real_,
|
||||||
|
mean(list_human_labels[ADAC == 1] %in% c("BUG REPRODUCTION",
|
||||||
|
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||||
|
),
|
||||||
|
.groups = "drop"
|
||||||
|
) |>
|
||||||
|
select(-n_tags, -n_tags_adac)
|
||||||
|
|
||||||
|
|
||||||
|
olmo_list_unified_df <- unified_df %>%
|
||||||
|
mutate(olmo_sentence_labels = tidyr::replace_na(olmo_sentence_labels, ""))|>
|
||||||
|
mutate(list_olmo_labels = str_extract_all(
|
||||||
|
olmo_sentence_labels,
|
||||||
|
"(?<=')[^']+(?=')|(?<=\")[^\"]+(?=\")"
|
||||||
|
)) %>%
|
||||||
|
unnest(list_olmo_labels, keep_empty = TRUE) |>
|
||||||
|
filter(!is.na(list_olmo_labels), list_olmo_labels != "", list_olmo_labels != "NA") %>%
|
||||||
|
mutate(list_olmo_labels = str_squish(str_replace_all(list_olmo_labels, "[^\\p{L} ]+", ""))) %>%
|
||||||
|
filter(list_olmo_labels != "") %>%
|
||||||
|
mutate(olmo_label = ifelse(list_olmo_labels %in% c("WORKAROUNDS", "WORKAROUND"),
|
||||||
|
"WORKAROUND",
|
||||||
|
list_olmo_labels))|>
|
||||||
|
mutate(olmo_label = ifelse(olmo_label %in% c("BUG REPORT", "BUG REPRODUCTION"),
|
||||||
|
"BUG REPRODUCTION",
|
||||||
|
olmo_label))|>
|
||||||
|
mutate(olmo_label = ifelse(!(olmo_label %in% valid_categories),
|
||||||
|
"INVALID LABEL",
|
||||||
|
olmo_label))|>
|
||||||
|
group_by(TaskPHID)|>
|
||||||
|
summarise(
|
||||||
|
# Overall proportions (all comments)
|
||||||
|
n_tags = sum(!is.na(olmo_label)),
|
||||||
|
olmo_BE_prop = if_else(
|
||||||
|
n_tags == 0L,
|
||||||
|
NA_real_,
|
||||||
|
mean(olmo_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||||
|
),
|
||||||
|
olmo_SOL_prop = if_else(
|
||||||
|
n_tags == 0L,
|
||||||
|
NA_real_,
|
||||||
|
mean(olmo_label %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
|
||||||
|
),
|
||||||
|
olmo_VR_prop = if_else(
|
||||||
|
n_tags == 0L,
|
||||||
|
NA_real_,
|
||||||
|
mean(olmo_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
|
||||||
|
"SOLUTION DISCUSSION", "SOLUTION USAGE",
|
||||||
|
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||||
|
),
|
||||||
|
olmo_BI_prop = if_else(
|
||||||
|
n_tags == 0L,
|
||||||
|
NA_real_,
|
||||||
|
mean(olmo_label %in% c("BUG REPRODUCTION",
|
||||||
|
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||||
|
),
|
||||||
|
n_tags_adac = sum(!is.na(olmo_label) & ADAC == 1),
|
||||||
|
olmo_BE_prop_adac = if_else(
|
||||||
|
n_tags_adac == 0L,
|
||||||
|
NA_real_,
|
||||||
|
mean(olmo_label[ADAC == 1] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||||
|
),
|
||||||
|
olmo_SOL_prop_adac = if_else(
|
||||||
|
n_tags_adac == 0L,
|
||||||
|
NA_real_,
|
||||||
|
mean(olmo_label[ADAC == 1] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
|
||||||
|
),
|
||||||
|
olmo_VR_prop_adac = if_else(
|
||||||
|
n_tags_adac == 0L,
|
||||||
|
NA_real_,
|
||||||
|
mean(olmo_label[ADAC == 1] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
|
||||||
|
"SOLUTION DISCUSSION", "SOLUTION USAGE",
|
||||||
|
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||||
|
),
|
||||||
|
olmo_BI_prop_adac = if_else(
|
||||||
|
n_tags_adac == 0L,
|
||||||
|
NA_real_,
|
||||||
|
mean(olmo_label[ADAC == 1] %in% c("BUG REPRODUCTION",
|
||||||
|
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||||
|
),
|
||||||
|
.groups = "drop"
|
||||||
|
) |>
|
||||||
|
select(-n_tags, -n_tags_adac)
|
||||||
|
|
||||||
|
# aggregate other Task-level variables and then join
|
||||||
|
task_level_variables <- unified_df |>
|
||||||
|
group_by(TaskPHID) |>
|
||||||
|
summarise(median_gerrit_loc_delta = median(gerrit_code_insertions + gerrit_code_deletions, na.rm = TRUE),
|
||||||
|
median_gerrit_reviewers = median(gerrit_reviewer_count, na.rm = TRUE),
|
||||||
|
median_PC3 = median(PC3),
|
||||||
|
median_PC3_ADAC = median(PC3[ADAC==1])
|
||||||
|
)
|
||||||
|
|
||||||
|
descriptions <- unified_df |>
|
||||||
|
filter(comment_type == "task_description")|>
|
||||||
|
select(TaskPHID, task_title, date_created, date_closed, isAuthorWMF,
|
||||||
|
source, phase, week_index, author_closer, resolution_outcome )
|
||||||
|
|
||||||
|
task_level_variables <- task_level_variables |>
|
||||||
|
left_join(
|
||||||
|
descriptions,
|
||||||
|
by="TaskPHID"
|
||||||
|
)
|
||||||
|
|
||||||
|
task_level_variables <- task_level_variables |>
|
||||||
|
left_join(
|
||||||
|
olmo_list_unified_df,
|
||||||
|
by="TaskPHID"
|
||||||
|
)
|
||||||
|
|
||||||
|
task_level_variables <- task_level_variables |>
|
||||||
|
left_join(
|
||||||
|
human_list_unified_df,
|
||||||
|
by="TaskPHID"
|
||||||
|
)
|
||||||
# 2. assign sampling prob for different tasks
|
# 2. assign sampling prob for different tasks
|
||||||
|
# need to ID those selected in the first round of sampling that were removed for the second round of sampling
|
||||||
|
large_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102025_human_labels.csv"
|
||||||
|
large_human_labels_df <- read.csv(large_human_labels_csv, header = TRUE)
|
||||||
|
first_sample_tasks <- unique(as.character(large_human_labels_df$TaskPHID))
|
||||||
# refer to DSL specification sheet
|
# refer to DSL specification sheet
|
||||||
|
task_level_variables <- task_level_variables |>
|
||||||
|
mutate(
|
||||||
|
isFirstSample = TaskPHID %in% first_sample_tasks,
|
||||||
|
sampling_prob = case_when(
|
||||||
|
source == "c2" ~ 0.086,
|
||||||
|
source == "c3" ~ 0.055,
|
||||||
|
source == "c1" & (phase == 3 & isAuthorWMF == TRUE & isFirstSample == FALSE) ~ 0.045,
|
||||||
|
source == "c1" & !(phase == 3 & isAuthorWMF == TRUE & isFirstSample == FALSE) ~ 0.021,
|
||||||
|
)
|
||||||
|
) |>
|
||||||
|
select(-isFirstSample) |>
|
||||||
|
mutate(dsl_score = ifelse(resolution_outcome == "TRUE", 1, 0)) |>
|
||||||
|
mutate(TTR = (date_closed - date_created)/3600)
|
||||||
# 3. check validity of different aggregate variables
|
# 3. check validity of different aggregate variables
|
||||||
|
mean(task_level_variables$sampling_prob)
|
||||||
|
table(task_level_variables$resolution_outcome)
|
||||||
|
# look at bivariate plots
|
||||||
|
ggplot(task_level_variables, aes(
|
||||||
|
x = as.factor(source),
|
||||||
|
y = week_index,
|
||||||
|
fill = resolution_outcome
|
||||||
|
)) +
|
||||||
|
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
|
||||||
|
facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
|
||||||
|
scale_fill_viridis_d() +
|
||||||
|
theme_minimal() +
|
||||||
|
labs(
|
||||||
|
title = "Boxplot of week_index against Resolution Outcome",
|
||||||
|
x = "Case",
|
||||||
|
y = "Week Index",
|
||||||
|
fill = "Resolution Outcome"
|
||||||
|
)
|
||||||
|
|
||||||
# 4. save
|
ggplot(task_level_variables, aes(
|
||||||
|
x = median_PC3_ADAC,
|
||||||
|
y = TTR,
|
||||||
|
fill = isAuthorWMF
|
||||||
|
)) +
|
||||||
|
facet_grid(~source, scales="fixed") +
|
||||||
|
geom_point(shape = 21, alpha=0.3, size=2) +
|
||||||
|
xlim(-20, 20) +
|
||||||
|
ylim(0, 1440) +
|
||||||
|
scale_fill_viridis_d() +
|
||||||
|
theme_minimal() +
|
||||||
|
labs(
|
||||||
|
title = "Median PC3 Value in ADAC Comments",
|
||||||
|
x = "Median PC3 Value",
|
||||||
|
y = "Time to Resolution (up to 60 days)",
|
||||||
|
)
|
||||||
|
# 4. save
|
||||||
|
write.csv(task_level_variables, "102725_DSL_df.csv", row.names = FALSE)
|
||||||
Loading…
Reference in New Issue
Block a user