preliminary re-aggregation of DSL df, preliminary drafting of DSL model
This commit is contained in:
parent
7555259a3e
commit
fb490e37f5
3130
111725_DSL_frame.csv
Normal file
3130
111725_DSL_frame.csv
Normal file
File diff suppressed because it is too large
Load Diff
12
dsl/dsl.R
12
dsl/dsl.R
@ -1,7 +1,7 @@
|
||||
library(tidyverse)
|
||||
library(dsl)
|
||||
|
||||
dsl_csv <-"~/dsl/102725_DSL_df_adac.csv"
|
||||
dsl_csv <-"111725_DSL_frame.csv"
|
||||
dsl_df <- read.csv(dsl_csv, header = TRUE)
|
||||
|
||||
|
||||
@ -22,19 +22,17 @@ power_model <- power_dsl(
|
||||
summary(power_model)
|
||||
plot(power_model, coef_name = "human_SOL_prop_adac")
|
||||
|
||||
dsl_df <- dsl_df |>
|
||||
filter(source=="c1")
|
||||
|
||||
trial_model <- dsl(
|
||||
model = "logit",
|
||||
formula = dsl_score ~ human_BI_prop_adac +
|
||||
formula = dsl_score ~ human_TSOL_prop_adac +
|
||||
median_gerrit_loc_delta + median_gerrit_reviewers +
|
||||
as.factor(isAuthorWMF) +
|
||||
as.factor(author_closer) +
|
||||
median_PC4_adac +
|
||||
week_index,
|
||||
predicted_var = "human_BI_prop_adac",
|
||||
prediction = "olmo_BI_prop_adac",
|
||||
week_index + n_comments_before,
|
||||
predicted_var = "human_TSOL_prop_adac",
|
||||
prediction = "olmo_TSOL_prop_adac",
|
||||
sample_prob = "sampling_prob",
|
||||
data=dsl_df
|
||||
)
|
||||
|
||||
@ -13,7 +13,9 @@ valid_categories <- c('EXPECTED BEHAVIOR', 'MOTIVATION','OBSERVED BUG BEHAVIOR',
|
||||
'POTENTIAL NEW ISSUES AND REQUESTS', 'SOLUTION USAGE',
|
||||
'WORKAROUNDS', 'ISSUE CONTENT MANAGEMENT', 'ACTION ON ISSUE',
|
||||
'SOCIAL CONVERSATION')
|
||||
|
||||
library(dplyr)
|
||||
library(purrr)
|
||||
library(stringr)
|
||||
human_list_unified_df <- unified_df %>%
|
||||
filter(!is.na(human_labels)) |>
|
||||
mutate(human_labels = tidyr::replace_na(human_labels, "")) |>
|
||||
@ -26,85 +28,79 @@ human_list_unified_df <- unified_df %>%
|
||||
.x
|
||||
}
|
||||
})) %>%
|
||||
unnest(list_human_labels, keep_empty = TRUE) |>
|
||||
tidyr::unnest(list_human_labels, keep_empty = TRUE) |>
|
||||
filter(list_human_labels != "NA") |>
|
||||
group_by(TaskPHID) |>
|
||||
summarise(
|
||||
# Overall proportions (all comments)
|
||||
n_tags = sum(!is.na(list_human_labels)),
|
||||
human_BE_prop = if_else(
|
||||
human_EP_prop = if_else(
|
||||
n_tags == 0L,
|
||||
NA_real_,
|
||||
mean(list_human_labels %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||
mean(list_human_labels %in% c("OBSERVED BUG BEHAVIOR", "BUG REPRODUCTION", "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||
),
|
||||
human_SOL_prop = if_else(
|
||||
human_TSOL_prop = if_else(
|
||||
n_tags == 0L,
|
||||
NA_real_,
|
||||
mean(list_human_labels %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
|
||||
mean(list_human_labels %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE", "TESTING", "WORKAROUNDS", "TASK PROGRESS"), na.rm = TRUE)
|
||||
),
|
||||
human_VR_prop = if_else(
|
||||
human_DIO_prop = if_else(
|
||||
n_tags == 0L,
|
||||
NA_real_,
|
||||
mean(list_human_labels %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
|
||||
"SOLUTION DISCUSSION", "SOLUTION USAGE",
|
||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||
mean(list_human_labels %in% c("MOTIVATION", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||
),
|
||||
human_BI_prop = if_else(
|
||||
human_RK_prop = if_else(
|
||||
n_tags == 0L,
|
||||
NA_real_,
|
||||
mean(list_human_labels %in% c("BUG REPRODUCTION",
|
||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||
mean(list_human_labels %in% c("ACTION ON ISSUE",
|
||||
"ISSUE CONTENT MANAGEMENT"), na.rm = TRUE)
|
||||
),
|
||||
|
||||
# ADAC==1 proportions
|
||||
n_tags_adac = sum(!is.na(list_human_labels) & ADAC == 1),
|
||||
human_BE_prop_adac = if_else(
|
||||
human_EP_prop_adac = if_else(
|
||||
n_tags_adac == 0L,
|
||||
NA_real_,
|
||||
mean(list_human_labels[ADAC == 1] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||
mean(list_human_labels[ADAC == 1] %in% c("OBSERVED BUG BEHAVIOR", "BUG REPRODUCTION", "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||
),
|
||||
human_SOL_prop_adac = if_else(
|
||||
human_TSOL_prop_adac = if_else(
|
||||
n_tags_adac == 0L,
|
||||
NA_real_,
|
||||
mean(list_human_labels[ADAC == 1] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
|
||||
mean(list_human_labels[ADAC == 1] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE", "TESTING", "WORKAROUNDS", "TASK PROGRESS"), na.rm = TRUE)
|
||||
),
|
||||
human_VR_prop_adac = if_else(
|
||||
human_DIO_prop_adac = if_else(
|
||||
n_tags_adac == 0L,
|
||||
NA_real_,
|
||||
mean(list_human_labels[ADAC == 1] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
|
||||
"SOLUTION DISCUSSION", "SOLUTION USAGE",
|
||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||
mean(list_human_labels[ADAC == 1] %in% c("MOTIVATION", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||
),
|
||||
human_BI_prop_adac = if_else(
|
||||
human_RK_prop_adac = if_else(
|
||||
n_tags_adac == 0L,
|
||||
NA_real_,
|
||||
mean(list_human_labels[ADAC == 1] %in% c("BUG REPRODUCTION",
|
||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||
mean(list_human_labels[ADAC == 1] %in% c("ACTION ON ISSUE",
|
||||
"ISSUE CONTENT MANAGEMENT"), na.rm = TRUE)
|
||||
),
|
||||
# ADAC==0 proportions
|
||||
n_tags_no_adac = sum(!is.na(list_human_labels) & ADAC == 0),
|
||||
human_BE_prop_no_adac = if_else(
|
||||
human_EP_prop_no_adac = if_else(
|
||||
n_tags_no_adac == 0L,
|
||||
NA_real_,
|
||||
mean(list_human_labels[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||
mean(list_human_labels[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "BUG REPRODUCTION", "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||
),
|
||||
human_SOL_prop_no_adac = if_else(
|
||||
human_TSOL_prop_no_adac = if_else(
|
||||
n_tags_no_adac == 0L,
|
||||
NA_real_,
|
||||
mean(list_human_labels[ADAC == 0] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
|
||||
mean(list_human_labels[ADAC == 0] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE", "TESTING", "WORKAROUNDS", "TASK PROGRESS"), na.rm = TRUE)
|
||||
),
|
||||
human_VR_prop_no_adac = if_else(
|
||||
human_DIO_prop_no_adac = if_else(
|
||||
n_tags_no_adac == 0L,
|
||||
NA_real_,
|
||||
mean(list_human_labels[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
|
||||
"SOLUTION DISCUSSION", "SOLUTION USAGE",
|
||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||
mean(list_human_labels[ADAC == 0] %in% c("MOTIVATION", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||
),
|
||||
human_BI_prop_no_adac = if_else(
|
||||
human_RK_prop_no_adac = if_else(
|
||||
n_tags_no_adac == 0L,
|
||||
NA_real_,
|
||||
mean(list_human_labels[ADAC == 0] %in% c("BUG REPRODUCTION",
|
||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||
mean(list_human_labels[ADAC == 0] %in% c("ACTION ON ISSUE",
|
||||
"ISSUE CONTENT MANAGEMENT"), na.rm = TRUE)
|
||||
),
|
||||
.groups = "drop"
|
||||
) |>
|
||||
@ -117,7 +113,7 @@ olmo_list_unified_df <- unified_df %>%
|
||||
olmo_sentence_labels,
|
||||
"(?<=')[^']+(?=')|(?<=\")[^\"]+(?=\")"
|
||||
)) %>%
|
||||
unnest(list_olmo_labels, keep_empty = TRUE) |>
|
||||
tidyr::unnest(list_olmo_labels, keep_empty = TRUE) |>
|
||||
filter(!is.na(list_olmo_labels), list_olmo_labels != "", list_olmo_labels != "NA") %>%
|
||||
mutate(list_olmo_labels = str_squish(str_replace_all(list_olmo_labels, "[^\\p{L} ]+", ""))) %>%
|
||||
filter(list_olmo_labels != "") %>%
|
||||
@ -134,76 +130,70 @@ olmo_list_unified_df <- unified_df %>%
|
||||
summarise(
|
||||
# Overall proportions (all comments)
|
||||
n_tags = sum(!is.na(olmo_label)),
|
||||
olmo_BE_prop = if_else(
|
||||
olmo_EP_prop = if_else(
|
||||
n_tags == 0L,
|
||||
NA_real_,
|
||||
mean(olmo_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||
mean(olmo_label %in% c("OBSERVED BUG BEHAVIOR", "BUG REPRODUCTION", "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||
),
|
||||
olmo_SOL_prop = if_else(
|
||||
olmo_TSOL_prop = if_else(
|
||||
n_tags == 0L,
|
||||
NA_real_,
|
||||
mean(olmo_label %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
|
||||
mean(olmo_label %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE", "TESTING", "WORKAROUNDS", "TASK PROGRESS"), na.rm = TRUE)
|
||||
),
|
||||
olmo_VR_prop = if_else(
|
||||
olmo_DIO_prop = if_else(
|
||||
n_tags == 0L,
|
||||
NA_real_,
|
||||
mean(olmo_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
|
||||
"SOLUTION DISCUSSION", "SOLUTION USAGE",
|
||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||
mean(olmo_label %in% c("MOTIVATION", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||
),
|
||||
olmo_BI_prop = if_else(
|
||||
olmo_RK_prop = if_else(
|
||||
n_tags == 0L,
|
||||
NA_real_,
|
||||
mean(olmo_label %in% c("BUG REPRODUCTION",
|
||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||
mean(olmo_label %in% c("ACTION ON ISSUE",
|
||||
"ISSUE CONTENT MANAGEMENT"), na.rm = TRUE)
|
||||
),
|
||||
n_tags_adac = sum(!is.na(olmo_label) & ADAC == 1),
|
||||
olmo_BE_prop_adac = if_else(
|
||||
olmo_EP_prop_adac = if_else(
|
||||
n_tags_adac == 0L,
|
||||
NA_real_,
|
||||
mean(olmo_label[ADAC == 1] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||
mean(olmo_label[ADAC == 1] %in% c("OBSERVED BUG BEHAVIOR", "BUG REPRODUCTION", "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||
),
|
||||
olmo_SOL_prop_adac = if_else(
|
||||
olmo_TSOL_prop_adac = if_else(
|
||||
n_tags_adac == 0L,
|
||||
NA_real_,
|
||||
mean(olmo_label[ADAC == 1] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
|
||||
mean(olmo_label[ADAC == 1] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE", "TESTING", "WORKAROUNDS", "TASK PROGRESS"), na.rm = TRUE)
|
||||
),
|
||||
olmo_VR_prop_adac = if_else(
|
||||
olmo_DIO_prop_adac = if_else(
|
||||
n_tags_adac == 0L,
|
||||
NA_real_,
|
||||
mean(olmo_label[ADAC == 1] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
|
||||
"SOLUTION DISCUSSION", "SOLUTION USAGE",
|
||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||
mean(olmo_label[ADAC == 1] %in% c("MOTIVATION", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||
),
|
||||
olmo_BI_prop_adac = if_else(
|
||||
olmo_RK_prop_adac = if_else(
|
||||
n_tags_adac == 0L,
|
||||
NA_real_,
|
||||
mean(olmo_label[ADAC == 1] %in% c("BUG REPRODUCTION",
|
||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||
mean(olmo_label[ADAC == 1] %in% c("ACTION ON ISSUE",
|
||||
"ISSUE CONTENT MANAGEMENT"), na.rm = TRUE)
|
||||
),
|
||||
n_tags_no_adac = sum(!is.na(olmo_label) & ADAC == 0),
|
||||
olmo_BE_prop_no_adac = if_else(
|
||||
olmo_EP_prop_no_adac = if_else(
|
||||
n_tags_no_adac == 0L,
|
||||
NA_real_,
|
||||
mean(olmo_label[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||
mean(olmo_label[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "BUG REPRODUCTION", "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||
),
|
||||
olmo_SOL_prop_no_adac = if_else(
|
||||
olmo_TSOL_prop_no_adac = if_else(
|
||||
n_tags_no_adac == 0L,
|
||||
NA_real_,
|
||||
mean(olmo_label[ADAC == 0] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
|
||||
mean(olmo_label[ADAC == 0] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE", "TESTING", "WORKAROUNDS", "TASK PROGRESS"), na.rm = TRUE)
|
||||
),
|
||||
olmo_VR_prop_no_adac = if_else(
|
||||
olmo_DIO_prop_no_adac = if_else(
|
||||
n_tags_no_adac == 0L,
|
||||
NA_real_,
|
||||
mean(olmo_label[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
|
||||
"SOLUTION DISCUSSION", "SOLUTION USAGE",
|
||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||
mean(olmo_label[ADAC == 0] %in% c("MOTIVATION", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||
),
|
||||
olmo_BI_prop_no_adac = if_else(
|
||||
olmo_RK_prop_no_adac = if_else(
|
||||
n_tags_no_adac == 0L,
|
||||
NA_real_,
|
||||
mean(olmo_label[ADAC == 0] %in% c("BUG REPRODUCTION",
|
||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||
mean(olmo_label[ADAC == 0] %in% c("ACTION ON ISSUE",
|
||||
"ISSUE CONTENT MANAGEMENT"), na.rm = TRUE)
|
||||
),
|
||||
.groups = "drop"
|
||||
) |>
|
||||
@ -289,5 +279,15 @@ ggplot(task_level_variables, aes(
|
||||
)
|
||||
|
||||
|
||||
ggplot(task_level_variables,
|
||||
aes(
|
||||
x=as.factor(source),
|
||||
y=olmo_RK_prop,
|
||||
fill=as.factor(source)
|
||||
)) +
|
||||
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
|
||||
scale_fill_viridis_d() +
|
||||
theme_minimal()
|
||||
|
||||
# 4. save
|
||||
write.csv(task_level_variables, "110925_DSL_df_adac.csv", row.names = FALSE)
|
||||
write.csv(task_level_variables, "111725_DSL_frame.csv", row.names = FALSE)
|
||||
|
||||
17
mgaughan-rstudio-server_31035935.out
Normal file
17
mgaughan-rstudio-server_31035935.out
Normal file
@ -0,0 +1,17 @@
|
||||
1. SSH tunnel from your workstation using the following command:
|
||||
|
||||
ssh -N -L 8787:n3439:35765 mjilg@klone.hyak.uw.edu
|
||||
|
||||
and point your web browser to http://localhost:8787
|
||||
|
||||
2. log in to RStudio Server using the following credentials:
|
||||
|
||||
user: mjilg
|
||||
password: QKOjN5O9o8KE4QlK+t4M
|
||||
|
||||
When done using RStudio Server, terminate the job by:
|
||||
|
||||
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
|
||||
2. Issue the following command on the login node:
|
||||
|
||||
scancel -f 31035935
|
||||
Loading…
Reference in New Issue
Block a user