preliminary re-aggregation of DSL df, preliminary drafting of DSL model
This commit is contained in:
parent
7555259a3e
commit
fb490e37f5
3130
111725_DSL_frame.csv
Normal file
3130
111725_DSL_frame.csv
Normal file
File diff suppressed because it is too large
Load Diff
12
dsl/dsl.R
12
dsl/dsl.R
@ -1,7 +1,7 @@
|
|||||||
library(tidyverse)
|
library(tidyverse)
|
||||||
library(dsl)
|
library(dsl)
|
||||||
|
|
||||||
dsl_csv <-"~/dsl/102725_DSL_df_adac.csv"
|
dsl_csv <-"111725_DSL_frame.csv"
|
||||||
dsl_df <- read.csv(dsl_csv, header = TRUE)
|
dsl_df <- read.csv(dsl_csv, header = TRUE)
|
||||||
|
|
||||||
|
|
||||||
@ -22,19 +22,17 @@ power_model <- power_dsl(
|
|||||||
summary(power_model)
|
summary(power_model)
|
||||||
plot(power_model, coef_name = "human_SOL_prop_adac")
|
plot(power_model, coef_name = "human_SOL_prop_adac")
|
||||||
|
|
||||||
dsl_df <- dsl_df |>
|
|
||||||
filter(source=="c1")
|
|
||||||
|
|
||||||
trial_model <- dsl(
|
trial_model <- dsl(
|
||||||
model = "logit",
|
model = "logit",
|
||||||
formula = dsl_score ~ human_BI_prop_adac +
|
formula = dsl_score ~ human_TSOL_prop_adac +
|
||||||
median_gerrit_loc_delta + median_gerrit_reviewers +
|
median_gerrit_loc_delta + median_gerrit_reviewers +
|
||||||
as.factor(isAuthorWMF) +
|
as.factor(isAuthorWMF) +
|
||||||
as.factor(author_closer) +
|
as.factor(author_closer) +
|
||||||
median_PC4_adac +
|
median_PC4_adac +
|
||||||
week_index,
|
week_index + n_comments_before,
|
||||||
predicted_var = "human_BI_prop_adac",
|
predicted_var = "human_TSOL_prop_adac",
|
||||||
prediction = "olmo_BI_prop_adac",
|
prediction = "olmo_TSOL_prop_adac",
|
||||||
sample_prob = "sampling_prob",
|
sample_prob = "sampling_prob",
|
||||||
data=dsl_df
|
data=dsl_df
|
||||||
)
|
)
|
||||||
|
|||||||
@ -13,7 +13,9 @@ valid_categories <- c('EXPECTED BEHAVIOR', 'MOTIVATION','OBSERVED BUG BEHAVIOR',
|
|||||||
'POTENTIAL NEW ISSUES AND REQUESTS', 'SOLUTION USAGE',
|
'POTENTIAL NEW ISSUES AND REQUESTS', 'SOLUTION USAGE',
|
||||||
'WORKAROUNDS', 'ISSUE CONTENT MANAGEMENT', 'ACTION ON ISSUE',
|
'WORKAROUNDS', 'ISSUE CONTENT MANAGEMENT', 'ACTION ON ISSUE',
|
||||||
'SOCIAL CONVERSATION')
|
'SOCIAL CONVERSATION')
|
||||||
|
library(dplyr)
|
||||||
|
library(purrr)
|
||||||
|
library(stringr)
|
||||||
human_list_unified_df <- unified_df %>%
|
human_list_unified_df <- unified_df %>%
|
||||||
filter(!is.na(human_labels)) |>
|
filter(!is.na(human_labels)) |>
|
||||||
mutate(human_labels = tidyr::replace_na(human_labels, "")) |>
|
mutate(human_labels = tidyr::replace_na(human_labels, "")) |>
|
||||||
@ -26,85 +28,79 @@ human_list_unified_df <- unified_df %>%
|
|||||||
.x
|
.x
|
||||||
}
|
}
|
||||||
})) %>%
|
})) %>%
|
||||||
unnest(list_human_labels, keep_empty = TRUE) |>
|
tidyr::unnest(list_human_labels, keep_empty = TRUE) |>
|
||||||
filter(list_human_labels != "NA") |>
|
filter(list_human_labels != "NA") |>
|
||||||
group_by(TaskPHID) |>
|
group_by(TaskPHID) |>
|
||||||
summarise(
|
summarise(
|
||||||
# Overall proportions (all comments)
|
# Overall proportions (all comments)
|
||||||
n_tags = sum(!is.na(list_human_labels)),
|
n_tags = sum(!is.na(list_human_labels)),
|
||||||
human_BE_prop = if_else(
|
human_EP_prop = if_else(
|
||||||
n_tags == 0L,
|
n_tags == 0L,
|
||||||
NA_real_,
|
NA_real_,
|
||||||
mean(list_human_labels %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
mean(list_human_labels %in% c("OBSERVED BUG BEHAVIOR", "BUG REPRODUCTION", "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||||
),
|
),
|
||||||
human_SOL_prop = if_else(
|
human_TSOL_prop = if_else(
|
||||||
n_tags == 0L,
|
n_tags == 0L,
|
||||||
NA_real_,
|
NA_real_,
|
||||||
mean(list_human_labels %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
|
mean(list_human_labels %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE", "TESTING", "WORKAROUNDS", "TASK PROGRESS"), na.rm = TRUE)
|
||||||
),
|
),
|
||||||
human_VR_prop = if_else(
|
human_DIO_prop = if_else(
|
||||||
n_tags == 0L,
|
n_tags == 0L,
|
||||||
NA_real_,
|
NA_real_,
|
||||||
mean(list_human_labels %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
|
mean(list_human_labels %in% c("MOTIVATION", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||||
"SOLUTION DISCUSSION", "SOLUTION USAGE",
|
|
||||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
|
||||||
),
|
),
|
||||||
human_BI_prop = if_else(
|
human_RK_prop = if_else(
|
||||||
n_tags == 0L,
|
n_tags == 0L,
|
||||||
NA_real_,
|
NA_real_,
|
||||||
mean(list_human_labels %in% c("BUG REPRODUCTION",
|
mean(list_human_labels %in% c("ACTION ON ISSUE",
|
||||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
"ISSUE CONTENT MANAGEMENT"), na.rm = TRUE)
|
||||||
),
|
),
|
||||||
|
|
||||||
# ADAC==1 proportions
|
# ADAC==1 proportions
|
||||||
n_tags_adac = sum(!is.na(list_human_labels) & ADAC == 1),
|
n_tags_adac = sum(!is.na(list_human_labels) & ADAC == 1),
|
||||||
human_BE_prop_adac = if_else(
|
human_EP_prop_adac = if_else(
|
||||||
n_tags_adac == 0L,
|
n_tags_adac == 0L,
|
||||||
NA_real_,
|
NA_real_,
|
||||||
mean(list_human_labels[ADAC == 1] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
mean(list_human_labels[ADAC == 1] %in% c("OBSERVED BUG BEHAVIOR", "BUG REPRODUCTION", "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||||
),
|
),
|
||||||
human_SOL_prop_adac = if_else(
|
human_TSOL_prop_adac = if_else(
|
||||||
n_tags_adac == 0L,
|
n_tags_adac == 0L,
|
||||||
NA_real_,
|
NA_real_,
|
||||||
mean(list_human_labels[ADAC == 1] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
|
mean(list_human_labels[ADAC == 1] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE", "TESTING", "WORKAROUNDS", "TASK PROGRESS"), na.rm = TRUE)
|
||||||
),
|
),
|
||||||
human_VR_prop_adac = if_else(
|
human_DIO_prop_adac = if_else(
|
||||||
n_tags_adac == 0L,
|
n_tags_adac == 0L,
|
||||||
NA_real_,
|
NA_real_,
|
||||||
mean(list_human_labels[ADAC == 1] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
|
mean(list_human_labels[ADAC == 1] %in% c("MOTIVATION", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||||
"SOLUTION DISCUSSION", "SOLUTION USAGE",
|
|
||||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
|
||||||
),
|
),
|
||||||
human_BI_prop_adac = if_else(
|
human_RK_prop_adac = if_else(
|
||||||
n_tags_adac == 0L,
|
n_tags_adac == 0L,
|
||||||
NA_real_,
|
NA_real_,
|
||||||
mean(list_human_labels[ADAC == 1] %in% c("BUG REPRODUCTION",
|
mean(list_human_labels[ADAC == 1] %in% c("ACTION ON ISSUE",
|
||||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
"ISSUE CONTENT MANAGEMENT"), na.rm = TRUE)
|
||||||
),
|
),
|
||||||
# ADAC==0 proportions
|
# ADAC==0 proportions
|
||||||
n_tags_no_adac = sum(!is.na(list_human_labels) & ADAC == 0),
|
n_tags_no_adac = sum(!is.na(list_human_labels) & ADAC == 0),
|
||||||
human_BE_prop_no_adac = if_else(
|
human_EP_prop_no_adac = if_else(
|
||||||
n_tags_no_adac == 0L,
|
n_tags_no_adac == 0L,
|
||||||
NA_real_,
|
NA_real_,
|
||||||
mean(list_human_labels[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
mean(list_human_labels[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "BUG REPRODUCTION", "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||||
),
|
),
|
||||||
human_SOL_prop_no_adac = if_else(
|
human_TSOL_prop_no_adac = if_else(
|
||||||
n_tags_no_adac == 0L,
|
n_tags_no_adac == 0L,
|
||||||
NA_real_,
|
NA_real_,
|
||||||
mean(list_human_labels[ADAC == 0] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
|
mean(list_human_labels[ADAC == 0] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE", "TESTING", "WORKAROUNDS", "TASK PROGRESS"), na.rm = TRUE)
|
||||||
),
|
),
|
||||||
human_VR_prop_no_adac = if_else(
|
human_DIO_prop_no_adac = if_else(
|
||||||
n_tags_no_adac == 0L,
|
n_tags_no_adac == 0L,
|
||||||
NA_real_,
|
NA_real_,
|
||||||
mean(list_human_labels[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
|
mean(list_human_labels[ADAC == 0] %in% c("MOTIVATION", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||||
"SOLUTION DISCUSSION", "SOLUTION USAGE",
|
|
||||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
|
||||||
),
|
),
|
||||||
human_BI_prop_no_adac = if_else(
|
human_RK_prop_no_adac = if_else(
|
||||||
n_tags_no_adac == 0L,
|
n_tags_no_adac == 0L,
|
||||||
NA_real_,
|
NA_real_,
|
||||||
mean(list_human_labels[ADAC == 0] %in% c("BUG REPRODUCTION",
|
mean(list_human_labels[ADAC == 0] %in% c("ACTION ON ISSUE",
|
||||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
"ISSUE CONTENT MANAGEMENT"), na.rm = TRUE)
|
||||||
),
|
),
|
||||||
.groups = "drop"
|
.groups = "drop"
|
||||||
) |>
|
) |>
|
||||||
@ -117,7 +113,7 @@ olmo_list_unified_df <- unified_df %>%
|
|||||||
olmo_sentence_labels,
|
olmo_sentence_labels,
|
||||||
"(?<=')[^']+(?=')|(?<=\")[^\"]+(?=\")"
|
"(?<=')[^']+(?=')|(?<=\")[^\"]+(?=\")"
|
||||||
)) %>%
|
)) %>%
|
||||||
unnest(list_olmo_labels, keep_empty = TRUE) |>
|
tidyr::unnest(list_olmo_labels, keep_empty = TRUE) |>
|
||||||
filter(!is.na(list_olmo_labels), list_olmo_labels != "", list_olmo_labels != "NA") %>%
|
filter(!is.na(list_olmo_labels), list_olmo_labels != "", list_olmo_labels != "NA") %>%
|
||||||
mutate(list_olmo_labels = str_squish(str_replace_all(list_olmo_labels, "[^\\p{L} ]+", ""))) %>%
|
mutate(list_olmo_labels = str_squish(str_replace_all(list_olmo_labels, "[^\\p{L} ]+", ""))) %>%
|
||||||
filter(list_olmo_labels != "") %>%
|
filter(list_olmo_labels != "") %>%
|
||||||
@ -134,76 +130,70 @@ olmo_list_unified_df <- unified_df %>%
|
|||||||
summarise(
|
summarise(
|
||||||
# Overall proportions (all comments)
|
# Overall proportions (all comments)
|
||||||
n_tags = sum(!is.na(olmo_label)),
|
n_tags = sum(!is.na(olmo_label)),
|
||||||
olmo_BE_prop = if_else(
|
olmo_EP_prop = if_else(
|
||||||
n_tags == 0L,
|
n_tags == 0L,
|
||||||
NA_real_,
|
NA_real_,
|
||||||
mean(olmo_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
mean(olmo_label %in% c("OBSERVED BUG BEHAVIOR", "BUG REPRODUCTION", "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||||
),
|
),
|
||||||
olmo_SOL_prop = if_else(
|
olmo_TSOL_prop = if_else(
|
||||||
n_tags == 0L,
|
n_tags == 0L,
|
||||||
NA_real_,
|
NA_real_,
|
||||||
mean(olmo_label %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
|
mean(olmo_label %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE", "TESTING", "WORKAROUNDS", "TASK PROGRESS"), na.rm = TRUE)
|
||||||
),
|
),
|
||||||
olmo_VR_prop = if_else(
|
olmo_DIO_prop = if_else(
|
||||||
n_tags == 0L,
|
n_tags == 0L,
|
||||||
NA_real_,
|
NA_real_,
|
||||||
mean(olmo_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
|
mean(olmo_label %in% c("MOTIVATION", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||||
"SOLUTION DISCUSSION", "SOLUTION USAGE",
|
|
||||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
|
||||||
),
|
),
|
||||||
olmo_BI_prop = if_else(
|
olmo_RK_prop = if_else(
|
||||||
n_tags == 0L,
|
n_tags == 0L,
|
||||||
NA_real_,
|
NA_real_,
|
||||||
mean(olmo_label %in% c("BUG REPRODUCTION",
|
mean(olmo_label %in% c("ACTION ON ISSUE",
|
||||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
"ISSUE CONTENT MANAGEMENT"), na.rm = TRUE)
|
||||||
),
|
),
|
||||||
n_tags_adac = sum(!is.na(olmo_label) & ADAC == 1),
|
n_tags_adac = sum(!is.na(olmo_label) & ADAC == 1),
|
||||||
olmo_BE_prop_adac = if_else(
|
olmo_EP_prop_adac = if_else(
|
||||||
n_tags_adac == 0L,
|
n_tags_adac == 0L,
|
||||||
NA_real_,
|
NA_real_,
|
||||||
mean(olmo_label[ADAC == 1] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
mean(olmo_label[ADAC == 1] %in% c("OBSERVED BUG BEHAVIOR", "BUG REPRODUCTION", "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||||
),
|
),
|
||||||
olmo_SOL_prop_adac = if_else(
|
olmo_TSOL_prop_adac = if_else(
|
||||||
n_tags_adac == 0L,
|
n_tags_adac == 0L,
|
||||||
NA_real_,
|
NA_real_,
|
||||||
mean(olmo_label[ADAC == 1] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
|
mean(olmo_label[ADAC == 1] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE", "TESTING", "WORKAROUNDS", "TASK PROGRESS"), na.rm = TRUE)
|
||||||
),
|
),
|
||||||
olmo_VR_prop_adac = if_else(
|
olmo_DIO_prop_adac = if_else(
|
||||||
n_tags_adac == 0L,
|
n_tags_adac == 0L,
|
||||||
NA_real_,
|
NA_real_,
|
||||||
mean(olmo_label[ADAC == 1] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
|
mean(olmo_label[ADAC == 1] %in% c("MOTIVATION", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||||
"SOLUTION DISCUSSION", "SOLUTION USAGE",
|
|
||||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
|
||||||
),
|
),
|
||||||
olmo_BI_prop_adac = if_else(
|
olmo_RK_prop_adac = if_else(
|
||||||
n_tags_adac == 0L,
|
n_tags_adac == 0L,
|
||||||
NA_real_,
|
NA_real_,
|
||||||
mean(olmo_label[ADAC == 1] %in% c("BUG REPRODUCTION",
|
mean(olmo_label[ADAC == 1] %in% c("ACTION ON ISSUE",
|
||||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
"ISSUE CONTENT MANAGEMENT"), na.rm = TRUE)
|
||||||
),
|
),
|
||||||
n_tags_no_adac = sum(!is.na(olmo_label) & ADAC == 0),
|
n_tags_no_adac = sum(!is.na(olmo_label) & ADAC == 0),
|
||||||
olmo_BE_prop_no_adac = if_else(
|
olmo_EP_prop_no_adac = if_else(
|
||||||
n_tags_no_adac == 0L,
|
n_tags_no_adac == 0L,
|
||||||
NA_real_,
|
NA_real_,
|
||||||
mean(olmo_label[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
mean(olmo_label[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "BUG REPRODUCTION", "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
||||||
),
|
),
|
||||||
olmo_SOL_prop_no_adac = if_else(
|
olmo_TSOL_prop_no_adac = if_else(
|
||||||
n_tags_no_adac == 0L,
|
n_tags_no_adac == 0L,
|
||||||
NA_real_,
|
NA_real_,
|
||||||
mean(olmo_label[ADAC == 0] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
|
mean(olmo_label[ADAC == 0] %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE", "TESTING", "WORKAROUNDS", "TASK PROGRESS"), na.rm = TRUE)
|
||||||
),
|
),
|
||||||
olmo_VR_prop_no_adac = if_else(
|
olmo_DIO_prop_no_adac = if_else(
|
||||||
n_tags_no_adac == 0L,
|
n_tags_no_adac == 0L,
|
||||||
NA_real_,
|
NA_real_,
|
||||||
mean(olmo_label[ADAC == 0] %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
|
mean(olmo_label[ADAC == 0] %in% c("MOTIVATION", "EXPECTED BEHAVIOR"), na.rm = TRUE)
|
||||||
"SOLUTION DISCUSSION", "SOLUTION USAGE",
|
|
||||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
|
||||||
),
|
),
|
||||||
olmo_BI_prop_no_adac = if_else(
|
olmo_RK_prop_no_adac = if_else(
|
||||||
n_tags_no_adac == 0L,
|
n_tags_no_adac == 0L,
|
||||||
NA_real_,
|
NA_real_,
|
||||||
mean(olmo_label[ADAC == 0] %in% c("BUG REPRODUCTION",
|
mean(olmo_label[ADAC == 0] %in% c("ACTION ON ISSUE",
|
||||||
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
|
"ISSUE CONTENT MANAGEMENT"), na.rm = TRUE)
|
||||||
),
|
),
|
||||||
.groups = "drop"
|
.groups = "drop"
|
||||||
) |>
|
) |>
|
||||||
@ -289,5 +279,15 @@ ggplot(task_level_variables, aes(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
ggplot(task_level_variables,
|
||||||
|
aes(
|
||||||
|
x=as.factor(source),
|
||||||
|
y=olmo_RK_prop,
|
||||||
|
fill=as.factor(source)
|
||||||
|
)) +
|
||||||
|
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
|
||||||
|
scale_fill_viridis_d() +
|
||||||
|
theme_minimal()
|
||||||
|
|
||||||
# 4. save
|
# 4. save
|
||||||
write.csv(task_level_variables, "110925_DSL_df_adac.csv", row.names = FALSE)
|
write.csv(task_level_variables, "111725_DSL_frame.csv", row.names = FALSE)
|
||||||
|
|||||||
17
mgaughan-rstudio-server_31035935.out
Normal file
17
mgaughan-rstudio-server_31035935.out
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
1. SSH tunnel from your workstation using the following command:
|
||||||
|
|
||||||
|
ssh -N -L 8787:n3439:35765 mjilg@klone.hyak.uw.edu
|
||||||
|
|
||||||
|
and point your web browser to http://localhost:8787
|
||||||
|
|
||||||
|
2. log in to RStudio Server using the following credentials:
|
||||||
|
|
||||||
|
user: mjilg
|
||||||
|
password: QKOjN5O9o8KE4QlK+t4M
|
||||||
|
|
||||||
|
When done using RStudio Server, terminate the job by:
|
||||||
|
|
||||||
|
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
|
||||||
|
2. Issue the following command on the login node:
|
||||||
|
|
||||||
|
scancel -f 31035935
|
||||||
Loading…
Reference in New Issue
Block a user