1
0

updating with new EDA

This commit is contained in:
Matthew Gaughan 2025-07-07 13:08:58 -07:00
parent 067fd08dd4
commit 55964c754b
2 changed files with 57 additions and 34 deletions

View File

@ -0,0 +1,18 @@
1. Create an SSH tunnel from your workstation using the following command:
ssh -N -L 8787:n3439:50819 mjilg@klone.hyak.uw.edu
and point your web browser to http://localhost:8787
2. Log in to RStudio Server using the following credentials:
user: mjilg
password: lM83HdgeT310p2tkyoCk
When done using RStudio Server, terminate the job by:
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
2. Issue the following command on the login node:
scancel -f 27419348
slurmstepd: error: *** JOB 27419348 ON n3439 CANCELLED AT 2025-07-07T13:08:38 ***

View File

@ -34,6 +34,9 @@ c1_input_df <- c1_input_df |>
date_created >= as.numeric(as.POSIXct("2013-06-06", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2013-07-01", tz = "UTC")) ~ 2, # post-announcement pre-deployment date_created >= as.numeric(as.POSIXct("2013-06-06", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2013-07-01", tz = "UTC")) ~ 2, # post-announcement pre-deployment
date_created >= as.numeric(as.POSIXct("2013-07-01", tz = "UTC"))~ 3 # post-deployment opt-out date_created >= as.numeric(as.POSIXct("2013-07-01", tz = "UTC"))~ 3 # post-deployment opt-out
)) |> )) |>
mutate(author_closer = AuthorPHID %in% CloserPHID,
same_author = AuthorPHID == CloserPHID) |>
mutate(closed_relevance = date_closed <= as.numeric(as.POSIXct("2013-10-01", tz = "UTC"))) |>
mutate(week_index = relative_week(date_created, as.Date("2013-07-01"))) mutate(week_index = relative_week(date_created, as.Date("2013-07-01")))
@ -51,6 +54,9 @@ c2_input_df <- c2_input_df |>
date_created >= as.numeric(as.POSIXct("2013-08-01", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2013-08-28", tz = "UTC")) ~ 2, # post-announcement pre-deployment date_created >= as.numeric(as.POSIXct("2013-08-01", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2013-08-28", tz = "UTC")) ~ 2, # post-announcement pre-deployment
date_created >= as.numeric(as.POSIXct("2013-08-28", tz = "UTC"))~ 3 # post-deployment opt-out date_created >= as.numeric(as.POSIXct("2013-08-28", tz = "UTC"))~ 3 # post-deployment opt-out
)) |> )) |>
mutate(author_closer = AuthorPHID %in% CloserPHID,
same_author = AuthorPHID == CloserPHID) |>
mutate(closed_relevance = date_closed <= as.numeric(as.POSIXct("2013-11-27", tz = "UTC"))) |>
mutate(week_index = relative_week(date_created, as.Date("2013-08-28"))) mutate(week_index = relative_week(date_created, as.Date("2013-08-28")))
# c3 key dates # c3 key dates
@ -66,6 +72,9 @@ c3_input_df <- c3_input_df %>%
date_created >= as.numeric(as.POSIXct("2015-06-12", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2015-07-02", tz = "UTC")) ~ 2, # post-announcement pre-deployment date_created >= as.numeric(as.POSIXct("2015-06-12", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2015-07-02", tz = "UTC")) ~ 2, # post-announcement pre-deployment
date_created >= as.numeric(as.POSIXct("2015-07-02", tz = "UTC"))~ 3 # post-deployment opt-out date_created >= as.numeric(as.POSIXct("2015-07-02", tz = "UTC"))~ 3 # post-deployment opt-out
)) |> )) |>
mutate(author_closer = AuthorPHID %in% CloserPHID,
same_author = AuthorPHID == CloserPHID) |>
mutate(closed_relevance = date_closed <= as.numeric(as.POSIXct("2015-10-02", tz = "UTC"))) |>
mutate(week_index = relative_week(date_created, as.Date("2015-07-02"))) mutate(week_index = relative_week(date_created, as.Date("2015-07-02")))
# Combine the dataframes into one # Combine the dataframes into one
@ -80,7 +89,8 @@ combined_df <- combined_df %>%
arrange(date_created, .by_group = TRUE) %>% arrange(date_created, .by_group = TRUE) %>%
mutate( mutate(
task_index_prev = cumsum(comment_type == "task_description") - (comment_type == "task_description"), task_index_prev = cumsum(comment_type == "task_description") - (comment_type == "task_description"),
comment_index_prev = cumsum(comment_type == "task_subcomment") - (comment_type == "task_subcomment") comment_index_prev = cumsum(comment_type == "task_subcomment") - (comment_type == "task_subcomment"),
author_prior_phab_contrib = task_index_prev + comment_index_prev
) %>% ) %>%
ungroup() |> ungroup() |>
rowwise() %>% rowwise() %>%
@ -103,52 +113,47 @@ combined_df <- combined_df %>%
combined_task_df <- combined_df %>% combined_task_df <- combined_df %>%
add_count(TaskPHID, name = "TaskPHID_count") |> add_count(TaskPHID, name = "task_event_comment_count") |>
filter(comment_type == "task_description") |> filter(comment_type == "task_description") |>
mutate(time_to_close = date_closed - date_created, mutate(time_to_close = date_closed - date_created,
time_to_close_hours = as.numeric(difftime(date_closed, date_created, units = "hours")) time_to_close_hours = as.numeric(difftime(date_closed, date_created, units = "hours"))
) |> ) |>
group_by(AuthorPHID, source) %>% group_by(AuthorPHID, source) %>%
arrange(date_created, .by_group = TRUE) %>% # recommended: order by date_created arrange(date_created, .by_group = TRUE) %>% # recommended: order by date_created
mutate(task_index = row_number()) %>% mutate(author_task_index = row_number()) %>%
ungroup() ungroup()
ggplot(combined_task_df, aes(x = week_index, y = priority_score, color = source)) + library(dplyr)
geom_point(alpha = 0.6) + # Points, with some transparency
geom_smooth(method = "loess", se = TRUE) + # LOESS curve, no confidence band
theme_minimal()
library(stringr) combined_task_df <- combined_task_df |>
group_by(source) %>%
# 1. Count modal verbs in each task comment_text
combined_task_df <- combined_task_df %>%
rowwise() %>%
mutate( mutate(
modal_verb_count = sum(str_detect( time_to_close_percentile = 1- percent_rank(time_to_close_hours),
str_to_lower(comment_text), comment_count_percentile = percent_rank(task_event_comment_count),
paste0("\\b", modal_verbs, "\\b", collapse = "|") author_task_percentile = percent_rank(task_index_prev)
)), # inverting it so that higher percentile is faster
modal_subset_count = sum(str_detect(
str_to_lower(comment_text),
paste0("\\b", modal_subset, "\\b", collapse = "|")
)),
user_count = sum(str_detect(
str_to_lower(comment_text),
paste0("\\b", whatever_subset, "\\b", collapse = "|")
))
) %>% ) %>%
ungroup() ungroup()
# Scatter of priority_score against author_task_percentile, colored by source,
# with a LOESS smoother; panels are laid out with source as rows and
# author_closer as columns.
ggplot(combined_task_df, aes(x = author_task_percentile, y =priority_score, color = source)) +
geom_point(alpha = 0.6) + # Points, with some transparency
geom_smooth(method = "loess", se = TRUE) + # LOESS curve; se = TRUE draws the confidence band
theme_minimal() +
facet_grid(source ~ author_closer)
library(ggdist) library(ggdist)
ggplot(combined_df, aes(x = week_index, y = modal_subset_count, color = source, linetype=AuthorWMFAffil)) +
geom_point(alpha=0.1) + # Points, with some transparency ggplot(combined_task_df, aes(x=phase, y=comment_count_percentile)) +
geom_smooth(method = "loess", se = FALSE) + stat_slabinterval() +
theme_minimal() theme_minimal()+
facet_grid(source ~ AuthorWMFAffil)
# Keep only the task rows whose closed_relevance flag is not NA.
# NOTE(review): the plot that follows in this hunk still reads combined_task_df,
# not closed_combined_task_df; confirm which data frame was intended.
closed_combined_task_df <- combined_task_df |>
filter(!is.na(closed_relevance))
combined_task_df_subset <- subset(combined_task_df, time_to_close_hours < 1000) ggplot(combined_task_df, aes(x=time_to_close_percentile, y=priority_score)) +
geom_point(alpha = 0.6) +
ggplot(combined_task_df_subset, aes(x = TaskPHID_count, y = task_index, color = source)) + geom_smooth(method = "loess", se = TRUE) + # LOESS curve, no confidence band# Points, with some transparency
geom_smooth(method = "loess", se = TRUE) + theme_minimal()+
geom_point(alpha=0.1) + facet_grid(source ~ author_closer)
theme_minimal()