diff --git a/mgaughan-rstudio-server_27419348.out b/mgaughan-rstudio-server_27419348.out new file mode 100644 index 0000000..03325fe --- /dev/null +++ b/mgaughan-rstudio-server_27419348.out @@ -0,0 +1,18 @@ +1. SSH tunnel from your workstation using the following command: + + ssh -N -L 8787:n3439:50819 mjilg@klone.hyak.uw.edu + + and point your web browser to http://localhost:8787 + +2. log in to RStudio Server using the following credentials: + + user: mjilg + password: lM83HdgeT310p2tkyoCk + +When done using RStudio Server, terminate the job by: + +1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window) +2. Issue the following command on the login node: + + scancel -f 27419348 +slurmstepd: error: *** JOB 27419348 ON n3439 CANCELLED AT 2025-07-07T13:08:38 *** diff --git a/p2_EDA/phab_weekly_bins.R b/p2_EDA/phab_weekly_bins.R index 60632c6..1aa3f8d 100644 --- a/p2_EDA/phab_weekly_bins.R +++ b/p2_EDA/phab_weekly_bins.R @@ -34,6 +34,9 @@ c1_input_df <- c1_input_df |> date_created >= as.numeric(as.POSIXct("2013-06-06", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2013-07-01", tz = "UTC")) ~ 2, # post-announcement pre-deployment date_created >= as.numeric(as.POSIXct("2013-07-01", tz = "UTC"))~ 3 # post-deployment opt-out )) |> + mutate(author_closer = AuthorPHID %in% CloserPHID, + same_author = AuthorPHID == CloserPHID) |> + mutate(closed_relevance = date_closed <= as.numeric(as.POSIXct("2013-10-01", tz = "UTC"))) |> mutate(week_index = relative_week(date_created, as.Date("2013-07-01"))) @@ -51,6 +54,9 @@ c2_input_df <- c2_input_df |> date_created >= as.numeric(as.POSIXct("2013-08-01", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2013-08-28", tz = "UTC")) ~ 2, # post-announcement pre-deployment date_created >= as.numeric(as.POSIXct("2013-08-28", tz = "UTC"))~ 3 # post-deployment opt-out )) |> + mutate(author_closer = AuthorPHID %in% CloserPHID, + same_author = AuthorPHID == CloserPHID) |> + mutate(closed_relevance = date_closed <= as.numeric(as.POSIXct("2013-11-27", tz = "UTC"))) |> mutate(week_index = relative_week(date_created, as.Date("2013-08-28"))) # c3 key dates @@ -66,6 +72,9 @@ c3_input_df <- c3_input_df %>% date_created >= as.numeric(as.POSIXct("2015-06-12", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2015-07-02", tz = "UTC")) ~ 2, # post-announcement pre-deployment date_created >= as.numeric(as.POSIXct("2015-07-02", tz = "UTC"))~ 3 # post-deployment opt-out )) |> + mutate(author_closer = AuthorPHID %in% CloserPHID, + same_author = AuthorPHID == CloserPHID) |> + mutate(closed_relevance = date_closed <= as.numeric(as.POSIXct("2015-10-02", tz = "UTC"))) |> mutate(week_index = relative_week(date_created, as.Date("2015-07-02"))) # Combine the dataframes into one @@ -80,7 +89,8 @@ combined_df <- combined_df %>% arrange(date_created, .by_group = TRUE) %>% mutate( task_index_prev = cumsum(comment_type == "task_description") - (comment_type == "task_description"), - comment_index_prev = cumsum(comment_type == "task_subcomment") - (comment_type == "task_subcomment") + comment_index_prev = cumsum(comment_type == "task_subcomment") - (comment_type == "task_subcomment"), + author_prior_phab_contrib = task_index_prev + comment_index_prev ) %>% ungroup() |> rowwise() %>% @@ -103,52 +113,47 @@ combined_df <- combined_df %>% combined_task_df <- combined_df %>% - add_count(TaskPHID, name = "TaskPHID_count") |> + add_count(TaskPHID, name = "task_event_comment_count") |> filter(comment_type == "task_description") |> mutate(time_to_close = date_closed - date_created, time_to_close_hours = as.numeric(difftime(date_closed, date_created, units = "hours")) ) |> group_by(AuthorPHID, source) %>% arrange(date_created, .by_group = TRUE) %>% # recommended: order by date_created - mutate(task_index = row_number()) %>% + mutate(author_task_index = row_number()) %>% ungroup() -ggplot(combined_task_df, aes(x = week_index, y = priority_score, color = source)) + - geom_point(alpha = 0.6) + # Points, with some transparency - geom_smooth(method = "loess", se = TRUE) + # LOESS curve, no confidence band - theme_minimal() +library(dplyr) -library(stringr) - -# 1. Count modal verbs in each task comment_text -combined_task_df <- combined_task_df %>% - rowwise() %>% +combined_task_df <- combined_task_df |> + group_by(source) %>% mutate( - modal_verb_count = sum(str_detect( - str_to_lower(comment_text), - paste0("\\b", modal_verbs, "\\b", collapse = "|") - )), - modal_subset_count = sum(str_detect( - str_to_lower(comment_text), - paste0("\\b", modal_subset, "\\b", collapse = "|") - )), - user_count = sum(str_detect( - str_to_lower(comment_text), - paste0("\\b", whatever_subset, "\\b", collapse = "|") - )) + time_to_close_percentile = 1- percent_rank(time_to_close_hours), + comment_count_percentile = percent_rank(task_event_comment_count), + author_task_percentile = percent_rank(task_index_prev) + # inverting it so that higher percentile is faster ) %>% ungroup() + +ggplot(combined_task_df, aes(x = author_task_percentile, y =priority_score, color = source)) + + geom_point(alpha = 0.6) + # Points, with some transparency + geom_smooth(method = "loess", se = TRUE) + # LOESS curve, no confidence band + theme_minimal() + + facet_grid(source ~ author_closer) + library(ggdist) -ggplot(combined_df, aes(x = week_index, y = modal_subset_count, color = source, linetype=AuthorWMFAffil)) + - geom_point(alpha=0.1) + # Points, with some transparency - geom_smooth(method = "loess", se = FALSE) + - theme_minimal() + +ggplot(combined_task_df, aes(x=phase, y=comment_count_percentile)) + + stat_slabinterval() + + theme_minimal()+ + facet_grid(source ~ AuthorWMFAffil) +closed_combined_task_df <- combined_task_df |> + filter(!is.na(closed_relevance)) -combined_task_df_subset <- subset(combined_task_df, time_to_close_hours < 1000) - -ggplot(combined_task_df_subset, aes(x = TaskPHID_count, y = task_index, color = source)) + - geom_smooth(method = "loess", se = TRUE) + - geom_point(alpha=0.1) + - theme_minimal() +ggplot(combined_task_df, aes(x=time_to_close_percentile, y=priority_score)) + + geom_point(alpha = 0.6) + + geom_smooth(method = "loess", se = TRUE) + # LOESS curve, no confidence band# Points, with some transparency + theme_minimal()+ + facet_grid(source ~ author_closer)