updating with new EDA
This commit is contained in:
		
							parent
							
								
									067fd08dd4
								
							
						
					
					
						commit
						55964c754b
					
				
							
								
								
									
										18
									
								
								mgaughan-rstudio-server_27419348.out
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								mgaughan-rstudio-server_27419348.out
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,18 @@ | ||||
| 1. SSH tunnel from your workstation using the following command: | ||||
| 
 | ||||
|    ssh -N -L 8787:n3439:50819 mjilg@klone.hyak.uw.edu | ||||
| 
 | ||||
|    and point your web browser to http://localhost:8787 | ||||
| 
 | ||||
| 2. log in to RStudio Server using the following credentials: | ||||
| 
 | ||||
|    user: mjilg | ||||
|    password: lM83HdgeT310p2tkyoCk | ||||
| 
 | ||||
| When done using RStudio Server, terminate the job by: | ||||
| 
 | ||||
| 1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window) | ||||
| 2. Issue the following command on the login node: | ||||
| 
 | ||||
|       scancel -f 27419348 | ||||
| slurmstepd: error: *** JOB 27419348 ON n3439 CANCELLED AT 2025-07-07T13:08:38 *** | ||||
| @ -34,6 +34,9 @@ c1_input_df <- c1_input_df |> | ||||
|     date_created >=  as.numeric(as.POSIXct("2013-06-06", tz = "UTC"))  & date_created < as.numeric(as.POSIXct("2013-07-01", tz = "UTC")) ~ 2, # post-announcement pre-deployment | ||||
|     date_created >= as.numeric(as.POSIXct("2013-07-01", tz = "UTC"))~ 3                             # post-deployment opt-out | ||||
|   )) |> | ||||
|   mutate(author_closer = AuthorPHID %in% CloserPHID, | ||||
|          same_author = AuthorPHID == CloserPHID) |> | ||||
|   mutate(closed_relevance = date_closed <= as.numeric(as.POSIXct("2013-10-01", tz = "UTC"))) |> | ||||
|   mutate(week_index = relative_week(date_created, as.Date("2013-07-01"))) | ||||
| 
 | ||||
| 
 | ||||
| @ -51,6 +54,9 @@ c2_input_df <- c2_input_df |> | ||||
|     date_created >=  as.numeric(as.POSIXct("2013-08-01", tz = "UTC"))  & date_created < as.numeric(as.POSIXct("2013-08-28", tz = "UTC")) ~ 2, # post-announcement pre-deployment | ||||
|     date_created >= as.numeric(as.POSIXct("2013-08-28", tz = "UTC"))~ 3                             # post-deployment opt-out | ||||
|   )) |> | ||||
|   mutate(author_closer = AuthorPHID %in% CloserPHID, | ||||
|          same_author = AuthorPHID == CloserPHID) |> | ||||
|   mutate(closed_relevance = date_closed <= as.numeric(as.POSIXct("2013-11-27", tz = "UTC"))) |> | ||||
|   mutate(week_index = relative_week(date_created, as.Date("2013-08-28"))) | ||||
| 
 | ||||
| # c3 key dates  | ||||
| @ -66,6 +72,9 @@ c3_input_df <- c3_input_df %>% | ||||
|     date_created >=  as.numeric(as.POSIXct("2015-06-12", tz = "UTC"))  & date_created < as.numeric(as.POSIXct("2015-07-02", tz = "UTC")) ~ 2, # post-announcement pre-deployment | ||||
|     date_created >= as.numeric(as.POSIXct("2015-07-02", tz = "UTC"))~ 3                             # post-deployment opt-out | ||||
|   )) |> | ||||
|   mutate(author_closer = AuthorPHID %in% CloserPHID, | ||||
|          same_author = AuthorPHID == CloserPHID) |> | ||||
|   mutate(closed_relevance = date_closed <= as.numeric(as.POSIXct("2015-10-02", tz = "UTC"))) |> | ||||
|   mutate(week_index = relative_week(date_created, as.Date("2015-07-02"))) | ||||
| 
 | ||||
| # Combine the dataframes into one | ||||
| @ -80,7 +89,8 @@ combined_df <- combined_df %>% | ||||
|   arrange(date_created, .by_group = TRUE) %>% | ||||
|   mutate( | ||||
|     task_index_prev = cumsum(comment_type == "task_description") - (comment_type == "task_description"), | ||||
|     comment_index_prev = cumsum(comment_type == "task_subcomment") - (comment_type == "task_subcomment") | ||||
|     comment_index_prev = cumsum(comment_type == "task_subcomment") - (comment_type == "task_subcomment"), | ||||
|     author_prior_phab_contrib = task_index_prev + comment_index_prev | ||||
|   ) %>% | ||||
|   ungroup() |>  | ||||
|   rowwise() %>% | ||||
| @ -103,52 +113,47 @@ combined_df <- combined_df %>% | ||||
| 
 | ||||
| 
 | ||||
| combined_task_df <- combined_df %>%  | ||||
|   add_count(TaskPHID, name = "TaskPHID_count") |> | ||||
|   add_count(TaskPHID, name = "task_event_comment_count") |> | ||||
|   filter(comment_type == "task_description") |> | ||||
|   mutate(time_to_close = date_closed - date_created, | ||||
|          time_to_close_hours = as.numeric(difftime(date_closed, date_created, units = "hours")) | ||||
|   ) |> | ||||
|   group_by(AuthorPHID, source) %>% | ||||
|   arrange(date_created, .by_group = TRUE) %>% # recommended: order by date_created | ||||
|   mutate(task_index = row_number()) %>% | ||||
|   mutate(author_task_index = row_number()) %>% | ||||
|   ungroup() | ||||
| 
 | ||||
| ggplot(combined_task_df, aes(x = week_index, y = priority_score, color = source)) + | ||||
|   geom_point(alpha = 0.6) +                # Points, with some transparency | ||||
|   geom_smooth(method = "loess", se = TRUE) + # LOESS curve with confidence band | ||||
|   theme_minimal()         | ||||
| library(dplyr) | ||||
| 
 | ||||
| library(stringr) | ||||
| 
 | ||||
| # 1. Count modal verbs in each task comment_text | ||||
| combined_task_df <- combined_task_df %>% | ||||
|   rowwise() %>% | ||||
| combined_task_df <- combined_task_df |> | ||||
|   group_by(source) %>% | ||||
|   mutate( | ||||
|     modal_verb_count = sum(str_detect( | ||||
|       str_to_lower(comment_text), | ||||
|       paste0("\\b", modal_verbs, "\\b", collapse = "|") | ||||
|     )), | ||||
|     modal_subset_count = sum(str_detect( | ||||
|       str_to_lower(comment_text), | ||||
|       paste0("\\b", modal_subset, "\\b", collapse = "|") | ||||
|     )), | ||||
|     user_count = sum(str_detect( | ||||
|       str_to_lower(comment_text), | ||||
|       paste0("\\b", whatever_subset, "\\b", collapse = "|") | ||||
|     )) | ||||
|     time_to_close_percentile = 1- percent_rank(time_to_close_hours), | ||||
|     comment_count_percentile = percent_rank(task_event_comment_count), | ||||
|     author_task_percentile = percent_rank(task_index_prev) | ||||
|     # time_to_close_percentile is inverted (1 - percent_rank) so that a higher percentile means a faster close | ||||
|   ) %>% | ||||
|   ungroup() | ||||
| 
 | ||||
| ggplot(combined_task_df, aes(x = author_task_percentile, y =priority_score, color = source)) + | ||||
|   geom_point(alpha = 0.6) +                # Points, with some transparency | ||||
|   geom_smooth(method = "loess", se = TRUE) + # LOESS curve with confidence band | ||||
|   theme_minimal() + | ||||
|   facet_grid(source ~ author_closer) | ||||
| 
 | ||||
| library(ggdist) | ||||
| ggplot(combined_df, aes(x = week_index, y = modal_subset_count, color = source, linetype=AuthorWMFAffil)) + | ||||
|   geom_point(alpha=0.1) +             # Points, with some transparency | ||||
|   geom_smooth(method = "loess", se = FALSE) +  | ||||
|   theme_minimal()       | ||||
| 
 | ||||
| ggplot(combined_task_df, aes(x=phase, y=comment_count_percentile)) + | ||||
|   stat_slabinterval() + | ||||
|   theme_minimal()+ | ||||
|   facet_grid(source ~ AuthorWMFAffil) | ||||
| 
 | ||||
| 
 | ||||
| closed_combined_task_df <- combined_task_df |> | ||||
|   filter(!is.na(closed_relevance)) | ||||
| 
 | ||||
| combined_task_df_subset <- subset(combined_task_df, time_to_close_hours < 1000) | ||||
| 
 | ||||
| ggplot(combined_task_df_subset, aes(x = TaskPHID_count, y = task_index, color = source)) + | ||||
|   geom_smooth(method = "loess", se = TRUE) +  | ||||
|   geom_point(alpha=0.1) +    | ||||
|   theme_minimal() | ||||
| ggplot(combined_task_df, aes(x=time_to_close_percentile, y=priority_score)) +  | ||||
|   geom_point(alpha = 0.6) +  | ||||
|   geom_smooth(method = "loess", se = TRUE) + # LOESS curve with confidence band; points drawn with some transparency | ||||
|   theme_minimal()+ | ||||
|   facet_grid(source ~ author_closer) | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user