updating with new EDA

some tidying up following m2 figure creation, more needed
2025-07-07 13:08:58 -07:00 · 2025-07-07 10:44:33 -07:00
28 changed files with 62 additions and 84 deletions
--- a/README.md
+++ b/README.md
@ -1,3 +1,7 @@
 # mw-lifecycle-analysis

-Analysis scripts and code for studying lifecycles of MediaWiki projects
+Analysis scripts and code for studying the deployment processes of three MediaWiki/Wikimedia features (2013-2015)
+
+
+
+
--- a/artifact-figures/m2-figures/d1-m2-bot-commits-faceted.png
+++ b/artifact-figures/m2-figures/d1-m2-bot-commits-faceted.png
--- a/artifact-figures/m2-figures/d1-m2-commits-faceted.png
+++ b/artifact-figures/m2-figures/d1-m2-commits-faceted.png
--- a/artifact-figures/m2-figures/d1-m2-tasks-faceted.png
+++ b/artifact-figures/m2-figures/d1-m2-tasks-faceted.png
--- a/artifact-figures/ww-figures/ww-0501-bot-commits-faceted.png
+++ b/artifact-figures/ww-figures/ww-0501-bot-commits-faceted.png
--- a/artifact-figures/ww-figures/ww-0501-bot-commits-grey.png
+++ b/artifact-figures/ww-figures/ww-0501-bot-commits-grey.png
--- a/artifact-figures/ww-figures/ww-0501-commits-faceted.png
+++ b/artifact-figures/ww-figures/ww-0501-commits-faceted.png
--- a/artifact-figures/ww-figures/ww-0501-tasks-faceted.png
+++ b/artifact-figures/ww-figures/ww-0501-tasks-faceted.png
--- a/artifact-figures/ww-figures/ww-c1-0430-bot-commits.png
+++ b/artifact-figures/ww-figures/ww-c1-0430-bot-commits.png
--- a/artifact-figures/ww-figures/ww-c1-0430-bot-spike.png
+++ b/artifact-figures/ww-figures/ww-c1-0430-bot-spike.png
--- a/artifact-figures/ww-figures/ww-c1-0430-commits.png
+++ b/artifact-figures/ww-figures/ww-c1-0430-commits.png
--- a/artifact-figures/ww-figures/ww-c1-0430-unaff-commit-spike.png
+++ b/artifact-figures/ww-figures/ww-c1-0430-unaff-commit-spike.png
--- a/artifact-figures/ww-figures/ww-c2-0430-bot-commits.png
+++ b/artifact-figures/ww-figures/ww-c2-0430-bot-commits.png
--- a/artifact-figures/ww-figures/ww-c2-0430-bot-spike.png
+++ b/artifact-figures/ww-figures/ww-c2-0430-bot-spike.png
--- a/artifact-figures/ww-figures/ww-c2-0430-commits.png
+++ b/artifact-figures/ww-figures/ww-c2-0430-commits.png
--- a/artifact-figures/ww-figures/ww-c2-0430-unaff-commit-spike.png
+++ b/artifact-figures/ww-figures/ww-c2-0430-unaff-commit-spike.png
--- a/artifact-figures/ww-figures/ww-c2c3-relevance-viz.png
+++ b/artifact-figures/ww-figures/ww-c2c3-relevance-viz.png
--- a/artifact-figures/ww-figures/ww-c3-0430-bot-spike.png
+++ b/artifact-figures/ww-figures/ww-c3-0430-bot-spike.png
--- a/artifact-figures/ww-figures/ww-c3-0430-commits.png
+++ b/artifact-figures/ww-figures/ww-c3-0430-commits.png
--- a/artifact-figures/ww-figures/ww-c3-0430-unaff-commit-spike.png
+++ b/artifact-figures/ww-figures/ww-c3-0430-unaff-commit-spike.png
--- a/artifact-figures/ww-figures/ww-task-plot-script.R
+++ b/artifact-figures/ww-figures/ww-task-plot-script.R
--- a/govdoc-cr-age-dist.R
+++ b/govdoc-cr-age-dist.R
@ -1,45 +0,0 @@
-library(dplyr)
-contributing_df_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/CONTRIBUTING_weekly_count_data.csv"
-contributing_df = read.csv(contributing_df_filepath, header = TRUE) 
-
-readme_df_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/README_weekly_count_data.csv"
-readme_df = read.csv(readme_df_filepath, header = TRUE)
-
-combined_df <- bind_rows(
-  contributing_df %>%
-    group_by(project_id) %>%
-    select(project_id, age_at_commit) %>%
-    mutate(document = factor("CONTRIBUTING", levels = c("CONTRIBUTING", "README"))),
-  readme_df %>%
-    group_by(project_id) %>%
-    select(project_id, age_at_commit) %>%
-    mutate(document = factor("README", levels = c("CONTRIBUTING", "README")))
-)
-
-unique_combined_df <- combined_df %>%
-  distinct(project_id, age_at_commit, document)
-
-library(tidyverse)
-library(tidyquant)
-library(ggdist)
-library(ggthemes)
-library(ggplot2)
-
-age_raincloud <- unique_combined_df |>
-  ggplot(aes(x = factor(document), y = age_at_commit, fill = factor(document))) +
-  geom_boxplot(
-    width = 0.12,
-    # removing outliers
-    outlier.color = NA,
-    alpha = 0.5
-  ) +
-  ggplot::stat_dots(
-    # ploting on left side
-    side = "left",
-    # adjusting position
-    justification = 1.1,
-    # adjust grouping (binning) of observations
-    binwidth = 0.25
-  )
-
-age_raincloud
--- a/mgaughan-rstudio-server_27419348.out
+++ b/mgaughan-rstudio-server_27419348.out
@ -0,0 +1,18 @@
+1. SSH tunnel from your workstation using the following command:
+
+   ssh -N -L 8787:n3439:50819 mjilg@klone.hyak.uw.edu
+
+   and point your web browser to http://localhost:8787
+
+2. log in to RStudio Server using the following credentials:
+
+   user: mjilg
+   password: lM83HdgeT310p2tkyoCk
+
+When done using RStudio Server, terminate the job by:
+
+1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
+2. Issue the following command on the login node:
+
+      scancel -f 27419348
+slurmstepd: error: *** JOB 27419348 ON n3439 CANCELLED AT 2025-07-07T13:08:38 ***
--- a/p2_EDA/phab_weekly_bins.R
+++ b/p2_EDA/phab_weekly_bins.R
@ -34,6 +34,9 @@ c1_input_df <- c1_input_df |>
    date_created >=  as.numeric(as.POSIXct("2013-06-06", tz = "UTC"))  & date_created < as.numeric(as.POSIXct("2013-07-01", tz = "UTC")) ~ 2, # post-announcement pre-deployment
    date_created >= as.numeric(as.POSIXct("2013-07-01", tz = "UTC"))~ 3                             # post-deployment opt-out
  )) |>
+  mutate(author_closer = AuthorPHID %in% CloserPHID,
+         same_author = AuthorPHID == CloserPHID) |>
+  mutate(closed_relevance = date_closed <= as.numeric(as.POSIXct("2013-10-01", tz = "UTC"))) |>
  mutate(week_index = relative_week(date_created, as.Date("2013-07-01")))


@ -51,6 +54,9 @@ c2_input_df <- c2_input_df |>
    date_created >=  as.numeric(as.POSIXct("2013-08-01", tz = "UTC"))  & date_created < as.numeric(as.POSIXct("2013-08-28", tz = "UTC")) ~ 2, # post-announcement pre-deployment
    date_created >= as.numeric(as.POSIXct("2013-08-28", tz = "UTC"))~ 3                             # post-deployment opt-out
  )) |>
+  mutate(author_closer = AuthorPHID %in% CloserPHID,
+         same_author = AuthorPHID == CloserPHID) |>
+  mutate(closed_relevance = date_closed <= as.numeric(as.POSIXct("2013-11-27", tz = "UTC"))) |>
  mutate(week_index = relative_week(date_created, as.Date("2013-08-28")))

 # c3 key dates 
@ -66,6 +72,9 @@ c3_input_df <- c3_input_df %>%
    date_created >=  as.numeric(as.POSIXct("2015-06-12", tz = "UTC"))  & date_created < as.numeric(as.POSIXct("2015-07-02", tz = "UTC")) ~ 2, # post-announcement pre-deployment
    date_created >= as.numeric(as.POSIXct("2015-07-02", tz = "UTC"))~ 3                             # post-deployment opt-out
  )) |>
+  mutate(author_closer = AuthorPHID %in% CloserPHID,
+         same_author = AuthorPHID == CloserPHID) |>
+  mutate(closed_relevance = date_closed <= as.numeric(as.POSIXct("2015-10-02", tz = "UTC"))) |>
  mutate(week_index = relative_week(date_created, as.Date("2015-07-02")))

 # Combine the dataframes into one
@ -80,7 +89,8 @@ combined_df <- combined_df %>%
  arrange(date_created, .by_group = TRUE) %>%
  mutate(
    task_index_prev = cumsum(comment_type == "task_description") - (comment_type == "task_description"),
-    comment_index_prev = cumsum(comment_type == "task_subcomment") - (comment_type == "task_subcomment")
+    comment_index_prev = cumsum(comment_type == "task_subcomment") - (comment_type == "task_subcomment"),
+    author_prior_phab_contrib = task_index_prev + comment_index_prev
  ) %>%
  ungroup() |> 
  rowwise() %>%
@ -103,52 +113,47 @@ combined_df <- combined_df %>%


 combined_task_df <- combined_df %>% 
-  add_count(TaskPHID, name = "TaskPHID_count") |>
+  add_count(TaskPHID, name = "task_event_comment_count") |>
  filter(comment_type == "task_description") |>
  mutate(time_to_close = date_closed - date_created,
         time_to_close_hours = as.numeric(difftime(date_closed, date_created, units = "hours"))
  ) |>
  group_by(AuthorPHID, source) %>%
  arrange(date_created, .by_group = TRUE) %>% # recommended: order by date_created
-  mutate(task_index = row_number()) %>%
+  mutate(author_task_index = row_number()) %>%
  ungroup()

-ggplot(combined_task_df, aes(x = week_index, y = priority_score, color = source)) +
-  geom_point(alpha = 0.6) +                # Points, with some transparency
-  geom_smooth(method = "loess", se = TRUE) + # LOESS curve, no confidence band
-  theme_minimal()        
+library(dplyr)

-library(stringr)
-
-# 1. Count modal verbs in each task comment_text
-combined_task_df <- combined_task_df %>%
-  rowwise() %>%
+combined_task_df <- combined_task_df |>
+  group_by(source) %>%
  mutate(
-    modal_verb_count = sum(str_detect(
-      str_to_lower(comment_text),
-      paste0("\\b", modal_verbs, "\\b", collapse = "|")
-    )),
-    modal_subset_count = sum(str_detect(
-      str_to_lower(comment_text),
-      paste0("\\b", modal_subset, "\\b", collapse = "|")
-    )),
-    user_count = sum(str_detect(
-      str_to_lower(comment_text),
-      paste0("\\b", whatever_subset, "\\b", collapse = "|")
-    ))
+    time_to_close_percentile = 1- percent_rank(time_to_close_hours),
+    comment_count_percentile = percent_rank(task_event_comment_count),
+    author_task_percentile = percent_rank(task_index_prev)
+    # time_to_close_percentile is inverted (1 - percent_rank) so that a higher percentile means a faster close
  ) %>%
  ungroup()
+
+ggplot(combined_task_df, aes(x = author_task_percentile, y =priority_score, color = source)) +
+  geom_point(alpha = 0.6) +                # Points, with some transparency
+  geom_smooth(method = "loess", se = TRUE) + # LOESS curve with confidence band
+  theme_minimal() +
+  facet_grid(source ~ author_closer)
+
 library(ggdist)
-ggplot(combined_df, aes(x = week_index, y = modal_subset_count, color = source, linetype=AuthorWMFAffil)) +
-  geom_point(alpha=0.1) +             # Points, with some transparency
-  geom_smooth(method = "loess", se = FALSE) + 
-  theme_minimal()      
+
+ggplot(combined_task_df, aes(x=phase, y=comment_count_percentile)) +
+  stat_slabinterval() +
+  theme_minimal()+
+  facet_grid(source ~ AuthorWMFAffil)


+closed_combined_task_df <- combined_task_df |>
+  filter(!is.na(closed_relevance))

-combined_task_df_subset <- subset(combined_task_df, time_to_close_hours < 1000)
-
-ggplot(combined_task_df_subset, aes(x = TaskPHID_count, y = task_index, color = source)) +
-  geom_smooth(method = "loess", se = TRUE) + 
-  geom_point(alpha=0.1) +   
-  theme_minimal()
+ggplot(combined_task_df, aes(x=time_to_close_percentile, y=priority_score)) + 
+  geom_point(alpha = 0.6) + # Points, with some transparency
+  geom_smooth(method = "loess", se = TRUE) + # LOESS curve with confidence band
+  theme_minimal()+
+  facet_grid(source ~ author_closer)
--- a/phab_data_exploration.R
+++ b/phab_data_exploration.R
@ -1,4 +0,0 @@
-library(tidyverse)
-
-c1_phab <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv "
-c1_phab_df <- read.csv(c1_count , header = TRUE) 
--- a/plots/070525-d1-m2-commits-faceted.png
+++ b/plots/070525-d1-m2-commits-faceted.png
--- a/plots/070525-d1-m2-tasks-faceted.png
+++ b/plots/070525-d1-m2-tasks-faceted.png
--- a/ww-figures/.ipynb_checkpoints/ww-c2c3-relevance-viz-checkpoint.png
+++ b/ww-figures/.ipynb_checkpoints/ww-c2c3-relevance-viz-checkpoint.png
Author	SHA1	Message	Date
Matthew Gaughan	55964c754b	updating with new EDA	2025-07-07 13:08:58 -07:00
Matthew Gaughan	067fd08dd4	some tidying up following m2 figure creation, more needed	2025-07-07 10:44:33 -07:00