Compare commits
No commits in common. "55964c754bea9ea9a4e8ef7f0b16cb933b5dc097" and "7966e9212573d88f197cbda4171f2aa3ead44165" have entirely different histories.
55964c754b
...
7966e92125
Before Width: | Height: | Size: 698 KiB After Width: | Height: | Size: 698 KiB |
Before Width: | Height: | Size: 747 KiB After Width: | Height: | Size: 747 KiB |
@ -1,7 +1,3 @@
|
|||||||
# mw-lifecycle-analysis
|
# mw-lifecycle-analysis
|
||||||
|
|
||||||
Analysis scripts and code for studying the deployment processes of three MediaWiki/Wikimedia features (2013-2015)
|
Analysis scripts and code for studying lifecycles of MediaWiki projects
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
45
govdoc-cr-age-dist.R
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
# Distribution of `age_at_commit` for CONTRIBUTING vs. README documents,
# drawn as a narrow boxplot with a dot plot beside it (raincloud-style).
# NOTE(review): the unit of `age_at_commit` is not visible here; the dot-plot
# `binwidth` below is expressed in that unit -- confirm it is sensible.

library(dplyr)
library(tidyverse)
library(tidyquant)
library(ggdist)
library(ggthemes)
library(ggplot2)

contributing_df_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/CONTRIBUTING_weekly_count_data.csv"
contributing_df <- read.csv(contributing_df_filepath, header = TRUE)

readme_df_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/README_weekly_count_data.csv"
readme_df <- read.csv(readme_df_filepath, header = TRUE)

# Both halves use the same factor levels so `document` stays a factor once the
# two data frames are stacked.
document_levels <- c("CONTRIBUTING", "README")

# Keep only the columns needed for the plot and tag each row with its source
# document. No grouping is applied: none of the steps below aggregate.
combined_df <- bind_rows(
  contributing_df %>%
    select(project_id, age_at_commit) %>%
    mutate(document = factor("CONTRIBUTING", levels = document_levels)),
  readme_df %>%
    select(project_id, age_at_commit) %>%
    mutate(document = factor("README", levels = document_levels))
)

# Collapse repeated (project, age, document) rows to a single observation.
unique_combined_df <- combined_df %>%
  distinct(project_id, age_at_commit, document)

age_raincloud <- unique_combined_df |>
  ggplot(aes(x = factor(document), y = age_at_commit, fill = factor(document))) +
  geom_boxplot(
    width = 0.12,
    # hide the outlier points (they still contribute to the box statistics)
    outlier.color = NA,
    alpha = 0.5
  ) +
  # stat_dots() is exported by ggdist; there is no package named "ggplot"
  ggdist::stat_dots(
    # plotting on left side
    side = "left",
    # adjusting position
    justification = 1.1,
    # adjust grouping (binning) of observations
    binwidth = 0.25
  )

age_raincloud
|
Before Width: | Height: | Size: 781 KiB After Width: | Height: | Size: 781 KiB |
Before Width: | Height: | Size: 774 KiB After Width: | Height: | Size: 774 KiB |
Before Width: | Height: | Size: 743 KiB After Width: | Height: | Size: 743 KiB |
@ -1,18 +0,0 @@
|
|||||||
1. SSH tunnel from your workstation using the following command:
|
|
||||||
|
|
||||||
ssh -N -L 8787:n3439:50819 mjilg@klone.hyak.uw.edu
|
|
||||||
|
|
||||||
and point your web browser to http://localhost:8787
|
|
||||||
|
|
||||||
2. log in to RStudio Server using the following credentials:
|
|
||||||
|
|
||||||
user: mjilg
|
|
||||||
password: lM83HdgeT310p2tkyoCk
|
|
||||||
|
|
||||||
When done using RStudio Server, terminate the job by:
|
|
||||||
|
|
||||||
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
|
|
||||||
2. Issue the following command on the login node:
|
|
||||||
|
|
||||||
scancel -f 27419348
|
|
||||||
slurmstepd: error: *** JOB 27419348 ON n3439 CANCELLED AT 2025-07-07T13:08:38 ***
|
|
@ -34,9 +34,6 @@ c1_input_df <- c1_input_df |>
|
|||||||
date_created >= as.numeric(as.POSIXct("2013-06-06", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2013-07-01", tz = "UTC")) ~ 2, # post-announcement pre-deployment
|
date_created >= as.numeric(as.POSIXct("2013-06-06", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2013-07-01", tz = "UTC")) ~ 2, # post-announcement pre-deployment
|
||||||
date_created >= as.numeric(as.POSIXct("2013-07-01", tz = "UTC"))~ 3 # post-deployment opt-out
|
date_created >= as.numeric(as.POSIXct("2013-07-01", tz = "UTC"))~ 3 # post-deployment opt-out
|
||||||
)) |>
|
)) |>
|
||||||
mutate(author_closer = AuthorPHID %in% CloserPHID,
|
|
||||||
same_author = AuthorPHID == CloserPHID) |>
|
|
||||||
mutate(closed_relevance = date_closed <= as.numeric(as.POSIXct("2013-10-01", tz = "UTC"))) |>
|
|
||||||
mutate(week_index = relative_week(date_created, as.Date("2013-07-01")))
|
mutate(week_index = relative_week(date_created, as.Date("2013-07-01")))
|
||||||
|
|
||||||
|
|
||||||
@ -54,9 +51,6 @@ c2_input_df <- c2_input_df |>
|
|||||||
date_created >= as.numeric(as.POSIXct("2013-08-01", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2013-08-28", tz = "UTC")) ~ 2, # post-announcement pre-deployment
|
date_created >= as.numeric(as.POSIXct("2013-08-01", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2013-08-28", tz = "UTC")) ~ 2, # post-announcement pre-deployment
|
||||||
date_created >= as.numeric(as.POSIXct("2013-08-28", tz = "UTC"))~ 3 # post-deployment opt-out
|
date_created >= as.numeric(as.POSIXct("2013-08-28", tz = "UTC"))~ 3 # post-deployment opt-out
|
||||||
)) |>
|
)) |>
|
||||||
mutate(author_closer = AuthorPHID %in% CloserPHID,
|
|
||||||
same_author = AuthorPHID == CloserPHID) |>
|
|
||||||
mutate(closed_relevance = date_closed <= as.numeric(as.POSIXct("2013-11-27", tz = "UTC"))) |>
|
|
||||||
mutate(week_index = relative_week(date_created, as.Date("2013-08-28")))
|
mutate(week_index = relative_week(date_created, as.Date("2013-08-28")))
|
||||||
|
|
||||||
# c3 key dates
|
# c3 key dates
|
||||||
@ -72,9 +66,6 @@ c3_input_df <- c3_input_df %>%
|
|||||||
date_created >= as.numeric(as.POSIXct("2015-06-12", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2015-07-02", tz = "UTC")) ~ 2, # post-announcement pre-deployment
|
date_created >= as.numeric(as.POSIXct("2015-06-12", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2015-07-02", tz = "UTC")) ~ 2, # post-announcement pre-deployment
|
||||||
date_created >= as.numeric(as.POSIXct("2015-07-02", tz = "UTC"))~ 3 # post-deployment opt-out
|
date_created >= as.numeric(as.POSIXct("2015-07-02", tz = "UTC"))~ 3 # post-deployment opt-out
|
||||||
)) |>
|
)) |>
|
||||||
mutate(author_closer = AuthorPHID %in% CloserPHID,
|
|
||||||
same_author = AuthorPHID == CloserPHID) |>
|
|
||||||
mutate(closed_relevance = date_closed <= as.numeric(as.POSIXct("2015-10-02", tz = "UTC"))) |>
|
|
||||||
mutate(week_index = relative_week(date_created, as.Date("2015-07-02")))
|
mutate(week_index = relative_week(date_created, as.Date("2015-07-02")))
|
||||||
|
|
||||||
# Combine the dataframes into one
|
# Combine the dataframes into one
|
||||||
@ -89,8 +80,7 @@ combined_df <- combined_df %>%
|
|||||||
arrange(date_created, .by_group = TRUE) %>%
|
arrange(date_created, .by_group = TRUE) %>%
|
||||||
mutate(
|
mutate(
|
||||||
task_index_prev = cumsum(comment_type == "task_description") - (comment_type == "task_description"),
|
task_index_prev = cumsum(comment_type == "task_description") - (comment_type == "task_description"),
|
||||||
comment_index_prev = cumsum(comment_type == "task_subcomment") - (comment_type == "task_subcomment"),
|
comment_index_prev = cumsum(comment_type == "task_subcomment") - (comment_type == "task_subcomment")
|
||||||
author_prior_phab_contrib = task_index_prev + comment_index_prev
|
|
||||||
) %>%
|
) %>%
|
||||||
ungroup() |>
|
ungroup() |>
|
||||||
rowwise() %>%
|
rowwise() %>%
|
||||||
@ -113,47 +103,52 @@ combined_df <- combined_df %>%
|
|||||||
|
|
||||||
|
|
||||||
combined_task_df <- combined_df %>%
|
combined_task_df <- combined_df %>%
|
||||||
add_count(TaskPHID, name = "task_event_comment_count") |>
|
add_count(TaskPHID, name = "TaskPHID_count") |>
|
||||||
filter(comment_type == "task_description") |>
|
filter(comment_type == "task_description") |>
|
||||||
mutate(time_to_close = date_closed - date_created,
|
mutate(time_to_close = date_closed - date_created,
|
||||||
time_to_close_hours = as.numeric(difftime(date_closed, date_created, units = "hours"))
|
time_to_close_hours = as.numeric(difftime(date_closed, date_created, units = "hours"))
|
||||||
) |>
|
) |>
|
||||||
group_by(AuthorPHID, source) %>%
|
group_by(AuthorPHID, source) %>%
|
||||||
arrange(date_created, .by_group = TRUE) %>% # recommended: order by date_created
|
arrange(date_created, .by_group = TRUE) %>% # recommended: order by date_created
|
||||||
mutate(author_task_index = row_number()) %>%
|
mutate(task_index = row_number()) %>%
|
||||||
ungroup()
|
ungroup()
|
||||||
|
|
||||||
library(dplyr)
|
ggplot(combined_task_df, aes(x = week_index, y = priority_score, color = source)) +
|
||||||
|
|
||||||
combined_task_df <- combined_task_df |>
|
|
||||||
group_by(source) %>%
|
|
||||||
mutate(
|
|
||||||
time_to_close_percentile = 1- percent_rank(time_to_close_hours),
|
|
||||||
comment_count_percentile = percent_rank(task_event_comment_count),
|
|
||||||
author_task_percentile = percent_rank(task_index_prev)
|
|
||||||
# inverting it so that higher percentile is faster
|
|
||||||
) %>%
|
|
||||||
ungroup()
|
|
||||||
|
|
||||||
ggplot(combined_task_df, aes(x = author_task_percentile, y =priority_score, color = source)) +
|
|
||||||
geom_point(alpha = 0.6) + # Points, with some transparency
|
geom_point(alpha = 0.6) + # Points, with some transparency
|
||||||
geom_smooth(method = "loess", se = TRUE) + # LOESS curve, no confidence band
|
geom_smooth(method = "loess", se = TRUE) + # LOESS curve, no confidence band
|
||||||
theme_minimal() +
|
theme_minimal()
|
||||||
facet_grid(source ~ author_closer)
|
|
||||||
|
|
||||||
|
library(stringr)
|
||||||
|
|
||||||
|
# 1. Count modal verbs in each task comment_text
|
||||||
|
# Each word list (`modal_verbs`, `modal_subset`, `whatever_subset`) is
# collapsed into one alternation regex with word boundaries on every term.
# str_count() tallies every match in the lower-cased comment text; the earlier
# sum(str_detect(...)) against a single collapsed pattern could only ever
# return 0 or 1 per row. str_count() is vectorised over rows, so rowwise() is
# not needed; the trailing ungroup() keeps the result ungrouped as before.
combined_task_df <- combined_task_df %>%
  mutate(
    modal_verb_count = str_count(
      str_to_lower(comment_text),
      paste0("\\b", modal_verbs, "\\b", collapse = "|")
    ),
    modal_subset_count = str_count(
      str_to_lower(comment_text),
      paste0("\\b", modal_subset, "\\b", collapse = "|")
    ),
    user_count = str_count(
      str_to_lower(comment_text),
      paste0("\\b", whatever_subset, "\\b", collapse = "|")
    )
  ) %>%
  ungroup()
|
||||||
library(ggdist)
|
library(ggdist)
|
||||||
|
ggplot(combined_df, aes(x = week_index, y = modal_subset_count, color = source, linetype=AuthorWMFAffil)) +
|
||||||
ggplot(combined_task_df, aes(x=phase, y=comment_count_percentile)) +
|
geom_point(alpha=0.1) + # Points, with some transparency
|
||||||
stat_slabinterval() +
|
geom_smooth(method = "loess", se = FALSE) +
|
||||||
theme_minimal()+
|
theme_minimal()
|
||||||
facet_grid(source ~ AuthorWMFAffil)
|
|
||||||
|
|
||||||
|
|
||||||
closed_combined_task_df <- combined_task_df |>
|
|
||||||
filter(!is.na(closed_relevance))
|
|
||||||
|
|
||||||
ggplot(combined_task_df, aes(x=time_to_close_percentile, y=priority_score)) +
|
combined_task_df_subset <- subset(combined_task_df, time_to_close_hours < 1000)
|
||||||
geom_point(alpha = 0.6) +
|
|
||||||
geom_smooth(method = "loess", se = TRUE) + # LOESS curve, no confidence band# Points, with some transparency
|
ggplot(combined_task_df_subset, aes(x = TaskPHID_count, y = task_index, color = source)) +
|
||||||
theme_minimal()+
|
geom_smooth(method = "loess", se = TRUE) +
|
||||||
facet_grid(source ~ author_closer)
|
geom_point(alpha=0.1) +
|
||||||
|
theme_minimal()
|
||||||
|
4
phab_data_exploration.R
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
library(tidyverse)

# Path to the case 1 Phabricator comments CSV. The literal previously ended in
# a stray space, which would make the file lookup fail.
c1_phab <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv"

# Read from `c1_phab`; the earlier call referenced an undefined `c1_count`.
c1_phab_df <- read.csv(c1_phab, header = TRUE)
|
Before Width: | Height: | Size: 420 KiB After Width: | Height: | Size: 420 KiB |
Before Width: | Height: | Size: 810 KiB After Width: | Height: | Size: 810 KiB |
Before Width: | Height: | Size: 670 KiB After Width: | Height: | Size: 670 KiB |
Before Width: | Height: | Size: 787 KiB After Width: | Height: | Size: 787 KiB |
Before Width: | Height: | Size: 734 KiB After Width: | Height: | Size: 734 KiB |
Before Width: | Height: | Size: 391 KiB After Width: | Height: | Size: 391 KiB |
Before Width: | Height: | Size: 431 KiB After Width: | Height: | Size: 431 KiB |
Before Width: | Height: | Size: 377 KiB After Width: | Height: | Size: 377 KiB |
Before Width: | Height: | Size: 420 KiB After Width: | Height: | Size: 420 KiB |
Before Width: | Height: | Size: 381 KiB After Width: | Height: | Size: 381 KiB |
Before Width: | Height: | Size: 421 KiB After Width: | Height: | Size: 421 KiB |
Before Width: | Height: | Size: 372 KiB After Width: | Height: | Size: 372 KiB |
Before Width: | Height: | Size: 413 KiB After Width: | Height: | Size: 413 KiB |
BIN
ww-figures/ww-c2c3-relevance-viz.png
Normal file
After Width: | Height: | Size: 420 KiB |
Before Width: | Height: | Size: 425 KiB After Width: | Height: | Size: 425 KiB |
Before Width: | Height: | Size: 370 KiB After Width: | Height: | Size: 370 KiB |
Before Width: | Height: | Size: 406 KiB After Width: | Height: | Size: 406 KiB |