updated preliminary phabricator EDA with things re: longitudinal data
This commit is contained in:
parent
2af7983fdb
commit
edcb174d42
@ -9,6 +9,11 @@ c2_input_df <- read.csv(c2_count , header = TRUE)
|
|||||||
c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned_phab.csv"
|
c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/062725_c3_cleaned_phab.csv"
|
||||||
c3_input_df <- read.csv(c3_count , header = TRUE)
|
c3_input_df <- read.csv(c3_count , header = TRUE)
|
||||||
|
|
||||||
|
#getting the relative weeks to the publication date
|
||||||
|
# Whole weeks elapsed between `date` and `ref_date` (floor division by 7 days).
#
# Args:
#   date:     Date, POSIXct, or numeric epoch seconds (vectorised). The case
#             pipelines in this script store `date_created` as numeric seconds
#             since 1970-01-01 UTC, so that form is handled explicitly.
#   ref_date: reference date in any of the same forms; week 0 starts here.
#
# Returns:
#   Integer vector: 0 for the first 7 days on/after `ref_date`, -1 for the
#   7 days before it, and so on.
relative_week <- function(date, ref_date) {
  # as.POSIXct(<numeric>) without `origin` is an error before R 4.3, so the
  # epoch is spelled out; everything is pinned to UTC to match the rest of the
  # script and keep the result independent of the session time zone.
  to_utc <- function(x) {
    if (is.numeric(x)) {
      as.POSIXct(x, origin = "1970-01-01", tz = "UTC")
    } else {
      as.POSIXct(x, tz = "UTC")
    }
  }
  elapsed_days <- as.numeric(
    difftime(to_utc(date), to_utc(ref_date), units = "days")
  )
  # %/% floors, so dates before ref_date get negative week indices.
  as.integer(elapsed_days %/% 7)
}
|
||||||
|
|
||||||
#phase of feature deployments
|
#phase of feature deployments
|
||||||
# pre opt-in (0)
|
# pre opt-in (0)
|
||||||
# opt-in beta (1)
|
# opt-in beta (1)
|
||||||
@ -18,7 +23,7 @@ c3_input_df <- read.csv(c3_count , header = TRUE)
|
|||||||
# opt-in = as.Date("2012-12-11")
|
# opt-in = as.Date("2012-12-11")
|
||||||
# deployment announcement = as.Date("2013-06-06")
|
# deployment announcement = as.Date("2013-06-06")
|
||||||
# deployment_date <- as.Date("2013-07-01")
|
# deployment_date <- as.Date("2013-07-01")
|
||||||
|
library(dplyr)
|
||||||
c1_input_df <- c1_input_df |>
|
c1_input_df <- c1_input_df |>
|
||||||
mutate(date_created = as.numeric(as.POSIXct(date_created, tz = "UTC"))) |>
|
mutate(date_created = as.numeric(as.POSIXct(date_created, tz = "UTC"))) |>
|
||||||
mutate(source = "c1") |>
|
mutate(source = "c1") |>
|
||||||
@ -27,7 +32,8 @@ c1_input_df <- c1_input_df |>
|
|||||||
date_created >= as.numeric(as.POSIXct("2012-12-11", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2013-06-06", tz = "UTC")) ~ 1, # opt-in beta
|
date_created >= as.numeric(as.POSIXct("2012-12-11", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2013-06-06", tz = "UTC")) ~ 1, # opt-in beta
|
||||||
date_created >= as.numeric(as.POSIXct("2013-06-06", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2013-07-01", tz = "UTC")) ~ 2, # post-announcement pre-deployment
|
date_created >= as.numeric(as.POSIXct("2013-06-06", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2013-07-01", tz = "UTC")) ~ 2, # post-announcement pre-deployment
|
||||||
date_created >= as.numeric(as.POSIXct("2013-07-01", tz = "UTC"))~ 3 # post-deployment opt-out
|
date_created >= as.numeric(as.POSIXct("2013-07-01", tz = "UTC"))~ 3 # post-deployment opt-out
|
||||||
))
|
)) |>
|
||||||
|
mutate(week_index = relative_week(date_created, as.Date("2013-07-01")))
|
||||||
|
|
||||||
|
|
||||||
# c2 key dates
|
# c2 key dates
|
||||||
@ -43,7 +49,8 @@ c2_input_df <- c2_input_df |>
|
|||||||
date_created >= as.numeric(as.POSIXct("2011-10-03", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2013-08-01", tz = "UTC")) ~ 1, # opt-in beta
|
date_created >= as.numeric(as.POSIXct("2011-10-03", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2013-08-01", tz = "UTC")) ~ 1, # opt-in beta
|
||||||
date_created >= as.numeric(as.POSIXct("2013-08-01", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2013-08-28", tz = "UTC")) ~ 2, # post-announcement pre-deployment
|
date_created >= as.numeric(as.POSIXct("2013-08-01", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2013-08-28", tz = "UTC")) ~ 2, # post-announcement pre-deployment
|
||||||
date_created >= as.numeric(as.POSIXct("2013-08-28", tz = "UTC"))~ 3 # post-deployment opt-out
|
date_created >= as.numeric(as.POSIXct("2013-08-28", tz = "UTC"))~ 3 # post-deployment opt-out
|
||||||
))
|
)) |>
|
||||||
|
mutate(week_index = relative_week(date_created, as.Date("2013-08-28")))
|
||||||
|
|
||||||
# c3 key dates
|
# c3 key dates
|
||||||
# opt-in = as.Date("2013-08-01")
|
# opt-in = as.Date("2013-08-01")
|
||||||
@ -57,18 +64,90 @@ c3_input_df <- c3_input_df %>%
|
|||||||
date_created >= as.numeric(as.POSIXct("2013-08-01", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2015-06-12", tz = "UTC")) ~ 1, # opt-in beta
|
date_created >= as.numeric(as.POSIXct("2013-08-01", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2015-06-12", tz = "UTC")) ~ 1, # opt-in beta
|
||||||
date_created >= as.numeric(as.POSIXct("2015-06-12", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2015-07-02", tz = "UTC")) ~ 2, # post-announcement pre-deployment
|
date_created >= as.numeric(as.POSIXct("2015-06-12", tz = "UTC")) & date_created < as.numeric(as.POSIXct("2015-07-02", tz = "UTC")) ~ 2, # post-announcement pre-deployment
|
||||||
date_created >= as.numeric(as.POSIXct("2015-07-02", tz = "UTC"))~ 3 # post-deployment opt-out
|
date_created >= as.numeric(as.POSIXct("2015-07-02", tz = "UTC"))~ 3 # post-deployment opt-out
|
||||||
))
|
)) |>
|
||||||
|
mutate(week_index = relative_week(date_created, as.Date("2015-07-02")))
|
||||||
|
|
||||||
# Combine the dataframes into one
|
# Combine the dataframes into one
|
||||||
combined_df <- bind_rows(c1_input_df, c2_input_df, c3_input_df)
|
combined_df <- bind_rows(c1_input_df, c2_input_df, c3_input_df)
|
||||||
|
|
||||||
|
# Word lists used for the per-comment lexical features below.
modal_verbs <- c("can", "could", "may", "might", "must", "shall", "should", "will", "would", "ought")
modal_subset <- c('should', 'ought', 'must')
whatever_subset <- c('user')

# Per-comment features on the combined data:
#   * task_index_prev / comment_index_prev: how many task descriptions /
#     sub-comments the same author (within the same source) had already
#     written before this row, ordered by date_created.
#   * modal_verb_count / modal_subset_count / user_count: number of whole-word
#     occurrences of the listed words in the lower-cased comment text.
# Finally keeps only rows with week_index <= 13.
combined_df <- combined_df |>
  group_by(AuthorPHID, source) |>
  arrange(date_created, .by_group = TRUE) |>
  mutate(
    # running total minus the current row = count of strictly earlier rows
    task_index_prev = cumsum(comment_type == "task_description") - (comment_type == "task_description"),
    comment_index_prev = cumsum(comment_type == "task_subcomment") - (comment_type == "task_subcomment")
  ) |>
  ungroup() |>
  mutate(
    # str_count() returns the number of matches per string. The previous
    # rowwise sum(str_detect(...)) could only ever yield 0 or 1. It is
    # vectorised, so rowwise() is no longer needed. stringr is namespaced
    # because library(stringr) is only attached further down the script.
    modal_verb_count = stringr::str_count(
      stringr::str_to_lower(comment_text),
      paste0("\\b", modal_verbs, "\\b", collapse = "|")
    ),
    modal_subset_count = stringr::str_count(
      stringr::str_to_lower(comment_text),
      paste0("\\b", modal_subset, "\\b", collapse = "|")
    ),
    user_count = stringr::str_count(
      stringr::str_to_lower(comment_text),
      paste0("\\b", whatever_subset, "\\b", collapse = "|")
    )
  ) |>
  filter(week_index <= 13)
|
||||||
|
|
||||||
|
|
||||||
# Task-level table: one row per task description, carrying
#   * TaskPHID_count: number of rows in combined_df sharing the TaskPHID
#     (counted before the sub-comments are filtered out),
#   * time_to_close / time_to_close_hours: close time minus creation time,
#   * task_index: running number of the author's tasks within a source,
#     ordered by date_created.
combined_task_df <- combined_df |>
  add_count(TaskPHID, name = "TaskPHID_count") |>
  filter(comment_type == "task_description") |>
  mutate(time_to_close = date_closed - date_created) |>
  mutate(
    time_to_close_hours = as.numeric(
      difftime(date_closed, date_created, units = "hours")
    )
  ) |>
  group_by(AuthorPHID, source) |>
  arrange(date_created, .by_group = TRUE) |>
  mutate(task_index = seq_len(n())) |>
  ungroup()
|
||||||
|
|
||||||
ggplot(combined_task_df, aes(x = priority_score, y = phase, color = source)) +
|
ggplot(combined_task_df, aes(x = week_index, y = priority_score, color = source)) +
|
||||||
geom_point(alpha = 0.6) + # Points, with some transparency
|
geom_point(alpha = 0.6) + # Points, with some transparency
|
||||||
geom_smooth(method = "loess", se = TRUE) + # LOESS curve with its confidence band (se = TRUE)
|
geom_smooth(method = "loess", se = TRUE) + # LOESS curve with its confidence band (se = TRUE)
|
||||||
theme_minimal()
|
theme_minimal()
|
||||||
|
|
||||||
|
library(stringr)
|
||||||
|
|
||||||
|
# 1. Count modal verbs (and the other word lists) in each task's comment_text.
# str_count() gives the number of whole-word matches per row. The previous
# rowwise sum(str_detect(...)) collapsed every row to 0 or 1, so the
# "*_count" columns were presence flags rather than counts. It is vectorised,
# so rowwise()/ungroup() are not needed.
combined_task_df <- combined_task_df |>
  mutate(
    modal_verb_count = str_count(
      str_to_lower(comment_text),
      paste0("\\b", modal_verbs, "\\b", collapse = "|")
    ),
    modal_subset_count = str_count(
      str_to_lower(comment_text),
      paste0("\\b", modal_subset, "\\b", collapse = "|")
    ),
    user_count = str_count(
      str_to_lower(comment_text),
      paste0("\\b", whatever_subset, "\\b", collapse = "|")
    )
  )
|
||||||
|
library(ggdist)
|
||||||
|
# Modal-verb count per comment against week_index, coloured by source and
# with one smoothed line type per AuthorWMFAffil value.
ggplot(
  data = combined_df,
  mapping = aes(
    x = week_index,
    y = modal_verb_count,
    color = source,
    linetype = AuthorWMFAffil
  )
) +
  geom_point(alpha = 0.1) + # very faint points so overplotted regions stay readable
  geom_smooth(method = "loess", se = FALSE) + # LOESS trend, confidence band off
  theme_minimal()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Restrict to tasks closed in under 1000 hours; which() drops rows whose
# time_to_close_hours is NA, as subset() does.
combined_task_df_subset <- combined_task_df[
  which(combined_task_df$time_to_close_hours < 1000),
  ,
  drop = FALSE
]

# Author's running task number against the row count for the task's TaskPHID,
# by source.
ggplot(
  data = combined_task_df_subset,
  mapping = aes(x = TaskPHID_count, y = task_index, color = source)
) +
  geom_smooth(method = "loess", se = TRUE) + # LOESS trend with confidence band
  geom_point(alpha = 0.1) +
  theme_minimal()
|
||||||
|
Loading…
Reference in New Issue
Block a user