# mw-lifecycle-analysis/p2/p2_EDA/phab_weekly_bins.R

library(tidyverse)

# Locations of the input CSVs for the three cases (c1, c2, c3).
c1_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/062725_c1_cleaned_phab.csv"
c2_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/070125_c2_title_cleaned.csv"
c3_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/070125_c3_title_cleaned.csv"

# Load each case into its own data frame; the first row of every file is
# treated as the column header.
c1_input_df <- read.csv(file = c1_count, header = TRUE)
c2_input_df <- read.csv(file = c2_count, header = TRUE)
c3_input_df <- read.csv(file = c3_count, header = TRUE)


# Whole weeks elapsed between `date` and `ref_date` (the publication /
# deployment date).
#
# date      Date, POSIXct, or numeric epoch seconds (UTC); may be a vector.
# ref_date  Date, POSIXct, or numeric epoch seconds (UTC).
#
# Returns an integer vector: 0 for the first 7 days starting at ref_date,
# 1 for the next 7, -1 for the 7 days before ref_date, and so on (floor
# division, so negative offsets round towards -Inf). NA inputs give NA.
relative_week <- function(date, ref_date) {
  # Bare numerics are taken as seconds since 1970-01-01 UTC. Converting them
  # here avoids relying on difftime() -> as.POSIXct(<numeric>) having a
  # default `origin`, which only exists from R 4.3 onwards.
  to_instant <- function(x) {
    if (is.numeric(x)) {
      as.POSIXct(x, origin = "1970-01-01", tz = "UTC")
    } else {
      x
    }
  }

  elapsed_days <- as.numeric(
    difftime(to_instant(date), to_instant(ref_date), units = "days")
  )
  as.integer(elapsed_days %/% 7)
}

# Phases of a feature deployment, stored in the `phase` column:
#   0 = pre opt-in
#   1 = opt-in beta
#   2 = post-announcement, pre-deployment
#   3 = post-deployment opt-out
library(dplyr)

# Convert a "YYYY-MM-DD" string to epoch seconds at midnight UTC.
utc_epoch <- function(day) {
  as.numeric(as.POSIXct(day, tz = "UTC"))
}

# Add the per-case derived columns to one case's data frame.
#
# df            data frame with date_created, date_closed, AuthorPHID and
#               CloserPHID columns
# case_label    value written to the `source` column ("c1", "c2", "c3")
# opt_in, announcement, deployment
#               "YYYY-MM-DD" key dates separating the four phases
# close_cutoff  "YYYY-MM-DD" date that date_closed is compared against
#
# Returns df with date_created converted to epoch seconds (UTC) plus the
# columns source, phase, author_closer, same_author, closed_relevance and
# week_index (whole weeks relative to the deployment date).
annotate_case <- function(df, case_label, opt_in, announcement, deployment,
                          close_cutoff) {
  opt_in_ts <- utc_epoch(opt_in)
  announcement_ts <- utc_epoch(announcement)
  deployment_ts <- utc_epoch(deployment)
  close_cutoff_ts <- utc_epoch(close_cutoff)

  # `.env$` pins these lookups to the function's locals so a same-named
  # column in `df` can never shadow them.
  df |>
    mutate(
      date_created = as.numeric(as.POSIXct(date_created, tz = "UTC")),
      source = .env$case_label,
      # case_when() takes the first matching branch, so each branch only
      # needs its upper bound; a missing date_created yields NA.
      phase = case_when(
        date_created < .env$opt_in_ts ~ 0,        # pre opt-in
        date_created < .env$announcement_ts ~ 1,  # opt-in beta
        date_created < .env$deployment_ts ~ 2,    # post-announcement pre-deployment
        date_created >= .env$deployment_ts ~ 3    # post-deployment opt-out
      ),
      # TRUE when the author appears anywhere in this case's CloserPHID column
      author_closer = AuthorPHID %in% CloserPHID,
      # TRUE when this row's author and closer are the same account
      same_author = AuthorPHID == CloserPHID,
      # NOTE(review): date_closed is compared as-is, so it is assumed to
      # already hold numeric epoch seconds -- confirm against the CSVs.
      closed_relevance = date_closed <= .env$close_cutoff_ts,
      week_index = relative_week(date_created, as.Date(.env$deployment))
    )
}

# c1 key dates
# opt-in = as.Date("2012-12-11")
# deployment announcement = as.Date("2013-06-06")
# deployment_date <- as.Date("2013-07-01")
c1_input_df <- annotate_case(
  c1_input_df,
  case_label = "c1",
  opt_in = "2012-12-11",
  announcement = "2013-06-06",
  deployment = "2013-07-01",
  close_cutoff = "2013-10-01"
)

# c2 key dates
# opt-in = as.Date("2011-10-03")
# deployment announcement = as.Date("2013-08-01")
# deployment_date <- as.Date("2013-08-28")
c2_input_df <- annotate_case(
  c2_input_df,
  case_label = "c2",
  opt_in = "2011-10-03",
  announcement = "2013-08-01",
  deployment = "2013-08-28",
  close_cutoff = "2013-11-27"
)

# c3 key dates
# opt-in = as.Date("2013-08-01")
# deployment announcement = as.Date("2015-06-12")
# deployment_date <- as.Date("2015-07-02")
c3_input_df <- annotate_case(
  c3_input_df,
  case_label = "c3",
  opt_in = "2013-08-01",
  announcement = "2015-06-12",
  deployment = "2015-07-02",
  close_cutoff = "2015-10-02"
)

# Stack the three annotated case data frames into a single table; the
# `source` column records which case each row came from.
combined_df <- bind_rows(list(c1_input_df, c2_input_df, c3_input_df))

#write.csv(combined_df, "~/p2/071425_master_discussion_data.csv", row.names = FALSE)

# Word lists used for the per-comment lexical features.
modal_verbs <- c(
  "can", "could", "may", "might", "must",
  "shall", "should", "will", "would", "ought"
)
modal_subset <- c("should", "ought", "must")
whatever_subset <- c("user")

# Count whole-word, case-insensitive occurrences of any of `words` in each
# element of `text`.
#
# text   character vector
# words  character vector of lower-case words to look for
#
# Returns an integer vector the same length as `text` (NA where text is NA).
count_whole_words <- function(text, words) {
  pattern <- paste0("\\b", words, "\\b", collapse = "|")
  str_count(str_to_lower(text), pattern)
}

combined_df <- combined_df |>
  group_by(AuthorPHID, source) |>
  arrange(date_created, .by_group = TRUE) |>
  mutate(
    # Running totals minus the current row: how many task descriptions and
    # sub-comments this author had already written (within this source)
    # before the current record.
    task_index_prev = cumsum(comment_type == "task_description") -
      (comment_type == "task_description"),
    comment_index_prev = cumsum(comment_type == "task_subcomment") -
      (comment_type == "task_subcomment"),
    author_prior_phab_contrib = task_index_prev + comment_index_prev
  ) |>
  ungroup() |>
  mutate(
    # The previous rowwise sum(str_detect(<one collapsed regex>)) could only
    # ever yield 0 or 1; str_count() gives the actual number of matches and
    # is vectorised, so no rowwise() pass is needed.
    modal_verb_count = count_whole_words(comment_text, modal_verbs),
    modal_subset_count = count_whole_words(comment_text, modal_subset),
    user_count = count_whole_words(comment_text, whatever_subset)
  ) |>
  # Keep records created up to and including relative week 13.
  filter(week_index <= 13)


# One row per task: keep only the task-description records, carrying along
# how many records (description plus comments) each task has in combined_df.
combined_task_df <- combined_df |>
  # Counted before filtering so that sub-comments are included in the total.
  add_count(TaskPHID, name = "task_event_comment_count") |>
  filter(comment_type == "task_description") |>
  mutate(
    # date_created holds epoch seconds; date_closed is presumably in the same
    # unit (it is subtracted directly), making this a duration in seconds.
    time_to_close = date_closed - date_created,
    # Derived straight from the seconds difference rather than through
    # difftime(), which for bare numerics needs the default `origin` that
    # as.POSIXct() only gained in R 4.3. The value is numerically identical.
    time_to_close_hours = time_to_close / 3600
  ) |>
  group_by(AuthorPHID, source) |>
  arrange(date_created, .by_group = TRUE) |>
  # 1-based position of the task among the author's tasks in this source,
  # ordered by creation time.
  mutate(author_task_index = row_number()) |>
  ungroup()

library(dplyr)

# Within-source percentile ranks (0 to 1) for the task-level measures.
combined_task_df <- combined_task_df |>
  group_by(source) |>
  mutate(
    # Inverted so that a higher percentile means a faster close.
    time_to_close_percentile = 1 - percent_rank(time_to_close_hours),
    comment_count_percentile = percent_rank(task_event_comment_count),
    author_task_percentile = percent_rank(task_index_prev)
  ) |>
  ungroup()

# Priority score against the author's prior-task percentile, with one panel
# per source (rows) and author_closer value (columns).
combined_task_df |>
  ggplot(aes(x = author_task_percentile, y = priority_score, color = source)) +
  geom_point(alpha = 0.6) +                  # semi-transparent points
  geom_smooth(method = "loess", se = TRUE) + # LOESS curve with confidence band
  facet_grid(rows = vars(source), cols = vars(author_closer)) +
  theme_minimal()

library(ggdist)

# Distribution of the comment-count percentile in each deployment phase, with
# one panel per source (rows) and AuthorWMFAffil value (columns).
combined_task_df |>
  ggplot(aes(x = phase, y = comment_count_percentile)) +
  stat_slabinterval() +
  facet_grid(rows = vars(source), cols = vars(AuthorWMFAffil)) +
  theme_minimal()


# Tasks with a non-missing closed_relevance flag. The flag comes from a
# comparison on date_closed, so it is missing when date_closed is missing.
closed_combined_task_df <- combined_task_df |>
  filter(!is.na(closed_relevance))

# Priority score against the time-to-close percentile, with one panel per
# source (rows) and author_closer value (columns). This plots the filtered
# frame built above, which was previously created but never used. Percentiles
# were computed before filtering, so the plotted values are unchanged.
ggplot(
  closed_combined_task_df,
  aes(x = time_to_close_percentile, y = priority_score)
) +
  geom_point(alpha = 0.6) +                  # semi-transparent points
  geom_smooth(method = "loess", se = TRUE) + # LOESS curve with confidence band
  theme_minimal() +
  facet_grid(source ~ author_closer)