# mw-lifecycle-analysis/p2/p2_EDA/062325_EDA.R

library(tidyverse)

# Locations of the three per-case input CSVs (case1, case2, case3).
c1_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/062725_c1_cleaned_phab.csv"
c2_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/070125_c2_title_cleaned.csv"
c3_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/070125_c3_title_cleaned.csv"

# Read each file into its own data frame; the first row of every file is
# treated as the column header.
c1_input_df <- read.csv(file = c1_count, header = TRUE)
c2_input_df <- read.csv(file = c2_count, header = TRUE)
c3_input_df <- read.csv(file = c3_count, header = TRUE)

library(dplyr)

# Tag every row with the case it came from so the origin survives the merge.
c1_input_df <- mutate(c1_input_df, source = "c1")
c2_input_df <- mutate(c2_input_df, source = "c2")
c3_input_df <- mutate(c3_input_df, source = "c3")

# Stack the three labelled data frames into a single one.
combined_df <- bind_rows(
  c1_input_df,
  c2_input_df,
  c3_input_df
)

# Keep only the rows whose comment_type is "task_description" and derive how
# long each task stayed open: once as the raw difference of the two date
# columns and once as a plain number of hours.
# NOTE(review): read.csv() does not parse date-times, so this presumes
# date_created / date_closed arrive in a form that supports both `-` and
# difftime() (e.g. numeric timestamps) -- confirm against the CSVs.
combined_task_df <- combined_df |>
  filter(comment_type == "task_description") |>
  mutate(
    time_to_close = date_closed - date_created,
    time_to_close_hours = as.numeric(
      difftime(date_closed, date_created, units = "hours")
    )
  )


# Half-eye density of hours-to-close for each source, drawn in one panel per
# AuthorWMFAffil value (the same variable also sets the fill colour).
ggplot(
  data = combined_task_df,
  mapping = aes(x = source, y = time_to_close_hours, fill = AuthorWMFAffil)
) +
  ggdist::stat_halfeye(
    adjust = 0.5,
    width = 1.5,          # widened from the default
    scale = 8.8,          # enlarged so the density is drawn fatter
    .width = 0,           # interval width set to zero
    justification = 0,
    point_colour = NA     # no colour for the point summary
  ) +
  facet_wrap(~ AuthorWMFAffil) +
  labs(
    title = "Distribution Plot: Time to Close by AuthorWMFAffil and Source",
    x = "Source",
    y = "Time to Close (hours)"
  ) +
  theme_minimal()

# Share of each task status within every (AuthorWMFAffil, source) pair.
# Step 1 counts rows per (AuthorWMFAffil, source, status); step 2 divides each
# count by the total of its (AuthorWMFAffil, source) pair.
prop_df <- combined_task_df %>%
  group_by(AuthorWMFAffil, source, status) %>%
  summarize(n = n(), .groups = "drop") %>%
  group_by(AuthorWMFAffil, source) %>%
  mutate(prop = n / sum(n)) %>%
  # Release the grouping so later verbs on prop_df are not silently applied
  # per (AuthorWMFAffil, source) group.
  ungroup()

# Stacked bars of status shares per source, one panel per AuthorWMFAffil.
# position = "fill" rescales every bar to full height, and the y axis is
# labelled in percent.
ggplot(
  data = prop_df,
  mapping = aes(x = source, y = prop, fill = status)
) +
  geom_col(position = "fill") +
  facet_wrap(~ AuthorWMFAffil) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Proportion of Phabricator Task Status by Affiliation and Case",
    x = "Source",
    y = "Proportion",
    fill = "Status"
  ) +
  theme_minimal()

library(stringr)
# Word lists searched for in comment_text.
modal_verbs <- c(
  "can", "could", "may", "might", "must",
  "shall", "should", "will", "would", "ought"
)
modal_subset <- c("should", "ought", "must")
whatever_subset <- c("user")

# 1. Count how often words from each list occur in comment_text.
# Every list is collapsed into one whole-word alternation pattern and matched
# against the lower-cased text. str_count() is vectorised and returns the
# number of matches per row; sum(str_detect()) against a single collapsed
# pattern can only ever yield 0 or 1, i.e. a presence flag rather than a count.
# Rows with a missing comment_text get NA.
combined_task_df <- combined_task_df %>%
  # The earlier rowwise()/ungroup() round trip returned a tibble; keep that
  # class for downstream code.
  as_tibble() %>%
  mutate(
    modal_verb_count = str_count(
      str_to_lower(comment_text),
      paste0("\\b", modal_verbs, "\\b", collapse = "|")
    ),
    modal_subset_count = str_count(
      str_to_lower(comment_text),
      paste0("\\b", modal_subset, "\\b", collapse = "|")
    ),
    whatever_subset_count = str_count(
      str_to_lower(comment_text),
      paste0("\\b", whatever_subset, "\\b", collapse = "|")
    )
  )

# 3. Plot: violin of modal_verb_count per source, one panel per
# AuthorWMFAffil, with the group mean overlaid as a yellow diamond.
# The title names the plotted variable (modal_verb_count); it previously
# referred to 'user', which is the whatever_subset word list.
ggplot(combined_task_df, aes(x = source, y = modal_verb_count, fill = AuthorWMFAffil)) +
  geom_violin(trim = FALSE, position = position_dodge(width = 0.8), alpha = 0.6) +
  stat_summary(
    fun = mean,
    geom = "point",
    shape = 23,
    size = 3,
    color = "black",
    fill = "yellow",
    # Same dodge width as the violins so each mean sits on its violin.
    position = position_dodge(width = 0.8)
  ) +
  facet_wrap(~ AuthorWMFAffil) +
  labs(
    title = "Distribution and Mean of Modal Verb Count by Affiliation and Source",
    x = "Source",
    y = "Count"
  ) +
  theme_minimal()

# Add the number of characters in each comment_text as description_length.
binned_task_df <- mutate(
  combined_task_df,
  description_length = nchar(comment_text)
)

# Scatter of priority_score against hours-to-close, coloured by source, with a
# LOESS smoother per colour group.
ggplot(
  data = binned_task_df,
  mapping = aes(x = time_to_close_hours, y = priority_score, color = source)
) +
  # Semi-transparent points so overlapping observations stay visible.
  geom_point(alpha = 0.6) +
  # LOESS curve; se = TRUE draws the confidence band around it.
  geom_smooth(method = "loess", se = TRUE) +
  theme_minimal()