# mw-lifecycle-analysis/analysis_data/style_dict_variables.R

library(tidyverse)
library(stringr)
library(tidyr)
library(dplyr)
library(purrr)

# Path of the unified dataset (a CSV file whose first row holds column names).
unified_csv <- "~/analysis_data/102725_unified.csv"

# Read the whole file into a base data.frame used by everything below.
unified_df <- read.csv(file = unified_csv, header = TRUE)


# Horizontal boxplots of `leng`, one box per level of `isAuthorWMF`,
# drawn in a separate panel for each `source`.
ggplot(
  data = unified_df,
  mapping = aes(x = leng, y = as.factor(isAuthorWMF))
) +
  geom_boxplot(position = position_dodge(width = 0.9), alpha = 0.7) +
  # Facet by source; adjust as needed
  facet_grid(. ~ source, scales = "fixed") +
  scale_fill_viridis_d() +
  theme_minimal()


# Label groups used for the per-comment proportions.
BE_set <- c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")
SOL_set <- c("SOLUTION DISCUSSION", "SOLUTION USAGE")
# Both groups above plus the investigation label (the "VR" proportion).
human_VR_set <- c(BE_set, SOL_set, "INVESTIGATION AND EXPLORATION")

# Turn one `human_labels` cell into a character vector of labels.
#
# A cell is either a single bare label, which is returned unchanged, or the
# text of an R vector literal such as c("A", "B"), from which every quoted
# string is returned. The labels are extracted textually rather than by
# evaluating the cell as R code, so nothing stored in the CSV is ever executed.
# Unquoted entries inside a literal (e.g. a bare NA) are not returned.
#
# cell: a length-one character value (may be NA).
# Returns a character vector; character(0) for a literal with no quoted strings.
parse_human_labels <- function(cell) {
  if (is.na(cell)) {
    return(NA_character_)
  }
  if (!str_detect(cell, "^\\s*c\\(")) {
    return(cell)
  }
  quoted <- str_extract_all(cell, "\"[^\"]*\"|'[^']*'")[[1]]
  # Strip the surrounding quote characters from each match.
  str_sub(quoted, 2L, -2L)
}

# One row per `id` that has at least one usable human label, holding the share
# of that id's labels that fall in each label group. Ids without any usable
# label do not appear in the result.
human_list_unified_df <- unified_df |>
  filter(!is.na(human_labels)) |>
  mutate(list_human_labels = map(human_labels, parse_human_labels)) |>
  # keep_empty = TRUE turns an empty label vector into a single NA row, which
  # the next filter removes.
  unnest(list_human_labels, keep_empty = TRUE) |>
  # Drop missing, empty and literal "NA" entries so they are not counted as
  # tags (same rule as the OLMO label pipeline).
  filter(
    !is.na(list_human_labels),
    list_human_labels != "",
    list_human_labels != "NA"
  ) |>
  group_by(id) |>
  summarise(
    human_BE_prop = mean(list_human_labels %in% BE_set),
    human_SOL_prop = mean(list_human_labels %in% SOL_set),
    human_VR_prop = mean(list_human_labels %in% human_VR_set),
    .groups = "drop"
  )

# The closed set of label names accepted by the analysis; any other label is
# treated as invalid further down.
valid_categories <- c(
  "EXPECTED BEHAVIOR",
  "MOTIVATION",
  "OBSERVED BUG BEHAVIOR",
  "BUG REPRODUCTION",
  "INVESTIGATION AND EXPLORATION",
  "SOLUTION DISCUSSION",
  "CONTRIBUTION AND COMMITMENT",
  "TASK PROGRESS",
  "TESTING",
  "FUTURE PLAN",
  "POTENTIAL NEW ISSUES AND REQUESTS",
  "SOLUTION USAGE",
  "WORKAROUNDS",
  "ISSUE CONTENT MANAGEMENT",
  "ACTION ON ISSUE",
  "SOCIAL CONVERSATION"
)

# Matches any run of characters sitting between two single quotes or between
# two double quotes. The quotes themselves are not consumed, so the separator
# between two neighbouring labels (e.g. ", ") also matches; those matches are
# reduced to empty strings by the clean-up step and then dropped.
olmo_label_pattern <- "(?<=')[^']+(?=')|(?<=\")[^\"]+(?=\")"

# Expand `olmo_sentence_labels` into one row per label and normalise the names.
#
# df: a data frame with an `olmo_sentence_labels` column.
# Returns `df` in long form (one row per extracted label) with two extra
# columns: `list_olmo_labels` (cleaned raw label) and `olmo_label` (canonical
# label, or "INVALID LABEL" when it is not one of `valid_categories`).
normalise_olmo_labels <- function(df) {
  df |>
    mutate(
      olmo_sentence_labels = tidyr::replace_na(olmo_sentence_labels, ""),
      list_olmo_labels = str_extract_all(
        olmo_sentence_labels,
        olmo_label_pattern
      )
    ) |>
    unnest(list_olmo_labels, keep_empty = TRUE) |>
    # drop empty / NA / literal "NA" entries
    filter(
      !is.na(list_olmo_labels),
      list_olmo_labels != "",
      list_olmo_labels != "NA"
    ) |>
    # Keep only letters and spaces, then collapse repeated whitespace.
    mutate(
      list_olmo_labels = str_squish(
        str_replace_all(list_olmo_labels, "[^\\p{L} ]+", "")
      )
    ) |>
    filter(list_olmo_labels != "") |>
    mutate(
      olmo_label = case_when(
        # Canonical spelling is the one listed in valid_categories
        # ("WORKAROUNDS"); collapsing to "WORKAROUND" would make every
        # workaround label fail the validity check below.
        list_olmo_labels %in% c("WORKAROUNDS", "WORKAROUND") ~ "WORKAROUNDS",
        list_olmo_labels %in% c("BUG REPORT", "BUG REPRODUCTION") ~
          "BUG REPRODUCTION",
        list_olmo_labels %in% valid_categories ~ list_olmo_labels,
        TRUE ~ "INVALID LABEL"
      )
    )
}

# Long-form OLMO labels, computed once and reused by both results below.
olmo_labels_long <- normalise_olmo_labels(unified_df)

# Sorted set of distinct canonical OLMO labels, printed as a sanity check.
unique_olmo_labels <- olmo_labels_long |>
  pull(olmo_label) |>
  unique() |>
  sort()

print(unique_olmo_labels)

# Bug-behaviour and solution groups plus the investigation label.
olmo_VR_set <- c(BE_set, SOL_set, "INVESTIGATION AND EXPLORATION")

# One row per `id` that has at least one OLMO label, holding the share of that
# id's labels (invalid ones included in the denominator) in each label group.
# Ids without any label do not appear in the result.
olmo_list_unified_df <- olmo_labels_long |>
  group_by(id) |>
  summarise(
    olmo_BE_prop = mean(olmo_label %in% BE_set),
    olmo_SOL_prop = mean(olmo_label %in% SOL_set),
    olmo_VR_prop = mean(olmo_label %in% olmo_VR_set),
    .groups = "drop"
  )

# Attach the per-id OLMO proportions to the unified data. A left join keeps
# every row of `unified_df`; ids with no OLMO summary get NA in the new columns.
first_join <- left_join(unified_df, olmo_list_unified_df, by = "id")

# Attach the per-id human proportions in the same way.
second_join <- left_join(first_join, human_list_unified_df, by = "id")
library(ggdist)
# Scatter of the OLMO "VR" proportion against the human "VR" proportion for
# each row of `second_join`, one panel per source. The labels name the columns
# that are actually plotted (olmo_VR_prop / human_VR_prop).
ggplot(second_join, aes(x = olmo_VR_prop,
                        y = human_VR_prop,
                        ymin = 0, ymax = 1)) +
  facet_grid(~source, scales = "fixed") +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  # Default abline is y = x, i.e. where both proportions agree exactly.
  geom_abline() +
  geom_smooth() +
  xlim(0, 1) +
  ylim(0, 1) +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "OLMO VR tag proportion vs. human VR tag proportion",
    x = "OLMO VR tag proportion",
    y = "Human VR tag proportion"
  )

# Scatter of modal-verb count against PC1, coloured by comment type, one panel
# per source. The title and y label name the column that is actually plotted.
ggplot(second_join, aes(x = modal_verbs, y = PC1, color = comment_type)) +
  facet_grid(~source, scales = "fixed") +
  geom_point(shape = 19, alpha = 0.3, size = 2) +
  # NOTE(review): only `color` is mapped here, so this fill scale has nothing
  # to act on; scale_color_viridis_d() may have been intended - confirm.
  scale_fill_viridis_d() +
  # Rows with modal_verbs outside 0-20 are not drawn.
  xlim(0, 20) +
  theme_minimal() +
  labs(
    title = "Modal Verbs v. PC1",
    x = "modal verb count",
    y = "PC1"
  )


# Boxplots of modal-verb count per comment type, split by WMF-author status,
# one panel per source.
ggplot(second_join, aes(
  x = as.factor(comment_type),    # x-axis grouping
  y = modal_verbs,
  fill = isAuthorWMF
)) +
  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
  facet_grid(. ~ source, scales = "fixed") +   # Facet by source; adjust as needed
  scale_fill_viridis_d() +
  # Zoom the y axis to 0-3 without discarding data: setting the limit on the
  # scale (ylim) would remove every observation above 3 before the quartiles
  # are computed, so the boxes would describe a truncated sample.
  coord_cartesian(ylim = c(0, 3)) +
  theme_minimal() +
  labs(
    title = "Boxplot of modal verb usage",
    x = "Comment_type",
    y = "Count of modal verbs",
    fill = "isAuthorWMF?"
  )