1
0

cross-sectional EDA for phase 2 of the project, need to make it longitudinal

This commit is contained in:
Matthew Gaughan 2025-06-23 14:37:15 -07:00
parent fd1479775d
commit ab1fe8e051
3 changed files with 111 additions and 3 deletions

103
062325_EDA.R Normal file
View File

@ -0,0 +1,103 @@
library(tidyverse)
# Locations of the three per-case comment CSVs (case1, case2, case3).
c1_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/060325_c1_ve_phab_comments.csv"
c2_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/060325_c2_https_phab_comments.csv"
c3_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/060325_c3_http_phab_comments.csv"

# Read each CSV into its own data frame; the first row holds column names.
c1_input_df <- read.csv(file = c1_count, header = TRUE)
c2_input_df <- read.csv(file = c2_count, header = TRUE)
c3_input_df <- read.csv(file = c3_count, header = TRUE)
library(dplyr)
# Tag every row with the case it was read from so the origin is still
# identifiable once the three data frames are stacked.
c1_input_df <- mutate(c1_input_df, source = "c1")
c2_input_df <- mutate(c2_input_df, source = "c2")
c3_input_df <- mutate(c3_input_df, source = "c3")

# Stack the labelled data frames row-wise into a single data frame.
combined_df <- bind_rows(list(c1_input_df, c2_input_df, c3_input_df))
# Keep only the rows whose comment_type is "task_description" and derive how
# long each of them took to close: once as the raw difference of the two
# timestamp columns and once as a plain number of hours.
# NOTE(review): read.csv() does not convert timestamp text by itself; confirm
# that date_closed / date_created are numeric or date-time values, otherwise
# the subtraction below cannot work.
combined_task_df <- combined_df |>
  filter(comment_type == "task_description") |>
  mutate(
    time_to_close = date_closed - date_created,
    time_to_close_hours = as.numeric(
      difftime(date_closed, date_created, units = "hours")
    )
  )
# Half-eye density of time-to-close (hours) for each source, drawn in one
# facet per AuthorWMFAffil value and filled by AuthorWMFAffil.
ggplot(
  data = combined_task_df,
  mapping = aes(x = source, y = time_to_close_hours, fill = AuthorWMFAffil)
) +
  ggdist::stat_halfeye(
    # presumably .width = 0 plus an NA point colour hide the point/interval
    # summary so that only the density slab is drawn -- TODO confirm
    .width = 0,
    point_colour = NA,
    justification = 0,
    adjust = 0.5,
    width = 1.5, # increase width
    scale = 8.8 # increase scale for fatter density
  ) +
  facet_wrap(vars(AuthorWMFAffil)) +
  labs(
    x = "Source",
    y = "Time to Close (hours)",
    title = "Distribution Plot: Time to Close by AuthorWMFAffil and Source"
  ) +
  theme_minimal()
# Share of each task status within every (AuthorWMFAffil, source) group.
# Step 1 counts rows per (AuthorWMFAffil, source, status); step 2 divides each
# count by the total of its (AuthorWMFAffil, source) group.
prop_df <- combined_task_df %>%
  group_by(AuthorWMFAffil, source, status) %>%
  summarize(n = n(), .groups = "drop") %>%
  group_by(AuthorWMFAffil, source) %>%
  mutate(prop = n / sum(n)) %>%
  # Drop the grouping again so that later verbs applied to prop_df are not
  # silently evaluated per (AuthorWMFAffil, source) group.
  ungroup()
# Stacked bars of status proportions per source, one facet per AuthorWMFAffil
# value. position = "fill" rescales every bar to full height, and the y axis
# is labelled in percent.
ggplot(
  data = prop_df,
  mapping = aes(x = source, y = prop, fill = status)
) +
  geom_col(position = "fill") +
  facet_wrap(vars(AuthorWMFAffil)) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Source",
    y = "Proportion",
    fill = "Status",
    title = "Proportion of Phabricator Task Status by Affiliation and Case"
  ) +
  theme_minimal()
library(stringr)
# Modal verbs searched for in the task descriptions.
modal_verbs <- c(
  "can", "could", "may", "might", "must",
  "shall", "should", "will", "would", "ought"
)
# Subset of modal_verbs that is counted separately.
modal_subset <- c("should", "ought", "must")

# Count whole-word modal verb occurrences in each lower-cased comment_text.
# str_count() is vectorised over rows and returns the number of matches of the
# alternation pattern. The earlier sum(str_detect(...)) received a single
# collapsed pattern, so it could only ever yield 0 or 1 per row (a presence
# flag rather than a count) and needed rowwise() to run at all.
combined_task_df <- combined_task_df %>%
  mutate(
    modal_verb_count = str_count(
      str_to_lower(comment_text),
      paste0("\\b", modal_verbs, "\\b", collapse = "|")
    ),
    modal_subset_count = str_count(
      str_to_lower(comment_text),
      paste0("\\b", modal_subset, "\\b", collapse = "|")
    )
  )
# Violin plot of modal_subset_count per source, one facet per AuthorWMFAffil
# value; the yellow diamond marks the mean of each violin.
ggplot(
  data = combined_task_df,
  mapping = aes(x = source, y = modal_subset_count, fill = AuthorWMFAffil)
) +
  geom_violin(
    alpha = 0.6,
    trim = FALSE,
    position = position_dodge(width = 0.8)
  ) +
  stat_summary(
    geom = "point",
    fun = mean,
    position = position_dodge(width = 0.8),
    shape = 23,
    size = 3,
    fill = "yellow",
    color = "black"
  ) +
  facet_wrap(vars(AuthorWMFAffil)) +
  labs(
    x = "Source",
    y = "Modal Verb Count",
    title = "Distribution and Mean of 'should'|'ought'|'must' by Affiliation and Source"
  ) +
  theme_minimal()

View File

@ -1,17 +1,18 @@
1. SSH tunnel from your workstation using the following command: 1. SSH tunnel from your workstation using the following command:
ssh -N -L 8787:n3439:38329 mjilg@klone.hyak.uw.edu ssh -N -L 8787:n3439:34951 mjilg@klone.hyak.uw.edu
and point your web browser to http://localhost:8787 and point your web browser to http://localhost:8787
2. log in to RStudio Server using the following credentials: 2. log in to RStudio Server using the following credentials:
user: mjilg user: mjilg
password: YXXLCjS/064zAiagiRdx password: xR04Y8VD4WRBYcJKI7NH
When done using RStudio Server, terminate the job by: When done using RStudio Server, terminate the job by:
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window) 1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
2. Issue the following command on the login node: 2. Issue the following command on the login node:
scancel -f 26402644 scancel -f 27074957
slurmstepd: error: *** JOB 27074957 ON n3439 CANCELLED AT 2025-06-23T14:36:35 ***

4
phab_data_exploration.R Normal file
View File

@ -0,0 +1,4 @@
library(tidyverse)
# Path to the case 1 comment CSV. The stray trailing space that used to end
# this literal has been removed so the path can resolve to the file.
c1_phab <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv"
# Read from c1_phab; the previous call referenced c1_count, which is never
# defined in this script.
c1_phab_df <- read.csv(c1_phab, header = TRUE)