Cross-sectional EDA for phase 2 of the project; still needs to be made longitudinal
This commit is contained in:
parent
fd1479775d
commit
ab1fe8e051
103
062325_EDA.R
Normal file
103
062325_EDA.R
Normal file
@ -0,0 +1,103 @@
|
||||
library(tidyverse)
|
||||
|
||||
# Absolute paths to the per-case comment CSVs (cases 1-3).
c1_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/060325_c1_ve_phab_comments.csv"
c2_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/060325_c2_https_phab_comments.csv"
c3_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/060325_c3_http_phab_comments.csv"

# Read each case into its own data frame; the first row of every file is
# treated as the column header.
c1_input_df <- read.csv(file = c1_count, header = TRUE)
c2_input_df <- read.csv(file = c2_count, header = TRUE)
c3_input_df <- read.csv(file = c3_count, header = TRUE)
|
||||
|
||||
library(dplyr)
|
||||
|
||||
# Tag every row with the case it was read from so the origin survives the
# merge below.
c1_input_df <- mutate(c1_input_df, source = "c1")
c2_input_df <- mutate(c2_input_df, source = "c2")
c3_input_df <- mutate(c3_input_df, source = "c3")

# Stack the three labelled cases into a single data frame.
combined_df <- bind_rows(c1_input_df, c2_input_df, c3_input_df)
|
||||
|
||||
# Keep only the task-description rows and derive how long each task took to
# close: once as a raw difference and once as numeric hours.
# NOTE(review): read.csv() does not parse dates on its own, so `date_closed`
# and `date_created` arrive in whatever form the CSV stores them. Confirm
# that both the `-` subtraction and difftime() accept that form.
combined_task_df <- combined_df |>
  filter(comment_type == "task_description") |>
  mutate(
    time_to_close = date_closed - date_created,
    time_to_close_hours = as.numeric(
      difftime(date_closed, date_created, units = "hours")
    )
  )
|
||||
|
||||
|
||||
# Half-eye densities of hours-to-close per case, filled and faceted by
# author affiliation.
ggplot(
  data = combined_task_df,
  mapping = aes(x = source, y = time_to_close_hours, fill = AuthorWMFAffil)
) +
  ggdist::stat_halfeye(
    adjust = 0.5,
    width = 1.5, # widened
    scale = 8.8, # enlarged so the density slab is drawn fatter
    .width = 0,
    justification = 0,
    point_colour = NA
  ) +
  facet_wrap(~ AuthorWMFAffil) +
  labs(
    title = "Distribution Plot: Time to Close by AuthorWMFAffil and Source",
    x = "Source",
    y = "Time to Close (hours)"
  ) +
  theme_minimal()
|
||||
|
||||
# Share of each task status within every (AuthorWMFAffil, source) group.
# Step 1 counts rows per (affiliation, source, status); step 2 divides each
# count by its (affiliation, source) total.
prop_df <- combined_task_df %>%
  group_by(AuthorWMFAffil, source, status) %>%
  summarize(n = n(), .groups = "drop") %>%
  group_by(AuthorWMFAffil, source) %>%
  mutate(prop = n / sum(n)) %>%
  # Drop the grouping so later verbs on prop_df do not silently operate
  # per (AuthorWMFAffil, source) group.
  ungroup()
|
||||
|
||||
# Stacked bars of task-status proportions for each case, one facet per
# author affiliation; the y axis is formatted as percentages.
ggplot(
  data = prop_df,
  mapping = aes(x = source, y = prop, fill = status)
) +
  geom_col(position = "fill") +
  facet_wrap(~ AuthorWMFAffil) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Proportion of Phabricator Task Status by Affiliation and Case",
    x = "Source",
    y = "Proportion",
    fill = "Status"
  ) +
  theme_minimal()
|
||||
|
||||
library(stringr)
|
||||
# Modal verbs to look for, plus the narrower subset that is plotted on its
# own further down.
modal_verbs <- c(
  "can", "could", "may", "might", "must",
  "shall", "should", "will", "would", "ought"
)
modal_subset <- c("should", "ought", "must")

# One whole-word alternation per word list, built once instead of per row.
modal_verbs_pattern <- paste0("\\b", modal_verbs, "\\b", collapse = "|")
modal_subset_pattern <- paste0("\\b", modal_subset, "\\b", collapse = "|")

# Count modal-verb occurrences in each comment_text (case-insensitive).
# str_count() is vectorised over rows and returns the number of matches.
# The previous rowwise sum(str_detect(...)) tested one string against one
# collapsed pattern, so it could only ever yield 0 or 1 (presence, not count).
# Rows with a missing comment_text stay NA.
combined_task_df <- combined_task_df %>%
  mutate(
    modal_verb_count = str_count(
      str_to_lower(comment_text),
      modal_verbs_pattern
    ),
    modal_subset_count = str_count(
      str_to_lower(comment_text),
      modal_subset_pattern
    )
  )
|
||||
|
||||
# Violin plot of `modal_subset_count` per case, faceted by author
# affiliation; the mean of each group is overlaid as a yellow diamond.
ggplot(
  data = combined_task_df,
  mapping = aes(x = source, y = modal_subset_count, fill = AuthorWMFAffil)
) +
  geom_violin(
    trim = FALSE,
    position = position_dodge(width = 0.8),
    alpha = 0.6
  ) +
  stat_summary(
    fun = mean,
    geom = "point",
    shape = 23,
    size = 3,
    color = "black",
    fill = "yellow",
    position = position_dodge(width = 0.8)
  ) +
  facet_wrap(~ AuthorWMFAffil) +
  labs(
    title = "Distribution and Mean of 'should'|'ought'|'must' by Affiliation and Source",
    x = "Source",
    y = "Modal Verb Count"
  ) +
  theme_minimal()
|
@ -1,17 +1,18 @@
|
||||
1. SSH tunnel from your workstation using the following command:
|
||||
|
||||
ssh -N -L 8787:n3439:38329 mjilg@klone.hyak.uw.edu
|
||||
ssh -N -L 8787:n3439:34951 mjilg@klone.hyak.uw.edu
|
||||
|
||||
and point your web browser to http://localhost:8787
|
||||
|
||||
2. log in to RStudio Server using the following credentials:
|
||||
|
||||
user: mjilg
|
||||
password: YXXLCjS/064zAiagiRdx
|
||||
password: xR04Y8VD4WRBYcJKI7NH
|
||||
|
||||
When done using RStudio Server, terminate the job by:
|
||||
|
||||
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
|
||||
2. Issue the following command on the login node:
|
||||
|
||||
scancel -f 26402644
|
||||
scancel -f 27074957
|
||||
slurmstepd: error: *** JOB 27074957 ON n3439 CANCELLED AT 2025-06-23T14:36:35 ***
|
4
phab_data_exploration.R
Normal file
4
phab_data_exploration.R
Normal file
@ -0,0 +1,4 @@
|
||||
library(tidyverse)
|
||||
|
||||
# Path to the case-1 comment CSV. The stray trailing space that used to sit
# inside the string literal is removed so the path matches the file name.
c1_phab <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0228_ve_phab_comments.csv"

# Read from `c1_phab`. The earlier call passed `c1_count`, which this script
# never defines, so it either failed or read whatever a previous session had
# left under that name.
c1_phab_df <- read.csv(c1_phab, header = TRUE)
|
Loading…
Reference in New Issue
Block a user