commit 176e6cceec719eeb971737807fee844288489f4e
Author: Matthew Gaughan
Date:   Wed Jan 29 20:24:43 2025 -0800

    initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..198b817
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+# ignore the R studio docker image needed by hyak
+rstudio_latest.sif
+
+
+# do not need to include any R items
+.Rhistory
+.cache/
+.config/
+.local/
diff --git a/cleaning_scripts/get_weekly_commit_counts.R b/cleaning_scripts/get_weekly_commit_counts.R
new file mode 100644
index 0000000..3872fa7
--- /dev/null
+++ b/cleaning_scripts/get_weekly_commit_counts.R
@@ -0,0 +1,46 @@
+library(tidyverse)
+# test data directory: /gscratch/comdata/users/mjilg/program_testing/
+# load in the partitioned directories
+library(dplyr)
+library(lubridate)
+
+# for a given file we want to get the count data and produce a csv
+test_file <- "/gscratch/comdata/users/mjilg/program_testing/core_2012-01-01_to_2014-12-31.csv"
+test_dir <- "/gscratch/comdata/users/mjilg/program_testing/"
+
+transform_commit_data <- function(filepath){
+  df <- read.csv(filepath, header = TRUE)
+  dir_path <- dirname(filepath)
+  file_name <- basename(filepath)
+
+  # transform the rows of commit data to weekly count data
+  project_name <- sub("_[0-9]{4}-[0-9]{2}-[0-9]{2}_to_[0-9]{4}-[0-9]{2}-[0-9]{2}\\.csv$", "", file_name)
+
+  df <- df |>
+    mutate(commit_date = ymd_hms(commit_date)) |>
+    mutate(project_name = project_name)
+
+  weekly_commits <- df |>
+    mutate(week = floor_date(commit_date, "week")) |>
+    group_by(week, project_name) |>
+    summarise(commit_count = n(), .groups = 'drop')
+
+  # prepare to save the new, transformed file
+  count_path <- file.path(dir_path, "weekly_counts")
+  dir.create(count_path, showWarnings = FALSE)
+  count_file_name <- paste0("weeklycount_", file_name)
+  output_file_path <- file.path(count_path, count_file_name)
+  # save and gracefully exit
+  write.csv(weekly_commits, output_file_path, row.names = FALSE)
+  return(weekly_commits)
+}
+
+# then for all files in a directory
+transform_directory_of_commit_data <- function(dir_path) {
+  file_list <- list.files(path = dir_path, pattern = "\\.csv$", full.names = TRUE)
+  for (filepath in file_list) {
+    transform_commit_data(filepath)
+  }
+}
+
+transform_directory_of_commit_data(test_dir)
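The cleaning script above turns per-commit rows (one CSV per project, named like core_2012-01-01_to_2014-12-31.csv) into weekly commit counts written to a weekly_counts/ subdirectory next to the inputs. A minimal usage sketch, not part of the commit: it could be run headlessly through the same rstudio_latest.sif container image that the job script below launches. The working directory and the bind path here are assumptions.

# Hypothetical invocation from the repository root on a Hyak node;
# binding /gscratch makes the hard-coded test_dir path visible inside the container.
apptainer exec --bind /gscratch rstudio_latest.sif \
  Rscript cleaning_scripts/get_weekly_commit_counts.R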
diff --git a/rstudio-server.job b/rstudio-server.job
new file mode 100644
index 0000000..3d9f565
--- /dev/null
+++ b/rstudio-server.job
@@ -0,0 +1,100 @@
+#!/bin/sh
+
+#SBATCH --job-name=mg-govdoc-cr
+#SBATCH --partition=cpu-g2-mem2x #update this line - use hyakalloc to find partitions you can use
+
+#SBATCH --time=03:00:00
+#SBATCH --nodes=1
+#SBATCH --ntasks=4
+#SBATCH --mem=64G
+
+#SBATCH --signal=USR2
+#SBATCH --output=%x_%j.out
+
+# This script will request a single node with four tasks and 64GB of RAM for 3 hours.
+# You can adjust --time, --nodes, --ntasks, and --mem above to change these settings for your session.
+
+# --output=%x_%j.out creates an output file called mg-govdoc-cr_XXXXXXXX.out
+# where the %x is shorthand for --job-name above and the X's are an 8-digit
+# jobID assigned by SLURM when our job is submitted.
+
+RSTUDIO_CWD="/mmfs1/home/mjilg/git/govdoc-cr-analysis" # UPDATE THIS LINE
+RSTUDIO_SIF="rstudio_latest.sif" # update this line
+
+# Create temp directory for ephemeral content to bind-mount in the container
+RSTUDIO_TMP=$(/usr/bin/python3 -c 'import tempfile; print(tempfile.mkdtemp())')
+
+mkdir -p -m 700 \
+  ${RSTUDIO_TMP}/run \
+  ${RSTUDIO_TMP}/tmp \
+  ${RSTUDIO_TMP}/var/lib/rstudio-server
+
+cat > ${RSTUDIO_TMP}/database.conf < ${RSTUDIO_TMP}/rsession.sh <&2 <&2
+exit $APPTAINER_EXIT_CODE
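The remainder of rstudio-server.job is truncated in this capture: the bodies of the two heredocs and the apptainer launch of RStudio Server between the last two lines above are missing. The sketch below shows what that section typically contains in the standard UW Hyak / Rocker "RStudio Server in Apptainer" job template; the heredoc contents, /etc/rstudio bind targets, login host klone.hyak.uw.edu, and rserver flags are assumptions rather than recovered text. It relies on the RSTUDIO_CWD, RSTUDIO_SIF, and RSTUDIO_TMP variables defined earlier in the script.

# Hedged sketch of the truncated remainder (assumed, not recovered from the commit).

# Database config so RStudio keeps its state under the bind-mounted directory.
cat > ${RSTUDIO_TMP}/database.conf <<END
provider=sqlite
directory=/var/lib/rstudio-server
END

# rsession wrapper so BLAS/OpenMP threads match the SLURM allocation.
cat > ${RSTUDIO_TMP}/rsession.sh <<END
#!/bin/sh
export OMP_NUM_THREADS=${SLURM_JOB_CPUS_PER_NODE}
exec /usr/lib/rstudio-server/bin/rsession "\${@}"
END
chmod +x ${RSTUDIO_TMP}/rsession.sh

# Credentials passed into the container; rocker's pam-helper reads PASSWORD.
export APPTAINERENV_USER=$(id -un)
export APPTAINERENV_PASSWORD=$(openssl rand -base64 15)

# Pick an unused TCP port for rserver to listen on.
readonly PORT=$(/usr/bin/python3 -c 'import socket; s = socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()')

# Print connection instructions to the job's .out file.
cat 1>&2 <<END
SSH tunnel from your workstation, then browse to http://localhost:8787
  ssh -N -L 8787:$(hostname):${PORT} ${APPTAINERENV_USER}@klone.hyak.uw.edu
Log in as ${APPTAINERENV_USER} with password ${APPTAINERENV_PASSWORD}
When finished, end the job with: scancel ${SLURM_JOB_ID}
END

# Launch RStudio Server in the container, binding the ephemeral dirs created above.
apptainer exec --cleanenv \
  --home ${RSTUDIO_CWD} \
  --bind ${RSTUDIO_TMP}/run:/run,${RSTUDIO_TMP}/tmp:/tmp,${RSTUDIO_TMP}/database.conf:/etc/rstudio/database.conf,${RSTUDIO_TMP}/rsession.sh:/etc/rstudio/rsession.sh,${RSTUDIO_TMP}/var/lib/rstudio-server:/var/lib/rstudio-server \
  ${RSTUDIO_SIF} \
  /usr/lib/rstudio-server/bin/rserver \
    --www-port ${PORT} \
    --auth-none=0 \
    --auth-pam-helper-path=pam-helper \
    --rsession-path=/etc/rstudio/rsession.sh \
    --server-user=${APPTAINERENV_USER}

APPTAINER_EXIT_CODE=$?
echo "rserver exited with status ${APPTAINER_EXIT_CODE}" 1>&2
exit ${APPTAINER_EXIT_CODE}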