commit 176e6cceec719eeb971737807fee844288489f4e
Author: Matthew Gaughan
Date:   Wed Jan 29 20:24:43 2025 -0800

    initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..198b817
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+# ignore the R studio docker image needed by hyak
+rstudio_latest.sif
+
+
+# do not need to include any R items
+.Rhistory
+.cache/
+.config/
+.local/
diff --git a/cleaning_scripts/get_weekly_commit_counts.R b/cleaning_scripts/get_weekly_commit_counts.R
new file mode 100644
index 0000000..3872fa7
--- /dev/null
+++ b/cleaning_scripts/get_weekly_commit_counts.R
@@ -0,0 +1,46 @@
+library(tidyverse)
+# test data directory: /gscratch/comdata/users/mjilg/program_testing/
+# load in the partitioned directories
+library(dplyr)
+library(lubridate)
+
+# for a given file we want to get the count data and produce a csv
+test_file <- "/gscratch/comdata/users/mjilg/program_testing/core_2012-01-01_to_2014-12-31.csv"
+test_dir <- "/gscratch/comdata/users/mjilg/program_testing/"
+
+transform_commit_data <- function(filepath){
+  df <- read.csv(filepath, header = TRUE)
+  dir_path <- dirname(filepath)
+  file_name <- basename(filepath)
+
+  # transform the rows of commit data to weekly count data
+  project_name <- sub("_[0-9]{4}-[0-9]{2}-[0-9]{2}_to_[0-9]{4}-[0-9]{2}-[0-9]{2}\\.csv$", "", file_name)
+
+  df <- df |>
+    mutate(commit_date = ymd_hms(commit_date)) |>
+    mutate(project_name = project_name)
+
+  weekly_commits <- df |>
+    mutate(week = floor_date(commit_date, "week")) |>
+    group_by(week, project_name) |>
+    summarise(commit_count = n(), .groups = 'drop')
+
+  # prepare to save the new, transformed file
+  count_path <- file.path(dir_path, "weekly_counts")
+  dir.create(count_path, showWarnings = FALSE)
+  count_file_name <- paste0("weeklycount_", file_name)
+  output_file_path <- file.path(count_path, count_file_name)
+  # save and gracefully exit
+  write.csv(weekly_commits, output_file_path, row.names = FALSE)
+  return(weekly_commits)
+}
+
+# then for all files in a directory
+transform_directory_of_commit_data <- function(dir_path) {
+  file_list <- list.files(path = dir_path, pattern = "\\.csv$", full.names = TRUE)
+  for (filepath in file_list) {
+    transform_commit_data(filepath)
+  }
+}
+
+transform_directory_of_commit_data(test_dir)
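The cleaning script above turns per-commit rows (one CSV per project, named like core_2012-01-01_to_2014-12-31.csv) into weekly commit counts written to a weekly_counts/ subdirectory next to the inputs. A minimal usage sketch, not part of the commit: it could be run headlessly through the same rstudio_latest.sif container image that the job script below launches. The working directory and the bind path here are assumptions.

# Hypothetical invocation from the repository root on a Hyak node;
# binding /gscratch makes the hard-coded test_dir path visible inside the container.
apptainer exec --bind /gscratch rstudio_latest.sif \
  Rscript cleaning_scripts/get_weekly_commit_counts.R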
diff --git a/rstudio-server.job b/rstudio-server.job
new file mode 100644
index 0000000..3d9f565
--- /dev/null
+++ b/rstudio-server.job
@@ -0,0 +1,100 @@
+#!/bin/sh
+
+#SBATCH --job-name=mg-govdoc-cr
+#SBATCH --partition=cpu-g2-mem2x #update this line - use hyakalloc to find partitions you can use
+
+#SBATCH --time=03:00:00
+#SBATCH --nodes=1
+#SBATCH --ntasks=4
+#SBATCH --mem=64G
+
+#SBATCH --signal=USR2
+#SBATCH --output=%x_%j.out
+
+# This script will request a single node with four tasks and 64GB of RAM for 3 hours.
+# You can adjust --time, --nodes, --ntasks, and --mem above to change these settings for your session.
+
+# --output=%x_%j.out creates an output file called mg-govdoc-cr_XXXXXXXX.out
+# where the %x is shorthand for --job-name above and the X's are an 8-digit
+# jobID assigned by SLURM when our job is submitted.
+
+RSTUDIO_CWD="/mmfs1/home/mjilg/git/govdoc-cr-analysis" # UPDATE THIS LINE
+RSTUDIO_SIF="rstudio_latest.sif" # update this line
+
+# Create temp directory for ephemeral content to bind-mount in the container
+RSTUDIO_TMP=$(/usr/bin/python3 -c 'import tempfile; print(tempfile.mkdtemp())')
+
+mkdir -p -m 700 \
+  ${RSTUDIO_TMP}/run \
+  ${RSTUDIO_TMP}/tmp \
+  ${RSTUDIO_TMP}/var/lib/rstudio-server
+
+cat > ${RSTUDIO_TMP}/database.conf < ${RSTUDIO_TMP}/rsession.sh <&2 <&2
+exit $APPTAINER_EXIT_CODE
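The remainder of rstudio-server.job is truncated in this capture: the bodies of the two heredocs and the apptainer launch of RStudio Server between the last two lines above are missing. The sketch below shows what that section typically contains in the standard UW Hyak / Rocker "RStudio Server in Apptainer" job template; the heredoc contents, /etc/rstudio bind targets, login host klone.hyak.uw.edu, and rserver flags are assumptions rather than recovered text. It relies on the RSTUDIO_CWD, RSTUDIO_SIF, and RSTUDIO_TMP variables defined earlier in the script.

# Hedged sketch of the truncated remainder (assumed, not recovered from the commit).

# Database config so RStudio keeps its state under the bind-mounted directory.
cat > ${RSTUDIO_TMP}/database.conf <<END
provider=sqlite
directory=/var/lib/rstudio-server
END

# rsession wrapper so BLAS/OpenMP threads match the SLURM allocation.
cat > ${RSTUDIO_TMP}/rsession.sh <<END
#!/bin/sh
export OMP_NUM_THREADS=${SLURM_JOB_CPUS_PER_NODE}
exec /usr/lib/rstudio-server/bin/rsession "\${@}"
END
chmod +x ${RSTUDIO_TMP}/rsession.sh

# Credentials passed into the container; rocker's pam-helper reads PASSWORD.
export APPTAINERENV_USER=$(id -un)
export APPTAINERENV_PASSWORD=$(openssl rand -base64 15)

# Pick an unused TCP port for rserver to listen on.
readonly PORT=$(/usr/bin/python3 -c 'import socket; s = socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()')

# Print connection instructions to the job's .out file.
cat 1>&2 <<END
SSH tunnel from your workstation, then browse to http://localhost:8787
  ssh -N -L 8787:$(hostname):${PORT} ${APPTAINERENV_USER}@klone.hyak.uw.edu
Log in as ${APPTAINERENV_USER} with password ${APPTAINERENV_PASSWORD}
When finished, end the job with: scancel ${SLURM_JOB_ID}
END

# Launch RStudio Server in the container, binding the ephemeral dirs created above.
apptainer exec --cleanenv \
  --home ${RSTUDIO_CWD} \
  --bind ${RSTUDIO_TMP}/run:/run,${RSTUDIO_TMP}/tmp:/tmp,${RSTUDIO_TMP}/database.conf:/etc/rstudio/database.conf,${RSTUDIO_TMP}/rsession.sh:/etc/rstudio/rsession.sh,${RSTUDIO_TMP}/var/lib/rstudio-server:/var/lib/rstudio-server \
  ${RSTUDIO_SIF} \
  /usr/lib/rstudio-server/bin/rserver \
    --www-port ${PORT} \
    --auth-none=0 \
    --auth-pam-helper-path=pam-helper \
    --rsession-path=/etc/rstudio/rsession.sh \
    --server-user=${APPTAINERENV_USER}

APPTAINER_EXIT_CODE=$?
echo "rserver exited with status ${APPTAINER_EXIT_CODE}" 1>&2
exit ${APPTAINER_EXIT_CODE}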