initial commit
commit 176e6cceec
.gitignore (vendored, normal file, +9)
@@ -0,0 +1,9 @@
# ignore the R studio docker image needed by hyak
rstudio_latest.sif


# do not need to include any R items
.Rhistory
.cache/
.config/
.local/
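A quick way to confirm these patterns are picked up is git's built-in matcher; this is only an illustrative check run from the repository root:

git check-ignore -v rstudio_latest.sif .Rhistory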
cleaning_scripts/get_weekly_commit_counts.R (normal file, +45)
@@ -0,0 +1,45 @@
library(tidyverse)
# test data directory: /gscratch/comdata/users/mjilg/program_testing/
# load in the partitioned directories
library(dplyr)
library(lubridate)

# for a given file we want to get the count data and produce a csv
test_file <- "/gscratch/comdata/users/mjilg/program_testing/core_2012-01-01_to_2014-12-31.csv"
test_dir <- "/gscratch/comdata/users/mjilg/program_testing/"

transform_commit_data <- function(filepath){
  df = read.csv(filepath, header = TRUE)
  dir_path = dirname(filepath)
  file_name = basename(filepath)

  # transform the rows of commit data to weekly count data
  project_name <- sub("_[0-9]{4}-[0-9]{2}-[0-9]{2}_to_[0-9]{4}-[0-9]{2}-[0-9]{2}.csv$", "", file_name)

  df <- df |>
    mutate(commit_date = ymd_hms(commit_date)) |>
    mutate(project_name = project_name)

  weekly_commits <- df |>
    mutate(week = floor_date(commit_date, "week")) |>
    group_by(week, project_name) |>
    summarise(commit_count = n(), .groups = 'drop')

  # prepare to save the new, transformed file
  count_path <- file.path(dir_path, "weekly_counts")
  count_file_name <- paste0("weeklycount_", file_name)
  output_file_path <- file.path(count_path, count_file_name)
  # save and gracefully exit
  write.csv(weekly_commits, output_file_path, row.names = FALSE)
  return(weekly_commits)
}

# then for all files in a directory
transform_directory_of_commit_data <- function(dir_path) {
  file_list <- list.files(path = dir_path, pattern = "*.csv", full.names = TRUE)
  for (filepath in file_list) {
    transform_commit_data(filepath)
  }
}

transform_directory_of_commit_data(test_dir)
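One possible way to run this cleaning script non-interactively on a compute node (a sketch only: it assumes Rscript is available inside rstudio_latest.sif, and that the weekly_counts/ output directory already exists, since the script writes into it without creating it):

# hypothetical invocation; the path mirrors the test_dir default hard-coded above
mkdir -p /gscratch/comdata/users/mjilg/program_testing/weekly_counts
apptainer exec --cleanenv rstudio_latest.sif Rscript cleaning_scripts/get_weekly_commit_counts.R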
rstudio-server.job (normal file, +100)
@@ -0,0 +1,100 @@
#!/bin/sh

#SBATCH --job-name=mg-govdoc-cr
#SBATCH --partition=cpu-g2-mem2x #update this line - use hyakalloc to find partitions you can use

#SBATCH --time=03:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --mem=64G

#SBATCH --signal=USR2
#SBATCH --output=%x_%j.out

# This script will request a single node with four tasks and 64G of RAM for 3 hours.
# You can adjust --time, --nodes, --ntasks, and --mem above to adjust these settings for your session.

# --output=%x_%j.out creates an output file called mg-govdoc-cr_XXXXXXXX.out
# where the %x is shorthand for --job-name above and the X's are an 8-digit
# jobID assigned by SLURM when our job is submitted.

RSTUDIO_CWD="/mmfs1/home/mjilg/git/govdoc-cr-analysis" # UPDATE THIS LINE
RSTUDIO_SIF="rstudio_latest.sif" # update this line

# Create temp directory for ephemeral content to bind-mount in the container
RSTUDIO_TMP=$(/usr/bin/python3 -c 'import tempfile; print(tempfile.mkdtemp())')

mkdir -p -m 700 \
    ${RSTUDIO_TMP}/run \
    ${RSTUDIO_TMP}/tmp \
    ${RSTUDIO_TMP}/var/lib/rstudio-server

cat > ${RSTUDIO_TMP}/database.conf <<END
provider=sqlite
directory=/var/lib/rstudio-server
END

# Set OMP_NUM_THREADS to prevent OpenBLAS (and any other OpenMP-enhanced
# libraries used by R) from spawning more threads than the number of processors
# allocated to the job.
#
# Set R_LIBS_USER to a path specific to rocker/rstudio to avoid conflicts with
# personal libraries from any R installation in the host environment

cat > ${RSTUDIO_TMP}/rsession.sh <<END
#!/bin/sh

export OMP_NUM_THREADS=${SLURM_JOB_CPUS_PER_NODE}
export R_LIBS_USER=/gscratch/scrubbed/mjilg/R
exec /usr/lib/rstudio-server/bin/rsession "\${@}"
END

chmod +x ${RSTUDIO_TMP}/rsession.sh

export APPTAINER_BIND="${RSTUDIO_CWD}:${RSTUDIO_CWD},/gscratch:/gscratch,${RSTUDIO_TMP}/run:/run,${RSTUDIO_TMP}/tmp:/tmp,${RSTUDIO_TMP}/database.conf:/etc/rstudio/database.conf,${RSTUDIO_TMP}/rsession.sh:/etc/rstudio/rsession.sh,${RSTUDIO_TMP}/var/lib/rstudio-server:/var/lib/rstudio-server"

# Do not suspend idle sessions.
# Alternative to setting session-timeout-minutes=0 in /etc/rstudio/rsession.conf
export APPTAINERENV_RSTUDIO_SESSION_TIMEOUT=0

export APPTAINERENV_USER=$(id -un)
export APPTAINERENV_PASSWORD=$(openssl rand -base64 15)

# get unused socket per https://unix.stackexchange.com/a/132524
# tiny race condition between the python & apptainer commands
readonly PORT=$(/mmfs1/sw/pyenv/versions/3.9.5/bin/python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()')
cat 1>&2 <<END
1. SSH tunnel from your workstation using the following command:

   ssh -N -L 8787:${HOSTNAME}:${PORT} ${APPTAINERENV_USER}@klone.hyak.uw.edu

   and point your web browser to http://localhost:8787

2. Log in to RStudio Server using the following credentials:

   user: ${APPTAINERENV_USER}
   password: ${APPTAINERENV_PASSWORD}

When done using RStudio Server, terminate the job by:

1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
2. Issue the following command on the login node:

      scancel -f ${SLURM_JOB_ID}
END

source /etc/bashrc
module load apptainer

apptainer exec --cleanenv --home ${RSTUDIO_CWD} ${RSTUDIO_CWD}/${RSTUDIO_SIF} \
    rserver --www-port ${PORT} \
            --auth-none=0 \
            --auth-pam-helper-path=pam-helper \
            --auth-stay-signed-in-days=30 \
            --auth-timeout-minutes=0 \
            --rsession-path=/etc/rstudio/rsession.sh \
            --server-user=${APPTAINERENV_USER}

APPTAINER_EXIT_CODE=$?
echo "rserver exited $APPTAINER_EXIT_CODE" 1>&2
exit $APPTAINER_EXIT_CODE
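Typical usage of this job script would look something like the following sketch (assuming the rstudio_latest.sif image sits in the repository root, as the .gitignore above anticipates):

sbatch rstudio-server.job
# the tunnel command, username, and generated password are printed to mg-govdoc-cr_<jobid>.out
cat mg-govdoc-cr_<jobid>.out
# then, from your workstation, open the tunnel it describes, e.g.:
ssh -N -L 8787:<node>:<port> <user>@klone.hyak.uw.edu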