3 changed files with 0 additions and 147 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,10 +0,0 @@
 # ignore the RStudio docker image needed by hyak
 rstudio_latest.sif
 # do not need to include any R items 
 .Rhistory
 .cache/
 .config/
 .local/
--- a/bot_activity_analysis/bot_activity_exploration.R
+++ b/bot_activity_analysis/bot_activity_exploration.R
@ -1,37 +0,0 @@
 library(tidyverse)
 # data directory: /gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts
 # load in the partitioned directories
 library(dplyr)
 # Directories that hold the monthly, yearly and single-file exports of the
 # bot activity counts.
 monthly_file_dir <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts/011625_dab_monthly/"
 yearly_file_dir <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts/011625_dab_yearly/"
 single_file_dir <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts/011625_dab_single/"
 # Column names to apply to the data read from those directories.
 column_names <- c(
   "wiki_db",
   "date",
   "event_entity",
   "event_action",
   "count"
 )
 # Combine every CSV file found directly in a directory into one data frame.
 #
 # directory:    path of the folder to scan (sub-folders are not searched).
 # column_names: character vector assigned as the column names of each file;
 #               files are read with header = FALSE, so the first line of
 #               every file is treated as data.
 #
 # Returns the row-bound data frame of all files, or NULL when the directory
 # holds no matching file (do.call(rbind, list()) evaluates to NULL).
 consolidate_csv <- function(directory, column_names) {
   # `pattern` is a regular expression, not a shell glob: "*.csv" would also
   # match names such as "data.csv.bak", so anchor on a literal ".csv" suffix.
   file_list <- list.files(
     path = directory,
     pattern = "\\.csv$",
     full.names = TRUE
   )
   df_list <- lapply(file_list, function(file) {
     df <- read.csv(file, header = FALSE)
     colnames(df) <- column_names
     df
   })
   do.call(rbind, df_list)
 }
 # Read each of the three export directories into its own data frame.
 monthly_df <- consolidate_csv(
   directory = monthly_file_dir,
   column_names = column_names
 )
 yearly_df <- consolidate_csv(
   directory = yearly_file_dir,
   column_names = column_names
 )
 single_df <- consolidate_csv(
   directory = single_file_dir,
   column_names = column_names
 )
 # Stack the three frames row-wise, then remove the intermediates, which are
 # not needed once the combined frame exists.
 combined_df <- rbind(monthly_df, yearly_df, single_df)
 rm(monthly_df, yearly_df, single_df)
 # Coerce every column to its intended type: the three categorical fields
 # become factors, `date` becomes a Date and `count` becomes numeric.
 combined_df <- mutate(
   combined_df,
   across(c(wiki_db, event_entity, event_action), as.factor),
   date = as.Date(date),
   count = as.numeric(count)
 )
--- a/rstudio-server.job
+++ b/rstudio-server.job
@ -1,100 +0,0 @@
 #!/bin/sh
 #SBATCH --job-name=mgaughan-rstudio-server
 #SBATCH --partition=cpu-g2-mem2x
 #SBATCH --time=02:00:00
 #SBATCH --nodes=1
 #SBATCH --ntasks=4
 #SBATCH --mem=20G
 #SBATCH --signal=USR2
 #SBATCH --output=%x_%j.out
 # This script will request one node with four tasks and 20GB of RAM for 2 hours.
 # You can adjust --time, --nodes, --ntasks, and --mem above to change these settings for your session.
 # --output=%x_%j.out creates an output file called mgaughan-rstudio-server_XXXXXXXX.out,
 # where %x is shorthand for --job-name above and the X's are an 8-digit
 # jobID assigned by SLURM when our job is submitted.
 # Project checkout that is bind-mounted into the container and passed to
 # apptainer as --home, and the name of the container image file inside it.
 RSTUDIO_CWD="/mmfs1/home/mjilg/git/mw-lifecycle-analysis"
 RSTUDIO_SIF="rstudio_latest.sif"
 # Create temp directory for ephemeral content to bind-mount in the container.
 # Stop early if that fails: with an empty RSTUDIO_TMP every path below would
 # resolve against the filesystem root.
 if ! RSTUDIO_TMP=$(/usr/bin/python3 -c 'import tempfile; print(tempfile.mkdtemp())') || [ -z "${RSTUDIO_TMP}" ]; then
     echo "could not create a temporary directory for RStudio Server" 1>&2
     exit 1
 fi
 # Expansions are quoted so that a temp path containing whitespace is not
 # split into several arguments.
 mkdir -p -m 700 \
         "${RSTUDIO_TMP}/run" \
         "${RSTUDIO_TMP}/tmp" \
         "${RSTUDIO_TMP}/var/lib/rstudio-server"
 # database.conf (bind-mounted to /etc/rstudio/database.conf): SQLite provider
 # with its files under /var/lib/rstudio-server.
 cat > "${RSTUDIO_TMP}/database.conf" <<END
 provider=sqlite
 directory=/var/lib/rstudio-server
 END
 # Generate the rsession wrapper that rserver is pointed at via --rsession-path.
 #
 # Set OMP_NUM_THREADS to prevent OpenBLAS (and any other OpenMP-enhanced
 # libraries used by R) from spawning more threads than the number of processors
 # allocated to the job.
 #
 # Set R_LIBS_USER to a path specific to rocker/rstudio to avoid conflicts with
 # personal libraries from any R installation in the host environment
 #
 # The heredoc delimiter is unquoted on purpose: ${SLURM_JOB_CPUS_PER_NODE} is
 # expanded now, while the wrapper is written, whereas \${@} is escaped so the
 # generated script receives a literal "${@}" and forwards its own arguments.
 # The target path is quoted so a temp path with whitespace is not split.
 cat > "${RSTUDIO_TMP}/rsession.sh" <<END
 #!/bin/sh
 export OMP_NUM_THREADS=${SLURM_JOB_CPUS_PER_NODE}
 export R_LIBS_USER=/gscratch/scrubbed/mjilg/R
 exec /usr/lib/rstudio-server/bin/rsession "\${@}"
 END
 chmod +x "${RSTUDIO_TMP}/rsession.sh"
 # Bind mounts for the container, as a comma-separated list of
 # host-path:container-path pairs: the project directory and /gscratch at
 # their host paths, then the scratch directories and generated config files
 # from RSTUDIO_TMP.
 APPTAINER_BIND="${RSTUDIO_CWD}:${RSTUDIO_CWD}"
 APPTAINER_BIND="${APPTAINER_BIND},/gscratch:/gscratch"
 APPTAINER_BIND="${APPTAINER_BIND},${RSTUDIO_TMP}/run:/run"
 APPTAINER_BIND="${APPTAINER_BIND},${RSTUDIO_TMP}/tmp:/tmp"
 APPTAINER_BIND="${APPTAINER_BIND},${RSTUDIO_TMP}/database.conf:/etc/rstudio/database.conf"
 APPTAINER_BIND="${APPTAINER_BIND},${RSTUDIO_TMP}/rsession.sh:/etc/rstudio/rsession.sh"
 APPTAINER_BIND="${APPTAINER_BIND},${RSTUDIO_TMP}/var/lib/rstudio-server:/var/lib/rstudio-server"
 export APPTAINER_BIND
 # Do not suspend idle sessions.
 # Alternative to setting session-timeout-minutes=0 in /etc/rstudio/rsession.conf
 APPTAINERENV_RSTUDIO_SESSION_TIMEOUT=0
 # Login name of the current user and a freshly generated random password;
 # both are printed later as the RStudio Server credentials.
 APPTAINERENV_USER=$(id -un)
 APPTAINERENV_PASSWORD=$(openssl rand -base64 15)
 export APPTAINERENV_RSTUDIO_SESSION_TIMEOUT APPTAINERENV_USER APPTAINERENV_PASSWORD
 # get unused socket per https://unix.stackexchange.com/a/132524
 # The helper binds a socket to port 0, prints the port number it was given
 # and closes the socket again, so there is a
 # tiny race condition between the python & apptainer commands
 PORT=$(/mmfs1/sw/pyenv/versions/3.9.5/bin/python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()')
 readonly PORT
 # Write the connection and shutdown instructions to stderr.
 cat >&2 <<END
 1. SSH tunnel from your workstation using the following command:
   ssh -N -L 8787:${HOSTNAME}:${PORT} ${APPTAINERENV_USER}@klone.hyak.uw.edu
   and point your web browser to http://localhost:8787
 2. log in to RStudio Server using the following credentials:
   user: ${APPTAINERENV_USER}
   password: ${APPTAINERENV_PASSWORD}
 When done using RStudio Server, terminate the job by:
 1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
 2. Issue the following command on the login node:
      scancel -f ${SLURM_JOB_ID}
 END
 # Read the system-wide shell configuration before calling `module`
 # (presumably this is what defines the `module` command; confirm on the
 # cluster). `.` is the POSIX spelling of `source`, which a #!/bin/sh script
 # cannot rely on.
 . /etc/bashrc
 module load apptainer
 # Run rserver inside the container image, listening on the port chosen above
 # and using the generated rsession wrapper. Expansions are quoted so paths
 # containing whitespace stay single arguments.
 apptainer exec --cleanenv --home "${RSTUDIO_CWD}" "${RSTUDIO_CWD}/${RSTUDIO_SIF}" \
    rserver --www-port "${PORT}" \
            --auth-none=0 \
            --auth-pam-helper-path=pam-helper \
            --auth-stay-signed-in-days=30 \
            --auth-timeout-minutes=0 \
            --rsession-path=/etc/rstudio/rsession.sh \
            --server-user="${APPTAINERENV_USER}"
 # Report the exit status of the apptainer command and use it as the exit
 # status of this script.
 APPTAINER_EXIT_CODE=$?
 echo "rserver exited $APPTAINER_EXIT_CODE" 1>&2
 exit "$APPTAINER_EXIT_CODE"