From 49eaade66697a539ec7d40c804bff94dad9ce443 Mon Sep 17 00:00:00 2001
From: Matthew Gaughan
Date: Thu, 23 Jan 2025 14:17:05 -0800
Subject: [PATCH] initial loading of activity_data

---
 .gitignore                                    |  8 ++++
 .../bot_activity_exploration.R                | 41 +++++++++++++++++++
 rstudio-server.job                            |  2 +-
 3 files changed, 50 insertions(+), 1 deletion(-)
 create mode 100644 bot_activity_analysis/bot_activity_exploration.R

diff --git a/.gitignore b/.gitignore
index 65294d7..a6603ad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,10 @@
 # ignore the R studio docker image needed by hyak
 rstudio_latest.sif
+
+
+# do not need to include any R items
+.Rhistory
+.cache/
+.config/
+.local/
+
diff --git a/bot_activity_analysis/bot_activity_exploration.R b/bot_activity_analysis/bot_activity_exploration.R
new file mode 100644
index 0000000..a005c01
--- /dev/null
+++ b/bot_activity_analysis/bot_activity_exploration.R
@@ -0,0 +1,41 @@
+library(tidyverse)
+# data directory: /gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts
+# load in the partitioned directories
+library(dplyr)
+monthly_file_dir <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts/011625_dab_monthly/"
+yearly_file_dir <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts/011625_dab_yearly/"
+single_file_dir <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts/011625_dab_single/"
+column_names <- c("wiki_db", "date", "event_entity", "event_action", "count")
+# Combine every header-less CSV file directly inside `directory` into one
+# data frame whose columns are named by `column_names`; returns NULL when
+# no file is read.
+consolidate_csv <- function(directory, column_names) {
+  # `pattern` is a regular expression, not a glob, so anchor on the extension
+  file_list <- list.files(
+    path = directory, pattern = "\\.csv$", full.names = TRUE
+  )
+  # read.csv() errors on a zero-byte file; such a file holds no rows anyway
+  file_list <- file_list[file.size(file_list) > 0]
+  df_list <- lapply(file_list, function(file) {
+    df <- read.csv(file, header = FALSE)
+    colnames(df) <- column_names
+    df
+  })
+  do.call(rbind, df_list)
+}
+# apply the function to our three directories of data
+monthly_df <- consolidate_csv(monthly_file_dir, column_names)
+yearly_df <- consolidate_csv(yearly_file_dir, column_names)
+single_df <- consolidate_csv(single_file_dir, column_names)
+# stack the three tables and drop the intermediates
+combined_df <- rbind(monthly_df, yearly_df, single_df)
+rm(monthly_df, yearly_df, single_df)
+# making sure data columns are of the right type
+combined_df <- combined_df |>
+  mutate(
+    wiki_db = as.factor(wiki_db),
+    date = as.Date(date),
+    event_entity = as.factor(event_entity),
+    event_action = as.factor(event_action),
+    count = as.numeric(count)
+  )
diff --git a/rstudio-server.job b/rstudio-server.job
index e7081d0..3df6e3f 100644
--- a/rstudio-server.job
+++ b/rstudio-server.job
@@ -19,7 +19,7 @@
 # jobID assigned by SLURM when our job is submitted.
 
 RSTUDIO_CWD="/mmfs1/home/mjilg/git/mw-lifecycle-analysis"
-RSTUDIO_SIF="/mmfs1/home/mjilg/rstudio_latest.sif"
+RSTUDIO_SIF="rstudio_latest.sif"
 
 # Create temp directory for ephemeral content to bind-mount in the container
 RSTUDIO_TMP=$(/usr/bin/python3 -c 'import tempfile; print(tempfile.mkdtemp())')