library(tidyverse)
library(dplyr)

# Data directory:
#   /gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts
# Load in the partitioned directories.
monthly_file_dir <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts/011625_dab_monthly/"
yearly_file_dir <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts/011625_dab_yearly/"
single_file_dir <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts/011625_dab_single/"

# The CSV files are read with header = FALSE, so these names are assigned to
# the columns by position.
column_names <- c("wiki_db", "date", "event_entity", "event_action", "count")

#' Combine the multiple CSV files in a directory into one data frame.
#'
#' Every file in `directory` whose name ends in ".csv" is read with
#' `read.csv(header = FALSE)`, given `column_names` as its column names, and
#' the per-file data frames are stacked with `rbind()`.
#'
#' @param directory Path of the directory holding the CSV files.
#' @param column_names Character vector of column names; its length must equal
#'   the number of columns in every file, otherwise assigning the names fails.
#' @return A data frame with one row per CSV row across all files. When no
#'   CSV file is found, a warning is raised and a zero-row data frame with the
#'   requested columns is returned.
consolidate_csv <- function(directory, column_names) {
  # `pattern` is a regular expression, not a shell glob: anchor on the
  # extension so that only names ending in ".csv" are picked up.
  file_list <- list.files(
    path = directory,
    pattern = "\\.csv$",
    full.names = TRUE
  )

  # do.call(rbind, list()) would return NULL; hand back an empty data frame
  # of the right width instead so downstream rbind()/mutate() calls still work.
  if (length(file_list) == 0) {
    warning("No CSV files found in ", directory, call. = FALSE)
    empty_df <- data.frame(
      matrix(character(0), nrow = 0, ncol = length(column_names))
    )
    colnames(empty_df) <- column_names
    return(empty_df)
  }

  df_list <- lapply(file_list, function(file) {
    df <- read.csv(file, header = FALSE)
    colnames(df) <- column_names
    df
  })

  do.call(rbind, df_list)
}

# Apply the function to our three directories of data.
monthly_df <- consolidate_csv(monthly_file_dir, column_names)
yearly_df <- consolidate_csv(yearly_file_dir, column_names)
single_df <- consolidate_csv(single_file_dir, column_names)

# Stack the three partitions, then drop the intermediates to free memory.
combined_df <- rbind(monthly_df, yearly_df, single_df)
rm(monthly_df, yearly_df, single_df)

# Make sure the data columns are of the right type.
# as.Date() is called without a format, so `date` must already be in a form
# it parses by default (e.g. "YYYY-MM-DD").
combined_df <- combined_df |>
  mutate(
    wiki_db = as.factor(wiki_db),
    date = as.Date(date),
    event_entity = as.factor(event_entity),
    event_action = as.factor(event_action),
    count = as.numeric(count)
  )