1
0
mw-lifecycle-analysis/bot_activity_analysis/bot_activity_exploration.R
2025-01-23 14:17:05 -08:00

37 lines
1.6 KiB
R

library(tidyverse)
# data directory: /gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts
# load in the partitioned directories
library(dplyr)
# Column labels assigned to every CSV file that is read in.
column_names <- c(
  "wiki_db",
  "date",
  "event_entity",
  "event_action",
  "count"
)

# Directories whose CSV files are consolidated later in this script.
monthly_file_dir <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts/011625_dab_monthly/"
yearly_file_dir <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts/011625_dab_yearly/"
single_file_dir <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts/011625_dab_single/"
#' Combine every CSV file in a directory into one data frame
#'
#' Reads each file in `directory` whose name ends in `.csv` as a header-less
#' CSV, labels its columns with `column_names`, and row-binds the results in
#' the order returned by `list.files()`.
#'
#' @param directory Path of the directory to scan for `.csv` files.
#' @param column_names Character vector of column names applied to each file.
#' @return A data frame holding the rows of all matching files. When no file
#'   matches, a warning is raised and a zero-row data frame whose columns are
#'   named by `column_names` is returned.
consolidate_csv <- function(directory, column_names) {
  # `pattern` is a regular expression, not a glob: anchor on the extension so
  # that names such as "x.csv.bak" or "mycsv.txt" are not picked up.
  file_list <- list.files(
    path = directory,
    pattern = "\\.csv$",
    full.names = TRUE
  )

  # Nothing to read: hand back an empty frame with the expected columns so
  # that callers can still rbind() and transform the result.
  if (length(file_list) == 0) {
    warning("No .csv files found in ", directory, call. = FALSE)
    empty_df <- data.frame(matrix(nrow = 0, ncol = length(column_names)))
    colnames(empty_df) <- column_names
    return(empty_df)
  }

  df_list <- lapply(file_list, function(file) {
    df <- read.csv(file, header = FALSE)
    # Fail loudly instead of silently producing NA column names.
    if (ncol(df) != length(column_names)) {
      stop(
        "Expected ", length(column_names), " columns but found ", ncol(df),
        " in ", file,
        call. = FALSE
      )
    }
    colnames(df) <- column_names
    df
  })

  do.call(rbind, df_list)
}
# Read each of the three data directories into its own data frame.
monthly_df <- consolidate_csv(
  directory = monthly_file_dir,
  column_names = column_names
)
yearly_df <- consolidate_csv(
  directory = yearly_file_dir,
  column_names = column_names
)
single_df <- consolidate_csv(
  directory = single_file_dir,
  column_names = column_names
)

# Stack the three frames, in that order, into a single data frame.
combined_df <- do.call(rbind, list(monthly_df, yearly_df, single_df))

# The per-directory frames are no longer needed once they are combined.
rm(monthly_df, yearly_df, single_df)
# Coerce every column to its intended type: the three categorical columns
# become factors, `date` becomes a Date and `count` becomes numeric.
combined_df <- mutate(
  combined_df,
  across(c(wiki_db, event_entity, event_action), as.factor),
  date = as.Date(date),
  count = as.numeric(count)
)