library(tidyverse)
library(dplyr)
library(lme4)
library(performance)

# Data directory:
#   /gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts
# Load in the partitioned directories.
monthly_file_dir <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts/011625_dab_monthly/"
yearly_file_dir <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts/011625_dab_yearly/"
single_file_dir <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts/011625_dab_single/"

column_names <- c("wiki_db", "date", "event_entity", "event_action", "count")

#' Combine the header-less CSV files of one directory into a data frame.
#'
#' @param directory Path of the directory to scan (not recursive).
#' @param column_names Character vector assigned as the column names of every
#'   file that is read.
#' @return A data frame with all files row-bound in `list.files()` order. When
#'   the directory holds no `.csv` file, a zero-row data frame carrying
#'   `column_names` is returned, which `rbind()` ignores when stacking.
consolidate_csv <- function(directory, column_names) {
  # `pattern` is a regular expression, not a glob: anchor on the extension so
  # that only names ending in ".csv" are picked up.
  file_list <- list.files(
    path = directory,
    pattern = "\\.csv$",
    full.names = TRUE
  )

  if (length(file_list) == 0) {
    empty_df <- data.frame(matrix(nrow = 0, ncol = length(column_names)))
    colnames(empty_df) <- column_names
    return(empty_df)
  }

  df_list <- lapply(file_list, \(file) {
    df <- read.csv(file, header = FALSE)
    colnames(df) <- column_names
    df
  })

  do.call(rbind, df_list)
}

#' Mean and median of `count` for each `date`.
#'
#' @param df A data frame with `date` and `count` columns.
#' @return One row per date with columns `date`, `mean_count` and
#'   `median_count`.
summarize_daily_counts <- function(df) {
  df |>
    group_by(date) |>
    summarize(
      mean_count = mean(count),
      median_count = median(count)
    )
}

# Apply the function to our three directories of data ----
monthly_df <- consolidate_csv(monthly_file_dir, column_names)
yearly_df <- consolidate_csv(yearly_file_dir, column_names)
single_df <- consolidate_csv(single_file_dir, column_names)

# rbind, then drop the intermediates
combined_df <- rbind(monthly_df, yearly_df, single_df)
rm(monthly_df, yearly_df, single_df)

# Making sure data columns are of the right type ----
# The date is parsed *before* it is compared so the cutoff filter works on
# real Date values instead of relying on implicit coercion of the raw column.
# Factors are built after the filter so their levels only cover kept rows.
combined_df <- combined_df |>
  mutate(date = as.Date(date, format = "%Y-%m-%d")) |>
  filter(date >= as.Date("2010-01-01")) |> # pre-2010 data is incomplete
  mutate(
    wiki_db = as.factor(wiki_db),
    event_entity = as.factor(event_entity),
    event_action = as.factor(event_action),
    count = as.numeric(count)
  )
combined_df <- combined_df[order(combined_df$date), ]

# Autoregression ----
# 1-26-2025: doesn't seem to be that much autocorrelation
# NOTE(review): rows are ordered by date only, so lags here are row offsets
# across all wikis/entities/actions, not calendar lags -- confirm intended.
acf(combined_df$count)

# Multicollinearity ----

# ICC ----
null_model <- lmer(count ~ (1 | wiki_db), data = combined_df)
summary(null_model)
icc(null_model) # low ICC: 0.044

# naive_model <- lmer(count ~ date + (1 | wiki_db), data = combined_df)
# summary(naive_model)
# icc(naive_model) # low ICC: 0.044

# Variance-to-mean ----
count_var <- var(combined_df$count) # 164655364.926
count_mean <- mean(combined_df$count) # 628.119
count_var_to_mean <- count_var / count_mean # 2262140.471

# The mean and median count values for each day ----
summary_df <- summarize_daily_counts(combined_df)

# Plotting the median count values
p1 <- ggplot(summary_df, aes(x = date, y = median_count)) +
  geom_line(color = "blue") + # line plot
  geom_point(color = "red") + # points on the line
  geom_vline(
    xintercept = as.Date("2013-07-01"),
    linetype = "dashed",
    color = "black"
  ) +
  labs(title = "Median Bot Actions", x = "Date", y = "Median Count") +
  theme_minimal()
p1

p1_5 <- ggplot(summary_df, aes(x = date)) +
  geom_smooth(
    aes(y = median_count),
    method = "loess",
    color = "red",
    fill = "red",
    alpha = 0.2,
    se = TRUE
  ) +
  # geom_point(color = "red") + # points on the line
  labs(title = "Median Bot Actions", x = "Date", y = "Median Count") +
  theme_minimal()
p1_5

# Plotting the mean count values
p2 <- ggplot(summary_df, aes(x = date)) +
  geom_smooth(
    aes(y = mean_count),
    method = "loess",
    color = "blue",
    fill = "blue",
    alpha = 0.2,
    se = TRUE
  ) +
  labs(title = "Mean Bot Actions", x = "Date", y = "Mean Count") +
  theme_minimal()
p2

# Wiki size groupings ----
large_wikis <- c("wikidatawiki", "commonswiki", "enwiki")
medium_wikis <- c(
  "dewiki", "frwiki", "eswiki", "itwiki", "ruwiki", "jawiki", "viwiki",
  "zhwiki", "ptwiki", "enwiktionary", "plwiki", "nlwiki", "svwiki",
  "metawiki", "arwiki", "shwiki", "cebwiki", "mgwiktionary", "fawiki",
  "frwiktionary", "ukwiki", "hewiki", "kowiki", "srwiki", "trwiki",
  "loginwiki", "huwiki", "cawiki", "nowiki", "mediawikiwiki", "fiwiki",
  "cswiki", "idwiki", "rowiki", "enwikisource", "frwikisource",
  "ruwiktionary", "dawiki", "bgwiki", "incubatorwiki", "enwikinews",
  "specieswiki", "thwiki"
)

# Everything that is neither a large nor a medium wiki counts as "small".
small_wiki_summary_df <- combined_df |>
  filter(!wiki_db %in% c(large_wikis, medium_wikis)) |>
  summarize_daily_counts()

ps2 <- ggplot(small_wiki_summary_df, aes(x = date)) +
  geom_smooth(
    aes(y = mean_count),
    method = "loess",
    color = "blue",
    fill = "blue",
    alpha = 0.2,
    se = TRUE
  ) +
  geom_vline(
    xintercept = as.Date("2016-03-01"),
    linetype = "dashed",
    color = "black"
  ) +
  labs(
    title = "Mean Bot Actions (small, single file wikis)",
    x = "Date",
    y = "Mean Count"
  ) +
  theme_minimal()
ps2