diff --git a/bot_activity_analysis/bot_activity_exploration.R b/bot_activity_analysis/bot_activity_exploration.R index a005c01..561f6d8 100644 --- a/bot_activity_analysis/bot_activity_exploration.R +++ b/bot_activity_analysis/bot_activity_exploration.R @@ -28,10 +28,97 @@ rm(yearly_df) rm(single_df) #making sure data columns are of the right type combined_df <- combined_df |> + filter(date >= as.Date("2010-01-01")) |> # pre-2010 data is incomplete mutate( wiki_db = as.factor(wiki_db), - date = as.Date(date), + date = as.Date(date, format = "%Y-%m-%d"), event_entity = as.factor(event_entity), event_action = as.factor(event_action), count = as.numeric(count) - ) \ No newline at end of file + ) +combined_df <- combined_df[order(combined_df$date),] +# autoregression; 1-26-2025: doesn't seem to be that much autocorrelation +acf(combined_df$count) +#multicollinearity + +#icc +library(lme4) +library(performance) + +null_model <- lmer(count ~ (1|wiki_db), data = combined_df) +summary(null_model) +icc(null_model) # low ICC: 0.044 + +#naive_model <- lmer(count ~ date + (1|wiki_db), data = combined_df) +#summary(naive_model) +#icc(naive_model)# low ICC: 0.044 + +#variance-to-mean +count_var <- var(combined_df$count) #164655364.926 +count_mean <- mean(combined_df$count) #628.119 +count_var_to_mean <- count_var / count_mean #262140.471 + +# the mean and median count values for each date +summary_df <- combined_df |> + group_by(date) |> + summarize( + mean_count = mean(count), + median_count = median(count) + ) +#plotting it +p1 <- ggplot(summary_df, aes(x = date, y = median_count)) + + geom_line(color = "blue") + # Line plot + geom_point(color = "red") + # Points on the line + geom_vline(xintercept = as.Date("2013-07-01"), linetype = "dashed", color = "black") + + labs(title = "Median Bot Actions", + x = "Date", + y = "Median Count") + + theme_minimal() +p1 +p1_5 <- ggplot(summary_df, aes(x = date)) + + geom_smooth(aes(y = median_count), method = "loess", color = "red", fill = "red", alpha
= 0.2, se = TRUE) + + #geom_point(color = "red") + # Points on the line + labs(title = "Median Bot Actions", + x = "Date", + y = "Median Count") + + theme_minimal() +p1_5 + +#plotting mean count values +p2 <- ggplot(summary_df, aes(x = date)) + + geom_smooth(aes(y = mean_count), method = "loess", color = "blue", fill = "blue", alpha = 0.2, se = TRUE) + + labs(title = "Mean Bot Actions", + x = "Date", + y = "Mean Count") + + theme_minimal() +p2 + + +large_wikis <- c("wikidatawiki", "commonswiki", "enwiki") +medium_wikis <- c("dewiki", "frwiki", "eswiki", "itwiki", "ruwiki", "jawiki", "viwiki", "zhwiki", + "ptwiki", "enwiktionary", "plwiki", "nlwiki", "svwiki", "metawiki", "arwiki", + "shwiki", "cebwiki", "mgwiktionary", "fawiki", "frwiktionary", "ukwiki", + "hewiki", "kowiki", "srwiki", "trwiki", "loginwiki", "huwiki", "cawiki", + "nowiki", "mediawikiwiki", "fiwiki", "cswiki", "idwiki", "rowiki", "enwikisource", + "frwikisource", "ruwiktionary", "dawiki", "bgwiki", "incubatorwiki", + "enwikinews", "specieswiki", "thwiki" +) +small_wiki_summary_df <- combined_df |> + filter(!wiki_db %in% large_wikis)|> + filter(!wiki_db %in% medium_wikis)|> + group_by(date) |> + summarize( + mean_count = mean(count), + median_count = median(count) + ) +ps2 <- ggplot(small_wiki_summary_df, aes(x = date)) + + geom_smooth(aes(y = mean_count), method = "loess", color = "blue", fill = "blue", alpha = 0.2, se = TRUE) + + geom_vline(xintercept = as.Date("2016-03-01"), linetype = "dashed", color = "black") + + labs(title = "Mean Bot Actions (small, single file wikis)", + x = "Date", + y = "Mean Count") + + theme_minimal() +ps2 + + + diff --git a/plots/exploratory/01262025_mean_bot_actions_smooth.png b/plots/exploratory/01262025_mean_bot_actions_smooth.png new file mode 100644 index 0000000..45110ae Binary files /dev/null and b/plots/exploratory/01262025_mean_bot_actions_smooth.png differ diff --git a/plots/exploratory/01262025_mean_excluding_bot_action.png 
b/plots/exploratory/01262025_mean_excluding_bot_action.png new file mode 100644 index 0000000..7b10418 Binary files /dev/null and b/plots/exploratory/01262025_mean_excluding_bot_action.png differ diff --git a/plots/exploratory/01262025_median_action_point_plot.png b/plots/exploratory/01262025_median_action_point_plot.png new file mode 100644 index 0000000..4c56fe5 Binary files /dev/null and b/plots/exploratory/01262025_median_action_point_plot.png differ diff --git a/plots/exploratory/01262025_small_wiki.png b/plots/exploratory/01262025_small_wiki.png new file mode 100644 index 0000000..bd02c3c Binary files /dev/null and b/plots/exploratory/01262025_small_wiki.png differ diff --git a/plots/exploratory/01262025_small_wiki_mean_actions.png b/plots/exploratory/01262025_small_wiki_mean_actions.png new file mode 100644 index 0000000..bd02c3c Binary files /dev/null and b/plots/exploratory/01262025_small_wiki_mean_actions.png differ diff --git a/rstudio-server.job b/rstudio-server.job index 3df6e3f..df75f69 100644 --- a/rstudio-server.job +++ b/rstudio-server.job @@ -3,10 +3,10 @@ #SBATCH --job-name=mgaughan-rstudio-server #SBATCH --partition=cpu-g2-mem2x -#SBATCH --time=02:00:00 +#SBATCH --time=03:00:00 #SBATCH --nodes=1 #SBATCH --ntasks=4 -#SBATCH --mem=20G +#SBATCH --mem=32G #SBATCH --signal=USR2 #SBATCH --output=%x_%j.out