1
0
mw-lifecycle-analysis/bot_activity_analysis/bot_activity_exploration.R

125 lines
4.6 KiB
R
Raw Permalink Normal View History

2025-01-23 22:17:05 +00:00
library(tidyverse)
# data directory: /gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts
# load in the partitioned directories
library(dplyr)
# Directories of CSV files that consolidate_csv() reads further down.
monthly_file_dir <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts/011625_dab_monthly/"
yearly_file_dir <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts/011625_dab_yearly/"
single_file_dir <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts/011625_dab_single/"
# Names given to the columns of every file (the files are read without a
# header row).
column_names <- c(
  "wiki_db",
  "date",
  "event_entity",
  "event_action",
  "count"
)
# Combine every CSV file in a directory into a single data frame.
#
# directory:    path of the directory that holds the CSV files
# column_names: character vector of names applied to each file's columns
#               (files are read with header = FALSE)
#
# Returns the row-bound contents of all files. When the directory contains
# no CSV files, a warning is raised and a zero-row data frame carrying
# `column_names` is returned.
consolidate_csv <- function(directory, column_names) {
  # `pattern` is a regular expression, not a glob: anchor on the extension so
  # only names that end in ".csv" are picked up.
  file_list <- list.files(
    path = directory,
    pattern = "\\.csv$",
    full.names = TRUE
  )
  if (length(file_list) == 0L) {
    warning("no CSV files found in ", directory, call. = FALSE)
    empty_df <- data.frame(matrix(nrow = 0L, ncol = length(column_names)))
    colnames(empty_df) <- column_names
    return(empty_df)
  }
  df_list <- lapply(file_list, function(file) {
    df <- read.csv(file, header = FALSE)
    colnames(df) <- column_names
    df
  })
  do.call(rbind, df_list)
}
# Read each of the three directories into its own data frame.
monthly_df <- consolidate_csv(
  directory = monthly_file_dir,
  column_names = column_names
)
yearly_df <- consolidate_csv(
  directory = yearly_file_dir,
  column_names = column_names
)
single_df <- consolidate_csv(
  directory = single_file_dir,
  column_names = column_names
)
# Stack the three frames, then drop the per-directory copies.
combined_df <- rbind(monthly_df, yearly_df, single_df)
rm(monthly_df, yearly_df, single_df)
# Make sure the data columns are of the right type and keep 2010 onwards.
# The date column is parsed first so the cutoff comparison is made between
# two Date values instead of relying on implicit coercion of the raw column.
# The factor conversions stay after the filter so their levels only cover
# rows that survive it.
combined_df <- combined_df |>
  mutate(date = as.Date(date, format = "%Y-%m-%d")) |>
  filter(date >= as.Date("2010-01-01")) |> # pre-2010 data is incomplete
  mutate(
    wiki_db = as.factor(wiki_db),
    event_entity = as.factor(event_entity),
    event_action = as.factor(event_action),
    count = as.numeric(count)
  )
# Order the rows chronologically.
combined_df <- combined_df[order(combined_df$date), ]
# Autocorrelation check (note from 1-26-2025: there doesn't seem to be that
# much autocorrelation). combined_df is sorted by date at this point, so the
# lags are counted in rows of that ordering rather than in calendar days.
acf(combined_df$count)
# Multicollinearity: no check is implemented here yet.
# Intraclass correlation (ICC): fit an intercept-only mixed model with a
# random intercept per wiki_db, print its summary, then compute the ICC.
library(lme4)
library(performance)
null_model <- lmer(count ~ (1|wiki_db), data = combined_df)
summary(null_model)
icc(null_model) # low ICC: 0.044
# Disabled variant that adds date as a fixed effect:
#naive_model <- lmer(count ~ date + (1|wiki_db), data = combined_df)
#summary(naive_model)
#icc(naive_model) # low ICC: 0.044
# Variance-to-mean ratio of the counts. The trailing numbers are the values
# the author noted for each quantity.
count_var <- with(combined_df, var(count)) # 164655364.926
count_mean <- with(combined_df, mean(count)) # 628.119
# NOTE(review): 164655364.926 / 628.119 is roughly 262140.47, so the recorded
# ratio below may carry a stray leading digit -- confirm against a fresh run.
count_var_to_mean <- count_var / count_mean # 2262140.471
# Mean and median of count for each date, over all rows of combined_df.
summary_df <- combined_df |>
  group_by(date) |>
  summarise(
    mean_count = mean(count),
    median_count = median(count)
  )
# Per-date median count drawn as a blue line with red points, plus a dashed
# vertical marker at 2013-07-01.
p1 <- ggplot(data = summary_df, mapping = aes(x = date, y = median_count)) +
  geom_line(color = "blue") +
  geom_point(color = "red") +
  geom_vline(
    xintercept = as.Date("2013-07-01"),
    linetype = "dashed",
    color = "black"
  ) +
  labs(
    title = "Median Bot Actions",
    x = "Date",
    y = "Median Count"
  ) +
  theme_minimal()
p1
# Loess-smoothed trend of the per-date median count with its confidence band.
p1_5 <- ggplot(data = summary_df, mapping = aes(x = date)) +
  geom_smooth(
    mapping = aes(y = median_count),
    method = "loess",
    color = "red",
    fill = "red",
    alpha = 0.2,
    se = TRUE
  ) +
  # geom_point(color = "red") + # disabled: points on the line
  labs(
    title = "Median Bot Actions",
    x = "Date",
    y = "Median Count"
  ) +
  theme_minimal()
p1_5
# Loess-smoothed trend of the per-date mean count with its confidence band.
p2 <- ggplot(data = summary_df, mapping = aes(x = date)) +
  geom_smooth(
    mapping = aes(y = mean_count),
    method = "loess",
    color = "blue",
    fill = "blue",
    alpha = 0.2,
    se = TRUE
  ) +
  labs(
    title = "Mean Bot Actions",
    x = "Date",
    y = "Mean Count"
  ) +
  theme_minimal()
p2
# Wiki databases treated as "large" and "medium"; both lists are excluded
# when the small-wiki summary is built.
large_wikis <- c(
  "wikidatawiki", "commonswiki", "enwiki"
)
medium_wikis <- c(
  "dewiki", "frwiki", "eswiki", "itwiki", "ruwiki", "jawiki",
  "viwiki", "zhwiki", "ptwiki", "enwiktionary", "plwiki", "nlwiki",
  "svwiki", "metawiki", "arwiki", "shwiki", "cebwiki", "mgwiktionary",
  "fawiki", "frwiktionary", "ukwiki", "hewiki", "kowiki", "srwiki",
  "trwiki", "loginwiki", "huwiki", "cawiki", "nowiki", "mediawikiwiki",
  "fiwiki", "cswiki", "idwiki", "rowiki", "enwikisource", "frwikisource",
  "ruwiktionary", "dawiki", "bgwiki", "incubatorwiki", "enwikinews",
  "specieswiki", "thwiki"
)
# Per-date mean and median of count, using only the wikis that appear in
# neither large_wikis nor medium_wikis.
small_wiki_summary_df <- combined_df |>
  filter(!wiki_db %in% c(large_wikis, medium_wikis)) |>
  group_by(date) |>
  summarise(
    mean_count = mean(count),
    median_count = median(count)
  )
# Loess-smoothed trend of the per-date mean count for the remaining (small)
# wikis, with a dashed vertical marker at 2016-03-01.
ps2 <- ggplot(data = small_wiki_summary_df, mapping = aes(x = date)) +
  geom_smooth(
    mapping = aes(y = mean_count),
    method = "loess",
    color = "blue",
    fill = "blue",
    alpha = 0.2,
    se = TRUE
  ) +
  geom_vline(
    xintercept = as.Date("2016-03-01"),
    linetype = "dashed",
    color = "black"
  ) +
  labs(
    title = "Mean Bot Actions (small, single file wikis)",
    x = "Date",
    y = "Mean Count"
  ) +
  theme_minimal()
ps2