1
0

exploratory data analysis

This commit is contained in:
Matthew Gaughan 2025-01-26 11:51:25 -08:00
parent 49eaade666
commit 27a9781564
7 changed files with 91 additions and 4 deletions

View File

@ -28,10 +28,97 @@ rm(yearly_df)
rm(single_df)

# Make sure the data columns are of the right type.
#
# The date column is parsed first so that the cutoff below is a Date-to-Date
# comparison instead of depending on implicit coercion of the raw column.
# Factor conversion happens after the filter so the factor levels only cover
# wikis/entities/actions that are still present in the data.
combined_df <- combined_df |>
  mutate(date = as.Date(date, format = "%Y-%m-%d")) |>
  filter(date >= as.Date("2010-01-01")) |> # pre-2010 data is incomplete
  mutate(
    wiki_db = as.factor(wiki_db),
    event_entity = as.factor(event_entity),
    event_action = as.factor(event_action),
    count = as.numeric(count)
  )

# Order rows chronologically; the autocorrelation check on the count column
# reads the rows in this order.
combined_df <- combined_df[order(combined_df$date), ]
# ---- Autocorrelation ----
# 1-26-2025: doesn't seem to be that much autocorrelation
# NOTE(review): the counts are taken row by row from combined_df. The data is
# later grouped by date, which suggests several rows per date, so adjacent
# lags may compare different wikis rather than consecutive days -- confirm
# this is the series the check is meant to look at.
acf(combined_df$count)

# ---- Multicollinearity ----
# (heading only; no check is implemented here yet)

# ---- Intraclass correlation (ICC) ----
library(lme4)
library(performance)

# Intercept-only mixed model with a random intercept per wiki.
null_model <- lmer(count ~ (1 | wiki_db), data = combined_df)
summary(null_model)
icc(null_model) # low ICC: 0.044

# Disabled variant that adds date as a fixed effect:
# naive_model <- lmer(count ~ date + (1 | wiki_db), data = combined_df)
# summary(naive_model)
# icc(naive_model) # low ICC: 0.044

# ---- Variance-to-mean ratio of the counts ----
count_mean <- mean(combined_df$count) # 628.119
count_var <- var(combined_df$count) # 164655364.926
# Recorded earlier as 2262140.471.
# NOTE(review): 164655364.926 / 628.119 is roughly 262140, so the recorded
# figure looks like it carries a stray leading digit -- re-run to confirm.
count_var_to_mean <- count_var / count_mean
# Mean and median of count for every date in combined_df.
summary_df <- summarize(
  group_by(combined_df, date),
  mean_count = mean(count),
  median_count = median(count)
)

# Median count per date as a line with points; the dashed vertical line marks
# 2013-07-01.
p1 <- ggplot(summary_df, aes(x = date, y = median_count)) +
  geom_line(color = "blue") +
  geom_point(color = "red") +
  geom_vline(
    xintercept = as.Date("2013-07-01"),
    linetype = "dashed",
    color = "black"
  ) +
  labs(title = "Median Bot Actions", x = "Date", y = "Median Count") +
  theme_minimal()
p1

# Loess-smoothed median count per date, with the standard-error band shown.
p1_5 <- ggplot(summary_df, aes(x = date, y = median_count)) +
  geom_smooth(
    method = "loess",
    color = "red",
    fill = "red",
    alpha = 0.2,
    se = TRUE
  ) +
  # geom_point(color = "red") + # Points on the line
  labs(title = "Median Bot Actions", x = "Date", y = "Median Count") +
  theme_minimal()
p1_5

# Loess-smoothed mean count per date, with the standard-error band shown.
p2 <- ggplot(summary_df, aes(x = date, y = mean_count)) +
  geom_smooth(
    method = "loess",
    color = "blue",
    fill = "blue",
    alpha = 0.2,
    se = TRUE
  ) +
  labs(title = "Mean Bot Actions", x = "Date", y = "Mean Count") +
  theme_minimal()
p2
# Wikis treated as "large" and "medium"; every wiki not named in either
# vector is treated as "small" below.
large_wikis <- c("wikidatawiki", "commonswiki", "enwiki")
medium_wikis <- c(
  "dewiki", "frwiki", "eswiki", "itwiki",
  "ruwiki", "jawiki", "viwiki", "zhwiki",
  "ptwiki", "enwiktionary", "plwiki", "nlwiki",
  "svwiki", "metawiki", "arwiki", "shwiki",
  "cebwiki", "mgwiktionary", "fawiki", "frwiktionary",
  "ukwiki", "hewiki", "kowiki", "srwiki",
  "trwiki", "loginwiki", "huwiki", "cawiki",
  "nowiki", "mediawikiwiki", "fiwiki", "cswiki",
  "idwiki", "rowiki", "enwikisource", "frwikisource",
  "ruwiktionary", "dawiki", "bgwiki", "incubatorwiki",
  "enwikinews", "specieswiki", "thwiki"
)

# Per-date mean and median of count, restricted to the small wikis.
small_wiki_summary_df <- combined_df |>
  filter(!wiki_db %in% c(large_wikis, medium_wikis)) |>
  group_by(date) |>
  summarize(mean_count = mean(count), median_count = median(count))

# Loess-smoothed mean count per date for the small wikis; the dashed vertical
# line marks 2016-03-01.
ps2 <- ggplot(small_wiki_summary_df, aes(x = date, y = mean_count)) +
  geom_smooth(
    method = "loess",
    color = "blue",
    fill = "blue",
    alpha = 0.2,
    se = TRUE
  ) +
  geom_vline(
    xintercept = as.Date("2016-03-01"),
    linetype = "dashed",
    color = "black"
  ) +
  labs(
    title = "Mean Bot Actions (small, single file wikis)",
    x = "Date",
    y = "Mean Count"
  ) +
  theme_minimal()
ps2

Binary file not shown.

After

Width:  |  Height:  |  Size: 25 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 33 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 60 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 31 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 31 KiB

View File

@ -3,10 +3,10 @@
#SBATCH --job-name=mgaughan-rstudio-server
#SBATCH --partition=cpu-g2-mem2x
# Wall-clock limit for the job (hh:mm:ss).
#SBATCH --time=03:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=4
# Memory requested for the job.
#SBATCH --mem=32G
# Ask Slurm to deliver SIGUSR2 to the job ahead of the time limit.
#SBATCH --signal=USR2
# Log file name: %x expands to the job name, %j to the job id.
#SBATCH --output=%x_%j.out