exploratory data analysis
This commit is contained in:
parent
49eaade666
commit
27a9781564
@ -28,10 +28,97 @@ rm(yearly_df)
|
|||||||
rm(single_df)
|
rm(single_df)
|
||||||
#making sure data columns are of the right type
|
#making sure data columns are of the right type
|
||||||
combined_df <- combined_df |>
|
combined_df <- combined_df |>
|
||||||
|
filter(date >= as.Date("2010-01-01")) |> # pre-2010 data is incomplete
|
||||||
mutate(
|
mutate(
|
||||||
wiki_db = as.factor(wiki_db),
|
wiki_db = as.factor(wiki_db),
|
||||||
date = as.Date(date),
|
date = as.Date(date, format = "%Y-%m-%d"),
|
||||||
event_entity = as.factor(event_entity),
|
event_entity = as.factor(event_entity),
|
||||||
event_action = as.factor(event_action),
|
event_action = as.factor(event_action),
|
||||||
count = as.numeric(count)
|
count = as.numeric(count)
|
||||||
)
|
)
|
||||||
|
combined_df <- combined_df[order(combined_df$date),]
|
||||||
|
# autocorrelation check (ACF); 1-26-2025: doesn't seem to be that much autocorrelation
|
||||||
|
acf(combined_df$count)
|
||||||
|
#multicollinearity
|
||||||
|
|
||||||
|
#icc
|
||||||
|
library(lme4)
|
||||||
|
library(performance)
|
||||||
|
|
||||||
|
null_model <- lmer(count ~ (1|wiki_db), data = combined_df)
|
||||||
|
summary(null_model)
|
||||||
|
icc(null_model) # low ICC: 0.044
|
||||||
|
|
||||||
|
#naive_model <- lmer(count ~ date + (1|wiki_db), data = combined_df)
|
||||||
|
#summary(naive_model)
|
||||||
|
#icc(naive_model)# low ICC: 0.044
|
||||||
|
|
||||||
|
#variance-to-mean
|
||||||
|
count_var <- var(combined_df$count) #164655364.926
|
||||||
|
count_mean <- mean(combined_df$count) #628.119
|
||||||
|
count_var_to_mean <- count_var / count_mean #262140.471 (variance far exceeds the mean)
|
||||||
|
|
||||||
|
# the mean and median count values for each day
|
||||||
|
summary_df <- combined_df |>
|
||||||
|
group_by(date) |>
|
||||||
|
summarize(
|
||||||
|
mean_count = mean(count),
|
||||||
|
median_count = median(count)
|
||||||
|
)
|
||||||
|
#plotting it
|
||||||
|
p1 <- ggplot(summary_df, aes(x = date, y = median_count)) +
|
||||||
|
geom_line(color = "blue") + # Line plot
|
||||||
|
geom_point(color = "red") + # Points on the line
|
||||||
|
geom_vline(xintercept = as.Date("2013-07-01"), linetype = "dashed", color = "black") +
|
||||||
|
labs(title = "Median Bot Actions",
|
||||||
|
x = "Date",
|
||||||
|
y = "Median Count") +
|
||||||
|
theme_minimal()
|
||||||
|
p1
|
||||||
|
p1_5 <- ggplot(summary_df, aes(x = date)) +
|
||||||
|
geom_smooth(aes(y = median_count), method = "loess", color = "red", fill = "red", alpha = 0.2, se = TRUE) +
|
||||||
|
#geom_point(color = "red") + # Points on the line
|
||||||
|
labs(title = "Median Bot Actions",
|
||||||
|
x = "Date",
|
||||||
|
y = "Median Count") +
|
||||||
|
theme_minimal()
|
||||||
|
p1_5
|
||||||
|
|
||||||
|
#plotting mean count values
|
||||||
|
p2 <- ggplot(summary_df, aes(x = date)) +
|
||||||
|
geom_smooth(aes(y = mean_count), method = "loess", color = "blue", fill = "blue", alpha = 0.2, se = TRUE) +
|
||||||
|
labs(title = "Mean Bot Actions",
|
||||||
|
x = "Date",
|
||||||
|
y = "Mean Count") +
|
||||||
|
theme_minimal()
|
||||||
|
p2
|
||||||
|
|
||||||
|
|
||||||
|
large_wikis <- c("wikidatawiki", "commonswiki", "enwiki")
|
||||||
|
medium_wikis <- c("dewiki", "frwiki", "eswiki", "itwiki", "ruwiki", "jawiki", "viwiki", "zhwiki",
|
||||||
|
"ptwiki", "enwiktionary", "plwiki", "nlwiki", "svwiki", "metawiki", "arwiki",
|
||||||
|
"shwiki", "cebwiki", "mgwiktionary", "fawiki", "frwiktionary", "ukwiki",
|
||||||
|
"hewiki", "kowiki", "srwiki", "trwiki", "loginwiki", "huwiki", "cawiki",
|
||||||
|
"nowiki", "mediawikiwiki", "fiwiki", "cswiki", "idwiki", "rowiki", "enwikisource",
|
||||||
|
"frwikisource", "ruwiktionary", "dawiki", "bgwiki", "incubatorwiki",
|
||||||
|
"enwikinews", "specieswiki", "thwiki"
|
||||||
|
)
|
||||||
|
small_wiki_summary_df <- combined_df |>
|
||||||
|
filter(!wiki_db %in% large_wikis)|>
|
||||||
|
filter(!wiki_db %in% medium_wikis)|>
|
||||||
|
group_by(date) |>
|
||||||
|
summarize(
|
||||||
|
mean_count = mean(count),
|
||||||
|
median_count = median(count)
|
||||||
|
)
|
||||||
|
ps2 <- ggplot(small_wiki_summary_df, aes(x = date)) +
|
||||||
|
geom_smooth(aes(y = mean_count), method = "loess", color = "blue", fill = "blue", alpha = 0.2, se = TRUE) +
|
||||||
|
geom_vline(xintercept = as.Date("2016-03-01"), linetype = "dashed", color = "black") +
|
||||||
|
labs(title = "Mean Bot Actions (small, single file wikis)",
|
||||||
|
x = "Date",
|
||||||
|
y = "Mean Count") +
|
||||||
|
theme_minimal()
|
||||||
|
ps2
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
BIN
plots/exploratory/01262025_mean_bot_actions_smooth.png
Normal file
BIN
plots/exploratory/01262025_mean_bot_actions_smooth.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 25 KiB |
BIN
plots/exploratory/01262025_mean_excluding_bot_action.png
Normal file
BIN
plots/exploratory/01262025_mean_excluding_bot_action.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 33 KiB |
BIN
plots/exploratory/01262025_median_action_point_plot.png
Normal file
BIN
plots/exploratory/01262025_median_action_point_plot.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 60 KiB |
BIN
plots/exploratory/01262025_small_wiki.png
Normal file
BIN
plots/exploratory/01262025_small_wiki.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 31 KiB |
BIN
plots/exploratory/01262025_small_wiki_mean_actions.png
Normal file
BIN
plots/exploratory/01262025_small_wiki_mean_actions.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 31 KiB |
@ -3,10 +3,10 @@
|
|||||||
#SBATCH --job-name=mgaughan-rstudio-server
|
#SBATCH --job-name=mgaughan-rstudio-server
|
||||||
#SBATCH --partition=cpu-g2-mem2x
|
#SBATCH --partition=cpu-g2-mem2x
|
||||||
|
|
||||||
#SBATCH --time=02:00:00
|
#SBATCH --time=03:00:00
|
||||||
#SBATCH --nodes=1
|
#SBATCH --nodes=1
|
||||||
#SBATCH --ntasks=4
|
#SBATCH --ntasks=4
|
||||||
#SBATCH --mem=20G
|
#SBATCH --mem=32G
|
||||||
|
|
||||||
#SBATCH --signal=USR2
|
#SBATCH --signal=USR2
|
||||||
#SBATCH --output=%x_%j.out
|
#SBATCH --output=%x_%j.out
|
||||||
|
Loading…
Reference in New Issue
Block a user