exploratory data analysis
This commit is contained in:
		
							parent
							
								
									49eaade666
								
							
						
					
					
						commit
						27a9781564
					
				| @ -28,10 +28,97 @@ rm(yearly_df) | ||||
| rm(single_df) | ||||
| #making sure data columns are of the right type | ||||
| combined_df <- combined_df |>  | ||||
|   filter(date >= as.Date("2010-01-01")) |> # pre-2010 data is incomplete | ||||
|   mutate(  | ||||
|     wiki_db = as.factor(wiki_db), | ||||
|     date = as.Date(date), | ||||
|     date = as.Date(date, format = "%Y-%m-%d"), | ||||
|     event_entity = as.factor(event_entity), | ||||
|     event_action = as.factor(event_action), | ||||
|     count = as.numeric(count) | ||||
|   ) | ||||
|   ) | ||||
| combined_df <- combined_df[order(combined_df$date),] | ||||
| # autocorrelation check; 1-26-2025: there doesn't seem to be much autocorrelation | ||||
| acf(combined_df$count) | ||||
| #multicollinearity | ||||
| 
 | ||||
| #icc | ||||
| library(lme4) | ||||
| library(performance) | ||||
| 
 | ||||
| null_model <- lmer(count ~ (1|wiki_db), data = combined_df) | ||||
| summary(null_model) | ||||
| icc(null_model) # low ICC: 0.044 | ||||
| 
 | ||||
| #naive_model <- lmer(count ~ date + (1|wiki_db), data = combined_df) | ||||
| #summary(naive_model) | ||||
| #icc(naive_model)# low ICC: 0.044 | ||||
| 
 | ||||
| #variance-to-mean | ||||
| count_var <- var(combined_df$count) #164655364.926 | ||||
| count_mean <- mean(combined_df$count) #628.119 | ||||
| count_var_to_mean <- count_var / count_mean #262140.471 | ||||
| 
 | ||||
| # the mean and median count values for each date | ||||
| summary_df <- combined_df |> | ||||
|   group_by(date) |> | ||||
|   summarize( | ||||
|     mean_count = mean(count), | ||||
|     median_count = median(count) | ||||
|   ) | ||||
| #plotting it | ||||
| p1 <- ggplot(summary_df, aes(x = date, y = median_count)) + | ||||
|   geom_line(color = "blue") +     # Line plot | ||||
|   geom_point(color = "red") +     # Points on the line | ||||
|   geom_vline(xintercept = as.Date("2013-07-01"), linetype = "dashed", color = "black") + | ||||
|   labs(title = "Median Bot Actions", | ||||
|        x = "Date", | ||||
|        y = "Median Count") + | ||||
|   theme_minimal()     | ||||
| p1 | ||||
| p1_5 <- ggplot(summary_df, aes(x = date)) + | ||||
|   geom_smooth(aes(y = median_count), method = "loess", color = "red", fill = "red", alpha = 0.2, se = TRUE) + | ||||
|   #geom_point(color = "red") +     # Points on the line | ||||
|   labs(title = "Median Bot Actions", | ||||
|        x = "Date", | ||||
|        y = "Median Count") + | ||||
|   theme_minimal()     | ||||
| p1_5 | ||||
| 
 | ||||
| #plotting mean count values | ||||
| p2 <- ggplot(summary_df, aes(x = date)) + | ||||
|   geom_smooth(aes(y = mean_count), method = "loess", color = "blue", fill = "blue", alpha = 0.2, se = TRUE) + | ||||
|   labs(title = "Mean Bot Actions", | ||||
|        x = "Date", | ||||
|        y = "Mean Count") + | ||||
|   theme_minimal()     | ||||
| p2 | ||||
| 
 | ||||
| 
 | ||||
| large_wikis <- c("wikidatawiki", "commonswiki", "enwiki") | ||||
| medium_wikis <- c("dewiki", "frwiki", "eswiki", "itwiki", "ruwiki", "jawiki", "viwiki", "zhwiki",  | ||||
|                   "ptwiki", "enwiktionary", "plwiki", "nlwiki", "svwiki", "metawiki", "arwiki",  | ||||
|                   "shwiki", "cebwiki", "mgwiktionary", "fawiki", "frwiktionary", "ukwiki",  | ||||
|                   "hewiki", "kowiki", "srwiki", "trwiki", "loginwiki", "huwiki", "cawiki",  | ||||
|                   "nowiki", "mediawikiwiki", "fiwiki", "cswiki", "idwiki", "rowiki", "enwikisource",  | ||||
|                   "frwikisource", "ruwiktionary", "dawiki", "bgwiki", "incubatorwiki",  | ||||
|                   "enwikinews", "specieswiki", "thwiki" | ||||
| ) | ||||
| small_wiki_summary_df <- combined_df |> | ||||
|   filter(!wiki_db %in% large_wikis)|> | ||||
|   filter(!wiki_db %in% medium_wikis)|> | ||||
|   group_by(date) |> | ||||
|   summarize( | ||||
|     mean_count = mean(count), | ||||
|     median_count = median(count) | ||||
|   ) | ||||
| ps2 <- ggplot(small_wiki_summary_df, aes(x = date)) + | ||||
|   geom_smooth(aes(y = mean_count), method = "loess", color = "blue", fill = "blue", alpha = 0.2, se = TRUE) + | ||||
|   geom_vline(xintercept = as.Date("2016-03-01"), linetype = "dashed", color = "black") + | ||||
|   labs(title = "Mean Bot Actions (small, single file wikis)", | ||||
|        x = "Date", | ||||
|        y = "Mean Count") + | ||||
|   theme_minimal()     | ||||
| ps2 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|  | ||||
							
								
								
									
										
											BIN
										
									
								
								plots/exploratory/01262025_mean_bot_actions_smooth.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								plots/exploratory/01262025_mean_bot_actions_smooth.png
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 25 KiB | 
							
								
								
									
										
											BIN
										
									
								
								plots/exploratory/01262025_mean_excluding_bot_action.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								plots/exploratory/01262025_mean_excluding_bot_action.png
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 33 KiB | 
							
								
								
									
										
											BIN
										
									
								
								plots/exploratory/01262025_median_action_point_plot.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								plots/exploratory/01262025_median_action_point_plot.png
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 60 KiB | 
							
								
								
									
										
											BIN
										
									
								
								plots/exploratory/01262025_small_wiki.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								plots/exploratory/01262025_small_wiki.png
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 31 KiB | 
							
								
								
									
										
											BIN
										
									
								
								plots/exploratory/01262025_small_wiki_mean_actions.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								plots/exploratory/01262025_small_wiki_mean_actions.png
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 31 KiB | 
| @ -3,10 +3,10 @@ | ||||
| #SBATCH --job-name=mgaughan-rstudio-server | ||||
| #SBATCH --partition=cpu-g2-mem2x | ||||
| 
 | ||||
| #SBATCH --time=02:00:00 | ||||
| #SBATCH --time=03:00:00 | ||||
| #SBATCH --nodes=1 | ||||
| #SBATCH --ntasks=4 | ||||
| #SBATCH --mem=20G | ||||
| #SBATCH --mem=32G | ||||
| 
 | ||||
| #SBATCH --signal=USR2 | ||||
| #SBATCH --output=%x_%j.out | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user