exploratory data analysis
This commit is contained in:
		
							parent
							
								
									49eaade666
								
							
						
					
					
						commit
						27a9781564
					
				| @ -28,10 +28,97 @@ rm(yearly_df) | |||||||
| rm(single_df) | rm(single_df) | ||||||
| #making sure data columns are of the right type | #making sure data columns are of the right type | ||||||
| combined_df <- combined_df |>  | combined_df <- combined_df |>  | ||||||
|  |   filter(date >= as.Date("2010-01-01")) |> # pre-2010 data is incomplete | ||||||
|   mutate(  |   mutate(  | ||||||
|     wiki_db = as.factor(wiki_db), |     wiki_db = as.factor(wiki_db), | ||||||
|     date = as.Date(date), |     date = as.Date(date, format = "%Y-%m-%d"), | ||||||
|     event_entity = as.factor(event_entity), |     event_entity = as.factor(event_entity), | ||||||
|     event_action = as.factor(event_action), |     event_action = as.factor(event_action), | ||||||
|     count = as.numeric(count) |     count = as.numeric(count) | ||||||
|   ) |   ) | ||||||
|  | combined_df <- combined_df[order(combined_df$date),] | ||||||
|  | # autoregression; 1-26-2025: doesn't seem to be that much autocorrelation | ||||||
|  | acf(combined_df$count) | ||||||
|  | #multicollinearity | ||||||
|  | 
 | ||||||
|  | #icc | ||||||
|  | library(lme4) | ||||||
|  | library(performance) | ||||||
|  | 
 | ||||||
|  | null_model <- lmer(count ~ (1|wiki_db), data = combined_df) | ||||||
|  | summary(null_model) | ||||||
|  | icc(null_model) # low ICC: 0.044 | ||||||
|  | 
 | ||||||
|  | #naive_model <- lmer(count ~ date + (1|wiki_db), data = combined_df) | ||||||
|  | #summary(naive_model) | ||||||
|  | #icc(naive_model)# low ICC: 0.044 | ||||||
|  | 
 | ||||||
|  | #variance-to-mean | ||||||
|  | count_var <- var(combined_df$count) #164655364.926 | ||||||
|  | count_mean <- mean(combined_df$count) #628.119 | ||||||
|  | count_var_to_mean <- count_var / count_mean #262140.471 | ||||||
|  | 
 | ||||||
|  | # the mean count values for each day | ||||||
|  | summary_df <- combined_df |> | ||||||
|  |   group_by(date) |> | ||||||
|  |   summarize( | ||||||
|  |     mean_count = mean(count), | ||||||
|  |     median_count = median(count) | ||||||
|  |   ) | ||||||
|  | #plotting it | ||||||
|  | p1 <- ggplot(summary_df, aes(x = date, y = median_count)) + | ||||||
|  |   geom_line(color = "blue") +     # Line plot | ||||||
|  |   geom_point(color = "red") +     # Points on the line | ||||||
|  |   geom_vline(xintercept = as.Date("2013-07-01"), linetype = "dashed", color = "black") + | ||||||
|  |   labs(title = "Median Bot Actions", | ||||||
|  |        x = "Date", | ||||||
|  |        y = "Median Count") + | ||||||
|  |   theme_minimal()     | ||||||
|  | p1 | ||||||
|  | p1_5 <- ggplot(summary_df, aes(x = date)) + | ||||||
|  |   geom_smooth(aes(y = median_count), method = "loess", color = "red", fill = "red", alpha = 0.2, se = TRUE) + | ||||||
|  |   #geom_point(color = "red") +     # Points on the line | ||||||
|  |   labs(title = "Median Bot Actions", | ||||||
|  |        x = "Date", | ||||||
|  |        y = "Median Count") + | ||||||
|  |   theme_minimal()     | ||||||
|  | p1_5 | ||||||
|  | 
 | ||||||
|  | #plotting mean count values | ||||||
|  | p2 <- ggplot(summary_df, aes(x = date)) + | ||||||
|  |   geom_smooth(aes(y = mean_count), method = "loess", color = "blue", fill = "blue", alpha = 0.2, se = TRUE) + | ||||||
|  |   labs(title = "Mean Bot Actions", | ||||||
|  |        x = "Date", | ||||||
|  |        y = "Mean Count") + | ||||||
|  |   theme_minimal()     | ||||||
|  | p2 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | large_wikis <- c("wikidatawiki", "commonswiki", "enwiki") | ||||||
|  | medium_wikis <- c("dewiki", "frwiki", "eswiki", "itwiki", "ruwiki", "jawiki", "viwiki", "zhwiki",  | ||||||
|  |                   "ptwiki", "enwiktionary", "plwiki", "nlwiki", "svwiki", "metawiki", "arwiki",  | ||||||
|  |                   "shwiki", "cebwiki", "mgwiktionary", "fawiki", "frwiktionary", "ukwiki",  | ||||||
|  |                   "hewiki", "kowiki", "srwiki", "trwiki", "loginwiki", "huwiki", "cawiki",  | ||||||
|  |                   "nowiki", "mediawikiwiki", "fiwiki", "cswiki", "idwiki", "rowiki", "enwikisource",  | ||||||
|  |                   "frwikisource", "ruwiktionary", "dawiki", "bgwiki", "incubatorwiki",  | ||||||
|  |                   "enwikinews", "specieswiki", "thwiki" | ||||||
|  | ) | ||||||
|  | small_wiki_summary_df <- combined_df |> | ||||||
|  |   filter(!wiki_db %in% large_wikis)|> | ||||||
|  |   filter(!wiki_db %in% medium_wikis)|> | ||||||
|  |   group_by(date) |> | ||||||
|  |   summarize( | ||||||
|  |     mean_count = mean(count), | ||||||
|  |     median_count = median(count) | ||||||
|  |   ) | ||||||
|  | ps2 <- ggplot(small_wiki_summary_df, aes(x = date)) + | ||||||
|  |   geom_smooth(aes(y = mean_count), method = "loess", color = "blue", fill = "blue", alpha = 0.2, se = TRUE) + | ||||||
|  |   geom_vline(xintercept = as.Date("2016-03-01"), linetype = "dashed", color = "black") + | ||||||
|  |   labs(title = "Mean Bot Actions (small, single file wikis)", | ||||||
|  |        x = "Date", | ||||||
|  |        y = "Mean Count") + | ||||||
|  |   theme_minimal()     | ||||||
|  | ps2 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | |||||||
							
								
								
									
										
											BIN
										
									
								
								plots/exploratory/01262025_mean_bot_actions_smooth.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								plots/exploratory/01262025_mean_bot_actions_smooth.png
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 25 KiB | 
							
								
								
									
										
											BIN
										
									
								
								plots/exploratory/01262025_mean_excluding_bot_action.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								plots/exploratory/01262025_mean_excluding_bot_action.png
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 33 KiB | 
							
								
								
									
										
											BIN
										
									
								
								plots/exploratory/01262025_median_action_point_plot.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								plots/exploratory/01262025_median_action_point_plot.png
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 60 KiB | 
							
								
								
									
										
											BIN
										
									
								
								plots/exploratory/01262025_small_wiki.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								plots/exploratory/01262025_small_wiki.png
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 31 KiB | 
							
								
								
									
										
											BIN
										
									
								
								plots/exploratory/01262025_small_wiki_mean_actions.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								plots/exploratory/01262025_small_wiki_mean_actions.png
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 31 KiB | 
| @ -3,10 +3,10 @@ | |||||||
| #SBATCH --job-name=mgaughan-rstudio-server | #SBATCH --job-name=mgaughan-rstudio-server | ||||||
| #SBATCH --partition=cpu-g2-mem2x | #SBATCH --partition=cpu-g2-mem2x | ||||||
| 
 | 
 | ||||||
| #SBATCH --time=02:00:00 | #SBATCH --time=03:00:00 | ||||||
| #SBATCH --nodes=1 | #SBATCH --nodes=1 | ||||||
| #SBATCH --ntasks=4 | #SBATCH --ntasks=4 | ||||||
| #SBATCH --mem=20G | #SBATCH --mem=32G | ||||||
| 
 | 
 | ||||||
| #SBATCH --signal=USR2 | #SBATCH --signal=USR2 | ||||||
| #SBATCH --output=%x_%j.out | #SBATCH --output=%x_%j.out | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user