initial loading of activity_data
This commit is contained in:
		
							parent
							
								
									b81af11776
								
							
						
					
					
						commit
						49eaade666
					
				
							
								
								
									
										8
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										8
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @ -1,2 +1,10 @@ | |||||||
| # ignore the R studio docker image needed by hyak  | # ignore the R studio docker image needed by hyak  | ||||||
| rstudio_latest.sif | rstudio_latest.sif | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # do not need to include any R items  | ||||||
|  | .Rhistory | ||||||
|  | .cache/ | ||||||
|  | .config/ | ||||||
|  | .local/ | ||||||
|  | 
 | ||||||
|  | |||||||
							
								
								
									
										37
									
								
								bot_activity_analysis/bot_activity_exploration.R
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								bot_activity_analysis/bot_activity_exploration.R
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,37 @@ | |||||||
|  | library(tidyverse) | ||||||
|  | # data directory: /gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts | ||||||
|  | # load in the partitioned directories | ||||||
|  | library(dplyr) | ||||||
# Input directories, one per partitioning of the bot activity counts.
# Each is passed to list.files() to locate the CSV part files it contains.
monthly_file_dir <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts/011625_dab_monthly/"
yearly_file_dir <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts/011625_dab_yearly/"
single_file_dir <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts/011625_dab_single/"
# Column names applied to every file, which is read with header = FALSE.
column_names <- c("wiki_db", "date", "event_entity", "event_action", "count")
#' Combine every CSV file in a directory into one data frame.
#'
#' @param directory Path of the directory to scan (not recursive).
#' @param column_names Character vector assigned as the column names of each
#'   file; its length must match the number of columns in every file.
#' @return A data frame with the rows of all files stacked in the order
#'   returned by `list.files()`, or `NULL` (with a warning) when the directory
#'   contains no `.csv` files.
consolidate_csv <- function(directory, column_names) {
  # `pattern` is a regular expression, not a glob: "*.csv" would also match
  # names such as "part.csv.bak", so anchor on a literal ".csv" suffix.
  file_list <- list.files(
    path = directory, pattern = "\\.csv$", full.names = TRUE
  )
  if (length(file_list) == 0) {
    # list.files() is silent for a missing or empty directory; surface it.
    warning("no .csv files found in ", directory, call. = FALSE)
    return(NULL)
  }
  df_list <- lapply(file_list, function(file) {
    # header = FALSE: the first line is data, so names are set explicitly.
    df <- read.csv(file, header = FALSE)
    colnames(df) <- column_names
    df
  })
  do.call(rbind, df_list)
}
# Read each of the three partitioned directories into its own data frame.
monthly_df <- consolidate_csv(monthly_file_dir, column_names)
yearly_df <- consolidate_csv(yearly_file_dir, column_names)
single_df <- consolidate_csv(single_file_dir, column_names)

# Stack the three partitions, then drop the intermediates from the workspace.
combined_df <- rbind(monthly_df, yearly_df, single_df)
rm(monthly_df, yearly_df, single_df)

# Coerce every column to its intended type: identifiers become factors,
# `date` becomes a Date and `count` becomes numeric.
combined_df <- combined_df |>
  mutate(
    across(c(wiki_db, event_entity, event_action), as.factor),
    date = as.Date(date),
    count = as.numeric(count)
  )
| @ -19,7 +19,7 @@ | |||||||
| # jobID assigned by SLURM when our job is submitted. | # jobID assigned by SLURM when our job is submitted. | ||||||
| 
 | 
 | ||||||
| RSTUDIO_CWD="/mmfs1/home/mjilg/git/mw-lifecycle-analysis" | RSTUDIO_CWD="/mmfs1/home/mjilg/git/mw-lifecycle-analysis" | ||||||
| RSTUDIO_SIF="/mmfs1/home/mjilg/rstudio_latest.sif" | RSTUDIO_SIF="rstudio_latest.sif" | ||||||
| 
 | 
 | ||||||
| # Create temp directory for ephemeral content to bind-mount in the container | # Create temp directory for ephemeral content to bind-mount in the container | ||||||
| RSTUDIO_TMP=$(/usr/bin/python3 -c 'import tempfile; print(tempfile.mkdtemp())') | RSTUDIO_TMP=$(/usr/bin/python3 -c 'import tempfile; print(tempfile.mkdtemp())') | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user