initial loading of activity_data
This commit is contained in:
parent
b81af11776
commit
49eaade666
8
.gitignore
vendored
8
.gitignore
vendored
@ -1,2 +1,10 @@
|
|||||||
# ignore the R studio docker image needed by hyak
|
# ignore the R studio docker image needed by hyak
|
||||||
rstudio_latest.sif
|
rstudio_latest.sif
|
||||||
|
|
||||||
|
|
||||||
|
# do not need to include any R items
|
||||||
|
.Rhistory
|
||||||
|
.cache/
|
||||||
|
.config/
|
||||||
|
.local/
|
||||||
|
|
||||||
|
37
bot_activity_analysis/bot_activity_exploration.R
Normal file
37
bot_activity_analysis/bot_activity_exploration.R
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
library(tidyverse)
|
||||||
|
# data directory: /gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts
|
||||||
|
# load in the partitioned directories
|
||||||
|
library(dplyr)
|
||||||
|
# Directories holding the bot activity count CSV partitions. The names
# suggest monthly, yearly, and single-file groupings -- TODO confirm.
monthly_file_dir <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts/011625_dab_monthly/"
yearly_file_dir <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts/011625_dab_yearly/"
single_file_dir <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/bot_activity_counts/011625_dab_single/"

# Names assigned, in order, to the columns of each header-less CSV file.
column_names <- c("wiki_db", "date", "event_entity", "event_action", "count")
|
||||||
|
# define a function to combine the multiple csv files in each directory
|
||||||
|
# Read every CSV file in a directory and stack the rows into one data frame.
#
# Arguments:
#   directory    - path of the directory to scan (not recursive).
#   column_names - character vector assigned as the column names of each
#                  file; the files are read as header-less.
#
# Returns a data frame containing the rows of all matching files, in the
# order `list.files()` reports them. Returns NULL (with a warning) when the
# directory contains no file whose name ends in ".csv".
consolidate_csv <- function(directory, column_names) {
  # `pattern` is a regular expression, not a shell glob. Anchor on the
  # literal ".csv" extension so names that merely contain "csv"
  # (e.g. "part-0.csv.crc") are not read as data.
  file_list <- list.files(
    path = directory,
    pattern = "\\.csv$",
    full.names = TRUE
  )

  if (length(file_list) == 0) {
    warning("no .csv files found in: ", directory, call. = FALSE)
    return(NULL)
  }

  df_list <- lapply(file_list, function(file) {
    df <- read.csv(file, header = FALSE)
    colnames(df) <- column_names
    df
  })

  do.call(rbind, df_list)
}
|
||||||
|
#apply the function to our three directories of data
|
||||||
|
# Load the monthly, yearly, and single partition directories (in that
# order) and stack them into one data frame.
partition_dirs <- list(monthly_file_dir, yearly_file_dir, single_file_dir)
partition_dfs <- lapply(
  partition_dirs,
  consolidate_csv,
  column_names = column_names
)
combined_df <- do.call(rbind, partition_dfs)

# Drop the intermediates so only the combined table stays in memory.
rm(partition_dirs, partition_dfs)

# Coerce each column to its analysis type: identifiers and event labels
# become factors, `date` a Date, and `count` a numeric.
combined_df <- combined_df |>
  mutate(
    across(c(wiki_db, event_entity, event_action), as.factor),
    date = as.Date(date),
    count = as.numeric(count)
  )
|
@ -19,7 +19,7 @@
|
|||||||
# jobID assigned by SLURM when our job is submitted.
|
# jobID assigned by SLURM when our job is submitted.
|
||||||
|
|
||||||
RSTUDIO_CWD="/mmfs1/home/mjilg/git/mw-lifecycle-analysis"
|
RSTUDIO_CWD="/mmfs1/home/mjilg/git/mw-lifecycle-analysis"
|
||||||
RSTUDIO_SIF="/mmfs1/home/mjilg/rstudio_latest.sif"
|
RSTUDIO_SIF="rstudio_latest.sif"
|
||||||
|
|
||||||
# Create temp directory for ephemeral content to bind-mount in the container
|
# Create temp directory for ephemeral content to bind-mount in the container
|
||||||
RSTUDIO_TMP=$(/usr/bin/python3 -c 'import tempfile; print(tempfile.mkdtemp())')
|
RSTUDIO_TMP=$(/usr/bin/python3 -c 'import tempfile; print(tempfile.mkdtemp())')
|
||||||
|
Loading…
Reference in New Issue
Block a user