# Filter the count data down to projects that appear in the manifest.
#
# Reads the count CSV and the manifest CSV, drops every count row whose
# `project_id` has no matching `repo_id` in the manifest, reports the ids
# that failed to match in either direction, and writes the filtered rows
# to a new CSV.

library(dplyr)

# Paths ----
metadata_dir <-
  "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata"
count_path <- file.path(
  metadata_dir,
  "co_0205_CONTRIBUTING_weekly_count_data.csv"
)
manifest_path <- "validation/020525_CONTRIBUTING_manifest.csv"
output_path <- file.path(
  metadata_dir,
  "final_0207_CONTRIBUTING_weekly_count_data.csv"
)

# Load ----
count_df <- read.csv(count_path)
manifest_df <- read.csv(manifest_path)

# Fail loudly if a key column is absent. Without this check a missing
# `repo_id` column would evaluate to NULL, every project would be treated as
# missing from the manifest, and an empty file would be written silently.
stopifnot(
  "project_id" %in% names(count_df),
  "repo_id" %in% names(manifest_df)
)

# Projects present in the counts but absent from the manifest ----
# Explicit print() so the diagnostics also appear when run via source().
print(length(unique(count_df[["project_id"]])))

unique_project_ids <- unique(count_df[["project_id"]])
missing_project_ids <- setdiff(unique_project_ids, manifest_df[["repo_id"]])
print(missing_project_ids)

# Keep only rows whose project is listed in the manifest ----
count_df_filtered <- count_df %>%
  filter(!project_id %in% missing_project_ids)
print(length(unique(count_df_filtered[["project_id"]])))

# Manifest entries that have no rows in the filtered counts ----
missing_repo_ids <- setdiff(
  manifest_df[["repo_id"]],
  count_df_filtered[["project_id"]]
)
message(
  length(missing_repo_ids),
  " manifest repo_id value(s) have no rows in the filtered count data"
)
print(missing_repo_ids)

# Write result ----
write.csv(count_df_filtered, output_path, row.names = FALSE)