1
0
govdoc-cr-analysis/cleaning_scripts/verifying_count_on_manifest.R
2025-02-07 10:10:31 -08:00

20 lines
834 B
R

library(dplyr)
# Cross-check the weekly CONTRIBUTING count data against the validation
# manifest: report the project ids that the manifest does not list, drop their
# rows from the counts, report manifest repos that have no count rows, and
# write the filtered counts to a new CSV.

# Inputs and output ----
count_path <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/co_0205_CONTRIBUTING_weekly_count_data.csv"
# NOTE: relative path, resolved against the current working directory.
manifest_path <- "validation/020525_CONTRIBUTING_manifest.csv"
output_path <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/final_0207_CONTRIBUTING_weekly_count_data.csv"

count_df <- read.csv(count_path)
manifest_df <- read.csv(manifest_path)

# Fail fast when a key column is absent. Extracting a missing column yields
# NULL; with a NULL `repo_id` every project would be reported as missing, every
# row would be filtered out, and a header-only CSV would be written silently.
if (!"project_id" %in% names(count_df)) {
  stop("`project_id` column not found in ", count_path, call. = FALSE)
}
if (!"repo_id" %in% names(manifest_df)) {
  stop("`repo_id` column not found in ", manifest_path, call. = FALSE)
}

# Projects in the counts that the manifest does not list ----
unique_project_ids <- unique(count_df[["project_id"]])
# Explicit print() so the diagnostics also appear when the script is source()d.
print(length(unique_project_ids))

missing_project_ids <- setdiff(unique_project_ids, manifest_df[["repo_id"]])
print(missing_project_ids)

# Keep only rows whose project is present in the manifest ----
count_df_filtered <- count_df %>%
  filter(!project_id %in% missing_project_ids)
print(length(unique(count_df_filtered[["project_id"]])))

# Reverse check: manifest repos with no rows in the filtered counts ----
# Previously computed but never displayed; printed so the gap is visible.
missing_repo_ids <- setdiff(
  manifest_df[["repo_id"]],
  count_df_filtered[["project_id"]]
)
print(missing_repo_ids)

# Write the result ----
write.csv(count_df_filtered, output_path, row.names = FALSE)