From c97028fabba9be875ff81ba8ee89f78a9b623879 Mon Sep 17 00:00:00 2001
From: Nathan TeBlunthuis
Date: Sat, 4 Apr 2020 15:20:34 -0700
Subject: [PATCH] update cron scripts with new data format

---
Review fixups applied to the added lines of cron-wikipedia_revisions.sh
(line counts unchanged): corrected the /var/www/covid9 typo, create the
destination's parent directory instead of a directory named after the file,
quote "$line" and read it with -r, and stop if `cd wikipedia/data` fails.
Blank context lines were re-derived from the hunk counts after the patch's
newlines had been lost.

 cron-wikipedia_revisions.sh | 25 +++++++++++++++++++------
 cron-wikipedia_views.sh     | 21 ++++++++++++++++-----
 2 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/cron-wikipedia_revisions.sh b/cron-wikipedia_revisions.sh
index 5ba02ed..555fd46 100644
--- a/cron-wikipedia_revisions.sh
+++ b/cron-wikipedia_revisions.sh
@@ -6,12 +6,25 @@ date_string=$(date +%Y%m%d)
 revs_log="enwp-revisions-${date_string}.log"
 ./wikipedia/scripts/wikiproject_scraper.py 2> >(tee wikipedia/logs/${revs_log})
 
-./wikipedia/scripts/fetch_enwiki_revisions.py 2> >(tee -a wikipedia/logs/${revs_log})
+wd_log="wd-page-crawler-${date_string}.log"
+python3 ./real-time-wiki-covid-tracker/PageCrawler.py -a "./wikipedia/resources/enwp_wikiproject_covid19_articles.txt" 2> >(tee wikipedia/logs/${wd_log})
+
+./wikipedia/scripts/fetch_revisions.py 2> >(tee -a wikipedia/logs/${revs_log})
 mv wikipedia/logs/${revs_log} /var/www/covid19/wikipedia/logs/
 
-revs_tsv="digobs_covid19-wikipedia-enwiki_revisions-${date_string}.tsv"
-mv wikipedia/data/${revs_tsv} /var/www/covid19/wikipedia
+python3 ./wikipedia/scripts/copy_revisions_data.py ${date_string}
 
-revs_json="digobs_covid19-wikipedia-enwiki_revisions-${date_string}.json"
-xz wikipedia/data/${revs_json}
-mv wikipedia/data/${revs_json}.xz /var/www/covid19/wikipedia
+cd wikipedia/data || exit 1  # the relative globs below are only valid from here
+xz */${date_string}/*revisions*.json
+
+find */${date_string}/*revisions*.xz | while IFS= read -r line; do
+  mkdir -p "/var/www/covid19/wikipedia/$(dirname "$line")"  # parent dir, not the file path
+  mv "$line" "/var/www/covid19/wikipedia/$line"
+done
+
+find */${date_string}/*revisions*.tsv | while IFS= read -r line; do
+  mkdir -p "/var/www/covid19/wikipedia/$(dirname "$line")"  # parent dir, not the file path
+  mv "$line" "/var/www/covid19/wikipedia/$line"
+done
+
+cd ../..
diff --git a/cron-wikipedia_views.sh b/cron-wikipedia_views.sh
index 4afe380..851cc25 100644
--- a/cron-wikipedia_views.sh
+++ b/cron-wikipedia_views.sh
@@ -3,14 +3,25 @@
 TZ="UTC"
 date_string=${OVERRIDE_DATE_STRING:-$(date +%Y%m%d)}
 
-view_log="enwp-daily_views-${date_string}.log"
+view_log="daily_views-${date_string}.log"
 ./wikipedia/scripts/wikiproject_scraper.py 2> >(tee wikipedia/logs/${view_log})
 
+wd_log="wd-page-crawler-${date_string}.log"
+python3 ./real-time-wiki-covid-tracker/PageCrawler.py -a "./wikipedia/resources/enwp_wikiproject_covid19_articles.txt" 2> >(tee wikipedia/logs/${wd_log})
+
 # get the list of files
-./wikipedia/scripts/fetch_enwiki_daily_views.py -d "${date_string}" 2> >(tee -a wikipedia/logs/${view_log})
+./wikipedia/scripts/fetch_daily_views.py -d "${date_string}" 2> >(tee -a wikipedia/logs/${view_log})
 
 mv wikipedia/logs/${view_log} /var/www/covid19/wikipedia/logs/${view_log}
 
-mv wikipedia/data/digobs_covid19-wikipedia-enwiki_dailyviews-${date_string}.tsv /var/www/covid19/wikipedia/
-# xz wikipedia/data/digobs_covid19-wikipedia-enwiki_dailyviews-${date_string}.json
-mv wikipedia/data/digobs_covid19-wikipedia-enwiki_dailyviews-${date_string}.json /var/www/covid19/wikipedia/
+cd wikipedia/data || exit 1  # the relative globs below are only valid from here
+find */${date_string}/*dailyviews*.tsv | while IFS= read -r line; do
+  mkdir -p "/var/www/covid19/wikipedia/$(dirname "$line")"  # parent dir, not the file path
+  mv "$line" "/var/www/covid19/wikipedia/$line"
+done
+find */${date_string}/*dailyviews*.json | while IFS= read -r line; do
+  mkdir -p "/var/www/covid19/wikipedia/$(dirname "$line")"  # parent dir, not the file path
+  mv "$line" "/var/www/covid19/wikipedia/$line"
+done
+
+cd ../..