covid19/wikipedia/scripts/fetch_daily_views.py

#!/usr/bin/env python3
import argparse
import logging
import sqlite3
from datetime import datetime, timedelta
from functools import partial
from itertools import chain
from os import makedirs, mkdir, path

import requests

import digobs

if __name__ == "__main__":
    # Script entry point: for every project listed in the input sqlite
    # database, request the daily page views of each of its pages for one
    # day and write the responses to a JSON file and a TSV file under
    # <output_folder>/<project>/<export_date>/.

    parser = argparse.ArgumentParser(description="Get a list of pages related to COVID19, pandemic, and SARS-COV2 virus related entities.")
    parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
    parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=digobs.get_loglevel)
    parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=argparse.FileType('a'))
    parser.add_argument('-d', '--query_date', help='Date if not yesterday, in YYYYMMDD format.', type=lambda s: datetime.strptime(s, "%Y%m%d"))

    parser.add_argument('-i', '--input_file', help="Input a file of page names from the English Wikiproject.", type=argparse.FileType('r'), default='./wikipedia/resources/enwp_wikiproject_covid19_articles.txt')

    parser.add_argument('-b', '--input_db', help="Input a path to a sqlite3 database from the real-time-covid-tracker project", type=sqlite3.connect, default='real-time-wiki-covid-tracker/AllWikidataItems.sqlite')

    args = parser.parse_args()

    # argparse has already opened the connection (type=sqlite3.connect);
    # sqlite3.Row lets the queries below address columns by name.
    conn = args.input_db
    conn.row_factory = sqlite3.Row

    # handle -d: the day whose views are requested defaults to yesterday.
    query_day = args.query_date or (datetime.today() - timedelta(days=1))
    query_date = query_day.strftime("%Y%m%d")

    # NOTE(review): export_date was referenced but never defined in the
    # original script. It is assumed here to be the day the export runs
    # (today), as opposed to query_date -- confirm.
    export_date = datetime.today().strftime("%Y%m%d")

    digobs.init_logging(args)

    logging.info(f"Destructively outputting results to {args.output_folder}")

    successes = 0
    failures = 0

    try:
        # 1 Load up the list of projects
        logging.info("loading info from database")
        project_rows = conn.execute("SELECT DISTINCT project from pagesPerProjectTable;").fetchall()
        projects = [row['project'] for row in project_rows]

        for project in projects:
            project_folder = path.join(args.output_folder, project)
            dump_folder = path.join(project_folder, export_date)
            # makedirs creates the output folder and the project folder as
            # needed and does not fail when the directories already exist.
            makedirs(dump_folder, exist_ok=True)

            logging.info(f"Getting page views for {project}")
            # Bind the project name as a parameter instead of formatting it
            # into the SQL text, so quotes in a name cannot break the query.
            rows = conn.execute(
                "SELECT DISTINCT page from pagesPerProjectTable WHERE project=?;",
                (project,),
            ).fetchall()
            pages = (row['page'] for row in rows)

            # special case for english, we have a wikiproject input file
            if project == "en.wikipedia":
                # Blank lines would otherwise be sent to the API as empty
                # page titles, so they are skipped.
                wikiproject_pages = (title for title in map(str.strip, args.input_file) if title)
                pages = chain(pages, wikiproject_pages)

            call_view_api = partial(digobs.call_view_api, project=project, query_date=query_date)

            # map() is lazy: the API calls happen while the responses are
            # consumed inside process_view_responses below.
            responses = map(call_view_api, pages)

            # NOTE(review): the original referenced undefined j_output_folder /
            # t_output_folder; both files are written to dump_folder here,
            # which is the directory created above -- confirm.
            j_outfilename = path.join(dump_folder, f"digobs_covid19_{project}_dailyviews-{query_date}.json")
            t_outfilename = path.join(dump_folder, f"digobs_covid19_{project}_dailyviews-{query_date}.tsv")

            # Explicit UTF-8 so non-ASCII page titles are written the same
            # way regardless of the platform's default encoding.
            with open(j_outfilename, 'w', encoding="utf-8") as j_outfile, \
                 open(t_outfilename, 'w', encoding="utf-8") as t_outfile:

                proj_successes, proj_failures = digobs.process_view_responses(responses, j_outfile, t_outfile, logging)

            logging.info(f"Processed {proj_successes} successes and {proj_failures} failures for {project}")
            successes += proj_successes
            failures += proj_failures
    finally:
        # Release the handles argparse opened, even when a project fails.
        conn.close()
        args.input_file.close()

    # f_Out = outputPath + "dailyviews" + query_date + ".feather"
    # read the json back in and make a feather file?
    logging.debug(f"Run complete at {datetime.now()}")
    logging.info(f"Processed {successes} successful URLs and {failures} failures.")
for testing 2020-03-27 23:00:36 +00:00			`#!/usr/bin/env python3`
			`import argparse`
monitor pages from dsaez's wikidata crawler 2020-04-04 22:23:33 +00:00			`import sqlite3`
			`import requests`
			`from datetime import datetime, timedelta`
adds in new logging capability 2020-03-29 01:46:35 +00:00			`import logging`
changes in response to code review by nate - moved some common functions into files - other smaller changes 2020-04-01 22:16:34 +00:00			`import digobs`
monitor pages from dsaez's wikidata crawler 2020-04-04 22:23:33 +00:00			`from os import path, mkdir`
			`from functools import partial`
			`from itertools import chain`
for testing 2020-03-27 23:00:36 +00:00
monitor pages from dsaez's wikidata crawler 2020-04-04 22:23:33 +00:00			`if __name__ == "__main__":`

			`parser = argparse.ArgumentParser(description="Get a list of pages related to COVID19, pandemic, and SARS-COV2 virus related entities.")`
renamed the wikipedia_views module to wikipedia 2020-04-01 14:51:20 +00:00			`parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)`
monitor pages from dsaez's wikidata crawler 2020-04-04 22:23:33 +00:00			`parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=digobs.get_loglevel)`
			`parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=argparse.FileType('a'))`
			`parser.add_argument('-d', '--query_date', help='Date if not yesterday, in YYYYMMDD format.', type=lambda s: datetime.strptime(s, "%Y%m%d"))`
for testing 2020-03-27 23:00:36 +00:00
monitor pages from dsaez's wikidata crawler 2020-04-04 22:23:33 +00:00			`parser.add_argument('-i', '--input_file', help="Input a file of page names from the English Wikiproject.", type=argparse.FileType('r'), default='./wikipedia/resources/enwp_wikiproject_covid19_articles.txt')`
for testing 2020-03-27 23:00:36 +00:00
monitor pages from dsaez's wikidata crawler 2020-04-04 22:23:33 +00:00			`parser.add_argument('-b', '--input_db', help="Input a path to a sqlite3 database from the real-time-covid-tracker project", type = sqlite3.connect, default='real-time-wiki-covid-tracker/AllWikidataItems.sqlite')`
for testing 2020-03-27 23:00:36 +00:00
monitor pages from dsaez's wikidata crawler 2020-04-04 22:23:33 +00:00			`args = parser.parse_args()`
			`conn = args.input_db`
			`conn.row_factory = sqlite3.Row`
many bug fixes 2020-03-28 00:24:18 +00:00
adds in new logging capability 2020-03-29 01:46:35 +00:00			`#handle -d`
changes to a bunch of the wikipedia view code - Renamed the articles.txt to something more specific Changes to both scripts: - Updated filenames to match the new standard - Reworked the logging code so that it can write to stderr by default. Because we can only call logging.basicConfig() once, this eneded up being a bigger changes. - Caused scripts to output git commits and export to track which code produced which dataset. - Caused programs to take files instead of directories as output (allows us to run programs more than once a day). Changes to the wikipedia_views/scripts/fetch_daily_views.py: - Change output that it outputs a sequence of JSON dictionaries (one per line) as per the standard we agreed to and which is what Twitter, Github, and other dumps do. Previous behavior was to create output a single JSON list object. - A number of other small changes and tweaks throughout. 2020-04-01 14:15:12 +00:00			`if args.query_date:`
monitor pages from dsaez's wikidata crawler 2020-04-04 22:23:33 +00:00			`query_date = args.query_date.strftime("%Y%m%d")`
for testing 2020-03-27 23:00:36 +00:00			`else:`
monitor pages from dsaez's wikidata crawler 2020-04-04 22:23:33 +00:00			`yesterday = datetime.today() - timedelta(days=1)`
address confusion with date The timestamps in files should be the day that the exports are done. For the view data, the query date needs to be the day before but this shouldn't be the timestamp we use in files, etc. 2020-04-01 20:14:05 +00:00			`query_date = yesterday.strftime("%Y%m%d")`
many bug fixes 2020-03-28 00:24:18 +00:00
monitor pages from dsaez's wikidata crawler 2020-04-04 22:23:33 +00:00			`digobs.init_logging(args)`
adds in new logging capability 2020-03-29 01:46:35 +00:00
monitor pages from dsaez's wikidata crawler 2020-04-04 22:23:33 +00:00			`logging.info(f"Destructively outputting results to {args.output_folder}")`
for testing 2020-03-27 23:00:36 +00:00
changes to a bunch of the wikipedia view code - Renamed the articles.txt to something more specific Changes to both scripts: - Updated filenames to match the new standard - Reworked the logging code so that it can write to stderr by default. Because we can only call logging.basicConfig() once, this eneded up being a bigger changes. - Caused scripts to output git commits and export to track which code produced which dataset. - Caused programs to take files instead of directories as output (allows us to run programs more than once a day). Changes to the wikipedia_views/scripts/fetch_daily_views.py: - Change output that it outputs a sequence of JSON dictionaries (one per line) as per the standard we agreed to and which is what Twitter, Github, and other dumps do. Previous behavior was to create output a single JSON list object. - A number of other small changes and tweaks throughout. 2020-04-01 14:15:12 +00:00			`#1 Load up the list of article names`
makes TSV makes JSON 2020-03-28 01:08:43 +00:00
monitor pages from dsaez's wikidata crawler 2020-04-04 22:23:33 +00:00			`logging.info("loading info from database")`
			`projects = [row['project'] for row in conn.execute("SELECT DISTINCT project from pagesPerProjectTable;").fetchall()]`

			`successes = 0`
			`failures = 0`
for testing 2020-03-27 23:00:36 +00:00
monitor pages from dsaez's wikidata crawler 2020-04-04 22:23:33 +00:00			`for project in projects:`
			`project_folder = path.join(args.output_folder, project)`
			`if not path.exists(project.folder):`
			`mkdir(project_folder)`
Merge branch 'kaylea/master' of github.com:CommunityDataScienceCollective/COVID-19_Digital_Observatory into kaylea/master 2020-03-28 21:12:36 +00:00
monitor pages from dsaez's wikidata crawler 2020-04-04 22:23:33 +00:00			`dump_folder = path.join(projct.folder, export_date)`
			`if not path.exists(dump_folder):`
			`mkdir(dump_folder)`
Merge branch 'kaylea/master' of github.com:CommunityDataScienceCollective/COVID-19_Digital_Observatory into kaylea/master 2020-03-28 21:12:36 +00:00
monitor pages from dsaez's wikidata crawler 2020-04-04 22:23:33 +00:00			`logging.info(f"Getting page views for {project}")`
			`rows = conn.execute(f"SELECT DISTINCT page from pagesPerProjectTable WHERE project='{project}';").fetchall()`
			`pages = (row['page'] for row in rows)`
Merge branch 'kaylea/master' of github.com:CommunityDataScienceCollective/COVID-19_Digital_Observatory into kaylea/master 2020-03-28 21:12:36 +00:00
monitor pages from dsaez's wikidata crawler 2020-04-04 22:23:33 +00:00			`# special case for english, we have a wikiproject input file`
			`if project == "en.wikipedia":`
			`pages = chain(pages, map(str.strip,args.input_file))`
Merge branch 'kaylea/master' of github.com:CommunityDataScienceCollective/COVID-19_Digital_Observatory into kaylea/master 2020-03-28 21:12:36 +00:00
monitor pages from dsaez's wikidata crawler 2020-04-04 22:23:33 +00:00			`call_view_api = partial(digobs.call_view_api, project=project, query_date = query_date)`
for testing 2020-03-27 23:00:36 +00:00
monitor pages from dsaez's wikidata crawler 2020-04-04 22:23:33 +00:00			`responses = map(call_view_api, pages)`

			`j_outfilename = path.join(j_output_folder, f"digobs_covid19_{project}_dailyviews-{query_date}.json")`
			`t_outfilename = path.join(t_output_folder, f"digobs_covid19_{project}_dailyviews-{query_date}.tsv")`
adds in new logging capability 2020-03-29 01:46:35 +00:00
monitor pages from dsaez's wikidata crawler 2020-04-04 22:23:33 +00:00			`with open(j_outfilename, 'w') as j_outfile, \`
			`open(t_outfilename, 'w') as t_outfile:`
for testing 2020-03-27 23:00:36 +00:00
monitor pages from dsaez's wikidata crawler 2020-04-04 22:23:33 +00:00			`proj_successes, proj_failures = digobs.process_view_responses(responses, j_outfile, t_outfile, logging)`
			`logging.info(f"(Processed {proj_successes} successes and {proj_failures} for {project}")`
			`successes = proj_successes + successes`
			`failures = proj_failures + failures`
for testing 2020-03-27 23:00:36 +00:00
monitor pages from dsaez's wikidata crawler 2020-04-04 22:23:33 +00:00			`conn.close()`
address confusion with date The timestamps in files should be the day that the exports are done. For the view data, the query date needs to be the day before but this shouldn't be the timestamp we use in files, etc. 2020-04-01 20:14:05 +00:00			`# f_Out = outputPath + "dailyviews" + query_date + ".feather"`
A few suggestions for the python script: - using format strings (f-strings) is a nice way in python to build strings using variables. - you can read and process a file in one pass if you iterate over the open file itself instead of reading it into a variable and then looping - i had to change your strip code when i stopped using csv reader - my python linter and auto-formater hate non-indendent comments - i added a few lines to print cases where we don't get Ok responses. 2020-03-28 03:27:02 +00:00			`# read the json back in and make a feather file?`
monitor pages from dsaez's wikidata crawler 2020-04-04 22:23:33 +00:00			`logging.debug(f"Run complete at {datetime.now()}")`
			`logging.info(f"Processed {successes} successful URLs and {failures} failures.")`
for testing 2020-03-27 23:00:36 +00:00
monitor pages from dsaez's wikidata crawler 2020-04-04 22:23:33 +00:00