#!/usr/bin/env python3
###############################################################################
#
# This script assumes the presence of the COVID-19 repo.
#
# It (1) reads in the article list and then (2) calls the Wikimedia API to
# fetch revision information for each article. Output is to (3) JSON and TSV.
#
###############################################################################
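
# Example invocation (the flag values below are this script's argparse
# defaults; the script filename itself is illustrative):
#
#   python3 wikipedia_revisions.py --output_folder wikipedia/data \
#       --input_file ./wikipedia/resources/enwp_wikiproject_covid19_articles.txt \
#       --input_db real-time-wiki-covid-tracker/AllWikidataItems.sqlite
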
import argparse
import logging
from os import path, mkdir
import json
import datetime
import sqlite3
from functools import partial
from itertools import chain
from csv import DictWriter
import digobs


def main():
    parser = argparse.ArgumentParser(description='Call the Wikimedia API to collect Wikipedia revision data.')
    parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
    parser.add_argument('-i', '--input_file', help="Input a file of page names from the English Wikiproject.", type=argparse.FileType('r'), default='./wikipedia/resources/enwp_wikiproject_covid19_articles.txt')
    parser.add_argument('-d', '--input_db', help="Input a path to a sqlite3 database from the real-time-wiki-covid-tracker project", type=sqlite3.connect, default='real-time-wiki-covid-tracker/AllWikidataItems.sqlite')
    parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=digobs.get_loglevel)
    parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=argparse.FileType('a'))

    args = parser.parse_args()
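    # NB: `logging` is rebound here from the stdlib module imported above to
    # the object returned by digobs.init_logging; all logging calls below go
    # through it.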
    logging = digobs.init_logging(args)

    conn = args.input_db
    conn.row_factory = sqlite3.Row
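    # the tracker database's pagesPerProjectTable maps each wiki project
    # (e.g. "en.wikipedia") to the pages it follows; iterate over the
    # distinct projects it contains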
    projects = (row['project'] for row in conn.execute("SELECT DISTINCT project FROM pagesPerProjectTable;").fetchall())

    tsv_fields = ['title', 'pageid', 'namespace']
    # list of properties from the API we want to gather (basically all of
    # them supported by mediawiki-utilities)
    rv_props = {'revid': 'ids',
                'timestamp': 'timestamp',
                'user': 'user',
                'userid': 'userid',
                'size': 'size',
                'sha1': 'sha1',
                'contentmodel': 'contentmodel',
                'tags': 'tags',
                'flags': 'flags',
                'comment': 'comment',
                'content': 'content'}

    def get_project_pages(project):
        return (row['page'] for row in conn.execute(f"SELECT DISTINCT page FROM pagesPerProjectTable WHERE project == '{project}';").fetchall())
2020-04-01 14:42:38 +00:00
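    # for English Wikipedia, the pages tracked in the database are combined
    # with the WikiProject COVID-19 article list passed via --input_file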
    def get_project_revisions(project):
        pages = get_project_pages(project)

        if project == "en.wikipedia":
            pages = chain(pages, map(str.strip, args.input_file))

        return digobs.get_pages_revisions(pages, project=project, logging=logging, rv_props=rv_props)

    tsv_fields = tsv_fields + list(rv_props.keys())

    exclude_from_tsv = ['tags', 'comment', 'content', 'flags']

    # drop fields that we identified for exclusion
    tsv_fields = [e for e in tsv_fields if e not in exclude_from_tsv]

    # add special export fields
    tsv_fields = tsv_fields + ['anon', 'minor', 'url', 'export_timestamp', 'export_commit']
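    # every export is stamped with the export time and the current git commit
    # (via digobs.git_hash) so output files can be traced back to the code
    # and moment that produced them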
    export_time = str(datetime.datetime.now())
    rev_batch_to_tsv = partial(digobs.rev_batch_to_tsv,
                               tsv_fields=tsv_fields,
                               export_info={'export_timestamp': export_time,
                                            'export_commit': digobs.git_hash(short=True)})

    export_info = {'git_commit': digobs.git_hash(),
                   'timestamp': export_time}

    export_date = datetime.datetime.today().strftime("%Y%m%d")

    rev_batch_to_json = partial(digobs.rev_batch_to_json,
                                export_info=export_info)
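    # write one JSON and one TSV revision dump per project under
    # <output_folder>/<project>/<export_date>/, creating the directories
    # as needed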
    def write_project_pages(project):
        project_folder = path.join(args.output_folder, project)
        if not path.exists(project_folder):
            mkdir(project_folder)

        dump_folder = path.join(project_folder, export_date)
        if not path.exists(dump_folder):
            mkdir(dump_folder)

        project_revs = get_project_revisions(project)

        json_output_filename = path.join(dump_folder, f"digobs_covid19_{project}_revisions-{export_date}.json")
        tsv_output_filename = path.join(dump_folder, f"digobs_covid19_{project}_revisions-{export_date}.tsv")

        with open(json_output_filename, 'w') as json_output, \
             open(tsv_output_filename, 'w') as tsv_output:

            tsv_writer = DictWriter(tsv_output, fieldnames=tsv_fields, delimiter="\t")
            tsv_writer.writeheader()

            for rev_batch in project_revs:
                logging.debug(f"processing raw revision: {rev_batch}")
                rev_batch_to_json(rev_batch, json_output=json_output)
                rev_batch_to_tsv(rev_batch, project=project, tsv_writer=tsv_writer)

    for project in projects:
        write_project_pages(project)

if __name__ == "__main__":
    main()