Compare commits


4 Commits

Author | SHA1 | Message | Date
Benjamin Mako Hill | befb87c8f5 | Merge pull request #20 from makoshark/master ("bug fix: change to the correct working directory before running cron jobs") | 2020-04-07 14:45:05 -07:00
Benjamin Mako Hill | 1a27b68061 | Merge pull request #18 from CommunityDataScienceCollective/dsaez_submodule ("Add dsaez's submodule for crawling wikidata") | 2020-04-07 14:43:26 -07:00
Benjamin Mako Hill | e32e826083 | updated script to ensure the correct working dir | 2020-04-07 16:39:58 -05:00
Benjamin Mako Hill | 0c4cfcdfcf | made cronjobs executable | 2020-04-04 11:19:43 -05:00
8 changed files with 276 additions and 384 deletions

cron-wikipedia_revisions.sh (27 lines changed), Normal file → Executable file
View File

@@ -1,30 +1,21 @@
#!/bin/bash -x
WORKING_DIR="/home/SOC.NORTHWESTERN.EDU/bmh1867/covid19"
cd $WORKING_DIR
TZ="UTC"
date_string=$(date +%Y%m%d)
revs_log="enwp-revisions-${date_string}.log"
./wikipedia/scripts/wikiproject_scraper.py 2> >(tee wikipedia/logs/${revs_log})
wd_log="wd-page-crawler-${date_string}.log"
python3 ./real-time-wiki-covid-tracker/PageCrawler.py -a "./wikipedia/resources/enwp_wikiproject_covid19_articles.txt" 2> >(tee wikipedia/logs/${wd_log})
./wikipedia/scripts/fetch_revisions.py 2> >(tee -a wikipedia/logs/${revs_log})
./wikipedia/scripts/fetch_enwiki_revisions.py 2> >(tee -a wikipedia/logs/${revs_log})
mv wikipedia/logs/${revs_log} /var/www/covid19/wikipedia/logs/
python3 ./wikipedia/scripts/copy_revisions_data.py ${date_string}
revs_tsv="digobs_covid19-wikipedia-enwiki_revisions-${date_string}.tsv"
mv wikipedia/data/${revs_tsv} /var/www/covid19/wikipedia
cd wikipedia/data
xz */${date_string}/*revisions*.json
revs_json="digobs_covid19-wikipedia-enwiki_revisions-${date_string}.json"
xz wikipedia/data/${revs_json}
mv wikipedia/data/${revs_json}.xz /var/www/covid19/wikipedia
find */${date_string}/*revisions*.xz | while read line; do
    mkdir -p /var/www/covid19/wikipedia/$line
    mv $line /var/www/covid19/wikipedia/$line
done
find */${date_string}/*revisions*.tsv | while read line; do
    mkdir -p /var/www/covid19/wikipedia/$line
    mv $line /var/www/covid19/wikipedia/$line
done
cd ../..
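
A note on the change above: cron starts jobs from the invoking user's home directory, so the relative ./wikipedia/... paths in this script only resolve once it changes into WORKING_DIR itself, which is the working-directory fix described in the merge commit above. A minimal sketch of the behaviour this buys; the /tmp starting directory below is only an illustration, not something the repository does:

# The script can now be launched from any directory (as cron does) and still
# find its relative paths, because it cd's into WORKING_DIR on its own.
cd /tmp
/home/SOC.NORTHWESTERN.EDU/bmh1867/covid19/cron-wikipedia_revisions.sh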

cron-wikipedia_views.sh (24 lines changed), Normal file → Executable file
View File

@@ -1,27 +1,19 @@
#!/bin/bash -x
WORKING_DIR="/home/SOC.NORTHWESTERN.EDU/bmh1867/covid19"
cd $WORKING_DIR
TZ="UTC"
date_string=${OVERRIDE_DATE_STRING:-$(date +%Y%m%d)}
view_log="daily_views-${date_string}.log"
view_log="enwp-daily_views-${date_string}.log"
./wikipedia/scripts/wikiproject_scraper.py 2> >(tee wikipedia/logs/${view_log})
wd_log="wd-page-crawler-${date_string}.log"
python3 ./real-time-wiki-covid-tracker/PageCrawler.py -a "./wikipedia/resources/enwp_wikiproject_covid19_articles.txt" 2> >(tee wikipedia/logs/${wd_log})
# get the list of files
./wikipedia/scripts/fetch_daily_views.py -d "${date_string}" 2> >(tee -a wikipedia/logs/${view_log})
./wikipedia/scripts/fetch_enwiki_daily_views.py -d "${date_string}" 2> >(tee -a wikipedia/logs/${view_log})
mv wikipedia/logs/${view_log} /var/www/covid19/wikipedia/logs/${view_log}
mv wikipedia/data/digobs_covid19-wikipedia-enwiki_dailyviews-${date_string}.tsv /var/www/covid19/wikipedia/
cd wikipedia/data
find */${date_string}/*dailyviews*.tsv | while read line; do
    mkdir -p /var/www/covid19/wikipedia/$line
    mv $line /var/www/covid19/wikipedia/$line
done
# xz wikipedia/data/digobs_covid19-wikipedia-enwiki_dailyviews-${date_string}.json
mv wikipedia/data/digobs_covid19-wikipedia-enwiki_dailyviews-${date_string}.json /var/www/covid19/wikipedia/
find */${date_string}/*dailyviews*.json | while read line; do
    mkdir -p /var/www/covid19/wikipedia/$line
    mv $line /var/www/covid19/wikipedia/$line
done
cd ../..
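
This script now derives its date from OVERRIDE_DATE_STRING when that variable is set (see the parameter expansion near the top), falling back to today's date. A short usage sketch for backfilling a specific day; the date below is an arbitrary example, not taken from the repository:

# Re-run the daily views collection for 2020-04-01 instead of today.
OVERRIDE_DATE_STRING=20200401 ./cron-wikipedia_views.sh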

View File

@@ -1,3 +0,0 @@
\#*\#
.\#*
*~

View File

@@ -1,17 +1,8 @@
#!/usr/bin/env python3

from requests import Request
from datetime import datetime
import sys
import subprocess
import logging
import itertools
import requests
from functools import partial
from csv import DictWriter
import json
import mwapi as api

user_agent = "COVID-19 Digital Observatory, a Community Data Science Collective project. (https://github.com/CommunityDataScienceCollective/COVID-19_Digital_Observatory)"

def git_hash(short=False):
    if short:
@@ -19,67 +10,6 @@ def git_hash(short=False):
    else:
        return subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
def init_logging(args):
    #handle -W
    if args.logging_destination:
        logging.basicConfig(filename=args.logging_destination, filemode='a', level=args.logging_level)
    else:
        logging.basicConfig(level=args.logging_level)

    export_git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
    export_git_short_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
    export_time = str(datetime.now())

    logging.info(f"Starting at {export_time}.")
    logging.info(f"Last commit: {export_git_hash}")
    return(logging)

def call_view_api(page, project, query_date):
    url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/{project}/all-access/all-agents/{page}/daily/{query_date}00/{query_date}00"
    response = requests.get(url)
    if response.ok:
        return response.json().get('items', [None])[0]
    else:
        logging.warning(f"Failure: {response.status_code} from {url}")
        print(response.json().get("detail", None))
        return None

# This function writes out the view data to a json file (j_outfile)
# and a tsv file (t_outfile), keeps track of failures,
# and returns the number of successes and failures
def process_view_responses(responses, j_outfile, t_outfile, logging=None):
    failures = len(list(itertools.takewhile(lambda r: r is None, responses)))
    successes = 1
    try:
        first_response = next(responses)
    except StopIteration:
        logging.error("No valid responses")
        exit()

    dw = DictWriter(t_outfile, sorted(first_response.keys()), delimiter='\t')
    dw.writeheader()

    json.dump(first_response, j_outfile)
    dw.writerow(first_response)

    for response in responses:
        if response is None:
            failures = failures + 1
            continue
        else:
            successes = successes + 1

        if logging is not None:
            logging.debug(f"printing data: {response}")

        json.dump(response, j_outfile)
        dw.writerow(response)

    return (successes, failures)
def get_loglevel(arg_loglevel):
    loglevel_mapping = { 'debug' : logging.DEBUG,
                         'info' : logging.INFO,
@@ -95,74 +25,3 @@ def get_loglevel(arg_loglevel):
        return logging.INFO
def get_revisions_for_page(title, api_session, logging, rv_props):
    result = api_session.get(action='query',
                             prop='revisions',
                             rvprop=rv_props.values(),
                             titles={title},
                             rvdir='newer',
                             rvslots='*',
                             continuation=True)
    return result

def get_pages_revisions(titles, project, logging, rv_props):
    logging.info(f"pulling revisions for: {project}")
    api_session = api.Session(f"https://{project}.org/w/api.php",
                              user_agent=user_agent
                              )
    return itertools.chain(*map(partial(get_revisions_for_page, api_session=api_session, logging=logging, rv_props=rv_props), titles))

def rev_batch_to_json(rev, export_info, json_output=None):
    rev['exported'] = export_info
    if json_output is None:
        return json.dumps(rev)
    else:
        json.dump(rev, json_output)

def rev_batch_to_tsv(batch, tsv_fields, export_info, project, tsv_writer=None):
    batch = batch.get('query', dict())
    pages = batch.get('pages', dict())
    for pageid, page in pages.items():
        pageid = page.get('pageid', None)
        ns = page.get('ns', None)
        title = page.get('title', '')
        logging.info(f"pulling revisions for: {title}")
        revs = page.get('revisions', [])
        for rev in revs:
            # handle missing data
            if "sha1" not in rev:
                rev["sha1"] = ""

            if "userhidden" in rev:
                rev["user"] = ""
                rev["userid"] = ""

            # recode anon so it's true or false instead of present/missing
            if "anon" in rev:
                rev["anon"] = True
            else:
                rev["anon"] = False

            # let's recode "minor" in the same way
            if "minor" in rev:
                rev["minor"] = True
            else:
                rev["minor"] = False

            # add page title information
            rev['title'] = title
            rev['pageid'] = pageid
            rev['namespace'] = ns
            rev['contentmodel'] = rev['slots']['main']['contentmodel']

            # construct a URL
            rev['url'] = Request('GET', f'https://{project}.org/w/index.php',
                                 params={'title' : rev['title'].replace(" ", "_"),
                                         'oldid' : rev['revid']}).prepare().url

            rev['export_timestamp'] = export_info['export_timestamp']
            rev['export_commit'] = export_info['export_commit']

            tsv_writer.writerow({k: rev[k] for k in tsv_fields})

View File

@@ -1,85 +0,0 @@
#!/usr/bin/env python3

import argparse
import sqlite3
import requests
from datetime import datetime, timedelta
import logging
import digobs
from os import path, mkdir
from functools import partial
from itertools import chain

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Get a list of pages related to COVID19, pandemic, and SARS-COV2 virus related entities.")
    parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
    parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=digobs.get_loglevel)
    parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=argparse.FileType('a'))
    parser.add_argument('-d', '--query_date', help='Date if not yesterday, in YYYYMMDD format.', type=lambda s: datetime.strptime(s, "%Y%m%d"))
    parser.add_argument('-i', '--input_file', help="Input a file of page names from the English Wikiproject.", type=argparse.FileType('r'), default='./wikipedia/resources/enwp_wikiproject_covid19_articles.txt')
    parser.add_argument('-b', '--input_db', help="Input a path to a sqlite3 database from the real-time-covid-tracker project", type=sqlite3.connect, default='real-time-wiki-covid-tracker/AllWikidataItems.sqlite')
    args = parser.parse_args()

    conn = args.input_db
    conn.row_factory = sqlite3.Row

    #handle -d
    if args.query_date:
        query_date = args.query_date.strftime("%Y%m%d")
    else:
        yesterday = datetime.today() - timedelta(days=1)
        query_date = yesterday.strftime("%Y%m%d")

    digobs.init_logging(args)

    logging.info(f"Destructively outputting results to {args.output_folder}")

    #1 Load up the list of article names
    logging.info("loading info from database")
    projects = [row['project'] for row in conn.execute("SELECT DISTINCT project from pagesPerProjectTable;").fetchall()]

    successes = 0
    failures = 0
    for project in projects:
        project_folder = path.join(args.output_folder, project)
        if not path.exists(project_folder):
            mkdir(project_folder)
        dump_folder = path.join(project_folder, query_date)
        if not path.exists(dump_folder):
            mkdir(dump_folder)

        logging.info(f"Getting page views for {project}")

        rows = conn.execute(f"SELECT DISTINCT page from pagesPerProjectTable WHERE project='{project}';").fetchall()
        pages = (row['page'] for row in rows)

        # special case for english, we have a wikiproject input file
        if project == "en.wikipedia":
            pages = chain(pages, map(str.strip, args.input_file))

        call_view_api = partial(digobs.call_view_api, project=project, query_date=query_date)
        responses = map(call_view_api, pages)

        j_outfilename = path.join(dump_folder, f"digobs_covid19_{project}_dailyviews-{query_date}.json")
        t_outfilename = path.join(dump_folder, f"digobs_covid19_{project}_dailyviews-{query_date}.tsv")

        with open(j_outfilename, 'w') as j_outfile, \
             open(t_outfilename, 'w') as t_outfile:
            proj_successes, proj_failures = digobs.process_view_responses(responses, j_outfile, t_outfile, logging)

        logging.info(f"Processed {proj_successes} successes and {proj_failures} failures for {project}")
        successes = proj_successes + successes
        failures = proj_failures + failures

    conn.close()

    # f_Out = outputPath + "dailyviews" + query_date + ".feather"
    # read the json back in and make a feather file?
    logging.debug(f"Run complete at {datetime.now()}")
    logging.info(f"Processed {successes} successful URLs and {failures} failures.")

View File

@@ -0,0 +1,110 @@
#!/usr/bin/env python3

###############################################################################
#
# This script assumes the presence of the COVID-19 repo.
#
# It (1) reads in the article list and then (2) calls the Wikimedia API to
# fetch view information for each article. Output is to (3) JSON and TSV.
#
###############################################################################

import sys
import requests
import argparse
import json
import time
import os.path
import datetime
import logging
from csv import DictWriter
import digobs
#import feather #TBD

def parse_args():
    parser = argparse.ArgumentParser(description='Call the views API to collect Wikipedia view data.')
    parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
    parser.add_argument('-i', '--article_file', help='File listing article names', default="wikipedia/resources/enwp_wikiproject_covid19_articles.txt", type=str)
    parser.add_argument('-d', '--query_date', help='Date if not yesterday, in YYYYMMDD format.', type=str)
    parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=digobs.get_loglevel)
    parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=str)
    args = parser.parse_args()
    return(args)

def main():
    args = parse_args()

    outputPath = args.output_folder
    articleFile = args.article_file

    #handle -d
    if args.query_date:
        query_date = args.query_date
    else:
        yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
        query_date = yesterday.strftime("%Y%m%d")

    #handle -W
    if args.logging_destination:
        logging.basicConfig(filename=args.logging_destination, filemode='a', level=args.logging_level)
    else:
        logging.basicConfig(level=args.logging_level)

    export_time = str(datetime.datetime.now())
    export_date = datetime.datetime.today().strftime("%Y%m%d")

    logging.info(f"Starting run at {export_time}")
    logging.info(f"Last commit: {digobs.git_hash()}")

    #1 Load up the list of article names
    j_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{query_date}.json")
    t_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{query_date}.tsv")

    with open(articleFile, 'r') as infile:
        articleList = list(map(str.strip, infile))

    success = 0 #for logging how many work/fail
    failure = 0

    #3 Save results as a JSON and TSV
    with open(j_outfilename, 'w') as j_outfile, \
         open(t_outfilename, 'w') as t_outfile:

        #2 Repeatedly call the API with that list of names
        for a in articleList:
            url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{a}/daily/{query_date}00/{query_date}00"

            response = requests.get(url)
            if response.ok:
                jd = response.json()["items"][0]
                success = success + 1
            else:
                failure = failure + 1
                logging.warning(f"Failure: {response.status_code} from {url}")
                continue

            # start writing the CSV File if it doesn't exist yet
            try:
                dw
            except NameError:
                dw = DictWriter(t_outfile, sorted(jd.keys()), delimiter='\t')
                dw.writeheader()

            logging.debug(f"printing data: {jd}")

            # write out the line of the json file
            print(json.dumps(jd), file=j_outfile)

            # write out the line of the csv file
            dw.writerow(jd)

    # f_Out = outputPath + "dailyviews" + query_date + ".feather"
    # read the json back in and make a feather file?

    logging.debug(f"Run complete at {datetime.datetime.now()}")
    logging.info(f"Processed {success} successful URLs and {failure} failures.")

if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,148 @@
#!/usr/bin/env python3

###############################################################################
#
# This script assumes the presence of the COVID-19 repo.
#
# It (1) reads in the article list and then (2) calls the Wikimedia API to
# fetch revision information for each article. Output is to (3) JSON and TSV.
#
###############################################################################

import argparse
import logging
import os.path
import json
import datetime

from requests import Request
from csv import DictWriter
from mw import api

import digobs

def parse_args():
    parser = argparse.ArgumentParser(description='Call the views API to collect Wikipedia revision data.')
    parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
    parser.add_argument('-i', '--article_file', help='File listing article names', default="wikipedia/resources/enwp_wikiproject_covid19_articles.txt", type=str)
    parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=digobs.get_loglevel)
    parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=str)
    args = parser.parse_args()
    return(args)

def main():
    args = parse_args()

    output_path = args.output_folder
    article_filename = args.article_file

    #handle -W
    if args.logging_destination:
        logging.basicConfig(filename=args.logging_destination, filemode='a', level=args.logging_level)
    else:
        logging.basicConfig(level=args.logging_level)

    export_time = str(datetime.datetime.now())
    export_date = datetime.datetime.today().strftime("%Y%m%d")

    logging.info(f"Starting run at {export_time}")
    logging.info(f"Last commit: {digobs.git_hash()}")

    json_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.json")
    tsv_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.tsv")

    api_session = api.Session("https://en.wikipedia.org/w/api.php")

    # list of properties from the API we want to gather (basically all of
    # them supported by mediawik-utilities)
    rv_props = {'revid' : 'ids',
                'timestamp' : 'timestamp',
                'user' : 'user',
                'userid' : 'userid',
                'size' : 'size',
                'sha1' : 'sha1',
                'contentmodel' : 'contentmodel',
                'tags' : 'tags',
                'flags' : 'flags',
                'comment' : 'comment',
                'content' : 'content' }

    exclude_from_tsv = ['tags', 'comment', 'content', 'flags']

    # load the list of articles
    with open(article_filename, 'r') as infile:
        article_list = list(map(str.strip, infile))

    def get_revisions_for_page(title):
        return api_session.revisions.query(properties=rv_props.values(),
                                           titles={title},
                                           direction="newer")

    tsv_fields = ['title', 'pageid', 'namespace']
    tsv_fields = tsv_fields + list(rv_props.keys())

    # drop fields that we identified for exclusion
    tsv_fields = [e for e in tsv_fields if e not in exclude_from_tsv]

    # add special export fields
    tsv_fields = tsv_fields + ['anon', 'minor', 'url', 'export_timestamp', 'export_commit']

    export_info = { 'git_commit' : digobs.git_hash(),
                    'timestamp' : export_time }

    with open(json_output_filename, 'w') as json_output, \
         open(tsv_output_filename, 'w') as tsv_output:
        tsv_writer = DictWriter(tsv_output, fieldnames=tsv_fields, delimiter="\t")
        tsv_writer.writeheader()

        for article in article_list:
            logging.info(f"pulling revisions for: {article}")
            for rev in get_revisions_for_page(article):
                logging.debug(f"processing raw revision: {rev}")

                # add export metadata
                rev['exported'] = export_info

                # save the json version of the code
                print(json.dumps(rev), file=json_output)

                # handle missing data
                if "sha1" not in rev:
                    rev["sha1"] = ""

                if "userhidden" in rev:
                    rev["user"] = ""
                    rev["userid"] = ""

                # recode anon so it's true or false instead of present/missing
                if "anon" in rev:
                    rev["anon"] = True
                else:
                    rev["anon"] = False

                # let's recode "minor" in the same way
                if "minor" in rev:
                    rev["minor"] = True
                else:
                    rev["minor"] = False

                # add page title information
                rev['title'] = rev['page']['title']
                rev['pageid'] = rev['page']['pageid']
                rev['namespace'] = rev['page']['ns']

                # construct a URL
                rev['url'] = Request('GET', 'https://en.wikipedia.org/w/index.php',
                                     params={'title' : rev['title'].replace(" ", "_"),
                                             'oldid' : rev['revid']}).prepare().url

                rev['export_timestamp'] = export_time
                rev['export_commit'] = digobs.git_hash(short=True)

                tsv_writer.writerow({k: rev[k] for k in tsv_fields})

if __name__ == "__main__":
    main()
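
If this is the fetch_enwiki_revisions.py script invoked by the cron job above (the file name is not shown in this view, so that is an assumption), it can also be run by hand using the -L and -W flags defined in parse_args(). A hedged usage sketch from the repository root; the log filename is illustrative:

# Verbose manual run that writes its log alongside the cron-produced logs.
./wikipedia/scripts/fetch_enwiki_revisions.py -L debug -W wikipedia/logs/enwp-revisions-manual.log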

View File

@@ -1,120 +0,0 @@
#!/usr/bin/env python3

###############################################################################
#
# This script assumes the presence of the COVID-19 repo.
#
# It (1) reads in the article list and then (2) calls the Wikimedia API to
# fetch revision information for each article. Output is to (3) JSON and TSV.
#
###############################################################################

import argparse
import logging
from os import path, mkdir
import json
import datetime
import sqlite3

from functools import partial
from itertools import chain
from csv import DictWriter

import digobs

def main():
    parser = argparse.ArgumentParser(description='Call the views API to collect Wikipedia revision data.')
    parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
    parser.add_argument('-i', '--input_file', help="Input a file of page names from the English Wikiproject.", type=argparse.FileType('r'), default='./wikipedia/resources/enwp_wikiproject_covid19_articles.txt')
    parser.add_argument('-d', '--input_db', help="Input a path to a sqlite3 database from the real-time-covid-tracker project", type=sqlite3.connect, default='real-time-wiki-covid-tracker/AllWikidataItems.sqlite')
    parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=digobs.get_loglevel)
    parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=argparse.FileType('a'))
    args = parser.parse_args()

    logging = digobs.init_logging(args)

    conn = args.input_db
    conn.row_factory = sqlite3.Row

    projects = (row['project'] for row in conn.execute("SELECT DISTINCT project from pagesPerProjectTable;").fetchall())

    tsv_fields = ['title', 'pageid', 'namespace']

    # list of properties from the API we want to gather (basically all of
    # them supported by mediawik-utilities)
    rv_props = {'revid' : 'ids',
                'timestamp' : 'timestamp',
                'user' : 'user',
                'userid' : 'userid',
                'size' : 'size',
                'sha1' : 'sha1',
                'contentmodel' : 'contentmodel',
                'tags' : 'tags',
                'flags' : 'flags',
                'comment' : 'comment',
                'content' : 'content' }

    def get_project_pages(project):
        return (row['page'] for row in conn.execute(f"SELECT DISTINCT page FROM pagesPerProjectTable WHERE project == '{project}';").fetchall())

    def get_project_revisions(project):
        pages = get_project_pages(project)
        if project == "en.wikipedia":
            pages = chain(pages, map(str.strip, args.input_file))
        return digobs.get_pages_revisions(pages, project=project, logging=logging, rv_props=rv_props)

    tsv_fields = tsv_fields + list(rv_props.keys())

    exclude_from_tsv = ['tags', 'comment', 'content', 'flags']

    # drop fields that we identified for exclusion
    tsv_fields = [e for e in tsv_fields if e not in exclude_from_tsv]

    # add special export fields
    tsv_fields = tsv_fields + ['anon', 'minor', 'url', 'export_timestamp', 'export_commit']

    export_time = str(datetime.datetime.now())

    rev_batch_to_tsv = partial(digobs.rev_batch_to_tsv,
                               tsv_fields=tsv_fields,
                               export_info={'export_timestamp': export_time,
                                            'export_commit': digobs.git_hash(short=True)})

    export_info = { 'git_commit' : digobs.git_hash(),
                    'timestamp' : export_time }

    export_date = datetime.datetime.today().strftime("%Y%m%d")

    rev_batch_to_json = partial(digobs.rev_batch_to_json,
                                export_info=export_info)

    def write_project_pages(project):
        project_folder = path.join(args.output_folder, project)
        if not path.exists(project_folder):
            mkdir(project_folder)

        dump_folder = path.join(project_folder, export_date)
        if not path.exists(dump_folder):
            mkdir(dump_folder)

        project_revs = get_project_revisions(project)

        json_output_filename = path.join(dump_folder, f"digobs_covid19_{project}_revisions-{export_date}.json")
        tsv_output_filename = path.join(dump_folder, f"digobs_covid19_{project}_revisions-{export_date}.tsv")

        with open(json_output_filename, 'w') as json_output, \
             open(tsv_output_filename, 'w') as tsv_output:
            tsv_writer = DictWriter(tsv_output, fieldnames=tsv_fields, delimiter="\t")
            tsv_writer.writeheader()

            for rev_batch in project_revs:
                logging.debug(f"processing raw revision: {rev_batch}")
                rev_batch_to_json(rev_batch, json_output=json_output)
                rev_batch_to_tsv(rev_batch, project=project, tsv_writer=tsv_writer)

    for project in projects:
        write_project_pages(project)

if __name__ == "__main__":
    main()