This style is nicer because it doesn't hold onto resources for as long; it will use a bit more memory.
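A minimal sketch of that tradeoff, assuming it refers to reading input files eagerly rather than lazily (the two function names below are illustrative, not part of the script):

import csv

def read_rows_lazy(infile):
    # the file handle stays open for as long as the caller keeps iterating
    return csv.DictReader(open(infile, 'r', newline=''))

def read_rows_eager(infile):
    # the handle is closed as soon as the with block exits,
    # at the cost of holding every row in memory
    with open(infile, 'r', newline='') as f:
        return list(csv.DictReader(f))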
# generate a list of wikidata entities related to keywords

import csv
from itertools import chain
from os import path
from sys import stdout

from wikidata_api_calls import search_wikidata, get_wikidata_api

class Wikidata_ResultSet:
    # accumulate search results for many terms and write them out as one CSV

    def __init__(self):
        self.results = []

    def extend(self, term, results):
        # wrap each raw search result, keeping its rank within the result list;
        # a list (rather than a generator) costs a bit more memory but can be
        # iterated more than once
        self.results.append(
            [Wikidata_Result(term, result, i)
             for i, result in enumerate(results)]
        )

    def to_csv(self, outfile=None, mode='w'):
        # write all collected results as CSV to stdout or to a file,
        # appending to an existing file unless overwriting was requested
        if outfile is None:
            of = stdout
            append = False
        else:
            append = path.exists(outfile) and mode != 'w'
            of = open(outfile, 'a' if append else 'w', newline='')

        writer = csv.writer(of)
        if not append:
            writer.writerow(Wikidata_Result.__slots__)
        writer.writerows(map(Wikidata_Result.to_list, chain(*self.results)))

        if of is not stdout:
            of.close()


class Wikidata_Result:
    # store the entity found in a search result, its position in the search
    # results, and the timestamp of the search
    __slots__ = ['search_term', 'entityid', 'pageid', 'search_position', 'timestamp']

    def __init__(self, term, search_result, position):
        self.search_term = term.strip()
        self.entityid = search_result['title']
        self.pageid = int(search_result['pageid'])
        self.search_position = int(position)
        self.timestamp = search_result['timestamp']

    def to_list(self):
        return [self.search_term,
                self.entityid,
                self.pageid,
                self.search_position,
                self.timestamp]


def run_wikidata_searches(terms):
    # search wikidata for each term and collect the results
    api = get_wikidata_api()
    resultset = Wikidata_ResultSet()
    for term in terms:
        search_results = search_wikidata(api, term)
        resultset.extend(term, search_results)
    return resultset


def read_google_trends_files(terms_files):
    # yield the search terms from the 'query' column of each google trends CSV,
    # closing each file as soon as it has been read
    for terms_file in terms_files:
        with open(terms_file, 'r', newline='') as infile:
            for row in csv.DictReader(infile):
                yield row['query']

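# The google trends inputs only need a 'query' column; an illustrative example
# (the second column and the values are made up, not a real export):
#
#   query,value
#   coronavirus,100
#   hand sanitizer,87
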
def trawl_google_trends(terms_files, outfile=None, mode='w'):
    terms = list(read_google_trends_files(terms_files))
    resultset = run_wikidata_searches(terms)
    resultset.to_csv(outfile, mode)


def trawl_base_terms(infiles, outfile=None, mode='w'):
    # each line of each input file is a search term
    terms = []
    for infile in infiles:
        with open(infile, 'r') as f:
            terms.extend(line.strip() for line in f)
    resultset = run_wikidata_searches(terms)
    resultset.to_csv(outfile, mode)


## search each of the base terms in wikidata

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Search wikidata for entities related to a set of terms.")
    parser.add_argument('inputs', type=str, nargs='+', help='one or more files to read')
    parser.add_argument('--use-gtrends', action='store_true', help='toggle whether the input is the output from google trends')
    parser.add_argument('--output', type=str, help='an output file. defaults to stdout')
    parser.add_argument('--overwrite', action='store_true', help='overwrite existing output files instead of appending')
    args = parser.parse_args()

    # honor --overwrite: overwrite the output file when requested, otherwise append
    mode = 'w' if args.overwrite else 'a'

    if args.use_gtrends:
        trawl_google_trends(args.inputs, args.output, mode)
    else:
        trawl_base_terms(args.inputs, args.output, mode)
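
# Example invocations (a sketch; 'wikidata_search.py' and the input file names
# are placeholders, not taken from the repository):
#
#   python wikidata_search.py base_terms.txt --output wikidata_entities.csv --overwrite
#   python wikidata_search.py trends_queries.csv --use-gtrends --output wikidata_entities.csv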