import csv
from itertools import chain, islice
from json import JSONDecodeError

from wikidata_api_calls import run_sparql_query
class LabelData:
    """One label (or alternate label) row parsed from a Wikidata SPARQL result.

    Wraps a single result binding of the shape
    ``{'entity': {'value': 'Q42'}, 'label': {'value': '...', 'xml:lang': 'en'}}``.
    """

    # __slots__ doubles as the CSV header row written by GetAllLabels.
    __slots__ = ['entityid', 'label', 'langcode', 'is_alt']

    def __init__(self, wd_res, is_alt):
        """Extract entity id, label text, and language code from one binding.

        wd_res: dict for a single SPARQL result binding.
        is_alt: True when the row came from a skos:altLabel query.

        Robustness fix: the original called .get(...) on the result of
        wd_res.get('label', None), which raised AttributeError whenever the
        'label' or 'entity' key was missing; missing keys now yield None.
        """
        label_obj = wd_res.get('label') or {}
        self.label = label_obj.get('value')
        self.langcode = label_obj.get('xml:lang')

        entity_obj = wd_res.get('entity') or {}
        self.entityid = entity_obj.get('value')

        self.is_alt = is_alt

    def to_list(self):
        """Return the attributes as a list in __slots__ order (for csv.writer)."""
        return [self.entityid,
                self.label,
                self.langcode,
                self.is_alt]
def GetAllLabels(in_csvs, outfile, topNs):
    """Read entity ids from wikidata_search.py result CSVs, look up their
    labels on Wikidata, and write one CSV row per label to outfile.

    in_csvs: list of input CSV paths; each needs 'search_position' and
             'entityid' columns.
    outfile: path of the CSV file to create/overwrite.
    topNs:   list of per-input rank cutoffs. None means 5 for every input;
             a single value is reused for all inputs.
    """

    def load_entity_ids(in_csv, topN=5):
        # Yield the entityid of each row ranked above the cutoff, streaming
        # instead of materializing the whole file.
        with open(in_csv, 'r', newline='') as infile:
            for row in csv.DictReader(infile):
                if int(row['search_position']) < topN:
                    yield row["entityid"]

    # Robustness fix: the original map() over (in_csvs, topNs) crashed when
    # topNs was None (argparse's default when --topN is omitted) and silently
    # truncated in_csvs when fewer topN values than inputs were passed.
    if topNs is None:
        topNs = [5] * len(in_csvs)
    elif len(topNs) == 1:
        topNs = topNs * len(in_csvs)

    ids = set(chain.from_iterable(
        load_entity_ids(in_csv, topN)
        for in_csv, topN in zip(in_csvs, topNs)))

    labeldata = GetEntityLabels(ids)

    with open(outfile, 'w', newline='') as of:
        writer = csv.writer(of)
        writer.writerow(LabelData.__slots__)
        writer.writerows(map(LabelData.to_list, labeldata))
def GetEntityLabels(entityids):
    """Fetch rdfs:label and skos:altLabel values for the given Wikidata ids.

    entityids: iterable of Wikidata entity ids (e.g. 'Q42').
    Returns a lazy iterator of LabelData objects. Queries are issued in
    chunks of 100 ids when the iterator is consumed. A JSON decode failure
    is logged and skipped (best-effort, as in the original); a structurally
    invalid JSON response raises ValueError.
    """

    def run_query_and_parse(query, is_alt):
        # Run one SPARQL query and yield a LabelData per result binding.
        results = run_sparql_query(query)
        try:
            jobj = results.json()

            res = jobj.get('results', None)
            if res is not None:
                res = res.get('bindings', None)
            if res is None:
                # BUG FIX: the original raised requests.APIError, which is
                # neither imported here nor a real attribute of requests, and
                # interpolated an undefined `entityid` variable.
                raise ValueError(
                    f"got invalid response from wikidata for {query}")

            for info in res:
                yield LabelData(info, is_alt)

        except JSONDecodeError as e:
            # Best-effort: log the failure and yield nothing for this chunk.
            print(e)
            print(query)

    def prep_query(query, prop, entityids):
        # Substitute the label property and the VALUES id list into the
        # query template.
        values = ' '.join('wd:{0}'.format(id) for id in entityids)
        return query.format(prop, values)

    # NOTE: the trailing ';' before VALUES is legal SPARQL (empty
    # predicate-object list continuation).
    base_query = """
SELECT DISTINCT ?entity ?label WHERE {{
?entity {0} ?label;
VALUES ?entity {{ {1} }}
}}"""

    # We can't query all entities at once; batch 100 ids per request.
    chunksize = 100
    entityids = iter(entityids)
    chunk = list(islice(entityids, chunksize))
    calls = []
    while chunk:
        label_query = prep_query(base_query, "rdfs:label", chunk)
        altLabel_query = prep_query(base_query, "skos:altLabel", chunk)
        calls.append(run_query_and_parse(label_query, is_alt=False))
        calls.append(run_query_and_parse(altLabel_query, is_alt=True))
        chunk = list(islice(entityids, chunksize))

    return chain.from_iterable(calls)
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser("Use wikidata to find transliterations of terms")
    parser.add_argument('inputs', type=str, nargs='+',
                        help='one or more files to read. the inputs are generated by wikidata_search.py')
    parser.add_argument('--topN', type=int, nargs='+',
                        help='limit number of wikidata search results to use, can pass one arg for each source.')
    # BUG FIX: the original had type=str with default=20 — an integer default
    # makes open() treat it as a file descriptor and crash, and the promised
    # stdout fallback was never implemented. Require an explicit output path.
    parser.add_argument('--output', type=str, required=True,
                        help='the output csv file.')

    args = parser.parse_args()

    GetAllLabels(args.inputs, args.output, topNs=args.topN)