Finish MVP for transliterations

code is reasonably well-written
checked that we get seemingly good data back
adding README
adding data
Nathan TeBlunthuis 2020-03-24 22:06:08 -07:00
parent 308d462e76
commit 36167295ec
10 changed files with 5828 additions and 23 deletions


@@ -1,2 +0,0 @@
-coronavirus
-covid-19


@@ -0,0 +1,3 @@
+# Transliterations
+
+This part of the project collects transliterations of key phrases related to COVID-19 using Wikidata. We search the Wikidata API for entities in `src/wikidata_search.py` and then make simple SPARQL queries in `src/wikidata_transliterations.py` to collect the labels and aliases of those entities. The labels come with language metadata. This seems to provide a decent initial list of relevant terms across multiple languages.
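For orientation, an end-to-end run of the two stages looks roughly like the sketch below. It assumes the module names mentioned above (`src/wikidata_search.py` and `src/wikidata_transliterations.py`) and the default paths used by the entry points in this commit; treat it as a sketch rather than a documented interface.

    # Stage 1: search Wikidata for each seed term and write candidate entities to CSV.
    from wikidata_search import run_wikidata_searches
    run_wikidata_searches(terms_file='../data/input/base_terms.txt',
                          outfile='../data/output/wikidata_search_results.csv')

    # Stage 2: for the top-ranked entities, fetch labels and aliases with language codes.
    from wikidata_transliterations import GetAllLabels
    GetAllLabels('../data/output/wikidata_search_results.csv',
                 '../data/output/wikidata_entity_labels.csv',
                 topN=20)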


@@ -0,0 +1,9 @@
+coronavirus
+covid-19
+covid19
+sars-cov-2
+covid-19 pandemic
+sars-cov-2 pandemic
+face mask
+social distancing
+hand sanitizer

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,8 +1,8 @@
# File defines functions for making api calls to find translations and transliterations for key terms.
import mwapi
import requests
import sys
+import time
from defaults import user_agent

def get_wikidata_api():
@@ -29,5 +29,7 @@ def search_wikidata(session, term, *args, **kwargs):
    return results

def run_sparql_query(q):
-    results = requests.get("https://query.wikidata.org/bigdata/namespace/wdq/sparql?query={q}&format=json")
+    results = requests.get("https://query.wikidata.org/bigdata/namespace/wdq/sparql",params={"query":q,"format":"json"})
+    time.sleep(2)
+    return results


@@ -2,8 +2,9 @@
from os import path
from sys import stdout
from wikidata_api_calls import search_wikidata, get_wikidata_api
+import csv

-class Wikidata_ResultSet(object):
+class Wikidata_ResultSet:
    def __init__(self):
        self.results = []
@@ -12,22 +13,18 @@ class Wikidata_ResultSet(object):
                              for i, result in enumerate(results)])

    def to_csv(self, outfile=None):
-        header = ','.join(['search_term', 'entityid', 'pageid', 'search_position','timestamp']) + '\n'
        if outfile is None:
            of = stdout
        else:
-            of = open(outfile,'w')
-        of.write(header)
-        for result in self.results:
-            of.write(result.to_csv())
-        of.close()
+            of = open(outfile,'w',newline='')
+        writer = csv.writer(of)
+        writer.writerow(Wikidata_Result.__slots__)
+        writer.writerows(map(Wikidata_Result.to_list, self.results))

-class Wikidata_Result(object):
+class Wikidata_Result:
    # store unique entities found in the search results, the position in the search result, and the date
    __slots__=['search_term','entityid','pageid','search_position','timestamp']
@@ -38,16 +35,16 @@ class Wikidata_Result(object):
        self.search_term = term.strip()
        self.entityid = search_result['title']
-        self.pageid = search_result['pageid']
+        self.pageid = int(search_result['pageid'])
-        self.search_position = position
+        self.search_position = int(position)
        self.timestamp = search_result['timestamp']

-    def to_csv(self):
-        return ','.join([self.search_term,
+    def to_list(self):
+        return [self.search_term,
                self.entityid,
-                str(self.pageid),
-                str(self.search_position),
-                str(self.timestamp)]) + '\n'
+                self.pageid,
+                self.search_position,
+                self.timestamp]

def run_wikidata_searches(terms_file = '../data/input/base_terms.txt', outfile="../data/output/wikidata_search_results.csv"):


@@ -0,0 +1,79 @@
+from wikidata_api_calls import run_sparql_query
+from itertools import chain, islice
+import csv
+from json import JSONDecodeError
+import requests
+
+class LabelData:
+    __slots__ = ['entityid','label','langcode','is_alt']
+
+    def __init__(self, wd_res, entityid, is_alt):
+        obj = wd_res.get('label',None)
+        self.label = obj.get('value',None)
+        self.langcode = obj.get('xml:lang',None)
+        self.entityid = entityid
+        self.is_alt = is_alt
+
+    def to_list(self):
+        return [self.entityid,
+                self.label,
+                self.langcode,
+                self.is_alt]
+
+def GetAllLabels(in_csv, outfile, topN):
+
+    def load_entity_ids(in_csv, topN=5):
+        with open(in_csv,'r',newline='') as infile:
+            reader = csv.DictReader(infile)
+            for row in reader:
+                if int(row['search_position']) < topN:
+                    yield row["entityid"]
+
+    ids = set(load_entity_ids(in_csv, topN))
+
+    labeldata = chain(* map(GetEntityLabels, ids))
+
+    with open(outfile, 'w', newline='') as of:
+        writer = csv.writer(of)
+        writer.writerow(LabelData.__slots__)
+        writer.writerows(map(LabelData.to_list, labeldata))
+
+def GetEntityLabels(entityid):
+
+    def run_query_and_parse(query, entityid, is_alt):
+        results = run_sparql_query(query % entityid)
+        try:
+            jobj = results.json()
+
+            res = jobj.get('results',None)
+            if res is not None:
+                res = res.get('bindings',None)
+            if res is None:
+                raise requests.exceptions.RequestException(f"got invalid response from wikidata for {query % entityid}")
+
+            for info in res:
+                yield LabelData(info, entityid, is_alt)
+
+        except JSONDecodeError as e:
+            print(e)
+            print(query % entityid)
+
+    label_base_query = """
+    SELECT DISTINCT ?label WHERE {
+        wd:%s rdfs:label ?label;
+    }"""
+
+    altLabel_base_query = """
+    SELECT DISTINCT ?label WHERE {
+        wd:%s skos:altLabel ?label;
+    }"""
+
+    label_results = run_query_and_parse(label_base_query, entityid, is_alt=False)
+    altLabel_results = run_query_and_parse(altLabel_base_query, entityid, is_alt=True)
+
+    return chain(label_results, altLabel_results)
+
+if __name__ == "__main__":
+    GetAllLabels("../data/output/wikidata_search_results.csv", "../data/output/wikidata_entity_labels.csv", topN=20)