Finish MVP for transliterations

code is reasonably well-written
checked that we get seemingly good data back
adding README
adding data
Nathan TeBlunthuis 2020-03-24 22:06:08 -07:00
parent 308d462e76
commit 36167295ec
10 changed files with 5828 additions and 23 deletions

View File

@@ -1,2 +0,0 @@
coronavirus
covid-19

View File

@@ -0,0 +1,3 @@
# Transliterations
This part of the project collects transliterations of key phrases related to COVID-19 using Wikidata. We search the Wikidata API for matching entities in `src/wikidata_search.py`, then run simple SPARQL queries in `src/wikidata_transliterations.py` to collect the labels and aliases of those entities. The labels come with language metadata, so this provides a decent initial list of relevant terms across multiple languages.
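
As an illustration of that flow, here is a minimal, self-contained sketch using plain `requests`. The endpoints are the standard Wikidata ones (the project itself uses `mwapi` and its own `run_sparql_query` helper), and the search term, User-Agent value, and printed fields are assumptions for the example, not code from this repository:

```python
import requests

# Wikimedia asks for a descriptive User-Agent; this value is a placeholder.
HEADERS = {"User-Agent": "transliterations-example/0.1"}

# Step 1: search the Wikidata API for entities matching a key phrase.
search = requests.get(
    "https://www.wikidata.org/w/api.php",
    params={"action": "query", "list": "search",
            "srsearch": "covid-19", "format": "json"},
    headers=HEADERS,
).json()
entityid = search["query"]["search"][0]["title"]  # an id such as "Q84263196"

# Step 2: ask the SPARQL endpoint for the entity's labels; each label is a
# language-tagged literal, which is where the language metadata comes from.
query = "SELECT DISTINCT ?label WHERE { wd:%s rdfs:label ?label }" % entityid
labels = requests.get(
    "https://query.wikidata.org/sparql",
    params={"query": query, "format": "json"},
    headers=HEADERS,
).json()
for binding in labels["results"]["bindings"]:
    print(binding["label"]["xml:lang"], binding["label"]["value"])
```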

View File

@@ -0,0 +1,9 @@
coronavirus
covid-19
covid19
sars-cov-2
covid-19 pandemic
sars-cov-2 pandemic
face mask
social distancing
hand sanitizer

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1,8 +1,8 @@
# This file defines functions for making API calls to find translations and transliterations for key terms.
import mwapi
import requests
import sys
import time
from defaults import user_agent
def get_wikidata_api():
@@ -29,5 +29,7 @@ def search_wikidata(session, term, *args, **kwargs):
return results
def run_sparql_query(q):
results = requests.get("https://query.wikidata.org/bigdata/namespace/wdq/sparql?query={q}&format=json")
results = requests.get("https://query.wikidata.org/bigdata/namespace/wdq/sparql",params={"query":q,"format":"json"})
time.sleep(2)
return results
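
The replaced line had two bugs: without an `f` prefix, the literal text `{q}` was sent as the query, and even a correctly interpolated SPARQL string contains `?`, `{`, and spaces that need percent-encoding. Passing the query through `params` lets `requests` encode it. A sketch of a caller, with an invented query that is not from this repository:

```python
# Hypothetical caller; the entity id and LIMIT are example values.
q = "SELECT ?label WHERE { wd:Q84263196 rdfs:label ?label } LIMIT 5"
resp = run_sparql_query(q)
if resp.ok:  # requests percent-encodes the braces, '?', and spaces in q
    for binding in resp.json()["results"]["bindings"]:
        print(binding["label"]["value"])
```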

View File

@@ -2,8 +2,9 @@
from os import path
from sys import stdout
from wikidata_api_calls import search_wikidata, get_wikidata_api
import csv
class Wikidata_ResultSet(object):
class Wikidata_ResultSet:
def __init__(self):
self.results = []
@@ -12,22 +13,18 @@ class Wikidata_ResultSet(object):
for i, result in enumerate(results)])
def to_csv(self, outfile=None):
header = ','.join(['search_term', 'entityid', 'pageid', 'search_position','timestamp']) + '\n'
if outfile is None:
of = stdout
else:
of = open(outfile,'w')
of = open(outfile,'w',newline='')
of.write(header)
for result in self.results:
of.write(result.to_csv())
of.close()
writer = csv.writer(of)
writer.writerow(Wikidata_Result.__slots__)
writer.writerows(map(Wikidata_Result.to_list, self.results))
class Wikidata_Result(object):
class Wikidata_Result:
# store unique entities found in the search results, the position in the search result, and the date
__slots__=['search_term','entityid','pageid','search_position','timestamp']
@@ -38,16 +35,16 @@ class Wikidata_Result(object):
self.search_term = term.strip()
self.entityid = search_result['title']
self.pageid = search_result['pageid']
self.search_position = position
self.pageid = int(search_result['pageid'])
self.search_position = int(position)
self.timestamp = search_result['timestamp']
def to_csv(self):
return ','.join([self.search_term,
self.entityid,
str(self.pageid),
str(self.search_position),
str(self.timestamp)]) + '\n'
def to_list(self):
return [self.search_term,
self.entityid,
self.pageid,
self.search_position,
self.timestamp]
def run_wikidata_searches(terms_file = '../data/input/base_terms.txt', outfile="../data/output/wikidata_search_results.csv"):
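
The move from `','.join` to `csv.writer` is more than style: the writer quotes any field that contains the delimiter, while a naive join silently produces an extra column. A self-contained sketch with an invented comma-bearing search term:

```python
import csv
import io

# An invented search term containing a comma, the case csv.writer handles.
row = ["masks, medical", "Q42", 123, 0, "2020-03-24T22:06:08Z"]

print(",".join(map(str, row)))  # naive join: the comma yields six fields

buf = io.StringIO()
csv.writer(buf).writerow(row)
print(buf.getvalue(), end="")   # csv.writer: "masks, medical" is quoted
```

Opening the output file with `newline=''`, as the new code does, is the `csv` module's documented way to avoid spurious blank lines on Windows.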

View File

@@ -0,0 +1,79 @@
from wikidata_api_calls import run_sparql_query
from itertools import chain
import csv
from json import JSONDecodeError
class LabelData:
__slots__ = ['entityid','label','langcode','is_alt']
def __init__(self, wd_res, entityid, is_alt):
obj = wd_res.get('label', None)
# guard against bindings that lack a label object entirely
self.label = obj.get('value', None) if obj is not None else None
self.langcode = obj.get('xml:lang', None) if obj is not None else None
self.entityid = entityid
self.is_alt = is_alt
def to_list(self):
return [self.entityid,
self.label,
self.langcode,
self.is_alt]
def GetAllLabels(in_csv, outfile, topN):
def load_entity_ids(in_csv, topN=5):
with open(in_csv,'r',newline='') as infile:
reader = csv.DictReader(infile)
for row in reader:
if int(row['search_position']) < topN:
yield row["entityid"]
ids = set(load_entity_ids(in_csv, topN))
labeldata = chain(* map(GetEntityLabels, ids))
with open(outfile, 'w', newline='') as of:
writer = csv.writer(of)
writer.writerow(LabelData.__slots__)
writer.writerows(map(LabelData.to_list,labeldata))
def GetEntityLabels(entityid):
def run_query_and_parse(query, entityid, is_alt):
results = run_sparql_query(query % entityid)
try:
jobj = results.json()
res = jobj.get('results',None)
if res is not None:
res = res.get('bindings',None)
if res is None:
raise ValueError(f"got invalid response from wikidata for {query % entityid}")
for info in res:
yield LabelData(info, entityid, is_alt)
except JSONDecodeError as e:
print(e)
print(query % entityid)
label_base_query = """
SELECT DISTINCT ?label WHERE {
wd:%s rdfs:label ?label;
}"""
altLabel_base_query = """
SELECT DISTINCT ?label WHERE {
wd:%s skos:altLabel ?label;
}"""
label_results = run_query_and_parse(label_base_query, entityid, is_alt=False)
altLabel_results = run_query_and_parse(altLabel_base_query, entityid, is_alt=True)
return chain(label_results, altLabel_results)
if __name__ == "__main__":
GetAllLabels("../data/output/wikidata_search_results.csv","../data/output/wikidata_entity_labels.csv", topN=20)
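
For reference, one SPARQL binding as `LabelData` expects it looks like the hand-written example below; the entity id and label are illustrative values, not captured output:

```python
# A hand-written binding shaped like the Wikidata SPARQL JSON results
# that LabelData parses; the values are illustrative.
binding = {"label": {"type": "literal",
                     "xml:lang": "en",
                     "value": "COVID-19 pandemic"}}

ld = LabelData(binding, entityid="Q81068910", is_alt=False)
print(ld.to_list())  # ['Q81068910', 'COVID-19 pandemic', 'en', False]
```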