Finish MVP for transliterations

code is reasonably well-written
checked that we get seemingly good data back
adding README
adding data
Nathan TeBlunthuis 2020-03-24 22:06:08 -07:00
parent 308d462e76
commit 36167295ec
10 changed files with 5828 additions and 23 deletions


@@ -1,2 +0,0 @@
-coronavirus
-covid-19


@@ -0,0 +1,3 @@
+# Transliterations
+
+This part of the project collects transliterations of key phrases related to COVID-19 using Wikidata. We search the Wikidata API for entities in `src/wikidata_search.py` and then make simple SPARQL queries in `src/wikidata_transliterations.py` to collect the labels and aliases of those entities. The labels come with language metadata. This seems to provide a decent initial list of relevant terms across multiple languages.
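For orientation, an end-to-end run of the two stages looks roughly like the sketch below. It assumes the module names mentioned above (`src/wikidata_search.py` and `src/wikidata_transliterations.py`) and the default paths used by the entry points in this commit; treat it as a sketch rather than a documented interface.

    # Stage 1: search Wikidata for each seed term and write candidate entities to CSV.
    from wikidata_search import run_wikidata_searches
    run_wikidata_searches(terms_file='../data/input/base_terms.txt',
                          outfile='../data/output/wikidata_search_results.csv')

    # Stage 2: for the top-ranked entities, fetch labels and aliases with language codes.
    from wikidata_transliterations import GetAllLabels
    GetAllLabels('../data/output/wikidata_search_results.csv',
                 '../data/output/wikidata_entity_labels.csv',
                 topN=20)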


@@ -0,0 +1,9 @@
+coronavirus
+covid-19
+covid19
+sars-cov-2
+covid-19 pandemic
+sars-cov-2 pandemic
+face mask
+social distancing
+hand sanitizer

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,8 +1,8 @@
# File defines functions for making api calls to find translations and transliterations for key terms.
import mwapi
import requests
import sys
+import time
from defaults import user_agent

def get_wikidata_api():
@@ -29,5 +29,7 @@ def search_wikidata(session, term, *args, **kwargs):
    return results

def run_sparql_query(q):
-    results = requests.get("https://query.wikidata.org/bigdata/namespace/wdq/sparql?query={q}&format=json")
+    results = requests.get("https://query.wikidata.org/bigdata/namespace/wdq/sparql",params={"query":q,"format":"json"})
+    time.sleep(2)
+    return results


@@ -2,8 +2,9 @@
from os import path
from sys import stdout
from wikidata_api_calls import search_wikidata, get_wikidata_api
+import csv

-class Wikidata_ResultSet(object):
+class Wikidata_ResultSet:
    def __init__(self):
        self.results = []
@@ -12,22 +13,18 @@ class Wikidata_ResultSet(object):
                              for i, result in enumerate(results)])

    def to_csv(self, outfile=None):
-        header = ','.join(['search_term', 'entityid', 'pageid', 'search_position','timestamp']) + '\n'
        if outfile is None:
            of = stdout
        else:
-            of = open(outfile,'w')
-        of.write(header)
-        for result in self.results:
-            of.write(result.to_csv())
-        of.close()
+            of = open(outfile,'w',newline='')
+        writer = csv.writer(of)
+        writer.writerow(Wikidata_Result.__slots__)
+        writer.writerows(map(Wikidata_Result.to_list, self.results))

-class Wikidata_Result(object):
+class Wikidata_Result:
    # store unique entities found in the search results, the position in the search result, and the date
    __slots__=['search_term','entityid','pageid','search_position','timestamp']
@@ -38,16 +35,16 @@ class Wikidata_Result(object):
        self.search_term = term.strip()
        self.entityid = search_result['title']
-        self.pageid = search_result['pageid']
+        self.pageid = int(search_result['pageid'])
-        self.search_position = position
+        self.search_position = int(position)
        self.timestamp = search_result['timestamp']

-    def to_csv(self):
-        return ','.join([self.search_term,
+    def to_list(self):
+        return [self.search_term,
                self.entityid,
-                str(self.pageid),
-                str(self.search_position),
-                str(self.timestamp)]) + '\n'
+                self.pageid,
+                self.search_position,
+                self.timestamp]

def run_wikidata_searches(terms_file = '../data/input/base_terms.txt', outfile="../data/output/wikidata_search_results.csv"):


@@ -0,0 +1,79 @@
+from wikidata_api_calls import run_sparql_query
+from itertools import chain, islice
+import csv
+from json import JSONDecodeError
+import requests
+
+class LabelData:
+    __slots__ = ['entityid','label','langcode','is_alt']
+
+    def __init__(self, wd_res, entityid, is_alt):
+        obj = wd_res.get('label',None)
+        self.label = obj.get('value',None)
+        self.langcode = obj.get('xml:lang',None)
+        self.entityid = entityid
+        self.is_alt = is_alt
+
+    def to_list(self):
+        return [self.entityid,
+                self.label,
+                self.langcode,
+                self.is_alt]
+
+def GetAllLabels(in_csv, outfile, topN):
+
+    def load_entity_ids(in_csv, topN=5):
+        with open(in_csv,'r',newline='') as infile:
+            reader = csv.DictReader(infile)
+            for row in reader:
+                if int(row['search_position']) < topN:
+                    yield row["entityid"]
+
+    ids = set(load_entity_ids(in_csv, topN))
+
+    labeldata = chain(* map(GetEntityLabels, ids))
+
+    with open(outfile, 'w', newline='') as of:
+        writer = csv.writer(of)
+        writer.writerow(LabelData.__slots__)
+        writer.writerows(map(LabelData.to_list, labeldata))
+
+def GetEntityLabels(entityid):
+
+    def run_query_and_parse(query, entityid, is_alt):
+        results = run_sparql_query(query % entityid)
+        try:
+            jobj = results.json()
+
+            res = jobj.get('results',None)
+            if res is not None:
+                res = res.get('bindings',None)
+            if res is None:
+                raise requests.exceptions.RequestException(f"got invalid response from wikidata for {query % entityid}")
+
+            for info in res:
+                yield LabelData(info, entityid, is_alt)
+
+        except JSONDecodeError as e:
+            print(e)
+            print(query % entityid)
+
+    label_base_query = """
+    SELECT DISTINCT ?label WHERE {
+        wd:%s rdfs:label ?label;
+    }"""
+
+    altLabel_base_query = """
+    SELECT DISTINCT ?label WHERE {
+        wd:%s skos:altLabel ?label;
+    }"""
+
+    label_results = run_query_and_parse(label_base_query, entityid, is_alt=False)
+    altLabel_results = run_query_and_parse(altLabel_base_query, entityid, is_alt=True)
+
+    return chain(label_results, altLabel_results)
+
+if __name__ == "__main__":
+    GetAllLabels("../data/output/wikidata_search_results.csv", "../data/output/wikidata_entity_labels.csv", topN=20)