Python code to find Wikidata entities to translate. Here we search the API for entities that match COVID-19 keywords.

Building system for finding translations from Wikidata.
This commit is contained in:
Nathan TeBlunthuis 2020-03-24 15:03:47 -07:00
parent c0e067ecc1
commit 836098461e
5 changed files with 102 additions and 0 deletions
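
The files below wrap this search in a small pipeline. For orientation, here is a minimal sketch of the underlying API call, assuming the mwapi client the commit uses; the user-agent string and keyword are placeholders, and the printed fields are the ones the pipeline keeps for each hit.

# Illustrative only: one Wikidata full-text search for a single keyword,
# printing the three fields the commit's pipeline stores per hit.
import mwapi

session = mwapi.Session(host="https://www.wikidata.org",
                        user_agent="COVID-19 Digital Observatory (placeholder)")

response = session.get(action='query', list='search',
                       srsearch="coronavirus", srnamespace=0, srlimit=10)

for position, hit in enumerate(response['query']['search']):
    # 'title' is the entity id (a Q-number); pageid and timestamp come with each hit
    print(position, hit['title'], hit['pageid'], hit['timestamp'])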

View File

@@ -0,0 +1,2 @@
coronavirus
covid-19

View File

@@ -0,0 +1,2 @@
from wikidata_api_calls import *
from find_entities import *

View File

@@ -0,0 +1 @@
user_agent = "COVID-19 Digital Observatory, a Community Data Science Collective project. (https://github.com/CommunityDataScienceCollective/COVID-19_Digital_Observatory)"

View File

@@ -0,0 +1,68 @@
# generate a list of wikidata entities related to keywords
from os import path
from sys import stdout

from wikidata_api_calls import get_wikidata_api, search_wikidata


class Wikidata_ResultSet(object):

    def __init__(self):
        self.results = []

    def extend(self, term, results):
        self.results.extend([Wikidata_Result(term, result, i)
                             for i, result in enumerate(results)])

    def to_csv(self, outfile=None):
        # write the header and one row per result; default to stdout
        header = ','.join(['search_term', 'entityid', 'pageid',
                           'search_position', 'timestamp']) + '\n'

        if outfile is None:
            of = stdout
        else:
            of = open(outfile, 'w')

        of.write(header)
        for result in self.results:
            of.write(result.to_csv())

        if outfile is not None:
            of.close()


class Wikidata_Result(object):
    # store unique entities found in the search results, the position
    # in the search result, and the date

    __slots__ = ['search_term', 'entityid', 'pageid',
                 'search_position', 'timestamp']

    def __init__(self, term, search_result, position):
        self.search_term = term.strip()
        self.entityid = search_result['title']
        self.pageid = search_result['pageid']
        self.search_position = position
        self.timestamp = search_result['timestamp']

    def to_csv(self):
        return ','.join([self.search_term,
                         self.entityid,
                         str(self.pageid),
                         str(self.search_position),
                         str(self.timestamp)]) + '\n'


def run_wikidata_searches(terms_file='../data/input/base_terms.txt',
                          outfile="../data/output/wikidata_search_results.csv"):
    ## search each of the base terms in wikidata
    resultset = Wikidata_ResultSet()
    for term in open(terms_file, 'r'):
        api = get_wikidata_api()
        search_results = search_wikidata(api, term)
        resultset.extend(term, search_results)

    resultset.to_csv(outfile)


if __name__ == "__main__":
    run_wikidata_searches()
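
For orientation, a minimal sketch of how the two classes above fit together, using a hypothetical search hit; the field names come from the code above, but the Q-number, page id, and timestamp values are made up for illustration.

# Hypothetical search hit carrying the three fields Wikidata_Result reads.
fake_hit = {'title': 'Q12345', 'pageid': 67890,
            'timestamp': '2020-03-24T00:00:00Z'}

resultset = Wikidata_ResultSet()
resultset.extend('coronavirus', [fake_hit])

# With no outfile argument, to_csv() writes the header and one row to stdout:
#   search_term,entityid,pageid,search_position,timestamp
#   coronavirus,Q12345,67890,0,2020-03-24T00:00:00Z
resultset.to_csv()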

View File

@@ -0,0 +1,29 @@
# File defines functions for making api calls to find translations and
# transliterations for key terms.
import mwapi
import sys

sys.path.append("..")
from defaults import user_agent


def get_wikidata_api():
    # mwapi appends the "/w/api.php" path itself, so pass only the host
    session = mwapi.Session(host="https://www.wikidata.org", user_agent=user_agent)
    return session


def search_wikidata(session, term, *args, **kwargs):
    search_results = session.get(action='query',
                                 list='search',
                                 srsearch=term,
                                 # srqiprofile='popular_inclinks_pv',
                                 srlimit='max',
                                 srnamespace=0,
                                 *args,
                                 **kwargs)

    results = search_results.get('query', {}).get('search')

    if results is None:
        # mwapi's APIError expects (code, info, content)
        raise mwapi.session.APIError('no-results',
                                     f"No results for query: {term}",
                                     None)

    return results
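
A short usage sketch for these two helpers, assuming it is run from the directory containing this module (with defaults.py one level up, as the sys.path tweak above expects); the keyword is just an example.

from wikidata_api_calls import get_wikidata_api, search_wikidata

api = get_wikidata_api()
hits = search_wikidata(api, "covid-19")

# Print the first few hits; each carries the fields find_entities consumes.
for hit in hits[:3]:
    print(hit['title'], hit['pageid'], hit['timestamp'])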