Python code to find Wikidata entities to translate. Here we search the Wikidata API for entities that match COVID-related keywords.
Building system for finding translations from Wikidata.
This commit is contained in:
parent
c0e067ecc1
commit
836098461e
2
translations/data/input/base_terms.txt
Normal file
2
translations/data/input/base_terms.txt
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
coronavirus
|
||||||
|
covid-19
|
2
translations/src/__init__.py
Normal file
2
translations/src/__init__.py
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
from wikidata_api_calls import *
|
||||||
|
from find_entities import *
|
1
translations/src/defaults.py
Normal file
1
translations/src/defaults.py
Normal file
@ -0,0 +1 @@
|
|||||||
|
# Identifies this project to the wiki API; wikidata_api_calls.py imports it and
# passes it as the `user_agent` argument when creating the mwapi session.
user_agent = "COVID-19 Digital Observatory, a Community Data Science Collective project. (https://github.com/CommunityDataScienceCollective/COVID-19_Digital_Observatory)"
|
68
translations/src/find_entities.py
Normal file
68
translations/src/find_entities.py
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
# generate a list of wikidata entities related to keywords
|
||||||
|
from os import path
|
||||||
|
from sys import stdout
|
||||||
|
from wikidata_api_calls import search_wikidata
|
||||||
|
|
||||||
|
class Wikidata_ResultSet(object):
    """Accumulate search hits across several terms and write them out as CSV."""

    # Column order of the CSV header; it mirrors the field order that each
    # stored result emits from its own to_csv() method.
    CSV_COLUMNS = ('search_term', 'entityid', 'pageid', 'search_position', 'timestamp')

    def __init__(self):
        # Flat list of per-hit result objects, in the order they were added.
        self.results = []

    def extend(self, term, results):
        """Append one Wikidata_Result per hit in `results`.

        term    -- the search term that produced these hits.
        results -- iterable of raw search-result records; the index of each
                   record within the iterable is stored as its search position.
        """
        self.results.extend(Wikidata_Result(term, result, i)
                            for i, result in enumerate(results))

    def to_csv(self, outfile=None):
        """Write the header and every stored result as CSV.

        outfile -- path of the file to (over)write; when None the rows are
                   written to standard output instead.
        """
        if outfile is None:
            # Do not close stdout: the caller may still want to print to it.
            self._write_rows(stdout)
            return

        # The context manager closes the file even if a write fails; UTF-8 is
        # explicit because search terms may contain non-ASCII text.
        with open(outfile, 'w', encoding='utf-8') as of:
            self._write_rows(of)

    def _write_rows(self, of):
        """Write the header line followed by one line per result to `of`."""
        # The header needs its own newline, otherwise the first data row ends
        # up on the same line as the column names.
        of.write(','.join(self.CSV_COLUMNS) + '\n')
        for result in self.results:
            # Each result's to_csv() already ends with a newline.
            of.write(result.to_csv())
|
||||||
|
|
||||||
|
|
||||||
|
class Wikidata_Result(object):
    """One search hit: the term searched, the entity found, and where it ranked."""

    # store unique entities found in the search results, the position in the search result, and the date
    __slots__ = ['search_term', 'entityid', 'pageid', 'search_position', 'timestamp']

    def __init__(self,
                 term,
                 search_result,
                 position):
        """Build a result from one raw search record.

        term          -- the search term; surrounding whitespace (such as the
                         newline left over from reading a file) is stripped.
        search_result -- mapping with at least 'title', 'pageid' and
                         'timestamp' keys.
        position      -- index of this hit within the results for `term`.
        """
        self.search_term = term.strip()
        self.entityid = search_result['title']
        self.pageid = search_result['pageid']
        self.search_position = position
        self.timestamp = search_result['timestamp']

    def to_csv(self):
        """Return this result as a single newline-terminated CSV row.

        Fields appear in the order search_term, entityid, pageid,
        search_position, timestamp. Values containing commas, quotes or
        newlines are quoted so the row always parses back into five fields.
        """
        # Local imports keep this class self-contained.
        import csv
        import io

        buffer = io.StringIO()
        # lineterminator='\n' keeps the row ending the same as a plain join
        # would produce (csv defaults to '\r\n').
        csv.writer(buffer, lineterminator='\n').writerow([
            self.search_term,
            self.entityid,
            str(self.pageid),
            str(self.search_position),
            str(self.timestamp),
        ])
        return buffer.getvalue()
|
||||||
|
|
||||||
|
def run_wikidata_searches(terms_file = '../data/input/base_terms.txt', outfile="../data/output/wikidata_search_results.csv"):
    """Search Wikidata for every term in `terms_file` and save the hits as CSV.

    terms_file -- path to a text file with one search term per line.
    outfile    -- path of the CSV file to write; passed straight through to
                  Wikidata_ResultSet.to_csv().
    """
    # Only search_wikidata is imported at the top of this module, so the
    # session factory is pulled in here to avoid a NameError.
    from wikidata_api_calls import get_wikidata_api

    resultset = Wikidata_ResultSet()

    # One session serves every search; there is no need to rebuild it per term.
    api = get_wikidata_api()

    with open(terms_file, 'r', encoding='utf-8') as terms:
        for line in terms:
            term = line.strip()
            if not term:
                # Skip blank lines rather than sending an empty search.
                continue
            search_results = search_wikidata(api, term)
            resultset.extend(term, search_results)

    resultset.to_csv(outfile)
|
||||||
|
|
||||||
|
|
||||||
|
## search each of the base terms in wikidata
|
||||||
|
|
||||||
|
# store unique entities found in the search results, the position in the search result, and the date
|
||||||
|
|
||||||
|
# Script entry point: run the searches with the default input and output paths.
if __name__ == "__main__":
    run_wikidata_searches()
|
29
translations/src/wikidata_api_calls.py
Normal file
29
translations/src/wikidata_api_calls.py
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
# File defines functions for making api calls to find translations and transliterations for key terms.
|
||||||
|
|
||||||
|
import mwapi
|
||||||
|
import sys
|
||||||
|
sys.path.append("..")
|
||||||
|
from defaults import user_agent
|
||||||
|
|
||||||
|
def get_wikidata_api():
    """Return an mwapi session pointed at Wikidata, tagged with the project user agent."""
    # `host` must be the bare scheme + hostname: mwapi appends its API path
    # (default "/w/api.php") itself, so including the path here would double it.
    session = mwapi.Session(host="https://www.wikidata.org", user_agent=user_agent)
    return session
|
||||||
|
|
||||||
|
def search_wikidata(session, term, *args, **kwargs):
    """Search for `term` and return the list of raw search hits.

    session -- object whose .get() performs the API request and returns the
               decoded response as a mapping (an mwapi session in this project).
    term    -- text passed as `srsearch`.
    *args, **kwargs -- forwarded unchanged to session.get(), after the fixed
               parameters (action='query', list='search', srlimit='max',
               srnamespace=0).

    Returns the value stored under response['query']['search'].
    Raises mwapi.session.APIError when the response has no such entry.
    """
    search_results = session.get(action='query',
                                 list='search',
                                 srsearch=term,
                                 # srqiprofile='popular_inclinks_pv',
                                 srlimit='max',
                                 srnamespace=0,
                                 *args,
                                 **kwargs)

    # A response without a 'query' section must reach the explicit error below
    # instead of failing with AttributeError on None.
    query = search_results.get('query') or {}
    results = query.get('search')

    if results is None:
        # APIError is built from (code, info, content); the raw response is
        # attached as content to help with debugging.
        raise mwapi.session.APIError("no-results",
                                     f"No results for query: {term}",
                                     search_results)

    return results
|
Loading…
Reference in New Issue
Block a user