rename 'transliterations' to 'keywords'

2020-03-31 15:15:01 -07:00
parent 09d171608f
commit 98b07b8098
21 changed files with 0 additions and 0 deletions
--- a/keywords/src/init.py
+++ b/keywords/src/init.py
@@ -0,0 +1,2 @@
+from wikidata_api_calls import *
+from find_entities import *
--- a/keywords/src/collect_trends.py
+++ b/keywords/src/collect_trends.py
@@ -0,0 +1,76 @@
+# this follows a similar approach to nick's trends.js but in python
+from pytrends.request import TrendReq
+from datetime import datetime
+from os import path
+import csv
+from itertools import islice, chain, zip_longest
+import pandas as pd
+
+
+# from itertools recipes
+#https://docs.python.org/3.6/library/itertools.html#itertools-recipes
+def grouper(iterable, n, fillvalue=None):
+    "Collect data into fixed-length chunks or blocks"
+    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx"
+    args = [iter(iterable)] * n
+    return zip_longest(*args, fillvalue=fillvalue)
+
+def get_daily_trends():
+    trendReq = TrendReq(backoff_factor=0.2)
+    today_trending = trendReq.today_searches()
+    daily_trends_outfile = path.join("..","output","daily_google_trends.csv")
+
+    write_header = False
+    header = ['date','term','top']
+
+    if not path.exists(daily_trends_outfile):
+        write_header = True
+
+    with open("../output/intermediate/daily_google_trends.csv",'a',newline='') as of:
+        writer = csv.writer(of)
+        if write_header:
+            writer.writerow(header)
+
+        for i, trend in enumerate(today_trending):
+            writer.writerow([str(datetime.now().date()),trend,i])
+
+def get_related_queries(stems):
+    # we have to batch these in sets of 5
+    trendReq = TrendReq(backoff_factor=0.2)
+    def _get_related_queries(chunk):
+        kw_list = list(filter(lambda x: x is not None, chunk))
+        trendReq.build_payload(kw_list=kw_list)
+        related_queries = trendReq.related_queries()
+        for term, results in related_queries.items():
+            for key, df in results.items():
+                if df is not None:
+                    df["term"] = term
+                yield (key,df)
+
+    l = chain(*map(_get_related_queries, grouper(stems,5)))
+    out = {}
+    for key, value in l:
+        if key in out:
+            out[key].append(value)
+        else:
+            out[key] = [value]
+
+    for k in out.keys():
+        df = pd.concat(out[k])
+        df['date'] = str(datetime.now().date())
+        out[k] = df
+        outfile = path.join('..','output','intermediate',f"related_searches_{k}.csv")
+        if path.exists(outfile):
+            mode = 'a'
+            header = False
+        else:
+            mode = 'w'
+            header = True
+
+        df.to_csv(outfile, mode=mode, header=header,index=False)
+
+stems = [t.strip() for t in open("../resources/base_terms.txt",'r')]
+
+get_daily_trends()
+
+get_related_queries(stems)
--- a/keywords/src/compile_transliterated_phrases.sh
+++ b/keywords/src/compile_transliterated_phrases.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# For now these scripts don't accept command line arguments. It's an MVP
+
+echo "Reading Google trends"
+python3 collect_trends.py
+
+echo "Searching for Wikidata entities using base_terms.txt"
+python3 wikidata_search.py ../resources/base_terms.txt --output ../output/intermediate/wikidata_search_results.csv
+
+echo "Searching for Wikidata entities using Google trends"
+python3 wikidata_search.py ../output/intermediate/related_searches_rising.csv ../output/intermediate/related_searches_top.csv --use-gtrends --output ../output/intermediate/wikidata_search_results_from_gtrends.csv
+
+echo "Finding transliterations from Wikidata using sparql"
+python3 wikidata_transliterations.py  ../output/intermediate/wikidata_search_results_from_gtrends.csv  ../output/intermediate/wikidata_search_results.csv --topN 10 20 --output ../output/csv/$(date '+%Y-%m-%d')_wikidata_entity_labels.csv
+
--- a/keywords/src/defaults.py
+++ b/keywords/src/defaults.py
@@ -0,0 +1 @@
+user_agent = "COVID-19 Digital Observatory, a Community Data Science Collective project. (https://github.com/CommunityDataScienceCollective/COVID-19_Digital_Observatory)"  # project identification string; wikidata_api_calls.py passes it to mwapi.Session as user_agent
--- a/keywords/src/wikidata_api_calls.py
+++ b/keywords/src/wikidata_api_calls.py
@@ -0,0 +1,35 @@
+# File defines functions for making api calls to find translations and transliterations for key terms.
+import mwapi
+import requests
+import sys
+import time
+from defaults import user_agent
+
+def get_wikidata_api():
+    session = mwapi.Session(host="https://wikidata.org/w/api.php", user_agent=user_agent)
+    return session
+
+def search_wikidata(session, term, *args, **kwargs):
+    search_results = session.get(action='query',
+                                 list='search',
+                                 srsearch=term,
+#                                 srqiprofile='popular_inclinks_pv',
+                                 srlimit='max',
+                                 srnamespace=0,
+                                 *args,
+                                 **kwargs)
+
+
+    query = search_results.get('query', None)
+    results = query.get('search', None)
+
+    if results is None:
+        raise mwapi.session.APIError(f"No results for query: {term}")
+
+    return results
+
+def run_sparql_query(q):
+    results = requests.get("https://query.wikidata.org/bigdata/namespace/wdq/sparql",params={"format":"json","query":q})
+    time.sleep(2)
+    return results
+
--- a/keywords/src/wikidata_search.py
+++ b/keywords/src/wikidata_search.py
@@ -0,0 +1,95 @@
+# generate a list of wikidata entities related to keywords
+from os import path
+from sys import stdout
+from wikidata_api_calls import search_wikidata, get_wikidata_api
+import csv
+from itertools import chain
+
+class Wikidata_ResultSet:
+    def __init__(self):
+        self.results = []
+
+    def extend(self, term, results):
+        self.results.append(
+            (Wikidata_Result(term, result, i)
+             for i, result in enumerate(results))
+        )
+
+    def to_csv(self, outfile=None, mode='w'):
+        if outfile is None:
+            of = stdout
+
+        else:
+            if path.exists(outfile) and mode != 'w':
+                of = open(outfile,'a',newline='')
+            else:
+                of = open(outfile,'w',newline='')
+        writer = csv.writer(of)
+        writer.writerow(Wikidata_Result.__slots__)
+        writer.writerows(map(Wikidata_Result.to_list, chain(* self.results)))
+
+
+class Wikidata_Result:
+    # store unique entities found in the search results, the position in the search result, and the date
+    __slots__=['search_term','entityid','pageid','search_position','timestamp']
+
+    def __init__(self,
+                 term,
+                 search_result,
+                 position):
+
+        self.search_term = term.strip()
+        self.entityid = search_result['title']
+        self.pageid = int(search_result['pageid'])
+        self.search_position = int(position)
+        self.timestamp = search_result['timestamp']
+
+    def to_list(self):
+        return [self.search_term,
+                self.entityid,
+                self.pageid,
+                self.search_position,
+                self.timestamp]
+    
+def run_wikidata_searches(terms):
+    api = get_wikidata_api()
+    resultset = Wikidata_ResultSet()
+    for term in terms:
+        search_results = search_wikidata(api, term)
+        resultset.extend(term, search_results)
+    return resultset
+
+def read_google_trends_files(terms_files):
+    def _read_file(infile):
+        return csv.DictReader(open(infile,'r',newline=''))
+
+    for row in chain(* [_read_file(terms_file) for terms_file in terms_files]):
+        yield row['query']
+
+
+def trawl_google_trends(terms_files, outfile = None, mode='w'):
+    terms = list(read_google_trends_files(terms_files))
+    resultset = run_wikidata_searches(terms)
+    resultset.to_csv(outfile, mode)
+
+def trawl_base_terms(infiles, outfile = None, mode='w'):
+    terms = list(chain(* (open(infile,'r') for infile in infiles)))
+    resultset = run_wikidata_searches(terms)
+    resultset.to_csv(outfile, mode)
+
+    ## search each of the base terms in wikidata
+
+    # store unique entities found in the search results, the position in the search result, and the date
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser("Search wikidata for entities related to a set of terms.")
+    parser.add_argument('inputs', type=str, nargs='+', help='one or more files to read')
+    parser.add_argument('--use-gtrends', action='store_true', help = 'toggle whether the input is the output from google trends')
+    parser.add_argument('--output', type=str, help='an output file. defaults to stdout')
+    parser.add_argument('--overwrite', action='store_true', help = 'overwrite existing output files instead of appending')
+    args = parser.parse_args()
+    if args.use_gtrends:
+        trawl_google_trends(args.inputs, args.output)
+    else:
+        trawl_base_terms(args.inputs, args.output)
--- a/keywords/src/wikidata_transliterations.py
+++ b/keywords/src/wikidata_transliterations.py
@@ -0,0 +1,107 @@
+from wikidata_api_calls import run_sparql_query
+from itertools import chain, islice
+import csv
+from json import JSONDecodeError
+from os import path
+
+class LabelData:
+    __slots__ = ['entityid','label','langcode','is_alt']
+
+    def __init__(self, wd_res, is_alt):
+        obj = wd_res.get('label',None)
+        self.label = obj.get('value',None)
+        self.langcode = obj.get('xml:lang',None)
+        self.entityid = wd_res.get('entity',None).get('value',None)
+        self.is_alt = is_alt
+
+    def to_list(self):
+        return [self.entityid,
+                self.label,
+                self.langcode,
+                self.is_alt]
+
+def GetAllLabels(in_csvs, outfile, topNs):
+
+    def load_entity_ids(in_csv, topN=5):
+        with open(in_csv,'r',newline='') as infile:
+            reader = list(csv.DictReader(infile))
+            for row in reader:
+                if int(row['search_position']) < topN:
+                    yield row["entityid"]
+
+    ids = set(chain(* map(lambda in_csv, topN: load_entity_ids(in_csv, topN), in_csvs, topNs)))
+
+    labeldata = GetEntityLabels(ids)
+
+    with open(outfile, 'w', newline='') as of:
+        writer = csv.writer(of)
+        writer.writerow(LabelData.__slots__)
+        writer.writerows(map(LabelData.to_list,labeldata))
+
+    
+def GetEntityLabels(entityids):
+
+    def run_query_and_parse(query, is_alt):
+        results = run_sparql_query(query)
+        try:
+            jobj = results.json()
+
+            res = jobj.get('results',None)
+            if res is not None:
+                res = res.get('bindings',None)
+            if res is None:
+                raise requests.APIError(f"got invalid response from wikidata for {query % entityid}")
+
+            for info in res:
+                yield LabelData(info, is_alt)
+
+        except JSONDecodeError as e:
+            print(e)
+            print(query)
+            
+    def prep_query(query, prop, entityids):
+        values = ' '.join(('wd:{0}'.format(id) for id in entityids))
+        return query.format(prop, values)
+    
+    base_query = """
+    SELECT DISTINCT ?entity ?label WHERE {{
+    ?entity {0} ?label;
+    VALUES ?entity  {{ {1} }}
+    }}"""
+
+    # we can't get all the entities at once. how about 100 at a time?
+    chunksize = 100
+    entityids = (id for id in entityids)
+    chunk = list(islice(entityids, chunksize))
+    calls = []
+    while len(chunk) > 0:
+        label_query = prep_query(base_query, "rdfs:label", chunk)
+        altLabel_query = prep_query(base_query, "skos:altLabel", chunk)
+        label_results = run_query_and_parse(label_query,  is_alt=False)
+        altLabel_results = run_query_and_parse(altLabel_query, is_alt=True)
+        calls.extend([label_results, altLabel_results])
+        chunk = list(islice(entityids, chunksize))
+
+    return chain(*calls)
+        
+
+def find_new_output_file(output, i = 1):
+    if path.exists(output):
+        name, ext = path.splitext(output)
+
+        return find_new_output_file(f"{name}_{i}.{ext}", i+1)
+    else:
+        return output
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser("Use wikidata to find transliterations of terms")
+    parser.add_argument('inputs', type=str, nargs='+', help='one or more files to read. the inputs are generated by wikidata_search.py')
+    parser.add_argument('--topN', type=int, nargs='+', help='limit number of wikidata search results to use, can pass one arg for each source.')
+    parser.add_argument('--output', type=str, help='an output file. defaults to stdout',default=20)
+
+    args = parser.parse_args()
+
+    output = find_new_output_file(args.output)
+
+    GetAllLabels(args.inputs, output, topNs=args.topN)
				`@@ -0,0 +1 @@`
				`user_agent = "COVID-19 Digital Observatory, a Community Data Science Collective project. (https://github.com/CommunityDataScienceCollective/COVID-19_Digital_Observatory)"`