1
0
Files
social-media-chapter/code/data_collection/request_functions.py
Benjamin Mako Hill dd420c77de initial import of material for public archive into git
We're creating a fresh archive because the history for our old chapter includes
API keys, data files, and other material we can't share.
2018-01-21 17:15:51 -08:00

167 lines
6.1 KiB
Python

import requests
from datetime import datetime
from scopus_api import key as API_KEY
import json
import os
import logging
import re
# Send all log output (DEBUG and up) to the root handler / stderr.
logging.basicConfig(level=logging.DEBUG)

# How many times to retry a timed-out API call before giving up.
RETRY_COUNT = 5
# Per-request timeout, in seconds.
TIMEOUT_SECS = 10

# Initialize a global session object
# Every function in this module issues requests through this shared
# session, so the API key (and the auth token added later by
# get_token) ride along on every call.
s = requests.Session()
s.headers.update({'X-ELS-APIKey' : API_KEY,
                  'X-ELS-ResourceVersion' : 'XOCS',
                  'Accept' : 'application/json'})
def get_token(location_id = None):
    '''Given a location_id, gets an authentication token.

    Calls the Elsevier authenticate endpoint and stores the returned
    token in the shared session's X-ELS-AuthToken header, so all
    subsequent requests made through `s` are authenticated.

    :param location_id: optional institution choice, passed as the
        'choice' parameter when multiple institutions match
    :raises requests.exceptions.HTTPError: if authentication fails
    '''
    # Use logging (configured at module level) rather than print,
    # consistent with the rest of the module.
    logging.info('Getting a token')
    # NOTE(review): switched http -> https for consistency with
    # make_search_call; confirm the endpoint accepts it.
    api_resource = 'https://api.elsevier.com/authenticate'
    # Parameters
    payload = {'platform': 'SCOPUS',
               'choice': location_id}
    r = s.get(api_resource, params=payload)
    r.raise_for_status()
    s.headers['X-ELS-AuthToken'] = r.json()['authenticate-response']['authtoken']
def get_search_results(query, output_file, results_per_call = 200,
        tot_results=None, year=None, sort='+title', citation_call=False):
    '''Handles getting search results. Takes a query and an output
    file. Writes as many of the search results as possible to the
    output file as JSON dictionaries, one per line.

    Scopus caps any single sorted scan at 5,000 records, so larger
    result sets are split in two: the first half is fetched in
    ascending title order and the second half in descending order
    (sort='-title'), each via a recursive call to this function.
    write_results later re-reverses descending pages so the file
    stays in ascending order.

    :param query: Scopus search query string
    :param output_file: writable file object; receives one JSON dict per line
    :param results_per_call: page size for each API request
    :param tot_results: total result count if already known; None means
        "ask the API first" (the probe's page is kept, not discarded)
    :param year: restricts the search via the API's 'date' parameter
    :param sort: Scopus sort key; a leading '-' marks descending order
    :param citation_call: if True, query is assumed to look like
        refeid(EID) and the parent EID is attached to every record
    :returns: None
    '''
    result_set = []          # list of (response_json, sort_key) pairs
    results_added = 0
    def curr_call(start=0, count=results_per_call):
        '''Shorthand for the current call: DRY'''
        return make_search_call(query, start=start,
                                count=count, year=year, sort=sort)
    if tot_results == None:
        # Call the API initially to figure out how many results there are, and write the results
        initial_results = curr_call(count=results_per_call)
        tot_results = int(initial_results['search-results']['opensearch:totalResults'])
        result_set.append((initial_results, sort))
        results_added += results_per_call
    logging.debug("Total results: {}".format(tot_results))
    if tot_results == 0:
        return None
    if tot_results > 5000:
        # If this is just one year, we can't get any more granular, and
        # we need to return what we can.
        if tot_results > 10000:
            print("{} results for {}. We can only retrieve 10,000".format(tot_results, year))
            first_half = last_half = 5000
        else:
            # Get half, and correct for odd # of results
            first_half = tot_results//2 + tot_results % 2
            last_half = tot_results//2
        # Break the search into the first half and the bottom half of results.
        # NOTE(review): the recursive calls restart from result 0, while any
        # page fetched by the initial probe above is still in result_set and
        # gets written at the bottom of this function too — it looks like
        # those records can be written twice. Verify against the output.
        get_search_results(query, output_file,
                           year = year,
                           tot_results=first_half)
        # Get the other half
        get_search_results(query, output_file,
                           year = year,
                           tot_results = last_half, sort='-title')
    # If there are 5000 or fewer to retrieve, then get them
    else:
        logging.debug('Retrieving {} results'.format(tot_results))
        # As long as there are more citations to retrieve, then do it, and write
        # them to the file
        while results_added < tot_results:
            # If we are near the end, then only get as many results as are left.
            to_retrieve = min(results_per_call, (tot_results - results_added))
            curr_results = curr_call(start=results_added, count=to_retrieve)
            result_set.append((curr_results, sort))
            # Incrementing by the full page size (rather than to_retrieve) is
            # harmless: only the final page is short, and after it the loop
            # condition fails regardless.
            results_added += results_per_call
    # This is hacky, but I'm doing it
    # If this is a citation call, then construct metadata to be written with the result
    if citation_call:
        metadata = {'parent_eid': re.match(r'refeid\((.*)\)', query).group(1)}
    else:
        metadata = {}
    write_results(result_set, output_file, metadata)
def write_results(result_set, output_file, metadata=None):
    '''Write retrieved search results to output_file, one JSON dict per line.

    :param result_set: list of (response_json, sort_key) pairs, where
        response_json is a decoded Scopus search response. A sort key
        with a leading '-' means that page was fetched in descending
        order, so its entries are reversed before writing to keep the
        output file in ascending order.
    :param output_file: writable file object
    :param metadata: optional dict of extra key/value pairs merged into
        every entry before it is written (e.g. parent_eid)
    :raises KeyError: if a response has no 'search-results' entries
    '''
    # None sentinel instead of a mutable {} default argument.
    if metadata is None:
        metadata = {}
    for search_json, sort_key in result_set:
        # Raises KeyError on a malformed response, same as before —
        # the old try/except KeyError: raise was a no-op.
        entries = list(search_json['search-results']['entry'])
        if sort_key.startswith('-'):
            entries.reverse()
        for entry in entries:
            entry.update(metadata)
            json.dump(entry, output_file)
            output_file.write('\n')
def make_search_call(query, start=0, count=200,
                     sort='+title', year=None,
                     retry_limit = RETRY_COUNT,
                     timeout_secs = TIMEOUT_SECS):
    '''Make one call to the Scopus search API and return the decoded JSON.

    Retries up to retry_limit times on timeouts, and re-authenticates
    (via get_token) and retries on HTTP 401.

    :param query: Scopus query string
    :param start: offset of the first result to return
    :param count: number of results to return in this page
    :param sort: Scopus sort key ('+'/'-' prefix = ascending/descending)
    :param year: restricts the search via the 'date' parameter
    :param retry_limit: total attempts before giving up
    :param timeout_secs: per-request timeout in seconds
    :returns: decoded JSON response (dict)
    :raises requests.exceptions.Timeout: when the retry budget is
        exhausted (NOTE: repeated 401s also land here, since 401
        retries share the same budget)
    :raises requests.exceptions.HTTPError: on HTTP 400 or any other
        error status
    '''
    api_resource = "https://api.elsevier.com/content/search/scopus"
    # Parameters
    payload = {'query': query,
               'count': count,
               'start': start,
               'sort': sort,
               'date': year}
    for _ in range(retry_limit):
        try:
            r = s.get(api_resource,
                      params=payload,
                      timeout=timeout_secs)
            logging.debug(r.url)
            if r.status_code == 401:
                # Token expired or missing: refresh it and retry.
                get_token()
                continue
            if r.status_code == 400:
                # Fixed typo in the original message ("acces").
                raise requests.exceptions.HTTPError("Bad request; possibly you aren't connected to an institution with Scopus access?")
            break
        except requests.exceptions.Timeout:
            # Swallow the timeout and try again; the for/else below
            # fires only if every attempt was consumed.
            pass
    else:
        raise requests.exceptions.Timeout('Timeout Error')
    r.raise_for_status()
    return r.json()
def get_cited_by(eid, output_file):
    '''Fetch every record that cites the given EID and write them to output_file.

    Thin wrapper around get_search_results using a refeid() query;
    citation_call=True makes each written record carry its parent EID.
    '''
    citing_query = 'refeid({})'.format(eid)
    return get_search_results(citing_query, output_file,
                              results_per_call=200,
                              citation_call=True)
def get_abstract(eid, retry_limit = RETRY_COUNT,
                 timeout_secs = TIMEOUT_SECS):
    '''Fetch the abstract record for a document by its EID.

    Retries on timeouts and re-authenticates on HTTP 401, mirroring
    make_search_call's retry loop.

    :param eid: Scopus EID of the document
    :param retry_limit: total attempts before giving up
    :param timeout_secs: per-request timeout in seconds
    :returns: raw response body decoded as UTF-8, or None on HTTP 404
        (missing abstracts are expected and are not an error)
    :raises requests.exceptions.Timeout: when the retry budget is exhausted
    :raises requests.exceptions.HTTPError: on HTTP 400 or any other
        error status
    '''
    # NOTE(review): switched http -> https for consistency with
    # make_search_call; confirm the endpoint accepts it.
    api_resource = "https://api.elsevier.com/content/abstract/eid/{}".format(eid)
    for _ in range(retry_limit):
        try:
            # The original passed an always-empty params dict; dropped.
            r = s.get(api_resource, timeout=timeout_secs)
            if r.status_code == 401:
                # Token expired or missing: refresh it and retry.
                get_token()
                continue
            if r.status_code == 400:
                # Fixed typo in the original message ("acces").
                raise requests.exceptions.HTTPError("Bad request; possibly you aren't connected to an institution with Scopus access?")
            break
        except requests.exceptions.Timeout:
            # Swallow the timeout and try again; the for/else below
            # fires only if every attempt was consumed.
            pass
    else:
        raise requests.exceptions.Timeout('Timeout Error')
    if r.status_code == 404:
        return None
    r.raise_for_status()
    return r.content.decode('utf-8')