131 lines
5.9 KiB
Python
Executable File
131 lines
5.9 KiB
Python
Executable File
#!/usr/bin/python3
|
|
# -*- coding: utf-8 -*-
|
|
""" Bot to scrape a list of EasyChair submissions and upload them to a wiki """
|
|
#
|
|
# (C) Benjamin Mako Hill, 2018
|
|
# (C) Federico Leva, 2016
|
|
#
|
|
# Distributed under the terms of the MIT license.
|
|
#
|
|
__version__ = '0.2.0'
|
|
|
|
# NOTE: change all copies of FIXME
|
|
|
|
import requests
|
|
from lxml import html
|
|
import re
|
|
from kitchen.text.converters import to_bytes
|
|
import pandas as pd
|
|
|
|
# Authentication state: EasyChair session cookies must be pasted in by
# hand before running (hence the FIXME placeholders).
_cookie_values = {"cool2": "FIXME", "cool1": "FIXME"}
cj = requests.utils.cookiejar_from_dict(_cookie_values)

# Present a desktop-browser User-Agent so EasyChair serves the normal pages.
headers = {
    "User-Agent": ("Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:42.0) "
                   "Gecko/20100101 Firefox/42.0"),
}

# Fetch the "show all submissions" index page and collect the link to each
# individual submission's detail page.
index = requests.get(
    "https://easychair.org/conferences/submission_show_all.cgi?a=FIXME",
    cookies=cj, headers=headers)
indexdata = html.fromstring(index.text)
urls = indexdata.xpath('//a[contains(@href,"submission_info_show.cgi")]/@href')

# Accumulator tables; the scrape loop below appends rows to each of these.
(submissions, authors, reviewers,
 author_keywords, easychair_keywords, bids) = (pd.DataFrame() for _ in range(6))
|
|
|
def parse_bid_tbl(tbl):
    """Parse one bid table into a (bid_level, [bidder, ...]) pair.

    Cell [0][0] holds the bid level formatted like "yes:" (the trailing
    colon and surrounding whitespace are stripped); cell [0][1] holds a
    comma-separated list of bidder names.
    """
    key = re.sub(r'^\s*([a-z]+):\s*$', r'\1', tbl[0][0].text)
    return (key, tbl[0][1].text.split(", "))


for url in urls:
    # Fetch and parse one submission's detail page.
    sub_html = html.fromstring(requests.get("https://easychair.org/conferences/" + url,
                                            cookies=cj, headers=headers).text)

    # capture features of submissions: the page <title> is "Submission NNN",
    # so strip it down to the bare numeric id.
    sub_id = sub_html.xpath('//title')[0].text
    sub_id = re.sub(r'^Submission (\d+)$', r'\1', sub_id)

    final_type = sub_html.xpath('//td[text()="Category"]/../td[2]')[0].text
    title = sub_html.xpath('//td[text()="Title:"]/../td[2]/text()')[0].strip()

    # it's possible to submit papers w/o topics
    try:
        topic = sub_html.xpath('//span[text()="Topics:"]/../../td[2]/text()')[0].strip()
    except IndexError:
        topic = None

    abstract = sub_html.xpath('//td[text()="Abstract:"]/../td[2]')[0].text.strip()
    result = sub_html.xpath('//td[text()="Decision:"]/../td[2]')[0].text_content().strip()

    # NOTE: DataFrame.append() was deprecated in pandas 1.4 and removed in
    # 2.0, so all accumulation below uses pd.concat() with a one-row frame.
    submissions = pd.concat([submissions,
                             pd.DataFrame({'sub_id': sub_id,
                                           'type': final_type,
                                           'title': title,
                                           'topic': topic,
                                           'abstract': abstract,
                                           'result': result},
                                          index=[0])])

    # create a list of authors (rows with id "row37" are excluded by the
    # original xpath; presumably not author rows -- TODO confirm)
    names = sub_html.xpath('//b[text()="Authors"]/../../..//tr[@id!="row37"]/td[1]/text()')
    surnames = sub_html.xpath('//b[text()="Authors"]/../../..//tr[@id!="row37"]/td[2]/text()')
    countries = sub_html.xpath('//b[text()="Authors"]/../../..//tr[@id!="row37"]/td[4]/text()')

    # The first entry of each list is skipped, matching the original
    # range(1, len(names)); presumably it is a table-header row -- TODO
    # confirm this doesn't drop the first real author.
    for name, surname, country in zip(names[1:], surnames[1:], countries[1:]):
        authors = pd.concat([authors,
                             pd.DataFrame({'sub_id': sub_id,
                                           'author': " ".join([name, surname]),
                                           'country': country},
                                          index=[0])])

    # add the list of reviewers ("Assigned to:" = ordinary PC members)
    assigned_to = sub_html.xpath('//span[text()="Assigned to:"]/../../td[2]')[0].text.strip().split(", ")

    reviewers = pd.concat([reviewers,
                           pd.DataFrame({'sub_id': sub_id,
                                         'reviewer': assigned_to,
                                         'type': 'normal'})])

    # The senior PC member is displayed as "Name <email>"; keep the name only.
    senior_pc = sub_html.xpath('//span[text()="Senior PC member:"]/../../td[2]')[0].text
    senior_pc = re.sub(r'^(.+?) \<.*$', r'\1', senior_pc)

    reviewers = pd.concat([reviewers,
                           pd.DataFrame({'sub_id': sub_id,
                                         'reviewer': senior_pc,
                                         'type': 'senior'},
                                        index=[0])])

    # add author keywords, normalized to lower case
    sub_author_keywords = sub_html.xpath('//div[parent::td[@class="value"]]/text()')
    sub_author_keywords = [x.lower() for x in sub_author_keywords]

    author_keywords = pd.concat([author_keywords,
                                 pd.DataFrame({'sub_id': sub_id,
                                               'keyword': sub_author_keywords})])

    # easychair keywords: each keyphrase looks like "some phrase (12)",
    # where the parenthesized number accompanies the phrase.
    sub_easychair_keywords = sub_html.xpath('//span[text()="EasyChair keyphrases:"]/../../td[2]')[0].text.strip()
    sub_easychair_keywords = sub_easychair_keywords.split(", ")

    for kw in sub_easychair_keywords:
        # BUGFIX: the original character class was [A-Za-z1-9 ], which
        # omitted "0"; any keyphrase containing a zero made re.match()
        # return None and the .groups() call raise AttributeError.
        g = re.match(r'^\s*([A-Za-z0-9 ]+) \((\d+)\)\s*$', kw).groups()
        easychair_keywords = pd.concat([easychair_keywords,
                                        pd.DataFrame({'sub_id': sub_id,
                                                      'keyword': g[0].lower(),
                                                      'number': g[1]},
                                                     index=[0])])

    #coi = sub_html.xpath('//span[text()="Conflict of interest:"]/../../td[2]')[0].text.strip()
    #if coi == "nobody":
    #    coi = []
    #else: # TODO this is not tested on /any/ data
    #    coi = coi.split(", ")

    # map bid level -> list of bidders for every bid table on the page
    sub_bids = dict(parse_bid_tbl(x)
                    for x in sub_html.xpath('//td[text()="Bid:"]/../td[2]/table[*]'))

    for bid_type in sub_bids:
        bids = pd.concat([bids,
                          pd.DataFrame({'sub_id': sub_id,
                                        'bid': bid_type,
                                        'bidder': sub_bids[bid_type]})])
|
|
|
|
|
|
# Write each accumulated table to its own dated CSV file; the date stamp in
# the filename records the scrape run.
_tables = {
    "submissions": submissions,
    "authors": authors,
    "reviewers": reviewers,
    "author_keywords": author_keywords,
    "easychair_keywords": easychair_keywords,
    "bids": bids,
}

for _name, _frame in _tables.items():
    _frame.to_csv("opensym-" + _name + "-20180113.csv",
                  index=False, index_label=False)
|
|
|