# Changelog: added statistics to the writeup and fixed an error; updated the
# scraper descriptions to be more accurate.
#!/usr/bin/python3
# -*- coding: utf-8 -*-
""" script to scrape a list of EasyChair review data and save them as CSV files """
#
# (C) Benjamin Mako Hill, 2018
# (C) Federico Leva, 2016
#
# Distributed under the terms of the MIT license.
#
__version__ = '0.2.0'

# NOTE: change all copies of FIXME

# Standard library
import re

# Third-party
import pandas as pd
import requests
from lxml import html
# NOTE(review): to_bytes is imported but never used below — confirm before removing.
from kitchen.text.converters import to_bytes
|
# EasyChair authentication cookies, copied from a logged-in browser session.
# The FIXME values must be replaced with real cookie values before running
# (see the NOTE at the top of the file).
cj = requests.utils.cookiejar_from_dict( { "cool2": "FIXME", "cool1": "FIXME" } )

# Present a desktop browser user agent; presumably EasyChair serves different
# (or no) content to unrecognized clients — confirm if changing it.
headers = {"User-Agent": "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:42.0) Gecko/20100101 Firefox/42.0" }

# Fetch the conference status page ("a=" is the conference id; replace FIXME)
# and collect the link to each submission's "review for paper" page.
index = requests.get("https://easychair.org/conferences/status.cgi?a=FIXME", cookies=cj, headers=headers)
indexdata = html.fromstring(index.text)
urls = indexdata.xpath('//a[contains(@href,"review_for_paper.cgi")]/@href')

# Accumulator: one row per review score, filled in by the scrape loop below.
reviews = pd.DataFrame()
|
|
|
|
def empty_to_none(s):
    """Map the empty string to None; return any other value unchanged."""
    return None if s == "" else s
|
|
|
|
for url in urls:
    # Fetch and parse one submission's "Reviews and Comments" page.
    sub_html = html.fromstring(requests.get("https://easychair.org/conferences/" + url,
                                            cookies=cj, headers=headers).text)

    # The page title is "Reviews and Comments on Submission <id>";
    # keep just the numeric submission id.
    sub_id = sub_html.xpath('//title')[0].text
    sub_id = re.sub(r'^Reviews and Comments on Submission (\d+)$', r'\1', sub_id)

    # Column names for the cells of each review row, in table order.
    # BUG FIX: the original read "'confidence' 'overall'" (missing comma),
    # which concatenated the two strings into a single 'confidenceoverall'
    # label and silently dropped the final column when zipped below.
    score_labels = ['label', 'date', 'reviewer', 'subreviewer', 'score', 'confidence', 'overall']

    # Each <tr> of the table headed "PC member" is one review.
    for tr in sub_html.xpath('//th[text()="PC member"]/../../../tbody/tr'):
        score = [td.text_content() for td in tr.xpath('td')]
        score = [empty_to_none(x) for x in score]
        score_dict = dict(zip(score_labels, score))
        score_dict["sub_id"] = sub_id
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent, forward-compatible way to add the row.
        reviews = pd.concat([reviews, pd.DataFrame(score_dict, index=[0])])
|
|
|
|
# Review dates on the page omit the year; append it so pandas can parse a
# full date.  NOTE(review): the year 2017 is hard-coded — confirm it matches
# the conference actually being scraped.
reviews["date"] = reviews["date"] + ", 2017"
reviews["date"] = pd.to_datetime(reviews["date"])

# Write one CSV row per review; the accumulated 0-based index is meaningless,
# so drop it.
reviews.to_csv("opensym-reviews-20180113.csv", index=False, index_label=False)
|
|
|