initial import of opensym2017 scraper/report

2018-01-14 19:04:58 -08:00
commit 81a31bb283
4 changed files with 424 additions and 0 deletions
--- a/easychair-review-scraper.py
+++ b/easychair-review-scraper.py
@@ -0,0 +1,53 @@
+#!/usr/bin/python3
+# -*- coding: utf-8  -*-
+""" Scrape the reviews of a list of EasyChair submissions and save them to a CSV file """
+#
+# (C) Benjamin Mako Hill, 2018
+# (C) Federico Leva, 2016
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '0.2.0'
+
+# NOTE: replace every FIXME placeholder below (both cookie values and the status.cgi "a" parameter) before running
+
+import requests
+from lxml import html
+import re
+from kitchen.text.converters import to_bytes
+import pandas as pd
+
+# Request setup: the cookie jar and User-Agent header passed to each requests.get call (replace each FIXME first).
+cj = requests.utils.cookiejar_from_dict(dict(cool2="FIXME", cool1="FIXME"))
+headers = {"User-Agent": "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:42.0) Gecko/20100101 Firefox/42.0"}
+index = requests.get(url="https://easychair.org/conferences/status.cgi?a=FIXME", headers=headers, cookies=cj)
+indexdata = html.fromstring(index.text)
+reviews = pd.DataFrame()  # starts empty; rows are added while the per-submission pages are parsed
+urls = indexdata.xpath('//a[contains(@href,"review_for_paper.cgi")]/@href')
+
+def empty_to_none(s):
+    """Return None when `s` equals the empty string; otherwise return `s` unchanged."""
+    # Only values equal to "" are mapped; whitespace-only strings pass through as-is.
+    return None if s == "" else s
+
+# Fetch each linked review page and add one row per review-table <tr> to `reviews`.
+for url in urls:
+    sub_page = requests.get("https://easychair.org/conferences/" + url, cookies=cj, headers=headers)
+    sub_html = html.fromstring(sub_page.text)
+
+    # the page <title> is reduced to the submission number when it matches the expected pattern
+    sub_id = re.sub(r'^Reviews and Comments on Submission (\d+)$', r'\1', sub_html.xpath('//title')[0].text)
+
+    # positional labels for each row's <td> cells; zip() drops any cells beyond these seven
+    score_labels = ['label', 'date', 'reviewer', 'subreviewer', 'score', 'confidence', 'overall']
+    for tr in sub_html.xpath('//th[text()="PC member"]/../../../tbody/tr'):
+        score = [empty_to_none(td.text_content()) for td in tr.xpath('td')]
+        score_dict = dict(zip(score_labels, score), sub_id=sub_id)
+        # pd.concat rather than DataFrame.append, which was removed in pandas 2.0
+        reviews = pd.concat([reviews, pd.DataFrame(score_dict, index=[0])])
+
+# Append ", 2017" to every date string before parsing (presumably the scraped dates omit the year -- confirm).
+reviews["date"] = pd.to_datetime(reviews["date"] + ", 2017")
+# Write the collected rows to CSV without the row index.
+reviews.to_csv("opensym-reviews-20180113.csv", index=False, index_label=False)
+