diff --git a/wikiq b/wikiq index f2cd9cd..3ed9d0e 100755 --- a/wikiq +++ b/wikiq @@ -37,6 +37,7 @@ import pyarrow.csv as pacsv DIFFS_URL = 'http://localhost:8000' + class PersistMethod: none = 0 sequence = 1 @@ -57,7 +58,7 @@ def fix_hex_digests(revs: list[mwxml.Revision]) -> list[mwxml.Revision]: if not rev.sha1 and not rev.deleted.text: rev.sha1 = sha1(bytes(rev.text, "utf8")).hexdigest() revs[i] = rev - i+=1 + i += 1 return revs @@ -378,8 +379,7 @@ class WikiqParser: # Iterate through pages for page in dump: - incremental_diffs = [] - previous_text = "" + payload = [] # skip namespaces not in the filter if self.namespace_filter is not None: @@ -420,33 +420,34 @@ class WikiqParser: regex_matches[k].append(v) if self.compute_incremental_diffs: - payload = { - 'arg1': previous_text, - 'arg2': rev.text, - } - - try: - response = requests.post(DIFFS_URL, json=payload) - response.raise_for_status() - incremental_diffs.append(response.text) - except requests.exceptions.ConnectionError as e: - print( - f"Connection Error: Could not connect to the server at {DIFFS_URL}. Make sure your local server is running.") - print(e) - raise e - except requests.exceptions.HTTPError as e: - print(f"HTTP Error: {e}") - print(f"Response Body: {response.text}") - raise e - except requests.exceptions.RequestException as e: - print(f"An unexpected error occurred: {e}") - raise e - - previous_text = rev.text + payload.append(rev.text) # Collect the set of pages currently buffered in the table so we can run multi-page functions on them. row_buffer = table.pop() if self.compute_incremental_diffs: + try: + response = requests.post(DIFFS_URL, json=payload) + response.raise_for_status() + incremental_diffs = response.json() + except requests.exceptions.ConnectionError as e: + print( + f"Connection Error: Could not connect to the server at {DIFFS_URL}. Make sure your local server is running.") + print(e) + raise e + except requests.exceptions.HTTPError as e: + print(f"HTTP Error: {e}") + print(f"Response Body: {response.text}") + raise e + except requests.exceptions.JSONDecodeError as e: + # Must come before RequestException as JSONDecodeError is + # a subclass. + print(f"JSON Decode Error: {e}", file=sys.stderr) + print(f"Response Body: {response.text}", file=sys.stderr) + raise e + except requests.exceptions.RequestException as e: + print(f"An unexpected error occurred: {e}") + raise e + row_buffer['incremental diffs'] = incremental_diffs is_revert_column: list[Union[bool, None]] = []