Pass arrays of revision texts to the diff server instead of one revision at a time

Batching the diff requests this way is 3.5x faster.

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
Will Beason 2025-06-23 14:17:01 -05:00
parent 96915a074b
commit 62db384aa4

27
wikiq
View File

@ -37,6 +37,7 @@ import pyarrow.csv as pacsv
DIFFS_URL = 'http://localhost:8000'
class PersistMethod:
none = 0
sequence = 1
@ -57,7 +58,7 @@ def fix_hex_digests(revs: list[mwxml.Revision]) -> list[mwxml.Revision]:
if not rev.sha1 and not rev.deleted.text:
rev.sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()
revs[i] = rev
i+=1
i += 1
return revs
@ -378,8 +379,7 @@ class WikiqParser:
# Iterate through pages
for page in dump:
incremental_diffs = []
previous_text = ""
payload = []
# skip namespaces not in the filter
if self.namespace_filter is not None:
@ -420,15 +420,15 @@ class WikiqParser:
regex_matches[k].append(v)
if self.compute_incremental_diffs:
payload = {
'arg1': previous_text,
'arg2': rev.text,
}
payload.append(rev.text)
# Collect the set of pages currently buffered in the table so we can run multi-page functions on them.
row_buffer = table.pop()
if self.compute_incremental_diffs:
try:
response = requests.post(DIFFS_URL, json=payload)
response.raise_for_status()
incremental_diffs.append(response.text)
incremental_diffs = response.json()
except requests.exceptions.ConnectionError as e:
print(
f"Connection Error: Could not connect to the server at {DIFFS_URL}. Make sure your local server is running.")
@ -438,15 +438,16 @@ class WikiqParser:
print(f"HTTP Error: {e}")
print(f"Response Body: {response.text}")
raise e
except requests.exceptions.JSONDecodeError as e:
# Must come before RequestException because JSONDecodeError is
# a subclass of RequestException and would otherwise be caught there.
print(f"JSON Decode Error: {e}", file=sys.stderr)
print(f"Response Body: {response.text}", file=sys.stderr)
raise e
except requests.exceptions.RequestException as e:
print(f"An unexpected error occurred: {e}")
raise e
previous_text = rev.text
# Collect the set of pages currently buffered in the table so we can run multi-page functions on them.
row_buffer = table.pop()
if self.compute_incremental_diffs:
row_buffer['incremental diffs'] = incremental_diffs
is_revert_column: list[Union[bool, None]] = []