Pass arrays of diffs instead of incremental

This is 3.5x faster.

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
parent
96915a074b
commit
62db384aa4
25
wikiq
25
wikiq
@ -37,6 +37,7 @@ import pyarrow.csv as pacsv
|
||||
|
||||
DIFFS_URL = 'http://localhost:8000'
|
||||
|
||||
|
||||
class PersistMethod:
|
||||
none = 0
|
||||
sequence = 1
|
||||
@ -378,8 +379,7 @@ class WikiqParser:
|
||||
|
||||
# Iterate through pages
|
||||
for page in dump:
|
||||
incremental_diffs = []
|
||||
previous_text = ""
|
||||
payload = []
|
||||
|
||||
# skip namespaces not in the filter
|
||||
if self.namespace_filter is not None:
|
||||
@ -420,15 +420,15 @@ class WikiqParser:
|
||||
regex_matches[k].append(v)
|
||||
|
||||
if self.compute_incremental_diffs:
|
||||
payload = {
|
||||
'arg1': previous_text,
|
||||
'arg2': rev.text,
|
||||
}
|
||||
payload.append(rev.text)
|
||||
|
||||
# Collect the set of pages currently buffered in the table so we can run multi-page functions on them.
|
||||
row_buffer = table.pop()
|
||||
if self.compute_incremental_diffs:
|
||||
try:
|
||||
response = requests.post(DIFFS_URL, json=payload)
|
||||
response.raise_for_status()
|
||||
incremental_diffs.append(response.text)
|
||||
incremental_diffs = response.json()
|
||||
except requests.exceptions.ConnectionError as e:
|
||||
print(
|
||||
f"Connection Error: Could not connect to the server at {DIFFS_URL}. Make sure your local server is running.")
|
||||
@ -438,15 +438,16 @@ class WikiqParser:
|
||||
print(f"HTTP Error: {e}")
|
||||
print(f"Response Body: {response.text}")
|
||||
raise e
|
||||
except requests.exceptions.JSONDecodeError as e:
|
||||
# Must come before RequestException as JSONDecodeError is
|
||||
# a subclass.
|
||||
print(f"JSON Decode Error: {e}", file=sys.stderr)
|
||||
print(f"Response Body: {response.text}", file=sys.stderr)
|
||||
raise e
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"An unexpected error occurred: {e}")
|
||||
raise e
|
||||
|
||||
previous_text = rev.text
|
||||
|
||||
# Collect the set of pages currently buffered in the table so we can run multi-page functions on them.
|
||||
row_buffer = table.pop()
|
||||
if self.compute_incremental_diffs:
|
||||
row_buffer['incremental diffs'] = incremental_diffs
|
||||
|
||||
is_revert_column: list[Union[bool, None]] = []
|
||||
|
Loading…
Reference in New Issue
Block a user