Pass arrays of diffs instead of incremental
This is 3.5x faster.
Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
parent
96915a074b
commit
62db384aa4
53
wikiq
53
wikiq
@ -37,6 +37,7 @@ import pyarrow.csv as pacsv
|
|||||||
|
|
||||||
DIFFS_URL = 'http://localhost:8000'
|
DIFFS_URL = 'http://localhost:8000'
|
||||||
|
|
||||||
|
|
||||||
class PersistMethod:
|
class PersistMethod:
|
||||||
none = 0
|
none = 0
|
||||||
sequence = 1
|
sequence = 1
|
||||||
@ -57,7 +58,7 @@ def fix_hex_digests(revs: list[mwxml.Revision]) -> list[mwxml.Revision]:
|
|||||||
if not rev.sha1 and not rev.deleted.text:
|
if not rev.sha1 and not rev.deleted.text:
|
||||||
rev.sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()
|
rev.sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()
|
||||||
revs[i] = rev
|
revs[i] = rev
|
||||||
i+=1
|
i += 1
|
||||||
return revs
|
return revs
|
||||||
|
|
||||||
|
|
||||||
@ -378,8 +379,7 @@ class WikiqParser:
|
|||||||
|
|
||||||
# Iterate through pages
|
# Iterate through pages
|
||||||
for page in dump:
|
for page in dump:
|
||||||
incremental_diffs = []
|
payload = []
|
||||||
previous_text = ""
|
|
||||||
|
|
||||||
# skip namespaces not in the filter
|
# skip namespaces not in the filter
|
||||||
if self.namespace_filter is not None:
|
if self.namespace_filter is not None:
|
||||||
@ -420,33 +420,34 @@ class WikiqParser:
|
|||||||
regex_matches[k].append(v)
|
regex_matches[k].append(v)
|
||||||
|
|
||||||
if self.compute_incremental_diffs:
|
if self.compute_incremental_diffs:
|
||||||
payload = {
|
payload.append(rev.text)
|
||||||
'arg1': previous_text,
|
|
||||||
'arg2': rev.text,
|
|
||||||
}
|
|
||||||
|
|
||||||
try:
|
|
||||||
response = requests.post(DIFFS_URL, json=payload)
|
|
||||||
response.raise_for_status()
|
|
||||||
incremental_diffs.append(response.text)
|
|
||||||
except requests.exceptions.ConnectionError as e:
|
|
||||||
print(
|
|
||||||
f"Connection Error: Could not connect to the server at {DIFFS_URL}. Make sure your local server is running.")
|
|
||||||
print(e)
|
|
||||||
raise e
|
|
||||||
except requests.exceptions.HTTPError as e:
|
|
||||||
print(f"HTTP Error: {e}")
|
|
||||||
print(f"Response Body: {response.text}")
|
|
||||||
raise e
|
|
||||||
except requests.exceptions.RequestException as e:
|
|
||||||
print(f"An unexpected error occurred: {e}")
|
|
||||||
raise e
|
|
||||||
|
|
||||||
previous_text = rev.text
|
|
||||||
|
|
||||||
# Collect the set of pages currently buffered in the table so we can run multi-page functions on them.
|
# Collect the set of pages currently buffered in the table so we can run multi-page functions on them.
|
||||||
row_buffer = table.pop()
|
row_buffer = table.pop()
|
||||||
if self.compute_incremental_diffs:
|
if self.compute_incremental_diffs:
|
||||||
|
try:
|
||||||
|
response = requests.post(DIFFS_URL, json=payload)
|
||||||
|
response.raise_for_status()
|
||||||
|
incremental_diffs = response.json()
|
||||||
|
except requests.exceptions.ConnectionError as e:
|
||||||
|
print(
|
||||||
|
f"Connection Error: Could not connect to the server at {DIFFS_URL}. Make sure your local server is running.")
|
||||||
|
print(e)
|
||||||
|
raise e
|
||||||
|
except requests.exceptions.HTTPError as e:
|
||||||
|
print(f"HTTP Error: {e}")
|
||||||
|
print(f"Response Body: {response.text}")
|
||||||
|
raise e
|
||||||
|
except requests.exceptions.JSONDecodeError as e:
|
||||||
|
# Must come before RequestException as JSONDecodeError is
|
||||||
|
# a subclass.
|
||||||
|
print(f"JSON Decode Error: {e}", file=sys.stderr)
|
||||||
|
print(f"Response Body: {response.text}", file=sys.stderr)
|
||||||
|
raise e
|
||||||
|
except requests.exceptions.RequestException as e:
|
||||||
|
print(f"An unexpected error occurred: {e}")
|
||||||
|
raise e
|
||||||
|
|
||||||
row_buffer['incremental diffs'] = incremental_diffs
|
row_buffer['incremental diffs'] = incremental_diffs
|
||||||
|
|
||||||
is_revert_column: list[Union[bool, None]] = []
|
is_revert_column: list[Union[bool, None]] = []
|
||||||
|
Loading…
Reference in New Issue
Block a user