Add call to compute diffs via local PHP server

This is inefficient, as it requires an individual request per diff.

Going to try collecting the revision texts to reduce communication
overhead.

Signed-off-by: Will Beason <willbeason@gmail.com>
Will Beason 2025-06-23 13:09:27 -05:00
parent 0d9ab003f0
commit 96915a074b
2 changed files with 47 additions and 2 deletions
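
Note: the PHP diff server itself is not part of this commit; only its contract is visible from the wikiq side of the diff below: POST a JSON body with 'arg1' (previous revision text) and 'arg2' (current revision text) to http://localhost:8000, and the response body is the diff. A minimal Python stand-in for that server, useful for exercising the new code path locally, could look like the sketch below; the choice of difflib is an assumption, since the real server's diff algorithm is not shown in this commit.

# Hypothetical stand-in for the local PHP diff server this commit talks to.
# Contract (inferred from the wikiq side of the diff): POST JSON
# {"arg1": old_text, "arg2": new_text}; the response body is the diff.
import difflib
import json
from http.server import BaseHTTPRequestHandler, HTTPServer

class DiffHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        length = int(self.headers["Content-Length"])
        payload = json.loads(self.rfile.read(length))
        # difflib stands in for whatever algorithm the real server uses.
        diff = "\n".join(difflib.unified_diff(
            payload["arg1"].splitlines(),
            payload["arg2"].splitlines(),
            lineterm="",
        ))
        body = diff.encode("utf8")
        self.send_response(200)
        self.send_header("Content-Type", "text/plain; charset=utf-8")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

if __name__ == "__main__":
    HTTPServer(("localhost", 8000), DiffHandler).serve_forever()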


@@ -107,6 +107,7 @@ class WikiqTestCase(unittest.TestCase):
         try:
             tester.call_wikiq()
+            # tester.call_wikiq("--compute-incremental-diffs")
         except subprocess.CalledProcessError as exc:
             self.fail(exc.stderr.decode("utf8"))

wikiq

@@ -17,6 +17,7 @@ from hashlib import sha1
 from typing import Any, IO, TextIO, Generator, Union
 
 import mwxml
+import requests
 from mwxml import Dump
 from deltas.tokenizers import wikitext_split
@@ -34,6 +35,7 @@ import pyarrow as pa
 import pyarrow.parquet as pq
 import pyarrow.csv as pacsv
 
+DIFFS_URL = 'http://localhost:8000'
 
 class PersistMethod:
     none = 0
@@ -214,7 +216,10 @@ class WikiqParser:
                  namespaces: Union[list[int], None] = None,
                  revert_radius: int = 15,
                  output_parquet: bool = True,
-                 parquet_buffer_size: int = 2000):
+                 parquet_buffer_size: int = 2000,
+                 compute_incremental_diffs: bool = False,
+                 ):
         """
         Parameters:
             persist : what persistence method to use. Takes a PersistMethod value
@@ -225,6 +230,7 @@ class WikiqParser:
         self.persist: int = persist
         self.namespaces = []
         self.revert_radius = revert_radius
+        self.compute_incremental_diffs: bool = compute_incremental_diffs
 
         if namespaces is not None:
             self.namespace_filter = set(namespaces)
@@ -360,6 +366,9 @@ class WikiqParser:
         schema = schema.append(pa.field('tokens_removed', pa.int64(), nullable=True))
         schema = schema.append(pa.field('tokens_window', pa.int64(), nullable=True))
 
+        if self.compute_incremental_diffs:
+            schema = schema.append(pa.field('incremental diffs', pa.string()))
+
         if self.output_parquet:
             writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
         else:
@@ -369,6 +378,8 @@ class WikiqParser:
 
         # Iterate through pages
         for page in dump:
+            incremental_diffs = []
+            previous_text = ""
 
             # skip namespaces not in the filter
             if self.namespace_filter is not None:
@@ -408,8 +419,35 @@ class WikiqParser:
                             regex_matches[k] = []
                         regex_matches[k].append(v)
 
+                if self.compute_incremental_diffs:
+                    payload = {
+                        'arg1': previous_text,
+                        'arg2': rev.text,
+                    }
+                    try:
+                        response = requests.post(DIFFS_URL, json=payload)
+                        response.raise_for_status()
+                        incremental_diffs.append(response.text)
+                    except requests.exceptions.ConnectionError as e:
+                        print(
+                            f"Connection Error: Could not connect to the server at {DIFFS_URL}. Make sure your local server is running.")
+                        print(e)
+                        raise e
+                    except requests.exceptions.HTTPError as e:
+                        print(f"HTTP Error: {e}")
+                        print(f"Response Body: {response.text}")
+                        raise e
+                    except requests.exceptions.RequestException as e:
+                        print(f"An unexpected error occurred: {e}")
+                        raise e
+
+                previous_text = rev.text
 
             # Collect the set of pages currently buffered in the table so we can run multi-page functions on them.
             row_buffer = table.pop()
+            if self.compute_incremental_diffs:
+                row_buffer['incremental diffs'] = incremental_diffs
 
             is_revert_column: list[Union[bool, None]] = []
             for r, d in zip(row_buffer['reverteds'], row_buffer['deleted']):
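
As the commit message notes, one POST per revision is the bottleneck, and the stated next step is to collect a page's revision texts and send them together. A sketch of what that batched call could look like, assuming a hypothetical /batch endpoint that accepts a list of text pairs and returns a JSON array of diffs (neither the endpoint nor its payload shape exists in this commit):

import requests

DIFFS_URL = 'http://localhost:8000'  # same constant wikiq defines above

def compute_page_diffs(revision_texts: list[str]) -> list[str]:
    # Pair each revision with its predecessor; the first revision diffs
    # against the empty string, matching previous_text's initial value.
    pairs = [{'arg1': prev, 'arg2': curr}
             for prev, curr in zip([''] + revision_texts[:-1], revision_texts)]
    # Hypothetical endpoint: one request per page instead of one per revision.
    response = requests.post(f"{DIFFS_URL}/batch", json={'pairs': pairs})
    response.raise_for_status()
    return response.json()  # assumed: one diff string per revision, in order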
@@ -575,6 +613,10 @@ def main():
                         action='store_true',
                         help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.")
 
+    parser.add_argument('--compute-incremental-diffs', dest="compute_incremental_diffs",
+                        action='store_true',
+                        help="Compute and store incremental diffs by edit session.")
+
     args = parser.parse_args()
 
     # set persistence method
@@ -626,7 +668,9 @@ def main():
                          regex_revision_label=args.regex_revision_label,
                          regex_match_comment=args.regex_match_comment,
                          regex_comment_label=args.regex_comment_label,
-                         output_parquet=output_parquet)
+                         output_parquet=output_parquet,
+                         compute_incremental_diffs=args.compute_incremental_diffs,
+                         )
 
     wikiq.process()
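
With the flag wired through, an end-to-end run would look something like the line below; the dump path and output directory are placeholders, wikiq's usual positional dump argument and -o output flag are assumed, and the local diff server must already be listening on port 8000:

python3 wikiq dump.xml -o output --compute-incremental-diffs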