From 96915a074b08d803f23b41bae9982055411f8307 Mon Sep 17 00:00:00 2001 From: Will Beason Date: Mon, 23 Jun 2025 13:09:27 -0500 Subject: [PATCH] Add call to compute diffs via local PHP server This is inefficient as it requires an individual request per diff. Going to try collecting the revision texts to reduce communication overhead. Signed-off-by: Will Beason --- test/Wikiq_Unit_Test.py | 1 + wikiq | 48 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py index f1086f4..a26c387 100644 --- a/test/Wikiq_Unit_Test.py +++ b/test/Wikiq_Unit_Test.py @@ -107,6 +107,7 @@ class WikiqTestCase(unittest.TestCase): try: tester.call_wikiq() + # tester.call_wikiq("--compute-incremental-diffs") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) diff --git a/wikiq b/wikiq index da902be..f2cd9cd 100755 --- a/wikiq +++ b/wikiq @@ -17,6 +17,7 @@ from hashlib import sha1 from typing import Any, IO, TextIO, Generator, Union import mwxml +import requests from mwxml import Dump from deltas.tokenizers import wikitext_split @@ -34,6 +35,7 @@ import pyarrow as pa import pyarrow.parquet as pq import pyarrow.csv as pacsv +DIFFS_URL = 'http://localhost:8000' class PersistMethod: none = 0 @@ -214,7 +216,10 @@ class WikiqParser: namespaces: Union[list[int], None] = None, revert_radius: int = 15, output_parquet: bool = True, - parquet_buffer_size: int = 2000): + parquet_buffer_size: int = 2000, + compute_incremental_diffs: bool = False, + ): + """ Parameters: persist : what persistence method to use. 
Takes a PersistMethod value @@ -225,6 +230,7 @@ class WikiqParser: self.persist: int = persist self.namespaces = [] self.revert_radius = revert_radius + self.compute_incremental_diffs: bool = compute_incremental_diffs if namespaces is not None: self.namespace_filter = set(namespaces) @@ -360,6 +366,9 @@ class WikiqParser: schema = schema.append(pa.field('tokens_removed', pa.int64(), nullable=True)) schema = schema.append(pa.field('tokens_window', pa.int64(), nullable=True)) + if self.compute_incremental_diffs: + schema = schema.append(pa.field('incremental diffs', pa.string())) + if self.output_parquet: writer = pq.ParquetWriter(self.output_file, schema, flavor='spark') else: @@ -369,6 +378,8 @@ class WikiqParser: # Iterate through pages for page in dump: + incremental_diffs = [] + previous_text = "" # skip namespaces not in the filter if self.namespace_filter is not None: @@ -408,8 +419,35 @@ class WikiqParser: regex_matches[k] = [] regex_matches[k].append(v) + if self.compute_incremental_diffs: + payload = { + 'arg1': previous_text, + 'arg2': rev.text, + } + + try: + response = requests.post(DIFFS_URL, json=payload) + response.raise_for_status() + incremental_diffs.append(response.text) + except requests.exceptions.ConnectionError as e: + print( + f"Connection Error: Could not connect to the server at {DIFFS_URL}. Make sure your local server is running.") + print(e) + raise e + except requests.exceptions.HTTPError as e: + print(f"HTTP Error: {e}") + print(f"Response Body: {response.text}") + raise e + except requests.exceptions.RequestException as e: + print(f"An unexpected error occurred: {e}") + raise e + + previous_text = rev.text + # Collect the set of pages currently buffered in the table so we can run multi-page functions on them. 
row_buffer = table.pop() + if self.compute_incremental_diffs: + row_buffer['incremental diffs'] = incremental_diffs is_revert_column: list[Union[bool, None]] = [] for r, d in zip(row_buffer['reverteds'], row_buffer['deleted']): @@ -575,6 +613,10 @@ def main(): action='store_true', help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.") + parser.add_argument('--compute-incremental-diffs', dest="compute_incremental_diffs", + action='store_true', + help="Compute and store incremental diffs by edit session.") + args = parser.parse_args() # set persistence method @@ -626,7 +668,9 @@ def main(): regex_revision_label=args.regex_revision_label, regex_match_comment=args.regex_match_comment, regex_comment_label=args.regex_comment_label, - output_parquet=output_parquet) + output_parquet=output_parquet, + compute_incremental_diffs=args.compute_incremental_diffs, + ) wikiq.process()