From 96915a074b08d803f23b41bae9982055411f8307 Mon Sep 17 00:00:00 2001 From: Will Beason Date: Mon, 23 Jun 2025 13:09:27 -0500 Subject: [PATCH] Add call to compute diffs via local PHP server This is inefficient as it requires an individual request per diff. Going to try collecting the revision texts to reduce communication overhead. Signed-off-by: Will Beason --- test/Wikiq_Unit_Test.py | 1 + wikiq | 48 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py index f1086f4..a26c387 100644 --- a/test/Wikiq_Unit_Test.py +++ b/test/Wikiq_Unit_Test.py @@ -107,6 +107,7 @@ class WikiqTestCase(unittest.TestCase): try: tester.call_wikiq() + # tester.call_wikiq("--compute-incremental-diffs") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) diff --git a/wikiq b/wikiq index da902be..f2cd9cd 100755 --- a/wikiq +++ b/wikiq @@ -17,6 +17,7 @@ from hashlib import sha1 from typing import Any, IO, TextIO, Generator, Union import mwxml +import requests from mwxml import Dump from deltas.tokenizers import wikitext_split @@ -34,6 +35,7 @@ import pyarrow as pa import pyarrow.parquet as pq import pyarrow.csv as pacsv +DIFFS_URL = 'http://localhost:8000' class PersistMethod: none = 0 @@ -214,7 +216,10 @@ class WikiqParser: namespaces: Union[list[int], None] = None, revert_radius: int = 15, output_parquet: bool = True, - parquet_buffer_size: int = 2000): + parquet_buffer_size: int = 2000, + compute_incremental_diffs: bool = False, + ): + """ Parameters: persist : what persistence method to use. 
Takes a PersistMethod value @@ -225,6 +230,7 @@ class WikiqParser: self.persist: int = persist self.namespaces = [] self.revert_radius = revert_radius + self.compute_incremental_diffs: bool = compute_incremental_diffs if namespaces is not None: self.namespace_filter = set(namespaces) @@ -360,6 +366,9 @@ class WikiqParser: schema = schema.append(pa.field('tokens_removed', pa.int64(), nullable=True)) schema = schema.append(pa.field('tokens_window', pa.int64(), nullable=True)) + if self.compute_incremental_diffs: + schema = schema.append(pa.field('incremental diffs', pa.string())) + if self.output_parquet: writer = pq.ParquetWriter(self.output_file, schema, flavor='spark') else: @@ -369,6 +378,8 @@ class WikiqParser: # Iterate through pages for page in dump: + incremental_diffs = [] + previous_text = "" # skip namespaces not in the filter if self.namespace_filter is not None: @@ -408,8 +419,35 @@ class WikiqParser: regex_matches[k] = [] regex_matches[k].append(v) + if self.compute_incremental_diffs: + payload = { + 'arg1': previous_text, + 'arg2': rev.text, + } + + try: + response = requests.post(DIFFS_URL, json=payload) + response.raise_for_status() + incremental_diffs.append(response.text) + except requests.exceptions.ConnectionError as e: + print( + f"Connection Error: Could not connect to the server at {DIFFS_URL}. Make sure your local server is running.") + print(e) + raise e + except requests.exceptions.HTTPError as e: + print(f"HTTP Error: {e}") + print(f"Response Body: {response.text}") + raise e + except requests.exceptions.RequestException as e: + print(f"An unexpected error occurred: {e}") + raise e + + previous_text = rev.text + # Collect the set of pages currently buffered in the table so we can run multi-page functions on them. 
row_buffer = table.pop() + if self.compute_incremental_diffs: + row_buffer['incremental diffs'] = incremental_diffs is_revert_column: list[Union[bool, None]] = [] for r, d in zip(row_buffer['reverteds'], row_buffer['deleted']): @@ -575,6 +613,10 @@ def main(): action='store_true', help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.") + parser.add_argument('--compute-incremental-diffs', dest="compute_incremental_diffs", + action='store_true', + help="Compute and store incremental diffs by edit session.") + args = parser.parse_args() # set persistence method @@ -626,7 +668,9 @@ def main(): regex_revision_label=args.regex_revision_label, regex_match_comment=args.regex_match_comment, regex_comment_label=args.regex_comment_label, - output_parquet=output_parquet) + output_parquet=output_parquet, + compute_incremental_diffs=args.compute_incremental_diffs, + ) wikiq.process()