Merge branch 'tmp' into compute-diffs

2025-06-30 20:52:23 -05:00
parent 37734ed092 5a3e4102b5
commit 20de5b93f9
6 changed files with 211 additions and 148 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "mediawiki-php-wikidiff2"]
+	path = mediawiki-php-wikidiff2
+	url = https://github.com/wikimedia/mediawiki-php-wikidiff2/
--- a/README.rst
+++ b/README.rst
@@ -20,6 +20,13 @@ associated tests to work.
 - 7zip
 - ffmpeg

+A new diff engine based on `wikidiff2`_ can be used for word-persistence. Wikiq can also output the diffs between each page revision. This requires installing wikidiff2 on your system. On Debian or Ubuntu Linux this can be done via:
+
+``apt-get install php-wikidiff2``
+
+You may also have to enable the extension by running:
+``sudo phpenmod wikidiff2``.
+
 Tests
 ----
 To run tests::
@@ -30,3 +37,5 @@ TODO:
 _______________
 1. [] Output metadata about the run. What parameters were used? What versions of deltas?
 2. [] Url encoding by default
+
+.. _wikidiff2: https://www.mediawiki.org/wiki/Wikidiff2
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,7 +3,7 @@ name = "mediawiki-dump-tools"
 version = "0.1.0"
 description = "Add your description here"
 readme = "README.md"
-requires-python = "~=3.9"
+requires-python = ">=3.9"
 dependencies = [
    "deltas>=0.7.0",
    "mediawiki-utilities>=0.4.18",
@@ -18,8 +18,11 @@ dependencies = [
 [tool.uv.sources]
 yamlconf = { git = "https://github.com/groceryheist/yamlconf" }
 mwxml = { git = "https://github.com/groceryheist/python-mwxml" }
+deltas = { git = "https://github.com/groceryheist/deltas" }

 [dependency-groups]
 dev = [
-    "pandas>=2.1.0"
+    "pandas>=2.1.0",
+    "pytest>=8.4.1",
+    "pytest-asyncio>=1.0.0",
 ]
--- a/wiki_diff_matcher.py
+++ b/wiki_diff_matcher.py
@@ -1,10 +1,14 @@
 import json
 import sys
+from itertools import chain
+from typing import Generator, List, Optional, Tuple

 import requests
-from deltas import tokenizers, RegexTokenizer, DiffEngine, Equal, Insert, Delete
+from deltas import (Delete, DiffEngine, Equal, Insert, Operation,
+                    RegexTokenizer, Token, tokenizers)
+
+TOKENIZER = tokenizers.wikitext_split

-TOKENIZER = tokenizers.text_split

 def compute_diffs(url: str, texts: list[str]) -> list:
    response = None
@@ -14,7 +18,8 @@ def compute_diffs(url: str, texts: list[str]) -> list:
        incremental_diffs = response.json()
    except requests.exceptions.ConnectionError as e:
        print(
-            f"Connection Error: Could not connect to the server at {url}. Make sure your local server is running.")
+            f"Connection Error: Could not connect to the server at {url}. Make sure your local server is running."
+        )
        print(e)
        raise e
    except requests.exceptions.HTTPError as e:
@@ -33,161 +38,206 @@ def compute_diffs(url: str, texts: list[str]) -> list:
        print(f"An unexpected error occurred: {e}")
        raise e

+    # for diff in incremental_diffs:
+    #     for wikidiffop in json.loads(diff)["diff"][0:5]:
+    #         print(wikidiffop)
+
    return incremental_diffs


-def to_operations(previous_text, next_text, diff, tokenizer: RegexTokenizer) -> list:
-    d = json.loads(diff)
+class DiffToOperationMap:

-    # Keep track of the last difference we saw in order to notice unaccounted-for
-    # tokens. Each token at the end of "to" which is skipped for the next diff
-    # must be represented as an "Equal()" segment.
-    from_last_end = 0
-    to_last_end = 0
+    def __init__(self, from_text, to_text, diff, tokenizer):

-    result = []
-    # DiffState expects differences to be represented in order from the
+        self.diff = diff
+        self.tokenizer = tokenizer
+        self.diff = json.loads(diff)
+
+        # the code below is designed to work in bytes because that's how wikidiff2 indexes
+        self.from_bytes = from_text.encode("utf-8")
+        self.to_bytes = to_text.encode("utf-8")
+
+
+        self.from_last_end_bytes = 0
+        self.from_last_to_bytes = 0
+        self.n_from_start_tokens = 0
+        self.n_to_start_tokens = 0
+        self.last_to_start_line = 0
+        self.last_from_start_line = 0
+        self.from_last_end_bytes = 0
+        self.to_last_end_bytes = 0
+        
+    def tokenize(self, bytes):
+        return self.tokenizer.tokenize(bytes.decode("utf-8"))
+
+    def to_operations(self):
+        parmove_from_dict = {}  # lookup move diffs based on moveinfo id.
+        parmove_to_dict = {}
+        for entry in self.diff["diff"]:
+            offset = entry['offset']
+            linebytes = entry["text"].encode("utf-8")
+
+            # ignore empty diffs. They don't have any tokens
+            if len(linebytes) == 0:
+                continue
+            # this is the first byte of the line in the 'from' revision.
+            from_start_line = entry["offset"]["from"]
+            # this is the first byte of the line in the 'to' revision.
+            to_start_line = entry["offset"]["to"]
+
+            if entry["type"] == 0:
+                yield from self.doEqual(linebytes, offset)
+            
+                # a line included in the 'to' revision, but not in the 'from' revision
+            elif entry["type"] == 1:
+                yield from self.doInsert(linebytes, offset)
+
+                # a line included in the 'from' revision, but not in the 'to' revision
+            elif entry["type"] == 2:
+                yield from self.doDelete(linebytes, offset)
+    
+            elif entry["type"] == 3:
+                yield from self.doHighlightRange(linebytes, entry['highlightRanges'], offset)
+            
+            elif entry["type"] == 4:
+                parmove_from_dict["moveInfo"]["id"] = diff
+
+            elif entry["type"] == 5:
+                # for type 4 diffs (paragraph moved in the from revision) we need to find a matching type 5 diff.
+                parmove_to_dict["moveInfo"]["id"] = diff
+            # for deletions and equality report the token indexes from the 'from' revision.
+
+            else:
+                # The 'type' isn't one of the known
+                raise ValueError(d)
+
+        
+        # mwpersistence expects differences to be represented in order from the
        # result's perspective ("to"), not the previous text. Thus, if a line
        # is moved earlier then its insertion should appear before its deletion.
        # As a rule of thumb, the "to" segments should be non-overlapping and
        # strictly increasing, while the "from" segments should merely be
        # non-overlapping.
-    #
-    # wikidiff2 appears to follow this same convention, but this behavior
-    # is not documented.

-    for entry in d['diff']:
-        from_start_line = entry['offset']['from']
-        to_start_line = entry['offset']['to']
-        # Per above, to_start_line appears to be nondecreasing, but
-        # from_start_line may sometimes decrease for detected paragraph moves.
-
-        from_start_tokens = len(tokenizer.tokenize(previous_text[:from_start_line]))
-        to_start_tokens = len(tokenizer.tokenize(next_text[:to_start_line]))
-        # These constant calls to tokenizer.tokenize can definitely be optimized
-        # as tokenization is currently a bottleneck. Ideally tokenization would
-        # happen incrementally where possible, or somehow be cached, but this
-        # would be more complex.
-
-        if entry['type'] == 0:
-            # wikidiff2 doesn't appear to emit diffs of this type, but cover anyway.
-            line_tokens = len(tokenizer.tokenize(entry['text']))
-            from_end_tokens = from_start_tokens + line_tokens
-            to_end_tokens = to_start_tokens + line_tokens
-
-            result.append(Equal(from_start_tokens, from_end_tokens,
-                                to_start_tokens, to_end_tokens))
-
-            from_last_end = from_end_tokens
-            to_last_end  = to_end_tokens
-
-            continue
-        else:
-            # These do not appear to be generated by wikidiff2, and so must be
-            # inferred.
-            equal_tokens = to_start_tokens - to_last_end
-            # If we notice that the next non-zero segment (which must be a
-            # change, given that its type is non-zero), begins after the end
-            # of the previous segment, we must add an Equal segment.
-            # TODO: While the "to" token ranges are correct, the "from"
-            #  ranges are likely not, particularly in histories with paragraph
-            #  moves.
-            if equal_tokens > 0:
-                result.append(Equal(from_last_end, from_start_line,
-                                    to_last_end, to_start_line))
+        # now we go through the parmoves
+        for id, from_diff in parmove_from_dict.items():
+            to_diff = parmove_from_dict[from_diff["moveInfo"]["linkId"]]
            

-        if entry['type'] == 1 or entry['type'] == 4:
-            # TODO: Separate out type 4 to recognize this is the insertion
-            #  part of a paragraph move. Note that for paragraph moves
-            #  the text is not necessarily identical, just similar.
-            line_tokens = len(tokenizer.tokenize(entry['text']))
-            to_end_tokens = to_start_tokens + line_tokens
+    def doEqual(self, equal_bytes, offset):
+        tokens = self.tokenize(equal_bytes)
+        n_tokens = len(tokens)
+        self.n_from_end_tokens = self.n_from_start_tokens + n_tokens
+        self.n_to_end_tokens = self.n_to_start_tokens + n_tokens
+        yield (
+            Equal(
+                self.n_from_start_tokens,
+                self.n_from_end_tokens,
+                self.n_to_start_tokens,
+                self.n_to_end_tokens,
+            ),
+            tokens,
+            tokens,
+        )
+        # we need to keep track of the to and from last end bytes
+        self.from_last_end_bytes = offset["from"] + len(equal_bytes)
+        self.to_last_end_bytes = offset["to"] + len(equal_bytes)
+        self.n_from_start_tokens += n_tokens
+        self.n_to_start_tokens += n_tokens

-            result.append(Insert(from_start_tokens, from_start_tokens,
-                                 to_start_tokens, to_end_tokens,
-                                 ))

+    def doInsert(self, insert_bytes, offset):
+        tokens = self.tokenize(insert_bytes)
+        n_tokens = len(tokens)
+        self.n_to_end_tokens = self.n_to_start_tokens + n_tokens
+        yield (
+            Insert(
+                self.n_from_start_tokens,
+                self.n_from_start_tokens,
+                self.n_to_start_tokens,
+                self.n_to_end_tokens,
+            ),
+            [],
+            tokens,
+        )
        # We have now used more of the "to" tokens.
-            to_last_end = to_end_tokens
-        elif entry['type'] == 2 or entry['type'] == 5:
-            # TODO: Separate out type 5 to recognize this is the deletion
-            #  part of a paragraph move. Note that for paragraph moves
-            #  the text is not necessarily identical, just similar.
-            line_tokens = len(tokenizer.tokenize(entry['text']))
-            from_end_tokens = from_start_tokens + line_tokens
+        self.n_to_start_tokens += n_tokens
+        self.to_last_end_bytes = offset["to"] + len(insert_bytes)

-            result.append(Delete(from_start_tokens, from_end_tokens,
-                                 to_start_tokens, to_start_tokens,
-                                 ))
+    def doDelete(self, delete_bytes, offset):
+        tokens = self.tokenize(delete_bytes)
+        n_tokens = len(tokens)
+        self.n_from_end_tokens = self.n_from_start_tokens + n_tokens
+        yield (
+            Delete(
+                self.n_from_start_tokens,
+                self.n_from_end_tokens,
+                self.n_to_start_tokens,
+                self.n_to_start_tokens,
+            ),
+            tokens,
+            [],
+        )
+        # We have now used more of the "from" tokens.
+        self.n_from_start_tokens += n_tokens
+        self.from_last_end_bytes = offset["from"] + len(delete_bytes)

-            # We have not used more of the "from" tokens.
-            from_last_end = from_end_tokens
-        elif entry['type'] == 3:
-            # The text field is an overlapping mix of both the previous and next
-            # lines, and so we can't directly tokenize it.
-
-            text = entry['text']
-
-            last_end = 0
-            previous_line = ""
-            next_line = ""
-
-            # A line will have one or more highlightRanges.
-            # It is not guaranteed that insertions/deletions are matched,
+    def doHighlightRange(self, highlight_bytes, highlightRanges, offset):
+        # The text field is an overlapping mix of both the from and to,
+        # so we need to handle it highlight-by-highlight.
+        # there can be gaps between highlight segments.
        # for instance, if a word is deleted from the middle of a line.
-            for highlightRange in entry['highlightRanges']:
-                if highlightRange['start'] > last_end:
-                    previous_line += text[last_end:highlightRange['start']]
-                    next_line += text[last_end:highlightRange['start']]
-                    # Add an Equal segment.
+        # we need to track that.
+        highlight_end = 0
+        highlight_offset = offset
+        # note that diffs are token-level, but the indexes are byte-level

-                rangeStart = highlightRange['start']
-                rangeEnd = rangeStart + highlightRange['length']
+        for highlightRange in highlightRanges:
+            highlight_start = highlightRange["start"]
+            # equal bytes in between highlights
+            if highlight_start > highlight_end:

-                if highlightRange['type'] == 0:
-                    # Insertion
-                    next_line += text[rangeStart:rangeEnd]
+                equal_bytes = highlight_bytes[
+                    highlight_end : highlight_start
+                ]
+                n_equal_bytes = len(equal_bytes)
+                yield from self.doEqual(equal_bytes, highlight_offset)
+                highlight_offset['from'] += n_equal_bytes
+                highlight_offset['to'] += n_equal_bytes

-                    # Add an Insert segment.
-                elif highlightRange['type'] == 1:
-                    # Deletion
-                    previous_line += text[rangeStart:rangeEnd]
-
-                    # Add a Delete segment.
+            # handle highlighted insert / delete
+            highlight_end = highlight_start + highlightRange["length"]
+            range_bytes = highlight_bytes[highlight_start:highlight_end]
+            n_range_bytes = len(range_bytes)
+            if highlightRange["type"] == 0:
+                yield from self.doInsert(range_bytes, highlight_offset)
+                highlight_offset['to'] += n_range_bytes
+            elif highlightRange["type"] == 1:
+                yield from self.doDelete(range_bytes, highlight_offset)
+                highlight_offset['from'] += n_range_bytes
            else:
                raise Exception(entry)

-            from_tokens = len(tokenizer.tokenize(previous_line))
-            to_tokens = len(tokenizer.tokenize(next_line))
+        # handle the rest of the line which is equal
+        if highlight_end < len(highlight_bytes):
+            range_bytes = highlight_bytes[highlight_end:]
+            yield from self.doEqual(range_bytes, highlight_offset)

-            from_start_tokens += from_tokens
-            to_start_tokens += to_tokens
-        else:
-            # The 'type' isn't one of the known
-            raise ValueError(d)
-
-    # TODO: Handle trailing tokens
-
-    # raise Exception(result)
-    return result

 class WikiDiffMatcher:
-    def __init__(self,
-                 url: str,
-                 texts: list[str],
-                 tokenizer: RegexTokenizer = None,
+    def __init__(
+        self,
+        texts: list[str] = None,
+        tokenizer: Optional[RegexTokenizer] = None,
+        url: Optional[str] = "http://127.0.0.1:8000",
    ):
        # Pre-compute diffs to reduce traffic overhead.
        self.diffs = compute_diffs(url, texts)
        self.tokenizer = tokenizer or TOKENIZER

    class Processor(DiffEngine.Processor):
-        def __init__(self,
-                     diffs,
-                     tokenizer=None
-                     ):
-            self.diffs = iter(diffs)
+        def __init__(self, texts, tokenizer=None):
+            self.diffs = iter(texts)
            self.tokenizer = tokenizer or TOKENIZER
            self.last_tokens = []
            self.previous_text = ""
@@ -196,28 +246,27 @@ class WikiDiffMatcher:
            self.last_tokens = last_tokens

        def process(self, text, token_class=None):
-            # IDEs will report the method signature as incorrect, but this is
-            # expected. The DiffEngine.Processor class must be inherited from,
-            # and its process definition incorrectly excludes a "self" argument.
-
            # The diff has already been computed, but we need to incrementally
            # retrieve it to recreate the behavior DiffState expects.
            diff = next(self.diffs)
+            diffToOperationsMapper = DiffToOperationMap(self.previous_text, text, diff, self.tokenizer)
+            (
+                operations,
+                aseq,
+                bseq,
+            ) = list(
+                zip(*diffToOperationsMapper.to_operations())
+            )

-            tokens = self.tokenizer.tokenize(text, token_class=token_class)
-            operations = to_operations(self.previous_text, text, diff, self.tokenizer)
-
-            a = self.last_tokens
-            b = tokens
-            self.last_tokens = tokens
+            self.last_tokens = list(chain.from_iterable(aseq))
+            tokens = list(chain.from_iterable(bseq))
            self.previous_text = text

-            return operations, a, b
+            return operations, self.last_tokens, tokens

    def processor(self, *args, **kwargs):
        return self.Processor(self.diffs, self.tokenizer)

-
    def process(self):
        # DiffState checks for this method even though it is not called.
        raise Exception("Unnecessary implementation")
--- a/wikidiff2_api.php
+++ b/wikidiff2_api.php
@@ -17,7 +17,7 @@ $data = json_decode($rawData, true);
 $previous = '';
 $result = [];
 foreach ($data as $i => $value) {
-    $result[] = wikidiff2_inline_json_diff($previous, $value, 0);
+    $result[] = wikidiff2_inline_json_diff($previous, $value, 5000000);
    $previous = $value;
 }

--- a/9
+++ b/9
@@ -140,7 +140,6 @@ The pattern can include capture groups.  If it does then each capture group will
 If the pattern does not include a capture group, then only one output column will result.
 """

-
 class RegexPair(object):
    def __init__(self, pattern, label):
        self.pattern = re.compile(pattern)
@@ -219,7 +218,7 @@ class WikiqParser:
                 revert_radius: int = 15,
                 output_parquet: bool = True,
                 parquet_buffer_size: int = 2000,
-                 wikidiff_url: str = "",
+                 wikidiff_url: str = "http://127.0.0.1:8000",
                 ):

        """ 
@@ -450,9 +449,9 @@ class WikiqParser:
                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)
                elif self.persist == PersistMethod.wikidiff:
-                    state = mwpersistence.DiffState(WikiDiffMatcher(self.wikidiff_url,
-                                                                    revision_texts,
-                                                                    tokenizer=wikitext_split),
+                    state = mwpersistence.DiffState(WikiDiffMatcher(revision_texts,
+                                                                    tokenizer=wikitext_split,
+                                                                    self.wikidiff_url),
                                                    revert_radius=PERSISTENCE_RADIUS)
                else:
                    from mw.lib import persistence