got wikidiff2 persistence working except for paragraph moves.

Nathan TeBlunthuis 2025-06-30 15:37:54 -07:00
parent 186cb82fb8
commit 5a3e4102b5
6 changed files with 207 additions and 177 deletions

.gitmodules
View File

@@ -0,0 +1,3 @@
[submodule "mediawiki-php-wikidiff2"]
path = mediawiki-php-wikidiff2
url = https://github.com/wikimedia/mediawiki-php-wikidiff2/

View File

@@ -20,6 +20,13 @@ associated tests to work.
- 7zip
- ffmpeg
A new diff engine based on `wikidiff2`_ can be used for word persistence. Wikiq can also output the diffs between each page revision. This requires installing wikidiff2 on your system. On Debian or Ubuntu Linux this can be done via ``apt-get install php-wikidiff2``. You may also have to run ``sudo phpenmod wikidiff2``.
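To check that the extension is actually enabled, one option (a minimal sketch;
it assumes ``php`` is on your ``PATH``) is to ask PHP whether the function
wikiq calls exists::

    import subprocess

    # Prints bool(true) once the wikidiff2 extension is loaded.
    print(subprocess.run(
        ["php", "-r", 'var_dump(function_exists("wikidiff2_inline_json_diff"));'],
        capture_output=True, text=True,
    ).stdout)
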
Tests
-----
To run tests::
@@ -30,3 +37,5 @@ TODO:
_______________
1. [] Output metadata about the run. What parameters were used? What versions of deltas?
2. [] Url encoding by default
.. _wikidiff2: https://www.mediawiki.org/wiki/Wikidiff2

View File

@@ -3,7 +3,7 @@ name = "mediawiki-dump-tools"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = "~=3.9"
requires-python = ">=3.9"
dependencies = [
"deltas>=0.7.0",
"mediawiki-utilities>=0.4.18",
@@ -18,8 +18,11 @@ dependencies = [
[tool.uv.sources]
yamlconf = { git = "https://github.com/groceryheist/yamlconf" }
mwxml = { git = "https://github.com/groceryheist/python-mwxml" }
deltas = { git = "https://github.com/groceryheist/deltas" }
[dependency-groups]
dev = [
"pandas>=2.1.0"
"pandas>=2.1.0",
"pytest>=8.4.1",
"pytest-asyncio>=1.0.0",
]
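# Usage sketch (an assumption, based on the [tool.uv.sources] section above
# implying a uv workflow): `uv sync` installs this dev dependency group by
# default, and `uv run pytest` then runs the suite with the new test
# dependencies.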

View File

@@ -1,8 +1,11 @@
import json
import sys
from itertools import chain
from typing import Generator, List, Optional, Tuple
import requests
from deltas import tokenizers, RegexTokenizer, DiffEngine, Equal, Insert, Delete
from deltas import (Delete, DiffEngine, Equal, Insert, Operation,
RegexTokenizer, Token, tokenizers)
TOKENIZER = tokenizers.wikitext_split
@@ -15,7 +18,8 @@ def compute_diffs(url: str, texts: list[str]) -> list:
incremental_diffs = response.json()
except requests.exceptions.ConnectionError as e:
print(
f"Connection Error: Could not connect to the server at {url}. Make sure your local server is running.")
f"Connection Error: Could not connect to the server at {url}. Make sure your local server is running."
)
print(e)
raise e
except requests.exceptions.HTTPError as e:
@@ -34,193 +38,206 @@ def compute_diffs(url: str, texts: list[str]) -> list:
print(f"An unexpected error occurred: {e}")
raise e
# for diff in incremental_diffs:
# for wikidiffop in json.loads(diff)["diff"][0:5]:
# print(wikidiffop)
return incremental_diffs
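# Usage sketch (assumes the PHP helper elsewhere in this commit is serving
# at the given URL):
#   diffs = compute_diffs("http://127.0.0.1:8000", ["first text", "second text"])
#   print(json.loads(diffs[1])["diff"])  # operations turning text 1 into text 2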
def to_operations(from_text:str, to_text:str, diff:str, tokenizer: RegexTokenizer) -> list:
d = json.loads(diff)
class DiffToOperationMap:
# the code below is designed to work in bytes because that's how wikidiff2 indexes
from_text = from_text.encode('utf-8')
to_text = to_text.encode('utf-8')
def __init__(self, from_text, to_text, diff, tokenizer):
# convenient function for tokenizing bytes
def tokenize(bytes):
return tokenizer.tokenize(bytes.decode('utf-8'))
self.tokenizer = tokenizer
self.diff = json.loads(diff)
# Keep track of the last difference we saw in order to notice unaccounted-for
# tokens. Each token at the end of "to" which is skipped for the next diff
# must be represented as an "Equal()" segment.
from_last_end_bytes = 0
to_last_end_bytes = 0
# the code below is designed to work in bytes because that's how wikidiff2 indexes
self.from_bytes = from_text.encode("utf-8")
self.to_bytes = to_text.encode("utf-8")
result = []
# DiffState expects differences to be represented in order from the
# result's perspective ("to"), not the previous text. Thus, if a line
# is moved earlier then its insertion should appear before its deletion.
# As a rule of thumb, the "to" segments should be non-overlapping and
# strictly increasing, while the "from" segments should merely be
# non-overlapping.
# wikidiff2 appears to follow this same convention, but this behavior
# is not documented.
# Note that, confusingly, for Insert operations only the "to" indexes matter
# and for the Delete and Equal operations only the "from" indexes matter.
# This is clear from reading state.py in `mwpersistence` and operations.py in `deltas`
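# Illustrative example: moving a line earlier, e.g. "x\ny\n" -> "y\nx\n"
# (four wikitext_split tokens per revision), should therefore be emitted
# in "to" order, roughly:
#   Insert(0, 0, 0, 2)  # "y\n" now appears first in "to"
#   Equal(0, 2, 2, 4)   # "x\n" unchanged; "from" tokens 0-2 -> "to" tokens 2-4
#   Delete(2, 4, 4, 4)  # the original "y\n" is consumed from "from"
# The "to" (b) ranges increase monotonically; the "from" (a) ranges are
# merely non-overlapping.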
self.n_from_start_tokens = 0
self.n_to_start_tokens = 0
self.last_to_start_line = 0
self.last_from_start_line = 0
self.from_last_end_bytes = 0
self.to_last_end_bytes = 0
def tokenize(self, bytes):
return self.tokenizer.tokenize(bytes.decode("utf-8"))
parmove_from_dict = {} # lookup move diffs based on moveinfo id.
parmove_to_dict = {}
for entry in d['diff']:
linebytes = entry['text'].encode('utf-8')
from_start_line = entry['offset']['from'] # this is the first byte of the line in the 'from' revision.
to_start_line = entry['offset']['to'] # this is the first byte of the line in the 'to' revision.
def to_operations(self):
parmove_from_dict = {} # lookup move diffs based on moveinfo id.
parmove_to_dict = {}
for entry in self.diff["diff"]:
offset = entry['offset']
linebytes = entry["text"].encode("utf-8")
from_start_tokens = len(tokenize(from_text[:from_start_line]))
to_start_tokens = len(tokenize(to_text[:to_start_line]))
# These constant calls to tokenizer.tokenize can definitely be optimized
# as tokenization is currently a bottleneck. Ideally tokenization would
# happen incrementally where possible, or somehow be cached, but this
# would be more complex. N: I think it's okay. CPU is cheap.
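# (One possible mitigation, not implemented here: memoize prefix token
# counts with a functools.lru_cache-wrapped helper such as
# count_tokens(prefix: bytes) -> int, since the same prefixes recur.)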
# ignore empty diffs. They don't have any tokens
if len(linebytes) == 0:
continue
# this is the first byte of the line in the 'from' revision.
from_start_line = entry["offset"]["from"]
# this is the first byte of the line in the 'to' revision.
to_start_line = entry["offset"]["to"]
if entry['type'] == 0: # wikidiff2 can emit this if it is called with numContextLines != 0.
line_tokens = len(tokenize(linebytes))
from_end_tokens = from_start_tokens + line_tokens
to_end_tokens = to_start_tokens + line_tokens
result.append(Equal(from_start_tokens, from_end_tokens,
to_start_tokens, to_end_tokens))
# we need to keep track of the to and from last end bytes
from_last_end_bytes += len(linebytes)
to_last_end_bytes += len(linebytes)
continue
else:
# These do not appear to be generated by wikidiff2, and so must be
# inferred.
equal_tokens = to_start_tokens - to_last_end_bytes
# If we notice that the next non-zero segment (which must be a
# change, given that its type is non-zero), begins after the end
# of the previous segment, we must add an Equal segment.
# TODO: While the "to" token ranges are correct,
# the "from"
# ranges are likely not, particularly in histories with paragraph
# moves. they can be corrected.
if equal_tokens > 0:
# only the 'from' indexes matter
result.append(Equal(from_last_end_bytes, from_start_line,
to_last_end_bytes, to_start_line))
if entry['type'] == 1: # a line included in the 'to' revision, but not in the 'from' revision
line_tokens = len(tokenize(linebytes))
to_end_tokens = to_start_tokens + line_tokens
result.append(Insert(from_start_tokens, from_start_tokens,
to_start_tokens, to_end_tokens,
))
# We have now used more of the "to" tokens.
to_start_end = to_end_tokens
if entry["type"] == 0:
yield from self.doEqual(linebytes, offset)
elif entry['type'] == 2: # a line included in the 'from' revision, but not in the 'to' revision
line_tokens = len(tokenize(linebytes))
from_end_tokens = from_start_tokens + line_tokens
# a line included in the 'to' revision, but not in the 'from' revision
elif entry["type"] == 1:
yield from self.doInsert(linebytes, offset)
result.append(Delete(from_start_tokens, from_end_tokens,
to_start_tokens, to_start_tokens,
))
# We have now used more of the "from" tokens.
from_last_end_bytes = from_end_tokens
# a line included in the 'from' revision, but not in the 'to' revision
elif entry["type"] == 2:
yield from self.doDelete(linebytes, offset)
elif entry["type"] == 3:
yield from self.doHighlightRange(linebytes, entry['highlightRanges'], offset)
elif entry['type'] == 3:
# The text field is an overlapping mix of both the from and to,
# so we need to handle it highlight-by-highlight.
# there can be gaps between highlight segments.
# for instance, if a word is deleted from the middle of a line.
# we need to track that.
highlight_last_end = 0
elif entry["type"] == 4:
parmove_from_dict["moveInfo"]["id"] = diff
# note that diffs are token-level, but the indexes are byte-level
for highlightRange in entry['highlightRanges']:
if highlightRange['start'] > highlight_last_end:
elif entry["type"] == 5:
# for type 4 diffs (paragraph moved in the from revision) we need to find a matching type 5 diff.
parmove_to_dict["moveInfo"]["id"] = diff
# for deletions and equality report the token indexes from the 'from' revision.
equal_bytes = linebytes[highlight_last_end:highlightRange['start']]
equal_tokens = len(tokenize(equal_bytes))
from_end_tokens = from_start_tokens + equal_tokens
to_end_tokens = to_end_tokens + equal_tokens
result.append(Equal(from_start_tokens, from_end_tokens,
to_start_tokens, to_end_tokens
))
else:
# The 'type' isn't one of the known wikidiff2 entry types.
raise ValueError(d)
from_start_tokens = from_end_tokens
to_start_tokens = to_end_tokens
rangeStart = highlightRange['start']
rangeEnd = rangeStart + highlightRange['length']
range_bytes = linebytes[rangeStart:rangeEnd]
range_tokens = len(tokenize(range_bytes))
if highlightRange['type'] == 0:
# Insertion
to_end_tokens = to_start_tokens + range_tokens
result.append(Insert(from_start_tokens, from_end_tokens,
to_start_tokens, to_end_tokens))
# mwpersistence expects differences to be represented in order from the
# result's perspective ("to"), not the previous text. Thus, if a line
# is moved earlier then its insertion should appear before its deletion.
# As a rule of thumb, the "to" segments should be non-overlapping and
# strictly increasing, while the "from" segments should merely be
# non-overlapping.
to_start_tokens = to_end_tokens
elif highlightRange['type'] == 1:
# Deletion
from_end_tokens = from_start_tokens + range_tokens
result.append(Delete(from_start_tokens, from_end_tokens,
to_start_tokens, to_end_tokens))
from_start_tokens = from_end_tokens
else:
raise Exception(entry)
highlight_last_end = highlightRange['start'] + highlightRange['length']
# now we go through the parmoves
for id, from_diff in parmove_from_dict.items():
to_diff = parmove_to_dict[from_diff["moveInfo"]["linkId"]]  # match the type-4 entry to its type-5 counterpart
elif entry['type'] == 4:
def doEqual(self, equal_bytes, offset):
tokens = self.tokenize(equal_bytes)
n_tokens = len(tokens)
self.n_from_end_tokens = self.n_from_start_tokens + n_tokens
self.n_to_end_tokens = self.n_to_start_tokens + n_tokens
yield (
Equal(
self.n_from_start_tokens,
self.n_from_end_tokens,
self.n_to_start_tokens,
self.n_to_end_tokens,
),
tokens,
tokens,
)
# we need to keep track of the to and from last end bytes
self.from_last_end_bytes = offset["from"] + len(equal_bytes)
self.to_last_end_bytes = offset["to"] + len(equal_bytes)
self.n_from_start_tokens += n_tokens
self.n_to_start_tokens += n_tokens
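# Each do* handler yields an (operation, from_tokens, to_tokens) triple so
# the caller can both drive mwpersistence.DiffState and reassemble the two
# token streams. Illustration (hypothetical values, counters starting at 0):
#   op, a, b = next(self.doEqual(b"same text", {"from": 0, "to": 0}))
#   # op == Equal(0, 3, 0, 3); a and b are both the tokens of "same text"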
parmove_from_dict['moveInfo']['id'] = diff
elif entry['type'] == 5:
def doInsert(self, insert_bytes, offset):
tokens = self.tokenize(insert_bytes)
n_tokens = len(tokens)
self.n_to_end_tokens = self.n_to_start_tokens + n_tokens
yield (
Insert(
self.n_from_start_tokens,
self.n_from_start_tokens,
self.n_to_start_tokens,
self.n_to_end_tokens,
),
[],
tokens,
)
# We have now used more of the "to" tokens.
self.n_to_start_tokens += n_tokens
self.to_last_end_bytes = offset["to"] + len(insert_bytes)
parmove_to_dict['moveInfo']['id'] = diff
# for type 4 diffs (paragraph moved in the from revision) we need to find a matching type 5 diff.
# for deletions and equality report the token indexes from the 'from' revision.
else:
# The 'type' isn't one of the known wikidiff2 entry types.
raise ValueError(entry)
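# For reference, a parsed wikidiff2 entry has roughly this shape (only
# fields used in this class are shown; the values are illustrative):
#   {"type": 3,
#    "text": "a line mixing old and new words",
#    "offset": {"from": 120, "to": 131},
#    "highlightRanges": [{"start": 10, "length": 4, "type": 0}],
#    "moveInfo": {"id": "...", "linkId": "..."}}  # types 4 and 5 only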
def doDelete(self, delete_bytes, offset):
tokens = self.tokenize(delete_bytes)
n_tokens = len(tokens)
self.n_from_end_tokens = self.n_from_start_tokens + n_tokens
yield (
Delete(
self.n_from_start_tokens,
self.n_from_end_tokens,
self.n_to_start_tokens,
self.n_to_start_tokens,
),
tokens,
[],
)
# We have now used more of the "from" tokens.
self.n_from_start_tokens += n_tokens
self.from_last_end_bytes = offset["from"] + len(delete_bytes)
# now we go through the parmoves
for id, from_diff in parmove_from_dict.items():
to_diff = parmove_from_dict[from_diff['moveInfo']['linkId']]
### TODO calculate the correct token indexes.
def doHighlightRange(self, highlight_bytes, highlightRanges, offset):
# The text field is an overlapping mix of both the from and to,
# so we need to handle it highlight-by-highlight.
# there can be gaps between highlight segments.
# for instance, if a word is deleted from the middle of a line.
# we need to track that.
highlight_end = 0
highlight_offset = offset
# note that diffs are token-level, but the indexes are byte-level
# TODO: Handle trailing tokens
for highlightRange in highlightRanges:
highlight_start = highlightRange["start"]
# equal bytes in between highlights
if highlight_start > highlight_end:
equal_bytes = highlight_bytes[
highlight_end : highlight_start
]
n_equal_bytes = len(equal_bytes)
yield from self.doEqual(equal_bytes, highlight_offset)
highlight_offset['from'] += n_equal_bytes
highlight_offset['to'] += n_equal_bytes
# handle highlighted insert / delete
highlight_end = highlight_start + highlightRange["length"]
range_bytes = highlight_bytes[highlight_start:highlight_end]
n_range_bytes = len(range_bytes)
if highlightRange["type"] == 0:
yield from self.doInsert(range_bytes, highlight_offset)
highlight_offset['to'] += n_range_bytes
elif highlightRange["type"] == 1:
yield from self.doDelete(range_bytes, highlight_offset)
highlight_offset['from'] += n_range_bytes
else:
raise Exception(highlightRange)  # unknown highlightRange type
# handle the rest of the line which is equal
if highlight_end < len(highlight_bytes):
range_bytes = highlight_bytes[highlight_end:]
yield from self.doEqual(range_bytes, highlight_offset)
# raise Exception(result)
return result
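# Worked example (illustrative): for a type-3 line where "cats" became
# "dogs", the entry's text holds both spans; a type-1 (delete) highlight
# range covers "cats" and a type-0 (insert) range covers "dogs", so
# doHighlightRange emits Equal, Delete, Insert, then a trailing Equal.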
class WikiDiffMatcher:
def __init__(self,
url: str,
texts: list[str],
tokenizer: RegexTokenizer = None,
):
def __init__(
self,
texts: Optional[list[str]] = None,
tokenizer: Optional[RegexTokenizer] = None,
url: str = "http://127.0.0.1:8000",
):
# Pre-compute diffs to reduce traffic overhead.
self.diffs = compute_diffs(url, texts)
self.tokenizer = tokenizer or TOKENIZER
class Processor(DiffEngine.Processor):
def __init__(self,
diffs,
tokenizer=None
):
self.diffs = iter(diffs)
def __init__(self, diffs, tokenizer=None):
self.diffs = iter(diffs)
self.tokenizer = tokenizer or TOKENIZER
self.last_tokens = []
self.previous_text = ""
@@ -229,28 +246,27 @@ class WikiDiffMatcher:
self.last_tokens = last_tokens
def process(self, text, token_class=None):
# IDEs will report the method signature as incorrect, but this is
# expected. The DiffEngine.Processor class must be inherited from,
# and its process definition incorrectly excludes a "self" argument.
# The diff has already been computed, but we need to incrementally
# retrieve it to recreate the behavior DiffState expects.
diff = next(self.diffs)
diffToOperationsMapper = DiffToOperationMap(self.previous_text, text, diff, self.tokenizer)
(
operations,
aseq,
bseq,
) = list(
zip(*diffToOperationsMapper.to_operations())
)
tokens = self.tokenizer.tokenize(text, token_class=token_class)
operations = to_operations(self.previous_text, text, diff, self.tokenizer)
a = self.last_tokens
b = tokens
self.last_tokens = tokens
self.last_tokens = list(chain.from_iterable(aseq))
tokens = list(chain.from_iterable(bseq))
self.previous_text = text
return operations, a, b
return operations, self.last_tokens, tokens
def processor(self, *args, **kwargs):
return self.Processor(self.diffs, self.tokenizer)
def process(self):
# DiffState checks for this method even though it is not called.
raise Exception("Unnecessary implementation")

View File

@@ -17,7 +17,7 @@ $data = json_decode($rawData, true);
$previous = '';
$result = [];
foreach ($data as $i => $value) {
$result[] = wikidiff2_inline_json_diff($previous, $value, 0);
$result[] = wikidiff2_inline_json_diff($previous, $value, 5000000);
$previous = $value;
}
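For reference, the helper above expects a JSON array of revision texts in the request body and returns one wikidiff2 JSON diff per revision. A minimal client sketch (assuming the server listens on http://127.0.0.1:8000, as elsewhere in this commit):

    import json
    import requests

    texts = ["first revision", "first revision plus a word"]
    resp = requests.post("http://127.0.0.1:8000", json=texts)
    resp.raise_for_status()
    for d in resp.json():  # one wikidiff2 JSON string per revision
        print(json.loads(d)["diff"])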

wikiq
View File

@@ -140,7 +140,6 @@ The pattern can include capture groups. If it does then each capture group will
If the pattern does not include a capture group, then only one output column will result.
"""
class RegexPair(object):
def __init__(self, pattern, label):
self.pattern = re.compile(pattern)
@@ -219,7 +218,7 @@ class WikiqParser:
revert_radius: int = 15,
output_parquet: bool = True,
parquet_buffer_size: int = 2000,
wikidiff_url: str = "",
wikidiff_url: str = "http://127.0.0.1:8000",
):
"""
@@ -450,9 +449,9 @@ class WikiqParser:
state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
revert_radius=PERSISTENCE_RADIUS)
elif self.persist == PersistMethod.wikidiff:
state = mwpersistence.DiffState(WikiDiffMatcher(self.wikidiff_url,
revision_texts,
tokenizer=wikitext_split),
state = mwpersistence.DiffState(WikiDiffMatcher(revision_texts,
tokenizer=wikitext_split,
url=self.wikidiff_url),
revert_radius=PERSISTENCE_RADIUS)
else:
from mw.lib import persistence