got wikidiff2 persistence working except for paragraph moves.
This commit is contained in:
parent 186cb82fb8
commit 5a3e4102b5

.gitmodules (vendored, 3 changes)
							| @ -0,0 +1,3 @@ | |||||||
|  | [submodule "mediawiki-php-wikidiff2"] | ||||||
|  | 	path = mediawiki-php-wikidiff2 | ||||||
|  | 	url = https://github.com/wikimedia/mediawiki-php-wikidiff2/ | ||||||
| @ -20,6 +20,13 @@ associated tests to work. | |||||||
| - 7zip | - 7zip | ||||||
| - ffmpeg | - ffmpeg | ||||||
| 
 | 
 | ||||||
|  | A new diff engine based on `wikidiff2`_ can be used for word persistence. Wikiq can also output the diffs between each page revision. This requires installing wikidiff2 on your system. On Debian or Ubuntu Linux this can be done via: | ||||||
|  | 
 | ||||||
|  | ``apt-get install php-wikidiff2`` | ||||||
|  | 
 | ||||||
|  | You may also have to run: | ||||||
|  | ``sudo phpenmod wikidiff2`` | ||||||
|  | 
 | ||||||
| Tests | Tests | ||||||
| ---- | ---- | ||||||
| To run tests:: | To run tests:: | ||||||
| @ -30,3 +37,5 @@ TODO: | |||||||
| _______________ | _______________ | ||||||
| 1. [] Output metadata about the run. What parameters were used? What versions of deltas? | 1. [] Output metadata about the run. What parameters were used? What versions of deltas? | ||||||
| 2. [] Url encoding by default | 2. [] Url encoding by default | ||||||
|  | 
 | ||||||
|  | .. _wikidiff2: https://www.mediawiki.org/wiki/Wikidiff2 | ||||||
|  | |||||||
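A quick way to confirm the diff helper is reachable is to post a couple of toy revisions to it from Python. This is a minimal sketch: the URL is the default ``http://127.0.0.1:8000`` used elsewhere in this commit, and it assumes the PHP helper accepts a JSON array of revision texts in the request body, which is how ``compute_diffs`` talks to it::

    # Minimal connectivity check for the wikidiff2 helper service.
    # URL and request shape are assumptions based on the code in this commit.
    import json

    import requests

    revisions = ["foo bar", "foo baz bar"]
    resp = requests.post("http://127.0.0.1:8000", json=revisions)
    resp.raise_for_status()
    for diff in resp.json():
        # each element is a wikidiff2 inline JSON diff against the previous revision
        print(json.loads(diff)["diff"][:3])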
| @ -3,7 +3,7 @@ name = "mediawiki-dump-tools" | |||||||
| version = "0.1.0" | version = "0.1.0" | ||||||
| description = "Add your description here" | description = "Add your description here" | ||||||
| readme = "README.md" | readme = "README.md" | ||||||
| requires-python = "~=3.9" | requires-python = ">=3.9" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|     "deltas>=0.7.0", |     "deltas>=0.7.0", | ||||||
|     "mediawiki-utilities>=0.4.18", |     "mediawiki-utilities>=0.4.18", | ||||||
| @ -18,8 +18,11 @@ dependencies = [ | |||||||
| [tool.uv.sources] | [tool.uv.sources] | ||||||
| yamlconf = { git = "https://github.com/groceryheist/yamlconf" } | yamlconf = { git = "https://github.com/groceryheist/yamlconf" } | ||||||
| mwxml = { git = "https://github.com/groceryheist/python-mwxml" } | mwxml = { git = "https://github.com/groceryheist/python-mwxml" } | ||||||
|  | deltas = { git = "https://github.com/groceryheist/deltas" } | ||||||
| 
 | 
 | ||||||
| [dependency-groups] | [dependency-groups] | ||||||
| dev = [ | dev = [ | ||||||
|     "pandas>=2.1.0" |     "pandas>=2.1.0", | ||||||
|  |     "pytest>=8.4.1", | ||||||
|  |     "pytest-asyncio>=1.0.0", | ||||||
| ] | ] | ||||||
|  | |||||||
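With ``pytest`` now in the dev group, the diff-to-operations mapping can be exercised without the PHP service by feeding a hand-written diff to the ``DiffToOperationMap`` class added below. A sketch, where the module name ``wikidiff_matcher`` is only a placeholder for wherever that class actually lives::

    import json

    from deltas import Insert, tokenizers
    from wikidiff_matcher import DiffToOperationMap  # placeholder module name


    def test_insert_only_diff():
        # a single type-1 entry: a line present only in the 'to' revision
        diff = json.dumps({"diff": [
            {"type": 1, "text": "new line", "offset": {"from": 0, "to": 0}},
        ]})
        mapper = DiffToOperationMap("", "new line", diff, tokenizers.wikitext_split)
        ops = [op for op, _, _ in mapper.to_operations()]
        assert isinstance(ops[0], Insert)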
| @ -1,8 +1,11 @@ | |||||||
| import json | import json | ||||||
| import sys | import sys | ||||||
|  | from itertools import chain | ||||||
|  | from typing import Generator, List, Optional, Tuple | ||||||
| 
 | 
 | ||||||
| import requests | import requests | ||||||
| from deltas import tokenizers, RegexTokenizer, DiffEngine, Equal, Insert, Delete | from deltas import (Delete, DiffEngine, Equal, Insert, Operation, | ||||||
|  |                     RegexTokenizer, Token, tokenizers) | ||||||
| 
 | 
 | ||||||
| TOKENIZER = tokenizers.wikitext_split | TOKENIZER = tokenizers.wikitext_split | ||||||
| 
 | 
 | ||||||
| @ -15,7 +18,8 @@ def compute_diffs(url: str, texts: list[str]) -> list: | |||||||
|         incremental_diffs = response.json() |         incremental_diffs = response.json() | ||||||
|     except requests.exceptions.ConnectionError as e: |     except requests.exceptions.ConnectionError as e: | ||||||
|         print( |         print( | ||||||
|             f"Connection Error: Could not connect to the server at {url}. Make sure your local server is running.") |             f"Connection Error: Could not connect to the server at {url}. Make sure your local server is running." | ||||||
|  |         ) | ||||||
|         print(e) |         print(e) | ||||||
|         raise e |         raise e | ||||||
|     except requests.exceptions.HTTPError as e: |     except requests.exceptions.HTTPError as e: | ||||||
| @ -34,193 +38,206 @@ def compute_diffs(url: str, texts: list[str]) -> list: | |||||||
|         print(f"An unexpected error occurred: {e}") |         print(f"An unexpected error occurred: {e}") | ||||||
|         raise e |         raise e | ||||||
| 
 | 
 | ||||||
|  |     # for diff in incremental_diffs: | ||||||
|  |     #     for wikidiffop in json.loads(diff)["diff"][0:5]: | ||||||
|  |     #         print(wikidiffop) | ||||||
|  | 
 | ||||||
|     return incremental_diffs |     return incremental_diffs | ||||||
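For orientation, each element returned by the helper is a wikidiff2 inline JSON diff. The field names below mirror what the parser in this file reads (``type``, ``text``, ``offset``, ``highlightRanges``, ``moveInfo``); the concrete values are made up for illustration::

    # Illustrative shape of one parsed wikidiff2 inline JSON diff (made-up values).
    example_diff = {
        "diff": [
            # type 0: a context line present in both revisions; only emitted when
            # the helper is called with a non-zero number of context lines
            {"type": 0, "text": "Lorem ipsum.", "offset": {"from": 0, "to": 0}},
            # type 1: line only in the 'to' revision; type 2: only in the 'from'
            {"type": 1, "text": "A new line.", "offset": {"from": 13, "to": 13}},
            # type 3: changed line; highlightRanges mark byte spans within text
            # that were inserted (type 0) or deleted (type 1)
            {
                "type": 3,
                "text": "Lorem ipsum dolor.",
                "offset": {"from": 13, "to": 26},
                "highlightRanges": [{"start": 12, "length": 6, "type": 0}],
            },
            # types 4 and 5 are the two halves of a paragraph move, linked via
            # moveInfo["id"] / moveInfo["linkId"] (not yet handled by this commit)
        ]
    }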
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def to_operations(from_text:str, to_text:str, diff:str, tokenizer: RegexTokenizer) -> list: | class DiffToOperationMap: | ||||||
|     d = json.loads(diff) |  | ||||||
| 
 | 
 | ||||||
|     # the code below is designed to work in bytes because that's how wikidiff2 indexes |     def __init__(self, from_text, to_text, diff, tokenizer): | ||||||
|     from_text = from_text.encode('utf-8') |  | ||||||
|     to_text = to_text.encode('utf-8') |  | ||||||
| 
 | 
 | ||||||
|     # convenient function for tokenizing bytes |         self.tokenizer = tokenizer | ||||||
|     def tokenize(bytes): |         # parse the wikidiff2 JSON payload once | ||||||
|         return tokenizer.tokenize(bytes.decode('utf-8')) |         self.diff = json.loads(diff) | ||||||
| 
 | 
 | ||||||
|     # Keep track of the last difference we saw in order to notice unaccounted-for |         # the code below is designed to work in bytes because that's how wikidiff2 indexes | ||||||
|     # tokens. Each token at the end of "to" which is skipped for the next diff |         self.from_bytes = from_text.encode("utf-8") | ||||||
|     # must be represented as an "Equal()" segment. |         self.to_bytes = to_text.encode("utf-8") | ||||||
|     from_last_end_bytes = 0 |  | ||||||
|     to_last_end_bytes = 0 |  | ||||||
| 
 | 
 | ||||||
|     result = [] |  | ||||||
|     # DiffState expects differences to be represented in order from the |  | ||||||
|     # result's perspective ("to"), not the previous text. Thus, if a line |  | ||||||
|     # is moved earlier then its insertion should appear before its deletion. |  | ||||||
|     # As a rule of thumb, the "to" segments should be non-overlapping and |  | ||||||
|     # strictly increasing, while the "from" segments should merely be |  | ||||||
|     # non-overlapping. |  | ||||||
|      |  | ||||||
|     # wikidiff2 appears to follow this same convention, but this behavior |  | ||||||
|     # is not documented. |  | ||||||
| 
 | 
 | ||||||
|     # Note that, confusingly for Insert operations only the "to" indexes matter |         self.from_last_end_bytes = 0 | ||||||
|     # and for the Delete and Equal operations only the "from" indexes matter. |         self.from_last_to_bytes = 0 | ||||||
|     # This is clear from reading state.py in `mwpersistence` and operations.py in `deltas` |         self.n_from_start_tokens = 0 | ||||||
|  |         self.n_to_start_tokens = 0 | ||||||
|  |         self.last_to_start_line = 0 | ||||||
|  |         self.last_from_start_line = 0 | ||||||
|  |         self.from_last_end_bytes = 0 | ||||||
|  |         self.to_last_end_bytes = 0 | ||||||
|  |          | ||||||
|  |     def tokenize(self, bytes): | ||||||
|  |         return self.tokenizer.tokenize(bytes.decode("utf-8")) | ||||||
| 
 | 
 | ||||||
|     parmove_from_dict = {} # lookup move diffs based on moveinfo id. |     def to_operations(self): | ||||||
|     parmove_to_dict = {}  |         parmove_from_dict = {}  # lookup move diffs based on moveinfo id. | ||||||
|      |         parmove_to_dict = {} | ||||||
|     for entry in d['diff']: |         for entry in self.diff["diff"]: | ||||||
|         linebytes = entry['text'].encode('utf-8') |             offset = entry['offset'] | ||||||
|         from_start_line = entry['offset']['from'] # this is the first byte of the line in the 'from' revision. |             linebytes = entry["text"].encode("utf-8") | ||||||
|         to_start_line = entry['offset']['to'] # this is the first byte of the line in the 'to' revision. |  | ||||||
| 
 | 
 | ||||||
|         from_start_tokens = len(tokenize(from_text[:from_start_line])) |             # ignore empty diffs. They don't have any tokens | ||||||
|         to_start_tokens = len(tokenize(to_text[:to_start_line])) |             if len(linebytes) == 0: | ||||||
|         # These constant calls to tokenizer.tokenize can definitely be optimized |                 continue | ||||||
|         # as tokenization is currently a bottleneck. Ideally tokenization would |             # this is the first byte of the line in the 'from' revision. | ||||||
|         # happen incrementally where possible, or somehow be cached, but this |             from_start_line = entry["offset"]["from"] | ||||||
|         # would be more complex. N: I think it's okay. CPU is cheap. |             # this is the first byte of the line in the 'to' revision. | ||||||
|  |             to_start_line = entry["offset"]["to"] | ||||||
| 
 | 
 | ||||||
|         if entry['type'] == 0: # wikidiff2 can emit this if it is called with numContextLines != 0. |             if entry["type"] == 0: | ||||||
| 
 |                 yield from self.doEqual(linebytes, offset) | ||||||
|             line_tokens = len(tokenize(linebytes)) |  | ||||||
|             from_end_tokens = from_start_tokens + line_tokens |  | ||||||
|             to_end_tokens = to_start_tokens + line_tokens |  | ||||||
| 
 |  | ||||||
|             result.append(Equal(from_start_tokens, from_end_tokens, |  | ||||||
|                                 to_start_tokens, to_end_tokens)) |  | ||||||
| 
 |  | ||||||
|             # we need to keep track of the to and from last end bytes |  | ||||||
|             from_last_end_bytes += len(linebytes) |  | ||||||
|             to_last_end_bytes  += len(linebytes) |  | ||||||
| 
 |  | ||||||
|             continue |  | ||||||
|         else: |  | ||||||
|             # These do not appear to be generated by wikidiff2, and so must be |  | ||||||
|             # inferred. |  | ||||||
|             equal_tokens = to_start_tokens - to_last_end_bytes |  | ||||||
|             # If we notice that the next non-zero segment (which must be a |  | ||||||
|             # change, given that its type is non-zero), begins after the end |  | ||||||
|             # of the previous segment, we must add an Equal segment. |  | ||||||
|             # TODO: While the "to" token ranges are correct, |  | ||||||
|             # the "from" |  | ||||||
|             #  ranges are likely not, particularly in histories with paragraph |  | ||||||
|             #  moves. they can be corrected. |  | ||||||
|             if equal_tokens > 0: |  | ||||||
|                 # only the 'from' indexes matter |  | ||||||
|                 result.append(Equal(from_last_end_bytes, from_start_line, |  | ||||||
|                                     to_last_end_bytes, to_start_line)) |  | ||||||
| 
 |  | ||||||
|         if entry['type'] == 1: # a line included in the 'to' revision, but not in the 'from' revision |  | ||||||
|             line_tokens = len(tokenize(linebytes)) |  | ||||||
|             to_end_tokens = to_start_tokens + line_tokens |  | ||||||
| 
 |  | ||||||
|             result.append(Insert(from_start_tokens, from_start_tokens, |  | ||||||
|                                  to_start_tokens, to_end_tokens, |  | ||||||
|                                  )) |  | ||||||
| 
 |  | ||||||
|             # We have now used more of the "to" tokens. |  | ||||||
|             to_start_end = to_end_tokens |  | ||||||
|              |              | ||||||
|         elif entry['type'] == 2: # a line included in the 'from' revision, but not in the 'to' revision |                 # a line included in the 'to' revision, but not in the 'from' revision | ||||||
|             line_tokens = len(tokenize(linebytes)) |             elif entry["type"] == 1: | ||||||
|             from_end_tokens = from_start_tokens + line_tokens |                 yield from self.doInsert(linebytes, offset) | ||||||
| 
 | 
 | ||||||
|             result.append(Delete(from_start_tokens, from_end_tokens, |                 # a line included in the 'from' revision, but not in the 'to' revision | ||||||
|                                  to_start_tokens, to_start_tokens, |             elif entry["type"] == 2: | ||||||
|                                  )) |                 yield from self.doDelete(linebytes, offset) | ||||||
| 
 |      | ||||||
|             # We have now used more of the "from" tokens. |             elif entry["type"] == 3: | ||||||
|             from_last_end_bytes = from_end_tokens |                 yield from self.doHighlightRange(linebytes, entry['highlightRanges'], offset) | ||||||
|              |              | ||||||
|         elif entry['type'] == 3: |             elif entry["type"] == 4: | ||||||
|             # The text field is an overlapping mix of both the from and to, |                 parmove_from_dict[entry["moveInfo"]["id"]] = entry | ||||||
|             # so we need to handle it highlight-by-highlight. |  | ||||||
|             # there can be gaps between highlight segments. |  | ||||||
|             # for instance, if a word is deleted from the middle of a line. |  | ||||||
|             # we need to track that.  |  | ||||||
|             highlight_last_end = 0 |  | ||||||
| 
 | 
 | ||||||
|             # note that diffs are token-level, but the indexes are byte-level |             elif entry["type"] == 5: | ||||||
|             for highlightRange in entry['highlightRanges']: |                 # for type 4 diffs (paragraph moved in the from revision) we need to find a matching type 5 diff. | ||||||
|                 if highlightRange['start'] > highlight_last_end: |                 parmove_to_dict[entry["moveInfo"]["id"]] = entry | ||||||
|  |             # for deletions and equality report the token indexes from the 'from' revision. | ||||||
| 
 | 
 | ||||||
|                     equal_bytes = linebytes[highlight_last_end:highlightRange['start']] |             else: | ||||||
|                     equal_tokens = len(tokenize(equal_bytes)) |                 # The 'type' isn't one of the known wikidiff2 entry types. | ||||||
|                     from_end_tokens = from_start_tokens + equal_tokens |                 raise ValueError(entry) | ||||||
|                     to_end_tokens = to_end_tokens + equal_tokens |  | ||||||
|                     result.append(Equal(from_start_tokens, from_end_tokens, |  | ||||||
|                                         to_start_tokens, to_end_tokens |  | ||||||
|                                         )) |  | ||||||
| 
 | 
 | ||||||
|                     from_start_tokens = from_end_tokens |          | ||||||
|                     to_start_tokens = to_end_tokens |         # mwpersistence expects differences to be represented in order from the | ||||||
|                      |         # result's perspective ("to"), not the previous text. Thus, if a line | ||||||
|                 rangeStart = highlightRange['start'] |         # is moved earlier then its insertion should appear before its deletion. | ||||||
|                 rangeEnd = rangeStart + highlightRange['length'] |         # As a rule of thumb, the "to" segments should be non-overlapping and | ||||||
|                 range_bytes = linebytes[rangeStart:rangeEnd] |         # strictly increasing, while the "from" segments should merely be | ||||||
|                 range_tokens = len(tokenize(range_bytes)) |         # non-overlapping. | ||||||
|                 if highlightRange['type'] == 0: |  | ||||||
|                     # Insertion |  | ||||||
|                     to_end_tokens = to_start_tokens + range_tokens |  | ||||||
|                     result.append(Insert(from_start_tokens, from_end_tokens, |  | ||||||
|                                          to_start_tokens, to_end_tokens)) |  | ||||||
| 
 | 
 | ||||||
|                     to_start_tokens = to_end_tokens |         # now we go through the parmoves | ||||||
|                 elif highlightRange['type'] == 1: |         for move_id, from_diff in parmove_from_dict.items(): | ||||||
|                     # Deletion |             to_diff = parmove_to_dict[from_diff["moveInfo"]["linkId"]] | ||||||
|                     from_end_tokens = from_start_tokens + range_tokens |             # TODO: emit operations for the moved paragraphs; not yet working. | ||||||
|                     result.append(Delete(from_start_tokens, from_end_tokens, |  | ||||||
|                                          to_start_tokens, to_end_tokens)) |  | ||||||
|                     from_start_tokens = from_end_tokens |  | ||||||
|                                           |  | ||||||
|                 else: |  | ||||||
|                     raise Exception(entry) |  | ||||||
|                  |  | ||||||
|                 highlight_last_end = highlightRange['start'] + highlightRange['length'] |  | ||||||
| 
 | 
 | ||||||
|         elif entry['type'] == 4: |     def doEqual(self, equal_bytes, offset): | ||||||
|  |         tokens = self.tokenize(equal_bytes) | ||||||
|  |         n_tokens = len(tokens) | ||||||
|  |         self.n_from_end_tokens = self.n_from_start_tokens + n_tokens | ||||||
|  |         self.n_to_end_tokens = self.n_to_start_tokens + n_tokens | ||||||
|  |         yield ( | ||||||
|  |             Equal( | ||||||
|  |                 self.n_from_start_tokens, | ||||||
|  |                 self.n_from_end_tokens, | ||||||
|  |                 self.n_to_start_tokens, | ||||||
|  |                 self.n_to_end_tokens, | ||||||
|  |             ), | ||||||
|  |             tokens, | ||||||
|  |             tokens, | ||||||
|  |         ) | ||||||
|  |         # we need to keep track of the to and from last end bytes | ||||||
|  |         self.from_last_end_bytes = offset["from"] + len(equal_bytes) | ||||||
|  |         self.to_last_end_bytes = offset["to"] + len(equal_bytes) | ||||||
|  |         self.n_from_start_tokens += n_tokens | ||||||
|  |         self.n_to_start_tokens += n_tokens | ||||||
| 
 | 
 | ||||||
|             parmove_from_dict['moveInfo']['id'] = diff |  | ||||||
| 
 | 
 | ||||||
|         elif entry['type'] == 5: |     def doInsert(self, insert_bytes, offset): | ||||||
|  |         tokens = self.tokenize(insert_bytes) | ||||||
|  |         n_tokens = len(tokens) | ||||||
|  |         self.n_to_end_tokens = self.n_to_start_tokens + n_tokens | ||||||
|  |         yield ( | ||||||
|  |             Insert( | ||||||
|  |                 self.n_from_start_tokens, | ||||||
|  |                 self.n_from_start_tokens, | ||||||
|  |                 self.n_to_start_tokens, | ||||||
|  |                 self.n_to_end_tokens, | ||||||
|  |             ), | ||||||
|  |             [], | ||||||
|  |             tokens, | ||||||
|  |         ) | ||||||
|  |         # We have now used more of the "to" tokens. | ||||||
|  |         self.n_to_start_tokens += n_tokens | ||||||
|  |         self.to_last_end_bytes = offset["to"] + len(insert_bytes) | ||||||
| 
 | 
 | ||||||
|             parmove_to_dict['moveInfo']['id'] = diff |     def doDelete(self, delete_bytes, offset): | ||||||
|             # for type 4 diffs (paragraph moved in the from revision) we need to find a matching type 5 diff. |         tokens = self.tokenize(delete_bytes) | ||||||
|             # for deletions and equality report the token indexes from the 'from' revision.  |         n_tokens = len(tokens) | ||||||
|         else: |         self.n_from_end_tokens = self.n_from_start_tokens + n_tokens | ||||||
|             # The 'type' isn't one of the known |         yield ( | ||||||
|             raise ValueError(d) |             Delete( | ||||||
|  |                 self.n_from_start_tokens, | ||||||
|  |                 self.n_from_end_tokens, | ||||||
|  |                 self.n_to_start_tokens, | ||||||
|  |                 self.n_to_start_tokens, | ||||||
|  |             ), | ||||||
|  |             tokens, | ||||||
|  |             [], | ||||||
|  |         ) | ||||||
|  |         # We have now used more of the "from" tokens. | ||||||
|  |         self.n_from_start_tokens += n_tokens | ||||||
|  |         self.from_last_end_bytes = offset["from"] + len(delete_bytes) | ||||||
| 
 | 
 | ||||||
|     # now we go through the parmoves |     def doHighlightRange(self, highlight_bytes, highlightRanges, offset): | ||||||
|     for id, from_diff in parmove_from_dict.items(): |         # The text field is an overlapping mix of both the from and to, | ||||||
|         to_diff = parmove_from_dict[from_diff['moveInfo']['linkId']] |         # so we need to handle it highlight-by-highlight. | ||||||
|         ### TODO calculate the correct token indexes. |         # there can be gaps between highlight segments. | ||||||
|  |         # for instance, if a word is deleted from the middle of a line. | ||||||
|  |         # we need to track that. | ||||||
|  |         highlight_end = 0 | ||||||
|  |         highlight_offset = offset | ||||||
|  |         # note that diffs are token-level, but the indexes are byte-level | ||||||
| 
 | 
 | ||||||
|     # TODO: Handle trailing tokens |         for highlightRange in highlightRanges: | ||||||
|  |             highlight_start = highlightRange["start"] | ||||||
|  |             # equal bytes in between highlights | ||||||
|  |             if highlight_start > highlight_end: | ||||||
|  | 
 | ||||||
|  |                 equal_bytes = highlight_bytes[ | ||||||
|  |                     highlight_end : highlight_start | ||||||
|  |                 ] | ||||||
|  |                 n_equal_bytes = len(equal_bytes) | ||||||
|  |                 yield from self.doEqual(equal_bytes, highlight_offset) | ||||||
|  |                 highlight_offset['from'] += n_equal_bytes | ||||||
|  |                 highlight_offset['to'] += n_equal_bytes | ||||||
|  | 
 | ||||||
|  |             # handle highlighted insert / delete | ||||||
|  |             highlight_end = highlight_start + highlightRange["length"] | ||||||
|  |             range_bytes = highlight_bytes[highlight_start:highlight_end] | ||||||
|  |             n_range_bytes = len(range_bytes) | ||||||
|  |             if highlightRange["type"] == 0: | ||||||
|  |                 yield from self.doInsert(range_bytes, highlight_offset) | ||||||
|  |                 highlight_offset['to'] += n_range_bytes | ||||||
|  |             elif highlightRange["type"] == 1: | ||||||
|  |                 yield from self.doDelete(range_bytes, highlight_offset) | ||||||
|  |                 highlight_offset['from'] += n_range_bytes | ||||||
|  |             else: | ||||||
|  |                 raise Exception(highlightRange) | ||||||
|  | 
 | ||||||
|  |         # handle the rest of the line which is equal | ||||||
|  |         if highlight_end < len(highlight_bytes): | ||||||
|  |             range_bytes = highlight_bytes[highlight_end:] | ||||||
|  |             yield from self.doEqual(range_bytes, highlight_offset) | ||||||
| 
 | 
 | ||||||
|     # raise Exception(result) |  | ||||||
|     return result |  | ||||||
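To see how a single type-3 entry is decomposed, the sketch below runs one hand-written change line through ``DiffToOperationMap``. The module name ``wikidiff_matcher`` is a placeholder, and the entry is hand-constructed rather than produced by wikidiff2::

    import json

    from deltas import tokenizers
    from wikidiff_matcher import DiffToOperationMap  # placeholder module name

    from_text = "foo baz"
    to_text = "foo bar baz"
    # one changed line: "bar " (bytes 4..8 of the 'to' text) was inserted
    diff = json.dumps({"diff": [{
        "type": 3,
        "text": to_text,
        "offset": {"from": 0, "to": 0},
        "highlightRanges": [{"start": 4, "length": 4, "type": 0}],
    }]})

    mapper = DiffToOperationMap(from_text, to_text, diff, tokenizers.wikitext_split)
    for op, a_tokens, b_tokens in mapper.to_operations():
        # expected: Equal("foo "), Insert("bar "), Equal("baz")
        print(type(op).__name__, a_tokens, b_tokens)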
| 
 | 
 | ||||||
| class WikiDiffMatcher: | class WikiDiffMatcher: | ||||||
|     def __init__(self, |     def __init__( | ||||||
|                  url: str, |         self, | ||||||
|                  texts: list[str], |         texts: list[str] = None, | ||||||
|                  tokenizer: RegexTokenizer = None, |         tokenizer: Optional[RegexTokenizer] = None, | ||||||
|                  ): |         url: Optional[str] = "http://127.0.0.1:8000", | ||||||
|  |     ): | ||||||
|         # Pre-compute diffs to reduce traffic overhead. |         # Pre-compute diffs to reduce traffic overhead. | ||||||
|         self.diffs = compute_diffs(url, texts) |         self.diffs = compute_diffs(url, texts) | ||||||
|         self.tokenizer = tokenizer or TOKENIZER |         self.tokenizer = tokenizer or TOKENIZER | ||||||
| 
 | 
 | ||||||
|     class Processor(DiffEngine.Processor): |     class Processor(DiffEngine.Processor): | ||||||
|         def __init__(self, |         def __init__(self, diffs, tokenizer=None): | ||||||
|                      diffs, |             self.diffs = iter(diffs) | ||||||
|                      tokenizer=None |  | ||||||
|                      ): |  | ||||||
|             self.diffs = iter(diffs) |  | ||||||
|             self.tokenizer = tokenizer or TOKENIZER |             self.tokenizer = tokenizer or TOKENIZER | ||||||
|             self.last_tokens = [] |             self.last_tokens = [] | ||||||
|             self.previous_text = "" |             self.previous_text = "" | ||||||
| @ -229,28 +246,27 @@ class WikiDiffMatcher: | |||||||
|             self.last_tokens = last_tokens |             self.last_tokens = last_tokens | ||||||
| 
 | 
 | ||||||
|         def process(self, text, token_class=None): |         def process(self, text, token_class=None): | ||||||
|             # IDEs will report the method signature as incorrect, but this is |  | ||||||
|             # expected. The DiffEngine.Processor class must be inherited from, |  | ||||||
|             # and its process definition incorrectly excludes a "self" argument. |  | ||||||
| 
 |  | ||||||
|             # The diff has already been computed, but we need to incrementally |             # The diff has already been computed, but we need to incrementally | ||||||
|             # retrieve it to recreate the behavior DiffState expects. |             # retrieve it to recreate the behavior DiffState expects. | ||||||
|             diff = next(self.diffs) |             diff = next(self.diffs) | ||||||
|  |             diffToOperationsMapper = DiffToOperationMap(self.previous_text, text, diff, self.tokenizer) | ||||||
|  |             ( | ||||||
|  |                 operations, | ||||||
|  |                 aseq, | ||||||
|  |                 bseq, | ||||||
|  |             ) = list( | ||||||
|  |                 zip(*diffToOperationsMapper.to_operations()) | ||||||
|  |             ) | ||||||
| 
 | 
 | ||||||
|             tokens = self.tokenizer.tokenize(text, token_class=token_class) |             self.last_tokens = list(chain.from_iterable(aseq)) | ||||||
|             operations = to_operations(self.previous_text, text, diff, self.tokenizer) |             tokens = list(chain.from_iterable(bseq)) | ||||||
| 
 |  | ||||||
|             a = self.last_tokens |  | ||||||
|             b = tokens |  | ||||||
|             self.last_tokens = tokens |  | ||||||
|             self.previous_text = text |             self.previous_text = text | ||||||
| 
 | 
 | ||||||
|             return operations, a, b |             return operations, self.last_tokens, tokens | ||||||
| 
 | 
 | ||||||
|     def processor(self, *args, **kwargs): |     def processor(self, *args, **kwargs): | ||||||
|         return self.Processor(self.diffs, self.tokenizer) |         return self.Processor(self.diffs, self.tokenizer) | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
|     def process(self): |     def process(self): | ||||||
|         # DiffState checks for this method even though it is not called. |         # DiffState checks for this method even though it is not called. | ||||||
|         raise Exception("Unnecessary implementation") |         raise Exception("Unnecessary implementation") | ||||||
|  | |||||||
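Downstream, wikiq wires the matcher into ``mwpersistence.DiffState`` (see the wikiq hunk further down). Below is a stand-alone sketch of that wiring; it assumes the class is importable as ``wikidiff_matcher.WikiDiffMatcher``, that the helper service is running locally, and that ``DiffState.update()`` returns ``(tokens, tokens_added, tokens_removed)``::

    import mwpersistence
    from deltas import tokenizers

    from wikidiff_matcher import WikiDiffMatcher  # placeholder module name

    revision_texts = ["foo baz", "foo bar baz", "foo bar baz qux"]

    matcher = WikiDiffMatcher(revision_texts,
                              tokenizer=tokenizers.wikitext_split,
                              url="http://127.0.0.1:8000")
    state = mwpersistence.DiffState(matcher, revert_radius=15)

    for text in revision_texts:
        # word persistence relative to the previous revision
        tokens, added, removed = state.update(text)
        print(len(tokens), len(added), len(removed))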
| @ -17,7 +17,7 @@ $data = json_decode($rawData, true); | |||||||
| $previous = ''; | $previous = ''; | ||||||
| $result = []; | $result = []; | ||||||
| foreach ($data as $i => $value) { | foreach ($data as $i => $value) { | ||||||
|     $result[] = wikidiff2_inline_json_diff($previous, $value, 0); |     $result[] = wikidiff2_inline_json_diff($previous, $value, 5000000); | ||||||
|     $previous = $value; |     $previous = $value; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
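The helper diffs each revision against the one before it, starting from the empty string, and returns one diff per input revision. The third argument, per the numContextLines comment in the matcher, was raised from 0 to 5000000 so that unchanged lines come back as type-0 context entries. A rough Python rendering of the loop, with ``difflib`` standing in for ``wikidiff2_inline_json_diff`` purely to show the pairing::

    import difflib
    import json


    def consecutive_diffs(revision_texts):
        """Diff each revision against its predecessor, starting from the
        empty string; one result per revision (mirrors the PHP loop)."""
        previous = ""
        results = []
        for value in revision_texts:
            # the PHP helper calls wikidiff2_inline_json_diff(previous, value, 5000000)
            diff = "\n".join(difflib.unified_diff(previous.splitlines(),
                                                  value.splitlines(),
                                                  lineterm=""))
            results.append(diff)
            previous = value
        return results


    print(json.dumps(consecutive_diffs(["foo", "foo bar"]), indent=2))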
							
								
								
									
wikiq (9 changes)
							| @ -140,7 +140,6 @@ The pattern can include capture groups.  If it does then each capture group will | |||||||
| If the pattern does not include a capture group, then only one output column will result. | If the pattern does not include a capture group, then only one output column will result. | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
| class RegexPair(object): | class RegexPair(object): | ||||||
|     def __init__(self, pattern, label): |     def __init__(self, pattern, label): | ||||||
|         self.pattern = re.compile(pattern) |         self.pattern = re.compile(pattern) | ||||||
| @ -219,7 +218,7 @@ class WikiqParser: | |||||||
|                  revert_radius: int = 15, |                  revert_radius: int = 15, | ||||||
|                  output_parquet: bool = True, |                  output_parquet: bool = True, | ||||||
|                  parquet_buffer_size: int = 2000, |                  parquet_buffer_size: int = 2000, | ||||||
|                  wikidiff_url: str = "", |                  wikidiff_url: str = "http://127.0.0.1:8000", | ||||||
|                  ): |                  ): | ||||||
| 
 | 
 | ||||||
|         """  |         """  | ||||||
| @ -450,9 +449,9 @@ class WikiqParser: | |||||||
|                     state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split), |                     state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split), | ||||||
|                                                     revert_radius=PERSISTENCE_RADIUS) |                                                     revert_radius=PERSISTENCE_RADIUS) | ||||||
|                 elif self.persist == PersistMethod.wikidiff: |                 elif self.persist == PersistMethod.wikidiff: | ||||||
|                     state = mwpersistence.DiffState(WikiDiffMatcher(self.wikidiff_url, |                     state = mwpersistence.DiffState(WikiDiffMatcher(revision_texts, | ||||||
|                                                                     revision_texts, |                                                                     tokenizer=wikitext_split, | ||||||
|                                                                     tokenizer=wikitext_split), |                                                                     url=self.wikidiff_url), | ||||||
|                                                     revert_radius=PERSISTENCE_RADIUS) |                                                     revert_radius=PERSISTENCE_RADIUS) | ||||||
|                 else: |                 else: | ||||||
|                     from mw.lib import persistence |                     from mw.lib import persistence | ||||||
|  | |||||||