Merge branch 'tmp' into compute-diffs

2025-06-30 20:52:23 -05:00 · 2025-06-30 20:52:23 -05:00 · 20de5b93f9
commit 20de5b93f9
parent 37734ed092 5a3e4102b5
6 changed files with 211 additions and 148 deletions
--- a/.gitmodules
+++ b/.gitmodules
@ -0,0 +1,3 @@
 [submodule "mediawiki-php-wikidiff2"]
 	path = mediawiki-php-wikidiff2
 	url = https://github.com/wikimedia/mediawiki-php-wikidiff2/
--- a/README.rst
+++ b/README.rst
@ -20,6 +20,13 @@ associated tests to work.
 - 7zip
 - ffmpeg
 A new diff engine based on `wikidiff2`_ can be used for word-persistence. Wikiq can also output the diffs between each page revision. This requires installing Wikidiff 2 on your system. On Debian or Ubuntu Linux this can be done via:
 ``apt-get install php-wikidiff2``
 You may also have to run:
 ``sudo phpenmod wikidiff2``.
 Tests
 ----
 To run tests::
@ -30,3 +37,5 @@ TODO:
 _______________
 1. [] Output metadata about the run. What parameters were used? What versions of deltas?
 2. [] Url encoding by default
 .. _wikidiff2: https://www.mediawiki.org/wiki/Wikidiff2
--- a/pyproject.toml
+++ b/pyproject.toml
@ -3,7 +3,7 @@ name = "mediawiki-dump-tools"
 version = "0.1.0"
 description = "Add your description here"
 readme = "README.md"
-requires-python = "~=3.9"
+requires-python = ">=3.9"
 dependencies = [
    "deltas>=0.7.0",
    "mediawiki-utilities>=0.4.18",
@ -18,8 +18,11 @@ dependencies = [
 [tool.uv.sources]
 yamlconf = { git = "https://github.com/groceryheist/yamlconf" }
 mwxml = { git = "https://github.com/groceryheist/python-mwxml" }
 deltas = { git = "https://github.com/groceryheist/deltas" }
 [dependency-groups]
 dev = [
-    "pandas>=2.1.0"
+    "pandas>=2.1.0",
    "pytest>=8.4.1",
    "pytest-asyncio>=1.0.0",
 ]
--- a/wiki_diff_matcher.py
+++ b/wiki_diff_matcher.py
@ -1,10 +1,14 @@
 import json
 import sys
 from itertools import chain
 from typing import Generator, List, Optional, Tuple
 import requests
-from deltas import tokenizers, RegexTokenizer, DiffEngine, Equal, Insert, Delete
+from deltas import (Delete, DiffEngine, Equal, Insert, Operation,
                    RegexTokenizer, Token, tokenizers)
 TOKENIZER = tokenizers.wikitext_split
 TOKENIZER = tokenizers.text_split
 def compute_diffs(url: str, texts: list[str]) -> list:
    response = None
@ -14,7 +18,8 @@ def compute_diffs(url: str, texts: list[str]) -> list:
        incremental_diffs = response.json()
    except requests.exceptions.ConnectionError as e:
        print(
-            f"Connection Error: Could not connect to the server at {url}. Make sure your local server is running.")
+            f"Connection Error: Could not connect to the server at {url}. Make sure your local server is running."
        )
        print(e)
        raise e
    except requests.exceptions.HTTPError as e:
@ -33,161 +38,206 @@ def compute_diffs(url: str, texts: list[str]) -> list:
        print(f"An unexpected error occurred: {e}")
        raise e
    # for diff in incremental_diffs:
    #     for wikidiffop in json.loads(diff)["diff"][0:5]:
    #         print(wikidiffop)
    return incremental_diffs
-def to_operations(previous_text, next_text, diff, tokenizer: RegexTokenizer) -> list:
+class DiffToOperationMap:
    d = json.loads(diff)
-    # Keep track of the last difference we saw in order to notice unaccounted-for
+    def __init__(self, from_text, to_text, diff, tokenizer):
    # tokens. Each token at the end of "to" which is skipped for the next diff
    # must be represented as an "Equal()" segment.
    from_last_end = 0
    to_last_end = 0
-    result = []
+        self.diff = diff
-    # DiffState expects differences to be represented in order from the
+        self.tokenizer = tokenizer
-    # result's perspective ("to"), not the previous text. Thus, if a line
+        self.diff = json.loads(diff)
    # is moved earlier then its insertion should appear before its deletion.
    # As a rule of thumb, the "to" segments should be non-overlapping and
    # strictly increasing, while the "from" segments should merely be
    # non-overlapping.
    #
    # wikidiff2 appears to follow this same convention, but this behavior
    # is not documented.
-    for entry in d['diff']:
+        # the code below is designed to work in bytes because that's how wikidiff2 indexes
-        from_start_line = entry['offset']['from']
+        self.from_bytes = from_text.encode("utf-8")
-        to_start_line = entry['offset']['to']
+        self.to_bytes = to_text.encode("utf-8")
        # Per above, to_start_line appears to be nondecreasing, but
        # from_start_line may sometimes decrease for detected paragraph moves.
        from_start_tokens = len(tokenizer.tokenize(previous_text[:from_start_line]))
        to_start_tokens = len(tokenizer.tokenize(next_text[:to_start_line]))
        # These constant calls to tokenizer.tokenize can definitely be optimized
        # as tokenization is currently a bottleneck. Ideally tokenization would
        # happen incrementally where possible, or somehow be cached, but this
        # would be more complex.
        if entry['type'] == 0:
            # wikidiff2 doesn't appear to emit diffs of this type, but cover anyway.
            line_tokens = len(tokenizer.tokenize(entry['text']))
            from_end_tokens = from_start_tokens + line_tokens
            to_end_tokens = to_start_tokens + line_tokens
            result.append(Equal(from_start_tokens, from_end_tokens,
                                to_start_tokens, to_end_tokens))
            from_last_end = from_end_tokens
            to_last_end  = to_end_tokens
            continue
        else:
            # These do not appear to be generated by wikidiff2, and so must be
            # inferred.
            equal_tokens = to_start_tokens - to_last_end
            # If we notice that the next non-zero segment (which must be a
            # change, given that its type is non-zero), begins after the end
            # of the previous segment, we must add an Equal segment.
            # TODO: While the "to" token ranges are correct, the "from"
            #  ranges are likely not, particularly in histories with paragraph
            #  moves.
            if equal_tokens > 0:
                result.append(Equal(from_last_end, from_start_line,
                                    to_last_end, to_start_line))
-        if entry['type'] == 1 or entry['type'] == 4:
+        self.from_last_end_bytes = 0
-            # TODO: Separate out type 4 to recognize this is the insertion
+        self.from_last_to_bytes = 0
-            #  part of a paragraph move. Note that for paragraph moves
+        self.n_from_start_tokens = 0
-            #  the text is not necessarily identical, just similar.
+        self.n_to_start_tokens = 0
-            line_tokens = len(tokenizer.tokenize(entry['text']))
+        self.last_to_start_line = 0
-            to_end_tokens = to_start_tokens + line_tokens
+        self.last_from_start_line = 0
        self.from_last_end_bytes = 0
        self.to_last_end_bytes = 0
    def tokenize(self, bytes):
        return self.tokenizer.tokenize(bytes.decode("utf-8"))
-            result.append(Insert(from_start_tokens, from_start_tokens,
+    def to_operations(self):
-                                 to_start_tokens, to_end_tokens,
+        parmove_from_dict = {}  # lookup move diffs based on moveinfo id.
-                                 ))
+        parmove_to_dict = {}
        for entry in self.diff["diff"]:
            offset = entry['offset']
            linebytes = entry["text"].encode("utf-8")
-            # We have now used more of the "to" tokens.
+            # ignore empty diffs. They don't have any tokens
-            to_last_end = to_end_tokens
+            if len(linebytes) == 0:
-        elif entry['type'] == 2 or entry['type'] == 5:
+                continue
-            # TODO: Separate out type 5 to recognize this is the deletion
+            # this is the first byte of the line in the 'from' revision.
-            #  part of a paragraph move. Note that for paragraph moves
+            from_start_line = entry["offset"]["from"]
-            #  the text is not necessarily identical, just similar.
+            # this is the first byte of the line in the 'to' revision.
-            line_tokens = len(tokenizer.tokenize(entry['text']))
+            to_start_line = entry["offset"]["to"]
            from_end_tokens = from_start_tokens + line_tokens
-            result.append(Delete(from_start_tokens, from_end_tokens,
+            if entry["type"] == 0:
-                                 to_start_tokens, to_start_tokens,
+                yield from self.doEqual(linebytes, offset)
-                                 ))
+            
                # a line included in the 'to' revision, but not in the 'from' revision
            elif entry["type"] == 1:
                yield from self.doInsert(linebytes, offset)
-            # We have not used more of the "from" tokens.
+                # a line included in the 'from' revision, but not in the 'to' revision
-            from_last_end = from_end_tokens
+            elif entry["type"] == 2:
-        elif entry['type'] == 3:
+                yield from self.doDelete(linebytes, offset)
-            # The text field is an overlapping mix of both the previous and next
+    
-            # lines, and so we can't directly tokenize it.
+            elif entry["type"] == 3:
                yield from self.doHighlightRange(linebytes, entry['highlightRanges'], offset)
            elif entry["type"] == 4:
                parmove_from_dict["moveInfo"]["id"] = diff
-            text = entry['text']
+            elif entry["type"] == 5:
                # for type 4 diffs (paragraph moved in the from revision) we need to find a matching type 5 diff.
                parmove_to_dict["moveInfo"]["id"] = diff
            # for deletions and equality report the token indexes from the 'from' revision.
-            last_end = 0
+            else:
-            previous_line = ""
+                # The 'type' isn't one of the known wikidiff2 diff entry types (0-5).
-            next_line = ""
+                raise ValueError(d)
-            # A line will have one or more highlightRanges.
+        
-            # It is not guaranteed that insertions/deletions are matched,
+        # mwpersistence expects differences to be represented in order from the
-            # for instance, if a word is deleted from the middle of a line.
+        # result's perspective ("to"), not the previous text. Thus, if a line
-            for highlightRange in entry['highlightRanges']:
+        # is moved earlier then its insertion should appear before its deletion.
-                if highlightRange['start'] > last_end:
+        # As a rule of thumb, the "to" segments should be non-overlapping and
-                    previous_line += text[last_end:highlightRange['start']]
+        # strictly increasing, while the "from" segments should merely be
-                    next_line += text[last_end:highlightRange['start']]
+        # non-overlapping.
                    # Add an Equal segment.
-                rangeStart = highlightRange['start']
+        # now we go through the parmoves
-                rangeEnd = rangeStart + highlightRange['length']
+        for id, from_diff in parmove_from_dict.items():
            to_diff = parmove_from_dict[from_diff["moveInfo"]["linkId"]]
-                if highlightRange['type'] == 0:
+    def doEqual(self, equal_bytes, offset):
-                    # Insertion
+        tokens = self.tokenize(equal_bytes)
-                    next_line += text[rangeStart:rangeEnd]
+        n_tokens = len(tokens)
        self.n_from_end_tokens = self.n_from_start_tokens + n_tokens
        self.n_to_end_tokens = self.n_to_start_tokens + n_tokens
        yield (
            Equal(
                self.n_from_start_tokens,
                self.n_from_end_tokens,
                self.n_to_start_tokens,
                self.n_to_end_tokens,
            ),
            tokens,
            tokens,
        )
        # we need to keep track of the to and from last end bytes
        self.from_last_end_bytes = offset["from"] + len(equal_bytes)
        self.to_last_end_bytes = offset["to"] + len(equal_bytes)
        self.n_from_start_tokens += n_tokens
        self.n_to_start_tokens += n_tokens
                    # Add an Insert segment.
                elif highlightRange['type'] == 1:
                    # Deletion
                    previous_line += text[rangeStart:rangeEnd]
-                    # Add a Delete segment.
+    def doInsert(self, insert_bytes, offset):
-                else:
+        tokens = self.tokenize(insert_bytes)
-                    raise Exception(entry)
+        n_tokens = len(tokens)
        self.n_to_end_tokens = self.n_to_start_tokens + n_tokens
        yield (
            Insert(
                self.n_from_start_tokens,
                self.n_from_start_tokens,
                self.n_to_start_tokens,
                self.n_to_end_tokens,
            ),
            [],
            tokens,
        )
        # We have now used more of the "to" tokens.
        self.n_to_start_tokens += n_tokens
        self.to_last_end_bytes = offset["to"] + len(insert_bytes)
-            from_tokens = len(tokenizer.tokenize(previous_line))
+    def doDelete(self, delete_bytes, offset):
-            to_tokens = len(tokenizer.tokenize(next_line))
+        tokens = self.tokenize(delete_bytes)
        n_tokens = len(tokens)
        self.n_from_end_tokens = self.n_from_start_tokens + n_tokens
        yield (
            Delete(
                self.n_from_start_tokens,
                self.n_from_end_tokens,
                self.n_to_start_tokens,
                self.n_to_start_tokens,
            ),
            tokens,
            [],
        )
        # We have now used more of the "from" tokens.
        self.n_from_start_tokens += n_tokens
        self.from_last_end_bytes = offset["from"] + len(delete_bytes)
-            from_start_tokens += from_tokens
+    def doHighlightRange(self, highlight_bytes, highlightRanges, offset):
-            to_start_tokens += to_tokens
+        # The text field is an overlapping mix of both the from and to,
-        else:
+        # so we need to handle it highlight-by-highlight.
-            # The 'type' isn't one of the known
+        # there can be gaps between highlight segments.
-            raise ValueError(d)
+        # for instance, if a word is deleted from the middle of a line.
        # we need to track that.
        highlight_end = 0
        highlight_offset = offset
        # note that diffs are token-level, but the indexes are byte-level
-    # TODO: Handle trailing tokens
+        for highlightRange in highlightRanges:
            highlight_start = highlightRange["start"]
            # equal bytes in between highlights
            if highlight_start > highlight_end:
                equal_bytes = highlight_bytes[
                    highlight_end : highlight_start
                ]
                n_equal_bytes = len(equal_bytes)
                yield from self.doEqual(equal_bytes, highlight_offset)
                highlight_offset['from'] += n_equal_bytes
                highlight_offset['to'] += n_equal_bytes
            # handle highlighted insert / delete
            highlight_end = highlight_start + highlightRange["length"]
            range_bytes = highlight_bytes[highlight_start:highlight_end]
            n_range_bytes = len(range_bytes)
            if highlightRange["type"] == 0:
                yield from self.doInsert(range_bytes, highlight_offset)
                highlight_offset['to'] += n_range_bytes
            elif highlightRange["type"] == 1:
                yield from self.doDelete(range_bytes, highlight_offset)
                highlight_offset['from'] += n_range_bytes
            else:
                raise Exception(entry)
        # handle the rest of the line which is equal
        if highlight_end < len(highlight_bytes):
            range_bytes = highlight_bytes[highlight_end:]
            yield from self.doEqual(range_bytes, highlight_offset)
    # raise Exception(result)
    return result
 class WikiDiffMatcher:
-    def __init__(self,
+    def __init__(
-                 url: str,
+        self,
-                 texts: list[str],
+        texts: list[str] = None,
-                 tokenizer: RegexTokenizer = None,
+        tokenizer: Optional[RegexTokenizer] = None,
-                 ):
+        url: Optional[str] = "http://127.0.0.1:8000",
    ):
        # Pre-compute diffs to reduce traffic overhead.
        self.diffs = compute_diffs(url, texts)
        self.tokenizer = tokenizer or TOKENIZER
    class Processor(DiffEngine.Processor):
-        def __init__(self,
+        def __init__(self, texts, tokenizer=None):
-                     diffs,
+            self.diffs = iter(texts)
                     tokenizer=None
                     ):
            self.diffs = iter(diffs)
            self.tokenizer = tokenizer or TOKENIZER
            self.last_tokens = []
            self.previous_text = ""
@ -196,28 +246,27 @@ class WikiDiffMatcher:
            self.last_tokens = last_tokens
        def process(self, text, token_class=None):
            # IDEs will report the method signature as incorrect, but this is
            # expected. The DiffEngine.Processor class must be inherited from,
            # and its process definition incorrectly excludes a "self" argument.
            # The diff has already been computed, but we need to incrementally
            # retrieve it to recreate the behavior DiffState expects.
            diff = next(self.diffs)
            diffToOperationsMapper = DiffToOperationMap(self.previous_text, text, diff, self.tokenizer)
            (
                operations,
                aseq,
                bseq,
            ) = list(
                zip(*diffToOperationsMapper.to_operations())
            )
-            tokens = self.tokenizer.tokenize(text, token_class=token_class)
+            self.last_tokens = list(chain.from_iterable(aseq))
-            operations = to_operations(self.previous_text, text, diff, self.tokenizer)
+            tokens = list(chain.from_iterable(bseq))
            a = self.last_tokens
            b = tokens
            self.last_tokens = tokens
            self.previous_text = text
-            return operations, a, b
+            return operations, self.last_tokens, tokens
    def processor(self, *args, **kwargs):
        return self.Processor(self.diffs, self.tokenizer)
    def process(self):
        # DiffState checks for this method even though it is not called.
        raise Exception("Unnecessary implementation")
--- a/wikidiff2_api.php
+++ b/wikidiff2_api.php
@ -17,7 +17,7 @@ $data = json_decode($rawData, true);
 $previous = '';
 $result = [];
 foreach ($data as $i => $value) {
-    $result[] = wikidiff2_inline_json_diff($previous, $value, 0);
+    $result[] = wikidiff2_inline_json_diff($previous, $value, 5000000);
    $previous = $value;
 }
--- a/9
+++ b/9
@ -140,7 +140,6 @@ The pattern can include capture groups.  If it does then each capture group will
 If the pattern does not include a capture group, then only one output column will result.
 """
 class RegexPair(object):
    def __init__(self, pattern, label):
        self.pattern = re.compile(pattern)
@ -219,7 +218,7 @@ class WikiqParser:
                 revert_radius: int = 15,
                 output_parquet: bool = True,
                 parquet_buffer_size: int = 2000,
-                 wikidiff_url: str = "",
+                 wikidiff_url: str = "http://127.0.0.1:8000",
                 ):
        """ 
@ -450,9 +449,9 @@ class WikiqParser:
                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)
                elif self.persist == PersistMethod.wikidiff:
-                    state = mwpersistence.DiffState(WikiDiffMatcher(self.wikidiff_url,
+                    state = mwpersistence.DiffState(WikiDiffMatcher(revision_texts,
-                                                                    revision_texts,
+                                                                    tokenizer=wikitext_split,
-                                                                    tokenizer=wikitext_split),
+                                                                    self.wikidiff_url),
                                                    revert_radius=PERSISTENCE_RADIUS)
                else:
                    from mw.lib import persistence