From 5a3e4102b5edc1936aa4a85f457b13f9c5d9899f Mon Sep 17 00:00:00 2001
From: Nathan TeBlunthuis
Date: Mon, 30 Jun 2025 15:37:54 -0700
Subject: [PATCH] got wikidiff2 persistence working except for paragraph moves.

---
 .gitmodules                    |   3 +
 README.rst                     |   9 +
 pyproject.toml                 |   7 +-
 wiki_diff_matcher.py           | 354 +++++++++++++++++----------
 index.php => wikidiff2_api.php |   2 +-
 wikiq                          |   9 +-
 6 files changed, 207 insertions(+), 177 deletions(-)
 rename index.php => wikidiff2_api.php (86%)

diff --git a/.gitmodules b/.gitmodules
index e69de29..171346b 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "mediawiki-php-wikidiff2"]
+	path = mediawiki-php-wikidiff2
+	url = https://github.com/wikimedia/mediawiki-php-wikidiff2/
diff --git a/README.rst b/README.rst
index 77199c8..dced9ef 100644
--- a/README.rst
+++ b/README.rst
@@ -20,6 +20,13 @@ associated tests to work.
 - 7zip
 - ffmpeg
 
+A new diff engine based on `wikidiff2`_ can be used for word persistence. Wikiq can also output the diffs between successive page revisions. This requires installing wikidiff2 on your system. On Debian or Ubuntu this can be done via:
+
+``apt-get install php-wikidiff2``
+
+You may also have to run:
+``sudo phpenmod wikidiff2``
+
 Tests
 ----
 To run tests::
@@ -30,3 +37,5 @@ TODO:
 _______________
 1. [] Output metadata about the run. What parameters were used? What versions of deltas?
 2. [] Url encoding by default
+
+.. _wikidiff2: https://www.mediawiki.org/wiki/Wikidiff2
diff --git a/pyproject.toml b/pyproject.toml
index 5bec82e..c013ed0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,7 +3,7 @@ name = "mediawiki-dump-tools"
 version = "0.1.0"
 description = "Add your description here"
 readme = "README.md"
-requires-python = "~=3.9"
+requires-python = ">=3.9"
 dependencies = [
     "deltas>=0.7.0",
     "mediawiki-utilities>=0.4.18",
@@ -18,8 +18,11 @@ dependencies = [
 [tool.uv.sources]
 yamlconf = { git = "https://github.com/groceryheist/yamlconf" }
 mwxml = { git = "https://github.com/groceryheist/python-mwxml" }
+deltas = { git = "https://github.com/groceryheist/deltas" }
 
 [dependency-groups]
 dev = [
-    "pandas>=2.1.0"
+    "pandas>=2.1.0",
+    "pytest>=8.4.1",
+    "pytest-asyncio>=1.0.0",
 ]
diff --git a/wiki_diff_matcher.py b/wiki_diff_matcher.py
index f930b2c..b807c75 100644
--- a/wiki_diff_matcher.py
+++ b/wiki_diff_matcher.py
@@ -1,8 +1,11 @@
 import json
 import sys
+from itertools import chain
+from typing import Generator, List, Optional, Tuple
 
 import requests
-from deltas import tokenizers, RegexTokenizer, DiffEngine, Equal, Insert, Delete
+from deltas import (Delete, DiffEngine, Equal, Insert, Operation,
+                    RegexTokenizer, Token, tokenizers)
 
 TOKENIZER = tokenizers.wikitext_split
 
@@ -15,7 +18,8 @@ def compute_diffs(url: str, texts: list[str]) -> list:
         incremental_diffs = response.json()
     except requests.exceptions.ConnectionError as e:
         print(
-            f"Connection Error: Could not connect to the server at {url}. Make sure your local server is running.")
+            f"Connection Error: Could not connect to the server at {url}. Make sure your local server is running."
+        )
         print(e)
         raise e
     except requests.exceptions.HTTPError as e:
@@ -34,193 +38,206 @@ def compute_diffs(url: str, texts: list[str]) -> list:
         print(f"An unexpected error occurred: {e}")
         raise e
 
+    # for diff in incremental_diffs:
+    #     for wikidiffop in json.loads(diff)["diff"][0:5]:
+    #         print(wikidiffop)
+
     return incremental_diffs
 
 
-def to_operations(from_text:str, to_text:str, diff:str, tokenizer: RegexTokenizer) -> list:
-    d = json.loads(diff)
+class DiffToOperationMap:
 
-    # the code below is designed to work in bytes because that's how wikidiff2 indexes
-    from_text = from_text.encode('utf-8')
-    to_text = to_text.encode('utf-8')
+    def __init__(self, from_text, to_text, diff, tokenizer):
 
-    # convinient function for tokenizing bytes
-    def tokenize(bytes):
-        return tokenizer.tokenize(bytes.decode('utf-8'))
+        self.tokenizer = tokenizer
+        # wikidiff2 returns a JSON string; parse it once up front
+        self.diff = json.loads(diff)
 
-    # Keep track of the last difference we saw in order to notice unaccounted-for
-    # tokens. Each token at the end of "to" which is skipped for the next diff
-    # must be represented as an "Equal()" segment.
-    from_last_end_bytes = 0
-    to_last_end_bytes = 0
+        # the code below is designed to work in bytes because that's how wikidiff2 indexes
+        self.from_bytes = from_text.encode("utf-8")
+        self.to_bytes = to_text.encode("utf-8")
 
-    result = []
-    # DiffState expects differences to be represented in order from the
-    # result's perspective ("to"), not the previous text. Thus, if a line
-    # is moved earlier then its insertion should appear before its deletion.
-    # As a rule of thumb, the "to" segments should be non-overlapping and
-    # strictly increasing, while the "from" segments should merely be
-    # non-overlapping.
-
-    # wikidiff2 appears to follow this same convention, but this behavior
-    # is not documented.
-    # Note that, confusingly for Insert operations only the "to" indexes matter
-    # and for the Delete and Equal operations only the "from" indexes matter.
-    # This is clear from reading state.py in `mwpersistence` and operations.py in `deltas`
+        # byte and token cursors into the "from" and "to" revisions
+        self.from_last_end_bytes = 0
+        self.to_last_end_bytes = 0
+        self.n_from_start_tokens = 0
+        self.n_to_start_tokens = 0
+        self.last_to_start_line = 0
+        self.last_from_start_line = 0
+
+    def tokenize(self, bytes):
+        return self.tokenizer.tokenize(bytes.decode("utf-8"))
 
-    parmove_from_dict = {} # lookup move diffs based on moveinfo id.
-    parmove_to_dict = {}
-
-    for entry in d['diff']:
-        linebytes = entry['text'].encode('utf-8')
-        from_start_line = entry['offset']['from'] # this is the first byte of the line in the 'from' revision.
-        to_start_line = entry['offset']['to'] # this is the first byte of the line in the 'to' revision.
+    def to_operations(self):
+        parmove_from_dict = {}  # lookup move diffs based on moveinfo id.
+        parmove_to_dict = {}
+        for entry in self.diff["diff"]:
+            offset = entry["offset"]
+            linebytes = entry["text"].encode("utf-8")
 
-        from_start_tokens = len(tokenize(from_text[:from_start_line]))
-        to_start_tokens = len(tokenize(to_text[:to_start_line]))
-        # These constant calls to tokenizer.tokenize can definitely be optimized
-        # as tokenization is currently a bottleneck. Ideally tokenization would
-        # happen incrementally where possible, or somehow be cached, but this
-        # would be more complex. N: I think it's okay. CPU is cheap.
+            # ignore empty diff lines; they contribute no tokens
+            if len(linebytes) == 0:
+                continue
+            # this is the first byte of the line in the 'from' revision.
+            from_start_line = entry["offset"]["from"]
+            # this is the first byte of the line in the 'to' revision.
+            to_start_line = entry["offset"]["to"]
 
-        if entry['type'] == 0: # wikidiff2 can emit this if it is called with numContextLines != 0.
-
-            line_tokens = len(tokenize(linebytes))
-            from_end_tokens = from_start_tokens + line_tokens
-            to_end_tokens = to_start_tokens + line_tokens
-
-            result.append(Equal(from_start_tokens, from_end_tokens,
-                                to_start_tokens, to_end_tokens))
-
-            # we need to keep track of the to and from last end bytes
-            from_last_end_bytes += len(linebytes)
-            to_last_end_bytes += len(linebytes)
-
-            continue
-        else:
-            # These do not appear to be generated by wikidiff2, and so must be
-            # inferred.
-            equal_tokens = to_start_tokens - to_last_end_bytes
-            # If we notice that the next non-zero segment (which must be a
-            # change, given that its type is non-zero), begins after the end
-            # of the previous segment, we must add an Equal segment.
-            # TODO: While the "to" token ranges are correct,
-            # the "from"
-            # ranges are likely not, particularly in histories with paragraph
-            # moves. they can be corrected.
-            if equal_tokens > 0:
-                # only the 'from' indexes matter
-                result.append(Equal(from_last_end_bytes, from_start_line,
-                                    to_last_end_bytes, to_start_line))
-
-        if entry['type'] == 1: # a line included in the 'to' revision, but not in the 'from' revision
-            line_tokens = len(tokenize(linebytes))
-            to_end_tokens = to_start_tokens + line_tokens
-
-            result.append(Insert(from_start_tokens, from_start_tokens,
-                                 to_start_tokens, to_end_tokens,
-                                 ))
-
-            # We have now used more of the "to" tokens.
-            to_start_end = to_end_tokens
+            if entry["type"] == 0:
+                yield from self.doEqual(linebytes, offset)
 
-        elif entry['type'] == 2: # a line included in the 'from' revision, but not in the 'to' revision
-            line_tokens = len(tokenize(linebytes))
-            from_end_tokens = from_start_tokens + line_tokens
+            # a line included in the 'to' revision, but not in the 'from' revision
+            elif entry["type"] == 1:
+                yield from self.doInsert(linebytes, offset)
 
-            result.append(Delete(from_start_tokens, from_end_tokens,
-                                 to_start_tokens, to_start_tokens,
-                                 ))
-
-            # We have now used more of the "from" tokens.
-            from_last_end_bytes = from_end_tokens
+            # a line included in the 'from' revision, but not in the 'to' revision
+            elif entry["type"] == 2:
+                yield from self.doDelete(linebytes, offset)
+
+            elif entry["type"] == 3:
+                yield from self.doHighlightRange(linebytes, entry["highlightRanges"], offset)
 
-        elif entry['type'] == 3:
-            # The text field is an overlapping mix of both the from and to,
-            # so we need to handle it highlight-by-highlight.
-            # there can be gaps between highlight segments.
-            # for instance, if a word is deleted from the middle of a line.
-            # we need to track that.
-            highlight_last_end = 0
+            # a paragraph that was moved out of its position in the 'from' revision
+            elif entry["type"] == 4:
+                parmove_from_dict[entry["moveInfo"]["id"]] = entry
 
-            # note that diffs are token-level, but the indexes are byte-level
-            for highlightRange in entry['highlightRanges']:
-                if highlightRange['start'] > highlight_last_end:
+            # the same paragraph at its new position in the 'to' revision; each
+            # type 4 diff is paired with a matching type 5 diff via moveInfo.
+            # deletions and equality report token indexes from the 'from' revision.
+            elif entry["type"] == 5:
+                parmove_to_dict[entry["moveInfo"]["id"]] = entry
-                        equal_bytes = linebytes[highlight_last_end:highlightRange['start']]
-                        equal_tokens = len(tokenize(equal_bytes))
-                        from_end_tokens = from_start_tokens + equal_tokens
-                        to_end_tokens = to_end_tokens + equal_tokens
-                        result.append(Equal(from_start_tokens, from_end_tokens,
-                                            to_start_tokens, to_end_tokens
-                                            ))
+            else:
+                # The 'type' isn't one of the known wikidiff2 diff types (0-5).
+                raise ValueError(entry)
 
-                        from_start_tokens = from_end_tokens
-                        to_start_tokens = to_end_tokens
-
-                    rangeStart = highlightRange['start']
-                    rangeEnd = rangeStart + highlightRange['length']
-                    range_bytes = linebytes[rangeStart:rangeEnd]
-                    range_tokens = len(tokenize(range_bytes))
-                    if highlightRange['type'] == 0:
-                        # Insertion
-                        to_end_tokens = to_start_tokens + range_tokens
-                        result.append(Insert(from_start_tokens, from_end_tokens,
-                                             to_start_tokens, to_end_tokens))
+
+        # mwpersistence expects differences to be represented in order from the
+        # result's perspective ("to"), not the previous text. Thus, if a line
+        # is moved earlier then its insertion should appear before its deletion.
+        # As a rule of thumb, the "to" segments should be non-overlapping and
+        # strictly increasing, while the "from" segments should merely be
+        # non-overlapping.
 
-                        to_start_tokens = to_end_tokens
-                    elif highlightRange['type'] == 1:
-                        # Deletion
-                        from_end_tokens = from_start_tokens + range_tokens
-                        result.append(Delete(from_start_tokens, from_end_tokens,
-                                             to_start_tokens, to_end_tokens))
-                        from_start_tokens = from_end_tokens
-
-                    else:
-                        raise Exception(entry)
-
-                    highlight_last_end = highlightRange['start'] + highlightRange['length']
+        # now we go through the parmoves, pairing each type 4 diff with its
+        # type 5 counterpart via moveInfo.linkId
+        for move_id, from_diff in parmove_from_dict.items():
+            to_diff = parmove_to_dict[from_diff["moveInfo"]["linkId"]]
+            # TODO: emit operations with the correct token indexes for moved paragraphs.
+
-        elif entry['type'] == 4:
+    def doEqual(self, equal_bytes, offset):
+        tokens = self.tokenize(equal_bytes)
+        n_tokens = len(tokens)
+        self.n_from_end_tokens = self.n_from_start_tokens + n_tokens
+        self.n_to_end_tokens = self.n_to_start_tokens + n_tokens
+        yield (
+            Equal(
+                self.n_from_start_tokens,
+                self.n_from_end_tokens,
+                self.n_to_start_tokens,
+                self.n_to_end_tokens,
+            ),
+            tokens,
+            tokens,
+        )
+        # we need to keep track of the to and from last end bytes
+        self.from_last_end_bytes = offset["from"] + len(equal_bytes)
+        self.to_last_end_bytes = offset["to"] + len(equal_bytes)
+        self.n_from_start_tokens += n_tokens
+        self.n_to_start_tokens += n_tokens
 
-            parmove_from_dict['moveInfo']['id'] = diff
-        elif entry['type'] == 5:
+    def doInsert(self, insert_bytes, offset):
+        tokens = self.tokenize(insert_bytes)
+        n_tokens = len(tokens)
+        self.n_to_end_tokens = self.n_to_start_tokens + n_tokens
+        yield (
+            Insert(
+                self.n_from_start_tokens,
+                self.n_from_start_tokens,
+                self.n_to_start_tokens,
+                self.n_to_end_tokens,
+            ),
+            [],
+            tokens,
+        )
+        # We have now used more of the "to" tokens.
+        self.n_to_start_tokens += n_tokens
+        self.to_last_end_bytes = offset["to"] + len(insert_bytes)
 
-            parmove_to_dict['moveInfo']['id'] = diff
-            # for type 4 diffs (paragraph moved in the from revision) we need to find a matching type 5 diff.
-            # for deletions and equality report the token indexes from the 'from' revision.
-        else:
-            # The 'type' isn't one of the known
-            raise ValueError(d)
+    def doDelete(self, delete_bytes, offset):
+        tokens = self.tokenize(delete_bytes)
+        n_tokens = len(tokens)
+        self.n_from_end_tokens = self.n_from_start_tokens + n_tokens
+        yield (
+            Delete(
+                self.n_from_start_tokens,
+                self.n_from_end_tokens,
+                self.n_to_start_tokens,
+                self.n_to_start_tokens,
+            ),
+            tokens,
+            [],
+        )
+        # We have now used more of the "from" tokens.
+        self.n_from_start_tokens += n_tokens
+        self.from_last_end_bytes = offset["from"] + len(delete_bytes)
 
-    # now we go through the parmoves
-    for id, from_diff in parmove_from_dict.items():
-        to_diff = parmove_from_dict[from_diff['moveInfo']['linkId']]
-        ### TODO calculate the correct token indexes.
+    def doHighlightRange(self, highlight_bytes, highlightRanges, offset):
+        # The text field is an overlapping mix of both the from and to,
+        # so we need to handle it highlight-by-highlight.
+        # There can be gaps between highlight segments, for instance if a
+        # word is deleted from the middle of a line, so we need to track that.
+        highlight_end = 0
+        # copy the offset so we don't mutate the entry's offset dict
+        highlight_offset = dict(offset)
+        # note that diffs are token-level, but the indexes are byte-level
 
-    # TODO: Handle trailing tokens
+        for highlightRange in highlightRanges:
+            highlight_start = highlightRange["start"]
+            # equal bytes in between highlights
+            if highlight_start > highlight_end:
+                equal_bytes = highlight_bytes[highlight_end:highlight_start]
+                n_equal_bytes = len(equal_bytes)
+                yield from self.doEqual(equal_bytes, highlight_offset)
+                highlight_offset["from"] += n_equal_bytes
+                highlight_offset["to"] += n_equal_bytes
+
+            # handle highlighted insert / delete
+            highlight_end = highlight_start + highlightRange["length"]
+            range_bytes = highlight_bytes[highlight_start:highlight_end]
+            n_range_bytes = len(range_bytes)
+            if highlightRange["type"] == 0:
+                yield from self.doInsert(range_bytes, highlight_offset)
+                highlight_offset["to"] += n_range_bytes
+            elif highlightRange["type"] == 1:
+                yield from self.doDelete(range_bytes, highlight_offset)
+                highlight_offset["from"] += n_range_bytes
+            else:
+                # the highlightRange type isn't one of the known wikidiff2 types (0 or 1)
+                raise ValueError(highlightRange)
+
+        # handle the rest of the line, which is equal
+        if highlight_end < len(highlight_bytes):
+            range_bytes = highlight_bytes[highlight_end:]
+            yield from self.doEqual(range_bytes, highlight_offset)
 
-    # raise Exception(result)
-    return result
 
 class WikiDiffMatcher:
-    def __init__(self,
-                 url: str,
-                 texts: list[str],
-                 tokenizer: RegexTokenizer = None,
-                 ):
+    def __init__(
+        self,
+        texts: Optional[list[str]] = None,
+        tokenizer: Optional[RegexTokenizer] = None,
+        url: str = "http://127.0.0.1:8000",
+    ):
         # Pre-compute diffs to reduce traffic overhead.
         self.diffs = compute_diffs(url, texts)
         self.tokenizer = tokenizer or TOKENIZER
 
     class Processor(DiffEngine.Processor):
-        def __init__(self,
-                     diffs,
-                     tokenizer=None
-                     ):
+        def __init__(self, diffs, tokenizer=None):
             self.diffs = iter(diffs)
             self.tokenizer = tokenizer or TOKENIZER
             self.last_tokens = []
             self.previous_text = ""
@@ -229,28 +246,27 @@ class WikiDiffMatcher:
             self.last_tokens = last_tokens
 
         def process(self, text, token_class=None):
-            # IDEs will report the method signature as incorrect, but this is
-            # expected. The DiffEngine.Processor class must be inherited from,
-            # and its process definition incorrectly excludes a "self" argument.
-
             # The diff has already been computed, but we need to incrementally
             # retrieve it to recreate the behavior DiffState expects.
             diff = next(self.diffs)
+            diffToOperationsMapper = DiffToOperationMap(self.previous_text, text, diff, self.tokenizer)
+            # unzip the (operation, from_tokens, to_tokens) triples
+            (
+                operations,
+                aseq,
+                bseq,
+            ) = list(
+                zip(*diffToOperationsMapper.to_operations())
+            )
 
-            tokens = self.tokenizer.tokenize(text, token_class=token_class)
-            operations = to_operations(self.previous_text, text, diff, self.tokenizer)
-
-            a = self.last_tokens
-            b = tokens
-            self.last_tokens = tokens
+            self.last_tokens = list(chain.from_iterable(aseq))
+            tokens = list(chain.from_iterable(bseq))
             self.previous_text = text
-            return operations, a, b
+            return operations, self.last_tokens, tokens
 
     def processor(self, *args, **kwargs):
         return self.Processor(self.diffs, self.tokenizer)
 
-
     def process(self):
         # DiffState checks for this method even though it is not called.
         raise Exception("Unnecessary implementation")
diff --git a/index.php b/wikidiff2_api.php
similarity index 86%
rename from index.php
rename to wikidiff2_api.php
index 858e604..a588b1c 100644
--- a/index.php
+++ b/wikidiff2_api.php
@@ -17,7 +17,7 @@ $data = json_decode($rawData, true);
 $previous = '';
 $result = [];
 foreach ($data as $i => $value) {
-    $result[] = wikidiff2_inline_json_diff($previous, $value, 0);
+    $result[] = wikidiff2_inline_json_diff($previous, $value, 5000000);
     $previous = $value;
 }
 
diff --git a/wikiq b/wikiq
index a62653a..06d8385 100755
--- a/wikiq
+++ b/wikiq
@@ -140,7 +140,6 @@ The pattern can include capture groups. If it does then each capture group will
 If the pattern does not include a capture group, then only one output column will result.
 """
 
-
 class RegexPair(object):
     def __init__(self, pattern, label):
         self.pattern = re.compile(pattern)
@@ -219,7 +218,7 @@ class WikiqParser:
                  revert_radius: int = 15,
                  output_parquet: bool = True,
                  parquet_buffer_size: int = 2000,
-                 wikidiff_url: str = "",
+                 wikidiff_url: str = "http://127.0.0.1:8000",
                  ):
 
         """
@@ -450,9 +449,9 @@ class WikiqParser:
             state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
                                             revert_radius=PERSISTENCE_RADIUS)
         elif self.persist == PersistMethod.wikidiff:
-            state = mwpersistence.DiffState(WikiDiffMatcher(self.wikidiff_url,
-                                                            revision_texts,
-                                                            tokenizer=wikitext_split),
+            state = mwpersistence.DiffState(WikiDiffMatcher(revision_texts,
+                                                            tokenizer=wikitext_split,
+                                                            url=self.wikidiff_url),
                                             revert_radius=PERSISTENCE_RADIUS)
         else:
             from mw.lib import persistence
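
Usage note: the wikidiff-based persistence path requires the wikidiff2_api.php endpoint to be reachable at the URL given to WikiDiffMatcher (http://127.0.0.1:8000 by default). The sketch below shows how the pieces in this patch fit together. It is illustrative, not part of the patch: it assumes the wikidiff2 PHP extension is available to the CLI interpreter so the endpoint can be served with PHP's built-in web server, and it drives mwpersistence's DiffState.update the same way wikiq does; the revision texts are made up.

    # Start the diff endpoint in another shell (assumes php-wikidiff2 is installed):
    #   php -S 127.0.0.1:8000 wikidiff2_api.php
    import mwpersistence
    from deltas.tokenizers import wikitext_split

    from wiki_diff_matcher import WikiDiffMatcher

    revision_texts = [
        "Apples are red.",
        "Apples are red. Bananas are yellow.",
    ]

    # WikiDiffMatcher calls the endpoint once up front to pre-compute all diffs.
    matcher = WikiDiffMatcher(revision_texts,
                              tokenizer=wikitext_split,
                              url="http://127.0.0.1:8000")
    state = mwpersistence.DiffState(matcher, revert_radius=15)

    # Feed revisions in order; each update returns the current tokens plus
    # the tokens added and removed relative to the previous revision.
    for i, text in enumerate(revision_texts):
        tokens, tokens_added, tokens_removed = state.update(text, revision=i)
        print(i, len(tokens_added), len(tokens_removed))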