diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py
index a26c387..b9e9efc 100644
--- a/test/Wikiq_Unit_Test.py
+++ b/test/Wikiq_Unit_Test.py
@@ -1,9 +1,7 @@
 import shutil
-import sys
 import unittest
 import os
 import subprocess
-from shutil import copyfile
 
 import numpy as np
 import pandas as pd
@@ -106,8 +104,8 @@ class WikiqTestCase(unittest.TestCase):
         tester = WikiqTester(IKWIKI, "noargs")
 
         try:
-            tester.call_wikiq()
-            # tester.call_wikiq("--compute-incremental-diffs")
+            # tester.call_wikiq()
+            tester.call_wikiq("--wikidiff-url=http://localhost:8000")
         except subprocess.CalledProcessError as exc:
             self.fail(exc.stderr.decode("utf8"))
 
diff --git a/wiki_diff_matcher.py b/wiki_diff_matcher.py
new file mode 100644
index 0000000..77c00ae
--- /dev/null
+++ b/wiki_diff_matcher.py
@@ -0,0 +1,223 @@
+import json
+import sys
+
+import requests
+from deltas import tokenizers, RegexTokenizer, DiffEngine, Equal, Insert, Delete
+
+TOKENIZER = tokenizers.text_split
+
+def compute_diffs(url: str, texts: list[str]) -> list:
+    response = None
+    try:
+        response = requests.post(url, json=texts)
+        response.raise_for_status()
+        incremental_diffs = response.json()
+    except requests.exceptions.ConnectionError as e:
+        print(f"Connection Error: Could not connect to the server at {url}. "
+              "Make sure your local server is running.", file=sys.stderr)
+        print(e, file=sys.stderr)
+        raise e
+    except requests.exceptions.HTTPError as e:
+        print(f"HTTP Error: {e}", file=sys.stderr)
+        if response is not None:
+            print(f"Response Body: {response.text}", file=sys.stderr)
+        raise e
+    except requests.exceptions.JSONDecodeError as e:
+        # Must come before RequestException, as JSONDecodeError is
+        # a subclass of it.
+        print(f"JSON Decode Error: {e}", file=sys.stderr)
+        if response is not None:
+            print(f"Response Body: {response.text}", file=sys.stderr)
+        raise e
+    except requests.exceptions.RequestException as e:
+        print(f"An unexpected error occurred: {e}", file=sys.stderr)
+        raise e
+
+    return incremental_diffs
+
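+# For illustration only: the diff server's response format is assumed here,
+# not a documented contract. It is expected to be a list with one wikidiff2
+# inline-JSON document per revision, each a string along the lines of:
+#
+#   '{"diff": [{"type": 3,
+#               "offset": {"from": 100, "to": 100},
+#               "text": "An edited line.",
+#               "highlightRanges": [{"start": 3, "length": 6, "type": 0}]}]}'
+#
+# These are exactly the fields consumed by to_operations() below.
+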
+
+def to_operations(previous_text, next_text, diff, tokenizer: RegexTokenizer) -> list:
+    d = json.loads(diff)
+
+    # Keep track of the last difference we saw in order to notice unaccounted-for
+    # tokens. Each token at the end of "to" which is skipped for the next diff
+    # must be represented as an "Equal()" segment.
+    from_last_end = 0
+    to_last_end = 0
+
+    result = []
+    # DiffState expects differences to be represented in order from the
+    # result's perspective ("to"), not the previous text. Thus, if a line
+    # is moved earlier, its insertion should appear before its deletion.
+    # As a rule of thumb, the "to" segments should be non-overlapping and
+    # strictly increasing, while the "from" segments should merely be
+    # non-overlapping.
+    #
+    # wikidiff2 appears to follow this same convention, but the behavior
+    # is not documented.
+
+    for entry in d['diff']:
+        from_start_line = entry['offset']['from']
+        to_start_line = entry['offset']['to']
+        # Per above, to_start_line appears to be nondecreasing, but
+        # from_start_line may sometimes decrease for detected paragraph moves.
+
+        from_start_tokens = len(tokenizer.tokenize(previous_text[:from_start_line]))
+        to_start_tokens = len(tokenizer.tokenize(next_text[:to_start_line]))
+        # These repeated calls to tokenizer.tokenize can definitely be optimized,
+        # as tokenization is currently a bottleneck. Ideally tokenization would
+        # happen incrementally where possible, or somehow be cached, but this
+        # would be more complex.
+
+        if entry['type'] == 0:
+            # wikidiff2 doesn't appear to emit diffs of this type, but cover them anyway.
+            line_tokens = len(tokenizer.tokenize(entry['text']))
+            from_end_tokens = from_start_tokens + line_tokens
+            to_end_tokens = to_start_tokens + line_tokens
+
+            result.append(Equal(from_start_tokens, from_end_tokens,
+                                to_start_tokens, to_end_tokens))
+
+            from_last_end = from_end_tokens
+            to_last_end = to_end_tokens
+
+            continue
+        else:
+            # Equal segments do not appear to be generated by wikidiff2, and
+            # so must be inferred.
+            equal_tokens = to_start_tokens - to_last_end
+            # If the next non-zero segment (which must be a change, given
+            # that its type is non-zero) begins after the end of the previous
+            # segment, we must add an Equal segment.
+            # TODO: While the "to" token ranges are correct, the "from"
+            # ranges are likely not, particularly in histories with paragraph
+            # moves.
+            if equal_tokens > 0:
+                result.append(Equal(from_last_end, from_start_tokens,
+                                    to_last_end, to_start_tokens))
+
+        if entry['type'] == 1 or entry['type'] == 4:
+            # TODO: Separate out type 4 to recognize that it is the insertion
+            # part of a paragraph move. Note that for paragraph moves
+            # the text is not necessarily identical, just similar.
+            line_tokens = len(tokenizer.tokenize(entry['text']))
+            to_end_tokens = to_start_tokens + line_tokens
+
+            result.append(Insert(from_start_tokens, from_start_tokens,
+                                 to_start_tokens, to_end_tokens))
+
+            # We have now used more of the "to" tokens.
+            to_last_end = to_end_tokens
+        elif entry['type'] == 2 or entry['type'] == 5:
+            # TODO: Separate out type 5 to recognize that it is the deletion
+            # part of a paragraph move. Note that for paragraph moves
+            # the text is not necessarily identical, just similar.
+            line_tokens = len(tokenizer.tokenize(entry['text']))
+            from_end_tokens = from_start_tokens + line_tokens
+
+            result.append(Delete(from_start_tokens, from_end_tokens,
+                                 to_start_tokens, to_start_tokens))
+
+            # We have now used more of the "from" tokens.
+            from_last_end = from_end_tokens
+        elif entry['type'] == 3:
+            # The text field is an overlapping mix of both the previous and
+            # next lines, so we can't tokenize it directly. Instead, walk the
+            # highlightRanges and emit a segment for each slice of the text.
+            text = entry['text']
+
+            last_end = 0
+            from_end_tokens = from_start_tokens
+            to_end_tokens = to_start_tokens
+
+            # A line will have one or more highlightRanges.
+            # It is not guaranteed that insertions/deletions are matched;
+            # for instance, a word may simply be deleted from the middle of
+            # a line.
+            for highlightRange in entry['highlightRanges']:
+                if highlightRange['start'] > last_end:
+                    # Text between ranges is common to both lines: an Equal segment.
+                    common_tokens = len(tokenizer.tokenize(text[last_end:highlightRange['start']]))
+                    result.append(Equal(from_end_tokens, from_end_tokens + common_tokens,
+                                        to_end_tokens, to_end_tokens + common_tokens))
+                    from_end_tokens += common_tokens
+                    to_end_tokens += common_tokens
+
+                rangeStart = highlightRange['start']
+                rangeEnd = rangeStart + highlightRange['length']
+                range_tokens = len(tokenizer.tokenize(text[rangeStart:rangeEnd]))
+
+                if highlightRange['type'] == 0:
+                    # Insertion: the range appears only in the next line.
+                    result.append(Insert(from_end_tokens, from_end_tokens,
+                                         to_end_tokens, to_end_tokens + range_tokens))
+                    to_end_tokens += range_tokens
+                elif highlightRange['type'] == 1:
+                    # Deletion: the range appears only in the previous line.
+                    result.append(Delete(from_end_tokens, from_end_tokens + range_tokens,
+                                         to_end_tokens, to_end_tokens))
+                    from_end_tokens += range_tokens
+                else:
+                    raise Exception(entry)
+
+                last_end = rangeEnd
+
+            # Any text after the last highlightRange is common to both lines.
+            if last_end < len(text):
+                trailing_tokens = len(tokenizer.tokenize(text[last_end:]))
+                result.append(Equal(from_end_tokens, from_end_tokens + trailing_tokens,
+                                    to_end_tokens, to_end_tokens + trailing_tokens))
+                from_end_tokens += trailing_tokens
+                to_end_tokens += trailing_tokens
+
+            from_last_end = from_end_tokens
+            to_last_end = to_end_tokens
+        else:
+            # The 'type' isn't one of the six known wikidiff2 entry types.
+            raise ValueError(d)
+
+    # TODO: Handle trailing tokens after the last diff entry.
+
+    return result
+
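+# A worked example (illustrative; token counts assume the default text_split
+# tokenizer, which emits words and whitespace as separate tokens). Diffing
+# "a b c" -> "a b d c", where wikidiff2 reports a single type-3 entry with
+# text "a b d c" and one insertion highlightRange covering "d " (start=4,
+# length=2), would yield roughly:
+#
+#   [Equal(0, 4, 0, 4), Insert(4, 4, 4, 6), Equal(4, 5, 6, 7)]
+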
+
+class WikiDiffMatcher:
+    def __init__(self,
+                 url: str,
+                 texts: list[str],
+                 tokenizer: RegexTokenizer = None,
+                 ):
+        # Pre-compute diffs to reduce traffic overhead.
+        self.diffs = compute_diffs(url, texts)
+        self.tokenizer = tokenizer or TOKENIZER
+
+    class Processor(DiffEngine.Processor):
+        def __init__(self,
+                     diffs,
+                     tokenizer=None
+                     ):
+            self.diffs = iter(diffs)
+            self.tokenizer = tokenizer or TOKENIZER
+            self.last_tokens = []
+            self.previous_text = ""
+
+        def update(self, last_tokens):
+            self.last_tokens = last_tokens
+
+        def process(self, text, token_class=None):
+            # IDEs will report the method signature as incorrect, but this is
+            # expected. The DiffEngine.Processor class must be inherited from,
+            # and its process definition incorrectly excludes a "self" argument.
+
+            # The diff has already been computed, but we need to retrieve it
+            # incrementally to recreate the behavior DiffState expects.
+            diff = next(self.diffs)
+
+            tokens = self.tokenizer.tokenize(text, token_class=token_class)
+            operations = to_operations(self.previous_text, text, diff, self.tokenizer)
+
+            a = self.last_tokens
+            b = tokens
+            self.last_tokens = tokens
+            self.previous_text = text
+
+            return operations, a, b
+
+    def processor(self, *args, **kwargs):
+        return self.Processor(self.diffs, self.tokenizer)
+
+    def process(self):
+        # DiffState checks for this method even though it never calls it.
+        raise Exception("Unnecessary implementation")
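+
+# Typical usage, mirroring how wikiq wires this up below (a sketch: the URL
+# is an assumption, and DiffState drives Processor.process() one revision at
+# a time):
+#
+#   from deltas.tokenizers import wikitext_split
+#   import mwpersistence
+#
+#   matcher = WikiDiffMatcher("http://localhost:8000", revision_texts,
+#                             tokenizer=wikitext_split)
+#   state = mwpersistence.DiffState(matcher, revert_radius=7)
+#   for text in revision_texts:
+#       state.update(text)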
diff --git a/wikiq b/wikiq
index 3ed9d0e..a62653a 100755
--- a/wikiq
+++ b/wikiq
@@ -17,7 +17,6 @@ from hashlib import sha1
 from typing import Any, IO, TextIO, Generator, Union
 
 import mwxml
-import requests
 from mwxml import Dump
 from deltas.tokenizers import wikitext_split
 
@@ -26,6 +25,7 @@ import mwreverts
 
 import tables
 from tables import RevisionTable
+from wiki_diff_matcher import WikiDiffMatcher
 
 TO_ENCODE = ('title', 'editor')
 PERSISTENCE_RADIUS = 7
@@ -43,6 +43,7 @@ class PersistMethod:
     sequence = 1
     segment = 2
     legacy = 3
+    wikidiff = 4
 
 
 def calculate_persistence(tokens_added):
@@ -218,7 +219,7 @@ class WikiqParser:
                  revert_radius: int = 15,
                  output_parquet: bool = True,
                  parquet_buffer_size: int = 2000,
-                 compute_incremental_diffs: bool = False,
+                 wikidiff_url: str = "",
                  ):
 
         """
@@ -231,7 +232,7 @@ class WikiqParser:
         self.persist: int = persist
         self.namespaces = []
         self.revert_radius = revert_radius
-        self.compute_incremental_diffs: bool = compute_incremental_diffs
+        self.wikidiff_url: str = wikidiff_url
 
         if namespaces is not None:
             self.namespace_filter = set(namespaces)
@@ -367,9 +368,6 @@ class WikiqParser:
             schema = schema.append(pa.field('tokens_removed', pa.int64(), nullable=True))
             schema = schema.append(pa.field('tokens_window', pa.int64(), nullable=True))
 
-        if self.compute_incremental_diffs:
-            schema = schema.append(pa.field('incremental diffs', pa.string()))
-
         if self.output_parquet:
             writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
         else:
@@ -379,7 +377,7 @@ class WikiqParser:
 
         # Iterate through pages
         for page in dump:
-            payload = []
+            revision_texts = []
 
             # skip namespaces not in the filter
             if self.namespace_filter is not None:
@@ -419,36 +417,10 @@ class WikiqParser:
                         regex_matches[k] = []
                     regex_matches[k].append(v)
 
-                if self.compute_incremental_diffs:
-                    payload.append(rev.text)
+                revision_texts.append(rev.text)
 
             # Collect the set of pages currently buffered in the table so we can run multi-page functions on them.
             row_buffer = table.pop()
 
-            if self.compute_incremental_diffs:
-                try:
-                    response = requests.post(DIFFS_URL, json=payload)
-                    response.raise_for_status()
-                    incremental_diffs = response.json()
-                except requests.exceptions.ConnectionError as e:
-                    print(
-                        f"Connection Error: Could not connect to the server at {DIFFS_URL}. Make sure your local server is running.")
-                    print(e)
-                    raise e
-                except requests.exceptions.HTTPError as e:
-                    print(f"HTTP Error: {e}")
-                    print(f"Response Body: {response.text}")
-                    raise e
-                except requests.exceptions.JSONDecodeError as e:
-                    # Must come before RequestException as JSONDecodeError is
-                    # a subclass.
-                    print(f"JSON Decode Error: {e}", file=sys.stderr)
-                    print(f"Response Body: {response.text}", file=sys.stderr)
-                    raise e
-                except requests.exceptions.RequestException as e:
-                    print(f"An unexpected error occurred: {e}")
-                    raise e
-
-                row_buffer['incremental diffs'] = incremental_diffs
 
             is_revert_column: list[Union[bool, None]] = []
             for r, d in zip(row_buffer['reverteds'], row_buffer['deleted']):
@@ -477,6 +449,11 @@ class WikiqParser:
             elif self.persist == PersistMethod.segment:
                 state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
                                                 revert_radius=PERSISTENCE_RADIUS)
+            elif self.persist == PersistMethod.wikidiff:
+                state = mwpersistence.DiffState(WikiDiffMatcher(self.wikidiff_url,
+                                                                revision_texts,
+                                                                tokenizer=wikitext_split),
+                                                revert_radius=PERSISTENCE_RADIUS)
             else:
                 from mw.lib import persistence
                 state = persistence.State()
@@ -614,20 +591,22 @@ def main():
                         action='store_true',
                         help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.")
 
-    parser.add_argument('--compute-incremental-diffs', dest="compute_incremental_diffs",
-                        action='store_true',
-                        help="Compute and store incremental diffs by edit session.")
+    parser.add_argument('--wikidiff-url', dest="wikidiff_url",
+                        action='store',
+                        help="The URL of a server running wikidiff2; supplying it enables wikidiff persistence.")
 
     args = parser.parse_args()
 
     # set persistence method
-    if args.persist is None:
+    # Note: a provided --wikidiff-url implies wikidiff persistence unless
+    # --persist segment or --persist legacy is given explicitly.
+    if args.persist is None and not args.wikidiff_url:
         persist = PersistMethod.none
     elif args.persist == "segment":
         persist = PersistMethod.segment
    elif args.persist == "legacy":
         persist = PersistMethod.legacy
+    elif args.wikidiff_url:
+        persist = PersistMethod.wikidiff
     else:
         persist = PersistMethod.sequence
 
@@ -670,7 +649,7 @@ def main():
                         regex_match_comment=args.regex_match_comment,
                         regex_comment_label=args.regex_comment_label,
                         output_parquet=output_parquet,
-                        compute_incremental_diffs=args.compute_incremental_diffs,
+                        wikidiff_url=args.wikidiff_url,
                     )
 
     wikiq.process()