wikidiff2 integration: pwr complete.

Add a test for pwr based on wikidiff2.
This commit is contained in:
Nathan TeBlunthuis 2025-07-07 12:06:43 -07:00
parent 58c595bf0b
commit a8e9e7f4fd
4 changed files with 4691 additions and 32 deletions

View File

@ -104,8 +104,7 @@ class WikiqTestCase(unittest.TestCase):
tester = WikiqTester(IKWIKI, "noargs") tester = WikiqTester(IKWIKI, "noargs")
try: try:
# tester.call_wikiq() tester.call_wikiq()
tester.call_wikiq("--wikidiff-url=http://localhost:8000")
except subprocess.CalledProcessError as exc: except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8")) self.fail(exc.stderr.decode("utf8"))
@ -194,6 +193,18 @@ class WikiqTestCase(unittest.TestCase):
baseline = pd.read_table(tester.baseline_file) baseline = pd.read_table(tester.baseline_file)
assert_frame_equal(test, baseline, check_like=True) assert_frame_equal(test, baseline, check_like=True)
def test_pwr_wikidiff2(self):
    # Run wikiq with wikidiff2-based persistence on the 7z-compressed
    # SAILORMOON input and compare the result with the stored baseline.
    tester = WikiqTester(
        SAILORMOON,
        "persistence_wikidiff2",
        in_compression="7z",
    )

    try:
        tester.call_wikiq("--persistence wikidiff2", "--fandom-2020")
    except subprocess.CalledProcessError as err:
        # Report the subprocess's stderr as the test failure message.
        self.fail(err.stderr.decode("utf8"))

    actual = pd.read_table(tester.output)
    expected = pd.read_table(tester.baseline_file)

    # check_like=True: ignore row/column ordering when comparing the frames.
    assert_frame_equal(actual, expected, check_like=True)
def test_pwr_segment(self): def test_pwr_segment(self):
tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z") tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z")

File diff suppressed because it is too large Load Diff

View File

@ -6,11 +6,13 @@ from typing import Dict, Generator, List, Optional, Tuple
from deltas import (Delete, DiffEngine, Equal, Insert, Operation, from deltas import (Delete, DiffEngine, Equal, Insert, Operation,
RegexTokenizer, Token, tokenizers) RegexTokenizer, Token, tokenizers)
from mwpersistence import Token
from sortedcontainers import SortedDict from sortedcontainers import SortedDict
TOKENIZER = tokenizers.wikitext_split TOKENIZER = tokenizers.wikitext_split
import pywikidiff2 import pywikidiff2
class DiffToOperationMap: class DiffToOperationMap:
def __init__(self, diff, tokenizer): def __init__(self, diff, tokenizer):
self.tokenizer = tokenizer self.tokenizer = tokenizer
@ -23,7 +25,7 @@ class DiffToOperationMap:
self.from_linenumber_bytes_map: SortedDict[int, int] = SortedDict() self.from_linenumber_bytes_map: SortedDict[int, int] = SortedDict()
def tokenize(self, bytes): def tokenize(self, bytes):
return self.tokenizer.tokenize(bytes.decode("utf-8")) return self.tokenizer.tokenize(bytes.decode("utf-8"), token_class=Token)
def to_operations(self): def to_operations(self):
for entry in self.diff["diff"]: for entry in self.diff["diff"]:
@ -148,8 +150,8 @@ class DiffToOperationMap:
{ {
"text": to_diff["text"], "text": to_diff["text"],
"highlightRanges": to_diff["highlightRanges"], "highlightRanges": to_diff["highlightRanges"],
'offset': offset, "offset": offset,
'lineNumber': to_diff["lineNumber"], "lineNumber": to_diff["lineNumber"],
} }
) )
diffops = [ diffops = [
@ -247,7 +249,9 @@ class DiffToOperationMap:
raise ValueError(delete_segment) raise ValueError(delete_segment)
tokens = self.tokenize(delete_bytes) tokens = self.tokenize(delete_bytes)
if lineNumber is not None: if lineNumber is not None:
self.from_linenumber_bytes_map[lineNumber] = offset["from"] + len(delete_bytes) self.from_linenumber_bytes_map[lineNumber] = offset["from"] + len(
delete_bytes
)
yield ( yield (
Delete(offset["from"], None, None, None), Delete(offset["from"], None, None, None),
@ -355,8 +359,8 @@ class DiffToOperationMap:
{ {
"text": to_diff["text"], "text": to_diff["text"],
"highlightRanges": to_diff["highlightRanges"], "highlightRanges": to_diff["highlightRanges"],
'offset': offset, "offset": offset,
'lineNumber': to_diff["lineNumber"], "lineNumber": to_diff["lineNumber"],
} }
) )
@ -367,11 +371,11 @@ class WikiDiffMatcher:
texts: list[str] = None, texts: list[str] = None,
tokenizer: Optional[RegexTokenizer] = None, tokenizer: Optional[RegexTokenizer] = None,
): ):
differ = pywikidiff2.pywikidiff2(numContextLines=1000000, differ = pywikidiff2.pywikidiff2(
moved_paragraph_detection_cutoff=200000) numContextLines=1000000, moved_paragraph_detection_cutoff=200000
)
# Pre-compute diffs to reduce traffic overhead. # Pre-compute diffs to reduce traffic overhead.
self.diffs = differ.inline_json_diff_sequence(list(texts)) self.diffs = differ.inline_json_diff_sequence(list(texts))
self.tokenizer = tokenizer or TOKENIZER self.tokenizer = tokenizer or TOKENIZER
class Processor(DiffEngine.Processor): class Processor(DiffEngine.Processor):
@ -394,7 +398,7 @@ class WikiDiffMatcher:
# this happens when revisions are actually equal. # this happens when revisions are actually equal.
if len(diffops) == 0: if len(diffops) == 0:
self.last_tokens = self.tokenizer.tokenize(text) self.last_tokens = self.tokenizer.tokenize(text, token_class=Token)
ops = [Equal(0, len(self.last_tokens), 0, len(self.last_tokens))] ops = [Equal(0, len(self.last_tokens), 0, len(self.last_tokens))]
return ops, self.last_tokens, self.last_tokens return ops, self.last_tokens, self.last_tokens

29
wikiq
View File

@ -35,16 +35,12 @@ import pyarrow as pa
import pyarrow.parquet as pq import pyarrow.parquet as pq
import pyarrow.csv as pacsv import pyarrow.csv as pacsv
DIFFS_URL = 'http://localhost:8000'
class PersistMethod: class PersistMethod:
none = 0 none = 0
sequence = 1 sequence = 1
segment = 2 segment = 2
legacy = 3 legacy = 3
wikidiff = 4 wikidiff2 = 4
def calculate_persistence(tokens_added): def calculate_persistence(tokens_added):
return (sum([(len(x.revisions) - 1) for x in tokens_added]), return (sum([(len(x.revisions) - 1) for x in tokens_added]),
@ -217,8 +213,7 @@ class WikiqParser:
namespaces: Union[list[int], None] = None, namespaces: Union[list[int], None] = None,
revert_radius: int = 15, revert_radius: int = 15,
output_parquet: bool = True, output_parquet: bool = True,
parquet_buffer_size: int = 2000, parquet_buffer_size: int = 2000
wikidiff_url: str = "http://127.0.0.1:8000",
): ):
""" """
@ -231,7 +226,6 @@ class WikiqParser:
self.persist: int = persist self.persist: int = persist
self.namespaces = [] self.namespaces = []
self.revert_radius = revert_radius self.revert_radius = revert_radius
self.wikidiff_url: str = wikidiff_url
if namespaces is not None: if namespaces is not None:
self.namespace_filter = set(namespaces) self.namespace_filter = set(namespaces)
@ -448,10 +442,10 @@ class WikiqParser:
elif self.persist == PersistMethod.segment: elif self.persist == PersistMethod.segment:
state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split), state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
revert_radius=PERSISTENCE_RADIUS) revert_radius=PERSISTENCE_RADIUS)
elif self.persist == PersistMethod.wikidiff: elif self.persist == PersistMethod.wikidiff2:
state = mwpersistence.DiffState(WikiDiffMatcher(revision_texts, state = mwpersistence.DiffState(WikiDiffMatcher(revision_texts,
tokenizer=wikitext_split, tokenizer=wikitext_split,
self.wikidiff_url), ),
revert_radius=PERSISTENCE_RADIUS) revert_radius=PERSISTENCE_RADIUS)
else: else:
from mw.lib import persistence from mw.lib import persistence
@ -557,8 +551,8 @@ def main():
help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.") help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")
parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str,
choices=['', 'segment', 'sequence', 'legacy'], nargs='?', choices=['', 'wikidiff2', 'segment', 'sequence', 'legacy'], nargs='?',
help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.") help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. The default is no persistence. -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. -p=segment attempts advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower. -p=wikidiff2 is like segment, but uses the wikidiff2 algorithm, which (should be) faster and more robust.")
parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append', parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
help="Id number of namespace to include. Can be specified more than once.") help="Id number of namespace to include. Can be specified more than once.")
@ -590,22 +584,18 @@ def main():
action='store_true', action='store_true',
help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.") help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.")
parser.add_argument('--wikidiff-url', dest="wikidiff_url",
action='store',
help="The URL to a server running WikiDiff2.")
args = parser.parse_args() args = parser.parse_args()
# set persistence method # set persistence method
if args.persist is None and not args.wikidiff_url: if args.persist is None:
persist = PersistMethod.none persist = PersistMethod.none
elif args.persist == "segment": elif args.persist == "segment":
persist = PersistMethod.segment persist = PersistMethod.segment
elif args.persist == "legacy": elif args.persist == "legacy":
persist = PersistMethod.legacy persist = PersistMethod.legacy
elif args.wikidiff_url: elif args.persist == "wikidiff2":
persist = PersistMethod.wikidiff persist = PersistMethod.wikidiff2
else: else:
persist = PersistMethod.sequence persist = PersistMethod.sequence
@ -648,7 +638,6 @@ def main():
regex_match_comment=args.regex_match_comment, regex_match_comment=args.regex_match_comment,
regex_comment_label=args.regex_comment_label, regex_comment_label=args.regex_comment_label,
output_parquet=output_parquet, output_parquet=output_parquet,
wikidiff_url=args.wikidiff_url,
) )
wikiq.process() wikiq.process()