wikidiff2 integration: pwr complete.

test for pwr based on wikidiff2.
Nathan TeBlunthuis 2025-07-07 12:06:43 -07:00
parent 58c595bf0b
commit a8e9e7f4fd
4 changed files with 4691 additions and 32 deletions
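For orientation, a minimal sketch of how the new persistence mode is driven from the command line. The dump filename and the positional-argument form are assumptions for illustration; the flags themselves are the ones this commit wires up (see the test and argparse changes below):

# Hedged sketch: invoke wikiq with the new wikidiff2 persistence mode.
# "dump.xml.7z" is a placeholder input path, not from this commit.
import subprocess

subprocess.run(
    ["python3", "wikiq", "dump.xml.7z", "--persistence", "wikidiff2", "--fandom-2020"],
    check=True,
)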


@@ -104,8 +104,7 @@ class WikiqTestCase(unittest.TestCase):
         tester = WikiqTester(IKWIKI, "noargs")
         try:
-            # tester.call_wikiq()
-            tester.call_wikiq("--wikidiff-url=http://localhost:8000")
+            tester.call_wikiq()
         except subprocess.CalledProcessError as exc:
             self.fail(exc.stderr.decode("utf8"))
@@ -194,6 +193,18 @@ class WikiqTestCase(unittest.TestCase):
         baseline = pd.read_table(tester.baseline_file)
         assert_frame_equal(test, baseline, check_like=True)

+    def test_pwr_wikidiff2(self):
+        tester = WikiqTester(SAILORMOON, "persistence_wikidiff2", in_compression="7z")
+
+        try:
+            tester.call_wikiq("--persistence wikidiff2", "--fandom-2020")
+        except subprocess.CalledProcessError as exc:
+            self.fail(exc.stderr.decode("utf8"))
+
+        test = pd.read_table(tester.output)
+        baseline = pd.read_table(tester.baseline_file)
+        assert_frame_equal(test, baseline, check_like=True)
+
     def test_pwr_segment(self):
         tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z")

File diff suppressed because it is too large


@@ -6,11 +6,13 @@ from typing import Dict, Generator, List, Optional, Tuple
 from deltas import (Delete, DiffEngine, Equal, Insert, Operation,
                     RegexTokenizer, Token, tokenizers)
+from mwpersistence import Token
 from sortedcontainers import SortedDict

 TOKENIZER = tokenizers.wikitext_split

+import pywikidiff2

 class DiffToOperationMap:
     def __init__(self, diff, tokenizer):
         self.tokenizer = tokenizer
@@ -23,7 +25,7 @@ class DiffToOperationMap:
         self.from_linenumber_bytes_map: SortedDict[int, int] = SortedDict()

     def tokenize(self, bytes):
-        return self.tokenizer.tokenize(bytes.decode("utf-8"))
+        return self.tokenizer.tokenize(bytes.decode("utf-8"), token_class=Token)

     def to_operations(self):
         for entry in self.diff["diff"]:
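The tokenize call sites now pass token_class=Token so the tokenizer emits mwpersistence Token objects, which carry the per-token revision history that persistence accounting needs. A minimal sketch of the idea, assuming only the imports visible in this diff:

from deltas import tokenizers
from mwpersistence import Token

# token_class swaps the class the tokenizer instantiates, so downstream
# mwpersistence code receives tokens that can record surviving revisions.
tokens = tokenizers.wikitext_split.tokenize("Some [[wiki]] text.", token_class=Token)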
@@ -148,8 +150,8 @@ class DiffToOperationMap:
                 {
                     "text": to_diff["text"],
                     "highlightRanges": to_diff["highlightRanges"],
-                    'offset': offset,
-                    'lineNumber': to_diff["lineNumber"],
+                    "offset": offset,
+                    "lineNumber": to_diff["lineNumber"],
                 }
             )

         diffops = [
@@ -247,7 +249,9 @@ class DiffToOperationMap:
                 raise ValueError(delete_segment)
             tokens = self.tokenize(delete_bytes)
             if lineNumber is not None:
-                self.from_linenumber_bytes_map[lineNumber] = offset["from"] + len(delete_bytes)
+                self.from_linenumber_bytes_map[lineNumber] = offset["from"] + len(
+                    delete_bytes
+                )
             yield (
                 Delete(offset["from"], None, None, None),
@@ -355,8 +359,8 @@ class DiffToOperationMap:
                 {
                     "text": to_diff["text"],
                     "highlightRanges": to_diff["highlightRanges"],
-                    'offset': offset,
-                    'lineNumber': to_diff["lineNumber"],
+                    "offset": offset,
+                    "lineNumber": to_diff["lineNumber"],
                 }
             )
@@ -367,11 +371,11 @@ class WikiDiffMatcher:
         texts: list[str] = None,
         tokenizer: Optional[RegexTokenizer] = None,
     ):
-        differ = pywikidiff2.pywikidiff2(numContextLines=1000000,
-                                         moved_paragraph_detection_cutoff=200000)
+        differ = pywikidiff2.pywikidiff2(
+            numContextLines=1000000, moved_paragraph_detection_cutoff=200000
+        )
         # Pre-compute diffs to reduce traffic overhead.
         self.diffs = differ.inline_json_diff_sequence(list(texts))
         self.tokenizer = tokenizer or TOKENIZER

     class Processor(DiffEngine.Processor):
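WikiDiffMatcher now builds a native pywikidiff2 differ once and precomputes every adjacent-revision diff in a single call, rather than round-tripping to an external wikidiff2 server. A sketch of that pattern, using only the calls visible above (the comments on the parameters are my reading of the values, not documented behavior):

import pywikidiff2

# A very large numContextLines presumably keeps full context in the output,
# and the high cutoff keeps moved-paragraph detection enabled on big pages.
differ = pywikidiff2.pywikidiff2(
    numContextLines=1000000, moved_paragraph_detection_cutoff=200000
)
revisions = ["First revision text.", "First revision text, then a second."]
diffs = differ.inline_json_diff_sequence(revisions)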
@@ -394,7 +398,7 @@ class WikiDiffMatcher:
             # this happens when revisions are actually equal.
             if len(diffops) == 0:
-                self.last_tokens = self.tokenizer.tokenize(text)
+                self.last_tokens = self.tokenizer.tokenize(text, token_class=Token)
                 ops = [Equal(0, len(self.last_tokens), 0, len(self.last_tokens))]
                 return ops, self.last_tokens, self.last_tokens

wikiq

@@ -35,16 +35,12 @@ import pyarrow as pa
 import pyarrow.parquet as pq
 import pyarrow.csv as pacsv

-DIFFS_URL = 'http://localhost:8000'

 class PersistMethod:
     none = 0
     sequence = 1
     segment = 2
     legacy = 3
-    wikidiff = 4
+    wikidiff2 = 4

 def calculate_persistence(tokens_added):
     return (sum([(len(x.revisions) - 1) for x in tokens_added]),
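The context lines above show the persistence measure itself: each added token records the revisions it survived, and summing len(x.revisions) - 1 over the added tokens yields persistent token revisions. A toy illustration with a stand-in token class (not the mwpersistence one):

class FakeToken:
    # Stand-in for illustration only; real tokens come from mwpersistence.
    def __init__(self, revisions):
        self.revisions = revisions

# Two tokens added in revision 1; one survived two later revisions, one none.
tokens_added = [FakeToken([1, 2, 3]), FakeToken([1])]
persistent_token_revisions = sum(len(t.revisions) - 1 for t in tokens_added)
assert persistent_token_revisions == 2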
@@ -217,8 +213,7 @@ class WikiqParser:
                  namespaces: Union[list[int], None] = None,
                  revert_radius: int = 15,
                  output_parquet: bool = True,
-                 parquet_buffer_size: int = 2000,
-                 wikidiff_url: str = "http://127.0.0.1:8000",
+                 parquet_buffer_size: int = 2000
                  ):
         """
@@ -231,7 +226,6 @@ class WikiqParser:
         self.persist: int = persist
         self.namespaces = []
         self.revert_radius = revert_radius
-        self.wikidiff_url: str = wikidiff_url

         if namespaces is not None:
             self.namespace_filter = set(namespaces)
@@ -448,10 +442,10 @@ class WikiqParser:
         elif self.persist == PersistMethod.segment:
             state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
                                             revert_radius=PERSISTENCE_RADIUS)
-        elif self.persist == PersistMethod.wikidiff:
+        elif self.persist == PersistMethod.wikidiff2:
             state = mwpersistence.DiffState(WikiDiffMatcher(revision_texts,
                                                             tokenizer=wikitext_split,
-                                                            self.wikidiff_url),
+                                                            ),
                                             revert_radius=PERSISTENCE_RADIUS)
         else:
             from mw.lib import persistence
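For reference, a sketch of how the wikidiff2-backed matcher plugs into mwpersistence, mirroring the branch above. revision_texts and the radius value are stand-ins for what wikiq supplies (it passes PERSISTENCE_RADIUS), and the three-tuple return of DiffState.update is my understanding of the mwpersistence API:

import mwpersistence
from deltas.tokenizers import wikitext_split

revision_texts = ["First revision.", "First revision, edited."]
state = mwpersistence.DiffState(
    WikiDiffMatcher(revision_texts, tokenizer=wikitext_split),
    revert_radius=15,  # wikiq uses PERSISTENCE_RADIUS here
)
for text in revision_texts:
    tokens, tokens_added, tokens_removed = state.update(text)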
@@ -557,8 +551,8 @@ def main():
                         help="Operate only on the final revision made by a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")
     parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str,
-                        choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
-                        help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")
+                        choices=['', 'wikidiff2', 'segment', 'sequence', 'legacy'], nargs='?',
+                        help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure. This may be slow. The default is no persistence. -p=sequence uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for the old behavior used in older research projects. -p=segment attempts an advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower. -p=wikidiff2 is like segment, but uses the wikidiff2 algorithm, which (should be) faster and more robust.")
     parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                         help="Id number of namespace to include. Can be specified more than once.")
@@ -590,22 +584,18 @@ def main():
                         action='store_true',
                         help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.")

-    parser.add_argument('--wikidiff-url', dest="wikidiff_url",
-                        action='store',
-                        help="The URL to a server running WikiDiff2.")

     args = parser.parse_args()

     # set persistence method
-    if args.persist is None and not args.wikidiff_url:
+    if args.persist is None:
         persist = PersistMethod.none
     elif args.persist == "segment":
         persist = PersistMethod.segment
     elif args.persist == "legacy":
         persist = PersistMethod.legacy
-    elif args.wikidiff_url:
-        persist = PersistMethod.wikidiff
+    elif args.persist == "wikidiff2":
+        persist = PersistMethod.wikidiff2
     else:
         persist = PersistMethod.sequence
@@ -648,7 +638,6 @@ def main():
         regex_match_comment=args.regex_match_comment,
         regex_comment_label=args.regex_comment_label,
         output_parquet=output_parquet,
-        wikidiff_url=args.wikidiff_url,
     )

     wikiq.process()