wikidiff2 integration: pwr complete.
Adds a test for pwr based on wikidiff2.
parent 58c595bf0b
commit a8e9e7f4fd
@@ -104,8 +104,7 @@ class WikiqTestCase(unittest.TestCase):
         tester = WikiqTester(IKWIKI, "noargs")

         try:
-            # tester.call_wikiq()
-            tester.call_wikiq("--wikidiff-url=http://localhost:8000")
+            tester.call_wikiq()
         except subprocess.CalledProcessError as exc:
             self.fail(exc.stderr.decode("utf8"))

@@ -194,6 +193,18 @@ class WikiqTestCase(unittest.TestCase):
         baseline = pd.read_table(tester.baseline_file)
         assert_frame_equal(test, baseline, check_like=True)

+    def test_pwr_wikidiff2(self):
+        tester = WikiqTester(SAILORMOON, "persistence_wikidiff2", in_compression="7z")
+
+        try:
+            tester.call_wikiq("--persistence wikidiff2", "--fandom-2020")
+        except subprocess.CalledProcessError as exc:
+            self.fail(exc.stderr.decode("utf8"))
+
+        test = pd.read_table(tester.output)
+        baseline = pd.read_table(tester.baseline_file)
+        assert_frame_equal(test, baseline, check_like=True)
+
     def test_pwr_segment(self):
         tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z")

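For orientation, the new test drives wikiq through WikiqTester; the same code path can be exercised directly. A minimal sketch mirroring test_pwr_wikidiff2, where the dump path is an invented assumption and only flags visible in this commit are used:

    # Hedged sketch, not part of the commit: invokes wikiq the way the
    # test harness presumably does. The dump path is an assumption.
    import subprocess

    result = subprocess.run(
        ["./wikiq", "test/dumps/sailormoon.xml.7z",  # assumed dump location
         "--persistence", "wikidiff2", "--fandom-2020"],
        capture_output=True,
    )
    if result.returncode != 0:
        print(result.stderr.decode("utf8"))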
test/baseline_output/persistence_wikidiff2_sailormoon.tsv (new file, 4655 lines)
File diff suppressed because it is too large
@@ -6,11 +6,13 @@ from typing import Dict, Generator, List, Optional, Tuple

 from deltas import (Delete, DiffEngine, Equal, Insert, Operation,
                     RegexTokenizer, Token, tokenizers)
+from mwpersistence import Token
 from sortedcontainers import SortedDict

 TOKENIZER = tokenizers.wikitext_split
+import pywikidiff2


 class DiffToOperationMap:
     def __init__(self, diff, tokenizer):
         self.tokenizer = tokenizer
@@ -23,7 +25,7 @@ class DiffToOperationMap:
         self.from_linenumber_bytes_map: SortedDict[int, int] = SortedDict()

     def tokenize(self, bytes):
-        return self.tokenizer.tokenize(bytes.decode("utf-8"))
+        return self.tokenizer.tokenize(bytes.decode("utf-8"), token_class=Token)

     def to_operations(self):
         for entry in self.diff["diff"]:
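A note on the tokenize change above: deltas tokenizers accept a token_class argument, and passing mwpersistence's Token yields tokens that carry the revision bookkeeping the persistence machinery needs. A standalone sketch under that assumption (the sample string is invented):

    # Sketch of the token_class swap; assumes deltas and mwpersistence
    # are installed. The sample text is invented.
    from deltas import tokenizers
    from mwpersistence import Token

    tokens = tokenizers.wikitext_split.tokenize("one two three", token_class=Token)
    # mwpersistence Tokens carry a .revisions list (used later by wikiq's
    # calculate_persistence); plain deltas tokens do not.
    print(type(tokens[0]))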
@@ -148,8 +150,8 @@ class DiffToOperationMap:
                 {
                     "text": to_diff["text"],
                     "highlightRanges": to_diff["highlightRanges"],
-                    'offset': offset,
-                    'lineNumber': to_diff["lineNumber"],
+                    "offset": offset,
+                    "lineNumber": to_diff["lineNumber"],
                 }
             )
         diffops = [
@@ -247,7 +249,9 @@ class DiffToOperationMap:
             raise ValueError(delete_segment)
         tokens = self.tokenize(delete_bytes)
         if lineNumber is not None:
-            self.from_linenumber_bytes_map[lineNumber] = offset["from"] + len(delete_bytes)
+            self.from_linenumber_bytes_map[lineNumber] = offset["from"] + len(
+                delete_bytes
+            )

         yield (
             Delete(offset["from"], None, None, None),
@@ -355,8 +359,8 @@ class DiffToOperationMap:
                 {
                     "text": to_diff["text"],
                     "highlightRanges": to_diff["highlightRanges"],
-                    'offset': offset,
-                    'lineNumber': to_diff["lineNumber"],
+                    "offset": offset,
+                    "lineNumber": to_diff["lineNumber"],
                 }
             )

@@ -367,11 +371,11 @@ class WikiDiffMatcher:
         texts: list[str] = None,
         tokenizer: Optional[RegexTokenizer] = None,
     ):
-        differ = pywikidiff2.pywikidiff2(numContextLines=1000000,
-                                         moved_paragraph_detection_cutoff=200000)
+        differ = pywikidiff2.pywikidiff2(
+            numContextLines=1000000, moved_paragraph_detection_cutoff=200000
+        )
         # Pre-compute diffs to reduce traffic overhead.
         self.diffs = differ.inline_json_diff_sequence(list(texts))
-
         self.tokenizer = tokenizer or TOKENIZER

     class Processor(DiffEngine.Processor):
@@ -394,7 +398,7 @@ class WikiDiffMatcher:

         # this happens when revisions are actually equal.
         if len(diffops) == 0:
-            self.last_tokens = self.tokenizer.tokenize(text)
+            self.last_tokens = self.tokenizer.tokenize(text, token_class=Token)
             ops = [Equal(0, len(self.last_tokens), 0, len(self.last_tokens))]
             return ops, self.last_tokens, self.last_tokens

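The constructor change above continues the move away from a wikidiff2 HTTP service: diffs are precomputed in-process with pywikidiff2, one call for the whole revision sequence. A minimal sketch using only the calls visible in this diff (the revision texts are invented):

    # Sketch of the precompute pattern from WikiDiffMatcher.__init__.
    import pywikidiff2

    texts = ["first revision", "first revision, edited"]
    differ = pywikidiff2.pywikidiff2(
        numContextLines=1000000,  # effectively unlimited context
        moved_paragraph_detection_cutoff=200000,
    )
    # Presumably returns one JSON diff per adjacent pair of revisions.
    diffs = differ.inline_json_diff_sequence(list(texts))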
|
29
wikiq
29
wikiq
@@ -35,16 +35,12 @@ import pyarrow as pa
 import pyarrow.parquet as pq
 import pyarrow.csv as pacsv

-DIFFS_URL = 'http://localhost:8000'
-
-
 class PersistMethod:
     none = 0
     sequence = 1
     segment = 2
     legacy = 3
-    wikidiff = 4
-
+    wikidiff2 = 4

 def calculate_persistence(tokens_added):
     return (sum([(len(x.revisions) - 1) for x in tokens_added]),
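The calculate_persistence line at the end of this hunk computes the first persistence measure: each added token contributes the number of later revisions it survived, len(x.revisions) - 1, summed over all added tokens. A toy illustration (FakeToken is invented; mwpersistence's real Token provides the revisions list):

    # Toy illustration of the first persistence measure; not wikiq code.
    class FakeToken:
        def __init__(self, revisions):
            self.revisions = revisions  # revisions this token appears in

    tokens_added = [FakeToken([1, 2, 3]), FakeToken([1])]
    print(sum(len(x.revisions) - 1 for x in tokens_added))  # (3-1) + (1-1) = 2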
@@ -217,8 +213,7 @@ class WikiqParser:
                  namespaces: Union[list[int], None] = None,
                  revert_radius: int = 15,
                  output_parquet: bool = True,
-                 parquet_buffer_size: int = 2000,
-                 wikidiff_url: str = "http://127.0.0.1:8000",
+                 parquet_buffer_size: int = 2000
                  ):

         """
@@ -231,7 +226,6 @@ class WikiqParser:
         self.persist: int = persist
         self.namespaces = []
         self.revert_radius = revert_radius
-        self.wikidiff_url: str = wikidiff_url

         if namespaces is not None:
             self.namespace_filter = set(namespaces)
@@ -448,10 +442,10 @@ class WikiqParser:
         elif self.persist == PersistMethod.segment:
             state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
                                             revert_radius=PERSISTENCE_RADIUS)
-        elif self.persist == PersistMethod.wikidiff:
+        elif self.persist == PersistMethod.wikidiff2:
             state = mwpersistence.DiffState(WikiDiffMatcher(revision_texts,
                                                             tokenizer=wikitext_split,
-                                                            self.wikidiff_url),
+                                                            ),
                                             revert_radius=PERSISTENCE_RADIUS)
         else:
             from mw.lib import persistence
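For context, a hedged sketch of how a DiffState built as above is typically consumed (mwpersistence's update returns the current tokens plus tokens added and removed; the revision texts, ids, and the matcher's import path are assumptions):

    # Hedged sketch of driving mwpersistence with the matcher above.
    import mwpersistence
    from deltas.tokenizers import wikitext_split
    # from <the matcher module in this commit> import WikiDiffMatcher

    revision_texts = ["one two three", "one two four"]
    state = mwpersistence.DiffState(
        WikiDiffMatcher(revision_texts, tokenizer=wikitext_split),
        revert_radius=15,  # matches wikiq's revert_radius default above
    )
    for rev_id, text in enumerate(revision_texts):
        tokens, tokens_added, tokens_removed = state.update(text, revision=rev_id)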
@@ -557,8 +551,8 @@ def main():
                         help="Operate only on the final revision made by a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")

     parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str,
-                        choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
-                        help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")
+                        choices=['', 'wikidiff2', 'segment', 'sequence', 'legacy'], nargs='?',
+                        help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) the number of revisions used in computing the first measure. This may be slow. The default is no persistence. -p=sequence uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for the old behavior used in older research projects. -p=segment attempts an advanced persistence calculation that is robust to content moves, but prone to bugs and slower. -p=wikidiff2 is like segment, but uses the wikidiff2 algorithm, which should be faster and more robust.")

     parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                         help="Id number of namespace to include. Can be specified more than once.")
@@ -590,22 +584,18 @@ def main():
                         action='store_true',
                         help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.")

-    parser.add_argument('--wikidiff-url', dest="wikidiff_url",
-                        action='store',
-                        help="The URL to a server running WikiDiff2.")
-
     args = parser.parse_args()

     # set persistence method

-    if args.persist is None and not args.wikidiff_url:
+    if args.persist is None:
         persist = PersistMethod.none
     elif args.persist == "segment":
         persist = PersistMethod.segment
     elif args.persist == "legacy":
         persist = PersistMethod.legacy
-    elif args.wikidiff_url:
-        persist = PersistMethod.wikidiff
+    elif args.persist == "wikidiff2":
+        persist = PersistMethod.wikidiff2
     else:
         persist = PersistMethod.sequence

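With this hunk, wikidiff2 persistence is selected purely through the existing -p/--persistence flag; the removed --wikidiff-url option and the companion server it pointed at are no longer needed. A hypothetical invocation (the dump filename is invented):

    ./wikiq sailormoon.xml.7z -p wikidiff2 --fandom-2020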
@@ -648,7 +638,6 @@ def main():
         regex_match_comment=args.regex_match_comment,
         regex_comment_label=args.regex_comment_label,
         output_parquet=output_parquet,
-        wikidiff_url=args.wikidiff_url,
     )

     wikiq.process()