wikidiff2 integration: pwr complete.
Adds a test for pwr based on wikidiff2.
This commit is contained in:
parent
58c595bf0b
commit
a8e9e7f4fd
@ -104,8 +104,7 @@ class WikiqTestCase(unittest.TestCase):
|
|||||||
tester = WikiqTester(IKWIKI, "noargs")
|
tester = WikiqTester(IKWIKI, "noargs")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# tester.call_wikiq()
|
tester.call_wikiq()
|
||||||
tester.call_wikiq("--wikidiff-url=http://localhost:8000")
|
|
||||||
except subprocess.CalledProcessError as exc:
|
except subprocess.CalledProcessError as exc:
|
||||||
self.fail(exc.stderr.decode("utf8"))
|
self.fail(exc.stderr.decode("utf8"))
|
||||||
|
|
||||||
@ -194,6 +193,18 @@ class WikiqTestCase(unittest.TestCase):
|
|||||||
baseline = pd.read_table(tester.baseline_file)
|
baseline = pd.read_table(tester.baseline_file)
|
||||||
assert_frame_equal(test, baseline, check_like=True)
|
assert_frame_equal(test, baseline, check_like=True)
|
||||||
|
|
||||||
|
def test_pwr_wikidiff2(self):
|
||||||
|
tester = WikiqTester(SAILORMOON, "persistence_wikidiff2", in_compression="7z")
|
||||||
|
|
||||||
|
try:
|
||||||
|
tester.call_wikiq("--persistence wikidiff2", "--fandom-2020")
|
||||||
|
except subprocess.CalledProcessError as exc:
|
||||||
|
self.fail(exc.stderr.decode("utf8"))
|
||||||
|
|
||||||
|
test = pd.read_table(tester.output)
|
||||||
|
baseline = pd.read_table(tester.baseline_file)
|
||||||
|
assert_frame_equal(test, baseline, check_like=True)
|
||||||
|
|
||||||
def test_pwr_segment(self):
|
def test_pwr_segment(self):
|
||||||
tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z")
|
tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z")
|
||||||
|
|
||||||
|
4655
test/baseline_output/persistence_wikidiff2_sailormoon.tsv
Normal file
4655
test/baseline_output/persistence_wikidiff2_sailormoon.tsv
Normal file
File diff suppressed because it is too large
Load Diff
@ -6,11 +6,13 @@ from typing import Dict, Generator, List, Optional, Tuple
|
|||||||
|
|
||||||
from deltas import (Delete, DiffEngine, Equal, Insert, Operation,
|
from deltas import (Delete, DiffEngine, Equal, Insert, Operation,
|
||||||
RegexTokenizer, Token, tokenizers)
|
RegexTokenizer, Token, tokenizers)
|
||||||
|
from mwpersistence import Token
|
||||||
from sortedcontainers import SortedDict
|
from sortedcontainers import SortedDict
|
||||||
|
|
||||||
TOKENIZER = tokenizers.wikitext_split
|
TOKENIZER = tokenizers.wikitext_split
|
||||||
import pywikidiff2
|
import pywikidiff2
|
||||||
|
|
||||||
|
|
||||||
class DiffToOperationMap:
|
class DiffToOperationMap:
|
||||||
def __init__(self, diff, tokenizer):
|
def __init__(self, diff, tokenizer):
|
||||||
self.tokenizer = tokenizer
|
self.tokenizer = tokenizer
|
||||||
@ -23,7 +25,7 @@ class DiffToOperationMap:
|
|||||||
self.from_linenumber_bytes_map: SortedDict[int, int] = SortedDict()
|
self.from_linenumber_bytes_map: SortedDict[int, int] = SortedDict()
|
||||||
|
|
||||||
def tokenize(self, bytes):
|
def tokenize(self, bytes):
|
||||||
return self.tokenizer.tokenize(bytes.decode("utf-8"))
|
return self.tokenizer.tokenize(bytes.decode("utf-8"), token_class=Token)
|
||||||
|
|
||||||
def to_operations(self):
|
def to_operations(self):
|
||||||
for entry in self.diff["diff"]:
|
for entry in self.diff["diff"]:
|
||||||
@ -148,8 +150,8 @@ class DiffToOperationMap:
|
|||||||
{
|
{
|
||||||
"text": to_diff["text"],
|
"text": to_diff["text"],
|
||||||
"highlightRanges": to_diff["highlightRanges"],
|
"highlightRanges": to_diff["highlightRanges"],
|
||||||
'offset': offset,
|
"offset": offset,
|
||||||
'lineNumber': to_diff["lineNumber"],
|
"lineNumber": to_diff["lineNumber"],
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
diffops = [
|
diffops = [
|
||||||
@ -247,7 +249,9 @@ class DiffToOperationMap:
|
|||||||
raise ValueError(delete_segment)
|
raise ValueError(delete_segment)
|
||||||
tokens = self.tokenize(delete_bytes)
|
tokens = self.tokenize(delete_bytes)
|
||||||
if lineNumber is not None:
|
if lineNumber is not None:
|
||||||
self.from_linenumber_bytes_map[lineNumber] = offset["from"] + len(delete_bytes)
|
self.from_linenumber_bytes_map[lineNumber] = offset["from"] + len(
|
||||||
|
delete_bytes
|
||||||
|
)
|
||||||
|
|
||||||
yield (
|
yield (
|
||||||
Delete(offset["from"], None, None, None),
|
Delete(offset["from"], None, None, None),
|
||||||
@ -355,8 +359,8 @@ class DiffToOperationMap:
|
|||||||
{
|
{
|
||||||
"text": to_diff["text"],
|
"text": to_diff["text"],
|
||||||
"highlightRanges": to_diff["highlightRanges"],
|
"highlightRanges": to_diff["highlightRanges"],
|
||||||
'offset': offset,
|
"offset": offset,
|
||||||
'lineNumber': to_diff["lineNumber"],
|
"lineNumber": to_diff["lineNumber"],
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -367,11 +371,11 @@ class WikiDiffMatcher:
|
|||||||
texts: list[str] = None,
|
texts: list[str] = None,
|
||||||
tokenizer: Optional[RegexTokenizer] = None,
|
tokenizer: Optional[RegexTokenizer] = None,
|
||||||
):
|
):
|
||||||
differ = pywikidiff2.pywikidiff2(numContextLines=1000000,
|
differ = pywikidiff2.pywikidiff2(
|
||||||
moved_paragraph_detection_cutoff=200000)
|
numContextLines=1000000, moved_paragraph_detection_cutoff=200000
|
||||||
|
)
|
||||||
# Pre-compute diffs to reduce traffic overhead.
|
# Pre-compute diffs to reduce traffic overhead.
|
||||||
self.diffs = differ.inline_json_diff_sequence(list(texts))
|
self.diffs = differ.inline_json_diff_sequence(list(texts))
|
||||||
|
|
||||||
self.tokenizer = tokenizer or TOKENIZER
|
self.tokenizer = tokenizer or TOKENIZER
|
||||||
|
|
||||||
class Processor(DiffEngine.Processor):
|
class Processor(DiffEngine.Processor):
|
||||||
@ -394,7 +398,7 @@ class WikiDiffMatcher:
|
|||||||
|
|
||||||
# this happens when revisions are actually equal.
|
# this happens when revisions are actually equal.
|
||||||
if len(diffops) == 0:
|
if len(diffops) == 0:
|
||||||
self.last_tokens = self.tokenizer.tokenize(text)
|
self.last_tokens = self.tokenizer.tokenize(text, token_class=Token)
|
||||||
ops = [Equal(0, len(self.last_tokens), 0, len(self.last_tokens))]
|
ops = [Equal(0, len(self.last_tokens), 0, len(self.last_tokens))]
|
||||||
return ops, self.last_tokens, self.last_tokens
|
return ops, self.last_tokens, self.last_tokens
|
||||||
|
|
||||||
|
29
wikiq
29
wikiq
@ -35,16 +35,12 @@ import pyarrow as pa
|
|||||||
import pyarrow.parquet as pq
|
import pyarrow.parquet as pq
|
||||||
import pyarrow.csv as pacsv
|
import pyarrow.csv as pacsv
|
||||||
|
|
||||||
DIFFS_URL = 'http://localhost:8000'
|
|
||||||
|
|
||||||
|
|
||||||
class PersistMethod:
|
class PersistMethod:
|
||||||
none = 0
|
none = 0
|
||||||
sequence = 1
|
sequence = 1
|
||||||
segment = 2
|
segment = 2
|
||||||
legacy = 3
|
legacy = 3
|
||||||
wikidiff = 4
|
wikidiff2 = 4
|
||||||
|
|
||||||
|
|
||||||
def calculate_persistence(tokens_added):
|
def calculate_persistence(tokens_added):
|
||||||
return (sum([(len(x.revisions) - 1) for x in tokens_added]),
|
return (sum([(len(x.revisions) - 1) for x in tokens_added]),
|
||||||
@ -217,8 +213,7 @@ class WikiqParser:
|
|||||||
namespaces: Union[list[int], None] = None,
|
namespaces: Union[list[int], None] = None,
|
||||||
revert_radius: int = 15,
|
revert_radius: int = 15,
|
||||||
output_parquet: bool = True,
|
output_parquet: bool = True,
|
||||||
parquet_buffer_size: int = 2000,
|
parquet_buffer_size: int = 2000
|
||||||
wikidiff_url: str = "http://127.0.0.1:8000",
|
|
||||||
):
|
):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
@ -231,7 +226,6 @@ class WikiqParser:
|
|||||||
self.persist: int = persist
|
self.persist: int = persist
|
||||||
self.namespaces = []
|
self.namespaces = []
|
||||||
self.revert_radius = revert_radius
|
self.revert_radius = revert_radius
|
||||||
self.wikidiff_url: str = wikidiff_url
|
|
||||||
|
|
||||||
if namespaces is not None:
|
if namespaces is not None:
|
||||||
self.namespace_filter = set(namespaces)
|
self.namespace_filter = set(namespaces)
|
||||||
@ -448,10 +442,10 @@ class WikiqParser:
|
|||||||
elif self.persist == PersistMethod.segment:
|
elif self.persist == PersistMethod.segment:
|
||||||
state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
|
state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
|
||||||
revert_radius=PERSISTENCE_RADIUS)
|
revert_radius=PERSISTENCE_RADIUS)
|
||||||
elif self.persist == PersistMethod.wikidiff:
|
elif self.persist == PersistMethod.wikidiff2:
|
||||||
state = mwpersistence.DiffState(WikiDiffMatcher(revision_texts,
|
state = mwpersistence.DiffState(WikiDiffMatcher(revision_texts,
|
||||||
tokenizer=wikitext_split,
|
tokenizer=wikitext_split,
|
||||||
self.wikidiff_url),
|
),
|
||||||
revert_radius=PERSISTENCE_RADIUS)
|
revert_radius=PERSISTENCE_RADIUS)
|
||||||
else:
|
else:
|
||||||
from mw.lib import persistence
|
from mw.lib import persistence
|
||||||
@ -557,8 +551,8 @@ def main():
|
|||||||
help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")
|
help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")
|
||||||
|
|
||||||
parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str,
|
parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str,
|
||||||
choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
|
choices=['', 'wikidiff2', 'segment', 'sequence', 'legacy'], nargs='?',
|
||||||
help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")
|
help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure. This may be slow. The default is no persistence. Use -p=sequence for the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for an advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower. Use -p=wikidiff2 for a method like segment that uses the wikidiff2 algorithm, which should be faster and more robust.")
|
||||||
|
|
||||||
parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
|
parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
|
||||||
help="Id number of namespace to include. Can be specified more than once.")
|
help="Id number of namespace to include. Can be specified more than once.")
|
||||||
@ -590,22 +584,18 @@ def main():
|
|||||||
action='store_true',
|
action='store_true',
|
||||||
help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.")
|
help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.")
|
||||||
|
|
||||||
parser.add_argument('--wikidiff-url', dest="wikidiff_url",
|
|
||||||
action='store',
|
|
||||||
help="The URL to a server running WikiDiff2.")
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# set persistence method
|
# set persistence method
|
||||||
|
|
||||||
if args.persist is None and not args.wikidiff_url:
|
if args.persist is None:
|
||||||
persist = PersistMethod.none
|
persist = PersistMethod.none
|
||||||
elif args.persist == "segment":
|
elif args.persist == "segment":
|
||||||
persist = PersistMethod.segment
|
persist = PersistMethod.segment
|
||||||
elif args.persist == "legacy":
|
elif args.persist == "legacy":
|
||||||
persist = PersistMethod.legacy
|
persist = PersistMethod.legacy
|
||||||
elif args.wikidiff_url:
|
elif args.persist == "wikidiff2":
|
||||||
persist = PersistMethod.wikidiff
|
persist = PersistMethod.wikidiff2
|
||||||
else:
|
else:
|
||||||
persist = PersistMethod.sequence
|
persist = PersistMethod.sequence
|
||||||
|
|
||||||
@ -648,7 +638,6 @@ def main():
|
|||||||
regex_match_comment=args.regex_match_comment,
|
regex_match_comment=args.regex_match_comment,
|
||||||
regex_comment_label=args.regex_comment_label,
|
regex_comment_label=args.regex_comment_label,
|
||||||
output_parquet=output_parquet,
|
output_parquet=output_parquet,
|
||||||
wikidiff_url=args.wikidiff_url,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
wikiq.process()
|
wikiq.process()
|
||||||
|
Loading…
Reference in New Issue
Block a user