Refactor revision parsing logic to be columnar #1

Merged
beason merged 27 commits from test-parquet into parquet_support 2025-06-17 18:22:26 +00:00
4 changed files with 4 additions and 32477 deletions
Showing only changes of commit 032fec3198.


@@ -111,21 +111,6 @@ class WikiqTestCase(unittest.TestCase):
         baseline = pd.read_table(tester.baseline_file)
         assert_frame_equal(test, baseline, check_like=True)

-    def test_WP_url_encode(self):
-        tester = WikiqTester(IKWIKI, "url-encode")
-        try:
-            tester.call_wikiq("--url-encode")
-        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-        copyfile(tester.call_output, tester.test_file)
-        # as a test let's make sure that we get equal data frames
-        test = pd.read_table(tester.test_file)
-        baseline = pd.read_table(tester.baseline_file)
-        assert_frame_equal(test, baseline, check_like=True)
-
     def test_WP_namespaces(self):
         tester = WikiqTester(IKWIKI, "namespaces")
@@ -262,21 +247,6 @@ class WikiqTestCase(unittest.TestCase):
         test = test.reindex(columns=sorted(test.columns))
         assert_frame_equal(test, baseline, check_like=True)

-    def test_url_encode(self):
-        tester = WikiqTester(SAILORMOON, "url-encode", in_compression="7z")
-        try:
-            tester.call_wikiq("--url-encode", "--fandom-2020")
-        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-        copyfile(tester.call_output, tester.test_file)
-        test = pd.read_table(tester.test_file)
-        baseline = pd.read_table(tester.baseline_file)
-        test = test.reindex(columns=sorted(test.columns))
-        assert_frame_equal(test, baseline, check_like=True)
-
     def test_malformed_noargs(self):
         tester = WikiqTester(wiki=TWINPEAKS, in_compression="7z")
         want_exception = 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0'

File diff suppressed because it is too large.

wikiq

@@ -239,9 +239,6 @@ class Revision:
     editor: str | None = None
     anon: bool | None = None

-    # toggles url encoding. this isn't a dataclass field since it doesn't have a type annotation
-    urlencode = False
-
     # defines pyarrow schema.
     # each field in the data class needs an entry in this array.
     # the names should match and be in the same order.
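As a hedged illustration of the convention those comments describe (not the project's actual class: only `editor` and `anon` appear in the diff above, while `id` and the concrete types are invented for the sketch):

```python
from dataclasses import dataclass

import pyarrow as pa


@dataclass
class Revision:
    id: int
    editor: str | None = None
    anon: bool | None = None

    # class attribute, not a dataclass field (no type annotation):
    # one pa.field per dataclass field, same names, same order
    pa_schema_fields = [
        pa.field("id", pa.int64()),
        pa.field("editor", pa.string(), nullable=True),
        pa.field("anon", pa.bool_(), nullable=True),
    ]


schema = pa.schema(Revision.pa_schema_fields)
```

Keeping the list in lockstep with the dataclass is what lets the parser build its `pa.schema(...)` directly from the class, as in the hunk below.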
@@ -333,7 +330,6 @@ class WikiqParser:
                 regex_comment_label: list[str],
                 collapse_user: bool = False,
                 persist: int = None,
-                urlencode: bool = False,
                 namespaces: list[int] | None = None,
                 revert_radius: int = 15,
                 output_parquet: bool = True,
@@ -347,7 +343,6 @@ class WikiqParser:
        self.collapse_user: bool = collapse_user
        self.persist: int = persist
        self.namespaces = []
-       self.urlencode: bool = urlencode
        self.revert_radius = revert_radius

        if namespaces is not None:
@@ -383,9 +378,7 @@ class WikiqParser:
        # we also need to make sure that we have the right pyarrow schema
        self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas

-       self.revdata_type.urlencode = self.urlencode
-
-       self.schema: Schema = pa.schema(self.revdata_type.pa_schema_fields)
+       self.schema: Final[Schema] = pa.schema(self.revdata_type.pa_schema_fields)
beason marked this conversation as resolved (outdated)

This logic is to (1) replace `None` with `""` and then (2) fix some edits from Fandom that didn't come with sha1s. Could we move this to a function so it looks like `revs = repair_revs(revs)`? I'd like the mutation of `revs` to be super clear.

Done!
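For reference, a minimal sketch of what such a helper could look like (hypothetical: the name `repair_revs` comes from the comment above, while the column names and the `hashlib` backfill are assumptions, not the project's actual fix):

```python
import hashlib

import pyarrow as pa
import pyarrow.compute as pc


def repair_revs(revs: pa.Table) -> pa.Table:
    """Replace None text with "", then backfill sha1s that some
    Fandom edits are missing. Returns a new table; revs is not mutated."""
    texts = pc.fill_null(revs.column("text"), "").to_pylist()
    # assumption: real dumps store base36-encoded sha1s, so wikiq's
    # actual backfill likely differs from this hex digest
    sha1s = [s if s is not None
             else hashlib.sha1(t.encode("utf8")).hexdigest()
             for s, t in zip(revs.column("sha1").to_pylist(), texts)]
    revs = revs.set_column(revs.schema.get_field_index("text"),
                           "text", pa.array(texts, pa.string()))
    return revs.set_column(revs.schema.get_field_index("sha1"),
                           "sha1", pa.array(sha1s, pa.string()))
```

The call site then reads `revs = repair_revs(revs)`, which makes the rebinding explicit.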
        # here we initialize the variables we need for output.
        if output_parquet is True:
beason marked this conversation as resolved (outdated)

Don't think it's necessary to call `list(revs)` here.
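A generic sketch of the reviewer's point (names invented, not this project's code): a consumer that only iterates over the revisions can take the generator directly, so wrapping it in `list(...)` just adds an extra pass and a full in-memory copy:

```python
from dataclasses import dataclass


@dataclass
class Rev:
    editor: str


def count_editors(revs) -> int:
    # a single pass over revs, so any iterable (including a generator) works
    return len({rev.editor for rev in revs})


revs = (Rev(e) for e in ["alice", "bob", "alice"])
print(count_editors(revs))  # 2 -- no list(revs) copy required
```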
@@ -396,7 +389,7 @@ class WikiqParser:
            self.parquet_buffer_size = parquet_buffer_size
        else:
            self.print_header = True

-           if output_file == sys.stdout:
+           if output_file == sys.stdout.buffer:
                self.output_file = output_file
            else:
@@ -724,9 +717,6 @@ def main():
                        choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
                        help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure. This may be slow. The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for an advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")

-   parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
-                       help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")
-
    parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                        help="Id number of namespace to include. Can be specified more than once.")
@@ -792,7 +782,8 @@ def main():
        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
-           output_file = sys.stdout
+           # Parquet libraries need a binary output, so just sys.stdout doesn't work.
+           output_file = sys.stdout.buffer
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = get_output_filename(filename, parquet=output_parquet)
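A self-contained sketch of why this change is needed (the toy table and column names are invented for illustration): parquet is a binary format, so the writer must be handed the raw byte stream that `sys.stdout.buffer` exposes rather than the text-mode `sys.stdout` wrapper, whose `write()` only accepts `str`:

```python
import sys

import pyarrow as pa
import pyarrow.parquet as pq

# toy table standing in for wikiq's revision output
table = pa.table({"revid": [1, 2], "editor": ["alice", "bob"]})

# pq.write_table(table, sys.stdout) fails: the text-mode stream only
# accepts str, while the parquet writer emits bytes
pq.write_table(table, sys.stdout.buffer)
```

Piped to a file (`python sketch.py > out.parquet`), this yields a valid parquet file.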
@@ -801,7 +792,6 @@ def main():
                            output_file,
                            collapse_user=args.collapse_user,
                            persist=persist,
-                           urlencode=args.urlencode,
                            namespaces=namespaces,
                            revert_radius=args.revert_radius,
                            regex_match_revision=args.regex_match_revision,
@@ -821,7 +811,6 @@ def main():
                            collapse_user=args.collapse_user,
                            persist=persist,
                            # persist_legacy=args.persist_legacy,
-                           urlencode=args.urlencode,
                            namespaces=namespaces,
                            revert_radius=args.revert_radius,
                            regex_match_revision=args.regex_match_revision,