Refactor revision parsing logic to be columnar #1

Merged
beason merged 27 commits from test-parquet into parquet_support 2025-06-17 18:22:26 +00:00
Showing only changes of commit 89465b29f4 - Show all commits

9
wikiq
View File

@@ -500,7 +500,7 @@ class WikiqParser:
writer: pq.ParquetWriter | pc.CSVWriter
schema = table.schema()
-schema = schema.append(pa.field('revert', pa.bool_()))
+schema = schema.append(pa.field('revert', pa.bool_(), nullable=True))
if self.output_parquet:
writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
@@ -512,12 +512,9 @@ class WikiqParser:
# skip namespaces not in the filter
if self.namespace_filter is not None:
-if page.namespace not in self.namespace_filter:
+if page.mwpage.namespace not in self.namespace_filter:
continue
-# if page.namespace != 0:
-# page.mwpage.title = ':'.join([dump.namespace_map[page.namespace], page.title])
# Disable detecting reverts if radius is 0.
if self.revert_radius > 0:
reverts_column.rev_detector = mwreverts.Detector(radius=self.revert_radius)
@@ -552,7 +549,7 @@ class WikiqParser:
is_revert_column: list[bool | None] = []
for r, d in zip(buffer['reverteds'], buffer['deleted']):
-if d:
+if self.revert_radius == 0 or d:
is_revert_column.append(None)
else:
is_revert_column.append(r is not None)