Fix revert column behavior

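The boolean `revert` column is now appended to the output schema and computed
when each page's buffer is written: it is null for deleted revisions, and
otherwise true exactly when the revision's `reverteds` value is not None.

Now all columns are tested in the parquet test.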

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
Will Beason
2025-06-03 15:03:33 -05:00
parent 06a784ef27
commit 123b9a18a8
3 changed files with 25 additions and 13 deletions

22
wikiq
View File

@@ -495,10 +495,14 @@ class WikiqParser:
rev_count = 0
writer: pq.ParquetWriter | pc.CSVWriter
schema = table.schema()
schema = schema.append(pa.field('revert', pa.bool_()))
if self.output_parquet:
writer = pq.ParquetWriter(self.output_file, table.schema(), flavor='spark')
writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
else:
writer = pc.CSVWriter(self.output_file, table.schema(), write_options=pc.WriteOptions(delimiter='\t'))
writer = pc.CSVWriter(self.output_file, schema, write_options=pc.WriteOptions(delimiter='\t'))
# Iterate through pages
for page in dump:
@@ -541,7 +545,19 @@ class WikiqParser:
rev_count += 1
writer.write(table.pop())
buffer = table.pop()
is_revert_column: list[bool | None] = []
for r, d in zip(buffer['reverteds'], buffer['deleted']):
if d:
is_revert_column.append(None)
else:
is_revert_column.append(r is not None)
buffer['revert'] = is_revert_column
writer.write(pa.table(buffer, schema=schema))
page_count += 1