Fix revert column behavior
Now all columns are tested in the parquet test.

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
22
wikiq
22
wikiq
@@ -495,10 +495,14 @@ class WikiqParser:
|
||||
rev_count = 0
|
||||
|
||||
writer: pq.ParquetWriter | pc.CSVWriter
|
||||
|
||||
schema = table.schema()
|
||||
schema = schema.append(pa.field('revert', pa.bool_()))
|
||||
|
||||
if self.output_parquet:
|
||||
writer = pq.ParquetWriter(self.output_file, table.schema(), flavor='spark')
|
||||
writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
|
||||
else:
|
||||
writer = pc.CSVWriter(self.output_file, table.schema(), write_options=pc.WriteOptions(delimiter='\t'))
|
||||
writer = pc.CSVWriter(self.output_file, schema, write_options=pc.WriteOptions(delimiter='\t'))
|
||||
|
||||
# Iterate through pages
|
||||
for page in dump:
|
||||
@@ -541,7 +545,19 @@ class WikiqParser:
|
||||
|
||||
rev_count += 1
|
||||
|
||||
writer.write(table.pop())
|
||||
buffer = table.pop()
|
||||
|
||||
is_revert_column: list[bool | None] = []
|
||||
for r, d in zip(buffer['reverteds'], buffer['deleted']):
|
||||
if d:
|
||||
is_revert_column.append(None)
|
||||
else:
|
||||
is_revert_column.append(r is not None)
|
||||
|
||||
buffer['revert'] = is_revert_column
|
||||
|
||||
|
||||
writer.write(pa.table(buffer, schema=schema))
|
||||
|
||||
page_count += 1
|
||||
|
||||
|
||||
Reference in New Issue
Block a user