Fix revert column behavior

The parquet test now compares every column, including `revert`, which was previously skipped.

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
Will Beason 2025-06-03 15:03:33 -05:00
parent 06a784ef27
commit 123b9a18a8
3 changed files with 25 additions and 13 deletions

View File

@ -58,13 +58,12 @@ class RevisionTable:
def schema(self) -> pa.Schema: def schema(self) -> pa.Schema:
return pa.schema([c.field for c in self.columns]) return pa.schema([c.field for c in self.columns])
def pop(self): def pop(self) -> dict:
schema = self.schema() data = {}
data = []
for column in self.columns: for column in self.columns:
data.append(column.pop()) data[column.field.name] = column.pop()
return pa.table(data, schema) return data
class RevisionId(RevisionField[int]): class RevisionId(RevisionField[int]):

View File

@ -349,13 +349,10 @@ class WikiqTestCase(unittest.TestCase):
baseline['anon'] = baseline['anon'].replace(np.nan, None) baseline['anon'] = baseline['anon'].replace(np.nan, None)
for index, row in baseline.iterrows(): for index, row in baseline.iterrows():
if row['editorid'] is None or test['editorid'][index] is None: if row['revert'] != test['revert'][index]:
if row['editorid'] != test['editorid'][index]: print(row['revid'], ":", row['revert'], "!=", test['revert'][index])
print(row['revid'], ":", row['editorid'], "!=", test['editorid'][index])
for col in baseline.columns: for col in baseline.columns:
if col == "revert":
continue
try: try:
assert_series_equal(test[col], baseline[col], check_like=True, check_dtype=False) assert_series_equal(test[col], baseline[col], check_like=True, check_dtype=False)
except ValueError as exc: except ValueError as exc:

22
wikiq
View File

@ -495,10 +495,14 @@ class WikiqParser:
rev_count = 0 rev_count = 0
writer: pq.ParquetWriter | pc.CSVWriter writer: pq.ParquetWriter | pc.CSVWriter
schema = table.schema()
schema = schema.append(pa.field('revert', pa.bool_()))
if self.output_parquet: if self.output_parquet:
writer = pq.ParquetWriter(self.output_file, table.schema(), flavor='spark') writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
else: else:
writer = pc.CSVWriter(self.output_file, table.schema(), write_options=pc.WriteOptions(delimiter='\t')) writer = pc.CSVWriter(self.output_file, schema, write_options=pc.WriteOptions(delimiter='\t'))
# Iterate through pages # Iterate through pages
for page in dump: for page in dump:
@ -541,7 +545,19 @@ class WikiqParser:
rev_count += 1 rev_count += 1
writer.write(table.pop()) buffer = table.pop()
is_revert_column: list[bool | None] = []
for r, d in zip(buffer['reverteds'], buffer['deleted']):
if d:
is_revert_column.append(None)
else:
is_revert_column.append(r is not None)
buffer['revert'] = is_revert_column
writer.write(pa.table(buffer, schema=schema))
page_count += 1 page_count += 1