From 123b9a18a84d3acbeba2e02fb3ec36674a284753 Mon Sep 17 00:00:00 2001 From: Will Beason Date: Tue, 3 Jun 2025 15:03:33 -0500 Subject: [PATCH] Fix revert column behavior Now all columns are tested in the parquet test. Signed-off-by: Will Beason --- tables.py | 9 ++++----- test/Wikiq_Unit_Test.py | 7 ++----- wikiq | 22 +++++++++++++++++++--- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/tables.py b/tables.py index d7e6ee7..8e1cdb2 100644 --- a/tables.py +++ b/tables.py @@ -58,13 +58,12 @@ class RevisionTable: def schema(self) -> pa.Schema: return pa.schema([c.field for c in self.columns]) - def pop(self): - schema = self.schema() - data = [] + def pop(self) -> dict: + data = {} for column in self.columns: - data.append(column.pop()) + data[column.field.name] = column.pop() - return pa.table(data, schema) + return data class RevisionId(RevisionField[int]): diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py index 90da740..b1540a8 100644 --- a/test/Wikiq_Unit_Test.py +++ b/test/Wikiq_Unit_Test.py @@ -349,13 +349,10 @@ class WikiqTestCase(unittest.TestCase): baseline['anon'] = baseline['anon'].replace(np.nan, None) for index, row in baseline.iterrows(): - if row['editorid'] is None or test['editorid'][index] is None: - if row['editorid'] != test['editorid'][index]: - print(row['revid'], ":", row['editorid'], "!=", test['editorid'][index]) + if row['revert'] != test['revert'][index]: + print(row['revid'], ":", row['revert'], "!=", test['revert'][index]) for col in baseline.columns: - if col == "revert": - continue try: assert_series_equal(test[col], baseline[col], check_like=True, check_dtype=False) except ValueError as exc: diff --git a/wikiq b/wikiq index 9e62b7a..bde2562 100755 --- a/wikiq +++ b/wikiq @@ -495,10 +495,14 @@ class WikiqParser: rev_count = 0 writer: pq.ParquetWriter | pc.CSVWriter + + schema = table.schema() + schema = schema.append(pa.field('revert', pa.bool_())) + if self.output_parquet: - writer = pq.ParquetWriter(self.output_file, table.schema(), flavor='spark') + writer = pq.ParquetWriter(self.output_file, schema, flavor='spark') else: - writer = pc.CSVWriter(self.output_file, table.schema(), write_options=pc.WriteOptions(delimiter='\t')) + writer = pc.CSVWriter(self.output_file, schema, write_options=pc.WriteOptions(delimiter='\t')) # Iterate through pages for page in dump: @@ -541,7 +545,19 @@ class WikiqParser: rev_count += 1 - writer.write(table.pop()) + buffer = table.pop() + + is_revert_column: list[bool | None] = [] + for r, d in zip(buffer['reverteds'], buffer['deleted']): + if d: + is_revert_column.append(None) + else: + is_revert_column.append(r is not None) + + buffer['revert'] = is_revert_column + + + writer.write(pa.table(buffer, schema=schema)) page_count += 1