Fix revert column behavior
Now all columns are tested in the parquet test.
Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
parent
06a784ef27
commit
123b9a18a8
@@ -58,13 +58,12 @@ class RevisionTable:
|
||||
def schema(self) -> pa.Schema:
|
||||
return pa.schema([c.field for c in self.columns])
|
||||
|
||||
def pop(self):
|
||||
schema = self.schema()
|
||||
data = []
|
||||
def pop(self) -> dict:
|
||||
data = {}
|
||||
for column in self.columns:
|
||||
data.append(column.pop())
|
||||
data[column.field.name] = column.pop()
|
||||
|
||||
return pa.table(data, schema)
|
||||
return data
|
||||
|
||||
|
||||
class RevisionId(RevisionField[int]):
|
||||
|
@@ -349,13 +349,10 @@ class WikiqTestCase(unittest.TestCase):
|
||||
baseline['anon'] = baseline['anon'].replace(np.nan, None)
|
||||
|
||||
for index, row in baseline.iterrows():
|
||||
if row['editorid'] is None or test['editorid'][index] is None:
|
||||
if row['editorid'] != test['editorid'][index]:
|
||||
print(row['revid'], ":", row['editorid'], "!=", test['editorid'][index])
|
||||
if row['revert'] != test['revert'][index]:
|
||||
print(row['revid'], ":", row['revert'], "!=", test['revert'][index])
|
||||
|
||||
for col in baseline.columns:
|
||||
if col == "revert":
|
||||
continue
|
||||
try:
|
||||
assert_series_equal(test[col], baseline[col], check_like=True, check_dtype=False)
|
||||
except ValueError as exc:
|
||||
|
22
wikiq
22
wikiq
@@ -495,10 +495,14 @@ class WikiqParser:
|
||||
rev_count = 0
|
||||
|
||||
writer: pq.ParquetWriter | pc.CSVWriter
|
||||
|
||||
schema = table.schema()
|
||||
schema = schema.append(pa.field('revert', pa.bool_()))
|
||||
|
||||
if self.output_parquet:
|
||||
writer = pq.ParquetWriter(self.output_file, table.schema(), flavor='spark')
|
||||
writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
|
||||
else:
|
||||
writer = pc.CSVWriter(self.output_file, table.schema(), write_options=pc.WriteOptions(delimiter='\t'))
|
||||
writer = pc.CSVWriter(self.output_file, schema, write_options=pc.WriteOptions(delimiter='\t'))
|
||||
|
||||
# Iterate through pages
|
||||
for page in dump:
|
||||
@@ -541,7 +545,19 @@ class WikiqParser:
|
||||
|
||||
rev_count += 1
|
||||
|
||||
writer.write(table.pop())
|
||||
buffer = table.pop()
|
||||
|
||||
is_revert_column: list[bool | None] = []
|
||||
for r, d in zip(buffer['reverteds'], buffer['deleted']):
|
||||
if d:
|
||||
is_revert_column.append(None)
|
||||
else:
|
||||
is_revert_column.append(r is not None)
|
||||
|
||||
buffer['revert'] = is_revert_column
|
||||
|
||||
|
||||
writer.write(pa.table(buffer, schema=schema))
|
||||
|
||||
page_count += 1
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user