Fix revert column behavior
Now all columns are tested in the parquet test.

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
parent
06a784ef27
commit
123b9a18a8
@ -58,13 +58,12 @@ class RevisionTable:
|
|||||||
def schema(self) -> pa.Schema:
|
def schema(self) -> pa.Schema:
|
||||||
return pa.schema([c.field for c in self.columns])
|
return pa.schema([c.field for c in self.columns])
|
||||||
|
|
||||||
def pop(self):
|
def pop(self) -> dict:
|
||||||
schema = self.schema()
|
data = {}
|
||||||
data = []
|
|
||||||
for column in self.columns:
|
for column in self.columns:
|
||||||
data.append(column.pop())
|
data[column.field.name] = column.pop()
|
||||||
|
|
||||||
return pa.table(data, schema)
|
return data
|
||||||
|
|
||||||
|
|
||||||
class RevisionId(RevisionField[int]):
|
class RevisionId(RevisionField[int]):
|
||||||
|
@ -349,13 +349,10 @@ class WikiqTestCase(unittest.TestCase):
|
|||||||
baseline['anon'] = baseline['anon'].replace(np.nan, None)
|
baseline['anon'] = baseline['anon'].replace(np.nan, None)
|
||||||
|
|
||||||
for index, row in baseline.iterrows():
|
for index, row in baseline.iterrows():
|
||||||
if row['editorid'] is None or test['editorid'][index] is None:
|
if row['revert'] != test['revert'][index]:
|
||||||
if row['editorid'] != test['editorid'][index]:
|
print(row['revid'], ":", row['revert'], "!=", test['revert'][index])
|
||||||
print(row['revid'], ":", row['editorid'], "!=", test['editorid'][index])
|
|
||||||
|
|
||||||
for col in baseline.columns:
|
for col in baseline.columns:
|
||||||
if col == "revert":
|
|
||||||
continue
|
|
||||||
try:
|
try:
|
||||||
assert_series_equal(test[col], baseline[col], check_like=True, check_dtype=False)
|
assert_series_equal(test[col], baseline[col], check_like=True, check_dtype=False)
|
||||||
except ValueError as exc:
|
except ValueError as exc:
|
||||||
|
22
wikiq
22
wikiq
@ -495,10 +495,14 @@ class WikiqParser:
|
|||||||
rev_count = 0
|
rev_count = 0
|
||||||
|
|
||||||
writer: pq.ParquetWriter | pc.CSVWriter
|
writer: pq.ParquetWriter | pc.CSVWriter
|
||||||
|
|
||||||
|
schema = table.schema()
|
||||||
|
schema = schema.append(pa.field('revert', pa.bool_()))
|
||||||
|
|
||||||
if self.output_parquet:
|
if self.output_parquet:
|
||||||
writer = pq.ParquetWriter(self.output_file, table.schema(), flavor='spark')
|
writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
|
||||||
else:
|
else:
|
||||||
writer = pc.CSVWriter(self.output_file, table.schema(), write_options=pc.WriteOptions(delimiter='\t'))
|
writer = pc.CSVWriter(self.output_file, schema, write_options=pc.WriteOptions(delimiter='\t'))
|
||||||
|
|
||||||
# Iterate through pages
|
# Iterate through pages
|
||||||
for page in dump:
|
for page in dump:
|
||||||
@ -541,7 +545,19 @@ class WikiqParser:
|
|||||||
|
|
||||||
rev_count += 1
|
rev_count += 1
|
||||||
|
|
||||||
writer.write(table.pop())
|
buffer = table.pop()
|
||||||
|
|
||||||
|
is_revert_column: list[bool | None] = []
|
||||||
|
for r, d in zip(buffer['reverteds'], buffer['deleted']):
|
||||||
|
if d:
|
||||||
|
is_revert_column.append(None)
|
||||||
|
else:
|
||||||
|
is_revert_column.append(r is not None)
|
||||||
|
|
||||||
|
buffer['revert'] = is_revert_column
|
||||||
|
|
||||||
|
|
||||||
|
writer.write(pa.table(buffer, schema=schema))
|
||||||
|
|
||||||
page_count += 1
|
page_count += 1
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user