Add missing files and add sorting_columns metadata to the Parquet output.

This commit is contained in:
Nathan TeBlunthuis
2025-07-07 19:08:31 -07:00
parent d6c4c0a416
commit 56c90fe1cc
4 changed files with 37 additions and 2 deletions

6
wikiq
View File

@@ -372,7 +372,9 @@ class WikiqParser:
schema = schema.append(pa.field('tokens_window', pa.int64(), nullable=True))
if self.output_parquet:
writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
pageid_sortingcol = pq.SortingColumn(schema.get_field_index('pageid'))
revid_sortingcol = pq.SortingColumn(schema.get_field_index('revid'))
writer = pq.ParquetWriter(self.output_file, schema, flavor='spark', sorting_columns=[pageid_sortingcol, revid_sortingcol])
else:
writer = pacsv.CSVWriter(self.output_file, schema, write_options=pacsv.WriteOptions(delimiter='\t'))
@@ -428,7 +430,7 @@ class WikiqParser:
tokenizer=wikitext_split,
)
# Collect the set of pages currently buffered in the table so we can run multi-page functions on them.
# Collect the set of revisions currently buffered in the table so we can run multi-revision functions on them.
row_buffer = table.pop()
if self.diff: