add missing files + add sorted_columns metadata.
This commit is contained in:
6
wikiq
6
wikiq
@@ -372,7 +372,9 @@ class WikiqParser:
|
||||
schema = schema.append(pa.field('tokens_window', pa.int64(), nullable=True))
|
||||
|
||||
if self.output_parquet:
|
||||
writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
|
||||
pageid_sortingcol = pq.SortingColumn(schema.get_field_index('pageid'))
|
||||
# Secondary sort key: revisions within a page are ordered by 'revid' (was mistakenly looking up 'pageid' a second time).
revid_sortingcol = pq.SortingColumn(schema.get_field_index('revid'))
|
||||
writer = pq.ParquetWriter(self.output_file, schema, flavor='spark', sorting_columns=[pageid_sortingcol, revid_sortingcol])
|
||||
else:
|
||||
writer = pacsv.CSVWriter(self.output_file, schema, write_options=pacsv.WriteOptions(delimiter='\t'))
|
||||
|
||||
@@ -428,7 +430,7 @@ class WikiqParser:
|
||||
tokenizer=wikitext_split,
|
||||
)
|
||||
|
||||
# Collect the set of pages currently buffered in the table so we can run multi-page functions on them.
|
||||
# Collect the set of revisions currently buffered in the table so we can run multi-revision functions on them.
|
||||
row_buffer = table.pop()
|
||||
|
||||
if self.diff:
|
||||
|
||||
Reference in New Issue
Block a user