add missing files + add sorted_columns metadata.
parent d6c4c0a416
commit 56c90fe1cc

diff_pyarrow_schema.py (new file, 33 lines)
@@ -0,0 +1,33 @@
+import pyarrow as pa
+
+# Schema for the `highlightRanges` object, an array of which can be nested in a diff object.
+highlight_range_struct = pa.struct([
+    pa.field('start', pa.int64(), nullable=False, metadata={'description': 'Where the highlighted text should start, in bytes.'}),
+    pa.field('length', pa.int64(), nullable=False, metadata={'description': 'The length of the highlighted section, in bytes.'}),
+    pa.field('type', pa.int64(), nullable=False, metadata={'description': 'The type of highlight (0: addition, 1: deletion).'})
+])
+
+# Schema for the `moveInfo` object, which can be nested in a diff object.
+move_info_struct = pa.struct([
+    pa.field('id', pa.string(), nullable=False, metadata={'description': 'The ID of the paragraph.'}),
+    pa.field('linkId', pa.string(), nullable=False, metadata={'description': 'The ID of the corresponding paragraph.'}),
+    pa.field('linkDirection', pa.int64(), nullable=False, metadata={'description': 'Visual indicator of the relationship (0: lower, 1: higher).'})
+])
+
+# Schema for the `offset` object, which is required in a diff object.
+offset_struct = pa.struct([
+    pa.field('from', pa.int64(), nullable=True, metadata={'description': 'The first byte of the line in the `from` revision.'}),
+    pa.field('to', pa.int64(), nullable=True, metadata={'description': 'The first byte of the line in the `to` revision.'})
+])
+
+# The final schema for the entire structure.
+diff_field = pa.field('diff', pa.list_(
+    pa.struct([
+        pa.field('type', pa.int64(), nullable=False, metadata={'description': 'The type of change (0: context, 1: addition, 2: deletion, etc.).'}),
+        pa.field('lineNumber', pa.int64(), nullable=True, metadata={'description': 'The line number of the change based on the `to` revision.'}),
+        pa.field('text', pa.string(), nullable=False, metadata={'description': 'The text of the line.'}),
+        pa.field('highlightRanges', pa.list_(highlight_range_struct), nullable=True, metadata={'description': 'Highlights to visually represent changes.'}),
+        pa.field('moveInfo', move_info_struct, nullable=True, metadata={'description': 'Visual indicators for paragraph location changes.'}),
+        pa.field('offset', offset_struct, nullable=False, metadata={'description': 'The location of the line in bytes from the beginning of the page.'})
+    ])
+))
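
To show how the pieces of the new schema fit together, here is a minimal sketch (not part of the commit; the row values are invented for illustration) that materializes a single diff record against it:

import pyarrow as pa
from diff_pyarrow_schema import diff_field

# One revision row whose `diff` column follows the schema above;
# the concrete values are made up.
rows = [{
    'diff': [{
        'type': 1,
        'lineNumber': 12,
        'text': 'Usagi Tsukino is the protagonist.',
        'highlightRanges': [{'start': 0, 'length': 5, 'type': 0}],
        'moveInfo': None,
        'offset': {'from': None, 'to': 348},
    }]
}]

table = pa.Table.from_pylist(rows, schema=pa.schema([diff_field]))
print(table.column('diff')[0])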

test/baseline_output/diff_sailormoon.parquet (new file)
Binary file not shown.

test/baseline_output/text_sailormoon.parquet (new file)
Binary file not shown.

wikiq (6 changed lines)
@@ -372,7 +372,9 @@ class WikiqParser:
         schema = schema.append(pa.field('tokens_window', pa.int64(), nullable=True))
 
         if self.output_parquet:
-            writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
+            pageid_sortingcol = pq.SortingColumn(schema.get_field_index('pageid'))
+            revid_sortingcol = pq.SortingColumn(schema.get_field_index('revid'))
+            writer = pq.ParquetWriter(self.output_file, schema, flavor='spark', sorting_columns=[pageid_sortingcol, revid_sortingcol])
         else:
             writer = pacsv.CSVWriter(self.output_file, schema, write_options=pacsv.WriteOptions(delimiter='\t'))

@@ -428,7 +430,7 @@ class WikiqParser:
             tokenizer=wikitext_split,
         )
 
-        # Collect the set of pages currently buffered in the table so we can run multi-page functions on them.
+        # Collect the set of revisions currently buffered in the table so we can run multi-revision functions on them.
         row_buffer = table.pop()
 
         if self.diff:
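
A note on the sorting metadata added above: `sorting_columns` is declarative; `ParquetWriter` records it in each row group's metadata but does not reorder rows, so wikiq must already emit rows sorted by `pageid` and then `revid`. As a minimal sketch (the file name is a placeholder, and this assumes a pyarrow version recent enough to expose `pq.SortingColumn`), a reader can recover the declared ordering like this:

import pyarrow.parquet as pq

# 'output.parquet' stands in for any file written by the parquet branch above.
metadata = pq.ParquetFile('output.parquet').metadata
for i in range(metadata.num_row_groups):
    # Each entry is a SortingColumn(column_index, descending, nulls_first).
    print(metadata.row_group(i).sorting_columns)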