diff --git a/diff_pyarrow_schema.py b/diff_pyarrow_schema.py
new file mode 100644
index 0000000..9117248
--- /dev/null
+++ b/diff_pyarrow_schema.py
@@ -0,0 +1,39 @@
+"""PyArrow schema for the nested `diff` field.
+
+`diff_field` is a list of per-line change structs; the helper structs below
+describe the objects nested inside each of those structs.
+"""
+
+import pyarrow as pa
+
+# Schema for the `highlightRanges` object, an array of which can be nested in a diff object.
+highlight_range_struct = pa.struct([
+    pa.field('start', pa.int64(), nullable=False, metadata={'description': 'Where the highlighted text should start, in bytes.'}),
+    pa.field('length', pa.int64(), nullable=False, metadata={'description': 'The length of the highlighted section, in bytes.'}),
+    pa.field('type', pa.int64(), nullable=False, metadata={'description': 'The type of highlight (0: addition, 1: deletion).'})
+])
+
+# Schema for the `moveInfo` object, which can be nested in a diff object.
+move_info_struct = pa.struct([
+    pa.field('id', pa.string(), nullable=False, metadata={'description': 'The ID of the paragraph.'}),
+    pa.field('linkId', pa.string(), nullable=False, metadata={'description': 'The ID of the corresponding paragraph.'}),
+    pa.field('linkDirection', pa.int64(), nullable=False, metadata={'description': 'Visual indicator of the relationship (0: lower, 1: higher).'})
+])
+
+# Schema for the `offset` object, which is required in a diff object.
+offset_struct = pa.struct([
+    pa.field('from', pa.int64(), nullable=True, metadata={'description': 'The first byte of the line in the `from` revision.'}),
+    pa.field('to', pa.int64(), nullable=True, metadata={'description': 'The first byte of the line in the `to` revision.'})
+])
+
+# The final schema for the entire structure.
+diff_field = pa.field('diff', pa.list_(
+    pa.struct([
+        pa.field('type', pa.int64(), nullable=False, metadata={'description': 'The type of change (0: context, 1: addition, 2: deletion, etc.).'}),
+        pa.field('lineNumber', pa.int64(), nullable=True, metadata={'description': 'The line number of the change based on the `to` revision.'}),
+        pa.field('text', pa.string(), nullable=False, metadata={'description': 'The text of the line.'}),
+        pa.field('highlightRanges', pa.list_(highlight_range_struct), nullable=True, metadata={'description': 'Highlights to visually represent changes.'}),
+        pa.field('moveInfo', move_info_struct, nullable=True, metadata={'description': 'Visual indicators for paragraph location changes.'}),
+        pa.field('offset', offset_struct, nullable=False, metadata={'description': 'The location of the line in bytes from the beginning of the page.'})
+    ])
+))
diff --git a/test/baseline_output/diff_sailormoon.parquet b/test/baseline_output/diff_sailormoon.parquet
new file mode 100644
index 0000000..e70cc1d
Binary files /dev/null and b/test/baseline_output/diff_sailormoon.parquet differ
diff --git a/test/baseline_output/text_sailormoon.parquet b/test/baseline_output/text_sailormoon.parquet
new file mode 100644
index 0000000..b131d58
Binary files /dev/null and b/test/baseline_output/text_sailormoon.parquet differ
diff --git a/wikiq b/wikiq
index 6addada..b1633da 100755
--- a/wikiq
+++ b/wikiq
@@ -372,7 +372,10 @@ class WikiqParser:
             schema = schema.append(pa.field('tokens_window', pa.int64(), nullable=True))

         if self.output_parquet:
-            writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
+            # Declare the (pageid, revid) sort order in the Parquet metadata; the writer does not sort rows itself.
+            pageid_sortingcol = pq.SortingColumn(schema.get_field_index('pageid'))
+            revid_sortingcol = pq.SortingColumn(schema.get_field_index('revid'))
+            writer = pq.ParquetWriter(self.output_file, schema, flavor='spark', sorting_columns=[pageid_sortingcol, revid_sortingcol])
         else:
             writer = pacsv.CSVWriter(self.output_file, schema,
                                      write_options=pacsv.WriteOptions(delimiter='\t'))
@@ -428,7 +431,7 @@ class WikiqParser:
                     tokenizer=wikitext_split,
                 )

-            # Collect the set of pages currently buffered in the table so we can run multi-page functions on them.
+            # Collect the set of revisions currently buffered in the table so we can run multi-revision functions on them.
             row_buffer = table.pop()

             if self.diff: