mediawiki_dump_tools/diff_pyarrow_schema.py
2025-07-07 19:08:31 -07:00

34 lines
2.3 KiB
Python

import pyarrow as pa
# Arrow struct type for one entry of `highlightRanges`; a diff object may nest
# a list of these. Every member is a required (non-nullable) int64, so the
# fields are generated from a (name, description) table.
highlight_range_struct = pa.struct([
    pa.field(name, pa.int64(), nullable=False, metadata={'description': description})
    for name, description in (
        ('start', 'Where the highlighted text should start, in bytes.'),
        ('length', 'The length of the highlighted section, in bytes.'),
        ('type', 'The type of highlight (0: addition, 1: deletion).'),
    )
])
# Arrow struct type for the `moveInfo` object that a diff object may nest.
# All three members are required (non-nullable); rows are
# (name, Arrow type, description).
move_info_struct = pa.struct([
    pa.field(name, arrow_type, nullable=False, metadata={'description': description})
    for name, arrow_type, description in (
        ('id', pa.string(), 'The ID of the paragraph.'),
        ('linkId', pa.string(), 'The ID of the corresponding paragraph.'),
        ('linkDirection', pa.int64(), 'Visual indicator of the relationship (0: lower, 1: higher).'),
    )
])
# Arrow struct type for the `offset` object, which every diff object must
# carry. Both members are nullable int64 byte positions, one per revision side.
offset_struct = pa.struct([
    pa.field(name, pa.int64(), nullable=True, metadata={'description': description})
    for name, description in (
        ('from', 'The first byte of the line in the `from` revision.'),
        ('to', 'The first byte of the line in the `to` revision.'),
    )
])
# Top-level `diff` field: a list of per-line diff structs that combines the
# nested struct types defined earlier in this module. `type`, `text` and
# `offset` are required; `lineNumber`, `highlightRanges` and `moveInfo` may be
# null.
diff_field = pa.field(
    'diff',
    pa.list_(
        pa.struct([
            pa.field(
                'type',
                pa.int64(),
                nullable=False,
                metadata={'description': 'The type of change (0: context, 1: addition, 2: deletion, etc.).'},
            ),
            pa.field(
                'lineNumber',
                pa.int64(),
                nullable=True,
                metadata={'description': 'The line number of the change based on the `to` revision.'},
            ),
            pa.field(
                'text',
                pa.string(),
                nullable=False,
                metadata={'description': 'The text of the line.'},
            ),
            pa.field(
                'highlightRanges',
                pa.list_(highlight_range_struct),
                nullable=True,
                metadata={'description': 'Highlights to visually represent changes.'},
            ),
            pa.field(
                'moveInfo',
                move_info_struct,
                nullable=True,
                metadata={'description': 'Visual indicators for paragraph location changes.'},
            ),
            pa.field(
                'offset',
                offset_struct,
                nullable=False,
                metadata={'description': 'The location of the line in bytes from the beginning of the page.'},
            ),
        ])
    ),
)