Add docs and rename import pc -> pacsv

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
Will Beason 2025-06-17 11:46:16 -05:00
parent 586ae85c65
commit 11d2587471
2 changed files with 7 additions and 3 deletions

View File

@@ -33,6 +33,10 @@ class RevisionField(ABC, Generic[T]):
:param revisions: The set of revisions to compute the field from. :param revisions: The set of revisions to compute the field from.
Revisions are passed in chronological order, so use revisions[-1] to Revisions are passed in chronological order, so use revisions[-1] to
access the most recent revision in the set. access the most recent revision in the set.
Implementations of extract should handle both cases: revisions may hold
a single revision (collapse-user=FALSE) or a full edit session of
contiguous edits by the same user (collapse-user=TRUE).
""" """
pass pass

6
wikiq
View File

@@ -32,7 +32,7 @@ from deltas import SequenceMatcher, SegmentMatcher
import pyarrow as pa import pyarrow as pa
import pyarrow.parquet as pq import pyarrow.parquet as pq
import pyarrow.csv as pc import pyarrow.csv as pacsv
class PersistMethod: class PersistMethod:
@@ -338,7 +338,7 @@ class WikiqParser:
page_count = 0 page_count = 0
rev_count = 0 rev_count = 0
writer: Union[pq.ParquetWriter, pc.CSVWriter] writer: Union[pq.ParquetWriter, pacsv.CSVWriter]
schema = table.schema() schema = table.schema()
schema = schema.append(pa.field('revert', pa.bool_(), nullable=True)) schema = schema.append(pa.field('revert', pa.bool_(), nullable=True))
@@ -362,7 +362,7 @@ class WikiqParser:
if self.output_parquet: if self.output_parquet:
writer = pq.ParquetWriter(self.output_file, schema, flavor='spark') writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
else: else:
writer = pc.CSVWriter(self.output_file, schema, write_options=pc.WriteOptions(delimiter='\t')) writer = pacsv.CSVWriter(self.output_file, schema, write_options=pacsv.WriteOptions(delimiter='\t'))
regex_matches = {} regex_matches = {}