Add docs and rename import pc -> pacsv

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
Will Beason 2025-06-17 11:46:16 -05:00
parent 586ae85c65
commit 11d2587471
2 changed files with 7 additions and 3 deletions

View File

@ -33,6 +33,10 @@ class RevisionField(ABC, Generic[T]):
:param revisions: The set of revisions to compute the field from.
Revisions are passed in chronological order, so use revisions[-1] to
access the most recent revision in the set.
Implementations of extract should handle both cases: revisions containing
a single revision (collapse-user=FALSE), or a full edit session of
contiguous edits by the same user (collapse-user=TRUE).
"""
pass

6
wikiq
View File

@ -32,7 +32,7 @@ from deltas import SequenceMatcher, SegmentMatcher
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.csv as pc
import pyarrow.csv as pacsv
class PersistMethod:
@ -338,7 +338,7 @@ class WikiqParser:
page_count = 0
rev_count = 0
writer: Union[pq.ParquetWriter, pc.CSVWriter]
writer: Union[pq.ParquetWriter, pacsv.CSVWriter]
schema = table.schema()
schema = schema.append(pa.field('revert', pa.bool_(), nullable=True))
@ -362,7 +362,7 @@ class WikiqParser:
if self.output_parquet:
writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
else:
writer = pc.CSVWriter(self.output_file, schema, write_options=pc.WriteOptions(delimiter='\t'))
writer = pacsv.CSVWriter(self.output_file, schema, write_options=pacsv.WriteOptions(delimiter='\t'))
regex_matches = {}