diff --git a/tables.py b/tables.py index 2c0d204..cadf2d9 100644 --- a/tables.py +++ b/tables.py @@ -33,6 +33,10 @@ class RevisionField(ABC, Generic[T]): :param revisions: The set of revisions to compute the field from. Revisions are passed in chronological order, so use revisions[-1] to access the most recent revision in the set. + + Implementations of extract should handle the case where revisions is + either a single revision (collapse-user=FALSE), or a full edit session + of contiguous edits by the same user (collapse-user=TRUE). """ pass diff --git a/wikiq b/wikiq index 75d8b0d..f1582e9 100755 --- a/wikiq +++ b/wikiq @@ -32,7 +32,7 @@ from deltas import SequenceMatcher, SegmentMatcher import pyarrow as pa import pyarrow.parquet as pq -import pyarrow.csv as pc +import pyarrow.csv as pacsv class PersistMethod: @@ -338,7 +338,7 @@ class WikiqParser: page_count = 0 rev_count = 0 - writer: Union[pq.ParquetWriter, pc.CSVWriter] + writer: Union[pq.ParquetWriter, pacsv.CSVWriter] schema = table.schema() schema = schema.append(pa.field('revert', pa.bool_(), nullable=True)) @@ -362,7 +362,7 @@ class WikiqParser: if self.output_parquet: writer = pq.ParquetWriter(self.output_file, schema, flavor='spark') else: - writer = pc.CSVWriter(self.output_file, schema, write_options=pc.WriteOptions(delimiter='\t')) + writer = pacsv.CSVWriter(self.output_file, schema, write_options=pacsv.WriteOptions(delimiter='\t')) regex_matches = {}