Add docs and rename import pc -> pacsv
Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
parent
586ae85c65
commit
11d2587471
@ -33,6 +33,10 @@ class RevisionField(ABC, Generic[T]):
|
|||||||
:param revisions: The set of revisions to compute the field from.
|
:param revisions: The set of revisions to compute the field from.
|
||||||
Revisions are passed in chronological order, so use revisions[-1] to
|
Revisions are passed in chronological order, so use revisions[-1] to
|
||||||
access the most recent revision in the set.
|
access the most recent revision in the set.
|
||||||
|
|
||||||
|
Implementations of extract should handle the case where revisions is
|
||||||
|
either a single revision (collapse-user=FALSE), or a full edit session
|
||||||
|
of contiguous edits by the same user (collapse-user=TRUE).
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
6
wikiq
6
wikiq
@ -32,7 +32,7 @@ from deltas import SequenceMatcher, SegmentMatcher
|
|||||||
|
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
import pyarrow.parquet as pq
|
import pyarrow.parquet as pq
|
||||||
import pyarrow.csv as pc
|
import pyarrow.csv as pacsv
|
||||||
|
|
||||||
|
|
||||||
class PersistMethod:
|
class PersistMethod:
|
||||||
@ -338,7 +338,7 @@ class WikiqParser:
|
|||||||
page_count = 0
|
page_count = 0
|
||||||
rev_count = 0
|
rev_count = 0
|
||||||
|
|
||||||
writer: Union[pq.ParquetWriter, pc.CSVWriter]
|
writer: Union[pq.ParquetWriter, pacsv.CSVWriter]
|
||||||
|
|
||||||
schema = table.schema()
|
schema = table.schema()
|
||||||
schema = schema.append(pa.field('revert', pa.bool_(), nullable=True))
|
schema = schema.append(pa.field('revert', pa.bool_(), nullable=True))
|
||||||
@ -362,7 +362,7 @@ class WikiqParser:
|
|||||||
if self.output_parquet:
|
if self.output_parquet:
|
||||||
writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
|
writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
|
||||||
else:
|
else:
|
||||||
writer = pc.CSVWriter(self.output_file, schema, write_options=pc.WriteOptions(delimiter='\t'))
|
writer = pacsv.CSVWriter(self.output_file, schema, write_options=pacsv.WriteOptions(delimiter='\t'))
|
||||||
|
|
||||||
regex_matches = {}
|
regex_matches = {}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user