Refactor revision parsing logic to be columnar #1
@ -33,6 +33,10 @@ class RevisionField(ABC, Generic[T]):
|
||||
:param revisions: The set of revisions to compute the field from.
|
||||
Revisions are passed in chronological order, so use revisions[-1] to
|
||||
access the most recent revision in the set.
|
||||
|
||||
Implementations of extract should handle the case where revisions is
|
||||
either a single revision (collapse-user=FALSE), or a full edit session
|
||||
of contiguous edits by the same user (collapse-user=TRUE).
|
||||
"""
|
||||
pass
|
||||
|
||||
|
6
wikiq
6
wikiq
@ -32,7 +32,7 @@ from deltas import SequenceMatcher, SegmentMatcher
|
||||
|
||||
import pyarrow as pa
|
||||
import pyarrow.parquet as pq
|
||||
import pyarrow.csv as pc
|
||||
import pyarrow.csv as pacsv
|
||||
beason marked this conversation as resolved
Outdated
|
||||
|
||||
|
||||
class PersistMethod:
|
||||
@ -338,7 +338,7 @@ class WikiqParser:
|
||||
page_count = 0
|
||||
rev_count = 0
|
||||
|
||||
writer: Union[pq.ParquetWriter, pc.CSVWriter]
|
||||
writer: Union[pq.ParquetWriter, pacsv.CSVWriter]
|
||||
|
||||
schema = table.schema()
|
||||
schema = schema.append(pa.field('revert', pa.bool_(), nullable=True))
|
||||
@ -362,7 +362,7 @@ class WikiqParser:
|
||||
if self.output_parquet:
|
||||
groceryheist
commented
Curious why this is `page.mwpage.namespace` now instead of the old logic.
beason
commented
This keeps all data about the page contained in one object rather than split across both the "page" and "mwpage" fields. This means all of the columnar functions can use an interface that requires minimal information, and it removes the need to propagate information from mwpage to page.
|
||||
writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
|
||||
else:
|
||||
writer = pc.CSVWriter(self.output_file, schema, write_options=pc.WriteOptions(delimiter='\t'))
|
||||
writer = pacsv.CSVWriter(self.output_file, schema, write_options=pacsv.WriteOptions(delimiter='\t'))
|
||||
|
||||
regex_matches = {}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user
I usually use `pc` for `pyarrow.compute`. Maybe `pacsv` instead?