Refactor revision parsing logic to be columnar #1

Merged
beason merged 27 commits from test-parquet into parquet_support 2025-06-17 18:22:26 +00:00
Showing only changes of commit 9ee5ecfc91 - Show all commits

15
wikiq
View File

@@ -110,16 +110,13 @@ class WikiqPage:
if not self.collapse_user:
for rev in self.mwpage:
-yield rev
+yield [rev]
return
for _, revs in groupby(self.mwpage, self.user_text):
# All revisions are either from the same user, or this is a single
# revision where the user is missing.
revs = list(revs)
-rev = revs[-1]
-rev.collapsed_revs = len(revs)
-yield rev
+yield revs
def __iter__(self):
return self.__revisions
@@ -461,8 +458,6 @@ class WikiqParser:
if self.output_parquet:
writer = pq.ParquetWriter(self.output_file, self.schema, flavor='spark')
else:
-print(self.output_file, file=sys.stderr)
-print(self.schema, file=sys.stderr)
writer = pc.CSVWriter(self.output_file, self.schema, write_options=pc.WriteOptions(delimiter='\t'))
# Iterate through pages
@@ -497,7 +492,9 @@ class WikiqParser:
state = persistence.State()
# Iterate through a page's revisions
-for rev in page:
+for revs in page:
+revs = list(revs)
+rev = revs[-1]
editorid = None if rev.deleted.user or rev.user.id is None else rev.user.id
# create a new data object instead of a dictionary.
@@ -554,7 +551,7 @@ class WikiqParser:
# if collapse user was on, let's run that
if self.collapse_user:
-rev_data.collapsed_revs = rev.collapsed_revs
+rev_data.collapsed_revs = len(revs)
# get the
if self.persist != PersistMethod.none: