Separate revision iteration and field collation logic

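This way we're not adding temporary fields (such as `collapsed_revs`) to
revision objects that don't normally have these fields: the page iterator now
yields each group of revisions, and the caller derives the count with
`len(revs)`.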

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
Will Beason 2025-05-30 14:09:16 -05:00
parent f9383440a0
commit 9ee5ecfc91

15
wikiq
View File

@ -110,16 +110,13 @@ class WikiqPage:
if not self.collapse_user: if not self.collapse_user:
for rev in self.mwpage: for rev in self.mwpage:
yield rev yield [rev]
return return
for _, revs in groupby(self.mwpage, self.user_text): for _, revs in groupby(self.mwpage, self.user_text):
# All revisions are either from the same user, or this is a single # All revisions are either from the same user, or this is a single
# revision where the user is missing. # revision where the user is missing.
revs = list(revs) yield revs
rev = revs[-1]
rev.collapsed_revs = len(revs)
yield rev
def __iter__(self): def __iter__(self):
return self.__revisions return self.__revisions
@ -461,8 +458,6 @@ class WikiqParser:
if self.output_parquet: if self.output_parquet:
writer = pq.ParquetWriter(self.output_file, self.schema, flavor='spark') writer = pq.ParquetWriter(self.output_file, self.schema, flavor='spark')
else: else:
print(self.output_file, file=sys.stderr)
print(self.schema, file=sys.stderr)
writer = pc.CSVWriter(self.output_file, self.schema, write_options=pc.WriteOptions(delimiter='\t')) writer = pc.CSVWriter(self.output_file, self.schema, write_options=pc.WriteOptions(delimiter='\t'))
# Iterate through pages # Iterate through pages
@ -497,7 +492,9 @@ class WikiqParser:
state = persistence.State() state = persistence.State()
# Iterate through a page's revisions # Iterate through a page's revisions
for rev in page: for revs in page:
revs = list(revs)
rev = revs[-1]
editorid = None if rev.deleted.user or rev.user.id is None else rev.user.id editorid = None if rev.deleted.user or rev.user.id is None else rev.user.id
# create a new data object instead of a dictionary. # create a new data object instead of a dictionary.
@ -554,7 +551,7 @@ class WikiqParser:
# if collapse user was on, let's run that # if collapse user was on, let's run that
if self.collapse_user: if self.collapse_user:
rev_data.collapsed_revs = rev.collapsed_revs rev_data.collapsed_revs = len(revs)
# get the # get the
if self.persist != PersistMethod.none: if self.persist != PersistMethod.none: