From 9ee5ecfc91b7b19546189a91c19f130f68cea55b Mon Sep 17 00:00:00 2001 From: Will Beason Date: Fri, 30 May 2025 14:09:16 -0500 Subject: [PATCH] Separate revision iteration and field collation logic This way we're not adding temporary fields to objects that don't normally have these fields. Signed-off-by: Will Beason --- wikiq | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/wikiq b/wikiq index 3346828..ba9f6c3 100755 --- a/wikiq +++ b/wikiq @@ -110,16 +110,13 @@ class WikiqPage: if not self.collapse_user: for rev in self.mwpage: - yield rev + yield [rev] return for _, revs in groupby(self.mwpage, self.user_text): # All revisions are either from the same user, or this is a single # revision where the user is missing. - revs = list(revs) - rev = revs[-1] - rev.collapsed_revs = len(revs) - yield rev + yield revs def __iter__(self): return self.__revisions @@ -461,8 +458,6 @@ class WikiqParser: if self.output_parquet: writer = pq.ParquetWriter(self.output_file, self.schema, flavor='spark') else: - print(self.output_file, file=sys.stderr) - print(self.schema, file=sys.stderr) writer = pc.CSVWriter(self.output_file, self.schema, write_options=pc.WriteOptions(delimiter='\t')) # Iterate through pages @@ -497,7 +492,9 @@ class WikiqParser: state = persistence.State() # Iterate through a page's revisions - for rev in page: + for revs in page: + revs = list(revs) + rev = revs[-1] editorid = None if rev.deleted.user or rev.user.id is None else rev.user.id # create a new data object instead of a dictionary. @@ -554,7 +551,7 @@ class WikiqParser: # if collapse user was on, let's run that if self.collapse_user: - rev_data.collapsed_revs = rev.collapsed_revs + rev_data.collapsed_revs = len(revs) # get the if self.persist != PersistMethod.none: