Separate revision iteration and field collation logic
This way we're not adding temporary fields to objects that don't normally have these fields.

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
parent
f9383440a0
commit
9ee5ecfc91
15
wikiq
15
wikiq
@@ -110,16 +110,13 @@ class WikiqPage:
|
||||
|
||||
if not self.collapse_user:
|
||||
for rev in self.mwpage:
|
||||
yield rev
|
||||
yield [rev]
|
||||
return
|
||||
|
||||
for _, revs in groupby(self.mwpage, self.user_text):
|
||||
# All revisions are either from the same user, or this is a single
|
||||
# revision where the user is missing.
|
||||
revs = list(revs)
|
||||
rev = revs[-1]
|
||||
rev.collapsed_revs = len(revs)
|
||||
yield rev
|
||||
yield revs
|
||||
|
||||
def __iter__(self):
|
||||
return self.__revisions
|
||||
@@ -461,8 +458,6 @@ class WikiqParser:
|
||||
if self.output_parquet:
|
||||
writer = pq.ParquetWriter(self.output_file, self.schema, flavor='spark')
|
||||
else:
|
||||
print(self.output_file, file=sys.stderr)
|
||||
print(self.schema, file=sys.stderr)
|
||||
writer = pc.CSVWriter(self.output_file, self.schema, write_options=pc.WriteOptions(delimiter='\t'))
|
||||
|
||||
# Iterate through pages
|
||||
@@ -497,7 +492,9 @@ class WikiqParser:
|
||||
state = persistence.State()
|
||||
|
||||
# Iterate through a page's revisions
|
||||
for rev in page:
|
||||
for revs in page:
|
||||
revs = list(revs)
|
||||
rev = revs[-1]
|
||||
|
||||
editorid = None if rev.deleted.user or rev.user.id is None else rev.user.id
|
||||
# create a new data object instead of a dictionary.
|
||||
@@ -554,7 +551,7 @@ class WikiqParser:
|
||||
|
||||
# if collapse user was on, let's run that
|
||||
if self.collapse_user:
|
||||
rev_data.collapsed_revs = rev.collapsed_revs
|
||||
rev_data.collapsed_revs = len(revs)
|
||||
|
||||
# get the
|
||||
if self.persist != PersistMethod.none:
|
||||
|
Loading…
Reference in New Issue
Block a user