Separate revision iteration and field collation logic
This way we avoid attaching temporary attributes (such as `collapsed_revs`) to revision objects that don't normally have them.

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
parent
f9383440a0
commit
9ee5ecfc91
15
wikiq
15
wikiq
@ -110,16 +110,13 @@ class WikiqPage:
|
|||||||
|
|
||||||
if not self.collapse_user:
|
if not self.collapse_user:
|
||||||
for rev in self.mwpage:
|
for rev in self.mwpage:
|
||||||
yield rev
|
yield [rev]
|
||||||
return
|
return
|
||||||
|
|
||||||
for _, revs in groupby(self.mwpage, self.user_text):
|
for _, revs in groupby(self.mwpage, self.user_text):
|
||||||
# All revisions are either from the same user, or this is a single
|
# All revisions are either from the same user, or this is a single
|
||||||
# revision where the user is missing.
|
# revision where the user is missing.
|
||||||
revs = list(revs)
|
yield revs
|
||||||
rev = revs[-1]
|
|
||||||
rev.collapsed_revs = len(revs)
|
|
||||||
yield rev
|
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
return self.__revisions
|
return self.__revisions
|
||||||
@ -461,8 +458,6 @@ class WikiqParser:
|
|||||||
if self.output_parquet:
|
if self.output_parquet:
|
||||||
writer = pq.ParquetWriter(self.output_file, self.schema, flavor='spark')
|
writer = pq.ParquetWriter(self.output_file, self.schema, flavor='spark')
|
||||||
else:
|
else:
|
||||||
print(self.output_file, file=sys.stderr)
|
|
||||||
print(self.schema, file=sys.stderr)
|
|
||||||
writer = pc.CSVWriter(self.output_file, self.schema, write_options=pc.WriteOptions(delimiter='\t'))
|
writer = pc.CSVWriter(self.output_file, self.schema, write_options=pc.WriteOptions(delimiter='\t'))
|
||||||
|
|
||||||
# Iterate through pages
|
# Iterate through pages
|
||||||
@ -497,7 +492,9 @@ class WikiqParser:
|
|||||||
state = persistence.State()
|
state = persistence.State()
|
||||||
|
|
||||||
# Iterate through a page's revisions
|
# Iterate through a page's revisions
|
||||||
for rev in page:
|
for revs in page:
|
||||||
|
revs = list(revs)
|
||||||
|
rev = revs[-1]
|
||||||
|
|
||||||
editorid = None if rev.deleted.user or rev.user.id is None else rev.user.id
|
editorid = None if rev.deleted.user or rev.user.id is None else rev.user.id
|
||||||
# create a new data object instead of a dictionary.
|
# create a new data object instead of a dictionary.
|
||||||
@ -554,7 +551,7 @@ class WikiqParser:
|
|||||||
|
|
||||||
# if collapse user was on, let's run that
|
# if collapse user was on, let's run that
|
||||||
if self.collapse_user:
|
if self.collapse_user:
|
||||||
rev_data.collapsed_revs = rev.collapsed_revs
|
rev_data.collapsed_revs = len(revs)
|
||||||
|
|
||||||
# get the
|
# get the
|
||||||
if self.persist != PersistMethod.none:
|
if self.persist != PersistMethod.none:
|
||||||
|
Loading…
Reference in New Issue
Block a user