Refactor collapse user logic
Use a simple loop when we aren't collapsing users. Add a test that covers the case where users are deleted. Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
parent
c0e629a313
commit
aec6e5fafa
@ -177,6 +177,19 @@ class WikiqTestCase(unittest.TestCase):
|
||||
baseline = pd.read_table(tester.baseline_file)
|
||||
assert_frame_equal(test, baseline, check_like=True)
|
||||
|
||||
def test_WP_collapse_user(self):
    """Run wikiq with ``--collapse-user`` and compare against the baseline.

    The wikiq output for the IKWIKI input is copied to the tester's test
    file, loaded as a table, and checked against the stored baseline
    table (column/row order is ignored via ``check_like=True``).
    """
    runner = WikiqTester(IKWIKI, "collapse_user")

    try:
        runner.call_wikiq("--collapse-user")
    except subprocess.CalledProcessError as err:
        # Report the subprocess's captured stderr as the failure message.
        self.fail(err.stderr.decode("utf8"))

    copyfile(runner.call_output, runner.test_file)

    actual = pd.read_table(runner.test_file)
    expected = pd.read_table(runner.baseline_file)
    assert_frame_equal(actual, expected, check_like=True)
|
||||
|
||||
def test_noargs(self):
|
||||
tester = WikiqTester(SAILORMOON, "noargs", in_compression="7z")
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
73
wikiq
73
wikiq
@ -102,35 +102,28 @@ class WikiqPage:
|
||||
# 3 A B True
|
||||
# 4 A A False
|
||||
# Post-loop A Always
|
||||
collapsed_revs = 0
|
||||
for i, rev in enumerate(self.mwpage):
|
||||
# never yield the first time
|
||||
if i == 0:
|
||||
if self.collapse_user:
|
||||
collapsed_revs = 1
|
||||
rev.collapsed_revs = collapsed_revs
|
||||
|
||||
if not self.collapse_user:
|
||||
for rev in self.mwpage:
|
||||
yield rev
|
||||
return
|
||||
|
||||
collapsed_revs = 1
|
||||
prev_rev = next(self.mwpage)
|
||||
prev_rev.collapsed_revs = collapsed_revs
|
||||
|
||||
for rev in self.mwpage:
|
||||
# yield if this is the last edit in a seq by a user and reset
|
||||
# also yield if we don't know who the user is
|
||||
|
||||
if rev.deleted.user or prev_rev.deleted.user or rev.user.text != prev_rev.user.text:
|
||||
yield prev_rev
|
||||
collapsed_revs = 1
|
||||
rev.collapsed_revs = collapsed_revs
|
||||
else:
|
||||
if self.collapse_user:
|
||||
# yield if this is the last edit in a seq by a user and reset
|
||||
# also yield if we do know who the user is
|
||||
|
||||
if rev.deleted.user or prev_rev.deleted.user:
|
||||
yield prev_rev
|
||||
collapsed_revs = 1
|
||||
rev.collapsed_revs = collapsed_revs
|
||||
|
||||
elif not rev.user.text == prev_rev.user.text:
|
||||
yield prev_rev
|
||||
collapsed_revs = 1
|
||||
rev.collapsed_revs = collapsed_revs
|
||||
# otherwise, add one to the counter
|
||||
else:
|
||||
collapsed_revs += 1
|
||||
rev.collapsed_revs = collapsed_revs
|
||||
# if collapse_user is false, we always yield
|
||||
else:
|
||||
yield prev_rev
|
||||
# Otherwise, collapse revision.
|
||||
collapsed_revs += 1
|
||||
rev.collapsed_revs = collapsed_revs
|
||||
|
||||
prev_rev = rev
|
||||
|
||||
@ -239,7 +232,7 @@ The RevDataBase type has all the fields that will be output no matter how wikiq
|
||||
|
||||
|
||||
@dataclass()
|
||||
class RevDataBase:
|
||||
class Revision:
|
||||
revid: int
|
||||
date_time: datetime
|
||||
articleid: int
|
||||
@ -247,13 +240,13 @@ class RevDataBase:
|
||||
title: str
|
||||
namespace: int
|
||||
deleted: bool
|
||||
text_chars: int = None
|
||||
revert: bool = None
|
||||
text_chars: int | None = None
|
||||
revert: bool | None = None
|
||||
reverteds: list[int] = None
|
||||
sha1: str = None
|
||||
minor: bool = None
|
||||
editor: str = None
|
||||
anon: bool = None
|
||||
sha1: str | None = None
|
||||
minor: bool | None = None
|
||||
editor: str | None = None
|
||||
anon: bool | None = None
|
||||
|
||||
# toggles url encoding. this isn't a dataclass field since it doesn't have a type annotation
|
||||
urlencode = False
|
||||
@ -332,11 +325,11 @@ It just adds a new field and updates the pyarrow schema.
|
||||
|
||||
|
||||
@dataclass()
|
||||
class RevDataCollapse(RevDataBase):
|
||||
class RevDataCollapse(Revision):
|
||||
collapsed_revs: int = None
|
||||
|
||||
pa_collapsed_revs_schema = pa.field('collapsed_revs', pa.int64())
|
||||
pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema]
|
||||
pa_schema_fields = Revision.pa_schema_fields + [pa_collapsed_revs_schema]
|
||||
|
||||
|
||||
"""
|
||||
@ -347,7 +340,7 @@ If persistence data is to be computed we'll need the fields added by RevDataPers
|
||||
|
||||
|
||||
@dataclass()
|
||||
class RevDataPersistence(RevDataBase):
|
||||
class RevDataPersistence(Revision):
|
||||
token_revs: int = None
|
||||
tokens_added: int = None
|
||||
tokens_removed: int = None
|
||||
@ -359,7 +352,7 @@ class RevDataPersistence(RevDataBase):
|
||||
pa.field("tokens_removed", pa.int64()),
|
||||
pa.field("tokens_window", pa.int64())]
|
||||
|
||||
pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields
|
||||
pa_schema_fields = Revision.pa_schema_fields + pa_persistence_schema_fields
|
||||
|
||||
|
||||
"""
|
||||
@ -394,7 +387,7 @@ class WikiqParser:
|
||||
"""
|
||||
self.input_file = input_file
|
||||
|
||||
self.collapse_user = collapse_user
|
||||
self.collapse_user: bool = collapse_user
|
||||
self.persist: int = persist
|
||||
self.namespaces = []
|
||||
self.urlencode: bool = urlencode
|
||||
@ -419,7 +412,7 @@ class WikiqParser:
|
||||
elif self.persist != PersistMethod.none:
|
||||
revdata_type = RevDataPersistence
|
||||
else:
|
||||
revdata_type = RevDataBase
|
||||
revdata_type = Revision
|
||||
|
||||
# if there are regex fields, we need to add them to the revdata type.
|
||||
regex_fields = [(field.name, list[str], dc.field(default=None)) for field in self.regex_schemas]
|
||||
|
Loading…
Reference in New Issue
Block a user