Refactor collapse user logic

Use a simple loop when we aren't collapsing users.
Add a test which covers the case when users are deleted.

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
Will Beason 2025-05-29 15:20:34 -05:00
parent c0e629a313
commit aec6e5fafa
3 changed files with 20292 additions and 40 deletions

View File

@ -177,6 +177,19 @@ class WikiqTestCase(unittest.TestCase):
baseline = pd.read_table(tester.baseline_file)
assert_frame_equal(test, baseline, check_like=True)
def test_WP_collapse_user(self):
tester = WikiqTester(IKWIKI, "collapse_user")
try:
tester.call_wikiq("--collapse-user")
except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8"))
copyfile(tester.call_output, tester.test_file)
test = pd.read_table(tester.test_file)
baseline = pd.read_table(tester.baseline_file)
assert_frame_equal(test, baseline, check_like=True)
def test_noargs(self):
tester = WikiqTester(SAILORMOON, "noargs", in_compression="7z")

File diff suppressed because it is too large Load Diff

73
wikiq
View File

@ -102,35 +102,28 @@ class WikiqPage:
# 3 A B True
# 4 A A False
# Post-loop A Always
collapsed_revs = 0
for i, rev in enumerate(self.mwpage):
# never yield the first time
if i == 0:
if self.collapse_user:
collapsed_revs = 1
rev.collapsed_revs = collapsed_revs
if not self.collapse_user:
for rev in self.mwpage:
yield rev
return
collapsed_revs = 1
prev_rev = next(self.mwpage)
prev_rev.collapsed_revs = collapsed_revs
for rev in self.mwpage:
# yield if this is the last edit in a seq by a user and reset
# also yield if we don't know who the user is
if rev.deleted.user or prev_rev.deleted.user or rev.user.text != prev_rev.user.text:
yield prev_rev
collapsed_revs = 1
rev.collapsed_revs = collapsed_revs
else:
if self.collapse_user:
# yield if this is the last edit in a seq by a user and reset
# also yield if we do know who the user is
if rev.deleted.user or prev_rev.deleted.user:
yield prev_rev
collapsed_revs = 1
rev.collapsed_revs = collapsed_revs
elif not rev.user.text == prev_rev.user.text:
yield prev_rev
collapsed_revs = 1
rev.collapsed_revs = collapsed_revs
# otherwise, add one to the counter
else:
collapsed_revs += 1
rev.collapsed_revs = collapsed_revs
# if collapse_user is false, we always yield
else:
yield prev_rev
# Otherwise, collapse revision.
collapsed_revs += 1
rev.collapsed_revs = collapsed_revs
prev_rev = rev
@ -239,7 +232,7 @@ The RevDataBase type has all the fields that will be output no matter how wikiq
@dataclass()
class RevDataBase:
class Revision:
revid: int
date_time: datetime
articleid: int
@ -247,13 +240,13 @@ class RevDataBase:
title: str
namespace: int
deleted: bool
text_chars: int = None
revert: bool = None
text_chars: int | None = None
revert: bool | None = None
reverteds: list[int] = None
sha1: str = None
minor: bool = None
editor: str = None
anon: bool = None
sha1: str | None = None
minor: bool | None = None
editor: str | None = None
anon: bool | None = None
# toggles url encoding. this isn't a dataclass field since it doesn't have a type annotation
urlencode = False
@ -332,11 +325,11 @@ It just adds a new field and updates the pyarrow schema.
@dataclass()
class RevDataCollapse(RevDataBase):
class RevDataCollapse(Revision):
collapsed_revs: int = None
pa_collapsed_revs_schema = pa.field('collapsed_revs', pa.int64())
pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema]
pa_schema_fields = Revision.pa_schema_fields + [pa_collapsed_revs_schema]
"""
@ -347,7 +340,7 @@ If persistence data is to be computed we'll need the fields added by RevDataPers
@dataclass()
class RevDataPersistence(RevDataBase):
class RevDataPersistence(Revision):
token_revs: int = None
tokens_added: int = None
tokens_removed: int = None
@ -359,7 +352,7 @@ class RevDataPersistence(RevDataBase):
pa.field("tokens_removed", pa.int64()),
pa.field("tokens_window", pa.int64())]
pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields
pa_schema_fields = Revision.pa_schema_fields + pa_persistence_schema_fields
"""
@ -394,7 +387,7 @@ class WikiqParser:
"""
self.input_file = input_file
self.collapse_user = collapse_user
self.collapse_user: bool = collapse_user
self.persist: int = persist
self.namespaces = []
self.urlencode: bool = urlencode
@ -419,7 +412,7 @@ class WikiqParser:
elif self.persist != PersistMethod.none:
revdata_type = RevDataPersistence
else:
revdata_type = RevDataBase
revdata_type = Revision
# if there are regex fields, we need to add them to the revdata type.
regex_fields = [(field.name, list[str], dc.field(default=None)) for field in self.regex_schemas]