Refactor collapse user logic
Use a simple loop when we aren't collapsing users. Add a test that covers the case where users are deleted. Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
parent
c0e629a313
commit
aec6e5fafa
@ -177,6 +177,19 @@ class WikiqTestCase(unittest.TestCase):
|
|||||||
baseline = pd.read_table(tester.baseline_file)
|
baseline = pd.read_table(tester.baseline_file)
|
||||||
assert_frame_equal(test, baseline, check_like=True)
|
assert_frame_equal(test, baseline, check_like=True)
|
||||||
|
|
||||||
|
def test_WP_collapse_user(self):
|
||||||
|
tester = WikiqTester(IKWIKI, "collapse_user")
|
||||||
|
|
||||||
|
try:
|
||||||
|
tester.call_wikiq("--collapse-user")
|
||||||
|
except subprocess.CalledProcessError as exc:
|
||||||
|
self.fail(exc.stderr.decode("utf8"))
|
||||||
|
|
||||||
|
copyfile(tester.call_output, tester.test_file)
|
||||||
|
test = pd.read_table(tester.test_file)
|
||||||
|
baseline = pd.read_table(tester.baseline_file)
|
||||||
|
assert_frame_equal(test, baseline, check_like=True)
|
||||||
|
|
||||||
def test_noargs(self):
|
def test_noargs(self):
|
||||||
tester = WikiqTester(SAILORMOON, "noargs", in_compression="7z")
|
tester = WikiqTester(SAILORMOON, "noargs", in_compression="7z")
|
||||||
|
|
||||||
|
File diff suppressed because it is too large
Load Diff
73
wikiq
73
wikiq
@ -102,35 +102,28 @@ class WikiqPage:
|
|||||||
# 3 A B True
|
# 3 A B True
|
||||||
# 4 A A False
|
# 4 A A False
|
||||||
# Post-loop A Always
|
# Post-loop A Always
|
||||||
collapsed_revs = 0
|
|
||||||
for i, rev in enumerate(self.mwpage):
|
|
||||||
# never yield the first time
|
|
||||||
if i == 0:
|
|
||||||
if self.collapse_user:
|
|
||||||
collapsed_revs = 1
|
|
||||||
rev.collapsed_revs = collapsed_revs
|
|
||||||
|
|
||||||
|
if not self.collapse_user:
|
||||||
|
for rev in self.mwpage:
|
||||||
|
yield rev
|
||||||
|
return
|
||||||
|
|
||||||
|
collapsed_revs = 1
|
||||||
|
prev_rev = next(self.mwpage)
|
||||||
|
prev_rev.collapsed_revs = collapsed_revs
|
||||||
|
|
||||||
|
for rev in self.mwpage:
|
||||||
|
# yield if this is the last edit in a seq by a user and reset
|
||||||
|
# also yield if we don't know who the user is
|
||||||
|
|
||||||
|
if rev.deleted.user or prev_rev.deleted.user or rev.user.text != prev_rev.user.text:
|
||||||
|
yield prev_rev
|
||||||
|
collapsed_revs = 1
|
||||||
|
rev.collapsed_revs = collapsed_revs
|
||||||
else:
|
else:
|
||||||
if self.collapse_user:
|
# Otherwise, collapse revision.
|
||||||
# yield if this is the last edit in a seq by a user and reset
|
collapsed_revs += 1
|
||||||
# also yield if we do know who the user is
|
rev.collapsed_revs = collapsed_revs
|
||||||
|
|
||||||
if rev.deleted.user or prev_rev.deleted.user:
|
|
||||||
yield prev_rev
|
|
||||||
collapsed_revs = 1
|
|
||||||
rev.collapsed_revs = collapsed_revs
|
|
||||||
|
|
||||||
elif not rev.user.text == prev_rev.user.text:
|
|
||||||
yield prev_rev
|
|
||||||
collapsed_revs = 1
|
|
||||||
rev.collapsed_revs = collapsed_revs
|
|
||||||
# otherwise, add one to the counter
|
|
||||||
else:
|
|
||||||
collapsed_revs += 1
|
|
||||||
rev.collapsed_revs = collapsed_revs
|
|
||||||
# if collapse_user is false, we always yield
|
|
||||||
else:
|
|
||||||
yield prev_rev
|
|
||||||
|
|
||||||
prev_rev = rev
|
prev_rev = rev
|
||||||
|
|
||||||
@ -239,7 +232,7 @@ The RevDataBase type has all the fields that will be output no matter how wikiq
|
|||||||
|
|
||||||
|
|
||||||
@dataclass()
|
@dataclass()
|
||||||
class RevDataBase:
|
class Revision:
|
||||||
revid: int
|
revid: int
|
||||||
date_time: datetime
|
date_time: datetime
|
||||||
articleid: int
|
articleid: int
|
||||||
@ -247,13 +240,13 @@ class RevDataBase:
|
|||||||
title: str
|
title: str
|
||||||
namespace: int
|
namespace: int
|
||||||
deleted: bool
|
deleted: bool
|
||||||
text_chars: int = None
|
text_chars: int | None = None
|
||||||
revert: bool = None
|
revert: bool | None = None
|
||||||
reverteds: list[int] = None
|
reverteds: list[int] = None
|
||||||
sha1: str = None
|
sha1: str | None = None
|
||||||
minor: bool = None
|
minor: bool | None = None
|
||||||
editor: str = None
|
editor: str | None = None
|
||||||
anon: bool = None
|
anon: bool | None = None
|
||||||
|
|
||||||
# toggles url encoding. this isn't a dataclass field since it doesn't have a type annotation
|
# toggles url encoding. this isn't a dataclass field since it doesn't have a type annotation
|
||||||
urlencode = False
|
urlencode = False
|
||||||
@ -332,11 +325,11 @@ It just adds a new field and updates the pyarrow schema.
|
|||||||
|
|
||||||
|
|
||||||
@dataclass()
|
@dataclass()
|
||||||
class RevDataCollapse(RevDataBase):
|
class RevDataCollapse(Revision):
|
||||||
collapsed_revs: int = None
|
collapsed_revs: int = None
|
||||||
|
|
||||||
pa_collapsed_revs_schema = pa.field('collapsed_revs', pa.int64())
|
pa_collapsed_revs_schema = pa.field('collapsed_revs', pa.int64())
|
||||||
pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema]
|
pa_schema_fields = Revision.pa_schema_fields + [pa_collapsed_revs_schema]
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
@ -347,7 +340,7 @@ If persistence data is to be computed we'll need the fields added by RevDataPers
|
|||||||
|
|
||||||
|
|
||||||
@dataclass()
|
@dataclass()
|
||||||
class RevDataPersistence(RevDataBase):
|
class RevDataPersistence(Revision):
|
||||||
token_revs: int = None
|
token_revs: int = None
|
||||||
tokens_added: int = None
|
tokens_added: int = None
|
||||||
tokens_removed: int = None
|
tokens_removed: int = None
|
||||||
@ -359,7 +352,7 @@ class RevDataPersistence(RevDataBase):
|
|||||||
pa.field("tokens_removed", pa.int64()),
|
pa.field("tokens_removed", pa.int64()),
|
||||||
pa.field("tokens_window", pa.int64())]
|
pa.field("tokens_window", pa.int64())]
|
||||||
|
|
||||||
pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields
|
pa_schema_fields = Revision.pa_schema_fields + pa_persistence_schema_fields
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
@ -394,7 +387,7 @@ class WikiqParser:
|
|||||||
"""
|
"""
|
||||||
self.input_file = input_file
|
self.input_file = input_file
|
||||||
|
|
||||||
self.collapse_user = collapse_user
|
self.collapse_user: bool = collapse_user
|
||||||
self.persist: int = persist
|
self.persist: int = persist
|
||||||
self.namespaces = []
|
self.namespaces = []
|
||||||
self.urlencode: bool = urlencode
|
self.urlencode: bool = urlencode
|
||||||
@ -419,7 +412,7 @@ class WikiqParser:
|
|||||||
elif self.persist != PersistMethod.none:
|
elif self.persist != PersistMethod.none:
|
||||||
revdata_type = RevDataPersistence
|
revdata_type = RevDataPersistence
|
||||||
else:
|
else:
|
||||||
revdata_type = RevDataBase
|
revdata_type = Revision
|
||||||
|
|
||||||
# if there are regex fields, we need to add them to the revdata type.
|
# if there are regex fields, we need to add them to the revdata type.
|
||||||
regex_fields = [(field.name, list[str], dc.field(default=None)) for field in self.regex_schemas]
|
regex_fields = [(field.name, list[str], dc.field(default=None)) for field in self.regex_schemas]
|
||||||
|
Loading…
Reference in New Issue
Block a user