From bd22d26291cdd666b382812e836f0c22f394528e Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Thu, 29 May 2025 18:02:14 -0700 Subject: [PATCH] update deps and add edit_summary to wikiq output. --- pyproject.toml | 9 ++++----- wikiq | 8 ++++++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 59e1043..368239a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,12 +14,11 @@ dependencies = [ "yamlconf", ] -# [tool.uv.sources] -# yamlconf = { git = "https://github.com/groceryheist/yamlconf" } -# mwxml = { git = "https://github.com/groceryheist/python-mwxml" } +[tool.uv.sources] +yamlconf = { git = "https://github.com/groceryheist/yamlconf" } +mwxml = { git = "https://github.com/groceryheist/python-mwxml" } [dependency-groups] dev = [ - "pandas>=2.1.0", - "pytest>=8.3.5", + "pandas>=2.1.0" ] diff --git a/wikiq b/wikiq index 290225b..5ce219d 100755 --- a/wikiq +++ b/wikiq @@ -242,6 +242,7 @@ class RevDataBase: title: str namespace: int deleted: bool + edit_summary: str text_chars: int = None revert: bool = None reverteds: list[int] = None @@ -271,7 +272,8 @@ class RevDataBase: pa.field("sha1",pa.string()), pa.field("minor",pa.bool_()), pa.field("editor",pa.string()), - pa.field("anon",pa.bool_()) + pa.field("anon",pa.bool_()), + pa.field("edit_summary",pa.bool_()) ] # pyarrow is a columnar format, so most of the work happens in the flush_parquet_buffer function @@ -535,7 +537,8 @@ class WikiqParser: editorid="" if rev.deleted.user == True or rev.user.id is None else rev.user.id, title=page.title, deleted=rev.deleted.text, - namespace=namespace + namespace=namespace, + edit_summary=rev.comment ) rev_data = self.matchmake_revision(rev, rev_data) @@ -717,6 +720,7 @@ def match_archive_suffix(input_filename): def open_input_file(input_filename, fandom_2020=False): cmd = match_archive_suffix(input_filename) + if fandom_2020: cmd.append("*.xml") try: