From 22d14dc5f2274522e22d9d688201f47b8e4b5a36 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Wed, 28 May 2025 21:54:31 -0700 Subject: [PATCH 1/8] Remove dependency on pytest. --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d68de97..59e1043 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,9 +14,9 @@ dependencies = [ "yamlconf", ] -[tool.uv.sources] -yamlconf = { git = "https://github.com/groceryheist/yamlconf" } -mwxml = { git = "https://github.com/groceryheist/python-mwxml" } +# [tool.uv.sources] +# yamlconf = { git = "https://github.com/groceryheist/yamlconf" } +# mwxml = { git = "https://github.com/groceryheist/python-mwxml" } [dependency-groups] dev = [ From bd22d26291cdd666b382812e836f0c22f394528e Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Thu, 29 May 2025 18:02:14 -0700 Subject: [PATCH 2/8] update deps and add edit_summary to wikiq output. --- pyproject.toml | 9 ++++----- wikiq | 8 ++++++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 59e1043..368239a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,12 +14,11 @@ dependencies = [ "yamlconf", ] -# [tool.uv.sources] -# yamlconf = { git = "https://github.com/groceryheist/yamlconf" } -# mwxml = { git = "https://github.com/groceryheist/python-mwxml" } +[tool.uv.sources] +yamlconf = { git = "https://github.com/groceryheist/yamlconf" } +mwxml = { git = "https://github.com/groceryheist/python-mwxml" } [dependency-groups] dev = [ - "pandas>=2.1.0", - "pytest>=8.3.5", + "pandas>=2.1.0" ] diff --git a/wikiq b/wikiq index 290225b..5ce219d 100755 --- a/wikiq +++ b/wikiq @@ -242,6 +242,7 @@ class RevDataBase: title: str namespace: int deleted: bool + edit_summary: str text_chars: int = None revert: bool = None reverteds: list[int] = None @@ -271,7 +272,8 @@ class RevDataBase: pa.field("sha1",pa.string()), pa.field("minor",pa.bool_()), pa.field("editor",pa.string()), - pa.field("anon",pa.bool_()) + pa.field("anon",pa.bool_()), + pa.field("edit_summary",pa.bool_()) ] # pyarrow is a columnar format, so most of the work happens in the flush_parquet_buffer function @@ -535,7 +537,8 @@ class WikiqParser: editorid="" if rev.deleted.user == True or rev.user.id is None else rev.user.id, title=page.title, deleted=rev.deleted.text, - namespace=namespace + namespace=namespace, + edit_summary=rev.comment ) rev_data = self.matchmake_revision(rev, rev_data) @@ -717,6 +720,7 @@ def match_archive_suffix(input_filename): def open_input_file(input_filename, fandom_2020=False): cmd = match_archive_suffix(input_filename) + if fandom_2020: cmd.append("*.xml") try: From 13ee1607081ccc001b1aef556c00d89b00ebe53b Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Thu, 29 May 2025 18:04:41 -0700 Subject: [PATCH 3/8] bugfix. --- wikiq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wikiq b/wikiq index 5ce219d..acdded6 100755 --- a/wikiq +++ b/wikiq @@ -273,7 +273,7 @@ class RevDataBase: pa.field("minor",pa.bool_()), pa.field("editor",pa.string()), pa.field("anon",pa.bool_()), - pa.field("edit_summary",pa.bool_()) + pa.field("edit_summary",pa.string()) ] # pyarrow is a columnar format, so most of the work happens in the flush_parquet_buffer function From a9f76a0f627cef9d92dba903e9676be7bc6e23d7 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Thu, 29 May 2025 18:10:59 -0700 Subject: [PATCH 4/8] change order of fields. --- wikiq | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wikiq b/wikiq index acdded6..311a26f 100755 --- a/wikiq +++ b/wikiq @@ -266,14 +266,14 @@ class RevDataBase: pa.field("title",pa.string()), pa.field("namespace",pa.int32()), pa.field("deleted",pa.bool_()), + pa.field("edit_summary",pa.string()), pa.field("text_chars",pa.int32()), pa.field("revert",pa.bool_(), nullable=True), pa.field("reverteds",pa.list_(pa.int64()), nullable=True), pa.field("sha1",pa.string()), pa.field("minor",pa.bool_()), pa.field("editor",pa.string()), - pa.field("anon",pa.bool_()), - pa.field("edit_summary",pa.string()) + pa.field("anon",pa.bool_()) ] # pyarrow is a columnar format, so most of the work happens in the flush_parquet_buffer function From 606a3994502de11de1967ffad934b1268e7e175f Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Thu, 29 May 2025 18:14:58 -0700 Subject: [PATCH 5/8] handle empty comments which are 'False' somehow. --- wikiq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wikiq b/wikiq index 311a26f..4deb674 100755 --- a/wikiq +++ b/wikiq @@ -538,7 +538,7 @@ class WikiqParser: title=page.title, deleted=rev.deleted.text, namespace=namespace, - edit_summary=rev.comment + edit_summary="" if rev.comment is False else rev.comment ) rev_data = self.matchmake_revision(rev, rev_data) From ffbd180001724e43779e7506f3c28255ef0dca31 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Thu, 29 May 2025 18:24:33 -0700 Subject: [PATCH 6/8] make editorid null not '' in parquet. --- wikiq | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/wikiq b/wikiq index 4deb674..2631852 100755 --- a/wikiq +++ b/wikiq @@ -238,11 +238,11 @@ class RevDataBase: revid: int date_time: datetime articleid: int - editorid: int title: str namespace: int deleted: bool edit_summary: str + editorid: int = None text_chars: int = None revert: bool = None reverteds: list[int] = None @@ -534,11 +534,11 @@ class WikiqParser: rev_data = self.revdata_type(revid=rev.id, date_time=datetime.fromtimestamp(rev.timestamp.unix(), tz=timezone.utc), articleid=page.id, - editorid="" if rev.deleted.user == True or rev.user.id is None else rev.user.id, + editorid=None if rev.deleted.user == True or rev.user.id is None else rev.user.id, title=page.title, deleted=rev.deleted.text, namespace=namespace, - edit_summary="" if rev.comment is False else rev.comment + edit_summary=rev.comment ) rev_data = self.matchmake_revision(rev, rev_data) @@ -672,7 +672,7 @@ class WikiqParser: cols = [] first = rg[0] for col in first: - cols.append([col]) + cols.appnd([col]) for row in rg[1:]: for j in range(len(cols)): From a13d7f1deb5db7f0a59dfa7236e135686ef16139 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Thu, 29 May 2025 18:25:08 -0700 Subject: [PATCH 7/8] typo fix. --- wikiq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wikiq b/wikiq index 2631852..b930b4a 100755 --- a/wikiq +++ b/wikiq @@ -672,7 +672,7 @@ class WikiqParser: cols = [] first = rg[0] for col in first: - cols.appnd([col]) + cols.append([col]) for row in rg[1:]: for j in range(len(cols)): From 260e2b177ce72e33f69f5f78c920de887fe0dd73 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Thu, 29 May 2025 18:32:16 -0700 Subject: [PATCH 8/8] fix order of fields. --- wikiq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wikiq b/wikiq index b930b4a..adc5c1d 100755 --- a/wikiq +++ b/wikiq @@ -262,11 +262,11 @@ class RevDataBase: pa.field("revid", pa.int64()), pa.field("date_time", pa.timestamp('ms')), pa.field("articleid",pa.int64()), - pa.field("editorid",pa.int64(), nullable=True), pa.field("title",pa.string()), pa.field("namespace",pa.int32()), pa.field("deleted",pa.bool_()), pa.field("edit_summary",pa.string()), + pa.field("editorid",pa.int64(), nullable=True), pa.field("text_chars",pa.int32()), pa.field("revert",pa.bool_(), nullable=True), pa.field("reverteds",pa.list_(pa.int64()), nullable=True),