update deps and add edit_summary to wikiq output.

This commit is contained in:
Nathan TeBlunthuis 2025-05-29 18:02:14 -07:00
parent 22d14dc5f2
commit bd22d26291
2 changed files with 10 additions and 7 deletions

View File

@ -14,12 +14,11 @@ dependencies = [
"yamlconf", "yamlconf",
] ]
# [tool.uv.sources] [tool.uv.sources]
# yamlconf = { git = "https://github.com/groceryheist/yamlconf" } yamlconf = { git = "https://github.com/groceryheist/yamlconf" }
# mwxml = { git = "https://github.com/groceryheist/python-mwxml" } mwxml = { git = "https://github.com/groceryheist/python-mwxml" }
[dependency-groups] [dependency-groups]
dev = [ dev = [
"pandas>=2.1.0", "pandas>=2.1.0"
"pytest>=8.3.5",
] ]

8
wikiq
View File

@ -242,6 +242,7 @@ class RevDataBase:
title: str title: str
namespace: int namespace: int
deleted: bool deleted: bool
edit_summary: str
text_chars: int = None text_chars: int = None
revert: bool = None revert: bool = None
reverteds: list[int] = None reverteds: list[int] = None
@ -271,7 +272,8 @@ class RevDataBase:
pa.field("sha1",pa.string()), pa.field("sha1",pa.string()),
pa.field("minor",pa.bool_()), pa.field("minor",pa.bool_()),
pa.field("editor",pa.string()), pa.field("editor",pa.string()),
pa.field("anon",pa.bool_()) pa.field("anon",pa.bool_()),
pa.field("edit_summary",pa.bool_())
] ]
# pyarrow is a columnar format, so most of the work happens in the flush_parquet_buffer function # pyarrow is a columnar format, so most of the work happens in the flush_parquet_buffer function
@ -535,7 +537,8 @@ class WikiqParser:
editorid="" if rev.deleted.user == True or rev.user.id is None else rev.user.id, editorid="" if rev.deleted.user == True or rev.user.id is None else rev.user.id,
title=page.title, title=page.title,
deleted=rev.deleted.text, deleted=rev.deleted.text,
namespace=namespace namespace=namespace,
edit_summary=rev.comment
) )
rev_data = self.matchmake_revision(rev, rev_data) rev_data = self.matchmake_revision(rev, rev_data)
@ -717,6 +720,7 @@ def match_archive_suffix(input_filename):
def open_input_file(input_filename, fandom_2020=False): def open_input_file(input_filename, fandom_2020=False):
cmd = match_archive_suffix(input_filename) cmd = match_archive_suffix(input_filename)
if fandom_2020: if fandom_2020:
cmd.append("*.xml") cmd.append("*.xml")
try: try: