3 Commits

Author SHA1 Message Date
Nathan TeBlunthuis
15e9234903 adding pyproject.toml 2025-05-28 20:59:55 -07:00
Nathan TeBlunthuis
8c7d46472f Merge branch 'parquet_support' of code:mediawiki_dump_tools into parquet_support 2025-05-28 20:54:52 -07:00
Nathan TeBlunthuis
3c7fb088d6 fix schema bugs. 2025-05-28 20:54:42 -07:00
2 changed files with 28 additions and 4 deletions

24
pyproject.toml Normal file
View File

@@ -0,0 +1,24 @@
[project]
name = "mediawiki-dump-tools"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"deltas>=0.7.0",
"mw>=0.4.0",
"mwpersistence>=0.2.4",
"mwreverts>=0.1.5",
"mwxml>=0.3.6",
"pyarrow>=20.0.0",
"yamlconf",
]
[tool.uv.sources]
yamlconf = { git = "https://github.com/groceryheist/yamlconf" }
[dependency-groups]
dev = [
"pandas>=2.1.0",
"pytest>=8.3.5",
]

8
wikiq
View File

@@ -250,13 +250,13 @@ class RevDataBase():
 pa.field("revid", pa.int64()),
 pa.field("date_time", pa.timestamp('ms')),
 pa.field("articleid",pa.int64()),
-pa.field("editorid",pa.int64()),
+pa.field("editorid",pa.int64(), nullable=True),
 pa.field("title",pa.string()),
 pa.field("namespace",pa.int32()),
 pa.field("deleted",pa.bool_()),
-pa.field("test_chars",pa.int32()),
-pa.field("revert",pa.bool_()),
-pa.field("reverteds",pa.list_(pa.int64())),
+pa.field("text_chars",pa.int32()),
+pa.field("revert",pa.bool_(), nullable=True),
+pa.field("reverteds",pa.list_(pa.int64()), nullable=True),
 pa.field("sha1",pa.string()),
 pa.field("minor",pa.bool_()),
 pa.field("editor",pa.string()),