Merge branch 'parquet_support' into test-parquet
This commit is contained in:
commit
9009bb6fa4
3
.gitignore
vendored
3
.gitignore
vendored
@ -4,6 +4,9 @@
|
|||||||
*.xml.xz
|
*.xml.xz
|
||||||
*.swp
|
*.swp
|
||||||
|
|
||||||
|
# Lockfiles
|
||||||
|
uv.lock
|
||||||
|
|
||||||
# JetBrains
|
# JetBrains
|
||||||
/.idea
|
/.idea
|
||||||
|
|
||||||
|
26
pyproject.toml
Normal file
26
pyproject.toml
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
[project]
|
||||||
|
name = "mediawiki-dump-tools"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "Add your description here"
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">=3.11"
|
||||||
|
dependencies = [
|
||||||
|
"deltas>=0.7.0",
|
||||||
|
"mediawiki-utilities>=0.4.18",
|
||||||
|
"mwpersistence>=0.2.4",
|
||||||
|
"mwreverts>=0.1.5",
|
||||||
|
"mwtypes>=0.4.0",
|
||||||
|
"mwxml>=0.3.6",
|
||||||
|
"pyarrow>=20.0.0",
|
||||||
|
"yamlconf",
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.uv.sources]
|
||||||
|
yamlconf = { git = "https://github.com/groceryheist/yamlconf" }
|
||||||
|
mwxml = { git = "https://github.com/groceryheist/python-mwxml" }
|
||||||
|
|
||||||
|
[dependency-groups]
|
||||||
|
dev = [
|
||||||
|
"pandas>=2.1.0",
|
||||||
|
"pytest>=8.3.5",
|
||||||
|
]
|
@ -1,40 +0,0 @@
|
|||||||
attrs==25.3.0
|
|
||||||
certifi==2025.4.26
|
|
||||||
charset-normalizer==3.4.2
|
|
||||||
Cython==0.29.37
|
|
||||||
deltas==0.7.0
|
|
||||||
docopt==0.6.2
|
|
||||||
gnureadline==8.1.2
|
|
||||||
idna==3.10
|
|
||||||
jsonable==0.3.1
|
|
||||||
jsonschema==4.23.0
|
|
||||||
jsonschema-specifications==2025.4.1
|
|
||||||
mediawiki-utilities==0.4.18
|
|
||||||
mwcli==0.0.3
|
|
||||||
mwdiffs==0.0.2
|
|
||||||
mwpersistence==0.2.4
|
|
||||||
mwreverts==0.1.5
|
|
||||||
mwtypes==0.4.0
|
|
||||||
mwxml==0.3.6
|
|
||||||
numpy==2.2.6
|
|
||||||
pandas==2.2.3
|
|
||||||
para==0.0.8
|
|
||||||
parsimonious==0.10.0
|
|
||||||
pyarrow==20.0.0
|
|
||||||
pydub==0.25.1
|
|
||||||
PyMySQL==1.1.1
|
|
||||||
python-dateutil==2.9.0.post0
|
|
||||||
pytz==2025.2
|
|
||||||
PyYAML==5.4.1
|
|
||||||
referencing==0.36.2
|
|
||||||
regex==2024.11.6
|
|
||||||
requests==2.32.3
|
|
||||||
rpds-py==0.25.1
|
|
||||||
setuptools==80.8.0
|
|
||||||
six==1.17.0
|
|
||||||
stopit==1.1.2
|
|
||||||
typing_extensions==4.13.2
|
|
||||||
tzdata==2025.2
|
|
||||||
urllib3==2.4.0
|
|
||||||
wheel==0.45.1
|
|
||||||
yamlconf==0.2.6
|
|
@ -1,4 +1,3 @@
|
|||||||
import math
|
|
||||||
import unittest
|
import unittest
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
@ -7,12 +6,10 @@ from shutil import copyfile
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pandas import DataFrame
|
from pandas import DataFrame
|
||||||
from pandas._testing import assert_series_equal
|
from pandas.testing import assert_frame_equal, assert_series_equal
|
||||||
from pandas.testing import assert_frame_equal
|
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
import tracemalloc
|
import tracemalloc
|
||||||
from typing import Final
|
from typing import Final
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
# Make references to files and wikiq relative to this file, not to the current working directory.
|
# Make references to files and wikiq relative to this file, not to the current working directory.
|
||||||
TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
|
TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
|
||||||
@ -181,7 +178,7 @@ class WikiqTestCase(unittest.TestCase):
|
|||||||
tester = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z")
|
tester = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tester.call_wikiq("--collapse-user")
|
tester.call_wikiq("--collapse-user", "--fandom-2020")
|
||||||
except subprocess.CalledProcessError as exc:
|
except subprocess.CalledProcessError as exc:
|
||||||
self.fail(exc.stderr.decode("utf8"))
|
self.fail(exc.stderr.decode("utf8"))
|
||||||
|
|
||||||
@ -195,7 +192,7 @@ class WikiqTestCase(unittest.TestCase):
|
|||||||
tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z")
|
tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tester.call_wikiq("--persistence segment")
|
tester.call_wikiq("--persistence segment", "--fandom-2020")
|
||||||
except subprocess.CalledProcessError as exc:
|
except subprocess.CalledProcessError as exc:
|
||||||
self.fail(exc.stderr.decode("utf8"))
|
self.fail(exc.stderr.decode("utf8"))
|
||||||
|
|
||||||
@ -209,7 +206,7 @@ class WikiqTestCase(unittest.TestCase):
|
|||||||
tester = WikiqTester(SAILORMOON, "persistence_legacy", in_compression="7z")
|
tester = WikiqTester(SAILORMOON, "persistence_legacy", in_compression="7z")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tester.call_wikiq("--persistence legacy")
|
tester.call_wikiq("--persistence legacy", "--fandom-2020")
|
||||||
except subprocess.CalledProcessError as exc:
|
except subprocess.CalledProcessError as exc:
|
||||||
self.fail(exc.stderr.decode("utf8"))
|
self.fail(exc.stderr.decode("utf8"))
|
||||||
|
|
||||||
@ -223,7 +220,7 @@ class WikiqTestCase(unittest.TestCase):
|
|||||||
tester = WikiqTester(SAILORMOON, "persistence", in_compression="7z")
|
tester = WikiqTester(SAILORMOON, "persistence", in_compression="7z")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tester.call_wikiq("--persistence")
|
tester.call_wikiq("--persistence", "--fandom-2020")
|
||||||
except subprocess.CalledProcessError as exc:
|
except subprocess.CalledProcessError as exc:
|
||||||
self.fail(exc.stderr.decode("utf8"))
|
self.fail(exc.stderr.decode("utf8"))
|
||||||
|
|
||||||
@ -239,7 +236,7 @@ class WikiqTestCase(unittest.TestCase):
|
|||||||
tester = WikiqTester(SAILORMOON, "url-encode", in_compression="7z")
|
tester = WikiqTester(SAILORMOON, "url-encode", in_compression="7z")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tester.call_wikiq("--url-encode")
|
tester.call_wikiq("--url-encode", "--fandom-2020")
|
||||||
except subprocess.CalledProcessError as exc:
|
except subprocess.CalledProcessError as exc:
|
||||||
self.fail(exc.stderr.decode("utf8"))
|
self.fail(exc.stderr.decode("utf8"))
|
||||||
|
|
||||||
@ -266,7 +263,7 @@ class WikiqTestCase(unittest.TestCase):
|
|||||||
tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z")
|
tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
outs = tester.call_wikiq("--stdout", out=False).decode("utf8")
|
outs = tester.call_wikiq( "--stdout", "--fandom-2020", out=False).decode("utf8")
|
||||||
except subprocess.CalledProcessError as exc:
|
except subprocess.CalledProcessError as exc:
|
||||||
self.fail(exc.stderr.decode("utf8"))
|
self.fail(exc.stderr.decode("utf8"))
|
||||||
|
|
||||||
|
24
wikiq
24
wikiq
@ -266,13 +266,13 @@ class RevDataBase:
|
|||||||
pa.field("revid", pa.int64()),
|
pa.field("revid", pa.int64()),
|
||||||
pa.field("date_time", pa.timestamp('ms')),
|
pa.field("date_time", pa.timestamp('ms')),
|
||||||
pa.field("articleid", pa.int64()),
|
pa.field("articleid", pa.int64()),
|
||||||
pa.field("editorid", pa.int64()),
|
pa.field("editorid", pa.int64(), nullable=True),
|
||||||
pa.field("title", pa.string()),
|
pa.field("title", pa.string()),
|
||||||
pa.field("namespace", pa.int32()),
|
pa.field("namespace", pa.int32()),
|
||||||
pa.field("deleted", pa.bool_()),
|
pa.field("deleted", pa.bool_()),
|
||||||
pa.field("text_chars", pa.int32()),
|
pa.field("text_chars", pa.int32()),
|
||||||
pa.field("revert", pa.bool_()),
|
pa.field("revert", pa.bool_(), nullable=True),
|
||||||
pa.field("reverteds", pa.list_(pa.int64())),
|
pa.field("reverteds", pa.list_(pa.int64()), nullable=True),
|
||||||
pa.field("sha1", pa.string()),
|
pa.field("sha1", pa.string()),
|
||||||
pa.field("minor", pa.bool_()),
|
pa.field("minor", pa.bool_()),
|
||||||
pa.field("editor", pa.string()),
|
pa.field("editor", pa.string()),
|
||||||
@ -280,7 +280,7 @@ class RevDataBase:
|
|||||||
]
|
]
|
||||||
|
|
||||||
# pyarrow is a columnar format, so most of the work happens in the flush_parquet_buffer function
|
# pyarrow is a columnar format, so most of the work happens in the flush_parquet_buffer function
|
||||||
def to_pyarrow(self) -> tuple[Any, ...]:
|
def to_pyarrow(self):
|
||||||
return dc.astuple(self)
|
return dc.astuple(self)
|
||||||
|
|
||||||
# logic to convert each field into the wikiq tsv format goes here.
|
# logic to convert each field into the wikiq tsv format goes here.
|
||||||
@ -732,16 +732,22 @@ class WikiqParser:
|
|||||||
print(line, file=self.output_file)
|
print(line, file=self.output_file)
|
||||||
|
|
||||||
|
|
||||||
def open_input_file(input_filename) -> TextIOWrapper | IO[Any] | IO[bytes]:
|
def match_archive_suffix(input_filename):
|
||||||
if re.match(r'.*\.7z$', input_filename):
|
if re.match(r'.*\.7z$', input_filename):
|
||||||
cmd = ["7za", "x", "-so", input_filename, "*.xml"]
|
cmd = ["7za", "x", "-so", input_filename]
|
||||||
elif re.match(r'.*\.gz$', input_filename):
|
elif re.match(r'.*\.gz$', input_filename):
|
||||||
cmd = ["zcat", input_filename]
|
cmd = ["zcat", input_filename]
|
||||||
elif re.match(r'.*\.bz2$', input_filename):
|
elif re.match(r'.*\.bz2$', input_filename):
|
||||||
cmd = ["bzcat", "-dk", input_filename]
|
cmd = ["bzcat", "-dk", input_filename]
|
||||||
else:
|
else:
|
||||||
raise ValueError("Unrecognized file type: %s" % input_filename)
|
raise ValueError("Unrecognized file type: %s" % input_filename)
|
||||||
|
return cmd
|
||||||
|
|
||||||
|
|
||||||
|
def open_input_file(input_filename, fandom_2020=False):
|
||||||
|
cmd = match_archive_suffix(input_filename)
|
||||||
|
if fandom_2020:
|
||||||
|
cmd.append("*.xml")
|
||||||
try:
|
try:
|
||||||
return Popen(cmd, stdout=PIPE).stdout
|
return Popen(cmd, stdout=PIPE).stdout
|
||||||
except NameError:
|
except NameError:
|
||||||
@ -814,6 +820,10 @@ def main():
|
|||||||
action='append',
|
action='append',
|
||||||
help="The label for the outputted column based on matching the regex in comments.")
|
help="The label for the outputted column based on matching the regex in comments.")
|
||||||
|
|
||||||
|
parser.add_argument('--fandom-2020', dest="fandom_2020",
|
||||||
|
action='store_true',
|
||||||
|
help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.")
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# set persistence method
|
# set persistence method
|
||||||
@ -835,7 +845,7 @@ def main():
|
|||||||
if len(args.dumpfiles) > 0:
|
if len(args.dumpfiles) > 0:
|
||||||
output_parquet = False
|
output_parquet = False
|
||||||
for filename in args.dumpfiles:
|
for filename in args.dumpfiles:
|
||||||
input_file = open_input_file(filename)
|
input_file = open_input_file(filename, args.fandom_2020)
|
||||||
|
|
||||||
# open directory for output
|
# open directory for output
|
||||||
if args.output_dir:
|
if args.output_dir:
|
||||||
|
Loading…
Reference in New Issue
Block a user