Add noargs test for ikwiki

This lets us verify that the Parquet code path produces output equivalent to the existing baseline.

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
Will Beason 2025-05-28 15:04:10 -05:00
parent d413443740
commit 52757a8239
3 changed files with 27806 additions and 7 deletions

View File

@ -12,6 +12,7 @@ from typing import Final
# Directory containing this test module (symlinks resolved via realpath).
TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
# Path to the "wikiq" entry in the parent directory of TEST_DIR.
WIKIQ: Final[str] = os.path.join(os.path.dirname(TEST_DIR), "wikiq")
# Output directories under TEST_DIR; setup() creates them when missing.
TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output")
PARQUET_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output.parquet")
# Directory under TEST_DIR named "baseline_output".
# NOTE(review): presumably holds the expected outputs the tests compare against - confirm.
BASELINE_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output")
# NOTE(review): presumably the base name of the ikwiki dump used as test input - confirm.
IKWIKI: Final[str] = "ikwiki-20180301-pages-meta-history"
@ -25,6 +26,8 @@ def setup():
# Perform directory check and reset here as this is a one-time setup step as opposed to per-test setup.
if not os.path.exists(TEST_OUTPUT_DIR):
os.mkdir(TEST_OUTPUT_DIR)
if not os.path.exists(PARQUET_OUTPUT_DIR):
os.mkdir(PARQUET_OUTPUT_DIR)
# Always run setup, even if this is executed via "python -m unittest" rather
@ -86,6 +89,21 @@ class WikiqTester:
# malformed xmls DONE
class WikiqTestCase(unittest.TestCase):
def test_WP_noargs(self):
    """Run wikiq on the ikwiki input with the "noargs" suffix and compare
    the resulting table against the stored baseline."""
    tester = WikiqTester(IKWIKI, "noargs")

    try:
        tester.call_wikiq()
    except subprocess.CalledProcessError as exc:
        # Report the subprocess's stderr as the test failure message.
        self.fail(exc.stderr.decode("utf8"))

    copyfile(tester.call_output, tester.test_file)

    # check_like=True makes the comparison ignore index/column ordering.
    actual = pd.read_table(tester.test_file)
    expected = pd.read_table(tester.baseline_file)
    assert_frame_equal(actual, expected, check_like=True)
def test_WP_url_encode(self):
tester = WikiqTester(IKWIKI, "url-encode")
@ -330,7 +348,7 @@ class WikiqTestCase(unittest.TestCase):
copyfile(tester.call_output, tester.test_file)
# as a test let's make sure that we get equal data frames
test = pd.read_table(tester.test_file)
test = pd.read_parquet(tester.test_file)
baseline = pd.read_table(tester.baseline_file)
assert_frame_equal(test, baseline, check_like=True)

File diff suppressed because it is too large Load Diff

13
wikiq
View File

@ -14,7 +14,7 @@ from io import TextIOWrapper
from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1
from typing import Any, IO, TextIO, Iterable
from typing import Any, IO, TextIO
from mwxml import Dump
@ -387,7 +387,7 @@ class WikiqParser:
namespaces: list[int] | None = None,
revert_radius: int = 15,
output_parquet: bool = True,
parquet_buffer_size: int=2000):
parquet_buffer_size: int = 2000):
"""
Parameters:
persist : what persistence method to use. Takes a PersistMethod value
@ -427,8 +427,8 @@ class WikiqParser:
# make_dataclass is a function that defines a new dataclass type.
# here we extend the type we have already chosen and add the regular expression types
self.revdata_type: type = dc.make_dataclass('RevData_Parser',
fields=regex_fields,
bases=(revdata_type,))
fields=regex_fields,
bases=(revdata_type,))
# we also need to make sure that we have the right pyarrow schema
self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas
@ -544,11 +544,12 @@ class WikiqParser:
# Iterate through a page's revisions
for rev in page:
# create a new data object instead of a dictionary.
editorid = None if rev.deleted.user or rev.user.id is None else rev.user.id
# create a new data object instead of a dictionary.
rev_data = self.revdata_type(revid=rev.id,
date_time=datetime.fromtimestamp(rev.timestamp.unix(), tz=timezone.utc),
articleid=page.id,
editorid=None if rev.deleted.user == True or rev.user.id is None else rev.user.id,
editorid=editorid,
title=page.title,
deleted=rev.deleted.text,
namespace=namespace