Add noargs test for ikwiki

This way we can ensure that the parquet code produces equivalent output.

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
Will Beason 2025-05-28 15:04:10 -05:00
parent d413443740
commit 52757a8239
3 changed files with 27806 additions and 7 deletions

View File

@ -12,6 +12,7 @@ from typing import Final
TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__)) TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
WIKIQ: Final[str] = os.path.join(os.path.dirname(TEST_DIR), "wikiq") WIKIQ: Final[str] = os.path.join(os.path.dirname(TEST_DIR), "wikiq")
TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output") TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output")
PARQUET_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output.parquet")
BASELINE_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output") BASELINE_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output")
IKWIKI: Final[str] = "ikwiki-20180301-pages-meta-history" IKWIKI: Final[str] = "ikwiki-20180301-pages-meta-history"
@ -25,6 +26,8 @@ def setup():
# Perform directory check and reset here as this is a one-time setup step as opposed to per-test setup. # Perform directory check and reset here as this is a one-time setup step as opposed to per-test setup.
if not os.path.exists(TEST_OUTPUT_DIR): if not os.path.exists(TEST_OUTPUT_DIR):
os.mkdir(TEST_OUTPUT_DIR) os.mkdir(TEST_OUTPUT_DIR)
if not os.path.exists(PARQUET_OUTPUT_DIR):
os.mkdir(PARQUET_OUTPUT_DIR)
# Always run setup, even if this is executed via "python -m unittest" rather # Always run setup, even if this is executed via "python -m unittest" rather
@ -86,6 +89,21 @@ class WikiqTester:
# malformed xmls DONE # malformed xmls DONE
class WikiqTestCase(unittest.TestCase): class WikiqTestCase(unittest.TestCase):
# Regression test: build a WikiqTester for the IKWIKI dump with the "noargs"
# label, run it, and compare the resulting table against the stored baseline.
# NOTE(review): call_wikiq() presumably invokes wikiq with no extra arguments;
# confirm against WikiqTester.
def test_WP_noargs(self):
tester = WikiqTester(IKWIKI, "noargs")
try:
tester.call_wikiq()
except subprocess.CalledProcessError as exc:
# Fail with the subprocess's stderr as the message
# (assumes stderr was captured as bytes -- TODO confirm in call_wikiq).
self.fail(exc.stderr.decode("utf8"))
# Copy the call's output to the test file path before loading it.
copyfile(tester.call_output, tester.test_file)
test = pd.read_table(tester.test_file)
baseline = pd.read_table(tester.baseline_file)
# check_like=True makes the comparison ignore the order of index and columns.
assert_frame_equal(test, baseline, check_like=True)
def test_WP_url_encode(self): def test_WP_url_encode(self):
tester = WikiqTester(IKWIKI, "url-encode") tester = WikiqTester(IKWIKI, "url-encode")
@ -330,7 +348,7 @@ class WikiqTestCase(unittest.TestCase):
copyfile(tester.call_output, tester.test_file) copyfile(tester.call_output, tester.test_file)
# as a test let's make sure that we get equal data frames # as a test let's make sure that we get equal data frames
test = pd.read_table(tester.test_file) test = pd.read_parquet(tester.test_file)
baseline = pd.read_table(tester.baseline_file) baseline = pd.read_table(tester.baseline_file)
assert_frame_equal(test, baseline, check_like=True) assert_frame_equal(test, baseline, check_like=True)

File diff suppressed because it is too large Load Diff

5
wikiq
View File

@ -14,7 +14,7 @@ from io import TextIOWrapper
from subprocess import Popen, PIPE from subprocess import Popen, PIPE
from collections import deque from collections import deque
from hashlib import sha1 from hashlib import sha1
from typing import Any, IO, TextIO, Iterable from typing import Any, IO, TextIO
from mwxml import Dump from mwxml import Dump
@ -544,11 +544,12 @@ class WikiqParser:
# Iterate through a page's revisions # Iterate through a page's revisions
for rev in page: for rev in page:
editorid = None if rev.deleted.user or rev.user.id is None else rev.user.id
# create a new data object instead of a dictionary. # create a new data object instead of a dictionary.
rev_data = self.revdata_type(revid=rev.id, rev_data = self.revdata_type(revid=rev.id,
date_time=datetime.fromtimestamp(rev.timestamp.unix(), tz=timezone.utc), date_time=datetime.fromtimestamp(rev.timestamp.unix(), tz=timezone.utc),
articleid=page.id, articleid=page.id,
editorid=None if rev.deleted.user == True or rev.user.id is None else rev.user.id, editorid=editorid,
title=page.title, title=page.title,
deleted=rev.deleted.text, deleted=rev.deleted.text,
namespace=namespace namespace=namespace