Add noargs test for ikwiki

This way we can ensure that the parquet code produces equivalent output. Signed-off-by: Will Beason <willbeason@gmail.com>
2025-05-28 15:04:10 -05:00
parent d413443740
commit 52757a8239
3 changed files with 27806 additions and 7 deletions
--- a/test/Wikiq_Unit_Test.py
+++ b/test/Wikiq_Unit_Test.py
@@ -12,6 +12,7 @@ from typing import Final
 TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
 WIKIQ: Final[str] = os.path.join(os.path.dirname(TEST_DIR), "wikiq")
 TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output")
 PARQUET_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output.parquet")
 BASELINE_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output")
 IKWIKI: Final[str] = "ikwiki-20180301-pages-meta-history"
@@ -25,6 +26,8 @@ def setup():
    # Perform directory check and reset here as this is a one-time setup step as opposed to per-test setup.
    if not os.path.exists(TEST_OUTPUT_DIR):
        os.mkdir(TEST_OUTPUT_DIR)
    if not os.path.exists(PARQUET_OUTPUT_DIR):
        os.mkdir(PARQUET_OUTPUT_DIR)
 # Always run setup, even if this is executed via "python -m unittest" rather
@@ -86,6 +89,21 @@ class WikiqTester:
 # malformed xmls DONE
 class WikiqTestCase(unittest.TestCase):
    def test_WP_noargs(self):
        """Check wikiq's "noargs" output for the ikwiki dump against the baseline.

        Invokes ``call_wikiq()`` with no arguments, copies the produced output
        to the tester's test file, and asserts that it loads into the same
        data frame as the stored baseline (row/column order is ignored via
        ``check_like=True``).
        """
        harness = WikiqTester(IKWIKI, "noargs")

        try:
            harness.call_wikiq()
        except subprocess.CalledProcessError as err:
            # Report the subprocess's captured stderr as the failure message.
            stderr_text = err.stderr.decode("utf8")
            self.fail(stderr_text)

        # Keep a copy of the produced output at the tester's test-file path.
        copyfile(harness.call_output, harness.test_file)

        actual = pd.read_table(harness.test_file)
        expected = pd.read_table(harness.baseline_file)
        assert_frame_equal(actual, expected, check_like=True)
    def test_WP_url_encode(self):
        tester = WikiqTester(IKWIKI, "url-encode")
@@ -330,7 +348,7 @@ class WikiqTestCase(unittest.TestCase):
        copyfile(tester.call_output, tester.test_file)
        # as a test let's make sure that we get equal data frames
-        test = pd.read_table(tester.test_file)
+        test = pd.read_parquet(tester.test_file)
        baseline = pd.read_table(tester.baseline_file)
        assert_frame_equal(test, baseline, check_like=True)
--- a/test/baseline_output/noargs_ikwiki-20180301-pages-meta-history.tsv
+++ b/test/baseline_output/noargs_ikwiki-20180301-pages-meta-history.tsv
--- a/5
+++ b/5
@@ -14,7 +14,7 @@ from io import TextIOWrapper
 from subprocess import Popen, PIPE
 from collections import deque
 from hashlib import sha1
-from typing import Any, IO, TextIO, Iterable
+from typing import Any, IO, TextIO
 from mwxml import Dump
@@ -544,11 +544,12 @@ class WikiqParser:
            # Iterate through a page's revisions
            for rev in page:
                editorid = None if rev.deleted.user or rev.user.id is None else rev.user.id
                # create a new data object instead of a dictionary.
                rev_data = self.revdata_type(revid=rev.id,
                                             date_time=datetime.fromtimestamp(rev.timestamp.unix(), tz=timezone.utc),
                                             articleid=page.id,
-                                             editorid=None if rev.deleted.user == True or rev.user.id is None else rev.user.id,
+                                             editorid=editorid,
                                             title=page.title,
                                             deleted=rev.deleted.text,
                                             namespace=namespace