Add noargs test for ikwiki

This way we can ensure that the Parquet code path produces equivalent output.

Signed-off-by: Will Beason <willbeason@gmail.com>
2025-05-28 15:04:10 -05:00 · 2025-05-28 15:04:10 -05:00 · 52757a8239
commit 52757a8239
parent d413443740
3 changed files with 27806 additions and 7 deletions
--- a/test/Wikiq_Unit_Test.py
+++ b/test/Wikiq_Unit_Test.py
@@ -12,6 +12,7 @@ from typing import Final
 TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
 WIKIQ: Final[str] = os.path.join(os.path.dirname(TEST_DIR), "wikiq")
 TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output")
+PARQUET_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output.parquet")
 BASELINE_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output")

 IKWIKI: Final[str] = "ikwiki-20180301-pages-meta-history"
@@ -25,6 +26,8 @@ def setup():
    # Perform directory check and reset here as this is a one-time setup step as opposed to per-test setup.
    if not os.path.exists(TEST_OUTPUT_DIR):
        os.mkdir(TEST_OUTPUT_DIR)
+    if not os.path.exists(PARQUET_OUTPUT_DIR):
+        os.mkdir(PARQUET_OUTPUT_DIR)


 # Always run setup, even if this is executed via "python -m unittest" rather
@@ -86,6 +89,21 @@ class WikiqTester:
 # malformed xmls DONE

 class WikiqTestCase(unittest.TestCase):
+    def test_WP_noargs(self):
+        tester = WikiqTester(IKWIKI, "noargs")
+
+        try:
+            tester.call_wikiq()
+        except subprocess.CalledProcessError as exc:
+            self.fail(exc.stderr.decode("utf8"))
+
+        copyfile(tester.call_output, tester.test_file)
+
+        test = pd.read_table(tester.test_file)
+        baseline = pd.read_table(tester.baseline_file)
+        assert_frame_equal(test, baseline, check_like=True)
+
+
    def test_WP_url_encode(self):
        tester = WikiqTester(IKWIKI, "url-encode")

@@ -330,7 +348,7 @@ class WikiqTestCase(unittest.TestCase):
        copyfile(tester.call_output, tester.test_file)

        # as a test let's make sure that we get equal data frames
-        test = pd.read_table(tester.test_file)
+        test = pd.read_parquet(tester.test_file)
        baseline = pd.read_table(tester.baseline_file)
        assert_frame_equal(test, baseline, check_like=True)

--- a/test/baseline_output/noargs_ikwiki-20180301-pages-meta-history.tsv
+++ b/test/baseline_output/noargs_ikwiki-20180301-pages-meta-history.tsv
--- a/13
+++ b/13
@@ -14,7 +14,7 @@ from io import TextIOWrapper
 from subprocess import Popen, PIPE
 from collections import deque
 from hashlib import sha1
-from typing import Any, IO, TextIO, Iterable
+from typing import Any, IO, TextIO

 from mwxml import Dump

@@ -387,7 +387,7 @@ class WikiqParser:
                 namespaces: list[int] | None = None,
                 revert_radius: int = 15,
                 output_parquet: bool = True,
-                 parquet_buffer_size: int=2000):
+                 parquet_buffer_size: int = 2000):
        """ 
        Parameters:
           persist : what persistence method to use. Takes a PersistMethod value
@@ -427,8 +427,8 @@ class WikiqParser:
        # make_dataclass is a function that defines a new dataclass type.
        # here we extend the type we have already chosen and add the regular expression types 
        self.revdata_type: type = dc.make_dataclass('RevData_Parser',
-                                              fields=regex_fields,
-                                              bases=(revdata_type,))
+                                                    fields=regex_fields,
+                                                    bases=(revdata_type,))

        # we also need to make sure that we have the right pyarrow schema
        self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas
@@ -544,11 +544,12 @@ class WikiqParser:
            # Iterate through a page's revisions
            for rev in page:

-                # create a new data object instead of a dictionary. 
+                editorid = None if rev.deleted.user or rev.user.id is None else rev.user.id
+                # create a new data object instead of a dictionary.
                rev_data = self.revdata_type(revid=rev.id,
                                             date_time=datetime.fromtimestamp(rev.timestamp.unix(), tz=timezone.utc),
                                             articleid=page.id,
-                                             editorid=None if rev.deleted.user == True or rev.user.id is None else rev.user.id,
+                                             editorid=editorid,
                                             title=page.title,
                                             deleted=rev.deleted.text,
                                             namespace=namespace