Begin adding test for parquet export

Changed logic for handling anonymous edits so that wikiq handles
the type for editor ids consistently. Parquet can mix int64 and
None, but not int64 and strings - previously the code used the empty
string to denote anonymous editors.

Tests failing. Don't merge yet.

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
Will Beason
2025-05-28 13:17:30 -05:00
parent df0ad1de63
commit 3f94144b1b
3 changed files with 66 additions and 20 deletions

View File

@@ -42,16 +42,21 @@ class WikiqTester:
):
self.input_file = os.path.join(TEST_DIR, "dumps", "{0}.xml.{1}".format(wiki, in_compression))
if out_format == "tsv":
self.output_dir = TEST_OUTPUT_DIR
else:
self.output_dir = "{0}.parquet".format(TEST_OUTPUT_DIR)
if suffix is None:
self.wikiq_out_name = "{0}.{1}".format(wiki, out_format)
else:
self.wikiq_out_name = "{0}_{1}.{2}".format(wiki, suffix, out_format)
self.call_output = os.path.join(TEST_OUTPUT_DIR, "{0}.{1}".format(wiki, out_format))
self.call_output = os.path.join(self.output_dir, "{0}.{1}".format(wiki, out_format))
# If case_name is unset, there are no relevant baseline or test files.
if case_name is not None:
self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_out_name))
self.test_file = os.path.join(TEST_OUTPUT_DIR, "{0}_{1}".format(case_name, self.wikiq_out_name))
self.test_file = os.path.join(self.output_dir, "{0}_{1}".format(case_name, self.wikiq_out_name))
if os.path.exists(self.test_file):
os.remove(self.test_file)
@@ -63,7 +68,7 @@ class WikiqTester:
:return: The output of the wikiq call.
"""
if out:
call = ' '.join([WIKIQ, self.input_file, "-o", TEST_OUTPUT_DIR, *args])
call = ' '.join([WIKIQ, self.input_file, "-o", self.output_dir, *args])
else:
call = ' '.join([WIKIQ, self.input_file, *args])
@@ -314,6 +319,20 @@ class WikiqTestCase(unittest.TestCase):
baseline = pd.read_table(tester.baseline_file)
assert_frame_equal(test, baseline, check_like=True)
def test_parquet(self):
    """Run wikiq with Parquet output and compare it against the baseline.

    The wikiq output is copied to the case-specific test file, then both
    the test file and the baseline file are loaded as data frames and
    compared. Column order is ignored (``check_like=True``).
    """
    tester = WikiqTester(IKWIKI, "parquet", out_format="parquet")

    try:
        tester.call_wikiq()
    except subprocess.CalledProcessError as exc:
        # Surface wikiq's stderr as the test failure message.
        self.fail(exc.stderr.decode("utf8"))

    copyfile(tester.call_output, tester.test_file)

    # Both files carry the ".parquet" extension and hold binary Parquet
    # data, so they must be loaded with read_parquet; read_table would try
    # to parse them as delimited text.
    test = pd.read_parquet(tester.test_file)
    baseline = pd.read_parquet(tester.baseline_file)
    assert_frame_equal(test, baseline, check_like=True)
# Run the unittest test runner when this file is executed as a script.
if __name__ == '__main__':
unittest.main()