Add noargs test for ikwiki
This way we can ensure that the parquet code produces equivalent output. Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
parent
d413443740
commit
52757a8239
@ -12,6 +12,7 @@ from typing import Final
|
||||
TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
|
||||
WIKIQ: Final[str] = os.path.join(os.path.dirname(TEST_DIR), "wikiq")
|
||||
TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output")
|
||||
PARQUET_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output.parquet")
|
||||
BASELINE_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output")
|
||||
|
||||
IKWIKI: Final[str] = "ikwiki-20180301-pages-meta-history"
|
||||
@ -25,6 +26,8 @@ def setup():
|
||||
# Perform directory check and reset here as this is a one-time setup step as opposed to per-test setup.
|
||||
if not os.path.exists(TEST_OUTPUT_DIR):
|
||||
os.mkdir(TEST_OUTPUT_DIR)
|
||||
if not os.path.exists(PARQUET_OUTPUT_DIR):
|
||||
os.mkdir(PARQUET_OUTPUT_DIR)
|
||||
|
||||
|
||||
# Always run setup, even if this is executed via "python -m unittest" rather
|
||||
@ -86,6 +89,21 @@ class WikiqTester:
|
||||
# malformed xmls DONE
|
||||
|
||||
class WikiqTestCase(unittest.TestCase):
|
||||
def test_WP_noargs(self):
    """Run wikiq on the ikwiki dump with no extra arguments and check the
    result against the stored baseline table.

    The test fails with wikiq's captured stderr if the subprocess exits
    with an error, and otherwise fails if the produced table differs from
    the baseline.
    """
    harness = WikiqTester(IKWIKI, "noargs")

    try:
        harness.call_wikiq()
    except subprocess.CalledProcessError as err:
        # Report the subprocess's own stderr as the failure message.
        self.fail(err.stderr.decode("utf8"))

    # Copy the file wikiq produced to the tester's test-file path.
    copyfile(harness.call_output, harness.test_file)

    actual = pd.read_table(harness.test_file)
    expected = pd.read_table(harness.baseline_file)

    # check_like=True makes the comparison ignore index/column ordering.
    assert_frame_equal(actual, expected, check_like=True)
|
||||
|
||||
|
||||
def test_WP_url_encode(self):
|
||||
tester = WikiqTester(IKWIKI, "url-encode")
|
||||
|
||||
@ -330,7 +348,7 @@ class WikiqTestCase(unittest.TestCase):
|
||||
copyfile(tester.call_output, tester.test_file)
|
||||
|
||||
# as a test let's make sure that we get equal data frames
|
||||
test = pd.read_table(tester.test_file)
|
||||
test = pd.read_parquet(tester.test_file)
|
||||
baseline = pd.read_table(tester.baseline_file)
|
||||
assert_frame_equal(test, baseline, check_like=True)
|
||||
|
||||
|
27780
test/baseline_output/noargs_ikwiki-20180301-pages-meta-history.tsv
Normal file
27780
test/baseline_output/noargs_ikwiki-20180301-pages-meta-history.tsv
Normal file
File diff suppressed because it is too large
Load Diff
13
wikiq
13
wikiq
@ -14,7 +14,7 @@ from io import TextIOWrapper
|
||||
from subprocess import Popen, PIPE
|
||||
from collections import deque
|
||||
from hashlib import sha1
|
||||
from typing import Any, IO, TextIO, Iterable
|
||||
from typing import Any, IO, TextIO
|
||||
|
||||
from mwxml import Dump
|
||||
|
||||
@ -387,7 +387,7 @@ class WikiqParser:
|
||||
namespaces: list[int] | None = None,
|
||||
revert_radius: int = 15,
|
||||
output_parquet: bool = True,
|
||||
parquet_buffer_size: int=2000):
|
||||
parquet_buffer_size: int = 2000):
|
||||
"""
|
||||
Parameters:
|
||||
persist : what persistence method to use. Takes a PersistMethod value
|
||||
@ -427,8 +427,8 @@ class WikiqParser:
|
||||
# make_dataclass is a function that defines a new dataclass type.
|
||||
# here we extend the type we have already chosen and add the regular expression types
|
||||
self.revdata_type: type = dc.make_dataclass('RevData_Parser',
|
||||
fields=regex_fields,
|
||||
bases=(revdata_type,))
|
||||
fields=regex_fields,
|
||||
bases=(revdata_type,))
|
||||
|
||||
# we also need to make sure that we have the right pyarrow schema
|
||||
self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas
|
||||
@ -544,11 +544,12 @@ class WikiqParser:
|
||||
# Iterate through a page's revisions
|
||||
for rev in page:
|
||||
|
||||
# create a new data object instead of a dictionary.
|
||||
editorid = None if rev.deleted.user or rev.user.id is None else rev.user.id
|
||||
# create a new data object instead of a dictionary.
|
||||
rev_data = self.revdata_type(revid=rev.id,
|
||||
date_time=datetime.fromtimestamp(rev.timestamp.unix(), tz=timezone.utc),
|
||||
articleid=page.id,
|
||||
editorid=None if rev.deleted.user == True or rev.user.id is None else rev.user.id,
|
||||
editorid=editorid,
|
||||
title=page.title,
|
||||
deleted=rev.deleted.text,
|
||||
namespace=namespace
|
||||
|
Loading…
Reference in New Issue
Block a user