Add noargs test for ikwiki
This way we can ensure that the parquet code produces equivalent output. Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
parent
d413443740
commit
52757a8239
@ -12,6 +12,7 @@ from typing import Final
|
|||||||
TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
|
TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
|
||||||
WIKIQ: Final[str] = os.path.join(os.path.dirname(TEST_DIR), "wikiq")
|
WIKIQ: Final[str] = os.path.join(os.path.dirname(TEST_DIR), "wikiq")
|
||||||
TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output")
|
TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output")
|
||||||
|
PARQUET_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output.parquet")
|
||||||
BASELINE_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output")
|
BASELINE_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output")
|
||||||
|
|
||||||
IKWIKI: Final[str] = "ikwiki-20180301-pages-meta-history"
|
IKWIKI: Final[str] = "ikwiki-20180301-pages-meta-history"
|
||||||
@ -25,6 +26,8 @@ def setup():
|
|||||||
# Perform directory check and reset here as this is a one-time setup step as opposed to per-test setup.
|
# Perform directory check and reset here as this is a one-time setup step as opposed to per-test setup.
|
||||||
if not os.path.exists(TEST_OUTPUT_DIR):
|
if not os.path.exists(TEST_OUTPUT_DIR):
|
||||||
os.mkdir(TEST_OUTPUT_DIR)
|
os.mkdir(TEST_OUTPUT_DIR)
|
||||||
|
if not os.path.exists(PARQUET_OUTPUT_DIR):
|
||||||
|
os.mkdir(PARQUET_OUTPUT_DIR)
|
||||||
|
|
||||||
|
|
||||||
# Always run setup, even if this is executed via "python -m unittest" rather
|
# Always run setup, even if this is executed via "python -m unittest" rather
|
||||||
@ -86,6 +89,21 @@ class WikiqTester:
|
|||||||
# malformed xmls DONE
|
# malformed xmls DONE
|
||||||
|
|
||||||
class WikiqTestCase(unittest.TestCase):
|
class WikiqTestCase(unittest.TestCase):
|
||||||
|
def test_WP_noargs(self):
|
||||||
|
tester = WikiqTester(IKWIKI, "noargs")
|
||||||
|
|
||||||
|
try:
|
||||||
|
tester.call_wikiq()
|
||||||
|
except subprocess.CalledProcessError as exc:
|
||||||
|
self.fail(exc.stderr.decode("utf8"))
|
||||||
|
|
||||||
|
copyfile(tester.call_output, tester.test_file)
|
||||||
|
|
||||||
|
test = pd.read_table(tester.test_file)
|
||||||
|
baseline = pd.read_table(tester.baseline_file)
|
||||||
|
assert_frame_equal(test, baseline, check_like=True)
|
||||||
|
|
||||||
|
|
||||||
def test_WP_url_encode(self):
|
def test_WP_url_encode(self):
|
||||||
tester = WikiqTester(IKWIKI, "url-encode")
|
tester = WikiqTester(IKWIKI, "url-encode")
|
||||||
|
|
||||||
@ -330,7 +348,7 @@ class WikiqTestCase(unittest.TestCase):
|
|||||||
copyfile(tester.call_output, tester.test_file)
|
copyfile(tester.call_output, tester.test_file)
|
||||||
|
|
||||||
# as a test let's make sure that we get equal data frames
|
# as a test let's make sure that we get equal data frames
|
||||||
test = pd.read_table(tester.test_file)
|
test = pd.read_parquet(tester.test_file)
|
||||||
baseline = pd.read_table(tester.baseline_file)
|
baseline = pd.read_table(tester.baseline_file)
|
||||||
assert_frame_equal(test, baseline, check_like=True)
|
assert_frame_equal(test, baseline, check_like=True)
|
||||||
|
|
||||||
|
27780
test/baseline_output/noargs_ikwiki-20180301-pages-meta-history.tsv
Normal file
27780
test/baseline_output/noargs_ikwiki-20180301-pages-meta-history.tsv
Normal file
File diff suppressed because it is too large
Load Diff
7
wikiq
7
wikiq
@ -14,7 +14,7 @@ from io import TextIOWrapper
|
|||||||
from subprocess import Popen, PIPE
|
from subprocess import Popen, PIPE
|
||||||
from collections import deque
|
from collections import deque
|
||||||
from hashlib import sha1
|
from hashlib import sha1
|
||||||
from typing import Any, IO, TextIO, Iterable
|
from typing import Any, IO, TextIO
|
||||||
|
|
||||||
from mwxml import Dump
|
from mwxml import Dump
|
||||||
|
|
||||||
@ -387,7 +387,7 @@ class WikiqParser:
|
|||||||
namespaces: list[int] | None = None,
|
namespaces: list[int] | None = None,
|
||||||
revert_radius: int = 15,
|
revert_radius: int = 15,
|
||||||
output_parquet: bool = True,
|
output_parquet: bool = True,
|
||||||
parquet_buffer_size: int=2000):
|
parquet_buffer_size: int = 2000):
|
||||||
"""
|
"""
|
||||||
Parameters:
|
Parameters:
|
||||||
persist : what persistence method to use. Takes a PersistMethod value
|
persist : what persistence method to use. Takes a PersistMethod value
|
||||||
@ -544,11 +544,12 @@ class WikiqParser:
|
|||||||
# Iterate through a page's revisions
|
# Iterate through a page's revisions
|
||||||
for rev in page:
|
for rev in page:
|
||||||
|
|
||||||
|
editorid = None if rev.deleted.user or rev.user.id is None else rev.user.id
|
||||||
# create a new data object instead of a dictionary.
|
# create a new data object instead of a dictionary.
|
||||||
rev_data = self.revdata_type(revid=rev.id,
|
rev_data = self.revdata_type(revid=rev.id,
|
||||||
date_time=datetime.fromtimestamp(rev.timestamp.unix(), tz=timezone.utc),
|
date_time=datetime.fromtimestamp(rev.timestamp.unix(), tz=timezone.utc),
|
||||||
articleid=page.id,
|
articleid=page.id,
|
||||||
editorid=None if rev.deleted.user == True or rev.user.id is None else rev.user.id,
|
editorid=editorid,
|
||||||
title=page.title,
|
title=page.title,
|
||||||
deleted=rev.deleted.text,
|
deleted=rev.deleted.text,
|
||||||
namespace=namespace
|
namespace=namespace
|
||||||
|
Loading…
Reference in New Issue
Block a user