fix jsonl.d output.
This commit is contained in:
@@ -668,6 +668,88 @@ def test_resume_page_boundary():
|
||||
assert_frame_equal(df_full, df_resumed)
|
||||
|
||||
|
||||
def test_jsonl_dir_output():
    """Test that .jsonl.d output creates files named after input files.

    When output is a .jsonl.d directory, each input file should write to
    a separate JSONL file named after the input (e.g., sailormoon.jsonl),
    not a generic data.jsonl.
    """
    output_dir = os.path.join(TEST_OUTPUT_DIR, "jsonl_dir_test.jsonl.d")
    input_file = os.path.join(TEST_DIR, "dumps", f"{SAILORMOON}.xml.7z")

    # Remove leftovers from a previous run so stale files cannot satisfy
    # (or break) the existence checks below.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)

    # Run wikiq with .jsonl.d output
    cmd = f"{WIKIQ} {input_file} -o {output_dir} --fandom-2020 --batch-size 10"
    try:
        subprocess.check_output(cmd, stderr=subprocess.PIPE, shell=True)
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    # Check the directory itself first: the failure message further down
    # lists its contents, and os.listdir() would raise FileNotFoundError
    # (hiding the real problem) if the directory was never created.
    assert os.path.isdir(output_dir), \
        f"Expected output directory {output_dir} to be created"

    # Verify output file is named after input, not "data.jsonl"
    expected_output = os.path.join(output_dir, f"{SAILORMOON}.jsonl")
    wrong_output = os.path.join(output_dir, "data.jsonl")

    assert os.path.exists(expected_output), \
        f"Expected {expected_output} to exist, but it doesn't. Directory contents: {os.listdir(output_dir)}"
    assert not os.path.exists(wrong_output), \
        f"Expected {wrong_output} NOT to exist (should be named after input file)"

    # Verify output has data
    rows = read_jsonl(expected_output)
    assert len(rows) > 0, "Output file should have data"
|
||||
|
||||
|
||||
def test_jsonl_dir_resume():
    """Test that resume works correctly with .jsonl.d directory output.

    The resume logic must derive the same filename from the input file
    as the write logic does.

    The test runs wikiq to completion, truncates the per-input JSONL file
    to its first half, re-runs with --resume, and checks that the resumed
    file matches the complete run row for row.
    """
    import pandas as pd
    from pandas.testing import assert_frame_equal

    output_dir = os.path.join(TEST_OUTPUT_DIR, "jsonl_dir_resume.jsonl.d")
    input_file = os.path.join(TEST_DIR, "dumps", f"{SAILORMOON}.xml.7z")

    # Remove leftovers from a previous run so the first run starts fresh.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)

    # First run: complete
    cmd_full = f"{WIKIQ} {input_file} -o {output_dir} --fandom-2020 --batch-size 10"
    try:
        subprocess.check_output(cmd_full, stderr=subprocess.PIPE, shell=True)
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    expected_output = os.path.join(output_dir, f"{SAILORMOON}.jsonl")
    full_rows = read_jsonl(expected_output)

    # Truncate to partial
    partial_idx = len(full_rows) // 2
    # With fewer than two rows the "partial" file would be empty and the
    # resume path would never actually be exercised.
    assert partial_idx > 0, \
        "Full run must produce at least two rows to test resuming from a partial file"

    with open(expected_output, 'w', encoding="utf-8") as f:
        f.writelines(json.dumps(row) + "\n" for row in full_rows[:partial_idx])

    # Resume
    cmd_resume = f"{WIKIQ} {input_file} -o {output_dir} --fandom-2020 --batch-size 10 --resume"
    try:
        subprocess.check_output(cmd_resume, stderr=subprocess.PIPE, shell=True)
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    resumed_rows = read_jsonl(expected_output)

    # The resumed output must match the uninterrupted run.
    df_full = pd.DataFrame(full_rows)
    df_resumed = pd.DataFrame(resumed_rows)
    assert_frame_equal(df_full, df_resumed)
|
||||
|
||||
|
||||
def test_resume_revert_detection():
|
||||
"""Test that revert detection works correctly after resume.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user