fix jsonl.d output.

2025-12-30 11:26:24 -08:00
parent 93f6ed0ff5
commit 8590e5f920
2 changed files with 92 additions and 7 deletions
--- a/src/wikiq/init.py
+++ b/src/wikiq/init.py
@@ -527,6 +527,7 @@ class WikiqParser:
        headings: bool = False,
        time_limit_seconds: Union[float, None] = None,
        max_revisions_per_file: int = 0,
        input_filename: Union[str, None] = None,
    ):
        """
        Parameters:
@@ -535,8 +536,10 @@ class WikiqParser:
                          or a dict mapping namespace -> (pageid, revid) for partitioned output.
                          For single-file: skip all revisions up to and including this point.
           max_revisions_per_file : if > 0, close and rotate output files after this many revisions
           input_filename : original input filename (needed for .jsonl.d output to derive output filename)
        """
        self.input_file = input_file
        self.input_filename = input_filename
        self.collapse_user: bool = collapse_user
        self.persist: int = persist
@@ -845,13 +848,12 @@ class WikiqParser:
            if self.output_jsonl_dir:
                # Create directory for JSONL output
                Path(self.output_file).mkdir(parents=True, exist_ok=True)
-                part_num = 0
+                # Derive JSONL filename from input filename
-                if self.resume_point is not None and len(self.resume_point) > 2:
+                if self.input_filename:
-                    part_num = self.resume_point[2]
+                    jsonl_basename = os.path.basename(get_output_filename(self.input_filename, 'jsonl'))
-                part_numbers[None] = part_num
+                else:
-                jsonl_path = self._get_part_path(
+                    jsonl_basename = "data.jsonl"
-                    Path(self.output_file) / "data.jsonl", part_num
+                jsonl_path = Path(self.output_file) / jsonl_basename
-                )
                writer = JSONLWriter(str(jsonl_path), schema, append=append_mode)
            else:
                writer = JSONLWriter(self.output_file, schema, append=append_mode)
@@ -1527,6 +1529,7 @@ def main():
                headings=args.headings,
                time_limit_seconds=time_limit_seconds,
                max_revisions_per_file=args.max_revisions_per_file,
                input_filename=filename,
            )
            # Register signal handlers for graceful shutdown (CLI only)
--- a/test/test_resume.py
+++ b/test/test_resume.py
@@ -668,6 +668,88 @@ def test_resume_page_boundary():
    assert_frame_equal(df_full, df_resumed)
 def test_jsonl_dir_output():
    """Test that .jsonl.d output creates files named after input files.

    When output is a .jsonl.d directory, each input file should write to
    a separate JSONL file named after the input (e.g., sailormoon.jsonl),
    not a generic data.jsonl.
    """
    output_dir = os.path.join(TEST_OUTPUT_DIR, "jsonl_dir_test.jsonl.d")
    input_file = os.path.join(TEST_DIR, "dumps", f"{SAILORMOON}.xml.7z")

    # Remove leftovers from earlier runs so stale files cannot satisfy the checks below.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)

    # Run wikiq with .jsonl.d output
    cmd = f"{WIKIQ} {input_file} -o {output_dir} --fandom-2020 --batch-size 10"
    try:
        subprocess.check_output(cmd, stderr=subprocess.PIPE, shell=True)
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    # Check the directory first: the message of the next assertion lists its
    # contents, which would raise FileNotFoundError and hide the real failure
    # if the directory had never been created.
    assert os.path.isdir(output_dir), \
        f"Expected output directory {output_dir} to exist, but it doesn't."

    # Verify output file is named after input, not "data.jsonl"
    expected_output = os.path.join(output_dir, f"{SAILORMOON}.jsonl")
    wrong_output = os.path.join(output_dir, "data.jsonl")
    assert os.path.exists(expected_output), \
        f"Expected {expected_output} to exist, but it doesn't. Directory contents: {os.listdir(output_dir)}"
    assert not os.path.exists(wrong_output), \
        f"Expected {wrong_output} NOT to exist (should be named after input file)"

    # Verify output has data
    rows = read_jsonl(expected_output)
    assert len(rows) > 0, "Output file should have data"
 def test_jsonl_dir_resume():
    """Check that --resume works when the output is a .jsonl.d directory.

    A complete run is performed first, the resulting JSONL file is cut
    down to its first half, and wikiq is then re-run with --resume. The
    rows read back afterwards must equal the rows of the complete run,
    so the resume logic has to derive the same per-input filename as the
    write logic.
    """
    import pandas as pd
    from pandas.testing import assert_frame_equal

    def run_or_fail(command):
        # Report wikiq's stderr as the pytest failure message on a non-zero exit.
        try:
            subprocess.check_output(command, stderr=subprocess.PIPE, shell=True)
        except subprocess.CalledProcessError as exc:
            pytest.fail(exc.stderr.decode("utf8"))

    target_dir = os.path.join(TEST_OUTPUT_DIR, "jsonl_dir_resume.jsonl.d")
    dump_path = os.path.join(TEST_DIR, "dumps", f"{SAILORMOON}.xml.7z")
    if os.path.exists(target_dir):
        shutil.rmtree(target_dir)

    # Baseline: one uninterrupted run.
    base_cmd = f"{WIKIQ} {dump_path} -o {target_dir} --fandom-2020 --batch-size 10"
    run_or_fail(base_cmd)
    jsonl_path = os.path.join(target_dir, f"{SAILORMOON}.jsonl")
    complete_rows = read_jsonl(jsonl_path)

    # Overwrite the output with only the first half of the rows.
    keep = len(complete_rows) // 2
    with open(jsonl_path, 'w') as handle:
        handle.writelines(json.dumps(row) + "\n" for row in complete_rows[:keep])

    # Second run with --resume against the truncated file.
    run_or_fail(f"{base_cmd} --resume")
    rows_after_resume = read_jsonl(jsonl_path)

    assert_frame_equal(pd.DataFrame(complete_rows), pd.DataFrame(rows_after_resume))
 def test_resume_revert_detection():
    """Test that revert detection works correctly after resume.