Don't put checkpoint files inside namespace directories.
This commit is contained in:
parent
783f5fd8bc
commit
d1fc094c96
@ -322,7 +322,7 @@ class WikiqParser:
|
|||||||
"""Open checkpoint file for writing. Keeps file open for performance."""
|
"""Open checkpoint file for writing. Keeps file open for performance."""
|
||||||
if not self.output_parquet or output_file == sys.stdout.buffer:
|
if not self.output_parquet or output_file == sys.stdout.buffer:
|
||||||
return
|
return
|
||||||
checkpoint_path = get_checkpoint_path(output_file)
|
checkpoint_path = get_checkpoint_path(output_file, self.partition_namespaces)
|
||||||
Path(checkpoint_path).parent.mkdir(parents=True, exist_ok=True)
|
Path(checkpoint_path).parent.mkdir(parents=True, exist_ok=True)
|
||||||
self.checkpoint_file = open(checkpoint_path, 'w')
|
self.checkpoint_file = open(checkpoint_path, 'w')
|
||||||
print(f"Checkpoint file opened: {checkpoint_path}", file=sys.stderr)
|
print(f"Checkpoint file opened: {checkpoint_path}", file=sys.stderr)
|
||||||
|
|||||||
@ -14,12 +14,24 @@ import sys
|
|||||||
import pyarrow.parquet as pq
|
import pyarrow.parquet as pq
|
||||||
|
|
||||||
|
|
||||||
def get_checkpoint_path(output_file):
|
def get_checkpoint_path(output_file, partition_namespaces=False):
|
||||||
"""Get the path to the checkpoint file for a given output file."""
|
"""Get the path to the checkpoint file for a given output file.
|
||||||
|
|
||||||
|
For partitioned output, the checkpoint is placed outside the partition directory
|
||||||
|
to avoid pyarrow trying to read it as a parquet file. The filename includes
|
||||||
|
the output filename to keep it unique per input file (for parallel jobs).
|
||||||
|
"""
|
||||||
|
if partition_namespaces:
|
||||||
|
# output_file is like partition_dir/output.parquet
|
||||||
|
# checkpoint should be at parent level: parent/output.parquet.checkpoint
|
||||||
|
partition_dir = os.path.dirname(output_file)
|
||||||
|
output_filename = os.path.basename(output_file)
|
||||||
|
parent_dir = os.path.dirname(partition_dir)
|
||||||
|
return os.path.join(parent_dir, output_filename + ".checkpoint")
|
||||||
return str(output_file) + ".checkpoint"
|
return str(output_file) + ".checkpoint"
|
||||||
|
|
||||||
|
|
||||||
def read_checkpoint(output_file):
|
def read_checkpoint(output_file, partition_namespaces=False):
|
||||||
"""
|
"""
|
||||||
Read resume point from checkpoint file if it exists.
|
Read resume point from checkpoint file if it exists.
|
||||||
|
|
||||||
@ -31,7 +43,7 @@ def read_checkpoint(output_file):
|
|||||||
For single files: A tuple (pageid, revid), or None if not found.
|
For single files: A tuple (pageid, revid), or None if not found.
|
||||||
For partitioned: A dict mapping namespace -> (pageid, revid), or None.
|
For partitioned: A dict mapping namespace -> (pageid, revid), or None.
|
||||||
"""
|
"""
|
||||||
checkpoint_path = get_checkpoint_path(output_file)
|
checkpoint_path = get_checkpoint_path(output_file, partition_namespaces)
|
||||||
if not os.path.exists(checkpoint_path):
|
if not os.path.exists(checkpoint_path):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -78,7 +90,7 @@ def get_resume_point(output_file, partition_namespaces=False):
|
|||||||
or None if no partitions exist.
|
or None if no partitions exist.
|
||||||
"""
|
"""
|
||||||
# First try checkpoint file (fast)
|
# First try checkpoint file (fast)
|
||||||
checkpoint_result = read_checkpoint(output_file)
|
checkpoint_result = read_checkpoint(output_file, partition_namespaces)
|
||||||
if checkpoint_result is not None:
|
if checkpoint_result is not None:
|
||||||
print(f"Resume point found in checkpoint file", file=sys.stderr)
|
print(f"Resume point found in checkpoint file", file=sys.stderr)
|
||||||
return checkpoint_result
|
return checkpoint_result
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user