Only merge the partitioned temp files that belong to the current dump file.

This commit is contained in:
Nathan TeBlunthuis
2025-12-19 11:47:18 -08:00
parent 006feb795c
commit 38dabd0547

View File

@@ -45,7 +45,7 @@ def cleanup_interrupted_resume(output_file, partition_namespaces):
if has_old_temp_files: if has_old_temp_files:
print(f"Found leftover temp files in {partition_dir} from previous interrupted partitioned run, merging first...", file=sys.stderr) print(f"Found leftover temp files in {partition_dir} from previous interrupted partitioned run, merging first...", file=sys.stderr)
had_corruption = merge_partitioned_namespaces(partition_dir, temp_suffix) had_corruption = merge_partitioned_namespaces(partition_dir, temp_suffix, output_filename)
# Check if any valid data remains after merge # Check if any valid data remains after merge
has_valid_data = False has_valid_data = False
@@ -342,7 +342,7 @@ def merge_parquet_files(original_path, temp_path, merged_path):
return False return False
def merge_partitioned_namespaces(partition_dir, temp_suffix): def merge_partitioned_namespaces(partition_dir, temp_suffix, file_filter):
""" """
Merge partitioned namespace directories after resume. Merge partitioned namespace directories after resume.
@@ -353,6 +353,8 @@ def merge_partitioned_namespaces(partition_dir, temp_suffix):
Args: Args:
partition_dir: The partition directory containing namespace=* subdirs partition_dir: The partition directory containing namespace=* subdirs
temp_suffix: The suffix appended to temp files (e.g., '.resume_temp') temp_suffix: The suffix appended to temp files (e.g., '.resume_temp')
file_filter: Only process temp files matching this base name
(e.g., 'enwiki-20250123-pages-meta-history24-p53238682p53445302.parquet')
Returns: Returns:
True if at least one namespace has valid data after merge True if at least one namespace has valid data after merge
@@ -360,20 +362,17 @@ def merge_partitioned_namespaces(partition_dir, temp_suffix):
""" """
namespace_dirs = [d for d in os.listdir(partition_dir) if d.startswith('namespace=')] namespace_dirs = [d for d in os.listdir(partition_dir) if d.startswith('namespace=')]
had_corruption = False had_corruption = False
expected_temp = file_filter + temp_suffix
for ns_dir in namespace_dirs: for ns_dir in namespace_dirs:
ns_path = os.path.join(partition_dir, ns_dir) ns_path = os.path.join(partition_dir, ns_dir)
temp_path = os.path.join(ns_path, expected_temp)
# Find all files in this namespace directory if not os.path.exists(temp_path):
files = os.listdir(ns_path) continue
# Find temp files (files ending with the temp suffix)
temp_files = [f for f in files if f.endswith(temp_suffix)]
for temp_file in temp_files:
temp_path = os.path.join(ns_path, temp_file)
# Original file is the temp file without the suffix # Original file is the temp file without the suffix
original_file = temp_file[:-len(temp_suffix)] original_file = file_filter
original_path = os.path.join(ns_path, original_file) original_path = os.path.join(ns_path, original_file)
if os.path.exists(original_path): if os.path.exists(original_path):
@@ -447,7 +446,9 @@ def finalize_resume_merge(
if partition_namespaces and original_partition_dir is not None: if partition_namespaces and original_partition_dir is not None:
# For partitioned namespaces, temp files are written alongside originals # For partitioned namespaces, temp files are written alongside originals
# with '.resume_temp' suffix in each namespace directory. # with '.resume_temp' suffix in each namespace directory.
merge_partitioned_namespaces(original_partition_dir, ".resume_temp") # Only merge temp files for the current dump file, not other concurrent jobs.
file_filter = os.path.basename(original_output_file)
merge_partitioned_namespaces(original_partition_dir, ".resume_temp", file_filter)
# Clean up the empty temp directory we created # Clean up the empty temp directory we created
if os.path.exists(temp_output_file) and os.path.isdir(temp_output_file): if os.path.exists(temp_output_file) and os.path.isdir(temp_output_file):
shutil.rmtree(temp_output_file) shutil.rmtree(temp_output_file)