Make `--resume` work with partitioned namespaces.

This commit is contained in:
Nathan TeBlunthuis
2025-12-01 07:19:52 -08:00
parent 3c26185739
commit b46f98a875
2 changed files with 257 additions and 54 deletions

View File

@@ -347,13 +347,24 @@ class WikiqParser:
# Track whether we've passed the resume point
found_resume_point = self.resume_from_revid is None
# When resuming with parquet, write new data to temp file and merge at the end
# When resuming with parquet, write new data to temp file/directory and merge at the end
original_output_file = None
temp_output_file = None
if self.resume_from_revid is not None and self.output_parquet:
if isinstance(self.output_file, str) and os.path.exists(self.output_file):
original_output_file = self.output_file
# For partitioned namespaces, create a temp directory; for single files, create a temp file path
temp_output_file = self.output_file + ".resume_temp"
# Remove temp file/dir if it exists from a previous failed run
if os.path.exists(temp_output_file):
import shutil
if os.path.isdir(temp_output_file):
shutil.rmtree(temp_output_file)
else:
os.remove(temp_output_file)
# For partitioned namespaces, create the directory now; for single files it will be created by ParquetWriter
if self.partition_namespaces:
os.makedirs(temp_output_file, exist_ok=True)
self.output_file = temp_output_file
# Construct dump file iterator
@@ -780,43 +791,26 @@ class WikiqParser:
if original_output_file is not None and temp_output_file is not None:
print("Merging resumed data with existing output...", file=sys.stderr)
try:
# Create a merged output file
merged_output_file = original_output_file + ".merged"
# Check if we're merging partitioned namespaces or single files
if os.path.isdir(original_output_file):
# Merge partitioned namespace directories
self._merge_partitioned_namespaces(original_output_file, temp_output_file)
else:
# Merge single parquet files
merged_output_file = original_output_file + ".merged"
merge_parquet_files(original_output_file, temp_output_file, merged_output_file)
# Open the original file
original_pq = pq.ParquetFile(original_output_file)
# Open the temp file
temp_pq = pq.ParquetFile(temp_output_file)
# Replace the original file with the merged file
os.remove(original_output_file)
os.rename(merged_output_file, original_output_file)
# Create a writer for the merged file
merged_writer = None
# Copy all row groups from the original file
for i in range(original_pq.num_row_groups):
row_group = original_pq.read_row_group(i)
if merged_writer is None:
merged_writer = pq.ParquetWriter(
merged_output_file,
row_group.schema,
flavor="spark"
)
merged_writer.write_table(row_group)
# Append all row groups from the temp file
for i in range(temp_pq.num_row_groups):
row_group = temp_pq.read_row_group(i)
merged_writer.write_table(row_group)
# Close the writer
if merged_writer is not None:
merged_writer.close()
# Replace the original file with the merged file
os.remove(original_output_file)
os.rename(merged_output_file, original_output_file)
# Clean up the temp file
os.remove(temp_output_file)
# Clean up the temp file/directory
if os.path.exists(temp_output_file):
if os.path.isdir(temp_output_file):
import shutil
shutil.rmtree(temp_output_file)
else:
os.remove(temp_output_file)
print("Merge complete.", file=sys.stderr)
except Exception as e:
@@ -824,6 +818,50 @@ class WikiqParser:
print(f"New data saved in: {temp_output_file}", file=sys.stderr)
raise
def _merge_partitioned_namespaces(self, original_output_dir, temp_output_dir):
"""
Merge partitioned namespace directories.
For each namespace partition in the temp directory, merge its parquet files with the original.
"""
import shutil
# Get all namespace directories from temp
temp_namespace_dirs = [d for d in os.listdir(temp_output_dir) if d.startswith('namespace=')]
for ns_dir in temp_namespace_dirs:
temp_ns_path = os.path.join(temp_output_dir, ns_dir)
original_ns_path = os.path.join(original_output_dir, ns_dir)
# Find parquet files in the temp namespace directory
temp_parquet_files = [f for f in os.listdir(temp_ns_path) if f.endswith('.parquet')]
if not temp_parquet_files:
continue
temp_parquet_path = os.path.join(temp_ns_path, temp_parquet_files[0])
# Check if the namespace partition exists in the original directory
if os.path.exists(original_ns_path):
# Namespace partition exists, merge the files
original_parquet_files = [f for f in os.listdir(original_ns_path) if f.endswith('.parquet')]
if not original_parquet_files:
# No parquet file in original, just copy the temp file
shutil.copy(temp_parquet_path, os.path.join(original_ns_path, temp_parquet_files[0]))
else:
original_parquet_path = os.path.join(original_ns_path, original_parquet_files[0])
merged_parquet_path = original_parquet_path + ".merged"
# Merge the files
merge_parquet_files(original_parquet_path, temp_parquet_path, merged_parquet_path)
# Replace the original file with the merged file
os.remove(original_parquet_path)
os.rename(merged_parquet_path, original_parquet_path)
else:
# Namespace partition doesn't exist in original, create it
shutil.copytree(temp_ns_path, original_ns_path)
def match_archive_suffix(input_filename):
if re.match(r".*\.7z$", input_filename):
@@ -864,37 +902,108 @@ def open_output_file(input_filename):
return output_file
def merge_parquet_files(original_path, temp_path, merged_path):
    """
    Merge two parquet files by copying all row groups from *original_path*
    and then *temp_path* into a new file at *merged_path*.

    The writer is created lazily from the schema of the first row group
    seen.  This fixes a crash in the previous version: if the original
    file contained zero row groups, ``merged_writer`` was still ``None``
    when the temp-file loop called ``write_table`` on it.  The writer is
    also closed even if a write raises.

    Parameters
    ----------
    original_path : str
        Existing parquet file whose row groups come first.
    temp_path : str
        Parquet file with newly written row groups, appended after.
    merged_path : str
        Destination path for the merged parquet file.
    """
    original_pq = pq.ParquetFile(original_path)
    temp_pq = pq.ParquetFile(temp_path)
    merged_writer = None
    try:
        for source in (original_pq, temp_pq):
            for i in range(source.num_row_groups):
                row_group = source.read_row_group(i)
                if merged_writer is None:
                    merged_writer = pq.ParquetWriter(
                        merged_path,
                        row_group.schema,
                        flavor="spark"
                    )
                merged_writer.write_table(row_group)
    finally:
        # Close the writer so footer metadata is flushed; if both inputs
        # were empty no writer (and no output file) was created.
        if merged_writer is not None:
            merged_writer.close()
def get_last_revid_from_parquet(output_file):
    """
    Read the last revid from a parquet file or partitioned namespace directory.

    Handles both a single parquet file and a partitioned namespace layout
    (``namespace=*/file.parquet``).  For partitioned output, the most
    recently modified ``namespace=*`` partition is consulted.

    Returns None if the path doesn't exist, holds no data, or cannot be read.
    """
    try:
        if not os.path.exists(output_file):
            return None
        if os.path.isdir(output_file):
            # Partitioned namespace directory: locate the parquet file in
            # the most recently modified namespace partition.
            namespace_dirs = [d for d in os.listdir(output_file)
                              if d.startswith('namespace=')]
            if not namespace_dirs:
                return None
            most_recent_ns = max(
                (os.path.join(output_file, d) for d in namespace_dirs),
                key=os.path.getmtime,
            )
            parquet_files = [f for f in os.listdir(most_recent_ns)
                             if f.endswith('.parquet')]
            if not parquet_files:
                return None
            # NOTE(review): assumes one parquet file per partition; only
            # parquet_files[0] is read -- confirm against the writer.
            parquet_path = os.path.join(most_recent_ns, parquet_files[0])
        else:
            # Single parquet file.
            parquet_path = output_file
        return _read_last_revid(parquet_path)
    except Exception as e:
        # Best-effort: resume falls back to a full run when this fails, so
        # report and return None rather than propagating.
        print(f"Error reading last revid from {output_file}: {e}", file=sys.stderr)
        return None


def _read_last_revid(parquet_path):
    """
    Return the revid of the final row of *parquet_path*, or None if the
    file has no row groups or the last row group is empty.

    Reads only the last row group and only the 'revid' column, so this
    stays cheap even for large outputs.  (Previously this logic was
    duplicated verbatim in both branches of get_last_revid_from_parquet.)
    """
    parquet_file = pq.ParquetFile(parquet_path)
    if parquet_file.num_row_groups == 0:
        return None
    last_row_group = parquet_file.read_row_group(
        parquet_file.num_row_groups - 1, columns=['revid'])
    if last_row_group.num_rows == 0:
        return None
    return last_row_group.column('revid')[-1].as_py()