Fix bug by truncating corrupted trailing JSONL lines.
This commit is contained in:
@@ -175,15 +175,32 @@ def get_jsonl_resume_point(output_file, input_file=None):
|
||||
return None
|
||||
|
||||
try:
|
||||
with open(output_file) as f:
|
||||
# Stream through file, keeping only last 2 lines in memory
|
||||
for line in reversed(deque(f, maxlen=2)):
|
||||
# Track positions of last two valid lines for potential truncation
|
||||
valid_lines = deque(maxlen=2) # (end_position, record)
|
||||
with open(output_file, 'rb') as f:
|
||||
while True:
|
||||
line = f.readline()
|
||||
if not line:
|
||||
break
|
||||
try:
|
||||
record = json.loads(line)
|
||||
return (record['articleid'], record['revid'])
|
||||
except (json.JSONDecodeError, KeyError):
|
||||
continue
|
||||
return None
|
||||
record = json.loads(line.decode('utf-8'))
|
||||
valid_lines.append((f.tell(), record))
|
||||
except (json.JSONDecodeError, KeyError, UnicodeDecodeError):
|
||||
pass
|
||||
|
||||
if not valid_lines:
|
||||
return None
|
||||
|
||||
last_valid_pos, last_valid_record = valid_lines[-1]
|
||||
|
||||
# Truncate if file extends past last valid line (corrupted trailing data)
|
||||
file_size = os.path.getsize(output_file)
|
||||
if last_valid_pos < file_size:
|
||||
print(f"Truncating corrupted data from {output_file} ({file_size - last_valid_pos} bytes)", file=sys.stderr)
|
||||
with open(output_file, 'r+b') as f:
|
||||
f.truncate(last_valid_pos)
|
||||
|
||||
return (last_valid_record['articleid'], last_valid_record['revid'])
|
||||
except IOError as e:
|
||||
print(f"Warning: Could not read {output_file}: {e}", file=sys.stderr)
|
||||
return None
|
||||
|
||||
Reference in New Issue
Block a user