13
0
cdsc_reddit/dumps/remove_duplicate_comments.py

35 lines
932 B
Python
Raw Normal View History

2023-05-24 00:18:19 +00:00
from pathlib import Path
from itertools import chain, groupby
dumpdir = Path("/gscratch/comdata/raw_data/reddit_dumps/comments")
zst_files = dumpdir.glob("*.zst")
bz2_files = dumpdir.glob("*.bz2")
xz_files = dumpdir.glob("*.xz")
all_files = sorted(list(chain(zst_files, bz2_files, xz_files)))
groups = groupby(all_files, key = lambda p: p.stem)
kept_paths = []
removed_paths = []
priority = ['.zst','.xz','.bz2']
for stem, files in groups:
keep_file = None
remove_files = []
for f in files:
if keep_file is None:
keep_file = f
elif priority.index(keep_file.suffix) > priority.index(f.suffix):
remove_files.append(keep_file)
keep_file = f
else:
remove_files.append(f)
kept_paths.append(keep_file)
removed_paths.extend(remove_files)
(dumpdir / "to_remove").mkdir()
for f in removed_paths:
f.rename(f.parent / "to_remove" / f.name)