changes for archiving.
This commit is contained in:
34
dumps/remove_duplicate_comments.py
Normal file
34
dumps/remove_duplicate_comments.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from pathlib import Path
|
||||
from itertools import chain, groupby
|
||||
|
||||
dumpdir = Path("/gscratch/comdata/raw_data/reddit_dumps/comments")
|
||||
|
||||
zst_files = dumpdir.glob("*.zst")
|
||||
bz2_files = dumpdir.glob("*.bz2")
|
||||
xz_files = dumpdir.glob("*.xz")
|
||||
all_files = sorted(list(chain(zst_files, bz2_files, xz_files)))
|
||||
groups = groupby(all_files, key = lambda p: p.stem)
|
||||
|
||||
kept_paths = []
|
||||
removed_paths = []
|
||||
|
||||
priority = ['.zst','.xz','.bz2']
|
||||
|
||||
for stem, files in groups:
|
||||
keep_file = None
|
||||
remove_files = []
|
||||
for f in files:
|
||||
if keep_file is None:
|
||||
keep_file = f
|
||||
elif priority.index(keep_file.suffix) > priority.index(f.suffix):
|
||||
remove_files.append(keep_file)
|
||||
keep_file = f
|
||||
else:
|
||||
remove_files.append(f)
|
||||
kept_paths.append(keep_file)
|
||||
removed_paths.extend(remove_files)
|
||||
|
||||
(dumpdir / "to_remove").mkdir()
|
||||
|
||||
for f in removed_paths:
|
||||
f.rename(f.parent / "to_remove" / f.name)
|
||||
34
dumps/remove_duplicate_submissions.py
Normal file
34
dumps/remove_duplicate_submissions.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from pathlib import Path
|
||||
from itertools import chain, groupby
|
||||
|
||||
dumpdir = Path("/gscratch/comdata/raw_data/reddit_dumps/submissions")
|
||||
|
||||
zst_files = dumpdir.glob("*.zst")
|
||||
bz2_files = dumpdir.glob("*.bz2")
|
||||
xz_files = dumpdir.glob("*.xz")
|
||||
all_files = sorted(list(chain(zst_files, bz2_files, xz_files)))
|
||||
groups = groupby(all_files, key = lambda p: p.stem)
|
||||
|
||||
kept_paths = []
|
||||
removed_paths = []
|
||||
|
||||
priority = ['.zst','.xz','.bz2']
|
||||
|
||||
for stem, files in groups:
|
||||
keep_file = None
|
||||
remove_files = []
|
||||
for f in files:
|
||||
if keep_file is None:
|
||||
keep_file = f
|
||||
elif priority.index(keep_file.suffix) > priority.index(f.suffix):
|
||||
remove_files.append(keep_file)
|
||||
keep_file = f
|
||||
else:
|
||||
remove_files.append(f)
|
||||
kept_paths.append(keep_file)
|
||||
removed_paths.extend(remove_files)
|
||||
|
||||
(dumpdir / "to_remove").mkdir()
|
||||
|
||||
for f in removed_paths:
|
||||
f.rename(f.parent / "to_remove" / f.name)
|
||||
Reference in New Issue
Block a user