1
0
mw-convo-collections/data_collection/decompression_script.py

50 lines
1.6 KiB
Python

import requests
import bz2
import shutil
import os
import sys
#FILE_LOC_PREFIX = "/data_ext/users/nws8519/mw-repo-lifecycles/wiki_activity_data/single_activity_files/"
def decompress(filepath):
    """Decompress one bzip2 archive next to itself, then delete the archive.

    The output path is ``filepath`` with its last four characters removed,
    so callers are expected to pass a path ending in ``.bz2``.

    Args:
        filepath: Path of the ``.bz2`` file to decompress.

    Raises:
        OSError: If the archive or the output file cannot be opened, the
            data is not a valid bzip2 stream, or the archive cannot be
            deleted after a successful decompression.
        EOFError: If the compressed stream ends before its end-of-stream
            marker (a truncated archive).
    """
    decompressed_filepath = filepath[:-4]
    with bz2.BZ2File(filepath) as fr:
        # Open the output outside the try block: if this open fails there is
        # nothing of ours to clean up, and a pre-existing file must survive.
        fw = open(decompressed_filepath, "wb")
        try:
            with fw:
                shutil.copyfileobj(fr, fw)
        except BaseException:
            # The copy stopped partway (corrupt/truncated data, full disk,
            # Ctrl-C).  A partial output would look like a valid result, so
            # remove it before propagating the original error.
            try:
                os.remove(decompressed_filepath)
            except OSError:
                # Best-effort cleanup; the original exception matters more.
                pass
            raise
    print(f"Decompressed {decompressed_filepath}")
    os.remove(filepath)
    print(f"Deleted {filepath}")
def decompress_directory(directory_name):
    """Decompress every ``.bz2`` file found under ``directory_name``.

    The tree is walked with ``os.walk`` and each file whose name ends in
    ``.bz2`` is passed to ``decompress``.  When decompression fails, the
    failure is counted and reported, the archive is deleted, and the walk
    continues with the next file.

    Args:
        directory_name: Root directory to traverse.

    Returns:
        The number of archives whose decompression raised ``OSError`` or
        ``EOFError``.
    """
    failures = 0
    for root, _dirs, files in os.walk(directory_name):
        for file in files:
            if not file.endswith('.bz2'):
                continue
            # Full path to the file
            filepath = os.path.join(root, file)
            print(filepath)
            try:
                decompress(filepath)
            except (OSError, EOFError) as err:
                # BZ2File reports a truncated stream as EOFError, which is
                # not an OSError; catch it too so one incomplete archive
                # does not abort the whole traversal.
                failures += 1
                label = "EOFError" if isinstance(err, EOFError) else "OSError"
                print(f"{label} @ {filepath}")
                # Deleting the failed archive can itself fail (permissions,
                # file already gone); report that instead of crashing.
                try:
                    os.remove(filepath)
                except OSError:
                    print(f"Could not delete {filepath}")
                else:
                    print(f"Deleted {filepath}")
    return failures
def cleanup(directory_name, suffix='.bz2'):
    """Delete every file under ``directory_name`` whose name ends in ``suffix``.

    Args:
        directory_name: Root directory to traverse with ``os.walk``.
        suffix: File-name ending to match, or a tuple of endings (anything
            ``str.endswith`` accepts).  Defaults to ``'.bz2'``.

    Raises:
        ValueError: If ``suffix`` is empty, since an empty suffix would match
            and delete every file in the tree.
        OSError: If a matching file cannot be removed.
    """
    if not suffix:
        raise ValueError("suffix must be non-empty")
    for root, _dirs, files in os.walk(directory_name):
        for file in files:
            if file.endswith(suffix):
                filepath = os.path.join(root, file)
                os.remove(filepath)
                print(f"Deleted {filepath}")
if __name__ == "__main__":
    # Script entry point: the first command-line argument is the directory
    # whose .bz2 files are decompressed in place.
    #batch_parallel_for_single()
    if len(sys.argv) < 2:
        # Exit with a usage message rather than an IndexError traceback.
        sys.exit(f"usage: {sys.argv[0]} DIRECTORY")
    decompression_errors = decompress_directory(sys.argv[1])
    print(f"We had {decompression_errors} OSErrors during decompression.")
    #cleanup(FILE_LOC_PREFIX)