Fix issue with .7z archives
Previously, only the Fandom wiki dumps were compressed as .7z. Those archives can contain several .xml files, not just one, so we need a flag for the fandom-2020 dumps. This fixes the bug so that .7z archives work in either case.
This commit is contained in:
16
wikiq
16
wikiq
@@ -704,17 +704,21 @@ class WikiqParser:
|
||||
line = rev_data.to_tsv_row()
|
||||
print(line, file=self.output_file)
|
||||
|
||||
|
||||
def open_input_file(input_filename):
|
||||
def match_archive_suffix(input_filename):
|
||||
if re.match(r'.*\.7z$', input_filename):
|
||||
cmd = ["7za", "x", "-so", input_filename, "*.xml"]
|
||||
cmd = ["7za", "x", "-so", input_filename]
|
||||
elif re.match(r'.*\.gz$', input_filename):
|
||||
cmd = ["zcat", input_filename]
|
||||
elif re.match(r'.*\.bz2$', input_filename):
|
||||
cmd = ["bzcat", "-dk", input_filename]
|
||||
else:
|
||||
raise ValueError("Unrecognized file type: %s" % input_filename)
|
||||
return cmd
|
||||
|
||||
def open_input_file(input_filename, fandom_2020=False):
|
||||
cmd = match_archive_suffix(input_filename)
|
||||
if fandom_2020:
|
||||
cmd.append("*.xml")
|
||||
try:
|
||||
return Popen(cmd, stdout=PIPE).stdout
|
||||
except NameError:
|
||||
@@ -787,6 +791,10 @@ def main():
|
||||
action='append',
|
||||
help="The label for the outputted column based on matching the regex in comments.")
|
||||
|
||||
parser.add_argument('--fandom-2020', dest="fandom_2020",
|
||||
action='store_true',
|
||||
help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# set persistence method
|
||||
@@ -808,7 +816,7 @@ def main():
|
||||
if len(args.dumpfiles) > 0:
|
||||
output_parquet = False
|
||||
for filename in args.dumpfiles:
|
||||
input_file = open_input_file(filename)
|
||||
input_file = open_input_file(filename, args.fandom_2020)
|
||||
|
||||
# open directory for output
|
||||
if args.output_dir:
|
||||
|
||||
Reference in New Issue
Block a user