Fix issue with .7z archives

Before, only Fandom wiki dumps were compressed as .7z archives.
These archives can contain several .xml files, not just one.
So we need a flag for the fandom-2020 dumps.

This fixes the bug so .7z archives work in either case.
This commit is contained in:
Nathan TeBlunthuis
2025-05-28 21:31:41 -07:00
parent 39fec0820d
commit 2a2b611d79
3 changed files with 19 additions and 11 deletions

16
wikiq
View File

@@ -704,17 +704,21 @@ class WikiqParser:
line = rev_data.to_tsv_row()
print(line, file=self.output_file)
def open_input_file(input_filename):
def match_archive_suffix(input_filename):
    """Return the decompression command line for ``input_filename``.

    The command is selected from the filename suffix. The caller runs it
    as a subprocess and reads the decompressed data from its stdout.

    Args:
        input_filename: Path to a ``.7z``, ``.gz`` or ``.bz2`` archive.

    Returns:
        A newly built argument list (safe for the caller to extend):
        ``["7za", "x", "-so", input_filename]`` for ``.7z``,
        ``["zcat", input_filename]`` for ``.gz``,
        ``["bzcat", "-dk", input_filename]`` for ``.bz2``.

    Raises:
        ValueError: If the filename has none of the recognized suffixes.
    """
    if re.match(r'.*\.7z$', input_filename):
        # No "*.xml" member filter here: open_input_file appends it only
        # when the fandom-2020 flag is set.
        cmd = ["7za", "x", "-so", input_filename]
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]
    else:
        raise ValueError("Unrecognized file type: %s" % input_filename)
    return cmd
def open_input_file(input_filename, fandom_2020=False):
cmd = match_archive_suffix(input_filename)
if fandom_2020:
cmd.append("*.xml")
try:
return Popen(cmd, stdout=PIPE).stdout
except NameError:
@@ -787,6 +791,10 @@ def main():
action='append',
help="The label for the outputted column based on matching the regex in comments.")
parser.add_argument('--fandom-2020', dest="fandom_2020",
action='store_true',
help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.")
args = parser.parse_args()
# set persistence method
@@ -808,7 +816,7 @@ def main():
if len(args.dumpfiles) > 0:
output_parquet = False
for filename in args.dumpfiles:
input_file = open_input_file(filename)
input_file = open_input_file(filename, args.fandom_2020)
# open directory for output
if args.output_dir: