Fix issue with .7z archives

Previously, only Fandom wiki dumps were compressed as .7z.
These archives can contain several .xml files, not just one,
so we need a flag for the fandom-2020 dumps.

This fixes the bug so that .7z archives work both with and without the fandom-2020 flag.
This commit is contained in:
Nathan TeBlunthuis 2025-05-28 21:31:41 -07:00
parent 39fec0820d
commit 2a2b611d79
3 changed files with 19 additions and 11 deletions

View File

@ -6,7 +6,7 @@ readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"deltas>=0.7.0",
"mw>=0.4.0",
"mediawiki-utilities>=0.4.18",
"mwpersistence>=0.2.4",
"mwreverts>=0.1.5",
"mwxml>=0.3.6",

View File

@ -148,7 +148,7 @@ class WikiqTestCase(unittest.TestCase):
tester = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z")
try:
tester.call_wikiq("--collapse-user")
tester.call_wikiq("--collapse-user --fandom-2020")
except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8"))
@ -162,7 +162,7 @@ class WikiqTestCase(unittest.TestCase):
tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z")
try:
tester.call_wikiq("--persistence segment")
tester.call_wikiq("--persistence segment --fandom-2020")
except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8"))
@ -176,7 +176,7 @@ class WikiqTestCase(unittest.TestCase):
tester = WikiqTester(SAILORMOON, "persistence_legacy", in_compression="7z")
try:
tester.call_wikiq("--persistence legacy")
tester.call_wikiq("--persistence legacy --fandom-2020")
except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8"))
@ -190,7 +190,7 @@ class WikiqTestCase(unittest.TestCase):
tester = WikiqTester(SAILORMOON, "persistence", in_compression="7z")
try:
tester.call_wikiq("--persistence")
tester.call_wikiq("--persistence --fandom-2020")
except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8"))
@ -206,7 +206,7 @@ class WikiqTestCase(unittest.TestCase):
tester = WikiqTester(SAILORMOON, "url-encode", in_compression="7z")
try:
tester.call_wikiq("--url-encode")
tester.call_wikiq("--url-encode --fandom-2020")
except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8"))
@ -233,7 +233,7 @@ class WikiqTestCase(unittest.TestCase):
tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z")
try:
outs = tester.call_wikiq( "--stdout", out=False).decode("utf8")
outs = tester.call_wikiq( "--stdout --fandom-2020", out=False).decode("utf8")
except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8"))

16
wikiq
View File

@ -704,17 +704,21 @@ class WikiqParser:
line = rev_data.to_tsv_row()
print(line, file=self.output_file)
def open_input_file(input_filename):
def match_archive_suffix(input_filename):
    """Return the command that streams a compressed dump to stdout.

    The decompressor is chosen from the filename suffix:
    ``.7z`` uses ``7za x -so``, ``.gz`` uses ``zcat`` and ``.bz2`` uses
    ``bzcat -dk``.

    Args:
        input_filename: Path of the compressed dump file.

    Returns:
        A newly built argv list. Callers may append further arguments
        (for example a member pattern for ``7za``) without affecting
        later calls.

    Raises:
        ValueError: If the filename has none of the recognized suffixes.
    """
    if re.match(r'.*\.7z$', input_filename):
        # No member pattern is added here; any restriction such as
        # "*.xml" is left to the caller.
        cmd = ["7za", "x", "-so", input_filename]
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]
    else:
        raise ValueError("Unrecognized file type: %s" % input_filename)
    return cmd
def open_input_file(input_filename, fandom_2020=False):
cmd = match_archive_suffix(input_filename)
if fandom_2020:
cmd.append("*.xml")
try:
return Popen(cmd, stdout=PIPE).stdout
except NameError:
@ -787,6 +791,10 @@ def main():
action='append',
help="The label for the outputted column based on matching the regex in comments.")
parser.add_argument('--fandom-2020', dest="fandom_2020",
action='store_true',
help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.")
args = parser.parse_args()
# set persistence method
@ -808,7 +816,7 @@ def main():
if len(args.dumpfiles) > 0:
output_parquet = False
for filename in args.dumpfiles:
input_file = open_input_file(filename)
input_file = open_input_file(filename, args.fandom_2020)
# open directory for output
if args.output_dir: