Allow specifying output file basename instead of just directory

This is optional and doesn't impact existing users: the preexisting
behavior when users specify an output directory is unchanged.

This means tests no longer need to copy large files as part of their
execution, since they can ask for output files to be written to
explicit locations.

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
Will Beason
2025-06-02 14:13:13 -05:00
parent 9ee5ecfc91
commit f916af9836
3 changed files with 69 additions and 68 deletions

46
wikiq
View File

@@ -201,6 +201,27 @@ class RegexPair(object):
return rev_data
def pa_schema() -> pa.Schema:
    """Return the pyarrow schema made of the revision columns listed below.

    The columns appear in the schema in the order they are declared here.
    ``editorid``, ``revert`` and ``reverteds`` are the columns that are
    explicitly declared with ``nullable=True``.
    """
    return pa.schema([
        # Revision and page identity.
        pa.field("revid", pa.int64()),
        pa.field("date_time", pa.timestamp('s')),
        pa.field("articleid", pa.int64()),
        pa.field("editorid", pa.int64(), nullable=True),
        pa.field("title", pa.string()),
        pa.field("namespace", pa.int32()),
        pa.field("deleted", pa.bool_()),
        # Size measures.
        pa.field("text_chars", pa.int32()),
        pa.field("comment_chars", pa.int32()),
        # Revert information.
        pa.field("revert", pa.bool_(), nullable=True),
        # reverteds is a string which contains a comma-separated list of reverted revision ids.
        pa.field("reverteds", pa.string(), nullable=True),
        pa.field("sha1", pa.string()),
        pa.field("minor", pa.bool_()),
        # Editor information.
        pa.field("editor", pa.string()),
        pa.field("anon", pa.bool_()),
    ])
"""
We used to use a dictionary to collect fields for the output.
@@ -229,6 +250,7 @@ class Revision:
namespace: int
deleted: bool
text_chars: int | None = None
comment_chars: int | None = None
revert: bool | None = None
reverteds: str = None
sha1: str | None = None
@@ -249,6 +271,7 @@ class Revision:
pa.field("namespace", pa.int32()),
pa.field("deleted", pa.bool_()),
pa.field("text_chars", pa.int32()),
# pa.field("comment_chars", pa.int32()),
pa.field("revert", pa.bool_(), nullable=True),
# reverteds is a string which contains a comma-separated list of reverted revision ids.
pa.field("reverteds", pa.string(), nullable=True),
@@ -492,6 +515,7 @@ class WikiqParser:
state = persistence.State()
# Iterate through a page's revisions
prev_text_chars = 0
for revs in page:
revs = list(revs)
rev = revs[-1]
@@ -525,6 +549,7 @@ class WikiqParser:
# TODO rev.bytes doesn't work.. looks like a bug
rev_data.text_chars = len(rev.text)
rev_data.comment_chars = sum(0 if r.comment is None else len(r.comment) for r in revs)
# generate revert data
if rev_detector is not None:
@@ -550,8 +575,7 @@ class WikiqParser:
# TODO missing: additions_size deletions_size
# if collapse user was on, let's run that
if self.collapse_user:
rev_data.collapsed_revs = len(revs)
rev_data.collapsed_revs = len(revs)
# get the
if self.persist != PersistMethod.none:
@@ -652,7 +676,7 @@ def main():
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")
parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
parser.add_argument('-o', '--output', metavar='OUTPUT', dest='output', type=str, nargs=1,
help="Directory for output files. If it ends with .parquet output will be in parquet format.")
parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
@@ -714,27 +738,27 @@ def main():
namespaces = None
if len(args.dumpfiles) > 0:
output_parquet = False
for filename in args.dumpfiles:
input_file = open_input_file(filename, args.fandom_2020)
# open directory for output
if args.output_dir:
output_dir = args.output_dir[0]
if args.output:
output = args.output[0]
else:
output_dir = "."
output = "."
if output_dir.endswith(".parquet"):
output_parquet = True
output_parquet = output.endswith(".parquet")
print("Processing file: %s" % filename, file=sys.stderr)
if args.stdout:
# Parquet libraries need a binary output, so just sys.stdout doesn't work.
output_file = sys.stdout.buffer
else:
filename = os.path.join(output_dir, os.path.basename(filename))
elif os.path.isdir(output) or output_parquet:
filename = os.path.join(output, os.path.basename(filename))
output_file = get_output_filename(filename, parquet=output_parquet)
else:
output_file = output
wikiq = WikiqParser(input_file,
output_file,