Allow specifying output file basename instead of just directory
This is optional and does not affect existing users: the preexisting behavior when an output directory is specified is unchanged. With this change, tests no longer need to copy large files as part of their execution, because they can ask for output files to be written to explicit locations. Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
46
wikiq
46
wikiq
@@ -201,6 +201,27 @@ class RegexPair(object):
|
||||
return rev_data
|
||||
|
||||
|
||||
def pa_schema() -> pa.Schema:
    """Return the pyarrow schema built from the fixed field list below.

    Returns:
        A ``pa.Schema`` containing the fields in the order they are listed.
    """
    return pa.schema([
        pa.field("revid", pa.int64()),
        pa.field("date_time", pa.timestamp('s')),
        pa.field("articleid", pa.int64()),
        pa.field("editorid", pa.int64(), nullable=True),
        pa.field("title", pa.string()),
        pa.field("namespace", pa.int32()),
        pa.field("deleted", pa.bool_()),
        pa.field("text_chars", pa.int32()),
        pa.field("comment_chars", pa.int32()),
        pa.field("revert", pa.bool_(), nullable=True),
        # reverteds is a string which contains a comma-separated list of reverted revision ids.
        pa.field("reverteds", pa.string(), nullable=True),
        pa.field("sha1", pa.string()),
        pa.field("minor", pa.bool_()),
        pa.field("editor", pa.string()),
        pa.field("anon", pa.bool_()),
    ])
|
||||
|
||||
"""
|
||||
|
||||
We used to use a dictionary to collect fields for the output.
|
||||
@@ -229,6 +250,7 @@ class Revision:
|
||||
namespace: int
|
||||
deleted: bool
|
||||
text_chars: int | None = None
|
||||
comment_chars: int | None = None
|
||||
revert: bool | None = None
|
||||
reverteds: str = None
|
||||
sha1: str | None = None
|
||||
@@ -249,6 +271,7 @@ class Revision:
|
||||
pa.field("namespace", pa.int32()),
|
||||
pa.field("deleted", pa.bool_()),
|
||||
pa.field("text_chars", pa.int32()),
|
||||
# pa.field("comment_chars", pa.int32()),
|
||||
pa.field("revert", pa.bool_(), nullable=True),
|
||||
# reverteds is a string which contains a comma-separated list of reverted revision ids.
|
||||
pa.field("reverteds", pa.string(), nullable=True),
|
||||
@@ -492,6 +515,7 @@ class WikiqParser:
|
||||
state = persistence.State()
|
||||
|
||||
# Iterate through a page's revisions
|
||||
prev_text_chars = 0
|
||||
for revs in page:
|
||||
revs = list(revs)
|
||||
rev = revs[-1]
|
||||
@@ -525,6 +549,7 @@ class WikiqParser:
|
||||
|
||||
# TODO rev.bytes doesn't work.. looks like a bug
|
||||
rev_data.text_chars = len(rev.text)
|
||||
rev_data.comment_chars = sum(0 if r.comment is None else len(r.comment) for r in revs)
|
||||
|
||||
# generate revert data
|
||||
if rev_detector is not None:
|
||||
@@ -550,8 +575,7 @@ class WikiqParser:
|
||||
# TODO missing: additions_size deletions_size
|
||||
|
||||
# if collapse user was on, let's run that
|
||||
if self.collapse_user:
|
||||
rev_data.collapsed_revs = len(revs)
|
||||
rev_data.collapsed_revs = len(revs)
|
||||
|
||||
# get the
|
||||
if self.persist != PersistMethod.none:
|
||||
@@ -652,7 +676,7 @@ def main():
|
||||
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
|
||||
help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")
|
||||
|
||||
parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
|
||||
parser.add_argument('-o', '--output', metavar='OUTPUT', dest='output', type=str, nargs=1,
|
||||
help="Directory for output files. If it ends with .parquet output will be in parquet format.")
|
||||
|
||||
parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
|
||||
@@ -714,27 +738,27 @@ def main():
|
||||
namespaces = None
|
||||
|
||||
if len(args.dumpfiles) > 0:
|
||||
output_parquet = False
|
||||
for filename in args.dumpfiles:
|
||||
input_file = open_input_file(filename, args.fandom_2020)
|
||||
|
||||
# open directory for output
|
||||
if args.output_dir:
|
||||
output_dir = args.output_dir[0]
|
||||
if args.output:
|
||||
output = args.output[0]
|
||||
else:
|
||||
output_dir = "."
|
||||
output = "."
|
||||
|
||||
if output_dir.endswith(".parquet"):
|
||||
output_parquet = True
|
||||
output_parquet = output.endswith(".parquet")
|
||||
|
||||
print("Processing file: %s" % filename, file=sys.stderr)
|
||||
|
||||
if args.stdout:
|
||||
# Parquet libraries need a binary output, so just sys.stdout doesn't work.
|
||||
output_file = sys.stdout.buffer
|
||||
else:
|
||||
filename = os.path.join(output_dir, os.path.basename(filename))
|
||||
elif os.path.isdir(output) or output_parquet:
|
||||
filename = os.path.join(output, os.path.basename(filename))
|
||||
output_file = get_output_filename(filename, parquet=output_parquet)
|
||||
else:
|
||||
output_file = output
|
||||
|
||||
wikiq = WikiqParser(input_file,
|
||||
output_file,
|
||||
|
||||
Reference in New Issue
Block a user