Remove unused dependencies and fix spacing

The "mw" and "numpy" dependencies were unneeded.

Spaces and tabs were used inconsistently. Whitespace is now consistent; the changes were applied with an auto-formatter.
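
The message does not name the formatter, but the resulting style matches standard PEP 8 spacing. A small standalone illustration (invented names, not taken verbatim from the diff):

    RADIUS = 7                                  # spaces around '=' in plain assignments

    def token_counts(tokens, offset=0):         # but no spaces around '=' in keyword defaults
        counts = {t: len(t) + offset for t in tokens}  # a single space after ':' and ','
        return sum(counts.values()), len(counts)

    print(token_counts(["a", "bb"]))            # -> (3, 2)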

Signed-off-by: Will Beason <willbeason@gmail.com>
Will Beason 2025-05-26 14:15:01 -05:00
parent 4804ecc4b3
commit 9c5bf577e6
2 changed files with 134 additions and 113 deletions


@ -1,4 +1,3 @@
apeek==0.1.1
attrs==25.3.0
certifi==2025.4.26
charset-normalizer==3.4.2
@ -11,14 +10,12 @@ jsonable==0.3.1
jsonschema==4.23.0
jsonschema-specifications==2025.4.1
mediawiki-utilities==0.4.18
mw==0.4.0
mwcli==0.0.3
mwdiffs==0.0.2
mwpersistence==0.2.4
mwreverts==0.1.5
mwtypes==0.4.0
mwxml==0.3.6
numpy==1.26.4
pandas==2.2.3
para==0.0.8
parsimonious==0.10.0

wikiq

@ -6,9 +6,9 @@
import argparse
import sys
import os, os.path
import os.path
import re
from datetime import datetime,timezone
from datetime import datetime, timezone
from subprocess import Popen, PIPE
from collections import deque
@ -20,8 +20,9 @@ from deltas.tokenizers import wikitext_split
import mwpersistence
import mwreverts
from urllib.parse import quote
TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS=7
PERSISTENCE_RADIUS = 7
from deltas import SequenceMatcher
from deltas import SegmentMatcher
@ -30,42 +31,46 @@ from dataclasses import dataclass
import pyarrow as pa
import pyarrow.parquet as pq
class PersistMethod:
none = 0
sequence = 1
segment = 2
legacy = 3
def calculate_persistence(tokens_added):
return(sum([(len(x.revisions)-1) for x in tokens_added]),
len(tokens_added))
return (sum([(len(x.revisions) - 1) for x in tokens_added]),
len(tokens_added))
class WikiqIterator():
def __init__(self, fh, collapse_user=False):
self.fh = fh
self.collapse_user = collapse_user
self.mwiterator = Dump.from_file(self.fh)
self.namespace_map = { ns.id : ns.name for ns in
self.mwiterator.site_info.namespaces }
self.namespace_map = {ns.id: ns.name for ns in
self.mwiterator.site_info.namespaces}
self.__pages = self.load_pages()
def load_pages(self):
for page in self.mwiterator:
yield WikiqPage(page,
namespace_map = self.namespace_map,
namespace_map=self.namespace_map,
collapse_user=self.collapse_user)
def __iter__(self):
return self.__pages
def __next__(self):
return next(self._pages)
return next(self.__pages)
class WikiqPage():
__slots__ = ('id', 'title', 'namespace', 'redirect',
'restrictions', 'mwpage', '__revisions',
'collapse_user')
def __init__(self, page, namespace_map, collapse_user=False):
self.id = page.id
self.namespace = page.namespace
@ -95,7 +100,7 @@ class WikiqPage():
for i, rev in enumerate(self.mwpage):
# never yield the first time
if i == 0:
if self.collapse_user:
if self.collapse_user:
collapsed_revs = 1
rev.collapsed_revs = collapsed_revs
@ -138,6 +143,8 @@ A RegexPair is defined by a regular expression (pattern) and a label.
The pattern can include capture groups. If it does then each capture group will have a resulting column in the output.
If the pattern does not include a capture group, then only one output column will result.
"""
class RegexPair(object):
def __init__(self, pattern, label):
self.pattern = re.compile(pattern)
@ -145,10 +152,10 @@ class RegexPair(object):
self.has_groups = bool(self.pattern.groupindex)
if self.has_groups:
self.capture_groups = list(self.pattern.groupindex.keys())
def get_pyarrow_fields(self):
if self.has_groups:
fields = [pa.field(self._make_key(cap_group),pa.list_(pa.string()))
fields = [pa.field(self._make_key(cap_group), pa.list_(pa.string()))
for cap_group in self.capture_groups]
else:
fields = [pa.field(self.label, pa.list_(pa.string()))]
@ -159,7 +166,7 @@ class RegexPair(object):
return ("{}_{}".format(self.label, cap_group))
def matchmake(self, content, rev_data):
temp_dict = {}
# if there are named capture groups in the regex
if self.has_groups:
@ -178,7 +185,7 @@ class RegexPair(object):
temp_list.append(match.group(cap_group))
# if temp_list of matches is empty just make that column None
if len(temp_list)==0:
if len(temp_list) == 0:
temp_dict[key] = None
# else we put in the list we made in the for-loop above
else:
@ -192,8 +199,8 @@ class RegexPair(object):
# there are no capture groups, we just search for all the matches of the regex
else:
#given that there are matches to be made
if type(content) in(str, bytes):
# given that there are matches to be made
if type(content) in (str, bytes):
if self.pattern.search(content) is not None:
m = self.pattern.findall(content)
temp_dict[self.label] = ', '.join(m)
@ -206,6 +213,7 @@ class RegexPair(object):
return rev_data
"""
We used to use a dictionary to collect fields for the output.
@ -222,9 +230,11 @@ It also needs to have the correct pyarrow schema so we can write parquet files.
The RevDataBase type has all the fields that will be output no matter how wikiq is invoked.
"""
@dataclass()
class RevDataBase():
revid: int
revid: int
date_time: datetime
articleid: int
editorid: int
@ -249,18 +259,18 @@ class RevDataBase():
pa_schema_fields = [
pa.field("revid", pa.int64()),
pa.field("date_time", pa.timestamp('ms')),
pa.field("articleid",pa.int64()),
pa.field("editorid",pa.int64()),
pa.field("title",pa.string()),
pa.field("namespace",pa.int32()),
pa.field("deleted",pa.bool_()),
pa.field("test_chars",pa.int32()),
pa.field("revert",pa.bool_()),
pa.field("reverteds",pa.list_(pa.int64())),
pa.field("sha1",pa.string()),
pa.field("minor",pa.bool_()),
pa.field("editor",pa.string()),
pa.field("anon",pa.bool_())
pa.field("articleid", pa.int64()),
pa.field("editorid", pa.int64()),
pa.field("title", pa.string()),
pa.field("namespace", pa.int32()),
pa.field("deleted", pa.bool_()),
pa.field("test_chars", pa.int32()),
pa.field("revert", pa.bool_()),
pa.field("reverteds", pa.list_(pa.int64())),
pa.field("sha1", pa.string()),
pa.field("minor", pa.bool_()),
pa.field("editor", pa.string()),
pa.field("anon", pa.bool_())
]
# pyarrow is a columnar format, so most of the work happens in the flush_parquet_buffer function
@ -269,7 +279,7 @@ class RevDataBase():
# logic to convert each field into the wikiq tsv format goes here.
def to_tsv_row(self):
row = []
for f in dc.fields(self):
val = getattr(self, f.name)
@ -281,7 +291,7 @@ class RevDataBase():
elif f.type == datetime:
row.append(val.strftime('%Y-%m-%d %H:%M:%S'))
elif f.name in {'editor','title'}:
elif f.name in {'editor', 'title'}:
s = '"' + val + '"'
if self.urlencode and f.name in TO_ENCODE:
row.append(quote(str(s)))
@ -299,11 +309,12 @@ class RevDataBase():
else:
row.append(val)
return '\t'.join(map(str,row))
return '\t'.join(map(str, row))
def header_row(self):
return '\t'.join(map(lambda f: f.name, dc.fields(self)))
"""
If collapse=True we'll use a RevDataCollapse dataclass.
@ -312,43 +323,54 @@ This class inherits from RevDataBase. This means that it has all the same fields
It just adds a new field and updates the pyarrow schema.
"""
@dataclass()
class RevDataCollapse(RevDataBase):
collapsed_revs:int = None
collapsed_revs: int = None
pa_collapsed_revs_schema = pa.field('collapsed_revs',pa.int64())
pa_collapsed_revs_schema = pa.field('collapsed_revs', pa.int64())
pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema]
"""
If persistence data is to be computed we'll need the fields added by RevDataPersistence.
"""
@dataclass()
class RevDataPersistence(RevDataBase):
token_revs:int = None
tokens_added:int = None
tokens_removed:int = None
tokens_window:int = None
token_revs: int = None
tokens_added: int = None
tokens_removed: int = None
tokens_window: int = None
pa_persistence_schema_fields = [
pa.field("token_revs", pa.int64()),
pa.field("tokens_added", pa.int64()),
pa.field("tokens_removed", pa.int64()),
pa.field("tokens_window", pa.int64())]
pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields
pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields
"""
class RevDataCollapsePersistence uses multiple inheritence to make a class that has both persistence and collapse fields.
"""
@dataclass()
class RevDataCollapsePersistence(RevDataCollapse, RevDataPersistence):
pa_schema_fields = RevDataCollapse.pa_schema_fields + RevDataPersistence.pa_persistence_schema_fields
class WikiqParser():
def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15, output_parquet=True, parquet_buffer_size=2000):
class WikiqParser:
def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label,
regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces=None,
revert_radius=15, output_parquet=True, parquet_buffer_size=2000):
"""
Parameters:
persist : what persistence method to use. Takes a PersistMethod value
@ -360,7 +382,7 @@ class WikiqParser():
self.namespaces = []
self.urlencode = urlencode
self.revert_radius = revert_radius
if namespaces is not None:
self.namespace_filter = set(namespaces)
else:
@ -370,9 +392,8 @@ class WikiqParser():
self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
# This is where we set the type for revdata.
if self.collapse_user is True:
if self.persist == PersistMethod.none:
revdata_type = RevDataCollapse
@ -391,10 +412,10 @@ class WikiqParser():
self.revdata_type = dc.make_dataclass('RevData_Parser',
fields=regex_fields,
bases=(revdata_type,))
# we also need to make sure that we have the right pyarrow schema
self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas
self.revdata_type.urlencode = self.urlencode
self.schema = pa.schema(self.revdata_type.pa_schema_fields)
@ -409,15 +430,15 @@ class WikiqParser():
else:
self.print_header = True
if output_file == sys.stdout:
self.output_file = output_file
else:
self.output_file = open(output_file,'w')
self.output_file = open(output_file, 'w')
self.output_parquet = False
def make_matchmake_pairs(self, patterns, labels):
if (patterns is not None and labels is not None) and \
(len(patterns) == len(labels)):
(len(patterns) == len(labels)):
result = []
for pattern, label in zip(patterns, labels):
rp = RegexPair(pattern, label)
@ -435,7 +456,7 @@ class WikiqParser():
return rev_data
def matchmake_text(self, text, rev_data):
return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)
return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)
def matchmake_comment(self, comment, rev_data):
return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)
@ -450,7 +471,7 @@ class WikiqParser():
for ns in self.namespaces:
# skip if the namespace is not defined
if ns == None:
if ns is None:
default_ns = self.namespaces[ns]
continue
@ -460,7 +481,6 @@ class WikiqParser():
# if we've made it this far with no matches, we return the default namespace
return default_ns
def process(self):
# create a regex that creates the output filename
@ -472,12 +492,11 @@ class WikiqParser():
dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
# extract list of namspaces
self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}
self.namespaces = {ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces}
page_count = 0
rev_count = 0
# Iterate through pages
for page in dump:
namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)
@ -487,17 +506,17 @@ class WikiqParser():
if namespace not in self.namespace_filter:
continue
rev_detector = mwreverts.Detector(radius = self.revert_radius)
rev_detector = mwreverts.Detector(radius=self.revert_radius)
if self.persist != PersistMethod.none:
window = deque(maxlen=PERSISTENCE_RADIUS)
if self.persist == PersistMethod.sequence:
state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split),
revert_radius=PERSISTENCE_RADIUS)
elif self.persist == PersistMethod.segment:
state = mwpersistence.DiffState(SegmentMatcher(tokenizer = wikitext_split),
state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
revert_radius=PERSISTENCE_RADIUS)
# self.persist == PersistMethod.legacy
@ -507,15 +526,15 @@ class WikiqParser():
# Iterate through a page's revisions
for rev in page:
# create a new data object instead of a dictionary.
rev_data = self.revdata_type(revid = rev.id,
date_time = datetime.fromtimestamp(rev.timestamp.unix(), tz=timezone.utc),
articleid = page.id,
editorid = "" if rev.deleted.user == True or rev.user.id is None else rev.user.id,
title = page.title,
deleted = rev.deleted.text,
namespace = namespace
rev_data = self.revdata_type(revid=rev.id,
date_time=datetime.fromtimestamp(rev.timestamp.unix(), tz=timezone.utc),
articleid=page.id,
editorid="" if rev.deleted.user == True or rev.user.id is None else rev.user.id,
title=page.title,
deleted=rev.deleted.text,
namespace=namespace
)
rev_data = self.matchmake_revision(rev, rev_data)
@ -530,7 +549,7 @@ class WikiqParser():
text_sha1 = rev.sha1
else:
text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()
rev_data.sha1 = text_sha1
# TODO rev.bytes doesn't work.. looks like a bug
@ -538,7 +557,7 @@ class WikiqParser():
# generate revert data
revert = rev_detector.process(text_sha1, rev.id)
if revert:
rev_data.revert = True
rev_data.reverteds = revert.reverteds
@ -550,16 +569,16 @@ class WikiqParser():
if not rev.deleted.user:
# wrap user-defined editors in quotes for fread
rev_data.editor = rev.user.text
rev_data.editor = rev.user.text
rev_data.anon = rev.user.id is None
#if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
# if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
# redirect = True
#else:
# else:
# redirect = False
#TODO missing: additions_size deletions_size
# TODO missing: additions_size deletions_size
# if collapse user was on, lets run that
if self.collapse_user:
rev_data.collapsed_revs = rev.collapsed_revs
@ -573,18 +592,18 @@ class WikiqParser():
else:
_, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)
window.append((rev.id, rev_data, tokens_added, tokens_removed))
if len(window) == PERSISTENCE_RADIUS:
old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]
num_token_revs, num_tokens = calculate_persistence(old_tokens_added)
old_rev_data.token_revs = num_token_revs
old_rev_data.tokens_added = num_tokens
old_rev_data.tokens_removed = len(old_tokens_removed)
old_rev_data.tokens_window = PERSISTENCE_RADIUS-1
old_rev_data.tokens_window = PERSISTENCE_RADIUS - 1
self.print_rev_data(old_rev_data)
@ -606,7 +625,7 @@ class WikiqParser():
rev_data.token_revs = num_token_revs
rev_data.tokens_added = num_tokens
rev_data.tokens_removed = len(tokens_removed)
rev_data.tokens_window = len(window)-(i+1)
rev_data.tokens_window = len(window) - (i + 1)
self.print_rev_data(rev_data)
page_count += 1
@ -622,11 +641,11 @@ class WikiqParser():
else:
self.output_file.close()
"""
For performance reasons it's better to write parquet in batches instead of one row at a time.
So this function just puts the data on a buffer. If the buffer is full, then it gets flushed (written).
"""
def write_parquet_row(self, rev_data):
padata = rev_data.to_pyarrow()
self.parquet_buffer.append(padata)
@ -634,16 +653,17 @@ class WikiqParser():
if len(self.parquet_buffer) >= self.parquet_buffer_size:
self.flush_parquet_buffer()
"""
Function that actually writes data to the parquet file.
It needs to transpose the data from row-by-row to column-by-column
"""
def flush_parquet_buffer(self):
"""
Returns the pyarrow table that we'll write
"""
def rows_to_table(rg, schema):
cols = []
first = rg[0]
@ -661,18 +681,18 @@ class WikiqParser():
outtable = rows_to_table(self.parquet_buffer, self.schema)
if self.pq_writer is None:
self.pq_writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
self.pq_writer = pq.ParquetWriter(self.output_file, self.schema, flavor='spark')
self.pq_writer.write_table(outtable)
self.parquet_buffer = []
# depending on if we are configured to write tsv or parquet, we'll call a different function.
def print_rev_data(self, rev_data):
if self.output_parquet is False:
printfunc = self.write_tsv_row
else:
printfunc = self.write_parquet_row
printfunc(rev_data)
def write_tsv_row(self, rev_data):
@ -686,11 +706,11 @@ class WikiqParser():
def open_input_file(input_filename):
if re.match(r'.*\.7z$', input_filename):
cmd = ["7za", "x", "-so", input_filename, "*.xml"]
cmd = ["7za", "x", "-so", input_filename, "*.xml"]
elif re.match(r'.*\.gz$', input_filename):
cmd = ["zcat", input_filename]
cmd = ["zcat", input_filename]
elif re.match(r'.*\.bz2$', input_filename):
cmd = ["bzcat", "-dk", input_filename]
cmd = ["bzcat", "-dk", input_filename]
try:
input_file = Popen(cmd, stdout=PIPE).stdout
@ -699,7 +719,8 @@ def open_input_file(input_filename):
return input_file
def get_output_filename(input_filename, parquet = False):
def get_output_filename(input_filename, parquet=False):
output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
output_filename = re.sub(r'\.xml', '', output_filename)
if parquet is False:
@ -708,16 +729,18 @@ def get_output_filename(input_filename, parquet = False):
output_filename = output_filename + ".parquet"
return output_filename
def open_output_file(input_filename):
# create a regex that creates the output filename
output_filename = get_output_filename(input_filename, parquet = False)
output_filename = get_output_filename(input_filename, parquet=False)
output_file = open(output_filename, "w")
return output_file
parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimitted data.')
# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")
parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
@ -729,7 +752,8 @@ parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")
parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?',
parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str,
choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. The defualt is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")
parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
@ -749,19 +773,19 @@ parser.add_argument('-rr',
parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")
parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str,
action='append',
help="The label for the outputted column based on matching the regex in revision text.")
parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
help="The regular expression to search for in comments of revisions.")
parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str,
action='append',
help="The label for the outputted column based on matching the regex in comments.")
args = parser.parse_args()
# set persistence method
if args.persist is None:
@ -798,7 +822,7 @@ if len(args.dumpfiles) > 0:
output_file = sys.stdout
else:
filename = os.path.join(output_dir, os.path.basename(filename))
output_file = get_output_filename(filename, parquet = output_parquet)
output_file = get_output_filename(filename, parquet=output_parquet)
wikiq = WikiqParser(input_file,
output_file,
@ -807,15 +831,15 @@ if len(args.dumpfiles) > 0:
urlencode=args.urlencode,
namespaces=namespaces,
revert_radius=args.revert_radius,
regex_match_revision = args.regex_match_revision,
regex_revision_label = args.regex_revision_label,
regex_match_comment = args.regex_match_comment,
regex_comment_label = args.regex_comment_label,
regex_match_revision=args.regex_match_revision,
regex_revision_label=args.regex_revision_label,
regex_match_comment=args.regex_match_comment,
regex_comment_label=args.regex_comment_label,
output_parquet=output_parquet)
wikiq.process()
# close things
# close things
input_file.close()
else:
@ -823,16 +847,16 @@ else:
sys.stdout,
collapse_user=args.collapse_user,
persist=persist,
#persist_legacy=args.persist_legacy,
# persist_legacy=args.persist_legacy,
urlencode=args.urlencode,
namespaces=namespaces,
revert_radius=args.revert_radius,
regex_match_revision = args.regex_match_revision,
regex_revision_label = args.regex_revision_label,
regex_match_comment = args.regex_match_comment,
regex_comment_label = args.regex_comment_label)
regex_match_revision=args.regex_match_revision,
regex_revision_label=args.regex_revision_label,
regex_match_comment=args.regex_match_comment,
regex_comment_label=args.regex_comment_label)
wikiq.process()
wikiq.process()
# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")