added counting functionality to regex code

The regex code has historically returned the actual matched patterns and the
named capture groups within regexes.  When trying to count common and/or large
patterns, this leads to very large outputs.

I've added two new command-line options, -RPc and -CPc, that cause wikiq to
return a count of the matches for each pattern (0 when there are no matches)
instead of the matched text. -RPc applies to all revision patterns and -CPc
applies to all comment patterns. I considered interfaces that would make it
possible to count some patterns but not others, but concluded that this would
make the interface too complicated.

This code should be checked before it's merged.
This commit is contained in:
Benjamin Mako Hill 2023-04-29 11:40:03 -07:00
parent 4729371d5a
commit 2ff4d60613

44
wikiq
View File

@ -139,8 +139,7 @@ class RegexPair(object):
def _make_key(self, cap_group):
return ("{}_{}".format(self.label, cap_group))
def matchmake(self, content, rev_data):
def matchmake(self, content, rev_data, count_only=False):
temp_dict = {}
# if there are named capture groups in the regex
if self.has_groups:
@ -162,6 +161,9 @@ class RegexPair(object):
if len(temp_list)==0:
temp_dict[key] = None
# else we put in the list we made in the for-loop above
else:
if count_only:
temp_dict[key] = len(temp_list)
else:
temp_dict[key] = ', '.join(temp_list)
@ -169,6 +171,9 @@ class RegexPair(object):
else:
for cap_group in self.capture_groups:
key = self._make_key(cap_group)
if count_only:
temp_dict[key] = 0
else:
temp_dict[key] = None
# there are no capture groups, we just search for all the matches of the regex
@ -176,7 +181,13 @@ class RegexPair(object):
#given that there are matches to be made
if content is not None and self.pattern.search(content) is not None:
m = self.pattern.findall(content)
if count_only:
temp_dict[self.label] = len(m)
else:
temp_dict[self.label] = ', '.join(m)
else:
if count_only:
temp_dict[self.label] = 0
else:
temp_dict[self.label] = None
# update rev_data with our new columns
@ -185,7 +196,7 @@ class RegexPair(object):
class WikiqParser():
def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
def __init__(self, input_file, output_file, regex_match_revision, regex_revision_label, regex_revision_output_count, regex_match_comment, regex_comment_label, regex_comment_output_count, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
"""
Parameters:
persist : what persistence method to use. Takes a PersistMethod value
@ -205,8 +216,10 @@ class WikiqParser():
self.namespace_filter = None
self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
self.regex_revision_output_count = regex_revision_output_count
self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
self.regex_comment_output_count = regex_comment_output_count
def make_matchmake_pairs(self, patterns, labels):
if (patterns is not None and labels is not None) and \
@ -223,14 +236,14 @@ class WikiqParser():
return rev_data
def matchmake_revision(self, text, rev_data):
return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)
return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs, self.regex_revision_output_count)
def matchmake_comment(self, comment, rev_data):
return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)
return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs, self.regex_comment_output_count)
def matchmake_pairs(self, text, rev_data, pairs):
def matchmake_pairs(self, text, rev_data, pairs, count_only):
for pair in pairs:
rev_data = pair.matchmake(text, rev_data)
rev_data = pair.matchmake(text, rev_data, count_only)
return rev_data
def __get_namespace_from_title(self, title):
@ -496,12 +509,18 @@ parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", de
parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
help="The label for the outputted column based on matching the regex in revision text.")
parser.add_argument('-RPc', '--revision-pattern-count', dest="regex_revision_output_count", action='store_true',
help="If present, this will cause the revision patterns to return counts of the number of matches instead of the text of the matches themselves. It will affect all revision patterns.")
parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
help="The regular expression to search for in comments of revisions.")
parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
help="The label for the outputted column based on matching the regex in comments.")
parser.add_argument('-CPc', '--comment-pattern-count', dest="regex_comment_output_count", action='store_true',
help="If present, this will cause the comments patterns to return counts of the number of matches instead of the text of the matches themselves. It will affect all comment patterns.")
args = parser.parse_args()
# set persistence method
@ -547,8 +566,10 @@ if len(args.dumpfiles) > 0:
revert_radius=args.revert_radius,
regex_match_revision = args.regex_match_revision,
regex_revision_label = args.regex_revision_label,
regex_revision_output_count = args.regex_revision_output_count,
regex_match_comment = args.regex_match_comment,
regex_comment_label = args.regex_comment_label)
regex_comment_label = args.regex_comment_label,
regex_comment_output_count = args.regex_comment_output_count)
wikiq.process()
@ -566,8 +587,11 @@ else:
revert_radius=args.revert_radius,
regex_match_revision = args.regex_match_revision,
regex_revision_label = args.regex_revision_label,
regex_revision_output_count = args.regex_revision_output_count,
regex_match_comment = args.regex_match_comment,
regex_comment_label = args.regex_comment_label)
regex_comment_label = args.regex_comment_label,
regex_comment_output_count = args.regex_comment_output_count)
wikiq.process()