Compare commits

...

7 Commits

Author SHA1 Message Date
Benjamin Mako Hill  933ca753ed  code review.  2023-05-03 10:23:30 -07:00
Benjamin Mako Hill  54fa6221a8  fix because pandas testing API has changed  2023-04-29 11:52:13 -07:00
Benjamin Mako Hill  9dcd337315  rename variables to be more consistent  2023-04-29 11:44:48 -07:00

    I changed regex_match_revision to regex_revision_match so that it matches
    the way the other variables are named, so that they are all of the form:
    regex_<thing_being_matched_against>_<variable>

    I made the same change for comments.
Benjamin Mako Hill  2ff4d60613  added counting functionality to regex code  2023-04-29 11:40:03 -07:00

    The regex code has historically returned the actual matched patterns and the
    named capture groups within regexes. When trying to count common and/or large
    patterns, this leads to very large outputs.

    I've added two new options, -RPc and -CPc, that will cause wikiq to return
    counts of each pattern (0 when there are no matches). The options apply to all
    comment or revision patterns. I considered interfaces that would make it
    possible to count some patterns but not others, but concluded this would be
    too complicated an interface.

    This code should be checked before it's merged.
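    For example, a hypothetical invocation using the new option might look like
    this (the dump filename and the pattern are placeholders, and other wikiq
    options are omitted):

        wikiq dump.xml.7z -RP "\bbot\b" -RPl bot -RPc

    With -RPc, the output column labeled bot holds per-revision match counts
    (0 when nothing matches) instead of the comma-separated matched text.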
Benjamin Mako Hill  4729371d5a  updated README file  2023-04-28 14:40:18 -07:00

    - added information on Python dependencies
    - wrapped lines in a previous paragraph (no changes)
Benjamin Mako Hill  7e6cd5b386  make sure that content is defined before testing for search patterns  2023-04-28 14:30:42 -07:00

    This appears to have been causing a bug with comments/text that were deleted.
    Kaylea fixed it and I adapted the code.
Benjamin Mako Hill  556285b198  added a line to fix persistence with deleted revs  2023-04-28 14:21:21 -07:00

    Kaylea realized that we need to initialize the old_rev_data dictionary or it
    fails when the first revision to a page is deleted. This patch is from Kaylea
    and was modified by Mako.
4 changed files with 76 additions and 38 deletions

README

@@ -9,5 +9,11 @@ submodule like::
 
     git submodule update
 
-Wikimedia dumps are usually in a compressed format such as 7z (most common), gz, or bz2. Wikiq uses your computer's compression software to read these files. Therefore wikiq depends on
-`7za`, `gzcat`, and `zcat`.
+Wikimedia dumps are usually in a compressed format such as 7z (most common),
+gz, or bz2. Wikiq uses your computer's compression software to read these
+files. Therefore wikiq depends on `7za`, `gzcat`, and `zcat`.
+
+There are also a series of Python dependencies. You can install these using pip
+with a command like:
+
+    pip3 install mwbase mwreverts mwxml mwtypes mwcli mwdiffs mwpersistence pandas

code_review_notes.txt (new file)

@@ -0,0 +1,9 @@
+Please add unit tests for the new count-only functionality.
+
+line 43 def matchmake:
+
+This was making redundant calls to regex matching functions and so could be slower than necessary. I suggest changes that use the walrus operator to keep the same logical structure without the redundant calls.
+
+line 212 def __init__:
+
+Minor note: This constructor is taking a lot of arguments. This is fine, but from a style + maintainability perspective it might make sense to create a new class for the regex matching configuration and pass a configuration object to this constructor instead.
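A minimal sketch of the walrus-operator suggestion, using a toy pattern rather than wikiq's RegexPair:

    import re

    pattern = re.compile(r"\d+")

    # Redundant version: search() and finditer() each scan the content.
    def find_matches_redundant(content):
        if pattern.search(content) is not None:
            return list(pattern.finditer(content))
        return []

    # Suggested version: a single scan, keeping the same logical structure.
    def find_matches(content):
        if content is not None and len(matches := list(pattern.finditer(content))) > 0:
            return matches
        return []

And a sketch of the configuration-object suggestion (RegexMatchConfig is a hypothetical name, not part of wikiq):

    from dataclasses import dataclass, field

    @dataclass
    class RegexMatchConfig:  # hypothetical: groups the regex arguments into one object
        patterns: list = field(default_factory=list)
        labels: list = field(default_factory=list)
        output_count: bool = False

    # The constructor could then take one config per matched field, e.g.
    # WikiqParser(..., revision_regex=RegexMatchConfig(...), comment_regex=RegexMatchConfig(...))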

(test suite file)

@@ -3,7 +3,7 @@ import os
 import subprocess
 from shutil import copyfile
 import pandas as pd
-from pandas.util.testing import assert_frame_equal
+from pandas.testing import assert_frame_equal
 from io import StringIO
 
 # with / without pwr DONE
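The old import path was deprecated in pandas 1.0 and removed in pandas 2.0; pandas.testing is the public location. A small self-contained check (toy frames, not wikiq's tests):

    import pandas as pd
    from pandas.testing import assert_frame_equal

    expected = pd.DataFrame({"token_revs": [1, 2]})
    actual = pd.DataFrame({"token_revs": [1, 2]})
    assert_frame_equal(actual, expected)  # raises AssertionError on any mismatch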

wikiq

@@ -139,53 +139,61 @@ class RegexPair(object):
     def _make_key(self, cap_group):
         return ("{}_{}".format(self.label, cap_group))
 
-    def matchmake(self, content, rev_data):
+    def matchmake(self, content, rev_data, count_only=False):
         temp_dict = {}
         # if there are named capture groups in the regex
         if self.has_groups:
 
             # if there are matches of some sort in this revision content, fill the lists for each cap_group
-            if self.pattern.search(content) is not None:
-                m = self.pattern.finditer(content)
-                matchobjects = list(m)
+            if content is not None and len(matchobjects := list(self.pattern.finditer(content))) > 0:
                 for cap_group in self.capture_groups:
                     key = self._make_key(cap_group)
                     temp_list = []
                     for match in matchobjects:
                         # we only want to add the match for the capture group if the match is not None
-                        if match.group(cap_group) != None:
-                            temp_list.append(match.group(cap_group))
+                        if (group := match.group(cap_group)) is not None:
+                            temp_list.append(group)
 
-                    # if temp_list of matches is empty just make that column None
-                    if len(temp_list)==0:
-                        temp_dict[key] = None
-                    # else we put in the list we made in the for-loop above
-                    else:
-                        temp_dict[key] = ', '.join(temp_list)
+                    # if temp_list of matches is empty just make that column None
+                    if len(temp_list)==0:
+                        temp_dict[key] = None
+                    # else we put in the list we made in the for-loop above
+                    else:
+                        if count_only:
+                            temp_dict[key] = len(temp_list)
+                        else:
+                            temp_dict[key] = ', '.join(temp_list)
 
-            # there are no matches at all in this revision content, we default values to None
+            # there are no matches at all in this revision content, we default values to None
             else:
                 for cap_group in self.capture_groups:
                     key = self._make_key(cap_group)
-                    temp_dict[key] = None
+                    if count_only:
+                        temp_dict[key] = 0
+                    else:
+                        temp_dict[key] = None
 
         # there are no capture groups, we just search for all the matches of the regex
         else:
             #given that there are matches to be made
-            if self.pattern.search(content) is not None:
+            if content is not None and self.pattern.search(content) is not None:
                 m = self.pattern.findall(content)
-                temp_dict[self.label] = ', '.join(m)
+                if count_only:
+                    temp_dict[self.label] = len(m)
+                else:
+                    temp_dict[self.label] = ', '.join(m)
             else:
-                temp_dict[self.label] = None
+                if count_only:
+                    temp_dict[self.label] = 0
+                else:
+                    temp_dict[self.label] = None
 
         # update rev_data with our new columns
         rev_data.update(temp_dict)
         return rev_data
 
 class WikiqParser():
-    def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
+    def __init__(self, input_file, output_file, regex_revision_match, regex_revision_label, regex_revision_output_count, regex_comment_match, regex_comment_label, regex_comment_output_count, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
         """
         Parameters:
            persist : what persistence method to use. Takes a PersistMethod value
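To make the two output modes concrete, here is a standalone sketch of the no-capture-group branch above (a toy function, not wikiq's RegexPair class):

    import re

    def matchmake_sketch(pattern, label, content, count_only=False):
        temp_dict = {}
        if content is not None and pattern.search(content) is not None:
            m = pattern.findall(content)
            temp_dict[label] = len(m) if count_only else ', '.join(m)
        else:
            temp_dict[label] = 0 if count_only else None
        return temp_dict

    p = re.compile(r"[Bb]ot")
    print(matchmake_sketch(p, "bot", "Bot reverted; bot flagged."))                   # {'bot': 'Bot, bot'}
    print(matchmake_sketch(p, "bot", "Bot reverted; bot flagged.", count_only=True))  # {'bot': 2}
    print(matchmake_sketch(p, "bot", None, count_only=True))                          # {'bot': 0}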
@@ -204,9 +212,11 @@ class WikiqParser():
         else:
             self.namespace_filter = None
 
-        self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
-        self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
+        self.regex_revision_pairs = self.make_matchmake_pairs(regex_revision_match, regex_revision_label)
+        self.regex_revision_output_count = regex_revision_output_count
+        self.regex_comment_pairs = self.make_matchmake_pairs(regex_comment_match, regex_comment_label)
+        self.regex_comment_output_count = regex_comment_output_count
 
     def make_matchmake_pairs(self, patterns, labels):
         if (patterns is not None and labels is not None) and \
@@ -223,14 +233,14 @@
             return rev_data
 
     def matchmake_revision(self, text, rev_data):
-        return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)
+        return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs, self.regex_revision_output_count)
 
     def matchmake_comment(self, comment, rev_data):
-        return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)
+        return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs, self.regex_comment_output_count)
 
-    def matchmake_pairs(self, text, rev_data, pairs):
+    def matchmake_pairs(self, text, rev_data, pairs, count_only):
         for pair in pairs:
-            rev_data = pair.matchmake(text, rev_data)
+            rev_data = pair.matchmake(text, rev_data, count_only)
         return rev_data
 
     def __get_namespace_from_title(self, title):
@@ -367,6 +377,8 @@
                     rev_data['collapsed_revs'] = rev.collapsed_revs
 
                 if self.persist != PersistMethod.none:
+                    # initialize an empty dictionary before assigning things into it. this catches bugs if the first revision is deleted
+                    old_rev_data = {}
                     if rev.deleted.text:
                         for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                             old_rev_data[k] = None
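A minimal reproduction of the failure mode this hunk guards against (a toy function, not wikiq's actual control flow):

    def buggy_persistence_step(first_rev_deleted):
        # old_rev_data is only ever assigned in the non-deleted branch...
        if not first_rev_deleted:
            old_rev_data = {"token_revs": 5}
        else:
            old_rev_data["token_revs"] = None  # fails if the first revision is deleted
        return old_rev_data

    try:
        buggy_persistence_step(first_rev_deleted=True)
    except UnboundLocalError as e:
        print(e)  # old_rev_data referenced before assignment (message varies by Python version)

Initializing old_rev_data = {} before the branch, as the added lines do, makes both paths safe.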
@@ -488,18 +500,24 @@
                     default=15,
                     help="Number of edits to check when looking for reverts (default: 15)")
 
-parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
+parser.add_argument('-RP', '--revision-pattern', dest="regex_revision_match", default=None, type=str, action='append',
                     help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")
 
 parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
                     help="The label for the outputted column based on matching the regex in revision text.")
 
+parser.add_argument('-RPc', '--revision-pattern-count', dest="regex_revision_output_count", action='store_true',
+                    help="If present, this will cause the revision patterns to return counts of the number of matches instead of the text of the matches themselves. It will affect all revision patterns.")
+
-parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
+parser.add_argument('-CP', '--comment-pattern', dest="regex_comment_match", default=None, type=str, action='append',
                     help="The regular expression to search for in comments of revisions.")
 
 parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
                     help="The label for the outputted column based on matching the regex in comments.")
 
+parser.add_argument('-CPc', '--comment-pattern-count', dest="regex_comment_output_count", action='store_true',
+                    help="If present, this will cause the comment patterns to return counts of the number of matches instead of the text of the matches themselves. It will affect all comment patterns.")
+
 args = parser.parse_args()
 
 # set persistence method
@@ -543,10 +561,12 @@
                            urlencode=args.urlencode,
                            namespaces=namespaces,
                            revert_radius=args.revert_radius,
-                           regex_match_revision = args.regex_match_revision,
+                           regex_revision_match = args.regex_revision_match,
                            regex_revision_label = args.regex_revision_label,
-                           regex_match_comment = args.regex_match_comment,
-                           regex_comment_label = args.regex_comment_label)
+                           regex_revision_output_count = args.regex_revision_output_count,
+                           regex_comment_match = args.regex_comment_match,
+                           regex_comment_label = args.regex_comment_label,
+                           regex_comment_output_count = args.regex_comment_output_count)
 
         wikiq.process()
@@ -562,10 +582,13 @@
                            urlencode=args.urlencode,
                            namespaces=namespaces,
                            revert_radius=args.revert_radius,
-                           regex_match_revision = args.regex_match_revision,
+                           regex_revision_match = args.regex_revision_match,
                            regex_revision_label = args.regex_revision_label,
-                           regex_match_comment = args.regex_match_comment,
-                           regex_comment_label = args.regex_comment_label)
+                           regex_revision_output_count = args.regex_revision_output_count,
+                           regex_comment_match = args.regex_comment_match,
+                           regex_comment_label = args.regex_comment_label,
+                           regex_comment_output_count = args.regex_comment_output_count)
 
     wikiq.process()