Compare commits

..

No commits in common. "mako_changes-20230429" and "master" have entirely different histories.

4 changed files with 38 additions and 76 deletions

View File

@ -9,11 +9,5 @@ submodule like::
git submodule update git submodule update
Wikimedia dumps are usually in a compressed format such as 7z (most common), Wikimedia dumps are usually in a compressed format such as 7z (most common), gz, or bz2. Wikiq uses your computer's compression software to read these files. Therefore wikiq depends on
gz, or bz2. Wikiq uses your computer's compression software to read these `7za`, `gzcat`, and `zcat`.
files. Therefore wikiq depends on `7za`, `gzcat`, and `zcat`.
There are also a series of Python dependencies. You can install these using pip
with a command like:
pip3 install mwbase mwreverts mwxml mwtypes mwcli mwdiffs mwpersistence pandas

View File

@ -1,9 +0,0 @@
Please add unit tests for the new count-only functionality.
line 43 def matchmake:
This was making redundant calls to regex matching functions and so could be slower than necessary. I suggest changes that use the walrus operator to keep the same logical structure without the redundant calls.
line 212 def __init__:
Minor note: This constructor is taking a lot of arguments. This is fine, but from a style + maintainability perspective it might make sense to create a new class for the regex matching configuration and pass a configuration object to this constructor instead.

View File

@ -3,7 +3,7 @@ import os
import subprocess import subprocess
from shutil import copyfile from shutil import copyfile
import pandas as pd import pandas as pd
from pandas.testing import assert_frame_equal from pandas.util.testing import assert_frame_equal
from io import StringIO from io import StringIO
# with / without pwr DONE # with / without pwr DONE

93
wikiq
View File

@ -139,61 +139,53 @@ class RegexPair(object):
def _make_key(self, cap_group): def _make_key(self, cap_group):
return ("{}_{}".format(self.label, cap_group)) return ("{}_{}".format(self.label, cap_group))
def matchmake(self, content, rev_data, count_only=False): def matchmake(self, content, rev_data):
temp_dict = {} temp_dict = {}
# if there are named capture groups in the regex # if there are named capture groups in the regex
if self.has_groups: if self.has_groups:
# if there are matches of some sort in this revision content, fill the lists for each cap_group # if there are matches of some sort in this revision content, fill the lists for each cap_group
if content is not None and len(matchobjects := list(self.pattern.finditer(content))) > 0: if self.pattern.search(content) is not None:
m = self.pattern.finditer(content)
matchobjects = list(m)
for cap_group in self.capture_groups: for cap_group in self.capture_groups:
key = self._make_key(cap_group) key = self._make_key(cap_group)
temp_list = [] temp_list = []
for match in matchobjects: for match in matchobjects:
# we only want to add the match for the capture group if the match is not None # we only want to add the match for the capture group if the match is not None
if (group := match.group(cap_group)) is not None: if match.group(cap_group) != None:
temp_list.append(group) temp_list.append(match.group(cap_group))
# if temp_list of matches is empty just make that column None # if temp_list of matches is empty just make that column None
if len(temp_list)==0: if len(temp_list)==0:
temp_dict[key] = None temp_dict[key] = None
# else we put in the list we made in the for-loop above # else we put in the list we made in the for-loop above
else: else:
if count_only: temp_dict[key] = ', '.join(temp_list)
temp_dict[key] = len(temp_list)
else:
temp_dict[key] = ', '.join(temp_list)
# there are no matches at all in this revision content, we default values to None # there are no matches at all in this revision content, we default values to None
else: else:
for cap_group in self.capture_groups: for cap_group in self.capture_groups:
key = self._make_key(cap_group) key = self._make_key(cap_group)
if count_only: temp_dict[key] = None
temp_dict[key] = 0
else:
temp_dict[key] = None
# there are no capture groups, we just search for all the matches of the regex # there are no capture groups, we just search for all the matches of the regex
else: else:
#given that there are matches to be made #given that there are matches to be made
if content is not None and self.pattern.search(content) is not None: if self.pattern.search(content) is not None:
m = self.pattern.findall(content) m = self.pattern.findall(content)
if count_only: temp_dict[self.label] = ', '.join(m)
temp_dict[self.label] = len(m)
else:
temp_dict[self.label] = ', '.join(m)
else: else:
if count_only: temp_dict[self.label] = None
temp_dict[self.label] = 0
else:
temp_dict[self.label] = None
# update rev_data with our new columns # update rev_data with our new columns
rev_data.update(temp_dict) rev_data.update(temp_dict)
return rev_data return rev_data
class WikiqParser(): class WikiqParser():
def __init__(self, input_file, output_file, regex_revision_match, regex_revision_label, regex_revision_output_count, regex_comment_match, regex_comment_label, regex_comment_output_count, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15): def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
""" """
Parameters: Parameters:
persist : what persistence method to use. Takes a PersistMethod value persist : what persistence method to use. Takes a PersistMethod value
@ -212,11 +204,9 @@ class WikiqParser():
else: else:
self.namespace_filter = None self.namespace_filter = None
self.regex_revision_pairs = self.make_matchmake_pairs(regex_revision_match, regex_revision_label) self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
self.regex_revision_output_count = regex_revision_output_count self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
self.regex_comment_pairs = self.make_matchmake_pairs(regex_comment_match, regex_comment_label)
self.regex_comment_output_count = regex_comment_output_count
def make_matchmake_pairs(self, patterns, labels): def make_matchmake_pairs(self, patterns, labels):
if (patterns is not None and labels is not None) and \ if (patterns is not None and labels is not None) and \
@ -233,14 +223,14 @@ class WikiqParser():
return rev_data return rev_data
def matchmake_revision(self, text, rev_data): def matchmake_revision(self, text, rev_data):
return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs, self.regex_revision_output_count) return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)
def matchmake_comment(self, comment, rev_data): def matchmake_comment(self, comment, rev_data):
return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs, self.regex_comment_output_count) return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)
def matchmake_pairs(self, text, rev_data, pairs, count_only): def matchmake_pairs(self, text, rev_data, pairs):
for pair in pairs: for pair in pairs:
rev_data = pair.matchmake(text, rev_data, count_only) rev_data = pair.matchmake(text, rev_data)
return rev_data return rev_data
def __get_namespace_from_title(self, title): def __get_namespace_from_title(self, title):
@ -377,8 +367,6 @@ class WikiqParser():
rev_data['collapsed_revs'] = rev.collapsed_revs rev_data['collapsed_revs'] = rev.collapsed_revs
if self.persist != PersistMethod.none: if self.persist != PersistMethod.none:
# initialize an empty dictionary before assigning things into it. this catches bugs if the first revision is deleted
old_rev_data = {}
if rev.deleted.text: if rev.deleted.text:
for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]: for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
old_rev_data[k] = None old_rev_data[k] = None
@ -500,24 +488,18 @@ parser.add_argument('-rr',
default=15, default=15,
help="Number of edits to check when looking for reverts (default: 15)") help="Number of edits to check when looking for reverts (default: 15)")
parser.add_argument('-RP', '--revision-pattern', dest="regex_revision_match", default=None, type=str, action='append', parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
help="The regular expression to search for in revision text. The regex must be surrounded by quotes.") help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")
parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append', parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
help="The label for the outputted column based on matching the regex in revision text.") help="The label for the outputted column based on matching the regex in revision text.")
parser.add_argument('-RPc', '--revision-pattern-count', dest="regex_revision_output_count", action='store_true', parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
help="If present, this will cause the revision patterns to return counts of the number of matches instead of the text of the matches themselves. It will affect all revision patterns.")
parser.add_argument('-CP', '--comment-pattern', dest="regex_comment_match", default=None, type=str, action='append',
help="The regular expression to search for in comments of revisions.") help="The regular expression to search for in comments of revisions.")
parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append', parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
help="The label for the outputted column based on matching the regex in comments.") help="The label for the outputted column based on matching the regex in comments.")
parser.add_argument('-CPc', '--comment-pattern-count', dest="regex_comment_output_count", action='store_true',
help="If present, this will cause the comments patterns to return counts of the number of matches instead of the text of the matches themselves. It will affect all comment patterns.")
args = parser.parse_args() args = parser.parse_args()
# set persistence method # set persistence method
@ -561,12 +543,10 @@ if len(args.dumpfiles) > 0:
urlencode=args.urlencode, urlencode=args.urlencode,
namespaces=namespaces, namespaces=namespaces,
revert_radius=args.revert_radius, revert_radius=args.revert_radius,
regex_revision_match = args.regex_revision_match, regex_match_revision = args.regex_match_revision,
regex_revision_label = args.regex_revision_label, regex_revision_label = args.regex_revision_label,
regex_revision_output_count = args.regex_revision_output_count, regex_match_comment = args.regex_match_comment,
regex_comment_match = args.regex_comment_match, regex_comment_label = args.regex_comment_label)
regex_comment_label = args.regex_comment_label,
regex_comment_output_count = args.regex_comment_output_count)
wikiq.process() wikiq.process()
@ -582,13 +562,10 @@ else:
urlencode=args.urlencode, urlencode=args.urlencode,
namespaces=namespaces, namespaces=namespaces,
revert_radius=args.revert_radius, revert_radius=args.revert_radius,
regex_revision_match = args.regex_revision_match, regex_match_revision = args.regex_match_revision,
regex_revision_label = args.regex_revision_label, regex_revision_label = args.regex_revision_label,
regex_revision_output_count = args.regex_revision_output_count, regex_match_comment = args.regex_match_comment,
regex_comment_match = args.regex_comment_match, regex_comment_label = args.regex_comment_label)
regex_comment_label = args.regex_comment_label,
regex_comment_output_count = args.regex_comment_output_count)
wikiq.process() wikiq.process()