Compare commits
7 Commits
master
...
mako_chang
Author | SHA1 | Date | |
---|---|---|---|
933ca753ed | |||
|
54fa6221a8 | ||
|
9dcd337315 | ||
|
2ff4d60613 | ||
|
4729371d5a | ||
|
7e6cd5b386 | ||
|
556285b198 |
10
README.rst
10
README.rst
@ -9,5 +9,11 @@ submodule like::
|
||||
git submodule update
|
||||
|
||||
|
||||
Wikimedia dumps are usually in a compressed format such as 7z (most common), gz, or bz2. Wikiq uses your computer's compression software to read these files. Therefore wikiq depends on
|
||||
`7za`, `gzcat`, and `zcat`.
|
||||
Wikimedia dumps are usually in a compressed format such as 7z (most common),
|
||||
gz, or bz2. Wikiq uses your computer's compression software to read these
|
||||
files. Therefore wikiq depends on `7za`, `gzcat`, and `zcat`.
|
||||
|
||||
There are also a series of Python dependencies. You can install these using pip
|
||||
with a command like:
|
||||
|
||||
pip3 install mwbase mwreverts mwxml mwtypes mwcli mwdiffs mwpersistence pandas
|
||||
|
9
code_review_notes.txt
Normal file
9
code_review_notes.txt
Normal file
@ -0,0 +1,9 @@
|
||||
Please add unit tests for the new count-only functionality.
|
||||
|
||||
line 43 def matchmake:
|
||||
This was making redundant calls to regex matching functions and so could be slower than necessary. I suggest changes that use the walrus operator to keep the same logical structure without the redundant calls.
|
||||
|
||||
|
||||
line 212 def __init__:
|
||||
|
||||
Minor note: This constructor is taking a lot of arguments. This is fine, but from a style + maintainability perspective it might make sense to create a new class for the regex matching configuration and pass a configuration object to this constructor instead.
|
@ -3,7 +3,7 @@ import os
|
||||
import subprocess
|
||||
from shutil import copyfile
|
||||
import pandas as pd
|
||||
from pandas.util.testing import assert_frame_equal
|
||||
from pandas.testing import assert_frame_equal
|
||||
from io import StringIO
|
||||
|
||||
# with / without pwr DONE
|
||||
|
93
wikiq
93
wikiq
@ -139,53 +139,61 @@ class RegexPair(object):
|
||||
def _make_key(self, cap_group):
|
||||
return ("{}_{}".format(self.label, cap_group))
|
||||
|
||||
def matchmake(self, content, rev_data):
|
||||
|
||||
def matchmake(self, content, rev_data, count_only=False):
|
||||
temp_dict = {}
|
||||
# if there are named capture groups in the regex
|
||||
if self.has_groups:
|
||||
|
||||
# if there are matches of some sort in this revision content, fill the lists for each cap_group
|
||||
if self.pattern.search(content) is not None:
|
||||
m = self.pattern.finditer(content)
|
||||
matchobjects = list(m)
|
||||
|
||||
if content is not None and len(matchobjects := list(self.pattern.finditer(content))) > 0:
|
||||
for cap_group in self.capture_groups:
|
||||
key = self._make_key(cap_group)
|
||||
temp_list = []
|
||||
for match in matchobjects:
|
||||
# we only want to add the match for the capture group if the match is not None
|
||||
if match.group(cap_group) != None:
|
||||
temp_list.append(match.group(cap_group))
|
||||
if (group := match.group(cap_group)) is not None:
|
||||
temp_list.append(group)
|
||||
|
||||
# if temp_list of matches is empty just make that column None
|
||||
if len(temp_list)==0:
|
||||
temp_dict[key] = None
|
||||
# else we put in the list we made in the for-loop above
|
||||
else:
|
||||
temp_dict[key] = ', '.join(temp_list)
|
||||
# if temp_list of matches is empty just make that column None
|
||||
if len(temp_list)==0:
|
||||
temp_dict[key] = None
|
||||
# else we put in the list we made in the for-loop above
|
||||
else:
|
||||
if count_only:
|
||||
temp_dict[key] = len(temp_list)
|
||||
else:
|
||||
temp_dict[key] = ', '.join(temp_list)
|
||||
|
||||
# there are no matches at all in this revision content, we default values to None
|
||||
# there are no matches at all in this revision content, we default values to None
|
||||
else:
|
||||
for cap_group in self.capture_groups:
|
||||
key = self._make_key(cap_group)
|
||||
temp_dict[key] = None
|
||||
if count_only:
|
||||
temp_dict[key] = 0
|
||||
else:
|
||||
temp_dict[key] = None
|
||||
|
||||
# there are no capture groups, we just search for all the matches of the regex
|
||||
else:
|
||||
#given that there are matches to be made
|
||||
if self.pattern.search(content) is not None:
|
||||
if content is not None and self.pattern.search(content) is not None:
|
||||
m = self.pattern.findall(content)
|
||||
temp_dict[self.label] = ', '.join(m)
|
||||
if count_only:
|
||||
temp_dict[self.label] = len(m)
|
||||
else:
|
||||
temp_dict[self.label] = ', '.join(m)
|
||||
else:
|
||||
temp_dict[self.label] = None
|
||||
if count_only:
|
||||
temp_dict[self.label] = 0
|
||||
else:
|
||||
temp_dict[self.label] = None
|
||||
# update rev_data with our new columns
|
||||
rev_data.update(temp_dict)
|
||||
return rev_data
|
||||
|
||||
|
||||
class WikiqParser():
|
||||
def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
|
||||
def __init__(self, input_file, output_file, regex_revision_match, regex_revision_label, regex_revision_output_count, regex_comment_match, regex_comment_label, regex_comment_output_count, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
|
||||
"""
|
||||
Parameters:
|
||||
persist : what persistence method to use. Takes a PersistMethod value
|
||||
@ -204,9 +212,11 @@ class WikiqParser():
|
||||
else:
|
||||
self.namespace_filter = None
|
||||
|
||||
self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
|
||||
self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
|
||||
|
||||
self.regex_revision_pairs = self.make_matchmake_pairs(regex_revision_match, regex_revision_label)
|
||||
self.regex_revision_output_count = regex_revision_output_count
|
||||
|
||||
self.regex_comment_pairs = self.make_matchmake_pairs(regex_comment_match, regex_comment_label)
|
||||
self.regex_comment_output_count = regex_comment_output_count
|
||||
|
||||
def make_matchmake_pairs(self, patterns, labels):
|
||||
if (patterns is not None and labels is not None) and \
|
||||
@ -223,14 +233,14 @@ class WikiqParser():
|
||||
return rev_data
|
||||
|
||||
def matchmake_revision(self, text, rev_data):
|
||||
return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)
|
||||
return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs, self.regex_revision_output_count)
|
||||
|
||||
def matchmake_comment(self, comment, rev_data):
|
||||
return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)
|
||||
return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs, self.regex_comment_output_count)
|
||||
|
||||
def matchmake_pairs(self, text, rev_data, pairs):
|
||||
def matchmake_pairs(self, text, rev_data, pairs, count_only):
|
||||
for pair in pairs:
|
||||
rev_data = pair.matchmake(text, rev_data)
|
||||
rev_data = pair.matchmake(text, rev_data, count_only)
|
||||
return rev_data
|
||||
|
||||
def __get_namespace_from_title(self, title):
|
||||
@ -367,6 +377,8 @@ class WikiqParser():
|
||||
rev_data['collapsed_revs'] = rev.collapsed_revs
|
||||
|
||||
if self.persist != PersistMethod.none:
|
||||
# initialize an empty dictionary before assigning things into it. this catches bugs if the first revision is deleted
|
||||
old_rev_data = {}
|
||||
if rev.deleted.text:
|
||||
for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
|
||||
old_rev_data[k] = None
|
||||
@ -488,18 +500,24 @@ parser.add_argument('-rr',
|
||||
default=15,
|
||||
help="Number of edits to check when looking for reverts (default: 15)")
|
||||
|
||||
parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
|
||||
parser.add_argument('-RP', '--revision-pattern', dest="regex_revision_match", default=None, type=str, action='append',
|
||||
help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")
|
||||
|
||||
parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
|
||||
help="The label for the outputted column based on matching the regex in revision text.")
|
||||
|
||||
parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
|
||||
parser.add_argument('-RPc', '--revision-pattern-count', dest="regex_revision_output_count", action='store_true',
|
||||
help="If present, this will cause the revision patterns to return counts of the number of matches instead of the text of the matches themselves. It will affect all revision patterns.")
|
||||
|
||||
parser.add_argument('-CP', '--comment-pattern', dest="regex_comment_match", default=None, type=str, action='append',
|
||||
help="The regular expression to search for in comments of revisions.")
|
||||
|
||||
parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
|
||||
help="The label for the outputted column based on matching the regex in comments.")
|
||||
|
||||
parser.add_argument('-CPc', '--comment-pattern-count', dest="regex_comment_output_count", action='store_true',
|
||||
help="If present, this will cause the comments patterns to return counts of the number of matches instead of the text of the matches themselves. It will affect all comment patterns.")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# set persistence method
|
||||
@ -543,10 +561,12 @@ if len(args.dumpfiles) > 0:
|
||||
urlencode=args.urlencode,
|
||||
namespaces=namespaces,
|
||||
revert_radius=args.revert_radius,
|
||||
regex_match_revision = args.regex_match_revision,
|
||||
regex_revision_match = args.regex_revision_match,
|
||||
regex_revision_label = args.regex_revision_label,
|
||||
regex_match_comment = args.regex_match_comment,
|
||||
regex_comment_label = args.regex_comment_label)
|
||||
regex_revision_output_count = args.regex_revision_output_count,
|
||||
regex_comment_match = args.regex_comment_match,
|
||||
regex_comment_label = args.regex_comment_label,
|
||||
regex_comment_output_count = args.regex_comment_output_count)
|
||||
|
||||
wikiq.process()
|
||||
|
||||
@ -562,10 +582,13 @@ else:
|
||||
urlencode=args.urlencode,
|
||||
namespaces=namespaces,
|
||||
revert_radius=args.revert_radius,
|
||||
regex_match_revision = args.regex_match_revision,
|
||||
regex_revision_match = args.regex_revision_match,
|
||||
regex_revision_label = args.regex_revision_label,
|
||||
regex_match_comment = args.regex_match_comment,
|
||||
regex_comment_label = args.regex_comment_label)
|
||||
regex_revision_output_count = args.regex_revision_output_count,
|
||||
regex_comment_match = args.regex_comment_match,
|
||||
regex_comment_label = args.regex_comment_label,
|
||||
regex_comment_output_count = args.regex_comment_output_count)
|
||||
|
||||
|
||||
wikiq.process()
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user