Compare commits
33 Commits
legacy
...
mako_chang
| Author | SHA1 | Date | |
|---|---|---|---|
| 933ca753ed | |||
| 54fa6221a8 | |||
| 9dcd337315 | |||
| 2ff4d60613 | |||
| 4729371d5a | |||
| 7e6cd5b386 | |||
| 556285b198 | |||
| cdfa77d66d | |||
| 02b3250a36 | |||
| 414cc5ff2d | |||
|
|
4ccde84529 | ||
|
|
f147e1d899 | ||
| c84844cfb5 | |||
| c4416d0f1b | |||
| 7b856bec86 | |||
| 324ccc8e26 | |||
| 17529cdd48 | |||
|
|
7bf4559ceb | ||
| fb052ffa33 | |||
| e871023ff5 | |||
| 7d62ff9fb7 | |||
| f7f5bf8fd4 | |||
|
|
f784c77f60 | ||
| 317bafb50d | |||
| 7cd0bf3b9e | |||
| d93769c21f | |||
|
|
afd40c1a45 | ||
|
|
e4222c45dd | ||
|
|
829ffcffae | ||
|
|
776b73519a | ||
|
|
5b6aaad862 | ||
| f468d1a5b6 | |||
| bf396ad366 |
Submodule Mediawiki-Utilities deleted from f7329417eb
10
README.rst
10
README.rst
@@ -9,5 +9,11 @@ submodule like::
|
|||||||
git submodule update
|
git submodule update
|
||||||
|
|
||||||
|
|
||||||
Wikimedia dumps are usually in a compressed format such as 7z (most common), gz, or bz2. Wikiq uses your computer's compression software to read these files. Therefore wikiq depends on
|
Wikimedia dumps are usually in a compressed format such as 7z (most common),
|
||||||
`7za`, `gzcat`, and `zcat`.
|
gz, or bz2. Wikiq uses your computer's compression software to read these
|
||||||
|
files. Therefore wikiq depends on `7za`, `gzcat`, and `zcat`.
|
||||||
|
|
||||||
|
There are also a series of Python dependencies. You can install these using pip
|
||||||
|
with a command like:
|
||||||
|
|
||||||
|
pip3 install mwbase mwreverts mwxml mwtypes mwcli mwdiffs mwpersistence pandas
|
||||||
|
|||||||
9
code_review_notes.txt
Normal file
9
code_review_notes.txt
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
Please add unit tests for the new count-only functionality.
|
||||||
|
|
||||||
|
line 43 def matchmake:
|
||||||
|
This was making redundant calls to regex matching functions and so could be slower than necessary. I suggest changes that use the walrus operator to keep the same logical structure without the redundant calls.
|
||||||
|
|
||||||
|
|
||||||
|
line 212 def __init__:
|
||||||
|
|
||||||
|
Minor note: This constructor is taking a lot of arguments. This is fine, but from a style and maintainability perspective it might make sense to create a new class for the regex matching configuration and pass a configuration object to this constructor instead.
|
||||||
@@ -3,7 +3,7 @@ import os
|
|||||||
import subprocess
|
import subprocess
|
||||||
from shutil import copyfile
|
from shutil import copyfile
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pandas.util.testing import assert_frame_equal
|
from pandas.testing import assert_frame_equal
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
|
|
||||||
# with / without pwr DONE
|
# with / without pwr DONE
|
||||||
@@ -51,6 +51,51 @@ class Test_Wikipedia(unittest.TestCase):
|
|||||||
baseline = pd.read_table(baseline_file)
|
baseline = pd.read_table(baseline_file)
|
||||||
assert_frame_equal(test,baseline)
|
assert_frame_equal(test,baseline)
|
||||||
|
|
||||||
|
def test_WP_namespaces(self):
|
||||||
|
print(os.path.abspath('.'))
|
||||||
|
test_filename = "namespaces_" + self.wikiq_out_name
|
||||||
|
test_file = os.path.join(self.test_output_dir, test_filename)
|
||||||
|
if os.path.exists(test_file):
|
||||||
|
os.remove(test_file)
|
||||||
|
|
||||||
|
call = self.base_call.format(self.input_file, self.test_output_dir)
|
||||||
|
call = call + " -n 0 -n 1"
|
||||||
|
print(call)
|
||||||
|
proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
|
||||||
|
proc.wait()
|
||||||
|
copyfile(self.call_output, test_file)
|
||||||
|
baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename)
|
||||||
|
|
||||||
|
# as a test let's make sure that we get equal data frames
|
||||||
|
test = pd.read_table(test_file)
|
||||||
|
num_wrong_ns = sum(~ test.namespace.isin({0,1}))
|
||||||
|
self.assertEqual(num_wrong_ns, 0)
|
||||||
|
baseline = pd.read_table(baseline_file)
|
||||||
|
assert_frame_equal(test,baseline)
|
||||||
|
|
||||||
|
def test_WP_revert_radius(self):
|
||||||
|
print(os.path.abspath('.'))
|
||||||
|
test_filename = "revert_radius_" + self.wikiq_out_name
|
||||||
|
test_file = os.path.join(self.test_output_dir, test_filename)
|
||||||
|
if os.path.exists(test_file):
|
||||||
|
os.remove(test_file)
|
||||||
|
|
||||||
|
call = self.base_call.format(self.input_file, self.test_output_dir)
|
||||||
|
call = call + " -n 0 -n 1 -rr 1"
|
||||||
|
print(call)
|
||||||
|
proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
|
||||||
|
proc.wait()
|
||||||
|
copyfile(self.call_output, test_file)
|
||||||
|
baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename)
|
||||||
|
|
||||||
|
# as a test let's make sure that we get equal data frames
|
||||||
|
test = pd.read_table(test_file)
|
||||||
|
num_wrong_ns = sum(~ test.namespace.isin({0,1}))
|
||||||
|
self.assertEqual(num_wrong_ns, 0)
|
||||||
|
baseline = pd.read_table(baseline_file)
|
||||||
|
assert_frame_equal(test,baseline)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class Test_Basic(unittest.TestCase):
|
class Test_Basic(unittest.TestCase):
|
||||||
|
|
||||||
@@ -108,6 +153,26 @@ class Test_Basic(unittest.TestCase):
|
|||||||
baseline = pd.read_table(baseline_file)
|
baseline = pd.read_table(baseline_file)
|
||||||
assert_frame_equal(test,baseline)
|
assert_frame_equal(test,baseline)
|
||||||
|
|
||||||
|
def test_pwr_segment(self):
|
||||||
|
test_filename = "persistence_segment_" + self.wikiq_out_name
|
||||||
|
test_file = os.path.join(self.test_output_dir, test_filename)
|
||||||
|
if os.path.exists(test_file):
|
||||||
|
os.remove(test_file)
|
||||||
|
|
||||||
|
call = self.base_call.format(self.input_file, self.test_output_dir)
|
||||||
|
call = call + " --persistence segment"
|
||||||
|
proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
|
||||||
|
proc.wait()
|
||||||
|
|
||||||
|
|
||||||
|
copyfile(self.call_output, test_file)
|
||||||
|
|
||||||
|
baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
|
||||||
|
|
||||||
|
test = pd.read_table(test_file)
|
||||||
|
baseline = pd.read_table(baseline_file)
|
||||||
|
assert_frame_equal(test,baseline)
|
||||||
|
|
||||||
def test_pwr_legacy(self):
|
def test_pwr_legacy(self):
|
||||||
test_filename = "persistence_legacy_" + self.wikiq_out_name
|
test_filename = "persistence_legacy_" + self.wikiq_out_name
|
||||||
test_file = os.path.join(self.test_output_dir, test_filename)
|
test_file = os.path.join(self.test_output_dir, test_filename)
|
||||||
@@ -115,7 +180,7 @@ class Test_Basic(unittest.TestCase):
|
|||||||
os.remove(test_file)
|
os.remove(test_file)
|
||||||
|
|
||||||
call = self.base_call.format(self.input_file, self.test_output_dir)
|
call = self.base_call.format(self.input_file, self.test_output_dir)
|
||||||
call = call + " --persistence-legacy"
|
call = call + " --persistence legacy"
|
||||||
proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
|
proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
|
||||||
proc.wait()
|
proc.wait()
|
||||||
|
|
||||||
@@ -131,8 +196,8 @@ class Test_Basic(unittest.TestCase):
|
|||||||
def test_pwr(self):
|
def test_pwr(self):
|
||||||
test_filename = "persistence_" + self.wikiq_out_name
|
test_filename = "persistence_" + self.wikiq_out_name
|
||||||
test_file = os.path.join(self.test_output_dir, test_filename)
|
test_file = os.path.join(self.test_output_dir, test_filename)
|
||||||
if os.path.exists(test_file):
|
if os.path.exists(test_file):
|
||||||
os.remove(test_file)
|
os.remove(test_file)
|
||||||
|
|
||||||
call = self.base_call.format(self.input_file, self.test_output_dir)
|
call = self.base_call.format(self.input_file, self.test_output_dir)
|
||||||
call = call + " --persistence"
|
call = call + " --persistence"
|
||||||
@@ -169,7 +234,6 @@ class Test_Basic(unittest.TestCase):
|
|||||||
|
|
||||||
|
|
||||||
class Test_Malformed(unittest.TestCase):
|
class Test_Malformed(unittest.TestCase):
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
if not os.path.exists("test_output"):
|
if not os.path.exists("test_output"):
|
||||||
os.mkdir("test_output")
|
os.mkdir("test_output")
|
||||||
@@ -218,6 +282,118 @@ class Test_Stdout(unittest.TestCase):
|
|||||||
test = pd.read_table(StringIO(outs))
|
test = pd.read_table(StringIO(outs))
|
||||||
baseline = pd.read_table(baseline_file)
|
baseline = pd.read_table(baseline_file)
|
||||||
assert_frame_equal(test,baseline)
|
assert_frame_equal(test,baseline)
|
||||||
|
|
||||||
|
class Test_Regex(unittest.TestCase):
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
self.wiki = 'regextest'
|
||||||
|
self.wikiq_out_name = self.wiki + '.tsv'
|
||||||
|
self.infile = "{0}.xml.bz2".format(self.wiki)
|
||||||
|
|
||||||
|
self.input_dir = "dumps"
|
||||||
|
self.input_file = os.path.join(".", self.input_dir,self.infile)
|
||||||
|
|
||||||
|
if not os.path.exists("test_output"):
|
||||||
|
os.mkdir("test_output")
|
||||||
|
|
||||||
|
self.test_output_dir = os.path.join(".", "test_output")
|
||||||
|
self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
|
||||||
|
# we have two base calls, one for checking inputs and the other for checking outputs
|
||||||
|
self.base_call = "../wikiq {0}"
|
||||||
|
self.base_call_outs = "../wikiq {0} -o {1}"
|
||||||
|
|
||||||
|
self.baseline_output_dir = "baseline_output"
|
||||||
|
|
||||||
|
# sample inputs for checking that bad inputs get terminated / test_regex_inputs
|
||||||
|
self.bad_inputs_list = [
|
||||||
|
#label is missing
|
||||||
|
"-RP '\\b\\d+\\b'",
|
||||||
|
#number of reg and number of labels do not match
|
||||||
|
"-RP 'NPO V' -RP THE -RPl testlabel",
|
||||||
|
#cp but rp label
|
||||||
|
"-CP '(Tamil|Li)' -RPl testlabel",
|
||||||
|
#regex is missing
|
||||||
|
"-CPl testlabel",
|
||||||
|
"-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'"
|
||||||
|
]
|
||||||
|
|
||||||
|
# sample inputs for checking the outcomes of good inputs / test_basic_regex
|
||||||
|
self.good_inputs_list = [
|
||||||
|
"-RP '\\b\\d{3}\\b' -RPl threedigits",
|
||||||
|
"-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word",
|
||||||
|
"-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning",
|
||||||
|
"-CP 'WP:EVADE' -CPl wp_evade"
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
self.cap_inputs_list = [
|
||||||
|
"-RP 'Li Chevalier' -RPl li_cheval -CP '(?P<letter>\\b[a-zA-Z]{3}\\b)|(?P<number>\\b\\d+\\b)|(?P<cat>\\bcat\\b)' -CPl three",
|
||||||
|
"-CP '(?P<a>\\bTestCaseA\\b)|(?P<b>\\bTestCaseB\\b)|(?P<c>\\bTestCaseC\\b)|(?P<d>\\bTestCaseD\\b)' -CPl testcase -RP '(?P<npov>npov|NPOV)|(?P<neutral>neutral point of view)' -RPl npov"
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def test_regex_inputs(self):
|
||||||
|
for input in self.bad_inputs_list:
|
||||||
|
call = self.base_call.format(self.input_file)
|
||||||
|
call = call + " --stdout " + input
|
||||||
|
print(call)
|
||||||
|
proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True)
|
||||||
|
stdout,stderr = proc.communicate()
|
||||||
|
#print(proc.returncode)
|
||||||
|
|
||||||
|
# we want to check that the bad inputs were caught and sys.exit is stopping the code
|
||||||
|
print(stderr.decode("utf-8"))
|
||||||
|
self.assertNotEqual(proc.returncode,0)
|
||||||
|
|
||||||
|
def test_basic_regex(self):
|
||||||
|
for i, input in enumerate(self.good_inputs_list):
|
||||||
|
|
||||||
|
test_filename = "basic_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i))
|
||||||
|
#print(test_filename)
|
||||||
|
test_file = os.path.join(self.test_output_dir, test_filename)
|
||||||
|
if os.path.exists(test_file):
|
||||||
|
os.remove(test_file)
|
||||||
|
|
||||||
|
call = self.base_call_outs.format(self.input_file, self.test_output_dir)
|
||||||
|
call = call + " " + input
|
||||||
|
print(call)
|
||||||
|
|
||||||
|
proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True)
|
||||||
|
proc.wait()
|
||||||
|
copyfile(self.call_output, test_file)
|
||||||
|
|
||||||
|
test = pd.read_table(test_file)
|
||||||
|
|
||||||
|
baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
|
||||||
|
baseline = pd.read_table(baseline_file)
|
||||||
|
assert_frame_equal(test, baseline)
|
||||||
|
print(i)
|
||||||
|
|
||||||
|
|
||||||
|
def test_capturegroup_regex(self):
|
||||||
|
for i, input in enumerate(self.cap_inputs_list):
|
||||||
|
test_filename = "capturegroup_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i))
|
||||||
|
print(test_filename)
|
||||||
|
test_file = os.path.join(self.test_output_dir, test_filename)
|
||||||
|
if os.path.exists(test_file):
|
||||||
|
os.remove(test_file)
|
||||||
|
|
||||||
|
call = self.base_call_outs.format(self.input_file, self.test_output_dir)
|
||||||
|
call = call + " " + input
|
||||||
|
print(call)
|
||||||
|
|
||||||
|
proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True)
|
||||||
|
proc.wait()
|
||||||
|
|
||||||
|
copyfile(self.call_output, test_file)
|
||||||
|
|
||||||
|
test = pd.read_table(test_file)
|
||||||
|
|
||||||
|
baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
|
||||||
|
baseline = pd.read_table(baseline_file)
|
||||||
|
assert_frame_equal(test, baseline)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
27
test/baseline_output/basic_regextest_0.tsv
Normal file
27
test/baseline_output/basic_regextest_0.tsv
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
anon articleid date_time deleted editor editor_id minor namespace revert reverteds revid sha1 text_chars threedigits title
|
||||||
|
FALSE 56237363 2018-01-07 10:40:58 FALSE "NinjaRobotPirate" 3742946 FALSE 3 FALSE 819091731 135nz8q6lfam6cojla7azb7k5alx3t3 1141 126, 126, 126, 126 "User talk:86.139.142.254"
|
||||||
|
FALSE 56237364 2018-01-07 10:41:10 FALSE "Kavin kavitha" 32792125 FALSE 3 FALSE 819091755 0pwezjc6yopz0smc8al6ogc4fax5bwo 663 None "User talk:Kavin kavitha"
|
||||||
|
FALSE 56237365 2018-01-07 10:41:26 FALSE "Amicable always" 32621254 FALSE 3 FALSE 819091788 sz3t2ap7z8bpkdvdvi195f3i35949bv 399 None "User talk:Dr.vivek163"
|
||||||
|
FALSE 56237366 2018-01-07 10:41:31 FALSE "ClueBot NG" 13286072 FALSE 3 FALSE 819091796 r6s5j8j3iykenrhuhpnkpsmmd71vubf 1260 None "User talk:Twistorl"
|
||||||
|
FALSE 56237368 2018-01-07 10:41:51 FALSE "Khruner" 8409334 FALSE 0 FALSE 819091825 tf5qz2yaswx61zrlm9ovxzuhl7r2dc4 2249 119, 978, 500, 292, 225, 199, 292 "Kom Firin"
|
||||||
|
FALSE 56237368 2018-01-27 12:16:02 FALSE "Khruner" 8409334 TRUE 0 FALSE 822610647 e6oa4g0qv64icdaq26uu1zzbyr5hcbh 2230 119, 978, 500, 292, 225, 199, 292 "Kom Firin"
|
||||||
|
FALSE 56237369 2018-01-07 10:42:05 FALSE "Editingaccount1994" 32794215 FALSE 2 FALSE 819091844 0fyvyh2a8xu41gt8obr34oba0bfixj6 27840 798, 150, 150, 150, 621, 137, 137, 150, 150, 350, 195, 350, 195, 180, 180, 350, 195, 300, 150, 150, 150, 180, 180, 621 "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237369 2018-01-07 11:09:52 FALSE "AnomieBOT" 7611264 TRUE 2 FALSE 819093984 8gy52aolt5rg3eaketwj5v7eiw0apv2 27787 798, 150, 150, 150, 621, 137, 137, 150, 150, 350, 195, 350, 195, 180, 180, 350, 195, 300, 150, 150, 150, 180, 180, 621 "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237369 2018-01-12 21:45:50 FALSE "SporkBot" 12406635 TRUE 2 FALSE 820064189 he8ydemaanxlrpftqxkez8jfpge1fsj 27784 798, 150, 150, 150, 621, 137, 137, 150, 150, 350, 195, 350, 195, 180, 180, 350, 195, 300, 150, 150, 150, 180, 180, 621 "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237369 2018-01-12 23:28:11 FALSE "SporkBot" 12406635 TRUE 2 FALSE 820078679 0to17w9rth3url8n7gvucdtobybdq5h 27783 798, 150, 150, 150, 621, 137, 137, 150, 150, 350, 195, 350, 195, 180, 180, 350, 195, 300, 150, 150, 150, 180, 180, 621 "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237369 2018-01-12 23:28:39 FALSE "SporkBot" 12406635 TRUE 2 FALSE 820078733 531dizmmloyxffbkdr5vph7owh921eg 27782 798, 150, 150, 150, 621, 137, 137, 150, 150, 350, 195, 350, 195, 180, 180, 350, 195, 300, 150, 150, 150, 180, 180, 621 "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237369 2018-01-13 13:45:33 FALSE "Frietjes" 13791031 FALSE 2 FALSE 820177382 nik9p2u2fuk4yazjxt8ymbicxv5qid9 27757 798, 150, 150, 150, 621, 100, 621 "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237369 2018-01-24 01:35:22 FALSE "CommonsDelinker" 2304267 FALSE 2 FALSE 822038928 gwk6pampl8si1v5pv3kwgteg710sfw3 27667 798, 150, 150, 150, 621, 100, 621 "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237370 2018-01-07 10:42:20 FALSE "PamD" 1368779 FALSE 0 FALSE 819091874 n4ozbsgle13p9yywtfrz982ccj8woc9 25 None "Anita del Rey"
|
||||||
|
FALSE 56237371 2018-01-07 10:42:27 FALSE "ClueBot NG" 13286072 FALSE 3 FALSE 819091883 ksohnvsbeuzwpl5vb8a3v8m18hva0a7 1274 119, 157, 119, 157, 119, 157, 119, 157 "User talk:119.94.96.157"
|
||||||
|
FALSE 56237372 2018-01-07 10:42:50 FALSE "Underbar dk" 677153 FALSE 14 FALSE 819091914 je7aw21fedbwyqsyofpisdrynsu7olr 113 None "Category:Ohmi Railway"
|
||||||
|
FALSE 56237375 2018-01-07 10:43:32 FALSE "TastyPoutine" 882433 FALSE 3 FALSE 819091968 cpm4tkzcx4hc6irr9ukbi06ogud8dtq 199 None "User talk:92.226.219.222"
|
||||||
|
FALSE 56237375 2018-01-07 11:10:24 FALSE "AnomieBOT" 7611264 TRUE 3 FALSE 819094036 artmfz8b2gxhb3pp8a5p4ksplxqfkpg 1840 None "User talk:92.226.219.222"
|
||||||
|
FALSE 56237375 2018-01-07 14:33:36 FALSE "Only" 702940 FALSE 3 FALSE 819112363 dn9wj0n8d8pdd5lqe56uw5xamupowr1 2949 126, 126, 126, 126 "User talk:92.226.219.222"
|
||||||
|
FALSE 56237376 2018-01-07 10:44:01 FALSE "Dipayanacharya" 32794237 FALSE 2 FALSE 819092004 ofueugwatmmn7u73isw732neuza57gk 28 None "User:Dipayanacharya"
|
||||||
|
FALSE 56237376 2018-01-07 10:49:08 FALSE "Dipayanacharya" 32794237 FALSE 2 FALSE 819092390 dsz55xv96ec2uv6w9c1z7c52ipfovbw 38 None "User:Dipayanacharya"
|
||||||
|
FALSE 56237378 2018-01-07 10:44:56 FALSE "Vinegarymass911" 21516552 FALSE 0 FALSE 819092066 9ma38hak0ef1ew4fpiutxpnzd8oz1wd 65 None "BSCIC"
|
||||||
|
FALSE 56237379 2018-01-07 10:45:21 FALSE "BrownHairedGirl" 754619 FALSE 14 FALSE 819092102 4dvakoat58bzyf5hmtthxukt29hip6n 285 None "Category:Women government ministers of Yemen"
|
||||||
|
FALSE 56237381 2018-01-07 10:45:54 FALSE "PRehse" 410898 FALSE 1 FALSE 819092135 2sjrxsc7os9k9pg4su2t4rk2j8nn0h7 103 None "Talk:List of Morning Glories Characters"
|
||||||
|
FALSE 56237382 2018-01-07 10:45:56 FALSE "ClueBot NG" 13286072 FALSE 3 FALSE 819092138 3y9t5wpk6ur5jhone75rhm4wjf01fgi 1330 106, 207, 126, 114, 106, 207, 126, 114, 106, 207, 126, 114, 106, 207, 126, 114 "User talk:106.207.126.114"
|
||||||
|
FALSE 56237382 2018-01-07 10:50:22 FALSE "HindWIKI" 31190506 FALSE 3 FALSE 819092495 8wvn6vh3isyt0dorpe89lztrburgupe 2355 106, 207, 126, 114, 106, 207, 126, 114, 106, 207, 126, 114, 106, 207, 126, 114 "User talk:106.207.126.114"
|
||||||
|
27
test/baseline_output/basic_regextest_1.tsv
Normal file
27
test/baseline_output/basic_regextest_1.tsv
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
anon articleid date_time deleted editor editor_id minor namespace page_word revert reverteds revid sha1 testcases text_chars title
|
||||||
|
FALSE 56237363 2018-01-07 10:40:58 FALSE "NinjaRobotPirate" 3742946 FALSE 3 page, page FALSE 819091731 135nz8q6lfam6cojla7azb7k5alx3t3 None 1141 "User talk:86.139.142.254"
|
||||||
|
FALSE 56237364 2018-01-07 10:41:10 FALSE "Kavin kavitha" 32792125 FALSE 3 None FALSE 819091755 0pwezjc6yopz0smc8al6ogc4fax5bwo None 663 "User talk:Kavin kavitha"
|
||||||
|
FALSE 56237365 2018-01-07 10:41:26 FALSE "Amicable always" 32621254 FALSE 3 None FALSE 819091788 sz3t2ap7z8bpkdvdvi195f3i35949bv TestCase, TestCase 399 "User talk:Dr.vivek163"
|
||||||
|
FALSE 56237366 2018-01-07 10:41:31 FALSE "ClueBot NG" 13286072 FALSE 3 page FALSE 819091796 r6s5j8j3iykenrhuhpnkpsmmd71vubf None 1260 "User talk:Twistorl"
|
||||||
|
FALSE 56237368 2018-01-07 10:41:51 FALSE "Khruner" 8409334 FALSE 0 page FALSE 819091825 tf5qz2yaswx61zrlm9ovxzuhl7r2dc4 TestCase 2249 "Kom Firin"
|
||||||
|
FALSE 56237368 2018-01-27 12:16:02 FALSE "Khruner" 8409334 TRUE 0 page FALSE 822610647 e6oa4g0qv64icdaq26uu1zzbyr5hcbh None 2230 "Kom Firin"
|
||||||
|
FALSE 56237369 2018-01-07 10:42:05 FALSE "Editingaccount1994" 32794215 FALSE 2 page, page FALSE 819091844 0fyvyh2a8xu41gt8obr34oba0bfixj6 None 27840 "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237369 2018-01-07 11:09:52 FALSE "AnomieBOT" 7611264 TRUE 2 page, page FALSE 819093984 8gy52aolt5rg3eaketwj5v7eiw0apv2 None 27787 "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237369 2018-01-12 21:45:50 FALSE "SporkBot" 12406635 TRUE 2 page, page FALSE 820064189 he8ydemaanxlrpftqxkez8jfpge1fsj None 27784 "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237369 2018-01-12 23:28:11 FALSE "SporkBot" 12406635 TRUE 2 page, page FALSE 820078679 0to17w9rth3url8n7gvucdtobybdq5h None 27783 "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237369 2018-01-12 23:28:39 FALSE "SporkBot" 12406635 TRUE 2 page, page FALSE 820078733 531dizmmloyxffbkdr5vph7owh921eg None 27782 "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237369 2018-01-13 13:45:33 FALSE "Frietjes" 13791031 FALSE 2 page, page FALSE 820177382 nik9p2u2fuk4yazjxt8ymbicxv5qid9 None 27757 "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237369 2018-01-24 01:35:22 FALSE "CommonsDelinker" 2304267 FALSE 2 page, page FALSE 822038928 gwk6pampl8si1v5pv3kwgteg710sfw3 None 27667 "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237370 2018-01-07 10:42:20 FALSE "PamD" 1368779 FALSE 0 None FALSE 819091874 n4ozbsgle13p9yywtfrz982ccj8woc9 None 25 "Anita del Rey"
|
||||||
|
FALSE 56237371 2018-01-07 10:42:27 FALSE "ClueBot NG" 13286072 FALSE 3 page FALSE 819091883 ksohnvsbeuzwpl5vb8a3v8m18hva0a7 None 1274 "User talk:119.94.96.157"
|
||||||
|
FALSE 56237372 2018-01-07 10:42:50 FALSE "Underbar dk" 677153 FALSE 14 None FALSE 819091914 je7aw21fedbwyqsyofpisdrynsu7olr None 113 "Category:Ohmi Railway"
|
||||||
|
FALSE 56237375 2018-01-07 10:43:32 FALSE "TastyPoutine" 882433 FALSE 3 None FALSE 819091968 cpm4tkzcx4hc6irr9ukbi06ogud8dtq None 199 "User talk:92.226.219.222"
|
||||||
|
FALSE 56237375 2018-01-07 11:10:24 FALSE "AnomieBOT" 7611264 TRUE 3 page, page, page, page FALSE 819094036 artmfz8b2gxhb3pp8a5p4ksplxqfkpg None 1840 "User talk:92.226.219.222"
|
||||||
|
FALSE 56237375 2018-01-07 14:33:36 FALSE "Only" 702940 FALSE 3 page, page, page, page, page, page FALSE 819112363 dn9wj0n8d8pdd5lqe56uw5xamupowr1 None 2949 "User talk:92.226.219.222"
|
||||||
|
FALSE 56237376 2018-01-07 10:44:01 FALSE "Dipayanacharya" 32794237 FALSE 2 None FALSE 819092004 ofueugwatmmn7u73isw732neuza57gk None 28 "User:Dipayanacharya"
|
||||||
|
FALSE 56237376 2018-01-07 10:49:08 FALSE "Dipayanacharya" 32794237 FALSE 2 None FALSE 819092390 dsz55xv96ec2uv6w9c1z7c52ipfovbw None 38 "User:Dipayanacharya"
|
||||||
|
FALSE 56237378 2018-01-07 10:44:56 FALSE "Vinegarymass911" 21516552 FALSE 0 None FALSE 819092066 9ma38hak0ef1ew4fpiutxpnzd8oz1wd None 65 "BSCIC"
|
||||||
|
FALSE 56237379 2018-01-07 10:45:21 FALSE "BrownHairedGirl" 754619 FALSE 14 None FALSE 819092102 4dvakoat58bzyf5hmtthxukt29hip6n None 285 "Category:Women government ministers of Yemen"
|
||||||
|
FALSE 56237381 2018-01-07 10:45:54 FALSE "PRehse" 410898 FALSE 1 None FALSE 819092135 2sjrxsc7os9k9pg4su2t4rk2j8nn0h7 None 103 "Talk:List of Morning Glories Characters"
|
||||||
|
FALSE 56237382 2018-01-07 10:45:56 FALSE "ClueBot NG" 13286072 FALSE 3 page FALSE 819092138 3y9t5wpk6ur5jhone75rhm4wjf01fgi None 1330 "User talk:106.207.126.114"
|
||||||
|
FALSE 56237382 2018-01-07 10:50:22 FALSE "HindWIKI" 31190506 FALSE 3 page FALSE 819092495 8wvn6vh3isyt0dorpe89lztrburgupe None 2355 "User talk:106.207.126.114"
|
||||||
|
27
test/baseline_output/basic_regextest_2.tsv
Normal file
27
test/baseline_output/basic_regextest_2.tsv
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
anon articleid chev_com date_time deleted editor editor_id minor namespace revert reverteds revid sha1 text_chars title warning wiki_welcome
|
||||||
|
FALSE 56237363 None 2018-01-07 10:40:58 FALSE "NinjaRobotPirate" 3742946 FALSE 3 FALSE 819091731 135nz8q6lfam6cojla7azb7k5alx3t3 1141 "User talk:86.139.142.254" None None
|
||||||
|
FALSE 56237364 None 2018-01-07 10:41:10 FALSE "Kavin kavitha" 32792125 FALSE 3 FALSE 819091755 0pwezjc6yopz0smc8al6ogc4fax5bwo 663 "User talk:Kavin kavitha" None None
|
||||||
|
FALSE 56237365 None 2018-01-07 10:41:26 FALSE "Amicable always" 32621254 FALSE 3 FALSE 819091788 sz3t2ap7z8bpkdvdvi195f3i35949bv 399 "User talk:Dr.vivek163" None None
|
||||||
|
FALSE 56237366 None 2018-01-07 10:41:31 FALSE "ClueBot NG" 13286072 FALSE 3 FALSE 819091796 r6s5j8j3iykenrhuhpnkpsmmd71vubf 1260 "User talk:Twistorl" Warning welcome to Wikipedia
|
||||||
|
FALSE 56237368 None 2018-01-07 10:41:51 FALSE "Khruner" 8409334 FALSE 0 FALSE 819091825 tf5qz2yaswx61zrlm9ovxzuhl7r2dc4 2249 "Kom Firin" None None
|
||||||
|
FALSE 56237368 None 2018-01-27 12:16:02 FALSE "Khruner" 8409334 TRUE 0 FALSE 822610647 e6oa4g0qv64icdaq26uu1zzbyr5hcbh 2230 "Kom Firin" None None
|
||||||
|
FALSE 56237369 Chevalier, Chevalier 2018-01-07 10:42:05 FALSE "Editingaccount1994" 32794215 FALSE 2 FALSE 819091844 0fyvyh2a8xu41gt8obr34oba0bfixj6 27840 "User:Editingaccount1994/sandbox" None None
|
||||||
|
FALSE 56237369 None 2018-01-07 11:09:52 FALSE "AnomieBOT" 7611264 TRUE 2 FALSE 819093984 8gy52aolt5rg3eaketwj5v7eiw0apv2 27787 "User:Editingaccount1994/sandbox" None None
|
||||||
|
FALSE 56237369 None 2018-01-12 21:45:50 FALSE "SporkBot" 12406635 TRUE 2 FALSE 820064189 he8ydemaanxlrpftqxkez8jfpge1fsj 27784 "User:Editingaccount1994/sandbox" None None
|
||||||
|
FALSE 56237369 None 2018-01-12 23:28:11 FALSE "SporkBot" 12406635 TRUE 2 FALSE 820078679 0to17w9rth3url8n7gvucdtobybdq5h 27783 "User:Editingaccount1994/sandbox" None None
|
||||||
|
FALSE 56237369 None 2018-01-12 23:28:39 FALSE "SporkBot" 12406635 TRUE 2 FALSE 820078733 531dizmmloyxffbkdr5vph7owh921eg 27782 "User:Editingaccount1994/sandbox" None None
|
||||||
|
FALSE 56237369 None 2018-01-13 13:45:33 FALSE "Frietjes" 13791031 FALSE 2 FALSE 820177382 nik9p2u2fuk4yazjxt8ymbicxv5qid9 27757 "User:Editingaccount1994/sandbox" None None
|
||||||
|
FALSE 56237369 Chevalier, Chevalier 2018-01-24 01:35:22 FALSE "CommonsDelinker" 2304267 FALSE 2 FALSE 822038928 gwk6pampl8si1v5pv3kwgteg710sfw3 27667 "User:Editingaccount1994/sandbox" None None
|
||||||
|
FALSE 56237370 None 2018-01-07 10:42:20 FALSE "PamD" 1368779 FALSE 0 FALSE 819091874 n4ozbsgle13p9yywtfrz982ccj8woc9 25 "Anita del Rey" None None
|
||||||
|
FALSE 56237371 None 2018-01-07 10:42:27 FALSE "ClueBot NG" 13286072 FALSE 3 FALSE 819091883 ksohnvsbeuzwpl5vb8a3v8m18hva0a7 1274 "User talk:119.94.96.157" Warning welcome to Wikipedia
|
||||||
|
FALSE 56237372 None 2018-01-07 10:42:50 FALSE "Underbar dk" 677153 FALSE 14 FALSE 819091914 je7aw21fedbwyqsyofpisdrynsu7olr 113 "Category:Ohmi Railway" None None
|
||||||
|
FALSE 56237375 None 2018-01-07 10:43:32 FALSE "TastyPoutine" 882433 FALSE 3 FALSE 819091968 cpm4tkzcx4hc6irr9ukbi06ogud8dtq 199 "User talk:92.226.219.222" None None
|
||||||
|
FALSE 56237375 None 2018-01-07 11:10:24 FALSE "AnomieBOT" 7611264 TRUE 3 FALSE 819094036 artmfz8b2gxhb3pp8a5p4ksplxqfkpg 1840 "User talk:92.226.219.222" None None
|
||||||
|
FALSE 56237375 None 2018-01-07 14:33:36 FALSE "Only" 702940 FALSE 3 FALSE 819112363 dn9wj0n8d8pdd5lqe56uw5xamupowr1 2949 "User talk:92.226.219.222" None None
|
||||||
|
FALSE 56237376 None 2018-01-07 10:44:01 FALSE "Dipayanacharya" 32794237 FALSE 2 FALSE 819092004 ofueugwatmmn7u73isw732neuza57gk 28 "User:Dipayanacharya" None None
|
||||||
|
FALSE 56237376 None 2018-01-07 10:49:08 FALSE "Dipayanacharya" 32794237 FALSE 2 FALSE 819092390 dsz55xv96ec2uv6w9c1z7c52ipfovbw 38 "User:Dipayanacharya" None None
|
||||||
|
FALSE 56237378 None 2018-01-07 10:44:56 FALSE "Vinegarymass911" 21516552 FALSE 0 FALSE 819092066 9ma38hak0ef1ew4fpiutxpnzd8oz1wd 65 "BSCIC" None None
|
||||||
|
FALSE 56237379 None 2018-01-07 10:45:21 FALSE "BrownHairedGirl" 754619 FALSE 14 FALSE 819092102 4dvakoat58bzyf5hmtthxukt29hip6n 285 "Category:Women government ministers of Yemen" None None
|
||||||
|
FALSE 56237381 None 2018-01-07 10:45:54 FALSE "PRehse" 410898 FALSE 1 FALSE 819092135 2sjrxsc7os9k9pg4su2t4rk2j8nn0h7 103 "Talk:List of Morning Glories Characters" None None
|
||||||
|
FALSE 56237382 None 2018-01-07 10:45:56 FALSE "ClueBot NG" 13286072 FALSE 3 FALSE 819092138 3y9t5wpk6ur5jhone75rhm4wjf01fgi 1330 "User talk:106.207.126.114" Warning welcome to Wikipedia
|
||||||
|
FALSE 56237382 None 2018-01-07 10:50:22 FALSE "HindWIKI" 31190506 FALSE 3 FALSE 819092495 8wvn6vh3isyt0dorpe89lztrburgupe 2355 "User talk:106.207.126.114" None welcome to Wikipedia
|
||||||
|
27
test/baseline_output/basic_regextest_3.tsv
Normal file
27
test/baseline_output/basic_regextest_3.tsv
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
anon articleid date_time deleted editor editor_id minor namespace revert reverteds revid sha1 text_chars title wp_evade
|
||||||
|
FALSE 56237363 2018-01-07 10:40:58 FALSE "NinjaRobotPirate" 3742946 FALSE 3 FALSE 819091731 135nz8q6lfam6cojla7azb7k5alx3t3 1141 "User talk:86.139.142.254" WP:EVADE
|
||||||
|
FALSE 56237364 2018-01-07 10:41:10 FALSE "Kavin kavitha" 32792125 FALSE 3 FALSE 819091755 0pwezjc6yopz0smc8al6ogc4fax5bwo 663 "User talk:Kavin kavitha" None
|
||||||
|
FALSE 56237365 2018-01-07 10:41:26 FALSE "Amicable always" 32621254 FALSE 3 FALSE 819091788 sz3t2ap7z8bpkdvdvi195f3i35949bv 399 "User talk:Dr.vivek163" None
|
||||||
|
FALSE 56237366 2018-01-07 10:41:31 FALSE "ClueBot NG" 13286072 FALSE 3 FALSE 819091796 r6s5j8j3iykenrhuhpnkpsmmd71vubf 1260 "User talk:Twistorl" None
|
||||||
|
FALSE 56237368 2018-01-07 10:41:51 FALSE "Khruner" 8409334 FALSE 0 FALSE 819091825 tf5qz2yaswx61zrlm9ovxzuhl7r2dc4 2249 "Kom Firin" None
|
||||||
|
FALSE 56237368 2018-01-27 12:16:02 FALSE "Khruner" 8409334 TRUE 0 FALSE 822610647 e6oa4g0qv64icdaq26uu1zzbyr5hcbh 2230 "Kom Firin" None
|
||||||
|
FALSE 56237369 2018-01-07 10:42:05 FALSE "Editingaccount1994" 32794215 FALSE 2 FALSE 819091844 0fyvyh2a8xu41gt8obr34oba0bfixj6 27840 "User:Editingaccount1994/sandbox" None
|
||||||
|
FALSE 56237369 2018-01-07 11:09:52 FALSE "AnomieBOT" 7611264 TRUE 2 FALSE 819093984 8gy52aolt5rg3eaketwj5v7eiw0apv2 27787 "User:Editingaccount1994/sandbox" None
|
||||||
|
FALSE 56237369 2018-01-12 21:45:50 FALSE "SporkBot" 12406635 TRUE 2 FALSE 820064189 he8ydemaanxlrpftqxkez8jfpge1fsj 27784 "User:Editingaccount1994/sandbox" None
|
||||||
|
FALSE 56237369 2018-01-12 23:28:11 FALSE "SporkBot" 12406635 TRUE 2 FALSE 820078679 0to17w9rth3url8n7gvucdtobybdq5h 27783 "User:Editingaccount1994/sandbox" None
|
||||||
|
FALSE 56237369 2018-01-12 23:28:39 FALSE "SporkBot" 12406635 TRUE 2 FALSE 820078733 531dizmmloyxffbkdr5vph7owh921eg 27782 "User:Editingaccount1994/sandbox" None
|
||||||
|
FALSE 56237369 2018-01-13 13:45:33 FALSE "Frietjes" 13791031 FALSE 2 FALSE 820177382 nik9p2u2fuk4yazjxt8ymbicxv5qid9 27757 "User:Editingaccount1994/sandbox" None
|
||||||
|
FALSE 56237369 2018-01-24 01:35:22 FALSE "CommonsDelinker" 2304267 FALSE 2 FALSE 822038928 gwk6pampl8si1v5pv3kwgteg710sfw3 27667 "User:Editingaccount1994/sandbox" None
|
||||||
|
FALSE 56237370 2018-01-07 10:42:20 FALSE "PamD" 1368779 FALSE 0 FALSE 819091874 n4ozbsgle13p9yywtfrz982ccj8woc9 25 "Anita del Rey" None
|
||||||
|
FALSE 56237371 2018-01-07 10:42:27 FALSE "ClueBot NG" 13286072 FALSE 3 FALSE 819091883 ksohnvsbeuzwpl5vb8a3v8m18hva0a7 1274 "User talk:119.94.96.157" None
|
||||||
|
FALSE 56237372 2018-01-07 10:42:50 FALSE "Underbar dk" 677153 FALSE 14 FALSE 819091914 je7aw21fedbwyqsyofpisdrynsu7olr 113 "Category:Ohmi Railway" None
|
||||||
|
FALSE 56237375 2018-01-07 10:43:32 FALSE "TastyPoutine" 882433 FALSE 3 FALSE 819091968 cpm4tkzcx4hc6irr9ukbi06ogud8dtq 199 "User talk:92.226.219.222" None
|
||||||
|
FALSE 56237375 2018-01-07 11:10:24 FALSE "AnomieBOT" 7611264 TRUE 3 FALSE 819094036 artmfz8b2gxhb3pp8a5p4ksplxqfkpg 1840 "User talk:92.226.219.222" None
|
||||||
|
FALSE 56237375 2018-01-07 14:33:36 FALSE "Only" 702940 FALSE 3 FALSE 819112363 dn9wj0n8d8pdd5lqe56uw5xamupowr1 2949 "User talk:92.226.219.222" WP:EVADE
|
||||||
|
FALSE 56237376 2018-01-07 10:44:01 FALSE "Dipayanacharya" 32794237 FALSE 2 FALSE 819092004 ofueugwatmmn7u73isw732neuza57gk 28 "User:Dipayanacharya" None
|
||||||
|
FALSE 56237376 2018-01-07 10:49:08 FALSE "Dipayanacharya" 32794237 FALSE 2 FALSE 819092390 dsz55xv96ec2uv6w9c1z7c52ipfovbw 38 "User:Dipayanacharya" None
|
||||||
|
FALSE 56237378 2018-01-07 10:44:56 FALSE "Vinegarymass911" 21516552 FALSE 0 FALSE 819092066 9ma38hak0ef1ew4fpiutxpnzd8oz1wd 65 "BSCIC" None
|
||||||
|
FALSE 56237379 2018-01-07 10:45:21 FALSE "BrownHairedGirl" 754619 FALSE 14 FALSE 819092102 4dvakoat58bzyf5hmtthxukt29hip6n 285 "Category:Women government ministers of Yemen" None
|
||||||
|
FALSE 56237381 2018-01-07 10:45:54 FALSE "PRehse" 410898 FALSE 1 FALSE 819092135 2sjrxsc7os9k9pg4su2t4rk2j8nn0h7 103 "Talk:List of Morning Glories Characters" None
|
||||||
|
FALSE 56237382 2018-01-07 10:45:56 FALSE "ClueBot NG" 13286072 FALSE 3 FALSE 819092138 3y9t5wpk6ur5jhone75rhm4wjf01fgi 1330 "User talk:106.207.126.114" None
|
||||||
|
FALSE 56237382 2018-01-07 10:50:22 FALSE "HindWIKI" 31190506 FALSE 3 FALSE 819092495 8wvn6vh3isyt0dorpe89lztrburgupe 2355 "User talk:106.207.126.114" None
|
||||||
|
27
test/baseline_output/capturegroup_regextest_0.tsv
Normal file
27
test/baseline_output/capturegroup_regextest_0.tsv
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
anon articleid date_time deleted editor editor_id li_cheval minor namespace revert reverteds revid sha1 text_chars three_cat three_letter three_number title
|
||||||
|
FALSE 56237363 2018-01-07 10:40:58 FALSE "NinjaRobotPirate" 3742946 None FALSE 3 FALSE 819091731 135nz8q6lfam6cojla7azb7k5alx3t3 1141 None has, has None "User talk:86.139.142.254"
|
||||||
|
FALSE 56237364 2018-01-07 10:41:10 FALSE "Kavin kavitha" 32792125 None FALSE 3 FALSE 819091755 0pwezjc6yopz0smc8al6ogc4fax5bwo 663 None AES, for 01, 12, 2001 "User talk:Kavin kavitha"
|
||||||
|
FALSE 56237365 2018-01-07 10:41:26 FALSE "Amicable always" 32621254 None FALSE 3 FALSE 819091788 sz3t2ap7z8bpkdvdvi195f3i35949bv 399 None new None "User talk:Dr.vivek163"
|
||||||
|
FALSE 56237366 2018-01-07 10:41:31 FALSE "ClueBot NG" 13286072 None FALSE 3 FALSE 819091796 r6s5j8j3iykenrhuhpnkpsmmd71vubf 1260 None None 1 "User talk:Twistorl"
|
||||||
|
FALSE 56237368 2018-01-07 10:41:51 FALSE "Khruner" 8409334 None FALSE 0 FALSE 819091825 tf5qz2yaswx61zrlm9ovxzuhl7r2dc4 2249 None AES, jpg, the, the, the, the, and, you, Tor 67, 119 "Kom Firin"
|
||||||
|
FALSE 56237368 2018-01-27 12:16:02 FALSE "Khruner" 8409334 None TRUE 0 FALSE 822610647 e6oa4g0qv64icdaq26uu1zzbyr5hcbh 2230 None None None "Kom Firin"
|
||||||
|
FALSE 56237369 2018-01-07 10:42:05 FALSE "Editingaccount1994" 32794215 Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier FALSE 2 FALSE 819091844 0fyvyh2a8xu41gt8obr34oba0bfixj6 27840 None AES, nom None "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237369 2018-01-07 11:09:52 FALSE "AnomieBOT" 7611264 Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier TRUE 2 FALSE 819093984 8gy52aolt5rg3eaketwj5v7eiw0apv2 27787 None web, See, for None "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237369 2018-01-12 21:45:50 FALSE "SporkBot" 12406635 Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier TRUE 2 FALSE 820064189 he8ydemaanxlrpftqxkez8jfpge1fsj 27784 None per, TFD, TFD None "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237369 2018-01-12 23:28:11 FALSE "SporkBot" 12406635 Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier TRUE 2 FALSE 820078679 0to17w9rth3url8n7gvucdtobybdq5h 27783 None per, for, Log, TFD 2010, 13 "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237369 2018-01-12 23:28:39 FALSE "SporkBot" 12406635 Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier TRUE 2 FALSE 820078733 531dizmmloyxffbkdr5vph7owh921eg 27782 None per, for, Log, TFD 2011, 17 "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237369 2018-01-13 13:45:33 FALSE "Frietjes" 13791031 Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier FALSE 2 FALSE 820177382 nik9p2u2fuk4yazjxt8ymbicxv5qid9 27757 None you, are, tor, you None "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237369 2018-01-24 01:35:22 FALSE "CommonsDelinker" 2304267 Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier, Li Chevalier FALSE 2 FALSE 822038928 gwk6pampl8si1v5pv3kwgteg710sfw3 27667 None jpg, jpg, has, COM 16, 2018 "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237370 2018-01-07 10:42:20 FALSE "PamD" 1368779 None FALSE 0 FALSE 819091874 n4ozbsgle13p9yywtfrz982ccj8woc9 25 None alt None "Anita del Rey"
|
||||||
|
FALSE 56237371 2018-01-07 10:42:27 FALSE "ClueBot NG" 13286072 None FALSE 3 FALSE 819091883 ksohnvsbeuzwpl5vb8a3v8m18hva0a7 1274 None None 119, 94, 96, 157, 119, 94, 96, 157, 1 "User talk:119.94.96.157"
|
||||||
|
FALSE 56237372 2018-01-07 10:42:50 FALSE "Underbar dk" 677153 None FALSE 14 FALSE 819091914 je7aw21fedbwyqsyofpisdrynsu7olr 113 None AES None "Category:Ohmi Railway"
|
||||||
|
FALSE 56237375 2018-01-07 10:43:32 FALSE "TastyPoutine" 882433 None FALSE 3 FALSE 819091968 cpm4tkzcx4hc6irr9ukbi06ogud8dtq 199 None AES None "User talk:92.226.219.222"
|
||||||
|
FALSE 56237375 2018-01-07 11:10:24 FALSE "AnomieBOT" 7611264 None TRUE 3 FALSE 819094036 artmfz8b2gxhb3pp8a5p4ksplxqfkpg 1840 None See, for None "User talk:92.226.219.222"
|
||||||
|
FALSE 56237375 2018-01-07 14:33:36 FALSE "Only" 702940 None FALSE 3 FALSE 819112363 dn9wj0n8d8pdd5lqe56uw5xamupowr1 2949 None has, has None "User talk:92.226.219.222"
|
||||||
|
FALSE 56237376 2018-01-07 10:44:01 FALSE "Dipayanacharya" 32794237 None FALSE 2 FALSE 819092004 ofueugwatmmn7u73isw732neuza57gk 28 None None None "User:Dipayanacharya"
|
||||||
|
FALSE 56237376 2018-01-07 10:49:08 FALSE "Dipayanacharya" 32794237 None FALSE 2 FALSE 819092390 dsz55xv96ec2uv6w9c1z7c52ipfovbw 38 None None None "User:Dipayanacharya"
|
||||||
|
FALSE 56237378 2018-01-07 10:44:56 FALSE "Vinegarymass911" 21516552 None FALSE 0 FALSE 819092066 9ma38hak0ef1ew4fpiutxpnzd8oz1wd 65 None AES, and None "BSCIC"
|
||||||
|
FALSE 56237379 2018-01-07 10:45:21 FALSE "BrownHairedGirl" 754619 None FALSE 14 FALSE 819092102 4dvakoat58bzyf5hmtthxukt29hip6n 285 None AES, Non None "Category:Women government ministers of Yemen"
|
||||||
|
FALSE 56237381 2018-01-07 10:45:54 FALSE "PRehse" 410898 None FALSE 1 FALSE 819092135 2sjrxsc7os9k9pg4su2t4rk2j8nn0h7 103 None AES, low, low None "Talk:List of Morning Glories Characters"
|
||||||
|
FALSE 56237382 2018-01-07 10:45:56 FALSE "ClueBot NG" 13286072 None FALSE 3 FALSE 819092138 3y9t5wpk6ur5jhone75rhm4wjf01fgi 1330 None None 106, 207, 126, 114, 106, 207, 126, 114, 1 "User talk:106.207.126.114"
|
||||||
|
FALSE 56237382 2018-01-07 10:50:22 FALSE "HindWIKI" 31190506 None FALSE 3 FALSE 819092495 8wvn6vh3isyt0dorpe89lztrburgupe 2355 None None None "User talk:106.207.126.114"
|
||||||
|
27
test/baseline_output/capturegroup_regextest_1.tsv
Normal file
27
test/baseline_output/capturegroup_regextest_1.tsv
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
anon articleid date_time deleted editor editor_id minor namespace npov_neutral npov_npov revert reverteds revid sha1 testcase_a testcase_b testcase_c testcase_d text_chars title
|
||||||
|
FALSE 56237363 2018-01-07 10:40:58 FALSE "NinjaRobotPirate" 3742946 FALSE 3 None None FALSE 819091731 135nz8q6lfam6cojla7azb7k5alx3t3 None None None None 1141 "User talk:86.139.142.254"
|
||||||
|
FALSE 56237364 2018-01-07 10:41:10 FALSE "Kavin kavitha" 32792125 FALSE 3 None None FALSE 819091755 0pwezjc6yopz0smc8al6ogc4fax5bwo None None None None 663 "User talk:Kavin kavitha"
|
||||||
|
FALSE 56237365 2018-01-07 10:41:26 FALSE "Amicable always" 32621254 FALSE 3 None NPOV, NPOV FALSE 819091788 sz3t2ap7z8bpkdvdvi195f3i35949bv None None None None 399 "User talk:Dr.vivek163"
|
||||||
|
FALSE 56237366 2018-01-07 10:41:31 FALSE "ClueBot NG" 13286072 FALSE 3 None None FALSE 819091796 r6s5j8j3iykenrhuhpnkpsmmd71vubf None None None None 1260 "User talk:Twistorl"
|
||||||
|
FALSE 56237368 2018-01-07 10:41:51 FALSE "Khruner" 8409334 FALSE 0 None NPOV FALSE 819091825 tf5qz2yaswx61zrlm9ovxzuhl7r2dc4 None TestCaseB None None 2249 "Kom Firin"
|
||||||
|
FALSE 56237368 2018-01-27 12:16:02 FALSE "Khruner" 8409334 TRUE 0 None None FALSE 822610647 e6oa4g0qv64icdaq26uu1zzbyr5hcbh None None None None 2230 "Kom Firin"
|
||||||
|
FALSE 56237369 2018-01-07 10:42:05 FALSE "Editingaccount1994" 32794215 FALSE 2 None None FALSE 819091844 0fyvyh2a8xu41gt8obr34oba0bfixj6 None None None None 27840 "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237369 2018-01-07 11:09:52 FALSE "AnomieBOT" 7611264 TRUE 2 None None FALSE 819093984 8gy52aolt5rg3eaketwj5v7eiw0apv2 None None None None 27787 "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237369 2018-01-12 21:45:50 FALSE "SporkBot" 12406635 TRUE 2 None None FALSE 820064189 he8ydemaanxlrpftqxkez8jfpge1fsj None None None None 27784 "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237369 2018-01-12 23:28:11 FALSE "SporkBot" 12406635 TRUE 2 None None FALSE 820078679 0to17w9rth3url8n7gvucdtobybdq5h None None None None 27783 "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237369 2018-01-12 23:28:39 FALSE "SporkBot" 12406635 TRUE 2 None None FALSE 820078733 531dizmmloyxffbkdr5vph7owh921eg None None None None 27782 "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237369 2018-01-13 13:45:33 FALSE "Frietjes" 13791031 FALSE 2 None None FALSE 820177382 nik9p2u2fuk4yazjxt8ymbicxv5qid9 None None None TestCaseD 27757 "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237369 2018-01-24 01:35:22 FALSE "CommonsDelinker" 2304267 FALSE 2 None None FALSE 822038928 gwk6pampl8si1v5pv3kwgteg710sfw3 None None None None 27667 "User:Editingaccount1994/sandbox"
|
||||||
|
FALSE 56237370 2018-01-07 10:42:20 FALSE "PamD" 1368779 FALSE 0 None None FALSE 819091874 n4ozbsgle13p9yywtfrz982ccj8woc9 None None None None 25 "Anita del Rey"
|
||||||
|
FALSE 56237371 2018-01-07 10:42:27 FALSE "ClueBot NG" 13286072 FALSE 3 None None FALSE 819091883 ksohnvsbeuzwpl5vb8a3v8m18hva0a7 None None None None 1274 "User talk:119.94.96.157"
|
||||||
|
FALSE 56237372 2018-01-07 10:42:50 FALSE "Underbar dk" 677153 FALSE 14 None None FALSE 819091914 je7aw21fedbwyqsyofpisdrynsu7olr None None None None 113 "Category:Ohmi Railway"
|
||||||
|
FALSE 56237375 2018-01-07 10:43:32 FALSE "TastyPoutine" 882433 FALSE 3 None None FALSE 819091968 cpm4tkzcx4hc6irr9ukbi06ogud8dtq None None None None 199 "User talk:92.226.219.222"
|
||||||
|
FALSE 56237375 2018-01-07 11:10:24 FALSE "AnomieBOT" 7611264 TRUE 3 None None FALSE 819094036 artmfz8b2gxhb3pp8a5p4ksplxqfkpg None None None None 1840 "User talk:92.226.219.222"
|
||||||
|
FALSE 56237375 2018-01-07 14:33:36 FALSE "Only" 702940 FALSE 3 None None FALSE 819112363 dn9wj0n8d8pdd5lqe56uw5xamupowr1 None None None None 2949 "User talk:92.226.219.222"
|
||||||
|
FALSE 56237376 2018-01-07 10:44:01 FALSE "Dipayanacharya" 32794237 FALSE 2 None None FALSE 819092004 ofueugwatmmn7u73isw732neuza57gk None None None None 28 "User:Dipayanacharya"
|
||||||
|
FALSE 56237376 2018-01-07 10:49:08 FALSE "Dipayanacharya" 32794237 FALSE 2 None None FALSE 819092390 dsz55xv96ec2uv6w9c1z7c52ipfovbw None None None None 38 "User:Dipayanacharya"
|
||||||
|
FALSE 56237378 2018-01-07 10:44:56 FALSE "Vinegarymass911" 21516552 FALSE 0 None None FALSE 819092066 9ma38hak0ef1ew4fpiutxpnzd8oz1wd None None None None 65 "BSCIC"
|
||||||
|
FALSE 56237379 2018-01-07 10:45:21 FALSE "BrownHairedGirl" 754619 FALSE 14 None None FALSE 819092102 4dvakoat58bzyf5hmtthxukt29hip6n None None None None 285 "Category:Women government ministers of Yemen"
|
||||||
|
FALSE 56237381 2018-01-07 10:45:54 FALSE "PRehse" 410898 FALSE 1 None None FALSE 819092135 2sjrxsc7os9k9pg4su2t4rk2j8nn0h7 None None None None 103 "Talk:List of Morning Glories Characters"
|
||||||
|
FALSE 56237382 2018-01-07 10:45:56 FALSE "ClueBot NG" 13286072 FALSE 3 None None FALSE 819092138 3y9t5wpk6ur5jhone75rhm4wjf01fgi None None None None 1330 "User talk:106.207.126.114"
|
||||||
|
FALSE 56237382 2018-01-07 10:50:22 FALSE "HindWIKI" 31190506 FALSE 3 None None FALSE 819092495 8wvn6vh3isyt0dorpe89lztrburgupe None None None None 2355 "User talk:106.207.126.114"
|
||||||
|
File diff suppressed because it is too large
Load Diff
15421
test/baseline_output/namespaces_ikwiki-20180301-pages-meta-history.tsv
Normal file
15421
test/baseline_output/namespaces_ikwiki-20180301-pages-meta-history.tsv
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
4652
test/baseline_output/persistence_segment_sailormoon.tsv
Normal file
4652
test/baseline_output/persistence_segment_sailormoon.tsv
Normal file
File diff suppressed because it is too large
Load Diff
27
test/baseline_output/regextest.tsv
Normal file
27
test/baseline_output/regextest.tsv
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
anon articleid chev_com date_time deleted editor editor_id minor namespace revert reverteds revid sha1 text_chars title warning wiki_welcome
|
||||||
|
FALSE 56237363 None 2018-01-07 10:40:58 FALSE "NinjaRobotPirate" 3742946 FALSE 3 FALSE 819091731 135nz8q6lfam6cojla7azb7k5alx3t3 1141 "User talk:86.139.142.254" None None
|
||||||
|
FALSE 56237364 None 2018-01-07 10:41:10 FALSE "Kavin kavitha" 32792125 FALSE 3 FALSE 819091755 0pwezjc6yopz0smc8al6ogc4fax5bwo 663 "User talk:Kavin kavitha" None None
|
||||||
|
FALSE 56237365 None 2018-01-07 10:41:26 FALSE "Amicable always" 32621254 FALSE 3 FALSE 819091788 sz3t2ap7z8bpkdvdvi195f3i35949bv 399 "User talk:Dr.vivek163" None None
|
||||||
|
FALSE 56237366 None 2018-01-07 10:41:31 FALSE "ClueBot NG" 13286072 FALSE 3 FALSE 819091796 r6s5j8j3iykenrhuhpnkpsmmd71vubf 1260 "User talk:Twistorl" Warning welcome to Wikipedia
|
||||||
|
FALSE 56237368 None 2018-01-07 10:41:51 FALSE "Khruner" 8409334 FALSE 0 FALSE 819091825 tf5qz2yaswx61zrlm9ovxzuhl7r2dc4 2249 "Kom Firin" None None
|
||||||
|
FALSE 56237368 None 2018-01-27 12:16:02 FALSE "Khruner" 8409334 TRUE 0 FALSE 822610647 e6oa4g0qv64icdaq26uu1zzbyr5hcbh 2230 "Kom Firin" None None
|
||||||
|
FALSE 56237369 Chevalier, Chevalier 2018-01-07 10:42:05 FALSE "Editingaccount1994" 32794215 FALSE 2 FALSE 819091844 0fyvyh2a8xu41gt8obr34oba0bfixj6 27840 "User:Editingaccount1994/sandbox" None None
|
||||||
|
FALSE 56237369 None 2018-01-07 11:09:52 FALSE "AnomieBOT" 7611264 TRUE 2 FALSE 819093984 8gy52aolt5rg3eaketwj5v7eiw0apv2 27787 "User:Editingaccount1994/sandbox" None None
|
||||||
|
FALSE 56237369 None 2018-01-12 21:45:50 FALSE "SporkBot" 12406635 TRUE 2 FALSE 820064189 he8ydemaanxlrpftqxkez8jfpge1fsj 27784 "User:Editingaccount1994/sandbox" None None
|
||||||
|
FALSE 56237369 None 2018-01-12 23:28:11 FALSE "SporkBot" 12406635 TRUE 2 FALSE 820078679 0to17w9rth3url8n7gvucdtobybdq5h 27783 "User:Editingaccount1994/sandbox" None None
|
||||||
|
FALSE 56237369 None 2018-01-12 23:28:39 FALSE "SporkBot" 12406635 TRUE 2 FALSE 820078733 531dizmmloyxffbkdr5vph7owh921eg 27782 "User:Editingaccount1994/sandbox" None None
|
||||||
|
FALSE 56237369 None 2018-01-13 13:45:33 FALSE "Frietjes" 13791031 FALSE 2 FALSE 820177382 nik9p2u2fuk4yazjxt8ymbicxv5qid9 27757 "User:Editingaccount1994/sandbox" None None
|
||||||
|
FALSE 56237369 Chevalier, Chevalier 2018-01-24 01:35:22 FALSE "CommonsDelinker" 2304267 FALSE 2 FALSE 822038928 gwk6pampl8si1v5pv3kwgteg710sfw3 27667 "User:Editingaccount1994/sandbox" None None
|
||||||
|
FALSE 56237370 None 2018-01-07 10:42:20 FALSE "PamD" 1368779 FALSE 0 FALSE 819091874 n4ozbsgle13p9yywtfrz982ccj8woc9 25 "Anita del Rey" None None
|
||||||
|
FALSE 56237371 None 2018-01-07 10:42:27 FALSE "ClueBot NG" 13286072 FALSE 3 FALSE 819091883 ksohnvsbeuzwpl5vb8a3v8m18hva0a7 1274 "User talk:119.94.96.157" Warning welcome to Wikipedia
|
||||||
|
FALSE 56237372 None 2018-01-07 10:42:50 FALSE "Underbar dk" 677153 FALSE 14 FALSE 819091914 je7aw21fedbwyqsyofpisdrynsu7olr 113 "Category:Ohmi Railway" None None
|
||||||
|
FALSE 56237375 None 2018-01-07 10:43:32 FALSE "TastyPoutine" 882433 FALSE 3 FALSE 819091968 cpm4tkzcx4hc6irr9ukbi06ogud8dtq 199 "User talk:92.226.219.222" None None
|
||||||
|
FALSE 56237375 None 2018-01-07 11:10:24 FALSE "AnomieBOT" 7611264 TRUE 3 FALSE 819094036 artmfz8b2gxhb3pp8a5p4ksplxqfkpg 1840 "User talk:92.226.219.222" None None
|
||||||
|
FALSE 56237375 None 2018-01-07 14:33:36 FALSE "Only" 702940 FALSE 3 FALSE 819112363 dn9wj0n8d8pdd5lqe56uw5xamupowr1 2949 "User talk:92.226.219.222" None None
|
||||||
|
FALSE 56237376 None 2018-01-07 10:44:01 FALSE "Dipayanacharya" 32794237 FALSE 2 FALSE 819092004 ofueugwatmmn7u73isw732neuza57gk 28 "User:Dipayanacharya" None None
|
||||||
|
FALSE 56237376 None 2018-01-07 10:49:08 FALSE "Dipayanacharya" 32794237 FALSE 2 FALSE 819092390 dsz55xv96ec2uv6w9c1z7c52ipfovbw 38 "User:Dipayanacharya" None None
|
||||||
|
FALSE 56237378 None 2018-01-07 10:44:56 FALSE "Vinegarymass911" 21516552 FALSE 0 FALSE 819092066 9ma38hak0ef1ew4fpiutxpnzd8oz1wd 65 "BSCIC" None None
|
||||||
|
FALSE 56237379 None 2018-01-07 10:45:21 FALSE "BrownHairedGirl" 754619 FALSE 14 FALSE 819092102 4dvakoat58bzyf5hmtthxukt29hip6n 285 "Category:Women government ministers of Yemen" None None
|
||||||
|
FALSE 56237381 None 2018-01-07 10:45:54 FALSE "PRehse" 410898 FALSE 1 FALSE 819092135 2sjrxsc7os9k9pg4su2t4rk2j8nn0h7 103 "Talk:List of Morning Glories Characters" None None
|
||||||
|
FALSE 56237382 None 2018-01-07 10:45:56 FALSE "ClueBot NG" 13286072 FALSE 3 FALSE 819092138 3y9t5wpk6ur5jhone75rhm4wjf01fgi 1330 "User talk:106.207.126.114" Warning welcome to Wikipedia
|
||||||
|
FALSE 56237382 None 2018-01-07 10:50:22 FALSE "HindWIKI" 31190506 FALSE 3 FALSE 819092495 8wvn6vh3isyt0dorpe89lztrburgupe 2355 "User talk:106.207.126.114" None welcome to Wikipedia
|
||||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
BIN
test/dumps/regextest.xml.bz2
Normal file
BIN
test/dumps/regextest.xml.bz2
Normal file
Binary file not shown.
275
wikiq
275
wikiq
@@ -3,6 +3,7 @@
|
|||||||
# original wikiq headers are: title articleid revid date_time anon
|
# original wikiq headers are: title articleid revid date_time anon
|
||||||
# editor editor_id minor text_size text_entropy text_md5 reversion
|
# editor editor_id minor text_size text_entropy text_md5 reversion
|
||||||
# additions_size deletions_size
|
# additions_size deletions_size
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import sys
|
import sys
|
||||||
import os, os.path
|
import os, os.path
|
||||||
@@ -21,6 +22,13 @@ from urllib.parse import quote
|
|||||||
TO_ENCODE = ('title', 'editor')
|
TO_ENCODE = ('title', 'editor')
|
||||||
PERSISTENCE_RADIUS=7
|
PERSISTENCE_RADIUS=7
|
||||||
from deltas import SequenceMatcher
|
from deltas import SequenceMatcher
|
||||||
|
from deltas import SegmentMatcher
|
||||||
|
|
||||||
|
class PersistMethod:
    """Integer codes naming the available token-persistence strategies."""

    # none     -> 0: persistence is not computed
    # sequence -> 1: diff with deltas.SequenceMatcher
    # segment  -> 2: diff with deltas.SegmentMatcher
    # legacy   -> 3: the older mw.lib persistence implementation
    none, sequence, segment, legacy = range(4)
|
||||||
|
|
||||||
def calculate_persistence(tokens_added):
|
def calculate_persistence(tokens_added):
|
||||||
return(sum([(len(x.revisions)-1) for x in tokens_added]),
|
return(sum([(len(x.revisions)-1) for x in tokens_added]),
|
||||||
@@ -32,11 +40,15 @@ class WikiqIterator():
|
|||||||
self.fh = fh
|
self.fh = fh
|
||||||
self.collapse_user = collapse_user
|
self.collapse_user = collapse_user
|
||||||
self.mwiterator = Dump.from_file(self.fh)
|
self.mwiterator = Dump.from_file(self.fh)
|
||||||
|
self.namespace_map = { ns.id : ns.name for ns in
|
||||||
|
self.mwiterator.site_info.namespaces }
|
||||||
self.__pages = self.load_pages()
|
self.__pages = self.load_pages()
|
||||||
|
|
||||||
def load_pages(self):
    """Lazily wrap every page of the underlying dump in a WikiqPage.

    Each wrapper receives this iterator's namespace map and its
    collapse_user setting.
    """
    yield from (
        WikiqPage(raw_page,
                  namespace_map=self.namespace_map,
                  collapse_user=self.collapse_user)
        for raw_page in self.mwiterator
    )
|
||||||
|
|
||||||
def __iter__(self):
    """Return the page generator that __init__ created via load_pages()."""
    return self.__pages
|
||||||
@@ -49,13 +61,19 @@ class WikiqPage():
|
|||||||
'restrictions', 'mwpage', '__revisions',
|
'restrictions', 'mwpage', '__revisions',
|
||||||
'collapse_user')
|
'collapse_user')
|
||||||
|
|
||||||
def __init__(self, page, collapse_user=False):
|
def __init__(self, page, namespace_map, collapse_user=False):
|
||||||
self.id = page.id
|
self.id = page.id
|
||||||
self.title = page.title
|
|
||||||
self.namespace = page.namespace
|
self.namespace = page.namespace
|
||||||
self.redirect = page.redirect
|
# following mwxml, we assume namespace 0 in cases where
|
||||||
|
# page.namespace is inconsistent with namespace_map
|
||||||
|
if page.namespace not in namespace_map:
|
||||||
|
self.title = page.title
|
||||||
|
page.namespace = 0
|
||||||
|
if page.namespace != 0:
|
||||||
|
self.title = ':'.join([namespace_map[page.namespace], page.title])
|
||||||
|
else:
|
||||||
|
self.title = page.title
|
||||||
self.restrictions = page.restrictions
|
self.restrictions = page.restrictions
|
||||||
|
|
||||||
self.collapse_user = collapse_user
|
self.collapse_user = collapse_user
|
||||||
self.mwpage = page
|
self.mwpage = page
|
||||||
self.__revisions = self.rev_list()
|
self.__revisions = self.rev_list()
|
||||||
@@ -109,20 +127,122 @@ class WikiqPage():
|
|||||||
def __next__(self):
    """Return the next item from the revision iterator built by rev_list()."""
    return next(self.__revisions)
|
||||||
|
|
||||||
class WikiqParser():
|
|
||||||
|
|
||||||
|
class RegexPair(object):
    """A compiled regular expression together with the column label it fills.

    If the pattern has *named* capture groups, one output column is produced
    per group, keyed ``"<label>_<group name>"``.  Otherwise a single column
    keyed ``label`` is produced.

    Parameters:
       pattern : regular expression source, compiled with re.compile
       label : column name (or column-name prefix when named groups exist)
    """

    def __init__(self, pattern, label):
        self.pattern = re.compile(pattern)
        self.label = label
        # Only named groups drive the per-group columns; the list is always
        # defined (possibly empty) so it is safe to read unconditionally.
        self.capture_groups = list(self.pattern.groupindex)
        self.has_groups = bool(self.capture_groups)

    def _make_key(self, cap_group):
        """Return the column name used for the named group `cap_group`."""
        return ("{}_{}".format(self.label, cap_group))

    def matchmake(self, content, rev_data, count_only=False):
        """Add this pair's match columns to `rev_data` and return it.

        Parameters:
           content : text to search; None is treated as "no matches"
           rev_data : dict of column values; updated in place
           count_only : when true, store the number of matches instead of
                        the matched text

        Returns the same `rev_data` dict.  Text columns hold the matches
        joined with ', ' or None when there are none; count columns always
        hold an integer (0 when there are none).
        """
        if self.has_groups:
            new_columns = self._match_named_groups(content, count_only)
        else:
            new_columns = self._match_whole_pattern(content, count_only)

        # update rev_data with our new columns
        rev_data.update(new_columns)
        return rev_data

    def _match_named_groups(self, content, count_only):
        """Build one column per named capture group."""
        if content is None:
            matchobjects = []
        else:
            matchobjects = list(self.pattern.finditer(content))

        columns = {}
        for cap_group in self.capture_groups:
            # a group that did not participate in a match reports None
            captured = (match.group(cap_group) for match in matchobjects)
            found = [text for text in captured if text is not None]

            if count_only:
                # always an integer: a group that never participated counts
                # as 0, exactly like a revision with no matches at all
                columns[self._make_key(cap_group)] = len(found)
            else:
                columns[self._make_key(cap_group)] = ', '.join(found) if found else None
        return columns

    def _match_whole_pattern(self, content, count_only):
        """Build the single column used when there are no named groups."""
        # findall() returns [] when nothing matches, so no separate search()
        # pass over the content is needed
        found = [] if content is None else self.pattern.findall(content)

        if count_only:
            return {self.label: len(found)}
        if not found:
            return {self.label: None}

        if self.pattern.groups > 1:
            # with several unnamed groups findall() yields tuples, which
            # str.join cannot handle; keep the non-empty group texts in order
            found = [text for groups in found for text in groups if text]
        return {self.label: ', '.join(found)}
|
||||||
|
|
||||||
def __init__(self, input_file, output_file, collapse_user=False, persist=False, urlencode=False, persist_legacy=False):
|
|
||||||
|
|
||||||
|
class WikiqParser():
|
||||||
|
def __init__(self, input_file, output_file, regex_revision_match, regex_revision_label, regex_revision_output_count, regex_comment_match, regex_comment_label, regex_comment_output_count, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15):
    """
    Parameters:
       input_file : stored as self.input_file
       output_file : stored as self.output_file
       regex_revision_match : regex patterns applied to revision text, or None
       regex_revision_label : labels paired one-to-one with regex_revision_match
       regex_revision_output_count : if true, revision-text columns hold match counts
       regex_comment_match : regex patterns applied to revision comments, or None
       regex_comment_label : labels paired one-to-one with regex_comment_match
       regex_comment_output_count : if true, comment columns hold match counts
       collapse_user : stored as self.collapse_user
       persist : what persistence method to use. Takes a PersistMethod value;
                 None means PersistMethod.none
       urlencode : stored as self.urlencode
       namespaces : namespace ids to keep; None disables namespace filtering
       revert_radius : radius handed to the revert detector
    """
    self.input_file = input_file
    self.output_file = output_file
    self.collapse_user = collapse_user
    # process() decides whether to compute persistence by comparing against
    # PersistMethod.none, so the None default must be normalized here;
    # otherwise a default-constructed parser would be treated as requesting
    # persistence and fall through to the legacy implementation.
    self.persist = PersistMethod.none if persist is None else persist
    self.printed_header = False
    self.namespaces = []
    self.urlencode = urlencode
    self.revert_radius = revert_radius

    if namespaces is not None:
        self.namespace_filter = set(namespaces)
    else:
        self.namespace_filter = None

    # make_matchmake_pairs exits the program when patterns and labels do not
    # line up one-to-one
    self.regex_revision_pairs = self.make_matchmake_pairs(regex_revision_match, regex_revision_label)
    self.regex_revision_output_count = regex_revision_output_count

    self.regex_comment_pairs = self.make_matchmake_pairs(regex_comment_match, regex_comment_label)
    self.regex_comment_output_count = regex_comment_output_count
|
||||||
|
|
||||||
|
def make_matchmake_pairs(self, patterns, labels):
    """Zip regex patterns with their labels into a list of RegexPair objects.

    Returns an empty list when neither patterns nor labels were supplied.
    Exits the program when only one of them was supplied or when their
    lengths differ.
    """
    # nothing requested at all
    if patterns is None and labels is None:
        return []

    # one side missing, or the two sides do not line up
    if patterns is None or labels is None or len(patterns) != len(labels):
        sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')

    return [RegexPair(regex, name) for regex, name in zip(patterns, labels)]
|
||||||
|
|
||||||
|
def matchmake(self, rev, rev_data):
    """Run the revision-text regexes, then the comment regexes, on `rev`.

    Returns rev_data as returned by the comment pass.
    """
    after_text_pass = self.matchmake_revision(rev.text, rev_data)
    return self.matchmake_comment(rev.comment, after_text_pass)
|
||||||
|
|
||||||
|
def matchmake_revision(self, text, rev_data):
    """Apply the revision-text regex pairs to `text` and return rev_data."""
    pairs = self.regex_revision_pairs
    count_only = self.regex_revision_output_count
    return self.matchmake_pairs(text, rev_data, pairs, count_only)
|
||||||
|
|
||||||
|
def matchmake_comment(self, comment, rev_data):
    """Apply the comment regex pairs to `comment` and return rev_data."""
    pairs = self.regex_comment_pairs
    count_only = self.regex_comment_output_count
    return self.matchmake_pairs(comment, rev_data, pairs, count_only)
|
||||||
|
|
||||||
|
def matchmake_pairs(self, text, rev_data, pairs, count_only):
    """Thread rev_data through every pair's matchmake(), in order.

    Each pair receives the value returned by the previous one; the final
    value is returned (rev_data itself when `pairs` is empty).
    """
    accumulated = rev_data
    for regex_pair in pairs:
        accumulated = regex_pair.matchmake(text, accumulated, count_only)
    return accumulated
|
||||||
|
|
||||||
def __get_namespace_from_title(self, title):
|
def __get_namespace_from_title(self, title):
|
||||||
default_ns = None
|
default_ns = None
|
||||||
|
|
||||||
@@ -138,6 +258,7 @@ class WikiqParser():
|
|||||||
# if we've made it this far with no matches, we return the default namespace
|
# if we've made it this far with no matches, we return the default namespace
|
||||||
return default_ns
|
return default_ns
|
||||||
|
|
||||||
|
|
||||||
def process(self):
|
def process(self):
|
||||||
|
|
||||||
# create a regex that creates the output filename
|
# create a regex that creates the output filename
|
||||||
@@ -157,29 +278,46 @@ class WikiqParser():
|
|||||||
|
|
||||||
# Iterate through pages
|
# Iterate through pages
|
||||||
for page in dump:
|
for page in dump:
|
||||||
rev_detector = mwreverts.Detector()
|
namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)
|
||||||
|
|
||||||
if self.persist or self.persist_legacy:
|
# skip namespaces not in the filter
|
||||||
|
if self.namespace_filter is not None:
|
||||||
|
if namespace not in self.namespace_filter:
|
||||||
|
continue
|
||||||
|
|
||||||
|
rev_detector = mwreverts.Detector(radius = self.revert_radius)
|
||||||
|
|
||||||
|
if self.persist != PersistMethod.none:
|
||||||
window = deque(maxlen=PERSISTENCE_RADIUS)
|
window = deque(maxlen=PERSISTENCE_RADIUS)
|
||||||
|
|
||||||
if not self.persist_legacy:
|
if self.persist == PersistMethod.sequence:
|
||||||
state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
|
state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
|
||||||
revert_radius=PERSISTENCE_RADIUS)
|
revert_radius=PERSISTENCE_RADIUS)
|
||||||
|
|
||||||
|
elif self.persist == PersistMethod.segment:
|
||||||
|
state = mwpersistence.DiffState(SegmentMatcher(tokenizer = wikitext_split),
|
||||||
|
revert_radius=PERSISTENCE_RADIUS)
|
||||||
|
|
||||||
|
# self.persist == PersistMethod.legacy
|
||||||
else:
|
else:
|
||||||
from mw.lib import persistence
|
from mw.lib import persistence
|
||||||
state = persistence.State()
|
state = persistence.State()
|
||||||
|
|
||||||
# Iterate through a page's revisions
|
# Iterate through a page's revisions
|
||||||
for rev in page:
|
for rev in page:
|
||||||
|
|
||||||
|
# initialize rev_data
|
||||||
|
rev_data = {
|
||||||
|
'revid':rev.id,
|
||||||
|
'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
|
||||||
|
'articleid' : page.id,
|
||||||
|
'editor_id' : "" if rev.deleted.user == True or rev.user.id is None else rev.user.id,
|
||||||
|
'title' : '"' + page.title + '"',
|
||||||
|
'namespace' : namespace,
|
||||||
|
'deleted' : "TRUE" if rev.deleted.text else "FALSE"
|
||||||
|
}
|
||||||
|
|
||||||
rev_data = {'revid' : rev.id,
|
rev_data = self.matchmake(rev, rev_data)
|
||||||
'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
|
|
||||||
'articleid' : page.id,
|
|
||||||
'editor_id' : "" if rev.deleted.user == True or rev.user.id is None else rev.user.id,
|
|
||||||
'title' : '"' + page.title + '"',
|
|
||||||
'namespace' : page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title),
|
|
||||||
'deleted' : "TRUE" if rev.deleted.text else "FALSE" }
|
|
||||||
|
|
||||||
# if revisions are deleted, /many/ things will be missing
|
# if revisions are deleted, /many/ things will be missing
|
||||||
if rev.deleted.text:
|
if rev.deleted.text:
|
||||||
@@ -204,7 +342,7 @@ class WikiqParser():
|
|||||||
|
|
||||||
# TODO rev.bytes doesn't work.. looks like a bug
|
# TODO rev.bytes doesn't work.. looks like a bug
|
||||||
rev_data['text_chars'] = len(rev.text)
|
rev_data['text_chars'] = len(rev.text)
|
||||||
|
|
||||||
# generate revert data
|
# generate revert data
|
||||||
revert = rev_detector.process(text_sha1, rev.id)
|
revert = rev_detector.process(text_sha1, rev.id)
|
||||||
|
|
||||||
@@ -238,14 +376,15 @@ class WikiqParser():
|
|||||||
if self.collapse_user:
|
if self.collapse_user:
|
||||||
rev_data['collapsed_revs'] = rev.collapsed_revs
|
rev_data['collapsed_revs'] = rev.collapsed_revs
|
||||||
|
|
||||||
if self.persist or self.persist_legacy:
|
if self.persist != PersistMethod.none:
|
||||||
|
# initialize an empty dictionary before assigning things into it. this catches bugs if the first revision is deleted
|
||||||
|
old_rev_data = {}
|
||||||
if rev.deleted.text:
|
if rev.deleted.text:
|
||||||
|
|
||||||
for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
|
for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
|
||||||
old_rev_data[k] = None
|
old_rev_data[k] = None
|
||||||
else:
|
else:
|
||||||
|
|
||||||
if not self.persist_legacy:
|
if self.persist != PersistMethod.legacy:
|
||||||
_, tokens_added, tokens_removed = state.update(rev.text, rev.id)
|
_, tokens_added, tokens_removed = state.update(rev.text, rev.id)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
@@ -270,7 +409,7 @@ class WikiqParser():
|
|||||||
|
|
||||||
rev_count += 1
|
rev_count += 1
|
||||||
|
|
||||||
if self.persist or self.persist_legacy:
|
if self.persist != PersistMethod.none:
|
||||||
# print out metadata for the last RADIUS revisions
|
# print out metadata for the last RADIUS revisions
|
||||||
for i, item in enumerate(window):
|
for i, item in enumerate(window):
|
||||||
# if the window was full, we've already printed item 0
|
# if the window was full, we've already printed item 0
|
||||||
@@ -344,17 +483,59 @@ parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
|
|||||||
parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
|
parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
|
||||||
help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")
|
help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")
|
||||||
|
|
||||||
parser.add_argument('-p', '--persistence', dest="persist", action="store_true",
|
parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?',
|
||||||
help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure.")
|
help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. The defualt is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")
|
||||||
|
|
||||||
parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
|
parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
|
||||||
help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")
|
help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")
|
||||||
|
|
||||||
parser.add_argument('--persistence-legacy', dest="persist_legacy", action="store_true",
|
parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
|
||||||
help="Legacy behavior for persistence calculation. Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")
|
help="Id number of namspace to include. Can be specified more than once.")
|
||||||
|
|
||||||
|
# Revert detection window.
parser.add_argument(
    '-rr', '--revert-radius',
    dest="revert_radius",
    type=int,
    action='store',
    default=15,
    help="Number of edits to check when looking for reverts (default: 15)",
)

# Regex matching against revision text: pattern(s), column label(s), count-only switch.
parser.add_argument(
    '-RP', '--revision-pattern',
    dest="regex_revision_match",
    default=None,
    type=str,
    action='append',
    help="The regular expression to search for in revision text. The regex must be surrounded by quotes.",
)

parser.add_argument(
    '-RPl', '--revision-pattern-label',
    dest="regex_revision_label",
    default=None,
    type=str,
    action='append',
    help="The label for the outputted column based on matching the regex in revision text.",
)

parser.add_argument(
    '-RPc', '--revision-pattern-count',
    dest="regex_revision_output_count",
    action='store_true',
    help="If present, this will cause the revision patterns to return counts of the number of matches instead of the text of the matches themselves. It will affect all revision patterns.",
)

# Regex matching against revision comments: pattern(s), column label(s), count-only switch.
parser.add_argument(
    '-CP', '--comment-pattern',
    dest="regex_comment_match",
    default=None,
    type=str,
    action='append',
    help="The regular expression to search for in comments of revisions.",
)

parser.add_argument(
    '-CPl', '--comment-pattern-label',
    dest="regex_comment_label",
    default=None,
    type=str,
    action='append',
    help="The label for the outputted column based on matching the regex in comments.",
)

parser.add_argument(
    '-CPc', '--comment-pattern-count',
    dest="regex_comment_output_count",
    action='store_true',
    help="If present, this will cause the comments patterns to return counts of the number of matches instead of the text of the matches themselves. It will affect all comment patterns.",
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# set persistence method
|
||||||
|
|
||||||
|
# Translate the parsed -p/--persistence value into a PersistMethod member.
if args.persist is None:
    persist = PersistMethod.none
else:
    # Only "segment" and "legacy" are selected by name; every other
    # non-None value falls back to the sequence method.
    named_methods = {
        "segment": PersistMethod.segment,
        "legacy": PersistMethod.legacy,
    }
    persist = named_methods.get(args.persist, PersistMethod.sequence)
|
||||||
|
|
||||||
|
# args.namespace_filter is either None or the collected namespace ids;
# in both cases it is passed on unchanged.
namespaces = args.namespace_filter
|
||||||
|
|
||||||
if len(args.dumpfiles) > 0:
|
if len(args.dumpfiles) > 0:
|
||||||
for filename in args.dumpfiles:
|
for filename in args.dumpfiles:
|
||||||
input_file = open_input_file(filename)
|
input_file = open_input_file(filename)
|
||||||
@@ -373,12 +554,19 @@ if len(args.dumpfiles) > 0:
|
|||||||
filename = os.path.join(output_dir, os.path.basename(filename))
|
filename = os.path.join(output_dir, os.path.basename(filename))
|
||||||
output_file = open_output_file(filename)
|
output_file = open_output_file(filename)
|
||||||
|
|
||||||
wikiq = WikiqParser(input_file, output_file,
|
wikiq = WikiqParser(input_file,
|
||||||
|
output_file,
|
||||||
collapse_user=args.collapse_user,
|
collapse_user=args.collapse_user,
|
||||||
persist=args.persist,
|
persist=persist,
|
||||||
persist_legacy=args.persist_legacy,
|
urlencode=args.urlencode,
|
||||||
urlencode=args.urlencode)
|
namespaces=namespaces,
|
||||||
|
revert_radius=args.revert_radius,
|
||||||
|
regex_revision_match = args.regex_revision_match,
|
||||||
|
regex_revision_label = args.regex_revision_label,
|
||||||
|
regex_revision_output_count = args.regex_revision_output_count,
|
||||||
|
regex_comment_match = args.regex_comment_match,
|
||||||
|
regex_comment_label = args.regex_comment_label,
|
||||||
|
regex_comment_output_count = args.regex_comment_output_count)
|
||||||
|
|
||||||
wikiq.process()
|
wikiq.process()
|
||||||
|
|
||||||
@@ -386,12 +574,23 @@ if len(args.dumpfiles) > 0:
|
|||||||
input_file.close()
|
input_file.close()
|
||||||
output_file.close()
|
output_file.close()
|
||||||
else:
|
else:
|
||||||
wikiq = WikiqParser(sys.stdin, sys.stdout,
|
wikiq = WikiqParser(sys.stdin,
|
||||||
|
sys.stdout,
|
||||||
collapse_user=args.collapse_user,
|
collapse_user=args.collapse_user,
|
||||||
persist=args.persist,
|
persist=persist,
|
||||||
persist_legacy=args.persist_legacy,
|
#persist_legacy=args.persist_legacy,
|
||||||
urlencode=args.urlencode)
|
urlencode=args.urlencode,
|
||||||
wikiq.process()
|
namespaces=namespaces,
|
||||||
|
revert_radius=args.revert_radius,
|
||||||
|
regex_revision_match = args.regex_revision_match,
|
||||||
|
regex_revision_label = args.regex_revision_label,
|
||||||
|
regex_revision_output_count = args.regex_revision_output_count,
|
||||||
|
regex_comment_match = args.regex_comment_match,
|
||||||
|
regex_comment_label = args.regex_comment_label,
|
||||||
|
regex_comment_output_count = args.regex_comment_output_count)
|
||||||
|
|
||||||
|
|
||||||
|
wikiq.process()
|
||||||
|
|
||||||
# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
|
# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
|
||||||
# stop_words = stop_words.split(",")
|
# stop_words = stop_words.split(",")
|
||||||
|
|||||||
Reference in New Issue
Block a user