add flag for excluding whitespace and punctuation

This commit is contained in:
2018-12-12 16:38:47 -08:00
parent 19eda6dd0e
commit 2d5008113b
3 changed files with 106 additions and 47 deletions

View File

@@ -92,14 +92,53 @@ class Test_Persistence(unittest.TestCase):
self.assertEqual(test['tokens_added'][0],7)
self.assertEqual(test['tokens_added'][1],10)
self.assertEqual(test['tokens_added'][2],0)
self.assertEqual(test['tokens_added'][3],11)
self.assertEqual(test['tokens_added'][3],8)
self.assertEqual(test['tokens_added'][4],0)
self.assertEqual(test['tokens_removed'][0],0)
self.assertEqual(test['tokens_removed'][1],0)
self.assertEqual(test['tokens_removed'][2],10)
self.assertEqual(test['tokens_removed'][3],4)
self.assertEqual(test['tokens_removed'][4],0)
self.assertEqual(test['token_revs'][0],8*3)
self.assertEqual(test['token_revs'][1],0)
self.assertEqual(test['token_revs'][2],0)
self.assertEqual(test['token_revs'][3],0)
self.assertEqual(test['token_revs'][4],0)
baseline = pd.read_table(baseline_file)
assert_frame_equal(test,baseline)
def test_segment_persistence_exclude_ws(self):
test_filename = "segment_excludews_" + self.wikiq_out_name
test_file = os.path.join(self.test_output_dir, test_filename)
if os.path.exists(test_file):
os.remove(test_file)
call = self.base_call.format(self.input_file, self.test_output_dir)
call = call + " --url-encode --persistence segment --exclude-whitespace"
print(os.path.abspath('.'))
print(call)
proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
proc.wait()
copyfile(self.call_output, test_file)
baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
# as a test let's make sure that we get equal data frames
test = pd.read_table(test_file)
self.assertEqual(test['tokens_added'][0],4)
self.assertEqual(test['tokens_added'][1],5)
self.assertEqual(test['tokens_added'][2],0)
self.assertEqual(test['tokens_added'][3],6)
self.assertEqual(test['tokens_added'][4],0)
self.assertEqual(test['tokens_removed'][0],0)
self.assertEqual(test['tokens_removed'][1],0)
self.assertEqual(test['tokens_removed'][2],0)
self.assertEqual(test['tokens_removed'][3],7)
self.assertEqual(test['tokens_removed'][3],4)
self.assertEqual(test['tokens_removed'][4],0)
self.assertEqual(test['token_revs'][0],7*3)
self.assertEqual(test['token_revs'][0],4*3)
self.assertEqual(test['token_revs'][1],0)
self.assertEqual(test['token_revs'][2],0)
self.assertEqual(test['token_revs'][3],0)
@@ -289,6 +328,29 @@ class Test_Basic(unittest.TestCase):
baseline = pd.read_table(baseline_file)
assert_frame_equal(test,baseline)
def test_pwr_segment_collapse(self):
    """Run the command with ``--persistence segment --collapse-user`` and compare to the baseline.

    The command line is built from ``self.base_call`` (formatted with
    ``self.input_file`` and ``self.test_output_dir``), executed through the
    shell, and the file at ``self.call_output`` is then copied to a
    test-specific name and compared, as a data frame, against the baseline
    file of the same name under ``self.baseline_output_dir``.

    Raises:
        AssertionError: if the produced table differs from the baseline.
    """
    test_filename = "persistence_segment_collapse_" + self.wikiq_out_name
    test_file = os.path.join(self.test_output_dir, test_filename)
    # Drop any copy left behind by an earlier run so a stale file is never
    # the one that gets compared.
    if os.path.exists(test_file):
        os.remove(test_file)

    call = self.base_call.format(self.input_file, self.test_output_dir)
    call = call + " --persistence segment --collapse-user"
    print(call)
    proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
    # stdout is a pipe that nobody reads: a bare wait() can block forever
    # once the child fills the OS pipe buffer. communicate() drains the pipe
    # while waiting for the process to exit.
    proc.communicate()

    # NOTE(review): presumably the command writes its result to
    # self.call_output inside the output directory -- confirm against setUp.
    copyfile(self.call_output, test_file)
    baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)

    test = pd.read_table(test_file)
    print(test)
    baseline = pd.read_table(baseline_file)
    assert_frame_equal(test,baseline)
def test_pwr_legacy(self):
test_filename = "persistence_legacy_" + self.wikiq_out_name
test_file = os.path.join(self.test_output_dir, test_filename)

View File

@@ -0,0 +1,6 @@
anon articleid date_time deleted editor editor_id minor namespace revert reverteds revid sha1 text_chars title token_revs tokens_added tokens_removed tokens_window
TRUE 1 2010-12-13 02:51:55 FALSE %2276.102.106.52%22 FALSE 0 FALSE None e867e7cd0e53b58428b23ee346c6fa523756e7d1 21 %22Test%20Title%22 12 4 0 4
TRUE 1 2010-12-13 02:51:55 FALSE %2276.102.106.52%22 FALSE 0 FALSE None cb3aeff37c151ab439ee4a7a76cea85679272d5e 44 %22Test%20Title%22 0 5 0 3
TRUE 1 2010-12-13 02:51:55 FALSE %2276.102.106.52%22 FALSE 0 FALSE None cp0mfp4o90fbpyxooys1rw95zn3ddff 21 %22Test%20Title%22 0 0 0 2
TRUE 1 2010-12-13 02:51:55 FALSE %2276.102.106.53%22 FALSE 0 FALSE None 9e63d8f3d87be0ed4cc3fab3f750a4429997bead 31 %22Test%20Title%22 0 6 4 1
TRUE 1 2010-12-13 02:51:55 FALSE %2276.102.106.60%22 FALSE 0 TRUE "None,None,None" None e867e7cd0e53b58428b23ee346c6fa523756e7d1 21 %22Test%20Title%22 0 0 0 0
1 anon articleid date_time deleted editor editor_id minor namespace revert reverteds revid sha1 text_chars title token_revs tokens_added tokens_removed tokens_window
2 TRUE 1 2010-12-13 02:51:55 FALSE %2276.102.106.52%22 FALSE 0 FALSE None e867e7cd0e53b58428b23ee346c6fa523756e7d1 21 %22Test%20Title%22 12 4 0 4
3 TRUE 1 2010-12-13 02:51:55 FALSE %2276.102.106.52%22 FALSE 0 FALSE None cb3aeff37c151ab439ee4a7a76cea85679272d5e 44 %22Test%20Title%22 0 5 0 3
4 TRUE 1 2010-12-13 02:51:55 FALSE %2276.102.106.52%22 FALSE 0 FALSE None cp0mfp4o90fbpyxooys1rw95zn3ddff 21 %22Test%20Title%22 0 0 0 2
5 TRUE 1 2010-12-13 02:51:55 FALSE %2276.102.106.53%22 FALSE 0 FALSE None 9e63d8f3d87be0ed4cc3fab3f750a4429997bead 31 %22Test%20Title%22 0 6 4 1
6 TRUE 1 2010-12-13 02:51:55 FALSE %2276.102.106.60%22 FALSE 0 TRUE None,None,None None e867e7cd0e53b58428b23ee346c6fa523756e7d1 21 %22Test%20Title%22 0 0 0 0