add flag for excluding whitespace and punctuation
This commit is contained in:
@@ -92,14 +92,53 @@ class Test_Persistence(unittest.TestCase):
|
||||
self.assertEqual(test['tokens_added'][0],7)
|
||||
self.assertEqual(test['tokens_added'][1],10)
|
||||
self.assertEqual(test['tokens_added'][2],0)
|
||||
self.assertEqual(test['tokens_added'][3],11)
|
||||
self.assertEqual(test['tokens_added'][3],8)
|
||||
self.assertEqual(test['tokens_added'][4],0)
|
||||
self.assertEqual(test['tokens_removed'][0],0)
|
||||
self.assertEqual(test['tokens_removed'][1],0)
|
||||
self.assertEqual(test['tokens_removed'][2],10)
|
||||
self.assertEqual(test['tokens_removed'][3],4)
|
||||
self.assertEqual(test['tokens_removed'][4],0)
|
||||
self.assertEqual(test['token_revs'][0],8*3)
|
||||
self.assertEqual(test['token_revs'][1],0)
|
||||
self.assertEqual(test['token_revs'][2],0)
|
||||
self.assertEqual(test['token_revs'][3],0)
|
||||
self.assertEqual(test['token_revs'][4],0)
|
||||
|
||||
baseline = pd.read_table(baseline_file)
|
||||
assert_frame_equal(test,baseline)
|
||||
|
||||
|
||||
|
||||
def test_segment_persistence_exclude_ws(self):
|
||||
test_filename = "segment_excludews_" + self.wikiq_out_name
|
||||
test_file = os.path.join(self.test_output_dir, test_filename)
|
||||
if os.path.exists(test_file):
|
||||
os.remove(test_file)
|
||||
|
||||
call = self.base_call.format(self.input_file, self.test_output_dir)
|
||||
call = call + " --url-encode --persistence segment --exclude-whitespace"
|
||||
print(os.path.abspath('.'))
|
||||
print(call)
|
||||
proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
|
||||
proc.wait()
|
||||
|
||||
copyfile(self.call_output, test_file)
|
||||
baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
|
||||
|
||||
# as a test let's make sure that we get equal data frames
|
||||
test = pd.read_table(test_file)
|
||||
self.assertEqual(test['tokens_added'][0],4)
|
||||
self.assertEqual(test['tokens_added'][1],5)
|
||||
self.assertEqual(test['tokens_added'][2],0)
|
||||
self.assertEqual(test['tokens_added'][3],6)
|
||||
self.assertEqual(test['tokens_added'][4],0)
|
||||
self.assertEqual(test['tokens_removed'][0],0)
|
||||
self.assertEqual(test['tokens_removed'][1],0)
|
||||
self.assertEqual(test['tokens_removed'][2],0)
|
||||
self.assertEqual(test['tokens_removed'][3],7)
|
||||
self.assertEqual(test['tokens_removed'][3],4)
|
||||
self.assertEqual(test['tokens_removed'][4],0)
|
||||
self.assertEqual(test['token_revs'][0],7*3)
|
||||
self.assertEqual(test['token_revs'][0],4*3)
|
||||
self.assertEqual(test['token_revs'][1],0)
|
||||
self.assertEqual(test['token_revs'][2],0)
|
||||
self.assertEqual(test['token_revs'][3],0)
|
||||
@@ -289,6 +328,29 @@ class Test_Basic(unittest.TestCase):
|
||||
baseline = pd.read_table(baseline_file)
|
||||
assert_frame_equal(test,baseline)
|
||||
|
||||
def test_pwr_segment_collapse(self):
    """Check segment persistence output with collapsed users against the baseline.

    Builds the command line from ``self.base_call`` with
    ``--persistence segment --collapse-user`` appended, runs it through the
    shell, copies ``self.call_output`` to the test output directory, and
    asserts that the resulting table equals the stored baseline table.

    Raises:
        AssertionError: if the produced data frame differs from the baseline.
    """
    test_filename = "persistence_segment_collapse_" + self.wikiq_out_name
    test_file = os.path.join(self.test_output_dir, test_filename)

    # Drop the copy left by an earlier run so a stale file is never compared.
    if os.path.exists(test_file):
        os.remove(test_file)

    call = self.base_call.format(self.input_file, self.test_output_dir)
    call = call + " --persistence segment --collapse-user"
    print(call)

    # stdout is a pipe: wait() alone can block forever once the child fills
    # the OS pipe buffer. communicate() drains the pipe while waiting, and
    # the context manager closes it afterwards.
    with subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) as proc:
        proc.communicate()

    copyfile(self.call_output, test_file)

    baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)

    test = pd.read_table(test_file)
    print(test)
    baseline = pd.read_table(baseline_file)
    assert_frame_equal(test, baseline)
|
||||
|
||||
|
||||
def test_pwr_legacy(self):
|
||||
test_filename = "persistence_legacy_" + self.wikiq_out_name
|
||||
test_file = os.path.join(self.test_output_dir, test_filename)
|
||||
|
||||
6
test/baseline_output/segment_excludews_pwr-test.tsv
Normal file
6
test/baseline_output/segment_excludews_pwr-test.tsv
Normal file
@@ -0,0 +1,6 @@
|
||||
anon articleid date_time deleted editor editor_id minor namespace revert reverteds revid sha1 text_chars title token_revs tokens_added tokens_removed tokens_window
|
||||
TRUE 1 2010-12-13 02:51:55 FALSE %2276.102.106.52%22 FALSE 0 FALSE None e867e7cd0e53b58428b23ee346c6fa523756e7d1 21 %22Test%20Title%22 12 4 0 4
|
||||
TRUE 1 2010-12-13 02:51:55 FALSE %2276.102.106.52%22 FALSE 0 FALSE None cb3aeff37c151ab439ee4a7a76cea85679272d5e 44 %22Test%20Title%22 0 5 0 3
|
||||
TRUE 1 2010-12-13 02:51:55 FALSE %2276.102.106.52%22 FALSE 0 FALSE None cp0mfp4o90fbpyxooys1rw95zn3ddff 21 %22Test%20Title%22 0 0 0 2
|
||||
TRUE 1 2010-12-13 02:51:55 FALSE %2276.102.106.53%22 FALSE 0 FALSE None 9e63d8f3d87be0ed4cc3fab3f750a4429997bead 31 %22Test%20Title%22 0 6 4 1
|
||||
TRUE 1 2010-12-13 02:51:55 FALSE %2276.102.106.60%22 FALSE 0 TRUE "None,None,None" None e867e7cd0e53b58428b23ee346c6fa523756e7d1 21 %22Test%20Title%22 0 0 0 0
|
||||
|
Reference in New Issue
Block a user