wikiq mostly functional, but reverters take all the credit for the content they restore.
This commit is contained in:
36
wikiq
36
wikiq
@@ -1,9 +1,8 @@
|
||||
#!/usr/bin/env python3
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# original wikiq headers are: title articleid revid date_time anon
|
||||
# editor editor_id minor text_size text_entropy text_md5 reversion
|
||||
# additions_size deletions_size
|
||||
import pdb
|
||||
import argparse
|
||||
import sys
|
||||
import os, os.path
|
||||
@@ -77,9 +76,25 @@ class PersistMethod:
|
||||
segment = 2
|
||||
legacy = 3
|
||||
|
||||
def calculate_persistence(tokens_added, tokens_removed, exclude_ws = True, exclude_punct = False):
    """Summarize how the tokens touched by one revision persisted.

    Tokens whose ``type`` is in an excluded lexical class are dropped from
    both lists before anything is counted.

    Parameters:
        tokens_added: iterable of token objects added by the revision; each
            must expose ``type`` and ``revisions`` (a sized collection).
        tokens_removed: iterable of token objects removed by the revision;
            each must expose ``type``.
        exclude_ws: when true, ignore tokens typed 'break' or 'whitespace'.
        exclude_punct: when true, ignore punctuation/markup token types.

    Returns:
        A 3-tuple ``(token_revs, num_tokens_added, num_tokens_removed)``
        where ``token_revs`` is the sum of ``len(t.revisions) - 1`` over the
        retained added tokens and the other two are the retained list sizes.
    """
    ws_lex = ['break', 'whitespace']
    punct_lex = ['period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
                 'paren_open', 'paren_close', 'brack_open', 'brack_close',
                 'dbrack_close', 'dbrack_open', 'tab_close', 'tab_open',
                 'dcurly_close', 'dcurly_open', 'equals', 'bar', 'etc',
                 'bold', 'italic', 'tag', 'comment_end', 'comment_start']

    # Collect every excluded type in one set.  The earlier approach chained
    # lambdas through a rebinding `cond` name: that crashed when both flags
    # were off (a bool was called) and dropped the whitespace filter when
    # both flags were on (the second lambda replaced the first).
    excluded = set()
    if exclude_ws:
        excluded.update(ws_lex)
    if exclude_punct:
        excluded.update(punct_lex)

    if excluded:
        tokens_added = [t for t in tokens_added if t.type not in excluded]
        tokens_removed = [t for t in tokens_removed if t.type not in excluded]
    else:
        # Nothing to filter; materialize so len() works on any iterable.
        tokens_added = list(tokens_added)
        tokens_removed = list(tokens_removed)

    return (sum(len(t.revisions) - 1 for t in tokens_added),
            len(tokens_added),
            len(tokens_removed))
|
||||
|
||||
class WikiqIterator(Dump):
|
||||
|
||||
@@ -395,11 +410,11 @@ class WikiqParser():
|
||||
if len(window) == PERSISTENCE_RADIUS:
|
||||
old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]
|
||||
|
||||
num_token_revs, num_tokens = calculate_persistence(old_tokens_added)
|
||||
num_token_revs, num_tokens_added, num_tokens_removed = calculate_persistence(old_tokens_added, old_tokens_removed)
|
||||
|
||||
old_rev_data["token_revs"] = num_token_revs
|
||||
old_rev_data["tokens_added"] = num_tokens
|
||||
old_rev_data["tokens_removed"] = len(old_tokens_removed)
|
||||
old_rev_data["tokens_added"] = num_tokens_added
|
||||
old_rev_data["tokens_removed"] = num_tokens_removed
|
||||
old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1
|
||||
|
||||
self.print_rev_data(old_rev_data)
|
||||
@@ -417,11 +432,12 @@ class WikiqParser():
|
||||
continue
|
||||
|
||||
rev_id, rev_data, tokens_added, tokens_removed = item
|
||||
num_token_revs, num_tokens = calculate_persistence(tokens_added)
|
||||
|
||||
num_token_revs, num_tokens_added, num_tokens_removed = calculate_persistence(tokens_added, tokens_removed)
|
||||
|
||||
rev_data["token_revs"] = num_token_revs
|
||||
rev_data["tokens_added"] = num_tokens
|
||||
rev_data["tokens_removed"] = len(tokens_removed)
|
||||
rev_data["tokens_added"] = num_tokens_added
|
||||
rev_data["tokens_removed"] = num_tokens_removed
|
||||
rev_data["tokens_window"] = len(window)-(i+1)
|
||||
|
||||
self.print_rev_data(rev_data)
|
||||
|
||||
Reference in New Issue
Block a user