sub assertEquals assertEqual

add namespace filter parameter
Merge branch 'advanced_persistence' of code.communitydata.cc:mediawiki_dump_tools into advanced_persistence
2018-09-03 11:21:49 -07:00 · 2018-09-03 11:13:48 -07:00 · 2018-08-23 19:00:49 -07:00 · 2018-08-23 18:49:32 -07:00 · 2018-08-23 18:27:09 -07:00 · 2018-08-23 18:25:51 -07:00
11 changed files with 69196 additions and 49031 deletions
--- a/1
+++ b/1
--- a/test/Wikiq_Unit_Test.py
+++ b/test/Wikiq_Unit_Test.py
@@ -51,6 +51,28 @@ class Test_Wikipedia(unittest.TestCase):
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test,baseline)
    def test_WP_namespaces(self):
        """Run wikiq with ``-n 0 -n 1`` and check the namespace filter.

        The test passes when every row of the produced table has a
        ``namespace`` value of 0 or 1 and the table equals the stored
        baseline of the same file name.
        """
        print(os.path.abspath('.'))
        test_filename = "namespaces_" + self.wikiq_out_name
        test_file = os.path.join(self.test_output_dir, test_filename)
        # start from a clean slate so a stale result can never be compared
        if os.path.exists(test_file):
            os.remove(test_file)

        call = self.base_call.format(self.input_file, self.test_output_dir)
        call = call + " -n 0 -n 1"
        print(call)
        proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True)
        # communicate() drains the stdout pipe while waiting for the child;
        # wait() alone can deadlock once the child fills the OS pipe buffer.
        proc.communicate()

        # self.call_output is defined outside this view -- presumably the
        # file wikiq just wrote; keep a copy under the test-specific name.
        copyfile(self.call_output, test_file)
        baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename)

        test = pd.read_table(test_file)
        # count rows whose namespace is outside the requested set {0, 1}
        num_wrong_ns = sum(~ test.namespace.isin({0,1}))
        self.assertEqual(num_wrong_ns, 0)

        # finally make sure the whole data frame matches the baseline
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test,baseline)
 class Test_Basic(unittest.TestCase):
@@ -108,6 +130,26 @@ class Test_Basic(unittest.TestCase):
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test,baseline)
    def test_pwr_segment(self):
        """Run wikiq with ``--persistence segment`` and compare to the baseline.

        The produced table must equal the stored baseline table named
        ``persistence_segment_<wikiq_out_name>``.
        """
        test_filename = "persistence_segment_" + self.wikiq_out_name
        test_file = os.path.join(self.test_output_dir, test_filename)
        # remove any result left over from an earlier run
        if os.path.exists(test_file):
            os.remove(test_file)

        call = self.base_call.format(self.input_file, self.test_output_dir)
        call = call + " --persistence segment"
        proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True)
        # communicate() drains the stdout pipe while waiting for the child;
        # wait() alone can deadlock once the child fills the OS pipe buffer.
        proc.communicate()

        # self.call_output is defined outside this view -- presumably the
        # file wikiq just wrote; keep a copy under the test-specific name.
        copyfile(self.call_output, test_file)
        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)

        test = pd.read_table(test_file)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test,baseline)
    def test_pwr_legacy(self):
        test_filename =  "persistence_legacy_" + self.wikiq_out_name
        test_file = os.path.join(self.test_output_dir, test_filename)
@@ -115,7 +157,7 @@ class Test_Basic(unittest.TestCase):
            os.remove(test_file)
        call = self.base_call.format(self.input_file, self.test_output_dir)
-        call = call + " --persistence-legacy"
+        call = call + " --persistence legacy"
        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
        proc.wait()
@@ -131,8 +173,8 @@ class Test_Basic(unittest.TestCase):
    def test_pwr(self):
        test_filename =  "persistence_" + self.wikiq_out_name
        test_file = os.path.join(self.test_output_dir, test_filename)
-        if os.path.exists(test_file):
+        if os.path.exists(test_file): 
-            os.remove(test_file)
+           os.remove(test_file)
        call = self.base_call.format(self.input_file, self.test_output_dir)
        call = call + " --persistence"
@@ -169,7 +211,6 @@ class Test_Basic(unittest.TestCase):
 class Test_Malformed(unittest.TestCase):
    def setUp(self):
        if not os.path.exists("test_output"):
            os.mkdir("test_output")
--- a/test/baseline_output/collapse-user_sailormoon.tsv
+++ b/test/baseline_output/collapse-user_sailormoon.tsv
--- a/test/baseline_output/namespaces_ikwiki-20180301-pages-meta-history.tsv
+++ b/test/baseline_output/namespaces_ikwiki-20180301-pages-meta-history.tsv
--- a/test/baseline_output/noargs_sailormoon.tsv
+++ b/test/baseline_output/noargs_sailormoon.tsv
--- a/test/baseline_output/persistence_legacy_sailormoon.tsv
+++ b/test/baseline_output/persistence_legacy_sailormoon.tsv
--- a/test/baseline_output/persistence_sailormoon.tsv
+++ b/test/baseline_output/persistence_sailormoon.tsv
--- a/test/baseline_output/persistence_segment_sailormoon.tsv
+++ b/test/baseline_output/persistence_segment_sailormoon.tsv
--- a/test/baseline_output/url-encode_ikwiki-20180301-pages-meta-history.tsv
+++ b/test/baseline_output/url-encode_ikwiki-20180301-pages-meta-history.tsv
--- a/test/baseline_output/url-encode_sailormoon.tsv
+++ b/test/baseline_output/url-encode_sailormoon.tsv
--- a/108
+++ b/108
@@ -3,6 +3,7 @@
 # original wikiq headers are: title articleid revid date_time anon
 # editor editor_id minor text_size text_entropy text_md5 reversion
 # additions_size deletions_size
 import argparse
 import sys
 import os, os.path
@@ -21,22 +22,32 @@ from urllib.parse import quote
 TO_ENCODE = ('title', 'editor')
 PERSISTENCE_RADIUS=7
 from deltas import SequenceMatcher
 from deltas import SegmentMatcher
 class PersistMethod:
    """Integer codes selecting how token persistence is computed.

    none     (0): persistence is not computed.
    sequence (1): diff state built on ``SequenceMatcher``.
    segment  (2): diff state built on ``SegmentMatcher``.
    legacy   (3): the older ``mw.lib.persistence`` state.
    """
    # consecutive codes 0..3, in the order listed in the docstring
    none, sequence, segment, legacy = range(4)
 def calculate_persistence(tokens_added):
    """Summarize persistence for a collection of added tokens.

    Each token must expose a sized ``revisions`` attribute. Returns a
    2-tuple: the sum over all tokens of ``len(token.revisions) - 1``,
    followed by the number of tokens.
    """
    persisted_revisions = sum(len(token.revisions) - 1 for token in tokens_added)
    token_count = len(tokens_added)
    return (persisted_revisions, token_count)
 class WikiqIterator():
    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        self.namespace_map = { ns.id : ns.name for ns in
                               self.mwiterator.site_info.namespaces }
        self.__pages = self.load_pages()
    def load_pages(self):
        for page in self.mwiterator:
-            yield WikiqPage(page, collapse_user=self.collapse_user)
+            yield WikiqPage(page,
                            namespace_map = self.namespace_map,
                            collapse_user=self.collapse_user)
    def __iter__(self):
        return self.__pages
@@ -49,13 +60,19 @@ class WikiqPage():
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')
-    def __init__(self, page, collapse_user=False):
+    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        self.title = page.title
        self.namespace = page.namespace
-        self.redirect = page.redirect
+        # following mwxml, we assume namespace 0 in cases where
        # page.namespace is inconsistent with namespace_map
        if page.namespace not in namespace_map:
            self.title = page.title
            page.namespace = 0
        if page.namespace != 0:
            self.title = ':'.join([namespace_map[page.namespace], page.title])
        else:
            self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()
@@ -110,19 +127,25 @@ class WikiqPage():
        return next(self.__revisions)
 class WikiqParser():
    def __init__(self, input_file, output_file, collapse_user=False, persist=None, urlencode=False, namespaces = None):
        """ 
        Parameters:
           persist : what persistence method to use. Takes a PersistMethod value
        """
    def __init__(self, input_file, output_file, collapse_user=False, persist=False, urlencode=False, persist_legacy=False):
        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        self.persist_legacy = persist_legacy
        self.printed_header = False
        self.namespaces = []
        self.urlencode = urlencode
-        
+        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
            self.namespace_filter = None
    def __get_namespace_from_title(self, title):
        default_ns = None
@@ -157,15 +180,27 @@ class WikiqParser():
        # Iterate through pages
        for page in dump:
            namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)
            # skip namespaces not in the filter
            if self.namespace_filter is not None:
                if namespace not in self.namespace_filter:
                    continue
            rev_detector = mwreverts.Detector()
-            if self.persist or self.persist_legacy:
+            if self.persist != PersistMethod.none:
                window = deque(maxlen=PERSISTENCE_RADIUS)
-                if not self.persist_legacy:
+                if self.persist == PersistMethod.sequence:
                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)
                elif self.persist == PersistMethod.segment:
                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer = wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)
                # self.persist == PersistMethod.legacy
                else:
                    from mw.lib import persistence
                    state = persistence.State()
@@ -178,7 +213,7 @@ class WikiqParser():
                            'articleid' : page.id,
                            'editor_id' : "" if rev.deleted.user == True or rev.user.id is None else rev.user.id,
                            'title' : '"' + page.title + '"',
-                            'namespace' : page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title),
+                            'namespace' : namespace,
                            'deleted' : "TRUE" if rev.deleted.text else "FALSE" } 
                # if revisions are deleted, /many/ things will be missing
@@ -238,14 +273,13 @@ class WikiqParser():
                if self.collapse_user:
                    rev_data['collapsed_revs'] = rev.collapsed_revs
-                if self.persist or self.persist_legacy:
+                if self.persist != PersistMethod.none:
                    if rev.deleted.text:
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            old_rev_data[k] = None
                    else:
-                        if not self.persist_legacy:
+                        if self.persist != PersistMethod.legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
                        else:
@@ -270,7 +304,7 @@ class WikiqParser():
                rev_count += 1
-            if self.persist or self.persist_legacy:
+            if self.persist != PersistMethod.none:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
@@ -344,17 +378,35 @@ parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
 parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")
-parser.add_argument('-p', '--persistence', dest="persist", action="store_true",
+parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?',
-                    help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure.")
+                    help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow.  Use -p=segment for advanced persistence calculation method that is robust to content moves. This might be very slow. Use -p=legacy for legacy behavior.")
 parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")
-parser.add_argument('--persistence-legacy', dest="persist_legacy", action="store_true",
+parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
-                    help="Legacy behavior for persistence calculation. Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")
+                    help="Id number of namspace to include.")
 args = parser.parse_args()
 # set persistence method
 if args.persist is None:
    persist = PersistMethod.none
 elif args.persist == "segment":
    persist = PersistMethod.segment
 elif args.persist == "legacy":
    persist = PersistMethod.legacy
 else:
    persist = PersistMethod.sequence
 if args.namespace_filter is not None:
    namespaces = args.namespace_filter
 else:
    namespaces = None
 if len(args.dumpfiles) > 0:
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)
@@ -375,10 +427,9 @@ if len(args.dumpfiles) > 0:
        wikiq = WikiqParser(input_file, output_file, 
                            collapse_user=args.collapse_user,
-                            persist=args.persist,
+                            persist=persist,
-                            persist_legacy=args.persist_legacy,
+                            urlencode=args.urlencode,
-                            urlencode=args.urlencode)
+                            namespaces = namespaces)
        wikiq.process()
@@ -388,9 +439,10 @@ if len(args.dumpfiles) > 0:
 else:
    wikiq = WikiqParser(sys.stdin, sys.stdout,
                        collapse_user=args.collapse_user,
-                        persist=args.persist,
+                        persist=persist,
                        persist_legacy=args.persist_legacy,
-                        urlencode=args.urlencode)
+                        urlencode=args.urlencode,
                        namespaces = namespaces)
    wikiq.process()
 # stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
Author	SHA1	Message	Date
groceryheist	f7f5bf8fd4	sub assertEquals assertEqual	2018-09-03 11:21:49 -07:00
Nate E TeBlunthuis	f784c77f60	add namespace filter parameter	2018-09-03 11:13:48 -07:00
groceryheist	317bafb50d	Merge branch 'advanced_persistence' of code.communitydata.cc:mediawiki_dump_tools into advanced_persistence	2018-08-23 19:00:49 -07:00
groceryheist	7cd0bf3b9e	Add parameter for selecting specific namespaces.	2018-08-23 18:49:32 -07:00
groceryheist	d93769c21f	Merge branch 'advanced_persistence' of code.communitydata.cc:mediawiki_dump_tools into advanced_persistence	2018-08-23 18:27:09 -07:00
Nate E TeBlunthuis	afd40c1a45	Merge branch 'advanced_persistence' of code.communitydata.cc:/mediawiki_dump_tools into advanced_persistence	2018-08-23 18:25:51 -07:00
Nate E TeBlunthuis	e4222c45dd	add namespace filter parameter	2018-08-23 18:25:08 -07:00
Nate E TeBlunthuis	829ffcffae	Merge branch 'advanced_persistence' of code.communitydata.cc:/mediawiki_dump_tools into advanced_persistence	2018-08-23 18:23:36 -07:00
Nate E TeBlunthuis	776b73519a	add namespace filter parameter	2018-08-23 18:23:23 -07:00
Nate E TeBlunthuis	5b6aaad862	add namespace filter parameter	2018-08-23 18:02:56 -07:00
groceryheist	f468d1a5b6	add support for persistence with segment matching	2018-08-20 16:08:16 -07:00
groceryheist	bf396ad366	Prefix page titles with namespace names.	2018-07-09 22:11:17 -07:00