From 7a4c41159c68d3ac3229b52fb283d7e8ef278831 Mon Sep 17 00:00:00 2001 From: Will Beason Date: Mon, 26 May 2025 10:48:17 -0500 Subject: [PATCH 01/11] Exclude JetBrains config folder in .gitignore Signed-off-by: Will Beason --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 616dc22..1e1f74f 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,6 @@ *.xml.bz2 *.xml.xz *.swp + +# JetBrains +/.idea From 4804ecc4b335fb1a51730e906e3ccf64b9b35b30 Mon Sep 17 00:00:00 2001 From: Will Beason Date: Mon, 26 May 2025 12:29:49 -0500 Subject: [PATCH 02/11] Add additional test dependencies These are now noted in requirements.txt Also make dependency on 7zip and ffmpeg explicit in README Signed-off-by: Will Beason --- .gitignore | 4 ++++ README.rst | 8 ++++++++ requirements.txt | 42 +++++++++++++++++++++++++++++++++++++++++ test/Wikiq_Unit_Test.py | 5 ++++- 4 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore index 1e1f74f..c90a397 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,7 @@ # JetBrains /.idea + +# Python build and test output +__pycache__/ +test_output/ diff --git a/README.rst b/README.rst index b9e5f0a..761a9b3 100644 --- a/README.rst +++ b/README.rst @@ -12,6 +12,14 @@ submodule like:: Wikimedia dumps are usually in a compressed format such as 7z (most common), gz, or bz2. Wikiq uses your computer's compression software to read these files. Therefore wikiq depends on `7za`, `gzcat`, and `zcat`. +Dependencies +---------------- +These non-Python dependencies must be installed on your system for wikiq and its +associated tests to work. + +- 7zip +- ffmpeg + TODO: _______________ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..a41306b --- /dev/null +++ b/requirements.txt @@ -0,0 +1,42 @@ +apeek==0.1.1 +attrs==25.3.0 +certifi==2025.4.26 +charset-normalizer==3.4.2 +Cython==0.29.37 +deltas==0.7.0 +docopt==0.6.2 +gnureadline==8.1.2 +idna==3.10 +jsonable==0.3.1 +jsonschema==4.23.0 +jsonschema-specifications==2025.4.1 +mediawiki-utilities==0.4.18 +mw==0.4.0 +mwcli==0.0.3 +mwdiffs==0.0.2 +mwpersistence==0.2.4 +mwreverts==0.1.5 +mwtypes==0.4.0 +mwxml==0.3.6 +numpy==1.26.4 +pandas==2.2.3 +para==0.0.8 +parsimonious==0.10.0 +pyarrow==20.0.0 +pydub==0.25.1 +PyMySQL==1.1.1 +python-dateutil==2.9.0.post0 +pytz==2025.2 +PyYAML==5.4.1 +referencing==0.36.2 +regex==2024.11.6 +requests==2.32.3 +rpds-py==0.25.1 +setuptools==80.8.0 +six==1.17.0 +stopit==1.1.2 +typing_extensions==4.13.2 +tzdata==2025.2 +urllib3==2.4.0 +wheel==0.45.1 +yamlconf==0.2.6 diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py index 0a90c6c..7f4df39 100644 --- a/test/Wikiq_Unit_Test.py +++ b/test/Wikiq_Unit_Test.py @@ -5,11 +5,14 @@ from shutil import copyfile import pandas as pd from pandas.testing import assert_frame_equal from io import StringIO +import tracemalloc + +tracemalloc.start() # with / without pwr DONE # with / without url encode DONE # with / without collapse user DONE -# with output to sdtout DONE +# with output to stdout DONE # note that the persistence radius is 7 by default # reading various file formats including # 7z, gz, bz2, xml DONE From 9c5bf577e6ad1a0326b35550a25b25ccb36a9e68 Mon Sep 17 00:00:00 2001 From: Will Beason Date: Mon, 26 May 2025 14:15:01 -0500 Subject: [PATCH 03/11] Remove unused dependencies and fix spacing The "mw" and "numpy" dependencies were unneeded. Spaces and tabs were inconsistently used. They are now used consistently, changes via auto-formatter. Signed-off-by: Will Beason --- requirements.txt | 3 - wikiq | 244 ++++++++++++++++++++++++++--------------------- 2 files changed, 134 insertions(+), 113 deletions(-) diff --git a/requirements.txt b/requirements.txt index a41306b..a54cca5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -apeek==0.1.1 attrs==25.3.0 certifi==2025.4.26 charset-normalizer==3.4.2 @@ -11,14 +10,12 @@ jsonable==0.3.1 jsonschema==4.23.0 jsonschema-specifications==2025.4.1 mediawiki-utilities==0.4.18 -mw==0.4.0 mwcli==0.0.3 mwdiffs==0.0.2 mwpersistence==0.2.4 mwreverts==0.1.5 mwtypes==0.4.0 mwxml==0.3.6 -numpy==1.26.4 pandas==2.2.3 para==0.0.8 parsimonious==0.10.0 diff --git a/wikiq b/wikiq index 2cb0c3a..7b59720 100755 --- a/wikiq +++ b/wikiq @@ -6,9 +6,9 @@ import argparse import sys -import os, os.path +import os.path import re -from datetime import datetime,timezone +from datetime import datetime, timezone from subprocess import Popen, PIPE from collections import deque @@ -20,8 +20,9 @@ from deltas.tokenizers import wikitext_split import mwpersistence import mwreverts from urllib.parse import quote + TO_ENCODE = ('title', 'editor') -PERSISTENCE_RADIUS=7 +PERSISTENCE_RADIUS = 7 from deltas import SequenceMatcher from deltas import SegmentMatcher @@ -30,42 +31,46 @@ from dataclasses import dataclass import pyarrow as pa import pyarrow.parquet as pq + class PersistMethod: none = 0 sequence = 1 segment = 2 legacy = 3 + def calculate_persistence(tokens_added): - return(sum([(len(x.revisions)-1) for x in tokens_added]), - len(tokens_added)) + return (sum([(len(x.revisions) - 1) for x in tokens_added]), + len(tokens_added)) + class WikiqIterator(): def __init__(self, fh, collapse_user=False): self.fh = fh self.collapse_user = collapse_user self.mwiterator = Dump.from_file(self.fh) - self.namespace_map = { ns.id : ns.name for ns in - self.mwiterator.site_info.namespaces } + self.namespace_map = {ns.id: ns.name for ns in + self.mwiterator.site_info.namespaces} self.__pages = self.load_pages() def load_pages(self): for page in self.mwiterator: yield WikiqPage(page, - namespace_map = self.namespace_map, + namespace_map=self.namespace_map, collapse_user=self.collapse_user) def __iter__(self): return self.__pages def __next__(self): - return next(self._pages) + return next(self.__pages) + class WikiqPage(): __slots__ = ('id', 'title', 'namespace', 'redirect', 'restrictions', 'mwpage', '__revisions', 'collapse_user') - + def __init__(self, page, namespace_map, collapse_user=False): self.id = page.id self.namespace = page.namespace @@ -95,7 +100,7 @@ class WikiqPage(): for i, rev in enumerate(self.mwpage): # never yield the first time if i == 0: - if self.collapse_user: + if self.collapse_user: collapsed_revs = 1 rev.collapsed_revs = collapsed_revs @@ -138,6 +143,8 @@ A RegexPair is defined by a regular expression (pattern) and a label. The pattern can include capture groups. If it does then each capture group will have a resulting column in the output. If the pattern does not include a capture group, then only one output column will result. """ + + class RegexPair(object): def __init__(self, pattern, label): self.pattern = re.compile(pattern) @@ -145,10 +152,10 @@ class RegexPair(object): self.has_groups = bool(self.pattern.groupindex) if self.has_groups: self.capture_groups = list(self.pattern.groupindex.keys()) - + def get_pyarrow_fields(self): if self.has_groups: - fields = [pa.field(self._make_key(cap_group),pa.list_(pa.string())) + fields = [pa.field(self._make_key(cap_group), pa.list_(pa.string())) for cap_group in self.capture_groups] else: fields = [pa.field(self.label, pa.list_(pa.string()))] @@ -159,7 +166,7 @@ class RegexPair(object): return ("{}_{}".format(self.label, cap_group)) def matchmake(self, content, rev_data): - + temp_dict = {} # if there are named capture groups in the regex if self.has_groups: @@ -178,7 +185,7 @@ class RegexPair(object): temp_list.append(match.group(cap_group)) # if temp_list of matches is empty just make that column None - if len(temp_list)==0: + if len(temp_list) == 0: temp_dict[key] = None # else we put in the list we made in the for-loop above else: @@ -192,8 +199,8 @@ class RegexPair(object): # there are no capture groups, we just search for all the matches of the regex else: - #given that there are matches to be made - if type(content) in(str, bytes): + # given that there are matches to be made + if type(content) in (str, bytes): if self.pattern.search(content) is not None: m = self.pattern.findall(content) temp_dict[self.label] = ', '.join(m) @@ -206,6 +213,7 @@ class RegexPair(object): return rev_data + """ We used to use a dictionary to collect fields for the output. @@ -222,9 +230,11 @@ It also needs to have the correct pyarrow schema so we can write parquet files. The RevDataBase type has all the fields that will be output no matter how wikiq is invoked. """ + + @dataclass() class RevDataBase(): - revid: int + revid: int date_time: datetime articleid: int editorid: int @@ -249,18 +259,18 @@ class RevDataBase(): pa_schema_fields = [ pa.field("revid", pa.int64()), pa.field("date_time", pa.timestamp('ms')), - pa.field("articleid",pa.int64()), - pa.field("editorid",pa.int64()), - pa.field("title",pa.string()), - pa.field("namespace",pa.int32()), - pa.field("deleted",pa.bool_()), - pa.field("test_chars",pa.int32()), - pa.field("revert",pa.bool_()), - pa.field("reverteds",pa.list_(pa.int64())), - pa.field("sha1",pa.string()), - pa.field("minor",pa.bool_()), - pa.field("editor",pa.string()), - pa.field("anon",pa.bool_()) + pa.field("articleid", pa.int64()), + pa.field("editorid", pa.int64()), + pa.field("title", pa.string()), + pa.field("namespace", pa.int32()), + pa.field("deleted", pa.bool_()), + pa.field("test_chars", pa.int32()), + pa.field("revert", pa.bool_()), + pa.field("reverteds", pa.list_(pa.int64())), + pa.field("sha1", pa.string()), + pa.field("minor", pa.bool_()), + pa.field("editor", pa.string()), + pa.field("anon", pa.bool_()) ] # pyarrow is a columnar format, so most of the work happens in the flush_parquet_buffer function @@ -269,7 +279,7 @@ class RevDataBase(): # logic to convert each field into the wikiq tsv format goes here. def to_tsv_row(self): - + row = [] for f in dc.fields(self): val = getattr(self, f.name) @@ -281,7 +291,7 @@ class RevDataBase(): elif f.type == datetime: row.append(val.strftime('%Y-%m-%d %H:%M:%S')) - elif f.name in {'editor','title'}: + elif f.name in {'editor', 'title'}: s = '"' + val + '"' if self.urlencode and f.name in TO_ENCODE: row.append(quote(str(s))) @@ -299,11 +309,12 @@ class RevDataBase(): else: row.append(val) - return '\t'.join(map(str,row)) + return '\t'.join(map(str, row)) def header_row(self): return '\t'.join(map(lambda f: f.name, dc.fields(self))) + """ If collapse=True we'll use a RevDataCollapse dataclass. @@ -312,43 +323,54 @@ This class inherits from RevDataBase. This means that it has all the same fields It just adds a new field and updates the pyarrow schema. """ + + @dataclass() class RevDataCollapse(RevDataBase): - collapsed_revs:int = None + collapsed_revs: int = None - pa_collapsed_revs_schema = pa.field('collapsed_revs',pa.int64()) + pa_collapsed_revs_schema = pa.field('collapsed_revs', pa.int64()) pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema] + """ If persistence data is to be computed we'll need the fields added by RevDataPersistence. """ + + @dataclass() class RevDataPersistence(RevDataBase): - token_revs:int = None - tokens_added:int = None - tokens_removed:int = None - tokens_window:int = None + token_revs: int = None + tokens_added: int = None + tokens_removed: int = None + tokens_window: int = None pa_persistence_schema_fields = [ pa.field("token_revs", pa.int64()), pa.field("tokens_added", pa.int64()), pa.field("tokens_removed", pa.int64()), pa.field("tokens_window", pa.int64())] - - pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields + + pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields + """ class RevDataCollapsePersistence uses multiple inheritence to make a class that has both persistence and collapse fields. """ + + @dataclass() class RevDataCollapsePersistence(RevDataCollapse, RevDataPersistence): pa_schema_fields = RevDataCollapse.pa_schema_fields + RevDataPersistence.pa_persistence_schema_fields -class WikiqParser(): - def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15, output_parquet=True, parquet_buffer_size=2000): + +class WikiqParser: + def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, + regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces=None, + revert_radius=15, output_parquet=True, parquet_buffer_size=2000): """ Parameters: persist : what persistence method to use. Takes a PersistMethod value @@ -360,7 +382,7 @@ class WikiqParser(): self.namespaces = [] self.urlencode = urlencode self.revert_radius = revert_radius - + if namespaces is not None: self.namespace_filter = set(namespaces) else: @@ -370,9 +392,8 @@ class WikiqParser(): self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label) self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label) - # This is where we set the type for revdata. - + if self.collapse_user is True: if self.persist == PersistMethod.none: revdata_type = RevDataCollapse @@ -391,10 +412,10 @@ class WikiqParser(): self.revdata_type = dc.make_dataclass('RevData_Parser', fields=regex_fields, bases=(revdata_type,)) - + # we also need to make sure that we have the right pyarrow schema self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas - + self.revdata_type.urlencode = self.urlencode self.schema = pa.schema(self.revdata_type.pa_schema_fields) @@ -409,15 +430,15 @@ class WikiqParser(): else: self.print_header = True if output_file == sys.stdout: - + self.output_file = output_file else: - self.output_file = open(output_file,'w') + self.output_file = open(output_file, 'w') self.output_parquet = False def make_matchmake_pairs(self, patterns, labels): if (patterns is not None and labels is not None) and \ - (len(patterns) == len(labels)): + (len(patterns) == len(labels)): result = [] for pattern, label in zip(patterns, labels): rp = RegexPair(pattern, label) @@ -435,7 +456,7 @@ class WikiqParser(): return rev_data def matchmake_text(self, text, rev_data): - return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs) + return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs) def matchmake_comment(self, comment, rev_data): return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs) @@ -450,7 +471,7 @@ class WikiqParser(): for ns in self.namespaces: # skip if the namespace is not defined - if ns == None: + if ns is None: default_ns = self.namespaces[ns] continue @@ -460,7 +481,6 @@ class WikiqParser(): # if we've made it this far with no matches, we return the default namespace return default_ns - def process(self): # create a regex that creates the output filename @@ -472,12 +492,11 @@ class WikiqParser(): dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user) # extract list of namspaces - self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces} + self.namespaces = {ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces} page_count = 0 rev_count = 0 - # Iterate through pages for page in dump: namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title) @@ -487,17 +506,17 @@ class WikiqParser(): if namespace not in self.namespace_filter: continue - rev_detector = mwreverts.Detector(radius = self.revert_radius) + rev_detector = mwreverts.Detector(radius=self.revert_radius) if self.persist != PersistMethod.none: window = deque(maxlen=PERSISTENCE_RADIUS) - + if self.persist == PersistMethod.sequence: - state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split), + state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split), revert_radius=PERSISTENCE_RADIUS) elif self.persist == PersistMethod.segment: - state = mwpersistence.DiffState(SegmentMatcher(tokenizer = wikitext_split), + state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split), revert_radius=PERSISTENCE_RADIUS) # self.persist == PersistMethod.legacy @@ -507,15 +526,15 @@ class WikiqParser(): # Iterate through a page's revisions for rev in page: - + # create a new data object instead of a dictionary. - rev_data = self.revdata_type(revid = rev.id, - date_time = datetime.fromtimestamp(rev.timestamp.unix(), tz=timezone.utc), - articleid = page.id, - editorid = "" if rev.deleted.user == True or rev.user.id is None else rev.user.id, - title = page.title, - deleted = rev.deleted.text, - namespace = namespace + rev_data = self.revdata_type(revid=rev.id, + date_time=datetime.fromtimestamp(rev.timestamp.unix(), tz=timezone.utc), + articleid=page.id, + editorid="" if rev.deleted.user == True or rev.user.id is None else rev.user.id, + title=page.title, + deleted=rev.deleted.text, + namespace=namespace ) rev_data = self.matchmake_revision(rev, rev_data) @@ -530,7 +549,7 @@ class WikiqParser(): text_sha1 = rev.sha1 else: text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest() - + rev_data.sha1 = text_sha1 # TODO rev.bytes doesn't work.. looks like a bug @@ -538,7 +557,7 @@ class WikiqParser(): # generate revert data revert = rev_detector.process(text_sha1, rev.id) - + if revert: rev_data.revert = True rev_data.reverteds = revert.reverteds @@ -550,16 +569,16 @@ class WikiqParser(): if not rev.deleted.user: # wrap user-defined editors in quotes for fread - rev_data.editor = rev.user.text + rev_data.editor = rev.user.text rev_data.anon = rev.user.id is None - - #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I): + + # if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I): # redirect = True - #else: + # else: # redirect = False - - #TODO missing: additions_size deletions_size - + + # TODO missing: additions_size deletions_size + # if collapse user was on, lets run that if self.collapse_user: rev_data.collapsed_revs = rev.collapsed_revs @@ -573,18 +592,18 @@ class WikiqParser(): else: _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1) - + window.append((rev.id, rev_data, tokens_added, tokens_removed)) - + if len(window) == PERSISTENCE_RADIUS: old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0] - + num_token_revs, num_tokens = calculate_persistence(old_tokens_added) old_rev_data.token_revs = num_token_revs old_rev_data.tokens_added = num_tokens old_rev_data.tokens_removed = len(old_tokens_removed) - old_rev_data.tokens_window = PERSISTENCE_RADIUS-1 + old_rev_data.tokens_window = PERSISTENCE_RADIUS - 1 self.print_rev_data(old_rev_data) @@ -606,7 +625,7 @@ class WikiqParser(): rev_data.token_revs = num_token_revs rev_data.tokens_added = num_tokens rev_data.tokens_removed = len(tokens_removed) - rev_data.tokens_window = len(window)-(i+1) + rev_data.tokens_window = len(window) - (i + 1) self.print_rev_data(rev_data) page_count += 1 @@ -622,11 +641,11 @@ class WikiqParser(): else: self.output_file.close() - """ For performance reasons it's better to write parquet in batches instead of one row at a time. So this function just puts the data on a buffer. If the buffer is full, then it gets flushed (written). """ + def write_parquet_row(self, rev_data): padata = rev_data.to_pyarrow() self.parquet_buffer.append(padata) @@ -634,16 +653,17 @@ class WikiqParser(): if len(self.parquet_buffer) >= self.parquet_buffer_size: self.flush_parquet_buffer() - """ Function that actually writes data to the parquet file. It needs to transpose the data from row-by-row to column-by-column """ + def flush_parquet_buffer(self): """ Returns the pyarrow table that we'll write """ + def rows_to_table(rg, schema): cols = [] first = rg[0] @@ -661,18 +681,18 @@ class WikiqParser(): outtable = rows_to_table(self.parquet_buffer, self.schema) if self.pq_writer is None: - self.pq_writer = pq.ParquetWriter(self.output_file, schema, flavor='spark') + self.pq_writer = pq.ParquetWriter(self.output_file, self.schema, flavor='spark') self.pq_writer.write_table(outtable) self.parquet_buffer = [] - + # depending on if we are configured to write tsv or parquet, we'll call a different function. def print_rev_data(self, rev_data): if self.output_parquet is False: printfunc = self.write_tsv_row else: printfunc = self.write_parquet_row - + printfunc(rev_data) def write_tsv_row(self, rev_data): @@ -686,11 +706,11 @@ class WikiqParser(): def open_input_file(input_filename): if re.match(r'.*\.7z$', input_filename): - cmd = ["7za", "x", "-so", input_filename, "*.xml"] + cmd = ["7za", "x", "-so", input_filename, "*.xml"] elif re.match(r'.*\.gz$', input_filename): - cmd = ["zcat", input_filename] + cmd = ["zcat", input_filename] elif re.match(r'.*\.bz2$', input_filename): - cmd = ["bzcat", "-dk", input_filename] + cmd = ["bzcat", "-dk", input_filename] try: input_file = Popen(cmd, stdout=PIPE).stdout @@ -699,7 +719,8 @@ def open_input_file(input_filename): return input_file -def get_output_filename(input_filename, parquet = False): + +def get_output_filename(input_filename, parquet=False): output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename) output_filename = re.sub(r'\.xml', '', output_filename) if parquet is False: @@ -708,16 +729,18 @@ def get_output_filename(input_filename, parquet = False): output_filename = output_filename + ".parquet" return output_filename + def open_output_file(input_filename): # create a regex that creates the output filename - output_filename = get_output_filename(input_filename, parquet = False) + output_filename = get_output_filename(input_filename, parquet=False) output_file = open(output_filename, "w") return output_file + parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimitted data.') # arguments for the input direction -parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str, +parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str, help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.") parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1, @@ -729,7 +752,8 @@ parser.add_argument('-s', '--stdout', dest="stdout", action="store_true", parser.add_argument('--collapse-user', dest="collapse_user", action="store_true", help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.") -parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?', +parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, + choices=['', 'segment', 'sequence', 'legacy'], nargs='?', help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. The defualt is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.") parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true", @@ -749,19 +773,19 @@ parser.add_argument('-rr', parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append', help="The regular expression to search for in revision text. The regex must be surrounded by quotes.") -parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append', +parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, + action='append', help="The label for the outputted column based on matching the regex in revision text.") parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append', help="The regular expression to search for in comments of revisions.") -parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append', +parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, + action='append', help="The label for the outputted column based on matching the regex in comments.") args = parser.parse_args() - - # set persistence method if args.persist is None: @@ -798,7 +822,7 @@ if len(args.dumpfiles) > 0: output_file = sys.stdout else: filename = os.path.join(output_dir, os.path.basename(filename)) - output_file = get_output_filename(filename, parquet = output_parquet) + output_file = get_output_filename(filename, parquet=output_parquet) wikiq = WikiqParser(input_file, output_file, @@ -807,15 +831,15 @@ if len(args.dumpfiles) > 0: urlencode=args.urlencode, namespaces=namespaces, revert_radius=args.revert_radius, - regex_match_revision = args.regex_match_revision, - regex_revision_label = args.regex_revision_label, - regex_match_comment = args.regex_match_comment, - regex_comment_label = args.regex_comment_label, + regex_match_revision=args.regex_match_revision, + regex_revision_label=args.regex_revision_label, + regex_match_comment=args.regex_match_comment, + regex_comment_label=args.regex_comment_label, output_parquet=output_parquet) wikiq.process() - # close things + # close things input_file.close() else: @@ -823,16 +847,16 @@ else: sys.stdout, collapse_user=args.collapse_user, persist=persist, - #persist_legacy=args.persist_legacy, + # persist_legacy=args.persist_legacy, urlencode=args.urlencode, namespaces=namespaces, revert_radius=args.revert_radius, - regex_match_revision = args.regex_match_revision, - regex_revision_label = args.regex_revision_label, - regex_match_comment = args.regex_match_comment, - regex_comment_label = args.regex_comment_label) + regex_match_revision=args.regex_match_revision, + regex_revision_label=args.regex_revision_label, + regex_match_comment=args.regex_match_comment, + regex_comment_label=args.regex_comment_label) - wikiq.process() + wikiq.process() # stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your" # stop_words = stop_words.split(",") From 09a84e7d11b4de51b6824ef65236a262a26aa055 Mon Sep 17 00:00:00 2001 From: Will Beason Date: Mon, 26 May 2025 15:07:39 -0500 Subject: [PATCH 04/11] Reformat Wikiq_Unit_Test.py Separate out reformatting from editing. Signed-off-by: Will Beason --- test/Wikiq_Unit_Test.py | 160 +++++++++++++++++++--------------------- 1 file changed, 75 insertions(+), 85 deletions(-) diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py index 7f4df39..0cb78dc 100644 --- a/test/Wikiq_Unit_Test.py +++ b/test/Wikiq_Unit_Test.py @@ -9,6 +9,7 @@ import tracemalloc tracemalloc.start() + # with / without pwr DONE # with / without url encode DONE # with / without collapse user DONE @@ -25,25 +26,25 @@ class Test_Wikipedia(unittest.TestCase): os.mkdir("test_output") self.wiki = 'ikwiki-20180301-pages-meta-history' - self.wikiq_out_name = self.wiki + ".tsv" + self.wikiq_out_name = self.wiki + ".tsv" self.test_output_dir = os.path.join(".", "test_output") self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name) - self.infile = "{0}.xml.bz2".format(self.wiki) + self.infile = "{0}.xml.bz2".format(self.wiki) self.base_call = "../wikiq {0} -o {1}" self.input_dir = "dumps" - self.input_file = os.path.join(".", self.input_dir,self.infile) + self.input_file = os.path.join(".", self.input_dir, self.infile) self.baseline_output_dir = "baseline_output" def test_WP_url_encode(self): - test_filename = "url-encode_" + self.wikiq_out_name + test_filename = "url-encode_" + self.wikiq_out_name test_file = os.path.join(self.test_output_dir, test_filename) if os.path.exists(test_file): os.remove(test_file) - + call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " --url-encode" - proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) + proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) proc.wait() copyfile(self.call_output, test_file) @@ -52,52 +53,51 @@ class Test_Wikipedia(unittest.TestCase): # as a test let's make sure that we get equal data frames test = pd.read_table(test_file) baseline = pd.read_table(baseline_file) - assert_frame_equal(test,baseline, check_like=True) + assert_frame_equal(test, baseline, check_like=True) def test_WP_namespaces(self): print(os.path.abspath('.')) - test_filename = "namespaces_" + self.wikiq_out_name + test_filename = "namespaces_" + self.wikiq_out_name test_file = os.path.join(self.test_output_dir, test_filename) if os.path.exists(test_file): os.remove(test_file) - + call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " -n 0 -n 1" print(call) - proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) + proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) proc.wait() copyfile(self.call_output, test_file) baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename) # as a test let's make sure that we get equal data frames test = pd.read_table(test_file) - num_wrong_ns = sum(~ test.namespace.isin({0,1})) + num_wrong_ns = sum(~ test.namespace.isin({0, 1})) self.assertEqual(num_wrong_ns, 0) baseline = pd.read_table(baseline_file) - assert_frame_equal(test,baseline, check_like=True) + assert_frame_equal(test, baseline, check_like=True) def test_WP_revert_radius(self): print(os.path.abspath('.')) - test_filename = "revert_radius_" + self.wikiq_out_name + test_filename = "revert_radius_" + self.wikiq_out_name test_file = os.path.join(self.test_output_dir, test_filename) if os.path.exists(test_file): os.remove(test_file) - + call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " -n 0 -n 1 -rr 1" print(call) - proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) + proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) proc.wait() copyfile(self.call_output, test_file) baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename) # as a test let's make sure that we get equal data frames test = pd.read_table(test_file) - num_wrong_ns = sum(~ test.namespace.isin({0,1})) + num_wrong_ns = sum(~ test.namespace.isin({0, 1})) self.assertEqual(num_wrong_ns, 0) baseline = pd.read_table(baseline_file) - assert_frame_equal(test,baseline, check_like=True) - + assert_frame_equal(test, baseline, check_like=True) class Test_Basic(unittest.TestCase): @@ -107,25 +107,25 @@ class Test_Basic(unittest.TestCase): os.mkdir("test_output") self.wiki = 'sailormoon' - self.wikiq_out_name = self.wiki + ".tsv" + self.wikiq_out_name = self.wiki + ".tsv" self.test_output_dir = os.path.join(".", "test_output") self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name) self.infile = "{0}.xml.7z".format(self.wiki) self.base_call = "../wikiq {0} -o {1}" self.input_dir = "dumps" - self.input_file = os.path.join(".", self.input_dir,self.infile) + self.input_file = os.path.join(".", self.input_dir, self.infile) self.baseline_output_dir = "baseline_output" def test_noargs(self): - test_filename = "noargs_" + self.wikiq_out_name + test_filename = "noargs_" + self.wikiq_out_name test_file = os.path.join(self.test_output_dir, test_filename) if os.path.exists(test_file): os.remove(test_file) - + call = self.base_call.format(self.input_file, self.test_output_dir) - proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) + proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) proc.wait() copyfile(self.call_output, test_file) @@ -134,19 +134,18 @@ class Test_Basic(unittest.TestCase): test = pd.read_table(test_file) baseline = pd.read_table(baseline_file) - assert_frame_equal(test,baseline, check_like=True) - + assert_frame_equal(test, baseline, check_like=True) def test_collapse_user(self): - test_filename = "collapse-user_" + self.wikiq_out_name + test_filename = "collapse-user_" + self.wikiq_out_name test_file = os.path.join(self.test_output_dir, test_filename) if os.path.exists(test_file): os.remove(test_file) - + call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " --collapse-user" - proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) + proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) proc.wait() copyfile(self.call_output, test_file) @@ -154,60 +153,57 @@ class Test_Basic(unittest.TestCase): baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) test = pd.read_table(test_file) baseline = pd.read_table(baseline_file) - assert_frame_equal(test,baseline, check_like=True) + assert_frame_equal(test, baseline, check_like=True) def test_pwr_segment(self): - test_filename = "persistence_segment_" + self.wikiq_out_name + test_filename = "persistence_segment_" + self.wikiq_out_name test_file = os.path.join(self.test_output_dir, test_filename) if os.path.exists(test_file): os.remove(test_file) - + call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " --persistence segment" - proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) + proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) proc.wait() - copyfile(self.call_output, test_file) baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) test = pd.read_table(test_file) baseline = pd.read_table(baseline_file) - assert_frame_equal(test,baseline, check_like=True) + assert_frame_equal(test, baseline, check_like=True) def test_pwr_legacy(self): - test_filename = "persistence_legacy_" + self.wikiq_out_name + test_filename = "persistence_legacy_" + self.wikiq_out_name test_file = os.path.join(self.test_output_dir, test_filename) if os.path.exists(test_file): os.remove(test_file) - + call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " --persistence legacy" - proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) + proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) proc.wait() - copyfile(self.call_output, test_file) baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) test = pd.read_table(test_file) baseline = pd.read_table(baseline_file) - assert_frame_equal(test,baseline, check_like=True) + assert_frame_equal(test, baseline, check_like=True) def test_pwr(self): - test_filename = "persistence_" + self.wikiq_out_name + test_filename = "persistence_" + self.wikiq_out_name test_file = os.path.join(self.test_output_dir, test_filename) - if os.path.exists(test_file): - os.remove(test_file) - + if os.path.exists(test_file): + os.remove(test_file) + call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " --persistence" - proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) + proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) proc.wait() - copyfile(self.call_output, test_file) baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) @@ -216,19 +212,18 @@ class Test_Basic(unittest.TestCase): baseline = pd.read_table(baseline_file) test = test.reindex(columns=sorted(test.columns)) - assert_frame_equal(test,baseline, check_like=True) - + assert_frame_equal(test, baseline, check_like=True) def test_url_encode(self): - test_filename = "url-encode_" + self.wikiq_out_name + test_filename = "url-encode_" + self.wikiq_out_name test_file = os.path.join(self.test_output_dir, test_filename) if os.path.exists(test_file): os.remove(test_file) - + call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " --url-encode" - proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) + proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) proc.wait() copyfile(self.call_output, test_file) @@ -237,7 +232,7 @@ class Test_Basic(unittest.TestCase): baseline = pd.read_table(baseline_file) test = test.reindex(columns=sorted(test.columns)) - assert_frame_equal(test,baseline, check_like=True) + assert_frame_equal(test, baseline, check_like=True) class Test_Malformed(unittest.TestCase): @@ -246,42 +241,40 @@ class Test_Malformed(unittest.TestCase): os.mkdir("test_output") self.wiki = 'twinpeaks' - self.wikiq_out_name = self.wiki + ".tsv" + self.wikiq_out_name = self.wiki + ".tsv" self.test_output_dir = os.path.join(".", "test_output") - self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name) + self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name) self.infile = "{0}.xml.7z".format(self.wiki) self.base_call = "../wikiq {0} -o {1}" self.input_dir = "dumps" - self.input_file = os.path.join(".", self.input_dir,self.infile) - + self.input_file = os.path.join(".", self.input_dir, self.infile) def test_malformed_noargs(self): - call = self.base_call.format(self.input_file, self.test_output_dir) - proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE, shell=True) + proc = subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) proc.wait() outs, errs = proc.communicate() errlines = str(errs).split("\\n") - self.assertEqual(errlines[-2],'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0') + self.assertEqual(errlines[-2], 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0') + class Test_Stdout(unittest.TestCase): def setUp(self): self.wiki = 'sailormoon' - self.wikiq_out_name = self.wiki + ".tsv" + self.wikiq_out_name = self.wiki + ".tsv" self.infile = "{0}.xml.7z".format(self.wiki) self.base_call = "../wikiq {0} --stdout" self.input_dir = "dumps" - self.input_file = os.path.join(".", self.input_dir,self.infile) + self.input_file = os.path.join(".", self.input_dir, self.infile) self.baseline_output_dir = "baseline_output" def test_noargs(self): - call = self.base_call.format(self.input_file) print(call) - proc = subprocess.run(call,stdout=subprocess.PIPE,shell=True) + proc = subprocess.run(call, stdout=subprocess.PIPE, shell=True) outs = proc.stdout.decode("utf8") test_file = "noargs_" + self.wikiq_out_name @@ -289,7 +282,8 @@ class Test_Stdout(unittest.TestCase): print(baseline_file) test = pd.read_table(StringIO(outs)) baseline = pd.read_table(baseline_file) - assert_frame_equal(test,baseline, check_like=True) + assert_frame_equal(test, baseline, check_like=True) + class Test_Regex(unittest.TestCase): @@ -299,7 +293,7 @@ class Test_Regex(unittest.TestCase): self.infile = "{0}.xml.bz2".format(self.wiki) self.input_dir = "dumps" - self.input_file = os.path.join(".", self.input_dir,self.infile) + self.input_file = os.path.join(".", self.input_dir, self.infile) if not os.path.exists("test_output"): os.mkdir("test_output") @@ -314,13 +308,13 @@ class Test_Regex(unittest.TestCase): # sample inputs for checking that bad inputs get terminated / test_regex_inputs self.bad_inputs_list = [ - #label is missing - "-RP '\\b\\d+\\b'", - #number of reg and number of labels do not match + # label is missing + "-RP '\\b\\d+\\b'", + # number of reg and number of labels do not match "-RP 'NPO V' -RP THE -RPl testlabel", - #cp but rp label + # cp but rp label "-CP '(Tamil|Li)' -RPl testlabel", - #regex is missing + # regex is missing "-CPl testlabel", "-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'" ] @@ -330,35 +324,32 @@ class Test_Regex(unittest.TestCase): "-RP '\\b\\d{3}\\b' -RPl threedigits", "-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word", "-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning", - "-CP 'WP:EVADE' -CPl wp_evade" + "-CP 'WP:EVADE' -CPl wp_evade" ] - self.cap_inputs_list = [ "-RP 'Li Chevalier' -RPl li_cheval -CP '(?P\\b[a-zA-Z]{3}\\b)|(?P\\b\\d+\\b)|(?P\\bcat\\b)' -CPl three", "-CP '(?P\\bTestCaseA\\b)|(?P\\bTestCaseB\\b)|(?P\\bTestCaseC\\b)|(?P\\bTestCaseD\\b)' -CPl testcase -RP '(?Pnpov|NPOV)|(?Pneutral point of view)' -RPl npov" ] - - def test_regex_inputs(self): for input in self.bad_inputs_list: call = self.base_call.format(self.input_file) call = call + " --stdout " + input print(call) - proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True) - stdout,stderr = proc.communicate() - #print(proc.returncode) - + proc = subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) + stdout, stderr = proc.communicate() + # print(proc.returncode) + # we want to check that the bad inputs were caught and sys.exit is stopping the code print(stderr.decode("utf-8")) - self.assertNotEqual(proc.returncode,0) + self.assertNotEqual(proc.returncode, 0) def test_basic_regex(self): for i, input in enumerate(self.good_inputs_list): test_filename = "basic_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i)) - #print(test_filename) + # print(test_filename) test_file = os.path.join(self.test_output_dir, test_filename) if os.path.exists(test_file): os.remove(test_file) @@ -367,18 +358,17 @@ class Test_Regex(unittest.TestCase): call = call + " " + input print(call) - proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True) + proc = subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) proc.wait() copyfile(self.call_output, test_file) test = pd.read_table(test_file) - + baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) baseline = pd.read_table(baseline_file) assert_frame_equal(test, baseline, check_like=True) print(i) - def test_capturegroup_regex(self): for i, input in enumerate(self.cap_inputs_list): test_filename = "capturegroup_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i)) @@ -391,13 +381,13 @@ class Test_Regex(unittest.TestCase): call = call + " " + input print(call) - proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True) + proc = subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) proc.wait() copyfile(self.call_output, test_file) - + test = pd.read_table(test_file) - + baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) baseline = pd.read_table(baseline_file) assert_frame_equal(test, baseline, check_like=True) From 6d133575c75958f45852350baae608b930e04821 Mon Sep 17 00:00:00 2001 From: Will Beason Date: Mon, 26 May 2025 15:08:47 -0500 Subject: [PATCH 05/11] Remove resource leaks from tests Close subprocesses within tests to fix resource leak warning. Signed-off-by: Will Beason --- test/Wikiq_Unit_Test.py | 88 +++++++++++++++++++++++++---------------- wikiq | 13 +++--- 2 files changed, 60 insertions(+), 41 deletions(-) diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py index 0cb78dc..d78ed32 100644 --- a/test/Wikiq_Unit_Test.py +++ b/test/Wikiq_Unit_Test.py @@ -44,8 +44,10 @@ class Test_Wikipedia(unittest.TestCase): call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " --url-encode" - proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) - proc.wait() + print(call) + with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: + proc.wait() + assert (proc.returncode == 0) copyfile(self.call_output, test_file) baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) @@ -65,8 +67,10 @@ class Test_Wikipedia(unittest.TestCase): call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " -n 0 -n 1" print(call) - proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) - proc.wait() + with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: + proc.wait() + assert (proc.returncode == 0) + copyfile(self.call_output, test_file) baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename) @@ -87,8 +91,10 @@ class Test_Wikipedia(unittest.TestCase): call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " -n 0 -n 1 -rr 1" print(call) - proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) - proc.wait() + with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: + proc.wait() + assert (proc.returncode == 0) + copyfile(self.call_output, test_file) baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename) @@ -125,8 +131,10 @@ class Test_Basic(unittest.TestCase): os.remove(test_file) call = self.base_call.format(self.input_file, self.test_output_dir) - proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) - proc.wait() + print(call) + with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: + proc.wait() + assert (proc.returncode == 0) copyfile(self.call_output, test_file) @@ -144,9 +152,10 @@ class Test_Basic(unittest.TestCase): call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " --collapse-user" - - proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) - proc.wait() + print(call) + with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: + proc.wait() + assert (proc.returncode == 0) copyfile(self.call_output, test_file) @@ -163,8 +172,10 @@ class Test_Basic(unittest.TestCase): call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " --persistence segment" - proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) - proc.wait() + print(call) + with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: + proc.wait() + assert (proc.returncode == 0) copyfile(self.call_output, test_file) @@ -182,8 +193,10 @@ class Test_Basic(unittest.TestCase): call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " --persistence legacy" - proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) - proc.wait() + print(call) + with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: + proc.wait() + assert (proc.returncode == 0) copyfile(self.call_output, test_file) @@ -201,8 +214,10 @@ class Test_Basic(unittest.TestCase): call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " --persistence" - proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) - proc.wait() + print(call) + with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: + proc.wait() + assert (proc.returncode == 0) copyfile(self.call_output, test_file) @@ -223,8 +238,9 @@ class Test_Basic(unittest.TestCase): call = self.base_call.format(self.input_file, self.test_output_dir) call = call + " --url-encode" - proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) - proc.wait() + with subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) as proc: + proc.wait() + assert (proc.returncode == 0) copyfile(self.call_output, test_file) baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) @@ -252,11 +268,13 @@ class Test_Malformed(unittest.TestCase): def test_malformed_noargs(self): call = self.base_call.format(self.input_file, self.test_output_dir) - proc = subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - proc.wait() - outs, errs = proc.communicate() - errlines = str(errs).split("\\n") - self.assertEqual(errlines[-2], 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0') + print(call) + with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: + proc.wait() + self.assertNotEqual(proc.returncode, 0) + outs, errs = proc.communicate() + errlines = str(errs).split("\\n") + self.assertEqual(errlines[-2], 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0') class Test_Stdout(unittest.TestCase): @@ -337,13 +355,12 @@ class Test_Regex(unittest.TestCase): call = self.base_call.format(self.input_file) call = call + " --stdout " + input print(call) - proc = subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - stdout, stderr = proc.communicate() - # print(proc.returncode) + with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: + stdout, stderr = proc.communicate() + # we want to check that the bad inputs were caught and sys.exit is stopping the code + print(stderr.decode("utf-8")) - # we want to check that the bad inputs were caught and sys.exit is stopping the code - print(stderr.decode("utf-8")) - self.assertNotEqual(proc.returncode, 0) + self.assertNotEqual(proc.returncode, 0) def test_basic_regex(self): for i, input in enumerate(self.good_inputs_list): @@ -357,9 +374,10 @@ class Test_Regex(unittest.TestCase): call = self.base_call_outs.format(self.input_file, self.test_output_dir) call = call + " " + input print(call) + with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: + proc.wait() + assert (proc.returncode == 0) - proc = subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - proc.wait() copyfile(self.call_output, test_file) test = pd.read_table(test_file) @@ -381,8 +399,10 @@ class Test_Regex(unittest.TestCase): call = call + " " + input print(call) - proc = subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - proc.wait() + print(call) + with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: + proc.wait() + assert (proc.returncode == 0) copyfile(self.call_output, test_file) diff --git a/wikiq b/wikiq index 7b59720..3c41f1d 100755 --- a/wikiq +++ b/wikiq @@ -97,6 +97,7 @@ class WikiqPage(): # 3 A B True # 4 A A False # Post-loop A Always + collapsed_revs = 0 for i, rev in enumerate(self.mwpage): # never yield the first time if i == 0: @@ -491,7 +492,7 @@ class WikiqParser: # Construct dump file iterator dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user) - # extract list of namspaces + # extract list of namespaces self.namespaces = {ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces} page_count = 0 @@ -703,7 +704,6 @@ class WikiqParser: line = rev_data.to_tsv_row() print(line, file=self.output_file) - def open_input_file(input_filename): if re.match(r'.*\.7z$', input_filename): cmd = ["7za", "x", "-so", input_filename, "*.xml"] @@ -711,14 +711,13 @@ def open_input_file(input_filename): cmd = ["zcat", input_filename] elif re.match(r'.*\.bz2$', input_filename): cmd = ["bzcat", "-dk", input_filename] + else: + raise ValueError("Unrecognized file type: %s" % input_filename) try: - input_file = Popen(cmd, stdout=PIPE).stdout + return Popen(cmd, stdout=PIPE).stdout except NameError: - input_file = open(input_filename, 'r') - - return input_file - + return open(input_filename, 'r') def get_output_filename(input_filename, parquet=False): output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename) From 3d0bf89938705b2b474f3ebb737ce97f156a2328 Mon Sep 17 00:00:00 2001 From: Will Beason Date: Tue, 27 May 2025 11:10:42 -0500 Subject: [PATCH 06/11] Move main logic to main() This avoids: 1) the main function running when sourcing the file 2) Creating many globally-scoped variables in the main logic Also begin refactor of test output file logic Signed-off-by: Will Beason --- test/Wikiq_Unit_Test.py | 45 ++++----- wikiq | 200 +++++++++++++++++++++------------------- 2 files changed, 128 insertions(+), 117 deletions(-) diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py index d78ed32..be4777f 100644 --- a/test/Wikiq_Unit_Test.py +++ b/test/Wikiq_Unit_Test.py @@ -22,9 +22,6 @@ tracemalloc.start() class Test_Wikipedia(unittest.TestCase): def setUp(self): - if not os.path.exists("test_output"): - os.mkdir("test_output") - self.wiki = 'ikwiki-20180301-pages-meta-history' self.wikiq_out_name = self.wiki + ".tsv" self.test_output_dir = os.path.join(".", "test_output") @@ -109,9 +106,6 @@ class Test_Wikipedia(unittest.TestCase): class Test_Basic(unittest.TestCase): def setUp(self): - if not os.path.exists("test_output"): - os.mkdir("test_output") - self.wiki = 'sailormoon' self.wikiq_out_name = self.wiki + ".tsv" self.test_output_dir = os.path.join(".", "test_output") @@ -318,14 +312,14 @@ class Test_Regex(unittest.TestCase): self.test_output_dir = os.path.join(".", "test_output") self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name) - # we have two base calls, one for checking inputs and the other for checking outputs + # we have two base calls, one for checking arguments and the other for checking outputs self.base_call = "../wikiq {0}" self.base_call_outs = "../wikiq {0} -o {1}" self.baseline_output_dir = "baseline_output" - # sample inputs for checking that bad inputs get terminated / test_regex_inputs - self.bad_inputs_list = [ + # sample arguments for checking that bad arguments get terminated / test_regex_arguments + self.bad_arguments_list = [ # label is missing "-RP '\\b\\d+\\b'", # number of reg and number of labels do not match @@ -337,33 +331,33 @@ class Test_Regex(unittest.TestCase): "-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'" ] - # sample inputs for checking the outcomes of good inputs / test_basic_regex - self.good_inputs_list = [ + # sample arguments for checking the outcomes of good arguments / test_basic_regex + self.good_arguments_list = [ "-RP '\\b\\d{3}\\b' -RPl threedigits", "-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word", "-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning", "-CP 'WP:EVADE' -CPl wp_evade" ] - self.cap_inputs_list = [ + self.cap_arguments_list = [ "-RP 'Li Chevalier' -RPl li_cheval -CP '(?P\\b[a-zA-Z]{3}\\b)|(?P\\b\\d+\\b)|(?P\\bcat\\b)' -CPl three", "-CP '(?P\\bTestCaseA\\b)|(?P\\bTestCaseB\\b)|(?P\\bTestCaseC\\b)|(?P\\bTestCaseD\\b)' -CPl testcase -RP '(?Pnpov|NPOV)|(?Pneutral point of view)' -RPl npov" ] - def test_regex_inputs(self): - for input in self.bad_inputs_list: + def test_regex_arguments(self): + for arguments in self.bad_arguments_list: call = self.base_call.format(self.input_file) - call = call + " --stdout " + input + call = call + " --stdout " + arguments print(call) with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: stdout, stderr = proc.communicate() - # we want to check that the bad inputs were caught and sys.exit is stopping the code + # we want to check that the bad arguments were caught and sys.exit is stopping the code print(stderr.decode("utf-8")) self.assertNotEqual(proc.returncode, 0) def test_basic_regex(self): - for i, input in enumerate(self.good_inputs_list): + for i, arguments in enumerate(self.good_arguments_list): test_filename = "basic_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i)) # print(test_filename) @@ -372,7 +366,7 @@ class Test_Regex(unittest.TestCase): os.remove(test_file) call = self.base_call_outs.format(self.input_file, self.test_output_dir) - call = call + " " + input + call = call + " " + arguments print(call) with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: proc.wait() @@ -388,7 +382,7 @@ class Test_Regex(unittest.TestCase): print(i) def test_capturegroup_regex(self): - for i, input in enumerate(self.cap_inputs_list): + for i, arguments in enumerate(self.cap_arguments_list): test_filename = "capturegroup_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i)) print(test_filename) test_file = os.path.join(self.test_output_dir, test_filename) @@ -396,10 +390,9 @@ class Test_Regex(unittest.TestCase): os.remove(test_file) call = self.base_call_outs.format(self.input_file, self.test_output_dir) - call = call + " " + input + call = call + " " + arguments print(call) - print(call) with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: proc.wait() assert (proc.returncode == 0) @@ -414,4 +407,14 @@ class Test_Regex(unittest.TestCase): if __name__ == '__main__': + # Perform directory check and reset here as this is a one-time setup step as opposed to per-test setup. + if not os.path.exists("test_output"): + os.mkdir("test_output") + else: + # Avoid subsequent calls to tests interfering with each other. + # Otherwise, a test may erroneously pass if the program has no output + # but a previous run output what was expected. + for f in os.listdir("test_output"): + os.remove(os.path.join("test_output", f)) + unittest.main() diff --git a/wikiq b/wikiq index 3c41f1d..7553d8c 100755 --- a/wikiq +++ b/wikiq @@ -44,7 +44,7 @@ def calculate_persistence(tokens_added): len(tokens_added)) -class WikiqIterator(): +class WikiqIterator: def __init__(self, fh, collapse_user=False): self.fh = fh self.collapse_user = collapse_user @@ -66,7 +66,7 @@ class WikiqIterator(): return next(self.__pages) -class WikiqPage(): +class WikiqPage: __slots__ = ('id', 'title', 'namespace', 'redirect', 'restrictions', 'mwpage', '__revisions', 'collapse_user') @@ -164,7 +164,7 @@ class RegexPair(object): return fields def _make_key(self, cap_group): - return ("{}_{}".format(self.label, cap_group)) + return "{}_{}".format(self.label, cap_group) def matchmake(self, content, rev_data): @@ -182,7 +182,7 @@ class RegexPair(object): temp_list = [] for match in matchobjects: # we only want to add the match for the capture group if the match is not None - if match.group(cap_group) != None: + if match.group(cap_group) is not None: temp_list.append(match.group(cap_group)) # if temp_list of matches is empty just make that column None @@ -234,7 +234,7 @@ The RevDataBase type has all the fields that will be output no matter how wikiq @dataclass() -class RevDataBase(): +class RevDataBase: revid: int date_time: datetime articleid: int @@ -358,7 +358,7 @@ class RevDataPersistence(RevDataBase): """ -class RevDataCollapsePersistence uses multiple inheritence to make a class that has both persistence and collapse fields. +class RevDataCollapsePersistence uses multiple inheritance to make a class that has both persistence and collapse fields. """ @@ -446,7 +446,7 @@ class WikiqParser: result.append(rp) self.regex_schemas = self.regex_schemas + rp.get_pyarrow_fields() return result - elif (patterns is None and labels is None): + elif (patterns is None) and (labels is None): return [] else: sys.exit('Each regular expression *must* come with a corresponding label and vice versa.') @@ -580,7 +580,7 @@ class WikiqParser: # TODO missing: additions_size deletions_size - # if collapse user was on, lets run that + # if collapse user was on, let's run that if self.collapse_user: rev_data.collapsed_revs = rev.collapsed_revs @@ -704,6 +704,7 @@ class WikiqParser: line = rev_data.to_tsv_row() print(line, file=self.output_file) + def open_input_file(input_filename): if re.match(r'.*\.7z$', input_filename): cmd = ["7za", "x", "-so", input_filename, "*.xml"] @@ -719,6 +720,7 @@ def open_input_file(input_filename): except NameError: return open(input_filename, 'r') + def get_output_filename(input_filename, parquet=False): output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename) output_filename = re.sub(r'\.xml', '', output_filename) @@ -736,126 +738,132 @@ def open_output_file(input_filename): return output_file -parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimitted data.') +def main(): + parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.') -# arguments for the input direction -parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str, - help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.") + # arguments for the input direction + parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str, + help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.") -parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1, - help="Directory for output files. If it ends with .parquet output will be in parquet format.") + parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1, + help="Directory for output files. If it ends with .parquet output will be in parquet format.") -parser.add_argument('-s', '--stdout', dest="stdout", action="store_true", - help="Write output to standard out (do not create dump file)") + parser.add_argument('-s', '--stdout', dest="stdout", action="store_true", + help="Write output to standard out (do not create dump file)") -parser.add_argument('--collapse-user', dest="collapse_user", action="store_true", - help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.") + parser.add_argument('--collapse-user', dest="collapse_user", action="store_true", + help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.") -parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, - choices=['', 'segment', 'sequence', 'legacy'], nargs='?', - help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. The defualt is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.") + parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, + choices=['', 'segment', 'sequence', 'legacy'], nargs='?', + help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.") -parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true", - help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.") + parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true", + help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.") -parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append', - help="Id number of namspace to include. Can be specified more than once.") + parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append', + help="Id number of namespace to include. Can be specified more than once.") -parser.add_argument('-rr', - '--revert-radius', - dest="revert_radius", - type=int, - action='store', - default=15, - help="Number of edits to check when looking for reverts (default: 15)") + parser.add_argument('-rr', + '--revert-radius', + dest="revert_radius", + type=int, + action='store', + default=15, + help="Number of edits to check when looking for reverts (default: 15)") -parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append', - help="The regular expression to search for in revision text. The regex must be surrounded by quotes.") + parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, + action='append', + help="The regular expression to search for in revision text. The regex must be surrounded by quotes.") -parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, - action='append', - help="The label for the outputted column based on matching the regex in revision text.") + parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, + action='append', + help="The label for the outputted column based on matching the regex in revision text.") -parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append', - help="The regular expression to search for in comments of revisions.") + parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append', + help="The regular expression to search for in comments of revisions.") -parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, - action='append', - help="The label for the outputted column based on matching the regex in comments.") + parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, + action='append', + help="The label for the outputted column based on matching the regex in comments.") -args = parser.parse_args() + args = parser.parse_args() -# set persistence method + # set persistence method -if args.persist is None: - persist = PersistMethod.none -elif args.persist == "segment": - persist = PersistMethod.segment -elif args.persist == "legacy": - persist = PersistMethod.legacy -else: - persist = PersistMethod.sequence + if args.persist is None: + persist = PersistMethod.none + elif args.persist == "segment": + persist = PersistMethod.segment + elif args.persist == "legacy": + persist = PersistMethod.legacy + else: + persist = PersistMethod.sequence -if args.namespace_filter is not None: - namespaces = args.namespace_filter -else: - namespaces = None + if args.namespace_filter is not None: + namespaces = args.namespace_filter + else: + namespaces = None -if len(args.dumpfiles) > 0: - output_parquet = False - for filename in args.dumpfiles: - input_file = open_input_file(filename) + if len(args.dumpfiles) > 0: + output_parquet = False + for filename in args.dumpfiles: + input_file = open_input_file(filename) - # open directory for output - if args.output_dir: - output_dir = args.output_dir[0] - else: - output_dir = "." + # open directory for output + if args.output_dir: + output_dir = args.output_dir[0] + else: + output_dir = "." - if output_dir.endswith(".parquet"): - output_parquet = True + if output_dir.endswith(".parquet"): + output_parquet = True - print("Processing file: %s" % filename, file=sys.stderr) + print("Processing file: %s" % filename, file=sys.stderr) - if args.stdout: - output_file = sys.stdout - else: - filename = os.path.join(output_dir, os.path.basename(filename)) - output_file = get_output_filename(filename, parquet=output_parquet) + if args.stdout: + output_file = sys.stdout + else: + filename = os.path.join(output_dir, os.path.basename(filename)) + output_file = get_output_filename(filename, parquet=output_parquet) - wikiq = WikiqParser(input_file, - output_file, + wikiq = WikiqParser(input_file, + output_file, + collapse_user=args.collapse_user, + persist=persist, + urlencode=args.urlencode, + namespaces=namespaces, + revert_radius=args.revert_radius, + regex_match_revision=args.regex_match_revision, + regex_revision_label=args.regex_revision_label, + regex_match_comment=args.regex_match_comment, + regex_comment_label=args.regex_comment_label, + output_parquet=output_parquet) + + wikiq.process() + + # close things + input_file.close() + + else: + wikiq = WikiqParser(sys.stdin, + sys.stdout, collapse_user=args.collapse_user, persist=persist, + # persist_legacy=args.persist_legacy, urlencode=args.urlencode, namespaces=namespaces, revert_radius=args.revert_radius, regex_match_revision=args.regex_match_revision, regex_revision_label=args.regex_revision_label, regex_match_comment=args.regex_match_comment, - regex_comment_label=args.regex_comment_label, - output_parquet=output_parquet) + regex_comment_label=args.regex_comment_label) wikiq.process() - # close things - input_file.close() + # stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your" + # stop_words = stop_words.split(",") -else: - wikiq = WikiqParser(sys.stdin, - sys.stdout, - collapse_user=args.collapse_user, - persist=persist, - # persist_legacy=args.persist_legacy, - urlencode=args.urlencode, - namespaces=namespaces, - revert_radius=args.revert_radius, - regex_match_revision=args.regex_match_revision, - regex_revision_label=args.regex_revision_label, - regex_match_comment=args.regex_match_comment, - regex_comment_label=args.regex_comment_label) - wikiq.process() - -# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your" -# stop_words = stop_words.split(",") +if __name__ == "__main__": + main() From ebc57864f2e6e6a911a016fdbe4f19270a59ff5d Mon Sep 17 00:00:00 2001 From: Will Beason Date: Tue, 27 May 2025 13:40:57 -0500 Subject: [PATCH 07/11] Make tests runnable from anywhere Tests no longer implicitly require that the caller be in a specific working directory. Signed-off-by: Will Beason --- README.rst | 5 ++ test/Wikiq_Unit_Test.py | 150 ++++++++++++++++++++-------------------- test/__init__.py | 0 3 files changed, 79 insertions(+), 76 deletions(-) create mode 100644 test/__init__.py diff --git a/README.rst b/README.rst index 761a9b3..77199c8 100644 --- a/README.rst +++ b/README.rst @@ -20,6 +20,11 @@ associated tests to work. - 7zip - ffmpeg +Tests +---- +To run tests:: + + python -m unittest test.Wikiq_Unit_Test TODO: _______________ diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py index be4777f..75cda8d 100644 --- a/test/Wikiq_Unit_Test.py +++ b/test/Wikiq_Unit_Test.py @@ -6,9 +6,13 @@ import pandas as pd from pandas.testing import assert_frame_equal from io import StringIO import tracemalloc +from typing import Final -tracemalloc.start() +# Make references to files and wikiq relative to this file, not to the current working directory. +TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__)) +WIKIQ: Final[str] = os.path.join(os.path.dirname(TEST_DIR), "wikiq") +TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output") # with / without pwr DONE # with / without url encode DONE @@ -24,27 +28,29 @@ class Test_Wikipedia(unittest.TestCase): def setUp(self): self.wiki = 'ikwiki-20180301-pages-meta-history' self.wikiq_out_name = self.wiki + ".tsv" - self.test_output_dir = os.path.join(".", "test_output") - self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name) + self.call_output = os.path.join(TEST_OUTPUT_DIR, self.wikiq_out_name) self.infile = "{0}.xml.bz2".format(self.wiki) - self.base_call = "../wikiq {0} -o {1}" - self.input_dir = "dumps" - self.input_file = os.path.join(".", self.input_dir, self.infile) - self.baseline_output_dir = "baseline_output" + + self.base_call = WIKIQ + " {0} -o {1}" + self.input_dir = os.path.join(TEST_DIR, "dumps") + self.input_file = os.path.join(TEST_DIR, self.input_dir, self.infile) + self.baseline_output_dir = os.path.join(TEST_DIR, "baseline_output") def test_WP_url_encode(self): test_filename = "url-encode_" + self.wikiq_out_name - test_file = os.path.join(self.test_output_dir, test_filename) + test_file = os.path.join(TEST_OUTPUT_DIR, test_filename) if os.path.exists(test_file): os.remove(test_file) - call = self.base_call.format(self.input_file, self.test_output_dir) + call = self.base_call.format(self.input_file, TEST_OUTPUT_DIR) call = call + " --url-encode" print(call) - with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: - proc.wait() - assert (proc.returncode == 0) + try: + subprocess.check_output(call, stderr=subprocess.PIPE, shell=True) + except subprocess.CalledProcessError as exc: + print(exc.stderr.decode("utf8")) + self.fail() copyfile(self.call_output, test_file) baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) @@ -57,16 +63,16 @@ class Test_Wikipedia(unittest.TestCase): def test_WP_namespaces(self): print(os.path.abspath('.')) test_filename = "namespaces_" + self.wikiq_out_name - test_file = os.path.join(self.test_output_dir, test_filename) + test_file = os.path.join(TEST_OUTPUT_DIR, test_filename) if os.path.exists(test_file): os.remove(test_file) - call = self.base_call.format(self.input_file, self.test_output_dir) + call = self.base_call.format(self.input_file, TEST_OUTPUT_DIR) call = call + " -n 0 -n 1" print(call) with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: proc.wait() - assert (proc.returncode == 0) + self.assertEqual(proc.returncode, 0) copyfile(self.call_output, test_file) baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename) @@ -81,16 +87,16 @@ class Test_Wikipedia(unittest.TestCase): def test_WP_revert_radius(self): print(os.path.abspath('.')) test_filename = "revert_radius_" + self.wikiq_out_name - test_file = os.path.join(self.test_output_dir, test_filename) + test_file = os.path.join(TEST_OUTPUT_DIR, test_filename) if os.path.exists(test_file): os.remove(test_file) - call = self.base_call.format(self.input_file, self.test_output_dir) + call = self.base_call.format(self.input_file, TEST_OUTPUT_DIR) call = call + " -n 0 -n 1 -rr 1" print(call) with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: proc.wait() - assert (proc.returncode == 0) + self.assertEqual(proc.returncode, 0) copyfile(self.call_output, test_file) baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename) @@ -108,27 +114,26 @@ class Test_Basic(unittest.TestCase): def setUp(self): self.wiki = 'sailormoon' self.wikiq_out_name = self.wiki + ".tsv" - self.test_output_dir = os.path.join(".", "test_output") - self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name) + self.call_output = os.path.join(TEST_OUTPUT_DIR, self.wikiq_out_name) self.infile = "{0}.xml.7z".format(self.wiki) - self.base_call = "../wikiq {0} -o {1}" - self.input_dir = "dumps" - self.input_file = os.path.join(".", self.input_dir, self.infile) - self.baseline_output_dir = "baseline_output" + self.base_call = WIKIQ + " {0} -o {1}" + self.input_dir = os.path.join(TEST_DIR, "dumps") + self.input_file = os.path.join(TEST_DIR, self.input_dir, self.infile) + self.baseline_output_dir = os.path.join(TEST_DIR, "baseline_output") def test_noargs(self): test_filename = "noargs_" + self.wikiq_out_name - test_file = os.path.join(self.test_output_dir, test_filename) + test_file = os.path.join(TEST_OUTPUT_DIR, test_filename) if os.path.exists(test_file): os.remove(test_file) - call = self.base_call.format(self.input_file, self.test_output_dir) + call = self.base_call.format(self.input_file, TEST_OUTPUT_DIR) print(call) with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: proc.wait() - assert (proc.returncode == 0) + self.assertEqual(proc.returncode, 0) copyfile(self.call_output, test_file) @@ -140,16 +145,16 @@ class Test_Basic(unittest.TestCase): def test_collapse_user(self): test_filename = "collapse-user_" + self.wikiq_out_name - test_file = os.path.join(self.test_output_dir, test_filename) + test_file = os.path.join(TEST_OUTPUT_DIR, test_filename) if os.path.exists(test_file): os.remove(test_file) - call = self.base_call.format(self.input_file, self.test_output_dir) + call = self.base_call.format(self.input_file, TEST_OUTPUT_DIR) call = call + " --collapse-user" print(call) with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: proc.wait() - assert (proc.returncode == 0) + self.assertEqual(proc.returncode, 0) copyfile(self.call_output, test_file) @@ -160,16 +165,16 @@ class Test_Basic(unittest.TestCase): def test_pwr_segment(self): test_filename = "persistence_segment_" + self.wikiq_out_name - test_file = os.path.join(self.test_output_dir, test_filename) + test_file = os.path.join(TEST_OUTPUT_DIR, test_filename) if os.path.exists(test_file): os.remove(test_file) - call = self.base_call.format(self.input_file, self.test_output_dir) + call = self.base_call.format(self.input_file, TEST_OUTPUT_DIR) call = call + " --persistence segment" print(call) with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: proc.wait() - assert (proc.returncode == 0) + self.assertEqual(proc.returncode, 0) copyfile(self.call_output, test_file) @@ -181,16 +186,16 @@ class Test_Basic(unittest.TestCase): def test_pwr_legacy(self): test_filename = "persistence_legacy_" + self.wikiq_out_name - test_file = os.path.join(self.test_output_dir, test_filename) + test_file = os.path.join(TEST_OUTPUT_DIR, test_filename) if os.path.exists(test_file): os.remove(test_file) - call = self.base_call.format(self.input_file, self.test_output_dir) + call = self.base_call.format(self.input_file, TEST_OUTPUT_DIR) call = call + " --persistence legacy" print(call) with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: proc.wait() - assert (proc.returncode == 0) + self.assertEqual(proc.returncode, 0) copyfile(self.call_output, test_file) @@ -202,16 +207,16 @@ class Test_Basic(unittest.TestCase): def test_pwr(self): test_filename = "persistence_" + self.wikiq_out_name - test_file = os.path.join(self.test_output_dir, test_filename) + test_file = os.path.join(TEST_OUTPUT_DIR, test_filename) if os.path.exists(test_file): os.remove(test_file) - call = self.base_call.format(self.input_file, self.test_output_dir) + call = self.base_call.format(self.input_file, TEST_OUTPUT_DIR) call = call + " --persistence" print(call) with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: proc.wait() - assert (proc.returncode == 0) + self.assertEqual(proc.returncode, 0) copyfile(self.call_output, test_file) @@ -226,15 +231,15 @@ class Test_Basic(unittest.TestCase): def test_url_encode(self): test_filename = "url-encode_" + self.wikiq_out_name - test_file = os.path.join(self.test_output_dir, test_filename) + test_file = os.path.join(TEST_OUTPUT_DIR, test_filename) if os.path.exists(test_file): os.remove(test_file) - call = self.base_call.format(self.input_file, self.test_output_dir) + call = self.base_call.format(self.input_file, TEST_OUTPUT_DIR) call = call + " --url-encode" with subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) as proc: proc.wait() - assert (proc.returncode == 0) + self.assertEqual(proc.returncode, 0) copyfile(self.call_output, test_file) baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) @@ -247,21 +252,17 @@ class Test_Basic(unittest.TestCase): class Test_Malformed(unittest.TestCase): def setUp(self): - if not os.path.exists("test_output"): - os.mkdir("test_output") - self.wiki = 'twinpeaks' self.wikiq_out_name = self.wiki + ".tsv" - self.test_output_dir = os.path.join(".", "test_output") - self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name) + self.call_output = os.path.join(TEST_OUTPUT_DIR, self.wikiq_out_name) self.infile = "{0}.xml.7z".format(self.wiki) - self.base_call = "../wikiq {0} -o {1}" - self.input_dir = "dumps" - self.input_file = os.path.join(".", self.input_dir, self.infile) + self.base_call = WIKIQ + " {0} -o {1}" + self.input_dir = os.path.join(TEST_DIR, "dumps") + self.input_file = os.path.join(TEST_DIR, self.input_dir, self.infile) def test_malformed_noargs(self): - call = self.base_call.format(self.input_file, self.test_output_dir) + call = self.base_call.format(self.input_file, TEST_OUTPUT_DIR) print(call) with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: proc.wait() @@ -278,10 +279,10 @@ class Test_Stdout(unittest.TestCase): self.wikiq_out_name = self.wiki + ".tsv" self.infile = "{0}.xml.7z".format(self.wiki) - self.base_call = "../wikiq {0} --stdout" - self.input_dir = "dumps" - self.input_file = os.path.join(".", self.input_dir, self.infile) - self.baseline_output_dir = "baseline_output" + self.base_call = WIKIQ + " {0} --stdout" + self.input_dir = os.path.join(TEST_DIR, "dumps") + self.input_file = os.path.join(TEST_DIR, self.input_dir, self.infile) + self.baseline_output_dir = os.path.join(TEST_DIR, "baseline_output") def test_noargs(self): call = self.base_call.format(self.input_file) @@ -298,25 +299,20 @@ class Test_Stdout(unittest.TestCase): class Test_Regex(unittest.TestCase): - def setUp(self): self.wiki = 'regextest' self.wikiq_out_name = self.wiki + '.tsv' self.infile = "{0}.xml.bz2".format(self.wiki) - self.input_dir = "dumps" - self.input_file = os.path.join(".", self.input_dir, self.infile) + self.input_dir = os.path.join(TEST_DIR, "dumps") + self.input_file = os.path.join(TEST_DIR, self.input_dir, self.infile) - if not os.path.exists("test_output"): - os.mkdir("test_output") - - self.test_output_dir = os.path.join(".", "test_output") - self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name) + self.call_output = os.path.join(TEST_OUTPUT_DIR, self.wikiq_out_name) # we have two base calls, one for checking arguments and the other for checking outputs - self.base_call = "../wikiq {0}" - self.base_call_outs = "../wikiq {0} -o {1}" + self.base_call = WIKIQ + " {0}" + self.base_call_outs = WIKIQ + " {0} -o {1}" - self.baseline_output_dir = "baseline_output" + self.baseline_output_dir = os.path.join(TEST_DIR, "baseline_output") # sample arguments for checking that bad arguments get terminated / test_regex_arguments self.bad_arguments_list = [ @@ -361,16 +357,16 @@ class Test_Regex(unittest.TestCase): test_filename = "basic_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i)) # print(test_filename) - test_file = os.path.join(self.test_output_dir, test_filename) + test_file = os.path.join(TEST_OUTPUT_DIR, test_filename) if os.path.exists(test_file): os.remove(test_file) - call = self.base_call_outs.format(self.input_file, self.test_output_dir) + call = self.base_call_outs.format(self.input_file, TEST_OUTPUT_DIR) call = call + " " + arguments print(call) with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: proc.wait() - assert (proc.returncode == 0) + self.assertEqual(proc.returncode, 0) copyfile(self.call_output, test_file) @@ -385,17 +381,17 @@ class Test_Regex(unittest.TestCase): for i, arguments in enumerate(self.cap_arguments_list): test_filename = "capturegroup_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i)) print(test_filename) - test_file = os.path.join(self.test_output_dir, test_filename) + test_file = os.path.join(TEST_OUTPUT_DIR, test_filename) if os.path.exists(test_file): os.remove(test_file) - call = self.base_call_outs.format(self.input_file, self.test_output_dir) + call = self.base_call_outs.format(self.input_file, TEST_OUTPUT_DIR) call = call + " " + arguments print(call) with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: proc.wait() - assert (proc.returncode == 0) + self.assertEqual(proc.returncode, 0) copyfile(self.call_output, test_file) @@ -407,14 +403,16 @@ class Test_Regex(unittest.TestCase): if __name__ == '__main__': + tracemalloc.start() + # Perform directory check and reset here as this is a one-time setup step as opposed to per-test setup. - if not os.path.exists("test_output"): - os.mkdir("test_output") + if not os.path.exists(TEST_OUTPUT_DIR): + os.mkdir(TEST_OUTPUT_DIR) else: # Avoid subsequent calls to tests interfering with each other. # Otherwise, a test may erroneously pass if the program has no output # but a previous run output what was expected. - for f in os.listdir("test_output"): - os.remove(os.path.join("test_output", f)) + for f in os.listdir(TEST_OUTPUT_DIR): + os.remove(os.path.join(TEST_OUTPUT_DIR, f)) unittest.main() diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 From 4d3900b54122303158028eb566b99d0dff970a10 Mon Sep 17 00:00:00 2001 From: Will Beason Date: Tue, 27 May 2025 14:27:49 -0500 Subject: [PATCH 08/11] Standardize calling for wikiq in tests This way failures show the output of stderr/etc. Also create path constant strings for use in tests to avoid repetition and make changes easier. Signed-off-by: Will Beason --- test/Wikiq_Unit_Test.py | 103 +++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 53 deletions(-) diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py index 75cda8d..08fdfc4 100644 --- a/test/Wikiq_Unit_Test.py +++ b/test/Wikiq_Unit_Test.py @@ -13,6 +13,27 @@ from typing import Final TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__)) WIKIQ: Final[str] = os.path.join(os.path.dirname(TEST_DIR), "wikiq") TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output") +BASELINE_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output") + +def setup(): + tracemalloc.start() + + # Perform directory check and reset here as this is a one-time setup step as opposed to per-test setup. + if not os.path.exists(TEST_OUTPUT_DIR): + os.mkdir(TEST_OUTPUT_DIR) + else: + # Avoid subsequent calls to tests interfering with each other. + # Otherwise, a test may erroneously pass if the program has no output + # but a previous run output what was expected. + for f in os.listdir(TEST_OUTPUT_DIR): + os.remove(os.path.join(TEST_OUTPUT_DIR, f)) + +setup() + +def call_wikiq(*args: str): + call = ' '.join([WIKIQ, *args]) + print(call) + subprocess.check_output(call, stderr=subprocess.PIPE, shell=True) # with / without pwr DONE # with / without url encode DONE @@ -26,16 +47,14 @@ TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output") class Test_Wikipedia(unittest.TestCase): def setUp(self): - self.wiki = 'ikwiki-20180301-pages-meta-history' - self.wikiq_out_name = self.wiki + ".tsv" + wiki = 'ikwiki-20180301-pages-meta-history' + self.wikiq_out_name = wiki + ".tsv" self.call_output = os.path.join(TEST_OUTPUT_DIR, self.wikiq_out_name) - self.infile = "{0}.xml.bz2".format(self.wiki) + infile = "{0}.xml.bz2".format(wiki) - self.base_call = WIKIQ + " {0} -o {1}" - self.input_dir = os.path.join(TEST_DIR, "dumps") - self.input_file = os.path.join(TEST_DIR, self.input_dir, self.infile) - self.baseline_output_dir = os.path.join(TEST_DIR, "baseline_output") + input_dir = os.path.join(TEST_DIR, "dumps") + self.input_file = os.path.join(TEST_DIR, input_dir, infile) def test_WP_url_encode(self): test_filename = "url-encode_" + self.wikiq_out_name @@ -43,17 +62,13 @@ class Test_Wikipedia(unittest.TestCase): if os.path.exists(test_file): os.remove(test_file) - call = self.base_call.format(self.input_file, TEST_OUTPUT_DIR) - call = call + " --url-encode" - print(call) try: - subprocess.check_output(call, stderr=subprocess.PIPE, shell=True) + call_wikiq(self.input_file, "-o", TEST_OUTPUT_DIR, "--url-encode") except subprocess.CalledProcessError as exc: - print(exc.stderr.decode("utf8")) - self.fail() + self.fail(exc.stderr.decode("utf8")) copyfile(self.call_output, test_file) - baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) + baseline_file = os.path.join(BASELINE_OUTPUT_DIR, test_filename) # as a test let's make sure that we get equal data frames test = pd.read_table(test_file) @@ -67,15 +82,14 @@ class Test_Wikipedia(unittest.TestCase): if os.path.exists(test_file): os.remove(test_file) - call = self.base_call.format(self.input_file, TEST_OUTPUT_DIR) - call = call + " -n 0 -n 1" - print(call) - with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: - proc.wait() - self.assertEqual(proc.returncode, 0) + try: + call_wikiq(self.input_file, "-o", TEST_OUTPUT_DIR, + "-n 0", "-n 1") + except subprocess.CalledProcessError as exc: + self.fail(exc.stderr.decode("utf8")) copyfile(self.call_output, test_file) - baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename) + baseline_file = os.path.join(BASELINE_OUTPUT_DIR, test_filename) # as a test let's make sure that we get equal data frames test = pd.read_table(test_file) @@ -91,15 +105,14 @@ class Test_Wikipedia(unittest.TestCase): if os.path.exists(test_file): os.remove(test_file) - call = self.base_call.format(self.input_file, TEST_OUTPUT_DIR) - call = call + " -n 0 -n 1 -rr 1" - print(call) - with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: - proc.wait() - self.assertEqual(proc.returncode, 0) + try: + call_wikiq(self.input_file, "-o", TEST_OUTPUT_DIR, + "-n 0", "-n 1", "-rr 1") + except subprocess.CalledProcessError as exc: + self.fail(exc.stderr.decode("utf8")) copyfile(self.call_output, test_file) - baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename) + baseline_file = os.path.join(BASELINE_OUTPUT_DIR, test_filename) # as a test let's make sure that we get equal data frames test = pd.read_table(test_file) @@ -120,7 +133,6 @@ class Test_Basic(unittest.TestCase): self.base_call = WIKIQ + " {0} -o {1}" self.input_dir = os.path.join(TEST_DIR, "dumps") self.input_file = os.path.join(TEST_DIR, self.input_dir, self.infile) - self.baseline_output_dir = os.path.join(TEST_DIR, "baseline_output") def test_noargs(self): @@ -137,7 +149,7 @@ class Test_Basic(unittest.TestCase): copyfile(self.call_output, test_file) - baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) + baseline_file = os.path.join(BASELINE_OUTPUT_DIR, test_filename) test = pd.read_table(test_file) baseline = pd.read_table(baseline_file) @@ -158,7 +170,7 @@ class Test_Basic(unittest.TestCase): copyfile(self.call_output, test_file) - baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) + baseline_file = os.path.join(BASELINE_OUTPUT_DIR, test_filename) test = pd.read_table(test_file) baseline = pd.read_table(baseline_file) assert_frame_equal(test, baseline, check_like=True) @@ -178,7 +190,7 @@ class Test_Basic(unittest.TestCase): copyfile(self.call_output, test_file) - baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) + baseline_file = os.path.join(BASELINE_OUTPUT_DIR, test_filename) test = pd.read_table(test_file) baseline = pd.read_table(baseline_file) @@ -199,7 +211,7 @@ class Test_Basic(unittest.TestCase): copyfile(self.call_output, test_file) - baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) + baseline_file = os.path.join(BASELINE_OUTPUT_DIR, test_filename) test = pd.read_table(test_file) baseline = pd.read_table(baseline_file) @@ -220,7 +232,7 @@ class Test_Basic(unittest.TestCase): copyfile(self.call_output, test_file) - baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) + baseline_file = os.path.join(BASELINE_OUTPUT_DIR, test_filename) test = pd.read_table(test_file) baseline = pd.read_table(baseline_file) @@ -242,7 +254,7 @@ class Test_Basic(unittest.TestCase): self.assertEqual(proc.returncode, 0) copyfile(self.call_output, test_file) - baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) + baseline_file = os.path.join(BASELINE_OUTPUT_DIR, test_filename) test = pd.read_table(test_file) baseline = pd.read_table(baseline_file) @@ -282,7 +294,6 @@ class Test_Stdout(unittest.TestCase): self.base_call = WIKIQ + " {0} --stdout" self.input_dir = os.path.join(TEST_DIR, "dumps") self.input_file = os.path.join(TEST_DIR, self.input_dir, self.infile) - self.baseline_output_dir = os.path.join(TEST_DIR, "baseline_output") def test_noargs(self): call = self.base_call.format(self.input_file) @@ -291,7 +302,7 @@ class Test_Stdout(unittest.TestCase): outs = proc.stdout.decode("utf8") test_file = "noargs_" + self.wikiq_out_name - baseline_file = os.path.join(".", self.baseline_output_dir, test_file) + baseline_file = os.path.join(BASELINE_OUTPUT_DIR, test_file) print(baseline_file) test = pd.read_table(StringIO(outs)) baseline = pd.read_table(baseline_file) @@ -312,8 +323,6 @@ class Test_Regex(unittest.TestCase): self.base_call = WIKIQ + " {0}" self.base_call_outs = WIKIQ + " {0} -o {1}" - self.baseline_output_dir = os.path.join(TEST_DIR, "baseline_output") - # sample arguments for checking that bad arguments get terminated / test_regex_arguments self.bad_arguments_list = [ # label is missing @@ -372,7 +381,7 @@ class Test_Regex(unittest.TestCase): test = pd.read_table(test_file) - baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) + baseline_file = os.path.join(BASELINE_OUTPUT_DIR, test_filename) baseline = pd.read_table(baseline_file) assert_frame_equal(test, baseline, check_like=True) print(i) @@ -397,22 +406,10 @@ class Test_Regex(unittest.TestCase): test = pd.read_table(test_file) - baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) + baseline_file = os.path.join(BASELINE_OUTPUT_DIR, test_filename) baseline = pd.read_table(baseline_file) assert_frame_equal(test, baseline, check_like=True) if __name__ == '__main__': - tracemalloc.start() - - # Perform directory check and reset here as this is a one-time setup step as opposed to per-test setup. - if not os.path.exists(TEST_OUTPUT_DIR): - os.mkdir(TEST_OUTPUT_DIR) - else: - # Avoid subsequent calls to tests interfering with each other. - # Otherwise, a test may erroneously pass if the program has no output - # but a previous run output what was expected. - for f in os.listdir(TEST_OUTPUT_DIR): - os.remove(os.path.join(TEST_OUTPUT_DIR, f)) - unittest.main() From c8b14c3303301fcd775ce8a34cfbb4040e32bc03 Mon Sep 17 00:00:00 2001 From: Will Beason Date: Tue, 27 May 2025 16:24:07 -0500 Subject: [PATCH 09/11] Refactor test temporary file logic and wikiq call pattern Test file refreshing and path computation is now handled by a helper. The wikiq command is now constructed and handled by a single method rather than in several ad-hoc ways. The last places relying on the working directory are now removed. Signed-off-by: Will Beason --- test/Wikiq_Unit_Test.py | 292 +++++++++++++++++----------------------- 1 file changed, 120 insertions(+), 172 deletions(-) diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py index 08fdfc4..eae8020 100644 --- a/test/Wikiq_Unit_Test.py +++ b/test/Wikiq_Unit_Test.py @@ -8,12 +8,12 @@ from io import StringIO import tracemalloc from typing import Final - # Make references to files and wikiq relative to this file, not to the current working directory. TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__)) WIKIQ: Final[str] = os.path.join(os.path.dirname(TEST_DIR), "wikiq") TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output") -BASELINE_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output") +BASELINE_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output") + def setup(): tracemalloc.start() @@ -28,12 +28,36 @@ def setup(): for f in os.listdir(TEST_OUTPUT_DIR): os.remove(os.path.join(TEST_OUTPUT_DIR, f)) + +# Always run setup, even if this is executed via "python -m unittest" rather +# than as __main__. setup() -def call_wikiq(*args: str): - call = ' '.join([WIKIQ, *args]) + +def call_wikiq(input_file: str, *args: str, out: bool = True): + if out: + call = ' '.join([WIKIQ, input_file, "-o", TEST_OUTPUT_DIR, *args]) + else: + call = ' '.join([WIKIQ, input_file, *args]) + print(call) - subprocess.check_output(call, stderr=subprocess.PIPE, shell=True) + return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True) + + +def tmp_test_file(name: str) -> (str, str): + """ + Removes any existing test file with the given name and returns the path to + the that file. + :param name: The test case name. Should be unique to each test case. + :return: The path to the test file. + """ + baseline_file = os.path.join(BASELINE_DIR, name) + test_file = os.path.join(TEST_OUTPUT_DIR, name) + if os.path.exists(test_file): + os.remove(test_file) + + return baseline_file, test_file + # with / without pwr DONE # with / without url encode DONE @@ -48,27 +72,22 @@ def call_wikiq(*args: str): class Test_Wikipedia(unittest.TestCase): def setUp(self): wiki = 'ikwiki-20180301-pages-meta-history' - self.wikiq_out_name = wiki + ".tsv" + self.wikiq_out_name = "{0}.tsv".format(wiki) self.call_output = os.path.join(TEST_OUTPUT_DIR, self.wikiq_out_name) infile = "{0}.xml.bz2".format(wiki) - input_dir = os.path.join(TEST_DIR, "dumps") self.input_file = os.path.join(TEST_DIR, input_dir, infile) def test_WP_url_encode(self): - test_filename = "url-encode_" + self.wikiq_out_name - test_file = os.path.join(TEST_OUTPUT_DIR, test_filename) - if os.path.exists(test_file): - os.remove(test_file) + baseline_file, test_file = tmp_test_file("url-encode_" + self.wikiq_out_name) try: - call_wikiq(self.input_file, "-o", TEST_OUTPUT_DIR, "--url-encode") + call_wikiq(self.input_file, "--url-encode") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) copyfile(self.call_output, test_file) - baseline_file = os.path.join(BASELINE_OUTPUT_DIR, test_filename) # as a test let's make sure that we get equal data frames test = pd.read_table(test_file) @@ -76,20 +95,14 @@ class Test_Wikipedia(unittest.TestCase): assert_frame_equal(test, baseline, check_like=True) def test_WP_namespaces(self): - print(os.path.abspath('.')) - test_filename = "namespaces_" + self.wikiq_out_name - test_file = os.path.join(TEST_OUTPUT_DIR, test_filename) - if os.path.exists(test_file): - os.remove(test_file) + baseline_file, test_file = tmp_test_file("namespaces_" + self.wikiq_out_name) try: - call_wikiq(self.input_file, "-o", TEST_OUTPUT_DIR, - "-n 0", "-n 1") + call_wikiq(self.input_file, "-n 0", "-n 1") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) copyfile(self.call_output, test_file) - baseline_file = os.path.join(BASELINE_OUTPUT_DIR, test_filename) # as a test let's make sure that we get equal data frames test = pd.read_table(test_file) @@ -99,20 +112,14 @@ class Test_Wikipedia(unittest.TestCase): assert_frame_equal(test, baseline, check_like=True) def test_WP_revert_radius(self): - print(os.path.abspath('.')) - test_filename = "revert_radius_" + self.wikiq_out_name - test_file = os.path.join(TEST_OUTPUT_DIR, test_filename) - if os.path.exists(test_file): - os.remove(test_file) + baseline_file, test_file = tmp_test_file("revert_radius_" + self.wikiq_out_name) try: - call_wikiq(self.input_file, "-o", TEST_OUTPUT_DIR, - "-n 0", "-n 1", "-rr 1") + call_wikiq(self.input_file, "-n 0", "-n 1", "-rr 1") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) copyfile(self.call_output, test_file) - baseline_file = os.path.join(BASELINE_OUTPUT_DIR, test_filename) # as a test let's make sure that we get equal data frames test = pd.read_table(test_file) @@ -125,115 +132,80 @@ class Test_Wikipedia(unittest.TestCase): class Test_Basic(unittest.TestCase): def setUp(self): - self.wiki = 'sailormoon' - self.wikiq_out_name = self.wiki + ".tsv" + wiki = 'sailormoon' + self.wikiq_out_name = wiki + ".tsv" self.call_output = os.path.join(TEST_OUTPUT_DIR, self.wikiq_out_name) - self.infile = "{0}.xml.7z".format(self.wiki) - self.base_call = WIKIQ + " {0} -o {1}" - self.input_dir = os.path.join(TEST_DIR, "dumps") - self.input_file = os.path.join(TEST_DIR, self.input_dir, self.infile) + infile = "{0}.xml.7z".format(wiki) + input_dir = os.path.join(TEST_DIR, "dumps") + self.input_file = os.path.join(TEST_DIR, input_dir, infile) def test_noargs(self): + baseline_file, test_file = tmp_test_file("noargs_" + self.wikiq_out_name) - test_filename = "noargs_" + self.wikiq_out_name - test_file = os.path.join(TEST_OUTPUT_DIR, test_filename) - if os.path.exists(test_file): - os.remove(test_file) - - call = self.base_call.format(self.input_file, TEST_OUTPUT_DIR) - print(call) - with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: - proc.wait() - self.assertEqual(proc.returncode, 0) + try: + call_wikiq(self.input_file) + except subprocess.CalledProcessError as exc: + self.fail(exc.stderr.decode("utf8")) copyfile(self.call_output, test_file) - baseline_file = os.path.join(BASELINE_OUTPUT_DIR, test_filename) - test = pd.read_table(test_file) baseline = pd.read_table(baseline_file) assert_frame_equal(test, baseline, check_like=True) def test_collapse_user(self): - test_filename = "collapse-user_" + self.wikiq_out_name - test_file = os.path.join(TEST_OUTPUT_DIR, test_filename) - if os.path.exists(test_file): - os.remove(test_file) + baseline_file, test_file = tmp_test_file("collapse-user_" + self.wikiq_out_name) - call = self.base_call.format(self.input_file, TEST_OUTPUT_DIR) - call = call + " --collapse-user" - print(call) - with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: - proc.wait() - self.assertEqual(proc.returncode, 0) + try: + call_wikiq(self.input_file, "--collapse-user") + except subprocess.CalledProcessError as exc: + self.fail(exc.stderr.decode("utf8")) copyfile(self.call_output, test_file) - baseline_file = os.path.join(BASELINE_OUTPUT_DIR, test_filename) test = pd.read_table(test_file) baseline = pd.read_table(baseline_file) assert_frame_equal(test, baseline, check_like=True) def test_pwr_segment(self): - test_filename = "persistence_segment_" + self.wikiq_out_name - test_file = os.path.join(TEST_OUTPUT_DIR, test_filename) - if os.path.exists(test_file): - os.remove(test_file) + baseline_file, test_file = tmp_test_file("persistence_segment_" + self.wikiq_out_name) - call = self.base_call.format(self.input_file, TEST_OUTPUT_DIR) - call = call + " --persistence segment" - print(call) - with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: - proc.wait() - self.assertEqual(proc.returncode, 0) + try: + call_wikiq(self.input_file, "--persistence segment") + except subprocess.CalledProcessError as exc: + self.fail(exc.stderr.decode("utf8")) copyfile(self.call_output, test_file) - baseline_file = os.path.join(BASELINE_OUTPUT_DIR, test_filename) - test = pd.read_table(test_file) baseline = pd.read_table(baseline_file) assert_frame_equal(test, baseline, check_like=True) def test_pwr_legacy(self): - test_filename = "persistence_legacy_" + self.wikiq_out_name - test_file = os.path.join(TEST_OUTPUT_DIR, test_filename) - if os.path.exists(test_file): - os.remove(test_file) + baseline_file, test_file = tmp_test_file("persistence_legacy_" + self.wikiq_out_name) - call = self.base_call.format(self.input_file, TEST_OUTPUT_DIR) - call = call + " --persistence legacy" - print(call) - with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: - proc.wait() - self.assertEqual(proc.returncode, 0) + try: + call_wikiq(self.input_file, "--persistence legacy") + except subprocess.CalledProcessError as exc: + self.fail(exc.stderr.decode("utf8")) copyfile(self.call_output, test_file) - baseline_file = os.path.join(BASELINE_OUTPUT_DIR, test_filename) - test = pd.read_table(test_file) baseline = pd.read_table(baseline_file) assert_frame_equal(test, baseline, check_like=True) def test_pwr(self): - test_filename = "persistence_" + self.wikiq_out_name - test_file = os.path.join(TEST_OUTPUT_DIR, test_filename) - if os.path.exists(test_file): - os.remove(test_file) + baseline_file, test_file = tmp_test_file("persistence_" + self.wikiq_out_name) - call = self.base_call.format(self.input_file, TEST_OUTPUT_DIR) - call = call + " --persistence" - print(call) - with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: - proc.wait() - self.assertEqual(proc.returncode, 0) + try: + call_wikiq(self.input_file, "--persistence") + except subprocess.CalledProcessError as exc: + self.fail(exc.stderr.decode("utf8")) copyfile(self.call_output, test_file) - baseline_file = os.path.join(BASELINE_OUTPUT_DIR, test_filename) - test = pd.read_table(test_file) baseline = pd.read_table(baseline_file) @@ -241,20 +213,14 @@ class Test_Basic(unittest.TestCase): assert_frame_equal(test, baseline, check_like=True) def test_url_encode(self): - test_filename = "url-encode_" + self.wikiq_out_name + baseline_file, test_file = tmp_test_file("url-encode_" + self.wikiq_out_name) - test_file = os.path.join(TEST_OUTPUT_DIR, test_filename) - if os.path.exists(test_file): - os.remove(test_file) - - call = self.base_call.format(self.input_file, TEST_OUTPUT_DIR) - call = call + " --url-encode" - with subprocess.Popen(call, stdout=subprocess.PIPE, shell=True) as proc: - proc.wait() - self.assertEqual(proc.returncode, 0) + try: + call_wikiq(self.input_file, "--url-encode") + except subprocess.CalledProcessError as exc: + self.fail(exc.stderr.decode("utf8")) copyfile(self.call_output, test_file) - baseline_file = os.path.join(BASELINE_OUTPUT_DIR, test_filename) test = pd.read_table(test_file) baseline = pd.read_table(baseline_file) @@ -264,46 +230,47 @@ class Test_Basic(unittest.TestCase): class Test_Malformed(unittest.TestCase): def setUp(self): - self.wiki = 'twinpeaks' - self.wikiq_out_name = self.wiki + ".tsv" - self.call_output = os.path.join(TEST_OUTPUT_DIR, self.wikiq_out_name) + wiki = 'twinpeaks' - self.infile = "{0}.xml.7z".format(self.wiki) - self.base_call = WIKIQ + " {0} -o {1}" - self.input_dir = os.path.join(TEST_DIR, "dumps") - self.input_file = os.path.join(TEST_DIR, self.input_dir, self.infile) + infile = "{0}.xml.7z".format(wiki) + input_dir = os.path.join(TEST_DIR, "dumps") + self.input_file = os.path.join(TEST_DIR, input_dir, infile) def test_malformed_noargs(self): - call = self.base_call.format(self.input_file, TEST_OUTPUT_DIR) - print(call) - with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: - proc.wait() - self.assertNotEqual(proc.returncode, 0) - outs, errs = proc.communicate() - errlines = str(errs).split("\\n") - self.assertEqual(errlines[-2], 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0') + want_exception = 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0' + + try: + call_wikiq(self.input_file) + except subprocess.CalledProcessError as exc: + errlines = exc.stderr.decode("utf8").splitlines() + self.assertEqual(errlines[-1], want_exception) + else: + self.fail("No exception raised, want: {}".format(want_exception)) class Test_Stdout(unittest.TestCase): def setUp(self): - self.wiki = 'sailormoon' - self.wikiq_out_name = self.wiki + ".tsv" + wiki = 'sailormoon' + self.wikiq_out_name = wiki + ".tsv" + self.call_output = os.path.join(TEST_OUTPUT_DIR, self.wikiq_out_name) - self.infile = "{0}.xml.7z".format(self.wiki) + infile = "{0}.xml.7z".format(wiki) self.base_call = WIKIQ + " {0} --stdout" - self.input_dir = os.path.join(TEST_DIR, "dumps") - self.input_file = os.path.join(TEST_DIR, self.input_dir, self.infile) + input_dir = os.path.join(TEST_DIR, "dumps") + self.input_file = os.path.join(TEST_DIR, input_dir, infile) def test_noargs(self): - call = self.base_call.format(self.input_file) - print(call) - proc = subprocess.run(call, stdout=subprocess.PIPE, shell=True) - outs = proc.stdout.decode("utf8") + baseline_file, test_file = tmp_test_file("noargs_" + self.wikiq_out_name) + + outs = "" + try: + outs = call_wikiq(self.input_file, "--stdout", out=False).decode("utf8") + except subprocess.CalledProcessError as exc: + self.fail(exc.stderr.decode("utf8")) + + copyfile(self.call_output, test_file) - test_file = "noargs_" + self.wikiq_out_name - baseline_file = os.path.join(BASELINE_OUTPUT_DIR, test_file) - print(baseline_file) test = pd.read_table(StringIO(outs)) baseline = pd.read_table(baseline_file) assert_frame_equal(test, baseline, check_like=True) @@ -311,17 +278,14 @@ class Test_Stdout(unittest.TestCase): class Test_Regex(unittest.TestCase): def setUp(self): - self.wiki = 'regextest' - self.wikiq_out_name = self.wiki + '.tsv' - self.infile = "{0}.xml.bz2".format(self.wiki) + wiki = 'regextest' + self.wikiq_out_name = wiki + '.tsv' + infile = "{0}.xml.bz2".format(wiki) - self.input_dir = os.path.join(TEST_DIR, "dumps") - self.input_file = os.path.join(TEST_DIR, self.input_dir, self.infile) + input_dir = os.path.join(TEST_DIR, "dumps") + self.input_file = os.path.join(TEST_DIR, input_dir, infile) self.call_output = os.path.join(TEST_OUTPUT_DIR, self.wikiq_out_name) - # we have two base calls, one for checking arguments and the other for checking outputs - self.base_call = WIKIQ + " {0}" - self.base_call_outs = WIKIQ + " {0} -o {1}" # sample arguments for checking that bad arguments get terminated / test_regex_arguments self.bad_arguments_list = [ @@ -351,37 +315,28 @@ class Test_Regex(unittest.TestCase): def test_regex_arguments(self): for arguments in self.bad_arguments_list: - call = self.base_call.format(self.input_file) - call = call + " --stdout " + arguments - print(call) - with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: - stdout, stderr = proc.communicate() + try: + call_wikiq(self.input_file, "--stdout", arguments, out=False) + except subprocess.CalledProcessError as exc: # we want to check that the bad arguments were caught and sys.exit is stopping the code - print(stderr.decode("utf-8")) - - self.assertNotEqual(proc.returncode, 0) + print(exc.stderr.decode("utf-8")) + else: + self.fail("No exception raised, want Exception") def test_basic_regex(self): for i, arguments in enumerate(self.good_arguments_list): - test_filename = "basic_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i)) - # print(test_filename) - test_file = os.path.join(TEST_OUTPUT_DIR, test_filename) - if os.path.exists(test_file): - os.remove(test_file) + baseline_file, test_file = tmp_test_file(test_filename) - call = self.base_call_outs.format(self.input_file, TEST_OUTPUT_DIR) - call = call + " " + arguments - print(call) - with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: - proc.wait() - self.assertEqual(proc.returncode, 0) + try: + call_wikiq(self.input_file, arguments) + except subprocess.CalledProcessError as exc: + self.fail(exc.stderr.decode("utf8")) copyfile(self.call_output, test_file) test = pd.read_table(test_file) - baseline_file = os.path.join(BASELINE_OUTPUT_DIR, test_filename) baseline = pd.read_table(baseline_file) assert_frame_equal(test, baseline, check_like=True) print(i) @@ -389,24 +344,17 @@ class Test_Regex(unittest.TestCase): def test_capturegroup_regex(self): for i, arguments in enumerate(self.cap_arguments_list): test_filename = "capturegroup_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i)) - print(test_filename) - test_file = os.path.join(TEST_OUTPUT_DIR, test_filename) - if os.path.exists(test_file): - os.remove(test_file) + baseline_file, test_file = tmp_test_file(test_filename) - call = self.base_call_outs.format(self.input_file, TEST_OUTPUT_DIR) - call = call + " " + arguments - print(call) - - with subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as proc: - proc.wait() - self.assertEqual(proc.returncode, 0) + try: + call_wikiq(self.input_file, arguments) + except subprocess.CalledProcessError as exc: + self.fail(exc.stderr.decode("utf8")) copyfile(self.call_output, test_file) test = pd.read_table(test_file) - baseline_file = os.path.join(BASELINE_OUTPUT_DIR, test_filename) baseline = pd.read_table(baseline_file) assert_frame_equal(test, baseline, check_like=True) From f3e6cc939266dacf400b19ccaef9c666fadcfbb8 Mon Sep 17 00:00:00 2001 From: Will Beason Date: Wed, 28 May 2025 09:11:36 -0500 Subject: [PATCH 10/11] Begin refactor of tests to make new tests easier to write Handle file naming logic centrally rather than requiring a dedicated class per input file. Signed-off-by: Will Beason --- test/Wikiq_Unit_Test.py | 67 +++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py index eae8020..1258724 100644 --- a/test/Wikiq_Unit_Test.py +++ b/test/Wikiq_Unit_Test.py @@ -14,6 +14,7 @@ WIKIQ: Final[str] = os.path.join(os.path.dirname(TEST_DIR), "wikiq") TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output") BASELINE_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output") +IKWIKI: Final[str] = "ikwiki-20180301-pages-meta-history" def setup(): tracemalloc.start() @@ -34,6 +35,23 @@ def setup(): setup() +class WikiqTester: + def __init__(self, + wiki: str, + case_name: str, + out_format: str = "tsv", + ): + self.input_file = os.path.join(TEST_DIR, "dumps", "{0}.xml.bz2".format(wiki)) + + self.wikiq_out_name = "{0}.{1}".format(wiki, out_format) + self.call_output = os.path.join(TEST_OUTPUT_DIR, self.wikiq_out_name) + + self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_out_name)) + self.test_file = os.path.join(TEST_OUTPUT_DIR, "{0}_{1}".format(case_name, self.wikiq_out_name)) + if os.path.exists(self.test_file): + os.remove(self.test_file) + + def call_wikiq(input_file: str, *args: str, out: bool = True): if out: call = ' '.join([WIKIQ, input_file, "-o", TEST_OUTPUT_DIR, *args]) @@ -69,67 +87,58 @@ def tmp_test_file(name: str) -> (str, str): # wikia and wikipedia data DONE # malformed xmls DONE -class Test_Wikipedia(unittest.TestCase): - def setUp(self): - wiki = 'ikwiki-20180301-pages-meta-history' - self.wikiq_out_name = "{0}.tsv".format(wiki) - self.call_output = os.path.join(TEST_OUTPUT_DIR, self.wikiq_out_name) - - infile = "{0}.xml.bz2".format(wiki) - input_dir = os.path.join(TEST_DIR, "dumps") - self.input_file = os.path.join(TEST_DIR, input_dir, infile) - +class TestWikipedia(unittest.TestCase): def test_WP_url_encode(self): - baseline_file, test_file = tmp_test_file("url-encode_" + self.wikiq_out_name) + tester = WikiqTester(IKWIKI, "url-encode") try: - call_wikiq(self.input_file, "--url-encode") + call_wikiq(tester.input_file, "--url-encode") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) - copyfile(self.call_output, test_file) + copyfile(tester.call_output, tester.test_file) # as a test let's make sure that we get equal data frames - test = pd.read_table(test_file) - baseline = pd.read_table(baseline_file) + test = pd.read_table(tester.test_file) + baseline = pd.read_table(tester.baseline_file) assert_frame_equal(test, baseline, check_like=True) def test_WP_namespaces(self): - baseline_file, test_file = tmp_test_file("namespaces_" + self.wikiq_out_name) + tester = WikiqTester(IKWIKI, "namespaces") try: - call_wikiq(self.input_file, "-n 0", "-n 1") + call_wikiq(tester.input_file, "-n 0", "-n 1") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) - copyfile(self.call_output, test_file) + copyfile(tester.call_output, tester.test_file) # as a test let's make sure that we get equal data frames - test = pd.read_table(test_file) + test = pd.read_table(tester.test_file) num_wrong_ns = sum(~ test.namespace.isin({0, 1})) self.assertEqual(num_wrong_ns, 0) - baseline = pd.read_table(baseline_file) + baseline = pd.read_table(tester.baseline_file) assert_frame_equal(test, baseline, check_like=True) def test_WP_revert_radius(self): - baseline_file, test_file = tmp_test_file("revert_radius_" + self.wikiq_out_name) + tester = WikiqTester(IKWIKI, "revert_radius") try: - call_wikiq(self.input_file, "-n 0", "-n 1", "-rr 1") + call_wikiq(tester.input_file, "-n 0", "-n 1", "-rr 1") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) - copyfile(self.call_output, test_file) + copyfile(tester.call_output, tester.test_file) # as a test let's make sure that we get equal data frames - test = pd.read_table(test_file) + test = pd.read_table(tester.test_file) num_wrong_ns = sum(~ test.namespace.isin({0, 1})) self.assertEqual(num_wrong_ns, 0) - baseline = pd.read_table(baseline_file) + baseline = pd.read_table(tester.baseline_file) assert_frame_equal(test, baseline, check_like=True) -class Test_Basic(unittest.TestCase): +class TestBasic(unittest.TestCase): def setUp(self): wiki = 'sailormoon' @@ -228,7 +237,7 @@ class Test_Basic(unittest.TestCase): assert_frame_equal(test, baseline, check_like=True) -class Test_Malformed(unittest.TestCase): +class TestMalformed(unittest.TestCase): def setUp(self): wiki = 'twinpeaks' @@ -248,7 +257,7 @@ class Test_Malformed(unittest.TestCase): self.fail("No exception raised, want: {}".format(want_exception)) -class Test_Stdout(unittest.TestCase): +class TestStdout(unittest.TestCase): def setUp(self): wiki = 'sailormoon' @@ -276,7 +285,7 @@ class Test_Stdout(unittest.TestCase): assert_frame_equal(test, baseline, check_like=True) -class Test_Regex(unittest.TestCase): +class TestRegex(unittest.TestCase): def setUp(self): wiki = 'regextest' self.wikiq_out_name = wiki + '.tsv' From df0ad1de630842466b1152b623c9ec7fcfe01333 Mon Sep 17 00:00:00 2001 From: Will Beason Date: Wed, 28 May 2025 10:11:58 -0500 Subject: [PATCH 11/11] Finish test standardization Test logic is executed within the WikiqTestCase, while WikiqTester handles creating and managing the variables tests need. Signed-off-by: Will Beason --- test/Wikiq_Unit_Test.py | 259 ++++++++++++++++------------------------ 1 file changed, 103 insertions(+), 156 deletions(-) diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py index 1258724..a45e9d9 100644 --- a/test/Wikiq_Unit_Test.py +++ b/test/Wikiq_Unit_Test.py @@ -15,6 +15,9 @@ TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output") BASELINE_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output") IKWIKI: Final[str] = "ikwiki-20180301-pages-meta-history" +SAILORMOON: Final[str] = "sailormoon" +TWINPEAKS: Final[str] = "twinpeaks" +REGEXTEST: Final[str] = "regextest" def setup(): tracemalloc.start() @@ -22,12 +25,6 @@ def setup(): # Perform directory check and reset here as this is a one-time setup step as opposed to per-test setup. if not os.path.exists(TEST_OUTPUT_DIR): os.mkdir(TEST_OUTPUT_DIR) - else: - # Avoid subsequent calls to tests interfering with each other. - # Otherwise, a test may erroneously pass if the program has no output - # but a previous run output what was expected. - for f in os.listdir(TEST_OUTPUT_DIR): - os.remove(os.path.join(TEST_OUTPUT_DIR, f)) # Always run setup, even if this is executed via "python -m unittest" rather @@ -38,44 +35,40 @@ setup() class WikiqTester: def __init__(self, wiki: str, - case_name: str, + case_name: str | None = None, + suffix: str | None = None, + in_compression: str = "bz2", out_format: str = "tsv", ): - self.input_file = os.path.join(TEST_DIR, "dumps", "{0}.xml.bz2".format(wiki)) + self.input_file = os.path.join(TEST_DIR, "dumps", "{0}.xml.{1}".format(wiki, in_compression)) - self.wikiq_out_name = "{0}.{1}".format(wiki, out_format) - self.call_output = os.path.join(TEST_OUTPUT_DIR, self.wikiq_out_name) + if suffix is None: + self.wikiq_out_name = "{0}.{1}".format(wiki, out_format) + else: + self.wikiq_out_name = "{0}_{1}.{2}".format(wiki, suffix, out_format) + self.call_output = os.path.join(TEST_OUTPUT_DIR, "{0}.{1}".format(wiki, out_format)) - self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_out_name)) - self.test_file = os.path.join(TEST_OUTPUT_DIR, "{0}_{1}".format(case_name, self.wikiq_out_name)) - if os.path.exists(self.test_file): - os.remove(self.test_file) + # If case_name is unset, there are no relevant baseline or test files. + if case_name is not None: + self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_out_name)) + self.test_file = os.path.join(TEST_OUTPUT_DIR, "{0}_{1}".format(case_name, self.wikiq_out_name)) + if os.path.exists(self.test_file): + os.remove(self.test_file) + def call_wikiq(self, *args: str, out: bool = True): + """ + Calls wikiq with the passed arguments on the input file relevant to the test. + :param args: The command line arguments to pass to wikiq. + :param out: Whether to pass an output argument to wikiq. + :return: The output of the wikiq call. + """ + if out: + call = ' '.join([WIKIQ, self.input_file, "-o", TEST_OUTPUT_DIR, *args]) + else: + call = ' '.join([WIKIQ, self.input_file, *args]) -def call_wikiq(input_file: str, *args: str, out: bool = True): - if out: - call = ' '.join([WIKIQ, input_file, "-o", TEST_OUTPUT_DIR, *args]) - else: - call = ' '.join([WIKIQ, input_file, *args]) - - print(call) - return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True) - - -def tmp_test_file(name: str) -> (str, str): - """ - Removes any existing test file with the given name and returns the path to - the that file. - :param name: The test case name. Should be unique to each test case. - :return: The path to the test file. - """ - baseline_file = os.path.join(BASELINE_DIR, name) - test_file = os.path.join(TEST_OUTPUT_DIR, name) - if os.path.exists(test_file): - os.remove(test_file) - - return baseline_file, test_file - + print(call) + return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True) # with / without pwr DONE # with / without url encode DONE @@ -87,12 +80,12 @@ def tmp_test_file(name: str) -> (str, str): # wikia and wikipedia data DONE # malformed xmls DONE -class TestWikipedia(unittest.TestCase): +class WikiqTestCase(unittest.TestCase): def test_WP_url_encode(self): tester = WikiqTester(IKWIKI, "url-encode") try: - call_wikiq(tester.input_file, "--url-encode") + tester.call_wikiq("--url-encode") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -107,7 +100,7 @@ class TestWikipedia(unittest.TestCase): tester = WikiqTester(IKWIKI, "namespaces") try: - call_wikiq(tester.input_file, "-n 0", "-n 1") + tester.call_wikiq("-n 0", "-n 1") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -124,7 +117,7 @@ class TestWikipedia(unittest.TestCase): tester = WikiqTester(IKWIKI, "revert_radius") try: - call_wikiq(tester.input_file, "-n 0", "-n 1", "-rr 1") + tester.call_wikiq("-n 0", "-n 1", "-rr 1") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -137,167 +130,124 @@ class TestWikipedia(unittest.TestCase): baseline = pd.read_table(tester.baseline_file) assert_frame_equal(test, baseline, check_like=True) - -class TestBasic(unittest.TestCase): - - def setUp(self): - wiki = 'sailormoon' - self.wikiq_out_name = wiki + ".tsv" - self.call_output = os.path.join(TEST_OUTPUT_DIR, self.wikiq_out_name) - - infile = "{0}.xml.7z".format(wiki) - input_dir = os.path.join(TEST_DIR, "dumps") - self.input_file = os.path.join(TEST_DIR, input_dir, infile) - def test_noargs(self): - baseline_file, test_file = tmp_test_file("noargs_" + self.wikiq_out_name) + tester = WikiqTester(SAILORMOON, "noargs", in_compression="7z") try: - call_wikiq(self.input_file) + tester.call_wikiq() except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) - copyfile(self.call_output, test_file) + copyfile(tester.call_output, tester.test_file) - test = pd.read_table(test_file) - baseline = pd.read_table(baseline_file) + test = pd.read_table(tester.test_file) + baseline = pd.read_table(tester.baseline_file) assert_frame_equal(test, baseline, check_like=True) def test_collapse_user(self): - baseline_file, test_file = tmp_test_file("collapse-user_" + self.wikiq_out_name) + tester = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z") try: - call_wikiq(self.input_file, "--collapse-user") + tester.call_wikiq("--collapse-user") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) - copyfile(self.call_output, test_file) + copyfile(tester.call_output, tester.test_file) - test = pd.read_table(test_file) - baseline = pd.read_table(baseline_file) + test = pd.read_table(tester.test_file) + baseline = pd.read_table(tester.baseline_file) assert_frame_equal(test, baseline, check_like=True) def test_pwr_segment(self): - baseline_file, test_file = tmp_test_file("persistence_segment_" + self.wikiq_out_name) + tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z") try: - call_wikiq(self.input_file, "--persistence segment") + tester.call_wikiq("--persistence segment") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) - copyfile(self.call_output, test_file) + copyfile(tester.call_output, tester.test_file) - test = pd.read_table(test_file) - baseline = pd.read_table(baseline_file) + test = pd.read_table(tester.test_file) + baseline = pd.read_table(tester.baseline_file) assert_frame_equal(test, baseline, check_like=True) def test_pwr_legacy(self): - baseline_file, test_file = tmp_test_file("persistence_legacy_" + self.wikiq_out_name) + tester = WikiqTester(SAILORMOON, "persistence_legacy", in_compression="7z") try: - call_wikiq(self.input_file, "--persistence legacy") + tester.call_wikiq("--persistence legacy") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) - copyfile(self.call_output, test_file) + copyfile(tester.call_output, tester.test_file) - test = pd.read_table(test_file) - baseline = pd.read_table(baseline_file) + test = pd.read_table(tester.test_file) + baseline = pd.read_table(tester.baseline_file) assert_frame_equal(test, baseline, check_like=True) def test_pwr(self): - baseline_file, test_file = tmp_test_file("persistence_" + self.wikiq_out_name) + tester = WikiqTester(SAILORMOON, "persistence", in_compression="7z") try: - call_wikiq(self.input_file, "--persistence") + tester.call_wikiq("--persistence") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) - copyfile(self.call_output, test_file) + copyfile(tester.call_output, tester.test_file) - test = pd.read_table(test_file) - baseline = pd.read_table(baseline_file) + test = pd.read_table(tester.test_file) + baseline = pd.read_table(tester.baseline_file) test = test.reindex(columns=sorted(test.columns)) assert_frame_equal(test, baseline, check_like=True) def test_url_encode(self): - baseline_file, test_file = tmp_test_file("url-encode_" + self.wikiq_out_name) + tester = WikiqTester(SAILORMOON, "url-encode", in_compression="7z") try: - call_wikiq(self.input_file, "--url-encode") + tester.call_wikiq("--url-encode") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) - copyfile(self.call_output, test_file) - test = pd.read_table(test_file) - baseline = pd.read_table(baseline_file) + copyfile(tester.call_output, tester.test_file) + test = pd.read_table(tester.test_file) + baseline = pd.read_table(tester.baseline_file) test = test.reindex(columns=sorted(test.columns)) assert_frame_equal(test, baseline, check_like=True) - -class TestMalformed(unittest.TestCase): - def setUp(self): - wiki = 'twinpeaks' - - infile = "{0}.xml.7z".format(wiki) - input_dir = os.path.join(TEST_DIR, "dumps") - self.input_file = os.path.join(TEST_DIR, input_dir, infile) - def test_malformed_noargs(self): + tester = WikiqTester(wiki=TWINPEAKS, in_compression="7z") want_exception = 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0' try: - call_wikiq(self.input_file) + tester.call_wikiq() except subprocess.CalledProcessError as exc: errlines = exc.stderr.decode("utf8").splitlines() self.assertEqual(errlines[-1], want_exception) else: self.fail("No exception raised, want: {}".format(want_exception)) + def test_stdout_noargs(self): + tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z") -class TestStdout(unittest.TestCase): - - def setUp(self): - wiki = 'sailormoon' - self.wikiq_out_name = wiki + ".tsv" - self.call_output = os.path.join(TEST_OUTPUT_DIR, self.wikiq_out_name) - - infile = "{0}.xml.7z".format(wiki) - self.base_call = WIKIQ + " {0} --stdout" - input_dir = os.path.join(TEST_DIR, "dumps") - self.input_file = os.path.join(TEST_DIR, input_dir, infile) - - def test_noargs(self): - baseline_file, test_file = tmp_test_file("noargs_" + self.wikiq_out_name) - - outs = "" try: - outs = call_wikiq(self.input_file, "--stdout", out=False).decode("utf8") + outs = tester.call_wikiq( "--stdout", out=False).decode("utf8") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) - copyfile(self.call_output, test_file) + copyfile(tester.call_output, tester.test_file) test = pd.read_table(StringIO(outs)) - baseline = pd.read_table(baseline_file) + baseline = pd.read_table(tester.baseline_file) assert_frame_equal(test, baseline, check_like=True) - -class TestRegex(unittest.TestCase): - def setUp(self): - wiki = 'regextest' - self.wikiq_out_name = wiki + '.tsv' - infile = "{0}.xml.bz2".format(wiki) - - input_dir = os.path.join(TEST_DIR, "dumps") - self.input_file = os.path.join(TEST_DIR, input_dir, infile) - - self.call_output = os.path.join(TEST_OUTPUT_DIR, self.wikiq_out_name) + def test_bad_regex(self): + tester = WikiqTester(wiki=REGEXTEST) # sample arguments for checking that bad arguments get terminated / test_regex_arguments - self.bad_arguments_list = [ + bad_arguments_list = [ # label is missing "-RP '\\b\\d+\\b'", # number of reg and number of labels do not match @@ -309,62 +259,59 @@ class TestRegex(unittest.TestCase): "-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'" ] - # sample arguments for checking the outcomes of good arguments / test_basic_regex - self.good_arguments_list = [ - "-RP '\\b\\d{3}\\b' -RPl threedigits", - "-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word", - "-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning", - "-CP 'WP:EVADE' -CPl wp_evade" - ] - - self.cap_arguments_list = [ - "-RP 'Li Chevalier' -RPl li_cheval -CP '(?P\\b[a-zA-Z]{3}\\b)|(?P\\b\\d+\\b)|(?P\\bcat\\b)' -CPl three", - "-CP '(?P\\bTestCaseA\\b)|(?P\\bTestCaseB\\b)|(?P\\bTestCaseC\\b)|(?P\\bTestCaseD\\b)' -CPl testcase -RP '(?Pnpov|NPOV)|(?Pneutral point of view)' -RPl npov" - ] - - def test_regex_arguments(self): - for arguments in self.bad_arguments_list: + for arguments in bad_arguments_list: try: - call_wikiq(self.input_file, "--stdout", arguments, out=False) + tester.call_wikiq("--stdout", arguments, out=False) except subprocess.CalledProcessError as exc: # we want to check that the bad arguments were caught and sys.exit is stopping the code print(exc.stderr.decode("utf-8")) else: self.fail("No exception raised, want Exception") - def test_basic_regex(self): - for i, arguments in enumerate(self.good_arguments_list): - test_filename = "basic_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i)) - baseline_file, test_file = tmp_test_file(test_filename) + def test_good_regex(self): + # sample arguments for checking the outcomes of good arguments / test_basic_regex + good_arguments_list = [ + "-RP '\\b\\d{3}\\b' -RPl threedigits", + "-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word", + "-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning", + "-CP 'WP:EVADE' -CPl wp_evade" + ] + + for i, arguments in enumerate(good_arguments_list): + tester = WikiqTester(wiki=REGEXTEST, case_name="basic", suffix=str(i)) try: - call_wikiq(self.input_file, arguments) + tester.call_wikiq( arguments) except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) - copyfile(self.call_output, test_file) + copyfile(tester.call_output, tester.test_file) - test = pd.read_table(test_file) + test = pd.read_table(tester.test_file) - baseline = pd.read_table(baseline_file) + baseline = pd.read_table(tester.baseline_file) assert_frame_equal(test, baseline, check_like=True) print(i) def test_capturegroup_regex(self): - for i, arguments in enumerate(self.cap_arguments_list): - test_filename = "capturegroup_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i)) - baseline_file, test_file = tmp_test_file(test_filename) + cap_arguments_list = [ + "-RP 'Li Chevalier' -RPl li_cheval -CP '(?P\\b[a-zA-Z]{3}\\b)|(?P\\b\\d+\\b)|(?P\\bcat\\b)' -CPl three", + "-CP '(?P\\bTestCaseA\\b)|(?P\\bTestCaseB\\b)|(?P\\bTestCaseC\\b)|(?P\\bTestCaseD\\b)' -CPl testcase -RP '(?Pnpov|NPOV)|(?Pneutral point of view)' -RPl npov" + ] + + for i, arguments in enumerate(cap_arguments_list): + tester = WikiqTester(wiki=REGEXTEST, case_name="capturegroup", suffix=str(i)) try: - call_wikiq(self.input_file, arguments) + tester.call_wikiq(arguments) except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) - copyfile(self.call_output, test_file) + copyfile(tester.call_output, tester.test_file) - test = pd.read_table(test_file) + test = pd.read_table(tester.test_file) - baseline = pd.read_table(baseline_file) + baseline = pd.read_table(tester.baseline_file) assert_frame_equal(test, baseline, check_like=True)