Merge branch 'parquet_support' of gitea:collective/mediawiki_dump_tools into parquet_support

Nathan TeBlunthuis 2025-05-28 21:09:13 -07:00
commit 383ee03250
6 changed files with 492 additions and 487 deletions

.gitignore

@ -3,3 +3,10 @@
*.xml.bz2
*.xml.xz
*.swp
# JetBrains
/.idea
# Python build and test output
__pycache__/
test_output/


@ -12,6 +12,19 @@ submodule like::
Wikimedia dumps are usually in a compressed format such as 7z (most common), gz, or bz2. Wikiq uses your computer's compression software to read these files. Therefore wikiq depends on
`7za`, `gzcat`, and `zcat`.
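
For reference, a sketch of the decompression commands the wikiq script in this commit invokes for each format (see `open_input_file` in the wikiq diff below)::

    7za x -so dump.xml.7z '*.xml'
    zcat dump.xml.gz
    bzcat -dk dump.xml.bz2
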
Dependencies
----------------
These non-Python dependencies must be installed on your system for wikiq and its
associated tests to work.
- 7zip
- ffmpeg
Tests
-----
To run tests::
python -m unittest test.Wikiq_Unit_Test
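
A single test method can also be run through the standard unittest selector (an example; ``WikiqTestCase`` is the test class added in this commit)::

    python -m unittest test.Wikiq_Unit_Test.WikiqTestCase.test_noargs
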
TODO:
_______________

requirements.txt

@ -0,0 +1,39 @@
attrs==25.3.0
certifi==2025.4.26
charset-normalizer==3.4.2
Cython==0.29.37
deltas==0.7.0
docopt==0.6.2
gnureadline==8.1.2
idna==3.10
jsonable==0.3.1
jsonschema==4.23.0
jsonschema-specifications==2025.4.1
mediawiki-utilities==0.4.18
mwcli==0.0.3
mwdiffs==0.0.2
mwpersistence==0.2.4
mwreverts==0.1.5
mwtypes==0.4.0
mwxml==0.3.6
pandas==2.2.3
para==0.0.8
parsimonious==0.10.0
pyarrow==20.0.0
pydub==0.25.1
PyMySQL==1.1.1
python-dateutil==2.9.0.post0
pytz==2025.2
PyYAML==5.4.1
referencing==0.36.2
regex==2024.11.6
requests==2.32.3
rpds-py==0.25.1
setuptools==80.8.0
six==1.17.0
stopit==1.1.2
typing_extensions==4.13.2
tzdata==2025.2
urllib3==2.4.0
wheel==0.45.1
yamlconf==0.2.6
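
These are pinned versions intended for a reproducible environment; with standard pip they install via `pip install -r requirements.txt`.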


@ -5,398 +5,313 @@ from shutil import copyfile
import pandas as pd import pandas as pd
from pandas.testing import assert_frame_equal from pandas.testing import assert_frame_equal
from io import StringIO from io import StringIO
import tracemalloc
from typing import Final
# Make references to files and wikiq relative to this file, not to the current working directory.
TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
WIKIQ: Final[str] = os.path.join(os.path.dirname(TEST_DIR), "wikiq")
TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output")
BASELINE_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output")
IKWIKI: Final[str] = "ikwiki-20180301-pages-meta-history"
SAILORMOON: Final[str] = "sailormoon"
TWINPEAKS: Final[str] = "twinpeaks"
REGEXTEST: Final[str] = "regextest"
def setup():
tracemalloc.start()
# Perform directory check and reset here as this is a one-time setup step as opposed to per-test setup.
if not os.path.exists(TEST_OUTPUT_DIR):
os.mkdir(TEST_OUTPUT_DIR)
# Always run setup, even if this is executed via "python -m unittest" rather
# than as __main__.
setup()
class WikiqTester:
def __init__(self,
wiki: str,
case_name: str | None = None,
suffix: str | None = None,
in_compression: str = "bz2",
out_format: str = "tsv",
):
self.input_file = os.path.join(TEST_DIR, "dumps", "{0}.xml.{1}".format(wiki, in_compression))
if suffix is None:
self.wikiq_out_name = "{0}.{1}".format(wiki, out_format)
else:
self.wikiq_out_name = "{0}_{1}.{2}".format(wiki, suffix, out_format)
self.call_output = os.path.join(TEST_OUTPUT_DIR, "{0}.{1}".format(wiki, out_format))
# If case_name is unset, there are no relevant baseline or test files.
if case_name is not None:
self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_out_name))
self.test_file = os.path.join(TEST_OUTPUT_DIR, "{0}_{1}".format(case_name, self.wikiq_out_name))
if os.path.exists(self.test_file):
os.remove(self.test_file)
def call_wikiq(self, *args: str, out: bool = True):
"""
Calls wikiq with the passed arguments on the input file relevant to the test.
:param args: The command line arguments to pass to wikiq.
:param out: Whether to pass an output argument to wikiq.
:return: The output of the wikiq call.
"""
if out:
call = ' '.join([WIKIQ, self.input_file, "-o", TEST_OUTPUT_DIR, *args])
else:
call = ' '.join([WIKIQ, self.input_file, *args])
print(call)
return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True)
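
A minimal sketch of the file paths the constructor above derives, assuming the constants defined at the top of this file:

    tester = WikiqTester(IKWIKI, case_name="url-encode")
    # tester.input_file     -> <TEST_DIR>/dumps/ikwiki-20180301-pages-meta-history.xml.bz2
    # tester.wikiq_out_name -> "ikwiki-20180301-pages-meta-history.tsv"
    # tester.call_output    -> <TEST_OUTPUT_DIR>/ikwiki-20180301-pages-meta-history.tsv
    # tester.test_file      -> <TEST_OUTPUT_DIR>/url-encode_ikwiki-20180301-pages-meta-history.tsv
    # tester.baseline_file  -> <BASELINE_DIR>/url-encode_ikwiki-20180301-pages-meta-history.tsv
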
# with / without pwr DONE
# with / without url encode DONE
# with / without collapse user DONE
# with output to stdout DONE
# note that the persistence radius is 7 by default
# reading various file formats including
# 7z, gz, bz2, xml DONE
# wikia and wikipedia data DONE
# malformed xmls DONE
class Test_Wikipedia(unittest.TestCase): class WikiqTestCase(unittest.TestCase):
def setUp(self):
if not os.path.exists("test_output"):
os.mkdir("test_output")
self.wiki = 'ikwiki-20180301-pages-meta-history'
self.wikiq_out_name = self.wiki + ".tsv"
self.test_output_dir = os.path.join(".", "test_output")
self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
self.infile = "{0}.xml.bz2".format(self.wiki)
self.base_call = "../wikiq {0} -o {1}"
self.input_dir = "dumps"
self.input_file = os.path.join(".", self.input_dir,self.infile)
self.baseline_output_dir = "baseline_output"
def test_WP_url_encode(self): def test_WP_url_encode(self):
test_filename = "url-encode_" + self.wikiq_out_name tester = WikiqTester(IKWIKI, "url-encode")
test_file = os.path.join(self.test_output_dir, test_filename)
if os.path.exists(test_file):
os.remove(test_file)
call = self.base_call.format(self.input_file, self.test_output_dir)
call = call + " --url-encode"
proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
proc.wait()
copyfile(self.call_output, test_file) try:
baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) tester.call_wikiq("--url-encode")
except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8"))
copyfile(tester.call_output, tester.test_file)
# as a test let's make sure that we get equal data frames # as a test let's make sure that we get equal data frames
test = pd.read_table(test_file) test = pd.read_table(tester.test_file)
baseline = pd.read_table(baseline_file) baseline = pd.read_table(tester.baseline_file)
assert_frame_equal(test,baseline, check_like=True) assert_frame_equal(test, baseline, check_like=True)
def test_WP_namespaces(self): def test_WP_namespaces(self):
print(os.path.abspath('.')) tester = WikiqTester(IKWIKI, "namespaces")
test_filename = "namespaces_" + self.wikiq_out_name
test_file = os.path.join(self.test_output_dir, test_filename) try:
if os.path.exists(test_file): tester.call_wikiq("-n 0", "-n 1")
os.remove(test_file) except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8"))
call = self.base_call.format(self.input_file, self.test_output_dir)
call = call + " -n 0 -n 1" copyfile(tester.call_output, tester.test_file)
print(call)
proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
proc.wait()
copyfile(self.call_output, test_file)
baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename)
# as a test let's make sure that we get equal data frames # as a test let's make sure that we get equal data frames
test = pd.read_table(test_file) test = pd.read_table(tester.test_file)
num_wrong_ns = sum(~ test.namespace.isin({0,1})) num_wrong_ns = sum(~ test.namespace.isin({0, 1}))
self.assertEqual(num_wrong_ns, 0) self.assertEqual(num_wrong_ns, 0)
baseline = pd.read_table(baseline_file) baseline = pd.read_table(tester.baseline_file)
assert_frame_equal(test,baseline, check_like=True) assert_frame_equal(test, baseline, check_like=True)
def test_WP_revert_radius(self): def test_WP_revert_radius(self):
print(os.path.abspath('.')) tester = WikiqTester(IKWIKI, "revert_radius")
test_filename = "revert_radius_" + self.wikiq_out_name
test_file = os.path.join(self.test_output_dir, test_filename) try:
if os.path.exists(test_file): tester.call_wikiq("-n 0", "-n 1", "-rr 1")
os.remove(test_file) except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8"))
call = self.base_call.format(self.input_file, self.test_output_dir)
call = call + " -n 0 -n 1 -rr 1" copyfile(tester.call_output, tester.test_file)
print(call)
proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
proc.wait()
copyfile(self.call_output, test_file)
baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename)
# as a test let's make sure that we get equal data frames # as a test let's make sure that we get equal data frames
test = pd.read_table(test_file) test = pd.read_table(tester.test_file)
num_wrong_ns = sum(~ test.namespace.isin({0,1})) num_wrong_ns = sum(~ test.namespace.isin({0, 1}))
self.assertEqual(num_wrong_ns, 0) self.assertEqual(num_wrong_ns, 0)
baseline = pd.read_table(baseline_file) baseline = pd.read_table(tester.baseline_file)
assert_frame_equal(test,baseline, check_like=True) assert_frame_equal(test, baseline, check_like=True)
class Test_Basic(unittest.TestCase):
def setUp(self):
if not os.path.exists("test_output"):
os.mkdir("test_output")
self.wiki = 'sailormoon'
self.wikiq_out_name = self.wiki + ".tsv"
self.test_output_dir = os.path.join(".", "test_output")
self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
self.infile = "{0}.xml.7z".format(self.wiki)
self.base_call = "../wikiq {0} -o {1}"
self.input_dir = "dumps"
self.input_file = os.path.join(".", self.input_dir,self.infile)
self.baseline_output_dir = "baseline_output"
def test_noargs(self): def test_noargs(self):
tester = WikiqTester(SAILORMOON, "noargs", in_compression="7z")
test_filename = "noargs_" + self.wikiq_out_name try:
test_file = os.path.join(self.test_output_dir, test_filename) tester.call_wikiq()
if os.path.exists(test_file): except subprocess.CalledProcessError as exc:
os.remove(test_file) self.fail(exc.stderr.decode("utf8"))
call = self.base_call.format(self.input_file, self.test_output_dir)
proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
proc.wait()
copyfile(self.call_output, test_file) copyfile(tester.call_output, tester.test_file)
baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
test = pd.read_table(test_file)
baseline = pd.read_table(baseline_file)
assert_frame_equal(test,baseline, check_like=True)
test = pd.read_table(tester.test_file)
baseline = pd.read_table(tester.baseline_file)
assert_frame_equal(test, baseline, check_like=True)
def test_collapse_user(self): def test_collapse_user(self):
test_filename = "collapse-user_" + self.wikiq_out_name tester = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z")
test_file = os.path.join(self.test_output_dir, test_filename)
if os.path.exists(test_file):
os.remove(test_file)
call = self.base_call.format(self.input_file, self.test_output_dir)
call = call + " --collapse-user"
proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True) try:
proc.wait() tester.call_wikiq("--collapse-user")
except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8"))
copyfile(self.call_output, test_file) copyfile(tester.call_output, tester.test_file)
baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) test = pd.read_table(tester.test_file)
test = pd.read_table(test_file) baseline = pd.read_table(tester.baseline_file)
baseline = pd.read_table(baseline_file) assert_frame_equal(test, baseline, check_like=True)
assert_frame_equal(test,baseline, check_like=True)
def test_pwr_segment(self): def test_pwr_segment(self):
test_filename = "persistence_segment_" + self.wikiq_out_name tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z")
test_file = os.path.join(self.test_output_dir, test_filename)
if os.path.exists(test_file):
os.remove(test_file)
call = self.base_call.format(self.input_file, self.test_output_dir)
call = call + " --persistence segment"
proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
proc.wait()
try:
tester.call_wikiq("--persistence segment")
except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8"))
copyfile(self.call_output, test_file) copyfile(tester.call_output, tester.test_file)
baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) test = pd.read_table(tester.test_file)
baseline = pd.read_table(tester.baseline_file)
test = pd.read_table(test_file) assert_frame_equal(test, baseline, check_like=True)
baseline = pd.read_table(baseline_file)
assert_frame_equal(test,baseline, check_like=True)
def test_pwr_legacy(self): def test_pwr_legacy(self):
test_filename = "persistence_legacy_" + self.wikiq_out_name tester = WikiqTester(SAILORMOON, "persistence_legacy", in_compression="7z")
test_file = os.path.join(self.test_output_dir, test_filename)
if os.path.exists(test_file):
os.remove(test_file)
call = self.base_call.format(self.input_file, self.test_output_dir)
call = call + " --persistence legacy"
proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
proc.wait()
try:
tester.call_wikiq("--persistence legacy")
except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8"))
copyfile(self.call_output, test_file) copyfile(tester.call_output, tester.test_file)
baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) test = pd.read_table(tester.test_file)
baseline = pd.read_table(tester.baseline_file)
test = pd.read_table(test_file) assert_frame_equal(test, baseline, check_like=True)
baseline = pd.read_table(baseline_file)
assert_frame_equal(test,baseline, check_like=True)
def test_pwr(self): def test_pwr(self):
test_filename = "persistence_" + self.wikiq_out_name tester = WikiqTester(SAILORMOON, "persistence", in_compression="7z")
test_file = os.path.join(self.test_output_dir, test_filename)
if os.path.exists(test_file):
os.remove(test_file)
call = self.base_call.format(self.input_file, self.test_output_dir)
call = call + " --persistence"
proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
proc.wait()
try:
tester.call_wikiq("--persistence")
except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8"))
copyfile(self.call_output, test_file) copyfile(tester.call_output, tester.test_file)
baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) test = pd.read_table(tester.test_file)
baseline = pd.read_table(tester.baseline_file)
test = pd.read_table(test_file)
baseline = pd.read_table(baseline_file)
test = test.reindex(columns=sorted(test.columns)) test = test.reindex(columns=sorted(test.columns))
assert_frame_equal(test,baseline, check_like=True) assert_frame_equal(test, baseline, check_like=True)
def test_url_encode(self): def test_url_encode(self):
test_filename = "url-encode_" + self.wikiq_out_name tester = WikiqTester(SAILORMOON, "url-encode", in_compression="7z")
test_file = os.path.join(self.test_output_dir, test_filename) try:
if os.path.exists(test_file): tester.call_wikiq("--url-encode")
os.remove(test_file) except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8"))
call = self.base_call.format(self.input_file, self.test_output_dir)
call = call + " --url-encode"
proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
proc.wait()
copyfile(self.call_output, test_file) copyfile(tester.call_output, tester.test_file)
baseline_file = os.path.join(".", self.baseline_output_dir, test_filename) test = pd.read_table(tester.test_file)
test = pd.read_table(test_file) baseline = pd.read_table(tester.baseline_file)
baseline = pd.read_table(baseline_file)
test = test.reindex(columns=sorted(test.columns)) test = test.reindex(columns=sorted(test.columns))
assert_frame_equal(test,baseline, check_like=True) assert_frame_equal(test, baseline, check_like=True)
class Test_Malformed(unittest.TestCase):
def setUp(self):
if not os.path.exists("test_output"):
os.mkdir("test_output")
self.wiki = 'twinpeaks'
self.wikiq_out_name = self.wiki + ".tsv"
self.test_output_dir = os.path.join(".", "test_output")
self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
self.infile = "{0}.xml.7z".format(self.wiki)
self.base_call = "../wikiq {0} -o {1}"
self.input_dir = "dumps"
self.input_file = os.path.join(".", self.input_dir,self.infile)
def test_malformed_noargs(self): def test_malformed_noargs(self):
tester = WikiqTester(wiki=TWINPEAKS, in_compression="7z")
want_exception = 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0'
call = self.base_call.format(self.input_file, self.test_output_dir) try:
proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE, shell=True) tester.call_wikiq()
proc.wait() except subprocess.CalledProcessError as exc:
outs, errs = proc.communicate() errlines = exc.stderr.decode("utf8").splitlines()
errlines = str(errs).split("\\n") self.assertEqual(errlines[-1], want_exception)
self.assertEqual(errlines[-2],'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0') else:
self.fail("No exception raised, want: {}".format(want_exception))
class Test_Stdout(unittest.TestCase): def test_stdout_noargs(self):
tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z")
def setUp(self): try:
self.wiki = 'sailormoon' outs = tester.call_wikiq( "--stdout", out=False).decode("utf8")
self.wikiq_out_name = self.wiki + ".tsv" except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8"))
self.infile = "{0}.xml.7z".format(self.wiki) copyfile(tester.call_output, tester.test_file)
self.base_call = "../wikiq {0} --stdout"
self.input_dir = "dumps"
self.input_file = os.path.join(".", self.input_dir,self.infile)
self.baseline_output_dir = "baseline_output"
def test_noargs(self):
call = self.base_call.format(self.input_file)
print(call)
proc = subprocess.run(call,stdout=subprocess.PIPE,shell=True)
outs = proc.stdout.decode("utf8")
test_file = "noargs_" + self.wikiq_out_name
baseline_file = os.path.join(".", self.baseline_output_dir, test_file)
print(baseline_file)
test = pd.read_table(StringIO(outs)) test = pd.read_table(StringIO(outs))
baseline = pd.read_table(baseline_file) baseline = pd.read_table(tester.baseline_file)
assert_frame_equal(test,baseline, check_like=True) assert_frame_equal(test, baseline, check_like=True)
class Test_Regex(unittest.TestCase): def test_bad_regex(self):
tester = WikiqTester(wiki=REGEXTEST)
def setUp(self): # sample arguments for checking that bad arguments get terminated / test_regex_arguments
self.wiki = 'regextest' bad_arguments_list = [
self.wikiq_out_name = self.wiki + '.tsv' # label is missing
self.infile = "{0}.xml.bz2".format(self.wiki) "-RP '\\b\\d+\\b'",
# number of reg and number of labels do not match
self.input_dir = "dumps"
self.input_file = os.path.join(".", self.input_dir,self.infile)
if not os.path.exists("test_output"):
os.mkdir("test_output")
self.test_output_dir = os.path.join(".", "test_output")
self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
# we have two base calls, one for checking inputs and the other for checking outputs
self.base_call = "../wikiq {0}"
self.base_call_outs = "../wikiq {0} -o {1}"
self.baseline_output_dir = "baseline_output"
# sample inputs for checking that bad inputs get terminated / test_regex_inputs
self.bad_inputs_list = [
#label is missing
"-RP '\\b\\d+\\b'",
#number of reg and number of labels do not match
"-RP 'NPO V' -RP THE -RPl testlabel", "-RP 'NPO V' -RP THE -RPl testlabel",
#cp but rp label # cp but rp label
"-CP '(Tamil|Li)' -RPl testlabel", "-CP '(Tamil|Li)' -RPl testlabel",
#regex is missing # regex is missing
"-CPl testlabel", "-CPl testlabel",
"-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'" "-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'"
] ]
# sample inputs for checking the outcomes of good inputs / test_basic_regex for arguments in bad_arguments_list:
self.good_inputs_list = [ try:
tester.call_wikiq("--stdout", arguments, out=False)
except subprocess.CalledProcessError as exc:
# we want to check that the bad arguments were caught and sys.exit is stopping the code
print(exc.stderr.decode("utf-8"))
else:
self.fail("No exception raised, want Exception")
def test_good_regex(self):
# sample arguments for checking the outcomes of good arguments / test_basic_regex
good_arguments_list = [
"-RP '\\b\\d{3}\\b' -RPl threedigits", "-RP '\\b\\d{3}\\b' -RPl threedigits",
"-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word", "-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word",
"-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning", "-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning",
"-CP 'WP:EVADE' -CPl wp_evade" "-CP 'WP:EVADE' -CPl wp_evade"
] ]
for i, arguments in enumerate(good_arguments_list):
self.cap_inputs_list = [ tester = WikiqTester(wiki=REGEXTEST, case_name="basic", suffix=str(i))
try:
tester.call_wikiq( arguments)
except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8"))
copyfile(tester.call_output, tester.test_file)
test = pd.read_table(tester.test_file)
baseline = pd.read_table(tester.baseline_file)
assert_frame_equal(test, baseline, check_like=True)
print(i)
def test_capturegroup_regex(self):
cap_arguments_list = [
"-RP 'Li Chevalier' -RPl li_cheval -CP '(?P<letter>\\b[a-zA-Z]{3}\\b)|(?P<number>\\b\\d+\\b)|(?P<cat>\\bcat\\b)' -CPl three", "-RP 'Li Chevalier' -RPl li_cheval -CP '(?P<letter>\\b[a-zA-Z]{3}\\b)|(?P<number>\\b\\d+\\b)|(?P<cat>\\bcat\\b)' -CPl three",
"-CP '(?P<a>\\bTestCaseA\\b)|(?P<b>\\bTestCaseB\\b)|(?P<c>\\bTestCaseC\\b)|(?P<d>\\bTestCaseD\\b)' -CPl testcase -RP '(?P<npov>npov|NPOV)|(?P<neutral>neutral point of view)' -RPl npov" "-CP '(?P<a>\\bTestCaseA\\b)|(?P<b>\\bTestCaseB\\b)|(?P<c>\\bTestCaseC\\b)|(?P<d>\\bTestCaseD\\b)' -CPl testcase -RP '(?P<npov>npov|NPOV)|(?P<neutral>neutral point of view)' -RPl npov"
] ]
for i, arguments in enumerate(cap_arguments_list):
tester = WikiqTester(wiki=REGEXTEST, case_name="capturegroup", suffix=str(i))
try:
tester.call_wikiq(arguments)
except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8"))
def test_regex_inputs(self): copyfile(tester.call_output, tester.test_file)
for input in self.bad_inputs_list:
call = self.base_call.format(self.input_file)
call = call + " --stdout " + input
print(call)
proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True)
stdout,stderr = proc.communicate()
#print(proc.returncode)
# we want to check that the bad inputs were caught and sys.exit is stopping the code
print(stderr.decode("utf-8"))
self.assertNotEqual(proc.returncode,0)
def test_basic_regex(self): test = pd.read_table(tester.test_file)
for i, input in enumerate(self.good_inputs_list):
test_filename = "basic_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i)) baseline = pd.read_table(tester.baseline_file)
#print(test_filename)
test_file = os.path.join(self.test_output_dir, test_filename)
if os.path.exists(test_file):
os.remove(test_file)
call = self.base_call_outs.format(self.input_file, self.test_output_dir)
call = call + " " + input
print(call)
proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True)
proc.wait()
copyfile(self.call_output, test_file)
test = pd.read_table(test_file)
baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
baseline = pd.read_table(baseline_file)
assert_frame_equal(test, baseline, check_like=True)
print(i)
def test_capturegroup_regex(self):
for i, input in enumerate(self.cap_inputs_list):
test_filename = "capturegroup_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i))
print(test_filename)
test_file = os.path.join(self.test_output_dir, test_filename)
if os.path.exists(test_file):
os.remove(test_file)
call = self.base_call_outs.format(self.input_file, self.test_output_dir)
call = call + " " + input
print(call)
proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True)
proc.wait()
copyfile(self.call_output, test_file)
test = pd.read_table(test_file)
baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
baseline = pd.read_table(baseline_file)
assert_frame_equal(test, baseline, check_like=True) assert_frame_equal(test, baseline, check_like=True)

test/__init__.py

wikiq

@ -6,9 +6,9 @@
import argparse import argparse
import sys import sys
import os, os.path import os.path
import re import re
from datetime import datetime,timezone from datetime import datetime, timezone
from subprocess import Popen, PIPE from subprocess import Popen, PIPE
from collections import deque from collections import deque
@ -20,8 +20,9 @@ from deltas.tokenizers import wikitext_split
import mwpersistence import mwpersistence
import mwreverts import mwreverts
from urllib.parse import quote from urllib.parse import quote
TO_ENCODE = ('title', 'editor') TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS=7 PERSISTENCE_RADIUS = 7
from deltas import SequenceMatcher from deltas import SequenceMatcher
from deltas import SegmentMatcher from deltas import SegmentMatcher
@ -30,42 +31,46 @@ from dataclasses import dataclass
import pyarrow as pa import pyarrow as pa
import pyarrow.parquet as pq import pyarrow.parquet as pq
class PersistMethod: class PersistMethod:
none = 0 none = 0
sequence = 1 sequence = 1
segment = 2 segment = 2
legacy = 3 legacy = 3
def calculate_persistence(tokens_added):
return(sum([(len(x.revisions)-1) for x in tokens_added]),
len(tokens_added))
class WikiqIterator(): def calculate_persistence(tokens_added):
return (sum([(len(x.revisions) - 1) for x in tokens_added]),
len(tokens_added))
class WikiqIterator:
def __init__(self, fh, collapse_user=False): def __init__(self, fh, collapse_user=False):
self.fh = fh self.fh = fh
self.collapse_user = collapse_user self.collapse_user = collapse_user
self.mwiterator = Dump.from_file(self.fh) self.mwiterator = Dump.from_file(self.fh)
self.namespace_map = { ns.id : ns.name for ns in self.namespace_map = {ns.id: ns.name for ns in
self.mwiterator.site_info.namespaces } self.mwiterator.site_info.namespaces}
self.__pages = self.load_pages() self.__pages = self.load_pages()
def load_pages(self): def load_pages(self):
for page in self.mwiterator: for page in self.mwiterator:
yield WikiqPage(page, yield WikiqPage(page,
namespace_map = self.namespace_map, namespace_map=self.namespace_map,
collapse_user=self.collapse_user) collapse_user=self.collapse_user)
def __iter__(self): def __iter__(self):
return self.__pages return self.__pages
def __next__(self): def __next__(self):
return next(self._pages) return next(self.__pages)
class WikiqPage():
class WikiqPage:
__slots__ = ('id', 'title', 'namespace', 'redirect', __slots__ = ('id', 'title', 'namespace', 'redirect',
'restrictions', 'mwpage', '__revisions', 'restrictions', 'mwpage', '__revisions',
'collapse_user') 'collapse_user')
def __init__(self, page, namespace_map, collapse_user=False): def __init__(self, page, namespace_map, collapse_user=False):
self.id = page.id self.id = page.id
self.namespace = page.namespace self.namespace = page.namespace
@ -92,10 +97,11 @@ class WikiqPage():
# 3 A B True # 3 A B True
# 4 A A False # 4 A A False
# Post-loop A Always # Post-loop A Always
collapsed_revs = 0
for i, rev in enumerate(self.mwpage): for i, rev in enumerate(self.mwpage):
# never yield the first time # never yield the first time
if i == 0: if i == 0:
if self.collapse_user: if self.collapse_user:
collapsed_revs = 1 collapsed_revs = 1
rev.collapsed_revs = collapsed_revs rev.collapsed_revs = collapsed_revs
@ -138,6 +144,8 @@ A RegexPair is defined by a regular expression (pattern) and a label.
The pattern can include capture groups. If it does then each capture group will have a resulting column in the output. The pattern can include capture groups. If it does then each capture group will have a resulting column in the output.
If the pattern does not include a capture group, then only one output column will result. If the pattern does not include a capture group, then only one output column will result.
""" """
class RegexPair(object): class RegexPair(object):
def __init__(self, pattern, label): def __init__(self, pattern, label):
self.pattern = re.compile(pattern) self.pattern = re.compile(pattern)
@ -145,10 +153,10 @@ class RegexPair(object):
self.has_groups = bool(self.pattern.groupindex) self.has_groups = bool(self.pattern.groupindex)
if self.has_groups: if self.has_groups:
self.capture_groups = list(self.pattern.groupindex.keys()) self.capture_groups = list(self.pattern.groupindex.keys())
def get_pyarrow_fields(self): def get_pyarrow_fields(self):
if self.has_groups: if self.has_groups:
fields = [pa.field(self._make_key(cap_group),pa.list_(pa.string())) fields = [pa.field(self._make_key(cap_group), pa.list_(pa.string()))
for cap_group in self.capture_groups] for cap_group in self.capture_groups]
else: else:
fields = [pa.field(self.label, pa.list_(pa.string()))] fields = [pa.field(self.label, pa.list_(pa.string()))]
@ -156,10 +164,10 @@ class RegexPair(object):
return fields return fields
def _make_key(self, cap_group): def _make_key(self, cap_group):
return ("{}_{}".format(self.label, cap_group)) return "{}_{}".format(self.label, cap_group)
def matchmake(self, content, rev_data): def matchmake(self, content, rev_data):
temp_dict = {} temp_dict = {}
# if there are named capture groups in the regex # if there are named capture groups in the regex
if self.has_groups: if self.has_groups:
@ -174,11 +182,11 @@ class RegexPair(object):
temp_list = [] temp_list = []
for match in matchobjects: for match in matchobjects:
# we only want to add the match for the capture group if the match is not None # we only want to add the match for the capture group if the match is not None
if match.group(cap_group) != None: if match.group(cap_group) is not None:
temp_list.append(match.group(cap_group)) temp_list.append(match.group(cap_group))
# if temp_list of matches is empty just make that column None # if temp_list of matches is empty just make that column None
if len(temp_list)==0: if len(temp_list) == 0:
temp_dict[key] = None temp_dict[key] = None
# else we put in the list we made in the for-loop above # else we put in the list we made in the for-loop above
else: else:
@ -192,8 +200,8 @@ class RegexPair(object):
# there are no capture groups, we just search for all the matches of the regex # there are no capture groups, we just search for all the matches of the regex
else: else:
#given that there are matches to be made # given that there are matches to be made
if type(content) in(str, bytes): if type(content) in (str, bytes):
if self.pattern.search(content) is not None: if self.pattern.search(content) is not None:
m = self.pattern.findall(content) m = self.pattern.findall(content)
temp_dict[self.label] = ', '.join(m) temp_dict[self.label] = ', '.join(m)
@ -206,6 +214,7 @@ class RegexPair(object):
return rev_data return rev_data
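
To illustrate the capture-group behavior described above with plain `re` (a standalone sketch; "mylabel" is a hypothetical label, not part of wikiq):

    import re

    pattern = re.compile(r'(?P<letter>\b[a-zA-Z]{3}\b)|(?P<number>\b\d+\b)')
    content = "abc 42 def"
    for group in pattern.groupindex:      # each named capture group becomes its own output column
        values = [m.group(group) for m in pattern.finditer(content)
                  if m.group(group) is not None]
        print("mylabel_{}".format(group), values or None)
    # mylabel_letter ['abc', 'def']
    # mylabel_number ['42']
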
""" """
We used to use a dictionary to collect fields for the output. We used to use a dictionary to collect fields for the output.
@ -222,9 +231,11 @@ It also needs to have the correct pyarrow schema so we can write parquet files.
The RevDataBase type has all the fields that will be output no matter how wikiq is invoked. The RevDataBase type has all the fields that will be output no matter how wikiq is invoked.
""" """
@dataclass() @dataclass()
class RevDataBase(): class RevDataBase:
revid: int revid: int
date_time: datetime date_time: datetime
articleid: int articleid: int
editorid: int editorid: int
@ -269,7 +280,7 @@ class RevDataBase():
# logic to convert each field into the wikiq tsv format goes here. # logic to convert each field into the wikiq tsv format goes here.
def to_tsv_row(self): def to_tsv_row(self):
row = [] row = []
for f in dc.fields(self): for f in dc.fields(self):
val = getattr(self, f.name) val = getattr(self, f.name)
@ -281,7 +292,7 @@ class RevDataBase():
elif f.type == datetime: elif f.type == datetime:
row.append(val.strftime('%Y-%m-%d %H:%M:%S')) row.append(val.strftime('%Y-%m-%d %H:%M:%S'))
elif f.name in {'editor','title'}: elif f.name in {'editor', 'title'}:
s = '"' + val + '"' s = '"' + val + '"'
if self.urlencode and f.name in TO_ENCODE: if self.urlencode and f.name in TO_ENCODE:
row.append(quote(str(s))) row.append(quote(str(s)))
@ -299,11 +310,12 @@ class RevDataBase():
else: else:
row.append(val) row.append(val)
return '\t'.join(map(str,row)) return '\t'.join(map(str, row))
def header_row(self): def header_row(self):
return '\t'.join(map(lambda f: f.name, dc.fields(self))) return '\t'.join(map(lambda f: f.name, dc.fields(self)))
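
A minimal sketch of the dataclass-plus-fields pattern described above (a toy stand-in, much smaller than the real RevDataBase):

    import dataclasses as dc
    from dataclasses import dataclass
    from datetime import datetime

    @dataclass
    class MiniRevData:
        revid: int
        date_time: datetime
        title: str

        def to_tsv_row(self):
            # walk the declared fields in order, as RevDataBase.to_tsv_row does
            return '\t'.join(str(getattr(self, f.name)) for f in dc.fields(self))

    print(MiniRevData(1, datetime(2018, 3, 1), "Example").to_tsv_row())
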
""" """
If collapse=True we'll use a RevDataCollapse dataclass. If collapse=True we'll use a RevDataCollapse dataclass.
@ -312,43 +324,54 @@ This class inherits from RevDataBase. This means that it has all the same fields
It just adds a new field and updates the pyarrow schema. It just adds a new field and updates the pyarrow schema.
""" """
@dataclass() @dataclass()
class RevDataCollapse(RevDataBase): class RevDataCollapse(RevDataBase):
collapsed_revs:int = None collapsed_revs: int = None
pa_collapsed_revs_schema = pa.field('collapsed_revs',pa.int64()) pa_collapsed_revs_schema = pa.field('collapsed_revs', pa.int64())
pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema] pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema]
""" """
If persistence data is to be computed we'll need the fields added by RevDataPersistence. If persistence data is to be computed we'll need the fields added by RevDataPersistence.
""" """
@dataclass() @dataclass()
class RevDataPersistence(RevDataBase): class RevDataPersistence(RevDataBase):
token_revs:int = None token_revs: int = None
tokens_added:int = None tokens_added: int = None
tokens_removed:int = None tokens_removed: int = None
tokens_window:int = None tokens_window: int = None
pa_persistence_schema_fields = [ pa_persistence_schema_fields = [
pa.field("token_revs", pa.int64()), pa.field("token_revs", pa.int64()),
pa.field("tokens_added", pa.int64()), pa.field("tokens_added", pa.int64()),
pa.field("tokens_removed", pa.int64()), pa.field("tokens_removed", pa.int64()),
pa.field("tokens_window", pa.int64())] pa.field("tokens_window", pa.int64())]
pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields
""" """
class RevDataCollapsePersistence uses multiple inheritence to make a class that has both persistence and collapse fields. class RevDataCollapsePersistence uses multiple inheritance to make a class that has both persistence and collapse fields.
""" """
@dataclass() @dataclass()
class RevDataCollapsePersistence(RevDataCollapse, RevDataPersistence): class RevDataCollapsePersistence(RevDataCollapse, RevDataPersistence):
pa_schema_fields = RevDataCollapse.pa_schema_fields + RevDataPersistence.pa_persistence_schema_fields pa_schema_fields = RevDataCollapse.pa_schema_fields + RevDataPersistence.pa_persistence_schema_fields
class WikiqParser():
def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15, output_parquet=True, parquet_buffer_size=2000): class WikiqParser:
def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label,
regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces=None,
revert_radius=15, output_parquet=True, parquet_buffer_size=2000):
""" """
Parameters: Parameters:
persist : what persistence method to use. Takes a PersistMethod value persist : what persistence method to use. Takes a PersistMethod value
@ -360,7 +383,7 @@ class WikiqParser():
self.namespaces = [] self.namespaces = []
self.urlencode = urlencode self.urlencode = urlencode
self.revert_radius = revert_radius self.revert_radius = revert_radius
if namespaces is not None: if namespaces is not None:
self.namespace_filter = set(namespaces) self.namespace_filter = set(namespaces)
else: else:
@ -370,9 +393,8 @@ class WikiqParser():
self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label) self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label) self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)
# This is where we set the type for revdata. # This is where we set the type for revdata.
if self.collapse_user is True: if self.collapse_user is True:
if self.persist == PersistMethod.none: if self.persist == PersistMethod.none:
revdata_type = RevDataCollapse revdata_type = RevDataCollapse
@ -391,10 +413,10 @@ class WikiqParser():
self.revdata_type = dc.make_dataclass('RevData_Parser', self.revdata_type = dc.make_dataclass('RevData_Parser',
fields=regex_fields, fields=regex_fields,
bases=(revdata_type,)) bases=(revdata_type,))
# we also need to make sure that we have the right pyarrow schema # we also need to make sure that we have the right pyarrow schema
self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas self.revdata_type.pa_schema_fields = revdata_type.pa_schema_fields + self.regex_schemas
self.revdata_type.urlencode = self.urlencode self.revdata_type.urlencode = self.urlencode
self.schema = pa.schema(self.revdata_type.pa_schema_fields) self.schema = pa.schema(self.revdata_type.pa_schema_fields)
@ -409,22 +431,22 @@ class WikiqParser():
else: else:
self.print_header = True self.print_header = True
if output_file == sys.stdout: if output_file == sys.stdout:
self.output_file = output_file self.output_file = output_file
else: else:
self.output_file = open(output_file,'w') self.output_file = open(output_file, 'w')
self.output_parquet = False self.output_parquet = False
def make_matchmake_pairs(self, patterns, labels): def make_matchmake_pairs(self, patterns, labels):
if (patterns is not None and labels is not None) and \ if (patterns is not None and labels is not None) and \
(len(patterns) == len(labels)): (len(patterns) == len(labels)):
result = [] result = []
for pattern, label in zip(patterns, labels): for pattern, label in zip(patterns, labels):
rp = RegexPair(pattern, label) rp = RegexPair(pattern, label)
result.append(rp) result.append(rp)
self.regex_schemas = self.regex_schemas + rp.get_pyarrow_fields() self.regex_schemas = self.regex_schemas + rp.get_pyarrow_fields()
return result return result
elif (patterns is None and labels is None): elif (patterns is None) and (labels is None):
return [] return []
else: else:
sys.exit('Each regular expression *must* come with a corresponding label and vice versa.') sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')
@ -435,7 +457,7 @@ class WikiqParser():
return rev_data return rev_data
def matchmake_text(self, text, rev_data): def matchmake_text(self, text, rev_data):
return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs) return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs)
def matchmake_comment(self, comment, rev_data): def matchmake_comment(self, comment, rev_data):
return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs) return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs)
@ -450,7 +472,7 @@ class WikiqParser():
for ns in self.namespaces: for ns in self.namespaces:
# skip if the namespace is not defined # skip if the namespace is not defined
if ns == None: if ns is None:
default_ns = self.namespaces[ns] default_ns = self.namespaces[ns]
continue continue
@ -460,7 +482,6 @@ class WikiqParser():
# if we've made it this far with no matches, we return the default namespace # if we've made it this far with no matches, we return the default namespace
return default_ns return default_ns
def process(self): def process(self):
# create a regex that creates the output filename # create a regex that creates the output filename
@ -471,13 +492,12 @@ class WikiqParser():
# Construct dump file iterator # Construct dump file iterator
dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user) dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
# extract list of namspaces # extract list of namespaces
self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces} self.namespaces = {ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces}
page_count = 0 page_count = 0
rev_count = 0 rev_count = 0
# Iterate through pages # Iterate through pages
for page in dump: for page in dump:
namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title) namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)
@ -487,17 +507,17 @@ class WikiqParser():
if namespace not in self.namespace_filter: if namespace not in self.namespace_filter:
continue continue
rev_detector = mwreverts.Detector(radius = self.revert_radius) rev_detector = mwreverts.Detector(radius=self.revert_radius)
if self.persist != PersistMethod.none: if self.persist != PersistMethod.none:
window = deque(maxlen=PERSISTENCE_RADIUS) window = deque(maxlen=PERSISTENCE_RADIUS)
if self.persist == PersistMethod.sequence: if self.persist == PersistMethod.sequence:
state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split), state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split),
revert_radius=PERSISTENCE_RADIUS) revert_radius=PERSISTENCE_RADIUS)
elif self.persist == PersistMethod.segment: elif self.persist == PersistMethod.segment:
state = mwpersistence.DiffState(SegmentMatcher(tokenizer = wikitext_split), state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
revert_radius=PERSISTENCE_RADIUS) revert_radius=PERSISTENCE_RADIUS)
# self.persist == PersistMethod.legacy # self.persist == PersistMethod.legacy
@ -507,15 +527,15 @@ class WikiqParser():
# Iterate through a page's revisions # Iterate through a page's revisions
for rev in page: for rev in page:
# create a new data object instead of a dictionary. # create a new data object instead of a dictionary.
rev_data = self.revdata_type(revid = rev.id, rev_data = self.revdata_type(revid=rev.id,
date_time = datetime.fromtimestamp(rev.timestamp.unix(), tz=timezone.utc), date_time=datetime.fromtimestamp(rev.timestamp.unix(), tz=timezone.utc),
articleid = page.id, articleid=page.id,
editorid = "" if rev.deleted.user == True or rev.user.id is None else rev.user.id, editorid="" if rev.deleted.user == True or rev.user.id is None else rev.user.id,
title = page.title, title=page.title,
deleted = rev.deleted.text, deleted=rev.deleted.text,
namespace = namespace namespace=namespace
) )
rev_data = self.matchmake_revision(rev, rev_data) rev_data = self.matchmake_revision(rev, rev_data)
@ -530,7 +550,7 @@ class WikiqParser():
text_sha1 = rev.sha1 text_sha1 = rev.sha1
else: else:
text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest() text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()
rev_data.sha1 = text_sha1 rev_data.sha1 = text_sha1
# TODO rev.bytes doesn't work.. looks like a bug # TODO rev.bytes doesn't work.. looks like a bug
@ -538,7 +558,7 @@ class WikiqParser():
# generate revert data # generate revert data
revert = rev_detector.process(text_sha1, rev.id) revert = rev_detector.process(text_sha1, rev.id)
if revert: if revert:
rev_data.revert = True rev_data.revert = True
rev_data.reverteds = revert.reverteds rev_data.reverteds = revert.reverteds
@ -550,17 +570,17 @@ class WikiqParser():
if not rev.deleted.user: if not rev.deleted.user:
# wrap user-defined editors in quotes for fread # wrap user-defined editors in quotes for fread
rev_data.editor = rev.user.text rev_data.editor = rev.user.text
rev_data.anon = rev.user.id is None rev_data.anon = rev.user.id is None
#if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I): # if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
# redirect = True # redirect = True
#else: # else:
# redirect = False # redirect = False
#TODO missing: additions_size deletions_size # TODO missing: additions_size deletions_size
# if collapse user was on, lets run that # if collapse user was on, let's run that
if self.collapse_user: if self.collapse_user:
rev_data.collapsed_revs = rev.collapsed_revs rev_data.collapsed_revs = rev.collapsed_revs
@ -573,18 +593,18 @@ class WikiqParser():
else: else:
_, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1) _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)
window.append((rev.id, rev_data, tokens_added, tokens_removed)) window.append((rev.id, rev_data, tokens_added, tokens_removed))
if len(window) == PERSISTENCE_RADIUS: if len(window) == PERSISTENCE_RADIUS:
old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0] old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]
num_token_revs, num_tokens = calculate_persistence(old_tokens_added) num_token_revs, num_tokens = calculate_persistence(old_tokens_added)
old_rev_data.token_revs = num_token_revs old_rev_data.token_revs = num_token_revs
old_rev_data.tokens_added = num_tokens old_rev_data.tokens_added = num_tokens
old_rev_data.tokens_removed = len(old_tokens_removed) old_rev_data.tokens_removed = len(old_tokens_removed)
old_rev_data.tokens_window = PERSISTENCE_RADIUS-1 old_rev_data.tokens_window = PERSISTENCE_RADIUS - 1
self.print_rev_data(old_rev_data) self.print_rev_data(old_rev_data)
@ -606,7 +626,7 @@ class WikiqParser():
rev_data.token_revs = num_token_revs rev_data.token_revs = num_token_revs
rev_data.tokens_added = num_tokens rev_data.tokens_added = num_tokens
rev_data.tokens_removed = len(tokens_removed) rev_data.tokens_removed = len(tokens_removed)
rev_data.tokens_window = len(window)-(i+1) rev_data.tokens_window = len(window) - (i + 1)
self.print_rev_data(rev_data) self.print_rev_data(rev_data)
page_count += 1 page_count += 1
@ -622,11 +642,11 @@ class WikiqParser():
else: else:
self.output_file.close() self.output_file.close()
""" """
For performance reasons it's better to write parquet in batches instead of one row at a time. For performance reasons it's better to write parquet in batches instead of one row at a time.
So this function just puts the data on a buffer. If the buffer is full, then it gets flushed (written). So this function just puts the data on a buffer. If the buffer is full, then it gets flushed (written).
""" """
def write_parquet_row(self, rev_data): def write_parquet_row(self, rev_data):
padata = rev_data.to_pyarrow() padata = rev_data.to_pyarrow()
self.parquet_buffer.append(padata) self.parquet_buffer.append(padata)
@ -634,16 +654,17 @@ class WikiqParser():
if len(self.parquet_buffer) >= self.parquet_buffer_size: if len(self.parquet_buffer) >= self.parquet_buffer_size:
self.flush_parquet_buffer() self.flush_parquet_buffer()
""" """
Function that actually writes data to the parquet file. Function that actually writes data to the parquet file.
It needs to transpose the data from row-by-row to column-by-column It needs to transpose the data from row-by-row to column-by-column
""" """
def flush_parquet_buffer(self): def flush_parquet_buffer(self):
""" """
Returns the pyarrow table that we'll write Returns the pyarrow table that we'll write
""" """
def rows_to_table(rg, schema): def rows_to_table(rg, schema):
cols = [] cols = []
first = rg[0] first = rg[0]
@ -661,18 +682,18 @@ class WikiqParser():
outtable = rows_to_table(self.parquet_buffer, self.schema) outtable = rows_to_table(self.parquet_buffer, self.schema)
if self.pq_writer is None: if self.pq_writer is None:
self.pq_writer = pq.ParquetWriter(self.output_file, schema, flavor='spark') self.pq_writer = pq.ParquetWriter(self.output_file, self.schema, flavor='spark')
self.pq_writer.write_table(outtable) self.pq_writer.write_table(outtable)
self.parquet_buffer = [] self.parquet_buffer = []
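
A self-contained sketch of the buffered, columnar write pattern described above (toy schema and row dicts; the real code builds rows from the RevData dataclasses):

    import pyarrow as pa
    import pyarrow.parquet as pq

    schema = pa.schema([pa.field("revid", pa.int64()), pa.field("title", pa.string())])
    writer = pq.ParquetWriter("out.parquet", schema, flavor="spark")
    buffer = []

    def flush():
        # transpose buffered row dicts into a columnar Table, then write one batch
        writer.write_table(pa.Table.from_pylist(buffer, schema=schema))
        buffer.clear()

    for revid in range(5000):
        buffer.append({"revid": revid, "title": "Example"})
        if len(buffer) >= 2000:           # mirrors parquet_buffer_size
            flush()

    if buffer:                            # write any final partial batch
        flush()
    writer.close()
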
# depending on if we are configured to write tsv or parquet, we'll call a different function. # depending on if we are configured to write tsv or parquet, we'll call a different function.
def print_rev_data(self, rev_data): def print_rev_data(self, rev_data):
if self.output_parquet is False: if self.output_parquet is False:
printfunc = self.write_tsv_row printfunc = self.write_tsv_row
else: else:
printfunc = self.write_parquet_row printfunc = self.write_parquet_row
printfunc(rev_data) printfunc(rev_data)
def write_tsv_row(self, rev_data): def write_tsv_row(self, rev_data):
@ -686,20 +707,21 @@ class WikiqParser():
def open_input_file(input_filename): def open_input_file(input_filename):
if re.match(r'.*\.7z$', input_filename): if re.match(r'.*\.7z$', input_filename):
cmd = ["7za", "x", "-so", input_filename, "*.xml"] cmd = ["7za", "x", "-so", input_filename, "*.xml"]
elif re.match(r'.*\.gz$', input_filename): elif re.match(r'.*\.gz$', input_filename):
cmd = ["zcat", input_filename] cmd = ["zcat", input_filename]
elif re.match(r'.*\.bz2$', input_filename): elif re.match(r'.*\.bz2$', input_filename):
cmd = ["bzcat", "-dk", input_filename] cmd = ["bzcat", "-dk", input_filename]
else:
raise ValueError("Unrecognized file type: %s" % input_filename)
try: try:
input_file = Popen(cmd, stdout=PIPE).stdout return Popen(cmd, stdout=PIPE).stdout
except NameError: except NameError:
input_file = open(input_filename, 'r') return open(input_filename, 'r')
return input_file
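
A hedged usage sketch for the helper above, together with WikiqIterator defined earlier in this file (assumes a bz2 dump exists at the given path):

    input_file = open_input_file("dumps/sailormoon.xml.bz2")   # streams the decompressed XML
    dump = WikiqIterator(input_file, collapse_user=False)
    for page in dump:
        print(page.id, page.namespace, page.title)
        break
    input_file.close()
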
def get_output_filename(input_filename, parquet = False): def get_output_filename(input_filename, parquet=False):
output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename) output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
output_filename = re.sub(r'\.xml', '', output_filename) output_filename = re.sub(r'\.xml', '', output_filename)
if parquet is False: if parquet is False:
@ -708,131 +730,140 @@ def get_output_filename(input_filename, parquet = False):
output_filename = output_filename + ".parquet" output_filename = output_filename + ".parquet"
return output_filename return output_filename
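
For example, with the rewriting above (a sketch; the non-parquet branch, elided from this hunk, appends a different extension):

    get_output_filename("dumps/sailormoon.xml.7z", parquet=True)   # -> "dumps/sailormoon.parquet"
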
def open_output_file(input_filename): def open_output_file(input_filename):
# create a regex that creates the output filename # create a regex that creates the output filename
output_filename = get_output_filename(input_filename, parquet = False) output_filename = get_output_filename(input_filename, parquet=False)
output_file = open(output_filename, "w") output_file = open(output_filename, "w")
return output_file return output_file
parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimitted data.')
# arguments for the input direction def main():
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str, parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')
help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")
parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1, # arguments for the input direction
help="Directory for output files. If it ends with .parquet output will be in parquet format.") parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")
parser.add_argument('-s', '--stdout', dest="stdout", action="store_true", parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
help="Write output to standard out (do not create dump file)") help="Directory for output files. If it ends with .parquet output will be in parquet format.")
parser.add_argument('--collapse-user', dest="collapse_user", action="store_true", parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.") help="Write output to standard out (do not create dump file)")
parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?', parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow. The defualt is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.") help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")
parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true", parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str,
help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.") choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure. This may be slow. The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for an advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")
parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append', parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
help="Id number of namspace to include. Can be specified more than once.") help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")
parser.add_argument('-rr', parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
'--revert-radius', help="Id number of namespace to include. Can be specified more than once.")
dest="revert_radius",
type=int,
action='store',
default=15,
help="Number of edits to check when looking for reverts (default: 15)")
parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append', parser.add_argument('-rr',
help="The regular expression to search for in revision text. The regex must be surrounded by quotes.") '--revert-radius',
dest="revert_radius",
type=int,
action='store',
default=15,
help="Number of edits to check when looking for reverts (default: 15)")
parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append', parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str,
help="The label for the outputted column based on matching the regex in revision text.") action='append',
help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")
parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append', parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str,
help="The regular expression to search for in comments of revisions.") action='append',
help="The label for the outputted column based on matching the regex in revision text.")
parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append', parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
help="The label for the outputted column based on matching the regex in comments.") help="The regular expression to search for in comments of revisions.")
args = parser.parse_args() parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str,
action='append',
help="The label for the outputted column based on matching the regex in comments.")
args = parser.parse_args()
# set persistence method
# set persistence method if args.persist is None:
persist = PersistMethod.none
elif args.persist == "segment":
persist = PersistMethod.segment
elif args.persist == "legacy":
persist = PersistMethod.legacy
else:
persist = PersistMethod.sequence
if args.persist is None: if args.namespace_filter is not None:
persist = PersistMethod.none namespaces = args.namespace_filter
elif args.persist == "segment": else:
persist = PersistMethod.segment namespaces = None
elif args.persist == "legacy":
persist = PersistMethod.legacy
else:
persist = PersistMethod.sequence
if args.namespace_filter is not None: if len(args.dumpfiles) > 0:
namespaces = args.namespace_filter output_parquet = False
else: for filename in args.dumpfiles:
namespaces = None input_file = open_input_file(filename)
if len(args.dumpfiles) > 0: # open directory for output
output_parquet = False if args.output_dir:
for filename in args.dumpfiles: output_dir = args.output_dir[0]
input_file = open_input_file(filename) else:
output_dir = "."
# open directory for output if output_dir.endswith(".parquet"):
if args.output_dir: output_parquet = True
output_dir = args.output_dir[0]
else:
output_dir = "."
if output_dir.endswith(".parquet"): print("Processing file: %s" % filename, file=sys.stderr)
output_parquet = True
print("Processing file: %s" % filename, file=sys.stderr) if args.stdout:
output_file = sys.stdout
else:
filename = os.path.join(output_dir, os.path.basename(filename))
output_file = get_output_filename(filename, parquet=output_parquet)
if args.stdout: wikiq = WikiqParser(input_file,
output_file = sys.stdout output_file,
else: collapse_user=args.collapse_user,
filename = os.path.join(output_dir, os.path.basename(filename)) persist=persist,
output_file = get_output_filename(filename, parquet = output_parquet) urlencode=args.urlencode,
namespaces=namespaces,
revert_radius=args.revert_radius,
regex_match_revision=args.regex_match_revision,
regex_revision_label=args.regex_revision_label,
regex_match_comment=args.regex_match_comment,
regex_comment_label=args.regex_comment_label,
output_parquet=output_parquet)
wikiq = WikiqParser(input_file, wikiq.process()
output_file,
# close things
input_file.close()
else:
wikiq = WikiqParser(sys.stdin,
sys.stdout,
collapse_user=args.collapse_user, collapse_user=args.collapse_user,
persist=persist, persist=persist,
# persist_legacy=args.persist_legacy,
urlencode=args.urlencode, urlencode=args.urlencode,
namespaces=namespaces, namespaces=namespaces,
revert_radius=args.revert_radius, revert_radius=args.revert_radius,
regex_match_revision = args.regex_match_revision, regex_match_revision=args.regex_match_revision,
regex_revision_label = args.regex_revision_label, regex_revision_label=args.regex_revision_label,
regex_match_comment = args.regex_match_comment, regex_match_comment=args.regex_match_comment,
regex_comment_label = args.regex_comment_label, regex_comment_label=args.regex_comment_label)
output_parquet=output_parquet)
wikiq.process() wikiq.process()
# close things # stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
input_file.close() # stop_words = stop_words.split(",")
else:
wikiq = WikiqParser(sys.stdin,
sys.stdout,
collapse_user=args.collapse_user,
persist=persist,
#persist_legacy=args.persist_legacy,
urlencode=args.urlencode,
namespaces=namespaces,
revert_radius=args.revert_radius,
regex_match_revision = args.regex_match_revision,
regex_revision_label = args.regex_revision_label,
regex_match_comment = args.regex_match_comment,
regex_comment_label = args.regex_comment_label)
wikiq.process() if __name__ == "__main__":
main()
# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")
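
Putting the command-line options above together, a hedged example invocation (file and directory names are placeholders; the flags are the ones defined in this script):

    ./wikiq dumps/sailormoon.xml.7z -o test_output --collapse-user -p sequence -n 0 -n 1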