diff --git a/README.md b/README.md new file mode 100644 index 0000000..7d59051 --- /dev/null +++ b/README.md @@ -0,0 +1,42 @@ +When you install this from git, you will need to first clone the +repository: + + git clone git://projects.mako.cc/mediawiki_dump_tools + +From within the repository working directory, initialize and set up the +submodule like: + + git submodule init + git submodule update + +Wikimedia dumps are usually in a compressed format such as 7z (most +common), gz, or bz2. Wikiq uses your computer's compression software to +read these files. Therefore wikiq depends on `7za`, +`gzcat`, and `zcat`. + +# Dependencies + +These non-Python dependencies must be installed on your system for wikiq +and its associated tests to work. + +- 7zip +- ffmpeg + +A new diff engine based on `wikidiff2` can be used for +word-persistence. Wikiq can also output the diffs between each page +revision. This requires installing Wikidiff 2 on your system. On Debian +or Ubuntu Linux this can be done via: + +`apt-get install php-wikidiff2` + +You may also have to run `sudo phpenmod wikidiff2`. + +**Tests.** To run tests: + + python -m unittest test.Wikiq_Unit_Test + +## TODO: + +1. [ ] Output metadata about the run. What parameters were used? What + versions of deltas? +2. [ ] URL encoding by default diff --git a/php.ini b/php.ini deleted file mode 100644 index f12e1d2..0000000 --- a/php.ini +++ /dev/null @@ -1,12 +0,0 @@ -; wikidiff2 extension settings -wikidiff2.change_threshold = 0.2 -wikidiff2.moved_line_threshold = 0.4 -wikidiff2.moved_paragraph_detection_cutoff = 5000000 -wikidiff2.max_word_level_diff_complexity = 40000000 -wikidiff2.max_split_size = 1 -wikidiff2.initial_split_threshold = 0.1 -wikidiff2.final_split_threshold = 0.6 - -; It is possible this limit will need to be larger for some pages. 
-post_max_size = 10000M -opcache.enable=0 diff --git a/pyproject.toml b/pyproject.toml index 19c04c8..b312298 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,18 @@ dependencies = [ "yamlconf>=0.2.6", ] +[project.scripts] +wikiq = "wikiq:main" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/wikiq"] + [tool.uv.sources] + yamlconf = { git = "https://github.com/groceryheist/yamlconf" } mwxml = { git = "https://github.com/groceryheist/python-mwxml" } deltas = { git = "https://github.com/groceryheist/deltas" } diff --git a/wikiq b/src/wikiq/__init__.py similarity index 99% rename from wikiq rename to src/wikiq/__init__.py index b1633da..fa4449d 100755 --- a/wikiq +++ b/src/wikiq/__init__.py @@ -23,9 +23,9 @@ from deltas.tokenizers import wikitext_split import mwpersistence import mwreverts -import tables -from tables import RevisionTable -from wiki_diff_matcher import WikiDiffMatcher +import wikiq.tables as tables +from wikiq.tables import RevisionTable +from wikiq.wiki_diff_matcher import WikiDiffMatcher TO_ENCODE = ('title', 'editor') PERSISTENCE_RADIUS = 7 @@ -352,7 +352,7 @@ class WikiqParser: schema = schema.append(pa.field('revert', pa.bool_(), nullable=True)) if self.diff: - from diff_pyarrow_schema import diff_field + from wikiq.diff_pyarrow_schema import diff_field schema = schema.append(diff_field) # Add regex fields to the schema. 
diff --git a/diff_pyarrow_schema.py b/src/wikiq/diff_pyarrow_schema.py similarity index 100% rename from diff_pyarrow_schema.py rename to src/wikiq/diff_pyarrow_schema.py diff --git a/tables.py b/src/wikiq/tables.py similarity index 100% rename from tables.py rename to src/wikiq/tables.py diff --git a/wiki_diff_matcher.py b/src/wikiq/wiki_diff_matcher.py similarity index 100% rename from wiki_diff_matcher.py rename to src/wikiq/wiki_diff_matcher.py diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py index ad57f3a..df33905 100644 --- a/test/Wikiq_Unit_Test.py +++ b/test/Wikiq_Unit_Test.py @@ -12,7 +12,7 @@ from pandas.testing import assert_frame_equal, assert_series_equal # Make references to files and wikiq relative to this file, not to the current working directory. TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__)) -WIKIQ: Final[str] = os.path.join(os.path.dirname(TEST_DIR), "wikiq") +WIKIQ: Final[str] = os.path.join(os.path.join(TEST_DIR,".."), "src/wikiq/__init__.py") TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output") BASELINE_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output") diff --git a/test/test_wiki_diff_matcher.py b/test/test_wiki_diff_matcher.py index fee41f3..9665edb 100644 --- a/test/test_wiki_diff_matcher.py +++ b/test/test_wiki_diff_matcher.py @@ -6,7 +6,7 @@ import pytest_asyncio from typing import List from deltas import Delete, Equal, Insert, wikitext_split from mwpersistence import Token -from wiki_diff_matcher import WikiDiffMatcher +from wikiq.wiki_diff_matcher import WikiDiffMatcher def _replace_whitespace(match): if match.group(1): # If spaces matched (e.g., ' ')