refactor into src-layout package.

This commit is contained in:
Nathan TeBlunthuis 2025-07-07 20:13:17 -07:00
parent 56c90fe1cc
commit c597a6b7f4
9 changed files with 59 additions and 18 deletions

42
README.md Normal file
View File

@ -0,0 +1,42 @@
When you install this from git, you will need to first clone the
repository:
git clone git://projects.mako.cc/mediawiki_dump_tools
From within the repository working directory, initialize and set up the
submodule like this:
git submodule init
git submodule update
Wikimedia dumps are usually in a compressed format such as 7z (most
common), gz, or bz2. Wikiq uses your computer's compression software to
read these files. Therefore wikiq depends on `7za`, `gzcat`, and
`zcat`.
# Dependencies
These non-Python dependencies must be installed on your system for wikiq
and its associated tests to work.
- 7zip
- ffmpeg
A new diff engine based on `wikidiff2` can be used for
word-persistence. Wikiq can also output the diffs between each page
revision. This requires installing Wikidiff 2 on your system. On Debian
or Ubuntu Linux this can be done via:
`apt-get install php-wikidiff2`
You may also have to run `sudo phpenmod wikidiff2`.
## Tests

To run tests:
python -m unittest test.Wikiq_Unit_Test
## TODO:
1. [ ] Output metadata about the run. What parameters were used? What
versions of deltas?
2. [ ] URL encoding by default

12
php.ini
View File

@ -1,12 +0,0 @@
; wikidiff2 extension settings
wikidiff2.change_threshold = 0.2
wikidiff2.moved_line_threshold = 0.4
wikidiff2.moved_paragraph_detection_cutoff = 5000000
wikidiff2.max_word_level_diff_complexity = 40000000
wikidiff2.max_split_size = 1
wikidiff2.initial_split_threshold = 0.1
wikidiff2.final_split_threshold = 0.6
; It is possible this limit will need to be larger for some pages.
post_max_size = 10000M
opcache.enable=0

View File

@ -17,7 +17,18 @@ dependencies = [
"yamlconf>=0.2.6",
]
[project.scripts]
wikiq = "wikiq:main"
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
packages = ["src/wikiq"]
[tool.uv.sources]
yamlconf = { git = "https://github.com/groceryheist/yamlconf" }
mwxml = { git = "https://github.com/groceryheist/python-mwxml" }
deltas = { git = "https://github.com/groceryheist/deltas" }

View File

@ -23,9 +23,9 @@ from deltas.tokenizers import wikitext_split
import mwpersistence
import mwreverts
import tables
from tables import RevisionTable
from wiki_diff_matcher import WikiDiffMatcher
import wikiq.tables as tables
from wikiq.tables import RevisionTable
from wikiq.wiki_diff_matcher import WikiDiffMatcher
TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS = 7
@ -352,7 +352,7 @@ class WikiqParser:
schema = schema.append(pa.field('revert', pa.bool_(), nullable=True))
if self.diff:
from diff_pyarrow_schema import diff_field
from wikiq.diff_pyarrow_schema import diff_field
schema = schema.append(diff_field)
# Add regex fields to the schema.

View File

@ -12,7 +12,7 @@ from pandas.testing import assert_frame_equal, assert_series_equal
# Make references to files and wikiq relative to this file, not to the current working directory.
TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
WIKIQ: Final[str] = os.path.join(os.path.dirname(TEST_DIR), "wikiq")
WIKIQ: Final[str] = os.path.join(os.path.join(TEST_DIR,".."), "src/wikiq/__init__.py")
TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output")
BASELINE_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output")

View File

@ -6,7 +6,7 @@ import pytest_asyncio
from typing import List
from deltas import Delete, Equal, Insert, wikitext_split
from mwpersistence import Token
from wiki_diff_matcher import WikiDiffMatcher
from wikiq.wiki_diff_matcher import WikiDiffMatcher
def _replace_whitespace(match):
if match.group(1): # If spaces matched (e.g., ' ')