diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py
index f22f35e..ca42709 100644
--- a/test/Wikiq_Unit_Test.py
+++ b/test/Wikiq_Unit_Test.py
@@ -633,11 +633,14 @@ def test_resume_with_partition_namespaces():
 
 
 def test_external_links_only():
-    """Test that --external-links extracts external links without --citations."""
+    """Test that --external-links extracts external links correctly."""
+    import mwparserfromhell
+
     tester = WikiqTester(SAILORMOON, "external_links_only", in_compression="7z", out_format="parquet")
 
     try:
-        tester.call_wikiq("--external-links", "--fandom-2020")
+        # Also include --text so we can verify extraction against actual wikitext
+        tester.call_wikiq("--external-links", "--text", "--fandom-2020")
     except subprocess.CalledProcessError as exc:
         pytest.fail(exc.stderr.decode("utf8"))
 
@@ -653,15 +656,46 @@ def test_external_links_only():
     assert test["external_links"].apply(lambda x: x is None or hasattr(x, '__len__')).all(), \
         "external_links should be a list/array type or None"
 
-    print(f"External links only test passed! {len(test)} rows processed")
+    # Verify that extracted URLs look like valid URIs (have a scheme or are protocol-relative)
+    all_urls = []
+    for links in test["external_links"]:
+        if links is not None and len(links) > 0:
+            all_urls.extend(links)
+
+    for url in all_urls:
+        # External links can be http, https, mailto, ftp, etc. or protocol-relative (//)
+        has_scheme = ":" in url and url.index(":") < 10  # scheme:... with short scheme
+        is_protocol_relative = url.startswith("//")
+        assert has_scheme or is_protocol_relative, \
+            f"External link should be a valid URI, got: {url}"
+
+    # Verify extraction matches mwparserfromhell for a sample of rows with text
+    rows_with_links = test[test["external_links"].apply(lambda x: x is not None and len(x) > 0)]
+    if len(rows_with_links) > 0:
+        # Test up to 5 rows
+        sample = rows_with_links.head(5)
+        for idx, row in sample.iterrows():
+            text = row["text"]
+            if text:
+                wikicode = mwparserfromhell.parse(text)
+                expected_links = [str(link.url) for link in wikicode.filter_external_links()]
+                actual_links = list(row["external_links"])
+                assert actual_links == expected_links, \
+                    f"Row {idx}: external_links mismatch. Expected {expected_links}, got {actual_links}"
+
+    print(f"External links only test passed! {len(test)} rows, {len(all_urls)} total URLs extracted")
 
 
 def test_citations_only():
-    """Test that --citations extracts citations without --external-links."""
+    """Test that --citations extracts citations correctly."""
+    import mwparserfromhell
+    from wikiq.wikitext_parser import WikitextParser
+
     tester = WikiqTester(SAILORMOON, "citations_only", in_compression="7z", out_format="parquet")
 
     try:
-        tester.call_wikiq("--citations", "--fandom-2020")
+        # Also include --text so we can verify extraction against actual wikitext
+        tester.call_wikiq("--citations", "--text", "--fandom-2020")
     except subprocess.CalledProcessError as exc:
         pytest.fail(exc.stderr.decode("utf8"))
 
@@ -677,15 +711,43 @@ def test_citations_only():
     assert test["citations"].apply(lambda x: x is None or hasattr(x, '__len__')).all(), \
         "citations should be a list/array type or None"
 
-    print(f"Citations only test passed! {len(test)} rows processed")
+    # Verify that extracted citations have correct prefixes (ref: or template:)
+    all_citations = []
+    for citations in test["citations"]:
+        if citations is not None and len(citations) > 0:
+            all_citations.extend(citations)
+
+    for citation in all_citations:
+        assert citation.startswith("ref:") or citation.startswith("template:"), \
+            f"Citation should start with 'ref:' or 'template:', got: {citation}"
+
+    # Verify extraction matches WikitextParser for a sample of rows with text
+    rows_with_citations = test[test["citations"].apply(lambda x: x is not None and len(x) > 0)]
+    if len(rows_with_citations) > 0:
+        parser = WikitextParser()
+        # Test up to 5 rows
+        sample = rows_with_citations.head(5)
+        for idx, row in sample.iterrows():
+            text = row["text"]
+            if text:
+                expected_citations = parser.extract_citations(text)
+                actual_citations = list(row["citations"])
+                assert actual_citations == expected_citations, \
+                    f"Row {idx}: citations mismatch. Expected {expected_citations}, got {actual_citations}"
+
+    print(f"Citations only test passed! {len(test)} rows, {len(all_citations)} total citations extracted")
 
 
 def test_external_links_and_citations():
     """Test that both --external-links and --citations work together (shared parser)."""
+    import mwparserfromhell
+    from wikiq.wikitext_parser import WikitextParser
+
     tester = WikiqTester(SAILORMOON, "external_links_and_citations", in_compression="7z", out_format="parquet")
 
     try:
-        tester.call_wikiq("--external-links", "--citations", "--fandom-2020")
+        # Also include --text so we can verify extraction against actual wikitext
+        tester.call_wikiq("--external-links", "--citations", "--text", "--fandom-2020")
     except subprocess.CalledProcessError as exc:
         pytest.fail(exc.stderr.decode("utf8"))
 
@@ -701,7 +763,56 @@ def test_external_links_and_citations():
     assert test["citations"].apply(lambda x: x is None or hasattr(x, '__len__')).all(), \
         "citations should be a list/array type or None"
 
-    print(f"External links and citations test passed! {len(test)} rows processed")
+    # Verify URLs look like valid URIs (have a scheme or are protocol-relative)
+    all_urls = []
+    for links in test["external_links"]:
+        if links is not None and len(links) > 0:
+            all_urls.extend(links)
+
+    for url in all_urls:
+        # External links can be http, https, mailto, ftp, etc. or protocol-relative (//)
+        has_scheme = ":" in url and url.index(":") < 10  # scheme:... with short scheme
+        is_protocol_relative = url.startswith("//")
+        assert has_scheme or is_protocol_relative, \
+            f"External link should be a valid URI, got: {url}"
+
+    # Verify citations have correct prefixes
+    all_citations = []
+    for citations in test["citations"]:
+        if citations is not None and len(citations) > 0:
+            all_citations.extend(citations)
+
+    for citation in all_citations:
+        assert citation.startswith("ref:") or citation.startswith("template:"), \
+            f"Citation should start with 'ref:' or 'template:', got: {citation}"
+
+    # Verify extraction matches WikitextParser for a sample of rows with text
+    # This tests that the shared parser optimization works correctly
+    parser = WikitextParser()
+    rows_with_content = test[
+        (test["external_links"].apply(lambda x: x is not None and len(x) > 0)) |
+        (test["citations"].apply(lambda x: x is not None and len(x) > 0))
+    ]
+    if len(rows_with_content) > 0:
+        # Test up to 5 rows
+        sample = rows_with_content.head(5)
+        for idx, row in sample.iterrows():
+            text = row["text"]
+            if text:
+                # Verify external links
+                wikicode = mwparserfromhell.parse(text)
+                expected_links = [str(link.url) for link in wikicode.filter_external_links()]
+                actual_links = list(row["external_links"]) if row["external_links"] is not None else []
+                assert actual_links == expected_links, \
+                    f"Row {idx}: external_links mismatch. Expected {expected_links}, got {actual_links}"
+
+                # Verify citations
+                expected_citations = parser.extract_citations(text)
+                actual_citations = list(row["citations"]) if row["citations"] is not None else []
+                assert actual_citations == expected_citations, \
+                    f"Row {idx}: citations mismatch. Expected {expected_citations}, got {actual_citations}"
+
+    print(f"External links and citations test passed! {len(test)} rows, {len(all_urls)} URLs, {len(all_citations)} citations")
 
 
 def test_no_wikitext_columns():