Merge pull request #20 from bcye/feature/test-parser

Add Unit Tests for Parser
This commit is contained in:
Bruce 2025-04-30 22:08:53 +02:00 committed by GitHub
commit 3126d2c39b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 4920 additions and 2 deletions

23
.github/workflows/test-parser.yaml vendored Normal file
View File

@ -0,0 +1,23 @@
# Run the parser unit tests on every pull request.
on:
  pull_request:

jobs:
  run-tests:
    name: Unit-Test Parser
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: astral-sh/setup-uv@v5
      - name: "Set up Python"
        uses: actions/setup-python@v5
        with:
          # Use the interpreter version pinned in the repository.
          python-version-file: ".python-version"
      - name: Install the project
        # --locked: install from uv.lock as committed; --dev: include the dev group (pytest).
        run: uv sync --locked --dev
      - name: Run tests
        # PYTHONPATH=. puts the working directory on the import path for the tests.
        run: PYTHONPATH=. uv run pytest

View File

@ -4,6 +4,22 @@ Small utility to convert the wikitext data from the Wikivoyage dumps into a stru
## Installation
## Documentation
See [docs](docs) for more information on how to use this utility.
See [docs](docs) for more information on how to use this utility.
## Testing
Run `PYTHONPATH=. pytest` from the repository root, inside the virtual environment.
## License
### Code
(c) 2025 bcye and moll-re
All code and documentation, unless otherwise stated, are licensed under the AGPLv3; refer to [LICENSE](LICENSE) for the full license text.
### Examples
Files in the `docs/example` and `tests/fixtures` directories are copies (.txt) or derivatives (.json) of the Boston article on Wikivoyage and are licensed under CC BY-SA 4.0. A list of contributors is available in the [original article's history](https://en.wikivoyage.org/w/index.php?title=Boston&action=history).

View File

@ -12,3 +12,8 @@ dependencies = [
"python-dotenv>=1.1.0",
"wikitextparser>=0.56.3",
]
[dependency-groups]
dev = [
"pytest>=8.3.5",
]

1577
tests/fixtures/boston_input.txt vendored Normal file

File diff suppressed because it is too large Load Diff

2905
tests/fixtures/boston_output.json vendored Normal file

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,333 @@
import json
import os
import pytest
from transformers import WikivoyageParser
def dump(obj):
    """Return *obj* as a canonical JSON string suitable for deep comparison.

    Keys are sorted and no whitespace is emitted, so two structures that are
    equal as JSON values serialize to the same text.
    """
    compact = (",", ":")
    return json.dumps(obj, separators=compact, sort_keys=True)
def wrap(children):
    """Return a root node whose ``children`` is the given list.

    The root has type ``"root"`` and empty ``properties``; *children* is
    stored as passed, not copied.
    """
    return dict(type="root", properties={}, children=children)
@pytest.fixture
def parser():
    """Provide a newly constructed ``WikivoyageParser`` to each requesting test."""
    instance = WikivoyageParser()
    return instance
def test_empty_input_is_root_only(parser):
    """Parsing the empty string must yield a root node with no children."""
    tree = parser.parse("")
    bare_root = wrap([])
    assert dump(tree) == dump(bare_root)
def test_plain_text_node(parser):
    """Plain text must become a single ``text`` node holding it as ``markdown``."""
    text_node = {
        "type": "text",
        "properties": {"markdown": "Just some plain text."},
        "children": [],
    }
    tree = parser.parse("Just some plain text.")
    assert dump(tree) == dump(wrap([text_node]))
def test_template_node(parser):
    """An unrecognised template must become a ``template`` node with name and params."""
    template_node = {
        "type": "template",
        "properties": {
            "name": "foo",
            "params": {"a": "1", "b": "two"},
        },
        "children": [],
    }
    tree = parser.parse("{{foo|a=1|b=two}}")
    assert dump(tree) == dump(wrap([template_node]))
def test_see_listing_full_properties(parser):
    """A fully populated ``see`` listing must become one ``see`` node carrying every parameter."""
    # One mapping drives both the wikitext and the expected properties;
    # insertion order fixes the order of the "|key=value" parameters.
    fields = {
        "name": "Statue",
        "alt": "Monument",
        "url": "http://x",
        "email": "a@b.com",
        "address": "1 Road",
        "lat": "1.23",
        "long": "4.56",
        "directions": "North",
        "phone": "12345",
        "tollfree": "800",
        "fax": "54321",
        "hours": "24/7",
        "price": "Free",
        "lastedit": "2020-01-01",
        "wikipedia": "Statue",
        "wikidata": "Q1",
        "content": "Big statue",
    }
    snippet = "{{see" + "".join(f"|{key}={value}" for key, value in fields.items()) + "}}"
    tree = parser.parse(snippet)
    node = {"type": "see", "properties": fields, "children": []}
    assert dump(tree) == dump(wrap([node]))
def test_do_listing_full_properties(parser):
    """A fully populated ``do`` listing must become one ``do`` node carrying every parameter."""
    # One mapping drives both the wikitext and the expected properties;
    # insertion order fixes the order of the "|key=value" parameters.
    fields = {
        "name": "Walk",
        "alt": "Stroll",
        "url": "http://walk",
        "email": "hi@walk",
        "address": "Main Street",
        "lat": "2.34",
        "long": "5.67",
        "directions": "East",
        "phone": "222-333",
        "tollfree": "800-DO-WALK",
        "fax": "999-888",
        "hours": "All day",
        "price": "Free",
        "lastedit": "2021-02-02",
        "wikipedia": "Walking",
        "wikidata": "Q2",
        "content": "Enjoy a walk",
    }
    snippet = "{{do" + "".join(f"|{key}={value}" for key, value in fields.items()) + "}}"
    tree = parser.parse(snippet)
    node = {"type": "do", "properties": fields, "children": []}
    assert dump(tree) == dump(wrap([node]))
def test_buy_listing_full_properties(parser):
    """A fully populated ``buy`` listing must become one ``buy`` node carrying every parameter."""
    # One mapping drives both the wikitext and the expected properties;
    # insertion order fixes the order of the "|key=value" parameters.
    fields = {
        "name": "Shirt",
        "alt": "Tees",
        "url": "http://shop",
        "email": "sales@shop",
        "address": "Market St",
        "lat": "3.45",
        "long": "6.78",
        "directions": "West",
        "phone": "444-555",
        "tollfree": "800-BUY-TEE",
        "fax": "777-666",
        "hours": "96",
        "price": "$20",
        "lastedit": "2022-03-03",
        "wikipedia": "Shopping",
        "wikidata": "Q3",
        "content": "Quality tees",
    }
    snippet = "{{buy" + "".join(f"|{key}={value}" for key, value in fields.items()) + "}}"
    tree = parser.parse(snippet)
    node = {"type": "buy", "properties": fields, "children": []}
    assert dump(tree) == dump(wrap([node]))
def test_eat_listing_full_properties(parser):
    """A fully populated ``eat`` listing must become one ``eat`` node carrying every parameter."""
    # One mapping drives both the wikitext and the expected properties;
    # insertion order fixes the order of the "|key=value" parameters.
    fields = {
        "name": "Diner",
        "alt": "Cafe",
        "url": "http://eat",
        "email": "food@eat",
        "address": "Food Lane",
        "lat": "4.56",
        "long": "7.89",
        "directions": "South",
        "phone": "666-777",
        "tollfree": "800-EAT-YUM",
        "fax": "555-444",
        "hours": "Breakfast",
        "price": "$10$30",
        "lastedit": "2023-04-04",
        "wikipedia": "Dining",
        "wikidata": "Q4",
        "content": "Best pancakes",
    }
    snippet = "{{eat" + "".join(f"|{key}={value}" for key, value in fields.items()) + "}}"
    tree = parser.parse(snippet)
    node = {"type": "eat", "properties": fields, "children": []}
    assert dump(tree) == dump(wrap([node]))
def test_drink_listing_full_properties(parser):
    """A fully populated ``drink`` listing must become one ``drink`` node carrying every parameter."""
    # One mapping drives both the wikitext and the expected properties;
    # insertion order fixes the order of the "|key=value" parameters.
    fields = {
        "name": "Pub",
        "alt": "Bar",
        "url": "http://drink",
        "email": "cheers@drink",
        "address": "Bar Street",
        "lat": "5.67",
        "long": "8.90",
        "directions": "Center",
        "phone": "888-999",
        "tollfree": "800-DRINK",
        "fax": "333-222",
        "hours": "Evening",
        "price": "$7$30",
        "lastedit": "2024-05-05",
        "wikipedia": "Nightlife",
        "wikidata": "Q5",
        "content": "Great brews",
    }
    snippet = "{{drink" + "".join(f"|{key}={value}" for key, value in fields.items()) + "}}"
    tree = parser.parse(snippet)
    node = {"type": "drink", "properties": fields, "children": []}
    assert dump(tree) == dump(wrap([node]))
def test_sleep_listing_full_properties(parser):
    """A fully populated ``sleep`` listing must become one ``sleep`` node.

    In addition to the parameters used by the other listing tests, this one
    also passes ``checkin`` and ``checkout``.
    """
    # One mapping drives both the wikitext and the expected properties;
    # insertion order fixes the order of the "|key=value" parameters.
    fields = {
        "name": "Hotel",
        "alt": "Inn",
        "url": "http://sleep",
        "email": "stay@sleep",
        "address": "Sleepy Ave",
        "lat": "6.78",
        "long": "9.01",
        "directions": "Uptown",
        "phone": "000-111",
        "tollfree": "800-SLEEP",
        "fax": "111-000",
        "hours": "24h",
        "price": "$100",
        "lastedit": "2025-06-06",
        "wikipedia": "Accommodation",
        "wikidata": "Q6",
        "checkin": "3PM",
        "checkout": "11AM",
        "content": "Cozy rooms",
    }
    snippet = "{{sleep" + "".join(f"|{key}={value}" for key, value in fields.items()) + "}}"
    tree = parser.parse(snippet)
    node = {"type": "sleep", "properties": fields, "children": []}
    assert dump(tree) == dump(wrap([node]))
def test_generic_listing_full_properties(parser):
    """A fully populated generic ``listing`` must become one ``listing`` node carrying every parameter."""
    # One mapping drives both the wikitext and the expected properties;
    # insertion order fixes the order of the "|key=value" parameters.
    fields = {
        "name": "Info",
        "alt": "Data",
        "url": "http://info",
        "email": "info@info",
        "address": "Down St",
        "lat": "7.89",
        "long": "0.12",
        "directions": "Here",
        "phone": "123-000",
        "tollfree": "800-INFO",
        "fax": "000-123",
        "hours": "All times",
        "price": "$0",
        "lastedit": "2026-07-07",
        "wikipedia": "InfoPage",
        "wikidata": "Q7",
        "content": "Useful info",
    }
    snippet = "{{listing" + "".join(f"|{key}={value}" for key, value in fields.items()) + "}}"
    tree = parser.parse(snippet)
    node = {"type": "listing", "properties": fields, "children": []}
    assert dump(tree) == dump(wrap([node]))
def test_section_and_subsection(parser):
    """A ``==`` heading must give a level-2 section and a nested ``===`` a level-3 one."""
    source = "Intro\n== First ==\nHello\n=== Sub ===\nWorld"
    tree = parser.parse(source)

    # The section is looked up at index 1 of the root's children
    # (presumably index 0 holds the "Intro" text - not asserted here).
    first = tree["children"][1]
    assert first["type"] == "section"
    assert first["properties"]["level"] == 2

    # Likewise the subsection is taken from index 1 of the section's children.
    nested = first["children"][1]
    assert nested["type"] == "section"
    assert nested["properties"]["level"] == 3
def test_full_boston_snapshot(parser):
    """Parse the Boston fixture and compare the tree to the stored JSON snapshot.

    Reads ``fixtures/boston_input.txt`` and ``fixtures/boston_output.json``
    from the directory containing this test module. Both are opened as UTF-8.
    """
    fixtures = os.path.join(os.path.dirname(__file__), "fixtures")

    # Context managers close the handles promptly instead of leaving them
    # to the garbage collector (which triggers ResourceWarning).
    with open(os.path.join(fixtures, "boston_input.txt"), encoding="utf-8") as src:
        wikicode = src.read()
    with open(os.path.join(fixtures, "boston_output.json"), encoding="utf-8") as snap:
        expected = json.load(snap)

    got = parser.parse(wikicode)
    assert dump(got) == dump(expected)

59
uv.lock generated
View File

@ -135,6 +135,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/7c/fc/6a8cb64e5f0324877d503c854da15d76c1e50eb722e320b15345c4d0c6de/cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a", size = 182009 },
]
[[package]]
name = "colorama"
version = "0.4.6"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 },
]
[[package]]
name = "cryptography"
version = "44.0.2"
@ -239,6 +248,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 },
]
[[package]]
name = "iniconfig"
version = "2.1.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f2/97/ebf4da567aa6827c909642694d71c9fcf53e5b504f2d96afea02718862f3/iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7", size = 4793 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050 },
]
[[package]]
name = "mapvoyage-extract"
version = "0.1.0"
@ -252,6 +270,11 @@ dependencies = [
{ name = "wikitextparser" },
]
[package.dev-dependencies]
dev = [
{ name = "pytest" },
]
[package.metadata]
requires-dist = [
{ name = "aiofiles", specifier = ">=24.1.0" },
@ -262,6 +285,9 @@ requires-dist = [
{ name = "wikitextparser", specifier = ">=0.56.3" },
]
[package.metadata.requires-dev]
dev = [{ name = "pytest", specifier = ">=8.3.5" }]
[[package]]
name = "multidict"
version = "6.4.3"
@ -335,6 +361,24 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/78/09/24c2f37524a3ebc3574975766748c7e4423ecefaa815c9fc4a324cbcf94a/mwparserfromhell-0.6.6-cp312-cp312-win_amd64.whl", hash = "sha256:cdc46c115b2495d4025920b7b30a6885a96d2b797ccc4009bf3cc02940ae55d3", size = 101071 },
]
[[package]]
name = "packaging"
version = "25.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/a1/d4/1fc4078c65507b51b96ca8f8c3ba19e6a61c8253c72794544580a7b6c24d/packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f", size = 165727 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469 },
]
[[package]]
name = "pluggy"
version = "1.5.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/96/2d/02d4312c973c6050a18b314a5ad0b3210edb65a906f868e31c111dede4a6/pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", size = 67955 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 },
]
[[package]]
name = "propcache"
version = "0.3.1"
@ -401,6 +445,21 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/13/a3/a812df4e2dd5696d1f351d58b8fe16a405b234ad2886a0dab9183fb78109/pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc", size = 117552 },
]
[[package]]
name = "pytest"
version = "8.3.5"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "colorama", marker = "sys_platform == 'win32'" },
{ name = "iniconfig" },
{ name = "packaging" },
{ name = "pluggy" },
]
sdist = { url = "https://files.pythonhosted.org/packages/ae/3c/c9d525a414d506893f0cd8a8d0de7706446213181570cdbd766691164e40/pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845", size = 1450891 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/30/3d/64ad57c803f1fa1e963a7946b6e0fea4a70df53c1a7fed304586539c2bac/pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820", size = 343634 },
]
[[package]]
name = "python-dotenv"
version = "1.1.0"