Compare commits


37 Commits

Author SHA1 Message Date
Bruce
8f099dc7bc
Merge pull request #33 from bcye/feature/multiple-handlers
Allow for multiple handlers
2025-06-03 14:52:14 +02:00
Bruce Röttgers
be28fddeb5 accept kwargs to forward max conc 2025-05-16 20:42:31 +02:00
Bruce Röttgers
5031f33ea2 move semaphore to handler level 2025-05-16 20:32:21 +02:00
Bruce Röttgers
60c13fb9ec support multiple handlers 2025-05-16 20:27:59 +02:00
Bruce
f73046bd65
Merge pull request #32 from bcye/relicense-types
relicense types to MIT
2025-05-16 16:34:54 +02:00
Bruce
5d1ec5bb2e
relicense types to MIT 2025-05-16 16:33:54 +02:00
Bruce
38901474c6
Merge pull request #29 from bcye/feature/parse-titles
Parse Titles
2025-05-13 16:28:27 +02:00
Bruce Röttgers
b33201e930 add title parsing from xml 2025-05-07 15:22:20 +02:00
Bruce
93d99bf062
Merge pull request #21 from bcye/feature/docker
Dockerize
2025-04-30 22:11:31 +02:00
Bruce
1a5b9b44e0
Merge branch 'main' into feature/docker 2025-04-30 22:10:55 +02:00
Bruce
3126d2c39b
Merge pull request #20 from bcye/feature/test-parser
Add Unit Tests for Parser
2025-04-30 22:08:53 +02:00
Bruce Röttgers
84e9a68bac update reference 2025-04-30 22:06:29 +02:00
Bruce Röttgers
f67d8d3963 Merge branch 'main' into feature/test-parser 2025-04-30 22:05:41 +02:00
Bruce Röttgers
c780a4bb99 Merge branch 'main' into feature/test-parser 2025-04-30 22:04:05 +02:00
Bruce Röttgers
1e89b20483 test scripts didnt have excerpts afterall 2025-04-30 22:02:41 +02:00
Bruce
1c16ee87e6
Update README.md 2025-04-30 21:47:19 +02:00
Bruce
fba9be556e
add proper attribution for fixtures from wikivoyage 2025-04-30 21:34:43 +02:00
Bruce
729d4adc62
Merge pull request #18 from bcye/feature/only-python
Integrate Node Script into Python
2025-04-30 21:12:51 +02:00
Bruce Röttgers
0c2905c119 Merge branch 'main' into feature/only-python 2025-04-30 16:20:13 +02:00
Bruce Röttgers
08cd8b41fe reflect new filename (main.py) in docker 2025-04-30 14:01:56 +02:00
Bruce Röttgers
63babeace3 Merge branch 'feature/only-python' into feature/docker 2025-04-30 14:01:34 +02:00
Bruce Röttgers
b18387a83c refactor transform code into own module 2025-04-30 14:01:20 +02:00
Bruce
3e2149ebcc
Merge pull request #17 from bcye/feature/npm
Refactor types into npm package and add CI Publish Action
2025-04-29 17:22:50 +02:00
Bruce Röttgers
243c4be9fe forgot to remove cd 2025-04-29 17:20:52 +02:00
Bruce Röttgers
6faf2a1a97 try new workflow 2025-04-29 17:19:44 +02:00
Bruce Röttgers
82520947e0 0.2.3 2025-04-27 23:05:30 +02:00
Bruce Röttgers
ac2ab450b9 v0.2.2 2025-04-27 23:04:02 +02:00
Bruce Röttgers
4bed99ca8c v0.2.1 2025-04-27 22:58:22 +02:00
Bruce Röttgers
96843f104c remove fixed package manager version 2025-04-27 22:58:07 +02:00
Bruce Röttgers
75503c971d v0.2.0 compile ts 2025-04-27 22:55:36 +02:00
Bruce Röttgers
559bcdda44 back to uv 2025-04-26 21:57:43 +02:00
Bruce Röttgers
322df10561 try new install method 2025-04-26 19:43:44 +02:00
Bruce Röttgers
59b2aeb1f4 add tests and workflow 2025-04-26 19:40:41 +02:00
Bruce Röttgers
d48e75ce01 types/0.1.2 2025-04-26 15:07:07 +02:00
Bruce Röttgers
5e74672049 types/0.1.1 2025-04-26 15:03:50 +02:00
Bruce Röttgers
4d25bc9e4c add git workflow 2025-04-26 14:48:36 +02:00
Bruce Röttgers
201387be5e refactor types to own folder and init npm pkg 2025-04-26 14:38:32 +02:00
24 changed files with 5456 additions and 250 deletions

40
.github/workflows/publish-types.yaml vendored Normal file

@@ -0,0 +1,40 @@
# Example from https://docs.github.com/en/actions/use-cases-and-examples/publishing-packages/publishing-nodejs-packages#publishing-packages-to-the-npm-registry
name: Publish Types Package to npmjs
on:
  push:
    tags:
      - "types/*"
defaults:
  run:
    working-directory: types
jobs:
  build:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      id-token: write
    steps:
      - uses: actions/checkout@v4
      - uses: pnpm/action-setup@v4
        name: Install pnpm
        with:
          version: 10
          run_install: false
      # Setup .npmrc file to publish to npm
      - uses: actions/setup-node@v4
        with:
          node-version: "20.x"
          cache: "pnpm"
          cache-dependency-path: "types/pnpm-lock.yaml"
          registry-url: "https://registry.npmjs.org"
      - run: pnpm install --frozen-lockfile
      - run: pnpm tsc
      - run: pnpm publish --provenance --access public --no-git-checks
        env:
          NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}

23
.github/workflows/test-parser.yaml vendored Normal file

@@ -0,0 +1,23 @@
on:
  pull_request:
jobs:
  run-tests:
    name: Unit-Test Parser
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: astral-sh/setup-uv@v5
      - name: "Set up Python"
        uses: actions/setup-python@v5
        with:
          python-version-file: ".python-version"
      - name: Install the project
        run: uv sync --locked --dev
      - name: Run tests
        run: PYTHONPATH=. uv run pytest


@@ -7,4 +7,4 @@ RUN uv sync --frozen
 COPY . .
-CMD ["uv", "run", "transform-documents.py"]
+CMD ["uv", "run", "main.py"]


@@ -15,3 +15,19 @@ TypeScript types for consuming the json output are available, you may install th
 ## Documentation
 See [docs](docs) for more information on how to use this utility.
+
+## Testing
+
+Run `PYTHONPATH=. pytest` from inside the venv
+
+## License
+
+### Code
+
+(c) 2025 bcye and moll-re
+
+All code and documentation unless otherwise stated is licensed under the AGPLv3 license, refer to [LICENSE](LICENSE) for the full license text. The types package and all its code is [licensed under MIT](types/LICENSE).
+
+### Examples
+
+Files in the `docs/example` and `tests/fixtures` are copies (.txt) or derivatives (.json) of the Boston Article on Wikivoyage and licensed under CC BY-SA 4.0. A [list of contributors is available on the original article](https://en.wikivoyage.org/w/index.php?title=Boston&action=history).

5
docs/types.md Normal file

@@ -0,0 +1,5 @@
# Types Package

## Publishing new versions

Bump the version in package.json, create a new commit, and tag it with "types/x.y.z"; the new version is published when the tag is pushed to GitHub.

143
main.py Normal file

@@ -0,0 +1,143 @@
#!/usr/bin/env python3
import os
import sys
import re
import zlib
import bz2
import asyncio
import logging
import importlib
import xml.sax
from pathlib import Path
from dotenv import load_dotenv
import aiohttp

from transformers import fetch_mappings, WikiDumpHandler, WikivoyageParser

logger = logging.getLogger(__name__)


def gather_handler_kwargs(handler_name: str) -> dict:
    """
    Find all ENV vars starting with HANDLER_<NAME>_ and turn them into kwargs.
    E.g. HANDLER_SFTP_HOST=foo → {"host": "foo"}, HANDLER_SFTP_PORT=2222 → {"port": 2222}
    """
    prefix = f"HANDLER_{handler_name.upper()}_"
    kwargs = {}

    for env_key, val in os.environ.items():
        if not env_key.startswith(prefix):
            continue
        param = env_key.replace(prefix, "").lower()
        # cast ints
        if val.isdigit():
            val = int(val)
        # cast bools
        elif val.lower() in ("true", "false"):
            val = val.lower() == "true"
        kwargs[param] = val

    logger.debug(f"Handler kwargs: {kwargs}")
    return kwargs


async def process_dump(
    mappings: dict[str, str], handlers
):
    """
    Stream-download the bzip2-compressed XML dump and feed to SAX.
    """
    xml_url = (
        "https://dumps.wikimedia.org/"
        "enwikivoyage/latest/"
        "enwikivoyage-latest-pages-articles.xml.bz2"
    )

    decomp = bz2.BZ2Decompressor()
    sax_parser = xml.sax.make_parser()
    dump_handler = WikiDumpHandler(mappings, handlers)
    sax_parser.setContentHandler(dump_handler)

    async with aiohttp.ClientSession() as session:
        async with session.get(xml_url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                data = decomp.decompress(chunk)
                if not data:
                    continue
                text = data.decode("utf-8", errors="ignore")
                sax_parser.feed(text)

    sax_parser.close()
    if dump_handler.tasks:
        await asyncio.gather(*dump_handler.tasks)


async def main():
    # 1. Which handler(s) to load?
    handler_names = os.getenv("HANDLER", "").split(",")
    if not handler_names or not handler_names[0]:
        logger.error("Error: set ENV HANDLER (e.g. 'filesystem' or 'filesystem,sftp')")
        sys.exit(1)

    # 2. Read concurrency setting
    try:
        max_conc = int(os.getenv("MAX_CONCURRENT", "0"))
    except ValueError:
        raise ValueError("MAX_CONCURRENT must be an integer")
    if max_conc < 0:
        raise ValueError("MAX_CONCURRENT must be >= 0")

    handlers = []

    # 3. Load each handler
    for handler_name in handler_names:
        handler_name = handler_name.strip()
        if not handler_name:
            continue

        # Dynamic import
        module_path = f"output_handlers.{handler_name}"
        try:
            mod = importlib.import_module(module_path)
        except ImportError as e:
            logger.error(f"Error loading handler module {module_path}: {e}")
            sys.exit(1)

        # Find the class: e.g. "sftp" → "SftpHandler"
        class_name = handler_name.title().replace("_", "") + "Handler"
        if not hasattr(mod, class_name):
            logger.error(f"{module_path} defines no class {class_name}")
            sys.exit(1)
        HandlerCls = getattr(mod, class_name)

        logger.info(f"Using handler from {module_path}")

        # Build kwargs from ENV
        handler_kwargs = gather_handler_kwargs(handler_name)
        # Add max_concurrent to kwargs
        handler_kwargs["max_concurrent"] = max_conc

        # Instantiate
        handler = HandlerCls(**handler_kwargs)
        handlers.append(handler)

    # 4. Fetch mappings
    logger.info("Fetching mappings from SQL dump…")
    mappings = await fetch_mappings()
    logger.info(f"Got {len(mappings)} wikibase_item mappings.")

    # 5. Stream & split the XML dump
    logger.info("Processing XML dump…")
    await process_dump(mappings, handlers)  # handlers enforce their own concurrency limits

    # 6. Finish up
    await asyncio.gather(*[handler.close() for handler in handlers])
    logger.info("All done.")


if __name__ == "__main__":
    load_dotenv()
    if os.getenv("DEBUG"):
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)
    asyncio.run(main())
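For illustration (not part of the diff): assuming the filesystem and sftp handlers named in the error message above, the HANDLER_<NAME>_ convention documented in gather_handler_kwargs resolves roughly like this. HANDLER_SFTP_RETRY is invented for the example, and the exact kwargs a handler accepts depend on its constructor.

import os

# hypothetical configuration: two handlers, shared concurrency cap
os.environ["HANDLER"] = "filesystem,sftp"
os.environ["MAX_CONCURRENT"] = "8"
os.environ["HANDLER_SFTP_HOST"] = "foo"    # becomes kwargs["host"] = "foo"
os.environ["HANDLER_SFTP_PORT"] = "2222"   # becomes kwargs["port"] = 2222 (cast to int)
os.environ["HANDLER_SFTP_RETRY"] = "true"  # becomes kwargs["retry"] = True (cast to bool)

# gather_handler_kwargs("sftp") would then return:
# {"host": "foo", "port": 2222, "retry": True}
# main() adds max_concurrent=8 and instantiates SftpHandler(**kwargs).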


@@ -1,6 +1,7 @@
 """Reference handler for output handlers."""
 from abc import ABC, abstractmethod
 import logging
+import asyncio
@@ -14,15 +15,20 @@ class BaseHandler(ABC):
     _successful_writes = 0
     _failed_writes = 0

-    def __init__(self, fail_on_error: bool = True, **kwargs):
+    def __init__(self, fail_on_error: bool = True, max_concurrent=0, **kwargs):
         """
         Initializes the BaseHandler with optional parameters.

         Args:
             fail_on_error (bool): If True, the handler will raise an exception on error. Defaults to True.
+            max_concurrent: Maximum number of concurrent write operations.
+                0 means unlimited concurrency.
             **kwargs: Additional keyword arguments for specific handler implementations.
         """
         self.fail_on_error = fail_on_error
+        self.semaphore = None
+        if max_concurrent > 0:
+            self.semaphore = asyncio.Semaphore(max_concurrent)

     @abstractmethod
@@ -47,6 +53,10 @@
             entry (dict): The entry to write (will be JSON-encoded).
             uid (str): The unique identifier for the entry. The default id provided by wikivoyage is recommended.
         """
-        success = await self._write_entry(entry, uid)
+        if self.semaphore:
+            async with self.semaphore:
+                success = await self._write_entry(entry, uid)
+        else:
+            success = await self._write_entry(entry, uid)
         if success:
             self.logger.debug(f"Successfully wrote entry with UID {uid}")


@@ -10,8 +10,9 @@ class BunnyStorageHandler(BaseHandler):
         api_key: str,
         fail_on_error: bool = True,
         keepalive_timeout: int = 75,
+        **kwargs,
     ):
-        super().__init__(fail_on_error=fail_on_error)
+        super().__init__(fail_on_error=fail_on_error, **kwargs)
         self.base_url = f"https://{region}.bunnycdn.com/{base_path}"
         self.headers = {
             "AccessKey": api_key,


@@ -12,3 +12,8 @@ dependencies = [
     "python-dotenv>=1.1.0",
     "wikitextparser>=0.56.3",
 ]
+
+[dependency-groups]
+dev = [
+    "pytest>=8.3.5",
+]

1577
tests/fixtures/boston_input.txt vendored Normal file

File diff suppressed because it is too large

2905
tests/fixtures/boston_output.json vendored Normal file

File diff suppressed because one or more lines are too long


@@ -0,0 +1,333 @@
import json
import os

import pytest

from transformers import WikivoyageParser


def dump(obj):
    # canonical JSON for deep compare
    return json.dumps(obj, sort_keys=True, separators=(",", ":"))


def wrap(children):
    """Wrap a list of child nodes in the default root envelope."""
    return {
        "type": "root",
        "properties": {},
        "children": children
    }


@pytest.fixture
def parser():
    return WikivoyageParser()


def test_empty_input_is_root_only(parser):
    got = parser.parse("")
    assert dump(got) == dump(wrap([]))


def test_plain_text_node(parser):
    got = parser.parse("Just some plain text.")
    expected = wrap([
        {"type":"text","properties":{"markdown":"Just some plain text."},"children":[]}
    ])
    assert dump(got) == dump(expected)


def test_template_node(parser):
    got = parser.parse("{{foo|a=1|b=two}}")
    expected = wrap([
        {
            "type":"template",
            "properties":{"name":"foo","params":{"a":"1","b":"two"}},
            "children":[]
        }
    ])
    assert dump(got) == dump(expected)


def test_see_listing_full_properties(parser):
    snippet = (
        "{{see"
        "|name=Statue"
        "|alt=Monument"
        "|url=http://x"
        "|email=a@b.com"
        "|address=1 Road"
        "|lat=1.23"
        "|long=4.56"
        "|directions=North"
        "|phone=12345"
        "|tollfree=800"
        "|fax=54321"
        "|hours=24/7"
        "|price=Free"
        "|lastedit=2020-01-01"
        "|wikipedia=Statue"
        "|wikidata=Q1"
        "|content=Big statue"
        "}}"
    )
    got = parser.parse(snippet)
    expected = wrap([
        {
            "type":"see",
            "properties":{
                "name":"Statue","alt":"Monument","url":"http://x",
                "email":"a@b.com","address":"1 Road","lat":"1.23","long":"4.56",
                "directions":"North","phone":"12345","tollfree":"800",
                "fax":"54321","hours":"24/7","price":"Free",
                "lastedit":"2020-01-01","wikipedia":"Statue","wikidata":"Q1",
                "content":"Big statue"
            },
            "children":[]
        }
    ])
    assert dump(got) == dump(expected)


def test_do_listing_full_properties(parser):
    snippet = (
        "{{do"
        "|name=Walk"
        "|alt=Stroll"
        "|url=http://walk"
        "|email=hi@walk"
        "|address=Main Street"
        "|lat=2.34"
        "|long=5.67"
        "|directions=East"
        "|phone=222-333"
        "|tollfree=800-DO-WALK"
        "|fax=999-888"
        "|hours=All day"
        "|price=Free"
        "|lastedit=2021-02-02"
        "|wikipedia=Walking"
        "|wikidata=Q2"
        "|content=Enjoy a walk"
        "}}"
    )
    got = parser.parse(snippet)
    expected = wrap([
        {
            "type":"do",
            "properties":{
                "name":"Walk","alt":"Stroll","url":"http://walk",
                "email":"hi@walk","address":"Main Street","lat":"2.34","long":"5.67",
                "directions":"East","phone":"222-333","tollfree":"800-DO-WALK",
                "fax":"999-888","hours":"All day","price":"Free",
                "lastedit":"2021-02-02","wikipedia":"Walking","wikidata":"Q2",
                "content":"Enjoy a walk"
            },
            "children":[]
        }
    ])
    assert dump(got) == dump(expected)


def test_buy_listing_full_properties(parser):
    snippet = (
        "{{buy"
        "|name=Shirt"
        "|alt=Tees"
        "|url=http://shop"
        "|email=sales@shop"
        "|address=Market St"
        "|lat=3.45"
        "|long=6.78"
        "|directions=West"
        "|phone=444-555"
        "|tollfree=800-BUY-TEE"
        "|fax=777-666"
        "|hours=9–6"
        "|price=$20"
        "|lastedit=2022-03-03"
        "|wikipedia=Shopping"
        "|wikidata=Q3"
        "|content=Quality tees"
        "}}"
    )
    got = parser.parse(snippet)
    expected = wrap([
        {
            "type":"buy",
            "properties":{
                "name":"Shirt","alt":"Tees","url":"http://shop",
                "email":"sales@shop","address":"Market St","lat":"3.45","long":"6.78",
                "directions":"West","phone":"444-555","tollfree":"800-BUY-TEE",
                "fax":"777-666","hours":"9–6","price":"$20",
                "lastedit":"2022-03-03","wikipedia":"Shopping","wikidata":"Q3",
                "content":"Quality tees"
            },
            "children":[]
        }
    ])
    assert dump(got) == dump(expected)


def test_eat_listing_full_properties(parser):
    snippet = (
        "{{eat"
        "|name=Diner"
        "|alt=Cafe"
        "|url=http://eat"
        "|email=food@eat"
        "|address=Food Lane"
        "|lat=4.56"
        "|long=7.89"
        "|directions=South"
        "|phone=666-777"
        "|tollfree=800-EAT-YUM"
        "|fax=555-444"
        "|hours=Breakfast"
        "|price=$10–$30"
        "|lastedit=2023-04-04"
        "|wikipedia=Dining"
        "|wikidata=Q4"
        "|content=Best pancakes"
        "}}"
    )
    got = parser.parse(snippet)
    expected = wrap([
        {
            "type":"eat",
            "properties":{
                "name":"Diner","alt":"Cafe","url":"http://eat",
                "email":"food@eat","address":"Food Lane","lat":"4.56","long":"7.89",
                "directions":"South","phone":"666-777","tollfree":"800-EAT-YUM",
                "fax":"555-444","hours":"Breakfast","price":"$10–$30",
                "lastedit":"2023-04-04","wikipedia":"Dining","wikidata":"Q4",
                "content":"Best pancakes"
            },
            "children":[]
        }
    ])
    assert dump(got) == dump(expected)


def test_drink_listing_full_properties(parser):
    snippet = (
        "{{drink"
        "|name=Pub"
        "|alt=Bar"
        "|url=http://drink"
        "|email=cheers@drink"
        "|address=Bar Street"
        "|lat=5.67"
        "|long=8.90"
        "|directions=Center"
        "|phone=888-999"
        "|tollfree=800-DRINK"
        "|fax=333-222"
        "|hours=Evening"
        "|price=$7–$30"
        "|lastedit=2024-05-05"
        "|wikipedia=Nightlife"
        "|wikidata=Q5"
        "|content=Great brews"
        "}}"
    )
    got = parser.parse(snippet)
    expected = wrap([
        {
            "type":"drink",
            "properties":{
                "name":"Pub","alt":"Bar","url":"http://drink",
                "email":"cheers@drink","address":"Bar Street","lat":"5.67","long":"8.90",
                "directions":"Center","phone":"888-999","tollfree":"800-DRINK",
                "fax":"333-222","hours":"Evening","price":"$7–$30",
                "lastedit":"2024-05-05","wikipedia":"Nightlife","wikidata":"Q5",
                "content":"Great brews"
            },
            "children":[]
        }
    ])
    assert dump(got) == dump(expected)


def test_sleep_listing_full_properties(parser):
    snippet = (
        "{{sleep"
        "|name=Hotel"
        "|alt=Inn"
        "|url=http://sleep"
        "|email=stay@sleep"
        "|address=Sleepy Ave"
        "|lat=6.78"
        "|long=9.01"
        "|directions=Uptown"
        "|phone=000-111"
        "|tollfree=800-SLEEP"
        "|fax=111-000"
        "|hours=24h"
        "|price=$100"
        "|lastedit=2025-06-06"
        "|wikipedia=Accommodation"
        "|wikidata=Q6"
        "|checkin=3PM"
        "|checkout=11AM"
        "|content=Cozy rooms"
        "}}"
    )
    got = parser.parse(snippet)
    expected = wrap([
        {
            "type":"sleep",
            "properties":{
                "name":"Hotel","alt":"Inn","url":"http://sleep",
                "email":"stay@sleep","address":"Sleepy Ave","lat":"6.78","long":"9.01",
                "directions":"Uptown","phone":"000-111","tollfree":"800-SLEEP",
                "fax":"111-000","hours":"24h","price":"$100",
                "lastedit":"2025-06-06","wikipedia":"Accommodation","wikidata":"Q6",
                "checkin":"3PM","checkout":"11AM","content":"Cozy rooms"
            },
            "children":[]
        }
    ])
    assert dump(got) == dump(expected)


def test_generic_listing_full_properties(parser):
    snippet = (
        "{{listing"
        "|name=Info"
        "|alt=Data"
        "|url=http://info"
        "|email=info@info"
        "|address=Down St"
        "|lat=7.89"
        "|long=0.12"
        "|directions=Here"
        "|phone=123-000"
        "|tollfree=800-INFO"
        "|fax=000-123"
        "|hours=All times"
        "|price=$0"
        "|lastedit=2026-07-07"
        "|wikipedia=InfoPage"
        "|wikidata=Q7"
        "|content=Useful info"
        "}}"
    )
    got = parser.parse(snippet)
    expected = wrap([
        {
            "type":"listing",
            "properties":{
                "name":"Info","alt":"Data","url":"http://info",
                "email":"info@info","address":"Down St","lat":"7.89","long":"0.12",
                "directions":"Here","phone":"123-000","tollfree":"800-INFO",
                "fax":"000-123","hours":"All times","price":"$0",
                "lastedit":"2026-07-07","wikipedia":"InfoPage","wikidata":"Q7",
                "content":"Useful info"
            },
            "children":[]
        }
    ])
    assert dump(got) == dump(expected)


def test_section_and_subsection(parser):
    got = parser.parse("Intro\n== First ==\nHello\n=== Sub ===\nWorld")
    sec = got["children"][1]
    assert sec["type"] == "section" and sec["properties"]["level"] == 2
    sub = sec["children"][1]
    assert sub["type"] == "section" and sub["properties"]["level"] == 3


def test_full_boston_snapshot(parser):
    here = os.path.dirname(__file__)
    inp = os.path.join(here, "fixtures", "boston_input.txt")
    out = os.path.join(here, "fixtures", "boston_output.json")

    wikicode = open(inp, encoding="utf-8").read()
    expected = json.load(open(out, encoding="utf-8"))

    got = parser.parse(wikicode)
    assert dump(got) == dump(expected)
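A side note on the dump() helper above: sorting keys is what makes the deep comparisons order-insensitive. A standalone illustration, not from the repo:

import json

a = {"b": 1, "a": 2}
b = {"a": 2, "b": 1}
print(json.dumps(a) == json.dumps(b))                                   # False: insertion order differs
print(json.dumps(a, sort_keys=True) == json.dumps(b, sort_keys=True))   # True: canonical form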


@@ -1,246 +0,0 @@
#!/usr/bin/env python3
import os
import sys
import re
import zlib
import bz2
import asyncio
import logging
import importlib
import xml.sax
from pathlib import Path
from dotenv import load_dotenv
import aiohttp

from parser import WikivoyageParser

logger = logging.getLogger(__name__)


def gather_handler_kwargs(handler_name: str) -> dict:
    """
    Find all ENV vars starting with HANDLER_<NAME>_ and turn them into kwargs.
    E.g. HANDLER_SFTP_HOST=foo → {"host": "foo"}, HANDLER_SFTP_PORT=2222 → {"port": 2222}
    """
    prefix = f"HANDLER_{handler_name.upper()}_"
    kwargs = {}

    for env_key, val in os.environ.items():
        if not env_key.startswith(prefix):
            continue
        param = env_key.replace(prefix, "").lower()
        # cast ints
        if val.isdigit():
            val = int(val)
        # cast bools
        elif val.lower() in ("true", "false"):
            val = val.lower() == "true"
        kwargs[param] = val

    logger.debug(f"Handler kwargs: {kwargs}")
    return kwargs


async def fetch_mappings() -> dict[str, str]:
    """
    Download and gunzip the page_props SQL dump, extract
    page → wikibase_item mappings.
    """
    sql_url = (
        "https://dumps.wikimedia.org/"
        "enwikivoyage/latest/"
        "enwikivoyage-latest-page_props.sql.gz"
    )

    # decompress gzip
    decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
    # regex for tuples: (page,'prop','value',NULL_or_number)
    tuple_re = re.compile(r"\((\d+),'([^']+)','([^']+)',(NULL|[\d\.]+)\)")

    buffer = ""
    mappings: dict[str, str] = {}

    async with aiohttp.ClientSession() as session:
        async with session.get(sql_url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                data = decomp.decompress(chunk)
                if not data:
                    continue
                text = data.decode("utf-8", errors="ignore")
                buffer += text
                for m in tuple_re.finditer(buffer):
                    page_id, prop, value = m.group(1), m.group(2), m.group(3)
                    if prop == "wikibase_item":
                        mappings[page_id] = value
                # keep tail to handle split tuples
                if len(buffer) > 1000:
                    buffer = buffer[-1000:]

    return mappings


class WikiDumpHandler(xml.sax.ContentHandler):
    """
    SAX handler that, for each <page> whose <id> is in mappings,
    collects the <text> and schedules an async task to parse
    and write via the user-supplied handler.
    """

    def __init__(self, mappings, handler, max_concurrent):
        super().__init__()
        self.mappings = mappings
        self.handler = handler
        self.sem = (
            asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None
        )
        self.tasks: list[asyncio.Task] = []

        self.currentTag: str | None = None
        self.inPage = False
        self.inRevision = False
        self.inText = False
        self.currentPageId: str | None = None
        self.currentText: list[str] = []

    def startElement(self, name, attrs):
        self.currentTag = name
        if name == "page":
            self.inPage = True
            self.currentPageId = None
            self.currentText = []
        elif name == "revision":
            self.inRevision = True
        elif name == "text" and self.inRevision:
            self.inText = True

    def endElement(self, name):
        if name == "page":
            pid = self.currentPageId
            if pid and pid in self.mappings:
                wd_id = self.mappings[pid]
                text = "".join(self.currentText)
                # schedule processing
                if self.sem:
                    task = asyncio.create_task(self._bounded_process(text, wd_id))
                else:
                    task = asyncio.create_task(self._process(text, wd_id))
                self.tasks.append(task)
            # reset
            self.inPage = self.inRevision = self.inText = False
            self.currentPageId = None
            self.currentText = []
        elif name == "revision":
            self.inRevision = False
        elif name == "text":
            self.inText = False
        self.currentTag = None

    def characters(self, content):
        # Only filter whitespace for ID fields, preserve all content for text
        if (
            self.currentTag == "id"
            and self.inPage
            and not self.inRevision
            and not self.currentPageId
        ):
            content_stripped = content.strip()
            if content_stripped:  # Only process non-empty ID content
                self.currentPageId = content_stripped
        elif self.inText:
            # Always append text content, even if it's just whitespace or newlines
            self.currentText.append(content)

    async def _process(self, text: str, uid: str):
        parser = WikivoyageParser()
        entry = parser.parse(text)
        await self.handler.write_entry(entry, uid)

    async def _bounded_process(self, text: str, uid: str):
        # Only run N at once
        async with self.sem:
            await self._process(text, uid)


async def process_dump(
    mappings: dict[str, str], handler, max_concurrent: int
):
    """
    Stream-download the bzip2-compressed XML dump and feed to SAX.
    """
    xml_url = (
        "https://dumps.wikimedia.org/"
        "enwikivoyage/latest/"
        "enwikivoyage-latest-pages-articles.xml.bz2"
    )

    decomp = bz2.BZ2Decompressor()
    sax_parser = xml.sax.make_parser()
    dump_handler = WikiDumpHandler(mappings, handler, max_concurrent)
    sax_parser.setContentHandler(dump_handler)

    async with aiohttp.ClientSession() as session:
        async with session.get(xml_url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                data = decomp.decompress(chunk)
                if not data:
                    continue
                text = data.decode("utf-8", errors="ignore")
                sax_parser.feed(text)

    sax_parser.close()
    if dump_handler.tasks:
        await asyncio.gather(*dump_handler.tasks)


async def main():
    # 1. Which handler to load?
    handler_name = os.getenv("HANDLER")
    if not handler_name:
        logger.error("Error: set ENV HANDLER (e.g. 'filesystem')")
        sys.exit(1)

    # 2. Dynamic import
    module_path = f"output_handlers.{handler_name}"
    try:
        mod = importlib.import_module(module_path)
    except ImportError as e:
        logger.error(f"Error loading handler module {module_path}: {e}")
        sys.exit(1)

    # 3. Find the class: e.g. "sftp" → "SftpHandler"
    class_name = handler_name.title().replace("_", "") + "Handler"
    if not hasattr(mod, class_name):
        logger.error(f"{module_path} defines no class {class_name}")
        sys.exit(1)
    HandlerCls = getattr(mod, class_name)

    logger.info(f"Using handler from {module_path}")

    # 4. Build kwargs from ENV
    handler_kwargs = gather_handler_kwargs(handler_name)

    # 5. Instantiate
    handler = HandlerCls(**handler_kwargs)

    # 6. Read concurrency setting
    try:
        max_conc = int(os.getenv("MAX_CONCURRENT", "0"))
    except ValueError:
        raise ValueError("MAX_CONCURRENT must be an integer")
    if max_conc < 0:
        raise ValueError("MAX_CONCURRENT must be >= 0")

    # 7. Fetch mappings
    logger.info("Fetching mappings from SQL dump…")
    mappings = await fetch_mappings()
    logger.info(f"Got {len(mappings)} wikibase_item mappings.")

    # 8. Stream & split the XML dump
    logger.info("Processing XML dump…")
    await process_dump(mappings, handler, max_conc)

    # 9. Finish up
    await handler.close()
    logger.info("All done.")


if __name__ == "__main__":
    load_dotenv()
    if os.getenv("DEBUG"):
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)
    asyncio.run(main())

3
transformers/__init__.py Normal file

@@ -0,0 +1,3 @@
from .fetch_mappings import fetch_mappings
from .wiki_dump_handler import WikiDumpHandler
from .parser import WikivoyageParser


@@ -0,0 +1,42 @@
from logging import getLogger
import zlib
import re

import aiohttp

logger = getLogger(__name__)


async def fetch_mappings() -> dict[str, str]:
    """
    Download and gunzip the page_props SQL dump, extract
    page → wikibase_item mappings.
    """
    sql_url = (
        "https://dumps.wikimedia.org/"
        "enwikivoyage/latest/"
        "enwikivoyage-latest-page_props.sql.gz"
    )

    # decompress gzip
    decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
    # regex for tuples: (page,'prop','value',NULL_or_number)
    tuple_re = re.compile(r"\((\d+),'([^']+)','([^']+)',(NULL|[\d\.]+)\)")

    buffer = ""
    mappings: dict[str, str] = {}

    async with aiohttp.ClientSession() as session:
        async with session.get(sql_url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                data = decomp.decompress(chunk)
                if not data:
                    continue
                text = data.decode("utf-8", errors="ignore")
                buffer += text
                for m in tuple_re.finditer(buffer):
                    page_id, prop, value = m.group(1), m.group(2), m.group(3)
                    if prop == "wikibase_item":
                        logger.debug(f"Found mapping {page_id} -> {value}")
                        mappings[page_id] = value
                # keep tail to handle split tuples
                if len(buffer) > 1000:
                    buffer = buffer[-1000:]

    return mappings
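To see what the tuple regex actually matches, here is a standalone sketch run against a fabricated page_props INSERT fragment; the ids and values are made up:

import re

tuple_re = re.compile(r"\((\d+),'([^']+)','([^']+)',(NULL|[\d\.]+)\)")

sample = "INSERT INTO `page_props` VALUES (12,'wikibase_item','Q100',NULL),(12,'page_image','Boston.jpg',NULL);"
mappings = {
    m.group(1): m.group(3)
    for m in tuple_re.finditer(sample)
    if m.group(2) == "wikibase_item"  # other props are filtered out
}
print(mappings)  # {'12': 'Q100'}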


@@ -0,0 +1,100 @@
from logging import getLogger
import xml.sax
import asyncio

from .parser import WikivoyageParser

logger = getLogger(__name__)


class WikiDumpHandler(xml.sax.ContentHandler):
    """
    SAX handler that, for each <page> whose <id> is in mappings,
    collects the <text> and schedules an async task to parse
    and write via the user-supplied handler(s).
    """

    def __init__(self, mappings, handlers):
        super().__init__()
        self.mappings = mappings
        # Support a single handler or a list of handlers
        self.handlers = handlers
        self.tasks: list[asyncio.Task] = []

        self.currentTag: str | None = None
        self.inPage = False
        self.inRevision = False
        self.inText = False
        self.currentPageId: str | None = None
        self.currentTitle: str | None = None
        self.currentText: list[str] = []

    def startElement(self, name, attrs):
        self.currentTag = name
        if name == "page":
            logger.debug("start page")
            self.inPage = True
            self.currentPageId = None
            self.currentTitle = None
            self.currentText = []
        elif name == "revision":
            logger.debug("start revision")
            self.inRevision = True
        elif name == "text" and self.inRevision:
            logger.debug("start text")
            self.inText = True

    def endElement(self, name):
        if name == "page":
            logger.debug("end page")
            pid = self.currentPageId
            if pid and pid in self.mappings:
                wd_id = self.mappings[pid]
                text = "".join(self.currentText)
                title = self.currentTitle
                logger.debug(f"scheduled {wd_id} for handling")
                # schedule processing
                task = asyncio.create_task(self._process(text, wd_id, title))
                self.tasks.append(task)
            else:
                logger.debug(f"page {pid} without wikidata id, skipping...")
            # reset
            self.inPage = self.inRevision = self.inText = False
            self.currentPageId = None
            self.currentTitle = None
            self.currentText = []
        elif name == "revision":
            logger.debug("end revision")
            self.inRevision = False
        elif name == "text":
            logger.debug("end text")
            self.inText = False
        self.currentTag = None

    def characters(self, content):
        # Only filter whitespace for ID fields, preserve all content for text
        if (
            self.currentTag == "id"
            and self.inPage
            and not self.inRevision
            and not self.currentPageId
        ):
            content_stripped = content.strip()
            if content_stripped:  # Only process non-empty ID content
                self.currentPageId = content_stripped
        elif self.currentTag == "title" and self.inPage:
            if self.currentTitle is None:
                self.currentTitle = content
            else:
                self.currentTitle += content
        elif self.inText:
            # Always append text content, even if it's just whitespace or newlines
            self.currentText.append(content)

    async def _process(self, text: str, uid: str, title: str):
        parser = WikivoyageParser()
        entry = parser.parse(text)
        entry['properties']['title'] = title
        # Write to all handlers concurrently
        await asyncio.gather(*[
            handler.write_entry(entry, uid) for handler in self.handlers
        ])
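A rough way to exercise WikiDumpHandler in isolation (a sketch, not from the repo: the toy XML only mimics the shape of a MediaWiki dump, and StubHandler stands in for a real output handler):

import asyncio
import xml.sax
from transformers import WikiDumpHandler

class StubHandler:
    """Hypothetical write target that just prints what it receives."""
    async def write_entry(self, entry, uid):
        print(uid, entry["properties"].get("title"))

async def demo():
    xml_doc = (
        "<mediawiki><page><title>Boston</title><id>12</id>"
        "<revision><text>== See ==</text></revision></page></mediawiki>"
    )
    handler = WikiDumpHandler({"12": "Q100"}, [StubHandler()])
    # parseString is synchronous; tasks are scheduled on the running loop
    xml.sax.parseString(xml_doc.encode(), handler)
    await asyncio.gather(*handler.tasks)  # prints: Q100 Boston

asyncio.run(demo())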

21
types/LICENSE Normal file

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2025 bcye and moll-re
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

3
types/README.md Normal file

@@ -0,0 +1,3 @@
# @bcye/structured-wikivoyage-types
Types to use when consuming json trees from the structured-wikivoyage-exports project

29
types/package.json Normal file

@@ -0,0 +1,29 @@
{
  "name": "@bcye/structured-wikivoyage-types",
  "version": "0.2.5",
  "description": "Types to use when consuming json trees from the structured-wikivoyage-exports project",
  "keywords": [],
  "contributors": [
    "bcye",
    "moll-re"
  ],
  "license": "MIT",
  "repository": {
    "type": "git",
    "url": "git+https://github.com/bcye/structured-wikivoyage-exports.git"
  },
  "bugs": {
    "url": "https://github.com/bcye/structured-wikivoyage-exports/issues"
  },
  "homepage": "https://github.com/bcye/structured-wikivoyage-exports#readme",
  "files": [
    "dist/index.d.ts",
    "dist/index.js"
  ],
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
  "private": false,
  "devDependencies": {
    "typescript": "^5.8.3"
  }
}

24
types/pnpm-lock.yaml generated Normal file

@@ -0,0 +1,24 @@
lockfileVersion: '9.0'

settings:
  autoInstallPeers: true
  excludeLinksFromLockfile: false

importers:

  .:
    devDependencies:
      typescript:
        specifier: ^5.8.3
        version: 5.8.3

packages:

  typescript@5.8.3:
    resolution: {integrity: sha512-p1diW6TqL9L07nNxvRMM7hMMw4c5XOo/1ibL4aAIGmSAt9slTE1Xgw5KWuof2uTOvCg9BY7ZRi+GaF+7sfgPeQ==}
    engines: {node: '>=14.17'}
    hasBin: true

snapshots:

  typescript@5.8.3: {}

113
types/tsconfig.json Normal file

@@ -0,0 +1,113 @@
{
"compilerOptions": {
/* Visit https://aka.ms/tsconfig to read more about this file */
/* Projects */
// "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */
// "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */
// "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */
// "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */
// "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */
// "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */
/* Language and Environment */
"target": "es2016" /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */,
// "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
// "jsx": "preserve", /* Specify what JSX code is generated. */
// "libReplacement": true, /* Enable lib replacement. */
// "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */
// "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */
// "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */
// "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */
// "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */
// "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */
// "noLib": true, /* Disable including any library files, including the default lib.d.ts. */
// "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */
// "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */
/* Modules */
"module": "commonjs" /* Specify what module code is generated. */,
// "rootDir": "./", /* Specify the root folder within your source files. */
// "moduleResolution": "node10", /* Specify how TypeScript looks up a file from a given module specifier. */
// "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */
// "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
// "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */
// "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */
// "types": [], /* Specify type package names to be included without being referenced in a source file. */
// "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */
// "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */
// "allowImportingTsExtensions": true, /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. */
// "rewriteRelativeImportExtensions": true, /* Rewrite '.ts', '.tsx', '.mts', and '.cts' file extensions in relative import paths to their JavaScript equivalent in output files. */
// "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */
// "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */
// "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
// "noUncheckedSideEffectImports": true, /* Check side effect imports. */
// "resolveJsonModule": true, /* Enable importing .json files. */
// "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */
// "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */
/* JavaScript Support */
// "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */
// "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */
// "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */
/* Emit */
"declaration": true /* Generate .d.ts files from TypeScript and JavaScript files in your project. */,
// "declarationMap": true, /* Create sourcemaps for d.ts files. */
// "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */
// "sourceMap": true, /* Create source map files for emitted JavaScript files. */
// "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */
// "noEmit": true, /* Disable emitting files from a compilation. */
// "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */
"outDir": "./dist/" /* Specify an output folder for all emitted files. */,
// "removeComments": true, /* Disable emitting comments. */
// "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */
// "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */
// "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */
// "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */
// "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */
// "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */
// "newLine": "crlf", /* Set the newline character for emitting files. */
// "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */
// "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */
// "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */
// "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */
// "declarationDir": "./", /* Specify the output directory for generated declaration files. */
/* Interop Constraints */
// "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */
// "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */
// "isolatedDeclarations": true, /* Require sufficient annotation on exports so other tools can trivially generate declaration files. */
// "erasableSyntaxOnly": true, /* Do not allow runtime constructs that are not part of ECMAScript. */
// "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
"esModuleInterop": true /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */,
// "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
"forceConsistentCasingInFileNames": true /* Ensure that casing is correct in imports. */,
/* Type Checking */
"strict": true /* Enable all strict type-checking options. */,
// "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */
// "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */
// "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */
// "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */
// "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */
// "strictBuiltinIteratorReturn": true, /* Built-in iterators are instantiated with a 'TReturn' type of 'undefined' instead of 'any'. */
// "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */
// "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */
// "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */
// "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */
// "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */
// "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */
// "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */
// "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */
// "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */
// "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */
// "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */
// "allowUnusedLabels": true, /* Disable error reporting for unused labels. */
// "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */
/* Completeness */
// "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */
"skipLibCheck": true /* Skip type checking all .d.ts files. */
}
}

59
uv.lock generated

@@ -135,6 +135,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/7c/fc/6a8cb64e5f0324877d503c854da15d76c1e50eb722e320b15345c4d0c6de/cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a", size = 182009 },
 ]
+
+[[package]]
+name = "colorama"
+version = "0.4.6"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 },
+]

 [[package]]
 name = "cryptography"
 version = "44.0.2"
@@ -239,6 +248,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 },
 ]
+
+[[package]]
+name = "iniconfig"
+version = "2.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f2/97/ebf4da567aa6827c909642694d71c9fcf53e5b504f2d96afea02718862f3/iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7", size = 4793 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050 },
+]

 [[package]]
 name = "mapvoyage-extract"
 version = "0.1.0"
@@ -252,6 +270,11 @@ dependencies = [
     { name = "wikitextparser" },
 ]
+
+[package.dev-dependencies]
+dev = [
+    { name = "pytest" },
+]

 [package.metadata]
 requires-dist = [
     { name = "aiofiles", specifier = ">=24.1.0" },
@@ -262,6 +285,9 @@ requires-dist = [
     { name = "wikitextparser", specifier = ">=0.56.3" },
 ]
+
+[package.metadata.requires-dev]
+dev = [{ name = "pytest", specifier = ">=8.3.5" }]

 [[package]]
 name = "multidict"
 version = "6.4.3"
@@ -335,6 +361,24 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/78/09/24c2f37524a3ebc3574975766748c7e4423ecefaa815c9fc4a324cbcf94a/mwparserfromhell-0.6.6-cp312-cp312-win_amd64.whl", hash = "sha256:cdc46c115b2495d4025920b7b30a6885a96d2b797ccc4009bf3cc02940ae55d3", size = 101071 },
 ]
+
+[[package]]
+name = "packaging"
+version = "25.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/a1/d4/1fc4078c65507b51b96ca8f8c3ba19e6a61c8253c72794544580a7b6c24d/packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f", size = 165727 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469 },
+]
+
+[[package]]
+name = "pluggy"
+version = "1.5.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/96/2d/02d4312c973c6050a18b314a5ad0b3210edb65a906f868e31c111dede4a6/pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", size = 67955 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 },
+]

 [[package]]
 name = "propcache"
 version = "0.3.1"
@@ -401,6 +445,21 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/13/a3/a812df4e2dd5696d1f351d58b8fe16a405b234ad2886a0dab9183fb78109/pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc", size = 117552 },
 ]
+
+[[package]]
+name = "pytest"
+version = "8.3.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+    { name = "iniconfig" },
+    { name = "packaging" },
+    { name = "pluggy" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ae/3c/c9d525a414d506893f0cd8a8d0de7706446213181570cdbd766691164e40/pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845", size = 1450891 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/30/3d/64ad57c803f1fa1e963a7946b6e0fea4a70df53c1a7fed304586539c2bac/pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820", size = 343634 },
+]

 [[package]]
 name = "python-dotenv"
 version = "1.1.0"