Compare commits


37 Commits

Author SHA1 Message Date
Bruce
8f099dc7bc
Merge pull request #33 from bcye/feature/multiple-handlers
Allow for multiple handlers
2025-06-03 14:52:14 +02:00
Bruce Röttgers
be28fddeb5 accept kwargs to forward max conc 2025-05-16 20:42:31 +02:00
Bruce Röttgers
5031f33ea2 move semaphore to handler level 2025-05-16 20:32:21 +02:00
Bruce Röttgers
60c13fb9ec support multiple handlers 2025-05-16 20:27:59 +02:00
Bruce
f73046bd65
Merge pull request #32 from bcye/relicense-types
relicense types to MIT
2025-05-16 16:34:54 +02:00
Bruce
5d1ec5bb2e
relicense types to MIT 2025-05-16 16:33:54 +02:00
Bruce
38901474c6
Merge pull request #29 from bcye/feature/parse-titles
Parse Titles
2025-05-13 16:28:27 +02:00
Bruce Röttgers
b33201e930 add title parsing from xml 2025-05-07 15:22:20 +02:00
Bruce
93d99bf062
Merge pull request #21 from bcye/feature/docker
Dockerize
2025-04-30 22:11:31 +02:00
Bruce
1a5b9b44e0
Merge branch 'main' into feature/docker 2025-04-30 22:10:55 +02:00
Bruce
3126d2c39b
Merge pull request #20 from bcye/feature/test-parser
Add Unit Tests for Parser
2025-04-30 22:08:53 +02:00
Bruce Röttgers
84e9a68bac update reference 2025-04-30 22:06:29 +02:00
Bruce Röttgers
f67d8d3963 Merge branch 'main' into feature/test-parser 2025-04-30 22:05:41 +02:00
Bruce Röttgers
c780a4bb99 Merge branch 'main' into feature/test-parser 2025-04-30 22:04:05 +02:00
Bruce Röttgers
1e89b20483 test scripts didnt have excerpts afterall 2025-04-30 22:02:41 +02:00
Bruce
1c16ee87e6
Update README.md 2025-04-30 21:47:19 +02:00
Bruce
fba9be556e
add proper attribution for fixtures from wikivoyage 2025-04-30 21:34:43 +02:00
Bruce
729d4adc62
Merge pull request #18 from bcye/feature/only-python
Integrate Node Script into Python
2025-04-30 21:12:51 +02:00
Bruce Röttgers
0c2905c119 Merge branch 'main' into feature/only-python 2025-04-30 16:20:13 +02:00
Bruce Röttgers
08cd8b41fe reflect new filename (main.py) in docker 2025-04-30 14:01:56 +02:00
Bruce Röttgers
63babeace3 Merge branch 'feature/only-python' into feature/docker 2025-04-30 14:01:34 +02:00
Bruce Röttgers
b18387a83c refactor transform code into own module 2025-04-30 14:01:20 +02:00
Bruce
3e2149ebcc
Merge pull request #17 from bcye/feature/npm
Refactor types into npm package and add CI Publish Action
2025-04-29 17:22:50 +02:00
Bruce Röttgers
243c4be9fe forgot to remove cd 2025-04-29 17:20:52 +02:00
Bruce Röttgers
6faf2a1a97 try new workflow 2025-04-29 17:19:44 +02:00
Bruce Röttgers
82520947e0 0.2.3 2025-04-27 23:05:30 +02:00
Bruce Röttgers
ac2ab450b9 v0.2.2 2025-04-27 23:04:02 +02:00
Bruce Röttgers
4bed99ca8c v0.2.1 2025-04-27 22:58:22 +02:00
Bruce Röttgers
96843f104c remove fixed package manager version 2025-04-27 22:58:07 +02:00
Bruce Röttgers
75503c971d v0.2.0 compile ts 2025-04-27 22:55:36 +02:00
Bruce Röttgers
559bcdda44 back to uv 2025-04-26 21:57:43 +02:00
Bruce Röttgers
322df10561 try new install method 2025-04-26 19:43:44 +02:00
Bruce Röttgers
59b2aeb1f4 add tests and workflow 2025-04-26 19:40:41 +02:00
Bruce Röttgers
d48e75ce01 types/0.1.2 2025-04-26 15:07:07 +02:00
Bruce Röttgers
5e74672049 types/0.1.1 2025-04-26 15:03:50 +02:00
Bruce Röttgers
4d25bc9e4c add git workflow 2025-04-26 14:48:36 +02:00
Bruce Röttgers
201387be5e refactor types to own folder and init npm pkg 2025-04-26 14:38:32 +02:00
24 changed files with 5456 additions and 250 deletions

40
.github/workflows/publish-types.yaml vendored Normal file

@@ -0,0 +1,40 @@
# Example from https://docs.github.com/en/actions/use-cases-and-examples/publishing-packages/publishing-nodejs-packages#publishing-packages-to-the-npm-registry
name: Publish Types Package to npmjs
on:
  push:
    tags:
      - "types/*"
defaults:
  run:
    working-directory: types
jobs:
  build:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      id-token: write
    steps:
      - uses: actions/checkout@v4
      - uses: pnpm/action-setup@v4
        name: Install pnpm
        with:
          version: 10
          run_install: false
      # Setup .npmrc file to publish to npm
      - uses: actions/setup-node@v4
        with:
          node-version: "20.x"
          cache: "pnpm"
          cache-dependency-path: "types/pnpm-lock.yaml"
          registry-url: "https://registry.npmjs.org"
      - run: pnpm install --frozen-lockfile
      - run: pnpm tsc
      - run: pnpm publish --provenance --access public --no-git-checks
        env:
          NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}

23
.github/workflows/test-parser.yaml vendored Normal file

@@ -0,0 +1,23 @@
on:
  pull_request:
jobs:
  run-tests:
    name: Unit-Test Parser
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: astral-sh/setup-uv@v5
      - name: "Set up Python"
        uses: actions/setup-python@v5
        with:
          python-version-file: ".python-version"
      - name: Install the project
        run: uv sync --locked --dev
      - name: Run tests
        run: PYTHONPATH=. uv run pytest


@@ -7,4 +7,4 @@ RUN uv sync --frozen
 COPY . .
-CMD ["uv", "run", "transform-documents.py"]
+CMD ["uv", "run", "main.py"]


@@ -15,3 +15,19 @@ TypeScript types for consuming the json output are available, you may install th
 ## Documentation
 See [docs](docs) for more information on how to use this utility.
+
+## Testing
+
+Run `PYTHONPATH=. pytest` from inside the venv
+
+## License
+
+### Code
+
+(c) 2025 bcye and moll-re
+
+All code and documentation unless otherwise stated is licensed under the AGPLv3 license, refer to [LICENSE](LICENSE) for the full license text. The types package and all its code is [licensed under MIT](types/LICENSE).
+
+### Examples
+
+Files in the `docs/example` and `tests/fixtures` are copies (.txt) or derivatives (.json) of the Boston Article on Wikivoyage and licensed under CC BY-SA 4.0. A [list of contributors is available on the original article](https://en.wikivoyage.org/w/index.php?title=Boston&action=history).

5
docs/types.md Normal file

@@ -0,0 +1,5 @@
# Types Package

## Publishing new versions

Bump the version in package.json, create a new commit, and tag it with "types/x.y.z"; the new version is published when the tag is pushed to GitHub.

143
main.py Normal file

@@ -0,0 +1,143 @@
#!/usr/bin/env python3
import os
import sys
import re
import zlib
import bz2
import asyncio
import logging
import importlib
import xml.sax
from pathlib import Path
from dotenv import load_dotenv
import aiohttp

from transformers import fetch_mappings, WikiDumpHandler, WikivoyageParser

logger = logging.getLogger(__name__)


def gather_handler_kwargs(handler_name: str) -> dict:
    """
    Find all ENV vars starting with HANDLER_<NAME>_ and turn them into kwargs.
    E.g. HANDLER_SFTP_HOST=foo → {"host": "foo"}, HANDLER_SFTP_PORT=2222 → {"port": 2222}
    """
    prefix = f"HANDLER_{handler_name.upper()}_"
    kwargs = {}

    for env_key, val in os.environ.items():
        if not env_key.startswith(prefix):
            continue
        param = env_key.replace(prefix, "").lower()
        # cast ints
        if val.isdigit():
            val = int(val)
        # cast bools
        elif val.lower() in ("true", "false"):
            val = val.lower() == "true"
        kwargs[param] = val

    logger.debug(f"Handler kwargs: {kwargs}")
    return kwargs


async def process_dump(
    mappings: dict[str, str], handlers
):
    """
    Stream-download the bzip2-compressed XML dump and feed to SAX.
    """
    xml_url = (
        "https://dumps.wikimedia.org/"
        "enwikivoyage/latest/"
        "enwikivoyage-latest-pages-articles.xml.bz2"
    )

    decomp = bz2.BZ2Decompressor()
    sax_parser = xml.sax.make_parser()
    dump_handler = WikiDumpHandler(mappings, handlers)
    sax_parser.setContentHandler(dump_handler)

    async with aiohttp.ClientSession() as session:
        async with session.get(xml_url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                data = decomp.decompress(chunk)
                if not data:
                    continue
                text = data.decode("utf-8", errors="ignore")
                sax_parser.feed(text)

    sax_parser.close()
    if dump_handler.tasks:
        await asyncio.gather(*dump_handler.tasks)


async def main():
    # 1. Which handler(s) to load?
    handler_names = os.getenv("HANDLER", "").split(",")
    if not handler_names or not handler_names[0]:
        logger.error("Error: set ENV HANDLER (e.g. 'filesystem' or 'filesystem,sftp')")
        sys.exit(1)

    # 2. Read concurrency setting
    try:
        max_conc = int(os.getenv("MAX_CONCURRENT", "0"))
    except ValueError:
        raise ValueError("MAX_CONCURRENT must be an integer")
    if max_conc < 0:
        raise ValueError("MAX_CONCURRENT must be >= 0")

    handlers = []

    # 3. Load each handler
    for handler_name in handler_names:
        handler_name = handler_name.strip()
        if not handler_name:
            continue

        # Dynamic import
        module_path = f"output_handlers.{handler_name}"
        try:
            mod = importlib.import_module(module_path)
        except ImportError as e:
            logger.error(f"Error loading handler module {module_path}: {e}")
            sys.exit(1)

        # Find the class: e.g. "sftp" → "SftpHandler"
        class_name = handler_name.title().replace("_", "") + "Handler"
        if not hasattr(mod, class_name):
            logger.error(f"{module_path} defines no class {class_name}")
            sys.exit(1)
        HandlerCls = getattr(mod, class_name)

        logger.info(f"Using handler from {module_path}")

        # Build kwargs from ENV
        handler_kwargs = gather_handler_kwargs(handler_name)
        # Add max_concurrent to kwargs
        handler_kwargs["max_concurrent"] = max_conc

        # Instantiate
        handler = HandlerCls(**handler_kwargs)
        handlers.append(handler)

    # 4. Fetch mappings
    logger.info("Fetching mappings from SQL dump…")
    mappings = await fetch_mappings()
    logger.info(f"Got {len(mappings)} wikibase_item mappings.")

    # 5. Stream & split the XML dump
    logger.info("Processing XML dump…")
    await process_dump(mappings, handlers)  # handlers enforce their own concurrency limits

    # 6. Finish up
    await asyncio.gather(*[handler.close() for handler in handlers])
    logger.info("All done.")


if __name__ == "__main__":
    load_dotenv()
    if os.getenv("DEBUG"):
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)
    asyncio.run(main())
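For illustration (not part of the diff): assuming the filesystem and sftp handlers named in the error message above, the HANDLER_<NAME>_ convention documented in gather_handler_kwargs resolves roughly like this. HANDLER_SFTP_RETRY is invented for the example, and the exact kwargs a handler accepts depend on its constructor.

import os

# hypothetical configuration: two handlers, shared concurrency cap
os.environ["HANDLER"] = "filesystem,sftp"
os.environ["MAX_CONCURRENT"] = "8"
os.environ["HANDLER_SFTP_HOST"] = "foo"    # becomes kwargs["host"] = "foo"
os.environ["HANDLER_SFTP_PORT"] = "2222"   # becomes kwargs["port"] = 2222 (cast to int)
os.environ["HANDLER_SFTP_RETRY"] = "true"  # becomes kwargs["retry"] = True (cast to bool)

# gather_handler_kwargs("sftp") would then return:
# {"host": "foo", "port": 2222, "retry": True}
# main() adds max_concurrent=8 and instantiates SftpHandler(**kwargs).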


@@ -1,6 +1,7 @@
 """Reference handler for output handlers."""
 from abc import ABC, abstractmethod
 import logging
+import asyncio
@@ -14,15 +15,20 @@ class BaseHandler(ABC):
     _successful_writes = 0
     _failed_writes = 0

-    def __init__(self, fail_on_error: bool = True, **kwargs):
+    def __init__(self, fail_on_error: bool = True, max_concurrent=0, **kwargs):
         """
         Initializes the BaseHandler with optional parameters.

         Args:
             fail_on_error (bool): If True, the handler will raise an exception on error. Defaults to True.
+            max_concurrent: Maximum number of concurrent write operations.
+                0 means unlimited concurrency.
             **kwargs: Additional keyword arguments for specific handler implementations.
         """
         self.fail_on_error = fail_on_error
+        self.semaphore = None
+        if max_concurrent > 0:
+            self.semaphore = asyncio.Semaphore(max_concurrent)

     @abstractmethod
@@ -47,6 +53,10 @@
             entry (dict): The entry to write (will be JSON-encoded).
             uid (str): The unique identifier for the entry. The default id provided by wikivoyage is recommended.
         """
-        success = await self._write_entry(entry, uid)
+        if self.semaphore:
+            async with self.semaphore:
+                success = await self._write_entry(entry, uid)
+        else:
+            success = await self._write_entry(entry, uid)
         if success:
             self.logger.debug(f"Successfully wrote entry with UID {uid}")


@@ -10,8 +10,9 @@ class BunnyStorageHandler(BaseHandler):
         api_key: str,
         fail_on_error: bool = True,
         keepalive_timeout: int = 75,
+        **kwargs,
     ):
-        super().__init__(fail_on_error=fail_on_error)
+        super().__init__(fail_on_error=fail_on_error, **kwargs)
         self.base_url = f"https://{region}.bunnycdn.com/{base_path}"
         self.headers = {
             "AccessKey": api_key,


@@ -12,3 +12,8 @@ dependencies = [
     "python-dotenv>=1.1.0",
     "wikitextparser>=0.56.3",
 ]
+
+[dependency-groups]
+dev = [
+    "pytest>=8.3.5",
+]

1577
tests/fixtures/boston_input.txt vendored Normal file

File diff suppressed because it is too large

2905
tests/fixtures/boston_output.json vendored Normal file

File diff suppressed because one or more lines are too long


@@ -0,0 +1,333 @@
import json
import os

import pytest

from transformers import WikivoyageParser


def dump(obj):
    # canonical JSON for deep compare
    return json.dumps(obj, sort_keys=True, separators=(",", ":"))


def wrap(children):
    """Wrap a list of child nodes in the default root envelope."""
    return {
        "type": "root",
        "properties": {},
        "children": children
    }


@pytest.fixture
def parser():
    return WikivoyageParser()


def test_empty_input_is_root_only(parser):
    got = parser.parse("")
    assert dump(got) == dump(wrap([]))


def test_plain_text_node(parser):
    got = parser.parse("Just some plain text.")
    expected = wrap([
        {"type":"text","properties":{"markdown":"Just some plain text."},"children":[]}
    ])
    assert dump(got) == dump(expected)


def test_template_node(parser):
    got = parser.parse("{{foo|a=1|b=two}}")
    expected = wrap([
        {
            "type":"template",
            "properties":{"name":"foo","params":{"a":"1","b":"two"}},
            "children":[]
        }
    ])
    assert dump(got) == dump(expected)


def test_see_listing_full_properties(parser):
    snippet = (
        "{{see"
        "|name=Statue"
        "|alt=Monument"
        "|url=http://x"
        "|email=a@b.com"
        "|address=1 Road"
        "|lat=1.23"
        "|long=4.56"
        "|directions=North"
        "|phone=12345"
        "|tollfree=800"
        "|fax=54321"
        "|hours=24/7"
        "|price=Free"
        "|lastedit=2020-01-01"
        "|wikipedia=Statue"
        "|wikidata=Q1"
        "|content=Big statue"
        "}}"
    )
    got = parser.parse(snippet)
    expected = wrap([
        {
            "type":"see",
            "properties":{
                "name":"Statue","alt":"Monument","url":"http://x",
                "email":"a@b.com","address":"1 Road","lat":"1.23","long":"4.56",
                "directions":"North","phone":"12345","tollfree":"800",
                "fax":"54321","hours":"24/7","price":"Free",
                "lastedit":"2020-01-01","wikipedia":"Statue","wikidata":"Q1",
                "content":"Big statue"
            },
            "children":[]
        }
    ])
    assert dump(got) == dump(expected)


def test_do_listing_full_properties(parser):
    snippet = (
        "{{do"
        "|name=Walk"
        "|alt=Stroll"
        "|url=http://walk"
        "|email=hi@walk"
        "|address=Main Street"
        "|lat=2.34"
        "|long=5.67"
        "|directions=East"
        "|phone=222-333"
        "|tollfree=800-DO-WALK"
        "|fax=999-888"
        "|hours=All day"
        "|price=Free"
        "|lastedit=2021-02-02"
        "|wikipedia=Walking"
        "|wikidata=Q2"
        "|content=Enjoy a walk"
        "}}"
    )
    got = parser.parse(snippet)
    expected = wrap([
        {
            "type":"do",
            "properties":{
                "name":"Walk","alt":"Stroll","url":"http://walk",
                "email":"hi@walk","address":"Main Street","lat":"2.34","long":"5.67",
                "directions":"East","phone":"222-333","tollfree":"800-DO-WALK",
                "fax":"999-888","hours":"All day","price":"Free",
                "lastedit":"2021-02-02","wikipedia":"Walking","wikidata":"Q2",
                "content":"Enjoy a walk"
            },
            "children":[]
        }
    ])
    assert dump(got) == dump(expected)


def test_buy_listing_full_properties(parser):
    snippet = (
        "{{buy"
        "|name=Shirt"
        "|alt=Tees"
        "|url=http://shop"
        "|email=sales@shop"
        "|address=Market St"
        "|lat=3.45"
        "|long=6.78"
        "|directions=West"
        "|phone=444-555"
        "|tollfree=800-BUY-TEE"
        "|fax=777-666"
        "|hours=9–6"
        "|price=$20"
        "|lastedit=2022-03-03"
        "|wikipedia=Shopping"
        "|wikidata=Q3"
        "|content=Quality tees"
        "}}"
    )
    got = parser.parse(snippet)
    expected = wrap([
        {
            "type":"buy",
            "properties":{
                "name":"Shirt","alt":"Tees","url":"http://shop",
                "email":"sales@shop","address":"Market St","lat":"3.45","long":"6.78",
                "directions":"West","phone":"444-555","tollfree":"800-BUY-TEE",
                "fax":"777-666","hours":"9–6","price":"$20",
                "lastedit":"2022-03-03","wikipedia":"Shopping","wikidata":"Q3",
                "content":"Quality tees"
            },
            "children":[]
        }
    ])
    assert dump(got) == dump(expected)


def test_eat_listing_full_properties(parser):
    snippet = (
        "{{eat"
        "|name=Diner"
        "|alt=Cafe"
        "|url=http://eat"
        "|email=food@eat"
        "|address=Food Lane"
        "|lat=4.56"
        "|long=7.89"
        "|directions=South"
        "|phone=666-777"
        "|tollfree=800-EAT-YUM"
        "|fax=555-444"
        "|hours=Breakfast"
        "|price=$10–$30"
        "|lastedit=2023-04-04"
        "|wikipedia=Dining"
        "|wikidata=Q4"
        "|content=Best pancakes"
        "}}"
    )
    got = parser.parse(snippet)
    expected = wrap([
        {
            "type":"eat",
            "properties":{
                "name":"Diner","alt":"Cafe","url":"http://eat",
                "email":"food@eat","address":"Food Lane","lat":"4.56","long":"7.89",
                "directions":"South","phone":"666-777","tollfree":"800-EAT-YUM",
                "fax":"555-444","hours":"Breakfast","price":"$10–$30",
                "lastedit":"2023-04-04","wikipedia":"Dining","wikidata":"Q4",
                "content":"Best pancakes"
            },
            "children":[]
        }
    ])
    assert dump(got) == dump(expected)


def test_drink_listing_full_properties(parser):
    snippet = (
        "{{drink"
        "|name=Pub"
        "|alt=Bar"
        "|url=http://drink"
        "|email=cheers@drink"
        "|address=Bar Street"
        "|lat=5.67"
        "|long=8.90"
        "|directions=Center"
        "|phone=888-999"
        "|tollfree=800-DRINK"
        "|fax=333-222"
        "|hours=Evening"
        "|price=$7–$30"
        "|lastedit=2024-05-05"
        "|wikipedia=Nightlife"
        "|wikidata=Q5"
        "|content=Great brews"
        "}}"
    )
    got = parser.parse(snippet)
    expected = wrap([
        {
            "type":"drink",
            "properties":{
                "name":"Pub","alt":"Bar","url":"http://drink",
                "email":"cheers@drink","address":"Bar Street","lat":"5.67","long":"8.90",
                "directions":"Center","phone":"888-999","tollfree":"800-DRINK",
                "fax":"333-222","hours":"Evening","price":"$7–$30",
                "lastedit":"2024-05-05","wikipedia":"Nightlife","wikidata":"Q5",
                "content":"Great brews"
            },
            "children":[]
        }
    ])
    assert dump(got) == dump(expected)


def test_sleep_listing_full_properties(parser):
    snippet = (
        "{{sleep"
        "|name=Hotel"
        "|alt=Inn"
        "|url=http://sleep"
        "|email=stay@sleep"
        "|address=Sleepy Ave"
        "|lat=6.78"
        "|long=9.01"
        "|directions=Uptown"
        "|phone=000-111"
        "|tollfree=800-SLEEP"
        "|fax=111-000"
        "|hours=24h"
        "|price=$100"
        "|lastedit=2025-06-06"
        "|wikipedia=Accommodation"
        "|wikidata=Q6"
        "|checkin=3PM"
        "|checkout=11AM"
        "|content=Cozy rooms"
        "}}"
    )
    got = parser.parse(snippet)
    expected = wrap([
        {
            "type":"sleep",
            "properties":{
                "name":"Hotel","alt":"Inn","url":"http://sleep",
                "email":"stay@sleep","address":"Sleepy Ave","lat":"6.78","long":"9.01",
                "directions":"Uptown","phone":"000-111","tollfree":"800-SLEEP",
                "fax":"111-000","hours":"24h","price":"$100",
                "lastedit":"2025-06-06","wikipedia":"Accommodation","wikidata":"Q6",
                "checkin":"3PM","checkout":"11AM","content":"Cozy rooms"
            },
            "children":[]
        }
    ])
    assert dump(got) == dump(expected)


def test_generic_listing_full_properties(parser):
    snippet = (
        "{{listing"
        "|name=Info"
        "|alt=Data"
        "|url=http://info"
        "|email=info@info"
        "|address=Down St"
        "|lat=7.89"
        "|long=0.12"
        "|directions=Here"
        "|phone=123-000"
        "|tollfree=800-INFO"
        "|fax=000-123"
        "|hours=All times"
        "|price=$0"
        "|lastedit=2026-07-07"
        "|wikipedia=InfoPage"
        "|wikidata=Q7"
        "|content=Useful info"
        "}}"
    )
    got = parser.parse(snippet)
    expected = wrap([
        {
            "type":"listing",
            "properties":{
                "name":"Info","alt":"Data","url":"http://info",
                "email":"info@info","address":"Down St","lat":"7.89","long":"0.12",
                "directions":"Here","phone":"123-000","tollfree":"800-INFO",
                "fax":"000-123","hours":"All times","price":"$0",
                "lastedit":"2026-07-07","wikipedia":"InfoPage","wikidata":"Q7",
                "content":"Useful info"
            },
            "children":[]
        }
    ])
    assert dump(got) == dump(expected)


def test_section_and_subsection(parser):
    got = parser.parse("Intro\n== First ==\nHello\n=== Sub ===\nWorld")
    sec = got["children"][1]
    assert sec["type"] == "section" and sec["properties"]["level"] == 2
    sub = sec["children"][1]
    assert sub["type"] == "section" and sub["properties"]["level"] == 3


def test_full_boston_snapshot(parser):
    here = os.path.dirname(__file__)
    inp = os.path.join(here, "fixtures", "boston_input.txt")
    out = os.path.join(here, "fixtures", "boston_output.json")

    wikicode = open(inp, encoding="utf-8").read()
    expected = json.load(open(out, encoding="utf-8"))

    got = parser.parse(wikicode)
    assert dump(got) == dump(expected)
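A side note on the dump() helper above: sorting keys is what makes the deep comparisons order-insensitive. A standalone illustration, not from the repo:

import json

a = {"b": 1, "a": 2}
b = {"a": 2, "b": 1}
print(json.dumps(a) == json.dumps(b))                                   # False: insertion order differs
print(json.dumps(a, sort_keys=True) == json.dumps(b, sort_keys=True))   # True: canonical form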


@@ -1,246 +0,0 @@
#!/usr/bin/env python3
import os
import sys
import re
import zlib
import bz2
import asyncio
import logging
import importlib
import xml.sax
from pathlib import Path
from dotenv import load_dotenv
import aiohttp

from parser import WikivoyageParser

logger = logging.getLogger(__name__)


def gather_handler_kwargs(handler_name: str) -> dict:
    """
    Find all ENV vars starting with HANDLER_<NAME>_ and turn them into kwargs.
    E.g. HANDLER_SFTP_HOST=foo → {"host": "foo"}, HANDLER_SFTP_PORT=2222 → {"port": 2222}
    """
    prefix = f"HANDLER_{handler_name.upper()}_"
    kwargs = {}

    for env_key, val in os.environ.items():
        if not env_key.startswith(prefix):
            continue
        param = env_key.replace(prefix, "").lower()
        # cast ints
        if val.isdigit():
            val = int(val)
        # cast bools
        elif val.lower() in ("true", "false"):
            val = val.lower() == "true"
        kwargs[param] = val

    logger.debug(f"Handler kwargs: {kwargs}")
    return kwargs


async def fetch_mappings() -> dict[str, str]:
    """
    Download and gunzip the page_props SQL dump, extract
    page → wikibase_item mappings.
    """
    sql_url = (
        "https://dumps.wikimedia.org/"
        "enwikivoyage/latest/"
        "enwikivoyage-latest-page_props.sql.gz"
    )

    # decompress gzip
    decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
    # regex for tuples: (page,'prop','value',NULL_or_number)
    tuple_re = re.compile(r"\((\d+),'([^']+)','([^']+)',(NULL|[\d\.]+)\)")

    buffer = ""
    mappings: dict[str, str] = {}

    async with aiohttp.ClientSession() as session:
        async with session.get(sql_url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                data = decomp.decompress(chunk)
                if not data:
                    continue
                text = data.decode("utf-8", errors="ignore")
                buffer += text
                for m in tuple_re.finditer(buffer):
                    page_id, prop, value = m.group(1), m.group(2), m.group(3)
                    if prop == "wikibase_item":
                        mappings[page_id] = value
                # keep tail to handle split tuples
                if len(buffer) > 1000:
                    buffer = buffer[-1000:]

    return mappings


class WikiDumpHandler(xml.sax.ContentHandler):
    """
    SAX handler that, for each <page> whose <id> is in mappings,
    collects the <text> and schedules an async task to parse
    and write via the user-supplied handler.
    """

    def __init__(self, mappings, handler, max_concurrent):
        super().__init__()
        self.mappings = mappings
        self.handler = handler
        self.sem = (
            asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None
        )
        self.tasks: list[asyncio.Task] = []

        self.currentTag: str | None = None
        self.inPage = False
        self.inRevision = False
        self.inText = False
        self.currentPageId: str | None = None
        self.currentText: list[str] = []

    def startElement(self, name, attrs):
        self.currentTag = name
        if name == "page":
            self.inPage = True
            self.currentPageId = None
            self.currentText = []
        elif name == "revision":
            self.inRevision = True
        elif name == "text" and self.inRevision:
            self.inText = True

    def endElement(self, name):
        if name == "page":
            pid = self.currentPageId
            if pid and pid in self.mappings:
                wd_id = self.mappings[pid]
                text = "".join(self.currentText)
                # schedule processing
                if self.sem:
                    task = asyncio.create_task(self._bounded_process(text, wd_id))
                else:
                    task = asyncio.create_task(self._process(text, wd_id))
                self.tasks.append(task)
            # reset
            self.inPage = self.inRevision = self.inText = False
            self.currentPageId = None
            self.currentText = []
        elif name == "revision":
            self.inRevision = False
        elif name == "text":
            self.inText = False
        self.currentTag = None

    def characters(self, content):
        # Only filter whitespace for ID fields, preserve all content for text
        if (
            self.currentTag == "id"
            and self.inPage
            and not self.inRevision
            and not self.currentPageId
        ):
            content_stripped = content.strip()
            if content_stripped:  # Only process non-empty ID content
                self.currentPageId = content_stripped
        elif self.inText:
            # Always append text content, even if it's just whitespace or newlines
            self.currentText.append(content)

    async def _process(self, text: str, uid: str):
        parser = WikivoyageParser()
        entry = parser.parse(text)
        await self.handler.write_entry(entry, uid)

    async def _bounded_process(self, text: str, uid: str):
        # Only run N at once
        async with self.sem:
            await self._process(text, uid)


async def process_dump(
    mappings: dict[str, str], handler, max_concurrent: int
):
    """
    Stream-download the bzip2-compressed XML dump and feed to SAX.
    """
    xml_url = (
        "https://dumps.wikimedia.org/"
        "enwikivoyage/latest/"
        "enwikivoyage-latest-pages-articles.xml.bz2"
    )

    decomp = bz2.BZ2Decompressor()
    sax_parser = xml.sax.make_parser()
    dump_handler = WikiDumpHandler(mappings, handler, max_concurrent)
    sax_parser.setContentHandler(dump_handler)

    async with aiohttp.ClientSession() as session:
        async with session.get(xml_url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                data = decomp.decompress(chunk)
                if not data:
                    continue
                text = data.decode("utf-8", errors="ignore")
                sax_parser.feed(text)

    sax_parser.close()
    if dump_handler.tasks:
        await asyncio.gather(*dump_handler.tasks)


async def main():
    # 1. Which handler to load?
    handler_name = os.getenv("HANDLER")
    if not handler_name:
        logger.error("Error: set ENV HANDLER (e.g. 'filesystem')")
        sys.exit(1)

    # 2. Dynamic import
    module_path = f"output_handlers.{handler_name}"
    try:
        mod = importlib.import_module(module_path)
    except ImportError as e:
        logger.error(f"Error loading handler module {module_path}: {e}")
        sys.exit(1)

    # 3. Find the class: e.g. "sftp" → "SftpHandler"
    class_name = handler_name.title().replace("_", "") + "Handler"
    if not hasattr(mod, class_name):
        logger.error(f"{module_path} defines no class {class_name}")
        sys.exit(1)
    HandlerCls = getattr(mod, class_name)

    logger.info(f"Using handler from {module_path}")

    # 4. Build kwargs from ENV
    handler_kwargs = gather_handler_kwargs(handler_name)

    # 5. Instantiate
    handler = HandlerCls(**handler_kwargs)

    # 6. Read concurrency setting
    try:
        max_conc = int(os.getenv("MAX_CONCURRENT", "0"))
    except ValueError:
        raise ValueError("MAX_CONCURRENT must be an integer")
    if max_conc < 0:
        raise ValueError("MAX_CONCURRENT must be >= 0")

    # 7. Fetch mappings
    logger.info("Fetching mappings from SQL dump…")
    mappings = await fetch_mappings()
    logger.info(f"Got {len(mappings)} wikibase_item mappings.")

    # 8. Stream & split the XML dump
    logger.info("Processing XML dump…")
    await process_dump(mappings, handler, max_conc)

    # 9. Finish up
    await handler.close()
    logger.info("All done.")


if __name__ == "__main__":
    load_dotenv()
    if os.getenv("DEBUG"):
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)
    asyncio.run(main())

3
transformers/__init__.py Normal file

@@ -0,0 +1,3 @@
from .fetch_mappings import fetch_mappings
from .wiki_dump_handler import WikiDumpHandler
from .parser import WikivoyageParser


@@ -0,0 +1,42 @@
from logging import getLogger
import zlib
import re

import aiohttp

logger = getLogger(__name__)


async def fetch_mappings() -> dict[str, str]:
    """
    Download and gunzip the page_props SQL dump, extract
    page → wikibase_item mappings.
    """
    sql_url = (
        "https://dumps.wikimedia.org/"
        "enwikivoyage/latest/"
        "enwikivoyage-latest-page_props.sql.gz"
    )

    # decompress gzip
    decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
    # regex for tuples: (page,'prop','value',NULL_or_number)
    tuple_re = re.compile(r"\((\d+),'([^']+)','([^']+)',(NULL|[\d\.]+)\)")

    buffer = ""
    mappings: dict[str, str] = {}

    async with aiohttp.ClientSession() as session:
        async with session.get(sql_url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                data = decomp.decompress(chunk)
                if not data:
                    continue
                text = data.decode("utf-8", errors="ignore")
                buffer += text
                for m in tuple_re.finditer(buffer):
                    page_id, prop, value = m.group(1), m.group(2), m.group(3)
                    if prop == "wikibase_item":
                        logger.debug(f"Found mapping {page_id} -> {value}")
                        mappings[page_id] = value
                # keep tail to handle split tuples
                if len(buffer) > 1000:
                    buffer = buffer[-1000:]

    return mappings
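To see what the tuple regex actually matches, here is a standalone sketch run against a fabricated page_props INSERT fragment; the ids and values are made up:

import re

tuple_re = re.compile(r"\((\d+),'([^']+)','([^']+)',(NULL|[\d\.]+)\)")

sample = "INSERT INTO `page_props` VALUES (12,'wikibase_item','Q100',NULL),(12,'page_image','Boston.jpg',NULL);"
mappings = {
    m.group(1): m.group(3)
    for m in tuple_re.finditer(sample)
    if m.group(2) == "wikibase_item"  # other props are filtered out
}
print(mappings)  # {'12': 'Q100'}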


@@ -0,0 +1,100 @@
from logging import getLogger
import xml.sax
import asyncio

from .parser import WikivoyageParser

logger = getLogger(__name__)


class WikiDumpHandler(xml.sax.ContentHandler):
    """
    SAX handler that, for each <page> whose <id> is in mappings,
    collects the <text> and schedules an async task to parse
    and write via the user-supplied handler(s).
    """

    def __init__(self, mappings, handlers):
        super().__init__()
        self.mappings = mappings
        # Support a single handler or a list of handlers
        self.handlers = handlers
        self.tasks: list[asyncio.Task] = []

        self.currentTag: str | None = None
        self.inPage = False
        self.inRevision = False
        self.inText = False
        self.currentPageId: str | None = None
        self.currentTitle: str | None = None
        self.currentText: list[str] = []

    def startElement(self, name, attrs):
        self.currentTag = name
        if name == "page":
            logger.debug("start page")
            self.inPage = True
            self.currentPageId = None
            self.currentTitle = None
            self.currentText = []
        elif name == "revision":
            logger.debug("start revision")
            self.inRevision = True
        elif name == "text" and self.inRevision:
            logger.debug("start text")
            self.inText = True

    def endElement(self, name):
        if name == "page":
            logger.debug("end page")
            pid = self.currentPageId
            if pid and pid in self.mappings:
                wd_id = self.mappings[pid]
                text = "".join(self.currentText)
                title = self.currentTitle
                logger.debug(f"scheduled {wd_id} for handling")
                # schedule processing
                task = asyncio.create_task(self._process(text, wd_id, title))
                self.tasks.append(task)
            else:
                logger.debug(f"page {pid} without wikidata id, skipping...")
            # reset
            self.inPage = self.inRevision = self.inText = False
            self.currentPageId = None
            self.currentTitle = None
            self.currentText = []
        elif name == "revision":
            logger.debug("end revision")
            self.inRevision = False
        elif name == "text":
            logger.debug("end text")
            self.inText = False
        self.currentTag = None

    def characters(self, content):
        # Only filter whitespace for ID fields, preserve all content for text
        if (
            self.currentTag == "id"
            and self.inPage
            and not self.inRevision
            and not self.currentPageId
        ):
            content_stripped = content.strip()
            if content_stripped:  # Only process non-empty ID content
                self.currentPageId = content_stripped
        elif self.currentTag == "title" and self.inPage:
            if self.currentTitle is None:
                self.currentTitle = content
            else:
                self.currentTitle += content
        elif self.inText:
            # Always append text content, even if it's just whitespace or newlines
            self.currentText.append(content)

    async def _process(self, text: str, uid: str, title: str):
        parser = WikivoyageParser()
        entry = parser.parse(text)
        entry['properties']['title'] = title
        # Write to all handlers concurrently
        await asyncio.gather(*[
            handler.write_entry(entry, uid) for handler in self.handlers
        ])
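A rough way to exercise WikiDumpHandler in isolation (a sketch, not from the repo: the toy XML only mimics the shape of a MediaWiki dump, and StubHandler stands in for a real output handler):

import asyncio
import xml.sax
from transformers import WikiDumpHandler

class StubHandler:
    """Hypothetical write target that just prints what it receives."""
    async def write_entry(self, entry, uid):
        print(uid, entry["properties"].get("title"))

async def demo():
    xml_doc = (
        "<mediawiki><page><title>Boston</title><id>12</id>"
        "<revision><text>== See ==</text></revision></page></mediawiki>"
    )
    handler = WikiDumpHandler({"12": "Q100"}, [StubHandler()])
    # parseString is synchronous; tasks are scheduled on the running loop
    xml.sax.parseString(xml_doc.encode(), handler)
    await asyncio.gather(*handler.tasks)  # prints: Q100 Boston

asyncio.run(demo())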

21
types/LICENSE Normal file

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2025 bcye and moll-re
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

3
types/README.md Normal file

@@ -0,0 +1,3 @@
# @bcye/structured-wikivoyage-types
Types to use when consuming json trees from the structured-wikivoyage-exports project

29
types/package.json Normal file

@@ -0,0 +1,29 @@
{
  "name": "@bcye/structured-wikivoyage-types",
  "version": "0.2.5",
  "description": "Types to use when consuming json trees from the structured-wikivoyage-exports project",
  "keywords": [],
  "contributors": [
    "bcye",
    "moll-re"
  ],
  "license": "MIT",
  "repository": {
    "type": "git",
    "url": "git+https://github.com/bcye/structured-wikivoyage-exports.git"
  },
  "bugs": {
    "url": "https://github.com/bcye/structured-wikivoyage-exports/issues"
  },
  "homepage": "https://github.com/bcye/structured-wikivoyage-exports#readme",
  "files": [
    "dist/index.d.ts",
    "dist/index.js"
  ],
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
  "private": false,
  "devDependencies": {
    "typescript": "^5.8.3"
  }
}

24
types/pnpm-lock.yaml generated Normal file

@@ -0,0 +1,24 @@
lockfileVersion: '9.0'

settings:
  autoInstallPeers: true
  excludeLinksFromLockfile: false

importers:

  .:
    devDependencies:
      typescript:
        specifier: ^5.8.3
        version: 5.8.3

packages:

  typescript@5.8.3:
    resolution: {integrity: sha512-p1diW6TqL9L07nNxvRMM7hMMw4c5XOo/1ibL4aAIGmSAt9slTE1Xgw5KWuof2uTOvCg9BY7ZRi+GaF+7sfgPeQ==}
    engines: {node: '>=14.17'}
    hasBin: true

snapshots:

  typescript@5.8.3: {}

113
types/tsconfig.json Normal file

@@ -0,0 +1,113 @@
{
"compilerOptions": {
/* Visit https://aka.ms/tsconfig to read more about this file */
/* Projects */
// "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */
// "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */
// "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */
// "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */
// "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */
// "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */
/* Language and Environment */
"target": "es2016" /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */,
// "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
// "jsx": "preserve", /* Specify what JSX code is generated. */
// "libReplacement": true, /* Enable lib replacement. */
// "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */
// "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */
// "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */
// "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */
// "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */
// "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */
// "noLib": true, /* Disable including any library files, including the default lib.d.ts. */
// "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */
// "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */
/* Modules */
"module": "commonjs" /* Specify what module code is generated. */,
// "rootDir": "./", /* Specify the root folder within your source files. */
// "moduleResolution": "node10", /* Specify how TypeScript looks up a file from a given module specifier. */
// "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */
// "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
// "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */
// "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */
// "types": [], /* Specify type package names to be included without being referenced in a source file. */
// "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */
// "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */
// "allowImportingTsExtensions": true, /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. */
// "rewriteRelativeImportExtensions": true, /* Rewrite '.ts', '.tsx', '.mts', and '.cts' file extensions in relative import paths to their JavaScript equivalent in output files. */
// "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */
// "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */
// "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
// "noUncheckedSideEffectImports": true, /* Check side effect imports. */
// "resolveJsonModule": true, /* Enable importing .json files. */
// "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */
// "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */
/* JavaScript Support */
// "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */
// "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */
// "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */
/* Emit */
"declaration": true /* Generate .d.ts files from TypeScript and JavaScript files in your project. */,
// "declarationMap": true, /* Create sourcemaps for d.ts files. */
// "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */
// "sourceMap": true, /* Create source map files for emitted JavaScript files. */
// "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */
// "noEmit": true, /* Disable emitting files from a compilation. */
// "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */
"outDir": "./dist/" /* Specify an output folder for all emitted files. */,
// "removeComments": true, /* Disable emitting comments. */
// "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */
// "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */
// "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */
// "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */
// "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */
// "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */
// "newLine": "crlf", /* Set the newline character for emitting files. */
// "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */
// "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */
// "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */
// "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */
// "declarationDir": "./", /* Specify the output directory for generated declaration files. */
/* Interop Constraints */
// "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */
// "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */
// "isolatedDeclarations": true, /* Require sufficient annotation on exports so other tools can trivially generate declaration files. */
// "erasableSyntaxOnly": true, /* Do not allow runtime constructs that are not part of ECMAScript. */
// "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
"esModuleInterop": true /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */,
// "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
"forceConsistentCasingInFileNames": true /* Ensure that casing is correct in imports. */,
/* Type Checking */
"strict": true /* Enable all strict type-checking options. */,
// "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */
// "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */
// "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */
// "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */
// "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */
// "strictBuiltinIteratorReturn": true, /* Built-in iterators are instantiated with a 'TReturn' type of 'undefined' instead of 'any'. */
// "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */
// "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */
// "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */
// "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */
// "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */
// "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */
// "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */
// "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */
// "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */
// "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */
// "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */
// "allowUnusedLabels": true, /* Disable error reporting for unused labels. */
// "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */
/* Completeness */
// "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */
"skipLibCheck": true /* Skip type checking all .d.ts files. */
}
}

59
uv.lock generated

@@ -135,6 +135,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/7c/fc/6a8cb64e5f0324877d503c854da15d76c1e50eb722e320b15345c4d0c6de/cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a", size = 182009 },
 ]
+
+[[package]]
+name = "colorama"
+version = "0.4.6"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 },
+]

 [[package]]
 name = "cryptography"
 version = "44.0.2"
@@ -239,6 +248,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 },
 ]
+
+[[package]]
+name = "iniconfig"
+version = "2.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f2/97/ebf4da567aa6827c909642694d71c9fcf53e5b504f2d96afea02718862f3/iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7", size = 4793 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050 },
+]

 [[package]]
 name = "mapvoyage-extract"
 version = "0.1.0"
@@ -252,6 +270,11 @@ dependencies = [
     { name = "wikitextparser" },
 ]
+
+[package.dev-dependencies]
+dev = [
+    { name = "pytest" },
+]

 [package.metadata]
 requires-dist = [
     { name = "aiofiles", specifier = ">=24.1.0" },
@@ -262,6 +285,9 @@ requires-dist = [
     { name = "wikitextparser", specifier = ">=0.56.3" },
 ]
+
+[package.metadata.requires-dev]
+dev = [{ name = "pytest", specifier = ">=8.3.5" }]

 [[package]]
 name = "multidict"
 version = "6.4.3"
@@ -335,6 +361,24 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/78/09/24c2f37524a3ebc3574975766748c7e4423ecefaa815c9fc4a324cbcf94a/mwparserfromhell-0.6.6-cp312-cp312-win_amd64.whl", hash = "sha256:cdc46c115b2495d4025920b7b30a6885a96d2b797ccc4009bf3cc02940ae55d3", size = 101071 },
 ]
+
+[[package]]
+name = "packaging"
+version = "25.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/a1/d4/1fc4078c65507b51b96ca8f8c3ba19e6a61c8253c72794544580a7b6c24d/packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f", size = 165727 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469 },
+]
+
+[[package]]
+name = "pluggy"
+version = "1.5.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/96/2d/02d4312c973c6050a18b314a5ad0b3210edb65a906f868e31c111dede4a6/pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", size = 67955 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 },
+]

 [[package]]
 name = "propcache"
 version = "0.3.1"
@@ -401,6 +445,21 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/13/a3/a812df4e2dd5696d1f351d58b8fe16a405b234ad2886a0dab9183fb78109/pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc", size = 117552 },
 ]
+
+[[package]]
+name = "pytest"
+version = "8.3.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+    { name = "iniconfig" },
+    { name = "packaging" },
+    { name = "pluggy" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ae/3c/c9d525a414d506893f0cd8a8d0de7706446213181570cdbd766691164e40/pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845", size = 1450891 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/30/3d/64ad57c803f1fa1e963a7946b6e0fea4a70df53c1a7fed304586539c2bac/pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820", size = 343634 },
+]

 [[package]]
 name = "python-dotenv"
 version = "1.1.0"