From 10fbef63b3a175babade2f88e525411f575189ba Mon Sep 17 00:00:00 2001 From: Remy Moll Date: Tue, 13 May 2025 16:51:53 +0200 Subject: [PATCH 1/6] move code to dedicated src/ folder --- .github/workflows/test-parser.yaml | 2 +- README.md | 2 +- main.py => src/main.py | 0 .../output_handlers}/__init__.py | 0 .../output_handlers}/base_handler.py | 0 .../output_handlers}/bunny_storage.py | 0 .../output_handlers}/filesystem.py | 0 .../transformers}/__init__.py | 0 .../transformers}/fetch_mappings.py | 0 {transformers => src/transformers}/parser.py | 0 src/transformers/wiki_dump_handler.py | 96 +++++++++++++++++++ 11 files changed, 98 insertions(+), 2 deletions(-) rename main.py => src/main.py (100%) rename {output_handlers => src/output_handlers}/__init__.py (100%) rename {output_handlers => src/output_handlers}/base_handler.py (100%) rename {output_handlers => src/output_handlers}/bunny_storage.py (100%) rename {output_handlers => src/output_handlers}/filesystem.py (100%) rename {transformers => src/transformers}/__init__.py (100%) rename {transformers => src/transformers}/fetch_mappings.py (100%) rename {transformers => src/transformers}/parser.py (100%) create mode 100644 src/transformers/wiki_dump_handler.py diff --git a/.github/workflows/test-parser.yaml b/.github/workflows/test-parser.yaml index e6ee580..70a7c77 100644 --- a/.github/workflows/test-parser.yaml +++ b/.github/workflows/test-parser.yaml @@ -20,4 +20,4 @@ jobs: run: uv sync --locked --dev - name: Run tests - run: PYTHONPATH=. uv run pytest + run: PYTHONPATH=src uv run pytest diff --git a/README.md b/README.md index 69a1aaa..d0096bd 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ See [docs](docs) for more information on how to use this utility. ## Testing -Run `PYTHONPATH=. pytest` from inside the venv +Run `PYTHONPATH=src pytest` from inside the venv, or directly call `PYTHONPATH=src uv run -- pytest`. 
## License diff --git a/main.py b/src/main.py similarity index 100% rename from main.py rename to src/main.py diff --git a/output_handlers/__init__.py b/src/output_handlers/__init__.py similarity index 100% rename from output_handlers/__init__.py rename to src/output_handlers/__init__.py diff --git a/output_handlers/base_handler.py b/src/output_handlers/base_handler.py similarity index 100% rename from output_handlers/base_handler.py rename to src/output_handlers/base_handler.py diff --git a/output_handlers/bunny_storage.py b/src/output_handlers/bunny_storage.py similarity index 100% rename from output_handlers/bunny_storage.py rename to src/output_handlers/bunny_storage.py diff --git a/output_handlers/filesystem.py b/src/output_handlers/filesystem.py similarity index 100% rename from output_handlers/filesystem.py rename to src/output_handlers/filesystem.py diff --git a/transformers/__init__.py b/src/transformers/__init__.py similarity index 100% rename from transformers/__init__.py rename to src/transformers/__init__.py diff --git a/transformers/fetch_mappings.py b/src/transformers/fetch_mappings.py similarity index 100% rename from transformers/fetch_mappings.py rename to src/transformers/fetch_mappings.py diff --git a/transformers/parser.py b/src/transformers/parser.py similarity index 100% rename from transformers/parser.py rename to src/transformers/parser.py diff --git a/src/transformers/wiki_dump_handler.py b/src/transformers/wiki_dump_handler.py new file mode 100644 index 0000000..eecf022 --- /dev/null +++ b/src/transformers/wiki_dump_handler.py @@ -0,0 +1,96 @@ +from logging import getLogger +import xml.sax +import asyncio +from .parser import WikivoyageParser + +logger = getLogger(__name__) + +class WikiDumpHandler(xml.sax.ContentHandler): + """ + SAX handler that, for each <page> whose <id> is in mappings, + collects the <text> and schedules an async task to parse + and write via the user‐supplied handler. 
+ """ + + def __init__(self, mappings, handler, max_concurrent): + super().__init__() + self.mappings = mappings + self.handler = handler + self.sem = ( + asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None + ) + self.tasks: list[asyncio.Task] = [] + + self.currentTag: str | None = None + self.inPage = False + self.inRevision = False + self.inText = False + self.currentPageId: str | None = None + self.currentText: list[str] = [] + + def startElement(self, name, attrs): + self.currentTag = name + if name == "page": + logger.debug("start page") + self.inPage = True + self.currentPageId = None + self.currentText = [] + elif name == "revision": + logger.debug("start revision") + self.inRevision = True + elif name == "text" and self.inRevision: + logger.debug("start text") + self.inText = True + + def endElement(self, name): + if name == "page": + logger.debug("end page") + pid = self.currentPageId + if pid and pid in self.mappings: + wd_id = self.mappings[pid] + text = "".join(self.currentText) + logger.debug(f"scheduled {wd_id} for handling") + # schedule processing + if self.sem: + task = asyncio.create_task(self._bounded_process(text, wd_id)) + else: + task = asyncio.create_task(self._process(text, wd_id)) + self.tasks.append(task) + else: + logger.debug(f"page {pid} without wikidata id, skipping...") + # reset + self.inPage = self.inRevision = self.inText = False + self.currentPageId = None + self.currentText = [] + elif name == "revision": + logger.debug("end revision") + self.inRevision = False + elif name == "text": + logger.debug("end text") + self.inText = False + self.currentTag = None + + def characters(self, content): + # Only filter whitespace for ID fields, preserve all content for text + if ( + self.currentTag == "id" + and self.inPage + and not self.inRevision + and not self.currentPageId + ): + content_stripped = content.strip() + if content_stripped: # Only process non-empty ID content + self.currentPageId = content_stripped + elif 
self.inText: + # Always append text content, even if it's just whitespace or newlines + self.currentText.append(content) + + async def _process(self, text: str, uid: str): + parser = WikivoyageParser() + entry = parser.parse(text) + await self.handler.write_entry(entry, uid) + + async def _bounded_process(self, text: str, uid: str): + # Only run N at once + async with self.sem: + await self._process(text, uid) From 03e41cf6854ebe02dda05c5ea7223a672edf985e Mon Sep 17 00:00:00 2001 From: Remy Moll Date: Tue, 13 May 2025 16:52:20 +0200 Subject: [PATCH 2/6] switch to multistage build --- .dockerignore | 8 -------- Dockerfile | 13 +++++++++---- 2 files changed, 9 insertions(+), 12 deletions(-) delete mode 100644 .dockerignore diff --git a/.dockerignore b/.dockerignore deleted file mode 100644 index db7e340..0000000 --- a/.dockerignore +++ /dev/null @@ -1,8 +0,0 @@ -.env -__pycache__ -.venv -.pytest_cache -docs -node_modules -output -sketching diff --git a/Dockerfile b/Dockerfile index 161a820..054c4f0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,10 +1,15 @@ -FROM ghcr.io/astral-sh/uv:0.6-python3.12-bookworm +# use python 3.12 as a base image +FROM docker.io/python:3.12-alpine +# use the latest version of uv, independently of the python version +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ WORKDIR /app -COPY pyproject.toml uv.lock ./ +# copy the requirements and install them +COPY pyproject.toml uv.lock . RUN uv sync --frozen -COPY . . +# copy the rest of the code +COPY src . 
-CMD ["uv", "run", "main.py"] \ No newline at end of file +CMD ["uv", "run", "main.py"] From ba3091b11ff33fcb3c5cca949b69edade707e0c1 Mon Sep 17 00:00:00 2001 From: Remy Moll Date: Tue, 13 May 2025 16:52:35 +0200 Subject: [PATCH 3/6] allow multi-architecture builds --- .github/workflows/build-image.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/build-image.yaml b/.github/workflows/build-image.yaml index 1e667cf..9d824a2 100644 --- a/.github/workflows/build-image.yaml +++ b/.github/workflows/build-image.yaml @@ -23,11 +23,15 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Set up QEMU for multi-platform builds + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - name: Build and push uses: docker/build-push-action@v6 with: + platforms: linux/amd64,linux/arm64 push: true tags: ghcr.io/bcye/structured-wikivoyage-exports:latest From 833d5374f60ea987ae79442bd805eae9de8bbd70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bruce=20R=C3=B6ttgers?= Date: Mon, 29 Sep 2025 17:47:28 +0200 Subject: [PATCH 4/6] revert ./ change --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 054c4f0..936e333 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,6 +10,6 @@ COPY pyproject.toml uv.lock . RUN uv sync --frozen # copy the rest of the code -COPY src . 
+COPY src ./ CMD ["uv", "run", "main.py"] From 0bf0238ef1d13180dcceec84438b84b6e25ce9b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bruce=20R=C3=B6ttgers?= Date: Mon, 29 Sep 2025 17:51:42 +0200 Subject: [PATCH 5/6] move remaining files --- .../tests}/fixtures/boston_input.txt | 0 .../tests}/fixtures/boston_output.json | 0 .../tests}/test_parser_json_snippets.py | 0 src/transformers/wiki_dump_handler.py | 37 ++++--- transformers/wiki_dump_handler.py | 100 ------------------ 5 files changed, 21 insertions(+), 116 deletions(-) rename {tests => src/tests}/fixtures/boston_input.txt (100%) rename {tests => src/tests}/fixtures/boston_output.json (100%) rename {tests => src/tests}/test_parser_json_snippets.py (100%) delete mode 100644 transformers/wiki_dump_handler.py diff --git a/tests/fixtures/boston_input.txt b/src/tests/fixtures/boston_input.txt similarity index 100% rename from tests/fixtures/boston_input.txt rename to src/tests/fixtures/boston_input.txt diff --git a/tests/fixtures/boston_output.json b/src/tests/fixtures/boston_output.json similarity index 100% rename from tests/fixtures/boston_output.json rename to src/tests/fixtures/boston_output.json diff --git a/tests/test_parser_json_snippets.py b/src/tests/test_parser_json_snippets.py similarity index 100% rename from tests/test_parser_json_snippets.py rename to src/tests/test_parser_json_snippets.py diff --git a/src/transformers/wiki_dump_handler.py b/src/transformers/wiki_dump_handler.py index eecf022..5b561cd 100644 --- a/src/transformers/wiki_dump_handler.py +++ b/src/transformers/wiki_dump_handler.py @@ -5,20 +5,19 @@ from .parser import WikivoyageParser logger = getLogger(__name__) + class WikiDumpHandler(xml.sax.ContentHandler): """ SAX handler that, for each <page> whose <id> is in mappings, collects the <text> and schedules an async task to parse - and write via the user‐supplied handler. + and write via the user‐supplied handler(s). 
""" - def __init__(self, mappings, handler, max_concurrent): + def __init__(self, mappings, handlers): super().__init__() self.mappings = mappings - self.handler = handler - self.sem = ( - asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None - ) + # Support a single handler or a list of handlers + self.handlers = handlers self.tasks: list[asyncio.Task] = [] self.currentTag: str | None = None @@ -26,6 +25,7 @@ class WikiDumpHandler(xml.sax.ContentHandler): self.inRevision = False self.inText = False self.currentPageId: str | None = None + self.currentTitle: str | None = None self.currentText: list[str] = [] def startElement(self, name, attrs): @@ -34,6 +34,7 @@ class WikiDumpHandler(xml.sax.ContentHandler): logger.debug("start page") self.inPage = True self.currentPageId = None + self.currentTitle = None self.currentText = [] elif name == "revision": logger.debug("start revision") @@ -49,18 +50,17 @@ class WikiDumpHandler(xml.sax.ContentHandler): if pid and pid in self.mappings: wd_id = self.mappings[pid] text = "".join(self.currentText) + title = self.currentTitle logger.debug(f"scheduled {wd_id} for handling") # schedule processing - if self.sem: - task = asyncio.create_task(self._bounded_process(text, wd_id)) - else: - task = asyncio.create_task(self._process(text, wd_id)) + task = asyncio.create_task(self._process(text, wd_id, title)) self.tasks.append(task) else: logger.debug(f"page {pid} without wikidata id, skipping...") # reset self.inPage = self.inRevision = self.inText = False self.currentPageId = None + self.currentTitle = None self.currentText = [] elif name == "revision": logger.debug("end revision") @@ -81,16 +81,21 @@ class WikiDumpHandler(xml.sax.ContentHandler): content_stripped = content.strip() if content_stripped: # Only process non-empty ID content self.currentPageId = content_stripped + elif self.currentTag == "title" and self.inPage: + if self.currentTitle is None: + self.currentTitle = content + else: + self.currentTitle += 
content elif self.inText: # Always append text content, even if it's just whitespace or newlines self.currentText.append(content) - async def _process(self, text: str, uid: str): + async def _process(self, text: str, uid: str, title: str): parser = WikivoyageParser() entry = parser.parse(text) - await self.handler.write_entry(entry, uid) + entry["properties"]["title"] = title - async def _bounded_process(self, text: str, uid: str): - # Only run N at once - async with self.sem: - await self._process(text, uid) + # Write to all handlers concurrently + await asyncio.gather( + *[handler.write_entry(entry, uid) for handler in self.handlers] + ) diff --git a/transformers/wiki_dump_handler.py b/transformers/wiki_dump_handler.py deleted file mode 100644 index c566f44..0000000 --- a/transformers/wiki_dump_handler.py +++ /dev/null @@ -1,100 +0,0 @@ -from logging import getLogger -import xml.sax -import asyncio -from .parser import WikivoyageParser - -logger = getLogger(__name__) - -class WikiDumpHandler(xml.sax.ContentHandler): - """ - SAX handler that, for each <page> whose <id> is in mappings, - collects the <text> and schedules an async task to parse - and write via the user‐supplied handler(s). 
- """ - - def __init__(self, mappings, handlers): - super().__init__() - self.mappings = mappings - # Support a single handler or a list of handlers - self.handlers = handlers - self.tasks: list[asyncio.Task] = [] - - self.currentTag: str | None = None - self.inPage = False - self.inRevision = False - self.inText = False - self.currentPageId: str | None = None - self.currentTitle: str | None = None - self.currentText: list[str] = [] - - def startElement(self, name, attrs): - self.currentTag = name - if name == "page": - logger.debug("start page") - self.inPage = True - self.currentPageId = None - self.currentTitle = None - self.currentText = [] - elif name == "revision": - logger.debug("start revision") - self.inRevision = True - elif name == "text" and self.inRevision: - logger.debug("start text") - self.inText = True - - def endElement(self, name): - if name == "page": - logger.debug("end page") - pid = self.currentPageId - if pid and pid in self.mappings: - wd_id = self.mappings[pid] - text = "".join(self.currentText) - title = self.currentTitle - logger.debug(f"scheduled {wd_id} for handling") - # schedule processing - task = asyncio.create_task(self._process(text, wd_id, title)) - self.tasks.append(task) - else: - logger.debug(f"page {pid} without wikidata id, skipping...") - # reset - self.inPage = self.inRevision = self.inText = False - self.currentPageId = None - self.currentTitle = None - self.currentText = [] - elif name == "revision": - logger.debug("end revision") - self.inRevision = False - elif name == "text": - logger.debug("end text") - self.inText = False - self.currentTag = None - - def characters(self, content): - # Only filter whitespace for ID fields, preserve all content for text - if ( - self.currentTag == "id" - and self.inPage - and not self.inRevision - and not self.currentPageId - ): - content_stripped = content.strip() - if content_stripped: # Only process non-empty ID content - self.currentPageId = content_stripped - elif 
self.currentTag == "title" and self.inPage: - if self.currentTitle is None: - self.currentTitle = content - else: - self.currentTitle += content - elif self.inText: - # Always append text content, even if it's just whitespace or newlines - self.currentText.append(content) - - async def _process(self, text: str, uid: str, title: str): - parser = WikivoyageParser() - entry = parser.parse(text) - entry['properties']['title'] = title - - # Write to all handlers concurrently - await asyncio.gather(*[ - handler.write_entry(entry, uid) for handler in self.handlers - ]) From 8c4c66ed94d5c69c174a48243364a801f700985b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bruce=20R=C3=B6ttgers?= Date: Mon, 29 Sep 2025 18:39:47 +0200 Subject: [PATCH 6/6] fix entrypoint for alpine --- entrypoint.sh | 2 -- src/entrypoint.sh | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) delete mode 100644 entrypoint.sh create mode 100644 src/entrypoint.sh diff --git a/entrypoint.sh b/entrypoint.sh deleted file mode 100644 index 2d7e452..0000000 --- a/entrypoint.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -uv run main.py \ No newline at end of file diff --git a/src/entrypoint.sh b/src/entrypoint.sh new file mode 100644 index 0000000..742561c --- /dev/null +++ b/src/entrypoint.sh @@ -0,0 +1,2 @@ +#!/bin/sh +uv run main.py