Mirror of https://github.com/bcye/structured-wikivoyage-exports.git (synced 2025-06-08 08:54:04 +00:00)

Merge branch 'feature/only-python' into feature/docker

Commit 63babeace3
132 main.py Normal file
@@ -0,0 +1,132 @@
#!/usr/bin/env python3
import os
import sys
import re
import zlib
import bz2
import asyncio
import logging
import importlib
import xml.sax
from pathlib import Path
from dotenv import load_dotenv
import aiohttp
from transformers import fetch_mappings, WikiDumpHandler, WikivoyageParser


logger = logging.getLogger(__name__)


def gather_handler_kwargs(handler_name: str) -> dict:
    """
    Find all ENV vars starting with HANDLER_<NAME>_ and turn them into kwargs.
    E.g. HANDLER_SFTP_HOST=foo → {"host": "foo"}, HANDLER_SFTP_PORT=2222 → {"port": 2222}
    """
    prefix = f"HANDLER_{handler_name.upper()}_"
    kwargs = {}

    for env_key, val in os.environ.items():
        if not env_key.startswith(prefix):
            continue
        param = env_key.replace(prefix, "").lower()
        # cast ints
        if val.isdigit():
            val = int(val)
        # cast bools
        elif val.lower() in ("true", "false"):
            val = val.lower() == "true"
        kwargs[param] = val
    logger.debug(f"Handler kwargs: {kwargs}")
    return kwargs


async def process_dump(
    mappings: dict[str, str], handler, max_concurrent: int
):
    """
    Stream-download the bzip2-compressed XML dump and feed to SAX.
    """
    xml_url = (
        "https://dumps.wikimedia.org/"
        "enwikivoyage/latest/"
        "enwikivoyage-latest-pages-articles.xml.bz2"
    )
    decomp = bz2.BZ2Decompressor()
    sax_parser = xml.sax.make_parser()
    dump_handler = WikiDumpHandler(mappings, handler, max_concurrent)
    sax_parser.setContentHandler(dump_handler)

    async with aiohttp.ClientSession() as session:
        async with session.get(xml_url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                data = decomp.decompress(chunk)
                if not data:
                    continue
                text = data.decode("utf-8", errors="ignore")
                sax_parser.feed(text)
    sax_parser.close()
    if dump_handler.tasks:
        await asyncio.gather(*dump_handler.tasks)


async def main():
    # 1. Which handler to load?
    handler_name = os.getenv("HANDLER")
    if not handler_name:
        logger.error("Error: set ENV HANDLER (e.g. 'filesystem')")
        sys.exit(1)

    # 2. Dynamic import
    module_path = f"output_handlers.{handler_name}"
    try:
        mod = importlib.import_module(module_path)
    except ImportError as e:
        logger.error(f"Error loading handler module {module_path}: {e}")
        sys.exit(1)

    # 3. Find the class: e.g. "sftp" → "SftpHandler"
    class_name = handler_name.title().replace("_", "") + "Handler"
    if not hasattr(mod, class_name):
        logger.error(f"{module_path} defines no class {class_name}")
        sys.exit(1)
    HandlerCls = getattr(mod, class_name)

    logger.info(f"Using handler from {module_path}")

    # 4. Build kwargs from ENV
    handler_kwargs = gather_handler_kwargs(handler_name)

    # 5. Instantiate
    handler = HandlerCls(**handler_kwargs)

    # 6. Read concurrency setting
    try:
        max_conc = int(os.getenv("MAX_CONCURRENT", "0"))
    except ValueError:
        raise ValueError("MAX_CONCURRENT must be an integer")

    if max_conc < 0:
        raise ValueError("MAX_CONCURRENT must be >= 0")

    # 7. Fetch mappings
    logger.info("Fetching mappings from SQL dump…")
    mappings = await fetch_mappings()
    logger.info(f"Got {len(mappings)} wikibase_item mappings.")

    # 8. Stream & split the XML dump
    logger.info("Processing XML dump…")
    await process_dump(mappings, handler, max_conc)

    # 9. Finish up
    await handler.close()
    logger.info("All done.")


if __name__ == "__main__":
    load_dotenv()
    if os.getenv("DEBUG"):
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    asyncio.run(main())
@@ -1,246 +0,0 @@
#!/usr/bin/env python3
import os
import sys
import re
import zlib
import bz2
import asyncio
import logging
import importlib
import xml.sax
from pathlib import Path
from dotenv import load_dotenv
import aiohttp
from parser import WikivoyageParser


logger = logging.getLogger(__name__)


def gather_handler_kwargs(handler_name: str) -> dict:
    """
    Find all ENV vars starting with HANDLER_<NAME>_ and turn them into kwargs.
    E.g. HANDLER_SFTP_HOST=foo → {"host": "foo"}, HANDLER_SFTP_PORT=2222 → {"port": 2222}
    """
    prefix = f"HANDLER_{handler_name.upper()}_"
    kwargs = {}

    for env_key, val in os.environ.items():
        if not env_key.startswith(prefix):
            continue
        param = env_key.replace(prefix, "").lower()
        # cast ints
        if val.isdigit():
            val = int(val)
        # cast bools
        elif val.lower() in ("true", "false"):
            val = val.lower() == "true"
        kwargs[param] = val
    logger.debug(f"Handler kwargs: {kwargs}")
    return kwargs


async def fetch_mappings() -> dict[str, str]:
    """
    Download and gunzip the page_props SQL dump, extract
    page→wikibase_item mappings.
    """
    sql_url = (
        "https://dumps.wikimedia.org/"
        "enwikivoyage/latest/"
        "enwikivoyage-latest-page_props.sql.gz"
    )
    # decompress gzip
    decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
    # regex for tuples: (page,'prop','value',NULL_or_number)
    tuple_re = re.compile(r"\((\d+),'([^']+)','([^']+)',(NULL|[\d\.]+)\)")
    buffer = ""
    mappings: dict[str, str] = {}

    async with aiohttp.ClientSession() as session:
        async with session.get(sql_url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                data = decomp.decompress(chunk)
                if not data:
                    continue
                text = data.decode("utf-8", errors="ignore")
                buffer += text
                for m in tuple_re.finditer(buffer):
                    page_id, prop, value = m.group(1), m.group(2), m.group(3)
                    if prop == "wikibase_item":
                        mappings[page_id] = value
                # keep tail to handle split tuples
                if len(buffer) > 1000:
                    buffer = buffer[-1000:]
    return mappings


class WikiDumpHandler(xml.sax.ContentHandler):
    """
    SAX handler that, for each <page> whose <id> is in mappings,
    collects the <text> and schedules an async task to parse
    and write via the user-supplied handler.
    """

    def __init__(self, mappings, handler, max_concurrent):
        super().__init__()
        self.mappings = mappings
        self.handler = handler
        self.sem = (
            asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None
        )
        self.tasks: list[asyncio.Task] = []

        self.currentTag: str | None = None
        self.inPage = False
        self.inRevision = False
        self.inText = False
        self.currentPageId: str | None = None
        self.currentText: list[str] = []

    def startElement(self, name, attrs):
        self.currentTag = name
        if name == "page":
            self.inPage = True
            self.currentPageId = None
            self.currentText = []
        elif name == "revision":
            self.inRevision = True
        elif name == "text" and self.inRevision:
            self.inText = True

    def endElement(self, name):
        if name == "page":
            pid = self.currentPageId
            if pid and pid in self.mappings:
                wd_id = self.mappings[pid]
                text = "".join(self.currentText)
                # schedule processing
                if self.sem:
                    task = asyncio.create_task(self._bounded_process(text, wd_id))
                else:
                    task = asyncio.create_task(self._process(text, wd_id))
                self.tasks.append(task)
            # reset
            self.inPage = self.inRevision = self.inText = False
            self.currentPageId = None
            self.currentText = []
        elif name == "revision":
            self.inRevision = False
        elif name == "text":
            self.inText = False
        self.currentTag = None

    def characters(self, content):
        # Only filter whitespace for ID fields, preserve all content for text
        if (
            self.currentTag == "id"
            and self.inPage
            and not self.inRevision
            and not self.currentPageId
        ):
            content_stripped = content.strip()
            if content_stripped:  # Only process non-empty ID content
                self.currentPageId = content_stripped
        elif self.inText:
            # Always append text content, even if it's just whitespace or newlines
            self.currentText.append(content)

    async def _process(self, text: str, uid: str):
        parser = WikivoyageParser()
        entry = parser.parse(text)
        await self.handler.write_entry(entry, uid)

    async def _bounded_process(self, text: str, uid: str):
        # Only run N at once
        async with self.sem:
            await self._process(text, uid)


async def process_dump(
    mappings: dict[str, str], handler, max_concurrent: int
):
    """
    Stream-download the bzip2-compressed XML dump and feed to SAX.
    """
    xml_url = (
        "https://dumps.wikimedia.org/"
        "enwikivoyage/latest/"
        "enwikivoyage-latest-pages-articles.xml.bz2"
    )
    decomp = bz2.BZ2Decompressor()
    sax_parser = xml.sax.make_parser()
    dump_handler = WikiDumpHandler(mappings, handler, max_concurrent)
    sax_parser.setContentHandler(dump_handler)

    async with aiohttp.ClientSession() as session:
        async with session.get(xml_url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                data = decomp.decompress(chunk)
                if not data:
                    continue
                text = data.decode("utf-8", errors="ignore")
                sax_parser.feed(text)
    sax_parser.close()
    if dump_handler.tasks:
        await asyncio.gather(*dump_handler.tasks)


async def main():
    # 1. Which handler to load?
    handler_name = os.getenv("HANDLER")
    if not handler_name:
        logger.error("Error: set ENV HANDLER (e.g. 'filesystem')")
        sys.exit(1)

    # 2. Dynamic import
    module_path = f"output_handlers.{handler_name}"
    try:
        mod = importlib.import_module(module_path)
    except ImportError as e:
        logger.error(f"Error loading handler module {module_path}: {e}")
        sys.exit(1)

    # 3. Find the class: e.g. "sftp" → "SftpHandler"
    class_name = handler_name.title().replace("_", "") + "Handler"
    if not hasattr(mod, class_name):
        logger.error(f"{module_path} defines no class {class_name}")
        sys.exit(1)
    HandlerCls = getattr(mod, class_name)

    logger.info(f"Using handler from {module_path}")

    # 4. Build kwargs from ENV
    handler_kwargs = gather_handler_kwargs(handler_name)

    # 5. Instantiate
    handler = HandlerCls(**handler_kwargs)

    # 6. read concurrency setting
    try:
        max_conc = int(os.getenv("MAX_CONCURRENT", "0"))
    except ValueError:
        raise ValueError("MAX_CONCURRENT must be an integer")

    if max_conc < 0:
        raise ValueError("MAX_CONCURRENT must be >= 0")

    # 7. Fetch mappings
    logger.info("Fetching mappings from SQL dump…")
    mappings = await fetch_mappings()
    logger.info(f"Got {len(mappings)} wikibase_item mappings.")

    # 8. Stream & split the XML dump
    logger.info("Processing XML dump…")
    await process_dump(mappings, handler, max_conc)

    # 5. Finish up
    await handler.close()
    logger.info("All done.")


if __name__ == "__main__":
    load_dotenv()
    if os.getenv("DEBUG"):
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    asyncio.run(main())
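For reference, a quick sketch of how the HANDLER_<NAME>_* convention described in gather_handler_kwargs() plays out. The variable names below are hypothetical (they assume a filesystem handler that takes output_dir, overwrite and batch_size arguments), and the snippet assumes main.py and its transformers package are importable:

    import os

    # Hypothetical env vars; only the HANDLER_FILESYSTEM_ prefix is significant.
    os.environ["HANDLER_FILESYSTEM_OUTPUT_DIR"] = "./out"   # stays a string
    os.environ["HANDLER_FILESYSTEM_OVERWRITE"] = "true"     # cast to bool True
    os.environ["HANDLER_FILESYSTEM_BATCH_SIZE"] = "50"      # cast to int 50

    from main import gather_handler_kwargs

    print(gather_handler_kwargs("filesystem"))
    # e.g. {'output_dir': './out', 'overwrite': True, 'batch_size': 50}

With HANDLER=filesystem, main() would then look for output_handlers.filesystem.FilesystemHandler and instantiate it with exactly these kwargs.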
3 transformers/__init__.py Normal file
@@ -0,0 +1,3 @@
from .fetch_mappings import fetch_mappings
from .wiki_dump_handler import WikiDumpHandler
from .parser import WikivoyageParser
42 transformers/fetch_mappings.py Normal file
@@ -0,0 +1,42 @@
from logging import getLogger
import zlib
import re
import aiohttp

logger = getLogger(__name__)


async def fetch_mappings() -> dict[str, str]:
    """
    Download and gunzip the page_props SQL dump, extract
    page→wikibase_item mappings.
    """
    sql_url = (
        "https://dumps.wikimedia.org/"
        "enwikivoyage/latest/"
        "enwikivoyage-latest-page_props.sql.gz"
    )
    # decompress gzip
    decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
    # regex for tuples: (page,'prop','value',NULL_or_number)
    tuple_re = re.compile(r"\((\d+),'([^']+)','([^']+)',(NULL|[\d\.]+)\)")
    buffer = ""
    mappings: dict[str, str] = {}

    async with aiohttp.ClientSession() as session:
        async with session.get(sql_url) as resp:
            resp.raise_for_status()
            async for chunk in resp.content.iter_chunked(1024 * 1024):
                data = decomp.decompress(chunk)
                if not data:
                    continue
                text = data.decode("utf-8", errors="ignore")
                buffer += text
                for m in tuple_re.finditer(buffer):
                    page_id, prop, value = m.group(1), m.group(2), m.group(3)
                    if prop == "wikibase_item":
                        logger.debug(f"Found mapping {page_id} -> {value}")
                        mappings[page_id] = value
                # keep tail to handle split tuples
                if len(buffer) > 1000:
                    buffer = buffer[-1000:]
    return mappings
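A minimal sanity check of the tuple regex above, run against a made-up fragment of a page_props INSERT statement (the page id and Q-id are illustrative only):

    import re

    tuple_re = re.compile(r"\((\d+),'([^']+)','([^']+)',(NULL|[\d\.]+)\)")
    sample = (
        "INSERT INTO `page_props` VALUES "
        "(1042,'wikibase_item','Q1781',NULL),"
        "(1042,'page_image_free','Sample_photo.jpg',NULL);"
    )

    mappings = {
        m.group(1): m.group(3)
        for m in tuple_re.finditer(sample)
        if m.group(2) == "wikibase_item"   # same filter as fetch_mappings()
    }
    print(mappings)  # {'1042': 'Q1781'}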
96 transformers/wiki_dump_handler.py Normal file
@@ -0,0 +1,96 @@
from logging import getLogger
import xml.sax
import asyncio
from .parser import WikivoyageParser

logger = getLogger(__name__)


class WikiDumpHandler(xml.sax.ContentHandler):
    """
    SAX handler that, for each <page> whose <id> is in mappings,
    collects the <text> and schedules an async task to parse
    and write via the user-supplied handler.
    """

    def __init__(self, mappings, handler, max_concurrent):
        super().__init__()
        self.mappings = mappings
        self.handler = handler
        self.sem = (
            asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None
        )
        self.tasks: list[asyncio.Task] = []

        self.currentTag: str | None = None
        self.inPage = False
        self.inRevision = False
        self.inText = False
        self.currentPageId: str | None = None
        self.currentText: list[str] = []

    def startElement(self, name, attrs):
        self.currentTag = name
        if name == "page":
            logger.debug("start page")
            self.inPage = True
            self.currentPageId = None
            self.currentText = []
        elif name == "revision":
            logger.debug("start revision")
            self.inRevision = True
        elif name == "text" and self.inRevision:
            logger.debug("start text")
            self.inText = True

    def endElement(self, name):
        if name == "page":
            logger.debug("end page")
            pid = self.currentPageId
            if pid and pid in self.mappings:
                wd_id = self.mappings[pid]
                text = "".join(self.currentText)
                logger.debug(f"scheduled {wd_id} for handling")
                # schedule processing
                if self.sem:
                    task = asyncio.create_task(self._bounded_process(text, wd_id))
                else:
                    task = asyncio.create_task(self._process(text, wd_id))
                self.tasks.append(task)
            else:
                logger.debug(f"page {pid} without wikidata id, skipping...")
            # reset
            self.inPage = self.inRevision = self.inText = False
            self.currentPageId = None
            self.currentText = []
        elif name == "revision":
            logger.debug("end revision")
            self.inRevision = False
        elif name == "text":
            logger.debug("end text")
            self.inText = False
        self.currentTag = None

    def characters(self, content):
        # Only filter whitespace for ID fields, preserve all content for text
        if (
            self.currentTag == "id"
            and self.inPage
            and not self.inRevision
            and not self.currentPageId
        ):
            content_stripped = content.strip()
            if content_stripped:  # Only process non-empty ID content
                self.currentPageId = content_stripped
        elif self.inText:
            # Always append text content, even if it's just whitespace or newlines
            self.currentText.append(content)

    async def _process(self, text: str, uid: str):
        parser = WikivoyageParser()
        entry = parser.parse(text)
        await self.handler.write_entry(entry, uid)

    async def _bounded_process(self, text: str, uid: str):
        # Only run N at once
        async with self.sem:
            await self._process(text, uid)
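The only contract WikiDumpHandler places on the injected output handler is an awaitable write_entry(entry, uid) and an awaitable close(). A hypothetical minimal implementation is sketched below; it is not part of this commit (the real implementations live under output_handlers/) and it assumes the parsed entry is JSON-serializable:

    import json
    from pathlib import Path


    class DummyFilesystemHandler:
        """Writes each parsed entry to <output_dir>/<wikidata-id>.json."""

        def __init__(self, output_dir: str = "./out"):
            self.output_dir = Path(output_dir)
            self.output_dir.mkdir(parents=True, exist_ok=True)

        async def write_entry(self, entry, uid: str):
            # uid is the wikibase item id looked up from the page_props mappings
            (self.output_dir / f"{uid}.json").write_text(json.dumps(entry))

        async def close(self):
            # nothing to flush for plain files
            pass

When MAX_CONCURRENT is greater than zero, calls into write_entry are bounded by the semaphore in _bounded_process; with 0 every page is scheduled immediately.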