move remaining files

This commit is contained in:
Bruce Röttgers
2025-09-29 17:51:42 +02:00
parent 183aa1ae66
commit 0bf0238ef1
5 changed files with 21 additions and 116 deletions

View File

@@ -5,20 +5,19 @@ from .parser import WikivoyageParser
logger = getLogger(__name__)
class WikiDumpHandler(xml.sax.ContentHandler):
"""
SAX handler that, for each <page> whose <id> is in mappings,
collects the <text> and schedules an async task to parse
and write via the usersupplied handler.
and write via the usersupplied handler(s).
"""
def __init__(self, mappings, handler, max_concurrent):
def __init__(self, mappings, handlers):
super().__init__()
self.mappings = mappings
self.handler = handler
self.sem = (
asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None
)
# Support a single handler or a list of handlers
self.handlers = handlers
self.tasks: list[asyncio.Task] = []
self.currentTag: str | None = None
@@ -26,6 +25,7 @@ class WikiDumpHandler(xml.sax.ContentHandler):
self.inRevision = False
self.inText = False
self.currentPageId: str | None = None
self.currentTitle: str | None = None
self.currentText: list[str] = []
def startElement(self, name, attrs):
@@ -34,6 +34,7 @@ class WikiDumpHandler(xml.sax.ContentHandler):
logger.debug("start page")
self.inPage = True
self.currentPageId = None
self.currentTitle = None
self.currentText = []
elif name == "revision":
logger.debug("start revision")
@@ -49,18 +50,17 @@ class WikiDumpHandler(xml.sax.ContentHandler):
if pid and pid in self.mappings:
wd_id = self.mappings[pid]
text = "".join(self.currentText)
title = self.currentTitle
logger.debug(f"scheduled {wd_id} for handling")
# schedule processing
if self.sem:
task = asyncio.create_task(self._bounded_process(text, wd_id))
else:
task = asyncio.create_task(self._process(text, wd_id))
task = asyncio.create_task(self._process(text, wd_id, title))
self.tasks.append(task)
else:
logger.debug(f"page {pid} without wikidata id, skipping...")
# reset
self.inPage = self.inRevision = self.inText = False
self.currentPageId = None
self.currentTitle = None
self.currentText = []
elif name == "revision":
logger.debug("end revision")
@@ -81,16 +81,21 @@ class WikiDumpHandler(xml.sax.ContentHandler):
content_stripped = content.strip()
if content_stripped: # Only process non-empty ID content
self.currentPageId = content_stripped
elif self.currentTag == "title" and self.inPage:
if self.currentTitle is None:
self.currentTitle = content
else:
self.currentTitle += content
elif self.inText:
# Always append text content, even if it's just whitespace or newlines
self.currentText.append(content)
async def _process(self, text: str, uid: str):
async def _process(self, text: str, uid: str, title: str):
parser = WikivoyageParser()
entry = parser.parse(text)
await self.handler.write_entry(entry, uid)
entry["properties"]["title"] = title
async def _bounded_process(self, text: str, uid: str):
# Only run N at once
async with self.sem:
await self._process(text, uid)
# Write to all handlers concurrently
await asyncio.gather(
*[handler.write_entry(entry, uid) for handler in self.handlers]
)

View File

@@ -1,100 +0,0 @@
from logging import getLogger
import xml.sax
import asyncio
from .parser import WikivoyageParser
logger = getLogger(__name__)
class WikiDumpHandler(xml.sax.ContentHandler):
"""
SAX handler that, for each <page> whose <id> is in mappings,
collects the <text> and schedules an async task to parse
and write via the usersupplied handler(s).
"""
def __init__(self, mappings, handlers):
super().__init__()
self.mappings = mappings
# Support a single handler or a list of handlers
self.handlers = handlers
self.tasks: list[asyncio.Task] = []
self.currentTag: str | None = None
self.inPage = False
self.inRevision = False
self.inText = False
self.currentPageId: str | None = None
self.currentTitle: str | None = None
self.currentText: list[str] = []
def startElement(self, name, attrs):
self.currentTag = name
if name == "page":
logger.debug("start page")
self.inPage = True
self.currentPageId = None
self.currentTitle = None
self.currentText = []
elif name == "revision":
logger.debug("start revision")
self.inRevision = True
elif name == "text" and self.inRevision:
logger.debug("start text")
self.inText = True
def endElement(self, name):
if name == "page":
logger.debug("end page")
pid = self.currentPageId
if pid and pid in self.mappings:
wd_id = self.mappings[pid]
text = "".join(self.currentText)
title = self.currentTitle
logger.debug(f"scheduled {wd_id} for handling")
# schedule processing
task = asyncio.create_task(self._process(text, wd_id, title))
self.tasks.append(task)
else:
logger.debug(f"page {pid} without wikidata id, skipping...")
# reset
self.inPage = self.inRevision = self.inText = False
self.currentPageId = None
self.currentTitle = None
self.currentText = []
elif name == "revision":
logger.debug("end revision")
self.inRevision = False
elif name == "text":
logger.debug("end text")
self.inText = False
self.currentTag = None
def characters(self, content):
# Only filter whitespace for ID fields, preserve all content for text
if (
self.currentTag == "id"
and self.inPage
and not self.inRevision
and not self.currentPageId
):
content_stripped = content.strip()
if content_stripped: # Only process non-empty ID content
self.currentPageId = content_stripped
elif self.currentTag == "title" and self.inPage:
if self.currentTitle is None:
self.currentTitle = content
else:
self.currentTitle += content
elif self.inText:
# Always append text content, even if it's just whitespace or newlines
self.currentText.append(content)
async def _process(self, text: str, uid: str, title: str):
parser = WikivoyageParser()
entry = parser.parse(text)
entry['properties']['title'] = title
# Write to all handlers concurrently
await asyncio.gather(*[
handler.write_entry(entry, uid) for handler in self.handlers
])