add title parsing from xml

This commit is contained in:
Bruce Röttgers 2025-05-07 15:22:20 +02:00
parent 93d99bf062
commit b33201e930

View File

@@ -26,6 +26,7 @@ class WikiDumpHandler(xml.sax.ContentHandler):
self.inRevision = False self.inRevision = False
self.inText = False self.inText = False
self.currentPageId: str | None = None self.currentPageId: str | None = None
self.currentTitle: str | None = None
self.currentText: list[str] = [] self.currentText: list[str] = []
def startElement(self, name, attrs): def startElement(self, name, attrs):
@@ -34,6 +35,7 @@ class WikiDumpHandler(xml.sax.ContentHandler):
logger.debug("start page") logger.debug("start page")
self.inPage = True self.inPage = True
self.currentPageId = None self.currentPageId = None
self.currentTitle = None
self.currentText = [] self.currentText = []
elif name == "revision": elif name == "revision":
logger.debug("start revision") logger.debug("start revision")
@@ -49,18 +51,20 @@ class WikiDumpHandler(xml.sax.ContentHandler):
if pid and pid in self.mappings: if pid and pid in self.mappings:
wd_id = self.mappings[pid] wd_id = self.mappings[pid]
text = "".join(self.currentText) text = "".join(self.currentText)
title = self.currentTitle
logger.debug(f"scheduled {wd_id} for handling") logger.debug(f"scheduled {wd_id} for handling")
# schedule processing # schedule processing
if self.sem: if self.sem:
task = asyncio.create_task(self._bounded_process(text, wd_id)) task = asyncio.create_task(self._bounded_process(text, wd_id, title))
else: else:
task = asyncio.create_task(self._process(text, wd_id)) task = asyncio.create_task(self._process(text, wd_id, title))
self.tasks.append(task) self.tasks.append(task)
else: else:
logger.debug(f"page {pid} without wikidata id, skipping...") logger.debug(f"page {pid} without wikidata id, skipping...")
# reset # reset
self.inPage = self.inRevision = self.inText = False self.inPage = self.inRevision = self.inText = False
self.currentPageId = None self.currentPageId = None
self.currentTitle = None
self.currentText = [] self.currentText = []
elif name == "revision": elif name == "revision":
logger.debug("end revision") logger.debug("end revision")
@@ -81,16 +85,22 @@ class WikiDumpHandler(xml.sax.ContentHandler):
content_stripped = content.strip() content_stripped = content.strip()
if content_stripped: # Only process non-empty ID content if content_stripped: # Only process non-empty ID content
self.currentPageId = content_stripped self.currentPageId = content_stripped
elif self.currentTag == "title" and self.inPage:
if self.currentTitle is None:
self.currentTitle = content
else:
self.currentTitle += content
elif self.inText: elif self.inText:
# Always append text content, even if it's just whitespace or newlines # Always append text content, even if it's just whitespace or newlines
self.currentText.append(content) self.currentText.append(content)
async def _process(self, text: str, uid: str): async def _process(self, text: str, uid: str, title: str):
parser = WikivoyageParser() parser = WikivoyageParser()
entry = parser.parse(text) entry = parser.parse(text)
entry['properties']['title'] = title
await self.handler.write_entry(entry, uid) await self.handler.write_entry(entry, uid)
async def _bounded_process(self, text: str, uid: str): async def _bounded_process(self, text: str, uid: str, title: str):
# Only run N at once # Only run N at once
async with self.sem: async with self.sem:
await self._process(text, uid) await self._process(text, uid, title)