mirror of
https://github.com/bcye/structured-wikivoyage-exports.git
synced 2025-06-07 16:34:04 +00:00
add title parsing from xml
This commit is contained in:
parent
93d99bf062
commit
b33201e930
@ -26,6 +26,7 @@ class WikiDumpHandler(xml.sax.ContentHandler):
|
|||||||
self.inRevision = False
|
self.inRevision = False
|
||||||
self.inText = False
|
self.inText = False
|
||||||
self.currentPageId: str | None = None
|
self.currentPageId: str | None = None
|
||||||
|
self.currentTitle: str | None = None
|
||||||
self.currentText: list[str] = []
|
self.currentText: list[str] = []
|
||||||
|
|
||||||
def startElement(self, name, attrs):
|
def startElement(self, name, attrs):
|
||||||
@ -34,6 +35,7 @@ class WikiDumpHandler(xml.sax.ContentHandler):
|
|||||||
logger.debug("start page")
|
logger.debug("start page")
|
||||||
self.inPage = True
|
self.inPage = True
|
||||||
self.currentPageId = None
|
self.currentPageId = None
|
||||||
|
self.currentTitle = None
|
||||||
self.currentText = []
|
self.currentText = []
|
||||||
elif name == "revision":
|
elif name == "revision":
|
||||||
logger.debug("start revision")
|
logger.debug("start revision")
|
||||||
@ -49,18 +51,20 @@ class WikiDumpHandler(xml.sax.ContentHandler):
|
|||||||
if pid and pid in self.mappings:
|
if pid and pid in self.mappings:
|
||||||
wd_id = self.mappings[pid]
|
wd_id = self.mappings[pid]
|
||||||
text = "".join(self.currentText)
|
text = "".join(self.currentText)
|
||||||
|
title = self.currentTitle
|
||||||
logger.debug(f"scheduled {wd_id} for handling")
|
logger.debug(f"scheduled {wd_id} for handling")
|
||||||
# schedule processing
|
# schedule processing
|
||||||
if self.sem:
|
if self.sem:
|
||||||
task = asyncio.create_task(self._bounded_process(text, wd_id))
|
task = asyncio.create_task(self._bounded_process(text, wd_id, title))
|
||||||
else:
|
else:
|
||||||
task = asyncio.create_task(self._process(text, wd_id))
|
task = asyncio.create_task(self._process(text, wd_id, title))
|
||||||
self.tasks.append(task)
|
self.tasks.append(task)
|
||||||
else:
|
else:
|
||||||
logger.debug(f"page {pid} without wikidata id, skipping...")
|
logger.debug(f"page {pid} without wikidata id, skipping...")
|
||||||
# reset
|
# reset
|
||||||
self.inPage = self.inRevision = self.inText = False
|
self.inPage = self.inRevision = self.inText = False
|
||||||
self.currentPageId = None
|
self.currentPageId = None
|
||||||
|
self.currentTitle = None
|
||||||
self.currentText = []
|
self.currentText = []
|
||||||
elif name == "revision":
|
elif name == "revision":
|
||||||
logger.debug("end revision")
|
logger.debug("end revision")
|
||||||
@ -81,16 +85,22 @@ class WikiDumpHandler(xml.sax.ContentHandler):
|
|||||||
content_stripped = content.strip()
|
content_stripped = content.strip()
|
||||||
if content_stripped: # Only process non-empty ID content
|
if content_stripped: # Only process non-empty ID content
|
||||||
self.currentPageId = content_stripped
|
self.currentPageId = content_stripped
|
||||||
|
elif self.currentTag == "title" and self.inPage:
|
||||||
|
if self.currentTitle is None:
|
||||||
|
self.currentTitle = content
|
||||||
|
else:
|
||||||
|
self.currentTitle += content
|
||||||
elif self.inText:
|
elif self.inText:
|
||||||
# Always append text content, even if it's just whitespace or newlines
|
# Always append text content, even if it's just whitespace or newlines
|
||||||
self.currentText.append(content)
|
self.currentText.append(content)
|
||||||
|
|
||||||
async def _process(self, text: str, uid: str):
|
async def _process(self, text: str, uid: str, title: str):
|
||||||
parser = WikivoyageParser()
|
parser = WikivoyageParser()
|
||||||
entry = parser.parse(text)
|
entry = parser.parse(text)
|
||||||
|
entry['properties']['title'] = title
|
||||||
await self.handler.write_entry(entry, uid)
|
await self.handler.write_entry(entry, uid)
|
||||||
|
|
||||||
async def _bounded_process(self, text: str, uid: str):
|
async def _bounded_process(self, text: str, uid: str, title: str):
|
||||||
# Only run N at once
|
# Only run N at once
|
||||||
async with self.sem:
|
async with self.sem:
|
||||||
await self._process(text, uid)
|
await self._process(text, uid, title)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user