mirror of
				https://github.com/bcye/structured-wikivoyage-exports.git
				synced 2025-10-30 22:52:45 +00:00 
			
		
		
		
	move remaining files
This commit is contained in:
		| @@ -5,20 +5,19 @@ from .parser import WikivoyageParser | |||||||
|  |  | ||||||
| logger = getLogger(__name__) | logger = getLogger(__name__) | ||||||
|  |  | ||||||
|  |  | ||||||
| class WikiDumpHandler(xml.sax.ContentHandler): | class WikiDumpHandler(xml.sax.ContentHandler): | ||||||
|     """ |     """ | ||||||
|     SAX handler that, for each <page> whose <id> is in mappings, |     SAX handler that, for each <page> whose <id> is in mappings, | ||||||
|     collects the <text> and schedules an async task to parse |     collects the <text> and schedules an async task to parse | ||||||
|     and write via the user‐supplied handler. |     and write via the user‐supplied handler(s). | ||||||
|     """ |     """ | ||||||
|  |  | ||||||
|     def __init__(self, mappings, handler, max_concurrent): |     def __init__(self, mappings, handlers): | ||||||
|         super().__init__() |         super().__init__() | ||||||
|         self.mappings = mappings |         self.mappings = mappings | ||||||
|         self.handler = handler |         # Support a single handler or a list of handlers | ||||||
|         self.sem = ( |         self.handlers = handlers | ||||||
|             asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None |  | ||||||
|         ) |  | ||||||
|         self.tasks: list[asyncio.Task] = [] |         self.tasks: list[asyncio.Task] = [] | ||||||
|  |  | ||||||
|         self.currentTag: str | None = None |         self.currentTag: str | None = None | ||||||
| @@ -26,6 +25,7 @@ class WikiDumpHandler(xml.sax.ContentHandler): | |||||||
|         self.inRevision = False |         self.inRevision = False | ||||||
|         self.inText = False |         self.inText = False | ||||||
|         self.currentPageId: str | None = None |         self.currentPageId: str | None = None | ||||||
|  |         self.currentTitle: str | None = None | ||||||
|         self.currentText: list[str] = [] |         self.currentText: list[str] = [] | ||||||
|  |  | ||||||
|     def startElement(self, name, attrs): |     def startElement(self, name, attrs): | ||||||
| @@ -34,6 +34,7 @@ class WikiDumpHandler(xml.sax.ContentHandler): | |||||||
|             logger.debug("start page") |             logger.debug("start page") | ||||||
|             self.inPage = True |             self.inPage = True | ||||||
|             self.currentPageId = None |             self.currentPageId = None | ||||||
|  |             self.currentTitle = None | ||||||
|             self.currentText = [] |             self.currentText = [] | ||||||
|         elif name == "revision": |         elif name == "revision": | ||||||
|             logger.debug("start revision") |             logger.debug("start revision") | ||||||
| @@ -49,18 +50,17 @@ class WikiDumpHandler(xml.sax.ContentHandler): | |||||||
|             if pid and pid in self.mappings: |             if pid and pid in self.mappings: | ||||||
|                 wd_id = self.mappings[pid] |                 wd_id = self.mappings[pid] | ||||||
|                 text = "".join(self.currentText) |                 text = "".join(self.currentText) | ||||||
|  |                 title = self.currentTitle | ||||||
|                 logger.debug(f"scheduled {wd_id} for handling") |                 logger.debug(f"scheduled {wd_id} for handling") | ||||||
|                 # schedule processing |                 # schedule processing | ||||||
|                 if self.sem: |                 task = asyncio.create_task(self._process(text, wd_id, title)) | ||||||
|                     task = asyncio.create_task(self._bounded_process(text, wd_id)) |  | ||||||
|                 else: |  | ||||||
|                     task = asyncio.create_task(self._process(text, wd_id)) |  | ||||||
|                 self.tasks.append(task) |                 self.tasks.append(task) | ||||||
|             else: |             else: | ||||||
|                 logger.debug(f"page {pid} without wikidata id, skipping...") |                 logger.debug(f"page {pid} without wikidata id, skipping...") | ||||||
|             # reset |             # reset | ||||||
|             self.inPage = self.inRevision = self.inText = False |             self.inPage = self.inRevision = self.inText = False | ||||||
|             self.currentPageId = None |             self.currentPageId = None | ||||||
|  |             self.currentTitle = None | ||||||
|             self.currentText = [] |             self.currentText = [] | ||||||
|         elif name == "revision": |         elif name == "revision": | ||||||
|             logger.debug("end revision") |             logger.debug("end revision") | ||||||
| @@ -81,16 +81,21 @@ class WikiDumpHandler(xml.sax.ContentHandler): | |||||||
|             content_stripped = content.strip() |             content_stripped = content.strip() | ||||||
|             if content_stripped:  # Only process non-empty ID content |             if content_stripped:  # Only process non-empty ID content | ||||||
|                 self.currentPageId = content_stripped |                 self.currentPageId = content_stripped | ||||||
|  |         elif self.currentTag == "title" and self.inPage: | ||||||
|  |             if self.currentTitle is None: | ||||||
|  |                 self.currentTitle = content | ||||||
|  |             else: | ||||||
|  |                 self.currentTitle += content | ||||||
|         elif self.inText: |         elif self.inText: | ||||||
|             # Always append text content, even if it's just whitespace or newlines |             # Always append text content, even if it's just whitespace or newlines | ||||||
|             self.currentText.append(content) |             self.currentText.append(content) | ||||||
|  |  | ||||||
|     async def _process(self, text: str, uid: str): |     async def _process(self, text: str, uid: str, title: str): | ||||||
|         parser = WikivoyageParser() |         parser = WikivoyageParser() | ||||||
|         entry = parser.parse(text) |         entry = parser.parse(text) | ||||||
|         await self.handler.write_entry(entry, uid) |         entry["properties"]["title"] = title | ||||||
|  |  | ||||||
|     async def _bounded_process(self, text: str, uid: str): |         # Write to all handlers concurrently | ||||||
|         # Only run N at once |         await asyncio.gather( | ||||||
|         async with self.sem: |             *[handler.write_entry(entry, uid) for handler in self.handlers] | ||||||
|             await self._process(text, uid) |         ) | ||||||
|   | |||||||
| @@ -1,100 +0,0 @@ | |||||||
| from logging import getLogger |  | ||||||
| import xml.sax |  | ||||||
| import asyncio |  | ||||||
| from .parser import WikivoyageParser |  | ||||||
|  |  | ||||||
| logger = getLogger(__name__) |  | ||||||
|  |  | ||||||
| class WikiDumpHandler(xml.sax.ContentHandler): |  | ||||||
|     """ |  | ||||||
|     SAX handler that, for each <page> whose <id> is in mappings, |  | ||||||
|     collects the <text> and schedules an async task to parse |  | ||||||
|     and write via the user‐supplied handler(s). |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     def __init__(self, mappings, handlers): |  | ||||||
|         super().__init__() |  | ||||||
|         self.mappings = mappings |  | ||||||
|         # Support a single handler or a list of handlers |  | ||||||
|         self.handlers = handlers |  | ||||||
|         self.tasks: list[asyncio.Task] = [] |  | ||||||
|  |  | ||||||
|         self.currentTag: str | None = None |  | ||||||
|         self.inPage = False |  | ||||||
|         self.inRevision = False |  | ||||||
|         self.inText = False |  | ||||||
|         self.currentPageId: str | None = None |  | ||||||
|         self.currentTitle: str | None = None |  | ||||||
|         self.currentText: list[str] = [] |  | ||||||
|  |  | ||||||
|     def startElement(self, name, attrs): |  | ||||||
|         self.currentTag = name |  | ||||||
|         if name == "page": |  | ||||||
|             logger.debug("start page") |  | ||||||
|             self.inPage = True |  | ||||||
|             self.currentPageId = None |  | ||||||
|             self.currentTitle = None |  | ||||||
|             self.currentText = [] |  | ||||||
|         elif name == "revision": |  | ||||||
|             logger.debug("start revision") |  | ||||||
|             self.inRevision = True |  | ||||||
|         elif name == "text" and self.inRevision: |  | ||||||
|             logger.debug("start text") |  | ||||||
|             self.inText = True |  | ||||||
|  |  | ||||||
|     def endElement(self, name): |  | ||||||
|         if name == "page": |  | ||||||
|             logger.debug("end page") |  | ||||||
|             pid = self.currentPageId |  | ||||||
|             if pid and pid in self.mappings: |  | ||||||
|                 wd_id = self.mappings[pid] |  | ||||||
|                 text = "".join(self.currentText) |  | ||||||
|                 title = self.currentTitle |  | ||||||
|                 logger.debug(f"scheduled {wd_id} for handling") |  | ||||||
|                 # schedule processing |  | ||||||
|                 task = asyncio.create_task(self._process(text, wd_id, title)) |  | ||||||
|                 self.tasks.append(task) |  | ||||||
|             else: |  | ||||||
|                 logger.debug(f"page {pid} without wikidata id, skipping...") |  | ||||||
|             # reset |  | ||||||
|             self.inPage = self.inRevision = self.inText = False |  | ||||||
|             self.currentPageId = None |  | ||||||
|             self.currentTitle = None |  | ||||||
|             self.currentText = [] |  | ||||||
|         elif name == "revision": |  | ||||||
|             logger.debug("end revision") |  | ||||||
|             self.inRevision = False |  | ||||||
|         elif name == "text": |  | ||||||
|             logger.debug("end text") |  | ||||||
|             self.inText = False |  | ||||||
|         self.currentTag = None |  | ||||||
|  |  | ||||||
|     def characters(self, content): |  | ||||||
|         # Only filter whitespace for ID fields, preserve all content for text |  | ||||||
|         if ( |  | ||||||
|             self.currentTag == "id" |  | ||||||
|             and self.inPage |  | ||||||
|             and not self.inRevision |  | ||||||
|             and not self.currentPageId |  | ||||||
|         ): |  | ||||||
|             content_stripped = content.strip() |  | ||||||
|             if content_stripped:  # Only process non-empty ID content |  | ||||||
|                 self.currentPageId = content_stripped |  | ||||||
|         elif self.currentTag == "title" and self.inPage: |  | ||||||
|             if self.currentTitle is None: |  | ||||||
|                 self.currentTitle = content |  | ||||||
|             else: |  | ||||||
|                 self.currentTitle += content |  | ||||||
|         elif self.inText: |  | ||||||
|             # Always append text content, even if it's just whitespace or newlines |  | ||||||
|             self.currentText.append(content) |  | ||||||
|  |  | ||||||
|     async def _process(self, text: str, uid: str, title: str): |  | ||||||
|         parser = WikivoyageParser() |  | ||||||
|         entry = parser.parse(text) |  | ||||||
|         entry['properties']['title'] = title |  | ||||||
|          |  | ||||||
|         # Write to all handlers concurrently |  | ||||||
|         await asyncio.gather(*[ |  | ||||||
|             handler.write_entry(entry, uid) for handler in self.handlers |  | ||||||
|         ]) |  | ||||||
		Reference in New Issue
	
	Block a user
	 Bruce Röttgers
					Bruce Röttgers