mirror of
				https://github.com/bcye/structured-wikivoyage-exports.git
				synced 2025-10-30 22:52:45 +00:00 
			
		
		
		
	move code to dedicated src/ folder
This commit is contained in:
		
							
								
								
									
										2
									
								
								.github/workflows/test-parser.yaml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/workflows/test-parser.yaml
									
									
									
									
										vendored
									
									
								
							| @@ -20,4 +20,4 @@ jobs: | |||||||
|         run: uv sync --locked --dev |         run: uv sync --locked --dev | ||||||
|  |  | ||||||
|       - name: Run tests |       - name: Run tests | ||||||
|         run: PYTHONPATH=. uv run pytest |         run: PYTHONPATH=src uv run pytest | ||||||
|   | |||||||
| @@ -18,7 +18,7 @@ See [docs](docs) for more information on how to use this utility. | |||||||
|  |  | ||||||
| ## Testing | ## Testing | ||||||
|  |  | ||||||
| Run `PYTHONPATH=. pytest` from inside the venv | Run `PYTHONPATH=src pytest` from inside the venv, or directly call `PYTHONPATH=src uv run -- pytest`. | ||||||
|  |  | ||||||
| ## License | ## License | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										96
									
								
								src/transformers/wiki_dump_handler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										96
									
								
								src/transformers/wiki_dump_handler.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,96 @@ | |||||||
|  | from logging import getLogger | ||||||
|  | import xml.sax | ||||||
|  | import asyncio | ||||||
|  | from .parser import WikivoyageParser | ||||||
|  |  | ||||||
|  | logger = getLogger(__name__) | ||||||
|  |  | ||||||
class WikiDumpHandler(xml.sax.ContentHandler):
    """
    SAX handler that, for each <page> whose <id> is in mappings,
    collects the <text> and schedules an async task to parse
    and write via the user-supplied handler.

    Args:
        mappings: Mapping from page id (as the string found in the dump's
            page-level ``<id>``) to the identifier passed on to
            ``handler.write_entry``.
        handler: Object exposing an awaitable ``write_entry(entry, uid)``.
        max_concurrent: Maximum number of pages processed at the same time;
            a value ``<= 0`` disables the limit.

    Note:
        Tasks are created with ``asyncio.create_task`` from inside
        ``endElement``, so the SAX parse must be driven while an event loop
        is running. Every created task is appended to ``self.tasks``.
    """

    def __init__(self, mappings, handler, max_concurrent):
        super().__init__()
        self.mappings = mappings
        self.handler = handler
        # No semaphore at all means "unbounded" (see endElement).
        self.sem = (
            asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None
        )
        self.tasks: list[asyncio.Task] = []

        # Parser state for the page currently being read.
        self.currentTag: str | None = None
        self.inPage = False
        self.inRevision = False
        self.inText = False
        self.currentPageId: str | None = None
        self.currentText: list[str] = []
        # True once the page-level <id> element has been closed with a
        # non-empty value; later page-level <id> elements are then ignored.
        self._pageIdComplete = False

    def startElement(self, name, attrs):
        """Track which element is open and reset per-page state on <page>."""
        self.currentTag = name
        if name == "page":
            logger.debug("start page")
            self.inPage = True
            self.currentPageId = None
            self._pageIdComplete = False
            self.currentText = []
        elif name == "revision":
            logger.debug("start revision")
            self.inRevision = True
        elif name == "text" and self.inRevision:
            logger.debug("start text")
            self.inText = True

    def endElement(self, name):
        """Close the element; on </page>, schedule processing if mapped."""
        if name == "page":
            logger.debug("end page")
            pid = self.currentPageId
            if pid and pid in self.mappings:
                wd_id = self.mappings[pid]
                text = "".join(self.currentText)
                logger.debug("scheduled %s for handling", wd_id)
                # schedule processing, bounded only when a semaphore exists
                if self.sem is not None:
                    coro = self._bounded_process(text, wd_id)
                else:
                    coro = self._process(text, wd_id)
                self.tasks.append(asyncio.create_task(coro))
            else:
                logger.debug("page %s without wikidata id, skipping...", pid)
            # reset
            self.inPage = self.inRevision = self.inText = False
            self.currentPageId = None
            self._pageIdComplete = False
            self.currentText = []
        elif name == "revision":
            logger.debug("end revision")
            self.inRevision = False
        elif name == "text":
            logger.debug("end text")
            self.inText = False
        elif name == "id":
            # Freeze the page id once its element is fully read, so only the
            # first non-empty page-level <id> is ever used.
            if self.inPage and not self.inRevision and self.currentPageId:
                self._pageIdComplete = True
        self.currentTag = None

    def characters(self, content):
        """Accumulate character data for the page id and the revision text.

        SAX parsers may deliver one text node in several chunks, so both the
        id and the text are built up across calls.
        """
        # Only filter whitespace for ID fields, preserve all content for text
        if (
            self.currentTag == "id"
            and self.inPage
            and not self.inRevision
            and not self._pageIdComplete
        ):
            fragment = content.strip()
            if fragment:  # Only process non-empty ID content
                # Concatenate: the id may arrive split over multiple calls.
                self.currentPageId = (self.currentPageId or "") + fragment
        elif self.inText:
            # Always append text content, even if it's just whitespace or newlines
            self.currentText.append(content)

    async def _process(self, text: str, uid: str):
        """Parse one page's wikitext and hand the result to the handler."""
        parser = WikivoyageParser()
        entry = parser.parse(text)
        await self.handler.write_entry(entry, uid)

    async def _bounded_process(self, text: str, uid: str):
        """Run ``_process`` while holding the concurrency semaphore."""
        # Only run N at once
        async with self.sem:
            await self._process(text, uid)
		Reference in New Issue
	
	Block a user