mirror of https://github.com/bcye/structured-wikivoyage-exports.git (synced 2025-10-30 22:52:45 +00:00)
Merge pull request #33 from bcye/feature/multiple-handlers

Allow for multiple handlers

main.py (81 changed lines)

@@ -40,7 +40,7 @@ def gather_handler_kwargs(handler_name: str) -> dict:
 
 
 async def process_dump(
-    mappings: dict[str, str], handler, max_concurrent: int
+    mappings: dict[str, str], handlers
 ):
     """
     Stream-download the bzip2-compressed XML dump and feed to SAX.
@@ -52,7 +52,7 @@ async def process_dump(
     )
     decomp = bz2.BZ2Decompressor()
     sax_parser = xml.sax.make_parser()
-    dump_handler = WikiDumpHandler(mappings, handler, max_concurrent)
+    dump_handler = WikiDumpHandler(mappings, handlers)
     sax_parser.setContentHandler(dump_handler)
 
     async with aiohttp.ClientSession() as session:
@@ -69,36 +69,13 @@ async def process_dump(
         await asyncio.gather(*dump_handler.tasks)
 
 async def main():
-    # 1. Which handler to load?
-    handler_name = os.getenv("HANDLER")
-    if not handler_name:
-        logger.error("Error: set ENV HANDLER (e.g. 'filesystem')")
+    # 1. Which handler(s) to load?
+    handler_names = os.getenv("HANDLER", "").split(",")
+    if not handler_names or not handler_names[0]:
+        logger.error("Error: set ENV HANDLER (e.g. 'filesystem' or 'filesystem,sftp')")
         sys.exit(1)
 
-    # 2. Dynamic import
-    module_path = f"output_handlers.{handler_name}"
-    try:
-        mod = importlib.import_module(module_path)
-    except ImportError as e:
-        logger.error(f"Error loading handler module {module_path}: {e}")
-        sys.exit(1)
-
-    # 3. Find the class: e.g. "sftp" → "SftpHandler"
-    class_name = handler_name.title().replace("_", "") + "Handler"
-    if not hasattr(mod, class_name):
-        logger.error(f"{module_path} defines no class {class_name}")
-        sys.exit(1)
-    HandlerCls = getattr(mod, class_name)
-
-    logger.info(f"Using handler from {module_path}")
-
-    # 4. Build kwargs from ENV
-    handler_kwargs = gather_handler_kwargs(handler_name)
-
-    # 5. Instantiate
-    handler = HandlerCls(**handler_kwargs)
-
-    # 6. read concurrency setting
+    # 2. Read concurrency setting
     try:
         max_conc = int(os.getenv("MAX_CONCURRENT", "0"))
     except ValueError:
@@ -107,18 +84,52 @@ async def main():
     if max_conc < 0:
         raise ValueError("MAX_CONCURRENT must be >= 0")
 
+    handlers = []
 
-    # 7. Fetch mappings
+    # 3. Load each handler
+    for handler_name in handler_names:
+        handler_name = handler_name.strip()
+        if not handler_name:
+            continue
+
+        # Dynamic import
+        module_path = f"output_handlers.{handler_name}"
+        try:
+            mod = importlib.import_module(module_path)
+        except ImportError as e:
+            logger.error(f"Error loading handler module {module_path}: {e}")
+            sys.exit(1)
+
+        # Find the class: e.g. "sftp" → "SftpHandler"
+        class_name = handler_name.title().replace("_", "") + "Handler"
+        if not hasattr(mod, class_name):
+            logger.error(f"{module_path} defines no class {class_name}")
+            sys.exit(1)
+        HandlerCls = getattr(mod, class_name)
+
+        logger.info(f"Using handler from {module_path}")
+
+        # Build kwargs from ENV
+        handler_kwargs = gather_handler_kwargs(handler_name)
+
+        # Add max_concurrent to kwargs
+        handler_kwargs["max_concurrent"] = max_conc
+
+        # Instantiate
+        handler = HandlerCls(**handler_kwargs)
+        handlers.append(handler)
+
+    # 4. Fetch mappings
     logger.info("Fetching mappings from SQL dump…")
     mappings = await fetch_mappings()
     logger.info(f"Got {len(mappings)} wikibase_item mappings.")
 
-    # 8. Stream & split the XML dump
+    # 5. Stream & split the XML dump
     logger.info("Processing XML dump…")
-    await process_dump(mappings, handler, max_conc)
+    await process_dump(mappings, handlers)  # Pass 0 as max_concurrent since handlers handle it
 
-    # 5. Finish up
-    await handler.close()
+    # 6. Finish up
+    await asyncio.gather(*[handler.close() for handler in handlers])
     logger.info("All done.")
 
 
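
To make the loading convention above concrete, here is a minimal, standalone sketch (not part of this commit) of how a comma-separated HANDLER value maps to module paths and class names; the handler names used in the demo call are illustrative assumptions, not a list of modules the repository ships:

import os

def resolve_handler_names(env_value: str) -> list[tuple[str, str]]:
    """Return (module_path, class_name) pairs for each configured handler name."""
    pairs = []
    for name in env_value.split(","):
        name = name.strip()
        if not name:
            continue
        # Same convention as main.py: "sftp" -> "output_handlers.sftp" / "SftpHandler"
        module_path = f"output_handlers.{name}"
        class_name = name.title().replace("_", "") + "Handler"
        pairs.append((module_path, class_name))
    return pairs

if __name__ == "__main__":
    # e.g. HANDLER=filesystem,bunny_storage (assumed example values)
    print(resolve_handler_names(os.getenv("HANDLER", "filesystem,bunny_storage")))
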
@@ -1,6 +1,7 @@
 """Reference handler for output handlers."""
 from abc import ABC, abstractmethod
 import logging
+import asyncio
 
 
 
@@ -14,15 +15,20 @@ class BaseHandler(ABC):
     _successful_writes = 0
     _failed_writes = 0
 
-    def __init__(self, fail_on_error: bool = True, **kwargs):
+    def __init__(self, fail_on_error: bool = True, max_concurrent=0, **kwargs):
         """
         Initializes the BaseHandler with optional parameters.
 
         Args:
             fail_on_error (bool): If True, the handler will raise an exception on error. Defaults to True.
+            max_concurrent: Maximum number of concurrent write operations.
+                            0 means unlimited concurrency.
             **kwargs: Additional keyword arguments for specific handler implementations.
         """
         self.fail_on_error = fail_on_error
+        self.semaphore = None
+        if max_concurrent > 0:
+            self.semaphore = asyncio.Semaphore(max_concurrent)
 
 
     @abstractmethod
@@ -47,6 +53,10 @@ class BaseHandler(ABC):
             entry (dict): The entry to write (will be JSON-encoded).
             uid (str): The unique identifier for the entry. The default id provided by wikivoyage is recommended.
         """
-        success = await self._write_entry(entry, uid)
+        if self.semaphore:
+            async with self.semaphore:
+                success = await self._write_entry(entry, uid)
+        else:
+            success = await self._write_entry(entry, uid)
         if success:
             self.logger.debug(f"Successfully wrote entry with UID {uid}")
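
As a rough, self-contained sketch (for illustration only, not repository code), this is the throttling pattern the reworked BaseHandler applies: an optional asyncio.Semaphore caps how many write operations run at once, with 0 meaning unlimited:

import asyncio

class ThrottledWriter:
    """Toy stand-in for a handler; only demonstrates the semaphore pattern."""

    def __init__(self, max_concurrent: int = 0):
        # 0 means unlimited concurrency, matching the BaseHandler docstring above
        self.semaphore = asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None

    async def write(self, uid: str) -> None:
        if self.semaphore:
            async with self.semaphore:  # at most max_concurrent writes in flight
                await self._do_write(uid)
        else:
            await self._do_write(uid)

    async def _do_write(self, uid: str) -> None:
        await asyncio.sleep(0.1)  # stand-in for a real upload
        print(f"wrote {uid}")

async def demo():
    writer = ThrottledWriter(max_concurrent=2)
    await asyncio.gather(*(writer.write(str(i)) for i in range(5)))

if __name__ == "__main__":
    asyncio.run(demo())
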
@@ -10,8 +10,9 @@ class BunnyStorageHandler(BaseHandler):
         api_key: str,
         fail_on_error: bool = True,
         keepalive_timeout: int = 75,
+        **kwargs,
     ):
-        super().__init__(fail_on_error=fail_on_error)
+        super().__init__(fail_on_error=fail_on_error, **kwargs)
         self.base_url = f"https://{region}.bunnycdn.com/{base_path}"
         self.headers = {
             "AccessKey": api_key,
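
The one-line super().__init__ change matters because keyword options now travel through **kwargs. A minimal sketch (illustrative, not repository code) of why a subclass must forward them for max_concurrent to reach the base class:

class Base:
    def __init__(self, fail_on_error: bool = True, max_concurrent: int = 0, **kwargs):
        self.max_concurrent = max_concurrent

class ForwardingSubclass(Base):
    def __init__(self, api_key: str, **kwargs):
        super().__init__(**kwargs)  # max_concurrent passes through to Base
        self.api_key = api_key

print(ForwardingSubclass(api_key="k", max_concurrent=5).max_concurrent)  # prints 5
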
@@ -9,16 +9,14 @@ class WikiDumpHandler(xml.sax.ContentHandler):
     """
     SAX handler that, for each <page> whose <id> is in mappings,
     collects the <text> and schedules an async task to parse
-    and write via the user‐supplied handler.
+    and write via the user‐supplied handler(s).
     """
 
-    def __init__(self, mappings, handler, max_concurrent):
+    def __init__(self, mappings, handlers):
         super().__init__()
         self.mappings = mappings
-        self.handler = handler
-        self.sem = (
-            asyncio.Semaphore(max_concurrent) if max_concurrent > 0 else None
-        )
+        # Support a single handler or a list of handlers
+        self.handlers = handlers
         self.tasks: list[asyncio.Task] = []
 
         self.currentTag: str | None = None
@@ -54,9 +52,6 @@ class WikiDumpHandler(xml.sax.ContentHandler):
                 title = self.currentTitle
                 logger.debug(f"scheduled {wd_id} for handling")
                 # schedule processing
-                if self.sem:
-                    task = asyncio.create_task(self._bounded_process(text, wd_id, title))
-                else:
-                    task = asyncio.create_task(self._process(text, wd_id, title))
+                task = asyncio.create_task(self._process(text, wd_id, title))
                 self.tasks.append(task)
             else:
@@ -98,9 +93,8 @@ class WikiDumpHandler(xml.sax.ContentHandler):
         parser = WikivoyageParser()
         entry = parser.parse(text)
         entry['properties']['title'] = title
-        await self.handler.write_entry(entry, uid)
-
-    async def _bounded_process(self, text: str, uid: str, title: str):
-        # Only run N at once
-        async with self.sem:
-            await self._process(text, uid, title)
+        # Write to all handlers concurrently
+        await asyncio.gather(*[
+            handler.write_entry(entry, uid) for handler in self.handlers
+        ])
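
Finally, a small standalone sketch (assumed example, not repository code) of the fan-out that _process now performs: one write_entry call per handler, all awaited together with asyncio.gather:

import asyncio

class PrintHandler:
    """Toy handler with the same write_entry(entry, uid) shape used above."""

    def __init__(self, name: str):
        self.name = name

    async def write_entry(self, entry: dict, uid: str) -> None:
        await asyncio.sleep(0)  # stand-in for real I/O
        print(f"{self.name}: wrote {uid} ({entry['properties']['title']})")

async def fan_out(entry: dict, uid: str, handlers: list) -> None:
    # All handlers receive the same entry concurrently
    await asyncio.gather(*[h.write_entry(entry, uid) for h in handlers])

if __name__ == "__main__":
    handlers = [PrintHandler("filesystem"), PrintHandler("bunny")]
    asyncio.run(fan_out({"properties": {"title": "Example"}}, "Q1", handlers))
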