mirror of
				https://github.com/bcye/structured-wikivoyage-exports.git
				synced 2025-10-30 22:52:45 +00:00 
			
		
		
		
	add bounding, env var config
This commit is contained in:
		
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -11,3 +11,4 @@ wheels/ | ||||
|  | ||||
| .env | ||||
| node_modules | ||||
| output | ||||
| @@ -1,2 +1,2 @@ | ||||
| from .base_handler import BaseHandler | ||||
| from .filesystm_handler import FileSystemHandler | ||||
| from .filesystem import FilesystemHandler | ||||
| @@ -2,8 +2,9 @@ | ||||
| from pathlib import Path | ||||
| import aiofiles | ||||
| from .base_handler import BaseHandler | ||||
| import json | ||||
| 
 | ||||
| class FileSystemHandler(BaseHandler): | ||||
| class FilesystemHandler(BaseHandler): | ||||
|     """ | ||||
|     Handler that writes files to the filesystem. | ||||
|     """ | ||||
| @@ -36,7 +37,7 @@ class FileSystemHandler(BaseHandler): | ||||
|         try: | ||||
|             file_path = self.output_dir / f"{uid}.json" | ||||
|             async with aiofiles.open(file_path, 'w') as f: | ||||
|                 await f.write(entry) | ||||
|                 await f.write(json.dumps(entry)) | ||||
|             return True | ||||
|         except IOError as e: | ||||
|             self.logger.error(f"Error writing entry {uid}: {e}") | ||||
| @@ -7,5 +7,6 @@ requires-python = ">=3.12" | ||||
| dependencies = [ | ||||
|     "aiofiles>=24.1.0", | ||||
|     "mwparserfromhell>=0.6.6", | ||||
|     "python-dotenv>=1.1.0", | ||||
|     "wikitextparser>=0.56.3", | ||||
| ] | ||||
|   | ||||
| @@ -6,6 +6,11 @@ from typing import Dict, List, Any, Optional, Union, Tuple | ||||
| import os | ||||
| from pathlib import Path | ||||
| import sys | ||||
| import asyncio | ||||
| import importlib | ||||
| import logging | ||||
| from dotenv import load_dotenv | ||||
| load_dotenv() | ||||
|  | ||||
| class WikivoyageParser: | ||||
|     def __init__(self): | ||||
| @@ -342,41 +347,120 @@ class WikivoyageParser: | ||||
|              | ||||
|         return json.dumps(root, indent=indent) | ||||
|  | ||||
| def process_file(input_file: Path, parser: WikivoyageParser) -> None: | ||||
|     """Process a single wiki file and save JSON output""" | ||||
|     # Create output path with .json extension | ||||
|     output_file = input_file.with_suffix('.json') | ||||
| async def process_file( | ||||
|     input_file: Path, | ||||
|     parser: WikivoyageParser, | ||||
|     handler, | ||||
| ) -> None: | ||||
|     """ | ||||
|     Parse one wiki file and hand the resulting entry off to our handler. | ||||
|     Uses the filename (sans suffix) as the unique UID. | ||||
|     """ | ||||
|      | ||||
|     # Ensure output directory exists | ||||
|     output_file.parent.mkdir(parents=True, exist_ok=True) | ||||
|      | ||||
|     try: | ||||
|         # Read and parse input file | ||||
|         with open(input_file, 'r', encoding='utf-8') as f: | ||||
|             wikitext = f.read() | ||||
|              | ||||
|         result = parser.parse(wikitext) | ||||
|          | ||||
|         # Write JSON output | ||||
|         with open(output_file, 'w', encoding='utf-8') as f: | ||||
|             f.write(parser.export_json()) | ||||
|              | ||||
|     except Exception as e: | ||||
|         print(f"Error processing {input_file}: {e}") | ||||
|     text = input_file.read_text(encoding="utf-8") | ||||
|     entry = parser.parse(text)  # assume returns a dict | ||||
|     uid = input_file.stem | ||||
|  | ||||
| def main(): | ||||
|     # Initialize parser once for reuse | ||||
|     await handler.write_entry(entry, uid) | ||||
|  | ||||
| def gather_handler_kwargs(handler_name: str) -> dict: | ||||
|     """ | ||||
|     Find all ENV vars starting with HANDLER_<NAME>_ and turn them into kwargs. | ||||
|     E.g. HANDLER_SFTP_HOST=foo → {"host": "foo"}, HANDLER_SFTP_PORT=2222 → {"port": 2222} | ||||
|     """ | ||||
|     prefix = f"HANDLER_{handler_name.upper()}_" | ||||
|     kwargs = {} | ||||
|  | ||||
|     for env_key, val in os.environ.items(): | ||||
|         if not env_key.startswith(prefix): | ||||
|             continue | ||||
|         param = env_key[len(prefix) :].lower() | ||||
|         # try to cast ints | ||||
|         if val.isdigit(): | ||||
|             val = int(val) | ||||
|         kwargs[param] = val | ||||
|  | ||||
|     return kwargs | ||||
|  | ||||
| async def main(): | ||||
|     logging.basicConfig(level=logging.DEBUG) | ||||
|  | ||||
|     # 1. Which handler to load? | ||||
|     handler_name = os.getenv("HANDLER") | ||||
|     if not handler_name: | ||||
|         print("Error: set ENV HANDLER (e.g. 'filesystem')") | ||||
|         sys.exit(1) | ||||
|  | ||||
|     # 2. Dynamic import | ||||
|     module_path = f"output_handlers.{handler_name}" | ||||
|     try: | ||||
|         mod = importlib.import_module(module_path) | ||||
|     except ImportError as e: | ||||
|         print(f"Error loading handler module {module_path}: {e}") | ||||
|         sys.exit(1) | ||||
|  | ||||
|     # 3. Find the class: e.g. "sftp" → "SftpHandler" | ||||
|     class_name = handler_name.title().replace("_", "") + "Handler" | ||||
|     if not hasattr(mod, class_name): | ||||
|         print(f"{module_path} defines no class {class_name}") | ||||
|         sys.exit(1) | ||||
|     HandlerCls = getattr(mod, class_name) | ||||
|  | ||||
|     # 4. Build kwargs from ENV | ||||
|     handler_kwargs = gather_handler_kwargs(handler_name) | ||||
|  | ||||
|     # 5. Instantiate | ||||
|     handler = HandlerCls(**handler_kwargs) | ||||
|  | ||||
|     # 6. Prepare parser | ||||
|     parser = WikivoyageParser() | ||||
|      | ||||
|     # Get input directory from command line or use current directory | ||||
|     input_dir = Path(sys.argv[1] if len(sys.argv) > 1 else '.') | ||||
|      | ||||
|     # Process all .txt files recursively | ||||
|     for txt_file in input_dir.rglob('*.txt'): | ||||
|         print(f"Processing {txt_file}") | ||||
|         process_file(txt_file, parser) | ||||
|          | ||||
|     print("Processing complete") | ||||
|  | ||||
|     # 7. Which dir to walk? | ||||
|     input_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".") | ||||
|     txt_files = list(input_dir.rglob("*.txt")) | ||||
|  | ||||
|     if not txt_files: | ||||
|         print(f"No .txt files found under {input_dir}") | ||||
|     else: | ||||
|         for txt in txt_files: | ||||
|             await process_file(txt, parser, handler) | ||||
|  | ||||
|     # 7) read concurrency setting | ||||
|     try: | ||||
|         max_conc = int(os.getenv("MAX_CONCURRENT", "0")) | ||||
|     except ValueError: | ||||
|         print("Error: MAX_CONCURRENT must be an integer") | ||||
|         sys.exit(1) | ||||
|  | ||||
|     if max_conc < 0: | ||||
|         print("Error: MAX_CONCURRENT must be >= 0") | ||||
|         sys.exit(1) | ||||
|  | ||||
|     # 8) schedule tasks | ||||
|     if max_conc == 0: | ||||
|         # unbounded | ||||
|         tasks = [ | ||||
|             asyncio.create_task(process_file(txt, parser, handler)) | ||||
|             for txt in txt_files | ||||
|         ] | ||||
|     else: | ||||
|         # bounded by semaphore | ||||
|         sem = asyncio.Semaphore(max_conc) | ||||
|  | ||||
|         async def bounded(txt): | ||||
|             async with sem: | ||||
|                 return await process_file(txt, parser, handler) | ||||
|  | ||||
|         tasks = [ | ||||
|             asyncio.create_task(bounded(txt)) | ||||
|             for txt in txt_files | ||||
|         ] | ||||
|  | ||||
|     # 9) run them all | ||||
|     await asyncio.gather(*tasks) | ||||
|  | ||||
|     print("All done.") | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     main() | ||||
|     asyncio.run(main()) | ||||
							
								
								
									
										12
									
								
								uv.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										12
									
								
								uv.lock
									
									
									
										generated
									
									
									
								
							| @@ -1,5 +1,4 @@ | ||||
| version = 1 | ||||
| revision = 1 | ||||
| requires-python = ">=3.12" | ||||
|  | ||||
| [[package]] | ||||
| @@ -18,6 +17,7 @@ source = { virtual = "." } | ||||
| dependencies = [ | ||||
|     { name = "aiofiles" }, | ||||
|     { name = "mwparserfromhell" }, | ||||
|     { name = "python-dotenv" }, | ||||
|     { name = "wikitextparser" }, | ||||
| ] | ||||
|  | ||||
| @@ -25,6 +25,7 @@ dependencies = [ | ||||
| requires-dist = [ | ||||
|     { name = "aiofiles", specifier = ">=24.1.0" }, | ||||
|     { name = "mwparserfromhell", specifier = ">=0.6.6" }, | ||||
|     { name = "python-dotenv", specifier = ">=1.1.0" }, | ||||
|     { name = "wikitextparser", specifier = ">=0.56.3" }, | ||||
| ] | ||||
|  | ||||
| @@ -41,6 +42,15 @@ wheels = [ | ||||
|     { url = "https://files.pythonhosted.org/packages/78/09/24c2f37524a3ebc3574975766748c7e4423ecefaa815c9fc4a324cbcf94a/mwparserfromhell-0.6.6-cp312-cp312-win_amd64.whl", hash = "sha256:cdc46c115b2495d4025920b7b30a6885a96d2b797ccc4009bf3cc02940ae55d3", size = 101071 }, | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "python-dotenv" | ||||
| version = "1.1.0" | ||||
| source = { registry = "https://pypi.org/simple" } | ||||
| sdist = { url = "https://files.pythonhosted.org/packages/88/2c/7bb1416c5620485aa793f2de31d3df393d3686aa8a8506d11e10e13c5baf/python_dotenv-1.1.0.tar.gz", hash = "sha256:41f90bc6f5f177fb41f53e87666db362025010eb28f60a01c9143bfa33a2b2d5", size = 39920 } | ||||
| wheels = [ | ||||
|     { url = "https://files.pythonhosted.org/packages/1e/18/98a99ad95133c6a6e2005fe89faedf294a748bd5dc803008059409ac9b1e/python_dotenv-1.1.0-py3-none-any.whl", hash = "sha256:d7c01d9e2293916c18baf562d95698754b0dbbb5e74d457c45d4f6561fb9d55d", size = 20256 }, | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "regex" | ||||
| version = "2024.11.6" | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Bruce Röttgers
					Bruce Röttgers