mirror of
				https://github.com/bcye/structured-wikivoyage-exports.git
				synced 2025-10-30 22:52:45 +00:00 
			
		
		
		
	add bounding, env var config
This commit is contained in:
		
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -11,3 +11,4 @@ wheels/ | |||||||
|  |  | ||||||
| .env | .env | ||||||
| node_modules | node_modules | ||||||
|  | output | ||||||
| @@ -1,2 +1,2 @@ | |||||||
| from .base_handler import BaseHandler | from .base_handler import BaseHandler | ||||||
| from .filesystm_handler import FileSystemHandler | from .filesystem import FilesystemHandler | ||||||
| @@ -2,8 +2,9 @@ | |||||||
| from pathlib import Path | from pathlib import Path | ||||||
| import aiofiles | import aiofiles | ||||||
| from .base_handler import BaseHandler | from .base_handler import BaseHandler | ||||||
|  | import json | ||||||
| 
 | 
 | ||||||
| class FileSystemHandler(BaseHandler): | class FilesystemHandler(BaseHandler): | ||||||
|     """ |     """ | ||||||
|     Handler that writes files to the filesystem. |     Handler that writes files to the filesystem. | ||||||
|     """ |     """ | ||||||
| @@ -36,7 +37,7 @@ class FileSystemHandler(BaseHandler): | |||||||
|         try: |         try: | ||||||
|             file_path = self.output_dir / f"{uid}.json" |             file_path = self.output_dir / f"{uid}.json" | ||||||
|             async with aiofiles.open(file_path, 'w') as f: |             async with aiofiles.open(file_path, 'w') as f: | ||||||
|                 await f.write(entry) |                 await f.write(json.dumps(entry)) | ||||||
|             return True |             return True | ||||||
|         except IOError as e: |         except IOError as e: | ||||||
|             self.logger.error(f"Error writing entry {uid}: {e}") |             self.logger.error(f"Error writing entry {uid}: {e}") | ||||||
| @@ -7,5 +7,6 @@ requires-python = ">=3.12" | |||||||
| dependencies = [ | dependencies = [ | ||||||
|     "aiofiles>=24.1.0", |     "aiofiles>=24.1.0", | ||||||
|     "mwparserfromhell>=0.6.6", |     "mwparserfromhell>=0.6.6", | ||||||
|  |     "python-dotenv>=1.1.0", | ||||||
|     "wikitextparser>=0.56.3", |     "wikitextparser>=0.56.3", | ||||||
| ] | ] | ||||||
|   | |||||||
| @@ -6,6 +6,11 @@ from typing import Dict, List, Any, Optional, Union, Tuple | |||||||
| import os | import os | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| import sys | import sys | ||||||
|  | import asyncio | ||||||
|  | import importlib | ||||||
|  | import logging | ||||||
|  | from dotenv import load_dotenv | ||||||
|  | load_dotenv() | ||||||
|  |  | ||||||
| class WikivoyageParser: | class WikivoyageParser: | ||||||
|     def __init__(self): |     def __init__(self): | ||||||
| @@ -342,41 +347,120 @@ class WikivoyageParser: | |||||||
|              |              | ||||||
|         return json.dumps(root, indent=indent) |         return json.dumps(root, indent=indent) | ||||||
|  |  | ||||||
| def process_file(input_file: Path, parser: WikivoyageParser) -> None: | async def process_file( | ||||||
|     """Process a single wiki file and save JSON output""" |     input_file: Path, | ||||||
|     # Create output path with .json extension |     parser: WikivoyageParser, | ||||||
|     output_file = input_file.with_suffix('.json') |     handler, | ||||||
|  | ) -> None: | ||||||
|  |     """ | ||||||
|  |     Parse one wiki file and hand the resulting entry off to our handler. | ||||||
|  |     Uses the filename (sans suffix) as the unique UID. | ||||||
|  |     """ | ||||||
|      |      | ||||||
|     # Ensure output directory exists |     text = input_file.read_text(encoding="utf-8") | ||||||
|     output_file.parent.mkdir(parents=True, exist_ok=True) |     entry = parser.parse(text)  # assume returns a dict | ||||||
|      |     uid = input_file.stem | ||||||
|     try: |  | ||||||
|         # Read and parse input file |  | ||||||
|         with open(input_file, 'r', encoding='utf-8') as f: |  | ||||||
|             wikitext = f.read() |  | ||||||
|              |  | ||||||
|         result = parser.parse(wikitext) |  | ||||||
|          |  | ||||||
|         # Write JSON output |  | ||||||
|         with open(output_file, 'w', encoding='utf-8') as f: |  | ||||||
|             f.write(parser.export_json()) |  | ||||||
|              |  | ||||||
|     except Exception as e: |  | ||||||
|         print(f"Error processing {input_file}: {e}") |  | ||||||
|  |  | ||||||
| def main(): |     await handler.write_entry(entry, uid) | ||||||
|     # Initialize parser once for reuse |  | ||||||
|  | def gather_handler_kwargs(handler_name: str) -> dict: | ||||||
|  |     """ | ||||||
|  |     Find all ENV vars starting with HANDLER_<NAME>_ and turn them into kwargs. | ||||||
|  |     E.g. HANDLER_SFTP_HOST=foo → {"host": "foo"}, HANDLER_SFTP_PORT=2222 → {"port": 2222} | ||||||
|  |     """ | ||||||
|  |     prefix = f"HANDLER_{handler_name.upper()}_" | ||||||
|  |     kwargs = {} | ||||||
|  |  | ||||||
|  |     for env_key, val in os.environ.items(): | ||||||
|  |         if not env_key.startswith(prefix): | ||||||
|  |             continue | ||||||
|  |         param = env_key[len(prefix) :].lower() | ||||||
|  |         # try to cast ints | ||||||
|  |         if val.isdigit(): | ||||||
|  |             val = int(val) | ||||||
|  |         kwargs[param] = val | ||||||
|  |  | ||||||
|  |     return kwargs | ||||||
|  |  | ||||||
|  | async def main(): | ||||||
|  |     logging.basicConfig(level=logging.DEBUG) | ||||||
|  |  | ||||||
|  |     # 1. Which handler to load? | ||||||
|  |     handler_name = os.getenv("HANDLER") | ||||||
|  |     if not handler_name: | ||||||
|  |         print("Error: set ENV HANDLER (e.g. 'filesystem')") | ||||||
|  |         sys.exit(1) | ||||||
|  |  | ||||||
|  |     # 2. Dynamic import | ||||||
|  |     module_path = f"output_handlers.{handler_name}" | ||||||
|  |     try: | ||||||
|  |         mod = importlib.import_module(module_path) | ||||||
|  |     except ImportError as e: | ||||||
|  |         print(f"Error loading handler module {module_path}: {e}") | ||||||
|  |         sys.exit(1) | ||||||
|  |  | ||||||
|  |     # 3. Find the class: e.g. "sftp" → "SftpHandler" | ||||||
|  |     class_name = handler_name.title().replace("_", "") + "Handler" | ||||||
|  |     if not hasattr(mod, class_name): | ||||||
|  |         print(f"{module_path} defines no class {class_name}") | ||||||
|  |         sys.exit(1) | ||||||
|  |     HandlerCls = getattr(mod, class_name) | ||||||
|  |  | ||||||
|  |     # 4. Build kwargs from ENV | ||||||
|  |     handler_kwargs = gather_handler_kwargs(handler_name) | ||||||
|  |  | ||||||
|  |     # 5. Instantiate | ||||||
|  |     handler = HandlerCls(**handler_kwargs) | ||||||
|  |  | ||||||
|  |     # 6. Prepare parser | ||||||
|     parser = WikivoyageParser() |     parser = WikivoyageParser() | ||||||
|      |  | ||||||
|     # Get input directory from command line or use current directory |     # 7. Which dir to walk? | ||||||
|     input_dir = Path(sys.argv[1] if len(sys.argv) > 1 else '.') |     input_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".") | ||||||
|      |     txt_files = list(input_dir.rglob("*.txt")) | ||||||
|     # Process all .txt files recursively |  | ||||||
|     for txt_file in input_dir.rglob('*.txt'): |     if not txt_files: | ||||||
|         print(f"Processing {txt_file}") |         print(f"No .txt files found under {input_dir}") | ||||||
|         process_file(txt_file, parser) |     else: | ||||||
|          |         for txt in txt_files: | ||||||
|     print("Processing complete") |             await process_file(txt, parser, handler) | ||||||
|  |  | ||||||
|  |     # 7) read concurrency setting | ||||||
|  |     try: | ||||||
|  |         max_conc = int(os.getenv("MAX_CONCURRENT", "0")) | ||||||
|  |     except ValueError: | ||||||
|  |         print("Error: MAX_CONCURRENT must be an integer") | ||||||
|  |         sys.exit(1) | ||||||
|  |  | ||||||
|  |     if max_conc < 0: | ||||||
|  |         print("Error: MAX_CONCURRENT must be >= 0") | ||||||
|  |         sys.exit(1) | ||||||
|  |  | ||||||
|  |     # 8) schedule tasks | ||||||
|  |     if max_conc == 0: | ||||||
|  |         # unbounded | ||||||
|  |         tasks = [ | ||||||
|  |             asyncio.create_task(process_file(txt, parser, handler)) | ||||||
|  |             for txt in txt_files | ||||||
|  |         ] | ||||||
|  |     else: | ||||||
|  |         # bounded by semaphore | ||||||
|  |         sem = asyncio.Semaphore(max_conc) | ||||||
|  |  | ||||||
|  |         async def bounded(txt): | ||||||
|  |             async with sem: | ||||||
|  |                 return await process_file(txt, parser, handler) | ||||||
|  |  | ||||||
|  |         tasks = [ | ||||||
|  |             asyncio.create_task(bounded(txt)) | ||||||
|  |             for txt in txt_files | ||||||
|  |         ] | ||||||
|  |  | ||||||
|  |     # 9) run them all | ||||||
|  |     await asyncio.gather(*tasks) | ||||||
|  |  | ||||||
|  |     print("All done.") | ||||||
|  |  | ||||||
|  |  | ||||||
| if __name__ == "__main__": | if __name__ == "__main__": | ||||||
|     main() |     asyncio.run(main()) | ||||||
							
								
								
									
										12
									
								
								uv.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										12
									
								
								uv.lock
									
									
									
										generated
									
									
									
								
							| @@ -1,5 +1,4 @@ | |||||||
| version = 1 | version = 1 | ||||||
| revision = 1 |  | ||||||
| requires-python = ">=3.12" | requires-python = ">=3.12" | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| @@ -18,6 +17,7 @@ source = { virtual = "." } | |||||||
| dependencies = [ | dependencies = [ | ||||||
|     { name = "aiofiles" }, |     { name = "aiofiles" }, | ||||||
|     { name = "mwparserfromhell" }, |     { name = "mwparserfromhell" }, | ||||||
|  |     { name = "python-dotenv" }, | ||||||
|     { name = "wikitextparser" }, |     { name = "wikitextparser" }, | ||||||
| ] | ] | ||||||
|  |  | ||||||
| @@ -25,6 +25,7 @@ dependencies = [ | |||||||
| requires-dist = [ | requires-dist = [ | ||||||
|     { name = "aiofiles", specifier = ">=24.1.0" }, |     { name = "aiofiles", specifier = ">=24.1.0" }, | ||||||
|     { name = "mwparserfromhell", specifier = ">=0.6.6" }, |     { name = "mwparserfromhell", specifier = ">=0.6.6" }, | ||||||
|  |     { name = "python-dotenv", specifier = ">=1.1.0" }, | ||||||
|     { name = "wikitextparser", specifier = ">=0.56.3" }, |     { name = "wikitextparser", specifier = ">=0.56.3" }, | ||||||
| ] | ] | ||||||
|  |  | ||||||
| @@ -41,6 +42,15 @@ wheels = [ | |||||||
|     { url = "https://files.pythonhosted.org/packages/78/09/24c2f37524a3ebc3574975766748c7e4423ecefaa815c9fc4a324cbcf94a/mwparserfromhell-0.6.6-cp312-cp312-win_amd64.whl", hash = "sha256:cdc46c115b2495d4025920b7b30a6885a96d2b797ccc4009bf3cc02940ae55d3", size = 101071 }, |     { url = "https://files.pythonhosted.org/packages/78/09/24c2f37524a3ebc3574975766748c7e4423ecefaa815c9fc4a324cbcf94a/mwparserfromhell-0.6.6-cp312-cp312-win_amd64.whl", hash = "sha256:cdc46c115b2495d4025920b7b30a6885a96d2b797ccc4009bf3cc02940ae55d3", size = 101071 }, | ||||||
| ] | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "python-dotenv" | ||||||
|  | version = "1.1.0" | ||||||
|  | source = { registry = "https://pypi.org/simple" } | ||||||
|  | sdist = { url = "https://files.pythonhosted.org/packages/88/2c/7bb1416c5620485aa793f2de31d3df393d3686aa8a8506d11e10e13c5baf/python_dotenv-1.1.0.tar.gz", hash = "sha256:41f90bc6f5f177fb41f53e87666db362025010eb28f60a01c9143bfa33a2b2d5", size = 39920 } | ||||||
|  | wheels = [ | ||||||
|  |     { url = "https://files.pythonhosted.org/packages/1e/18/98a99ad95133c6a6e2005fe89faedf294a748bd5dc803008059409ac9b1e/python_dotenv-1.1.0-py3-none-any.whl", hash = "sha256:d7c01d9e2293916c18baf562d95698754b0dbbb5e74d457c45d4f6561fb9d55d", size = 20256 }, | ||||||
|  | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "regex" | name = "regex" | ||||||
| version = "2024.11.6" | version = "2024.11.6" | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Bruce Röttgers
					Bruce Röttgers