diff --git a/.gitignore b/.gitignore index a1b941e..056384b 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ wheels/ .env node_modules +output \ No newline at end of file diff --git a/output_handlers/__init__.py b/output_handlers/__init__.py index 66ca110..4ee3e0b 100644 --- a/output_handlers/__init__.py +++ b/output_handlers/__init__.py @@ -1,2 +1,2 @@ from .base_handler import BaseHandler -from .filesystm_handler import FileSystemHandler \ No newline at end of file +from .filesystem import FilesystemHandler \ No newline at end of file diff --git a/output_handlers/filesystm_handler.py b/output_handlers/filesystem.py similarity index 93% rename from output_handlers/filesystm_handler.py rename to output_handlers/filesystem.py index 2adeeb7..855348b 100644 --- a/output_handlers/filesystm_handler.py +++ b/output_handlers/filesystem.py @@ -2,8 +2,9 @@ from pathlib import Path import aiofiles from .base_handler import BaseHandler +import json -class FileSystemHandler(BaseHandler): +class FilesystemHandler(BaseHandler): """ Handler that writes files to the filesystem. """ @@ -36,7 +37,7 @@ class FileSystemHandler(BaseHandler): try: file_path = self.output_dir / f"{uid}.json" async with aiofiles.open(file_path, 'w') as f: - await f.write(entry) + await f.write(json.dumps(entry)) return True except IOError as e: self.logger.error(f"Error writing entry {uid}: {e}") diff --git a/pyproject.toml b/pyproject.toml index c9589f2..b91e9cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,5 +7,6 @@ requires-python = ">=3.12" dependencies = [ "aiofiles>=24.1.0", "mwparserfromhell>=0.6.6", + "python-dotenv>=1.1.0", "wikitextparser>=0.56.3", ] diff --git a/transform-documents.py b/transform-documents.py index 3cf4353..b4a8be8 100644 --- a/transform-documents.py +++ b/transform-documents.py @@ -6,6 +6,11 @@ from typing import Dict, List, Any, Optional, Union, Tuple import os from pathlib import Path import sys +import asyncio +import importlib +import logging +from dotenv import load_dotenv +load_dotenv() class WikivoyageParser: def __init__(self): @@ -342,41 +347,120 @@ class WikivoyageParser: return json.dumps(root, indent=indent) -def process_file(input_file: Path, parser: WikivoyageParser) -> None: - """Process a single wiki file and save JSON output""" - # Create output path with .json extension - output_file = input_file.with_suffix('.json') +async def process_file( + input_file: Path, + parser: WikivoyageParser, + handler, +) -> None: + """ + Parse one wiki file and hand the resulting entry off to our handler. + Uses the filename (sans suffix) as the unique UID. + """ - # Ensure output directory exists - output_file.parent.mkdir(parents=True, exist_ok=True) - - try: - # Read and parse input file - with open(input_file, 'r', encoding='utf-8') as f: - wikitext = f.read() - - result = parser.parse(wikitext) - - # Write JSON output - with open(output_file, 'w', encoding='utf-8') as f: - f.write(parser.export_json()) - - except Exception as e: - print(f"Error processing {input_file}: {e}") + text = input_file.read_text(encoding="utf-8") + entry = parser.parse(text) # assume returns a dict + uid = input_file.stem -def main(): - # Initialize parser once for reuse + await handler.write_entry(entry, uid) + +def gather_handler_kwargs(handler_name: str) -> dict: + """ + Find all ENV vars starting with HANDLER__ and turn them into kwargs. + E.g. HANDLER_SFTP_HOST=foo → {"host": "foo"}, HANDLER_SFTP_PORT=2222 → {"port": 2222} + """ + prefix = f"HANDLER_{handler_name.upper()}_" + kwargs = {} + + for env_key, val in os.environ.items(): + if not env_key.startswith(prefix): + continue + param = env_key[len(prefix) :].lower() + # try to cast ints + if val.isdigit(): + val = int(val) + kwargs[param] = val + + return kwargs + +async def main(): + logging.basicConfig(level=logging.DEBUG) + + # 1. Which handler to load? + handler_name = os.getenv("HANDLER") + if not handler_name: + print("Error: set ENV HANDLER (e.g. 'filesystem')") + sys.exit(1) + + # 2. Dynamic import + module_path = f"output_handlers.{handler_name}" + try: + mod = importlib.import_module(module_path) + except ImportError as e: + print(f"Error loading handler module {module_path}: {e}") + sys.exit(1) + + # 3. Find the class: e.g. "sftp" → "SftpHandler" + class_name = handler_name.title().replace("_", "") + "Handler" + if not hasattr(mod, class_name): + print(f"{module_path} defines no class {class_name}") + sys.exit(1) + HandlerCls = getattr(mod, class_name) + + # 4. Build kwargs from ENV + handler_kwargs = gather_handler_kwargs(handler_name) + + # 5. Instantiate + handler = HandlerCls(**handler_kwargs) + + # 6. Prepare parser parser = WikivoyageParser() - - # Get input directory from command line or use current directory - input_dir = Path(sys.argv[1] if len(sys.argv) > 1 else '.') - - # Process all .txt files recursively - for txt_file in input_dir.rglob('*.txt'): - print(f"Processing {txt_file}") - process_file(txt_file, parser) - - print("Processing complete") + + # 7. Which dir to walk? + input_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".") + txt_files = list(input_dir.rglob("*.txt")) + + if not txt_files: + print(f"No .txt files found under {input_dir}") + else: + for txt in txt_files: + await process_file(txt, parser, handler) + + # 7) read concurrency setting + try: + max_conc = int(os.getenv("MAX_CONCURRENT", "0")) + except ValueError: + print("Error: MAX_CONCURRENT must be an integer") + sys.exit(1) + + if max_conc < 0: + print("Error: MAX_CONCURRENT must be >= 0") + sys.exit(1) + + # 8) schedule tasks + if max_conc == 0: + # unbounded + tasks = [ + asyncio.create_task(process_file(txt, parser, handler)) + for txt in txt_files + ] + else: + # bounded by semaphore + sem = asyncio.Semaphore(max_conc) + + async def bounded(txt): + async with sem: + return await process_file(txt, parser, handler) + + tasks = [ + asyncio.create_task(bounded(txt)) + for txt in txt_files + ] + + # 9) run them all + await asyncio.gather(*tasks) + + print("All done.") + if __name__ == "__main__": - main() \ No newline at end of file + asyncio.run(main()) \ No newline at end of file diff --git a/uv.lock b/uv.lock index 011f9a5..309463a 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,4 @@ version = 1 -revision = 1 requires-python = ">=3.12" [[package]] @@ -18,6 +17,7 @@ source = { virtual = "." } dependencies = [ { name = "aiofiles" }, { name = "mwparserfromhell" }, + { name = "python-dotenv" }, { name = "wikitextparser" }, ] @@ -25,6 +25,7 @@ dependencies = [ requires-dist = [ { name = "aiofiles", specifier = ">=24.1.0" }, { name = "mwparserfromhell", specifier = ">=0.6.6" }, + { name = "python-dotenv", specifier = ">=1.1.0" }, { name = "wikitextparser", specifier = ">=0.56.3" }, ] @@ -41,6 +42,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/78/09/24c2f37524a3ebc3574975766748c7e4423ecefaa815c9fc4a324cbcf94a/mwparserfromhell-0.6.6-cp312-cp312-win_amd64.whl", hash = "sha256:cdc46c115b2495d4025920b7b30a6885a96d2b797ccc4009bf3cc02940ae55d3", size = 101071 }, ] +[[package]] +name = "python-dotenv" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/88/2c/7bb1416c5620485aa793f2de31d3df393d3686aa8a8506d11e10e13c5baf/python_dotenv-1.1.0.tar.gz", hash = "sha256:41f90bc6f5f177fb41f53e87666db362025010eb28f60a01c9143bfa33a2b2d5", size = 39920 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/18/98a99ad95133c6a6e2005fe89faedf294a748bd5dc803008059409ac9b1e/python_dotenv-1.1.0-py3-none-any.whl", hash = "sha256:d7c01d9e2293916c18baf562d95698754b0dbbb5e74d457c45d4f6561fb9d55d", size = 20256 }, +] + [[package]] name = "regex" version = "2024.11.6"