add bounding, env var config

This commit is contained in:
Bruce Röttgers 2025-04-18 13:35:16 +02:00
parent 3bfd30a073
commit e0aa134ecc
6 changed files with 134 additions and 37 deletions

1
.gitignore vendored
View File

@ -11,3 +11,4 @@ wheels/
.env .env
node_modules node_modules
output

View File

@ -1,2 +1,2 @@
from .base_handler import BaseHandler from .base_handler import BaseHandler
from .filesystm_handler import FileSystemHandler from .filesystem import FilesystemHandler

View File

@ -2,8 +2,9 @@
from pathlib import Path from pathlib import Path
import aiofiles import aiofiles
from .base_handler import BaseHandler from .base_handler import BaseHandler
import json
class FileSystemHandler(BaseHandler): class FilesystemHandler(BaseHandler):
""" """
Handler that writes files to the filesystem. Handler that writes files to the filesystem.
""" """
@ -36,7 +37,7 @@ class FileSystemHandler(BaseHandler):
try: try:
file_path = self.output_dir / f"{uid}.json" file_path = self.output_dir / f"{uid}.json"
async with aiofiles.open(file_path, 'w') as f: async with aiofiles.open(file_path, 'w') as f:
await f.write(entry) await f.write(json.dumps(entry))
return True return True
except IOError as e: except IOError as e:
self.logger.error(f"Error writing entry {uid}: {e}") self.logger.error(f"Error writing entry {uid}: {e}")

View File

@ -7,5 +7,6 @@ requires-python = ">=3.12"
dependencies = [ dependencies = [
"aiofiles>=24.1.0", "aiofiles>=24.1.0",
"mwparserfromhell>=0.6.6", "mwparserfromhell>=0.6.6",
"python-dotenv>=1.1.0",
"wikitextparser>=0.56.3", "wikitextparser>=0.56.3",
] ]

View File

@ -6,6 +6,11 @@ from typing import Dict, List, Any, Optional, Union, Tuple
import os import os
from pathlib import Path from pathlib import Path
import sys import sys
import asyncio
import importlib
import logging
from dotenv import load_dotenv
load_dotenv()
class WikivoyageParser: class WikivoyageParser:
def __init__(self): def __init__(self):
@ -342,41 +347,120 @@ class WikivoyageParser:
return json.dumps(root, indent=indent) return json.dumps(root, indent=indent)
def process_file(input_file: Path, parser: WikivoyageParser) -> None: async def process_file(
"""Process a single wiki file and save JSON output""" input_file: Path,
# Create output path with .json extension parser: WikivoyageParser,
output_file = input_file.with_suffix('.json') handler,
) -> None:
"""
Parse one wiki file and hand the resulting entry off to our handler.
Uses the filename (sans suffix) as the unique UID.
"""
# Ensure output directory exists text = input_file.read_text(encoding="utf-8")
output_file.parent.mkdir(parents=True, exist_ok=True) entry = parser.parse(text) # assume returns a dict
uid = input_file.stem
await handler.write_entry(entry, uid)
def gather_handler_kwargs(handler_name: str) -> dict:
    """
    Collect handler constructor kwargs from the environment.

    Every environment variable named ``HANDLER_<NAME>_<PARAM>`` (where
    ``<NAME>`` is ``handler_name`` upper-cased) becomes one entry whose key
    is ``<PARAM>`` lower-cased.  E.g. ``HANDLER_SFTP_HOST=foo`` ->
    ``{"host": "foo"}`` and ``HANDLER_SFTP_PORT=2222`` -> ``{"port": 2222}``.

    Values made up solely of ASCII decimal digits are converted to ``int``;
    all other values are passed through unchanged as strings.

    Args:
        handler_name: Handler identifier used to build the variable prefix.

    Returns:
        A dict mapping lower-cased parameter names to ``str`` or ``int``
        values.
    """
    prefix = f"HANDLER_{handler_name.upper()}_"
    kwargs = {}
    for env_key, val in os.environ.items():
        if not env_key.startswith(prefix):
            continue
        param = env_key.removeprefix(prefix).lower()
        if not param:
            # A variable equal to the bare prefix names no parameter; an
            # empty keyword name could never be accepted by a constructor.
            continue
        # str.isdigit() alone also accepts characters such as superscript
        # digits, which int() rejects with ValueError; restrict to ASCII.
        if val.isascii() and val.isdigit():
            val = int(val)
        kwargs[param] = val
    return kwargs
async def main():
logging.basicConfig(level=logging.DEBUG)
# 1. Which handler to load?
handler_name = os.getenv("HANDLER")
if not handler_name:
print("Error: set ENV HANDLER (e.g. 'filesystem')")
sys.exit(1)
# 2. Dynamic import
module_path = f"output_handlers.{handler_name}"
try: try:
# Read and parse input file mod = importlib.import_module(module_path)
with open(input_file, 'r', encoding='utf-8') as f: except ImportError as e:
wikitext = f.read() print(f"Error loading handler module {module_path}: {e}")
sys.exit(1)
result = parser.parse(wikitext) # 3. Find the class: e.g. "sftp" → "SftpHandler"
class_name = handler_name.title().replace("_", "") + "Handler"
if not hasattr(mod, class_name):
print(f"{module_path} defines no class {class_name}")
sys.exit(1)
HandlerCls = getattr(mod, class_name)
# Write JSON output # 4. Build kwargs from ENV
with open(output_file, 'w', encoding='utf-8') as f: handler_kwargs = gather_handler_kwargs(handler_name)
f.write(parser.export_json())
except Exception as e: # 5. Instantiate
print(f"Error processing {input_file}: {e}") handler = HandlerCls(**handler_kwargs)
def main(): # 6. Prepare parser
# Initialize parser once for reuse
parser = WikivoyageParser() parser = WikivoyageParser()
# Get input directory from command line or use current directory # 7. Which dir to walk?
input_dir = Path(sys.argv[1] if len(sys.argv) > 1 else '.') input_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")
txt_files = list(input_dir.rglob("*.txt"))
# Process all .txt files recursively if not txt_files:
for txt_file in input_dir.rglob('*.txt'): print(f"No .txt files found under {input_dir}")
print(f"Processing {txt_file}") else:
process_file(txt_file, parser) for txt in txt_files:
await process_file(txt, parser, handler)
# 7) read concurrency setting
try:
max_conc = int(os.getenv("MAX_CONCURRENT", "0"))
except ValueError:
print("Error: MAX_CONCURRENT must be an integer")
sys.exit(1)
if max_conc < 0:
print("Error: MAX_CONCURRENT must be >= 0")
sys.exit(1)
# 8) schedule tasks
if max_conc == 0:
# unbounded
tasks = [
asyncio.create_task(process_file(txt, parser, handler))
for txt in txt_files
]
else:
# bounded by semaphore
sem = asyncio.Semaphore(max_conc)
async def bounded(txt):
async with sem:
return await process_file(txt, parser, handler)
tasks = [
asyncio.create_task(bounded(txt))
for txt in txt_files
]
# 9) run them all
await asyncio.gather(*tasks)
print("All done.")
print("Processing complete")
if __name__ == "__main__": if __name__ == "__main__":
main() asyncio.run(main())

12
uv.lock generated
View File

@ -1,5 +1,4 @@
version = 1 version = 1
revision = 1
requires-python = ">=3.12" requires-python = ">=3.12"
[[package]] [[package]]
@ -18,6 +17,7 @@ source = { virtual = "." }
dependencies = [ dependencies = [
{ name = "aiofiles" }, { name = "aiofiles" },
{ name = "mwparserfromhell" }, { name = "mwparserfromhell" },
{ name = "python-dotenv" },
{ name = "wikitextparser" }, { name = "wikitextparser" },
] ]
@ -25,6 +25,7 @@ dependencies = [
requires-dist = [ requires-dist = [
{ name = "aiofiles", specifier = ">=24.1.0" }, { name = "aiofiles", specifier = ">=24.1.0" },
{ name = "mwparserfromhell", specifier = ">=0.6.6" }, { name = "mwparserfromhell", specifier = ">=0.6.6" },
{ name = "python-dotenv", specifier = ">=1.1.0" },
{ name = "wikitextparser", specifier = ">=0.56.3" }, { name = "wikitextparser", specifier = ">=0.56.3" },
] ]
@ -41,6 +42,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/78/09/24c2f37524a3ebc3574975766748c7e4423ecefaa815c9fc4a324cbcf94a/mwparserfromhell-0.6.6-cp312-cp312-win_amd64.whl", hash = "sha256:cdc46c115b2495d4025920b7b30a6885a96d2b797ccc4009bf3cc02940ae55d3", size = 101071 }, { url = "https://files.pythonhosted.org/packages/78/09/24c2f37524a3ebc3574975766748c7e4423ecefaa815c9fc4a324cbcf94a/mwparserfromhell-0.6.6-cp312-cp312-win_amd64.whl", hash = "sha256:cdc46c115b2495d4025920b7b30a6885a96d2b797ccc4009bf3cc02940ae55d3", size = 101071 },
] ]
[[package]]
name = "python-dotenv"
version = "1.1.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/88/2c/7bb1416c5620485aa793f2de31d3df393d3686aa8a8506d11e10e13c5baf/python_dotenv-1.1.0.tar.gz", hash = "sha256:41f90bc6f5f177fb41f53e87666db362025010eb28f60a01c9143bfa33a2b2d5", size = 39920 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/1e/18/98a99ad95133c6a6e2005fe89faedf294a748bd5dc803008059409ac9b1e/python_dotenv-1.1.0-py3-none-any.whl", hash = "sha256:d7c01d9e2293916c18baf562d95698754b0dbbb5e74d457c45d4f6561fb9d55d", size = 20256 },
]
[[package]] [[package]]
name = "regex" name = "regex"
version = "2024.11.6" version = "2024.11.6"