add bounding, env var config

This commit is contained in:
Bruce Röttgers 2025-04-18 13:35:16 +02:00
parent 3bfd30a073
commit e0aa134ecc
6 changed files with 134 additions and 37 deletions

1
.gitignore vendored

@ -11,3 +11,4 @@ wheels/
.env
node_modules
output

@ -1,2 +1,2 @@
from .base_handler import BaseHandler
from .filesystm_handler import FileSystemHandler
from .filesystem import FilesystemHandler

@ -2,8 +2,9 @@
from pathlib import Path
import aiofiles
from .base_handler import BaseHandler
import json
class FileSystemHandler(BaseHandler):
class FilesystemHandler(BaseHandler):
"""
Handler that writes files to the filesystem.
"""
@ -36,7 +37,7 @@ class FileSystemHandler(BaseHandler):
try:
file_path = self.output_dir / f"{uid}.json"
async with aiofiles.open(file_path, 'w') as f:
await f.write(entry)
await f.write(json.dumps(entry))
return True
except IOError as e:
self.logger.error(f"Error writing entry {uid}: {e}")

@ -7,5 +7,6 @@ requires-python = ">=3.12"
dependencies = [
"aiofiles>=24.1.0",
"mwparserfromhell>=0.6.6",
"python-dotenv>=1.1.0",
"wikitextparser>=0.56.3",
]

@ -6,6 +6,11 @@ from typing import Dict, List, Any, Optional, Union, Tuple
import os
from pathlib import Path
import sys
import asyncio
import importlib
import logging
from dotenv import load_dotenv
load_dotenv()
class WikivoyageParser:
def __init__(self):
@ -342,41 +347,120 @@ class WikivoyageParser:
return json.dumps(root, indent=indent)
def process_file(input_file: Path, parser: WikivoyageParser) -> None:
"""Process a single wiki file and save JSON output"""
# Create output path with .json extension
output_file = input_file.with_suffix('.json')
async def process_file(
input_file: Path,
parser: WikivoyageParser,
handler,
) -> None:
"""
Parse one wiki file and hand the resulting entry off to our handler.
Uses the filename (sans suffix) as the unique UID.
"""
# Ensure output directory exists
output_file.parent.mkdir(parents=True, exist_ok=True)
try:
# Read and parse input file
with open(input_file, 'r', encoding='utf-8') as f:
wikitext = f.read()
result = parser.parse(wikitext)
# Write JSON output
with open(output_file, 'w', encoding='utf-8') as f:
f.write(parser.export_json())
except Exception as e:
print(f"Error processing {input_file}: {e}")
text = input_file.read_text(encoding="utf-8")
entry = parser.parse(text) # assume returns a dict
uid = input_file.stem
def main():
# Initialize parser once for reuse
await handler.write_entry(entry, uid)
def gather_handler_kwargs(handler_name: str) -> dict:
"""
Find all ENV vars starting with HANDLER_<NAME>_ and turn them into kwargs.
E.g. HANDLER_SFTP_HOST=foo {"host": "foo"}, HANDLER_SFTP_PORT=2222 {"port": 2222}
"""
prefix = f"HANDLER_{handler_name.upper()}_"
kwargs = {}
for env_key, val in os.environ.items():
if not env_key.startswith(prefix):
continue
param = env_key[len(prefix) :].lower()
# try to cast ints
if val.isdigit():
val = int(val)
kwargs[param] = val
return kwargs
async def main():
logging.basicConfig(level=logging.DEBUG)
# 1. Which handler to load?
handler_name = os.getenv("HANDLER")
if not handler_name:
print("Error: set ENV HANDLER (e.g. 'filesystem')")
sys.exit(1)
# 2. Dynamic import
module_path = f"output_handlers.{handler_name}"
try:
mod = importlib.import_module(module_path)
except ImportError as e:
print(f"Error loading handler module {module_path}: {e}")
sys.exit(1)
# 3. Find the class: e.g. "sftp" → "SftpHandler"
class_name = handler_name.title().replace("_", "") + "Handler"
if not hasattr(mod, class_name):
print(f"{module_path} defines no class {class_name}")
sys.exit(1)
HandlerCls = getattr(mod, class_name)
# 4. Build kwargs from ENV
handler_kwargs = gather_handler_kwargs(handler_name)
# 5. Instantiate
handler = HandlerCls(**handler_kwargs)
# 6. Prepare parser
parser = WikivoyageParser()
# Get input directory from command line or use current directory
input_dir = Path(sys.argv[1] if len(sys.argv) > 1 else '.')
# Process all .txt files recursively
for txt_file in input_dir.rglob('*.txt'):
print(f"Processing {txt_file}")
process_file(txt_file, parser)
print("Processing complete")
# 7. Which dir to walk?
input_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")
txt_files = list(input_dir.rglob("*.txt"))
if not txt_files:
print(f"No .txt files found under {input_dir}")
else:
for txt in txt_files:
await process_file(txt, parser, handler)
# 7) read concurrency setting
try:
max_conc = int(os.getenv("MAX_CONCURRENT", "0"))
except ValueError:
print("Error: MAX_CONCURRENT must be an integer")
sys.exit(1)
if max_conc < 0:
print("Error: MAX_CONCURRENT must be >= 0")
sys.exit(1)
# 8) schedule tasks
if max_conc == 0:
# unbounded
tasks = [
asyncio.create_task(process_file(txt, parser, handler))
for txt in txt_files
]
else:
# bounded by semaphore
sem = asyncio.Semaphore(max_conc)
async def bounded(txt):
async with sem:
return await process_file(txt, parser, handler)
tasks = [
asyncio.create_task(bounded(txt))
for txt in txt_files
]
# 9) run them all
await asyncio.gather(*tasks)
print("All done.")
if __name__ == "__main__":
main()
asyncio.run(main())

12
uv.lock generated

@ -1,5 +1,4 @@
version = 1
revision = 1
requires-python = ">=3.12"
[[package]]
@ -18,6 +17,7 @@ source = { virtual = "." }
dependencies = [
{ name = "aiofiles" },
{ name = "mwparserfromhell" },
{ name = "python-dotenv" },
{ name = "wikitextparser" },
]
@ -25,6 +25,7 @@ dependencies = [
requires-dist = [
{ name = "aiofiles", specifier = ">=24.1.0" },
{ name = "mwparserfromhell", specifier = ">=0.6.6" },
{ name = "python-dotenv", specifier = ">=1.1.0" },
{ name = "wikitextparser", specifier = ">=0.56.3" },
]
@ -41,6 +42,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/78/09/24c2f37524a3ebc3574975766748c7e4423ecefaa815c9fc4a324cbcf94a/mwparserfromhell-0.6.6-cp312-cp312-win_amd64.whl", hash = "sha256:cdc46c115b2495d4025920b7b30a6885a96d2b797ccc4009bf3cc02940ae55d3", size = 101071 },
]
[[package]]
name = "python-dotenv"
version = "1.1.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/88/2c/7bb1416c5620485aa793f2de31d3df393d3686aa8a8506d11e10e13c5baf/python_dotenv-1.1.0.tar.gz", hash = "sha256:41f90bc6f5f177fb41f53e87666db362025010eb28f60a01c9143bfa33a2b2d5", size = 39920 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/1e/18/98a99ad95133c6a6e2005fe89faedf294a748bd5dc803008059409ac9b1e/python_dotenv-1.1.0-py3-none-any.whl", hash = "sha256:d7c01d9e2293916c18baf562d95698754b0dbbb5e74d457c45d4f6561fb9d55d", size = 20256 },
]
[[package]]
name = "regex"
version = "2024.11.6"