Merge pull request #8 from bcye/feature/modular-output-handlers

Basic setup of modular output handlers
This commit is contained in:
Bruce 2025-04-19 20:19:27 +02:00 committed by GitHub
commit 6548f2196d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 242 additions and 33 deletions

1
.gitignore vendored
View File

@ -11,3 +11,4 @@ wheels/
.env
node_modules
output

View File

@ -0,0 +1,2 @@
from .base_handler import BaseHandler
from .filesystem import FilesystemHandler

View File

@ -0,0 +1,54 @@
"""Reference handler for output handlers."""
from abc import ABC, abstractmethod
import logging
class BaseHandler(ABC):
    """
    Abstract base class for output handlers.

    Defines the standardized interface that all output handlers must
    implement. In particular, it requires the implementation of an
    asynchronous ("private") method `_write_entry` that writes a single
    entry to the output and reports success via its return value.
    """

    # Shared logger for all handler implementations.
    logger = logging.getLogger(__name__)

    def __init__(self, fail_on_error: bool = True, **kwargs):
        """
        Initializes the BaseHandler with optional parameters.

        Args:
            fail_on_error (bool): If True, the handler will raise an
                exception when a write fails. Defaults to True.
            **kwargs: Additional keyword arguments for specific handler
                implementations.
        """
        self.fail_on_error = fail_on_error

    @abstractmethod
    async def _write_entry(self, entry: dict, uid: str) -> bool:
        """
        Asynchronously writes a single entry to the output.

        Implementations should gracefully handle any exceptions that may
        occur during the writing process and simply return False if an
        error occurs.

        Args:
            entry (dict): The entry to write (will be JSON-encoded).
            uid (str): The unique identifier for the entry. The default id
                provided by wikivoyage is recommended.

        Returns:
            bool: True if the entry was written successfully, False otherwise.
        """
        pass

    async def write_entry(self, entry: dict, uid: str) -> None:
        """
        Public method to write an entry to the output.

        Delegates to `_write_entry`, logs the outcome, and optionally
        escalates failures.

        Args:
            entry (dict): The entry to write (will be JSON-encoded).
            uid (str): The unique identifier for the entry. The default id
                provided by wikivoyage is recommended.

        Raises:
            RuntimeError: If the write fails and `fail_on_error` is True.
        """
        success = await self._write_entry(entry, uid)
        if success:
            # Lazy %-args: the message is only built when DEBUG is enabled.
            self.logger.debug("Successfully wrote entry with UID %s", uid)
        else:
            self.logger.error("Failed to write entry with UID %s", uid)
            if self.fail_on_error:
                # RuntimeError instead of bare Exception; callers catching
                # Exception still see it, so this stays backward-compatible.
                raise RuntimeError(f"Failed to write entry with UID {uid}")

View File

@ -0,0 +1,44 @@
"""Handler that writes files to the filesystem."""
from pathlib import Path
import aiofiles
from .base_handler import BaseHandler
import json
class FilesystemHandler(BaseHandler):
    """
    Handler that writes entries as individual JSON files to the filesystem.
    """

    def __init__(self, output_dir: str, **kwargs):
        """
        Initializes the FilesystemHandler with the specified output directory.

        Args:
            output_dir (str): The directory where files will be written.
            **kwargs: Additional keyword arguments for the BaseHandler.
        """
        super().__init__(**kwargs)
        self.output_dir = Path(output_dir)
        # Ensure the target directory exists
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.logger.info("Output directory set to %s", self.output_dir)

    async def _write_entry(self, entry: dict, uid: str) -> bool:
        """
        Asynchronously writes a single entry to the filesystem as <uid>.json.

        Args:
            entry (dict): The entry to write (will be JSON-encoded).
            uid (str): The unique identifier for the entry.

        Returns:
            bool: True if the entry was written successfully, False otherwise.
        """
        try:
            file_path = self.output_dir / f"{uid}.json"
            # Explicit UTF-8 so output does not depend on the platform's
            # default locale encoding.
            async with aiofiles.open(file_path, 'w', encoding="utf-8") as f:
                await f.write(json.dumps(entry))
            return True
        except (OSError, TypeError, ValueError) as e:
            # OSError covers filesystem failures; TypeError/ValueError cover
            # entries json cannot serialize. Per the BaseHandler contract,
            # all errors are reported by returning False (the previous
            # IOError-only clause let serialization errors escape).
            self.logger.error("Error writing entry %s: %s", uid, e)
            return False

View File

@ -5,6 +5,8 @@ description = "Add your description here"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"aiofiles>=24.1.0",
"mwparserfromhell>=0.6.6",
"python-dotenv>=1.1.0",
"wikitextparser>=0.56.3",
]

View File

@ -6,6 +6,11 @@ from typing import Dict, List, Any, Optional, Union, Tuple
import os
from pathlib import Path
import sys
import asyncio
import importlib
import logging
from dotenv import load_dotenv
load_dotenv()
class WikivoyageParser:
def __init__(self):
@ -342,41 +347,120 @@ class WikivoyageParser:
return json.dumps(root, indent=indent)
async def process_file(
    input_file: Path,
    parser: "WikivoyageParser",
    handler,
) -> None:
    """
    Parse one wiki file and hand the resulting entry off to our handler.

    Uses the filename (sans suffix) as the unique UID.

    Args:
        input_file (Path): Path of the .txt wiki file to parse.
        parser: WikivoyageParser used to convert the wikitext to a dict.
        handler: Output handler whose async `write_entry(entry, uid)`
            persists the parsed entry.
    """
    text = input_file.read_text(encoding="utf-8")
    entry = parser.parse(text)  # assume returns a dict
    uid = input_file.stem  # filename without suffix serves as the UID
    await handler.write_entry(entry, uid)
def gather_handler_kwargs(handler_name: str) -> dict:
    """
    Collect constructor kwargs for a handler from environment variables.

    Every variable named HANDLER_<NAME>_<PARAM> becomes a keyword argument,
    e.g. HANDLER_SFTP_HOST=foo yields {"host": "foo"} and
    HANDLER_SFTP_PORT=2222 yields {"port": 2222}.
    """
    env_prefix = f"HANDLER_{handler_name.upper()}_"
    collected: dict = {}
    matching = (
        (key, value)
        for key, value in os.environ.items()
        if key.startswith(env_prefix)
    )
    for key, raw in matching:
        param_name = key[len(env_prefix):].lower()
        # Purely numeric values are cast to int; everything else stays a str.
        collected[param_name] = int(raw) if raw.isdigit() else raw
    return collected
async def main():
    """
    Entry point: load the configured output handler, parse every .txt file
    under the input directory, and write the results through the handler.

    Configuration comes from the environment:
        HANDLER           name of the handler module (e.g. 'filesystem')
        HANDLER_<NAME>_*  keyword arguments for the handler constructor
        MAX_CONCURRENT    max files processed at once (0 = unbounded)
    """
    logging.basicConfig(level=logging.DEBUG)

    # 1. Which handler to load?
    handler_name = os.getenv("HANDLER")
    if not handler_name:
        print("Error: set ENV HANDLER (e.g. 'filesystem')")
        sys.exit(1)

    # 2. Dynamic import of the handler module
    module_path = f"output_handlers.{handler_name}"
    try:
        mod = importlib.import_module(module_path)
    except ImportError as e:
        print(f"Error loading handler module {module_path}: {e}")
        sys.exit(1)

    # 3. Find the class: e.g. "sftp" -> "SftpHandler"
    class_name = handler_name.title().replace("_", "") + "Handler"
    if not hasattr(mod, class_name):
        print(f"{module_path} defines no class {class_name}")
        sys.exit(1)
    HandlerCls = getattr(mod, class_name)

    # 4. Build kwargs from ENV
    handler_kwargs = gather_handler_kwargs(handler_name)

    # 5. Instantiate
    handler = HandlerCls(**handler_kwargs)

    # 6. Prepare parser
    parser = WikivoyageParser()

    # 7. Which dir to walk?
    input_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")
    txt_files = list(input_dir.rglob("*.txt"))
    if not txt_files:
        print(f"No .txt files found under {input_dir}")
        return

    # 8. Read concurrency setting
    try:
        max_conc = int(os.getenv("MAX_CONCURRENT", "0"))
    except ValueError:
        print("Error: MAX_CONCURRENT must be an integer")
        sys.exit(1)
    if max_conc < 0:
        print("Error: MAX_CONCURRENT must be >= 0")
        sys.exit(1)

    # 9. Schedule tasks — each file is processed exactly once, either fully
    # concurrently or bounded by a semaphore.
    if max_conc == 0:
        # unbounded
        tasks = [
            asyncio.create_task(process_file(txt, parser, handler))
            for txt in txt_files
        ]
    else:
        # bounded by semaphore
        sem = asyncio.Semaphore(max_conc)

        async def bounded(txt):
            async with sem:
                return await process_file(txt, parser, handler)

        tasks = [
            asyncio.create_task(bounded(txt))
            for txt in txt_files
        ]

    # 10. Run them all
    await asyncio.gather(*tasks)
    print("All done.")


if __name__ == "__main__":
    asyncio.run(main())

22
uv.lock generated
View File

@ -1,18 +1,31 @@
version = 1
requires-python = ">=3.12"
[[package]]
name = "aiofiles"
version = "24.1.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/0b/03/a88171e277e8caa88a4c77808c20ebb04ba74cc4681bf1e9416c862de237/aiofiles-24.1.0.tar.gz", hash = "sha256:22a075c9e5a3810f0c2e48f3008c94d68c65d763b9b03857924c99e57355166c", size = 30247 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a5/45/30bb92d442636f570cb5651bc661f52b610e2eec3f891a5dc3a4c3667db0/aiofiles-24.1.0-py3-none-any.whl", hash = "sha256:b4ec55f4195e3eb5d7abd1bf7e061763e864dd4954231fb8539a0ef8bb8260e5", size = 15896 },
]
[[package]]
name = "mapvoyage-extract"
version = "0.1.0"
source = { virtual = "." }
dependencies = [
{ name = "aiofiles" },
{ name = "mwparserfromhell" },
{ name = "python-dotenv" },
{ name = "wikitextparser" },
]
[package.metadata]
requires-dist = [
{ name = "aiofiles", specifier = ">=24.1.0" },
{ name = "mwparserfromhell", specifier = ">=0.6.6" },
{ name = "python-dotenv", specifier = ">=1.1.0" },
{ name = "wikitextparser", specifier = ">=0.56.3" },
]
@ -29,6 +42,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/78/09/24c2f37524a3ebc3574975766748c7e4423ecefaa815c9fc4a324cbcf94a/mwparserfromhell-0.6.6-cp312-cp312-win_amd64.whl", hash = "sha256:cdc46c115b2495d4025920b7b30a6885a96d2b797ccc4009bf3cc02940ae55d3", size = 101071 },
]
[[package]]
name = "python-dotenv"
version = "1.1.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/88/2c/7bb1416c5620485aa793f2de31d3df393d3686aa8a8506d11e10e13c5baf/python_dotenv-1.1.0.tar.gz", hash = "sha256:41f90bc6f5f177fb41f53e87666db362025010eb28f60a01c9143bfa33a2b2d5", size = 39920 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/1e/18/98a99ad95133c6a6e2005fe89faedf294a748bd5dc803008059409ac9b1e/python_dotenv-1.1.0-py3-none-any.whl", hash = "sha256:d7c01d9e2293916c18baf562d95698754b0dbbb5e74d457c45d4f6561fb9d55d", size = 20256 },
]
[[package]]
name = "regex"
version = "2024.11.6"