Merge pull request #8 from bcye/feature/modular-output-handlers

Basic setup of modular output handlers
This commit is contained in:
Bruce 2025-04-19 20:19:27 +02:00 committed by GitHub
commit 6548f2196d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 242 additions and 33 deletions

1
.gitignore vendored
View File

@ -11,3 +11,4 @@ wheels/
.env .env
node_modules node_modules
output

View File

@ -0,0 +1,2 @@
from .base_handler import BaseHandler
from .filesystem import FilesystemHandler

View File

@ -0,0 +1,54 @@
"""Reference handler for output handlers."""
from abc import ABC, abstractmethod
import logging
class BaseHandler(ABC):
"""
Abstract base class for output handlers. Defines the standardized interface that all output handlers must implement.
In particular, it requires the implementation of an asynchronous ("private") method `_write_entry` to write a single entry to the output.
"""
logger = logging.getLogger(__name__)
def __init__(self, fail_on_error: bool = True, **kwargs):
"""
Initializes the BaseHandler with optional parameters.
Args:
fail_on_error (bool): If True, the handler will raise an exception on error. Defaults to True.
**kwargs: Additional keyword arguments for specific handler implementations.
"""
self.fail_on_error = fail_on_error
@abstractmethod
async def _write_entry(self, entry: dict, uid: str) -> bool:
"""
Asynchronously writes a single entry to the output. This method should gracefully handle any exceptions that may occur during the writing process and simply return False if an error occurs.
Args:
entry (dict): The entry to write (will be JSON-encoded).
uid (str): The unique identifier for the entry. The default id provided by wikivoyage is recommended.
Returns:
bool: True if the entry was written successfully, False otherwise.
"""
pass
async def write_entry(self, entry: dict, uid: str):
"""
Public method to write an entry to the output. It handles exceptions and logs errors.
Args:
entry (dict): The entry to write (will be JSON-encoded).
uid (str): The unique identifier for the entry. The default id provided by wikivoyage is recommended.
"""
success = await self._write_entry(entry, uid)
if success:
self.logger.debug(f"Successfully wrote entry with UID {uid}")
else:
self.logger.error(f"Failed to write entry with UID {uid}")
if self.fail_on_error:
raise Exception(f"Failed to write entry with UID {uid}")

View File

@ -0,0 +1,44 @@
"""Handler that writes files to the filesystem."""
from pathlib import Path
import aiofiles
from .base_handler import BaseHandler
import json
class FilesystemHandler(BaseHandler):
"""
Handler that writes files to the filesystem.
"""
def __init__(self, output_dir: str, **kwargs):
"""
Initializes the FileSystemHandler with the specified output directory.
Args:
output_dir (str): The directory where files will be written.
**kwargs: Additional keyword arguments for the BaseHandler.
"""
super().__init__(**kwargs)
self.output_dir = Path(output_dir)
# Ensure the target directory exists
self.output_dir.mkdir(parents=True, exist_ok=True)
self.logger.info(f"Output directory set to {self.output_dir}")
async def _write_entry(self, entry: dict, uid: str) -> bool:
"""
Asynchronously writes a single entry to the filesystem.
Args:
entry (dict): The entry to write (will be JSON-encoded).
uid (str): The unique identifier for the entry.
Returns:
bool: True if the entry was written successfully, False otherwise.
"""
try:
file_path = self.output_dir / f"{uid}.json"
async with aiofiles.open(file_path, 'w') as f:
await f.write(json.dumps(entry))
return True
except IOError as e:
self.logger.error(f"Error writing entry {uid}: {e}")
return False

View File

@ -5,6 +5,8 @@ description = "Add your description here"
readme = "README.md" readme = "README.md"
requires-python = ">=3.12" requires-python = ">=3.12"
dependencies = [ dependencies = [
"aiofiles>=24.1.0",
"mwparserfromhell>=0.6.6", "mwparserfromhell>=0.6.6",
"python-dotenv>=1.1.0",
"wikitextparser>=0.56.3", "wikitextparser>=0.56.3",
] ]

View File

@ -6,6 +6,11 @@ from typing import Dict, List, Any, Optional, Union, Tuple
import os import os
from pathlib import Path from pathlib import Path
import sys import sys
import asyncio
import importlib
import logging
from dotenv import load_dotenv
load_dotenv()
class WikivoyageParser: class WikivoyageParser:
def __init__(self): def __init__(self):
@ -342,41 +347,120 @@ class WikivoyageParser:
return json.dumps(root, indent=indent) return json.dumps(root, indent=indent)
def process_file(input_file: Path, parser: WikivoyageParser) -> None: async def process_file(
"""Process a single wiki file and save JSON output""" input_file: Path,
# Create output path with .json extension parser: WikivoyageParser,
output_file = input_file.with_suffix('.json') handler,
) -> None:
"""
Parse one wiki file and hand the resulting entry off to our handler.
Uses the filename (sans suffix) as the unique UID.
"""
# Ensure output directory exists text = input_file.read_text(encoding="utf-8")
output_file.parent.mkdir(parents=True, exist_ok=True) entry = parser.parse(text) # assume returns a dict
uid = input_file.stem
try:
# Read and parse input file
with open(input_file, 'r', encoding='utf-8') as f:
wikitext = f.read()
result = parser.parse(wikitext)
# Write JSON output
with open(output_file, 'w', encoding='utf-8') as f:
f.write(parser.export_json())
except Exception as e:
print(f"Error processing {input_file}: {e}")
def main(): await handler.write_entry(entry, uid)
# Initialize parser once for reuse
def gather_handler_kwargs(handler_name: str) -> dict:
"""
Find all ENV vars starting with HANDLER_<NAME>_ and turn them into kwargs.
E.g. HANDLER_SFTP_HOST=foo {"host": "foo"}, HANDLER_SFTP_PORT=2222 {"port": 2222}
"""
prefix = f"HANDLER_{handler_name.upper()}_"
kwargs = {}
for env_key, val in os.environ.items():
if not env_key.startswith(prefix):
continue
param = env_key[len(prefix) :].lower()
# try to cast ints
if val.isdigit():
val = int(val)
kwargs[param] = val
return kwargs
async def main():
logging.basicConfig(level=logging.DEBUG)
# 1. Which handler to load?
handler_name = os.getenv("HANDLER")
if not handler_name:
print("Error: set ENV HANDLER (e.g. 'filesystem')")
sys.exit(1)
# 2. Dynamic import
module_path = f"output_handlers.{handler_name}"
try:
mod = importlib.import_module(module_path)
except ImportError as e:
print(f"Error loading handler module {module_path}: {e}")
sys.exit(1)
# 3. Find the class: e.g. "sftp" → "SftpHandler"
class_name = handler_name.title().replace("_", "") + "Handler"
if not hasattr(mod, class_name):
print(f"{module_path} defines no class {class_name}")
sys.exit(1)
HandlerCls = getattr(mod, class_name)
# 4. Build kwargs from ENV
handler_kwargs = gather_handler_kwargs(handler_name)
# 5. Instantiate
handler = HandlerCls(**handler_kwargs)
# 6. Prepare parser
parser = WikivoyageParser() parser = WikivoyageParser()
# Get input directory from command line or use current directory # 7. Which dir to walk?
input_dir = Path(sys.argv[1] if len(sys.argv) > 1 else '.') input_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")
txt_files = list(input_dir.rglob("*.txt"))
# Process all .txt files recursively
for txt_file in input_dir.rglob('*.txt'): if not txt_files:
print(f"Processing {txt_file}") print(f"No .txt files found under {input_dir}")
process_file(txt_file, parser) else:
for txt in txt_files:
print("Processing complete") await process_file(txt, parser, handler)
# 7) read concurrency setting
try:
max_conc = int(os.getenv("MAX_CONCURRENT", "0"))
except ValueError:
print("Error: MAX_CONCURRENT must be an integer")
sys.exit(1)
if max_conc < 0:
print("Error: MAX_CONCURRENT must be >= 0")
sys.exit(1)
# 8) schedule tasks
if max_conc == 0:
# unbounded
tasks = [
asyncio.create_task(process_file(txt, parser, handler))
for txt in txt_files
]
else:
# bounded by semaphore
sem = asyncio.Semaphore(max_conc)
async def bounded(txt):
async with sem:
return await process_file(txt, parser, handler)
tasks = [
asyncio.create_task(bounded(txt))
for txt in txt_files
]
# 9) run them all
await asyncio.gather(*tasks)
print("All done.")
if __name__ == "__main__": if __name__ == "__main__":
main() asyncio.run(main())

22
uv.lock generated
View File

@ -1,18 +1,31 @@
version = 1 version = 1
requires-python = ">=3.12" requires-python = ">=3.12"
[[package]]
name = "aiofiles"
version = "24.1.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/0b/03/a88171e277e8caa88a4c77808c20ebb04ba74cc4681bf1e9416c862de237/aiofiles-24.1.0.tar.gz", hash = "sha256:22a075c9e5a3810f0c2e48f3008c94d68c65d763b9b03857924c99e57355166c", size = 30247 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a5/45/30bb92d442636f570cb5651bc661f52b610e2eec3f891a5dc3a4c3667db0/aiofiles-24.1.0-py3-none-any.whl", hash = "sha256:b4ec55f4195e3eb5d7abd1bf7e061763e864dd4954231fb8539a0ef8bb8260e5", size = 15896 },
]
[[package]] [[package]]
name = "mapvoyage-extract" name = "mapvoyage-extract"
version = "0.1.0" version = "0.1.0"
source = { virtual = "." } source = { virtual = "." }
dependencies = [ dependencies = [
{ name = "aiofiles" },
{ name = "mwparserfromhell" }, { name = "mwparserfromhell" },
{ name = "python-dotenv" },
{ name = "wikitextparser" }, { name = "wikitextparser" },
] ]
[package.metadata] [package.metadata]
requires-dist = [ requires-dist = [
{ name = "aiofiles", specifier = ">=24.1.0" },
{ name = "mwparserfromhell", specifier = ">=0.6.6" }, { name = "mwparserfromhell", specifier = ">=0.6.6" },
{ name = "python-dotenv", specifier = ">=1.1.0" },
{ name = "wikitextparser", specifier = ">=0.56.3" }, { name = "wikitextparser", specifier = ">=0.56.3" },
] ]
@ -29,6 +42,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/78/09/24c2f37524a3ebc3574975766748c7e4423ecefaa815c9fc4a324cbcf94a/mwparserfromhell-0.6.6-cp312-cp312-win_amd64.whl", hash = "sha256:cdc46c115b2495d4025920b7b30a6885a96d2b797ccc4009bf3cc02940ae55d3", size = 101071 }, { url = "https://files.pythonhosted.org/packages/78/09/24c2f37524a3ebc3574975766748c7e4423ecefaa815c9fc4a324cbcf94a/mwparserfromhell-0.6.6-cp312-cp312-win_amd64.whl", hash = "sha256:cdc46c115b2495d4025920b7b30a6885a96d2b797ccc4009bf3cc02940ae55d3", size = 101071 },
] ]
[[package]]
name = "python-dotenv"
version = "1.1.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/88/2c/7bb1416c5620485aa793f2de31d3df393d3686aa8a8506d11e10e13c5baf/python_dotenv-1.1.0.tar.gz", hash = "sha256:41f90bc6f5f177fb41f53e87666db362025010eb28f60a01c9143bfa33a2b2d5", size = 39920 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/1e/18/98a99ad95133c6a6e2005fe89faedf294a748bd5dc803008059409ac9b1e/python_dotenv-1.1.0-py3-none-any.whl", hash = "sha256:d7c01d9e2293916c18baf562d95698754b0dbbb5e74d457c45d4f6561fb9d55d", size = 20256 },
]
[[package]] [[package]]
name = "regex" name = "regex"
version = "2024.11.6" version = "2024.11.6"