switched to aiobotocore

2025-09-29 12:03:39 +02:00
parent 0c28033a44
commit c9825ecf90
5 changed files with 552 additions and 437 deletions

View File

@@ -5,6 +5,7 @@ description = "Add your description here"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"aiobotocore>=2.24.2",
"aiofiles>=24.1.0",
"aiohttp>=3.11.16",
"asyncssh>=2.20.0",

View File

@@ -83,15 +83,15 @@ async def main():
if max_conc < 0:
raise ValueError("MAX_CONCURRENT must be >= 0")
handlers = []
# 3. Load each handler
for handler_name in handler_names:
handler_name = handler_name.strip()
if not handler_name:
continue
# Dynamic import
module_path = f"output_handlers.{handler_name}"
try:
@@ -111,12 +111,12 @@ async def main():
# Build kwargs from ENV
handler_kwargs = gather_handler_kwargs(handler_name)
# Add max_concurrent to kwargs
handler_kwargs["max_concurrent"] = max_conc
# Instantiate
handler = HandlerCls(**handler_kwargs)
handler = await HandlerCls.create(**handler_kwargs)
handlers.append(handler)
# 4. Fetch mappings

View File

@@ -15,7 +15,9 @@ class BaseHandler(ABC):
_successful_writes = 0
_failed_writes = 0
def __init__(self, fail_on_error: bool = True, max_concurrent=0, **kwargs):
@classmethod
@abstractmethod
async def create(cls, fail_on_error: bool = True, max_concurrent=0, **kwargs) -> "BaseHandler":
"""
Initializes the BaseHandler with optional parameters.
@@ -25,10 +27,12 @@ class BaseHandler(ABC):
0 means unlimited concurrency.
**kwargs: Additional keyword arguments for specific handler implementations.
"""
self = cls(**kwargs)
self.fail_on_error = fail_on_error
self.semaphore = None
if max_concurrent > 0:
self.semaphore = asyncio.Semaphore(max_concurrent)
return self
@abstractmethod
@@ -38,7 +42,7 @@ class BaseHandler(ABC):
Args:
entry (dict): The entry to write (will be JSON-encoded).
uid (str): The unique identifier for the entry. The default id provided by wikivoyage is recommended.
Returns:
bool: True if the entry was written successfully, False otherwise.
"""
@@ -51,7 +55,7 @@ class BaseHandler(ABC):
Args:
entry (dict): The entry to write (will be JSON-encoded).
uid (str): The unique identifier for the entry. The default id provided by wikivoyage is recommended.
"""
if self.semaphore:
async with self.semaphore:

View File

@@ -1,103 +1,77 @@
"""Handler that writes asynchronously."""
from .base_handler import BaseHandler
import json
from minio import Minio
# making async calls to minio requires some wrapping
import concurrent.futures
import asyncio
from io import BytesIO
import urllib3
import aiobotocore
from aiobotocore.session import get_session
from asyncio import TimeoutError
from contextlib import AsyncExitStack
class S3Handler(BaseHandler):
"""
Handler that writes files to an S3 bucket asynchronously.
"""
def __init__(self, url: str, access_key: str, secret_key: str, bucket_name: str, **kwargs):
@classmethod
async def create(cls, url: str, access_key: str, secret_key: str, bucket_name: str, **kwargs) -> "S3Handler":
"""
Initializes the Handler with the specified S3 endpoint and bucket name.
Args:
**kwargs: Additional keyword arguments for the BaseHandler.
"""
super().__init__(**kwargs)
self = await super().create(**kwargs)
self.bucket_name = bucket_name
# minio uses urllib3 so we need to set the connection pool limit according to max_concurrent
max_concurrent = kwargs.get("max_concurrent", 0)
# usually 0 is used to indicate unlimited concurrency - this pool still needs at least a single worker
max_concurrent = max(1, max_concurrent)
self.session = get_session()
self.exit_stack = AsyncExitStack()
http_client = urllib3.PoolManager(num_pools=max_concurrent)
self.s3_client = Minio(
url,
access_key = access_key,
secret_key = secret_key,
secure = True,
http_client = http_client
session = aiobotocore.session.AioSession()
self.client = await self.exit_stack.enter_async_context(
session.create_client(
service_name = 's3',
# region_name='us-west-2',
aws_secret_access_key = secret_key,
aws_access_key_id = access_key,
endpoint_url = url,
)
)
self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_concurrent)
self._ensure_bucket_exists()
await self._ensure_bucket_exists()
return self
def _ensure_bucket_exists(self):
async def _ensure_bucket_exists(self):
"""
Ensures that the specified S3 bucket exists, tries to create it if it does not.
Ensures that the specified S3 bucket exists, but does not create it if it doesn't.
"""
if not self.s3_client.bucket_exists(self.bucket_name):
try:
self.s3_client.make_bucket(self.bucket_name)
self.logger.info(f"Created bucket: {self.bucket_name}")
except Exception as e:
self.logger.error(f"Error creating bucket {self.bucket_name}: {e}")
raise
else:
self.logger.debug(f"Bucket {self.bucket_name} already exists.")
# head_bucket raises a ClientError if the bucket does not exist or is not accessible
await self.client.head_bucket(Bucket=self.bucket_name)
async def _write_entry(self, entry: dict, uid: str) -> bool:
"""
Asynchronously writes a single entry to the bucket.
Args:
entry (dict): The entry to write (will be JSON-encoded).
uid (str): The unique identifier for the entry.
Returns:
bool: True if the entry was written successfully, False otherwise.
"""
loop = asyncio.get_running_loop()
def sync_put():
# put requires an object that implements read
entry_json = json.dumps(entry).encode("utf-8")
size = len(entry_json) # size in bytes
entry_bytes = BytesIO(entry_json)
result = self.s3_client.put_object(
bucket_name = self.bucket_name,
object_name = f"{uid}.json",
data = entry_bytes,
length = size,
content_type = "application/json"
)
self.logger.debug(f"Got result {result}")
return result
# run the put operation in a thread pool to avoid blocking the event loop
data = json.dumps(entry).encode('utf-8')
try:
result = await loop.run_in_executor(self.executor, sync_put)
if not result:
raise Exception("Minio operation failed without exception.")
self.logger.debug(f"Successfully wrote entry with UID {uid} to bucket {self.bucket_name}.")
return True
except Exception as e:
self.logger.error(f"Error writing entry with UID {uid} to bucket {self.bucket_name}: {e}")
response = await self.client.put_object(
Bucket=self.bucket_name,
Key=f"{uid}.json",
Body=data
)
except TimeoutError:
self.logger.error(f"Timeout error while writing entry {uid} to bucket {self.bucket_name}.")
return False
if response['ResponseMetadata']['HTTPStatusCode'] == 200:
self.logger.info(f"Successfully wrote entry {uid} to bucket {self.bucket_name}.")
return True
else:
self.logger.error(f"Failed to write entry {uid} to bucket {self.bucket_name}. Status code: {response['ResponseMetadata']['HTTPStatusCode']}")
return False
async def close(self):
self.executor.shutdown(wait=True)
self.logger.info("Executor shut down.")
# the exit stack owns the client context, so closing it also closes the client
await self.exit_stack.aclose()
await super().close()

uv.lock (generated): 854 lines changed

File diff suppressed because it is too large