mirror of
https://github.com/bcye/structured-wikivoyage-exports.git
synced 2025-10-29 14:12:43 +00:00
switched to aiobotocore
This commit is contained in:
@@ -5,6 +5,7 @@ description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"aiobotocore>=2.24.2",
|
||||
"aiofiles>=24.1.0",
|
||||
"aiohttp>=3.11.16",
|
||||
"asyncssh>=2.20.0",
|
||||
|
||||
@@ -116,7 +116,7 @@ async def main():
|
||||
handler_kwargs["max_concurrent"] = max_conc
|
||||
|
||||
# Instantiate
|
||||
handler = HandlerCls(**handler_kwargs)
|
||||
handler = await HandlerCls.create(**handler_kwargs)
|
||||
handlers.append(handler)
|
||||
|
||||
# 4. Fetch mappings
|
||||
|
||||
@@ -15,7 +15,9 @@ class BaseHandler(ABC):
|
||||
_successful_writes = 0
|
||||
_failed_writes = 0
|
||||
|
||||
def __init__(self, fail_on_error: bool = True, max_concurrent=0, **kwargs):
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
async def create(cls, fail_on_error: bool = True, max_concurrent=0, **kwargs) -> "BaseHandler":
|
||||
"""
|
||||
Initializes the BaseHandler with optional parameters.
|
||||
|
||||
@@ -25,10 +27,12 @@ class BaseHandler(ABC):
|
||||
0 means unlimited concurrency.
|
||||
**kwargs: Additional keyword arguments for specific handler implementations.
|
||||
"""
|
||||
self = cls(**kwargs)
|
||||
self.fail_on_error = fail_on_error
|
||||
self.semaphore = None
|
||||
if max_concurrent > 0:
|
||||
self.semaphore = asyncio.Semaphore(max_concurrent)
|
||||
return self
|
||||
|
||||
|
||||
@abstractmethod
|
||||
|
||||
@@ -1,103 +1,77 @@
|
||||
"""Handler that writes asynchronously."""
|
||||
from .base_handler import BaseHandler
|
||||
import json
|
||||
from minio import Minio
|
||||
# making async calls to minio requires some wrapping
|
||||
import concurrent.futures
|
||||
import asyncio
|
||||
from io import BytesIO
|
||||
import urllib3
|
||||
|
||||
import aiobotocore
|
||||
from aiobotocore.session import get_session
|
||||
from asyncio import TimeoutError
|
||||
from contextlib import AsyncExitStack
|
||||
|
||||
class S3Handler(BaseHandler):
|
||||
"""
|
||||
Handler that writes files to an S3 bucket asynchronously.
|
||||
"""
|
||||
def __init__(self, url: str, access_key: str, secret_key: str, bucket_name: str, **kwargs):
|
||||
@classmethod
|
||||
async def create(cls, url: str, access_key: str, secret_key: str, bucket_name: str, **kwargs) -> "S3Handler":
|
||||
"""
|
||||
Initializes the Handler with the specified S3 endpoint and bucket name.
|
||||
|
||||
Args:
|
||||
**kwargs: Additional keyword arguments for the BaseHandler.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self = await super().create(**kwargs)
|
||||
self.bucket_name = bucket_name
|
||||
|
||||
# minio uses urllib3 so we need to set the connection pool limit according to max_concurrent
|
||||
max_concurrent = kwargs.get("max_concurrent")
|
||||
# usually 0 is used to indicate no concurrence - in this setup that corresponds to a single worker
|
||||
max_concurrent = max(1, max_concurrent)
|
||||
self.session = get_session()
|
||||
self.exit_stack = AsyncExitStack()
|
||||
|
||||
http_client = urllib3.PoolManager(num_pools=max_concurrent)
|
||||
|
||||
self.s3_client = Minio(
|
||||
url,
|
||||
access_key = access_key,
|
||||
secret_key = secret_key,
|
||||
secure = True,
|
||||
http_client = http_client
|
||||
session = aiobotocore.session.AioSession()
|
||||
self.client = await self.exit_stack.enter_async_context(
|
||||
session.create_client(
|
||||
service_name = 's3',
|
||||
# region_name='us-west-2',
|
||||
aws_secret_access_key = secret_key,
|
||||
aws_access_key_id = access_key,
|
||||
endpoint_url = url,
|
||||
)
|
||||
)
|
||||
|
||||
self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_concurrent)
|
||||
self._ensure_bucket_exists()
|
||||
await self._ensure_bucket_exists()
|
||||
return self
|
||||
|
||||
|
||||
def _ensure_bucket_exists(self):
|
||||
async def _ensure_bucket_exists(self):
|
||||
"""
|
||||
Ensures that the specified S3 bucket exists, tries to create it if it does not.
|
||||
Ensures that the specified S3 bucket exists, but does not create it if it doesn't.
|
||||
"""
|
||||
if not self.s3_client.bucket_exists(self.bucket_name):
|
||||
try:
|
||||
self.s3_client.make_bucket(self.bucket_name)
|
||||
self.logger.info(f"Created bucket: {self.bucket_name}")
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error creating bucket {self.bucket_name}: {e}")
|
||||
raise
|
||||
else:
|
||||
self.logger.debug(f"Bucket {self.bucket_name} already exists.")
|
||||
# this will raise an error if the bucket does not exist
|
||||
await self.client.head_bucket(Bucket=self.bucket_name)
|
||||
|
||||
|
||||
async def _write_entry(self, entry: dict, uid: str) -> bool:
|
||||
"""
|
||||
Asynchronously writes a single entry to the bucket.
|
||||
|
||||
Args:
|
||||
entry (dict): The entry to write (will be JSON-encoded).
|
||||
uid (str): The unique identifier for the entry.
|
||||
|
||||
Returns:
|
||||
bool: True if the entry was written successfully, False otherwise.
|
||||
"""
|
||||
|
||||
loop = asyncio.get_running_loop()
|
||||
|
||||
def sync_put():
|
||||
# put requires an object that implements read
|
||||
entry_json = json.dumps(entry).encode("utf-8")
|
||||
size = len(entry_json) # size in bytes
|
||||
entry_bytes = BytesIO(entry_json)
|
||||
result = self.s3_client.put_object(
|
||||
bucket_name = self.bucket_name,
|
||||
object_name = f"{uid}.json",
|
||||
data = entry_bytes,
|
||||
length = size,
|
||||
content_type = "application/json"
|
||||
)
|
||||
self.logger.debug(f"Got result {result}")
|
||||
return result
|
||||
|
||||
# run the put operation in a thread pool to avoid blocking the event loop
|
||||
data = json.dumps(entry).encode('utf-8')
|
||||
try:
|
||||
result = await loop.run_in_executor(self.executor, sync_put)
|
||||
if not result:
|
||||
raise Exception("Minio operation failed without exception.")
|
||||
self.logger.debug(f"Successfully wrote entry with UID {uid} to bucket {self.bucket_name}.")
|
||||
return True
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error writing entry with UID {uid} to bucket {self.bucket_name}: {e}")
|
||||
response = await self.client.put_object(
|
||||
Bucket=self.bucket_name,
|
||||
Key=f"{uid}.json",
|
||||
Body=data
|
||||
)
|
||||
|
||||
except TimeoutError:
|
||||
self.logger.error(f"Timeout error while writing entry {uid} to bucket {self.bucket_name}.")
|
||||
return False
|
||||
|
||||
if response['ResponseMetadata']['HTTPStatusCode'] == 200:
|
||||
self.logger.info(f"Successfully wrote entry {uid} to bucket {self.bucket_name}.")
|
||||
return True
|
||||
else:
|
||||
self.logger.error(f"Failed to write entry {uid} to bucket {self.bucket_name}. Status code: {response['ResponseMetadata']['HTTPStatusCode']}")
|
||||
return False
|
||||
|
||||
|
||||
async def close(self):
|
||||
self.executor.shutdown(wait=True)
|
||||
self.logger.info("Executor shut down.")
|
||||
await self.client.close()
|
||||
await self._exit_stack.__aexit__(None, None, None)
|
||||
await super().close()
|
||||
|
||||
Reference in New Issue
Block a user