mirror of
https://github.com/bcye/structured-wikivoyage-exports.git
synced 2025-10-29 14:12:43 +00:00
switched to aiobotocore
This commit is contained in:
@@ -5,6 +5,7 @@ description = "Add your description here"
|
|||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.12"
|
requires-python = ">=3.12"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"aiobotocore>=2.24.2",
|
||||||
"aiofiles>=24.1.0",
|
"aiofiles>=24.1.0",
|
||||||
"aiohttp>=3.11.16",
|
"aiohttp>=3.11.16",
|
||||||
"asyncssh>=2.20.0",
|
"asyncssh>=2.20.0",
|
||||||
|
|||||||
10
src/main.py
10
src/main.py
@@ -83,15 +83,15 @@ async def main():
|
|||||||
|
|
||||||
if max_conc < 0:
|
if max_conc < 0:
|
||||||
raise ValueError("MAX_CONCURRENT must be >= 0")
|
raise ValueError("MAX_CONCURRENT must be >= 0")
|
||||||
|
|
||||||
handlers = []
|
handlers = []
|
||||||
|
|
||||||
# 3. Load each handler
|
# 3. Load each handler
|
||||||
for handler_name in handler_names:
|
for handler_name in handler_names:
|
||||||
handler_name = handler_name.strip()
|
handler_name = handler_name.strip()
|
||||||
if not handler_name:
|
if not handler_name:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Dynamic import
|
# Dynamic import
|
||||||
module_path = f"output_handlers.{handler_name}"
|
module_path = f"output_handlers.{handler_name}"
|
||||||
try:
|
try:
|
||||||
@@ -111,12 +111,12 @@ async def main():
|
|||||||
|
|
||||||
# Build kwargs from ENV
|
# Build kwargs from ENV
|
||||||
handler_kwargs = gather_handler_kwargs(handler_name)
|
handler_kwargs = gather_handler_kwargs(handler_name)
|
||||||
|
|
||||||
# Add max_concurrent to kwargs
|
# Add max_concurrent to kwargs
|
||||||
handler_kwargs["max_concurrent"] = max_conc
|
handler_kwargs["max_concurrent"] = max_conc
|
||||||
|
|
||||||
# Instantiate
|
# Instantiate
|
||||||
handler = HandlerCls(**handler_kwargs)
|
handler = await HandlerCls.create(**handler_kwargs)
|
||||||
handlers.append(handler)
|
handlers.append(handler)
|
||||||
|
|
||||||
# 4. Fetch mappings
|
# 4. Fetch mappings
|
||||||
|
|||||||
@@ -15,7 +15,9 @@ class BaseHandler(ABC):
|
|||||||
_successful_writes = 0
|
_successful_writes = 0
|
||||||
_failed_writes = 0
|
_failed_writes = 0
|
||||||
|
|
||||||
def __init__(self, fail_on_error: bool = True, max_concurrent=0, **kwargs):
|
@classmethod
|
||||||
|
@abstractmethod
|
||||||
|
async def create(cls, fail_on_error: bool = True, max_concurrent=0, **kwargs) -> "BaseHandler":
|
||||||
"""
|
"""
|
||||||
Initializes the BaseHandler with optional parameters.
|
Initializes the BaseHandler with optional parameters.
|
||||||
|
|
||||||
@@ -25,10 +27,12 @@ class BaseHandler(ABC):
|
|||||||
0 means unlimited concurrency.
|
0 means unlimited concurrency.
|
||||||
**kwargs: Additional keyword arguments for specific handler implementations.
|
**kwargs: Additional keyword arguments for specific handler implementations.
|
||||||
"""
|
"""
|
||||||
|
self = cls(**kwargs)
|
||||||
self.fail_on_error = fail_on_error
|
self.fail_on_error = fail_on_error
|
||||||
self.semaphore = None
|
self.semaphore = None
|
||||||
if max_concurrent > 0:
|
if max_concurrent > 0:
|
||||||
self.semaphore = asyncio.Semaphore(max_concurrent)
|
self.semaphore = asyncio.Semaphore(max_concurrent)
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
@@ -38,7 +42,7 @@ class BaseHandler(ABC):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
entry (dict): The entry to write (will be JSON-encoded).
|
entry (dict): The entry to write (will be JSON-encoded).
|
||||||
uid (str): The unique identifier for the entry. The default id provided by wikivoyage is recommended.
|
uid (str): The unique identifier for the entry. The default id provided by wikivoyage is recommended.
|
||||||
Returns:
|
Returns:
|
||||||
bool: True if the entry was written successfully, False otherwise.
|
bool: True if the entry was written successfully, False otherwise.
|
||||||
"""
|
"""
|
||||||
@@ -51,7 +55,7 @@ class BaseHandler(ABC):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
entry (dict): The entry to write (will be JSON-encoded).
|
entry (dict): The entry to write (will be JSON-encoded).
|
||||||
uid (str): The unique identifier for the entry. The default id provided by wikivoyage is recommended.
|
uid (str): The unique identifier for the entry. The default id provided by wikivoyage is recommended.
|
||||||
"""
|
"""
|
||||||
if self.semaphore:
|
if self.semaphore:
|
||||||
async with self.semaphore:
|
async with self.semaphore:
|
||||||
|
|||||||
@@ -1,103 +1,77 @@
|
|||||||
"""Handler that writes asynchronously."""
|
"""Handler that writes asynchronously."""
|
||||||
from .base_handler import BaseHandler
|
from .base_handler import BaseHandler
|
||||||
import json
|
import json
|
||||||
from minio import Minio
|
import aiobotocore
|
||||||
# making async calls to minio requires some wrapping
|
from aiobotocore.session import get_session
|
||||||
import concurrent.futures
|
from asyncio import TimeoutError
|
||||||
import asyncio
|
from contextlib import AsyncExitStack
|
||||||
from io import BytesIO
|
|
||||||
import urllib3
|
|
||||||
|
|
||||||
|
|
||||||
class S3Handler(BaseHandler):
|
class S3Handler(BaseHandler):
|
||||||
"""
|
"""
|
||||||
Handler that writes files to an S3 bucket asynchronously.
|
Handler that writes files to an S3 bucket asynchronously.
|
||||||
"""
|
"""
|
||||||
def __init__(self, url: str, access_key: str, secret_key: str, bucket_name: str, **kwargs):
|
@classmethod
|
||||||
|
async def create(cls, url: str, access_key: str, secret_key: str, bucket_name: str, **kwargs) -> "S3Handler":
|
||||||
"""
|
"""
|
||||||
Initializes the Handler with the specified S3 endpoint and bucket name.
|
Initializes the Handler with the specified S3 endpoint and bucket name.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
**kwargs: Additional keyword arguments for the BaseHandler.
|
**kwargs: Additional keyword arguments for the BaseHandler.
|
||||||
"""
|
"""
|
||||||
super().__init__(**kwargs)
|
self = await super().create(**kwargs)
|
||||||
|
|
||||||
self.bucket_name = bucket_name
|
self.bucket_name = bucket_name
|
||||||
|
|
||||||
# minio uses urllib3 so we need to set the connection pool limit according to max_concurrent
|
self.session = get_session()
|
||||||
max_concurrent = kwargs.get("max_concurrent")
|
self.exit_stack = AsyncExitStack()
|
||||||
# usually 0 is used to indicate no concurrence - in this setup that corresponds to a single worker
|
|
||||||
max_concurrent = max(1, max_concurrent)
|
|
||||||
|
|
||||||
http_client = urllib3.PoolManager(num_pools=max_concurrent)
|
session = aiobotocore.session.AioSession()
|
||||||
|
self.client = await self.exit_stack.enter_async_context(
|
||||||
self.s3_client = Minio(
|
session.create_client(
|
||||||
url,
|
service_name = 's3',
|
||||||
access_key = access_key,
|
# region_name='us-west-2',
|
||||||
secret_key = secret_key,
|
aws_secret_access_key = secret_key,
|
||||||
secure = True,
|
aws_access_key_id = access_key,
|
||||||
http_client = http_client
|
endpoint_url = url,
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_concurrent)
|
await self._ensure_bucket_exists()
|
||||||
self._ensure_bucket_exists()
|
return self
|
||||||
|
|
||||||
|
|
||||||
def _ensure_bucket_exists(self):
|
async def _ensure_bucket_exists(self):
|
||||||
"""
|
"""
|
||||||
Ensures that the specified S3 bucket exists, tries to create it if it does not.
|
Ensures that the specified S3 bucket exists, but does not create it if it doesn't.
|
||||||
"""
|
"""
|
||||||
if not self.s3_client.bucket_exists(self.bucket_name):
|
# this will raise an error if the bucket does not exist
|
||||||
try:
|
await self.client.head_bucket(Bucket=self.bucket_name)
|
||||||
self.s3_client.make_bucket(self.bucket_name)
|
|
||||||
self.logger.info(f"Created bucket: {self.bucket_name}")
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.error(f"Error creating bucket {self.bucket_name}: {e}")
|
|
||||||
raise
|
|
||||||
else:
|
|
||||||
self.logger.debug(f"Bucket {self.bucket_name} already exists.")
|
|
||||||
|
|
||||||
|
|
||||||
async def _write_entry(self, entry: dict, uid: str) -> bool:
|
async def _write_entry(self, entry: dict, uid: str) -> bool:
|
||||||
"""
|
"""
|
||||||
Asynchronously writes a single entry to the bucket.
|
Asynchronously writes a single entry to the bucket.
|
||||||
|
|
||||||
Args:
|
|
||||||
entry (dict): The entry to write (will be JSON-encoded).
|
|
||||||
uid (str): The unique identifier for the entry.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
bool: True if the entry was written successfully, False otherwise.
|
|
||||||
"""
|
"""
|
||||||
|
data = json.dumps(entry).encode('utf-8')
|
||||||
loop = asyncio.get_running_loop()
|
|
||||||
|
|
||||||
def sync_put():
|
|
||||||
# put requires an object that implements read
|
|
||||||
entry_json = json.dumps(entry).encode("utf-8")
|
|
||||||
size = len(entry_json) # size in bytes
|
|
||||||
entry_bytes = BytesIO(entry_json)
|
|
||||||
result = self.s3_client.put_object(
|
|
||||||
bucket_name = self.bucket_name,
|
|
||||||
object_name = f"{uid}.json",
|
|
||||||
data = entry_bytes,
|
|
||||||
length = size,
|
|
||||||
content_type = "application/json"
|
|
||||||
)
|
|
||||||
self.logger.debug(f"Got result {result}")
|
|
||||||
return result
|
|
||||||
|
|
||||||
# run the put operation in a thread pool to avoid blocking the event loop
|
|
||||||
try:
|
try:
|
||||||
result = await loop.run_in_executor(self.executor, sync_put)
|
response = await self.client.put_object(
|
||||||
if not result:
|
Bucket=self.bucket_name,
|
||||||
raise Exception("Minio operation failed without exception.")
|
Key=f"{uid}.json",
|
||||||
self.logger.debug(f"Successfully wrote entry with UID {uid} to bucket {self.bucket_name}.")
|
Body=data
|
||||||
return True
|
)
|
||||||
except Exception as e:
|
|
||||||
self.logger.error(f"Error writing entry with UID {uid} to bucket {self.bucket_name}: {e}")
|
except TimeoutError:
|
||||||
|
self.logger.error(f"Timeout error while writing entry {uid} to bucket {self.bucket_name}.")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
if response['ResponseMetadata']['HTTPStatusCode'] == 200:
|
||||||
|
self.logger.info(f"Successfully wrote entry {uid} to bucket {self.bucket_name}.")
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
self.logger.error(f"Failed to write entry {uid} to bucket {self.bucket_name}. Status code: {response['ResponseMetadata']['HTTPStatusCode']}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
async def close(self):
|
async def close(self):
|
||||||
self.executor.shutdown(wait=True)
|
await self.client.close()
|
||||||
self.logger.info("Executor shut down.")
|
await self._exit_stack.__aexit__(None, None, None)
|
||||||
|
await super().close()
|
||||||
|
|||||||
Reference in New Issue
Block a user