diff --git a/api/.env.example b/api/.env.example index caca618e..49708591 100644 --- a/api/.env.example +++ b/api/.env.example @@ -18,6 +18,13 @@ ENABLE_AWS_S3="false" # AWS_SECRET_ACCESS_KEY="" # S3_BUCKET="" # S3_REGION="" +# --- S3-compatible servers (MinIO, rustfs, Ceph, ...) --- +# Use the S3 backend (ENABLE_AWS_S3=true) against a non-AWS, S3-compatible +# server by overriding the endpoint and signing. Unlike the MinIO backend, the +# S3 backend emits real presigned URLs, so the bucket can stay private. +# S3_ENDPOINT_URL="" # e.g. https://s3.example.com (blank = AWS default) +# S3_SIGNATURE_VERSION="" # blank = botocore default; set "s3v4" if the server requires SigV4 +# S3_ADDRESSING_STYLE="" # blank = auto; set "path" if the server / TLS cert requires path-style # MinIO Configuration if using containerised MinIO instead of # AWS S3 diff --git a/api/constants.py b/api/constants.py index b7bc9f74..a5b06396 100644 --- a/api/constants.py +++ b/api/constants.py @@ -53,6 +53,17 @@ MINIO_SECURE = os.getenv("MINIO_SECURE", "false").lower() == "true" # AWS S3 Configuration S3_BUCKET = os.environ.get("S3_BUCKET") S3_REGION = os.environ.get("S3_REGION", "us-east-1") +# Optional overrides for S3-compatible backends (e.g. MinIO, rustfs, Ceph). +# S3_ENDPOINT_URL: full URL of a custom S3 endpoint (e.g. "https://s3.example.com"). +# Leave unset to use AWS's default endpoint resolution. +# S3_SIGNATURE_VERSION: botocore signature version used to sign requests and +# presigned URLs. Defaults to None (botocore's default: SigV2 for presigned URLs in +# regions such as us-east-1). Set to "s3v4" for S3-compatible servers requiring SigV4. +# S3_ADDRESSING_STYLE: "auto" (default), "path", or "virtual". Many S3-compatible +# servers and TLS setups require "path". 
+S3_ENDPOINT_URL = os.environ.get("S3_ENDPOINT_URL") +S3_SIGNATURE_VERSION = os.environ.get("S3_SIGNATURE_VERSION") +S3_ADDRESSING_STYLE = os.environ.get("S3_ADDRESSING_STYLE") # Sentry configuration SENTRY_DSN = os.getenv("SENTRY_DSN") diff --git a/api/services/filesystem/s3.py b/api/services/filesystem/s3.py index 9cca89ea..1cbc7ffd 100644 --- a/api/services/filesystem/s3.py +++ b/api/services/filesystem/s3.py @@ -1,29 +1,84 @@ from typing import Any, BinaryIO, Dict, Optional import aioboto3 +from botocore.config import Config from botocore.exceptions import ClientError +from api.constants import ( + S3_ADDRESSING_STYLE, + S3_ENDPOINT_URL, + S3_SIGNATURE_VERSION, +) + from .base import BaseFileSystem class S3FileSystem(BaseFileSystem): """S3 implementation of the filesystem interface.""" - def __init__(self, bucket_name: str, region_name: str = "us-east-1"): + def __init__( + self, + bucket_name: str, + region_name: str = "us-east-1", + endpoint_url: Optional[str] = None, + signature_version: Optional[str] = None, + addressing_style: Optional[str] = None, + ): """Initialize S3 filesystem. Args: bucket_name: Name of the S3 bucket region_name: AWS region name + endpoint_url: Optional custom S3 endpoint (e.g. for MinIO/rustfs). + Defaults to ``S3_ENDPOINT_URL`` env var; ``None`` uses AWS. + signature_version: Optional botocore signature version (e.g. + ``"s3v4"``). Defaults to ``S3_SIGNATURE_VERSION`` env var; + ``None`` keeps botocore's default signing behavior. + addressing_style: Optional S3 addressing style (``"path"`` / + ``"virtual"`` / ``"auto"``). Defaults to ``S3_ADDRESSING_STYLE`` + env var; ``None`` keeps botocore's default. 
""" self.bucket_name = bucket_name self.region_name = region_name + self.endpoint_url = ( + endpoint_url if endpoint_url is not None else S3_ENDPOINT_URL + ) + signature_version = ( + signature_version + if signature_version is not None + else S3_SIGNATURE_VERSION + ) + addressing_style = ( + addressing_style if addressing_style is not None else S3_ADDRESSING_STYLE + ) self.session = aioboto3.Session() + # Build a botocore Config only when an override is requested so that the + # default behavior is byte-for-byte unchanged when no env vars are set. + config_kwargs: Dict[str, Any] = {} + if signature_version: + config_kwargs["signature_version"] = signature_version + if addressing_style: + config_kwargs["s3"] = {"addressing_style": addressing_style} + self._config = Config(**config_kwargs) if config_kwargs else None + + def _client_kwargs(self) -> Dict[str, Any]: + """Common kwargs for every ``session.client("s3", ...)`` call. + + Only includes ``endpoint_url`` / ``config`` when configured, so default + deployments behave exactly as before. 
+ """ + kwargs: Dict[str, Any] = {"region_name": self.region_name} + if self.endpoint_url: + kwargs["endpoint_url"] = self.endpoint_url + if self._config is not None: + kwargs["config"] = self._config + return kwargs + async def acreate_file(self, file_path: str, content: BinaryIO) -> bool: try: async with self.session.client( - "s3", region_name=self.region_name + "s3", **self._client_kwargs() ) as s3_client: await s3_client.put_object( Bucket=self.bucket_name, Key=file_path, Body=await content.read() @@ -35,7 +90,7 @@ class S3FileSystem(BaseFileSystem): async def aupload_file(self, local_path: str, destination_path: str) -> bool: try: async with self.session.client( - "s3", region_name=self.region_name + "s3", **self._client_kwargs() ) as s3_client: await s3_client.upload_file( local_path, self.bucket_name, destination_path @@ -60,7 +115,7 @@ class S3FileSystem(BaseFileSystem): """ try: async with self.session.client( - "s3", region_name=self.region_name + "s3", **self._client_kwargs() ) as s3_client: params = {"Bucket": self.bucket_name, "Key": file_path} @@ -101,7 +156,7 @@ class S3FileSystem(BaseFileSystem): """Get S3 object metadata.""" try: async with self.session.client( - "s3", region_name=self.region_name + "s3", **self._client_kwargs() ) as s3_client: response = await s3_client.head_object( Bucket=self.bucket_name, Key=file_path @@ -127,7 +182,7 @@ class S3FileSystem(BaseFileSystem): """Generate a presigned PUT URL for direct file upload.""" try: async with self.session.client( - "s3", region_name=self.region_name + "s3", **self._client_kwargs() ) as s3_client: url = await s3_client.generate_presigned_url( "put_object", @@ -146,7 +201,7 @@ class S3FileSystem(BaseFileSystem): """Download a file from S3 to local path.""" try: async with self.session.client( - "s3", region_name=self.region_name + "s3", **self._client_kwargs() ) as s3_client: await s3_client.download_file(self.bucket_name, source_path, local_path) return True @@ -157,7 +212,7 @@ class 
S3FileSystem(BaseFileSystem): """Copy a file within S3 (server-side copy).""" try: async with self.session.client( - "s3", region_name=self.region_name + "s3", **self._client_kwargs() ) as s3_client: await s3_client.copy_object( Bucket=self.bucket_name, diff --git a/docker-compose.yaml b/docker-compose.yaml index 134de93d..813351a4 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -148,6 +148,15 @@ services: # Storage configuration - using local MinIO ENABLE_AWS_S3: "false" + # To use AWS S3 or any S3-compatible server (MinIO, rustfs, Ceph, ...) + # instead of the bundled MinIO, set ENABLE_AWS_S3 to "true" and provide: + # S3_BUCKET, S3_REGION, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY + # For a non-AWS S3-compatible server, also set: + # S3_ENDPOINT_URL e.g. https://s3.example.com + # S3_SIGNATURE_VERSION set "s3v4" if the server requires SigV4 (e.g. rustfs) + # S3_ADDRESSING_STYLE set "path" if the server / TLS cert requires path-style + # The S3 backend issues real presigned URLs, so the bucket can stay private. + # MinIO MINIO_ENDPOINT: "minio:9000" # Full URL (with scheme) browsers use to reach MinIO. For remote diff --git a/docs/developer/environment-variables.mdx b/docs/developer/environment-variables.mdx index 559bc30b..2bd6db04 100644 --- a/docs/developer/environment-variables.mdx +++ b/docs/developer/environment-variables.mdx @@ -95,6 +95,32 @@ Dograh uses **MinIO by default**, which is bundled with the self-hosted deployme | `ENABLE_AWS_S3` | `false` | Set to `true` to use AWS S3 instead of MinIO | | `S3_BUCKET` | `null` | S3 bucket name | | `S3_REGION` | `us-east-1` | AWS region | +| `S3_ENDPOINT_URL` | `null` | Custom S3 endpoint for S3-compatible servers (e.g. `https://s3.example.com`). Leave unset for AWS. | +| `S3_SIGNATURE_VERSION` | `null` | Signing version. Unset uses botocore's default; set `s3v4` for servers that require SigV4. | +| `S3_ADDRESSING_STYLE` | `null` | Addressing style: `auto`, `path`, or `virtual`. Unset uses botocore's default (`auto`). 
Many S3-compatible servers and TLS setups require `path`. | + +Credentials come from the standard `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` environment variables. + +#### S3-compatible servers (MinIO, rustfs, Ceph, ...) + +The S3 backend can target any S3-compatible server, not just AWS. Prefer it over the MinIO backend when you need **presigned URLs against a private bucket**: the MinIO backend returns plain unsigned object URLs and relies on the bucket being anonymously public-readable, whereas the S3 backend issues real presigned URLs so the bucket can stay private. + +To use it, set `ENABLE_AWS_S3=true` and point it at your server with the `S3_*` overrides above. For example, against [rustfs](https://github.com/rustfs/rustfs): + +```bash +ENABLE_AWS_S3=true +S3_BUCKET=voice-audio +S3_REGION=us-east-1 +S3_ENDPOINT_URL=https://s3.example.com +S3_SIGNATURE_VERSION=s3v4 # rustfs rejects SigV2 with SignatureDoesNotMatch +S3_ADDRESSING_STYLE=path # rustfs and most non-AWS TLS certs require path-style +AWS_ACCESS_KEY_ID=... +AWS_SECRET_ACCESS_KEY=... +``` + + +Presigned URLs point at `S3_ENDPOINT_URL`, so that host must be reachable from the browser. Because browsers fetch transcripts cross-origin, the bucket also needs a CORS rule allowing your app's origin for `GET`/`HEAD` (and `PUT`, if browsers upload files directly through presigned PUT URLs) — configure this on the storage server (e.g. via `PutBucketCors`), not in Dograh. + ---