diff --git a/.forgejo/workflows/publish.yml b/.forgejo/workflows/publish.yml index 17f700c..9fdee68 100644 --- a/.forgejo/workflows/publish.yml +++ b/.forgejo/workflows/publish.yml @@ -7,16 +7,10 @@ on: workflow_dispatch: jobs: - build-and-publish: - name: Build & Publish (${{ matrix.runner }}, py${{ matrix.python }}) - runs-on: ${{ matrix.runner }} + publish: + runs-on: docker-amd64 container: - image: python:${{ matrix.python }}-bookworm - - strategy: - matrix: - python: ["3.10", "3.11", "3.12"] - runner: [docker-amd64, docker-arm64] + image: python:3.12-bookworm steps: - name: Checkout repository @@ -26,21 +20,13 @@ jobs: . - name: Install build tools - run: | - apt-get update -qq && apt-get install -y patchelf - pip install build Cython twine auditwheel + run: pip install build twine - - name: Build wheel - run: python -m build --wheel - - - name: Repair wheel to manylinux - run: auditwheel repair dist/*.whl --wheel-dir wheelhouse/ - - - name: Check wheel metadata - run: twine check wheelhouse/*.whl + - name: Build package + run: python -m build - name: Publish to PyPI env: TWINE_USERNAME: __token__ TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} - run: twine upload --verbose wheelhouse/*.whl + run: twine upload dist/* diff --git a/README.md b/README.md index 1846d2e..32c6c81 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,17 @@ ## 🚀 Quick Start +### 0. Try It Now (Demo Credentials) + +No account needed — use these public demo credentials to test immediately: + +| | | +|---|---| +| **API key** | `NOMYO_AI_E2EE_INFERENCE` | +| **Model** | `Qwen/Qwen3-0.6B` | + +> **Note:** The demo endpoint uses a fixed 256-token context window and is intended for evaluation only. + ### 1. 
Install methods via pip (recommended): @@ -349,7 +360,8 @@ SecureChatCompletion( base_url: str = "https://api.nomyo.ai", allow_http: bool = False, api_key: Optional[str] = None, - secure_memory: bool = True + secure_memory: bool = True, + max_retries: int = 2 ) ``` @@ -359,6 +371,7 @@ SecureChatCompletion( - `allow_http`: Allow HTTP connections (ONLY for local development, never in production) - `api_key`: Optional API key for bearer authentication - `secure_memory`: Enable secure memory protection (default: True) +- `max_retries`: Number of retries on retryable errors (429, 500, 502, 503, 504, network errors), with exponential backoff between attempts. Default: 2 #### Methods @@ -370,7 +383,7 @@ SecureChatCompletion( #### Constructor ```python -SecureCompletionClient(router_url: str = "https://api.nomyo.ai") +SecureCompletionClient(router_url: str = "https://api.nomyo.ai", allow_http: bool = False, secure_memory: bool = True, max_retries: int = 2) ``` #### Methods diff --git a/doc/api-reference.md b/doc/api-reference.md index 439e471..1069082 100644 --- a/doc/api-reference.md +++ b/doc/api-reference.md @@ -11,7 +11,8 @@ SecureChatCompletion( base_url: str = "https://api.nomyo.ai", allow_http: bool = False, api_key: Optional[str] = None, - secure_memory: bool = True + secure_memory: bool = True, + max_retries: int = 2 ) ``` @@ -21,6 +22,7 @@ SecureChatCompletion( - `allow_http` (bool): Allow HTTP connections (ONLY for local development, never in production) - `api_key` (Optional[str]): Optional API key for bearer authentication - `secure_memory` (bool): Enable secure memory protection (default: True) +- `max_retries` (int): Number of retries on retryable errors (429, 500, 502, 503, 504, network errors). Uses exponential backoff.
Default: 2 ### Methods @@ -73,10 +75,30 @@ A dictionary containing the chat completion response with the following structur "prompt_tokens": int, "completion_tokens": int, "total_tokens": int + }, + "_metadata": { + "payload_id": str, + "processed_at": int, # Unix timestamp + "is_encrypted": bool, + "response_status": str, + "security_tier": str, # "standard", "high", or "maximum" + "memory_protection": dict, # server-side memory protection info + "cuda_device": dict, # privacy-safe GPU info (hashed identifiers) + "tpm_attestation": { # TPM 2.0 hardware attestation (see Security Guide) + "is_available": bool, + # Present only when is_available is True: + "pcr_banks": str, # e.g. "sha256:0,7,10" + "pcr_values": dict, # {bank: {pcr_index: hex_digest}} + "quote_b64": str, # base64-encoded TPMS_ATTEST (signed by AIK) + "signature_b64": str, # base64-encoded TPMT_SIGNATURE + "aik_pubkey_b64": str, # base64-encoded TPM2B_PUBLIC (ephemeral AIK) + } } } ``` +The `_metadata` field is added by the client library and is not part of the OpenAI API response format. See the [Security Guide](security-guide.md) for how to interpret and verify `tpm_attestation`. + #### acreate(model, messages, **kwargs) Async alias for create() method. @@ -92,13 +114,20 @@ The `SecureCompletionClient` class handles the underlying encryption, key manage ### Constructor ```python -SecureCompletionClient(router_url: str = "https://api.nomyo.ai", allow_http: bool = False) +SecureCompletionClient( + router_url: str = "https://api.nomyo.ai", + allow_http: bool = False, + secure_memory: bool = True, + max_retries: int = 2 +) ``` **Parameters:** - `router_url` (str): Base URL of the NOMYO Router (must use HTTPS for production) - `allow_http` (bool): Allow HTTP connections (ONLY for local development, never in production) +- `secure_memory` (bool): Whether to use secure memory operations for this instance (default: True) +- `max_retries` (int): Number of retries on retryable errors (429, 500, 502, 503, 504, network errors). Uses exponential backoff.
Default: 2 ### Methods diff --git a/doc/getting-started.md b/doc/getting-started.md index 4ccdf82..1cf78a4 100644 --- a/doc/getting-started.md +++ b/doc/getting-started.md @@ -1,5 +1,33 @@ # Getting Started +## Try It Now (Demo Credentials) + +You can test the client immediately using these public demo credentials — no sign-up required: + +| | | +|---|---| +| **API key** | `NOMYO_AI_E2EE_INFERENCE` | +| **Model** | `Qwen/Qwen3-0.6B` | + +> **Note:** The demo endpoint uses a fixed 256-token context window and is intended for evaluation only. + +```python +import asyncio +from nomyo import SecureChatCompletion + +async def main(): + client = SecureChatCompletion(api_key="NOMYO_AI_E2EE_INFERENCE") + + response = await client.create( + model="Qwen/Qwen3-0.6B", + messages=[{"role": "user", "content": "Hello!"}] + ) + + print(response['choices'][0]['message']['content']) + +asyncio.run(main()) +``` + ## Basic Usage The NOMYO client provides end-to-end encryption (E2E) for all communications between your application and the NOMYO inference endpoints. This ensures that your prompts and responses are protected from unauthorized access or interception. diff --git a/doc/rate-limits.md b/doc/rate-limits.md index 7d9f85c..2c18da2 100644 --- a/doc/rate-limits.md +++ b/doc/rate-limits.md @@ -48,20 +48,14 @@ HTTP/1.1 503 Service Unavailable - **Implement exponential backoff** when you receive a `429` response. Start with a short delay (e.g. 500 ms) and double it on each subsequent failure, up to a reasonable maximum. - **Monitor for `503` responses** — repeated occurrences indicate that your usage pattern is triggering the abuse threshold. Refactor your request logic before the cool-down expires. -## Example: Exponential Backoff +## Retry Behaviour + +The client retries automatically on `429`, `500`, `502`, `503`, `504`, and network errors using exponential backoff (1 s, 2 s, …). The default is **2 retries**. 
You can raise or disable this per client: ```python -import asyncio -import httpx +# More retries for high-throughput workloads +client = SecureChatCompletion(api_key="...", max_retries=5) -async def request_with_backoff(client, *args, max_retries=5, **kwargs): - delay = 0.5 - for attempt in range(max_retries): - response = await client.create(*args, **kwargs) - if response.status_code == 429: - await asyncio.sleep(delay) - delay = min(delay * 2, 30) - continue - return response - raise RuntimeError("Rate limit exceeded after maximum retries") +# Disable retries entirely +client = SecureChatCompletion(api_key="...", max_retries=0) ``` diff --git a/doc/security-guide.md b/doc/security-guide.md index 6c34f71..6e4abdc 100644 --- a/doc/security-guide.md +++ b/doc/security-guide.md @@ -162,6 +162,81 @@ Secure memory features: - Guarantees zeroing of sensitive memory - Prevents memory dumps from containing sensitive data +## Hardware Attestation (TPM 2.0) + +### What it is + +When the server has a TPM 2.0 chip, every response includes a `tpm_attestation` block in `_metadata`. This is a cryptographically signed hardware quote proving: + +- Which firmware and Secure Boot state the server is running (PCR 0, 7) +- Which application binary is running, when IMA is active (PCR 10) + +The quote is signed by an ephemeral AIK (Attestation Identity Key) generated fresh for each request and tied to the `payload_id` nonce, so it cannot be replayed for a different request. + +### Reading the attestation + +```python +response = await client.create( + model="Qwen/Qwen3-0.6B", + messages=[{"role": "user", "content": "..."}], + security_tier="maximum" +) + +tpm = response["_metadata"].get("tpm_attestation", {}) + +if tpm.get("is_available"): + print("PCR banks:", tpm["pcr_banks"]) # e.g. 
"sha256:0,7,10" + print("PCR values:", tpm["pcr_values"]) # {bank: {index: hex}} + print("AIK key:", tpm["aik_pubkey_b64"][:32], "...") +else: + print("TPM not available on this server") +``` + +### Verifying the quote + +The response is self-contained: `aik_pubkey_b64` is the full public key of the AIK that signed the quote, so no separate key-fetch round-trip is needed. Note that because the AIK is ephemeral and delivered in the same response, a valid signature on its own only shows that the quote and the supplied key are consistent with each other; trusting that the AIK belongs to a genuine TPM requires an out-of-band trust anchor (for example an EK certificate chain), which the fields documented here do not provide. + +Verification steps using `tpm2-pytss`: + +```python +import base64 +from tpm2_pytss.types import TPM2B_PUBLIC, TPMT_SIGNATURE, TPM2B_ATTEST + +# 1. Decode the quote components +aik_pub = TPM2B_PUBLIC.unmarshal(base64.b64decode(tpm["aik_pubkey_b64"]))[0] +quote = TPM2B_ATTEST.unmarshal(base64.b64decode(tpm["quote_b64"]))[0] +sig = TPMT_SIGNATURE.unmarshal(base64.b64decode(tpm["signature_b64"]))[0] + +# 2. Verify the signature over the quote using the AIK public key +# (use a TPM ESAPI verify_signature call or an offline RSA verify) + +# 3. Inspect the qualifying_data inside the quote — it must match +# SHA-256(payload_id.encode())[:16] to confirm this quote is for this request + +# 4. Recompute the PCR digest from pcr_values and confirm it equals the PCR digest inside the signed quote (pcr_values themselves are not signed), then check pcr_values against your known-good baseline +``` + +> Full verification requires `tpm2-pytss` on the client side (`pip install tpm2-pytss` + `sudo apt install libtss2-dev`). It is optional — the attestation is informational unless your deployment policy requires verification. + +### Behaviour per security tier + +| Tier | TPM unavailable | +|------|----------------| +| `standard` | `tpm_attestation: {"is_available": false}` — request proceeds | +| `high` | same as standard | +| `maximum` | `ServiceUnavailableError` (HTTP 503) — request rejected | + +For `maximum` tier, the server enforces TPM availability as a hard requirement.
If your server has no TPM and you request `maximum`, catch the error explicitly: + +```python +from nomyo import ServiceUnavailableError + +try: + response = await client.create(..., security_tier="maximum") +except ServiceUnavailableError as e: + print("Server does not meet TPM requirements for maximum tier:", e) +``` + ## Compliance Considerations ### HIPAA Compliance @@ -207,9 +282,11 @@ response = await client.create( messages=[{"role": "user", "content": "Hello"}] ) -print(response["_metadata"]) # Contains security-related information +print(response["_metadata"]) # Contains security_tier, memory_protection, tpm_attestation, etc. ``` +See [Hardware Attestation](#hardware-attestation-tpm-20) for details on the `tpm_attestation` field. + ### Logging Enable logging to see security operations: diff --git a/nomyo/SecureCompletionClient.py b/nomyo/SecureCompletionClient.py index ee81942..6aa5379 100644 --- a/nomyo/SecureCompletionClient.py +++ b/nomyo/SecureCompletionClient.py @@ -1,5 +1,5 @@ -import ctypes, json, base64, urllib.parse, httpx, os, secrets, sys, warnings, logging -from typing import Dict, Any, Optional +import asyncio, ctypes, json, base64, urllib.parse, httpx, os, secrets, sys, warnings, logging +from typing import Dict, Any, Optional, Union from cryptography.hazmat.primitives import serialization, hashes from cryptography.hazmat.primitives.asymmetric import rsa, padding from cryptography.hazmat.backends import default_backend @@ -76,7 +76,7 @@ class SecureCompletionClient: - Response parsing """ - def __init__(self, router_url: str = "https://api.nomyo.ai", allow_http: bool = False, secure_memory: bool = True): + def __init__(self, router_url: str = "https://api.nomyo.ai", allow_http: bool = False, secure_memory: bool = True, max_retries: int = 2): """ Initialize the secure completion client. 
@@ -84,6 +84,9 @@ class SecureCompletionClient: router_url: Base URL of the NOMYO Router (must use HTTPS for production) allow_http: Allow HTTP connections (ONLY for local development, never in production) secure_memory: Whether to use secure memory operations for this instance. + max_retries: Number of retries on retryable errors (429, 500, 502, 503, 504, + network errors). Uses exponential backoff. Default 2, matching + the OpenAI Python SDK default. """ self.router_url = router_url.rstrip('/') self.private_key = None @@ -91,6 +94,7 @@ class SecureCompletionClient: self.key_size = 4096 # RSA key size self.allow_http = allow_http # Store for use in fetch_server_public_key self._use_secure_memory = _SECURE_MEMORY_AVAILABLE and secure_memory + self.max_retries = max_retries # Validate HTTPS for security if not self.router_url.startswith("https://"): @@ -659,13 +663,22 @@ class SecureCompletionClient: url = f"{self.router_url}/v1/chat/secure_completion" logger.debug("Target URL: %s", url) - try: - async with httpx.AsyncClient(timeout=60.0) as client: - response = await client.post( - url, - headers=headers, - content=encrypted_payload - ) + _RETRYABLE_STATUS_CODES = {429, 500, 502, 503, 504} + last_exc: Exception = APIConnectionError("Request failed") + + for attempt in range(self.max_retries + 1): + if attempt > 0: + delay = 2 ** (attempt - 1) # 1s, 2s, 4s, … + logger.warning("Retrying request (attempt %d/%d) after %.1fs...", attempt, self.max_retries, delay) + await asyncio.sleep(delay) + + try: + async with httpx.AsyncClient(timeout=60.0) as client: + response = await client.post( + url, + headers=headers, + content=encrypted_payload + ) logger.debug("HTTP Status: %d", response.status_code) @@ -676,7 +689,6 @@ class SecureCompletionClient: return decrypted_response elif response.status_code == 400: - # Bad request try: error = response.json() raise InvalidRequestError( @@ -688,7 +700,6 @@ class SecureCompletionClient: raise InvalidRequestError("Bad request: 
Invalid response format") elif response.status_code == 401: - # Unauthorized - authentication failed try: error = response.json() error_message = error.get('detail', 'Invalid API key or authentication failed') @@ -701,7 +712,6 @@ class SecureCompletionClient: raise AuthenticationError("Invalid API key or authentication failed") elif response.status_code == 403: - # Forbidden - model not allowed for security tier try: error = response.json() raise ForbiddenError( @@ -713,7 +723,6 @@ class SecureCompletionClient: raise ForbiddenError("Forbidden: Model not allowed for the requested security tier") elif response.status_code == 404: - # Endpoint not found try: error = response.json() raise APIError( @@ -724,44 +733,47 @@ class SecureCompletionClient: except (json.JSONDecodeError, ValueError): raise APIError("Endpoint not found: Secure inference not enabled") - elif response.status_code == 429: - # Rate limit exceeded + elif response.status_code in _RETRYABLE_STATUS_CODES: try: error = response.json() - raise RateLimitError( - f"Rate limit exceeded: {error.get('detail', 'Too many requests')}", + if not isinstance(error, dict): + error = {"detail": "unknown"} + detail_msg = error.get("detail", "unknown") + except (json.JSONDecodeError, ValueError): + error = {} + detail_msg = "unknown" + + if response.status_code == 429: + last_exc = RateLimitError( + f"Rate limit exceeded: {detail_msg}", status_code=429, error_details=error ) - except (json.JSONDecodeError, ValueError): - raise RateLimitError("Rate limit exceeded: Too many requests") - - elif response.status_code == 500: - # Server error - try: - error = response.json() - raise ServerError( - f"Server error: {error.get('detail', 'Internal server error')}", + elif response.status_code == 500: + last_exc = ServerError( + f"Server error: {detail_msg}", status_code=500, error_details=error ) - except (json.JSONDecodeError, ValueError): - raise ServerError("Server error: Internal server error") - - elif response.status_code 
== 503: - # Service unavailable - inference backend is down - try: - error = response.json() - raise ServiceUnavailableError( - f"Service unavailable: {error.get('detail', 'Inference backend is unavailable')}", + elif response.status_code == 503: + last_exc = ServiceUnavailableError( + f"Service unavailable: {detail_msg}", status_code=503, error_details=error ) - except (json.JSONDecodeError, ValueError): - raise ServiceUnavailableError("Service unavailable: Inference backend is unavailable") + else: + last_exc = APIError( + f"Unexpected status code: {response.status_code} {detail_msg}", + status_code=response.status_code, + error_details=error + ) + + if attempt < self.max_retries: + logger.warning("Got retryable status %d: %s", response.status_code, detail_msg) + continue + raise last_exc else: - # Unexpected status code try: unexp_detail = response.json() if not isinstance(unexp_detail, dict): @@ -774,13 +786,17 @@ class SecureCompletionClient: status_code=response.status_code ) - except httpx.NetworkError as e: - raise APIConnectionError(f"Failed to connect to router: {e}") - except (SecurityError, APIError, AuthenticationError, InvalidRequestError, ForbiddenError, RateLimitError, ServerError, ServiceUnavailableError, APIConnectionError): - raise # Re-raise known exceptions - except Exception: - logger.exception("Unexpected error in send_secure_request") - raise APIConnectionError("Request failed due to an unexpected error") + except httpx.NetworkError as e: + last_exc = APIConnectionError(f"Failed to connect to router: {e}") + if attempt < self.max_retries: + logger.warning("Network error on attempt %d: %s", attempt, e) + continue + raise last_exc + except (SecurityError, APIError, AuthenticationError, InvalidRequestError, ForbiddenError, RateLimitError, ServerError, ServiceUnavailableError, APIConnectionError): + raise # Non-retryable — propagate immediately + except Exception: + logger.exception("Unexpected error in send_secure_request") + raise 
APIConnectionError("Request failed due to an unexpected error") def _validate_rsa_key(self, key, key_type: str = "private") -> None: """ diff --git a/nomyo/__init__.py b/nomyo/__init__.py index 0a81157..6fb55fe 100644 --- a/nomyo/__init__.py +++ b/nomyo/__init__.py @@ -51,6 +51,6 @@ try: except ImportError: pass -__version__ = "0.2.5" +__version__ = "0.2.7" __author__ = "NOMYO AI" __license__ = "Apache-2.0" diff --git a/nomyo/nomyo.py b/nomyo/nomyo.py index 682997b..95709b2 100644 --- a/nomyo/nomyo.py +++ b/nomyo/nomyo.py @@ -52,7 +52,7 @@ class SecureChatCompletion: ``` """ - def __init__(self, base_url: str = "https://api.nomyo.ai", allow_http: bool = False, api_key: Optional[str] = None, secure_memory: bool = True, key_dir: Optional[str] = None): + def __init__(self, base_url: str = "https://api.nomyo.ai", allow_http: bool = False, api_key: Optional[str] = None, secure_memory: bool = True, key_dir: Optional[str] = None, max_retries: int = 2): """ Initialize the secure chat completion client. @@ -68,8 +68,10 @@ class SecureChatCompletion: Set to False for testing or when security is not required. key_dir: Directory to load/save RSA keys. If None, ephemeral keys are generated in memory for this session only. + max_retries: Number of retries on retryable errors (429, 500, 502, 503, 504, + network errors). Uses exponential backoff. Default 2. 
""" - self.client = SecureCompletionClient(router_url=base_url, allow_http=allow_http, secure_memory=secure_memory) + self.client = SecureCompletionClient(router_url=base_url, allow_http=allow_http, secure_memory=secure_memory, max_retries=max_retries) self._keys_initialized = False self._keys_lock = asyncio.Lock() self.api_key = api_key diff --git a/pyproject.toml b/pyproject.toml index e0902c3..d0f08cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,10 @@ [build-system] -requires = ["setuptools>=68", "wheel", "Cython>=3.0"] -build-backend = "setuptools.build_meta" +requires = ["hatchling>=1.0.0", "wheel"] +build-backend = "hatchling.build" [project] name = "nomyo" -version = "0.2.5" +version = "0.2.7" description = "OpenAI-compatible secure chat client with end-to-end encryption for NOMYO Inference Endpoints" authors = [ {name = "NOMYO.AI", email = "ichi@nomyo.ai"}, @@ -46,5 +46,8 @@ Documentation = "https://bitfreedom.net/code/nomyo-ai/nomyo/wiki/NOMYO-Secure-Cl Repository = "https://bitfreedom.net/code/nomyo-ai/nomyo" Issues = "https://bitfreedom.net/code/nomyo-ai/nomyo/issues" -[tool.setuptools.packages.find] -include = ["nomyo*"] +[tool.hatch.build.targets.wheel] +packages = ["nomyo"] + +[tool.hatch.build.targets.sdist] +exclude = ["test/", "build.sh", "dist/"] diff --git a/setup.py b/setup.py deleted file mode 100644 index 92d0bd3..0000000 --- a/setup.py +++ /dev/null @@ -1,31 +0,0 @@ -from setuptools import setup -from setuptools.command.build_py import build_py as _build_py -from Cython.Build import cythonize - -# Modules compiled to .so — exclude their .py source from the wheel -COMPILED_MODULES = {"nomyo", "SecureCompletionClient", "SecureMemory"} - - -class BuildPyNoPy(_build_py): - """Skip copying .py source files for cythonized modules.""" - - def find_package_modules(self, package, package_dir): - modules = super().find_package_modules(package, package_dir) - return [ - (pkg, mod, path) - for pkg, mod, path in modules - if not (pkg == 
"nomyo" and mod in COMPILED_MODULES) - ] - - -setup( - ext_modules=cythonize( - [ - "nomyo/nomyo.py", - "nomyo/SecureCompletionClient.py", - "nomyo/SecureMemory.py", - ], - compiler_directives={"language_level": "3"}, - ), - cmdclass={"build_py": BuildPyNoPy}, -)