diff --git a/.gitignore b/.gitignore index 759ff16..7cd094c 100644 --- a/.gitignore +++ b/.gitignore @@ -52,6 +52,10 @@ dist/ *.egg *.sh +# Cython generated files +*.so +*.c + # Virtual environments venv/ .env diff --git a/README.md b/README.md index df46ad6..32c6c81 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,24 @@ **OpenAI-compatible secure chat client with end-to-end encryption with NOMYO Inference Endpoints** 🔒 **All prompts and responses are automatically encrypted and decrypted** + 🔑 **Uses hybrid encryption (AES-256-GCM + RSA-OAEP with 4096-bit keys)** + 🔄 **Drop-in replacement for OpenAI's ChatCompletion API** ## 🚀 Quick Start +### 0. Try It Now (Demo Credentials) + +No account needed — use these public demo credentials to test immediately: + +| | | +|---|---| +| **API key** | `NOMYO_AI_E2EE_INFERENCE` | +| **Model** | `Qwen/Qwen3-0.6B` | + +> **Note:** The demo endpoint uses a fixed 256-token context window and is intended for evaluation only. + ### 1. Install methods via pip (recommended): @@ -32,8 +45,8 @@ import asyncio from nomyo import SecureChatCompletion async def main(): - # Initialize client (defaults to http://api.nomyo.ai:12434) - client = SecureChatCompletion(base_url="https://api.nomyo.ai:12434") + # Initialize client (defaults to https://api.nomyo.ai) + client = SecureChatCompletion(base_url="https://api.nomyo.ai") # Simple chat completion response = await client.create( @@ -154,7 +167,7 @@ import asyncio from nomyo import SecureChatCompletion async def main(): - client = SecureChatCompletion(base_url="https://api.nomyo.ai:12434") + client = SecureChatCompletion(base_url="https://api.nomyo.ai") response = await client.create( model="Qwen/Qwen3-0.6B", @@ -179,7 +192,7 @@ import asyncio from nomyo import SecureChatCompletion async def main(): - client = SecureChatCompletion(base_url="https://api.nomyo.ai:12434") + client = SecureChatCompletion(base_url="https://api.nomyo.ai") response = await client.create( model="Qwen/Qwen3-0.6B", @@ 
-218,7 +231,7 @@ import asyncio from nomyo import SecureChatCompletion async def main(): - client = SecureChatCompletion(base_url="https://api.nomyo.ai:12434") + client = SecureChatCompletion(base_url="https://api.nomyo.ai") response = await client.acreate( model="Qwen/Qwen3-0.6B", @@ -268,7 +281,7 @@ from nomyo import SecureChatCompletion async def main(): # Initialize with API key (recommended for production) client = SecureChatCompletion( - base_url="https://api.nomyo.ai:12434", + base_url="https://api.nomyo.ai", api_key="your-api-key-here" ) @@ -293,13 +306,13 @@ from nomyo import SecureChatCompletion async def main(): # Enable secure memory protection (default, recommended) client = SecureChatCompletion( - base_url="https://api.nomyo.ai:12434", + base_url="https://api.nomyo.ai", secure_memory=True # Default ) # Disable secure memory (not recommended, for testing only) client = SecureChatCompletion( - base_url="https://api.nomyo.ai:12434", + base_url="https://api.nomyo.ai", secure_memory=False ) @@ -344,10 +357,11 @@ asyncio.run(main()) ```python SecureChatCompletion( - base_url: str = "https://api.nomyo.ai:12434", + base_url: str = "https://api.nomyo.ai", allow_http: bool = False, api_key: Optional[str] = None, - secure_memory: bool = True + secure_memory: bool = True, + max_retries: int = 2 ) ``` @@ -357,6 +371,7 @@ SecureChatCompletion( - `allow_http`: Allow HTTP connections (ONLY for local development, never in production) - `api_key`: Optional API key for bearer authentication - `secure_memory`: Enable secure memory protection (default: True) +- `max_retries`: Retries on retryable errors (429, 500, 502, 503, 504, network errors) with exponential backoff. 
Default: 2 #### Methods @@ -368,7 +383,7 @@ SecureChatCompletion( #### Constructor ```python -SecureCompletionClient(router_url: str = "http://api.nomyo.ai:12434") +SecureCompletionClient(router_url: str = "https://api.nomyo.ai", allow_http: bool = False, max_retries: int = 2) ``` #### Methods diff --git a/SECURE_MEMORY.md b/SECURE_MEMORY.md index bdaacdb..9631613 100644 --- a/SECURE_MEMORY.md +++ b/SECURE_MEMORY.md @@ -29,7 +29,7 @@ from nomyo import SecureChatCompletion # Create client with secure memory enabled (default) client = SecureChatCompletion( - base_url="https://api.nomyo.ai:12434", + base_url="https://api.nomyo.ai", secure_memory=True # Enabled by default ) @@ -47,7 +47,7 @@ from nomyo import SecureChatCompletion # Disable secure memory for testing or when not needed client = SecureChatCompletion( - base_url="https://api.nomyo.ai:12434", + base_url="https://api.nomyo.ai", secure_memory=False ) ``` @@ -210,7 +210,7 @@ from nomyo import SecureChatCompletion async def secure_chat(): # Create client with maximum security client = SecureChatCompletion( - base_url="https://api.nomyo.ai:12434", + base_url="https://api.nomyo.ai", secure_memory=True # Default ) diff --git a/SECURITY.md b/SECURITY.md index f5a2179..c8e42c5 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -12,7 +12,7 @@ The client MUST connect using HTTPS in production environments: ```python # ✅ SECURE (Production) -client = SecureChatCompletion(base_url="https://api.nomyo.ai:12434") +client = SecureChatCompletion(base_url="https://api.nomyo.ai") # ⚠️ INSECURE (Local development only) client = SecureChatCompletion(base_url="http://localhost:12434", allow_http=True) diff --git a/doc/README.md b/doc/README.md index ceb903b..70dd9d6 100644 --- a/doc/README.md +++ b/doc/README.md @@ -3,6 +3,8 @@ This documentation provides comprehensive information about using the NOMYO Secure Python Chat Client, a drop-in replacement for OpenAI's ChatCompletion API with end-to-end (E2E) encryption. 
To use this client library you need a paid subscription on [NOMYO Inference](https://chat.nomyo.ai/). +![Inference Image](./secure-inference.jpg) + ## Overview The NOMYO Secure Client provides: @@ -44,9 +46,11 @@ asyncio.run(main()) 1. [Installation](installation.md) - How to install and set up the client 2. [Getting Started](getting-started.md) - Quick start guide with examples 3. [API Reference](api-reference.md) - Complete API documentation -4. [Security Guide](security-guide.md) - Security features and best practices -5. [Examples](examples.md) - Advanced usage scenarios -6. [Troubleshooting](troubleshooting.md) - Common issues and solutions +4. [Models](models.md) - Available models and selection guide +5. [Security Guide](security-guide.md) - Security features and best practices +6. [Examples](examples.md) - Advanced usage scenarios +7. [Rate Limits](rate-limits.md) - Request limits, burst allowance, and error handling +8. [Troubleshooting](troubleshooting.md) - Common issues and solutions ## Key Features diff --git a/doc/api-reference.md b/doc/api-reference.md index 1e16436..1069082 100644 --- a/doc/api-reference.md +++ b/doc/api-reference.md @@ -11,7 +11,8 @@ SecureChatCompletion( base_url: str = "https://api.nomyo.ai", allow_http: bool = False, api_key: Optional[str] = None, - secure_memory: bool = True + secure_memory: bool = True, + max_retries: int = 2 ) ``` @@ -21,6 +22,7 @@ SecureChatCompletion( - `allow_http` (bool): Allow HTTP connections (ONLY for local development, never in production) - `api_key` (Optional[str]): Optional API key for bearer authentication - `secure_memory` (bool): Enable secure memory protection (default: True) +- `max_retries` (int): Number of retries on retryable errors (429, 500, 502, 503, 504, network errors). Uses exponential backoff. 
Default: 2 ### Methods @@ -73,10 +75,30 @@ A dictionary containing the chat completion response with the following structur "prompt_tokens": int, "completion_tokens": int, "total_tokens": int + }, + "_metadata": { + "payload_id": str, + "processed_at": int, # Unix timestamp + "is_encrypted": bool, + "response_status": str, + "security_tier": str, # "standard", "high", or "maximum" + "memory_protection": dict, # server-side memory protection info + "cuda_device": dict, # privacy-safe GPU info (hashed identifiers) + "tpm_attestation": { # TPM 2.0 hardware attestation (see Security Guide) + "is_available": bool, + # Present only when is_available is True: + "pcr_banks": str, # e.g. "sha256:0,7,10" + "pcr_values": dict, # {bank: {pcr_index: hex_digest}} + "quote_b64": str, # base64-encoded TPMS_ATTEST (signed by AIK) + "signature_b64": str, # base64-encoded TPMT_SIGNATURE + "aik_pubkey_b64": str, # base64-encoded TPM2B_PUBLIC (ephemeral AIK) + } } } ``` +The `_metadata` field is added by the client library and is not part of the OpenAI API response format. See the [Security Guide](security-guide.md) for how to interpret and verify `tpm_attestation`. + #### acreate(model, messages, **kwargs) Async alias for create() method. @@ -92,13 +114,18 @@ The `SecureCompletionClient` class handles the underlying encryption, key manage ### Constructor ```python -SecureCompletionClient(router_url: str = "https://api.nomyo.ai:12434", allow_http: bool = False) +SecureCompletionClient( + router_url: str = "https://api.nomyo.ai", + allow_http: bool = False, + max_retries: int = 2 +) ``` **Parameters:** - `router_url` (str): Base URL of the NOMYO Router (must use HTTPS for production) - `allow_http` (bool): Allow HTTP connections (ONLY for local development, never in production) +- `max_retries` (int): Number of retries on retryable errors (429, 500, 502, 503, 504, network errors). Uses exponential backoff. 
Default: 2 ### Methods diff --git a/doc/getting-started.md b/doc/getting-started.md index 2eb6c1e..1cf78a4 100644 --- a/doc/getting-started.md +++ b/doc/getting-started.md @@ -1,5 +1,33 @@ # Getting Started +## Try It Now (Demo Credentials) + +You can test the client immediately using these public demo credentials — no sign-up required: + +| | | +|---|---| +| **API key** | `NOMYO_AI_E2EE_INFERENCE` | +| **Model** | `Qwen/Qwen3-0.6B` | + +> **Note:** The demo endpoint uses a fixed 256-token context window and is intended for evaluation only. + +```python +import asyncio +from nomyo import SecureChatCompletion + +async def main(): + client = SecureChatCompletion(api_key="NOMYO_AI_E2EE_INFERENCE") + + response = await client.create( + model="Qwen/Qwen3-0.6B", + messages=[{"role": "user", "content": "Hello!"}] + ) + + print(response['choices'][0]['message']['content']) + +asyncio.run(main()) +``` + ## Basic Usage The NOMYO client provides end-to-end encryption (E2E) for all communications between your application and the NOMYO inference endpoints. This ensures that your prompts and responses are protected from unauthorized access or interception. @@ -197,7 +225,7 @@ import asyncio from nomyo import SecureChatCompletion, AuthenticationError, InvalidRequestError async def main(): - client = SecureChatCompletion(base_url="https://api.nomyo.ai:12434") + client = SecureChatCompletion(base_url="https://api.nomyo.ai") try: response = await client.create( diff --git a/doc/models.md b/doc/models.md new file mode 100644 index 0000000..9860039 --- /dev/null +++ b/doc/models.md @@ -0,0 +1,48 @@ +# Available Models + +All models are available via `api.nomyo.ai`. Pass the model ID string directly to the `model` parameter of `create()`. 
+ +## Model List + +| Model ID | Parameters | Type | Notes | +|---|---|---|---| +| `Qwen/Qwen3-0.6B` | 0.6B | General | Lightweight, fast inference | +| `Qwen/Qwen3.5-0.8B` | 0.8B | General | Lightweight, fast inference | +| `LiquidAI/LFM2.5-1.2B-Thinking` | 1.2B | Thinking | Reasoning model | +| `ibm-granite/granite-4.0-h-small` | Small | General | IBM Granite 4.0, enterprise-focused | +| `Qwen/Qwen3.5-9B` | 9B | General | Balanced quality and speed | +| `utter-project/EuroLLM-9B-Instruct-2512` | 9B | General | Multilingual, strong European language support | +| `zai-org/GLM-4.7-Flash` | — | General | Fast GLM variant | +| `mistralai/Ministral-3-14B-Instruct-2512-GGUF` | 14B | General | Mistral instruction-tuned | +| `ServiceNow-AI/Apriel-1.6-15b-Thinker` | 15B | Thinking | Reasoning model | +| `openai/gpt-oss-20b` | 20B | General | OpenAI open-weight release | +| `LiquidAI/LFM2-24B-A2B` | 24B (2B active) | General | MoE — efficient inference | +| `Qwen/Qwen3.5-27B` | 27B | General | High quality, large context | +| `google/medgemma-27b-it` | 27B | Specialized | Medical domain, instruction-tuned | +| `nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4` | 30B (3B active) | General | MoE — efficient inference | +| `Qwen/Qwen3.5-35B-A3B` | 35B (3B active) | General | MoE — efficient inference | +| `moonshotai/Kimi-Linear-48B-A3B-Instruct` | 48B (3B active) | General | MoE — large capacity, efficient inference | + +> **MoE** (Mixture of Experts) models show total/active parameter counts. Only active parameters are used per token, keeping inference cost low relative to total model size. 
+ +## Usage Example + +```python +from nomyo import SecureChatCompletion + +client = SecureChatCompletion(api_key="your-api-key") + +response = await client.create( + model="Qwen/Qwen3.5-9B", + messages=[{"role": "user", "content": "Hello!"}] +) +``` + +## Choosing a Model + +- **Low latency / edge use**: `Qwen/Qwen3-0.6B`, `Qwen/Qwen3.5-0.8B`, `LiquidAI/LFM2.5-1.2B-Thinking` +- **Balanced quality and speed**: `Qwen/Qwen3.5-9B`, `mistralai/Ministral-3-14B-Instruct-2512-GGUF` +- **Reasoning / chain-of-thought**: `LiquidAI/LFM2.5-1.2B-Thinking`, `ServiceNow-AI/Apriel-1.6-15b-Thinker` +- **Multilingual**: `utter-project/EuroLLM-9B-Instruct-2512` +- **Medical**: `google/medgemma-27b-it` +- **Highest quality**: `moonshotai/Kimi-Linear-48B-A3B-Instruct`, `Qwen/Qwen3.5-35B-A3B` diff --git a/doc/rate-limits.md b/doc/rate-limits.md new file mode 100644 index 0000000..2c18da2 --- /dev/null +++ b/doc/rate-limits.md @@ -0,0 +1,61 @@ +# Rate Limits + +The NOMYO API (`api.nomyo.ai`) enforces rate limits to ensure fair usage and service stability for all users. + +## Default Rate Limit + +By default, each API key is limited to **2 requests per second**. + +## Burst Allowance + +Short bursts above the default limit are permitted. You may send up to **4 requests per second** in burst mode, provided you have not exceeded burst usage within the current **10-second window**. + +Burst capacity is granted once per 10-second window. If you consume the burst allowance, you must wait for the window to reset before burst is available again. + +## Rate Limit Summary + +| Mode | Limit | Condition | +|---------|--------------------|----------------------------------| +| Default | 2 requests/second | Always active | +| Burst | 4 requests/second | Once per 10-second window | + +## Error Responses + +### 429 Too Many Requests + +Returned when your request rate exceeds the allowed limit. + +``` +HTTP/1.1 429 Too Many Requests +``` + +**What to do:** Back off and retry after a short delay. 
Implement exponential backoff in your client to avoid repeated limit hits. + +### 503 Service Unavailable (Cool-down) + +Returned when burst limits are abused repeatedly. A **30-minute cool-down** is applied to the offending API key. + +``` +HTTP/1.1 503 Service Unavailable +``` + +**What to do:** Wait 30 minutes before retrying. Review your request patterns to ensure you stay within the permitted limits. + +## Best Practices + +- **Throttle your requests** client-side to stay at or below 2 requests/second under normal load. +- **Use burst sparingly** — it is intended for occasional spikes, not sustained high-throughput usage. +- **Implement exponential backoff** when you receive a `429` response. Start with a short delay (e.g. 500 ms) and double it on each subsequent failure, up to a reasonable maximum. +- **Monitor for `503` responses** — repeated occurrences indicate that your usage pattern is triggering the abuse threshold. Refactor your request logic before the cool-down expires. + +## Retry Behaviour + +The client retries automatically on `429`, `500`, `502`, `503`, `504`, and network errors using exponential backoff (1 s, 2 s, …). The default is **2 retries**. 
You can raise or disable this per client: + +```python +# More retries for high-throughput workloads +client = SecureChatCompletion(api_key="...", max_retries=5) + +# Disable retries entirely +client = SecureChatCompletion(api_key="...", max_retries=0) +``` diff --git a/doc/secure-inference.jpg b/doc/secure-inference.jpg new file mode 100644 index 0000000..70c6fd1 Binary files /dev/null and b/doc/secure-inference.jpg differ diff --git a/doc/security-guide.md b/doc/security-guide.md index 6c34f71..6e4abdc 100644 --- a/doc/security-guide.md +++ b/doc/security-guide.md @@ -162,6 +162,81 @@ Secure memory features: - Guarantees zeroing of sensitive memory - Prevents memory dumps from containing sensitive data +## Hardware Attestation (TPM 2.0) + +### What it is + +When the server has a TPM 2.0 chip, every response includes a `tpm_attestation` block in `_metadata`. This is a cryptographically signed hardware quote proving: + +- Which firmware and Secure Boot state the server is running (PCR 0, 7) +- Which application binary is running, when IMA is active (PCR 10) + +The quote is signed by an ephemeral AIK (Attestation Identity Key) generated fresh for each request and tied to the `payload_id` nonce, so it cannot be replayed for a different request. + +### Reading the attestation + +```python +response = await client.create( + model="Qwen/Qwen3-0.6B", + messages=[{"role": "user", "content": "..."}], + security_tier="maximum" +) + +tpm = response["_metadata"].get("tpm_attestation", {}) + +if tpm.get("is_available"): + print("PCR banks:", tpm["pcr_banks"]) # e.g. "sha256:0,7,10" + print("PCR values:", tpm["pcr_values"]) # {bank: {index: hex}} + print("AIK key:", tpm["aik_pubkey_b64"][:32], "...") +else: + print("TPM not available on this server") +``` + +### Verifying the quote + +The response is self-contained: `aik_pubkey_b64` is the full public key of the AIK that signed the quote, so no separate key-fetch round-trip is needed. 
+ +Verification steps using `tpm2-pytss`: + +```python +import base64 +from tpm2_pytss.types import TPM2B_PUBLIC, TPMT_SIGNATURE, TPM2B_ATTEST + +# 1. Decode the quote components +aik_pub = TPM2B_PUBLIC.unmarshal(base64.b64decode(tpm["aik_pubkey_b64"]))[0] +quote = TPM2B_ATTEST.unmarshal(base64.b64decode(tpm["quote_b64"]))[0] +sig = TPMT_SIGNATURE.unmarshal(base64.b64decode(tpm["signature_b64"]))[0] + +# 2. Verify the signature over the quote using the AIK public key +# (use a TPM ESAPI verify_signature call or an offline RSA verify) + +# 3. Inspect the qualifying_data inside the quote — it must match +# SHA-256(payload_id.encode())[:16] to confirm this quote is for this request + +# 4. Check pcr_values against your known-good baseline +``` + +> Full verification requires `tpm2-pytss` on the client side (`pip install tpm2-pytss` + `sudo apt install libtss2-dev`). It is optional — the attestation is informational unless your deployment policy requires verification. + +### Behaviour per security tier + +| Tier | TPM unavailable | +|------|----------------| +| `standard` | `tpm_attestation: {"is_available": false}` — request proceeds | +| `high` | same as standard | +| `maximum` | `ServiceUnavailableError` (HTTP 503) — request rejected | + +For `maximum` tier, the server enforces TPM availability as a hard requirement. If your server has no TPM and you request `maximum`, catch the error explicitly: + +```python +from nomyo import ServiceUnavailableError + +try: + response = await client.create(..., security_tier="maximum") +except ServiceUnavailableError as e: + print("Server does not meet TPM requirements for maximum tier:", e) +``` + ## Compliance Considerations ### HIPAA Compliance @@ -207,9 +282,11 @@ response = await client.create( messages=[{"role": "user", "content": "Hello"}] ) -print(response["_metadata"]) # Contains security-related information +print(response["_metadata"]) # Contains security_tier, memory_protection, tpm_attestation, etc. 
``` +See [Hardware Attestation](#hardware-attestation-tpm-20) for details on the `tpm_attestation` field. + ### Logging Enable logging to see security operations: diff --git a/doc/troubleshooting.md b/doc/troubleshooting.md new file mode 100644 index 0000000..4f34127 --- /dev/null +++ b/doc/troubleshooting.md @@ -0,0 +1 @@ +# Troubleshooting diff --git a/nomyo/SecureCompletionClient.py b/nomyo/SecureCompletionClient.py index 66cd94a..6aa5379 100644 --- a/nomyo/SecureCompletionClient.py +++ b/nomyo/SecureCompletionClient.py @@ -1,5 +1,5 @@ -import json, base64, urllib.parse, httpx, os, secrets, warnings, logging -from typing import Dict, Any, Optional +import asyncio, ctypes, json, base64, urllib.parse, httpx, os, secrets, sys, warnings, logging +from typing import Dict, Any, Optional, Union from cryptography.hazmat.primitives import serialization, hashes from cryptography.hazmat.primitives.asymmetric import rsa, padding from cryptography.hazmat.backends import default_backend @@ -76,7 +76,7 @@ class SecureCompletionClient: - Response parsing """ - def __init__(self, router_url: str = "https://api.nomyo.ai:12435", allow_http: bool = False, secure_memory: bool = True): + def __init__(self, router_url: str = "https://api.nomyo.ai", allow_http: bool = False, secure_memory: bool = True, max_retries: int = 2): """ Initialize the secure completion client. @@ -84,6 +84,9 @@ class SecureCompletionClient: router_url: Base URL of the NOMYO Router (must use HTTPS for production) allow_http: Allow HTTP connections (ONLY for local development, never in production) secure_memory: Whether to use secure memory operations for this instance. + max_retries: Number of retries on retryable errors (429, 500, 502, 503, 504, + network errors). Uses exponential backoff. Default 2, matching + the OpenAI Python SDK default. 
""" self.router_url = router_url.rstrip('/') self.private_key = None @@ -91,6 +94,7 @@ class SecureCompletionClient: self.key_size = 4096 # RSA key size self.allow_http = allow_http # Store for use in fetch_server_public_key self._use_secure_memory = _SECURE_MEMORY_AVAILABLE and secure_memory + self.max_retries = max_retries # Validate HTTPS for security if not self.router_url.startswith("https://"): @@ -333,10 +337,11 @@ class SecureCompletionClient: except Exception: raise ValueError("Failed to fetch server's public key") - async def _do_encrypt(self, payload_bytes: bytes, aes_key: bytes) -> bytes: + async def _do_encrypt(self, payload_bytes: Union[bytes, bytearray], aes_key: Union[bytes, bytearray]) -> bytes: """ Core AES-256-GCM + RSA-OAEP encryption. Caller is responsible for memory protection of payload_bytes and aes_key before calling this. + Accepts bytearray to avoid creating an unzeroed immutable bytes copy. """ nonce = secrets.token_bytes(12) # 96-bit nonce for GCM cipher = Cipher( @@ -353,14 +358,28 @@ class SecureCompletionClient: server_public_key_pem.encode('utf-8'), backend=default_backend() ) - encrypted_aes_key = server_public_key.encrypt( - aes_key, - padding.OAEP( - mgf=padding.MGF1(algorithm=hashes.SHA256()), - algorithm=hashes.SHA256(), - label=None + # RSA encrypt requires bytes — an immutable copy is unavoidable here. + # We narrow its lifetime to this block and attempt to zero it via + # CPython internals immediately after use. This relies on the CPython + # bytes object layout (ob_sval starts at getsizeof(b'')-1 from id()), + # so it is a best-effort measure on CPython only. 
+ _key_bytes = bytes(aes_key) + try: + encrypted_aes_key = server_public_key.encrypt( + _key_bytes, + padding.OAEP( + mgf=padding.MGF1(algorithm=hashes.SHA256()), + algorithm=hashes.SHA256(), + label=None + ) ) - ) + finally: + try: + _data_offset = sys.getsizeof(b'') - 1 # offset to ob_sval in PyBytesObject + ctypes.memset(id(_key_bytes) + _data_offset, 0, len(_key_bytes)) + except Exception: + pass + del _key_bytes encrypted_package = { "version": "1.0", @@ -405,8 +424,8 @@ class SecureCompletionClient: raise ValueError("Payload cannot be empty") try: - # Serialize payload to JSON - payload_json = json.dumps(payload).encode('utf-8') + # Serialize payload to JSON as bytearray so SecureBuffer can zero the original + payload_json = bytearray(json.dumps(payload).encode('utf-8')) # Validate payload size (prevent DoS) MAX_PAYLOAD_SIZE = 10 * 1024 * 1024 # 10MB limit @@ -415,14 +434,14 @@ class SecureCompletionClient: logger.debug("Payload size: %d bytes", len(payload_json)) - aes_key = secrets.token_bytes(32) # 256-bit key + aes_key = bytearray(secrets.token_bytes(32)) # 256-bit key as bytearray try: if self._use_secure_memory: with secure_bytearray(payload_json) as protected_payload: with secure_bytearray(aes_key) as protected_aes_key: return await self._do_encrypt( - bytes(protected_payload.data), - bytes(protected_aes_key.data) + protected_payload.data, + protected_aes_key.data ) else: logger.warning("Secure memory not available, using standard encryption") @@ -476,6 +495,20 @@ class SecureCompletionClient: if missing_fields: raise ValueError(f"Missing required fields in encrypted package: {', '.join(missing_fields)}") + # Validate version and algorithm to prevent downgrade attacks + SUPPORTED_VERSION = "1.0" + SUPPORTED_ALGORITHM = "hybrid-aes256-rsa4096" + if package["version"] != SUPPORTED_VERSION: + raise ValueError( + f"Unsupported protocol version: '{package['version']}'. 
" + f"Expected: '{SUPPORTED_VERSION}'" + ) + if package["algorithm"] != SUPPORTED_ALGORITHM: + raise ValueError( + f"Unsupported encryption algorithm: '{package['algorithm']}'. " + f"Expected: '{SUPPORTED_ALGORITHM}'" + ) + # Validate encrypted_payload structure if not isinstance(package["encrypted_payload"], dict): raise ValueError("Invalid encrypted_payload: must be a dictionary") @@ -485,9 +518,13 @@ class SecureCompletionClient: if missing_payload_fields: raise ValueError(f"Missing fields in encrypted_payload: {', '.join(missing_payload_fields)}") + # Guard: private key must be initialized before attempting decryption + if self.private_key is None: + raise SecurityError("Private key not initialized. Call generate_keys() or load_keys() first.") + # Decrypt with proper error handling — keep crypto errors opaque (timing attacks) - plaintext_json: Optional[str] = None plaintext_size: int = 0 + response: Optional[Dict[str, Any]] = None try: # Decrypt AES key with private key encrypted_aes_key = base64.b64decode(package["encrypted_aes_key"]) @@ -508,7 +545,7 @@ class SecureCompletionClient: tag = base64.b64decode(package["encrypted_payload"]["tag"]) cipher = Cipher( - algorithms.AES(bytes(protected_aes_key.data)), + algorithms.AES(protected_aes_key.data), modes.GCM(nonce, tag), backend=default_backend() ) @@ -517,12 +554,14 @@ class SecureCompletionClient: plaintext_size = len(plaintext_bytes) with secure_bytearray(plaintext_bytes) as protected_plaintext: - # NOTE: plaintext_json is a Python str (immutable) and cannot be - # securely zeroed. The bytearray source is zeroed by the context - # manager, but the str object will persist until GC. This is a - # known limitation of Python's memory model. 
- plaintext_json = bytes(protected_plaintext.data).decode('utf-8') - del plaintext_bytes # drop immutable bytes ref; secure copy already zeroed + # Parse directly from bytearray — json.loads accepts bytearray + # (Python 3.6+), avoiding an explicit bytes copy; it still decodes to a transient str + # internally. The bytearray is zeroed by the context manager on exit. + try: + response = json.loads(protected_plaintext.data) + except (json.JSONDecodeError, UnicodeDecodeError) as e: + raise ValueError(f"Decrypted response is not valid JSON: {e}") + del plaintext_bytes # AES key automatically zeroed here else: logger.warning("Secure memory not available, using standard decryption") @@ -538,19 +577,18 @@ class SecureCompletionClient: decryptor = cipher.decryptor() plaintext_bytes = decryptor.update(ciphertext) + decryptor.finalize() plaintext_size = len(plaintext_bytes) - plaintext_json = plaintext_bytes.decode('utf-8') + try: + response = json.loads(plaintext_bytes) + except (json.JSONDecodeError, UnicodeDecodeError) as e: + raise ValueError(f"Decrypted response is not valid JSON: {e}") del plaintext_bytes + except ValueError: + raise # Re-raise JSON parse errors without masking as SecurityError except Exception: # Don't leak specific decryption errors (timing attacks) raise SecurityError("Decryption failed: integrity check or authentication failed") - # Parse JSON outside the crypto exception handler so format errors aren't hidden - try: - response = json.loads(plaintext_json) - except (json.JSONDecodeError, UnicodeDecodeError) as e: - raise ValueError(f"Decrypted response is not valid JSON: {e}") - # Add metadata for debugging if "_metadata" not in response: response["_metadata"] = {} @@ -625,13 +663,22 @@ class SecureCompletionClient: url = f"{self.router_url}/v1/chat/secure_completion" logger.debug("Target URL: %s", url) - try: - async with httpx.AsyncClient(timeout=60.0) as client: - response = await client.post( - url, - headers=headers, - content=encrypted_payload - ) + 
_RETRYABLE_STATUS_CODES = {429, 500, 502, 503, 504} + last_exc: Exception = APIConnectionError("Request failed") + + for attempt in range(self.max_retries + 1): + if attempt > 0: + delay = 2 ** (attempt - 1) # 1s, 2s, 4s, … + logger.warning("Retrying request (attempt %d/%d) after %.1fs...", attempt, self.max_retries, delay) + await asyncio.sleep(delay) + + try: + async with httpx.AsyncClient(timeout=60.0) as client: + response = await client.post( + url, + headers=headers, + content=encrypted_payload + ) logger.debug("HTTP Status: %d", response.status_code) @@ -642,7 +689,6 @@ class SecureCompletionClient: return decrypted_response elif response.status_code == 400: - # Bad request try: error = response.json() raise InvalidRequestError( @@ -654,7 +700,6 @@ class SecureCompletionClient: raise InvalidRequestError("Bad request: Invalid response format") elif response.status_code == 401: - # Unauthorized - authentication failed try: error = response.json() error_message = error.get('detail', 'Invalid API key or authentication failed') @@ -667,7 +712,6 @@ class SecureCompletionClient: raise AuthenticationError("Invalid API key or authentication failed") elif response.status_code == 403: - # Forbidden - model not allowed for security tier try: error = response.json() raise ForbiddenError( @@ -679,7 +723,6 @@ class SecureCompletionClient: raise ForbiddenError("Forbidden: Model not allowed for the requested security tier") elif response.status_code == 404: - # Endpoint not found try: error = response.json() raise APIError( @@ -690,44 +733,47 @@ class SecureCompletionClient: except (json.JSONDecodeError, ValueError): raise APIError("Endpoint not found: Secure inference not enabled") - elif response.status_code == 429: - # Rate limit exceeded + elif response.status_code in _RETRYABLE_STATUS_CODES: try: error = response.json() - raise RateLimitError( - f"Rate limit exceeded: {error.get('detail', 'Too many requests')}", + if not isinstance(error, dict): + error = {"detail": 
"unknown"} + detail_msg = error.get("detail", "unknown") + except (json.JSONDecodeError, ValueError): + error = {} + detail_msg = "unknown" + + if response.status_code == 429: + last_exc = RateLimitError( + f"Rate limit exceeded: {detail_msg}", status_code=429, error_details=error ) - except (json.JSONDecodeError, ValueError): - raise RateLimitError("Rate limit exceeded: Too many requests") - - elif response.status_code == 500: - # Server error - try: - error = response.json() - raise ServerError( - f"Server error: {error.get('detail', 'Internal server error')}", + elif response.status_code == 500: + last_exc = ServerError( + f"Server error: {detail_msg}", status_code=500, error_details=error ) - except (json.JSONDecodeError, ValueError): - raise ServerError("Server error: Internal server error") - - elif response.status_code == 503: - # Service unavailable - inference backend is down - try: - error = response.json() - raise ServiceUnavailableError( - f"Service unavailable: {error.get('detail', 'Inference backend is unavailable')}", + elif response.status_code == 503: + last_exc = ServiceUnavailableError( + f"Service unavailable: {detail_msg}", status_code=503, error_details=error ) - except (json.JSONDecodeError, ValueError): - raise ServiceUnavailableError("Service unavailable: Inference backend is unavailable") + else: + last_exc = APIError( + f"Unexpected status code: {response.status_code} {detail_msg}", + status_code=response.status_code, + error_details=error + ) + + if attempt < self.max_retries: + logger.warning("Got retryable status %d: %s", response.status_code, detail_msg) + continue + raise last_exc else: - # Unexpected status code try: unexp_detail = response.json() if not isinstance(unexp_detail, dict): @@ -740,12 +786,17 @@ class SecureCompletionClient: status_code=response.status_code ) - except httpx.NetworkError as e: - raise APIConnectionError(f"Failed to connect to router: {e}") - except (SecurityError, APIError, AuthenticationError, 
InvalidRequestError, ForbiddenError, RateLimitError, ServerError, ServiceUnavailableError, APIConnectionError): - raise # Re-raise known exceptions - except Exception as e: - raise Exception(f"Request failed: {e}") + except httpx.NetworkError as e: + last_exc = APIConnectionError(f"Failed to connect to router: {e}") + if attempt < self.max_retries: + logger.warning("Network error on attempt %d: %s", attempt, e) + continue + raise last_exc + except (SecurityError, APIError, AuthenticationError, InvalidRequestError, ForbiddenError, RateLimitError, ServerError, ServiceUnavailableError, APIConnectionError): + raise # Non-retryable — propagate immediately + except Exception: + logger.exception("Unexpected error in send_secure_request") + raise APIConnectionError("Request failed due to an unexpected error") def _validate_rsa_key(self, key, key_type: str = "private") -> None: """ diff --git a/nomyo/__init__.py b/nomyo/__init__.py index 9c4d160..6fb55fe 100644 --- a/nomyo/__init__.py +++ b/nomyo/__init__.py @@ -51,6 +51,6 @@ try: except ImportError: pass -__version__ = "0.2.2" +__version__ = "0.2.7" __author__ = "NOMYO AI" __license__ = "Apache-2.0" diff --git a/nomyo/nomyo.py b/nomyo/nomyo.py index 7787c0a..95709b2 100644 --- a/nomyo/nomyo.py +++ b/nomyo/nomyo.py @@ -29,7 +29,7 @@ class SecureChatCompletion: Usage: ```python # Create a client instance - client = SecureChatCompletion(base_url="https://api.nomyo.ai:12435") + client = SecureChatCompletion(base_url="https://api.nomyo.ai") # Simple chat completion response = await client.create( @@ -52,7 +52,7 @@ class SecureChatCompletion: ``` """ - def __init__(self, base_url: str = "https://api.nomyo.ai:12435", allow_http: bool = False, api_key: Optional[str] = None, secure_memory: bool = True, key_dir: Optional[str] = None): + def __init__(self, base_url: str = "https://api.nomyo.ai", allow_http: bool = False, api_key: Optional[str] = None, secure_memory: bool = True, key_dir: Optional[str] = None, max_retries: int = 2): 
""" Initialize the secure chat completion client. @@ -68,8 +68,10 @@ class SecureChatCompletion: Set to False for testing or when security is not required. key_dir: Directory to load/save RSA keys. If None, ephemeral keys are generated in memory for this session only. + max_retries: Number of retries on retryable errors (429, 500, 502, 503, 504, + network errors). Uses exponential backoff. Default 2. """ - self.client = SecureCompletionClient(router_url=base_url, allow_http=allow_http, secure_memory=secure_memory) + self.client = SecureCompletionClient(router_url=base_url, allow_http=allow_http, secure_memory=secure_memory, max_retries=max_retries) self._keys_initialized = False self._keys_lock = asyncio.Lock() self.api_key = api_key diff --git a/pyproject.toml b/pyproject.toml index 3abe372..d0f08cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "nomyo" -version = "0.2.2" +version = "0.2.7" description = "OpenAI-compatible secure chat client with end-to-end encryption for NOMYO Inference Endpoints" authors = [ {name = "NOMYO.AI", email = "ichi@nomyo.ai"}, @@ -42,7 +42,7 @@ dependencies = [ [project.urls] Homepage = "https://www.nomyo.ai" -Documentation = "https://bitfreedom.net/code/nomyo-ai/nomyo/src/branch/main/doc" +Documentation = "https://bitfreedom.net/code/nomyo-ai/nomyo/wiki/NOMYO-Secure-Client-Documentation" Repository = "https://bitfreedom.net/code/nomyo-ai/nomyo" Issues = "https://bitfreedom.net/code/nomyo-ai/nomyo/issues"