diff --git a/.gitignore b/.gitignore
index 759ff16..7cd094c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -52,6 +52,10 @@ dist/
 *.egg
 *.sh
 
+# Cython generated files
+*.so
+*.c
+
 # Virtual environments
 venv/
 .env
diff --git a/README.md b/README.md
index df46ad6..32c6c81 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,24 @@
 **OpenAI-compatible secure chat client with end-to-end encryption with NOMYO Inference Endpoints**
 
 🔒 **All prompts and responses are automatically encrypted and decrypted**
+
 🔑 **Uses hybrid encryption (AES-256-GCM + RSA-OAEP with 4096-bit keys)**
+
 🔄 **Drop-in replacement for OpenAI's ChatCompletion API**
 
 ## 🚀 Quick Start
 
+### 0. Try It Now (Demo Credentials)
+
+No account needed — use these public demo credentials to test immediately:
+
+| | |
+|---|---|
+| **API key** | `NOMYO_AI_E2EE_INFERENCE` |
+| **Model** | `Qwen/Qwen3-0.6B` |
+
+> **Note:** The demo endpoint uses a fixed 256-token context window and is intended for evaluation only.
+
 ### 1. Install methods
 
 via pip (recommended):
@@ -32,8 +45,8 @@ import asyncio
 from nomyo import SecureChatCompletion
 
 async def main():
-    # Initialize client (defaults to http://api.nomyo.ai:12434)
-    client = SecureChatCompletion(base_url="https://api.nomyo.ai:12434")
+    # Initialize client (defaults to https://api.nomyo.ai)
+    client = SecureChatCompletion(base_url="https://api.nomyo.ai")
 
     # Simple chat completion
     response = await client.create(
@@ -154,7 +167,7 @@ import asyncio
 from nomyo import SecureChatCompletion
 
 async def main():
-    client = SecureChatCompletion(base_url="https://api.nomyo.ai:12434")
+    client = SecureChatCompletion(base_url="https://api.nomyo.ai")
 
     response = await client.create(
         model="Qwen/Qwen3-0.6B",
@@ -179,7 +192,7 @@ import asyncio
 from nomyo import SecureChatCompletion
 
 async def main():
-    client = SecureChatCompletion(base_url="https://api.nomyo.ai:12434")
+    client = SecureChatCompletion(base_url="https://api.nomyo.ai")
 
     response = await client.create(
         model="Qwen/Qwen3-0.6B",
@@ -218,7 +231,7 @@ import asyncio
 from nomyo import SecureChatCompletion
 
 async def main():
-    client = SecureChatCompletion(base_url="https://api.nomyo.ai:12434")
+    client = SecureChatCompletion(base_url="https://api.nomyo.ai")
 
     response = await client.acreate(
         model="Qwen/Qwen3-0.6B",
@@ -268,7 +281,7 @@ from nomyo import SecureChatCompletion
 async def main():
     # Initialize with API key (recommended for production)
     client = SecureChatCompletion(
-        base_url="https://api.nomyo.ai:12434",
+        base_url="https://api.nomyo.ai",
         api_key="your-api-key-here"
     )
 
@@ -293,13 +306,13 @@ from nomyo import SecureChatCompletion
 async def main():
     # Enable secure memory protection (default, recommended)
     client = SecureChatCompletion(
-        base_url="https://api.nomyo.ai:12434",
+        base_url="https://api.nomyo.ai",
         secure_memory=True  # Default
     )
 
     # Disable secure memory (not recommended, for testing only)
     client = SecureChatCompletion(
-        base_url="https://api.nomyo.ai:12434",
+        base_url="https://api.nomyo.ai",
         secure_memory=False
     )
 
@@ -344,10 +357,11 @@ asyncio.run(main())
 
 ```python
 SecureChatCompletion(
-    base_url: str = "https://api.nomyo.ai:12434",
+    base_url: str = "https://api.nomyo.ai",
     allow_http: bool = False,
     api_key: Optional[str] = None,
-    secure_memory: bool = True
+    secure_memory: bool = True,
+    max_retries: int = 2
 )
 ```
 
@@ -357,6 +371,7 @@ SecureChatCompletion(
 - `allow_http`: Allow HTTP connections (ONLY for local development, never in production)
 - `api_key`: Optional API key for bearer authentication
 - `secure_memory`: Enable secure memory protection (default: True)
+- `max_retries`: Retries on retryable errors (429, 500, 502, 503, 504, network errors) with exponential backoff. Default: 2
 
 #### Methods
 
@@ -368,7 +383,7 @@ SecureChatCompletion(
 #### Constructor
 
 ```python
-SecureCompletionClient(router_url: str = "http://api.nomyo.ai:12434")
+SecureCompletionClient(router_url: str = "https://api.nomyo.ai", allow_http: bool = False, secure_memory: bool = True, max_retries: int = 2)
 ```
 
 #### Methods
diff --git a/SECURE_MEMORY.md b/SECURE_MEMORY.md
index bdaacdb..9631613 100644
--- a/SECURE_MEMORY.md
+++ b/SECURE_MEMORY.md
@@ -29,7 +29,7 @@ from nomyo import SecureChatCompletion
 
 # Create client with secure memory enabled (default)
 client = SecureChatCompletion(
-    base_url="https://api.nomyo.ai:12434",
+    base_url="https://api.nomyo.ai",
     secure_memory=True  # Enabled by default
 )
 
@@ -47,7 +47,7 @@ from nomyo import SecureChatCompletion
 
 # Disable secure memory for testing or when not needed
 client = SecureChatCompletion(
-    base_url="https://api.nomyo.ai:12434",
+    base_url="https://api.nomyo.ai",
     secure_memory=False
 )
 ```
@@ -210,7 +210,7 @@ from nomyo import SecureChatCompletion
 async def secure_chat():
     # Create client with maximum security
     client = SecureChatCompletion(
-        base_url="https://api.nomyo.ai:12434",
+        base_url="https://api.nomyo.ai",
         secure_memory=True  # Default
     )
 
diff --git a/SECURITY.md b/SECURITY.md
index f5a2179..c8e42c5 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -12,7 +12,7 @@ The client MUST connect using HTTPS in production environments:
 
 ```python
 # ✅ SECURE (Production)
-client = SecureChatCompletion(base_url="https://api.nomyo.ai:12434")
+client = SecureChatCompletion(base_url="https://api.nomyo.ai")
 
 # ⚠️ INSECURE (Local development only)
 client = SecureChatCompletion(base_url="http://localhost:12434", allow_http=True)
diff --git a/doc/README.md b/doc/README.md
index ceb903b..70dd9d6 100644
--- a/doc/README.md
+++ b/doc/README.md
@@ -3,6 +3,8 @@
 This documentation provides comprehensive information about using the NOMYO Secure Python Chat Client, a drop-in replacement for OpenAI's ChatCompletion API with end-to-end (E2E) encryption.
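-To use this client library you need a paid subscribtion on [NOMYO Inference](https://chat.nomyo.ai/).
+To use this client library you need a paid subscription on [NOMYO Inference](https://chat.nomyo.ai/).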
 
+![Inference Image](./secure-inference.jpg)
+
 ## Overview
 
 The NOMYO Secure Client provides:
@@ -44,9 +46,11 @@ asyncio.run(main())
 1. [Installation](installation.md) - How to install and set up the client
 2. [Getting Started](getting-started.md) - Quick start guide with examples
 3. [API Reference](api-reference.md) - Complete API documentation
-4. [Security Guide](security-guide.md) - Security features and best practices
-5. [Examples](examples.md) - Advanced usage scenarios
-6. [Troubleshooting](troubleshooting.md) - Common issues and solutions
+4. [Models](models.md) - Available models and selection guide
+5. [Security Guide](security-guide.md) - Security features and best practices
+6. [Examples](examples.md) - Advanced usage scenarios
+7. [Rate Limits](rate-limits.md) - Request limits, burst allowance, and error handling
+8. [Troubleshooting](troubleshooting.md) - Common issues and solutions
 
 ## Key Features
 
diff --git a/doc/api-reference.md b/doc/api-reference.md
index 1e16436..1069082 100644
--- a/doc/api-reference.md
+++ b/doc/api-reference.md
@@ -11,7 +11,8 @@ SecureChatCompletion(
     base_url: str = "https://api.nomyo.ai",
     allow_http: bool = False,
     api_key: Optional[str] = None,
-    secure_memory: bool = True
+    secure_memory: bool = True,
+    max_retries: int = 2
 )
 ```
 
@@ -21,6 +22,7 @@ SecureChatCompletion(
 - `allow_http` (bool): Allow HTTP connections (ONLY for local development, never in production)
 - `api_key` (Optional[str]): Optional API key for bearer authentication
 - `secure_memory` (bool): Enable secure memory protection (default: True)
+- `max_retries` (int): Number of retries on retryable errors (429, 500, 502, 503, 504, network errors). Uses exponential backoff. Default: 2
 
 ### Methods
 
@@ -73,10 +75,30 @@ A dictionary containing the chat completion response with the following structur
         "prompt_tokens": int,
         "completion_tokens": int,
         "total_tokens": int
+    },
+    "_metadata": {
+        "payload_id": str,
+        "processed_at": int,          # Unix timestamp
+        "is_encrypted": bool,
+        "response_status": str,
+        "security_tier": str,         # "standard", "high", or "maximum"
+        "memory_protection": dict,    # server-side memory protection info
+        "cuda_device": dict,          # privacy-safe GPU info (hashed identifiers)
+        "tpm_attestation": {          # TPM 2.0 hardware attestation (see Security Guide)
+            "is_available": bool,
+            # Present only when is_available is True:
+            "pcr_banks": str,         # e.g. "sha256:0,7,10"
+            "pcr_values": dict,       # {bank: {pcr_index: hex_digest}}
+            "quote_b64": str,         # base64-encoded TPMS_ATTEST (signed by AIK)
+            "signature_b64": str,     # base64-encoded TPMT_SIGNATURE
+            "aik_pubkey_b64": str,    # base64-encoded TPM2B_PUBLIC (ephemeral AIK)
+        }
     }
 }
 ```
 
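+The `_metadata` field is a NOMYO-specific extension and is not part of the OpenAI API response format. It carries server-reported security details (such as `tpm_attestation`) alongside any fields the client library adds. See the [Security Guide](security-guide.md) for how to interpret and verify `tpm_attestation`.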
+
 #### acreate(model, messages, **kwargs)
 
 Async alias for create() method.
@@ -92,13 +114,20 @@ The `SecureCompletionClient` class handles the underlying encryption, key manage
 ### Constructor
 
 ```python
-SecureCompletionClient(router_url: str = "https://api.nomyo.ai:12434", allow_http: bool = False)
+SecureCompletionClient(
+    router_url: str = "https://api.nomyo.ai",
+    allow_http: bool = False,
+    secure_memory: bool = True,
+    max_retries: int = 2
+)
 ```
 
 **Parameters:**
 
 - `router_url` (str): Base URL of the NOMYO Router (must use HTTPS for production)
 - `allow_http` (bool): Allow HTTP connections (ONLY for local development, never in production)
+- `secure_memory` (bool): Whether to use secure memory operations for this instance (default: True)
+- `max_retries` (int): Number of retries on retryable errors (429, 500, 502, 503, 504, network errors). Uses exponential backoff. Default: 2
 
 ### Methods
 
diff --git a/doc/getting-started.md b/doc/getting-started.md
index 2eb6c1e..1cf78a4 100644
--- a/doc/getting-started.md
+++ b/doc/getting-started.md
@@ -1,5 +1,33 @@
 # Getting Started
 
+## Try It Now (Demo Credentials)
+
+You can test the client immediately using these public demo credentials — no sign-up required:
+
+| | |
+|---|---|
+| **API key** | `NOMYO_AI_E2EE_INFERENCE` |
+| **Model** | `Qwen/Qwen3-0.6B` |
+
+> **Note:** The demo endpoint uses a fixed 256-token context window and is intended for evaluation only.
+
+```python
+import asyncio
+from nomyo import SecureChatCompletion
+
+async def main():
+    client = SecureChatCompletion(api_key="NOMYO_AI_E2EE_INFERENCE")
+
+    response = await client.create(
+        model="Qwen/Qwen3-0.6B",
+        messages=[{"role": "user", "content": "Hello!"}]
+    )
+
+    print(response['choices'][0]['message']['content'])
+
+asyncio.run(main())
+```
+
 ## Basic Usage
 
 The NOMYO client provides end-to-end encryption (E2E) for all communications between your application and the NOMYO inference endpoints. This ensures that your prompts and responses are protected from unauthorized access or interception.
@@ -197,7 +225,7 @@ import asyncio
 from nomyo import SecureChatCompletion, AuthenticationError, InvalidRequestError
 
 async def main():
-    client = SecureChatCompletion(base_url="https://api.nomyo.ai:12434")
+    client = SecureChatCompletion(base_url="https://api.nomyo.ai")
 
     try:
         response = await client.create(
diff --git a/doc/models.md b/doc/models.md
new file mode 100644
index 0000000..9860039
--- /dev/null
+++ b/doc/models.md
@@ -0,0 +1,48 @@
+# Available Models
+
+All models are available via `api.nomyo.ai`. Pass the model ID string directly to the `model` parameter of `create()`.
+
+## Model List
+
+| Model ID | Parameters | Type | Notes |
+|---|---|---|---|
+| `Qwen/Qwen3-0.6B` | 0.6B | General | Lightweight, fast inference |
+| `Qwen/Qwen3.5-0.8B` | 0.8B | General | Lightweight, fast inference |
+| `LiquidAI/LFM2.5-1.2B-Thinking` | 1.2B | Thinking | Reasoning model |
+| `ibm-granite/granite-4.0-h-small` | Small | General | IBM Granite 4.0, enterprise-focused |
+| `Qwen/Qwen3.5-9B` | 9B | General | Balanced quality and speed |
+| `utter-project/EuroLLM-9B-Instruct-2512` | 9B | General | Multilingual, strong European language support |
+| `zai-org/GLM-4.7-Flash` | — | General | Fast GLM variant |
+| `mistralai/Ministral-3-14B-Instruct-2512-GGUF` | 14B | General | Mistral instruction-tuned |
+| `ServiceNow-AI/Apriel-1.6-15b-Thinker` | 15B | Thinking | Reasoning model |
+| `openai/gpt-oss-20b` | 20B | General | OpenAI open-weight release |
+| `LiquidAI/LFM2-24B-A2B` | 24B (2B active) | General | MoE — efficient inference |
+| `Qwen/Qwen3.5-27B` | 27B | General | High quality, large context |
+| `google/medgemma-27b-it` | 27B | Specialized | Medical domain, instruction-tuned |
+| `nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4` | 30B (3B active) | General | MoE — efficient inference |
+| `Qwen/Qwen3.5-35B-A3B` | 35B (3B active) | General | MoE — efficient inference |
+| `moonshotai/Kimi-Linear-48B-A3B-Instruct` | 48B (3B active) | General | MoE — large capacity, efficient inference |
+
+> **MoE** (Mixture of Experts) models show total/active parameter counts. Only active parameters are used per token, keeping inference cost low relative to total model size.
+
+## Usage Example
+
+```python
+from nomyo import SecureChatCompletion
+
+client = SecureChatCompletion(api_key="your-api-key")
+# NOTE: `await` must run inside an async function (e.g. one started with asyncio.run)
+response = await client.create(
+    model="Qwen/Qwen3.5-9B",
+    messages=[{"role": "user", "content": "Hello!"}]
+)
+```
+
+## Choosing a Model
+
+- **Low latency / edge use**: `Qwen/Qwen3-0.6B`, `Qwen/Qwen3.5-0.8B`, `LiquidAI/LFM2.5-1.2B-Thinking`
+- **Balanced quality and speed**: `Qwen/Qwen3.5-9B`, `mistralai/Ministral-3-14B-Instruct-2512-GGUF`
+- **Reasoning / chain-of-thought**: `LiquidAI/LFM2.5-1.2B-Thinking`, `ServiceNow-AI/Apriel-1.6-15b-Thinker`
+- **Multilingual**: `utter-project/EuroLLM-9B-Instruct-2512`
+- **Medical**: `google/medgemma-27b-it`
+- **Highest quality**: `moonshotai/Kimi-Linear-48B-A3B-Instruct`, `Qwen/Qwen3.5-35B-A3B`
diff --git a/doc/rate-limits.md b/doc/rate-limits.md
new file mode 100644
index 0000000..2c18da2
--- /dev/null
+++ b/doc/rate-limits.md
@@ -0,0 +1,61 @@
+# Rate Limits
+
+The NOMYO API (`api.nomyo.ai`) enforces rate limits to ensure fair usage and service stability for all users.
+
+## Default Rate Limit
+
+By default, each API key is limited to **2 requests per second**.
+
+## Burst Allowance
+
+Short bursts above the default limit are permitted. You may send up to **4 requests per second** in burst mode, provided you have not already used your burst allowance in the current **10-second window**.
+
+Burst capacity is granted once per 10-second window. If you consume the burst allowance, you must wait for the window to reset before burst is available again.
+
+## Rate Limit Summary
+
+| Mode    | Limit              | Condition                        |
+|---------|--------------------|----------------------------------|
+| Default | 2 requests/second  | Always active                    |
+| Burst   | 4 requests/second  | Once per 10-second window        |
+
+## Error Responses
+
+### 429 Too Many Requests
+
+Returned when your request rate exceeds the allowed limit.
+
+```
+HTTP/1.1 429 Too Many Requests
+```
+
+**What to do:** Back off and retry after a short delay. Implement exponential backoff in your client to avoid repeated limit hits.
+
+### 503 Service Unavailable (Cool-down)
+
+Returned when burst limits are abused repeatedly. A **30-minute cool-down** is applied to the offending API key.
+
+```
+HTTP/1.1 503 Service Unavailable
+```
+
+**What to do:** Wait 30 minutes before retrying. Review your request patterns to ensure you stay within the permitted limits.
+
+## Best Practices
+
+- **Throttle your requests** client-side to stay at or below 2 requests/second under normal load.
+- **Use burst sparingly** — it is intended for occasional spikes, not sustained high-throughput usage.
+- **Implement exponential backoff** when you receive a `429` response. Start with a short delay (e.g. 500 ms) and double it on each subsequent failure, up to a reasonable maximum.
+- **Monitor for `503` responses** — repeated occurrences indicate that your usage pattern is triggering the abuse threshold. Refactor your request logic before the cool-down expires.
+
+## Retry Behaviour
+
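+The client retries automatically on `429`, `500`, `502`, `503`, `504`, and network errors using exponential backoff (1 s, 2 s, 4 s, …). The default is **2 retries**. Because these delays are only seconds long, automatic retries will not outlast a 30-minute `503` cool-down — handle that case in your own code. You can raise the retry count or disable retries per client: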
+
+```python
+# More retries for high-throughput workloads
+client = SecureChatCompletion(api_key="...", max_retries=5)
+
+# Disable retries entirely
+client = SecureChatCompletion(api_key="...", max_retries=0)
+```
diff --git a/doc/secure-inference.jpg b/doc/secure-inference.jpg
new file mode 100644
index 0000000..70c6fd1
Binary files /dev/null and b/doc/secure-inference.jpg differ
diff --git a/doc/security-guide.md b/doc/security-guide.md
index 6c34f71..6e4abdc 100644
--- a/doc/security-guide.md
+++ b/doc/security-guide.md
@@ -162,6 +162,81 @@ Secure memory features:
 - Guarantees zeroing of sensitive memory
 - Prevents memory dumps from containing sensitive data
 
+## Hardware Attestation (TPM 2.0)
+
+### What it is
+
+When the server has a TPM 2.0 chip, every response includes a `tpm_attestation` block in `_metadata`. This is a cryptographically signed hardware quote proving:
+
+- Which firmware and Secure Boot state the server is running (PCR 0, 7)
+- Which application binary is running, when IMA is active (PCR 10)
+
+The quote is signed by an ephemeral AIK (Attestation Identity Key) generated fresh for each request and tied to the `payload_id` nonce, so it cannot be replayed for a different request.
+
+### Reading the attestation
+
+```python
+response = await client.create(
+    model="Qwen/Qwen3-0.6B",
+    messages=[{"role": "user", "content": "..."}],
+    security_tier="maximum"
+)
+
+tpm = response["_metadata"].get("tpm_attestation", {})
+
+if tpm.get("is_available"):
+    print("PCR banks:", tpm["pcr_banks"])         # e.g. "sha256:0,7,10"
+    print("PCR values:", tpm["pcr_values"])        # {bank: {index: hex}}
+    print("AIK key:", tpm["aik_pubkey_b64"][:32], "...")
+else:
+    print("TPM not available on this server")
+```
+
+### Verifying the quote
+
+The response is self-contained: `aik_pubkey_b64` is the full public key of the AIK that signed the quote, so no separate key-fetch round-trip is needed.
+
+Verification steps using `tpm2-pytss`:
+
+```python
+import base64
+from tpm2_pytss.types import TPM2B_PUBLIC, TPMT_SIGNATURE, TPM2B_ATTEST
+
+# 1. Decode the quote components
+aik_pub = TPM2B_PUBLIC.unmarshal(base64.b64decode(tpm["aik_pubkey_b64"]))[0]
+quote   = TPM2B_ATTEST.unmarshal(base64.b64decode(tpm["quote_b64"]))[0]
+sig     = TPMT_SIGNATURE.unmarshal(base64.b64decode(tpm["signature_b64"]))[0]
+
+# 2. Verify the signature over the quote using the AIK public key
+#    (use a TPM ESAPI verify_signature call or an offline RSA verify)
+
+# 3. Inspect the qualifying_data inside the quote — it must match
+#    SHA-256(payload_id.encode())[:16] to confirm this quote is for this request
+
+# 4. Check pcr_values against your known-good baseline
+```
+
+> Full verification requires `tpm2-pytss` on the client side (`pip install tpm2-pytss` + `sudo apt install libtss2-dev`). It is optional — the attestation is informational unless your deployment policy requires verification.
+
+### Behaviour per security tier
+
+| Tier | TPM unavailable |
+|------|----------------|
+| `standard` | `tpm_attestation: {"is_available": false}` — request proceeds |
+| `high` | same as standard |
+| `maximum` | `ServiceUnavailableError` (HTTP 503) — request rejected |
+
+For `maximum` tier, the server enforces TPM availability as a hard requirement. If your server has no TPM and you request `maximum`, catch the error explicitly:
+
+```python
+from nomyo import ServiceUnavailableError
+
+try:
+    response = await client.create(..., security_tier="maximum")
+except ServiceUnavailableError as e:
+    print("Server does not meet TPM requirements for maximum tier:", e)
+```
+
 ## Compliance Considerations
 
 ### HIPAA Compliance
@@ -207,9 +282,11 @@ response = await client.create(
     messages=[{"role": "user", "content": "Hello"}]
 )
 
-print(response["_metadata"])  # Contains security-related information
+print(response["_metadata"])  # Contains security_tier, memory_protection, tpm_attestation, etc.
 ```
 
+See [Hardware Attestation](#hardware-attestation-tpm-20) for details on the `tpm_attestation` field.
+
 ### Logging
 
 Enable logging to see security operations:
diff --git a/doc/troubleshooting.md b/doc/troubleshooting.md
new file mode 100644
index 0000000..4f34127
--- /dev/null
+++ b/doc/troubleshooting.md
@@ -0,0 +1 @@
+# Troubleshooting
diff --git a/nomyo/SecureCompletionClient.py b/nomyo/SecureCompletionClient.py
index 66cd94a..6aa5379 100644
--- a/nomyo/SecureCompletionClient.py
+++ b/nomyo/SecureCompletionClient.py
@@ -1,5 +1,5 @@
-import json, base64, urllib.parse, httpx, os, secrets, warnings, logging
-from typing import Dict, Any, Optional
+import asyncio, ctypes, json, base64, urllib.parse, httpx, os, secrets, sys, warnings, logging
+from typing import Dict, Any, Optional, Union
 from cryptography.hazmat.primitives import serialization, hashes
 from cryptography.hazmat.primitives.asymmetric import rsa, padding
 from cryptography.hazmat.backends import default_backend
@@ -76,7 +76,7 @@ class SecureCompletionClient:
     - Response parsing
     """
 
-    def __init__(self, router_url: str = "https://api.nomyo.ai:12435", allow_http: bool = False, secure_memory: bool = True):
+    def __init__(self, router_url: str = "https://api.nomyo.ai", allow_http: bool = False, secure_memory: bool = True, max_retries: int = 2):
         """
         Initialize the secure completion client.
 
@@ -84,6 +84,9 @@ class SecureCompletionClient:
             router_url: Base URL of the NOMYO Router (must use HTTPS for production)
             allow_http: Allow HTTP connections (ONLY for local development, never in production)
             secure_memory: Whether to use secure memory operations for this instance.
+            max_retries: Number of retries on retryable errors (429, 500, 502, 503, 504,
+                         network errors). Uses exponential backoff. Default 2, matching
+                         the OpenAI Python SDK default.
         """
         self.router_url = router_url.rstrip('/')
         self.private_key = None
@@ -91,6 +94,7 @@ class SecureCompletionClient:
         self.key_size = 4096  # RSA key size
         self.allow_http = allow_http  # Store for use in fetch_server_public_key
         self._use_secure_memory = _SECURE_MEMORY_AVAILABLE and secure_memory
+        self.max_retries = max_retries
 
         # Validate HTTPS for security
         if not self.router_url.startswith("https://"):
@@ -333,10 +337,11 @@ class SecureCompletionClient:
         except Exception:
             raise ValueError("Failed to fetch server's public key")
 
-    async def _do_encrypt(self, payload_bytes: bytes, aes_key: bytes) -> bytes:
+    async def _do_encrypt(self, payload_bytes: Union[bytes, bytearray], aes_key: Union[bytes, bytearray]) -> bytes:
         """
         Core AES-256-GCM + RSA-OAEP encryption. Caller is responsible for
         memory protection of payload_bytes and aes_key before calling this.
+        Accepts bytearray to avoid creating an unzeroed immutable bytes copy.
         """
         nonce = secrets.token_bytes(12)  # 96-bit nonce for GCM
         cipher = Cipher(
@@ -353,14 +358,28 @@ class SecureCompletionClient:
             server_public_key_pem.encode('utf-8'),
             backend=default_backend()
         )
-        encrypted_aes_key = server_public_key.encrypt(
-            aes_key,
-            padding.OAEP(
-                mgf=padding.MGF1(algorithm=hashes.SHA256()),
-                algorithm=hashes.SHA256(),
-                label=None
+        # RSA encrypt requires bytes — an immutable copy is unavoidable here.
+        # We narrow its lifetime to this block and attempt to zero it via
+        # CPython internals immediately after use. This relies on the CPython
+        # bytes object layout (ob_sval starts at getsizeof(b'')-1 from id()),
+        # so it is a best-effort measure on CPython only.
+        _key_bytes = bytes(aes_key)
+        try:
+            encrypted_aes_key = server_public_key.encrypt(
+                _key_bytes,
+                padding.OAEP(
+                    mgf=padding.MGF1(algorithm=hashes.SHA256()),
+                    algorithm=hashes.SHA256(),
+                    label=None
+                )
             )
-        )
+        finally:
+            try:
+                _data_offset = sys.getsizeof(b'') - 1  # offset to ob_sval in PyBytesObject
+                ctypes.memset(id(_key_bytes) + _data_offset, 0, len(_key_bytes))
+            except Exception:
+                pass
+            del _key_bytes
 
         encrypted_package = {
             "version": "1.0",
@@ -405,8 +424,8 @@ class SecureCompletionClient:
             raise ValueError("Payload cannot be empty")
 
         try:
-            # Serialize payload to JSON
-            payload_json = json.dumps(payload).encode('utf-8')
+            # Serialize payload to JSON as bytearray so SecureBuffer can zero the original
+            payload_json = bytearray(json.dumps(payload).encode('utf-8'))
 
             # Validate payload size (prevent DoS)
             MAX_PAYLOAD_SIZE = 10 * 1024 * 1024  # 10MB limit
@@ -415,14 +434,14 @@ class SecureCompletionClient:
 
             logger.debug("Payload size: %d bytes", len(payload_json))
 
-            aes_key = secrets.token_bytes(32)  # 256-bit key
+            aes_key = bytearray(secrets.token_bytes(32))  # 256-bit key as bytearray
             try:
                 if self._use_secure_memory:
                     with secure_bytearray(payload_json) as protected_payload:
                         with secure_bytearray(aes_key) as protected_aes_key:
                             return await self._do_encrypt(
-                                bytes(protected_payload.data),
-                                bytes(protected_aes_key.data)
+                                protected_payload.data,
+                                protected_aes_key.data
                             )
                 else:
                     logger.warning("Secure memory not available, using standard encryption")
@@ -476,6 +495,20 @@ class SecureCompletionClient:
         if missing_fields:
             raise ValueError(f"Missing required fields in encrypted package: {', '.join(missing_fields)}")
 
+        # Validate version and algorithm to prevent downgrade attacks
+        SUPPORTED_VERSION = "1.0"
+        SUPPORTED_ALGORITHM = "hybrid-aes256-rsa4096"
+        if package["version"] != SUPPORTED_VERSION:
+            raise ValueError(
+                f"Unsupported protocol version: '{package['version']}'. "
+                f"Expected: '{SUPPORTED_VERSION}'"
+            )
+        if package["algorithm"] != SUPPORTED_ALGORITHM:
+            raise ValueError(
+                f"Unsupported encryption algorithm: '{package['algorithm']}'. "
+                f"Expected: '{SUPPORTED_ALGORITHM}'"
+            )
+
         # Validate encrypted_payload structure
         if not isinstance(package["encrypted_payload"], dict):
             raise ValueError("Invalid encrypted_payload: must be a dictionary")
@@ -485,9 +518,13 @@ class SecureCompletionClient:
         if missing_payload_fields:
             raise ValueError(f"Missing fields in encrypted_payload: {', '.join(missing_payload_fields)}")
 
+        # Guard: private key must be initialized before attempting decryption
+        if self.private_key is None:
+            raise SecurityError("Private key not initialized. Call generate_keys() or load_keys() first.")
+
         # Decrypt with proper error handling — keep crypto errors opaque (timing attacks)
-        plaintext_json: Optional[str] = None
         plaintext_size: int = 0
+        response: Optional[Dict[str, Any]] = None
         try:
             # Decrypt AES key with private key
             encrypted_aes_key = base64.b64decode(package["encrypted_aes_key"])
@@ -508,7 +545,7 @@ class SecureCompletionClient:
                     tag = base64.b64decode(package["encrypted_payload"]["tag"])
 
                     cipher = Cipher(
-                        algorithms.AES(bytes(protected_aes_key.data)),
+                        algorithms.AES(protected_aes_key.data),
                         modes.GCM(nonce, tag),
                         backend=default_backend()
                     )
@@ -517,12 +554,14 @@ class SecureCompletionClient:
                     plaintext_size = len(plaintext_bytes)
 
                     with secure_bytearray(plaintext_bytes) as protected_plaintext:
-                        # NOTE: plaintext_json is a Python str (immutable) and cannot be
-                        # securely zeroed. The bytearray source is zeroed by the context
-                        # manager, but the str object will persist until GC. This is a
-                        # known limitation of Python's memory model.
-                        plaintext_json = bytes(protected_plaintext.data).decode('utf-8')
-                    del plaintext_bytes  # drop immutable bytes ref; secure copy already zeroed
+                        # Parse directly from bytearray — json.loads accepts bytearray
+                        # (Python 3.6+), avoiding an immutable bytes/str copy that cannot
+                        # be zeroed. The bytearray is zeroed by the context manager on exit.
+                        try:
+                            response = json.loads(protected_plaintext.data)
+                        except (json.JSONDecodeError, UnicodeDecodeError) as e:
+                            raise ValueError(f"Decrypted response is not valid JSON: {e}")
+                    del plaintext_bytes
                 # AES key automatically zeroed here
             else:
                 logger.warning("Secure memory not available, using standard decryption")
@@ -538,19 +577,18 @@ class SecureCompletionClient:
                 decryptor = cipher.decryptor()
                 plaintext_bytes = decryptor.update(ciphertext) + decryptor.finalize()
                 plaintext_size = len(plaintext_bytes)
-                plaintext_json = plaintext_bytes.decode('utf-8')
+                try:
+                    response = json.loads(plaintext_bytes)
+                except (json.JSONDecodeError, UnicodeDecodeError) as e:
+                    raise ValueError(f"Decrypted response is not valid JSON: {e}")
                 del plaintext_bytes
 
+        except ValueError:
+            raise  # Re-raise JSON parse errors without masking as SecurityError
         except Exception:
             # Don't leak specific decryption errors (timing attacks)
             raise SecurityError("Decryption failed: integrity check or authentication failed")
 
-        # Parse JSON outside the crypto exception handler so format errors aren't hidden
-        try:
-            response = json.loads(plaintext_json)
-        except (json.JSONDecodeError, UnicodeDecodeError) as e:
-            raise ValueError(f"Decrypted response is not valid JSON: {e}")
-
         # Add metadata for debugging
         if "_metadata" not in response:
             response["_metadata"] = {}
@@ -625,13 +663,22 @@ class SecureCompletionClient:
         url = f"{self.router_url}/v1/chat/secure_completion"
         logger.debug("Target URL: %s", url)
 
-        try:
-            async with httpx.AsyncClient(timeout=60.0) as client:
-                response = await client.post(
-                    url,
-                    headers=headers,
-                    content=encrypted_payload
-                )
+        _RETRYABLE_STATUS_CODES = {429, 500, 502, 503, 504}
+        last_exc: Exception = APIConnectionError("Request failed")
+
+        for attempt in range(self.max_retries + 1):
+            if attempt > 0:
+                delay = 2 ** (attempt - 1)  # 1s, 2s, 4s, …
+                logger.warning("Retrying request (attempt %d/%d) after %.1fs...", attempt, self.max_retries, delay)
+                await asyncio.sleep(delay)
+
+            try:
+                async with httpx.AsyncClient(timeout=60.0) as client:
+                    response = await client.post(
+                        url,
+                        headers=headers,
+                        content=encrypted_payload
+                    )
 
                 logger.debug("HTTP Status: %d", response.status_code)
 
@@ -642,7 +689,6 @@ class SecureCompletionClient:
                     return decrypted_response
 
                 elif response.status_code == 400:
-                    # Bad request
                     try:
                         error = response.json()
                         raise InvalidRequestError(
@@ -654,7 +700,6 @@ class SecureCompletionClient:
                         raise InvalidRequestError("Bad request: Invalid response format")
 
                 elif response.status_code == 401:
-                    # Unauthorized - authentication failed
                     try:
                         error = response.json()
                         error_message = error.get('detail', 'Invalid API key or authentication failed')
@@ -667,7 +712,6 @@ class SecureCompletionClient:
                         raise AuthenticationError("Invalid API key or authentication failed")
 
                 elif response.status_code == 403:
-                    # Forbidden - model not allowed for security tier
                     try:
                         error = response.json()
                         raise ForbiddenError(
@@ -679,7 +723,6 @@ class SecureCompletionClient:
                         raise ForbiddenError("Forbidden: Model not allowed for the requested security tier")
 
                 elif response.status_code == 404:
-                    # Endpoint not found
                     try:
                         error = response.json()
                         raise APIError(
@@ -690,44 +733,47 @@ class SecureCompletionClient:
                     except (json.JSONDecodeError, ValueError):
                         raise APIError("Endpoint not found: Secure inference not enabled")
 
-                elif response.status_code == 429:
-                    # Rate limit exceeded
+                elif response.status_code in _RETRYABLE_STATUS_CODES:
                     try:
                         error = response.json()
-                        raise RateLimitError(
-                            f"Rate limit exceeded: {error.get('detail', 'Too many requests')}",
+                        if not isinstance(error, dict):
+                            error = {"detail": "unknown"}
+                        detail_msg = error.get("detail", "unknown")
+                    except (json.JSONDecodeError, ValueError):
+                        error = {}
+                        detail_msg = "unknown"
+
+                    if response.status_code == 429:
+                        last_exc = RateLimitError(
+                            f"Rate limit exceeded: {detail_msg}",
                             status_code=429,
                             error_details=error
                         )
-                    except (json.JSONDecodeError, ValueError):
-                        raise RateLimitError("Rate limit exceeded: Too many requests")
-
-                elif response.status_code == 500:
-                    # Server error
-                    try:
-                        error = response.json()
-                        raise ServerError(
-                            f"Server error: {error.get('detail', 'Internal server error')}",
+                    elif response.status_code == 500:
+                        last_exc = ServerError(
+                            f"Server error: {detail_msg}",
                             status_code=500,
                             error_details=error
                         )
-                    except (json.JSONDecodeError, ValueError):
-                        raise ServerError("Server error: Internal server error")
-
-                elif response.status_code == 503:
-                    # Service unavailable - inference backend is down
-                    try:
-                        error = response.json()
-                        raise ServiceUnavailableError(
-                            f"Service unavailable: {error.get('detail', 'Inference backend is unavailable')}",
+                    elif response.status_code == 503:
+                        last_exc = ServiceUnavailableError(
+                            f"Service unavailable: {detail_msg}",
                             status_code=503,
                             error_details=error
                         )
-                    except (json.JSONDecodeError, ValueError):
-                        raise ServiceUnavailableError("Service unavailable: Inference backend is unavailable")
+                    else:
+                        last_exc = APIError(
+                            f"Unexpected status code: {response.status_code} {detail_msg}",
+                            status_code=response.status_code,
+                            error_details=error
+                        )
+
+                    if attempt < self.max_retries:
+                        logger.warning("Got retryable status %d: %s", response.status_code, detail_msg)
+                        continue
+                    raise last_exc
 
                 else:
-                    # Unexpected status code
                     try:
                         unexp_detail = response.json()
                         if not isinstance(unexp_detail, dict):
@@ -740,12 +786,17 @@ class SecureCompletionClient:
                         status_code=response.status_code
                     )
 
-        except httpx.NetworkError as e:
-            raise APIConnectionError(f"Failed to connect to router: {e}")
-        except (SecurityError, APIError, AuthenticationError, InvalidRequestError, ForbiddenError, RateLimitError, ServerError, ServiceUnavailableError, APIConnectionError):
-            raise  # Re-raise known exceptions
-        except Exception as e:
-            raise Exception(f"Request failed: {e}")
+            except httpx.NetworkError as e:
+                last_exc = APIConnectionError(f"Failed to connect to router: {e}")
+                if attempt < self.max_retries:
+                    logger.warning("Network error on attempt %d: %s", attempt, e)
+                    continue
+                raise last_exc
+            except (SecurityError, APIError, AuthenticationError, InvalidRequestError, ForbiddenError, RateLimitError, ServerError, ServiceUnavailableError, APIConnectionError):
+                raise  # Non-retryable — propagate immediately
+            except Exception:
+                logger.exception("Unexpected error in send_secure_request")
+                raise APIConnectionError("Request failed due to an unexpected error")
 
     def _validate_rsa_key(self, key, key_type: str = "private") -> None:
         """
diff --git a/nomyo/__init__.py b/nomyo/__init__.py
index 9c4d160..6fb55fe 100644
--- a/nomyo/__init__.py
+++ b/nomyo/__init__.py
@@ -51,6 +51,6 @@ try:
 except ImportError:
     pass
 
-__version__ = "0.2.2"
+__version__ = "0.2.7"
 __author__ = "NOMYO AI"
 __license__ = "Apache-2.0"
diff --git a/nomyo/nomyo.py b/nomyo/nomyo.py
index 7787c0a..95709b2 100644
--- a/nomyo/nomyo.py
+++ b/nomyo/nomyo.py
@@ -29,7 +29,7 @@ class SecureChatCompletion:
     Usage:
         ```python
         # Create a client instance
-        client = SecureChatCompletion(base_url="https://api.nomyo.ai:12435")
+        client = SecureChatCompletion(base_url="https://api.nomyo.ai")
 
         # Simple chat completion
         response = await client.create(
@@ -52,7 +52,7 @@ class SecureChatCompletion:
         ```
     """
 
-    def __init__(self, base_url: str = "https://api.nomyo.ai:12435", allow_http: bool = False, api_key: Optional[str] = None, secure_memory: bool = True, key_dir: Optional[str] = None):
+    def __init__(self, base_url: str = "https://api.nomyo.ai", allow_http: bool = False, api_key: Optional[str] = None, secure_memory: bool = True, key_dir: Optional[str] = None, max_retries: int = 2):
         """
         Initialize the secure chat completion client.
 
@@ -68,8 +68,10 @@ class SecureChatCompletion:
                           Set to False for testing or when security is not required.
             key_dir: Directory to load/save RSA keys. If None, ephemeral keys are
                      generated in memory for this session only.
+            max_retries: Number of retries on retryable errors (HTTP 429, 500, 502, 503, 504
+                        and network errors), with exponential backoff (1s, 2s, 4s, ...). Default 2.
         """
-        self.client = SecureCompletionClient(router_url=base_url, allow_http=allow_http, secure_memory=secure_memory)
+        self.client = SecureCompletionClient(router_url=base_url, allow_http=allow_http, secure_memory=secure_memory, max_retries=max_retries)
         self._keys_initialized = False
         self._keys_lock = asyncio.Lock()
         self.api_key = api_key
diff --git a/pyproject.toml b/pyproject.toml
index 3abe372..d0f08cb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "nomyo"
-version = "0.2.2"
+version = "0.2.7"
 description = "OpenAI-compatible secure chat client with end-to-end encryption for NOMYO Inference Endpoints"
 authors = [
     {name = "NOMYO.AI", email = "ichi@nomyo.ai"},
@@ -42,7 +42,7 @@ dependencies = [
 
 [project.urls]
 Homepage = "https://www.nomyo.ai"
-Documentation = "https://bitfreedom.net/code/nomyo-ai/nomyo/src/branch/main/doc"
+Documentation = "https://bitfreedom.net/code/nomyo-ai/nomyo/wiki/NOMYO-Secure-Client-Documentation"
 Repository = "https://bitfreedom.net/code/nomyo-ai/nomyo"
 Issues = "https://bitfreedom.net/code/nomyo-ai/nomyo/issues"