# nomyo/nomyo/nomyo.py
# Commit 93adb6c45c ("alpha nerd") — feat: add automatic client retry logic
# with exponential backoff — 2026-04-15 12:08:21 +02:00
# CI: Publish to PyPI / publish (push) successful in 16s
# 250 lines, 11 KiB, Python
import asyncio
import os
import uuid
from typing import Dict, Any, List, Optional
from .SecureCompletionClient import SecureCompletionClient

# Check if secure memory module is available (used only for the user-facing warning)
# emitted by SecureChatCompletion.__init__; the import result itself is discarded.
try:
    from . import SecureMemory as _  # noqa: F401
    _SECURE_MEMORY_AVAILABLE = True
except ImportError:
    _SECURE_MEMORY_AVAILABLE = False
class SecureChatCompletion:
"""
OpenAI-compatible secure chat completion client.
This class provides the same interface as OpenAI's ChatCompletion.create()
method, but automatically encrypts all requests and decrypts all responses
for secure communication with the NOMYO Router's /v1/chat/secure_completion
endpoint.
Security Features:
- End-to-end encryption (AES-256-GCM + RSA-OAEP)
- Secure memory protection (prevents memory swapping and guarantees zeroing)
- HTTPS enforcement (with optional HTTP for local development)
- Automatic key management
Usage:
```python
# Create a client instance
client = SecureChatCompletion(base_url="https://api.nomyo.ai")
# Simple chat completion
response = await client.create(
model="Qwen/Qwen3-0.6B",
messages=[
{"role": "user", "content": "What is the capital of France?"}
],
temperature=0.7
)
# With tools
response = await client.create(
model="Qwen/Qwen3-0.6B",
messages=[
{"role": "user", "content": "What's the weather in Paris?"}
],
tools=[...],
temperature=0.7
)
```
"""
def __init__(self, base_url: str = "https://api.nomyo.ai", allow_http: bool = False, api_key: Optional[str] = None, secure_memory: bool = True, key_dir: Optional[str] = None, max_retries: int = 2):
"""
Initialize the secure chat completion client.
Args:
base_url: Base URL of the NOMYO Router (must use HTTPS for production)
This parameter is named 'base_url' for OpenAI compatibility.
allow_http: Allow HTTP connections (ONLY for local development, never in production)
api_key: Optional API key for bearer authentication. If provided, it will be
used for all requests made with this client.
secure_memory: Enable secure memory protection (default: True).
When enabled, prevents plaintext payloads from being swapped to disk
and guarantees memory is zeroed after encryption.
Set to False for testing or when security is not required.
key_dir: Directory to load/save RSA keys. If None, ephemeral keys are
generated in memory for this session only.
max_retries: Number of retries on retryable errors (429, 500, 502, 503, 504,
network errors). Uses exponential backoff. Default 2.
"""
self.client = SecureCompletionClient(router_url=base_url, allow_http=allow_http, secure_memory=secure_memory, max_retries=max_retries)
self._keys_initialized = False
self._keys_lock = asyncio.Lock()
self.api_key = api_key
self._key_dir = key_dir
self._secure_memory_enabled = secure_memory
if secure_memory and not _SECURE_MEMORY_AVAILABLE:
import warnings
warnings.warn(
"Secure memory requested but not available. "
"Falling back to standard memory handling.",
UserWarning,
stacklevel=2
)
async def _ensure_keys(self):
"""Ensure keys are loaded or generated (concurrency-safe)."""
if self._keys_initialized:
return
async with self._keys_lock:
if self._keys_initialized: # double-check after acquiring lock
return
if self._key_dir is not None:
private_key_path = os.path.join(self._key_dir, "private_key.pem")
public_key_path = os.path.join(self._key_dir, "public_key.pem")
try:
self.client.load_keys(private_key_path, public_key_path)
self._keys_initialized = True
return
except Exception:
self.client.generate_keys(save_to_file=True, key_dir=self._key_dir)
else:
self.client.generate_keys()
self._keys_initialized = True
async def create(self, model: str, messages: List[Dict[str, Any]], **kwargs) -> Dict[str, Any]:
"""
Creates a new chat completion for the provided messages and parameters.
This method provides the same interface as OpenAI's ChatCompletion.create()
but automatically handles encryption and decryption for secure communication.
Args:
model: The model to use for the chat completion.
messages: A list of message objects. Each message has a role ("system",
"user", or "assistant") and content.
**kwargs: Additional parameters that can be passed to the API.
Supported parameters include:
- temperature: float (0-2)
- max_tokens: int
- top_p: float
- stop: Union[str, List[str]]
- presence_penalty: float (-2.0 to 2.0)
- frequency_penalty: float (-2.0 to 2.0)
- n: int — number of completions to generate
- best_of: int — generate this many and return the best
- seed: int — for reproducible outputs
- logit_bias: Dict[str, float]
- user: str
- tools: List of tool definitions (passed through to llama.cpp)
- tool_choice: str ("auto", "none", or specific tool name)
- response_format: Dict — controls output format, e.g.
{"type": "json_object"} or
{"type": "json_schema", "json_schema": {...}}
- stream: bool — NOT supported for encrypted inference; the server
will reject this with HTTP 400. Always use stream=False (default).
- base_url: str (alternative to initializing with router_url)
- security_tier: str ("standard", "high", or "maximum")
Controls hardware routing and security level:
* "standard": general secure inference (GPU)
* "high": sensitive business data (balanced CPU/GPU)
* "maximum": maximum isolation (PHI, classified data — CPU only)
If not specified, server uses default based on model name mapping.
Returns:
A dictionary containing the chat completion response with the following structure:
{
"id": str,
"object": "chat.completion",
"created": int,
"model": str,
"choices": [
{
"index": int,
"message": {
"role": str,
"content": str,
"tool_calls": List[Dict], # present if tools were used
"reasoning_content": str # present for thinking models
# (e.g. Qwen3, DeepSeek-R1);
# contains the model's internal
# chain-of-thought, separate from
# the final answer in "content"
},
"finish_reason": str
}
],
"usage": {
"prompt_tokens": int,
"completion_tokens": int,
"total_tokens": int
},
"_metadata": {
"payload_id": str, # echoes the X-Payload-ID sent with the request
"processed_at": int, # Unix timestamp of server-side processing
"is_encrypted": bool, # always True for this endpoint
"response_status": str, # "success" on success
"security_tier": str, # active tier: "standard", "high", or "maximum"
"memory_protection": { # server-side memory protection capabilities
"platform": str, # e.g. "linux", "windows", "darwin"
"memory_locking": bool, # whether mlock/VirtualLock succeeded
"secure_zeroing": bool, # whether memset-based zeroing is available
"core_dump_prevention": bool # whether core dumps are suppressed
},
"cuda_device": { # privacy-safe GPU info (hashed identifiers)
"available": bool,
"device_hash": str # SHA-256 of device name — not the raw name
}
}
}
Raises:
ValueError: If required parameters are missing or invalid.
ConnectionError: If the connection to the router fails.
Exception: For other errors during the request.
"""
# Extract non-payload kwargs before building the payload dict
base_url = kwargs.pop("base_url", None)
security_tier = kwargs.pop("security_tier", None)
api_key_override = kwargs.pop("api_key", None)
# Use the instance's client unless base_url is explicitly overridden
if base_url is not None:
temp_client = type(self)(
base_url=base_url,
allow_http=self.client.allow_http,
api_key=self.api_key,
secure_memory=self._secure_memory_enabled,
key_dir=self._key_dir,
)
instance = temp_client
else:
instance = self
# Ensure keys are available
await instance._ensure_keys()
# Build payload — api_key is intentionally excluded (sent as Bearer header)
payload = {
"model": model,
"messages": messages,
**kwargs
}
payload_id = str(uuid.uuid4())
request_api_key = api_key_override if api_key_override is not None else instance.api_key
# Send secure request with security tier
response = await instance.client.send_secure_request(payload, payload_id, request_api_key, security_tier)
return response
async def acreate(self, model: str, messages: List[Dict[str, Any]], **kwargs) -> Dict[str, Any]:
"""
Async alias for create() method.
This provides the same functionality as create() but with an explicit
async name, following OpenAI's naming conventions.
Args:
Same as create() method.
Returns:
Same as create() method.
"""
return await self.create(model, messages, **kwargs)