import asyncio
import os
import uuid
from typing import Dict, Any, List, Optional

from .SecureCompletionClient import SecureCompletionClient

# Check if secure memory module is available (used only for the user-facing warning)
try:
    from . import SecureMemory as _  # noqa: F401
    _SECURE_MEMORY_AVAILABLE = True
except ImportError:
    _SECURE_MEMORY_AVAILABLE = False


class SecureChatCompletion:
    """
    OpenAI-compatible secure chat completion client.

    This class provides the same interface as OpenAI's ChatCompletion.create()
    method, but automatically encrypts all requests and decrypts all responses
    for secure communication with the NOMYO Router's
    /v1/chat/secure_completion endpoint.

    Security Features:
    - End-to-end encryption (AES-256-GCM + RSA-OAEP)
    - Secure memory protection (prevents memory swapping and guarantees zeroing)
    - HTTPS enforcement (with optional HTTP for local development)
    - Automatic key management

    Usage:
        ```python
        # Create a client instance
        client = SecureChatCompletion(base_url="https://api.nomyo.ai")

        # Simple chat completion
        response = await client.create(
            model="Qwen/Qwen3-0.6B",
            messages=[
                {"role": "user", "content": "What is the capital of France?"}
            ],
            temperature=0.7
        )

        # With tools
        response = await client.create(
            model="Qwen/Qwen3-0.6B",
            messages=[
                {"role": "user", "content": "What's the weather in Paris?"}
            ],
            tools=[...],
            temperature=0.7
        )
        ```
    """

    def __init__(self, base_url: str = "https://api.nomyo.ai", allow_http: bool = False,
                 api_key: Optional[str] = None, secure_memory: bool = True,
                 key_dir: Optional[str] = None, max_retries: int = 2):
        """
        Initialize the secure chat completion client.

        Args:
            base_url: Base URL of the NOMYO Router (must use HTTPS for production)
                This parameter is named 'base_url' for OpenAI compatibility.
            allow_http: Allow HTTP connections (ONLY for local development,
                never in production)
            api_key: Optional API key for bearer authentication. If provided,
                it will be used for all requests made with this client.
            secure_memory: Enable secure memory protection (default: True).
                When enabled, prevents plaintext payloads from being swapped
                to disk and guarantees memory is zeroed after encryption.
                Set to False for testing or when security is not required.
            key_dir: Directory to load/save RSA keys. If None, ephemeral keys
                are generated in memory for this session only.
            max_retries: Number of retries on retryable errors (429, 500, 502,
                503, 504, network errors). Uses exponential backoff. Default 2.
        """
        self.client = SecureCompletionClient(
            router_url=base_url,
            allow_http=allow_http,
            secure_memory=secure_memory,
            max_retries=max_retries,
        )
        self._keys_initialized = False
        self._keys_lock = asyncio.Lock()
        self.api_key = api_key
        self._key_dir = key_dir
        self._secure_memory_enabled = secure_memory
        # Remembered so that per-call base_url overrides in create() can build
        # a temporary client with the same retry policy as this instance.
        self._max_retries = max_retries

        if secure_memory and not _SECURE_MEMORY_AVAILABLE:
            import warnings
            warnings.warn(
                "Secure memory requested but not available. "
                "Falling back to standard memory handling.",
                UserWarning,
                stacklevel=2
            )

    async def _ensure_keys(self):
        """
        Ensure keys are loaded or generated (concurrency-safe).

        With a ``key_dir`` configured, keys are loaded from
        ``private_key.pem`` / ``public_key.pem`` in that directory; if loading
        fails for any reason a new key pair is generated and saved there.
        Without a ``key_dir``, keys are generated via the client without a
        target directory. The work runs at most once per instance.
        """
        # Fast path: skip the lock entirely once keys are ready.
        if self._keys_initialized:
            return

        async with self._keys_lock:
            if self._keys_initialized:  # double-check after acquiring lock
                return

            if self._key_dir is not None:
                private_key_path = os.path.join(self._key_dir, "private_key.pem")
                public_key_path = os.path.join(self._key_dir, "public_key.pem")
                try:
                    self.client.load_keys(private_key_path, public_key_path)
                except Exception:
                    # Deliberate best-effort fallback: any load failure
                    # (e.g. first run with no key files yet) results in a
                    # fresh key pair being generated and written to key_dir.
                    self.client.generate_keys(save_to_file=True, key_dir=self._key_dir)
            else:
                self.client.generate_keys()

            self._keys_initialized = True

    async def create(self, model: str, messages: List[Dict[str, Any]], **kwargs) -> Dict[str, Any]:
        """
        Creates a new chat completion for the provided messages and parameters.

        This method provides the same interface as OpenAI's ChatCompletion.create()
        but automatically handles encryption and decryption for secure communication.

        Args:
            model: The model to use for the chat completion.
            messages: A list of message objects. Each message has a role
                ("system", "user", or "assistant") and content.
            **kwargs: Additional parameters that can be passed to the API.
                Supported parameters include:
                - temperature: float (0-2)
                - max_tokens: int
                - top_p: float
                - stop: Union[str, List[str]]
                - presence_penalty: float (-2.0 to 2.0)
                - frequency_penalty: float (-2.0 to 2.0)
                - n: int — number of completions to generate
                - best_of: int — generate this many and return the best
                - seed: int — for reproducible outputs
                - logit_bias: Dict[str, float]
                - user: str
                - tools: List of tool definitions (passed through to llama.cpp)
                - tool_choice: str ("auto", "none", or specific tool name)
                - response_format: Dict — controls output format, e.g.
                  {"type": "json_object"} or
                  {"type": "json_schema", "json_schema": {...}}
                - stream: bool — NOT supported for encrypted inference; the
                  server will reject this with HTTP 400. Always use
                  stream=False (default).
                - base_url: str — per-call override of the router URL. A
                  temporary client is created for this call, reusing this
                  instance's allow_http, api_key, secure_memory, key_dir and
                  max_retries settings.
                - api_key: str — per-call override of the instance's API key.
                  It is removed from the payload and handed to the client
                  separately.
                - security_tier: str ("standard", "high", or "maximum")
                  Controls hardware routing and security level:
                  * "standard": general secure inference (GPU)
                  * "high": sensitive business data (balanced CPU/GPU)
                  * "maximum": maximum isolation (PHI, classified data — CPU only)
                  If not specified, server uses default based on model name mapping.

        Returns:
            A dictionary containing the chat completion response with the
            following structure:
            {
                "id": str,
                "object": "chat.completion",
                "created": int,
                "model": str,
                "choices": [
                    {
                        "index": int,
                        "message": {
                            "role": str,
                            "content": str,
                            "tool_calls": List[Dict],  # present if tools were used
                            "reasoning_content": str   # present for thinking models
                                                       # (e.g. Qwen3, DeepSeek-R1);
                                                       # contains the model's internal
                                                       # chain-of-thought, separate from
                                                       # the final answer in "content"
                        },
                        "finish_reason": str
                    }
                ],
                "usage": {
                    "prompt_tokens": int,
                    "completion_tokens": int,
                    "total_tokens": int
                },
                "_metadata": {
                    "payload_id": str,        # echoes the X-Payload-ID sent with the request
                    "processed_at": int,      # Unix timestamp of server-side processing
                    "is_encrypted": bool,     # always True for this endpoint
                    "response_status": str,   # "success" on success
                    "security_tier": str,     # active tier: "standard", "high", or "maximum"
                    "memory_protection": {    # server-side memory protection capabilities
                        "platform": str,              # e.g. "linux", "windows", "darwin"
                        "memory_locking": bool,       # whether mlock/VirtualLock succeeded
                        "secure_zeroing": bool,       # whether memset-based zeroing is available
                        "core_dump_prevention": bool  # whether core dumps are suppressed
                    },
                    "cuda_device": {          # privacy-safe GPU info (hashed identifiers)
                        "available": bool,
                        "device_hash": str    # SHA-256 of device name — not the raw name
                    }
                }
            }

        Raises:
            ValueError: If required parameters are missing or invalid.
            ConnectionError: If the connection to the router fails.
            Exception: For other errors during the request.
        """
        # Extract non-payload kwargs before building the payload dict
        base_url = kwargs.pop("base_url", None)
        security_tier = kwargs.pop("security_tier", None)
        api_key_override = kwargs.pop("api_key", None)

        # Use the instance's client unless base_url is explicitly overridden
        if base_url is not None:
            instance = type(self)(
                base_url=base_url,
                allow_http=self.client.allow_http,
                api_key=self.api_key,
                secure_memory=self._secure_memory_enabled,
                key_dir=self._key_dir,
                # Keep the retry policy consistent with this instance rather
                # than silently reverting to the constructor default.
                max_retries=self._max_retries,
            )
        else:
            instance = self

        # Ensure keys are available
        await instance._ensure_keys()

        # Build payload — api_key is intentionally excluded (sent as Bearer header)
        payload = {
            "model": model,
            "messages": messages,
            **kwargs
        }

        payload_id = str(uuid.uuid4())
        request_api_key = api_key_override if api_key_override is not None else instance.api_key

        # Send secure request with security tier
        return await instance.client.send_secure_request(
            payload, payload_id, request_api_key, security_tier
        )

    async def acreate(self, model: str, messages: List[Dict[str, Any]], **kwargs) -> Dict[str, Any]:
        """
        Async alias for create() method.

        This provides the same functionality as create() but with an explicit
        async name, following OpenAI's naming conventions.

        Args:
            Same as create() method.

        Returns:
            Same as create() method.
        """
        return await self.create(model, messages, **kwargs)