# nomyo/nomyo/nomyo.py

import asyncio
import os
import uuid
from typing import Dict, Any, List, Optional
from .SecureCompletionClient import SecureCompletionClient

# Probe for the optional secure memory module. The resulting flag is used only
# to decide whether to show the user-facing "not available" warning.
_SECURE_MEMORY_AVAILABLE = True
try:
    from . import SecureMemory as _  # noqa: F401
except ImportError:
    # Module could not be imported: remember that so callers can be warned.
    _SECURE_MEMORY_AVAILABLE = False

class SecureChatCompletion:
    """
    OpenAI-compatible secure chat completion client.

    This class provides the same interface as OpenAI's ChatCompletion.create()
    method, but automatically encrypts all requests and decrypts all responses
    for secure communication with the NOMYO Router's /v1/chat/secure_completion
    endpoint.

    Security Features:
    - End-to-end encryption (AES-256-GCM + RSA-OAEP)
    - Secure memory protection (prevents memory swapping and guarantees zeroing)
    - HTTPS enforcement (with optional HTTP for local development)
    - Automatic key management

    Usage:
        ```python
        # Create a client instance
        client = SecureChatCompletion(base_url="https://api.nomyo.ai")

        # Simple chat completion
        response = await client.create(
            model="Qwen/Qwen3-0.6B",
            messages=[
                {"role": "user", "content": "What is the capital of France?"}
            ],
            temperature=0.7
        )

        # With tools
        response = await client.create(
            model="Qwen/Qwen3-0.6B",
            messages=[
                {"role": "user", "content": "What's the weather in Paris?"}
            ],
            tools=[...],
            temperature=0.7
        )
        ```
    """

    def __init__(
        self,
        base_url: str = "https://api.nomyo.ai",
        allow_http: bool = False,
        api_key: Optional[str] = None,
        secure_memory: bool = True,
        key_dir: Optional[str] = None,
        max_retries: int = 2,
    ):
        """
        Initialize the secure chat completion client.

        Args:
            base_url: Base URL of the NOMYO Router (must use HTTPS for production)
                     This parameter is named 'base_url' for OpenAI compatibility.
            allow_http: Allow HTTP connections (ONLY for local development, never in production)
            api_key: Optional API key for bearer authentication. If provided, it will be
                     used for all requests made with this client.
            secure_memory: Enable secure memory protection (default: True).
                          When enabled, prevents plaintext payloads from being swapped to disk
                          and guarantees memory is zeroed after encryption.
                          Set to False for testing or when security is not required.
            key_dir: Directory to load/save RSA keys. If None, ephemeral keys are
                     generated in memory for this session only.
            max_retries: Number of retries on retryable errors (429, 500, 502, 503, 504,
                        network errors). Uses exponential backoff. Default 2.
        """
        self.client = SecureCompletionClient(
            router_url=base_url,
            allow_http=allow_http,
            secure_memory=secure_memory,
            max_retries=max_retries,
        )
        self._keys_initialized = False
        self._keys_lock = asyncio.Lock()
        self.api_key = api_key
        self._key_dir = key_dir
        self._secure_memory_enabled = secure_memory
        # Remembered so that a per-call base_url override in create() can build
        # a temporary client with the same retry policy as this instance.
        self._max_retries = max_retries

        if secure_memory and not _SECURE_MEMORY_AVAILABLE:
            import warnings

            # stacklevel=2 attributes the warning to the code constructing the client.
            warnings.warn(
                "Secure memory requested but not available. "
                "Falling back to standard memory handling.",
                UserWarning,
                stacklevel=2
            )

    async def _ensure_keys(self):
        """
        Ensure keys are loaded or generated (concurrency-safe).

        With a ``key_dir`` configured, ``private_key.pem`` / ``public_key.pem``
        are loaded from that directory; if loading raises, a new key pair is
        generated and saved there. Without a ``key_dir``, keys are generated
        via ``generate_keys()`` with no arguments.
        """
        if self._keys_initialized:
            return
        async with self._keys_lock:
            if self._keys_initialized:  # double-check after acquiring lock
                return
            if self._key_dir is None:
                self.client.generate_keys()
            else:
                private_key_path = os.path.join(self._key_dir, "private_key.pem")
                public_key_path = os.path.join(self._key_dir, "public_key.pem")
                try:
                    self.client.load_keys(private_key_path, public_key_path)
                except Exception:
                    # NOTE(review): any load failure (not only "files missing")
                    # falls through to generating and saving a new key pair in
                    # key_dir. Confirm this cannot clobber existing keys that
                    # merely failed to load, and narrow the exception type once
                    # load_keys' failure modes are known.
                    self.client.generate_keys(save_to_file=True, key_dir=self._key_dir)
            self._keys_initialized = True

    async def create(self, model: str, messages: List[Dict[str, Any]], **kwargs) -> Dict[str, Any]:
        """
        Creates a new chat completion for the provided messages and parameters.

        This method provides the same interface as OpenAI's ChatCompletion.create()
        but automatically handles encryption and decryption for secure communication.

        Args:
            model: The model to use for the chat completion.
            messages: A list of message objects. Each message has a role ("system",
                "user", or "assistant") and content.
            **kwargs: Additional parameters that can be passed to the API.
                Supported parameters include:
                - temperature: float (0-2)
                - max_tokens: int
                - top_p: float
                - stop: Union[str, List[str]]
                - presence_penalty: float (-2.0 to 2.0)
                - frequency_penalty: float (-2.0 to 2.0)
                - n: int — number of completions to generate
                - best_of: int — generate this many and return the best
                - seed: int — for reproducible outputs
                - logit_bias: Dict[str, float]
                - user: str
                - tools: List of tool definitions (passed through to llama.cpp)
                - tool_choice: str ("auto", "none", or specific tool name)
                - response_format: Dict — controls output format, e.g.
                    {"type": "json_object"} or
                    {"type": "json_schema", "json_schema": {...}}
                - stream: bool — NOT supported for encrypted inference; the server
                    will reject this with HTTP 400. Always use stream=False (default).
                - base_url: str (alternative to initializing with router_url).
                    When given, a temporary client is built for this call only,
                    inheriting allow_http, api_key, secure_memory, key_dir and
                    max_retries from this instance.
                - api_key: str — per-request override of the client's API key.
                    It is removed from the payload and handed to the transport
                    separately.
                - security_tier: str ("standard", "high", or "maximum")
                    Controls hardware routing and security level:
                    * "standard": general secure inference (GPU)
                    * "high": sensitive business data (balanced CPU/GPU)
                    * "maximum": maximum isolation (PHI, classified data — CPU only)
                    If not specified, server uses default based on model name mapping.

        Returns:
            A dictionary containing the chat completion response with the following structure:
            {
                "id": str,
                "object": "chat.completion",
                "created": int,
                "model": str,
                "choices": [
                    {
                        "index": int,
                        "message": {
                            "role": str,
                            "content": str,
                            "tool_calls": List[Dict],       # present if tools were used
                            "reasoning_content": str        # present for thinking models
                                                            # (e.g. Qwen3, DeepSeek-R1);
                                                            # contains the model's internal
                                                            # chain-of-thought, separate from
                                                            # the final answer in "content"
                        },
                        "finish_reason": str
                    }
                ],
                "usage": {
                    "prompt_tokens": int,
                    "completion_tokens": int,
                    "total_tokens": int
                },
                "_metadata": {
                    "payload_id": str,          # echoes the X-Payload-ID sent with the request
                    "processed_at": int,        # Unix timestamp of server-side processing
                    "is_encrypted": bool,       # always True for this endpoint
                    "response_status": str,     # "success" on success
                    "security_tier": str,       # active tier: "standard", "high", or "maximum"
                    "memory_protection": {      # server-side memory protection capabilities
                        "platform": str,        # e.g. "linux", "windows", "darwin"
                        "memory_locking": bool, # whether mlock/VirtualLock succeeded
                        "secure_zeroing": bool, # whether memset-based zeroing is available
                        "core_dump_prevention": bool  # whether core dumps are suppressed
                    },
                    "cuda_device": {            # privacy-safe GPU info (hashed identifiers)
                        "available": bool,
                        "device_hash": str      # SHA-256 of device name — not the raw name
                    }
                }
            }

        Raises:
            ValueError: If required parameters are missing or invalid.
            ConnectionError: If the connection to the router fails.
            Exception: For other errors during the request.
        """
        # Extract non-payload kwargs before building the payload dict
        base_url = kwargs.pop("base_url", None)
        security_tier = kwargs.pop("security_tier", None)
        api_key_override = kwargs.pop("api_key", None)

        # Use the instance's client unless base_url is explicitly overridden.
        # The temporary client mirrors this instance's settings, including the
        # retry policy, so an override only changes the target URL.
        if base_url is None:
            instance = self
        else:
            instance = type(self)(
                base_url=base_url,
                allow_http=self.client.allow_http,
                api_key=self.api_key,
                secure_memory=self._secure_memory_enabled,
                key_dir=self._key_dir,
                max_retries=self._max_retries,
            )

        # Ensure keys are available
        await instance._ensure_keys()

        # Build payload — api_key is intentionally excluded (sent as Bearer header)
        payload = {
            "model": model,
            "messages": messages,
            **kwargs
        }

        payload_id = str(uuid.uuid4())
        # An explicit per-request key wins over the client's configured key.
        request_api_key = api_key_override if api_key_override is not None else instance.api_key

        # Send secure request with security tier
        return await instance.client.send_secure_request(
            payload, payload_id, request_api_key, security_tier
        )

    async def acreate(self, model: str, messages: List[Dict[str, Any]], **kwargs) -> Dict[str, Any]:
        """
        Async alias for create() method.

        This provides the same functionality as create() but with an explicit
        async name, following OpenAI's naming conventions.

        Args:
            Same as create() method.

        Returns:
            Same as create() method.
        """
        return await self.create(model, messages, **kwargs)