feat: add llama-swap as a backend

2026-06-14 16:34:31 +02:00 · 2026-06-14 16:34:31 +02:00 · aa8baebac5
commit aa8baebac5
parent c8da58430a
17 changed files with 544 additions and 52 deletions
--- a/config.py
+++ b/config.py
@@ -23,6 +23,10 @@ class Config(BaseSettings):
    )
    # List of llama-server endpoints (OpenAI-compatible with /v1/models status info)
    llama_server_endpoints: List[str] = Field(default_factory=list)
+    # List of llama-swap endpoints (OpenAI-compatible front for multiple llama-server
+    # workers). Same surface as llama_server_endpoints, but loaded models are read from
+    # /running (not /v1/models status) and unload uses POST /api/models/unload/:model_id.
+    llama_swap_endpoints: List[str] = Field(default_factory=list)
    # Max concurrent connections per endpoint‑model pair, see OLLAMA_NUM_PARALLEL
    max_concurrent_connections: int = 1
    # Per-endpoint overrides: {endpoint_url: {max_concurrent_connections: N}}