Merge pull request #25 from nomyo-ai/dev-v0.6

- Updated reasoning handling
- Improved model and error caches
- Fixed OpenAI tool calling, including the Ollama translations
- Direct support for llama.cpp's llama-server via the llama_server_endpoints config option
- Basic llama-server model info in the dashboard
- Improved endpoint info fetching behaviour in error cases

Commit 9ef1b770ba
3 changed files with 528 additions and 164 deletions
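For context: the router keeps exposing the Ollama-style HTTP API, so an existing Ollama client can be pointed at it unchanged, and a model that only a llama-server endpoint advertises is now routed there as well. A minimal sketch of exercising that path from Python; the router address and the model name are illustrative assumptions, not values taken from this PR:

import ollama

# Point a stock Ollama client at the NOMYO Router instead of a single Ollama server.
client = ollama.Client(host="http://localhost:8000")  # assumed router address

# The model name must match something one of the configured endpoints advertises,
# e.g. a GGUF model served by an endpoint listed under llama_server_endpoints.
reply = client.chat(
    model="gpt-oss-20b-GGUF",  # assumed model name
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(reply.message.content)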
config.yaml
@@ -1,40 +1,32 @@
-# Sample NOMYO Router Configuration
+# config.yaml
+# Ollama endpoints
-# Basic single endpoint configuration
 endpoints:
-  - http://localhost:11434
+  - http://192.168.0.50:11434
+  - http://192.168.0.51:11434
+  - http://192.168.0.52:11434
+  # External OpenAI-compatible endpoints (will NOT be queried for /api/ps /api/ps_details)
+  - https://api.openai.com/v1

+# llama-server endpoints (OpenAI-compatible with /v1/models status info)
+# These endpoints will be queried for /api/tags, /api/ps, /api/ps_details
+# and included in the model selection pool for inference routing
+llama_server_endpoints:
+  - http://localhost:8080/v1
+  - http://localhost:8081/v1

+# Maximum concurrent connections *per endpoint‑model pair* (equals to OLLAMA_NUM_PARALLEL)
 max_concurrent_connections: 2

-# Optional router-level API key to secure the router and dashboard (leave blank to disable)
+# Optional router-level API key that gates router/API/web UI access (leave empty to disable)
 nomyo-router-api-key: ""

-# Multi-endpoint configuration with local Ollama instances
-# endpoints:
-#   - http://ollama-worker1:11434
-#   - http://ollama-worker2:11434
-#   - http://ollama-worker3:11434
-
-# Mixed configuration with Ollama and OpenAI endpoints
-# endpoints:
-#   - http://localhost:11434
-#   - https://api.openai.com/v1

 # API keys for remote endpoints
-# Use ${VAR_NAME} syntax to reference environment variables
+# Set an environment variable like OPENAI_KEY
+# Confirm endpoints are exactly as in endpoints block
 api_keys:
-  # Local Ollama instances typically don't require authentication
-  "http://localhost:11434": "ollama"
-  # Remote Ollama instances
-  # "http://remote-ollama:11434": "ollama"
-  # OpenAI API
-  # "https://api.openai.com/v1": "${OPENAI_KEY}"
-
-  # Anthropic API
-  # "https://api.anthropic.com/v1": "${ANTHROPIC_KEY}"
-
-  # Other OpenAI-compatible endpoints
-  # "https://api.mistral.ai/v1": "${MISTRAL_KEY}"
+  "http://192.168.0.50:11434": "ollama"
+  "http://192.168.0.51:11434": "ollama"
+  "http://192.168.0.52:11434": "ollama"
+  "https://api.openai.com/v1": "${OPENAI_KEY}"
+  "http://localhost:8080/v1": "llama-server"  # Optional API key for llama-server
+  "http://localhost:8081/v1": "llama-server"
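The new llama_server_endpoints block is optional; in router.py the Config model defaults it to an empty list, and api_keys entries for llama-server endpoints are likewise optional. A rough sketch of reading such a file into a similar pydantic model, for illustration only (a simplified stand-in using PyYAML, not the router's actual loader):

from typing import Dict, List

import yaml  # PyYAML, assumed available
from pydantic import BaseModel, Field

class RouterConfig(BaseModel):
    # Simplified stand-in for the Config(BaseSettings) class defined in router.py.
    endpoints: List[str] = Field(default_factory=lambda: ["http://localhost:11434"])
    llama_server_endpoints: List[str] = Field(default_factory=list)
    max_concurrent_connections: int = 1
    api_keys: Dict[str, str] = Field(default_factory=dict)

with open("config.yaml") as fh:
    raw = yaml.safe_load(fh) or {}

cfg = RouterConfig(
    endpoints=raw.get("endpoints", []),
    llama_server_endpoints=raw.get("llama_server_endpoints", []),
    max_concurrent_connections=raw.get("max_concurrent_connections", 1),
    api_keys=raw.get("api_keys", {}),
)
print(cfg.llama_server_endpoints)  # ['http://localhost:8080/v1', 'http://localhost:8081/v1']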
router.py (533 changed lines)
@@ -34,16 +34,18 @@ from PIL import Image
 # Successful results are cached for 300s
 _models_cache: dict[str, tuple[Set[str], float]] = {}
 _loaded_models_cache: dict[str, tuple[Set[str], float]] = {}
-# Transient errors are cached for 1s – the key stays until the
-# timeout expires, after which the endpoint will be queried again.
-_error_cache: dict[str, float] = {}
+# Transient errors are cached separately per concern so that a failure
+# in one path does not poison the other.
+_available_error_cache: dict[str, float] = {}
+_loaded_error_cache: dict[str, float] = {}

 # ------------------------------------------------------------------
 # Cache locks
 # ------------------------------------------------------------------
 _models_cache_lock = asyncio.Lock()
 _loaded_models_cache_lock = asyncio.Lock()
-_error_cache_lock = asyncio.Lock()
+_available_error_cache_lock = asyncio.Lock()
+_loaded_error_cache_lock = asyncio.Lock()

 # ------------------------------------------------------------------
 # In-flight request tracking (prevents cache stampede)

@@ -106,6 +108,8 @@ class Config(BaseSettings):
             "http://localhost:11434",
         ]
     )
+    # List of llama-server endpoints (OpenAI-compatible with /v1/models status info)
+    llama_server_endpoints: List[str] = Field(default_factory=list)
     # Max concurrent connections per endpoint‑model pair, see OLLAMA_NUM_PARALLEL
     max_concurrent_connections: int = 1

@@ -254,13 +258,19 @@ async def enforce_router_api_key(request: Request, call_next):
     # Strip the api_key query param from scope so access logs do not leak it
     _strip_api_key_from_scope(request)
     if provided_key is None:
-        response = await call_next(request)
-        # Add CORS headers for API key authentication requests
+        # No key provided but authentication is required - return 401
+        headers = {}
         if "/api/" in path and path != "/api/usage-stream":
-            response.headers["Access-Control-Allow-Origin"] = "*"
-            response.headers["Access-Control-Allow-Headers"] = "Authorization, Content-Type"
-            response.headers["Access-Control-Allow-Methods"] = "GET, POST, PUT, DELETE, OPTIONS"
-        return response
+            headers = {
+                "Access-Control-Allow-Origin": "*",
+                "Access-Control-Allow-Headers": "Authorization, Content-Type",
+                "Access-Control-Allow-Methods": "GET, POST, PUT, DELETE, OPTIONS",
+            }
+        return JSONResponse(
+            content={"detail": "Missing NOMYO Router API key"},
+            status_code=401,
+            headers=headers,
+        )

     if not secrets.compare_digest(str(provided_key), str(expected_key)):
         return JSONResponse(

@@ -339,7 +349,50 @@ def _format_connection_issue(url: str, error: Exception) -> str:

     return f"Error while contacting {url}: {error}"

+def _normalize_llama_model_name(name: str) -> str:
+    """Extract the model name from a huggingface-style identifier.
+    e.g. 'unsloth/gpt-oss-20b-GGUF:F16' -> 'gpt-oss-20b-GGUF'
+    """
+    if "/" in name:
+        name = name.rsplit("/", 1)[1]
+    if ":" in name:
+        name = name.split(":")[0]
+    return name
+
+def _extract_llama_quant(name: str) -> str:
+    """Extract the quantization level from a huggingface-style identifier.
+    e.g. 'unsloth/gpt-oss-20b-GGUF:Q8_0' -> 'Q8_0'
+    Returns empty string if no quant suffix is present.
+    """
+    if ":" in name:
+        return name.rsplit(":", 1)[1]
+    return ""
+
+def _is_llama_model_loaded(item: dict) -> bool:
+    """Return True if a llama-server /v1/models item has status 'loaded'.
+    Handles both dict format ({"value": "loaded"}) and plain string ("loaded")."""
+    status = item.get("status")
+    if isinstance(status, dict):
+        return status.get("value") == "loaded"
+    if isinstance(status, str):
+        return status == "loaded"
+    return False
+
 def is_ext_openai_endpoint(endpoint: str) -> bool:
+    """
+    Determine if an endpoint is an external OpenAI-compatible endpoint (not Ollama or llama-server).
+
+    Returns True for:
+    - External services like OpenAI.com, Groq, etc.
+
+    Returns False for:
+    - Ollama endpoints (without /v1, or with /v1 but default port 11434)
+    - llama-server endpoints (explicitly configured in llama_server_endpoints)
+    """
+    # Check if it's a llama-server endpoint (has /v1 and is in the configured list)
+    if endpoint in config.llama_server_endpoints:
+        return False
+
     if "/v1" not in endpoint:
         return False

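The three llama-server helpers added above are small pure functions, so their behaviour is easy to verify in isolation. A quick sketch, assuming router.py can be imported in your environment (it loads its configuration at import time) and using a hypothetical /v1/models item shaped like the one the docstrings describe:

from router import _extract_llama_quant, _is_llama_model_loaded, _normalize_llama_model_name

# Hypothetical llama-server model entry; the id and status values are assumptions.
item = {"id": "unsloth/gpt-oss-20b-GGUF:Q8_0", "status": {"value": "loaded"}}

assert _normalize_llama_model_name(item["id"]) == "gpt-oss-20b-GGUF"
assert _extract_llama_quant(item["id"]) == "Q8_0"
assert _is_llama_model_loaded(item) is True
assert _is_llama_model_loaded({"id": "x", "status": "unloaded"}) is False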
@@ -353,6 +406,13 @@ def is_ext_openai_endpoint(endpoint: str) -> bool:

     return True  # It's an external OpenAI endpoint

+def is_openai_compatible(endpoint: str) -> bool:
+    """
+    Return True if the endpoint speaks the OpenAI API (not native Ollama).
+    This includes external OpenAI endpoints AND llama-server endpoints.
+    """
+    return "/v1" in endpoint or endpoint in config.llama_server_endpoints
+
 async def token_worker() -> None:
     try:
         while True:

@@ -508,7 +568,10 @@ class fetch:
         if api_key is not None:
             headers = {"Authorization": "Bearer " + api_key}

-        if "/v1" in endpoint:
+        if endpoint in config.llama_server_endpoints and "/v1" not in endpoint:
+            endpoint_url = f"{endpoint}/v1/models"
+            key = "data"
+        elif "/v1" in endpoint or endpoint in config.llama_server_endpoints:
             endpoint_url = f"{endpoint}/models"
             key = "data"
         else:

@@ -533,10 +596,21 @@ class fetch:
             message = _format_connection_issue(endpoint_url, e)
             print(f"[fetch.available_models] {message}")
             # Update error cache with lock protection
-            async with _error_cache_lock:
-                _error_cache[endpoint] = time.time()
+            async with _available_error_cache_lock:
+                _available_error_cache[endpoint] = time.time()
             return set()

+    async def _refresh_available_models(endpoint: str, api_key: Optional[str] = None) -> None:
+        """
+        Background task to refresh available models cache without blocking the caller.
+        Used for stale-while-revalidate pattern.
+        """
+        try:
+            await fetch._fetch_available_models_internal(endpoint, api_key)
+        except Exception as e:
+            # Silently fail - cache will remain stale but functional
+            print(f"[fetch._refresh_available_models] Background refresh failed for {endpoint}: {e}")
+
     async def available_models(endpoint: str, api_key: Optional[str] = None) -> Set[str]:
         """
         Query <endpoint>/api/tags and return a set of all model names that the

@@ -547,6 +621,10 @@ class fetch:
         Uses request coalescing to prevent cache stampede: if multiple requests
         arrive when cache is expired, only one actual HTTP request is made.

+        Uses stale-while-revalidate: when the cache is between 300-600s old,
+        the stale data is returned immediately while a background refresh runs.
+        This prevents model blackouts caused by transient timeouts.
+
         If the request fails (e.g. timeout, 5xx, or malformed response), an empty
         set is returned.
         """

@@ -554,19 +632,27 @@ class fetch:
         async with _models_cache_lock:
             if endpoint in _models_cache:
                 models, cached_at = _models_cache[endpoint]
+
+                # FRESH: < 300s old - return immediately
                 if _is_fresh(cached_at, 300):
                     return models
-                # Stale entry - remove it
+
+                # STALE: 300-600s old - return stale data and refresh in background
+                if _is_fresh(cached_at, 600):
+                    asyncio.create_task(fetch._refresh_available_models(endpoint, api_key))
+                    return models  # Return stale data immediately
+
+                # EXPIRED: > 600s old - too stale, must refresh synchronously
                 del _models_cache[endpoint]

         # Check error cache with lock protection
-        async with _error_cache_lock:
-            if endpoint in _error_cache:
-                if _is_fresh(_error_cache[endpoint], 10):
+        async with _available_error_cache_lock:
+            if endpoint in _available_error_cache:
+                if _is_fresh(_available_error_cache[endpoint], 30):
                     # Still within the short error TTL – pretend nothing is available
                     return set()
                 # Error expired – remove it
-                del _error_cache[endpoint]
+                del _available_error_cache[endpoint]

         # Request coalescing: check if another request is already fetching this endpoint
         async with _inflight_lock:
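Taken together, the availability cache above has three tiers per endpoint: entries younger than 300 s are served directly, entries between 300 s and 600 s are served stale while a background refresh is scheduled, and older entries are dropped and refetched synchronously; failed fetches are backed off for 30 s via the error cache. A standalone sketch of that decision logic with the same thresholds (the _is_fresh semantics here are inferred from its call sites, not copied from router.py):

import time

FRESH_TTL = 300   # serve the cached set directly
STALE_TTL = 600   # serve the cached set, refresh in the background
ERROR_TTL = 30    # back off after a failed fetch

def _is_fresh(cached_at: float, ttl: float) -> bool:
    # Assumed to mirror the _is_fresh() helper used throughout router.py.
    return (time.time() - cached_at) < ttl

def cache_decision(cached_at: float) -> str:
    """Classify a cache entry the same way available_models() does."""
    if _is_fresh(cached_at, FRESH_TTL):
        return "fresh: return cached models"
    if _is_fresh(cached_at, STALE_TTL):
        return "stale: return cached models and schedule a background refresh"
    return "expired: drop the entry and fetch synchronously"

print(cache_decision(time.time() - 450))  # falls into the stale tier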
@@ -593,8 +679,39 @@ class fetch:
         """
         Internal function that performs the actual HTTP request to fetch loaded models.
         This is called by loaded_models() after checking caches and in-flight requests.
+
+        For Ollama endpoints: queries /api/ps and returns model names
+        For llama-server endpoints: queries /v1/models and filters for status.value == "loaded"
         """
         client: aiohttp.ClientSession = app_state["session"]
+
+        # Check if this is a llama-server endpoint
+        if endpoint in config.llama_server_endpoints:
+            # Query /v1/models for llama-server
+            try:
+                async with client.get(f"{endpoint}/models") as resp:
+                    await _ensure_success(resp)
+                    data = await resp.json()
+
+                # Filter for loaded models only
+                items = data.get("data", [])
+                models = {
+                    item.get("id")
+                    for item in items
+                    if item.get("id") and _is_llama_model_loaded(item)
+                }
+
+                # Update cache with lock protection
+                async with _loaded_models_cache_lock:
+                    _loaded_models_cache[endpoint] = (models, time.time())
+                return models
+            except Exception as e:
+                # If anything goes wrong we simply assume the endpoint has no models
+                message = _format_connection_issue(f"{endpoint}/models", e)
+                print(f"[fetch.loaded_models] {message}")
+                return set()
+        else:
+            # Original Ollama /api/ps logic
         try:
             async with client.get(f"{endpoint}/api/ps") as resp:
                 await _ensure_success(resp)

@@ -642,7 +759,7 @@ class fetch:
                 models, cached_at = _loaded_models_cache[endpoint]

                 # FRESH: < 10s old - return immediately
-                if _is_fresh(cached_at, 10):
+                if _is_fresh(cached_at, 30):
                     return models

                 # STALE: 10-60s old - return stale data and refresh in background

@@ -655,12 +772,12 @@ class fetch:
                 del _loaded_models_cache[endpoint]

         # Check error cache with lock protection
-        async with _error_cache_lock:
-            if endpoint in _error_cache:
-                if _is_fresh(_error_cache[endpoint], 10):
+        async with _loaded_error_cache_lock:
+            if endpoint in _loaded_error_cache:
+                if _is_fresh(_loaded_error_cache[endpoint], 30):
                     return set()
                 # Error expired - remove it
-                del _error_cache[endpoint]
+                del _loaded_error_cache[endpoint]

         # Request coalescing: check if another request is already fetching this endpoint
         async with _inflight_lock:

@@ -682,11 +799,22 @@ class fetch:
             if _inflight_loaded_models.get(endpoint) == task:
                 _inflight_loaded_models.pop(endpoint, None)

-    async def endpoint_details(endpoint: str, route: str, detail: str, api_key: Optional[str] = None) -> List[dict]:
+    async def endpoint_details(endpoint: str, route: str, detail: str, api_key: Optional[str] = None, skip_error_cache: bool = False) -> List[dict]:
         """
         Query <endpoint>/<route> to fetch <detail> and return a List of dicts with details
         for the corresponding Ollama endpoint. If the request fails we respond with "N/A" for detail.
+
+        When ``skip_error_cache`` is False (the default), the call is short-circuited
+        if the endpoint recently failed (recorded in ``_available_error_cache``).
+        Pass ``skip_error_cache=True`` from health-check routes that must always probe.
         """
+        # Fast-fail if the endpoint is known to be down (unless caller opts out)
+        if not skip_error_cache:
+            async with _available_error_cache_lock:
+                if endpoint in _available_error_cache:
+                    if _is_fresh(_available_error_cache[endpoint], 30):
+                        return []
+
         client: aiohttp.ClientSession = app_state["session"]
         headers = None
         if api_key is not None:

@@ -703,6 +831,9 @@ class fetch:
             # If anything goes wrong we cannot reply details
             message = _format_connection_issue(request_url, e)
             print(f"[fetch.endpoint_details] {message}")
+            # Record failure so subsequent calls skip this endpoint briefly
+            async with _available_error_cache_lock:
+                _available_error_cache[endpoint] = time.time()
             return []

 def ep2base(ep):

@@ -749,13 +880,13 @@ async def _make_chat_request(endpoint: str, model: str, messages: list, tools=No
     Helper function to make a chat request to a specific endpoint.
     Handles endpoint selection, client creation, usage tracking, and request execution.
     """
-    is_openai_endpoint = "/v1" in endpoint
-    if is_openai_endpoint:
+    use_openai = is_openai_compatible(endpoint)
+    if use_openai:
         if ":latest" in model:
             model = model.split(":latest")[0]
         if messages:
             messages = transform_images_to_data_urls(messages)
+            messages = transform_tool_calls_to_openai(messages)
         params = {
             "messages": messages,
             "model": model,

@@ -774,21 +905,23 @@ async def _make_chat_request(endpoint: str, model: str, messages: list, tools=No
             "response_format": {"type": "json_schema", "json_schema": format} if format is not None else None
         }
         params.update({k: v for k, v in optional_params.items() if v is not None})
-        oclient = openai.AsyncOpenAI(base_url=endpoint, default_headers=default_headers, api_key=config.api_keys[endpoint])
+        oclient = openai.AsyncOpenAI(base_url=ep2base(endpoint), default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
     else:
         client = ollama.AsyncClient(host=endpoint)

     await increment_usage(endpoint, model)

     try:
-        if is_openai_endpoint:
+        if use_openai:
             start_ts = time.perf_counter()
             response = await oclient.chat.completions.create(**params)
             if stream:
                 # For streaming, we need to collect all chunks
                 chunks = []
+                tc_acc = {}  # accumulate tool-call deltas
                 async for chunk in response:
                     chunks.append(chunk)
+                    _accumulate_openai_tc_delta(chunk, tc_acc)
                     if chunk.usage is not None:
                         prompt_tok = chunk.usage.prompt_tokens or 0
                         comp_tok = chunk.usage.completion_tokens or 0

@@ -797,6 +930,9 @@ async def _make_chat_request(endpoint: str, model: str, messages: list, tools=No
                 # Convert to Ollama format
                 if chunks:
                     response = rechunk.openai_chat_completion2ollama(chunks[-1], stream, start_ts)
+                    # Inject fully-accumulated tool calls into the final response
+                    if tc_acc and response.message:
+                        response.message.tool_calls = _build_ollama_tool_calls(tc_acc)
             else:
                 prompt_tok = response.usage.prompt_tokens or 0
                 comp_tok = response.usage.completion_tokens or 0

@@ -950,6 +1086,39 @@ def resize_image_if_needed(image_data):
         print(f"Error processing image: {e}")
         return None

+def transform_tool_calls_to_openai(message_list):
+    """
+    Ensure tool_calls in assistant messages conform to the OpenAI format:
+    - Each tool call must have "type": "function"
+    - Each tool call must have an "id"
+    - arguments must be a JSON string, not a dict
+    Also ensure tool-role messages have a tool_call_id.
+    """
+    # Track generated IDs so tool-role messages can reference them
+    last_tool_call_ids = {}
+    for msg in message_list:
+        role = msg.get("role")
+        if role == "assistant" and "tool_calls" in msg:
+            for tc in msg["tool_calls"]:
+                if "type" not in tc:
+                    tc["type"] = "function"
+                if "id" not in tc:
+                    tc["id"] = f"call_{secrets.token_hex(16)}"
+                func = tc.get("function", {})
+                if isinstance(func.get("arguments"), dict):
+                    func["arguments"] = orjson.dumps(func["arguments"]).decode("utf-8")
+                # Remember the id for the following tool-role message
+                name = func.get("name")
+                if name:
+                    last_tool_call_ids[name] = tc["id"]
+        elif role == "tool":
+            if "tool_call_id" not in msg:
+                # Try to match by name from a preceding assistant tool_call
+                name = msg.get("name") or msg.get("tool_name")
+                if name and name in last_tool_call_ids:
+                    msg["tool_call_id"] = last_tool_call_ids.pop(name)
+    return message_list
+
 def transform_images_to_data_urls(message_list):
     for message in message_list:
         if "images" in message:
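transform_tool_calls_to_openai() is applied right before a chat request is handed to an OpenAI-compatible endpoint. A short sketch of the normalization it performs on an Ollama-style history, assuming router.py is importable; the weather tool call is an invented example:

from router import transform_tool_calls_to_openai

history = [
    {
        "role": "assistant",
        "tool_calls": [
            # Ollama-style call: no "type", no "id", arguments given as a dict
            {"function": {"name": "get_weather", "arguments": {"city": "Berlin"}}}
        ],
    },
    {"role": "tool", "name": "get_weather", "content": "14 degrees, cloudy"},
]

normalized = transform_tool_calls_to_openai(history)
call = normalized[0]["tool_calls"][0]
assert call["type"] == "function" and "id" in call
assert isinstance(call["function"]["arguments"], str)  # JSON string now
assert normalized[1]["tool_call_id"] == call["id"]     # tool reply linked to the call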
@@ -977,6 +1146,51 @@ def transform_images_to_data_urls(message_list):

     return message_list

+def _accumulate_openai_tc_delta(chunk, accumulator: dict) -> None:
+    """Accumulate tool_call deltas from a single OpenAI streaming chunk.
+
+    ``accumulator`` is a dict mapping tool-call *index* to
+    ``{"id": str, "name": str, "arguments": str}`` where ``arguments``
+    is the concatenation of all JSON fragments seen so far.
+    """
+    if not chunk.choices:
+        return
+    delta = chunk.choices[0].delta
+    tc_deltas = getattr(delta, "tool_calls", None)
+    if not tc_deltas:
+        return
+    for tc in tc_deltas:
+        idx = tc.index
+        if idx not in accumulator:
+            accumulator[idx] = {
+                "id": getattr(tc, "id", None) or f"call_{secrets.token_hex(16)}",
+                "name": tc.function.name if tc.function else None,
+                "arguments": "",
+            }
+        else:
+            if getattr(tc, "id", None):
+                accumulator[idx]["id"] = tc.id
+            if tc.function and tc.function.name:
+                accumulator[idx]["name"] = tc.function.name
+            if tc.function and tc.function.arguments:
+                accumulator[idx]["arguments"] += tc.function.arguments
+
+def _build_ollama_tool_calls(accumulator: dict) -> list | None:
+    """Convert accumulated tool-call data into Ollama-format tool_calls list."""
+    if not accumulator:
+        return None
+    result = []
+    for idx in sorted(accumulator.keys()):
+        tc = accumulator[idx]
+        try:
+            args = orjson.loads(tc["arguments"]) if tc["arguments"] else {}
+        except (orjson.JSONDecodeError, TypeError):
+            args = {}
+        result.append(ollama.Message.ToolCall(
+            function=ollama.Message.ToolCall.Function(name=tc["name"], arguments=args)
+        ))
+    return result
+
 class rechunk:
     def openai_chat_completion2ollama(chunk: dict, stream: bool, start_ts: float) -> ollama.ChatResponse:
         now = time.perf_counter()
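The two helpers above are intended to be used as a pair while consuming an OpenAI stream: feed every chunk to _accumulate_openai_tc_delta() and convert the result once at the end with _build_ollama_tool_calls(). A sketch using SimpleNamespace stand-ins for the OpenAI chunk objects (the attribute names follow what the helper reads; router.py is assumed importable):

from types import SimpleNamespace as NS

from router import _accumulate_openai_tc_delta, _build_ollama_tool_calls

def fake_chunk(tool_calls):
    # Minimal stand-in for an OpenAI streaming chunk; only the fields the helper touches.
    return NS(choices=[NS(delta=NS(tool_calls=tool_calls))])

acc = {}
# First delta carries the id and the function name, later deltas carry argument fragments.
_accumulate_openai_tc_delta(fake_chunk([NS(index=0, id="call_123", function=NS(name="get_weather", arguments=""))]), acc)
_accumulate_openai_tc_delta(fake_chunk([NS(index=0, id=None, function=NS(name=None, arguments='{"city": '))]), acc)
_accumulate_openai_tc_delta(fake_chunk([NS(index=0, id=None, function=NS(name=None, arguments='"Berlin"}'))]), acc)

tool_calls = _build_ollama_tool_calls(acc)
print(tool_calls[0].function.name, tool_calls[0].function.arguments)  # get_weather {'city': 'Berlin'}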
@@ -992,24 +1206,41 @@ class rechunk:
             prompt_eval_duration=int((now - start_ts) * 1_000_000_000 * (chunk.usage.prompt_tokens / chunk.usage.completion_tokens / 100)),
             eval_count=int(chunk.usage.completion_tokens),
             eval_duration=int((now - start_ts) * 1_000_000_000),
-            message={"role": "assistant"}
+            message=ollama.Message(role="assistant", content=""),
         )
         with_thinking = chunk.choices[0] if chunk.choices[0] else None
         if stream == True:
-            thinking = getattr(with_thinking.delta, "reasoning", None) if with_thinking else None
+            thinking = (getattr(with_thinking.delta, "reasoning_content", None) or getattr(with_thinking.delta, "reasoning", None)) if with_thinking else None
             role = chunk.choices[0].delta.role or "assistant"
             content = chunk.choices[0].delta.content or ''
         else:
-            thinking = getattr(with_thinking, "reasoning", None) if with_thinking else None
+            thinking = (getattr(with_thinking.message, "reasoning_content", None) or getattr(with_thinking.message, "reasoning", None)) if with_thinking else None
             role = chunk.choices[0].message.role or "assistant"
             content = chunk.choices[0].message.content or ''
+        # Convert OpenAI tool_calls to Ollama format
+        # In streaming mode, tool_calls arrive as partial deltas across multiple chunks
+        # (name only in first delta, arguments as incremental JSON fragments).
+        # Callers must accumulate deltas and inject the final result; skip here.
+        ollama_tool_calls = None
+        if not stream:
+            raw_tool_calls = getattr(with_thinking.message, "tool_calls", None) if with_thinking else None
+            if raw_tool_calls:
+                ollama_tool_calls = []
+                for tc in raw_tool_calls:
+                    try:
+                        args = orjson.loads(tc.function.arguments) if isinstance(tc.function.arguments, str) else (tc.function.arguments or {})
+                    except (orjson.JSONDecodeError, TypeError):
+                        args = {}
+                    ollama_tool_calls.append(ollama.Message.ToolCall(
+                        function=ollama.Message.ToolCall.Function(name=tc.function.name, arguments=args)
+                    ))
         assistant_msg = ollama.Message(
             role=role,
             content=content,
             thinking=thinking,
             images=None,
             tool_name=None,
-            tool_calls=None)
+            tool_calls=ollama_tool_calls)
         rechunk = ollama.ChatResponse(
             model=chunk.model,
             created_at=iso8601_ns(),

@@ -1134,35 +1365,40 @@ async def choose_endpoint(model: str) -> str:
     6️⃣ If no endpoint advertises the model at all, raise an error.
     """
     # 1️⃣ Gather advertised‑model sets for all endpoints concurrently
-    tag_tasks = [fetch.available_models(ep) for ep in config.endpoints if "/v1" not in ep]
-    tag_tasks += [fetch.available_models(ep, config.api_keys[ep]) for ep in config.endpoints if "/v1" in ep]
+    # Include both config.endpoints and config.llama_server_endpoints
+    llama_eps_extra = [ep for ep in config.llama_server_endpoints if ep not in config.endpoints]
+    all_endpoints = config.endpoints + llama_eps_extra
+
+    tag_tasks = [fetch.available_models(ep) for ep in config.endpoints if not is_openai_compatible(ep)]
+    tag_tasks += [fetch.available_models(ep, config.api_keys.get(ep)) for ep in config.endpoints if is_openai_compatible(ep)]
+    tag_tasks += [fetch.available_models(ep, config.api_keys.get(ep)) for ep in llama_eps_extra]
    advertised_sets = await asyncio.gather(*tag_tasks)

     # 2️⃣ Filter endpoints that advertise the requested model
     candidate_endpoints = [
-        ep for ep, models in zip(config.endpoints, advertised_sets)
+        ep for ep, models in zip(all_endpoints, advertised_sets)
         if model in models
     ]

     # 6️⃣
     if not candidate_endpoints:
-        if ":latest" in model: #ollama naming convention not applicable to openai
+        if ":latest" in model: #ollama naming convention not applicable to openai/llama-server
             model_without_latest = model.split(":latest")[0]
             candidate_endpoints = [
-                ep for ep, models in zip(config.endpoints, advertised_sets)
-                if model_without_latest in models and is_ext_openai_endpoint(ep)
+                ep for ep, models in zip(all_endpoints, advertised_sets)
+                if model_without_latest in models and (is_ext_openai_endpoint(ep) or ep in config.llama_server_endpoints)
             ]
         if not candidate_endpoints:
             # Only add :latest suffix if model doesn't already have a version suffix
             if ":" not in model:
                 model = model + ":latest"
             candidate_endpoints = [
-                ep for ep, models in zip(config.endpoints, advertised_sets)
+                ep for ep, models in zip(all_endpoints, advertised_sets)
                 if model in models
             ]
             if not candidate_endpoints:
                 raise RuntimeError(
-                    f"None of the configured endpoints ({', '.join(config.endpoints)}) "
+                    f"None of the configured endpoints ({', '.join(all_endpoints)}) "
                     f"advertise the model '{model}'."
                 )
     # 3️⃣ Among the candidates, find those that have the model *loaded*

@@ -1210,7 +1446,7 @@ async def choose_endpoint(model: str) -> str:
     # Then by total endpoint usage (ascending) to balance idle endpoints
     endpoints_with_free_slot.sort(
         key=lambda ep: (
-            -usage_counts.get(ep, {}).get(model, 0),  # Primary: per-model usage (descending - prefer endpoints with connections)
+            #-usage_counts.get(ep, {}).get(model, 0), # Primary: per-model usage (descending - prefer endpoints with connections)
             sum(usage_counts.get(ep, {}).values())  # Secondary: total endpoint usage (ascending - prefer idle endpoints)
         )
     )

@@ -1266,9 +1502,8 @@ async def proxy(request: Request):


     endpoint = await choose_endpoint(model)
-    is_openai_endpoint = "/v1" in endpoint
-    if is_openai_endpoint:
+    use_openai = is_openai_compatible(endpoint)
+    if use_openai:
         if ":latest" in model:
             model = model.split(":latest")
             model = model[0]

@@ -1289,7 +1524,7 @@ async def proxy(request: Request):
             "suffix": suffix,
         }
         params.update({k: v for k, v in optional_params.items() if v is not None})
-        oclient = openai.AsyncOpenAI(base_url=endpoint, default_headers=default_headers, api_key=config.api_keys[endpoint])
+        oclient = openai.AsyncOpenAI(base_url=ep2base(endpoint), default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
     else:
         client = ollama.AsyncClient(host=endpoint)
     await increment_usage(endpoint, model)

@@ -1297,14 +1532,14 @@ async def proxy(request: Request):
     # 4. Async generator that streams data and decrements the counter
     async def stream_generate_response():
         try:
-            if is_openai_endpoint:
+            if use_openai:
                 start_ts = time.perf_counter()
                 async_gen = await oclient.completions.create(**params)
             else:
                 async_gen = await client.generate(model=model, prompt=prompt, suffix=suffix, system=system, template=template, context=context, stream=stream, think=think, raw=raw, format=_format, images=images, options=options, keep_alive=keep_alive)
             if stream == True:
                 async for chunk in async_gen:
-                    if is_openai_endpoint:
+                    if use_openai:
                         chunk = rechunk.openai_completion2ollama(chunk, stream, start_ts)
                     prompt_tok = chunk.prompt_eval_count or 0
                     comp_tok = chunk.eval_count or 0

@@ -1316,7 +1551,7 @@ async def proxy(request: Request):
                     json_line = orjson.dumps(chunk)
                     yield json_line.encode("utf-8") + b"\n"
             else:
-                if is_openai_endpoint:
+                if use_openai:
                     response = rechunk.openai_completion2ollama(async_gen, stream, start_ts)
                     response = response.model_dump_json()
                 else:

@@ -1386,14 +1621,14 @@ async def chat_proxy(request: Request):
     else:
         opt = False
     endpoint = await choose_endpoint(model)
-    is_openai_endpoint = "/v1" in endpoint
-    if is_openai_endpoint:
+    use_openai = is_openai_compatible(endpoint)
+    if use_openai:
         if ":latest" in model:
             model = model.split(":latest")
             model = model[0]
         if messages:
             messages = transform_images_to_data_urls(messages)
+            messages = transform_tool_calls_to_openai(messages)
         params = {
             "messages": messages,
             "model": model,

@@ -1412,7 +1647,7 @@ async def chat_proxy(request: Request):
             "response_format": {"type": "json_schema", "json_schema": _format} if _format is not None else None
         }
         params.update({k: v for k, v in optional_params.items() if v is not None})
-        oclient = openai.AsyncOpenAI(base_url=endpoint, default_headers=default_headers, api_key=config.api_keys[endpoint])
+        oclient = openai.AsyncOpenAI(base_url=ep2base(endpoint), default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
     else:
         client = ollama.AsyncClient(host=endpoint)
     await increment_usage(endpoint, model)

@@ -1420,7 +1655,7 @@ async def chat_proxy(request: Request):
     async def stream_chat_response():
         try:
             # The chat method returns a generator of dicts (or GenerateResponse)
-            if is_openai_endpoint:
+            if use_openai:
                 start_ts = time.perf_counter()
                 async_gen = await oclient.chat.completions.create(**params)
             else:

@@ -1430,9 +1665,14 @@ async def chat_proxy(request: Request):
             else:
                 async_gen = await client.chat(model=model, messages=messages, tools=tools, stream=stream, think=think, format=_format, options=options, keep_alive=keep_alive)
             if stream == True:
+                tc_acc = {}  # accumulate OpenAI tool-call deltas across chunks
                 async for chunk in async_gen:
-                    if is_openai_endpoint:
+                    if use_openai:
+                        _accumulate_openai_tc_delta(chunk, tc_acc)
                         chunk = rechunk.openai_chat_completion2ollama(chunk, stream, start_ts)
+                        # Inject fully-accumulated tool calls only into the final chunk
+                        if chunk.done and tc_acc and chunk.message:
+                            chunk.message.tool_calls = _build_ollama_tool_calls(tc_acc)
                     # `chunk` can be a dict or a pydantic model – dump to JSON safely
                     prompt_tok = chunk.prompt_eval_count or 0
                     comp_tok = chunk.eval_count or 0

@@ -1444,7 +1684,7 @@ async def chat_proxy(request: Request):
                     json_line = orjson.dumps(chunk)
                     yield json_line.encode("utf-8") + b"\n"
             else:
-                if is_openai_endpoint:
+                if use_openai:
                     response = rechunk.openai_chat_completion2ollama(async_gen, stream, start_ts)
                     response = response.model_dump_json()
                 else:

@@ -1503,13 +1743,12 @@ async def embedding_proxy(request: Request):

     # 2. Endpoint logic
     endpoint = await choose_endpoint(model)
-    is_openai_endpoint = "/v1" in endpoint
-    if is_openai_endpoint:
+    use_openai = is_openai_compatible(endpoint)
+    if use_openai:
         if ":latest" in model:
             model = model.split(":latest")
             model = model[0]
-        client = openai.AsyncOpenAI(base_url=endpoint, api_key=config.api_keys[endpoint])
+        client = openai.AsyncOpenAI(base_url=ep2base(endpoint), api_key=config.api_keys.get(endpoint, "no-key"))
     else:
         client = ollama.AsyncClient(host=endpoint)
     await increment_usage(endpoint, model)

@@ -1517,7 +1756,7 @@ async def embedding_proxy(request: Request):
     async def stream_embedding_response():
         try:
             # The chat method returns a generator of dicts (or GenerateResponse)
-            if is_openai_endpoint:
+            if use_openai:
                 async_gen = await client.embeddings.create(input=prompt, model=model)
                 async_gen = rechunk.openai_embeddings2ollama(async_gen)
             else:

@@ -1570,13 +1809,12 @@ async def embed_proxy(request: Request):

     # 2. Endpoint logic
     endpoint = await choose_endpoint(model)
-    is_openai_endpoint = is_ext_openai_endpoint(endpoint) #"/v1" in endpoint
-    if is_openai_endpoint:
+    use_openai = is_openai_compatible(endpoint)
+    if use_openai:
         if ":latest" in model:
             model = model.split(":latest")
             model = model[0]
-        client = openai.AsyncOpenAI(base_url=endpoint, api_key=config.api_keys[endpoint])
+        client = openai.AsyncOpenAI(base_url=ep2base(endpoint), api_key=config.api_keys.get(endpoint, "no-key"))
     else:
         client = ollama.AsyncClient(host=endpoint)
     await increment_usage(endpoint, model)

@@ -1584,7 +1822,7 @@ async def embed_proxy(request: Request):
     async def stream_embedding_response():
         try:
             # The chat method returns a generator of dicts (or GenerateResponse)
-            if is_openai_endpoint:
+            if use_openai:
                 async_gen = await client.embeddings.create(input=_input, model=model)
                 async_gen = rechunk.openai_embed2ollama(async_gen, model)
             else:

@@ -1983,6 +2221,9 @@ async def tags_proxy(request: Request):
     # 1. Query all endpoints for models
     tasks = [fetch.endpoint_details(ep, "/api/tags", "models") for ep in config.endpoints if "/v1" not in ep]
     tasks += [fetch.endpoint_details(ep, "/models", "data", config.api_keys[ep]) for ep in config.endpoints if "/v1" in ep]
+    # Also query llama-server endpoints not already covered by config.endpoints
+    llama_eps_for_tags = [ep for ep in config.llama_server_endpoints if ep not in config.endpoints]
+    tasks += [fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep)) for ep in llama_eps_for_tags]
     all_models = await asyncio.gather(*tasks)

     models = {'models': []}

@@ -2010,18 +2251,47 @@ async def tags_proxy(request: Request):
 @app.get("/api/ps")
 async def ps_proxy(request: Request):
     """
-    Proxy a ps request to all Ollama endpoints and reply a unique list of all running models.
+    Proxy a ps request to all Ollama and llama-server endpoints and reply a unique list of all running models.
+
+    For Ollama endpoints: queries /api/ps
+    For llama-server endpoints: queries /v1/models with status.value == "loaded"
     """
-    # 1. Query all endpoints for running models
-    tasks = [fetch.endpoint_details(ep, "/api/ps", "models") for ep in config.endpoints if "/v1" not in ep]
-    loaded_models = await asyncio.gather(*tasks)
+    # 1. Query Ollama endpoints for running models via /api/ps
+    ollama_tasks = [fetch.endpoint_details(ep, "/api/ps", "models") for ep in config.endpoints if "/v1" not in ep]
+    # 2. Query llama-server endpoints for loaded models via /v1/models
+    # Also query endpoints from llama_server_endpoints that may not be in config.endpoints
+    all_llama_endpoints = set(config.llama_server_endpoints) | set(ep for ep in config.endpoints if ep in config.llama_server_endpoints)
+    llama_tasks = [
+        fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep))
+        for ep in all_llama_endpoints
+    ]
+
+    ollama_loaded = await asyncio.gather(*ollama_tasks) if ollama_tasks else []
+    llama_loaded = await asyncio.gather(*llama_tasks) if llama_tasks else []

     models = {'models': []}
-    for modellist in loaded_models:
+    # Add Ollama models (if any)
+    if ollama_loaded:
+        for modellist in ollama_loaded:
         models['models'] += modellist
+    # Add llama-server models (filter for loaded only, if any)
+    if llama_loaded:
+        for modellist in llama_loaded:
+            loaded_models = [item for item in modellist if _is_llama_model_loaded(item)]
+            # Convert llama-server format to Ollama-like format for consistency
+            for item in loaded_models:
+                raw_id = item.get("id", "")
+                normalized = _normalize_llama_model_name(raw_id)
+                quant = _extract_llama_quant(raw_id)
+                models['models'].append({
+                    "name": normalized,
+                    "id": normalized,
+                    "digest": "",
+                    "status": item.get("status"),
+                    "details": {"quantization_level": quant} if quant else {}
+                })

-    # 2. Return a JSONResponse with deduplicated currently deployed models
+    # 3. Return a JSONResponse with deduplicated currently deployed models
     return JSONResponse(
         content={"models": dedupe_on_keys(models['models'], ['digest'])},
         status_code=200,
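For reference, this is roughly the mapping the loop above performs. The /v1/models entry below is a hypothetical llama-server response item whose shape is inferred from the helpers in this PR, not captured output:

# Hypothetical llama-server /v1/models entry
item = {
    "id": "unsloth/gpt-oss-20b-GGUF:Q8_0",
    "object": "model",
    "status": {"value": "loaded"},
}

# Entry that /api/ps would append to models['models'] for it
converted = {
    "name": "gpt-oss-20b-GGUF",
    "id": "gpt-oss-20b-GGUF",
    "digest": "",
    "status": {"value": "loaded"},
    "details": {"quantization_level": "Q8_0"},
}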
@ -2033,20 +2303,64 @@ async def ps_proxy(request: Request):
|
||||||
@app.get("/api/ps_details")
|
@app.get("/api/ps_details")
|
||||||
async def ps_details_proxy(request: Request):
|
async def ps_details_proxy(request: Request):
|
||||||
"""
|
"""
|
||||||
Proxy a ps request to all Ollama endpoints and reply with per-endpoint instances.
|
Proxy a ps request to all Ollama and llama-server endpoints and reply with per-endpoint instances.
|
||||||
This keeps /api/ps backward compatible while providing richer data.
|
This keeps /api/ps backward compatible while providing richer data.
|
||||||
|
|
||||||
|
For Ollama endpoints: queries /api/ps
|
||||||
|
For llama-server endpoints: queries /v1/models with status info
|
||||||
"""
|
"""
|
||||||
tasks = [(ep, fetch.endpoint_details(ep, "/api/ps", "models")) for ep in config.endpoints if "/v1" not in ep]
|
# 1. Query Ollama endpoints via /api/ps
|
||||||
loaded_models = await asyncio.gather(*[task for _, task in tasks])
|
ollama_tasks = [(ep, fetch.endpoint_details(ep, "/api/ps", "models")) for ep in config.endpoints if "/v1" not in ep]
|
||||||
|
# 2. Query llama-server endpoints via /v1/models
|
||||||
|
# Also query endpoints from llama_server_endpoints that may not be in config.endpoints
|
||||||
|
all_llama_endpoints = set(config.llama_server_endpoints) | set(ep for ep in config.endpoints if ep in config.llama_server_endpoints)
|
||||||
|
llama_tasks = [
|
||||||
|
(ep, fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep)))
|
||||||
|
for ep in all_llama_endpoints
|
||||||
|
]
|
||||||
|
|
||||||
|
ollama_loaded = await asyncio.gather(*[task for _, task in ollama_tasks]) if ollama_tasks else []
|
||||||
|
llama_loaded = await asyncio.gather(*[task for _, task in llama_tasks]) if llama_tasks else []
|
||||||
|
|
||||||
models: list[dict] = []
|
models: list[dict] = []
|
||||||
for (endpoint, modellist) in zip([ep for ep, _ in tasks], loaded_models):
|
|
||||||
|
# Add Ollama models with endpoint info (if any)
|
||||||
|
if ollama_loaded:
|
||||||
|
for (endpoint, modellist) in zip([ep for ep, _ in ollama_tasks], ollama_loaded):
|
||||||
for model in modellist:
|
for model in modellist:
|
||||||
if isinstance(model, dict):
|
if isinstance(model, dict):
|
||||||
model_with_endpoint = dict(model)
|
model_with_endpoint = dict(model)
|
||||||
model_with_endpoint["endpoint"] = endpoint
|
model_with_endpoint["endpoint"] = endpoint
|
||||||
models.append(model_with_endpoint)
|
models.append(model_with_endpoint)
|
||||||
|
|
||||||
|
# Add llama-server models with endpoint info and full status metadata (if any)
|
||||||
|
if llama_loaded:
|
||||||
|
for (endpoint, modellist) in zip([ep for ep, _ in llama_tasks], llama_loaded):
|
||||||
|
# Filter for loaded models only
|
||||||
|
loaded_models = [item for item in modellist if _is_llama_model_loaded(item)]
|
||||||
|
for item in loaded_models:
|
||||||
|
if isinstance(item, dict) and item.get("id"):
|
||||||
|
raw_id = item["id"]
|
||||||
|
normalized = _normalize_llama_model_name(raw_id)
|
||||||
|
quant = _extract_llama_quant(raw_id)
|
||||||
|
model_with_endpoint = {
|
||||||
|
"name": normalized,
|
||||||
|
"id": normalized,
|
||||||
|
"original_name": raw_id,
|
||||||
|
"digest": "",
|
||||||
|
"details": {"quantization_level": quant} if quant else {},
|
||||||
|
"endpoint": endpoint,
|
||||||
|
"status": item.get("status"),
|
||||||
|
"created": item.get("created"),
|
||||||
|
"owned_by": item.get("owned_by")
|
||||||
|
}
|
||||||
|
# Include full llama-server status details (args, preset)
|
||||||
|
status_info = item.get("status", {})
|
||||||
|
if isinstance(status_info, dict):
|
||||||
|
model_with_endpoint["llama_status_args"] = status_info.get("args")
|
||||||
|
model_with_endpoint["llama_status_preset"] = status_info.get("preset")
|
||||||
|
models.append(model_with_endpoint)
|
||||||
|
|
||||||
return JSONResponse(content={"models": models}, status_code=200)
|
return JSONResponse(content={"models": models}, status_code=200)
|
||||||
|
|
||||||
# -------------------------------------------------------------
|
# -------------------------------------------------------------
|
||||||
|
|
@@ -2068,7 +2382,7 @@ async def usage_proxy(request: Request):
 async def config_proxy(request: Request):
     """
     Return a simple JSON object that contains the configured
-    Ollama endpoints. The front‑end uses this to display
+    Ollama endpoints and llama_server_endpoints. The front‑end uses this to display
     which endpoints are being proxied.
     """
     async def check_endpoint(url: str):
@@ -2092,9 +2406,17 @@ async def config_proxy(request: Request):
             detail = _format_connection_issue(target_url, e)
             return {"url": url, "status": "error", "detail": detail}
 
-    results = await asyncio.gather(*[check_endpoint(ep) for ep in config.endpoints])
+    # Check Ollama endpoints
+    ollama_results = await asyncio.gather(*[check_endpoint(ep) for ep in config.endpoints])
+
+    # Check llama-server endpoints
+    llama_results = []
+    if config.llama_server_endpoints:
+        llama_results = await asyncio.gather(*[check_endpoint(ep) for ep in config.llama_server_endpoints])
+
     return {
-        "endpoints": results,
+        "endpoints": ollama_results,
+        "llama_server_endpoints": llama_results,
         "require_router_api_key": bool(config.router_api_key),
     }
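For orientation, the payload this handler now returns to the dashboard looks roughly like the sketch below. Only the error shape and the top-level keys are visible in this hunk; the success-case fields (e.g. version) are assumed from how the front-end reads them, and all values are illustrative.

# Illustrative config response shape (values are examples, not real output)
{
    "endpoints": [
        {"url": "http://localhost:11434", "status": "ok", "version": "0.6.5"},
        {"url": "https://api.openai.com/v1", "status": "error", "detail": "connection refused"},
    ],
    "llama_server_endpoints": [
        {"url": "http://localhost:8080/v1", "status": "ok", "version": "N/A"},
    ],
    "require_router_api_key": False,
}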
@@ -2129,8 +2451,8 @@ async def openai_embedding_proxy(request: Request):
     # 2. Endpoint logic
     endpoint = await choose_endpoint(model)
     await increment_usage(endpoint, model)
-    if "/v1" in endpoint: # and is_ext_openai_endpoint(endpoint):
-        api_key = config.api_keys[endpoint]
+    if is_openai_compatible(endpoint):
+        api_key = config.api_keys.get(endpoint, "no-key")
     else:
         api_key = "ollama"
     base_url = ep2base(endpoint)
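is_openai_compatible() replaces the inline "/v1" substring check, but its definition is not part of this hunk. The following is only a guess at its behaviour, assuming it treats any /v1-style base URL (including the configured llama-server endpoints) as OpenAI-compatible:

def is_openai_compatible(endpoint: str) -> bool:
    # Assumption: OpenAI-compatible endpoints expose a /v1 base path,
    # which also covers the llama_server_endpoints entries.
    return "/v1" in endpoint or endpoint in config.llama_server_endpoints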
@@ -2215,8 +2537,7 @@ async def openai_chat_completions_proxy(request: Request):
     endpoint = await choose_endpoint(model)
     await increment_usage(endpoint, model)
     base_url = ep2base(endpoint)
-
-    oclient = openai.AsyncOpenAI(base_url=base_url, default_headers=default_headers, api_key=config.api_keys[endpoint])
+    oclient = openai.AsyncOpenAI(base_url=base_url, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
 
     # 3. Async generator that streams completions data and decrements the counter
     async def stream_ochat_response():
         try:
@@ -2341,8 +2662,7 @@ async def openai_completions_proxy(request: Request):
     endpoint = await choose_endpoint(model)
     await increment_usage(endpoint, model)
     base_url = ep2base(endpoint)
-
-    oclient = openai.AsyncOpenAI(base_url=base_url, default_headers=default_headers, api_key=config.api_keys[endpoint])
+    oclient = openai.AsyncOpenAI(base_url=base_url, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
 
     # 3. Async generator that streams completions data and decrements the counter
     async def stream_ocompletions_response(model=model):
@@ -2398,22 +2718,46 @@ async def openai_completions_proxy(request: Request):
 @app.get("/v1/models")
 async def openai_models_proxy(request: Request):
     """
-    Proxy an OpenAI API models request to Ollama endpoints and reply with a unique list of all models.
+    Proxy an OpenAI API models request to Ollama and llama-server endpoints and reply with a unique list of models.
+
+    For Ollama endpoints: queries /api/tags (all models)
+    For llama-server endpoints: queries /v1/models and filters for status.value == "loaded"
     """
-    # 1. Query all endpoints for models
-    tasks = [fetch.endpoint_details(ep, "/api/tags", "models") for ep in config.endpoints if "/v1" not in ep]
-    tasks += [fetch.endpoint_details(ep, "/models", "data", config.api_keys[ep]) for ep in config.endpoints if "/v1" in ep]
-    all_models = await asyncio.gather(*tasks)
+    # 1. Query Ollama endpoints for all models via /api/tags
+    ollama_tasks = [fetch.endpoint_details(ep, "/api/tags", "models") for ep in config.endpoints if "/v1" not in ep]
+    # 2. Query llama-server endpoints for loaded models via /v1/models
+    # Also query endpoints from llama_server_endpoints that may not be in config.endpoints
+    all_llama_endpoints = set(config.llama_server_endpoints) | set(ep for ep in config.endpoints if ep in config.llama_server_endpoints)
+    llama_tasks = [
+        fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep))
+        for ep in all_llama_endpoints
+    ]
+
+    ollama_models = await asyncio.gather(*ollama_tasks) if ollama_tasks else []
+    llama_models = await asyncio.gather(*llama_tasks) if llama_tasks else []
+
     models = {'data': []}
-    for modellist in all_models:
-        for model in modellist:
-            if not "id" in model.keys(): # Relable Ollama models with OpenAI Model.id from Model.name
-                model['id'] = model['name']
-            else:
-                model['name'] = model['id']
-        models['data'] += modellist
+
+    # Add Ollama models (if any)
+    if ollama_models:
+        for modellist in ollama_models:
+            for model in modellist:
+                if not "id" in model.keys(): # Relable Ollama models with OpenAI Model.id from Model.name
+                    model['id'] = model.get('name', model.get('id', ''))
+                else:
+                    model['name'] = model['id']
+                models['data'].append(model)
+
+    # Add llama-server models (filter for loaded only, if any)
+    if llama_models:
+        for modellist in llama_models:
+            loaded_models = [item for item in modellist if _is_llama_model_loaded(item)]
+            for model in loaded_models:
+                if not "id" in model.keys():
+                    model['id'] = model.get('name', model.get('id', ''))
+                else:
+                    model['name'] = model['id']
+                models['data'].append(model)
 
     # 2. Return a JSONResponse with a deduplicated list of unique models for inference
     return JSONResponse(
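To make the "loaded only" filter concrete, here is a hypothetical llama-server /v1/models entry and how it passes the filter. The exact field layout is an assumption based on the status.value == "loaded" check named in the docstring above, not taken from the diff itself.

# Hypothetical llama-server /v1/models entry (field layout assumed)
sample = {
    "id": "Qwen2.5-7B-Instruct-Q4_K_M.gguf",
    "object": "model",
    "created": 1730000000,
    "owned_by": "llamacpp",
    "status": {"value": "loaded", "args": ["--ctx-size", "8192"], "preset": "default"},
}

# Entries whose status.value is not "loaded" are dropped before they reach models['data']
loaded = [m for m in [sample] if _is_llama_model_loaded(m)]
assert loaded and loaded[0]["id"] == "Qwen2.5-7B-Instruct-Q4_K_M.gguf"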
@@ -2459,7 +2803,7 @@ async def health_proxy(request: Request):
     * The HTTP status code is 200 when everything is healthy, 503 otherwise.
     """
     # Run all health checks in parallel
-    tasks = [fetch.endpoint_details(ep, "/api/version", "version") for ep in config.endpoints] # if not is_ext_openai_endpoint(ep)]
+    tasks = [fetch.endpoint_details(ep, "/api/version", "version", skip_error_cache=True) for ep in config.endpoints] # if not is_ext_openai_endpoint(ep)]
 
     results = await asyncio.gather(*tasks, return_exceptions=True)
@@ -2524,6 +2868,7 @@ async def startup_event() -> None:
         print(
             f"Loaded configuration from {config_path}:\n"
             f" endpoints={config.endpoints},\n"
+            f" llama_server_endpoints={config.llama_server_endpoints},\n"
             f" max_concurrent_connections={config.max_concurrent_connections}"
         )
     else:
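The startup log above reads endpoints, llama_server_endpoints and max_concurrent_connections from the parsed configuration. A minimal sketch of the config object those attribute accesses imply; the field names follow the usage in this diff, but the real model definition lives elsewhere in router.py and may differ.

from dataclasses import dataclass, field

@dataclass
class RouterConfig:
    # Fields inferred from attribute usage in this diff; defaults are illustrative
    endpoints: list[str] = field(default_factory=list)
    llama_server_endpoints: list[str] = field(default_factory=list)
    api_keys: dict[str, str] = field(default_factory=dict)
    max_concurrent_connections: int = 2
    router_api_key: str = ""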
@@ -379,7 +379,7 @@
             <th>Quant</th>
             <th>Ctx</th>
             <th>Size</th>
-            <th>Until</th>
+            <th>Unload</th>
             <th>Digest</th>
             <th>Tokens</th>
           </tr>
@@ -683,7 +683,12 @@ function renderTimeSeriesChart(timeSeriesData, chart, minutes) {
         showApiKeyModal("Enter the NOMYO Router API key to load the dashboard.");
       }
       const body = document.getElementById("endpoints-body");
-      body.innerHTML = data.endpoints
+
+      // Build HTML for both endpoints and llama_server_endpoints
+      let html = "";
+
+      // Add Ollama endpoints
+      html += data.endpoints
         .map((e) => {
           const statusClass =
             e.status === "ok"
@@ -698,6 +703,27 @@ function renderTimeSeriesChart(timeSeriesData, chart, minutes) {
             </tr>`;
         })
         .join("");
+
+      // Add llama-server endpoints
+      if (data.llama_server_endpoints && data.llama_server_endpoints.length > 0) {
+        html += data.llama_server_endpoints
+          .map((e) => {
+            const statusClass =
+              e.status === "ok"
+                ? "status-ok"
+                : "status-error";
+            const version = e.version || "N/A";
+            return `
+              <tr>
+                <td class="endpoint">${e.url}</td>
+                <td class="status ${statusClass}">${e.status}</td>
+                <td class="version">${version}</td>
+              </tr>`;
+          })
+          .join("");
+      }
+
+      body.innerHTML = html;
     } catch (e) {
       console.error(e);
       const body = document.getElementById("endpoints-body");
@@ -837,7 +863,7 @@ function renderTimeSeriesChart(timeSeriesData, chart, minutes) {
 
       const formatUntil = (value) => {
         if (value === null || value === undefined || value === "") {
-          return "Forever";
+          return "∞";
         }
 
         let targetTime;
@@ -898,10 +924,11 @@ function renderTimeSeriesChart(timeSeriesData, chart, minutes) {
       const params = modelInstances[0]?.details?.parameter_size ?? "";
       const quant = modelInstances[0]?.details?.quantization_level ?? "";
       const ctx = modelInstances[0]?.context_length ?? "";
+      const originalName = modelInstances[0]?.original_name || modelName;
       const uniqueEndpoints = Array.from(new Set(endpoints));
       const endpointsData = encodeURIComponent(JSON.stringify(uniqueEndpoints));
       return `<tr data-model="${modelName}" data-endpoints="${endpointsData}">
-        <td class="model">${modelName} <a href="#" class="stats-link" data-model="${modelName}">stats</a></td>
+        <td class="model">${modelName} <a href="#" class="stats-link" data-model="${originalName}">stats</a></td>
         <td>${renderInstanceList(endpoints)}</td>
         <td>${params}</td>
         <td>${quant}</td>