Compare commits
1 commit
main
...
artifacts-
| Author | SHA1 | Date | |
|---|---|---|---|
| caf708d7a2 |
11 changed files with 125 additions and 272 deletions
|
|
@ -86,7 +86,7 @@ jobs:
|
||||||
provenance: false
|
provenance: false
|
||||||
build-args: |
|
build-args: |
|
||||||
SEMANTIC_CACHE=true
|
SEMANTIC_CACHE=true
|
||||||
tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-${{ matrix.arch }}-${{ github.run_id }}
|
tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-${{ matrix.arch }}
|
||||||
|
|
||||||
merge:
|
merge:
|
||||||
runs-on: docker-amd64
|
runs-on: docker-amd64
|
||||||
|
|
@ -142,6 +142,6 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
docker buildx imagetools create \
|
docker buildx imagetools create \
|
||||||
$(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
|
$(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
|
||||||
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-amd64-${{ github.run_id }} \
|
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-amd64 \
|
||||||
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-arm64-${{ github.run_id }}
|
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-arm64
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -77,7 +77,7 @@ jobs:
|
||||||
platforms: ${{ matrix.platform }}
|
platforms: ${{ matrix.platform }}
|
||||||
push: true
|
push: true
|
||||||
provenance: false
|
provenance: false
|
||||||
tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-${{ matrix.arch }}-${{ github.run_id }}
|
tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-${{ matrix.arch }}
|
||||||
|
|
||||||
merge:
|
merge:
|
||||||
runs-on: docker-amd64
|
runs-on: docker-amd64
|
||||||
|
|
@ -133,6 +133,6 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
docker buildx imagetools create \
|
docker buildx imagetools create \
|
||||||
$(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
|
$(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
|
||||||
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-amd64-${{ github.run_id }} \
|
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-amd64 \
|
||||||
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-arm64-${{ github.run_id }}
|
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-arm64
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -11,7 +11,7 @@ jobs:
|
||||||
opencode:
|
opencode:
|
||||||
if: |
|
if: |
|
||||||
contains(github.event.comment.body, '/oc') ||
|
contains(github.event.comment.body, '/oc') ||
|
||||||
contains(github.event.review.body, '/oc')
|
contains(github.event.comment.body, '/opencode')
|
||||||
runs-on: docker-amd64
|
runs-on: docker-amd64
|
||||||
container:
|
container:
|
||||||
image: node:lts-bookworm
|
image: node:lts-bookworm
|
||||||
|
|
@ -54,7 +54,9 @@ jobs:
|
||||||
uses: ./.opencode-action
|
uses: ./.opencode-action
|
||||||
with:
|
with:
|
||||||
nomyo_api_key: ${{ secrets.NOMYO_API_KEY }}
|
nomyo_api_key: ${{ secrets.NOMYO_API_KEY }}
|
||||||
model: nomyo/unsloth/Qwen3.6-35B-A3B-MTP-GGUF:Q4_K_XL
|
model: nomyo/unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q4_K_M
|
||||||
forgejo_api_url: https://bitfreedom.net/code/
|
forgejo_api_url: https://bitfreedom.net/code/
|
||||||
forgejo_token: ${{ secrets.FORGEJO_TOKEN }}
|
forgejo_token: ${{ secrets.FORGEJO_TOKEN }}
|
||||||
forgejo_push_token: ${{ secrets.FORGEJO_PUSH_TOKEN }}
|
forgejo_push_token: ${{ secrets.FORGEJO_PUSH_TOKEN }}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
20
.forgejo/workflows/test-action.yml
Normal file
20
.forgejo/workflows/test-action.yml
Normal file
|
|
@ -0,0 +1,20 @@
|
||||||
|
name: artifact-probe
|
||||||
|
on: workflow_dispatch
|
||||||
|
jobs:
|
||||||
|
up:
|
||||||
|
runs-on: docker-amd64
|
||||||
|
container:
|
||||||
|
image: node:lts-bookworm
|
||||||
|
steps:
|
||||||
|
- run: echo "hello" > /tmp/probe.txt
|
||||||
|
- uses: https://github.com/actions/upload-artifact@v4
|
||||||
|
with: { name: probe, path: /tmp/probe.txt }
|
||||||
|
down:
|
||||||
|
needs: up
|
||||||
|
runs-on: docker-amd64
|
||||||
|
container:
|
||||||
|
image: node:lts-bookworm
|
||||||
|
steps:
|
||||||
|
- uses: https://github.com/actions/download-artifact@v4
|
||||||
|
with: { name: probe, path: /tmp/got }
|
||||||
|
- run: cat /tmp/got/probe.txt
|
||||||
|
|
@ -206,8 +206,6 @@ The `/health` endpoint provides comprehensive health status:
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
For Ollama endpoints the probe is a parallel check of `/api/version` (liveness) and `/api/ps` (the route used by `choose_endpoint` when selecting a backend for a request). Reporting `ok` only when both succeed prevents the router from advertising an endpoint as healthy while completion calls dead-end on `/api/ps`. The same dual probe backs `/api/config`, which the dashboard uses to render endpoint health.
|
|
||||||
|
|
||||||
## Database Schema
|
## Database Schema
|
||||||
|
|
||||||
The router uses SQLite for persistent storage:
|
The router uses SQLite for persistent storage:
|
||||||
|
|
|
||||||
|
|
@ -29,10 +29,6 @@ Response:
|
||||||
- `200`: All endpoints healthy
|
- `200`: All endpoints healthy
|
||||||
- `503`: One or more endpoints unhealthy
|
- `503`: One or more endpoints unhealthy
|
||||||
|
|
||||||
**Probe scope per endpoint**:
|
|
||||||
- **Ollama endpoints** are probed at both `/api/version` (liveness) and `/api/ps` (model-introspection used by the router). If either fails the endpoint is reported as `error`; the response still includes `version` when the daemon is reachable so operators can tell a partial failure from a full outage. The `detail` field names the failing probe, e.g. `"/api/ps: 502 …"`.
|
|
||||||
- **OpenAI-compatible / llama-server endpoints** are probed at `/models`.
|
|
||||||
|
|
||||||
### Current Usage
|
### Current Usage
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|
@ -137,8 +133,6 @@ Response:
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
Uses the same dual-probe logic as `/health` (Ollama: `/api/version` + `/api/ps`; OpenAI-compatible: `/models`). An endpoint will report `error` whenever either probe fails. The dashboard renders the `detail` field as a tooltip on the status cell.
|
|
||||||
|
|
||||||
### Cache Statistics
|
### Cache Statistics
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|
|
||||||
|
|
@ -5,18 +5,18 @@ annotated-types==0.7.0
|
||||||
anyio==4.13.0
|
anyio==4.13.0
|
||||||
async-timeout==5.0.1
|
async-timeout==5.0.1
|
||||||
attrs==26.1.0
|
attrs==26.1.0
|
||||||
certifi==2026.5.20
|
certifi==2026.4.22
|
||||||
click==8.4.0
|
click==8.4.0
|
||||||
distro==1.9.0
|
distro==1.9.0
|
||||||
exceptiongroup==1.3.1
|
exceptiongroup==1.3.1
|
||||||
fastapi==0.136.3
|
fastapi==0.136.1
|
||||||
fastapi-sse==1.1.1
|
fastapi-sse==1.1.1
|
||||||
frozenlist==1.8.0
|
frozenlist==1.8.0
|
||||||
h11==0.16.0
|
h11==0.16.0
|
||||||
httpcore==1.0.9
|
httpcore==1.0.9
|
||||||
httpx==0.28.1
|
httpx==0.28.1
|
||||||
idna==3.15
|
idna==3.15
|
||||||
jiter==0.15.0
|
jiter==0.14.0
|
||||||
multidict==6.7.1
|
multidict==6.7.1
|
||||||
ollama==0.6.2
|
ollama==0.6.2
|
||||||
openai==2.37.0
|
openai==2.37.0
|
||||||
|
|
@ -30,10 +30,10 @@ pydantic_core==2.46.4
|
||||||
python-dotenv==1.2.2
|
python-dotenv==1.2.2
|
||||||
PyYAML==6.0.3
|
PyYAML==6.0.3
|
||||||
sniffio==1.3.1
|
sniffio==1.3.1
|
||||||
starlette>=1.0.1
|
starlette==0.52.1
|
||||||
truststore==0.10.4
|
truststore==0.10.4
|
||||||
tiktoken==0.13.0
|
tiktoken==0.13.0
|
||||||
tqdm==4.68.2
|
tqdm==4.67.3
|
||||||
typing-inspection==0.4.2
|
typing-inspection==0.4.2
|
||||||
typing_extensions==4.15.0
|
typing_extensions==4.15.0
|
||||||
uvicorn==0.47.0
|
uvicorn==0.47.0
|
||||||
|
|
|
||||||
195
router.py
195
router.py
|
|
@ -1000,7 +1000,7 @@ class fetch:
|
||||||
async with client.get(f"{endpoint}/models") as resp:
|
async with client.get(f"{endpoint}/models") as resp:
|
||||||
await _ensure_success(resp)
|
await _ensure_success(resp)
|
||||||
data = await resp.json()
|
data = await resp.json()
|
||||||
|
|
||||||
# Filter for loaded models only
|
# Filter for loaded models only
|
||||||
items = data.get("data", [])
|
items = data.get("data", [])
|
||||||
models = {
|
models = {
|
||||||
|
|
@ -1012,19 +1012,11 @@ class fetch:
|
||||||
# Update cache with lock protection
|
# Update cache with lock protection
|
||||||
async with _loaded_models_cache_lock:
|
async with _loaded_models_cache_lock:
|
||||||
_loaded_models_cache[endpoint] = (models, time.time())
|
_loaded_models_cache[endpoint] = (models, time.time())
|
||||||
# Probe succeeded — clear any stale error so the endpoint
|
|
||||||
# becomes routable again.
|
|
||||||
async with _loaded_error_cache_lock:
|
|
||||||
_loaded_error_cache.pop(endpoint, None)
|
|
||||||
return models
|
return models
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# If anything goes wrong we simply assume the endpoint has no models
|
# If anything goes wrong we simply assume the endpoint has no models
|
||||||
message = _format_connection_issue(f"{endpoint}/models", e)
|
message = _format_connection_issue(f"{endpoint}/models", e)
|
||||||
print(f"[fetch.loaded_models] {message}")
|
print(f"[fetch.loaded_models] {message}")
|
||||||
# Record the failure so `choose_endpoint` can avoid routing
|
|
||||||
# to an unhealthy backend and repeated probes short-circuit.
|
|
||||||
async with _loaded_error_cache_lock:
|
|
||||||
_loaded_error_cache[endpoint] = time.time()
|
|
||||||
return set()
|
return set()
|
||||||
else:
|
else:
|
||||||
# Original Ollama /api/ps logic
|
# Original Ollama /api/ps logic
|
||||||
|
|
@ -1039,15 +1031,11 @@ class fetch:
|
||||||
# Update cache with lock protection
|
# Update cache with lock protection
|
||||||
async with _loaded_models_cache_lock:
|
async with _loaded_models_cache_lock:
|
||||||
_loaded_models_cache[endpoint] = (models, time.time())
|
_loaded_models_cache[endpoint] = (models, time.time())
|
||||||
async with _loaded_error_cache_lock:
|
|
||||||
_loaded_error_cache.pop(endpoint, None)
|
|
||||||
return models
|
return models
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# If anything goes wrong we simply assume the endpoint has no models
|
# If anything goes wrong we simply assume the endpoint has no models
|
||||||
message = _format_connection_issue(f"{endpoint}/api/ps", e)
|
message = _format_connection_issue(f"{endpoint}/api/ps", e)
|
||||||
print(f"[fetch.loaded_models] {message}")
|
print(f"[fetch.loaded_models] {message}")
|
||||||
async with _loaded_error_cache_lock:
|
|
||||||
_loaded_error_cache[endpoint] = time.time()
|
|
||||||
return set()
|
return set()
|
||||||
|
|
||||||
async def _refresh_loaded_models(endpoint: str) -> None:
|
async def _refresh_loaded_models(endpoint: str) -> None:
|
||||||
|
|
@ -1865,28 +1853,6 @@ async def choose_endpoint(model: str, reserve: bool = True,
|
||||||
load_tasks = [fetch.loaded_models(ep) for ep in candidate_endpoints]
|
load_tasks = [fetch.loaded_models(ep) for ep in candidate_endpoints]
|
||||||
loaded_sets = await asyncio.gather(*load_tasks)
|
loaded_sets = await asyncio.gather(*load_tasks)
|
||||||
|
|
||||||
# 3️⃣.5 Exclude endpoints whose loaded-model probe has been failing
|
|
||||||
# recently. Without this filter, an endpoint where `/api/ps` returns 5xx
|
|
||||||
# would appear with an empty loaded set but pass through to the
|
|
||||||
# free-slot fallback (step 4) — sending completion calls to an
|
|
||||||
# unhealthy backend. See issue #83.
|
|
||||||
async with _loaded_error_cache_lock:
|
|
||||||
unhealthy = {
|
|
||||||
ep for ep, ts in _loaded_error_cache.items()
|
|
||||||
if _is_fresh(ts, 300)
|
|
||||||
}
|
|
||||||
if unhealthy:
|
|
||||||
filtered = [
|
|
||||||
(ep, models) for ep, models in zip(candidate_endpoints, loaded_sets)
|
|
||||||
if ep not in unhealthy
|
|
||||||
]
|
|
||||||
if filtered:
|
|
||||||
candidate_endpoints = [ep for ep, _ in filtered]
|
|
||||||
loaded_sets = [models for _, models in filtered]
|
|
||||||
# If *every* candidate is unhealthy we still fall through with the
|
|
||||||
# original list — refusing to route is worse than retrying a
|
|
||||||
# possibly-recovered backend.
|
|
||||||
|
|
||||||
# Look up a possible affinity hint *before* taking usage_lock. The two
|
# Look up a possible affinity hint *before* taking usage_lock. The two
|
||||||
# locks are never held together to avoid lock-ordering issues.
|
# locks are never held together to avoid lock-ordering issues.
|
||||||
affine_ep: Optional[str] = None
|
affine_ep: Optional[str] = None
|
||||||
|
|
@ -3188,103 +3154,44 @@ async def usage_proxy(request: Request):
|
||||||
"token_usage_counts": token_usage_counts}
|
"token_usage_counts": token_usage_counts}
|
||||||
|
|
||||||
# -------------------------------------------------------------
|
# -------------------------------------------------------------
|
||||||
# 20. Endpoint health probes (shared by /api/config and /health)
|
# 20. Proxy config route – for monitoring and frontent usage
|
||||||
# -------------------------------------------------------------
|
|
||||||
async def _raw_probe(
|
|
||||||
ep: str,
|
|
||||||
route: str,
|
|
||||||
api_key: Optional[str] = None,
|
|
||||||
timeout: Optional[float] = None,
|
|
||||||
) -> tuple[bool, object]:
|
|
||||||
"""Direct HTTP probe that distinguishes success from failure
|
|
||||||
(unlike `fetch.endpoint_details`, which returns [] on either).
|
|
||||||
Returns `(ok, payload_or_error_message)`.
|
|
||||||
"""
|
|
||||||
headers = {"Referer": default_headers.get("HTTP-Referer", "https://nomyo.ai")}
|
|
||||||
if api_key is not None:
|
|
||||||
headers["Authorization"] = "Bearer " + api_key
|
|
||||||
url = f"{ep.rstrip('/')}/{route.lstrip('/')}"
|
|
||||||
req_kwargs = {}
|
|
||||||
if timeout is not None:
|
|
||||||
req_kwargs["timeout"] = aiohttp.ClientTimeout(total=timeout)
|
|
||||||
try:
|
|
||||||
client: aiohttp.ClientSession = get_session(ep)
|
|
||||||
async with client.get(url, headers=headers, **req_kwargs) as resp:
|
|
||||||
await _ensure_success(resp)
|
|
||||||
data = await resp.json()
|
|
||||||
return True, data
|
|
||||||
except Exception as exc:
|
|
||||||
return False, _format_connection_issue(url, exc)
|
|
||||||
|
|
||||||
|
|
||||||
async def _endpoint_health(ep: str, *, timeout: Optional[float] = None) -> dict:
|
|
||||||
"""Probe an endpoint and return `{status, version?, detail?}`.
|
|
||||||
|
|
||||||
Ollama endpoints get a dual probe of `/api/version` and `/api/ps` so
|
|
||||||
that a daemon which is reachable but has a broken model-introspection
|
|
||||||
path (issue #83) is reported as `error` rather than `ok`.
|
|
||||||
OpenAI-compatible endpoints use a single `/models` probe.
|
|
||||||
"""
|
|
||||||
if is_openai_compatible(ep):
|
|
||||||
ok, payload = await _raw_probe(
|
|
||||||
ep, "/models", config.api_keys.get(ep), timeout=timeout,
|
|
||||||
)
|
|
||||||
if ok:
|
|
||||||
return {"status": "ok", "version": "latest"}
|
|
||||||
return {"status": "error", "detail": str(payload)}
|
|
||||||
|
|
||||||
(version_ok, version_payload), (ps_ok, ps_payload) = await asyncio.gather(
|
|
||||||
_raw_probe(ep, "/api/version", timeout=timeout),
|
|
||||||
_raw_probe(ep, "/api/ps", timeout=timeout),
|
|
||||||
)
|
|
||||||
|
|
||||||
version_value = (
|
|
||||||
version_payload.get("version")
|
|
||||||
if version_ok and isinstance(version_payload, dict)
|
|
||||||
else None
|
|
||||||
)
|
|
||||||
|
|
||||||
if version_ok and ps_ok:
|
|
||||||
return {"status": "ok", "version": version_value}
|
|
||||||
if not version_ok and not ps_ok:
|
|
||||||
return {"status": "error", "detail": str(version_payload)}
|
|
||||||
# Partial failure — daemon reachable but one probe failed. Report
|
|
||||||
# as "error" so callers can surface the issue; include `version` so
|
|
||||||
# the operator knows the daemon itself is alive.
|
|
||||||
if not ps_ok:
|
|
||||||
return {
|
|
||||||
"status": "error",
|
|
||||||
"version": version_value,
|
|
||||||
"detail": f"/api/ps: {ps_payload}",
|
|
||||||
}
|
|
||||||
return {
|
|
||||||
"status": "error",
|
|
||||||
"detail": f"/api/version: {version_payload}",
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
# -------------------------------------------------------------
|
|
||||||
# 20b. Proxy config route – for monitoring and frontend usage
|
|
||||||
# -------------------------------------------------------------
|
# -------------------------------------------------------------
|
||||||
@app.get("/api/config")
|
@app.get("/api/config")
|
||||||
async def config_proxy(request: Request):
|
async def config_proxy(request: Request):
|
||||||
"""
|
"""
|
||||||
Return a simple JSON object that contains the configured
|
Return a simple JSON object that contains the configured
|
||||||
Ollama endpoints and llama_server_endpoints. The front‑end uses this
|
Ollama endpoints and llama_server_endpoints. The front‑end uses this to display
|
||||||
to display which endpoints are being proxied and their health.
|
which endpoints are being proxied.
|
||||||
Status is "error" when either liveness (/api/version) or routing
|
|
||||||
health (/api/ps) fails — see issue #83.
|
|
||||||
"""
|
"""
|
||||||
async def check(url: str) -> dict:
|
async def check_endpoint(url: str):
|
||||||
return {"url": url, **(await _endpoint_health(url, timeout=5))}
|
client: aiohttp.ClientSession = get_session(url)
|
||||||
|
headers = None
|
||||||
|
if "/v1" in url:
|
||||||
|
headers = {"Authorization": "Bearer " + config.api_keys.get(url, "no-key")}
|
||||||
|
target_url = f"{url}/models"
|
||||||
|
else:
|
||||||
|
target_url = f"{url}/api/version"
|
||||||
|
|
||||||
ollama_results = await asyncio.gather(*[check(ep) for ep in config.endpoints])
|
try:
|
||||||
|
async with client.get(target_url, headers=headers, timeout=aiohttp.ClientTimeout(total=5)) as resp:
|
||||||
|
await _ensure_success(resp)
|
||||||
|
data = await resp.json()
|
||||||
|
if "/v1" in url:
|
||||||
|
return {"url": url, "status": "ok", "version": "latest"}
|
||||||
|
else:
|
||||||
|
return {"url": url, "status": "ok", "version": data.get("version")}
|
||||||
|
except Exception as e:
|
||||||
|
detail = _format_connection_issue(target_url, e)
|
||||||
|
return {"url": url, "status": "error", "detail": detail}
|
||||||
|
|
||||||
|
# Check Ollama endpoints
|
||||||
|
ollama_results = await asyncio.gather(*[check_endpoint(ep) for ep in config.endpoints])
|
||||||
|
|
||||||
|
# Check llama-server endpoints
|
||||||
llama_results = []
|
llama_results = []
|
||||||
if config.llama_server_endpoints:
|
if config.llama_server_endpoints:
|
||||||
llama_results = await asyncio.gather(
|
llama_results = await asyncio.gather(*[check_endpoint(ep) for ep in config.llama_server_endpoints])
|
||||||
*[check(ep) for ep in config.llama_server_endpoints]
|
|
||||||
)
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"endpoints": ollama_results,
|
"endpoints": ollama_results,
|
||||||
"llama_server_endpoints": llama_results,
|
"llama_server_endpoints": llama_results,
|
||||||
|
|
@ -4096,30 +4003,44 @@ async def health_proxy(request: Request):
|
||||||
"""
|
"""
|
||||||
Health‑check endpoint for monitoring the proxy.
|
Health‑check endpoint for monitoring the proxy.
|
||||||
|
|
||||||
* Queries each configured endpoint for both liveness and routing health:
|
* Queries each configured endpoint for its `/api/version` response.
|
||||||
Ollama endpoints are probed at `/api/version` AND `/api/ps`,
|
|
||||||
OpenAI-compatible endpoints at `/models`.
|
|
||||||
* Returns a JSON object containing:
|
* Returns a JSON object containing:
|
||||||
- `status`: "ok" if every endpoint replied to every probe, otherwise "error".
|
- `status`: "ok" if every endpoint replied, otherwise "error".
|
||||||
- `endpoints`: a mapping of endpoint URL → `{status, version|detail}`.
|
- `endpoints`: a mapping of endpoint URL → `{status, version|detail}`.
|
||||||
* The HTTP status code is 200 when everything is healthy, 503 otherwise.
|
* The HTTP status code is 200 when everything is healthy, 503 otherwise.
|
||||||
"""
|
"""
|
||||||
# Run all health checks in parallel.
|
# Run all health checks in parallel.
|
||||||
# Ollama endpoints expose /api/version (liveness) and /api/ps (routing
|
# Ollama endpoints expose /api/version; OpenAI-compatible endpoints (vLLM,
|
||||||
# health — required by `choose_endpoint`). OpenAI-compatible endpoints
|
# llama-server, external) expose /models. Using /api/version against an
|
||||||
# (vLLM, llama-server, external) expose /models, which serves both
|
# OpenAI-compatible endpoint yields a 404 and noisy log output.
|
||||||
# purposes. Probing /api/version alone would miss the case where the
|
|
||||||
# Ollama process is up but /api/ps is failing — see issue #83.
|
|
||||||
all_endpoints = list(config.endpoints)
|
all_endpoints = list(config.endpoints)
|
||||||
llama_eps_extra = [ep for ep in config.llama_server_endpoints if ep not in config.endpoints]
|
llama_eps_extra = [ep for ep in config.llama_server_endpoints if ep not in config.endpoints]
|
||||||
all_endpoints += llama_eps_extra
|
all_endpoints += llama_eps_extra
|
||||||
|
|
||||||
probe_results = await asyncio.gather(
|
tasks = []
|
||||||
*(_endpoint_health(ep) for ep in all_endpoints),
|
for ep in all_endpoints:
|
||||||
)
|
if is_openai_compatible(ep):
|
||||||
|
tasks.append(fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep), skip_error_cache=True))
|
||||||
|
else:
|
||||||
|
tasks.append(fetch.endpoint_details(ep, "/api/version", "version", skip_error_cache=True))
|
||||||
|
|
||||||
health_summary = dict(zip(all_endpoints, probe_results))
|
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||||
overall_ok = all(entry.get("status") == "ok" for entry in probe_results)
|
|
||||||
|
health_summary = {}
|
||||||
|
overall_ok = True
|
||||||
|
|
||||||
|
for ep, result in zip(all_endpoints, results):
|
||||||
|
if isinstance(result, Exception):
|
||||||
|
# Endpoint did not respond / returned an error
|
||||||
|
health_summary[ep] = {"status": "error", "detail": str(result)}
|
||||||
|
overall_ok = False
|
||||||
|
else:
|
||||||
|
# Successful response – report the reported version (Ollama) or
|
||||||
|
# indicate the endpoint is reachable (OpenAI-compatible).
|
||||||
|
if is_openai_compatible(ep):
|
||||||
|
health_summary[ep] = {"status": "ok"}
|
||||||
|
else:
|
||||||
|
health_summary[ep] = {"status": "ok", "version": result}
|
||||||
|
|
||||||
response_payload = {
|
response_payload = {
|
||||||
"status": "ok" if overall_ok else "error",
|
"status": "ok" if overall_ok else "error",
|
||||||
|
|
|
||||||
|
|
@ -192,10 +192,6 @@
|
||||||
color: #8b0000;
|
color: #8b0000;
|
||||||
font-weight: bold;
|
font-weight: bold;
|
||||||
}
|
}
|
||||||
.status-error[title] {
|
|
||||||
cursor: help;
|
|
||||||
text-decoration: underline dotted;
|
|
||||||
}
|
|
||||||
.copy-link,
|
.copy-link,
|
||||||
.delete-link,
|
.delete-link,
|
||||||
.show-link,
|
.show-link,
|
||||||
|
|
@ -740,16 +736,6 @@ function renderTimeSeriesChart(timeSeriesData, chart, minutes) {
|
||||||
return await resp.json();
|
return await resp.json();
|
||||||
}
|
}
|
||||||
|
|
||||||
function escapeHtml(value) {
|
|
||||||
if (value === null || value === undefined) return "";
|
|
||||||
return String(value)
|
|
||||||
.replace(/&/g, "&")
|
|
||||||
.replace(/</g, "<")
|
|
||||||
.replace(/>/g, ">")
|
|
||||||
.replace(/"/g, """)
|
|
||||||
.replace(/'/g, "'");
|
|
||||||
}
|
|
||||||
|
|
||||||
function toggleDarkMode() {
|
function toggleDarkMode() {
|
||||||
document.documentElement.classList.toggle("dark-mode");
|
document.documentElement.classList.toggle("dark-mode");
|
||||||
}
|
}
|
||||||
|
|
@ -766,24 +752,40 @@ function renderTimeSeriesChart(timeSeriesData, chart, minutes) {
|
||||||
// Build HTML for both endpoints and llama_server_endpoints
|
// Build HTML for both endpoints and llama_server_endpoints
|
||||||
let html = "";
|
let html = "";
|
||||||
|
|
||||||
const renderRow = (e) => {
|
// Add Ollama endpoints
|
||||||
const statusClass =
|
html += data.endpoints
|
||||||
e.status === "ok" ? "status-ok" : "status-error";
|
.map((e) => {
|
||||||
const version = e.version || "N/A";
|
const statusClass =
|
||||||
const titleAttr = e.detail
|
e.status === "ok"
|
||||||
? ` title="${escapeHtml(e.detail)}"`
|
? "status-ok"
|
||||||
: "";
|
: "status-error";
|
||||||
return `
|
const version = e.version || "N/A";
|
||||||
|
return `
|
||||||
<tr>
|
<tr>
|
||||||
<td class="endpoint">${escapeHtml(e.url)}</td>
|
<td class="endpoint">${e.url}</td>
|
||||||
<td class="status ${statusClass}"${titleAttr}>${escapeHtml(e.status)}</td>
|
<td class="status ${statusClass}">${e.status}</td>
|
||||||
<td class="version">${escapeHtml(version)}</td>
|
<td class="version">${version}</td>
|
||||||
</tr>`;
|
</tr>`;
|
||||||
};
|
})
|
||||||
|
.join("");
|
||||||
html += data.endpoints.map(renderRow).join("");
|
|
||||||
|
// Add llama-server endpoints
|
||||||
if (data.llama_server_endpoints && data.llama_server_endpoints.length > 0) {
|
if (data.llama_server_endpoints && data.llama_server_endpoints.length > 0) {
|
||||||
html += data.llama_server_endpoints.map(renderRow).join("");
|
html += data.llama_server_endpoints
|
||||||
|
.map((e) => {
|
||||||
|
const statusClass =
|
||||||
|
e.status === "ok"
|
||||||
|
? "status-ok"
|
||||||
|
: "status-error";
|
||||||
|
const version = e.version || "N/A";
|
||||||
|
return `
|
||||||
|
<tr>
|
||||||
|
<td class="endpoint">${e.url}</td>
|
||||||
|
<td class="status ${statusClass}">${e.status}</td>
|
||||||
|
<td class="version">${version}</td>
|
||||||
|
</tr>`;
|
||||||
|
})
|
||||||
|
.join("");
|
||||||
}
|
}
|
||||||
|
|
||||||
body.innerHTML = html;
|
body.innerHTML = html;
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,4 @@
|
||||||
"""Tests for choose_endpoint routing logic with mocked fetch calls."""
|
"""Tests for choose_endpoint routing logic with mocked fetch calls."""
|
||||||
import time
|
|
||||||
from unittest.mock import AsyncMock, MagicMock, patch
|
from unittest.mock import AsyncMock, MagicMock, patch
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
@ -26,12 +25,10 @@ def _make_cfg(endpoints, llama_eps=None, max_conn=2, endpoint_config=None, prior
|
||||||
|
|
||||||
@pytest.fixture(autouse=True)
|
@pytest.fixture(autouse=True)
|
||||||
def reset_usage():
|
def reset_usage():
|
||||||
"""Clear usage_counts and error caches between tests to prevent bleed."""
|
"""Clear usage_counts between tests to prevent bleed."""
|
||||||
router.usage_counts.clear()
|
router.usage_counts.clear()
|
||||||
router._loaded_error_cache.clear()
|
|
||||||
yield
|
yield
|
||||||
router.usage_counts.clear()
|
router.usage_counts.clear()
|
||||||
router._loaded_error_cache.clear()
|
|
||||||
|
|
||||||
|
|
||||||
class TestChooseEndpointBasic:
|
class TestChooseEndpointBasic:
|
||||||
|
|
@ -105,57 +102,6 @@ class TestChooseEndpointBasic:
|
||||||
# Least-busy is EP2
|
# Least-busy is EP2
|
||||||
assert ep == EP2
|
assert ep == EP2
|
||||||
|
|
||||||
async def test_excludes_endpoint_with_recent_loaded_error(self):
|
|
||||||
# Regression: issue #83 — when /api/ps fails for EP1 but EP1
|
|
||||||
# still advertises the model via /api/tags, routing must not
|
|
||||||
# fall back to EP1 just because it has a free slot.
|
|
||||||
cfg = _make_cfg([EP1, EP2])
|
|
||||||
|
|
||||||
async def available(ep, *_):
|
|
||||||
return {"llama3.2:latest"}
|
|
||||||
|
|
||||||
# EP1's /api/ps probe failed recently; EP2 is fine but the model
|
|
||||||
# is not loaded there. Without the health filter, EP1 would be
|
|
||||||
# picked by the free-slot fallback (step 4 in choose_endpoint).
|
|
||||||
router._loaded_error_cache[EP1] = time.time()
|
|
||||||
|
|
||||||
with (
|
|
||||||
patch.object(router, "config", cfg),
|
|
||||||
patch.object(router.fetch, "available_models", side_effect=available),
|
|
||||||
patch.object(router.fetch, "loaded_models", AsyncMock(return_value=set())),
|
|
||||||
):
|
|
||||||
ep, _ = await router.choose_endpoint("llama3.2:latest")
|
|
||||||
assert ep == EP2
|
|
||||||
|
|
||||||
async def test_stale_loaded_error_does_not_exclude(self):
|
|
||||||
# Errors older than the 300s window must not keep an endpoint
|
|
||||||
# excluded forever.
|
|
||||||
cfg = _make_cfg([EP1])
|
|
||||||
router._loaded_error_cache[EP1] = time.time() - 301
|
|
||||||
|
|
||||||
with (
|
|
||||||
patch.object(router, "config", cfg),
|
|
||||||
patch.object(router.fetch, "available_models", AsyncMock(return_value={"m:latest"})),
|
|
||||||
patch.object(router.fetch, "loaded_models", AsyncMock(return_value={"m:latest"})),
|
|
||||||
):
|
|
||||||
ep, _ = await router.choose_endpoint("m:latest")
|
|
||||||
assert ep == EP1
|
|
||||||
|
|
||||||
async def test_all_unhealthy_still_routes(self):
|
|
||||||
# If every candidate has a fresh loaded-error we still try one
|
|
||||||
# (it may have recovered between the cache write and now) rather
|
|
||||||
# than refusing to route.
|
|
||||||
cfg = _make_cfg([EP1])
|
|
||||||
router._loaded_error_cache[EP1] = time.time()
|
|
||||||
|
|
||||||
with (
|
|
||||||
patch.object(router, "config", cfg),
|
|
||||||
patch.object(router.fetch, "available_models", AsyncMock(return_value={"m:latest"})),
|
|
||||||
patch.object(router.fetch, "loaded_models", AsyncMock(return_value=set())),
|
|
||||||
):
|
|
||||||
ep, _ = await router.choose_endpoint("m:latest")
|
|
||||||
assert ep == EP1
|
|
||||||
|
|
||||||
async def test_reserve_increments_usage(self):
|
async def test_reserve_increments_usage(self):
|
||||||
cfg = _make_cfg([EP1])
|
cfg = _make_cfg([EP1])
|
||||||
with (
|
with (
|
||||||
|
|
|
||||||
|
|
@ -178,33 +178,3 @@ class TestFetchLoadedModels:
|
||||||
first = await router.fetch.loaded_models(MOCK_OLLAMA_EP)
|
first = await router.fetch.loaded_models(MOCK_OLLAMA_EP)
|
||||||
second = await router.fetch.loaded_models(MOCK_OLLAMA_EP)
|
second = await router.fetch.loaded_models(MOCK_OLLAMA_EP)
|
||||||
assert first == second
|
assert first == second
|
||||||
|
|
||||||
async def test_records_error_in_loaded_error_cache_on_failure(self):
|
|
||||||
# Regression: issue #83 — /api/ps failures must be recorded so
|
|
||||||
# `choose_endpoint` can exclude unhealthy backends from routing.
|
|
||||||
cfg = _make_cfg(ollama_eps=[MOCK_OLLAMA_EP], llama_eps=[])
|
|
||||||
with patch.object(router, "config", cfg), aioresponses() as m:
|
|
||||||
m.get(f"{MOCK_OLLAMA_EP}/api/ps", status=502, payload={})
|
|
||||||
await router.fetch.loaded_models(MOCK_OLLAMA_EP)
|
|
||||||
assert MOCK_OLLAMA_EP in router._loaded_error_cache
|
|
||||||
|
|
||||||
async def test_records_error_for_llama_server_on_failure(self):
|
|
||||||
cfg = _make_cfg(ollama_eps=[], llama_eps=[MOCK_LLAMA_EP])
|
|
||||||
with patch.object(router, "config", cfg), aioresponses() as m:
|
|
||||||
m.get(f"{MOCK_LLAMA_EP}/models", status=502, payload={})
|
|
||||||
await router.fetch.loaded_models(MOCK_LLAMA_EP)
|
|
||||||
assert MOCK_LLAMA_EP in router._loaded_error_cache
|
|
||||||
|
|
||||||
async def test_clears_error_cache_on_subsequent_success(self):
|
|
||||||
cfg = _make_cfg(ollama_eps=[MOCK_OLLAMA_EP], llama_eps=[])
|
|
||||||
# Pre-seed an old error so loaded_models() falls through to the
|
|
||||||
# network probe instead of short-circuiting on the error cache.
|
|
||||||
async with router._loaded_error_cache_lock:
|
|
||||||
router._loaded_error_cache[MOCK_OLLAMA_EP] = time.time() - 301
|
|
||||||
with patch.object(router, "config", cfg), aioresponses() as m:
|
|
||||||
m.get(
|
|
||||||
f"{MOCK_OLLAMA_EP}/api/ps",
|
|
||||||
payload={"models": [{"name": "qwen:7b"}]},
|
|
||||||
)
|
|
||||||
await router.fetch.loaded_models(MOCK_OLLAMA_EP)
|
|
||||||
assert MOCK_OLLAMA_EP not in router._loaded_error_cache
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue