diff --git a/.forgejo/workflows/docker-publish-semantic.yml b/.forgejo/workflows/docker-publish-semantic.yml index 163f1a1..ebbfd36 100644 --- a/.forgejo/workflows/docker-publish-semantic.yml +++ b/.forgejo/workflows/docker-publish-semantic.yml @@ -86,7 +86,7 @@ jobs: provenance: false build-args: | SEMANTIC_CACHE=true - tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-${{ matrix.arch }}-${{ github.run_id }} + tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-${{ matrix.arch }} merge: runs-on: docker-amd64 @@ -142,6 +142,6 @@ jobs: run: | docker buildx imagetools create \ $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ - ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-amd64-${{ github.run_id }} \ - ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-arm64-${{ github.run_id }} + ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-amd64 \ + ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-arm64 diff --git a/.forgejo/workflows/docker-publish.yml b/.forgejo/workflows/docker-publish.yml index 09e145c..27cd879 100644 --- a/.forgejo/workflows/docker-publish.yml +++ b/.forgejo/workflows/docker-publish.yml @@ -77,7 +77,7 @@ jobs: platforms: ${{ matrix.platform }} push: true provenance: false - tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-${{ matrix.arch }}-${{ github.run_id }} + tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-${{ matrix.arch }} merge: runs-on: docker-amd64 @@ -133,6 +133,6 @@ jobs: run: | docker buildx imagetools create \ $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ - ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-amd64-${{ github.run_id }} \ - ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-arm64-${{ github.run_id }} + ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-amd64 \ + ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-arm64 diff --git a/.forgejo/workflows/opencode.yml b/.forgejo/workflows/opencode.yml index 4fef9e1..6d126d3 100644 --- a/.forgejo/workflows/opencode.yml +++ b/.forgejo/workflows/opencode.yml @@ -11,7 +11,7 @@ jobs: opencode: if: | contains(github.event.comment.body, '/oc') || - contains(github.event.review.body, '/oc') + contains(github.event.comment.body, '/opencode') runs-on: docker-amd64 container: image: node:lts-bookworm @@ -54,7 +54,9 @@ jobs: uses: ./.opencode-action with: nomyo_api_key: ${{ secrets.NOMYO_API_KEY }} - model: nomyo/unsloth/Qwen3.6-35B-A3B-MTP-GGUF:Q4_K_XL + model: nomyo/unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q4_K_M forgejo_api_url: https://bitfreedom.net/code/ forgejo_token: ${{ secrets.FORGEJO_TOKEN }} forgejo_push_token: ${{ secrets.FORGEJO_PUSH_TOKEN }} + + diff --git a/.forgejo/workflows/test-action.yml b/.forgejo/workflows/test-action.yml new file mode 100644 index 0000000..e91b832 --- /dev/null +++ b/.forgejo/workflows/test-action.yml @@ -0,0 +1,20 @@ +name: artifact-probe +on: workflow_dispatch +jobs: + up: + runs-on: docker-amd64 + container: + image: node:lts-bookworm + steps: + - run: echo "hello" > /tmp/probe.txt + - uses: https://github.com/actions/upload-artifact@v4 + with: { name: probe, path: /tmp/probe.txt } + down: + needs: up + runs-on: docker-amd64 + container: + image: node:lts-bookworm + steps: + - uses: https://github.com/actions/download-artifact@v4 + with: { name: probe, path: /tmp/got } + - run: cat /tmp/got/probe.txt \ No newline at end of file diff --git a/doc/architecture.md b/doc/architecture.md index c2408d8..f725573 100644 --- a/doc/architecture.md +++ b/doc/architecture.md @@ -206,8 +206,6 @@ The `/health` endpoint provides comprehensive health status: } ``` -For Ollama endpoints the probe is a parallel check of `/api/version` (liveness) and `/api/ps` (the route used by `choose_endpoint` when selecting a backend for a request). Reporting `ok` only when both succeed prevents the router from advertising an endpoint as healthy while completion calls dead-end on `/api/ps`. The same dual probe backs `/api/config`, which the dashboard uses to render endpoint health. - ## Database Schema The router uses SQLite for persistent storage: diff --git a/doc/monitoring.md b/doc/monitoring.md index 9ce25ec..ab75d25 100644 --- a/doc/monitoring.md +++ b/doc/monitoring.md @@ -29,10 +29,6 @@ Response: - `200`: All endpoints healthy - `503`: One or more endpoints unhealthy -**Probe scope per endpoint**: -- **Ollama endpoints** are probed at both `/api/version` (liveness) and `/api/ps` (model-introspection used by the router). If either fails the endpoint is reported as `error`; the response still includes `version` when the daemon is reachable so operators can tell a partial failure from a full outage. The `detail` field names the failing probe, e.g. `"/api/ps: 502 …"`. -- **OpenAI-compatible / llama-server endpoints** are probed at `/models`. - ### Current Usage ```bash @@ -137,8 +133,6 @@ Response: } ``` -Uses the same dual-probe logic as `/health` (Ollama: `/api/version` + `/api/ps`; OpenAI-compatible: `/models`). An endpoint will report `error` whenever either probe fails. The dashboard renders the `detail` field as a tooltip on the status cell. - ### Cache Statistics ```bash diff --git a/requirements.txt b/requirements.txt index b03afff..5920529 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,18 +5,18 @@ annotated-types==0.7.0 anyio==4.13.0 async-timeout==5.0.1 attrs==26.1.0 -certifi==2026.5.20 +certifi==2026.4.22 click==8.4.0 distro==1.9.0 exceptiongroup==1.3.1 -fastapi==0.136.3 +fastapi==0.136.1 fastapi-sse==1.1.1 frozenlist==1.8.0 h11==0.16.0 httpcore==1.0.9 httpx==0.28.1 idna==3.15 -jiter==0.15.0 +jiter==0.14.0 multidict==6.7.1 ollama==0.6.2 openai==2.37.0 @@ -30,10 +30,10 @@ pydantic_core==2.46.4 python-dotenv==1.2.2 PyYAML==6.0.3 sniffio==1.3.1 -starlette>=1.0.1 +starlette==0.52.1 truststore==0.10.4 tiktoken==0.13.0 -tqdm==4.68.1 +tqdm==4.67.3 typing-inspection==0.4.2 typing_extensions==4.15.0 uvicorn==0.47.0 diff --git a/router.py b/router.py index c465ebc..08225fb 100644 --- a/router.py +++ b/router.py @@ -1000,7 +1000,7 @@ class fetch: async with client.get(f"{endpoint}/models") as resp: await _ensure_success(resp) data = await resp.json() - + # Filter for loaded models only items = data.get("data", []) models = { @@ -1012,19 +1012,11 @@ class fetch: # Update cache with lock protection async with _loaded_models_cache_lock: _loaded_models_cache[endpoint] = (models, time.time()) - # Probe succeeded — clear any stale error so the endpoint - # becomes routable again. - async with _loaded_error_cache_lock: - _loaded_error_cache.pop(endpoint, None) return models except Exception as e: # If anything goes wrong we simply assume the endpoint has no models message = _format_connection_issue(f"{endpoint}/models", e) print(f"[fetch.loaded_models] {message}") - # Record the failure so `choose_endpoint` can avoid routing - # to an unhealthy backend and repeated probes short-circuit. - async with _loaded_error_cache_lock: - _loaded_error_cache[endpoint] = time.time() return set() else: # Original Ollama /api/ps logic @@ -1039,15 +1031,11 @@ class fetch: # Update cache with lock protection async with _loaded_models_cache_lock: _loaded_models_cache[endpoint] = (models, time.time()) - async with _loaded_error_cache_lock: - _loaded_error_cache.pop(endpoint, None) return models except Exception as e: # If anything goes wrong we simply assume the endpoint has no models message = _format_connection_issue(f"{endpoint}/api/ps", e) print(f"[fetch.loaded_models] {message}") - async with _loaded_error_cache_lock: - _loaded_error_cache[endpoint] = time.time() return set() async def _refresh_loaded_models(endpoint: str) -> None: @@ -1865,28 +1853,6 @@ async def choose_endpoint(model: str, reserve: bool = True, load_tasks = [fetch.loaded_models(ep) for ep in candidate_endpoints] loaded_sets = await asyncio.gather(*load_tasks) - # 3️⃣.5 Exclude endpoints whose loaded-model probe has been failing - # recently. Without this filter, an endpoint where `/api/ps` returns 5xx - # would appear with an empty loaded set but pass through to the - # free-slot fallback (step 4) — sending completion calls to an - # unhealthy backend. See issue #83. - async with _loaded_error_cache_lock: - unhealthy = { - ep for ep, ts in _loaded_error_cache.items() - if _is_fresh(ts, 300) - } - if unhealthy: - filtered = [ - (ep, models) for ep, models in zip(candidate_endpoints, loaded_sets) - if ep not in unhealthy - ] - if filtered: - candidate_endpoints = [ep for ep, _ in filtered] - loaded_sets = [models for _, models in filtered] - # If *every* candidate is unhealthy we still fall through with the - # original list — refusing to route is worse than retrying a - # possibly-recovered backend. - # Look up a possible affinity hint *before* taking usage_lock. The two # locks are never held together to avoid lock-ordering issues. affine_ep: Optional[str] = None @@ -3188,103 +3154,44 @@ async def usage_proxy(request: Request): "token_usage_counts": token_usage_counts} # ------------------------------------------------------------- -# 20. Endpoint health probes (shared by /api/config and /health) -# ------------------------------------------------------------- -async def _raw_probe( - ep: str, - route: str, - api_key: Optional[str] = None, - timeout: Optional[float] = None, -) -> tuple[bool, object]: - """Direct HTTP probe that distinguishes success from failure - (unlike `fetch.endpoint_details`, which returns [] on either). - Returns `(ok, payload_or_error_message)`. - """ - headers = {"Referer": default_headers.get("HTTP-Referer", "https://nomyo.ai")} - if api_key is not None: - headers["Authorization"] = "Bearer " + api_key - url = f"{ep.rstrip('/')}/{route.lstrip('/')}" - req_kwargs = {} - if timeout is not None: - req_kwargs["timeout"] = aiohttp.ClientTimeout(total=timeout) - try: - client: aiohttp.ClientSession = get_session(ep) - async with client.get(url, headers=headers, **req_kwargs) as resp: - await _ensure_success(resp) - data = await resp.json() - return True, data - except Exception as exc: - return False, _format_connection_issue(url, exc) - - -async def _endpoint_health(ep: str, *, timeout: Optional[float] = None) -> dict: - """Probe an endpoint and return `{status, version?, detail?}`. - - Ollama endpoints get a dual probe of `/api/version` and `/api/ps` so - that a daemon which is reachable but has a broken model-introspection - path (issue #83) is reported as `error` rather than `ok`. - OpenAI-compatible endpoints use a single `/models` probe. - """ - if is_openai_compatible(ep): - ok, payload = await _raw_probe( - ep, "/models", config.api_keys.get(ep), timeout=timeout, - ) - if ok: - return {"status": "ok", "version": "latest"} - return {"status": "error", "detail": str(payload)} - - (version_ok, version_payload), (ps_ok, ps_payload) = await asyncio.gather( - _raw_probe(ep, "/api/version", timeout=timeout), - _raw_probe(ep, "/api/ps", timeout=timeout), - ) - - version_value = ( - version_payload.get("version") - if version_ok and isinstance(version_payload, dict) - else None - ) - - if version_ok and ps_ok: - return {"status": "ok", "version": version_value} - if not version_ok and not ps_ok: - return {"status": "error", "detail": str(version_payload)} - # Partial failure — daemon reachable but one probe failed. Report - # as "error" so callers can surface the issue; include `version` so - # the operator knows the daemon itself is alive. - if not ps_ok: - return { - "status": "error", - "version": version_value, - "detail": f"/api/ps: {ps_payload}", - } - return { - "status": "error", - "detail": f"/api/version: {version_payload}", - } - - -# ------------------------------------------------------------- -# 20b. Proxy config route – for monitoring and frontend usage +# 20. Proxy config route – for monitoring and frontent usage # ------------------------------------------------------------- @app.get("/api/config") async def config_proxy(request: Request): """ Return a simple JSON object that contains the configured - Ollama endpoints and llama_server_endpoints. The front‑end uses this - to display which endpoints are being proxied and their health. - Status is "error" when either liveness (/api/version) or routing - health (/api/ps) fails — see issue #83. + Ollama endpoints and llama_server_endpoints. The front‑end uses this to display + which endpoints are being proxied. """ - async def check(url: str) -> dict: - return {"url": url, **(await _endpoint_health(url, timeout=5))} + async def check_endpoint(url: str): + client: aiohttp.ClientSession = get_session(url) + headers = None + if "/v1" in url: + headers = {"Authorization": "Bearer " + config.api_keys.get(url, "no-key")} + target_url = f"{url}/models" + else: + target_url = f"{url}/api/version" - ollama_results = await asyncio.gather(*[check(ep) for ep in config.endpoints]) + try: + async with client.get(target_url, headers=headers, timeout=aiohttp.ClientTimeout(total=5)) as resp: + await _ensure_success(resp) + data = await resp.json() + if "/v1" in url: + return {"url": url, "status": "ok", "version": "latest"} + else: + return {"url": url, "status": "ok", "version": data.get("version")} + except Exception as e: + detail = _format_connection_issue(target_url, e) + return {"url": url, "status": "error", "detail": detail} + + # Check Ollama endpoints + ollama_results = await asyncio.gather(*[check_endpoint(ep) for ep in config.endpoints]) + + # Check llama-server endpoints llama_results = [] if config.llama_server_endpoints: - llama_results = await asyncio.gather( - *[check(ep) for ep in config.llama_server_endpoints] - ) - + llama_results = await asyncio.gather(*[check_endpoint(ep) for ep in config.llama_server_endpoints]) + return { "endpoints": ollama_results, "llama_server_endpoints": llama_results, @@ -4096,30 +4003,44 @@ async def health_proxy(request: Request): """ Health‑check endpoint for monitoring the proxy. - * Queries each configured endpoint for both liveness and routing health: - Ollama endpoints are probed at `/api/version` AND `/api/ps`, - OpenAI-compatible endpoints at `/models`. + * Queries each configured endpoint for its `/api/version` response. * Returns a JSON object containing: - - `status`: "ok" if every endpoint replied to every probe, otherwise "error". + - `status`: "ok" if every endpoint replied, otherwise "error". - `endpoints`: a mapping of endpoint URL → `{status, version|detail}`. * The HTTP status code is 200 when everything is healthy, 503 otherwise. """ # Run all health checks in parallel. - # Ollama endpoints expose /api/version (liveness) and /api/ps (routing - # health — required by `choose_endpoint`). OpenAI-compatible endpoints - # (vLLM, llama-server, external) expose /models, which serves both - # purposes. Probing /api/version alone would miss the case where the - # Ollama process is up but /api/ps is failing — see issue #83. + # Ollama endpoints expose /api/version; OpenAI-compatible endpoints (vLLM, + # llama-server, external) expose /models. Using /api/version against an + # OpenAI-compatible endpoint yields a 404 and noisy log output. all_endpoints = list(config.endpoints) llama_eps_extra = [ep for ep in config.llama_server_endpoints if ep not in config.endpoints] all_endpoints += llama_eps_extra - probe_results = await asyncio.gather( - *(_endpoint_health(ep) for ep in all_endpoints), - ) + tasks = [] + for ep in all_endpoints: + if is_openai_compatible(ep): + tasks.append(fetch.endpoint_details(ep, "/models", "data", config.api_keys.get(ep), skip_error_cache=True)) + else: + tasks.append(fetch.endpoint_details(ep, "/api/version", "version", skip_error_cache=True)) - health_summary = dict(zip(all_endpoints, probe_results)) - overall_ok = all(entry.get("status") == "ok" for entry in probe_results) + results = await asyncio.gather(*tasks, return_exceptions=True) + + health_summary = {} + overall_ok = True + + for ep, result in zip(all_endpoints, results): + if isinstance(result, Exception): + # Endpoint did not respond / returned an error + health_summary[ep] = {"status": "error", "detail": str(result)} + overall_ok = False + else: + # Successful response – report the reported version (Ollama) or + # indicate the endpoint is reachable (OpenAI-compatible). + if is_openai_compatible(ep): + health_summary[ep] = {"status": "ok"} + else: + health_summary[ep] = {"status": "ok", "version": result} response_payload = { "status": "ok" if overall_ok else "error", diff --git a/static/index.html b/static/index.html index 8c0b16c..b29f22b 100644 --- a/static/index.html +++ b/static/index.html @@ -192,10 +192,6 @@ color: #8b0000; font-weight: bold; } - .status-error[title] { - cursor: help; - text-decoration: underline dotted; - } .copy-link, .delete-link, .show-link, @@ -740,16 +736,6 @@ function renderTimeSeriesChart(timeSeriesData, chart, minutes) { return await resp.json(); } - function escapeHtml(value) { - if (value === null || value === undefined) return ""; - return String(value) - .replace(/&/g, "&") - .replace(//g, ">") - .replace(/"/g, """) - .replace(/'/g, "'"); - } - function toggleDarkMode() { document.documentElement.classList.toggle("dark-mode"); } @@ -766,24 +752,40 @@ function renderTimeSeriesChart(timeSeriesData, chart, minutes) { // Build HTML for both endpoints and llama_server_endpoints let html = ""; - const renderRow = (e) => { - const statusClass = - e.status === "ok" ? "status-ok" : "status-error"; - const version = e.version || "N/A"; - const titleAttr = e.detail - ? ` title="${escapeHtml(e.detail)}"` - : ""; - return ` + // Add Ollama endpoints + html += data.endpoints + .map((e) => { + const statusClass = + e.status === "ok" + ? "status-ok" + : "status-error"; + const version = e.version || "N/A"; + return `