merge origin/main, add DigitalOcean alongside Vercel and OpenRouter

This commit is contained in:
Spherrrical 2026-04-23 15:13:06 -07:00
commit 013f377ddf
138 changed files with 17041 additions and 3335 deletions

View file

@ -133,13 +133,13 @@ jobs:
load: true
tags: |
${{ env.PLANO_DOCKER_IMAGE }}
${{ env.DOCKER_IMAGE }}:0.4.19
${{ env.DOCKER_IMAGE }}:0.4.20
${{ env.DOCKER_IMAGE }}:latest
cache-from: type=gha
cache-to: type=gha,mode=max
- name: Save image as artifact
run: docker save ${{ env.PLANO_DOCKER_IMAGE }} ${{ env.DOCKER_IMAGE }}:0.4.19 ${{ env.DOCKER_IMAGE }}:latest -o /tmp/plano-image.tar
run: docker save ${{ env.PLANO_DOCKER_IMAGE }} ${{ env.DOCKER_IMAGE }}:0.4.20 ${{ env.DOCKER_IMAGE }}:latest -o /tmp/plano-image.tar
- name: Upload image artifact
uses: actions/upload-artifact@v6

View file

@ -24,7 +24,7 @@ export function Hero() {
>
<div className="inline-flex flex-wrap items-center gap-1.5 sm:gap-2 px-3 sm:px-4 py-1 rounded-full bg-[rgba(185,191,255,0.4)] border border-[var(--secondary)] shadow backdrop-blur hover:bg-[rgba(185,191,255,0.6)] transition-colors cursor-pointer">
<span className="text-xs sm:text-sm font-medium text-black/65">
v0.4.19
v0.4.20
</span>
<span className="text-xs sm:text-sm font-medium text-black ">

View file

@ -1 +1 @@
docker build -f Dockerfile . -t katanemo/plano -t katanemo/plano:0.4.19
docker build -f Dockerfile . -t katanemo/plano -t katanemo/plano:0.4.20

View file

@ -1,3 +1,3 @@
"""Plano CLI - Intelligent Prompt Gateway."""
__version__ = "0.4.19"
__version__ = "0.4.20"

View file

@ -30,6 +30,7 @@ SUPPORTED_PROVIDERS_WITHOUT_BASE_URL = [
"zhipu",
"vercel",
"openrouter",
"digitalocean",
]
SUPPORTED_PROVIDERS = (

View file

@ -5,7 +5,7 @@ PLANO_COLOR = "#969FF4"
SERVICE_NAME_ARCHGW = "plano"
PLANO_DOCKER_NAME = "plano"
PLANO_DOCKER_IMAGE = os.getenv("PLANO_DOCKER_IMAGE", "katanemo/plano:0.4.19")
PLANO_DOCKER_IMAGE = os.getenv("PLANO_DOCKER_IMAGE", "katanemo/plano:0.4.20")
DEFAULT_OTEL_TRACING_GRPC_ENDPOINT = "http://localhost:4317"
# Native mode constants

163
cli/planoai/defaults.py Normal file
View file

@ -0,0 +1,163 @@
"""Default config synthesizer for zero-config ``planoai up``.
When the user runs ``planoai up`` in a directory with no ``config.yaml`` /
``plano_config.yaml``, we synthesize a pass-through config that covers the
common LLM providers and auto-wires OTel export to ``localhost:4317`` so
``planoai obs`` works out of the box.
Auth handling:
- If the provider's env var is set, bind ``access_key: $ENV_VAR``.
- Otherwise set ``passthrough_auth: true`` so the client's own Authorization
header is forwarded. No env var is required to start the proxy.
"""
from __future__ import annotations
import os
from dataclasses import dataclass
DEFAULT_LLM_LISTENER_PORT = 12000
# plano_config validation requires an http:// scheme on the OTLP endpoint.
DEFAULT_OTLP_ENDPOINT = "http://localhost:4317"
@dataclass(frozen=True)
class ProviderDefault:
    """Static description of one LLM provider the zero-config path can wire up."""

    name: str  # provider key written into the synthesized model_providers row
    env_var: str  # environment variable expected to hold the provider's API key
    base_url: str  # upstream API base URL for the synthesized config
    model_pattern: str  # wildcard model route, e.g. "openai/*"
    # Only set for providers whose prefix in the model pattern is NOT one of the
    # built-in SUPPORTED_PROVIDERS in cli/planoai/config_generator.py. For
    # built-ins, the validator infers the interface from the model prefix and
    # rejects configs that set this field explicitly.
    provider_interface: str | None = None
# Keep ordering stable so synthesized configs diff cleanly across runs.
# Consumed by detect_providers(); each entry becomes one model_providers row
# in the zero-config synthesized by synthesize_default_config().
PROVIDER_DEFAULTS: list[ProviderDefault] = [
    ProviderDefault(
        name="openai",
        env_var="OPENAI_API_KEY",
        base_url="https://api.openai.com/v1",
        model_pattern="openai/*",
    ),
    ProviderDefault(
        name="anthropic",
        env_var="ANTHROPIC_API_KEY",
        base_url="https://api.anthropic.com/v1",
        model_pattern="anthropic/*",
    ),
    ProviderDefault(
        name="gemini",
        env_var="GEMINI_API_KEY",
        base_url="https://generativelanguage.googleapis.com/v1beta",
        model_pattern="gemini/*",
    ),
    ProviderDefault(
        name="groq",
        env_var="GROQ_API_KEY",
        base_url="https://api.groq.com/openai/v1",
        model_pattern="groq/*",
    ),
    ProviderDefault(
        name="deepseek",
        env_var="DEEPSEEK_API_KEY",
        base_url="https://api.deepseek.com/v1",
        model_pattern="deepseek/*",
    ),
    ProviderDefault(
        name="mistral",
        env_var="MISTRAL_API_KEY",
        base_url="https://api.mistral.ai/v1",
        model_pattern="mistral/*",
    ),
    # DigitalOcean Gradient is a first-class provider post-#889 — the
    # `digitalocean/` model prefix routes to the built-in Envoy cluster, no
    # base_url needed at runtime.
    ProviderDefault(
        name="digitalocean",
        env_var="DO_API_KEY",
        base_url="https://inference.do-ai.run/v1",
        model_pattern="digitalocean/*",
    ),
]
@dataclass
class DetectionResult:
    """Known providers split by whether their API-key env var was present."""

    with_keys: list[ProviderDefault]
    passthrough: list[ProviderDefault]

    @property
    def summary(self) -> str:
        """One-line description, e.g. ``env-keyed: openai | pass-through: groq``."""
        segments: list[str] = []
        for label, providers in (
            ("env-keyed", self.with_keys),
            ("pass-through", self.passthrough),
        ):
            if providers:
                segments.append(f"{label}: " + ", ".join(p.name for p in providers))
        return " | ".join(segments) if segments else "no providers"
def detect_providers(env: dict[str, str] | None = None) -> DetectionResult:
    """Partition PROVIDER_DEFAULTS by whether their API-key env var is non-empty.

    *env* defaults to a snapshot of ``os.environ``; pass a dict to make the
    result deterministic in tests. An env var set to the empty string counts
    as absent (pass-through).
    """
    environment = dict(os.environ) if env is None else env
    keyed = [p for p in PROVIDER_DEFAULTS if environment.get(p.env_var)]
    unkeyed = [p for p in PROVIDER_DEFAULTS if not environment.get(p.env_var)]
    return DetectionResult(with_keys=keyed, passthrough=unkeyed)
def synthesize_default_config(
    env: dict[str, str] | None = None,
    *,
    listener_port: int = DEFAULT_LLM_LISTENER_PORT,
    otel_endpoint: str = DEFAULT_OTLP_ENDPOINT,
) -> dict:
    """Build a pass-through config dict suitable for validation + envoy rendering.

    The returned dict can be dumped to YAML and handed to the existing
    ``planoai up`` pipeline unchanged. Providers whose env var is set get an
    ``access_key`` binding; the rest get ``passthrough_auth: true``.
    """
    detected = detect_providers(env)

    def _provider_row(p: ProviderDefault, auth_fields: dict) -> dict:
        # Base identity first, then auth; provider_interface only when the
        # provider is not one of the validator's built-ins (see ProviderDefault).
        row: dict = {"name": p.name, "model": p.model_pattern, "base_url": p.base_url}
        if p.provider_interface is not None:
            row["provider_interface"] = p.provider_interface
        row.update(auth_fields)
        return row

    providers = [
        _provider_row(p, {"access_key": f"${p.env_var}"}) for p in detected.with_keys
    ] + [
        _provider_row(p, {"passthrough_auth": True}) for p in detected.passthrough
    ]

    # No explicit `default: true` entry is synthesized: the plano config
    # validator rejects wildcard models as defaults, and brightstaff already
    # registers bare model names as lookup keys during wildcard expansion
    # (crates/common/src/llm_providers.rs), so `{"model": "gpt-4o-mini"}`
    # without a prefix resolves via the openai wildcard without needing
    # `default: true`. See discussion on #890.
    return {
        "version": "v0.4.0",
        "listeners": [
            {
                "name": "llm",
                "type": "model",
                "port": listener_port,
                "address": "0.0.0.0",
            }
        ],
        "model_providers": providers,
        "tracing": {
            "random_sampling": 100,
            "opentracing_grpc_endpoint": otel_endpoint,
        },
    }

View file

@ -6,7 +6,13 @@ import sys
import contextlib
import logging
import rich_click as click
import yaml
from planoai import targets
from planoai.defaults import (
DEFAULT_LLM_LISTENER_PORT,
detect_providers,
synthesize_default_config,
)
# Brand color - Plano purple
PLANO_COLOR = "#969FF4"
@ -31,6 +37,7 @@ from planoai.core import (
)
from planoai.init_cmd import init as init_cmd
from planoai.trace_cmd import trace as trace_cmd, start_trace_listener_background
from planoai.obs_cmd import obs as obs_cmd
from planoai.consts import (
DEFAULT_OTEL_TRACING_GRPC_ENDPOINT,
DEFAULT_NATIVE_OTEL_TRACING_GRPC_ENDPOINT,
@ -317,7 +324,23 @@ def build(docker):
help="Show detailed startup logs with timestamps.",
is_flag=True,
)
def up(file, path, foreground, with_tracing, tracing_port, docker, verbose):
@click.option(
"--listener-port",
default=DEFAULT_LLM_LISTENER_PORT,
type=int,
show_default=True,
help="Override the LLM listener port when running without a config file. Ignored when a config file is present.",
)
def up(
file,
path,
foreground,
with_tracing,
tracing_port,
docker,
verbose,
listener_port,
):
"""Starts Plano."""
from rich.status import Status
@ -328,12 +351,23 @@ def up(file, path, foreground, with_tracing, tracing_port, docker, verbose):
# Use the utility function to find config file
plano_config_file = find_config_file(path, file)
# Check if the file exists
# Zero-config fallback: when no user config is present, synthesize a
# pass-through config that covers the common LLM providers and
# auto-wires OTel export to ``planoai obs``. See cli/planoai/defaults.py.
if not os.path.exists(plano_config_file):
detection = detect_providers()
cfg_dict = synthesize_default_config(listener_port=listener_port)
default_dir = os.path.expanduser("~/.plano")
os.makedirs(default_dir, exist_ok=True)
synthesized_path = os.path.join(default_dir, "default_config.yaml")
with open(synthesized_path, "w") as fh:
yaml.safe_dump(cfg_dict, fh, sort_keys=False)
plano_config_file = synthesized_path
console.print(
f"[red]✗[/red] Config file not found: [dim]{plano_config_file}[/dim]"
f"[dim]No plano config found; using defaults ({detection.summary}). "
f"Listening on :{listener_port}, tracing -> http://localhost:4317.[/dim]"
)
sys.exit(1)
if not docker:
from planoai.native_runner import native_validate_config
@ -681,6 +715,7 @@ main.add_command(cli_agent)
main.add_command(generate_prompt_targets)
main.add_command(init_cmd, name="init")
main.add_command(trace_cmd, name="trace")
main.add_command(obs_cmd, name="obs")
if __name__ == "__main__":
main()

View file

@ -0,0 +1,6 @@
"""Plano observability console: in-memory live view of LLM traffic."""
from planoai.obs.collector import LLMCall, LLMCallStore, ObsCollector
from planoai.obs.pricing import PricingCatalog
__all__ = ["LLMCall", "LLMCallStore", "ObsCollector", "PricingCatalog"]

View file

@ -0,0 +1,266 @@
"""In-memory collector for LLM calls, fed by OTLP/gRPC spans from brightstaff."""
from __future__ import annotations
import threading
from collections import deque
from concurrent import futures
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any, Iterable
import grpc
from opentelemetry.proto.collector.trace.v1 import (
trace_service_pb2,
trace_service_pb2_grpc,
)
DEFAULT_GRPC_PORT = 4317
DEFAULT_CAPACITY = 1000
@dataclass
class LLMCall:
    """One LLM call as reconstructed from a brightstaff LLM span.

    Fields default to ``None`` when the underlying span attribute was absent.
    """

    request_id: str
    timestamp: datetime
    model: str
    provider: str | None = None
    request_model: str | None = None
    session_id: str | None = None
    route_name: str | None = None
    is_streaming: bool | None = None
    status_code: int | None = None
    prompt_tokens: int | None = None
    completion_tokens: int | None = None
    total_tokens: int | None = None
    cached_input_tokens: int | None = None
    cache_creation_tokens: int | None = None
    reasoning_tokens: int | None = None
    ttft_ms: float | None = None
    duration_ms: float | None = None
    routing_strategy: str | None = None
    routing_reason: str | None = None
    cost_usd: float | None = None

    @property
    def tpt_ms(self) -> float | None:
        """Mean milliseconds per generated token, excluding time-to-first-token."""
        total_ms, out_tokens = self.duration_ms, self.completion_tokens
        if total_ms is None or not out_tokens:
            return None
        generation_ms = total_ms - (self.ttft_ms or 0.0)
        return generation_ms / out_tokens if generation_ms > 0 else None

    @property
    def tokens_per_sec(self) -> float | None:
        """Generation throughput derived from :attr:`tpt_ms` (None when unknown)."""
        per_token = self.tpt_ms
        return 1000.0 / per_token if per_token and per_token > 0 else None
class LLMCallStore:
    """Thread-safe ring buffer of recent LLM calls."""

    def __init__(self, capacity: int = DEFAULT_CAPACITY) -> None:
        # deque(maxlen=...) gives O(1) append with automatic eviction of the
        # oldest call once `capacity` entries are held.
        self._capacity = capacity
        self._lock = threading.Lock()
        self._calls: deque[LLMCall] = deque(maxlen=capacity)

    @property
    def capacity(self) -> int:
        """Maximum number of calls retained before eviction."""
        return self._capacity

    def add(self, call: LLMCall) -> None:
        """Append one call, silently evicting the oldest when full."""
        with self._lock:
            self._calls.append(call)

    def clear(self) -> None:
        """Drop every buffered call."""
        with self._lock:
            self._calls.clear()

    def snapshot(self) -> list[LLMCall]:
        """Return a copy of the buffer, oldest first."""
        with self._lock:
            return list(self._calls)

    def __len__(self) -> int:
        with self._lock:
            return len(self._calls)
# Span attribute keys used below are the canonical OTel / Plano keys emitted by
# brightstaff — see crates/brightstaff/src/tracing/constants.rs for the source
# of truth.
def _anyvalue_to_python(value: Any) -> Any: # AnyValue from OTLP
kind = value.WhichOneof("value")
if kind == "string_value":
return value.string_value
if kind == "bool_value":
return value.bool_value
if kind == "int_value":
return value.int_value
if kind == "double_value":
return value.double_value
return None
def _attrs_to_dict(attrs: Iterable[Any]) -> dict[str, Any]:
    """Flatten OTLP KeyValue pairs into a dict, dropping unsupported value kinds.

    Later duplicate keys overwrite earlier ones, same as the plain-loop form.
    """
    return {
        kv.key: converted
        for kv in attrs
        if (converted := _anyvalue_to_python(kv.value)) is not None
    }
def _maybe_int(value: Any) -> int | None:
if value is None:
return None
try:
return int(value)
except (TypeError, ValueError):
return None
def _maybe_float(value: Any) -> float | None:
if value is None:
return None
try:
return float(value)
except (TypeError, ValueError):
return None
def span_to_llm_call(
    span: Any, service_name: str, pricing: Any | None = None
) -> LLMCall | None:
    """Convert an OTLP span into an LLMCall, or return None if it isn't one.

    A span is considered an LLM call iff it carries the ``llm.model`` attribute.

    Args:
        span: OTLP ``Span`` protobuf message.
        service_name: resource-level ``service.name``; used as the provider
            fallback when the span has no ``llm.provider`` attribute.
        pricing: optional object exposing ``cost_for_call(call) -> float | None``
            (duck-typed; ``PricingCatalog`` satisfies it).
    """
    attrs = _attrs_to_dict(span.attributes)
    model = attrs.get("llm.model")
    if not model:
        # Not an LLM span (routing/infra span) — caller skips it.
        return None
    # Prefer explicit span attributes; fall back to likely aliases.
    # Last resort: the span id itself (hex) so the call still has a stable key.
    request_id = next(
        (
            str(attrs[key])
            for key in ("request_id", "http.request_id")
            if key in attrs and attrs[key] is not None
        ),
        span.span_id.hex() if span.span_id else "",
    )
    start_ns = span.start_time_unix_nano or 0
    # OTLP start time is nanoseconds since epoch (UTC); render in local time.
    # A zero/absent start time falls back to "now".
    ts = (
        datetime.fromtimestamp(start_ns / 1_000_000_000, tz=timezone.utc).astimezone()
        if start_ns
        else datetime.now().astimezone()
    )
    # Attribute keys below are the canonical Plano keys emitted by brightstaff
    # (crates/brightstaff/src/tracing/constants.rs); absent keys become None.
    call = LLMCall(
        request_id=str(request_id),
        timestamp=ts,
        model=str(model),
        provider=(
            str(attrs["llm.provider"]) if "llm.provider" in attrs else service_name
        ),
        request_model=(
            str(attrs["model.requested"]) if "model.requested" in attrs else None
        ),
        session_id=(
            str(attrs["plano.session_id"]) if "plano.session_id" in attrs else None
        ),
        route_name=(
            str(attrs["plano.route.name"]) if "plano.route.name" in attrs else None
        ),
        is_streaming=(
            bool(attrs["llm.is_streaming"]) if "llm.is_streaming" in attrs else None
        ),
        status_code=_maybe_int(attrs.get("http.status_code")),
        prompt_tokens=_maybe_int(attrs.get("llm.usage.prompt_tokens")),
        completion_tokens=_maybe_int(attrs.get("llm.usage.completion_tokens")),
        total_tokens=_maybe_int(attrs.get("llm.usage.total_tokens")),
        cached_input_tokens=_maybe_int(attrs.get("llm.usage.cached_input_tokens")),
        cache_creation_tokens=_maybe_int(attrs.get("llm.usage.cache_creation_tokens")),
        reasoning_tokens=_maybe_int(attrs.get("llm.usage.reasoning_tokens")),
        ttft_ms=_maybe_float(attrs.get("llm.time_to_first_token")),
        duration_ms=_maybe_float(attrs.get("llm.duration_ms")),
        routing_strategy=(
            str(attrs["routing.strategy"]) if "routing.strategy" in attrs else None
        ),
        routing_reason=(
            str(attrs["routing.selection_reason"])
            if "routing.selection_reason" in attrs
            else None
        ),
    )
    # Cost is attached after construction because it derives from the token
    # fields populated above.
    if pricing is not None:
        call.cost_usd = pricing.cost_for_call(call)
    return call
class _ObsServicer(trace_service_pb2_grpc.TraceServiceServicer):
    """OTLP TraceService endpoint that feeds parsed LLM spans into the store."""

    def __init__(self, store: LLMCallStore, pricing: Any | None) -> None:
        self._store = store
        self._pricing = pricing

    @staticmethod
    def _service_name_of(resource_spans) -> str:
        """Extract resource-level ``service.name``, defaulting to "unknown"."""
        for attr in resource_spans.resource.attributes:
            if attr.key != "service.name":
                continue
            converted = _anyvalue_to_python(attr.value)
            if converted is not None:
                return str(converted)
        return "unknown"

    def Export(self, request, context):  # noqa: N802 — gRPC generated name
        """Ingest one export request; always acks with an empty response."""
        for resource_spans in request.resource_spans:
            svc = self._service_name_of(resource_spans)
            for scope_spans in resource_spans.scope_spans:
                for raw_span in scope_spans.spans:
                    parsed = span_to_llm_call(raw_span, svc, self._pricing)
                    if parsed is not None:
                        self._store.add(parsed)
        return trace_service_pb2.ExportTraceServiceResponse()
@dataclass
class ObsCollector:
    """Owns the OTLP/gRPC server and the in-memory LLMCall ring buffer."""

    store: LLMCallStore = field(default_factory=LLMCallStore)
    pricing: Any | None = None
    host: str = "0.0.0.0"
    port: int = DEFAULT_GRPC_PORT
    # Live server handle; None while stopped. Excluded from init/repr.
    _server: grpc.Server | None = field(default=None, init=False, repr=False)

    def start(self) -> None:
        """Bind and start the OTLP listener; no-op when already running.

        Raises:
            OSError: the requested port could not be bound.
        """
        if self._server is not None:
            return  # already running
        srv = grpc.server(futures.ThreadPoolExecutor(max_workers=4))
        trace_service_pb2_grpc.add_TraceServiceServicer_to_server(
            _ObsServicer(self.store, self.pricing), srv
        )
        endpoint = f"{self.host}:{self.port}"
        # add_insecure_port returns 0 when the bind fails.
        if srv.add_insecure_port(endpoint) == 0:
            raise OSError(
                f"Failed to bind OTLP listener on {endpoint}: port already in use. "
                "Stop tracing via `planoai trace down` or pick another port with --port."
            )
        srv.start()
        self._server = srv

    def stop(self, grace: float = 2.0) -> None:
        """Stop the server with a grace period; safe to call when not running."""
        srv, self._server = self._server, None
        if srv is not None:
            srv.stop(grace)

321
cli/planoai/obs/pricing.py Normal file
View file

@ -0,0 +1,321 @@
"""DigitalOcean Gradient pricing catalog for the obs console.
Ported loosely from ``crates/brightstaff/src/router/model_metrics.rs::fetch_do_pricing``.
Single-source: one fetch at startup, cached for the life of the process.
"""
from __future__ import annotations
import logging
import re
import threading
from dataclasses import dataclass
from typing import Any
import requests
DEFAULT_PRICING_URL = "https://api.digitalocean.com/v2/gen-ai/models/catalog"
FETCH_TIMEOUT_SECS = 5.0
logger = logging.getLogger(__name__)
@dataclass(frozen=True)
class ModelPrice:
    """Input/output $/token rates. Token counts are multiplied by these."""

    input_per_token_usd: float  # USD per prompt token
    output_per_token_usd: float  # USD per completion token
    # USD per cached prompt token; None when the catalog gave no cached rate,
    # in which case cached tokens are billed at the regular input rate.
    cached_input_per_token_usd: float | None = None
class PricingCatalog:
    """In-memory pricing lookup keyed by model id.

    DO's catalog uses ids like ``openai-gpt-5.4``; Plano's resolved model names
    may arrive as ``do/openai-gpt-5.4`` or bare ``openai-gpt-5.4``. We strip the
    leading provider prefix when looking up.
    """

    def __init__(self, prices: dict[str, ModelPrice] | None = None) -> None:
        # `prices or {}` also replaces an explicitly-passed empty dict; an
        # empty catalog behaves identically either way.
        self._prices: dict[str, ModelPrice] = prices or {}
        # Guards _prices for concurrent readers.
        self._lock = threading.Lock()

    def __len__(self) -> int:
        # Counts alias keys, not distinct models (aliases share one ModelPrice).
        with self._lock:
            return len(self._prices)

    def sample_models(self, n: int = 5) -> list[str]:
        """Return up to *n* alias keys (insertion order), mainly for debug output."""
        with self._lock:
            return list(self._prices.keys())[:n]

    @classmethod
    def fetch(cls, url: str = DEFAULT_PRICING_URL) -> "PricingCatalog":
        """Fetch pricing from DO's catalog endpoint. On failure, returns an
        empty catalog (cost column will be blank).

        The catalog endpoint is public — no auth required, no signup — so
        ``planoai obs`` gets cost data on first run out of the box.
        """
        try:
            resp = requests.get(url, timeout=FETCH_TIMEOUT_SECS)
            resp.raise_for_status()
            data = resp.json()
        except Exception as exc:  # noqa: BLE001 — best-effort; never fatal
            logger.warning(
                "DO pricing fetch failed: %s; cost column will be blank.",
                exc,
            )
            return cls()
        prices = _parse_do_pricing(data)
        if not prices:
            # Dump the first entry's raw shape so we can see which fields DO
            # actually returned — helps when the catalog adds new fields or
            # the response doesn't match our parser.
            import json as _json

            sample_items = _coerce_items(data)
            sample = sample_items[0] if sample_items else data
            logger.warning(
                "DO pricing response had no parseable entries; cost column "
                "will be blank. Sample entry: %s",
                _json.dumps(sample, default=str)[:400],
            )
        return cls(prices)

    def price_for(self, model_name: str | None) -> ModelPrice | None:
        """Look up a rate for *model_name*, trying several key normalizations."""
        if not model_name:
            return None
        with self._lock:
            # Try the full name first, then stripped prefix, then lowercased variants.
            for candidate in _model_key_candidates(model_name):
                hit = self._prices.get(candidate)
                if hit is not None:
                    return hit
        return None

    def cost_for_call(self, call: Any) -> float | None:
        """Compute USD cost for an LLMCall. Returns None when pricing is unknown."""
        # Prefer the resolved model name; fall back to the client-requested one.
        price = self.price_for(getattr(call, "model", None)) or self.price_for(
            getattr(call, "request_model", None)
        )
        if price is None:
            return None
        prompt = int(getattr(call, "prompt_tokens", 0) or 0)
        completion = int(getattr(call, "completion_tokens", 0) or 0)
        cached = int(getattr(call, "cached_input_tokens", 0) or 0)
        # Cached input tokens are priced separately at the cached rate when known;
        # otherwise they're already counted in prompt tokens at the regular rate.
        fresh_prompt = prompt
        if price.cached_input_per_token_usd is not None and cached:
            fresh_prompt = max(0, prompt - cached)
            cost_cached = cached * price.cached_input_per_token_usd
        else:
            cost_cached = 0.0
        cost = (
            fresh_prompt * price.input_per_token_usd
            + completion * price.output_per_token_usd
            + cost_cached
        )
        # Micro-dollar resolution keeps the display stable.
        return round(cost, 6)
_DATE_SUFFIX_RE = re.compile(r"-\d{8}$")
_PROVIDER_PREFIXES = ("anthropic", "openai", "google", "meta", "cohere", "mistral")
_ANTHROPIC_FAMILIES = {"opus", "sonnet", "haiku"}
def _model_key_candidates(model_name: str) -> list[str]:
"""Lookup-side variants of a Plano-emitted model name.
Plano resolves names like ``claude-haiku-4-5-20251001``; the catalog stores
them as ``anthropic-claude-haiku-4.5``. We strip the date suffix and the
``provider/`` prefix here; the catalog itself registers the dash/dot and
family-order aliases at parse time (see :func:`_expand_aliases`).
"""
base = model_name.strip()
out = [base]
if "/" in base:
out.append(base.split("/", 1)[1])
for k in list(out):
stripped = _DATE_SUFFIX_RE.sub("", k)
if stripped != k:
out.append(stripped)
out.extend([v.lower() for v in list(out)])
seen: set[str] = set()
uniq = []
for key in out:
if key not in seen:
seen.add(key)
uniq.append(key)
return uniq
def _expand_aliases(model_id: str) -> set[str]:
"""Catalog-side variants of a DO model id.
DO publishes Anthropic models under ids like ``anthropic-claude-opus-4.7``
or ``anthropic-claude-4.6-sonnet`` while Plano emits ``claude-opus-4-7`` /
``claude-sonnet-4-6``. Generate a set covering provider-prefix stripping,
dashdot in version segments, and familyversion word order so a single
catalog entry matches every name shape we'll see at lookup.
"""
aliases: set[str] = set()
def add(name: str) -> None:
if not name:
return
aliases.add(name)
aliases.add(name.lower())
add(model_id)
base = model_id
head, _, rest = base.partition("-")
if head.lower() in _PROVIDER_PREFIXES and rest:
add(rest)
base = rest
for key in list(aliases):
if "." in key:
add(key.replace(".", "-"))
parts = base.split("-")
if len(parts) >= 3 and parts[0].lower() == "claude":
rest_parts = parts[1:]
for i, p in enumerate(rest_parts):
if p.lower() in _ANTHROPIC_FAMILIES:
others = rest_parts[:i] + rest_parts[i + 1 :]
if not others:
break
family_last = "claude-" + "-".join(others) + "-" + p
family_first = "claude-" + p + "-" + "-".join(others)
add(family_last)
add(family_first)
add(family_last.replace(".", "-"))
add(family_first.replace(".", "-"))
break
return aliases
def _parse_do_pricing(data: Any) -> dict[str, ModelPrice]:
    """Parse DO catalog response into a ModelPrice map keyed by model id.

    DO's shape (as of 2026-04)::

        {
          "data": [
            {"model_id": "openai-gpt-5.4",
             "pricing": {"input_price_per_million": 5.0,
                         "output_price_per_million": 15.0}},
            ...
          ]
        }

    Older/alternate shapes are also accepted (flat top-level fields, or the
    ``id``/``model``/``name`` key).
    """
    prices: dict[str, ModelPrice] = {}
    items = _coerce_items(data)
    for item in items:
        # Accept several id spellings across catalog versions.
        model_id = (
            item.get("model_id")
            or item.get("id")
            or item.get("model")
            or item.get("name")
        )
        if not model_id:
            continue
        # DO nests rates under `pricing`; try that first, then fall back to
        # top-level fields for alternate response shapes.
        sources = [item]
        if isinstance(item.get("pricing"), dict):
            sources.insert(0, item["pricing"])
        input_rate = _extract_rate_from_sources(
            sources,
            ["input_per_token", "input_token_price", "price_input"],
            ["input_price_per_million", "input_per_million", "input_per_mtok"],
        )
        output_rate = _extract_rate_from_sources(
            sources,
            ["output_per_token", "output_token_price", "price_output"],
            ["output_price_per_million", "output_per_million", "output_per_mtok"],
        )
        cached_rate = _extract_rate_from_sources(
            sources,
            [
                "cached_input_per_token",
                "cached_input_token_price",
                "prompt_cache_read_per_token",
            ],
            [
                "cached_input_price_per_million",
                "cached_input_per_million",
                "cached_input_per_mtok",
            ],
        )
        # Both directions are required to price a call; skip partial rows.
        if input_rate is None or output_rate is None:
            continue
        # Treat 0-rate entries as "unknown" so cost falls back to `—` rather
        # than showing a misleading $0.0000. DO's catalog sometimes omits
        # rates for promo/open-weight models.
        if input_rate == 0 and output_rate == 0:
            continue
        price = ModelPrice(
            input_per_token_usd=input_rate,
            output_per_token_usd=output_rate,
            cached_input_per_token_usd=cached_rate,
        )
        # setdefault: when aliases collide across catalog rows, the first
        # registered entry wins.
        for alias in _expand_aliases(str(model_id)):
            prices.setdefault(alias, price)
    return prices
def _coerce_items(data: Any) -> list[dict]:
if isinstance(data, list):
return [x for x in data if isinstance(x, dict)]
if isinstance(data, dict):
for key in ("data", "models", "pricing", "items"):
val = data.get(key)
if isinstance(val, list):
return [x for x in val if isinstance(x, dict)]
return []
def _extract_rate_from_sources(
sources: list[dict],
per_token_keys: list[str],
per_million_keys: list[str],
) -> float | None:
"""Return a per-token rate in USD, or None if unknown.
Some DO catalog responses put per-token values under a field whose name
says ``_per_million`` (e.g. ``input_price_per_million: 5E-8`` that's
$5e-8 per token, not per million). Heuristic: values < 1 are already
per-token (real per-million rates are ~0.1 to ~100); values >= 1 are
treated as per-million and divided by 1,000,000.
"""
for src in sources:
for key in per_token_keys:
if key in src and src[key] is not None:
try:
return float(src[key])
except (TypeError, ValueError):
continue
for key in per_million_keys:
if key in src and src[key] is not None:
try:
v = float(src[key])
except (TypeError, ValueError):
continue
if v >= 1:
return v / 1_000_000
return v
return None

634
cli/planoai/obs/render.py Normal file
View file

@ -0,0 +1,634 @@
"""Rich TUI renderer for the observability console."""
from __future__ import annotations
from collections import Counter
from dataclasses import dataclass
from datetime import datetime
from http import HTTPStatus
from rich.align import Align
from rich.box import SIMPLE, SIMPLE_HEAVY
from rich.console import Group
from rich.panel import Panel
from rich.table import Table
from rich.text import Text
MAX_WIDTH = 160
from planoai.obs.collector import LLMCall
@dataclass
class AggregateStats:
    """Headline totals and latency percentiles over the visible window of calls."""

    count: int  # total calls in the window
    total_cost_usd: float  # sum of known per-call costs (unknown costs count as 0)
    total_input_tokens: int
    total_output_tokens: int
    distinct_sessions: int  # unique non-empty session ids
    current_session: str | None  # session id of the most recent call, if any
    p50_latency_ms: float | None = None
    p95_latency_ms: float | None = None
    p99_latency_ms: float | None = None
    p50_ttft_ms: float | None = None
    p95_ttft_ms: float | None = None
    p99_ttft_ms: float | None = None
    error_count: int = 0  # errors_4xx + errors_5xx
    errors_4xx: int = 0
    errors_5xx: int = 0
    has_cost: bool = False  # True when at least one call had a computed cost
@dataclass
class ModelRollup:
    """Per-model totals for the models table."""

    model: str
    requests: int
    input_tokens: int
    output_tokens: int
    cache_write: int  # summed cache-creation tokens
    cache_read: int  # summed cached-input tokens
    cost_usd: float
    has_cost: bool = False  # True when at least one call for this model was priced
    avg_tokens_per_sec: float | None = None  # mean of per-call generation rates
def _percentile(values: list[float], pct: float) -> float | None:
if not values:
return None
s = sorted(values)
k = max(0, min(len(s) - 1, int(round((pct / 100.0) * (len(s) - 1)))))
return s[k]
def aggregates(calls: list[LLMCall]) -> AggregateStats:
    """Fold the visible calls into the headline stats for the summary panel."""
    spend = sum((c.cost_usd or 0.0) for c in calls)
    tokens_in = sum(int(c.prompt_tokens or 0) for c in calls)
    tokens_out = sum(int(c.completion_tokens or 0) for c in calls)
    session_ids = {c.session_id for c in calls if c.session_id}
    # The most recent call that carried a session id wins.
    latest_session = next(
        (c.session_id for c in reversed(calls) if c.session_id is not None), None
    )
    latencies = [c.duration_ms for c in calls if c.duration_ms is not None]
    first_token_times = [c.ttft_ms for c in calls if c.ttft_ms is not None]
    codes = [c.status_code for c in calls if c.status_code is not None]
    client_errors = sum(1 for code in codes if 400 <= code < 500)
    server_errors = sum(1 for code in codes if code >= 500)
    return AggregateStats(
        count=len(calls),
        total_cost_usd=spend,
        total_input_tokens=tokens_in,
        total_output_tokens=tokens_out,
        distinct_sessions=len(session_ids),
        current_session=latest_session,
        p50_latency_ms=_percentile(latencies, 50),
        p95_latency_ms=_percentile(latencies, 95),
        p99_latency_ms=_percentile(latencies, 99),
        p50_ttft_ms=_percentile(first_token_times, 50),
        p95_ttft_ms=_percentile(first_token_times, 95),
        p99_ttft_ms=_percentile(first_token_times, 99),
        error_count=client_errors + server_errors,
        errors_4xx=client_errors,
        errors_5xx=server_errors,
        has_cost=any(c.cost_usd is not None for c in calls),
    )
def model_rollups(calls: list[LLMCall]) -> list[ModelRollup]:
    """Aggregate calls per model name, sorted by cost (then request count) desc."""
    # Mutable accumulator per model; values are mixed int/float/bool, hence the
    # explicit int()/float()/bool() casts on every read-modify-write below.
    buckets: dict[str, dict[str, float | int | bool]] = {}
    tps_samples: dict[str, list[float]] = {}
    for c in calls:
        key = c.model
        b = buckets.setdefault(
            key,
            {
                "requests": 0,
                "input": 0,
                "output": 0,
                "cache_write": 0,
                "cache_read": 0,
                "cost": 0.0,
                "has_cost": False,
            },
        )
        b["requests"] = int(b["requests"]) + 1
        b["input"] = int(b["input"]) + int(c.prompt_tokens or 0)
        b["output"] = int(b["output"]) + int(c.completion_tokens or 0)
        b["cache_write"] = int(b["cache_write"]) + int(c.cache_creation_tokens or 0)
        b["cache_read"] = int(b["cache_read"]) + int(c.cached_input_tokens or 0)
        b["cost"] = float(b["cost"]) + (c.cost_usd or 0.0)
        if c.cost_usd is not None:
            b["has_cost"] = True
        # Throughput is averaged only over calls that produced a rate.
        tps = c.tokens_per_sec
        if tps is not None:
            tps_samples.setdefault(key, []).append(tps)
    rollups: list[ModelRollup] = []
    for model, b in buckets.items():
        samples = tps_samples.get(model)
        avg_tps = (sum(samples) / len(samples)) if samples else None
        rollups.append(
            ModelRollup(
                model=model,
                requests=int(b["requests"]),
                input_tokens=int(b["input"]),
                output_tokens=int(b["output"]),
                cache_write=int(b["cache_write"]),
                cache_read=int(b["cache_read"]),
                cost_usd=float(b["cost"]),
                has_cost=bool(b["has_cost"]),
                avg_tokens_per_sec=avg_tps,
            )
        )
    # Highest spend first; request count breaks ties.
    rollups.sort(key=lambda r: (r.cost_usd, r.requests), reverse=True)
    return rollups
@dataclass
class RouteHit:
    """One row of the per-route table."""

    route: str
    hits: int
    pct: float  # share of all route-tagged calls, 0-100
    p95_latency_ms: float | None
    error_count: int  # calls with status >= 400
def route_hits(calls: list[LLMCall]) -> list[RouteHit]:
    """Per-route request counts with hit share, p95 latency, and error totals.

    Calls without a route name are ignored; rows come back most-hit first.
    """
    hit_counts: Counter[str] = Counter()
    latencies: dict[str, list[float]] = {}
    errors: dict[str, int] = {}
    for call in calls:
        route = call.route_name
        if not route:
            continue
        hit_counts[route] += 1
        if call.duration_ms is not None:
            latencies.setdefault(route, []).append(call.duration_ms)
        if call.status_code is not None and call.status_code >= 400:
            errors[route] = errors.get(route, 0) + 1
    grand_total = sum(hit_counts.values())
    if grand_total == 0:
        return []
    rows: list[RouteHit] = []
    for route, hits in hit_counts.most_common():
        rows.append(
            RouteHit(
                route=route,
                hits=hits,
                pct=(hits / grand_total) * 100.0,
                p95_latency_ms=_percentile(latencies.get(route, []), 95),
                error_count=errors.get(route, 0),
            )
        )
    return rows
def _fmt_cost(v: float | None, *, zero: str = "") -> str:
if v is None:
return ""
if v == 0:
return zero
if abs(v) < 0.0001:
return f"${v:.8f}".rstrip("0").rstrip(".")
if abs(v) < 0.01:
return f"${v:.6f}".rstrip("0").rstrip(".")
if abs(v) < 1:
return f"${v:.4f}"
return f"${v:,.2f}"
def _fmt_ms(v: float | None) -> str:
if v is None:
return ""
if v >= 1000:
return f"{v / 1000:.1f}s"
return f"{v:.0f}ms"
def _fmt_int(v: int | None) -> str:
if v is None or v == 0:
return ""
return f"{v:,}"
def _fmt_tokens(v: int | None) -> str:
if v is None:
return ""
return f"{v:,}"
def _fmt_tps(v: float | None) -> str:
if v is None or v <= 0:
return ""
if v >= 100:
return f"{v:.0f}/s"
return f"{v:.1f}/s"
def _latency_style(v: float | None) -> str:
if v is None:
return "dim"
if v < 500:
return "green"
if v < 2000:
return "yellow"
return "red"
def _ttft_style(v: float | None) -> str:
if v is None:
return "dim"
if v < 300:
return "green"
if v < 1000:
return "yellow"
return "red"
def _truncate_model(name: str, limit: int = 32) -> str:
if len(name) <= limit:
return name
return name[: limit - 1] + ""
def _status_text(code: int | None) -> Text:
    """Colored status cell for the calls table.

    Missing codes render as a dim blank cell, 2xx as a green "ok" dot, and
    everything else as the numeric code in an escalating warning color.
    NOTE(review): the blank for None may have been a "—" lost to encoding —
    confirm against the original styling.
    """
    if code is None:
        return Text("", style="dim")
    if 200 <= code < 300:
        return Text("● ok", style="green")
    if 300 <= code < 400:
        band_style = "yellow"
    elif 400 <= code < 500:
        band_style = "yellow bold"
    else:
        # Covers 5xx and any out-of-band codes (e.g. < 200).
        band_style = "red bold"
    return Text(f"{code}", style=band_style)
def _summary_panel(last: LLMCall | None, stats: AggregateStats) -> Panel:
    """Build the header panel: latest-call snapshot (left), lifetime totals (right)."""
    # Content-sized columns with a fixed gutter keep the two blocks close
    # together instead of stretching across the full terminal on wide screens.
    grid = Table.grid(padding=(0, 4))
    grid.add_column(no_wrap=True)
    grid.add_column(no_wrap=True)
    # Left: latest request snapshot.
    left = Table.grid(padding=(0, 1))
    left.add_column(style="dim", no_wrap=True)
    left.add_column(no_wrap=True)
    if last is None:
        left.add_row("latest", Text("waiting for spans…", style="dim italic"))
    else:
        model_text = Text(_truncate_model(last.model, 48), style="bold cyan")
        if last.is_streaming:
            model_text.append(" ⟳ stream", style="dim")
        left.add_row("model", model_text)
        # Show the requested name only when it differs from the served model
        # (i.e. routing rewrote an alias to a concrete model).
        if last.request_model and last.request_model != last.model:
            left.add_row(
                "requested", Text(_truncate_model(last.request_model, 48), style="cyan")
            )
        if last.route_name:
            left.add_row("route", Text(last.route_name, style="yellow"))
        left.add_row("status", _status_text(last.status_code))
        tokens = Text()
        tokens.append(_fmt_tokens(last.prompt_tokens))
        tokens.append(" in", style="dim")
        tokens.append(" · ", style="dim")
        tokens.append(_fmt_tokens(last.completion_tokens), style="green")
        tokens.append(" out", style="dim")
        # Cached prompt tokens appended only when present, to keep the row short.
        if last.cached_input_tokens:
            tokens.append(" · ", style="dim")
            tokens.append(_fmt_tokens(last.cached_input_tokens), style="yellow")
            tokens.append(" cached", style="dim")
        left.add_row("tokens", tokens)
        timing = Text()
        timing.append("TTFT ", style="dim")
        timing.append(_fmt_ms(last.ttft_ms), style=_ttft_style(last.ttft_ms))
        timing.append(" · ", style="dim")
        timing.append("lat ", style="dim")
        timing.append(_fmt_ms(last.duration_ms), style=_latency_style(last.duration_ms))
        tps = last.tokens_per_sec
        if tps:
            timing.append(" · ", style="dim")
            timing.append(_fmt_tps(tps), style="green")
        left.add_row("timing", timing)
        left.add_row("cost", Text(_fmt_cost(last.cost_usd), style="green bold"))
    # Right: lifetime totals.
    right = Table.grid(padding=(0, 1))
    right.add_column(style="dim", no_wrap=True)
    right.add_column(no_wrap=True)
    right.add_row(
        "requests",
        Text(f"{stats.count:,}", style="bold"),
    )
    if stats.error_count:
        err_text = Text()
        err_text.append(f"{stats.error_count:,}", style="red bold")
        # Break the total down by class (4xx vs 5xx) when either is non-zero.
        parts: list[str] = []
        if stats.errors_4xx:
            parts.append(f"{stats.errors_4xx} 4xx")
        if stats.errors_5xx:
            parts.append(f"{stats.errors_5xx} 5xx")
        if parts:
            err_text.append(f" ({' · '.join(parts)})", style="dim")
        right.add_row("errors", err_text)
    # Blank (not $0) when no pricing data matched any model.
    cost_str = _fmt_cost(stats.total_cost_usd) if stats.has_cost else ""
    right.add_row("total cost", Text(cost_str, style="green bold"))
    tokens_total = Text()
    tokens_total.append(_fmt_tokens(stats.total_input_tokens))
    tokens_total.append(" in", style="dim")
    tokens_total.append(" · ", style="dim")
    tokens_total.append(_fmt_tokens(stats.total_output_tokens), style="green")
    tokens_total.append(" out", style="dim")
    right.add_row("tokens", tokens_total)
    lat_text = Text()
    lat_text.append("p50 ", style="dim")
    lat_text.append(
        _fmt_ms(stats.p50_latency_ms), style=_latency_style(stats.p50_latency_ms)
    )
    lat_text.append(" · ", style="dim")
    lat_text.append("p95 ", style="dim")
    lat_text.append(
        _fmt_ms(stats.p95_latency_ms), style=_latency_style(stats.p95_latency_ms)
    )
    lat_text.append(" · ", style="dim")
    lat_text.append("p99 ", style="dim")
    lat_text.append(
        _fmt_ms(stats.p99_latency_ms), style=_latency_style(stats.p99_latency_ms)
    )
    right.add_row("latency", lat_text)
    ttft_text = Text()
    ttft_text.append("p50 ", style="dim")
    ttft_text.append(_fmt_ms(stats.p50_ttft_ms), style=_ttft_style(stats.p50_ttft_ms))
    ttft_text.append(" · ", style="dim")
    ttft_text.append("p95 ", style="dim")
    ttft_text.append(_fmt_ms(stats.p95_ttft_ms), style=_ttft_style(stats.p95_ttft_ms))
    ttft_text.append(" · ", style="dim")
    ttft_text.append("p99 ", style="dim")
    ttft_text.append(_fmt_ms(stats.p99_ttft_ms), style=_ttft_style(stats.p99_ttft_ms))
    right.add_row("TTFT", ttft_text)
    sess = Text()
    sess.append(f"{stats.distinct_sessions}")
    if stats.current_session:
        sess.append(" · current ", style="dim")
        sess.append(stats.current_session, style="magenta")
    right.add_row("sessions", sess)
    grid.add_row(left, right)
    return Panel(
        grid,
        title="[bold]live LLM traffic[/]",
        border_style="cyan",
        box=SIMPLE_HEAVY,
        padding=(0, 1),
    )
def _model_rollup_table(rollups: list[ModelRollup]) -> Table:
    """Per-model aggregate table: requests, token splits, cache use, throughput, cost."""
    table = Table(
        title="by model",
        title_justify="left",
        title_style="bold dim",
        caption="cost via DigitalOcean Gradient catalog",
        caption_justify="left",
        caption_style="dim italic",
        box=SIMPLE,
        header_style="bold",
        pad_edge=False,
        padding=(0, 1),
    )
    table.add_column("model", style="cyan", no_wrap=True)
    table.add_column("req", justify="right")
    table.add_column("input", justify="right")
    table.add_column("output", justify="right", style="green")
    table.add_column("cache wr", justify="right", style="yellow")
    table.add_column("cache rd", justify="right", style="yellow")
    table.add_column("tok/s", justify="right")
    table.add_column("cost", justify="right", style="green")
    if not rollups:
        # Placeholder row so the table still renders before any traffic arrives.
        table.add_row(
            Text("no requests yet", style="dim italic"),
            *([""] * 7),
        )
        return table
    for r in rollups:
        # Blank cost cell when no pricing matched this model.
        cost_cell = _fmt_cost(r.cost_usd) if r.has_cost else ""
        table.add_row(
            _truncate_model(r.model),
            f"{r.requests:,}",
            _fmt_tokens(r.input_tokens),
            _fmt_tokens(r.output_tokens),
            _fmt_int(r.cache_write),
            _fmt_int(r.cache_read),
            _fmt_tps(r.avg_tokens_per_sec),
            cost_cell,
        )
    return table
def _route_hit_table(hits: list[RouteHit]) -> Table:
    """Compact per-route table: hit count, traffic share, p95 latency, errors."""
    table = Table(
        title="route share",
        title_justify="left",
        title_style="bold dim",
        box=SIMPLE,
        header_style="bold",
        pad_edge=False,
        padding=(0, 1),
    )
    for heading, kwargs in (
        ("route", {"style": "cyan"}),
        ("hits", {"justify": "right"}),
        ("%", {"justify": "right"}),
        ("p95", {"justify": "right"}),
        ("err", {"justify": "right"}),
    ):
        table.add_column(heading, **kwargs)
    for hit in hits:
        if hit.error_count:
            err_cell = Text(f"{hit.error_count:,}", style="red bold")
        else:
            err_cell = ""
        p95_cell = Text(
            _fmt_ms(hit.p95_latency_ms), style=_latency_style(hit.p95_latency_ms)
        )
        table.add_row(
            hit.route,
            f"{hit.hits:,}",
            f"{hit.pct:5.1f}%",
            p95_cell,
            err_cell,
        )
    return table
def _recent_table(calls: list[LLMCall], limit: int = 15) -> Table:
    """Table of the newest *limit* calls, most recent first.

    Route, cache, and reasoning columns appear only when at least one
    buffered call carries that data, keeping the default layout narrow.
    """
    show_route = any(c.route_name for c in calls)
    show_cache = any((c.cached_input_tokens or 0) > 0 for c in calls)
    show_rsn = any((c.reasoning_tokens or 0) > 0 for c in calls)
    # Caption explains only the columns that are actually shown.
    caption_parts = ["in·new = fresh prompt tokens"]
    if show_cache:
        caption_parts.append("in·cache = cached read")
    if show_rsn:
        caption_parts.append("rsn = reasoning")
    caption_parts.append("lat = total latency")
    table = Table(
        title=f"recent · last {min(limit, len(calls)) if calls else 0}",
        title_justify="left",
        title_style="bold dim",
        caption=" · ".join(caption_parts),
        caption_justify="left",
        caption_style="dim italic",
        box=SIMPLE,
        header_style="bold",
        pad_edge=False,
        padding=(0, 1),
    )
    table.add_column("time", no_wrap=True)
    table.add_column("model", style="cyan", no_wrap=True)
    if show_route:
        table.add_column("route", style="yellow", no_wrap=True)
    table.add_column("in·new", justify="right")
    if show_cache:
        table.add_column("in·cache", justify="right", style="yellow")
    table.add_column("out", justify="right", style="green")
    if show_rsn:
        table.add_column("rsn", justify="right")
    table.add_column("tok/s", justify="right")
    table.add_column("TTFT", justify="right")
    table.add_column("lat", justify="right")
    table.add_column("cost", justify="right", style="green")
    table.add_column("status")
    if not calls:
        # Placeholder row spanning whatever columns ended up enabled.
        cols = len(table.columns)
        table.add_row(
            Text("waiting for spans…", style="dim italic"),
            *([""] * (cols - 1)),
        )
        return table
    recent = list(reversed(calls))[:limit]
    for idx, c in enumerate(recent):
        # Highlight the newest row so the eye lands on fresh traffic.
        is_newest = idx == 0
        time_style = "bold white" if is_newest else None
        model_style = "bold cyan" if is_newest else "cyan"
        # Row is built incrementally because the optional columns shift positions.
        row: list[object] = [
            (
                Text(c.timestamp.strftime("%H:%M:%S"), style=time_style)
                if time_style
                else c.timestamp.strftime("%H:%M:%S")
            ),
            Text(_truncate_model(c.model), style=model_style),
        ]
        if show_route:
            row.append(c.route_name or "")
        row.append(_fmt_tokens(c.prompt_tokens))
        if show_cache:
            row.append(_fmt_int(c.cached_input_tokens))
        row.append(_fmt_tokens(c.completion_tokens))
        if show_rsn:
            row.append(_fmt_int(c.reasoning_tokens))
        row.extend(
            [
                _fmt_tps(c.tokens_per_sec),
                Text(_fmt_ms(c.ttft_ms), style=_ttft_style(c.ttft_ms)),
                Text(_fmt_ms(c.duration_ms), style=_latency_style(c.duration_ms)),
                _fmt_cost(c.cost_usd),
                _status_text(c.status_code),
            ]
        )
        table.add_row(*row)
    return table
def _last_error(calls: list[LLMCall]) -> LLMCall | None:
for c in reversed(calls):
if c.status_code is not None and c.status_code >= 400:
return c
return None
def _http_reason(code: int) -> str:
try:
return HTTPStatus(code).phrase
except ValueError:
return ""
def _fmt_ago(ts: datetime) -> str:
# `ts` is produced in collector.py via datetime.now(tz=...), but fall back
# gracefully if a naive timestamp ever sneaks in.
now = datetime.now(tz=ts.tzinfo) if ts.tzinfo else datetime.now()
delta = (now - ts).total_seconds()
if delta < 0:
delta = 0
if delta < 60:
return f"{int(delta)}s ago"
if delta < 3600:
return f"{int(delta // 60)}m ago"
return f"{int(delta // 3600)}h ago"
def _error_banner(call: LLMCall) -> Panel:
    """One-line banner summarizing the most recent failed call.

    Border color follows severity: red for 5xx, yellow for everything else
    (including a missing status code, which is treated as 0).
    """
    code = call.status_code or 0
    border = "red" if code >= 500 else "yellow"
    header = Text()
    header.append(f"{code}", style=f"{border} bold")
    reason = _http_reason(code)
    if reason:
        header.append(f" {reason}", style=border)
    header.append(" · ", style="dim")
    header.append(_truncate_model(call.model, 48), style="cyan")
    if call.route_name:
        header.append(" · ", style="dim")
        header.append(call.route_name, style="yellow")
    header.append(" · ", style="dim")
    header.append(_fmt_ago(call.timestamp), style="dim")
    # Request id aids correlation with upstream provider logs.
    if call.request_id:
        header.append(" · req ", style="dim")
        header.append(call.request_id, style="magenta")
    return Panel(
        header,
        title="[bold]last error[/]",
        title_align="left",
        border_style=border,
        box=SIMPLE,
        padding=(0, 1),
    )
def _footer(stats: AggregateStats) -> Text:
    """One-line footer: key hints plus the current listener/span state."""
    text = Text()
    text.append("Ctrl-C ", style="bold")
    text.append("exit", style="dim")
    text.append(" · OTLP :4317", style="dim")
    text.append(" · pricing: DigitalOcean ", style="dim")
    if stats.count == 0:
        # Nothing received yet — point the user at the tracing config knob.
        text.append("waiting for spans", style="yellow")
        text.append(
            " — set tracing.opentracing_grpc_endpoint=localhost:4317", style="dim"
        )
    else:
        text.append(f"receiving · {stats.count:,} call(s) buffered", style="green")
    return text
def render(calls: list[LLMCall]) -> Align:
    """Compose one full TUI frame from the buffered calls.

    Layout top-to-bottom: summary panel, optional error banner, model/route
    tables (side by side when routes exist), recent-calls table, footer.
    """
    last = calls[-1] if calls else None
    stats = aggregates(calls)
    rollups = model_rollups(calls)
    hits = route_hits(calls)
    parts: list[object] = [_summary_panel(last, stats)]
    err = _last_error(calls)
    if err is not None:
        parts.append(_error_banner(err))
    if hits:
        # Routes present: show the model and route tables side by side.
        split = Table.grid(padding=(0, 2))
        split.add_column(no_wrap=False)
        split.add_column(no_wrap=False)
        split.add_row(_model_rollup_table(rollups), _route_hit_table(hits))
        parts.append(split)
    else:
        parts.append(_model_rollup_table(rollups))
    parts.append(_recent_table(calls))
    parts.append(_footer(stats))
    # Cap overall width so wide terminals don't stretch the layout into a
    # mostly-whitespace gap between columns.
    return Align.left(Group(*parts), width=MAX_WIDTH)

99
cli/planoai/obs_cmd.py Normal file
View file

@ -0,0 +1,99 @@
"""`planoai obs` — live observability TUI."""
from __future__ import annotations
import time
import rich_click as click
from rich.console import Console
from rich.live import Live
from planoai.consts import PLANO_COLOR
from planoai.obs.collector import (
DEFAULT_CAPACITY,
DEFAULT_GRPC_PORT,
LLMCallStore,
ObsCollector,
)
from planoai.obs.pricing import PricingCatalog
from planoai.obs.render import render
@click.command(name="obs", help="Live observability console for Plano LLM traffic.")
@click.option(
    "--port",
    type=int,
    default=DEFAULT_GRPC_PORT,
    show_default=True,
    help="OTLP/gRPC port to listen on. Must match the brightstaff tracing endpoint.",
)
@click.option(
    "--host",
    type=str,
    default="0.0.0.0",
    show_default=True,
    help="Host to bind the OTLP listener.",
)
@click.option(
    "--capacity",
    type=int,
    default=DEFAULT_CAPACITY,
    show_default=True,
    help="Max LLM calls kept in memory; older calls evicted FIFO.",
)
@click.option(
    "--refresh-ms",
    type=int,
    default=500,
    show_default=True,
    help="TUI refresh interval.",
)
def obs(port: int, host: str, capacity: int, refresh_ms: int) -> None:
    """Run the live observability TUI.

    Loads the DigitalOcean pricing catalog (best-effort), starts an
    in-process OTLP/gRPC collector, then refreshes a Rich ``Live``
    dashboard until Ctrl-C.
    """
    console = Console()
    console.print(
        f"[bold {PLANO_COLOR}]planoai obs[/] — loading DO pricing catalog...",
        end="",
    )
    pricing = PricingCatalog.fetch()
    if len(pricing):
        sample = ", ".join(pricing.sample_models(3))
        console.print(
            f" [green]{len(pricing)} models loaded[/] [dim]({sample}, ...)[/]"
        )
    else:
        # Pricing is optional: the dashboard still runs, just without costs.
        console.print(
            " [yellow]no pricing loaded[/] — "
            "[dim]cost column will be blank (DO catalog unreachable)[/]"
        )
    store = LLMCallStore(capacity=capacity)
    collector = ObsCollector(store=store, pricing=pricing, host=host, port=port)
    try:
        collector.start()
    except OSError as exc:
        # Typically "address already in use" on the OTLP port.
        console.print(f"[red]{exc}[/]")
        raise SystemExit(1)
    console.print(
        f"Listening for OTLP spans on [bold]{host}:{port}[/]. "
        "Ensure plano config has [cyan]tracing.opentracing_grpc_endpoint: http://localhost:4317[/] "
        "and [cyan]tracing.random_sampling: 100[/] (or run [bold]planoai up[/] "
        "with no config — it wires this automatically)."
    )
    console.print("Press [bold]Ctrl-C[/] to exit.\n")
    # Clamp to 50ms so a tiny/zero --refresh-ms can't spin the CPU.
    refresh = max(0.05, refresh_ms / 1000.0)
    try:
        with Live(
            render(store.snapshot()),
            console=console,
            refresh_per_second=1.0 / refresh,
            screen=False,
        ) as live:
            while True:
                time.sleep(refresh)
                live.update(render(store.snapshot()))
    except KeyboardInterrupt:
        console.print("\n[dim]obs stopped[/]")
    finally:
        # Always tear down the gRPC listener, even on unexpected exceptions.
        collector.stop()

View file

@ -61,7 +61,7 @@ def configure_rich_click(plano_color: str) -> None:
},
{
"name": "Observability",
"commands": ["trace"],
"commands": ["trace", "obs"],
},
{
"name": "Utilities",

View file

@ -91,7 +91,12 @@ def convert_legacy_listeners(
"type": "model",
"port": 12000,
"address": "0.0.0.0",
"timeout": "30s",
# LLM streaming responses routinely exceed 30s (extended thinking,
# long tool reasoning, large completions). Match the 300s ceiling
# used by the direct upstream-provider routes so Envoy doesn't
# abort streams with UT mid-response. Users can override via their
# plano_config.yaml `listeners.timeout` field.
"timeout": "300s",
"model_providers": model_providers or [],
}
@ -100,7 +105,7 @@ def convert_legacy_listeners(
"type": "prompt",
"port": 10000,
"address": "0.0.0.0",
"timeout": "30s",
"timeout": "300s",
}
# Handle None case

View file

@ -1,6 +1,6 @@
[project]
name = "planoai"
version = "0.4.19"
version = "0.4.20"
description = "Python-based CLI tool to manage Plano."
authors = [{name = "Katanemo Labs, Inc."}]
readme = "README.md"

86
cli/test/test_defaults.py Normal file
View file

@ -0,0 +1,86 @@
from pathlib import Path
import jsonschema
import yaml
from planoai.defaults import (
PROVIDER_DEFAULTS,
detect_providers,
synthesize_default_config,
)
_SCHEMA_PATH = Path(__file__).parents[2] / "config" / "plano_config_schema.yaml"
def _schema() -> dict:
return yaml.safe_load(_SCHEMA_PATH.read_text())
def test_zero_env_vars_produces_pure_passthrough():
cfg = synthesize_default_config(env={})
assert cfg["version"] == "v0.4.0"
assert cfg["listeners"][0]["port"] == 12000
for provider in cfg["model_providers"]:
assert provider.get("passthrough_auth") is True
assert "access_key" not in provider
# No provider should be marked default in pure pass-through mode.
assert provider.get("default") is not True
# All known providers should be listed.
names = {p["name"] for p in cfg["model_providers"]}
assert "digitalocean" in names
assert "openai" in names
assert "anthropic" in names
def test_env_keys_promote_providers_to_env_keyed():
cfg = synthesize_default_config(
env={"OPENAI_API_KEY": "sk-1", "DO_API_KEY": "do-1"}
)
by_name = {p["name"]: p for p in cfg["model_providers"]}
assert by_name["openai"].get("access_key") == "$OPENAI_API_KEY"
assert by_name["openai"].get("passthrough_auth") is None
assert by_name["digitalocean"].get("access_key") == "$DO_API_KEY"
# Unset env keys remain pass-through.
assert by_name["anthropic"].get("passthrough_auth") is True
def test_no_default_is_synthesized():
# Bare model names resolve via brightstaff's wildcard expansion registering
# bare keys, so the synthesizer intentionally never sets `default: true`.
cfg = synthesize_default_config(
env={"OPENAI_API_KEY": "sk-1", "ANTHROPIC_API_KEY": "a-1"}
)
assert not any(p.get("default") is True for p in cfg["model_providers"])
def test_listener_port_is_configurable():
cfg = synthesize_default_config(env={}, listener_port=11000)
assert cfg["listeners"][0]["port"] == 11000
def test_detection_summary_strings():
det = detect_providers(env={"OPENAI_API_KEY": "sk", "DO_API_KEY": "d"})
summary = det.summary
assert "env-keyed" in summary and "openai" in summary and "digitalocean" in summary
assert "pass-through" in summary
def test_tracing_block_points_at_local_console():
cfg = synthesize_default_config(env={})
tracing = cfg["tracing"]
assert tracing["opentracing_grpc_endpoint"] == "http://localhost:4317"
# random_sampling is a percentage in the plano config — 100 = every span.
assert tracing["random_sampling"] == 100
def test_synthesized_config_validates_against_schema():
cfg = synthesize_default_config(env={"OPENAI_API_KEY": "sk"})
jsonschema.validate(cfg, _schema())
def test_provider_defaults_digitalocean_is_configured():
by_name = {p.name: p for p in PROVIDER_DEFAULTS}
assert "digitalocean" in by_name
assert by_name["digitalocean"].env_var == "DO_API_KEY"
assert by_name["digitalocean"].base_url == "https://inference.do-ai.run/v1"
assert by_name["digitalocean"].model_pattern == "digitalocean/*"

View file

@ -0,0 +1,145 @@
import time
from datetime import datetime, timezone
from types import SimpleNamespace
from unittest.mock import MagicMock
import pytest
from planoai.obs.collector import LLMCall, LLMCallStore, span_to_llm_call
def _mk_attr(key: str, value):
v = MagicMock()
if isinstance(value, bool):
v.WhichOneof.return_value = "bool_value"
v.bool_value = value
elif isinstance(value, int):
v.WhichOneof.return_value = "int_value"
v.int_value = value
elif isinstance(value, float):
v.WhichOneof.return_value = "double_value"
v.double_value = value
else:
v.WhichOneof.return_value = "string_value"
v.string_value = str(value)
kv = MagicMock()
kv.key = key
kv.value = v
return kv
def _mk_span(
    attrs: dict, start_ns: int | None = None, span_id_hex: str = "ab"
) -> MagicMock:
    """Mock an OTLP span carrying the given attribute map and start time."""
    span = MagicMock()
    span.attributes = [_mk_attr(name, val) for name, val in attrs.items()]
    # Fall back to "now" (in nanoseconds) when no explicit start is given.
    span.start_time_unix_nano = start_ns or int(time.time() * 1_000_000_000)
    span.span_id.hex.return_value = span_id_hex
    return span
def test_span_without_llm_model_is_ignored():
span = _mk_span({"http.method": "POST"})
assert span_to_llm_call(span, "plano(llm)") is None
def test_span_with_full_llm_attrs_produces_call():
span = _mk_span(
{
"llm.model": "openai-gpt-5.4",
"model.requested": "router:software-engineering",
"plano.session_id": "sess-abc",
"plano.route.name": "software-engineering",
"llm.is_streaming": False,
"llm.duration_ms": 1234,
"llm.time_to_first_token": 210,
"llm.usage.prompt_tokens": 100,
"llm.usage.completion_tokens": 50,
"llm.usage.total_tokens": 150,
"llm.usage.cached_input_tokens": 30,
"llm.usage.cache_creation_tokens": 5,
"llm.usage.reasoning_tokens": 200,
"http.status_code": 200,
"request_id": "req-42",
}
)
call = span_to_llm_call(span, "plano(llm)")
assert call is not None
assert call.request_id == "req-42"
assert call.model == "openai-gpt-5.4"
assert call.request_model == "router:software-engineering"
assert call.session_id == "sess-abc"
assert call.route_name == "software-engineering"
assert call.is_streaming is False
assert call.duration_ms == 1234.0
assert call.ttft_ms == 210.0
assert call.prompt_tokens == 100
assert call.completion_tokens == 50
assert call.total_tokens == 150
assert call.cached_input_tokens == 30
assert call.cache_creation_tokens == 5
assert call.reasoning_tokens == 200
assert call.status_code == 200
def test_pricing_lookup_attaches_cost():
class StubPricing:
def cost_for_call(self, call):
# Simple: 2 * prompt + 3 * completion, in cents
return 0.02 * (call.prompt_tokens or 0) + 0.03 * (
call.completion_tokens or 0
)
span = _mk_span(
{
"llm.model": "do/openai-gpt-5.4",
"llm.usage.prompt_tokens": 10,
"llm.usage.completion_tokens": 2,
}
)
call = span_to_llm_call(span, "plano(llm)", pricing=StubPricing())
assert call is not None
assert call.cost_usd == pytest.approx(0.26)
def test_tpt_and_tokens_per_sec_derived():
call = LLMCall(
request_id="x",
timestamp=datetime.now(tz=timezone.utc),
model="m",
duration_ms=1000,
ttft_ms=200,
completion_tokens=80,
)
# (1000 - 200) / 80 = 10ms per token => 100 tokens/sec
assert call.tpt_ms == 10.0
assert call.tokens_per_sec == 100.0
def test_tpt_returns_none_when_no_completion_tokens():
call = LLMCall(
request_id="x",
timestamp=datetime.now(tz=timezone.utc),
model="m",
duration_ms=1000,
ttft_ms=200,
completion_tokens=0,
)
assert call.tpt_ms is None
assert call.tokens_per_sec is None
def test_store_evicts_fifo_at_capacity():
store = LLMCallStore(capacity=3)
now = datetime.now(tz=timezone.utc)
for i in range(5):
store.add(
LLMCall(
request_id=f"r{i}",
timestamp=now,
model="m",
)
)
snap = store.snapshot()
assert len(snap) == 3
assert [c.request_id for c in snap] == ["r2", "r3", "r4"]

View file

@ -0,0 +1,146 @@
from datetime import datetime, timezone
from planoai.obs.collector import LLMCall
from planoai.obs.pricing import ModelPrice, PricingCatalog
def _call(model: str, prompt: int, completion: int, cached: int = 0) -> LLMCall:
    """Minimal LLMCall fixture carrying just the token fields pricing reads."""
    fields = {
        "request_id": "r",
        "timestamp": datetime.now(tz=timezone.utc),
        "model": model,
        "prompt_tokens": prompt,
        "completion_tokens": completion,
        "cached_input_tokens": cached,
    }
    return LLMCall(**fields)
def test_lookup_matches_bare_and_prefixed():
prices = {
"openai-gpt-5.4": ModelPrice(
input_per_token_usd=0.000001, output_per_token_usd=0.000002
)
}
catalog = PricingCatalog(prices)
assert catalog.price_for("openai-gpt-5.4") is not None
# do/openai-gpt-5.4 should resolve after stripping the provider prefix.
assert catalog.price_for("do/openai-gpt-5.4") is not None
assert catalog.price_for("unknown-model") is None
def test_cost_computation_without_cache():
prices = {
"m": ModelPrice(input_per_token_usd=0.000001, output_per_token_usd=0.000002)
}
cost = PricingCatalog(prices).cost_for_call(_call("m", 1000, 500))
assert cost == 0.002 # 1000 * 1e-6 + 500 * 2e-6
def test_cost_computation_with_cached_discount():
prices = {
"m": ModelPrice(
input_per_token_usd=0.000001,
output_per_token_usd=0.000002,
cached_input_per_token_usd=0.0000001,
)
}
# 800 fresh @ 1e-6 = 8e-4; 200 cached @ 1e-7 = 2e-5; 500 out @ 2e-6 = 1e-3
cost = PricingCatalog(prices).cost_for_call(_call("m", 1000, 500, cached=200))
assert cost == round(0.0008 + 0.00002 + 0.001, 6)
def test_empty_catalog_returns_none():
assert PricingCatalog().cost_for_call(_call("m", 100, 50)) is None
def test_parse_do_catalog_treats_small_values_as_per_token():
"""DO's real catalog uses per-token values under the `_per_million` key
(e.g. 5E-8 for GPT-oss-20b). We treat values < 1 as already per-token."""
from planoai.obs.pricing import _parse_do_pricing
sample = {
"data": [
{
"model_id": "openai-gpt-oss-20b",
"pricing": {
"input_price_per_million": 5e-8,
"output_price_per_million": 4.5e-7,
},
},
{
"model_id": "openai-gpt-oss-120b",
"pricing": {
"input_price_per_million": 1e-7,
"output_price_per_million": 7e-7,
},
},
]
}
prices = _parse_do_pricing(sample)
# Values < 1 are assumed to already be per-token — no extra division.
assert prices["openai-gpt-oss-20b"].input_per_token_usd == 5e-8
assert prices["openai-gpt-oss-20b"].output_per_token_usd == 4.5e-7
assert prices["openai-gpt-oss-120b"].input_per_token_usd == 1e-7
def test_anthropic_aliases_match_plano_emitted_names():
"""DO publishes 'anthropic-claude-opus-4.7' and 'anthropic-claude-haiku-4.5';
Plano emits 'claude-opus-4-7' and 'claude-haiku-4-5-20251001'. Aliases
registered at parse time should bridge the gap."""
from planoai.obs.pricing import _parse_do_pricing
sample = {
"data": [
{
"model_id": "anthropic-claude-opus-4.7",
"pricing": {
"input_price_per_million": 15.0,
"output_price_per_million": 75.0,
},
},
{
"model_id": "anthropic-claude-haiku-4.5",
"pricing": {
"input_price_per_million": 1.0,
"output_price_per_million": 5.0,
},
},
{
"model_id": "anthropic-claude-4.6-sonnet",
"pricing": {
"input_price_per_million": 3.0,
"output_price_per_million": 15.0,
},
},
]
}
catalog = PricingCatalog(_parse_do_pricing(sample))
# Family-last shapes Plano emits.
assert catalog.price_for("claude-opus-4-7") is not None
assert catalog.price_for("claude-haiku-4-5") is not None
# Date-suffixed name (Anthropic API style).
assert catalog.price_for("claude-haiku-4-5-20251001") is not None
# Word-order swap: DO has 'claude-4.6-sonnet', Plano emits 'claude-sonnet-4-6'.
assert catalog.price_for("claude-sonnet-4-6") is not None
# Original DO ids still resolve.
assert catalog.price_for("anthropic-claude-opus-4.7") is not None
def test_parse_do_catalog_divides_large_values_as_per_million():
"""A provider that genuinely reports $5-per-million in that field gets divided."""
from planoai.obs.pricing import _parse_do_pricing
sample = {
"data": [
{
"model_id": "mystery-model",
"pricing": {
"input_price_per_million": 5.0, # > 1 → treated as per-million
"output_price_per_million": 15.0,
},
},
]
}
prices = _parse_do_pricing(sample)
assert prices["mystery-model"].input_per_token_usd == 5.0 / 1_000_000
assert prices["mystery-model"].output_per_token_usd == 15.0 / 1_000_000

106
cli/test/test_obs_render.py Normal file
View file

@ -0,0 +1,106 @@
from datetime import datetime, timedelta, timezone
from planoai.obs.collector import LLMCall
from planoai.obs.render import aggregates, model_rollups, route_hits
def _call(
    model: str,
    ts: datetime,
    prompt=0,
    completion=0,
    cost=None,
    route=None,
    session=None,
    cache_read=0,
    cache_write=0,
):
    """LLMCall fixture for render-layer tests; unspecified fields stay default."""
    fields = {
        "request_id": "r",
        "timestamp": ts,
        "model": model,
        "prompt_tokens": prompt,
        "completion_tokens": completion,
        "cached_input_tokens": cache_read,
        "cache_creation_tokens": cache_write,
        "cost_usd": cost,
        "route_name": route,
        "session_id": session,
    }
    return LLMCall(**fields)
def test_aggregates_sum_and_session_counts():
now = datetime.now(tz=timezone.utc).astimezone()
calls = [
_call(
"m1",
now - timedelta(seconds=50),
prompt=10,
completion=5,
cost=0.001,
session="s1",
),
_call(
"m2",
now - timedelta(seconds=40),
prompt=20,
completion=10,
cost=0.002,
session="s1",
),
_call(
"m1",
now - timedelta(seconds=30),
prompt=30,
completion=15,
cost=0.003,
session="s2",
),
]
stats = aggregates(calls)
assert stats.count == 3
assert stats.total_cost_usd == 0.006
assert stats.total_input_tokens == 60
assert stats.total_output_tokens == 30
assert stats.distinct_sessions == 2
assert stats.current_session == "s2"
def test_rollups_split_by_model_and_cache():
now = datetime.now(tz=timezone.utc).astimezone()
calls = [
_call(
"m1", now, prompt=10, completion=5, cost=0.001, cache_write=3, cache_read=7
),
_call("m1", now, prompt=20, completion=10, cost=0.002, cache_read=1),
_call("m2", now, prompt=30, completion=15, cost=0.004),
]
rollups = model_rollups(calls)
by_model = {r.model: r for r in rollups}
assert by_model["m1"].requests == 2
assert by_model["m1"].input_tokens == 30
assert by_model["m1"].cache_write == 3
assert by_model["m1"].cache_read == 8
assert by_model["m2"].input_tokens == 30
def test_route_hits_only_for_routed_calls():
now = datetime.now(tz=timezone.utc).astimezone()
calls = [
_call("m", now, route="code"),
_call("m", now, route="code"),
_call("m", now, route="summarization"),
_call("m", now), # no route
]
hits = route_hits(calls)
# Only calls with route names are counted.
assert sum(h.hits for h in hits) == 3
hits_by_name = {h.route: h for h in hits}
assert hits_by_name["code"].hits == 2
assert hits_by_name["summarization"].hits == 1
def test_route_hits_empty_when_no_routes():
now = datetime.now(tz=timezone.utc).astimezone()
calls = [_call("m", now), _call("m", now)]
assert route_hits(calls) == []

2
cli/uv.lock generated
View file

@ -337,7 +337,7 @@ wheels = [
[[package]]
name = "planoai"
version = "0.4.18"
version = "0.4.20"
source = { editable = "." }
dependencies = [
{ name = "click" },

View file

@ -901,6 +901,33 @@ static_resources:
validation_context:
trusted_ca:
filename: {{ upstream_tls_ca_path | default('/etc/ssl/certs/ca-certificates.crt') }}
- name: digitalocean
connect_timeout: {{ upstream_connect_timeout | default('5s') }}
type: LOGICAL_DNS
dns_lookup_family: V4_ONLY
lb_policy: ROUND_ROBIN
load_assignment:
cluster_name: digitalocean
endpoints:
- lb_endpoints:
- endpoint:
address:
socket_address:
address: inference.do-ai.run
port_value: 443
hostname: "inference.do-ai.run"
transport_socket:
name: envoy.transport_sockets.tls
typed_config:
"@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext
sni: inference.do-ai.run
common_tls_context:
tls_params:
tls_minimum_protocol_version: TLSv1_2
tls_maximum_protocol_version: TLSv1_3
validation_context:
trusted_ca:
filename: {{ upstream_tls_ca_path | default('/etc/ssl/certs/ca-certificates.crt') }}
- name: xiaomi
connect_timeout: {{ upstream_connect_timeout | default('5s') }}
type: LOGICAL_DNS

View file

@ -0,0 +1,541 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "RED, LLM upstream, routing service, and process metrics for brightstaff. Pair with Envoy admin metrics from cluster=bright_staff.",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 100,
"panels": [],
"title": "HTTP RED",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisLabel": "req/s",
"drawStyle": "line",
"fillOpacity": 10,
"lineWidth": 1,
"showPoints": "never"
},
"unit": "reqps"
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
"id": 1,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "sum by (handler) (rate(brightstaff_http_requests_total[1m]))",
"legendFormat": "{{handler}}",
"refId": "A"
}
],
"title": "Rate — brightstaff RPS by handler",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"description": "5xx fraction over 5m. Page-worthy when sustained above ~1%.",
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.01 },
{ "color": "red", "value": 0.05 }
]
},
"unit": "percentunit"
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
"id": 2,
"options": {
"colorMode": "background",
"graphMode": "area",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "sum(rate(brightstaff_http_requests_total{status_class=\"5xx\"}[5m])) / clamp_min(sum(rate(brightstaff_http_requests_total[5m])), 1)",
"legendFormat": "5xx rate",
"refId": "A"
}
],
"title": "Errors — brightstaff 5xx rate",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"description": "p50/p95/p99 by handler, computed from histogram buckets over 5m.",
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 5, "lineWidth": 1, "showPoints": "never" },
"unit": "s"
}
},
"gridPos": { "h": 9, "w": 24, "x": 0, "y": 9 },
"id": 3,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "histogram_quantile(0.50, sum by (le, handler) (rate(brightstaff_http_request_duration_seconds_bucket[5m])))",
"legendFormat": "p50 {{handler}}",
"refId": "A"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "histogram_quantile(0.95, sum by (le, handler) (rate(brightstaff_http_request_duration_seconds_bucket[5m])))",
"legendFormat": "p95 {{handler}}",
"refId": "B"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "histogram_quantile(0.99, sum by (le, handler) (rate(brightstaff_http_request_duration_seconds_bucket[5m])))",
"legendFormat": "p99 {{handler}}",
"refId": "C"
}
],
"title": "Duration — p50 / p95 / p99 by handler",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"description": "In-flight requests by handler. Climbs before latency does when brightstaff is saturated.",
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 1, "showPoints": "never" },
"unit": "short"
}
},
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 18 },
"id": 4,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "sum by (handler) (brightstaff_http_in_flight_requests)",
"legendFormat": "{{handler}}",
"refId": "A"
}
],
"title": "In-flight requests by handler",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 },
"id": 200,
"panels": [],
"title": "LLM upstream",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 5, "lineWidth": 1, "showPoints": "never" },
"unit": "s"
}
},
"gridPos": { "h": 9, "w": 12, "x": 0, "y": 27 },
"id": 5,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "histogram_quantile(0.95, sum by (le, provider, model) (rate(brightstaff_llm_upstream_duration_seconds_bucket[5m])))",
"legendFormat": "p95 {{provider}}/{{model}}",
"refId": "A"
}
],
"title": "LLM upstream p95 by provider/model",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"description": "All non-success error classes. timeout/connect = network, 5xx/429 = provider, parse = body shape mismatch, stream = mid-stream disconnect.",
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 30, "lineWidth": 1, "showPoints": "never", "stacking": { "mode": "normal" } },
"unit": "reqps"
}
},
"gridPos": { "h": 9, "w": 12, "x": 12, "y": 27 },
"id": 6,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "sum by (provider, error_class) (rate(brightstaff_llm_upstream_requests_total{error_class!=\"none\"}[5m]))",
"legendFormat": "{{provider}} / {{error_class}}",
"refId": "A"
}
],
"title": "LLM upstream errors by provider / class",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"description": "Streaming only. Empty if the route never streams.",
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 5, "lineWidth": 1, "showPoints": "never" },
"unit": "s"
}
},
"gridPos": { "h": 9, "w": 12, "x": 0, "y": 36 },
"id": 7,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "histogram_quantile(0.95, sum by (le, provider, model) (rate(brightstaff_llm_time_to_first_token_seconds_bucket[5m])))",
"legendFormat": "p95 {{provider}}/{{model}}",
"refId": "A"
}
],
"title": "Time-to-first-token p95 (streaming)",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"description": "Tokens/sec by provider/model/kind — proxy for cost. Stacked.",
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 30, "lineWidth": 1, "showPoints": "never", "stacking": { "mode": "normal" } },
"unit": "tokens/s"
}
},
"gridPos": { "h": 9, "w": 12, "x": 12, "y": 36 },
"id": 8,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "sum by (provider, model, kind) (rate(brightstaff_llm_tokens_total[5m]))",
"legendFormat": "{{provider}}/{{model}} {{kind}}",
"refId": "A"
}
],
"title": "Token throughput by provider / model / kind",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 },
"id": 300,
"panels": [],
"title": "Routing service",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"description": "Which models the orchestrator picked over the last 15 minutes.",
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"unit": "short"
}
},
"gridPos": { "h": 9, "w": 12, "x": 0, "y": 46 },
"id": 9,
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "sum by (selected_model) (increase(brightstaff_router_decisions_total[15m]))",
"legendFormat": "{{selected_model}}",
"refId": "A"
}
],
"title": "Model selection distribution (last 15m)",
"type": "bargauge"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"description": "Fraction of decisions that fell back (orchestrator returned `none` or errored). High = router can't classify intent or no candidates configured.",
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 1, "showPoints": "never" },
"unit": "percentunit"
}
},
"gridPos": { "h": 9, "w": 12, "x": 12, "y": 46 },
"id": 10,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "sum by (route) (rate(brightstaff_router_decisions_total{fallback=\"true\"}[5m])) / clamp_min(sum by (route) (rate(brightstaff_router_decisions_total[5m])), 1)",
"legendFormat": "{{route}}",
"refId": "A"
}
],
"title": "Fallback rate by route",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 5, "lineWidth": 1, "showPoints": "never" },
"unit": "s"
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 55 },
"id": 11,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "histogram_quantile(0.95, sum by (le, route) (rate(brightstaff_router_decision_duration_seconds_bucket[5m])))",
"legendFormat": "p95 {{route}}",
"refId": "A"
}
],
"title": "Router decision p95 latency",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"description": "Hit / (hit + miss). Low ratio = sessions aren't being reused or TTL too short.",
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "yellow", "value": 0.5 },
{ "color": "green", "value": 0.8 }
]
},
"unit": "percentunit",
"min": 0,
"max": 1
}
},
"gridPos": { "h": 8, "w": 6, "x": 12, "y": 55 },
"id": 12,
"options": {
"colorMode": "background",
"graphMode": "area",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "sum(rate(brightstaff_session_cache_events_total{outcome=\"hit\"}[5m])) / clamp_min(sum(rate(brightstaff_session_cache_events_total{outcome=~\"hit|miss\"}[5m])), 1)",
"legendFormat": "hit rate",
"refId": "A"
}
],
"title": "Session cache hit rate",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"description": "decision_served = a real model picked. no_candidates = sentinel `none` returned. policy_error = orchestrator failed.",
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 30, "lineWidth": 1, "showPoints": "never", "stacking": { "mode": "normal" } },
"unit": "reqps"
}
},
"gridPos": { "h": 8, "w": 6, "x": 18, "y": 55 },
"id": 13,
"options": {
"legend": { "displayMode": "list", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "sum by (outcome) (rate(brightstaff_routing_service_requests_total[5m]))",
"legendFormat": "{{outcome}}",
"refId": "A"
}
],
"title": "/routing/* outcomes",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 63 },
"id": 400,
"panels": [],
"title": "Process & Envoy link",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"description": "Compare to brightstaff RPS (panel 1) — sustained gap = network or Envoy queueing.",
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 1, "showPoints": "never" },
"unit": "reqps"
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 64 },
"id": 14,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "sum(rate(envoy_cluster_upstream_rq_total{envoy_cluster_name=\"bright_staff\"}[1m]))",
"legendFormat": "envoy → bright_staff",
"refId": "A"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "sum(rate(brightstaff_http_requests_total[1m]))",
"legendFormat": "brightstaff served",
"refId": "B"
}
],
"title": "Envoy → brightstaff link health",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 1, "showPoints": "never" }
},
"overrides": [
{
"matcher": { "id": "byName", "options": "RSS" },
"properties": [{ "id": "unit", "value": "bytes" }]
},
{
"matcher": { "id": "byName", "options": "CPU" },
"properties": [{ "id": "unit", "value": "percentunit" }]
}
]
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 64 },
"id": 15,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "process_resident_memory_bytes{job=\"brightstaff\"}",
"legendFormat": "RSS",
"refId": "A"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "rate(process_cpu_seconds_total{job=\"brightstaff\"}[1m])",
"legendFormat": "CPU",
"refId": "B"
}
],
"title": "Brightstaff process RSS / CPU",
"type": "timeseries"
}
],
"refresh": "30s",
"schemaVersion": 39,
"tags": ["plano", "brightstaff", "llm"],
"templating": {
"list": [
{
"name": "DS_PROMETHEUS",
"label": "Prometheus",
"type": "datasource",
"query": "prometheus",
"current": { "selected": false, "text": "Prometheus", "value": "DS_PROMETHEUS" },
"hide": 0,
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"includeAll": false,
"multi": false
}
]
},
"time": { "from": "now-1h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "Brightstaff (Plano dataplane)",
"uid": "brightstaff",
"version": 1,
"weekStart": ""
}

View file

@ -0,0 +1,43 @@
# One-command Prometheus + Grafana stack for observing a locally-running
# Plano (Envoy admin :9901 + brightstaff :9092 on the host).
#
# cd config/grafana
# docker compose up -d
# open http://localhost:3000 (admin / admin)
#
# Grafana is preloaded with:
# - Prometheus datasource (uid=DS_PROMETHEUS) → http://prometheus:9090
# - Brightstaff dashboard (auto-imported from brightstaff_dashboard.json)
#
# Prometheus scrapes the host's :9092 and :9901 via host.docker.internal.
# On Linux this works because of the `extra_hosts: host-gateway` mapping
# below. On Mac/Win it works natively.
services:
prometheus:
image: prom/prometheus:latest
container_name: plano-prometheus
ports:
- "9090:9090"
volumes:
- ./prometheus_scrape.yaml:/etc/prometheus/prometheus.yml:ro
extra_hosts:
- "host.docker.internal:host-gateway"
restart: unless-stopped
grafana:
image: grafana/grafana:latest
container_name: plano-grafana
ports:
- "3000:3000"
environment:
GF_SECURITY_ADMIN_USER: admin
GF_SECURITY_ADMIN_PASSWORD: admin
GF_AUTH_ANONYMOUS_ENABLED: "true"
GF_AUTH_ANONYMOUS_ORG_ROLE: Viewer
volumes:
- ./provisioning:/etc/grafana/provisioning:ro
- ./brightstaff_dashboard.json:/var/lib/grafana/dashboards/brightstaff_dashboard.json:ro
depends_on:
- prometheus
restart: unless-stopped

View file

@ -0,0 +1,44 @@
# Prometheus config that scrapes Plano (Envoy admin + brightstaff). This is
# a complete Prometheus config — mount it directly at
# /etc/prometheus/prometheus.yml. The included docker-compose.yaml does this
# for you.
#
# Targets:
# - envoy:9901 Envoy admin → envoy_cluster_*, envoy_http_*, envoy_server_*.
# - brightstaff:9092 Native dataplane → brightstaff_http_*, brightstaff_llm_*,
# brightstaff_router_*, process_*.
#
# Hostname `host.docker.internal` works on Docker Desktop (Mac/Win) and on
# Linux when the container is started with `--add-host=host.docker.internal:
# host-gateway` (the included compose does this). If Plano runs *inside*
# Docker on the same network as Prometheus, replace it with the container
# name (e.g. `plano:9092`).
#
# This file is unrelated to demos/llm_routing/model_routing_service/prometheus.yaml,
# which scrapes a fake metrics service to feed the routing engine.
global:
scrape_interval: 15s
scrape_timeout: 10s
evaluation_interval: 15s
scrape_configs:
- job_name: envoy
honor_timestamps: true
metrics_path: /stats
params:
format: ["prometheus"]
static_configs:
- targets:
- host.docker.internal:9901
labels:
service: plano
- job_name: brightstaff
honor_timestamps: true
metrics_path: /metrics
static_configs:
- targets:
- host.docker.internal:9092
labels:
service: plano

View file

@ -0,0 +1,15 @@
# Auto-load the brightstaff dashboard JSON on Grafana startup.
apiVersion: 1
providers:
- name: brightstaff
orgId: 1
folder: Plano
type: file
disableDeletion: false
updateIntervalSeconds: 30
allowUiUpdates: true
options:
path: /var/lib/grafana/dashboards
foldersFromFilesStructure: false

View file

@ -0,0 +1,14 @@
# Auto-provision the Prometheus datasource so the bundled dashboard wires up
# without any clicks. The `uid: DS_PROMETHEUS` matches the templated input in
# brightstaff_dashboard.json.
apiVersion: 1
datasources:
- name: Prometheus
uid: DS_PROMETHEUS
type: prometheus
access: proxy
url: http://prometheus:9090
isDefault: true
editable: true

View file

@ -192,6 +192,7 @@ properties:
- gemini
- vercel
- openrouter
- digitalocean
routing_preferences:
type: array
items:
@ -242,6 +243,7 @@ properties:
- gemini
- vercel
- openrouter
- digitalocean
routing_preferences:
type: array
items:
@ -280,6 +282,9 @@ properties:
type: boolean
use_agent_orchestrator:
type: boolean
disable_signals:
type: boolean
description: "Disable agentic signal analysis (frustration, repetition, escalation, etc.) on LLM responses to save CPU. Default false."
upstream_connect_timeout:
type: string
description: "Connect timeout for upstream provider clusters (e.g., '5s', '10s'). Default is '5s'."

372
crates/Cargo.lock generated
View file

@ -23,6 +23,18 @@ version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217"
[[package]]
name = "ahash"
version = "0.8.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
dependencies = [
"cfg-if",
"once_cell",
"version_check",
"zerocopy",
]
[[package]]
name = "aho-corasick"
version = "1.1.4"
@ -257,6 +269,24 @@ dependencies = [
"vsimd",
]
[[package]]
name = "bindgen"
version = "0.72.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895"
dependencies = [
"bitflags",
"cexpr",
"clang-sys",
"itertools 0.13.0",
"proc-macro2",
"quote",
"regex",
"rustc-hash 2.1.2",
"shlex",
"syn 2.0.117",
]
[[package]]
name = "bit-set"
version = "0.5.3"
@ -316,6 +346,9 @@ dependencies = [
"hyper 1.9.0",
"hyper-util",
"lru",
"metrics 0.23.1",
"metrics-exporter-prometheus",
"metrics-process",
"mockito",
"opentelemetry",
"opentelemetry-http",
@ -325,6 +358,7 @@ dependencies = [
"pretty_assertions",
"rand 0.9.4",
"redis",
"regex",
"reqwest",
"serde",
"serde_json",
@ -332,6 +366,8 @@ dependencies = [
"serde_yaml",
"strsim",
"thiserror 2.0.18",
"tikv-jemalloc-ctl",
"tikv-jemallocator",
"time",
"tokio",
"tokio-postgres",
@ -391,6 +427,15 @@ dependencies = [
"shlex",
]
[[package]]
name = "cexpr"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
dependencies = [
"nom",
]
[[package]]
name = "cfg-if"
version = "1.0.4"
@ -428,6 +473,17 @@ dependencies = [
"windows-link",
]
[[package]]
name = "clang-sys"
version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
dependencies = [
"glob",
"libc",
"libloading",
]
[[package]]
name = "cmov"
version = "0.5.3"
@ -574,6 +630,21 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
dependencies = [
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]]
name = "crypto-common"
version = "0.1.7"
@ -1070,6 +1141,12 @@ dependencies = [
"wasip3",
]
[[package]]
name = "glob"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
[[package]]
name = "governor"
version = "0.6.3"
@ -1128,7 +1205,7 @@ version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e91b62f79061a0bc2e046024cb7ba44b08419ed238ecbd9adbd787434b9e8c25"
dependencies = [
"ahash",
"ahash 0.3.8",
"autocfg",
]
@ -1138,6 +1215,15 @@ version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
[[package]]
name = "hashbrown"
version = "0.14.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
dependencies = [
"ahash 0.8.12",
]
[[package]]
name = "hashbrown"
version = "0.15.5"
@ -1189,6 +1275,12 @@ dependencies = [
"uuid",
]
[[package]]
name = "hermit-abi"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
[[package]]
name = "hex"
version = "0.4.3"
@ -1665,6 +1757,27 @@ version = "0.2.185"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52ff2c0fe9bc6cb6b14a0592c2ff4fa9ceb83eea9db979b0487cd054946a2b8f"
[[package]]
name = "libloading"
version = "0.8.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55"
dependencies = [
"cfg-if",
"windows-link",
]
[[package]]
name = "libproc"
version = "0.14.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a54ad7278b8bc5301d5ffd2a94251c004feb971feba96c971ea4063645990757"
dependencies = [
"bindgen",
"errno",
"libc",
]
[[package]]
name = "libredox"
version = "0.1.16"
@ -1745,6 +1858,12 @@ version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
[[package]]
name = "mach2"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dae608c151f68243f2b000364e1f7b186d9c29845f7d2d85bd31b9ad77ad552b"
[[package]]
name = "matchers"
version = "0.2.0"
@ -1782,6 +1901,77 @@ version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
[[package]]
name = "metrics"
version = "0.23.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3045b4193fbdc5b5681f32f11070da9be3609f189a79f3390706d42587f46bb5"
dependencies = [
"ahash 0.8.12",
"portable-atomic",
]
[[package]]
name = "metrics"
version = "0.24.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d5312e9ba3771cfa961b585728215e3d972c950a3eed9252aa093d6301277e8"
dependencies = [
"ahash 0.8.12",
"portable-atomic",
]
[[package]]
name = "metrics-exporter-prometheus"
version = "0.15.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4f0c8427b39666bf970460908b213ec09b3b350f20c0c2eabcbba51704a08e6"
dependencies = [
"base64 0.22.1",
"http-body-util",
"hyper 1.9.0",
"hyper-util",
"indexmap 2.14.0",
"ipnet",
"metrics 0.23.1",
"metrics-util",
"quanta",
"thiserror 1.0.69",
"tokio",
"tracing",
]
[[package]]
name = "metrics-process"
version = "2.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4268d87f64a752f5a651314fc683f04da10be65701ea3e721ba4d74f79163cac"
dependencies = [
"libc",
"libproc",
"mach2",
"metrics 0.24.3",
"once_cell",
"procfs",
"rlimit",
"windows",
]
[[package]]
name = "metrics-util"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4259040465c955f9f2f1a4a8a16dc46726169bca0f88e8fb2dbeced487c3e828"
dependencies = [
"crossbeam-epoch",
"crossbeam-utils",
"hashbrown 0.14.5",
"metrics 0.23.1",
"num_cpus",
"quanta",
"sketches-ddsketch",
]
[[package]]
name = "mime"
version = "0.3.17"
@ -1935,6 +2125,16 @@ dependencies = [
"autocfg",
]
[[package]]
name = "num_cpus"
version = "1.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b"
dependencies = [
"hermit-abi",
"libc",
]
[[package]]
name = "objc2-core-foundation"
version = "0.3.2"
@ -2125,6 +2325,12 @@ dependencies = [
"windows-link",
]
[[package]]
name = "paste"
version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
[[package]]
name = "percent-encoding"
version = "2.3.2"
@ -2278,6 +2484,27 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "procfs"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "25485360a54d6861439d60facef26de713b1e126bf015ec8f98239467a2b82f7"
dependencies = [
"bitflags",
"procfs-core",
"rustix",
]
[[package]]
name = "procfs-core"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6401bf7b6af22f78b563665d15a22e9aef27775b79b149a66ca022468a4e405"
dependencies = [
"bitflags",
"hex",
]
[[package]]
name = "prompt_gateway"
version = "0.1.0"
@ -2333,6 +2560,21 @@ dependencies = [
"log",
]
[[package]]
name = "quanta"
version = "0.12.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f3ab5a9d756f0d97bdc89019bd2e4ea098cf9cde50ee7564dde6b81ccc8f06c7"
dependencies = [
"crossbeam-utils",
"libc",
"once_cell",
"raw-cpuid",
"wasi 0.11.1+wasi-snapshot-preview1",
"web-sys",
"winapi",
]
[[package]]
name = "quinn"
version = "0.11.9"
@ -2485,6 +2727,15 @@ version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69"
[[package]]
name = "raw-cpuid"
version = "11.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186"
dependencies = [
"bitflags",
]
[[package]]
name = "redis"
version = "0.27.6"
@ -2646,6 +2897,15 @@ dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "rlimit"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f35ee2729c56bb610f6dba436bf78135f728b7373bdffae2ec815b2d3eb98cc3"
dependencies = [
"libc",
]
[[package]]
name = "rustc-hash"
version = "1.1.0"
@ -3098,6 +3358,12 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e"
[[package]]
name = "sketches-ddsketch"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85636c14b73d81f541e525f585c0a2109e6744e1565b5c1668e31c70c10ed65c"
[[package]]
name = "slab"
version = "0.4.12"
@ -3308,6 +3574,37 @@ dependencies = [
"rustc-hash 1.1.0",
]
[[package]]
name = "tikv-jemalloc-ctl"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "661f1f6a57b3a36dc9174a2c10f19513b4866816e13425d3e418b11cc37bc24c"
dependencies = [
"libc",
"paste",
"tikv-jemalloc-sys",
]
[[package]]
name = "tikv-jemalloc-sys"
version = "0.6.1+5.3.0-1-ge13ca993e8ccb9ba9847cc330696e02839f328f7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd8aa5b2ab86a2cefa406d889139c162cbb230092f7d1d7cbc1716405d852a3b"
dependencies = [
"cc",
"libc",
]
[[package]]
name = "tikv-jemallocator"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0359b4327f954e0567e69fb191cf1436617748813819c94b8cd4a431422d053a"
dependencies = [
"libc",
"tikv-jemalloc-sys",
]
[[package]]
name = "time"
version = "0.3.47"
@ -4003,6 +4300,49 @@ dependencies = [
"web-sys",
]
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows"
version = "0.62.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "527fadee13e0c05939a6a05d5bd6eec6cd2e3dbd648b9f8e447c6518133d8580"
dependencies = [
"windows-collections",
"windows-core",
"windows-future",
"windows-numerics",
]
[[package]]
name = "windows-collections"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "23b2d95af1a8a14a3c7367e1ed4fc9c20e0a26e79551b1454d72583c97cc6610"
dependencies = [
"windows-core",
]
[[package]]
name = "windows-core"
version = "0.62.2"
@ -4016,6 +4356,17 @@ dependencies = [
"windows-strings",
]
[[package]]
name = "windows-future"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e1d6f90251fe18a279739e78025bd6ddc52a7e22f921070ccdc67dde84c605cb"
dependencies = [
"windows-core",
"windows-link",
"windows-threading",
]
[[package]]
name = "windows-implement"
version = "0.60.2"
@ -4044,6 +4395,16 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
[[package]]
name = "windows-numerics"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e2e40844ac143cdb44aead537bbf727de9b044e107a0f1220392177d15b0f26"
dependencies = [
"windows-core",
"windows-link",
]
[[package]]
name = "windows-registry"
version = "0.6.1"
@ -4133,6 +4494,15 @@ dependencies = [
"windows_x86_64_msvc 0.53.1",
]
[[package]]
name = "windows-threading"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3949bd5b99cafdf1c7ca86b43ca564028dfe27d66958f2470940f73d86d75b37"
dependencies = [
"windows-link",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"

View file

@ -3,6 +3,18 @@ name = "brightstaff"
version = "0.1.0"
edition = "2021"
[features]
default = ["jemalloc"]
jemalloc = ["tikv-jemallocator", "tikv-jemalloc-ctl"]
[[bin]]
name = "brightstaff"
path = "src/main.rs"
[[bin]]
name = "signals_replay"
path = "src/bin/signals_replay.rs"
[dependencies]
async-openai = "0.30.1"
async-trait = "0.1"
@ -26,7 +38,11 @@ opentelemetry-stdout = "0.31"
opentelemetry_sdk = { version = "0.31", features = ["rt-tokio"] }
pretty_assertions = "1.4.1"
rand = "0.9.2"
regex = "1.10"
lru = "0.12"
metrics = "0.23"
metrics-exporter-prometheus = { version = "0.15", default-features = false, features = ["http-listener"] }
metrics-process = "2.1"
redis = { version = "0.27", features = ["tokio-comp"] }
reqwest = { version = "0.12.15", features = ["stream"] }
serde = { version = "1.0.219", features = ["derive"] }
@ -35,6 +51,8 @@ serde_with = "3.13.0"
strsim = "0.11"
serde_yaml = "0.9.34"
thiserror = "2.0.12"
tikv-jemallocator = { version = "0.6", optional = true }
tikv-jemalloc-ctl = { version = "0.6", features = ["stats"], optional = true }
tokio = { version = "1.44.2", features = ["full"] }
tokio-postgres = { version = "0.7", features = ["with-serde_json-1"] }
tokio-stream = "0.1"

View file

@ -24,4 +24,7 @@ pub struct AppState {
/// Shared HTTP client for upstream LLM requests (connection pooling / keep-alive).
pub http_client: reqwest::Client,
pub filter_pipeline: Arc<FilterPipeline>,
/// When false, agentic signal analysis is skipped on LLM responses to save CPU.
/// Controlled by `overrides.disable_signals` in plano config.
pub signals_enabled: bool,
}

View file

@ -0,0 +1,175 @@
//! `signals-replay` — batch driver for the `brightstaff` signal analyzer.
//!
//! Reads JSONL conversations from stdin (one per line) and emits matching
//! JSONL reports on stdout, one per input conversation, in the same order.
//!
//! Input shape (per line):
//! ```json
//! {"id": "convo-42", "messages": [{"from": "human", "value": "..."}, ...]}
//! ```
//!
//! Output shape (per line, success):
//! ```json
//! {"id": "convo-42", "report": { ...python-compatible SignalReport dict... }}
//! ```
//!
//! On per-line failure (parse / analyzer error), emits:
//! ```json
//! {"id": "convo-42", "error": "..."}
//! ```
//!
//! The output report dict is shaped to match the Python reference's
//! `SignalReport.to_dict()` byte-for-byte so the parity comparator can do a
//! direct structural diff.
use std::io::{self, BufRead, BufWriter, Write};
use serde::Deserialize;
use serde_json::{json, Map, Value};
use brightstaff::signals::{SignalAnalyzer, SignalGroup, SignalReport};
/// One stdin record: a conversation id plus its ShareGPT-style messages.
#[derive(Debug, Deserialize)]
struct InputLine {
    // Kept as a raw JSON value (not a String) so string and numeric ids
    // both round-trip verbatim into the output line.
    id: Value,
    messages: Vec<MessageRow>,
}
/// A single ShareGPT message; absent fields deserialize to empty strings.
#[derive(Debug, Deserialize)]
struct MessageRow {
    #[serde(default)]
    from: String,
    #[serde(default)]
    value: String,
}
/// Drive the analyzer over stdin JSONL, writing one output line per input
/// line so id ordering between input and output stays aligned.
fn main() {
    let analyzer = SignalAnalyzer::default();
    let stdin = io::stdin();
    let stdout = io::stdout();
    let mut writer = BufWriter::new(stdout.lock());

    for read in stdin.lock().lines() {
        let raw = match read {
            Ok(text) => text,
            Err(e) => {
                eprintln!("read error: {e}");
                std::process::exit(1);
            }
        };

        let record = raw.trim();
        if record.is_empty() {
            continue;
        }

        // process_line never panics on bad input — it returns an error
        // object — so every non-empty input line yields exactly one
        // output line.
        let rendered = process_line(&analyzer, record);
        if let Err(e) = writeln!(writer, "{rendered}") {
            eprintln!("write error: {e}");
            std::process::exit(1);
        }
        // No per-line flush: BufWriter batches, and the consumer reads the
        // whole stream after we exit.
    }

    let _ = writer.flush();
}
/// Analyze one JSONL conversation line and return the JSON object to emit.
///
/// Parse failures are reported in-band (`{"id": null, "error": ...}`) so the
/// caller can still emit one output line per input line.
fn process_line(analyzer: &SignalAnalyzer, line: &str) -> Value {
    let input: InputLine = match serde_json::from_str(line) {
        Ok(v) => v,
        Err(err) => {
            return json!({
                "id": Value::Null,
                "error": format!("input parse: {err}"),
            });
        }
    };

    // Borrowed view over the owned rows — the analyzer takes string slices.
    let mut view: Vec<brightstaff::signals::analyzer::ShareGptMessage<'_>> =
        Vec::with_capacity(input.messages.len());
    for row in &input.messages {
        view.push(brightstaff::signals::analyzer::ShareGptMessage {
            from: row.from.as_str(),
            value: row.value.as_str(),
        });
    }

    let report = analyzer.analyze_sharegpt(&view);
    json!({
        "id": input.id.clone(),
        "report": report_to_python_dict(&report),
    })
}
/// Convert a `SignalReport` into the Python reference's `to_dict()` shape.
///
/// Category keys are inserted in the same order as the Python source so the
/// parity comparator can diff outputs deterministically, even as strings.
fn report_to_python_dict(r: &SignalReport) -> Value {
    let mut interaction = Map::new();
    for (key, group) in [
        ("misalignment", &r.interaction.misalignment),
        ("stagnation", &r.interaction.stagnation),
        ("disengagement", &r.interaction.disengagement),
        ("satisfaction", &r.interaction.satisfaction),
    ] {
        interaction.insert(key.to_string(), signal_group_to_python(group));
    }

    let mut execution = Map::new();
    for (key, group) in [
        ("failure", &r.execution.failure),
        ("loops", &r.execution.loops),
    ] {
        execution.insert(key.to_string(), signal_group_to_python(group));
    }

    let mut environment = Map::new();
    environment.insert(
        "exhaustion".to_string(),
        signal_group_to_python(&r.environment.exhaustion),
    );

    json!({
        "interaction_signals": Value::Object(interaction),
        "execution_signals": Value::Object(execution),
        "environment_signals": Value::Object(environment),
        "overall_quality": r.overall_quality.as_str(),
        "summary": r.summary,
    })
}
/// Render one `SignalGroup` as the Python-compatible dict shape.
fn signal_group_to_python(g: &SignalGroup) -> Value {
    let mut signals = Vec::with_capacity(g.signals.len());
    for s in &g.signals {
        signals.push(json!({
            "signal_type": s.signal_type.as_str(),
            "message_index": s.message_index,
            "snippet": s.snippet,
            "confidence": s.confidence,
            "metadata": s.metadata,
        }));
    }
    json!({
        "category": g.category,
        "count": g.count,
        "severity": g.severity,
        "signals": signals,
    })
}

View file

@ -0,0 +1,53 @@
use bytes::Bytes;
use http_body_util::combinators::BoxBody;
use hyper::{Response, StatusCode};
use super::full;
/// JSON payload for the `/debug/memstats` endpoint.
#[derive(serde::Serialize)]
struct MemStats {
    // Bytes currently allocated by the application, per jemalloc's
    // `stats::allocated` reading (0 when unavailable).
    allocated_bytes: usize,
    // Bytes in physically resident pages, per jemalloc's `stats::resident`
    // reading (0 when unavailable).
    resident_bytes: usize,
    // Present only when stats could not be read or jemalloc is compiled out;
    // omitted from the serialized JSON on success.
    #[serde(skip_serializing_if = "Option::is_none")]
    error: Option<String>,
}
/// Serve jemalloc memory statistics as a JSON response.
/// When the jemalloc feature is disabled, the payload is a stub carrying an
/// explanatory `error` field instead.
pub async fn memstats() -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
    // Serializing this small plain struct cannot fail, so unwrap is safe.
    let body = serde_json::to_string(&get_jemalloc_stats()).unwrap();
    let response = Response::builder()
        .status(StatusCode::OK)
        .header("Content-Type", "application/json")
        .body(full(body))
        .unwrap();
    Ok(response)
}
#[cfg(feature = "jemalloc")]
fn get_jemalloc_stats() -> MemStats {
    use tikv_jemalloc_ctl::{epoch, stats};
    // jemalloc snapshots its stats per epoch; advance it first so the reads
    // below reflect current usage rather than a stale snapshot.
    if let Err(e) = epoch::advance() {
        return MemStats {
            allocated_bytes: 0,
            resident_bytes: 0,
            error: Some(format!("failed to advance jemalloc epoch: {e}")),
        };
    }
    MemStats {
        // Individual read failures degrade to 0 rather than failing the
        // whole payload.
        allocated_bytes: stats::allocated::read().unwrap_or(0),
        resident_bytes: stats::resident::read().unwrap_or(0),
        error: None,
    }
}
#[cfg(not(feature = "jemalloc"))]
fn get_jemalloc_stats() -> MemStats {
    // Allocator introspection is unavailable without jemalloc; report zeros
    // plus an error note so callers can distinguish the stub from real data.
    MemStats {
        allocated_bytes: 0,
        resident_bytes: 0,
        error: Some("jemalloc feature not enabled".to_string()),
    }
}

View file

@ -441,10 +441,8 @@ impl ArchFunctionHandler {
}
}
// Handle str/string conversions
"str" | "string" => {
if !value.is_string() {
return Ok(json!(value.to_string()));
}
"str" | "string" if !value.is_string() => {
return Ok(json!(value.to_string()));
}
_ => {}
}

View file

@ -24,16 +24,18 @@ use crate::app_state::AppState;
use crate::handlers::agents::pipeline::PipelineProcessor;
use crate::handlers::extract_request_id;
use crate::handlers::full;
use crate::metrics as bs_metrics;
use crate::state::response_state_processor::ResponsesStateProcessor;
use crate::state::{
extract_input_items, retrieve_and_combine_input, StateStorage, StateStorageError,
};
use crate::streaming::{
create_streaming_response, create_streaming_response_with_output_filter, truncate_message,
ObservableStreamProcessor, StreamProcessor,
LlmMetricsCtx, ObservableStreamProcessor, StreamProcessor,
};
use crate::tracing::{
collect_custom_trace_attributes, llm as tracing_llm, operation_component, set_service_name,
collect_custom_trace_attributes, llm as tracing_llm, operation_component,
plano as tracing_plano, set_service_name,
};
use model_selection::router_chat_get_upstream_model;
@ -102,15 +104,36 @@ async fn llm_chat_inner(
.and_then(|hdr| request_headers.get(hdr))
.and_then(|v| v.to_str().ok())
.map(|s| s.to_string());
let pinned_model: Option<String> = if let Some(ref sid) = session_id {
let cached_route = if let Some(ref sid) = session_id {
state
.orchestrator_service
.get_cached_route(sid, tenant_id.as_deref())
.await
.map(|c| c.model_name)
} else {
None
};
let (pinned_model, pinned_route_name): (Option<String>, Option<String>) = match cached_route {
Some(c) => (Some(c.model_name), c.route_name),
None => (None, None),
};
// Record session id on the LLM span for the observability console.
if let Some(ref sid) = session_id {
get_active_span(|span| {
span.set_attribute(opentelemetry::KeyValue::new(
tracing_plano::SESSION_ID,
sid.clone(),
));
});
}
if let Some(ref route_name) = pinned_route_name {
get_active_span(|span| {
span.set_attribute(opentelemetry::KeyValue::new(
tracing_plano::ROUTE_NAME,
route_name.clone(),
));
});
}
let full_qualified_llm_provider_url = format!("{}{}", state.llm_provider_url, request_path);
@ -120,6 +143,7 @@ async fn llm_chat_inner(
&request_path,
&state.model_aliases,
&state.llm_providers,
state.signals_enabled,
)
.await
{
@ -311,6 +335,18 @@ async fn llm_chat_inner(
alias_resolved_model.clone()
};
// Record route name on the LLM span (only when the orchestrator produced one).
if let Some(ref rn) = route_name {
if !rn.is_empty() && rn != "none" {
get_active_span(|span| {
span.set_attribute(opentelemetry::KeyValue::new(
tracing_plano::ROUTE_NAME,
rn.clone(),
));
});
}
}
if let Some(ref sid) = session_id {
state
.orchestrator_service
@ -373,6 +409,7 @@ async fn parse_and_validate_request(
request_path: &str,
model_aliases: &Option<HashMap<String, ModelAlias>>,
llm_providers: &Arc<RwLock<LlmProviders>>,
signals_enabled: bool,
) -> Result<PreparedRequest, Response<BoxBody<Bytes, hyper::Error>>> {
let raw_bytes = request
.collect()
@ -451,7 +488,11 @@ async fn parse_and_validate_request(
let user_message_preview = client_request
.get_recent_user_message()
.map(|msg| truncate_message(&msg, 50));
let messages_for_signals = Some(client_request.get_messages());
let messages_for_signals = if signals_enabled {
Some(client_request.get_messages())
} else {
None
};
// Set the upstream model name and strip routing metadata
client_request.set_model(model_name_only.clone());
@ -652,6 +693,13 @@ async fn send_upstream(
let request_start_time = std::time::Instant::now();
// Labels for LLM upstream metrics. We prefer `resolved_model` (post-routing)
// and derive the provider from its `provider/model` prefix. This matches the
// same model id the cost/latency router keys off.
let (metric_provider_raw, metric_model_raw) = bs_metrics::split_provider_model(resolved_model);
let metric_provider = metric_provider_raw.to_string();
let metric_model = metric_model_raw.to_string();
let llm_response = match http_client
.post(upstream_url)
.headers(request_headers.clone())
@ -661,6 +709,14 @@ async fn send_upstream(
{
Ok(res) => res,
Err(err) => {
let err_class = bs_metrics::llm_error_class_from_reqwest(&err);
bs_metrics::record_llm_upstream(
&metric_provider,
&metric_model,
0,
err_class,
request_start_time.elapsed(),
);
let err_msg = format!("Failed to send request: {}", err);
let mut internal_error = Response::new(full(err_msg));
*internal_error.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
@ -671,6 +727,36 @@ async fn send_upstream(
// Propagate upstream headers and status
let response_headers = llm_response.headers().clone();
let upstream_status = llm_response.status();
// Upstream routers (e.g. DigitalOcean Gradient) may return an
// `x-model-router-selected-route` header indicating which task-level
// route the request was classified into (e.g. "Code Generation"). Surface
// it as `plano.route.name` so the obs console's Route hit % panel can
// show the breakdown even when Plano's own orchestrator wasn't in the
// routing path. Any value from Plano's orchestrator already set earlier
// takes precedence — this only fires when the span doesn't already have
// a route name.
if let Some(upstream_route) = response_headers
.get("x-model-router-selected-route")
.and_then(|v| v.to_str().ok())
{
if !upstream_route.is_empty() {
get_active_span(|span| {
span.set_attribute(opentelemetry::KeyValue::new(
crate::tracing::plano::ROUTE_NAME,
upstream_route.to_string(),
));
});
}
}
// Record the upstream HTTP status on the span for the obs console.
get_active_span(|span| {
span.set_attribute(opentelemetry::KeyValue::new(
crate::tracing::http::STATUS_CODE,
upstream_status.as_u16() as i64,
));
});
let mut response = Response::builder().status(upstream_status);
if let Some(headers) = response.headers_mut() {
for (name, value) in response_headers.iter() {
@ -686,7 +772,12 @@ async fn send_upstream(
span_name,
request_start_time,
messages_for_signals,
);
)
.with_llm_metrics(LlmMetricsCtx {
provider: metric_provider.clone(),
model: metric_model.clone(),
upstream_status: upstream_status.as_u16(),
});
let output_filter_request_headers = if filter_pipeline.has_output_filters() {
Some(request_headers.clone())

View file

@ -5,10 +5,24 @@ use hyper::StatusCode;
use std::sync::Arc;
use tracing::{debug, info, warn};
use crate::metrics as bs_metrics;
use crate::metrics::labels as metric_labels;
use crate::router::orchestrator::OrchestratorService;
use crate::streaming::truncate_message;
use crate::tracing::routing;
/// Map a request path onto the fixed `route` label used by routing metrics.
/// Only three values exist so metric cardinality stays bounded.
fn route_label_for_path(request_path: &str) -> &'static str {
    match request_path {
        p if p.starts_with("/agents") => metric_labels::ROUTE_AGENT,
        p if p.starts_with("/routing") => metric_labels::ROUTE_ROUTING,
        _ => metric_labels::ROUTE_LLM,
    }
}
pub struct RoutingResult {
/// Primary model to use (first in the ranked list).
pub model_name: String,
@ -106,15 +120,23 @@ pub async fn router_chat_get_upstream_model(
)
.await;
let determination_ms = routing_start_time.elapsed().as_millis() as i64;
let determination_elapsed = routing_start_time.elapsed();
let determination_ms = determination_elapsed.as_millis() as i64;
let current_span = tracing::Span::current();
current_span.record(routing::ROUTE_DETERMINATION_MS, determination_ms);
let route_label = route_label_for_path(request_path);
match routing_result {
Ok(route) => match route {
Some((route_name, ranked_models)) => {
let model_name = ranked_models.first().cloned().unwrap_or_default();
current_span.record("route.selected_model", model_name.as_str());
bs_metrics::record_router_decision(
route_label,
&model_name,
false,
determination_elapsed,
);
Ok(RoutingResult {
model_name,
models: ranked_models,
@ -126,6 +148,12 @@ pub async fn router_chat_get_upstream_model(
// This signals to llm.rs to use the original validated request model
current_span.record("route.selected_model", "none");
info!("no route determined, using default model");
bs_metrics::record_router_decision(
route_label,
"none",
true,
determination_elapsed,
);
Ok(RoutingResult {
model_name: "none".to_string(),
@ -136,6 +164,7 @@ pub async fn router_chat_get_upstream_model(
},
Err(err) => {
current_span.record("route.selected_model", "unknown");
bs_metrics::record_router_decision(route_label, "unknown", true, determination_elapsed);
Err(RoutingError::internal_error(format!(
"Failed to determine route: {}",
err

View file

@ -1,4 +1,5 @@
pub mod agents;
pub mod debug;
pub mod function_calling;
pub mod llm;
pub mod models;

View file

@ -12,6 +12,8 @@ use tracing::{debug, info, info_span, warn, Instrument};
use super::extract_or_generate_traceparent;
use crate::handlers::llm::model_selection::router_chat_get_upstream_model;
use crate::metrics as bs_metrics;
use crate::metrics::labels as metric_labels;
use crate::router::orchestrator::OrchestratorService;
use crate::tracing::{collect_custom_trace_attributes, operation_component, set_service_name};
@ -230,6 +232,17 @@ async fn routing_decision_inner(
pinned: false,
};
// Distinguish "decision served" (a concrete model picked) from
// "no_candidates" (the sentinel "none" returned when nothing
// matched). The handler still responds 200 in both cases, so RED
// metrics alone can't tell them apart.
let outcome = if response.models.first().map(|m| m == "none").unwrap_or(true) {
metric_labels::ROUTING_SVC_NO_CANDIDATES
} else {
metric_labels::ROUTING_SVC_DECISION_SERVED
};
bs_metrics::record_routing_service_outcome(outcome);
info!(
primary_model = %response.models.first().map(|s| s.as_str()).unwrap_or("none"),
total_models = response.models.len(),
@ -249,6 +262,7 @@ async fn routing_decision_inner(
.unwrap())
}
Err(err) => {
bs_metrics::record_routing_service_outcome(metric_labels::ROUTING_SVC_POLICY_ERROR);
warn!(error = %err.message, "routing decision failed");
Ok(BrightStaffError::InternalServerError(err.message).into_response())
}

View file

@ -1,5 +1,6 @@
pub mod app_state;
pub mod handlers;
pub mod metrics;
pub mod router;
pub mod session_cache;
pub mod signals;

View file

@ -1,10 +1,17 @@
#[cfg(feature = "jemalloc")]
#[global_allocator]
static ALLOC: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
use brightstaff::app_state::AppState;
use brightstaff::handlers::agents::orchestrator::agent_chat;
use brightstaff::handlers::debug;
use brightstaff::handlers::empty;
use brightstaff::handlers::function_calling::function_calling_chat_handler;
use brightstaff::handlers::llm::llm_chat;
use brightstaff::handlers::models::list_models;
use brightstaff::handlers::routing_service::routing_decision;
use brightstaff::metrics as bs_metrics;
use brightstaff::metrics::labels as metric_labels;
use brightstaff::router::model_metrics::ModelMetricsService;
use brightstaff::router::orchestrator::OrchestratorService;
use brightstaff::session_cache::init_session_cache;
@ -326,6 +333,8 @@ async fn init_app_state(
.as_ref()
.and_then(|tracing| tracing.span_attributes.clone());
let signals_enabled = !overrides.disable_signals.unwrap_or(false);
Ok(AppState {
orchestrator_service,
model_aliases: config.model_aliases.clone(),
@ -337,6 +346,7 @@ async fn init_app_state(
span_attributes,
http_client: reqwest::Client::new(),
filter_pipeline,
signals_enabled,
})
}
@ -384,10 +394,79 @@ async fn init_state_storage(
// Request routing
// ---------------------------------------------------------------------------
/// Normalized method label — limited set so we never emit a free-form string.
/// Anything outside the common verbs collapses to "OTHER" to keep metric
/// label cardinality bounded.
fn method_label(method: &Method) -> &'static str {
    match *method {
        Method::GET => "GET",
        Method::POST => "POST",
        Method::PUT => "PUT",
        Method::DELETE => "DELETE",
        Method::PATCH => "PATCH",
        Method::HEAD => "HEAD",
        Method::OPTIONS => "OPTIONS",
        _ => "OTHER",
    }
}
/// Compute the fixed `handler` metric label from the request's path+method.
/// Returning `None` for fall-through means `route()` will hand the request to
/// the catch-all 404 branch.
///
/// NOTE(review): this mirrors the match arms in `dispatch()` — keep the two
/// in sync when adding endpoints, or metrics will report `not_found` for a
/// handler that actually served the request.
fn handler_label_for(method: &Method, path: &str) -> &'static str {
    // `/agents/...` and `/routing/...` prefixes are checked first; the
    // suffix must still be one of the known chat-style endpoints.
    if let Some(stripped) = path.strip_prefix("/agents") {
        if matches!(
            stripped,
            CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH
        ) {
            return metric_labels::HANDLER_AGENT_CHAT;
        }
    }
    if let Some(stripped) = path.strip_prefix("/routing") {
        if matches!(
            stripped,
            CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH
        ) {
            return metric_labels::HANDLER_ROUTING_DECISION;
        }
    }
    match (method, path) {
        (&Method::POST, CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH) => {
            metric_labels::HANDLER_LLM_CHAT
        }
        (&Method::POST, "/function_calling") => metric_labels::HANDLER_FUNCTION_CALLING,
        (&Method::GET, "/v1/models" | "/agents/v1/models") => metric_labels::HANDLER_LIST_MODELS,
        (&Method::OPTIONS, "/v1/models" | "/agents/v1/models") => {
            metric_labels::HANDLER_CORS_PREFLIGHT
        }
        _ => metric_labels::HANDLER_NOT_FOUND,
    }
}
/// Route an incoming HTTP request to the appropriate handler.
///
/// Thin metrics wrapper around `dispatch()`: records the in-flight gauge,
/// the request counter, and the duration histogram for every request,
/// including error paths.
async fn route(
    req: Request<Incoming>,
    state: Arc<AppState>,
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
    // Labels are derived before dispatch consumes the request.
    let handler = handler_label_for(req.method(), req.uri().path());
    let method = method_label(req.method());
    let started = std::time::Instant::now();
    // RAII guard: the in-flight gauge decrements on drop, even if dispatch
    // returns an error.
    let _in_flight = bs_metrics::InFlightGuard::new(handler);
    let result = dispatch(req, state).await;
    let status = match &result {
        Ok(resp) => resp.status().as_u16(),
        // hyper::Error here means the body couldn't be produced; conventionally 500.
        Err(_) => 500,
    };
    bs_metrics::record_http(handler, method, status, started);
    result
}
/// Inner dispatcher split out so `route()` can wrap it with metrics without
/// duplicating the match tree.
async fn dispatch(
req: Request<Incoming>,
state: Arc<AppState>,
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
let parent_cx = global::get_text_map_propagator(|p| p.extract(&HeaderExtractor(req.headers())));
let path = req.uri().path().to_string();
@ -439,6 +518,7 @@ async fn route(
Ok(list_models(Arc::clone(&state.llm_providers)).await)
}
(&Method::OPTIONS, "/v1/models" | "/agents/v1/models") => cors_preflight(),
(&Method::GET, "/debug/memstats") => debug::memstats().await,
_ => {
debug!(method = %req.method(), path = %path, "no route found");
let mut not_found = Response::new(empty());
@ -503,6 +583,7 @@ async fn run_server(state: Arc<AppState>) -> Result<(), Box<dyn std::error::Erro
async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
let config = load_config()?;
let _tracer_provider = init_tracer(config.tracing.as_ref());
bs_metrics::init();
info!("loaded plano_config.yaml");
let state = Arc::new(init_app_state(&config).await?);
run_server(state).await

View file

@ -0,0 +1,38 @@
//! Fixed label-value constants so callers never emit free-form strings
//! (which would blow up cardinality).
// Handler enum — derived from the path+method match in `route()`.
pub const HANDLER_AGENT_CHAT: &str = "agent_chat";
pub const HANDLER_ROUTING_DECISION: &str = "routing_decision";
pub const HANDLER_LLM_CHAT: &str = "llm_chat";
pub const HANDLER_FUNCTION_CALLING: &str = "function_calling";
pub const HANDLER_LIST_MODELS: &str = "list_models";
pub const HANDLER_CORS_PREFLIGHT: &str = "cors_preflight";
pub const HANDLER_NOT_FOUND: &str = "not_found";
// Router "route" class — which brightstaff endpoint prompted the decision.
pub const ROUTE_AGENT: &str = "agent";
pub const ROUTE_ROUTING: &str = "routing";
pub const ROUTE_LLM: &str = "llm";
// Token kind for brightstaff_llm_tokens_total.
pub const TOKEN_KIND_PROMPT: &str = "prompt";
pub const TOKEN_KIND_COMPLETION: &str = "completion";
// LLM error_class values (match docstring in metrics/mod.rs).
// NOTE(review): llm_error_class_from_reqwest in metrics/mod.rs returns these
// as raw string literals — keep the values in sync.
pub const LLM_ERR_NONE: &str = "none";
pub const LLM_ERR_TIMEOUT: &str = "timeout";
pub const LLM_ERR_CONNECT: &str = "connect";
pub const LLM_ERR_PARSE: &str = "parse";
pub const LLM_ERR_OTHER: &str = "other";
pub const LLM_ERR_STREAM: &str = "stream";
// Routing service outcome values.
pub const ROUTING_SVC_DECISION_SERVED: &str = "decision_served";
pub const ROUTING_SVC_NO_CANDIDATES: &str = "no_candidates";
pub const ROUTING_SVC_POLICY_ERROR: &str = "policy_error";
// Session cache outcome values.
pub const SESSION_CACHE_HIT: &str = "hit";
pub const SESSION_CACHE_MISS: &str = "miss";
pub const SESSION_CACHE_STORE: &str = "store";

View file

@ -0,0 +1,377 @@
//! Prometheus metrics for brightstaff.
//!
//! Installs the `metrics` global recorder backed by
//! `metrics-exporter-prometheus` and exposes a `/metrics` HTTP endpoint on a
//! dedicated admin port (default `0.0.0.0:9092`, overridable via
//! `METRICS_BIND_ADDRESS`).
//!
//! Emitted metric families (see `describe_all` for full list):
//! - HTTP RED: `brightstaff_http_requests_total`,
//! `brightstaff_http_request_duration_seconds`,
//! `brightstaff_http_in_flight_requests`.
//! - LLM upstream: `brightstaff_llm_upstream_requests_total`,
//! `brightstaff_llm_upstream_duration_seconds`,
//! `brightstaff_llm_time_to_first_token_seconds`,
//! `brightstaff_llm_tokens_total`,
//! `brightstaff_llm_tokens_usage_missing_total`.
//! - Routing: `brightstaff_router_decisions_total`,
//! `brightstaff_router_decision_duration_seconds`,
//! `brightstaff_routing_service_requests_total`,
//! `brightstaff_session_cache_events_total`.
//! - Process: via `metrics-process`.
//! - Build: `brightstaff_build_info`.
use std::net::SocketAddr;
use std::sync::OnceLock;
use std::time::{Duration, Instant};
use metrics::{counter, describe_counter, describe_gauge, describe_histogram, gauge, histogram};
use metrics_exporter_prometheus::{Matcher, PrometheusBuilder};
use tracing::{info, warn};
pub mod labels;
/// Guard flag so tests don't re-install the global recorder.
static INIT: OnceLock<()> = OnceLock::new();
// Admin-port default for the Prometheus scrape endpoint; overridable via
// the METRICS_BIND_ADDRESS environment variable (see `init`).
const DEFAULT_METRICS_BIND: &str = "0.0.0.0:9092";
/// HTTP request duration buckets (seconds). Capped at 60s.
const HTTP_BUCKETS: &[f64] = &[
    0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0,
];
/// LLM upstream / TTFT buckets (seconds). Capped at 120s because provider
/// completions routinely run that long.
const LLM_BUCKETS: &[f64] = &[0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 120.0];
/// Router decision buckets (seconds). The orchestrator call itself is usually
/// sub-second but bucketed generously in case of upstream slowness.
const ROUTER_BUCKETS: &[f64] = &[
    0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0,
];
/// Install the global recorder and spawn the `/metrics` HTTP listener.
///
/// Safe to call more than once; subsequent calls are no-ops so tests that
/// construct their own recorder still work.
///
/// Failure is non-fatal by design: if the bind address is invalid we fall
/// back to the default, and if the recorder can't be installed we log and
/// run with metrics disabled rather than aborting the server.
pub fn init() {
    if INIT.get().is_some() {
        return;
    }
    // Bind address: env override, falling back to the compile-time default
    // both when the variable is unset and when it doesn't parse.
    let bind: SocketAddr = std::env::var("METRICS_BIND_ADDRESS")
        .unwrap_or_else(|_| DEFAULT_METRICS_BIND.to_string())
        .parse()
        .unwrap_or_else(|err| {
            warn!(error = %err, default = DEFAULT_METRICS_BIND, "invalid METRICS_BIND_ADDRESS, falling back to default");
            DEFAULT_METRICS_BIND.parse().expect("default bind parses")
        });
    // Custom histogram buckets per metric family; each set_buckets_* call
    // returns Result, hence the and_then chain.
    let builder = PrometheusBuilder::new()
        .with_http_listener(bind)
        .set_buckets_for_metric(
            Matcher::Full("brightstaff_http_request_duration_seconds".to_string()),
            HTTP_BUCKETS,
        )
        .and_then(|b| {
            b.set_buckets_for_metric(Matcher::Prefix("brightstaff_llm_".to_string()), LLM_BUCKETS)
        })
        .and_then(|b| {
            b.set_buckets_for_metric(
                Matcher::Full("brightstaff_router_decision_duration_seconds".to_string()),
                ROUTER_BUCKETS,
            )
        });
    // If bucket configuration failed, degrade to a plain builder (default
    // buckets) rather than losing metrics entirely.
    let builder = match builder {
        Ok(b) => b,
        Err(err) => {
            warn!(error = %err, "failed to configure metrics buckets, using defaults");
            PrometheusBuilder::new().with_http_listener(bind)
        }
    };
    if let Err(err) = builder.install() {
        warn!(error = %err, "failed to install Prometheus recorder; metrics disabled");
        return;
    }
    // INIT is only set after a successful install, so a failed attempt can
    // be retried by a later call.
    let _ = INIT.set(());
    describe_all();
    emit_build_info();
    // Register process-level collector (RSS, CPU, FDs).
    let collector = metrics_process::Collector::default();
    collector.describe();
    // Prime once at startup; subsequent scrapes refresh via the exporter's
    // per-scrape render, so we additionally refresh on a short interval to
    // keep gauges moving between scrapes without requiring client pull.
    collector.collect();
    tokio::spawn(async move {
        let mut tick = tokio::time::interval(Duration::from_secs(10));
        tick.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
        loop {
            tick.tick().await;
            collector.collect();
        }
    });
    info!(address = %bind, "metrics listener started");
}
/// Register help text for every metric family this crate emits, so the
/// Prometheus exposition output carries `# HELP` lines. Called once from
/// `init` after the recorder is installed.
fn describe_all() {
    describe_counter!(
        "brightstaff_http_requests_total",
        "Total HTTP requests served by brightstaff, by handler and status class."
    );
    describe_histogram!(
        "brightstaff_http_request_duration_seconds",
        "Wall-clock duration of HTTP requests served by brightstaff, by handler."
    );
    describe_gauge!(
        "brightstaff_http_in_flight_requests",
        "Number of HTTP requests currently being served by brightstaff, by handler."
    );
    describe_counter!(
        "brightstaff_llm_upstream_requests_total",
        "LLM upstream request outcomes, by provider, model, status class and error class."
    );
    describe_histogram!(
        "brightstaff_llm_upstream_duration_seconds",
        "Wall-clock duration of LLM upstream calls (stream close for streaming), by provider and model."
    );
    describe_histogram!(
        "brightstaff_llm_time_to_first_token_seconds",
        "Time from request start to first streamed byte, by provider and model (streaming only)."
    );
    describe_counter!(
        "brightstaff_llm_tokens_total",
        "Tokens reported in the provider `usage` field, by provider, model and kind (prompt/completion)."
    );
    describe_counter!(
        "brightstaff_llm_tokens_usage_missing_total",
        "LLM responses that completed without a usable `usage` block (so token counts are unknown)."
    );
    describe_counter!(
        "brightstaff_router_decisions_total",
        "Routing decisions made by the orchestrator, by route, selected model, and whether a fallback was used."
    );
    describe_histogram!(
        "brightstaff_router_decision_duration_seconds",
        "Time spent in the orchestrator deciding a route, by route."
    );
    describe_counter!(
        "brightstaff_routing_service_requests_total",
        "Outcomes of /routing/* decision requests: decision_served, no_candidates, policy_error."
    );
    describe_counter!(
        "brightstaff_session_cache_events_total",
        "Session affinity cache lookups and stores, by outcome."
    );
    describe_gauge!(
        "brightstaff_build_info",
        "Build metadata. Always 1; labels carry version and git SHA."
    );
}
/// Emit the `brightstaff_build_info` gauge once at startup. The value is a
/// constant 1; the interesting data rides in the `version` and `git_sha`
/// labels (the standard Prometheus build-info idiom).
fn emit_build_info() {
    let version = env!("CARGO_PKG_VERSION");
    // GIT_SHA is injected at build time; "unknown" when the build didn't set it.
    let git_sha = option_env!("GIT_SHA").unwrap_or("unknown");
    gauge!(
        "brightstaff_build_info",
        "version" => version.to_string(),
        "git_sha" => git_sha.to_string(),
    )
    .set(1.0);
}
/// Split a provider-qualified model id like `"openai/gpt-4o"` into
/// `(provider, model)` at the first `/`. Ids with no `/` are treated as
/// bare model names: `("unknown", raw)`.
pub fn split_provider_model(full: &str) -> (&str, &str) {
    if let Some(slash) = full.find('/') {
        (&full[..slash], &full[slash + 1..])
    } else {
        ("unknown", full)
    }
}
/// Bucket an HTTP status code into its coarse class label
/// (`"1xx"`..`"5xx"`); anything outside 100-599 maps to `"other"`.
pub fn status_class(status: u16) -> &'static str {
    match status / 100 {
        1 => "1xx",
        2 => "2xx",
        3 => "3xx",
        4 => "4xx",
        5 => "5xx",
        _ => "other",
    }
}
// ---------------------------------------------------------------------------
// HTTP RED helpers
// ---------------------------------------------------------------------------
/// RAII guard that increments the in-flight gauge on construction and
/// decrements on drop. Pair with [`HttpTimer`] in the `route()` wrapper so the
/// gauge drops even on error paths.
pub struct InFlightGuard {
    // Static label so Drop can decrement the exact series that was incremented.
    handler: &'static str,
}
impl InFlightGuard {
    /// Increment `brightstaff_http_in_flight_requests{handler=...}` and
    /// return the guard whose drop undoes it.
    pub fn new(handler: &'static str) -> Self {
        gauge!(
            "brightstaff_http_in_flight_requests",
            "handler" => handler,
        )
        .increment(1.0);
        Self { handler }
    }
}
impl Drop for InFlightGuard {
    fn drop(&mut self) {
        gauge!(
            "brightstaff_http_in_flight_requests",
            "handler" => self.handler,
        )
        .decrement(1.0);
    }
}
/// Record the HTTP request counter + duration histogram.
///
/// `status` is collapsed to its class ("2xx"/"4xx"/...) before labeling so
/// cardinality stays bounded; the duration is measured from `started` to now.
pub fn record_http(handler: &'static str, method: &'static str, status: u16, started: Instant) {
    let class = status_class(status);
    counter!(
        "brightstaff_http_requests_total",
        "handler" => handler,
        "method" => method,
        "status_class" => class,
    )
    .increment(1);
    histogram!(
        "brightstaff_http_request_duration_seconds",
        "handler" => handler,
    )
    .record(started.elapsed().as_secs_f64());
}
// ---------------------------------------------------------------------------
// LLM upstream helpers
// ---------------------------------------------------------------------------
/// Classify a failed LLM upstream call for the `error_class` label. The
/// checks are ordered: a timeout wins over a connect failure, which wins over
/// a body-decode failure; anything else is `"other"`.
pub fn llm_error_class_from_reqwest(err: &reqwest::Error) -> &'static str {
    match (err.is_timeout(), err.is_connect(), err.is_decode()) {
        (true, _, _) => "timeout",
        (false, true, _) => "connect",
        (false, false, true) => "parse",
        _ => "other",
    }
}
/// Record the outcome of an LLM upstream call. `status` is the HTTP status
/// the upstream returned (0 if the call never produced one, e.g. send failure).
/// `error_class` is `"none"` on success, or a discriminated error label.
pub fn record_llm_upstream(
    provider: &str,
    model: &str,
    status: u16,
    error_class: &str,
    duration: Duration,
) {
    // Sentinel status 0 (no HTTP response at all) maps to "error"; real
    // statuses are bucketed into classes to bound label cardinality.
    let class = if status == 0 {
        "error"
    } else {
        status_class(status)
    };
    // provider/model are dynamic values, so they are passed as owned Strings.
    counter!(
        "brightstaff_llm_upstream_requests_total",
        "provider" => provider.to_string(),
        "model" => model.to_string(),
        "status_class" => class,
        "error_class" => error_class.to_string(),
    )
    .increment(1);
    histogram!(
        "brightstaff_llm_upstream_duration_seconds",
        "provider" => provider.to_string(),
        "model" => model.to_string(),
    )
    .record(duration.as_secs_f64());
}
/// Record time-to-first-token for an LLM response, labelled by provider and
/// model. `ttft` is recorded in seconds.
pub fn record_llm_ttft(provider: &str, model: &str, ttft: Duration) {
    histogram!(
        "brightstaff_llm_time_to_first_token_seconds",
        "provider" => provider.to_string(),
        "model" => model.to_string(),
    )
    .record(ttft.as_secs_f64());
}
/// Add `count` tokens to the token counter for (`provider`, `model`).
/// `kind` is a caller-supplied label distinguishing token categories
/// (presumably prompt vs completion — value set at call sites).
pub fn record_llm_tokens(provider: &str, model: &str, kind: &'static str, count: u64) {
    counter!(
        "brightstaff_llm_tokens_total",
        "provider" => provider.to_string(),
        "model" => model.to_string(),
        "kind" => kind,
    )
    .increment(count);
}
/// Count responses where token accounting could not be performed (callers use
/// this when the upstream response carried no usage data).
pub fn record_llm_tokens_usage_missing(provider: &str, model: &str) {
    counter!(
        "brightstaff_llm_tokens_usage_missing_total",
        "provider" => provider.to_string(),
        "model" => model.to_string(),
    )
    .increment(1);
}
// ---------------------------------------------------------------------------
// Router helpers
// ---------------------------------------------------------------------------
/// Record a routing decision: which route and model were selected, whether
/// the fallback path was taken, and how long the decision took (seconds).
pub fn record_router_decision(
    route: &'static str,
    selected_model: &str,
    fallback: bool,
    duration: Duration,
) {
    counter!(
        "brightstaff_router_decisions_total",
        "route" => route,
        "selected_model" => selected_model.to_string(),
        // bool rendered as a static string to avoid per-call allocation.
        "fallback" => if fallback { "true" } else { "false" },
    )
    .increment(1);
    histogram!(
        "brightstaff_router_decision_duration_seconds",
        "route" => route,
    )
    .record(duration.as_secs_f64());
}
/// Count a routing-service request by outcome label (value supplied by the
/// caller).
pub fn record_routing_service_outcome(outcome: &'static str) {
    counter!(
        "brightstaff_routing_service_requests_total",
        "outcome" => outcome,
    )
    .increment(1);
}
/// Count a session-cache event. Callers pass hit/miss/store label constants
/// (see the orchestrator's cached-route lookup and store paths).
pub fn record_session_cache_event(outcome: &'static str) {
    counter!(
        "brightstaff_session_cache_events_total",
        "outcome" => outcome,
    )
    .increment(1);
}

View file

@ -1,8 +1,14 @@
use hermesllm::apis::openai::ChatCompletionsResponse;
use hyper::header;
use serde::Deserialize;
use thiserror::Error;
use tracing::warn;
/// Max bytes of raw upstream body we include in a log message or error text
/// when the body is not a recognizable error envelope. Keeps logs from being
/// flooded by huge HTML error pages.
///
/// The limit is in bytes, not characters; `truncate_for_log` walks the cut
/// back to a UTF-8 character boundary so the emitted slice is always valid.
const RAW_BODY_LOG_LIMIT: usize = 512;
#[derive(Debug, Error)]
pub enum HttpError {
#[error("Failed to send request: {0}")]
@ -10,13 +16,64 @@ pub enum HttpError {
#[error("Failed to parse JSON response: {0}")]
Json(serde_json::Error, String),
#[error("Upstream returned {status}: {message}")]
Upstream { status: u16, message: String },
}
/// Shape of an OpenAI-style error response body, e.g.
/// `{"error": {"message": "...", "type": "...", "param": "...", "code": ...}}`.
/// Only the fields we render are modelled; unknown fields (e.g. `code`) are
/// ignored by serde's default behavior.
#[derive(Debug, Deserialize)]
struct UpstreamErrorEnvelope {
    error: UpstreamErrorBody,
}
/// Inner `error` object of an OpenAI-style error envelope. `message` is
/// required for the envelope to parse; `type` and `param` are optional.
#[derive(Debug, Deserialize)]
struct UpstreamErrorBody {
    message: String,
    // `type` is a Rust keyword, hence the rename.
    #[serde(default, rename = "type")]
    err_type: Option<String>,
    #[serde(default)]
    param: Option<String>,
}
/// Extract a human-readable error message from an upstream response body.
///
/// Attempts to parse an OpenAI-style `{"error": {"message": ...}}` envelope,
/// rendering the optional `param` and `type` fields after the message. If the
/// body is not in that shape, falls back to the first `RAW_BODY_LOG_LIMIT`
/// bytes of the raw body (UTF-8 safe).
fn extract_upstream_error_message(body: &str) -> String {
    match serde_json::from_str::<UpstreamErrorEnvelope>(body) {
        // Not an error envelope — return a bounded slice of the raw body.
        Err(_) => truncate_for_log(body).to_string(),
        Ok(envelope) => {
            let mut rendered = envelope.error.message;
            // Append optional fields in a stable order: param, then type.
            if let Some(param) = envelope.error.param {
                rendered.push_str(&format!(" (param={param})"));
            }
            if let Some(err_type) = envelope.error.err_type {
                rendered.push_str(&format!(" [type={err_type}]"));
            }
            rendered
        }
    }
}
/// Clamp `s` to at most `RAW_BODY_LOG_LIMIT` bytes for logging. When the cap
/// falls inside a multi-byte UTF-8 sequence, the cut point is moved back to
/// the previous character boundary so the returned slice is always valid.
fn truncate_for_log(s: &str) -> &str {
    if s.len() <= RAW_BODY_LOG_LIMIT {
        return s;
    }
    // Index 0 is always a char boundary, so this search cannot fail.
    let cut = (0..=RAW_BODY_LOG_LIMIT)
        .rev()
        .find(|&i| s.is_char_boundary(i))
        .unwrap_or(0);
    &s[..cut]
}
/// Sends a POST request to the given URL and extracts the text content
/// from the first choice of the `ChatCompletionsResponse`.
///
/// Returns `Some((content, elapsed))` on success, or `None` if the response
/// had no choices or the first choice had no content.
/// Returns `Some((content, elapsed))` on success, `None` if the response
/// had no choices or the first choice had no content. Returns
/// `HttpError::Upstream` for any non-2xx status, carrying a message
/// extracted from the OpenAI-style error envelope (or a truncated raw body
/// if the body is not in that shape).
pub async fn post_and_extract_content(
client: &reqwest::Client,
url: &str,
@ -26,17 +83,36 @@ pub async fn post_and_extract_content(
let start_time = std::time::Instant::now();
let res = client.post(url).headers(headers).body(body).send().await?;
let status = res.status();
let body = res.text().await?;
let elapsed = start_time.elapsed();
if !status.is_success() {
let message = extract_upstream_error_message(&body);
warn!(
status = status.as_u16(),
message = %message,
body_size = body.len(),
"upstream returned error response"
);
return Err(HttpError::Upstream {
status: status.as_u16(),
message,
});
}
let response: ChatCompletionsResponse = serde_json::from_str(&body).map_err(|err| {
warn!(error = %err, body = %body, "failed to parse json response");
warn!(
error = %err,
body = %truncate_for_log(&body),
"failed to parse json response",
);
HttpError::Json(err, format!("Failed to parse JSON: {}", body))
})?;
if response.choices.is_empty() {
warn!(body = %body, "no choices in response");
warn!(body = %truncate_for_log(&body), "no choices in response");
return Ok(None);
}
@ -46,3 +122,52 @@ pub async fn post_and_extract_content(
.as_ref()
.map(|c| (c.clone(), elapsed)))
}
#[cfg(test)]
mod tests {
    use super::*;

    // Happy path: envelope with all optional fields present — message, param
    // and type are all rendered.
    #[test]
    fn extracts_message_from_openai_style_error_envelope() {
        let body = r#"{"error":{"code":400,"message":"This model's maximum context length is 32768 tokens. However, you requested 0 output tokens and your prompt contains at least 32769 input tokens, for a total of at least 32769 tokens.","param":"input_tokens","type":"BadRequestError"}}"#;
        let msg = extract_upstream_error_message(body);
        assert!(
            msg.starts_with("This model's maximum context length is 32768 tokens."),
            "unexpected message: {msg}"
        );
        assert!(msg.contains("(param=input_tokens)"));
        assert!(msg.contains("[type=BadRequestError]"));
    }

    // Envelope with only the required `message` field.
    #[test]
    fn extracts_message_without_optional_fields() {
        let body = r#"{"error":{"message":"something broke"}}"#;
        let msg = extract_upstream_error_message(body);
        assert_eq!(msg, "something broke");
    }

    // Non-JSON bodies (e.g. proxy HTML error pages) fall back to raw text.
    #[test]
    fn falls_back_to_raw_body_when_not_error_envelope() {
        let body = "<html><body>502 Bad Gateway</body></html>";
        let msg = extract_upstream_error_message(body);
        assert_eq!(msg, body);
    }

    // ASCII input: the byte cap and character cap coincide exactly.
    #[test]
    fn truncates_non_envelope_bodies_in_logs() {
        let body = "x".repeat(RAW_BODY_LOG_LIMIT * 3);
        let msg = extract_upstream_error_message(&body);
        assert_eq!(msg.len(), RAW_BODY_LOG_LIMIT);
    }

    #[test]
    fn truncate_for_log_respects_utf8_boundaries() {
        // 2-byte characters; picking a length that would split mid-char.
        let body = "é".repeat(RAW_BODY_LOG_LIMIT);
        let out = truncate_for_log(&body);
        // Should be a valid &str (implicit — would panic if we returned
        // a non-boundary slice) and at most RAW_BODY_LOG_LIMIT bytes.
        assert!(out.len() <= RAW_BODY_LOG_LIMIT);
        assert!(out.chars().all(|c| c == 'é'));
    }
}

View file

@ -3,3 +3,5 @@ pub mod model_metrics;
pub mod orchestrator;
pub mod orchestrator_model;
pub mod orchestrator_model_v1;
#[cfg(test)]
mod stress_tests;

View file

@ -15,6 +15,8 @@ use super::http::{self, post_and_extract_content};
use super::model_metrics::ModelMetricsService;
use super::orchestrator_model::OrchestratorModel;
use crate::metrics as bs_metrics;
use crate::metrics::labels as metric_labels;
use crate::router::orchestrator_model_v1;
use crate::session_cache::SessionCache;
@ -130,7 +132,13 @@ impl OrchestratorService {
tenant_id: Option<&str>,
) -> Option<CachedRoute> {
let cache = self.session_cache.as_ref()?;
cache.get(&Self::session_key(tenant_id, session_id)).await
let result = cache.get(&Self::session_key(tenant_id, session_id)).await;
bs_metrics::record_session_cache_event(if result.is_some() {
metric_labels::SESSION_CACHE_HIT
} else {
metric_labels::SESSION_CACHE_MISS
});
result
}
pub async fn cache_route(
@ -151,6 +159,7 @@ impl OrchestratorService {
self.session_ttl,
)
.await;
bs_metrics::record_session_cache_event(metric_labels::SESSION_CACHE_STORE);
}
}

View file

@ -10,6 +10,18 @@ use super::orchestrator_model::{OrchestratorModel, OrchestratorModelError};
/// Default max token length (prompt budget) for the orchestration model.
pub const MAX_TOKEN_LEN: usize = 8192;

/// Hard cap on the number of recent messages considered when building the
/// routing prompt. Bounds prompt growth for long-running conversations and
/// acts as an outer guardrail before the token-budget loop runs. The most
/// recent `MAX_ROUTING_TURNS` filtered messages are kept; older turns are
/// dropped entirely.
pub const MAX_ROUTING_TURNS: usize = 16;
/// Unicode ellipsis (U+2026) used to mark where content was trimmed out of a
/// long message. Helps signal to the downstream router model that the message
/// was truncated.
///
/// Must be non-empty: `trim_middle_utf8` reserves `TRIM_MARKER.len()` bytes
/// of the budget for it, and the unit tests match on '…' and rely on a
/// marker length >= 2 bytes for the head-only fallback case.
const TRIM_MARKER: &str = "…";
/// Custom JSON formatter that produces spaced JSON (space after colons and commas), same as JSON in python
struct SpacedJsonFormatter;
@ -176,10 +188,9 @@ impl OrchestratorModel for OrchestratorModelV1 {
messages: &[Message],
usage_preferences_from_request: &Option<Vec<AgentUsagePreference>>,
) -> ChatCompletionsRequest {
// remove system prompt, tool calls, tool call response and messages without content
// if content is empty its likely a tool call
// when role == tool its tool call response
let messages_vec = messages
// Remove system/developer/tool messages and messages without extractable
// text (tool calls have no text content we can classify against).
let filtered: Vec<&Message> = messages
.iter()
.filter(|m| {
m.role != Role::System
@ -187,37 +198,72 @@ impl OrchestratorModel for OrchestratorModelV1 {
&& m.role != Role::Tool
&& !m.content.extract_text().is_empty()
})
.collect::<Vec<&Message>>();
.collect();
// Following code is to ensure that the conversation does not exceed max token length
// Note: we use a simple heuristic to estimate token count based on character length to optimize for performance
// Outer guardrail: only consider the last `MAX_ROUTING_TURNS` filtered
// messages when building the routing prompt. Keeps prompt growth
// predictable for long conversations regardless of per-message size.
let start = filtered.len().saturating_sub(MAX_ROUTING_TURNS);
let messages_vec: &[&Message] = &filtered[start..];
// Ensure the conversation does not exceed the configured token budget.
// We use `len() / TOKEN_LENGTH_DIVISOR` as a cheap token estimate to
// avoid running a real tokenizer on the hot path.
let mut token_count = ARCH_ORCHESTRATOR_V1_SYSTEM_PROMPT.len() / TOKEN_LENGTH_DIVISOR;
let mut selected_messages_list_reversed: Vec<&Message> = vec![];
let mut selected_messages_list_reversed: Vec<Message> = vec![];
for (selected_messsage_count, message) in messages_vec.iter().rev().enumerate() {
let message_token_count = message.content.extract_text().len() / TOKEN_LENGTH_DIVISOR;
token_count += message_token_count;
if token_count > self.max_token_length {
let message_text = message.content.extract_text();
let message_token_count = message_text.len() / TOKEN_LENGTH_DIVISOR;
if token_count + message_token_count > self.max_token_length {
let remaining_tokens = self.max_token_length.saturating_sub(token_count);
debug!(
token_count = token_count,
attempted_total_tokens = token_count + message_token_count,
max_tokens = self.max_token_length,
remaining_tokens,
selected = selected_messsage_count,
total = messages_vec.len(),
"token count exceeds max, truncating conversation"
);
if message.role == Role::User {
// If message that exceeds max token length is from user, we need to keep it
selected_messages_list_reversed.push(message);
// If the overflow message is from the user we need to keep
// some of it so the orchestrator still sees the latest user
// intent. Use a middle-trim (head + ellipsis + tail): users
// often frame the task at the start AND put the actual ask
// at the end of a long pasted block, so preserving both is
// better than a head-only cut. The ellipsis also signals to
// the router model that content was dropped.
if message.role == Role::User && remaining_tokens > 0 {
let max_bytes = remaining_tokens.saturating_mul(TOKEN_LENGTH_DIVISOR);
let truncated = trim_middle_utf8(&message_text, max_bytes);
selected_messages_list_reversed.push(Message {
role: Role::User,
content: Some(MessageContent::Text(truncated)),
name: None,
tool_calls: None,
tool_call_id: None,
});
}
break;
}
// If we are here, it means that the message is within the max token length
selected_messages_list_reversed.push(message);
token_count += message_token_count;
selected_messages_list_reversed.push(Message {
role: message.role.clone(),
content: Some(MessageContent::Text(message_text)),
name: None,
tool_calls: None,
tool_call_id: None,
});
}
if selected_messages_list_reversed.is_empty() {
debug!("no messages selected, using last message");
if let Some(last_message) = messages_vec.last() {
selected_messages_list_reversed.push(last_message);
selected_messages_list_reversed.push(Message {
role: last_message.role.clone(),
content: Some(MessageContent::Text(last_message.content.extract_text())),
name: None,
tool_calls: None,
tool_call_id: None,
});
}
}
@ -237,22 +283,8 @@ impl OrchestratorModel for OrchestratorModelV1 {
}
// Reverse the selected messages to maintain the conversation order
let selected_conversation_list = selected_messages_list_reversed
.iter()
.rev()
.map(|message| Message {
role: message.role.clone(),
content: Some(MessageContent::Text(
message
.content
.as_ref()
.map_or(String::new(), |c| c.to_string()),
)),
name: None,
tool_calls: None,
tool_call_id: None,
})
.collect::<Vec<Message>>();
let selected_conversation_list: Vec<Message> =
selected_messages_list_reversed.into_iter().rev().collect();
// Generate the orchestrator request message based on the usage preferences.
// If preferences are passed in request then we use them;
@ -405,6 +437,45 @@ fn fix_json_response(body: &str) -> String {
body.replace("'", "\"").replace("\\n", "")
}
/// Truncate `s` so the result is at most `max_bytes` bytes long, keeping
/// roughly 60% from the start and 40% from the end, with a Unicode ellipsis
/// separating the two. All splits respect UTF-8 character boundaries. When
/// `max_bytes` is too small to fit the marker at all, falls back to a
/// head-only truncation.
fn trim_middle_utf8(s: &str, max_bytes: usize) -> String {
    // Fast path: already within budget — return unchanged.
    if s.len() <= max_bytes {
        return s.to_string();
    }
    if max_bytes <= TRIM_MARKER.len() {
        // Not enough room even for the marker — just keep the start.
        // Walk back to a char boundary (index 0 always is one).
        let mut end = max_bytes;
        while end > 0 && !s.is_char_boundary(end) {
            end -= 1;
        }
        return s[..end].to_string();
    }
    // Byte budget left for content once the marker is accounted for.
    let available = max_bytes - TRIM_MARKER.len();
    // Bias toward the start (60%) where task framing typically lives, while
    // still preserving ~40% of the tail where the user's actual ask often
    // appears after a long paste.
    let mut start_len = available * 3 / 5;
    while start_len > 0 && !s.is_char_boundary(start_len) {
        start_len -= 1;
    }
    // Bytes the head gave back to reach a boundary are granted to the tail.
    // The tail start only ever moves forward (shrinking the tail), so the
    // total output length never exceeds `max_bytes`.
    let end_len = available - start_len;
    let mut end_start = s.len().saturating_sub(end_len);
    while end_start < s.len() && !s.is_char_boundary(end_start) {
        end_start += 1;
    }
    let mut out = String::with_capacity(start_len + TRIM_MARKER.len() + (s.len() - end_start));
    out.push_str(&s[..start_len]);
    out.push_str(TRIM_MARKER);
    out.push_str(&s[end_start..]);
    out
}
impl std::fmt::Debug for dyn OrchestratorModel {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "OrchestratorModel")
@ -777,6 +848,10 @@ If no routes are needed, return an empty list for `route`.
#[test]
fn test_conversation_trim_upto_user_message() {
// With max_token_length=230, the older user message "given the image
// In style of Andy Warhol" overflows the remaining budget and gets
// middle-trimmed (head + ellipsis + tail) until it fits. Newer turns
// are kept in full.
let expected_prompt = r#"
You are a helpful assistant that selects the most suitable routes based on user intent.
You are provided with a list of available routes enclosed within <routes></routes> XML tags:
@ -789,7 +864,7 @@ You are also given the conversation context enclosed within <conversation></conv
[
{
"role": "user",
"content": "given the image In style of Andy Warhol"
"content": "givenrhol"
},
{
"role": "assistant",
@ -862,6 +937,190 @@ If no routes are needed, return an empty list for `route`.
assert_eq!(expected_prompt, prompt);
}
#[test]
fn test_huge_single_user_message_is_middle_trimmed() {
    // Regression test for the case where a single, extremely large user
    // message was being passed to the orchestrator verbatim and blowing
    // past the upstream model's context window. The trimmer must now
    // middle-trim (head + ellipsis + tail) the oversized message so the
    // resulting request stays within the configured budget, and the
    // trim marker must be present so the router model knows content
    // was dropped.
    let orchestrations_str = r#"
    {
        "gpt-4o": [
            {"name": "Image generation", "description": "generating image"}
        ]
    }
    "#;
    let agent_orchestrations = serde_json::from_str::<
        HashMap<String, Vec<OrchestrationPreference>>,
    >(orchestrations_str)
    .unwrap();
    let max_token_length = 2048;
    let orchestrator = OrchestratorModelV1::new(
        agent_orchestrations,
        "test-model".to_string(),
        max_token_length,
    );

    // ~500KB of content — same scale as the real payload that triggered
    // the production upstream 400. Distinct markers at both ends let us
    // verify head AND tail survive the middle-trim.
    let head = "HEAD_MARKER_START ";
    let tail = " TAIL_MARKER_END";
    let filler = "A".repeat(500_000);
    let huge_user_content = format!("{head}{filler}{tail}");

    let conversation = vec![Message {
        role: Role::User,
        content: Some(MessageContent::Text(huge_user_content.clone())),
        name: None,
        tool_calls: None,
        tool_call_id: None,
    }];

    let req = orchestrator.generate_request(&conversation, &None);
    let prompt = req.messages[0].content.extract_text();

    // Prompt must stay bounded. Generous ceiling = budget-in-bytes +
    // scaffolding + slack. Real result should be well under this.
    let byte_ceiling = max_token_length * TOKEN_LENGTH_DIVISOR
        + ARCH_ORCHESTRATOR_V1_SYSTEM_PROMPT.len()
        + 1024;
    assert!(
        prompt.len() < byte_ceiling,
        "prompt length {} exceeded ceiling {} — truncation did not apply",
        prompt.len(),
        byte_ceiling,
    );

    // Not all 500k filler chars survive.
    let a_count = prompt.chars().filter(|c| *c == 'A').count();
    assert!(
        a_count < filler.len(),
        "expected user message to be truncated; all {} 'A's survived",
        a_count
    );
    assert!(
        a_count > 0,
        "expected some of the user message to survive truncation"
    );

    // Head and tail of the message must both be preserved (that's the
    // whole point of middle-trim over head-only).
    assert!(
        prompt.contains(head),
        "head marker missing — head was not preserved"
    );
    assert!(
        prompt.contains(tail),
        "tail marker missing — tail was not preserved"
    );

    // Trim marker must be present so the router model can see that
    // content was omitted.
    assert!(
        prompt.contains(TRIM_MARKER),
        "ellipsis trim marker missing from truncated prompt"
    );

    // Routing prompt scaffolding remains intact.
    assert!(prompt.contains("<conversation>"));
    assert!(prompt.contains("<routes>"));
}
#[test]
fn test_turn_cap_limits_routing_history() {
    // The outer turn-cap guardrail should keep only the last
    // `MAX_ROUTING_TURNS` filtered messages regardless of how long the
    // conversation is. We build a conversation with alternating
    // user/assistant turns tagged with their index and verify that only
    // the tail of the conversation makes it into the prompt.
    let orchestrations_str = r#"
    {
        "gpt-4o": [
            {"name": "Image generation", "description": "generating image"}
        ]
    }
    "#;
    let agent_orchestrations = serde_json::from_str::<
        HashMap<String, Vec<OrchestrationPreference>>,
    >(orchestrations_str)
    .unwrap();
    // usize::MAX budget disables the token-based trimming path so only the
    // turn cap can drop messages here.
    let orchestrator =
        OrchestratorModelV1::new(agent_orchestrations, "test-model".to_string(), usize::MAX);

    let mut conversation: Vec<Message> = Vec::new();
    let total_turns = MAX_ROUTING_TURNS * 2; // well past the cap
    for i in 0..total_turns {
        let role = if i % 2 == 0 {
            Role::User
        } else {
            Role::Assistant
        };
        conversation.push(Message {
            role,
            // Zero-padded tags keep substring matches unambiguous.
            content: Some(MessageContent::Text(format!("turn-{i:03}"))),
            name: None,
            tool_calls: None,
            tool_call_id: None,
        });
    }

    let req = orchestrator.generate_request(&conversation, &None);
    let prompt = req.messages[0].content.extract_text();

    // The last MAX_ROUTING_TURNS messages (indexes total-cap..total)
    // must all appear.
    for i in (total_turns - MAX_ROUTING_TURNS)..total_turns {
        let tag = format!("turn-{i:03}");
        assert!(
            prompt.contains(&tag),
            "expected recent turn tag {tag} to be present"
        );
    }
    // And earlier turns (indexes 0..total-cap) must all be dropped.
    for i in 0..(total_turns - MAX_ROUTING_TURNS) {
        let tag = format!("turn-{i:03}");
        assert!(
            !prompt.contains(&tag),
            "old turn tag {tag} leaked past turn cap into the prompt"
        );
    }
}
#[test]
fn test_trim_middle_utf8_helper() {
    // No-op when already small enough.
    assert_eq!(trim_middle_utf8("hello", 100), "hello");
    assert_eq!(trim_middle_utf8("hello", 5), "hello");

    // 60/40 split with ellipsis when too long.
    let long = "a".repeat(20);
    let out = trim_middle_utf8(&long, 10);
    assert!(out.len() <= 10);
    assert!(out.contains(TRIM_MARKER));
    // Exactly one ellipsis, rest are 'a's.
    assert_eq!(out.matches(TRIM_MARKER).count(), 1);
    assert!(out.chars().filter(|c| *c == 'a').count() > 0);

    // When max_bytes is smaller than the marker, falls back to
    // head-only truncation (no marker).
    let out = trim_middle_utf8("abcdefgh", 2);
    assert_eq!(out, "ab");

    // UTF-8 boundary safety: 2-byte chars.
    let s = "é".repeat(50); // 100 bytes
    let out = trim_middle_utf8(&s, 25);
    assert!(out.len() <= 25);
    // Must still be valid UTF-8 that only contains 'é' and the marker.
    let ok = out.chars().all(|c| c == 'é' || c == '…');
    assert!(ok, "unexpected char in trimmed output: {out:?}");
}
#[test]
fn test_non_text_input() {
let expected_prompt = r#"

View file

@ -0,0 +1,264 @@
#[cfg(test)]
mod tests {
    use crate::router::orchestrator::OrchestratorService;
    use crate::session_cache::memory::MemorySessionCache;
    use common::configuration::{SelectionPolicy, SelectionPreference, TopLevelRoutingPreference};
    use hermesllm::apis::openai::{Message, MessageContent, Role};
    use std::sync::Arc;

    /// Build `n` alternating user/assistant messages with deterministic,
    /// realistically-sized text content.
    fn make_messages(n: usize) -> Vec<Message> {
        (0..n)
            .map(|i| Message {
                role: if i % 2 == 0 {
                    Role::User
                } else {
                    Role::Assistant
                },
                content: Some(MessageContent::Text(format!(
                    "This is message number {i} with some padding text to make it realistic."
                ))),
                name: None,
                tool_calls: None,
                tool_call_id: None,
            })
            .collect()
    }

    /// Two representative routing preferences shared by both stress tests.
    fn make_routing_prefs() -> Vec<TopLevelRoutingPreference> {
        vec![
            TopLevelRoutingPreference {
                name: "code_generation".to_string(),
                description: "Code generation and debugging tasks".to_string(),
                models: vec![
                    "openai/gpt-4o".to_string(),
                    "openai/gpt-4o-mini".to_string(),
                ],
                selection_policy: SelectionPolicy {
                    prefer: SelectionPreference::None,
                },
            },
            TopLevelRoutingPreference {
                name: "summarization".to_string(),
                description: "Summarizing documents and text".to_string(),
                models: vec![
                    "anthropic/claude-3-sonnet".to_string(),
                    "openai/gpt-4o-mini".to_string(),
                ],
                selection_policy: SelectionPolicy {
                    prefer: SelectionPreference::None,
                },
            },
        ]
    }

    /// Stress test: exercise the full routing code path N times using a mock
    /// HTTP server and measure jemalloc allocated bytes before/after.
    ///
    /// This catches:
    /// - Memory leaks in generate_request / parse_response
    /// - Leaks in reqwest connection handling
    /// - String accumulation in the orchestrator model
    /// - Fragmentation (jemalloc allocated vs resident)
    #[tokio::test]
    async fn stress_test_routing_determine_route() {
        let mut server = mockito::Server::new_async().await;
        let router_url = format!("{}/v1/chat/completions", server.url());

        // Canned orchestrator reply: always routes to "code_generation".
        let mock_response = serde_json::json!({
            "id": "chatcmpl-mock",
            "object": "chat.completion",
            "created": 1234567890,
            "model": "plano-orchestrator",
            "choices": [{
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": "{\"route\": \"code_generation\"}"
                },
                "finish_reason": "stop"
            }],
            "usage": {"prompt_tokens": 100, "completion_tokens": 10, "total_tokens": 110}
        });

        let _mock = server
            .mock("POST", "/v1/chat/completions")
            .with_status(200)
            .with_header("content-type", "application/json")
            .with_body(mock_response.to_string())
            .expect_at_least(1)
            .create_async()
            .await;

        let prefs = make_routing_prefs();
        let session_cache = Arc::new(MemorySessionCache::new(1000));
        let orchestrator_service = Arc::new(OrchestratorService::with_routing(
            router_url,
            "Plano-Orchestrator".to_string(),
            "plano-orchestrator".to_string(),
            Some(prefs.clone()),
            None,
            None,
            session_cache,
            None,
            2048,
        ));

        // Warm up: a few requests to stabilize allocator state
        for _ in 0..10 {
            let msgs = make_messages(5);
            let _ = orchestrator_service
                .determine_route(&msgs, None, "warmup")
                .await;
        }

        // Snapshot memory after warmup
        let baseline = get_allocated();

        let num_iterations = 2000;
        for i in 0..num_iterations {
            // Vary message count and occasionally pass inline preferences to
            // exercise both request-building paths.
            let msgs = make_messages(5 + (i % 10));
            let inline = if i % 3 == 0 {
                Some(make_routing_prefs())
            } else {
                None
            };
            let _ = orchestrator_service
                .determine_route(&msgs, inline, &format!("req-{i}"))
                .await;
        }

        let after = get_allocated();
        let growth = after.saturating_sub(baseline);
        let growth_mb = growth as f64 / (1024.0 * 1024.0);
        let per_request = if num_iterations > 0 {
            growth / num_iterations
        } else {
            0
        };

        eprintln!("=== Routing Stress Test Results ===");
        eprintln!(" Iterations: {num_iterations}");
        eprintln!(" Baseline alloc: {} bytes", baseline);
        eprintln!(" Final alloc: {} bytes", after);
        eprintln!(" Growth: {} bytes ({growth_mb:.2} MB)", growth);
        eprintln!(" Per-request: {} bytes", per_request);

        // Allow up to 256 bytes per request of retained growth (connection pool, etc.)
        // A true leak would show thousands of bytes per request.
        assert!(
            per_request < 256,
            "Possible memory leak: {per_request} bytes/request retained after {num_iterations} iterations"
        );
    }

    /// Stress test with high concurrency: many parallel determine_route calls.
    #[tokio::test]
    async fn stress_test_routing_concurrent() {
        let mut server = mockito::Server::new_async().await;
        let router_url = format!("{}/v1/chat/completions", server.url());

        let mock_response = serde_json::json!({
            "id": "chatcmpl-mock",
            "object": "chat.completion",
            "created": 1234567890,
            "model": "plano-orchestrator",
            "choices": [{
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": "{\"route\": \"summarization\"}"
                },
                "finish_reason": "stop"
            }],
            "usage": {"prompt_tokens": 100, "completion_tokens": 10, "total_tokens": 110}
        });

        let _mock = server
            .mock("POST", "/v1/chat/completions")
            .with_status(200)
            .with_header("content-type", "application/json")
            .with_body(mock_response.to_string())
            .expect_at_least(1)
            .create_async()
            .await;

        let prefs = make_routing_prefs();
        let session_cache = Arc::new(MemorySessionCache::new(1000));
        let orchestrator_service = Arc::new(OrchestratorService::with_routing(
            router_url,
            "Plano-Orchestrator".to_string(),
            "plano-orchestrator".to_string(),
            Some(prefs),
            None,
            None,
            session_cache,
            None,
            2048,
        ));

        // Warm up
        for _ in 0..20 {
            let msgs = make_messages(3);
            let _ = orchestrator_service
                .determine_route(&msgs, None, "warmup")
                .await;
        }

        let baseline = get_allocated();

        let concurrency = 50;
        let requests_per_task = 100;
        let total = concurrency * requests_per_task;

        // Fan out: each task issues its own stream of sequential requests.
        let mut handles = vec![];
        for t in 0..concurrency {
            let svc = Arc::clone(&orchestrator_service);
            let handle = tokio::spawn(async move {
                for r in 0..requests_per_task {
                    let msgs = make_messages(3 + (r % 8));
                    let _ = svc
                        .determine_route(&msgs, None, &format!("req-{t}-{r}"))
                        .await;
                }
            });
            handles.push(handle);
        }
        for h in handles {
            h.await.unwrap();
        }

        let after = get_allocated();
        let growth = after.saturating_sub(baseline);
        let per_request = growth / total;

        eprintln!("=== Concurrent Routing Stress Test Results ===");
        eprintln!(" Tasks: {concurrency} x {requests_per_task} = {total}");
        eprintln!(" Baseline: {} bytes", baseline);
        eprintln!(" Final: {} bytes", after);
        eprintln!(
            " Growth: {} bytes ({:.2} MB)",
            growth,
            growth as f64 / 1_048_576.0
        );
        eprintln!(" Per-request: {} bytes", per_request);

        assert!(
            per_request < 512,
            "Possible memory leak under concurrency: {per_request} bytes/request retained after {total} requests"
        );
    }

    // jemalloc-backed probe of currently-allocated bytes; `epoch::advance`
    // refreshes the cached stats before reading.
    #[cfg(feature = "jemalloc")]
    fn get_allocated() -> usize {
        tikv_jemalloc_ctl::epoch::advance().unwrap();
        tikv_jemalloc_ctl::stats::allocated::read().unwrap_or(0)
    }

    // Without jemalloc the probe reads 0, so growth is 0 and the leak
    // assertions are vacuous — the tests still exercise the code path.
    #[cfg(not(feature = "jemalloc"))]
    fn get_allocated() -> usize {
        0
    }
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,347 @@
//! Environment exhaustion detector. Direct port of
//! `signals/environment/exhaustion.py`.
use std::sync::OnceLock;
use regex::Regex;
use serde_json::json;
use crate::signals::analyzer::ShareGptMessage;
use crate::signals::schemas::{SignalGroup, SignalInstance, SignalType};
/// Case-insensitive regex fragments matching generic server/API failure text
/// (5xx phrases, maintenance notices, unhandled-exception wording).
pub const API_ERROR_PATTERNS: &[&str] = &[
    r"500\s*(internal\s+)?server\s+error",
    r"502\s*bad\s+gateway",
    r"503\s*service\s+unavailable",
    r"504\s*gateway\s+timeout",
    r"internal\s+server\s+error",
    r"service\s+unavailable",
    r"server\s+error",
    r"backend\s+error",
    r"upstream\s+error",
    r"service\s+temporarily\s+unavailable",
    r"maintenance\s+mode",
    r"under\s+maintenance",
    r"try\s+again\s+later",
    r"temporarily\s+unavailable",
    r"system\s+error",
    r"unexpected\s+error",
    r"unhandled\s+exception",
];

/// Fragments matching timeout wording (connect/read/request/deadline).
pub const TIMEOUT_PATTERNS: &[&str] = &[
    r"timeout",
    r"timed?\s*out",
    r"etimedout",
    r"connection\s+timed?\s*out",
    r"read\s+timed?\s*out",
    r"request\s+timed?\s*out",
    r"gateway\s+timeout",
    r"deadline\s+exceeded",
    r"took\s+too\s+long",
    r"operation\s+timed?\s*out",
    r"socket\s+timeout",
];

/// Fragments matching rate-limiting / quota / throttling wording, including
/// HTTP 429 phrasings.
pub const RATE_LIMIT_PATTERNS: &[&str] = &[
    r"rate\s+limit",
    r"rate.limited",
    r"(status|error|http)\s*:?\s*429",
    r"429\s+(too\s+many|rate|limit)",
    r"too\s+many\s+requests?",
    r"quota\s+exceeded",
    r"quota\s+limit",
    r"throttl(ed|ing)",
    r"request\s+limit",
    r"api\s+limit",
    r"calls?\s+per\s+(second|minute|hour|day)",
    r"exceeded\s+.*\s+limit",
    r"slow\s+down",
    r"retry\s+after",
    r"requests?\s+exceeded",
];

/// Fragments matching network/transport failures (DNS, sockets, TLS).
pub const NETWORK_PATTERNS: &[&str] = &[
    r"connection\s+refused",
    r"econnrefused",
    r"econnreset",
    r"connection\s+reset",
    r"enotfound",
    r"dns\s+(error|failure|lookup)",
    r"host\s+not\s+found",
    r"network\s+(error|failure|unreachable)",
    r"no\s+route\s+to\s+host",
    r"socket\s+error",
    r"connection\s+failed",
    r"unable\s+to\s+connect",
    r"cannot\s+connect",
    r"could\s+not\s+connect",
    r"connect\s+error",
    r"ssl\s+(error|handshake|certificate)",
    r"certificate\s+(error|invalid|expired)",
];

/// Fragments matching malformed/undecodable response wording (JSON parse,
/// schema validation, deserialization failures).
pub const MALFORMED_PATTERNS: &[&str] = &[
    r"json\s+parse\s+error",
    r"invalid\s+json",
    r"unexpected\s+token",
    r"syntax\s+error.*json",
    r"malformed\s+(response|json|data)",
    r"unexpected\s+end\s+of",
    r"parse\s+error",
    r"parsing\s+failed",
    r"invalid\s+response",
    r"unexpected\s+response",
    r"response\s+format",
    r"missing\s+field.*response",
    r"unexpected\s+schema",
    r"schema\s+validation",
    r"deserialization\s+error",
    r"failed\s+to\s+decode",
];

/// Fragments matching context-window / token-limit overflow wording.
pub const CONTEXT_OVERFLOW_PATTERNS: &[&str] = &[
    r"context\s+(length|limit|overflow|exceeded)",
    r"token\s+(limit|overflow|exceeded)",
    r"max(imum)?\s+tokens?",
    r"input\s+too\s+(long|large)",
    r"exceeds?\s+(context|token|character|input)\s+limit",
    r"message\s+too\s+(long|large)",
    r"content\s+too\s+(long|large)",
    r"truncat(ed|ion)\s+(due\s+to|because|for)\s+(length|size|limit)",
    r"maximum\s+context",
    r"prompt\s+too\s+(long|large)",
];
/// Combine `patterns` into a single case-insensitive alternation regex.
/// Each fragment is wrapped in its own group so alternation binds correctly.
fn compile(patterns: &[&str]) -> Regex {
    let alternatives: Vec<String> = patterns.iter().map(|p| format!("({})", p)).collect();
    let source = format!("(?i){}", alternatives.join("|"));
    Regex::new(&source).expect("exhaustion pattern regex must compile")
}
/// Lazily-compiled regex over `API_ERROR_PATTERNS`.
fn api_error_re() -> &'static Regex {
    static CELL: OnceLock<Regex> = OnceLock::new();
    CELL.get_or_init(|| compile(API_ERROR_PATTERNS))
}
/// Lazily-compiled regex over `TIMEOUT_PATTERNS`.
fn timeout_re() -> &'static Regex {
    static CELL: OnceLock<Regex> = OnceLock::new();
    CELL.get_or_init(|| compile(TIMEOUT_PATTERNS))
}
/// Lazily-compiled regex over `RATE_LIMIT_PATTERNS`.
fn rate_limit_re() -> &'static Regex {
    static CELL: OnceLock<Regex> = OnceLock::new();
    CELL.get_or_init(|| compile(RATE_LIMIT_PATTERNS))
}
/// Lazily-compiled regex over `NETWORK_PATTERNS`.
fn network_re() -> &'static Regex {
    static CELL: OnceLock<Regex> = OnceLock::new();
    CELL.get_or_init(|| compile(NETWORK_PATTERNS))
}
/// Lazily-compiled regex over `MALFORMED_PATTERNS`.
fn malformed_re() -> &'static Regex {
    static CELL: OnceLock<Regex> = OnceLock::new();
    CELL.get_or_init(|| compile(MALFORMED_PATTERNS))
}
/// Lazily-compiled regex over `CONTEXT_OVERFLOW_PATTERNS`.
fn context_overflow_re() -> &'static Regex {
    static CELL: OnceLock<Regex> = OnceLock::new();
    CELL.get_or_init(|| compile(CONTEXT_OVERFLOW_PATTERNS))
}
/// Cut a window of up to `context` bytes on each side of match `m` in `text`,
/// adding "..." on whichever sides were truncated.
fn snippet_around(text: &str, m: regex::Match<'_>, context: usize) -> String {
    let lo = align_char_boundary(text, m.start().saturating_sub(context), false);
    let hi = align_char_boundary(text, (m.end() + context).min(text.len()), true);
    let prefix = if lo > 0 { "..." } else { "" };
    let suffix = if hi < text.len() { "..." } else { "" };
    format!("{}{}{}", prefix, &text[lo..hi], suffix)
}
/// Move `idx` to the nearest UTF-8 char boundary of `s`, stepping forward or
/// backward as requested; indices at or past the end clamp to `s.len()`.
fn align_char_boundary(s: &str, mut idx: usize, forward: bool) -> usize {
    if idx >= s.len() {
        return s.len();
    }
    while !s.is_char_boundary(idx) {
        match (forward, idx) {
            (true, _) => idx += 1,
            (false, 0) => break,
            (false, _) => idx -= 1,
        }
    }
    idx
}
/// Scan `observation` messages for signs that the external environment is
/// exhausted or failing, and emit at most one signal per message.
///
/// Categories are checked in priority order — rate limit, API error, timeout,
/// network, malformed response, context overflow — and the first match wins.
pub fn analyze_exhaustion(messages: &[ShareGptMessage<'_>]) -> SignalGroup {
    let mut group = SignalGroup::new("exhaustion");
    for (i, msg) in messages.iter().enumerate() {
        // Only environment/tool output is scanned; user and agent turns are skipped.
        if msg.from != "observation" {
            continue;
        }
        let value = msg.value;
        let lower = value.to_lowercase();
        // NOTE(review): matches are located in `lower` but snippets are cut from
        // `value`; byte offsets can drift for non-ASCII text whose lowercase form
        // has a different byte length — confirm inputs are effectively ASCII.
        if let Some(m) = rate_limit_re().find(&lower) {
            group.add_signal(emit(
                SignalType::EnvironmentExhaustionRateLimit,
                i,
                snippet_around(value, m, 50),
                0.95,
                "rate_limit",
                m.as_str(),
            ));
            continue;
        }
        if let Some(m) = api_error_re().find(&lower) {
            group.add_signal(emit(
                SignalType::EnvironmentExhaustionApiError,
                i,
                snippet_around(value, m, 50),
                0.9,
                "api_error",
                m.as_str(),
            ));
            continue;
        }
        if let Some(m) = timeout_re().find(&lower) {
            group.add_signal(emit(
                SignalType::EnvironmentExhaustionTimeout,
                i,
                snippet_around(value, m, 50),
                0.9,
                "timeout",
                m.as_str(),
            ));
            continue;
        }
        if let Some(m) = network_re().find(&lower) {
            group.add_signal(emit(
                SignalType::EnvironmentExhaustionNetwork,
                i,
                snippet_around(value, m, 50),
                0.9,
                "network",
                m.as_str(),
            ));
            continue;
        }
        if let Some(m) = malformed_re().find(&lower) {
            group.add_signal(emit(
                SignalType::EnvironmentExhaustionMalformed,
                i,
                snippet_around(value, m, 50),
                0.85,
                "malformed_response",
                m.as_str(),
            ));
            continue;
        }
        // Last category checked; no trailing `continue` needed.
        if let Some(m) = context_overflow_re().find(&lower) {
            group.add_signal(emit(
                SignalType::EnvironmentExhaustionContextOverflow,
                i,
                snippet_around(value, m, 50),
                0.9,
                "context_overflow",
                m.as_str(),
            ));
        }
    }
    group
}
/// Assemble an exhaustion `SignalInstance` with its confidence and the
/// category/match metadata attached.
fn emit(
    t: SignalType,
    idx: usize,
    snippet: String,
    confidence: f32,
    kind: &str,
    matched: &str,
) -> SignalInstance {
    let metadata = json!({
        "exhaustion_type": kind,
        "matched": matched,
    });
    SignalInstance::new(t, idx, snippet)
        .with_confidence(confidence)
        .with_metadata(metadata)
}
// Unit tests for the exhaustion detectors: one observation per category,
// each asserting the expected signal type is emitted.
#[cfg(test)]
mod tests {
    use super::*;
    /// Build an `observation` message with the given body.
    fn obs(value: &str) -> ShareGptMessage<'_> {
        ShareGptMessage {
            from: "observation",
            value,
        }
    }
    #[test]
    fn detects_rate_limit() {
        let g = analyze_exhaustion(&[obs("HTTP 429: too many requests, retry after 30s")]);
        assert!(g
            .signals
            .iter()
            .any(|s| matches!(s.signal_type, SignalType::EnvironmentExhaustionRateLimit)));
    }
    #[test]
    fn detects_api_error() {
        let g = analyze_exhaustion(&[obs("503 service unavailable - try again later")]);
        assert!(g
            .signals
            .iter()
            .any(|s| matches!(s.signal_type, SignalType::EnvironmentExhaustionApiError)));
    }
    #[test]
    fn detects_timeout() {
        let g = analyze_exhaustion(&[obs("Connection timed out after 30 seconds")]);
        assert!(g
            .signals
            .iter()
            .any(|s| matches!(s.signal_type, SignalType::EnvironmentExhaustionTimeout)));
    }
    #[test]
    fn detects_network_failure() {
        let g = analyze_exhaustion(&[obs("ECONNREFUSED: connection refused by remote host")]);
        assert!(g
            .signals
            .iter()
            .any(|s| matches!(s.signal_type, SignalType::EnvironmentExhaustionNetwork)));
    }
    #[test]
    fn detects_malformed_response() {
        let g = analyze_exhaustion(&[obs("Invalid JSON: unexpected token at position 42")]);
        assert!(g
            .signals
            .iter()
            .any(|s| matches!(s.signal_type, SignalType::EnvironmentExhaustionMalformed)));
    }
    #[test]
    fn detects_context_overflow() {
        let g = analyze_exhaustion(&[obs("Maximum context length exceeded for this model")]);
        assert!(g.signals.iter().any(|s| matches!(
            s.signal_type,
            SignalType::EnvironmentExhaustionContextOverflow
        )));
    }
}

View file

@ -0,0 +1,3 @@
//! Environment signals: exhaustion (external system failures and constraints).
pub mod exhaustion;

View file

@ -0,0 +1,388 @@
//! Execution failure detector. Direct port of `signals/execution/failure.py`.
use std::sync::OnceLock;
use regex::Regex;
use serde_json::json;
use crate::signals::analyzer::ShareGptMessage;
use crate::signals::schemas::{SignalGroup, SignalInstance, SignalType};
/// Regex fragments indicating the agent passed invalid/missing arguments to a
/// tool (type errors, validation failures, out-of-range or malformed values).
pub const INVALID_ARGS_PATTERNS: &[&str] = &[
    r"invalid\s+argument",
    r"invalid\s+parameter",
    r"invalid\s+type",
    r"type\s*error",
    r"expected\s+\w+\s*,?\s*got\s+\w+",
    r"required\s+field",
    r"required\s+parameter",
    r"missing\s+required",
    r"missing\s+argument",
    r"validation\s+failed",
    r"validation\s+error",
    r"invalid\s+value",
    r"invalid\s+format",
    r"must\s+be\s+(a|an)\s+\w+",
    r"cannot\s+be\s+(null|empty|none)",
    r"is\s+not\s+valid",
    r"does\s+not\s+match",
    r"out\s+of\s+range",
    r"invalid\s+date",
    r"invalid\s+json",
    r"malformed\s+request",
];
/// Regex fragments indicating a malformed query/filter/identifier; checked
/// last in `analyze_failure` with a lower base confidence than the others.
pub const BAD_QUERY_PATTERNS: &[&str] = &[
    r"invalid\s+query",
    r"query\s+syntax\s+error",
    r"malformed\s+query",
    r"unknown\s+field",
    r"invalid\s+field",
    r"invalid\s+filter",
    r"invalid\s+search",
    r"unknown\s+id",
    r"invalid\s+id",
    r"id\s+format\s+error",
    r"invalid\s+identifier",
    r"query\s+failed",
    r"search\s+error",
    r"invalid\s+operator",
    r"unsupported\s+query",
];
/// Regex fragments indicating the agent invoked a tool/function that does
/// not exist or is not supported by the environment.
pub const TOOL_NOT_FOUND_PATTERNS: &[&str] = &[
    r"unknown\s+function",
    r"unknown\s+tool",
    r"function\s+not\s+found",
    r"tool\s+not\s+found",
    r"no\s+such\s+function",
    r"no\s+such\s+tool",
    r"undefined\s+function",
    r"action\s+not\s+supported",
    r"invalid\s+tool",
    r"invalid\s+function",
    r"unrecognized\s+function",
];
/// Regex fragments indicating authorization/authentication failures
/// (HTTP 401/403, expired tokens, insufficient permissions).
pub const AUTH_MISUSE_PATTERNS: &[&str] = &[
    r"\bunauthorized\b",
    r"(status|error|http|code)\s*:?\s*401",
    r"401\s+unauthorized",
    r"403\s+forbidden",
    r"permission\s+denied",
    r"access\s+denied",
    r"authentication\s+required",
    r"invalid\s+credentials",
    r"invalid\s+token",
    r"token\s+expired",
    r"missing\s+authorization",
    r"\bforbidden\b",
    r"not\s+authorized",
    r"insufficient\s+permissions?",
];
/// Regex fragments indicating operations performed in the wrong order or an
/// invalid state (precondition failures, HTTP 409 conflicts, "must X first").
pub const STATE_ERROR_PATTERNS: &[&str] = &[
    r"invalid\s+state",
    r"illegal\s+state",
    r"must\s+call\s+\w+\s+first",
    r"must\s+\w+\s+before",
    r"cannot\s+\w+\s+before",
    r"already\s+(exists?|created|started|finished)",
    r"not\s+initialized",
    r"not\s+started",
    r"already\s+in\s+progress",
    r"operation\s+in\s+progress",
    r"sequence\s+error",
    r"precondition\s+failed",
    r"(status|error|http)\s*:?\s*409",
    r"409\s+conflict",
    r"\bconflict\b",
];
/// Combine `patterns` into one alternation regex; the `(?i)` prefix mirrors
/// Python's `re.IGNORECASE` from the reference implementation.
fn compile(patterns: &[&str]) -> Regex {
    let alternatives: Vec<String> = patterns.iter().map(|p| format!("({})", p)).collect();
    let source = format!("(?i){}", alternatives.join("|"));
    Regex::new(&source).expect("failure pattern regex must compile")
}
/// Lazily-compiled regex over `INVALID_ARGS_PATTERNS`.
fn invalid_args_re() -> &'static Regex {
    static CELL: OnceLock<Regex> = OnceLock::new();
    CELL.get_or_init(|| compile(INVALID_ARGS_PATTERNS))
}
/// Lazily-compiled regex over `BAD_QUERY_PATTERNS`.
fn bad_query_re() -> &'static Regex {
    static CELL: OnceLock<Regex> = OnceLock::new();
    CELL.get_or_init(|| compile(BAD_QUERY_PATTERNS))
}
/// Lazily-compiled regex over `TOOL_NOT_FOUND_PATTERNS`.
fn tool_not_found_re() -> &'static Regex {
    static CELL: OnceLock<Regex> = OnceLock::new();
    CELL.get_or_init(|| compile(TOOL_NOT_FOUND_PATTERNS))
}
/// Lazily-compiled regex over `AUTH_MISUSE_PATTERNS`.
fn auth_misuse_re() -> &'static Regex {
    static CELL: OnceLock<Regex> = OnceLock::new();
    CELL.get_or_init(|| compile(AUTH_MISUSE_PATTERNS))
}
/// Lazily-compiled regex over `STATE_ERROR_PATTERNS`.
fn state_error_re() -> &'static Regex {
    static CELL: OnceLock<Regex> = OnceLock::new();
    CELL.get_or_init(|| compile(STATE_ERROR_PATTERNS))
}
/// Pull `(tool_name, args)` out of a `function_call` message body. Mirrors
/// `_extract_tool_info` in the Python reference.
///
/// A JSON object is read via its `name`/`function` and `arguments`/`args`
/// keys; anything unparseable yields `("unknown", first-200-chars)`.
pub(crate) fn extract_tool_info(value: &str) -> (String, String) {
    let fallback = || {
        let head: String = value.chars().take(200).collect();
        ("unknown".to_string(), head)
    };
    let parsed = match serde_json::from_str::<serde_json::Value>(value) {
        Ok(v) => v,
        Err(_) => return fallback(),
    };
    let obj = match parsed.as_object() {
        Some(o) => o,
        None => return fallback(),
    };
    let name = obj
        .get("name")
        .or_else(|| obj.get("function"))
        .and_then(serde_json::Value::as_str)
        .unwrap_or("unknown")
        .to_string();
    let args = match obj.get("arguments").or_else(|| obj.get("args")) {
        None => String::new(),
        Some(serde_json::Value::Object(map)) => {
            serde_json::to_string(&serde_json::Value::Object(map.clone())).unwrap_or_default()
        }
        Some(other) => match other.as_str() {
            Some(s) => s.to_string(),
            None => serde_json::to_string(other).unwrap_or_default(),
        },
    };
    (name, args)
}
/// Build a context-window snippet around regex match `m`, adding "..." on
/// whichever sides were truncated. Mirrors `_get_snippet`; cuts are snapped
/// to UTF-8 boundaries before slicing.
fn snippet_around(text: &str, m: regex::Match<'_>, context: usize) -> String {
    let lo = align_char_boundary(text, m.start().saturating_sub(context), false);
    let hi = align_char_boundary(text, (m.end() + context).min(text.len()), true);
    let lead = if lo > 0 { "..." } else { "" };
    let tail = if hi < text.len() { "..." } else { "" };
    format!("{}{}{}", lead, &text[lo..hi], tail)
}
/// Snap `idx` to a valid UTF-8 char boundary of `s`, stepping in the given
/// direction; out-of-range indices clamp to `s.len()`.
fn align_char_boundary(s: &str, mut idx: usize, forward: bool) -> usize {
    if idx >= s.len() {
        return s.len();
    }
    loop {
        if s.is_char_boundary(idx) {
            return idx;
        }
        if forward {
            idx += 1;
        } else if idx == 0 {
            return 0;
        } else {
            idx -= 1;
        }
    }
}
/// Scan `function_call`/`observation` pairs for agent-caused tool failures.
///
/// Each failing observation is attributed to the most recent `function_call`
/// and classified into one category, checked in priority order: invalid args,
/// tool not found, auth misuse, state error, bad query. Bad-query confidence
/// is boosted when an explicit error word also appears in the text.
pub fn analyze_failure(messages: &[ShareGptMessage<'_>]) -> SignalGroup {
    let mut group = SignalGroup::new("failure");
    let mut last_call: Option<(usize, String, String)> = None;
    for (i, msg) in messages.iter().enumerate() {
        match msg.from {
            "function_call" => {
                // Remember the call so following observations can be attributed.
                let (name, args) = extract_tool_info(msg.value);
                last_call = Some((i, name, args));
                continue;
            }
            "observation" => {}
            _ => continue,
        }
        let value = msg.value;
        let lower = value.to_lowercase();
        // NOTE(review): matches are found in `lower` but snippets are cut from
        // `value`; byte offsets can drift for non-ASCII lowercase expansions —
        // confirm inputs are effectively ASCII.
        // Without a preceding call, fall back to the previous message index.
        let (call_index, tool_name) = match &last_call {
            Some((idx, name, _)) => (*idx, name.clone()),
            None => (i.saturating_sub(1), "unknown".to_string()),
        };
        if let Some(m) = invalid_args_re().find(&lower) {
            group.add_signal(
                SignalInstance::new(
                    SignalType::ExecutionFailureInvalidArgs,
                    i,
                    snippet_around(value, m, 50),
                )
                .with_confidence(0.9)
                .with_metadata(json!({
                    "tool_name": tool_name,
                    "call_index": call_index,
                    "error_type": "invalid_args",
                    "matched": m.as_str(),
                })),
            );
            continue;
        }
        if let Some(m) = tool_not_found_re().find(&lower) {
            group.add_signal(
                SignalInstance::new(
                    SignalType::ExecutionFailureToolNotFound,
                    i,
                    snippet_around(value, m, 50),
                )
                .with_confidence(0.95)
                .with_metadata(json!({
                    "tool_name": tool_name,
                    "call_index": call_index,
                    "error_type": "tool_not_found",
                    "matched": m.as_str(),
                })),
            );
            continue;
        }
        if let Some(m) = auth_misuse_re().find(&lower) {
            group.add_signal(
                SignalInstance::new(
                    SignalType::ExecutionFailureAuthMisuse,
                    i,
                    snippet_around(value, m, 50),
                )
                .with_confidence(0.8)
                .with_metadata(json!({
                    "tool_name": tool_name,
                    "call_index": call_index,
                    "error_type": "auth_misuse",
                    "matched": m.as_str(),
                })),
            );
            continue;
        }
        if let Some(m) = state_error_re().find(&lower) {
            group.add_signal(
                SignalInstance::new(
                    SignalType::ExecutionFailureStateError,
                    i,
                    snippet_around(value, m, 50),
                )
                .with_confidence(0.85)
                .with_metadata(json!({
                    "tool_name": tool_name,
                    "call_index": call_index,
                    "error_type": "state_error",
                    "matched": m.as_str(),
                })),
            );
            continue;
        }
        if let Some(m) = bad_query_re().find(&lower) {
            // Higher confidence when the text also carries an explicit error word.
            let confidence = if ["error", "invalid", "failed"]
                .iter()
                .any(|w| lower.contains(w))
            {
                0.8
            } else {
                0.6
            };
            group.add_signal(
                SignalInstance::new(
                    SignalType::ExecutionFailureBadQuery,
                    i,
                    snippet_around(value, m, 50),
                )
                .with_confidence(confidence)
                .with_metadata(json!({
                    "tool_name": tool_name,
                    "call_index": call_index,
                    "error_type": "bad_query",
                    "matched": m.as_str(),
                })),
            );
        }
    }
    group
}
// Unit tests for the failure detectors: a call followed by an error
// observation, asserting the expected failure category is emitted.
#[cfg(test)]
mod tests {
    use super::*;
    /// Build a `function_call` message with the given body.
    fn fc(value: &str) -> ShareGptMessage<'_> {
        ShareGptMessage {
            from: "function_call",
            value,
        }
    }
    /// Build an `observation` message with the given body.
    fn obs(value: &str) -> ShareGptMessage<'_> {
        ShareGptMessage {
            from: "observation",
            value,
        }
    }
    #[test]
    fn detects_invalid_args() {
        let msgs = vec![
            fc(r#"{"name":"create_user","arguments":{"age":"twelve"}}"#),
            obs("Error: validation failed - expected integer got string for field age"),
        ];
        let g = analyze_failure(&msgs);
        assert!(g
            .signals
            .iter()
            .any(|s| matches!(s.signal_type, SignalType::ExecutionFailureInvalidArgs)));
    }
    #[test]
    fn detects_tool_not_found() {
        let msgs = vec![
            fc(r#"{"name":"send_thought","arguments":{}}"#),
            obs("Error: unknown function 'send_thought'"),
        ];
        let g = analyze_failure(&msgs);
        assert!(g
            .signals
            .iter()
            .any(|s| matches!(s.signal_type, SignalType::ExecutionFailureToolNotFound)));
    }
    #[test]
    fn detects_auth_misuse() {
        let msgs = vec![
            fc(r#"{"name":"get_secret","arguments":{}}"#),
            obs("HTTP 401 Unauthorized"),
        ];
        let g = analyze_failure(&msgs);
        assert!(g
            .signals
            .iter()
            .any(|s| matches!(s.signal_type, SignalType::ExecutionFailureAuthMisuse)));
    }
    #[test]
    fn detects_state_error() {
        let msgs = vec![
            fc(r#"{"name":"commit_tx","arguments":{}}"#),
            obs("must call begin_tx first"),
        ];
        let g = analyze_failure(&msgs);
        assert!(g
            .signals
            .iter()
            .any(|s| matches!(s.signal_type, SignalType::ExecutionFailureStateError)));
    }
}

View file

@ -0,0 +1,433 @@
//! Execution loops detector. Direct port of `signals/execution/loops.py`.
use serde_json::json;
use crate::signals::analyzer::ShareGptMessage;
use crate::signals::schemas::{SignalGroup, SignalInstance, SignalType};
/// Minimum consecutive identical calls (same tool, same args) to flag a retry loop.
pub const RETRY_THRESHOLD: usize = 3;
/// Minimum consecutive same-tool calls (with >=2 distinct arg sets) to flag drift.
pub const PARAMETER_DRIFT_THRESHOLD: usize = 3;
/// Minimum repeated full cycles to flag an oscillation pattern.
pub const OSCILLATION_CYCLES_THRESHOLD: usize = 3;
// A `function_call` message parsed into tool name and canonicalized arguments.
#[derive(Debug, Clone)]
pub struct ToolCall {
    // Position of the originating message within the conversation.
    pub index: usize,
    // Tool/function name; "unknown" when the message could not be parsed.
    pub name: String,
    /// Canonical JSON string of arguments (sorted keys when parseable).
    pub args: String,
    // Parsed argument object when the arguments were a JSON object.
    pub args_dict: Option<serde_json::Map<String, serde_json::Value>>,
}
impl ToolCall {
    /// Compare arguments structurally when both sides parsed to JSON objects;
    /// otherwise fall back to string equality on the canonical form.
    pub fn args_equal(&self, other: &ToolCall) -> bool {
        match (&self.args_dict, &other.args_dict) {
            (Some(a), Some(b)) => a == b,
            _ => self.args == other.args,
        }
    }
}
/// Parse a `function_call` message into a `ToolCall`; returns `None` for any
/// other role.
///
/// Three formats are tried in order:
/// 1. a JSON object with `name`/`function` and `arguments`/`args` keys
///    (object arguments are canonicalized by sorting keys);
/// 2. a textual `name(args)` form, whose parenthesized args may themselves
///    be a JSON object;
/// 3. anything else becomes a bare tool name with empty args.
fn parse_tool_call(index: usize, msg: &ShareGptMessage<'_>) -> Option<ToolCall> {
    if msg.from != "function_call" {
        return None;
    }
    let value = msg.value;
    if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(value) {
        if let Some(obj) = parsed.as_object() {
            let name = obj
                .get("name")
                .or_else(|| obj.get("function"))
                .and_then(|v| v.as_str())
                .map(|s| s.to_string())
                .unwrap_or_else(|| "unknown".to_string());
            let raw_args = obj.get("arguments").or_else(|| obj.get("args"));
            let (args_str, args_dict) = match raw_args {
                Some(serde_json::Value::Object(o)) => {
                    // Sort keys so identical argument sets serialize identically.
                    let mut keys: Vec<&String> = o.keys().collect();
                    keys.sort();
                    let mut canon = serde_json::Map::new();
                    for k in keys {
                        canon.insert(k.clone(), o[k].clone());
                    }
                    (
                        serde_json::to_string(&serde_json::Value::Object(canon.clone()))
                            .unwrap_or_default(),
                        Some(canon),
                    )
                }
                Some(other) => (
                    other
                        .as_str()
                        .map(|s| s.to_string())
                        .unwrap_or_else(|| serde_json::to_string(other).unwrap_or_default()),
                    None,
                ),
                None => (String::new(), None),
            };
            return Some(ToolCall {
                index,
                name,
                args: args_str,
                args_dict,
            });
        }
    }
    // Fallback: textual `name(args)` form.
    if let Some(paren) = value.find('(') {
        if paren > 0 {
            let name = value[..paren].trim().to_string();
            let args_part = &value[paren..];
            if args_part.starts_with('(') && args_part.ends_with(')') {
                let inner = args_part[1..args_part.len() - 1].trim();
                if let Ok(serde_json::Value::Object(o)) =
                    serde_json::from_str::<serde_json::Value>(inner)
                {
                    // Same key-sorting canonicalization as the JSON path above.
                    let mut keys: Vec<&String> = o.keys().collect();
                    keys.sort();
                    let mut canon = serde_json::Map::new();
                    for k in keys {
                        canon.insert(k.clone(), o[k].clone());
                    }
                    return Some(ToolCall {
                        index,
                        name,
                        args: serde_json::to_string(&serde_json::Value::Object(canon.clone()))
                            .unwrap_or_default(),
                        args_dict: Some(canon),
                    });
                }
                return Some(ToolCall {
                    index,
                    name,
                    args: inner.to_string(),
                    args_dict: None,
                });
            }
            // Unbalanced parens: keep the raw remainder (including '(') as args.
            return Some(ToolCall {
                index,
                name,
                args: args_part.to_string(),
                args_dict: None,
            });
        }
    }
    // Last resort: treat the whole value as a bare tool name.
    Some(ToolCall {
        index,
        name: value.trim().to_string(),
        args: String::new(),
        args_dict: None,
    })
}
/// Collect every parseable `function_call` in order of appearance.
fn extract_tool_calls(messages: &[ShareGptMessage<'_>]) -> Vec<ToolCall> {
    messages
        .iter()
        .enumerate()
        .filter_map(|(i, msg)| parse_tool_call(i, msg))
        .collect()
}
/// Find runs of `RETRY_THRESHOLD`+ consecutive calls to the same tool with
/// identical arguments. Returns `(start_msg_index, end_msg_index, tool_name)`.
fn detect_retry(calls: &[ToolCall]) -> Vec<(usize, usize, String)> {
    let mut runs = Vec::new();
    if calls.len() < RETRY_THRESHOLD {
        return runs;
    }
    let mut start = 0;
    while start < calls.len() {
        let anchor = &calls[start];
        // Extend the run while name and args keep matching the anchor call.
        let mut end = start + 1;
        while end < calls.len()
            && calls[end].name == anchor.name
            && calls[end].args_equal(anchor)
        {
            end += 1;
        }
        if end - start >= RETRY_THRESHOLD {
            runs.push((calls[start].index, calls[end - 1].index, anchor.name.clone()));
            start = end;
        } else {
            start += 1;
        }
    }
    runs
}
/// Find runs of `PARAMETER_DRIFT_THRESHOLD`+ consecutive calls to the same
/// tool with at least two distinct argument sets. Returns
/// `(start_msg_index, end_msg_index, tool_name, distinct_arg_count)`.
fn detect_parameter_drift(calls: &[ToolCall]) -> Vec<(usize, usize, String, usize)> {
    let mut found = Vec::new();
    if calls.len() < PARAMETER_DRIFT_THRESHOLD {
        return found;
    }
    let mut start = 0;
    while start < calls.len() {
        let name = calls[start].name.clone();
        // Track distinct canonical arg strings seen across the run.
        let mut distinct: Vec<&str> = vec![calls[start].args.as_str()];
        let mut end = start + 1;
        while end < calls.len() && calls[end].name == name {
            let args = calls[end].args.as_str();
            if !distinct.contains(&args) {
                distinct.push(args);
            }
            end += 1;
        }
        if end - start >= PARAMETER_DRIFT_THRESHOLD && distinct.len() >= 2 {
            found.push((
                calls[start].index,
                calls[end - 1].index,
                name,
                distinct.len(),
            ));
            start = end;
        } else {
            start += 1;
        }
    }
    found
}
/// Detect alternating tool-call cycles (e.g. A→B repeated A→B→A→B…).
///
/// At each start position, candidate pattern lengths 2..=5 are tried; a
/// pattern must contain at least two distinct tool names and repeat for at
/// least `OSCILLATION_CYCLES_THRESHOLD` full cycles. Overlapping detections
/// are collapsed by `deduplicate_patterns`. Returns
/// `(start_msg_index, end_msg_index, pattern_names, cycle_count)` tuples.
fn detect_oscillation(calls: &[ToolCall]) -> Vec<(usize, usize, Vec<String>, usize)> {
    let min_calls = 2 * OSCILLATION_CYCLES_THRESHOLD;
    if calls.len() < min_calls {
        return Vec::new();
    }
    let mut patterns = Vec::new();
    let mut i: usize = 0;
    while i + min_calls <= calls.len() {
        let max_pat_len = (5usize).min(calls.len() - i);
        let mut found_for_i = false;
        for pat_len in 2..=max_pat_len {
            let pattern_names: Vec<String> =
                (0..pat_len).map(|k| calls[i + k].name.clone()).collect();
            // A "cycle" must involve at least two different tools.
            let unique: std::collections::HashSet<&String> = pattern_names.iter().collect();
            if unique.len() < 2 {
                continue;
            }
            // Count how many times the candidate pattern repeats back-to-back.
            let mut cycles = 1;
            let mut pos = i + pat_len;
            while pos + pat_len <= calls.len() {
                let mut all_match = true;
                for k in 0..pat_len {
                    if calls[pos + k].name != pattern_names[k] {
                        all_match = false;
                        break;
                    }
                }
                if all_match {
                    cycles += 1;
                    pos += pat_len;
                } else {
                    break;
                }
            }
            if cycles >= OSCILLATION_CYCLES_THRESHOLD {
                let end_idx_in_calls = i + (cycles * pat_len) - 1;
                patterns.push((
                    calls[i].index,
                    calls[end_idx_in_calls].index,
                    pattern_names,
                    cycles,
                ));
                // Mirror Python: `i = end_idx + 1 - pattern_len`. We set `i` so that
                // the next outer iteration begins after we account for overlap.
                i = end_idx_in_calls + 1 - pat_len;
                found_for_i = true;
                break;
            }
        }
        if !found_for_i {
            i += 1;
        } else {
            // Match Python's `i = end_idx + 1 - pattern_len; break` then loop.
            // We'll continue; the outer while re-checks i.
        }
    }
    if patterns.len() > 1 {
        patterns = deduplicate_patterns(patterns);
    }
    patterns
}
/// Drop oscillation detections that overlap an earlier (or longer) one.
/// Patterns are sorted by start ascending, ties broken by longer span first,
/// then greedily kept when they begin after the last accepted span's end.
fn deduplicate_patterns(
    mut patterns: Vec<(usize, usize, Vec<String>, usize)>,
) -> Vec<(usize, usize, Vec<String>, usize)> {
    if patterns.is_empty() {
        return patterns;
    }
    patterns.sort_by(|a, b| a.0.cmp(&b.0).then_with(|| (b.1 - b.0).cmp(&(a.1 - a.0))));
    let mut kept = Vec::new();
    let mut last_end: i64 = -1;
    for pat in patterns {
        if (pat.0 as i64) > last_end {
            last_end = pat.1 as i64;
            kept.push(pat);
        }
    }
    kept
}
/// Run all loop detectors (retry, parameter drift, oscillation) over the
/// conversation's tool calls and collect their signals into one group.
/// Drift windows that overlap a detected retry window are suppressed.
pub fn analyze_loops(messages: &[ShareGptMessage<'_>]) -> SignalGroup {
    let mut group = SignalGroup::new("loops");
    let calls = extract_tool_calls(messages);
    // Too few calls to form any loop pattern.
    if calls.len() < RETRY_THRESHOLD {
        return group;
    }
    let retries = detect_retry(&calls);
    for (start_idx, end_idx, tool_name) in &retries {
        // Indices are message positions; count the calls inside the window.
        let call_count = calls
            .iter()
            .filter(|c| *start_idx <= c.index && c.index <= *end_idx)
            .count();
        group.add_signal(
            SignalInstance::new(
                SignalType::ExecutionLoopsRetry,
                *start_idx,
                format!(
                    "Tool '{}' called {} times with identical arguments",
                    tool_name, call_count
                ),
            )
            .with_confidence(0.95)
            .with_metadata(json!({
                "tool_name": tool_name,
                "start_index": start_idx,
                "end_index": end_idx,
                "call_count": call_count,
                "loop_type": "retry",
            })),
        );
    }
    let drifts = detect_parameter_drift(&calls);
    for (start_idx, end_idx, tool_name, variation_count) in &drifts {
        // Skip drift windows already reported as retry loops.
        let overlaps_retry = retries
            .iter()
            .any(|r| !(*end_idx < r.0 || *start_idx > r.1));
        if overlaps_retry {
            continue;
        }
        let call_count = calls
            .iter()
            .filter(|c| *start_idx <= c.index && c.index <= *end_idx)
            .count();
        group.add_signal(
            SignalInstance::new(
                SignalType::ExecutionLoopsParameterDrift,
                *start_idx,
                format!(
                    "Tool '{}' called {} times with {} different argument variations",
                    tool_name, call_count, variation_count
                ),
            )
            .with_confidence(0.85)
            .with_metadata(json!({
                "tool_name": tool_name,
                "start_index": start_idx,
                "end_index": end_idx,
                "call_count": call_count,
                "variation_count": variation_count,
                "loop_type": "parameter_drift",
            })),
        );
    }
    let oscillations = detect_oscillation(&calls);
    for (start_idx, end_idx, tool_names, cycle_count) in &oscillations {
        // U+2192 is a rightwards arrow: "toolA → toolB".
        let pattern_str = tool_names.join(" \u{2192} ");
        group.add_signal(
            SignalInstance::new(
                SignalType::ExecutionLoopsOscillation,
                *start_idx,
                format!(
                    "Oscillation pattern [{}] repeated {} times",
                    pattern_str, cycle_count
                ),
            )
            .with_confidence(0.9)
            .with_metadata(json!({
                "pattern": tool_names,
                "start_index": start_idx,
                "end_index": end_idx,
                "cycle_count": cycle_count,
                "loop_type": "oscillation",
            })),
        );
    }
    group
}
// Unit tests for the loop detectors: retry, drift, oscillation, and the
// below-threshold no-signal case.
#[cfg(test)]
mod tests {
    use super::*;
    /// Build a `function_call` message with the given body.
    fn fc(value: &str) -> ShareGptMessage<'_> {
        ShareGptMessage {
            from: "function_call",
            value,
        }
    }
    #[test]
    fn detects_retry_loop() {
        let arg = r#"{"name":"check_status","arguments":{"id":"abc"}}"#;
        let msgs = vec![fc(arg), fc(arg), fc(arg), fc(arg)];
        let g = analyze_loops(&msgs);
        assert!(g
            .signals
            .iter()
            .any(|s| matches!(s.signal_type, SignalType::ExecutionLoopsRetry)));
    }
    #[test]
    fn detects_parameter_drift() {
        let msgs = vec![
            fc(r#"{"name":"search","arguments":{"q":"a"}}"#),
            fc(r#"{"name":"search","arguments":{"q":"ab"}}"#),
            fc(r#"{"name":"search","arguments":{"q":"abc"}}"#),
            fc(r#"{"name":"search","arguments":{"q":"abcd"}}"#),
        ];
        let g = analyze_loops(&msgs);
        assert!(g
            .signals
            .iter()
            .any(|s| matches!(s.signal_type, SignalType::ExecutionLoopsParameterDrift)));
    }
    #[test]
    fn detects_oscillation() {
        let a = r#"{"name":"toolA","arguments":{}}"#;
        let b = r#"{"name":"toolB","arguments":{}}"#;
        let msgs = vec![fc(a), fc(b), fc(a), fc(b), fc(a), fc(b)];
        let g = analyze_loops(&msgs);
        assert!(g
            .signals
            .iter()
            .any(|s| matches!(s.signal_type, SignalType::ExecutionLoopsOscillation)));
    }
    #[test]
    fn no_signals_when_few_calls() {
        let msgs = vec![fc(r#"{"name":"only_once","arguments":{}}"#)];
        let g = analyze_loops(&msgs);
        assert!(g.signals.is_empty());
    }
}

View file

@ -0,0 +1,5 @@
//! Execution signals: failure (agent-caused tool errors) and loops
//! (repetitive tool-call behavior).
pub mod failure;
pub mod loops;

View file

@ -0,0 +1,193 @@
//! Shared constants for the interaction layer detectors.
//!
//! Direct port of `signals/interaction/constants.py`.
use std::collections::HashSet;
use std::sync::OnceLock;
/// Prefixes that mark a user turn as positive/affirmative; used to suppress
/// frustration heuristics (e.g. excessive punctuation) on friendly messages.
pub const POSITIVE_PREFIXES: &[&str] = &[
    "yes",
    "yeah",
    "yep",
    "yup",
    "sure",
    "ok",
    "okay",
    "great",
    "awesome",
    "perfect",
    "thanks",
    "thank",
    "wonderful",
    "excellent",
    "amazing",
    "nice",
    "good",
    "cool",
    "absolutely",
    "definitely",
    "please",
];
/// Prefixes that mark a user turn as confirming the agent's previous
/// statement (both apostrophe and apostrophe-free spellings).
pub const CONFIRMATION_PREFIXES: &[&str] = &[
    "yes",
    "yeah",
    "yep",
    "yup",
    "correct",
    "right",
    "that's correct",
    "thats correct",
    "that's right",
    "thats right",
    "that is correct",
    "that is right",
];
/// English stopwords used for token filtering, in the original reference
/// order (125 entries).
const STOPWORD_LIST: &[&str] = &[
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
    "any", "are", "as", "at", "be", "because", "been", "before", "being", "below",
    "between", "both", "but", "by", "can", "could", "did", "do", "does", "doing",
    "down", "during", "each", "few", "for", "from", "further", "had", "has", "have",
    "having", "he", "her", "here", "hers", "herself", "him", "himself", "his", "how",
    "i", "if", "in", "into", "is", "it", "its", "itself", "just", "me",
    "more", "most", "my", "myself", "no", "nor", "not", "now", "of", "off",
    "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over",
    "own", "same", "she", "should", "so", "some", "such", "than", "that", "the",
    "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those",
    "through", "to", "too", "under", "until", "up", "very", "was", "we", "were",
    "what", "when", "where", "which", "while", "who", "whom", "why", "with", "would",
    "you", "your", "yours", "yourself", "yourselves",
];
/// Cached stopword set, built from `STOPWORD_LIST` on first access.
pub fn stopwords() -> &'static HashSet<&'static str> {
    static CACHE: OnceLock<HashSet<&'static str>> = OnceLock::new();
    CACHE.get_or_init(|| HashSet::from_iter(STOPWORD_LIST.iter().copied()))
}
/// Returns true if `text` (case-insensitive, leading-whitespace-trimmed)
/// starts with any of the given prefixes treated as **whole tokens or token
/// sequences**.
///
/// This keeps the Python reference's `text_lower.startswith(prefix)` intent
/// while enforcing the documented word boundary: `"please"` matches
/// `"please help"` but no longer fires on `"pleased"`, and `"yes"` no longer
/// fires on `"yesterday"`. (The original implementation used a plain
/// `starts_with`, contradicting its own doc comment.)
pub fn starts_with_prefix(text: &str, prefixes: &[&str]) -> bool {
    let lowered = text.to_lowercase();
    let trimmed = lowered.trim_start();
    for prefix in prefixes {
        if let Some(rest) = trimmed.strip_prefix(prefix) {
            // Whole-token check: the prefix must not continue into a longer
            // word — end of text or a non-alphanumeric next char both qualify.
            if rest.chars().next().map_or(true, |c| !c.is_alphanumeric()) {
                return true;
            }
        }
    }
    false
}

View file

@ -0,0 +1,445 @@
//! Disengagement signals: escalation, quit, negative stance.
//!
//! Direct port of `signals/interaction/disengagement.py`.
use std::sync::OnceLock;
use regex::Regex;
use serde_json::json;
use super::constants::{starts_with_prefix, POSITIVE_PREFIXES};
use crate::signals::schemas::{SignalGroup, SignalInstance, SignalType};
use crate::signals::text_processing::{normalize_patterns, NormalizedMessage, NormalizedPattern};
/// Phrases a user uses to ask for a human or an escalation, grouped by the
/// requested target; matched fuzzily via `normalize_patterns`.
const ESCALATION_PATTERN_TEXTS: &[&str] = &[
    // Human requests
    "speak to a human", "talk to a human", "connect me to a human",
    "connect me with a human", "transfer me to a human", "get me a human",
    "chat with a human",
    // Person requests
    "speak to a person", "talk to a person", "connect me to a person",
    "connect me with a person", "transfer me to a person", "get me a person",
    "chat with a person",
    // Real person requests
    "speak to a real person", "talk to a real person", "connect me to a real person",
    "connect me with a real person", "transfer me to a real person",
    "get me a real person", "chat with a real person",
    // Actual person requests
    "speak to an actual person", "talk to an actual person",
    "connect me to an actual person", "connect me with an actual person",
    "transfer me to an actual person", "get me an actual person",
    "chat with an actual person",
    // Supervisor requests
    "speak to a supervisor", "talk to a supervisor", "connect me to a supervisor",
    "connect me with a supervisor", "transfer me to a supervisor",
    "get me a supervisor", "chat with a supervisor",
    // Manager requests
    "speak to a manager", "talk to a manager", "connect me to a manager",
    "connect me with a manager", "transfer me to a manager", "get me a manager",
    "chat with a manager",
    // Customer service requests
    "speak to customer service", "talk to customer service",
    "connect me to customer service", "connect me with customer service",
    "transfer me to customer service", "get me customer service",
    "chat with customer service",
    // Customer support requests
    "speak to customer support", "talk to customer support",
    "connect me to customer support", "connect me with customer support",
    "transfer me to customer support", "get me customer support",
    "chat with customer support",
    // Support requests
    "speak to support", "talk to support", "connect me to support",
    "connect me with support", "transfer me to support", "get me support",
    "chat with support",
    // Tech support requests
    "speak to tech support", "talk to tech support", "connect me to tech support",
    "connect me with tech support", "transfer me to tech support",
    "get me tech support", "chat with tech support",
    // Help desk requests
    "speak to help desk", "talk to help desk", "connect me to help desk",
    "connect me with help desk", "transfer me to help desk", "get me help desk",
    "chat with help desk",
    // Explicit escalation
    "escalate this",
];
/// Phrases signalling the user is giving up or abandoning the interaction.
const QUIT_PATTERN_TEXTS: &[&str] = &[
    "i give up", "i'm giving up", "im giving up", "i'm going to quit", "i quit",
    "forget it", "forget this", "screw it", "screw this",
    "don't bother trying", "don't bother with this", "don't bother with it",
    "don't even bother", "why bother", "not worth it", "this is hopeless",
    "going elsewhere", "try somewhere else", "look elsewhere",
];
/// Phrases expressing frustration or a negative stance toward the agent
/// (both apostrophe and apostrophe-free spellings included).
const NEGATIVE_STANCE_PATTERN_TEXTS: &[&str] = &[
    "this is useless", "not helpful", "doesn't help", "not helping",
    "you're not helping", "youre not helping", "this doesn't work",
    "this doesnt work", "this isn't working", "this isnt working",
    "still doesn't work", "still doesnt work", "still not working",
    "still isn't working", "still isnt working", "waste of time",
    "wasting my time", "this is ridiculous", "this is absurd", "this is insane",
    "this is stupid", "this is dumb", "this sucks", "this is frustrating",
    "not good enough", "why can't you", "why cant you", "same issue",
    "did that already", "done that already", "tried that already",
    "already tried that", "i've done that", "ive done that", "i've tried that",
    "ive tried that", "i'm disappointed", "im disappointed",
    "disappointed with you", "disappointed in you", "useless bot", "dumb bot",
    "stupid bot",
];
/// Profanity explicitly directed at the agent, including censored/asterisked
/// spellings and common abbreviations.
const AGENT_DIRECTED_PROFANITY_PATTERN_TEXTS: &[&str] = &[
    "this is bullshit", "what bullshit", "such bullshit", "total bullshit",
    "complete bullshit", "this is crap", "what crap", "this is shit",
    "what the hell is wrong with you", "what the fuck is wrong with you",
    "you're fucking useless", "youre fucking useless", "you are fucking useless",
    "fucking useless", "this bot is shit", "this bot is crap", "damn bot",
    "fucking bot", "stupid fucking", "are you fucking kidding",
    "wtf is wrong with you", "wtf is this", "ffs just", "for fucks sake",
    "for fuck's sake", "what the f**k", "what the f*ck", "what the f***",
    "that's bullsh*t", "thats bullsh*t", "that's bull***t", "thats bull***t",
    "that's bs", "thats bs", "this is bullsh*t", "this is bull***t", "this is bs",
];
/// Normalized escalation phrases, built once on first use.
fn escalation_patterns() -> &'static Vec<NormalizedPattern> {
    static CELL: OnceLock<Vec<NormalizedPattern>> = OnceLock::new();
    CELL.get_or_init(|| normalize_patterns(ESCALATION_PATTERN_TEXTS))
}
/// Normalized quit phrases, built once on first use.
fn quit_patterns() -> &'static Vec<NormalizedPattern> {
    static CELL: OnceLock<Vec<NormalizedPattern>> = OnceLock::new();
    CELL.get_or_init(|| normalize_patterns(QUIT_PATTERN_TEXTS))
}
/// Normalized negative-stance phrases, built once on first use.
fn negative_stance_patterns() -> &'static Vec<NormalizedPattern> {
    static CELL: OnceLock<Vec<NormalizedPattern>> = OnceLock::new();
    CELL.get_or_init(|| normalize_patterns(NEGATIVE_STANCE_PATTERN_TEXTS))
}
/// Normalized agent-directed profanity phrases, built once on first use.
fn profanity_patterns() -> &'static Vec<NormalizedPattern> {
    static CELL: OnceLock<Vec<NormalizedPattern>> = OnceLock::new();
    CELL.get_or_init(|| normalize_patterns(AGENT_DIRECTED_PROFANITY_PATTERN_TEXTS))
}
/// Two or more consecutive question marks.
fn re_consecutive_q() -> &'static Regex {
    static CELL: OnceLock<Regex> = OnceLock::new();
    CELL.get_or_init(|| Regex::new(r"\?{2,}").unwrap())
}
/// Two or more consecutive exclamation marks.
fn re_consecutive_e() -> &'static Regex {
    static CELL: OnceLock<Regex> = OnceLock::new();
    CELL.get_or_init(|| Regex::new(r"!{2,}").unwrap())
}
/// Three or more mixed '?'/'!' characters in a row.
fn re_mixed_punct() -> &'static Regex {
    static CELL: OnceLock<Regex> = OnceLock::new();
    CELL.get_or_init(|| Regex::new(r"[?!]{3,}").unwrap())
}
/// Scan user ("human") messages for disengagement indicators.
///
/// Emits `DisengagementNegativeStance` for shouting (>=80% uppercase over at
/// least 10 alphabetic chars), excessive `?`/`!` punctuation (unless the
/// message starts with a positive prefix), agent-directed profanity, and
/// generic complaints; `DisengagementEscalation` for human-handoff requests;
/// `DisengagementQuit` for give-up phrasing. Escalation and profanity matches
/// suppress the generic complaint check so the more specific signal wins.
///
/// Fix over the original: the alphabetic character count was recomputed
/// (an O(n) walk) three times per message; it is now computed once.
pub fn analyze_disengagement(
    normalized_messages: &[(usize, &str, NormalizedMessage)],
    char_ngram_threshold: f32,
    token_cosine_threshold: f32,
) -> SignalGroup {
    let mut group = SignalGroup::new("disengagement");
    for (idx, role, norm_msg) in normalized_messages {
        if *role != "human" {
            continue;
        }
        let text = &norm_msg.raw;
        // All-caps shouting check.
        let alpha_chars: String = text.chars().filter(|c| c.is_alphabetic()).collect();
        // Hoisted: count the alphabetic chars once instead of three times.
        let alpha_count = alpha_chars.chars().count();
        if alpha_count >= 10 {
            let upper_count = alpha_chars.chars().filter(|c| c.is_uppercase()).count();
            let upper_ratio = upper_count as f32 / alpha_count as f32;
            if upper_ratio >= 0.8 {
                let snippet: String = text.chars().take(50).collect();
                group.add_signal(
                    SignalInstance::new(SignalType::DisengagementNegativeStance, *idx, snippet)
                        .with_metadata(json!({
                            "indicator_type": "all_caps",
                            "upper_ratio": upper_ratio,
                        })),
                );
            }
        }
        // Excessive consecutive punctuation. Positive-prefix messages
        // ("Yes!! Perfect!!!") are exempt so enthusiasm isn't flagged.
        let starts_with_positive = starts_with_prefix(text, POSITIVE_PREFIXES);
        let cq = re_consecutive_q().find_iter(text).count();
        let ce = re_consecutive_e().find_iter(text).count();
        let mixed = re_mixed_punct().find_iter(text).count();
        if !starts_with_positive && (cq >= 1 || ce >= 1 || mixed >= 1) {
            let snippet: String = text.chars().take(50).collect();
            group.add_signal(
                SignalInstance::new(SignalType::DisengagementNegativeStance, *idx, snippet)
                    .with_metadata(json!({
                        "indicator_type": "excessive_punctuation",
                        "consecutive_questions": cq,
                        "consecutive_exclamations": ce,
                        "mixed_punctuation": mixed,
                    })),
            );
        }
        // Escalation patterns (first fuzzy match wins).
        let mut found_escalation = false;
        for pattern in escalation_patterns() {
            if norm_msg.matches_normalized_pattern(
                pattern,
                char_ngram_threshold,
                token_cosine_threshold,
            ) {
                group.add_signal(
                    SignalInstance::new(
                        SignalType::DisengagementEscalation,
                        *idx,
                        pattern.raw.clone(),
                    )
                    .with_metadata(json!({"pattern_type": "escalation"})),
                );
                found_escalation = true;
                break;
            }
        }
        // Quit patterns (independent of escalation).
        for pattern in quit_patterns() {
            if norm_msg.matches_normalized_pattern(
                pattern,
                char_ngram_threshold,
                token_cosine_threshold,
            ) {
                group.add_signal(
                    SignalInstance::new(SignalType::DisengagementQuit, *idx, pattern.raw.clone())
                        .with_metadata(json!({"pattern_type": "quit"})),
                );
                break;
            }
        }
        // Profanity (more specific) before generic negative stance.
        let mut found_profanity = false;
        for pattern in profanity_patterns() {
            if norm_msg.matches_normalized_pattern(
                pattern,
                char_ngram_threshold,
                token_cosine_threshold,
            ) {
                group.add_signal(
                    SignalInstance::new(
                        SignalType::DisengagementNegativeStance,
                        *idx,
                        pattern.raw.clone(),
                    )
                    .with_metadata(json!({
                        "indicator_type": "profanity",
                        "pattern": pattern.raw,
                    })),
                );
                found_profanity = true;
                break;
            }
        }
        if !found_escalation && !found_profanity {
            for pattern in negative_stance_patterns() {
                if norm_msg.matches_normalized_pattern(
                    pattern,
                    char_ngram_threshold,
                    token_cosine_threshold,
                ) {
                    group.add_signal(
                        SignalInstance::new(
                            SignalType::DisengagementNegativeStance,
                            *idx,
                            pattern.raw.clone(),
                        )
                        .with_metadata(json!({
                            "indicator_type": "complaint",
                            "pattern": pattern.raw,
                        })),
                    );
                    break;
                }
            }
        }
    }
    group
}
// Unit tests for the disengagement analyzer.
#[cfg(test)]
mod tests {
    use super::*;
    // Helper: normalize a message with a generous length cap for tests.
    fn nm(s: &str) -> NormalizedMessage {
        NormalizedMessage::from_text(s, 2000)
    }
    #[test]
    fn detects_human_escalation_request() {
        let msgs = vec![(
            0usize,
            "human",
            nm("This is taking forever, get me a human"),
        )];
        let g = analyze_disengagement(&msgs, 0.65, 0.6);
        assert!(g
            .signals
            .iter()
            .any(|s| matches!(s.signal_type, SignalType::DisengagementEscalation)));
    }
    #[test]
    fn detects_quit_intent() {
        let msgs = vec![(0usize, "human", nm("Forget it, I give up"))];
        let g = analyze_disengagement(&msgs, 0.65, 0.6);
        assert!(g
            .signals
            .iter()
            .any(|s| matches!(s.signal_type, SignalType::DisengagementQuit)));
    }
    #[test]
    fn detects_negative_stance_complaint() {
        let msgs = vec![(0usize, "human", nm("This is useless"))];
        let g = analyze_disengagement(&msgs, 0.65, 0.6);
        assert!(g
            .signals
            .iter()
            .any(|s| matches!(s.signal_type, SignalType::DisengagementNegativeStance)));
    }
    #[test]
    fn detects_excessive_punctuation_as_negative_stance() {
        let msgs = vec![(0usize, "human", nm("WHY isn't this working???"))];
        let g = analyze_disengagement(&msgs, 0.65, 0.6);
        assert!(g
            .signals
            .iter()
            .any(|s| matches!(s.signal_type, SignalType::DisengagementNegativeStance)));
    }
    // Positive-prefix exemption: exclamation bursts after "Yes" are not flagged.
    #[test]
    fn positive_excitement_is_not_disengagement() {
        let msgs = vec![(0usize, "human", nm("Yes!! That's perfect!!!"))];
        let g = analyze_disengagement(&msgs, 0.65, 0.6);
        assert!(g
            .signals
            .iter()
            .all(|s| !matches!(s.signal_type, SignalType::DisengagementNegativeStance)));
    }
}

View file

@ -0,0 +1,338 @@
//! Misalignment signals: corrections, rephrases, clarifications.
//!
//! Direct port of `signals/interaction/misalignment.py`.
use std::sync::OnceLock;
use serde_json::json;
use super::constants::{stopwords, CONFIRMATION_PREFIXES};
use crate::signals::schemas::{SignalGroup, SignalInstance, SignalType};
use crate::signals::text_processing::{normalize_patterns, NormalizedMessage, NormalizedPattern};
// Phrases where the user explicitly corrects the assistant's interpretation.
// Apostrophe-free variants are listed because matching is over raw text.
const CORRECTION_PATTERN_TEXTS: &[&str] = &[
    "no, i meant",
    "no i meant",
    "no, i said",
    "no i said",
    "no, i asked",
    "no i asked",
    "nah, i meant",
    "nope, i meant",
    "not what i said",
    "not what i asked",
    "that's not what i said",
    "that's not what i asked",
    "that's not what i meant",
    "thats not what i said",
    "thats not what i asked",
    "thats not what i meant",
    "that's not what you",
    "no that's not what i",
    "no, that's not what i",
    "you're not quite right",
    "youre not quite right",
    "you're not exactly right",
    "youre not exactly right",
    "you're wrong about",
    "youre wrong about",
    "i just said",
    "i already said",
    "i already told you",
];
// Explicit markers that the user is about to restate their request.
const REPHRASE_PATTERN_TEXTS: &[&str] = &[
    "let me rephrase",
    "let me explain again",
    "what i'm trying to say",
    "what i'm saying is",
    "in other words",
];
// Phrases where the user signals they did not understand the assistant.
const CLARIFICATION_PATTERN_TEXTS: &[&str] = &[
    "i don't understand",
    "don't understand",
    "not understanding",
    "can't understand",
    "don't get it",
    "don't follow",
    "i'm confused",
    "so confused",
    "makes no sense",
    "doesn't make sense",
    "not making sense",
    "what do you mean",
    "what does that mean",
    "what are you saying",
    "i'm lost",
    "totally lost",
    "lost me",
    "no clue what you",
    "no idea what you",
    "no clue what that",
    "no idea what that",
    "come again",
    "say that again",
    "repeat that",
    "trouble following",
    "hard to follow",
    "can't follow",
];
/// Normalized correction patterns, built lazily once per process.
fn correction_patterns() -> &'static Vec<NormalizedPattern> {
    static CACHE: OnceLock<Vec<NormalizedPattern>> = OnceLock::new();
    CACHE.get_or_init(|| normalize_patterns(CORRECTION_PATTERN_TEXTS))
}

/// Normalized rephrase-marker patterns, built lazily once per process.
fn rephrase_patterns() -> &'static Vec<NormalizedPattern> {
    static CACHE: OnceLock<Vec<NormalizedPattern>> = OnceLock::new();
    CACHE.get_or_init(|| normalize_patterns(REPHRASE_PATTERN_TEXTS))
}

/// Normalized clarification-request patterns, built lazily once per process.
fn clarification_patterns() -> &'static Vec<NormalizedPattern> {
    static CACHE: OnceLock<Vec<NormalizedPattern>> = OnceLock::new();
    CACHE.get_or_init(|| normalize_patterns(CLARIFICATION_PATTERN_TEXTS))
}
/// True when the message starts (after lowercasing and trimming) with one of
/// the known confirmation prefixes, e.g. an agreement like "yes" or "ok".
fn is_confirmation_message(text: &str) -> bool {
    let normalized = text.to_lowercase();
    let head = normalized.trim();
    CONFIRMATION_PREFIXES
        .iter()
        .any(|prefix| head.starts_with(prefix))
}
/// Detect whether two user messages appear to be rephrases of each other.
///
/// `norm_msg1` is compared against the earlier message `norm_msg2`. Both must
/// have at least 3 tokens and at least `min_meaningful_tokens` non-stopword
/// tokens; confirmation messages ("yes, ...") are never treated as rephrases.
/// The newer message may introduce at most `max_new_content_ratio` new
/// non-stopword tokens, and must overlap the older message by at least
/// `overlap_threshold` (intersection over the smaller token set).
pub fn is_similar_rephrase(
    norm_msg1: &NormalizedMessage,
    norm_msg2: &NormalizedMessage,
    overlap_threshold: f32,
    min_meaningful_tokens: usize,
    max_new_content_ratio: f32,
) -> bool {
    // Too short to compare meaningfully.
    if norm_msg1.tokens.len() < 3 || norm_msg2.tokens.len() < 3 {
        return false;
    }
    // "Yes, do that" echoes prior wording but is agreement, not a rephrase.
    if is_confirmation_message(&norm_msg1.raw) {
        return false;
    }
    let stops = stopwords();
    // Keep only content-bearing tokens for both messages.
    let tokens1: std::collections::HashSet<&str> = norm_msg1
        .tokens
        .iter()
        .filter(|t| !stops.contains(t.as_str()))
        .map(|s| s.as_str())
        .collect();
    let tokens2: std::collections::HashSet<&str> = norm_msg2
        .tokens
        .iter()
        .filter(|t| !stops.contains(t.as_str()))
        .map(|s| s.as_str())
        .collect();
    if tokens1.len() < min_meaningful_tokens || tokens2.len() < min_meaningful_tokens {
        return false;
    }
    // Fraction of the newer message that is new material; too much new
    // content means it's a different request, not a rephrase.
    let new_tokens: std::collections::HashSet<&&str> = tokens1.difference(&tokens2).collect();
    let new_content_ratio = if tokens1.is_empty() {
        0.0
    } else {
        new_tokens.len() as f32 / tokens1.len() as f32
    };
    if new_content_ratio > max_new_content_ratio {
        return false;
    }
    // Overlap coefficient: intersection normalized by the smaller set.
    let intersection = tokens1.intersection(&tokens2).count();
    let min_size = tokens1.len().min(tokens2.len());
    if min_size == 0 {
        return false;
    }
    let overlap_ratio = intersection as f32 / min_size as f32;
    overlap_ratio >= overlap_threshold
}
/// Analyze user messages for misalignment signals.
///
/// Checks run in priority order per message — explicit correction, rephrase
/// marker, clarification request — and at most one pattern signal is emitted
/// per message (the `continue` after each hit enforces this). If no pattern
/// matched, the message is compared against the previous user message for a
/// semantic (token-overlap) rephrase within a 3-turn window.
pub fn analyze_misalignment(
    normalized_messages: &[(usize, &str, NormalizedMessage)],
    char_ngram_threshold: f32,
    token_cosine_threshold: f32,
) -> SignalGroup {
    let mut group = SignalGroup::new("misalignment");
    // Index and content of the most recent user message seen so far.
    let mut prev_user_idx: Option<usize> = None;
    let mut prev_user_msg: Option<&NormalizedMessage> = None;
    for (idx, role, norm_msg) in normalized_messages {
        if *role != "human" {
            continue;
        }
        let mut found_in_turn = false;
        // 1) Explicit corrections ("no, i meant ...").
        for pattern in correction_patterns() {
            if norm_msg.matches_normalized_pattern(
                pattern,
                char_ngram_threshold,
                token_cosine_threshold,
            ) {
                group.add_signal(
                    SignalInstance::new(
                        SignalType::MisalignmentCorrection,
                        *idx,
                        pattern.raw.clone(),
                    )
                    .with_metadata(json!({"pattern_type": "correction"})),
                );
                found_in_turn = true;
                break;
            }
        }
        if found_in_turn {
            prev_user_idx = Some(*idx);
            prev_user_msg = Some(norm_msg);
            continue;
        }
        // 2) Explicit rephrase markers ("let me rephrase ...").
        for pattern in rephrase_patterns() {
            if norm_msg.matches_normalized_pattern(
                pattern,
                char_ngram_threshold,
                token_cosine_threshold,
            ) {
                group.add_signal(
                    SignalInstance::new(
                        SignalType::MisalignmentRephrase,
                        *idx,
                        pattern.raw.clone(),
                    )
                    .with_metadata(json!({"pattern_type": "rephrase"})),
                );
                found_in_turn = true;
                break;
            }
        }
        if found_in_turn {
            prev_user_idx = Some(*idx);
            prev_user_msg = Some(norm_msg);
            continue;
        }
        // 3) Clarification requests ("i don't understand ...").
        for pattern in clarification_patterns() {
            if norm_msg.matches_normalized_pattern(
                pattern,
                char_ngram_threshold,
                token_cosine_threshold,
            ) {
                group.add_signal(
                    SignalInstance::new(
                        SignalType::MisalignmentClarification,
                        *idx,
                        pattern.raw.clone(),
                    )
                    .with_metadata(json!({"pattern_type": "clarification"})),
                );
                found_in_turn = true;
                break;
            }
        }
        if found_in_turn {
            prev_user_idx = Some(*idx);
            prev_user_msg = Some(norm_msg);
            continue;
        }
        // Semantic rephrase vs the previous user message (recent only).
        if let (Some(prev_idx), Some(prev_msg)) = (prev_user_idx, prev_user_msg) {
            let turns_between = idx.saturating_sub(prev_idx);
            if turns_between <= 3 && is_similar_rephrase(norm_msg, prev_msg, 0.75, 4, 0.5) {
                group.add_signal(
                    SignalInstance::new(
                        SignalType::MisalignmentRephrase,
                        *idx,
                        "[similar rephrase detected]",
                    )
                    .with_confidence(0.8)
                    .with_metadata(json!({
                        "pattern_type": "semantic_rephrase",
                        "compared_to": prev_idx,
                    })),
                );
            }
        }
        prev_user_idx = Some(*idx);
        prev_user_msg = Some(norm_msg);
    }
    group
}
// Unit tests for the misalignment analyzer.
#[cfg(test)]
mod tests {
    use super::*;
    // Helper: normalize a message with a generous length cap for tests.
    fn nm(s: &str) -> NormalizedMessage {
        NormalizedMessage::from_text(s, 2000)
    }
    // Helper: build indexed (idx, role, normalized-message) rows.
    fn make(items: &[(&'static str, &str)]) -> Vec<(usize, &'static str, NormalizedMessage)> {
        items
            .iter()
            .enumerate()
            .map(|(i, (role, text))| (i, *role, nm(text)))
            .collect()
    }
    #[test]
    fn detects_explicit_correction() {
        let msgs = make(&[
            ("human", "Show me my orders"),
            ("gpt", "Sure, here are your invoices"),
            ("human", "No, I meant my recent orders"),
        ]);
        let g = analyze_misalignment(&msgs, 0.65, 0.6);
        assert!(g
            .signals
            .iter()
            .any(|s| matches!(s.signal_type, SignalType::MisalignmentCorrection)));
    }
    #[test]
    fn detects_rephrase_marker() {
        let msgs = make(&[
            ("human", "Show me X"),
            ("gpt", "Sure"),
            ("human", "Let me rephrase: I want X grouped by date"),
        ]);
        let g = analyze_misalignment(&msgs, 0.65, 0.6);
        assert!(g
            .signals
            .iter()
            .any(|s| matches!(s.signal_type, SignalType::MisalignmentRephrase)));
    }
    #[test]
    fn detects_clarification_request() {
        let msgs = make(&[
            ("human", "Run the report"),
            ("gpt", "Foobar quux baz."),
            ("human", "I don't understand what you mean"),
        ]);
        let g = analyze_misalignment(&msgs, 0.65, 0.6);
        assert!(g
            .signals
            .iter()
            .any(|s| matches!(s.signal_type, SignalType::MisalignmentClarification)));
    }
    // Agreement messages must not trip the semantic-rephrase heuristic.
    #[test]
    fn confirmation_is_not_a_rephrase() {
        let m1 = nm("Yes, that's correct, please proceed with the order");
        let m2 = nm("please proceed with the order for the same product");
        assert!(!is_similar_rephrase(&m1, &m2, 0.75, 4, 0.5));
    }
}

View file

@ -0,0 +1,10 @@
//! Interaction signals: misalignment, stagnation, disengagement, satisfaction.
//!
//! These signals capture how the dialogue itself unfolds (semantic alignment,
//! progress, engagement, closure) independent of tool execution outcomes.
pub mod constants;
pub mod disengagement;
pub mod misalignment;
pub mod satisfaction;
pub mod stagnation;

View file

@ -0,0 +1,177 @@
//! Satisfaction signals: gratitude, confirmation, success.
//!
//! Direct port of `signals/interaction/satisfaction.py`.
use std::sync::OnceLock;
use serde_json::json;
use crate::signals::schemas::{SignalGroup, SignalInstance, SignalType};
use crate::signals::text_processing::{normalize_patterns, NormalizedMessage, NormalizedPattern};
// Explicit thanks/praise directed at the assistant.
const GRATITUDE_PATTERN_TEXTS: &[&str] = &[
    "that's helpful",
    "that helps",
    "this helps",
    "appreciate it",
    "appreciate that",
    "that's perfect",
    "exactly what i needed",
    "just what i needed",
    "you're the best",
    "you rock",
    "you're awesome",
    "you're amazing",
    "you're great",
];
// Positive acceptance of a proposed result or plan.
const CONFIRMATION_PATTERN_TEXTS: &[&str] = &[
    "that works",
    "this works",
    "that's great",
    "that's amazing",
    "this is great",
    "that's awesome",
    "love it",
    "love this",
    "love that",
];
// Reports that a suggested fix or action actually succeeded.
const SUCCESS_PATTERN_TEXTS: &[&str] = &[
    "it worked",
    "that worked",
    "this worked",
    "it's working",
    "that's working",
    "this is working",
];
/// Normalized gratitude patterns, built lazily once per process.
fn gratitude_patterns() -> &'static Vec<NormalizedPattern> {
    static CACHE: OnceLock<Vec<NormalizedPattern>> = OnceLock::new();
    CACHE.get_or_init(|| normalize_patterns(GRATITUDE_PATTERN_TEXTS))
}

/// Normalized confirmation patterns, built lazily once per process.
fn confirmation_patterns() -> &'static Vec<NormalizedPattern> {
    static CACHE: OnceLock<Vec<NormalizedPattern>> = OnceLock::new();
    CACHE.get_or_init(|| normalize_patterns(CONFIRMATION_PATTERN_TEXTS))
}

/// Normalized success patterns, built lazily once per process.
fn success_patterns() -> &'static Vec<NormalizedPattern> {
    static CACHE: OnceLock<Vec<NormalizedPattern>> = OnceLock::new();
    CACHE.get_or_init(|| normalize_patterns(SUCCESS_PATTERN_TEXTS))
}
pub fn analyze_satisfaction(
normalized_messages: &[(usize, &str, NormalizedMessage)],
char_ngram_threshold: f32,
token_cosine_threshold: f32,
) -> SignalGroup {
let mut group = SignalGroup::new("satisfaction");
for (idx, role, norm_msg) in normalized_messages {
if *role != "human" {
continue;
}
let mut found = false;
for pattern in gratitude_patterns() {
if norm_msg.matches_normalized_pattern(
pattern,
char_ngram_threshold,
token_cosine_threshold,
) {
group.add_signal(
SignalInstance::new(
SignalType::SatisfactionGratitude,
*idx,
pattern.raw.clone(),
)
.with_metadata(json!({"pattern_type": "gratitude"})),
);
found = true;
break;
}
}
if found {
continue;
}
for pattern in confirmation_patterns() {
if norm_msg.matches_normalized_pattern(
pattern,
char_ngram_threshold,
token_cosine_threshold,
) {
group.add_signal(
SignalInstance::new(
SignalType::SatisfactionConfirmation,
*idx,
pattern.raw.clone(),
)
.with_metadata(json!({"pattern_type": "confirmation"})),
);
found = true;
break;
}
}
if found {
continue;
}
for pattern in success_patterns() {
if norm_msg.matches_normalized_pattern(
pattern,
char_ngram_threshold,
token_cosine_threshold,
) {
group.add_signal(
SignalInstance::new(SignalType::SatisfactionSuccess, *idx, pattern.raw.clone())
.with_metadata(json!({"pattern_type": "success"})),
);
break;
}
}
}
group
}
// Unit tests for the satisfaction analyzer.
#[cfg(test)]
mod tests {
    use super::*;
    // Helper: normalize a message with a generous length cap for tests.
    fn nm(s: &str) -> NormalizedMessage {
        NormalizedMessage::from_text(s, 2000)
    }
    #[test]
    fn detects_gratitude() {
        let msgs = vec![(0usize, "human", nm("That's perfect, appreciate it!"))];
        let g = analyze_satisfaction(&msgs, 0.65, 0.6);
        assert!(g
            .signals
            .iter()
            .any(|s| matches!(s.signal_type, SignalType::SatisfactionGratitude)));
    }
    #[test]
    fn detects_confirmation() {
        let msgs = vec![(0usize, "human", nm("That works for me, thanks"))];
        let g = analyze_satisfaction(&msgs, 0.65, 0.6);
        assert!(g
            .signals
            .iter()
            .any(|s| matches!(s.signal_type, SignalType::SatisfactionConfirmation)));
    }
    #[test]
    fn detects_success() {
        let msgs = vec![(0usize, "human", nm("Great, it worked!"))];
        let g = analyze_satisfaction(&msgs, 0.65, 0.6);
        assert!(g
            .signals
            .iter()
            .any(|s| matches!(s.signal_type, SignalType::SatisfactionSuccess)));
    }
}

View file

@ -0,0 +1,241 @@
//! Stagnation signals: dragging (turn-count efficiency) and repetition.
//!
//! Direct port of `signals/interaction/stagnation.py`.
use serde_json::json;
use super::constants::{starts_with_prefix, POSITIVE_PREFIXES};
use crate::signals::schemas::{SignalGroup, SignalInstance, SignalType, TurnMetrics};
use crate::signals::text_processing::NormalizedMessage;
/// Adapter row used by stagnation::dragging detector. Mirrors the ShareGPT
/// `{"from": role, "value": text}` shape used in the Python reference.
pub struct ShareGptMsg<'a> {
    /// Speaker role, e.g. "human" or "gpt"; other values are ignored by
    /// `analyze_dragging`.
    pub from: &'a str,
}
/// Flag a "dragging" conversation from turn counts alone.
///
/// Efficiency is 1.0 while user turns stay at or below `baseline_turns`, then
/// decays as `1 / (1 + 0.25 * excess)`. A score below `efficiency_threshold`
/// marks the conversation as dragging and emits one signal anchored on the
/// last message. Returns the signal group plus the computed `TurnMetrics`.
pub fn analyze_dragging(
    messages: &[ShareGptMsg<'_>],
    baseline_turns: usize,
    efficiency_threshold: f32,
) -> (SignalGroup, TurnMetrics) {
    let mut group = SignalGroup::new("stagnation");
    let user_turns = messages.iter().filter(|m| m.from == "human").count();
    let assistant_turns = messages.iter().filter(|m| m.from == "gpt").count();
    // Mirrors the Python reference: only user turns count toward "total".
    let total_turns = user_turns;
    let efficiency_score: f32 = if total_turns == 0 || total_turns <= baseline_turns {
        1.0
    } else {
        let excess = (total_turns - baseline_turns) as f32;
        1.0 / (1.0 + excess * 0.25)
    };
    let is_dragging = efficiency_score < efficiency_threshold;
    let metrics = TurnMetrics {
        total_turns,
        user_turns,
        assistant_turns,
        is_dragging,
        efficiency_score,
    };
    if is_dragging {
        let last_idx = messages.len().saturating_sub(1);
        group.add_signal(
            SignalInstance::new(
                SignalType::StagnationDragging,
                last_idx,
                format!(
                    "Conversation dragging: {} turns (efficiency: {:.2})",
                    total_turns, efficiency_score
                ),
            )
            .with_confidence(1.0 - efficiency_score)
            .with_metadata(json!({
                "total_turns": total_turns,
                "efficiency_score": efficiency_score,
                "baseline_turns": baseline_turns,
            })),
        );
    }
    (group, metrics)
}
/// Detect exact and near-duplicate repetitions within each speaker's own
/// recent history (`lookback` prior same-role messages).
///
/// Human positive-prefix messages and messages shorter than 5 tokens are
/// recorded as history but never flagged, since they are naturally
/// repetitive. Similarity at or above `exact_threshold` is reported as
/// "exact"; otherwise at or above `near_duplicate_threshold` as
/// "near_duplicate". At most one signal per message.
///
/// Fix over the original: the `matched` flag was dead code — it was set on
/// every branch but only consumed by a `let _ = matched;` — so it is removed.
pub fn analyze_repetition(
    normalized_messages: &[(usize, &str, NormalizedMessage)],
    lookback: usize,
    exact_threshold: f32,
    near_duplicate_threshold: f32,
) -> SignalGroup {
    let mut group = SignalGroup::new("stagnation");
    // We keep references into `normalized_messages`. Since `normalized_messages`
    // is borrowed for the whole function, this avoids cloning.
    let mut prev_human: Vec<(usize, &NormalizedMessage)> = Vec::new();
    let mut prev_gpt: Vec<(usize, &NormalizedMessage)> = Vec::new();
    for (idx, role, norm_msg) in normalized_messages {
        if *role != "human" && *role != "gpt" {
            continue;
        }
        // Skip human positive-prefix messages; they're naturally repetitive.
        if *role == "human" && starts_with_prefix(&norm_msg.raw, POSITIVE_PREFIXES) {
            prev_human.push((*idx, norm_msg));
            continue;
        }
        // Too short to compare; record as history only.
        if norm_msg.tokens.len() < 5 {
            if *role == "human" {
                prev_human.push((*idx, norm_msg));
            } else {
                prev_gpt.push((*idx, norm_msg));
            }
            continue;
        }
        let prev = if *role == "human" {
            &prev_human
        } else {
            &prev_gpt
        };
        // Compare only against the `lookback` most recent same-role messages.
        let start = prev.len().saturating_sub(lookback);
        for (prev_idx, prev_msg) in &prev[start..] {
            if prev_msg.tokens.len() < 5 {
                continue;
            }
            let similarity = norm_msg.ngram_similarity_with_message(prev_msg);
            if similarity >= exact_threshold {
                group.add_signal(
                    SignalInstance::new(
                        SignalType::StagnationRepetition,
                        *idx,
                        format!("Exact repetition with message {}", prev_idx),
                    )
                    .with_confidence(similarity)
                    .with_metadata(json!({
                        "repetition_type": "exact",
                        "compared_to": prev_idx,
                        "similarity": similarity,
                        "role": role,
                    })),
                );
                break;
            } else if similarity >= near_duplicate_threshold {
                group.add_signal(
                    SignalInstance::new(
                        SignalType::StagnationRepetition,
                        *idx,
                        format!("Near-duplicate with message {}", prev_idx),
                    )
                    .with_confidence(similarity)
                    .with_metadata(json!({
                        "repetition_type": "near_duplicate",
                        "compared_to": prev_idx,
                        "similarity": similarity,
                        "role": role,
                    })),
                );
                break;
            }
        }
        if *role == "human" {
            prev_human.push((*idx, norm_msg));
        } else {
            prev_gpt.push((*idx, norm_msg));
        }
    }
    group
}
/// Combined stagnation analyzer: dragging + repetition.
///
/// Runs both detectors with fixed thresholds (dragging efficiency 0.5;
/// repetition lookback 2, exact 0.95, near-duplicate 0.85) and merges their
/// signals into a single "stagnation" group, returning the dragging metrics.
pub fn analyze_stagnation(
    messages: &[ShareGptMsg<'_>],
    normalized_messages: &[(usize, &str, NormalizedMessage)],
    baseline_turns: usize,
) -> (SignalGroup, TurnMetrics) {
    let (dragging_group, metrics) = analyze_dragging(messages, baseline_turns, 0.5);
    let repetition_group = analyze_repetition(normalized_messages, 2, 0.95, 0.85);
    let mut combined = SignalGroup::new("stagnation");
    let merged = dragging_group
        .signals
        .iter()
        .chain(repetition_group.signals.iter())
        .cloned();
    for signal in merged {
        combined.add_signal(signal);
    }
    (combined, metrics)
}
// Unit tests for the stagnation (dragging + repetition) analyzers.
#[cfg(test)]
mod tests {
    use super::*;
    // Helper: normalize a message with a generous length cap for tests.
    fn nm(s: &str) -> NormalizedMessage {
        NormalizedMessage::from_text(s, 2000)
    }
    #[test]
    fn dragging_after_many_user_turns() {
        let msgs: Vec<_> = (0..15)
            .flat_map(|_| [ShareGptMsg { from: "human" }, ShareGptMsg { from: "gpt" }])
            .collect();
        let (g, m) = analyze_dragging(&msgs, 5, 0.5);
        assert!(m.is_dragging);
        assert!(g
            .signals
            .iter()
            .any(|s| matches!(s.signal_type, SignalType::StagnationDragging)));
    }
    #[test]
    fn no_dragging_below_baseline() {
        let msgs = vec![
            ShareGptMsg { from: "human" },
            ShareGptMsg { from: "gpt" },
            ShareGptMsg { from: "human" },
            ShareGptMsg { from: "gpt" },
        ];
        let (g, m) = analyze_dragging(&msgs, 5, 0.5);
        assert!(!m.is_dragging);
        assert!(g.signals.is_empty());
    }
    #[test]
    fn detects_exact_repetition_in_user_messages() {
        let n = vec![
            (
                0usize,
                "human",
                nm("This widget is broken and needs repair right now"),
            ),
            (1, "gpt", nm("Sorry to hear that. Let me look into it.")),
            (
                2,
                "human",
                nm("This widget is broken and needs repair right now"),
            ),
        ];
        let g = analyze_repetition(&n, 2, 0.95, 0.85);
        assert!(g
            .signals
            .iter()
            .any(|s| matches!(s.signal_type, SignalType::StagnationRepetition)));
    }
}

View file

@ -1,3 +1,26 @@
mod analyzer;
//! Plano signals: behavioral quality indicators for agent interactions.
//!
//! This is a Rust port of the paper-aligned Python reference implementation at
//! `https://github.com/katanemo/signals` (or `/Users/shashmi/repos/signals`).
//!
//! Three layers of signals are detected from a conversation transcript:
//!
//! - **Interaction**: misalignment, stagnation, disengagement, satisfaction
//! - **Execution**: failure, loops
//! - **Environment**: exhaustion
//!
//! See `SignalType` for the full hierarchy.
pub use analyzer::*;
pub mod analyzer;
pub mod environment;
pub mod execution;
pub mod interaction;
pub mod otel;
pub mod schemas;
pub mod text_processing;
pub use analyzer::{SignalAnalyzer, FLAG_MARKER};
pub use schemas::{
EnvironmentSignals, ExecutionSignals, InteractionQuality, InteractionSignals, SignalGroup,
SignalInstance, SignalLayer, SignalReport, SignalType, TurnMetrics,
};

View file

@ -0,0 +1,241 @@
//! Helpers for emitting `SignalReport` data to OpenTelemetry spans.
//!
//! Two sets of attributes are emitted:
//!
//! - **Legacy** keys under `signals.*` (e.g. `signals.frustration.count`),
//! computed from the new layered counts. Preserved for one release for
//! backward compatibility with existing dashboards.
//! - **New** layered keys (e.g. `signals.interaction.misalignment.count`),
//! one set of `count`/`severity` attributes per category, plus per-instance
//! span events named `signal.<dotted_signal_type>`.
use opentelemetry::trace::SpanRef;
use opentelemetry::KeyValue;
use crate::signals::schemas::{SignalGroup, SignalReport, SignalType};
/// Emit both legacy and layered OTel attributes/events for a `SignalReport`.
///
/// Returns `true` if any "concerning" signal was found, mirroring the previous
/// behavior used to flag the span operation name.
pub fn emit_signals_to_span(span: &SpanRef<'_>, report: &SignalReport) -> bool {
    // Overall summary first, then per-category layered keys, then the
    // backward-compatible legacy keys, then one event per signal instance.
    emit_overall(span, report);
    emit_layered_attributes(span, report);
    emit_legacy_attributes(span, report);
    emit_signal_events(span, report);
    is_concerning(report)
}
/// Emit the four top-level `signals.*` summary attributes for the report.
fn emit_overall(span: &SpanRef<'_>, report: &SignalReport) {
    let turns = &report.turn_metrics;
    let attributes = [
        KeyValue::new(
            "signals.quality",
            report.overall_quality.as_str().to_string(),
        ),
        KeyValue::new("signals.quality_score", report.quality_score as f64),
        KeyValue::new("signals.turn_count", turns.total_turns as i64),
        KeyValue::new("signals.efficiency_score", turns.efficiency_score as f64),
    ];
    for attribute in attributes {
        span.set_attribute(attribute);
    }
}
/// Emit `<prefix>.count` and `<prefix>.severity` for one signal category.
/// Empty categories are omitted entirely rather than emitted as zeros.
fn emit_group(span: &SpanRef<'_>, prefix: &str, group: &SignalGroup) {
    if group.count > 0 {
        span.set_attribute(KeyValue::new(format!("{prefix}.count"), group.count as i64));
        span.set_attribute(KeyValue::new(
            format!("{prefix}.severity"),
            group.severity as i64,
        ));
    }
}
fn emit_layered_attributes(span: &SpanRef<'_>, report: &SignalReport) {
emit_group(
span,
"signals.interaction.misalignment",
&report.interaction.misalignment,
);
emit_group(
span,
"signals.interaction.stagnation",
&report.interaction.stagnation,
);
emit_group(
span,
"signals.interaction.disengagement",
&report.interaction.disengagement,
);
emit_group(
span,
"signals.interaction.satisfaction",
&report.interaction.satisfaction,
);
emit_group(span, "signals.execution.failure", &report.execution.failure);
emit_group(span, "signals.execution.loops", &report.execution.loops);
emit_group(
span,
"signals.environment.exhaustion",
&report.environment.exhaustion,
);
}
/// Number of signal instances of exactly type `t` across the whole report.
fn count_of(report: &SignalReport, t: SignalType) -> usize {
    report
        .iter_signals()
        .map(|sig| usize::from(sig.signal_type == t))
        .sum()
}
/// Emit the legacy attribute keys consumed by existing dashboards. These are
/// derived from the new `SignalReport` so no detector contract is broken.
/// Each legacy key is emitted only when its derived value is non-zero/true.
fn emit_legacy_attributes(span: &SpanRef<'_>, report: &SignalReport) {
    use crate::tracing::signals as legacy;
    // signals.follow_up.repair.{count,ratio} - misalignment proxies repairs.
    let repair_count = report.interaction.misalignment.count;
    // max(1) avoids division by zero when no user turns were recorded.
    let user_turns = report.turn_metrics.user_turns.max(1) as f32;
    if repair_count > 0 {
        span.set_attribute(KeyValue::new(legacy::REPAIR_COUNT, repair_count as i64));
        let ratio = repair_count as f32 / user_turns;
        span.set_attribute(KeyValue::new(legacy::REPAIR_RATIO, format!("{:.3}", ratio)));
    }
    // signals.frustration.{count,severity} - disengagement.negative_stance is
    // the closest legacy analog of "frustration".
    let frustration_count = count_of(report, SignalType::DisengagementNegativeStance);
    if frustration_count > 0 {
        span.set_attribute(KeyValue::new(
            legacy::FRUSTRATION_COUNT,
            frustration_count as i64,
        ));
        // Bucket the raw count into a 0-3 severity scale. The `0` arm is
        // unreachable under the `> 0` guard; kept for readable exhaustiveness.
        let severity = match frustration_count {
            0 => 0,
            1..=2 => 1,
            3..=4 => 2,
            _ => 3,
        };
        span.set_attribute(KeyValue::new(legacy::FRUSTRATION_SEVERITY, severity as i64));
    }
    // signals.repetition.count - stagnation (repetition + dragging).
    if report.interaction.stagnation.count > 0 {
        span.set_attribute(KeyValue::new(
            legacy::REPETITION_COUNT,
            report.interaction.stagnation.count as i64,
        ));
    }
    // signals.escalation.requested - any escalation/quit signal.
    let escalated = report.interaction.disengagement.signals.iter().any(|s| {
        matches!(
            s.signal_type,
            SignalType::DisengagementEscalation | SignalType::DisengagementQuit
        )
    });
    if escalated {
        span.set_attribute(KeyValue::new(legacy::ESCALATION_REQUESTED, true));
    }
    // signals.positive_feedback.count - satisfaction signals.
    if report.interaction.satisfaction.count > 0 {
        span.set_attribute(KeyValue::new(
            legacy::POSITIVE_FEEDBACK_COUNT,
            report.interaction.satisfaction.count as i64,
        ));
    }
}
/// Add one span event per signal instance, named `signal.<dotted_type>`,
/// carrying type/index/confidence plus optional snippet and metadata.
fn emit_signal_events(span: &SpanRef<'_>, report: &SignalReport) {
    for sig in report.iter_signals() {
        let type_name = sig.signal_type.as_str();
        let mut attrs: Vec<KeyValue> = Vec::with_capacity(5);
        attrs.push(KeyValue::new("signal.type", type_name.to_string()));
        attrs.push(KeyValue::new("signal.message_index", sig.message_index as i64));
        attrs.push(KeyValue::new("signal.confidence", sig.confidence as f64));
        if !sig.snippet.is_empty() {
            attrs.push(KeyValue::new("signal.snippet", sig.snippet.clone()));
        }
        if !sig.metadata.is_null() {
            attrs.push(KeyValue::new("signal.metadata", sig.metadata.to_string()));
        }
        span.add_event(format!("signal.{}", type_name), attrs);
    }
}
/// Whether the report should flag the span: poor/severe overall quality, any
/// disengagement, more than two stagnation signals, or any execution
/// failure/loop signal.
fn is_concerning(report: &SignalReport) -> bool {
    use crate::signals::schemas::InteractionQuality;
    let quality_is_bad = matches!(
        report.overall_quality,
        InteractionQuality::Poor | InteractionQuality::Severe
    );
    quality_is_bad
        || report.interaction.disengagement.count > 0
        || report.interaction.stagnation.count > 2
        || report.execution.failure.count > 0
        || report.execution.loops.count > 0
}
// Unit tests for the OTel emission helpers.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::signals::schemas::{
        EnvironmentSignals, ExecutionSignals, InteractionQuality, InteractionSignals, SignalGroup,
        SignalInstance, SignalReport, SignalType, TurnMetrics,
    };
    // Fixture: a severe-quality report containing one escalation signal.
    fn report_with_escalation() -> SignalReport {
        let mut diseng = SignalGroup::new("disengagement");
        diseng.add_signal(SignalInstance::new(
            SignalType::DisengagementEscalation,
            3,
            "get me a human",
        ));
        SignalReport {
            interaction: InteractionSignals {
                disengagement: diseng,
                ..InteractionSignals::default()
            },
            execution: ExecutionSignals::default(),
            environment: EnvironmentSignals::default(),
            overall_quality: InteractionQuality::Severe,
            quality_score: 0.0,
            turn_metrics: TurnMetrics {
                total_turns: 3,
                user_turns: 2,
                assistant_turns: 1,
                is_dragging: false,
                efficiency_score: 1.0,
            },
            summary: String::new(),
        }
    }
    #[test]
    fn is_concerning_flags_disengagement() {
        let r = report_with_escalation();
        assert!(is_concerning(&r));
    }
    #[test]
    fn count_of_returns_per_type_count() {
        let r = report_with_escalation();
        assert_eq!(count_of(&r, SignalType::DisengagementEscalation), 1);
        assert_eq!(count_of(&r, SignalType::DisengagementNegativeStance), 0);
    }
}

View file

@ -0,0 +1,431 @@
//! Data shapes for the signal analyzer.
//!
//! Mirrors `signals/schemas.py` from the reference implementation. Where the
//! Python library exposes a `Dict[str, SignalGroup]` partitioned by category,
//! the Rust port uses strongly-typed sub-structs (`InteractionSignals`,
//! `ExecutionSignals`, `EnvironmentSignals`) for the same partitioning.
use serde::{Deserialize, Serialize};
/// Hierarchical signal type. The leaf variants mirror the paper taxonomy
/// and the Python reference's `SignalType` string enum.
///
/// The dotted string identifier (e.g. `"interaction.misalignment.correction"`)
/// comes from `as_str`; the serde derives, having no rename attributes here,
/// presumably serialize the Rust variant names themselves — verify before
/// relying on the wire format.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum SignalType {
    // Interaction > Misalignment
    MisalignmentCorrection,
    MisalignmentRephrase,
    MisalignmentClarification,
    // Interaction > Stagnation
    StagnationDragging,
    StagnationRepetition,
    // Interaction > Disengagement
    DisengagementEscalation,
    DisengagementQuit,
    DisengagementNegativeStance,
    // Interaction > Satisfaction
    SatisfactionGratitude,
    SatisfactionConfirmation,
    SatisfactionSuccess,
    // Execution > Failure
    ExecutionFailureInvalidArgs,
    ExecutionFailureBadQuery,
    ExecutionFailureToolNotFound,
    ExecutionFailureAuthMisuse,
    ExecutionFailureStateError,
    // Execution > Loops
    ExecutionLoopsRetry,
    ExecutionLoopsParameterDrift,
    ExecutionLoopsOscillation,
    // Environment > Exhaustion
    EnvironmentExhaustionApiError,
    EnvironmentExhaustionTimeout,
    EnvironmentExhaustionRateLimit,
    EnvironmentExhaustionNetwork,
    EnvironmentExhaustionMalformed,
    EnvironmentExhaustionContextOverflow,
}
impl SignalType {
    /// Dotted hierarchical string identifier, e.g.
    /// `"interaction.misalignment.correction"`. Matches the Python reference's
    /// `SignalType` enum *value* strings byte-for-byte. `layer()` and
    /// `category()` are derived from/aligned with these strings, so any change
    /// here must keep the `layer.category.name` shape.
    pub fn as_str(&self) -> &'static str {
        match self {
            SignalType::MisalignmentCorrection => "interaction.misalignment.correction",
            SignalType::MisalignmentRephrase => "interaction.misalignment.rephrase",
            SignalType::MisalignmentClarification => "interaction.misalignment.clarification",
            SignalType::StagnationDragging => "interaction.stagnation.dragging",
            SignalType::StagnationRepetition => "interaction.stagnation.repetition",
            SignalType::DisengagementEscalation => "interaction.disengagement.escalation",
            SignalType::DisengagementQuit => "interaction.disengagement.quit",
            SignalType::DisengagementNegativeStance => "interaction.disengagement.negative_stance",
            SignalType::SatisfactionGratitude => "interaction.satisfaction.gratitude",
            SignalType::SatisfactionConfirmation => "interaction.satisfaction.confirmation",
            SignalType::SatisfactionSuccess => "interaction.satisfaction.success",
            SignalType::ExecutionFailureInvalidArgs => "execution.failure.invalid_args",
            SignalType::ExecutionFailureBadQuery => "execution.failure.bad_query",
            SignalType::ExecutionFailureToolNotFound => "execution.failure.tool_not_found",
            SignalType::ExecutionFailureAuthMisuse => "execution.failure.auth_misuse",
            SignalType::ExecutionFailureStateError => "execution.failure.state_error",
            SignalType::ExecutionLoopsRetry => "execution.loops.retry",
            SignalType::ExecutionLoopsParameterDrift => "execution.loops.parameter_drift",
            SignalType::ExecutionLoopsOscillation => "execution.loops.oscillation",
            SignalType::EnvironmentExhaustionApiError => "environment.exhaustion.api_error",
            SignalType::EnvironmentExhaustionTimeout => "environment.exhaustion.timeout",
            SignalType::EnvironmentExhaustionRateLimit => "environment.exhaustion.rate_limit",
            SignalType::EnvironmentExhaustionNetwork => "environment.exhaustion.network",
            SignalType::EnvironmentExhaustionMalformed => {
                "environment.exhaustion.malformed_response"
            }
            SignalType::EnvironmentExhaustionContextOverflow => {
                "environment.exhaustion.context_overflow"
            }
        }
    }
    /// Top-level layer this signal belongs to — the first segment of the
    /// dotted identifier returned by `as_str`.
    pub fn layer(&self) -> SignalLayer {
        match self {
            SignalType::MisalignmentCorrection
            | SignalType::MisalignmentRephrase
            | SignalType::MisalignmentClarification
            | SignalType::StagnationDragging
            | SignalType::StagnationRepetition
            | SignalType::DisengagementEscalation
            | SignalType::DisengagementQuit
            | SignalType::DisengagementNegativeStance
            | SignalType::SatisfactionGratitude
            | SignalType::SatisfactionConfirmation
            | SignalType::SatisfactionSuccess => SignalLayer::Interaction,
            SignalType::ExecutionFailureInvalidArgs
            | SignalType::ExecutionFailureBadQuery
            | SignalType::ExecutionFailureToolNotFound
            | SignalType::ExecutionFailureAuthMisuse
            | SignalType::ExecutionFailureStateError
            | SignalType::ExecutionLoopsRetry
            | SignalType::ExecutionLoopsParameterDrift
            | SignalType::ExecutionLoopsOscillation => SignalLayer::Execution,
            SignalType::EnvironmentExhaustionApiError
            | SignalType::EnvironmentExhaustionTimeout
            | SignalType::EnvironmentExhaustionRateLimit
            | SignalType::EnvironmentExhaustionNetwork
            | SignalType::EnvironmentExhaustionMalformed
            | SignalType::EnvironmentExhaustionContextOverflow => SignalLayer::Environment,
        }
    }
    /// Category name within the layer (e.g. `"misalignment"`, `"failure"`):
    /// the middle segment of the dotted identifier.
    pub fn category(&self) -> &'static str {
        // Strip the layer prefix and take everything before the next dot.
        // The unwrap_or fallbacks make this total even for a malformed id
        // with fewer than two dots (cannot happen with the table above).
        let s = self.as_str();
        let after_layer = s.split_once('.').map(|(_, rest)| rest).unwrap_or(s);
        after_layer
            .split_once('.')
            .map(|(c, _)| c)
            .unwrap_or(after_layer)
    }
}
/// Top-level layer of the signal taxonomy: user-facing interaction signals,
/// agent tool-execution signals, or environment/transport signals.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum SignalLayer {
    Interaction,
    Execution,
    Environment,
}
impl SignalLayer {
    /// Lowercase layer name — the first segment of the dotted signal ids.
    pub fn as_str(&self) -> &'static str {
        if *self == SignalLayer::Interaction {
            "interaction"
        } else if *self == SignalLayer::Execution {
            "execution"
        } else {
            "environment"
        }
    }
}
/// Overall quality assessment for an agent interaction session, ordered from
/// best (`Excellent`) to worst (`Severe`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum InteractionQuality {
    Excellent,
    Good,
    Neutral,
    Poor,
    Severe,
}
impl InteractionQuality {
pub fn as_str(&self) -> &'static str {
match self {
InteractionQuality::Excellent => "excellent",
InteractionQuality::Good => "good",
InteractionQuality::Neutral => "neutral",
InteractionQuality::Poor => "poor",
InteractionQuality::Severe => "severe",
}
}
}
/// A single detected signal instance.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SignalInstance {
    // Which leaf of the taxonomy fired.
    pub signal_type: SignalType,
    /// Absolute index into the original conversation `Vec<Message>`.
    pub message_index: usize,
    // Short excerpt of the triggering text, kept for debugging/reporting.
    pub snippet: String,
    // Detector confidence in [0, 1]; defaults to 1.0 via `SignalInstance::new`.
    pub confidence: f32,
    /// Free-form metadata payload mirroring the Python `Dict[str, Any]`.
    /// Stored as a JSON object so we can faithfully reproduce the reference's
    /// flexible per-detector metadata.
    #[serde(default)]
    pub metadata: serde_json::Value,
}
impl SignalInstance {
    /// Construct a signal with full confidence (1.0) and an empty JSON-object
    /// metadata payload; refine with the builder methods below.
    pub fn new(signal_type: SignalType, message_index: usize, snippet: impl Into<String>) -> Self {
        Self {
            confidence: 1.0,
            metadata: serde_json::Value::Object(serde_json::Map::new()),
            snippet: snippet.into(),
            signal_type,
            message_index,
        }
    }
    /// Builder-style override of the detection confidence.
    pub fn with_confidence(self, c: f32) -> Self {
        Self { confidence: c, ..self }
    }
    /// Builder-style override of the metadata payload.
    pub fn with_metadata(self, m: serde_json::Value) -> Self {
        Self { metadata: m, ..self }
    }
}
/// Aggregated signals for a specific category.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SignalGroup {
    // Category label, e.g. "misalignment" or "failure".
    pub category: String,
    // Cached `signals.len()`; kept in sync by `add_signal`.
    pub count: usize,
    pub signals: Vec<SignalInstance>,
    /// Severity level (0-3: none, mild, moderate, severe).
    pub severity: u8,
}
impl SignalGroup {
    /// Empty group for `category`: no signals, severity 0.
    pub fn new(category: impl Into<String>) -> Self {
        Self {
            category: category.into(),
            signals: Vec::new(),
            count: 0,
            severity: 0,
        }
    }
    /// Append a detected signal, keeping `count` and `severity` in sync.
    pub fn add_signal(&mut self, signal: SignalInstance) {
        self.signals.push(signal);
        self.count = self.signals.len();
        self.update_severity();
    }
    // Severity buckets: 0 signals -> none, 1-2 -> mild, 3-4 -> moderate,
    // 5+ -> severe.
    fn update_severity(&mut self) {
        self.severity = if self.count == 0 {
            0
        } else if self.count <= 2 {
            1
        } else if self.count <= 4 {
            2
        } else {
            3
        };
    }
}
/// Turn count and efficiency metrics, used by stagnation.dragging.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct TurnMetrics {
    pub total_turns: usize,
    pub user_turns: usize,
    pub assistant_turns: usize,
    // Whether the conversation was judged to be dragging on without progress.
    pub is_dragging: bool,
    // Efficiency score — computed by the analyzer; range not enforced here.
    pub efficiency_score: f32,
}
/// Interaction-layer signal groups, one per category of the taxonomy.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InteractionSignals {
    pub misalignment: SignalGroup,
    pub stagnation: SignalGroup,
    pub disengagement: SignalGroup,
    pub satisfaction: SignalGroup,
}
impl Default for InteractionSignals {
    /// All four interaction groups start empty, each labelled with its
    /// taxonomy category name.
    fn default() -> Self {
        Self {
            satisfaction: SignalGroup::new("satisfaction"),
            disengagement: SignalGroup::new("disengagement"),
            stagnation: SignalGroup::new("stagnation"),
            misalignment: SignalGroup::new("misalignment"),
        }
    }
}
impl InteractionSignals {
    /// Misalignment instances per user turn. The denominator is clamped to at
    /// least 1 so a conversation with no user turns yields 0 instead of NaN.
    /// Mirrors `misalignment.count / max(user_turns, 1)` from the Python
    /// reference's `_assess_quality` and `_generate_summary`.
    pub fn misalignment_ratio(&self, user_turns: usize) -> f32 {
        self.misalignment.count as f32 / user_turns.max(1) as f32
    }
}
/// Execution-layer signal groups (tool-call failures and loops).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExecutionSignals {
    pub failure: SignalGroup,
    pub loops: SignalGroup,
}
impl Default for ExecutionSignals {
    // Both execution groups start empty, labelled with their category names.
    fn default() -> Self {
        Self {
            failure: SignalGroup::new("failure"),
            loops: SignalGroup::new("loops"),
        }
    }
}
/// Environment-layer signal groups (only resource/transport exhaustion).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnvironmentSignals {
    pub exhaustion: SignalGroup,
}
impl Default for EnvironmentSignals {
    // The single environment group starts empty with its category label.
    fn default() -> Self {
        Self {
            exhaustion: SignalGroup::new("exhaustion"),
        }
    }
}
/// Complete signal analysis report for a conversation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SignalReport {
    pub interaction: InteractionSignals,
    pub execution: ExecutionSignals,
    pub environment: EnvironmentSignals,
    pub overall_quality: InteractionQuality,
    // Numeric quality score; default mid-point is 50.0 (see `Default`).
    pub quality_score: f32,
    pub turn_metrics: TurnMetrics,
    // Human-readable summary produced by the analyzer; empty by default.
    pub summary: String,
}
impl Default for SignalReport {
    /// Neutral starting report: mid-scale quality score (50.0), empty signal
    /// groups, zeroed turn metrics, empty summary.
    fn default() -> Self {
        Self {
            summary: String::new(),
            turn_metrics: TurnMetrics::default(),
            quality_score: 50.0,
            overall_quality: InteractionQuality::Neutral,
            environment: EnvironmentSignals::default(),
            execution: ExecutionSignals::default(),
            interaction: InteractionSignals::default(),
        }
    }
}
impl SignalReport {
/// Iterate over every `SignalInstance` across all layers and groups.
pub fn iter_signals(&self) -> impl Iterator<Item = &SignalInstance> {
self.interaction
.misalignment
.signals
.iter()
.chain(self.interaction.stagnation.signals.iter())
.chain(self.interaction.disengagement.signals.iter())
.chain(self.interaction.satisfaction.signals.iter())
.chain(self.execution.failure.signals.iter())
.chain(self.execution.loops.signals.iter())
.chain(self.environment.exhaustion.signals.iter())
}
pub fn has_signal_type(&self, t: SignalType) -> bool {
self.iter_signals().any(|s| s.signal_type == t)
}
}
#[cfg(test)]
mod tests {
    use super::*;
    // The dotted ids must match the Python reference's enum values exactly;
    // spot-check one variant from each layer.
    #[test]
    fn signal_type_strings_match_paper_taxonomy() {
        assert_eq!(
            SignalType::MisalignmentCorrection.as_str(),
            "interaction.misalignment.correction"
        );
        assert_eq!(
            SignalType::ExecutionFailureInvalidArgs.as_str(),
            "execution.failure.invalid_args"
        );
        assert_eq!(
            SignalType::EnvironmentExhaustionMalformed.as_str(),
            "environment.exhaustion.malformed_response"
        );
    }
    // `layer()` and `category()` must agree with the dotted-id segments.
    #[test]
    fn signal_type_layer_and_category() {
        assert_eq!(
            SignalType::MisalignmentRephrase.layer(),
            SignalLayer::Interaction
        );
        assert_eq!(SignalType::MisalignmentRephrase.category(), "misalignment");
        assert_eq!(
            SignalType::ExecutionLoopsRetry.layer(),
            SignalLayer::Execution
        );
        assert_eq!(SignalType::ExecutionLoopsRetry.category(), "loops");
        assert_eq!(
            SignalType::EnvironmentExhaustionTimeout.layer(),
            SignalLayer::Environment
        );
        assert_eq!(
            SignalType::EnvironmentExhaustionTimeout.category(),
            "exhaustion"
        );
    }
    // Severity buckets: 0 -> 0, 1-2 -> 1, 3-4 -> 2, 5+ -> 3 (same as Python).
    #[test]
    fn signal_group_severity_buckets_match_python() {
        let mut g = SignalGroup::new("misalignment");
        assert_eq!(g.severity, 0);
        for n in 1..=2 {
            g.add_signal(SignalInstance::new(
                SignalType::MisalignmentCorrection,
                n,
                "x",
            ));
        }
        assert_eq!(g.severity, 1);
        for n in 3..=4 {
            g.add_signal(SignalInstance::new(
                SignalType::MisalignmentCorrection,
                n,
                "x",
            ));
        }
        assert_eq!(g.severity, 2);
        for n in 5..=6 {
            g.add_signal(SignalInstance::new(
                SignalType::MisalignmentCorrection,
                n,
                "x",
            ));
        }
        assert_eq!(g.severity, 3);
    }
}

View file

@ -0,0 +1,401 @@
//! Text normalization and similarity primitives.
//!
//! Direct Rust port of `signals/text_processing.py` from the reference. The
//! shapes (`NormalizedMessage`, `NormalizedPattern`) and similarity formulas
//! match the Python implementation exactly so that pattern matching produces
//! the same results on the same inputs.
use std::collections::{HashMap, HashSet};
/// Size of character n-grams used for fuzzy similarity (3 = trigrams).
pub const NGRAM_SIZE: usize = 3;
/// ASCII punctuation trimmed from token *boundaries* only; punctuation inside
/// a token (the apostrophe in "don't") survives because `trim_matches` touches
/// just the leading/trailing characters.
const PUNCT_TRIM: &[char] = &[
    '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=',
    '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~',
];
/// Pre-processed message with normalized text and tokens for efficient matching.
#[derive(Debug, Clone, Default)]
pub struct NormalizedMessage {
    // Possibly-truncated original text (head + tail join when over max_length).
    pub raw: String,
    // Lowercased tokens in order, boundary punctuation trimmed.
    pub tokens: Vec<String>,
    // Deduplicated tokens for O(1) membership tests.
    pub token_set: HashSet<String>,
    // Space-joined consecutive token pairs.
    pub bigram_set: HashSet<String>,
    // Character trigrams over the space-joined tokens.
    pub char_ngram_set: HashSet<String>,
    // Token -> occurrence count, used for cosine similarity.
    pub token_frequency: HashMap<String, usize>,
}
impl NormalizedMessage {
    /// Create a normalized message from raw text. Mirrors
    /// `NormalizedMessage.from_text` in the reference, including the
    /// head-20%/tail-80% truncation strategy when text exceeds `max_length`.
    pub fn from_text(text: &str, max_length: usize) -> Self {
        let char_count = text.chars().count();
        let raw: String = if char_count <= max_length {
            text.to_string()
        } else {
            // Keep the first 20% and the last ~80% of the budget; the middle
            // is dropped. head + " " + tail is at most max_length chars.
            let head_len = max_length / 5;
            // Reserve one char for the joining space.
            let tail_len = max_length.saturating_sub(head_len + 1);
            let head: String = text.chars().take(head_len).collect();
            let tail: String = text
                .chars()
                .skip(char_count.saturating_sub(tail_len))
                .collect();
            format!("{} {}", head, tail)
        };
        // Normalize unicode punctuation to ASCII equivalents
        // (smart quotes -> ' and ", en/em dashes -> -).
        let normalized_unicode = raw
            .replace(['\u{2019}', '\u{2018}'], "'")
            .replace(['\u{201c}', '\u{201d}'], "\"")
            .replace(['\u{2013}', '\u{2014}'], "-");
        // Lowercase + collapse whitespace (matches Python's `" ".join(s.split())`).
        let normalized: String = normalized_unicode
            .to_lowercase()
            .split_whitespace()
            .collect::<Vec<_>>()
            .join(" ");
        // Tokenize: trim boundary punctuation per word, drop empties.
        let mut tokens: Vec<String> = Vec::new();
        for word in normalized.split_whitespace() {
            let stripped: String = word.trim_matches(PUNCT_TRIM).to_string();
            if !stripped.is_empty() {
                tokens.push(stripped);
            }
        }
        let token_set: HashSet<String> = tokens.iter().cloned().collect();
        let mut bigram_set: HashSet<String> = HashSet::new();
        for i in 0..tokens.len().saturating_sub(1) {
            bigram_set.insert(format!("{} {}", tokens[i], tokens[i + 1]));
        }
        // N-grams are computed over the space-joined token text, so they span
        // token boundaries (including the joining spaces).
        let tokens_text = tokens.join(" ");
        let char_ngram_set = char_ngrams(&tokens_text, NGRAM_SIZE);
        let mut token_frequency: HashMap<String, usize> = HashMap::new();
        for t in &tokens {
            *token_frequency.entry(t.clone()).or_insert(0) += 1;
        }
        Self {
            raw,
            tokens,
            token_set,
            bigram_set,
            char_ngram_set,
            token_frequency,
        }
    }
    /// O(1) exact-token membership test.
    pub fn contains_token(&self, token: &str) -> bool {
        self.token_set.contains(token)
    }
    /// True if `phrase`'s whitespace-split tokens appear consecutively, in
    /// order, in this message's token stream. `phrase` is expected to already
    /// be normalized (lowercased) the same way as the message tokens.
    pub fn contains_phrase(&self, phrase: &str) -> bool {
        let phrase_tokens: Vec<&str> = phrase.split_whitespace().collect();
        if phrase_tokens.is_empty() {
            return false;
        }
        if phrase_tokens.len() == 1 {
            return self.contains_token(phrase_tokens[0]);
        }
        if phrase_tokens.len() > self.tokens.len() {
            return false;
        }
        // Naive sliding-window scan over the token sequence.
        let n = phrase_tokens.len();
        for i in 0..=self.tokens.len() - n {
            if self.tokens[i..i + n]
                .iter()
                .zip(phrase_tokens.iter())
                .all(|(a, b)| a == b)
            {
                return true;
            }
        }
        false
    }
    /// Character n-gram (Jaccard) similarity vs another normalized message.
    pub fn ngram_similarity_with_message(&self, other: &NormalizedMessage) -> f32 {
        jaccard(&self.char_ngram_set, &other.char_ngram_set)
    }
    /// Character n-gram (Jaccard) similarity vs a raw pattern string.
    /// The pattern is lowercased and stripped of punctuation before hashing.
    pub fn ngram_similarity_with_pattern(&self, pattern: &str) -> f32 {
        let normalized = strip_non_word_chars(&pattern.to_lowercase());
        let pattern_ngrams = char_ngrams(&normalized, NGRAM_SIZE);
        jaccard(&self.char_ngram_set, &pattern_ngrams)
    }
    /// Fraction of pattern's ngrams contained in this message's ngram set.
    /// Asymmetric by design: 1.0 means every pattern trigram occurs somewhere
    /// in the message; an empty pattern yields 0.0.
    pub fn char_ngram_containment(&self, pattern: &str) -> f32 {
        let normalized = strip_non_word_chars(&pattern.to_lowercase());
        let pattern_ngrams = char_ngrams(&normalized, NGRAM_SIZE);
        if pattern_ngrams.is_empty() {
            return 0.0;
        }
        let contained = pattern_ngrams
            .iter()
            .filter(|ng| self.char_ngram_set.contains(*ng))
            .count();
        contained as f32 / pattern_ngrams.len() as f32
    }
    /// Token-frequency cosine similarity vs a raw pattern string. The pattern
    /// is tokenized with the same boundary-punctuation trimming as messages.
    pub fn token_cosine_similarity(&self, pattern: &str) -> f32 {
        let mut pattern_freq: HashMap<String, usize> = HashMap::new();
        for word in pattern.to_lowercase().split_whitespace() {
            let stripped = word.trim_matches(PUNCT_TRIM);
            if !stripped.is_empty() {
                *pattern_freq.entry(stripped.to_string()).or_insert(0) += 1;
            }
        }
        cosine_freq(&self.token_frequency, &pattern_freq)
    }
    /// Layered match against a pre-normalized pattern. Mirrors
    /// `matches_normalized_pattern` from the reference: exact phrase ->
    /// char-ngram Jaccard -> token cosine. Returns true on the first layer
    /// whose score clears its threshold.
    pub fn matches_normalized_pattern(
        &self,
        pattern: &NormalizedPattern,
        char_ngram_threshold: f32,
        token_cosine_threshold: f32,
    ) -> bool {
        // Layer 0: exact phrase match using pre-tokenized message.
        let plen = pattern.tokens.len();
        let slen = self.tokens.len();
        if plen > 0 && plen <= slen {
            for i in 0..=slen - plen {
                if self.tokens[i..i + plen] == pattern.tokens[..] {
                    return true;
                }
            }
        }
        // Layer 1: character n-gram Jaccard similarity.
        if !self.char_ngram_set.is_empty() && !pattern.char_ngram_set.is_empty() {
            let inter = self
                .char_ngram_set
                .intersection(&pattern.char_ngram_set)
                .count();
            let union = self.char_ngram_set.union(&pattern.char_ngram_set).count();
            if union > 0 {
                let sim = inter as f32 / union as f32;
                if sim >= char_ngram_threshold {
                    return true;
                }
            }
        }
        // Layer 2: token frequency cosine similarity.
        if !self.token_frequency.is_empty() && !pattern.token_frequency.is_empty() {
            let sim = cosine_freq(&self.token_frequency, &pattern.token_frequency);
            if sim >= token_cosine_threshold {
                return true;
            }
        }
        false
    }
}
/// Pre-processed pattern with normalized text and pre-computed n-grams/tokens.
#[derive(Debug, Clone, Default)]
pub struct NormalizedPattern {
    // Original pattern string, unmodified.
    pub raw: String,
    // Tokens with boundary punctuation trimmed (matches message tokenization).
    pub tokens: Vec<String>,
    // Trigrams over the fully punctuation-stripped pattern text.
    pub char_ngram_set: HashSet<String>,
    // Token -> count over the punctuation-stripped text, for cosine matching.
    pub token_frequency: HashMap<String, usize>,
}
impl NormalizedPattern {
    /// Normalize a raw pattern string once, up front, so repeated matching
    /// against messages does no per-call normalization work. The pipeline
    /// mirrors `NormalizedMessage::from_text`: lowercase, unicode punctuation
    /// to ASCII, whitespace collapse, then tokenize.
    pub fn from_text(pattern: &str) -> Self {
        let normalized = pattern
            .to_lowercase()
            .replace(['\u{2019}', '\u{2018}'], "'")
            .replace(['\u{201c}', '\u{201d}'], "\"")
            .replace(['\u{2013}', '\u{2014}'], "-");
        let normalized: String = normalized.split_whitespace().collect::<Vec<_>>().join(" ");
        // Tokenize the same way as NormalizedMessage (trim boundary punctuation,
        // keep internal punctuation).
        let mut tokens: Vec<String> = Vec::new();
        for word in normalized.split_whitespace() {
            let stripped = word.trim_matches(PUNCT_TRIM);
            if !stripped.is_empty() {
                tokens.push(stripped.to_string());
            }
        }
        // For ngrams + cosine, strip ALL punctuation (matches Python's
        // `re.sub(r"[^\w\s]", "", normalized)`). Note the asymmetry with
        // `tokens` above, which keeps internal punctuation — this matches
        // the reference implementation.
        let normalized_for_ngrams = strip_non_word_chars(&normalized);
        let char_ngram_set = char_ngrams(&normalized_for_ngrams, NGRAM_SIZE);
        let tokens_no_punct: Vec<&str> = normalized_for_ngrams.split_whitespace().collect();
        let mut token_frequency: HashMap<String, usize> = HashMap::new();
        for t in &tokens_no_punct {
            *token_frequency.entry((*t).to_string()).or_insert(0) += 1;
        }
        Self {
            raw: pattern.to_string(),
            tokens,
            char_ngram_set,
            token_frequency,
        }
    }
}
/// Convenience: normalize a list of raw pattern strings into `NormalizedPattern`s.
pub fn normalize_patterns(patterns: &[&str]) -> Vec<NormalizedPattern> {
    let mut normalized = Vec::with_capacity(patterns.len());
    for raw in patterns {
        normalized.push(NormalizedPattern::from_text(raw));
    }
    normalized
}
// ---------------------------------------------------------------------------
// Similarity primitives
// ---------------------------------------------------------------------------
/// All length-`n` character windows of `s` as a set.
fn char_ngrams(s: &str, n: usize) -> HashSet<String> {
    // Python iterates by character index, not byte; mirror that with .chars().
    let cs: Vec<char> = s.chars().collect();
    match cs.len().checked_sub(n) {
        // Fewer than n characters: no n-grams at all.
        None => HashSet::new(),
        Some(last_start) => (0..=last_start)
            .map(|i| cs[i..i + n].iter().collect())
            .collect(),
    }
}
/// Jaccard similarity |a ∩ b| / |a ∪ b| with the reference convention that
/// two empty sets are identical (1.0) and exactly one empty set is fully
/// dissimilar (0.0).
fn jaccard(a: &HashSet<String>, b: &HashSet<String>) -> f32 {
    if a.is_empty() && b.is_empty() {
        return 1.0;
    }
    if a.is_empty() || b.is_empty() {
        return 0.0;
    }
    let inter = a.intersection(b).count();
    // Both sets are non-empty here, so the union is guaranteed non-empty —
    // the original's `union == 0` fallback branch was unreachable dead code.
    let union = a.union(b).count();
    inter as f32 / union as f32
}
/// Cosine similarity between two token-frequency vectors. Reference
/// convention: two empty vectors are identical (1.0); exactly one empty
/// vector is orthogonal (0.0). Accumulates in f64 to match the original.
fn cosine_freq(a: &HashMap<String, usize>, b: &HashMap<String, usize>) -> f32 {
    match (a.is_empty(), b.is_empty()) {
        (true, true) => return 1.0,
        (true, false) | (false, true) => return 0.0,
        (false, false) => {}
    }
    // Dot product only needs the keys of one side; missing keys count as 0.
    let dot: f64 = b
        .iter()
        .map(|(token, &f2)| (*a.get(token).unwrap_or(&0) * f2) as f64)
        .sum();
    let norm_a = a.values().map(|&f| (f * f) as f64).sum::<f64>().sqrt();
    let norm_b = b.values().map(|&f| (f * f) as f64).sum::<f64>().sqrt();
    if norm_a == 0.0 || norm_b == 0.0 {
        0.0
    } else {
        (dot / (norm_a * norm_b)) as f32
    }
}
/// Python equivalent: `re.sub(r"[^\w\s]", "", text)` followed by whitespace
/// collapse. Python's `\w` is `[A-Za-z0-9_]` plus unicode word characters; we
/// use Rust's `char::is_alphanumeric()` plus `_` for an equivalent definition.
fn strip_non_word_chars(text: &str) -> String {
    let word_chars_only: String = text
        .chars()
        .filter(|&c| c.is_alphanumeric() || c == '_' || c.is_whitespace())
        .collect();
    // Collapse runs of whitespace into single spaces, trimming the ends.
    word_chars_only.split_whitespace().collect::<Vec<_>>().join(" ")
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn normalize_lowercases_and_strips_punctuation() {
        let m = NormalizedMessage::from_text("Hello, World!", 2000);
        assert_eq!(m.tokens, vec!["hello".to_string(), "world".to_string()]);
    }
    // Smart apostrophes are folded to ASCII before tokenization.
    #[test]
    fn normalizes_smart_quotes() {
        let m = NormalizedMessage::from_text("don\u{2019}t", 2000);
        assert!(m.tokens.contains(&"don't".to_string()));
    }
    #[test]
    fn truncates_long_text_with_head_tail() {
        let long = "a".repeat(3000);
        let m = NormalizedMessage::from_text(&long, 2000);
        // raw should be ~ 2000 chars (head + space + tail)
        assert!(m.raw.chars().count() <= 2001);
        assert!(m.raw.starts_with("aa"));
        assert!(m.raw.ends_with("aa"));
    }
    // Phrase matching requires consecutive in-order tokens, not just presence.
    #[test]
    fn contains_phrase_matches_consecutive_tokens() {
        let m = NormalizedMessage::from_text("I think this is great work", 2000);
        assert!(m.contains_phrase("this is great"));
        assert!(!m.contains_phrase("great this"));
    }
    #[test]
    fn matches_pattern_via_exact_phrase() {
        let m = NormalizedMessage::from_text("No, I meant the second one", 2000);
        let p = NormalizedPattern::from_text("no i meant");
        assert!(m.matches_normalized_pattern(&p, 0.65, 0.6));
    }
    #[test]
    fn matches_pattern_via_char_ngram_fuzziness() {
        // Typo in "meant" -> "ment" so layer 0 (exact phrase) cannot match,
        // forcing the matcher to fall back to layer 1 (char n-gram Jaccard).
        let m = NormalizedMessage::from_text("No I ment", 2000);
        let p = NormalizedPattern::from_text("no i meant");
        assert!(m.matches_normalized_pattern(&p, 0.4, 0.6));
    }
    #[test]
    fn jaccard_identical_sets_is_one() {
        let a: HashSet<String> = ["abc", "bcd"].iter().map(|s| s.to_string()).collect();
        assert!((jaccard(&a, &a) - 1.0).abs() < 1e-6);
    }
    // Disjoint token vocabularies must score exactly zero.
    #[test]
    fn cosine_freq_orthogonal_is_zero() {
        let mut a: HashMap<String, usize> = HashMap::new();
        a.insert("hello".to_string(), 1);
        let mut b: HashMap<String, usize> = HashMap::new();
        b.insert("world".to_string(), 1);
        assert_eq!(cosine_freq(&a, &b), 0.0);
    }
}

View file

@ -16,10 +16,134 @@ use tracing_opentelemetry::OpenTelemetrySpanExt;
use crate::handlers::agents::pipeline::{PipelineError, PipelineProcessor};
const STREAM_BUFFER_SIZE: usize = 16;
use crate::signals::{InteractionQuality, SignalAnalyzer, TextBasedSignalAnalyzer, FLAG_MARKER};
use crate::tracing::{llm, set_service_name, signals as signal_constants};
/// Cap on accumulated response bytes kept for usage extraction.
/// Most chat responses are well under this; pathological ones are dropped without
/// affecting pass-through streaming to the client.
const USAGE_BUFFER_MAX: usize = 2 * 1024 * 1024;
use crate::metrics as bs_metrics;
use crate::metrics::labels as metric_labels;
use crate::signals::otel::emit_signals_to_span;
use crate::signals::{SignalAnalyzer, FLAG_MARKER};
use crate::tracing::{llm, set_service_name};
use hermesllm::apis::openai::Message;
/// Parsed usage + resolved-model details from a provider response.
#[derive(Debug, Default, Clone)]
struct ExtractedUsage {
    prompt_tokens: Option<i64>,
    completion_tokens: Option<i64>,
    total_tokens: Option<i64>,
    // Tokens served from the provider's prompt cache.
    cached_input_tokens: Option<i64>,
    // Tokens spent creating a cache entry (Anthropic-style key; see from_json).
    cache_creation_tokens: Option<i64>,
    // Hidden reasoning/thinking tokens, when the provider reports them.
    reasoning_tokens: Option<i64>,
    /// The model the upstream actually used. For router aliases (e.g.
    /// `router:software-engineering`), this differs from the request model.
    resolved_model: Option<String>,
}
impl ExtractedUsage {
    /// True when nothing useful was parsed (no token counts, no resolved
    /// model); callers use this to skip attribute/metric emission.
    fn is_empty(&self) -> bool {
        self.prompt_tokens.is_none()
            && self.completion_tokens.is_none()
            && self.total_tokens.is_none()
            && self.resolved_model.is_none()
    }
    /// Parse usage from a provider response body. Reads the OpenAI `usage`
    /// field names first, then falls back per-field to alternate key names
    /// (`input_tokens`/`output_tokens` are Anthropic-style;
    /// `cached_content_token_count`/`thoughts_token_count` look Gemini-style —
    /// confirm against the providers actually routed here).
    fn from_json(value: &serde_json::Value) -> Self {
        let mut out = Self::default();
        if let Some(model) = value.get("model").and_then(|v| v.as_str()) {
            if !model.is_empty() {
                out.resolved_model = Some(model.to_string());
            }
        }
        if let Some(u) = value.get("usage") {
            // OpenAI-shape usage
            out.prompt_tokens = u.get("prompt_tokens").and_then(|v| v.as_i64());
            out.completion_tokens = u.get("completion_tokens").and_then(|v| v.as_i64());
            out.total_tokens = u.get("total_tokens").and_then(|v| v.as_i64());
            out.cached_input_tokens = u
                .get("prompt_tokens_details")
                .and_then(|d| d.get("cached_tokens"))
                .and_then(|v| v.as_i64());
            out.reasoning_tokens = u
                .get("completion_tokens_details")
                .and_then(|d| d.get("reasoning_tokens"))
                .and_then(|v| v.as_i64());
            // Anthropic-shape fallbacks (only fill fields still missing, so
            // OpenAI keys always win when both shapes are present).
            if out.prompt_tokens.is_none() {
                out.prompt_tokens = u.get("input_tokens").and_then(|v| v.as_i64());
            }
            if out.completion_tokens.is_none() {
                out.completion_tokens = u.get("output_tokens").and_then(|v| v.as_i64());
            }
            if out.total_tokens.is_none() {
                // Derive the total when the provider reports only the parts.
                if let (Some(p), Some(c)) = (out.prompt_tokens, out.completion_tokens) {
                    out.total_tokens = Some(p + c);
                }
            }
            if out.cached_input_tokens.is_none() {
                out.cached_input_tokens = u.get("cache_read_input_tokens").and_then(|v| v.as_i64());
            }
            if out.cached_input_tokens.is_none() {
                out.cached_input_tokens =
                    u.get("cached_content_token_count").and_then(|v| v.as_i64());
            }
            out.cache_creation_tokens = u
                .get("cache_creation_input_tokens")
                .and_then(|v| v.as_i64());
            if out.reasoning_tokens.is_none() {
                out.reasoning_tokens = u.get("thoughts_token_count").and_then(|v| v.as_i64());
            }
        }
        out
    }
}
/// Try to pull usage out of an accumulated response body.
/// Handles both a single JSON object (non-streaming) and SSE streams where the
/// final `data: {...}` event carries the `usage` field. Best-effort: any parse
/// failure simply yields an empty `ExtractedUsage`.
fn extract_usage_from_bytes(buf: &[u8]) -> ExtractedUsage {
    if buf.is_empty() {
        return ExtractedUsage::default();
    }
    // Fast path: full-body JSON (non-streaming).
    if let Ok(value) = serde_json::from_slice::<serde_json::Value>(buf) {
        let u = ExtractedUsage::from_json(&value);
        if !u.is_empty() {
            return u;
        }
    }
    // SSE path: scan from the end for a `data:` line containing a usage
    // object — providers put usage on the final (or near-final) event, so a
    // reverse scan finds it quickly. Non-UTF-8 bodies are given up on.
    let text = match std::str::from_utf8(buf) {
        Ok(t) => t,
        Err(_) => return ExtractedUsage::default(),
    };
    for line in text.lines().rev() {
        let trimmed = line.trim_start();
        let payload = match trimmed.strip_prefix("data:") {
            Some(p) => p.trim_start(),
            None => continue,
        };
        if payload == "[DONE]" || payload.is_empty() {
            continue;
        }
        // Cheap substring check before paying for a full JSON parse.
        if !payload.contains("\"usage\"") {
            continue;
        }
        if let Ok(value) = serde_json::from_str::<serde_json::Value>(payload) {
            let u = ExtractedUsage::from_json(&value);
            if !u.is_empty() {
                return u;
            }
        }
    }
    ExtractedUsage::default()
}
/// Trait for processing streaming chunks
/// Implementors can inject custom logic during streaming (e.g., hallucination detection, logging)
pub trait StreamProcessor: Send + 'static {
@ -51,6 +175,18 @@ impl StreamProcessor for Box<dyn StreamProcessor> {
}
}
/// Optional Prometheus-metric context for an LLM upstream call. When present,
/// [`ObservableStreamProcessor`] emits `brightstaff_llm_*` metrics at
/// first-byte / complete / error callbacks.
#[derive(Debug, Clone)]
pub struct LlmMetricsCtx {
    // Provider label for the metric series (e.g. as configured upstream).
    pub provider: String,
    // Model label for the metric series.
    pub model: String,
    /// HTTP status of the upstream response. Used to pick `status_class` and
    /// `error_class` on `on_complete`.
    pub upstream_status: u16,
}
/// A processor that tracks streaming metrics
pub struct ObservableStreamProcessor {
service_name: String,
@ -60,6 +196,12 @@ pub struct ObservableStreamProcessor {
start_time: Instant,
time_to_first_token: Option<u128>,
messages: Option<Vec<Message>>,
/// Accumulated response bytes used only for best-effort usage extraction
/// on `on_complete`. Capped at `USAGE_BUFFER_MAX`; excess chunks are dropped
/// from the buffer (they still pass through to the client).
response_buffer: Vec<u8>,
llm_metrics: Option<LlmMetricsCtx>,
metrics_recorded: bool,
}
impl ObservableStreamProcessor {
@ -93,21 +235,42 @@ impl ObservableStreamProcessor {
start_time,
time_to_first_token: None,
messages,
response_buffer: Vec::new(),
llm_metrics: None,
metrics_recorded: false,
}
}
/// Attach LLM upstream metric context so the processor emits
/// `brightstaff_llm_*` metrics on first-byte / complete / error.
/// Builder-style: consumes and returns `self`; metrics stay disabled
/// when this is never called (`llm_metrics` remains `None`).
pub fn with_llm_metrics(mut self, ctx: LlmMetricsCtx) -> Self {
    self.llm_metrics = Some(ctx);
    self
}
}
impl StreamProcessor for ObservableStreamProcessor {
/// Count the chunk, buffer a bounded copy for later usage extraction, and
/// pass the chunk through to the client unmodified.
fn process_chunk(&mut self, chunk: Bytes) -> Result<Option<Bytes>, String> {
    self.total_bytes += chunk.len();
    self.chunk_count += 1;
    // Accumulate for best-effort usage extraction; drop further chunks once
    // the cap is reached so we don't retain huge response bodies in memory.
    // Note a chunk straddling the cap is partially kept (up to `remaining`).
    if self.response_buffer.len() < USAGE_BUFFER_MAX {
        let remaining = USAGE_BUFFER_MAX - self.response_buffer.len();
        let take = chunk.len().min(remaining);
        self.response_buffer.extend_from_slice(&chunk[..take]);
    }
    Ok(Some(chunk))
}
/// Latch time-to-first-token on the first invocation and, when metric
/// context is attached, emit the TTFT metric. Subsequent calls are no-ops.
fn on_first_bytes(&mut self) {
    // Record time to first token (only for streaming)
    if self.time_to_first_token.is_none() {
        let elapsed = self.start_time.elapsed();
        self.time_to_first_token = Some(elapsed.as_millis());
        if let Some(ref ctx) = self.llm_metrics {
            bs_metrics::record_llm_ttft(&ctx.provider, &ctx.model, elapsed);
        }
    }
}
@ -124,77 +287,98 @@ impl StreamProcessor for ObservableStreamProcessor {
);
}
// Analyze signals if messages are available and record as span attributes
if let Some(ref messages) = self.messages {
let analyzer: Box<dyn SignalAnalyzer> = Box::new(TextBasedSignalAnalyzer::new());
let report = analyzer.analyze(messages);
// Record total duration on the span for the observability console.
let duration_ms = self.start_time.elapsed().as_millis() as i64;
{
let span = tracing::Span::current();
let otel_context = span.context();
let otel_span = otel_context.span();
otel_span.set_attribute(KeyValue::new(llm::DURATION_MS, duration_ms));
otel_span.set_attribute(KeyValue::new(llm::RESPONSE_BYTES, self.total_bytes as i64));
}
// Best-effort usage extraction + emission (works for both streaming
// SSE and non-streaming JSON responses that include a `usage` object).
let usage = extract_usage_from_bytes(&self.response_buffer);
if !usage.is_empty() {
let span = tracing::Span::current();
let otel_context = span.context();
let otel_span = otel_context.span();
if let Some(v) = usage.prompt_tokens {
otel_span.set_attribute(KeyValue::new(llm::PROMPT_TOKENS, v));
}
if let Some(v) = usage.completion_tokens {
otel_span.set_attribute(KeyValue::new(llm::COMPLETION_TOKENS, v));
}
if let Some(v) = usage.total_tokens {
otel_span.set_attribute(KeyValue::new(llm::TOTAL_TOKENS, v));
}
if let Some(v) = usage.cached_input_tokens {
otel_span.set_attribute(KeyValue::new(llm::CACHED_INPUT_TOKENS, v));
}
if let Some(v) = usage.cache_creation_tokens {
otel_span.set_attribute(KeyValue::new(llm::CACHE_CREATION_TOKENS, v));
}
if let Some(v) = usage.reasoning_tokens {
otel_span.set_attribute(KeyValue::new(llm::REASONING_TOKENS, v));
}
// Override `llm.model` with the model the upstream actually ran
// (e.g. `openai-gpt-5.4` resolved from `router:software-engineering`).
// Cost lookup keys off the real model, not the alias.
if let Some(resolved) = usage.resolved_model.clone() {
otel_span.set_attribute(KeyValue::new(llm::MODEL_NAME, resolved));
}
}
// Emit LLM upstream prometheus metrics (duration + tokens) if wired.
// The upstream responded (we have a status), so status_class alone
// carries the non-2xx signal — error_class stays "none".
if let Some(ref ctx) = self.llm_metrics {
bs_metrics::record_llm_upstream(
&ctx.provider,
&ctx.model,
ctx.upstream_status,
metric_labels::LLM_ERR_NONE,
self.start_time.elapsed(),
);
if let Some(v) = usage.prompt_tokens {
bs_metrics::record_llm_tokens(
&ctx.provider,
&ctx.model,
metric_labels::TOKEN_KIND_PROMPT,
v.max(0) as u64,
);
}
if let Some(v) = usage.completion_tokens {
bs_metrics::record_llm_tokens(
&ctx.provider,
&ctx.model,
metric_labels::TOKEN_KIND_COMPLETION,
v.max(0) as u64,
);
}
if usage.prompt_tokens.is_none() && usage.completion_tokens.is_none() {
bs_metrics::record_llm_tokens_usage_missing(&ctx.provider, &ctx.model);
}
self.metrics_recorded = true;
}
// Release the buffered bytes early; nothing downstream needs them.
self.response_buffer.clear();
self.response_buffer.shrink_to_fit();
// Analyze signals if messages are available and record as span
// attributes + per-signal events. We dual-emit legacy aggregate keys
// and the new layered taxonomy so existing dashboards keep working
// while new consumers can opt into the richer hierarchy.
if let Some(ref messages) = self.messages {
let analyzer = SignalAnalyzer::default();
let report = analyzer.analyze_openai(messages);
// Get the current OTel span to set signal attributes
let span = tracing::Span::current();
let otel_context = span.context();
let otel_span = otel_context.span();
// Add overall quality
otel_span.set_attribute(KeyValue::new(
signal_constants::QUALITY,
format!("{:?}", report.overall_quality),
));
// Add repair/follow-up metrics if concerning
if report.follow_up.is_concerning || report.follow_up.repair_count > 0 {
otel_span.set_attribute(KeyValue::new(
signal_constants::REPAIR_COUNT,
report.follow_up.repair_count as i64,
));
otel_span.set_attribute(KeyValue::new(
signal_constants::REPAIR_RATIO,
format!("{:.3}", report.follow_up.repair_ratio),
));
}
// Add frustration metrics
if report.frustration.has_frustration {
otel_span.set_attribute(KeyValue::new(
signal_constants::FRUSTRATION_COUNT,
report.frustration.frustration_count as i64,
));
otel_span.set_attribute(KeyValue::new(
signal_constants::FRUSTRATION_SEVERITY,
report.frustration.severity as i64,
));
}
// Add repetition metrics
if report.repetition.has_looping {
otel_span.set_attribute(KeyValue::new(
signal_constants::REPETITION_COUNT,
report.repetition.repetition_count as i64,
));
}
// Add escalation metrics
if report.escalation.escalation_requested {
otel_span
.set_attribute(KeyValue::new(signal_constants::ESCALATION_REQUESTED, true));
}
// Add positive feedback metrics
if report.positive_feedback.has_positive_feedback {
otel_span.set_attribute(KeyValue::new(
signal_constants::POSITIVE_FEEDBACK_COUNT,
report.positive_feedback.positive_count as i64,
));
}
// Flag the span name if any concerning signal is detected
let should_flag = report.frustration.has_frustration
|| report.repetition.has_looping
|| report.escalation.escalation_requested
|| matches!(
report.overall_quality,
InteractionQuality::Poor | InteractionQuality::Severe
);
let should_flag = emit_signals_to_span(&otel_span, &report);
if should_flag {
otel_span.update_name(format!("{} {}", self.operation_name, FLAG_MARKER));
}
@ -217,6 +401,18 @@ impl StreamProcessor for ObservableStreamProcessor {
duration_ms = self.start_time.elapsed().as_millis(),
"stream error"
);
if let Some(ref ctx) = self.llm_metrics {
if !self.metrics_recorded {
bs_metrics::record_llm_upstream(
&ctx.provider,
&ctx.model,
ctx.upstream_status,
metric_labels::LLM_ERR_STREAM,
self.start_time.elapsed(),
);
self.metrics_recorded = true;
}
}
}
}
@ -404,3 +600,55 @@ pub fn truncate_message(message: &str, max_length: usize) -> String {
message.to_string()
}
}
#[cfg(test)]
mod usage_extraction_tests {
    use super::*;

    #[test]
    fn non_streaming_openai_with_cached() {
        // OpenAI shape: cache reads are nested under `prompt_tokens_details`.
        let raw = br#"{"id":"x","model":"gpt-4o","choices":[],"usage":{"prompt_tokens":12,"completion_tokens":34,"total_tokens":46,"prompt_tokens_details":{"cached_tokens":5}}}"#;
        let usage = extract_usage_from_bytes(raw);
        assert_eq!(usage.prompt_tokens, Some(12));
        assert_eq!(usage.completion_tokens, Some(34));
        assert_eq!(usage.total_tokens, Some(46));
        assert_eq!(usage.cached_input_tokens, Some(5));
        // No reasoning breakdown in this payload.
        assert!(usage.reasoning_tokens.is_none());
    }

    #[test]
    fn non_streaming_anthropic_with_cache_creation() {
        // Anthropic shape: separate input/output counters plus cache fields;
        // total is derived (100 + 50), cache_read maps to cached_input.
        let raw = br#"{"id":"x","model":"claude","usage":{"input_tokens":100,"output_tokens":50,"cache_creation_input_tokens":20,"cache_read_input_tokens":30}}"#;
        let usage = extract_usage_from_bytes(raw);
        assert_eq!(usage.prompt_tokens, Some(100));
        assert_eq!(usage.completion_tokens, Some(50));
        assert_eq!(usage.total_tokens, Some(150));
        assert_eq!(usage.cached_input_tokens, Some(30));
        assert_eq!(usage.cache_creation_tokens, Some(20));
    }

    #[test]
    fn streaming_openai_final_chunk_has_usage() {
        // SSE stream: the usage object rides on the final finish_reason chunk.
        let wire = b"data: {\"choices\":[{\"delta\":{\"content\":\"hi\"}}]}
data: {\"choices\":[{\"delta\":{}, \"finish_reason\":\"stop\"}],\"usage\":{\"prompt_tokens\":7,\"completion_tokens\":3,\"total_tokens\":10}}
data: [DONE]
";
        let usage = extract_usage_from_bytes(wire);
        assert_eq!(usage.prompt_tokens, Some(7));
        assert_eq!(usage.completion_tokens, Some(3));
        assert_eq!(usage.total_tokens, Some(10));
    }

    #[test]
    fn empty_returns_default() {
        // Zero bytes in, default (empty) usage out.
        let usage = extract_usage_from_bytes(b"");
        assert!(usage.is_empty());
    }

    #[test]
    fn no_usage_in_body_returns_default() {
        // Valid JSON without a usage object is treated the same as empty input.
        let usage = extract_usage_from_bytes(br#"{"ok":true}"#);
        assert!(usage.is_empty());
    }
}

View file

@ -80,6 +80,18 @@ pub mod llm {
/// Total tokens used (prompt + completion)
pub const TOTAL_TOKENS: &str = "llm.usage.total_tokens";
/// Tokens served from a prompt cache read
/// (OpenAI `prompt_tokens_details.cached_tokens`, Anthropic `cache_read_input_tokens`,
/// Google `cached_content_token_count`)
pub const CACHED_INPUT_TOKENS: &str = "llm.usage.cached_input_tokens";
/// Tokens used to write a prompt cache entry (Anthropic `cache_creation_input_tokens`)
pub const CACHE_CREATION_TOKENS: &str = "llm.usage.cache_creation_tokens";
/// Reasoning tokens for reasoning models
/// (OpenAI `completion_tokens_details.reasoning_tokens`, Google `thoughts_token_count`)
pub const REASONING_TOKENS: &str = "llm.usage.reasoning_tokens";
/// Temperature parameter used
pub const TEMPERATURE: &str = "llm.temperature";
@ -119,6 +131,22 @@ pub mod routing {
pub const SELECTION_REASON: &str = "routing.selection_reason";
}
// =============================================================================
// Span Attributes - Plano-specific
// =============================================================================
/// Attributes specific to Plano (session affinity, routing decisions).
pub mod plano {
/// Session identifier propagated via the `x-model-affinity` header.
/// Absent when the client did not send the header.
pub const SESSION_ID: &str = "plano.session_id";
/// Matched route name from routing (e.g. "code", "summarization",
/// "software-engineering"). Absent when the client routed directly
/// to a concrete model.
pub const ROUTE_NAME: &str = "plano.route.name";
}
// =============================================================================
// Span Attributes - Error Handling
// =============================================================================

View file

@ -4,7 +4,7 @@ mod init;
mod service_name_exporter;
pub use constants::{
error, http, llm, operation_component, routing, signals, OperationNameBuilder,
error, http, llm, operation_component, plano, routing, signals, OperationNameBuilder,
};
pub use custom_attributes::collect_custom_trace_attributes;
pub use init::init_tracer;

View file

@ -234,6 +234,7 @@ pub struct Overrides {
pub llm_routing_model: Option<String>,
pub agent_orchestration_model: Option<String>,
pub orchestrator_model_context_length: Option<usize>,
pub disable_signals: Option<bool>,
}
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
@ -395,6 +396,8 @@ pub enum LlmProviderType {
Vercel,
#[serde(rename = "openrouter")]
OpenRouter,
#[serde(rename = "digitalocean")]
DigitalOcean,
}
impl Display for LlmProviderType {
@ -418,6 +421,7 @@ impl Display for LlmProviderType {
LlmProviderType::Plano => write!(f, "plano"),
LlmProviderType::Vercel => write!(f, "vercel"),
LlmProviderType::OpenRouter => write!(f, "openrouter"),
LlmProviderType::DigitalOcean => write!(f, "digitalocean"),
}
}
}
@ -753,4 +757,29 @@ mod test {
assert!(model_ids.contains(&"openai-gpt4".to_string()));
assert!(!model_ids.contains(&"plano-orchestrator".to_string()));
}
#[test]
fn test_overrides_disable_signals_default_none() {
    // A freshly-defaulted Overrides must leave the signals toggle unset.
    let defaults = super::Overrides::default();
    assert!(defaults.disable_signals.is_none());
}
#[test]
fn test_overrides_disable_signals_deserialize() {
    // Explicit `true` round-trips to Some(true).
    let explicit_true = r#"
disable_signals: true
"#;
    let parsed: super::Overrides = serde_yaml::from_str(explicit_true).unwrap();
    assert_eq!(parsed.disable_signals, Some(true));

    // Explicit `false` is preserved as Some(false), not collapsed to None.
    let explicit_false = r#"
disable_signals: false
"#;
    let parsed: super::Overrides = serde_yaml::from_str(explicit_false).unwrap();
    assert_eq!(parsed.disable_signals, Some(false));

    // An absent key deserializes to None.
    let parsed: super::Overrides = serde_yaml::from_str("{}").unwrap();
    assert_eq!(parsed.disable_signals, None);
}
}

View file

@ -435,6 +435,12 @@ impl TokenUsage for MessagesResponse {
/// Anthropic does not report a combined total, so derive it as
/// input + output tokens.
fn total_tokens(&self) -> usize {
    (self.usage.input_tokens + self.usage.output_tokens) as usize
}
/// Anthropic reports prompt-cache reads as `cache_read_input_tokens`;
/// absent when the response carried no cache information.
fn cached_input_tokens(&self) -> Option<usize> {
    self.usage.cache_read_input_tokens.map(|t| t as usize)
}
/// Anthropic reports prompt-cache writes as `cache_creation_input_tokens`;
/// absent when the response carried no cache information.
fn cache_creation_tokens(&self) -> Option<usize> {
    self.usage.cache_creation_input_tokens.map(|t| t as usize)
}
}
impl ProviderResponse for MessagesResponse {

View file

@ -596,6 +596,18 @@ impl TokenUsage for Usage {
fn total_tokens(&self) -> usize {
self.total_tokens as usize
}
/// OpenAI surfaces prompt-cache reads under
/// `prompt_tokens_details.cached_tokens`; both the details object and the
/// inner field are optional, so either being absent yields `None`.
fn cached_input_tokens(&self) -> Option<usize> {
    self.prompt_tokens_details
        .as_ref()
        .and_then(|d| d.cached_tokens.map(|t| t as usize))
}
/// Reasoning-model token count from
/// `completion_tokens_details.reasoning_tokens`; `None` when the details
/// object or the inner field is absent.
fn reasoning_tokens(&self) -> Option<usize> {
    self.completion_tokens_details
        .as_ref()
        .and_then(|d| d.reasoning_tokens.map(|t| t as usize))
}
}
/// Implementation of ProviderRequest for ChatCompletionsRequest

View file

@ -710,6 +710,18 @@ impl crate::providers::response::TokenUsage for ResponseUsage {
fn total_tokens(&self) -> usize {
self.total_tokens as usize
}
/// Cached input tokens from `input_tokens_details.cached_tokens`.
/// The value is clamped at zero (`max(0)`) before the unsigned cast so a
/// negative upstream value cannot wrap.
fn cached_input_tokens(&self) -> Option<usize> {
    self.input_tokens_details
        .as_ref()
        .map(|d| d.cached_tokens.max(0) as usize)
}
/// Reasoning tokens from `output_tokens_details.reasoning_tokens`.
/// Clamped at zero (`max(0)`) before the unsigned cast so a negative
/// upstream value cannot wrap.
fn reasoning_tokens(&self) -> Option<usize> {
    self.output_tokens_details
        .as_ref()
        .map(|d| d.reasoning_tokens.max(0) as usize)
}
}
/// Token details

View file

@ -1,6 +1,9 @@
use crate::apis::anthropic::MessagesStreamEvent;
use crate::apis::anthropic::{
MessagesMessageDelta, MessagesStopReason, MessagesStreamEvent, MessagesUsage,
};
use crate::apis::streaming_shapes::sse::{SseEvent, SseStreamBufferTrait};
use crate::providers::streaming_response::ProviderStreamResponseType;
use log::warn;
use std::collections::HashSet;
/// SSE Stream Buffer for Anthropic Messages API streaming.
@ -11,13 +14,24 @@ use std::collections::HashSet;
///
/// When converting from OpenAI to Anthropic format, this buffer injects the required
/// ContentBlockStart and ContentBlockStop events to maintain proper Anthropic protocol.
///
/// Guarantees (Anthropic Messages API contract):
/// 1. `message_stop` is never emitted unless a matching `message_start` was emitted first.
/// 2. `message_stop` is emitted at most once per stream (no double-close).
/// 3. If upstream terminates with no content (empty/filtered/errored response), a
/// minimal but well-formed envelope is synthesized so the client's state machine
/// stays consistent.
pub struct AnthropicMessagesStreamBuffer {
/// Buffered SSE events ready to be written to wire
buffered_events: Vec<SseEvent>,
/// Track if we've seen a message_start event
/// Track if we've emitted a message_start event
message_started: bool,
/// Track if we've emitted a terminal message_stop event (for idempotency /
/// double-close protection).
message_stopped: bool,
/// Track content block indices that have received ContentBlockStart events
content_block_start_indices: HashSet<i32>,
@ -42,6 +56,7 @@ impl AnthropicMessagesStreamBuffer {
Self {
buffered_events: Vec::new(),
message_started: false,
message_stopped: false,
content_block_start_indices: HashSet::new(),
needs_content_block_stop: false,
seen_message_delta: false,
@ -49,6 +64,66 @@ impl AnthropicMessagesStreamBuffer {
}
}
/// Inject a `message_start` event into the buffer if one hasn't been emitted yet.
/// This is the single source of truth for opening a message — every handler
/// that can legitimately be the first event on the wire must call this before
/// pushing its own event.
fn ensure_message_started(&mut self) {
if self.message_started {
return;
}
let model = self.model.as_deref().unwrap_or("unknown");
let message_start = AnthropicMessagesStreamBuffer::create_message_start_event(model);
self.buffered_events.push(message_start);
self.message_started = true;
}
/// Append a fabricated `message_delta` carrying `end_turn` and zero usage.
///
/// Used when the message must be closed but upstream never produced a
/// terminal event (e.g. `[DONE]` arrives with no prior `finish_reason`).
fn push_synthetic_message_delta(&mut self) {
    let synthetic = MessagesStreamEvent::MessageDelta {
        delta: MessagesMessageDelta {
            stop_reason: MessagesStopReason::EndTurn,
            stop_sequence: None,
        },
        usage: MessagesUsage {
            input_tokens: 0,
            output_tokens: 0,
            cache_creation_input_tokens: None,
            cache_read_input_tokens: None,
        },
    };
    // Render the event to its SSE wire form; keep the parsed event attached
    // so downstream consumers can still inspect it.
    let wire: String = synthetic.clone().into();
    self.buffered_events.push(SseEvent {
        data: None,
        event: Some("message_delta".to_string()),
        raw_line: wire.clone(),
        sse_transformed_lines: wire,
        provider_stream_response: Some(ProviderStreamResponseType::MessagesStreamEvent(
            synthetic,
        )),
    });
    self.seen_message_delta = true;
}
/// Emit the terminal `message_stop` frame and mark the stream as closed.
///
/// Idempotent — once `message_stopped` is set, further calls do nothing,
/// which protects against double-close.
fn push_message_stop(&mut self) {
    if !self.message_stopped {
        let wire: String = MessagesStreamEvent::MessageStop.into();
        self.buffered_events.push(SseEvent {
            data: None,
            event: Some("message_stop".to_string()),
            raw_line: wire.clone(),
            sse_transformed_lines: wire,
            provider_stream_response: None,
        });
        self.message_stopped = true;
        self.seen_message_delta = false;
    }
}
/// Check if a content_block_start event has been sent for the given index
fn has_content_block_start_been_sent(&self, index: i32) -> bool {
self.content_block_start_indices.contains(&index)
@ -149,6 +224,27 @@ impl SseStreamBufferTrait for AnthropicMessagesStreamBuffer {
// We match on a reference first to determine the type, then move the event
match &event.provider_stream_response {
Some(ProviderStreamResponseType::MessagesStreamEvent(evt)) => {
// If the message has already been closed, drop any trailing events
// to avoid emitting data after `message_stop` (protocol violation).
// This typically indicates a duplicate `[DONE]` from upstream or a
// replay of previously-buffered bytes — worth surfacing so we can
// spot misbehaving providers.
if self.message_stopped {
warn!(
"anthropic stream buffer: dropping event after message_stop (variant={})",
match evt {
MessagesStreamEvent::MessageStart { .. } => "message_start",
MessagesStreamEvent::ContentBlockStart { .. } => "content_block_start",
MessagesStreamEvent::ContentBlockDelta { .. } => "content_block_delta",
MessagesStreamEvent::ContentBlockStop { .. } => "content_block_stop",
MessagesStreamEvent::MessageDelta { .. } => "message_delta",
MessagesStreamEvent::MessageStop => "message_stop",
MessagesStreamEvent::Ping => "ping",
}
);
return;
}
match evt {
MessagesStreamEvent::MessageStart { .. } => {
// Add the message_start event
@ -157,14 +253,7 @@ impl SseStreamBufferTrait for AnthropicMessagesStreamBuffer {
}
MessagesStreamEvent::ContentBlockStart { index, .. } => {
let index = *index as i32;
// Inject message_start if needed
if !self.message_started {
let model = self.model.as_deref().unwrap_or("unknown");
let message_start =
AnthropicMessagesStreamBuffer::create_message_start_event(model);
self.buffered_events.push(message_start);
self.message_started = true;
}
self.ensure_message_started();
// Add the content_block_start event (from tool calls or other sources)
self.buffered_events.push(event);
@ -173,14 +262,7 @@ impl SseStreamBufferTrait for AnthropicMessagesStreamBuffer {
}
MessagesStreamEvent::ContentBlockDelta { index, .. } => {
let index = *index as i32;
// Inject message_start if needed
if !self.message_started {
let model = self.model.as_deref().unwrap_or("unknown");
let message_start =
AnthropicMessagesStreamBuffer::create_message_start_event(model);
self.buffered_events.push(message_start);
self.message_started = true;
}
self.ensure_message_started();
// Check if ContentBlockStart was sent for this index
if !self.has_content_block_start_been_sent(index) {
@ -196,6 +278,11 @@ impl SseStreamBufferTrait for AnthropicMessagesStreamBuffer {
self.buffered_events.push(event);
}
MessagesStreamEvent::MessageDelta { usage, .. } => {
// `message_delta` is only meaningful inside an open message.
// Upstream can send it with no prior content (empty completion,
// content filter, etc.), so we must open a message first.
self.ensure_message_started();
// Inject ContentBlockStop before message_delta
if self.needs_content_block_stop {
let content_block_stop =
@ -230,15 +317,52 @@ impl SseStreamBufferTrait for AnthropicMessagesStreamBuffer {
}
MessagesStreamEvent::ContentBlockStop { .. } => {
// ContentBlockStop received from upstream (e.g., Bedrock)
self.ensure_message_started();
// Clear the flag so we don't inject another one
self.needs_content_block_stop = false;
self.buffered_events.push(event);
}
MessagesStreamEvent::MessageStop => {
// MessageStop received from upstream (e.g., OpenAI via [DONE])
// Clear the flag so we don't inject another one
self.seen_message_delta = false;
// MessageStop received from upstream (e.g., OpenAI via [DONE]).
//
// The Anthropic protocol requires the full envelope
// message_start → [content blocks] → message_delta → message_stop
// so we must not emit a bare `message_stop`. Synthesize whatever
// is missing to keep the client's state machine consistent.
self.ensure_message_started();
if self.needs_content_block_stop {
let content_block_stop =
AnthropicMessagesStreamBuffer::create_content_block_stop_event();
self.buffered_events.push(content_block_stop);
self.needs_content_block_stop = false;
}
// If no message_delta has been emitted yet (empty/filtered upstream
// response), synthesize a minimal one carrying `end_turn`.
if !self.seen_message_delta {
// If we also never opened a content block, open and close one
// so clients that expect at least one block are happy.
if self.content_block_start_indices.is_empty() {
let content_block_start =
AnthropicMessagesStreamBuffer::create_content_block_start_event(
);
self.buffered_events.push(content_block_start);
self.set_content_block_start_sent(0);
let content_block_stop =
AnthropicMessagesStreamBuffer::create_content_block_stop_event(
);
self.buffered_events.push(content_block_stop);
}
self.push_synthetic_message_delta();
}
// Push the upstream-provided message_stop and mark closed.
// `push_message_stop` is idempotent but we want to reuse the
// original SseEvent so raw passthrough semantics are preserved.
self.buffered_events.push(event);
self.message_stopped = true;
self.seen_message_delta = false;
}
_ => {
// Other Anthropic event types (Ping, etc.), just accumulate
@ -254,24 +378,23 @@ impl SseStreamBufferTrait for AnthropicMessagesStreamBuffer {
}
fn to_bytes(&mut self) -> Vec<u8> {
// Convert all accumulated events to bytes and clear buffer
// Convert all accumulated events to bytes and clear buffer.
//
// NOTE: We do NOT inject ContentBlockStop here because it's injected when we see MessageDelta
// or MessageStop. Injecting it here causes premature ContentBlockStop in the middle of streaming.
// Inject MessageStop after MessageDelta if we've seen one
// This completes the Anthropic Messages API event sequence
if self.seen_message_delta {
let message_stop = MessagesStreamEvent::MessageStop;
let sse_string: String = message_stop.into();
let message_stop_event = SseEvent {
data: None,
event: Some("message_stop".to_string()),
raw_line: sse_string.clone(),
sse_transformed_lines: sse_string,
provider_stream_response: None,
};
self.buffered_events.push(message_stop_event);
self.seen_message_delta = false;
//
// Inject a synthetic `message_stop` only when:
// 1. A `message_delta` has been seen (otherwise we'd violate the Anthropic
// protocol by emitting `message_stop` without a preceding `message_delta`), AND
// 2. We haven't already emitted `message_stop` (either synthetic from a
// previous flush, or real from an upstream `[DONE]`).
//
// Without the `!message_stopped` guard, a stream whose `finish_reason` chunk
// and `[DONE]` marker land in separate HTTP body chunks would receive two
// `message_stop` events, triggering Claude Code's "Received message_stop
// without a current message" error.
if self.seen_message_delta && !self.message_stopped {
self.push_message_stop();
}
let mut buffer = Vec::new();
@ -615,4 +738,133 @@ data: [DONE]"#;
println!("✓ Stop reason: tool_use");
println!("✓ Proper Anthropic tool_use protocol\n");
}
/// Regression test for:
/// Claude Code CLI error: "Received message_stop without a current message"
///
/// Reproduces the *double-close* scenario: OpenAI's final `finish_reason`
/// chunk and the `[DONE]` marker arrive in **separate** HTTP body chunks, so
/// `to_bytes()` is called between them. Before the fix, this produced two
/// `message_stop` events on the wire (one synthetic, one from `[DONE]`).
#[test]
fn test_openai_to_anthropic_emits_single_message_stop_across_chunk_boundary() {
    let client_api = SupportedAPIsFromClient::AnthropicMessagesAPI(AnthropicApi::Messages);
    let upstream_api = SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions);
    let mut buf = AnthropicMessagesStreamBuffer::new();

    // --- HTTP chunk 1: content + finish_reason (no [DONE] yet) -----------
    let first_body = r#"data: {"id":"c1","object":"chat.completion.chunk","created":1,"model":"gpt-4o","choices":[{"index":0,"delta":{"role":"assistant","content":"Hi"},"finish_reason":null}]}
data: {"id":"c1","object":"chat.completion.chunk","created":1,"model":"gpt-4o","choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}"#;
    for raw_event in SseStreamIter::try_from(first_body.as_bytes()).unwrap() {
        let transformed = SseEvent::try_from((raw_event, &client_api, &upstream_api)).unwrap();
        buf.add_transformed_event(transformed);
    }
    let first_flush = String::from_utf8(buf.to_bytes()).unwrap();

    // --- HTTP chunk 2: just the [DONE] marker ----------------------------
    let second_body = "data: [DONE]";
    for raw_event in SseStreamIter::try_from(second_body.as_bytes()).unwrap() {
        let transformed = SseEvent::try_from((raw_event, &client_api, &upstream_api)).unwrap();
        buf.add_transformed_event(transformed);
    }
    let second_flush = String::from_utf8(buf.to_bytes()).unwrap();

    let combined = format!("{}{}", first_flush, second_flush);
    let start_count = combined.matches("event: message_start").count();
    let stop_count = combined.matches("event: message_stop").count();
    assert_eq!(
        start_count, 1,
        "Must emit exactly one message_start across chunks, got {start_count}. Output:\n{combined}"
    );
    assert_eq!(
        stop_count, 1,
        "Must emit exactly one message_stop across chunks (no double-close), got {stop_count}. Output:\n{combined}"
    );
    // Every message_stop must be preceded by a message_start earlier in the stream.
    let first_start = combined.find("event: message_start").unwrap();
    let first_stop = combined.find("event: message_stop").unwrap();
    assert!(
        first_start < first_stop,
        "message_start must come before message_stop. Output:\n{combined}"
    );
}
/// Regression test for:
/// "Received message_stop without a current message" on empty upstream responses.
///
/// OpenAI returns only `[DONE]` with no content deltas and no `finish_reason`
/// (this happens with content filters, truncated upstream streams, and some
/// 5xx recoveries). Before the fix, the buffer emitted a bare `message_stop`
/// with no preceding `message_start`. After the fix, it synthesizes a
/// minimal but well-formed envelope.
#[test]
fn test_openai_done_only_stream_synthesizes_valid_envelope() {
    let client_api = SupportedAPIsFromClient::AnthropicMessagesAPI(AnthropicApi::Messages);
    let upstream_api = SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions);
    let mut buf = AnthropicMessagesStreamBuffer::new();

    // Feed a stream consisting solely of the [DONE] marker.
    for raw_event in SseStreamIter::try_from("data: [DONE]".as_bytes()).unwrap() {
        let transformed = SseEvent::try_from((raw_event, &client_api, &upstream_api)).unwrap();
        buf.add_transformed_event(transformed);
    }
    let out = String::from_utf8(buf.to_bytes()).unwrap();

    assert!(
        out.contains("event: message_start"),
        "Empty upstream must still produce message_start. Output:\n{out}"
    );
    assert!(
        out.contains("event: message_delta"),
        "Empty upstream must produce a synthesized message_delta. Output:\n{out}"
    );
    assert_eq!(
        out.matches("event: message_stop").count(),
        1,
        "Empty upstream must produce exactly one message_stop. Output:\n{out}"
    );
    // Protocol ordering: start < delta < stop.
    let pos_start = out.find("event: message_start").unwrap();
    let pos_delta = out.find("event: message_delta").unwrap();
    let pos_stop = out.find("event: message_stop").unwrap();
    assert!(
        pos_start < pos_delta && pos_delta < pos_stop,
        "Bad ordering. Output:\n{out}"
    );
}
/// Regression test: events arriving after `message_stop` (e.g. a stray `[DONE]`
/// echo, or late-arriving deltas from a racing upstream) must be dropped
/// rather than written after the terminal frame.
#[test]
fn test_events_after_message_stop_are_dropped() {
    let client_api = SupportedAPIsFromClient::AnthropicMessagesAPI(AnthropicApi::Messages);
    let upstream_api = SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions);
    let mut buf = AnthropicMessagesStreamBuffer::new();

    // A complete, well-terminated stream: one content chunk then [DONE].
    let initial_body = r#"data: {"id":"c1","object":"chat.completion.chunk","created":1,"model":"gpt-4o","choices":[{"index":0,"delta":{"content":"ok"},"finish_reason":"stop"}]}
data: [DONE]"#;
    for raw_event in SseStreamIter::try_from(initial_body.as_bytes()).unwrap() {
        let transformed = SseEvent::try_from((raw_event, &client_api, &upstream_api)).unwrap();
        buf.add_transformed_event(transformed);
    }
    let _ = buf.to_bytes();

    // Simulate a duplicate / late `[DONE]` after the stream was already closed.
    for raw_event in SseStreamIter::try_from("data: [DONE]".as_bytes()).unwrap() {
        let transformed = SseEvent::try_from((raw_event, &client_api, &upstream_api)).unwrap();
        buf.add_transformed_event(transformed);
    }
    let tail = String::from_utf8(buf.to_bytes()).unwrap();
    assert!(
        tail.is_empty(),
        "No bytes should be emitted after message_stop, got: {tail:?}"
    );
}
}

View file

@ -95,6 +95,7 @@ providers:
anthropic:
- anthropic/claude-sonnet-4-6
- anthropic/claude-opus-4-6
- anthropic/claude-opus-4-7
- anthropic/claude-opus-4-5-20251101
- anthropic/claude-opus-4-5
- anthropic/claude-haiku-4-5-20251001
@ -328,7 +329,53 @@ providers:
- xiaomi/mimo-v2-flash
- xiaomi/mimo-v2-omni
- xiaomi/mimo-v2-pro
digitalocean:
- digitalocean/openai-gpt-4.1
- digitalocean/openai-gpt-4o
- digitalocean/openai-gpt-4o-mini
- digitalocean/openai-gpt-5
- digitalocean/openai-gpt-5-mini
- digitalocean/openai-gpt-5-nano
- digitalocean/openai-gpt-5.1-codex-max
- digitalocean/openai-gpt-5.2
- digitalocean/openai-gpt-5.2-pro
- digitalocean/openai-gpt-5.3-codex
- digitalocean/openai-gpt-5.4
- digitalocean/openai-gpt-5.4-mini
- digitalocean/openai-gpt-5.4-nano
- digitalocean/openai-gpt-5.4-pro
- digitalocean/openai-gpt-oss-120b
- digitalocean/openai-gpt-oss-20b
- digitalocean/openai-o1
- digitalocean/openai-o3
- digitalocean/openai-o3-mini
- digitalocean/anthropic-claude-4.1-opus
- digitalocean/anthropic-claude-4.5-sonnet
- digitalocean/anthropic-claude-4.6-sonnet
- digitalocean/anthropic-claude-haiku-4.5
- digitalocean/anthropic-claude-opus-4
- digitalocean/anthropic-claude-opus-4.5
- digitalocean/anthropic-claude-opus-4.6
- digitalocean/anthropic-claude-opus-4.7
- digitalocean/anthropic-claude-sonnet-4
- digitalocean/alibaba-qwen3-32b
- digitalocean/arcee-trinity-large-thinking
- digitalocean/deepseek-3.2
- digitalocean/deepseek-r1-distill-llama-70b
- digitalocean/gemma-4-31B-it
- digitalocean/glm-5
- digitalocean/kimi-k2.5
- digitalocean/llama3.3-70b-instruct
- digitalocean/minimax-m2.5
- digitalocean/nvidia-nemotron-3-super-120b
- digitalocean/qwen3-coder-flash
- digitalocean/qwen3.5-397b-a17b
- digitalocean/all-mini-lm-l6-v2
- digitalocean/gte-large-en-v1.5
- digitalocean/multi-qa-mpnet-base-dot-v1
- digitalocean/qwen3-embedding-0.6b
- digitalocean/router:software-engineering
metadata:
total_providers: 11
total_models: 316
last_updated: 2026-04-03T23:14:46.956158+00:00
total_providers: 12
total_models: 361
last_updated: 2026-04-16T00:00:00.000000+00:00

View file

@ -46,6 +46,7 @@ pub enum ProviderId {
AmazonBedrock,
Vercel,
OpenRouter,
DigitalOcean,
}
impl TryFrom<&str> for ProviderId {
@ -75,6 +76,9 @@ impl TryFrom<&str> for ProviderId {
"amazon" => Ok(ProviderId::AmazonBedrock), // alias
"vercel" => Ok(ProviderId::Vercel),
"openrouter" => Ok(ProviderId::OpenRouter),
"digitalocean" => Ok(ProviderId::DigitalOcean),
"do" => Ok(ProviderId::DigitalOcean), // alias
"do_ai" => Ok(ProviderId::DigitalOcean), // alias
_ => Err(format!("Unknown provider: {}", value)),
}
}
@ -99,6 +103,7 @@ impl ProviderId {
ProviderId::Moonshotai => "moonshotai",
ProviderId::Zhipu => "z-ai",
ProviderId::Qwen => "qwen",
ProviderId::DigitalOcean => "digitalocean",
// Vercel and OpenRouter are open-ended gateways; model lists are unbounded.
// Users configure these with wildcards (e.g. vercel/*); no static expansion needed.
ProviderId::Vercel | ProviderId::OpenRouter => return Vec::new(),
@ -157,7 +162,8 @@ impl ProviderId {
| ProviderId::Zhipu
| ProviderId::Qwen
| ProviderId::Vercel
| ProviderId::OpenRouter,
| ProviderId::OpenRouter
| ProviderId::DigitalOcean,
SupportedAPIsFromClient::AnthropicMessagesAPI(_),
) => SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions),
@ -178,7 +184,8 @@ impl ProviderId {
| ProviderId::Zhipu
| ProviderId::Qwen
| ProviderId::Vercel
| ProviderId::OpenRouter,
| ProviderId::OpenRouter
| ProviderId::DigitalOcean,
SupportedAPIsFromClient::OpenAIChatCompletions(_),
) => SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions),
@ -247,6 +254,7 @@ impl Display for ProviderId {
ProviderId::AmazonBedrock => write!(f, "amazon_bedrock"),
ProviderId::Vercel => write!(f, "vercel"),
ProviderId::OpenRouter => write!(f, "openrouter"),
ProviderId::DigitalOcean => write!(f, "digitalocean"),
}
}
}

View file

@ -23,6 +23,31 @@ pub trait TokenUsage {
fn completion_tokens(&self) -> usize;
fn prompt_tokens(&self) -> usize;
fn total_tokens(&self) -> usize;
/// Tokens served from a prompt cache read (OpenAI `prompt_tokens_details.cached_tokens`,
/// Anthropic `cache_read_input_tokens`, Google `cached_content_token_count`).
///
/// Default implementation returns `None` for providers that do not report
/// cache reads.
fn cached_input_tokens(&self) -> Option<usize> {
    None
}
/// Tokens used to write a cache entry (Anthropic `cache_creation_input_tokens`).
///
/// Default implementation returns `None` for providers that do not report
/// cache writes.
fn cache_creation_tokens(&self) -> Option<usize> {
    None
}
/// Reasoning tokens for reasoning models (OpenAI `completion_tokens_details.reasoning_tokens`,
/// Google `thoughts_token_count`).
///
/// Default implementation returns `None` for providers/models that do not
/// report a reasoning breakdown.
fn reasoning_tokens(&self) -> Option<usize> {
    None
}
}
/// Rich usage breakdown extracted from a provider response.
///
/// Mirrors the `TokenUsage` accessors: the three core counters are always
/// present, while the cache and reasoning counters are provider-dependent
/// and therefore optional.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct UsageDetails {
    /// Tokens consumed by the prompt/input.
    pub prompt_tokens: usize,
    /// Tokens produced in the completion/output.
    pub completion_tokens: usize,
    /// Combined prompt + completion token count.
    pub total_tokens: usize,
    /// Tokens served from a prompt-cache read, when the provider reports them.
    pub cached_input_tokens: Option<usize>,
    /// Tokens spent writing a prompt-cache entry, when the provider reports them.
    pub cache_creation_tokens: Option<usize>,
    /// Reasoning tokens for reasoning models, when the provider reports them.
    pub reasoning_tokens: Option<usize>,
}
pub trait ProviderResponse: Send + Sync {
@ -34,6 +59,18 @@ pub trait ProviderResponse: Send + Sync {
self.usage()
.map(|u| (u.prompt_tokens(), u.completion_tokens(), u.total_tokens()))
}
/// Extract a rich usage breakdown including cached/cache-creation/reasoning tokens.
///
/// Returns `None` when the response carries no usage object at all.
fn extract_usage_details(&self) -> Option<UsageDetails> {
    let usage = self.usage()?;
    Some(UsageDetails {
        prompt_tokens: usage.prompt_tokens(),
        completion_tokens: usage.completion_tokens(),
        total_tokens: usage.total_tokens(),
        cached_input_tokens: usage.cached_input_tokens(),
        cache_creation_tokens: usage.cache_creation_tokens(),
        reasoning_tokens: usage.reasoning_tokens(),
    })
}
}
impl ProviderResponse for ProviderResponseType {

View file

@ -346,12 +346,10 @@ impl TryFrom<(SseEvent, &SupportedAPIsFromClient, &SupportedUpstreamAPIs)> for S
(
SupportedAPIsFromClient::OpenAIChatCompletions(_),
SupportedUpstreamAPIs::AnthropicMessagesAPI(_),
) => {
) if transformed_event.is_event_only() && transformed_event.event.is_some() => {
// OpenAI clients don't expect separate event: lines
// Suppress upstream Anthropic event-only lines
if transformed_event.is_event_only() && transformed_event.event.is_some() {
transformed_event.sse_transformed_lines = "\n".to_string();
}
transformed_event.sse_transformed_lines = "\n".to_string();
}
_ => {
// Other cross-API combinations can be handled here as needed
@ -371,12 +369,10 @@ impl TryFrom<(SseEvent, &SupportedAPIsFromClient, &SupportedUpstreamAPIs)> for S
| (
SupportedAPIsFromClient::OpenAIResponsesAPI(_),
SupportedUpstreamAPIs::OpenAIResponsesAPI(_),
) => {
if transformed_event.is_event_only() && transformed_event.event.is_some() {
// Mark as should-skip by clearing sse_transformed_lines
// The event line is already included when the data line is transformed
transformed_event.sse_transformed_lines = String::new();
}
) if transformed_event.is_event_only() && transformed_event.event.is_some() => {
// Mark as should-skip by clearing sse_transformed_lines
// The event line is already included when the data line is transformed
transformed_event.sse_transformed_lines = String::new();
}
_ => {
// Other passthrough combinations (OpenAI ChatCompletions, etc.) don't have this issue

View file

@ -188,14 +188,13 @@ pub fn convert_openai_message_to_anthropic_content(
// Handle regular content
match &message.content {
Some(MessageContent::Text(text)) => {
if !text.is_empty() {
blocks.push(MessagesContentBlock::Text {
text: text.clone(),
cache_control: None,
});
}
Some(MessageContent::Text(text)) if !text.is_empty() => {
blocks.push(MessagesContentBlock::Text {
text: text.clone(),
cache_control: None,
});
}
Some(MessageContent::Text(_)) => {}
Some(MessageContent::Parts(parts)) => {
for part in parts {
match part {

View file

@ -354,10 +354,10 @@ impl TryFrom<MessagesMessage> for BedrockMessage {
MessagesMessageContent::Blocks(blocks) => {
for block in blocks {
match block {
crate::apis::anthropic::MessagesContentBlock::Text { text, .. } => {
if !text.is_empty() {
content_blocks.push(ContentBlock::Text { text });
}
crate::apis::anthropic::MessagesContentBlock::Text { text, .. }
if !text.is_empty() =>
{
content_blocks.push(ContentBlock::Text { text });
}
crate::apis::anthropic::MessagesContentBlock::ToolUse {
id,

View file

@ -317,11 +317,10 @@ impl TryFrom<Message> for BedrockMessage {
Role::User => {
// Convert user message content to content blocks
match message.content {
Some(MessageContent::Text(text)) => {
if !text.is_empty() {
content_blocks.push(ContentBlock::Text { text });
}
Some(MessageContent::Text(text)) if !text.is_empty() => {
content_blocks.push(ContentBlock::Text { text });
}
Some(MessageContent::Text(_)) => {}
Some(MessageContent::Parts(parts)) => {
// Convert OpenAI content parts to Bedrock ContentBlocks
for part in parts {

View file

@ -177,24 +177,33 @@ impl StreamContext {
}
fn modify_auth_headers(&mut self) -> Result<(), ServerError> {
if self.llm_provider().passthrough_auth == Some(true) {
// Check if client provided an Authorization header
if self.get_http_request_header("Authorization").is_none() {
warn!(
"request_id={}: passthrough_auth enabled but no authorization header present in client request",
self.request_identifier()
);
} else {
debug!(
"request_id={}: preserving client authorization header for provider '{}'",
self.request_identifier(),
self.llm_provider().name
);
// Determine the credential to forward upstream. Either the client
// supplied one (passthrough_auth) or it's configured on the provider.
let credential: String = if self.llm_provider().passthrough_auth == Some(true) {
// Client auth may arrive in either Anthropic-style (`x-api-key`)
// or OpenAI-style (`Authorization: Bearer ...`). Accept both so
// clients using Anthropic SDKs (which default to `x-api-key`)
// work when the upstream is OpenAI-compatible, and vice versa.
let authorization = self.get_http_request_header("Authorization");
let x_api_key = self.get_http_request_header("x-api-key");
match extract_client_credential(authorization.as_deref(), x_api_key.as_deref()) {
Some(key) => {
debug!(
"request_id={}: forwarding client credential to provider '{}'",
self.request_identifier(),
self.llm_provider().name
);
key
}
None => {
warn!(
"request_id={}: passthrough_auth enabled but no Authorization / x-api-key header present in client request",
self.request_identifier()
);
return Ok(());
}
}
return Ok(());
}
let llm_provider_api_key_value =
} else {
self.llm_provider()
.access_key
.as_ref()
@ -203,15 +212,19 @@ impl StreamContext {
"No access key configured for selected LLM Provider \"{}\"",
self.llm_provider()
),
})?;
})?
.clone()
};
// Set API-specific headers based on the resolved upstream API
// Normalize the credential into whichever header the upstream expects.
// This lets an Anthropic-SDK client reach an OpenAI-compatible upstream
// (and vice versa) without the caller needing to know what format the
// upstream uses.
match self.resolved_api.as_ref() {
Some(SupportedUpstreamAPIs::AnthropicMessagesAPI(_)) => {
// Anthropic API requires x-api-key and anthropic-version headers
// Remove any existing Authorization header since Anthropic doesn't use it
// Anthropic expects `x-api-key` + `anthropic-version`.
self.remove_http_request_header("Authorization");
self.set_http_request_header("x-api-key", Some(llm_provider_api_key_value));
self.set_http_request_header("x-api-key", Some(&credential));
self.set_http_request_header("anthropic-version", Some("2023-06-01"));
}
Some(
@ -221,10 +234,9 @@ impl StreamContext {
| SupportedUpstreamAPIs::OpenAIResponsesAPI(_),
)
| None => {
// OpenAI and default: use Authorization Bearer token
// Remove any existing x-api-key header since OpenAI doesn't use it
// OpenAI (and default): `Authorization: Bearer ...`.
self.remove_http_request_header("x-api-key");
let authorization_header_value = format!("Bearer {}", llm_provider_api_key_value);
let authorization_header_value = format!("Bearer {}", credential);
self.set_http_request_header("Authorization", Some(&authorization_header_value));
}
}
@ -1235,3 +1247,86 @@ fn current_time_ns() -> u128 {
}
impl Context for StreamContext {}
/// Extract the credential a client sent in either an OpenAI-style
/// `Authorization` header or an Anthropic-style `x-api-key` header.
///
/// Returns `None` when neither header is present or both are empty/whitespace.
/// The `Bearer ` prefix on the `Authorization` value is stripped if present;
/// otherwise the value is taken verbatim (some clients send a raw token).
fn extract_client_credential(
    authorization: Option<&str>,
    x_api_key: Option<&str>,
) -> Option<String> {
    // Prefer the OpenAI-style Authorization header when both are supplied.
    if let Some(raw) = authorization {
        // Strip the optional "Bearer " / "Bearer" prefix (case-sensitive,
        // matches OpenAI SDK behavior) and trim surrounding whitespace
        // before validating non-empty.
        let token = raw
            .strip_prefix("Bearer ")
            .or_else(|| raw.strip_prefix("Bearer"))
            .unwrap_or(raw)
            .trim();
        if !token.is_empty() {
            return Some(token.to_string());
        }
    }
    // Fall back to the Anthropic-style x-api-key header.
    match x_api_key.map(str::trim) {
        Some(key) if !key.is_empty() => Some(key.to_string()),
        _ => None,
    }
}
#[cfg(test)]
mod tests {
    use super::extract_client_credential;

    #[test]
    fn authorization_bearer_strips_prefix() {
        let got = extract_client_credential(Some("Bearer sk-abc"), None);
        assert_eq!(got.as_deref(), Some("sk-abc"));
    }

    #[test]
    fn authorization_raw_token_preserved() {
        // Some clients send the raw token without "Bearer " — accept it.
        let got = extract_client_credential(Some("sk-abc"), None);
        assert_eq!(got.as_deref(), Some("sk-abc"));
    }

    #[test]
    fn x_api_key_used_when_authorization_absent() {
        let got = extract_client_credential(None, Some("sk-ant-api-key"));
        assert_eq!(got.as_deref(), Some("sk-ant-api-key"));
    }

    #[test]
    fn authorization_wins_when_both_present() {
        // If a client is particularly exotic and sends both, prefer the
        // OpenAI-style Authorization header.
        let got = extract_client_credential(Some("Bearer openai-key"), Some("anthropic-key"));
        assert_eq!(got.as_deref(), Some("openai-key"));
    }

    #[test]
    fn returns_none_when_neither_present() {
        assert_eq!(extract_client_credential(None, None), None);
    }

    #[test]
    fn empty_and_whitespace_headers_are_ignored() {
        for (auth, key) in [(Some(""), None), (Some("Bearer "), None), (Some(" "), Some(" "))] {
            assert_eq!(extract_client_credential(auth, key), None);
        }
    }
}

View file

@ -3,7 +3,7 @@ This demo shows how you can use user preferences to route user prompts to approp
## How to start the demo
Make sure you have Plano CLI installed (`pip install planoai==0.4.19` or `uv tool install planoai==0.4.19`).
Make sure you have Plano CLI installed (`pip install planoai==0.4.20` or `uv tool install planoai==0.4.20`).
```bash
cd demos/llm_routing/preference_based_routing

View file

@ -17,7 +17,7 @@ from sphinxawesome_theme.postprocess import Icons
project = "Plano Docs"
copyright = "2026, Katanemo Labs, a DigitalOcean Company"
author = "Katanemo Labs, Inc"
release = " v0.4.19"
release = " v0.4.20"
# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

View file

@ -43,7 +43,7 @@ Plano's CLI allows you to manage and interact with the Plano efficiently. To ins
.. code-block:: console
$ uv tool install planoai==0.4.19
$ uv tool install planoai==0.4.20
**Option 2: Install with pip (Traditional)**
@ -51,7 +51,7 @@ Plano's CLI allows you to manage and interact with the Plano efficiently. To ins
$ python -m venv venv
$ source venv/bin/activate # On Windows, use: venv\Scripts\activate
$ pip install planoai==0.4.19
$ pip install planoai==0.4.20
.. _llm_routing_quickstart:
@ -340,6 +340,67 @@ And to get the list of supported currencies:
"Here is a list of the currencies that are supported for conversion from USD, along with their symbols:\n\n1. AUD - Australian Dollar\n2. BGN - Bulgarian Lev\n3. BRL - Brazilian Real\n4. CAD - Canadian Dollar\n5. CHF - Swiss Franc\n6. CNY - Chinese Renminbi Yuan\n7. CZK - Czech Koruna\n8. DKK - Danish Krone\n9. EUR - Euro\n10. GBP - British Pound\n11. HKD - Hong Kong Dollar\n12. HUF - Hungarian Forint\n13. IDR - Indonesian Rupiah\n14. ILS - Israeli New Sheqel\n15. INR - Indian Rupee\n16. ISK - Icelandic Króna\n17. JPY - Japanese Yen\n18. KRW - South Korean Won\n19. MXN - Mexican Peso\n20. MYR - Malaysian Ringgit\n21. NOK - Norwegian Krone\n22. NZD - New Zealand Dollar\n23. PHP - Philippine Peso\n24. PLN - Polish Złoty\n25. RON - Romanian Leu\n26. SEK - Swedish Krona\n27. SGD - Singapore Dollar\n28. THB - Thai Baht\n29. TRY - Turkish Lira\n30. USD - United States Dollar\n31. ZAR - South African Rand\n\nIf you want to convert USD to any of these currencies, you can select the one you are interested in."
Observability
-------------
Plano ships two CLI tools for visibility into LLM traffic. Both consume the same OTLP/gRPC span stream from brightstaff; they just slice it differently — use whichever (or both) fits the question you're answering.
===================== ============================================ =============================================================
Command When to use Shows
===================== ============================================ =============================================================
``planoai obs`` Live view while you drive traffic Per-request rows + aggregates: tokens (prompt / completion / cached / cache-creation / reasoning), TTFT, latency, cost, session id, route name, totals by model
``planoai trace`` Deep-dive into a single request after the fact Full span tree for a trace id: brightstaff → routing → upstream LLM, attributes on every span, status codes, errors
===================== ============================================ =============================================================
Both require brightstaff to be exporting spans. If you're running the zero-config path (``planoai up`` with no config file), tracing is auto-wired to ``http://localhost:4317``. If you have your own ``plano_config.yaml``, add:
.. code-block:: yaml
tracing:
random_sampling: 100
opentracing_grpc_endpoint: http://localhost:4317
Live console — ``planoai obs``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: console
$ planoai obs
# In another terminal:
$ planoai up
Cost is populated automatically from DigitalOcean's public pricing catalog — no signup or token required.
With no API keys set, every provider runs in pass-through mode — supply the ``Authorization`` header yourself on each request:
.. code-block:: console
$ curl localhost:12000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $DO_API_KEY" \
-d '{"model":"digitalocean/router:software-engineering",
"messages":[{"role":"user","content":"write code to print prime numbers in python"}],
"stream":false}'
When you export ``OPENAI_API_KEY`` / ``ANTHROPIC_API_KEY`` / ``DO_API_KEY`` / etc. before ``planoai up``, Plano picks them up and clients no longer need to send ``Authorization``.
Press ``Ctrl-C`` in the obs terminal to exit. Data lives in memory only — nothing is persisted to disk.
Single-request traces — ``planoai trace``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
When you need to understand what happened on one specific request (which model was picked, how long each hop took, what an upstream returned), use ``trace``:
.. code-block:: console
$ planoai trace listen # start the OTLP listener (daemon)
# drive some traffic through localhost:12000 ...
$ planoai trace # show the most recent trace
$ planoai trace <trace-id> # show a specific trace by id
$ planoai trace --list # list the last 50 trace ids
Use ``obs`` to spot that p95 latency spiked for ``openai-gpt-5.4``; switch to ``trace`` on one of those slow request ids to see which hop burned the time.
Next Steps
==========

View file

@ -75,3 +75,54 @@ are some sample configuration files for both, respectively.
isDefault: true
access: proxy
editable: true
Brightstaff metrics
~~~~~~~~~~~~~~~~~~~
In addition to Envoy's stats on ``:9901``, the brightstaff dataplane
process exposes its own Prometheus endpoint on ``0.0.0.0:9092`` (override
with ``METRICS_BIND_ADDRESS``). It publishes:
* HTTP RED — ``brightstaff_http_requests_total``,
``brightstaff_http_request_duration_seconds``,
``brightstaff_http_in_flight_requests`` (labels: ``handler``, ``method``,
``status_class``).
* LLM upstream — ``brightstaff_llm_upstream_requests_total``,
``brightstaff_llm_upstream_duration_seconds``,
``brightstaff_llm_time_to_first_token_seconds``,
``brightstaff_llm_tokens_total`` (labels: ``provider``, ``model``,
``error_class``, ``kind``).
* Routing — ``brightstaff_router_decisions_total``,
``brightstaff_router_decision_duration_seconds``,
``brightstaff_routing_service_requests_total``,
``brightstaff_session_cache_events_total``.
* Process & build — ``process_resident_memory_bytes``,
``process_cpu_seconds_total``, ``brightstaff_build_info``.
A self-contained Prometheus + Grafana stack is shipped under
``config/grafana/``. With Plano already running on the host, bring it up
with one command:
.. code-block:: bash
cd config/grafana
docker compose up -d
open http://localhost:3000 # admin / admin (anonymous viewer also enabled)
Grafana auto-loads the Prometheus datasource and the brightstaff
dashboard (look under the *Plano* folder). Prometheus scrapes the host's
``:9092`` and ``:9901`` via ``host.docker.internal``.
Files:
* ``config/grafana/docker-compose.yaml`` — one-command Prom + Grafana
stack with provisioning.
* ``config/grafana/prometheus_scrape.yaml`` — complete Prometheus config
with ``envoy`` and ``brightstaff`` scrape jobs (mounted by the
compose).
* ``config/grafana/brightstaff_dashboard.json`` — 19-panel dashboard
across HTTP RED, LLM upstream, Routing service, and Process & Envoy
link rows. Auto-provisioned by the compose; can also be imported by
hand via *Dashboards → New → Import*.
* ``config/grafana/provisioning/`` — Grafana provisioning files for the
datasource and dashboard provider.

View file

@ -65,7 +65,7 @@ Create a ``docker-compose.yml`` file with the following configuration:
# docker-compose.yml
services:
plano:
image: katanemo/plano:0.4.19
image: katanemo/plano:0.4.20
container_name: plano
ports:
- "10000:10000" # ingress (client -> plano)
@ -153,7 +153,7 @@ Create a ``plano-deployment.yaml``:
spec:
containers:
- name: plano
image: katanemo/plano:0.4.19
image: katanemo/plano:0.4.20
ports:
- containerPort: 12000 # LLM gateway (chat completions, model routing)
name: llm-gateway

View file

@ -173,6 +173,9 @@ overrides:
llm_routing_model: Plano-Orchestrator
# Model used for agent orchestration (must be listed in model_providers)
agent_orchestration_model: Plano-Orchestrator
# Disable agentic signal analysis (frustration, repetition, escalation, etc.)
# on LLM responses to save CPU. Default: false.
disable_signals: false
# Model affinity — pin routing decisions for agentic loops
routing:

View file

@ -170,6 +170,7 @@ model_providers:
provider_interface: plano
overrides:
agent_orchestration_model: Plano-Orchestrator
disable_signals: false
llm_routing_model: Plano-Orchestrator
optimize_context_window: true
prompt_target_intent_matching_threshold: 0.7

2109
skills/AGENTS.md Normal file

File diff suppressed because it is too large Load diff

243
skills/README.md Normal file
View file

@ -0,0 +1,243 @@
# Plano Agent Skills
A structured repository of best practices for building agents and agentic applications with [Plano](https://github.com/katanemo/archgw) — the AI-native proxy and dataplane. Optimized for coding agents and LLMs.
## What Are Skills?
Skills are principle-based guides that help coding agents (Claude Code, Cursor, Copilot, etc.) make better decisions when working with Plano. They cover configuration patterns, routing strategies, agent orchestration, observability, and CLI workflows — acting as operating principles, not documentation replacements.
## Installing
```bash
# Install via npx skills
npx skills add katanemo/plano
```
This skills collection is published from the `skills/` directory in the `katanemo/plano` monorepo.
Install a specific skill:
```bash
npx skills add katanemo/plano --skill plano-routing-model-selection
```
List available skills before install:
```bash
npx skills add katanemo/plano --list
```
## Using Skills in Agents
After installation, these skills are available to your coding agent and can be invoked with normal language. You do not need special syntax unless your tooling requires it.
### Natural Language Invocation Examples
- "Use the Plano skills to validate this `config.yaml` and fix issues."
- "Apply Plano routing best practices to improve model/provider selection."
- "Review this agent listener config with the orchestration rules."
- "Refactor this filter chain to follow guardrail ordering best practices."
- "Audit this setup against Plano deployment and security recommendations."
### Prompting Tips for Better Results
- Name your goal and file: "Harden `config.yaml` for production."
- Ask for an action: "Generate a patch," "fix directly," or "explain the changes."
- Include runtime context when relevant: trace output, logs, listener errors.
- Ask for verification: "Run a final validation check after edits."
### Invoke by Skill Area (Optional)
- **Configuration:** "Use Plano configuration fundamentals on this config."
- **Routing:** "Use routing/model-selection skills to tune defaults and aliases."
- **Agent orchestration:** "Use agent orchestration skills to improve routing accuracy."
- **Filters/guardrails:** "Use filter-chain skills to harden input/output safety."
- **Observability:** "Use observability skills to add traceability and debug routing."
- **CLI/deployment:** "Use CLI and deployment skills to produce a startup checklist."
## Available Skills
- `plano-agent-skills` - Umbrella skill covering all Plano areas
- `plano-config-fundamentals` - Config versioning, listeners, providers, secrets
- `plano-routing-model-selection` - Defaults, aliases, passthrough auth, preferences
- `plano-agent-orchestration` - Agent registration and routing descriptions
- `plano-filter-guardrails` - MCP filters, guardrail messaging, filter ordering
- `plano-observability-debugging` - Tracing setup, span attributes, trace analysis
- `plano-cli-operations` - `planoai up`, `cli_agent`, init, prompt target generation
- `plano-deployment-security` - Docker networking, health checks, state storage
- `plano-advanced-patterns` - Multi-listener architecture and prompt target schema design
## Local Testing
```bash
# From repo root
npx skills add ./skills --list
npx skills add ./skills --skill plano-agent-skills -y
npx skills list
```
## Structure
```
skills/
├── rules/ # Individual rule files (one per rule)
│ ├── _sections.md # Section metadata and prefix definitions
│ ├── _template.md # Template for creating new rules
│ ├── config-*.md # Section 1: Configuration Fundamentals
│ ├── routing-*.md # Section 2: Routing & Model Selection
│ ├── agent-*.md # Section 3: Agent Orchestration
│ ├── filter-*.md # Section 4: Filter Chains & Guardrails
│ ├── observe-*.md # Section 5: Observability & Debugging
│ ├── cli-*.md # Section 6: CLI Operations
│ ├── deploy-*.md # Section 7: Deployment & Security
│ └── advanced-*.md # Section 8: Advanced Patterns
├── src/
│ ├── build.ts # Compiles rules/ into AGENTS.md
│ ├── validate.ts # Validates rule files
│ └── extract-tests.ts # Extracts test cases for LLM evaluation
├── metadata.json # Document metadata
├── AGENTS.md # Compiled output (generated — do not edit directly)
├── test-cases.json # Test cases for LLM evaluation (generated)
└── package.json
```
## Sections
| # | Prefix | Section | Rules |
|---|--------|---------|-------|
| 1 | `config-` | Configuration Fundamentals | Version, listeners, providers, secrets, timeouts |
| 2 | `routing-` | Routing & Model Selection | Preferences, aliases, defaults, passthrough |
| 3 | `agent-` | Agent Orchestration | Descriptions, agent registration |
| 4 | `filter-` | Filter Chains & Guardrails | Ordering, MCP integration, guardrails |
| 5 | `observe-` | Observability & Debugging | Tracing, trace inspection, span attributes |
| 6 | `cli-` | CLI Operations | Startup, CLI agent, init, code generation |
| 7 | `deploy-` | Deployment & Security | Docker networking, state storage, health checks |
| 8 | `advanced-` | Advanced Patterns | Prompt targets, rate limits, multi-listener |
## Getting Started
```bash
# Install dependencies
npm install
# Validate all rule files
npm run validate
# Build AGENTS.md from rules
npm run build
# Extract test cases for LLM evaluation
npm run extract-tests
# Run all of the above
npm run dev
```
## Creating a New Rule
1. Copy `rules/_template.md` to `rules/<prefix>-<description>.md`
2. Choose the correct prefix for your section:
- `config-` — Configuration Fundamentals
- `routing-` — Routing & Model Selection
- `agent-` — Agent Orchestration
- `filter-` — Filter Chains & Guardrails
- `observe-` — Observability & Debugging
- `cli-` — CLI Operations
- `deploy-` — Deployment & Security
- `advanced-` — Advanced Patterns
3. Fill in the frontmatter:
```yaml
---
title: Clear, Actionable Rule Title
impact: HIGH
impactDescription: One-line description of why this matters
tags: config, routing, relevant-tags
---
```
4. Write the rule body with:
- Brief explanation of the principle and why it matters
- **Incorrect** example (YAML config or CLI command showing the wrong pattern)
- **Correct** example (the right pattern with comments)
- Optional explanatory notes
5. Run `npm run dev` to validate and regenerate
## Rule File Structure
```markdown
---
title: Rule Title Here
impact: CRITICAL
impactDescription: One sentence on the impact
tags: tag1, tag2, tag3
---
## Rule Title Here
Brief explanation of the rule and why it matters for Plano developers.
**Incorrect (describe what's wrong):**
```yaml
# Bad example
```
**Correct (describe what's right):**
```yaml
# Good example with comments explaining the decisions
```
Optional explanatory text, lists, or tables.
Reference: https://github.com/katanemo/archgw
## Impact Levels
| Level | Description |
|-------|-------------|
| `CRITICAL` | Causes startup failures or silent misbehavior — always fix |
| `HIGH` | Significantly degrades routing accuracy, security, or reliability |
| `MEDIUM-HIGH` | Important for production deployments |
| `MEDIUM` | Best practice for maintainability and developer experience |
| `LOW-MEDIUM` | Incremental improvements |
| `LOW` | Nice to have |
## Key Rules at a Glance
- **Always set `version: v0.3.0`** — config is rejected without it
- **Use `host.docker.internal`** for agent/filter URLs — `localhost` doesn't work inside Docker
- **Set exactly one `default: true` provider** — unmatched requests need a fallback
- **Write specific routing preference descriptions** — vague descriptions cause misroutes
- **Order filter chains: guards → rewriters → context builders** — never build context before blocking bad input
- **Use `$VAR_NAME` for all secrets** — never hardcode API keys in config.yaml
- **Enable tracing with `--with-tracing`** — traces are the primary debugging tool
## Scripts
| Command | Description |
|---------|-------------|
| `npm run build` | Compile `rules/` into `AGENTS.md` |
| `npm run validate` | Validate all rule files for required fields and structure |
| `npm run extract-tests` | Generate `test-cases.json` for LLM evaluation |
| `npm run dev` | Validate + build + extract tests |
## Contributing
Rules are automatically sorted alphabetically by title within each section — no need to manage numbers. IDs (`1.1`, `1.2`, etc.) are assigned during build.
When adding rules:
1. Use the correct filename prefix for your section
2. Follow `_template.md` structure
3. Include clear bad/good YAML or CLI examples
4. Add relevant tags
5. Run `npm run dev` to validate and regenerate
## License
Apache-2.0 — see [LICENSE](../LICENSE)

8
skills/metadata.json Normal file
View file

@ -0,0 +1,8 @@
{
"version": "1.0.0",
"organization": "Plano",
"name": "plano-agent-skills",
"abstract": "Best practices for building agents and agentic applications with Plano — the AI-native proxy and dataplane. Covers configuration, routing, agent orchestration, filter chains, observability, CLI operations, and deployment patterns.",
"homepage": "https://github.com/katanemo/archgw",
"license": "Apache-2.0"
}

594
skills/package-lock.json generated Normal file
View file

@ -0,0 +1,594 @@
{
"name": "plano-agent-skills",
"version": "1.0.0",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "plano-agent-skills",
"version": "1.0.0",
"license": "Apache-2.0",
"devDependencies": {
"@types/node": "^24.3.0",
"tsx": "^4.20.5",
"typescript": "^5.9.2"
},
"engines": {
"node": ">=18.0.0"
}
},
"node_modules/@esbuild/aix-ppc64": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.27.3.tgz",
"integrity": "sha512-9fJMTNFTWZMh5qwrBItuziu834eOCUcEqymSH7pY+zoMVEZg3gcPuBNxH1EvfVYe9h0x/Ptw8KBzv7qxb7l8dg==",
"cpu": [
"ppc64"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"aix"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/android-arm": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.27.3.tgz",
"integrity": "sha512-i5D1hPY7GIQmXlXhs2w8AWHhenb00+GxjxRncS2ZM7YNVGNfaMxgzSGuO8o8SJzRc/oZwU2bcScvVERk03QhzA==",
"cpu": [
"arm"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"android"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/android-arm64": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.27.3.tgz",
"integrity": "sha512-YdghPYUmj/FX2SYKJ0OZxf+iaKgMsKHVPF1MAq/P8WirnSpCStzKJFjOjzsW0QQ7oIAiccHdcqjbHmJxRb/dmg==",
"cpu": [
"arm64"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"android"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/android-x64": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.27.3.tgz",
"integrity": "sha512-IN/0BNTkHtk8lkOM8JWAYFg4ORxBkZQf9zXiEOfERX/CzxW3Vg1ewAhU7QSWQpVIzTW+b8Xy+lGzdYXV6UZObQ==",
"cpu": [
"x64"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"android"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/darwin-arm64": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.27.3.tgz",
"integrity": "sha512-Re491k7ByTVRy0t3EKWajdLIr0gz2kKKfzafkth4Q8A5n1xTHrkqZgLLjFEHVD+AXdUGgQMq+Godfq45mGpCKg==",
"cpu": [
"arm64"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/darwin-x64": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.27.3.tgz",
"integrity": "sha512-vHk/hA7/1AckjGzRqi6wbo+jaShzRowYip6rt6q7VYEDX4LEy1pZfDpdxCBnGtl+A5zq8iXDcyuxwtv3hNtHFg==",
"cpu": [
"x64"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/freebsd-arm64": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.27.3.tgz",
"integrity": "sha512-ipTYM2fjt3kQAYOvo6vcxJx3nBYAzPjgTCk7QEgZG8AUO3ydUhvelmhrbOheMnGOlaSFUoHXB6un+A7q4ygY9w==",
"cpu": [
"arm64"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"freebsd"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/freebsd-x64": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.27.3.tgz",
"integrity": "sha512-dDk0X87T7mI6U3K9VjWtHOXqwAMJBNN2r7bejDsc+j03SEjtD9HrOl8gVFByeM0aJksoUuUVU9TBaZa2rgj0oA==",
"cpu": [
"x64"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"freebsd"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/linux-arm": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.27.3.tgz",
"integrity": "sha512-s6nPv2QkSupJwLYyfS+gwdirm0ukyTFNl3KTgZEAiJDd+iHZcbTPPcWCcRYH+WlNbwChgH2QkE9NSlNrMT8Gfw==",
"cpu": [
"arm"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/linux-arm64": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.27.3.tgz",
"integrity": "sha512-sZOuFz/xWnZ4KH3YfFrKCf1WyPZHakVzTiqji3WDc0BCl2kBwiJLCXpzLzUBLgmp4veFZdvN5ChW4Eq/8Fc2Fg==",
"cpu": [
"arm64"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/linux-ia32": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.27.3.tgz",
"integrity": "sha512-yGlQYjdxtLdh0a3jHjuwOrxQjOZYD/C9PfdbgJJF3TIZWnm/tMd/RcNiLngiu4iwcBAOezdnSLAwQDPqTmtTYg==",
"cpu": [
"ia32"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/linux-loong64": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.27.3.tgz",
"integrity": "sha512-WO60Sn8ly3gtzhyjATDgieJNet/KqsDlX5nRC5Y3oTFcS1l0KWba+SEa9Ja1GfDqSF1z6hif/SkpQJbL63cgOA==",
"cpu": [
"loong64"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/linux-mips64el": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.27.3.tgz",
"integrity": "sha512-APsymYA6sGcZ4pD6k+UxbDjOFSvPWyZhjaiPyl/f79xKxwTnrn5QUnXR5prvetuaSMsb4jgeHewIDCIWljrSxw==",
"cpu": [
"mips64el"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/linux-ppc64": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.27.3.tgz",
"integrity": "sha512-eizBnTeBefojtDb9nSh4vvVQ3V9Qf9Df01PfawPcRzJH4gFSgrObw+LveUyDoKU3kxi5+9RJTCWlj4FjYXVPEA==",
"cpu": [
"ppc64"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/linux-riscv64": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.27.3.tgz",
"integrity": "sha512-3Emwh0r5wmfm3ssTWRQSyVhbOHvqegUDRd0WhmXKX2mkHJe1SFCMJhagUleMq+Uci34wLSipf8Lagt4LlpRFWQ==",
"cpu": [
"riscv64"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/linux-s390x": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.27.3.tgz",
"integrity": "sha512-pBHUx9LzXWBc7MFIEEL0yD/ZVtNgLytvx60gES28GcWMqil8ElCYR4kvbV2BDqsHOvVDRrOxGySBM9Fcv744hw==",
"cpu": [
"s390x"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/linux-x64": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.27.3.tgz",
"integrity": "sha512-Czi8yzXUWIQYAtL/2y6vogER8pvcsOsk5cpwL4Gk5nJqH5UZiVByIY8Eorm5R13gq+DQKYg0+JyQoytLQas4dA==",
"cpu": [
"x64"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/netbsd-arm64": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.27.3.tgz",
"integrity": "sha512-sDpk0RgmTCR/5HguIZa9n9u+HVKf40fbEUt+iTzSnCaGvY9kFP0YKBWZtJaraonFnqef5SlJ8/TiPAxzyS+UoA==",
"cpu": [
"arm64"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"netbsd"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/netbsd-x64": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.27.3.tgz",
"integrity": "sha512-P14lFKJl/DdaE00LItAukUdZO5iqNH7+PjoBm+fLQjtxfcfFE20Xf5CrLsmZdq5LFFZzb5JMZ9grUwvtVYzjiA==",
"cpu": [
"x64"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"netbsd"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/openbsd-arm64": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.27.3.tgz",
"integrity": "sha512-AIcMP77AvirGbRl/UZFTq5hjXK+2wC7qFRGoHSDrZ5v5b8DK/GYpXW3CPRL53NkvDqb9D+alBiC/dV0Fb7eJcw==",
"cpu": [
"arm64"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"openbsd"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/openbsd-x64": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.27.3.tgz",
"integrity": "sha512-DnW2sRrBzA+YnE70LKqnM3P+z8vehfJWHXECbwBmH/CU51z6FiqTQTHFenPlHmo3a8UgpLyH3PT+87OViOh1AQ==",
"cpu": [
"x64"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"openbsd"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/openharmony-arm64": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/openharmony-arm64/-/openharmony-arm64-0.27.3.tgz",
"integrity": "sha512-NinAEgr/etERPTsZJ7aEZQvvg/A6IsZG/LgZy+81wON2huV7SrK3e63dU0XhyZP4RKGyTm7aOgmQk0bGp0fy2g==",
"cpu": [
"arm64"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"openharmony"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/sunos-x64": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.27.3.tgz",
"integrity": "sha512-PanZ+nEz+eWoBJ8/f8HKxTTD172SKwdXebZ0ndd953gt1HRBbhMsaNqjTyYLGLPdoWHy4zLU7bDVJztF5f3BHA==",
"cpu": [
"x64"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"sunos"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/win32-arm64": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.27.3.tgz",
"integrity": "sha512-B2t59lWWYrbRDw/tjiWOuzSsFh1Y/E95ofKz7rIVYSQkUYBjfSgf6oeYPNWHToFRr2zx52JKApIcAS/D5TUBnA==",
"cpu": [
"arm64"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"win32"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/win32-ia32": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.27.3.tgz",
"integrity": "sha512-QLKSFeXNS8+tHW7tZpMtjlNb7HKau0QDpwm49u0vUp9y1WOF+PEzkU84y9GqYaAVW8aH8f3GcBck26jh54cX4Q==",
"cpu": [
"ia32"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"win32"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@esbuild/win32-x64": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.27.3.tgz",
"integrity": "sha512-4uJGhsxuptu3OcpVAzli+/gWusVGwZZHTlS63hh++ehExkVT8SgiEf7/uC/PclrPPkLhZqGgCTjd0VWLo6xMqA==",
"cpu": [
"x64"
],
"dev": true,
"license": "MIT",
"optional": true,
"os": [
"win32"
],
"engines": {
"node": ">=18"
}
},
"node_modules/@types/node": {
"version": "24.11.0",
"resolved": "https://registry.npmjs.org/@types/node/-/node-24.11.0.tgz",
"integrity": "sha512-fPxQqz4VTgPI/IQ+lj9r0h+fDR66bzoeMGHp8ASee+32OSGIkeASsoZuJixsQoVef1QJbeubcPBxKk22QVoWdw==",
"dev": true,
"license": "MIT",
"dependencies": {
"undici-types": "~7.16.0"
}
},
"node_modules/esbuild": {
"version": "0.27.3",
"resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.27.3.tgz",
"integrity": "sha512-8VwMnyGCONIs6cWue2IdpHxHnAjzxnw2Zr7MkVxB2vjmQ2ivqGFb4LEG3SMnv0Gb2F/G/2yA8zUaiL1gywDCCg==",
"dev": true,
"hasInstallScript": true,
"license": "MIT",
"bin": {
"esbuild": "bin/esbuild"
},
"engines": {
"node": ">=18"
},
"optionalDependencies": {
"@esbuild/aix-ppc64": "0.27.3",
"@esbuild/android-arm": "0.27.3",
"@esbuild/android-arm64": "0.27.3",
"@esbuild/android-x64": "0.27.3",
"@esbuild/darwin-arm64": "0.27.3",
"@esbuild/darwin-x64": "0.27.3",
"@esbuild/freebsd-arm64": "0.27.3",
"@esbuild/freebsd-x64": "0.27.3",
"@esbuild/linux-arm": "0.27.3",
"@esbuild/linux-arm64": "0.27.3",
"@esbuild/linux-ia32": "0.27.3",
"@esbuild/linux-loong64": "0.27.3",
"@esbuild/linux-mips64el": "0.27.3",
"@esbuild/linux-ppc64": "0.27.3",
"@esbuild/linux-riscv64": "0.27.3",
"@esbuild/linux-s390x": "0.27.3",
"@esbuild/linux-x64": "0.27.3",
"@esbuild/netbsd-arm64": "0.27.3",
"@esbuild/netbsd-x64": "0.27.3",
"@esbuild/openbsd-arm64": "0.27.3",
"@esbuild/openbsd-x64": "0.27.3",
"@esbuild/openharmony-arm64": "0.27.3",
"@esbuild/sunos-x64": "0.27.3",
"@esbuild/win32-arm64": "0.27.3",
"@esbuild/win32-ia32": "0.27.3",
"@esbuild/win32-x64": "0.27.3"
}
},
"node_modules/fsevents": {
"version": "2.3.3",
"resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz",
"integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==",
"dev": true,
"hasInstallScript": true,
"license": "MIT",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": "^8.16.0 || ^10.6.0 || >=11.0.0"
}
},
"node_modules/get-tsconfig": {
"version": "4.13.6",
"resolved": "https://registry.npmjs.org/get-tsconfig/-/get-tsconfig-4.13.6.tgz",
"integrity": "sha512-shZT/QMiSHc/YBLxxOkMtgSid5HFoauqCE3/exfsEcwg1WkeqjG+V40yBbBrsD+jW2HDXcs28xOfcbm2jI8Ddw==",
"dev": true,
"license": "MIT",
"dependencies": {
"resolve-pkg-maps": "^1.0.0"
},
"funding": {
"url": "https://github.com/privatenumber/get-tsconfig?sponsor=1"
}
},
"node_modules/resolve-pkg-maps": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/resolve-pkg-maps/-/resolve-pkg-maps-1.0.0.tgz",
"integrity": "sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==",
"dev": true,
"license": "MIT",
"funding": {
"url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1"
}
},
"node_modules/tsx": {
"version": "4.21.0",
"resolved": "https://registry.npmjs.org/tsx/-/tsx-4.21.0.tgz",
"integrity": "sha512-5C1sg4USs1lfG0GFb2RLXsdpXqBSEhAaA/0kPL01wxzpMqLILNxIxIOKiILz+cdg/pLnOUxFYOR5yhHU666wbw==",
"dev": true,
"license": "MIT",
"dependencies": {
"esbuild": "~0.27.0",
"get-tsconfig": "^4.7.5"
},
"bin": {
"tsx": "dist/cli.mjs"
},
"engines": {
"node": ">=18.0.0"
},
"optionalDependencies": {
"fsevents": "~2.3.3"
}
},
"node_modules/typescript": {
"version": "5.9.3",
"resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz",
"integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==",
"dev": true,
"license": "Apache-2.0",
"bin": {
"tsc": "bin/tsc",
"tsserver": "bin/tsserver"
},
"engines": {
"node": ">=14.17"
}
},
"node_modules/undici-types": {
"version": "7.16.0",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz",
"integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==",
"dev": true,
"license": "MIT"
}
}
}

31
skills/package.json Normal file
View file

@ -0,0 +1,31 @@
{
"name": "plano-agent-skills",
"version": "1.0.0",
"description": "Best practices for building agents and agentic applications with Plano — installable via npx skills add",
"type": "module",
"scripts": {
"typecheck": "tsc --noEmit",
"build": "tsx src/build.ts",
"validate": "tsx src/validate.ts",
"extract-tests": "tsx src/extract-tests.ts",
"dev": "npm run typecheck && npm run validate && npm run build && npm run extract-tests"
},
"keywords": [
"plano",
"archgw",
"ai-gateway",
"agent",
"llm",
"skills",
"best-practices"
],
"license": "Apache-2.0",
"engines": {
"node": ">=18.0.0"
},
"devDependencies": {
"@types/node": "^24.3.0",
"tsx": "^4.20.5",
"typescript": "^5.9.2"
}
}

View file

@ -0,0 +1,32 @@
---
name: plano-advanced-patterns
description: Design advanced Plano architectures. Use for multi-listener systems, prompt target schema quality, and layered orchestration patterns.
license: Apache-2.0
metadata:
author: katanemo
version: "1.0.0"
---
# Plano Advanced Patterns
Use this skill for higher-order architecture decisions once fundamentals are stable.
## When To Use
- "Design a multi-listener Plano architecture"
- "Improve prompt target schema precision"
- "Combine model, prompt, and agent listeners"
- "Refine advanced routing/function-calling behavior"
## Apply These Rules
- `advanced-multi-listener`
- `advanced-prompt-targets`
## Execution Checklist
1. Use multiple listeners only when interfaces are truly distinct.
2. Keep provider/routing definitions shared and consistent.
3. Define prompt target parameters with strict, explicit schemas.
4. Minimize ambiguity that causes malformed tool calls.
5. Provide migration-safe recommendations and test scenarios.

View file

@ -0,0 +1,32 @@
---
name: plano-agent-orchestration
description: Improve multi-agent orchestration in Plano. Use for agent registration, agent listener wiring, and capability-focused agent descriptions for accurate routing.
license: Apache-2.0
metadata:
author: katanemo
version: "1.0.0"
---
# Plano Agent Orchestration
Use this skill for agent listener quality, sub-agent registration, and route accuracy.
## When To Use
- "Fix multi-agent routing"
- "Validate agents vs listeners.agents config"
- "Improve agent descriptions"
- "Set up a reliable orchestrator"
## Apply These Rules
- `agent-orchestration`
- `agent-descriptions`
## Execution Checklist
1. Verify each agent exists in both `agents` and `listeners[].agents`.
2. Ensure one fallback/default agent where appropriate.
3. Rewrite descriptions to be capability-focused and non-overlapping.
4. Keep descriptions specific, concise, and example-driven.
5. Provide test prompts to validate routing outcomes.

View file

@ -0,0 +1,53 @@
---
name: plano-agent-skills
description: Best practices for building agents and agentic applications with Plano, including configuration, routing, orchestration, guardrails, observability, and deployment.
license: Apache-2.0
metadata:
author: katanemo
version: "1.0.0"
---
# Plano Agent Skills
Comprehensive Plano guidance for coding agents. Use this umbrella skill when a task spans multiple areas (config, routing, orchestration, filters, observability, CLI, deployment).
## When To Use
- Validating or fixing Plano `config.yaml`
- Designing listener architecture (`model`, `prompt`, `agent`)
- Improving model/provider routing quality and fallback behavior
- Hardening filter chains and prompt guardrails
- Debugging routing with traces and CLI workflows
- Preparing deployment and production readiness checks
## How To Use
1. Classify the request by scope (single section vs. cross-cutting).
2. For focused work, prefer a section-specific skill (for example `plano-routing-model-selection`).
3. For broad work, apply this umbrella skill and reference section rules from `skills/AGENTS.md`.
4. Produce concrete edits first, then concise reasoning and validation steps.
## Operating Workflow
1. Identify the task area first: config, routing, orchestration, filters, observability, CLI, or deployment.
2. Apply the smallest correct change that satisfies the requested behavior.
3. Preserve security and reliability defaults:
- `version: v0.3.0`
- exactly one `default: true` model provider
- secrets via `$ENV_VAR` substitution only
- `host.docker.internal` for host services from inside Docker
- guardrails before enrichment in filter chains
4. For debugging, prioritize traces over guesswork (`planoai up --with-tracing`, `planoai trace`).
5. Return concrete diffs and a short validation checklist.
## Response Style
- Prefer actionable edits over generic advice.
- Be explicit about why a config choice is correct.
- Call out risky patterns (hardcoded secrets, missing default provider, bad filter ordering).
- Keep examples minimal and production-viable.
## References
- Repo: https://github.com/katanemo/plano
- Full rulebook: `skills/AGENTS.md`

View file

@ -0,0 +1,34 @@
---
name: plano-cli-operations
description: Apply Plano CLI best practices. Use for startup troubleshooting, cli_agent workflows, prompt target generation, and template-based project bootstrapping.
license: Apache-2.0
metadata:
author: katanemo
version: "1.0.0"
---
# Plano CLI Operations
Use this skill when the task is primarily operational and CLI-driven.
## When To Use
- "Fix `planoai up` failures"
- "Use `planoai cli_agent` with coding agents"
- "Generate prompt targets from Python functions"
- "Bootstrap a project with `planoai init` templates"
## Apply These Rules
- `cli-startup`
- `cli-agent`
- `cli-generate`
- `cli-init`
## Execution Checklist
1. Follow startup validation order before deep debugging.
2. Use `cli_agent` to route coding-agent traffic through Plano.
3. Generate the prompt target schema, then wire endpoint details explicitly.
4. Start from templates for reliable first-time setup.
5. Provide a compact runbook with exact CLI commands.

View file

@ -0,0 +1,34 @@
---
name: plano-config-fundamentals
description: Validate and fix Plano config fundamentals. Use for config versioning, listener types, provider registration, secrets handling, and startup validation failures.
license: Apache-2.0
metadata:
author: katanemo
version: "1.0.0"
---
# Plano Configuration Fundamentals
Use this skill for foundational `config.yaml` correctness.
## When To Use
- "Validate this Plano config"
- "Fix startup config errors"
- "Check listeners/providers/secrets"
- "Why does `planoai up` fail schema validation?"
## Apply These Rules
- `config-version`
- `config-listeners`
- `config-providers`
- `config-secrets`
## Execution Checklist
1. Ensure `version: v0.3.0` is present.
2. Confirm listener type matches intended architecture.
3. Verify provider names/interfaces and exactly one default provider.
4. Replace hardcoded secrets with `$ENV_VAR` substitution.
5. Return minimal patch and a `planoai up` verification plan.

View file

@ -0,0 +1,33 @@
---
name: plano-deployment-security
description: Apply Plano deployment and production security practices. Use for Docker networking, state storage choices, readiness checks, and environment-based secret handling.
license: Apache-2.0
metadata:
author: katanemo
version: "1.0.0"
---
# Plano Deployment and Security
Use this skill to harden production deployments and reduce runtime surprises.
## When To Use
- "Fix unreachable agents in Docker"
- "Configure persistent conversation state"
- "Add readiness and health checks"
- "Prepare production deployment checklist"
## Apply These Rules
- `deploy-docker`
- `deploy-state`
- `deploy-health`
## Execution Checklist
1. Use `host.docker.internal` for host-side services from inside Plano container.
2. Prefer PostgreSQL state storage for production multi-turn workloads.
3. Verify `/healthz` before traffic or CI assertions.
4. Ensure secrets remain environment-based, never hardcoded.
5. Return deployment checks with failure-mode diagnostics.

View file

@ -0,0 +1,33 @@
---
name: plano-filter-guardrails
description: Harden Plano filter chains and guardrails. Use for MCP filter setup, prompt guard responses, and safe filter ordering.
license: Apache-2.0
metadata:
author: katanemo
version: "1.0.0"
---
# Plano Filter Chains and Guardrails
Use this skill when safety controls or filter pipelines need correction.
## When To Use
- "Fix filter chain ordering"
- "Set up MCP filters correctly"
- "Improve guardrail rejection behavior"
- "Harden request processing for safety"
## Apply These Rules
- `filter-mcp`
- `filter-guardrails`
- `filter-ordering`
## Execution Checklist
1. Configure filter `type`, `transport`, and `tool` explicitly for MCP.
2. Ensure rejection messages are clear and actionable.
3. Order chain as guards -> rewriters -> enrichment -> output checks.
4. Prevent expensive enrichment on unsafe requests.
5. Verify with representative blocked and allowed test prompts.

View file

@ -0,0 +1,33 @@
---
name: plano-observability-debugging
description: Improve Plano tracing and debugging workflows. Use for sampling strategy, span attributes, and trace query-based root-cause analysis.
license: Apache-2.0
metadata:
author: katanemo
version: "1.0.0"
---
# Plano Observability and Debugging
Use this skill to make routing and latency behavior inspectable and debuggable.
## When To Use
- "Enable tracing correctly"
- "Add useful span attributes"
- "Debug why a request routed incorrectly"
- "Inspect filter/model latency from traces"
## Apply These Rules
- `observe-tracing`
- `observe-span-attributes`
- `observe-trace-query`
## Execution Checklist
1. Enable tracing with environment-appropriate sampling.
2. Add useful static and header-derived span attributes.
3. Use `planoai trace` filters to isolate route and latency issues.
4. Prefer trace evidence over assumptions in recommendations.
5. Return exact commands to reproduce and validate findings.

View file

@ -0,0 +1,34 @@
---
name: plano-routing-model-selection
description: Optimize Plano model routing and selection. Use for provider defaults, model aliases, passthrough auth, and routing preference quality.
license: Apache-2.0
metadata:
author: katanemo
version: "1.0.0"
---
# Plano Routing and Model Selection
Use this skill when requests are routed to the wrong model, costs are high, or fallback behavior is unclear.
## When To Use
- "Improve model routing"
- "Add aliases and defaults"
- "Fix passthrough auth with proxy providers"
- "Tune routing preferences for better classification"
## Apply These Rules
- `routing-default`
- `routing-aliases`
- `routing-passthrough`
- `routing-preferences`
## Execution Checklist
1. Ensure exactly one `default: true` provider.
2. Add semantic aliases for stable client contracts.
3. Configure passthrough auth only where required.
4. Rewrite vague preference descriptions with concrete task scopes.
5. Validate routing behavior using trace-based checks.

Some files were not shown because too many files have changed in this diff. Show more