diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9e8d3223..0882479d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -133,13 +133,13 @@ jobs: load: true tags: | ${{ env.PLANO_DOCKER_IMAGE }} - ${{ env.DOCKER_IMAGE }}:0.4.19 + ${{ env.DOCKER_IMAGE }}:0.4.21 ${{ env.DOCKER_IMAGE }}:latest cache-from: type=gha cache-to: type=gha,mode=max - name: Save image as artifact - run: docker save ${{ env.PLANO_DOCKER_IMAGE }} ${{ env.DOCKER_IMAGE }}:0.4.19 ${{ env.DOCKER_IMAGE }}:latest -o /tmp/plano-image.tar + run: docker save ${{ env.PLANO_DOCKER_IMAGE }} ${{ env.DOCKER_IMAGE }}:0.4.21 ${{ env.DOCKER_IMAGE }}:latest -o /tmp/plano-image.tar - name: Upload image artifact uses: actions/upload-artifact@v6 diff --git a/apps/www/src/components/Hero.tsx b/apps/www/src/components/Hero.tsx index 05e615b9..aa9a2298 100644 --- a/apps/www/src/components/Hero.tsx +++ b/apps/www/src/components/Hero.tsx @@ -24,7 +24,7 @@ export function Hero() { >
- v0.4.19 + v0.4.21 — diff --git a/build_filter_image.sh b/build_filter_image.sh index 73e51b61..a0dd2498 100644 --- a/build_filter_image.sh +++ b/build_filter_image.sh @@ -1 +1 @@ -docker build -f Dockerfile . -t katanemo/plano -t katanemo/plano:0.4.19 +docker build -f Dockerfile . -t katanemo/plano -t katanemo/plano:0.4.21 diff --git a/cli/planoai/__init__.py b/cli/planoai/__init__.py index 2492d40c..ac0015d7 100644 --- a/cli/planoai/__init__.py +++ b/cli/planoai/__init__.py @@ -1,3 +1,3 @@ """Plano CLI - Intelligent Prompt Gateway.""" -__version__ = "0.4.19" +__version__ = "0.4.21" diff --git a/cli/planoai/chatgpt_auth.py b/cli/planoai/chatgpt_auth.py new file mode 100644 index 00000000..dbbde3ac --- /dev/null +++ b/cli/planoai/chatgpt_auth.py @@ -0,0 +1,290 @@ +""" +ChatGPT subscription OAuth device-flow authentication. + +Implements the device code flow used by OpenAI Codex CLI to authenticate +with a ChatGPT Plus/Pro subscription. Tokens are stored locally in +~/.plano/chatgpt/auth.json and auto-refreshed when expired. +""" + +import base64 +import json +import os +import time +from typing import Any, Dict, Optional, Tuple + +import requests + +from planoai.consts import PLANO_HOME + +# OAuth + API constants (derived from openai/codex) +CHATGPT_AUTH_BASE = "https://auth.openai.com" +CHATGPT_DEVICE_CODE_URL = f"{CHATGPT_AUTH_BASE}/api/accounts/deviceauth/usercode" +CHATGPT_DEVICE_TOKEN_URL = f"{CHATGPT_AUTH_BASE}/api/accounts/deviceauth/token" +CHATGPT_OAUTH_TOKEN_URL = f"{CHATGPT_AUTH_BASE}/oauth/token" +CHATGPT_DEVICE_VERIFY_URL = f"{CHATGPT_AUTH_BASE}/codex/device" +CHATGPT_API_BASE = "https://chatgpt.com/backend-api/codex" +CHATGPT_CLIENT_ID = "app_EMoamEEZ73f0CkXaXp7hrann" + +# Local storage +CHATGPT_AUTH_DIR = os.path.join(PLANO_HOME, "chatgpt") +CHATGPT_AUTH_FILE = os.path.join(CHATGPT_AUTH_DIR, "auth.json") + +# Timeouts +TOKEN_EXPIRY_SKEW_SECONDS = 60 +DEVICE_CODE_TIMEOUT_SECONDS = 15 * 60 +DEVICE_CODE_POLL_SECONDS = 5 + + +def _ensure_auth_dir(): + os.makedirs(CHATGPT_AUTH_DIR, exist_ok=True) + + +def load_auth() -> Optional[Dict[str, Any]]: + """Load auth data from disk.""" + try: + with open(CHATGPT_AUTH_FILE, "r") as f: + return json.load(f) + except (IOError, json.JSONDecodeError): + return None + + +def save_auth(data: Dict[str, Any]): + """Save auth data to disk.""" + _ensure_auth_dir() + fd = os.open(CHATGPT_AUTH_FILE, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600) + with os.fdopen(fd, "w") as f: + json.dump(data, f, indent=2) + + +def delete_auth(): + """Remove stored credentials.""" + try: + os.remove(CHATGPT_AUTH_FILE) + except FileNotFoundError: + pass + + +def _decode_jwt_claims(token: str) -> Dict[str, Any]: + """Decode JWT payload without verification.""" + try: + parts = token.split(".") + if len(parts) < 2: + return {} + payload_b64 = parts[1] + payload_b64 += "=" * (-len(payload_b64) % 4) + return json.loads(base64.urlsafe_b64decode(payload_b64).decode("utf-8")) + except Exception: + return {} + + +def _get_expires_at(token: str) -> Optional[int]: + """Extract expiration time from JWT.""" + claims = _decode_jwt_claims(token) + exp = claims.get("exp") + return int(exp) if isinstance(exp, (int, float)) else None + + +def _extract_account_id(token: Optional[str]) -> Optional[str]: + """Extract ChatGPT account ID from JWT claims.""" + if not token: + return None + claims = _decode_jwt_claims(token) + auth_claims = claims.get("https://api.openai.com/auth") + if isinstance(auth_claims, dict): + account_id = auth_claims.get("chatgpt_account_id") + if 
isinstance(account_id, str) and account_id: + return account_id + return None + + +def _is_token_expired(auth_data: Dict[str, Any]) -> bool: + """Check if the access token is expired.""" + expires_at = auth_data.get("expires_at") + if expires_at is None: + access_token = auth_data.get("access_token") + if access_token: + expires_at = _get_expires_at(access_token) + if expires_at: + auth_data["expires_at"] = expires_at + save_auth(auth_data) + if expires_at is None: + return True + return time.time() >= float(expires_at) - TOKEN_EXPIRY_SKEW_SECONDS + + +def _refresh_tokens(refresh_token: str) -> Dict[str, str]: + """Refresh the access token using the refresh token.""" + resp = requests.post( + CHATGPT_OAUTH_TOKEN_URL, + json={ + "client_id": CHATGPT_CLIENT_ID, + "grant_type": "refresh_token", + "refresh_token": refresh_token, + "scope": "openid profile email", + }, + ) + resp.raise_for_status() + data = resp.json() + + access_token = data.get("access_token") + id_token = data.get("id_token") + if not access_token or not id_token: + raise RuntimeError(f"Refresh response missing fields: {data}") + + return { + "access_token": access_token, + "refresh_token": data.get("refresh_token", refresh_token), + "id_token": id_token, + } + + +def _build_auth_record(tokens: Dict[str, str]) -> Dict[str, Any]: + """Build the auth record to persist.""" + access_token = tokens.get("access_token") + id_token = tokens.get("id_token") + expires_at = _get_expires_at(access_token) if access_token else None + account_id = _extract_account_id(id_token or access_token) + return { + "access_token": access_token, + "refresh_token": tokens.get("refresh_token"), + "id_token": id_token, + "expires_at": expires_at, + "account_id": account_id, + } + + +def request_device_code() -> Dict[str, str]: + """Request a device code from OpenAI's device auth endpoint.""" + resp = requests.post( + CHATGPT_DEVICE_CODE_URL, + json={"client_id": CHATGPT_CLIENT_ID}, + ) + resp.raise_for_status() + data = resp.json() + + device_auth_id = data.get("device_auth_id") + user_code = data.get("user_code") or data.get("usercode") + interval = data.get("interval") + if not device_auth_id or not user_code: + raise RuntimeError(f"Device code response missing fields: {data}") + + return { + "device_auth_id": device_auth_id, + "user_code": user_code, + "interval": str(interval or "5"), + } + + +def poll_for_authorization(device_code: Dict[str, str]) -> Dict[str, str]: + """Poll until the user completes authorization. 
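+    Treats 403/404 responses from the token endpoint as "authorization pending",
+    retrying every max(interval, DEVICE_CODE_POLL_SECONDS) seconds and giving up
+    after DEVICE_CODE_TIMEOUT_SECONDS.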
Returns code_data.""" + interval = int(device_code.get("interval", "5")) + start_time = time.time() + + while time.time() - start_time < DEVICE_CODE_TIMEOUT_SECONDS: + try: + resp = requests.post( + CHATGPT_DEVICE_TOKEN_URL, + json={ + "device_auth_id": device_code["device_auth_id"], + "user_code": device_code["user_code"], + }, + ) + if resp.status_code == 200: + data = resp.json() + if all( + key in data + for key in ("authorization_code", "code_challenge", "code_verifier") + ): + return data + if resp.status_code in (403, 404): + time.sleep(max(interval, DEVICE_CODE_POLL_SECONDS)) + continue + resp.raise_for_status() + except requests.HTTPError as exc: + if exc.response is not None and exc.response.status_code in (403, 404): + time.sleep(max(interval, DEVICE_CODE_POLL_SECONDS)) + continue + raise RuntimeError(f"Polling failed: {exc}") from exc + + time.sleep(max(interval, DEVICE_CODE_POLL_SECONDS)) + + raise RuntimeError("Timed out waiting for device authorization") + + +def exchange_code_for_tokens(code_data: Dict[str, str]) -> Dict[str, str]: + """Exchange the authorization code for access/refresh/id tokens.""" + redirect_uri = f"{CHATGPT_AUTH_BASE}/deviceauth/callback" + body = ( + "grant_type=authorization_code" + f"&code={code_data['authorization_code']}" + f"&redirect_uri={redirect_uri}" + f"&client_id={CHATGPT_CLIENT_ID}" + f"&code_verifier={code_data['code_verifier']}" + ) + resp = requests.post( + CHATGPT_OAUTH_TOKEN_URL, + headers={"Content-Type": "application/x-www-form-urlencoded"}, + data=body, + ) + resp.raise_for_status() + data = resp.json() + + if not all(key in data for key in ("access_token", "refresh_token", "id_token")): + raise RuntimeError(f"Token exchange response missing fields: {data}") + + return { + "access_token": data["access_token"], + "refresh_token": data["refresh_token"], + "id_token": data["id_token"], + } + + +def login() -> Dict[str, Any]: + """Run the full device code login flow. Returns the auth record.""" + device_code = request_device_code() + auth_record = _build_auth_record({}) + auth_record["device_code_requested_at"] = time.time() + save_auth(auth_record) + + print( + "\nSign in with your ChatGPT account:\n" + f" 1) Visit: {CHATGPT_DEVICE_VERIFY_URL}\n" + f" 2) Enter code: {device_code['user_code']}\n\n" + "Device codes are a common phishing target. Never share this code.\n", + flush=True, + ) + + code_data = poll_for_authorization(device_code) + tokens = exchange_code_for_tokens(code_data) + auth_record = _build_auth_record(tokens) + save_auth(auth_record) + return auth_record + + +def get_access_token() -> Tuple[str, Optional[str]]: + """ + Get a valid access token and account ID. + Refreshes automatically if expired. Raises if no auth data exists. + Returns (access_token, account_id). + """ + auth_data = load_auth() + if not auth_data: + raise RuntimeError( + "No ChatGPT credentials found. Run 'planoai chatgpt login' first." + ) + + access_token = auth_data.get("access_token") + if access_token and not _is_token_expired(auth_data): + return access_token, auth_data.get("account_id") + + # Try refresh + refresh_token = auth_data.get("refresh_token") + if refresh_token: + tokens = _refresh_tokens(refresh_token) + auth_record = _build_auth_record(tokens) + save_auth(auth_record) + return auth_record["access_token"], auth_record.get("account_id") + + raise RuntimeError( + "ChatGPT token expired and refresh failed. Run 'planoai chatgpt login' again." 
+    )
diff --git a/cli/planoai/chatgpt_cmd.py b/cli/planoai/chatgpt_cmd.py
new file mode 100644
index 00000000..b61068c4
--- /dev/null
+++ b/cli/planoai/chatgpt_cmd.py
@@ -0,0 +1,86 @@
+"""
+CLI commands for ChatGPT subscription management.
+
+Usage:
+    planoai chatgpt login  - Authenticate with ChatGPT via device code flow
+    planoai chatgpt status - Check authentication status
+    planoai chatgpt logout - Remove stored credentials
+"""
+
+import datetime
+
+import click
+from rich.console import Console
+
+from planoai import chatgpt_auth
+
+console = Console()
+
+
+@click.group()
+def chatgpt():
+    """ChatGPT subscription management."""
+    pass
+
+
+@chatgpt.command()
+def login():
+    """Authenticate with your ChatGPT subscription using device code flow."""
+    try:
+        auth_record = chatgpt_auth.login()
+        account_id = auth_record.get("account_id", "unknown")
+        console.print(
+            "\n[green]Successfully authenticated with ChatGPT![/green]"
+            f"\nAccount ID: {account_id}"
+            f"\nCredentials saved to: {chatgpt_auth.CHATGPT_AUTH_FILE}"
+        )
+    except Exception as e:
+        console.print(f"\n[red]Authentication failed:[/red] {e}")
+        raise SystemExit(1)
+
+
+@chatgpt.command()
+def status():
+    """Check ChatGPT authentication status."""
+    auth_data = chatgpt_auth.load_auth()
+    if not auth_data or not auth_data.get("access_token"):
+        console.print(
+            "[yellow]Not authenticated.[/yellow] Run 'planoai chatgpt login'."
+        )
+        return
+
+    account_id = auth_data.get("account_id", "unknown")
+    expires_at = auth_data.get("expires_at")
+
+    if expires_at:
+        expiry_time = datetime.datetime.fromtimestamp(
+            expires_at, tz=datetime.timezone.utc
+        )
+        now = datetime.datetime.now(tz=datetime.timezone.utc)
+        if expiry_time > now:
+            remaining = expiry_time - now
+            console.print(
+                "[green]Authenticated[/green]"
+                f"\n Account ID: {account_id}"
+                f"\n Token expires: {expiry_time.strftime('%Y-%m-%d %H:%M:%S UTC')}"
+                f" ({int(remaining.total_seconds() // 60)}m remaining)"
+            )
+        else:
+            console.print(
+                "[yellow]Token expired[/yellow]"
+                f"\n Account ID: {account_id}"
+                f"\n Expired at: {expiry_time.strftime('%Y-%m-%d %H:%M:%S UTC')}"
+                "\n Will auto-refresh on next use, or run 'planoai chatgpt login'."
+            )
+    else:
+        console.print(
+            "[green]Authenticated[/green] (no expiry info)"
+            f"\n Account ID: {account_id}"
+        )
+
+
+@chatgpt.command()
+def logout():
+    """Remove stored ChatGPT credentials."""
+    chatgpt_auth.delete_auth()
+    console.print("[green]ChatGPT credentials removed.[/green]")
diff --git a/cli/planoai/config_generator.py b/cli/planoai/config_generator.py
index 5a3d4f63..cb07767e 100644
--- a/cli/planoai/config_generator.py
+++ b/cli/planoai/config_generator.py
@@ -1,5 +1,6 @@
 import json
 import os
+import uuid
 from planoai.utils import convert_legacy_listeners
 from jinja2 import Environment, FileSystemLoader
 import yaml
@@ -28,8 +29,16 @@ SUPPORTED_PROVIDERS_WITHOUT_BASE_URL = [
     "xai",
     "moonshotai",
     "zhipu",
+    "chatgpt",
+    "digitalocean",
+    "vercel",
+    "openrouter",
 ]
 
+CHATGPT_API_BASE = "https://chatgpt.com/backend-api/codex"
+CHATGPT_DEFAULT_ORIGINATOR = "codex_cli_rs"
+CHATGPT_DEFAULT_USER_AGENT = "codex_cli_rs/0.0.0 (Unknown 0; unknown) unknown"
+
 SUPPORTED_PROVIDERS = (
     SUPPORTED_PROVIDERS_WITHOUT_BASE_URL + SUPPORTED_PROVIDERS_WITH_BASE_URL
 )
@@ -49,6 +58,110 @@ def get_endpoint_and_port(endpoint, protocol):
     return endpoint, port
 
 
+def migrate_inline_routing_preferences(config_yaml):
+    """Lift v0.3.0-style inline ``routing_preferences`` under each
+    ``model_providers`` entry to the v0.4.0 top-level ``routing_preferences``
+    list with ``models: [...]``.
+
+    This function is a no-op for configs whose ``version`` is already
+    ``v0.4.0`` or newer — those are assumed to be on the canonical
+    top-level shape and are passed through untouched.
+
+    For older configs, the version is bumped to ``v0.4.0`` up front so
+    brightstaff's v0.4.0 gate for top-level ``routing_preferences``
+    accepts the rendered config, then inline preferences under each
+    provider are lifted into the top-level list. Preferences with the
+    same ``name`` across multiple providers are merged into a single
+    top-level entry whose ``models`` list contains every provider's
+    full ``provider/model`` string in declaration order. The first
+    ``description`` encountered wins; conflicts are warned, not errored,
+    so existing v0.3.0 configs keep compiling. Any top-level preference
+    already defined by the user is preserved as-is.
+    """
+    current_version = str(config_yaml.get("version", ""))
+    if _version_tuple(current_version) >= (0, 4, 0):
+        return
+
+    config_yaml["version"] = "v0.4.0"
+
+    model_providers = config_yaml.get("model_providers") or []
+    if not model_providers:
+        return
+
+    migrated = {}
+    for model_provider in model_providers:
+        inline_prefs = model_provider.get("routing_preferences")
+        if not inline_prefs:
+            continue
+
+        full_model_name = model_provider.get("model")
+        if not full_model_name:
+            continue
+
+        if "/" in full_model_name and full_model_name.split("/")[-1].strip() == "*":
+            raise Exception(
+                f"Model {full_model_name} has routing_preferences but uses wildcard (*). Models with routing preferences cannot be wildcards."
+            )
+
+        for pref in inline_prefs:
+            name = pref.get("name")
+            description = pref.get("description", "")
+            if not name:
+                continue
+            if name in migrated:
+                entry = migrated[name]
+                if description and description != entry["description"]:
+                    print(
+                        f"WARNING: routing preference '{name}' has conflicting descriptions across providers; keeping the first one."
+ ) + if full_model_name not in entry["models"]: + entry["models"].append(full_model_name) + else: + migrated[name] = { + "name": name, + "description": description, + "models": [full_model_name], + } + + if not migrated: + return + + for model_provider in model_providers: + if "routing_preferences" in model_provider: + del model_provider["routing_preferences"] + + existing_top_level = config_yaml.get("routing_preferences") or [] + existing_names = {entry.get("name") for entry in existing_top_level} + merged = list(existing_top_level) + for name, entry in migrated.items(): + if name in existing_names: + continue + merged.append(entry) + config_yaml["routing_preferences"] = merged + + print( + "WARNING: inline routing_preferences under model_providers is deprecated " + "and has been auto-migrated to top-level routing_preferences. Update your " + "config to v0.4.0 top-level form. See docs/routing-api.md" + ) + + +def _version_tuple(version_string): + stripped = version_string.strip().lstrip("vV") + if not stripped: + return (0, 0, 0) + parts = stripped.split("-", 1)[0].split(".") + out = [] + for part in parts[:3]: + try: + out.append(int(part)) + except ValueError: + out.append(0) + while len(out) < 3: + out.append(0) + return tuple(out) + + def validate_and_render_schema(): ENVOY_CONFIG_TEMPLATE_FILE = os.getenv( "ENVOY_CONFIG_TEMPLATE_FILE", "envoy.template.yaml" @@ -92,6 +205,8 @@ def validate_and_render_schema(): config_yaml["model_providers"] = config_yaml["llm_providers"] del config_yaml["llm_providers"] + migrate_inline_routing_preferences(config_yaml) + listeners, llm_gateway, prompt_gateway = convert_legacy_listeners( config_yaml.get("listeners"), config_yaml.get("model_providers") ) @@ -191,7 +306,16 @@ def validate_and_render_schema(): model_provider_name_set = set() llms_with_usage = [] model_name_keys = set() - model_usage_name_keys = set() + + top_level_preferences = config_yaml.get("routing_preferences") or [] + seen_pref_names = set() + for pref in top_level_preferences: + pref_name = pref.get("name") + if pref_name in seen_pref_names: + raise Exception( + f'Duplicate routing preference name "{pref_name}", please provide unique name for each routing preference' + ) + seen_pref_names.add(pref_name) print("listeners: ", listeners) @@ -250,10 +374,6 @@ def validate_and_render_schema(): raise Exception( f"Model {model_name} is configured as default but uses wildcard (*). Default models cannot be wildcards." ) - if model_provider.get("routing_preferences"): - raise Exception( - f"Model {model_name} has routing_preferences but uses wildcard (*). Models with routing preferences cannot be wildcards." 
- ) # Validate azure_openai and ollama provider requires base_url if (provider in SUPPORTED_PROVIDERS_WITH_BASE_URL) and model_provider.get( @@ -302,13 +422,6 @@ def validate_and_render_schema(): ) model_name_keys.add(model_id) - for routing_preference in model_provider.get("routing_preferences", []): - if routing_preference.get("name") in model_usage_name_keys: - raise Exception( - f'Duplicate routing preference name "{routing_preference.get("name")}", please provide unique name for each routing preference' - ) - model_usage_name_keys.add(routing_preference.get("name")) - # Warn if both passthrough_auth and access_key are configured if model_provider.get("passthrough_auth") and model_provider.get( "access_key" @@ -331,6 +444,25 @@ def validate_and_render_schema(): provider = model_provider["provider"] model_provider["provider_interface"] = provider del model_provider["provider"] + + # Auto-wire ChatGPT provider: inject base_url, passthrough_auth, and extra headers + if provider == "chatgpt": + if not model_provider.get("base_url"): + model_provider["base_url"] = CHATGPT_API_BASE + if not model_provider.get("access_key") and not model_provider.get( + "passthrough_auth" + ): + model_provider["passthrough_auth"] = True + headers = model_provider.get("headers", {}) + headers.setdefault( + "ChatGPT-Account-Id", + os.environ.get("CHATGPT_ACCOUNT_ID", ""), + ) + headers.setdefault("originator", CHATGPT_DEFAULT_ORIGINATOR) + headers.setdefault("user-agent", CHATGPT_DEFAULT_USER_AGENT) + headers.setdefault("session_id", str(uuid.uuid4())) + model_provider["headers"] = headers + updated_model_providers.append(model_provider) if model_provider.get("base_url", None): @@ -377,7 +509,7 @@ def validate_and_render_schema(): router_model_id = ( router_model.split("/", 1)[1] if "/" in router_model else router_model ) - if len(model_usage_name_keys) > 0 and router_model_id not in model_name_set: + if len(seen_pref_names) > 0 and router_model_id not in model_name_set: updated_model_providers.append( { "name": "plano-orchestrator", diff --git a/cli/planoai/consts.py b/cli/planoai/consts.py index af76d7cf..fc7b6f1a 100644 --- a/cli/planoai/consts.py +++ b/cli/planoai/consts.py @@ -5,7 +5,7 @@ PLANO_COLOR = "#969FF4" SERVICE_NAME_ARCHGW = "plano" PLANO_DOCKER_NAME = "plano" -PLANO_DOCKER_IMAGE = os.getenv("PLANO_DOCKER_IMAGE", "katanemo/plano:0.4.19") +PLANO_DOCKER_IMAGE = os.getenv("PLANO_DOCKER_IMAGE", "katanemo/plano:0.4.21") DEFAULT_OTEL_TRACING_GRPC_ENDPOINT = "http://localhost:4317" # Native mode constants diff --git a/cli/planoai/defaults.py b/cli/planoai/defaults.py new file mode 100644 index 00000000..1d9468ff --- /dev/null +++ b/cli/planoai/defaults.py @@ -0,0 +1,178 @@ +"""Default config synthesizer for zero-config ``planoai up``. + +When the user runs ``planoai up`` in a directory with no ``config.yaml`` / +``plano_config.yaml``, we synthesize a pass-through config that covers the +common LLM providers and auto-wires OTel export to ``localhost:4317`` so +``planoai obs`` works out of the box. + +Auth handling: +- If the provider's env var is set, bind ``access_key: $ENV_VAR``. +- Otherwise set ``passthrough_auth: true`` so the client's own Authorization + header is forwarded. No env var is required to start the proxy. +""" + +from __future__ import annotations + +import os +from dataclasses import dataclass + +DEFAULT_LLM_LISTENER_PORT = 12000 +# plano_config validation requires an http:// scheme on the OTLP endpoint. 
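+# A bare "localhost:4317" would be rejected by that check, so the default below
+# keeps the scheme even though the OTLP/gRPC exporter itself typically accepts
+# a plain host:port target.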
+DEFAULT_OTLP_ENDPOINT = "http://localhost:4317" + + +@dataclass(frozen=True) +class ProviderDefault: + name: str + env_var: str + base_url: str + model_pattern: str + # Only set for providers whose prefix in the model pattern is NOT one of the + # built-in SUPPORTED_PROVIDERS in cli/planoai/config_generator.py. For + # built-ins, the validator infers the interface from the model prefix and + # rejects configs that set this field explicitly. + provider_interface: str | None = None + + +# Keep ordering stable so synthesized configs diff cleanly across runs. +PROVIDER_DEFAULTS: list[ProviderDefault] = [ + ProviderDefault( + name="openai", + env_var="OPENAI_API_KEY", + base_url="https://api.openai.com/v1", + model_pattern="openai/*", + ), + ProviderDefault( + name="anthropic", + env_var="ANTHROPIC_API_KEY", + base_url="https://api.anthropic.com/v1", + model_pattern="anthropic/*", + ), + ProviderDefault( + name="gemini", + env_var="GEMINI_API_KEY", + base_url="https://generativelanguage.googleapis.com/v1beta", + model_pattern="gemini/*", + ), + ProviderDefault( + name="groq", + env_var="GROQ_API_KEY", + base_url="https://api.groq.com/openai/v1", + model_pattern="groq/*", + ), + ProviderDefault( + name="deepseek", + env_var="DEEPSEEK_API_KEY", + base_url="https://api.deepseek.com/v1", + model_pattern="deepseek/*", + ), + ProviderDefault( + name="mistral", + env_var="MISTRAL_API_KEY", + base_url="https://api.mistral.ai/v1", + model_pattern="mistral/*", + ), + # DigitalOcean Gradient is a first-class provider post-#889 — the + # `digitalocean/` model prefix routes to the built-in Envoy cluster, no + # base_url needed at runtime. + ProviderDefault( + name="digitalocean", + env_var="DO_API_KEY", + base_url="https://inference.do-ai.run/v1", + model_pattern="digitalocean/*", + ), + ProviderDefault( + name="vercel", + env_var="AI_GATEWAY_API_KEY", + base_url="https://ai-gateway.vercel.sh/v1", + model_pattern="vercel/*", + ), + # OpenRouter is a first-class provider — the `openrouter/` model prefix is + # accepted by the schema and brightstaff's ProviderId parser, so no + # provider_interface override is needed. + ProviderDefault( + name="openrouter", + env_var="OPENROUTER_API_KEY", + base_url="https://openrouter.ai/api/v1", + model_pattern="openrouter/*", + ), +] + + +@dataclass +class DetectionResult: + with_keys: list[ProviderDefault] + passthrough: list[ProviderDefault] + + @property + def summary(self) -> str: + parts = [] + if self.with_keys: + parts.append("env-keyed: " + ", ".join(p.name for p in self.with_keys)) + if self.passthrough: + parts.append("pass-through: " + ", ".join(p.name for p in self.passthrough)) + return " | ".join(parts) if parts else "no providers" + + +def detect_providers(env: dict[str, str] | None = None) -> DetectionResult: + env = env if env is not None else dict(os.environ) + with_keys: list[ProviderDefault] = [] + passthrough: list[ProviderDefault] = [] + for p in PROVIDER_DEFAULTS: + val = env.get(p.env_var) + if val: + with_keys.append(p) + else: + passthrough.append(p) + return DetectionResult(with_keys=with_keys, passthrough=passthrough) + + +def synthesize_default_config( + env: dict[str, str] | None = None, + *, + listener_port: int = DEFAULT_LLM_LISTENER_PORT, + otel_endpoint: str = DEFAULT_OTLP_ENDPOINT, +) -> dict: + """Build a pass-through config dict suitable for validation + envoy rendering. + + The returned dict can be dumped to YAML and handed to the existing `planoai up` + pipeline unchanged. 
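+    Illustration of the shape (not a verbatim dump): with only OPENAI_API_KEY
+    exported, the openai entry binds ``access_key: $OPENAI_API_KEY`` while every
+    other provider falls back to ``passthrough_auth: true``; the listener and
+    tracing blocks are emitted unconditionally.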
+ """ + detection = detect_providers(env) + + def _entry(p: ProviderDefault, base: dict) -> dict: + row: dict = {"name": p.name, "model": p.model_pattern, "base_url": p.base_url} + if p.provider_interface is not None: + row["provider_interface"] = p.provider_interface + row.update(base) + return row + + model_providers: list[dict] = [] + for p in detection.with_keys: + model_providers.append(_entry(p, {"access_key": f"${p.env_var}"})) + for p in detection.passthrough: + model_providers.append(_entry(p, {"passthrough_auth": True})) + + # No explicit `default: true` entry is synthesized: the plano config + # validator rejects wildcard models as defaults, and brightstaff already + # registers bare model names as lookup keys during wildcard expansion + # (crates/common/src/llm_providers.rs), so `{"model": "gpt-4o-mini"}` + # without a prefix resolves via the openai wildcard without needing + # `default: true`. See discussion on #890. + + return { + "version": "v0.4.0", + "listeners": [ + { + "name": "llm", + "type": "model", + "port": listener_port, + "address": "0.0.0.0", + } + ], + "model_providers": model_providers, + "tracing": { + "random_sampling": 100, + "opentracing_grpc_endpoint": otel_endpoint, + }, + } diff --git a/cli/planoai/main.py b/cli/planoai/main.py index c8659a3c..8e766cf8 100644 --- a/cli/planoai/main.py +++ b/cli/planoai/main.py @@ -6,7 +6,13 @@ import sys import contextlib import logging import rich_click as click +import yaml from planoai import targets +from planoai.defaults import ( + DEFAULT_LLM_LISTENER_PORT, + detect_providers, + synthesize_default_config, +) # Brand color - Plano purple PLANO_COLOR = "#969FF4" @@ -31,6 +37,8 @@ from planoai.core import ( ) from planoai.init_cmd import init as init_cmd from planoai.trace_cmd import trace as trace_cmd, start_trace_listener_background +from planoai.chatgpt_cmd import chatgpt as chatgpt_cmd +from planoai.obs_cmd import obs as obs_cmd from planoai.consts import ( DEFAULT_OTEL_TRACING_GRPC_ENDPOINT, DEFAULT_NATIVE_OTEL_TRACING_GRPC_ENDPOINT, @@ -118,6 +126,28 @@ def _temporary_cli_log_level(level: str | None): set_log_level(current_level) +def _inject_chatgpt_tokens_if_needed(config, env, console): + """If config uses chatgpt providers, resolve tokens from ~/.plano/chatgpt/auth.json.""" + providers = config.get("model_providers") or config.get("llm_providers") or [] + has_chatgpt = any(str(p.get("model", "")).startswith("chatgpt/") for p in providers) + if not has_chatgpt: + return + + try: + from planoai.chatgpt_auth import get_access_token + + access_token, account_id = get_access_token() + env["CHATGPT_ACCESS_TOKEN"] = access_token + if account_id: + env["CHATGPT_ACCOUNT_ID"] = account_id + except Exception as e: + console.print( + f"\n[red]ChatGPT auth error:[/red] {e}\n" + f"[dim]Run 'planoai chatgpt login' to authenticate.[/dim]\n" + ) + sys.exit(1) + + def _print_missing_keys(console, missing_keys: list[str]) -> None: console.print(f"\n[red]✗[/red] [red]Missing API keys![/red]\n") for key in missing_keys: @@ -317,7 +347,23 @@ def build(docker): help="Show detailed startup logs with timestamps.", is_flag=True, ) -def up(file, path, foreground, with_tracing, tracing_port, docker, verbose): +@click.option( + "--listener-port", + default=DEFAULT_LLM_LISTENER_PORT, + type=int, + show_default=True, + help="Override the LLM listener port when running without a config file. 
Ignored when a config file is present.", +) +def up( + file, + path, + foreground, + with_tracing, + tracing_port, + docker, + verbose, + listener_port, +): """Starts Plano.""" from rich.status import Status @@ -328,12 +374,23 @@ def up(file, path, foreground, with_tracing, tracing_port, docker, verbose): # Use the utility function to find config file plano_config_file = find_config_file(path, file) - # Check if the file exists + # Zero-config fallback: when no user config is present, synthesize a + # pass-through config that covers the common LLM providers and + # auto-wires OTel export to ``planoai obs``. See cli/planoai/defaults.py. if not os.path.exists(plano_config_file): + detection = detect_providers() + cfg_dict = synthesize_default_config(listener_port=listener_port) + + default_dir = os.path.expanduser("~/.plano") + os.makedirs(default_dir, exist_ok=True) + synthesized_path = os.path.join(default_dir, "default_config.yaml") + with open(synthesized_path, "w") as fh: + yaml.safe_dump(cfg_dict, fh, sort_keys=False) + plano_config_file = synthesized_path console.print( - f"[red]✗[/red] Config file not found: [dim]{plano_config_file}[/dim]" + f"[dim]No plano config found; using defaults ({detection.summary}). " + f"Listening on :{listener_port}, tracing -> http://localhost:4317.[/dim]" ) - sys.exit(1) if not docker: from planoai.native_runner import native_validate_config @@ -384,6 +441,14 @@ def up(file, path, foreground, with_tracing, tracing_port, docker, verbose): env = os.environ.copy() env.pop("PATH", None) + import yaml + + with open(plano_config_file, "r") as f: + plano_config = yaml.safe_load(f) + + # Inject ChatGPT tokens from ~/.plano/chatgpt/auth.json if any provider needs them + _inject_chatgpt_tokens_if_needed(plano_config, env, console) + # Check access keys access_keys = get_llm_provider_access_keys(plano_config_file=plano_config_file) access_keys = set(access_keys) @@ -681,6 +746,8 @@ main.add_command(cli_agent) main.add_command(generate_prompt_targets) main.add_command(init_cmd, name="init") main.add_command(trace_cmd, name="trace") +main.add_command(chatgpt_cmd, name="chatgpt") +main.add_command(obs_cmd, name="obs") if __name__ == "__main__": main() diff --git a/cli/planoai/native_runner.py b/cli/planoai/native_runner.py index bbbbfd3e..1b58b36d 100644 --- a/cli/planoai/native_runner.py +++ b/cli/planoai/native_runner.py @@ -253,6 +253,7 @@ def start_native( log.info("Plano is running (native mode)") for port in gateway_ports: log.info(f" http://localhost:{port}") + break # Check if processes are still alive @@ -367,8 +368,11 @@ def _kill_pid(pid): pass -def stop_native(): - """Stop natively-running Envoy and brightstaff processes. +def stop_native(skip_pids: set | None = None): + """Stop natively-running Envoy, brightstaff, and watchdog processes. + + Args: + skip_pids: Set of PIDs to skip (used by the watchdog to avoid self-termination). 
Returns: bool: True if at least one process was running and received a stop signal, @@ -385,7 +389,12 @@ def stop_native(): brightstaff_pid = pids.get("brightstaff_pid") had_running_process = False - for name, pid in [("envoy", envoy_pid), ("brightstaff", brightstaff_pid)]: + for name, pid in [ + ("envoy", envoy_pid), + ("brightstaff", brightstaff_pid), + ]: + if skip_pids and pid in skip_pids: + continue if pid is None: continue try: diff --git a/cli/planoai/obs/__init__.py b/cli/planoai/obs/__init__.py new file mode 100644 index 00000000..2f4e14af --- /dev/null +++ b/cli/planoai/obs/__init__.py @@ -0,0 +1,6 @@ +"""Plano observability console: in-memory live view of LLM traffic.""" + +from planoai.obs.collector import LLMCall, LLMCallStore, ObsCollector +from planoai.obs.pricing import PricingCatalog + +__all__ = ["LLMCall", "LLMCallStore", "ObsCollector", "PricingCatalog"] diff --git a/cli/planoai/obs/collector.py b/cli/planoai/obs/collector.py new file mode 100644 index 00000000..7f4cae36 --- /dev/null +++ b/cli/planoai/obs/collector.py @@ -0,0 +1,266 @@ +"""In-memory collector for LLM calls, fed by OTLP/gRPC spans from brightstaff.""" + +from __future__ import annotations + +import threading +from collections import deque +from concurrent import futures +from dataclasses import dataclass, field +from datetime import datetime, timezone +from typing import Any, Iterable + +import grpc +from opentelemetry.proto.collector.trace.v1 import ( + trace_service_pb2, + trace_service_pb2_grpc, +) + +DEFAULT_GRPC_PORT = 4317 +DEFAULT_CAPACITY = 1000 + + +@dataclass +class LLMCall: + """One LLM call as reconstructed from a brightstaff LLM span. + + Fields default to ``None`` when the underlying span attribute was absent. + """ + + request_id: str + timestamp: datetime + model: str + provider: str | None = None + request_model: str | None = None + session_id: str | None = None + route_name: str | None = None + is_streaming: bool | None = None + status_code: int | None = None + prompt_tokens: int | None = None + completion_tokens: int | None = None + total_tokens: int | None = None + cached_input_tokens: int | None = None + cache_creation_tokens: int | None = None + reasoning_tokens: int | None = None + ttft_ms: float | None = None + duration_ms: float | None = None + routing_strategy: str | None = None + routing_reason: str | None = None + cost_usd: float | None = None + + @property + def tpt_ms(self) -> float | None: + if self.duration_ms is None or self.completion_tokens in (None, 0): + return None + ttft = self.ttft_ms or 0.0 + generate_ms = max(0.0, self.duration_ms - ttft) + if generate_ms <= 0: + return None + return generate_ms / self.completion_tokens + + @property + def tokens_per_sec(self) -> float | None: + tpt = self.tpt_ms + if tpt is None or tpt <= 0: + return None + return 1000.0 / tpt + + +class LLMCallStore: + """Thread-safe ring buffer of recent LLM calls.""" + + def __init__(self, capacity: int = DEFAULT_CAPACITY) -> None: + self._capacity = capacity + self._calls: deque[LLMCall] = deque(maxlen=capacity) + self._lock = threading.Lock() + + @property + def capacity(self) -> int: + return self._capacity + + def add(self, call: LLMCall) -> None: + with self._lock: + self._calls.append(call) + + def clear(self) -> None: + with self._lock: + self._calls.clear() + + def snapshot(self) -> list[LLMCall]: + with self._lock: + return list(self._calls) + + def __len__(self) -> int: + with self._lock: + return len(self._calls) + + +# Span attribute keys used below are the canonical OTel / 
Plano keys emitted by +# brightstaff — see crates/brightstaff/src/tracing/constants.rs for the source +# of truth. + + +def _anyvalue_to_python(value: Any) -> Any: # AnyValue from OTLP + kind = value.WhichOneof("value") + if kind == "string_value": + return value.string_value + if kind == "bool_value": + return value.bool_value + if kind == "int_value": + return value.int_value + if kind == "double_value": + return value.double_value + return None + + +def _attrs_to_dict(attrs: Iterable[Any]) -> dict[str, Any]: + out: dict[str, Any] = {} + for kv in attrs: + py = _anyvalue_to_python(kv.value) + if py is not None: + out[kv.key] = py + return out + + +def _maybe_int(value: Any) -> int | None: + if value is None: + return None + try: + return int(value) + except (TypeError, ValueError): + return None + + +def _maybe_float(value: Any) -> float | None: + if value is None: + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def span_to_llm_call( + span: Any, service_name: str, pricing: Any | None = None +) -> LLMCall | None: + """Convert an OTLP span into an LLMCall, or return None if it isn't one. + + A span is considered an LLM call iff it carries the ``llm.model`` attribute. + """ + attrs = _attrs_to_dict(span.attributes) + model = attrs.get("llm.model") + if not model: + return None + + # Prefer explicit span attributes; fall back to likely aliases. + request_id = next( + ( + str(attrs[key]) + for key in ("request_id", "http.request_id") + if key in attrs and attrs[key] is not None + ), + span.span_id.hex() if span.span_id else "", + ) + start_ns = span.start_time_unix_nano or 0 + ts = ( + datetime.fromtimestamp(start_ns / 1_000_000_000, tz=timezone.utc).astimezone() + if start_ns + else datetime.now().astimezone() + ) + + call = LLMCall( + request_id=str(request_id), + timestamp=ts, + model=str(model), + provider=( + str(attrs["llm.provider"]) if "llm.provider" in attrs else service_name + ), + request_model=( + str(attrs["model.requested"]) if "model.requested" in attrs else None + ), + session_id=( + str(attrs["plano.session_id"]) if "plano.session_id" in attrs else None + ), + route_name=( + str(attrs["plano.route.name"]) if "plano.route.name" in attrs else None + ), + is_streaming=( + bool(attrs["llm.is_streaming"]) if "llm.is_streaming" in attrs else None + ), + status_code=_maybe_int(attrs.get("http.status_code")), + prompt_tokens=_maybe_int(attrs.get("llm.usage.prompt_tokens")), + completion_tokens=_maybe_int(attrs.get("llm.usage.completion_tokens")), + total_tokens=_maybe_int(attrs.get("llm.usage.total_tokens")), + cached_input_tokens=_maybe_int(attrs.get("llm.usage.cached_input_tokens")), + cache_creation_tokens=_maybe_int(attrs.get("llm.usage.cache_creation_tokens")), + reasoning_tokens=_maybe_int(attrs.get("llm.usage.reasoning_tokens")), + ttft_ms=_maybe_float(attrs.get("llm.time_to_first_token")), + duration_ms=_maybe_float(attrs.get("llm.duration_ms")), + routing_strategy=( + str(attrs["routing.strategy"]) if "routing.strategy" in attrs else None + ), + routing_reason=( + str(attrs["routing.selection_reason"]) + if "routing.selection_reason" in attrs + else None + ), + ) + + if pricing is not None: + call.cost_usd = pricing.cost_for_call(call) + + return call + + +class _ObsServicer(trace_service_pb2_grpc.TraceServiceServicer): + def __init__(self, store: LLMCallStore, pricing: Any | None) -> None: + self._store = store + self._pricing = pricing + + def Export(self, request, context): # noqa: N802 — gRPC generated name + for 
resource_spans in request.resource_spans: + service_name = "unknown" + for attr in resource_spans.resource.attributes: + if attr.key == "service.name": + val = _anyvalue_to_python(attr.value) + if val is not None: + service_name = str(val) + break + for scope_spans in resource_spans.scope_spans: + for span in scope_spans.spans: + call = span_to_llm_call(span, service_name, self._pricing) + if call is not None: + self._store.add(call) + return trace_service_pb2.ExportTraceServiceResponse() + + +@dataclass +class ObsCollector: + """Owns the OTLP/gRPC server and the in-memory LLMCall ring buffer.""" + + store: LLMCallStore = field(default_factory=LLMCallStore) + pricing: Any | None = None + host: str = "0.0.0.0" + port: int = DEFAULT_GRPC_PORT + _server: grpc.Server | None = field(default=None, init=False, repr=False) + + def start(self) -> None: + if self._server is not None: + return + server = grpc.server(futures.ThreadPoolExecutor(max_workers=4)) + trace_service_pb2_grpc.add_TraceServiceServicer_to_server( + _ObsServicer(self.store, self.pricing), server + ) + address = f"{self.host}:{self.port}" + bound = server.add_insecure_port(address) + if bound == 0: + raise OSError( + f"Failed to bind OTLP listener on {address}: port already in use. " + "Stop tracing via `planoai trace down` or pick another port with --port." + ) + server.start() + self._server = server + + def stop(self, grace: float = 2.0) -> None: + if self._server is not None: + self._server.stop(grace) + self._server = None diff --git a/cli/planoai/obs/pricing.py b/cli/planoai/obs/pricing.py new file mode 100644 index 00000000..6f2ce5b4 --- /dev/null +++ b/cli/planoai/obs/pricing.py @@ -0,0 +1,321 @@ +"""DigitalOcean Gradient pricing catalog for the obs console. + +Ported loosely from ``crates/brightstaff/src/router/model_metrics.rs::fetch_do_pricing``. +Single-source: one fetch at startup, cached for the life of the process. +""" + +from __future__ import annotations + +import logging +import re +import threading +from dataclasses import dataclass +from typing import Any + +import requests + +DEFAULT_PRICING_URL = "https://api.digitalocean.com/v2/gen-ai/models/catalog" +FETCH_TIMEOUT_SECS = 5.0 + + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True) +class ModelPrice: + """Input/output $/token rates. Token counts are multiplied by these.""" + + input_per_token_usd: float + output_per_token_usd: float + cached_input_per_token_usd: float | None = None + + +class PricingCatalog: + """In-memory pricing lookup keyed by model id. + + DO's catalog uses ids like ``openai-gpt-5.4``; Plano's resolved model names + may arrive as ``do/openai-gpt-5.4`` or bare ``openai-gpt-5.4``. We strip the + leading provider prefix when looking up. + """ + + def __init__(self, prices: dict[str, ModelPrice] | None = None) -> None: + self._prices: dict[str, ModelPrice] = prices or {} + self._lock = threading.Lock() + + def __len__(self) -> int: + with self._lock: + return len(self._prices) + + def sample_models(self, n: int = 5) -> list[str]: + with self._lock: + return list(self._prices.keys())[:n] + + @classmethod + def fetch(cls, url: str = DEFAULT_PRICING_URL) -> "PricingCatalog": + """Fetch pricing from DO's catalog endpoint. On failure, returns an + empty catalog (cost column will be blank). + + The catalog endpoint is public — no auth required, no signup — so + ``planoai obs`` gets cost data on first run out of the box. 
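+        Typical wiring (sketch): build the catalog once at obs startup and hand
+        it to the collector, e.g. ``ObsCollector(pricing=PricingCatalog.fetch())``;
+        ``span_to_llm_call()`` then prices each call via ``cost_for_call()``.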
+ """ + try: + resp = requests.get(url, timeout=FETCH_TIMEOUT_SECS) + resp.raise_for_status() + data = resp.json() + except Exception as exc: # noqa: BLE001 — best-effort; never fatal + logger.warning( + "DO pricing fetch failed: %s; cost column will be blank.", + exc, + ) + return cls() + + prices = _parse_do_pricing(data) + if not prices: + # Dump the first entry's raw shape so we can see which fields DO + # actually returned — helps when the catalog adds new fields or + # the response doesn't match our parser. + import json as _json + + sample_items = _coerce_items(data) + sample = sample_items[0] if sample_items else data + logger.warning( + "DO pricing response had no parseable entries; cost column " + "will be blank. Sample entry: %s", + _json.dumps(sample, default=str)[:400], + ) + return cls(prices) + + def price_for(self, model_name: str | None) -> ModelPrice | None: + if not model_name: + return None + with self._lock: + # Try the full name first, then stripped prefix, then lowercased variants. + for candidate in _model_key_candidates(model_name): + hit = self._prices.get(candidate) + if hit is not None: + return hit + return None + + def cost_for_call(self, call: Any) -> float | None: + """Compute USD cost for an LLMCall. Returns None when pricing is unknown.""" + price = self.price_for(getattr(call, "model", None)) or self.price_for( + getattr(call, "request_model", None) + ) + if price is None: + return None + prompt = int(getattr(call, "prompt_tokens", 0) or 0) + completion = int(getattr(call, "completion_tokens", 0) or 0) + cached = int(getattr(call, "cached_input_tokens", 0) or 0) + + # Cached input tokens are priced separately at the cached rate when known; + # otherwise they're already counted in prompt tokens at the regular rate. + fresh_prompt = prompt + if price.cached_input_per_token_usd is not None and cached: + fresh_prompt = max(0, prompt - cached) + cost_cached = cached * price.cached_input_per_token_usd + else: + cost_cached = 0.0 + + cost = ( + fresh_prompt * price.input_per_token_usd + + completion * price.output_per_token_usd + + cost_cached + ) + return round(cost, 6) + + +_DATE_SUFFIX_RE = re.compile(r"-\d{8}$") +_PROVIDER_PREFIXES = ("anthropic", "openai", "google", "meta", "cohere", "mistral") +_ANTHROPIC_FAMILIES = {"opus", "sonnet", "haiku"} + + +def _model_key_candidates(model_name: str) -> list[str]: + """Lookup-side variants of a Plano-emitted model name. + + Plano resolves names like ``claude-haiku-4-5-20251001``; the catalog stores + them as ``anthropic-claude-haiku-4.5``. We strip the date suffix and the + ``provider/`` prefix here; the catalog itself registers the dash/dot and + family-order aliases at parse time (see :func:`_expand_aliases`). + """ + base = model_name.strip() + out = [base] + if "/" in base: + out.append(base.split("/", 1)[1]) + for k in list(out): + stripped = _DATE_SUFFIX_RE.sub("", k) + if stripped != k: + out.append(stripped) + out.extend([v.lower() for v in list(out)]) + seen: set[str] = set() + uniq = [] + for key in out: + if key not in seen: + seen.add(key) + uniq.append(key) + return uniq + + +def _expand_aliases(model_id: str) -> set[str]: + """Catalog-side variants of a DO model id. + + DO publishes Anthropic models under ids like ``anthropic-claude-opus-4.7`` + or ``anthropic-claude-4.6-sonnet`` while Plano emits ``claude-opus-4-7`` / + ``claude-sonnet-4-6``. 
Generate a set covering provider-prefix stripping, + dash↔dot in version segments, and family↔version word order so a single + catalog entry matches every name shape we'll see at lookup. + """ + aliases: set[str] = set() + + def add(name: str) -> None: + if not name: + return + aliases.add(name) + aliases.add(name.lower()) + + add(model_id) + + base = model_id + head, _, rest = base.partition("-") + if head.lower() in _PROVIDER_PREFIXES and rest: + add(rest) + base = rest + + for key in list(aliases): + if "." in key: + add(key.replace(".", "-")) + + parts = base.split("-") + if len(parts) >= 3 and parts[0].lower() == "claude": + rest_parts = parts[1:] + for i, p in enumerate(rest_parts): + if p.lower() in _ANTHROPIC_FAMILIES: + others = rest_parts[:i] + rest_parts[i + 1 :] + if not others: + break + family_last = "claude-" + "-".join(others) + "-" + p + family_first = "claude-" + p + "-" + "-".join(others) + add(family_last) + add(family_first) + add(family_last.replace(".", "-")) + add(family_first.replace(".", "-")) + break + + return aliases + + +def _parse_do_pricing(data: Any) -> dict[str, ModelPrice]: + """Parse DO catalog response into a ModelPrice map keyed by model id. + + DO's shape (as of 2026-04): + { + "data": [ + {"model_id": "openai-gpt-5.4", + "pricing": {"input_price_per_million": 5.0, + "output_price_per_million": 15.0}}, + ... + ] + } + + Older/alternate shapes are also accepted (flat top-level fields, or the + ``id``/``model``/``name`` key). + """ + prices: dict[str, ModelPrice] = {} + items = _coerce_items(data) + for item in items: + model_id = ( + item.get("model_id") + or item.get("id") + or item.get("model") + or item.get("name") + ) + if not model_id: + continue + + # DO nests rates under `pricing`; try that first, then fall back to + # top-level fields for alternate response shapes. + sources = [item] + if isinstance(item.get("pricing"), dict): + sources.insert(0, item["pricing"]) + + input_rate = _extract_rate_from_sources( + sources, + ["input_per_token", "input_token_price", "price_input"], + ["input_price_per_million", "input_per_million", "input_per_mtok"], + ) + output_rate = _extract_rate_from_sources( + sources, + ["output_per_token", "output_token_price", "price_output"], + ["output_price_per_million", "output_per_million", "output_per_mtok"], + ) + cached_rate = _extract_rate_from_sources( + sources, + [ + "cached_input_per_token", + "cached_input_token_price", + "prompt_cache_read_per_token", + ], + [ + "cached_input_price_per_million", + "cached_input_per_million", + "cached_input_per_mtok", + ], + ) + + if input_rate is None or output_rate is None: + continue + # Treat 0-rate entries as "unknown" so cost falls back to `—` rather + # than showing a misleading $0.0000. DO's catalog sometimes omits + # rates for promo/open-weight models. 
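+            # (e.g. a hypothetical entry {"pricing": {"input_price_per_million": 0,
+            # "output_price_per_million": 0}} would otherwise price every call on
+            # that model at $0)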
+ if input_rate == 0 and output_rate == 0: + continue + price = ModelPrice( + input_per_token_usd=input_rate, + output_per_token_usd=output_rate, + cached_input_per_token_usd=cached_rate, + ) + for alias in _expand_aliases(str(model_id)): + prices.setdefault(alias, price) + return prices + + +def _coerce_items(data: Any) -> list[dict]: + if isinstance(data, list): + return [x for x in data if isinstance(x, dict)] + if isinstance(data, dict): + for key in ("data", "models", "pricing", "items"): + val = data.get(key) + if isinstance(val, list): + return [x for x in val if isinstance(x, dict)] + return [] + + +def _extract_rate_from_sources( + sources: list[dict], + per_token_keys: list[str], + per_million_keys: list[str], +) -> float | None: + """Return a per-token rate in USD, or None if unknown. + + Some DO catalog responses put per-token values under a field whose name + says ``_per_million`` (e.g. ``input_price_per_million: 5E-8`` — that's + $5e-8 per token, not per million). Heuristic: values < 1 are already + per-token (real per-million rates are ~0.1 to ~100); values >= 1 are + treated as per-million and divided by 1,000,000. + """ + for src in sources: + for key in per_token_keys: + if key in src and src[key] is not None: + try: + return float(src[key]) + except (TypeError, ValueError): + continue + for key in per_million_keys: + if key in src and src[key] is not None: + try: + v = float(src[key]) + except (TypeError, ValueError): + continue + if v >= 1: + return v / 1_000_000 + return v + return None diff --git a/cli/planoai/obs/render.py b/cli/planoai/obs/render.py new file mode 100644 index 00000000..e3583747 --- /dev/null +++ b/cli/planoai/obs/render.py @@ -0,0 +1,634 @@ +"""Rich TUI renderer for the observability console.""" + +from __future__ import annotations + +from collections import Counter +from dataclasses import dataclass +from datetime import datetime +from http import HTTPStatus + +from rich.align import Align +from rich.box import SIMPLE, SIMPLE_HEAVY +from rich.console import Group +from rich.panel import Panel +from rich.table import Table +from rich.text import Text + +MAX_WIDTH = 160 + +from planoai.obs.collector import LLMCall + + +@dataclass +class AggregateStats: + count: int + total_cost_usd: float + total_input_tokens: int + total_output_tokens: int + distinct_sessions: int + current_session: str | None + p50_latency_ms: float | None = None + p95_latency_ms: float | None = None + p99_latency_ms: float | None = None + p50_ttft_ms: float | None = None + p95_ttft_ms: float | None = None + p99_ttft_ms: float | None = None + error_count: int = 0 + errors_4xx: int = 0 + errors_5xx: int = 0 + has_cost: bool = False + + +@dataclass +class ModelRollup: + model: str + requests: int + input_tokens: int + output_tokens: int + cache_write: int + cache_read: int + cost_usd: float + has_cost: bool = False + avg_tokens_per_sec: float | None = None + + +def _percentile(values: list[float], pct: float) -> float | None: + if not values: + return None + s = sorted(values) + k = max(0, min(len(s) - 1, int(round((pct / 100.0) * (len(s) - 1))))) + return s[k] + + +def aggregates(calls: list[LLMCall]) -> AggregateStats: + total_cost = sum((c.cost_usd or 0.0) for c in calls) + total_input = sum(int(c.prompt_tokens or 0) for c in calls) + total_output = sum(int(c.completion_tokens or 0) for c in calls) + session_ids = {c.session_id for c in calls if c.session_id} + current = next( + (c.session_id for c in reversed(calls) if c.session_id is not None), None + ) + durations = 
[c.duration_ms for c in calls if c.duration_ms is not None] + ttfts = [c.ttft_ms for c in calls if c.ttft_ms is not None] + errors_4xx = sum( + 1 for c in calls if c.status_code is not None and 400 <= c.status_code < 500 + ) + errors_5xx = sum( + 1 for c in calls if c.status_code is not None and c.status_code >= 500 + ) + has_cost = any(c.cost_usd is not None for c in calls) + return AggregateStats( + count=len(calls), + total_cost_usd=total_cost, + total_input_tokens=total_input, + total_output_tokens=total_output, + distinct_sessions=len(session_ids), + current_session=current, + p50_latency_ms=_percentile(durations, 50), + p95_latency_ms=_percentile(durations, 95), + p99_latency_ms=_percentile(durations, 99), + p50_ttft_ms=_percentile(ttfts, 50), + p95_ttft_ms=_percentile(ttfts, 95), + p99_ttft_ms=_percentile(ttfts, 99), + error_count=errors_4xx + errors_5xx, + errors_4xx=errors_4xx, + errors_5xx=errors_5xx, + has_cost=has_cost, + ) + + +def model_rollups(calls: list[LLMCall]) -> list[ModelRollup]: + buckets: dict[str, dict[str, float | int | bool]] = {} + tps_samples: dict[str, list[float]] = {} + for c in calls: + key = c.model + b = buckets.setdefault( + key, + { + "requests": 0, + "input": 0, + "output": 0, + "cache_write": 0, + "cache_read": 0, + "cost": 0.0, + "has_cost": False, + }, + ) + b["requests"] = int(b["requests"]) + 1 + b["input"] = int(b["input"]) + int(c.prompt_tokens or 0) + b["output"] = int(b["output"]) + int(c.completion_tokens or 0) + b["cache_write"] = int(b["cache_write"]) + int(c.cache_creation_tokens or 0) + b["cache_read"] = int(b["cache_read"]) + int(c.cached_input_tokens or 0) + b["cost"] = float(b["cost"]) + (c.cost_usd or 0.0) + if c.cost_usd is not None: + b["has_cost"] = True + tps = c.tokens_per_sec + if tps is not None: + tps_samples.setdefault(key, []).append(tps) + + rollups: list[ModelRollup] = [] + for model, b in buckets.items(): + samples = tps_samples.get(model) + avg_tps = (sum(samples) / len(samples)) if samples else None + rollups.append( + ModelRollup( + model=model, + requests=int(b["requests"]), + input_tokens=int(b["input"]), + output_tokens=int(b["output"]), + cache_write=int(b["cache_write"]), + cache_read=int(b["cache_read"]), + cost_usd=float(b["cost"]), + has_cost=bool(b["has_cost"]), + avg_tokens_per_sec=avg_tps, + ) + ) + rollups.sort(key=lambda r: (r.cost_usd, r.requests), reverse=True) + return rollups + + +@dataclass +class RouteHit: + route: str + hits: int + pct: float + p95_latency_ms: float | None + error_count: int + + +def route_hits(calls: list[LLMCall]) -> list[RouteHit]: + counts: Counter[str] = Counter() + per_route_latency: dict[str, list[float]] = {} + per_route_errors: dict[str, int] = {} + for c in calls: + if not c.route_name: + continue + counts[c.route_name] += 1 + if c.duration_ms is not None: + per_route_latency.setdefault(c.route_name, []).append(c.duration_ms) + if c.status_code is not None and c.status_code >= 400: + per_route_errors[c.route_name] = per_route_errors.get(c.route_name, 0) + 1 + total = sum(counts.values()) + if total == 0: + return [] + return [ + RouteHit( + route=r, + hits=n, + pct=(n / total) * 100.0, + p95_latency_ms=_percentile(per_route_latency.get(r, []), 95), + error_count=per_route_errors.get(r, 0), + ) + for r, n in counts.most_common() + ] + + +def _fmt_cost(v: float | None, *, zero: str = "—") -> str: + if v is None: + return "—" + if v == 0: + return zero + if abs(v) < 0.0001: + return f"${v:.8f}".rstrip("0").rstrip(".") + if abs(v) < 0.01: + return 
f"${v:.6f}".rstrip("0").rstrip(".") + if abs(v) < 1: + return f"${v:.4f}" + return f"${v:,.2f}" + + +def _fmt_ms(v: float | None) -> str: + if v is None: + return "—" + if v >= 1000: + return f"{v / 1000:.1f}s" + return f"{v:.0f}ms" + + +def _fmt_int(v: int | None) -> str: + if v is None or v == 0: + return "—" + return f"{v:,}" + + +def _fmt_tokens(v: int | None) -> str: + if v is None: + return "—" + return f"{v:,}" + + +def _fmt_tps(v: float | None) -> str: + if v is None or v <= 0: + return "—" + if v >= 100: + return f"{v:.0f}/s" + return f"{v:.1f}/s" + + +def _latency_style(v: float | None) -> str: + if v is None: + return "dim" + if v < 500: + return "green" + if v < 2000: + return "yellow" + return "red" + + +def _ttft_style(v: float | None) -> str: + if v is None: + return "dim" + if v < 300: + return "green" + if v < 1000: + return "yellow" + return "red" + + +def _truncate_model(name: str, limit: int = 32) -> str: + if len(name) <= limit: + return name + return name[: limit - 1] + "…" + + +def _status_text(code: int | None) -> Text: + if code is None: + return Text("—", style="dim") + if 200 <= code < 300: + return Text("● ok", style="green") + if 300 <= code < 400: + return Text(f"● {code}", style="yellow") + if 400 <= code < 500: + return Text(f"● {code}", style="yellow bold") + return Text(f"● {code}", style="red bold") + + +def _summary_panel(last: LLMCall | None, stats: AggregateStats) -> Panel: + # Content-sized columns with a fixed gutter keep the two blocks close + # together instead of stretching across the full terminal on wide screens. + grid = Table.grid(padding=(0, 4)) + grid.add_column(no_wrap=True) + grid.add_column(no_wrap=True) + + # Left: latest request snapshot. + left = Table.grid(padding=(0, 1)) + left.add_column(style="dim", no_wrap=True) + left.add_column(no_wrap=True) + if last is None: + left.add_row("latest", Text("waiting for spans…", style="dim italic")) + else: + model_text = Text(_truncate_model(last.model, 48), style="bold cyan") + if last.is_streaming: + model_text.append(" ⟳ stream", style="dim") + left.add_row("model", model_text) + if last.request_model and last.request_model != last.model: + left.add_row( + "requested", Text(_truncate_model(last.request_model, 48), style="cyan") + ) + if last.route_name: + left.add_row("route", Text(last.route_name, style="yellow")) + left.add_row("status", _status_text(last.status_code)) + tokens = Text() + tokens.append(_fmt_tokens(last.prompt_tokens)) + tokens.append(" in", style="dim") + tokens.append(" · ", style="dim") + tokens.append(_fmt_tokens(last.completion_tokens), style="green") + tokens.append(" out", style="dim") + if last.cached_input_tokens: + tokens.append(" · ", style="dim") + tokens.append(_fmt_tokens(last.cached_input_tokens), style="yellow") + tokens.append(" cached", style="dim") + left.add_row("tokens", tokens) + timing = Text() + timing.append("TTFT ", style="dim") + timing.append(_fmt_ms(last.ttft_ms), style=_ttft_style(last.ttft_ms)) + timing.append(" · ", style="dim") + timing.append("lat ", style="dim") + timing.append(_fmt_ms(last.duration_ms), style=_latency_style(last.duration_ms)) + tps = last.tokens_per_sec + if tps: + timing.append(" · ", style="dim") + timing.append(_fmt_tps(tps), style="green") + left.add_row("timing", timing) + left.add_row("cost", Text(_fmt_cost(last.cost_usd), style="green bold")) + + # Right: lifetime totals. 
+ right = Table.grid(padding=(0, 1)) + right.add_column(style="dim", no_wrap=True) + right.add_column(no_wrap=True) + right.add_row( + "requests", + Text(f"{stats.count:,}", style="bold"), + ) + if stats.error_count: + err_text = Text() + err_text.append(f"{stats.error_count:,}", style="red bold") + parts: list[str] = [] + if stats.errors_4xx: + parts.append(f"{stats.errors_4xx} 4xx") + if stats.errors_5xx: + parts.append(f"{stats.errors_5xx} 5xx") + if parts: + err_text.append(f" ({' · '.join(parts)})", style="dim") + right.add_row("errors", err_text) + cost_str = _fmt_cost(stats.total_cost_usd) if stats.has_cost else "—" + right.add_row("total cost", Text(cost_str, style="green bold")) + tokens_total = Text() + tokens_total.append(_fmt_tokens(stats.total_input_tokens)) + tokens_total.append(" in", style="dim") + tokens_total.append(" · ", style="dim") + tokens_total.append(_fmt_tokens(stats.total_output_tokens), style="green") + tokens_total.append(" out", style="dim") + right.add_row("tokens", tokens_total) + lat_text = Text() + lat_text.append("p50 ", style="dim") + lat_text.append( + _fmt_ms(stats.p50_latency_ms), style=_latency_style(stats.p50_latency_ms) + ) + lat_text.append(" · ", style="dim") + lat_text.append("p95 ", style="dim") + lat_text.append( + _fmt_ms(stats.p95_latency_ms), style=_latency_style(stats.p95_latency_ms) + ) + lat_text.append(" · ", style="dim") + lat_text.append("p99 ", style="dim") + lat_text.append( + _fmt_ms(stats.p99_latency_ms), style=_latency_style(stats.p99_latency_ms) + ) + right.add_row("latency", lat_text) + ttft_text = Text() + ttft_text.append("p50 ", style="dim") + ttft_text.append(_fmt_ms(stats.p50_ttft_ms), style=_ttft_style(stats.p50_ttft_ms)) + ttft_text.append(" · ", style="dim") + ttft_text.append("p95 ", style="dim") + ttft_text.append(_fmt_ms(stats.p95_ttft_ms), style=_ttft_style(stats.p95_ttft_ms)) + ttft_text.append(" · ", style="dim") + ttft_text.append("p99 ", style="dim") + ttft_text.append(_fmt_ms(stats.p99_ttft_ms), style=_ttft_style(stats.p99_ttft_ms)) + right.add_row("TTFT", ttft_text) + sess = Text() + sess.append(f"{stats.distinct_sessions}") + if stats.current_session: + sess.append(" · current ", style="dim") + sess.append(stats.current_session, style="magenta") + right.add_row("sessions", sess) + + grid.add_row(left, right) + return Panel( + grid, + title="[bold]live LLM traffic[/]", + border_style="cyan", + box=SIMPLE_HEAVY, + padding=(0, 1), + ) + + +def _model_rollup_table(rollups: list[ModelRollup]) -> Table: + table = Table( + title="by model", + title_justify="left", + title_style="bold dim", + caption="cost via DigitalOcean Gradient catalog", + caption_justify="left", + caption_style="dim italic", + box=SIMPLE, + header_style="bold", + pad_edge=False, + padding=(0, 1), + ) + table.add_column("model", style="cyan", no_wrap=True) + table.add_column("req", justify="right") + table.add_column("input", justify="right") + table.add_column("output", justify="right", style="green") + table.add_column("cache wr", justify="right", style="yellow") + table.add_column("cache rd", justify="right", style="yellow") + table.add_column("tok/s", justify="right") + table.add_column("cost", justify="right", style="green") + if not rollups: + table.add_row( + Text("no requests yet", style="dim italic"), + *(["—"] * 7), + ) + return table + for r in rollups: + cost_cell = _fmt_cost(r.cost_usd) if r.has_cost else "—" + table.add_row( + _truncate_model(r.model), + f"{r.requests:,}", + _fmt_tokens(r.input_tokens), + 
_fmt_tokens(r.output_tokens), + _fmt_int(r.cache_write), + _fmt_int(r.cache_read), + _fmt_tps(r.avg_tokens_per_sec), + cost_cell, + ) + return table + + +def _route_hit_table(hits: list[RouteHit]) -> Table: + table = Table( + title="route share", + title_justify="left", + title_style="bold dim", + box=SIMPLE, + header_style="bold", + pad_edge=False, + padding=(0, 1), + ) + table.add_column("route", style="cyan") + table.add_column("hits", justify="right") + table.add_column("%", justify="right") + table.add_column("p95", justify="right") + table.add_column("err", justify="right") + for h in hits: + err_cell = ( + Text(f"{h.error_count:,}", style="red bold") if h.error_count else "—" + ) + table.add_row( + h.route, + f"{h.hits:,}", + f"{h.pct:5.1f}%", + Text(_fmt_ms(h.p95_latency_ms), style=_latency_style(h.p95_latency_ms)), + err_cell, + ) + return table + + +def _recent_table(calls: list[LLMCall], limit: int = 15) -> Table: + show_route = any(c.route_name for c in calls) + show_cache = any((c.cached_input_tokens or 0) > 0 for c in calls) + show_rsn = any((c.reasoning_tokens or 0) > 0 for c in calls) + + caption_parts = ["in·new = fresh prompt tokens"] + if show_cache: + caption_parts.append("in·cache = cached read") + if show_rsn: + caption_parts.append("rsn = reasoning") + caption_parts.append("lat = total latency") + + table = Table( + title=f"recent · last {min(limit, len(calls)) if calls else 0}", + title_justify="left", + title_style="bold dim", + caption=" · ".join(caption_parts), + caption_justify="left", + caption_style="dim italic", + box=SIMPLE, + header_style="bold", + pad_edge=False, + padding=(0, 1), + ) + table.add_column("time", no_wrap=True) + table.add_column("model", style="cyan", no_wrap=True) + if show_route: + table.add_column("route", style="yellow", no_wrap=True) + table.add_column("in·new", justify="right") + if show_cache: + table.add_column("in·cache", justify="right", style="yellow") + table.add_column("out", justify="right", style="green") + if show_rsn: + table.add_column("rsn", justify="right") + table.add_column("tok/s", justify="right") + table.add_column("TTFT", justify="right") + table.add_column("lat", justify="right") + table.add_column("cost", justify="right", style="green") + table.add_column("status") + + if not calls: + cols = len(table.columns) + table.add_row( + Text("waiting for spans…", style="dim italic"), + *(["—"] * (cols - 1)), + ) + return table + + recent = list(reversed(calls))[:limit] + for idx, c in enumerate(recent): + is_newest = idx == 0 + time_style = "bold white" if is_newest else None + model_style = "bold cyan" if is_newest else "cyan" + row: list[object] = [ + ( + Text(c.timestamp.strftime("%H:%M:%S"), style=time_style) + if time_style + else c.timestamp.strftime("%H:%M:%S") + ), + Text(_truncate_model(c.model), style=model_style), + ] + if show_route: + row.append(c.route_name or "—") + row.append(_fmt_tokens(c.prompt_tokens)) + if show_cache: + row.append(_fmt_int(c.cached_input_tokens)) + row.append(_fmt_tokens(c.completion_tokens)) + if show_rsn: + row.append(_fmt_int(c.reasoning_tokens)) + row.extend( + [ + _fmt_tps(c.tokens_per_sec), + Text(_fmt_ms(c.ttft_ms), style=_ttft_style(c.ttft_ms)), + Text(_fmt_ms(c.duration_ms), style=_latency_style(c.duration_ms)), + _fmt_cost(c.cost_usd), + _status_text(c.status_code), + ] + ) + table.add_row(*row) + return table + + +def _last_error(calls: list[LLMCall]) -> LLMCall | None: + for c in reversed(calls): + if c.status_code is not None and c.status_code >= 400: + return c + return 
None + + +def _http_reason(code: int) -> str: + try: + return HTTPStatus(code).phrase + except ValueError: + return "" + + +def _fmt_ago(ts: datetime) -> str: + # `ts` is produced in collector.py via datetime.now(tz=...), but fall back + # gracefully if a naive timestamp ever sneaks in. + now = datetime.now(tz=ts.tzinfo) if ts.tzinfo else datetime.now() + delta = (now - ts).total_seconds() + if delta < 0: + delta = 0 + if delta < 60: + return f"{int(delta)}s ago" + if delta < 3600: + return f"{int(delta // 60)}m ago" + return f"{int(delta // 3600)}h ago" + + +def _error_banner(call: LLMCall) -> Panel: + code = call.status_code or 0 + border = "red" if code >= 500 else "yellow" + header = Text() + header.append(f"● {code}", style=f"{border} bold") + reason = _http_reason(code) + if reason: + header.append(f" {reason}", style=border) + header.append(" · ", style="dim") + header.append(_truncate_model(call.model, 48), style="cyan") + if call.route_name: + header.append(" · ", style="dim") + header.append(call.route_name, style="yellow") + header.append(" · ", style="dim") + header.append(_fmt_ago(call.timestamp), style="dim") + if call.request_id: + header.append(" · req ", style="dim") + header.append(call.request_id, style="magenta") + return Panel( + header, + title="[bold]last error[/]", + title_align="left", + border_style=border, + box=SIMPLE, + padding=(0, 1), + ) + + +def _footer(stats: AggregateStats) -> Text: + waiting = stats.count == 0 + text = Text() + text.append("Ctrl-C ", style="bold") + text.append("exit", style="dim") + text.append(" · OTLP :4317", style="dim") + text.append(" · pricing: DigitalOcean ", style="dim") + if waiting: + text.append("waiting for spans", style="yellow") + text.append( + " — set tracing.opentracing_grpc_endpoint=localhost:4317", style="dim" + ) + else: + text.append(f"receiving · {stats.count:,} call(s) buffered", style="green") + return text + + +def render(calls: list[LLMCall]) -> Align: + last = calls[-1] if calls else None + stats = aggregates(calls) + rollups = model_rollups(calls) + hits = route_hits(calls) + + parts: list[object] = [_summary_panel(last, stats)] + err = _last_error(calls) + if err is not None: + parts.append(_error_banner(err)) + if hits: + split = Table.grid(padding=(0, 2)) + split.add_column(no_wrap=False) + split.add_column(no_wrap=False) + split.add_row(_model_rollup_table(rollups), _route_hit_table(hits)) + parts.append(split) + else: + parts.append(_model_rollup_table(rollups)) + parts.append(_recent_table(calls)) + parts.append(_footer(stats)) + # Cap overall width so wide terminals don't stretch the layout into a + # mostly-whitespace gap between columns. + return Align.left(Group(*parts), width=MAX_WIDTH) diff --git a/cli/planoai/obs_cmd.py b/cli/planoai/obs_cmd.py new file mode 100644 index 00000000..6249df30 --- /dev/null +++ b/cli/planoai/obs_cmd.py @@ -0,0 +1,99 @@ +"""`planoai obs` — live observability TUI.""" + +from __future__ import annotations + +import time + +import rich_click as click +from rich.console import Console +from rich.live import Live + +from planoai.consts import PLANO_COLOR +from planoai.obs.collector import ( + DEFAULT_CAPACITY, + DEFAULT_GRPC_PORT, + LLMCallStore, + ObsCollector, +) +from planoai.obs.pricing import PricingCatalog +from planoai.obs.render import render + + +@click.command(name="obs", help="Live observability console for Plano LLM traffic.") +@click.option( + "--port", + type=int, + default=DEFAULT_GRPC_PORT, + show_default=True, + help="OTLP/gRPC port to listen on. 
Must match the brightstaff tracing endpoint.",
+)
+@click.option(
+    "--host",
+    type=str,
+    default="0.0.0.0",
+    show_default=True,
+    help="Host to bind the OTLP listener.",
+)
+@click.option(
+    "--capacity",
+    type=int,
+    default=DEFAULT_CAPACITY,
+    show_default=True,
+    help="Max LLM calls kept in memory; older calls evicted FIFO.",
+)
+@click.option(
+    "--refresh-ms",
+    type=int,
+    default=500,
+    show_default=True,
+    help="TUI refresh interval in milliseconds.",
+)
+def obs(port: int, host: str, capacity: int, refresh_ms: int) -> None:
+    console = Console()
+    console.print(
+        f"[bold {PLANO_COLOR}]planoai obs[/] — loading DO pricing catalog...",
+        end="",
+    )
+    pricing = PricingCatalog.fetch()
+    if len(pricing):
+        sample = ", ".join(pricing.sample_models(3))
+        console.print(
+            f" [green]{len(pricing)} models loaded[/] [dim]({sample}, ...)[/]"
+        )
+    else:
+        console.print(
+            " [yellow]no pricing loaded[/] — "
+            "[dim]cost column will be blank (DO catalog unreachable)[/]"
+        )
+
+    store = LLMCallStore(capacity=capacity)
+    collector = ObsCollector(store=store, pricing=pricing, host=host, port=port)
+    try:
+        collector.start()
+    except OSError as exc:
+        console.print(f"[red]{exc}[/]")
+        raise SystemExit(1)
+
+    console.print(
+        f"Listening for OTLP spans on [bold]{host}:{port}[/]. "
+        "Ensure plano config has [cyan]tracing.opentracing_grpc_endpoint: http://localhost:4317[/] "
+        "and [cyan]tracing.random_sampling: 100[/] (or run [bold]planoai up[/] "
+        "with no config — it wires this automatically)."
+    )
+    console.print("Press [bold]Ctrl-C[/] to exit.\n")
+
+    refresh = max(0.05, refresh_ms / 1000.0)
+    try:
+        with Live(
+            render(store.snapshot()),
+            console=console,
+            refresh_per_second=1.0 / refresh,
+            screen=False,
+        ) as live:
+            while True:
+                time.sleep(refresh)
+                live.update(render(store.snapshot()))
+    except KeyboardInterrupt:
+        console.print("\n[dim]obs stopped[/]")
+    finally:
+        collector.stop()
diff --git a/cli/planoai/rich_click_config.py b/cli/planoai/rich_click_config.py
index ba75bc23..fe90dcf1 100644
--- a/cli/planoai/rich_click_config.py
+++ b/cli/planoai/rich_click_config.py
@@ -61,7 +61,7 @@ def configure_rich_click(plano_color: str) -> None:
         },
         {
             "name": "Observability",
-            "commands": ["trace"],
+            "commands": ["trace", "obs"],
         },
         {
             "name": "Utilities",
diff --git a/cli/planoai/utils.py b/cli/planoai/utils.py
index 8f73bf18..214fd0a3 100644
--- a/cli/planoai/utils.py
+++ b/cli/planoai/utils.py
@@ -91,7 +91,12 @@ def convert_legacy_listeners(
         "type": "model",
         "port": 12000,
         "address": "0.0.0.0",
-        "timeout": "30s",
+        # LLM streaming responses routinely exceed 30s (extended thinking,
+        # long tool reasoning, large completions). Match the 300s ceiling
+        # used by the direct upstream-provider routes so Envoy doesn't abort
+        # streams with a UT (upstream request timeout) mid-response. Users
+        # can override via their plano_config.yaml `listeners.timeout` field.
+        "timeout": "300s",
         "model_providers": model_providers or [],
     }
 
@@ -100,7 +105,7 @@ def convert_legacy_listeners(
         "type": "prompt",
         "port": 10000,
         "address": "0.0.0.0",
-        "timeout": "30s",
+        "timeout": "300s",
     }
 
     # Handle None case
diff --git a/cli/pyproject.toml b/cli/pyproject.toml
index 1864a915..0be85ed5 100644
--- a/cli/pyproject.toml
+++ b/cli/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "planoai"
-version = "0.4.19"
+version = "0.4.21"
 description = "Python-based CLI tool to manage Plano."
authors = [{name = "Katanemo Labs, Inc."}] readme = "README.md" diff --git a/cli/test/test_config_generator.py b/cli/test/test_config_generator.py index 17fa56cc..77b5b480 100644 --- a/cli/test/test_config_generator.py +++ b/cli/test/test_config_generator.py @@ -1,7 +1,11 @@ import json import pytest +import yaml from unittest import mock -from planoai.config_generator import validate_and_render_schema +from planoai.config_generator import ( + validate_and_render_schema, + migrate_inline_routing_preferences, +) @pytest.fixture(autouse=True) @@ -253,38 +257,72 @@ llm_providers: base_url: "http://custom.com/api/v2" provider_interface: openai +""", + }, + { + "id": "vercel_is_supported_provider", + "expected_error": None, + "plano_config": """ +version: v0.4.0 + +listeners: + - name: llm + type: model + port: 12000 + +model_providers: + - model: vercel/* + base_url: https://ai-gateway.vercel.sh/v1 + passthrough_auth: true + +""", + }, + { + "id": "openrouter_is_supported_provider", + "expected_error": None, + "plano_config": """ +version: v0.4.0 + +listeners: + - name: llm + type: model + port: 12000 + +model_providers: + - model: openrouter/* + base_url: https://openrouter.ai/api/v1 + passthrough_auth: true + """, }, { "id": "duplicate_routeing_preference_name", "expected_error": "Duplicate routing preference name", "plano_config": """ -version: v0.1.0 +version: v0.4.0 listeners: - egress_traffic: - address: 0.0.0.0 + - name: llm + type: model port: 12000 - message_format: openai - timeout: 30s - -llm_providers: +model_providers: - model: openai/gpt-4o-mini access_key: $OPENAI_API_KEY default: true - model: openai/gpt-4o access_key: $OPENAI_API_KEY - routing_preferences: - - name: code understanding - description: understand and explain existing code snippets, functions, or libraries - - model: openai/gpt-4.1 - access_key: $OPENAI_API_KEY - routing_preferences: - - name: code understanding - description: generating new code snippets, functions, or boilerplate based on user prompts or requirements +routing_preferences: + - name: code understanding + description: understand and explain existing code snippets, functions, or libraries + models: + - openai/gpt-4o + - name: code understanding + description: generating new code snippets, functions, or boilerplate based on user prompts or requirements + models: + - openai/gpt-4o-mini tracing: random_sampling: 100 @@ -465,3 +503,238 @@ def test_convert_legacy_llm_providers_no_prompt_gateway(): "port": 12000, "timeout": "30s", } + + +def test_inline_routing_preferences_migrated_to_top_level(): + plano_config = """ +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + routing_preferences: + - name: code understanding + description: understand and explain existing code snippets, functions, or libraries + + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + routing_preferences: + - name: code generation + description: generating new code snippets, functions, or boilerplate based on user prompts or requirements +""" + config_yaml = yaml.safe_load(plano_config) + migrate_inline_routing_preferences(config_yaml) + + assert config_yaml["version"] == "v0.4.0" + for provider in config_yaml["model_providers"]: + assert "routing_preferences" not in provider + + top_level = config_yaml["routing_preferences"] + by_name = {entry["name"]: entry for entry in 
top_level} + assert set(by_name) == {"code understanding", "code generation"} + assert by_name["code understanding"]["models"] == ["openai/gpt-4o"] + assert by_name["code generation"]["models"] == [ + "anthropic/claude-sonnet-4-20250514" + ] + assert ( + by_name["code understanding"]["description"] + == "understand and explain existing code snippets, functions, or libraries" + ) + + +def test_inline_same_name_across_providers_merges_models(): + plano_config = """ +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + routing_preferences: + - name: code generation + description: generating new code snippets, functions, or boilerplate based on user prompts or requirements + + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + routing_preferences: + - name: code generation + description: generating new code snippets, functions, or boilerplate based on user prompts or requirements +""" + config_yaml = yaml.safe_load(plano_config) + migrate_inline_routing_preferences(config_yaml) + + top_level = config_yaml["routing_preferences"] + assert len(top_level) == 1 + entry = top_level[0] + assert entry["name"] == "code generation" + assert entry["models"] == [ + "openai/gpt-4o", + "anthropic/claude-sonnet-4-20250514", + ] + assert config_yaml["version"] == "v0.4.0" + + +def test_existing_top_level_routing_preferences_preserved(): + plano_config = """ +version: v0.4.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + +routing_preferences: + - name: code generation + description: generating new code snippets or boilerplate + models: + - openai/gpt-4o + - anthropic/claude-sonnet-4-20250514 +""" + config_yaml = yaml.safe_load(plano_config) + before = yaml.safe_dump(config_yaml, sort_keys=True) + migrate_inline_routing_preferences(config_yaml) + after = yaml.safe_dump(config_yaml, sort_keys=True) + + assert before == after + + +def test_existing_top_level_wins_over_inline_migration(): + plano_config = """ +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + routing_preferences: + - name: code generation + description: inline description should lose + +routing_preferences: + - name: code generation + description: user-defined top-level description wins + models: + - openai/gpt-4o +""" + config_yaml = yaml.safe_load(plano_config) + migrate_inline_routing_preferences(config_yaml) + + top_level = config_yaml["routing_preferences"] + assert len(top_level) == 1 + entry = top_level[0] + assert entry["description"] == "user-defined top-level description wins" + assert entry["models"] == ["openai/gpt-4o"] + + +def test_wildcard_with_inline_routing_preferences_errors(): + plano_config = """ +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openrouter/* + base_url: https://openrouter.ai/api/v1 + passthrough_auth: true + routing_preferences: + - name: code generation + description: generating code +""" + config_yaml = yaml.safe_load(plano_config) + with pytest.raises(Exception) as excinfo: + migrate_inline_routing_preferences(config_yaml) + assert "wildcard" in str(excinfo.value).lower() + + +def 
test_migration_bumps_version_even_without_inline_preferences(): + plano_config = """ +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY +""" + config_yaml = yaml.safe_load(plano_config) + migrate_inline_routing_preferences(config_yaml) + + assert "routing_preferences" not in config_yaml + assert config_yaml["version"] == "v0.4.0" + + +def test_migration_is_noop_on_v040_config_with_stray_inline_preferences(): + # v0.4.0 configs are assumed to be on the canonical top-level shape. + # The migration intentionally does not rescue stray inline preferences + # at v0.4.0+ so that the deprecation boundary is a clean version gate. + plano_config = """ +version: v0.4.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + routing_preferences: + - name: code generation + description: generating new code +""" + config_yaml = yaml.safe_load(plano_config) + migrate_inline_routing_preferences(config_yaml) + + assert config_yaml["version"] == "v0.4.0" + assert "routing_preferences" not in config_yaml + assert config_yaml["model_providers"][0]["routing_preferences"] == [ + {"name": "code generation", "description": "generating new code"} + ] + + +def test_migration_does_not_downgrade_newer_versions(): + plano_config = """ +version: v0.5.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY +""" + config_yaml = yaml.safe_load(plano_config) + migrate_inline_routing_preferences(config_yaml) + + assert config_yaml["version"] == "v0.5.0" diff --git a/cli/test/test_defaults.py b/cli/test/test_defaults.py new file mode 100644 index 00000000..7017a70c --- /dev/null +++ b/cli/test/test_defaults.py @@ -0,0 +1,111 @@ +from pathlib import Path + +import jsonschema +import yaml + +from planoai.defaults import ( + PROVIDER_DEFAULTS, + detect_providers, + synthesize_default_config, +) + +_SCHEMA_PATH = Path(__file__).parents[2] / "config" / "plano_config_schema.yaml" + + +def _schema() -> dict: + return yaml.safe_load(_SCHEMA_PATH.read_text()) + + +def test_zero_env_vars_produces_pure_passthrough(): + cfg = synthesize_default_config(env={}) + assert cfg["version"] == "v0.4.0" + assert cfg["listeners"][0]["port"] == 12000 + for provider in cfg["model_providers"]: + assert provider.get("passthrough_auth") is True + assert "access_key" not in provider + # No provider should be marked default in pure pass-through mode. + assert provider.get("default") is not True + # All known providers should be listed. + names = {p["name"] for p in cfg["model_providers"]} + assert "digitalocean" in names + assert "vercel" in names + assert "openrouter" in names + assert "openai" in names + assert "anthropic" in names + + +def test_env_keys_promote_providers_to_env_keyed(): + cfg = synthesize_default_config( + env={"OPENAI_API_KEY": "sk-1", "DO_API_KEY": "do-1"} + ) + by_name = {p["name"]: p for p in cfg["model_providers"]} + assert by_name["openai"].get("access_key") == "$OPENAI_API_KEY" + assert by_name["openai"].get("passthrough_auth") is None + assert by_name["digitalocean"].get("access_key") == "$DO_API_KEY" + # Unset env keys remain pass-through. 
+ assert by_name["anthropic"].get("passthrough_auth") is True + + +def test_no_default_is_synthesized(): + # Bare model names resolve via brightstaff's wildcard expansion registering + # bare keys, so the synthesizer intentionally never sets `default: true`. + cfg = synthesize_default_config( + env={"OPENAI_API_KEY": "sk-1", "ANTHROPIC_API_KEY": "a-1"} + ) + assert not any(p.get("default") is True for p in cfg["model_providers"]) + + +def test_listener_port_is_configurable(): + cfg = synthesize_default_config(env={}, listener_port=11000) + assert cfg["listeners"][0]["port"] == 11000 + + +def test_detection_summary_strings(): + det = detect_providers(env={"OPENAI_API_KEY": "sk", "DO_API_KEY": "d"}) + summary = det.summary + assert "env-keyed" in summary and "openai" in summary and "digitalocean" in summary + assert "pass-through" in summary + + +def test_tracing_block_points_at_local_console(): + cfg = synthesize_default_config(env={}) + tracing = cfg["tracing"] + assert tracing["opentracing_grpc_endpoint"] == "http://localhost:4317" + # random_sampling is a percentage in the plano config — 100 = every span. + assert tracing["random_sampling"] == 100 + + +def test_synthesized_config_validates_against_schema(): + cfg = synthesize_default_config(env={"OPENAI_API_KEY": "sk"}) + jsonschema.validate(cfg, _schema()) + + +def test_provider_defaults_digitalocean_is_configured(): + by_name = {p.name: p for p in PROVIDER_DEFAULTS} + assert "digitalocean" in by_name + assert by_name["digitalocean"].env_var == "DO_API_KEY" + assert by_name["digitalocean"].base_url == "https://inference.do-ai.run/v1" + assert by_name["digitalocean"].model_pattern == "digitalocean/*" + + +def test_provider_defaults_vercel_is_configured(): + by_name = {p.name: p for p in PROVIDER_DEFAULTS} + assert "vercel" in by_name + assert by_name["vercel"].env_var == "AI_GATEWAY_API_KEY" + assert by_name["vercel"].base_url == "https://ai-gateway.vercel.sh/v1" + assert by_name["vercel"].model_pattern == "vercel/*" + + +def test_provider_defaults_openrouter_is_configured(): + by_name = {p.name: p for p in PROVIDER_DEFAULTS} + assert "openrouter" in by_name + assert by_name["openrouter"].env_var == "OPENROUTER_API_KEY" + assert by_name["openrouter"].base_url == "https://openrouter.ai/api/v1" + assert by_name["openrouter"].model_pattern == "openrouter/*" + + +def test_openrouter_env_key_promotes_to_env_keyed(): + cfg = synthesize_default_config(env={"OPENROUTER_API_KEY": "or-1"}) + by_name = {p["name"]: p for p in cfg["model_providers"]} + assert by_name["openrouter"].get("access_key") == "$OPENROUTER_API_KEY" + assert by_name["openrouter"].get("passthrough_auth") is None diff --git a/cli/test/test_obs_collector.py b/cli/test/test_obs_collector.py new file mode 100644 index 00000000..a16506d9 --- /dev/null +++ b/cli/test/test_obs_collector.py @@ -0,0 +1,145 @@ +import time +from datetime import datetime, timezone +from types import SimpleNamespace +from unittest.mock import MagicMock + +import pytest + +from planoai.obs.collector import LLMCall, LLMCallStore, span_to_llm_call + + +def _mk_attr(key: str, value): + v = MagicMock() + if isinstance(value, bool): + v.WhichOneof.return_value = "bool_value" + v.bool_value = value + elif isinstance(value, int): + v.WhichOneof.return_value = "int_value" + v.int_value = value + elif isinstance(value, float): + v.WhichOneof.return_value = "double_value" + v.double_value = value + else: + v.WhichOneof.return_value = "string_value" + v.string_value = str(value) + kv = MagicMock() + kv.key = key + 
kv.value = v + return kv + + +def _mk_span( + attrs: dict, start_ns: int | None = None, span_id_hex: str = "ab" +) -> MagicMock: + span = MagicMock() + span.attributes = [_mk_attr(k, v) for k, v in attrs.items()] + span.start_time_unix_nano = start_ns or int(time.time() * 1_000_000_000) + span.span_id.hex.return_value = span_id_hex + return span + + +def test_span_without_llm_model_is_ignored(): + span = _mk_span({"http.method": "POST"}) + assert span_to_llm_call(span, "plano(llm)") is None + + +def test_span_with_full_llm_attrs_produces_call(): + span = _mk_span( + { + "llm.model": "openai-gpt-5.4", + "model.requested": "router:software-engineering", + "plano.session_id": "sess-abc", + "plano.route.name": "software-engineering", + "llm.is_streaming": False, + "llm.duration_ms": 1234, + "llm.time_to_first_token": 210, + "llm.usage.prompt_tokens": 100, + "llm.usage.completion_tokens": 50, + "llm.usage.total_tokens": 150, + "llm.usage.cached_input_tokens": 30, + "llm.usage.cache_creation_tokens": 5, + "llm.usage.reasoning_tokens": 200, + "http.status_code": 200, + "request_id": "req-42", + } + ) + call = span_to_llm_call(span, "plano(llm)") + assert call is not None + assert call.request_id == "req-42" + assert call.model == "openai-gpt-5.4" + assert call.request_model == "router:software-engineering" + assert call.session_id == "sess-abc" + assert call.route_name == "software-engineering" + assert call.is_streaming is False + assert call.duration_ms == 1234.0 + assert call.ttft_ms == 210.0 + assert call.prompt_tokens == 100 + assert call.completion_tokens == 50 + assert call.total_tokens == 150 + assert call.cached_input_tokens == 30 + assert call.cache_creation_tokens == 5 + assert call.reasoning_tokens == 200 + assert call.status_code == 200 + + +def test_pricing_lookup_attaches_cost(): + class StubPricing: + def cost_for_call(self, call): + # Simple: 2 * prompt + 3 * completion, in cents + return 0.02 * (call.prompt_tokens or 0) + 0.03 * ( + call.completion_tokens or 0 + ) + + span = _mk_span( + { + "llm.model": "do/openai-gpt-5.4", + "llm.usage.prompt_tokens": 10, + "llm.usage.completion_tokens": 2, + } + ) + call = span_to_llm_call(span, "plano(llm)", pricing=StubPricing()) + assert call is not None + assert call.cost_usd == pytest.approx(0.26) + + +def test_tpt_and_tokens_per_sec_derived(): + call = LLMCall( + request_id="x", + timestamp=datetime.now(tz=timezone.utc), + model="m", + duration_ms=1000, + ttft_ms=200, + completion_tokens=80, + ) + # (1000 - 200) / 80 = 10ms per token => 100 tokens/sec + assert call.tpt_ms == 10.0 + assert call.tokens_per_sec == 100.0 + + +def test_tpt_returns_none_when_no_completion_tokens(): + call = LLMCall( + request_id="x", + timestamp=datetime.now(tz=timezone.utc), + model="m", + duration_ms=1000, + ttft_ms=200, + completion_tokens=0, + ) + assert call.tpt_ms is None + assert call.tokens_per_sec is None + + +def test_store_evicts_fifo_at_capacity(): + store = LLMCallStore(capacity=3) + now = datetime.now(tz=timezone.utc) + for i in range(5): + store.add( + LLMCall( + request_id=f"r{i}", + timestamp=now, + model="m", + ) + ) + snap = store.snapshot() + assert len(snap) == 3 + assert [c.request_id for c in snap] == ["r2", "r3", "r4"] diff --git a/cli/test/test_obs_pricing.py b/cli/test/test_obs_pricing.py new file mode 100644 index 00000000..02247d3d --- /dev/null +++ b/cli/test/test_obs_pricing.py @@ -0,0 +1,146 @@ +from datetime import datetime, timezone + +from planoai.obs.collector import LLMCall +from planoai.obs.pricing import ModelPrice, 
PricingCatalog + + +def _call(model: str, prompt: int, completion: int, cached: int = 0) -> LLMCall: + return LLMCall( + request_id="r", + timestamp=datetime.now(tz=timezone.utc), + model=model, + prompt_tokens=prompt, + completion_tokens=completion, + cached_input_tokens=cached, + ) + + +def test_lookup_matches_bare_and_prefixed(): + prices = { + "openai-gpt-5.4": ModelPrice( + input_per_token_usd=0.000001, output_per_token_usd=0.000002 + ) + } + catalog = PricingCatalog(prices) + assert catalog.price_for("openai-gpt-5.4") is not None + # do/openai-gpt-5.4 should resolve after stripping the provider prefix. + assert catalog.price_for("do/openai-gpt-5.4") is not None + assert catalog.price_for("unknown-model") is None + + +def test_cost_computation_without_cache(): + prices = { + "m": ModelPrice(input_per_token_usd=0.000001, output_per_token_usd=0.000002) + } + cost = PricingCatalog(prices).cost_for_call(_call("m", 1000, 500)) + assert cost == 0.002 # 1000 * 1e-6 + 500 * 2e-6 + + +def test_cost_computation_with_cached_discount(): + prices = { + "m": ModelPrice( + input_per_token_usd=0.000001, + output_per_token_usd=0.000002, + cached_input_per_token_usd=0.0000001, + ) + } + # 800 fresh @ 1e-6 = 8e-4; 200 cached @ 1e-7 = 2e-5; 500 out @ 2e-6 = 1e-3 + cost = PricingCatalog(prices).cost_for_call(_call("m", 1000, 500, cached=200)) + assert cost == round(0.0008 + 0.00002 + 0.001, 6) + + +def test_empty_catalog_returns_none(): + assert PricingCatalog().cost_for_call(_call("m", 100, 50)) is None + + +def test_parse_do_catalog_treats_small_values_as_per_token(): + """DO's real catalog uses per-token values under the `_per_million` key + (e.g. 5E-8 for GPT-oss-20b). We treat values < 1 as already per-token.""" + from planoai.obs.pricing import _parse_do_pricing + + sample = { + "data": [ + { + "model_id": "openai-gpt-oss-20b", + "pricing": { + "input_price_per_million": 5e-8, + "output_price_per_million": 4.5e-7, + }, + }, + { + "model_id": "openai-gpt-oss-120b", + "pricing": { + "input_price_per_million": 1e-7, + "output_price_per_million": 7e-7, + }, + }, + ] + } + prices = _parse_do_pricing(sample) + # Values < 1 are assumed to already be per-token — no extra division. + assert prices["openai-gpt-oss-20b"].input_per_token_usd == 5e-8 + assert prices["openai-gpt-oss-20b"].output_per_token_usd == 4.5e-7 + assert prices["openai-gpt-oss-120b"].input_per_token_usd == 1e-7 + + +def test_anthropic_aliases_match_plano_emitted_names(): + """DO publishes 'anthropic-claude-opus-4.7' and 'anthropic-claude-haiku-4.5'; + Plano emits 'claude-opus-4-7' and 'claude-haiku-4-5-20251001'. Aliases + registered at parse time should bridge the gap.""" + from planoai.obs.pricing import _parse_do_pricing + + sample = { + "data": [ + { + "model_id": "anthropic-claude-opus-4.7", + "pricing": { + "input_price_per_million": 15.0, + "output_price_per_million": 75.0, + }, + }, + { + "model_id": "anthropic-claude-haiku-4.5", + "pricing": { + "input_price_per_million": 1.0, + "output_price_per_million": 5.0, + }, + }, + { + "model_id": "anthropic-claude-4.6-sonnet", + "pricing": { + "input_price_per_million": 3.0, + "output_price_per_million": 15.0, + }, + }, + ] + } + catalog = PricingCatalog(_parse_do_pricing(sample)) + # Family-last shapes Plano emits. + assert catalog.price_for("claude-opus-4-7") is not None + assert catalog.price_for("claude-haiku-4-5") is not None + # Date-suffixed name (Anthropic API style). 
+ assert catalog.price_for("claude-haiku-4-5-20251001") is not None + # Word-order swap: DO has 'claude-4.6-sonnet', Plano emits 'claude-sonnet-4-6'. + assert catalog.price_for("claude-sonnet-4-6") is not None + # Original DO ids still resolve. + assert catalog.price_for("anthropic-claude-opus-4.7") is not None + + +def test_parse_do_catalog_divides_large_values_as_per_million(): + """A provider that genuinely reports $5-per-million in that field gets divided.""" + from planoai.obs.pricing import _parse_do_pricing + + sample = { + "data": [ + { + "model_id": "mystery-model", + "pricing": { + "input_price_per_million": 5.0, # > 1 → treated as per-million + "output_price_per_million": 15.0, + }, + }, + ] + } + prices = _parse_do_pricing(sample) + assert prices["mystery-model"].input_per_token_usd == 5.0 / 1_000_000 + assert prices["mystery-model"].output_per_token_usd == 15.0 / 1_000_000 diff --git a/cli/test/test_obs_render.py b/cli/test/test_obs_render.py new file mode 100644 index 00000000..dd598363 --- /dev/null +++ b/cli/test/test_obs_render.py @@ -0,0 +1,106 @@ +from datetime import datetime, timedelta, timezone + +from planoai.obs.collector import LLMCall +from planoai.obs.render import aggregates, model_rollups, route_hits + + +def _call( + model: str, + ts: datetime, + prompt=0, + completion=0, + cost=None, + route=None, + session=None, + cache_read=0, + cache_write=0, +): + return LLMCall( + request_id="r", + timestamp=ts, + model=model, + prompt_tokens=prompt, + completion_tokens=completion, + cached_input_tokens=cache_read, + cache_creation_tokens=cache_write, + cost_usd=cost, + route_name=route, + session_id=session, + ) + + +def test_aggregates_sum_and_session_counts(): + now = datetime.now(tz=timezone.utc).astimezone() + calls = [ + _call( + "m1", + now - timedelta(seconds=50), + prompt=10, + completion=5, + cost=0.001, + session="s1", + ), + _call( + "m2", + now - timedelta(seconds=40), + prompt=20, + completion=10, + cost=0.002, + session="s1", + ), + _call( + "m1", + now - timedelta(seconds=30), + prompt=30, + completion=15, + cost=0.003, + session="s2", + ), + ] + stats = aggregates(calls) + assert stats.count == 3 + assert stats.total_cost_usd == 0.006 + assert stats.total_input_tokens == 60 + assert stats.total_output_tokens == 30 + assert stats.distinct_sessions == 2 + assert stats.current_session == "s2" + + +def test_rollups_split_by_model_and_cache(): + now = datetime.now(tz=timezone.utc).astimezone() + calls = [ + _call( + "m1", now, prompt=10, completion=5, cost=0.001, cache_write=3, cache_read=7 + ), + _call("m1", now, prompt=20, completion=10, cost=0.002, cache_read=1), + _call("m2", now, prompt=30, completion=15, cost=0.004), + ] + rollups = model_rollups(calls) + by_model = {r.model: r for r in rollups} + assert by_model["m1"].requests == 2 + assert by_model["m1"].input_tokens == 30 + assert by_model["m1"].cache_write == 3 + assert by_model["m1"].cache_read == 8 + assert by_model["m2"].input_tokens == 30 + + +def test_route_hits_only_for_routed_calls(): + now = datetime.now(tz=timezone.utc).astimezone() + calls = [ + _call("m", now, route="code"), + _call("m", now, route="code"), + _call("m", now, route="summarization"), + _call("m", now), # no route + ] + hits = route_hits(calls) + # Only calls with route names are counted. 
+ assert sum(h.hits for h in hits) == 3 + hits_by_name = {h.route: h for h in hits} + assert hits_by_name["code"].hits == 2 + assert hits_by_name["summarization"].hits == 1 + + +def test_route_hits_empty_when_no_routes(): + now = datetime.now(tz=timezone.utc).astimezone() + calls = [_call("m", now), _call("m", now)] + assert route_hits(calls) == [] diff --git a/cli/uv.lock b/cli/uv.lock index 665ebdb8..8910b0a4 100644 --- a/cli/uv.lock +++ b/cli/uv.lock @@ -337,7 +337,7 @@ wheels = [ [[package]] name = "planoai" -version = "0.4.18" +version = "0.4.21" source = { editable = "." } dependencies = [ { name = "click" }, diff --git a/config/envoy.template.yaml b/config/envoy.template.yaml index 5669511d..b2b9fb1f 100644 --- a/config/envoy.template.yaml +++ b/config/envoy.template.yaml @@ -901,6 +901,33 @@ static_resources: validation_context: trusted_ca: filename: {{ upstream_tls_ca_path | default('/etc/ssl/certs/ca-certificates.crt') }} + - name: digitalocean + connect_timeout: {{ upstream_connect_timeout | default('5s') }} + type: LOGICAL_DNS + dns_lookup_family: V4_ONLY + lb_policy: ROUND_ROBIN + load_assignment: + cluster_name: digitalocean + endpoints: + - lb_endpoints: + - endpoint: + address: + socket_address: + address: inference.do-ai.run + port_value: 443 + hostname: "inference.do-ai.run" + transport_socket: + name: envoy.transport_sockets.tls + typed_config: + "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext + sni: inference.do-ai.run + common_tls_context: + tls_params: + tls_minimum_protocol_version: TLSv1_2 + tls_maximum_protocol_version: TLSv1_3 + validation_context: + trusted_ca: + filename: {{ upstream_tls_ca_path | default('/etc/ssl/certs/ca-certificates.crt') }} - name: xiaomi connect_timeout: {{ upstream_connect_timeout | default('5s') }} type: LOGICAL_DNS diff --git a/config/grafana/brightstaff_dashboard.json b/config/grafana/brightstaff_dashboard.json new file mode 100644 index 00000000..4b54721f --- /dev/null +++ b/config/grafana/brightstaff_dashboard.json @@ -0,0 +1,541 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "RED, LLM upstream, routing service, and process metrics for brightstaff. 
Pair with Envoy admin metrics from cluster=bright_staff.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 100, + "panels": [], + "title": "HTTP RED", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisLabel": "req/s", + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 }, + "id": 1, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (handler) (rate(brightstaff_http_requests_total[1m]))", + "legendFormat": "{{handler}}", + "refId": "A" + } + ], + "title": "Rate — brightstaff RPS by handler", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "5xx fraction over 5m. Page-worthy when sustained above ~1%.", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.01 }, + { "color": "red", "value": 0.05 } + ] + }, + "unit": "percentunit" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum(rate(brightstaff_http_requests_total{status_class=\"5xx\"}[5m])) / clamp_min(sum(rate(brightstaff_http_requests_total[5m])), 1)", + "legendFormat": "5xx rate", + "refId": "A" + } + ], + "title": "Errors — brightstaff 5xx rate", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "p50/p95/p99 by handler, computed from histogram buckets over 5m.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 5, "lineWidth": 1, "showPoints": "never" }, + "unit": "s" + } + }, + "gridPos": { "h": 9, "w": 24, "x": 0, "y": 9 }, + "id": 3, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.50, sum by (le, handler) (rate(brightstaff_http_request_duration_seconds_bucket[5m])))", + "legendFormat": "p50 {{handler}}", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.95, sum by (le, handler) (rate(brightstaff_http_request_duration_seconds_bucket[5m])))", + "legendFormat": "p95 {{handler}}", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.99, sum by (le, handler) (rate(brightstaff_http_request_duration_seconds_bucket[5m])))", + "legendFormat": "p99 {{handler}}", + "refId": "C" + } + ], + "title": "Duration — p50 / p95 / p99 by handler", + "type": "timeseries" + }, + { + 
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "In-flight requests by handler. Climbs before latency does when brightstaff is saturated.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 1, "showPoints": "never" }, + "unit": "short" + } + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 18 }, + "id": 4, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (handler) (brightstaff_http_in_flight_requests)", + "legendFormat": "{{handler}}", + "refId": "A" + } + ], + "title": "In-flight requests by handler", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 }, + "id": 200, + "panels": [], + "title": "LLM upstream", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 5, "lineWidth": 1, "showPoints": "never" }, + "unit": "s" + } + }, + "gridPos": { "h": 9, "w": 12, "x": 0, "y": 27 }, + "id": 5, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.95, sum by (le, provider, model) (rate(brightstaff_llm_upstream_duration_seconds_bucket[5m])))", + "legendFormat": "p95 {{provider}}/{{model}}", + "refId": "A" + } + ], + "title": "LLM upstream p95 by provider/model", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "All non-success error classes. timeout/connect = network, 5xx/429 = provider, parse = body shape mismatch, stream = mid-stream disconnect.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 30, "lineWidth": 1, "showPoints": "never", "stacking": { "mode": "normal" } }, + "unit": "reqps" + } + }, + "gridPos": { "h": 9, "w": 12, "x": 12, "y": 27 }, + "id": 6, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (provider, error_class) (rate(brightstaff_llm_upstream_requests_total{error_class!=\"none\"}[5m]))", + "legendFormat": "{{provider}} / {{error_class}}", + "refId": "A" + } + ], + "title": "LLM upstream errors by provider / class", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "Streaming only. 
Empty if the route never streams.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 5, "lineWidth": 1, "showPoints": "never" }, + "unit": "s" + } + }, + "gridPos": { "h": 9, "w": 12, "x": 0, "y": 36 }, + "id": 7, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.95, sum by (le, provider, model) (rate(brightstaff_llm_time_to_first_token_seconds_bucket[5m])))", + "legendFormat": "p95 {{provider}}/{{model}}", + "refId": "A" + } + ], + "title": "Time-to-first-token p95 (streaming)", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "Tokens/sec by provider/model/kind — proxy for cost. Stacked.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 30, "lineWidth": 1, "showPoints": "never", "stacking": { "mode": "normal" } }, + "unit": "tokens/s" + } + }, + "gridPos": { "h": 9, "w": 12, "x": 12, "y": 36 }, + "id": 8, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (provider, model, kind) (rate(brightstaff_llm_tokens_total[5m]))", + "legendFormat": "{{provider}}/{{model}} {{kind}}", + "refId": "A" + } + ], + "title": "Token throughput by provider / model / kind", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 }, + "id": 300, + "panels": [], + "title": "Routing service", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "Which models the orchestrator picked over the last 15 minutes.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "unit": "short" + } + }, + "gridPos": { "h": 9, "w": 12, "x": 0, "y": 46 }, + "id": 9, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (selected_model) (increase(brightstaff_router_decisions_total[15m]))", + "legendFormat": "{{selected_model}}", + "refId": "A" + } + ], + "title": "Model selection distribution (last 15m)", + "type": "bargauge" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "Fraction of decisions that fell back (orchestrator returned `none` or errored). 
High = router can't classify intent or no candidates configured.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 1, "showPoints": "never" }, + "unit": "percentunit" + } + }, + "gridPos": { "h": 9, "w": 12, "x": 12, "y": 46 }, + "id": 10, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (route) (rate(brightstaff_router_decisions_total{fallback=\"true\"}[5m])) / clamp_min(sum by (route) (rate(brightstaff_router_decisions_total[5m])), 1)", + "legendFormat": "{{route}}", + "refId": "A" + } + ], + "title": "Fallback rate by route", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 5, "lineWidth": 1, "showPoints": "never" }, + "unit": "s" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 55 }, + "id": 11, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.95, sum by (le, route) (rate(brightstaff_router_decision_duration_seconds_bucket[5m])))", + "legendFormat": "p95 {{route}}", + "refId": "A" + } + ], + "title": "Router decision p95 latency", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "Hit / (hit + miss). Low ratio = sessions aren't being reused or TTL too short.", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 0.5 }, + { "color": "green", "value": 0.8 } + ] + }, + "unit": "percentunit", + "min": 0, + "max": 1 + } + }, + "gridPos": { "h": 8, "w": 6, "x": 12, "y": 55 }, + "id": 12, + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum(rate(brightstaff_session_cache_events_total{outcome=\"hit\"}[5m])) / clamp_min(sum(rate(brightstaff_session_cache_events_total{outcome=~\"hit|miss\"}[5m])), 1)", + "legendFormat": "hit rate", + "refId": "A" + } + ], + "title": "Session cache hit rate", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "decision_served = a real model picked. no_candidates = sentinel `none` returned. 
policy_error = orchestrator failed.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 30, "lineWidth": 1, "showPoints": "never", "stacking": { "mode": "normal" } }, + "unit": "reqps" + } + }, + "gridPos": { "h": 8, "w": 6, "x": 18, "y": 55 }, + "id": 13, + "options": { + "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (outcome) (rate(brightstaff_routing_service_requests_total[5m]))", + "legendFormat": "{{outcome}}", + "refId": "A" + } + ], + "title": "/routing/* outcomes", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 63 }, + "id": 400, + "panels": [], + "title": "Process & Envoy link", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "Compare to brightstaff RPS (panel 1) — sustained gap = network or Envoy queueing.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 1, "showPoints": "never" }, + "unit": "reqps" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 64 }, + "id": 14, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum(rate(envoy_cluster_upstream_rq_total{envoy_cluster_name=\"bright_staff\"}[1m]))", + "legendFormat": "envoy → bright_staff", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum(rate(brightstaff_http_requests_total[1m]))", + "legendFormat": "brightstaff served", + "refId": "B" + } + ], + "title": "Envoy → brightstaff link health", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 1, "showPoints": "never" } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "RSS" }, + "properties": [{ "id": "unit", "value": "bytes" }] + }, + { + "matcher": { "id": "byName", "options": "CPU" }, + "properties": [{ "id": "unit", "value": "percentunit" }] + } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 64 }, + "id": 15, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "process_resident_memory_bytes{job=\"brightstaff\"}", + "legendFormat": "RSS", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "rate(process_cpu_seconds_total{job=\"brightstaff\"}[1m])", + "legendFormat": "CPU", + "refId": "B" + } + ], + "title": "Brightstaff process RSS / CPU", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": ["plano", "brightstaff", "llm"], + "templating": { + "list": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "type": "datasource", + "query": "prometheus", + "current": { "selected": false, "text": "Prometheus", "value": "DS_PROMETHEUS" }, + "hide": 0, + "refresh": 1, + "regex": "", + "skipUrlSync": 
false, + "includeAll": false, + "multi": false + } + ] + }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "Brightstaff (Plano dataplane)", + "uid": "brightstaff", + "version": 1, + "weekStart": "" +} diff --git a/config/grafana/docker-compose.yaml b/config/grafana/docker-compose.yaml new file mode 100644 index 00000000..33238073 --- /dev/null +++ b/config/grafana/docker-compose.yaml @@ -0,0 +1,43 @@ +# One-command Prometheus + Grafana stack for observing a locally-running +# Plano (Envoy admin :9901 + brightstaff :9092 on the host). +# +# cd config/grafana +# docker compose up -d +# open http://localhost:3000 (admin / admin) +# +# Grafana is preloaded with: +# - Prometheus datasource (uid=DS_PROMETHEUS) → http://prometheus:9090 +# - Brightstaff dashboard (auto-imported from brightstaff_dashboard.json) +# +# Prometheus scrapes the host's :9092 and :9901 via host.docker.internal. +# On Linux this works because of the `extra_hosts: host-gateway` mapping +# below. On Mac/Win it works natively. + +services: + prometheus: + image: prom/prometheus:latest + container_name: plano-prometheus + ports: + - "9090:9090" + volumes: + - ./prometheus_scrape.yaml:/etc/prometheus/prometheus.yml:ro + extra_hosts: + - "host.docker.internal:host-gateway" + restart: unless-stopped + + grafana: + image: grafana/grafana:latest + container_name: plano-grafana + ports: + - "3000:3000" + environment: + GF_SECURITY_ADMIN_USER: admin + GF_SECURITY_ADMIN_PASSWORD: admin + GF_AUTH_ANONYMOUS_ENABLED: "true" + GF_AUTH_ANONYMOUS_ORG_ROLE: Viewer + volumes: + - ./provisioning:/etc/grafana/provisioning:ro + - ./brightstaff_dashboard.json:/var/lib/grafana/dashboards/brightstaff_dashboard.json:ro + depends_on: + - prometheus + restart: unless-stopped diff --git a/config/grafana/prometheus_scrape.yaml b/config/grafana/prometheus_scrape.yaml new file mode 100644 index 00000000..b4041287 --- /dev/null +++ b/config/grafana/prometheus_scrape.yaml @@ -0,0 +1,44 @@ +# Prometheus config that scrapes Plano (Envoy admin + brightstaff). This is +# a complete Prometheus config — mount it directly at +# /etc/prometheus/prometheus.yml. The included docker-compose.yaml does this +# for you. +# +# Targets: +# - envoy:9901 Envoy admin → envoy_cluster_*, envoy_http_*, envoy_server_*. +# - brightstaff:9092 Native dataplane → brightstaff_http_*, brightstaff_llm_*, +# brightstaff_router_*, process_*. +# +# Hostname `host.docker.internal` works on Docker Desktop (Mac/Win) and on +# Linux when the container is started with `--add-host=host.docker.internal: +# host-gateway` (the included compose does this). If Plano runs *inside* +# Docker on the same network as Prometheus, replace it with the container +# name (e.g. `plano:9092`). +# +# This file is unrelated to demos/llm_routing/model_routing_service/prometheus.yaml, +# which scrapes a fake metrics service to feed the routing engine. 
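+#
+# Sanity check (a sketch, assuming the bundled compose defaults with
+# Prometheus published on localhost:9090): after `docker compose up -d`,
+# both scrape targets should report health "up" via the standard
+# Prometheus targets API:
+#
+#   curl -s localhost:9090/api/v1/targets | grep -o '"health":"[a-z]*"'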
+ +global: + scrape_interval: 15s + scrape_timeout: 10s + evaluation_interval: 15s + +scrape_configs: + - job_name: envoy + honor_timestamps: true + metrics_path: /stats + params: + format: ["prometheus"] + static_configs: + - targets: + - host.docker.internal:9901 + labels: + service: plano + + - job_name: brightstaff + honor_timestamps: true + metrics_path: /metrics + static_configs: + - targets: + - host.docker.internal:9092 + labels: + service: plano diff --git a/config/grafana/provisioning/dashboards/brightstaff.yaml b/config/grafana/provisioning/dashboards/brightstaff.yaml new file mode 100644 index 00000000..271e4a9b --- /dev/null +++ b/config/grafana/provisioning/dashboards/brightstaff.yaml @@ -0,0 +1,15 @@ +# Auto-load the brightstaff dashboard JSON on Grafana startup. + +apiVersion: 1 + +providers: + - name: brightstaff + orgId: 1 + folder: Plano + type: file + disableDeletion: false + updateIntervalSeconds: 30 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: false diff --git a/config/grafana/provisioning/datasources/prometheus.yaml b/config/grafana/provisioning/datasources/prometheus.yaml new file mode 100644 index 00000000..2e3170ec --- /dev/null +++ b/config/grafana/provisioning/datasources/prometheus.yaml @@ -0,0 +1,14 @@ +# Auto-provision the Prometheus datasource so the bundled dashboard wires up +# without any clicks. The `uid: DS_PROMETHEUS` matches the templated input in +# brightstaff_dashboard.json. + +apiVersion: 1 + +datasources: + - name: Prometheus + uid: DS_PROMETHEUS + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: true diff --git a/config/plano_config_schema.yaml b/config/plano_config_schema.yaml index d3d6a643..9560b437 100644 --- a/config/plano_config_schema.yaml +++ b/config/plano_config_schema.yaml @@ -190,8 +190,18 @@ properties: - openai - xiaomi - gemini + - chatgpt + - digitalocean + - vercel + - openrouter + headers: + type: object + additionalProperties: + type: string + description: "Additional headers to send with upstream requests (e.g., ChatGPT-Account-Id, originator)." routing_preferences: type: array + description: "[DEPRECATED] Inline routing_preferences under a model_provider are auto-migrated to the top-level routing_preferences list by the config generator. New configs should declare routing_preferences at the top level with an explicit models: [...] list. See docs/routing-api.md." items: type: object properties: @@ -238,8 +248,18 @@ properties: - openai - xiaomi - gemini + - chatgpt + - digitalocean + - vercel + - openrouter + headers: + type: object + additionalProperties: + type: string + description: "Additional headers to send with upstream requests (e.g., ChatGPT-Account-Id, originator)." routing_preferences: type: array + description: "[DEPRECATED] Inline routing_preferences under an llm_provider are auto-migrated to the top-level routing_preferences list by the config generator. New configs should declare routing_preferences at the top level with an explicit models: [...] list. See docs/routing-api.md." items: type: object properties: @@ -276,6 +296,9 @@ properties: type: boolean use_agent_orchestrator: type: boolean + disable_signals: + type: boolean + description: "Disable agentic signal analysis (frustration, repetition, escalation, etc.) on LLM responses to save CPU. Default false." upstream_connect_timeout: type: string description: "Connect timeout for upstream provider clusters (e.g., '5s', '10s'). Default is '5s'." 
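Taken together, the new schema fields compose like this in a plano_config.yaml (a minimal sketch: the model id, header values, and the name/description fields on the routing preference are illustrative placeholders; only `headers`, top-level `routing_preferences` with an explicit `models` list, and `overrides.disable_signals` come from the schema above):

    llm_providers:
      - model: openai/gpt-4o              # illustrative model id
        headers:                          # extra headers sent upstream
          ChatGPT-Account-Id: acct-123    # illustrative value
          originator: plano
    routing_preferences:                  # declared at the top level, not inline
      - name: code-generation             # illustrative route
        description: generating new code from scratch
        models:
          - openai/gpt-4o
    overrides:
      disable_signals: true               # skip signal analysis to save CPU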
diff --git a/crates/Cargo.lock b/crates/Cargo.lock index e07b47ee..39261d67 100644 --- a/crates/Cargo.lock +++ b/crates/Cargo.lock @@ -23,6 +23,18 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217" +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.4" @@ -257,6 +269,24 @@ dependencies = [ "vsimd", ] +[[package]] +name = "bindgen" +version = "0.72.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "itertools 0.13.0", + "proc-macro2", + "quote", + "regex", + "rustc-hash 2.1.2", + "shlex", + "syn 2.0.117", +] + [[package]] name = "bit-set" version = "0.5.3" @@ -316,6 +346,9 @@ dependencies = [ "hyper 1.9.0", "hyper-util", "lru", + "metrics 0.23.1", + "metrics-exporter-prometheus", + "metrics-process", "mockito", "opentelemetry", "opentelemetry-http", @@ -325,6 +358,7 @@ dependencies = [ "pretty_assertions", "rand 0.9.4", "redis", + "regex", "reqwest", "serde", "serde_json", @@ -332,6 +366,8 @@ dependencies = [ "serde_yaml", "strsim", "thiserror 2.0.18", + "tikv-jemalloc-ctl", + "tikv-jemallocator", "time", "tokio", "tokio-postgres", @@ -391,6 +427,15 @@ dependencies = [ "shlex", ] +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + [[package]] name = "cfg-if" version = "1.0.4" @@ -428,6 +473,17 @@ dependencies = [ "windows-link", ] +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "cmov" version = "0.5.3" @@ -574,6 +630,21 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + [[package]] name = "crypto-common" version = "0.1.7" @@ -1070,6 +1141,12 @@ dependencies = [ "wasip3", ] +[[package]] +name = "glob" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" + [[package]] name = "governor" version = "0.6.3" @@ -1128,7 +1205,7 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e91b62f79061a0bc2e046024cb7ba44b08419ed238ecbd9adbd787434b9e8c25" dependencies = [ - "ahash", + "ahash 0.3.8", "autocfg", ] @@ -1138,6 +1215,15 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" 
+[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash 0.8.12", +] + [[package]] name = "hashbrown" version = "0.15.5" @@ -1189,6 +1275,12 @@ dependencies = [ "uuid", ] +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + [[package]] name = "hex" version = "0.4.3" @@ -1665,6 +1757,27 @@ version = "0.2.185" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "52ff2c0fe9bc6cb6b14a0592c2ff4fa9ceb83eea9db979b0487cd054946a2b8f" +[[package]] +name = "libloading" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if", + "windows-link", +] + +[[package]] +name = "libproc" +version = "0.14.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a54ad7278b8bc5301d5ffd2a94251c004feb971feba96c971ea4063645990757" +dependencies = [ + "bindgen", + "errno", + "libc", +] + [[package]] name = "libredox" version = "0.1.16" @@ -1745,6 +1858,12 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" +[[package]] +name = "mach2" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dae608c151f68243f2b000364e1f7b186d9c29845f7d2d85bd31b9ad77ad552b" + [[package]] name = "matchers" version = "0.2.0" @@ -1782,6 +1901,77 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "metrics" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3045b4193fbdc5b5681f32f11070da9be3609f189a79f3390706d42587f46bb5" +dependencies = [ + "ahash 0.8.12", + "portable-atomic", +] + +[[package]] +name = "metrics" +version = "0.24.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d5312e9ba3771cfa961b585728215e3d972c950a3eed9252aa093d6301277e8" +dependencies = [ + "ahash 0.8.12", + "portable-atomic", +] + +[[package]] +name = "metrics-exporter-prometheus" +version = "0.15.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4f0c8427b39666bf970460908b213ec09b3b350f20c0c2eabcbba51704a08e6" +dependencies = [ + "base64 0.22.1", + "http-body-util", + "hyper 1.9.0", + "hyper-util", + "indexmap 2.14.0", + "ipnet", + "metrics 0.23.1", + "metrics-util", + "quanta", + "thiserror 1.0.69", + "tokio", + "tracing", +] + +[[package]] +name = "metrics-process" +version = "2.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4268d87f64a752f5a651314fc683f04da10be65701ea3e721ba4d74f79163cac" +dependencies = [ + "libc", + "libproc", + "mach2", + "metrics 0.24.3", + "once_cell", + "procfs", + "rlimit", + "windows", +] + +[[package]] +name = "metrics-util" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4259040465c955f9f2f1a4a8a16dc46726169bca0f88e8fb2dbeced487c3e828" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", + "hashbrown 0.14.5", + "metrics 0.23.1", + "num_cpus", + "quanta", + 
"sketches-ddsketch", +] + [[package]] name = "mime" version = "0.3.17" @@ -1935,6 +2125,16 @@ dependencies = [ "autocfg", ] +[[package]] +name = "num_cpus" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" +dependencies = [ + "hermit-abi", + "libc", +] + [[package]] name = "objc2-core-foundation" version = "0.3.2" @@ -2125,6 +2325,12 @@ dependencies = [ "windows-link", ] +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + [[package]] name = "percent-encoding" version = "2.3.2" @@ -2278,6 +2484,27 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "procfs" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25485360a54d6861439d60facef26de713b1e126bf015ec8f98239467a2b82f7" +dependencies = [ + "bitflags", + "procfs-core", + "rustix", +] + +[[package]] +name = "procfs-core" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6401bf7b6af22f78b563665d15a22e9aef27775b79b149a66ca022468a4e405" +dependencies = [ + "bitflags", + "hex", +] + [[package]] name = "prompt_gateway" version = "0.1.0" @@ -2333,6 +2560,21 @@ dependencies = [ "log", ] +[[package]] +name = "quanta" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3ab5a9d756f0d97bdc89019bd2e4ea098cf9cde50ee7564dde6b81ccc8f06c7" +dependencies = [ + "crossbeam-utils", + "libc", + "once_cell", + "raw-cpuid", + "wasi 0.11.1+wasi-snapshot-preview1", + "web-sys", + "winapi", +] + [[package]] name = "quinn" version = "0.11.9" @@ -2485,6 +2727,15 @@ version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" +[[package]] +name = "raw-cpuid" +version = "11.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186" +dependencies = [ + "bitflags", +] + [[package]] name = "redis" version = "0.27.6" @@ -2646,6 +2897,15 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rlimit" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f35ee2729c56bb610f6dba436bf78135f728b7373bdffae2ec815b2d3eb98cc3" +dependencies = [ + "libc", +] + [[package]] name = "rustc-hash" version = "1.1.0" @@ -3098,6 +3358,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" +[[package]] +name = "sketches-ddsketch" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85636c14b73d81f541e525f585c0a2109e6744e1565b5c1668e31c70c10ed65c" + [[package]] name = "slab" version = "0.4.12" @@ -3308,6 +3574,37 @@ dependencies = [ "rustc-hash 1.1.0", ] +[[package]] +name = "tikv-jemalloc-ctl" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "661f1f6a57b3a36dc9174a2c10f19513b4866816e13425d3e418b11cc37bc24c" +dependencies = [ + "libc", + "paste", + "tikv-jemalloc-sys", +] + +[[package]] +name = "tikv-jemalloc-sys" +version = "0.6.1+5.3.0-1-ge13ca993e8ccb9ba9847cc330696e02839f328f7" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "cd8aa5b2ab86a2cefa406d889139c162cbb230092f7d1d7cbc1716405d852a3b" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "tikv-jemallocator" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0359b4327f954e0567e69fb191cf1436617748813819c94b8cd4a431422d053a" +dependencies = [ + "libc", + "tikv-jemalloc-sys", +] + [[package]] name = "time" version = "0.3.47" @@ -4003,6 +4300,49 @@ dependencies = [ "web-sys", ] +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "527fadee13e0c05939a6a05d5bd6eec6cd2e3dbd648b9f8e447c6518133d8580" +dependencies = [ + "windows-collections", + "windows-core", + "windows-future", + "windows-numerics", +] + +[[package]] +name = "windows-collections" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23b2d95af1a8a14a3c7367e1ed4fc9c20e0a26e79551b1454d72583c97cc6610" +dependencies = [ + "windows-core", +] + [[package]] name = "windows-core" version = "0.62.2" @@ -4016,6 +4356,17 @@ dependencies = [ "windows-strings", ] +[[package]] +name = "windows-future" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1d6f90251fe18a279739e78025bd6ddc52a7e22f921070ccdc67dde84c605cb" +dependencies = [ + "windows-core", + "windows-link", + "windows-threading", +] + [[package]] name = "windows-implement" version = "0.60.2" @@ -4044,6 +4395,16 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-numerics" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e2e40844ac143cdb44aead537bbf727de9b044e107a0f1220392177d15b0f26" +dependencies = [ + "windows-core", + "windows-link", +] + [[package]] name = "windows-registry" version = "0.6.1" @@ -4133,6 +4494,15 @@ dependencies = [ "windows_x86_64_msvc 0.53.1", ] +[[package]] +name = "windows-threading" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3949bd5b99cafdf1c7ca86b43ca564028dfe27d66958f2470940f73d86d75b37" +dependencies = [ + "windows-link", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" diff --git a/crates/brightstaff/Cargo.toml b/crates/brightstaff/Cargo.toml index f88ed918..d2635963 100644 --- a/crates/brightstaff/Cargo.toml +++ b/crates/brightstaff/Cargo.toml @@ -3,6 +3,18 @@ name = "brightstaff" version = "0.1.0" edition = "2021" +[features] +default = ["jemalloc"] +jemalloc = ["tikv-jemallocator", "tikv-jemalloc-ctl"] + +[[bin]] +name = "brightstaff" +path = "src/main.rs" + +[[bin]] +name = "signals_replay" +path = 
"src/bin/signals_replay.rs" + [dependencies] async-openai = "0.30.1" async-trait = "0.1" @@ -26,7 +38,11 @@ opentelemetry-stdout = "0.31" opentelemetry_sdk = { version = "0.31", features = ["rt-tokio"] } pretty_assertions = "1.4.1" rand = "0.9.2" +regex = "1.10" lru = "0.12" +metrics = "0.23" +metrics-exporter-prometheus = { version = "0.15", default-features = false, features = ["http-listener"] } +metrics-process = "2.1" redis = { version = "0.27", features = ["tokio-comp"] } reqwest = { version = "0.12.15", features = ["stream"] } serde = { version = "1.0.219", features = ["derive"] } @@ -35,6 +51,8 @@ serde_with = "3.13.0" strsim = "0.11" serde_yaml = "0.9.34" thiserror = "2.0.12" +tikv-jemallocator = { version = "0.6", optional = true } +tikv-jemalloc-ctl = { version = "0.6", features = ["stats"], optional = true } tokio = { version = "1.44.2", features = ["full"] } tokio-postgres = { version = "0.7", features = ["with-serde_json-1"] } tokio-stream = "0.1" diff --git a/crates/brightstaff/src/app_state.rs b/crates/brightstaff/src/app_state.rs index e585d2db..1d534e89 100644 --- a/crates/brightstaff/src/app_state.rs +++ b/crates/brightstaff/src/app_state.rs @@ -24,4 +24,7 @@ pub struct AppState { /// Shared HTTP client for upstream LLM requests (connection pooling / keep-alive). pub http_client: reqwest::Client, pub filter_pipeline: Arc, + /// When false, agentic signal analysis is skipped on LLM responses to save CPU. + /// Controlled by `overrides.disable_signals` in plano config. + pub signals_enabled: bool, } diff --git a/crates/brightstaff/src/bin/signals_replay.rs b/crates/brightstaff/src/bin/signals_replay.rs new file mode 100644 index 00000000..41879ac1 --- /dev/null +++ b/crates/brightstaff/src/bin/signals_replay.rs @@ -0,0 +1,175 @@ +//! `signals-replay` — batch driver for the `brightstaff` signal analyzer. +//! +//! Reads JSONL conversations from stdin (one per line) and emits matching +//! JSONL reports on stdout, one per input conversation, in the same order. +//! +//! Input shape (per line): +//! ```json +//! {"id": "convo-42", "messages": [{"from": "human", "value": "..."}, ...]} +//! ``` +//! +//! Output shape (per line, success): +//! ```json +//! {"id": "convo-42", "report": { ...python-compatible SignalReport dict... }} +//! ``` +//! +//! On per-line failure (parse / analyzer error), emits: +//! ```json +//! {"id": "convo-42", "error": "..."} +//! ``` +//! +//! The output report dict is shaped to match the Python reference's +//! `SignalReport.to_dict()` byte-for-byte so the parity comparator can do a +//! direct structural diff. + +use std::io::{self, BufRead, BufWriter, Write}; + +use serde::Deserialize; +use serde_json::{json, Map, Value}; + +use brightstaff::signals::{SignalAnalyzer, SignalGroup, SignalReport}; + +#[derive(Debug, Deserialize)] +struct InputLine { + id: Value, + messages: Vec, +} + +#[derive(Debug, Deserialize)] +struct MessageRow { + #[serde(default)] + from: String, + #[serde(default)] + value: String, +} + +fn main() { + let stdin = io::stdin(); + let stdout = io::stdout(); + let mut out = BufWriter::new(stdout.lock()); + let analyzer = SignalAnalyzer::default(); + + for line in stdin.lock().lines() { + let line = match line { + Ok(l) => l, + Err(e) => { + eprintln!("read error: {e}"); + std::process::exit(1); + } + }; + let trimmed = line.trim(); + if trimmed.is_empty() { + continue; + } + let result = process_line(&analyzer, trimmed); + // Always emit one line per input line so id ordering stays aligned. 
+        if let Err(e) = writeln!(out, "{result}") {
+            eprintln!("write error: {e}");
+            std::process::exit(1);
+        }
+        // Flushing periodically isn't strictly needed — BufWriter handles it,
+        // and the parent process reads the whole stream when we're done.
+    }
+    let _ = out.flush();
+}
+
+fn process_line(analyzer: &SignalAnalyzer, line: &str) -> Value {
+    let parsed: InputLine = match serde_json::from_str(line) {
+        Ok(p) => p,
+        Err(e) => {
+            return json!({
+                "id": Value::Null,
+                "error": format!("input parse: {e}"),
+            });
+        }
+    };
+
+    let id = parsed.id.clone();
+
+    let view: Vec<brightstaff::signals::analyzer::ShareGptMessage<'_>> = parsed
+        .messages
+        .iter()
+        .map(|m| brightstaff::signals::analyzer::ShareGptMessage {
+            from: m.from.as_str(),
+            value: m.value.as_str(),
+        })
+        .collect();
+
+    let report = analyzer.analyze_sharegpt(&view);
+    let report_dict = report_to_python_dict(&report);
+    json!({
+        "id": id,
+        "report": report_dict,
+    })
+}
+
+/// Convert a `SignalReport` into the Python reference's `to_dict()` shape.
+///
+/// Ordering of category keys in each layer dict follows the Python source
+/// exactly so even string-equality comparisons behave deterministically.
+fn report_to_python_dict(r: &SignalReport) -> Value {
+    let mut interaction = Map::new();
+    interaction.insert(
+        "misalignment".to_string(),
+        signal_group_to_python(&r.interaction.misalignment),
+    );
+    interaction.insert(
+        "stagnation".to_string(),
+        signal_group_to_python(&r.interaction.stagnation),
+    );
+    interaction.insert(
+        "disengagement".to_string(),
+        signal_group_to_python(&r.interaction.disengagement),
+    );
+    interaction.insert(
+        "satisfaction".to_string(),
+        signal_group_to_python(&r.interaction.satisfaction),
+    );
+
+    let mut execution = Map::new();
+    execution.insert(
+        "failure".to_string(),
+        signal_group_to_python(&r.execution.failure),
+    );
+    execution.insert(
+        "loops".to_string(),
+        signal_group_to_python(&r.execution.loops),
+    );
+
+    let mut environment = Map::new();
+    environment.insert(
+        "exhaustion".to_string(),
+        signal_group_to_python(&r.environment.exhaustion),
+    );
+
+    json!({
+        "interaction_signals": Value::Object(interaction),
+        "execution_signals": Value::Object(execution),
+        "environment_signals": Value::Object(environment),
+        "overall_quality": r.overall_quality.as_str(),
+        "summary": r.summary,
+    })
+}
+
+fn signal_group_to_python(g: &SignalGroup) -> Value {
+    let signals: Vec<Value> = g
+        .signals
+        .iter()
+        .map(|s| {
+            json!({
+                "signal_type": s.signal_type.as_str(),
+                "message_index": s.message_index,
+                "snippet": s.snippet,
+                "confidence": s.confidence,
+                "metadata": s.metadata,
+            })
+        })
+        .collect();
+
+    json!({
+        "category": g.category,
+        "count": g.count,
+        "severity": g.severity,
+        "signals": signals,
+    })
+}
diff --git a/crates/brightstaff/src/handlers/debug.rs b/crates/brightstaff/src/handlers/debug.rs
new file mode 100644
index 00000000..58fbecd2
--- /dev/null
+++ b/crates/brightstaff/src/handlers/debug.rs
@@ -0,0 +1,53 @@
+use bytes::Bytes;
+use http_body_util::combinators::BoxBody;
+use hyper::{Response, StatusCode};
+
+use super::full;
+
+#[derive(serde::Serialize)]
+struct MemStats {
+    allocated_bytes: usize,
+    resident_bytes: usize,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    error: Option<String>,
+}
+
+/// Returns jemalloc memory statistics as JSON.
+/// Falls back to a stub when the jemalloc feature is disabled.
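+///
+/// Illustrative response shape (numbers invented for this example):
+/// `{"allocated_bytes": 52428800, "resident_bytes": 78643200}`; with the
+/// jemalloc feature disabled, both counts read 0 and an `error` field
+/// explains why.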
+pub async fn memstats() -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
+    let stats = get_jemalloc_stats();
+    let json = serde_json::to_string(&stats).unwrap();
+    Ok(Response::builder()
+        .status(StatusCode::OK)
+        .header("Content-Type", "application/json")
+        .body(full(json))
+        .unwrap())
+}
+
+#[cfg(feature = "jemalloc")]
+fn get_jemalloc_stats() -> MemStats {
+    use tikv_jemalloc_ctl::{epoch, stats};
+
+    if let Err(e) = epoch::advance() {
+        return MemStats {
+            allocated_bytes: 0,
+            resident_bytes: 0,
+            error: Some(format!("failed to advance jemalloc epoch: {e}")),
+        };
+    }
+
+    MemStats {
+        allocated_bytes: stats::allocated::read().unwrap_or(0),
+        resident_bytes: stats::resident::read().unwrap_or(0),
+        error: None,
+    }
+}
+
+#[cfg(not(feature = "jemalloc"))]
+fn get_jemalloc_stats() -> MemStats {
+    MemStats {
+        allocated_bytes: 0,
+        resident_bytes: 0,
+        error: Some("jemalloc feature not enabled".to_string()),
+    }
+}
diff --git a/crates/brightstaff/src/handlers/function_calling.rs b/crates/brightstaff/src/handlers/function_calling.rs
index ca4def32..3e2543bc 100644
--- a/crates/brightstaff/src/handlers/function_calling.rs
+++ b/crates/brightstaff/src/handlers/function_calling.rs
@@ -441,10 +441,8 @@
             }
         }
         // Handle str/string conversions
-        "str" | "string" => {
-            if !value.is_string() {
-                return Ok(json!(value.to_string()));
-            }
+        "str" | "string" if !value.is_string() => {
+            return Ok(json!(value.to_string()));
         }
         _ => {}
     }
diff --git a/crates/brightstaff/src/handlers/llm/mod.rs b/crates/brightstaff/src/handlers/llm/mod.rs
index 8f00e4b6..3336209f 100644
--- a/crates/brightstaff/src/handlers/llm/mod.rs
+++ b/crates/brightstaff/src/handlers/llm/mod.rs
@@ -24,16 +24,18 @@
 use crate::app_state::AppState;
 use crate::handlers::agents::pipeline::PipelineProcessor;
 use crate::handlers::extract_request_id;
 use crate::handlers::full;
+use crate::metrics as bs_metrics;
 use crate::state::response_state_processor::ResponsesStateProcessor;
 use crate::state::{
     extract_input_items, retrieve_and_combine_input, StateStorage, StateStorageError,
 };
 use crate::streaming::{
     create_streaming_response, create_streaming_response_with_output_filter, truncate_message,
-    ObservableStreamProcessor, StreamProcessor,
+    LlmMetricsCtx, ObservableStreamProcessor, StreamProcessor,
 };
 use crate::tracing::{
-    collect_custom_trace_attributes, llm as tracing_llm, operation_component, set_service_name,
+    collect_custom_trace_attributes, llm as tracing_llm, operation_component,
+    plano as tracing_plano, set_service_name,
 };
 use model_selection::router_chat_get_upstream_model;
@@ -102,15 +104,36 @@
         .and_then(|hdr| request_headers.get(hdr))
         .and_then(|v| v.to_str().ok())
         .map(|s| s.to_string());
-    let pinned_model: Option<String> = if let Some(ref sid) = session_id {
+    let cached_route = if let Some(ref sid) = session_id {
         state
             .orchestrator_service
             .get_cached_route(sid, tenant_id.as_deref())
             .await
-            .map(|c| c.model_name)
     } else {
         None
     };
+    let (pinned_model, pinned_route_name): (Option<String>, Option<String>) = match cached_route {
+        Some(c) => (Some(c.model_name), c.route_name),
+        None => (None, None),
+    };
+
+    // Record session id on the LLM span for the observability console.
+ if let Some(ref sid) = session_id { + get_active_span(|span| { + span.set_attribute(opentelemetry::KeyValue::new( + tracing_plano::SESSION_ID, + sid.clone(), + )); + }); + } + if let Some(ref route_name) = pinned_route_name { + get_active_span(|span| { + span.set_attribute(opentelemetry::KeyValue::new( + tracing_plano::ROUTE_NAME, + route_name.clone(), + )); + }); + } let full_qualified_llm_provider_url = format!("{}{}", state.llm_provider_url, request_path); @@ -120,6 +143,7 @@ async fn llm_chat_inner( &request_path, &state.model_aliases, &state.llm_providers, + state.signals_enabled, ) .await { @@ -231,7 +255,15 @@ async fn llm_chat_inner( if let Some(ref client_api_kind) = client_api { let upstream_api = provider_id.compatible_api_for_client(client_api_kind, is_streaming_request); - client_request.normalize_for_upstream(provider_id, &upstream_api); + if let Err(e) = client_request.normalize_for_upstream(provider_id, &upstream_api) { + warn!( + "request_id={}: normalize_for_upstream failed: {}", + request_id, e + ); + let mut bad_request = Response::new(full(e.message)); + *bad_request.status_mut() = StatusCode::BAD_REQUEST; + return Ok(bad_request); + } } // --- Phase 2: Resolve conversation state (v1/responses API) --- @@ -311,6 +343,18 @@ async fn llm_chat_inner( alias_resolved_model.clone() }; + // Record route name on the LLM span (only when the orchestrator produced one). + if let Some(ref rn) = route_name { + if !rn.is_empty() && rn != "none" { + get_active_span(|span| { + span.set_attribute(opentelemetry::KeyValue::new( + tracing_plano::ROUTE_NAME, + rn.clone(), + )); + }); + } + } + if let Some(ref sid) = session_id { state .orchestrator_service @@ -373,6 +417,7 @@ async fn parse_and_validate_request( request_path: &str, model_aliases: &Option>, llm_providers: &Arc>, + signals_enabled: bool, ) -> Result>> { let raw_bytes = request .collect() @@ -451,7 +496,11 @@ async fn parse_and_validate_request( let user_message_preview = client_request .get_recent_user_message() .map(|msg| truncate_message(&msg, 50)); - let messages_for_signals = Some(client_request.get_messages()); + let messages_for_signals = if signals_enabled { + Some(client_request.get_messages()) + } else { + None + }; // Set the upstream model name and strip routing metadata client_request.set_model(model_name_only.clone()); @@ -652,6 +701,13 @@ async fn send_upstream( let request_start_time = std::time::Instant::now(); + // Labels for LLM upstream metrics. We prefer `resolved_model` (post-routing) + // and derive the provider from its `provider/model` prefix. This matches the + // same model id the cost/latency router keys off. 
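+    // For example, a resolved model of "openai/gpt-4o" yields
+    // provider="openai" / model="gpt-4o", while a bare "gpt-4o" falls
+    // back to provider="unknown".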
+    let (metric_provider_raw, metric_model_raw) = bs_metrics::split_provider_model(resolved_model);
+    let metric_provider = metric_provider_raw.to_string();
+    let metric_model = metric_model_raw.to_string();
+
     let llm_response = match http_client
         .post(upstream_url)
         .headers(request_headers.clone())
@@ -661,6 +717,14 @@
     {
         Ok(res) => res,
         Err(err) => {
+            let err_class = bs_metrics::llm_error_class_from_reqwest(&err);
+            bs_metrics::record_llm_upstream(
+                &metric_provider,
+                &metric_model,
+                0,
+                err_class,
+                request_start_time.elapsed(),
+            );
             let err_msg = format!("Failed to send request: {}", err);
             let mut internal_error = Response::new(full(err_msg));
             *internal_error.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
@@ -671,6 +735,36 @@
     // Propagate upstream headers and status
     let response_headers = llm_response.headers().clone();
     let upstream_status = llm_response.status();
+
+    // Upstream routers (e.g. DigitalOcean Gradient) may return an
+    // `x-model-router-selected-route` header indicating which task-level
+    // route the request was classified into (e.g. "Code Generation"). Surface
+    // it as `plano.route.name` so the obs console's Route hit % panel can
+    // show the breakdown even when Plano's own orchestrator wasn't in the
+    // routing path. Note that the attribute is set unconditionally here: when
+    // the header is present, its value overwrites any route name Plano's own
+    // orchestrator recorded on the span earlier.
+    if let Some(upstream_route) = response_headers
+        .get("x-model-router-selected-route")
+        .and_then(|v| v.to_str().ok())
+    {
+        if !upstream_route.is_empty() {
+            get_active_span(|span| {
+                span.set_attribute(opentelemetry::KeyValue::new(
+                    crate::tracing::plano::ROUTE_NAME,
+                    upstream_route.to_string(),
+                ));
+            });
+        }
+    }
+    // Record the upstream HTTP status on the span for the obs console.
+    get_active_span(|span| {
+        span.set_attribute(opentelemetry::KeyValue::new(
+            crate::tracing::http::STATUS_CODE,
+            upstream_status.as_u16() as i64,
+        ));
+    });
+
     let mut response = Response::builder().status(upstream_status);
     if let Some(headers) = response.headers_mut() {
         for (name, value) in response_headers.iter() {
@@ -686,7 +780,12 @@
         span_name,
         request_start_time,
         messages_for_signals,
-    );
+    )
+    .with_llm_metrics(LlmMetricsCtx {
+        provider: metric_provider.clone(),
+        model: metric_model.clone(),
+        upstream_status: upstream_status.as_u16(),
+    });
 
     let output_filter_request_headers = if filter_pipeline.has_output_filters() {
         Some(request_headers.clone())
diff --git a/crates/brightstaff/src/handlers/llm/model_selection.rs b/crates/brightstaff/src/handlers/llm/model_selection.rs
index 1b4315e7..a1378d86 100644
--- a/crates/brightstaff/src/handlers/llm/model_selection.rs
+++ b/crates/brightstaff/src/handlers/llm/model_selection.rs
@@ -5,10 +5,24 @@
 use hyper::StatusCode;
 use std::sync::Arc;
 use tracing::{debug, info, warn};
 
+use crate::metrics as bs_metrics;
+use crate::metrics::labels as metric_labels;
 use crate::router::orchestrator::OrchestratorService;
 use crate::streaming::truncate_message;
 use crate::tracing::routing;
 
+/// Classify the incoming request path into the fixed `route` label used on
+/// routing metrics, keyed off its `/agents` or `/routing` prefix; anything
+/// without either prefix counts as plain LLM traffic.
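+///
+/// For example, "/agents/v1/chat/completions" maps to "agent",
+/// "/routing/v1/chat/completions" to "routing", and a bare
+/// "/v1/chat/completions" to "llm".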
+fn route_label_for_path(request_path: &str) -> &'static str { + if request_path.starts_with("/agents") { + metric_labels::ROUTE_AGENT + } else if request_path.starts_with("/routing") { + metric_labels::ROUTE_ROUTING + } else { + metric_labels::ROUTE_LLM + } +} + pub struct RoutingResult { /// Primary model to use (first in the ranked list). pub model_name: String, @@ -106,15 +120,23 @@ pub async fn router_chat_get_upstream_model( ) .await; - let determination_ms = routing_start_time.elapsed().as_millis() as i64; + let determination_elapsed = routing_start_time.elapsed(); + let determination_ms = determination_elapsed.as_millis() as i64; let current_span = tracing::Span::current(); current_span.record(routing::ROUTE_DETERMINATION_MS, determination_ms); + let route_label = route_label_for_path(request_path); match routing_result { Ok(route) => match route { Some((route_name, ranked_models)) => { let model_name = ranked_models.first().cloned().unwrap_or_default(); current_span.record("route.selected_model", model_name.as_str()); + bs_metrics::record_router_decision( + route_label, + &model_name, + false, + determination_elapsed, + ); Ok(RoutingResult { model_name, models: ranked_models, @@ -126,6 +148,12 @@ pub async fn router_chat_get_upstream_model( // This signals to llm.rs to use the original validated request model current_span.record("route.selected_model", "none"); info!("no route determined, using default model"); + bs_metrics::record_router_decision( + route_label, + "none", + true, + determination_elapsed, + ); Ok(RoutingResult { model_name: "none".to_string(), @@ -136,6 +164,7 @@ pub async fn router_chat_get_upstream_model( }, Err(err) => { current_span.record("route.selected_model", "unknown"); + bs_metrics::record_router_decision(route_label, "unknown", true, determination_elapsed); Err(RoutingError::internal_error(format!( "Failed to determine route: {}", err diff --git a/crates/brightstaff/src/handlers/mod.rs b/crates/brightstaff/src/handlers/mod.rs index 485a0438..4e851264 100644 --- a/crates/brightstaff/src/handlers/mod.rs +++ b/crates/brightstaff/src/handlers/mod.rs @@ -1,4 +1,5 @@ pub mod agents; +pub mod debug; pub mod function_calling; pub mod llm; pub mod models; diff --git a/crates/brightstaff/src/handlers/routing_service.rs b/crates/brightstaff/src/handlers/routing_service.rs index 5fc0d3b9..b93b1422 100644 --- a/crates/brightstaff/src/handlers/routing_service.rs +++ b/crates/brightstaff/src/handlers/routing_service.rs @@ -12,6 +12,8 @@ use tracing::{debug, info, info_span, warn, Instrument}; use super::extract_or_generate_traceparent; use crate::handlers::llm::model_selection::router_chat_get_upstream_model; +use crate::metrics as bs_metrics; +use crate::metrics::labels as metric_labels; use crate::router::orchestrator::OrchestratorService; use crate::tracing::{collect_custom_trace_attributes, operation_component, set_service_name}; @@ -230,6 +232,17 @@ async fn routing_decision_inner( pinned: false, }; + // Distinguish "decision served" (a concrete model picked) from + // "no_candidates" (the sentinel "none" returned when nothing + // matched). The handler still responds 200 in both cases, so RED + // metrics alone can't tell them apart. 
+                let outcome = if response.models.first().map(|m| m == "none").unwrap_or(true) {
+                    metric_labels::ROUTING_SVC_NO_CANDIDATES
+                } else {
+                    metric_labels::ROUTING_SVC_DECISION_SERVED
+                };
+                bs_metrics::record_routing_service_outcome(outcome);
+
                 info!(
                     primary_model = %response.models.first().map(|s| s.as_str()).unwrap_or("none"),
                     total_models = response.models.len(),
@@ -249,6 +262,7 @@
                 .unwrap())
             }
             Err(err) => {
+                bs_metrics::record_routing_service_outcome(metric_labels::ROUTING_SVC_POLICY_ERROR);
                 warn!(error = %err.message, "routing decision failed");
                 Ok(BrightStaffError::InternalServerError(err.message).into_response())
             }
diff --git a/crates/brightstaff/src/lib.rs b/crates/brightstaff/src/lib.rs
index a0ba5f43..66c6eadf 100644
--- a/crates/brightstaff/src/lib.rs
+++ b/crates/brightstaff/src/lib.rs
@@ -1,5 +1,6 @@
 pub mod app_state;
 pub mod handlers;
+pub mod metrics;
 pub mod router;
 pub mod session_cache;
 pub mod signals;
diff --git a/crates/brightstaff/src/main.rs b/crates/brightstaff/src/main.rs
index 40ac429d..b1e17e42 100644
--- a/crates/brightstaff/src/main.rs
+++ b/crates/brightstaff/src/main.rs
@@ -1,10 +1,17 @@
+#[cfg(feature = "jemalloc")]
+#[global_allocator]
+static ALLOC: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
+
 use brightstaff::app_state::AppState;
 use brightstaff::handlers::agents::orchestrator::agent_chat;
+use brightstaff::handlers::debug;
 use brightstaff::handlers::empty;
 use brightstaff::handlers::function_calling::function_calling_chat_handler;
 use brightstaff::handlers::llm::llm_chat;
 use brightstaff::handlers::models::list_models;
 use brightstaff::handlers::routing_service::routing_decision;
+use brightstaff::metrics as bs_metrics;
+use brightstaff::metrics::labels as metric_labels;
 use brightstaff::router::model_metrics::ModelMetricsService;
 use brightstaff::router::orchestrator::OrchestratorService;
 use brightstaff::session_cache::init_session_cache;
@@ -326,6 +333,8 @@
         .as_ref()
         .and_then(|tracing| tracing.span_attributes.clone());
 
+    let signals_enabled = !overrides.disable_signals.unwrap_or(false);
+
     Ok(AppState {
         orchestrator_service,
         model_aliases: config.model_aliases.clone(),
@@ -337,6 +346,7 @@
         span_attributes,
         http_client: reqwest::Client::new(),
         filter_pipeline,
+        signals_enabled,
     })
 }
 
@@ -384,10 +394,79 @@
 // Request routing
 // ---------------------------------------------------------------------------
 
+/// Normalized method label — limited set so we never emit a free-form string.
+fn method_label(method: &Method) -> &'static str {
+    match *method {
+        Method::GET => "GET",
+        Method::POST => "POST",
+        Method::PUT => "PUT",
+        Method::DELETE => "DELETE",
+        Method::PATCH => "PATCH",
+        Method::HEAD => "HEAD",
+        Method::OPTIONS => "OPTIONS",
+        _ => "OTHER",
+    }
+}
+
+/// Compute the fixed `handler` metric label from the request's path+method.
+/// Requests that match no known route map to `HANDLER_NOT_FOUND`, mirroring
+/// the catch-all 404 branch in `dispatch()`.
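+///
+/// For example, POST /v1/chat/completions maps to "llm_chat" and
+/// POST /agents/v1/chat/completions to "agent_chat" (assuming the usual
+/// "/v1/chat/completions" value for `CHAT_COMPLETIONS_PATH`); anything
+/// unmatched maps to "not_found".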
+fn handler_label_for(method: &Method, path: &str) -> &'static str {
+    if let Some(stripped) = path.strip_prefix("/agents") {
+        if matches!(
+            stripped,
+            CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH
+        ) {
+            return metric_labels::HANDLER_AGENT_CHAT;
+        }
+    }
+    if let Some(stripped) = path.strip_prefix("/routing") {
+        if matches!(
+            stripped,
+            CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH
+        ) {
+            return metric_labels::HANDLER_ROUTING_DECISION;
+        }
+    }
+    match (method, path) {
+        (&Method::POST, CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH) => {
+            metric_labels::HANDLER_LLM_CHAT
+        }
+        (&Method::POST, "/function_calling") => metric_labels::HANDLER_FUNCTION_CALLING,
+        (&Method::GET, "/v1/models" | "/agents/v1/models") => metric_labels::HANDLER_LIST_MODELS,
+        (&Method::OPTIONS, "/v1/models" | "/agents/v1/models") => {
+            metric_labels::HANDLER_CORS_PREFLIGHT
+        }
+        _ => metric_labels::HANDLER_NOT_FOUND,
+    }
+}
+
 /// Route an incoming HTTP request to the appropriate handler.
 async fn route(
     req: Request<Incoming>,
     state: Arc<AppState>,
+) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
+    let handler = handler_label_for(req.method(), req.uri().path());
+    let method = method_label(req.method());
+    let started = std::time::Instant::now();
+    let _in_flight = bs_metrics::InFlightGuard::new(handler);
+
+    let result = dispatch(req, state).await;
+
+    let status = match &result {
+        Ok(resp) => resp.status().as_u16(),
+        // hyper::Error here means the body couldn't be produced; conventionally 500.
+        Err(_) => 500,
+    };
+    bs_metrics::record_http(handler, method, status, started);
+    result
+}
+
+/// Inner dispatcher split out so `route()` can wrap it with metrics without
+/// duplicating the match tree.
+async fn dispatch(
+    req: Request<Incoming>,
+    state: Arc<AppState>,
 ) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
     let parent_cx = global::get_text_map_propagator(|p| p.extract(&HeaderExtractor(req.headers())));
     let path = req.uri().path().to_string();
@@ -439,6 +518,7 @@
             Ok(list_models(Arc::clone(&state.llm_providers)).await)
         }
         (&Method::OPTIONS, "/v1/models" | "/agents/v1/models") => cors_preflight(),
+        (&Method::GET, "/debug/memstats") => debug::memstats().await,
         _ => {
             debug!(method = %req.method(), path = %path, "no route found");
             let mut not_found = Response::new(empty());
@@ -503,6 +583,7 @@ async fn run_server(state: Arc<AppState>) -> Result<(), Box Result<(), Box> {
     let config = load_config()?;
     let _tracer_provider = init_tracer(config.tracing.as_ref());
+    bs_metrics::init();
     info!("loaded plano_config.yaml");
     let state = Arc::new(init_app_state(&config).await?);
     run_server(state).await
diff --git a/crates/brightstaff/src/metrics/labels.rs b/crates/brightstaff/src/metrics/labels.rs
new file mode 100644
index 00000000..4eaf3e59
--- /dev/null
+++ b/crates/brightstaff/src/metrics/labels.rs
@@ -0,0 +1,38 @@
+//! Fixed label-value constants so callers never emit free-form strings
+//! (which would blow up cardinality).
+
+// Handler enum — derived from the path+method match in `route()`.
+pub const HANDLER_AGENT_CHAT: &str = "agent_chat";
+pub const HANDLER_ROUTING_DECISION: &str = "routing_decision";
+pub const HANDLER_LLM_CHAT: &str = "llm_chat";
+pub const HANDLER_FUNCTION_CALLING: &str = "function_calling";
+pub const HANDLER_LIST_MODELS: &str = "list_models";
+pub const HANDLER_CORS_PREFLIGHT: &str = "cors_preflight";
+pub const HANDLER_NOT_FOUND: &str = "not_found";
+
+// Router "route" class — which brightstaff endpoint prompted the decision.
+pub const ROUTE_AGENT: &str = "agent"; +pub const ROUTE_ROUTING: &str = "routing"; +pub const ROUTE_LLM: &str = "llm"; + +// Token kind for brightstaff_llm_tokens_total. +pub const TOKEN_KIND_PROMPT: &str = "prompt"; +pub const TOKEN_KIND_COMPLETION: &str = "completion"; + +// LLM error_class values (match docstring in metrics/mod.rs). +pub const LLM_ERR_NONE: &str = "none"; +pub const LLM_ERR_TIMEOUT: &str = "timeout"; +pub const LLM_ERR_CONNECT: &str = "connect"; +pub const LLM_ERR_PARSE: &str = "parse"; +pub const LLM_ERR_OTHER: &str = "other"; +pub const LLM_ERR_STREAM: &str = "stream"; + +// Routing service outcome values. +pub const ROUTING_SVC_DECISION_SERVED: &str = "decision_served"; +pub const ROUTING_SVC_NO_CANDIDATES: &str = "no_candidates"; +pub const ROUTING_SVC_POLICY_ERROR: &str = "policy_error"; + +// Session cache outcome values. +pub const SESSION_CACHE_HIT: &str = "hit"; +pub const SESSION_CACHE_MISS: &str = "miss"; +pub const SESSION_CACHE_STORE: &str = "store"; diff --git a/crates/brightstaff/src/metrics/mod.rs b/crates/brightstaff/src/metrics/mod.rs new file mode 100644 index 00000000..34679cca --- /dev/null +++ b/crates/brightstaff/src/metrics/mod.rs @@ -0,0 +1,377 @@ +//! Prometheus metrics for brightstaff. +//! +//! Installs the `metrics` global recorder backed by +//! `metrics-exporter-prometheus` and exposes a `/metrics` HTTP endpoint on a +//! dedicated admin port (default `0.0.0.0:9092`, overridable via +//! `METRICS_BIND_ADDRESS`). +//! +//! Emitted metric families (see `describe_all` for full list): +//! - HTTP RED: `brightstaff_http_requests_total`, +//! `brightstaff_http_request_duration_seconds`, +//! `brightstaff_http_in_flight_requests`. +//! - LLM upstream: `brightstaff_llm_upstream_requests_total`, +//! `brightstaff_llm_upstream_duration_seconds`, +//! `brightstaff_llm_time_to_first_token_seconds`, +//! `brightstaff_llm_tokens_total`, +//! `brightstaff_llm_tokens_usage_missing_total`. +//! - Routing: `brightstaff_router_decisions_total`, +//! `brightstaff_router_decision_duration_seconds`, +//! `brightstaff_routing_service_requests_total`, +//! `brightstaff_session_cache_events_total`. +//! - Process: via `metrics-process`. +//! - Build: `brightstaff_build_info`. + +use std::net::SocketAddr; +use std::sync::OnceLock; +use std::time::{Duration, Instant}; + +use metrics::{counter, describe_counter, describe_gauge, describe_histogram, gauge, histogram}; +use metrics_exporter_prometheus::{Matcher, PrometheusBuilder}; +use tracing::{info, warn}; + +pub mod labels; + +/// Guard flag so tests don't re-install the global recorder. +static INIT: OnceLock<()> = OnceLock::new(); + +const DEFAULT_METRICS_BIND: &str = "0.0.0.0:9092"; + +/// HTTP request duration buckets (seconds). Capped at 60s. +const HTTP_BUCKETS: &[f64] = &[ + 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, +]; + +/// LLM upstream / TTFT buckets (seconds). Capped at 120s because provider +/// completions routinely run that long. +const LLM_BUCKETS: &[f64] = &[0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 120.0]; + +/// Router decision buckets (seconds). The orchestrator call itself is usually +/// sub-second but bucketed generously in case of upstream slowness. +const ROUTER_BUCKETS: &[f64] = &[ + 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, +]; + +/// Install the global recorder and spawn the `/metrics` HTTP listener. 
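+/// The listener binds `METRICS_BIND_ADDRESS` (default `0.0.0.0:9092`), so a
+/// quick smoke test is, for example,
+/// `curl -s http://127.0.0.1:9092/metrics | grep brightstaff_`.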
+/// +/// Safe to call more than once; subsequent calls are no-ops so tests that +/// construct their own recorder still work. +pub fn init() { + if INIT.get().is_some() { + return; + } + + let bind: SocketAddr = std::env::var("METRICS_BIND_ADDRESS") + .unwrap_or_else(|_| DEFAULT_METRICS_BIND.to_string()) + .parse() + .unwrap_or_else(|err| { + warn!(error = %err, default = DEFAULT_METRICS_BIND, "invalid METRICS_BIND_ADDRESS, falling back to default"); + DEFAULT_METRICS_BIND.parse().expect("default bind parses") + }); + + let builder = PrometheusBuilder::new() + .with_http_listener(bind) + .set_buckets_for_metric( + Matcher::Full("brightstaff_http_request_duration_seconds".to_string()), + HTTP_BUCKETS, + ) + .and_then(|b| { + b.set_buckets_for_metric(Matcher::Prefix("brightstaff_llm_".to_string()), LLM_BUCKETS) + }) + .and_then(|b| { + b.set_buckets_for_metric( + Matcher::Full("brightstaff_router_decision_duration_seconds".to_string()), + ROUTER_BUCKETS, + ) + }); + + let builder = match builder { + Ok(b) => b, + Err(err) => { + warn!(error = %err, "failed to configure metrics buckets, using defaults"); + PrometheusBuilder::new().with_http_listener(bind) + } + }; + + if let Err(err) = builder.install() { + warn!(error = %err, "failed to install Prometheus recorder; metrics disabled"); + return; + } + + let _ = INIT.set(()); + + describe_all(); + emit_build_info(); + + // Register process-level collector (RSS, CPU, FDs). + let collector = metrics_process::Collector::default(); + collector.describe(); + // Prime once at startup; subsequent scrapes refresh via the exporter's + // per-scrape render, so we additionally refresh on a short interval to + // keep gauges moving between scrapes without requiring client pull. + collector.collect(); + tokio::spawn(async move { + let mut tick = tokio::time::interval(Duration::from_secs(10)); + tick.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + loop { + tick.tick().await; + collector.collect(); + } + }); + + info!(address = %bind, "metrics listener started"); +} + +fn describe_all() { + describe_counter!( + "brightstaff_http_requests_total", + "Total HTTP requests served by brightstaff, by handler and status class." + ); + describe_histogram!( + "brightstaff_http_request_duration_seconds", + "Wall-clock duration of HTTP requests served by brightstaff, by handler." + ); + describe_gauge!( + "brightstaff_http_in_flight_requests", + "Number of HTTP requests currently being served by brightstaff, by handler." + ); + + describe_counter!( + "brightstaff_llm_upstream_requests_total", + "LLM upstream request outcomes, by provider, model, status class and error class." + ); + describe_histogram!( + "brightstaff_llm_upstream_duration_seconds", + "Wall-clock duration of LLM upstream calls (stream close for streaming), by provider and model." + ); + describe_histogram!( + "brightstaff_llm_time_to_first_token_seconds", + "Time from request start to first streamed byte, by provider and model (streaming only)." + ); + describe_counter!( + "brightstaff_llm_tokens_total", + "Tokens reported in the provider `usage` field, by provider, model and kind (prompt/completion)." + ); + describe_counter!( + "brightstaff_llm_tokens_usage_missing_total", + "LLM responses that completed without a usable `usage` block (so token counts are unknown)." + ); + + describe_counter!( + "brightstaff_router_decisions_total", + "Routing decisions made by the orchestrator, by route, selected model, and whether a fallback was used." 
+    );
+    describe_histogram!(
+        "brightstaff_router_decision_duration_seconds",
+        "Time spent in the orchestrator deciding a route, by route."
+    );
+    describe_counter!(
+        "brightstaff_routing_service_requests_total",
+        "Outcomes of /routing/* decision requests: decision_served, no_candidates, policy_error."
+    );
+    describe_counter!(
+        "brightstaff_session_cache_events_total",
+        "Session affinity cache lookups and stores, by outcome."
+    );
+
+    describe_gauge!(
+        "brightstaff_build_info",
+        "Build metadata. Always 1; labels carry version and git SHA."
+    );
+}
+
+fn emit_build_info() {
+    let version = env!("CARGO_PKG_VERSION");
+    let git_sha = option_env!("GIT_SHA").unwrap_or("unknown");
+    gauge!(
+        "brightstaff_build_info",
+        "version" => version.to_string(),
+        "git_sha" => git_sha.to_string(),
+    )
+    .set(1.0);
+}
+
+/// Split a provider-qualified model id like `"openai/gpt-4o"` into
+/// `(provider, model)`. Returns `("unknown", raw)` when there is no `/`.
+pub fn split_provider_model(full: &str) -> (&str, &str) {
+    match full.split_once('/') {
+        Some((p, m)) => (p, m),
+        None => ("unknown", full),
+    }
+}
+
+/// Bucket an HTTP status code into `"2xx"` / `"4xx"` / `"5xx"` / `"1xx"` / `"3xx"`.
+pub fn status_class(status: u16) -> &'static str {
+    match status {
+        100..=199 => "1xx",
+        200..=299 => "2xx",
+        300..=399 => "3xx",
+        400..=499 => "4xx",
+        500..=599 => "5xx",
+        _ => "other",
+    }
+}
+
+// ---------------------------------------------------------------------------
+// HTTP RED helpers
+// ---------------------------------------------------------------------------
+
+/// RAII guard that increments the in-flight gauge on construction and
+/// decrements on drop. Pair with [`record_http`] in the `route()` wrapper so
+/// the gauge drops even on error paths.
+pub struct InFlightGuard {
+    handler: &'static str,
+}
+
+impl InFlightGuard {
+    pub fn new(handler: &'static str) -> Self {
+        gauge!(
+            "brightstaff_http_in_flight_requests",
+            "handler" => handler,
+        )
+        .increment(1.0);
+        Self { handler }
+    }
+}
+
+impl Drop for InFlightGuard {
+    fn drop(&mut self) {
+        gauge!(
+            "brightstaff_http_in_flight_requests",
+            "handler" => self.handler,
+        )
+        .decrement(1.0);
+    }
+}
+
+/// Record the HTTP request counter + duration histogram.
+pub fn record_http(handler: &'static str, method: &'static str, status: u16, started: Instant) {
+    let class = status_class(status);
+    counter!(
+        "brightstaff_http_requests_total",
+        "handler" => handler,
+        "method" => method,
+        "status_class" => class,
+    )
+    .increment(1);
+    histogram!(
+        "brightstaff_http_request_duration_seconds",
+        "handler" => handler,
+    )
+    .record(started.elapsed().as_secs_f64());
+}
+
+// ---------------------------------------------------------------------------
+// LLM upstream helpers
+// ---------------------------------------------------------------------------
+
+/// Classify an outcome of an LLM upstream call for the `error_class` label.
+pub fn llm_error_class_from_reqwest(err: &reqwest::Error) -> &'static str {
+    if err.is_timeout() {
+        "timeout"
+    } else if err.is_connect() {
+        "connect"
+    } else if err.is_decode() {
+        "parse"
+    } else {
+        "other"
+    }
+}
+
+/// Record the outcome of an LLM upstream call. `status` is the HTTP status
+/// the upstream returned (0 if the call never produced one, e.g. send failure).
+/// `error_class` is `"none"` on success, or a discriminated error label.
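+///
+/// For example, a connect failure records status_class="error" with
+/// error_class="connect", while a plain 200 lands in status_class="2xx"
+/// with error_class="none".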
+pub fn record_llm_upstream( + provider: &str, + model: &str, + status: u16, + error_class: &str, + duration: Duration, +) { + let class = if status == 0 { + "error" + } else { + status_class(status) + }; + counter!( + "brightstaff_llm_upstream_requests_total", + "provider" => provider.to_string(), + "model" => model.to_string(), + "status_class" => class, + "error_class" => error_class.to_string(), + ) + .increment(1); + histogram!( + "brightstaff_llm_upstream_duration_seconds", + "provider" => provider.to_string(), + "model" => model.to_string(), + ) + .record(duration.as_secs_f64()); +} + +pub fn record_llm_ttft(provider: &str, model: &str, ttft: Duration) { + histogram!( + "brightstaff_llm_time_to_first_token_seconds", + "provider" => provider.to_string(), + "model" => model.to_string(), + ) + .record(ttft.as_secs_f64()); +} + +pub fn record_llm_tokens(provider: &str, model: &str, kind: &'static str, count: u64) { + counter!( + "brightstaff_llm_tokens_total", + "provider" => provider.to_string(), + "model" => model.to_string(), + "kind" => kind, + ) + .increment(count); +} + +pub fn record_llm_tokens_usage_missing(provider: &str, model: &str) { + counter!( + "brightstaff_llm_tokens_usage_missing_total", + "provider" => provider.to_string(), + "model" => model.to_string(), + ) + .increment(1); +} + +// --------------------------------------------------------------------------- +// Router helpers +// --------------------------------------------------------------------------- + +pub fn record_router_decision( + route: &'static str, + selected_model: &str, + fallback: bool, + duration: Duration, +) { + counter!( + "brightstaff_router_decisions_total", + "route" => route, + "selected_model" => selected_model.to_string(), + "fallback" => if fallback { "true" } else { "false" }, + ) + .increment(1); + histogram!( + "brightstaff_router_decision_duration_seconds", + "route" => route, + ) + .record(duration.as_secs_f64()); +} + +pub fn record_routing_service_outcome(outcome: &'static str) { + counter!( + "brightstaff_routing_service_requests_total", + "outcome" => outcome, + ) + .increment(1); +} + +pub fn record_session_cache_event(outcome: &'static str) { + counter!( + "brightstaff_session_cache_events_total", + "outcome" => outcome, + ) + .increment(1); +} diff --git a/crates/brightstaff/src/router/http.rs b/crates/brightstaff/src/router/http.rs index ad1b711c..e1f2be1e 100644 --- a/crates/brightstaff/src/router/http.rs +++ b/crates/brightstaff/src/router/http.rs @@ -1,8 +1,14 @@ use hermesllm::apis::openai::ChatCompletionsResponse; use hyper::header; +use serde::Deserialize; use thiserror::Error; use tracing::warn; +/// Max bytes of raw upstream body we include in a log message or error text +/// when the body is not a recognizable error envelope. Keeps logs from being +/// flooded by huge HTML error pages. +const RAW_BODY_LOG_LIMIT: usize = 512; + #[derive(Debug, Error)] pub enum HttpError { #[error("Failed to send request: {0}")] @@ -10,13 +16,64 @@ pub enum HttpError { #[error("Failed to parse JSON response: {0}")] Json(serde_json::Error, String), + + #[error("Upstream returned {status}: {message}")] + Upstream { status: u16, message: String }, +} + +/// Shape of an OpenAI-style error response body, e.g. +/// `{"error": {"message": "...", "type": "...", "param": "...", "code": ...}}`. 
+#[derive(Debug, Deserialize)]
+struct UpstreamErrorEnvelope {
+ error: UpstreamErrorBody,
+}
+
+#[derive(Debug, Deserialize)]
+struct UpstreamErrorBody {
+ message: String,
+ #[serde(default, rename = "type")]
+ err_type: Option<String>,
+ #[serde(default)]
+ param: Option<String>,
+}
+
+/// Extract a human-readable error message from an upstream response body.
+/// Tries to parse an OpenAI-style `{"error": {"message": ...}}` envelope; if
+/// that fails, falls back to the first `RAW_BODY_LOG_LIMIT` bytes of the raw
+/// body (UTF-8 safe).
+fn extract_upstream_error_message(body: &str) -> String {
+ if let Ok(env) = serde_json::from_str::<UpstreamErrorEnvelope>(body) {
+ let mut msg = env.error.message;
+ if let Some(param) = env.error.param {
+ msg.push_str(&format!(" (param={param})"));
+ }
+ if let Some(err_type) = env.error.err_type {
+ msg.push_str(&format!(" [type={err_type}]"));
+ }
+ return msg;
+ }
+ truncate_for_log(body).to_string()
+}
+
+fn truncate_for_log(s: &str) -> &str {
+ if s.len() <= RAW_BODY_LOG_LIMIT {
+ return s;
+ }
+ let mut end = RAW_BODY_LOG_LIMIT;
+ while end > 0 && !s.is_char_boundary(end) {
+ end -= 1;
+ }
+ &s[..end]
+}
 
 /// Sends a POST request to the given URL and extracts the text content
 /// from the first choice of the `ChatCompletionsResponse`.
 ///
-/// Returns `Some((content, elapsed))` on success, or `None` if the response
-/// had no choices or the first choice had no content.
+/// Returns `Some((content, elapsed))` on success, `None` if the response
+/// had no choices or the first choice had no content. Returns
+/// `HttpError::Upstream` for any non-2xx status, carrying a message
+/// extracted from the OpenAI-style error envelope (or a truncated raw body
+/// if the body is not in that shape).
 pub async fn post_and_extract_content(
 client: &reqwest::Client,
 url: &str,
@@ -26,17 +83,36 @@
 let start_time = std::time::Instant::now();
 let res = client.post(url).headers(headers).body(body).send().await?;
+ let status = res.status();
 let body = res.text().await?;
 let elapsed = start_time.elapsed();
 
+ if !status.is_success() {
+ let message = extract_upstream_error_message(&body);
+ warn!(
+ status = status.as_u16(),
+ message = %message,
+ body_size = body.len(),
+ "upstream returned error response"
+ );
+ return Err(HttpError::Upstream {
+ status: status.as_u16(),
+ message,
+ });
+ }
+
 let response: ChatCompletionsResponse = serde_json::from_str(&body).map_err(|err| {
- warn!(error = %err, body = %body, "failed to parse json response");
+ warn!(
+ error = %err,
+ body = %truncate_for_log(&body),
+ "failed to parse json response",
+ );
 HttpError::Json(err, format!("Failed to parse JSON: {}", body))
 })?;
 
 if response.choices.is_empty() {
- warn!(body = %body, "no choices in response");
+ warn!(body = %truncate_for_log(&body), "no choices in response");
 return Ok(None);
 }
@@ -46,3 +122,52 @@
 .as_ref()
 .map(|c| (c.clone(), elapsed)))
 }
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn extracts_message_from_openai_style_error_envelope() {
+ let body = r#"{"error":{"code":400,"message":"This model's maximum context length is 32768 tokens.
However, you requested 0 output tokens and your prompt contains at least 32769 input tokens, for a total of at least 32769 tokens.","param":"input_tokens","type":"BadRequestError"}}"#;
+ let msg = extract_upstream_error_message(body);
+ assert!(
+ msg.starts_with("This model's maximum context length is 32768 tokens."),
+ "unexpected message: {msg}"
+ );
+ assert!(msg.contains("(param=input_tokens)"));
+ assert!(msg.contains("[type=BadRequestError]"));
+ }
+
+ #[test]
+ fn extracts_message_without_optional_fields() {
+ let body = r#"{"error":{"message":"something broke"}}"#;
+ let msg = extract_upstream_error_message(body);
+ assert_eq!(msg, "something broke");
+ }
+
+ #[test]
+ fn falls_back_to_raw_body_when_not_error_envelope() {
+ let body = "502 Bad Gateway";
+ let msg = extract_upstream_error_message(body);
+ assert_eq!(msg, body);
+ }
+
+ #[test]
+ fn truncates_non_envelope_bodies_in_logs() {
+ let body = "x".repeat(RAW_BODY_LOG_LIMIT * 3);
+ let msg = extract_upstream_error_message(&body);
+ assert_eq!(msg.len(), RAW_BODY_LOG_LIMIT);
+ }
+
+ #[test]
+ fn truncate_for_log_respects_utf8_boundaries() {
+ // 2-byte characters; picking a length that would split mid-char.
+ let body = "é".repeat(RAW_BODY_LOG_LIMIT);
+ let out = truncate_for_log(&body);
+ // Should be a valid &str (implicit — would panic if we returned
+ // a non-boundary slice) and at most RAW_BODY_LOG_LIMIT bytes.
+ assert!(out.len() <= RAW_BODY_LOG_LIMIT);
+ assert!(out.chars().all(|c| c == 'é'));
+ }
+}
diff --git a/crates/brightstaff/src/router/mod.rs b/crates/brightstaff/src/router/mod.rs
index 2ef0d11a..0f48c090 100644
--- a/crates/brightstaff/src/router/mod.rs
+++ b/crates/brightstaff/src/router/mod.rs
@@ -3,3 +3,5 @@
 pub mod model_metrics;
 pub mod orchestrator;
 pub mod orchestrator_model;
 pub mod orchestrator_model_v1;
+#[cfg(test)]
+mod stress_tests;
diff --git a/crates/brightstaff/src/router/orchestrator.rs b/crates/brightstaff/src/router/orchestrator.rs
index 7aaf70a2..2d7b25de 100644
--- a/crates/brightstaff/src/router/orchestrator.rs
+++ b/crates/brightstaff/src/router/orchestrator.rs
@@ -15,6 +15,8 @@
 use super::http::{self, post_and_extract_content};
 use super::model_metrics::ModelMetricsService;
 use super::orchestrator_model::OrchestratorModel;
+use crate::metrics as bs_metrics;
+use crate::metrics::labels as metric_labels;
 use crate::router::orchestrator_model_v1;
 use crate::session_cache::SessionCache;
@@ -130,7 +132,13 @@
 tenant_id: Option<&str>,
 ) -> Option<String> {
 let cache = self.session_cache.as_ref()?;
- cache.get(&Self::session_key(tenant_id, session_id)).await
+ let result = cache.get(&Self::session_key(tenant_id, session_id)).await;
+ bs_metrics::record_session_cache_event(if result.is_some() {
+ metric_labels::SESSION_CACHE_HIT
+ } else {
+ metric_labels::SESSION_CACHE_MISS
+ });
+ result
 }
 
 pub async fn cache_route(
@@ -151,6 +159,7 @@
 self.session_ttl,
 )
 .await;
+ bs_metrics::record_session_cache_event(metric_labels::SESSION_CACHE_STORE);
 }
 }
diff --git a/crates/brightstaff/src/router/orchestrator_model_v1.rs b/crates/brightstaff/src/router/orchestrator_model_v1.rs
index 75e5c586..693aacc2 100644
--- a/crates/brightstaff/src/router/orchestrator_model_v1.rs
+++ b/crates/brightstaff/src/router/orchestrator_model_v1.rs
@@ -10,6 +10,18 @@
 use super::orchestrator_model::{OrchestratorModel, OrchestratorModelError};
 
 pub const MAX_TOKEN_LEN: usize = 8192; // Default max token length for the orchestration model
 
+/// Hard cap on the number of recent messages considered when building the
+/// routing prompt. Bounds prompt growth for long-running conversations and
+/// acts as an outer guardrail before the token-budget loop runs. The most
+/// recent `MAX_ROUTING_TURNS` filtered messages are kept; older turns are
+/// dropped entirely.
+pub const MAX_ROUTING_TURNS: usize = 16;
+
+/// Unicode ellipsis used to mark where content was trimmed out of a long
+/// message. Helps signal to the downstream router model that the message was
+/// truncated.
+const TRIM_MARKER: &str = "…";
+
 /// Custom JSON formatter that produces spaced JSON (space after colons and commas), same as JSON in python
 struct SpacedJsonFormatter;
@@ -176,10 +188,9 @@
 messages: &[Message],
 usage_preferences_from_request: &Option>,
 ) -> ChatCompletionsRequest {
- // remove system prompt, tool calls, tool call response and messages without content
- // if content is empty its likely a tool call
- // when role == tool its tool call response
- let messages_vec = messages
+ // Remove system/developer/tool messages and messages without extractable
+ // text (tool calls have no text content we can classify against).
+ let filtered: Vec<&Message> = messages
 .iter()
 .filter(|m| {
 m.role != Role::System
@@ -187,37 +198,72 @@
 && m.role != Role::Tool
 && !m.content.extract_text().is_empty()
 })
- .collect::<Vec<&Message>>();
+ .collect();
 
- // Following code is to ensure that the conversation does not exceed max token length
- // Note: we use a simple heuristic to estimate token count based on character length to optimize for performance
+ // Outer guardrail: only consider the last `MAX_ROUTING_TURNS` filtered
+ // messages when building the routing prompt. Keeps prompt growth
+ // predictable for long conversations regardless of per-message size.
+ let start = filtered.len().saturating_sub(MAX_ROUTING_TURNS);
+ let messages_vec: &[&Message] = &filtered[start..];
+
+ // Ensure the conversation does not exceed the configured token budget.
+ // We use `len() / TOKEN_LENGTH_DIVISOR` as a cheap token estimate to
+ // avoid running a real tokenizer on the hot path.
 let mut token_count = ARCH_ORCHESTRATOR_V1_SYSTEM_PROMPT.len() / TOKEN_LENGTH_DIVISOR;
- let mut selected_messages_list_reversed: Vec<&Message> = vec![];
+ let mut selected_messages_list_reversed: Vec<Message> = vec![];
 for (selected_messsage_count, message) in messages_vec.iter().rev().enumerate() {
- let message_token_count = message.content.extract_text().len() / TOKEN_LENGTH_DIVISOR;
- token_count += message_token_count;
- if token_count > self.max_token_length {
+ let message_text = message.content.extract_text();
+ let message_token_count = message_text.len() / TOKEN_LENGTH_DIVISOR;
+ if token_count + message_token_count > self.max_token_length {
+ let remaining_tokens = self.max_token_length.saturating_sub(token_count);
 debug!(
- token_count = token_count,
+ attempted_total_tokens = token_count + message_token_count,
 max_tokens = self.max_token_length,
+ remaining_tokens,
 selected = selected_messsage_count,
 total = messages_vec.len(),
 "token count exceeds max, truncating conversation"
 );
- if message.role == Role::User {
- // If message that exceeds max token length is from user, we need to keep it
- selected_messages_list_reversed.push(message);
+ // If the overflow message is from the user we need to keep
+ // some of it so the orchestrator still sees the latest user
+ // intent. Use a middle-trim (head + ellipsis + tail): users
+ // often frame the task at the start AND put the actual ask
+ // at the end of a long pasted block, so preserving both is
+ // better than a head-only cut. The ellipsis also signals to
+ // the router model that content was dropped.
+ if message.role == Role::User && remaining_tokens > 0 {
+ let max_bytes = remaining_tokens.saturating_mul(TOKEN_LENGTH_DIVISOR);
+ let truncated = trim_middle_utf8(&message_text, max_bytes);
+ selected_messages_list_reversed.push(Message {
+ role: Role::User,
+ content: Some(MessageContent::Text(truncated)),
+ name: None,
+ tool_calls: None,
+ tool_call_id: None,
+ });
+ }
 break;
 }
- // If we are here, it means that the message is within the max token length
- selected_messages_list_reversed.push(message);
+ token_count += message_token_count;
+ selected_messages_list_reversed.push(Message {
+ role: message.role.clone(),
+ content: Some(MessageContent::Text(message_text)),
+ name: None,
+ tool_calls: None,
+ tool_call_id: None,
+ });
 }
 
 if selected_messages_list_reversed.is_empty() {
 debug!("no messages selected, using last message");
 if let Some(last_message) = messages_vec.last() {
- selected_messages_list_reversed.push(last_message);
+ selected_messages_list_reversed.push(Message {
+ role: last_message.role.clone(),
+ content: Some(MessageContent::Text(last_message.content.extract_text())),
+ name: None,
+ tool_calls: None,
+ tool_call_id: None,
+ });
 }
 }
@@ -237,22 +283,8 @@
 }
 
 // Reverse the selected messages to maintain the conversation order
- let selected_conversation_list = selected_messages_list_reversed
- .iter()
- .rev()
- .map(|message| Message {
- role: message.role.clone(),
- content: Some(MessageContent::Text(
- message
- .content
- .as_ref()
- .map_or(String::new(), |c| c.to_string()),
- )),
- name: None,
- tool_calls: None,
- tool_call_id: None,
- })
- .collect::<Vec<Message>>();
+ let selected_conversation_list: Vec<Message> =
+ selected_messages_list_reversed.into_iter().rev().collect();
 
 // Generate the orchestrator request message based on the usage preferences.
 // If preferences are passed in request then we use them;
@@ -405,6 +437,45 @@
 fn fix_json_response(body: &str) -> String {
 body.replace("'", "\"").replace("\\n", "")
 }
 
+/// Truncate `s` so the result is at most `max_bytes` bytes long, keeping
+/// roughly 60% from the start and 40% from the end, with a Unicode ellipsis
+/// separating the two. All splits respect UTF-8 character boundaries. When
+/// `max_bytes` is too small to fit the marker at all, falls back to a
+/// head-only truncation.
+fn trim_middle_utf8(s: &str, max_bytes: usize) -> String {
+ if s.len() <= max_bytes {
+ return s.to_string();
+ }
+ if max_bytes <= TRIM_MARKER.len() {
+ // Not enough room even for the marker — just keep the start.
+ let mut end = max_bytes;
+ while end > 0 && !s.is_char_boundary(end) {
+ end -= 1;
+ }
+ return s[..end].to_string();
+ }
+
+ let available = max_bytes - TRIM_MARKER.len();
+ // Bias toward the start (60%) where task framing typically lives, while
+ // still preserving ~40% of the tail where the user's actual ask often
+ // appears after a long paste.
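+ // (e.g. with max_bytes = 100 and the 3-byte "…" marker: available = 97,
+ // so start_len = 58 and end_len = 39, before any boundary adjustment.)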
+ let mut start_len = available * 3 / 5; + while start_len > 0 && !s.is_char_boundary(start_len) { + start_len -= 1; + } + let end_len = available - start_len; + let mut end_start = s.len().saturating_sub(end_len); + while end_start < s.len() && !s.is_char_boundary(end_start) { + end_start += 1; + } + + let mut out = String::with_capacity(start_len + TRIM_MARKER.len() + (s.len() - end_start)); + out.push_str(&s[..start_len]); + out.push_str(TRIM_MARKER); + out.push_str(&s[end_start..]); + out +} + impl std::fmt::Debug for dyn OrchestratorModel { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "OrchestratorModel") @@ -777,6 +848,10 @@ If no routes are needed, return an empty list for `route`. #[test] fn test_conversation_trim_upto_user_message() { + // With max_token_length=230, the older user message "given the image + // In style of Andy Warhol" overflows the remaining budget and gets + // middle-trimmed (head + ellipsis + tail) until it fits. Newer turns + // are kept in full. let expected_prompt = r#" You are a helpful assistant that selects the most suitable routes based on user intent. You are provided with a list of available routes enclosed within XML tags: @@ -789,7 +864,7 @@ You are also given the conversation context enclosed within >, + >(orchestrations_str) + .unwrap(); + + let max_token_length = 2048; + let orchestrator = OrchestratorModelV1::new( + agent_orchestrations, + "test-model".to_string(), + max_token_length, + ); + + // ~500KB of content — same scale as the real payload that triggered + // the production upstream 400. + let head = "HEAD_MARKER_START "; + let tail = " TAIL_MARKER_END"; + let filler = "A".repeat(500_000); + let huge_user_content = format!("{head}{filler}{tail}"); + + let conversation = vec![Message { + role: Role::User, + content: Some(MessageContent::Text(huge_user_content.clone())), + name: None, + tool_calls: None, + tool_call_id: None, + }]; + + let req = orchestrator.generate_request(&conversation, &None); + let prompt = req.messages[0].content.extract_text(); + + // Prompt must stay bounded. Generous ceiling = budget-in-bytes + + // scaffolding + slack. Real result should be well under this. + let byte_ceiling = max_token_length * TOKEN_LENGTH_DIVISOR + + ARCH_ORCHESTRATOR_V1_SYSTEM_PROMPT.len() + + 1024; + assert!( + prompt.len() < byte_ceiling, + "prompt length {} exceeded ceiling {} — truncation did not apply", + prompt.len(), + byte_ceiling, + ); + + // Not all 500k filler chars survive. + let a_count = prompt.chars().filter(|c| *c == 'A').count(); + assert!( + a_count < filler.len(), + "expected user message to be truncated; all {} 'A's survived", + a_count + ); + assert!( + a_count > 0, + "expected some of the user message to survive truncation" + ); + + // Head and tail of the message must both be preserved (that's the + // whole point of middle-trim over head-only). + assert!( + prompt.contains(head), + "head marker missing — head was not preserved" + ); + assert!( + prompt.contains(tail), + "tail marker missing — tail was not preserved" + ); + + // Trim marker must be present so the router model can see that + // content was omitted. + assert!( + prompt.contains(TRIM_MARKER), + "ellipsis trim marker missing from truncated prompt" + ); + + // Routing prompt scaffolding remains intact. 
+ assert!(prompt.contains("<routes>"));
+ assert!(prompt.contains("<conversation>"));
+ }
+
+ #[test]
+ fn test_turn_cap_limits_routing_history() {
+ // The outer turn-cap guardrail should keep only the last
+ // `MAX_ROUTING_TURNS` filtered messages regardless of how long the
+ // conversation is. We build a conversation with alternating
+ // user/assistant turns tagged with their index and verify that only
+ // the tail of the conversation makes it into the prompt.
+ let orchestrations_str = r#"
+ {
+ "gpt-4o": [
+ {"name": "Image generation", "description": "generating image"}
+ ]
+ }
+ "#;
+ let agent_orchestrations = serde_json::from_str::<
+ HashMap>,
+ >(orchestrations_str)
+ .unwrap();
+
+ let orchestrator =
+ OrchestratorModelV1::new(agent_orchestrations, "test-model".to_string(), usize::MAX);
+
+ let mut conversation: Vec<Message> = Vec::new();
+ let total_turns = MAX_ROUTING_TURNS * 2; // well past the cap
+ for i in 0..total_turns {
+ let role = if i % 2 == 0 {
+ Role::User
+ } else {
+ Role::Assistant
+ };
+ conversation.push(Message {
+ role,
+ content: Some(MessageContent::Text(format!("turn-{i:03}"))),
+ name: None,
+ tool_calls: None,
+ tool_call_id: None,
+ });
+ }
+
+ let req = orchestrator.generate_request(&conversation, &None);
+ let prompt = req.messages[0].content.extract_text();
+
+ // The last MAX_ROUTING_TURNS messages (indexes total-cap..total)
+ // must all appear.
+ for i in (total_turns - MAX_ROUTING_TURNS)..total_turns {
+ let tag = format!("turn-{i:03}");
+ assert!(
+ prompt.contains(&tag),
+ "expected recent turn tag {tag} to be present"
+ );
+ }
+
+ // And earlier turns (indexes 0..total-cap) must all be dropped.
+ for i in 0..(total_turns - MAX_ROUTING_TURNS) {
+ let tag = format!("turn-{i:03}");
+ assert!(
+ !prompt.contains(&tag),
+ "old turn tag {tag} leaked past turn cap into the prompt"
+ );
+ }
+ }
+
+ #[test]
+ fn test_trim_middle_utf8_helper() {
+ // No-op when already small enough.
+ assert_eq!(trim_middle_utf8("hello", 100), "hello");
+ assert_eq!(trim_middle_utf8("hello", 5), "hello");
+
+ // 60/40 split with ellipsis when too long.
+ let long = "a".repeat(20);
+ let out = trim_middle_utf8(&long, 10);
+ assert!(out.len() <= 10);
+ assert!(out.contains(TRIM_MARKER));
+ // Exactly one ellipsis, rest are 'a's.
+ assert_eq!(out.matches(TRIM_MARKER).count(), 1);
+ assert!(out.chars().filter(|c| *c == 'a').count() > 0);
+
+ // When max_bytes is smaller than the marker, falls back to
+ // head-only truncation (no marker).
+ let out = trim_middle_utf8("abcdefgh", 2);
+ assert_eq!(out, "ab");
+
+ // UTF-8 boundary safety: 2-byte chars.
+ let s = "é".repeat(50); // 100 bytes
+ let out = trim_middle_utf8(&s, 25);
+ assert!(out.len() <= 25);
+ // Must still be valid UTF-8 that only contains 'é' and the marker.
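+ // Byte budget: 25 = 3-byte marker + at most 22 bytes (11 chars) of 'é'.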
+ let ok = out.chars().all(|c| c == 'é' || c == '…');
+ assert!(ok, "unexpected char in trimmed output: {out:?}");
+ }
+
 #[test]
 fn test_non_text_input() {
 let expected_prompt = r#"
diff --git a/crates/brightstaff/src/router/stress_tests.rs b/crates/brightstaff/src/router/stress_tests.rs
new file mode 100644
index 00000000..6c3ffefd
--- /dev/null
+++ b/crates/brightstaff/src/router/stress_tests.rs
@@ -0,0 +1,260 @@
+#[cfg(test)]
+mod tests {
+ use crate::router::orchestrator::OrchestratorService;
+ use crate::session_cache::memory::MemorySessionCache;
+ use common::configuration::{SelectionPolicy, SelectionPreference, TopLevelRoutingPreference};
+ use hermesllm::apis::openai::{Message, MessageContent, Role};
+ use std::sync::Arc;
+
+ fn make_messages(n: usize) -> Vec<Message> {
+ (0..n)
+ .map(|i| Message {
+ role: if i % 2 == 0 {
+ Role::User
+ } else {
+ Role::Assistant
+ },
+ content: Some(MessageContent::Text(format!(
+ "This is message number {i} with some padding text to make it realistic."
+ ))),
+ name: None,
+ tool_calls: None,
+ tool_call_id: None,
+ })
+ .collect()
+ }
+
+ fn make_routing_prefs() -> Vec<TopLevelRoutingPreference> {
+ vec![
+ TopLevelRoutingPreference {
+ name: "code_generation".to_string(),
+ description: "Code generation and debugging tasks".to_string(),
+ models: vec![
+ "openai/gpt-4o".to_string(),
+ "openai/gpt-4o-mini".to_string(),
+ ],
+ selection_policy: SelectionPolicy {
+ prefer: SelectionPreference::None,
+ },
+ },
+ TopLevelRoutingPreference {
+ name: "summarization".to_string(),
+ description: "Summarizing documents and text".to_string(),
+ models: vec![
+ "anthropic/claude-3-sonnet".to_string(),
+ "openai/gpt-4o-mini".to_string(),
+ ],
+ selection_policy: SelectionPolicy {
+ prefer: SelectionPreference::None,
+ },
+ },
+ ]
+ }
+
+ /// Stress test: exercise the full routing code path N times using a mock
+ /// HTTP server and measure jemalloc allocated bytes before/after.
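+ ///
+ /// Allocation counts come from `tikv_jemalloc_ctl`; when the `jemalloc`
+ /// feature is disabled, `get_allocated()` below falls back to returning 0,
+ /// so the growth assertions pass vacuously.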
+ /// + /// This catches: + /// - Memory leaks in generate_request / parse_response + /// - Leaks in reqwest connection handling + /// - String accumulation in the orchestrator model + /// - Fragmentation (jemalloc allocated vs resident) + #[tokio::test] + async fn stress_test_routing_determine_route() { + let mut server = mockito::Server::new_async().await; + let router_url = format!("{}/v1/chat/completions", server.url()); + + let mock_response = serde_json::json!({ + "id": "chatcmpl-mock", + "object": "chat.completion", + "created": 1234567890, + "model": "plano-orchestrator", + "choices": [{ + "index": 0, + "message": { + "role": "assistant", + "content": "{\"route\": \"code_generation\"}" + }, + "finish_reason": "stop" + }], + "usage": {"prompt_tokens": 100, "completion_tokens": 10, "total_tokens": 110} + }); + + let _mock = server + .mock("POST", "/v1/chat/completions") + .with_status(200) + .with_header("content-type", "application/json") + .with_body(mock_response.to_string()) + .expect_at_least(1) + .create_async() + .await; + + let prefs = make_routing_prefs(); + let session_cache = Arc::new(MemorySessionCache::new(1000)); + let orchestrator_service = Arc::new(OrchestratorService::with_routing( + router_url, + "Plano-Orchestrator".to_string(), + "plano-orchestrator".to_string(), + Some(prefs.clone()), + None, + None, + session_cache, + None, + 2048, + )); + + // Warm up: a few requests to stabilize allocator state + for _ in 0..10 { + let msgs = make_messages(5); + let _ = orchestrator_service + .determine_route(&msgs, None, "warmup") + .await; + } + + // Snapshot memory after warmup + let baseline = get_allocated(); + + let num_iterations = 2000; + + for i in 0..num_iterations { + let msgs = make_messages(5 + (i % 10)); + let inline = if i % 3 == 0 { + Some(make_routing_prefs()) + } else { + None + }; + let _ = orchestrator_service + .determine_route(&msgs, inline, &format!("req-{i}")) + .await; + } + + let after = get_allocated(); + + let growth = after.saturating_sub(baseline); + let growth_mb = growth as f64 / (1024.0 * 1024.0); + let per_request = growth.checked_div(num_iterations).unwrap_or(0); + + eprintln!("=== Routing Stress Test Results ==="); + eprintln!(" Iterations: {num_iterations}"); + eprintln!(" Baseline alloc: {} bytes", baseline); + eprintln!(" Final alloc: {} bytes", after); + eprintln!(" Growth: {} bytes ({growth_mb:.2} MB)", growth); + eprintln!(" Per-request: {} bytes", per_request); + + // Allow up to 256 bytes per request of retained growth (connection pool, etc.) + // A true leak would show thousands of bytes per request. + assert!( + per_request < 256, + "Possible memory leak: {per_request} bytes/request retained after {num_iterations} iterations" + ); + } + + /// Stress test with high concurrency: many parallel determine_route calls. 
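+ ///
+ /// Uses a looser ceiling here (512 bytes/request vs 256 in the serial
+ /// test) to leave headroom for per-task connection-pool state retained
+ /// across 50 parallel workers.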
+ #[tokio::test] + async fn stress_test_routing_concurrent() { + let mut server = mockito::Server::new_async().await; + let router_url = format!("{}/v1/chat/completions", server.url()); + + let mock_response = serde_json::json!({ + "id": "chatcmpl-mock", + "object": "chat.completion", + "created": 1234567890, + "model": "plano-orchestrator", + "choices": [{ + "index": 0, + "message": { + "role": "assistant", + "content": "{\"route\": \"summarization\"}" + }, + "finish_reason": "stop" + }], + "usage": {"prompt_tokens": 100, "completion_tokens": 10, "total_tokens": 110} + }); + + let _mock = server + .mock("POST", "/v1/chat/completions") + .with_status(200) + .with_header("content-type", "application/json") + .with_body(mock_response.to_string()) + .expect_at_least(1) + .create_async() + .await; + + let prefs = make_routing_prefs(); + let session_cache = Arc::new(MemorySessionCache::new(1000)); + let orchestrator_service = Arc::new(OrchestratorService::with_routing( + router_url, + "Plano-Orchestrator".to_string(), + "plano-orchestrator".to_string(), + Some(prefs), + None, + None, + session_cache, + None, + 2048, + )); + + // Warm up + for _ in 0..20 { + let msgs = make_messages(3); + let _ = orchestrator_service + .determine_route(&msgs, None, "warmup") + .await; + } + + let baseline = get_allocated(); + + let concurrency = 50; + let requests_per_task = 100; + let total = concurrency * requests_per_task; + + let mut handles = vec![]; + for t in 0..concurrency { + let svc = Arc::clone(&orchestrator_service); + let handle = tokio::spawn(async move { + for r in 0..requests_per_task { + let msgs = make_messages(3 + (r % 8)); + let _ = svc + .determine_route(&msgs, None, &format!("req-{t}-{r}")) + .await; + } + }); + handles.push(handle); + } + + for h in handles { + h.await.unwrap(); + } + + let after = get_allocated(); + let growth = after.saturating_sub(baseline); + let per_request = growth / total; + + eprintln!("=== Concurrent Routing Stress Test Results ==="); + eprintln!(" Tasks: {concurrency} x {requests_per_task} = {total}"); + eprintln!(" Baseline: {} bytes", baseline); + eprintln!(" Final: {} bytes", after); + eprintln!( + " Growth: {} bytes ({:.2} MB)", + growth, + growth as f64 / 1_048_576.0 + ); + eprintln!(" Per-request: {} bytes", per_request); + + assert!( + per_request < 512, + "Possible memory leak under concurrency: {per_request} bytes/request retained after {total} requests" + ); + } + + #[cfg(feature = "jemalloc")] + fn get_allocated() -> usize { + tikv_jemalloc_ctl::epoch::advance().unwrap(); + tikv_jemalloc_ctl::stats::allocated::read().unwrap_or(0) + } + + #[cfg(not(feature = "jemalloc"))] + fn get_allocated() -> usize { + 0 + } +} diff --git a/crates/brightstaff/src/signals/analyzer.rs b/crates/brightstaff/src/signals/analyzer.rs index 5ee3c7d9..35e342eb 100644 --- a/crates/brightstaff/src/signals/analyzer.rs +++ b/crates/brightstaff/src/signals/analyzer.rs @@ -1,3190 +1,572 @@ -//! Agentic Signals - Behavioral quality indicators for agent interactions +//! Top-level signal analyzer. //! -//! This module implements various signals that serve as early warning indicators -//! of brilliant successes or failures in agentic interactions. These signals are -//! derived from conversation patterns and can be computed algorithmically from -//! message arrays. - -use serde::{Deserialize, Serialize}; -use std::collections::{HashMap, HashSet}; -use std::sync::LazyLock; +//! Direct port of `signals/analyzer.py`. Orchestrates all detectors across +//! 
the three layers (interaction / execution / environment) and produces a +//! `SignalReport`. use hermesllm::apis::openai::{Message, Role}; +use hermesllm::transforms::ExtractText; -// ============================================================================ -// Constants -// ============================================================================ +use super::environment::exhaustion::analyze_exhaustion; +use super::execution::failure::analyze_failure; +use super::execution::loops::analyze_loops; +use super::interaction::disengagement::analyze_disengagement; +use super::interaction::misalignment::analyze_misalignment; +use super::interaction::satisfaction::analyze_satisfaction; +use super::interaction::stagnation::{analyze_stagnation, ShareGptMsg}; +use super::schemas::{ + EnvironmentSignals, ExecutionSignals, InteractionQuality, InteractionSignals, SignalReport, + SignalType, TurnMetrics, +}; +use super::text_processing::NormalizedMessage; -/// Flag emoji for marking spans/operations worth investigating +/// Marker appended to the span operation name when concerning signals are +/// detected. The 🚩 emoji (U+1F6A9) matches the pre-port implementation so +/// downstream consumers that search for flagged traces by span-name emoji +/// keep working. pub const FLAG_MARKER: &str = "\u{1F6A9}"; -/// Size of character n-grams for similarity matching (3 = trigrams) -const NGRAM_SIZE: usize = 3; +/// ShareGPT-shaped row used as the canonical input to the analyzer's +/// detectors. `from` is one of `"human"`, `"gpt"`, `"function_call"`, +/// `"observation"`. `value` is the raw message body. +#[derive(Debug, Clone, Copy)] +pub struct ShareGptMessage<'a> { + pub from: &'a str, + pub value: &'a str, +} -// ============================================================================ -// Normalized Message Processing -// ============================================================================ - -/// Pre-processed message with normalized text and tokens for efficient matching +/// Configuration knobs for the analyzer. Defaults match +/// `signals/analyzer.py:SignalAnalyzer.__init__`. 
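+///
+/// A sketch of overriding a single knob while keeping the ported defaults:
+///
+/// ```ignore
+/// let cfg = SignalAnalyzerConfig { baseline_turns: 8, ..Default::default() };
+/// ```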
#[derive(Debug, Clone)] -struct NormalizedMessage { - /// Original raw text - raw: String, - /// Tokens (words) extracted from the message - tokens: Vec, - /// Token set for fast lookup - token_set: HashSet, - /// Bigram set for fast similarity computation - bigram_set: HashSet, - /// Character ngram set for robust similarity matching - char_ngram_set: HashSet, - /// Token frequency map for multiset cosine similarity - token_frequency: HashMap, +pub struct SignalAnalyzerConfig { + pub baseline_turns: usize, + pub char_ngram_threshold: f32, + pub token_cosine_threshold: f32, + pub max_message_length: usize, + pub max_messages: usize, } -impl NormalizedMessage { - #[allow(dead_code)] // Used in tests for algorithm validation - fn from_text(text: &str) -> Self { - Self::from_text_with_limit(text, usize::MAX) - } - - fn from_text_with_limit(text: &str, max_length: usize) -> Self { - // Truncate to max_length characters to prevent unbounded computation - // Keep head (20%) + tail (80%) to preserve both context and intent - - let char_count = text.chars().count(); - - let raw = if char_count <= max_length { - text.to_string() - } else { - // Split: 20% head, 79% tail, 1 char space delimiter - let head_len = max_length / 5; - let tail_len = max_length - head_len - 1; - - let head: String = text.chars().take(head_len).collect(); - let tail: String = text.chars().skip(char_count - tail_len).collect(); - - format!("{} {}", head, tail) - }; - - // Normalize unicode punctuation to ASCII equivalents - let normalized_unicode = raw - .replace(['\u{2019}', '\u{2018}'], "'") // U+2019/U+2018 SINGLE QUOTATION MARKs - .replace(['\u{201C}', '\u{201D}'], "\"") // U+201C/U+201D DOUBLE QUOTATION MARKs - .replace(['\u{2013}', '\u{2014}'], "-"); // U+2013/U+2014 EN/EM DASHes - - // Normalize: lowercase, collapse whitespace - let normalized = normalized_unicode - .to_lowercase() - .split_whitespace() - .collect::>() - .join(" "); - - // Tokenize: split on whitespace and strip punctuation from boundaries - let tokens: Vec = normalized - .split_whitespace() - .map(|word| { - // Strip leading/trailing punctuation but keep internal punctuation - word.trim_matches(|c: char| c.is_ascii_punctuation()) - .to_string() - }) - .filter(|w| !w.is_empty()) - .collect(); - - let token_set: HashSet = tokens.iter().cloned().collect(); - - // Generate bigram set directly for similarity matching - let bigram_set: HashSet = tokens - .windows(2) - .map(|w| format!("{} {}", w[0], w[1])) - .collect(); - - // Generate character ngram set for robust similarity matching - // Uses tokens (with punctuation stripped) for consistency with pattern matching - let tokens_text = tokens.join(" "); - let char_ngram_set: HashSet = tokens_text - .chars() - .collect::>() - .windows(NGRAM_SIZE) - .map(|w| w.iter().collect::()) - .collect(); - - // Compute token frequency map for cosine similarity - let mut token_frequency: HashMap = HashMap::new(); - for token in &tokens { - *token_frequency.entry(token.clone()).or_insert(0) += 1; - } - - Self { - raw, - tokens, - token_set, - bigram_set, - char_ngram_set, - token_frequency, - } - } - - /// Check if a single token exists in the message (word boundary aware) - fn contains_token(&self, token: &str) -> bool { - self.token_set.contains(token) - } - - /// Check if a phrase (sequence of tokens) exists in the message - fn contains_phrase(&self, phrase: &str) -> bool { - let phrase_tokens: Vec<&str> = phrase.split_whitespace().collect(); - if phrase_tokens.is_empty() { - return false; - } - - if 
phrase_tokens.len() == 1 { - return self.contains_token(phrase_tokens[0]); - } - - // Multi-word phrase: check for sequence in tokens - self.tokens.windows(phrase_tokens.len()).any(|window| { - window - .iter() - .zip(phrase_tokens.iter()) - .all(|(token, phrase_token)| token == phrase_token) - }) - } - - /// Calculate character ngram similarity between this message and a pattern - /// Returns a similarity score between 0.0 and 1.0 - /// This is robust to typos, small edits, and word insertions - #[allow(dead_code)] // Used in tests for algorithm validation - fn char_ngram_similarity(&self, pattern: &str) -> f64 { - // Normalize the pattern: lowercase and remove ALL punctuation - // This makes "doesn't" → "doesnt" for robust typo matching - let normalized_pattern = pattern - .to_lowercase() - .chars() - .filter(|c| c.is_alphanumeric() || c.is_whitespace()) - .collect::() - .split_whitespace() - .collect::>() - .join(" "); - - // Generate ngrams for the pattern - let pattern_ngrams: HashSet = normalized_pattern - .chars() - .collect::>() - .windows(NGRAM_SIZE) - .map(|w| w.iter().collect::()) - .collect(); - - if self.char_ngram_set.is_empty() && pattern_ngrams.is_empty() { - return 1.0; // Both empty = identical - } - - if self.char_ngram_set.is_empty() || pattern_ngrams.is_empty() { - return 0.0; - } - - // Compute Jaccard similarity (intersection / union) - let intersection = self.char_ngram_set.intersection(&pattern_ngrams).count(); - let union = self.char_ngram_set.union(&pattern_ngrams).count(); - - if union == 0 { - return 0.0; - } - - intersection as f64 / union as f64 - } - - /// Calculate token-based cosine similarity using term frequencies - /// Returns a similarity score between 0.0 and 1.0 - /// This handles word frequency and is stable for longer messages - #[allow(dead_code)] // Used in tests for algorithm validation - fn token_cosine_similarity(&self, pattern: &str) -> f64 { - // Tokenize and compute frequencies for the pattern - let pattern_tokens: Vec = pattern - .to_lowercase() - .split_whitespace() - .map(|word| { - word.trim_matches(|c: char| c.is_ascii_punctuation()) - .to_string() - }) - .filter(|w| !w.is_empty()) - .collect(); - - let mut pattern_frequency: HashMap = HashMap::new(); - for token in &pattern_tokens { - *pattern_frequency.entry(token.clone()).or_insert(0) += 1; - } - - if self.token_frequency.is_empty() && pattern_frequency.is_empty() { - return 1.0; - } - - if self.token_frequency.is_empty() || pattern_frequency.is_empty() { - return 0.0; - } - - // Compute cosine similarity - // cosine_sim = dot_product / (norm1 * norm2) - - let mut dot_product = 0.0; - let mut norm1_squared = 0.0; - let mut norm2_squared = 0.0; - - // Collect all unique tokens from both sets - let all_tokens: HashSet = self - .token_frequency - .keys() - .chain(pattern_frequency.keys()) - .cloned() - .collect(); - - for token in all_tokens { - let freq1 = *self.token_frequency.get(&token).unwrap_or(&0) as f64; - let freq2 = *pattern_frequency.get(&token).unwrap_or(&0) as f64; - - dot_product += freq1 * freq2; - norm1_squared += freq1 * freq1; - norm2_squared += freq2 * freq2; - } - - let norm1 = norm1_squared.sqrt(); - let norm2 = norm2_squared.sqrt(); - - if norm1 == 0.0 || norm2 == 0.0 { - return 0.0; - } - - dot_product / (norm1 * norm2) - } - - /// Layered phrase matching: exact → character ngram → token cosine - /// Returns true if the pattern matches using any layer - #[allow(dead_code)] // Kept for reference; production uses matches_normalized_pattern - fn 
layered_contains_phrase( - &self, - pattern: &str, - char_ngram_threshold: f64, - token_cosine_threshold: f64, - ) -> bool { - // Layer 0: Exact phrase match (fastest) - if self.contains_phrase(pattern) { - return true; - } - - // Layer 1: Character ngram similarity (typo/edit robustness) - // Check whole message first (for short messages) - if self.char_ngram_similarity(pattern) >= char_ngram_threshold { - return true; - } - - // ngram containment check for patterns buried in longer messages - // If ALL of the pattern's ngrams exist in the message, the pattern must be - // present (possibly with minor variations like missing apostrophes). - // This is O(pattern_ngrams) lookups vs expensive window sliding. - if self.char_ngram_containment(pattern) >= 1.0 { - return true; - } - - // Layer 2: Token cosine similarity (semantic stability for long messages) - if self.token_cosine_similarity(pattern) >= token_cosine_threshold { - return true; - } - - false - } - - fn char_ngram_containment(&self, pattern: &str) -> f64 { - // Normalize the pattern the same way as char_ngram_similarity - let normalized_pattern = pattern - .to_lowercase() - .chars() - .filter(|c| c.is_alphanumeric() || c.is_whitespace()) - .collect::() - .split_whitespace() - .collect::>() - .join(" "); - - // Generate ngrams for the pattern - let pattern_ngrams: HashSet = normalized_pattern - .chars() - .collect::>() - .windows(NGRAM_SIZE) - .map(|w| w.iter().collect::()) - .collect(); - - if pattern_ngrams.is_empty() { - return 0.0; - } - - // Count how many pattern ngrams exist in the message - let contained = pattern_ngrams - .iter() - .filter(|t| self.char_ngram_set.contains(*t)) - .count(); - - contained as f64 / pattern_ngrams.len() as f64 - } - - /// Fast matching against a pre-normalized pattern - /// This avoids re-normalizing and re-computing ngrams for each pattern - fn matches_normalized_pattern( - &self, - pattern: &NormalizedPattern, - char_ngram_threshold: f64, - token_cosine_threshold: f64, - ) -> bool { - // Layer 0: Exact phrase match (fastest) - if self.contains_phrase(&pattern.raw) { - return true; - } - - // Layer 1: Character ngram similarity using pre-computed ngrams - if !self.char_ngram_set.is_empty() && !pattern.char_ngram_set.is_empty() { - let intersection = self - .char_ngram_set - .intersection(&pattern.char_ngram_set) - .count(); - let union = self.char_ngram_set.union(&pattern.char_ngram_set).count(); - if union > 0 { - let similarity = intersection as f64 / union as f64; - if similarity >= char_ngram_threshold { - return true; - } - } - } - - // Ngram containment check using pre-computed ngrams - if !pattern.char_ngram_set.is_empty() { - let contained = pattern - .char_ngram_set - .iter() - .filter(|t| self.char_ngram_set.contains(*t)) - .count(); - let containment = contained as f64 / pattern.char_ngram_set.len() as f64; - if containment >= 1.0 { - return true; - } - } - - // Layer 2: Token cosine similarity using pre-computed frequencies - if !self.token_frequency.is_empty() && !pattern.token_frequency.is_empty() { - let mut dot_product = 0.0; - let mut norm1_squared = 0.0; - let mut norm2_squared = 0.0; - - // Iterate over pattern tokens (usually smaller set) - for (token, &freq2) in &pattern.token_frequency { - let freq1 = *self.token_frequency.get(token).unwrap_or(&0) as f64; - let freq2 = freq2 as f64; - dot_product += freq1 * freq2; - norm2_squared += freq2 * freq2; - } - - // Add self tokens not in pattern for norm1 - for &freq1 in self.token_frequency.values() { - norm1_squared += (freq1 
as f64) * (freq1 as f64); - } - - let norm1 = norm1_squared.sqrt(); - let norm2 = norm2_squared.sqrt(); - - if norm1 > 0.0 && norm2 > 0.0 { - let similarity = dot_product / (norm1 * norm2); - if similarity >= token_cosine_threshold { - return true; - } - } - } - - false - } -} - -// ============================================================================ -// Normalized Pattern (pre-computed for performance) -// ============================================================================ - -/// Pre-processed pattern with normalized text and pre-computed ngrams/tokens -/// This avoids redundant computation when matching against many messages -#[derive(Debug, Clone)] -struct NormalizedPattern { - /// Original raw pattern text - raw: String, - /// Character ngram set for similarity matching - char_ngram_set: HashSet, - /// Token frequency map for cosine similarity - token_frequency: HashMap, -} - -impl NormalizedPattern { - fn new(pattern: &str) -> Self { - // Normalize: lowercase and remove ALL punctuation - let normalized = pattern - .to_lowercase() - .chars() - .filter(|c| c.is_alphanumeric() || c.is_whitespace()) - .collect::() - .split_whitespace() - .collect::>() - .join(" "); - - // Generate ngrams - let char_ngram_set: HashSet = normalized - .chars() - .collect::>() - .windows(NGRAM_SIZE) - .map(|w| w.iter().collect::()) - .collect(); - - // Compute token frequency map - let tokens: Vec = normalized - .split_whitespace() - .map(|s| s.to_string()) - .collect(); - let mut token_frequency: HashMap = HashMap::new(); - for token in tokens { - *token_frequency.entry(token).or_insert(0) += 1; - } - - Self { - raw: pattern.to_string(), - char_ngram_set, - token_frequency, - } - } -} - -/// Helper to create a static slice of normalized patterns -fn normalize_patterns(patterns: &[&str]) -> Vec { - patterns.iter().map(|p| NormalizedPattern::new(p)).collect() -} - -// ============================================================================ -// Pre-computed Pattern Caches (initialized once at startup) -// ============================================================================ - -static REPAIR_PATTERNS: LazyLock> = LazyLock::new(|| { - normalize_patterns(&[ - // Explicit corrections - "i meant", - "i mean", - "sorry, i meant", - "what i meant was", - "what i actually meant", - "i was trying to say", - "let me correct that", - "correction", - "i misspoke", - // Negations and disagreements - "no, i", - "no i", - "nah i", - "nope i", - "not what i", - "that's not", - "that's not what", - "that isn't what", - "not quite", - "not exactly", - // Rephrasing indicators - "let me rephrase", - "let me try again", - "let me clarify", - "to clarify", - "to be clear", - "let me explain", - "what i'm trying to", - "what i'm saying", - "in other words", - // Actual/really emphasis - "actually i", - "actually no", - "what i actually", - "i actually", - "i really meant", - // Mistake acknowledgment - "i was wrong", - "my mistake", - "my bad", - "i should have said", - "i should clarify", - // Wait/hold indicators - "wait, i", - "wait no", - "hold on", - "hang on", - ]) -}); - -static COMPLAINT_PATTERNS: LazyLock> = LazyLock::new(|| { - normalize_patterns(&[ - // Useless/unhelpful (multi-word only) - "this is useless", - "not helpful", - "doesn't help", - "not helping", - "you're not helping", - "no help", - "unhelpful", - // Not working - "this doesn't work", - "doesn't work", - "not working", - "isn't working", - "won't work", - "still doesn't work", - "still not working", - // Not fixing/solving - 
"doesn't fix", - "not fixing", - "doesn't solve", - "doesn't seem to work", - "doesn't seem to fix", - "not resolving", - // Waste/pointless - "waste of time", - "wasting my time", - // Ridiculous/absurd - "this is ridiculous", - "ridiculous", - "this is absurd", - "absurd", - "this is insane", - "insane", - // Stupid/dumb (as adjectives, not as standalone tokens) - "this is stupid", - "this is dumb", - // Quality complaints (multi-word) - "this sucks", - "not good enough", - // Capability questions - "why can't you", - "can't you", - // Frustration - "this is frustrating", - "frustrated", - "incomplete", - "overwhelm", - "overwhelmed", - "overwhelming", - "exhausted", - "struggled", - // same issue - "same issue", - // polite dissatisfaction - "i'm disappointed", - "thanks, but", - "appreciate it, but", - "good, but", - // Fed up/done - "i give up", - "give up", - "fed up", - "had enough", - "can't take", - // Bot-specific complaints - "useless bot", - "dumb bot", - "stupid bot", - ]) -}); - -static CONFUSION_PATTERNS: LazyLock> = LazyLock::new(|| { - normalize_patterns(&[ - // Don't understand - "i don't understand", - "don't understand", - "not understanding", - "can't understand", - "don't get it", - "don't follow", - // Confused state - "i'm confused", - "so confused", - // Makes no sense - "makes no sense", - "doesn't make sense", - "not making sense", - // What do you mean (keep multi-word) - "what do you mean", - "what does that mean", - "what are you saying", - // Lost/unclear - "i'm lost", - "totally lost", - "lost me", - // No clue - "no clue", - "no idea", - // Come again - "come again", - "say that again", - "repeat that", - ]) -}); - -static GRATITUDE_PATTERNS: LazyLock> = LazyLock::new(|| { - normalize_patterns(&[ - // Standard gratitude - "thank you", - "thanks", - "thank u", - "thankyou", - "thx", - "ty", - "tyvm", - "tysm", - "thnx", - "thnks", - // Strong gratitude - "thanks so much", - "thank you so much", - "thanks a lot", - "thanks a bunch", - "much appreciated", - "really appreciate", - "greatly appreciate", - "appreciate it", - "appreciate that", - "i appreciate", - "grateful", - "so grateful", - // Helpfulness acknowledgment - "that's helpful", - "very helpful", - "super helpful", - "really helpful", - "that helps", - "this helps", - "helpful", - // Perfection expressions - "perfect", - "that's perfect", - "just perfect", - "exactly what i needed", - "exactly right", - "just what i needed", - "that's exactly", - // Informal positive - "you're the best", - "you rock", - "you're awesome", - "awesome sauce", - "legend", - ]) -}); - -static SATISFACTION_PATTERNS: LazyLock> = LazyLock::new(|| { - normalize_patterns(&[ - // Works/functions - "that works", - "this works", - "works great", - "works perfectly", - "works for me", - // Great variations - "that's great", - "that's amazing", - "this is great", - "sounds great", - "looks great", - "great job", - // Excellent/perfect - "excellent", - "outstanding", - "superb", - "spectacular", - // Awesome/amazing - "awesome", - "that's awesome", - "amazing", - "incredible", - // Love expressions - "love it", - "love this", - "i love", - "loving it", - "love that", - // Brilliant/wonderful - "brilliant", - "wonderful", - "fantastic", - "fabulous", - "marvelous", - ]) -}); - -static SUCCESS_PATTERNS: LazyLock> = LazyLock::new(|| { - normalize_patterns(&[ - // Understanding confirmation - "got it", - "i got it", - "understand", - "understood", - "i understand", - "makes sense", - "clear now", - "i see", - // Success/completion - 
"success", - "successful", - "it worked", - "that worked", - "this worked", - "worked", - // Problem resolution - "solved", - "resolved", - "fixed", - "fixed it", - "issue resolved", - "problem solved", - // Working state - "working now", - "it's working", - "works now", - "working fine", - "working great", - // Completion - "all set", - "all good", - "we're good", - "i'm good", - "all done", - "done", - "complete", - "finished", - // Perfect fit - "spot on", - "nailed it", - "bingo", - "exactly", - "just right", - ]) -}); - -static HUMAN_AGENT_PATTERNS: LazyLock> = LazyLock::new(|| { - normalize_patterns(&[ - // Speak to human - "speak to a human", - "speak to human", - "speak with a human", - "speak with human", - "talk to a human", - "talk to human", - "talk to a person", - "talk to person", - "talk to someone", - // Human/real agent - "human agent", - "real agent", - "actual agent", - "live agent", - "human support", - // Real/actual person - "real person", - "actual person", - "real human", - "actual human", - "someone real", - // Need/want human - "need a human", - "need human", - "want a human", - "want human", - "get me a human", - "get me human", - "get me someone", - // Transfer/connect - "transfer me", - "connect me", - "escalate this", - // Representative (removed standalone "rep" - too many false positives) - "representative", - "customer service rep", - "customer service representative", - // Not a bot - "not a bot", - "not talking to a bot", - "tired of bots", - ]) -}); - -static SUPPORT_PATTERNS: LazyLock> = LazyLock::new(|| { - normalize_patterns(&[ - // Contact support - "contact support", - "call support", - "reach support", - "get support", - // Customer support - "customer support", - "customer service", - "tech support", - "technical support", - // Help desk - "help desk", - "helpdesk", - "support desk", - // Talk to support - "talk to support", - "speak to support", - "speak with support", - "chat with support", - // Need help - "need real help", - "need actual help", - "help me now", - ]) -}); - -static QUIT_PATTERNS: LazyLock> = LazyLock::new(|| { - normalize_patterns(&[ - // Give up - "i give up", - "give up", - "giving up", - // Quit/leaving - "i'm going to quit", - "i quit", - "quitting", - "i'm leaving", - "i'm done", - "i'm out", - // Forget it - "forget it", - "forget this", - "screw it", - "screw this", - // Never mind - "never mind", - "nevermind", - "don't bother", - "not worth it", - // Hopeless - "this is hopeless", - // Going elsewhere - "going elsewhere", - "try somewhere else", - "look elsewhere", - "find another", - ]) -}); - -// ============================================================================ -// Core Signal Types -// ============================================================================ - -/// Overall quality assessment for an agent interaction session -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -pub enum InteractionQuality { - /// Excellent interaction with strong positive signals - Excellent, - /// Good interaction with mostly positive signals - Good, - /// Neutral interaction with mixed signals - Neutral, - /// Poor interaction with concerning signals - Poor, - /// Critical interaction with severe negative signals - Severe, -} - -/// Container for all computed signals for a conversation -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct SignalReport { - /// Turn count and efficiency metrics - pub turn_count: TurnCountSignal, - /// Follow-up and repair frequency - pub follow_up: FollowUpSignal, - /// 
User frustration indicators - pub frustration: FrustrationSignal, - /// Repetition and looping behavior - pub repetition: RepetitionSignal, - /// Positive feedback indicators - pub positive_feedback: PositiveFeedbackSignal, - /// User escalation requests - pub escalation: EscalationSignal, - /// Overall quality assessment - pub overall_quality: InteractionQuality, - /// Human-readable summary - pub summary: String, -} - -// ============================================================================ -// Individual Signal Types -// ============================================================================ - -/// Turn count and efficiency metrics -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct TurnCountSignal { - /// Total number of turns (user-agent exchanges) - pub total_turns: usize, - /// Number of user messages - pub user_turns: usize, - /// Number of assistant messages - pub assistant_turns: usize, - /// Whether the turn count is concerning (> 7) - pub is_concerning: bool, - /// Whether the turn count is excessive (> 12) - pub is_excessive: bool, - /// Efficiency score (0.0-1.0, lower turns = higher score) - pub efficiency_score: f64, -} - -/// Follow-up and repair frequency signal -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct FollowUpSignal { - /// Number of detected repair attempts - pub repair_count: usize, - /// Ratio of repairs to total user turns - pub repair_ratio: f64, - /// Whether repair ratio is concerning (> 0.3) - pub is_concerning: bool, - /// List of detected repair phrases - pub repair_phrases: Vec, -} - -/// User frustration indicators -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct FrustrationSignal { - /// Number of frustration indicators detected - pub frustration_count: usize, - /// Whether frustration is detected - pub has_frustration: bool, - /// Severity level (0-3: none, mild, moderate, severe) - pub severity: u8, - /// List of detected frustration indicators - pub indicators: Vec, -} - -/// Individual frustration indicator -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct FrustrationIndicator { - /// Type of frustration detected - pub indicator_type: FrustrationType, - /// Message index where detected - pub message_index: usize, - /// Relevant text snippet - pub snippet: String, -} - -/// Types of frustration indicators -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -pub enum FrustrationType { - /// Negative sentiment detected - NegativeSentiment, - /// All caps typing - AllCaps, - /// Excessive punctuation - ExcessivePunctuation, - /// Profanity detected - Profanity, - /// Direct complaint - DirectComplaint, - /// Expression of confusion - Confusion, -} - -/// Repetition and looping behavior signal -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct RepetitionSignal { - /// Number of repetitions detected - pub repetition_count: usize, - /// Whether significant looping detected (> 2 repetitions) - pub has_looping: bool, - /// Severity level (0-3: none, mild, moderate, severe) - pub severity: u8, - /// List of detected repetitions - pub repetitions: Vec, -} - -/// Individual repetition instance -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct RepetitionInstance { - /// Message indices involved in repetition - pub message_indices: Vec, - /// Similarity score (0.0-1.0) - pub similarity: f64, - /// Type of repetition - pub repetition_type: RepetitionType, -} - -/// Types of repetition -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -pub enum RepetitionType { - /// 
Exact repetition - Exact, - /// Near-duplicate (high similarity) - NearDuplicate, - /// Semantic repetition (similar meaning) - Semantic, -} - -/// Positive feedback indicators -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct PositiveFeedbackSignal { - /// Number of positive indicators detected - pub positive_count: usize, - /// Whether positive feedback is present - pub has_positive_feedback: bool, - /// Confidence score (0.0-1.0) - pub confidence: f64, - /// List of detected positive indicators - pub indicators: Vec, -} - -/// Individual positive indicator -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct PositiveIndicator { - /// Type of positive feedback - pub indicator_type: PositiveType, - /// Message index where detected - pub message_index: usize, - /// Relevant text snippet - pub snippet: String, -} - -/// Types of positive indicators -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -pub enum PositiveType { - /// Expression of gratitude - Gratitude, - /// Explicit satisfaction - Satisfaction, - /// Confirmation of success - Success, - /// Positive sentiment - PositiveSentiment, - /// Natural topic transition - TopicTransition, -} - -/// User escalation signal -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct EscalationSignal { - /// Whether escalation was requested - pub escalation_requested: bool, - /// Number of escalation requests - pub escalation_count: usize, - /// List of detected escalation requests - pub requests: Vec, -} - -/// Individual escalation request -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct EscalationRequest { - /// Message index where detected - pub message_index: usize, - /// Relevant text snippet - pub snippet: String, - /// Type of escalation - pub escalation_type: EscalationType, -} - -/// Types of escalation -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -pub enum EscalationType { - /// Request for human agent - HumanAgent, - /// Request for support - Support, - /// Threat to quit/leave - ThreatToQuit, - /// General help request - HelpRequest, -} - -// ============================================================================ -// Signal Analyzer -// ============================================================================ - -/// Trait for analyzing conversation signals -pub trait SignalAnalyzer { - /// Analyze a conversation and generate a complete signal report - fn analyze(&self, messages: &[Message]) -> SignalReport; -} - -/// Text-based implementation of signal analyzer that computes all signals from a message array -pub struct TextBasedSignalAnalyzer { - /// Baseline expected turns for normal interactions - baseline_turns: usize, - /// Threshold for character ngram similarity (0.0-1.0) - char_ngram_threshold: f64, - /// Threshold for token cosine similarity (0.0-1.0) - token_cosine_threshold: f64, - /// Maximum message length in characters (prevents unbounded computation) - max_message_length: usize, - /// Maximum number of messages to process (prevents unbounded computation) - max_messages: usize, - /// Maximum window size for repetition detection (prevents O(n²) explosion) - max_repetition_window: usize, -} - -impl TextBasedSignalAnalyzer { - /// Extract text content from MessageContent, skipping non-text content - fn extract_text(content: &Option) -> Option { - match content { - Some(hermesllm::apis::openai::MessageContent::Text(text)) => Some(text.clone()), - // Tool calls and other structured content are skipped - _ => None, - } - } - - /// Create a new signal 
analyzer with default settings - pub fn new() -> Self { +impl Default for SignalAnalyzerConfig { + fn default() -> Self { Self { baseline_turns: 5, - char_ngram_threshold: 0.50, // Lowered to handle typos and small edits realistically - token_cosine_threshold: 0.60, // Lowered for better semantic match in varied contexts - max_message_length: 2000, // Prevent unbounded ngram generation - max_messages: 100, // Prevent unbounded message processing - max_repetition_window: 20, // Prevent O(n²) explosion in repetition detection - } - } - - /// Create a new signal analyzer with custom baseline - pub fn with_baseline(baseline_turns: usize) -> Self { - Self { - baseline_turns, - char_ngram_threshold: 0.50, + char_ngram_threshold: 0.65, token_cosine_threshold: 0.60, max_message_length: 2000, max_messages: 100, - max_repetition_window: 20, } } - - /// Create a new signal analyzer with custom settings - /// - /// # Arguments - /// * `baseline_turns` - Expected baseline turns for normal interactions - /// * `char_ngram_threshold` - Threshold for character ngram similarity (0.0-1.0) - /// * `token_cosine_threshold` - Threshold for token cosine similarity (0.0-1.0) - pub fn with_settings( - baseline_turns: usize, - char_ngram_threshold: f64, - token_cosine_threshold: f64, - ) -> Self { - Self { - baseline_turns, - char_ngram_threshold, - token_cosine_threshold, - max_message_length: 2000, - max_messages: 100, - max_repetition_window: 20, - } - } - - /// Create a new signal analyzer with full custom settings including computation limits - /// - /// # Arguments - /// * `baseline_turns` - Expected baseline turns for normal interactions - /// * `char_ngram_threshold` - Threshold for character ngram similarity (0.0-1.0) - /// * `token_cosine_threshold` - Threshold for token cosine similarity (0.0-1.0) - /// * `max_message_length` - Maximum characters per message to process - /// * `max_messages` - Maximum number of messages to process - /// * `max_repetition_window` - Maximum messages to compare for repetition detection - pub fn with_full_settings( - baseline_turns: usize, - char_ngram_threshold: f64, - token_cosine_threshold: f64, - max_message_length: usize, - max_messages: usize, - max_repetition_window: usize, - ) -> Self { - Self { - baseline_turns, - char_ngram_threshold, - token_cosine_threshold, - max_message_length, - max_messages, - max_repetition_window, - } - } - - // ======================================================================== - // Individual Signal Analyzers - // ======================================================================== - - /// Analyze turn count and efficiency - fn analyze_turn_count(&self, messages: &[Message]) -> TurnCountSignal { - let mut user_turns = 0; - let mut assistant_turns = 0; - - for message in messages { - match message.role { - Role::User => user_turns += 1, - Role::Assistant => assistant_turns += 1, - _ => {} - } - } - - let total_turns = user_turns + assistant_turns; - let is_concerning = total_turns > 7; - let is_excessive = total_turns > 12; - - // Calculate efficiency score (exponential decay after baseline) - let efficiency_score = if total_turns == 0 || total_turns <= self.baseline_turns { - 1.0 - } else { - let excess = total_turns - self.baseline_turns; - 1.0 / (1.0 + (excess as f64 * 0.3)) - }; - - TurnCountSignal { - total_turns, - user_turns, - assistant_turns, - is_concerning, - is_excessive, - efficiency_score, - } - } - - /// Analyze follow-up and repair frequency - fn analyze_follow_up( - &self, - normalized_messages: &[(usize, Role, 
NormalizedMessage)], - ) -> FollowUpSignal { - let mut repair_count = 0; - let mut repair_phrases = Vec::new(); - let mut user_turn_count = 0; - - for (i, role, norm_msg) in normalized_messages { - if *role != Role::User { - continue; - } - - user_turn_count += 1; - - // Use per-turn boolean to prevent double-counting - let mut found_in_turn = false; - - // Use pre-computed patterns for fast matching - for pattern in REPAIR_PATTERNS.iter() { - if norm_msg.matches_normalized_pattern( - pattern, - self.char_ngram_threshold, - self.token_cosine_threshold, - ) { - repair_count += 1; - repair_phrases.push(format!("Turn {}: '{}'", i + 1, pattern.raw)); - found_in_turn = true; - break; - } - } - - // Only check for semantic similarity if no pattern matched - if !found_in_turn && *i >= 2 { - // Find previous user message - for j in (0..*i).rev() { - let (_, prev_role, prev_norm_msg) = &normalized_messages[j]; - if *prev_role == Role::User { - if self.is_similar_rephrase(norm_msg, prev_norm_msg) { - repair_count += 1; - repair_phrases - .push(format!("Turn {}: Similar rephrase detected", i + 1)); - } - break; - } - } - } - } - - let repair_ratio = if user_turn_count == 0 { - 0.0 - } else { - repair_count as f64 / user_turn_count as f64 - }; - - let is_concerning = repair_ratio > 0.3; - - FollowUpSignal { - repair_count, - repair_ratio, - is_concerning, - repair_phrases, - } - } - - /// Analyze user frustration indicators - fn analyze_frustration( - &self, - normalized_messages: &[(usize, Role, NormalizedMessage)], - ) -> FrustrationSignal { - let mut indicators = Vec::new(); - - // Profanity list - only as standalone tokens, not substrings - let profanity_tokens = [ - "damn", "damnit", "crap", "wtf", "ffs", "bullshit", "shit", "fuck", "fucking", - ]; - - for (i, role, norm_msg) in normalized_messages { - if *role != Role::User { - continue; - } - - let text = &norm_msg.raw; - - // Check for all caps (at least 10 chars and 80% uppercase) - let alpha_chars: String = text.chars().filter(|c| c.is_alphabetic()).collect(); - if alpha_chars.len() >= 10 { - let upper_count = alpha_chars.chars().filter(|c| c.is_uppercase()).count(); - let upper_ratio = upper_count as f64 / alpha_chars.len() as f64; - if upper_ratio >= 0.8 { - indicators.push(FrustrationIndicator { - indicator_type: FrustrationType::AllCaps, - message_index: *i, - snippet: text.chars().take(50).collect(), - }); - } - } - - // Check for excessive punctuation - let question_marks = text.matches('?').count(); - let exclamation_marks = text.matches('!').count(); - if question_marks >= 3 || exclamation_marks >= 3 { - indicators.push(FrustrationIndicator { - indicator_type: FrustrationType::ExcessivePunctuation, - message_index: *i, - snippet: text.chars().take(50).collect(), - }); - } - - // Check for complaint patterns using pre-computed patterns - for pattern in COMPLAINT_PATTERNS.iter() { - if norm_msg.matches_normalized_pattern( - pattern, - self.char_ngram_threshold, - self.token_cosine_threshold, - ) { - indicators.push(FrustrationIndicator { - indicator_type: FrustrationType::DirectComplaint, - message_index: *i, - snippet: pattern.raw.clone(), - }); - break; - } - } - - // Check for confusion patterns using pre-computed patterns - for pattern in CONFUSION_PATTERNS.iter() { - if norm_msg.matches_normalized_pattern( - pattern, - self.char_ngram_threshold, - self.token_cosine_threshold, - ) { - indicators.push(FrustrationIndicator { - indicator_type: FrustrationType::Confusion, - message_index: *i, - snippet: pattern.raw.clone(), - }); - 
break; - } - } - - // Check for profanity (token-based, not substring) - for token in &profanity_tokens { - if norm_msg.contains_token(token) { - indicators.push(FrustrationIndicator { - indicator_type: FrustrationType::Profanity, - message_index: *i, - snippet: token.to_string(), - }); - break; - } - } - } - - let frustration_count = indicators.len(); - let has_frustration = frustration_count > 0; - - // Calculate severity - let severity = if frustration_count == 0 { - 0 - } else if frustration_count <= 2 { - 1 - } else if frustration_count <= 4 { - 2 - } else { - 3 - }; - - FrustrationSignal { - frustration_count, - has_frustration, - severity, - indicators, - } - } - - /// Analyze repetition and looping behavior - fn analyze_repetition( - &self, - normalized_messages: &[(usize, Role, NormalizedMessage)], - ) -> RepetitionSignal { - let mut repetitions = Vec::new(); - - // Collect assistant messages with normalized content - let assistant_messages: Vec<(usize, &NormalizedMessage)> = normalized_messages - .iter() - .filter(|(_, role, _)| *role == Role::Assistant) - .map(|(i, _, norm_msg)| (*i, norm_msg)) - .collect(); - - // Limit the window size to prevent O(n²) explosion - // Only compare messages within the max_repetition_window - let window_size = self.max_repetition_window.min(assistant_messages.len()); - - // Check for exact or near-duplicate responses using bigram similarity - // Only compare within the sliding window - for i in 0..assistant_messages.len() { - let window_start = i + 1; - let window_end = (i + 1 + window_size).min(assistant_messages.len()); - - for j in window_start..window_end { - let (idx_i, norm_msg_i) = &assistant_messages[i]; - let (idx_j, norm_msg_j) = &assistant_messages[j]; - - // Skip if messages are too short - if norm_msg_i.tokens.len() < 5 || norm_msg_j.tokens.len() < 5 { - continue; - } - - // Calculate bigram-based similarity (more accurate for near-duplicates) - let similarity = self.calculate_bigram_similarity(norm_msg_i, norm_msg_j); - - // Exact match - lowered from 0.95 to 0.85 for bigram similarity - if similarity >= 0.85 { - repetitions.push(RepetitionInstance { - message_indices: vec![*idx_i, *idx_j], - similarity, - repetition_type: RepetitionType::Exact, - }); - } - // Near duplicate - lowered from 0.75 to 0.50 to catch subtle repetitions - else if similarity >= 0.50 { - repetitions.push(RepetitionInstance { - message_indices: vec![*idx_i, *idx_j], - similarity, - repetition_type: RepetitionType::NearDuplicate, - }); - } - } - } - - let repetition_count = repetitions.len(); - let has_looping = repetition_count > 2; - - let severity = if repetition_count == 0 { - 0 - } else if repetition_count <= 2 { - 1 - } else if repetition_count <= 4 { - 2 - } else { - 3 - }; - - RepetitionSignal { - repetition_count, - has_looping, - severity, - repetitions, - } - } - - /// Calculate bigram similarity using cached bigram sets - fn calculate_bigram_similarity( - &self, - norm_msg1: &NormalizedMessage, - norm_msg2: &NormalizedMessage, - ) -> f64 { - // Use pre-cached bigram sets for O(1) lookups - let set1 = &norm_msg1.bigram_set; - let set2 = &norm_msg2.bigram_set; - - if set1.is_empty() && set2.is_empty() { - return 1.0; // Both empty = identical - } - - if set1.is_empty() || set2.is_empty() { - return 0.0; - } - - let intersection = set1.intersection(set2).count(); - let union = set1.union(set2).count(); - - if union == 0 { - return 0.0; - } - - intersection as f64 / union as f64 - } - - /// Analyze positive feedback indicators - fn 
analyze_positive_feedback( - &self, - normalized_messages: &[(usize, Role, NormalizedMessage)], - ) -> PositiveFeedbackSignal { - let mut indicators = Vec::new(); - - for (i, role, norm_msg) in normalized_messages { - if *role != Role::User { - continue; - } - - // Use per-turn boolean to prevent double-counting - let mut found_in_turn = false; - - // Check gratitude using pre-computed patterns - for pattern in GRATITUDE_PATTERNS.iter() { - if norm_msg.matches_normalized_pattern( - pattern, - self.char_ngram_threshold, - self.token_cosine_threshold, - ) { - indicators.push(PositiveIndicator { - indicator_type: PositiveType::Gratitude, - message_index: *i, - snippet: pattern.raw.clone(), - }); - found_in_turn = true; - break; - } - } - - if found_in_turn { - continue; - } - - // Check satisfaction using pre-computed patterns - for pattern in SATISFACTION_PATTERNS.iter() { - if norm_msg.matches_normalized_pattern( - pattern, - self.char_ngram_threshold, - self.token_cosine_threshold, - ) { - indicators.push(PositiveIndicator { - indicator_type: PositiveType::Satisfaction, - message_index: *i, - snippet: pattern.raw.clone(), - }); - found_in_turn = true; - break; - } - } - - if found_in_turn { - continue; - } - - // Check success confirmation using pre-computed patterns - for pattern in SUCCESS_PATTERNS.iter() { - if norm_msg.matches_normalized_pattern( - pattern, - self.char_ngram_threshold, - self.token_cosine_threshold, - ) { - indicators.push(PositiveIndicator { - indicator_type: PositiveType::Success, - message_index: *i, - snippet: pattern.raw.clone(), - }); - break; - } - } - } - - let positive_count = indicators.len(); - let has_positive_feedback = positive_count > 0; - - // Calculate confidence based on number and diversity of indicators - let confidence = if positive_count == 0 { - 0.0 - } else if positive_count == 1 { - 0.6 - } else if positive_count == 2 { - 0.8 - } else { - 0.95 - }; - - PositiveFeedbackSignal { - positive_count, - has_positive_feedback, - confidence, - indicators, - } - } - - /// Analyze user escalation requests - fn analyze_escalation( - &self, - normalized_messages: &[(usize, Role, NormalizedMessage)], - ) -> EscalationSignal { - let mut requests = Vec::new(); - - for (i, role, norm_msg) in normalized_messages { - if *role != Role::User { - continue; - } - - let mut found_human_agent = false; - - // Check for human agent request using pre-computed patterns - for pattern in HUMAN_AGENT_PATTERNS.iter() { - if norm_msg.matches_normalized_pattern( - pattern, - self.char_ngram_threshold, - self.token_cosine_threshold, - ) { - requests.push(EscalationRequest { - message_index: *i, - snippet: pattern.raw.clone(), - escalation_type: EscalationType::HumanAgent, - }); - found_human_agent = true; - break; - } - } - - // Check for support request (only if no human agent request found) - // HumanAgent and Support are too similar and often match the same phrase - if !found_human_agent { - for pattern in SUPPORT_PATTERNS.iter() { - if norm_msg.matches_normalized_pattern( - pattern, - self.char_ngram_threshold, - self.token_cosine_threshold, - ) { - requests.push(EscalationRequest { - message_index: *i, - snippet: pattern.raw.clone(), - escalation_type: EscalationType::Support, - }); - break; - } - } - } - - // Check for quit threats (independent of HumanAgent/Support) - // A message can contain both "give up" (quit) and "speak to human" (escalation) - for pattern in QUIT_PATTERNS.iter() { - if norm_msg.matches_normalized_pattern( - pattern, - self.char_ngram_threshold, - 
self.token_cosine_threshold, - ) { - requests.push(EscalationRequest { - message_index: *i, - snippet: pattern.raw.clone(), - escalation_type: EscalationType::ThreatToQuit, - }); - break; - } - } - } - - let escalation_count = requests.len(); - let escalation_requested = escalation_count > 0; - - EscalationSignal { - escalation_requested, - escalation_count, - requests, - } - } - - // ======================================================================== - // Helper Methods - // ======================================================================== - - /// Check if two messages are similar rephrases - fn is_similar_rephrase( - &self, - norm_msg1: &NormalizedMessage, - norm_msg2: &NormalizedMessage, - ) -> bool { - // Skip if too short - if norm_msg1.tokens.len() < 3 || norm_msg2.tokens.len() < 3 { - return false; - } - - // Common stopwords to downweight - let stopwords: HashSet<&str> = [ - "i", "me", "my", "you", "the", "a", "an", "is", "are", "was", "were", "to", "with", - "for", "of", "at", "by", "in", "on", "it", "this", "that", "can", "could", "do", - "does", "did", "will", "would", "should", "be", - ] - .iter() - .cloned() - .collect(); - - // Filter out stopwords for meaningful overlap - let tokens1: HashSet<_> = norm_msg1 - .tokens - .iter() - .filter(|t| !stopwords.contains(t.as_str())) - .collect(); - let tokens2: HashSet<_> = norm_msg2 - .tokens - .iter() - .filter(|t| !stopwords.contains(t.as_str())) - .collect(); - - // Need at least 2 non-stopword tokens - if tokens1.len() < 2 || tokens2.len() < 2 { - return false; - } - - let intersection = tokens1.intersection(&tokens2).count(); - let min_size = tokens1.len().min(tokens2.len()); - - // High overlap suggests rephrase - let overlap_ratio = intersection as f64 / min_size as f64; - overlap_ratio >= 0.6 - } - - /// Assess overall interaction quality based on all signals - fn assess_overall_quality( - &self, - turn_count: &TurnCountSignal, - follow_up: &FollowUpSignal, - frustration: &FrustrationSignal, - repetition: &RepetitionSignal, - positive: &PositiveFeedbackSignal, - escalation: &EscalationSignal, - ) -> InteractionQuality { - // Critical conditions - immediate fail - if escalation.escalation_requested - || frustration.severity >= 3 - || repetition.severity >= 3 - || turn_count.is_excessive - { - return InteractionQuality::Severe; - } - - // Calculate quality score - let mut score = 50.0; // Start at neutral - - // Positive factors - if positive.has_positive_feedback { - score += 20.0 * positive.confidence; - } - score += turn_count.efficiency_score * 10.0; - - // Negative factors - if frustration.has_frustration { - score -= frustration.severity as f64 * 10.00; - } - if follow_up.is_concerning { - score -= 15.0; - } - if repetition.has_looping { - score -= repetition.severity as f64 * 8.0; - } - if turn_count.is_concerning { - score -= 10.0; - } - - // Map score to quality level - if score >= 75.0 { - InteractionQuality::Excellent - } else if score >= 60.0 { - InteractionQuality::Good - } else if score >= 40.0 { - InteractionQuality::Neutral - } else if score >= 25.0 { - InteractionQuality::Poor - } else { - InteractionQuality::Severe - } - } - - /// Generate human-readable summary - #[allow(clippy::too_many_arguments)] - fn generate_summary( - &self, - turn_count: &TurnCountSignal, - follow_up: &FollowUpSignal, - frustration: &FrustrationSignal, - repetition: &RepetitionSignal, - positive: &PositiveFeedbackSignal, - escalation: &EscalationSignal, - quality: &InteractionQuality, - ) -> String { - let mut summary_parts 
= Vec::new();
-
-        summary_parts.push(format!("Overall Quality: {:?}", quality));
-
-        summary_parts.push(format!(
-            "Turn Count: {} turns (efficiency: {:.1}%)",
-            turn_count.total_turns,
-            turn_count.efficiency_score * 100.0
-        ));
-
-        if follow_up.is_concerning {
-            summary_parts.push(format!(
-                "⚠️ High repair rate: {:.1}% of user turns",
-                follow_up.repair_ratio * 100.0
-            ));
-        }
-
-        if frustration.has_frustration {
-            summary_parts.push(format!(
-                "⚠️ Frustration detected: {} indicators (severity: {})",
-                frustration.frustration_count, frustration.severity
-            ));
-        }
-
-        if repetition.has_looping {
-            summary_parts.push(format!(
-                "⚠️ Looping detected: {} repetitions",
-                repetition.repetition_count
-            ));
-        }
-
-        if positive.has_positive_feedback {
-            summary_parts.push(format!(
-                "✓ Positive feedback: {} indicators",
-                positive.positive_count
-            ));
-        }
-
-        if escalation.escalation_requested {
-            summary_parts.push(format!(
-                "⚠️ Escalation requested: {} requests",
-                escalation.escalation_count
-            ));
-        }
-
-        summary_parts.join(" | ")
-    }
 }

-impl SignalAnalyzer for TextBasedSignalAnalyzer {
-    fn analyze(&self, messages: &[Message]) -> SignalReport {
-        // Limit the number of messages to process (take most recent messages)
-        let messages_to_process = if messages.len() > self.max_messages {
-            &messages[messages.len() - self.max_messages..]
+/// Top-level analyzer.
+pub struct SignalAnalyzer {
+    cfg: SignalAnalyzerConfig,
+}
+
+impl Default for SignalAnalyzer {
+    fn default() -> Self {
+        Self::new(SignalAnalyzerConfig::default())
+    }
+}
+
+impl SignalAnalyzer {
+    pub fn new(cfg: SignalAnalyzerConfig) -> Self {
+        Self { cfg }
+    }
+
+    /// Run the full multi-layer analysis on a ShareGPT-shaped conversation.
+    pub fn analyze_sharegpt(&self, messages: &[ShareGptMessage<'_>]) -> SignalReport {
+        // Truncate to the last `max_messages` (last-N is what the Python does).
+        let slice: &[ShareGptMessage<'_>] = if messages.len() > self.cfg.max_messages {
+            &messages[messages.len() - self.cfg.max_messages..]
         } else {
             messages
         };
+        let offset = messages.len().saturating_sub(slice.len());

-        // Preprocess all messages once, filtering out non-text content (tool calls, etc.)
-        // and truncating long messages
-        let normalized_messages: Vec<(usize, Role, NormalizedMessage)> = messages_to_process
+        // Preprocess to absolute-indexed normalized human/gpt messages.
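+        // Only non-empty "human"/"gpt" rows are normalized here; the
+        // "function_call"/"observation" rows remain in `slice` for the
+        // execution- and environment-layer detectors invoked below.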
+        let normalized_owned: Vec<(usize, &str, NormalizedMessage)> = slice
             .iter()
             .enumerate()
-            .filter_map(|(i, msg)| {
-                Self::extract_text(&msg.content).map(|text| {
-                    (
-                        i,
-                        msg.role.clone(),
-                        NormalizedMessage::from_text_with_limit(&text, self.max_message_length),
-                    )
-                })
+            .filter_map(|(i, m)| {
+                if (m.from == "human" || m.from == "gpt") && !m.value.is_empty() {
+                    Some((
+                        offset + i,
+                        m.from,
+                        NormalizedMessage::from_text(m.value, self.cfg.max_message_length),
+                    ))
+                } else {
+                    None
+                }
             })
             .collect();

-        let turn_count = self.analyze_turn_count(messages_to_process);
-        let follow_up = self.analyze_follow_up(&normalized_messages);
-        let frustration = self.analyze_frustration(&normalized_messages);
-        let repetition = self.analyze_repetition(&normalized_messages);
-        let positive_feedback = self.analyze_positive_feedback(&normalized_messages);
-        let escalation = self.analyze_escalation(&normalized_messages);
-
-        let overall_quality = self.assess_overall_quality(
-            &turn_count,
-            &follow_up,
-            &frustration,
-            &repetition,
-            &positive_feedback,
-            &escalation,
+        let misalignment = analyze_misalignment(
+            &normalized_owned,
+            self.cfg.char_ngram_threshold,
+            self.cfg.token_cosine_threshold,
         );

-        let summary = self.generate_summary(
-            &turn_count,
-            &follow_up,
-            &frustration,
-            &repetition,
-            &positive_feedback,
-            &escalation,
-            &overall_quality,
+        let stagnation_input: Vec<ShareGptMsg<'_>> =
+            slice.iter().map(|m| ShareGptMsg { from: m.from }).collect();
+        let (mut stagnation, turn_metrics) = analyze_stagnation(
+            &stagnation_input,
+            &normalized_owned,
+            self.cfg.baseline_turns,
+        );
+
+        let disengagement = analyze_disengagement(
+            &normalized_owned,
+            self.cfg.char_ngram_threshold,
+            self.cfg.token_cosine_threshold,
+        );
+
+        let satisfaction = analyze_satisfaction(
+            &normalized_owned,
+            self.cfg.char_ngram_threshold,
+            self.cfg.token_cosine_threshold,
+        );
+
+        let failure = analyze_failure(slice);
+        let loops = analyze_loops(slice);
+        let exhaustion = analyze_exhaustion(slice);
+
+        // Shift the stagnation signals' message_index back into absolute coords.
+        for s in &mut stagnation.signals {
+            s.message_index = offset + s.message_index.min(slice.len().saturating_sub(1));
+        }
+
+        let interaction = InteractionSignals {
+            misalignment,
+            stagnation,
+            disengagement,
+            satisfaction,
+        };
+        let execution = ExecutionSignals { failure, loops };
+        let environment = EnvironmentSignals { exhaustion };
+
+        let (overall_quality, score) = assess_quality(
+            &interaction,
+            &execution,
+            &environment,
+            turn_metrics.user_turns,
+        );
+        let summary = generate_summary(
+            &turn_metrics,
+            &interaction,
+            &execution,
+            &environment,
+            overall_quality,
         );

         SignalReport {
-            turn_count,
-            follow_up,
-            frustration,
-            repetition,
-            positive_feedback,
-            escalation,
+            interaction,
+            execution,
+            environment,
             overall_quality,
+            quality_score: score,
+            turn_metrics,
             summary,
         }
     }
-}

-impl Default for TextBasedSignalAnalyzer {
-    fn default() -> Self {
-        Self::new()
+    /// Convenience entry point: convert OpenAI-shaped chat `Message`s into the
+    /// ShareGPT format the detectors operate on, then run analysis.
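+    ///
+    /// A minimal usage sketch (mirrors the unit tests below; `msgs` is a
+    /// hypothetical local slice of OpenAI-shaped `Message`s):
+    ///
+    /// ```ignore
+    /// let report = SignalAnalyzer::default().analyze_openai(&msgs);
+    /// println!("{} (score {:.1})", report.summary, report.quality_score);
+    /// ```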
+    pub fn analyze_openai(&self, messages: &[Message]) -> SignalReport {
+        let owned = messages_to_sharegpt(messages);
+        let view: Vec<ShareGptMessage<'_>> = owned
+            .iter()
+            .map(|(role, value)| ShareGptMessage {
+                from: role.as_str(),
+                value: value.as_str(),
+            })
+            .collect();
+        self.analyze_sharegpt(&view)
     }
 }

-// ============================================================================
-// Tests
-// ============================================================================
+/// Convert OpenAI-shaped messages to a sequence of ShareGPT
+/// `(role, value)` pairs.
+///
+/// Mapping (preserves original message order; tool calls are emitted as a
+/// separate `function_call` row immediately after the assistant text):
+///
+/// - `User` -> `("human", text)`
+/// - `Assistant` -> `("gpt", text)`, then one `("function_call", json)` per tool call
+/// - `Tool` -> `("observation", text)`
+/// - `System` / `Developer` -> dropped (not analyzed)
+pub fn messages_to_sharegpt(messages: &[Message]) -> Vec<(String, String)> {
+    let mut out: Vec<(String, String)> = Vec::with_capacity(messages.len());
+    for m in messages {
+        match m.role {
+            Role::User => {
+                let text = m.content.extract_text();
+                out.push(("human".to_string(), text));
+            }
+            Role::Assistant => {
+                let text = m.content.extract_text();
+                if !text.is_empty() {
+                    out.push(("gpt".to_string(), text));
+                }
+                if let Some(calls) = &m.tool_calls {
+                    for call in calls {
+                        let payload = serde_json::json!({
+                            "name": call.function.name,
+                            "arguments": call.function.arguments,
+                        });
+                        out.push(("function_call".to_string(), payload.to_string()));
+                    }
+                }
+            }
+            Role::Tool => {
+                let text = m.content.extract_text();
+                out.push(("observation".to_string(), text));
+            }
+            Role::System | Role::Developer => {}
+        }
+    }
+    out
+}
+
+// ---------------------------------------------------------------------------
+// Quality scoring (mirrors `_assess_quality` in the reference)
+// ---------------------------------------------------------------------------
+
+fn assess_quality(
+    interaction: &InteractionSignals,
+    execution: &ExecutionSignals,
+    environment: &EnvironmentSignals,
+    user_turns: usize,
+) -> (InteractionQuality, f32) {
+    // Critical: explicit escalation/quit OR severe disengagement OR severe stagnation.
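+    // Everything else starts from a neutral 50.0 baseline, gets the per-layer
+    // bonuses and penalties applied below, and is finally mapped to quality
+    // bands at 75/60/40/25.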
+    let has_escalation_or_quit = interaction.disengagement.signals.iter().any(|s| {
+        matches!(
+            s.signal_type,
+            SignalType::DisengagementEscalation | SignalType::DisengagementQuit
+        )
+    });
+    if (interaction.disengagement.count > 0 && has_escalation_or_quit)
+        || interaction.disengagement.severity >= 3
+        || interaction.stagnation.severity >= 3
+    {
+        return (InteractionQuality::Severe, 0.0);
+    }
+
+    let mut score: f32 = 50.0;
+
+    if interaction.satisfaction.count > 0 {
+        let confidence = match interaction.satisfaction.count {
+            1 => 0.6,
+            2 => 0.8,
+            _ => 0.95,
+        };
+        score += 20.0 * confidence;
+    }
+
+    if interaction.disengagement.count > 0 {
+        score -= interaction.disengagement.severity as f32 * 10.0;
+    }
+    if interaction.misalignment.severity > 0 && interaction.misalignment_ratio(user_turns) > 0.3 {
+        score -= 15.0;
+    }
+    if interaction.stagnation.count > 2 {
+        score -= interaction.stagnation.severity as f32 * 8.0;
+    }
+
+    if execution.failure.count > 0 {
+        score -= execution.failure.count as f32 * 8.0;
+    }
+    if execution.loops.count > 0 {
+        score -= execution.loops.count as f32 * 5.0;
+    }
+    if environment.exhaustion.count > 0 {
+        score -= environment.exhaustion.count as f32 * 3.0;
+    }
+
+    score = score.clamp(0.0, 100.0);
+
+    let quality = if score >= 75.0 {
+        InteractionQuality::Excellent
+    } else if score >= 60.0 {
+        InteractionQuality::Good
+    } else if score >= 40.0 {
+        InteractionQuality::Neutral
+    } else if score >= 25.0 {
+        InteractionQuality::Poor
+    } else {
+        InteractionQuality::Severe
+    };
+    (quality, score)
+}
+
+/// Render the per-conversation summary string.
+///
+/// Output is structurally grouped by the paper taxonomy so a reader can see
+/// at a glance which layer fired:
+///
+/// ```text
+/// Overall Quality: severe | Turns: 7 (efficiency: 71.4%)
+/// | Interaction — misalignment: 2 (sev 1), stagnation: 0, disengagement: 2 (sev 1), satisfaction: 0
+/// | Execution — failure: 0, loops: 0
+/// | Environment — exhaustion: 0
+/// | High misalignment rate: 50.0% of user turns
+/// | Escalation requested: 1
+/// ```
+///
+/// Layer headers are always present (even when their counts are all zero) so
+/// the taxonomy is visible by inspection. Quality-driving callouts —
+/// "high misalignment rate", "looping detected", "escalation requested" —
+/// are appended after the layer summary as a separate "alerts" tail.
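+///
+/// Severity values follow the 0-3 scale (none, mild, moderate, severe) that
+/// the previous implementation documented on its per-signal structs.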
+fn generate_summary(
+    turn_metrics: &TurnMetrics,
+    interaction: &InteractionSignals,
+    execution: &ExecutionSignals,
+    environment: &EnvironmentSignals,
+    quality: InteractionQuality,
+) -> String {
+    let mut parts: Vec<String> = Vec::new();
+    parts.push(format!("Overall Quality: {}", quality.as_str()));
+    parts.push(format!(
+        "Turns: {} (efficiency: {:.1}%)",
+        turn_metrics.total_turns,
+        turn_metrics.efficiency_score * 100.0
+    ));
+
+    parts.push(format!(
+        "Interaction \u{2014} {}, {}, {}, {}",
+        fmt_group("misalignment", &interaction.misalignment),
+        fmt_group("stagnation", &interaction.stagnation),
+        fmt_group("disengagement", &interaction.disengagement),
+        fmt_group("satisfaction", &interaction.satisfaction),
+    ));
+    parts.push(format!(
+        "Execution \u{2014} {}, {}",
+        fmt_group("failure", &execution.failure),
+        fmt_group("loops", &execution.loops),
+    ));
+    parts.push(format!(
+        "Environment \u{2014} {}",
+        fmt_group("exhaustion", &environment.exhaustion),
+    ));
+
+    if interaction.misalignment.count > 0 {
+        let misalignment_ratio = interaction.misalignment_ratio(turn_metrics.user_turns);
+        if misalignment_ratio > 0.3 {
+            parts.push(format!(
+                "High misalignment rate: {:.1}% of user turns",
+                misalignment_ratio * 100.0
+            ));
+        }
+    }
+    if interaction.stagnation.count > 2 {
+        parts.push(format!(
+            "Looping detected: {} repetitions",
+            interaction.stagnation.count
+        ));
+    }
+    let escalation_count = interaction
+        .disengagement
+        .signals
+        .iter()
+        .filter(|s| matches!(s.signal_type, SignalType::DisengagementEscalation))
+        .count();
+    if escalation_count > 0 {
+        parts.push(format!("Escalation requested: {}", escalation_count));
+    }
+
+    parts.join(" | ")
+}
+
+/// Render `"<name>: <count> (sev <severity>)"`, dropping the severity suffix
+/// when the count is zero (keeps the summary readable for clean conversations).
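+///
+/// For example, a group with `count: 2, severity: 1` renders as
+/// `"disengagement: 2 (sev 1)"`, while `count: 0` renders as plain
+/// `"disengagement: 0"`.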
+fn fmt_group(name: &str, group: &super::schemas::SignalGroup) -> String { + if group.count == 0 { + format!("{}: 0", name) + } else { + format!("{}: {} (sev {})", name, group.count, group.severity) + } +} #[cfg(test)] mod tests { use super::*; - use hermesllm::apis::openai::MessageContent; - use hermesllm::transforms::lib::ExtractText; - use std::time::Instant; + use hermesllm::apis::openai::{Message, MessageContent, Role}; + #[allow(unused_imports)] + use hermesllm::transforms::ExtractText; - fn create_message(role: Role, content: &str) -> Message { + fn user(t: &str) -> Message { Message { - role, - content: Some(MessageContent::Text(content.to_string())), + role: Role::User, + content: Some(MessageContent::Text(t.to_string())), + name: None, + tool_calls: None, + tool_call_id: None, + } + } + fn assistant(t: &str) -> Message { + Message { + role: Role::Assistant, + content: Some(MessageContent::Text(t.to_string())), name: None, tool_calls: None, tool_call_id: None, } } - // ======================================================================== - // Tests for New Similarity Methods - // ======================================================================== - #[test] - fn test_char_ngram_similarity_exact_match() { - let msg = NormalizedMessage::from_text("thank you very much"); - let similarity = msg.char_ngram_similarity("thank you very much"); - assert!( - similarity > 0.95, - "Exact match should have very high similarity" - ); + fn report_quality_neutral_for_short_clean_chat() { + let msgs = vec![ + user("Hello, can you help me with a question?"), + assistant("Of course, what's your question?"), + user("How does X work?"), + assistant("X works by ..."), + ]; + let r = SignalAnalyzer::default().analyze_openai(&msgs); + assert!(matches!( + r.overall_quality, + InteractionQuality::Neutral | InteractionQuality::Good | InteractionQuality::Excellent + )); + assert!(r.summary.starts_with("Overall Quality:")); } #[test] - fn test_char_ngram_similarity_typo() { - let msg = NormalizedMessage::from_text("thank you very much"); - // Common typo: "thnks" instead of "thanks" - let similarity = msg.char_ngram_similarity("thnks you very much"); - assert!( - similarity > 0.50, - "Should handle single-character typo with decent similarity: {}", - similarity - ); - } - - #[test] - fn test_char_ngram_similarity_small_edit() { - let msg = NormalizedMessage::from_text("this doesn't work"); - let similarity = msg.char_ngram_similarity("this doesnt work"); - assert!( - similarity > 0.70, - "Should handle punctuation removal gracefully: {}", - similarity - ); - } - - #[test] - fn test_char_ngram_similarity_word_insertion() { - let msg = NormalizedMessage::from_text("i don't understand"); - let similarity = msg.char_ngram_similarity("i really don't understand"); - assert!( - similarity > 0.40, - "Should be robust to word insertions: {}", - similarity - ); - } - - #[test] - fn test_token_cosine_similarity_exact_match() { - let msg = NormalizedMessage::from_text("this is not helpful"); - let similarity = msg.token_cosine_similarity("this is not helpful"); - assert!( - (similarity - 1.0).abs() < 0.01, - "Exact match should have cosine similarity of 1.0" - ); - } - - #[test] - fn test_token_cosine_similarity_word_order() { - let msg = NormalizedMessage::from_text("not helpful at all"); - let similarity = msg.token_cosine_similarity("helpful not at all"); - assert!( - similarity > 0.95, - "Should be robust to word order changes: {}", - similarity - ); - } - - #[test] - fn 
test_token_cosine_similarity_frequency() { - let msg = NormalizedMessage::from_text("help help help please"); - let similarity = msg.token_cosine_similarity("help please"); - assert!( - similarity > 0.7 && similarity < 1.0, - "Should account for frequency differences: {}", - similarity - ); - } - - #[test] - fn test_token_cosine_similarity_long_message_with_context() { - let msg = NormalizedMessage::from_text( - "I've been trying to set up my account for the past hour \ - and the verification email never arrived. I checked my spam folder \ - and still nothing. This is really frustrating and not helpful at all.", - ); - let similarity = msg.token_cosine_similarity("not helpful"); - assert!( - similarity > 0.15 && similarity < 0.7, - "Should detect pattern in long message with lower but non-zero similarity: {}", - similarity - ); - } - - #[test] - fn test_layered_matching_exact_hit() { - let msg = NormalizedMessage::from_text("thank you so much"); - assert!( - msg.layered_contains_phrase("thank you", 0.50, 0.60), - "Should match exact phrase in Layer 0" - ); - } - - #[test] - fn test_layered_matching_typo_hit() { - // Test that shows layered matching is more robust than exact matching alone - let msg = NormalizedMessage::from_text("it doesnt work for me"); - - // "doesnt work" should match "doesn't work" via character ngrams (high overlap) - assert!( - msg.layered_contains_phrase("doesn't work", 0.50, 0.60), - "Should match 'doesnt work' to 'doesn't work' via character ngrams" - ); - } - - #[test] - fn test_layered_matching_word_order_hit() { - let msg = NormalizedMessage::from_text("helpful not very"); - assert!( - msg.layered_contains_phrase("not helpful", 0.50, 0.60), - "Should match reordered words via token cosine in Layer 2" - ); - } - - #[test] - fn test_layered_matching_long_message_with_pattern() { - let msg = NormalizedMessage::from_text( - "I've tried everything and followed all the instructions \ - but this is not helpful at all and I'm getting frustrated", - ); - assert!( - msg.layered_contains_phrase("not helpful", 0.50, 0.60), - "Should detect pattern buried in long message" - ); - } - - #[test] - fn test_layered_matching_no_match() { - let msg = NormalizedMessage::from_text("everything is working perfectly"); - assert!( - !msg.layered_contains_phrase("not helpful", 0.50, 0.60), - "Should not match completely different content" - ); - } - - #[test] - fn test_char_ngram_vs_token_cosine_tradeoffs() { - // Character ngrams handle character-level changes well - let msg1 = NormalizedMessage::from_text("this doesnt work"); - let char_sim1 = msg1.char_ngram_similarity("this doesn't work"); - assert!( - char_sim1 > 0.70, - "Character ngrams should handle punctuation: {}", - char_sim1 - ); - - // Token cosine is better for word order and long messages with semantic overlap - let msg2 = - NormalizedMessage::from_text("I really appreciate all your help with this issue today"); - let token_sim2 = msg2.token_cosine_similarity("thank you for help"); - assert!( - token_sim2 > 0.15, - "Token cosine should detect semantic overlap: {}", - token_sim2 - ); - } - - // ======================================================================== - // Existing Tests - // ======================================================================== - - fn preprocess_messages(messages: &[Message]) -> Vec<(usize, Role, NormalizedMessage)> { - messages + fn report_severe_when_user_escalates() { + let msgs = vec![ + user("This isn't helpful at all"), + assistant("I'm sorry, can you tell me more?"), + user("Get me 
a human, this is useless"), + ]; + let r = SignalAnalyzer::default().analyze_openai(&msgs); + assert_eq!(r.overall_quality, InteractionQuality::Severe); + assert!(r + .interaction + .disengagement + .signals .iter() - .enumerate() - .map(|(i, msg)| { - let text = msg.content.extract_text(); - (i, msg.role.clone(), NormalizedMessage::from_text(&text)) - }) - .collect() + .any(|s| matches!(s.signal_type, SignalType::DisengagementEscalation))); } #[test] - fn test_turn_count_efficient() { - let start = Instant::now(); - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![ - create_message(Role::User, "Hello"), - create_message(Role::Assistant, "Hi! How can I help?"), - create_message(Role::User, "Thanks!"), + fn report_excellent_when_user_satisfied() { + let msgs = vec![ + user("Can you summarize this report?"), + assistant("Here's a summary: ..."), + user("That's perfect, exactly what I needed, you're awesome!"), ]; - - let signal = analyzer.analyze_turn_count(&messages); - assert_eq!(signal.total_turns, 3); - assert_eq!(signal.user_turns, 2); - assert_eq!(signal.assistant_turns, 1); - assert!(!signal.is_concerning); - assert!(!signal.is_excessive); - assert!(signal.efficiency_score > 0.9); - println!("test_turn_count_efficient took: {:?}", start.elapsed()); + let r = SignalAnalyzer::default().analyze_openai(&msgs); + assert!(r.interaction.satisfaction.count > 0); + assert!(matches!( + r.overall_quality, + InteractionQuality::Good | InteractionQuality::Excellent + )); } #[test] - fn test_turn_count_excessive() { - let start = Instant::now(); - let analyzer = TextBasedSignalAnalyzer::new(); - let mut messages = Vec::new(); - for i in 0..15 { - messages.push(create_message( - if i % 2 == 0 { - Role::User - } else { - Role::Assistant - }, - &format!("Message {}", i), - )); - } - - let signal = analyzer.analyze_turn_count(&messages); - assert_eq!(signal.total_turns, 15); - assert!(signal.is_concerning); - assert!(signal.is_excessive); - assert!(signal.efficiency_score < 0.5); - println!("test_turn_count_excessive took: {:?}", start.elapsed()); - } - - #[test] - fn test_follow_up_detection() { - let start = Instant::now(); - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![ - create_message(Role::User, "Show me restaurants"), - create_message(Role::Assistant, "Here are some options"), - create_message(Role::User, "No, I meant Italian restaurants"), - create_message(Role::Assistant, "Here are Italian restaurants"), + fn repro_gratitude_does_not_trigger_misalignment() { + let msgs = vec![ + user("What is the weather in Istanbul?"), + assistant("Istanbul is 14C and partly cloudy."), + user("That worked, exactly what I needed. 
Thanks, that is perfect!"), ]; - - let normalized_messages = preprocess_messages(&messages); - let signal = analyzer.analyze_follow_up(&normalized_messages); - assert_eq!(signal.repair_count, 1); - assert!(signal.repair_ratio > 0.0); - println!("test_follow_up_detection took: {:?}", start.elapsed()); - } - - #[test] - fn test_frustration_detection() { - let start = Instant::now(); - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![ - create_message(Role::User, "THIS IS RIDICULOUS!!!"), - create_message(Role::Assistant, "I apologize for the frustration"), - create_message(Role::User, "This doesn't work at all"), - ]; - - let normalized_messages = preprocess_messages(&messages); - let signal = analyzer.analyze_frustration(&normalized_messages); - assert!(signal.has_frustration); - assert!(signal.frustration_count >= 2); - assert!(signal.severity > 0); - println!("test_frustration_detection took: {:?}", start.elapsed()); - } - - #[test] - fn test_positive_feedback_detection() { - let start = Instant::now(); - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![ - create_message(Role::User, "Can you help me?"), - create_message(Role::Assistant, "Sure!"), - create_message(Role::User, "Thank you! That's exactly what I needed."), - ]; - - let normalized_messages = preprocess_messages(&messages); - let signal = analyzer.analyze_positive_feedback(&normalized_messages); - assert!(signal.has_positive_feedback); - assert!(signal.positive_count >= 1); - assert!(signal.confidence > 0.5); - println!( - "test_positive_feedback_detection took: {:?}", - start.elapsed() - ); - } - - #[test] - fn test_escalation_detection() { - let start = Instant::now(); - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![ - create_message(Role::User, "This isn't working"), - create_message(Role::Assistant, "Let me help"), - create_message(Role::User, "I need to speak to a human agent"), - ]; - - let normalized_messages = preprocess_messages(&messages); - let signal = analyzer.analyze_escalation(&normalized_messages); - assert!(signal.escalation_requested); - assert_eq!(signal.escalation_count, 1); - println!("test_escalation_detection took: {:?}", start.elapsed()); - } - - #[test] - fn test_repetition_detection() { - let start = Instant::now(); - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![ - create_message(Role::User, "What's the weather?"), - create_message( - Role::Assistant, - "I can help you with the weather information", - ), - create_message(Role::User, "Show me the forecast"), - create_message(Role::Assistant, "Sure, I can help you with the forecast"), - create_message(Role::User, "Stop repeating yourself"), - ]; - - let normalized_messages = preprocess_messages(&messages); - let signal = analyzer.analyze_repetition(&normalized_messages); - - for rep in &signal.repetitions { - println!( - " - Messages {:?}, similarity: {:.3}, type: {:?}", - rep.message_indices, rep.similarity, rep.repetition_type + let r = SignalAnalyzer::default().analyze_openai(&msgs); + for s in &r.interaction.misalignment.signals { + eprintln!( + "misalignment fired: type={:?} idx={} snippet={:?} meta={:?}", + s.signal_type, s.message_index, s.snippet, s.metadata ); } - - assert!(signal.repetition_count > 0, - "Should detect the subtle repetition between 'I can help you with the weather information' \ - and 'Sure, I can help you with the forecast'"); - println!("test_repetition_detection took: {:?}", start.elapsed()); - } - - #[test] - fn 
test_full_analysis_excellent() { - let start = Instant::now(); - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![ - create_message(Role::User, "I need to book a flight"), - create_message(Role::Assistant, "Sure! Where would you like to go?"), - create_message(Role::User, "New York"), - create_message(Role::Assistant, "Great! I found several options."), - create_message(Role::User, "Perfect!"), - ]; - - let report = analyzer.analyze(&messages); - assert!(matches!( - report.overall_quality, - InteractionQuality::Excellent | InteractionQuality::Good - )); - assert!(report.positive_feedback.has_positive_feedback); - assert!(!report.frustration.has_frustration); - println!("test_full_analysis_excellent took: {:?}", start.elapsed()); - } - - #[test] - fn test_full_analysis_poor() { - let start = Instant::now(); - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![ - create_message(Role::User, "Help me"), - create_message(Role::Assistant, "How can I assist?"), - create_message(Role::User, "No, I meant something else"), - create_message(Role::Assistant, "What do you need?"), - create_message(Role::User, "THIS DOESN'T WORK!!!"), - create_message(Role::Assistant, "I apologize"), - create_message(Role::User, "Let me speak to a human"), - ]; - - let report = analyzer.analyze(&messages); - assert!(matches!( - report.overall_quality, - InteractionQuality::Poor | InteractionQuality::Severe - )); - assert!(report.frustration.has_frustration); - assert!(report.escalation.escalation_requested); - println!("test_full_analysis_poor took: {:?}", start.elapsed()); - } - - #[test] - fn test_fuzzy_matching_gratitude() { - let start = Instant::now(); - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![ - create_message(Role::User, "Can you help me?"), - create_message(Role::Assistant, "Sure!"), - create_message(Role::User, "thnaks! 
that's exactly what i needed."), - ]; - - let normalized_messages = preprocess_messages(&messages); - let signal = analyzer.analyze_positive_feedback(&normalized_messages); - assert!(signal.has_positive_feedback); - assert!(signal.positive_count >= 1); - println!("test_fuzzy_matching_gratitude took: {:?}", start.elapsed()); - } - - #[test] - fn test_fuzzy_matching_escalation() { - let start = Instant::now(); - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![ - create_message(Role::User, "This isn't working"), - create_message(Role::Assistant, "Let me help"), - create_message(Role::User, "i need to speek to a human agnet"), - ]; - - let normalized_messages = preprocess_messages(&messages); - let signal = analyzer.analyze_escalation(&normalized_messages); - assert!(signal.escalation_requested); - assert_eq!(signal.escalation_count, 1); - println!("test_fuzzy_matching_escalation took: {:?}", start.elapsed()); - } - - #[test] - fn test_fuzzy_matching_repair() { - let start = Instant::now(); - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![ - create_message(Role::User, "Show me restaurants"), - create_message(Role::Assistant, "Here are some options"), - create_message(Role::User, "no i ment Italian restaurants"), - create_message(Role::Assistant, "Here are Italian restaurants"), - ]; - - let normalized_messages = preprocess_messages(&messages); - let signal = analyzer.analyze_follow_up(&normalized_messages); - assert!(signal.repair_count >= 1); - println!("test_fuzzy_matching_repair took: {:?}", start.elapsed()); - } - - #[test] - fn test_fuzzy_matching_complaint() { - let start = Instant::now(); - let analyzer = TextBasedSignalAnalyzer::new(); - // Use a complaint that should match - "doesnt work" is close enough to "doesn't work" - let messages = vec![ - create_message(Role::User, "this doesnt work at all"), // Common typo: missing apostrophe - create_message(Role::Assistant, "I apologize"), - ]; - - let normalized_messages = preprocess_messages(&messages); - let signal = analyzer.analyze_frustration(&normalized_messages); - - // The layered matching should catch this via character ngrams or token cosine - // "doesnt work" has high character-level similarity to "doesn't work" - assert!( - signal.has_frustration, - "Should detect frustration from complaint pattern" - ); - assert!(signal.frustration_count >= 1); - println!("test_fuzzy_matching_complaint took: {:?}", start.elapsed()); - } - - #[test] - fn test_exact_match_priority() { - let start = Instant::now(); - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![create_message(Role::User, "thank you so much")]; - - let normalized_messages = preprocess_messages(&messages); - let signal = analyzer.analyze_positive_feedback(&normalized_messages); - assert!(signal.has_positive_feedback); - // Should detect exact match, not fuzzy - assert!(signal.indicators[0].snippet.contains("thank you")); - assert!(!signal.indicators[0].snippet.contains("fuzzy")); - println!("test_exact_match_priority took: {:?}", start.elapsed()); - } - - // ======================================================================== - // Anti-Tests: Verify fixes stay fixed - // ======================================================================== - - #[test] - fn test_hello_not_profanity() { - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![create_message(Role::User, "hello there")]; - - let normalized_messages = preprocess_messages(&messages); - let signal = 
analyzer.analyze_frustration(&normalized_messages); - assert!( - !signal.has_frustration, - "\"hello\" should not trigger profanity detection" - ); - } - - #[test] - fn test_prepare_not_escalation() { - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![create_message( - Role::User, - "Can you help me prepare for the meeting?", - )]; - - let normalized_messages = preprocess_messages(&messages); - let signal = analyzer.analyze_escalation(&normalized_messages); - assert!( - !signal.escalation_requested, - "\"prepare\" should not trigger escalation (rep pattern removed)" - ); - } - - #[test] - fn test_unicode_apostrophe_confusion() { - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![ - create_message(Role::User, "I'm confused"), // Unicode apostrophe - ]; - - let normalized_messages = preprocess_messages(&messages); - let signal = analyzer.analyze_frustration(&normalized_messages); - assert!( - signal.has_frustration, - "Unicode apostrophe 'I'm confused' should trigger confusion" - ); - } - - #[test] - fn test_unicode_quotes_work() { - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![create_message( - Role::User, - "\u{201C}doesn\u{2019}t work\u{201D} with unicode quotes", - )]; - - let normalized_messages = preprocess_messages(&messages); - let signal = analyzer.analyze_frustration(&normalized_messages); - assert!( - signal.has_frustration, - "Unicode quotes should be normalized and match patterns" - ); - } - - #[test] - fn test_absolute_not_profanity() { - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![create_message(Role::User, "That's absolute nonsense")]; - - let normalized_messages = preprocess_messages(&messages); - let signal = analyzer.analyze_frustration(&normalized_messages); - // Should match on "nonsense" logic, not on "bs" substring - let has_bs_match = signal - .indicators - .iter() - .any(|ind| ind.snippet.contains("bs")); - assert!( - !has_bs_match, - "\"absolute\" should not trigger 'bs' profanity match" - ); - } - - #[test] - fn test_stopwords_not_rephrase() { - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![ - create_message(Role::User, "Help me with X"), - create_message(Role::Assistant, "Sure"), - create_message(Role::User, "Help me with Y"), - ]; - - let normalized_messages = preprocess_messages(&messages); - let signal = analyzer.analyze_follow_up(&normalized_messages); - // Should not detect as rephrase since only stopwords overlap assert_eq!( - signal.repair_count, 0, - "Messages with only stopword overlap should not be rephrases" + r.interaction.misalignment.count, 0, + "a pure gratitude message should not trigger repair/misalignment" ); + assert!(r.interaction.satisfaction.count > 0); } #[test] - fn test_frustrated_user_with_legitimate_repair() { - let start = Instant::now(); - let analyzer = TextBasedSignalAnalyzer::new(); - - use hermesllm::apis::openai::{FunctionCall, ToolCall}; - - // Helper to create a message with tool calls - let create_assistant_with_tools = - |content: &str, tool_id: &str, tool_name: &str, args: &str| -> Message { - Message { - role: Role::Assistant, - content: Some(MessageContent::Text(content.to_string())), - name: None, - tool_calls: Some(vec![ToolCall { - id: tool_id.to_string(), - call_type: "function".to_string(), - function: FunctionCall { - name: tool_name.to_string(), - arguments: args.to_string(), - }, - }]), - tool_call_id: None, - } - }; - - // Helper to create a tool response message - let create_tool_message = |tool_call_id: 
&str, content: &str| -> Message { - Message { - role: Role::Tool, - content: Some(MessageContent::Text(content.to_string())), - name: None, - tool_calls: None, - tool_call_id: Some(tool_call_id.to_string()), - } - }; - - // Scenario: User DOES mention New York in first message, making "I already told you" legitimate - let messages = vec![ - create_message( - Role::User, - "I need to book a flight from New York to Paris for December 20th", - ), - create_assistant_with_tools( - "I'll help you search for flights to Paris.", - "call_123", - "search_flights", - r#"{"origin": "NYC", "destination": "Paris", "date": "2025-12-20"}"#, - ), - create_tool_message("call_123", r#"{"flights": []}"#), - create_message( - Role::Assistant, - "I couldn't find any flights. Could you provide your departure city?", - ), - create_message(Role::User, "I already told you, from New York!"), - create_assistant_with_tools( - "Let me try again.", - "call_456", - "search_flights", - r#"{"origin": "New York", "destination": "Paris", "date": "2025-12-20"}"#, - ), - create_tool_message("call_456", r#"{"flights": []}"#), - create_message( - Role::Assistant, - "I'm still not finding results. Let me check the system.", - ), - create_message( - Role::User, - "THIS IS RIDICULOUS!!! The tool doesn't work at all. Why do you keep calling it?", - ), - create_message( - Role::Assistant, - "I sincerely apologize for the frustration with the search tool.", - ), - create_message( - Role::User, - "Forget it. I need to speak to a human agent. This is a waste of time.", - ), + fn summary_groups_signals_by_taxonomy() { + // Even on a clean conversation the summary should expose the three + // layer headers so the taxonomy is visible. + let msgs = vec![ + user("Hello"), + assistant("Hi! How can I help?"), + user("What's 2 + 2?"), + assistant("4"), ]; - - let report = analyzer.analyze(&messages); - - // Tool messages should be filtered out, so we should only analyze text messages - // That's 4 user messages + 5 assistant text messages = 9 turns - assert_eq!( - report.turn_count.total_turns, 9, - "Should count 9 text messages (tool messages filtered out)" + let r = SignalAnalyzer::default().analyze_openai(&msgs); + assert!( + r.summary.contains("Interaction \u{2014}"), + "missing Interaction header in: {}", + r.summary ); assert!( - report.turn_count.is_concerning, - "Should flag concerning turn count" - ); - - // Should detect frustration (all caps, complaints) - assert!( - report.frustration.has_frustration, - "Should detect frustration" + r.summary.contains("Execution \u{2014}"), + "missing Execution header in: {}", + r.summary ); assert!( - report.frustration.frustration_count >= 2, - "Should detect multiple frustration indicators" - ); - assert!( - report.frustration.severity >= 2, - "Should have moderate or higher frustration severity" - ); - - // Should detect escalation request - assert!( - report.escalation.escalation_requested, - "Should detect escalation to human agent" - ); - assert!( - report.escalation.escalation_count >= 1, - "Should detect at least one escalation" - ); - - // Overall quality should be Poor or Severe - assert!( - matches!( - report.overall_quality, - InteractionQuality::Poor | InteractionQuality::Severe - ), - "Quality should be Poor or Severe, got {:?}", - report.overall_quality - ); - - println!( - "test_frustrated_user_with_legitimate_repair took: {:?}", - start.elapsed() + r.summary.contains("Environment \u{2014}"), + "missing Environment header in: {}", + r.summary ); + 
assert!(r.summary.contains("misalignment: 0")); + assert!(r.summary.contains("loops: 0")); + assert!(r.summary.contains("exhaustion: 0")); } #[test] - fn test_frustrated_user_false_claim() { - let start = Instant::now(); - let analyzer = TextBasedSignalAnalyzer::new(); - - use hermesllm::apis::openai::{FunctionCall, ToolCall}; - - // Helper to create a message with tool calls - let create_assistant_with_tools = - |content: &str, tool_id: &str, tool_name: &str, args: &str| -> Message { - Message { - role: Role::Assistant, - content: Some(MessageContent::Text(content.to_string())), - name: None, - tool_calls: Some(vec![ToolCall { - id: tool_id.to_string(), - call_type: "function".to_string(), - function: FunctionCall { - name: tool_name.to_string(), - arguments: args.to_string(), - }, - }]), - tool_call_id: None, - } - }; - - // Helper to create a tool response message - let create_tool_message = |tool_call_id: &str, content: &str| -> Message { - Message { - role: Role::Tool, - content: Some(MessageContent::Text(content.to_string())), - name: None, - tool_calls: None, - tool_call_id: Some(tool_call_id.to_string()), - } - }; - - // Scenario: User NEVER mentions New York in first message but claims "I already told you" - // This represents realistic frustrated user behavior - exaggeration/misremembering - let messages = vec![ - create_message( - Role::User, - "I need to book a flight to Paris for December 20th", - ), - create_assistant_with_tools( - "I'll help you search for flights to Paris.", - "call_123", - "search_flights", - r#"{"destination": "Paris", "date": "2025-12-20"}"#, - ), - create_tool_message("call_123", r#"{"error": "origin required"}"#), - create_message( - Role::Assistant, - "I couldn't find any flights. Could you provide your departure city?", - ), - create_message(Role::User, "I already told you, from New York!"), // False claim - never mentioned it - create_assistant_with_tools( - "Let me try again.", - "call_456", - "search_flights", - r#"{"origin": "New York", "destination": "Paris", "date": "2025-12-20"}"#, - ), - create_tool_message("call_456", r#"{"flights": []}"#), - create_message( - Role::Assistant, - "I'm still not finding results. Let me check the system.", - ), - create_message( - Role::User, - "THIS IS RIDICULOUS!!! The tool doesn't work at all. Why do you keep calling it?", - ), - create_message( - Role::Assistant, - "I sincerely apologize for the frustration with the search tool.", - ), - create_message( - Role::User, - "Forget it. I need to speak to a human agent. This is a waste of time.", - ), + fn summary_includes_severity_when_signals_fire() { + let msgs = vec![ + user("This isn't helpful at all"), + assistant("I'm sorry, can you tell me more?"), + user("Get me a human, this is useless"), ]; - - let report = analyzer.analyze(&messages); - - // Tool messages should be filtered out, so we should only analyze text messages - // That's 4 user messages + 5 assistant text messages = 9 turns - assert_eq!( - report.turn_count.total_turns, 9, - "Should count 9 text messages (tool messages filtered out)" + let r = SignalAnalyzer::default().analyze_openai(&msgs); + // Disengagement fires; should render with `(sev N)` and the + // escalation-requested alert tail. 
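+        // (assess_quality also pins this conversation at Severe via the
+        // escalation/quit gate; the assertions below only check summary text.)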
+ assert!( + r.summary.contains("disengagement:") && r.summary.contains("(sev "), + "expected severity rendered for disengagement: {}", + r.summary ); assert!( - report.turn_count.is_concerning, - "Should flag concerning turn count" - ); - - // Should detect frustration (all caps, complaints, false claims) - assert!( - report.frustration.has_frustration, - "Should detect frustration" - ); - assert!( - report.frustration.frustration_count >= 2, - "Should detect multiple frustration indicators" - ); - assert!( - report.frustration.severity >= 2, - "Should have moderate or higher frustration severity" - ); - - // Should detect escalation request - assert!( - report.escalation.escalation_requested, - "Should detect escalation to human agent" - ); - assert!( - report.escalation.escalation_count >= 1, - "Should detect at least one escalation" - ); - - // Note: May detect false positive "positive feedback" due to fuzzy matching - // e.g., "I already told YOU" matches "you rock", "THIS is RIDICULOUS" matches "this helps" - // However, the overall quality should still be Poor/Severe due to frustration+escalation - - // Overall quality should be Poor or Severe (frustration + escalation indicates poor interaction) - assert!( - matches!( - report.overall_quality, - InteractionQuality::Poor | InteractionQuality::Severe - ), - "Quality should be Poor or Severe for frustrated user with false claims, got {:?}", - report.overall_quality - ); - - println!( - "test_frustrated_user_false_claim took: {:?}", - start.elapsed() + r.summary.contains("Escalation requested:"), + "expected escalation alert in: {}", + r.summary ); } - // false negative tests #[test] - fn test_dissatisfaction_polite_not_working_for_me() { - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![ - create_message(Role::User, "Thanks, but this still isn't working for me."), // Polite dissatisfaction, e.g., I appreciate it, but this isn't what I was looking for. - create_message(Role::Assistant, "Sorry—what error do you see?"), + fn execution_failures_lower_quality() { + let msgs = vec![ShareGptMessage { + from: "human", + value: "do the thing", + }]; + let _ = msgs; + // Build a synthetic ShareGPT input with multiple tool failures. 
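+        // First call passes a string where an integer is expected; the second
+        // omits a required field, so both observations should match
+        // INVALID_ARGS_PATTERNS.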
+ let convo = vec![ + ShareGptMessage { + from: "human", + value: "create a user", + }, + ShareGptMessage { + from: "function_call", + value: r#"{"name":"create_user","arguments":{"age":"twelve"}}"#, + }, + ShareGptMessage { + from: "observation", + value: "Error: validation failed - expected integer got string", + }, + ShareGptMessage { + from: "function_call", + value: r#"{"name":"create_user","arguments":{}}"#, + }, + ShareGptMessage { + from: "observation", + value: "missing required field: name", + }, ]; - let normalized = preprocess_messages(&messages); - let signal = analyzer.analyze_frustration(&normalized); - assert!( - signal.has_frustration, - "Polite dissatisfaction should be detected" - ); - } - - #[test] - fn test_dissatisfaction_giving_up_without_escalation() { - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![create_message( - Role::User, - "Never mind, I'll figure it out myself.", - )]; - let normalized = preprocess_messages(&messages); - let signal = analyzer.analyze_escalation(&normalized); - assert!( - signal.escalation_requested, - "Giving up should count as escalation/quit intent" - ); - } - - #[test] - fn test_dissatisfaction_same_problem_again() { - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![create_message( - Role::User, - "I'm running into the same issue again.", - )]; - let normalized = preprocess_messages(&messages); - let signal = analyzer.analyze_frustration(&normalized); - assert!( - signal.has_frustration, - "'same issue again' should be detected" - ); - } - - #[test] - fn test_unsatisfied_incomplete() { - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![create_message(Role::User, "This feels incomplete.")]; - let normalized = preprocess_messages(&messages); - let signal = analyzer.analyze_frustration(&normalized); - assert!( - signal.has_frustration, - "Should detect 'incomplete' dissatisfaction" - ); - } - - #[test] - fn test_low_mood_overwhelming() { - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![create_message( - Role::User, - "This is overwhelming and I'm not sure what to do.", - )]; - let normalized = preprocess_messages(&messages); - let signal = analyzer.analyze_frustration(&normalized); - assert!(signal.has_frustration, "Should detect overwhelmed language"); - } - - #[test] - fn test_low_mood_exhausted_trying() { - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![create_message( - Role::User, - "I'm exhausted trying to get this working.", - )]; - let normalized = preprocess_messages(&messages); - let signal = analyzer.analyze_frustration(&normalized); - assert!( - signal.has_frustration, - "Should detect exhaustion/struggle language" - ); - } - - #[test] - fn test_common_polite_unresolved_dissatisfaction() { - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![ - create_message(Role::User, "I'm trying to set up SSH keys for GitHub."), - create_message( - Role::Assistant, - "Sure. 
First generate a key using ssh-keygen.", - ), - create_message(Role::User, "I did that already."), - create_message( - Role::Assistant, - "Then add the key to your GitHub account settings.", - ), - create_message(Role::User, "I've done that too."), - create_message( - Role::Assistant, - "After that, make sure your SSH agent is running.", - ), - create_message( - Role::User, - "Okay, but this still doesn't seem to fix the issue.", - ), - create_message(Role::Assistant, "What error message are you seeing?"), - create_message(Role::User, "It's just not connecting the way I expected."), - ]; - - let report = analyzer.analyze(&messages); - - // This is a common false negative if you only look for caps/profanity. - // Desired: detect dissatisfaction/frustration (or at least not rate as Excellent). - assert!( - report.frustration.has_frustration || report.follow_up.repair_count >= 1, - "Should detect polite unresolved dissatisfaction via frustration or follow-up indicators" - ); - - assert!( - !matches!(report.overall_quality, InteractionQuality::Excellent), - "Should not classify unresolved dissatisfaction as Excellent" - ); - } - - #[test] - fn test_common_resigned_giving_up_quietly() { - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![ - create_message( - Role::User, - "Can you explain how to deploy this with Docker?", - ), - create_message( - Role::Assistant, - "You need to write a Dockerfile and build an image.", - ), - create_message(Role::User, "I tried that."), - create_message(Role::Assistant, "Then you can run docker-compose up."), - create_message(Role::User, "I did, but it didn’t really help."), - create_message(Role::Assistant, "What error are you getting?"), - create_message( - Role::User, - "Honestly, never mind. I’ll just try something else.", - ), - ]; - - let report = analyzer.analyze(&messages); - - // Many systems miss "never mind / I'll try something else" if they only look for "human agent". - assert!( - report.escalation.escalation_requested || report.frustration.has_frustration, - "Resigned quitting language should trigger escalation or frustration" - ); - - assert!( - matches!( - report.overall_quality, - InteractionQuality::Poor | InteractionQuality::Severe - ) || report.escalation.escalation_requested - || report.frustration.has_frustration, - "Giving up should not be classified as a high-quality interaction" - ); - } - - #[test] - fn test_common_discouraged_overwhelmed_low_mood() { - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![ - create_message(Role::User, "I'm trying to understand backpropagation."), - create_message( - Role::Assistant, - "It's a way to compute gradients efficiently.", - ), - create_message(Role::User, "I’ve read that explanation already."), - create_message(Role::Assistant, "Would you like a mathematical derivation?"), - create_message(Role::User, "Maybe, but I’m still having trouble following."), - create_message(Role::Assistant, "I can walk through a simple example."), - create_message( - Role::User, - "That might help, but honestly this is pretty overwhelming.", - ), - create_message(Role::Assistant, "Let’s slow it down step by step."), - create_message( - Role::User, - "Yeah… I’m just feeling kind of discouraged right now.", - ), - ]; - - let report = analyzer.analyze(&messages); - - // This is negative affect without caps/profanity. Should still count as frustration/negative signal. 
- assert!( - report.frustration.has_frustration, - "Overwhelmed/discouraged language should be detected as negative sentiment/frustration" - ); - - assert!( - !matches!(report.overall_quality, InteractionQuality::Excellent), - "Low-mood discouragement should not be classified as Excellent" - ); - } - - #[test] - fn test_common_misalignment_not_what_i_asked() { - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![ - create_message(Role::User, "How do I optimize this SQL query?"), - create_message( - Role::Assistant, - "You can add indexes to improve performance.", - ), - create_message(Role::User, "I already have indexes."), - create_message(Role::Assistant, "Then you could consider query caching."), - create_message(Role::User, "That’s not really what I was asking about."), - create_message( - Role::Assistant, - "What specifically are you trying to optimize?", - ), - create_message( - Role::User, - "The execution plan — this answer doesn’t address that.", - ), - ]; - - let report = analyzer.analyze(&messages); - - // Misalignment often shows as follow-up repair or frustration. - assert!( - report.follow_up.repair_count >= 1 || report.frustration.has_frustration, - "Misalignment ('not what I asked') should trigger repair or frustration signals" - ); - - assert!( - !matches!(report.overall_quality, InteractionQuality::Excellent), - "Misalignment should not be rated as Excellent" - ); - } - - #[test] - fn test_common_false_negative_polite_disappointment_complexity() { - let analyzer = TextBasedSignalAnalyzer::new(); - let messages = vec![ - create_message(Role::User, "Can you help me write a regex for this?"), - create_message(Role::Assistant, "Sure, try this pattern: ^[a-z]+$"), - create_message(Role::User, "I tested it."), - create_message(Role::Assistant, "Did it work?"), - create_message(Role::User, "Not quite — it matches more than it should."), - create_message(Role::Assistant, "You can refine it with a lookahead."), - create_message( - Role::User, - "I see… this is more complicated than I expected.", - ), - ]; - - let report = analyzer.analyze(&messages); - - // Polite disappointment often becomes a false negative. - assert!( - report.frustration.has_frustration || report.follow_up.repair_count >= 1, - "Polite dissatisfaction ('not quite', 'more complicated than expected') should trigger a negative signal" - ); - - assert!( - !matches!(report.overall_quality, InteractionQuality::Excellent), - "Polite disappointment should not be classified as Excellent" - ); - } - - #[test] - fn test_catastrophic_failure_looping_assistant() { - let analyzer = TextBasedSignalAnalyzer::new(); - - // Catastrophic failure: assistant stuck in loop, user increasingly frustrated - let messages = vec![ - // Turn 1: Initial request - create_message(Role::User, "I need to cancel my subscription"), - create_message(Role::Assistant, "I can help you with account management. What would you like to do today?"), - - // Turn 3: First repair - repetition clarification - create_message(Role::User, "I just said I want to cancel my subscription"), - // Turn 4: Exact repetition from assistant - create_message(Role::Assistant, "I can help you with account management. What would you like to do today?"), - - // Turn 5: Repair (negation) + frustration (ALL CAPS) - create_message(Role::User, "No, I meant CANCEL. My. Subscription."), - // Turn 6: Near-duplicate repetition - create_message(Role::Assistant, "I understand you need help with your account. 
Let me assist you with that."), - - // Turn 7: Severe frustration (complaint + excessive punctuation + ALL CAPS) - create_message(Role::User, "This is not helpful!! I want to CANCEL my subscription!!!"), - // Turn 8: Exact repetition again (looping) - create_message(Role::Assistant, "I can help you with account management. What would you like to do today?"), - - // Turn 9: Repair (explicit rephrase) + frustration (complaint) - create_message(Role::User, "This doesnt work. Let me rephrase: I need to terminate my account and stop being charged"), - // Turn 10: Near-duplicate repetition (looping continues) - create_message(Role::Assistant, "I understand you need help with your account. Let me assist you with that."), - - // Turn 11: Escalation (multiple types) + frustration - create_message(Role::User, "I give up. Can I speak to a real person? This is a waste of time"), - ]; - - let report = analyzer.analyze(&messages); - - // Validate turn count - assert_eq!( - report.turn_count.total_turns, 11, - "Should have 11 total turns" - ); - assert_eq!(report.turn_count.user_turns, 6, "Should have 6 user turns"); - assert_eq!( - report.turn_count.assistant_turns, 5, - "Should have 5 assistant turns" - ); - assert!( - report.turn_count.is_concerning, - "11 turns should be concerning (>7)" - ); - assert!( - !report.turn_count.is_excessive, - "11 turns should not be excessive (<=12)" - ); - assert!( - report.turn_count.efficiency_score < 0.5, - "Efficiency should be low" - ); - - // Validate repair detection (USER signals - query reformulation) - // Detected repairs: - // 1. "I just said I want to cancel..." - pattern: "I just said" - // 2. "No, I meant CANCEL..." - pattern: "No, I meant" - // 3. "Let me rephrase: I need to terminate..." - pattern: "let me rephrase" - // Note: "This is not helpful!!" is frustration (not repair) - // Note: "I give up..." is escalation (not repair) - assert_eq!( - report.follow_up.repair_count, 3, - "Should detect exactly 3 repair attempts from user messages" - ); - assert_eq!( - report.follow_up.repair_ratio, 0.5, - "Repair ratio should be 0.5 (3 repairs / 6 user messages)" - ); - assert!( - report.follow_up.is_concerning, - "50% repair ratio should be highly concerning (threshold is 30%)" - ); - - // Validate frustration detection - assert!( - report.frustration.has_frustration, - "Should detect frustration" - ); - assert!( - report.frustration.frustration_count >= 4, - "Should detect multiple frustration indicators: found {}", - report.frustration.frustration_count - ); - assert!( - report.frustration.severity >= 2, - "Should be at least moderate frustration" - ); - - // Validate repetition/looping detection (ASSISTANT signals - not following instructions) - // The assistant repeats the same unhelpful responses multiple times: - // 1. "I can help you with account management..." appears 3 times (exact repetition) - // 2. "I understand you need help with your account..." 
appears 2 times (near-duplicate) - assert!( - report.repetition.repetition_count >= 4, - "Should detect at least 4 assistant repetitions (exact + near-duplicates)" - ); - assert!( - report.repetition.has_looping, - "Should detect looping (>2 repetitions indicates stuck agent)" - ); - assert!( - report.repetition.severity >= 2, - "Should be moderate to severe looping (assistant not adapting)" - ); - - // Validate escalation detection - assert!( - report.escalation.escalation_requested, - "Should detect escalation request" - ); - assert!( - report.escalation.escalation_count >= 2, - "Should detect multiple escalation indicators: 'give up' + 'speak to a real person'" - ); - - // Validate overall quality - assert_eq!(report.overall_quality, InteractionQuality::Severe, "Should be classified as Severe due to escalation + excessive frustration + looping + high repair ratio"); + let r = SignalAnalyzer::default().analyze_sharegpt(&convo); + assert!(r.execution.failure.count >= 1); + assert!(r.quality_score < 50.0); } } diff --git a/crates/brightstaff/src/signals/environment/exhaustion.rs b/crates/brightstaff/src/signals/environment/exhaustion.rs new file mode 100644 index 00000000..142e7d6e --- /dev/null +++ b/crates/brightstaff/src/signals/environment/exhaustion.rs @@ -0,0 +1,347 @@ +//! Environment exhaustion detector. Direct port of +//! `signals/environment/exhaustion.py`. + +use std::sync::OnceLock; + +use regex::Regex; +use serde_json::json; + +use crate::signals::analyzer::ShareGptMessage; +use crate::signals::schemas::{SignalGroup, SignalInstance, SignalType}; + +pub const API_ERROR_PATTERNS: &[&str] = &[ + r"500\s*(internal\s+)?server\s+error", + r"502\s*bad\s+gateway", + r"503\s*service\s+unavailable", + r"504\s*gateway\s+timeout", + r"internal\s+server\s+error", + r"service\s+unavailable", + r"server\s+error", + r"backend\s+error", + r"upstream\s+error", + r"service\s+temporarily\s+unavailable", + r"maintenance\s+mode", + r"under\s+maintenance", + r"try\s+again\s+later", + r"temporarily\s+unavailable", + r"system\s+error", + r"unexpected\s+error", + r"unhandled\s+exception", +]; + +pub const TIMEOUT_PATTERNS: &[&str] = &[ + r"timeout", + r"timed?\s*out", + r"etimedout", + r"connection\s+timed?\s*out", + r"read\s+timed?\s*out", + r"request\s+timed?\s*out", + r"gateway\s+timeout", + r"deadline\s+exceeded", + r"took\s+too\s+long", + r"operation\s+timed?\s*out", + r"socket\s+timeout", +]; + +pub const RATE_LIMIT_PATTERNS: &[&str] = &[ + r"rate\s+limit", + r"rate.limited", + r"(status|error|http)\s*:?\s*429", + r"429\s+(too\s+many|rate|limit)", + r"too\s+many\s+requests?", + r"quota\s+exceeded", + r"quota\s+limit", + r"throttl(ed|ing)", + r"request\s+limit", + r"api\s+limit", + r"calls?\s+per\s+(second|minute|hour|day)", + r"exceeded\s+.*\s+limit", + r"slow\s+down", + r"retry\s+after", + r"requests?\s+exceeded", +]; + +pub const NETWORK_PATTERNS: &[&str] = &[ + r"connection\s+refused", + r"econnrefused", + r"econnreset", + r"connection\s+reset", + r"enotfound", + r"dns\s+(error|failure|lookup)", + r"host\s+not\s+found", + r"network\s+(error|failure|unreachable)", + r"no\s+route\s+to\s+host", + r"socket\s+error", + r"connection\s+failed", + r"unable\s+to\s+connect", + r"cannot\s+connect", + r"could\s+not\s+connect", + r"connect\s+error", + r"ssl\s+(error|handshake|certificate)", + r"certificate\s+(error|invalid|expired)", +]; + +pub const MALFORMED_PATTERNS: &[&str] = &[ + r"json\s+parse\s+error", + r"invalid\s+json", + r"unexpected\s+token", + r"syntax\s+error.*json", + 
r"malformed\s+(response|json|data)", + r"unexpected\s+end\s+of", + r"parse\s+error", + r"parsing\s+failed", + r"invalid\s+response", + r"unexpected\s+response", + r"response\s+format", + r"missing\s+field.*response", + r"unexpected\s+schema", + r"schema\s+validation", + r"deserialization\s+error", + r"failed\s+to\s+decode", +]; + +pub const CONTEXT_OVERFLOW_PATTERNS: &[&str] = &[ + r"context\s+(length|limit|overflow|exceeded)", + r"token\s+(limit|overflow|exceeded)", + r"max(imum)?\s+tokens?", + r"input\s+too\s+(long|large)", + r"exceeds?\s+(context|token|character|input)\s+limit", + r"message\s+too\s+(long|large)", + r"content\s+too\s+(long|large)", + r"truncat(ed|ion)\s+(due\s+to|because|for)\s+(length|size|limit)", + r"maximum\s+context", + r"prompt\s+too\s+(long|large)", +]; + +fn compile(patterns: &[&str]) -> Regex { + let combined = patterns + .iter() + .map(|p| format!("({})", p)) + .collect::>() + .join("|"); + Regex::new(&format!("(?i){}", combined)).expect("exhaustion pattern regex must compile") +} + +fn api_error_re() -> &'static Regex { + static R: OnceLock = OnceLock::new(); + R.get_or_init(|| compile(API_ERROR_PATTERNS)) +} +fn timeout_re() -> &'static Regex { + static R: OnceLock = OnceLock::new(); + R.get_or_init(|| compile(TIMEOUT_PATTERNS)) +} +fn rate_limit_re() -> &'static Regex { + static R: OnceLock = OnceLock::new(); + R.get_or_init(|| compile(RATE_LIMIT_PATTERNS)) +} +fn network_re() -> &'static Regex { + static R: OnceLock = OnceLock::new(); + R.get_or_init(|| compile(NETWORK_PATTERNS)) +} +fn malformed_re() -> &'static Regex { + static R: OnceLock = OnceLock::new(); + R.get_or_init(|| compile(MALFORMED_PATTERNS)) +} +fn context_overflow_re() -> &'static Regex { + static R: OnceLock = OnceLock::new(); + R.get_or_init(|| compile(CONTEXT_OVERFLOW_PATTERNS)) +} + +fn snippet_around(text: &str, m: regex::Match<'_>, context: usize) -> String { + let start = m.start().saturating_sub(context); + let end = (m.end() + context).min(text.len()); + let start = align_char_boundary(text, start, false); + let end = align_char_boundary(text, end, true); + let mut snippet = String::new(); + if start > 0 { + snippet.push_str("..."); + } + snippet.push_str(&text[start..end]); + if end < text.len() { + snippet.push_str("..."); + } + snippet +} + +fn align_char_boundary(s: &str, mut idx: usize, forward: bool) -> usize { + if idx >= s.len() { + return s.len(); + } + while !s.is_char_boundary(idx) { + if forward { + idx += 1; + } else if idx == 0 { + break; + } else { + idx -= 1; + } + } + idx +} + +pub fn analyze_exhaustion(messages: &[ShareGptMessage<'_>]) -> SignalGroup { + let mut group = SignalGroup::new("exhaustion"); + + for (i, msg) in messages.iter().enumerate() { + if msg.from != "observation" { + continue; + } + let value = msg.value; + let lower = value.to_lowercase(); + + if let Some(m) = rate_limit_re().find(&lower) { + group.add_signal(emit( + SignalType::EnvironmentExhaustionRateLimit, + i, + snippet_around(value, m, 50), + 0.95, + "rate_limit", + m.as_str(), + )); + continue; + } + + if let Some(m) = api_error_re().find(&lower) { + group.add_signal(emit( + SignalType::EnvironmentExhaustionApiError, + i, + snippet_around(value, m, 50), + 0.9, + "api_error", + m.as_str(), + )); + continue; + } + + if let Some(m) = timeout_re().find(&lower) { + group.add_signal(emit( + SignalType::EnvironmentExhaustionTimeout, + i, + snippet_around(value, m, 50), + 0.9, + "timeout", + m.as_str(), + )); + continue; + } + + if let Some(m) = network_re().find(&lower) { + 
group.add_signal(emit( + SignalType::EnvironmentExhaustionNetwork, + i, + snippet_around(value, m, 50), + 0.9, + "network", + m.as_str(), + )); + continue; + } + + if let Some(m) = malformed_re().find(&lower) { + group.add_signal(emit( + SignalType::EnvironmentExhaustionMalformed, + i, + snippet_around(value, m, 50), + 0.85, + "malformed_response", + m.as_str(), + )); + continue; + } + + if let Some(m) = context_overflow_re().find(&lower) { + group.add_signal(emit( + SignalType::EnvironmentExhaustionContextOverflow, + i, + snippet_around(value, m, 50), + 0.9, + "context_overflow", + m.as_str(), + )); + } + } + + group +} + +fn emit( + t: SignalType, + idx: usize, + snippet: String, + confidence: f32, + kind: &str, + matched: &str, +) -> SignalInstance { + SignalInstance::new(t, idx, snippet) + .with_confidence(confidence) + .with_metadata(json!({ + "exhaustion_type": kind, + "matched": matched, + })) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn obs(value: &str) -> ShareGptMessage<'_> { + ShareGptMessage { + from: "observation", + value, + } + } + + #[test] + fn detects_rate_limit() { + let g = analyze_exhaustion(&[obs("HTTP 429: too many requests, retry after 30s")]); + assert!(g + .signals + .iter() + .any(|s| matches!(s.signal_type, SignalType::EnvironmentExhaustionRateLimit))); + } + + #[test] + fn detects_api_error() { + let g = analyze_exhaustion(&[obs("503 service unavailable - try again later")]); + assert!(g + .signals + .iter() + .any(|s| matches!(s.signal_type, SignalType::EnvironmentExhaustionApiError))); + } + + #[test] + fn detects_timeout() { + let g = analyze_exhaustion(&[obs("Connection timed out after 30 seconds")]); + assert!(g + .signals + .iter() + .any(|s| matches!(s.signal_type, SignalType::EnvironmentExhaustionTimeout))); + } + + #[test] + fn detects_network_failure() { + let g = analyze_exhaustion(&[obs("ECONNREFUSED: connection refused by remote host")]); + assert!(g + .signals + .iter() + .any(|s| matches!(s.signal_type, SignalType::EnvironmentExhaustionNetwork))); + } + + #[test] + fn detects_malformed_response() { + let g = analyze_exhaustion(&[obs("Invalid JSON: unexpected token at position 42")]); + assert!(g + .signals + .iter() + .any(|s| matches!(s.signal_type, SignalType::EnvironmentExhaustionMalformed))); + } + + #[test] + fn detects_context_overflow() { + let g = analyze_exhaustion(&[obs("Maximum context length exceeded for this model")]); + assert!(g.signals.iter().any(|s| matches!( + s.signal_type, + SignalType::EnvironmentExhaustionContextOverflow + ))); + } +} diff --git a/crates/brightstaff/src/signals/environment/mod.rs b/crates/brightstaff/src/signals/environment/mod.rs new file mode 100644 index 00000000..97d9b300 --- /dev/null +++ b/crates/brightstaff/src/signals/environment/mod.rs @@ -0,0 +1,3 @@ +//! Environment signals: exhaustion (external system failures and constraints). + +pub mod exhaustion; diff --git a/crates/brightstaff/src/signals/execution/failure.rs b/crates/brightstaff/src/signals/execution/failure.rs new file mode 100644 index 00000000..3e171446 --- /dev/null +++ b/crates/brightstaff/src/signals/execution/failure.rs @@ -0,0 +1,388 @@ +//! Execution failure detector. Direct port of `signals/execution/failure.py`. 
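+//!
+//! Minimal sketch of a call site (illustrative only; `analyze_failure` and
+//! `ShareGptMessage` are the items used below, the message values are made up):
+//!
+//! ```ignore
+//! let msgs = [
+//!     ShareGptMessage { from: "function_call", value: r#"{"name":"save","arguments":{}}"# },
+//!     ShareGptMessage { from: "observation", value: "Error: missing required field: id" },
+//! ];
+//! // "missing required" matches INVALID_ARGS_PATTERNS, so one
+//! // ExecutionFailureInvalidArgs signal is expected in the group.
+//! let group = analyze_failure(&msgs);
+//! ```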
+
+use std::sync::OnceLock;
+
+use regex::Regex;
+use serde_json::json;
+
+use crate::signals::analyzer::ShareGptMessage;
+use crate::signals::schemas::{SignalGroup, SignalInstance, SignalType};
+
+pub const INVALID_ARGS_PATTERNS: &[&str] = &[
+    r"invalid\s+argument",
+    r"invalid\s+parameter",
+    r"invalid\s+type",
+    r"type\s*error",
+    r"expected\s+\w+\s*,?\s*got\s+\w+",
+    r"required\s+field",
+    r"required\s+parameter",
+    r"missing\s+required",
+    r"missing\s+argument",
+    r"validation\s+failed",
+    r"validation\s+error",
+    r"invalid\s+value",
+    r"invalid\s+format",
+    r"must\s+be\s+(a|an)\s+\w+",
+    r"cannot\s+be\s+(null|empty|none)",
+    r"is\s+not\s+valid",
+    r"does\s+not\s+match",
+    r"out\s+of\s+range",
+    r"invalid\s+date",
+    r"invalid\s+json",
+    r"malformed\s+request",
+];
+
+pub const BAD_QUERY_PATTERNS: &[&str] = &[
+    r"invalid\s+query",
+    r"query\s+syntax\s+error",
+    r"malformed\s+query",
+    r"unknown\s+field",
+    r"invalid\s+field",
+    r"invalid\s+filter",
+    r"invalid\s+search",
+    r"unknown\s+id",
+    r"invalid\s+id",
+    r"id\s+format\s+error",
+    r"invalid\s+identifier",
+    r"query\s+failed",
+    r"search\s+error",
+    r"invalid\s+operator",
+    r"unsupported\s+query",
+];
+
+pub const TOOL_NOT_FOUND_PATTERNS: &[&str] = &[
+    r"unknown\s+function",
+    r"unknown\s+tool",
+    r"function\s+not\s+found",
+    r"tool\s+not\s+found",
+    r"no\s+such\s+function",
+    r"no\s+such\s+tool",
+    r"undefined\s+function",
+    r"action\s+not\s+supported",
+    r"invalid\s+tool",
+    r"invalid\s+function",
+    r"unrecognized\s+function",
+];
+
+pub const AUTH_MISUSE_PATTERNS: &[&str] = &[
+    r"\bunauthorized\b",
+    r"(status|error|http|code)\s*:?\s*401",
+    r"401\s+unauthorized",
+    r"403\s+forbidden",
+    r"permission\s+denied",
+    r"access\s+denied",
+    r"authentication\s+required",
+    r"invalid\s+credentials",
+    r"invalid\s+token",
+    r"token\s+expired",
+    r"missing\s+authorization",
+    r"\bforbidden\b",
+    r"not\s+authorized",
+    r"insufficient\s+permissions?",
+];
+
+pub const STATE_ERROR_PATTERNS: &[&str] = &[
+    r"invalid\s+state",
+    r"illegal\s+state",
+    r"must\s+call\s+\w+\s+first",
+    r"must\s+\w+\s+before",
+    r"cannot\s+\w+\s+before",
+    r"already\s+(exists?|created|started|finished)",
+    r"not\s+initialized",
+    r"not\s+started",
+    r"already\s+in\s+progress",
+    r"operation\s+in\s+progress",
+    r"sequence\s+error",
+    r"precondition\s+failed",
+    r"(status|error|http)\s*:?\s*409",
+    r"409\s+conflict",
+    r"\bconflict\b",
+];
+
+fn compile(patterns: &[&str]) -> Regex {
+    // Use `(?i)` flag for case-insensitive matching, matching Python's `re.IGNORECASE`.
+    let combined = patterns
+        .iter()
+        .map(|p| format!("({})", p))
+        .collect::<Vec<String>>()
+        .join("|");
+    Regex::new(&format!("(?i){}", combined)).expect("failure pattern regex must compile")
+}
+
+fn invalid_args_re() -> &'static Regex {
+    static R: OnceLock<Regex> = OnceLock::new();
+    R.get_or_init(|| compile(INVALID_ARGS_PATTERNS))
+}
+fn bad_query_re() -> &'static Regex {
+    static R: OnceLock<Regex> = OnceLock::new();
+    R.get_or_init(|| compile(BAD_QUERY_PATTERNS))
+}
+fn tool_not_found_re() -> &'static Regex {
+    static R: OnceLock<Regex> = OnceLock::new();
+    R.get_or_init(|| compile(TOOL_NOT_FOUND_PATTERNS))
+}
+fn auth_misuse_re() -> &'static Regex {
+    static R: OnceLock<Regex> = OnceLock::new();
+    R.get_or_init(|| compile(AUTH_MISUSE_PATTERNS))
+}
+fn state_error_re() -> &'static Regex {
+    static R: OnceLock<Regex> = OnceLock::new();
+    R.get_or_init(|| compile(STATE_ERROR_PATTERNS))
+}
+
+/// Pull tool name + args from a `function_call` message. Mirrors
+/// `_extract_tool_info` in the reference.
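+///
+/// Quick illustration of the expected shape (example values assumed):
+///
+/// ```ignore
+/// let (name, args) = extract_tool_info(r#"{"name":"search","arguments":{"q":"rust"}}"#);
+/// assert_eq!(name, "search");
+/// assert_eq!(args, r#"{"q":"rust"}"#);
+/// ```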
+pub(crate) fn extract_tool_info(value: &str) -> (String, String) {
+    if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(value) {
+        if let Some(obj) = parsed.as_object() {
+            let name = obj
+                .get("name")
+                .or_else(|| obj.get("function"))
+                .and_then(|v| v.as_str())
+                .map(|s| s.to_string())
+                .unwrap_or_else(|| "unknown".to_string());
+            let args = match obj.get("arguments").or_else(|| obj.get("args")) {
+                Some(serde_json::Value::Object(o)) => {
+                    serde_json::to_string(&serde_json::Value::Object(o.clone())).unwrap_or_default()
+                }
+                Some(other) => other
+                    .as_str()
+                    .map(|s| s.to_string())
+                    .unwrap_or_else(|| serde_json::to_string(other).unwrap_or_default()),
+                None => String::new(),
+            };
+            return (name, args);
+        }
+    }
+    let mut snippet: String = value.chars().take(200).collect();
+    snippet.shrink_to_fit();
+    ("unknown".to_string(), snippet)
+}
+
+/// Build a context-window snippet around a regex match, with leading/trailing
+/// ellipses when truncated. Mirrors `_get_snippet`.
+fn snippet_around(text: &str, m: regex::Match<'_>, context: usize) -> String {
+    let start = m.start().saturating_sub(context);
+    let end = (m.end() + context).min(text.len());
+    // Ensure we cut on UTF-8 boundaries.
+    let start = align_char_boundary(text, start, false);
+    let end = align_char_boundary(text, end, true);
+    let mut snippet = String::new();
+    if start > 0 {
+        snippet.push_str("...");
+    }
+    snippet.push_str(&text[start..end]);
+    if end < text.len() {
+        snippet.push_str("...");
+    }
+    snippet
+}
+
+fn align_char_boundary(s: &str, mut idx: usize, forward: bool) -> usize {
+    if idx >= s.len() {
+        return s.len();
+    }
+    while !s.is_char_boundary(idx) {
+        if forward {
+            idx += 1;
+        } else if idx == 0 {
+            break;
+        } else {
+            idx -= 1;
+        }
+    }
+    idx
+}
+
+pub fn analyze_failure(messages: &[ShareGptMessage<'_>]) -> SignalGroup {
+    let mut group = SignalGroup::new("failure");
+    let mut last_call: Option<(usize, String, String)> = None;
+
+    for (i, msg) in messages.iter().enumerate() {
+        match msg.from {
+            "function_call" => {
+                let (name, args) = extract_tool_info(msg.value);
+                last_call = Some((i, name, args));
+                continue;
+            }
+            "observation" => {}
+            _ => continue,
+        }
+
+        let value = msg.value;
+        let lower = value.to_lowercase();
+        let (call_index, tool_name) = match &last_call {
+            Some((idx, name, _)) => (*idx, name.clone()),
+            None => (i.saturating_sub(1), "unknown".to_string()),
+        };
+
+        if let Some(m) = invalid_args_re().find(&lower) {
+            group.add_signal(
+                SignalInstance::new(
+                    SignalType::ExecutionFailureInvalidArgs,
+                    i,
+                    snippet_around(value, m, 50),
+                )
+                .with_confidence(0.9)
+                .with_metadata(json!({
+                    "tool_name": tool_name,
+                    "call_index": call_index,
+                    "error_type": "invalid_args",
+                    "matched": m.as_str(),
+                })),
+            );
+            continue;
+        }
+
+        if let Some(m) = tool_not_found_re().find(&lower) {
+            group.add_signal(
+                SignalInstance::new(
+                    SignalType::ExecutionFailureToolNotFound,
+                    i,
+                    snippet_around(value, m, 50),
+                )
+                .with_confidence(0.95)
+                .with_metadata(json!({
+                    "tool_name": tool_name,
+                    "call_index": call_index,
+                    "error_type": "tool_not_found",
+                    "matched": m.as_str(),
+                })),
+            );
+            continue;
+        }
+
+        if let Some(m) = auth_misuse_re().find(&lower) {
+            group.add_signal(
+                SignalInstance::new(
+                    SignalType::ExecutionFailureAuthMisuse,
+                    i,
+                    snippet_around(value, m, 50),
+                )
+                .with_confidence(0.8)
+                .with_metadata(json!({
+                    "tool_name": tool_name,
+                    "call_index": call_index,
+                    "error_type": "auth_misuse",
+                    "matched": m.as_str(),
+                })),
+            );
+            continue;
+        }
+
+        if let Some(m) =
state_error_re().find(&lower) { + group.add_signal( + SignalInstance::new( + SignalType::ExecutionFailureStateError, + i, + snippet_around(value, m, 50), + ) + .with_confidence(0.85) + .with_metadata(json!({ + "tool_name": tool_name, + "call_index": call_index, + "error_type": "state_error", + "matched": m.as_str(), + })), + ); + continue; + } + + if let Some(m) = bad_query_re().find(&lower) { + let confidence = if ["error", "invalid", "failed"] + .iter() + .any(|w| lower.contains(w)) + { + 0.8 + } else { + 0.6 + }; + group.add_signal( + SignalInstance::new( + SignalType::ExecutionFailureBadQuery, + i, + snippet_around(value, m, 50), + ) + .with_confidence(confidence) + .with_metadata(json!({ + "tool_name": tool_name, + "call_index": call_index, + "error_type": "bad_query", + "matched": m.as_str(), + })), + ); + } + } + + group +} + +#[cfg(test)] +mod tests { + use super::*; + + fn fc(value: &str) -> ShareGptMessage<'_> { + ShareGptMessage { + from: "function_call", + value, + } + } + fn obs(value: &str) -> ShareGptMessage<'_> { + ShareGptMessage { + from: "observation", + value, + } + } + + #[test] + fn detects_invalid_args() { + let msgs = vec![ + fc(r#"{"name":"create_user","arguments":{"age":"twelve"}}"#), + obs("Error: validation failed - expected integer got string for field age"), + ]; + let g = analyze_failure(&msgs); + assert!(g + .signals + .iter() + .any(|s| matches!(s.signal_type, SignalType::ExecutionFailureInvalidArgs))); + } + + #[test] + fn detects_tool_not_found() { + let msgs = vec![ + fc(r#"{"name":"send_thought","arguments":{}}"#), + obs("Error: unknown function 'send_thought'"), + ]; + let g = analyze_failure(&msgs); + assert!(g + .signals + .iter() + .any(|s| matches!(s.signal_type, SignalType::ExecutionFailureToolNotFound))); + } + + #[test] + fn detects_auth_misuse() { + let msgs = vec![ + fc(r#"{"name":"get_secret","arguments":{}}"#), + obs("HTTP 401 Unauthorized"), + ]; + let g = analyze_failure(&msgs); + assert!(g + .signals + .iter() + .any(|s| matches!(s.signal_type, SignalType::ExecutionFailureAuthMisuse))); + } + + #[test] + fn detects_state_error() { + let msgs = vec![ + fc(r#"{"name":"commit_tx","arguments":{}}"#), + obs("must call begin_tx first"), + ]; + let g = analyze_failure(&msgs); + assert!(g + .signals + .iter() + .any(|s| matches!(s.signal_type, SignalType::ExecutionFailureStateError))); + } +} diff --git a/crates/brightstaff/src/signals/execution/loops.rs b/crates/brightstaff/src/signals/execution/loops.rs new file mode 100644 index 00000000..70b90e83 --- /dev/null +++ b/crates/brightstaff/src/signals/execution/loops.rs @@ -0,0 +1,433 @@ +//! Execution loops detector. Direct port of `signals/execution/loops.py`. + +use serde_json::json; + +use crate::signals::analyzer::ShareGptMessage; +use crate::signals::schemas::{SignalGroup, SignalInstance, SignalType}; + +pub const RETRY_THRESHOLD: usize = 3; +pub const PARAMETER_DRIFT_THRESHOLD: usize = 3; +pub const OSCILLATION_CYCLES_THRESHOLD: usize = 3; + +#[derive(Debug, Clone)] +pub struct ToolCall { + pub index: usize, + pub name: String, + /// Canonical JSON string of arguments (sorted keys when parseable). 
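+    /// e.g. `{"b":1,"a":2}` and `{"a":2,"b":1}` canonicalize to the same
+    /// string, so retries that merely reorder keys still compare equal.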
+    pub args: String,
+    pub args_dict: Option<serde_json::Map<String, serde_json::Value>>,
+}
+
+impl ToolCall {
+    pub fn args_equal(&self, other: &ToolCall) -> bool {
+        match (&self.args_dict, &other.args_dict) {
+            (Some(a), Some(b)) => a == b,
+            _ => self.args == other.args,
+        }
+    }
+}
+
+fn parse_tool_call(index: usize, msg: &ShareGptMessage<'_>) -> Option<ToolCall> {
+    if msg.from != "function_call" {
+        return None;
+    }
+    let value = msg.value;
+
+    if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(value) {
+        if let Some(obj) = parsed.as_object() {
+            let name = obj
+                .get("name")
+                .or_else(|| obj.get("function"))
+                .and_then(|v| v.as_str())
+                .map(|s| s.to_string())
+                .unwrap_or_else(|| "unknown".to_string());
+            let raw_args = obj.get("arguments").or_else(|| obj.get("args"));
+            let (args_str, args_dict) = match raw_args {
+                Some(serde_json::Value::Object(o)) => {
+                    let mut keys: Vec<&String> = o.keys().collect();
+                    keys.sort();
+                    let mut canon = serde_json::Map::new();
+                    for k in keys {
+                        canon.insert(k.clone(), o[k].clone());
+                    }
+                    (
+                        serde_json::to_string(&serde_json::Value::Object(canon.clone()))
+                            .unwrap_or_default(),
+                        Some(canon),
+                    )
+                }
+                Some(other) => (
+                    other
+                        .as_str()
+                        .map(|s| s.to_string())
+                        .unwrap_or_else(|| serde_json::to_string(other).unwrap_or_default()),
+                    None,
+                ),
+                None => (String::new(), None),
+            };
+            return Some(ToolCall {
+                index,
+                name,
+                args: args_str,
+                args_dict,
+            });
+        }
+    }
+
+    if let Some(paren) = value.find('(') {
+        if paren > 0 {
+            let name = value[..paren].trim().to_string();
+            let args_part = &value[paren..];
+            if args_part.starts_with('(') && args_part.ends_with(')') {
+                let inner = args_part[1..args_part.len() - 1].trim();
+                if let Ok(serde_json::Value::Object(o)) =
+                    serde_json::from_str::<serde_json::Value>(inner)
+                {
+                    let mut keys: Vec<&String> = o.keys().collect();
+                    keys.sort();
+                    let mut canon = serde_json::Map::new();
+                    for k in keys {
+                        canon.insert(k.clone(), o[k].clone());
+                    }
+                    return Some(ToolCall {
+                        index,
+                        name,
+                        args: serde_json::to_string(&serde_json::Value::Object(canon.clone()))
+                            .unwrap_or_default(),
+                        args_dict: Some(canon),
+                    });
+                }
+                return Some(ToolCall {
+                    index,
+                    name,
+                    args: inner.to_string(),
+                    args_dict: None,
+                });
+            }
+            return Some(ToolCall {
+                index,
+                name,
+                args: args_part.to_string(),
+                args_dict: None,
+            });
+        }
+    }
+
+    Some(ToolCall {
+        index,
+        name: value.trim().to_string(),
+        args: String::new(),
+        args_dict: None,
+    })
+}
+
+fn extract_tool_calls(messages: &[ShareGptMessage<'_>]) -> Vec<ToolCall> {
+    let mut out = Vec::new();
+    for (i, msg) in messages.iter().enumerate() {
+        if let Some(c) = parse_tool_call(i, msg) {
+            out.push(c);
+        }
+    }
+    out
+}
+
+fn detect_retry(calls: &[ToolCall]) -> Vec<(usize, usize, String)> {
+    if calls.len() < RETRY_THRESHOLD {
+        return Vec::new();
+    }
+    let mut patterns = Vec::new();
+    let mut i = 0;
+    while i < calls.len() {
+        let current = &calls[i];
+        let mut j = i + 1;
+        let mut run_length = 1;
+        while j < calls.len() {
+            if calls[j].name == current.name && calls[j].args_equal(current) {
+                run_length += 1;
+                j += 1;
+            } else {
+                break;
+            }
+        }
+        if run_length >= RETRY_THRESHOLD {
+            patterns.push((calls[i].index, calls[j - 1].index, current.name.clone()));
+            i = j;
+        } else {
+            i += 1;
+        }
+    }
+    patterns
+}
+
+fn detect_parameter_drift(calls: &[ToolCall]) -> Vec<(usize, usize, String, usize)> {
+    if calls.len() < PARAMETER_DRIFT_THRESHOLD {
+        return Vec::new();
+    }
+    let mut patterns = Vec::new();
+    let mut i = 0;
+    while i < calls.len() {
+        let current_name = calls[i].name.clone();
+        let mut seen_args: Vec<String> = vec![calls[i].args.clone()];
+        let mut unique_args = 1;
+        let mut j = i + 1;
+        while j < calls.len() {
+            if calls[j].name != current_name {
+                break;
+            }
+            if !seen_args.iter().any(|a| a == &calls[j].args) {
+                seen_args.push(calls[j].args.clone());
+                unique_args += 1;
+            }
+            j += 1;
+        }
+        let run_length = j - i;
+        if run_length >= PARAMETER_DRIFT_THRESHOLD && unique_args >= 2 {
+            patterns.push((
+                calls[i].index,
+                calls[j - 1].index,
+                current_name,
+                unique_args,
+            ));
+            i = j;
+        } else {
+            i += 1;
+        }
+    }
+    patterns
+}
+
+fn detect_oscillation(calls: &[ToolCall]) -> Vec<(usize, usize, Vec<String>, usize)> {
+    let min_calls = 2 * OSCILLATION_CYCLES_THRESHOLD;
+    if calls.len() < min_calls {
+        return Vec::new();
+    }
+    let mut patterns = Vec::new();
+    let mut i: usize = 0;
+    while i + min_calls <= calls.len() {
+        let max_pat_len = (5usize).min(calls.len() - i);
+        let mut found_for_i = false;
+        for pat_len in 2..=max_pat_len {
+            let pattern_names: Vec<String> =
+                (0..pat_len).map(|k| calls[i + k].name.clone()).collect();
+            let unique: std::collections::HashSet<&String> = pattern_names.iter().collect();
+            if unique.len() < 2 {
+                continue;
+            }
+            let mut cycles = 1;
+            let mut pos = i + pat_len;
+            while pos + pat_len <= calls.len() {
+                let mut all_match = true;
+                for k in 0..pat_len {
+                    if calls[pos + k].name != pattern_names[k] {
+                        all_match = false;
+                        break;
+                    }
+                }
+                if all_match {
+                    cycles += 1;
+                    pos += pat_len;
+                } else {
+                    break;
+                }
+            }
+            if cycles >= OSCILLATION_CYCLES_THRESHOLD {
+                let end_idx_in_calls = i + (cycles * pat_len) - 1;
+                patterns.push((
+                    calls[i].index,
+                    calls[end_idx_in_calls].index,
+                    pattern_names,
+                    cycles,
+                ));
+                // Mirror Python: `i = end_idx + 1 - pattern_len`. We set `i` so that
+                // the next outer iteration begins after we account for overlap.
+                i = end_idx_in_calls + 1 - pat_len;
+                found_for_i = true;
+                break;
+            }
+        }
+        if !found_for_i {
+            i += 1;
+        } else {
+            // Match Python's `i = end_idx + 1 - pattern_len; break` then loop.
+            // We'll continue; the outer while re-checks i.
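+            // (No extra work needed here: `i` was already advanced inside the
+            // cycle-detection branch above.)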
+        }
+    }
+    if patterns.len() > 1 {
+        patterns = deduplicate_patterns(patterns);
+    }
+    patterns
+}
+
+fn deduplicate_patterns(
+    mut patterns: Vec<(usize, usize, Vec<String>, usize)>,
+) -> Vec<(usize, usize, Vec<String>, usize)> {
+    if patterns.is_empty() {
+        return patterns;
+    }
+    patterns.sort_by(|a, b| {
+        let ord = a.0.cmp(&b.0);
+        if ord != std::cmp::Ordering::Equal {
+            ord
+        } else {
+            (b.1 - b.0).cmp(&(a.1 - a.0))
+        }
+    });
+    let mut result = Vec::new();
+    let mut last_end: i64 = -1;
+    for p in patterns {
+        if (p.0 as i64) > last_end {
+            last_end = p.1 as i64;
+            result.push(p);
+        }
+    }
+    result
+}
+
+pub fn analyze_loops(messages: &[ShareGptMessage<'_>]) -> SignalGroup {
+    let mut group = SignalGroup::new("loops");
+    let calls = extract_tool_calls(messages);
+    if calls.len() < RETRY_THRESHOLD {
+        return group;
+    }
+
+    let retries = detect_retry(&calls);
+    for (start_idx, end_idx, tool_name) in &retries {
+        let call_count = calls
+            .iter()
+            .filter(|c| *start_idx <= c.index && c.index <= *end_idx)
+            .count();
+        group.add_signal(
+            SignalInstance::new(
+                SignalType::ExecutionLoopsRetry,
+                *start_idx,
+                format!(
+                    "Tool '{}' called {} times with identical arguments",
+                    tool_name, call_count
+                ),
+            )
+            .with_confidence(0.95)
+            .with_metadata(json!({
+                "tool_name": tool_name,
+                "start_index": start_idx,
+                "end_index": end_idx,
+                "call_count": call_count,
+                "loop_type": "retry",
+            })),
+        );
+    }
+
+    let drifts = detect_parameter_drift(&calls);
+    for (start_idx, end_idx, tool_name, variation_count) in &drifts {
+        let overlaps_retry = retries
+            .iter()
+            .any(|r| !(*end_idx < r.0 || *start_idx > r.1));
+        if overlaps_retry {
+            continue;
+        }
+        let call_count = calls
+            .iter()
+            .filter(|c| *start_idx <= c.index && c.index <= *end_idx)
+            .count();
+        group.add_signal(
+            SignalInstance::new(
+                SignalType::ExecutionLoopsParameterDrift,
+                *start_idx,
+                format!(
+                    "Tool '{}' called {} times with {} different argument variations",
+                    tool_name, call_count, variation_count
+                ),
+            )
+            .with_confidence(0.85)
+            .with_metadata(json!({
+                "tool_name": tool_name,
+                "start_index": start_idx,
+                "end_index": end_idx,
+                "call_count": call_count,
+                "variation_count": variation_count,
+                "loop_type": "parameter_drift",
+            })),
+        );
+    }
+
+    let oscillations = detect_oscillation(&calls);
+    for (start_idx, end_idx, tool_names, cycle_count) in &oscillations {
+        let pattern_str = tool_names.join(" \u{2192} ");
+        group.add_signal(
+            SignalInstance::new(
+                SignalType::ExecutionLoopsOscillation,
+                *start_idx,
+                format!(
+                    "Oscillation pattern [{}] repeated {} times",
+                    pattern_str, cycle_count
+                ),
+            )
+            .with_confidence(0.9)
+            .with_metadata(json!({
+                "pattern": tool_names,
+                "start_index": start_idx,
+                "end_index": end_idx,
+                "cycle_count": cycle_count,
+                "loop_type": "oscillation",
+            })),
+        );
+    }
+
+    group
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn fc(value: &str) -> ShareGptMessage<'_> {
+        ShareGptMessage {
+            from: "function_call",
+            value,
+        }
+    }
+
+    #[test]
+    fn detects_retry_loop() {
+        let arg = r#"{"name":"check_status","arguments":{"id":"abc"}}"#;
+        let msgs = vec![fc(arg), fc(arg), fc(arg), fc(arg)];
+        let g = analyze_loops(&msgs);
+        assert!(g
+            .signals
+            .iter()
+            .any(|s| matches!(s.signal_type, SignalType::ExecutionLoopsRetry)));
+    }
+
+    #[test]
+    fn detects_parameter_drift() {
+        let msgs = vec![
+            fc(r#"{"name":"search","arguments":{"q":"a"}}"#),
+            fc(r#"{"name":"search","arguments":{"q":"ab"}}"#),
+            fc(r#"{"name":"search","arguments":{"q":"abc"}}"#),
+            fc(r#"{"name":"search","arguments":{"q":"abcd"}}"#),
+        ];
+        let g = analyze_loops(&msgs);
+        assert!(g
+            .signals
+            .iter()
+            .any(|s| matches!(s.signal_type, SignalType::ExecutionLoopsParameterDrift)));
+    }
+
+    #[test]
+    fn detects_oscillation() {
+        let a = r#"{"name":"toolA","arguments":{}}"#;
+        let b = r#"{"name":"toolB","arguments":{}}"#;
+        let msgs = vec![fc(a), fc(b), fc(a), fc(b), fc(a), fc(b)];
+        let g = analyze_loops(&msgs);
+        assert!(g
+            .signals
+            .iter()
+            .any(|s| matches!(s.signal_type, SignalType::ExecutionLoopsOscillation)));
+    }
+
+    #[test]
+    fn no_signals_when_few_calls() {
+        let msgs = vec![fc(r#"{"name":"only_once","arguments":{}}"#)];
+        let g = analyze_loops(&msgs);
+        assert!(g.signals.is_empty());
+    }
+}
diff --git a/crates/brightstaff/src/signals/execution/mod.rs b/crates/brightstaff/src/signals/execution/mod.rs
new file mode 100644
index 00000000..87dc28c4
--- /dev/null
+++ b/crates/brightstaff/src/signals/execution/mod.rs
@@ -0,0 +1,5 @@
+//! Execution signals: failure (agent-caused tool errors) and loops
+//! (repetitive tool-call behavior).
+
+pub mod failure;
+pub mod loops;
diff --git a/crates/brightstaff/src/signals/interaction/constants.rs b/crates/brightstaff/src/signals/interaction/constants.rs
new file mode 100644
index 00000000..2301395c
--- /dev/null
+++ b/crates/brightstaff/src/signals/interaction/constants.rs
@@ -0,0 +1,193 @@
+//! Shared constants for the interaction layer detectors.
+//!
+//! Direct port of `signals/interaction/constants.py`.
+
+use std::collections::HashSet;
+use std::sync::OnceLock;
+
+pub const POSITIVE_PREFIXES: &[&str] = &[
+    "yes",
+    "yeah",
+    "yep",
+    "yup",
+    "sure",
+    "ok",
+    "okay",
+    "great",
+    "awesome",
+    "perfect",
+    "thanks",
+    "thank",
+    "wonderful",
+    "excellent",
+    "amazing",
+    "nice",
+    "good",
+    "cool",
+    "absolutely",
+    "definitely",
+    "please",
+];
+
+pub const CONFIRMATION_PREFIXES: &[&str] = &[
+    "yes",
+    "yeah",
+    "yep",
+    "yup",
+    "correct",
+    "right",
+    "that's correct",
+    "thats correct",
+    "that's right",
+    "thats right",
+    "that is correct",
+    "that is right",
+];
+
+const STOPWORD_LIST: &[&str] = &[
+    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are",
+    "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but",
+    "by", "can", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for",
+    "from", "further", "had", "has", "have", "having", "he", "her", "here", "hers", "herself",
+    "him", "himself", "his", "how", "i", "if", "in", "into", "is", "it", "its", "itself", "just",
+    "me", "more", "most", "my", "myself", "no", "nor", "not", "now", "of", "off", "on", "once",
+    "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own", "same", "she",
+    "should", "so", "some", "such", "than", "that", "the", "their", "theirs", "them",
+    "themselves", "then", "there", "these", "they", "this", "those", "through", "to", "too",
+    "under", "until", "up", "very", "was", "we", "were", "what", "when", "where", "which",
+    "while", "who", "whom", "why", "with", "would", "you", "your", "yours", "yourself",
+    "yourselves",
+];
+
+pub fn stopwords() -> &'static HashSet<&'static str> {
+    static SET: OnceLock<HashSet<&'static str>> = OnceLock::new();
+    SET.get_or_init(|| STOPWORD_LIST.iter().copied().collect())
+}
+
+/// Returns true if `text` (case-insensitive, trimmed) starts with any of the
the +/// given prefixes treated as **whole tokens or token sequences**. This matches +/// the Python's `text_lower.startswith(prefix)` plus the natural intent that +/// `"please"` shouldn't fire on `"pleased"`. +pub fn starts_with_prefix(text: &str, prefixes: &[&str]) -> bool { + let lowered = text.to_lowercase(); + let trimmed = lowered.trim_start(); + for prefix in prefixes { + if trimmed.starts_with(prefix) { + return true; + } + } + false +} diff --git a/crates/brightstaff/src/signals/interaction/disengagement.rs b/crates/brightstaff/src/signals/interaction/disengagement.rs new file mode 100644 index 00000000..28711d18 --- /dev/null +++ b/crates/brightstaff/src/signals/interaction/disengagement.rs @@ -0,0 +1,445 @@ +//! Disengagement signals: escalation, quit, negative stance. +//! +//! Direct port of `signals/interaction/disengagement.py`. + +use std::sync::OnceLock; + +use regex::Regex; +use serde_json::json; + +use super::constants::{starts_with_prefix, POSITIVE_PREFIXES}; +use crate::signals::schemas::{SignalGroup, SignalInstance, SignalType}; +use crate::signals::text_processing::{normalize_patterns, NormalizedMessage, NormalizedPattern}; + +const ESCALATION_PATTERN_TEXTS: &[&str] = &[ + // Human requests + "speak to a human", + "talk to a human", + "connect me to a human", + "connect me with a human", + "transfer me to a human", + "get me a human", + "chat with a human", + // Person requests + "speak to a person", + "talk to a person", + "connect me to a person", + "connect me with a person", + "transfer me to a person", + "get me a person", + "chat with a person", + // Real person requests + "speak to a real person", + "talk to a real person", + "connect me to a real person", + "connect me with a real person", + "transfer me to a real person", + "get me a real person", + "chat with a real person", + // Actual person requests + "speak to an actual person", + "talk to an actual person", + "connect me to an actual person", + "connect me with an actual person", + "transfer me to an actual person", + "get me an actual person", + "chat with an actual person", + // Supervisor requests + "speak to a supervisor", + "talk to a supervisor", + "connect me to a supervisor", + "connect me with a supervisor", + "transfer me to a supervisor", + "get me a supervisor", + "chat with a supervisor", + // Manager requests + "speak to a manager", + "talk to a manager", + "connect me to a manager", + "connect me with a manager", + "transfer me to a manager", + "get me a manager", + "chat with a manager", + // Customer service requests + "speak to customer service", + "talk to customer service", + "connect me to customer service", + "connect me with customer service", + "transfer me to customer service", + "get me customer service", + "chat with customer service", + // Customer support requests + "speak to customer support", + "talk to customer support", + "connect me to customer support", + "connect me with customer support", + "transfer me to customer support", + "get me customer support", + "chat with customer support", + // Support requests + "speak to support", + "talk to support", + "connect me to support", + "connect me with support", + "transfer me to support", + "get me support", + "chat with support", + // Tech support requests + "speak to tech support", + "talk to tech support", + "connect me to tech support", + "connect me with tech support", + "transfer me to tech support", + "get me tech support", + "chat with tech support", + // Help desk requests + "speak to help desk", + "talk to help 
desk", + "connect me to help desk", + "connect me with help desk", + "transfer me to help desk", + "get me help desk", + "chat with help desk", + // Explicit escalation + "escalate this", +]; + +const QUIT_PATTERN_TEXTS: &[&str] = &[ + "i give up", + "i'm giving up", + "im giving up", + "i'm going to quit", + "i quit", + "forget it", + "forget this", + "screw it", + "screw this", + "don't bother trying", + "don't bother with this", + "don't bother with it", + "don't even bother", + "why bother", + "not worth it", + "this is hopeless", + "going elsewhere", + "try somewhere else", + "look elsewhere", +]; + +const NEGATIVE_STANCE_PATTERN_TEXTS: &[&str] = &[ + "this is useless", + "not helpful", + "doesn't help", + "not helping", + "you're not helping", + "youre not helping", + "this doesn't work", + "this doesnt work", + "this isn't working", + "this isnt working", + "still doesn't work", + "still doesnt work", + "still not working", + "still isn't working", + "still isnt working", + "waste of time", + "wasting my time", + "this is ridiculous", + "this is absurd", + "this is insane", + "this is stupid", + "this is dumb", + "this sucks", + "this is frustrating", + "not good enough", + "why can't you", + "why cant you", + "same issue", + "did that already", + "done that already", + "tried that already", + "already tried that", + "i've done that", + "ive done that", + "i've tried that", + "ive tried that", + "i'm disappointed", + "im disappointed", + "disappointed with you", + "disappointed in you", + "useless bot", + "dumb bot", + "stupid bot", +]; + +const AGENT_DIRECTED_PROFANITY_PATTERN_TEXTS: &[&str] = &[ + "this is bullshit", + "what bullshit", + "such bullshit", + "total bullshit", + "complete bullshit", + "this is crap", + "what crap", + "this is shit", + "what the hell is wrong with you", + "what the fuck is wrong with you", + "you're fucking useless", + "youre fucking useless", + "you are fucking useless", + "fucking useless", + "this bot is shit", + "this bot is crap", + "damn bot", + "fucking bot", + "stupid fucking", + "are you fucking kidding", + "wtf is wrong with you", + "wtf is this", + "ffs just", + "for fucks sake", + "for fuck's sake", + "what the f**k", + "what the f*ck", + "what the f***", + "that's bullsh*t", + "thats bullsh*t", + "that's bull***t", + "thats bull***t", + "that's bs", + "thats bs", + "this is bullsh*t", + "this is bull***t", + "this is bs", +]; + +fn escalation_patterns() -> &'static Vec { + static PATS: OnceLock> = OnceLock::new(); + PATS.get_or_init(|| normalize_patterns(ESCALATION_PATTERN_TEXTS)) +} + +fn quit_patterns() -> &'static Vec { + static PATS: OnceLock> = OnceLock::new(); + PATS.get_or_init(|| normalize_patterns(QUIT_PATTERN_TEXTS)) +} + +fn negative_stance_patterns() -> &'static Vec { + static PATS: OnceLock> = OnceLock::new(); + PATS.get_or_init(|| normalize_patterns(NEGATIVE_STANCE_PATTERN_TEXTS)) +} + +fn profanity_patterns() -> &'static Vec { + static PATS: OnceLock> = OnceLock::new(); + PATS.get_or_init(|| normalize_patterns(AGENT_DIRECTED_PROFANITY_PATTERN_TEXTS)) +} + +fn re_consecutive_q() -> &'static Regex { + static R: OnceLock = OnceLock::new(); + R.get_or_init(|| Regex::new(r"\?{2,}").unwrap()) +} +fn re_consecutive_e() -> &'static Regex { + static R: OnceLock = OnceLock::new(); + R.get_or_init(|| Regex::new(r"!{2,}").unwrap()) +} +fn re_mixed_punct() -> &'static Regex { + static R: OnceLock = OnceLock::new(); + R.get_or_init(|| Regex::new(r"[?!]{3,}").unwrap()) +} + +pub fn analyze_disengagement( + normalized_messages: &[(usize, 
&str, NormalizedMessage)], + char_ngram_threshold: f32, + token_cosine_threshold: f32, +) -> SignalGroup { + let mut group = SignalGroup::new("disengagement"); + + for (idx, role, norm_msg) in normalized_messages { + if *role != "human" { + continue; + } + + let text = &norm_msg.raw; + + // All-caps shouting check. + let alpha_chars: String = text.chars().filter(|c| c.is_alphabetic()).collect(); + if alpha_chars.chars().count() >= 10 { + let upper_count = alpha_chars.chars().filter(|c| c.is_uppercase()).count(); + let upper_ratio = upper_count as f32 / alpha_chars.chars().count() as f32; + if upper_ratio >= 0.8 { + let snippet: String = text.chars().take(50).collect(); + group.add_signal( + SignalInstance::new(SignalType::DisengagementNegativeStance, *idx, snippet) + .with_metadata(json!({ + "indicator_type": "all_caps", + "upper_ratio": upper_ratio, + })), + ); + } + } + + // Excessive consecutive punctuation. + let starts_with_positive = starts_with_prefix(text, POSITIVE_PREFIXES); + let cq = re_consecutive_q().find_iter(text).count(); + let ce = re_consecutive_e().find_iter(text).count(); + let mixed = re_mixed_punct().find_iter(text).count(); + if !starts_with_positive && (cq >= 1 || ce >= 1 || mixed >= 1) { + let snippet: String = text.chars().take(50).collect(); + group.add_signal( + SignalInstance::new(SignalType::DisengagementNegativeStance, *idx, snippet) + .with_metadata(json!({ + "indicator_type": "excessive_punctuation", + "consecutive_questions": cq, + "consecutive_exclamations": ce, + "mixed_punctuation": mixed, + })), + ); + } + + // Escalation patterns. + let mut found_escalation = false; + for pattern in escalation_patterns() { + if norm_msg.matches_normalized_pattern( + pattern, + char_ngram_threshold, + token_cosine_threshold, + ) { + group.add_signal( + SignalInstance::new( + SignalType::DisengagementEscalation, + *idx, + pattern.raw.clone(), + ) + .with_metadata(json!({"pattern_type": "escalation"})), + ); + found_escalation = true; + break; + } + } + + // Quit patterns (independent of escalation). + for pattern in quit_patterns() { + if norm_msg.matches_normalized_pattern( + pattern, + char_ngram_threshold, + token_cosine_threshold, + ) { + group.add_signal( + SignalInstance::new(SignalType::DisengagementQuit, *idx, pattern.raw.clone()) + .with_metadata(json!({"pattern_type": "quit"})), + ); + break; + } + } + + // Profanity (more specific) before generic negative stance. 
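+        // A profanity hit short-circuits the generic complaint scan below, so
+        // a profane complaint is reported once as profanity rather than again
+        // as a generic complaint.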
+ let mut found_profanity = false; + for pattern in profanity_patterns() { + if norm_msg.matches_normalized_pattern( + pattern, + char_ngram_threshold, + token_cosine_threshold, + ) { + group.add_signal( + SignalInstance::new( + SignalType::DisengagementNegativeStance, + *idx, + pattern.raw.clone(), + ) + .with_metadata(json!({ + "indicator_type": "profanity", + "pattern": pattern.raw, + })), + ); + found_profanity = true; + break; + } + } + + if !found_escalation && !found_profanity { + for pattern in negative_stance_patterns() { + if norm_msg.matches_normalized_pattern( + pattern, + char_ngram_threshold, + token_cosine_threshold, + ) { + group.add_signal( + SignalInstance::new( + SignalType::DisengagementNegativeStance, + *idx, + pattern.raw.clone(), + ) + .with_metadata(json!({ + "indicator_type": "complaint", + "pattern": pattern.raw, + })), + ); + break; + } + } + } + } + + group +} + +#[cfg(test)] +mod tests { + use super::*; + + fn nm(s: &str) -> NormalizedMessage { + NormalizedMessage::from_text(s, 2000) + } + + #[test] + fn detects_human_escalation_request() { + let msgs = vec![( + 0usize, + "human", + nm("This is taking forever, get me a human"), + )]; + let g = analyze_disengagement(&msgs, 0.65, 0.6); + assert!(g + .signals + .iter() + .any(|s| matches!(s.signal_type, SignalType::DisengagementEscalation))); + } + + #[test] + fn detects_quit_intent() { + let msgs = vec![(0usize, "human", nm("Forget it, I give up"))]; + let g = analyze_disengagement(&msgs, 0.65, 0.6); + assert!(g + .signals + .iter() + .any(|s| matches!(s.signal_type, SignalType::DisengagementQuit))); + } + + #[test] + fn detects_negative_stance_complaint() { + let msgs = vec![(0usize, "human", nm("This is useless"))]; + let g = analyze_disengagement(&msgs, 0.65, 0.6); + assert!(g + .signals + .iter() + .any(|s| matches!(s.signal_type, SignalType::DisengagementNegativeStance))); + } + + #[test] + fn detects_excessive_punctuation_as_negative_stance() { + let msgs = vec![(0usize, "human", nm("WHY isn't this working???"))]; + let g = analyze_disengagement(&msgs, 0.65, 0.6); + assert!(g + .signals + .iter() + .any(|s| matches!(s.signal_type, SignalType::DisengagementNegativeStance))); + } + + #[test] + fn positive_excitement_is_not_disengagement() { + let msgs = vec![(0usize, "human", nm("Yes!! That's perfect!!!"))]; + let g = analyze_disengagement(&msgs, 0.65, 0.6); + assert!(g + .signals + .iter() + .all(|s| !matches!(s.signal_type, SignalType::DisengagementNegativeStance))); + } +} diff --git a/crates/brightstaff/src/signals/interaction/misalignment.rs b/crates/brightstaff/src/signals/interaction/misalignment.rs new file mode 100644 index 00000000..3dcf3ddd --- /dev/null +++ b/crates/brightstaff/src/signals/interaction/misalignment.rs @@ -0,0 +1,338 @@ +//! Misalignment signals: corrections, rephrases, clarifications. +//! +//! Direct port of `signals/interaction/misalignment.py`. 
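+//!
+//! A minimal usage sketch (message indices and thresholds are illustrative,
+//! not tuned values):
+//!
+//! ```ignore
+//! use crate::signals::text_processing::NormalizedMessage;
+//!
+//! let msgs = vec![
+//!     (0usize, "human", NormalizedMessage::from_text("Show me my orders", 2000)),
+//!     (1, "gpt", NormalizedMessage::from_text("Here are your invoices", 2000)),
+//!     (2, "human", NormalizedMessage::from_text("No, I meant my orders", 2000)),
+//! ];
+//! let group = analyze_misalignment(&msgs, 0.65, 0.6);
+//! assert!(group.count >= 1); // the explicit correction on message 2 is detected
+//! ```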
+
+use std::sync::OnceLock;
+
+use serde_json::json;
+
+use super::constants::{stopwords, CONFIRMATION_PREFIXES};
+use crate::signals::schemas::{SignalGroup, SignalInstance, SignalType};
+use crate::signals::text_processing::{normalize_patterns, NormalizedMessage, NormalizedPattern};
+
+const CORRECTION_PATTERN_TEXTS: &[&str] = &[
+    "no, i meant",
+    "no i meant",
+    "no, i said",
+    "no i said",
+    "no, i asked",
+    "no i asked",
+    "nah, i meant",
+    "nope, i meant",
+    "not what i said",
+    "not what i asked",
+    "that's not what i said",
+    "that's not what i asked",
+    "that's not what i meant",
+    "thats not what i said",
+    "thats not what i asked",
+    "thats not what i meant",
+    "that's not what you",
+    "no that's not what i",
+    "no, that's not what i",
+    "you're not quite right",
+    "youre not quite right",
+    "you're not exactly right",
+    "youre not exactly right",
+    "you're wrong about",
+    "youre wrong about",
+    "i just said",
+    "i already said",
+    "i already told you",
+];
+
+const REPHRASE_PATTERN_TEXTS: &[&str] = &[
+    "let me rephrase",
+    "let me explain again",
+    "what i'm trying to say",
+    "what i'm saying is",
+    "in other words",
+];
+
+const CLARIFICATION_PATTERN_TEXTS: &[&str] = &[
+    "i don't understand",
+    "don't understand",
+    "not understanding",
+    "can't understand",
+    "don't get it",
+    "don't follow",
+    "i'm confused",
+    "so confused",
+    "makes no sense",
+    "doesn't make sense",
+    "not making sense",
+    "what do you mean",
+    "what does that mean",
+    "what are you saying",
+    "i'm lost",
+    "totally lost",
+    "lost me",
+    "no clue what you",
+    "no idea what you",
+    "no clue what that",
+    "no idea what that",
+    "come again",
+    "say that again",
+    "repeat that",
+    "trouble following",
+    "hard to follow",
+    "can't follow",
+];
+
+fn correction_patterns() -> &'static Vec<NormalizedPattern> {
+    static PATS: OnceLock<Vec<NormalizedPattern>> = OnceLock::new();
+    PATS.get_or_init(|| normalize_patterns(CORRECTION_PATTERN_TEXTS))
+}
+
+fn rephrase_patterns() -> &'static Vec<NormalizedPattern> {
+    static PATS: OnceLock<Vec<NormalizedPattern>> = OnceLock::new();
+    PATS.get_or_init(|| normalize_patterns(REPHRASE_PATTERN_TEXTS))
+}
+
+fn clarification_patterns() -> &'static Vec<NormalizedPattern> {
+    static PATS: OnceLock<Vec<NormalizedPattern>> = OnceLock::new();
+    PATS.get_or_init(|| normalize_patterns(CLARIFICATION_PATTERN_TEXTS))
+}
+
+fn is_confirmation_message(text: &str) -> bool {
+    let lowered = text.to_lowercase();
+    let trimmed = lowered.trim();
+    CONFIRMATION_PREFIXES.iter().any(|p| trimmed.starts_with(p))
+}
+
+/// Detect whether two user messages appear to be rephrases of each other. 
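+///
+/// A hedged call-shape sketch (the threshold values here are illustrative,
+/// not the tuned defaults used by `analyze_misalignment`):
+///
+/// ```ignore
+/// let a = NormalizedMessage::from_text("show me my recent orders please", 2000);
+/// let b = NormalizedMessage::from_text("please show my recent orders", 2000);
+/// // overlap_threshold = 0.75, min_meaningful_tokens = 4, max_new_content_ratio = 0.5
+/// let similar = is_similar_rephrase(&a, &b, 0.75, 4, 0.5);
+/// ```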
+pub fn is_similar_rephrase(
+    norm_msg1: &NormalizedMessage,
+    norm_msg2: &NormalizedMessage,
+    overlap_threshold: f32,
+    min_meaningful_tokens: usize,
+    max_new_content_ratio: f32,
+) -> bool {
+    if norm_msg1.tokens.len() < 3 || norm_msg2.tokens.len() < 3 {
+        return false;
+    }
+    if is_confirmation_message(&norm_msg1.raw) {
+        return false;
+    }
+
+    let stops = stopwords();
+    let tokens1: std::collections::HashSet<&str> = norm_msg1
+        .tokens
+        .iter()
+        .filter(|t| !stops.contains(t.as_str()))
+        .map(|s| s.as_str())
+        .collect();
+    let tokens2: std::collections::HashSet<&str> = norm_msg2
+        .tokens
+        .iter()
+        .filter(|t| !stops.contains(t.as_str()))
+        .map(|s| s.as_str())
+        .collect();
+
+    if tokens1.len() < min_meaningful_tokens || tokens2.len() < min_meaningful_tokens {
+        return false;
+    }
+
+    let new_tokens: std::collections::HashSet<&&str> = tokens1.difference(&tokens2).collect();
+    let new_content_ratio = if tokens1.is_empty() {
+        0.0
+    } else {
+        new_tokens.len() as f32 / tokens1.len() as f32
+    };
+    if new_content_ratio > max_new_content_ratio {
+        return false;
+    }
+
+    let intersection = tokens1.intersection(&tokens2).count();
+    let min_size = tokens1.len().min(tokens2.len());
+    if min_size == 0 {
+        return false;
+    }
+    let overlap_ratio = intersection as f32 / min_size as f32;
+    overlap_ratio >= overlap_threshold
+}
+
+/// Analyze user messages for misalignment signals.
+pub fn analyze_misalignment(
+    normalized_messages: &[(usize, &str, NormalizedMessage)],
+    char_ngram_threshold: f32,
+    token_cosine_threshold: f32,
+) -> SignalGroup {
+    let mut group = SignalGroup::new("misalignment");
+
+    let mut prev_user_idx: Option<usize> = None;
+    let mut prev_user_msg: Option<&NormalizedMessage> = None;
+
+    for (idx, role, norm_msg) in normalized_messages {
+        if *role != "human" {
+            continue;
+        }
+
+        let mut found_in_turn = false;
+
+        for pattern in correction_patterns() {
+            if norm_msg.matches_normalized_pattern(
+                pattern,
+                char_ngram_threshold,
+                token_cosine_threshold,
+            ) {
+                group.add_signal(
+                    SignalInstance::new(
+                        SignalType::MisalignmentCorrection,
+                        *idx,
+                        pattern.raw.clone(),
+                    )
+                    .with_metadata(json!({"pattern_type": "correction"})),
+                );
+                found_in_turn = true;
+                break;
+            }
+        }
+
+        if found_in_turn {
+            prev_user_idx = Some(*idx);
+            prev_user_msg = Some(norm_msg);
+            continue;
+        }
+
+        for pattern in rephrase_patterns() {
+            if norm_msg.matches_normalized_pattern(
+                pattern,
+                char_ngram_threshold,
+                token_cosine_threshold,
+            ) {
+                group.add_signal(
+                    SignalInstance::new(
+                        SignalType::MisalignmentRephrase,
+                        *idx,
+                        pattern.raw.clone(),
+                    )
+                    .with_metadata(json!({"pattern_type": "rephrase"})),
+                );
+                found_in_turn = true;
+                break;
+            }
+        }
+
+        if found_in_turn {
+            prev_user_idx = Some(*idx);
+            prev_user_msg = Some(norm_msg);
+            continue;
+        }
+
+        for pattern in clarification_patterns() {
+            if norm_msg.matches_normalized_pattern(
+                pattern,
+                char_ngram_threshold,
+                token_cosine_threshold,
+            ) {
+                group.add_signal(
+                    SignalInstance::new(
+                        SignalType::MisalignmentClarification,
+                        *idx,
+                        pattern.raw.clone(),
+                    )
+                    .with_metadata(json!({"pattern_type": "clarification"})),
+                );
+                found_in_turn = true;
+                break;
+            }
+        }
+
+        if found_in_turn {
+            prev_user_idx = Some(*idx);
+            prev_user_msg = Some(norm_msg);
+            continue;
+        }
+
+        // Semantic rephrase vs the previous user message (recent only). 
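+        // The index-gap guard below (<= 3) keeps comparisons to the
+        // immediately preceding exchange; older user messages are treated as
+        // new topics rather than rephrases.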
+ if let (Some(prev_idx), Some(prev_msg)) = (prev_user_idx, prev_user_msg) { + let turns_between = idx.saturating_sub(prev_idx); + if turns_between <= 3 && is_similar_rephrase(norm_msg, prev_msg, 0.75, 4, 0.5) { + group.add_signal( + SignalInstance::new( + SignalType::MisalignmentRephrase, + *idx, + "[similar rephrase detected]", + ) + .with_confidence(0.8) + .with_metadata(json!({ + "pattern_type": "semantic_rephrase", + "compared_to": prev_idx, + })), + ); + } + } + + prev_user_idx = Some(*idx); + prev_user_msg = Some(norm_msg); + } + + group +} + +#[cfg(test)] +mod tests { + use super::*; + + fn nm(s: &str) -> NormalizedMessage { + NormalizedMessage::from_text(s, 2000) + } + + fn make(items: &[(&'static str, &str)]) -> Vec<(usize, &'static str, NormalizedMessage)> { + items + .iter() + .enumerate() + .map(|(i, (role, text))| (i, *role, nm(text))) + .collect() + } + + #[test] + fn detects_explicit_correction() { + let msgs = make(&[ + ("human", "Show me my orders"), + ("gpt", "Sure, here are your invoices"), + ("human", "No, I meant my recent orders"), + ]); + let g = analyze_misalignment(&msgs, 0.65, 0.6); + assert!(g + .signals + .iter() + .any(|s| matches!(s.signal_type, SignalType::MisalignmentCorrection))); + } + + #[test] + fn detects_rephrase_marker() { + let msgs = make(&[ + ("human", "Show me X"), + ("gpt", "Sure"), + ("human", "Let me rephrase: I want X grouped by date"), + ]); + let g = analyze_misalignment(&msgs, 0.65, 0.6); + assert!(g + .signals + .iter() + .any(|s| matches!(s.signal_type, SignalType::MisalignmentRephrase))); + } + + #[test] + fn detects_clarification_request() { + let msgs = make(&[ + ("human", "Run the report"), + ("gpt", "Foobar quux baz."), + ("human", "I don't understand what you mean"), + ]); + let g = analyze_misalignment(&msgs, 0.65, 0.6); + assert!(g + .signals + .iter() + .any(|s| matches!(s.signal_type, SignalType::MisalignmentClarification))); + } + + #[test] + fn confirmation_is_not_a_rephrase() { + let m1 = nm("Yes, that's correct, please proceed with the order"); + let m2 = nm("please proceed with the order for the same product"); + assert!(!is_similar_rephrase(&m1, &m2, 0.75, 4, 0.5)); + } +} diff --git a/crates/brightstaff/src/signals/interaction/mod.rs b/crates/brightstaff/src/signals/interaction/mod.rs new file mode 100644 index 00000000..b60a6748 --- /dev/null +++ b/crates/brightstaff/src/signals/interaction/mod.rs @@ -0,0 +1,10 @@ +//! Interaction signals: misalignment, stagnation, disengagement, satisfaction. +//! +//! These signals capture how the dialogue itself unfolds (semantic alignment, +//! progress, engagement, closure) independent of tool execution outcomes. + +pub mod constants; +pub mod disengagement; +pub mod misalignment; +pub mod satisfaction; +pub mod stagnation; diff --git a/crates/brightstaff/src/signals/interaction/satisfaction.rs b/crates/brightstaff/src/signals/interaction/satisfaction.rs new file mode 100644 index 00000000..ad719960 --- /dev/null +++ b/crates/brightstaff/src/signals/interaction/satisfaction.rs @@ -0,0 +1,177 @@ +//! Satisfaction signals: gratitude, confirmation, success. +//! +//! Direct port of `signals/interaction/satisfaction.py`. 
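+//!
+//! Call-shape sketch (roles and thresholds are illustrative; this mirrors the
+//! `detects_success` test below):
+//!
+//! ```ignore
+//! use crate::signals::text_processing::NormalizedMessage;
+//!
+//! let msgs = vec![(0usize, "human", NormalizedMessage::from_text("Great, it worked!", 2000))];
+//! let group = analyze_satisfaction(&msgs, 0.65, 0.6);
+//! ```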
+
+use std::sync::OnceLock;
+
+use serde_json::json;
+
+use crate::signals::schemas::{SignalGroup, SignalInstance, SignalType};
+use crate::signals::text_processing::{normalize_patterns, NormalizedMessage, NormalizedPattern};
+
+const GRATITUDE_PATTERN_TEXTS: &[&str] = &[
+    "that's helpful",
+    "that helps",
+    "this helps",
+    "appreciate it",
+    "appreciate that",
+    "that's perfect",
+    "exactly what i needed",
+    "just what i needed",
+    "you're the best",
+    "you rock",
+    "you're awesome",
+    "you're amazing",
+    "you're great",
+];
+
+const CONFIRMATION_PATTERN_TEXTS: &[&str] = &[
+    "that works",
+    "this works",
+    "that's great",
+    "that's amazing",
+    "this is great",
+    "that's awesome",
+    "love it",
+    "love this",
+    "love that",
+];
+
+const SUCCESS_PATTERN_TEXTS: &[&str] = &[
+    "it worked",
+    "that worked",
+    "this worked",
+    "it's working",
+    "that's working",
+    "this is working",
+];
+
+fn gratitude_patterns() -> &'static Vec<NormalizedPattern> {
+    static PATS: OnceLock<Vec<NormalizedPattern>> = OnceLock::new();
+    PATS.get_or_init(|| normalize_patterns(GRATITUDE_PATTERN_TEXTS))
+}
+
+fn confirmation_patterns() -> &'static Vec<NormalizedPattern> {
+    static PATS: OnceLock<Vec<NormalizedPattern>> = OnceLock::new();
+    PATS.get_or_init(|| normalize_patterns(CONFIRMATION_PATTERN_TEXTS))
+}
+
+fn success_patterns() -> &'static Vec<NormalizedPattern> {
+    static PATS: OnceLock<Vec<NormalizedPattern>> = OnceLock::new();
+    PATS.get_or_init(|| normalize_patterns(SUCCESS_PATTERN_TEXTS))
+}
+
+pub fn analyze_satisfaction(
+    normalized_messages: &[(usize, &str, NormalizedMessage)],
+    char_ngram_threshold: f32,
+    token_cosine_threshold: f32,
+) -> SignalGroup {
+    let mut group = SignalGroup::new("satisfaction");
+
+    for (idx, role, norm_msg) in normalized_messages {
+        if *role != "human" {
+            continue;
+        }
+
+        let mut found = false;
+
+        for pattern in gratitude_patterns() {
+            if norm_msg.matches_normalized_pattern(
+                pattern,
+                char_ngram_threshold,
+                token_cosine_threshold,
+            ) {
+                group.add_signal(
+                    SignalInstance::new(
+                        SignalType::SatisfactionGratitude,
+                        *idx,
+                        pattern.raw.clone(),
+                    )
+                    .with_metadata(json!({"pattern_type": "gratitude"})),
+                );
+                found = true;
+                break;
+            }
+        }
+        if found {
+            continue;
+        }
+
+        for pattern in confirmation_patterns() {
+            if norm_msg.matches_normalized_pattern(
+                pattern,
+                char_ngram_threshold,
+                token_cosine_threshold,
+            ) {
+                group.add_signal(
+                    SignalInstance::new(
+                        SignalType::SatisfactionConfirmation,
+                        *idx,
+                        pattern.raw.clone(),
+                    )
+                    .with_metadata(json!({"pattern_type": "confirmation"})),
+                );
+                found = true;
+                break;
+            }
+        }
+        if found {
+            continue;
+        }
+
+        for pattern in success_patterns() {
+            if norm_msg.matches_normalized_pattern(
+                pattern,
+                char_ngram_threshold,
+                token_cosine_threshold,
+            ) {
+                group.add_signal(
+                    SignalInstance::new(SignalType::SatisfactionSuccess, *idx, pattern.raw.clone())
+                        .with_metadata(json!({"pattern_type": "success"})),
+                );
+                break;
+            }
+        }
+    }
+
+    group
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn nm(s: &str) -> NormalizedMessage {
+        NormalizedMessage::from_text(s, 2000)
+    }
+
+    #[test]
+    fn detects_gratitude() {
+        let msgs = vec![(0usize, "human", nm("That's perfect, appreciate it!"))];
+        let g = analyze_satisfaction(&msgs, 0.65, 0.6);
+        assert!(g
+            .signals
+            .iter()
+            .any(|s| matches!(s.signal_type, SignalType::SatisfactionGratitude)));
+    }
+
+    #[test]
+    fn detects_confirmation() {
+        let msgs = vec![(0usize, "human", nm("That works for me, thanks"))];
+        let g = analyze_satisfaction(&msgs, 0.65, 0.6);
+        assert!(g
+            .signals
+            .iter()
+            .any(|s| matches!(s.signal_type, SignalType::SatisfactionConfirmation)));
+    }
+
+    #[test]
+    fn 
detects_success() { + let msgs = vec![(0usize, "human", nm("Great, it worked!"))]; + let g = analyze_satisfaction(&msgs, 0.65, 0.6); + assert!(g + .signals + .iter() + .any(|s| matches!(s.signal_type, SignalType::SatisfactionSuccess))); + } +} diff --git a/crates/brightstaff/src/signals/interaction/stagnation.rs b/crates/brightstaff/src/signals/interaction/stagnation.rs new file mode 100644 index 00000000..d7d03c80 --- /dev/null +++ b/crates/brightstaff/src/signals/interaction/stagnation.rs @@ -0,0 +1,241 @@ +//! Stagnation signals: dragging (turn-count efficiency) and repetition. +//! +//! Direct port of `signals/interaction/stagnation.py`. + +use serde_json::json; + +use super::constants::{starts_with_prefix, POSITIVE_PREFIXES}; +use crate::signals::schemas::{SignalGroup, SignalInstance, SignalType, TurnMetrics}; +use crate::signals::text_processing::NormalizedMessage; + +/// Adapter row used by stagnation::dragging detector. Mirrors the ShareGPT +/// `{"from": role, "value": text}` shape used in the Python reference. +pub struct ShareGptMsg<'a> { + pub from: &'a str, +} + +pub fn analyze_dragging( + messages: &[ShareGptMsg<'_>], + baseline_turns: usize, + efficiency_threshold: f32, +) -> (SignalGroup, TurnMetrics) { + let mut group = SignalGroup::new("stagnation"); + + let mut user_turns: usize = 0; + let mut assistant_turns: usize = 0; + for m in messages { + match m.from { + "human" => user_turns += 1, + "gpt" => assistant_turns += 1, + _ => {} + } + } + + let total_turns = user_turns; + let efficiency_score: f32 = if total_turns == 0 || total_turns <= baseline_turns { + 1.0 + } else { + let excess = (total_turns - baseline_turns) as f32; + 1.0 / (1.0 + excess * 0.25) + }; + + let is_dragging = efficiency_score < efficiency_threshold; + let metrics = TurnMetrics { + total_turns, + user_turns, + assistant_turns, + is_dragging, + efficiency_score, + }; + + if is_dragging { + let last_idx = messages.len().saturating_sub(1); + group.add_signal( + SignalInstance::new( + SignalType::StagnationDragging, + last_idx, + format!( + "Conversation dragging: {} turns (efficiency: {:.2})", + total_turns, efficiency_score + ), + ) + .with_confidence(1.0 - efficiency_score) + .with_metadata(json!({ + "total_turns": total_turns, + "efficiency_score": efficiency_score, + "baseline_turns": baseline_turns, + })), + ); + } + + (group, metrics) +} + +pub fn analyze_repetition( + normalized_messages: &[(usize, &str, NormalizedMessage)], + lookback: usize, + exact_threshold: f32, + near_duplicate_threshold: f32, +) -> SignalGroup { + let mut group = SignalGroup::new("stagnation"); + + // We keep references into `normalized_messages`. Since `normalized_messages` + // is borrowed for the whole function, this avoids cloning. + let mut prev_human: Vec<(usize, &NormalizedMessage)> = Vec::new(); + let mut prev_gpt: Vec<(usize, &NormalizedMessage)> = Vec::new(); + + for (idx, role, norm_msg) in normalized_messages { + if *role != "human" && *role != "gpt" { + continue; + } + + // Skip human positive-prefix messages; they're naturally repetitive. 
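+        // Acknowledgements that open with a positive prefix (see
+        // POSITIVE_PREFIXES in the constants module) recur in normal turns,
+        // so counting them as repetition would create false positives.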
+ if *role == "human" && starts_with_prefix(&norm_msg.raw, POSITIVE_PREFIXES) { + prev_human.push((*idx, norm_msg)); + continue; + } + + if norm_msg.tokens.len() < 5 { + if *role == "human" { + prev_human.push((*idx, norm_msg)); + } else { + prev_gpt.push((*idx, norm_msg)); + } + continue; + } + + let prev = if *role == "human" { + &prev_human + } else { + &prev_gpt + }; + let start = prev.len().saturating_sub(lookback); + let mut matched = false; + for (prev_idx, prev_msg) in &prev[start..] { + if prev_msg.tokens.len() < 5 { + continue; + } + let similarity = norm_msg.ngram_similarity_with_message(prev_msg); + if similarity >= exact_threshold { + group.add_signal( + SignalInstance::new( + SignalType::StagnationRepetition, + *idx, + format!("Exact repetition with message {}", prev_idx), + ) + .with_confidence(similarity) + .with_metadata(json!({ + "repetition_type": "exact", + "compared_to": prev_idx, + "similarity": similarity, + "role": role, + })), + ); + matched = true; + break; + } else if similarity >= near_duplicate_threshold { + group.add_signal( + SignalInstance::new( + SignalType::StagnationRepetition, + *idx, + format!("Near-duplicate with message {}", prev_idx), + ) + .with_confidence(similarity) + .with_metadata(json!({ + "repetition_type": "near_duplicate", + "compared_to": prev_idx, + "similarity": similarity, + "role": role, + })), + ); + matched = true; + break; + } + } + let _ = matched; + + if *role == "human" { + prev_human.push((*idx, norm_msg)); + } else { + prev_gpt.push((*idx, norm_msg)); + } + } + + group +} + +/// Combined stagnation analyzer: dragging + repetition. +pub fn analyze_stagnation( + messages: &[ShareGptMsg<'_>], + normalized_messages: &[(usize, &str, NormalizedMessage)], + baseline_turns: usize, +) -> (SignalGroup, TurnMetrics) { + let (dragging_group, metrics) = analyze_dragging(messages, baseline_turns, 0.5); + let repetition_group = analyze_repetition(normalized_messages, 2, 0.95, 0.85); + + let mut combined = SignalGroup::new("stagnation"); + for s in dragging_group.signals.iter().cloned() { + combined.add_signal(s); + } + for s in repetition_group.signals.iter().cloned() { + combined.add_signal(s); + } + (combined, metrics) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn nm(s: &str) -> NormalizedMessage { + NormalizedMessage::from_text(s, 2000) + } + + #[test] + fn dragging_after_many_user_turns() { + let msgs: Vec<_> = (0..15) + .flat_map(|_| [ShareGptMsg { from: "human" }, ShareGptMsg { from: "gpt" }]) + .collect(); + let (g, m) = analyze_dragging(&msgs, 5, 0.5); + assert!(m.is_dragging); + assert!(g + .signals + .iter() + .any(|s| matches!(s.signal_type, SignalType::StagnationDragging))); + } + + #[test] + fn no_dragging_below_baseline() { + let msgs = vec![ + ShareGptMsg { from: "human" }, + ShareGptMsg { from: "gpt" }, + ShareGptMsg { from: "human" }, + ShareGptMsg { from: "gpt" }, + ]; + let (g, m) = analyze_dragging(&msgs, 5, 0.5); + assert!(!m.is_dragging); + assert!(g.signals.is_empty()); + } + + #[test] + fn detects_exact_repetition_in_user_messages() { + let n = vec![ + ( + 0usize, + "human", + nm("This widget is broken and needs repair right now"), + ), + (1, "gpt", nm("Sorry to hear that. 
Let me look into it.")), + ( + 2, + "human", + nm("This widget is broken and needs repair right now"), + ), + ]; + let g = analyze_repetition(&n, 2, 0.95, 0.85); + assert!(g + .signals + .iter() + .any(|s| matches!(s.signal_type, SignalType::StagnationRepetition))); + } +} diff --git a/crates/brightstaff/src/signals/mod.rs b/crates/brightstaff/src/signals/mod.rs index 83db943e..d96d3bf0 100644 --- a/crates/brightstaff/src/signals/mod.rs +++ b/crates/brightstaff/src/signals/mod.rs @@ -1,3 +1,26 @@ -mod analyzer; +//! Plano signals: behavioral quality indicators for agent interactions. +//! +//! This is a Rust port of the paper-aligned Python reference implementation at +//! `https://github.com/katanemo/signals` (or `/Users/shashmi/repos/signals`). +//! +//! Three layers of signals are detected from a conversation transcript: +//! +//! - **Interaction**: misalignment, stagnation, disengagement, satisfaction +//! - **Execution**: failure, loops +//! - **Environment**: exhaustion +//! +//! See `SignalType` for the full hierarchy. -pub use analyzer::*; +pub mod analyzer; +pub mod environment; +pub mod execution; +pub mod interaction; +pub mod otel; +pub mod schemas; +pub mod text_processing; + +pub use analyzer::{SignalAnalyzer, FLAG_MARKER}; +pub use schemas::{ + EnvironmentSignals, ExecutionSignals, InteractionQuality, InteractionSignals, SignalGroup, + SignalInstance, SignalLayer, SignalReport, SignalType, TurnMetrics, +}; diff --git a/crates/brightstaff/src/signals/otel.rs b/crates/brightstaff/src/signals/otel.rs new file mode 100644 index 00000000..deb3c1b5 --- /dev/null +++ b/crates/brightstaff/src/signals/otel.rs @@ -0,0 +1,241 @@ +//! Helpers for emitting `SignalReport` data to OpenTelemetry spans. +//! +//! Two sets of attributes are emitted: +//! +//! - **Legacy** keys under `signals.*` (e.g. `signals.frustration.count`), +//! computed from the new layered counts. Preserved for one release for +//! backward compatibility with existing dashboards. +//! - **New** layered keys (e.g. `signals.interaction.misalignment.count`), +//! one set of `count`/`severity` attributes per category, plus per-instance +//! span events named `signal.`. + +use opentelemetry::trace::SpanRef; +use opentelemetry::KeyValue; + +use crate::signals::schemas::{SignalGroup, SignalReport, SignalType}; + +/// Emit both legacy and layered OTel attributes/events for a `SignalReport`. +/// +/// Returns `true` if any "concerning" signal was found, mirroring the previous +/// behavior used to flag the span operation name. 
+pub fn emit_signals_to_span(span: &SpanRef<'_>, report: &SignalReport) -> bool { + emit_overall(span, report); + emit_layered_attributes(span, report); + emit_legacy_attributes(span, report); + emit_signal_events(span, report); + + is_concerning(report) +} + +fn emit_overall(span: &SpanRef<'_>, report: &SignalReport) { + span.set_attribute(KeyValue::new( + "signals.quality", + report.overall_quality.as_str().to_string(), + )); + span.set_attribute(KeyValue::new( + "signals.quality_score", + report.quality_score as f64, + )); + span.set_attribute(KeyValue::new( + "signals.turn_count", + report.turn_metrics.total_turns as i64, + )); + span.set_attribute(KeyValue::new( + "signals.efficiency_score", + report.turn_metrics.efficiency_score as f64, + )); +} + +fn emit_group(span: &SpanRef<'_>, prefix: &str, group: &SignalGroup) { + if group.count == 0 { + return; + } + span.set_attribute(KeyValue::new( + format!("{}.count", prefix), + group.count as i64, + )); + span.set_attribute(KeyValue::new( + format!("{}.severity", prefix), + group.severity as i64, + )); +} + +fn emit_layered_attributes(span: &SpanRef<'_>, report: &SignalReport) { + emit_group( + span, + "signals.interaction.misalignment", + &report.interaction.misalignment, + ); + emit_group( + span, + "signals.interaction.stagnation", + &report.interaction.stagnation, + ); + emit_group( + span, + "signals.interaction.disengagement", + &report.interaction.disengagement, + ); + emit_group( + span, + "signals.interaction.satisfaction", + &report.interaction.satisfaction, + ); + emit_group(span, "signals.execution.failure", &report.execution.failure); + emit_group(span, "signals.execution.loops", &report.execution.loops); + emit_group( + span, + "signals.environment.exhaustion", + &report.environment.exhaustion, + ); +} + +fn count_of(report: &SignalReport, t: SignalType) -> usize { + report.iter_signals().filter(|s| s.signal_type == t).count() +} + +/// Emit the legacy attribute keys consumed by existing dashboards. These are +/// derived from the new `SignalReport` so no detector contract is broken. +fn emit_legacy_attributes(span: &SpanRef<'_>, report: &SignalReport) { + use crate::tracing::signals as legacy; + + // signals.follow_up.repair.{count,ratio} - misalignment proxies repairs. + let repair_count = report.interaction.misalignment.count; + let user_turns = report.turn_metrics.user_turns.max(1) as f32; + if repair_count > 0 { + span.set_attribute(KeyValue::new(legacy::REPAIR_COUNT, repair_count as i64)); + let ratio = repair_count as f32 / user_turns; + span.set_attribute(KeyValue::new(legacy::REPAIR_RATIO, format!("{:.3}", ratio))); + } + + // signals.frustration.{count,severity} - disengagement.negative_stance is + // the closest legacy analog of "frustration". + let frustration_count = count_of(report, SignalType::DisengagementNegativeStance); + if frustration_count > 0 { + span.set_attribute(KeyValue::new( + legacy::FRUSTRATION_COUNT, + frustration_count as i64, + )); + let severity = match frustration_count { + 0 => 0, + 1..=2 => 1, + 3..=4 => 2, + _ => 3, + }; + span.set_attribute(KeyValue::new(legacy::FRUSTRATION_SEVERITY, severity as i64)); + } + + // signals.repetition.count - stagnation (repetition + dragging). + if report.interaction.stagnation.count > 0 { + span.set_attribute(KeyValue::new( + legacy::REPETITION_COUNT, + report.interaction.stagnation.count as i64, + )); + } + + // signals.escalation.requested - any escalation/quit signal. 
+    let escalated = report.interaction.disengagement.signals.iter().any(|s| {
+        matches!(
+            s.signal_type,
+            SignalType::DisengagementEscalation | SignalType::DisengagementQuit
+        )
+    });
+    if escalated {
+        span.set_attribute(KeyValue::new(legacy::ESCALATION_REQUESTED, true));
+    }
+
+    // signals.positive_feedback.count - satisfaction signals.
+    if report.interaction.satisfaction.count > 0 {
+        span.set_attribute(KeyValue::new(
+            legacy::POSITIVE_FEEDBACK_COUNT,
+            report.interaction.satisfaction.count as i64,
+        ));
+    }
+}
+
+fn emit_signal_events(span: &SpanRef<'_>, report: &SignalReport) {
+    for sig in report.iter_signals() {
+        let event_name = format!("signal.{}", sig.signal_type.as_str());
+        let mut attrs: Vec<KeyValue> = vec![
+            KeyValue::new("signal.type", sig.signal_type.as_str().to_string()),
+            KeyValue::new("signal.message_index", sig.message_index as i64),
+            KeyValue::new("signal.confidence", sig.confidence as f64),
+        ];
+        if !sig.snippet.is_empty() {
+            attrs.push(KeyValue::new("signal.snippet", sig.snippet.clone()));
+        }
+        if !sig.metadata.is_null() {
+            attrs.push(KeyValue::new("signal.metadata", sig.metadata.to_string()));
+        }
+        span.add_event(event_name, attrs);
+    }
+}
+
+fn is_concerning(report: &SignalReport) -> bool {
+    use crate::signals::schemas::InteractionQuality;
+    if matches!(
+        report.overall_quality,
+        InteractionQuality::Poor | InteractionQuality::Severe
+    ) {
+        return true;
+    }
+    if report.interaction.disengagement.count > 0 {
+        return true;
+    }
+    if report.interaction.stagnation.count > 2 {
+        return true;
+    }
+    if report.execution.failure.count > 0 || report.execution.loops.count > 0 {
+        return true;
+    }
+    false
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::signals::schemas::{
+        EnvironmentSignals, ExecutionSignals, InteractionQuality, InteractionSignals, SignalGroup,
+        SignalInstance, SignalReport, SignalType, TurnMetrics,
+    };
+
+    fn report_with_escalation() -> SignalReport {
+        let mut diseng = SignalGroup::new("disengagement");
+        diseng.add_signal(SignalInstance::new(
+            SignalType::DisengagementEscalation,
+            3,
+            "get me a human",
+        ));
+        SignalReport {
+            interaction: InteractionSignals {
+                disengagement: diseng,
+                ..InteractionSignals::default()
+            },
+            execution: ExecutionSignals::default(),
+            environment: EnvironmentSignals::default(),
+            overall_quality: InteractionQuality::Severe,
+            quality_score: 0.0,
+            turn_metrics: TurnMetrics {
+                total_turns: 3,
+                user_turns: 2,
+                assistant_turns: 1,
+                is_dragging: false,
+                efficiency_score: 1.0,
+            },
+            summary: String::new(),
+        }
+    }
+
+    #[test]
+    fn is_concerning_flags_disengagement() {
+        let r = report_with_escalation();
+        assert!(is_concerning(&r));
+    }
+
+    #[test]
+    fn count_of_returns_per_type_count() {
+        let r = report_with_escalation();
+        assert_eq!(count_of(&r, SignalType::DisengagementEscalation), 1);
+        assert_eq!(count_of(&r, SignalType::DisengagementNegativeStance), 0);
+    }
+}
diff --git a/crates/brightstaff/src/signals/schemas.rs b/crates/brightstaff/src/signals/schemas.rs
new file mode 100644
index 00000000..47ea0836
--- /dev/null
+++ b/crates/brightstaff/src/signals/schemas.rs
@@ -0,0 +1,431 @@
+//! Data shapes for the signal analyzer.
+//!
+//! Mirrors `signals/schemas.py` from the reference implementation. Where the
+//! Python library exposes a `Dict[str, SignalGroup]` partitioned by category,
+//! the Rust port uses strongly-typed sub-structs (`InteractionSignals`,
+//! `ExecutionSignals`, `EnvironmentSignals`) for the same partitioning. 
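+//!
+//! Access-pattern sketch (field names as defined below; values illustrative):
+//!
+//! ```ignore
+//! let report = SignalReport::default();
+//! let total = report.interaction.misalignment.count
+//!     + report.execution.failure.count
+//!     + report.environment.exhaustion.count;
+//! ```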
+ +use serde::{Deserialize, Serialize}; + +/// Hierarchical signal type. The 20 leaf variants mirror the paper taxonomy +/// and the Python reference's `SignalType` string enum. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum SignalType { + // Interaction > Misalignment + MisalignmentCorrection, + MisalignmentRephrase, + MisalignmentClarification, + + // Interaction > Stagnation + StagnationDragging, + StagnationRepetition, + + // Interaction > Disengagement + DisengagementEscalation, + DisengagementQuit, + DisengagementNegativeStance, + + // Interaction > Satisfaction + SatisfactionGratitude, + SatisfactionConfirmation, + SatisfactionSuccess, + + // Execution > Failure + ExecutionFailureInvalidArgs, + ExecutionFailureBadQuery, + ExecutionFailureToolNotFound, + ExecutionFailureAuthMisuse, + ExecutionFailureStateError, + + // Execution > Loops + ExecutionLoopsRetry, + ExecutionLoopsParameterDrift, + ExecutionLoopsOscillation, + + // Environment > Exhaustion + EnvironmentExhaustionApiError, + EnvironmentExhaustionTimeout, + EnvironmentExhaustionRateLimit, + EnvironmentExhaustionNetwork, + EnvironmentExhaustionMalformed, + EnvironmentExhaustionContextOverflow, +} + +impl SignalType { + /// Dotted hierarchical string identifier, e.g. + /// `"interaction.misalignment.correction"`. Matches the Python reference's + /// `SignalType` enum *value* strings byte-for-byte. + pub fn as_str(&self) -> &'static str { + match self { + SignalType::MisalignmentCorrection => "interaction.misalignment.correction", + SignalType::MisalignmentRephrase => "interaction.misalignment.rephrase", + SignalType::MisalignmentClarification => "interaction.misalignment.clarification", + SignalType::StagnationDragging => "interaction.stagnation.dragging", + SignalType::StagnationRepetition => "interaction.stagnation.repetition", + SignalType::DisengagementEscalation => "interaction.disengagement.escalation", + SignalType::DisengagementQuit => "interaction.disengagement.quit", + SignalType::DisengagementNegativeStance => "interaction.disengagement.negative_stance", + SignalType::SatisfactionGratitude => "interaction.satisfaction.gratitude", + SignalType::SatisfactionConfirmation => "interaction.satisfaction.confirmation", + SignalType::SatisfactionSuccess => "interaction.satisfaction.success", + SignalType::ExecutionFailureInvalidArgs => "execution.failure.invalid_args", + SignalType::ExecutionFailureBadQuery => "execution.failure.bad_query", + SignalType::ExecutionFailureToolNotFound => "execution.failure.tool_not_found", + SignalType::ExecutionFailureAuthMisuse => "execution.failure.auth_misuse", + SignalType::ExecutionFailureStateError => "execution.failure.state_error", + SignalType::ExecutionLoopsRetry => "execution.loops.retry", + SignalType::ExecutionLoopsParameterDrift => "execution.loops.parameter_drift", + SignalType::ExecutionLoopsOscillation => "execution.loops.oscillation", + SignalType::EnvironmentExhaustionApiError => "environment.exhaustion.api_error", + SignalType::EnvironmentExhaustionTimeout => "environment.exhaustion.timeout", + SignalType::EnvironmentExhaustionRateLimit => "environment.exhaustion.rate_limit", + SignalType::EnvironmentExhaustionNetwork => "environment.exhaustion.network", + SignalType::EnvironmentExhaustionMalformed => { + "environment.exhaustion.malformed_response" + } + SignalType::EnvironmentExhaustionContextOverflow => { + "environment.exhaustion.context_overflow" + } + } + } + + pub fn layer(&self) -> SignalLayer { + match self { + 
SignalType::MisalignmentCorrection
+            | SignalType::MisalignmentRephrase
+            | SignalType::MisalignmentClarification
+            | SignalType::StagnationDragging
+            | SignalType::StagnationRepetition
+            | SignalType::DisengagementEscalation
+            | SignalType::DisengagementQuit
+            | SignalType::DisengagementNegativeStance
+            | SignalType::SatisfactionGratitude
+            | SignalType::SatisfactionConfirmation
+            | SignalType::SatisfactionSuccess => SignalLayer::Interaction,
+            SignalType::ExecutionFailureInvalidArgs
+            | SignalType::ExecutionFailureBadQuery
+            | SignalType::ExecutionFailureToolNotFound
+            | SignalType::ExecutionFailureAuthMisuse
+            | SignalType::ExecutionFailureStateError
+            | SignalType::ExecutionLoopsRetry
+            | SignalType::ExecutionLoopsParameterDrift
+            | SignalType::ExecutionLoopsOscillation => SignalLayer::Execution,
+            SignalType::EnvironmentExhaustionApiError
+            | SignalType::EnvironmentExhaustionTimeout
+            | SignalType::EnvironmentExhaustionRateLimit
+            | SignalType::EnvironmentExhaustionNetwork
+            | SignalType::EnvironmentExhaustionMalformed
+            | SignalType::EnvironmentExhaustionContextOverflow => SignalLayer::Environment,
+        }
+    }
+
+    /// Category name within the layer (e.g. `"misalignment"`, `"failure"`).
+    pub fn category(&self) -> &'static str {
+        // Strip the layer prefix and take everything before the next dot.
+        let s = self.as_str();
+        let after_layer = s.split_once('.').map(|(_, rest)| rest).unwrap_or(s);
+        after_layer
+            .split_once('.')
+            .map(|(c, _)| c)
+            .unwrap_or(after_layer)
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub enum SignalLayer {
+    Interaction,
+    Execution,
+    Environment,
+}
+
+impl SignalLayer {
+    pub fn as_str(&self) -> &'static str {
+        match self {
+            SignalLayer::Interaction => "interaction",
+            SignalLayer::Execution => "execution",
+            SignalLayer::Environment => "environment",
+        }
+    }
+}
+
+/// Overall quality assessment for an agent interaction session.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+pub enum InteractionQuality {
+    Excellent,
+    Good,
+    Neutral,
+    Poor,
+    Severe,
+}
+
+impl InteractionQuality {
+    pub fn as_str(&self) -> &'static str {
+        match self {
+            InteractionQuality::Excellent => "excellent",
+            InteractionQuality::Good => "good",
+            InteractionQuality::Neutral => "neutral",
+            InteractionQuality::Poor => "poor",
+            InteractionQuality::Severe => "severe",
+        }
+    }
+}
+
+/// A single detected signal instance.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct SignalInstance {
+    pub signal_type: SignalType,
+    /// Absolute index into the original conversation `Vec`.
+    pub message_index: usize,
+    pub snippet: String,
+    pub confidence: f32,
+    /// Free-form metadata payload mirroring the Python `Dict[str, Any]`.
+    /// Stored as a JSON object so we can faithfully reproduce the reference's
+    /// flexible per-detector metadata.
+    #[serde(default)]
+    pub metadata: serde_json::Value,
+}
+
+impl SignalInstance {
+    pub fn new(signal_type: SignalType, message_index: usize, snippet: impl Into<String>) -> Self {
+        Self {
+            signal_type,
+            message_index,
+            snippet: snippet.into(),
+            confidence: 1.0,
+            metadata: serde_json::Value::Object(serde_json::Map::new()),
+        }
+    }
+
+    pub fn with_confidence(mut self, c: f32) -> Self {
+        self.confidence = c;
+        self
+    }
+
+    pub fn with_metadata(mut self, m: serde_json::Value) -> Self {
+        self.metadata = m;
+        self
+    }
+}
+
+/// Aggregated signals for a specific category. 
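+///
+/// Severity is derived from `count` (0 = none, 1-2 = mild, 3-4 = moderate,
+/// 5+ = severe); a small sketch:
+///
+/// ```ignore
+/// let mut g = SignalGroup::new("misalignment");
+/// g.add_signal(SignalInstance::new(SignalType::MisalignmentCorrection, 2, "no, i meant"));
+/// assert_eq!(g.severity, 1);
+/// ```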
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct SignalGroup {
+    pub category: String,
+    pub count: usize,
+    pub signals: Vec<SignalInstance>,
+    /// Severity level (0-3: none, mild, moderate, severe).
+    pub severity: u8,
+}
+
+impl SignalGroup {
+    pub fn new(category: impl Into<String>) -> Self {
+        Self {
+            category: category.into(),
+            count: 0,
+            signals: Vec::new(),
+            severity: 0,
+        }
+    }
+
+    pub fn add_signal(&mut self, signal: SignalInstance) {
+        self.signals.push(signal);
+        self.count = self.signals.len();
+        self.update_severity();
+    }
+
+    fn update_severity(&mut self) {
+        self.severity = match self.count {
+            0 => 0,
+            1..=2 => 1,
+            3..=4 => 2,
+            _ => 3,
+        };
+    }
+}
+
+/// Turn count and efficiency metrics, used by stagnation.dragging.
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+pub struct TurnMetrics {
+    pub total_turns: usize,
+    pub user_turns: usize,
+    pub assistant_turns: usize,
+    pub is_dragging: bool,
+    pub efficiency_score: f32,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct InteractionSignals {
+    pub misalignment: SignalGroup,
+    pub stagnation: SignalGroup,
+    pub disengagement: SignalGroup,
+    pub satisfaction: SignalGroup,
+}
+
+impl Default for InteractionSignals {
+    fn default() -> Self {
+        Self {
+            misalignment: SignalGroup::new("misalignment"),
+            stagnation: SignalGroup::new("stagnation"),
+            disengagement: SignalGroup::new("disengagement"),
+            satisfaction: SignalGroup::new("satisfaction"),
+        }
+    }
+}
+
+impl InteractionSignals {
+    /// Ratio of misalignment instances to user turns. Used as a quality
+    /// scoring input and as a threshold for the "high misalignment rate"
+    /// summary callout. Mirrors `misalignment.count / max(user_turns, 1)`
+    /// from the Python reference's `_assess_quality` and `_generate_summary`.
+    pub fn misalignment_ratio(&self, user_turns: usize) -> f32 {
+        let denom = user_turns.max(1) as f32;
+        self.misalignment.count as f32 / denom
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ExecutionSignals {
+    pub failure: SignalGroup,
+    pub loops: SignalGroup,
+}
+
+impl Default for ExecutionSignals {
+    fn default() -> Self {
+        Self {
+            failure: SignalGroup::new("failure"),
+            loops: SignalGroup::new("loops"),
+        }
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct EnvironmentSignals {
+    pub exhaustion: SignalGroup,
+}
+
+impl Default for EnvironmentSignals {
+    fn default() -> Self {
+        Self {
+            exhaustion: SignalGroup::new("exhaustion"),
+        }
+    }
+}
+
+/// Complete signal analysis report for a conversation.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct SignalReport {
+    pub interaction: InteractionSignals,
+    pub execution: ExecutionSignals,
+    pub environment: EnvironmentSignals,
+    pub overall_quality: InteractionQuality,
+    pub quality_score: f32,
+    pub turn_metrics: TurnMetrics,
+    pub summary: String,
+}
+
+impl Default for SignalReport {
+    fn default() -> Self {
+        Self {
+            interaction: InteractionSignals::default(),
+            execution: ExecutionSignals::default(),
+            environment: EnvironmentSignals::default(),
+            overall_quality: InteractionQuality::Neutral,
+            quality_score: 50.0,
+            turn_metrics: TurnMetrics::default(),
+            summary: String::new(),
+        }
+    }
+}
+
+impl SignalReport {
+    /// Iterate over every `SignalInstance` across all layers and groups. 
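+    ///
+    /// Sketch: counting signals of one type across the whole report
+    /// (`report` is any `SignalReport`):
+    ///
+    /// ```ignore
+    /// let quits = report
+    ///     .iter_signals()
+    ///     .filter(|s| s.signal_type == SignalType::DisengagementQuit)
+    ///     .count();
+    /// ```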
+    pub fn iter_signals(&self) -> impl Iterator<Item = &SignalInstance> {
+        self.interaction
+            .misalignment
+            .signals
+            .iter()
+            .chain(self.interaction.stagnation.signals.iter())
+            .chain(self.interaction.disengagement.signals.iter())
+            .chain(self.interaction.satisfaction.signals.iter())
+            .chain(self.execution.failure.signals.iter())
+            .chain(self.execution.loops.signals.iter())
+            .chain(self.environment.exhaustion.signals.iter())
+    }
+
+    pub fn has_signal_type(&self, t: SignalType) -> bool {
+        self.iter_signals().any(|s| s.signal_type == t)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn signal_type_strings_match_paper_taxonomy() {
+        assert_eq!(
+            SignalType::MisalignmentCorrection.as_str(),
+            "interaction.misalignment.correction"
+        );
+        assert_eq!(
+            SignalType::ExecutionFailureInvalidArgs.as_str(),
+            "execution.failure.invalid_args"
+        );
+        assert_eq!(
+            SignalType::EnvironmentExhaustionMalformed.as_str(),
+            "environment.exhaustion.malformed_response"
+        );
+    }
+
+    #[test]
+    fn signal_type_layer_and_category() {
+        assert_eq!(
+            SignalType::MisalignmentRephrase.layer(),
+            SignalLayer::Interaction
+        );
+        assert_eq!(SignalType::MisalignmentRephrase.category(), "misalignment");
+        assert_eq!(
+            SignalType::ExecutionLoopsRetry.layer(),
+            SignalLayer::Execution
+        );
+        assert_eq!(SignalType::ExecutionLoopsRetry.category(), "loops");
+        assert_eq!(
+            SignalType::EnvironmentExhaustionTimeout.layer(),
+            SignalLayer::Environment
+        );
+        assert_eq!(
+            SignalType::EnvironmentExhaustionTimeout.category(),
+            "exhaustion"
+        );
+    }
+
+    #[test]
+    fn signal_group_severity_buckets_match_python() {
+        let mut g = SignalGroup::new("misalignment");
+        assert_eq!(g.severity, 0);
+        for n in 1..=2 {
+            g.add_signal(SignalInstance::new(
+                SignalType::MisalignmentCorrection,
+                n,
+                "x",
+            ));
+        }
+        assert_eq!(g.severity, 1);
+        for n in 3..=4 {
+            g.add_signal(SignalInstance::new(
+                SignalType::MisalignmentCorrection,
+                n,
+                "x",
+            ));
+        }
+        assert_eq!(g.severity, 2);
+        for n in 5..=6 {
+            g.add_signal(SignalInstance::new(
+                SignalType::MisalignmentCorrection,
+                n,
+                "x",
+            ));
+        }
+        assert_eq!(g.severity, 3);
+    }
+}
diff --git a/crates/brightstaff/src/signals/text_processing.rs b/crates/brightstaff/src/signals/text_processing.rs
new file mode 100644
index 00000000..a1d463cc
--- /dev/null
+++ b/crates/brightstaff/src/signals/text_processing.rs
@@ -0,0 +1,401 @@
+//! Text normalization and similarity primitives.
+//!
+//! Direct Rust port of `signals/text_processing.py` from the reference. The
+//! shapes (`NormalizedMessage`, `NormalizedPattern`) and similarity formulas
+//! match the Python implementation exactly so that pattern matching produces
+//! the same results on the same inputs.
+
+use std::collections::{HashMap, HashSet};
+
+/// Size of character n-grams used for fuzzy similarity (3 = trigrams).
+pub const NGRAM_SIZE: usize = 3;
+
+const PUNCT_TRIM: &[char] = &[
+    '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=',
+    '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~',
+];
+
+/// Pre-processed message with normalized text and tokens for efficient matching.
+#[derive(Debug, Clone, Default)]
+pub struct NormalizedMessage {
+    pub raw: String,
+    pub tokens: Vec<String>,
+    pub token_set: HashSet<String>,
+    pub bigram_set: HashSet<String>,
+    pub char_ngram_set: HashSet<String>,
+    pub token_frequency: HashMap<String, usize>,
+}
+
+impl NormalizedMessage {
+    /// Create a normalized message from raw text. 
Mirrors
+    /// `NormalizedMessage.from_text` in the reference, including the
+    /// head-20%/tail-80% truncation strategy when text exceeds `max_length`.
+    pub fn from_text(text: &str, max_length: usize) -> Self {
+        let char_count = text.chars().count();
+
+        let raw: String = if char_count <= max_length {
+            text.to_string()
+        } else {
+            let head_len = max_length / 5;
+            // Reserve one char for the joining space.
+            let tail_len = max_length.saturating_sub(head_len + 1);
+            let head: String = text.chars().take(head_len).collect();
+            let tail: String = text
+                .chars()
+                .skip(char_count.saturating_sub(tail_len))
+                .collect();
+            format!("{} {}", head, tail)
+        };
+
+        // Normalize unicode punctuation to ASCII equivalents.
+        let normalized_unicode = raw
+            .replace(['\u{2019}', '\u{2018}'], "'")
+            .replace(['\u{201c}', '\u{201d}'], "\"")
+            .replace(['\u{2013}', '\u{2014}'], "-");
+
+        // Lowercase + collapse whitespace (matches Python's `" ".join(s.split())`).
+        let normalized: String = normalized_unicode
+            .to_lowercase()
+            .split_whitespace()
+            .collect::<Vec<_>>()
+            .join(" ");
+
+        let mut tokens: Vec<String> = Vec::new();
+        for word in normalized.split_whitespace() {
+            let stripped: String = word.trim_matches(PUNCT_TRIM).to_string();
+            if !stripped.is_empty() {
+                tokens.push(stripped);
+            }
+        }
+
+        let token_set: HashSet<String> = tokens.iter().cloned().collect();
+
+        let mut bigram_set: HashSet<String> = HashSet::new();
+        for i in 0..tokens.len().saturating_sub(1) {
+            bigram_set.insert(format!("{} {}", tokens[i], tokens[i + 1]));
+        }
+
+        let tokens_text = tokens.join(" ");
+        let char_ngram_set = char_ngrams(&tokens_text, NGRAM_SIZE);
+
+        let mut token_frequency: HashMap<String, usize> = HashMap::new();
+        for t in &tokens {
+            *token_frequency.entry(t.clone()).or_insert(0) += 1;
+        }
+
+        Self {
+            raw,
+            tokens,
+            token_set,
+            bigram_set,
+            char_ngram_set,
+            token_frequency,
+        }
+    }
+
+    pub fn contains_token(&self, token: &str) -> bool {
+        self.token_set.contains(token)
+    }
+
+    pub fn contains_phrase(&self, phrase: &str) -> bool {
+        let phrase_tokens: Vec<&str> = phrase.split_whitespace().collect();
+        if phrase_tokens.is_empty() {
+            return false;
+        }
+        if phrase_tokens.len() == 1 {
+            return self.contains_token(phrase_tokens[0]);
+        }
+        if phrase_tokens.len() > self.tokens.len() {
+            return false;
+        }
+        let n = phrase_tokens.len();
+        for i in 0..=self.tokens.len() - n {
+            if self.tokens[i..i + n]
+                .iter()
+                .zip(phrase_tokens.iter())
+                .all(|(a, b)| a == b)
+            {
+                return true;
+            }
+        }
+        false
+    }
+
+    /// Character n-gram (Jaccard) similarity vs another normalized message.
+    pub fn ngram_similarity_with_message(&self, other: &NormalizedMessage) -> f32 {
+        jaccard(&self.char_ngram_set, &other.char_ngram_set)
+    }
+
+    /// Character n-gram (Jaccard) similarity vs a raw pattern string.
+    pub fn ngram_similarity_with_pattern(&self, pattern: &str) -> f32 {
+        let normalized = strip_non_word_chars(&pattern.to_lowercase());
+        let pattern_ngrams = char_ngrams(&normalized, NGRAM_SIZE);
+        jaccard(&self.char_ngram_set, &pattern_ngrams)
+    }
+
+    /// Fraction of pattern's ngrams contained in this message's ngram set. 
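+    ///
+    /// Illustrative sketch; unlike Jaccard, containment is directional:
+    ///
+    /// ```ignore
+    /// let m = NormalizedMessage::from_text("i already tried that twice", 2000);
+    /// let c = m.char_ngram_containment("tried that");
+    /// assert!(c > 0.9); // nearly all of the pattern's trigrams appear in the message
+    /// ```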
+    pub fn char_ngram_containment(&self, pattern: &str) -> f32 {
+        let normalized = strip_non_word_chars(&pattern.to_lowercase());
+        let pattern_ngrams = char_ngrams(&normalized, NGRAM_SIZE);
+        if pattern_ngrams.is_empty() {
+            return 0.0;
+        }
+        let contained = pattern_ngrams
+            .iter()
+            .filter(|ng| self.char_ngram_set.contains(*ng))
+            .count();
+        contained as f32 / pattern_ngrams.len() as f32
+    }
+
+    /// Token-frequency cosine similarity vs a raw pattern string.
+    pub fn token_cosine_similarity(&self, pattern: &str) -> f32 {
+        let mut pattern_freq: HashMap<String, usize> = HashMap::new();
+        for word in pattern.to_lowercase().split_whitespace() {
+            let stripped = word.trim_matches(PUNCT_TRIM);
+            if !stripped.is_empty() {
+                *pattern_freq.entry(stripped.to_string()).or_insert(0) += 1;
+            }
+        }
+        cosine_freq(&self.token_frequency, &pattern_freq)
+    }
+
+    /// Layered match against a pre-normalized pattern. Mirrors
+    /// `matches_normalized_pattern` from the reference: exact phrase ->
+    /// char-ngram Jaccard -> token cosine.
+    pub fn matches_normalized_pattern(
+        &self,
+        pattern: &NormalizedPattern,
+        char_ngram_threshold: f32,
+        token_cosine_threshold: f32,
+    ) -> bool {
+        // Layer 0: exact phrase match using pre-tokenized message.
+        let plen = pattern.tokens.len();
+        let slen = self.tokens.len();
+        if plen > 0 && plen <= slen {
+            for i in 0..=slen - plen {
+                if self.tokens[i..i + plen] == pattern.tokens[..] {
+                    return true;
+                }
+            }
+        }
+
+        // Layer 1: character n-gram Jaccard similarity.
+        if !self.char_ngram_set.is_empty() && !pattern.char_ngram_set.is_empty() {
+            let inter = self
+                .char_ngram_set
+                .intersection(&pattern.char_ngram_set)
+                .count();
+            let union = self.char_ngram_set.union(&pattern.char_ngram_set).count();
+            if union > 0 {
+                let sim = inter as f32 / union as f32;
+                if sim >= char_ngram_threshold {
+                    return true;
+                }
+            }
+        }
+
+        // Layer 2: token frequency cosine similarity.
+        if !self.token_frequency.is_empty() && !pattern.token_frequency.is_empty() {
+            let sim = cosine_freq(&self.token_frequency, &pattern.token_frequency);
+            if sim >= token_cosine_threshold {
+                return true;
+            }
+        }
+
+        false
+    }
+}
+
+/// Pre-processed pattern with normalized text and pre-computed n-grams/tokens.
+#[derive(Debug, Clone, Default)]
+pub struct NormalizedPattern {
+    pub raw: String,
+    pub tokens: Vec<String>,
+    pub char_ngram_set: HashSet<String>,
+    pub token_frequency: HashMap<String, usize>,
+}
+
+impl NormalizedPattern {
+    pub fn from_text(pattern: &str) -> Self {
+        let normalized = pattern
+            .to_lowercase()
+            .replace(['\u{2019}', '\u{2018}'], "'")
+            .replace(['\u{201c}', '\u{201d}'], "\"")
+            .replace(['\u{2013}', '\u{2014}'], "-");
+        let normalized: String = normalized.split_whitespace().collect::<Vec<_>>().join(" ");
+
+        // Tokenize the same way as NormalizedMessage (trim boundary punctuation,
+        // keep internal punctuation).
+        let mut tokens: Vec<String> = Vec::new();
+        for word in normalized.split_whitespace() {
+            let stripped = word.trim_matches(PUNCT_TRIM);
+            if !stripped.is_empty() {
+                tokens.push(stripped.to_string());
+            }
+        }
+
+        // For ngrams + cosine, strip ALL punctuation (matches Python's
+        // `re.sub(r"[^\w\s]", "", normalized)`). 
+        let normalized_for_ngrams = strip_non_word_chars(&normalized);
+        let char_ngram_set = char_ngrams(&normalized_for_ngrams, NGRAM_SIZE);
+
+        let tokens_no_punct: Vec<&str> = normalized_for_ngrams.split_whitespace().collect();
+        let mut token_frequency: HashMap<String, usize> = HashMap::new();
+        for t in &tokens_no_punct {
+            *token_frequency.entry((*t).to_string()).or_insert(0) += 1;
+        }
+
+        Self {
+            raw: pattern.to_string(),
+            tokens,
+            char_ngram_set,
+            token_frequency,
+        }
+    }
+}
+
+/// Convenience: normalize a list of raw pattern strings into `NormalizedPattern`s.
+pub fn normalize_patterns(patterns: &[&str]) -> Vec<NormalizedPattern> {
+    patterns
+        .iter()
+        .map(|p| NormalizedPattern::from_text(p))
+        .collect()
+}
+
+// ---------------------------------------------------------------------------
+// Similarity primitives
+// ---------------------------------------------------------------------------
+
+fn char_ngrams(s: &str, n: usize) -> HashSet<String> {
+    // Python iterates by character index, not byte; mirror that with .chars().
+    let chars: Vec<char> = s.chars().collect();
+    let mut out: HashSet<String> = HashSet::new();
+    if chars.len() < n {
+        return out;
+    }
+    for i in 0..=chars.len() - n {
+        out.insert(chars[i..i + n].iter().collect());
+    }
+    out
+}
+
+fn jaccard(a: &HashSet<String>, b: &HashSet<String>) -> f32 {
+    if a.is_empty() && b.is_empty() {
+        return 1.0;
+    }
+    if a.is_empty() || b.is_empty() {
+        return 0.0;
+    }
+    let inter = a.intersection(b).count();
+    let union = a.union(b).count();
+    if union == 0 {
+        0.0
+    } else {
+        inter as f32 / union as f32
+    }
+}
+
+fn cosine_freq(a: &HashMap<String, usize>, b: &HashMap<String, usize>) -> f32 {
+    if a.is_empty() && b.is_empty() {
+        return 1.0;
+    }
+    if a.is_empty() || b.is_empty() {
+        return 0.0;
+    }
+    let mut dot: f64 = 0.0;
+    let mut n1_sq: f64 = 0.0;
+    let mut n2_sq: f64 = 0.0;
+    for (token, &freq2) in b {
+        let freq1 = *a.get(token).unwrap_or(&0);
+        dot += (freq1 * freq2) as f64;
+        n2_sq += (freq2 * freq2) as f64;
+    }
+    for &freq1 in a.values() {
+        n1_sq += (freq1 * freq1) as f64;
+    }
+    let n1 = n1_sq.sqrt();
+    let n2 = n2_sq.sqrt();
+    if n1 == 0.0 || n2 == 0.0 {
+        0.0
+    } else {
+        (dot / (n1 * n2)) as f32
+    }
+}
+
+/// Python equivalent: `re.sub(r"[^\w\s]", "", text)` followed by whitespace
+/// collapse. Python's `\w` is `[A-Za-z0-9_]` plus unicode word characters; we
+/// use Rust's `char::is_alphanumeric()` plus `_` for an equivalent definition. 
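+///
+/// Sketch of the behavior:
+///
+/// ```ignore
+/// assert_eq!(strip_non_word_chars("don't  stop!"), "dont stop");
+/// ```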
+fn strip_non_word_chars(text: &str) -> String {
+    let mut out = String::with_capacity(text.len());
+    for c in text.chars() {
+        if c.is_alphanumeric() || c == '_' || c.is_whitespace() {
+            out.push(c);
+        }
+    }
+    out.split_whitespace().collect::<Vec<_>>().join(" ")
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn normalize_lowercases_and_strips_punctuation() {
+        let m = NormalizedMessage::from_text("Hello, World!", 2000);
+        assert_eq!(m.tokens, vec!["hello".to_string(), "world".to_string()]);
+    }
+
+    #[test]
+    fn normalizes_smart_quotes() {
+        let m = NormalizedMessage::from_text("don\u{2019}t", 2000);
+        assert!(m.tokens.contains(&"don't".to_string()));
+    }
+
+    #[test]
+    fn truncates_long_text_with_head_tail() {
+        let long = "a".repeat(3000);
+        let m = NormalizedMessage::from_text(&long, 2000);
+        // raw should be ~ 2000 chars (head + space + tail)
+        assert!(m.raw.chars().count() <= 2001);
+        assert!(m.raw.starts_with("aa"));
+        assert!(m.raw.ends_with("aa"));
+    }
+
+    #[test]
+    fn contains_phrase_matches_consecutive_tokens() {
+        let m = NormalizedMessage::from_text("I think this is great work", 2000);
+        assert!(m.contains_phrase("this is great"));
+        assert!(!m.contains_phrase("great this"));
+    }
+
+    #[test]
+    fn matches_pattern_via_exact_phrase() {
+        let m = NormalizedMessage::from_text("No, I meant the second one", 2000);
+        let p = NormalizedPattern::from_text("no i meant");
+        assert!(m.matches_normalized_pattern(&p, 0.65, 0.6));
+    }
+
+    #[test]
+    fn matches_pattern_via_char_ngram_fuzziness() {
+        // Typo in "meant" -> "ment" so layer 0 (exact phrase) cannot match,
+        // forcing the matcher to fall back to layer 1 (char n-gram Jaccard).
+        let m = NormalizedMessage::from_text("No I ment", 2000);
+        let p = NormalizedPattern::from_text("no i meant");
+        assert!(m.matches_normalized_pattern(&p, 0.4, 0.6));
+    }
+
+    #[test]
+    fn jaccard_identical_sets_is_one() {
+        let a: HashSet<String> = ["abc", "bcd"].iter().map(|s| s.to_string()).collect();
+        assert!((jaccard(&a, &a) - 1.0).abs() < 1e-6);
+    }
+
+    #[test]
+    fn cosine_freq_orthogonal_is_zero() {
+        let mut a: HashMap<String, usize> = HashMap::new();
+        a.insert("hello".to_string(), 1);
+        let mut b: HashMap<String, usize> = HashMap::new();
+        b.insert("world".to_string(), 1);
+        assert_eq!(cosine_freq(&a, &b), 0.0);
+    }
+}
diff --git a/crates/brightstaff/src/streaming.rs b/crates/brightstaff/src/streaming.rs
index f7af8ae0..26af8672 100644
--- a/crates/brightstaff/src/streaming.rs
+++ b/crates/brightstaff/src/streaming.rs
@@ -16,10 +16,134 @@ use tracing_opentelemetry::OpenTelemetrySpanExt;
 use crate::handlers::agents::pipeline::{PipelineError, PipelineProcessor};
 
 const STREAM_BUFFER_SIZE: usize = 16;
-use crate::signals::{InteractionQuality, SignalAnalyzer, TextBasedSignalAnalyzer, FLAG_MARKER};
-use crate::tracing::{llm, set_service_name, signals as signal_constants};
+/// Cap on accumulated response bytes kept for usage extraction.
+/// Most chat responses are well under this; pathological ones are dropped without
+/// affecting pass-through streaming to the client.
+const USAGE_BUFFER_MAX: usize = 2 * 1024 * 1024;
+use crate::metrics as bs_metrics;
+use crate::metrics::labels as metric_labels;
+use crate::signals::otel::emit_signals_to_span;
+use crate::signals::{SignalAnalyzer, FLAG_MARKER};
+use crate::tracing::{llm, set_service_name};
 use hermesllm::apis::openai::Message;
 
+/// Parsed usage + resolved-model details from a provider response. 
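+///
+/// Sketch of an OpenAI-shaped body this parses (values and model name are
+/// illustrative only):
+///
+/// ```ignore
+/// let v: serde_json::Value = serde_json::json!({
+///     "model": "gpt-4o-mini",
+///     "usage": { "prompt_tokens": 12, "completion_tokens": 34, "total_tokens": 46 }
+/// });
+/// let u = ExtractedUsage::from_json(&v);
+/// assert_eq!(u.total_tokens, Some(46));
+/// ```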
+#[derive(Debug, Default, Clone)]
+struct ExtractedUsage {
+    prompt_tokens: Option<i64>,
+    completion_tokens: Option<i64>,
+    total_tokens: Option<i64>,
+    cached_input_tokens: Option<i64>,
+    cache_creation_tokens: Option<i64>,
+    reasoning_tokens: Option<i64>,
+    /// The model the upstream actually used. For router aliases (e.g.
+    /// `router:software-engineering`), this differs from the request model.
+    resolved_model: Option<String>,
+}
+
+impl ExtractedUsage {
+    fn is_empty(&self) -> bool {
+        self.prompt_tokens.is_none()
+            && self.completion_tokens.is_none()
+            && self.total_tokens.is_none()
+            && self.resolved_model.is_none()
+    }
+
+    fn from_json(value: &serde_json::Value) -> Self {
+        let mut out = Self::default();
+        if let Some(model) = value.get("model").and_then(|v| v.as_str()) {
+            if !model.is_empty() {
+                out.resolved_model = Some(model.to_string());
+            }
+        }
+        if let Some(u) = value.get("usage") {
+            // OpenAI-shape usage
+            out.prompt_tokens = u.get("prompt_tokens").and_then(|v| v.as_i64());
+            out.completion_tokens = u.get("completion_tokens").and_then(|v| v.as_i64());
+            out.total_tokens = u.get("total_tokens").and_then(|v| v.as_i64());
+            out.cached_input_tokens = u
+                .get("prompt_tokens_details")
+                .and_then(|d| d.get("cached_tokens"))
+                .and_then(|v| v.as_i64());
+            out.reasoning_tokens = u
+                .get("completion_tokens_details")
+                .and_then(|d| d.get("reasoning_tokens"))
+                .and_then(|v| v.as_i64());
+
+            // Anthropic-shape fallbacks
+            if out.prompt_tokens.is_none() {
+                out.prompt_tokens = u.get("input_tokens").and_then(|v| v.as_i64());
+            }
+            if out.completion_tokens.is_none() {
+                out.completion_tokens = u.get("output_tokens").and_then(|v| v.as_i64());
+            }
+            if out.total_tokens.is_none() {
+                if let (Some(p), Some(c)) = (out.prompt_tokens, out.completion_tokens) {
+                    out.total_tokens = Some(p + c);
+                }
+            }
+            if out.cached_input_tokens.is_none() {
+                out.cached_input_tokens = u.get("cache_read_input_tokens").and_then(|v| v.as_i64());
+            }
+            if out.cached_input_tokens.is_none() {
+                out.cached_input_tokens =
+                    u.get("cached_content_token_count").and_then(|v| v.as_i64());
+            }
+            out.cache_creation_tokens = u
+                .get("cache_creation_input_tokens")
+                .and_then(|v| v.as_i64());
+            if out.reasoning_tokens.is_none() {
+                out.reasoning_tokens = u.get("thoughts_token_count").and_then(|v| v.as_i64());
+            }
+        }
+        out
+    }
+}
+
+/// Try to pull usage out of an accumulated response body.
+/// Handles both a single JSON object (non-streaming) and SSE streams where the
+/// final `data: {...}` event carries the `usage` field.
+fn extract_usage_from_bytes(buf: &[u8]) -> ExtractedUsage {
+    if buf.is_empty() {
+        return ExtractedUsage::default();
+    }
+
+    // Fast path: full-body JSON (non-streaming).
+    if let Ok(value) = serde_json::from_slice::<serde_json::Value>(buf) {
+        let u = ExtractedUsage::from_json(&value);
+        if !u.is_empty() {
+            return u;
+        }
+    }
+
+    // SSE path: scan from the end for a `data:` line containing a usage object.
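+    // Illustrative tail of an OpenAI SSE body (values hypothetical):
+    //   data: {"choices":[{"delta":{"content":"hi"}}]}
+    //   data: {"choices":[...],"usage":{"prompt_tokens":7,"completion_tokens":3,"total_tokens":10}}
+    //   data: [DONE]
+    // Scanning in reverse reaches the usage-bearing event before older deltas.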
+    let text = match std::str::from_utf8(buf) {
+        Ok(t) => t,
+        Err(_) => return ExtractedUsage::default(),
+    };
+    for line in text.lines().rev() {
+        let trimmed = line.trim_start();
+        let payload = match trimmed.strip_prefix("data:") {
+            Some(p) => p.trim_start(),
+            None => continue,
+        };
+        if payload == "[DONE]" || payload.is_empty() {
+            continue;
+        }
+        if !payload.contains("\"usage\"") {
+            continue;
+        }
+        if let Ok(value) = serde_json::from_str::<serde_json::Value>(payload) {
+            let u = ExtractedUsage::from_json(&value);
+            if !u.is_empty() {
+                return u;
+            }
+        }
+    }
+
+    ExtractedUsage::default()
+}
+
 /// Trait for processing streaming chunks
 /// Implementors can inject custom logic during streaming (e.g., hallucination detection, logging)
 pub trait StreamProcessor: Send + 'static {
@@ -51,6 +175,18 @@ impl StreamProcessor for Box<dyn StreamProcessor> {
     }
 }
 
+/// Optional Prometheus-metric context for an LLM upstream call. When present,
+/// [`ObservableStreamProcessor`] emits `brightstaff_llm_*` metrics at
+/// first-byte / complete / error callbacks.
+#[derive(Debug, Clone)]
+pub struct LlmMetricsCtx {
+    pub provider: String,
+    pub model: String,
+    /// HTTP status of the upstream response. Used to pick `status_class` and
+    /// `error_class` on `on_complete`.
+    pub upstream_status: u16,
+}
+
 /// A processor that tracks streaming metrics
 pub struct ObservableStreamProcessor {
     service_name: String,
@@ -60,6 +196,12 @@ pub struct ObservableStreamProcessor {
     start_time: Instant,
     time_to_first_token: Option<u128>,
     messages: Option<Vec<Message>>,
+    /// Accumulated response bytes used only for best-effort usage extraction
+    /// on `on_complete`. Capped at `USAGE_BUFFER_MAX`; excess chunks are dropped
+    /// from the buffer (they still pass through to the client).
+    response_buffer: Vec<u8>,
+    llm_metrics: Option<LlmMetricsCtx>,
+    metrics_recorded: bool,
 }
 
 impl ObservableStreamProcessor {
@@ -93,21 +235,42 @@ impl ObservableStreamProcessor {
             start_time,
             time_to_first_token: None,
             messages,
+            response_buffer: Vec::new(),
+            llm_metrics: None,
+            metrics_recorded: false,
         }
     }
+
+    /// Attach LLM upstream metric context so the processor emits
+    /// `brightstaff_llm_*` metrics on first-byte / complete / error.
+    pub fn with_llm_metrics(mut self, ctx: LlmMetricsCtx) -> Self {
+        self.llm_metrics = Some(ctx);
+        self
+    }
 }
 
 impl StreamProcessor for ObservableStreamProcessor {
     fn process_chunk(&mut self, chunk: Bytes) -> Result<Option<Bytes>, String> {
         self.total_bytes += chunk.len();
         self.chunk_count += 1;
+        // Accumulate for best-effort usage extraction; drop further chunks once
+        // the cap is reached so we don't retain huge response bodies in memory.
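+        // If the cap truncates the final SSE event (where usage lives), the
+        // later extraction simply yields nothing; client streaming is unaffected.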
+        if self.response_buffer.len() < USAGE_BUFFER_MAX {
+            let remaining = USAGE_BUFFER_MAX - self.response_buffer.len();
+            let take = chunk.len().min(remaining);
+            self.response_buffer.extend_from_slice(&chunk[..take]);
+        }
         Ok(Some(chunk))
     }
 
     fn on_first_bytes(&mut self) {
         // Record time to first token (only for streaming)
         if self.time_to_first_token.is_none() {
-            self.time_to_first_token = Some(self.start_time.elapsed().as_millis());
+            let elapsed = self.start_time.elapsed();
+            self.time_to_first_token = Some(elapsed.as_millis());
+            if let Some(ref ctx) = self.llm_metrics {
+                bs_metrics::record_llm_ttft(&ctx.provider, &ctx.model, elapsed);
+            }
         }
     }
 
@@ -124,77 +287,98 @@ impl StreamProcessor for ObservableStreamProcessor {
             );
         }
 
-        // Analyze signals if messages are available and record as span attributes
-        if let Some(ref messages) = self.messages {
-            let analyzer: Box<dyn SignalAnalyzer> = Box::new(TextBasedSignalAnalyzer::new());
-            let report = analyzer.analyze(messages);
+        // Record total duration on the span for the observability console.
+        let duration_ms = self.start_time.elapsed().as_millis() as i64;
+        {
+            let span = tracing::Span::current();
+            let otel_context = span.context();
+            let otel_span = otel_context.span();
+            otel_span.set_attribute(KeyValue::new(llm::DURATION_MS, duration_ms));
+            otel_span.set_attribute(KeyValue::new(llm::RESPONSE_BYTES, self.total_bytes as i64));
+        }
+
+        // Best-effort usage extraction + emission (works for both streaming
+        // SSE and non-streaming JSON responses that include a `usage` object).
+        let usage = extract_usage_from_bytes(&self.response_buffer);
+        if !usage.is_empty() {
+            let span = tracing::Span::current();
+            let otel_context = span.context();
+            let otel_span = otel_context.span();
+            if let Some(v) = usage.prompt_tokens {
+                otel_span.set_attribute(KeyValue::new(llm::PROMPT_TOKENS, v));
+            }
+            if let Some(v) = usage.completion_tokens {
+                otel_span.set_attribute(KeyValue::new(llm::COMPLETION_TOKENS, v));
+            }
+            if let Some(v) = usage.total_tokens {
+                otel_span.set_attribute(KeyValue::new(llm::TOTAL_TOKENS, v));
+            }
+            if let Some(v) = usage.cached_input_tokens {
+                otel_span.set_attribute(KeyValue::new(llm::CACHED_INPUT_TOKENS, v));
+            }
+            if let Some(v) = usage.cache_creation_tokens {
+                otel_span.set_attribute(KeyValue::new(llm::CACHE_CREATION_TOKENS, v));
+            }
+            if let Some(v) = usage.reasoning_tokens {
+                otel_span.set_attribute(KeyValue::new(llm::REASONING_TOKENS, v));
+            }
+            // Override `llm.model` with the model the upstream actually ran
+            // (e.g. `openai-gpt-5.4` resolved from `router:software-engineering`).
+            // Cost lookup keys off the real model, not the alias.
+            if let Some(resolved) = usage.resolved_model.clone() {
+                otel_span.set_attribute(KeyValue::new(llm::MODEL_NAME, resolved));
+            }
+        }
+
+        // Emit LLM upstream Prometheus metrics (duration + tokens) if wired.
+        // The upstream responded (we have a status), so status_class alone
+        // carries the non-2xx signal — error_class stays "none".
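+        // Sketch of the emitted series (metric and label names assumed from
+        // `bs_metrics` / `metric_labels`; values illustrative):
+        //   brightstaff_llm_upstream{provider="openai", model="gpt-4o",
+        //       status_class="2xx", error_class="none"}
+        //   brightstaff_llm_tokens{..., kind="prompt"} and {..., kind="completion"}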
+ if let Some(ref ctx) = self.llm_metrics { + bs_metrics::record_llm_upstream( + &ctx.provider, + &ctx.model, + ctx.upstream_status, + metric_labels::LLM_ERR_NONE, + self.start_time.elapsed(), + ); + if let Some(v) = usage.prompt_tokens { + bs_metrics::record_llm_tokens( + &ctx.provider, + &ctx.model, + metric_labels::TOKEN_KIND_PROMPT, + v.max(0) as u64, + ); + } + if let Some(v) = usage.completion_tokens { + bs_metrics::record_llm_tokens( + &ctx.provider, + &ctx.model, + metric_labels::TOKEN_KIND_COMPLETION, + v.max(0) as u64, + ); + } + if usage.prompt_tokens.is_none() && usage.completion_tokens.is_none() { + bs_metrics::record_llm_tokens_usage_missing(&ctx.provider, &ctx.model); + } + self.metrics_recorded = true; + } + // Release the buffered bytes early; nothing downstream needs them. + self.response_buffer.clear(); + self.response_buffer.shrink_to_fit(); + + // Analyze signals if messages are available and record as span + // attributes + per-signal events. We dual-emit legacy aggregate keys + // and the new layered taxonomy so existing dashboards keep working + // while new consumers can opt into the richer hierarchy. + if let Some(ref messages) = self.messages { + let analyzer = SignalAnalyzer::default(); + let report = analyzer.analyze_openai(messages); - // Get the current OTel span to set signal attributes let span = tracing::Span::current(); let otel_context = span.context(); let otel_span = otel_context.span(); - // Add overall quality - otel_span.set_attribute(KeyValue::new( - signal_constants::QUALITY, - format!("{:?}", report.overall_quality), - )); - - // Add repair/follow-up metrics if concerning - if report.follow_up.is_concerning || report.follow_up.repair_count > 0 { - otel_span.set_attribute(KeyValue::new( - signal_constants::REPAIR_COUNT, - report.follow_up.repair_count as i64, - )); - otel_span.set_attribute(KeyValue::new( - signal_constants::REPAIR_RATIO, - format!("{:.3}", report.follow_up.repair_ratio), - )); - } - - // Add frustration metrics - if report.frustration.has_frustration { - otel_span.set_attribute(KeyValue::new( - signal_constants::FRUSTRATION_COUNT, - report.frustration.frustration_count as i64, - )); - otel_span.set_attribute(KeyValue::new( - signal_constants::FRUSTRATION_SEVERITY, - report.frustration.severity as i64, - )); - } - - // Add repetition metrics - if report.repetition.has_looping { - otel_span.set_attribute(KeyValue::new( - signal_constants::REPETITION_COUNT, - report.repetition.repetition_count as i64, - )); - } - - // Add escalation metrics - if report.escalation.escalation_requested { - otel_span - .set_attribute(KeyValue::new(signal_constants::ESCALATION_REQUESTED, true)); - } - - // Add positive feedback metrics - if report.positive_feedback.has_positive_feedback { - otel_span.set_attribute(KeyValue::new( - signal_constants::POSITIVE_FEEDBACK_COUNT, - report.positive_feedback.positive_count as i64, - )); - } - - // Flag the span name if any concerning signal is detected - let should_flag = report.frustration.has_frustration - || report.repetition.has_looping - || report.escalation.escalation_requested - || matches!( - report.overall_quality, - InteractionQuality::Poor | InteractionQuality::Severe - ); - + let should_flag = emit_signals_to_span(&otel_span, &report); if should_flag { otel_span.update_name(format!("{} {}", self.operation_name, FLAG_MARKER)); } @@ -217,6 +401,18 @@ impl StreamProcessor for ObservableStreamProcessor { duration_ms = self.start_time.elapsed().as_millis(), "stream error" ); + if let Some(ref ctx) = 
self.llm_metrics {
+            if !self.metrics_recorded {
+                bs_metrics::record_llm_upstream(
+                    &ctx.provider,
+                    &ctx.model,
+                    ctx.upstream_status,
+                    metric_labels::LLM_ERR_STREAM,
+                    self.start_time.elapsed(),
+                );
+                self.metrics_recorded = true;
+            }
+        }
     }
 }
 
@@ -404,3 +600,55 @@ pub fn truncate_message(message: &str, max_length: usize) -> String {
         message.to_string()
     }
 }
+
+#[cfg(test)]
+mod usage_extraction_tests {
+    use super::*;
+
+    #[test]
+    fn non_streaming_openai_with_cached() {
+        let body = br#"{"id":"x","model":"gpt-4o","choices":[],"usage":{"prompt_tokens":12,"completion_tokens":34,"total_tokens":46,"prompt_tokens_details":{"cached_tokens":5}}}"#;
+        let u = extract_usage_from_bytes(body);
+        assert_eq!(u.prompt_tokens, Some(12));
+        assert_eq!(u.completion_tokens, Some(34));
+        assert_eq!(u.total_tokens, Some(46));
+        assert_eq!(u.cached_input_tokens, Some(5));
+        assert_eq!(u.reasoning_tokens, None);
+    }
+
+    #[test]
+    fn non_streaming_anthropic_with_cache_creation() {
+        let body = br#"{"id":"x","model":"claude","usage":{"input_tokens":100,"output_tokens":50,"cache_creation_input_tokens":20,"cache_read_input_tokens":30}}"#;
+        let u = extract_usage_from_bytes(body);
+        assert_eq!(u.prompt_tokens, Some(100));
+        assert_eq!(u.completion_tokens, Some(50));
+        assert_eq!(u.total_tokens, Some(150));
+        assert_eq!(u.cached_input_tokens, Some(30));
+        assert_eq!(u.cache_creation_tokens, Some(20));
+    }
+
+    #[test]
+    fn streaming_openai_final_chunk_has_usage() {
+        let sse = b"data: {\"choices\":[{\"delta\":{\"content\":\"hi\"}}]}
+
+data: {\"choices\":[{\"delta\":{}, \"finish_reason\":\"stop\"}],\"usage\":{\"prompt_tokens\":7,\"completion_tokens\":3,\"total_tokens\":10}}
+
+data: [DONE]
+
+";
+        let u = extract_usage_from_bytes(sse);
+        assert_eq!(u.prompt_tokens, Some(7));
+        assert_eq!(u.completion_tokens, Some(3));
+        assert_eq!(u.total_tokens, Some(10));
+    }
+
+    #[test]
+    fn empty_returns_default() {
+        assert!(extract_usage_from_bytes(b"").is_empty());
+    }
+
+    #[test]
+    fn no_usage_in_body_returns_default() {
+        assert!(extract_usage_from_bytes(br#"{"ok":true}"#).is_empty());
+    }
+}
diff --git a/crates/brightstaff/src/tracing/constants.rs b/crates/brightstaff/src/tracing/constants.rs
index 15e3cf57..79a40401 100644
--- a/crates/brightstaff/src/tracing/constants.rs
+++ b/crates/brightstaff/src/tracing/constants.rs
@@ -80,6 +80,18 @@ pub mod llm {
     /// Total tokens used (prompt + completion)
     pub const TOTAL_TOKENS: &str = "llm.usage.total_tokens";
 
+    /// Tokens served from a prompt cache read
+    /// (OpenAI `prompt_tokens_details.cached_tokens`, Anthropic `cache_read_input_tokens`,
+    /// Google `cached_content_token_count`)
+    pub const CACHED_INPUT_TOKENS: &str = "llm.usage.cached_input_tokens";
+
+    /// Tokens used to write a prompt cache entry (Anthropic `cache_creation_input_tokens`)
+    pub const CACHE_CREATION_TOKENS: &str = "llm.usage.cache_creation_tokens";
+
+    /// Reasoning tokens for reasoning models
+    /// (OpenAI `completion_tokens_details.reasoning_tokens`, Google `thoughts_token_count`)
+    pub const REASONING_TOKENS: &str = "llm.usage.reasoning_tokens";
+
     /// Temperature parameter used
     pub const TEMPERATURE: &str = "llm.temperature";
 
@@ -119,6 +131,22 @@ pub mod routing {
     pub const SELECTION_REASON: &str = "routing.selection_reason";
 }
 
+// =============================================================================
+// Span Attributes - Plano-specific
+// =============================================================================
+
+/// Attributes specific to Plano (session affinity, routing decisions).
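+///
+/// Example attribute values as they would appear on a span (illustrative):
+/// `plano.session_id = "sess-42"`, `plano.route.name = "code"`.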
+pub mod plano {
+    /// Session identifier propagated via the `x-model-affinity` header.
+    /// Absent when the client did not send the header.
+    pub const SESSION_ID: &str = "plano.session_id";
+
+    /// Matched route name from routing (e.g. "code", "summarization",
+    /// "software-engineering"). Absent when the client routed directly
+    /// to a concrete model.
+    pub const ROUTE_NAME: &str = "plano.route.name";
+}
+
 // =============================================================================
 // Span Attributes - Error Handling
 // =============================================================================
diff --git a/crates/brightstaff/src/tracing/mod.rs b/crates/brightstaff/src/tracing/mod.rs
index 644db31a..8e09a21c 100644
--- a/crates/brightstaff/src/tracing/mod.rs
+++ b/crates/brightstaff/src/tracing/mod.rs
@@ -4,7 +4,7 @@ mod init;
 mod service_name_exporter;
 
 pub use constants::{
-    error, http, llm, operation_component, routing, signals, OperationNameBuilder,
+    error, http, llm, operation_component, plano, routing, signals, OperationNameBuilder,
 };
 pub use custom_attributes::collect_custom_trace_attributes;
 pub use init::init_tracer;
diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs
index 125a986d..1275d77d 100644
--- a/crates/common/src/configuration.rs
+++ b/crates/common/src/configuration.rs
@@ -234,6 +234,7 @@ pub struct Overrides {
     pub llm_routing_model: Option<String>,
     pub agent_orchestration_model: Option<String>,
     pub orchestrator_model_context_length: Option<usize>,
+    pub disable_signals: Option<bool>,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize, Default)]
@@ -391,6 +392,10 @@ pub enum LlmProviderType {
     AmazonBedrock,
     #[serde(rename = "plano")]
     Plano,
+    #[serde(rename = "chatgpt")]
+    ChatGPT,
+    #[serde(rename = "digitalocean")]
+    DigitalOcean,
 }
 
 impl Display for LlmProviderType {
@@ -412,6 +417,8 @@ impl Display for LlmProviderType {
             LlmProviderType::Qwen => write!(f, "qwen"),
             LlmProviderType::AmazonBedrock => write!(f, "amazon_bedrock"),
             LlmProviderType::Plano => write!(f, "plano"),
+            LlmProviderType::ChatGPT => write!(f, "chatgpt"),
+            LlmProviderType::DigitalOcean => write!(f, "digitalocean"),
         }
     }
 }
@@ -478,6 +485,7 @@ pub struct LlmProvider {
     pub base_url_path_prefix: Option<String>,
     pub internal: Option<bool>,
     pub passthrough_auth: Option<bool>,
+    pub headers: Option<HashMap<String, String>>,
 }
 
 pub trait IntoModels {
@@ -521,6 +529,7 @@ impl Default for LlmProvider {
             base_url_path_prefix: None,
             internal: None,
             passthrough_auth: None,
+            headers: None,
         }
     }
 }
@@ -647,7 +656,7 @@ mod test {
             .expect("reference config file not found");
 
         let config: super::Configuration = serde_yaml::from_str(&ref_config).unwrap();
-        assert_eq!(config.version, "v0.3.0");
+        assert_eq!(config.version, "v0.4.0");
 
         if let Some(prompt_targets) = &config.prompt_targets {
             assert!(
@@ -747,4 +756,29 @@ mod test {
         assert!(model_ids.contains(&"openai-gpt4".to_string()));
         assert!(!model_ids.contains(&"plano-orchestrator".to_string()));
     }
+
+    #[test]
+    fn test_overrides_disable_signals_default_none() {
+        let overrides = super::Overrides::default();
+        assert_eq!(overrides.disable_signals, None);
+    }
+
+    #[test]
+    fn test_overrides_disable_signals_deserialize() {
+        let yaml = r#"
+disable_signals: true
+"#;
+        let overrides: super::Overrides = serde_yaml::from_str(yaml).unwrap();
+        assert_eq!(overrides.disable_signals, Some(true));
+
+        let yaml_false = r#"
+disable_signals: false
+"#;
+        let overrides: super::Overrides = serde_yaml::from_str(yaml_false).unwrap();
+        assert_eq!(overrides.disable_signals, Some(false));
+
+        let yaml_missing = "{}";
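+        // An empty YAML mapping leaves the field unset, so it must
+        // deserialize to `None` rather than erroring.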
+        let overrides: super::Overrides = serde_yaml::from_str(yaml_missing).unwrap();
+        assert_eq!(overrides.disable_signals, None);
+    }
 }
diff --git a/crates/common/src/llm_providers.rs b/crates/common/src/llm_providers.rs
index b5c03b30..b4355a2f 100644
--- a/crates/common/src/llm_providers.rs
+++ b/crates/common/src/llm_providers.rs
@@ -277,6 +277,7 @@ mod tests {
             internal: None,
             stream: None,
             passthrough_auth: None,
+            headers: None,
         }
     }
diff --git a/crates/hermesllm/src/apis/anthropic.rs b/crates/hermesllm/src/apis/anthropic.rs
index 4df4bb00..ee572268 100644
--- a/crates/hermesllm/src/apis/anthropic.rs
+++ b/crates/hermesllm/src/apis/anthropic.rs
@@ -435,6 +435,12 @@ impl TokenUsage for MessagesResponse {
     fn total_tokens(&self) -> usize {
         (self.usage.input_tokens + self.usage.output_tokens) as usize
     }
+    fn cached_input_tokens(&self) -> Option<usize> {
+        self.usage.cache_read_input_tokens.map(|t| t as usize)
+    }
+    fn cache_creation_tokens(&self) -> Option<usize> {
+        self.usage.cache_creation_input_tokens.map(|t| t as usize)
+    }
 }
 
 impl ProviderResponse for MessagesResponse {
diff --git a/crates/hermesllm/src/apis/openai.rs b/crates/hermesllm/src/apis/openai.rs
index d22ff756..bb93fd34 100644
--- a/crates/hermesllm/src/apis/openai.rs
+++ b/crates/hermesllm/src/apis/openai.rs
@@ -596,6 +596,18 @@ impl TokenUsage for Usage {
     fn total_tokens(&self) -> usize {
         self.total_tokens as usize
     }
+
+    fn cached_input_tokens(&self) -> Option<usize> {
+        self.prompt_tokens_details
+            .as_ref()
+            .and_then(|d| d.cached_tokens.map(|t| t as usize))
+    }
+
+    fn reasoning_tokens(&self) -> Option<usize> {
+        self.completion_tokens_details
+            .as_ref()
+            .and_then(|d| d.reasoning_tokens.map(|t| t as usize))
+    }
 }
 
 /// Implementation of ProviderRequest for ChatCompletionsRequest
diff --git a/crates/hermesllm/src/apis/openai_responses.rs b/crates/hermesllm/src/apis/openai_responses.rs
index eac8a452..92d362b2 100644
--- a/crates/hermesllm/src/apis/openai_responses.rs
+++ b/crates/hermesllm/src/apis/openai_responses.rs
@@ -710,6 +710,18 @@ impl crate::providers::response::TokenUsage for ResponseUsage {
     fn total_tokens(&self) -> usize {
         self.total_tokens as usize
     }
+
+    fn cached_input_tokens(&self) -> Option<usize> {
+        self.input_tokens_details
+            .as_ref()
+            .map(|d| d.cached_tokens.max(0) as usize)
+    }
+
+    fn reasoning_tokens(&self) -> Option<usize> {
+        self.output_tokens_details
+            .as_ref()
+            .map(|d| d.reasoning_tokens.max(0) as usize)
+    }
 }
 
 /// Token details
diff --git a/crates/hermesllm/src/apis/streaming_shapes/anthropic_streaming_buffer.rs b/crates/hermesllm/src/apis/streaming_shapes/anthropic_streaming_buffer.rs
index eb9ec5b1..d3e3bbff 100644
--- a/crates/hermesllm/src/apis/streaming_shapes/anthropic_streaming_buffer.rs
+++ b/crates/hermesllm/src/apis/streaming_shapes/anthropic_streaming_buffer.rs
@@ -1,6 +1,9 @@
-use crate::apis::anthropic::MessagesStreamEvent;
+use crate::apis::anthropic::{
+    MessagesMessageDelta, MessagesStopReason, MessagesStreamEvent, MessagesUsage,
+};
 use crate::apis::streaming_shapes::sse::{SseEvent, SseStreamBufferTrait};
 use crate::providers::streaming_response::ProviderStreamResponseType;
+use log::warn;
 use std::collections::HashSet;
 
 /// SSE Stream Buffer for Anthropic Messages API streaming.
@@ -11,13 +14,24 @@ use std::collections::HashSet;
 ///
 /// When converting from OpenAI to Anthropic format, this buffer injects the required
 /// ContentBlockStart and ContentBlockStop events to maintain proper Anthropic protocol.
+///
+/// Guarantees (Anthropic Messages API contract):
+/// 1. `message_stop` is never emitted unless a matching `message_start` was emitted first.
+/// 2. `message_stop` is emitted at most once per stream (no double-close).
+/// 3. If upstream terminates with no content (empty/filtered/errored response), a
+///    minimal but well-formed envelope is synthesized so the client's state machine
+///    stays consistent.
 pub struct AnthropicMessagesStreamBuffer {
     /// Buffered SSE events ready to be written to wire
     buffered_events: Vec<SseEvent>,
 
-    /// Track if we've seen a message_start event
+    /// Track if we've emitted a message_start event
     message_started: bool,
 
+    /// Track if we've emitted a terminal message_stop event (for idempotency /
+    /// double-close protection).
+    message_stopped: bool,
+
     /// Track content block indices that have received ContentBlockStart events
     content_block_start_indices: HashSet<i32>,
 
@@ -42,6 +56,7 @@ impl AnthropicMessagesStreamBuffer {
         Self {
             buffered_events: Vec::new(),
             message_started: false,
+            message_stopped: false,
             content_block_start_indices: HashSet::new(),
             needs_content_block_stop: false,
             seen_message_delta: false,
@@ -49,6 +64,66 @@ impl AnthropicMessagesStreamBuffer {
         }
     }
 
+    /// Inject a `message_start` event into the buffer if one hasn't been emitted yet.
+    /// This is the single source of truth for opening a message — every handler
+    /// that can legitimately be the first event on the wire must call this before
+    /// pushing its own event.
+    fn ensure_message_started(&mut self) {
+        if self.message_started {
+            return;
+        }
+        let model = self.model.as_deref().unwrap_or("unknown");
+        let message_start = AnthropicMessagesStreamBuffer::create_message_start_event(model);
+        self.buffered_events.push(message_start);
+        self.message_started = true;
+    }
+
+    /// Inject a synthetic `message_delta` with `end_turn` / zero usage.
+    /// Used when we must close a message but upstream never produced a terminal
+    /// event (e.g. `[DONE]` arrives with no prior `finish_reason`).
+    fn push_synthetic_message_delta(&mut self) {
+        let event = MessagesStreamEvent::MessageDelta {
+            delta: MessagesMessageDelta {
+                stop_reason: MessagesStopReason::EndTurn,
+                stop_sequence: None,
+            },
+            usage: MessagesUsage {
+                input_tokens: 0,
+                output_tokens: 0,
+                cache_creation_input_tokens: None,
+                cache_read_input_tokens: None,
+            },
+        };
+        let sse_string: String = event.clone().into();
+        self.buffered_events.push(SseEvent {
+            data: None,
+            event: Some("message_delta".to_string()),
+            raw_line: sse_string.clone(),
+            sse_transformed_lines: sse_string,
+            provider_stream_response: Some(ProviderStreamResponseType::MessagesStreamEvent(event)),
+        });
+        self.seen_message_delta = true;
+    }
+
+    /// Inject a `message_stop` event into the buffer, marking the stream as closed.
+    /// Idempotent — subsequent calls are no-ops.
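+    ///
+    /// Wire frame this appends (shape per Anthropic's SSE event format):
+    /// ```text
+    /// event: message_stop
+    /// data: {"type":"message_stop"}
+    /// ```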
+ fn push_message_stop(&mut self) { + if self.message_stopped { + return; + } + let message_stop = MessagesStreamEvent::MessageStop; + let sse_string: String = message_stop.into(); + self.buffered_events.push(SseEvent { + data: None, + event: Some("message_stop".to_string()), + raw_line: sse_string.clone(), + sse_transformed_lines: sse_string, + provider_stream_response: None, + }); + self.message_stopped = true; + self.seen_message_delta = false; + } + /// Check if a content_block_start event has been sent for the given index fn has_content_block_start_been_sent(&self, index: i32) -> bool { self.content_block_start_indices.contains(&index) @@ -149,6 +224,27 @@ impl SseStreamBufferTrait for AnthropicMessagesStreamBuffer { // We match on a reference first to determine the type, then move the event match &event.provider_stream_response { Some(ProviderStreamResponseType::MessagesStreamEvent(evt)) => { + // If the message has already been closed, drop any trailing events + // to avoid emitting data after `message_stop` (protocol violation). + // This typically indicates a duplicate `[DONE]` from upstream or a + // replay of previously-buffered bytes — worth surfacing so we can + // spot misbehaving providers. + if self.message_stopped { + warn!( + "anthropic stream buffer: dropping event after message_stop (variant={})", + match evt { + MessagesStreamEvent::MessageStart { .. } => "message_start", + MessagesStreamEvent::ContentBlockStart { .. } => "content_block_start", + MessagesStreamEvent::ContentBlockDelta { .. } => "content_block_delta", + MessagesStreamEvent::ContentBlockStop { .. } => "content_block_stop", + MessagesStreamEvent::MessageDelta { .. } => "message_delta", + MessagesStreamEvent::MessageStop => "message_stop", + MessagesStreamEvent::Ping => "ping", + } + ); + return; + } + match evt { MessagesStreamEvent::MessageStart { .. } => { // Add the message_start event @@ -157,14 +253,7 @@ impl SseStreamBufferTrait for AnthropicMessagesStreamBuffer { } MessagesStreamEvent::ContentBlockStart { index, .. } => { let index = *index as i32; - // Inject message_start if needed - if !self.message_started { - let model = self.model.as_deref().unwrap_or("unknown"); - let message_start = - AnthropicMessagesStreamBuffer::create_message_start_event(model); - self.buffered_events.push(message_start); - self.message_started = true; - } + self.ensure_message_started(); // Add the content_block_start event (from tool calls or other sources) self.buffered_events.push(event); @@ -173,14 +262,7 @@ impl SseStreamBufferTrait for AnthropicMessagesStreamBuffer { } MessagesStreamEvent::ContentBlockDelta { index, .. } => { let index = *index as i32; - // Inject message_start if needed - if !self.message_started { - let model = self.model.as_deref().unwrap_or("unknown"); - let message_start = - AnthropicMessagesStreamBuffer::create_message_start_event(model); - self.buffered_events.push(message_start); - self.message_started = true; - } + self.ensure_message_started(); // Check if ContentBlockStart was sent for this index if !self.has_content_block_start_been_sent(index) { @@ -196,6 +278,11 @@ impl SseStreamBufferTrait for AnthropicMessagesStreamBuffer { self.buffered_events.push(event); } MessagesStreamEvent::MessageDelta { usage, .. } => { + // `message_delta` is only meaningful inside an open message. + // Upstream can send it with no prior content (empty completion, + // content filter, etc.), so we must open a message first. 
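+                        // e.g. an OpenAI stream whose lone chunk carries only
+                        // `"finish_reason":"stop"` reaches here as a bare
+                        // message_delta with no content deltas before it.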
+                        self.ensure_message_started();
+
                         // Inject ContentBlockStop before message_delta
                         if self.needs_content_block_stop {
                             let content_block_stop =
@@ -230,15 +317,52 @@ impl SseStreamBufferTrait for AnthropicMessagesStreamBuffer {
                     }
                     MessagesStreamEvent::ContentBlockStop { .. } => {
                         // ContentBlockStop received from upstream (e.g., Bedrock)
+                        self.ensure_message_started();
                         // Clear the flag so we don't inject another one
                         self.needs_content_block_stop = false;
                         self.buffered_events.push(event);
                     }
                     MessagesStreamEvent::MessageStop => {
-                        // MessageStop received from upstream (e.g., OpenAI via [DONE])
-                        // Clear the flag so we don't inject another one
-                        self.seen_message_delta = false;
+                        // MessageStop received from upstream (e.g., OpenAI via [DONE]).
+                        //
+                        // The Anthropic protocol requires the full envelope
+                        //   message_start → [content blocks] → message_delta → message_stop
+                        // so we must not emit a bare `message_stop`. Synthesize whatever
+                        // is missing to keep the client's state machine consistent.
+                        self.ensure_message_started();
+
+                        if self.needs_content_block_stop {
+                            let content_block_stop =
+                                AnthropicMessagesStreamBuffer::create_content_block_stop_event();
+                            self.buffered_events.push(content_block_stop);
+                            self.needs_content_block_stop = false;
+                        }
+
+                        // If no message_delta has been emitted yet (empty/filtered upstream
+                        // response), synthesize a minimal one carrying `end_turn`.
+                        if !self.seen_message_delta {
+                            // If we also never opened a content block, open and close one
+                            // so clients that expect at least one block are happy.
+                            if self.content_block_start_indices.is_empty() {
+                                let content_block_start =
+                                    AnthropicMessagesStreamBuffer::create_content_block_start_event();
+                                self.buffered_events.push(content_block_start);
+                                self.set_content_block_start_sent(0);
+                                let content_block_stop =
+                                    AnthropicMessagesStreamBuffer::create_content_block_stop_event();
+                                self.buffered_events.push(content_block_stop);
+                            }
+                            self.push_synthetic_message_delta();
+                        }
+
+                        // Push the upstream-provided message_stop and mark closed.
+                        // `push_message_stop` is idempotent but we want to reuse the
+                        // original SseEvent so raw passthrough semantics are preserved.
                         self.buffered_events.push(event);
+                        self.message_stopped = true;
+                        self.seen_message_delta = false;
                     }
                     _ => {
                         // Other Anthropic event types (Ping, etc.), just accumulate
@@ -254,24 +378,23 @@ impl SseStreamBufferTrait for AnthropicMessagesStreamBuffer {
     }
 
     fn to_bytes(&mut self) -> Vec<u8> {
-        // Convert all accumulated events to bytes and clear buffer
+        // Convert all accumulated events to bytes and clear buffer.
+        //
         // NOTE: We do NOT inject ContentBlockStop here because it's injected when we see MessageDelta
         // or MessageStop. Injecting it here causes premature ContentBlockStop in the middle of streaming.
-
-        // Inject MessageStop after MessageDelta if we've seen one
-        // This completes the Anthropic Messages API event sequence
-        if self.seen_message_delta {
-            let message_stop = MessagesStreamEvent::MessageStop;
-            let sse_string: String = message_stop.into();
-            let message_stop_event = SseEvent {
-                data: None,
-                event: Some("message_stop".to_string()),
-                raw_line: sse_string.clone(),
-                sse_transformed_lines: sse_string,
-                provider_stream_response: None,
-            };
-            self.buffered_events.push(message_stop_event);
-            self.seen_message_delta = false;
+        //
+        // Inject a synthetic `message_stop` only when:
+        // 1. A `message_delta` has been seen (otherwise we'd violate the Anthropic
+        //    protocol by emitting `message_stop` without a preceding `message_delta`), AND
+        // 2. We haven't already emitted `message_stop` (either synthetic from a
+        //    previous flush, or real from an upstream `[DONE]`).
+        //
+        // Without the `!message_stopped` guard, a stream whose `finish_reason` chunk
+        // and `[DONE]` marker land in separate HTTP body chunks would receive two
+        // `message_stop` events, triggering Claude Code's "Received message_stop
+        // without a current message" error.
+        if self.seen_message_delta && !self.message_stopped {
+            self.push_message_stop();
         }
 
         let mut buffer = Vec::new();
@@ -615,4 +738,133 @@ data: [DONE]"#;
         println!("✓ Stop reason: tool_use");
         println!("✓ Proper Anthropic tool_use protocol\n");
     }
+
+    /// Regression test for:
+    ///   Claude Code CLI error: "Received message_stop without a current message"
+    ///
+    /// Reproduces the *double-close* scenario: OpenAI's final `finish_reason`
+    /// chunk and the `[DONE]` marker arrive in **separate** HTTP body chunks, so
+    /// `to_bytes()` is called between them. Before the fix, this produced two
+    /// `message_stop` events on the wire (one synthetic, one from `[DONE]`).
+    #[test]
+    fn test_openai_to_anthropic_emits_single_message_stop_across_chunk_boundary() {
+        let client_api = SupportedAPIsFromClient::AnthropicMessagesAPI(AnthropicApi::Messages);
+        let upstream_api = SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions);
+        let mut buffer = AnthropicMessagesStreamBuffer::new();
+
+        // --- HTTP chunk 1: content + finish_reason (no [DONE] yet) -----------
+        let chunk_1 = r#"data: {"id":"c1","object":"chat.completion.chunk","created":1,"model":"gpt-4o","choices":[{"index":0,"delta":{"role":"assistant","content":"Hi"},"finish_reason":null}]}
+
+data: {"id":"c1","object":"chat.completion.chunk","created":1,"model":"gpt-4o","choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}"#;
+
+        for raw in SseStreamIter::try_from(chunk_1.as_bytes()).unwrap() {
+            let e = SseEvent::try_from((raw, &client_api, &upstream_api)).unwrap();
+            buffer.add_transformed_event(e);
+        }
+        let out_1 = String::from_utf8(buffer.to_bytes()).unwrap();
+
+        // --- HTTP chunk 2: just the [DONE] marker ----------------------------
+        let chunk_2 = "data: [DONE]";
+        for raw in SseStreamIter::try_from(chunk_2.as_bytes()).unwrap() {
+            let e = SseEvent::try_from((raw, &client_api, &upstream_api)).unwrap();
+            buffer.add_transformed_event(e);
+        }
+        let out_2 = String::from_utf8(buffer.to_bytes()).unwrap();
+
+        let combined = format!("{}{}", out_1, out_2);
+        let start_count = combined.matches("event: message_start").count();
+        let stop_count = combined.matches("event: message_stop").count();
+
+        assert_eq!(
+            start_count, 1,
+            "Must emit exactly one message_start across chunks, got {start_count}. Output:\n{combined}"
+        );
+        assert_eq!(
+            stop_count, 1,
+            "Must emit exactly one message_stop across chunks (no double-close), got {stop_count}. Output:\n{combined}"
+        );
+        // Every message_stop must be preceded by a message_start earlier in the stream.
+        let start_pos = combined.find("event: message_start").unwrap();
+        let stop_pos = combined.find("event: message_stop").unwrap();
+        assert!(
+            start_pos < stop_pos,
+            "message_start must come before message_stop. Output:\n{combined}"
+        );
+    }
+
+    /// Regression test for:
+    ///   "Received message_stop without a current message" on empty upstream responses.
+    ///
+    /// OpenAI returns only `[DONE]` with no content deltas and no `finish_reason`
+    /// (this happens with content filters, truncated upstream streams, and some
+    /// 5xx recoveries). Before the fix, the buffer emitted a bare `message_stop`
+    /// with no preceding `message_start`. After the fix, it synthesizes a
+    /// minimal but well-formed envelope.
+    #[test]
+    fn test_openai_done_only_stream_synthesizes_valid_envelope() {
+        let client_api = SupportedAPIsFromClient::AnthropicMessagesAPI(AnthropicApi::Messages);
+        let upstream_api = SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions);
+        let mut buffer = AnthropicMessagesStreamBuffer::new();
+
+        let raw_input = "data: [DONE]";
+        for raw in SseStreamIter::try_from(raw_input.as_bytes()).unwrap() {
+            let e = SseEvent::try_from((raw, &client_api, &upstream_api)).unwrap();
+            buffer.add_transformed_event(e);
+        }
+        let out = String::from_utf8(buffer.to_bytes()).unwrap();
+
+        assert!(
+            out.contains("event: message_start"),
+            "Empty upstream must still produce message_start. Output:\n{out}"
+        );
+        assert!(
+            out.contains("event: message_delta"),
+            "Empty upstream must produce a synthesized message_delta. Output:\n{out}"
+        );
+        assert_eq!(
+            out.matches("event: message_stop").count(),
+            1,
+            "Empty upstream must produce exactly one message_stop. Output:\n{out}"
+        );
+
+        // Protocol ordering: start < delta < stop.
+        let p_start = out.find("event: message_start").unwrap();
+        let p_delta = out.find("event: message_delta").unwrap();
+        let p_stop = out.find("event: message_stop").unwrap();
+        assert!(
+            p_start < p_delta && p_delta < p_stop,
+            "Bad ordering. Output:\n{out}"
+        );
+    }
+
+    /// Regression test: events arriving after `message_stop` (e.g. a stray `[DONE]`
+    /// echo, or late-arriving deltas from a racing upstream) must be dropped
+    /// rather than written after the terminal frame.
+    #[test]
+    fn test_events_after_message_stop_are_dropped() {
+        let client_api = SupportedAPIsFromClient::AnthropicMessagesAPI(AnthropicApi::Messages);
+        let upstream_api = SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions);
+        let mut buffer = AnthropicMessagesStreamBuffer::new();
+
+        let first = r#"data: {"id":"c1","object":"chat.completion.chunk","created":1,"model":"gpt-4o","choices":[{"index":0,"delta":{"content":"ok"},"finish_reason":"stop"}]}
+
+data: [DONE]"#;
+        for raw in SseStreamIter::try_from(first.as_bytes()).unwrap() {
+            let e = SseEvent::try_from((raw, &client_api, &upstream_api)).unwrap();
+            buffer.add_transformed_event(e);
+        }
+        let _ = buffer.to_bytes();
+
+        // Simulate a duplicate / late `[DONE]` after the stream was already closed.
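+        // Before the fix this emitted a second message_stop, which Claude
+        // Code rejects with "Received message_stop without a current message".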
+ let late = "data: [DONE]"; + for raw in SseStreamIter::try_from(late.as_bytes()).unwrap() { + let e = SseEvent::try_from((raw, &client_api, &upstream_api)).unwrap(); + buffer.add_transformed_event(e); + } + let tail = String::from_utf8(buffer.to_bytes()).unwrap(); + assert!( + tail.is_empty(), + "No bytes should be emitted after message_stop, got: {tail:?}" + ); + } } diff --git a/crates/hermesllm/src/bin/provider_models.yaml b/crates/hermesllm/src/bin/provider_models.yaml index 53dac7f4..2e9e0a9b 100644 --- a/crates/hermesllm/src/bin/provider_models.yaml +++ b/crates/hermesllm/src/bin/provider_models.yaml @@ -95,6 +95,7 @@ providers: anthropic: - anthropic/claude-sonnet-4-6 - anthropic/claude-opus-4-6 + - anthropic/claude-opus-4-7 - anthropic/claude-opus-4-5-20251101 - anthropic/claude-opus-4-5 - anthropic/claude-haiku-4-5-20251001 @@ -328,7 +329,57 @@ providers: - xiaomi/mimo-v2-flash - xiaomi/mimo-v2-omni - xiaomi/mimo-v2-pro + chatgpt: + - chatgpt/gpt-5.4 + - chatgpt/gpt-5.3-codex + - chatgpt/gpt-5.2 + digitalocean: + - digitalocean/openai-gpt-4.1 + - digitalocean/openai-gpt-4o + - digitalocean/openai-gpt-4o-mini + - digitalocean/openai-gpt-5 + - digitalocean/openai-gpt-5-mini + - digitalocean/openai-gpt-5-nano + - digitalocean/openai-gpt-5.1-codex-max + - digitalocean/openai-gpt-5.2 + - digitalocean/openai-gpt-5.2-pro + - digitalocean/openai-gpt-5.3-codex + - digitalocean/openai-gpt-5.4 + - digitalocean/openai-gpt-5.4-mini + - digitalocean/openai-gpt-5.4-nano + - digitalocean/openai-gpt-5.4-pro + - digitalocean/openai-gpt-oss-120b + - digitalocean/openai-gpt-oss-20b + - digitalocean/openai-o1 + - digitalocean/openai-o3 + - digitalocean/openai-o3-mini + - digitalocean/anthropic-claude-4.1-opus + - digitalocean/anthropic-claude-4.5-sonnet + - digitalocean/anthropic-claude-4.6-sonnet + - digitalocean/anthropic-claude-haiku-4.5 + - digitalocean/anthropic-claude-opus-4 + - digitalocean/anthropic-claude-opus-4.5 + - digitalocean/anthropic-claude-opus-4.6 + - digitalocean/anthropic-claude-opus-4.7 + - digitalocean/anthropic-claude-sonnet-4 + - digitalocean/alibaba-qwen3-32b + - digitalocean/arcee-trinity-large-thinking + - digitalocean/deepseek-3.2 + - digitalocean/deepseek-r1-distill-llama-70b + - digitalocean/gemma-4-31B-it + - digitalocean/glm-5 + - digitalocean/kimi-k2.5 + - digitalocean/llama3.3-70b-instruct + - digitalocean/minimax-m2.5 + - digitalocean/nvidia-nemotron-3-super-120b + - digitalocean/qwen3-coder-flash + - digitalocean/qwen3.5-397b-a17b + - digitalocean/all-mini-lm-l6-v2 + - digitalocean/gte-large-en-v1.5 + - digitalocean/multi-qa-mpnet-base-dot-v1 + - digitalocean/qwen3-embedding-0.6b + - digitalocean/router:software-engineering metadata: - total_providers: 11 - total_models: 316 - last_updated: 2026-04-03T23:14:46.956158+00:00 + total_providers: 13 + total_models: 364 + last_updated: 2026-04-20T00:00:00.000000+00:00 diff --git a/crates/hermesllm/src/clients/endpoints.rs b/crates/hermesllm/src/clients/endpoints.rs index 39b34358..eeef8856 100644 --- a/crates/hermesllm/src/clients/endpoints.rs +++ b/crates/hermesllm/src/clients/endpoints.rs @@ -175,7 +175,9 @@ impl SupportedAPIsFromClient { match self { SupportedAPIsFromClient::AnthropicMessagesAPI(AnthropicApi::Messages) => { match provider_id { - ProviderId::Anthropic => build_endpoint("/v1", "/messages"), + ProviderId::Anthropic | ProviderId::Vercel => { + build_endpoint("/v1", "/messages") + } ProviderId::AmazonBedrock => { if request_path.starts_with("/v1/") && !is_streaming { build_endpoint("", 
&format!("/model/{}/converse", model_id)) @@ -192,7 +194,10 @@ impl SupportedAPIsFromClient { // For Responses API, check if provider supports it, otherwise translate to chat/completions match provider_id { // Providers that support /v1/responses natively - ProviderId::OpenAI | ProviderId::XAI => route_by_provider("/responses"), + ProviderId::OpenAI + | ProviderId::XAI + | ProviderId::ChatGPT + | ProviderId::Vercel => route_by_provider("/responses"), // All other providers: translate to /chat/completions _ => route_by_provider("/chat/completions"), } @@ -718,4 +723,36 @@ mod tests { "/v1/responses" ); } + + #[test] + fn test_responses_api_targets_chatgpt_native_responses_endpoint() { + let api = SupportedAPIsFromClient::OpenAIResponsesAPI(OpenAIApi::Responses); + assert_eq!( + api.target_endpoint_for_provider( + &ProviderId::ChatGPT, + "/v1/responses", + "gpt-5.4", + false, + None, + false + ), + "/v1/responses" + ); + } + + #[test] + fn test_responses_api_targets_vercel_native_responses_endpoint() { + let api = SupportedAPIsFromClient::OpenAIResponsesAPI(OpenAIApi::Responses); + assert_eq!( + api.target_endpoint_for_provider( + &ProviderId::Vercel, + "/v1/responses", + "gpt-5.4", + false, + None, + false + ), + "/v1/responses" + ); + } } diff --git a/crates/hermesllm/src/providers/id.rs b/crates/hermesllm/src/providers/id.rs index c410bd78..4fa7d19d 100644 --- a/crates/hermesllm/src/providers/id.rs +++ b/crates/hermesllm/src/providers/id.rs @@ -44,6 +44,10 @@ pub enum ProviderId { Zhipu, Qwen, AmazonBedrock, + ChatGPT, + DigitalOcean, + Vercel, + OpenRouter, } impl TryFrom<&str> for ProviderId { @@ -71,6 +75,12 @@ impl TryFrom<&str> for ProviderId { "qwen" => Ok(ProviderId::Qwen), "amazon_bedrock" => Ok(ProviderId::AmazonBedrock), "amazon" => Ok(ProviderId::AmazonBedrock), // alias + "chatgpt" => Ok(ProviderId::ChatGPT), + "digitalocean" => Ok(ProviderId::DigitalOcean), + "do" => Ok(ProviderId::DigitalOcean), // alias + "do_ai" => Ok(ProviderId::DigitalOcean), // alias + "vercel" => Ok(ProviderId::Vercel), + "openrouter" => Ok(ProviderId::OpenRouter), _ => Err(format!("Unknown provider: {}", value)), } } @@ -95,6 +105,8 @@ impl ProviderId { ProviderId::Moonshotai => "moonshotai", ProviderId::Zhipu => "z-ai", ProviderId::Qwen => "qwen", + ProviderId::ChatGPT => "chatgpt", + ProviderId::DigitalOcean => "digitalocean", _ => return Vec::new(), }; @@ -132,6 +144,17 @@ impl ProviderId { SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions) } + // Vercel AI Gateway natively supports all three API types + (ProviderId::Vercel, SupportedAPIsFromClient::AnthropicMessagesAPI(_)) => { + SupportedUpstreamAPIs::AnthropicMessagesAPI(AnthropicApi::Messages) + } + (ProviderId::Vercel, SupportedAPIsFromClient::OpenAIChatCompletions(_)) => { + SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions) + } + (ProviderId::Vercel, SupportedAPIsFromClient::OpenAIResponsesAPI(_)) => { + SupportedUpstreamAPIs::OpenAIResponsesAPI(OpenAIApi::Responses) + } + // OpenAI-compatible providers only support OpenAI chat completions ( ProviderId::OpenAI @@ -148,7 +171,10 @@ impl ProviderId { | ProviderId::Ollama | ProviderId::Moonshotai | ProviderId::Zhipu - | ProviderId::Qwen, + | ProviderId::Qwen + | ProviderId::DigitalOcean + | ProviderId::OpenRouter + | ProviderId::ChatGPT, SupportedAPIsFromClient::AnthropicMessagesAPI(_), ) => SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions), @@ -167,13 +193,16 @@ impl ProviderId { | ProviderId::Ollama | ProviderId::Moonshotai | 
ProviderId::Zhipu - | ProviderId::Qwen, + | ProviderId::Qwen + | ProviderId::DigitalOcean + | ProviderId::OpenRouter + | ProviderId::ChatGPT, SupportedAPIsFromClient::OpenAIChatCompletions(_), ) => SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions), - // OpenAI Responses API - OpenAI and xAI support this natively + // OpenAI Responses API - OpenAI, xAI, and ChatGPT support this natively ( - ProviderId::OpenAI | ProviderId::XAI, + ProviderId::OpenAI | ProviderId::XAI | ProviderId::ChatGPT, SupportedAPIsFromClient::OpenAIResponsesAPI(_), ) => SupportedUpstreamAPIs::OpenAIResponsesAPI(OpenAIApi::Responses), @@ -234,6 +263,10 @@ impl Display for ProviderId { ProviderId::Zhipu => write!(f, "zhipu"), ProviderId::Qwen => write!(f, "qwen"), ProviderId::AmazonBedrock => write!(f, "amazon_bedrock"), + ProviderId::ChatGPT => write!(f, "chatgpt"), + ProviderId::DigitalOcean => write!(f, "digitalocean"), + ProviderId::Vercel => write!(f, "vercel"), + ProviderId::OpenRouter => write!(f, "openrouter"), } } } @@ -336,6 +369,79 @@ mod tests { ); } + #[test] + fn test_vercel_and_openrouter_parsing() { + assert_eq!(ProviderId::try_from("vercel"), Ok(ProviderId::Vercel)); + assert!(ProviderId::try_from("vercel_ai").is_err()); + assert_eq!( + ProviderId::try_from("openrouter"), + Ok(ProviderId::OpenRouter) + ); + assert!(ProviderId::try_from("open_router").is_err()); + } + + #[test] + fn test_vercel_compatible_api() { + use crate::clients::endpoints::{SupportedAPIsFromClient, SupportedUpstreamAPIs}; + + let openai_client = + SupportedAPIsFromClient::OpenAIChatCompletions(OpenAIApi::ChatCompletions); + let upstream = ProviderId::Vercel.compatible_api_for_client(&openai_client, false); + assert!( + matches!(upstream, SupportedUpstreamAPIs::OpenAIChatCompletions(_)), + "Vercel should map OpenAI client to OpenAIChatCompletions upstream" + ); + + let anthropic_client = + SupportedAPIsFromClient::AnthropicMessagesAPI(AnthropicApi::Messages); + let upstream = ProviderId::Vercel.compatible_api_for_client(&anthropic_client, false); + assert!( + matches!(upstream, SupportedUpstreamAPIs::AnthropicMessagesAPI(_)), + "Vercel should map Anthropic client to AnthropicMessagesAPI upstream natively" + ); + + let responses_client = SupportedAPIsFromClient::OpenAIResponsesAPI(OpenAIApi::Responses); + let upstream = ProviderId::Vercel.compatible_api_for_client(&responses_client, false); + assert!( + matches!(upstream, SupportedUpstreamAPIs::OpenAIResponsesAPI(_)), + "Vercel should map Responses API client to OpenAIResponsesAPI upstream natively" + ); + } + + #[test] + fn test_openrouter_compatible_api() { + use crate::clients::endpoints::{SupportedAPIsFromClient, SupportedUpstreamAPIs}; + + let openai_client = + SupportedAPIsFromClient::OpenAIChatCompletions(OpenAIApi::ChatCompletions); + let upstream = ProviderId::OpenRouter.compatible_api_for_client(&openai_client, false); + assert!( + matches!(upstream, SupportedUpstreamAPIs::OpenAIChatCompletions(_)), + "OpenRouter should map OpenAI client to OpenAIChatCompletions upstream" + ); + + let anthropic_client = + SupportedAPIsFromClient::AnthropicMessagesAPI(AnthropicApi::Messages); + let upstream = ProviderId::OpenRouter.compatible_api_for_client(&anthropic_client, false); + assert!( + matches!(upstream, SupportedUpstreamAPIs::OpenAIChatCompletions(_)), + "OpenRouter should translate Anthropic client to OpenAIChatCompletions upstream" + ); + + let responses_client = SupportedAPIsFromClient::OpenAIResponsesAPI(OpenAIApi::Responses); + let upstream = 
ProviderId::OpenRouter.compatible_api_for_client(&responses_client, false); + assert!( + matches!(upstream, SupportedUpstreamAPIs::OpenAIChatCompletions(_)), + "OpenRouter should translate Responses API client to OpenAIChatCompletions upstream" + ); + } + + #[test] + fn test_vercel_and_openrouter_empty_models() { + assert!(ProviderId::Vercel.models().is_empty()); + assert!(ProviderId::OpenRouter.models().is_empty()); + } + #[test] fn test_xai_uses_responses_api_for_responses_clients() { use crate::clients::endpoints::{SupportedAPIsFromClient, SupportedUpstreamAPIs}; @@ -347,4 +453,16 @@ mod tests { SupportedUpstreamAPIs::OpenAIResponsesAPI(OpenAIApi::Responses) )); } + + #[test] + fn test_chatgpt_uses_responses_api_for_responses_clients() { + use crate::clients::endpoints::{SupportedAPIsFromClient, SupportedUpstreamAPIs}; + + let client_api = SupportedAPIsFromClient::OpenAIResponsesAPI(OpenAIApi::Responses); + let upstream = ProviderId::ChatGPT.compatible_api_for_client(&client_api, false); + assert!(matches!( + upstream, + SupportedUpstreamAPIs::OpenAIResponsesAPI(OpenAIApi::Responses) + )); + } } diff --git a/crates/hermesllm/src/providers/request.rs b/crates/hermesllm/src/providers/request.rs index 92688133..aa100a17 100644 --- a/crates/hermesllm/src/providers/request.rs +++ b/crates/hermesllm/src/providers/request.rs @@ -77,7 +77,7 @@ impl ProviderRequestType { &mut self, provider_id: ProviderId, upstream_api: &SupportedUpstreamAPIs, - ) { + ) -> Result<(), ProviderRequestError> { if provider_id == ProviderId::XAI && matches!( upstream_api, @@ -89,6 +89,48 @@ impl ProviderRequestType { req.web_search_options = None; } } + + // ChatGPT requires instructions, store=false, and input as a list + if provider_id == ProviderId::ChatGPT { + if let Self::ResponsesAPIRequest(req) = self { + use crate::apis::openai_responses::{ + InputItem, InputMessage, InputParam, MessageContent, MessageRole, + }; + + const CHATGPT_BASE_INSTRUCTIONS: &str = + "You are Codex, based on GPT-5. You are running as a coding agent in the Codex CLI on a user's computer."; + match &req.instructions { + Some(existing) if existing.contains(CHATGPT_BASE_INSTRUCTIONS) => {} + Some(existing) => { + req.instructions = + Some(format!("{}\n\n{}", CHATGPT_BASE_INSTRUCTIONS, existing)); + } + None => { + req.instructions = Some(CHATGPT_BASE_INSTRUCTIONS.to_string()); + } + } + req.store = Some(false); + if req.stream == Some(false) { + return Err(ProviderRequestError { + message: "Non-streaming requests are not supported for the ChatGPT Codex provider. 
Set stream=true or omit the stream field.".to_string(),
+                        source: None,
+                    });
+                }
+                req.stream = Some(true);
+
+                // ChatGPT backend requires input to be a list, not a plain string
+                if let InputParam::Text(text) = &req.input {
+                    req.input = InputParam::Items(vec![InputItem::Message(InputMessage {
+                        role: MessageRole::User,
+                        content: MessageContent::Text(text.clone()),
+                    })]);
+                }
+                if let InputParam::SingleItem(item) = &req.input {
+                    req.input = InputParam::Items(vec![item.clone()]);
+                }
+            }
+        }
+        Ok(())
+    }
 }
 
@@ -824,10 +866,12 @@ mod tests {
             ..Default::default()
         });
 
-        request.normalize_for_upstream(
-            ProviderId::XAI,
-            &SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions),
-        );
+        request
+            .normalize_for_upstream(
+                ProviderId::XAI,
+                &SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions),
+            )
+            .unwrap();
 
         let ProviderRequestType::ChatCompletionsRequest(req) = request else {
             panic!("expected chat request");
@@ -852,10 +896,12 @@ mod tests {
             ..Default::default()
        });
 
-        request.normalize_for_upstream(
-            ProviderId::OpenAI,
-            &SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions),
-        );
+        request
+            .normalize_for_upstream(
+                ProviderId::OpenAI,
+                &SupportedUpstreamAPIs::OpenAIChatCompletions(OpenAIApi::ChatCompletions),
+            )
+            .unwrap();
 
         let ProviderRequestType::ChatCompletionsRequest(req) = request else {
             panic!("expected chat request");
diff --git a/crates/hermesllm/src/providers/response.rs b/crates/hermesllm/src/providers/response.rs
index 5f46f97b..b8565ddf 100644
--- a/crates/hermesllm/src/providers/response.rs
+++ b/crates/hermesllm/src/providers/response.rs
@@ -23,6 +23,31 @@ pub trait TokenUsage {
     fn completion_tokens(&self) -> usize;
     fn prompt_tokens(&self) -> usize;
     fn total_tokens(&self) -> usize;
+    /// Tokens served from a prompt cache read (OpenAI `prompt_tokens_details.cached_tokens`,
+    /// Anthropic `cache_read_input_tokens`, Google `cached_content_token_count`).
+    fn cached_input_tokens(&self) -> Option<usize> {
+        None
+    }
+    /// Tokens used to write a cache entry (Anthropic `cache_creation_input_tokens`).
+    fn cache_creation_tokens(&self) -> Option<usize> {
+        None
+    }
+    /// Reasoning tokens for reasoning models (OpenAI `completion_tokens_details.reasoning_tokens`,
+    /// Google `thoughts_token_count`).
+    fn reasoning_tokens(&self) -> Option<usize> {
+        None
+    }
+}
+
+/// Rich usage breakdown extracted from a provider response.
+#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
+pub struct UsageDetails {
+    pub prompt_tokens: usize,
+    pub completion_tokens: usize,
+    pub total_tokens: usize,
+    pub cached_input_tokens: Option<usize>,
+    pub cache_creation_tokens: Option<usize>,
+    pub reasoning_tokens: Option<usize>,
 }
 
 pub trait ProviderResponse: Send + Sync {
@@ -34,6 +59,18 @@ pub trait ProviderResponse: Send + Sync {
         self.usage()
             .map(|u| (u.prompt_tokens(), u.completion_tokens(), u.total_tokens()))
     }
+
+    /// Extract a rich usage breakdown including cached/cache-creation/reasoning tokens.
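+    ///
+    /// Illustrative call site (hypothetical `response` value; not a compiled
+    /// doc-test):
+    /// ```ignore
+    /// if let Some(d) = response.extract_usage_details() {
+    ///     println!("cached: {:?}, reasoning: {:?}", d.cached_input_tokens, d.reasoning_tokens);
+    /// }
+    /// ```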
+    fn extract_usage_details(&self) -> Option<UsageDetails> {
+        self.usage().map(|u| UsageDetails {
+            prompt_tokens: u.prompt_tokens(),
+            completion_tokens: u.completion_tokens(),
+            total_tokens: u.total_tokens(),
+            cached_input_tokens: u.cached_input_tokens(),
+            cache_creation_tokens: u.cache_creation_tokens(),
+            reasoning_tokens: u.reasoning_tokens(),
+        })
+    }
 }
 
 impl ProviderResponse for ProviderResponseType {
diff --git a/crates/hermesllm/src/providers/streaming_response.rs b/crates/hermesllm/src/providers/streaming_response.rs
index 66ccc735..8d06dfcf 100644
--- a/crates/hermesllm/src/providers/streaming_response.rs
+++ b/crates/hermesllm/src/providers/streaming_response.rs
@@ -346,12 +346,10 @@ impl TryFrom<(SseEvent, &SupportedAPIsFromClient, &SupportedUpstreamAPIs)> for SseEvent {
             (
                 SupportedAPIsFromClient::OpenAIChatCompletions(_),
                 SupportedUpstreamAPIs::AnthropicMessagesAPI(_),
-            ) => {
+            ) if transformed_event.is_event_only() && transformed_event.event.is_some() => {
                 // OpenAI clients don't expect separate event: lines
                 // Suppress upstream Anthropic event-only lines
-                if transformed_event.is_event_only() && transformed_event.event.is_some() {
-                    transformed_event.sse_transformed_lines = "\n".to_string();
-                }
+                transformed_event.sse_transformed_lines = "\n".to_string();
             }
             _ => {
                 // Other cross-API combinations can be handled here as needed
@@ -371,12 +369,10 @@ impl TryFrom<(SseEvent, &SupportedAPIsFromClient, &SupportedUpstreamAPIs)> for SseEvent {
             | (
                 SupportedAPIsFromClient::OpenAIResponsesAPI(_),
                 SupportedUpstreamAPIs::OpenAIResponsesAPI(_),
-            ) => {
-                if transformed_event.is_event_only() && transformed_event.event.is_some() {
-                    // Mark as should-skip by clearing sse_transformed_lines
-                    // The event line is already included when the data line is transformed
-                    transformed_event.sse_transformed_lines = String::new();
-                }
+            ) if transformed_event.is_event_only() && transformed_event.event.is_some() => {
+                // Mark as should-skip by clearing sse_transformed_lines
+                // The event line is already included when the data line is transformed
+                transformed_event.sse_transformed_lines = String::new();
             }
             _ => {
                 // Other passthrough combinations (OpenAI ChatCompletions, etc.) don't have this issue
diff --git a/crates/hermesllm/src/transforms/lib.rs b/crates/hermesllm/src/transforms/lib.rs
index 115f061c..5308cc47 100644
--- a/crates/hermesllm/src/transforms/lib.rs
+++ b/crates/hermesllm/src/transforms/lib.rs
@@ -188,14 +188,13 @@ pub fn convert_openai_message_to_anthropic_content(
 
     // Handle regular content
     match &message.content {
-        Some(MessageContent::Text(text)) => {
-            if !text.is_empty() {
-                blocks.push(MessagesContentBlock::Text {
-                    text: text.clone(),
-                    cache_control: None,
-                });
-            }
+        Some(MessageContent::Text(text)) if !text.is_empty() => {
+            blocks.push(MessagesContentBlock::Text {
+                text: text.clone(),
+                cache_control: None,
+            });
         }
+        Some(MessageContent::Text(_)) => {}
         Some(MessageContent::Parts(parts)) => {
             for part in parts {
                 match part {
diff --git a/crates/hermesllm/src/transforms/request/from_anthropic.rs b/crates/hermesllm/src/transforms/request/from_anthropic.rs
index 82dbe547..dba17dde 100644
--- a/crates/hermesllm/src/transforms/request/from_anthropic.rs
+++ b/crates/hermesllm/src/transforms/request/from_anthropic.rs
@@ -354,10 +354,10 @@ impl TryFrom<MessagesMessage> for BedrockMessage {
             MessagesMessageContent::Blocks(blocks) => {
                 for block in blocks {
                     match block {
-                        crate::apis::anthropic::MessagesContentBlock::Text { text, .. } => {
} => { - if !text.is_empty() { - content_blocks.push(ContentBlock::Text { text }); - } + crate::apis::anthropic::MessagesContentBlock::Text { text, .. } + if !text.is_empty() => + { + content_blocks.push(ContentBlock::Text { text }); } crate::apis::anthropic::MessagesContentBlock::ToolUse { id, diff --git a/crates/hermesllm/src/transforms/request/from_openai.rs b/crates/hermesllm/src/transforms/request/from_openai.rs index 70e69cb8..b673af38 100644 --- a/crates/hermesllm/src/transforms/request/from_openai.rs +++ b/crates/hermesllm/src/transforms/request/from_openai.rs @@ -317,11 +317,10 @@ impl TryFrom for BedrockMessage { Role::User => { // Convert user message content to content blocks match message.content { - Some(MessageContent::Text(text)) => { - if !text.is_empty() { - content_blocks.push(ContentBlock::Text { text }); - } + Some(MessageContent::Text(text)) if !text.is_empty() => { + content_blocks.push(ContentBlock::Text { text }); } + Some(MessageContent::Text(_)) => {} Some(MessageContent::Parts(parts)) => { // Convert OpenAI content parts to Bedrock ContentBlocks for part in parts { diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index afb0b050..fa9964dd 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -177,24 +177,33 @@ impl StreamContext { } fn modify_auth_headers(&mut self) -> Result<(), ServerError> { - if self.llm_provider().passthrough_auth == Some(true) { - // Check if client provided an Authorization header - if self.get_http_request_header("Authorization").is_none() { - warn!( - "request_id={}: passthrough_auth enabled but no authorization header present in client request", - self.request_identifier() - ); - } else { - debug!( - "request_id={}: preserving client authorization header for provider '{}'", - self.request_identifier(), - self.llm_provider().name - ); + // Determine the credential to forward upstream. Either the client + // supplied one (passthrough_auth) or it's configured on the provider. + let credential: String = if self.llm_provider().passthrough_auth == Some(true) { + // Client auth may arrive in either Anthropic-style (`x-api-key`) + // or OpenAI-style (`Authorization: Bearer ...`). Accept both so + // clients using Anthropic SDKs (which default to `x-api-key`) + // work when the upstream is OpenAI-compatible, and vice versa. + let authorization = self.get_http_request_header("Authorization"); + let x_api_key = self.get_http_request_header("x-api-key"); + match extract_client_credential(authorization.as_deref(), x_api_key.as_deref()) { + Some(key) => { + debug!( + "request_id={}: forwarding client credential to provider '{}'", + self.request_identifier(), + self.llm_provider().name + ); + key + } + None => { + warn!( + "request_id={}: passthrough_auth enabled but no Authorization / x-api-key header present in client request", + self.request_identifier() + ); + return Ok(()); + } } - return Ok(()); - } - - let llm_provider_api_key_value = + } else { self.llm_provider() .access_key .as_ref() @@ -203,15 +212,19 @@ impl StreamContext { "No access key configured for selected LLM Provider \"{}\"", self.llm_provider() ), - })?; + })? + .clone() + }; - // Set API-specific headers based on the resolved upstream API + // Normalize the credential into whichever header the upstream expects. + // This lets an Anthropic-SDK client reach an OpenAI-compatible upstream + // (and vice versa) without the caller needing to know what format the + // upstream uses. 
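+        // For example, a client that sent `x-api-key: sk-ant-...` toward a
+        // provider resolved to the OpenAI Chat Completions API goes upstream
+        // as `Authorization: Bearer sk-ant-...` (illustrative key value).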
 match self.resolved_api.as_ref() {
             Some(SupportedUpstreamAPIs::AnthropicMessagesAPI(_)) => {
-                // Anthropic API requires x-api-key and anthropic-version headers
-                // Remove any existing Authorization header since Anthropic doesn't use it
+                // Anthropic expects `x-api-key` + `anthropic-version`.
                 self.remove_http_request_header("Authorization");
-                self.set_http_request_header("x-api-key", Some(llm_provider_api_key_value));
+                self.set_http_request_header("x-api-key", Some(&credential));
                 self.set_http_request_header("anthropic-version", Some("2023-06-01"));
             }
             Some(
@@ -221,14 +234,21 @@
                 | SupportedUpstreamAPIs::OpenAIResponsesAPI(_),
             )
             | None => {
-                // OpenAI and default: use Authorization Bearer token
-                // Remove any existing x-api-key header since OpenAI doesn't use it
+                // OpenAI (and default): `Authorization: Bearer ...`.
                 self.remove_http_request_header("x-api-key");
-                let authorization_header_value = format!("Bearer {}", llm_provider_api_key_value);
+                let authorization_header_value = format!("Bearer {}", credential);
                 self.set_http_request_header("Authorization", Some(&authorization_header_value));
             }
         }

+        // Apply any extra headers configured on the provider (e.g., ChatGPT-Account-Id, originator)
+        let headers = self.llm_provider().headers.clone();
+        if let Some(headers) = headers {
+            for (key, value) in &headers {
+                self.set_http_request_header(key, Some(value));
+            }
+        }
+
         Ok(())
     }

@@ -1048,7 +1068,20 @@
             match ProviderRequestType::try_from((deserialized_client_request, upstream)) {
                 Ok(mut request) => {
-                    request.normalize_for_upstream(self.get_provider_id(), upstream);
+                    if let Err(e) =
+                        request.normalize_for_upstream(self.get_provider_id(), upstream)
+                    {
+                        warn!(
+                            "request_id={}: normalize_for_upstream failed: {}",
+                            self.request_identifier(),
+                            e
+                        );
+                        self.send_server_error(
+                            ServerError::LogicError(e.message),
+                            Some(StatusCode::BAD_REQUEST),
+                        );
+                        return Action::Pause;
+                    }
                     debug!(
                         "request_id={}: upstream request payload: {}",
                         self.request_identifier(),
@@ -1235,3 +1268,86 @@ fn current_time_ns() -> u128 {
 }

 impl Context for StreamContext {}
+
+/// Extract the credential a client sent in either an OpenAI-style
+/// `Authorization` header or an Anthropic-style `x-api-key` header.
+///
+/// Returns `None` when neither header is present or both are empty/whitespace.
+/// The `Bearer ` prefix on the `Authorization` value is stripped if present;
+/// otherwise the value is taken verbatim (some clients send a raw token).
+fn extract_client_credential(
+    authorization: Option<&str>,
+    x_api_key: Option<&str>,
+) -> Option<String> {
+    // Strip the optional "Bearer " / "Bearer" prefix (case-sensitive, matches
+    // OpenAI SDK behavior) and trim surrounding whitespace before validating
+    // non-empty.
+    let from_authorization = authorization
+        .map(|v| {
+            v.strip_prefix("Bearer ")
+                .or_else(|| v.strip_prefix("Bearer"))
+                .unwrap_or(v)
+                .trim()
+                .to_string()
+        })
+        .filter(|s| !s.is_empty());
+    if from_authorization.is_some() {
+        return from_authorization;
+    }
+    x_api_key
+        .map(str::trim)
+        .filter(|s| !s.is_empty())
+        .map(|s| s.to_string())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::extract_client_credential;
+
+    #[test]
+    fn authorization_bearer_strips_prefix() {
+        assert_eq!(
+            extract_client_credential(Some("Bearer sk-abc"), None),
+            Some("sk-abc".to_string())
+        );
+    }
+
+    #[test]
+    fn authorization_raw_token_preserved() {
+        // Some clients send the raw token without "Bearer " — accept it.
+ assert_eq!( + extract_client_credential(Some("sk-abc"), None), + Some("sk-abc".to_string()) + ); + } + + #[test] + fn x_api_key_used_when_authorization_absent() { + assert_eq!( + extract_client_credential(None, Some("sk-ant-api-key")), + Some("sk-ant-api-key".to_string()) + ); + } + + #[test] + fn authorization_wins_when_both_present() { + // If a client is particularly exotic and sends both, prefer the + // OpenAI-style Authorization header. + assert_eq!( + extract_client_credential(Some("Bearer openai-key"), Some("anthropic-key")), + Some("openai-key".to_string()) + ); + } + + #[test] + fn returns_none_when_neither_present() { + assert!(extract_client_credential(None, None).is_none()); + } + + #[test] + fn empty_and_whitespace_headers_are_ignored() { + assert!(extract_client_credential(Some(""), None).is_none()); + assert!(extract_client_credential(Some("Bearer "), None).is_none()); + assert!(extract_client_credential(Some(" "), Some(" ")).is_none()); + } +} diff --git a/demos/llm_routing/chatgpt_subscription/README.md b/demos/llm_routing/chatgpt_subscription/README.md new file mode 100644 index 00000000..d091155a --- /dev/null +++ b/demos/llm_routing/chatgpt_subscription/README.md @@ -0,0 +1,61 @@ +# ChatGPT Subscription Routing + +Route requests through your ChatGPT Plus/Pro subscription using Plano. Uses the OpenAI Responses API under the hood, targeting `chatgpt.com/backend-api/codex/responses`. + +## Setup + +### 1. Authenticate with ChatGPT + +```bash +planoai chatgpt login +``` + +This opens a device code flow — visit the URL shown and enter the code. Tokens are saved to `~/.plano/chatgpt/auth.json`. + +### 2. Start Plano + +```bash +planoai up config.yaml +``` + +### 3. Send a request + +```bash +curl http://localhost:12000/v1/responses \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-5.2", + "input": "Hello, what model are you?" 
+  }'
+```
+
+Or use the test script:
+
+```bash
+bash test_chatgpt.sh
+```
+
+## How it works
+
+- `chatgpt/gpt-5.2` in the config tells Plano to use the ChatGPT subscription provider
+- Plano reads OAuth tokens from `~/.plano/chatgpt/auth.json` (auto-refreshes if expired)
+- Requests are proxied to `https://chatgpt.com/backend-api/codex/responses` with the required headers:
+  - `Authorization: Bearer <access_token>`
+  - `ChatGPT-Account-Id: <account_id>`
+  - `originator: codex_cli_rs`
+  - `session_id: <session_uuid>`
+
+## Available models
+
+```
+chatgpt/gpt-5.4
+chatgpt/gpt-5.3-codex
+chatgpt/gpt-5.2
+```
+
+## Managing credentials
+
+```bash
+planoai chatgpt status   # Check auth status
+planoai chatgpt logout   # Remove stored credentials
+```
diff --git a/demos/llm_routing/chatgpt_subscription/chat.py b/demos/llm_routing/chatgpt_subscription/chat.py
new file mode 100644
index 00000000..3c6b8ae3
--- /dev/null
+++ b/demos/llm_routing/chatgpt_subscription/chat.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+"""Interactive chat with a model through Plano using the OpenAI SDK."""
+
+import sys
+from openai import OpenAI
+
+client = OpenAI(base_url="http://localhost:12000/v1", api_key="unused")
+
+
+def run_chat(model):
+    print(f"Chatting with {model} via Plano (Ctrl+C to quit)\n")
+    history = []
+    while True:
+        try:
+            user_input = input("you> ")
+        except (KeyboardInterrupt, EOFError):
+            print("\nbye")
+            break
+        if not user_input.strip():
+            continue
+
+        history.append({"role": "user", "content": user_input})
+
+        stream = client.responses.create(model=model, input=history, stream=True)
+        print(f"{model}> ", end="", flush=True)
+        full = ""
+        for event in stream:
+            if event.type == "response.output_text.delta":
+                print(event.delta, end="", flush=True)
+                full += event.delta
+        print()
+
+        history.append({"role": "assistant", "content": full})
+
+
+if __name__ == "__main__":
+    model = sys.argv[1] if len(sys.argv) > 1 else "gpt-5.2"
+    run_chat(model)
diff --git a/demos/llm_routing/chatgpt_subscription/config.yaml b/demos/llm_routing/chatgpt_subscription/config.yaml
new file mode 100644
index 00000000..a7137b3d
--- /dev/null
+++ b/demos/llm_routing/chatgpt_subscription/config.yaml
@@ -0,0 +1,9 @@
+version: v0.3.0
+
+listeners:
+  - type: model
+    name: model_listener
+    port: 12000
+
+model_providers:
+  - model: chatgpt/*
diff --git a/demos/llm_routing/chatgpt_subscription/test_chatgpt.sh b/demos/llm_routing/chatgpt_subscription/test_chatgpt.sh
new file mode 100755
index 00000000..5544049d
--- /dev/null
+++ b/demos/llm_routing/chatgpt_subscription/test_chatgpt.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Test ChatGPT subscription routing through Plano
+# Prerequisites: planoai chatgpt login && planoai up config.yaml
+
+set -e
+
+echo "Testing ChatGPT subscription via Plano Responses API..."
+echo ""
+
+curl -s http://localhost:12000/v1/responses \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "gpt-5.2",
+    "input": "What is 2 + 2? Reply in one word."
+  }' | python3 -m json.tool
+
+echo ""
+echo "Done."
diff --git a/demos/llm_routing/claude_code_router/config.yaml b/demos/llm_routing/claude_code_router/config.yaml
index e72aa73a..6235b6c6 100644
--- a/demos/llm_routing/claude_code_router/config.yaml
+++ b/demos/llm_routing/claude_code_router/config.yaml
@@ -19,7 +19,7 @@ model_providers:
     - name: code understanding
       description: understand and explain existing code snippets, functions, or libraries
   # Anthropic Models
-  - model: anthropic/claude-sonnet-4-5
+  - model: anthropic/claude-sonnet-4-6
     default: true
     access_key: $ANTHROPIC_API_KEY

diff --git a/demos/llm_routing/preference_based_routing/README.md b/demos/llm_routing/preference_based_routing/README.md
index f04fcf06..3208c17c 100644
--- a/demos/llm_routing/preference_based_routing/README.md
+++ b/demos/llm_routing/preference_based_routing/README.md
@@ -3,7 +3,7 @@ This demo shows how you can use user preferences to route user prompts to approp

 ## How to start the demo

-Make sure you have Plano CLI installed (`pip install planoai==0.4.19` or `uv tool install planoai==0.4.19`).
+Make sure you have Plano CLI installed (`pip install planoai==0.4.21` or `uv tool install planoai==0.4.21`).

 ```bash
 cd demos/llm_routing/preference_based_routing
diff --git a/docs/routing-api.md b/docs/routing-api.md
index c2b9c63f..4d1d6a8e 100644
--- a/docs/routing-api.md
+++ b/docs/routing-api.md
@@ -34,11 +34,13 @@ POST /v1/chat/completions

 ### `routing_preferences` fields

-| Field | Type | Required | Description |
-|---|---|---|---|
-| `name` | string | yes | Route identifier. Must match the LLM router's route classification. |
-| `description` | string | yes | Natural language description used by the router to match user intent. |
-| `models` | string[] | yes | Ordered candidate pool. At least one entry required. Must be declared in `model_providers`. |
+
+| Field         | Type     | Required | Description                                                                                  |
+| ------------- | -------- | -------- | -------------------------------------------------------------------------------------------- |
+| `name`        | string   | yes      | Route identifier. Must match the LLM router's route classification.                         |
+| `description` | string   | yes      | Natural language description used by the router to match user intent.                       |
+| `models`      | string[] | yes      | Ordered candidate pool. At least one entry required. Must be declared in `model_providers`.  |
+

 ### Notes

@@ -64,11 +66,13 @@ POST /v1/chat/completions

 ### Fields

-| Field | Type | Description |
-|---|---|---|
-| `models` | string[] | Ranked model list. Use `models[0]` as primary; retry with `models[1]` on 429/5xx, and so on. |
-| `route` | string \| null | Name of the matched route. `null` if no route matched — client should use the original request `model`. |
-| `trace_id` | string | Trace ID for distributed tracing and observability. |
+
+| Field      | Type           | Description                                                                                              |
+| ---------- | -------------- | -------------------------------------------------------------------------------------------------------- |
+| `models`   | string[]       | Ranked model list. Use `models[0]` as primary; retry with `models[1]` on 429/5xx, and so on.             |
+| `route`    | string \| null | Name of the matched route. `null` if no route matched — client should use the original request `model`.  |
+| `trace_id` | string         | Trace ID for distributed tracing and observability.                                                      |
+

 ---

@@ -142,6 +146,7 @@ X-Model-Affinity: a1b2c3d4-5678-...
 ```

 Response when pinned:
+
 ```json
 {
   "models": ["anthropic/claude-sonnet-4-20250514"],
@@ -155,6 +160,7 @@ Response when pinned:

 Without the header, routing runs fresh every time (no breaking change).
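+
+The ranked `models` list is designed for client-side fallback. A minimal
+sketch of that loop (`complete_with_fallback` is an illustrative helper, not
+part of Plano; assumes the `openai` Python SDK):
+
+```python
+import openai
+
+def complete_with_fallback(client, models, messages):
+    """Try each ranked model in order; fall back on 429/5xx."""
+    last_err = None
+    for model in models:
+        try:
+            return client.chat.completions.create(model=model, messages=messages)
+        except openai.APIStatusError as e:
+            if e.status_code == 429 or e.status_code >= 500:
+                last_err = e  # rate-limited or server error: try the next candidate
+                continue
+            raise  # anything else is a real client error
+    raise last_err
+```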
Configure TTL and cache size: + ```yaml routing: session_ttl_seconds: 600 # default: 10 min @@ -165,7 +171,8 @@ routing: ## Version Requirements -| Version | Top-level `routing_preferences` | -|---|---| + +| Version | Top-level `routing_preferences` | +| ---------- | -------------------------------------- | | `< v0.4.0` | Not allowed — startup error if present | -| `v0.4.0+` | Supported (required for model routing) | +| `v0.4.0+` | Supported (required for model routing) | diff --git a/docs/source/concepts/llm_providers/supported_providers.rst b/docs/source/concepts/llm_providers/supported_providers.rst index 87163d3b..60f468e0 100644 --- a/docs/source/concepts/llm_providers/supported_providers.rst +++ b/docs/source/concepts/llm_providers/supported_providers.rst @@ -158,7 +158,9 @@ Anthropic .. code-block:: yaml - llm_providers: + version: v0.4.0 + + model_providers: # Configure all Anthropic models with wildcard - model: anthropic/* access_key: $ANTHROPIC_API_KEY @@ -179,8 +181,12 @@ Anthropic - model: anthropic/claude-sonnet-4-20250514 access_key: $ANTHROPIC_PROD_API_KEY - routing_preferences: - - name: code_generation + + routing_preferences: + - name: code_generation + description: generating new code snippets, functions, or boilerplate based on user prompts or requirements + models: + - anthropic/claude-sonnet-4-20250514 DeepSeek ~~~~~~~~ @@ -798,7 +804,9 @@ You can configure specific models with custom settings even when using wildcards .. code-block:: yaml - llm_providers: + version: v0.4.0 + + model_providers: # Expand to all Anthropic models - model: anthropic/* access_key: $ANTHROPIC_API_KEY @@ -807,14 +815,17 @@ You can configure specific models with custom settings even when using wildcards # This model will NOT be included in the wildcard expansion above - model: anthropic/claude-sonnet-4-20250514 access_key: $ANTHROPIC_PROD_API_KEY - routing_preferences: - - name: code_generation - priority: 1 # Another specific override - model: anthropic/claude-3-haiku-20240307 access_key: $ANTHROPIC_DEV_API_KEY + routing_preferences: + - name: code_generation + description: generating new code snippets, functions, or boilerplate based on user prompts or requirements + models: + - anthropic/claude-sonnet-4-20250514 + **Custom Provider Wildcards:** For providers not in Plano's registry, wildcards enable dynamic model routing: @@ -856,24 +867,36 @@ Mark one model as the default for fallback scenarios: Routing Preferences ~~~~~~~~~~~~~~~~~~~ -Configure routing preferences for dynamic model selection: +Starting in ``v0.4.0``, configure routing preferences at the top level of the config. Each preference declares an ordered ``models`` candidate pool; the first entry is primary and the rest are fallbacks the client tries on ``429``/``5xx`` errors. Multiple providers can serve the same route — just list them all under ``models``. See :doc:`/guides/llm_router` for the full routing model. .. 
code-block:: yaml - llm_providers: + version: v0.4.0 + + model_providers: - model: openai/gpt-5.2 access_key: $OPENAI_API_KEY - routing_preferences: - - name: complex_reasoning - description: deep analysis, mathematical problem solving, and logical reasoning - - name: code_review - description: reviewing and analyzing existing code for bugs and improvements - model: anthropic/claude-sonnet-4-5 access_key: $ANTHROPIC_API_KEY - routing_preferences: - - name: creative_writing - description: creative content generation, storytelling, and writing assistance + + routing_preferences: + - name: complex_reasoning + description: deep analysis, mathematical problem solving, and logical reasoning + models: + - openai/gpt-5.2 + - anthropic/claude-sonnet-4-5 + - name: code_review + description: reviewing and analyzing existing code for bugs and improvements + models: + - openai/gpt-5.2 + - name: creative_writing + description: creative content generation, storytelling, and writing assistance + models: + - anthropic/claude-sonnet-4-5 + +.. note:: + ``v0.3.0`` configs that declare ``routing_preferences`` inline under each ``model_provider`` are auto-migrated to this top-level shape by the Plano CLI at compile time, with a deprecation warning. Update to the form above to silence the warning and gain the multi-model fallback behavior. .. _passthrough_auth: diff --git a/docs/source/concepts/signals.rst b/docs/source/concepts/signals.rst index ec1750e1..d5e25e7e 100644 --- a/docs/source/concepts/signals.rst +++ b/docs/source/concepts/signals.rst @@ -4,333 +4,602 @@ Signals™ ======== -Agentic Signals are behavioral and executions quality indicators that act as early warning signs of agent performance—highlighting both brilliant successes and **severe failures**. These signals are computed directly from conversation traces without requiring manual labeling or domain expertise, making them practical for production observability at scale. +Agentic Signals are lightweight, model-free behavioral indicators computed +from live interaction trajectories and attached to your existing +OpenTelemetry traces. They are the instrumentation layer of a closed-loop +improvement flywheel for agents — turning raw production traffic into +prioritized data that can drive prompt, routing, and model updates without +running an LLM-as-judge on every session. -The Problem: Knowing What's "Good" -================================== +The framework implemented here follows the taxonomy and detector design in +*Signals: Trajectory Sampling and Triage for Agentic Interactions* +(`Chen et al., 2026 `_). All detectors +are computed without model calls; the entire pipeline attaches structured +attributes and span events to existing spans so your dashboards and alerts +work unmodified. -One of the hardest parts of building agents is measuring how well they perform in the real world. +Why Signals Matter: The Improvement Flywheel +============================================ -**Offline testing** relies on hand-picked examples and happy-path scenarios, missing the messy diversity of real usage. Developers manually prompt models, evaluate responses, and tune prompts by guesswork—a slow, incomplete feedback loop. +Agentic applications are increasingly deployed at scale, yet improving them +after deployment remains difficult. Production trajectories are long, +numerous, and non-deterministic, making exhaustive human review infeasible +and auxiliary LLM evaluation expensive. 
As a result, teams face a +bottleneck: they cannot score every response, inspect every trace, or +reliably identify which failures and successes should inform the next model +update. Without a low-cost triage layer, the feedback loop from production +behavior to model improvement remains incomplete. -**Production debugging** floods developers with traces and logs but provides little guidance on which interactions actually matter. Finding failures means painstakingly reconstructing sessions and manually labeling quality issues. +Signals close this loop by cheaply identifying which interactions among +millions are worth inspecting: -You can't score every response with an LLM-as-judge (too expensive, too slow) or manually review every trace (doesn't scale). What you need are **behavioral signals**—fast, economical proxies that don’t label quality outright but dramatically shrink the search space, pointing to sessions most likely to be broken or brilliant. +1. **Instrument.** Live trajectories are scored with model-free signals + attached as structured attributes on existing OpenTelemetry spans, + organized under a fixed taxonomy of interaction, execution, and + environment signals. This requires no additional model calls, + infrastructure, or changes to online agent behavior. +2. **Sample & triage.** Signal attributes act as filters: they surface + severe failures, retrieve representative exemplars, and exclude the + uninformative middle. In our experiments, signal-based sampling + achieves 82% informativeness on :math:`\tau`-bench, compared with 54% + for random sampling, yielding a 1.52× efficiency gain per informative + trajectory. +3. **Data Construction.** The triaged subset becomes targeted input for + constructing preference datasets or supervised fine-tuning datasets + from production trajectories. +4. **Model Optimization.** The resulting preference or supervised + fine-tuning data is used to update the model through methods such as + DPO, RLHF, or supervised fine-tuning, so optimization is driven by + targeted production behavior rather than undifferentiated trace noise. +5. **Deploy.** The improved model is deployed and immediately + re-instrumented with the same signals, enabling teams to measure + whether the change improved production behavior and to feed the next + iteration. + +This loop depends on the first step being nearly free. The framework is +therefore designed around fixed-taxonomy, model-free detectors with +:math:`O(\text{messages})` cost, no online behavior change, and no +dependence on expensive evaluator models. By making production traces +searchable and sampleable at scale, signals turn raw agent telemetry into a +practical model-optimization flywheel. What Are Behavioral Signals? ============================ -Behavioral signals are canaries in the coal mine—early, objective indicators that something may have gone wrong (or gone exceptionally well). They don’t explain *why* an agent failed, but they reliably signal *where* attention is needed. +Behavioral signals are canaries in the coal mine — early, objective +indicators that something may have gone wrong (or gone exceptionally well). +They don't explain *why* an agent failed, but they reliably signal *where* +attention is needed. These signals emerge naturally from the rhythm of interaction: -- A user rephrasing the same request +- A user rephrasing or correcting the same request - Sharp increases in conversation length -- Frustrated follow-up messages (ALL CAPS, "this doesn’t work", excessive !!!/???) 
-- Agent repetition / looping -- Expressions of gratitude or satisfaction -- Requests to speak to a human / contact support +- Negative stance markers ("this doesn't work", ALL CAPS, excessive !!! or ???) +- Agent repetition or tool-call loops +- Expressions of gratitude, confirmation, or task success +- Requests for a human agent or explicit quit intent +- Tool errors, timeouts, rate limits, and context-window exhaustion -Individually, these clues are shallow; together, they form a fingerprint of agent performance. Embedded directly into traces, they make it easy to spot friction as it happens: where users struggle, where agents loop, and where escalations occur. +Individually, these clues are shallow; together, they form a fingerprint of +agent performance. Embedded directly into traces, they make it easy to spot +friction as it happens: where users struggle, where agents loop, where tool +failures cluster, and where escalations occur. -Signals vs Response Quality -=========================== +Signal Taxonomy +=============== -Behavioral signals and response quality are complementary. +Signals are organized into three top-level **layers**, each with its own +intent. Every detected signal belongs to exactly one leaf type under one of +seven categories. The per-category summaries and leaf-type descriptions +below are borrowed verbatim from the reference implementation at +`katanemo/signals `_ to keep the +documentation and the detector contract in sync. -**Response Quality** - Domain-specific correctness: did the agent do the right thing given business rules, user intent, and operational context? This often requires subject-matter experts or outcome instrumentation and is time-intensive but irreplaceable. +Interaction — user ↔ agent conversational quality +------------------------------------------------- -**Behavioral Signals** - Observable patterns that correlate with quality: high repair frequency, excessive turns, frustration markers, repetition, escalation, and positive feedback. Fast to compute and valuable for prioritizing which traces deserve inspection. +**Misalignment** — Misalignment signals capture semantic or intent mismatch +between the user and the agent, such as rephrasing, corrections, +clarifications, and restated constraints. These signals do not assert that +either party is "wrong"; they only indicate that shared understanding has +not yet been established. -Used together, signals tell you *where to look*, and quality evaluation tells you *what went wrong (or right)*. +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Leaf signal type + - Description + * - ``misalignment.correction`` + - Explicit corrections, negations, mistake acknowledgments. + * - ``misalignment.rephrase`` + - Rephrasing indicators, alternative explanations. + * - ``misalignment.clarification`` + - Confusion expressions, requests for clarification. + +**Stagnation** — Stagnation signals capture cases where the discourse +continues but fails to make visible progress. This includes near-duplicate +assistant responses, circular explanations, repeated scaffolding, and other +forms of linguistic degeneration. + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Leaf signal type + - Description + * - ``stagnation.dragging`` + - Excessive turn count, conversation not progressing efficiently. + * - ``stagnation.repetition`` + - Near-duplicate or repetitive assistant responses. + +**Disengagement** — Disengagement signals mark the withdrawal of +cooperative intent from the interaction. 
These include explicit requests to +exit the agent flow (e.g., "talk to a human"), strong negative stances, and +abandonment markers. + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Leaf signal type + - Description + * - ``disengagement.escalation`` + - Requests for human agent or support. + * - ``disengagement.quit`` + - Notification to quit or leave. + * - ``disengagement.negative_stance`` + - Complaints, frustration, negative sentiment. + +**Satisfaction** — Satisfaction signals indicate explicit stabilization and +completion of the interaction. These include expressions of gratitude, +success confirmations, and closing utterances. We use these signals to +sample exemplar traces rather than to assign quality scores. + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Leaf signal type + - Description + * - ``satisfaction.gratitude`` + - Expressions of thanks and appreciation. + * - ``satisfaction.confirmation`` + - Explicit satisfaction expressions. + * - ``satisfaction.success`` + - Confirmation of task completion or understanding. + +Execution — agent-caused action quality +--------------------------------------- + +**Failure** — Detects agent-caused failures in tool/function usage. These +are issues the agent is responsible for (as opposed to environment failures +which are external system issues). Requires tool-call traces +(``function_call`` / ``observation``) to fire. + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Leaf signal type + - Description + * - ``execution.failure.invalid_args`` + - Wrong type, missing required field. + * - ``execution.failure.bad_query`` + - Empty results due to overly narrow/wrong query. + * - ``execution.failure.tool_not_found`` + - Agent called non-existent tool. + * - ``execution.failure.auth_misuse`` + - Agent didn't pass credentials correctly. + * - ``execution.failure.state_error`` + - Tool called in wrong state/order. + +**Loops** — Detects behavioral patterns where the agent gets stuck +repeating tool calls. These are distinct from +``interaction.stagnation`` (conversation text repetition) and +``execution.failure`` (single tool errors) — these detect tool-level +behavioral loops. + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Leaf signal type + - Description + * - ``execution.loops.retry`` + - Same tool with identical args ≥3 times. + * - ``execution.loops.parameter_drift`` + - Same tool with varied args ≥3 times. + * - ``execution.loops.oscillation`` + - Multi-tool A→B→A→B pattern ≥3 cycles. + +Environment — external system / boundary conditions +--------------------------------------------------- + +**Exhaustion** — Detects failures and constraints arising from the +surrounding system rather than the agent's internal policy or reasoning. +These are external issues the agent cannot control. + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Leaf signal type + - Description + * - ``environment.exhaustion.api_error`` + - 5xx errors, service unavailable. + * - ``environment.exhaustion.timeout`` + - Connection/read timeouts. + * - ``environment.exhaustion.rate_limit`` + - 429, quota exceeded. + * - ``environment.exhaustion.network`` + - Connection refused, DNS errors. + * - ``environment.exhaustion.malformed_response`` + - Invalid JSON, unexpected schema. + * - ``environment.exhaustion.context_overflow`` + - Token/context limit exceeded. 
How It Works ============ -Signals are computed automatically by the gateway and emitted as **OpenTelemetry trace attributes** to your existing observability stack (Jaeger, Honeycomb, Grafana Tempo, etc.). No additional libraries or instrumentation required—just configure your OTEL collector endpoint. +Signals are computed automatically by the gateway after each assistant +response and emitted as **OpenTelemetry trace attributes** and **span events** +on your existing spans. No additional libraries or instrumentation are +required — just configure your OTEL collector endpoint as usual. -Each conversation trace is enriched with signal attributes that you can query, filter, and visualize in your observability platform. The gateway analyzes message content (performing text normalization, Unicode handling, and pattern matching) to compute behavioral signals in real-time. +Each conversation trace is enriched with layered signal attributes +(category-level counts and severities) plus one span event per detected +signal instance (with confidence, snippet, and per-detector metadata). -**OTEL Trace Attributes** +.. note:: + Signal analysis is enabled by default and runs on the request path. It + does **not** affect the response sent to the client. Set + ``overrides.disable_signals: true`` in your Plano config to skip this + CPU-heavy analysis (see the configuration reference). -Signal data is exported as structured span attributes: +OTel Span Attributes +==================== -- ``signals.quality`` - Overall assessment (Excellent/Good/Neutral/Poor/Severe) -- ``signals.turn_count`` - Total number of turns in the conversation -- ``signals.efficiency_score`` - Efficiency metric (0.0-1.0) -- ``signals.repair.count`` - Number of repair attempts detected (when present) -- ``signals.repair.ratio`` - Ratio of repairs to user turns (when present) -- ``signals.frustration.count`` - Number of frustration indicators detected -- ``signals.frustration.severity`` - Frustration level (0-3) -- ``signals.repetition.count`` - Number of repetition instances detected -- ``signals.escalation.requested`` - Boolean escalation flag ("true" when present) -- ``signals.positive_feedback.count`` - Number of positive feedback indicators +Signal data is exported as structured OTel attributes. There are two tiers: +**top-level** attributes (always emitted on spans that carry signal +analysis) and **layered** attributes (emitted only when the corresponding +category has at least one signal instance). -**Visual Flag Marker** +Top-level attributes +-------------------- -When concerning signals are detected (frustration, looping, escalation, or poor/severe quality), the flag marker **🚩** is automatically appended to the span's operation name, making problematic traces easy to spot in your trace visualizations. +Always emitted once signals are computed. -**Querying in Your Observability Platform** +.. list-table:: + :header-rows: 1 + :widths: 40 15 45 -Example queries: + * - Attribute + - Type + - Value + * - ``signals.quality`` + - string + - One of ``excellent``, ``good``, ``neutral``, ``poor``, ``severe``. + * - ``signals.quality_score`` + - float + - Numeric score 0.0 – 100.0 that feeds the quality bucket. + * - ``signals.turn_count`` + - int + - Total number of user + assistant turns in the interaction. + * - ``signals.efficiency_score`` + - float + - Efficiency metric 0.0 – 1.0 (stays at 1.0 up to baseline turns, + then decays: ``1 / (1 + 0.3 * (turns - baseline))``). 
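+
+For example, with a baseline of 5 turns, a 12-turn session scores
+``1 / (1 + 0.3 * (12 - 5)) ≈ 0.32``, while anything at or under the baseline
+stays at 1.0.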
-- Find all severe interactions: ``signals.quality = "Severe"`` -- Find flagged traces: search for **🚩** in span names -- Find long conversations: ``signals.turn_count > 10`` -- Find inefficient interactions: ``signals.efficiency_score < 0.5`` -- Find high repair rates: ``signals.repair.ratio > 0.3`` -- Find frustrated users: ``signals.frustration.severity >= 2`` -- Find looping agents: ``signals.repetition.count >= 3`` -- Find positive interactions: ``signals.positive_feedback.count >= 2`` -- Find escalations: ``signals.escalation.requested = "true"`` +Layered attributes +------------------ + +Emitted per category, only when ``count > 0``. One ``.count`` and one +``.severity`` attribute per category. Severity is a 0–3 bucket (see +`Severity levels`_ below). + +.. list-table:: + :header-rows: 1 + :widths: 50 50 + + * - Attribute (emitted when fired) + - Source + * - ``signals.interaction.misalignment.count`` + - Any ``misalignment.*`` leaf type + * - ``signals.interaction.misalignment.severity`` + - " + * - ``signals.interaction.stagnation.count`` + - Any ``stagnation.*`` leaf type + * - ``signals.interaction.stagnation.severity`` + - " + * - ``signals.interaction.disengagement.count`` + - Any ``disengagement.*`` leaf type + * - ``signals.interaction.disengagement.severity`` + - " + * - ``signals.interaction.satisfaction.count`` + - Any ``satisfaction.*`` leaf type + * - ``signals.interaction.satisfaction.severity`` + - " + * - ``signals.execution.failure.count`` + - Any ``failure.*`` leaf type + * - ``signals.execution.failure.severity`` + - " + * - ``signals.execution.loops.count`` + - Any ``loops.*`` leaf type + * - ``signals.execution.loops.severity`` + - " + * - ``signals.environment.exhaustion.count`` + - Any ``exhaustion.*`` leaf type + * - ``signals.environment.exhaustion.severity`` + - " + +Legacy attributes (deprecated, still emitted) +--------------------------------------------- + +The following aggregate keys pre-date the paper taxonomy and are still +emitted for one release so existing dashboards keep working. They are +derived from the layered counts above and will be removed in a future +release. Migrate to the layered keys when convenient. + +.. list-table:: + :header-rows: 1 + :widths: 50 50 + + * - Legacy attribute + - Layered equivalent + * - ``signals.follow_up.repair.count`` + - ``signals.interaction.misalignment.count`` + * - ``signals.follow_up.repair.ratio`` + - (computed: ``misalignment.count / max(user_turns, 1)``) + * - ``signals.frustration.count`` + - Count of ``disengagement.negative_stance`` instances + * - ``signals.frustration.severity`` + - Derived severity bucket of the above + * - ``signals.repetition.count`` + - ``signals.interaction.stagnation.count`` + * - ``signals.escalation.requested`` + - True if any ``disengagement.escalation`` or ``disengagement.quit`` fired + * - ``signals.positive_feedback.count`` + - ``signals.interaction.satisfaction.count`` + +Span Events +=========== + +In addition to span attributes, every detected signal instance is emitted as +a span event named ``signal.`` (e.g. +``signal.interaction.satisfaction.gratitude``). Each event carries: + +.. list-table:: + :header-rows: 1 + :widths: 30 15 55 + + * - Event attribute + - Type + - Description + * - ``signal.type`` + - string + - Full dotted signal type (same as the event name suffix). + * - ``signal.message_index`` + - int + - Zero-based index of the message that triggered the signal. + * - ``signal.confidence`` + - float + - Detector confidence in [0.0, 1.0]. 
+ * - ``signal.snippet`` + - string + - Matched substring from the source message (when available). + * - ``signal.metadata`` + - string (JSON) + - Per-detector metadata (pattern name, ratio values, etc.). + +Span events are the right surface for drill-down: attribute filters narrow +traces, then events tell you *which messages* fired *which signals* with +*what evidence*. + +Visual Flag Marker +------------------ + +When concerning signals are detected (disengagement present, stagnation +count > 2, any execution failure / loop, or overall quality ``poor``/ +``severe``), the marker 🚩 (U+1F6A9) is appended to the span's operation +name. +This makes flagged sessions immediately visible in trace UIs without +requiring attribute filtering. + +Querying in Your Observability Platform +--------------------------------------- + +Example queries against the layered keys:: + + signals.quality = "severe" + signals.turn_count > 10 + signals.efficiency_score < 0.5 + signals.interaction.disengagement.severity >= 2 + signals.interaction.misalignment.count > 3 + signals.interaction.satisfaction.count > 0 AND signals.quality = "good" + signals.execution.failure.count > 0 + signals.environment.exhaustion.count > 0 + +For flagged sessions, search for 🚩 in span names. .. image:: /_static/img/signals_trace.png :width: 100% :align: center +Severity Levels +=============== -Core Signal Types -================= - -The signals system tracks six categories of behavioral indicators. - -Turn Count & Efficiency ------------------------ - -**What it measures** - Number of user–assistant exchanges. - -**Why it matters** - Long conversations often indicate unclear intent resolution, confusion, or inefficiency. Very short conversations can correlate with crisp resolution. - -**Key metrics** - -- Total turn count -- Warning thresholds (concerning: >7 turns, excessive: >12 turns) -- Efficiency score (0.0–1.0) - -**Efficiency scoring** - Baseline expectation is ~5 turns (tunable). Efficiency stays at 1.0 up to the baseline, then declines with an inverse penalty as turns exceed baseline:: - - efficiency = 1 / (1 + 0.3 * (turns - baseline)) - -Follow-Up & Repair Frequency ----------------------------- - -**What it measures** - How often users clarify, correct, or rephrase requests. This is a **user signal** tracking query reformulation behavior—when users must repair or rephrase their requests because the agent didn't understand or respond appropriately. - -**Why it matters** - High repair frequency is a proxy for misunderstanding or intent drift. When users repeatedly rephrase the same request, it indicates the agent is failing to grasp or act on the user's intent. - -**Key metrics** - -- Repair count and ratio (repairs / user turns) -- Concerning threshold: >30% repair ratio -- Detected repair phrases (exact or fuzzy) - -**Common patterns detected** - -- Explicit corrections: "I meant", "correction" -- Negations: "No, I...", "that's not" -- Rephrasing: "let me rephrase", "to clarify" -- Mistake acknowledgment: "my mistake", "I was wrong" -- "Similar rephrase" heuristic based on token overlap (with stopwords downweighted) - -User Frustration ----------------- - -**What it measures** - Observable frustration indicators and emotional escalation. - -**Why it matters** - Catching frustration early enables intervention before users abandon or escalate. 
- -**Detection patterns** - -- **Complaints**: "this doesn't work", "not helpful", "waste of time" -- **Confusion**: "I don't understand", "makes no sense", "I'm confused" -- **Tone markers**: - - - ALL CAPS (>=10 alphabetic chars and >=80% uppercase) - - Excessive punctuation (>=3 exclamation marks or >=3 question marks) - -- **Profanity**: token-based (avoids substring false positives like "absolute" -> "bs") - -**Severity levels** - -- **None (0)**: no indicators -- **Mild (1)**: 1–2 indicators -- **Moderate (2)**: 3–4 indicators -- **Severe (3)**: 5+ indicators - -Repetition & Looping --------------------- - -**What it measures** - Assistant repetition / degenerative loops. This is an **assistant signal** tracking when the agent repeats itself, fails to follow instructions, or gets stuck in loops—indicating the agent is not making progress or adapting its responses. - -**Why it matters** - Often indicates missing state tracking, broken tool integration, prompt issues, or the agent ignoring user corrections. High repetition means the agent is not learning from the conversation context. - -**Detection method** - -- Compare assistant messages using **bigram Jaccard similarity** -- Classify: - - - **Exact**: similarity >= 0.85 - - **Near-duplicate**: similarity >= 0.50 - -- Looping is flagged when repetition instances exceed 2 in a session. - -**Severity levels** +Every category aggregates its leaf signal counts into a severity bucket used +by both the layered ``.severity`` attribute and the overall quality score. - **None (0)**: 0 instances - **Mild (1)**: 1–2 instances - **Moderate (2)**: 3–4 instances - **Severe (3)**: 5+ instances -Positive Feedback ------------------ - -**What it measures** - User expressions of satisfaction, gratitude, and success. - -**Why it matters** - Strong positive signals identify exemplar traces for prompt engineering and evaluation. - -**Detection patterns** - -- Gratitude: "thank you", "appreciate it" -- Satisfaction: "that's great", "awesome", "love it" -- Success confirmation: "got it", "that worked", "perfect" - -**Confidence scoring** - -- 1 indicator: 0.6 -- 2 indicators: 0.8 -- 3+ indicators: 0.95 - -Escalation Requests -------------------- - -**What it measures** - Requests for human help/support or threats to quit. - -**Why it matters** - Escalation is a strong signal that the agent failed to resolve the interaction. - -**Detection patterns** - -- Human requests: "speak to a human", "real person", "live agent" -- Support: "contact support", "customer service", "help desk" -- Quit threats: "I'm done", "forget it", "I give up" +Severity is always computed per-category. For example, three instances of +``misalignment.rephrase`` plus two of ``misalignment.correction`` yield +``signals.interaction.misalignment.severity = 3`` (5 instances total). Overall Quality Assessment ========================== -Signals are aggregated into an overall interaction quality on a 5-point scale. +Signals are aggregated into an overall interaction quality on a 5-point +scale. The scoring model starts at 50.0 (neutral), adds positive weight for +satisfaction, and subtracts weight for disengagement, misalignment (when +ratio > 30% of user turns), stagnation (when count > 2), execution failures, +execution loops, and environment exhaustion. -**Excellent** +The resulting numeric score maps to the bucket emitted in ``signals.quality``: + +**Excellent (75 – 100)** Strong positive signals, efficient resolution, low friction. 
-**Good** - Mostly positive with minor clarifications; some back-and-forth but successful. +**Good (60 – 74)** + Mostly positive with minor clarifications; some back-and-forth but + successful. -**Neutral** +**Neutral (40 – 59)** Mixed signals; neither clearly good nor bad. -**Poor** - Concerning negative patterns (high friction, multiple repairs, moderate frustration). High abandonment risk. +**Poor (25 – 39)** + Concerning negative patterns (high friction, multiple misalignments, + moderate disengagement, tool failures). High abandonment risk. -**Severe** - Critical issues—escalation requested, severe frustration, severe looping, or excessive turns (>12). Requires immediate attention. +**Severe (0 – 24)** + Critical issues — escalation requested, severe disengagement, severe + stagnation, or compounding failures. Requires immediate attention. -This assessment uses a scoring model that weighs positive factors (efficiency, positive feedback) against negative ones (frustration, repairs, repetition, escalation). +The raw numeric score is available under ``signals.quality_score``. Sampling and Prioritization =========================== -In production, trace data is overwhelming. Signals provide a lightweight first layer of analysis to prioritize which sessions deserve review. +In production, trace data is overwhelming. Signals provide a lightweight +first layer of triage to select the small fraction of trajectories that are +most likely to be informative. Per the paper, signal-based sampling reaches +82% informativeness on τ-bench versus 54% for random sampling — a 1.52× +efficiency gain per informative trajectory. Workflow: 1. Gateway captures conversation messages and computes signals -2. Signal attributes are emitted to OTEL spans automatically +2. Signal attributes and per-instance events are emitted to OTEL spans 3. Your observability platform ingests and indexes the attributes -4. Query/filter by signal attributes to surface outliers (poor/severe and exemplars) +4. Query / filter by signal attributes to surface outliers and exemplars 5. Review high-information traces to identify improvement opportunities 6. Update prompts, routing, or policies based on findings 7. Redeploy and monitor signal metrics to validate improvements -This creates a reinforcement loop where traces become both diagnostic data and training signal. +This creates a reinforcement loop where traces become both diagnostic data +and training signal for prompt engineering, routing policies, and +preference-data construction. -Trace Filtering and Telemetry -============================= +.. note:: + An in-gateway triage sampler that selects informative trajectories + inline — with configurable per-category weights and budgets — is planned + as a follow-up to this release. Today, sampling is consumer-side: your + observability platform filters on the signal attributes described above. -Signal attributes are automatically added to OpenTelemetry spans, making them immediately queryable in your observability platform. +Example Span +============ -**Visual Filtering** +A concerning session, showing both layered attributes and a per-instance +event:: -When concerning signals are detected, the flag marker **🚩** (U+1F6A9) is automatically appended to the span's operation name. This makes flagged sessions immediately visible in trace visualizations without requiring attribute filtering. 
+ # Span name: "POST /v1/chat/completions gpt-5.2 🚩" -**Example Span Attributes**:: + # Top-level + signals.quality = "severe" + signals.quality_score = 0.0 + signals.turn_count = 4 + signals.efficiency_score = 1.0 - # Span name: "POST /v1/chat/completions gpt-4 🚩" - signals.quality = "Severe" - signals.turn_count = 15 - signals.efficiency_score = 0.234 - signals.repair.count = 4 - signals.repair.ratio = 0.571 - signals.frustration.severity = 3 - signals.frustration.count = 5 - signals.escalation.requested = "true" - signals.repetition.count = 4 + # Layered (only non-zero categories are emitted) + signals.interaction.disengagement.count = 6 + signals.interaction.disengagement.severity = 3 -**Building Dashboards** + # Legacy (deprecated, emitted while dual-emit is on) + signals.frustration.count = 4 + signals.frustration.severity = 2 + signals.escalation.requested = true -Use signal attributes to build monitoring dashboards in Grafana, Honeycomb, Datadog, etc.: + # Per-instance span events + event: signal.interaction.disengagement.escalation + signal.type = "interaction.disengagement.escalation" + signal.message_index = 6 + signal.confidence = 1.0 + signal.snippet = "get me a human" + signal.metadata = {"pattern_type":"escalation"} + +Building Dashboards +=================== + +Use signal attributes to build monitoring dashboards in Grafana, Honeycomb, +Datadog, etc. Prefer the layered keys — they align with the paper taxonomy +and will outlive the legacy keys. - **Quality distribution**: Count of traces by ``signals.quality`` - **P95 turn count**: 95th percentile of ``signals.turn_count`` - **Average efficiency**: Mean of ``signals.efficiency_score`` -- **High repair rate**: Percentage where ``signals.repair.ratio > 0.3`` -- **Frustration rate**: Percentage where ``signals.frustration.severity >= 2`` -- **Escalation rate**: Percentage where ``signals.escalation.requested = "true"`` -- **Looping rate**: Percentage where ``signals.repetition.count >= 3`` -- **Positive feedback rate**: Percentage where ``signals.positive_feedback.count >= 1`` +- **High misalignment rate**: Percentage where + ``signals.interaction.misalignment.count > 3`` +- **Disengagement rate**: Percentage where + ``signals.interaction.disengagement.severity >= 2`` +- **Satisfaction rate**: Percentage where + ``signals.interaction.satisfaction.count >= 1`` +- **Escalation rate**: Percentage where a ``disengagement.escalation`` or + ``disengagement.quit`` event fired (via span-event filter) +- **Tool-failure rate**: Percentage where + ``signals.execution.failure.count > 0`` +- **Environment issue rate**: Percentage where + ``signals.environment.exhaustion.count > 0`` -**Creating Alerts** +Creating Alerts +=============== Set up alerts based on signal thresholds: -- Alert when severe interaction count exceeds threshold in 1-hour window -- Alert on sudden spike in frustration rate (>2x baseline) -- Alert when escalation rate exceeds 5% of total conversations -- Alert on degraded efficiency (P95 turn count increases >50%) +- Alert when ``signals.quality = "severe"`` count exceeds threshold in a + 1-hour window +- Alert on sudden spike in + ``signals.interaction.disengagement.severity >= 2`` (>2× baseline) +- Alert on sustained ``signals.execution.failure.count > 0`` — agent-caused + tool issues +- Alert on spikes in ``signals.environment.exhaustion.count`` — external + system degradation +- Alert on degraded efficiency (P95 ``signals.turn_count`` up > 50%) Best Practices ============== Start simple: -- Alert or page on 
**Severe** sessions (or on spikes in Severe rate) -- Review **Poor** sessions within 24 hours -- Sample **Excellent** sessions as exemplars +- Alert or page on ``severe`` sessions (or on spikes in ``severe`` rate) +- Review ``poor`` sessions within 24 hours +- Sample ``excellent`` sessions as exemplars Combine multiple signals to infer failure modes: -- Looping: repetition severity >= 2 + excessive turns -- User giving up: frustration severity >= 2 + escalation requested -- Misunderstood intent: repair ratio > 30% + excessive turns -- Working well: positive feedback + high efficiency + no frustration +- **Silent loop**: ``signals.interaction.stagnation.severity >= 2`` + + ``signals.turn_count`` above baseline +- **User giving up**: ``signals.interaction.disengagement.severity >= 2`` + + any escalation event +- **Misunderstood intent**: + ``signals.interaction.misalignment.count / user_turns > 0.3`` +- **Agent-caused friction**: ``signals.execution.failure.count > 0`` + + ``signals.interaction.misalignment.count > 0`` +- **External degradation, not agent fault**: + ``signals.environment.exhaustion.count > 0`` while + ``signals.execution.failure.count = 0`` +- **Working well**: ``signals.interaction.satisfaction.count >= 1`` + + ``signals.efficiency_score > 0.8`` + no disengagement Limitations and Considerations ============================== -Signals don’t capture: +Signals don't capture: - Task completion / real outcomes - Factual or domain correctness @@ -339,21 +608,31 @@ Signals don’t capture: Mitigation strategies: -- Periodically sample flagged sessions and measure false positives/negatives +- Periodically sample flagged sessions and measure false positives / negatives - Tune baselines per use case and user population - Add domain-specific phrase libraries where needed - Combine signals with non-text metrics (tool failures, disconnects, latency) .. note:: - Behavioral signals complement—but do not replace—domain-specific response quality evaluation. Use signals to prioritize which traces to inspect, then apply domain expertise and outcome checks to diagnose root causes. + Behavioral signals complement — but do not replace — domain-specific + response quality evaluation. Use signals to prioritize which traces to + inspect, then apply domain expertise and outcome checks to diagnose root + causes. .. tip:: - The flag marker in the span name provides instant visual feedback in trace UIs, while the structured attributes (``signals.quality``, ``signals.frustration.severity``, etc.) enable powerful querying and aggregation in your observability platform. + The 🚩 marker in the span name provides instant visual feedback in + trace UIs, while the structured attributes (``signals.quality``, + ``signals.interaction.disengagement.severity``, etc.) and per-instance + span events enable powerful querying and drill-down in your observability + platform. 
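+
+Example: Consumer-Side Triage
+=============================
+
+A minimal sketch of step 4 of the sampling workflow above: bucket exported
+spans by their layered signal attributes. ``fetch_spans`` is a hypothetical
+stand-in for whatever span-query API your observability platform provides;
+only the attribute names are part of the signals contract.
+
+.. code-block:: python
+
+   def triage(spans):
+       """Bucket spans for review using layered signal attributes."""
+       severe, exemplars = [], []
+       for span in spans:
+           attrs = span["attributes"]
+           if attrs.get("signals.quality") == "severe":
+               # Inspect first: disengagement, loops, tool failures.
+               severe.append(span)
+           elif (
+               attrs.get("signals.interaction.satisfaction.count", 0) >= 1
+               and attrs.get("signals.efficiency_score", 0.0) > 0.8
+           ):
+               # Candidate exemplars for preference-data construction.
+               exemplars.append(span)
+       return severe, exemplars
+
+   # fetch_spans(): hypothetical query helper for your tracing backend.
+   severe, exemplars = triage(fetch_spans(last_hours=24))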
See Also ======== -- :doc:`../guides/observability/tracing` - Distributed tracing for agent systems -- :doc:`../guides/observability/monitoring` - Metrics and dashboards -- :doc:`../guides/observability/access_logging` - Request/response logging -- :doc:`../guides/observability/observability` - Complete observability guide +- `Signals: Trajectory Sampling and Triage for Agentic Interactions + `_ — the paper this framework implements +- :doc:`../guides/observability/tracing` — Distributed tracing for agent + systems +- :doc:`../guides/observability/monitoring` — Metrics and dashboards +- :doc:`../guides/observability/access_logging` — Request / response logging +- :doc:`../guides/observability/observability` — Complete observability guide diff --git a/docs/source/conf.py b/docs/source/conf.py index 401f6cff..401b80f1 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -17,7 +17,7 @@ from sphinxawesome_theme.postprocess import Icons project = "Plano Docs" copyright = "2026, Katanemo Labs, a DigitalOcean Company" author = "Katanemo Labs, Inc" -release = " v0.4.19" +release = " v0.4.21" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration @@ -33,6 +33,7 @@ extensions = [ "sphinx.ext.autodoc", "sphinx.ext.intersphinx", "sphinx.ext.extlinks", + "sphinx.ext.mathjax", "sphinx.ext.viewcode", "sphinx_sitemap", "sphinx_design", @@ -41,6 +42,7 @@ extensions = [ "provider_models", ] + # Paths that contain templates, relative to this directory. templates_path = ["_templates"] diff --git a/docs/source/get_started/quickstart.rst b/docs/source/get_started/quickstart.rst index 6f1a86ac..509fe3c9 100644 --- a/docs/source/get_started/quickstart.rst +++ b/docs/source/get_started/quickstart.rst @@ -43,7 +43,7 @@ Plano's CLI allows you to manage and interact with the Plano efficiently. To ins .. code-block:: console - $ uv tool install planoai==0.4.19 + $ uv tool install planoai==0.4.21 **Option 2: Install with pip (Traditional)** @@ -51,7 +51,7 @@ Plano's CLI allows you to manage and interact with the Plano efficiently. To ins $ python -m venv venv $ source venv/bin/activate # On Windows, use: venv\Scripts\activate - $ pip install planoai==0.4.19 + $ pip install planoai==0.4.21 .. _llm_routing_quickstart: @@ -340,6 +340,67 @@ And to get the list of supported currencies: "Here is a list of the currencies that are supported for conversion from USD, along with their symbols:\n\n1. AUD - Australian Dollar\n2. BGN - Bulgarian Lev\n3. BRL - Brazilian Real\n4. CAD - Canadian Dollar\n5. CHF - Swiss Franc\n6. CNY - Chinese Renminbi Yuan\n7. CZK - Czech Koruna\n8. DKK - Danish Krone\n9. EUR - Euro\n10. GBP - British Pound\n11. HKD - Hong Kong Dollar\n12. HUF - Hungarian Forint\n13. IDR - Indonesian Rupiah\n14. ILS - Israeli New Sheqel\n15. INR - Indian Rupee\n16. ISK - Icelandic Króna\n17. JPY - Japanese Yen\n18. KRW - South Korean Won\n19. MXN - Mexican Peso\n20. MYR - Malaysian Ringgit\n21. NOK - Norwegian Krone\n22. NZD - New Zealand Dollar\n23. PHP - Philippine Peso\n24. PLN - Polish Złoty\n25. RON - Romanian Leu\n26. SEK - Swedish Krona\n27. SGD - Singapore Dollar\n28. THB - Thai Baht\n29. TRY - Turkish Lira\n30. USD - United States Dollar\n31. ZAR - South African Rand\n\nIf you want to convert USD to any of these currencies, you can select the one you are interested in." +Observability +------------- + +Plano ships two CLI tools for visibility into LLM traffic. 
Both consume the same OTLP/gRPC span stream from brightstaff; they just slice it differently — use whichever (or both) fits the question you're answering.
+
+===================== ============================================ =============================================================
+Command When to use Shows
+===================== ============================================ =============================================================
+``planoai obs`` Live view while you drive traffic Per-request rows + aggregates: tokens (prompt / completion / cached / cache-creation / reasoning), TTFT, latency, cost, session id, route name, totals by model
+``planoai trace`` Deep-dive into a single request after the fact Full span tree for a trace id: brightstaff → routing → upstream LLM, attributes on every span, status codes, errors
+===================== ============================================ =============================================================
+
+Both require brightstaff to be exporting spans. If you're running the zero-config path (``planoai up`` with no config file), tracing is auto-wired to ``http://localhost:4317``. If you have your own ``plano_config.yaml``, add:
+
+.. code-block:: yaml
+
+   tracing:
+     random_sampling: 100
+     opentracing_grpc_endpoint: http://localhost:4317
+
+Live console — ``planoai obs``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: console
+
+   $ planoai obs
+   # In another terminal:
+   $ planoai up
+
+Cost is populated automatically from DigitalOcean's public pricing catalog — no signup or token required.
+
+With no API keys set, every provider runs in pass-through mode — supply the ``Authorization`` header yourself on each request:
+
+.. code-block:: console
+
+   $ curl localhost:12000/v1/chat/completions \
+     -H "Content-Type: application/json" \
+     -H "Authorization: Bearer $DO_API_KEY" \
+     -d '{"model":"digitalocean/router:software-engineering",
+          "messages":[{"role":"user","content":"write code to print prime numbers in python"}],
+          "stream":false}'
+
+When you export ``OPENAI_API_KEY`` / ``ANTHROPIC_API_KEY`` / ``DO_API_KEY`` / etc. before ``planoai up``, Plano picks them up and clients no longer need to send ``Authorization``.
+
+Press ``Ctrl-C`` in the obs terminal to exit. Data lives in memory only — nothing is persisted to disk.
+
+Single-request traces — ``planoai trace``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When you need to understand what happened on one specific request (which model was picked, how long each hop took, what an upstream returned), use ``trace``:
+
+.. code-block:: console
+
+   $ planoai trace listen       # start the OTLP listener (daemon)
+   # drive some traffic through localhost:12000 ...
+   $ planoai trace              # show the most recent trace
+   $ planoai trace <trace-id>   # show a specific trace by id
+   $ planoai trace --list      # list the last 50 trace ids
+
+Use ``obs`` to spot that p95 latency spiked for ``openai-gpt-5.4``; switch to ``trace`` on one of those slow request ids to see which hop burned the time.
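+
+If you prefer a script to the ``curl`` example above, the same traffic can be
+driven from Python with any OpenAI-compatible client. This sketch assumes the
+``openai`` package is installed and mirrors the pass-through ``curl`` call
+(same endpoint, same model name, your key in ``DO_API_KEY``):
+
+.. code-block:: python
+
+   import os
+   from openai import OpenAI
+
+   # Point the standard OpenAI client at Plano's local listener.
+   client = OpenAI(
+       base_url="http://localhost:12000/v1",
+       api_key=os.environ["DO_API_KEY"],  # sent upstream as the Authorization header
+   )
+
+   resp = client.chat.completions.create(
+       model="digitalocean/router:software-engineering",
+       messages=[{"role": "user", "content": "write code to print prime numbers in python"}],
+   )
+   print(resp.choices[0].message.content)
+
+Each call appears as a new row in the ``planoai obs`` terminal and as a new
+trace id in ``planoai trace --list``.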
+ Next Steps ========== diff --git a/docs/source/guides/llm_router.rst b/docs/source/guides/llm_router.rst index 5539dddc..b66c01f2 100644 --- a/docs/source/guides/llm_router.rst +++ b/docs/source/guides/llm_router.rst @@ -147,38 +147,53 @@ Plano-Orchestrator analyzes each prompt to infer domain and action, then applies Configuration ^^^^^^^^^^^^^ -To configure preference-aligned dynamic routing, define routing preferences that map domains and actions to specific models: +To configure preference-aligned dynamic routing, declare a top-level ``routing_preferences`` list and attach an ordered ``models`` candidate pool to each route. Starting in ``v0.4.0``, ``routing_preferences`` lives at the root of the config (not inline under ``model_providers``), which lets multiple models serve the same route — the first entry in ``models`` is primary, the rest are fallbacks that the client tries on ``429``/``5xx`` errors. .. code-block:: yaml :caption: Preference-Aligned Dynamic Routing Configuration + version: v0.4.0 + listeners: - egress_traffic: + - name: egress_traffic + type: model address: 0.0.0.0 port: 12000 - message_format: openai timeout: 30s - llm_providers: + model_providers: - model: openai/gpt-5.2 access_key: $OPENAI_API_KEY default: true - model: openai/gpt-5 access_key: $OPENAI_API_KEY - routing_preferences: - - name: code understanding - description: understand and explain existing code snippets, functions, or libraries - - name: complex reasoning - description: deep analysis, mathematical problem solving, and logical reasoning - model: anthropic/claude-sonnet-4-5 access_key: $ANTHROPIC_API_KEY - routing_preferences: - - name: creative writing - description: creative content generation, storytelling, and writing assistance - - name: code generation - description: generating new code snippets, functions, or boilerplate based on user prompts + + routing_preferences: + - name: code understanding + description: understand and explain existing code snippets, functions, or libraries + models: + - openai/gpt-5 + - anthropic/claude-sonnet-4-5 + - name: complex reasoning + description: deep analysis, mathematical problem solving, and logical reasoning + models: + - openai/gpt-5 + - name: creative writing + description: creative content generation, storytelling, and writing assistance + models: + - anthropic/claude-sonnet-4-5 + - name: code generation + description: generating new code snippets, functions, or boilerplate based on user prompts + models: + - anthropic/claude-sonnet-4-5 + - openai/gpt-5 + +.. note:: + Configs still using the ``v0.3.0`` inline style (``routing_preferences`` nested under each ``model_provider``) are auto-migrated to this top-level shape by the Plano CLI at compile time, with a deprecation warning. Update your config to the form above to silence the warning. Client usage ^^^^^^^^^^^^ @@ -253,6 +268,8 @@ Using Ollama (recommended for local development) .. code-block:: yaml + version: v0.4.0 + overrides: llm_routing_model: plano/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M @@ -266,9 +283,12 @@ Using Ollama (recommended for local development) - model: anthropic/claude-sonnet-4-5 access_key: $ANTHROPIC_API_KEY - routing_preferences: - - name: creative writing - description: creative content generation, storytelling, and writing assistance + + routing_preferences: + - name: creative writing + description: creative content generation, storytelling, and writing assistance + models: + - anthropic/claude-sonnet-4-5 4. 
**Verify the model is running** @@ -322,6 +342,8 @@ vLLM provides higher throughput and GPU optimizations suitable for production de .. code-block:: yaml + version: v0.4.0 + overrides: llm_routing_model: plano/Plano-Orchestrator @@ -335,9 +357,12 @@ vLLM provides higher throughput and GPU optimizations suitable for production de - model: anthropic/claude-sonnet-4-5 access_key: $ANTHROPIC_API_KEY - routing_preferences: - - name: creative writing - description: creative content generation, storytelling, and writing assistance + + routing_preferences: + - name: creative writing + description: creative content generation, storytelling, and writing assistance + models: + - anthropic/claude-sonnet-4-5 5. **Verify the server is running** @@ -468,22 +493,30 @@ You can combine static model selection with dynamic routing preferences for maxi .. code-block:: yaml :caption: Hybrid Routing Configuration - llm_providers: + version: v0.4.0 + + model_providers: - model: openai/gpt-5.2 access_key: $OPENAI_API_KEY default: true - model: openai/gpt-5 access_key: $OPENAI_API_KEY - routing_preferences: - - name: complex_reasoning - description: deep analysis and complex problem solving - model: anthropic/claude-sonnet-4-5 access_key: $ANTHROPIC_API_KEY - routing_preferences: - - name: creative_tasks - description: creative writing and content generation + + routing_preferences: + - name: complex_reasoning + description: deep analysis and complex problem solving + models: + - openai/gpt-5 + - anthropic/claude-sonnet-4-5 + - name: creative_tasks + description: creative writing and content generation + models: + - anthropic/claude-sonnet-4-5 + - openai/gpt-5 model_aliases: # Model aliases - friendly names that map to actual provider names diff --git a/docs/source/guides/observability/monitoring.rst b/docs/source/guides/observability/monitoring.rst index 736e0a64..d28d25ca 100644 --- a/docs/source/guides/observability/monitoring.rst +++ b/docs/source/guides/observability/monitoring.rst @@ -75,3 +75,54 @@ are some sample configuration files for both, respectively. isDefault: true access: proxy editable: true + +Brightstaff metrics +~~~~~~~~~~~~~~~~~~~ + +In addition to Envoy's stats on ``:9901``, the brightstaff dataplane +process exposes its own Prometheus endpoint on ``0.0.0.0:9092`` (override +with ``METRICS_BIND_ADDRESS``). It publishes: + +* HTTP RED — ``brightstaff_http_requests_total``, + ``brightstaff_http_request_duration_seconds``, + ``brightstaff_http_in_flight_requests`` (labels: ``handler``, ``method``, + ``status_class``). +* LLM upstream — ``brightstaff_llm_upstream_requests_total``, + ``brightstaff_llm_upstream_duration_seconds``, + ``brightstaff_llm_time_to_first_token_seconds``, + ``brightstaff_llm_tokens_total`` (labels: ``provider``, ``model``, + ``error_class``, ``kind``). +* Routing — ``brightstaff_router_decisions_total``, + ``brightstaff_router_decision_duration_seconds``, + ``brightstaff_routing_service_requests_total``, + ``brightstaff_session_cache_events_total``. +* Process & build — ``process_resident_memory_bytes``, + ``process_cpu_seconds_total``, ``brightstaff_build_info``. + +A self-contained Prometheus + Grafana stack is shipped under +``config/grafana/``. With Plano already running on the host, bring it up +with one command: + +.. code-block:: bash + + cd config/grafana + docker compose up -d + open http://localhost:3000 # admin / admin (anonymous viewer also enabled) + +Grafana auto-loads the Prometheus datasource and the brightstaff +dashboard (look under the *Plano* folder). 
Prometheus scrapes the host's +``:9092`` and ``:9901`` via ``host.docker.internal``. + +Files: + +* ``config/grafana/docker-compose.yaml`` — one-command Prom + Grafana + stack with provisioning. +* ``config/grafana/prometheus_scrape.yaml`` — complete Prometheus config + with ``envoy`` and ``brightstaff`` scrape jobs (mounted by the + compose). +* ``config/grafana/brightstaff_dashboard.json`` — 19-panel dashboard + across HTTP RED, LLM upstream, Routing service, and Process & Envoy + link rows. Auto-provisioned by the compose; can also be imported by + hand via *Dashboards → New → Import*. +* ``config/grafana/provisioning/`` — Grafana provisioning files for the + datasource and dashboard provider. diff --git a/docs/source/guides/observability/tracing.rst b/docs/source/guides/observability/tracing.rst index 950befd2..b3660168 100644 --- a/docs/source/guides/observability/tracing.rst +++ b/docs/source/guides/observability/tracing.rst @@ -101,20 +101,20 @@ This creates a complete end-to-end trace showing the full request lifecycle thro Behavioral Signals in Traces ---------------------------- -Plano automatically enriches OpenTelemetry traces with :doc:`../../concepts/signals` — behavioral quality indicators computed from conversation patterns. These signals are attached as span attributes, providing immediate visibility into interaction quality. +Plano automatically enriches OpenTelemetry traces with :doc:`../../concepts/signals` — lightweight, model-free behavioral indicators organized into three layers (interaction, execution, environment) per `Chen et al., 2026 `_. Signals are attached as span attributes and per-instance span events, providing immediate visibility into interaction quality. **What Signals Provide** Signals act as early warning indicators embedded in your traces: -- **Quality Assessment**: Overall interaction quality (Excellent/Good/Neutral/Poor/Severe) -- **Efficiency Metrics**: Turn count, efficiency scores, repair frequency -- **User Sentiment**: Frustration indicators, positive feedback, escalation requests -- **Agent Behavior**: Repetition detection, looping patterns +- **Quality Assessment**: Overall interaction quality (``excellent`` / ``good`` / ``neutral`` / ``poor`` / ``severe``) and numeric score +- **Interaction layer**: misalignment, stagnation, disengagement, satisfaction +- **Execution layer**: tool failures and loop patterns (from ``function_call`` / ``observation`` traces) +- **Environment layer**: exhaustion (API errors, timeouts, rate limits, context overflow) **Visual Flag Markers** -When concerning signals are detected (frustration, looping, escalation, or poor/severe quality), Plano automatically appends a flag marker **🚩** to the span's operation name. This makes problematic traces immediately visible in your tracing UI without requiring additional queries. +When concerning signals are detected (disengagement, execution failures / loops, stagnation > 2, or ``poor`` / ``severe`` quality), Plano automatically appends a 🚩 marker to the span's operation name. This makes problematic traces immediately visible in your tracing UI without requiring additional queries. 
**Example Span with Signals**:: @@ -123,23 +123,37 @@ When concerning signals are detected (frustration, looping, escalation, or poor/ llm.model = "gpt-4" llm.usage.total_tokens = 225 - # Behavioral signal attributes: - signals.quality = "Severe" - signals.turn_count = 15 - signals.efficiency_score = 0.234 - signals.frustration.severity = 3 - signals.escalation.requested = "true" + # Top-level signal attributes: + signals.quality = "severe" + signals.quality_score = 0.0 + signals.turn_count = 15 + signals.efficiency_score = 0.234 + + # Layered attributes (only non-zero categories are emitted): + signals.interaction.misalignment.count = 4 + signals.interaction.misalignment.severity = 2 + signals.interaction.disengagement.count = 5 + signals.interaction.disengagement.severity = 3 + + # Per-instance span event: + event: signal.interaction.disengagement.escalation + signal.type = "interaction.disengagement.escalation" + signal.message_index = 14 + signal.confidence = 1.0 + signal.snippet = "get me a human" **Querying Signal Data** In your observability platform (Jaeger, Grafana Tempo, Datadog, etc.), filter traces by signal attributes: -- Find severe interactions: ``signals.quality = "Severe"`` -- Find frustrated users: ``signals.frustration.severity >= 2`` +- Find severe interactions: ``signals.quality = "severe"`` +- Find disengaged users: ``signals.interaction.disengagement.severity >= 2`` +- Find misaligned interactions: ``signals.interaction.misalignment.count > 3`` +- Find tool failures: ``signals.execution.failure.count > 0`` +- Find external issues: ``signals.environment.exhaustion.count > 0`` - Find inefficient flows: ``signals.efficiency_score < 0.5`` -- Find escalations: ``signals.escalation.requested = "true"`` -For complete details on all available signals, detection methods, and best practices, see the :doc:`../../concepts/signals` guide. +For complete details on all 20 leaf signal types, severity scheme, legacy attribute deprecation, and best practices, see the :doc:`../../concepts/signals` guide. 
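+
+If your backend lacks a query UI, the same filters are easy to apply offline.
+Here is a minimal sketch, assuming you have exported spans as a list of JSON
+objects whose ``attributes`` field is a flat dict — the file name and exact
+shape are hypothetical, so adapt them to your exporter's output:
+
+.. code-block:: python
+
+   import json
+
+   with open("spans.json") as f:  # hypothetical dump of exported spans
+       spans = json.load(f)
+
+   def attr(span, key, default=0):
+       return span.get("attributes", {}).get(key, default)
+
+   # Triage set: poor/severe quality, disengaged users, or tool failures.
+   flagged = [
+       s for s in spans
+       if attr(s, "signals.quality", "") in ("poor", "severe")
+       or attr(s, "signals.interaction.disengagement.severity") >= 2
+       or attr(s, "signals.execution.failure.count") > 0
+   ]
+
+   # Worst interactions first: lowest efficiency at the top.
+   flagged.sort(key=lambda s: attr(s, "signals.efficiency_score", 1.0))
+   for s in flagged[:20]:
+       print(s.get("name"), attr(s, "signals.quality", ""),
+             attr(s, "signals.efficiency_score", 1.0))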
Custom Span Attributes diff --git a/docs/source/resources/deployment.rst b/docs/source/resources/deployment.rst index 1aab49c9..18cb93ac 100644 --- a/docs/source/resources/deployment.rst +++ b/docs/source/resources/deployment.rst @@ -65,7 +65,7 @@ Create a ``docker-compose.yml`` file with the following configuration: # docker-compose.yml services: plano: - image: katanemo/plano:0.4.19 + image: katanemo/plano:0.4.21 container_name: plano ports: - "10000:10000" # ingress (client -> plano) @@ -153,7 +153,7 @@ Create a ``plano-deployment.yaml``: spec: containers: - name: plano - image: katanemo/plano:0.4.19 + image: katanemo/plano:0.4.21 ports: - containerPort: 12000 # LLM gateway (chat completions, model routing) name: llm-gateway diff --git a/docs/source/resources/includes/plano_config_full_reference.yaml b/docs/source/resources/includes/plano_config_full_reference.yaml index 1d544727..99eb4510 100644 --- a/docs/source/resources/includes/plano_config_full_reference.yaml +++ b/docs/source/resources/includes/plano_config_full_reference.yaml @@ -1,5 +1,5 @@ # Plano Gateway configuration version -version: v0.3.0 +version: v0.4.0 # External HTTP agents - API type is controlled by request path (/v1/responses, /v1/messages, /v1/chat/completions) agents: @@ -32,17 +32,8 @@ model_providers: - model: mistral/ministral-3b-latest access_key: $MISTRAL_API_KEY - # routing_preferences: tags a model with named capabilities so Plano's LLM router - # can select the best model for each request based on intent. Requires the - # Plano-Orchestrator model (or equivalent) to be configured in overrides.llm_routing_model. - # Each preference has a name (short label) and a description (used for intent matching). - model: groq/llama-3.3-70b-versatile access_key: $GROQ_API_KEY - routing_preferences: - - name: code generation - description: generating new code snippets, functions, or boilerplate based on user prompts or requirements - - name: code review - description: reviewing, analyzing, and suggesting improvements to existing code # passthrough_auth: forwards the client's Authorization header upstream instead of # using the configured access_key. Useful for LiteLLM or similar proxy setups. @@ -64,6 +55,29 @@ model_aliases: smart-llm: target: gpt-4o +# routing_preferences: top-level list that tags named task categories with an +# ordered pool of candidate models. Plano's LLM router matches incoming requests +# against these descriptions and returns an ordered list of models; the client +# uses models[0] as primary and retries with models[1], models[2]... on 429/5xx. +# Requires overrides.llm_routing_model to point at Plano-Orchestrator (or equivalent). +# Each model in `models` must be declared in model_providers above. +# selection_policy is optional: {prefer: cheapest|fastest|none} lets the router +# reorder candidates using live cost/latency data from model_metrics_sources. 
+routing_preferences: + - name: code generation + description: generating new code snippets, functions, or boilerplate based on user prompts or requirements + models: + - anthropic/claude-sonnet-4-0 + - openai/gpt-4o + - groq/llama-3.3-70b-versatile + - name: code review + description: reviewing, analyzing, and suggesting improvements to existing code + models: + - anthropic/claude-sonnet-4-0 + - groq/llama-3.3-70b-versatile + selection_policy: + prefer: cheapest + # HTTP listeners - entry points for agent routing, prompt targets, and direct LLM access listeners: # Agent listener for routing requests to multiple agents @@ -173,6 +187,9 @@ overrides: llm_routing_model: Plano-Orchestrator # Model used for agent orchestration (must be listed in model_providers) agent_orchestration_model: Plano-Orchestrator + # Disable agentic signal analysis (frustration, repetition, escalation, etc.) + # on LLM responses to save CPU. Default: false. + disable_signals: false # Model affinity — pin routing decisions for agentic loops routing: diff --git a/docs/source/resources/includes/plano_config_full_reference_rendered.yaml b/docs/source/resources/includes/plano_config_full_reference_rendered.yaml index 4992ce3b..e2ab9110 100644 --- a/docs/source/resources/includes/plano_config_full_reference_rendered.yaml +++ b/docs/source/resources/includes/plano_config_full_reference_rendered.yaml @@ -69,12 +69,6 @@ listeners: model: llama-3.3-70b-versatile name: groq/llama-3.3-70b-versatile provider_interface: groq - routing_preferences: - - description: generating new code snippets, functions, or boilerplate based on - user prompts or requirements - name: code generation - - description: reviewing, analyzing, and suggesting improvements to existing code - name: code review - base_url: https://litellm.example.com cluster_name: openai_litellm.example.com endpoint: litellm.example.com @@ -131,12 +125,6 @@ model_providers: model: llama-3.3-70b-versatile name: groq/llama-3.3-70b-versatile provider_interface: groq - routing_preferences: - - description: generating new code snippets, functions, or boilerplate based on - user prompts or requirements - name: code generation - - description: reviewing, analyzing, and suggesting improvements to existing code - name: code review - base_url: https://litellm.example.com cluster_name: openai_litellm.example.com endpoint: litellm.example.com @@ -170,6 +158,7 @@ model_providers: provider_interface: plano overrides: agent_orchestration_model: Plano-Orchestrator + disable_signals: false llm_routing_model: Plano-Orchestrator optimize_context_window: true prompt_target_intent_matching_threshold: 0.7 @@ -220,6 +209,21 @@ routing: type: memory session_max_entries: 10000 session_ttl_seconds: 600 +routing_preferences: +- description: generating new code snippets, functions, or boilerplate based on user + prompts or requirements + models: + - anthropic/claude-sonnet-4-0 + - openai/gpt-4o + - groq/llama-3.3-70b-versatile + name: code generation +- description: reviewing, analyzing, and suggesting improvements to existing code + models: + - anthropic/claude-sonnet-4-0 + - groq/llama-3.3-70b-versatile + name: code review + selection_policy: + prefer: cheapest state_storage: type: memory system_prompt: 'You are a helpful assistant. Always respond concisely and accurately. 
@@ -236,4 +240,4 @@ tracing: environment: production service.team: platform trace_arch_internal: false -version: v0.3.0 +version: v0.4.0 diff --git a/skills/AGENTS.md b/skills/AGENTS.md new file mode 100644 index 00000000..dab3144b --- /dev/null +++ b/skills/AGENTS.md @@ -0,0 +1,2176 @@ +# Plano Agent Skills + +> Best practices for building agents and agentic applications with Plano — the AI-native proxy and dataplane. Covers configuration, routing, agent orchestration, filter chains, observability, CLI operations, and deployment patterns. + +**Version:** 1.0.0 | **Organization:** Plano + +--- + +## Table of Contents + +- [Section 1: Configuration Fundamentals](#section-1) + - [1.1 Always Specify a Supported Config Version](#always-specify-a-supported-config-version) + - [1.2 Choose the Right Listener Type for Your Use Case](#choose-the-right-listener-type-for-your-use-case) + - [1.3 Register Model Providers with Correct Format Identifiers](#register-model-providers-with-correct-format-identifiers) + - [1.4 Use Environment Variable Substitution for All Secrets](#use-environment-variable-substitution-for-all-secrets) +- [Section 2: Routing & Model Selection](#section-2) + - [2.1 Always Set Exactly One Default Model Provider](#always-set-exactly-one-default-model-provider) + - [2.2 Use Model Aliases for Semantic, Stable Model References](#use-model-aliases-for-semantic-stable-model-references) + - [2.3 Use Passthrough Auth for Proxy and Multi-Tenant Setups](#use-passthrough-auth-for-proxy-and-multi-tenant-setups) + - [2.4 Write Task-Specific Routing Preference Descriptions](#write-task-specific-routing-preference-descriptions) +- [Section 3: Agent Orchestration](#section-3) + - [3.1 Register All Sub-Agents in Both `agents` and `listeners.agents`](#register-all-sub-agents-in-both-agents-and-listenersagents) + - [3.2 Write Capability-Focused Agent Descriptions for Accurate Routing](#write-capability-focused-agent-descriptions-for-accurate-routing) +- [Section 4: Filter Chains & Guardrails](#section-4) + - [4.1 Configure MCP Filters with Explicit Type and Transport](#configure-mcp-filters-with-explicit-type-and-transport) + - [4.2 Configure Prompt Guards with Actionable Rejection Messages](#configure-prompt-guards-with-actionable-rejection-messages) + - [4.3 Order Filter Chains with Guards First, Enrichment Last](#order-filter-chains-with-guards-first-enrichment-last) +- [Section 5: Observability & Debugging](#section-5) + - [5.1 Add Custom Span Attributes for Correlation and Filtering](#add-custom-span-attributes-for-correlation-and-filtering) + - [5.2 Enable Tracing with Appropriate Sampling for Your Environment](#enable-tracing-with-appropriate-sampling-for-your-environment) + - [5.3 Use `planoai trace` to Inspect Routing Decisions](#use-planoai-trace-to-inspect-routing-decisions) +- [Section 6: CLI Operations](#section-6) + - [6.1 Follow the `planoai up` Validation Workflow Before Debugging Runtime Issues](#follow-the-planoai-up-validation-workflow-before-debugging-runtime-issues) + - [6.2 Generate Prompt Targets from Python Functions with `planoai generate_prompt_targets`](#generate-prompt-targets-from-python-functions-with-planoai-generateprompttargets) + - [6.3 Use `planoai cli_agent` to Connect Claude Code Through Plano](#use-planoai-cliagent-to-connect-claude-code-through-plano) + - [6.4 Use `planoai init` Templates to Bootstrap New Projects Correctly](#use-planoai-init-templates-to-bootstrap-new-projects-correctly) +- [Section 7: Deployment & Security](#section-7) + - [7.1 Understand 
Plano's Docker Network Topology for Agent URL Configuration](#understand-planos-docker-network-topology-for-agent-url-configuration)
+  - [7.2 Use PostgreSQL State Storage for Multi-Turn Conversations in Production](#use-postgresql-state-storage-for-multi-turn-conversations-in-production)
+  - [7.3 Verify Listener Health Before Sending Requests](#verify-listener-health-before-sending-requests)
+- [Section 8: Advanced Patterns](#section-8)
+  - [8.1 Combine Multiple Listener Types for Layered Agent Architectures](#combine-multiple-listener-types-for-layered-agent-architectures)
+  - [8.2 Design Prompt Targets with Precise Parameter Schemas](#design-prompt-targets-with-precise-parameter-schemas)
+
+---
+
+## Section 1: Configuration Fundamentals
+
+*Core config.yaml structure, versioning, listener types, and provider setup — the entry point for every Plano deployment.*
+
+### 1.1 Always Specify a Supported Config Version
+
+**Impact:** `CRITICAL` — Plano rejects configs with missing or unsupported version fields — the version field gates all other validation
+**Tags:** `config`, `versioning`, `validation`
+
+## Always Specify a Supported Config Version
+
+Every Plano `config.yaml` must include a `version` field at the top level. Plano validates configs against a versioned JSON schema — an unrecognized or missing version will cause `planoai up` to fail immediately with a schema validation error before the container starts.
+
+**Incorrect (missing or invalid version):**
+
+```yaml
+# No version field — fails schema validation
+listeners:
+  - type: model
+    name: model_listener
+    port: 12000
+
+model_providers:
+  - model: openai/gpt-4o
+    access_key: $OPENAI_API_KEY
+```
+
+**Correct (explicit supported version):**
+
+```yaml
+version: v0.4.0
+
+listeners:
+  - type: model
+    name: model_listener
+    port: 12000
+
+model_providers:
+  - model: openai/gpt-4o
+    access_key: $OPENAI_API_KEY
+    default: true
+```
+
+Use the latest supported version unless you are targeting a specific deployed Plano image. Current supported versions: `v0.1`, `v0.1.0`, `0.1-beta`, `v0.2.0`, `v0.3.0`, `v0.4.0`. Prefer `v0.4.0` for all new projects — it introduces the top-level `routing_preferences` shape described in Section 2.4.
+
+Reference: https://github.com/katanemo/archgw/blob/main/config/plano_config_schema.yaml
+
+---
+
+### 1.2 Choose the Right Listener Type for Your Use Case
+
+**Impact:** `CRITICAL` — The listener type determines the entire request processing pipeline — choosing the wrong type means features like prompt functions or agent routing are unavailable
+**Tags:** `config`, `listeners`, `architecture`, `routing`
+
+## Choose the Right Listener Type for Your Use Case
+
+Plano supports three listener types, each serving a distinct purpose. `listeners` is the only required top-level array in a Plano config. Every listener needs at minimum a `type`, `name`, and `port`.
+ +| Type | Use When | Key Feature | +|------|----------|-------------| +| `model` | You want an OpenAI-compatible LLM gateway | Routes to multiple LLM providers, supports model aliases and routing preferences | +| `prompt` | You want LLM-callable custom functions | Define `prompt_targets` that the LLM dispatches as function calls | +| `agent` | You want multi-agent orchestration | Routes user requests to specialized sub-agents by matching agent descriptions | + +**Incorrect (using `model` when agents need orchestration):** + +```yaml +version: v0.3.0 + +# Wrong: a model listener cannot route to backend agent services +listeners: + - type: model + name: main + port: 12000 + +agents: + - id: weather_agent + url: http://host.docker.internal:8001 +``` + +**Correct (use `agent` listener for multi-agent systems):** + +```yaml +version: v0.3.0 + +agents: + - id: weather_agent + url: http://host.docker.internal:8001 + - id: travel_agent + url: http://host.docker.internal:8002 + +listeners: + - type: agent + name: orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: weather_agent + description: Provides real-time weather, forecasts, and conditions for any city. + - id: travel_agent + description: Books flights, hotels, and travel itineraries. + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true +``` + +A single Plano instance can expose multiple listeners on different ports, each with a different type, to serve different clients simultaneously. + +Reference: https://github.com/katanemo/archgw + +--- + +### 1.3 Register Model Providers with Correct Format Identifiers + +**Impact:** `CRITICAL` — Incorrect provider format causes request translation failures — Plano must know the wire format each provider expects +**Tags:** `config`, `model-providers`, `llm`, `api-format` + +## Register Model Providers with Correct Format Identifiers + +Plano translates requests between its internal format and each provider's API. The `model` field uses `provider/model-name` syntax which determines both the upstream endpoint and the request/response translation layer. Some providers require an explicit `provider_interface` override. 
+ +**Provider format reference:** + +| Model prefix | Wire format | Example | +|---|---|---| +| `openai/*` | OpenAI | `openai/gpt-4o` | +| `anthropic/*` | Anthropic | `anthropic/claude-sonnet-4-20250514` | +| `gemini/*` | Google Gemini | `gemini/gemini-2.0-flash` | +| `mistral/*` | Mistral | `mistral/mistral-large-latest` | +| `groq/*` | Groq | `groq/llama-3.3-70b-versatile` | +| `deepseek/*` | DeepSeek | `deepseek/deepseek-chat` | +| `xai/*` | Grok (OpenAI-compat) | `xai/grok-2` | +| `together_ai/*` | Together.ai | `together_ai/meta-llama/Llama-3` | +| `custom/*` | Requires `provider_interface` | `custom/my-local-model` | + +**Incorrect (missing provider prefix, ambiguous format):** + +```yaml +model_providers: + - model: gpt-4o # Missing openai/ prefix — Plano cannot route this + access_key: $OPENAI_API_KEY + + - model: claude-3-5-sonnet # Missing anthropic/ prefix + access_key: $ANTHROPIC_API_KEY +``` + +**Correct (explicit provider prefixes):** + +```yaml +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + + - model: gemini/gemini-2.0-flash + access_key: $GOOGLE_API_KEY +``` + +**For local or self-hosted models (Ollama, LiteLLM, vLLM):** + +```yaml +model_providers: + - model: custom/llama3 + base_url: http://host.docker.internal:11434/v1 # Ollama endpoint + provider_interface: openai # Ollama speaks OpenAI format + default: true +``` + +Always set `default: true` on exactly one provider per listener so Plano has a fallback when routing preferences do not match. + +Reference: https://github.com/katanemo/archgw + +--- + +### 1.4 Use Environment Variable Substitution for All Secrets + +**Impact:** `CRITICAL` — Hardcoded API keys in config.yaml will be committed to version control and exposed in Docker container inspect output +**Tags:** `config`, `security`, `secrets`, `api-keys`, `environment-variables` + +## Use Environment Variable Substitution for All Secrets + +Plano supports `$VAR_NAME` substitution in config values. This applies to `access_key` fields, `connection_string` for state storage, and `http_headers` in prompt targets and endpoints. Never hardcode credentials — Plano reads them from environment variables or a `.env` file at startup via `planoai up`. + +**Incorrect (hardcoded secrets):** + +```yaml +version: v0.3.0 + +model_providers: + - model: openai/gpt-4o + access_key: abcdefghijklmnopqrstuvwxyz... # Hardcoded — never do this + +state_storage: + type: postgres + connection_string: "postgresql://admin:mysecretpassword@prod-db:5432/plano" + +prompt_targets: + - name: get_data + endpoint: + name: my_api + http_headers: + Authorization: "Bearer abcdefghijklmnopqrstuvwxyz" # Hardcoded token +``` + +**Correct (environment variable substitution):** + +```yaml +version: v0.3.0 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + +state_storage: + type: postgres + connection_string: "postgresql://${DB_USER}:${DB_PASS}@${DB_HOST}:5432/${DB_NAME}" + +prompt_targets: + - name: get_data + endpoint: + name: my_api + http_headers: + Authorization: "Bearer $MY_API_TOKEN" +``` + +**`.env` file pattern (loaded automatically by `planoai up`):** + +```bash +# .env — add to .gitignore +OPENAI_API_KEY=abcdefghijklmnopqrstuvwxyz... +ANTHROPIC_API_KEY=abcdefghijklmnopqrstuvwxyz... 
+DB_USER=plano +DB_PASS=secure-password +DB_HOST=localhost +MY_API_TOKEN=abcdefghijklmnopqrstuvwxyz... +``` + +Plano also accepts keys set directly in the shell environment. Variables referenced in config but not found at startup cause `planoai up` to fail with a clear error listing the missing keys. + +Reference: https://github.com/katanemo/archgw + +--- + +## Section 2: Routing & Model Selection + +*Intelligent LLM routing using preferences, aliases, and defaults to match tasks to the best model.* + +### 2.1 Always Set Exactly One Default Model Provider + +**Impact:** `HIGH` — Without a default provider, Plano has no fallback when routing preferences do not match — requests with unclassified intent will fail +**Tags:** `routing`, `defaults`, `model-providers`, `reliability` + +## Always Set Exactly One Default Model Provider + +When a request does not match any routing preference, Plano forwards it to the `default: true` provider. Without a default, unmatched requests fail. If multiple providers are marked `default: true`, Plano uses the first one — which can produce unexpected behavior. + +**Incorrect (no default provider set):** + +```yaml +version: v0.4.0 + +model_providers: + - model: openai/gpt-4o-mini # No default: true anywhere + access_key: $OPENAI_API_KEY + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + +routing_preferences: + - name: summarization + description: Summarizing documents and extracting key points + models: + - openai/gpt-4o-mini + - name: code_generation + description: Writing new functions and implementing algorithms + models: + - openai/gpt-4o +``` + +**Incorrect (multiple defaults — ambiguous):** + +```yaml +model_providers: + - model: openai/gpt-4o-mini + default: true # First default + access_key: $OPENAI_API_KEY + + - model: openai/gpt-4o + default: true # Second default — confusing + access_key: $OPENAI_API_KEY +``` + +**Correct (exactly one default, covering unmatched requests):** + +```yaml +version: v0.4.0 + +model_providers: + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true # Handles general/unclassified requests + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + +routing_preferences: + - name: summarization + description: Summarizing documents, articles, and meeting notes + models: + - openai/gpt-4o-mini + - openai/gpt-4o + - name: classification + description: Categorizing inputs, labeling, and intent detection + models: + - openai/gpt-4o-mini + - name: code_generation + description: Writing, debugging, and reviewing code + models: + - openai/gpt-4o + - openai/gpt-4o-mini + - name: complex_reasoning + description: Multi-step math, logical analysis, research synthesis + models: + - openai/gpt-4o +``` + +Choose your most cost-effective capable model as the default — it handles all traffic that doesn't match specialized preferences. + +Reference: [https://github.com/katanemo/archgw](https://github.com/katanemo/archgw) + +--- + +### 2.2 Use Model Aliases for Semantic, Stable Model References + +**Impact:** `MEDIUM` — Hardcoded model names in client code require code changes when you swap providers; aliases let you update routing in config.yaml alone +**Tags:** `routing`, `model-aliases`, `maintainability`, `client-integration` + +## Use Model Aliases for Semantic, Stable Model References + +`model_aliases` map human-readable names to specific model identifiers. Client applications reference the alias, not the underlying model. 
When you want to upgrade from `gpt-4o` to a new model, you change one line in `config.yaml` — not every client calling the API.
+
+**Incorrect (clients hardcode specific model names):**
+
+```yaml
+# config.yaml — no aliases defined
+version: v0.3.0
+
+listeners:
+  - type: model
+    name: model_listener
+    port: 12000
+
+model_providers:
+  - model: openai/gpt-4o
+    access_key: $OPENAI_API_KEY
+    default: true
+```
+
+```python
+# Client code — brittle, must be updated when model changes
+client.chat.completions.create(model="gpt-4o", ...)
+```
+
+**Correct (semantic aliases, stable client contracts):**
+
+```yaml
+version: v0.3.0
+
+listeners:
+  - type: model
+    name: model_listener
+    port: 12000
+
+model_providers:
+  - model: openai/gpt-4o-mini
+    access_key: $OPENAI_API_KEY
+    default: true
+  - model: openai/gpt-4o
+    access_key: $OPENAI_API_KEY
+  - model: anthropic/claude-sonnet-4-20250514
+    access_key: $ANTHROPIC_API_KEY
+
+model_aliases:
+  plano.fast.v1:
+    target: gpt-4o-mini  # Cheap, fast — for high-volume tasks
+
+  plano.smart.v1:
+    target: gpt-4o  # High capability — for complex reasoning
+
+  plano.creative.v1:
+    target: claude-sonnet-4-20250514  # Strong creative writing and analysis
+
+  plano.v1:
+    target: gpt-4o  # Default production alias
+```
+
+```python
+# Client code — stable, alias is the contract
+client.chat.completions.create(model="plano.smart.v1", ...)
+```
+
+**Alias naming conventions:**
+- `<org>.<capability>.<vN>` — e.g., `plano.fast.v1`, `acme.code.v2`
+- Bumping `.v2` → `.v3` lets you run old and new aliases simultaneously during rollouts
+- `plano.v1` as a canonical default gives clients a single stable entry point
+
+Reference: https://github.com/katanemo/archgw
+
+---
+
+### 2.3 Use Passthrough Auth for Proxy and Multi-Tenant Setups
+
+**Impact:** `MEDIUM` — Without passthrough auth, self-hosted proxy services (LiteLLM, vLLM, etc.) reject Plano's requests because the wrong Authorization header is sent
+**Tags:** `routing`, `authentication`, `proxy`, `litellm`, `multi-tenant`
+
+## Use Passthrough Auth for Proxy and Multi-Tenant Setups
+
+When routing to a self-hosted LLM proxy (LiteLLM, vLLM, OpenRouter, Azure APIM) or in multi-tenant setups where clients supply their own keys, set `passthrough_auth: true`. This forwards the client's `Authorization` header rather than Plano's configured `access_key`. Combine with a `base_url` pointing to the proxy.
+ +**Incorrect (Plano sends its own key to a proxy that expects the client's key):** + +```yaml +model_providers: + - model: custom/proxy + base_url: http://host.docker.internal:8000 + access_key: $SOME_KEY # Plano overwrites the client's auth — proxy rejects it +``` + +**Correct (forward client Authorization header to the proxy):** + +```yaml +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: custom/litellm-proxy + base_url: http://host.docker.internal:4000 # LiteLLM server + provider_interface: openai # LiteLLM uses OpenAI format + passthrough_auth: true # Forward client's Bearer token + default: true +``` + +**Multi-tenant pattern (client supplies their own API key):** + +```yaml +model_providers: + # Plano acts as a passthrough gateway; each client has their own OpenAI key + - model: openai/gpt-4o + passthrough_auth: true # No access_key here — client's key is forwarded + default: true +``` + +**Combined: proxy for some models, Plano-managed for others:** + +```yaml +version: v0.4.0 + +model_providers: + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY # Plano manages this key + default: true + + - model: custom/vllm-llama + base_url: http://gpu-server:8000 + provider_interface: openai + passthrough_auth: true # vLLM cluster handles its own auth + +routing_preferences: + - name: quick tasks + description: Short answers, simple lookups, fast completions + models: + - openai/gpt-4o-mini + - name: long context + description: Processing very long documents, multi-document analysis + models: + - custom/vllm-llama +``` + +Reference: https://github.com/katanemo/archgw + +--- + +### 2.4 Write Task-Specific Routing Preference Descriptions + +**Impact:** `HIGH` — Vague preference descriptions cause Plano's internal router LLM to misclassify requests, routing expensive tasks to cheap models and vice versa +**Tags:** `routing`, `model-selection`, `preferences`, `llm-routing` + +## Write Task-Specific Routing Preference Descriptions + +Plano's `plano_orchestrator_v1` router uses a 1.5B preference-aligned LLM to classify incoming requests against your `routing_preferences` descriptions. It returns an ordered `models` list for the matched route; the client uses `models[0]` as primary and falls back to `models[1]`, `models[2]`... on `429`/`5xx` errors. Description quality directly determines routing accuracy. + +Starting in `v0.4.0`, `routing_preferences` lives at the **top level** of the config and each entry carries its own `models: [...]` candidate pool. Listing multiple models under a single route gives you automatic provider fallback without extra client logic. Configs still using the legacy v0.3.0 inline shape (under each `model_provider`) are auto-migrated with a deprecation warning — prefer the top-level form below. + +**Incorrect (vague, overlapping descriptions):** + +```yaml +version: v0.4.0 + +model_providers: + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + +routing_preferences: + - name: simple + description: easy tasks # Too vague — what is "easy"? 
+ models: + - openai/gpt-4o-mini + - name: hard + description: hard tasks # Too vague — overlaps with "easy" + models: + - openai/gpt-4o +``` + +**Correct (specific, distinct task descriptions, multi-model fallbacks):** + +```yaml +version: v0.4.0 + +model_providers: + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + + - model: anthropic/claude-sonnet-4-5 + access_key: $ANTHROPIC_API_KEY + +routing_preferences: + - name: summarization + description: > + Summarizing documents, articles, emails, or meeting transcripts. + Extracting key points, generating TL;DR sections, condensing long text. + models: + - openai/gpt-4o-mini + - openai/gpt-4o + - name: classification + description: > + Categorizing inputs, sentiment analysis, spam detection, + intent classification, labeling structured data fields. + models: + - openai/gpt-4o-mini + - name: translation + description: > + Translating text between languages, localization tasks. + models: + - openai/gpt-4o-mini + - anthropic/claude-sonnet-4-5 + - name: code_generation + description: > + Writing new functions, classes, or modules from scratch. + Implementing algorithms, boilerplate generation, API integrations. + models: + - openai/gpt-4o + - anthropic/claude-sonnet-4-5 + - name: code_review + description: > + Reviewing code for bugs, security vulnerabilities, performance issues. + Suggesting refactors, explaining complex code, debugging errors. + models: + - anthropic/claude-sonnet-4-5 + - openai/gpt-4o + - name: complex_reasoning + description: > + Multi-step math problems, logical deduction, strategic planning, + research synthesis requiring chain-of-thought reasoning. + models: + - openai/gpt-4o + - anthropic/claude-sonnet-4-5 +``` + +**Key principles for good preference descriptions:** +- Use concrete action verbs: "writing", "reviewing", "translating", "summarizing" +- List 3–5 specific sub-tasks or synonyms for each preference +- Ensure preferences across routes are mutually exclusive in scope +- Order `models` from most preferred to least — the client falls back in order on `429`/`5xx` +- List multiple models under one route for automatic provider fallback without extra client logic +- Every model listed in `models` must be declared in `model_providers` +- Test with representative queries using `planoai trace` and `--where` filters to verify routing decisions + +Reference: https://github.com/katanemo/archgw + +--- + +## Section 3: Agent Orchestration + +*Multi-agent patterns, agent descriptions, and orchestration strategies for building agentic applications.* + +### 3.1 Register All Sub-Agents in Both `agents` and `listeners.agents` + +**Impact:** `CRITICAL` — An agent registered only in `agents` but not referenced in a listener's agent list is unreachable; an agent listed in a listener but missing from `agents` causes a startup error +**Tags:** `agent`, `orchestration`, `config`, `multi-agent` + +## Register All Sub-Agents in Both `agents` and `listeners.agents` + +Plano's agent system has two separate concepts: the global `agents` array (defines the agent's ID and backend URL) and the `listeners[].agents` array (controls which agents are available to an orchestrator and provides their routing descriptions). Both must reference the same agent ID. 
+ +**Incorrect (agent defined globally but not referenced in listener):** + +```yaml +version: v0.3.0 + +agents: + - id: weather_agent + url: http://host.docker.internal:8001 + - id: news_agent # Defined but never referenced in any listener + url: http://host.docker.internal:8002 + +listeners: + - type: agent + name: orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: weather_agent + description: Provides weather forecasts and current conditions. + # news_agent is missing here — the orchestrator cannot route to it +``` + +**Incorrect (listener references an agent ID not in the global agents list):** + +```yaml +agents: + - id: weather_agent + url: http://host.docker.internal:8001 + +listeners: + - type: agent + name: orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: weather_agent + description: Provides weather forecasts. + - id: flights_agent # ID not in global agents[] — startup error + description: Provides flight status information. +``` + +**Correct (every agent ID appears in both places):** + +```yaml +version: v0.3.0 + +agents: + - id: weather_agent + url: http://host.docker.internal:8001 + - id: flights_agent + url: http://host.docker.internal:8002 + - id: hotels_agent + url: http://host.docker.internal:8003 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + +listeners: + - type: agent + name: travel_orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: weather_agent + description: Real-time weather, forecasts, and climate data for any city. + - id: flights_agent + description: Live flight status, schedules, gates, and delays. + - id: hotels_agent + description: Hotel search, availability, pricing, and booking. + default: true # Fallback if no other agent matches +``` + +Set `default: true` on one agent in each listener's agents list to handle unmatched requests. The agent's URL in the global `agents` array is the HTTP endpoint Plano forwards matching requests to — it must be reachable from within the Docker container (use `host.docker.internal` for services on the host). + +Reference: https://github.com/katanemo/archgw + +--- + +### 3.2 Write Capability-Focused Agent Descriptions for Accurate Routing + +**Impact:** `HIGH` — The orchestrator LLM routes requests purely by reading agent descriptions — poor descriptions cause misroutes to the wrong specialized agent +**Tags:** `agent`, `orchestration`, `descriptions`, `routing`, `multi-agent` + +## Write Capability-Focused Agent Descriptions for Accurate Routing + +In an `agent` listener, Plano's orchestrator reads each agent's `description` and routes user requests to the best-matching agent. This is LLM-based intent matching — the description is the entire specification the router sees. Write it as a capability manifest: what can this agent do, what data does it have access to, and what types of requests should it handle? 
+ +**Incorrect (generic, overlapping descriptions):** + +```yaml +listeners: + - type: agent + name: orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: agent_1 + description: Helps users with information # Too generic — matches everything + + - id: agent_2 + description: Also helps users # Indistinguishable from agent_1 +``` + +**Correct (specific capabilities, distinct domains, concrete examples):** + +```yaml +version: v0.3.0 + +agents: + - id: weather_agent + url: http://host.docker.internal:8001 + - id: flight_agent + url: http://host.docker.internal:8002 + - id: hotel_agent + url: http://host.docker.internal:8003 + +listeners: + - type: agent + name: travel_orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: weather_agent + description: > + Provides real-time weather conditions and multi-day forecasts for any city + worldwide. Handles questions about temperature, precipitation, wind, humidity, + sunrise/sunset times, and severe weather alerts. Examples: "What's the weather + in Tokyo?", "Will it rain in London this weekend?", "Sunrise time in New York." + + - id: flight_agent + description: > + Provides live flight status, schedules, gate information, delays, and + aircraft details for any flight number or route between airports. + Handles questions about departures, arrivals, and airline information. + Examples: "Is AA123 on time?", "Flights from JFK to LAX tomorrow." + + - id: hotel_agent + description: > + Searches and books hotel accommodations, compares room types, pricing, + and availability. Handles check-in/check-out dates, amenities, and + cancellation policies. Examples: "Hotels near Times Square for next Friday." +``` + +**Description writing checklist:** +- State the primary domain in the first sentence +- List 3–5 specific data types or question categories this agent handles +- Include 2–3 concrete example user queries in quotes +- Avoid capability overlap between agents — if they overlap, the router will split traffic unpredictably +- Keep descriptions under 150 words — the orchestrator reads all descriptions per request + +Reference: https://github.com/katanemo/archgw + +--- + +## Section 4: Filter Chains & Guardrails + +*Request/response processing pipelines — ordering, MCP integration, and safety guardrails.* + +### 4.1 Configure MCP Filters with Explicit Type and Transport + +**Impact:** `MEDIUM` — Omitting type and transport fields relies on defaults that may not match your MCP server's protocol implementation +**Tags:** `filter`, `mcp`, `integration`, `configuration` + +## Configure MCP Filters with Explicit Type and Transport + +Plano filters integrate with external services via MCP (Model Context Protocol) or plain HTTP. MCP filters call a specific tool on a remote MCP server. Always specify `type`, `transport`, and optionally `tool` (defaults to the filter `id`) to ensure Plano connects correctly to your filter implementation. 
+ +**Incorrect (minimal filter definition relying on all defaults):** + +```yaml +filters: + - id: my_guard # Plano infers type=mcp, transport=streamable-http, tool=my_guard + url: http://localhost:10500 + # If your MCP server uses a different tool name or transport, this silently misroutes +``` + +**Correct (explicit configuration for each filter):** + +```yaml +version: v0.3.0 + +filters: + - id: input_guards + url: http://host.docker.internal:10500 + type: mcp # Explicitly MCP protocol + transport: streamable-http # Streamable HTTP transport + tool: input_guards # MCP tool name (matches MCP server registration) + + - id: query_rewriter + url: http://host.docker.internal:10501 + type: mcp + transport: streamable-http + tool: rewrite_query # Tool name differs from filter ID — explicit is safer + + - id: custom_validator + url: http://host.docker.internal:10503 + type: http # Plain HTTP filter (not MCP) + # No tool field for HTTP filters +``` + +**MCP filter implementation contract:** +Your MCP server must expose a tool matching the `tool` name. The tool receives the request payload and must return either: +- A modified request (to pass through with changes) +- A rejection response (to short-circuit the pipeline) + +**HTTP filter alternative** — use `type: http` for simpler request/response interceptors that don't need the MCP protocol: + +```yaml +filters: + - id: auth_validator + url: http://host.docker.internal:9000/validate + type: http # Plano POSTs the request, expects the modified request back +``` + +Reference: https://github.com/katanemo/archgw + +--- + +### 4.2 Configure Prompt Guards with Actionable Rejection Messages + +**Impact:** `MEDIUM` — A generic or empty rejection message leaves users confused about why their request was blocked and unable to rephrase appropriately +**Tags:** `filter`, `guardrails`, `jailbreak`, `security`, `ux` + +## Configure Prompt Guards with Actionable Rejection Messages + +Plano has built-in `prompt_guards` for detecting jailbreak attempts. When triggered, Plano returns the `on_exception.message` instead of forwarding the request. Write messages that explain the restriction and suggest what the user can do instead — both for user experience and to reduce support burden. + +**Incorrect (no message configured — returns a generic error):** + +```yaml +version: v0.3.0 + +prompt_guards: + input_guards: + jailbreak: + on_exception: {} # Empty — returns unhelpful generic error +``` + +**Incorrect (cryptic technical message):** + +```yaml +prompt_guards: + input_guards: + jailbreak: + on_exception: + message: "Error code 403: guard triggered" # Unhelpful to the user +``` + +**Correct (clear, actionable, brand-appropriate message):** + +```yaml +version: v0.3.0 + +prompt_guards: + input_guards: + jailbreak: + on_exception: + message: > + I'm not able to help with that request. This assistant is designed + to help with [your use case, e.g., customer support, coding questions]. + Please rephrase your question or contact support@yourdomain.com + if you believe this is an error. +``` + +**Combining prompt_guards with MCP filter guardrails:** + +```yaml +# Built-in jailbreak detection (fast, no external service needed) +prompt_guards: + input_guards: + jailbreak: + on_exception: + message: "This request cannot be processed. Please ask about our products and services." 
+ +# MCP-based custom guards for additional policy enforcement +filters: + - id: topic_restriction + url: http://host.docker.internal:10500 + type: mcp + transport: streamable-http + tool: topic_restriction # Custom filter for domain-specific restrictions + +listeners: + - type: agent + name: customer_support + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: support_agent + description: Customer support assistant for product questions and order issues. + filter_chain: + - topic_restriction # Additional custom topic filtering +``` + +`prompt_guards` applies globally to all listeners. Use `filter_chain` on individual agents for per-agent policies. + +Reference: https://github.com/katanemo/archgw + +--- + +### 4.3 Order Filter Chains with Guards First, Enrichment Last + +**Impact:** `HIGH` — Running context builders before input guards means jailbreak attempts get RAG-enriched context before being blocked — wasting compute and risking data exposure +**Tags:** `filter`, `guardrails`, `security`, `pipeline`, `ordering` + +## Order Filter Chains with Guards First, Enrichment Last + +A `filter_chain` is an ordered list of filter IDs applied sequentially to each request. The order is semantically meaningful: each filter receives the output of the previous one. Safety and validation filters must run first to short-circuit bad requests before expensive enrichment filters process them. + +**Recommended filter chain order:** + +1. **Input guards** — jailbreak detection, PII detection, topic restrictions (reject early) +2. **Query rewriting** — normalize or enhance the user query +3. **Context building** — RAG retrieval, tool lookup, knowledge injection (expensive) +4. **Output guards** — validate or sanitize LLM response before returning + +**Incorrect (context built before guards — wasteful and potentially unsafe):** + +```yaml +filters: + - id: context_builder + url: http://host.docker.internal:10502 # Runs expensive RAG retrieval first + - id: query_rewriter + url: http://host.docker.internal:10501 + - id: input_guards + url: http://host.docker.internal:10500 # Guards run last — jailbreak gets context + +listeners: + - type: agent + name: rag_orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: rag_agent + filter_chain: + - context_builder # Wrong: expensive enrichment before safety check + - query_rewriter + - input_guards +``` + +**Correct (guards block bad requests before any enrichment):** + +```yaml +version: v0.3.0 + +filters: + - id: input_guards + url: http://host.docker.internal:10500 + type: mcp + transport: streamable-http + - id: query_rewriter + url: http://host.docker.internal:10501 + type: mcp + transport: streamable-http + - id: context_builder + url: http://host.docker.internal:10502 + type: mcp + transport: streamable-http + +listeners: + - type: agent + name: rag_orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: rag_agent + description: Answers questions using internal knowledge base documents. + filter_chain: + - input_guards # 1. Block jailbreaks and policy violations + - query_rewriter # 2. Normalize the safe query + - context_builder # 3. Retrieve relevant context for the clean query +``` + +Different agents within the same listener can have different filter chains — a public-facing agent may need all guards while an internal admin agent may skip them. 
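+
+For reference, each filter in a chain like this is just an MCP server exposing one tool. The sketch below is a non-authoritative example of what `input_guards` might look like using the official MCP Python SDK's `FastMCP` helper — the tool's argument and return shapes are assumptions; adapt them to your actual filter contract:
+
+```python
+from mcp.server.fastmcp import FastMCP
+
+# Serve the tool over streamable HTTP on the port referenced in config.yaml.
+mcp = FastMCP("input_guards", host="0.0.0.0", port=10500)
+
+@mcp.tool()
+def input_guards(messages: list[dict]) -> dict:
+    """Reject obvious jailbreak phrasing; otherwise pass the request through."""
+    last = (messages[-1].get("content") or "").lower() if messages else ""
+    if "ignore previous instructions" in last:
+        # Assumed rejection shape — return whatever your Plano contract expects.
+        return {"reject": True, "message": "This request cannot be processed."}
+    return {"messages": messages}  # pass through unchanged
+
+if __name__ == "__main__":
+    mcp.run(transport="streamable-http")
+```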
+
+Reference: https://github.com/katanemo/archgw
+
+---
+
+## Section 5: Observability & Debugging
+
+*OpenTelemetry tracing, log levels, span attributes, and sampling for production visibility.*
+
+### 5.1 Add Custom Span Attributes for Correlation and Filtering
+
+**Impact:** `MEDIUM` — Without custom span attributes, traces cannot be filtered by user, session, or environment — making production debugging significantly harder
+**Tags:** `observability`, `tracing`, `span-attributes`, `correlation`
+
+## Add Custom Span Attributes for Correlation and Filtering
+
+Plano can automatically extract HTTP request headers and attach them as span attributes, plus attach static key-value pairs to every span. This enables filtering traces by user, session, tenant, environment, or any other dimension that matters to your application.
+
+**Incorrect (no span attributes — traces are unfiltered blobs):**
+
+```yaml
+tracing:
+  random_sampling: 20
+  # No span_attributes — cannot filter by user, session, or environment
+```
+
+**Correct (rich span attributes for production correlation):**
+
+```yaml
+version: v0.3.0
+
+tracing:
+  random_sampling: 20
+  trace_arch_internal: true
+
+  span_attributes:
+    # Match all headers with this prefix, then map to span attributes by:
+    # 1) stripping the prefix and 2) converting hyphens to dots
+    header_prefixes:
+      - x-katanemo-
+
+    # Static attributes added to every span from this Plano instance
+    static:
+      environment: production
+      service.name: plano-gateway
+      deployment.region: us-east-1
+      service.version: "2.1.0"
+      team: platform-engineering
+```
+
+**Sending correlation headers from client code:**
+
+```python
+import httpx
+
+response = httpx.post(
+    "http://localhost:12000/v1/chat/completions",
+    headers={
+        "x-katanemo-request-id": "req_abc123",
+        "x-katanemo-user-id": "usr_12",
+        "x-katanemo-session-id": "sess_xyz456",
+        "x-katanemo-tenant-id": "acme-corp",
+    },
+    json={"model": "plano.v1", "messages": [...]}
+)
+```
+
+**Querying by custom attribute:**
+
+```bash
+# Find all requests from a specific user
+planoai trace --where user.id=usr_12
+
+# Find all traces from production environment
+planoai trace --where environment=production
+
+# Find traces from a specific tenant
+planoai trace --where tenant.id=acme-corp
+```
+
+Header matching is by prefix, not exact name: any header that starts with the configured prefix is captured. With `x-katanemo-`, these mappings apply:
+
+- `x-katanemo-user-id` -> `user.id`
+- `x-katanemo-tenant-id` -> `tenant.id`
+- `x-katanemo-request-id` -> `request.id`
+
+Reference: [https://github.com/katanemo/archgw](https://github.com/katanemo/archgw)
+
+---
+
+### 5.2 Enable Tracing with Appropriate Sampling for Your Environment
+
+**Impact:** `HIGH` — Without tracing enabled, debugging routing decisions, latency issues, and model selection is guesswork — traces are the primary observability primitive in Plano
+**Tags:** `observability`, `tracing`, `opentelemetry`, `otel`, `debugging`
+
+## Enable Tracing with Appropriate Sampling for Your Environment
+
+Plano emits OpenTelemetry (OTEL) traces for every request, capturing routing decisions, LLM provider selection, filter chain execution, and response latency. Traces are the best tool for understanding why a request was routed to a particular model and debugging unexpected behavior.
+ +**Incorrect (no tracing configured — flying blind in production):** + +```yaml +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + +# No tracing block — no visibility into routing, latency, or errors +``` + +**Correct (tracing enabled with environment-appropriate sampling):** + +```yaml +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + +tracing: + random_sampling: 100 # 100% for development/debugging + trace_arch_internal: true # Include Plano's internal routing spans +``` + +**Production configuration (sampled to control volume):** + +```yaml +tracing: + random_sampling: 10 # Sample 10% of requests in production + trace_arch_internal: false # Skip internal spans to reduce noise + span_attributes: + header_prefixes: + - x-katanemo- # Match all x-katanemo-* headers + static: + environment: production + service.name: my-plano-service + version: "1.0.0" +``` + +With `x-katanemo-` configured, Plano maps headers to attributes by stripping the prefix and converting hyphens to dots: + +- `x-katanemo-user-id` -> `user.id` +- `x-katanemo-session-id` -> `session.id` +- `x-katanemo-request-id` -> `request.id` + +**Starting the trace collector:** + +```bash +# Start Plano with built-in OTEL collector +planoai up config.yaml --with-tracing +``` + +Sampling rates: 100% for dev/staging, 5–20% for high-traffic production, 100% for low-traffic production. `trace_arch_internal: true` adds spans showing which routing preference matched — essential for debugging preference configuration. + +Reference: [https://github.com/katanemo/archgw](https://github.com/katanemo/archgw) + +--- + +### 5.3 Use `planoai trace` to Inspect Routing Decisions + +**Impact:** `MEDIUM-HIGH` — The trace CLI lets you verify which model was selected, why, and how long each step took — without setting up a full OTEL backend +**Tags:** `observability`, `tracing`, `cli`, `debugging`, `routing` + +## Use `planoai trace` to Inspect Routing Decisions + +`planoai trace` provides a built-in trace viewer backed by an in-memory OTEL collector. Use it to inspect routing decisions, verify preference matching, measure filter latency, and debug failed requests — all from the CLI without configuring Jaeger, Zipkin, or another backend. + +**Workflow: start collector, run requests, then inspect traces:** + +```bash +# 1. Start Plano with the built-in trace collector (recommended) +planoai up config.yaml --with-tracing + +# 2. Send test requests through Plano +curl http://localhost:12000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "plano.v1", "messages": [{"role": "user", "content": "Write a Python function to sort a list"}]}' + +# 3. 
Show the latest trace
+planoai trace
+```
+
+You can also run the trace listener directly:
+
+```bash
+planoai trace listen    # starts the OTEL collector trace listener as a background process
+```
+
+Stop the background trace listener:
+
+```bash
+planoai trace down
+```
+
+**Useful trace viewer patterns:**
+
+```bash
+# Show latest trace (default target is "last")
+planoai trace
+
+# List available trace IDs
+planoai trace --list
+
+# Show all traces
+planoai trace any
+
+# Show a specific trace (short 8-char or full 32-char ID)
+planoai trace 7f4e9a1c
+planoai trace 7f4e9a1c0d9d4a0bb9bf5a8a7d13f62a
+
+# Filter by specific span attributes (AND semantics for repeated --where)
+planoai trace any --where llm.model=gpt-4o-mini
+
+# Filter by user ID (if header prefix is x-katanemo-, x-katanemo-user-id maps to user.id)
+planoai trace any --where user.id=user_123
+
+# Limit results for a quick sanity check
+planoai trace any --limit 5
+
+# Time window filter
+planoai trace any --since 30m
+
+# Filter displayed attributes by key pattern
+planoai trace any --filter "http.*"
+
+# Output machine-readable JSON
+planoai trace any --json
+```
+
+**What to look for in traces:**
+
+| Span name | What it tells you |
+| -------------------------- | -------------------------------------------------------------- |
+| `plano.routing` | Which routing preference matched and which model was selected |
+| `plano.filter.<filter_id>` | How long each filter in the chain took |
+| `plano.llm.request` | Time to first token and full response time |
+| `plano.agent.route` | Which agent description matched for agent listeners |
+
+Reference: [https://github.com/katanemo/archgw](https://github.com/katanemo/archgw)
+
+---
+
+## Section 6: CLI Operations
+
+*Using the planoai CLI for startup, tracing, CLI agents, project init, and code generation.*
+
+### 6.1 Follow the `planoai up` Validation Workflow Before Debugging Runtime Issues
+
+**Impact:** `HIGH` — `planoai up` validates config, checks API keys, and health-checks all listeners — skipping this diagnostic information leads to unnecessary debugging of container or network issues
+**Tags:** `cli`, `startup`, `validation`, `debugging`, `workflow`
+
+## Follow the `planoai up` Validation Workflow Before Debugging Runtime Issues
+
+`planoai up` is the entry point for running Plano. It performs sequential checks before the container starts: schema validation, API key presence check, container startup, and health checks on all configured listener ports. Understanding what each failure stage means prevents chasing the wrong root cause.
+ +**Validation stages and failure signals:** + +``` +Stage 1: Schema validation → "config.yaml: invalid against schema" +Stage 2: API key check → "Missing required environment variables: OPENAI_API_KEY" +Stage 3: Container start → "Docker daemon not running" or image pull errors +Stage 4: Health check (/healthz) → "Listener not healthy after 120s" (timeout) +``` + +**Development startup workflow:** + +```bash +# Standard startup — config.yaml in current directory +planoai up + +# Explicit config file path +planoai up my-config.yaml + +# Start in foreground to see all logs immediately (great for debugging) +planoai up config.yaml --foreground + +# Start with built-in OTEL trace collector +planoai up config.yaml --with-tracing + +# Enable verbose logging for debugging routing decisions +LOG_LEVEL=debug planoai up config.yaml --foreground +``` + +**Checking what's running:** + +```bash +# Stream recent logs (last N lines, then exit) +planoai logs + +# Follow logs in real-time +planoai logs --follow + +# Include Envoy/gateway debug messages +planoai logs --debug --follow +``` + +**Stopping and restarting after config changes:** + +```bash +# Stop the current container +planoai down + +# Restart with updated config +planoai up config.yaml +``` + +**Common failure patterns:** + +```bash +# API key missing — check your .env file or shell environment +export OPENAI_API_KEY=sk-proj-... +planoai up config.yaml + +# Health check timeout — listener port may conflict +# Check if another process uses port 12000 +lsof -i :12000 + +# Container fails to start — verify Docker daemon is running +docker ps +``` + +`planoai down` fully stops and removes the Plano container. Always run `planoai down` before `planoai up` when changing config to avoid stale container state. + +Reference: https://github.com/katanemo/archgw + +--- + +### 6.2 Generate Prompt Targets from Python Functions with `planoai generate_prompt_targets` + +**Impact:** `MEDIUM` — Manually writing prompt_targets YAML for existing Python APIs is error-prone — the generator introspects function signatures and produces correct YAML automatically +**Tags:** `cli`, `generate`, `prompt-targets`, `python`, `code-generation` + +## Generate Prompt Targets from Python Functions with `planoai generate_prompt_targets` + +`planoai generate_prompt_targets` introspects Python function signatures and docstrings to generate `prompt_targets` YAML for your Plano config. This is the fastest way to expose existing Python APIs as LLM-callable functions without manually writing the YAML schema. + +**Python function requirements for generation:** +- Use simple type annotations: `int`, `float`, `bool`, `str`, `list`, `tuple`, `set`, `dict` +- Include a docstring describing what the function does (becomes the `description`) +- Complex Pydantic models must be flattened into primitive typed parameters first + +**Example Python file:** + +```python +# api.py + +def get_stock_quote(symbol: str, exchange: str = "NYSE") -> dict: + """Get the current stock price and trading data for a given stock symbol. + + Returns price, volume, market cap, and 24h change percentage. + """ + # Implementation calls stock API + pass + +def get_weather_forecast(city: str, days: int = 3, units: str = "celsius") -> dict: + """Get the weather forecast for a city. + + Returns temperature, precipitation, and conditions for the specified number of days. 
+ """ + pass + +def search_flights(origin: str, destination: str, date: str, passengers: int = 1) -> list: + """Search for available flights between two airports on a given date. + + Date format: YYYY-MM-DD. Returns list of flight options with prices. + """ + pass +``` + +**Running the generator:** + +```bash +planoai generate_prompt_targets --file api.py +``` + +**Generated output (add to your config.yaml):** + +```yaml +prompt_targets: + - name: get_stock_quote + description: Get the current stock price and trading data for a given stock symbol. + parameters: + - name: symbol + type: str + required: true + - name: exchange + type: str + required: false + default: NYSE + # Add endpoint manually: + endpoint: + name: stock_api + path: /quote?symbol={symbol}&exchange={exchange} + + - name: get_weather_forecast + description: Get the weather forecast for a city. + parameters: + - name: city + type: str + required: true + - name: days + type: int + required: false + default: 3 + - name: units + type: str + required: false + default: celsius + endpoint: + name: weather_api + path: /forecast?city={city}&days={days}&units={units} +``` + +After generation, manually add the `endpoint` blocks pointing to your actual API. The generator produces the schema; you wire in the connectivity. + +Reference: https://github.com/katanemo/archgw + +--- + +### 6.3 Use `planoai cli_agent` to Connect Claude Code Through Plano + +**Impact:** `MEDIUM-HIGH` — Running Claude Code directly against provider APIs bypasses Plano's routing, observability, and guardrails — cli_agent routes all Claude Code traffic through your configured Plano instance +**Tags:** `cli`, `cli-agent`, `claude`, `coding-agent`, `integration` + +## Use `planoai cli_agent` to Connect Claude Code Through Plano + +`planoai cli_agent` starts a Claude Code session that routes all LLM traffic through your running Plano instance instead of directly to Anthropic. This gives you routing preferences, model aliases, tracing, and guardrails for your coding agent workflows — making Claude Code a first-class citizen of your Plano configuration. + +**Prerequisites:** + +```bash +# 1. Plano must be running with a model listener +planoai up config.yaml + +# 2. ANTHROPIC_API_KEY must be set (Claude Code uses it for auth) +export ANTHROPIC_API_KEY=sk-ant-... +``` + +**Starting the CLI agent:** + +```bash +# Start CLI agent using config.yaml in current directory +planoai cli_agent claude + +# Use a specific config file +planoai cli_agent claude config.yaml + +# Use a config in a different directory +planoai cli_agent claude --path /path/to/project +``` + +**Recommended config for Claude Code routing:** + +```yaml +version: v0.4.0 + +listeners: + - type: model + name: claude_code_router + port: 12000 + +model_providers: + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + default: true + + - model: anthropic/claude-opus-4-6 + access_key: $ANTHROPIC_API_KEY + +routing_preferences: + - name: general coding + description: > + Writing code, debugging, code review, explaining concepts, + answering programming questions, general development tasks. + models: + - anthropic/claude-sonnet-4-20250514 + - anthropic/claude-opus-4-6 + - name: complex architecture + description: > + System design, complex refactoring across many files, + architectural decisions, performance optimization, security audits. 
+ models: + - anthropic/claude-opus-4-6 + - anthropic/claude-sonnet-4-20250514 + +model_aliases: + claude.fast.v1: + target: claude-sonnet-4-20250514 + claude.smart.v1: + target: claude-opus-4-6 + +tracing: + random_sampling: 100 + trace_arch_internal: true + +overrides: + upstream_connect_timeout: "10s" +``` + +**What happens when cli_agent runs:** + +1. Reads your config.yaml to find the model listener port +2. Configures Claude Code to use `http://localhost:` as its API endpoint +3. Starts a Claude Code session in your terminal +4. All Claude Code LLM calls flow through Plano — routing, tracing, and guardrails apply + +After your session, use `planoai trace` to inspect every LLM call Claude Code made, which model was selected, and why. + +Reference: [https://github.com/katanemo/archgw](https://github.com/katanemo/archgw) + +--- + +### 6.4 Use `planoai init` Templates to Bootstrap New Projects Correctly + +**Impact:** `MEDIUM` — Starting from a blank config.yaml leads to missing required fields and common structural mistakes — templates provide validated, idiomatic starting points +**Tags:** `cli`, `init`, `templates`, `getting-started`, `project-setup` + +## Use `planoai init` Templates to Bootstrap New Projects Correctly + +`planoai init` generates a valid `config.yaml` from built-in templates. Each template demonstrates a specific Plano capability with correct structure, realistic examples, and comments. Use this instead of writing config from scratch — it ensures you start with a valid, working configuration. + +**Available templates:** + +| Template ID | What It Demonstrates | Best For | +|---|---|---| +| `sub_agent_orchestration` | Multi-agent routing with specialized sub-agents | Building agentic applications | +| `coding_agent_routing` | Routing preferences + model aliases for coding workflows | Claude Code and coding assistants | +| `preference_aware_routing` | Automatic LLM routing based on task type | Multi-model cost optimization | +| `filter_chain_guardrails` | Input guards, query rewrite, context builder | RAG + safety pipelines | +| `conversational_state_v1_responses` | Stateful conversations with memory | Chatbots, multi-turn assistants | + +**Usage:** + +```bash +# Initialize with a template +planoai init --template sub_agent_orchestration + +# Initialize coding agent routing setup +planoai init --template coding_agent_routing + +# Initialize a RAG with guardrails project +planoai init --template filter_chain_guardrails +``` + +**Typical project setup workflow:** + +```bash +# 1. Create project directory +mkdir my-plano-agent && cd my-plano-agent + +# 2. Bootstrap with the closest matching template +planoai init --template preference_aware_routing + +# 3. Edit config.yaml to add your specific models, agents, and API keys +# (keys are already using $VAR substitution — just set your env vars) + +# 4. Create .env file for local development +cat > .env << EOF +OPENAI_API_KEY=sk-proj-... +ANTHROPIC_API_KEY=sk-ant-... +EOF + +echo ".env" >> .gitignore + +# 5. Start Plano +planoai up + +# 6. Test your configuration +curl http://localhost:12000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "gpt-4o", "messages": [{"role": "user", "content": "Hello"}]}' +``` + +Start with `preference_aware_routing` for most LLM gateway use cases and `sub_agent_orchestration` for multi-agent applications. Both can be combined after you understand each independently. 
+ +Reference: https://github.com/katanemo/archgw + +--- + +## Section 7: Deployment & Security + +*Docker deployment, environment variable management, health checks, and state storage for production.* + +### 7.1 Understand Plano's Docker Network Topology for Agent URL Configuration + +**Impact:** `HIGH` — Using `localhost` for agent URLs inside Docker always fails — Plano runs in a container and cannot reach host services via localhost +**Tags:** `deployment`, `docker`, `networking`, `agents`, `urls` + +## Understand Plano's Docker Network Topology for Agent URL Configuration + +Plano runs inside a Docker container managed by `planoai up`. Services running on your host machine (agent servers, filter servers, databases) are not accessible as `localhost` from inside the container. Use Docker's special hostname `host.docker.internal` to reach host services. + +**Docker network rules:** +- `localhost` / `127.0.0.1` inside the container → Plano's own container (not your host) +- `host.docker.internal` → Your host machine's loopback interface +- Container name or `docker network` hostname → Other Docker containers +- External domain / IP → Reachable if Docker has network access + +**Incorrect (using localhost — agent unreachable from inside container):** + +```yaml +version: v0.3.0 + +agents: + - id: weather_agent + url: http://localhost:8001 # Wrong: this is Plano's own container + + - id: flight_agent + url: http://127.0.0.1:8002 # Wrong: same issue + +filters: + - id: input_guards + url: http://localhost:10500 # Wrong: filter server unreachable +``` + +**Correct (using host.docker.internal for host-side services):** + +```yaml +version: v0.3.0 + +agents: + - id: weather_agent + url: http://host.docker.internal:8001 # Correct: reaches host port 8001 + + - id: flight_agent + url: http://host.docker.internal:8002 # Correct: reaches host port 8002 + +filters: + - id: input_guards + url: http://host.docker.internal:10500 # Correct: reaches filter server on host + +endpoints: + internal_api: + endpoint: host.docker.internal # Correct for internal API on host + protocol: http +``` + +**Production deployment patterns:** + +```yaml +# Kubernetes / Docker Compose — use service names +agents: + - id: weather_agent + url: http://weather-service:8001 # Kubernetes service DNS + +# External cloud services — use full domain +agents: + - id: cloud_agent + url: https://my-agent.us-east-1.amazonaws.com/v1 + +# Custom TLS (self-signed or internal CA) +overrides: + upstream_tls_ca_path: /etc/ssl/certs/internal-ca.pem +``` + +**Ports exposed by Plano's container:** +- All `port` values from your `listeners` blocks are automatically mapped +- `9901` — Envoy admin interface (for advanced debugging) +- `12001` — Plano internal management API + +Reference: https://github.com/katanemo/archgw + +--- + +### 7.2 Use PostgreSQL State Storage for Multi-Turn Conversations in Production + +**Impact:** `HIGH` — The default in-memory state storage loses all conversation history when the container restarts — production multi-turn agents require persistent PostgreSQL storage +**Tags:** `deployment`, `state`, `postgres`, `memory`, `multi-turn`, `production` + +## Use PostgreSQL State Storage for Multi-Turn Conversations in Production + +`state_storage` enables Plano to maintain conversation context across requests. Without it, each request is stateless. The `memory` type works for development and testing — all state is lost on container restart. Use `postgres` for any production deployment where conversation continuity matters. 
+
+**Incorrect (memory storage in production):**
+
+```yaml
+version: v0.3.0
+
+# Memory storage — all conversations lost on planoai down / container restart
+state_storage:
+  type: memory
+
+listeners:
+  - type: agent
+    name: customer_support
+    port: 8000
+    router: plano_orchestrator_v1
+    agents:
+      - id: support_agent
+        description: Customer support assistant with conversation history.
+```
+
+**Correct (PostgreSQL for production persistence):**
+
+```yaml
+version: v0.3.0
+
+state_storage:
+  type: postgres
+  connection_string: "postgresql://${DB_USER}:${DB_PASS}@${DB_HOST}:5432/${DB_NAME}"
+
+listeners:
+  - type: agent
+    name: customer_support
+    port: 8000
+    router: plano_orchestrator_v1
+    agents:
+      - id: support_agent
+        description: Customer support assistant with access to full conversation history.
+
+model_providers:
+  - model: openai/gpt-4o
+    access_key: $OPENAI_API_KEY
+    default: true
+```
+
+**Setting up PostgreSQL for local development:**
+
+```bash
+# Start PostgreSQL with Docker
+docker run -d \
+  --name plano-postgres \
+  -e POSTGRES_USER=plano \
+  -e POSTGRES_PASSWORD=devpassword \
+  -e POSTGRES_DB=plano \
+  -p 5432:5432 \
+  postgres:16
+
+# Set environment variables
+export DB_USER=plano
+export DB_PASS=devpassword
+export DB_HOST=host.docker.internal  # Use host.docker.internal from inside Plano container
+export DB_NAME=plano
+```
+
+**Production `.env` pattern:**
+
+```bash
+DB_USER=plano_prod
+DB_PASS=<strong-password-from-your-secrets-manager>
+DB_HOST=your-rds-endpoint.amazonaws.com
+DB_NAME=plano
+```
+
+Plano automatically creates its state tables on first startup. The `connection_string` supports all standard PostgreSQL connection parameters including SSL: `postgresql://user:pass@host:5432/db?sslmode=require`.
+
+Reference: https://github.com/katanemo/archgw
+
+---
+
+### 7.3 Verify Listener Health Before Sending Requests
+
+**Impact:** `MEDIUM` — Sending requests to Plano before listeners are healthy results in connection refused errors that look like application bugs — always confirm health before testing
+**Tags:** `deployment`, `health-checks`, `readiness`, `debugging`
+
+## Verify Listener Health Before Sending Requests
+
+Each Plano listener exposes a `/healthz` HTTP endpoint. `planoai up` automatically health-checks all listeners during startup (120s timeout), but in CI/CD pipelines, custom scripts, or when troubleshooting, you may need to check health manually.
+
+**Health check endpoints:**
+
+```bash
+# Check model listener health (port from your config)
+curl -f http://localhost:12000/healthz
+# Returns 200 OK when healthy
+
+# Check prompt listener
+curl -f http://localhost:10000/healthz
+
+# Check agent listener
+curl -f http://localhost:8000/healthz
+```
+
+**Polling health in scripts (CI/CD pattern):**
+
+```bash
+#!/bin/bash
+# wait-for-plano.sh
+
+LISTENER_PORT=${1:-12000}
+MAX_WAIT=120
+INTERVAL=2
+elapsed=0
+
+echo "Waiting for Plano listener on port $LISTENER_PORT..."
+ +until curl -sf "http://localhost:$LISTENER_PORT/healthz" > /dev/null; do + if [ $elapsed -ge $MAX_WAIT ]; then + echo "ERROR: Plano listener not healthy after ${MAX_WAIT}s" + planoai logs --debug + exit 1 + fi + sleep $INTERVAL + elapsed=$((elapsed + INTERVAL)) +done + +echo "Plano listener healthy after ${elapsed}s" +``` + +**Docker Compose health check:** + +```yaml +# docker-compose.yml for services that depend on Plano +services: + plano: + image: katanemo/plano:latest + # Plano is managed by planoai, not directly via compose in most setups + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:12000/healthz"] + interval: 5s + timeout: 3s + retries: 24 + start_period: 10s + + my-agent: + image: my-agent:latest + depends_on: + plano: + condition: service_healthy +``` + +**Debug unhealthy listeners:** + +```bash +# See startup logs +planoai logs --debug + +# Check if port is already in use +lsof -i :12000 + +# Check container status +docker ps -a --filter name=plano + +# Restart from scratch +planoai down && planoai up config.yaml --foreground +``` + +Reference: https://github.com/katanemo/archgw + +--- + +## Section 8: Advanced Patterns + +*Prompt targets, external API integration, rate limiting, and multi-listener architectures.* + +### 8.1 Combine Multiple Listener Types for Layered Agent Architectures + +**Impact:** `MEDIUM` — Using a single listener type forces all traffic through one gateway pattern — combining types lets you serve different clients with the right interface without running multiple Plano instances +**Tags:** `advanced`, `multi-listener`, `architecture`, `agent`, `model`, `prompt` + +## Combine Multiple Listener Types for Layered Agent Architectures + +A single Plano `config.yaml` can define multiple listeners of different types, each on a separate port. This lets you serve different client types simultaneously: an OpenAI-compatible model gateway for direct API clients, a prompt gateway for LLM-callable function applications, and an agent orchestrator for multi-agent workflows — all from one Plano instance sharing the same model providers. + +**Single listener (limited — forces all clients through one interface):** + +```yaml +version: v0.3.0 + +listeners: + - type: model # Only model clients can use this + name: model_gateway + port: 12000 + +# Prompt target clients and agent clients cannot connect +``` + +**Multi-listener architecture (serves all client types):** + +```yaml +version: v0.4.0 + +# --- Shared model providers --- +model_providers: + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + +# --- Shared routing_preferences (top-level, v0.4.0+) --- +routing_preferences: + - name: quick tasks + description: Short answers, formatting, classification, simple generation + models: + - openai/gpt-4o-mini + - name: complex reasoning + description: Multi-step analysis, code generation, research synthesis + models: + - openai/gpt-4o + - anthropic/claude-sonnet-4-20250514 + - name: long documents + description: Summarizing or analyzing very long documents, PDFs, transcripts + models: + - anthropic/claude-sonnet-4-20250514 + - openai/gpt-4o + +# --- Listener 1: OpenAI-compatible API gateway --- +# For: SDK clients, Claude Code, LangChain, etc. 
+listeners: + - type: model + name: model_gateway + port: 12000 + timeout: "120s" + +# --- Listener 2: Prompt function gateway --- +# For: Applications that expose LLM-callable APIs + - type: prompt + name: function_gateway + port: 10000 + timeout: "60s" + +# --- Listener 3: Agent orchestration gateway --- +# For: Multi-agent application clients + - type: agent + name: agent_orchestrator + port: 8000 + timeout: "90s" + router: plano_orchestrator_v1 + agents: + - id: research_agent + description: Searches, synthesizes, and summarizes information from multiple sources. + filter_chain: + - input_guards + - context_builder + - id: code_agent + description: Writes, reviews, debugs, and explains code across all languages. + default: true + +# --- Agents --- +agents: + - id: research_agent + url: http://host.docker.internal:8001 + - id: code_agent + url: http://host.docker.internal:8002 + +# --- Filters --- +filters: + - id: input_guards + url: http://host.docker.internal:10500 + type: mcp + transport: streamable-http + - id: context_builder + url: http://host.docker.internal:10501 + type: mcp + transport: streamable-http + +# --- Prompt targets (for function gateway) --- +endpoints: + internal_api: + endpoint: host.docker.internal + protocol: http + +prompt_targets: + - name: search_knowledge_base + description: Search the internal knowledge base for relevant documents and facts. + parameters: + - name: query + type: str + required: true + description: Search query to find relevant information + endpoint: + name: internal_api + path: /kb/search?q={query} + http_method: GET + +# --- Observability --- +model_aliases: + plano.fast.v1: + target: gpt-4o-mini + plano.smart.v1: + target: gpt-4o + +tracing: + random_sampling: 50 + trace_arch_internal: true + span_attributes: + static: + environment: production + header_prefixes: + - x-katanemo- +``` + +This architecture serves: SDK clients on `:12000`, function-calling apps on `:10000`, and multi-agent orchestration on `:8000` — with shared cost-optimized routing across all three. + +Reference: [https://github.com/katanemo/archgw](https://github.com/katanemo/archgw) + +--- + +### 8.2 Design Prompt Targets with Precise Parameter Schemas + +**Impact:** `HIGH` — Imprecise parameter definitions cause the LLM to hallucinate values, skip required fields, or produce malformed API calls — the schema is the contract between the LLM and your API +**Tags:** `advanced`, `prompt-targets`, `functions`, `llm`, `api-integration` + +## Design Prompt Targets with Precise Parameter Schemas + +`prompt_targets` define functions that Plano's LLM can call autonomously when it determines a user request matches the function's description. The parameter schema tells the LLM exactly what values to extract from user input — vague schemas lead to hallucinated parameters and failed API calls. + +**Incorrect (too few constraints — LLM must guess):** + +```yaml +prompt_targets: + - name: get_flight_info + description: Get flight information + parameters: + - name: flight # What format? "AA123"? "AA 123"? "American 123"? + type: str + required: true + endpoint: + name: flights_api + path: /flight?id={flight} +``` + +**Correct (fully specified schema with descriptions, formats, and enums):** + +```yaml +version: v0.3.0 + +endpoints: + flights_api: + endpoint: api.flightaware.com + protocol: https + connect_timeout: "5s" + +prompt_targets: + - name: get_flight_status + description: > + Get real-time status, gate information, and delays for a specific flight number. 
+ Use when the user asks about a flight's current status, arrival time, or gate. + parameters: + - name: flight_number + description: > + IATA airline code followed by flight number, e.g., "AA123", "UA456", "DL789". + Extract from user message — do not include spaces. + type: str + required: true + format: "^[A-Z]{2}[0-9]{1,4}$" # Regex hint for validation + + - name: date + description: > + Flight date in YYYY-MM-DD format. Use today's date if not specified. + type: str + required: false + format: date + + endpoint: + name: flights_api + path: /flights/{flight_number}?date={date} + http_method: GET + http_headers: + Authorization: "Bearer $FLIGHTAWARE_API_KEY" + + - name: search_flights + description: > + Search for available flights between two cities or airports. + Use when the user wants to find flights, compare options, or book travel. + parameters: + - name: origin + description: Departure airport IATA code (e.g., "JFK", "LAX", "ORD") + type: str + required: true + - name: destination + description: Arrival airport IATA code (e.g., "LHR", "CDG", "NRT") + type: str + required: true + - name: departure_date + description: Departure date in YYYY-MM-DD format + type: str + required: true + format: date + - name: cabin_class + description: Preferred cabin class + type: str + required: false + default: economy + enum: [economy, premium_economy, business, first] + - name: passengers + description: Number of adult passengers (1-9) + type: int + required: false + default: 1 + + endpoint: + name: flights_api + path: /search?from={origin}&to={destination}&date={departure_date}&class={cabin_class}&pax={passengers} + http_method: GET + http_headers: + Authorization: "Bearer $FLIGHTAWARE_API_KEY" + + system_prompt: | + You are a travel assistant. Present flight search results clearly, + highlighting the best value options. Include price, duration, and + number of stops for each option. + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + +listeners: + - type: prompt + name: travel_functions + port: 10000 + timeout: "30s" +``` + +**Key principles:** +- `description` on the target tells the LLM when to call it — be specific about trigger conditions +- `description` on each parameter tells the LLM what value to extract — include format examples +- Use `enum` to constrain categorical values — prevents the LLM from inventing categories +- Use `format: date` or regex patterns to hint at expected format +- Use `default` for optional parameters so the API never receives null values +- `system_prompt` on the target customizes how the LLM formats the API response to the user + +Reference: https://github.com/katanemo/archgw + +--- + +*Generated from individual rule files in `rules/`.* +*To contribute, see [CONTRIBUTING](https://github.com/katanemo/archgw/blob/main/CONTRIBUTING.md).* diff --git a/skills/README.md b/skills/README.md new file mode 100644 index 00000000..d941fb93 --- /dev/null +++ b/skills/README.md @@ -0,0 +1,243 @@ +# Plano Agent Skills + +A structured repository of best practices for building agents and agentic applications with [Plano](https://github.com/katanemo/archgw) — the AI-native proxy and dataplane. Optimized for coding agents and LLMs. + +## What Are Skills? + +Skills are principle-based guides that help coding agents (Claude Code, Cursor, Copilot, etc.) make better decisions when working with Plano. 
They cover configuration patterns, routing strategies, agent orchestration, observability, and CLI workflows — acting as operating principles, not documentation replacements. + +## Installing + +```bash +# Install via npx skills +npx skills add katanemo/plano +``` + +This skills collection is published from the `skills/` directory in the `katanemo/plano` monorepo. + +Install a specific skill: + +```bash +npx skills add katanemo/plano --skill plano-routing-model-selection +``` + +List available skills before install: + +```bash +npx skills add katanemo/plano --list +``` + +## Using Skills in Agents + +After installation, these skills are available to your coding agent and can be invoked with normal language. You do not need special syntax unless your tooling requires it. + +### Natural Language Invocation Examples + +- "Use the Plano skills to validate this `config.yaml` and fix issues." +- "Apply Plano routing best practices to improve model/provider selection." +- "Review this agent listener config with the orchestration rules." +- "Refactor this filter chain to follow guardrail ordering best practices." +- "Audit this setup against Plano deployment and security recommendations." + +### Prompting Tips for Better Results + +- Name your goal and file: "Harden `config.yaml` for production." +- Ask for an action: "Generate a patch," "fix directly," or "explain the changes." +- Include runtime context when relevant: trace output, logs, listener errors. +- Ask for verification: "Run a final validation check after edits." + +### Invoke by Skill Area (Optional) + +- **Configuration:** "Use Plano configuration fundamentals on this config." +- **Routing:** "Use routing/model-selection skills to tune defaults and aliases." +- **Agent orchestration:** "Use agent orchestration skills to improve routing accuracy." +- **Filters/guardrails:** "Use filter-chain skills to harden input/output safety." +- **Observability:** "Use observability skills to add traceability and debug routing." +- **CLI/deployment:** "Use CLI and deployment skills to produce a startup checklist." 
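+
+Putting the tips together, a complete request to your agent might look like: "Use plano-deployment-security on `config.yaml`: harden it for production, generate a patch, then run a final validation check."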
+
+## Available Skills
+
+- `plano-agent-skills` - Umbrella skill covering all Plano areas
+- `plano-config-fundamentals` - Config versioning, listeners, providers, secrets
+- `plano-routing-model-selection` - Defaults, aliases, passthrough auth, preferences
+- `plano-agent-orchestration` - Agent registration and routing descriptions
+- `plano-filter-guardrails` - MCP filters, guardrail messaging, filter ordering
+- `plano-observability-debugging` - Tracing setup, span attributes, trace analysis
+- `plano-cli-operations` - `planoai up`, `cli_agent`, init, prompt target generation
+- `plano-deployment-security` - Docker networking, health checks, state storage
+- `plano-advanced-patterns` - Multi-listener architecture and prompt target schema design
+
+## Local Testing
+
+```bash
+# From repo root
+npx skills add ./skills --list
+npx skills add ./skills --skill plano-agent-skills -y
+npx skills list
+```
+
+## Structure
+
+```
+skills/
+├── rules/                  # Individual rule files (one per rule)
+│   ├── _sections.md        # Section metadata and prefix definitions
+│   ├── _template.md        # Template for creating new rules
+│   ├── config-*.md         # Section 1: Configuration Fundamentals
+│   ├── routing-*.md        # Section 2: Routing & Model Selection
+│   ├── agent-*.md          # Section 3: Agent Orchestration
+│   ├── filter-*.md         # Section 4: Filter Chains & Guardrails
+│   ├── observe-*.md        # Section 5: Observability & Debugging
+│   ├── cli-*.md            # Section 6: CLI Operations
+│   ├── deploy-*.md         # Section 7: Deployment & Security
+│   └── advanced-*.md       # Section 8: Advanced Patterns
+├── src/
+│   ├── build.ts            # Compiles rules/ into AGENTS.md
+│   ├── validate.ts         # Validates rule files
+│   └── extract-tests.ts    # Extracts test cases for LLM evaluation
+├── metadata.json           # Document metadata
+├── AGENTS.md               # Compiled output (generated — do not edit directly)
+├── test-cases.json         # Test cases for LLM evaluation (generated)
+└── package.json
+```
+
+## Sections
+
+| # | Prefix | Section | Rules |
+|---|--------|---------|-------|
+| 1 | `config-` | Configuration Fundamentals | Version, listeners, providers, secrets, timeouts |
+| 2 | `routing-` | Routing & Model Selection | Preferences, aliases, defaults, passthrough |
+| 3 | `agent-` | Agent Orchestration | Descriptions, agent registration |
+| 4 | `filter-` | Filter Chains & Guardrails | Ordering, MCP integration, guardrails |
+| 5 | `observe-` | Observability & Debugging | Tracing, trace inspection, span attributes |
+| 6 | `cli-` | CLI Operations | Startup, CLI agent, init, code generation |
+| 7 | `deploy-` | Deployment & Security | Docker networking, state storage, health checks |
+| 8 | `advanced-` | Advanced Patterns | Prompt targets, rate limits, multi-listener |
+
+## Getting Started
+
+```bash
+# Install dependencies
+npm install
+
+# Validate all rule files
+npm run validate
+
+# Build AGENTS.md from rules
+npm run build
+
+# Extract test cases for LLM evaluation
+npm run extract-tests
+
+# Run all of the above
+npm run dev
+```
+
+## Creating a New Rule
+
+1. Copy `rules/_template.md` to `rules/<prefix>-<rule-name>.md`
+
+2. Choose the correct prefix for your section:
+   - `config-` — Configuration Fundamentals
+   - `routing-` — Routing & Model Selection
+   - `agent-` — Agent Orchestration
+   - `filter-` — Filter Chains & Guardrails
+   - `observe-` — Observability & Debugging
+   - `cli-` — CLI Operations
+   - `deploy-` — Deployment & Security
+   - `advanced-` — Advanced Patterns
+
+3.
Fill in the frontmatter:
+   ```yaml
+   ---
+   title: Clear, Actionable Rule Title
+   impact: HIGH
+   impactDescription: One-line description of why this matters
+   tags: config, routing, relevant-tags
+   ---
+   ```
+
+4. Write the rule body with:
+   - Brief explanation of the principle and why it matters
+   - **Incorrect** example (YAML config or CLI command showing the wrong pattern)
+   - **Correct** example (the right pattern with comments)
+   - Optional explanatory notes
+
+5. Run `npm run dev` to validate and regenerate
+
+## Rule File Structure
+
+````markdown
+---
+title: Rule Title Here
+impact: CRITICAL
+impactDescription: One sentence on the impact
+tags: tag1, tag2, tag3
+---
+
+## Rule Title Here
+
+Brief explanation of the rule and why it matters for Plano developers.
+
+**Incorrect (describe what's wrong):**
+
+```yaml
+# Bad example
+```
+
+**Correct (describe what's right):**
+
+```yaml
+# Good example with comments explaining the decisions
+```
+
+Optional explanatory text, lists, or tables.
+
+Reference: https://github.com/katanemo/archgw
+````
+
+## Impact Levels
+
+| Level | Description |
+|-------|-------------|
+| `CRITICAL` | Causes startup failures or silent misbehavior — always fix |
+| `HIGH` | Significantly degrades routing accuracy, security, or reliability |
+| `MEDIUM-HIGH` | Important for production deployments |
+| `MEDIUM` | Best practice for maintainability and developer experience |
+| `LOW-MEDIUM` | Incremental improvements |
+| `LOW` | Nice to have |
+
+## Key Rules at a Glance
+
+- **Always set `version: v0.3.0`** — config is rejected without it
+- **Use `host.docker.internal`** for agent/filter URLs — `localhost` doesn't work inside Docker
+- **Set exactly one `default: true` provider** — unmatched requests need a fallback
+- **Write specific routing preference descriptions** — vague descriptions cause misroutes
+- **Order filter chains: guards → rewriters → context builders** — never build context before blocking bad input
+- **Use `$VAR_NAME` for all secrets** — never hardcode API keys in config.yaml
+- **Enable tracing with `--with-tracing`** — traces are the primary debugging tool
+
+## Scripts
+
+| Command | Description |
+|---------|-------------|
+| `npm run build` | Compile `rules/` into `AGENTS.md` |
+| `npm run validate` | Validate all rule files for required fields and structure |
+| `npm run extract-tests` | Generate `test-cases.json` for LLM evaluation |
+| `npm run dev` | Validate + build + extract tests |
+
+## Contributing
+
+Rules are automatically sorted alphabetically by title within each section — no need to manage numbers. IDs (`1.1`, `1.2`, etc.) are assigned during build.
+
+When adding rules:
+1. Use the correct filename prefix for your section
+2. Follow `_template.md` structure
+3. Include clear bad/good YAML or CLI examples
+4. Add relevant tags
+5. Run `npm run dev` to validate and regenerate
+
+## License
+
+Apache-2.0 — see [LICENSE](../LICENSE)
diff --git a/skills/metadata.json b/skills/metadata.json
new file mode 100644
index 00000000..f1f754ab
--- /dev/null
+++ b/skills/metadata.json
@@ -0,0 +1,8 @@
+{
+  "version": "1.0.0",
+  "organization": "Plano",
+  "name": "plano-agent-skills",
+  "abstract": "Best practices for building agents and agentic applications with Plano — the AI-native proxy and dataplane.
Covers configuration, routing, agent orchestration, filter chains, observability, CLI operations, and deployment patterns.", + "homepage": "https://github.com/katanemo/archgw", + "license": "Apache-2.0" +} diff --git a/skills/package-lock.json b/skills/package-lock.json new file mode 100644 index 00000000..080a8c7f --- /dev/null +++ b/skills/package-lock.json @@ -0,0 +1,594 @@ +{ + "name": "plano-agent-skills", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "plano-agent-skills", + "version": "1.0.0", + "license": "Apache-2.0", + "devDependencies": { + "@types/node": "^24.3.0", + "tsx": "^4.20.5", + "typescript": "^5.9.2" + }, + "engines": { + "node": ">=18.0.0" + } + }, + "node_modules/@esbuild/aix-ppc64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.27.3.tgz", + "integrity": "sha512-9fJMTNFTWZMh5qwrBItuziu834eOCUcEqymSH7pY+zoMVEZg3gcPuBNxH1EvfVYe9h0x/Ptw8KBzv7qxb7l8dg==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.27.3.tgz", + "integrity": "sha512-i5D1hPY7GIQmXlXhs2w8AWHhenb00+GxjxRncS2ZM7YNVGNfaMxgzSGuO8o8SJzRc/oZwU2bcScvVERk03QhzA==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.27.3.tgz", + "integrity": "sha512-YdghPYUmj/FX2SYKJ0OZxf+iaKgMsKHVPF1MAq/P8WirnSpCStzKJFjOjzsW0QQ7oIAiccHdcqjbHmJxRb/dmg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.27.3.tgz", + "integrity": "sha512-IN/0BNTkHtk8lkOM8JWAYFg4ORxBkZQf9zXiEOfERX/CzxW3Vg1ewAhU7QSWQpVIzTW+b8Xy+lGzdYXV6UZObQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-arm64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.27.3.tgz", + "integrity": "sha512-Re491k7ByTVRy0t3EKWajdLIr0gz2kKKfzafkth4Q8A5n1xTHrkqZgLLjFEHVD+AXdUGgQMq+Godfq45mGpCKg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.27.3.tgz", + "integrity": "sha512-vHk/hA7/1AckjGzRqi6wbo+jaShzRowYip6rt6q7VYEDX4LEy1pZfDpdxCBnGtl+A5zq8iXDcyuxwtv3hNtHFg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.27.3.tgz", + "integrity": "sha512-ipTYM2fjt3kQAYOvo6vcxJx3nBYAzPjgTCk7QEgZG8AUO3ydUhvelmhrbOheMnGOlaSFUoHXB6un+A7q4ygY9w==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": 
"MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.27.3.tgz", + "integrity": "sha512-dDk0X87T7mI6U3K9VjWtHOXqwAMJBNN2r7bejDsc+j03SEjtD9HrOl8gVFByeM0aJksoUuUVU9TBaZa2rgj0oA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.27.3.tgz", + "integrity": "sha512-s6nPv2QkSupJwLYyfS+gwdirm0ukyTFNl3KTgZEAiJDd+iHZcbTPPcWCcRYH+WlNbwChgH2QkE9NSlNrMT8Gfw==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.27.3.tgz", + "integrity": "sha512-sZOuFz/xWnZ4KH3YfFrKCf1WyPZHakVzTiqji3WDc0BCl2kBwiJLCXpzLzUBLgmp4veFZdvN5ChW4Eq/8Fc2Fg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.27.3.tgz", + "integrity": "sha512-yGlQYjdxtLdh0a3jHjuwOrxQjOZYD/C9PfdbgJJF3TIZWnm/tMd/RcNiLngiu4iwcBAOezdnSLAwQDPqTmtTYg==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.27.3.tgz", + "integrity": "sha512-WO60Sn8ly3gtzhyjATDgieJNet/KqsDlX5nRC5Y3oTFcS1l0KWba+SEa9Ja1GfDqSF1z6hif/SkpQJbL63cgOA==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.27.3.tgz", + "integrity": "sha512-APsymYA6sGcZ4pD6k+UxbDjOFSvPWyZhjaiPyl/f79xKxwTnrn5QUnXR5prvetuaSMsb4jgeHewIDCIWljrSxw==", + "cpu": [ + "mips64el" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.27.3.tgz", + "integrity": "sha512-eizBnTeBefojtDb9nSh4vvVQ3V9Qf9Df01PfawPcRzJH4gFSgrObw+LveUyDoKU3kxi5+9RJTCWlj4FjYXVPEA==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.27.3.tgz", + "integrity": "sha512-3Emwh0r5wmfm3ssTWRQSyVhbOHvqegUDRd0WhmXKX2mkHJe1SFCMJhagUleMq+Uci34wLSipf8Lagt4LlpRFWQ==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.27.3", + "resolved": 
"https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.27.3.tgz", + "integrity": "sha512-pBHUx9LzXWBc7MFIEEL0yD/ZVtNgLytvx60gES28GcWMqil8ElCYR4kvbV2BDqsHOvVDRrOxGySBM9Fcv744hw==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-x64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.27.3.tgz", + "integrity": "sha512-Czi8yzXUWIQYAtL/2y6vogER8pvcsOsk5cpwL4Gk5nJqH5UZiVByIY8Eorm5R13gq+DQKYg0+JyQoytLQas4dA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-arm64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.27.3.tgz", + "integrity": "sha512-sDpk0RgmTCR/5HguIZa9n9u+HVKf40fbEUt+iTzSnCaGvY9kFP0YKBWZtJaraonFnqef5SlJ8/TiPAxzyS+UoA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.27.3.tgz", + "integrity": "sha512-P14lFKJl/DdaE00LItAukUdZO5iqNH7+PjoBm+fLQjtxfcfFE20Xf5CrLsmZdq5LFFZzb5JMZ9grUwvtVYzjiA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-arm64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.27.3.tgz", + "integrity": "sha512-AIcMP77AvirGbRl/UZFTq5hjXK+2wC7qFRGoHSDrZ5v5b8DK/GYpXW3CPRL53NkvDqb9D+alBiC/dV0Fb7eJcw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-x64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.27.3.tgz", + "integrity": "sha512-DnW2sRrBzA+YnE70LKqnM3P+z8vehfJWHXECbwBmH/CU51z6FiqTQTHFenPlHmo3a8UgpLyH3PT+87OViOh1AQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openharmony-arm64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/openharmony-arm64/-/openharmony-arm64-0.27.3.tgz", + "integrity": "sha512-NinAEgr/etERPTsZJ7aEZQvvg/A6IsZG/LgZy+81wON2huV7SrK3e63dU0XhyZP4RKGyTm7aOgmQk0bGp0fy2g==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.27.3.tgz", + "integrity": "sha512-PanZ+nEz+eWoBJ8/f8HKxTTD172SKwdXebZ0ndd953gt1HRBbhMsaNqjTyYLGLPdoWHy4zLU7bDVJztF5f3BHA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.27.3.tgz", + "integrity": "sha512-B2t59lWWYrbRDw/tjiWOuzSsFh1Y/E95ofKz7rIVYSQkUYBjfSgf6oeYPNWHToFRr2zx52JKApIcAS/D5TUBnA==", + "cpu": [ + "arm64" + ], + 
"dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.27.3.tgz", + "integrity": "sha512-QLKSFeXNS8+tHW7tZpMtjlNb7HKau0QDpwm49u0vUp9y1WOF+PEzkU84y9GqYaAVW8aH8f3GcBck26jh54cX4Q==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.27.3.tgz", + "integrity": "sha512-4uJGhsxuptu3OcpVAzli+/gWusVGwZZHTlS63hh++ehExkVT8SgiEf7/uC/PclrPPkLhZqGgCTjd0VWLo6xMqA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@types/node": { + "version": "24.11.0", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.11.0.tgz", + "integrity": "sha512-fPxQqz4VTgPI/IQ+lj9r0h+fDR66bzoeMGHp8ASee+32OSGIkeASsoZuJixsQoVef1QJbeubcPBxKk22QVoWdw==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + } + }, + "node_modules/esbuild": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.27.3.tgz", + "integrity": "sha512-8VwMnyGCONIs6cWue2IdpHxHnAjzxnw2Zr7MkVxB2vjmQ2ivqGFb4LEG3SMnv0Gb2F/G/2yA8zUaiL1gywDCCg==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.27.3", + "@esbuild/android-arm": "0.27.3", + "@esbuild/android-arm64": "0.27.3", + "@esbuild/android-x64": "0.27.3", + "@esbuild/darwin-arm64": "0.27.3", + "@esbuild/darwin-x64": "0.27.3", + "@esbuild/freebsd-arm64": "0.27.3", + "@esbuild/freebsd-x64": "0.27.3", + "@esbuild/linux-arm": "0.27.3", + "@esbuild/linux-arm64": "0.27.3", + "@esbuild/linux-ia32": "0.27.3", + "@esbuild/linux-loong64": "0.27.3", + "@esbuild/linux-mips64el": "0.27.3", + "@esbuild/linux-ppc64": "0.27.3", + "@esbuild/linux-riscv64": "0.27.3", + "@esbuild/linux-s390x": "0.27.3", + "@esbuild/linux-x64": "0.27.3", + "@esbuild/netbsd-arm64": "0.27.3", + "@esbuild/netbsd-x64": "0.27.3", + "@esbuild/openbsd-arm64": "0.27.3", + "@esbuild/openbsd-x64": "0.27.3", + "@esbuild/openharmony-arm64": "0.27.3", + "@esbuild/sunos-x64": "0.27.3", + "@esbuild/win32-arm64": "0.27.3", + "@esbuild/win32-ia32": "0.27.3", + "@esbuild/win32-x64": "0.27.3" + } + }, + "node_modules/fsevents": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", + "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/get-tsconfig": { + "version": "4.13.6", + "resolved": "https://registry.npmjs.org/get-tsconfig/-/get-tsconfig-4.13.6.tgz", + "integrity": "sha512-shZT/QMiSHc/YBLxxOkMtgSid5HFoauqCE3/exfsEcwg1WkeqjG+V40yBbBrsD+jW2HDXcs28xOfcbm2jI8Ddw==", + "dev": true, + "license": "MIT", + "dependencies": { + "resolve-pkg-maps": "^1.0.0" + }, + "funding": { + "url": "https://github.com/privatenumber/get-tsconfig?sponsor=1" + } + }, + "node_modules/resolve-pkg-maps": { + "version": "1.0.0", + 
"resolved": "https://registry.npmjs.org/resolve-pkg-maps/-/resolve-pkg-maps-1.0.0.tgz", + "integrity": "sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==", + "dev": true, + "license": "MIT", + "funding": { + "url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1" + } + }, + "node_modules/tsx": { + "version": "4.21.0", + "resolved": "https://registry.npmjs.org/tsx/-/tsx-4.21.0.tgz", + "integrity": "sha512-5C1sg4USs1lfG0GFb2RLXsdpXqBSEhAaA/0kPL01wxzpMqLILNxIxIOKiILz+cdg/pLnOUxFYOR5yhHU666wbw==", + "dev": true, + "license": "MIT", + "dependencies": { + "esbuild": "~0.27.0", + "get-tsconfig": "^4.7.5" + }, + "bin": { + "tsx": "dist/cli.mjs" + }, + "engines": { + "node": ">=18.0.0" + }, + "optionalDependencies": { + "fsevents": "~2.3.3" + } + }, + "node_modules/typescript": { + "version": "5.9.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "7.16.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz", + "integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==", + "dev": true, + "license": "MIT" + } + } +} diff --git a/skills/package.json b/skills/package.json new file mode 100644 index 00000000..eb33002f --- /dev/null +++ b/skills/package.json @@ -0,0 +1,31 @@ +{ + "name": "plano-agent-skills", + "version": "1.0.0", + "description": "Best practices for building agents and agentic applications with Plano — installable via npx skills add", + "type": "module", + "scripts": { + "typecheck": "tsc --noEmit", + "build": "tsx src/build.ts", + "validate": "tsx src/validate.ts", + "extract-tests": "tsx src/extract-tests.ts", + "dev": "npm run typecheck && npm run validate && npm run build && npm run extract-tests" + }, + "keywords": [ + "plano", + "archgw", + "ai-gateway", + "agent", + "llm", + "skills", + "best-practices" + ], + "license": "Apache-2.0", + "engines": { + "node": ">=18.0.0" + }, + "devDependencies": { + "@types/node": "^24.3.0", + "tsx": "^4.20.5", + "typescript": "^5.9.2" + } +} diff --git a/skills/plano-advanced-patterns/SKILL.md b/skills/plano-advanced-patterns/SKILL.md new file mode 100644 index 00000000..7e2f1b00 --- /dev/null +++ b/skills/plano-advanced-patterns/SKILL.md @@ -0,0 +1,32 @@ +--- +name: plano-advanced-patterns +description: Design advanced Plano architectures. Use for multi-listener systems, prompt target schema quality, and layered orchestration patterns. +license: Apache-2.0 +metadata: + author: katanemo + version: "1.0.0" +--- + +# Plano Advanced Patterns + +Use this skill for higher-order architecture decisions once fundamentals are stable. + +## When To Use + +- "Design a multi-listener Plano architecture" +- "Improve prompt target schema precision" +- "Combine model, prompt, and agent listeners" +- "Refine advanced routing/function-calling behavior" + +## Apply These Rules + +- `advanced-multi-listener` +- `advanced-prompt-targets` + +## Execution Checklist + +1. Use multiple listeners only when interfaces are truly distinct. +2. Keep provider/routing definitions shared and consistent. +3. Define prompt target parameters with strict, explicit schemas. +4. 
Minimize ambiguity that causes malformed tool calls. +5. Provide migration-safe recommendations and test scenarios. diff --git a/skills/plano-agent-orchestration/SKILL.md b/skills/plano-agent-orchestration/SKILL.md new file mode 100644 index 00000000..90f25beb --- /dev/null +++ b/skills/plano-agent-orchestration/SKILL.md @@ -0,0 +1,32 @@ +--- +name: plano-agent-orchestration +description: Improve multi-agent orchestration in Plano. Use for agent registration, agent listener wiring, and capability-focused agent descriptions for accurate routing. +license: Apache-2.0 +metadata: + author: katanemo + version: "1.0.0" +--- + +# Plano Agent Orchestration + +Use this skill for agent listener quality, sub-agent registration, and route accuracy. + +## When To Use + +- "Fix multi-agent routing" +- "Validate agents vs listeners.agents config" +- "Improve agent descriptions" +- "Set up a reliable orchestrator" + +## Apply These Rules + +- `agent-orchestration` +- `agent-descriptions` + +## Execution Checklist + +1. Verify each agent exists in both `agents` and `listeners[].agents`. +2. Ensure one fallback/default agent where appropriate. +3. Rewrite descriptions to be capability-focused and non-overlapping. +4. Keep descriptions specific, concise, and example-driven. +5. Provide test prompts to validate routing outcomes. diff --git a/skills/plano-agent-skills/SKILL.md b/skills/plano-agent-skills/SKILL.md new file mode 100644 index 00000000..e6ecbb20 --- /dev/null +++ b/skills/plano-agent-skills/SKILL.md @@ -0,0 +1,53 @@ +--- +name: plano-agent-skills +description: Best practices for building agents and agentic applications with Plano, including configuration, routing, orchestration, guardrails, observability, and deployment. +license: Apache-2.0 +metadata: + author: katanemo + version: "1.0.0" +--- + +# Plano Agent Skills + +Comprehensive Plano guidance for coding agents. Use this umbrella skill when a task spans multiple areas (config, routing, orchestration, filters, observability, CLI, deployment). + +## When To Use + +- Validating or fixing Plano `config.yaml` +- Designing listener architecture (`model`, `prompt`, `agent`) +- Improving model/provider routing quality and fallback behavior +- Hardening filter chains and prompt guardrails +- Debugging routing with traces and CLI workflows +- Preparing deployment and production readiness checks + +## How To Use + +1. Classify the request by scope (single section vs. cross-cutting). +2. For focused work, prefer a section-specific skill (for example `plano-routing-model-selection`). +3. For broad work, apply this umbrella skill and reference section rules from `skills/AGENTS.md`. +4. Produce concrete edits first, then concise reasoning and validation steps. + +## Operating Workflow + +1. Identify the task area first: config, routing, orchestration, filters, observability, CLI, or deployment. +2. Apply the smallest correct change that satisfies the requested behavior. +3. Preserve security and reliability defaults: + - `version: v0.3.0` + - exactly one `default: true` model provider + - secrets via `$ENV_VAR` substitution only + - `host.docker.internal` for host services from inside Docker + - guardrails before enrichment in filter chains +4. For debugging, prioritize traces over guesswork (`planoai up --with-tracing`, `planoai trace`). +5. Return concrete diffs and a short validation checklist. + +## Response Style + +- Prefer actionable edits over generic advice. +- Be explicit about why a config choice is correct. 
+- Call out risky patterns (hardcoded secrets, missing default provider, bad filter ordering). +- Keep examples minimal and production-viable. + +## References + +- Repo: https://github.com/katanemo/plano +- Full rulebook: `skills/AGENTS.md` diff --git a/skills/plano-cli-operations/SKILL.md b/skills/plano-cli-operations/SKILL.md new file mode 100644 index 00000000..da25db58 --- /dev/null +++ b/skills/plano-cli-operations/SKILL.md @@ -0,0 +1,34 @@ +--- +name: plano-cli-operations +description: Apply Plano CLI best practices. Use for startup troubleshooting, cli_agent workflows, prompt target generation, and template-based project bootstrapping. +license: Apache-2.0 +metadata: + author: katanemo + version: "1.0.0" +--- + +# Plano CLI Operations + +Use this skill when the task is primarily operational and CLI-driven. + +## When To Use + +- "Fix `planoai up` failures" +- "Use `planoai cli_agent` with coding agents" +- "Generate prompt targets from Python functions" +- "Bootstrap a project with `planoai init` templates" + +## Apply These Rules + +- `cli-startup` +- `cli-agent` +- `cli-generate` +- `cli-init` + +## Execution Checklist + +1. Follow startup validation order before deep debugging. +2. Use `cli_agent` to route coding-agent traffic through Plano. +3. Generate prompt target schema, then wire endpoint details explicitly. +4. Start from templates for reliable first-time setup. +5. Provide a compact runbook with exact CLI commands. diff --git a/skills/plano-config-fundamentals/SKILL.md b/skills/plano-config-fundamentals/SKILL.md new file mode 100644 index 00000000..87b7fbdd --- /dev/null +++ b/skills/plano-config-fundamentals/SKILL.md @@ -0,0 +1,34 @@ +--- +name: plano-config-fundamentals +description: Validate and fix Plano config fundamentals. Use for config versioning, listener types, provider registration, secrets handling, and startup validation failures. +license: Apache-2.0 +metadata: + author: katanemo + version: "1.0.0" +--- + +# Plano Configuration Fundamentals + +Use this skill for foundational `config.yaml` correctness. + +## When To Use + +- "Validate this Plano config" +- "Fix startup config errors" +- "Check listeners/providers/secrets" +- "Why does `planoai up` fail schema validation?" + +## Apply These Rules + +- `config-version` +- `config-listeners` +- `config-providers` +- `config-secrets` + +## Execution Checklist + +1. Ensure `version: v0.3.0` is present. +2. Confirm listener type matches intended architecture. +3. Verify provider names/interfaces and exactly one default provider. +4. Replace hardcoded secrets with `$ENV_VAR` substitution. +5. Return minimal patch and a `planoai up` verification plan. diff --git a/skills/plano-deployment-security/SKILL.md b/skills/plano-deployment-security/SKILL.md new file mode 100644 index 00000000..48256777 --- /dev/null +++ b/skills/plano-deployment-security/SKILL.md @@ -0,0 +1,33 @@ +--- +name: plano-deployment-security +description: Apply Plano deployment and production security practices. Use for Docker networking, state storage choices, readiness checks, and environment-based secret handling. +license: Apache-2.0 +metadata: + author: katanemo + version: "1.0.0" +--- + +# Plano Deployment and Security + +Use this skill to harden production deployments and reduce runtime surprises. 
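+
+A minimal sketch of the hardened baseline this skill drives toward (hostnames, ports, and the single agent are illustrative):
+
+```yaml
+version: v0.3.0
+
+state_storage:
+  type: postgres                            # persistent across restarts
+  connection_string: "postgresql://${DB_USER}:${DB_PASS}@${DB_HOST}:5432/${DB_NAME}"
+
+model_providers:
+  - model: openai/gpt-4o
+    access_key: $OPENAI_API_KEY             # env substitution, never hardcoded
+    default: true
+
+agents:
+  - id: support_agent
+    url: http://host.docker.internal:8001   # host service, not localhost
+
+listeners:
+  - type: agent
+    name: orchestrator
+    port: 8000
+    router: plano_orchestrator_v1
+    agents:
+      - id: support_agent
+        description: Customer support assistant with conversation history.
+        default: true
+```
+
+Verify `/healthz` on the listener port before sending traffic.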
+ +## When To Use + +- "Fix unreachable agents in Docker" +- "Configure persistent conversation state" +- "Add readiness and health checks" +- "Prepare production deployment checklist" + +## Apply These Rules + +- `deploy-docker` +- `deploy-state` +- `deploy-health` + +## Execution Checklist + +1. Use `host.docker.internal` for host-side services from inside Plano container. +2. Prefer PostgreSQL state storage for production multi-turn workloads. +3. Verify `/healthz` before traffic or CI assertions. +4. Ensure secrets remain environment-based, never hardcoded. +5. Return deployment checks with failure-mode diagnostics. diff --git a/skills/plano-filter-guardrails/SKILL.md b/skills/plano-filter-guardrails/SKILL.md new file mode 100644 index 00000000..2f19e67b --- /dev/null +++ b/skills/plano-filter-guardrails/SKILL.md @@ -0,0 +1,33 @@ +--- +name: plano-filter-guardrails +description: Harden Plano filter chains and guardrails. Use for MCP filter setup, prompt guard responses, and safe filter ordering. +license: Apache-2.0 +metadata: + author: katanemo + version: "1.0.0" +--- + +# Plano Filter Chains and Guardrails + +Use this skill when safety controls or filter pipelines need correction. + +## When To Use + +- "Fix filter chain ordering" +- "Set up MCP filters correctly" +- "Improve guardrail rejection behavior" +- "Harden request processing for safety" + +## Apply These Rules + +- `filter-mcp` +- `filter-guardrails` +- `filter-ordering` + +## Execution Checklist + +1. Configure filter `type`, `transport`, and `tool` explicitly for MCP. +2. Ensure rejection messages are clear and actionable. +3. Order chain as guards -> rewriters -> enrichment -> output checks. +4. Prevent expensive enrichment on unsafe requests. +5. Verify with representative blocked and allowed test prompts. diff --git a/skills/plano-observability-debugging/SKILL.md b/skills/plano-observability-debugging/SKILL.md new file mode 100644 index 00000000..c4039a7f --- /dev/null +++ b/skills/plano-observability-debugging/SKILL.md @@ -0,0 +1,33 @@ +--- +name: plano-observability-debugging +description: Improve Plano tracing and debugging workflows. Use for sampling strategy, span attributes, and trace query-based root-cause analysis. +license: Apache-2.0 +metadata: + author: katanemo + version: "1.0.0" +--- + +# Plano Observability and Debugging + +Use this skill to make routing and latency behavior inspectable and debuggable. + +## When To Use + +- "Enable tracing correctly" +- "Add useful span attributes" +- "Debug why a request routed incorrectly" +- "Inspect filter/model latency from traces" + +## Apply These Rules + +- `observe-tracing` +- `observe-span-attributes` +- `observe-trace-query` + +## Execution Checklist + +1. Enable tracing with environment-appropriate sampling. +2. Add useful static and header-derived span attributes. +3. Use `planoai trace` filters to isolate route and latency issues. +4. Prefer trace evidence over assumptions in recommendations. +5. Return exact commands to reproduce and validate findings. diff --git a/skills/plano-routing-model-selection/SKILL.md b/skills/plano-routing-model-selection/SKILL.md new file mode 100644 index 00000000..083f21c8 --- /dev/null +++ b/skills/plano-routing-model-selection/SKILL.md @@ -0,0 +1,34 @@ +--- +name: plano-routing-model-selection +description: Optimize Plano model routing and selection. Use for provider defaults, model aliases, passthrough auth, and routing preference quality. 
+license: Apache-2.0 +metadata: + author: katanemo + version: "1.0.0" +--- + +# Plano Routing and Model Selection + +Use this skill when requests are routed to the wrong model, costs are high, or fallback behavior is unclear. + +## When To Use + +- "Improve model routing" +- "Add aliases and defaults" +- "Fix passthrough auth with proxy providers" +- "Tune routing preferences for better classification" + +## Apply These Rules + +- `routing-default` +- `routing-aliases` +- `routing-passthrough` +- `routing-preferences` + +## Execution Checklist + +1. Ensure exactly one `default: true` provider. +2. Add semantic aliases for stable client contracts. +3. Configure passthrough auth only where required. +4. Rewrite vague preference descriptions with concrete task scopes. +5. Validate routing behavior using trace-based checks. diff --git a/skills/rules/_sections.md b/skills/rules/_sections.md new file mode 100644 index 00000000..a74c77f8 --- /dev/null +++ b/skills/rules/_sections.md @@ -0,0 +1,16 @@ +# Section Definitions + +This file defines the sections used to organize Plano agent skills rules. +Files are assigned to sections based on their filename prefix. + + +| Prefix | Section # | Title | Impact | Description | +| ----------- | --------- | -------------------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------- | +| `config-` | 1 | Configuration Fundamentals | CRITICAL | Core config.yaml structure, versioning, listener types, and provider setup — the entry point for every Plano deployment | +| `routing-` | 2 | Routing & Model Selection | HIGH | Intelligent LLM routing using preferences, aliases, and defaults to match tasks to the best model | +| `agent-` | 3 | Agent Orchestration | HIGH | Multi-agent patterns, agent descriptions, and orchestration strategies for building agentic applications | +| `filter-` | 4 | Filter Chains & Guardrails | HIGH | Request/response processing pipelines — ordering, MCP integration, and safety guardrails | +| `observe-` | 5 | Observability & Debugging | MEDIUM-HIGH | OpenTelemetry tracing, log levels, span attributes, and sampling for production visibility | +| `cli-` | 6 | CLI Operations | MEDIUM | Using the planoai CLI for startup, tracing, CLI agents, project init, and code generation | +| `deploy-` | 7 | Deployment & Security | HIGH | Docker deployment, environment variable management, health checks, and state storage for production | +| `advanced-` | 8 | Advanced Patterns | MEDIUM | Prompt targets, external API integration, and multi-listener architectures | diff --git a/skills/rules/_template.md b/skills/rules/_template.md new file mode 100644 index 00000000..9566063e --- /dev/null +++ b/skills/rules/_template.md @@ -0,0 +1,26 @@ +--- +title: Rule Title Here +impact: MEDIUM +impactDescription: Optional one-line description of the impact +tags: tag1, tag2, tag3 +--- + +## Rule Title Here + +Brief explanation of what this rule is and why it matters for Plano developers and agents. + +**Incorrect (explain what's wrong):** + +```yaml +# Bad config or CLI example +``` + +**Correct (explain what's right):** + +```yaml +# Good config or CLI example +``` + +Optional explanatory text elaborating on the principle or listing key points. 
+ +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/advanced-multi-listener.md b/skills/rules/advanced-multi-listener.md new file mode 100644 index 00000000..81c8d4d9 --- /dev/null +++ b/skills/rules/advanced-multi-listener.md @@ -0,0 +1,139 @@ +--- +title: Combine Multiple Listener Types for Layered Agent Architectures +impact: MEDIUM +impactDescription: Using a single listener type forces all traffic through one gateway pattern — combining types lets you serve different clients with the right interface without running multiple Plano instances +tags: advanced, multi-listener, architecture, agent, model, prompt +--- + +## Combine Multiple Listener Types for Layered Agent Architectures + +A single Plano `config.yaml` can define multiple listeners of different types, each on a separate port. This lets you serve different client types simultaneously: an OpenAI-compatible model gateway for direct API clients, a prompt gateway for LLM-callable function applications, and an agent orchestrator for multi-agent workflows — all from one Plano instance sharing the same model providers. + +**Single listener (limited — forces all clients through one interface):** + +```yaml +version: v0.3.0 + +listeners: + - type: model # Only model clients can use this + name: model_gateway + port: 12000 + +# Prompt target clients and agent clients cannot connect +``` + +**Multi-listener architecture (serves all client types):** + +```yaml +version: v0.3.0 + +# --- Shared model providers --- +model_providers: + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true + routing_preferences: + - name: quick tasks + description: Short answers, formatting, classification, simple generation + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + routing_preferences: + - name: complex reasoning + description: Multi-step analysis, code generation, research synthesis + + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + routing_preferences: + - name: long documents + description: Summarizing or analyzing very long documents, PDFs, transcripts + +# --- Listener 1: OpenAI-compatible API gateway --- +# For: SDK clients, Claude Code, LangChain, etc. +listeners: + - type: model + name: model_gateway + port: 12000 + timeout: "120s" + +# --- Listener 2: Prompt function gateway --- +# For: Applications that expose LLM-callable APIs + - type: prompt + name: function_gateway + port: 10000 + timeout: "60s" + +# --- Listener 3: Agent orchestration gateway --- +# For: Multi-agent application clients + - type: agent + name: agent_orchestrator + port: 8000 + timeout: "90s" + router: plano_orchestrator_v1 + agents: + - id: research_agent + description: Searches, synthesizes, and summarizes information from multiple sources. + filter_chain: + - input_guards + - context_builder + - id: code_agent + description: Writes, reviews, debugs, and explains code across all languages. 
+ default: true + +# --- Agents --- +agents: + - id: research_agent + url: http://host.docker.internal:8001 + - id: code_agent + url: http://host.docker.internal:8002 + +# --- Filters --- +filters: + - id: input_guards + url: http://host.docker.internal:10500 + type: mcp + transport: streamable-http + - id: context_builder + url: http://host.docker.internal:10501 + type: mcp + transport: streamable-http + +# --- Prompt targets (for function gateway) --- +endpoints: + internal_api: + endpoint: host.docker.internal + protocol: http + +prompt_targets: + - name: search_knowledge_base + description: Search the internal knowledge base for relevant documents and facts. + parameters: + - name: query + type: str + required: true + description: Search query to find relevant information + endpoint: + name: internal_api + path: /kb/search?q={query} + http_method: GET + +# --- Observability --- +model_aliases: + plano.fast.v1: + target: gpt-4o-mini + plano.smart.v1: + target: gpt-4o + +tracing: + random_sampling: 50 + trace_arch_internal: true + span_attributes: + static: + environment: production + header_prefixes: + - x-katanemo- +``` + +This architecture serves: SDK clients on `:12000`, function-calling apps on `:10000`, and multi-agent orchestration on `:8000` — with shared cost-optimized routing across all three. + +Reference: [https://github.com/katanemo/archgw](https://github.com/katanemo/archgw) diff --git a/skills/rules/advanced-prompt-targets.md b/skills/rules/advanced-prompt-targets.md new file mode 100644 index 00000000..88f376fd --- /dev/null +++ b/skills/rules/advanced-prompt-targets.md @@ -0,0 +1,128 @@ +--- +title: Design Prompt Targets with Precise Parameter Schemas +impact: HIGH +impactDescription: Imprecise parameter definitions cause the LLM to hallucinate values, skip required fields, or produce malformed API calls — the schema is the contract between the LLM and your API +tags: advanced, prompt-targets, functions, llm, api-integration +--- + +## Design Prompt Targets with Precise Parameter Schemas + +`prompt_targets` define functions that Plano's LLM can call autonomously when it determines a user request matches the function's description. The parameter schema tells the LLM exactly what values to extract from user input — vague schemas lead to hallucinated parameters and failed API calls. + +**Incorrect (too few constraints — LLM must guess):** + +```yaml +prompt_targets: + - name: get_flight_info + description: Get flight information + parameters: + - name: flight # What format? "AA123"? "AA 123"? "American 123"? + type: str + required: true + endpoint: + name: flights_api + path: /flight?id={flight} +``` + +**Correct (fully specified schema with descriptions, formats, and enums):** + +```yaml +version: v0.3.0 + +endpoints: + flights_api: + endpoint: api.flightaware.com + protocol: https + connect_timeout: "5s" + +prompt_targets: + - name: get_flight_status + description: > + Get real-time status, gate information, and delays for a specific flight number. + Use when the user asks about a flight's current status, arrival time, or gate. + parameters: + - name: flight_number + description: > + IATA airline code followed by flight number, e.g., "AA123", "UA456", "DL789". + Extract from user message — do not include spaces. + type: str + required: true + format: "^[A-Z]{2}[0-9]{1,4}$" # Regex hint for validation + + - name: date + description: > + Flight date in YYYY-MM-DD format. Use today's date if not specified. 
+ type: str + required: false + format: date + + endpoint: + name: flights_api + path: /flights/{flight_number}?date={date} + http_method: GET + http_headers: + Authorization: "Bearer $FLIGHTAWARE_API_KEY" + + - name: search_flights + description: > + Search for available flights between two cities or airports. + Use when the user wants to find flights, compare options, or book travel. + parameters: + - name: origin + description: Departure airport IATA code (e.g., "JFK", "LAX", "ORD") + type: str + required: true + - name: destination + description: Arrival airport IATA code (e.g., "LHR", "CDG", "NRT") + type: str + required: true + - name: departure_date + description: Departure date in YYYY-MM-DD format + type: str + required: true + format: date + - name: cabin_class + description: Preferred cabin class + type: str + required: false + default: economy + enum: [economy, premium_economy, business, first] + - name: passengers + description: Number of adult passengers (1-9) + type: int + required: false + default: 1 + + endpoint: + name: flights_api + path: /search?from={origin}&to={destination}&date={departure_date}&class={cabin_class}&pax={passengers} + http_method: GET + http_headers: + Authorization: "Bearer $FLIGHTAWARE_API_KEY" + + system_prompt: | + You are a travel assistant. Present flight search results clearly, + highlighting the best value options. Include price, duration, and + number of stops for each option. + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + +listeners: + - type: prompt + name: travel_functions + port: 10000 + timeout: "30s" +``` + +**Key principles:** +- `description` on the target tells the LLM when to call it — be specific about trigger conditions +- `description` on each parameter tells the LLM what value to extract — include format examples +- Use `enum` to constrain categorical values — prevents the LLM from inventing categories +- Use `format: date` or regex patterns to hint at expected format +- Use `default` for optional parameters so the API never receives null values +- `system_prompt` on the target customizes how the LLM formats the API response to the user + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/agent-descriptions.md b/skills/rules/agent-descriptions.md new file mode 100644 index 00000000..86728bde --- /dev/null +++ b/skills/rules/agent-descriptions.md @@ -0,0 +1,75 @@ +--- +title: Write Capability-Focused Agent Descriptions for Accurate Routing +impact: HIGH +impactDescription: The orchestrator LLM routes requests purely by reading agent descriptions — poor descriptions cause misroutes to the wrong specialized agent +tags: agent, orchestration, descriptions, routing, multi-agent +--- + +## Write Capability-Focused Agent Descriptions for Accurate Routing + +In an `agent` listener, Plano's orchestrator reads each agent's `description` and routes user requests to the best-matching agent. This is LLM-based intent matching — the description is the entire specification the router sees. Write it as a capability manifest: what can this agent do, what data does it have access to, and what types of requests should it handle? 
+ +**Incorrect (generic, overlapping descriptions):** + +```yaml +listeners: + - type: agent + name: orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: agent_1 + description: Helps users with information # Too generic — matches everything + + - id: agent_2 + description: Also helps users # Indistinguishable from agent_1 +``` + +**Correct (specific capabilities, distinct domains, concrete examples):** + +```yaml +version: v0.3.0 + +agents: + - id: weather_agent + url: http://host.docker.internal:8001 + - id: flight_agent + url: http://host.docker.internal:8002 + - id: hotel_agent + url: http://host.docker.internal:8003 + +listeners: + - type: agent + name: travel_orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: weather_agent + description: > + Provides real-time weather conditions and multi-day forecasts for any city + worldwide. Handles questions about temperature, precipitation, wind, humidity, + sunrise/sunset times, and severe weather alerts. Examples: "What's the weather + in Tokyo?", "Will it rain in London this weekend?", "Sunrise time in New York." + + - id: flight_agent + description: > + Provides live flight status, schedules, gate information, delays, and + aircraft details for any flight number or route between airports. + Handles questions about departures, arrivals, and airline information. + Examples: "Is AA123 on time?", "Flights from JFK to LAX tomorrow." + + - id: hotel_agent + description: > + Searches and books hotel accommodations, compares room types, pricing, + and availability. Handles check-in/check-out dates, amenities, and + cancellation policies. Examples: "Hotels near Times Square for next Friday." +``` + +**Description writing checklist:** +- State the primary domain in the first sentence +- List 3–5 specific data types or question categories this agent handles +- Include 2–3 concrete example user queries in quotes +- Avoid capability overlap between agents — if they overlap, the router will split traffic unpredictably +- Keep descriptions under 150 words — the orchestrator reads all descriptions per request + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/agent-orchestration.md b/skills/rules/agent-orchestration.md new file mode 100644 index 00000000..0e6d7bb3 --- /dev/null +++ b/skills/rules/agent-orchestration.md @@ -0,0 +1,88 @@ +--- +title: Register All Sub-Agents in Both `agents` and `listeners.agents` +impact: CRITICAL +impactDescription: An agent registered only in `agents` but not referenced in a listener's agent list is unreachable; an agent listed in a listener but missing from `agents` causes a startup error +tags: agent, orchestration, config, multi-agent +--- + +## Register All Sub-Agents in Both `agents` and `listeners.agents` + +Plano's agent system has two separate concepts: the global `agents` array (defines the agent's ID and backend URL) and the `listeners[].agents` array (controls which agents are available to an orchestrator and provides their routing descriptions). Both must reference the same agent ID. 
+ +**Incorrect (agent defined globally but not referenced in listener):** + +```yaml +version: v0.3.0 + +agents: + - id: weather_agent + url: http://host.docker.internal:8001 + - id: news_agent # Defined but never referenced in any listener + url: http://host.docker.internal:8002 + +listeners: + - type: agent + name: orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: weather_agent + description: Provides weather forecasts and current conditions. + # news_agent is missing here — the orchestrator cannot route to it +``` + +**Incorrect (listener references an agent ID not in the global agents list):** + +```yaml +agents: + - id: weather_agent + url: http://host.docker.internal:8001 + +listeners: + - type: agent + name: orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: weather_agent + description: Provides weather forecasts. + - id: flights_agent # ID not in global agents[] — startup error + description: Provides flight status information. +``` + +**Correct (every agent ID appears in both places):** + +```yaml +version: v0.3.0 + +agents: + - id: weather_agent + url: http://host.docker.internal:8001 + - id: flights_agent + url: http://host.docker.internal:8002 + - id: hotels_agent + url: http://host.docker.internal:8003 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + +listeners: + - type: agent + name: travel_orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: weather_agent + description: Real-time weather, forecasts, and climate data for any city. + - id: flights_agent + description: Live flight status, schedules, gates, and delays. + - id: hotels_agent + description: Hotel search, availability, pricing, and booking. + default: true # Fallback if no other agent matches +``` + +Set `default: true` on one agent in each listener's agents list to handle unmatched requests. The agent's URL in the global `agents` array is the HTTP endpoint Plano forwards matching requests to — it must be reachable from within the Docker container (use `host.docker.internal` for services on the host). + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/cli-agent.md b/skills/rules/cli-agent.md new file mode 100644 index 00000000..e311e99e --- /dev/null +++ b/skills/rules/cli-agent.md @@ -0,0 +1,86 @@ +--- +title: Use `planoai cli_agent` to Connect Claude Code Through Plano +impact: MEDIUM-HIGH +impactDescription: Running Claude Code directly against provider APIs bypasses Plano's routing, observability, and guardrails — cli_agent routes all Claude Code traffic through your configured Plano instance +tags: cli, cli-agent, claude, coding-agent, integration +--- + +## Use `planoai cli_agent` to Connect Claude Code Through Plano + +`planoai cli_agent` starts a Claude Code session that routes all LLM traffic through your running Plano instance instead of directly to Anthropic. This gives you routing preferences, model aliases, tracing, and guardrails for your coding agent workflows — making Claude Code a first-class citizen of your Plano configuration. + +**Prerequisites:** + +```bash +# 1. Plano must be running with a model listener +planoai up config.yaml + +# 2. ANTHROPIC_API_KEY must be set (Claude Code uses it for auth) +export ANTHROPIC_API_KEY=sk-ant-... 
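+
+# 3. Optional sanity check: confirm the model listener is healthy
+#    (port comes from your config; 12000 in the example below)
+curl -sf http://localhost:12000/healthz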
+```
+
+**Starting the CLI agent:**
+
+```bash
+# Start CLI agent using config.yaml in current directory
+planoai cli_agent claude
+
+# Use a specific config file
+planoai cli_agent claude config.yaml
+
+# Use a config in a different directory
+planoai cli_agent claude --path /path/to/project
+```
+
+**Recommended config for Claude Code routing:**
+
+```yaml
+version: v0.3.0
+
+listeners:
+  - type: model
+    name: claude_code_router
+    port: 12000
+
+model_providers:
+  - model: anthropic/claude-sonnet-4-20250514
+    access_key: $ANTHROPIC_API_KEY
+    default: true
+    routing_preferences:
+      - name: general coding
+        description: >
+          Writing code, debugging, code review, explaining concepts,
+          answering programming questions, general development tasks.
+
+  - model: anthropic/claude-opus-4-6
+    access_key: $ANTHROPIC_API_KEY
+    routing_preferences:
+      - name: complex architecture
+        description: >
+          System design, complex refactoring across many files,
+          architectural decisions, performance optimization, security audits.
+
+model_aliases:
+  claude.fast.v1:
+    target: claude-sonnet-4-20250514
+  claude.smart.v1:
+    target: claude-opus-4-6
+
+tracing:
+  random_sampling: 100
+  trace_arch_internal: true
+
+overrides:
+  upstream_connect_timeout: "10s"
+```
+
+**What happens when cli_agent runs:**
+
+1. Reads your config.yaml to find the model listener port
+2. Configures Claude Code to use `http://localhost:<port>` as its API endpoint
+3. Starts a Claude Code session in your terminal
+4. All Claude Code LLM calls flow through Plano — routing, tracing, and guardrails apply
+
+After your session, use `planoai trace` to inspect every LLM call Claude Code made, which model was selected, and why.
+
+Reference: [https://github.com/katanemo/archgw](https://github.com/katanemo/archgw)
diff --git a/skills/rules/cli-generate.md b/skills/rules/cli-generate.md
new file mode 100644
index 00000000..75ae8e4f
--- /dev/null
+++ b/skills/rules/cli-generate.md
@@ -0,0 +1,91 @@
+---
+title: Generate Prompt Targets from Python Functions with `planoai generate_prompt_targets`
+impact: MEDIUM
+impactDescription: Manually writing prompt_targets YAML for existing Python APIs is error-prone — the generator introspects function signatures and produces correct YAML automatically
+tags: cli, generate, prompt-targets, python, code-generation
+---
+
+## Generate Prompt Targets from Python Functions with `planoai generate_prompt_targets`
+
+`planoai generate_prompt_targets` introspects Python function signatures and docstrings to generate `prompt_targets` YAML for your Plano config. This is the fastest way to expose existing Python APIs as LLM-callable functions without manually writing the YAML schema.
+
+**Python function requirements for generation:**
+- Use simple type annotations: `int`, `float`, `bool`, `str`, `list`, `tuple`, `set`, `dict`
+- Include a docstring describing what the function does (becomes the `description`)
+- Complex Pydantic models must be flattened into primitive typed parameters first
+
+**Example Python file:**
+
+```python
+# api.py
+
+def get_stock_quote(symbol: str, exchange: str = "NYSE") -> dict:
+    """Get the current stock price and trading data for a given stock symbol.
+
+    Returns price, volume, market cap, and 24h change percentage.
+    """
+    # Implementation calls stock API
+    pass
+
+def get_weather_forecast(city: str, days: int = 3, units: str = "celsius") -> dict:
+    """Get the weather forecast for a city.
+
+    Returns temperature, precipitation, and conditions for the specified number of days.
+ """ + pass + +def search_flights(origin: str, destination: str, date: str, passengers: int = 1) -> list: + """Search for available flights between two airports on a given date. + + Date format: YYYY-MM-DD. Returns list of flight options with prices. + """ + pass +``` + +**Running the generator:** + +```bash +planoai generate_prompt_targets --file api.py +``` + +**Generated output (add to your config.yaml):** + +```yaml +prompt_targets: + - name: get_stock_quote + description: Get the current stock price and trading data for a given stock symbol. + parameters: + - name: symbol + type: str + required: true + - name: exchange + type: str + required: false + default: NYSE + # Add endpoint manually: + endpoint: + name: stock_api + path: /quote?symbol={symbol}&exchange={exchange} + + - name: get_weather_forecast + description: Get the weather forecast for a city. + parameters: + - name: city + type: str + required: true + - name: days + type: int + required: false + default: 3 + - name: units + type: str + required: false + default: celsius + endpoint: + name: weather_api + path: /forecast?city={city}&days={days}&units={units} +``` + +After generation, manually add the `endpoint` blocks pointing to your actual API. The generator produces the schema; you wire in the connectivity. + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/cli-init.md b/skills/rules/cli-init.md new file mode 100644 index 00000000..740396ae --- /dev/null +++ b/skills/rules/cli-init.md @@ -0,0 +1,66 @@ +--- +title: Use `planoai init` Templates to Bootstrap New Projects Correctly +impact: MEDIUM +impactDescription: Starting from a blank config.yaml leads to missing required fields and common structural mistakes — templates provide validated, idiomatic starting points +tags: cli, init, templates, getting-started, project-setup +--- + +## Use `planoai init` Templates to Bootstrap New Projects Correctly + +`planoai init` generates a valid `config.yaml` from built-in templates. Each template demonstrates a specific Plano capability with correct structure, realistic examples, and comments. Use this instead of writing config from scratch — it ensures you start with a valid, working configuration. + +**Available templates:** + +| Template ID | What It Demonstrates | Best For | +|---|---|---| +| `sub_agent_orchestration` | Multi-agent routing with specialized sub-agents | Building agentic applications | +| `coding_agent_routing` | Routing preferences + model aliases for coding workflows | Claude Code and coding assistants | +| `preference_aware_routing` | Automatic LLM routing based on task type | Multi-model cost optimization | +| `filter_chain_guardrails` | Input guards, query rewrite, context builder | RAG + safety pipelines | +| `conversational_state_v1_responses` | Stateful conversations with memory | Chatbots, multi-turn assistants | + +**Usage:** + +```bash +# Initialize with a template +planoai init --template sub_agent_orchestration + +# Initialize coding agent routing setup +planoai init --template coding_agent_routing + +# Initialize a RAG with guardrails project +planoai init --template filter_chain_guardrails +``` + +**Typical project setup workflow:** + +```bash +# 1. Create project directory +mkdir my-plano-agent && cd my-plano-agent + +# 2. Bootstrap with the closest matching template +planoai init --template preference_aware_routing + +# 3. Edit config.yaml to add your specific models, agents, and API keys +# (keys are already using $VAR substitution — just set your env vars) + +# 4. 
Create .env file for local development +cat > .env << EOF +OPENAI_API_KEY=sk-proj-... +ANTHROPIC_API_KEY=sk-ant-... +EOF + +echo ".env" >> .gitignore + +# 5. Start Plano +planoai up + +# 6. Test your configuration +curl http://localhost:12000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "gpt-4o", "messages": [{"role": "user", "content": "Hello"}]}' +``` + +Start with `preference_aware_routing` for most LLM gateway use cases and `sub_agent_orchestration` for multi-agent applications. Both can be combined after you understand each independently. + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/cli-startup.md b/skills/rules/cli-startup.md new file mode 100644 index 00000000..2d51927c --- /dev/null +++ b/skills/rules/cli-startup.md @@ -0,0 +1,80 @@ +--- +title: Follow the `planoai up` Validation Workflow Before Debugging Runtime Issues +impact: HIGH +impactDescription: `planoai up` validates config, checks API keys, and health-checks all listeners — skipping this diagnostic information leads to unnecessary debugging of container or network issues +tags: cli, startup, validation, debugging, workflow +--- + +## Follow the `planoai up` Validation Workflow Before Debugging Runtime Issues + +`planoai up` is the entry point for running Plano. It performs sequential checks before the container starts: schema validation, API key presence check, container startup, and health checks on all configured listener ports. Understanding what each failure stage means prevents chasing the wrong root cause. + +**Validation stages and failure signals:** + +``` +Stage 1: Schema validation → "config.yaml: invalid against schema" +Stage 2: API key check → "Missing required environment variables: OPENAI_API_KEY" +Stage 3: Container start → "Docker daemon not running" or image pull errors +Stage 4: Health check (/healthz) → "Listener not healthy after 120s" (timeout) +``` + +**Development startup workflow:** + +```bash +# Standard startup — config.yaml in current directory +planoai up + +# Explicit config file path +planoai up my-config.yaml + +# Start in foreground to see all logs immediately (great for debugging) +planoai up config.yaml --foreground + +# Start with built-in OTEL trace collector +planoai up config.yaml --with-tracing + +# Enable verbose logging for debugging routing decisions +LOG_LEVEL=debug planoai up config.yaml --foreground +``` + +**Checking what's running:** + +```bash +# Stream recent logs (last N lines, then exit) +planoai logs + +# Follow logs in real-time +planoai logs --follow + +# Include Envoy/gateway debug messages +planoai logs --debug --follow +``` + +**Stopping and restarting after config changes:** + +```bash +# Stop the current container +planoai down + +# Restart with updated config +planoai up config.yaml +``` + +**Common failure patterns:** + +```bash +# API key missing — check your .env file or shell environment +export OPENAI_API_KEY=sk-proj-... +planoai up config.yaml + +# Health check timeout — listener port may conflict +# Check if another process uses port 12000 +lsof -i :12000 + +# Container fails to start — verify Docker daemon is running +docker ps +``` + +`planoai down` fully stops and removes the Plano container. Always run `planoai down` before `planoai up` when changing config to avoid stale container state. 
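+
+If Stage 1 or Stage 2 keeps failing, bisect against the smallest config that passes both checks (a sketch; substitute any provider you hold a key for):
+
+```yaml
+version: v0.3.0                  # Stage 1: schema requires a version field
+
+listeners:
+  - type: model
+    name: model_listener
+    port: 12000
+
+model_providers:
+  - model: openai/gpt-4o
+    access_key: $OPENAI_API_KEY  # Stage 2: must be set in your shell or .env
+    default: true
+```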
+ +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/config-listeners.md b/skills/rules/config-listeners.md new file mode 100644 index 00000000..d40a3e30 --- /dev/null +++ b/skills/rules/config-listeners.md @@ -0,0 +1,64 @@ +--- +title: Choose the Right Listener Type for Your Use Case +impact: CRITICAL +impactDescription: The listener type determines the entire request processing pipeline — choosing the wrong type means features like prompt functions or agent routing are unavailable +tags: config, listeners, architecture, routing +--- + +## Choose the Right Listener Type for Your Use Case + +Plano supports three listener types, each serving a distinct purpose. `listeners` is the only required top-level array in a Plano config. Every listener needs at minimum a `type`, `name`, and `port`. + +| Type | Use When | Key Feature | +|------|----------|-------------| +| `model` | You want an OpenAI-compatible LLM gateway | Routes to multiple LLM providers, supports model aliases and routing preferences | +| `prompt` | You want LLM-callable custom functions | Define `prompt_targets` that the LLM dispatches as function calls | +| `agent` | You want multi-agent orchestration | Routes user requests to specialized sub-agents by matching agent descriptions | + +**Incorrect (using `model` when agents need orchestration):** + +```yaml +version: v0.3.0 + +# Wrong: a model listener cannot route to backend agent services +listeners: + - type: model + name: main + port: 12000 + +agents: + - id: weather_agent + url: http://host.docker.internal:8001 +``` + +**Correct (use `agent` listener for multi-agent systems):** + +```yaml +version: v0.3.0 + +agents: + - id: weather_agent + url: http://host.docker.internal:8001 + - id: travel_agent + url: http://host.docker.internal:8002 + +listeners: + - type: agent + name: orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: weather_agent + description: Provides real-time weather, forecasts, and conditions for any city. + - id: travel_agent + description: Books flights, hotels, and travel itineraries. + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true +``` + +A single Plano instance can expose multiple listeners on different ports, each with a different type, to serve different clients simultaneously. + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/config-providers.md b/skills/rules/config-providers.md new file mode 100644 index 00000000..30476cd5 --- /dev/null +++ b/skills/rules/config-providers.md @@ -0,0 +1,64 @@ +--- +title: Register Model Providers with Correct Format Identifiers +impact: CRITICAL +impactDescription: Incorrect provider format causes request translation failures — Plano must know the wire format each provider expects +tags: config, model-providers, llm, api-format +--- + +## Register Model Providers with Correct Format Identifiers + +Plano translates requests between its internal format and each provider's API. The `model` field uses `provider/model-name` syntax which determines both the upstream endpoint and the request/response translation layer. Some providers require an explicit `provider_interface` override. 
+ +**Provider format reference:** + +| Model prefix | Wire format | Example | +|---|---|---| +| `openai/*` | OpenAI | `openai/gpt-4o` | +| `anthropic/*` | Anthropic | `anthropic/claude-sonnet-4-20250514` | +| `gemini/*` | Google Gemini | `gemini/gemini-2.0-flash` | +| `mistral/*` | Mistral | `mistral/mistral-large-latest` | +| `groq/*` | Groq | `groq/llama-3.3-70b-versatile` | +| `deepseek/*` | DeepSeek | `deepseek/deepseek-chat` | +| `xai/*` | Grok (OpenAI-compat) | `xai/grok-2` | +| `together_ai/*` | Together.ai | `together_ai/meta-llama/Llama-3` | +| `custom/*` | Requires `provider_interface` | `custom/my-local-model` | + +**Incorrect (missing provider prefix, ambiguous format):** + +```yaml +model_providers: + - model: gpt-4o # Missing openai/ prefix — Plano cannot route this + access_key: $OPENAI_API_KEY + + - model: claude-3-5-sonnet # Missing anthropic/ prefix + access_key: $ANTHROPIC_API_KEY +``` + +**Correct (explicit provider prefixes):** + +```yaml +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + + - model: gemini/gemini-2.0-flash + access_key: $GOOGLE_API_KEY +``` + +**For local or self-hosted models (Ollama, LiteLLM, vLLM):** + +```yaml +model_providers: + - model: custom/llama3 + base_url: http://host.docker.internal:11434/v1 # Ollama endpoint + provider_interface: openai # Ollama speaks OpenAI format + default: true +``` + +Always set `default: true` on exactly one provider per listener so Plano has a fallback when routing preferences do not match. + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/config-secrets.md b/skills/rules/config-secrets.md new file mode 100644 index 00000000..5f585c87 --- /dev/null +++ b/skills/rules/config-secrets.md @@ -0,0 +1,72 @@ +--- +title: Use Environment Variable Substitution for All Secrets +impact: CRITICAL +impactDescription: Hardcoded API keys in config.yaml will be committed to version control and exposed in Docker container inspect output +tags: config, security, secrets, api-keys, environment-variables +--- + +## Use Environment Variable Substitution for All Secrets + +Plano supports `$VAR_NAME` substitution in config values. This applies to `access_key` fields, `connection_string` for state storage, and `http_headers` in prompt targets and endpoints. Never hardcode credentials — Plano reads them from environment variables or a `.env` file at startup via `planoai up`. + +**Incorrect (hardcoded secrets):** + +```yaml +version: v0.3.0 + +model_providers: + - model: openai/gpt-4o + access_key: abcdefghijklmnopqrstuvwxyz... 
# Hardcoded — never do this + +state_storage: + type: postgres + connection_string: "postgresql://admin:mysecretpassword@prod-db:5432/plano" + +prompt_targets: + - name: get_data + endpoint: + name: my_api + http_headers: + Authorization: "Bearer abcdefghijklmnopqrstuvwxyz" # Hardcoded token +``` + +**Correct (environment variable substitution):** + +```yaml +version: v0.3.0 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + +state_storage: + type: postgres + connection_string: "postgresql://${DB_USER}:${DB_PASS}@${DB_HOST}:5432/${DB_NAME}" + +prompt_targets: + - name: get_data + endpoint: + name: my_api + http_headers: + Authorization: "Bearer $MY_API_TOKEN" +``` + +**`.env` file pattern (loaded automatically by `planoai up`):** + +```bash +# .env — add to .gitignore +OPENAI_API_KEY=sk-proj-... +ANTHROPIC_API_KEY=sk-ant-... +DB_USER=plano +DB_PASS=secure-password +DB_HOST=localhost +MY_API_TOKEN=tok_live_... +``` + +Plano also accepts keys set directly in the shell environment. Variables referenced in config but not found at startup cause `planoai up` to fail with a clear error listing the missing keys. + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/config-version.md b/skills/rules/config-version.md new file mode 100644 index 00000000..768d7b04 --- /dev/null +++ b/skills/rules/config-version.md @@ -0,0 +1,44 @@ +--- +title: Always Specify a Supported Config Version +impact: CRITICAL +impactDescription: Plano rejects configs with missing or unsupported version fields — the version field gates all other validation +tags: config, versioning, validation +--- + +## Always Specify a Supported Config Version + +Every Plano `config.yaml` must include a `version` field at the top level. Plano validates configs against a versioned JSON schema — an unrecognized or missing version will cause `planoai up` to fail immediately with a schema validation error before the container starts. + +**Incorrect (missing or invalid version):** + +```yaml +# No version field — fails schema validation +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY +``` + +**Correct (explicit supported version):** + +```yaml +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true +``` + +Use the latest supported version unless you are targeting a specific deployed Plano image. Current supported versions: `v0.1`, `v0.1.0`, `0.1-beta`, `v0.2.0`, `v0.3.0`. Prefer `v0.3.0` for all new projects. + +Reference: https://github.com/katanemo/archgw/blob/main/config/plano_config_schema.yaml diff --git a/skills/rules/deploy-docker.md b/skills/rules/deploy-docker.md new file mode 100644 index 00000000..ecc23586 --- /dev/null +++ b/skills/rules/deploy-docker.md @@ -0,0 +1,80 @@ +--- +title: Understand Plano's Docker Network Topology for Agent URL Configuration +impact: HIGH +impactDescription: Using `localhost` for agent URLs inside Docker always fails — Plano runs in a container and cannot reach host services via localhost +tags: deployment, docker, networking, agents, urls +--- + +## Understand Plano's Docker Network Topology for Agent URL Configuration + +Plano runs inside a Docker container managed by `planoai up`. 
Services running on your host machine (agent servers, filter servers, databases) are not accessible as `localhost` from inside the container. Use Docker's special hostname `host.docker.internal` to reach host services. + +**Docker network rules:** +- `localhost` / `127.0.0.1` inside the container → Plano's own container (not your host) +- `host.docker.internal` → Your host machine's loopback interface +- Container name or `docker network` hostname → Other Docker containers +- External domain / IP → Reachable if Docker has network access + +**Incorrect (using localhost — agent unreachable from inside container):** + +```yaml +version: v0.3.0 + +agents: + - id: weather_agent + url: http://localhost:8001 # Wrong: this is Plano's own container + + - id: flight_agent + url: http://127.0.0.1:8002 # Wrong: same issue + +filters: + - id: input_guards + url: http://localhost:10500 # Wrong: filter server unreachable +``` + +**Correct (using host.docker.internal for host-side services):** + +```yaml +version: v0.3.0 + +agents: + - id: weather_agent + url: http://host.docker.internal:8001 # Correct: reaches host port 8001 + + - id: flight_agent + url: http://host.docker.internal:8002 # Correct: reaches host port 8002 + +filters: + - id: input_guards + url: http://host.docker.internal:10500 # Correct: reaches filter server on host + +endpoints: + internal_api: + endpoint: host.docker.internal # Correct for internal API on host + protocol: http +``` + +**Production deployment patterns:** + +```yaml +# Kubernetes / Docker Compose — use service names +agents: + - id: weather_agent + url: http://weather-service:8001 # Kubernetes service DNS + +# External cloud services — use full domain +agents: + - id: cloud_agent + url: https://my-agent.us-east-1.amazonaws.com/v1 + +# Custom TLS (self-signed or internal CA) +overrides: + upstream_tls_ca_path: /etc/ssl/certs/internal-ca.pem +``` + +**Ports exposed by Plano's container:** +- All `port` values from your `listeners` blocks are automatically mapped +- `9901` — Envoy admin interface (for advanced debugging) +- `12001` — Plano internal management API + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/deploy-health.md b/skills/rules/deploy-health.md new file mode 100644 index 00000000..8e948ee4 --- /dev/null +++ b/skills/rules/deploy-health.md @@ -0,0 +1,90 @@ +--- +title: Verify Listener Health Before Sending Requests +impact: MEDIUM +impactDescription: Sending requests to Plano before listeners are healthy results in connection refused errors that look like application bugs — always confirm health before testing +tags: deployment, health-checks, readiness, debugging +--- + +## Verify Listener Health Before Sending Requests + +Each Plano listener exposes a `/healthz` HTTP endpoint. `planoai up` automatically health-checks all listeners during startup (120s timeout), but in CI/CD pipelines, custom scripts, or when troubleshooting, you may need to check health manually. + +**Health check endpoints:** + +```bash +# Check model listener health (port from your config) +curl -f http://localhost:12000/healthz +# Returns 200 OK when healthy + +# Check prompt listener +curl -f http://localhost:10000/healthz + +# Check agent listener +curl -f http://localhost:8000/healthz +``` + +**Polling health in scripts (CI/CD pattern):** + +```bash +#!/bin/bash +# wait-for-plano.sh + +LISTENER_PORT=${1:-12000} +MAX_WAIT=120 +INTERVAL=2 +elapsed=0 + +echo "Waiting for Plano listener on port $LISTENER_PORT..." 
+ +until curl -sf "http://localhost:$LISTENER_PORT/healthz" > /dev/null; do + if [ $elapsed -ge $MAX_WAIT ]; then + echo "ERROR: Plano listener not healthy after ${MAX_WAIT}s" + planoai logs --debug + exit 1 + fi + sleep $INTERVAL + elapsed=$((elapsed + INTERVAL)) +done + +echo "Plano listener healthy after ${elapsed}s" +``` + +**Docker Compose health check:** + +```yaml +# docker-compose.yml for services that depend on Plano +services: + plano: + image: katanemo/plano:latest + # Plano is managed by planoai, not directly via compose in most setups + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:12000/healthz"] + interval: 5s + timeout: 3s + retries: 24 + start_period: 10s + + my-agent: + image: my-agent:latest + depends_on: + plano: + condition: service_healthy +``` + +**Debug unhealthy listeners:** + +```bash +# See startup logs +planoai logs --debug + +# Check if port is already in use +lsof -i :12000 + +# Check container status +docker ps -a --filter name=plano + +# Restart from scratch +planoai down && planoai up config.yaml --foreground +``` + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/deploy-state.md b/skills/rules/deploy-state.md new file mode 100644 index 00000000..03ce1f3d --- /dev/null +++ b/skills/rules/deploy-state.md @@ -0,0 +1,85 @@ +--- +title: Use PostgreSQL State Storage for Multi-Turn Conversations in Production +impact: HIGH +impactDescription: The default in-memory state storage loses all conversation history when the container restarts — production multi-turn agents require persistent PostgreSQL storage +tags: deployment, state, postgres, memory, multi-turn, production +--- + +## Use PostgreSQL State Storage for Multi-Turn Conversations in Production + +`state_storage` enables Plano to maintain conversation context across requests. Without it, each request is stateless. The `memory` type works for development and testing — all state is lost on container restart. Use `postgres` for any production deployment where conversation continuity matters. + +**Incorrect (memory storage in production):** + +```yaml +version: v0.3.0 + +# Memory storage — all conversations lost on planoai down / container restart +state_storage: + type: memory + +listeners: + - type: agent + name: customer_support + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: support_agent + description: Customer support assistant with conversation history. +``` + +**Correct (PostgreSQL for production persistence):** + +```yaml +version: v0.3.0 + +state_storage: + type: postgres + connection_string: "postgresql://${DB_USER}:${DB_PASS}@${DB_HOST}:5432/${DB_NAME}" + +listeners: + - type: agent + name: customer_support + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: support_agent + description: Customer support assistant with access to full conversation history. 
+ +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true +``` + +**Setting up PostgreSQL for local development:** + +```bash +# Start PostgreSQL with Docker +docker run -d \ + --name plano-postgres \ + -e POSTGRES_USER=plano \ + -e POSTGRES_PASSWORD=devpassword \ + -e POSTGRES_DB=plano \ + -p 5432:5432 \ + postgres:16 + +# Set environment variables +export DB_USER=plano +export DB_PASS=devpassword +export DB_HOST=host.docker.internal # Use host.docker.internal from inside Plano container +export DB_NAME=plano +``` + +**Production `.env` pattern:** + +```bash +DB_USER=plano_prod +DB_PASS= +DB_HOST=your-rds-endpoint.amazonaws.com +DB_NAME=plano +``` + +Plano automatically creates its state tables on first startup. The `connection_string` supports all standard PostgreSQL connection parameters including SSL: `postgresql://user:pass@host:5432/db?sslmode=require`. + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/filter-guardrails.md b/skills/rules/filter-guardrails.md new file mode 100644 index 00000000..d60bea65 --- /dev/null +++ b/skills/rules/filter-guardrails.md @@ -0,0 +1,81 @@ +--- +title: Configure Prompt Guards with Actionable Rejection Messages +impact: MEDIUM +impactDescription: A generic or empty rejection message leaves users confused about why their request was blocked and unable to rephrase appropriately +tags: filter, guardrails, jailbreak, security, ux +--- + +## Configure Prompt Guards with Actionable Rejection Messages + +Plano has built-in `prompt_guards` for detecting jailbreak attempts. When triggered, Plano returns the `on_exception.message` instead of forwarding the request. Write messages that explain the restriction and suggest what the user can do instead — both for user experience and to reduce support burden. + +**Incorrect (no message configured — returns a generic error):** + +```yaml +version: v0.3.0 + +prompt_guards: + input_guards: + jailbreak: + on_exception: {} # Empty — returns unhelpful generic error +``` + +**Incorrect (cryptic technical message):** + +```yaml +prompt_guards: + input_guards: + jailbreak: + on_exception: + message: "Error code 403: guard triggered" # Unhelpful to the user +``` + +**Correct (clear, actionable, brand-appropriate message):** + +```yaml +version: v0.3.0 + +prompt_guards: + input_guards: + jailbreak: + on_exception: + message: > + I'm not able to help with that request. This assistant is designed + to help with [your use case, e.g., customer support, coding questions]. + Please rephrase your question or contact support@yourdomain.com + if you believe this is an error. +``` + +**Combining prompt_guards with MCP filter guardrails:** + +```yaml +# Built-in jailbreak detection (fast, no external service needed) +prompt_guards: + input_guards: + jailbreak: + on_exception: + message: "This request cannot be processed. Please ask about our products and services." + +# MCP-based custom guards for additional policy enforcement +filters: + - id: topic_restriction + url: http://host.docker.internal:10500 + type: mcp + transport: streamable-http + tool: topic_restriction # Custom filter for domain-specific restrictions + +listeners: + - type: agent + name: customer_support + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: support_agent + description: Customer support assistant for product questions and order issues. + filter_chain: + - topic_restriction # Additional custom topic filtering +``` + +`prompt_guards` applies globally to all listeners. 
Use `filter_chain` on individual agents for per-agent policies. + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/filter-mcp.md b/skills/rules/filter-mcp.md new file mode 100644 index 00000000..c2d02efd --- /dev/null +++ b/skills/rules/filter-mcp.md @@ -0,0 +1,59 @@ +--- +title: Configure MCP Filters with Explicit Type and Transport +impact: MEDIUM +impactDescription: Omitting type and transport fields relies on defaults that may not match your MCP server's protocol implementation +tags: filter, mcp, integration, configuration +--- + +## Configure MCP Filters with Explicit Type and Transport + +Plano filters integrate with external services via MCP (Model Context Protocol) or plain HTTP. MCP filters call a specific tool on a remote MCP server. Always specify `type`, `transport`, and optionally `tool` (defaults to the filter `id`) to ensure Plano connects correctly to your filter implementation. + +**Incorrect (minimal filter definition relying on all defaults):** + +```yaml +filters: + - id: my_guard # Plano infers type=mcp, transport=streamable-http, tool=my_guard + url: http://localhost:10500 + # If your MCP server uses a different tool name or transport, this silently misroutes +``` + +**Correct (explicit configuration for each filter):** + +```yaml +version: v0.3.0 + +filters: + - id: input_guards + url: http://host.docker.internal:10500 + type: mcp # Explicitly MCP protocol + transport: streamable-http # Streamable HTTP transport + tool: input_guards # MCP tool name (matches MCP server registration) + + - id: query_rewriter + url: http://host.docker.internal:10501 + type: mcp + transport: streamable-http + tool: rewrite_query # Tool name differs from filter ID — explicit is safer + + - id: custom_validator + url: http://host.docker.internal:10503 + type: http # Plain HTTP filter (not MCP) + # No tool field for HTTP filters +``` + +**MCP filter implementation contract:** +Your MCP server must expose a tool matching the `tool` name. The tool receives the request payload and must return either: +- A modified request (to pass through with changes) +- A rejection response (to short-circuit the pipeline) + +**HTTP filter alternative** — use `type: http` for simpler request/response interceptors that don't need the MCP protocol: + +```yaml +filters: + - id: auth_validator + url: http://host.docker.internal:9000/validate + type: http # Plano POSTs the request, expects the modified request back +``` + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/filter-ordering.md b/skills/rules/filter-ordering.md new file mode 100644 index 00000000..ad2d0d7b --- /dev/null +++ b/skills/rules/filter-ordering.md @@ -0,0 +1,78 @@ +--- +title: Order Filter Chains with Guards First, Enrichment Last +impact: HIGH +impactDescription: Running context builders before input guards means jailbreak attempts get RAG-enriched context before being blocked — wasting compute and risking data exposure +tags: filter, guardrails, security, pipeline, ordering +--- + +## Order Filter Chains with Guards First, Enrichment Last + +A `filter_chain` is an ordered list of filter IDs applied sequentially to each request. The order is semantically meaningful: each filter receives the output of the previous one. Safety and validation filters must run first to short-circuit bad requests before expensive enrichment filters process them. + +**Recommended filter chain order:** + +1. **Input guards** — jailbreak detection, PII detection, topic restrictions (reject early) +2. 
**Query rewriting** — normalize or enhance the user query +3. **Context building** — RAG retrieval, tool lookup, knowledge injection (expensive) +4. **Output guards** — validate or sanitize LLM response before returning + +**Incorrect (context built before guards — wasteful and potentially unsafe):** + +```yaml +filters: + - id: context_builder + url: http://host.docker.internal:10502 # Runs expensive RAG retrieval first + - id: query_rewriter + url: http://host.docker.internal:10501 + - id: input_guards + url: http://host.docker.internal:10500 # Guards run last — jailbreak gets context + +listeners: + - type: agent + name: rag_orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: rag_agent + filter_chain: + - context_builder # Wrong: expensive enrichment before safety check + - query_rewriter + - input_guards +``` + +**Correct (guards block bad requests before any enrichment):** + +```yaml +version: v0.3.0 + +filters: + - id: input_guards + url: http://host.docker.internal:10500 + type: mcp + transport: streamable-http + - id: query_rewriter + url: http://host.docker.internal:10501 + type: mcp + transport: streamable-http + - id: context_builder + url: http://host.docker.internal:10502 + type: mcp + transport: streamable-http + +listeners: + - type: agent + name: rag_orchestrator + port: 8000 + router: plano_orchestrator_v1 + agents: + - id: rag_agent + description: Answers questions using internal knowledge base documents. + filter_chain: + - input_guards # 1. Block jailbreaks and policy violations + - query_rewriter # 2. Normalize the safe query + - context_builder # 3. Retrieve relevant context for the clean query +``` + +Different agents within the same listener can have different filter chains — a public-facing agent may need all guards while an internal admin agent may skip them. + +Reference: https://github.com/katanemo/archgw diff --git a/skills/rules/observe-span-attributes.md b/skills/rules/observe-span-attributes.md new file mode 100644 index 00000000..a90b3006 --- /dev/null +++ b/skills/rules/observe-span-attributes.md @@ -0,0 +1,80 @@ +--- +title: Add Custom Span Attributes for Correlation and Filtering +impact: MEDIUM +impactDescription: Without custom span attributes, traces cannot be filtered by user, session, or environment — making production debugging significantly harder +tags: observability, tracing, span-attributes, correlation +--- + +## Add Custom Span Attributes for Correlation and Filtering + +Plano can automatically extract HTTP request headers and attach them as span attributes, plus attach static key-value pairs to every span. This enables filtering traces by user, session, tenant, environment, or any other dimension that matters to your application. 
**Incorrect (no span attributes — traces are unfiltered blobs):**

```yaml
tracing:
  random_sampling: 20
  # No span_attributes — cannot filter by user, session, or environment
```

**Correct (rich span attributes for production correlation):**

```yaml
version: v0.3.0

tracing:
  random_sampling: 20
  trace_arch_internal: true

  span_attributes:
    # Match all headers with this prefix, then map to span attributes by:
    # 1) stripping the prefix and 2) converting hyphens to dots
    header_prefixes:
      - x-katanemo-

    # Static attributes added to every span from this Plano instance
    static:
      environment: production
      service.name: plano-gateway
      deployment.region: us-east-1
      service.version: "2.1.0"
      team: platform-engineering
```

**Sending correlation headers from client code:**

```python
import httpx

response = httpx.post(
    "http://localhost:12000/v1/chat/completions",
    headers={
        "x-katanemo-request-id": "req_abc123",
        "x-katanemo-user-id": "usr_12",
        "x-katanemo-session-id": "sess_xyz456",
        "x-katanemo-tenant-id": "acme-corp",
    },
    json={"model": "plano.v1", "messages": [...]}
)
```

**Querying by custom attribute:**

```bash
# Find all requests from a specific user
planoai trace --where user.id=usr_12

# Find all traces from production environment
planoai trace --where environment=production

# Find traces from a specific tenant
planoai trace --where tenant.id=acme-corp
```

Header matching is by prefix, not exact name — any header starting with a configured prefix is captured. With `x-katanemo-`, these mappings apply:

- `x-katanemo-user-id` -> `user.id`
- `x-katanemo-tenant-id` -> `tenant.id`
- `x-katanemo-request-id` -> `request.id`

Reference: [https://github.com/katanemo/archgw](https://github.com/katanemo/archgw)

diff --git a/skills/rules/observe-trace-query.md b/skills/rules/observe-trace-query.md
new file mode 100644
index 00000000..a7ef7db7
--- /dev/null
+++ b/skills/rules/observe-trace-query.md
@@ -0,0 +1,85 @@
---
title: Use `planoai trace` to Inspect Routing Decisions
impact: MEDIUM-HIGH
impactDescription: The trace CLI lets you verify which model was selected, why, and how long each step took — without setting up a full OTEL backend
tags: observability, tracing, cli, debugging, routing
---

## Use `planoai trace` to Inspect Routing Decisions

`planoai trace` provides a built-in trace viewer backed by an in-memory OTEL collector. Use it to inspect routing decisions, verify preference matching, measure filter latency, and debug failed requests — all from the CLI without configuring Jaeger, Zipkin, or another backend.

**Workflow: start collector, run requests, then inspect traces:**

```bash
# 1. Start Plano with the built-in trace collector (recommended)
planoai up config.yaml --with-tracing

# 2. Send test requests through Plano
curl http://localhost:12000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "plano.v1", "messages": [{"role": "user", "content": "Write a Python function to sort a list"}]}'

# 3. Show the latest trace
planoai trace
```

You can also run the trace listener directly:

```bash
planoai trace listen    # run the trace listener on its own — a separate process hosting the OTEL collector
```

Stop the background trace listener:

```bash
planoai trace down
```

**Useful trace viewer patterns:**

```bash
# Show latest trace (default target is "last")
planoai trace

# List available trace IDs
planoai trace --list

# Show all traces
planoai trace any

# Show a specific trace (short 8-char or full 32-char ID)
planoai trace 7f4e9a1c
planoai trace 7f4e9a1c0d9d4a0bb9bf5a8a7d13f62a

# Filter by specific span attributes (AND semantics for repeated --where)
planoai trace any --where llm.model=gpt-4o-mini

# Filter by user ID (if header prefix is x-katanemo-, x-katanemo-user-id maps to user.id)
planoai trace any --where user.id=user_123

# Limit results for a quick sanity check
planoai trace any --limit 5

# Time window filter
planoai trace any --since 30m

# Filter displayed attributes by key pattern
planoai trace any --filter "http.*"

# Output machine-readable JSON
planoai trace any --json
```

**What to look for in traces:**

| Span name                 | What it tells you                                              |
| ------------------------- | -------------------------------------------------------------- |
| `plano.routing`           | Which routing preference matched and which model was selected |
| `plano.filter.<filter_id>` | How long each filter in the chain took                         |
| `plano.llm.request`       | Time to first token and full response time                     |
| `plano.agent.route`       | Which agent description matched for agent listeners            |

Reference: [https://github.com/katanemo/archgw](https://github.com/katanemo/archgw)

diff --git a/skills/rules/observe-tracing.md b/skills/rules/observe-tracing.md
new file mode 100644
index 00000000..93b9c003
--- /dev/null
+++ b/skills/rules/observe-tracing.md
@@ -0,0 +1,80 @@
---
title: Enable Tracing with Appropriate Sampling for Your Environment
impact: HIGH
impactDescription: Without tracing enabled, debugging routing decisions, latency issues, and model selection is guesswork — traces are the primary observability primitive in Plano
tags: observability, tracing, opentelemetry, otel, debugging
---

## Enable Tracing with Appropriate Sampling for Your Environment

Plano emits OpenTelemetry (OTEL) traces for every request, capturing routing decisions, LLM provider selection, filter chain execution, and response latency. Traces are the best tool for understanding why a request was routed to a particular model and debugging unexpected behavior.

+ +**Incorrect (no tracing configured — flying blind in production):** + +```yaml +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + +# No tracing block — no visibility into routing, latency, or errors +``` + +**Correct (tracing enabled with environment-appropriate sampling):** + +```yaml +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true + +tracing: + random_sampling: 100 # 100% for development/debugging + trace_arch_internal: true # Include Plano's internal routing spans +``` + +**Production configuration (sampled to control volume):** + +```yaml +tracing: + random_sampling: 10 # Sample 10% of requests in production + trace_arch_internal: false # Skip internal spans to reduce noise + span_attributes: + header_prefixes: + - x-katanemo- # Match all x-katanemo-* headers + static: + environment: production + service.name: my-plano-service + version: "1.0.0" +``` + +With `x-katanemo-` configured, Plano maps headers to attributes by stripping the prefix and converting hyphens to dots: + +- `x-katanemo-user-id` -> `user.id` +- `x-katanemo-session-id` -> `session.id` +- `x-katanemo-request-id` -> `request.id` + +**Starting the trace collector:** + +```bash +# Start Plano with built-in OTEL collector +planoai up config.yaml --with-tracing +``` + +Sampling rates: 100% for dev/staging, 5–20% for high-traffic production, 100% for low-traffic production. `trace_arch_internal: true` adds spans showing which routing preference matched — essential for debugging preference configuration. + +Reference: [https://github.com/katanemo/archgw](https://github.com/katanemo/archgw) diff --git a/skills/rules/routing-aliases.md b/skills/rules/routing-aliases.md new file mode 100644 index 00000000..91f0b31a --- /dev/null +++ b/skills/rules/routing-aliases.md @@ -0,0 +1,77 @@ +--- +title: Use Model Aliases for Semantic, Stable Model References +impact: MEDIUM +impactDescription: Hardcoded model names in client code require code changes when you swap providers; aliases let you update routing in config.yaml alone +tags: routing, model-aliases, maintainability, client-integration +--- + +## Use Model Aliases for Semantic, Stable Model References + +`model_aliases` map human-readable names to specific model identifiers. Client applications reference the alias, not the underlying model. When you want to upgrade from `gpt-4o` to a new model, you change one line in `config.yaml` — not every client calling the API. + +**Incorrect (clients hardcode specific model names):** + +```yaml +# config.yaml — no aliases defined +version: v0.3.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + default: true +``` + +```python +# Client code — brittle, must be updated when model changes +client.chat.completions.create(model="gpt-4o", ...) 
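# Swapping gpt-4o for a newer model now means editing every call site like this.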
```

**Correct (semantic aliases, stable client contracts):**

```yaml
version: v0.3.0

listeners:
  - type: model
    name: model_listener
    port: 12000

model_providers:
  - model: openai/gpt-4o-mini
    access_key: $OPENAI_API_KEY
    default: true
  - model: openai/gpt-4o
    access_key: $OPENAI_API_KEY
  - model: anthropic/claude-sonnet-4-20250514
    access_key: $ANTHROPIC_API_KEY

model_aliases:
  plano.fast.v1:
    target: gpt-4o-mini   # Cheap, fast — for high-volume tasks

  plano.smart.v1:
    target: gpt-4o   # High capability — for complex reasoning

  plano.creative.v1:
    target: claude-sonnet-4-20250514   # Strong creative writing and analysis

  plano.v1:
    target: gpt-4o   # Default production alias
```

```python
# Client code — stable, alias is the contract
client.chat.completions.create(model="plano.smart.v1", ...)
```

**Alias naming conventions:**
- `<org>.<capability>.<version>` — e.g., `plano.fast.v1`, `acme.code.v2`
- Bumping `.v2` → `.v3` lets you run old and new aliases simultaneously during rollouts
- `plano.v1` as a canonical default gives clients a single stable entry point

Reference: https://github.com/katanemo/archgw

diff --git a/skills/rules/routing-default.md b/skills/rules/routing-default.md
new file mode 100644
index 00000000..f23e7357
--- /dev/null
+++ b/skills/rules/routing-default.md
@@ -0,0 +1,70 @@
---
title: Always Set Exactly One Default Model Provider
impact: HIGH
impactDescription: Without a default provider, Plano has no fallback when routing preferences do not match — requests with unclassified intent will fail
tags: routing, defaults, model-providers, reliability
---

## Always Set Exactly One Default Model Provider

When a request does not match any routing preference, Plano forwards it to the `default: true` provider. Without a default, unmatched requests fail. If multiple providers are marked `default: true`, Plano uses the first one — which can produce unexpected behavior.

**Incorrect (no default provider set):**

```yaml
version: v0.3.0

model_providers:
  - model: openai/gpt-4o-mini   # No default: true anywhere
    access_key: $OPENAI_API_KEY
    routing_preferences:
      - name: summarization
        description: Summarizing documents and extracting key points

  - model: openai/gpt-4o
    access_key: $OPENAI_API_KEY
    routing_preferences:
      - name: code_generation
        description: Writing new functions and implementing algorithms
```

**Incorrect (multiple defaults — ambiguous):**

```yaml
model_providers:
  - model: openai/gpt-4o-mini
    default: true   # First default
    access_key: $OPENAI_API_KEY

  - model: openai/gpt-4o
    default: true   # Second default — confusing
    access_key: $OPENAI_API_KEY
```

**Correct (exactly one default, covering unmatched requests):**

```yaml
version: v0.3.0

model_providers:
  - model: openai/gpt-4o-mini
    access_key: $OPENAI_API_KEY
    default: true   # Handles general/unclassified requests
    routing_preferences:
      - name: summarization
        description: Summarizing documents, articles, and meeting notes
      - name: classification
        description: Categorizing inputs, labeling, and intent detection

  - model: openai/gpt-4o
    access_key: $OPENAI_API_KEY
    routing_preferences:
      - name: code_generation
        description: Writing, debugging, and reviewing code
      - name: complex_reasoning
        description: Multi-step math, logical analysis, research synthesis
```

Choose your most cost-effective capable model as the default — it handles all traffic that doesn't match specialized preferences.
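To see the default in action, here is an illustrative client snippet — assuming the `openai` Python SDK, the model listener on port 12000 from the examples above, and a `plano.v1` alias as used elsewhere in these rules — that sends a request matching no preference:

```python
from openai import OpenAI

# Point the standard OpenAI client at Plano's model listener.
# Plano holds the real provider keys; the client-side key is a placeholder.
client = OpenAI(base_url="http://localhost:12000/v1", api_key="unused")

# Matches neither summarization, classification, code_generation, nor
# complex_reasoning — Plano forwards it to the default (gpt-4o-mini).
resp = client.chat.completions.create(
    model="plano.v1",
    messages=[{"role": "user", "content": "What's the capital of France?"}],
)
print(resp.choices[0].message.content)
```

Confirm the routing decision afterwards with `planoai trace`.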
Reference: [https://github.com/katanemo/archgw](https://github.com/katanemo/archgw)

diff --git a/skills/rules/routing-passthrough.md b/skills/rules/routing-passthrough.md
new file mode 100644
index 00000000..ff9fbaf9
--- /dev/null
+++ b/skills/rules/routing-passthrough.md
@@ -0,0 +1,69 @@
---
title: Use Passthrough Auth for Proxy and Multi-Tenant Setups
impact: MEDIUM
impactDescription: Without passthrough auth, self-hosted proxy services (LiteLLM, vLLM, etc.) reject Plano's requests because the wrong Authorization header is sent
tags: routing, authentication, proxy, litellm, multi-tenant
---

## Use Passthrough Auth for Proxy and Multi-Tenant Setups

When routing to an LLM proxy or gateway (LiteLLM, vLLM, OpenRouter, Azure APIM) or in multi-tenant setups where clients supply their own keys, set `passthrough_auth: true`. This forwards the client's `Authorization` header rather than Plano's configured `access_key`. Combine with a `base_url` pointing to the proxy.

**Incorrect (Plano sends its own key to a proxy that expects the client's key):**

```yaml
model_providers:
  - model: custom/proxy
    base_url: http://host.docker.internal:8000
    access_key: $SOME_KEY   # Plano overwrites the client's auth — proxy rejects it
```

**Correct (forward client Authorization header to the proxy):**

```yaml
version: v0.3.0

listeners:
  - type: model
    name: model_listener
    port: 12000

model_providers:
  - model: custom/litellm-proxy
    base_url: http://host.docker.internal:4000   # LiteLLM server
    provider_interface: openai                   # LiteLLM uses OpenAI format
    passthrough_auth: true                       # Forward client's Bearer token
    default: true
```

**Multi-tenant pattern (client supplies their own API key):**

```yaml
model_providers:
  # Plano acts as a passthrough gateway; each client has their own OpenAI key
  - model: openai/gpt-4o
    passthrough_auth: true   # No access_key here — client's key is forwarded
    default: true
```

**Combined: proxy for some models, Plano-managed for others:**

```yaml
model_providers:
  - model: openai/gpt-4o-mini
    access_key: $OPENAI_API_KEY   # Plano manages this key
    default: true
    routing_preferences:
      - name: quick tasks
        description: Short answers, simple lookups, fast completions

  - model: custom/vllm-llama
    base_url: http://gpu-server:8000
    provider_interface: openai
    passthrough_auth: true   # vLLM cluster handles its own auth
    routing_preferences:
      - name: long context
        description: Processing very long documents, multi-document analysis
```

Reference: https://github.com/katanemo/archgw

diff --git a/skills/rules/routing-preferences.md b/skills/rules/routing-preferences.md
new file mode 100644
index 00000000..51127c5e
--- /dev/null
+++ b/skills/rules/routing-preferences.md
@@ -0,0 +1,106 @@
---
title: Write Task-Specific Routing Preference Descriptions
impact: HIGH
impactDescription: Vague preference descriptions cause Plano's internal router LLM to misclassify requests, routing expensive tasks to cheap models and vice versa
tags: routing, model-selection, preferences, llm-routing
---

## Write Task-Specific Routing Preference Descriptions

Plano's `plano_orchestrator_v1` router uses a 1.5B preference-aligned LLM to classify incoming requests against your `routing_preferences` descriptions. It returns an ordered `models` list for the matched route; the client uses `models[0]` as primary and falls back to `models[1]`, `models[2]`... on `429`/`5xx` errors. Description quality directly determines routing accuracy.
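The fallback half of that contract is straightforward to implement client-side. A minimal sketch, assuming only an OpenAI-compatible `/v1/chat/completions` endpoint on port 12000 and the route's ordered `models` list as input (the exact router response shape is not shown here):

```python
import httpx

def call_with_fallback(models: list[str], payload: dict) -> httpx.Response:
    """Try each candidate model in order; fall back on 429/5xx responses."""
    last_resp = None
    for model in models:
        resp = httpx.post(
            "http://localhost:12000/v1/chat/completions",
            json={**payload, "model": model},
            timeout=60,
        )
        # Anything other than a rate limit or server error is final —
        # success and non-retryable 4xx errors both stop the loop.
        if resp.status_code != 429 and resp.status_code < 500:
            return resp
        last_resp = resp
    return last_resp
```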
+ +Starting in `v0.4.0`, `routing_preferences` lives at the **top level** of the config and each entry carries its own `models: [...]` candidate pool. Configs still using the legacy v0.3.0 inline shape (under each `model_provider`) are auto-migrated with a deprecation warning — prefer the top-level form below. + +**Incorrect (vague, overlapping descriptions):** + +```yaml +version: v0.4.0 + +model_providers: + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + +routing_preferences: + - name: simple + description: easy tasks # Too vague — what is "easy"? + models: + - openai/gpt-4o-mini + - name: hard + description: hard tasks # Too vague — overlaps with "easy" + models: + - openai/gpt-4o +``` + +**Correct (specific, distinct task descriptions, multi-model fallbacks):** + +```yaml +version: v0.4.0 + +model_providers: + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + + - model: anthropic/claude-sonnet-4-5 + access_key: $ANTHROPIC_API_KEY + +routing_preferences: + - name: summarization + description: > + Summarizing documents, articles, emails, or meeting transcripts. + Extracting key points, generating TL;DR sections, condensing long text. + models: + - openai/gpt-4o-mini + - openai/gpt-4o + - name: classification + description: > + Categorizing inputs, sentiment analysis, spam detection, + intent classification, labeling structured data fields. + models: + - openai/gpt-4o-mini + - name: translation + description: > + Translating text between languages, localization tasks. + models: + - openai/gpt-4o-mini + - anthropic/claude-sonnet-4-5 + - name: code_generation + description: > + Writing new functions, classes, or modules from scratch. + Implementing algorithms, boilerplate generation, API integrations. + models: + - openai/gpt-4o + - anthropic/claude-sonnet-4-5 + - name: code_review + description: > + Reviewing code for bugs, security vulnerabilities, performance issues. + Suggesting refactors, explaining complex code, debugging errors. + models: + - anthropic/claude-sonnet-4-5 + - openai/gpt-4o + - name: complex_reasoning + description: > + Multi-step math problems, logical deduction, strategic planning, + research synthesis requiring chain-of-thought reasoning. 
+ models: + - openai/gpt-4o + - anthropic/claude-sonnet-4-5 +``` + +**Key principles for good preference descriptions:** +- Use concrete action verbs: "writing", "reviewing", "translating", "summarizing" +- List 3–5 specific sub-tasks or synonyms for each preference +- Ensure preferences across routes are mutually exclusive in scope +- Order `models` from most preferred to least — the client will fall back in order on `429`/`5xx` +- List multiple models under one route to get automatic provider fallback without additional client logic +- Every model listed in `models` must be declared in `model_providers` +- Test with representative queries using `planoai trace` and `--where` filters to verify routing decisions + +Reference: [Routing API](../../docs/routing-api.md) · https://github.com/katanemo/archgw diff --git a/skills/src/build.ts b/skills/src/build.ts new file mode 100644 index 00000000..5d4640f1 --- /dev/null +++ b/skills/src/build.ts @@ -0,0 +1,262 @@ +#!/usr/bin/env node + +import { readFileSync, writeFileSync, readdirSync } from "node:fs"; +import { join, dirname } from "node:path"; +import { fileURLToPath } from "node:url"; + +type Section = { + prefix: string; + number: number; + title: string; + description: string; +}; + +type Rule = { + file: string; + title: string; + impact: string; + impactDescription: string; + tags: string[]; + body: string; + section: Section; +}; + +type ParsedFrontmatter = { + frontmatter: Record; + body: string; +}; + +type Metadata = { + abstract: string; + version: string; + organization: string; +}; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const RULES_DIR = join(__dirname, "..", "rules"); +const OUTPUT_FILE = join(__dirname, "..", "AGENTS.md"); +const METADATA_FILE = join(__dirname, "..", "metadata.json"); + +const SECTIONS: Section[] = [ + { + prefix: "config-", + number: 1, + title: "Configuration Fundamentals", + description: + "Core config.yaml structure, versioning, listener types, and provider setup — the entry point for every Plano deployment.", + }, + { + prefix: "routing-", + number: 2, + title: "Routing & Model Selection", + description: + "Intelligent LLM routing using preferences, aliases, and defaults to match tasks to the best model.", + }, + { + prefix: "agent-", + number: 3, + title: "Agent Orchestration", + description: + "Multi-agent patterns, agent descriptions, and orchestration strategies for building agentic applications.", + }, + { + prefix: "filter-", + number: 4, + title: "Filter Chains & Guardrails", + description: + "Request/response processing pipelines — ordering, MCP integration, and safety guardrails.", + }, + { + prefix: "observe-", + number: 5, + title: "Observability & Debugging", + description: + "OpenTelemetry tracing, log levels, span attributes, and sampling for production visibility.", + }, + { + prefix: "cli-", + number: 6, + title: "CLI Operations", + description: + "Using the planoai CLI for startup, tracing, CLI agents, project init, and code generation.", + }, + { + prefix: "deploy-", + number: 7, + title: "Deployment & Security", + description: + "Docker deployment, environment variable management, health checks, and state storage for production.", + }, + { + prefix: "advanced-", + number: 8, + title: "Advanced Patterns", + description: + "Prompt targets, external API integration, rate limiting, and multi-listener architectures.", + }, +]; + +function parseFrontmatter(content: string): ParsedFrontmatter | null { + const match = content.match(/^---\n([\s\S]*?)\n---\n([\s\S]*)$/); + 
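  // Group 1 captures the raw "key: value" lines between the two "---" fences;
  // group 2 captures the markdown body that follows the frontmatter.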
if (!match) return null; + + const frontmatter: Record = {}; + const lines = match[1].split("\n"); + for (const line of lines) { + const colonIdx = line.indexOf(":"); + if (colonIdx === -1) continue; + const key = line.slice(0, colonIdx).trim(); + const value = line.slice(colonIdx + 1).trim(); + frontmatter[key] = value; + } + + return { + frontmatter, + body: match[2].trim(), + }; +} + +function inferSection(filename: string): Section | null { + for (const section of SECTIONS) { + if (filename.startsWith(section.prefix)) { + return section; + } + } + return null; +} + +function main(): void { + const metadata = JSON.parse(readFileSync(METADATA_FILE, "utf-8")) as Metadata; + + const files = readdirSync(RULES_DIR) + .filter((f) => f.endsWith(".md") && !f.startsWith("_")) + .sort(); + + const sectionRules = new Map(); + for (const section of SECTIONS) { + sectionRules.set(section.number, []); + } + + let parseErrors = 0; + + for (const file of files) { + const content = readFileSync(join(RULES_DIR, file), "utf-8"); + const parsed = parseFrontmatter(content); + + if (!parsed) { + console.error(`ERROR: Could not parse frontmatter in ${file}`); + parseErrors++; + continue; + } + + const section = inferSection(file); + if (!section) { + console.warn(`WARN: No section found for ${file} — skipping`); + continue; + } + + const rule: Rule = { + file, + title: parsed.frontmatter.title ?? file, + impact: parsed.frontmatter.impact ?? "MEDIUM", + impactDescription: parsed.frontmatter.impactDescription ?? "", + tags: parsed.frontmatter.tags + ? parsed.frontmatter.tags.split(",").map((t) => t.trim()) + : [], + body: parsed.body, + section, + }; + sectionRules.get(section.number)?.push(rule); + } + + if (parseErrors > 0) { + console.error(`\nBuild failed: ${parseErrors} file(s) had parse errors.`); + process.exit(1); + } + + for (const [, rules] of sectionRules) { + rules.sort((a, b) => a.title.localeCompare(b.title)); + } + + const lines: string[] = []; + lines.push(`# Plano Agent Skills`); + lines.push(``); + lines.push(`> ${metadata.abstract}`); + lines.push(``); + lines.push( + `**Version:** ${metadata.version} | **Organization:** ${metadata.organization}` + ); + lines.push(``); + lines.push(`---`); + lines.push(``); + + lines.push(`## Table of Contents`); + lines.push(``); + for (const section of SECTIONS) { + const rules = sectionRules.get(section.number) ?? []; + if (rules.length === 0) continue; + lines.push( + `- [Section ${section.number}: ${section.title}](#section-${section.number})` + ); + for (let i = 0; i < rules.length; i++) { + const rule = rules[i]; + const id = `${section.number}.${i + 1}`; + const anchor = rule.title + .toLowerCase() + .replace(/[^a-z0-9\s-]/g, "") + .replace(/\s+/g, "-"); + lines.push(` - [${id} ${rule.title}](#${anchor})`); + } + } + lines.push(``); + lines.push(`---`); + lines.push(``); + + for (const section of SECTIONS) { + const rules = sectionRules.get(section.number) ?? []; + if (rules.length === 0) continue; + + lines.push(`## Section ${section.number}: ${section.title}`); + lines.push(``); + lines.push(`*${section.description}*`); + lines.push(``); + + for (let i = 0; i < rules.length; i++) { + const rule = rules[i]; + const id = `${section.number}.${i + 1}`; + + lines.push(`### ${id} ${rule.title}`); + lines.push(``); + lines.push( + `**Impact:** \`${rule.impact}\`${rule.impactDescription ? 
` — ${rule.impactDescription}` : ""}` + ); + if (rule.tags.length > 0) { + lines.push(`**Tags:** ${rule.tags.map((t) => `\`${t}\``).join(", ")}`); + } + lines.push(``); + lines.push(rule.body); + lines.push(``); + lines.push(`---`); + lines.push(``); + } + } + + lines.push(`*Generated from individual rule files in \`rules/\`.*`); + lines.push( + `*To contribute, see [CONTRIBUTING](https://github.com/katanemo/archgw/blob/main/CONTRIBUTING.md).*` + ); + + writeFileSync(OUTPUT_FILE, lines.join("\n"), "utf-8"); + + let totalRules = 0; + for (const section of SECTIONS) { + const rules = sectionRules.get(section.number) ?? []; + if (rules.length > 0) { + console.log(` Section ${section.number}: ${rules.length} rules`); + totalRules += rules.length; + } + } + console.log(`\nBuilt AGENTS.md with ${totalRules} rules.`); +} + +main(); diff --git a/skills/src/extract-tests.ts b/skills/src/extract-tests.ts new file mode 100644 index 00000000..b7d03b61 --- /dev/null +++ b/skills/src/extract-tests.ts @@ -0,0 +1,147 @@ +#!/usr/bin/env node + +import { readFileSync, writeFileSync, readdirSync } from "node:fs"; +import { join, dirname } from "node:path"; +import { fileURLToPath } from "node:url"; + +type ParsedFrontmatter = { + frontmatter: Record; + body: string; +}; + +type SectionPrefix = { + prefix: string; + number: number; + title: string; +}; + +type ExampleExtraction = { + incorrect: string | null; + correct: string | null; +}; + +type TestCaseEntry = { + id: string; + section: number; + sectionTitle: string; + title: string; + impact: string; + tags: string[]; + testCase: { + description: string; + input: string | null; + expected: string | null; + evaluationPrompt: string; + }; +}; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const RULES_DIR = join(__dirname, "..", "rules"); +const OUTPUT_FILE = join(__dirname, "..", "test-cases.json"); + +const SECTION_PREFIXES: SectionPrefix[] = [ + { prefix: "config-", number: 1, title: "Configuration Fundamentals" }, + { prefix: "routing-", number: 2, title: "Routing & Model Selection" }, + { prefix: "agent-", number: 3, title: "Agent Orchestration" }, + { prefix: "filter-", number: 4, title: "Filter Chains & Guardrails" }, + { prefix: "observe-", number: 5, title: "Observability & Debugging" }, + { prefix: "cli-", number: 6, title: "CLI Operations" }, + { prefix: "deploy-", number: 7, title: "Deployment & Security" }, + { prefix: "advanced-", number: 8, title: "Advanced Patterns" }, +]; + +function parseFrontmatter(content: string): ParsedFrontmatter | null { + const match = content.match(/^---\n([\s\S]*?)\n---\n([\s\S]*)$/); + if (!match) return null; + + const frontmatter: Record = {}; + const lines = match[1].split("\n"); + for (const line of lines) { + const colonIdx = line.indexOf(":"); + if (colonIdx === -1) continue; + const key = line.slice(0, colonIdx).trim(); + const value = line.slice(colonIdx + 1).trim(); + frontmatter[key] = value; + } + + return { frontmatter, body: match[2].trim() }; +} + +function extractCodeBlocks(text: string): string[] { + const blocks: string[] = []; + const regex = /```(?:yaml|bash|python|typescript|json|sh)?\n([\s\S]*?)```/g; + let match: RegExpExecArray | null; + do { + match = regex.exec(text); + if (match) { + blocks.push(match[1].trim()); + } + } while (match !== null); + return blocks; +} + +function extractExamples(body: string): ExampleExtraction { + const incorrectMatch = body.match( + /\*\*Incorrect[^*]*\*\*[:\s]*([\s\S]*?)(?=\*\*Correct|\*\*Key|$)/ + ); + const correctMatch = body.match( + 
/\*\*Correct[^*]*\*\*[:\s]*([\s\S]*?)(?=\*\*Incorrect|\*\*Key|\*\*Note|Reference:|$)/ + ); + + return { + incorrect: incorrectMatch + ? extractCodeBlocks(incorrectMatch[1]).join("\n\n") + : null, + correct: correctMatch ? extractCodeBlocks(correctMatch[1]).join("\n\n") : null, + }; +} + +function inferSection(filename: string): SectionPrefix | null { + for (const s of SECTION_PREFIXES) { + if (filename.startsWith(s.prefix)) return s; + } + return null; +} + +function main(): void { + const files = readdirSync(RULES_DIR) + .filter((f) => f.endsWith(".md") && !f.startsWith("_")) + .sort(); + + const testCases: TestCaseEntry[] = []; + + for (const file of files) { + const content = readFileSync(join(RULES_DIR, file), "utf-8"); + const parsed = parseFrontmatter(content); + if (!parsed) continue; + + const { frontmatter, body } = parsed; + const section = inferSection(file); + if (!section) continue; + + const { incorrect, correct } = extractExamples(body); + if (!incorrect && !correct) continue; + + testCases.push({ + id: file.replace(".md", ""), + section: section.number, + sectionTitle: section.title, + title: frontmatter.title ?? file, + impact: frontmatter.impact ?? "MEDIUM", + tags: frontmatter.tags + ? frontmatter.tags.split(",").map((t) => t.trim()) + : [], + testCase: { + description: `Detect and fix: "${frontmatter.title}"`, + input: incorrect, + expected: correct, + evaluationPrompt: `Given the following Plano config or CLI usage, identify if it violates the rule "${frontmatter.title}" and explain how to fix it.`, + }, + }); + } + + writeFileSync(OUTPUT_FILE, JSON.stringify(testCases, null, 2), "utf-8"); + console.log(`Extracted ${testCases.length} test cases to test-cases.json`); +} + +main(); diff --git a/skills/src/validate.ts b/skills/src/validate.ts new file mode 100644 index 00000000..4fdf46ea --- /dev/null +++ b/skills/src/validate.ts @@ -0,0 +1,156 @@ +#!/usr/bin/env node + +import { readFileSync, readdirSync } from "node:fs"; +import { join, dirname } from "node:path"; +import { fileURLToPath } from "node:url"; + +type ParsedFrontmatter = { + frontmatter: Record; + body: string; +}; + +type ValidationResult = { + errors: string[]; + warnings: string[]; +}; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const RULES_DIR = join(__dirname, "..", "rules"); + +const VALID_IMPACTS = [ + "CRITICAL", + "HIGH", + "MEDIUM-HIGH", + "MEDIUM", + "LOW-MEDIUM", + "LOW", +] as const; + +const SECTION_PREFIXES = [ + "config-", + "routing-", + "agent-", + "filter-", + "observe-", + "cli-", + "deploy-", + "advanced-", +]; + +function parseFrontmatter(content: string): ParsedFrontmatter | null { + const match = content.match(/^---\n([\s\S]*?)\n---\n([\s\S]*)$/); + if (!match) return null; + + const frontmatter: Record = {}; + const lines = match[1].split("\n"); + for (const line of lines) { + const colonIdx = line.indexOf(":"); + if (colonIdx === -1) continue; + const key = line.slice(0, colonIdx).trim(); + const value = line.slice(colonIdx + 1).trim(); + frontmatter[key] = value; + } + + return { frontmatter, body: match[2].trim() }; +} + +function validateFile(file: string, content: string): ValidationResult { + const errors: string[] = []; + const warnings: string[] = []; + + const parsed = parseFrontmatter(content); + if (!parsed) { + errors.push("Missing or malformed frontmatter (expected --- ... 
---)"); + return { errors, warnings }; + } + + const { frontmatter, body } = parsed; + + if (!frontmatter.title) { + errors.push("Missing required frontmatter field: title"); + } + if (!frontmatter.impact) { + errors.push("Missing required frontmatter field: impact"); + } else if (!VALID_IMPACTS.includes(frontmatter.impact as (typeof VALID_IMPACTS)[number])) { + errors.push( + `Invalid impact value: "${frontmatter.impact}". Valid values: ${VALID_IMPACTS.join(", ")}` + ); + } + if (!frontmatter.tags) { + warnings.push("No tags defined — consider adding relevant tags"); + } + + const hasValidPrefix = SECTION_PREFIXES.some((p) => file.startsWith(p)); + if (!hasValidPrefix) { + errors.push( + `Filename must start with a valid prefix: ${SECTION_PREFIXES.join(", ")}` + ); + } + + if (body.length < 100) { + warnings.push("Rule body seems very short — consider adding more detail"); + } + + if (!body.includes("```")) { + warnings.push( + "No code examples found — rules should include YAML or CLI examples" + ); + } + + if (!body.includes("Incorrect") || !body.includes("Correct")) { + warnings.push( + "Consider adding both Incorrect and Correct examples for clarity" + ); + } + + return { errors, warnings }; +} + +function main(): void { + const files = readdirSync(RULES_DIR) + .filter((f) => f.endsWith(".md") && !f.startsWith("_")) + .sort(); + + let totalErrors = 0; + let totalWarnings = 0; + let filesWithIssues = 0; + + console.log(`Validating ${files.length} rule files...\n`); + + for (const file of files) { + const content = readFileSync(join(RULES_DIR, file), "utf-8"); + const { errors, warnings } = validateFile(file, content); + + if (errors.length > 0 || warnings.length > 0) { + filesWithIssues++; + console.log(`📄 ${file}`); + + for (const error of errors) { + console.log(` ❌ ERROR: ${error}`); + totalErrors++; + } + for (const warning of warnings) { + console.log(` ⚠️ WARN: ${warning}`); + totalWarnings++; + } + console.log(); + } else { + console.log(`✅ ${file}`); + } + } + + console.log(`\n--- Validation Summary ---`); + console.log(`Files checked: ${files.length}`); + console.log(`Files with issues: ${filesWithIssues}`); + console.log(`Errors: ${totalErrors}`); + console.log(`Warnings: ${totalWarnings}`); + + if (totalErrors > 0) { + console.log(`\nValidation FAILED with ${totalErrors} error(s).`); + process.exit(1); + } else { + console.log(`\nValidation passed.`); + } +} + +main(); diff --git a/skills/test-cases.json b/skills/test-cases.json new file mode 100644 index 00000000..c8bcfe33 --- /dev/null +++ b/skills/test-cases.json @@ -0,0 +1,353 @@ +[ + { + "id": "advanced-prompt-targets", + "section": 8, + "sectionTitle": "Advanced Patterns", + "title": "Design Prompt Targets with Precise Parameter Schemas", + "impact": "HIGH", + "tags": [ + "advanced", + "prompt-targets", + "functions", + "llm", + "api-integration" + ], + "testCase": { + "description": "Detect and fix: \"Design Prompt Targets with Precise Parameter Schemas\"", + "input": "prompt_targets:\n - name: get_flight_info\n description: Get flight information\n parameters:\n - name: flight # What format? \"AA123\"? \"AA 123\"? 
\"American 123\"?\n type: str\n required: true\n endpoint:\n name: flights_api\n path: /flight?id={flight}", + "expected": "version: v0.3.0\n\nendpoints:\n flights_api:\n endpoint: api.flightaware.com\n protocol: https\n connect_timeout: \"5s\"\n\nprompt_targets:\n - name: get_flight_status\n description: >\n Get real-time status, gate information, and delays for a specific flight number.\n Use when the user asks about a flight's current status, arrival time, or gate.\n parameters:\n - name: flight_number\n description: >\n IATA airline code followed by flight number, e.g., \"AA123\", \"UA456\", \"DL789\".\n Extract from user message — do not include spaces.\n type: str\n required: true\n format: \"^[A-Z]{2}[0-9]{1,4}$\" # Regex hint for validation\n\n - name: date\n description: >\n Flight date in YYYY-MM-DD format. Use today's date if not specified.\n type: str\n required: false\n format: date\n\n endpoint:\n name: flights_api\n path: /flights/{flight_number}?date={date}\n http_method: GET\n http_headers:\n Authorization: \"Bearer $FLIGHTAWARE_API_KEY\"\n\n - name: search_flights\n description: >\n Search for available flights between two cities or airports.\n Use when the user wants to find flights, compare options, or book travel.\n parameters:\n - name: origin\n description: Departure airport IATA code (e.g., \"JFK\", \"LAX\", \"ORD\")\n type: str\n required: true\n - name: destination\n description: Arrival airport IATA code (e.g., \"LHR\", \"CDG\", \"NRT\")\n type: str\n required: true\n - name: departure_date\n description: Departure date in YYYY-MM-DD format\n type: str\n required: true\n format: date\n - name: cabin_class\n description: Preferred cabin class\n type: str\n required: false\n default: economy\n enum: [economy, premium_economy, business, first]\n - name: passengers\n description: Number of adult passengers (1-9)\n type: int\n required: false\n default: 1\n\n endpoint:\n name: flights_api\n path: /search?from={origin}&to={destination}&date={departure_date}&class={cabin_class}&pax={passengers}\n http_method: GET\n http_headers:\n Authorization: \"Bearer $FLIGHTAWARE_API_KEY\"\n\n system_prompt: |\n You are a travel assistant. Present flight search results clearly,\n highlighting the best value options. Include price, duration, and\n number of stops for each option.\n\nmodel_providers:\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n default: true\n\nlisteners:\n - type: prompt\n name: travel_functions\n port: 10000\n timeout: \"30s\"", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Design Prompt Targets with Precise Parameter Schemas\" and explain how to fix it." 
+ } + }, + { + "id": "agent-descriptions", + "section": 3, + "sectionTitle": "Agent Orchestration", + "title": "Write Capability-Focused Agent Descriptions for Accurate Routing", + "impact": "HIGH", + "tags": [ + "agent", + "orchestration", + "descriptions", + "routing", + "multi-agent" + ], + "testCase": { + "description": "Detect and fix: \"Write Capability-Focused Agent Descriptions for Accurate Routing\"", + "input": "listeners:\n - type: agent\n name: orchestrator\n port: 8000\n router: plano_orchestrator_v1\n agents:\n - id: agent_1\n description: Helps users with information # Too generic — matches everything\n\n - id: agent_2\n description: Also helps users # Indistinguishable from agent_1", + "expected": "version: v0.3.0\n\nagents:\n - id: weather_agent\n url: http://host.docker.internal:8001\n - id: flight_agent\n url: http://host.docker.internal:8002\n - id: hotel_agent\n url: http://host.docker.internal:8003\n\nlisteners:\n - type: agent\n name: travel_orchestrator\n port: 8000\n router: plano_orchestrator_v1\n agents:\n - id: weather_agent\n description: >\n Provides real-time weather conditions and multi-day forecasts for any city\n worldwide. Handles questions about temperature, precipitation, wind, humidity,\n sunrise/sunset times, and severe weather alerts. Examples: \"What's the weather\n in Tokyo?\", \"Will it rain in London this weekend?\", \"Sunrise time in New York.\"\n\n - id: flight_agent\n description: >\n Provides live flight status, schedules, gate information, delays, and\n aircraft details for any flight number or route between airports.\n Handles questions about departures, arrivals, and airline information.\n Examples: \"Is AA123 on time?\", \"Flights from JFK to LAX tomorrow.\"\n\n - id: hotel_agent\n description: >\n Searches and books hotel accommodations, compares room types, pricing,\n and availability. Handles check-in/check-out dates, amenities, and\n cancellation policies. Examples: \"Hotels near Times Square for next Friday.\"", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Write Capability-Focused Agent Descriptions for Accurate Routing\" and explain how to fix it." 
+ } + }, + { + "id": "agent-orchestration", + "section": 3, + "sectionTitle": "Agent Orchestration", + "title": "Register All Sub-Agents in Both `agents` and `listeners.agents`", + "impact": "CRITICAL", + "tags": [ + "agent", + "orchestration", + "config", + "multi-agent" + ], + "testCase": { + "description": "Detect and fix: \"Register All Sub-Agents in Both `agents` and `listeners.agents`\"", + "input": "version: v0.3.0\n\nagents:\n - id: weather_agent\n url: http://host.docker.internal:8001\n - id: news_agent # Defined but never referenced in any listener\n url: http://host.docker.internal:8002\n\nlisteners:\n - type: agent\n name: orchestrator\n port: 8000\n router: plano_orchestrator_v1\n agents:\n - id: weather_agent\n description: Provides weather forecasts and current conditions.\n # news_agent is missing here — the orchestrator cannot route to it\n\nagents:\n - id: weather_agent\n url: http://host.docker.internal:8001\n\nlisteners:\n - type: agent\n name: orchestrator\n port: 8000\n router: plano_orchestrator_v1\n agents:\n - id: weather_agent\n description: Provides weather forecasts.\n - id: flights_agent # ID not in global agents[] — startup error\n description: Provides flight status information.", + "expected": "version: v0.3.0\n\nagents:\n - id: weather_agent\n url: http://host.docker.internal:8001\n - id: flights_agent\n url: http://host.docker.internal:8002\n - id: hotels_agent\n url: http://host.docker.internal:8003\n\nmodel_providers:\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n default: true\n\nlisteners:\n - type: agent\n name: travel_orchestrator\n port: 8000\n router: plano_orchestrator_v1\n agents:\n - id: weather_agent\n description: Real-time weather, forecasts, and climate data for any city.\n - id: flights_agent\n description: Live flight status, schedules, gates, and delays.\n - id: hotels_agent\n description: Hotel search, availability, pricing, and booking.\n default: true # Fallback if no other agent matches", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Register All Sub-Agents in Both `agents` and `listeners.agents`\" and explain how to fix it." + } + }, + { + "id": "config-listeners", + "section": 1, + "sectionTitle": "Configuration Fundamentals", + "title": "Choose the Right Listener Type for Your Use Case", + "impact": "CRITICAL", + "tags": [ + "config", + "listeners", + "architecture", + "routing" + ], + "testCase": { + "description": "Detect and fix: \"Choose the Right Listener Type for Your Use Case\"", + "input": "version: v0.3.0\n\n# Wrong: a model listener cannot route to backend agent services\nlisteners:\n - type: model\n name: main\n port: 12000\n\nagents:\n - id: weather_agent\n url: http://host.docker.internal:8001", + "expected": "version: v0.3.0\n\nagents:\n - id: weather_agent\n url: http://host.docker.internal:8001\n - id: travel_agent\n url: http://host.docker.internal:8002\n\nlisteners:\n - type: agent\n name: orchestrator\n port: 8000\n router: plano_orchestrator_v1\n agents:\n - id: weather_agent\n description: Provides real-time weather, forecasts, and conditions for any city.\n - id: travel_agent\n description: Books flights, hotels, and travel itineraries.\n\nmodel_providers:\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n default: true", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Choose the Right Listener Type for Your Use Case\" and explain how to fix it." 
+ } + }, + { + "id": "config-providers", + "section": 1, + "sectionTitle": "Configuration Fundamentals", + "title": "Register Model Providers with Correct Format Identifiers", + "impact": "CRITICAL", + "tags": [ + "config", + "model-providers", + "llm", + "api-format" + ], + "testCase": { + "description": "Detect and fix: \"Register Model Providers with Correct Format Identifiers\"", + "input": "model_providers:\n - model: gpt-4o # Missing openai/ prefix — Plano cannot route this\n access_key: $OPENAI_API_KEY\n\n - model: claude-3-5-sonnet # Missing anthropic/ prefix\n access_key: $ANTHROPIC_API_KEY", + "expected": "model_providers:\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n default: true\n\n - model: anthropic/claude-sonnet-4-20250514\n access_key: $ANTHROPIC_API_KEY\n\n - model: gemini/gemini-2.0-flash\n access_key: $GOOGLE_API_KEY\n\nmodel_providers:\n - model: custom/llama3\n base_url: http://host.docker.internal:11434/v1 # Ollama endpoint\n provider_interface: openai # Ollama speaks OpenAI format\n default: true", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Register Model Providers with Correct Format Identifiers\" and explain how to fix it." + } + }, + { + "id": "config-secrets", + "section": 1, + "sectionTitle": "Configuration Fundamentals", + "title": "Use Environment Variable Substitution for All Secrets", + "impact": "CRITICAL", + "tags": [ + "config", + "security", + "secrets", + "api-keys", + "environment-variables" + ], + "testCase": { + "description": "Detect and fix: \"Use Environment Variable Substitution for All Secrets\"", + "input": "version: v0.3.0\n\nmodel_providers:\n - model: openai/gpt-4o\n access_key: abcdefghijklmnopqrstuvwxyz... # Hardcoded — never do this\n\nstate_storage:\n type: postgres\n connection_string: \"postgresql://admin:mysecretpassword@prod-db:5432/plano\"\n\nprompt_targets:\n - name: get_data\n endpoint:\n name: my_api\n http_headers:\n Authorization: \"Bearer abcdefghijklmnopqrstuvwxyz\" # Hardcoded token", + "expected": "version: v0.3.0\n\nmodel_providers:\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n default: true\n\n - model: anthropic/claude-sonnet-4-20250514\n access_key: $ANTHROPIC_API_KEY\n\nstate_storage:\n type: postgres\n connection_string: \"postgresql://${DB_USER}:${DB_PASS}@${DB_HOST}:5432/${DB_NAME}\"\n\nprompt_targets:\n - name: get_data\n endpoint:\n name: my_api\n http_headers:\n Authorization: \"Bearer $MY_API_TOKEN\"\n\n# .env — add to .gitignore\nOPENAI_API_KEY=abcdefghijklmnopqrstuvwxyz...\nANTHROPIC_API_KEY=abcdefghijklmnopqrstuvwxyz...\nDB_USER=plano\nDB_PASS=secure-password\nDB_HOST=localhost\nMY_API_TOKEN=abcdefghijklmnopqrstuvwxyz...", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Use Environment Variable Substitution for All Secrets\" and explain how to fix it." 
+ } + }, + { + "id": "config-version", + "section": 1, + "sectionTitle": "Configuration Fundamentals", + "title": "Always Specify a Supported Config Version", + "impact": "CRITICAL", + "tags": [ + "config", + "versioning", + "validation" + ], + "testCase": { + "description": "Detect and fix: \"Always Specify a Supported Config Version\"", + "input": "# No version field — fails schema validation\nlisteners:\n - type: model\n name: model_listener\n port: 12000\n\nmodel_providers:\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY", + "expected": "version: v0.3.0\n\nlisteners:\n - type: model\n name: model_listener\n port: 12000\n\nmodel_providers:\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n default: true", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Always Specify a Supported Config Version\" and explain how to fix it." + } + }, + { + "id": "deploy-docker", + "section": 7, + "sectionTitle": "Deployment & Security", + "title": "Understand Plano's Docker Network Topology for Agent URL Configuration", + "impact": "HIGH", + "tags": [ + "deployment", + "docker", + "networking", + "agents", + "urls" + ], + "testCase": { + "description": "Detect and fix: \"Understand Plano's Docker Network Topology for Agent URL Configuration\"", + "input": "version: v0.3.0\n\nagents:\n - id: weather_agent\n url: http://localhost:8001 # Wrong: this is Plano's own container\n\n - id: flight_agent\n url: http://127.0.0.1:8002 # Wrong: same issue\n\nfilters:\n - id: input_guards\n url: http://localhost:10500 # Wrong: filter server unreachable", + "expected": "version: v0.3.0\n\nagents:\n - id: weather_agent\n url: http://host.docker.internal:8001 # Correct: reaches host port 8001\n\n - id: flight_agent\n url: http://host.docker.internal:8002 # Correct: reaches host port 8002\n\nfilters:\n - id: input_guards\n url: http://host.docker.internal:10500 # Correct: reaches filter server on host\n\nendpoints:\n internal_api:\n endpoint: host.docker.internal # Correct for internal API on host\n protocol: http\n\n# Kubernetes / Docker Compose — use service names\nagents:\n - id: weather_agent\n url: http://weather-service:8001 # Kubernetes service DNS\n\n# External cloud services — use full domain\nagents:\n - id: cloud_agent\n url: https://my-agent.us-east-1.amazonaws.com/v1\n\n# Custom TLS (self-signed or internal CA)\noverrides:\n upstream_tls_ca_path: /etc/ssl/certs/internal-ca.pem", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Understand Plano's Docker Network Topology for Agent URL Configuration\" and explain how to fix it." 
+ } + }, + { + "id": "deploy-state", + "section": 7, + "sectionTitle": "Deployment & Security", + "title": "Use PostgreSQL State Storage for Multi-Turn Conversations in Production", + "impact": "HIGH", + "tags": [ + "deployment", + "state", + "postgres", + "memory", + "multi-turn", + "production" + ], + "testCase": { + "description": "Detect and fix: \"Use PostgreSQL State Storage for Multi-Turn Conversations in Production\"", + "input": "version: v0.3.0\n\n# Memory storage — all conversations lost on planoai down / container restart\nstate_storage:\n type: memory\n\nlisteners:\n - type: agent\n name: customer_support\n port: 8000\n router: plano_orchestrator_v1\n agents:\n - id: support_agent\n description: Customer support assistant with conversation history.", + "expected": "version: v0.3.0\n\nstate_storage:\n type: postgres\n connection_string: \"postgresql://${DB_USER}:${DB_PASS}@${DB_HOST}:5432/${DB_NAME}\"\n\nlisteners:\n - type: agent\n name: customer_support\n port: 8000\n router: plano_orchestrator_v1\n agents:\n - id: support_agent\n description: Customer support assistant with access to full conversation history.\n\nmodel_providers:\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n default: true\n\n# Start PostgreSQL with Docker\ndocker run -d \\\n --name plano-postgres \\\n -e POSTGRES_USER=plano \\\n -e POSTGRES_PASSWORD=devpassword \\\n -e POSTGRES_DB=plano \\\n -p 5432:5432 \\\n postgres:16\n\n# Set environment variables\nexport DB_USER=plano\nexport DB_PASS=devpassword\nexport DB_HOST=host.docker.internal # Use host.docker.internal from inside Plano container\nexport DB_NAME=plano\n\nDB_USER=plano_prod\nDB_PASS=\nDB_HOST=your-rds-endpoint.amazonaws.com\nDB_NAME=plano", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Use PostgreSQL State Storage for Multi-Turn Conversations in Production\" and explain how to fix it." + } + }, + { + "id": "filter-guardrails", + "section": 4, + "sectionTitle": "Filter Chains & Guardrails", + "title": "Configure Prompt Guards with Actionable Rejection Messages", + "impact": "MEDIUM", + "tags": [ + "filter", + "guardrails", + "jailbreak", + "security", + "ux" + ], + "testCase": { + "description": "Detect and fix: \"Configure Prompt Guards with Actionable Rejection Messages\"", + "input": "version: v0.3.0\n\nprompt_guards:\n input_guards:\n jailbreak:\n on_exception: {} # Empty — returns unhelpful generic error\n\nprompt_guards:\n input_guards:\n jailbreak:\n on_exception:\n message: \"Error code 403: guard triggered\" # Unhelpful to the user", + "expected": "version: v0.3.0\n\nprompt_guards:\n input_guards:\n jailbreak:\n on_exception:\n message: >\n I'm not able to help with that request. This assistant is designed\n to help with [your use case, e.g., customer support, coding questions].\n Please rephrase your question or contact support@yourdomain.com\n if you believe this is an error.\n\n# Built-in jailbreak detection (fast, no external service needed)\nprompt_guards:\n input_guards:\n jailbreak:\n on_exception:\n message: \"This request cannot be processed. 
Please ask about our products and services.\"\n\n# MCP-based custom guards for additional policy enforcement\nfilters:\n - id: topic_restriction\n url: http://host.docker.internal:10500\n type: mcp\n transport: streamable-http\n tool: topic_restriction # Custom filter for domain-specific restrictions\n\nlisteners:\n - type: agent\n name: customer_support\n port: 8000\n router: plano_orchestrator_v1\n agents:\n - id: support_agent\n description: Customer support assistant for product questions and order issues.\n filter_chain:\n - topic_restriction # Additional custom topic filtering", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Configure Prompt Guards with Actionable Rejection Messages\" and explain how to fix it." + } + }, + { + "id": "filter-mcp", + "section": 4, + "sectionTitle": "Filter Chains & Guardrails", + "title": "Configure MCP Filters with Explicit Type and Transport", + "impact": "MEDIUM", + "tags": [ + "filter", + "mcp", + "integration", + "configuration" + ], + "testCase": { + "description": "Detect and fix: \"Configure MCP Filters with Explicit Type and Transport\"", + "input": "filters:\n - id: my_guard # Plano infers type=mcp, transport=streamable-http, tool=my_guard\n url: http://localhost:10500\n # If your MCP server uses a different tool name or transport, this silently misroutes", + "expected": "version: v0.3.0\n\nfilters:\n - id: input_guards\n url: http://host.docker.internal:10500\n type: mcp # Explicitly MCP protocol\n transport: streamable-http # Streamable HTTP transport\n tool: input_guards # MCP tool name (matches MCP server registration)\n\n - id: query_rewriter\n url: http://host.docker.internal:10501\n type: mcp\n transport: streamable-http\n tool: rewrite_query # Tool name differs from filter ID — explicit is safer\n\n - id: custom_validator\n url: http://host.docker.internal:10503\n type: http # Plain HTTP filter (not MCP)\n # No tool field for HTTP filters\n\nfilters:\n - id: auth_validator\n url: http://host.docker.internal:9000/validate\n type: http # Plano POSTs the request, expects the modified request back", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Configure MCP Filters with Explicit Type and Transport\" and explain how to fix it." 
+ } + }, + { + "id": "filter-ordering", + "section": 4, + "sectionTitle": "Filter Chains & Guardrails", + "title": "Order Filter Chains with Guards First, Enrichment Last", + "impact": "HIGH", + "tags": [ + "filter", + "guardrails", + "security", + "pipeline", + "ordering" + ], + "testCase": { + "description": "Detect and fix: \"Order Filter Chains with Guards First, Enrichment Last\"", + "input": "filters:\n - id: context_builder\n url: http://host.docker.internal:10502 # Runs expensive RAG retrieval first\n - id: query_rewriter\n url: http://host.docker.internal:10501\n - id: input_guards\n url: http://host.docker.internal:10500 # Guards run last — jailbreak gets context\n\nlisteners:\n - type: agent\n name: rag_orchestrator\n port: 8000\n router: plano_orchestrator_v1\n agents:\n - id: rag_agent\n filter_chain:\n - context_builder # Wrong: expensive enrichment before safety check\n - query_rewriter\n - input_guards", + "expected": "version: v0.3.0\n\nfilters:\n - id: input_guards\n url: http://host.docker.internal:10500\n type: mcp\n transport: streamable-http\n - id: query_rewriter\n url: http://host.docker.internal:10501\n type: mcp\n transport: streamable-http\n - id: context_builder\n url: http://host.docker.internal:10502\n type: mcp\n transport: streamable-http\n\nlisteners:\n - type: agent\n name: rag_orchestrator\n port: 8000\n router: plano_orchestrator_v1\n agents:\n - id: rag_agent\n description: Answers questions using internal knowledge base documents.\n filter_chain:\n - input_guards # 1. Block jailbreaks and policy violations\n - query_rewriter # 2. Normalize the safe query\n - context_builder # 3. Retrieve relevant context for the clean query", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Order Filter Chains with Guards First, Enrichment Last\" and explain how to fix it." 
+ } + }, + { + "id": "observe-span-attributes", + "section": 5, + "sectionTitle": "Observability & Debugging", + "title": "Add Custom Span Attributes for Correlation and Filtering", + "impact": "MEDIUM", + "tags": [ + "observability", + "tracing", + "span-attributes", + "correlation" + ], + "testCase": { + "description": "Detect and fix: \"Add Custom Span Attributes for Correlation and Filtering\"", + "input": "tracing:\n random_sampling: 20\n # No span_attributes — cannot filter by user, session, or environment", + "expected": "version: v0.3.0\n\ntracing:\n random_sampling: 20\n trace_arch_internal: true\n\n span_attributes:\n # Match all headers with this prefix, then map to span attributes by:\n # 1) stripping the prefix and 2) converting hyphens to dots\n header_prefixes:\n - x-katanemo-\n\n # Static attributes added to every span from this Plano instance\n static:\n environment: production\n service.name: plano-gateway\n deployment.region: us-east-1\n service.version: \"2.1.0\"\n team: platform-engineering\n\nimport httpx\n\nresponse = httpx.post(\n \"http://localhost:12000/v1/chat/completions\",\n headers={\n \"x-katanemo-request-id\": \"req_abc123\",\n \"x-katanemo-user-id\": \"usr_12\",\n \"x-katanemo-session-id\": \"sess_xyz456\",\n \"x-katanemo-tenant-id\": \"acme-corp\",\n },\n json={\"model\": \"plano.v1\", \"messages\": [...]}\n)\n\n# Find all requests from a specific user\nplanoai trace --where user.id=usr_12\n\n# Find all traces from production environment\nplanoai trace --where environment=production\n\n# Find traces from a specific tenant\nplanoai trace --where tenant.id=acme-corp", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Add Custom Span Attributes for Correlation and Filtering\" and explain how to fix it." + } + }, + { + "id": "observe-tracing", + "section": 5, + "sectionTitle": "Observability & Debugging", + "title": "Enable Tracing with Appropriate Sampling for Your Environment", + "impact": "HIGH", + "tags": [ + "observability", + "tracing", + "opentelemetry", + "otel", + "debugging" + ], + "testCase": { + "description": "Detect and fix: \"Enable Tracing with Appropriate Sampling for Your Environment\"", + "input": "version: v0.3.0\n\nlisteners:\n - type: model\n name: model_listener\n port: 12000\n\nmodel_providers:\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n default: true\n\n# No tracing block — no visibility into routing, latency, or errors", + "expected": "version: v0.3.0\n\nlisteners:\n - type: model\n name: model_listener\n port: 12000\n\nmodel_providers:\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n default: true\n\ntracing:\n random_sampling: 100 # 100% for development/debugging\n trace_arch_internal: true # Include Plano's internal routing spans\n\ntracing:\n random_sampling: 10 # Sample 10% of requests in production\n trace_arch_internal: false # Skip internal spans to reduce noise\n span_attributes:\n header_prefixes:\n - x-katanemo- # Match all x-katanemo-* headers\n static:\n environment: production\n service.name: my-plano-service\n version: \"1.0.0\"\n\n# Start Plano with built-in OTEL collector\nplanoai up config.yaml --with-tracing", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Enable Tracing with Appropriate Sampling for Your Environment\" and explain how to fix it." 
+ } + }, + { + "id": "routing-aliases", + "section": 2, + "sectionTitle": "Routing & Model Selection", + "title": "Use Model Aliases for Semantic, Stable Model References", + "impact": "MEDIUM", + "tags": [ + "routing", + "model-aliases", + "maintainability", + "client-integration" + ], + "testCase": { + "description": "Detect and fix: \"Use Model Aliases for Semantic, Stable Model References\"", + "input": "# config.yaml — no aliases defined\nversion: v0.3.0\n\nlisteners:\n - type: model\n name: model_listener\n port: 12000\n\nmodel_providers:\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n default: true\n\n# Client code — brittle, must be updated when model changes\nclient.chat.completions.create(model=\"gpt-4o\", ...)", + "expected": "version: v0.3.0\n\nlisteners:\n - type: model\n name: model_listener\n port: 12000\n\nmodel_providers:\n - model: openai/gpt-4o-mini\n access_key: $OPENAI_API_KEY\n default: true\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n - model: anthropic/claude-sonnet-4-20250514\n access_key: $ANTHROPIC_API_KEY\n\nmodel_aliases:\n plano.fast.v1:\n target: gpt-4o-mini # Cheap, fast — for high-volume tasks\n\n plano.smart.v1:\n target: gpt-4o # High capability — for complex reasoning\n\n plano.creative.v1:\n target: claude-sonnet-4-20250514 # Strong creative writing and analysis\n\n plano.v1:\n target: gpt-4o # Default production alias\n\n# Client code — stable, alias is the contract\nclient.chat.completions.create(model=\"plano.smart.v1\", ...)", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Use Model Aliases for Semantic, Stable Model References\" and explain how to fix it." + } + }, + { + "id": "routing-default", + "section": 2, + "sectionTitle": "Routing & Model Selection", + "title": "Always Set Exactly One Default Model Provider", + "impact": "HIGH", + "tags": [ + "routing", + "defaults", + "model-providers", + "reliability" + ], + "testCase": { + "description": "Detect and fix: \"Always Set Exactly One Default Model Provider\"", + "input": "version: v0.3.0\n\nmodel_providers:\n - model: openai/gpt-4o-mini # No default: true anywhere\n access_key: $OPENAI_API_KEY\n routing_preferences:\n - name: summarization\n description: Summarizing documents and extracting key points\n\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n routing_preferences:\n - name: code_generation\n description: Writing new functions and implementing algorithms\n\nmodel_providers:\n - model: openai/gpt-4o-mini\n default: true # First default\n access_key: $OPENAI_API_KEY\n\n - model: openai/gpt-4o\n default: true # Second default — confusing\n access_key: $OPENAI_API_KEY", + "expected": "version: v0.3.0\n\nmodel_providers:\n - model: openai/gpt-4o-mini\n access_key: $OPENAI_API_KEY\n default: true # Handles general/unclassified requests\n routing_preferences:\n - name: summarization\n description: Summarizing documents, articles, and meeting notes\n - name: classification\n description: Categorizing inputs, labeling, and intent detection\n\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n routing_preferences:\n - name: code_generation\n description: Writing, debugging, and reviewing code\n - name: complex_reasoning\n description: Multi-step math, logical analysis, research synthesis", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Always Set Exactly One Default Model Provider\" and explain how to fix it." 
+ } + }, + { + "id": "routing-passthrough", + "section": 2, + "sectionTitle": "Routing & Model Selection", + "title": "Use Passthrough Auth for Proxy and Multi-Tenant Setups", + "impact": "MEDIUM", + "tags": [ + "routing", + "authentication", + "proxy", + "litellm", + "multi-tenant" + ], + "testCase": { + "description": "Detect and fix: \"Use Passthrough Auth for Proxy and Multi-Tenant Setups\"", + "input": "model_providers:\n - model: custom/proxy\n base_url: http://host.docker.internal:8000\n access_key: $SOME_KEY # Plano overwrites the client's auth — proxy rejects it", + "expected": "version: v0.3.0\n\nlisteners:\n - type: model\n name: model_listener\n port: 12000\n\nmodel_providers:\n - model: custom/litellm-proxy\n base_url: http://host.docker.internal:4000 # LiteLLM server\n provider_interface: openai # LiteLLM uses OpenAI format\n passthrough_auth: true # Forward client's Bearer token\n default: true\n\nmodel_providers:\n # Plano acts as a passthrough gateway; each client has their own OpenAI key\n - model: openai/gpt-4o\n passthrough_auth: true # No access_key here — client's key is forwarded\n default: true\n\nmodel_providers:\n - model: openai/gpt-4o-mini\n access_key: $OPENAI_API_KEY # Plano manages this key\n default: true\n routing_preferences:\n - name: quick tasks\n description: Short answers, simple lookups, fast completions\n\n - model: custom/vllm-llama\n base_url: http://gpu-server:8000\n provider_interface: openai\n passthrough_auth: true # vLLM cluster handles its own auth\n routing_preferences:\n - name: long context\n description: Processing very long documents, multi-document analysis", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Use Passthrough Auth for Proxy and Multi-Tenant Setups\" and explain how to fix it." 
+ } + }, + { + "id": "routing-preferences", + "section": 2, + "sectionTitle": "Routing & Model Selection", + "title": "Write Task-Specific Routing Preference Descriptions", + "impact": "HIGH", + "tags": [ + "routing", + "model-selection", + "preferences", + "llm-routing" + ], + "testCase": { + "description": "Detect and fix: \"Write Task-Specific Routing Preference Descriptions\"", + "input": "model_providers:\n - model: openai/gpt-4o-mini\n access_key: $OPENAI_API_KEY\n default: true\n routing_preferences:\n - name: simple\n description: easy tasks # Too vague — what is \"easy\"?\n\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n routing_preferences:\n - name: hard\n description: hard tasks # Too vague — overlaps with \"easy\"", + "expected": "model_providers:\n - model: openai/gpt-4o-mini\n access_key: $OPENAI_API_KEY\n default: true\n routing_preferences:\n - name: summarization\n description: >\n Summarizing documents, articles, emails, or meeting transcripts.\n Extracting key points, generating TL;DR sections, condensing long text.\n - name: classification\n description: >\n Categorizing inputs, sentiment analysis, spam detection,\n intent classification, labeling structured data fields.\n - name: translation\n description: >\n Translating text between languages, localization tasks.\n\n - model: openai/gpt-4o\n access_key: $OPENAI_API_KEY\n routing_preferences:\n - name: code_generation\n description: >\n Writing new functions, classes, or modules from scratch.\n Implementing algorithms, boilerplate generation, API integrations.\n - name: code_review\n description: >\n Reviewing code for bugs, security vulnerabilities, performance issues.\n Suggesting refactors, explaining complex code, debugging errors.\n - name: complex_reasoning\n description: >\n Multi-step math problems, logical deduction, strategic planning,\n research synthesis requiring chain-of-thought reasoning.", + "evaluationPrompt": "Given the following Plano config or CLI usage, identify if it violates the rule \"Write Task-Specific Routing Preference Descriptions\" and explain how to fix it." + } + } +] diff --git a/skills/tsconfig.json b/skills/tsconfig.json new file mode 100644 index 00000000..83552abb --- /dev/null +++ b/skills/tsconfig.json @@ -0,0 +1,15 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "NodeNext", + "moduleResolution": "NodeNext", + "lib": ["ES2022"], + "strict": true, + "noEmit": true, + "types": ["node"], + "skipLibCheck": true, + "resolveJsonModule": true, + "forceConsistentCasingInFileNames": true + }, + "include": ["src/**/*.ts"] +} diff --git a/tests/parity/signals/.gitignore b/tests/parity/signals/.gitignore new file mode 100644 index 00000000..3a7e0d4f --- /dev/null +++ b/tests/parity/signals/.gitignore @@ -0,0 +1,4 @@ +out/ +.venv/ +__pycache__/ +*.pyc diff --git a/tests/parity/signals/README.md b/tests/parity/signals/README.md new file mode 100644 index 00000000..67193d60 --- /dev/null +++ b/tests/parity/signals/README.md @@ -0,0 +1,98 @@ +# Signals Parity Harness + +Validates that `crates/brightstaff/src/signals/` (Rust port) produces the same +`SignalReport` as the Python reference at +on a fixed sample of `lmsys/lmsys-chat-1m` conversations. + +This harness is **not** part of normal CI. It downloads several GB and is run +on demand to gate releases of the signals subsystem (or to investigate +regressions reported in production). + +## What gets compared + +For each conversation, both analyzers emit a `SignalReport`. 
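A minimal
+sketch of the report shape as the comparator consumes it is shown below; the
+field names are the ones `compare.py` reads, while the concrete `signal_type`
+strings and values are illustrative assumptions, not canonical output:
+
+```json
+{
+  "overall_quality": "good",
+  "interaction_signals": {
+    "misalignment": {
+      "signals": [
+        {"signal_type": "interaction.misalignment.correction", "message_index": 2}
+      ]
+    },
+    "satisfaction": {"signals": []}
+  },
+  "execution_signals": {},
+  "environment_signals": {}
+}
+```
+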
The comparator
+classifies any divergence into three tiers:
+
+| Tier | Field | Action on divergence |
+|------|------------------------------------------------|----------------------|
+| A | set of `SignalType` present, per-type counts, `overall_quality` | Fail the run |
+| B | per-instance `message_index`, instance counts per type | Log + collect, do not fail |
+| C | metadata, snippet text, summary | Information only |
+
+Quality buckets are compared by string (`excellent` / `good` / ...).
+
+## What this harness does *not* cover
+
+`lmsys-chat-1m` is plain user/assistant chat. It exercises the **interaction**
+layer well (misalignment, stagnation, disengagement, satisfaction) but does
+**not** exercise:
+
+- `execution.failure.*`
+- `execution.loops.*`
+- `environment.exhaustion.*`
+
+Those signals require `function_call` / `observation` ShareGPT roles. They are
+covered by the Rust unit tests and the Python repo's own test fixtures, both
+of which run on every PR. A synthetic tool-trace dataset for full coverage is
+deferred to a follow-up.
+
+## One-time setup
+
+```bash
+# 1. Build the Rust replay binary.
+cd ../../../crates && cargo build --release -p brightstaff --bin signals_replay
+
+# 2. Set up the Python environment for the harness driver.
+cd ../tests/parity/signals
+python3 -m venv .venv && source .venv/bin/activate
+pip install -r requirements.txt
+
+# 3. Install the Python signals reference.
+# Either point at a local checkout:
+pip install -e /path/to/signals
+# or pull from git:
+pip install 'signals @ git+https://github.com/katanemo/signals@'
+```
+
+## Running
+
+```bash
+source .venv/bin/activate
+
+python run_parity.py \
+    --num-samples 2000 \
+    --seed 42 \
+    --dataset-revision \
+    --rust-binary ../../../crates/target/release/signals_replay \
+    --output-dir out/
+
+python compare.py --output-dir out/
+```
+
+`run_parity.py` will:
+
+1. Download `lmsys/lmsys-chat-1m` (cached in `~/.cache/huggingface`).
+2. Pick `--num-samples` rows under `--seed`.
+3. Convert each to ShareGPT, write `out/conversations.jsonl`.
+4. Run the Rust binary as a subprocess → `out/rust_reports.jsonl`.
+5. Run the Python analyzer in-process → `out/python_reports.jsonl`.
+
+`compare.py` reads both report files and writes:
+
+- `out/diffs.jsonl` — one record per mismatched conversation, with tier + structural diff
+- `out/metrics.json` — agreement %, per-`SignalType` disagreement counts, quality-bucket confusion matrix
+- `out/summary.md` — human-readable PR-ready report
+
+Exit code is non-zero iff any Tier-A divergence is observed.
+
+## Reproducibility
+
+Every run pins:
+
+- `dataset_revision` — the HF dataset commit
+- `seed` — RNG seed for sampling
+- `signals_python_version` — `pip show signals` version
+- `plano_git_sha` — `git rev-parse HEAD` of this repo
+- `rust_binary_sha256` — SHA-256 of the `signals_replay` binary
+
+All are written to `out/run_metadata.json` by `run_parity.py` and folded into
+`metrics.json` by `compare.py`. diff --git a/tests/parity/signals/_smoke_test.py b/tests/parity/signals/_smoke_test.py new file mode 100644 index 00000000..68c6e879 --- /dev/null +++ b/tests/parity/signals/_smoke_test.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +""" +Local smoke test for the parity harness — runs both runners on a tiny +hand-picked set of conversations without touching the lmsys dataset.
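+Each sample targets one interaction-layer behavior (gratitude, escalation,
+user correction, rephrase) plus one clean control; execution and environment
+signals are not exercised here (see README.md).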
+ +Run from this directory: + python _smoke_test.py --rust-binary +""" + +from __future__ import annotations + +import argparse +import json +import subprocess +import sys +from pathlib import Path + +from signals.analyzer import SignalAnalyzer + +SAMPLES = [ + { + "id": "smoke-gratitude", + "messages": [ + {"from": "human", "value": "What is the weather in Istanbul?"}, + {"from": "gpt", "value": "Istanbul is 14C and partly cloudy."}, + {"from": "human", "value": "That worked, exactly what I needed. Thanks!"}, + ], + }, + { + "id": "smoke-escalation", + "messages": [ + {"from": "human", "value": "This isn't helpful at all"}, + {"from": "gpt", "value": "I'm sorry, can you tell me more?"}, + {"from": "human", "value": "Get me a human, this is useless"}, + ], + }, + { + "id": "smoke-correction", + "messages": [ + {"from": "human", "value": "Book me a flight to NYC for tomorrow"}, + {"from": "gpt", "value": "Sure, here are flights to NYC for Friday."}, + { + "from": "human", + "value": "No, I meant flights for Saturday, not tomorrow", + }, + ], + }, + { + "id": "smoke-clean", + "messages": [ + {"from": "human", "value": "Hi"}, + {"from": "gpt", "value": "Hello, how can I help?"}, + ], + }, + { + "id": "smoke-rephrase", + "messages": [ + {"from": "human", "value": "Can you summarize the news please"}, + {"from": "gpt", "value": "Sure, here is a summary."}, + {"from": "human", "value": "Could you please summarize the news"}, + ], + }, +] + + +def main() -> int: + p = argparse.ArgumentParser() + p.add_argument("--rust-binary", required=True, type=Path) + args = p.parse_args() + + out_dir = Path("out_smoke") + out_dir.mkdir(exist_ok=True) + conv_path = out_dir / "conversations.jsonl" + rust_path = out_dir / "rust_reports.jsonl" + py_path = out_dir / "python_reports.jsonl" + + with conv_path.open("w") as f: + for s in SAMPLES: + f.write(json.dumps(s) + "\n") + + with conv_path.open("rb") as fin, rust_path.open("wb") as fout: + proc = subprocess.run( + [str(args.rust_binary)], stdin=fin, stdout=fout, stderr=subprocess.PIPE + ) + if proc.returncode != 0: + sys.stderr.write(proc.stderr.decode("utf-8", errors="replace")) + return 2 + + analyzer = SignalAnalyzer() + with conv_path.open() as fin, py_path.open("w") as fout: + for line in fin: + obj = json.loads(line) + r = analyzer.analyze(obj["messages"]) + fout.write(json.dumps({"id": obj["id"], "report": r.to_dict()}) + "\n") + + rc = subprocess.call( + [sys.executable, "compare.py", "--output-dir", str(out_dir)], + ) + return rc + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/parity/signals/compare.py b/tests/parity/signals/compare.py new file mode 100644 index 00000000..80f56295 --- /dev/null +++ b/tests/parity/signals/compare.py @@ -0,0 +1,333 @@ +#!/usr/bin/env python3 +""" +Diff Rust vs Python signal reports produced by run_parity.py. + +See README.md for the tier definitions. Exits non-zero iff any Tier-A +divergence is found. 
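+
+A Tier-A record in `diffs.jsonl` looks roughly like this (the shape is what
+`compare_one` emits; the id and values are illustrative):
+
+    {"id": "lmsys-123", "tier": "A", "kind": "signal_or_quality_mismatch",
+     "quality": {"python": "good", "rust": "neutral"},
+     "count_diff": [{"signal_type": "interaction.misalignment.correction",
+                     "python": 1, "rust": 0}]}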
+""" + +from __future__ import annotations + +import argparse +import json +import sys +from collections import Counter, defaultdict +from pathlib import Path +from typing import Any, Dict, List, Tuple + +CATEGORIES_BY_LAYER = { + "interaction_signals": [ + "misalignment", + "stagnation", + "disengagement", + "satisfaction", + ], + "execution_signals": ["failure", "loops"], + "environment_signals": ["exhaustion"], +} + + +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser(description=__doc__) + p.add_argument("--output-dir", type=Path, default=Path("out")) + return p.parse_args() + + +def load_jsonl(path: Path) -> Dict[str, Dict[str, Any]]: + """Load a JSONL file keyed by `id`. Lines with errors are still indexed.""" + out: Dict[str, Dict[str, Any]] = {} + with path.open() as f: + for line in f: + line = line.strip() + if not line: + continue + obj = json.loads(line) + out[str(obj.get("id"))] = obj + return out + + +def per_type_counts(report: Dict[str, Any]) -> Dict[str, int]: + """Return {signal_type: count} across all groups in a report dict.""" + counts: Counter[str] = Counter() + for layer in CATEGORIES_BY_LAYER: + groups = report.get(layer, {}) or {} + for category in CATEGORIES_BY_LAYER[layer]: + group = groups.get(category) + if not group: + continue + for sig in group.get("signals", []) or []: + counts[sig["signal_type"]] += 1 + return dict(counts) + + +def per_type_indices(report: Dict[str, Any]) -> Dict[str, List[int]]: + out: Dict[str, List[int]] = defaultdict(list) + for layer in CATEGORIES_BY_LAYER: + groups = report.get(layer, {}) or {} + for category in CATEGORIES_BY_LAYER[layer]: + group = groups.get(category) + if not group: + continue + for sig in group.get("signals", []) or []: + out[sig["signal_type"]].append(sig.get("message_index")) + for k in out: + out[k].sort(key=lambda x: (x is None, x)) + return dict(out) + + +def diff_counts(a: Dict[str, int], b: Dict[str, int]) -> List[Tuple[str, int, int]]: + """Return [(signal_type, a_count, b_count)] for entries that differ.""" + keys = set(a) | set(b) + out = [] + for k in sorted(keys): + ac = a.get(k, 0) + bc = b.get(k, 0) + if ac != bc: + out.append((k, ac, bc)) + return out + + +def diff_indices( + a: Dict[str, List[int]], b: Dict[str, List[int]] +) -> List[Tuple[str, List[int], List[int]]]: + keys = set(a) | set(b) + out = [] + for k in sorted(keys): + ai = a.get(k, []) + bi = b.get(k, []) + if ai != bi: + out.append((k, ai, bi)) + return out + + +def compare_one( + convo_id: str, py: Dict[str, Any], rust: Dict[str, Any] +) -> Dict[str, Any] | None: + """Compare a single conversation. 
Return diff record, or None if identical.""" + if "error" in py or "error" in rust: + return { + "id": convo_id, + "tier": "A", + "kind": "error_in_runner", + "python_error": py.get("error"), + "rust_error": rust.get("error"), + } + py_report = py["report"] + rust_report = rust["report"] + + py_counts = per_type_counts(py_report) + rust_counts = per_type_counts(rust_report) + count_diff = diff_counts(py_counts, rust_counts) + + py_quality = py_report.get("overall_quality") + rust_quality = rust_report.get("overall_quality") + quality_mismatch = py_quality != rust_quality + + if count_diff or quality_mismatch: + return { + "id": convo_id, + "tier": "A", + "kind": "signal_or_quality_mismatch", + "quality": {"python": py_quality, "rust": rust_quality}, + "count_diff": [ + {"signal_type": st, "python": pc, "rust": rc} + for (st, pc, rc) in count_diff + ], + } + + py_idx = per_type_indices(py_report) + rust_idx = per_type_indices(rust_report) + idx_diff = diff_indices(py_idx, rust_idx) + if idx_diff: + return { + "id": convo_id, + "tier": "B", + "kind": "instance_index_mismatch", + "diff": [ + {"signal_type": st, "python_indices": pi, "rust_indices": ri} + for (st, pi, ri) in idx_diff + ], + } + + return None + + +def confusion_matrix( + pairs: List[Tuple[str, str]], labels: List[str] +) -> Dict[str, Dict[str, int]]: + cm: Dict[str, Dict[str, int]] = {a: {b: 0 for b in labels} for a in labels} + for py, rust in pairs: + if py not in cm: + cm[py] = {b: 0 for b in labels} + if rust not in cm[py]: + cm[py][rust] = 0 + cm[py][rust] += 1 + return cm + + +def main() -> int: + args = parse_args() + out_dir = args.output_dir + + py_reports = load_jsonl(out_dir / "python_reports.jsonl") + rust_reports = load_jsonl(out_dir / "rust_reports.jsonl") + + common_ids = sorted(set(py_reports) & set(rust_reports)) + only_py = sorted(set(py_reports) - set(rust_reports)) + only_rust = sorted(set(rust_reports) - set(py_reports)) + + diffs: List[Dict[str, Any]] = [] + quality_pairs: List[Tuple[str, str]] = [] + per_type_total = Counter() + per_type_disagree = Counter() + + tier_a = 0 + tier_b = 0 + for cid in common_ids: + d = compare_one(cid, py_reports[cid], rust_reports[cid]) + if d is None: + quality_pairs.append( + ( + py_reports[cid]["report"]["overall_quality"], + rust_reports[cid]["report"]["overall_quality"], + ) + ) + for st, _ in per_type_counts(py_reports[cid]["report"]).items(): + per_type_total[st] += 1 + else: + diffs.append(d) + if d["tier"] == "A": + tier_a += 1 + elif d["tier"] == "B": + tier_b += 1 + if "report" in py_reports[cid] and "report" in rust_reports[cid]: + quality_pairs.append( + ( + py_reports[cid]["report"].get("overall_quality", "?"), + rust_reports[cid]["report"].get("overall_quality", "?"), + ) + ) + for cd in d.get("count_diff", []) or []: + per_type_disagree[cd["signal_type"]] += 1 + per_type_total[cd["signal_type"]] += 1 + + n_total = len(common_ids) + n_match = n_total - len(diffs) + agreement = (n_match / n_total) if n_total else 0.0 + + quality_labels = ["excellent", "good", "neutral", "poor", "severe"] + cm = confusion_matrix(quality_pairs, quality_labels) + + metrics = { + "n_python_reports": len(py_reports), + "n_rust_reports": len(rust_reports), + "n_common": n_total, + "n_only_python": len(only_py), + "n_only_rust": len(only_rust), + "n_full_match": n_match, + "agreement_pct": round(100.0 * agreement, 4), + "tier_a_divergences": tier_a, + "tier_b_divergences": tier_b, + "quality_confusion_matrix": cm, + "per_signal_type_total": dict(per_type_total), + 
"per_signal_type_disagree": dict(per_type_disagree), + } + + # Pull in run metadata if present. + rm_path = out_dir / "run_metadata.json" + if rm_path.exists(): + metrics["run_metadata"] = json.loads(rm_path.read_text()) + + (out_dir / "metrics.json").write_text(json.dumps(metrics, indent=2)) + with (out_dir / "diffs.jsonl").open("w") as f: + for d in diffs: + f.write(json.dumps(d, ensure_ascii=False)) + f.write("\n") + + write_summary_md(out_dir / "summary.md", metrics, diffs[:20]) + + print( + json.dumps( + {k: v for k, v in metrics.items() if k != "quality_confusion_matrix"}, + indent=2, + ) + ) + print(f"\ndiffs: {out_dir / 'diffs.jsonl'} metrics: {out_dir / 'metrics.json'}") + print(f"summary: {out_dir / 'summary.md'}") + + if tier_a > 0: + print(f"\nFAIL: {tier_a} Tier-A divergence(s) detected.", file=sys.stderr) + return 1 + return 0 + + +def write_summary_md( + path: Path, metrics: Dict[str, Any], sample_diffs: List[Dict[str, Any]] +) -> None: + lines: List[str] = [] + lines.append("# Signals Parity Report") + lines.append("") + rm = metrics.get("run_metadata", {}) + if rm: + lines.append("## Run metadata") + lines.append("") + for k in ( + "dataset_name", + "dataset_revision", + "seed", + "num_samples_actual", + "plano_git_sha", + "signals_python_version", + "rust_binary_sha256", + ): + if k in rm: + lines.append(f"- **{k}**: `{rm[k]}`") + lines.append("") + + lines.append("## Summary") + lines.append("") + lines.append(f"- Conversations compared: **{metrics['n_common']}**") + lines.append(f"- Full matches: **{metrics['n_full_match']}**") + lines.append(f"- Agreement: **{metrics['agreement_pct']}%**") + lines.append(f"- Tier-A divergences: **{metrics['tier_a_divergences']}**") + lines.append(f"- Tier-B divergences: **{metrics['tier_b_divergences']}**") + lines.append("") + + lines.append("## Per-signal-type disagreement") + lines.append("") + lines.append("| Signal type | Total reports | Disagreements |") + lines.append("|---|---:|---:|") + totals = metrics["per_signal_type_total"] + disagrees = metrics["per_signal_type_disagree"] + for k in sorted(set(totals) | set(disagrees)): + lines.append(f"| `{k}` | {totals.get(k, 0)} | {disagrees.get(k, 0)} |") + lines.append("") + + lines.append("## Quality bucket confusion matrix (rows = python, cols = rust)") + lines.append("") + cm = metrics["quality_confusion_matrix"] + labels = list(cm.keys()) + lines.append("| | " + " | ".join(labels) + " |") + lines.append("|---|" + "|".join(["---:"] * len(labels)) + "|") + for r in labels: + lines.append( + f"| {r} | " + " | ".join(str(cm[r].get(c, 0)) for c in labels) + " |" + ) + lines.append("") + + if sample_diffs: + lines.append("## Sample divergences (first 20)") + lines.append("") + for d in sample_diffs: + lines.append(f"### `{d['id']}` — tier {d['tier']} — {d['kind']}") + lines.append("") + lines.append("```json") + lines.append(json.dumps(d, indent=2)) + lines.append("```") + lines.append("") + + path.write_text("\n".join(lines)) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/parity/signals/requirements.txt b/tests/parity/signals/requirements.txt new file mode 100644 index 00000000..7b25f179 --- /dev/null +++ b/tests/parity/signals/requirements.txt @@ -0,0 +1,3 @@ +huggingface_hub>=0.25 +pyarrow>=15 +tqdm>=4.65 diff --git a/tests/parity/signals/run_parity.py b/tests/parity/signals/run_parity.py new file mode 100644 index 00000000..1d14630e --- /dev/null +++ b/tests/parity/signals/run_parity.py @@ -0,0 +1,332 @@ +#!/usr/bin/env python3 +""" +Parity harness driver. 
+ +Samples conversations from `lmsys/lmsys-chat-1m`, runs both the Python +reference analyzer (in-process) and the Rust port (subprocess), writes both +reports to disk for `compare.py` to diff. + +Usage: + python run_parity.py \\ + --num-samples 2000 \\ + --seed 42 \\ + --dataset-revision \\ + --rust-binary ../../../crates/target/release/signals_replay \\ + --output-dir out/ +""" + +from __future__ import annotations + +import argparse +import hashlib +import json +import random +import subprocess +import sys +import time +from pathlib import Path +from typing import Any, Dict, Iterator, List + +try: + import pyarrow.parquet as pq + from huggingface_hub import hf_hub_download, list_repo_files +except ImportError: + print( + "error: install dependencies first: pip install -r requirements.txt", + file=sys.stderr, + ) + sys.exit(2) + +try: + from signals.analyzer import SignalAnalyzer +except ImportError: + print( + "error: the python `signals` package is not installed. " + "install it from your local checkout: pip install -e /path/to/signals", + file=sys.stderr, + ) + sys.exit(2) + +try: + from tqdm import tqdm +except ImportError: + + def tqdm(it, **_kwargs): # type: ignore[no-redef] + return it + + +DATASET_NAME = "lmsys/lmsys-chat-1m" + + +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser(description=__doc__) + p.add_argument("--num-samples", type=int, default=2000) + p.add_argument("--seed", type=int, default=42) + p.add_argument( + "--dataset-revision", + default=None, + help="HF dataset revision to pin (default: latest, NOT recommended for reproducibility)", + ) + p.add_argument( + "--rust-binary", + type=Path, + required=True, + help="path to the `signals_replay` binary built from crates/brightstaff", + ) + p.add_argument( + "--output-dir", + type=Path, + default=Path("out"), + help="directory to write the conversations + both runners' outputs", + ) + p.add_argument( + "--max-conv-messages", + type=int, + default=200, + help="drop conversations with more than this many messages (the analyzer " + "truncates to last 100 anyway; this is a sanity cap on input parsing)", + ) + return p.parse_args() + + +def lmsys_to_sharegpt(conversation: List[Dict[str, str]]) -> List[Dict[str, str]]: + """Convert lmsys-chat-1m's `[{role, content}]` to ShareGPT's `[{from, value}]`. + + lmsys uses `user` / `assistant` (no tools, no system role in `conversation`). + """ + out = [] + for m in conversation: + role = m.get("role", "") + content = m.get("content", "") + if not isinstance(content, str): + content = str(content) if content is not None else "" + if role == "user": + from_ = "human" + elif role == "assistant": + from_ = "gpt" + else: + # lmsys is human/assistant only; skip anything else defensively. 
+ continue + out.append({"from": from_, "value": content}) + return out + + +def _list_parquet_files(revision: str | None) -> List[str]: + """Return the list of parquet shard paths in the dataset repo.""" + files = list_repo_files(DATASET_NAME, repo_type="dataset", revision=revision) + return sorted(f for f in files if f.endswith(".parquet")) + + +def _download_shards(paths: List[str], revision: str | None) -> List[Path]: + """Download each parquet shard to the HF cache, return local paths.""" + local: List[Path] = [] + for rel in tqdm(paths, desc="downloading shards", unit="shard"): + p = hf_hub_download( + DATASET_NAME, + filename=rel, + repo_type="dataset", + revision=revision, + ) + local.append(Path(p)) + return local + + +def sample_conversations( + *, + num_samples: int, + seed: int, + revision: str | None, + max_conv_messages: int, +) -> Iterator[Dict[str, Any]]: + """Yield `num_samples` conversations sampled uniformly across the dataset. + + We bypass the `datasets` loader (which has a Python 3.14 pickle issue) + and read the parquet shards directly via pyarrow. + """ + print( + f"listing {DATASET_NAME}" + f"{' @ ' + revision if revision else ' (no revision pinned!)'}", + file=sys.stderr, + ) + shard_paths = _list_parquet_files(revision) + if not shard_paths: + raise SystemExit(f"no parquet shards found for {DATASET_NAME}") + local_paths = _download_shards(shard_paths, revision) + + # Collect row counts without reading data. + shard_row_counts: List[int] = [] + for p in local_paths: + pf = pq.ParquetFile(str(p)) + shard_row_counts.append(pf.metadata.num_rows) + total_rows = sum(shard_row_counts) + print( + f"dataset has {total_rows:,} rows across {len(local_paths)} shards", + file=sys.stderr, + ) + + rng = random.Random(seed) + global_indices = sorted(rng.sample(range(total_rows), num_samples)) + + # Bucket indices by shard. + by_shard: Dict[int, List[int]] = {} + cumulative = 0 + shard_offsets = [] + for c in shard_row_counts: + shard_offsets.append(cumulative) + cumulative += c + for gi in global_indices: + # Find which shard this index belongs to. 
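+        # shard_offsets is ascending, so the first shard satisfying
+        # gi < offset + row_count owns the index; a linear scan is fine
+        # for a few thousand sampled indices.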
+ for si, off in enumerate(shard_offsets): + if gi < off + shard_row_counts[si]: + by_shard.setdefault(si, []).append(gi - off) + break + + yielded = 0 + for si in sorted(by_shard.keys()): + local_rows = by_shard[si] + pf = pq.ParquetFile(str(local_paths[si])) + table = pf.read(columns=["conversation"]) + conv_col = table.column("conversation") + for local_idx in local_rows: + raw = conv_col[local_idx].as_py() + if not raw: + continue + conversation = raw if isinstance(raw, list) else raw.get("conversation", []) + if len(conversation) > max_conv_messages: + continue + messages = lmsys_to_sharegpt(conversation) + if not messages: + continue + global_idx = shard_offsets[si] + local_idx + yield { + "id": f"lmsys-{global_idx}", + "messages": messages, + } + yielded += 1 + print(f"yielded {yielded} conversations after filtering", file=sys.stderr) + + +def write_conversations(out_path: Path, samples: Iterator[Dict[str, Any]]) -> int: + n = 0 + with out_path.open("w") as f: + for s in tqdm(samples, desc="sampling", unit="convo"): + f.write(json.dumps(s, ensure_ascii=False)) + f.write("\n") + n += 1 + return n + + +def run_rust(rust_binary: Path, conv_path: Path, out_path: Path) -> None: + print(f"running rust analyzer: {rust_binary}", file=sys.stderr) + t0 = time.monotonic() + with conv_path.open("rb") as fin, out_path.open("wb") as fout: + proc = subprocess.run( + [str(rust_binary)], + stdin=fin, + stdout=fout, + stderr=subprocess.PIPE, + check=False, + ) + if proc.returncode != 0: + sys.stderr.write(proc.stderr.decode("utf-8", errors="replace")) + raise SystemExit(f"rust runner exited {proc.returncode}") + elapsed = time.monotonic() - t0 + print(f" rust runner: {elapsed:.1f}s", file=sys.stderr) + + +def run_python(conv_path: Path, out_path: Path) -> None: + print("running python analyzer...", file=sys.stderr) + t0 = time.monotonic() + analyzer = SignalAnalyzer() + with conv_path.open() as fin, out_path.open("w") as fout: + for line in tqdm(fin, desc="python", unit="convo"): + line = line.strip() + if not line: + continue + try: + obj = json.loads(line) + report = analyzer.analyze(obj["messages"]) + fout.write( + json.dumps( + {"id": obj["id"], "report": report.to_dict()}, + ensure_ascii=False, + ) + ) + except Exception as e: + fout.write(json.dumps({"id": obj.get("id"), "error": str(e)})) + fout.write("\n") + elapsed = time.monotonic() - t0 + print(f" python runner: {elapsed:.1f}s", file=sys.stderr) + + +def stamp_metadata(args: argparse.Namespace, output_dir: Path, n_samples: int) -> None: + """Write the input metadata so compare.py can include it in the report.""" + binary_sha = hashlib.sha256(args.rust_binary.read_bytes()).hexdigest() + try: + plano_sha = ( + subprocess.check_output( + ["git", "rev-parse", "HEAD"], cwd=Path(__file__).parent + ) + .decode() + .strip() + ) + except Exception: + plano_sha = "unknown" + try: + signals_version = subprocess.check_output( + [sys.executable, "-m", "pip", "show", "signals"] + ).decode() + signals_version = next( + ( + l.split(":", 1)[1].strip() + for l in signals_version.splitlines() + if l.startswith("Version") + ), + "unknown", + ) + except Exception: + signals_version = "unknown" + + meta = { + "dataset_name": DATASET_NAME, + "dataset_revision": args.dataset_revision, + "seed": args.seed, + "num_samples_requested": args.num_samples, + "num_samples_actual": n_samples, + "rust_binary": str(args.rust_binary.resolve()), + "rust_binary_sha256": binary_sha, + "plano_git_sha": plano_sha, + "signals_python_version": signals_version, + 
"max_conv_messages": args.max_conv_messages, + } + (output_dir / "run_metadata.json").write_text(json.dumps(meta, indent=2)) + print(f"wrote {output_dir / 'run_metadata.json'}", file=sys.stderr) + + +def main() -> None: + args = parse_args() + args.output_dir.mkdir(parents=True, exist_ok=True) + if not args.rust_binary.exists(): + raise SystemExit(f"rust binary not found at {args.rust_binary}") + + conv_path = args.output_dir / "conversations.jsonl" + rust_path = args.output_dir / "rust_reports.jsonl" + py_path = args.output_dir / "python_reports.jsonl" + + samples = sample_conversations( + num_samples=args.num_samples, + seed=args.seed, + revision=args.dataset_revision, + max_conv_messages=args.max_conv_messages, + ) + n = write_conversations(conv_path, samples) + print(f"wrote {n} conversations to {conv_path}", file=sys.stderr) + + run_rust(args.rust_binary, conv_path, rust_path) + run_python(conv_path, py_path) + stamp_metadata(args, args.output_dir, n) + print("done. now run: python compare.py --output-dir " + str(args.output_dir)) + + +if __name__ == "__main__": + main()