add planoai obs: live LLM observability TUI

This commit is contained in:
Adil Hafeez 2026-04-17 00:52:46 -07:00
parent 1f701258cb
commit d30018cf35
19 changed files with 1736 additions and 5 deletions

View file

@ -37,6 +37,7 @@ from planoai.core import (
)
from planoai.init_cmd import init as init_cmd
from planoai.trace_cmd import trace as trace_cmd, start_trace_listener_background
from planoai.obs_cmd import obs as obs_cmd
from planoai.consts import (
DEFAULT_OTEL_TRACING_GRPC_ENDPOINT,
DEFAULT_NATIVE_OTEL_TRACING_GRPC_ENDPOINT,
@ -714,6 +715,7 @@ main.add_command(cli_agent)
main.add_command(generate_prompt_targets)
main.add_command(init_cmd, name="init")
main.add_command(trace_cmd, name="trace")
main.add_command(obs_cmd, name="obs")
if __name__ == "__main__":
main()

View file

@ -0,0 +1,6 @@
"""Plano observability console: in-memory live view of LLM traffic."""
from planoai.obs.collector import LLMCall, LLMCallStore, ObsCollector
from planoai.obs.pricing import PricingCatalog
__all__ = ["LLMCall", "LLMCallStore", "ObsCollector", "PricingCatalog"]

View file

@ -0,0 +1,281 @@
"""In-memory collector for LLM calls, fed by OTLP/gRPC spans from brightstaff."""
from __future__ import annotations
import threading
from collections import deque
from concurrent import futures
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any, Iterable
import grpc
from opentelemetry.proto.collector.trace.v1 import (
trace_service_pb2,
trace_service_pb2_grpc,
)
# Standard OTLP/gRPC port; must match brightstaff's tracing exporter endpoint.
DEFAULT_GRPC_PORT = 4317
# Default ring-buffer size for LLMCallStore; oldest calls are evicted first.
DEFAULT_CAPACITY = 1000
@dataclass
class LLMCall:
    """A single LLM call reconstructed from one brightstaff LLM span.

    Any field whose backing span attribute was absent stays ``None``.
    """

    request_id: str
    timestamp: datetime
    model: str
    provider: str | None = None
    request_model: str | None = None
    session_id: str | None = None
    route_name: str | None = None
    is_streaming: bool | None = None
    status_code: int | None = None
    prompt_tokens: int | None = None
    completion_tokens: int | None = None
    total_tokens: int | None = None
    cached_input_tokens: int | None = None
    cache_creation_tokens: int | None = None
    reasoning_tokens: int | None = None
    ttft_ms: float | None = None
    duration_ms: float | None = None
    routing_strategy: str | None = None
    routing_reason: str | None = None
    cost_usd: float | None = None

    @property
    def tpt_ms(self) -> float | None:
        """Milliseconds spent per generated token, or None when not derivable."""
        if self.duration_ms is None or not self.completion_tokens:
            return None
        # The generation window excludes time-to-first-token when we have it.
        gen_ms = self.duration_ms - (self.ttft_ms or 0.0)
        if gen_ms <= 0:
            return None
        return gen_ms / self.completion_tokens

    @property
    def tokens_per_sec(self) -> float | None:
        """Decode throughput derived from ``tpt_ms``, or None when unknown."""
        per_token = self.tpt_ms
        if per_token is None or per_token <= 0:
            return None
        return 1000.0 / per_token
class LLMCallStore:
    """Bounded, thread-safe buffer of the most recent LLM calls (FIFO eviction)."""

    def __init__(self, capacity: int = DEFAULT_CAPACITY) -> None:
        self._capacity = capacity
        self._lock = threading.Lock()
        # deque(maxlen=...) silently drops the oldest entry once full.
        self._buffer: deque[LLMCall] = deque(maxlen=capacity)

    @property
    def capacity(self) -> int:
        """Maximum number of calls retained."""
        return self._capacity

    def add(self, call: LLMCall) -> None:
        """Append one call, evicting the oldest when at capacity."""
        with self._lock:
            self._buffer.append(call)

    def clear(self) -> None:
        """Drop every buffered call."""
        with self._lock:
            self._buffer.clear()

    def snapshot(self) -> list[LLMCall]:
        """Return a point-in-time copy, oldest call first."""
        with self._lock:
            return list(self._buffer)

    def __len__(self) -> int:
        with self._lock:
            return len(self._buffer)
# Attribute keys mirror crates/brightstaff/src/tracing/constants.rs.
# Core per-request LLM span attributes.
_LLM_MODEL = "llm.model"
_LLM_PROVIDER = "llm.provider"
_LLM_IS_STREAMING = "llm.is_streaming"
_LLM_DURATION_MS = "llm.duration_ms"
_LLM_TTFT_MS = "llm.time_to_first_token"
# Token-usage accounting attributes.
_LLM_PROMPT_TOKENS = "llm.usage.prompt_tokens"
_LLM_COMPLETION_TOKENS = "llm.usage.completion_tokens"
_LLM_TOTAL_TOKENS = "llm.usage.total_tokens"
_LLM_CACHED_INPUT_TOKENS = "llm.usage.cached_input_tokens"
_LLM_CACHE_CREATION_TOKENS = "llm.usage.cache_creation_tokens"
_LLM_REASONING_TOKENS = "llm.usage.reasoning_tokens"
# HTTP / routing metadata attributes.
_HTTP_STATUS = "http.status_code"
_MODEL_REQUESTED = "model.requested"
_PLANO_SESSION_ID = "plano.session_id"
_PLANO_ROUTE_NAME = "plano.route.name"
_ROUTING_STRATEGY = "routing.strategy"
_ROUTING_SELECTION_REASON = "routing.selection_reason"
# Aliases checked, in order, when deriving a call's request id.
_REQUEST_ID_KEYS = ("request_id", "http.request_id")
def _anyvalue_to_python(value: Any) -> Any: # AnyValue from OTLP
kind = value.WhichOneof("value")
if kind == "string_value":
return value.string_value
if kind == "bool_value":
return value.bool_value
if kind == "int_value":
return value.int_value
if kind == "double_value":
return value.double_value
return None
def _attrs_to_dict(attrs: Iterable[Any]) -> dict[str, Any]:
    """Flatten OTLP KeyValue pairs into a plain dict, skipping non-scalar values."""
    return {
        kv.key: converted
        for kv in attrs
        if (converted := _anyvalue_to_python(kv.value)) is not None
    }
def _maybe_int(value: Any) -> int | None:
if value is None:
return None
try:
return int(value)
except (TypeError, ValueError):
return None
def _maybe_float(value: Any) -> float | None:
if value is None:
return None
try:
return float(value)
except (TypeError, ValueError):
return None
def span_to_llm_call(
    span: Any, service_name: str, pricing: Any | None = None
) -> LLMCall | None:
    """Convert an OTLP span into an LLMCall, or return None if it isn't one.

    A span is considered an LLM call iff it carries the ``llm.model`` attribute.

    Args:
        span: OTLP ``Span`` protobuf message.
        service_name: Resource-level ``service.name``; used as the provider
            when the span lacks an ``llm.provider`` attribute.
        pricing: Optional object exposing ``cost_for_call(call)`` (e.g. a
            ``PricingCatalog``); when provided, ``cost_usd`` is populated.

    Returns:
        A populated ``LLMCall``, or ``None`` for non-LLM spans.
    """
    attrs = _attrs_to_dict(span.attributes)
    model = attrs.get(_LLM_MODEL)
    if not model:
        return None
    # Prefer explicit span attributes; fall back to likely aliases.
    # Last resort is the span id itself, so every call still gets an identifier.
    request_id = next(
        (
            str(attrs[key])
            for key in _REQUEST_ID_KEYS
            if key in attrs and attrs[key] is not None
        ),
        span.span_id.hex() if span.span_id else "",
    )
    # OTLP start time is nanoseconds since the Unix epoch; fall back to "now"
    # when absent so the TUI can still order the row.
    start_ns = span.start_time_unix_nano or 0
    ts = (
        datetime.fromtimestamp(start_ns / 1_000_000_000, tz=timezone.utc).astimezone()
        if start_ns
        else datetime.now().astimezone()
    )
    call = LLMCall(
        request_id=str(request_id),
        timestamp=ts,
        model=str(model),
        provider=str(attrs[_LLM_PROVIDER]) if _LLM_PROVIDER in attrs else service_name,
        request_model=(
            str(attrs[_MODEL_REQUESTED]) if _MODEL_REQUESTED in attrs else None
        ),
        session_id=(
            str(attrs[_PLANO_SESSION_ID]) if _PLANO_SESSION_ID in attrs else None
        ),
        route_name=(
            str(attrs[_PLANO_ROUTE_NAME]) if _PLANO_ROUTE_NAME in attrs else None
        ),
        is_streaming=bool(attrs[_LLM_IS_STREAMING])
        if _LLM_IS_STREAMING in attrs
        else None,
        status_code=_maybe_int(attrs.get(_HTTP_STATUS)),
        prompt_tokens=_maybe_int(attrs.get(_LLM_PROMPT_TOKENS)),
        completion_tokens=_maybe_int(attrs.get(_LLM_COMPLETION_TOKENS)),
        total_tokens=_maybe_int(attrs.get(_LLM_TOTAL_TOKENS)),
        cached_input_tokens=_maybe_int(attrs.get(_LLM_CACHED_INPUT_TOKENS)),
        cache_creation_tokens=_maybe_int(attrs.get(_LLM_CACHE_CREATION_TOKENS)),
        reasoning_tokens=_maybe_int(attrs.get(_LLM_REASONING_TOKENS)),
        ttft_ms=_maybe_float(attrs.get(_LLM_TTFT_MS)),
        duration_ms=_maybe_float(attrs.get(_LLM_DURATION_MS)),
        routing_strategy=(
            str(attrs[_ROUTING_STRATEGY]) if _ROUTING_STRATEGY in attrs else None
        ),
        routing_reason=(
            str(attrs[_ROUTING_SELECTION_REASON])
            if _ROUTING_SELECTION_REASON in attrs
            else None
        ),
    )
    # Pricing is attached last so it can read the token fields set above.
    if pricing is not None:
        call.cost_usd = pricing.cost_for_call(call)
    return call
class _ObsServicer(trace_service_pb2_grpc.TraceServiceServicer):
    """OTLP TraceService implementation that feeds LLM spans into the store."""

    def __init__(self, store: LLMCallStore, pricing: Any | None) -> None:
        self._store = store
        self._pricing = pricing

    def Export(self, request, context):  # noqa: N802 — gRPC generated name
        """Handle one OTLP export batch; non-LLM spans are silently skipped."""
        for resource_spans in request.resource_spans:
            # Resource-level service.name becomes the provider fallback; the
            # first matching attribute wins (break fires whether or not the
            # value converted to a scalar).
            service_name = "unknown"
            for attr in resource_spans.resource.attributes:
                if attr.key == "service.name":
                    val = _anyvalue_to_python(attr.value)
                    if val is not None:
                        service_name = str(val)
                    break
            for scope_spans in resource_spans.scope_spans:
                for span in scope_spans.spans:
                    call = span_to_llm_call(span, service_name, self._pricing)
                    if call is not None:
                        self._store.add(call)
        return trace_service_pb2.ExportTraceServiceResponse()
@dataclass
class ObsCollector:
    """Owns the OTLP/gRPC server and the in-memory LLMCall ring buffer."""

    # Destination for parsed calls; shared with the TUI renderer.
    store: LLMCallStore = field(default_factory=LLMCallStore)
    # Optional PricingCatalog-like object used to price each call.
    pricing: Any | None = None
    host: str = "0.0.0.0"
    port: int = DEFAULT_GRPC_PORT
    # Live gRPC server handle; None until start() succeeds.
    _server: grpc.Server | None = field(default=None, init=False, repr=False)

    def start(self) -> None:
        """Bind and start the OTLP listener (idempotent).

        Raises:
            OSError: when the port cannot be bound (typically already in use).
        """
        if self._server is not None:
            return
        server = grpc.server(futures.ThreadPoolExecutor(max_workers=4))
        trace_service_pb2_grpc.add_TraceServiceServicer_to_server(
            _ObsServicer(self.store, self.pricing), server
        )
        address = f"{self.host}:{self.port}"
        # add_insecure_port returns 0 on bind failure rather than raising.
        bound = server.add_insecure_port(address)
        if bound == 0:
            raise OSError(
                f"Failed to bind OTLP listener on {address}: port already in use. "
                "Stop `planoai trace listen` or pick another port with --port."
            )
        server.start()
        self._server = server

    def stop(self, grace: float = 2.0) -> None:
        """Shut the server down, giving in-flight RPCs *grace* seconds to finish."""
        if self._server is not None:
            self._server.stop(grace)
            self._server = None

276
cli/planoai/obs/pricing.py Normal file
View file

@ -0,0 +1,276 @@
"""DigitalOcean Gradient pricing catalog for the obs console.
Ported loosely from ``crates/brightstaff/src/router/model_metrics.rs::fetch_do_pricing``.
Single-source: one fetch at startup, cached for the life of the process.
"""
from __future__ import annotations
import logging
import threading
from dataclasses import dataclass
from typing import Any
import requests
# DigitalOcean Gen-AI model catalog endpoint (includes per-model pricing).
DEFAULT_PRICING_URL = "https://api.digitalocean.com/v2/gen-ai/models/catalog"
# Pricing is best-effort at startup, so fail fast on a slow network.
FETCH_TIMEOUT_SECS = 5.0
logger = logging.getLogger(__name__)
@dataclass(frozen=True)
class ModelPrice:
    """Input/output $/token rates. Token counts are multiplied by these."""

    # USD charged per single input (prompt) token.
    input_per_token_usd: float
    # USD charged per single output (completion) token.
    output_per_token_usd: float
    # Discounted rate for cache-read input tokens; None when the catalog does
    # not publish one (cached tokens then bill at the regular input rate).
    cached_input_per_token_usd: float | None = None
class PricingCatalog:
    """In-memory pricing lookup keyed by model id.

    DO's catalog uses ids like ``openai-gpt-5.4``; Plano's resolved model names
    may arrive as ``do/openai-gpt-5.4`` or bare ``openai-gpt-5.4``. We strip the
    leading provider prefix when looking up.
    """

    def __init__(self, prices: dict[str, ModelPrice] | None = None) -> None:
        self._prices: dict[str, ModelPrice] = prices or {}
        # Guards reads from multiple threads; the map itself is not mutated
        # after construction.
        self._lock = threading.Lock()

    def __len__(self) -> int:
        with self._lock:
            return len(self._prices)

    def sample_models(self, n: int = 5) -> list[str]:
        """Return up to *n* model ids, for display in startup logs."""
        with self._lock:
            return list(self._prices.keys())[:n]

    @classmethod
    def fetch(
        cls,
        url: str = DEFAULT_PRICING_URL,
        api_key: str | None = None,
    ) -> "PricingCatalog":
        """Fetch pricing from DO's catalog endpoint. On failure, returns an
        empty catalog (cost column will be blank).

        The catalog endpoint requires a DigitalOcean Personal Access Token
        this is *not* the same as the inference ``MODEL_ACCESS_KEY`` used at
        runtime. We check ``DIGITALOCEAN_TOKEN`` first (standard DO CLI env
        var), then ``DO_PAT``, then fall back to ``DO_API_KEY``.
        """
        import os

        headers = {}
        # An explicit api_key argument wins over any environment variable.
        token = (
            api_key
            or os.environ.get("DIGITALOCEAN_TOKEN")
            or os.environ.get("DO_PAT")
            or os.environ.get("DO_API_KEY")
        )
        if token:
            headers["Authorization"] = f"Bearer {token}"
        try:
            resp = requests.get(url, headers=headers, timeout=FETCH_TIMEOUT_SECS)
            resp.raise_for_status()
            data = resp.json()
        except Exception as exc:  # noqa: BLE001 — best-effort; never fatal
            logger.warning(
                "DO pricing fetch failed: %s; cost column will be blank. "
                "Set DIGITALOCEAN_TOKEN with a DO Personal Access Token to "
                "enable cost.",
                exc,
            )
            return cls()
        prices = _parse_do_pricing(data)
        if not prices:
            # Dump the first entry's raw shape so we can see which fields DO
            # actually returned — helps when the catalog adds new fields or
            # the response doesn't match our parser.
            import json as _json

            sample_items = _coerce_items(data)
            sample = sample_items[0] if sample_items else data
            logger.warning(
                "DO pricing response had no parseable entries; cost column "
                "will be blank. Sample entry: %s",
                _json.dumps(sample, default=str)[:400],
            )
        return cls(prices)

    def price_for(self, model_name: str | None) -> ModelPrice | None:
        """Resolve *model_name* to a ModelPrice, or None when unpriced."""
        if not model_name:
            return None
        with self._lock:
            # Try the full name first, then stripped prefix, then lowercased variants.
            for candidate in _model_key_candidates(model_name):
                hit = self._prices.get(candidate)
                if hit is not None:
                    return hit
        return None

    def cost_for_call(self, call: Any) -> float | None:
        """Compute USD cost for an LLMCall. Returns None when pricing is unknown."""
        # Resolved model first; fall back to the originally-requested name.
        price = self.price_for(getattr(call, "model", None)) or self.price_for(
            getattr(call, "request_model", None)
        )
        if price is None:
            return None
        prompt = int(getattr(call, "prompt_tokens", 0) or 0)
        completion = int(getattr(call, "completion_tokens", 0) or 0)
        cached = int(getattr(call, "cached_input_tokens", 0) or 0)
        # Cached input tokens are priced separately at the cached rate when known;
        # otherwise they're already counted in prompt tokens at the regular rate.
        fresh_prompt = prompt
        if price.cached_input_per_token_usd is not None and cached:
            fresh_prompt = max(0, prompt - cached)
            cost_cached = cached * price.cached_input_per_token_usd
        else:
            cost_cached = 0.0
        cost = (
            fresh_prompt * price.input_per_token_usd
            + completion * price.output_per_token_usd
            + cost_cached
        )
        # 6 decimal places keeps sub-cent costs meaningful without noise.
        return round(cost, 6)
def _model_key_candidates(model_name: str) -> list[str]:
base = model_name.strip()
out = [base]
if "/" in base:
out.append(base.split("/", 1)[1])
out.extend([v.lower() for v in list(out)])
# Dedup while preserving order.
seen: set[str] = set()
uniq = []
for key in out:
if key not in seen:
seen.add(key)
uniq.append(key)
return uniq
def _parse_do_pricing(data: Any) -> dict[str, ModelPrice]:
    """Parse DO catalog response into a ModelPrice map keyed by model id.

    DO's shape (as of 2026-04):

        {
          "data": [
            {"model_id": "openai-gpt-5.4",
             "pricing": {"input_price_per_million": 5.0,
                         "output_price_per_million": 15.0}},
            ...
          ]
        }

    Older/alternate shapes are also accepted (flat top-level fields, or the
    ``id``/``model``/``name`` key).
    """
    prices: dict[str, ModelPrice] = {}
    items = _coerce_items(data)
    for item in items:
        # First key yielding a truthy id wins; entries without one are skipped.
        model_id = (
            item.get("model_id")
            or item.get("id")
            or item.get("model")
            or item.get("name")
        )
        if not model_id:
            continue
        # DO nests rates under `pricing`; try that first, then fall back to
        # top-level fields for alternate response shapes.
        sources = [item]
        if isinstance(item.get("pricing"), dict):
            sources.insert(0, item["pricing"])
        input_rate = _extract_rate_from_sources(
            sources,
            ["input_per_token", "input_token_price", "price_input"],
            ["input_price_per_million", "input_per_million", "input_per_mtok"],
        )
        output_rate = _extract_rate_from_sources(
            sources,
            ["output_per_token", "output_token_price", "price_output"],
            ["output_price_per_million", "output_per_million", "output_per_mtok"],
        )
        cached_rate = _extract_rate_from_sources(
            sources,
            [
                "cached_input_per_token",
                "cached_input_token_price",
                "prompt_cache_read_per_token",
            ],
            [
                "cached_input_price_per_million",
                "cached_input_per_million",
                "cached_input_per_mtok",
            ],
        )
        # Both primary rates are required; cached rate is optional.
        if input_rate is None or output_rate is None:
            continue
        # Treat 0-rate entries as "unknown" so cost falls back to `—` rather
        # than showing a misleading $0.0000. DO's catalog sometimes omits
        # rates for promo/open-weight models.
        if input_rate == 0 and output_rate == 0:
            continue
        prices[str(model_id)] = ModelPrice(
            input_per_token_usd=input_rate,
            output_per_token_usd=output_rate,
            cached_input_per_token_usd=cached_rate,
        )
    return prices
def _coerce_items(data: Any) -> list[dict]:
if isinstance(data, list):
return [x for x in data if isinstance(x, dict)]
if isinstance(data, dict):
for key in ("data", "models", "pricing", "items"):
val = data.get(key)
if isinstance(val, list):
return [x for x in val if isinstance(x, dict)]
return []
def _extract_rate_from_sources(
sources: list[dict],
per_token_keys: list[str],
per_million_keys: list[str],
) -> float | None:
"""Return a per-token rate in USD, or None if unknown.
Some DO catalog responses put per-token values under a field whose name
says ``_per_million`` (e.g. ``input_price_per_million: 5E-8`` that's
$5e-8 per token, not per million). Heuristic: values < 1 are already
per-token (real per-million rates are ~0.1 to ~100); values >= 1 are
treated as per-million and divided by 1,000,000.
"""
for src in sources:
for key in per_token_keys:
if key in src and src[key] is not None:
try:
return float(src[key])
except (TypeError, ValueError):
continue
for key in per_million_keys:
if key in src and src[key] is not None:
try:
v = float(src[key])
except (TypeError, ValueError):
continue
if v >= 1:
return v / 1_000_000
return v
return None

324
cli/planoai/obs/render.py Normal file
View file

@ -0,0 +1,324 @@
"""Rich TUI renderer for the observability console."""
from __future__ import annotations
from collections import Counter
from dataclasses import dataclass
from datetime import datetime, timezone
from rich.box import SIMPLE
from rich.columns import Columns
from rich.console import Group
from rich.panel import Panel
from rich.table import Table
from rich.text import Text
from planoai.obs.collector import LLMCall
@dataclass
class AggregateStats:
    """Headline totals across every call currently buffered."""

    # Number of calls aggregated.
    count: int
    # Sum of known per-call costs; unknown costs contribute 0.
    total_cost_usd: float
    # Summed prompt tokens across calls.
    total_input_tokens: int
    # Summed completion tokens across calls.
    total_output_tokens: int
    # Count of unique non-empty session ids seen.
    distinct_sessions: int
    # Session id of the newest call that carried one, if any.
    current_session: str | None
@dataclass
class ModelRollup:
    """Aggregated request/token/cost totals for a single model."""

    model: str
    # Number of calls that resolved to this model.
    requests: int
    input_tokens: int
    output_tokens: int
    # Prompt-cache creation (write) tokens.
    cache_write: int
    # Prompt-cache read (hit) tokens.
    cache_read: int
    # Total known cost in USD; unpriced calls contribute 0.
    cost_usd: float
def _now() -> datetime:
return datetime.now(tz=timezone.utc).astimezone()
def aggregates(calls: list[LLMCall]) -> AggregateStats:
    """Fold the call list into the headline totals shown in the Totals panel."""
    # Truthy filter deliberately excludes empty-string session ids.
    session_ids = {c.session_id for c in calls if c.session_id}
    # "Current" session = the one attached to the newest call that has one.
    current = None
    for call in reversed(calls):
        if call.session_id is not None:
            current = call.session_id
            break
    return AggregateStats(
        count=len(calls),
        total_cost_usd=sum(c.cost_usd or 0.0 for c in calls),
        total_input_tokens=sum(int(c.prompt_tokens or 0) for c in calls),
        total_output_tokens=sum(int(c.completion_tokens or 0) for c in calls),
        distinct_sessions=len(session_ids),
        current_session=current,
    )
def model_rollups(calls: list[LLMCall]) -> list[ModelRollup]:
    """Aggregate per-model totals over *calls*, sorted by cost (descending)."""
    by_model: dict[str, ModelRollup] = {}
    for call in calls:
        rollup = by_model.get(call.model)
        if rollup is None:
            rollup = by_model[call.model] = ModelRollup(
                model=call.model,
                requests=0,
                input_tokens=0,
                output_tokens=0,
                cache_write=0,
                cache_read=0,
                cost_usd=0.0,
            )
        rollup.requests += 1
        rollup.input_tokens += int(call.prompt_tokens or 0)
        rollup.output_tokens += int(call.completion_tokens or 0)
        rollup.cache_write += int(call.cache_creation_tokens or 0)
        rollup.cache_read += int(call.cached_input_tokens or 0)
        rollup.cost_usd += call.cost_usd or 0.0
    # sorted() is stable, so equal-cost models keep first-seen order.
    return sorted(by_model.values(), key=lambda r: r.cost_usd, reverse=True)
def route_hits(calls: list[LLMCall]) -> list[tuple[str, int, float]]:
    """Per-route hit counts and percentage share among routed calls.

    Calls without a route name are excluded from both the counts and the
    percentage denominator. Returns [] when nothing was routed.
    """
    counts: Counter[str] = Counter(c.route_name for c in calls if c.route_name)
    total = sum(counts.values())
    if not total:
        return []
    return [(route, n, (n / total) * 100.0) for route, n in counts.most_common()]
def _fmt_cost(v: float | None) -> str:
if v is None:
return ""
if v == 0:
return "$0"
# Adaptive precision so tiny costs ($3.8e-5) remain readable.
if abs(v) < 0.0001:
return f"${v:.8f}".rstrip("0").rstrip(".")
if abs(v) < 0.01:
return f"${v:.6f}".rstrip("0").rstrip(".")
return f"${v:.4f}"
def _fmt_ms(v: float | None) -> str:
if v is None:
return ""
if v >= 1000:
return f"{v / 1000:.1f}s"
return f"{v:.0f}ms"
def _fmt_int(v: int | None) -> str:
if v is None or v == 0:
return ""
return f"{v:,}"
def _fmt_tokens(v: int | None) -> str:
if v is None:
return ""
return f"{v:,}"
def _request_panel(last: LLMCall | None) -> Panel:
    """Panel describing the most recent request (placeholder when idle)."""
    if last is None:
        return Panel(
            Text("no requests yet", style="dim"),
            title="[bold]Request[/]",
            border_style="cyan",
            box=SIMPLE,
        )
    rows = [
        ("Endpoint", "chat/completions"),
        ("Status", "" if last.status_code is None else str(last.status_code)),
        ("Model", last.model),
    ]
    # Only show the requested model when routing rewrote it.
    if last.request_model and last.request_model != last.model:
        rows.append(("Req model", last.request_model))
    if last.route_name:
        rows.append(("Route", last.route_name))
    grid = Table.grid(padding=(0, 1))
    grid.add_column(style="bold cyan")
    grid.add_column()
    for label, value in rows:
        grid.add_row(label, value)
    return Panel(grid, title="[bold]Request[/]", border_style="cyan", box=SIMPLE)
def _cost_panel(last: LLMCall | None) -> Panel:
    """Panel with cost and token usage for the most recent request."""
    if last is None:
        return Panel(
            Text("", style="dim"),
            title="[bold]Cost[/]",
            border_style="green",
            box=SIMPLE,
        )
    rows = [
        ("Request", _fmt_cost(last.cost_usd)),
        ("Input", _fmt_tokens(last.prompt_tokens)),
        ("Output", _fmt_tokens(last.completion_tokens)),
    ]
    # Cached-token row only appears when the call actually hit the cache.
    if last.cached_input_tokens:
        rows.append(("Cached", _fmt_tokens(last.cached_input_tokens)))
    grid = Table.grid(padding=(0, 1))
    grid.add_column(style="bold green")
    grid.add_column()
    for label, value in rows:
        grid.add_row(label, value)
    return Panel(grid, title="[bold]Cost[/]", border_style="green", box=SIMPLE)
def _totals_panel(stats: AggregateStats) -> Panel:
    """Panel with run-wide totals laid out as two label/value column pairs."""
    t = Table.grid(padding=(0, 1))
    t.add_column(style="bold magenta")
    t.add_column()
    t.add_column(style="bold magenta")
    t.add_column()
    t.add_row(
        "Total cost",
        _fmt_cost(stats.total_cost_usd),
        "Requests",
        str(stats.count),
    )
    t.add_row(
        "Input",
        _fmt_tokens(stats.total_input_tokens),
        "Output",
        _fmt_tokens(stats.total_output_tokens),
    )
    t.add_row(
        "Sessions",
        str(stats.distinct_sessions),
        "Current session",
        stats.current_session or "",
    )
    return Panel(t, title="[bold]Totals[/]", border_style="magenta", box=SIMPLE)
def _model_rollup_table(rollups: list[ModelRollup]) -> Table:
    """Per-model totals table; *rollups* are assumed already sorted by cost."""
    table = Table(
        title="Totals by model",
        box=SIMPLE,
        header_style="bold",
        expand=True,
    )
    table.add_column("Model", style="cyan")
    table.add_column("Req", justify="right")
    table.add_column("Input", justify="right")
    table.add_column("Output", justify="right", style="green")
    table.add_column("Cache write", justify="right", style="yellow")
    table.add_column("Cache read", justify="right", style="yellow")
    table.add_column("Cost", justify="right", style="green")
    if not rollups:
        # Blank placeholder row keeps the table's height stable before traffic.
        table.add_row("", "", "", "", "", "", "")
    for r in rollups:
        table.add_row(
            r.model,
            str(r.requests),
            _fmt_tokens(r.input_tokens),
            _fmt_tokens(r.output_tokens),
            _fmt_int(r.cache_write),
            _fmt_int(r.cache_read),
            _fmt_cost(r.cost_usd),
        )
    return table
def _route_hit_table(hits: list[tuple[str, int, float]]) -> Table:
    """Table of (route, hit count, percent) rows, most-hit routes first."""
    table = Table(
        title="Route hit %",
        box=SIMPLE,
        header_style="bold",
        expand=True,
    )
    table.add_column("Route", style="cyan")
    table.add_column("Hits", justify="right")
    table.add_column("%", justify="right")
    for route, n, pct in hits:
        table.add_row(route, str(n), f"{pct:.1f}")
    return table
def _recent_table(calls: list[LLMCall], limit: int = 15) -> Table:
    """Newest-first table of the last *limit* calls.

    The route column only appears when at least one call carries a route
    name, keeping the table narrow for unrouted traffic.
    """
    show_route = any(c.route_name for c in calls)
    table = Table(
        title="Recent requests",
        box=SIMPLE,
        header_style="bold",
        expand=True,
    )
    table.add_column("time")
    table.add_column("model", style="cyan")
    if show_route:
        table.add_column("route", style="yellow")
    table.add_column("in", justify="right")
    table.add_column("cache", justify="right", style="yellow")
    table.add_column("out", justify="right", style="green")
    table.add_column("rsn", justify="right")
    table.add_column("cost", justify="right", style="green")
    table.add_column("TTFT", justify="right")
    table.add_column("lat", justify="right")
    table.add_column("st")
    # Newest first, capped at `limit` rows.
    recent = list(reversed(calls))[:limit]
    for c in recent:
        # "ok" for any 2xx/3xx; otherwise show the raw code (blank if unknown).
        status_cell = "ok" if c.status_code and 200 <= c.status_code < 400 else str(c.status_code or "")
        row = [
            c.timestamp.strftime("%H:%M:%S"),
            c.model,
        ]
        if show_route:
            row.append(c.route_name or "")
        row.extend(
            [
                _fmt_tokens(c.prompt_tokens),
                _fmt_int(c.cached_input_tokens),
                _fmt_tokens(c.completion_tokens),
                _fmt_int(c.reasoning_tokens),
                _fmt_cost(c.cost_usd),
                _fmt_ms(c.ttft_ms),
                _fmt_ms(c.duration_ms),
                status_cell,
            ]
        )
        table.add_row(*row)
    if not recent:
        # Placeholder row sized to the active column count (route adds one).
        table.add_row(*(["no requests yet"] + [""] * (10 if show_route else 9)))
    return table
def render(calls: list[LLMCall]) -> Group:
    """Compose the full dashboard for one refresh tick from a call snapshot."""
    last = calls[-1] if calls else None
    stats = aggregates(calls)
    rollups = model_rollups(calls)
    hits = route_hits(calls)
    # Three side-by-side panels: latest request, its cost, run totals.
    header = Columns(
        [_request_panel(last), _cost_panel(last), _totals_panel(stats)],
        expand=True,
        equal=True,
    )
    parts = [
        header,
        _model_rollup_table(rollups),
    ]
    # Route table only renders once at least one call was routed.
    if hits:
        parts.append(_route_hit_table(hits))
    parts.append(_recent_table(calls))
    # NOTE(review): this footer advertises "q"/"c" hotkeys, but no key handling
    # is visible in this module or in the obs command loop — confirm or trim.
    parts.append(
        Text(
            "q quit · c clear · waiting for spans on OTLP :4317 — brightstaff needs "
            "tracing.opentracing_grpc_endpoint=localhost:4317",
            style="dim",
        )
    )
    return Group(*parts)

99
cli/planoai/obs_cmd.py Normal file
View file

@ -0,0 +1,99 @@
"""`planoai obs` — live observability TUI."""
from __future__ import annotations
import time
import rich_click as click
from rich.console import Console
from rich.live import Live
from planoai.consts import PLANO_COLOR
from planoai.obs.collector import (
DEFAULT_CAPACITY,
DEFAULT_GRPC_PORT,
LLMCallStore,
ObsCollector,
)
from planoai.obs.pricing import PricingCatalog
from planoai.obs.render import render
@click.command(name="obs", help="Live observability console for Plano LLM traffic.")
@click.option(
    "--port",
    type=int,
    default=DEFAULT_GRPC_PORT,
    show_default=True,
    help="OTLP/gRPC port to listen on. Must match the brightstaff tracing endpoint.",
)
@click.option(
    "--host",
    type=str,
    default="0.0.0.0",
    show_default=True,
    help="Host to bind the OTLP listener.",
)
@click.option(
    "--capacity",
    type=int,
    default=DEFAULT_CAPACITY,
    show_default=True,
    help="Max LLM calls kept in memory; older calls evicted FIFO.",
)
@click.option(
    "--refresh-ms",
    type=int,
    default=500,
    show_default=True,
    help="TUI refresh interval.",
)
def obs(port: int, host: str, capacity: int, refresh_ms: int) -> None:
    """Run the live observability TUI.

    Loads pricing once, starts an in-process OTLP/gRPC listener, then redraws
    the Rich dashboard every ``refresh_ms`` milliseconds until Ctrl-C.

    NOTE(review): the render footer advertises "q quit · c clear", but this
    loop only handles KeyboardInterrupt — confirm key handling or fix the hint.
    """
    console = Console()
    # Pricing fetch is best-effort; an empty catalog just blanks the cost column.
    console.print(
        f"[bold {PLANO_COLOR}]planoai obs[/] — loading DO pricing catalog...",
        end="",
    )
    pricing = PricingCatalog.fetch()
    if len(pricing):
        sample = ", ".join(pricing.sample_models(3))
        console.print(
            f" [green]{len(pricing)} models loaded[/] [dim]({sample}, ...)[/]"
        )
    else:
        console.print(
            " [yellow]no pricing loaded[/] — "
            "[dim]set DIGITALOCEAN_TOKEN (DO Personal Access Token) to enable cost[/]"
        )
    store = LLMCallStore(capacity=capacity)
    collector = ObsCollector(store=store, pricing=pricing, host=host, port=port)
    try:
        collector.start()
    except OSError as exc:
        # Typically a port clash (e.g. `planoai trace listen` already on 4317).
        console.print(f"[red]{exc}[/]")
        raise SystemExit(1)
    console.print(
        f"Listening for OTLP spans on [bold]{host}:{port}[/]. "
        "Ensure plano config has [cyan]tracing.opentracing_grpc_endpoint: http://localhost:4317[/] "
        "and [cyan]tracing.random_sampling: 100[/] (or run [bold]planoai up[/] "
        "with no config — it wires this automatically)."
    )
    console.print("Press [bold]Ctrl-C[/] to exit.\n")
    # Clamp to 50ms so a tiny --refresh-ms can't spin the CPU.
    refresh = max(0.05, refresh_ms / 1000.0)
    try:
        with Live(
            render(store.snapshot()),
            console=console,
            refresh_per_second=1.0 / refresh,
            screen=False,
        ) as live:
            while True:
                time.sleep(refresh)
                live.update(render(store.snapshot()))
    except KeyboardInterrupt:
        console.print("\n[dim]obs stopped[/]")
    finally:
        # Always release the gRPC port, even on unexpected errors.
        collector.stop()

View file

@ -61,7 +61,7 @@ def configure_rich_click(plano_color: str) -> None:
},
{
"name": "Observability",
"commands": ["trace"],
"commands": ["trace", "obs"],
},
{
"name": "Utilities",

View file

@ -0,0 +1,141 @@
import time
from datetime import datetime, timezone
from types import SimpleNamespace
from unittest.mock import MagicMock
import pytest
from planoai.obs.collector import LLMCall, LLMCallStore, span_to_llm_call
def _mk_attr(key: str, value):
    """Build a mock OTLP KeyValue whose AnyValue oneof matches *value*'s type."""
    v = MagicMock()
    # bool must be checked before int: Python bools are int subclasses.
    if isinstance(value, bool):
        v.WhichOneof.return_value = "bool_value"
        v.bool_value = value
    elif isinstance(value, int):
        v.WhichOneof.return_value = "int_value"
        v.int_value = value
    elif isinstance(value, float):
        v.WhichOneof.return_value = "double_value"
        v.double_value = value
    else:
        # Everything else is stringified into the string_value variant.
        v.WhichOneof.return_value = "string_value"
        v.string_value = str(value)
    kv = MagicMock()
    kv.key = key
    kv.value = v
    return kv


def _mk_span(attrs: dict, start_ns: int | None = None, span_id_hex: str = "ab") -> MagicMock:
    """Build a mock OTLP span carrying *attrs*; start time defaults to now."""
    span = MagicMock()
    span.attributes = [_mk_attr(k, v) for k, v in attrs.items()]
    span.start_time_unix_nano = start_ns or int(time.time() * 1_000_000_000)
    span.span_id.hex.return_value = span_id_hex
    return span
def test_span_without_llm_model_is_ignored():
    """Spans lacking the llm.model attribute are not treated as LLM calls."""
    span = _mk_span({"http.method": "POST"})
    assert span_to_llm_call(span, "plano(llm)") is None


def test_span_with_full_llm_attrs_produces_call():
    """Every supported span attribute maps onto its LLMCall field."""
    span = _mk_span(
        {
            "llm.model": "openai-gpt-5.4",
            "model.requested": "router:software-engineering",
            "plano.session_id": "sess-abc",
            "plano.route.name": "software-engineering",
            "llm.is_streaming": False,
            "llm.duration_ms": 1234,
            "llm.time_to_first_token": 210,
            "llm.usage.prompt_tokens": 100,
            "llm.usage.completion_tokens": 50,
            "llm.usage.total_tokens": 150,
            "llm.usage.cached_input_tokens": 30,
            "llm.usage.cache_creation_tokens": 5,
            "llm.usage.reasoning_tokens": 200,
            "http.status_code": 200,
            "request_id": "req-42",
        }
    )
    call = span_to_llm_call(span, "plano(llm)")
    assert call is not None
    assert call.request_id == "req-42"
    assert call.model == "openai-gpt-5.4"
    assert call.request_model == "router:software-engineering"
    assert call.session_id == "sess-abc"
    assert call.route_name == "software-engineering"
    assert call.is_streaming is False
    assert call.duration_ms == 1234.0
    assert call.ttft_ms == 210.0
    assert call.prompt_tokens == 100
    assert call.completion_tokens == 50
    assert call.total_tokens == 150
    assert call.cached_input_tokens == 30
    assert call.cache_creation_tokens == 5
    assert call.reasoning_tokens == 200
    assert call.status_code == 200


def test_pricing_lookup_attaches_cost():
    """A pricing object's cost_for_call result lands on call.cost_usd."""

    class StubPricing:
        def cost_for_call(self, call):
            # Simple: 2 * prompt + 3 * completion, in cents
            return 0.02 * (call.prompt_tokens or 0) + 0.03 * (call.completion_tokens or 0)

    span = _mk_span(
        {
            "llm.model": "do/openai-gpt-5.4",
            "llm.usage.prompt_tokens": 10,
            "llm.usage.completion_tokens": 2,
        }
    )
    call = span_to_llm_call(span, "plano(llm)", pricing=StubPricing())
    assert call is not None
    assert call.cost_usd == pytest.approx(0.26)


def test_tpt_and_tokens_per_sec_derived():
    """tpt_ms excludes TTFT; tokens_per_sec is its reciprocal in seconds."""
    call = LLMCall(
        request_id="x",
        timestamp=datetime.now(tz=timezone.utc),
        model="m",
        duration_ms=1000,
        ttft_ms=200,
        completion_tokens=80,
    )
    # (1000 - 200) / 80 = 10ms per token => 100 tokens/sec
    assert call.tpt_ms == 10.0
    assert call.tokens_per_sec == 100.0


def test_tpt_returns_none_when_no_completion_tokens():
    """Zero completion tokens must not divide; both derived metrics are None."""
    call = LLMCall(
        request_id="x",
        timestamp=datetime.now(tz=timezone.utc),
        model="m",
        duration_ms=1000,
        ttft_ms=200,
        completion_tokens=0,
    )
    assert call.tpt_ms is None
    assert call.tokens_per_sec is None


def test_store_evicts_fifo_at_capacity():
    """The ring buffer keeps only the newest `capacity` calls, oldest out first."""
    store = LLMCallStore(capacity=3)
    now = datetime.now(tz=timezone.utc)
    for i in range(5):
        store.add(
            LLMCall(
                request_id=f"r{i}",
                timestamp=now,
                model="m",
            )
        )
    snap = store.snapshot()
    assert len(snap) == 3
    assert [c.request_id for c in snap] == ["r2", "r3", "r4"]

View file

@ -0,0 +1,103 @@
from datetime import datetime, timezone
from planoai.obs.collector import LLMCall
from planoai.obs.pricing import ModelPrice, PricingCatalog
def _call(model: str, prompt: int, completion: int, cached: int = 0) -> LLMCall:
return LLMCall(
request_id="r",
timestamp=datetime.now(tz=timezone.utc),
model=model,
prompt_tokens=prompt,
completion_tokens=completion,
cached_input_tokens=cached,
)
def test_lookup_matches_bare_and_prefixed():
prices = {
"openai-gpt-5.4": ModelPrice(
input_per_token_usd=0.000001, output_per_token_usd=0.000002
)
}
catalog = PricingCatalog(prices)
assert catalog.price_for("openai-gpt-5.4") is not None
# do/openai-gpt-5.4 should resolve after stripping the provider prefix.
assert catalog.price_for("do/openai-gpt-5.4") is not None
assert catalog.price_for("unknown-model") is None
def test_cost_computation_without_cache():
prices = {
"m": ModelPrice(input_per_token_usd=0.000001, output_per_token_usd=0.000002)
}
cost = PricingCatalog(prices).cost_for_call(_call("m", 1000, 500))
assert cost == 0.002 # 1000 * 1e-6 + 500 * 2e-6
def test_cost_computation_with_cached_discount():
prices = {
"m": ModelPrice(
input_per_token_usd=0.000001,
output_per_token_usd=0.000002,
cached_input_per_token_usd=0.0000001,
)
}
# 800 fresh @ 1e-6 = 8e-4; 200 cached @ 1e-7 = 2e-5; 500 out @ 2e-6 = 1e-3
cost = PricingCatalog(prices).cost_for_call(_call("m", 1000, 500, cached=200))
assert cost == round(0.0008 + 0.00002 + 0.001, 6)
def test_empty_catalog_returns_none():
    """With no pricing data at all, cost cannot be computed."""
    empty_catalog = PricingCatalog()
    assert empty_catalog.cost_for_call(_call("m", 100, 50)) is None
def test_parse_do_catalog_treats_small_values_as_per_token():
    """DO's real catalog uses per-token values under the `_per_million` key
    (e.g. 5E-8 for GPT-oss-20b). We treat values < 1 as already per-token."""
    from planoai.obs.pricing import _parse_do_pricing

    sample = {
        "data": [
            {
                "model_id": "openai-gpt-oss-20b",
                "pricing": {
                    "input_price_per_million": 5e-8,
                    "output_price_per_million": 4.5e-7,
                },
            },
            {
                "model_id": "openai-gpt-oss-120b",
                "pricing": {
                    "input_price_per_million": 1e-7,
                    "output_price_per_million": 7e-7,
                },
            },
        ]
    }
    prices = _parse_do_pricing(sample)
    # Values < 1 are assumed to already be per-token — no extra division.
    assert prices["openai-gpt-oss-20b"].input_per_token_usd == 5e-8
    assert prices["openai-gpt-oss-20b"].output_per_token_usd == 4.5e-7
    assert prices["openai-gpt-oss-120b"].input_per_token_usd == 1e-7
    # Previously unchecked: the 120b output rate must pass through untouched too.
    assert prices["openai-gpt-oss-120b"].output_per_token_usd == 7e-7
def test_parse_do_catalog_divides_large_values_as_per_million():
    """A provider that genuinely reports $5-per-million in that field gets divided."""
    from planoai.obs.pricing import _parse_do_pricing

    entry = {
        "model_id": "mystery-model",
        "pricing": {
            "input_price_per_million": 5.0,  # > 1 → treated as per-million
            "output_price_per_million": 15.0,
        },
    }
    prices = _parse_do_pricing({"data": [entry]})
    mystery = prices["mystery-model"]
    assert mystery.input_per_token_usd == 5.0 / 1_000_000
    assert mystery.output_per_token_usd == 15.0 / 1_000_000

View file

@ -0,0 +1,73 @@
from datetime import datetime, timedelta, timezone
from planoai.obs.collector import LLMCall
from planoai.obs.render import aggregates, model_rollups, route_hits
def _call(
    model: str,
    ts: datetime,
    prompt=0,
    completion=0,
    cost=None,
    route=None,
    session=None,
    cache_read=0,
    cache_write=0,
):
    """Build an LLMCall carrying the fields the render-layer aggregations read."""
    return LLMCall(
        request_id="r",
        model=model,
        timestamp=ts,
        prompt_tokens=prompt,
        completion_tokens=completion,
        cached_input_tokens=cache_read,
        cache_creation_tokens=cache_write,
        cost_usd=cost,
        route_name=route,
        session_id=session,
    )
def test_aggregates_sum_and_session_counts():
    """Totals sum across calls; sessions are de-duplicated and newest wins."""
    now = datetime.now(tz=timezone.utc).astimezone()
    history = [
        _call("m1", now - timedelta(seconds=50), prompt=10, completion=5, cost=0.001, session="s1"),
        _call("m2", now - timedelta(seconds=40), prompt=20, completion=10, cost=0.002, session="s1"),
        _call("m1", now - timedelta(seconds=30), prompt=30, completion=15, cost=0.003, session="s2"),
    ]
    stats = aggregates(history)
    assert stats.count == 3
    assert stats.total_input_tokens == 60
    assert stats.total_output_tokens == 30
    assert stats.total_cost_usd == 0.006
    # Two distinct sessions seen; the most recent call belongs to "s2".
    assert stats.distinct_sessions == 2
    assert stats.current_session == "s2"
def test_rollups_split_by_model_and_cache():
    """Per-model rollups sum requests, input tokens and both cache counters."""
    now = datetime.now(tz=timezone.utc).astimezone()
    rollups = model_rollups(
        [
            _call("m1", now, prompt=10, completion=5, cost=0.001, cache_write=3, cache_read=7),
            _call("m1", now, prompt=20, completion=10, cost=0.002, cache_read=1),
            _call("m2", now, prompt=30, completion=15, cost=0.004),
        ]
    )
    by_model = {rollup.model: rollup for rollup in rollups}
    m1 = by_model["m1"]
    assert m1.requests == 2
    assert m1.input_tokens == 30
    assert m1.cache_write == 3
    assert m1.cache_read == 8  # 7 + 1 across the two m1 calls
    assert by_model["m2"].input_tokens == 30
def test_route_hits_only_for_routed_calls():
    """Calls without a route name are excluded from the route distribution."""
    now = datetime.now(tz=timezone.utc).astimezone()
    hits = route_hits(
        [
            _call("m", now, route="code"),
            _call("m", now, route="code"),
            _call("m", now, route="summarization"),
            _call("m", now),  # no route
        ]
    )
    # Only calls with route names are counted.
    assert sum(count for _, count, _ in hits) == 3
    counts_by_name = {name: count for name, count, _pct in hits}
    assert counts_by_name["code"] == 2
    assert counts_by_name["summarization"] == 1
def test_route_hits_empty_when_no_routes():
    """An all-unrouted call history yields an empty route distribution."""
    now = datetime.now(tz=timezone.utc).astimezone()
    unrouted = [_call("m", now), _call("m", now)]
    assert route_hits(unrouted) == []

View file

@ -33,7 +33,8 @@ use crate::streaming::{
ObservableStreamProcessor, StreamProcessor,
};
use crate::tracing::{
collect_custom_trace_attributes, llm as tracing_llm, operation_component, set_service_name,
collect_custom_trace_attributes, llm as tracing_llm, operation_component,
plano as tracing_plano, set_service_name,
};
use model_selection::router_chat_get_upstream_model;
@ -102,15 +103,36 @@ async fn llm_chat_inner(
.and_then(|hdr| request_headers.get(hdr))
.and_then(|v| v.to_str().ok())
.map(|s| s.to_string());
let pinned_model: Option<String> = if let Some(ref sid) = session_id {
let cached_route = if let Some(ref sid) = session_id {
state
.orchestrator_service
.get_cached_route(sid, tenant_id.as_deref())
.await
.map(|c| c.model_name)
} else {
None
};
let (pinned_model, pinned_route_name): (Option<String>, Option<String>) = match cached_route {
Some(c) => (Some(c.model_name), c.route_name),
None => (None, None),
};
// Record session id on the LLM span for the observability console.
if let Some(ref sid) = session_id {
get_active_span(|span| {
span.set_attribute(opentelemetry::KeyValue::new(
tracing_plano::SESSION_ID,
sid.clone(),
));
});
}
if let Some(ref route_name) = pinned_route_name {
get_active_span(|span| {
span.set_attribute(opentelemetry::KeyValue::new(
tracing_plano::ROUTE_NAME,
route_name.clone(),
));
});
}
let full_qualified_llm_provider_url = format!("{}{}", state.llm_provider_url, request_path);
@ -311,6 +333,18 @@ async fn llm_chat_inner(
alias_resolved_model.clone()
};
// Record route name on the LLM span (only when the orchestrator produced one).
if let Some(ref rn) = route_name {
if !rn.is_empty() && rn != "none" {
get_active_span(|span| {
span.set_attribute(opentelemetry::KeyValue::new(
tracing_plano::ROUTE_NAME,
rn.clone(),
));
});
}
}
if let Some(ref sid) = session_id {
state
.orchestrator_service
@ -671,6 +705,36 @@ async fn send_upstream(
// Propagate upstream headers and status
let response_headers = llm_response.headers().clone();
let upstream_status = llm_response.status();
// Upstream routers (e.g. DigitalOcean Gradient) may return an
// `x-model-router-selected-route` header indicating which task-level
// route the request was classified into (e.g. "Code Generation"). Surface
// it as `plano.route.name` so the obs console's Route hit % panel can
// show the breakdown even when Plano's own orchestrator wasn't in the
// routing path. Any value from Plano's orchestrator already set earlier
// takes precedence — this only fires when the span doesn't already have
// a route name.
if let Some(upstream_route) = response_headers
.get("x-model-router-selected-route")
.and_then(|v| v.to_str().ok())
{
if !upstream_route.is_empty() {
get_active_span(|span| {
span.set_attribute(opentelemetry::KeyValue::new(
crate::tracing::plano::ROUTE_NAME,
upstream_route.to_string(),
));
});
}
}
// Record the upstream HTTP status on the span for the obs console.
get_active_span(|span| {
span.set_attribute(opentelemetry::KeyValue::new(
crate::tracing::http::STATUS_CODE,
upstream_status.as_u16() as i64,
));
});
let mut response = Response::builder().status(upstream_status);
if let Some(headers) = response.headers_mut() {
for (name, value) in response_headers.iter() {

View file

@ -16,10 +16,131 @@ use tracing_opentelemetry::OpenTelemetrySpanExt;
use crate::handlers::agents::pipeline::{PipelineError, PipelineProcessor};
const STREAM_BUFFER_SIZE: usize = 16;
/// Cap on accumulated response bytes kept for usage extraction.
/// Most chat responses are well under this; pathological ones are dropped without
/// affecting pass-through streaming to the client.
const USAGE_BUFFER_MAX: usize = 2 * 1024 * 1024;
use crate::signals::{InteractionQuality, SignalAnalyzer, TextBasedSignalAnalyzer, FLAG_MARKER};
use crate::tracing::{llm, set_service_name, signals as signal_constants};
use hermesllm::apis::openai::Message;
/// Parsed usage + resolved-model details from a provider response.
/// All fields stay `None` when the corresponding key was absent from the body.
#[derive(Debug, Default, Clone)]
struct ExtractedUsage {
    // OpenAI `prompt_tokens`, falling back to Anthropic `input_tokens`.
    prompt_tokens: Option<i64>,
    // OpenAI `completion_tokens`, falling back to Anthropic `output_tokens`.
    completion_tokens: Option<i64>,
    // Provider-reported `total_tokens`, or prompt + completion when absent.
    total_tokens: Option<i64>,
    // Prompt-cache reads: OpenAI `prompt_tokens_details.cached_tokens`,
    // Anthropic `cache_read_input_tokens`, Google `cached_content_token_count`.
    cached_input_tokens: Option<i64>,
    // Cache writes: Anthropic `cache_creation_input_tokens`.
    cache_creation_tokens: Option<i64>,
    // OpenAI `completion_tokens_details.reasoning_tokens` or
    // Google `thoughts_token_count`.
    reasoning_tokens: Option<i64>,
    /// The model the upstream actually used. For router aliases (e.g.
    /// `router:software-engineering`), this differs from the request model.
    resolved_model: Option<String>,
}
impl ExtractedUsage {
    /// True when nothing useful was extracted — no token counts and no
    /// resolved model name.
    fn is_empty(&self) -> bool {
        self.prompt_tokens.is_none()
            && self.completion_tokens.is_none()
            && self.total_tokens.is_none()
            && self.resolved_model.is_none()
    }

    /// Extract usage numbers from a parsed response body.
    ///
    /// Reads OpenAI-shaped `usage` keys first, then fills any still-unset
    /// field from Anthropic/Google key names. Statement order matters: each
    /// fallback only fires when the earlier lookup produced `None`.
    fn from_json(value: &serde_json::Value) -> Self {
        let mut out = Self::default();
        // Top-level `model` names the model the upstream actually ran.
        if let Some(model) = value.get("model").and_then(|v| v.as_str()) {
            if !model.is_empty() {
                out.resolved_model = Some(model.to_string());
            }
        }
        if let Some(u) = value.get("usage") {
            // OpenAI-shape usage
            out.prompt_tokens = u.get("prompt_tokens").and_then(|v| v.as_i64());
            out.completion_tokens = u.get("completion_tokens").and_then(|v| v.as_i64());
            out.total_tokens = u.get("total_tokens").and_then(|v| v.as_i64());
            out.cached_input_tokens = u
                .get("prompt_tokens_details")
                .and_then(|d| d.get("cached_tokens"))
                .and_then(|v| v.as_i64());
            out.reasoning_tokens = u
                .get("completion_tokens_details")
                .and_then(|d| d.get("reasoning_tokens"))
                .and_then(|v| v.as_i64());
            // Anthropic-shape fallbacks
            if out.prompt_tokens.is_none() {
                out.prompt_tokens = u.get("input_tokens").and_then(|v| v.as_i64());
            }
            if out.completion_tokens.is_none() {
                out.completion_tokens = u.get("output_tokens").and_then(|v| v.as_i64());
            }
            if out.total_tokens.is_none() {
                // Derive the total when the provider reports only the parts.
                if let (Some(p), Some(c)) = (out.prompt_tokens, out.completion_tokens) {
                    out.total_tokens = Some(p + c);
                }
            }
            if out.cached_input_tokens.is_none() {
                out.cached_input_tokens = u.get("cache_read_input_tokens").and_then(|v| v.as_i64());
            }
            if out.cached_input_tokens.is_none() {
                // Google-style cached-token key.
                out.cached_input_tokens =
                    u.get("cached_content_token_count").and_then(|v| v.as_i64());
            }
            out.cache_creation_tokens = u
                .get("cache_creation_input_tokens")
                .and_then(|v| v.as_i64());
            if out.reasoning_tokens.is_none() {
                // Google-style reasoning-token key.
                out.reasoning_tokens = u.get("thoughts_token_count").and_then(|v| v.as_i64());
            }
        }
        out
    }
}
/// Try to pull usage out of an accumulated response body.
/// Handles both a single JSON object (non-streaming) and SSE streams where the
/// final `data: {...}` event carries the `usage` field.
/// Returns the all-`None` default when nothing can be extracted.
fn extract_usage_from_bytes(buf: &[u8]) -> ExtractedUsage {
    if buf.is_empty() {
        return ExtractedUsage::default();
    }
    // Fast path: full-body JSON (non-streaming).
    if let Ok(value) = serde_json::from_slice::<serde_json::Value>(buf) {
        let u = ExtractedUsage::from_json(&value);
        if !u.is_empty() {
            return u;
        }
    }
    // SSE path: scan from the end for a `data:` line containing a usage object.
    // A non-UTF-8 body cannot be SSE text, so give up quietly.
    let text = match std::str::from_utf8(buf) {
        Ok(t) => t,
        Err(_) => return ExtractedUsage::default(),
    };
    for line in text.lines().rev() {
        let trimmed = line.trim_start();
        let payload = match trimmed.strip_prefix("data:") {
            Some(p) => p.trim_start(),
            None => continue,
        };
        if payload == "[DONE]" || payload.is_empty() {
            continue;
        }
        // Cheap substring check before paying for a full JSON parse.
        if !payload.contains("\"usage\"") {
            continue;
        }
        if let Ok(value) = serde_json::from_str::<serde_json::Value>(payload) {
            let u = ExtractedUsage::from_json(&value);
            if !u.is_empty() {
                return u;
            }
        }
    }
    ExtractedUsage::default()
}
/// Trait for processing streaming chunks
/// Implementors can inject custom logic during streaming (e.g., hallucination detection, logging)
pub trait StreamProcessor: Send + 'static {
@ -60,6 +181,10 @@ pub struct ObservableStreamProcessor {
start_time: Instant,
time_to_first_token: Option<u128>,
messages: Option<Vec<Message>>,
/// Accumulated response bytes used only for best-effort usage extraction
/// on `on_complete`. Capped at `USAGE_BUFFER_MAX`; excess chunks are dropped
/// from the buffer (they still pass through to the client).
response_buffer: Vec<u8>,
}
impl ObservableStreamProcessor {
@ -93,6 +218,7 @@ impl ObservableStreamProcessor {
start_time,
time_to_first_token: None,
messages,
response_buffer: Vec::new(),
}
}
}
@ -101,6 +227,13 @@ impl StreamProcessor for ObservableStreamProcessor {
    /// Count bytes/chunks and buffer a prefix of the response for later
    /// usage extraction; the chunk itself is forwarded to the client unchanged.
    fn process_chunk(&mut self, chunk: Bytes) -> Result<Option<Bytes>, String> {
        self.total_bytes += chunk.len();
        self.chunk_count += 1;
        // Accumulate for best-effort usage extraction; drop further chunks once
        // the cap is reached so we don't retain huge response bodies in memory.
        if self.response_buffer.len() < USAGE_BUFFER_MAX {
            let remaining = USAGE_BUFFER_MAX - self.response_buffer.len();
            let take = chunk.len().min(remaining);
            self.response_buffer.extend_from_slice(&chunk[..take]);
        }
        // Pass-through: streaming to the client is never blocked or altered.
        Ok(Some(chunk))
    }
@ -124,6 +257,52 @@ impl StreamProcessor for ObservableStreamProcessor {
);
}
// Record total duration on the span for the observability console.
let duration_ms = self.start_time.elapsed().as_millis() as i64;
{
let span = tracing::Span::current();
let otel_context = span.context();
let otel_span = otel_context.span();
otel_span.set_attribute(KeyValue::new(llm::DURATION_MS, duration_ms));
otel_span.set_attribute(KeyValue::new(llm::RESPONSE_BYTES, self.total_bytes as i64));
}
// Best-effort usage extraction + emission (works for both streaming
// SSE and non-streaming JSON responses that include a `usage` object).
let usage = extract_usage_from_bytes(&self.response_buffer);
if !usage.is_empty() {
let span = tracing::Span::current();
let otel_context = span.context();
let otel_span = otel_context.span();
if let Some(v) = usage.prompt_tokens {
otel_span.set_attribute(KeyValue::new(llm::PROMPT_TOKENS, v));
}
if let Some(v) = usage.completion_tokens {
otel_span.set_attribute(KeyValue::new(llm::COMPLETION_TOKENS, v));
}
if let Some(v) = usage.total_tokens {
otel_span.set_attribute(KeyValue::new(llm::TOTAL_TOKENS, v));
}
if let Some(v) = usage.cached_input_tokens {
otel_span.set_attribute(KeyValue::new(llm::CACHED_INPUT_TOKENS, v));
}
if let Some(v) = usage.cache_creation_tokens {
otel_span.set_attribute(KeyValue::new(llm::CACHE_CREATION_TOKENS, v));
}
if let Some(v) = usage.reasoning_tokens {
otel_span.set_attribute(KeyValue::new(llm::REASONING_TOKENS, v));
}
// Override `llm.model` with the model the upstream actually ran
// (e.g. `openai-gpt-5.4` resolved from `router:software-engineering`).
// Cost lookup keys off the real model, not the alias.
if let Some(resolved) = usage.resolved_model.clone() {
otel_span.set_attribute(KeyValue::new(llm::MODEL_NAME, resolved));
}
}
// Release the buffered bytes early; nothing downstream needs them.
self.response_buffer.clear();
self.response_buffer.shrink_to_fit();
// Analyze signals if messages are available and record as span attributes
if let Some(ref messages) = self.messages {
let analyzer: Box<dyn SignalAnalyzer> = Box::new(TextBasedSignalAnalyzer::new());
@ -404,3 +583,55 @@ pub fn truncate_message(message: &str, max_length: usize) -> String {
message.to_string()
}
}
#[cfg(test)]
mod usage_extraction_tests {
    use super::*;

    // Non-streaming OpenAI body: top-level usage plus cached-token detail.
    #[test]
    fn non_streaming_openai_with_cached() {
        let body = br#"{"id":"x","model":"gpt-4o","choices":[],"usage":{"prompt_tokens":12,"completion_tokens":34,"total_tokens":46,"prompt_tokens_details":{"cached_tokens":5}}}"#;
        let u = extract_usage_from_bytes(body);
        assert_eq!(u.prompt_tokens, Some(12));
        assert_eq!(u.completion_tokens, Some(34));
        assert_eq!(u.total_tokens, Some(46));
        assert_eq!(u.cached_input_tokens, Some(5));
        assert_eq!(u.reasoning_tokens, None);
    }

    // Anthropic-shaped usage: total derived from input+output; cache keys mapped.
    #[test]
    fn non_streaming_anthropic_with_cache_creation() {
        let body = br#"{"id":"x","model":"claude","usage":{"input_tokens":100,"output_tokens":50,"cache_creation_input_tokens":20,"cache_read_input_tokens":30}}"#;
        let u = extract_usage_from_bytes(body);
        assert_eq!(u.prompt_tokens, Some(100));
        assert_eq!(u.completion_tokens, Some(50));
        assert_eq!(u.total_tokens, Some(150));
        assert_eq!(u.cached_input_tokens, Some(30));
        assert_eq!(u.cache_creation_tokens, Some(20));
    }

    // SSE stream: only the final data event carries `usage`.
    #[test]
    fn streaming_openai_final_chunk_has_usage() {
        let sse = b"data: {\"choices\":[{\"delta\":{\"content\":\"hi\"}}]}
data: {\"choices\":[{\"delta\":{}, \"finish_reason\":\"stop\"}],\"usage\":{\"prompt_tokens\":7,\"completion_tokens\":3,\"total_tokens\":10}}
data: [DONE]
";
        let u = extract_usage_from_bytes(sse);
        assert_eq!(u.prompt_tokens, Some(7));
        assert_eq!(u.completion_tokens, Some(3));
        assert_eq!(u.total_tokens, Some(10));
    }

    // Empty input yields the all-None default.
    #[test]
    fn empty_returns_default() {
        assert!(extract_usage_from_bytes(b"").is_empty());
    }

    // A JSON body with neither `usage` nor `model` extracts nothing.
    #[test]
    fn no_usage_in_body_returns_default() {
        assert!(extract_usage_from_bytes(br#"{"ok":true}"#).is_empty());
    }
}

View file

@ -80,6 +80,18 @@ pub mod llm {
/// Total tokens used (prompt + completion)
pub const TOTAL_TOKENS: &str = "llm.usage.total_tokens";
/// Tokens served from a prompt cache read
/// (OpenAI `prompt_tokens_details.cached_tokens`, Anthropic `cache_read_input_tokens`,
/// Google `cached_content_token_count`)
pub const CACHED_INPUT_TOKENS: &str = "llm.usage.cached_input_tokens";
/// Tokens used to write a prompt cache entry (Anthropic `cache_creation_input_tokens`)
pub const CACHE_CREATION_TOKENS: &str = "llm.usage.cache_creation_tokens";
/// Reasoning tokens for reasoning models
/// (OpenAI `completion_tokens_details.reasoning_tokens`, Google `thoughts_token_count`)
pub const REASONING_TOKENS: &str = "llm.usage.reasoning_tokens";
/// Temperature parameter used
pub const TEMPERATURE: &str = "llm.temperature";
@ -119,6 +131,22 @@ pub mod routing {
pub const SELECTION_REASON: &str = "routing.selection_reason";
}
// =============================================================================
// Span Attributes - Plano-specific
// =============================================================================
/// Attributes specific to Plano (session affinity, routing decisions).
/// NOTE(review): these attribute names are consumed by the obs console's
/// span collector — keep the string values stable.
pub mod plano {
    /// Session identifier propagated via the `x-model-affinity` header.
    /// Absent when the client did not send the header.
    pub const SESSION_ID: &str = "plano.session_id";

    /// Matched route name from routing (e.g. "code", "summarization",
    /// "software-engineering"). Absent when the client routed directly
    /// to a concrete model.
    pub const ROUTE_NAME: &str = "plano.route.name";
}
// =============================================================================
// Span Attributes - Error Handling
// =============================================================================

View file

@ -4,7 +4,7 @@ mod init;
mod service_name_exporter;
pub use constants::{
error, http, llm, operation_component, routing, signals, OperationNameBuilder,
error, http, llm, operation_component, plano, routing, signals, OperationNameBuilder,
};
pub use custom_attributes::collect_custom_trace_attributes;
pub use init::init_tracer;

View file

@ -435,6 +435,12 @@ impl TokenUsage for MessagesResponse {
fn total_tokens(&self) -> usize {
(self.usage.input_tokens + self.usage.output_tokens) as usize
}
    // Anthropic reports cache reads as `usage.cache_read_input_tokens`.
    fn cached_input_tokens(&self) -> Option<usize> {
        self.usage.cache_read_input_tokens.map(|t| t as usize)
    }
    // Anthropic reports cache writes as `usage.cache_creation_input_tokens`.
    fn cache_creation_tokens(&self) -> Option<usize> {
        self.usage.cache_creation_input_tokens.map(|t| t as usize)
    }
}
impl ProviderResponse for MessagesResponse {

View file

@ -596,6 +596,18 @@ impl TokenUsage for Usage {
fn total_tokens(&self) -> usize {
self.total_tokens as usize
}
    // OpenAI chat-completions usage: `prompt_tokens_details.cached_tokens`.
    fn cached_input_tokens(&self) -> Option<usize> {
        self.prompt_tokens_details
            .as_ref()
            .and_then(|d| d.cached_tokens.map(|t| t as usize))
    }
    // OpenAI chat-completions usage: `completion_tokens_details.reasoning_tokens`.
    fn reasoning_tokens(&self) -> Option<usize> {
        self.completion_tokens_details
            .as_ref()
            .and_then(|d| d.reasoning_tokens.map(|t| t as usize))
    }
}
/// Implementation of ProviderRequest for ChatCompletionsRequest

View file

@ -710,6 +710,18 @@ impl crate::providers::response::TokenUsage for ResponseUsage {
fn total_tokens(&self) -> usize {
self.total_tokens as usize
}
    // Responses-API usage: details fields are non-optional ints here, so
    // clamp negatives with `.max(0)` before the unsigned cast.
    fn cached_input_tokens(&self) -> Option<usize> {
        self.input_tokens_details
            .as_ref()
            .map(|d| d.cached_tokens.max(0) as usize)
    }
    // Clamp with `.max(0)` so a negative provider value can't wrap the cast.
    fn reasoning_tokens(&self) -> Option<usize> {
        self.output_tokens_details
            .as_ref()
            .map(|d| d.reasoning_tokens.max(0) as usize)
    }
}
/// Token details

View file

@ -23,6 +23,31 @@ pub trait TokenUsage {
fn completion_tokens(&self) -> usize;
fn prompt_tokens(&self) -> usize;
fn total_tokens(&self) -> usize;
    /// Tokens served from a prompt cache read (OpenAI `prompt_tokens_details.cached_tokens`,
    /// Anthropic `cache_read_input_tokens`, Google `cached_content_token_count`).
    /// Default: providers without cache metrics report nothing.
    fn cached_input_tokens(&self) -> Option<usize> {
        None
    }
    /// Tokens used to write a cache entry (Anthropic `cache_creation_input_tokens`).
    /// Default: providers without cache-write metrics report nothing.
    fn cache_creation_tokens(&self) -> Option<usize> {
        None
    }
    /// Reasoning tokens for reasoning models (OpenAI `completion_tokens_details.reasoning_tokens`,
    /// Google `thoughts_token_count`).
    /// Default: non-reasoning providers report nothing.
    fn reasoning_tokens(&self) -> Option<usize> {
        None
    }
}
/// Rich usage breakdown extracted from a provider response.
/// The three `Option` fields stay `None` when the provider's `TokenUsage`
/// impl does not override the corresponding default accessor.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct UsageDetails {
    pub prompt_tokens: usize,
    pub completion_tokens: usize,
    pub total_tokens: usize,
    pub cached_input_tokens: Option<usize>,
    pub cache_creation_tokens: Option<usize>,
    pub reasoning_tokens: Option<usize>,
}
pub trait ProviderResponse: Send + Sync {
@ -34,6 +59,18 @@ pub trait ProviderResponse: Send + Sync {
self.usage()
.map(|u| (u.prompt_tokens(), u.completion_tokens(), u.total_tokens()))
}
    /// Extract a rich usage breakdown including cached/cache-creation/reasoning tokens.
    /// Returns `None` when the response carries no `usage` object at all.
    fn extract_usage_details(&self) -> Option<UsageDetails> {
        self.usage().map(|u| UsageDetails {
            prompt_tokens: u.prompt_tokens(),
            completion_tokens: u.completion_tokens(),
            total_tokens: u.total_tokens(),
            cached_input_tokens: u.cached_input_tokens(),
            cache_creation_tokens: u.cache_creation_tokens(),
            reasoning_tokens: u.reasoning_tokens(),
        })
    }
}
impl ProviderResponse for ProviderResponseType {

View file

@ -340,6 +340,42 @@ And to get the list of supported currencies:
"Here is a list of the currencies that are supported for conversion from USD, along with their symbols:\n\n1. AUD - Australian Dollar\n2. BGN - Bulgarian Lev\n3. BRL - Brazilian Real\n4. CAD - Canadian Dollar\n5. CHF - Swiss Franc\n6. CNY - Chinese Renminbi Yuan\n7. CZK - Czech Koruna\n8. DKK - Danish Krone\n9. EUR - Euro\n10. GBP - British Pound\n11. HKD - Hong Kong Dollar\n12. HUF - Hungarian Forint\n13. IDR - Indonesian Rupiah\n14. ILS - Israeli New Sheqel\n15. INR - Indian Rupee\n16. ISK - Icelandic Króna\n17. JPY - Japanese Yen\n18. KRW - South Korean Won\n19. MXN - Mexican Peso\n20. MYR - Malaysian Ringgit\n21. NOK - Norwegian Krone\n22. NZD - New Zealand Dollar\n23. PHP - Philippine Peso\n24. PLN - Polish Złoty\n25. RON - Romanian Leu\n26. SEK - Swedish Krona\n27. SGD - Singapore Dollar\n28. THB - Thai Baht\n29. TRY - Turkish Lira\n30. USD - United States Dollar\n31. ZAR - South African Rand\n\nIf you want to convert USD to any of these currencies, you can select the one you are interested in."
Observability Console
---------------------
Run ``planoai obs`` in a second terminal for a live, in-memory view of LLM traffic: per-request tokens, cached/cache-creation/reasoning tokens, TTFT, latency, cost (when DO Gradient pricing is available), session grouping, and route distribution.
.. code-block:: console
$ planoai obs
# In another terminal, start the proxy — with no config, planoai synthesizes
# a pass-through config for all known providers and auto-wires OTel export
# to localhost:4317 so the console receives spans automatically.
$ planoai up
With no API keys set, every provider runs in pass-through mode — supply the ``Authorization`` header yourself on each request. For example, using DigitalOcean Gradient:
.. code-block:: console
$ curl localhost:12000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $DO_API_KEY" \
-d '{"model":"do/router:software-engineering",
"messages":[{"role":"user","content":"write code to print prime numbers in python"}],
"stream":false}'
When you do export ``OPENAI_API_KEY`` / ``ANTHROPIC_API_KEY`` / ``DO_API_KEY`` / etc. before ``planoai up``, Plano picks them up automatically and clients no longer need to send ``Authorization``.
If you already use your own ``plano_config.yaml``, add this block so spans flow to the console:
.. code-block:: yaml
tracing:
random_sampling: 100
opentracing_grpc_endpoint: http://localhost:4317
Press ``Ctrl-C`` in the obs terminal to exit. Data lives in memory only — nothing is persisted to disk.
Next Steps
==========