From 9ee33a921f3fb367f596c41a18865e75727eda0a Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Fri, 17 Apr 2026 13:26:04 -0700 Subject: [PATCH] address #891 review: inline otel attrs, correct port-in-use msg, clarify DO pricing is public, split obs/trace docs --- cli/planoai/obs/collector.py | 64 +++++++++------------- cli/planoai/obs/pricing.py | 30 ++--------- cli/planoai/obs_cmd.py | 2 +- docs/source/get_started/quickstart.rst | 73 +++++++++++++++++--------- 4 files changed, 80 insertions(+), 89 deletions(-) diff --git a/cli/planoai/obs/collector.py b/cli/planoai/obs/collector.py index 92875369..7f4cae36 100644 --- a/cli/planoai/obs/collector.py +++ b/cli/planoai/obs/collector.py @@ -94,25 +94,9 @@ class LLMCallStore: return len(self._calls) -# Attribute keys mirror crates/brightstaff/src/tracing/constants.rs. -_LLM_MODEL = "llm.model" -_LLM_PROVIDER = "llm.provider" -_LLM_IS_STREAMING = "llm.is_streaming" -_LLM_DURATION_MS = "llm.duration_ms" -_LLM_TTFT_MS = "llm.time_to_first_token" -_LLM_PROMPT_TOKENS = "llm.usage.prompt_tokens" -_LLM_COMPLETION_TOKENS = "llm.usage.completion_tokens" -_LLM_TOTAL_TOKENS = "llm.usage.total_tokens" -_LLM_CACHED_INPUT_TOKENS = "llm.usage.cached_input_tokens" -_LLM_CACHE_CREATION_TOKENS = "llm.usage.cache_creation_tokens" -_LLM_REASONING_TOKENS = "llm.usage.reasoning_tokens" -_HTTP_STATUS = "http.status_code" -_MODEL_REQUESTED = "model.requested" -_PLANO_SESSION_ID = "plano.session_id" -_PLANO_ROUTE_NAME = "plano.route.name" -_ROUTING_STRATEGY = "routing.strategy" -_ROUTING_SELECTION_REASON = "routing.selection_reason" -_REQUEST_ID_KEYS = ("request_id", "http.request_id") +# Span attribute keys used below are the canonical OTel / Plano keys emitted by +# brightstaff — see crates/brightstaff/src/tracing/constants.rs for the source +# of truth. def _anyvalue_to_python(value: Any) -> Any: # AnyValue from OTLP @@ -163,7 +147,7 @@ def span_to_llm_call( A span is considered an LLM call iff it carries the ``llm.model`` attribute. 
""" attrs = _attrs_to_dict(span.attributes) - model = attrs.get(_LLM_MODEL) + model = attrs.get("llm.model") if not model: return None @@ -171,7 +155,7 @@ def span_to_llm_call( request_id = next( ( str(attrs[key]) - for key in _REQUEST_ID_KEYS + for key in ("request_id", "http.request_id") if key in attrs and attrs[key] is not None ), span.span_id.hex() if span.span_id else "", @@ -187,34 +171,36 @@ def span_to_llm_call( request_id=str(request_id), timestamp=ts, model=str(model), - provider=str(attrs[_LLM_PROVIDER]) if _LLM_PROVIDER in attrs else service_name, + provider=( + str(attrs["llm.provider"]) if "llm.provider" in attrs else service_name + ), request_model=( - str(attrs[_MODEL_REQUESTED]) if _MODEL_REQUESTED in attrs else None + str(attrs["model.requested"]) if "model.requested" in attrs else None ), session_id=( - str(attrs[_PLANO_SESSION_ID]) if _PLANO_SESSION_ID in attrs else None + str(attrs["plano.session_id"]) if "plano.session_id" in attrs else None ), route_name=( - str(attrs[_PLANO_ROUTE_NAME]) if _PLANO_ROUTE_NAME in attrs else None + str(attrs["plano.route.name"]) if "plano.route.name" in attrs else None ), is_streaming=( - bool(attrs[_LLM_IS_STREAMING]) if _LLM_IS_STREAMING in attrs else None + bool(attrs["llm.is_streaming"]) if "llm.is_streaming" in attrs else None ), - status_code=_maybe_int(attrs.get(_HTTP_STATUS)), - prompt_tokens=_maybe_int(attrs.get(_LLM_PROMPT_TOKENS)), - completion_tokens=_maybe_int(attrs.get(_LLM_COMPLETION_TOKENS)), - total_tokens=_maybe_int(attrs.get(_LLM_TOTAL_TOKENS)), - cached_input_tokens=_maybe_int(attrs.get(_LLM_CACHED_INPUT_TOKENS)), - cache_creation_tokens=_maybe_int(attrs.get(_LLM_CACHE_CREATION_TOKENS)), - reasoning_tokens=_maybe_int(attrs.get(_LLM_REASONING_TOKENS)), - ttft_ms=_maybe_float(attrs.get(_LLM_TTFT_MS)), - duration_ms=_maybe_float(attrs.get(_LLM_DURATION_MS)), + status_code=_maybe_int(attrs.get("http.status_code")), + prompt_tokens=_maybe_int(attrs.get("llm.usage.prompt_tokens")), + 
completion_tokens=_maybe_int(attrs.get("llm.usage.completion_tokens")), + total_tokens=_maybe_int(attrs.get("llm.usage.total_tokens")), + cached_input_tokens=_maybe_int(attrs.get("llm.usage.cached_input_tokens")), + cache_creation_tokens=_maybe_int(attrs.get("llm.usage.cache_creation_tokens")), + reasoning_tokens=_maybe_int(attrs.get("llm.usage.reasoning_tokens")), + ttft_ms=_maybe_float(attrs.get("llm.time_to_first_token")), + duration_ms=_maybe_float(attrs.get("llm.duration_ms")), routing_strategy=( - str(attrs[_ROUTING_STRATEGY]) if _ROUTING_STRATEGY in attrs else None + str(attrs["routing.strategy"]) if "routing.strategy" in attrs else None ), routing_reason=( - str(attrs[_ROUTING_SELECTION_REASON]) - if _ROUTING_SELECTION_REASON in attrs + str(attrs["routing.selection_reason"]) + if "routing.selection_reason" in attrs else None ), ) @@ -269,7 +255,7 @@ class ObsCollector: if bound == 0: raise OSError( f"Failed to bind OTLP listener on {address}: port already in use. " - "Stop `planoai trace listen` or pick another port with --port." + "Stop tracing via `planoai trace down` or pick another port with --port." ) server.start() self._server = server diff --git a/cli/planoai/obs/pricing.py b/cli/planoai/obs/pricing.py index 406b2cad..19eb1297 100644 --- a/cli/planoai/obs/pricing.py +++ b/cli/planoai/obs/pricing.py @@ -50,40 +50,20 @@ class PricingCatalog: return list(self._prices.keys())[:n] @classmethod - def fetch( - cls, - url: str = DEFAULT_PRICING_URL, - api_key: str | None = None, - ) -> "PricingCatalog": + def fetch(cls, url: str = DEFAULT_PRICING_URL) -> "PricingCatalog": """Fetch pricing from DO's catalog endpoint. On failure, returns an empty catalog (cost column will be blank). - The catalog endpoint requires a DigitalOcean Personal Access Token — - this is *not* the same as the inference ``MODEL_ACCESS_KEY`` used at - runtime. We check ``DIGITALOCEAN_TOKEN`` first (standard DO CLI env - var), then ``DO_PAT``, then fall back to ``DO_API_KEY``. 
+ The catalog endpoint is public — no auth required, no signup — so + ``planoai obs`` gets cost data on first run out of the box. """ - import os - - headers = {} - token = ( - api_key - or os.environ.get("DIGITALOCEAN_TOKEN") - or os.environ.get("DO_PAT") - or os.environ.get("DO_API_KEY") - ) - if token: - headers["Authorization"] = f"Bearer {token}" - try: - resp = requests.get(url, headers=headers, timeout=FETCH_TIMEOUT_SECS) + resp = requests.get(url, timeout=FETCH_TIMEOUT_SECS) resp.raise_for_status() data = resp.json() except Exception as exc: # noqa: BLE001 — best-effort; never fatal logger.warning( - "DO pricing fetch failed: %s; cost column will be blank. " - "Set DIGITALOCEAN_TOKEN with a DO Personal Access Token to " - "enable cost.", + "DO pricing fetch failed: %s; cost column will be blank.", exc, ) return cls() diff --git a/cli/planoai/obs_cmd.py b/cli/planoai/obs_cmd.py index 2bd51fbc..6249df30 100644 --- a/cli/planoai/obs_cmd.py +++ b/cli/planoai/obs_cmd.py @@ -63,7 +63,7 @@ def obs(port: int, host: str, capacity: int, refresh_ms: int) -> None: else: console.print( " [yellow]no pricing loaded[/] — " - "[dim]set DIGITALOCEAN_TOKEN (DO Personal Access Token) to enable cost[/]" + "[dim]cost column will be blank (DO catalog unreachable)[/]" ) store = LLMCallStore(capacity=capacity) diff --git a/docs/source/get_started/quickstart.rst b/docs/source/get_started/quickstart.rst index fa9e4e22..50916eae 100644 --- a/docs/source/get_started/quickstart.rst +++ b/docs/source/get_started/quickstart.rst @@ -340,33 +340,19 @@ And to get the list of supported currencies: "Here is a list of the currencies that are supported for conversion from USD, along with their symbols:\n\n1. AUD - Australian Dollar\n2. BGN - Bulgarian Lev\n3. BRL - Brazilian Real\n4. CAD - Canadian Dollar\n5. CHF - Swiss Franc\n6. CNY - Chinese Renminbi Yuan\n7. CZK - Czech Koruna\n8. DKK - Danish Krone\n9. EUR - Euro\n10. GBP - British Pound\n11. HKD - Hong Kong Dollar\n12. 
HUF - Hungarian Forint\n13. IDR - Indonesian Rupiah\n14. ILS - Israeli New Sheqel\n15. INR - Indian Rupee\n16. ISK - Icelandic Króna\n17. JPY - Japanese Yen\n18. KRW - South Korean Won\n19. MXN - Mexican Peso\n20. MYR - Malaysian Ringgit\n21. NOK - Norwegian Krone\n22. NZD - New Zealand Dollar\n23. PHP - Philippine Peso\n24. PLN - Polish Złoty\n25. RON - Romanian Leu\n26. SEK - Swedish Krona\n27. SGD - Singapore Dollar\n28. THB - Thai Baht\n29. TRY - Turkish Lira\n30. USD - United States Dollar\n31. ZAR - South African Rand\n\nIf you want to convert USD to any of these currencies, you can select the one you are interested in." -Observability Console ---------------------- +Observability +------------- -Run ``planoai obs`` in a second terminal for a live, in-memory view of LLM traffic: per-request tokens, cached/cache-creation/reasoning tokens, TTFT, latency, cost (when DO Gradient pricing is available), session grouping, and route distribution. +Plano ships two CLI tools for visibility into LLM traffic. Both consume the same OTLP/gRPC span stream from brightstaff; they just slice it differently — use whichever (or both) fits the question you're answering. -.. 
code-block:: console +===================== ============================================ ============================================================= +Command When to use Shows +===================== ============================================ ============================================================= +``planoai obs`` Live view while you drive traffic Per-request rows + aggregates: tokens (prompt / completion / cached / cache-creation / reasoning), TTFT, latency, cost, session id, route name, totals by model +``planoai trace`` Deep-dive into a single request after the fact Full span tree for a trace id: brightstaff → routing → upstream LLM, attributes on every span, status codes, errors +===================== ============================================ ============================================================= - $ planoai obs - # In another terminal, start the proxy — with no config, planoai synthesizes - # a pass-through config for all known providers and auto-wires OTel export - # to localhost:4317 so the console receives spans automatically. - $ planoai up - -With no API keys set, every provider runs in pass-through mode — supply the ``Authorization`` header yourself on each request. For example, using DigitalOcean Gradient: - -.. code-block:: console - - $ curl localhost:12000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer $DO_API_KEY" \ - -d '{"model":"do/router:software-engineering", - "messages":[{"role":"user","content":"write code to print prime numbers in python"}], - "stream":false}' - -When you do export ``OPENAI_API_KEY`` / ``ANTHROPIC_API_KEY`` / ``DO_API_KEY`` / etc. before ``planoai up``, Plano picks them up automatically and clients no longer need to send ``Authorization``. - -If you already use your own ``plano_config.yaml``, add this block so spans flow to the console: +Both require brightstaff to be exporting spans. 
If you're running the zero-config path (``planoai up`` with no config file), tracing is auto-wired to ``http://localhost:4317``. If you have your own ``plano_config.yaml``, add: .. code-block:: yaml @@ -374,8 +360,47 @@ If you already use your own ``plano_config.yaml``, add this block so spans flow random_sampling: 100 opentracing_grpc_endpoint: http://localhost:4317 +Live console — ``planoai obs`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: console + + $ planoai obs + # In another terminal: + $ planoai up + +Cost is populated automatically from DigitalOcean's public pricing catalog — no signup or token required. + +With no API keys set, every provider runs in pass-through mode — supply the ``Authorization`` header yourself on each request: + +.. code-block:: console + + $ curl localhost:12000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $DO_API_KEY" \ + -d '{"model":"digitalocean/router:software-engineering", + "messages":[{"role":"user","content":"write code to print prime numbers in python"}], + "stream":false}' + +When you export ``OPENAI_API_KEY`` / ``ANTHROPIC_API_KEY`` / ``DO_API_KEY`` / etc. before ``planoai up``, Plano picks them up and clients no longer need to send ``Authorization``. + Press ``Ctrl-C`` in the obs terminal to exit. Data lives in memory only — nothing is persisted to disk. +Single-request traces — ``planoai trace`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When you need to understand what happened on one specific request (which model was picked, how long each hop took, what an upstream returned), use ``trace``: + +.. code-block:: console + + $ planoai trace listen # start the OTLP listener (daemon) + # drive some traffic through localhost:12000 ... 
+ $ planoai trace # show the most recent trace + $ planoai trace <trace-id> # show a specific trace by id + $ planoai trace --list # list the last 50 trace ids + +Use ``obs`` to spot that p95 latency spiked for ``openai-gpt-5.4``; switch to ``trace`` on one of those slow request ids to see which hop burned the time. + Next Steps ==========