address #891 review: inline otel attrs, correct port-in-use msg, clarify DO pricing is public, split obs/trace docs

This commit is contained in:
Adil Hafeez 2026-04-17 13:26:04 -07:00
parent d09fa97568
commit 9ee33a921f
4 changed files with 80 additions and 89 deletions

View file

@ -94,25 +94,9 @@ class LLMCallStore:
return len(self._calls)
# Attribute keys mirror crates/brightstaff/src/tracing/constants.rs.
_LLM_MODEL = "llm.model"
_LLM_PROVIDER = "llm.provider"
_LLM_IS_STREAMING = "llm.is_streaming"
_LLM_DURATION_MS = "llm.duration_ms"
_LLM_TTFT_MS = "llm.time_to_first_token"
_LLM_PROMPT_TOKENS = "llm.usage.prompt_tokens"
_LLM_COMPLETION_TOKENS = "llm.usage.completion_tokens"
_LLM_TOTAL_TOKENS = "llm.usage.total_tokens"
_LLM_CACHED_INPUT_TOKENS = "llm.usage.cached_input_tokens"
_LLM_CACHE_CREATION_TOKENS = "llm.usage.cache_creation_tokens"
_LLM_REASONING_TOKENS = "llm.usage.reasoning_tokens"
_HTTP_STATUS = "http.status_code"
_MODEL_REQUESTED = "model.requested"
_PLANO_SESSION_ID = "plano.session_id"
_PLANO_ROUTE_NAME = "plano.route.name"
_ROUTING_STRATEGY = "routing.strategy"
_ROUTING_SELECTION_REASON = "routing.selection_reason"
_REQUEST_ID_KEYS = ("request_id", "http.request_id")
# Span attribute keys used below are the canonical OTel / Plano keys emitted by
# brightstaff — see crates/brightstaff/src/tracing/constants.rs for the source
# of truth.
def _anyvalue_to_python(value: Any) -> Any: # AnyValue from OTLP
@ -163,7 +147,7 @@ def span_to_llm_call(
A span is considered an LLM call iff it carries the ``llm.model`` attribute.
"""
attrs = _attrs_to_dict(span.attributes)
model = attrs.get(_LLM_MODEL)
model = attrs.get("llm.model")
if not model:
return None
@ -171,7 +155,7 @@ def span_to_llm_call(
request_id = next(
(
str(attrs[key])
for key in _REQUEST_ID_KEYS
for key in ("request_id", "http.request_id")
if key in attrs and attrs[key] is not None
),
span.span_id.hex() if span.span_id else "",
@ -187,34 +171,36 @@ def span_to_llm_call(
request_id=str(request_id),
timestamp=ts,
model=str(model),
provider=str(attrs[_LLM_PROVIDER]) if _LLM_PROVIDER in attrs else service_name,
provider=(
str(attrs["llm.provider"]) if "llm.provider" in attrs else service_name
),
request_model=(
str(attrs[_MODEL_REQUESTED]) if _MODEL_REQUESTED in attrs else None
str(attrs["model.requested"]) if "model.requested" in attrs else None
),
session_id=(
str(attrs[_PLANO_SESSION_ID]) if _PLANO_SESSION_ID in attrs else None
str(attrs["plano.session_id"]) if "plano.session_id" in attrs else None
),
route_name=(
str(attrs[_PLANO_ROUTE_NAME]) if _PLANO_ROUTE_NAME in attrs else None
str(attrs["plano.route.name"]) if "plano.route.name" in attrs else None
),
is_streaming=(
bool(attrs[_LLM_IS_STREAMING]) if _LLM_IS_STREAMING in attrs else None
bool(attrs["llm.is_streaming"]) if "llm.is_streaming" in attrs else None
),
status_code=_maybe_int(attrs.get(_HTTP_STATUS)),
prompt_tokens=_maybe_int(attrs.get(_LLM_PROMPT_TOKENS)),
completion_tokens=_maybe_int(attrs.get(_LLM_COMPLETION_TOKENS)),
total_tokens=_maybe_int(attrs.get(_LLM_TOTAL_TOKENS)),
cached_input_tokens=_maybe_int(attrs.get(_LLM_CACHED_INPUT_TOKENS)),
cache_creation_tokens=_maybe_int(attrs.get(_LLM_CACHE_CREATION_TOKENS)),
reasoning_tokens=_maybe_int(attrs.get(_LLM_REASONING_TOKENS)),
ttft_ms=_maybe_float(attrs.get(_LLM_TTFT_MS)),
duration_ms=_maybe_float(attrs.get(_LLM_DURATION_MS)),
status_code=_maybe_int(attrs.get("http.status_code")),
prompt_tokens=_maybe_int(attrs.get("llm.usage.prompt_tokens")),
completion_tokens=_maybe_int(attrs.get("llm.usage.completion_tokens")),
total_tokens=_maybe_int(attrs.get("llm.usage.total_tokens")),
cached_input_tokens=_maybe_int(attrs.get("llm.usage.cached_input_tokens")),
cache_creation_tokens=_maybe_int(attrs.get("llm.usage.cache_creation_tokens")),
reasoning_tokens=_maybe_int(attrs.get("llm.usage.reasoning_tokens")),
ttft_ms=_maybe_float(attrs.get("llm.time_to_first_token")),
duration_ms=_maybe_float(attrs.get("llm.duration_ms")),
routing_strategy=(
str(attrs[_ROUTING_STRATEGY]) if _ROUTING_STRATEGY in attrs else None
str(attrs["routing.strategy"]) if "routing.strategy" in attrs else None
),
routing_reason=(
str(attrs[_ROUTING_SELECTION_REASON])
if _ROUTING_SELECTION_REASON in attrs
str(attrs["routing.selection_reason"])
if "routing.selection_reason" in attrs
else None
),
)
@ -269,7 +255,7 @@ class ObsCollector:
if bound == 0:
raise OSError(
f"Failed to bind OTLP listener on {address}: port already in use. "
"Stop `planoai trace listen` or pick another port with --port."
"Stop tracing via `planoai trace down` or pick another port with --port."
)
server.start()
self._server = server

View file

@ -50,40 +50,20 @@ class PricingCatalog:
return list(self._prices.keys())[:n]
@classmethod
def fetch(
cls,
url: str = DEFAULT_PRICING_URL,
api_key: str | None = None,
) -> "PricingCatalog":
def fetch(cls, url: str = DEFAULT_PRICING_URL) -> "PricingCatalog":
"""Fetch pricing from DO's catalog endpoint. On failure, returns an
empty catalog (cost column will be blank).
The catalog endpoint requires a DigitalOcean Personal Access Token —
this is *not* the same as the inference ``MODEL_ACCESS_KEY`` used at
runtime. We check ``DIGITALOCEAN_TOKEN`` first (standard DO CLI env
var), then ``DO_PAT``, then fall back to ``DO_API_KEY``.
The catalog endpoint is public — no auth required, no signup — so
``planoai obs`` gets cost data on first run out of the box.
"""
import os
headers = {}
token = (
api_key
or os.environ.get("DIGITALOCEAN_TOKEN")
or os.environ.get("DO_PAT")
or os.environ.get("DO_API_KEY")
)
if token:
headers["Authorization"] = f"Bearer {token}"
try:
resp = requests.get(url, headers=headers, timeout=FETCH_TIMEOUT_SECS)
resp = requests.get(url, timeout=FETCH_TIMEOUT_SECS)
resp.raise_for_status()
data = resp.json()
except Exception as exc: # noqa: BLE001 — best-effort; never fatal
logger.warning(
"DO pricing fetch failed: %s; cost column will be blank. "
"Set DIGITALOCEAN_TOKEN with a DO Personal Access Token to "
"enable cost.",
"DO pricing fetch failed: %s; cost column will be blank.",
exc,
)
return cls()

View file

@ -63,7 +63,7 @@ def obs(port: int, host: str, capacity: int, refresh_ms: int) -> None:
else:
console.print(
" [yellow]no pricing loaded[/] — "
"[dim]set DIGITALOCEAN_TOKEN (DO Personal Access Token) to enable cost[/]"
"[dim]cost column will be blank (DO catalog unreachable)[/]"
)
store = LLMCallStore(capacity=capacity)

View file

@ -340,33 +340,19 @@ And to get the list of supported currencies:
"Here is a list of the currencies that are supported for conversion from USD, along with their symbols:\n\n1. AUD - Australian Dollar\n2. BGN - Bulgarian Lev\n3. BRL - Brazilian Real\n4. CAD - Canadian Dollar\n5. CHF - Swiss Franc\n6. CNY - Chinese Renminbi Yuan\n7. CZK - Czech Koruna\n8. DKK - Danish Krone\n9. EUR - Euro\n10. GBP - British Pound\n11. HKD - Hong Kong Dollar\n12. HUF - Hungarian Forint\n13. IDR - Indonesian Rupiah\n14. ILS - Israeli New Sheqel\n15. INR - Indian Rupee\n16. ISK - Icelandic Króna\n17. JPY - Japanese Yen\n18. KRW - South Korean Won\n19. MXN - Mexican Peso\n20. MYR - Malaysian Ringgit\n21. NOK - Norwegian Krone\n22. NZD - New Zealand Dollar\n23. PHP - Philippine Peso\n24. PLN - Polish Złoty\n25. RON - Romanian Leu\n26. SEK - Swedish Krona\n27. SGD - Singapore Dollar\n28. THB - Thai Baht\n29. TRY - Turkish Lira\n30. USD - United States Dollar\n31. ZAR - South African Rand\n\nIf you want to convert USD to any of these currencies, you can select the one you are interested in."
Observability Console
---------------------
Observability
-------------
Run ``planoai obs`` in a second terminal for a live, in-memory view of LLM traffic: per-request tokens, cached/cache-creation/reasoning tokens, TTFT, latency, cost (when DO Gradient pricing is available), session grouping, and route distribution.
Plano ships two CLI tools for visibility into LLM traffic. Both consume the same OTLP/gRPC span stream from brightstaff; they just slice it differently — use whichever (or both) fits the question you're answering.
.. code-block:: console
===================== ============================================== =============================================================
Command               When to use                                    Shows
===================== ============================================== =============================================================
``planoai obs``       Live view while you drive traffic              Per-request rows + aggregates: tokens (prompt / completion / cached / cache-creation / reasoning), TTFT, latency, cost, session id, route name, totals by model
``planoai trace``     Deep-dive into a single request after the fact Full span tree for a trace id: brightstaff → routing → upstream LLM, attributes on every span, status codes, errors
===================== ============================================== =============================================================
$ planoai obs
# In another terminal, start the proxy — with no config, planoai synthesizes
# a pass-through config for all known providers and auto-wires OTel export
# to localhost:4317 so the console receives spans automatically.
$ planoai up
With no API keys set, every provider runs in pass-through mode — supply the ``Authorization`` header yourself on each request. For example, using DigitalOcean Gradient:
.. code-block:: console
$ curl localhost:12000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $DO_API_KEY" \
-d '{"model":"do/router:software-engineering",
"messages":[{"role":"user","content":"write code to print prime numbers in python"}],
"stream":false}'
When you do export ``OPENAI_API_KEY`` / ``ANTHROPIC_API_KEY`` / ``DO_API_KEY`` / etc. before ``planoai up``, Plano picks them up automatically and clients no longer need to send ``Authorization``.
If you already use your own ``plano_config.yaml``, add this block so spans flow to the console:
Both require brightstaff to be exporting spans. If you're running the zero-config path (``planoai up`` with no config file), tracing is auto-wired to ``http://localhost:4317``. If you have your own ``plano_config.yaml``, add:
.. code-block:: yaml
@ -374,8 +360,47 @@ If you already use your own ``plano_config.yaml``, add this block so spans flow
random_sampling: 100
opentracing_grpc_endpoint: http://localhost:4317
Live console — ``planoai obs``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: console
$ planoai obs
# In another terminal:
$ planoai up
Cost is populated automatically from DigitalOcean's public pricing catalog — no signup or token required.
With no API keys set, every provider runs in pass-through mode — supply the ``Authorization`` header yourself on each request:
.. code-block:: console
$ curl localhost:12000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $DO_API_KEY" \
-d '{"model":"digitalocean/router:software-engineering",
"messages":[{"role":"user","content":"write code to print prime numbers in python"}],
"stream":false}'
When you export ``OPENAI_API_KEY`` / ``ANTHROPIC_API_KEY`` / ``DO_API_KEY`` / etc. before ``planoai up``, Plano picks them up and clients no longer need to send ``Authorization``.
Press ``Ctrl-C`` in the obs terminal to exit. Data lives in memory only — nothing is persisted to disk.
Single-request traces — ``planoai trace``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
When you need to understand what happened on one specific request (which model was picked, how long each hop took, what an upstream returned), use ``trace``:
.. code-block:: console
$ planoai trace listen # start the OTLP listener (daemon)
# drive some traffic through localhost:12000 ...
$ planoai trace # show the most recent trace
$ planoai trace <trace-id> # show a specific trace by id
$ planoai trace --list # list the last 50 trace ids
Use ``obs`` to spot that p95 latency spiked for ``openai-gpt-5.4``; switch to ``trace`` on one of those slow request ids to see which hop burned the time.
Next Steps
==========