mirror of
https://github.com/katanemo/plano.git
synced 2026-06-11 15:05:14 +02:00
address #891 review: inline otel attrs, correct port-in-use msg, clarify DO pricing is public, split obs/trace docs
This commit is contained in:
parent
d09fa97568
commit
9ee33a921f
4 changed files with 80 additions and 89 deletions
|
|
@ -94,25 +94,9 @@ class LLMCallStore:
|
|||
return len(self._calls)
|
||||
|
||||
|
||||
# Attribute keys mirror crates/brightstaff/src/tracing/constants.rs.
|
||||
_LLM_MODEL = "llm.model"
|
||||
_LLM_PROVIDER = "llm.provider"
|
||||
_LLM_IS_STREAMING = "llm.is_streaming"
|
||||
_LLM_DURATION_MS = "llm.duration_ms"
|
||||
_LLM_TTFT_MS = "llm.time_to_first_token"
|
||||
_LLM_PROMPT_TOKENS = "llm.usage.prompt_tokens"
|
||||
_LLM_COMPLETION_TOKENS = "llm.usage.completion_tokens"
|
||||
_LLM_TOTAL_TOKENS = "llm.usage.total_tokens"
|
||||
_LLM_CACHED_INPUT_TOKENS = "llm.usage.cached_input_tokens"
|
||||
_LLM_CACHE_CREATION_TOKENS = "llm.usage.cache_creation_tokens"
|
||||
_LLM_REASONING_TOKENS = "llm.usage.reasoning_tokens"
|
||||
_HTTP_STATUS = "http.status_code"
|
||||
_MODEL_REQUESTED = "model.requested"
|
||||
_PLANO_SESSION_ID = "plano.session_id"
|
||||
_PLANO_ROUTE_NAME = "plano.route.name"
|
||||
_ROUTING_STRATEGY = "routing.strategy"
|
||||
_ROUTING_SELECTION_REASON = "routing.selection_reason"
|
||||
_REQUEST_ID_KEYS = ("request_id", "http.request_id")
|
||||
# Span attribute keys used below are the canonical OTel / Plano keys emitted by
|
||||
# brightstaff — see crates/brightstaff/src/tracing/constants.rs for the source
|
||||
# of truth.
|
||||
|
||||
|
||||
def _anyvalue_to_python(value: Any) -> Any: # AnyValue from OTLP
|
||||
|
|
@ -163,7 +147,7 @@ def span_to_llm_call(
|
|||
A span is considered an LLM call iff it carries the ``llm.model`` attribute.
|
||||
"""
|
||||
attrs = _attrs_to_dict(span.attributes)
|
||||
model = attrs.get(_LLM_MODEL)
|
||||
model = attrs.get("llm.model")
|
||||
if not model:
|
||||
return None
|
||||
|
||||
|
|
@ -171,7 +155,7 @@ def span_to_llm_call(
|
|||
request_id = next(
|
||||
(
|
||||
str(attrs[key])
|
||||
for key in _REQUEST_ID_KEYS
|
||||
for key in ("request_id", "http.request_id")
|
||||
if key in attrs and attrs[key] is not None
|
||||
),
|
||||
span.span_id.hex() if span.span_id else "",
|
||||
|
|
@ -187,34 +171,36 @@ def span_to_llm_call(
|
|||
request_id=str(request_id),
|
||||
timestamp=ts,
|
||||
model=str(model),
|
||||
provider=str(attrs[_LLM_PROVIDER]) if _LLM_PROVIDER in attrs else service_name,
|
||||
provider=(
|
||||
str(attrs["llm.provider"]) if "llm.provider" in attrs else service_name
|
||||
),
|
||||
request_model=(
|
||||
str(attrs[_MODEL_REQUESTED]) if _MODEL_REQUESTED in attrs else None
|
||||
str(attrs["model.requested"]) if "model.requested" in attrs else None
|
||||
),
|
||||
session_id=(
|
||||
str(attrs[_PLANO_SESSION_ID]) if _PLANO_SESSION_ID in attrs else None
|
||||
str(attrs["plano.session_id"]) if "plano.session_id" in attrs else None
|
||||
),
|
||||
route_name=(
|
||||
str(attrs[_PLANO_ROUTE_NAME]) if _PLANO_ROUTE_NAME in attrs else None
|
||||
str(attrs["plano.route.name"]) if "plano.route.name" in attrs else None
|
||||
),
|
||||
is_streaming=(
|
||||
bool(attrs[_LLM_IS_STREAMING]) if _LLM_IS_STREAMING in attrs else None
|
||||
bool(attrs["llm.is_streaming"]) if "llm.is_streaming" in attrs else None
|
||||
),
|
||||
status_code=_maybe_int(attrs.get(_HTTP_STATUS)),
|
||||
prompt_tokens=_maybe_int(attrs.get(_LLM_PROMPT_TOKENS)),
|
||||
completion_tokens=_maybe_int(attrs.get(_LLM_COMPLETION_TOKENS)),
|
||||
total_tokens=_maybe_int(attrs.get(_LLM_TOTAL_TOKENS)),
|
||||
cached_input_tokens=_maybe_int(attrs.get(_LLM_CACHED_INPUT_TOKENS)),
|
||||
cache_creation_tokens=_maybe_int(attrs.get(_LLM_CACHE_CREATION_TOKENS)),
|
||||
reasoning_tokens=_maybe_int(attrs.get(_LLM_REASONING_TOKENS)),
|
||||
ttft_ms=_maybe_float(attrs.get(_LLM_TTFT_MS)),
|
||||
duration_ms=_maybe_float(attrs.get(_LLM_DURATION_MS)),
|
||||
status_code=_maybe_int(attrs.get("http.status_code")),
|
||||
prompt_tokens=_maybe_int(attrs.get("llm.usage.prompt_tokens")),
|
||||
completion_tokens=_maybe_int(attrs.get("llm.usage.completion_tokens")),
|
||||
total_tokens=_maybe_int(attrs.get("llm.usage.total_tokens")),
|
||||
cached_input_tokens=_maybe_int(attrs.get("llm.usage.cached_input_tokens")),
|
||||
cache_creation_tokens=_maybe_int(attrs.get("llm.usage.cache_creation_tokens")),
|
||||
reasoning_tokens=_maybe_int(attrs.get("llm.usage.reasoning_tokens")),
|
||||
ttft_ms=_maybe_float(attrs.get("llm.time_to_first_token")),
|
||||
duration_ms=_maybe_float(attrs.get("llm.duration_ms")),
|
||||
routing_strategy=(
|
||||
str(attrs[_ROUTING_STRATEGY]) if _ROUTING_STRATEGY in attrs else None
|
||||
str(attrs["routing.strategy"]) if "routing.strategy" in attrs else None
|
||||
),
|
||||
routing_reason=(
|
||||
str(attrs[_ROUTING_SELECTION_REASON])
|
||||
if _ROUTING_SELECTION_REASON in attrs
|
||||
str(attrs["routing.selection_reason"])
|
||||
if "routing.selection_reason" in attrs
|
||||
else None
|
||||
),
|
||||
)
|
||||
|
|
@ -269,7 +255,7 @@ class ObsCollector:
|
|||
if bound == 0:
|
||||
raise OSError(
|
||||
f"Failed to bind OTLP listener on {address}: port already in use. "
|
||||
"Stop `planoai trace listen` or pick another port with --port."
|
||||
"Stop tracing via `planoai trace down` or pick another port with --port."
|
||||
)
|
||||
server.start()
|
||||
self._server = server
|
||||
|
|
|
|||
|
|
@ -50,40 +50,20 @@ class PricingCatalog:
|
|||
return list(self._prices.keys())[:n]
|
||||
|
||||
@classmethod
|
||||
def fetch(
|
||||
cls,
|
||||
url: str = DEFAULT_PRICING_URL,
|
||||
api_key: str | None = None,
|
||||
) -> "PricingCatalog":
|
||||
def fetch(cls, url: str = DEFAULT_PRICING_URL) -> "PricingCatalog":
|
||||
"""Fetch pricing from DO's catalog endpoint. On failure, returns an
|
||||
empty catalog (cost column will be blank).
|
||||
|
||||
The catalog endpoint requires a DigitalOcean Personal Access Token —
|
||||
this is *not* the same as the inference ``MODEL_ACCESS_KEY`` used at
|
||||
runtime. We check ``DIGITALOCEAN_TOKEN`` first (standard DO CLI env
|
||||
var), then ``DO_PAT``, then fall back to ``DO_API_KEY``.
|
||||
The catalog endpoint is public — no auth required, no signup — so
|
||||
``planoai obs`` gets cost data on first run out of the box.
|
||||
"""
|
||||
import os
|
||||
|
||||
headers = {}
|
||||
token = (
|
||||
api_key
|
||||
or os.environ.get("DIGITALOCEAN_TOKEN")
|
||||
or os.environ.get("DO_PAT")
|
||||
or os.environ.get("DO_API_KEY")
|
||||
)
|
||||
if token:
|
||||
headers["Authorization"] = f"Bearer {token}"
|
||||
|
||||
try:
|
||||
resp = requests.get(url, headers=headers, timeout=FETCH_TIMEOUT_SECS)
|
||||
resp = requests.get(url, timeout=FETCH_TIMEOUT_SECS)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
except Exception as exc: # noqa: BLE001 — best-effort; never fatal
|
||||
logger.warning(
|
||||
"DO pricing fetch failed: %s; cost column will be blank. "
|
||||
"Set DIGITALOCEAN_TOKEN with a DO Personal Access Token to "
|
||||
"enable cost.",
|
||||
"DO pricing fetch failed: %s; cost column will be blank.",
|
||||
exc,
|
||||
)
|
||||
return cls()
|
||||
|
|
|
|||
|
|
@ -63,7 +63,7 @@ def obs(port: int, host: str, capacity: int, refresh_ms: int) -> None:
|
|||
else:
|
||||
console.print(
|
||||
" [yellow]no pricing loaded[/] — "
|
||||
"[dim]set DIGITALOCEAN_TOKEN (DO Personal Access Token) to enable cost[/]"
|
||||
"[dim]cost column will be blank (DO catalog unreachable)[/]"
|
||||
)
|
||||
|
||||
store = LLMCallStore(capacity=capacity)
|
||||
|
|
|
|||
|
|
@ -340,33 +340,19 @@ And to get the list of supported currencies:
|
|||
"Here is a list of the currencies that are supported for conversion from USD, along with their symbols:\n\n1. AUD - Australian Dollar\n2. BGN - Bulgarian Lev\n3. BRL - Brazilian Real\n4. CAD - Canadian Dollar\n5. CHF - Swiss Franc\n6. CNY - Chinese Renminbi Yuan\n7. CZK - Czech Koruna\n8. DKK - Danish Krone\n9. EUR - Euro\n10. GBP - British Pound\n11. HKD - Hong Kong Dollar\n12. HUF - Hungarian Forint\n13. IDR - Indonesian Rupiah\n14. ILS - Israeli New Sheqel\n15. INR - Indian Rupee\n16. ISK - Icelandic Króna\n17. JPY - Japanese Yen\n18. KRW - South Korean Won\n19. MXN - Mexican Peso\n20. MYR - Malaysian Ringgit\n21. NOK - Norwegian Krone\n22. NZD - New Zealand Dollar\n23. PHP - Philippine Peso\n24. PLN - Polish Złoty\n25. RON - Romanian Leu\n26. SEK - Swedish Krona\n27. SGD - Singapore Dollar\n28. THB - Thai Baht\n29. TRY - Turkish Lira\n30. USD - United States Dollar\n31. ZAR - South African Rand\n\nIf you want to convert USD to any of these currencies, you can select the one you are interested in."
|
||||
|
||||
|
||||
Observability Console
|
||||
---------------------
|
||||
Observability
|
||||
-------------
|
||||
|
||||
Run ``planoai obs`` in a second terminal for a live, in-memory view of LLM traffic: per-request tokens, cached/cache-creation/reasoning tokens, TTFT, latency, cost (when DO Gradient pricing is available), session grouping, and route distribution.
|
||||
Plano ships two CLI commands for visibility into LLM traffic. Both consume the same OTLP/gRPC span stream from brightstaff; they just slice it differently — use whichever (or both) fits the question you're answering.
|
||||
|
||||
.. code-block:: console
|
||||
===================== ============================================ =============================================================
|
||||
Command When to use Shows
|
||||
===================== ============================================ =============================================================
|
||||
``planoai obs`` Live view while you drive traffic Per-request rows + aggregates: tokens (prompt / completion / cached / cache-creation / reasoning), TTFT, latency, cost, session id, route name, totals by model
|
||||
``planoai trace`` Deep-dive into a single request after the fact Full span tree for a trace id: brightstaff → routing → upstream LLM, attributes on every span, status codes, errors
|
||||
===================== ============================================ =============================================================
|
||||
|
||||
$ planoai obs
|
||||
# In another terminal, start the proxy — with no config, planoai synthesizes
|
||||
# a pass-through config for all known providers and auto-wires OTel export
|
||||
# to localhost:4317 so the console receives spans automatically.
|
||||
$ planoai up
|
||||
|
||||
With no API keys set, every provider runs in pass-through mode — supply the ``Authorization`` header yourself on each request. For example, using DigitalOcean Gradient:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ curl localhost:12000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer $DO_API_KEY" \
|
||||
-d '{"model":"do/router:software-engineering",
|
||||
"messages":[{"role":"user","content":"write code to print prime numbers in python"}],
|
||||
"stream":false}'
|
||||
|
||||
When you do export ``OPENAI_API_KEY`` / ``ANTHROPIC_API_KEY`` / ``DO_API_KEY`` / etc. before ``planoai up``, Plano picks them up automatically and clients no longer need to send ``Authorization``.
|
||||
|
||||
If you already use your own ``plano_config.yaml``, add this block so spans flow to the console:
|
||||
Both require brightstaff to be exporting spans. If you're running the zero-config path (``planoai up`` with no config file), tracing is auto-wired to ``http://localhost:4317``. If you have your own ``plano_config.yaml``, add:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
|
|
@ -374,8 +360,47 @@ If you already use your own ``plano_config.yaml``, add this block so spans flow
|
|||
random_sampling: 100
|
||||
opentracing_grpc_endpoint: http://localhost:4317
|
||||
|
||||
Live console — ``planoai obs``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ planoai obs
|
||||
# In another terminal:
|
||||
$ planoai up
|
||||
|
||||
Cost is populated automatically from DigitalOcean's public pricing catalog — no signup or token required.
|
||||
|
||||
With no API keys set, every provider runs in pass-through mode — supply the ``Authorization`` header yourself on each request:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ curl localhost:12000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer $DO_API_KEY" \
|
||||
-d '{"model":"digitalocean/router:software-engineering",
|
||||
"messages":[{"role":"user","content":"write code to print prime numbers in python"}],
|
||||
"stream":false}'
|
||||
|
||||
When you export ``OPENAI_API_KEY`` / ``ANTHROPIC_API_KEY`` / ``DO_API_KEY`` / etc. before ``planoai up``, Plano picks them up and clients no longer need to send ``Authorization``.
|
||||
|
||||
Press ``Ctrl-C`` in the obs terminal to exit. Data lives in memory only — nothing is persisted to disk.
|
||||
|
||||
Single-request traces — ``planoai trace``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
When you need to understand what happened on one specific request (which model was picked, how long each hop took, what an upstream returned), use ``trace``:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ planoai trace listen # start the OTLP listener (daemon)
|
||||
# drive some traffic through localhost:12000 ...
|
||||
$ planoai trace # show the most recent trace
|
||||
$ planoai trace <trace-id> # show a specific trace by id
|
||||
$ planoai trace --list # list the last 50 trace ids
|
||||
|
||||
Use ``obs`` to spot that p95 latency spiked for ``openai-gpt-5.4``; then run ``planoai trace <trace-id>`` for one of those slow requests to see which hop burned the time.
|
||||
|
||||
Next Steps
|
||||
==========
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue