mirror of
https://github.com/katanemo/plano.git
synced 2026-04-30 19:36:34 +02:00
add planoai obs: live LLM observability TUI
This commit is contained in:
parent
1f701258cb
commit
d30018cf35
19 changed files with 1736 additions and 5 deletions
141
cli/test/test_obs_collector.py
Normal file
141
cli/test/test_obs_collector.py
Normal file
|
|
@ -0,0 +1,141 @@
|
|||
import time
|
||||
from datetime import datetime, timezone
|
||||
from types import SimpleNamespace
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from planoai.obs.collector import LLMCall, LLMCallStore, span_to_llm_call
|
||||
|
||||
|
||||
def _mk_attr(key: str, value):
|
||||
v = MagicMock()
|
||||
if isinstance(value, bool):
|
||||
v.WhichOneof.return_value = "bool_value"
|
||||
v.bool_value = value
|
||||
elif isinstance(value, int):
|
||||
v.WhichOneof.return_value = "int_value"
|
||||
v.int_value = value
|
||||
elif isinstance(value, float):
|
||||
v.WhichOneof.return_value = "double_value"
|
||||
v.double_value = value
|
||||
else:
|
||||
v.WhichOneof.return_value = "string_value"
|
||||
v.string_value = str(value)
|
||||
kv = MagicMock()
|
||||
kv.key = key
|
||||
kv.value = v
|
||||
return kv
|
||||
|
||||
|
||||
def _mk_span(attrs: dict, start_ns: int | None = None, span_id_hex: str = "ab") -> MagicMock:
    """Build a mock OTLP span carrying *attrs* as KeyValue attributes."""
    mock_span = MagicMock()
    mock_span.attributes = [_mk_attr(name, val) for name, val in attrs.items()]
    # Default the start time to "now", expressed in unix nanoseconds.
    mock_span.start_time_unix_nano = start_ns or int(time.time() * 1_000_000_000)
    mock_span.span_id.hex.return_value = span_id_hex
    return mock_span
|
||||
|
||||
|
||||
def test_span_without_llm_model_is_ignored():
    """A span lacking the llm.model attribute must not become an LLMCall."""
    non_llm_span = _mk_span({"http.method": "POST"})
    result = span_to_llm_call(non_llm_span, "plano(llm)")
    assert result is None
|
||||
|
||||
|
||||
def test_span_with_full_llm_attrs_produces_call():
    """Every recognised LLM span attribute maps onto the matching LLMCall field."""
    span = _mk_span(
        {
            "llm.model": "openai-gpt-5.4",
            "model.requested": "router:software-engineering",
            "plano.session_id": "sess-abc",
            "plano.route.name": "software-engineering",
            "llm.is_streaming": False,
            "llm.duration_ms": 1234,
            "llm.time_to_first_token": 210,
            "llm.usage.prompt_tokens": 100,
            "llm.usage.completion_tokens": 50,
            "llm.usage.total_tokens": 150,
            "llm.usage.cached_input_tokens": 30,
            "llm.usage.cache_creation_tokens": 5,
            "llm.usage.reasoning_tokens": 200,
            "http.status_code": 200,
            "request_id": "req-42",
        }
    )
    call = span_to_llm_call(span, "plano(llm)")
    assert call is not None
    # Streaming flag keeps its boolean identity, not just falsiness.
    assert call.is_streaming is False
    expected = {
        "request_id": "req-42",
        "model": "openai-gpt-5.4",
        "request_model": "router:software-engineering",
        "session_id": "sess-abc",
        "route_name": "software-engineering",
        "duration_ms": 1234.0,
        "ttft_ms": 210.0,
        "prompt_tokens": 100,
        "completion_tokens": 50,
        "total_tokens": 150,
        "cached_input_tokens": 30,
        "cache_creation_tokens": 5,
        "reasoning_tokens": 200,
        "status_code": 200,
    }
    for field, want in expected.items():
        assert getattr(call, field) == want, field
|
||||
|
||||
|
||||
def test_pricing_lookup_attaches_cost():
    """A pricing object handed to span_to_llm_call populates cost_usd."""

    class StubPricing:
        def cost_for_call(self, call):
            # Simple: 2 * prompt + 3 * completion, in cents
            return 0.02 * (call.prompt_tokens or 0) + 0.03 * (call.completion_tokens or 0)

    llm_span = _mk_span(
        {
            "llm.model": "do/openai-gpt-5.4",
            "llm.usage.prompt_tokens": 10,
            "llm.usage.completion_tokens": 2,
        }
    )
    result = span_to_llm_call(llm_span, "plano(llm)", pricing=StubPricing())
    assert result is not None
    # 10 * 0.02 + 2 * 0.03 = 0.26 from the stub above.
    assert result.cost_usd == pytest.approx(0.26)
|
||||
|
||||
|
||||
def test_tpt_and_tokens_per_sec_derived():
    """tpt_ms / tokens_per_sec are derived from duration, TTFT and token count."""
    fields = dict(
        request_id="x",
        timestamp=datetime.now(tz=timezone.utc),
        model="m",
        duration_ms=1000,
        ttft_ms=200,
        completion_tokens=80,
    )
    call = LLMCall(**fields)
    # Generation window is (1000 - 200) ms for 80 tokens:
    # 10ms per token, i.e. 100 tokens/sec.
    assert call.tpt_ms == 10.0
    assert call.tokens_per_sec == 100.0
|
||||
|
||||
|
||||
def test_tpt_returns_none_when_no_completion_tokens():
    """Zero completion tokens yields None for both derived rates (no div-by-zero)."""
    fields = dict(
        request_id="x",
        timestamp=datetime.now(tz=timezone.utc),
        model="m",
        duration_ms=1000,
        ttft_ms=200,
        completion_tokens=0,
    )
    call = LLMCall(**fields)
    assert call.tpt_ms is None
    assert call.tokens_per_sec is None
|
||||
|
||||
|
||||
def test_store_evicts_fifo_at_capacity():
    """Adding past capacity drops the oldest calls first (FIFO eviction)."""
    store = LLMCallStore(capacity=3)
    ts = datetime.now(tz=timezone.utc)
    calls = [LLMCall(request_id=f"r{i}", timestamp=ts, model="m") for i in range(5)]
    for call in calls:
        store.add(call)
    remaining = store.snapshot()
    assert len(remaining) == 3
    # r0 and r1 were evicted; insertion order is preserved for the rest.
    assert [c.request_id for c in remaining] == ["r2", "r3", "r4"]
|
||||
103
cli/test/test_obs_pricing.py
Normal file
103
cli/test/test_obs_pricing.py
Normal file
|
|
@ -0,0 +1,103 @@
|
|||
from datetime import datetime, timezone
|
||||
|
||||
from planoai.obs.collector import LLMCall
|
||||
from planoai.obs.pricing import ModelPrice, PricingCatalog
|
||||
|
||||
|
||||
def _call(model: str, prompt: int, completion: int, cached: int = 0) -> LLMCall:
    """Build a minimal LLMCall carrying just the token counts pricing needs."""
    fields = {
        "request_id": "r",
        "timestamp": datetime.now(tz=timezone.utc),
        "model": model,
        "prompt_tokens": prompt,
        "completion_tokens": completion,
        "cached_input_tokens": cached,
    }
    return LLMCall(**fields)
|
||||
|
||||
|
||||
def test_lookup_matches_bare_and_prefixed():
    """Lookups match exact model ids and ids carrying a provider prefix."""
    catalog = PricingCatalog(
        {
            "openai-gpt-5.4": ModelPrice(
                input_per_token_usd=0.000001, output_per_token_usd=0.000002
            )
        }
    )
    assert catalog.price_for("openai-gpt-5.4") is not None
    # Provider-prefixed ids resolve once the "do/" prefix is stripped.
    assert catalog.price_for("do/openai-gpt-5.4") is not None
    assert catalog.price_for("unknown-model") is None
|
||||
|
||||
|
||||
def test_cost_computation_without_cache():
    """Cost is prompt*input_rate + completion*output_rate when nothing is cached."""
    import math

    prices = {
        "m": ModelPrice(input_per_token_usd=0.000001, output_per_token_usd=0.000002)
    }
    cost = PricingCatalog(prices).cost_for_call(_call("m", 1000, 500))
    assert cost is not None
    # 1000 * 1e-6 + 500 * 2e-6 = 0.002. Compare tolerantly: exact `==` on a
    # sum of float products is fragile (accumulation order / rounding).
    assert math.isclose(cost, 0.002)
|
||||
|
||||
|
||||
def test_cost_computation_with_cached_discount():
    """Cached input tokens are billed at the cheaper cached rate."""
    import math

    prices = {
        "m": ModelPrice(
            input_per_token_usd=0.000001,
            output_per_token_usd=0.000002,
            cached_input_per_token_usd=0.0000001,
        )
    }
    # 800 fresh @ 1e-6 = 8e-4; 200 cached @ 1e-7 = 2e-5; 500 out @ 2e-6 = 1e-3
    cost = PricingCatalog(prices).cost_for_call(_call("m", 1000, 500, cached=200))
    assert cost is not None
    # Tolerant comparison instead of `== round(..., 6)`: it passes whether or
    # not the catalog rounds its result, and is robust to float accumulation.
    assert math.isclose(cost, 0.00182, rel_tol=1e-6)
|
||||
|
||||
|
||||
def test_empty_catalog_returns_none():
    """With no prices loaded, cost lookup yields None rather than 0."""
    empty_catalog = PricingCatalog()
    assert empty_catalog.cost_for_call(_call("m", 100, 50)) is None
|
||||
|
||||
|
||||
def test_parse_do_catalog_treats_small_values_as_per_token():
    """Values below 1 in the `_per_million` fields pass through undivided.

    DO's real catalog reports per-token values under the `_per_million` key
    (e.g. 5E-8 for GPT-oss-20b), so small magnitudes are already per-token.
    """
    from planoai.obs.pricing import _parse_do_pricing

    sample = {
        "data": [
            {
                "model_id": "openai-gpt-oss-20b",
                "pricing": {
                    "input_price_per_million": 5e-8,
                    "output_price_per_million": 4.5e-7,
                },
            },
            {
                "model_id": "openai-gpt-oss-120b",
                "pricing": {
                    "input_price_per_million": 1e-7,
                    "output_price_per_million": 7e-7,
                },
            },
        ]
    }
    parsed = _parse_do_pricing(sample)
    # No per-million division happened: the tiny magnitudes are untouched.
    assert parsed["openai-gpt-oss-20b"].input_per_token_usd == 5e-8
    assert parsed["openai-gpt-oss-20b"].output_per_token_usd == 4.5e-7
    assert parsed["openai-gpt-oss-120b"].input_per_token_usd == 1e-7
|
||||
|
||||
|
||||
def test_parse_do_catalog_divides_large_values_as_per_million():
    """A provider that genuinely reports dollars-per-million gets divided down."""
    from planoai.obs.pricing import _parse_do_pricing

    sample = {
        "data": [
            {
                "model_id": "mystery-model",
                "pricing": {
                    "input_price_per_million": 5.0,  # > 1 → treated as per-million
                    "output_price_per_million": 15.0,
                },
            },
        ]
    }
    parsed = _parse_do_pricing(sample)
    price = parsed["mystery-model"]
    # Both rates were divided by one million to get the per-token figure.
    assert price.input_per_token_usd == 5.0 / 1_000_000
    assert price.output_per_token_usd == 15.0 / 1_000_000
|
||||
73
cli/test/test_obs_render.py
Normal file
73
cli/test/test_obs_render.py
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
from planoai.obs.collector import LLMCall
|
||||
from planoai.obs.render import aggregates, model_rollups, route_hits
|
||||
|
||||
|
||||
def _call(model: str, ts: datetime, prompt=0, completion=0, cost=None, route=None, session=None, cache_read=0, cache_write=0):
    """Build an LLMCall for render tests; only the aggregated fields vary."""
    fields = {
        "request_id": "r",
        "timestamp": ts,
        "model": model,
        "prompt_tokens": prompt,
        "completion_tokens": completion,
        "cached_input_tokens": cache_read,
        "cache_creation_tokens": cache_write,
        "cost_usd": cost,
        "route_name": route,
        "session_id": session,
    }
    return LLMCall(**fields)
|
||||
|
||||
|
||||
def test_aggregates_sum_and_session_counts():
    """Totals sum across calls; sessions are counted distinctly, newest is current."""
    import math

    now = datetime.now(tz=timezone.utc).astimezone()
    calls = [
        _call("m1", now - timedelta(seconds=50), prompt=10, completion=5, cost=0.001, session="s1"),
        _call("m2", now - timedelta(seconds=40), prompt=20, completion=10, cost=0.002, session="s1"),
        _call("m1", now - timedelta(seconds=30), prompt=30, completion=15, cost=0.003, session="s2"),
    ]
    stats = aggregates(calls)
    assert stats.count == 3
    # 0.001 + 0.002 + 0.003: compare tolerantly — exact `==` on a float
    # accumulation is fragile and depends on summation order.
    assert math.isclose(stats.total_cost_usd, 0.006)
    assert stats.total_input_tokens == 60
    assert stats.total_output_tokens == 30
    assert stats.distinct_sessions == 2
    # The most recent call's session is reported as the current one.
    assert stats.current_session == "s2"
|
||||
|
||||
|
||||
def test_rollups_split_by_model_and_cache():
    """Per-model rollups accumulate requests, input tokens and cache splits."""
    now = datetime.now(tz=timezone.utc).astimezone()
    calls = [
        _call("m1", now, prompt=10, completion=5, cost=0.001, cache_write=3, cache_read=7),
        _call("m1", now, prompt=20, completion=10, cost=0.002, cache_read=1),
        _call("m2", now, prompt=30, completion=15, cost=0.004),
    ]
    per_model = {rollup.model: rollup for rollup in model_rollups(calls)}
    m1 = per_model["m1"]
    assert m1.requests == 2
    assert m1.input_tokens == 30
    # Cache reads/writes are summed independently: 3 writes, 7 + 1 reads.
    assert m1.cache_write == 3
    assert m1.cache_read == 8
    assert per_model["m2"].input_tokens == 30
|
||||
|
||||
|
||||
def test_route_hits_only_for_routed_calls():
    """route_hits tallies only calls that carry a route name."""
    now = datetime.now(tz=timezone.utc).astimezone()
    calls = [
        _call("m", now, route="code"),
        _call("m", now, route="code"),
        _call("m", now, route="summarization"),
        _call("m", now),  # no route
    ]
    hits = route_hits(calls)
    counts = {name: count for name, count, _pct in hits}
    # The un-routed call is excluded from the tally.
    assert sum(counts.values()) == 3
    assert counts["code"] == 2
    assert counts["summarization"] == 1
|
||||
|
||||
|
||||
def test_route_hits_empty_when_no_routes():
    """With no routed calls at all, the hit list is empty."""
    ts = datetime.now(tz=timezone.utc).astimezone()
    unrouted = [_call("m", ts), _call("m", ts)]
    assert route_hits(unrouted) == []
|
||||
Loading…
Add table
Add a link
Reference in a new issue