add planoai obs: live LLM observability TUI

2026-04-30 19:36:34 +02:00 · 2026-04-17 00:52:46 -07:00 · 2026-04-17 00:52:46 -07:00 · d30018cf35
commit d30018cf35
parent 1f701258cb
19 changed files with 1736 additions and 5 deletions
--- a/cli/test/test_obs_collector.py
+++ b/cli/test/test_obs_collector.py
@ -0,0 +1,141 @@
+import time
+from datetime import datetime, timezone
+from types import SimpleNamespace
+from unittest.mock import MagicMock
+
+import pytest
+
+from planoai.obs.collector import LLMCall, LLMCallStore, span_to_llm_call
+
+
+def _mk_attr(key: str, value):
+    v = MagicMock()
+    if isinstance(value, bool):
+        v.WhichOneof.return_value = "bool_value"
+        v.bool_value = value
+    elif isinstance(value, int):
+        v.WhichOneof.return_value = "int_value"
+        v.int_value = value
+    elif isinstance(value, float):
+        v.WhichOneof.return_value = "double_value"
+        v.double_value = value
+    else:
+        v.WhichOneof.return_value = "string_value"
+        v.string_value = str(value)
+    kv = MagicMock()
+    kv.key = key
+    kv.value = v
+    return kv
+
+
+def _mk_span(attrs: dict, start_ns: int | None = None, span_id_hex: str = "ab") -> MagicMock:
+    span = MagicMock()
+    span.attributes = [_mk_attr(k, v) for k, v in attrs.items()]
+    span.start_time_unix_nano = start_ns or int(time.time() * 1_000_000_000)
+    span.span_id.hex.return_value = span_id_hex
+    return span
+
+
+def test_span_without_llm_model_is_ignored():
+    span = _mk_span({"http.method": "POST"})
+    assert span_to_llm_call(span, "plano(llm)") is None
+
+
+def test_span_with_full_llm_attrs_produces_call():
+    span = _mk_span(
+        {
+            "llm.model": "openai-gpt-5.4",
+            "model.requested": "router:software-engineering",
+            "plano.session_id": "sess-abc",
+            "plano.route.name": "software-engineering",
+            "llm.is_streaming": False,
+            "llm.duration_ms": 1234,
+            "llm.time_to_first_token": 210,
+            "llm.usage.prompt_tokens": 100,
+            "llm.usage.completion_tokens": 50,
+            "llm.usage.total_tokens": 150,
+            "llm.usage.cached_input_tokens": 30,
+            "llm.usage.cache_creation_tokens": 5,
+            "llm.usage.reasoning_tokens": 200,
+            "http.status_code": 200,
+            "request_id": "req-42",
+        }
+    )
+    call = span_to_llm_call(span, "plano(llm)")
+    assert call is not None
+    assert call.request_id == "req-42"
+    assert call.model == "openai-gpt-5.4"
+    assert call.request_model == "router:software-engineering"
+    assert call.session_id == "sess-abc"
+    assert call.route_name == "software-engineering"
+    assert call.is_streaming is False
+    assert call.duration_ms == 1234.0
+    assert call.ttft_ms == 210.0
+    assert call.prompt_tokens == 100
+    assert call.completion_tokens == 50
+    assert call.total_tokens == 150
+    assert call.cached_input_tokens == 30
+    assert call.cache_creation_tokens == 5
+    assert call.reasoning_tokens == 200
+    assert call.status_code == 200
+
+
+def test_pricing_lookup_attaches_cost():
+    class StubPricing:
+        def cost_for_call(self, call):
+            # Simple: 2 * prompt + 3 * completion, in cents
+            return 0.02 * (call.prompt_tokens or 0) + 0.03 * (call.completion_tokens or 0)
+
+    span = _mk_span(
+        {
+            "llm.model": "do/openai-gpt-5.4",
+            "llm.usage.prompt_tokens": 10,
+            "llm.usage.completion_tokens": 2,
+        }
+    )
+    call = span_to_llm_call(span, "plano(llm)", pricing=StubPricing())
+    assert call is not None
+    assert call.cost_usd == pytest.approx(0.26)
+
+
+def test_tpt_and_tokens_per_sec_derived():
+    call = LLMCall(
+        request_id="x",
+        timestamp=datetime.now(tz=timezone.utc),
+        model="m",
+        duration_ms=1000,
+        ttft_ms=200,
+        completion_tokens=80,
+    )
+    # (1000 - 200) / 80 = 10ms per token => 100 tokens/sec
+    assert call.tpt_ms == 10.0
+    assert call.tokens_per_sec == 100.0
+
+
+def test_tpt_returns_none_when_no_completion_tokens():
+    call = LLMCall(
+        request_id="x",
+        timestamp=datetime.now(tz=timezone.utc),
+        model="m",
+        duration_ms=1000,
+        ttft_ms=200,
+        completion_tokens=0,
+    )
+    assert call.tpt_ms is None
+    assert call.tokens_per_sec is None
+
+
+def test_store_evicts_fifo_at_capacity():
+    store = LLMCallStore(capacity=3)
+    now = datetime.now(tz=timezone.utc)
+    for i in range(5):
+        store.add(
+            LLMCall(
+                request_id=f"r{i}",
+                timestamp=now,
+                model="m",
+            )
+        )
+    snap = store.snapshot()
+    assert len(snap) == 3
+    assert [c.request_id for c in snap] == ["r2", "r3", "r4"]
--- a/cli/test/test_obs_pricing.py
+++ b/cli/test/test_obs_pricing.py
@ -0,0 +1,103 @@
+from datetime import datetime, timezone
+
+from planoai.obs.collector import LLMCall
+from planoai.obs.pricing import ModelPrice, PricingCatalog
+
+
+def _call(model: str, prompt: int, completion: int, cached: int = 0) -> LLMCall:
+    return LLMCall(
+        request_id="r",
+        timestamp=datetime.now(tz=timezone.utc),
+        model=model,
+        prompt_tokens=prompt,
+        completion_tokens=completion,
+        cached_input_tokens=cached,
+    )
+
+
+def test_lookup_matches_bare_and_prefixed():
+    prices = {
+        "openai-gpt-5.4": ModelPrice(
+            input_per_token_usd=0.000001, output_per_token_usd=0.000002
+        )
+    }
+    catalog = PricingCatalog(prices)
+    assert catalog.price_for("openai-gpt-5.4") is not None
+    # do/openai-gpt-5.4 should resolve after stripping the provider prefix.
+    assert catalog.price_for("do/openai-gpt-5.4") is not None
+    assert catalog.price_for("unknown-model") is None
+
+
+def test_cost_computation_without_cache():
+    prices = {
+        "m": ModelPrice(input_per_token_usd=0.000001, output_per_token_usd=0.000002)
+    }
+    cost = PricingCatalog(prices).cost_for_call(_call("m", 1000, 500))
+    assert cost == 0.002  # 1000 * 1e-6 + 500 * 2e-6
+
+
+def test_cost_computation_with_cached_discount():
+    prices = {
+        "m": ModelPrice(
+            input_per_token_usd=0.000001,
+            output_per_token_usd=0.000002,
+            cached_input_per_token_usd=0.0000001,
+        )
+    }
+    # 800 fresh @ 1e-6 = 8e-4; 200 cached @ 1e-7 = 2e-5; 500 out @ 2e-6 = 1e-3
+    cost = PricingCatalog(prices).cost_for_call(_call("m", 1000, 500, cached=200))
+    assert cost == round(0.0008 + 0.00002 + 0.001, 6)
+
+
+def test_empty_catalog_returns_none():
+    assert PricingCatalog().cost_for_call(_call("m", 100, 50)) is None
+
+
+def test_parse_do_catalog_treats_small_values_as_per_token():
+    """DO's real catalog uses per-token values under the `_per_million` key
+    (e.g. 5E-8 for GPT-oss-20b). We treat values < 1 as already per-token."""
+    from planoai.obs.pricing import _parse_do_pricing
+
+    sample = {
+        "data": [
+            {
+                "model_id": "openai-gpt-oss-20b",
+                "pricing": {
+                    "input_price_per_million": 5e-8,
+                    "output_price_per_million": 4.5e-7,
+                },
+            },
+            {
+                "model_id": "openai-gpt-oss-120b",
+                "pricing": {
+                    "input_price_per_million": 1e-7,
+                    "output_price_per_million": 7e-7,
+                },
+            },
+        ]
+    }
+    prices = _parse_do_pricing(sample)
+    # Values < 1 are assumed to already be per-token — no extra division.
+    assert prices["openai-gpt-oss-20b"].input_per_token_usd == 5e-8
+    assert prices["openai-gpt-oss-20b"].output_per_token_usd == 4.5e-7
+    assert prices["openai-gpt-oss-120b"].input_per_token_usd == 1e-7
+
+
+def test_parse_do_catalog_divides_large_values_as_per_million():
+    """A provider that genuinely reports $5-per-million in that field gets divided."""
+    from planoai.obs.pricing import _parse_do_pricing
+
+    sample = {
+        "data": [
+            {
+                "model_id": "mystery-model",
+                "pricing": {
+                    "input_price_per_million": 5.0,  # > 1 → treated as per-million
+                    "output_price_per_million": 15.0,
+                },
+            },
+        ]
+    }
+    prices = _parse_do_pricing(sample)
+    assert prices["mystery-model"].input_per_token_usd == 5.0 / 1_000_000
+    assert prices["mystery-model"].output_per_token_usd == 15.0 / 1_000_000
--- a/cli/test/test_obs_render.py
+++ b/cli/test/test_obs_render.py
@ -0,0 +1,73 @@
+from datetime import datetime, timedelta, timezone
+
+from planoai.obs.collector import LLMCall
+from planoai.obs.render import aggregates, model_rollups, route_hits
+
+
+def _call(model: str, ts: datetime, prompt=0, completion=0, cost=None, route=None, session=None, cache_read=0, cache_write=0):
+    return LLMCall(
+        request_id="r",
+        timestamp=ts,
+        model=model,
+        prompt_tokens=prompt,
+        completion_tokens=completion,
+        cached_input_tokens=cache_read,
+        cache_creation_tokens=cache_write,
+        cost_usd=cost,
+        route_name=route,
+        session_id=session,
+    )
+
+
+def test_aggregates_sum_and_session_counts():
+    now = datetime.now(tz=timezone.utc).astimezone()
+    calls = [
+        _call("m1", now - timedelta(seconds=50), prompt=10, completion=5, cost=0.001, session="s1"),
+        _call("m2", now - timedelta(seconds=40), prompt=20, completion=10, cost=0.002, session="s1"),
+        _call("m1", now - timedelta(seconds=30), prompt=30, completion=15, cost=0.003, session="s2"),
+    ]
+    stats = aggregates(calls)
+    assert stats.count == 3
+    assert stats.total_cost_usd == 0.006
+    assert stats.total_input_tokens == 60
+    assert stats.total_output_tokens == 30
+    assert stats.distinct_sessions == 2
+    assert stats.current_session == "s2"
+
+
+def test_rollups_split_by_model_and_cache():
+    now = datetime.now(tz=timezone.utc).astimezone()
+    calls = [
+        _call("m1", now, prompt=10, completion=5, cost=0.001, cache_write=3, cache_read=7),
+        _call("m1", now, prompt=20, completion=10, cost=0.002, cache_read=1),
+        _call("m2", now, prompt=30, completion=15, cost=0.004),
+    ]
+    rollups = model_rollups(calls)
+    by_model = {r.model: r for r in rollups}
+    assert by_model["m1"].requests == 2
+    assert by_model["m1"].input_tokens == 30
+    assert by_model["m1"].cache_write == 3
+    assert by_model["m1"].cache_read == 8
+    assert by_model["m2"].input_tokens == 30
+
+
+def test_route_hits_only_for_routed_calls():
+    now = datetime.now(tz=timezone.utc).astimezone()
+    calls = [
+        _call("m", now, route="code"),
+        _call("m", now, route="code"),
+        _call("m", now, route="summarization"),
+        _call("m", now),  # no route
+    ]
+    hits = route_hits(calls)
+    # Only calls with route names are counted.
+    assert sum(n for _, n, _ in hits) == 3
+    hits_by_name = {name: (n, pct) for name, n, pct in hits}
+    assert hits_by_name["code"][0] == 2
+    assert hits_by_name["summarization"][0] == 1
+
+
+def test_route_hits_empty_when_no_routes():
+    now = datetime.now(tz=timezone.utc).astimezone()
+    calls = [_call("m", now), _call("m", now)]
+    assert route_hits(calls) == []