feat: add trace listener process management and foreground mode

This commit is contained in:
Musa 2026-02-12 15:34:47 -08:00
parent 3c8e899de3
commit 6131589277
5 changed files with 2676 additions and 62 deletions

View file

@ -1,7 +1,10 @@
import json
import os
import re
import signal
import string
import subprocess
import sys
import threading
import time
from collections import OrderedDict
@ -26,6 +29,7 @@ from planoai.consts import PLANO_COLOR
DEFAULT_GRPC_PORT = 4317
MAX_TRACES = 50
MAX_SPANS_PER_TRACE = 500
TRACE_LISTENER_PID_FILE = os.path.expanduser("~/.plano_trace_listener.pid")
@dataclass
@ -46,6 +50,60 @@ class TraceSummary:
return dt.astimezone().strftime("%Y-%m-%d %H:%M:%S")
def _is_port_in_use(port: int) -> bool:
"""Check whether a local TCP listener is accepting connections on a port."""
import socket
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.settimeout(0.2)
return s.connect_ex(("127.0.0.1", port)) == 0
def _save_listener_pid(pid: int) -> None:
"""Save the listener process PID to a file."""
with open(TRACE_LISTENER_PID_FILE, "w") as f:
f.write(str(pid))
def _get_listener_pid() -> int | None:
"""Get the saved listener process PID, if it exists and is running."""
if not os.path.exists(TRACE_LISTENER_PID_FILE):
return None
try:
with open(TRACE_LISTENER_PID_FILE, "r") as f:
pid = int(f.read().strip())
# Check if process is still running
os.kill(pid, 0) # Doesn't actually kill, just checks if process exists
return pid
except (ValueError, ProcessLookupError, OSError):
# PID file is invalid or process doesn't exist
if os.path.exists(TRACE_LISTENER_PID_FILE):
os.remove(TRACE_LISTENER_PID_FILE)
return None
def _stop_background_listener() -> bool:
"""Stop the background listener process if it's running."""
pid = _get_listener_pid()
if pid is None:
return False
try:
os.kill(pid, signal.SIGTERM)
time.sleep(0.5) # Give it time to shut down gracefully
try:
os.kill(pid, 0) # Check if still running
os.kill(pid, signal.SIGKILL) # Force kill if still alive
except ProcessLookupError:
pass # Already dead
if os.path.exists(TRACE_LISTENER_PID_FILE):
os.remove(TRACE_LISTENER_PID_FILE)
return True
except ProcessLookupError:
if os.path.exists(TRACE_LISTENER_PID_FILE):
os.remove(TRACE_LISTENER_PID_FILE)
return False
def _parse_filter_patterns(filter_patterns: tuple[str, ...]) -> list[str]:
parts: list[str] = []
for raw in filter_patterns:
@ -428,6 +486,10 @@ class _OTLPTraceServicer(trace_service_pb2_grpc.TraceServiceServicer):
_console = Console(stderr=True)
# we have to stream events otherwise the payload is too slow to process
def __init__(self, stream_events: bool = True) -> None:
self._stream_events = stream_events
def Export(self, request, context): # noqa: N802
for resource_spans in request.resource_spans:
service_name = "unknown"
@ -458,14 +520,15 @@ class _OTLPTraceServicer(trace_service_pb2_grpc.TraceServiceServicer):
dur_s = dur_ns / 1_000_000_000
dur_str = f"{dur_s:.3f}".rstrip("0").rstrip(".")
dur_str = f"{dur_str}s"
self._console.print(
f"[dim]{span_start}[/dim], "
f"trace=[yellow]{short_id}[/yellow], "
f"span=[yellow]{short_span}[/yellow], "
f"[bold {_service_color(service_name)}]{service_name}[/bold {_service_color(service_name)}] "
f"[cyan]{span.name}[/cyan] "
f"[dim]({dur_str})[/dim]"
)
if self._stream_events:
self._console.print(
f"[dim]{span_start}[/dim], "
f"trace=[yellow]{short_id}[/yellow], "
f"span=[yellow]{short_span}[/yellow], "
f"[bold {_service_color(service_name)}]{service_name}[/bold {_service_color(service_name)}] "
f"[cyan]{span.name}[/cyan] "
f"[dim]({dur_str})[/dim]"
)
return trace_service_pb2.ExportTraceServiceResponse()
@ -488,7 +551,9 @@ class _TraceQueryHandler(grpc.GenericRpcHandler):
return json.dumps({"traces": traces}, separators=(",", ":")).encode("utf-8")
def _create_trace_server(host: str, grpc_port: int) -> grpc.Server:
def _create_trace_server(
host: str, grpc_port: int, stream_events: bool = False
) -> grpc.Server:
"""Create, bind, and start an OTLP/gRPC trace-collection server.
Returns the running ``grpc.Server``. The caller is responsible
@ -499,31 +564,113 @@ def _create_trace_server(host: str, grpc_port: int) -> grpc.Server:
handlers=[_TraceQueryHandler()],
)
trace_service_pb2_grpc.add_TraceServiceServicer_to_server(
_OTLPTraceServicer(), grpc_server
_OTLPTraceServicer(stream_events=stream_events), grpc_server
)
grpc_server.add_insecure_port(f"{host}:{grpc_port}")
grpc_server.start()
return grpc_server
def _start_trace_listener(host: str, grpc_port: int) -> None:
"""Start the OTLP/gRPC listener and block until interrupted."""
console = Console()
grpc_server = _create_trace_server(host, grpc_port)
def _start_trace_listener_foreground(
host: str, grpc_port: int, debug: bool = False
) -> None:
"""Start the OTLP/gRPC listener in foreground (blocking).
This is the actual server implementation that blocks until interrupted.
"""
console = Console()
grpc_server = _create_trace_server(host, grpc_port, stream_events=debug)
if debug:
console.print()
console.print(
f"[bold {PLANO_COLOR}]Trace listener started[/bold {PLANO_COLOR}]"
)
console.print(
f"[green]●[/green] gRPC (OTLP receiver) on [cyan]{host}:{grpc_port}[/cyan]"
)
console.print(
"[dim]Debug mode: streaming trace events. Press Ctrl+C to stop.[/dim]"
)
console.print()
# Save PID for background mode
_save_listener_pid(os.getpid())
console.print()
console.print(f"[bold {PLANO_COLOR}]Listening for traces...[/bold {PLANO_COLOR}]")
console.print(
f"[green]●[/green] gRPC (OTLP receiver) on [cyan]{host}:{grpc_port}[/cyan]"
)
console.print("[dim]Press Ctrl+C to stop.[/dim]")
console.print()
try:
grpc_server.wait_for_termination()
except KeyboardInterrupt:
pass
if debug:
console.print("\n[dim]Stopping trace listener...[/dim]")
finally:
grpc_server.stop(grace=2)
if os.path.exists(TRACE_LISTENER_PID_FILE):
os.remove(TRACE_LISTENER_PID_FILE)
def _start_trace_listener(host: str, grpc_port: int, debug: bool = False) -> None:
"""Start the OTLP/gRPC listener.
Args:
host: Host to bind to
grpc_port: Port to bind to
debug: If True, runs in foreground with event streaming. If False, daemonizes.
"""
console = Console()
# Check if port is already in use
if _is_port_in_use(grpc_port):
existing_pid = _get_listener_pid()
if existing_pid:
console.print(
f"[yellow]⚠[/yellow] Trace listener already running on port [cyan]{grpc_port}[/cyan] (PID: {existing_pid})"
)
else:
console.print(
f"[red]✗[/red] Port [cyan]{grpc_port}[/cyan] is already in use by another process"
)
console.print(f"\n[dim]Check what's using the port:[/dim]")
console.print(f" [cyan]lsof -i :{grpc_port}[/cyan]")
return
if debug:
# Run in foreground with debug output
_start_trace_listener_foreground(host, grpc_port, debug=True)
else:
# Fork to background
pid = os.fork()
if pid > 0:
# Parent process
time.sleep(0.5) # Give child time to start
if _is_port_in_use(grpc_port):
console.print()
console.print(
f"[bold {PLANO_COLOR}]Trace listener started[/bold {PLANO_COLOR}]"
)
console.print(
f"[green]●[/green] gRPC (OTLP receiver) on [cyan]{host}:{grpc_port}[/cyan]"
)
console.print(f"[dim]Process ID: {pid}[/dim]")
console.print(
"[dim]Use [cyan]planoai trace[/cyan] to view collected traces.[/dim]"
)
console.print(
"[dim]Use [cyan]planoai trace listen --debug[/cyan] to see live events.[/dim]"
)
console.print()
else:
console.print(f"[red]✗[/red] Failed to start trace listener")
return
else:
# Child process - become daemon
os.setsid() # Create new session
devnull = os.open(os.devnull, os.O_RDWR)
os.dup2(devnull, 0)
os.dup2(devnull, 1)
os.dup2(devnull, 2)
if devnull > 2:
os.close(devnull)
_start_trace_listener_foreground(host, grpc_port, debug=False)
def start_trace_listener_background(
@ -534,7 +681,7 @@ def start_trace_listener_background(
Returns the running ``grpc.Server`` so the caller can call
``server.stop()`` later.
"""
return _create_trace_server(host, grpc_port)
return _create_trace_server(host, grpc_port, stream_events=False)
def _span_time_ns(span: dict[str, Any], key: str) -> int:
@ -562,13 +709,13 @@ def _trace_summary(trace: dict[str, Any]) -> TraceSummary:
def _service_color(service: str) -> str:
service = service.lower()
if "inbound" in service:
return "white"
return "#4860fa"
if "outbound" in service:
return "white"
return "#57d9a9"
if "orchestrator" in service:
return PLANO_COLOR
if "routing" in service:
return "magenta"
return "#e3a2fa"
if "agent" in service:
return "cyan"
if "llm" in service:
@ -576,6 +723,79 @@ def _service_color(service: str) -> str:
return "white"
def _error_symbol(status_code: str) -> str:
code = int(status_code) if status_code.isdigit() else 0
if code >= 500:
return "💥" # Server error - something broke
elif code == 429:
return "🚦" # Rate limited
elif code == 404:
return "🔍" # Not found
elif code == 403:
return "🚫" # Forbidden
elif code == 401:
return "🔐" # Unauthorized
elif code >= 400:
return "⚠️" # Client error
else:
return "" # Generic error
def _error_description(status_code: str) -> str:
"""Return a developer-friendly description of the error."""
code = int(status_code) if status_code.isdigit() else 0
if code == 500:
return "Internal Server Error"
elif code == 502:
return "Bad Gateway"
elif code == 503:
return "Service Unavailable"
elif code == 504:
return "Gateway Timeout"
elif code >= 500:
return "Server Error"
elif code == 429:
return "Rate Limit Exceeded"
elif code == 404:
return "Not Found"
elif code == 403:
return "Forbidden - Access Denied"
elif code == 401:
return "Unauthorized - Auth Required"
elif code == 400:
return "Bad Request"
elif code >= 400:
return "Client Error"
else:
return "Error"
def _detect_error(span: dict[str, Any]) -> tuple[bool, str, str]:
"""Detect if span has an error and return (has_error, status_code, error_msg).
Returns:
tuple: (has_error, status_code, error_description)
"""
attrs = _attrs(span)
status_code = attrs.get("http.status_code", "")
# Check for non-2xx status codes
if status_code and status_code.isdigit():
code = int(status_code)
if code >= 400:
return True, status_code, _error_description(status_code)
# Check for explicit error attributes
if "error.message" in attrs:
return True, status_code or "unknown", attrs["error.message"]
if "exception.message" in attrs:
return True, status_code or "unknown", attrs["exception.message"]
return False, "", ""
# Attributes to show for inbound/outbound spans when not verbose (trimmed view).
_INBOUND_OUTBOUND_ATTR_KEYS = (
"http.method",
@ -599,10 +819,20 @@ def _trim_attrs_for_display(
def _sorted_attr_items(attrs: dict[str, str]) -> list[tuple[str, str]]:
# Error attributes always come first
error_priority = [
"http.status_code",
"error.type",
"error.message",
"error.stack",
"exception.type",
"exception.message",
]
# Then regular priority attributes
priority = [
"http.method",
"http.target",
"http.status_code",
"guid:x-request-id",
"request_size",
"response_size",
@ -619,7 +849,10 @@ def _sorted_attr_items(attrs: dict[str, str]) -> list[tuple[str, str]]:
"llm.duration_ms",
"llm.response_bytes",
]
prioritized = [(k, attrs[k]) for k in priority if k in attrs]
# Combine error priority with regular priority
full_priority = error_priority + priority
prioritized = [(k, attrs[k]) for k in full_priority if k in attrs]
prioritized_keys = {k for k, _ in prioritized}
remaining = [(k, v) for k, v in attrs.items() if k not in prioritized_keys]
remaining.sort(key=lambda item: item[0])
@ -627,8 +860,14 @@ def _sorted_attr_items(attrs: dict[str, str]) -> list[tuple[str, str]]:
def _display_attr_value(key: str, value: str) -> str:
if key == "http.status_code" and value != "200":
return f"{value} ⚠️"
if key == "http.status_code":
if value.isdigit():
code = int(value)
if code >= 400:
return f"{value} {_error_symbol(value)}"
elif code >= 200 and code < 300:
return f"{value}"
return value
return value
@ -648,7 +887,7 @@ def _build_tree(trace: dict[str, Any], console: Console, verbose: bool = False)
)
spans.sort(key=lambda s: _span_time_ns(s, "startTimeUnixNano"))
tree = Tree("", guide_style="dim")
tree = Tree("", guide_style="dim #5b5a5c bold")
for span in spans:
service = span.get("service", "plano(unknown)")
@ -656,22 +895,52 @@ def _build_tree(trace: dict[str, Any], console: Console, verbose: bool = False)
offset_ms = max(
0, (_span_time_ns(span, "startTimeUnixNano") - start_ns) / 1_000_000
)
color = _service_color(service)
label = Text(f"{offset_ms:.0f}ms ", style="yellow")
label.append(service, style=f"bold {color}")
if name:
label.append(f" {name}", style="dim white")
# Check for errors in this span
has_error, error_code, error_desc = _detect_error(span)
if has_error:
# Create error banner above the span
error_banner = Text()
error_banner.append(error_desc, style="bright_red")
tree.add(error_banner)
# Style the span label in light red
label = Text(f"{offset_ms:.0f}ms ", style="#ff6b6b")
label.append(service, style="bold #ff6b6b")
if name:
label.append(f" {name}", style="#ff6b6b italic")
else:
# Normal styling
color = _service_color(service)
label = Text(f"{offset_ms:.0f}ms ", style="#949c99")
label.append(service, style=f"bold {color}")
if name:
label.append(f" {name}", style="dim white bold italic")
node = tree.add(label)
attrs = _trim_attrs_for_display(_attrs(span), service, verbose)
sorted_items = list(_sorted_attr_items(attrs))
for idx, (key, value) in enumerate(sorted_items):
attr_line = Text()
attr_line.append(f"{key}: ", style="white")
attr_line.append(
_display_attr_value(key, str(value)),
style=f"{PLANO_COLOR}",
)
# attribute key
attr_line.append(f"{key}: ", style="#a4a9aa")
# attribute value
if key == "http.status_code" and value.isdigit():
val_int = int(value)
val_style = "bold red" if val_int >= 400 else "green"
attr_line.append(_display_attr_value(key, str(value)), style=val_style)
elif key in [
"error.message",
"exception.message",
"error.type",
"exception.type",
]:
attr_line.append(_display_attr_value(key, str(value)), style="red")
else:
attr_line.append(
_display_attr_value(key, str(value)), style=f"{PLANO_COLOR} bold"
)
if idx == len(sorted_items) - 1:
attr_line.append("\n")
node.add(attr_line)
@ -882,7 +1151,7 @@ def _run_trace_show(
_build_tree(trace_obj, console, verbose=verbose)
@click.group(invoke_without_command=True)
@click.command()
@click.argument("target", required=False)
@click.option(
"--filter",
@ -908,6 +1177,19 @@ def _run_trace_show(
@click.option("--limit", type=int, default=None, help="Limit results.")
@click.option("--since", default=None, help="Look back window (e.g. 5m, 2h, 1d).")
@click.option("--json", "json_out", is_flag=True, help="Output raw JSON.")
@click.option("--host", default="0.0.0.0", show_default=True)
@click.option(
"--port",
type=int,
default=DEFAULT_GRPC_PORT,
show_default=True,
help="gRPC port for receiving OTLP traces when target is 'listen'.",
)
@click.option(
"--debug",
is_flag=True,
help="When used with target 'listen', keep terminal active and stream trace events.",
)
@click.option(
"--verbose",
"-v",
@ -925,12 +1207,14 @@ def trace(
limit,
since,
json_out,
host,
port,
debug,
verbose,
):
"""Trace requests from the local OTLP listener."""
if ctx.invoked_subcommand:
return
if target == "listen" and not any(
# Handle operational shortcuts when invoked as target values.
has_show_options = any(
[
filter_patterns,
where_filters,
@ -941,9 +1225,25 @@ def trace(
json_out,
verbose,
]
):
_start_trace_listener("0.0.0.0", DEFAULT_GRPC_PORT)
)
if target == "listen" and not has_show_options:
_start_trace_listener(host, port, debug=debug)
return
if debug and target != "listen":
raise click.ClickException("--debug is only valid with target 'listen'.")
if (host != "0.0.0.0" or port != DEFAULT_GRPC_PORT) and target != "listen":
raise click.ClickException("--host/--port are only valid with target 'listen'.")
if target in ("stop", "down") and not has_show_options:
console = Console()
if _stop_background_listener():
console.print(f"[green]✓[/green] Trace listener stopped")
else:
console.print(f"[dim]No background trace listener running[/dim]")
return
_run_trace_show(
target,
filter_patterns,
@ -955,17 +1255,3 @@ def trace(
json_out,
verbose,
)
@trace.command("listen")
@click.option("--host", default="0.0.0.0", show_default=True)
@click.option(
"--port",
type=int,
default=DEFAULT_GRPC_PORT,
show_default=True,
help="gRPC port for receiving OTLP traces.",
)
def trace_listen(host: str, port: int) -> None:
"""Listen for OTLP/gRPC traces."""
_start_trace_listener(host, port)

1133
cli/test/source/failure.json Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,803 @@
{
"traces": [
{
"trace_id": "86f21585168a31a23578d77096cc143b",
"spans": [
{
"traceId": "86f21585168a31a23578d77096cc143b",
"spanId": "1d6159b920daf4e9",
"parentSpanId": "c5d6cd3cfb32b551",
"name": "POST archfc.katanemo.dev/v1/chat/completions",
"startTimeUnixNano": "1770937700292451000",
"endTimeUnixNano": "1770937700552403000",
"service": "plano(outbound)",
"attributes": [
{
"key": "node_id",
"value": {
"stringValue": ""
}
},
{
"key": "zone",
"value": {
"stringValue": ""
}
},
{
"key": "guid:x-request-id",
"value": {
"stringValue": "0e1acd44-41ea-9681-9944-f2f1bec65faf"
}
},
{
"key": "http.url",
"value": {
"stringValue": "https://archfc.katanemo.dev/v1/chat/completions"
}
},
{
"key": "http.method",
"value": {
"stringValue": "POST"
}
},
{
"key": "downstream_cluster",
"value": {
"stringValue": "-"
}
},
{
"key": "user_agent",
"value": {
"stringValue": "-"
}
},
{
"key": "http.protocol",
"value": {
"stringValue": "HTTP/1.1"
}
},
{
"key": "peer.address",
"value": {
"stringValue": "127.0.0.1"
}
},
{
"key": "request_size",
"value": {
"stringValue": "3293"
}
},
{
"key": "response_size",
"value": {
"stringValue": "341"
}
},
{
"key": "component",
"value": {
"stringValue": "proxy"
}
},
{
"key": "upstream_cluster",
"value": {
"stringValue": "arch"
}
},
{
"key": "upstream_cluster.name",
"value": {
"stringValue": "arch"
}
},
{
"key": "http.status_code",
"value": {
"stringValue": "200"
}
},
{
"key": "response_flags",
"value": {
"stringValue": "-"
}
}
]
},
{
"traceId": "86f21585168a31a23578d77096cc143b",
"spanId": "4234f793a77a40c8",
"parentSpanId": "445f868c5c36294e",
"name": "routing",
"startTimeUnixNano": "1770937700576995630",
"endTimeUnixNano": "1770937700577104880",
"service": "plano(routing)",
"attributes": [
{
"key": "component",
"value": {
"stringValue": "routing"
}
},
{
"key": "http.method",
"value": {
"stringValue": "POST"
}
},
{
"key": "http.target",
"value": {
"stringValue": "/v1/chat/completions"
}
},
{
"key": "model.requested",
"value": {
"stringValue": "openai/gpt-4o-mini"
}
},
{
"key": "model.alias_resolved",
"value": {
"stringValue": "openai/gpt-4o-mini"
}
},
{
"key": "service.name.override",
"value": {
"stringValue": "plano(routing)"
}
},
{
"key": "routing.determination_ms",
"value": {
"intValue": "0"
}
},
{
"key": "route.selected_model",
"value": {
"stringValue": "none"
}
}
]
},
{
"traceId": "86f21585168a31a23578d77096cc143b",
"spanId": "445f868c5c36294e",
"parentSpanId": "8311d2245d859e71",
"name": "POST /v1/chat/completions openai/gpt-4o-mini",
"startTimeUnixNano": "1770937700576869630",
"endTimeUnixNano": "1770937701151370214",
"service": "plano(llm)",
"attributes": [
{
"key": "component",
"value": {
"stringValue": "llm"
}
},
{
"key": "request_id",
"value": {
"stringValue": "0e1acd44-41ea-9681-9944-f2f1bec65faf"
}
},
{
"key": "http.method",
"value": {
"stringValue": "POST"
}
},
{
"key": "http.path",
"value": {
"stringValue": "/v1/chat/completions"
}
},
{
"key": "service.name.override",
"value": {
"stringValue": "plano(llm)"
}
},
{
"key": "llm.temperature",
"value": {
"stringValue": "0.1"
}
},
{
"key": "llm.user_message_preview",
"value": {
"stringValue": "Whats the weather in Seattle?"
}
},
{
"key": "llm.model",
"value": {
"stringValue": "openai/gpt-4o-mini"
}
},
{
"key": "service.name.override",
"value": {
"stringValue": "plano(llm)"
}
},
{
"key": "llm.time_to_first_token",
"value": {
"intValue": "572"
}
},
{
"key": "signals.quality",
"value": {
"stringValue": "Good"
}
}
]
},
{
"traceId": "86f21585168a31a23578d77096cc143b",
"spanId": "da348b97890a6c9b",
"parentSpanId": "",
"name": "POST /v1/chat/completions",
"startTimeUnixNano": "1770937700183402000",
"endTimeUnixNano": "1770937704394122000",
"service": "plano(inbound)",
"attributes": [
{
"key": "node_id",
"value": {
"stringValue": ""
}
},
{
"key": "zone",
"value": {
"stringValue": ""
}
},
{
"key": "guid:x-request-id",
"value": {
"stringValue": "0e1acd44-41ea-9681-9944-f2f1bec65faf"
}
},
{
"key": "http.url",
"value": {
"stringValue": "https://localhost/v1/chat/completions"
}
},
{
"key": "http.method",
"value": {
"stringValue": "POST"
}
},
{
"key": "downstream_cluster",
"value": {
"stringValue": "-"
}
},
{
"key": "user_agent",
"value": {
"stringValue": "Python/3.11 aiohttp/3.13.2"
}
},
{
"key": "http.protocol",
"value": {
"stringValue": "HTTP/1.1"
}
},
{
"key": "peer.address",
"value": {
"stringValue": "172.18.0.1"
}
},
{
"key": "request_size",
"value": {
"stringValue": "125"
}
},
{
"key": "response_size",
"value": {
"stringValue": "34401"
}
},
{
"key": "component",
"value": {
"stringValue": "proxy"
}
},
{
"key": "upstream_cluster",
"value": {
"stringValue": "bright_staff"
}
},
{
"key": "upstream_cluster.name",
"value": {
"stringValue": "bright_staff"
}
},
{
"key": "http.status_code",
"value": {
"stringValue": "200"
}
},
{
"key": "response_flags",
"value": {
"stringValue": "-"
}
}
]
},
{
"traceId": "86f21585168a31a23578d77096cc143b",
"spanId": "79a116cf7d63602a",
"parentSpanId": "8b6345129425cf4a",
"name": "POST api.openai.com/v1/chat/completions",
"startTimeUnixNano": "1770937702607128000",
"endTimeUnixNano": "1770937704391625000",
"service": "plano(outbound)",
"attributes": [
{
"key": "node_id",
"value": {
"stringValue": ""
}
},
{
"key": "zone",
"value": {
"stringValue": ""
}
},
{
"key": "guid:x-request-id",
"value": {
"stringValue": "0e1acd44-41ea-9681-9944-f2f1bec65faf"
}
},
{
"key": "http.url",
"value": {
"stringValue": "https://api.openai.com/v1/chat/completions"
}
},
{
"key": "http.method",
"value": {
"stringValue": "POST"
}
},
{
"key": "downstream_cluster",
"value": {
"stringValue": "-"
}
},
{
"key": "user_agent",
"value": {
"stringValue": "AsyncOpenAI/Python 2.17.0"
}
},
{
"key": "http.protocol",
"value": {
"stringValue": "HTTP/1.1"
}
},
{
"key": "peer.address",
"value": {
"stringValue": "127.0.0.1"
}
},
{
"key": "request_size",
"value": {
"stringValue": "1927"
}
},
{
"key": "response_size",
"value": {
"stringValue": "20646"
}
},
{
"key": "component",
"value": {
"stringValue": "proxy"
}
},
{
"key": "upstream_cluster",
"value": {
"stringValue": "openai"
}
},
{
"key": "upstream_cluster.name",
"value": {
"stringValue": "openai"
}
},
{
"key": "http.status_code",
"value": {
"stringValue": "200"
}
},
{
"key": "response_flags",
"value": {
"stringValue": "-"
}
}
]
},
{
"traceId": "86f21585168a31a23578d77096cc143b",
"spanId": "60508ba7960d51bc",
"parentSpanId": "445f868c5c36294e",
"name": "POST api.openai.com/v1/chat/completions",
"startTimeUnixNano": "1770937700589205000",
"endTimeUnixNano": "1770937701149191000",
"service": "plano(outbound)",
"attributes": [
{
"key": "node_id",
"value": {
"stringValue": ""
}
},
{
"key": "zone",
"value": {
"stringValue": ""
}
},
{
"key": "guid:x-request-id",
"value": {
"stringValue": "0e1acd44-41ea-9681-9944-f2f1bec65faf"
}
},
{
"key": "http.url",
"value": {
"stringValue": "https://api.openai.com/v1/chat/completions"
}
},
{
"key": "http.method",
"value": {
"stringValue": "POST"
}
},
{
"key": "downstream_cluster",
"value": {
"stringValue": "-"
}
},
{
"key": "user_agent",
"value": {
"stringValue": "AsyncOpenAI/Python 2.17.0"
}
},
{
"key": "http.protocol",
"value": {
"stringValue": "HTTP/1.1"
}
},
{
"key": "peer.address",
"value": {
"stringValue": "127.0.0.1"
}
},
{
"key": "request_size",
"value": {
"stringValue": "930"
}
},
{
"key": "response_size",
"value": {
"stringValue": "346"
}
},
{
"key": "component",
"value": {
"stringValue": "proxy"
}
},
{
"key": "upstream_cluster",
"value": {
"stringValue": "openai"
}
},
{
"key": "upstream_cluster.name",
"value": {
"stringValue": "openai"
}
},
{
"key": "http.status_code",
"value": {
"stringValue": "200"
}
},
{
"key": "response_flags",
"value": {
"stringValue": "-"
}
}
]
},
{
"traceId": "86f21585168a31a23578d77096cc143b",
"spanId": "8311d2245d859e71",
"parentSpanId": "c5d6cd3cfb32b551",
"name": "weather_agent /v1/chat/completions",
"startTimeUnixNano": "1770937700553490130",
"endTimeUnixNano": "1770937704393946299",
"service": "plano(agent)",
"attributes": [
{
"key": "agent_id",
"value": {
"stringValue": "weather_agent"
}
},
{
"key": "message_count",
"value": {
"stringValue": "1"
}
},
{
"key": "service.name.override",
"value": {
"stringValue": "plano(agent)"
}
}
]
},
{
"traceId": "86f21585168a31a23578d77096cc143b",
"spanId": "9eb8a70a8c167f85",
"parentSpanId": "8b6345129425cf4a",
"name": "routing",
"startTimeUnixNano": "1770937702591610381",
"endTimeUnixNano": "1770937702592150423",
"service": "plano(routing)",
"attributes": [
{
"key": "component",
"value": {
"stringValue": "routing"
}
},
{
"key": "http.method",
"value": {
"stringValue": "POST"
}
},
{
"key": "http.target",
"value": {
"stringValue": "/v1/chat/completions"
}
},
{
"key": "model.requested",
"value": {
"stringValue": "openai/gpt-5.2"
}
},
{
"key": "model.alias_resolved",
"value": {
"stringValue": "openai/gpt-5.2"
}
},
{
"key": "service.name.override",
"value": {
"stringValue": "plano(routing)"
}
},
{
"key": "routing.determination_ms",
"value": {
"intValue": "0"
}
},
{
"key": "route.selected_model",
"value": {
"stringValue": "none"
}
}
]
},
{
"traceId": "86f21585168a31a23578d77096cc143b",
"spanId": "c5d6cd3cfb32b551",
"parentSpanId": "da348b97890a6c9b",
"name": "travel_booking_service",
"startTimeUnixNano": "1770937700188669630",
"endTimeUnixNano": "1770937704393949091",
"service": "plano(orchestrator)",
"attributes": [
{
"key": "component",
"value": {
"stringValue": "orchestrator"
}
},
{
"key": "request_id",
"value": {
"stringValue": "0e1acd44-41ea-9681-9944-f2f1bec65faf"
}
},
{
"key": "http.method",
"value": {
"stringValue": "POST"
}
},
{
"key": "http.path",
"value": {
"stringValue": "/agents/v1/chat/completions"
}
},
{
"key": "service.name.override",
"value": {
"stringValue": "plano(orchestrator)"
}
},
{
"key": "selection.listener",
"value": {
"stringValue": "travel_booking_service"
}
},
{
"key": "selection.agent_count",
"value": {
"intValue": "1"
}
},
{
"key": "selection.agents",
"value": {
"stringValue": "weather_agent"
}
},
{
"key": "selection.determination_ms",
"value": {
"stringValue": "264.48"
}
}
]
},
{
"traceId": "86f21585168a31a23578d77096cc143b",
"spanId": "8b6345129425cf4a",
"parentSpanId": "8311d2245d859e71",
"name": "POST /v1/chat/completions openai/gpt-5.2",
"startTimeUnixNano": "1770937702591499256",
"endTimeUnixNano": "1770937704393043174",
"service": "plano(llm)",
"attributes": [
{
"key": "component",
"value": {
"stringValue": "llm"
}
},
{
"key": "request_id",
"value": {
"stringValue": "0e1acd44-41ea-9681-9944-f2f1bec65faf"
}
},
{
"key": "http.method",
"value": {
"stringValue": "POST"
}
},
{
"key": "http.path",
"value": {
"stringValue": "/v1/chat/completions"
}
},
{
"key": "service.name.override",
"value": {
"stringValue": "plano(llm)"
}
},
{
"key": "llm.temperature",
"value": {
"stringValue": "0.7"
}
},
{
"key": "llm.user_message_preview",
"value": {
"stringValue": "Whats the weather in Seattle?\n\nWeather data for S..."
}
},
{
"key": "llm.model",
"value": {
"stringValue": "openai/gpt-5.2"
}
},
{
"key": "service.name.override",
"value": {
"stringValue": "plano(llm)"
}
},
{
"key": "llm.time_to_first_token",
"value": {
"intValue": "506"
}
},
{
"key": "signals.quality",
"value": {
"stringValue": "Good"
}
}
]
}
]
}
]
}

392
cli/test/test_trace_cmd.py Normal file
View file

@ -0,0 +1,392 @@
import copy
import json
from pathlib import Path
import pytest
from click.testing import CliRunner
from planoai.trace_cmd import trace
import planoai.trace_cmd as trace_cmd
def _load_success_traces() -> list[dict]:
source_path = Path(__file__).parent / "source" / "success.json"
payload = json.loads(source_path.read_text(encoding="utf-8"))
return payload["traces"]
def _load_failure_traces() -> list[dict]:
source_path = Path(__file__).parent / "source" / "failure.json"
payload = json.loads(source_path.read_text(encoding="utf-8"))
return payload["traces"]
def _build_trace_set() -> list[dict]:
traces = copy.deepcopy(_load_success_traces())
primary = traces[0]
secondary = copy.deepcopy(primary)
secondary["trace_id"] = "1234567890abcdef1234567890abcdef"
for span in secondary.get("spans", []):
span["traceId"] = secondary["trace_id"]
if span.get("startTimeUnixNano", "").isdigit():
span["startTimeUnixNano"] = str(
int(span["startTimeUnixNano"]) - 1_000_000_000
)
if span.get("endTimeUnixNano", "").isdigit():
span["endTimeUnixNano"] = str(int(span["endTimeUnixNano"]) - 1_000_000_000)
return [primary, secondary]
def _json_from_output(output: str) -> dict:
start = output.find("{")
if start == -1:
raise AssertionError(f"No JSON object found in output:\n{output}")
return json.loads(output[start:])
@pytest.fixture
def runner() -> CliRunner:
return CliRunner()
@pytest.fixture
def traces() -> list[dict]:
return _build_trace_set()
@pytest.fixture
def failure_traces() -> list[dict]:
return copy.deepcopy(_load_failure_traces())
def test_trace_listen_starts_listener_with_debug_and_custom_bind_after_target(
runner, monkeypatch
):
seen = {}
def fake_start(host: str, port: int, debug: bool) -> None:
seen["host"] = host
seen["port"] = port
seen["debug"] = debug
monkeypatch.setattr(trace_cmd, "_start_trace_listener", fake_start)
result = runner.invoke(
trace, ["listen", "--host", "127.0.0.1", "--port", "9876", "--debug"]
)
assert result.exit_code == 0, result.output
assert seen == {"host": "127.0.0.1", "port": 9876, "debug": True}
def test_trace_listen_starts_listener_with_debug_before_target(runner, monkeypatch):
seen = {}
def fake_start(host: str, port: int, debug: bool) -> None:
seen["host"] = host
seen["port"] = port
seen["debug"] = debug
monkeypatch.setattr(trace_cmd, "_start_trace_listener", fake_start)
result = runner.invoke(trace, ["--debug", "listen"])
assert result.exit_code == 0, result.output
assert seen == {"host": "0.0.0.0", "port": 4317, "debug": True}
def test_trace_down_prints_success_when_listener_stopped(runner, monkeypatch):
monkeypatch.setattr(trace_cmd, "_stop_background_listener", lambda: True)
result = runner.invoke(trace, ["down"])
assert result.exit_code == 0, result.output
assert "Trace listener stopped" in result.output
def test_trace_down_prints_no_listener_when_not_running(runner, monkeypatch):
monkeypatch.setattr(trace_cmd, "_stop_background_listener", lambda: False)
result = runner.invoke(trace, ["down"])
assert result.exit_code == 0, result.output
assert "No background trace listener running" in result.output
def test_trace_debug_requires_listen_target(runner):
result = runner.invoke(trace, ["--debug", "any"])
assert result.exit_code != 0
assert "--debug is only valid with target 'listen'." in result.output
def test_trace_host_port_requires_listen_target(runner):
result = runner.invoke(trace, ["--host", "127.0.0.1", "any"])
assert result.exit_code != 0
assert "--host/--port are only valid with target 'listen'." in result.output
def test_trace_default_target_uses_last_and_builds_first_trace(
runner, monkeypatch, traces
):
monkeypatch.setattr(trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(traces))
seen = {}
def fake_build_tree(trace_obj, _console, verbose=False):
seen["trace_id"] = trace_obj["trace_id"]
seen["verbose"] = verbose
monkeypatch.setattr(trace_cmd, "_build_tree", fake_build_tree)
result = runner.invoke(trace, [])
assert result.exit_code == 0, result.output
assert seen["trace_id"] == traces[0]["trace_id"]
assert seen["verbose"] is False
def test_trace_list_any_prints_short_trace_ids(runner, monkeypatch, traces):
monkeypatch.setattr(trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(traces))
result = runner.invoke(trace, ["--list", "any"])
assert result.exit_code == 0, result.output
assert "Trace IDs:" in result.output
assert traces[0]["trace_id"][:8] in result.output
assert traces[1]["trace_id"][:8] in result.output
def test_trace_list_target_conflict_errors(runner, traces, monkeypatch):
monkeypatch.setattr(trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(traces))
result = runner.invoke(trace, ["--list", traces[0]["trace_id"]])
assert result.exit_code != 0
assert "Target and --list cannot be used together." in result.output
def test_trace_json_list_with_limit_outputs_trace_ids(runner, monkeypatch, traces):
monkeypatch.setattr(trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(traces))
result = runner.invoke(trace, ["--list", "any", "--json", "--limit", "1"])
assert result.exit_code == 0, result.output
payload = _json_from_output(result.output)
assert payload == {"trace_ids": [traces[0]["trace_id"]]}
def test_trace_json_for_short_target_returns_one_trace(runner, monkeypatch, traces):
monkeypatch.setattr(trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(traces))
short_target = traces[0]["trace_id"][:8]
result = runner.invoke(trace, [short_target, "--json"])
assert result.exit_code == 0, result.output
payload = _json_from_output(result.output)
assert len(payload["traces"]) == 1
assert payload["traces"][0]["trace_id"] == traces[0]["trace_id"]
@pytest.mark.parametrize(
("target", "message"),
[
("abc", "Trace ID must be 8 or 32 hex characters."),
("00000000", "Short trace ID must be 8 hex characters."),
("0" * 32, "Trace ID must be 32 hex characters."),
],
)
def test_trace_target_validation_errors(runner, target, message):
result = runner.invoke(trace, [target])
assert result.exit_code != 0
assert message in result.output
def test_trace_where_invalid_format_errors(runner):
result = runner.invoke(trace, ["any", "--where", "bad-format"])
assert result.exit_code != 0
assert "Invalid --where filter(s): bad-format. Use key=value." in result.output
def test_trace_where_unknown_key_errors(runner, monkeypatch, traces):
monkeypatch.setattr(trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(traces))
result = runner.invoke(trace, ["any", "--where", "not.a.real.key=value"])
assert result.exit_code != 0
assert "Unknown --where key(s): not.a.real.key" in result.output
def test_trace_where_filters_to_matching_trace(runner, monkeypatch, traces):
monkeypatch.setattr(trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(traces))
result = runner.invoke(
trace, ["any", "--where", "agent_id=weather_agent", "--json"]
)
assert result.exit_code == 0, result.output
payload = _json_from_output(result.output)
assert [trace_item["trace_id"] for trace_item in payload["traces"]] == [
traces[0]["trace_id"],
traces[1]["trace_id"],
]
def test_trace_where_and_filters_can_exclude_all(runner, monkeypatch, traces):
monkeypatch.setattr(trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(traces))
result = runner.invoke(
trace,
[
"any",
"--where",
"agent_id=weather_agent",
"--where",
"http.status_code=500",
"--json",
],
)
assert result.exit_code == 0, result.output
payload = _json_from_output(result.output)
assert payload == {"traces": []}
def test_trace_filter_restricts_attributes_by_pattern(runner, monkeypatch, traces):
monkeypatch.setattr(trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(traces))
result = runner.invoke(trace, ["any", "--filter", "http.*", "--json"])
assert result.exit_code == 0, result.output
payload = _json_from_output(result.output)
for trace_item in payload["traces"]:
for span in trace_item["spans"]:
for attr in span.get("attributes", []):
assert attr["key"].startswith("http.")
def test_trace_filter_unmatched_warns_and_returns_unfiltered(
runner, monkeypatch, traces
):
monkeypatch.setattr(trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(traces))
result = runner.invoke(trace, ["any", "--filter", "not-found-*", "--json"])
assert result.exit_code == 0, result.output
assert (
"Filter key(s) not found: not-found-*. Returning unfiltered traces."
in result.output
)
payload = _json_from_output(result.output)
assert len(payload["traces"]) == len(traces)
def test_trace_since_can_filter_out_old_traces(runner, monkeypatch, traces):
monkeypatch.setattr(trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(traces))
monkeypatch.setattr(trace_cmd.time, "time", lambda: 1_999_999_999.0)
result = runner.invoke(trace, ["any", "--since", "1m", "--json"])
assert result.exit_code == 0, result.output
payload = _json_from_output(result.output)
assert payload == {"traces": []}
def test_trace_negative_limit_errors(runner):
result = runner.invoke(trace, ["any", "--limit", "-1"])
assert result.exit_code != 0
assert "Limit must be greater than or equal to 0." in result.output
def test_trace_empty_data_prints_no_traces_found(runner, monkeypatch):
monkeypatch.setattr(trace_cmd, "_fetch_traces_raw", lambda: [])
result = runner.invoke(trace, [])
assert result.exit_code == 0, result.output
assert "No traces found." in result.output
def test_trace_invalid_filter_token_errors(runner):
result = runner.invoke(trace, ["any", "--filter", "http.method,"])
assert result.exit_code != 0
assert "Filter contains empty tokens." in result.output
def test_trace_failure_json_any_contains_all_fixture_trace_ids(
runner, monkeypatch, failure_traces
):
monkeypatch.setattr(
trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(failure_traces)
)
result = runner.invoke(trace, ["any", "--json"])
assert result.exit_code == 0, result.output
payload = _json_from_output(result.output)
assert [item["trace_id"] for item in payload["traces"]] == [
"f7a31829c4b5d6e8a9f0b1c2d3e4f5a6",
"a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6",
"b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7",
]
@pytest.mark.parametrize(
("status_code", "expected_trace_ids"),
[
("503", ["f7a31829c4b5d6e8a9f0b1c2d3e4f5a6"]),
("429", ["a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6"]),
("500", ["b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7"]),
],
)
def test_trace_failure_where_status_filters_expected_traces(
runner, monkeypatch, failure_traces, status_code, expected_trace_ids
):
monkeypatch.setattr(
trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(failure_traces)
)
result = runner.invoke(
trace, ["any", "--where", f"http.status_code={status_code}", "--json"]
)
assert result.exit_code == 0, result.output
payload = _json_from_output(result.output)
assert [item["trace_id"] for item in payload["traces"]] == expected_trace_ids
def test_trace_failure_default_render_shows_service_unavailable_banner(
runner, monkeypatch, failure_traces
):
monkeypatch.setattr(
trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(failure_traces)
)
result = runner.invoke(trace, [])
assert result.exit_code == 0, result.output
assert "Service Unavailable" in result.output
assert "503" in result.output
def test_trace_failure_filter_keeps_http_status_code_attributes(
runner, monkeypatch, failure_traces
):
monkeypatch.setattr(
trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(failure_traces)
)
result = runner.invoke(trace, ["any", "--filter", "http.status_code", "--json"])
assert result.exit_code == 0, result.output
payload = _json_from_output(result.output)
assert payload["traces"], "Expected traces in failure fixture"
for trace_item in payload["traces"]:
for span in trace_item["spans"]:
keys = [attr["key"] for attr in span.get("attributes", [])]
assert set(keys).issubset({"http.status_code"})

2
cli/uv.lock generated
View file

@ -337,7 +337,7 @@ wheels = [
[[package]]
name = "planoai"
version = "0.4.4"
version = "0.4.6"
source = { editable = "." }
dependencies = [
{ name = "click" },