Improve end to end tracing (#628)

* adding canonical tracing support via bright-staff * improved formatting for tools in the traces * removing anthropic from the currency exchange demo * using Envoy to transport traces, not calling OTEL directly * moving otel collector cluster outside tracing if/else * minor fixes to not write to the OTEL collector if tracing is disabled * fixed PR comments and added more trace attributes * more fixes based on PR comments * more clean up based on PR comments --------- Co-authored-by: Salman Paracha <salmanparacha@MacBook-Pro-342.local>
2026-05-08 15:22:43 +02:00 · 2025-12-11 15:21:57 -08:00 · 2025-12-11 15:21:57 -08:00 · a79f55f313
commit a79f55f313
parent 8adb9795d8
34 changed files with 2556 additions and 403 deletions
--- a/crates/brightstaff/src/tracing/constants.rs
+++ b/crates/brightstaff/src/tracing/constants.rs
@ -0,0 +1,319 @@
+//! OpenTelemetry Semantic Conventions
+//!
+//! This module defines standard attribute keys following OTEL semantic conventions.
+//! See: <https://opentelemetry.io/docs/specs/semconv/>
+
+// =============================================================================
+// Span Attributes - HTTP
+// =============================================================================
+
+/// HTTP span attribute keys (`http.*`). NOTE(review): newer OTEL semconv renames several — confirm.
+pub mod http {
+    /// HTTP request method
+    /// Example: "GET", "POST", "PUT"
+    pub const METHOD: &str = "http.method";
+
+    /// HTTP response status code
+    /// Example: "200", "404", "500"
+    pub const STATUS_CODE: &str = "http.status_code";
+
+    /// Full HTTP request URL
+    pub const URL: &str = "http.url";
+
+    /// HTTP request target (path + query)
+    /// Example: "/v1/chat/completions?stream=true"
+    pub const TARGET: &str = "http.target";
+
+    /// Request path sent upstream after the routing transformation has been applied
+    /// Example: "/api/paas/v4/chat/completions" (for Zhipu provider)
+    pub const UPSTREAM_TARGET: &str = "http.upstream_target";
+
+    /// HTTP request scheme
+    /// Example: "http", "https"
+    pub const SCHEME: &str = "http.scheme";
+
+    /// Value of the HTTP User-Agent header
+    pub const USER_AGENT: &str = "http.user_agent";
+
+    /// Size of the request payload body in bytes
+    pub const REQUEST_CONTENT_LENGTH: &str = "http.request_content_length";
+
+    /// Size of the response payload body in bytes
+    pub const RESPONSE_CONTENT_LENGTH: &str = "http.response_content_length";
+}
+
+// =============================================================================
+// Span Attributes - LLM Specific
+// =============================================================================
+
+/// Custom span attribute keys for LLM operations; every key uses the `llm.*` prefix
+/// NOTE(review): OTEL GenAI semconv uses `gen_ai.*` names, so these are not those keys — confirm
+pub mod llm {
+    /// Name of the LLM model being called
+    /// Example: "gpt-4", "claude-3-sonnet", "llama-2-70b"
+    pub const MODEL_NAME: &str = "llm.model";
+
+    /// Provider of the LLM
+    /// Example: "openai", "anthropic", "azure-openai"
+    pub const PROVIDER: &str = "llm.provider";
+
+    /// Type of LLM operation
+    /// Example: "chat", "completion", "embedding"
+    pub const OPERATION_TYPE: &str = "llm.operation_type";
+
+    /// Whether the request is streaming
+    pub const IS_STREAMING: &str = "llm.is_streaming";
+
+    /// Total bytes received in the response
+    pub const RESPONSE_BYTES: &str = "llm.response_bytes";
+
+    /// Duration of the LLM call in milliseconds
+    pub const DURATION_MS: &str = "llm.duration_ms";
+
+    /// Time to first token in milliseconds (streaming only); note the key has no `_ms` suffix
+    pub const TIME_TO_FIRST_TOKEN_MS: &str = "llm.time_to_first_token";
+
+    /// Number of prompt tokens used
+    pub const PROMPT_TOKENS: &str = "llm.usage.prompt_tokens";
+
+    /// Number of completion tokens generated
+    pub const COMPLETION_TOKENS: &str = "llm.usage.completion_tokens";
+
+    /// Total tokens used (prompt + completion)
+    pub const TOTAL_TOKENS: &str = "llm.usage.total_tokens";
+
+    /// Temperature parameter used
+    pub const TEMPERATURE: &str = "llm.temperature";
+
+    /// Max tokens parameter used
+    pub const MAX_TOKENS: &str = "llm.max_tokens";
+
+    /// Top-p parameter used
+    pub const TOP_P: &str = "llm.top_p";
+
+    /// List of tool names provided in the request
+    pub const TOOLS: &str = "llm.tools";
+
+    /// Preview of the user message (truncated)
+    pub const USER_MESSAGE_PREVIEW: &str = "llm.user_message_preview";
+}
+
+// =============================================================================
+// Span Attributes - Routing & Gateway
+// =============================================================================
+
+/// Span attribute keys (`routing.*` prefix) for LLM routing and gateway operations
+pub mod routing {
+    /// Strategy used to select the LLM endpoint
+    /// Example: "round-robin", "least-latency", "cost-optimized"
+    pub const STRATEGY: &str = "routing.strategy";
+
+    /// Selected upstream endpoint
+    pub const UPSTREAM_ENDPOINT: &str = "routing.upstream_endpoint";
+
+    /// Time taken to determine the route, in milliseconds
+    pub const ROUTE_DETERMINATION_MS: &str = "routing.determination_ms";
+
+    /// Whether a fallback endpoint was used
+    pub const IS_FALLBACK: &str = "routing.is_fallback";
+
+    /// Reason for route selection
+    pub const SELECTION_REASON: &str = "routing.selection_reason";
+}
+
+// =============================================================================
+// Span Attributes - Error Handling
+// =============================================================================
+
+/// Span attribute keys for error tracking: the bare `error` flag plus `error.*` detail keys
+pub mod error {
+    /// Whether an error occurred
+    pub const ERROR: &str = "error";
+
+    /// Type/class of the error
+    /// Example: "TimeoutError", "AuthenticationError"
+    pub const TYPE: &str = "error.type";
+
+    /// Human-readable error message
+    pub const MESSAGE: &str = "error.message";
+
+    /// Stack trace of the error
+    pub const STACK_TRACE: &str = "error.stack_trace";
+}
+
+// =============================================================================
+// Operation Names
+// =============================================================================
+
+/// Canonical operation component labels, each formatted as `plano(<component>)`
+pub mod operation_component {
+    /// Inbound request handling
+    pub const INBOUND: &str = "plano(inbound)";
+
+    /// Routing decision phase
+    pub const ROUTING: &str = "plano(routing)";
+
+    /// Handoff to upstream service
+    pub const HANDOFF: &str = "plano(handoff)";
+
+    /// Agent filter execution
+    pub const AGENT_FILTER: &str = "plano(agent filter)";
+
+    /// Agent execution
+    pub const AGENT: &str = "plano(agent)";
+
+    /// LLM call
+    pub const LLM: &str = "plano(llm)";
+}
+
+/// Builder for constructing standardized operation names
+///
+/// Format: `{method} {path} {target}`; unset or empty components are skipped.
+///
+/// The operation component (e.g., "plano(llm)") is now part of the service name,
+/// so the operation name focuses on the HTTP request details and target.
+///
+/// Every setter is optional and may be called in any order; `build` always emits
+/// the components as method, path, target.
+///
+/// # Examples
+/// ```
+/// use brightstaff::tracing::OperationNameBuilder;
+///
+/// // LLM call operation: "POST /v1/chat/completions gpt-4"
+/// // (service name will be "plano(llm)")
+/// let op = OperationNameBuilder::new()
+///     .with_method("POST")
+///     .with_path("/v1/chat/completions")
+///     .with_target("gpt-4")
+///     .build();
+/// assert_eq!(op, "POST /v1/chat/completions gpt-4");
+///
+/// // Agent filter operation: "POST /agents/v1/chat/completions hallucination-detector"
+/// // (service name will be "plano(agent filter)")
+/// let op = OperationNameBuilder::new()
+///     .with_method("POST")
+///     .with_path("/agents/v1/chat/completions")
+///     .with_target("hallucination-detector")
+///     .build();
+/// assert_eq!(op, "POST /agents/v1/chat/completions hallucination-detector");
+///
+/// // Routing operation: "POST /v1/chat/completions"
+/// // (service name will be "plano(routing)")
+/// let op = OperationNameBuilder::new()
+///     .with_method("POST")
+///     .with_path("/v1/chat/completions")
+///     .build();
+/// assert_eq!(op, "POST /v1/chat/completions");
+/// ```
+#[derive(Debug, Clone, Default, PartialEq, Eq)]
+pub struct OperationNameBuilder {
+    method: Option<String>,
+    path: Option<String>,
+    target: Option<String>,
+}
+
+impl OperationNameBuilder {
+    /// Create an empty builder with no method, path, or target set
+    /// (identical to `OperationNameBuilder::default()`).
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set the HTTP method
+    ///
+    /// # Arguments
+    /// * `method` - HTTP method (e.g., "GET", "POST", "PUT")
+    pub fn with_method(mut self, method: impl Into<String>) -> Self {
+        self.method = Some(method.into());
+        self
+    }
+
+    /// Set the request path
+    ///
+    /// # Arguments
+    /// * `path` - Request path (e.g., "/v1/chat/completions", "/agents/v1/chat/completions")
+    pub fn with_path(mut self, path: impl Into<String>) -> Self {
+        self.path = Some(path.into());
+        self
+    }
+
+    /// Set the target (model name, agent name, or filter name)
+    ///
+    /// # Arguments
+    /// * `target` - Target identifier (e.g., "gpt-4", "my-agent", "hallucination-detector")
+    pub fn with_target(mut self, target: impl Into<String>) -> Self {
+        self.target = Some(target.into());
+        self
+    }
+
+    /// Build the operation name string
+    ///
+    /// Components are emitted in the fixed order method, path, target and joined
+    /// with a single space. A component that was never set, or that was set to an
+    /// empty string, is skipped, so an empty component cannot introduce a leading,
+    /// trailing, or doubled space.
+    ///
+    /// # Format
+    /// - With all components: `{method} {path} {target}`
+    /// - Without target: `{method} {path}`
+    /// - Without path: `{method}`
+    /// - Empty: returns empty string
+    ///
+    /// # Returns
+    /// The assembled operation name; the builder is consumed.
+    pub fn build(self) -> String {
+        // An `Option<String>` iterates over zero or one item, so chaining the three
+        // fields yields exactly the components that were set, in declaration order.
+        self.method
+            .into_iter()
+            .chain(self.path)
+            .chain(self.target)
+            .filter(|part| !part.is_empty())
+            .collect::<Vec<_>>()
+            .join(" ")
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::OperationNameBuilder;
+
+    #[test]
+    fn test_operation_name_full() {
+        let expected = "POST /v1/chat/completions gpt-4";
+        let builder = OperationNameBuilder::new()
+            .with_method("POST")
+            .with_path("/v1/chat/completions")
+            .with_target("gpt-4");
+
+        assert_eq!(builder.build(), expected);
+    }
+
+    #[test]
+    fn test_operation_name_no_target() {
+        let expected = "POST /v1/chat/completions";
+        let builder = OperationNameBuilder::new()
+            .with_method("POST")
+            .with_path("/v1/chat/completions");
+
+        assert_eq!(builder.build(), expected);
+    }
+
+    #[test]
+    fn test_operation_name_agent_filter() {
+        let expected = "POST /agents/v1/chat/completions content-filter";
+        let builder = OperationNameBuilder::new()
+            .with_method("POST")
+            .with_path("/agents/v1/chat/completions")
+            .with_target("content-filter");
+
+        assert_eq!(builder.build(), expected);
+    }
+
+    #[test]
+    fn test_operation_name_minimal() {
+        // With nothing configured there are no components to join.
+        assert_eq!(OperationNameBuilder::new().build(), "");
+    }
+}
--- a/crates/brightstaff/src/tracing/mod.rs
+++ b/crates/brightstaff/src/tracing/mod.rs
@ -0,0 +1,3 @@
+mod constants;
+
+pub use constants::{OperationNameBuilder, operation_component, http, llm, error, routing};