diff --git a/arch/envoy.template.yaml b/arch/envoy.template.yaml
index 52671f99..39735bc4 100644
--- a/arch/envoy.template.yaml
+++ b/arch/envoy.template.yaml
@@ -321,6 +321,23 @@ static_resources:
                       service_name: llm_gateway
                   random_sampling:
                     value: {{ arch_tracing.random_sampling }}
+                  custom_tags:
+                    - tag: user_prompt
+                      metadata:
+                        kind:
+                          request: {}
+                        metadata_key:
+                          key: llm_filter
+                          path:
+                            - key: user_prompt
+                    - tag: time_to_first_token
+                      metadata:
+                        kind:
+                          request: {}
+                        metadata_key:
+                          key: llm_filter
+                          path:
+                            - key: time_to_first_token
                 {% endif %}
                 stat_prefix: arch_listener_http
                 codec_type: AUTO
@@ -372,6 +389,28 @@ static_resources:
                           "@type": type.googleapis.com/envoy.extensions.compression.gzip.compressor.v3.Gzip
                           memory_level: 3
                           window_bits: 10
+                  - name: envoy.filters.http.lua
+                    typed_config:
+                      '@type': type.googleapis.com/envoy.extensions.filters.http.lua.v3.Lua
+                      default_source_code:
+                        inline_string: |
+                          -- Copies selected response headers into the "llm_filter" dynamic
+                          -- metadata namespace, then removes them from the response.
+                          function envoy_on_response(response_handle)
+                            local promoted = {
+                              { header = "x-user-message", key = "user_prompt", note = "setting x-user-message" },
+                              { header = "x-time-to-first-token", key = "time_to_first_token", note = "setting x-time-to-first-token" },
+                            }
+
+                            for _, item in ipairs(promoted) do
+                              local value = response_handle:headers():get(item.header)
+                              if value then
+                                response_handle:logInfo(item.note)
+                                response_handle:streamInfo():dynamicMetadata():set("llm_filter", item.key, value)
+                                response_handle:headers():remove(item.header)
+                              end
+                            end
+                          end
                   - name: envoy.filters.http.wasm
                     typed_config:
                       "@type": type.googleapis.com/udpa.type.v1.TypedStruct
diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs
index 7e35e7f2..cdfdbeb2 100644
--- a/crates/llm_gateway/src/stream_context.rs
+++ b/crates/llm_gateway/src/stream_context.rs
@@ -288,6 +288,31 @@ impl HttpContext
for StreamContext {
         Action::Continue
     }
 
+    /// Exposes the user prompt and the elapsed time since `start_time` as response headers.
+    /// A header is skipped, rather than panicking, when its value cannot be determined.
+    fn on_http_response_headers(&mut self, _num_headers: usize, _end_of_stream: bool) -> Action {
+        debug!(
+            "on_http_response_headers [S={}] end_stream={}",
+            self.context_id, _end_of_stream
+        );
+
+        if let Some(prompt) = self.user_message.as_ref().and_then(|m| m.content.as_ref()) {
+            debug!("setting user-message header: {}", prompt);
+            self.set_http_response_header("x-user-message", Some(prompt));
+        }
+
+        let ttft_ms = get_current_time()
+            .ok()
+            .zip(self.start_time)
+            .and_then(|(now, start)| now.duration_since(start).ok())
+            .map(|elapsed| elapsed.as_millis().to_string());
+        if let Some(ttft_ms) = ttft_ms {
+            self.set_http_response_header("x-time-to-first-token", Some(&ttft_ms));
+        }
+
+        Action::Continue
+    }
+
     fn on_http_response_body(&mut self, body_size: usize, end_of_stream: bool) -> Action {
         debug!(
             "on_http_response_body [S={}] bytes={} end_stream={}",
@@ -364,6 +389,7 @@ impl HttpContext for StreamContext {
                 }
             }
             llm_span.add_attribute("model".to_string(), self.llm_provider().name.to_string());
+
             llm_span.add_event(Event::new(
                 "time_to_first_token".to_string(),
                 self.ttft_time