2024-12-09 10:46:46 -08:00
use crate ::metrics ::Metrics ;
2025-03-19 15:21:34 -07:00
use common ::configuration ::{ LlmProvider , LlmProviderType , Overrides } ;
2024-10-17 10:16:40 -07:00
use common ::consts ::{
2025-03-03 13:11:57 -08:00
ARCH_PROVIDER_HINT_HEADER , ARCH_ROUTING_HEADER , CHAT_COMPLETIONS_PATH , HEALTHZ_PATH ,
2024-11-15 10:44:01 -08:00
RATELIMIT_SELECTOR_HEADER_KEY , REQUEST_ID_HEADER , TRACE_PARENT_HEADER ,
2024-10-17 10:16:40 -07:00
} ;
2024-10-18 12:53:44 -07:00
use common ::errors ::ServerError ;
2024-10-17 10:16:40 -07:00
use common ::llm_providers ::LlmProviders ;
use common ::ratelimit ::Header ;
2024-12-09 10:46:46 -08:00
use common ::stats ::{ IncrementingMetric , RecordingMetric } ;
2024-11-18 17:55:39 -08:00
use common ::tracing ::{ Event , Span , TraceData , Traceparent } ;
2024-10-17 10:16:40 -07:00
use common ::{ ratelimit , routing , tokenizer } ;
2025-06-10 12:53:27 -07:00
use hermesllm ::providers ::openai ::types ::{ ChatCompletionsRequest , SseChatCompletionIter } ;
use hermesllm ::providers ::openai ::types ::{
ChatCompletionsResponse , ContentType , Message , StreamOptions ,
} ;
use hermesllm ::Provider ;
2024-10-17 10:16:40 -07:00
use http ::StatusCode ;
2025-03-27 10:40:20 -07:00
use log ::{ debug , info , warn } ;
2024-12-09 10:46:46 -08:00
use proxy_wasm ::hostcalls ::get_current_time ;
2024-10-17 10:16:40 -07:00
use proxy_wasm ::traits ::* ;
use proxy_wasm ::types ::* ;
2024-11-18 17:55:39 -08:00
use std ::collections ::VecDeque ;
2024-10-17 10:16:40 -07:00
use std ::num ::NonZero ;
use std ::rc ::Rc ;
2024-11-18 17:55:39 -08:00
use std ::sync ::{ Arc , Mutex } ;
2024-11-15 10:44:01 -08:00
use std ::time ::{ Duration , SystemTime , UNIX_EPOCH } ;
2024-10-17 10:16:40 -07:00
2024-10-18 12:53:44 -07:00
/// Per-HTTP-stream state for the LLM gateway filter.
///
/// One instance exists per proxied request; it tracks provider selection,
/// rate-limit bookkeeping, token accounting, and tracing metadata across the
/// request/response lifecycle callbacks.
pub struct StreamContext {
    // Envoy-assigned id for this stream; used only in log lines.
    context_id: u32,
    // Shared metric handles (histograms/counters) recorded during the stream.
    metrics: Rc<Metrics>,
    // Header key/value captured on the request to select a rate-limit bucket.
    ratelimit_selector: Option<Header>,
    // True when the request asked for an SSE streaming completion.
    streaming_response: bool,
    // Running count of completion tokens observed in the response.
    response_tokens: usize,
    // True when the request path is the chat-completions endpoint.
    is_chat_completions_request: bool,
    // All configured providers; consulted when selecting one per request.
    llm_providers: Rc<LlmProviders>,
    // Provider chosen for this request (set during the request-header phase).
    llm_provider: Option<Rc<LlmProvider>>,
    // Value of the request-id header, if the caller sent one.
    request_id: Option<String>,
    // Wall-clock time when this context was created; basis for latency metrics.
    start_time: SystemTime,
    // Time-to-first-token duration, recorded once for streaming responses.
    ttft_duration: Option<Duration>,
    // Absolute timestamp (ns since Unix epoch) of the first token, for tracing.
    ttft_time: Option<u128>,
    // Raw W3C traceparent header value, if present on the request.
    traceparent: Option<String>,
    // Timestamp (ns since Unix epoch) when the request body was first seen.
    request_body_sent_time: Option<u128>,
    // Last user-role message from the request, attached to trace spans.
    user_message: Option<Message>,
    // Queue shared with the tracing exporter; completed traces are pushed here.
    traces_queue: Arc<Mutex<VecDeque<TraceData>>>,
    // Optional global behavior overrides from configuration.
    overrides: Rc<Option<Overrides>>,
}
2024-10-18 12:53:44 -07:00
impl StreamContext {
2024-11-18 17:55:39 -08:00
pub fn new (
context_id : u32 ,
2024-12-09 10:46:46 -08:00
metrics : Rc < Metrics > ,
2024-11-18 17:55:39 -08:00
llm_providers : Rc < LlmProviders > ,
traces_queue : Arc < Mutex < VecDeque < TraceData > > > ,
2025-03-19 15:21:34 -07:00
overrides : Rc < Option < Overrides > > ,
2024-11-18 17:55:39 -08:00
) -> Self {
2024-10-18 12:53:44 -07:00
StreamContext {
2024-10-17 10:16:40 -07:00
context_id ,
metrics ,
2025-03-19 15:21:34 -07:00
overrides ,
2024-10-17 10:16:40 -07:00
ratelimit_selector : None ,
streaming_response : false ,
response_tokens : 0 ,
is_chat_completions_request : false ,
llm_providers ,
llm_provider : None ,
request_id : None ,
2024-11-18 17:55:39 -08:00
start_time : SystemTime ::now ( ) ,
2024-11-12 15:03:26 -08:00
ttft_duration : None ,
2024-11-15 10:44:01 -08:00
traceparent : None ,
ttft_time : None ,
user_message : None ,
2024-11-18 17:55:39 -08:00
traces_queue ,
2024-11-17 17:01:19 -08:00
request_body_sent_time : None ,
2024-10-17 10:16:40 -07:00
}
}
fn llm_provider ( & self ) -> & LlmProvider {
self . llm_provider
. as_ref ( )
. expect ( " the provider should be set when asked for it " )
}
fn select_llm_provider ( & mut self ) {
let provider_hint = self
. get_http_request_header ( ARCH_PROVIDER_HINT_HEADER )
2025-01-17 18:25:55 -08:00
. map ( | llm_name | llm_name . into ( ) ) ;
2024-10-17 10:16:40 -07:00
self . llm_provider = Some ( routing ::get_llm_provider (
& self . llm_providers ,
provider_hint ,
) ) ;
2025-01-31 10:37:53 -08:00
2025-06-11 15:15:00 -07:00
match self . llm_provider . as_ref ( ) . unwrap ( ) . provider_interface {
LlmProviderType ::Groq = > {
if let Some ( path ) = self . get_http_request_header ( " :path " ) {
if path . starts_with ( " /v1/ " ) {
let new_path = format! ( " /openai {} " , path ) ;
self . set_http_request_header ( " :path " , Some ( new_path . as_str ( ) ) ) ;
}
}
}
LlmProviderType ::Gemini = > {
if let Some ( path ) = self . get_http_request_header ( " :path " ) {
if path = = " /v1/chat/completions " {
self . set_http_request_header (
" :path " ,
Some ( " /v1beta/openai/chat/completions " ) ,
) ;
}
2025-04-13 14:00:16 -07:00
}
}
2025-06-11 15:15:00 -07:00
_ = > { }
2025-04-13 14:00:16 -07:00
}
2025-03-27 10:40:20 -07:00
debug! (
2025-03-05 14:08:06 -08:00
" request received: llm provider hint: {}, selected llm: {}, model: {} " ,
self . get_http_request_header ( ARCH_PROVIDER_HINT_HEADER )
. unwrap_or_default ( ) ,
self . llm_provider . as_ref ( ) . unwrap ( ) . name ,
2025-03-19 15:21:34 -07:00
self . llm_provider
. as_ref ( )
. unwrap ( )
. model
. as_ref ( )
. unwrap_or ( & String ::new ( ) )
2025-01-31 10:37:53 -08:00
) ;
2024-10-17 10:16:40 -07:00
}
fn modify_auth_headers ( & mut self ) -> Result < ( ) , ServerError > {
let llm_provider_api_key_value =
self . llm_provider ( )
. access_key
. as_ref ( )
. ok_or ( ServerError ::BadRequest {
why : format ! (
" No access key configured for selected LLM Provider \" {} \" " ,
self . llm_provider ( )
) ,
} ) ? ;
let authorization_header_value = format! ( " Bearer {} " , llm_provider_api_key_value ) ;
self . set_http_request_header ( " Authorization " , Some ( & authorization_header_value ) ) ;
Ok ( ( ) )
}
fn delete_content_length_header ( & mut self ) {
// Remove the Content-Length header because further body manipulations in the gateway logic will invalidate it.
// Server's generally throw away requests whose body length do not match the Content-Length header.
// However, a missing Content-Length header is not grounds for bad requests given that intermediary hops could
// manipulate the body in benign ways e.g., compression.
self . set_http_request_header ( " content-length " , None ) ;
}
fn save_ratelimit_header ( & mut self ) {
self . ratelimit_selector = self
. get_http_request_header ( RATELIMIT_SELECTOR_HEADER_KEY )
. and_then ( | key | {
self . get_http_request_header ( & key )
. map ( | value | Header { key , value } )
} ) ;
}
fn send_server_error ( & self , error : ServerError , override_status_code : Option < StatusCode > ) {
2025-03-27 10:40:20 -07:00
warn! ( " server error occurred: {} " , error ) ;
2024-10-17 10:16:40 -07:00
self . send_http_response (
override_status_code
. unwrap_or ( StatusCode ::INTERNAL_SERVER_ERROR )
. as_u16 ( )
. into ( ) ,
vec! [ ] ,
Some ( format! ( " {error} " ) . as_bytes ( ) ) ,
) ;
}
fn enforce_ratelimits (
& mut self ,
model : & str ,
json_string : & str ,
) -> Result < ( ) , ratelimit ::Error > {
2024-11-12 15:03:26 -08:00
// Tokenize and record token count.
let token_count = tokenizer ::token_count ( model , json_string ) . unwrap_or ( 0 ) ;
2025-03-27 10:40:20 -07:00
debug! ( " Recorded input token count: {} " , token_count ) ;
2024-11-12 15:03:26 -08:00
// Record the token count to metrics.
self . metrics
. input_sequence_length
. record ( token_count as u64 ) ;
// Check if rate limiting needs to be applied.
2024-10-17 10:16:40 -07:00
if let Some ( selector ) = self . ratelimit_selector . take ( ) {
2025-03-27 10:40:20 -07:00
log ::debug! ( " Applying ratelimit for model: {} " , model ) ;
2024-11-12 15:03:26 -08:00
ratelimit ::ratelimits ( None ) . read ( ) . unwrap ( ) . check_limit (
model . to_owned ( ) ,
selector ,
NonZero ::new ( token_count as u32 ) . unwrap ( ) ,
) ? ;
} else {
2025-03-27 10:40:20 -07:00
debug! ( " No rate limit applied for model: {} " , model ) ;
2024-10-17 10:16:40 -07:00
}
2024-11-12 15:03:26 -08:00
2024-10-17 10:16:40 -07:00
Ok ( ( ) )
}
}
// HttpContext is the trait that allows the Rust code to interact with HTTP objects.
2024-10-18 12:53:44 -07:00
impl HttpContext for StreamContext {
2024-10-17 10:16:40 -07:00
// Envoy's HTTP model is event driven. The WASM ABI has given implementors events to hook onto
// the lifecycle of the http request and response.
fn on_http_request_headers ( & mut self , _num_headers : usize , _end_of_stream : bool ) -> Action {
2025-03-03 13:11:57 -08:00
let request_path = self . get_http_request_header ( " :path " ) . unwrap_or_default ( ) ;
if request_path = = HEALTHZ_PATH {
self . send_http_response ( 200 , vec! [ ] , None ) ;
return Action ::Continue ;
}
2025-06-11 15:15:00 -07:00
self . is_chat_completions_request = CHAT_COMPLETIONS_PATH = = request_path ;
2025-03-19 15:21:34 -07:00
let use_agent_orchestrator = match self . overrides . as_ref ( ) {
Some ( overrides ) = > overrides . use_agent_orchestrator . unwrap_or_default ( ) ,
None = > false ,
} ;
2025-01-17 18:25:55 -08:00
2025-06-10 12:53:27 -07:00
let routing_header_value = self . get_http_request_header ( ARCH_ROUTING_HEADER ) ;
if routing_header_value . is_some ( ) & & ! routing_header_value . as_ref ( ) . unwrap ( ) . is_empty ( ) {
let routing_header_value = routing_header_value . as_ref ( ) . unwrap ( ) ;
2025-03-27 10:40:20 -07:00
info! ( " routing header already set: {} " , routing_header_value ) ;
2025-03-19 15:21:34 -07:00
self . llm_provider = Some ( Rc ::new ( LlmProvider {
name : routing_header_value . to_string ( ) ,
provider_interface : LlmProviderType ::OpenAI ,
2025-05-22 22:55:46 -07:00
.. Default ::default ( )
2025-03-19 15:21:34 -07:00
} ) ) ;
} else {
self . select_llm_provider ( ) ;
2025-03-26 11:01:32 -07:00
if self . llm_provider ( ) . endpoint . is_some ( ) {
self . add_http_request_header (
ARCH_ROUTING_HEADER ,
& self . llm_provider ( ) . name . to_string ( ) ,
) ;
} else {
self . add_http_request_header (
ARCH_ROUTING_HEADER ,
& self . llm_provider ( ) . provider_interface . to_string ( ) ,
) ;
}
2025-03-19 15:21:34 -07:00
if let Err ( error ) = self . modify_auth_headers ( ) {
// ensure that the provider has an endpoint if the access key is missing else return a bad request
2025-07-08 00:33:40 -07:00
if self . llm_provider . as_ref ( ) . unwrap ( ) . endpoint . is_none ( )
& & ! use_agent_orchestrator
& & self . llm_provider . as_ref ( ) . unwrap ( ) . provider_interface
! = LlmProviderType ::Arch
2025-03-19 15:21:34 -07:00
{
self . send_server_error ( error , Some ( StatusCode ::BAD_REQUEST ) ) ;
}
2025-01-17 18:25:55 -08:00
}
2024-10-17 10:16:40 -07:00
}
2025-03-19 15:21:34 -07:00
2024-10-17 10:16:40 -07:00
self . delete_content_length_header ( ) ;
self . save_ratelimit_header ( ) ;
self . request_id = self . get_http_request_header ( REQUEST_ID_HEADER ) ;
2024-11-15 10:44:01 -08:00
self . traceparent = self . get_http_request_header ( TRACE_PARENT_HEADER ) ;
2024-11-12 15:03:26 -08:00
2024-10-17 10:16:40 -07:00
Action ::Continue
}
fn on_http_request_body ( & mut self , body_size : usize , end_of_stream : bool ) -> Action {
2025-03-27 10:40:20 -07:00
debug! (
" on_http_request_body [S={}] bytes={} end_stream={} " ,
self . context_id , body_size , end_of_stream
) ;
2024-10-17 10:16:40 -07:00
// Let the client send the gateway all the data before sending to the LLM_provider.
// TODO: consider a streaming API.
2024-11-17 17:01:19 -08:00
if self . request_body_sent_time . is_none ( ) {
2024-11-18 17:55:39 -08:00
self . request_body_sent_time = Some ( current_time_ns ( ) ) ;
2024-11-17 17:01:19 -08:00
}
2024-10-17 10:16:40 -07:00
if ! end_of_stream {
return Action ::Pause ;
}
if body_size = = 0 {
return Action ::Continue ;
}
2025-03-19 15:21:34 -07:00
let body_bytes = match self . get_http_request_body ( 0 , body_size ) {
Some ( body_bytes ) = > body_bytes ,
None = > {
self . send_server_error (
ServerError ::LogicError ( format! (
" Failed to obtain body bytes even though body_size is {} " ,
body_size
) ) ,
None ,
) ;
return Action ::Pause ;
}
} ;
2025-06-10 12:53:27 -07:00
let mut deserialized_body = match ChatCompletionsRequest ::try_from ( body_bytes . as_slice ( ) ) {
Ok ( deserialized ) = > deserialized ,
Err ( e ) = > {
debug! (
" on_http_request_body: request body: {} " ,
String ::from_utf8_lossy ( & body_bytes )
) ;
self . send_server_error ( ServerError ::OpenAIPError ( e ) , Some ( StatusCode ::BAD_REQUEST ) ) ;
return Action ::Pause ;
}
} ;
2024-10-17 10:16:40 -07:00
2024-11-15 10:44:01 -08:00
self . user_message = deserialized_body
. messages
. iter ( )
. filter ( | m | m . role = = " user " )
. last ( )
. cloned ( ) ;
2025-03-19 15:21:34 -07:00
let model_name = match self . llm_provider . as_ref ( ) {
2025-03-21 15:56:17 -07:00
Some ( llm_provider ) = > llm_provider . model . as_ref ( ) ,
None = > None ,
2025-03-19 15:21:34 -07:00
} ;
2025-03-21 15:56:17 -07:00
let use_agent_orchestrator = match self . overrides . as_ref ( ) {
Some ( overrides ) = > overrides . use_agent_orchestrator . unwrap_or_default ( ) ,
None = > false ,
} ;
2025-03-19 15:21:34 -07:00
2025-03-21 15:56:17 -07:00
let model_requested = deserialized_body . model . clone ( ) ;
2025-05-19 09:59:22 -07:00
deserialized_body . model = match model_name {
Some ( model_name ) = > model_name . clone ( ) ,
None = > {
if use_agent_orchestrator {
" agent_orchestrator " . to_string ( )
} else {
self . send_server_error (
ServerError ::BadRequest {
why : format ! ( " No model specified in request and couldn't determine model name from arch_config. Model name in req: {}, arch_config, provider: {}, model: {:?} " , deserialized_body . model , self . llm_provider ( ) . name , self . llm_provider ( ) . model ) . to_string ( ) ,
} ,
Some ( StatusCode ::BAD_REQUEST ) ,
) ;
return Action ::Continue ;
2025-03-21 15:56:17 -07:00
}
}
2025-05-19 09:59:22 -07:00
} ;
2024-10-17 10:16:40 -07:00
2025-03-27 10:40:20 -07:00
info! (
" on_http_request_body: provider: {}, model requested: {}, model selected: {} " ,
2025-03-21 15:56:17 -07:00
self . llm_provider ( ) . name ,
model_requested ,
2025-03-27 10:40:20 -07:00
model_name . unwrap_or ( & " None " . to_string ( ) ) ,
2024-10-28 20:05:06 -04:00
) ;
2025-06-10 12:53:27 -07:00
if deserialized_body . stream . unwrap_or_default ( ) {
2024-10-28 20:05:06 -04:00
self . streaming_response = true ;
}
2025-06-10 12:53:27 -07:00
if deserialized_body . stream . unwrap_or_default ( )
& & deserialized_body . stream_options . is_none ( )
{
2024-10-28 20:05:06 -04:00
deserialized_body . stream_options = Some ( StreamOptions {
include_usage : true ,
} ) ;
}
2024-11-12 15:03:26 -08:00
// only use the tokens from the messages, excluding the metadata and json tags
let input_tokens_str = deserialized_body
. messages
. iter ( )
. fold ( String ::new ( ) , | acc , m | {
2025-05-23 00:51:53 -07:00
acc + " "
+ m . content
. as_ref ( )
. unwrap_or ( & ContentType ::Text ( String ::new ( ) ) )
. to_string ( )
. as_str ( )
2024-11-12 15:03:26 -08:00
} ) ;
2024-10-17 10:16:40 -07:00
// enforce ratelimits on ingress
2024-11-12 15:03:26 -08:00
if let Err ( e ) = self . enforce_ratelimits ( & deserialized_body . model , input_tokens_str . as_str ( ) )
2024-10-17 10:16:40 -07:00
{
self . send_server_error (
ServerError ::ExceededRatelimit ( e ) ,
Some ( StatusCode ::TOO_MANY_REQUESTS ) ,
) ;
self . metrics . ratelimited_rq . increment ( 1 ) ;
return Action ::Continue ;
}
2025-06-10 12:53:27 -07:00
let llm_provider_str = self . llm_provider ( ) . provider_interface . to_string ( ) ;
let hermes_llm_provider = Provider ::from ( llm_provider_str . as_str ( ) ) ;
// convert chat completion request to llm provider specific request
let deserialized_body_bytes = match deserialized_body . to_bytes ( hermes_llm_provider ) {
Ok ( bytes ) = > bytes ,
Err ( e ) = > {
warn! ( " Failed to serialize request body: {} " , e ) ;
self . send_server_error ( ServerError ::OpenAIPError ( e ) , Some ( StatusCode ::BAD_REQUEST ) ) ;
return Action ::Pause ;
}
} ;
self . set_http_request_body ( 0 , body_size , & deserialized_body_bytes ) ;
2024-10-17 10:16:40 -07:00
Action ::Continue
}
2025-06-11 15:15:00 -07:00
fn on_http_response_headers ( & mut self , _num_headers : usize , end_of_stream : bool ) -> Action {
2025-03-27 10:40:20 -07:00
debug! (
2024-11-18 17:55:39 -08:00
" on_http_response_headers [S={}] end_stream={} " ,
2025-06-11 15:15:00 -07:00
self . context_id , end_of_stream
2024-11-18 17:55:39 -08:00
) ;
self . set_property (
vec! [ " metadata " , " filter_metadata " , " llm_filter " , " user_prompt " ] ,
Some ( " hello world from filter " . as_bytes ( ) ) ,
) ;
Action ::Continue
}
2024-10-17 10:16:40 -07:00
fn on_http_response_body ( & mut self , body_size : usize , end_of_stream : bool ) -> Action {
2025-03-27 10:40:20 -07:00
debug! (
2024-10-28 20:05:06 -04:00
" on_http_response_body [S={}] bytes={} end_stream={} " ,
2025-03-27 10:40:20 -07:00
self . context_id , body_size , end_of_stream
2024-10-17 10:16:40 -07:00
) ;
2025-03-27 10:40:20 -07:00
if self . request_body_sent_time . is_none ( ) {
2025-04-15 14:39:12 -07:00
debug! ( " on_http_response_body: request body not sent, not doing any processing in llm filter " ) ;
2025-03-27 10:40:20 -07:00
return Action ::Continue ;
}
2024-10-17 10:16:40 -07:00
if ! self . is_chat_completions_request {
2025-03-27 10:40:20 -07:00
info! ( " on_http_response_body: non-chatcompletion request " ) ;
2024-10-17 10:16:40 -07:00
return Action ::Continue ;
}
2024-11-12 15:03:26 -08:00
let current_time = get_current_time ( ) . unwrap ( ) ;
if end_of_stream & & body_size = = 0 {
// All streaming responses end with bytes=0 and end_stream=true
// Record the latency for the request
2024-11-18 17:55:39 -08:00
match current_time . duration_since ( self . start_time ) {
Ok ( duration ) = > {
// Convert the duration to milliseconds
let duration_ms = duration . as_millis ( ) ;
2025-03-27 10:40:20 -07:00
info! ( " on_http_response_body: request latency: {}ms " , duration_ms ) ;
2024-11-18 17:55:39 -08:00
// Record the latency to the latency histogram
self . metrics . request_latency . record ( duration_ms as u64 ) ;
2025-01-17 18:25:55 -08:00
if self . response_tokens > 0 {
// Compute the time per output token
let tpot = duration_ms as u64 / self . response_tokens as u64 ;
2024-11-18 17:55:39 -08:00
2025-01-17 18:25:55 -08:00
// Record the time per output token
self . metrics . time_per_output_token . record ( tpot ) ;
2024-11-18 17:55:39 -08:00
2025-03-27 10:40:20 -07:00
debug! (
2025-01-31 10:37:53 -08:00
" time per token: {}ms, tokens per second: {} " ,
tpot ,
1000 / tpot
) ;
2025-01-17 18:25:55 -08:00
// Record the tokens per second
self . metrics . tokens_per_second . record ( 1000 / tpot ) ;
}
2024-11-18 17:55:39 -08:00
}
Err ( e ) = > {
warn! ( " SystemTime error: {:?} " , e ) ;
2024-11-12 15:03:26 -08:00
}
2024-10-28 20:05:06 -04:00
}
2024-11-12 15:03:26 -08:00
// Record the output sequence length
self . metrics
. output_sequence_length
. record ( self . response_tokens as u64 ) ;
2024-11-15 10:44:01 -08:00
if let Some ( traceparent ) = self . traceparent . as_ref ( ) {
2024-11-18 17:55:39 -08:00
let current_time_ns = current_time_ns ( ) ;
2024-11-15 10:44:01 -08:00
2024-11-18 17:55:39 -08:00
match Traceparent ::try_from ( traceparent . to_string ( ) ) {
Err ( e ) = > {
warn! ( " traceparent header is invalid: {} " , e ) ;
2024-11-15 10:44:01 -08:00
}
2024-11-18 17:55:39 -08:00
Ok ( traceparent ) = > {
let mut trace_data = common ::tracing ::TraceData ::new ( ) ;
let mut llm_span = Span ::new (
2025-02-14 19:28:10 -08:00
" egress_traffic " . to_string ( ) ,
2024-11-18 17:55:39 -08:00
Some ( traceparent . trace_id ) ,
Some ( traceparent . parent_id ) ,
self . request_body_sent_time . unwrap ( ) ,
current_time_ns ,
) ;
if let Some ( user_message ) = self . user_message . as_ref ( ) {
if let Some ( prompt ) = user_message . content . as_ref ( ) {
llm_span
. add_attribute ( " user_prompt " . to_string ( ) , prompt . to_string ( ) ) ;
}
}
llm_span . add_attribute (
" model " . to_string ( ) ,
self . llm_provider ( ) . name . to_string ( ) ,
) ;
2025-01-17 18:25:55 -08:00
if self . ttft_time . is_some ( ) {
llm_span . add_event ( Event ::new (
" time_to_first_token " . to_string ( ) ,
self . ttft_time . unwrap ( ) ,
) ) ;
trace_data . add_span ( llm_span ) ;
}
2024-11-18 17:55:39 -08:00
self . traces_queue . lock ( ) . unwrap ( ) . push_back ( trace_data ) ;
}
} ;
2024-11-15 10:44:01 -08:00
}
2024-11-12 15:03:26 -08:00
return Action ::Continue ;
}
let body = if self . streaming_response {
2024-10-28 20:05:06 -04:00
let chunk_start = 0 ;
let chunk_size = body_size ;
2025-03-27 10:40:20 -07:00
debug! (
" on_http_response_body: streaming response reading, {}..{} " ,
chunk_start , chunk_size
2024-10-28 20:05:06 -04:00
) ;
let streaming_chunk = match self . get_http_response_body ( 0 , chunk_size ) {
Some ( chunk ) = > chunk ,
2024-10-17 10:16:40 -07:00
None = > {
2024-10-28 20:05:06 -04:00
warn! (
" response body empty, chunk_start: {}, chunk_size: {} " ,
chunk_start , chunk_size
2024-10-17 10:16:40 -07:00
) ;
2024-10-28 20:05:06 -04:00
return Action ::Continue ;
2024-10-17 10:16:40 -07:00
}
} ;
2024-10-28 20:05:06 -04:00
if streaming_chunk . len ( ) ! = chunk_size {
warn! (
" chunk size mismatch: read: {} != requested: {} " ,
streaming_chunk . len ( ) ,
chunk_size
) ;
}
streaming_chunk
} else {
2025-05-22 22:55:46 -07:00
if body_size = = 0 {
return Action ::Continue ;
}
2025-03-27 10:40:20 -07:00
debug! ( " non streaming response bytes read: 0:{} " , body_size ) ;
2024-10-28 20:05:06 -04:00
match self . get_http_response_body ( 0 , body_size ) {
Some ( body ) = > body ,
None = > {
warn! ( " non streaming response body empty " ) ;
return Action ::Continue ;
}
}
} ;
2025-06-11 15:15:00 -07:00
if log ::log_enabled! ( log ::Level ::Debug ) {
debug! (
" response data (converted to utf8): {} " ,
String ::from_utf8_lossy ( & body )
) ;
}
2025-06-10 12:53:27 -07:00
let llm_provider_str = self . llm_provider ( ) . provider_interface . to_string ( ) ;
let hermes_llm_provider = Provider ::from ( llm_provider_str . as_str ( ) ) ;
2024-10-28 20:05:06 -04:00
if self . streaming_response {
let chat_completions_chunk_response_events =
2025-06-10 12:53:27 -07:00
match SseChatCompletionIter ::try_from ( ( body . as_slice ( ) , & hermes_llm_provider ) ) {
Ok ( events ) = > events ,
2024-10-28 20:05:06 -04:00
Err ( e ) = > {
2025-07-02 14:08:19 -07:00
warn! (
" could not parse response: {}, body str: {} " ,
e ,
String ::from_utf8_lossy ( & body )
) ;
2024-10-17 10:16:40 -07:00
return Action ::Continue ;
}
} ;
2025-06-10 12:53:27 -07:00
for event in chat_completions_chunk_response_events {
match event {
Ok ( event ) = > {
if let Some ( usage ) = event . usage . as_ref ( ) {
self . response_tokens + = usage . completion_tokens ;
}
}
2024-10-28 20:05:06 -04:00
Err ( e ) = > {
2025-06-10 12:53:27 -07:00
warn! ( " error in response event: {} " , e ) ;
continue ;
2024-10-28 20:05:06 -04:00
}
2025-06-10 12:53:27 -07:00
}
}
2024-11-12 15:03:26 -08:00
// Compute TTFT if not already recorded
if self . ttft_duration . is_none ( ) {
2024-11-18 17:55:39 -08:00
// if let Some(start_time) = self.start_time {
let current_time = get_current_time ( ) . unwrap ( ) ;
self . ttft_time = Some ( current_time_ns ( ) ) ;
match current_time . duration_since ( self . start_time ) {
Ok ( duration ) = > {
let duration_ms = duration . as_millis ( ) ;
2025-03-27 10:40:20 -07:00
info! (
" on_http_response_body: time to first token: {}ms " ,
duration_ms
) ;
2024-11-18 17:55:39 -08:00
self . ttft_duration = Some ( duration ) ;
self . metrics . time_to_first_token . record ( duration_ms as u64 ) ;
}
Err ( e ) = > {
warn! ( " SystemTime error: {:?} " , e ) ;
2024-11-12 15:03:26 -08:00
}
}
}
2024-10-17 10:16:40 -07:00
} else {
2025-03-27 10:40:20 -07:00
debug! ( " non streaming response " ) ;
2025-06-10 12:53:27 -07:00
let chat_completions_response =
match ChatCompletionsResponse ::try_from ( ( body . as_slice ( ) , & hermes_llm_provider ) ) {
2024-10-17 10:16:40 -07:00
Ok ( de ) = > de ,
2025-06-10 12:53:27 -07:00
Err ( e ) = > {
2025-07-02 14:08:19 -07:00
warn! (
" could not parse response: {}, body str: {} " ,
e ,
String ::from_utf8_lossy ( & body )
) ;
2025-06-10 12:53:27 -07:00
debug! (
" on_http_response_body: S[{}], response body: {} " ,
self . context_id ,
String ::from_utf8_lossy ( & body )
) ;
self . send_server_error (
ServerError ::OpenAIPError ( e ) ,
Some ( StatusCode ::BAD_REQUEST ) ,
2025-01-31 10:37:53 -08:00
) ;
2024-10-18 13:14:18 -07:00
return Action ::Continue ;
2024-10-17 10:16:40 -07:00
}
} ;
2025-06-10 12:53:27 -07:00
if let Some ( usage ) = chat_completions_response . usage {
self . response_tokens + = usage . completion_tokens ;
2024-10-17 10:16:40 -07:00
}
}
2025-03-27 10:40:20 -07:00
debug! (
2024-10-17 10:16:40 -07:00
" recv [S={}] total_tokens={} end_stream={} " ,
2025-03-27 10:40:20 -07:00
self . context_id , self . response_tokens , end_of_stream
2024-10-17 10:16:40 -07:00
) ;
Action ::Continue
}
}
2024-11-18 17:55:39 -08:00
/// Current wall-clock time as nanoseconds since the Unix epoch.
fn current_time_ns() -> u128 {
    let since_epoch = SystemTime::now().duration_since(UNIX_EPOCH).unwrap();
    since_epoch.as_nanos()
}
2024-10-18 12:53:44 -07:00
// Root Context trait implementation: no context-lifecycle callbacks are
// needed for this filter, so the trait's default no-op methods are used.
impl Context for StreamContext {}