diff --git a/README.md b/README.md
index e4d637e9..b733bbd5 100644
--- a/README.md
+++ b/README.md
@@ -309,6 +309,33 @@ Arch is designed to support best-in class observability by supporting open stand
 ![alt text](docs/source/_static/img/tracing.png)
 
+## Debugging
+
+When debugging issues or errors, application logs and access logs provide key context on what is going on in the system. The Arch gateway runs at the info log level by default; the following is typical output from an interaction between a developer and the arch gateway:
+
+```
+$ archgw up --service archgw --foreground
+...
+[2025-03-26 18:32:01.350][26][info] prompt_gateway: on_http_request_body: sending request to model server
+[2025-03-26 18:32:01.851][26][info] prompt_gateway: on_http_call_response: model server response received
+[2025-03-26 18:32:01.852][26][info] prompt_gateway: on_http_call_response: dispatching api call to developer endpoint: weather_forecast_service, path: /weather, method: POST
+[2025-03-26 18:32:01.882][26][info] prompt_gateway: on_http_call_response: developer api call response received: status code: 200
+[2025-03-26 18:32:01.882][26][info] prompt_gateway: on_http_call_response: sending request to upstream llm
+[2025-03-26 18:32:01.883][26][info] llm_gateway: on_http_request_body: provider: gpt-4o-mini, model requested: None, model selected: gpt-4o-mini
+[2025-03-26 18:32:02.818][26][info] llm_gateway: on_http_response_body: time to first token: 1468ms
+[2025-03-26 18:32:04.532][26][info] llm_gateway: on_http_response_body: request latency: 3183ms
+...
+```
+
+The log level can be changed to debug for more detail. To enable debug logs, edit [Dockerfile](arch/Dockerfile) and change `--component-log-level wasm:info` to `--component-log-level wasm:debug`. Then rebuild the docker image and restart the arch gateway with the following commands:
+
+```
+# make sure you are at the root of the repo
+$ archgw build
+# go to the service directory that has your arch_config.yaml file and issue the following command:
+$ archgw up --service archgw --foreground
+```
+
 ## Contribution
 
 We would love feedback on our [Roadmap](https://github.com/orgs/katanemo/projects/1) and we welcome contributions to **Arch**! Whether you're fixing bugs, adding new features, improving documentation, or creating tutorials, your help is much appreciated.
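For reference, the log level is controlled by the Envoy flags on the ENTRYPOINT line of arch/Dockerfile; the snippet below is the debug variant of that line as it appears in this change (only the `--component-log-level` value differs from the shipped `wasm:info` default):

```dockerfile
# arch/Dockerfile — wasm:debug variant of the ENTRYPOINT; the default in this change uses wasm:info
ENTRYPOINT ["sh","-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:debug 2>&1 | tee /var/log/envoy.log"]
```

Because the ENTRYPOINT tees Envoy output to `/var/log/envoy.log`, you can also follow logs inside a running gateway container with something like `docker exec <archgw-container> tail -f /var/log/envoy.log` (the container name here is a placeholder and depends on your setup).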
diff --git a/arch/Dockerfile b/arch/Dockerfile index 98a113ad..7f933da5 100644 --- a/arch/Dockerfile +++ b/arch/Dockerfile @@ -29,4 +29,4 @@ RUN pip install requests RUN touch /var/log/envoy.log # ENTRYPOINT ["sh","-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --log-level trace 2>&1 | tee /var/log/envoy.log"] -ENTRYPOINT ["sh","-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:debug 2>&1 | tee /var/log/envoy.log"] +ENTRYPOINT ["sh","-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:info 2>&1 | tee /var/log/envoy.log"] diff --git a/crates/common/src/http.rs b/crates/common/src/http.rs index 85626f9b..8ef176b2 100644 --- a/crates/common/src/http.rs +++ b/crates/common/src/http.rs @@ -3,7 +3,7 @@ use crate::{ stats::{Gauge, IncrementingMetric}, }; use derivative::Derivative; -use log::trace; +use log::debug; use proxy_wasm::traits::Context; use serde::Serialize; use std::{cell::RefCell, collections::HashMap, fmt::Debug, time::Duration}; @@ -48,7 +48,7 @@ pub trait Client: Context { call_args: CallArgs, call_context: Self::CallContext, ) -> Result { - trace!( + debug!( "dispatching http call with args={:?} context={:?}", call_args, call_context diff --git a/crates/common/src/ratelimit.rs b/crates/common/src/ratelimit.rs index 8825fd01..ef4e5d08 100644 --- a/crates/common/src/ratelimit.rs +++ b/crates/common/src/ratelimit.rs @@ -1,7 +1,7 @@ use crate::configuration; use configuration::{Limit, Ratelimit, TimeUnit}; use governor::{DefaultKeyedRateLimiter, InsufficientCapacity, Quota}; -use log::trace; +use log::debug; use std::fmt::Display; use std::num::{NonZero, NonZeroU32}; use std::sync::RwLock; @@ -99,7 +99,7 @@ impl RatelimitMap { selector: Header, tokens_used: NonZeroU32, ) -> Result<(), Error> { - trace!( + debug!( "Checking limit for provider={}, with selector={:?}, consuming tokens={:?}", provider, selector, diff --git a/crates/common/src/tokenizer.rs b/crates/common/src/tokenizer.rs index d2611375..e9431521 100644 --- a/crates/common/src/tokenizer.rs +++ b/crates/common/src/tokenizer.rs @@ -1,14 +1,14 @@ -use log::trace; +use log::{debug}; #[allow(dead_code)] pub fn token_count(model_name: &str, text: &str) -> Result { - trace!("getting token count model={}", model_name); + debug!("getting token count model={}", model_name); //HACK: add support for tokenizing mistral and other models //filed issue https://github.com/katanemo/arch/issues/222 let updated_model = match model_name.starts_with("gpt") { false => { - trace!( + debug!( "tiktoken_rs: unsupported model: {}, using gpt-4 to compute token count", model_name ); diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index c621393f..78d7e21e 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -15,7 +15,7 @@ use common::stats::{IncrementingMetric, RecordingMetric}; use common::tracing::{Event, Span, TraceData, Traceparent}; use common::{ratelimit, routing, tokenizer}; use http::StatusCode; -use log::{debug, trace, warn}; +use log::{debug, info, warn}; use proxy_wasm::hostcalls::get_current_time; use proxy_wasm::traits::*; use proxy_wasm::types::*; @@ -89,7 +89,7 @@ impl StreamContext { provider_hint, )); - trace!( + debug!( "request received: llm 
provider hint: {}, selected llm: {}, model: {}", self.get_http_request_header(ARCH_PROVIDER_HINT_HEADER) .unwrap_or_default(), @@ -140,7 +140,7 @@ impl StreamContext { } fn send_server_error(&self, error: ServerError, override_status_code: Option) { - debug!("server error occurred: {}", error); + warn!("server error occurred: {}", error); self.send_http_response( override_status_code .unwrap_or(StatusCode::INTERNAL_SERVER_ERROR) @@ -159,7 +159,7 @@ impl StreamContext { // Tokenize and record token count. let token_count = tokenizer::token_count(model, json_string).unwrap_or(0); - trace!("Recorded input token count: {}", token_count); + debug!("Recorded input token count: {}", token_count); // Record the token count to metrics. self.metrics .input_sequence_length @@ -167,14 +167,14 @@ impl StreamContext { // Check if rate limiting needs to be applied. if let Some(selector) = self.ratelimit_selector.take() { - log::trace!("Applying ratelimit for model: {}", model); + log::debug!("Applying ratelimit for model: {}", model); ratelimit::ratelimits(None).read().unwrap().check_limit( model.to_owned(), selector, NonZero::new(token_count as u32).unwrap(), )?; } else { - trace!("No rate limit applied for model: {}", model); + debug!("No rate limit applied for model: {}", model); } Ok(()) @@ -200,7 +200,7 @@ impl HttpContext for StreamContext { }; if let Some(routing_header_value) = routing_header_value.as_ref() { - debug!("routing header already set: {}", routing_header_value); + info!("routing header already set: {}", routing_header_value); self.llm_provider = Some(Rc::new(LlmProvider { name: routing_header_value.to_string(), provider_interface: LlmProviderType::OpenAI, @@ -247,6 +247,11 @@ impl HttpContext for StreamContext { } fn on_http_request_body(&mut self, body_size: usize, end_of_stream: bool) -> Action { + debug!( + "on_http_request_body [S={}] bytes={} end_stream={}", + self.context_id, body_size, end_of_stream + ); + // Let the client send the gateway all the data before sending to the LLM_provider. // TODO: consider a streaming API. 
@@ -282,7 +287,10 @@ impl HttpContext for StreamContext { match serde_json::from_slice(&body_bytes) { Ok(deserialized) => deserialized, Err(e) => { - debug!("body str: {}", String::from_utf8_lossy(&body_bytes)); + debug!( + "on_http_request_body: request body: {}", + String::from_utf8_lossy(&body_bytes) + ); self.send_server_error( ServerError::Deserialization(e), Some(StatusCode::BAD_REQUEST), @@ -336,16 +344,19 @@ impl HttpContext for StreamContext { } } - debug!( - "provider: {:?}, model requested: {}, model selected: {:?}", + info!( + "on_http_request_body: provider: {}, model requested: {}, model selected: {}", self.llm_provider().name, model_requested, - model_name, + model_name.unwrap_or(&"None".to_string()), ); let chat_completion_request_str = serde_json::to_string(&deserialized_body).unwrap(); - trace!("request body: {}", chat_completion_request_str); + debug!( + "on_http_request_body: request body: {}", + chat_completion_request_str + ); if deserialized_body.stream { self.streaming_response = true; @@ -380,10 +391,9 @@ impl HttpContext for StreamContext { } fn on_http_response_headers(&mut self, _num_headers: usize, _end_of_stream: bool) -> Action { - trace!( + debug!( "on_http_response_headers [S={}] end_stream={}", - self.context_id, - _end_of_stream + self.context_id, _end_of_stream ); self.set_property( @@ -395,15 +405,18 @@ impl HttpContext for StreamContext { } fn on_http_response_body(&mut self, body_size: usize, end_of_stream: bool) -> Action { - trace!( + debug!( "on_http_response_body [S={}] bytes={} end_stream={}", - self.context_id, - body_size, - end_of_stream + self.context_id, body_size, end_of_stream ); + if self.request_body_sent_time.is_none() { + debug!("on_http_response_body: request body not sent, not doing any processing in llm filter"); + return Action::Continue; + } + if !self.is_chat_completions_request { - debug!("non-chatcompletion request"); + info!("on_http_response_body: non-chatcompletion request"); return Action::Continue; } @@ -415,7 +428,7 @@ impl HttpContext for StreamContext { Ok(duration) => { // Convert the duration to milliseconds let duration_ms = duration.as_millis(); - debug!("request latency: {}ms", duration_ms); + info!("on_http_response_body: request latency: {}ms", duration_ms); // Record the latency to the latency histogram self.metrics.request_latency.record(duration_ms as u64); @@ -426,7 +439,7 @@ impl HttpContext for StreamContext { // Record the time per output token self.metrics.time_per_output_token.record(tpot); - trace!( + debug!( "time per token: {}ms, tokens per second: {}", tpot, 1000 / tpot @@ -490,10 +503,9 @@ impl HttpContext for StreamContext { let body = if self.streaming_response { let chunk_start = 0; let chunk_size = body_size; - trace!( - "streaming response reading, {}..{}", - chunk_start, - chunk_size + debug!( + "on_http_response_body: streaming response reading, {}..{}", + chunk_start, chunk_size ); let streaming_chunk = match self.get_http_response_body(0, chunk_size) { Some(chunk) => chunk, @@ -515,7 +527,7 @@ impl HttpContext for StreamContext { } streaming_chunk } else { - trace!("non streaming response bytes read: 0:{}", body_size); + debug!("non streaming response bytes read: 0:{}", body_size); match self.get_http_response_body(0, body_size) { Some(body) => body, None => { @@ -528,7 +540,7 @@ impl HttpContext for StreamContext { let body_utf8 = match String::from_utf8(body) { Ok(body_utf8) => body_utf8, Err(e) => { - debug!("could not convert to utf8: {}", e); + warn!("could not convert to utf8: {}", e);
return Action::Continue; } }; @@ -542,7 +554,7 @@ impl HttpContext for StreamContext { match ChatCompletionStreamResponseServerEvents::try_from(body_utf8.as_str()) { Ok(response) => response, Err(e) => { - debug!( + warn!( "invalid streaming response: body str: {}, {:?}", body_utf8, e ); @@ -551,8 +563,8 @@ impl HttpContext for StreamContext { }; if chat_completions_chunk_response_events.events.is_empty() { - debug!( - "cound't parse any streaming events: body str: {}", + warn!( + "couldn't parse any streaming events: body str: {}", body_utf8 ); return Action::Continue; @@ -571,7 +583,7 @@ impl HttpContext for StreamContext { { Ok(token_count) => token_count, Err(e) => { - debug!("could not get token count: {:?}", e); + warn!("could not get token count: {:?}", e); return Action::Continue; } }; @@ -585,7 +597,10 @@ impl HttpContext for StreamContext { match current_time.duration_since(self.start_time) { Ok(duration) => { let duration_ms = duration.as_millis(); - debug!("time to first token: {}ms", duration_ms); + info!( + "on_http_response_body: time to first token: {}ms", + duration_ms + ); self.ttft_duration = Some(duration); self.metrics.time_to_first_token.record(duration_ms as u64); } @@ -595,12 +610,12 @@ impl HttpContext for StreamContext { } } } else { - trace!("non streaming response"); + debug!("non streaming response"); let chat_completions_response: ChatCompletionsResponse = match serde_json::from_str(body_utf8.as_str()) { Ok(de) => de, Err(err) => { - debug!( + info!( "non chat-completion compliant response received err: {}, body: {}", err, body_utf8 ); @@ -617,11 +632,9 @@ impl HttpContext for StreamContext { } } - trace!( + debug!( "recv [S={}] total_tokens={} end_stream={}", - self.context_id, - self.response_tokens, - end_of_stream + self.context_id, self.response_tokens, end_of_stream ); Action::Continue diff --git a/crates/llm_gateway/tests/integration.rs b/crates/llm_gateway/tests/integration.rs index 9d87c00c..0f94d2d8 100644 --- a/crates/llm_gateway/tests/integration.rs +++ b/crates/llm_gateway/tests/integration.rs @@ -30,7 +30,7 @@ fn request_headers_expectations(module: &mut Tester, http_context: i32) { Some("x-arch-llm-provider-hint"), ) .returning(None) - .expect_log(Some(LogLevel::Trace), Some("request received: llm provider hint: default, selected llm: open-ai-gpt-4, model: gpt-4")) + .expect_log(Some(LogLevel::Debug), Some("request received: llm provider hint: default, selected llm: open-ai-gpt-4, model: gpt-4")) .expect_add_header_map_value( Some(MapType::HttpRequestHeaders), Some("x-arch-llm-provider"), @@ -223,15 +223,16 @@ fn llm_gateway_successful_request_to_open_ai_chat_completions() { chat_completions_request_body.len() as i32, true, ) + .expect_log(Some(LogLevel::Debug), None) .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody)) .returning(Some(chat_completions_request_body)) + .expect_log(Some(LogLevel::Info), None) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Debug), None) .expect_log(Some(LogLevel::Debug), None) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) .expect_metric_record("input_sequence_length", 21) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Debug), None) .expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None) .execute_and_expect(ReturnType::Action(Action::Continue)) .unwrap(); @@ -289,18 +290,19 @@ fn 
llm_gateway_bad_request_to_open_ai_chat_completions() { .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody)) .returning(Some(incomplete_chat_completions_request_body)) .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Info), Some("on_http_request_body: provider: open-ai-gpt-4, model requested: , model selected: gpt-4")) .expect_send_local_response( Some(StatusCode::BAD_REQUEST.as_u16().into()), None, None, None, ) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Debug), None) .expect_metric_record("input_sequence_length", 14) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Debug), None) .execute_and_expect(ReturnType::Action(Action::Continue)) .unwrap(); } @@ -350,17 +352,18 @@ fn llm_gateway_request_ratelimited() { chat_completions_request_body.len() as i32, true, ) + .expect_log(Some(LogLevel::Debug), None) .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody)) .returning(Some(chat_completions_request_body)) // The actual call is not important in this test, we just need to grab the token_id + .expect_log(Some(LogLevel::Info), None) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Debug), None) .expect_log(Some(LogLevel::Debug), None) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) .expect_metric_record("input_sequence_length", 107) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Debug), Some("server error occurred: exceeded limit provider=gpt-4, selector=Header { key: \"selector-key\", value: \"selector-value\" }, tokens_used=107")) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Warn), Some("server error occurred: exceeded limit provider=gpt-4, selector=Header { key: \"selector-key\", value: \"selector-value\" }, tokens_used=107")) .expect_send_local_response( Some(StatusCode::TOO_MANY_REQUESTS.as_u16().into()), None, @@ -417,16 +420,17 @@ fn llm_gateway_request_not_ratelimited() { chat_completions_request_body.len() as i32, true, ) + .expect_log(Some(LogLevel::Debug), None) .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody)) .returning(Some(chat_completions_request_body)) // The actual call is not important in this test, we just need to grab the token_id + .expect_log(Some(LogLevel::Info), None) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Debug), None) .expect_log(Some(LogLevel::Debug), None) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) .expect_metric_record("input_sequence_length", 29) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Debug), None) .expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None) .execute_and_expect(ReturnType::Action(Action::Continue)) .unwrap(); @@ -480,14 +484,15 @@ fn llm_gateway_override_model_name() { .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody)) .returning(Some(chat_completions_request_body)) // The actual call is not important in this test, 
we just need to grab the token_id - .expect_log(Some(LogLevel::Debug), Some("provider: \"open-ai-gpt-4\", model requested: o1-mini, model selected: Some(\"gpt-4\")")) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Info), Some("on_http_request_body: provider: open-ai-gpt-4, model requested: o1-mini, model selected: gpt-4")) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Debug), None) .expect_metric_record("input_sequence_length", 29) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Debug), None) .expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None) .execute_and_expect(ReturnType::Action(Action::Continue)) .unwrap(); @@ -537,19 +542,20 @@ fn llm_gateway_override_use_default_model() { chat_completions_request_body.len() as i32, true, ) + .expect_log(Some(LogLevel::Debug), None) .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody)) .returning(Some(chat_completions_request_body)) // The actual call is not important in this test, we just need to grab the token_id .expect_log( - Some(LogLevel::Debug), - Some("provider: \"open-ai-gpt-4\", model requested: , model selected: Some(\"gpt-4\")"), + Some(LogLevel::Info), + Some("on_http_request_body: provider: open-ai-gpt-4, model requested: , model selected: gpt-4"), ) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Debug), None) .expect_metric_record("input_sequence_length", 29) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Debug), None) .expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None) .execute_and_expect(ReturnType::Action(Action::Continue)) .unwrap(); @@ -603,13 +609,14 @@ fn llm_gateway_override_use_model_name_none() { .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody)) .returning(Some(chat_completions_request_body)) // The actual call is not important in this test, we just need to grab the token_id - .expect_log(Some(LogLevel::Debug), Some("provider: \"open-ai-gpt-4\", model requested: none, model selected: Some(\"gpt-4\")")) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Info), Some("on_http_request_body: provider: open-ai-gpt-4, model requested: none, model selected: gpt-4")) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Debug), None) .expect_metric_record("input_sequence_length", 29) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Debug), None) .expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None) .execute_and_expect(ReturnType::Action(Action::Continue)) .unwrap(); diff --git a/crates/prompt_gateway/src/http_context.rs 
b/crates/prompt_gateway/src/http_context.rs index 9580e934..3a7dc7d9 100644 --- a/crates/prompt_gateway/src/http_context.rs +++ b/crates/prompt_gateway/src/http_context.rs @@ -14,7 +14,7 @@ use common::{ pii::obfuscate_auth_header, }; use http::StatusCode; -use log::{debug, trace, warn}; +use log::{debug, info, warn}; use proxy_wasm::{traits::HttpContext, types::Action}; use serde_json::Value; use std::{ @@ -39,7 +39,7 @@ impl HttpContext for StreamContext { if let Some(endpoints) = self.endpoints.as_ref() { if endpoints.len() == 1 { let (name, _) = endpoints.iter().next().unwrap(); - debug!("Setting ARCH_PROVIDER_HINT_HEADER to {}", name); + info!("Setting ARCH_PROVIDER_HINT_HEADER to {}", name); self.set_http_request_header(ARCH_ROUTING_HEADER, Some(name)); } else { warn!("Need single endpoint when use_agent_orchestrator is set"); @@ -63,7 +63,7 @@ impl HttpContext for StreamContext { self.is_chat_completions_request = request_path == CHAT_COMPLETIONS_PATH; - trace!( + debug!( "on_http_request_headers S[{}] req_headers={:?}", self.context_id, obfuscate_auth_header(&mut self.get_http_request_headers()) @@ -89,10 +89,9 @@ impl HttpContext for StreamContext { self.request_body_size = body_size; - trace!( + debug!( "on_http_request_body S[{}] body_size={}", - self.context_id, - body_size + self.context_id, body_size ); let body_bytes = match self.get_http_request_body(0, body_size) { @@ -109,7 +108,7 @@ impl HttpContext for StreamContext { } }; - trace!("request body: {}", String::from_utf8_lossy(&body_bytes)); + debug!("request body: {}", String::from_utf8_lossy(&body_bytes)); // Deserialize body into spec. // Currently OpenAI API. @@ -206,8 +205,8 @@ impl HttpContext for StreamContext { } }; - debug!("sending request to model server"); - trace!("request body: {}", json_data); + info!("on_http_request_body: sending request to model server"); + debug!("request body: {}", json_data); let timeout_str = MODEL_SERVER_REQUEST_TIMEOUT_MS.to_string(); @@ -248,7 +247,7 @@ impl HttpContext for StreamContext { }; if let Err(e) = self.http_call(call_args, call_context) { - debug!("http_call failed: {:?}", e); + warn!("http_call failed: {:?}", e); self.send_server_error(ServerError::HttpDispatch(e), None); } @@ -256,7 +255,7 @@ impl HttpContext for StreamContext { } fn on_http_response_headers(&mut self, _num_headers: usize, _end_of_stream: bool) -> Action { - trace!( + debug!( "on_http_response_headers recv [S={}] headers={:?}", self.context_id, self.get_http_response_headers() @@ -268,15 +267,13 @@ impl HttpContext for StreamContext { } fn on_http_response_body(&mut self, body_size: usize, end_of_stream: bool) -> Action { - trace!( + debug!( "on_http_response_body: recv [S={}] bytes={} end_stream={}", - self.context_id, - body_size, - end_of_stream + self.context_id, body_size, end_of_stream ); if !self.is_chat_completions_request { - debug!("non-gpt request"); + info!("non-gpt request"); return Action::Continue; } @@ -315,7 +312,7 @@ impl HttpContext for StreamContext { streaming_chunk } else { - debug!("non streaming response bytes read: 0:{}", body_size); + info!("non streaming response bytes read: 0:{}", body_size); match self.get_http_response_body(0, body_size) { Some(body) => body, None => { @@ -328,13 +325,13 @@ impl HttpContext for StreamContext { let body_utf8 = match String::from_utf8(body) { Ok(body_utf8) => body_utf8, Err(e) => { - debug!("could not convert to utf8: {}", e); + info!("could not convert to utf8: {}", e); return Action::Continue; } }; if self.streaming_response { - 
trace!("streaming response"); + debug!("streaming response"); if self.tool_calls.is_some() && !self.tool_calls.as_ref().unwrap().is_empty() { let chunks = vec![ @@ -396,13 +393,13 @@ impl HttpContext for StreamContext { serde_json::Value::String(arch_state_str), ); let data_serialized = serde_json::to_string(&data).unwrap(); - debug!("archgw <= developer: {}", data_serialized); + info!("archgw <= developer: {}", data_serialized); self.set_http_response_body(0, body_size, data_serialized.as_bytes()); }; } } - trace!("recv [S={}] end_stream={}", self.context_id, end_of_stream); + debug!("recv [S={}] end_stream={}", self.context_id, end_of_stream); Action::Continue } diff --git a/crates/prompt_gateway/src/stream_context.rs b/crates/prompt_gateway/src/stream_context.rs index 4a968d43..ec68c104 100644 --- a/crates/prompt_gateway/src/stream_context.rs +++ b/crates/prompt_gateway/src/stream_context.rs @@ -15,7 +15,7 @@ use common::http::{CallArgs, Client}; use common::stats::Gauge; use derivative::Derivative; use http::StatusCode; -use log::{debug, trace, warn}; +use log::{debug, info, warn}; use proxy_wasm::traits::*; use std::cell::RefCell; use std::collections::HashMap; @@ -128,8 +128,8 @@ impl StreamContext { mut callout_context: StreamCallContext, ) { let body_str = String::from_utf8(body).unwrap(); - debug!("model server response received"); - trace!("response body: {}", body_str); + info!("on_http_call_response: model server response received"); + debug!("response body: {}", body_str); let model_server_response: ChatCompletionsResponse = match serde_json::from_str(&body_str) { Ok(arch_fc_response) => arch_fc_response, @@ -150,14 +150,14 @@ impl StreamContext { .is_some(); if !intent_matched { - debug!("intent not matched"); + info!("intent not matched"); // check if we have a default prompt target if let Some(default_prompt_target) = self .prompt_targets .values() .find(|pt| pt.default.unwrap_or(false)) { - debug!("default prompt target found, forwarding request to default prompt target"); + info!("default prompt target found, forwarding request to default prompt target"); let endpoint = default_prompt_target.endpoint.clone().unwrap(); let upstream_path: String = endpoint.path.unwrap_or(String::from("/")); @@ -204,7 +204,7 @@ impl StreamContext { } return; } else { - debug!("no default prompt target found, forwarding request to upstream llm"); + info!("no default prompt target found, forwarding request to upstream llm"); let mut messages = Vec::new(); // add system prompt match self.system_prompt.as_ref() { @@ -242,7 +242,7 @@ impl StreamContext { let chat_completion_request_json = serde_json::to_string(&chat_completion_request).unwrap(); - debug!( + info!( "archgw => upstream llm request: {}", chat_completion_request_json ); @@ -353,7 +353,7 @@ impl StreamContext { }; let body_str = serde_json::to_string(&chat_completion_request).unwrap(); - debug!("sending request to llm agent: {}", body_str); + info!("sending request to llm agent: {}", body_str); self.set_http_request_body(0, self.request_body_size, body_str.as_bytes()); self.resume_http_request(); return; @@ -396,7 +396,7 @@ impl StreamContext { } }; - debug!("api call body {:?}", api_call_body); + debug!("on_http_call_response: api call body {:?}", api_call_body); let timeout_str = API_REQUEST_TIMEOUT_MS.to_string(); @@ -436,8 +436,8 @@ impl StreamContext { Duration::from_secs(5), ); - debug!( - "dispatching api call to developer endpoint: {}, path: {}, method: {}", + info!( + "on_http_call_response: dispatching api call to developer 
endpoint: {}, path: {}, method: {}", endpoint_details.name, path, http_method_str ); @@ -454,8 +454,8 @@ impl StreamContext { let http_status = self .get_http_call_response_header(":status") .unwrap_or(StatusCode::OK.as_str().to_string()); - debug!( - "developer api call response received: status code: {}", + info!( + "on_http_call_response: developer api call response received: status code: {}", http_status ); let prompt_target = self @@ -479,7 +479,7 @@ impl StreamContext { ); } self.tool_call_response = Some(String::from_utf8(body).unwrap()); - trace!( + debug!( "response body: {}", self.tool_call_response.as_ref().unwrap() ); @@ -561,8 +561,8 @@ impl StreamContext { return self.send_server_error(ServerError::Serialization(e), None); } }; - debug!("sending request to upstream llm"); - trace!("request body: {}", llm_request_str); + info!("on_http_call_response: sending request to upstream llm"); + debug!("request body: {}", llm_request_str); self.start_upstream_llm_request_time = SystemTime::now() .duration_since(UNIX_EPOCH) @@ -755,7 +755,7 @@ impl StreamContext { }; let json_resp = serde_json::to_string(&chat_completion_request).unwrap(); - debug!("archgw => (default target) llm request: {}", json_resp); + info!("archgw => (default target) llm request: {}", json_resp); self.set_http_request_body(0, self.request_body_size, json_resp.as_bytes()); self.resume_http_request(); } diff --git a/crates/prompt_gateway/tests/integration.rs b/crates/prompt_gateway/tests/integration.rs index 1673ac16..bbde10b0 100644 --- a/crates/prompt_gateway/tests/integration.rs +++ b/crates/prompt_gateway/tests/integration.rs @@ -24,12 +24,12 @@ fn wasm_module() -> String { fn request_headers_expectations(module: &mut Tester, http_context: i32) { module .call_proxy_on_request_headers(http_context, 0, false) + .expect_log(Some(LogLevel::Debug), None) .expect_remove_header_map_value(Some(MapType::HttpRequestHeaders), Some("content-length")) .expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some(":path")) .returning(Some("/v1/chat/completions")) .expect_get_header_map_pairs(Some(MapType::HttpRequestHeaders)) .returning(None) - .expect_log(Some(LogLevel::Trace), None) .expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("x-request-id")) .returning(None) .expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("traceparent")) @@ -69,10 +69,14 @@ fn normal_flow(module: &mut Tester, filter_context: i32, http_context: i32) { chat_completions_request_body.len() as i32, true, ) + .expect_log(Some(LogLevel::Debug), None) .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody)) .returning(Some(chat_completions_request_body)) // The actual call is not important in this test, we just need to grab the token_id - .expect_log(Some(LogLevel::Trace), None) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Info), None) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Debug), None) .expect_http_call( Some("arch_internal"), Some(vec![ @@ -88,10 +92,6 @@ fn normal_flow(module: &mut Tester, filter_context: i32, http_context: i32) { Some(5000), ) .returning(Some(1)) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Debug), None) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) .expect_metric_increment("active_http_calls", 1) .execute_and_expect(ReturnType::Action(Action::Pause)) .unwrap(); @@ -233,13 +233,13 @@ fn prompt_gateway_successful_request_to_open_ai_chat_completions() { 
chat_completions_request_body.len() as i32, true, ) + .expect_log(Some(LogLevel::Debug), None) .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody)) .returning(Some(chat_completions_request_body)) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) .expect_log(Some(LogLevel::Debug), None) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) + .expect_log(Some(LogLevel::Info), None) + .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Debug), None) .expect_http_call(Some("arch_internal"), None, None, None, None) .returning(Some(4)) .expect_metric_increment("active_http_calls", 1) @@ -296,16 +296,16 @@ fn prompt_gateway_bad_request_to_open_ai_chat_completions() { incomplete_chat_completions_request_body.len() as i32, true, ) + .expect_log(Some(LogLevel::Debug), None) .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody)) .returning(Some(incomplete_chat_completions_request_body)) - .expect_log(Some(LogLevel::Trace), None) + .expect_log(Some(LogLevel::Debug), None) .expect_send_local_response( Some(StatusCode::BAD_REQUEST.as_u16().into()), None, None, None, ) - .expect_log(Some(LogLevel::Trace), None) .execute_and_expect(ReturnType::Action(Action::Pause)) .unwrap(); } @@ -378,11 +378,11 @@ fn prompt_gateway_request_to_llm_gateway() { .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody)) .returning(Some(&arch_fc_resp_str)) .expect_log(Some(LogLevel::Warn), None) - .expect_log(Some(LogLevel::Debug), None) - .expect_log(Some(LogLevel::Trace), None) + .expect_log(Some(LogLevel::Info), None) .expect_log(Some(LogLevel::Debug), None) .expect_log(Some(LogLevel::Debug), None) - .expect_log(Some(LogLevel::Trace), None) + .expect_log(Some(LogLevel::Info), None) + .expect_log(Some(LogLevel::Debug), None) .expect_http_call( Some("arch_internal"), Some(vec![ @@ -410,14 +410,14 @@ fn prompt_gateway_request_to_llm_gateway() { .expect_metric_increment("active_http_calls", -1) .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody)) .returning(Some(&body_text)) + .expect_log(Some(LogLevel::Info), None) .expect_log(Some(LogLevel::Debug), None) - .expect_log(Some(LogLevel::Trace), None) + .expect_log(Some(LogLevel::Info), None) .expect_log(Some(LogLevel::Debug), None) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Trace), None) .expect_get_header_map_value(Some(MapType::HttpCallResponseHeaders), Some(":status")) .returning(Some("200")) .expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None) + .expect_log(Some(LogLevel::Debug), None) .execute_and_expect(ReturnType::None) .unwrap(); @@ -449,11 +449,11 @@ fn prompt_gateway_request_to_llm_gateway() { ) .expect_get_buffer_bytes(Some(BufferType::HttpResponseBody)) .returning(Some(chat_completion_response_str.as_str())) - .expect_log(Some(LogLevel::Trace), None) .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Info), None) .expect_set_buffer_bytes(Some(BufferType::HttpResponseBody), None) + .expect_log(Some(LogLevel::Info), None) .expect_log(Some(LogLevel::Debug), None) - .expect_log(Some(LogLevel::Trace), None) .execute_and_expect(ReturnType::Action(Action::Continue)) .unwrap(); } @@ -521,15 +521,15 @@ fn prompt_gateway_request_no_intent_match() { .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody)) .returning(Some(&arch_fc_resp_str)) .expect_log(Some(LogLevel::Warn), None) + .expect_log(Some(LogLevel::Info), None) .expect_log(Some(LogLevel::Debug), None) - 
.expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Debug), Some("intent not matched")) + .expect_log(Some(LogLevel::Info), Some("intent not matched")) .expect_log( - Some(LogLevel::Debug), + Some(LogLevel::Info), Some("no default prompt target found, forwarding request to upstream llm"), ) - .expect_log(Some(LogLevel::Debug), None) - .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Info), None) + .expect_log(Some(LogLevel::Info), None) .expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None) .execute_and_expect(ReturnType::None) .unwrap(); @@ -677,15 +677,15 @@ fn prompt_gateway_request_no_intent_match_default_target() { .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody)) .returning(Some(&arch_fc_resp_str)) .expect_log(Some(LogLevel::Warn), None) + .expect_log(Some(LogLevel::Info), None) .expect_log(Some(LogLevel::Debug), None) - .expect_log(Some(LogLevel::Trace), None) - .expect_log(Some(LogLevel::Debug), Some("intent not matched")) + .expect_log(Some(LogLevel::Info), Some("intent not matched")) .expect_log( - Some(LogLevel::Debug), + Some(LogLevel::Info), Some("default prompt target found, forwarding request to default prompt target"), ) - .expect_log(Some(LogLevel::Trace), None) .expect_log(Some(LogLevel::Debug), None) + .expect_log(Some(LogLevel::Info), None) .expect_http_call( Some("arch_internal"), Some(vec![