add preliminary support for llm agents (#432)

2026-06-20 15:28:07 +02:00 · 2025-03-19 15:21:34 -07:00 · 2025-03-19 15:21:34 -07:00 · 84cd1df7bf
commit 84cd1df7bf
parent 8d66fefded
29 changed files with 1388 additions and 121 deletions
--- a/crates/common/src/api/open_ai.rs
+++ b/crates/common/src/api/open_ai.rs
@ -189,7 +189,7 @@ pub struct ToolCall {
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct FunctionCallDetail {
    pub name: String,
-    pub arguments: HashMap<String, Value>,
+    pub arguments: Option<HashMap<String, Value>>,
 }

 #[derive(Debug, Deserialize, Serialize)]
--- a/crates/common/src/configuration.rs
+++ b/crates/common/src/configuration.rs
@ -25,6 +25,7 @@ pub struct Configuration {
 pub struct Overrides {
    pub prompt_target_intent_matching_threshold: Option<f64>,
    pub optimize_context_window: Option<bool>,
+    pub use_agent_orchestrator: Option<bool>,
 }

 #[derive(Debug, Clone, Serialize, Deserialize, Default)]
@ -159,7 +160,7 @@ pub struct LlmProvider {
    pub name: String,
    pub provider_interface: LlmProviderType,
    pub access_key: Option<String>,
-    pub model: String,
+    pub model: Option<String>,
    pub default: Option<bool>,
    pub stream: Option<bool>,
    pub endpoint: Option<String>,
--- a/crates/llm_gateway/src/filter_context.rs
+++ b/crates/llm_gateway/src/filter_context.rs
@ -1,6 +1,7 @@
 use crate::metrics::Metrics;
 use crate::stream_context::StreamContext;
 use common::configuration::Configuration;
+use common::configuration::Overrides;
 use common::consts::OTEL_COLLECTOR_HTTP;
 use common::consts::OTEL_POST_PATH;
 use common::http::CallArgs;
@ -31,6 +32,7 @@ pub struct FilterContext {
    callouts: RefCell<HashMap<u32, CallContext>>,
    llm_providers: Option<Rc<LlmProviders>>,
    traces_queue: Arc<Mutex<VecDeque<TraceData>>>,
+    overrides: Rc<Option<Overrides>>,
 }

 impl FilterContext {
@ -40,6 +42,7 @@ impl FilterContext {
            metrics: Rc::new(Metrics::new()),
            llm_providers: None,
            traces_queue: Arc::new(Mutex::new(VecDeque::new())),
+            overrides: Rc::new(None),
        }
    }
 }
@ -69,6 +72,7 @@ impl RootContext for FilterContext {
        };

        ratelimit::ratelimits(Some(config.ratelimits.unwrap_or_default()));
+        self.overrides = Rc::new(config.overrides);

        match config.llm_providers.try_into() {
            Ok(llm_providers) => self.llm_providers = Some(Rc::new(llm_providers)),
@ -93,6 +97,7 @@ impl RootContext for FilterContext {
                    .expect("LLM Providers must exist when Streams are being created"),
            ),
            Arc::clone(&self.traces_queue),
+            Rc::clone(&self.overrides),
        )))
    }

--- a/crates/llm_gateway/src/stream_context.rs
+++ b/crates/llm_gateway/src/stream_context.rs
@ -3,7 +3,7 @@ use common::api::open_ai::{
    ChatCompletionStreamResponseServerEvents, ChatCompletionsRequest, ChatCompletionsResponse,
    Message, StreamOptions,
 };
-use common::configuration::LlmProvider;
+use common::configuration::{LlmProvider, LlmProviderType, Overrides};
 use common::consts::{
    ARCH_PROVIDER_HINT_HEADER, ARCH_ROUTING_HEADER, CHAT_COMPLETIONS_PATH, HEALTHZ_PATH,
    RATELIMIT_SELECTOR_HEADER_KEY, REQUEST_ID_HEADER, TRACE_PARENT_HEADER,
@ -42,6 +42,7 @@ pub struct StreamContext {
    request_body_sent_time: Option<u128>,
    user_message: Option<Message>,
    traces_queue: Arc<Mutex<VecDeque<TraceData>>>,
+    overrides: Rc<Option<Overrides>>,
 }

 impl StreamContext {
@ -50,10 +51,12 @@ impl StreamContext {
        metrics: Rc<Metrics>,
        llm_providers: Rc<LlmProviders>,
        traces_queue: Arc<Mutex<VecDeque<TraceData>>>,
+        overrides: Rc<Option<Overrides>>,
    ) -> Self {
        StreamContext {
            context_id,
            metrics,
+            overrides,
            ratelimit_selector: None,
            streaming_response: false,
            response_tokens: 0,
@ -91,7 +94,12 @@ impl StreamContext {
            self.get_http_request_header(ARCH_PROVIDER_HINT_HEADER)
                .unwrap_or_default(),
            self.llm_provider.as_ref().unwrap().name,
-            self.llm_provider.as_ref().unwrap().model
+            self.llm_provider
+                .as_ref()
+                .unwrap()
+                .model
+                .as_ref()
+                .unwrap_or(&String::new())
        );
    }

@ -151,11 +159,11 @@ impl StreamContext {
        // Tokenize and record token count.
        let token_count = tokenizer::token_count(model, json_string).unwrap_or(0);

+        trace!("Recorded input token count: {}", token_count);
        // Record the token count to metrics.
        self.metrics
            .input_sequence_length
            .record(token_count as u64);
-        trace!("Recorded input token count: {}", token_count);

        // Check if rate limiting needs to be applied.
        if let Some(selector) = self.ratelimit_selector.take() {
@ -184,24 +192,41 @@ impl HttpContext for StreamContext {
            return Action::Continue;
        }

-        self.select_llm_provider();
+        let routing_header_value = self.get_http_request_header(ARCH_ROUTING_HEADER);

-        // if endpoint is not set then use provider name as routing header so envoy can resolve the cluster name
-        if self.llm_provider().endpoint.is_none() {
+        let use_agent_orchestrator = match self.overrides.as_ref() {
+            Some(overrides) => overrides.use_agent_orchestrator.unwrap_or_default(),
+            None => false,
+        };
+
+        if let Some(routing_header_value) = routing_header_value.as_ref() {
+            debug!("routing header already set: {}", routing_header_value);
+            self.llm_provider = Some(Rc::new(LlmProvider {
+                name: routing_header_value.to_string(),
+                provider_interface: LlmProviderType::OpenAI,
+                access_key: None,
+                endpoint: None,
+                model: None,
+                default: None,
+                stream: None,
+                port: None,
+                rate_limits: None,
+            }));
+        } else {
+            self.select_llm_provider();
            self.add_http_request_header(
                ARCH_ROUTING_HEADER,
                &self.llm_provider().provider_interface.to_string(),
            );
-        } else {
-            self.add_http_request_header(ARCH_ROUTING_HEADER, &self.llm_provider().name);
-        }
-
-        if let Err(error) = self.modify_auth_headers() {
-            // ensure that the provider has an endpoint if the access key is missing else return a bad request
-            if self.llm_provider.as_ref().unwrap().endpoint.is_none() {
-                self.send_server_error(error, Some(StatusCode::BAD_REQUEST));
+            if let Err(error) = self.modify_auth_headers() {
+                // a missing access key is a bad request only when the provider has no endpoint and the agent orchestrator is off
+                if self.llm_provider.as_ref().unwrap().endpoint.is_none() && !use_agent_orchestrator
+                {
+                    self.send_server_error(error, Some(StatusCode::BAD_REQUEST));
+                }
            }
        }
+
        self.delete_content_length_header();
        self.save_ratelimit_header();

@ -230,34 +255,38 @@ impl HttpContext for StreamContext {
            return Action::Continue;
        }

+        let body_bytes = match self.get_http_request_body(0, body_size) {
+            Some(body_bytes) => body_bytes,
+            None => {
+                self.send_server_error(
+                    ServerError::LogicError(format!(
+                        "Failed to obtain body bytes even though body_size is {}",
+                        body_size
+                    )),
+                    None,
+                );
+                return Action::Pause;
+            }
+        };
+
        // Deserialize body into spec.
        // Currently OpenAI API.
        let mut deserialized_body: ChatCompletionsRequest =
-            match self.get_http_request_body(0, body_size) {
-                Some(body_bytes) => match serde_json::from_slice(&body_bytes) {
-                    Ok(deserialized) => deserialized,
-                    Err(e) => {
-                        self.send_server_error(
-                            ServerError::Deserialization(e),
-                            Some(StatusCode::BAD_REQUEST),
-                        );
-                        return Action::Pause;
-                    }
-                },
-                None => {
+            match serde_json::from_slice(&body_bytes) {
+                Ok(deserialized) => deserialized,
+                Err(e) => {
+                    debug!("body str: {}", String::from_utf8_lossy(&body_bytes));
                    self.send_server_error(
-                        ServerError::LogicError(format!(
-                            "Failed to obtain body bytes even though body_size is {}",
-                            body_size
-                        )),
-                        None,
+                        ServerError::Deserialization(e),
+                        Some(StatusCode::BAD_REQUEST),
                    );
                    return Action::Pause;
                }
            };

        // remove metadata from the request body
-        deserialized_body.metadata = None;
+        // TODO: move metadata removal to prompt gateway; it is disabled here, so metadata stays in the request body
+        // deserialized_body.metadata = None;
        // delete model key from message array
        for message in deserialized_body.messages.iter_mut() {
            message.model = None;
@ -270,10 +299,16 @@ impl HttpContext for StreamContext {
            .last()
            .cloned();

-        // override model name from the llm provider
-        deserialized_body
-            .model
-            .clone_from(&self.llm_provider.as_ref().unwrap().model);
+        let model_name = match self.llm_provider.as_ref() {
+            Some(llm_provider) => match llm_provider.model.as_ref() {
+                Some(model) => model,
+                None => "--",
+            },
+            None => "--",
+        };
+
+        deserialized_body.model = model_name.to_string();
+
        let chat_completion_request_str = serde_json::to_string(&deserialized_body).unwrap();

        trace!(
@ -469,6 +504,10 @@ impl HttpContext for StreamContext {
        };

        if self.streaming_response {
+            if body_utf8 == "data: [DONE]\n" {
+                return Action::Continue;
+            }
+
            let chat_completions_chunk_response_events =
                match ChatCompletionStreamResponseServerEvents::try_from(body_utf8.as_str()) {
                    Ok(response) => response,
@ -482,7 +521,10 @@ impl HttpContext for StreamContext {
                };

            if chat_completions_chunk_response_events.events.is_empty() {
-                debug!("empty streaming response");
+                debug!(
+                    "cound't parse any streaming events: body str: {}",
+                    body_utf8
+                );
                return Action::Continue;
            }

--- a/crates/llm_gateway/tests/integration.rs
+++ b/crates/llm_gateway/tests/integration.rs
@ -20,6 +20,11 @@ fn request_headers_expectations(module: &mut Tester, http_context: i32) {
        .call_proxy_on_request_headers(http_context, 0, false)
        .expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some(":path"))
        .returning(Some("/v1/chat/completions"))
+        .expect_get_header_map_value(
+            Some(MapType::HttpRequestHeaders),
+            Some("x-arch-llm-provider"),
+        )
+        .returning(None)
        .expect_get_header_map_value(
            Some(MapType::HttpRequestHeaders),
            Some("x-arch-llm-provider-hint"),
@ -36,6 +41,7 @@ fn request_headers_expectations(module: &mut Tester, http_context: i32) {
            Some("Authorization"),
            Some("Bearer secret_key"),
        )
+        .expect_remove_header_map_value(Some(MapType::HttpRequestHeaders), Some("content-length"))
        .expect_get_header_map_value(
            Some(MapType::HttpRequestHeaders),
            Some("x-arch-llm-provider-hint"),
@ -48,8 +54,6 @@ fn request_headers_expectations(module: &mut Tester, http_context: i32) {
        .returning(Some("selector-key"))
        .expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("selector-key"))
        .returning(Some("selector-value"))
-        .expect_get_header_map_pairs(Some(MapType::HttpRequestHeaders))
-        .returning(None)
        .expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some(":path"))
        .returning(Some("/v1/chat/completions"))
        .expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("x-request-id"))
@ -223,8 +227,8 @@ fn llm_gateway_successful_request_to_open_ai_chat_completions() {
        .returning(Some(chat_completions_request_body))
        .expect_log(Some(LogLevel::Trace), None)
        .expect_log(Some(LogLevel::Trace), None)
-        .expect_metric_record("input_sequence_length", 21)
        .expect_log(Some(LogLevel::Trace), None)
+        .expect_metric_record("input_sequence_length", 21)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
@ -266,7 +270,7 @@ fn llm_gateway_bad_request_to_open_ai_chat_completions() {
    {\
        \"messages\": [\
        {\
-            \"role\": \"system\",\
+            \"role\": \"system\"\
        },\
        {\
            \"role\": \"user\",\
@ -283,14 +287,19 @@ fn llm_gateway_bad_request_to_open_ai_chat_completions() {
        )
        .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
        .returning(Some(incomplete_chat_completions_request_body))
-        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Trace), None)
        .expect_send_local_response(
            Some(StatusCode::BAD_REQUEST.as_u16().into()),
            None,
            None,
            None,
        )
-        .execute_and_expect(ReturnType::Action(Action::Pause))
+        .expect_log(Some(LogLevel::Trace), None)
+        .expect_log(Some(LogLevel::Trace), None)
+        .expect_metric_record("input_sequence_length", 14)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .execute_and_expect(ReturnType::Action(Action::Continue))
        .unwrap();
 }

--- a/crates/prompt_gateway/src/filter_context.rs
+++ b/crates/prompt_gateway/src/filter_context.rs
@ -1,6 +1,8 @@
 use crate::metrics::Metrics;
 use crate::stream_context::StreamContext;
-use common::configuration::{Configuration, Overrides, PromptGuards, PromptTarget, Tracing};
+use common::configuration::{
+    Configuration, Endpoint, Overrides, PromptGuards, PromptTarget, Tracing,
+};
 use common::http::Client;
 use common::stats::Gauge;
 use log::trace;
@ -21,6 +23,7 @@ pub struct FilterContext {
    overrides: Rc<Option<Overrides>>,
    system_prompt: Rc<Option<String>>,
    prompt_targets: Rc<HashMap<String, PromptTarget>>,
+    endpoints: Rc<Option<HashMap<String, Endpoint>>>,
    prompt_guards: Rc<PromptGuards>,
    tracing: Rc<Option<Tracing>>,
 }
@ -34,6 +37,7 @@ impl FilterContext {
            prompt_targets: Rc::new(HashMap::new()),
            overrides: Rc::new(None),
            prompt_guards: Rc::new(PromptGuards::default()),
+            endpoints: Rc::new(None),
            tracing: Rc::new(None),
        }
    }
@ -73,6 +77,7 @@ impl RootContext for FilterContext {
        }
        self.system_prompt = Rc::new(config.system_prompt);
        self.prompt_targets = Rc::new(prompt_targets);
+        self.endpoints = Rc::new(config.endpoints);

        if let Some(prompt_guards) = config.prompt_guards {
            self.prompt_guards = Rc::new(prompt_guards)
@ -94,6 +99,7 @@ impl RootContext for FilterContext {
            Rc::clone(&self.metrics),
            Rc::clone(&self.system_prompt),
            Rc::clone(&self.prompt_targets),
+            Rc::clone(&self.endpoints),
            Rc::clone(&self.overrides),
            Rc::clone(&self.tracing),
        )))
--- a/crates/prompt_gateway/src/http_context.rs
+++ b/crates/prompt_gateway/src/http_context.rs
@ -4,7 +4,7 @@ use common::{
        self, ArchState, ChatCompletionStreamResponse, ChatCompletionTool, ChatCompletionsRequest,
    },
    consts::{
-        ARCH_FC_MODEL_NAME, ARCH_INTERNAL_CLUSTER_NAME, ARCH_STATE_HEADER,
+        ARCH_FC_MODEL_NAME, ARCH_INTERNAL_CLUSTER_NAME, ARCH_ROUTING_HEADER, ARCH_STATE_HEADER,
        ARCH_UPSTREAM_HOST_HEADER, ASSISTANT_ROLE, CHAT_COMPLETIONS_PATH, HEALTHZ_PATH,
        MODEL_SERVER_NAME, MODEL_SERVER_REQUEST_TIMEOUT_MS, REQUEST_ID_HEADER, TOOL_ROLE,
        TRACE_PARENT_HEADER, USER_ROLE,
@ -33,6 +33,27 @@ impl HttpContext for StreamContext {
        // manipulate the body in benign ways e.g., compression.
        self.set_http_request_header("content-length", None);

+        if let Some(overrides) = self.overrides.as_ref() {
+            if overrides.use_agent_orchestrator.unwrap_or_default() {
+                // route to the sole configured endpoint; exactly one endpoint must be configured in this mode
+                if let Some(endpoints) = self.endpoints.as_ref() {
+                    if endpoints.len() == 1 {
+                        let (name, _) = endpoints.iter().next().unwrap();
+                        debug!("Setting ARCH_PROVIDER_HINT_HEADER to {}", name);
+                        self.set_http_request_header(ARCH_ROUTING_HEADER, Some(name));
+                    } else {
+                        warn!("Need single endpoint when use_agent_orchestrator is set");
+                        self.send_server_error(
+                            ServerError::LogicError(
+                                "Need single endpoint when use_agent_orchestrator is set".to_string(),
+                            ),
+                            None,
+                        );
+                    }
+                }
+            }
+        }
+
        let request_path = self.get_http_request_header(":path").unwrap_or_default();
        if request_path == HEALTHZ_PATH {
            self.send_http_response(200, vec![], None);
@ -49,6 +70,7 @@ impl HttpContext for StreamContext {

        self.request_id = self.get_http_request_header(REQUEST_ID_HEADER);
        self.traceparent = self.get_http_request_header(TRACE_PARENT_HEADER);
+
        Action::Continue
    }

@ -152,6 +174,18 @@ impl HttpContext for StreamContext {
            }
        }

+        if let Some(overrides) = self.overrides.as_ref() {
+            if overrides.use_agent_orchestrator.unwrap_or_default() {
+                if metadata.is_none() {
+                    metadata = Some(HashMap::new());
+                }
+                metadata
+                    .as_mut()
+                    .unwrap()
+                    .insert("use_agent_orchestrator".to_string(), "true".to_string());
+            }
+        }
+
        let arch_fc_chat_completion_request = ChatCompletionsRequest {
            messages: deserialized_body.messages.clone(),
            metadata,
--- a/crates/prompt_gateway/src/stream_context.rs
+++ b/crates/prompt_gateway/src/stream_context.rs
@ -4,7 +4,7 @@ use common::api::open_ai::{
    to_server_events, ArchState, ChatCompletionStreamResponse, ChatCompletionsRequest,
    ChatCompletionsResponse, Message, ToolCall,
 };
-use common::configuration::{Overrides, PromptTarget, Tracing};
+use common::configuration::{Endpoint, Overrides, PromptTarget, Tracing};
 use common::consts::{
    API_REQUEST_TIMEOUT_MS, ARCH_FC_MODEL_NAME, ARCH_INTERNAL_CLUSTER_NAME,
    ARCH_UPSTREAM_HOST_HEADER, ASSISTANT_ROLE, DEFAULT_TARGET_REQUEST_TIMEOUT_MS, MESSAGES_KEY,
@ -46,6 +46,7 @@ pub struct StreamCallContext {
 pub struct StreamContext {
    system_prompt: Rc<Option<String>>,
    pub prompt_targets: Rc<HashMap<String, PromptTarget>>,
+    pub endpoints: Rc<Option<HashMap<String, Endpoint>>>,
    pub overrides: Rc<Option<Overrides>>,
    pub metrics: Rc<Metrics>,
    pub callouts: RefCell<HashMap<u32, StreamCallContext>>,
@ -72,6 +73,7 @@ impl StreamContext {
        metrics: Rc<Metrics>,
        system_prompt: Rc<Option<String>>,
        prompt_targets: Rc<HashMap<String, PromptTarget>>,
+        endpoints: Rc<Option<HashMap<String, Endpoint>>>,
        overrides: Rc<Option<Overrides>>,
        tracing: Rc<Option<Tracing>>,
    ) -> Self {
@ -80,6 +82,7 @@ impl StreamContext {
            metrics,
            system_prompt,
            prompt_targets,
+            endpoints,
            callouts: RefCell::new(HashMap::new()),
            chat_completions_request: None,
            tool_calls: None,
@ -312,12 +315,59 @@ impl StreamContext {
        callout_context.prompt_target_name =
            Some(self.tool_calls.as_ref().unwrap()[0].function.name.clone());

+        if let Some(overrides) = self.overrides.as_ref() {
+            if overrides.use_agent_orchestrator.unwrap_or_default() {
+                let mut metadata = HashMap::new();
+                metadata.insert("use_agent_orchestrator".to_string(), "true".to_string());
+
+                metadata.insert(
+                    "agent-name".to_string(),
+                    callout_context
+                        .prompt_target_name
+                        .as_ref()
+                        .unwrap()
+                        .to_string(),
+                );
+
+                if let Some(overrides) = self.overrides.as_ref() {
+                    if overrides.optimize_context_window.unwrap_or_default() {
+                        metadata.insert("optimize_context_window".to_string(), "true".to_string());
+                    }
+                }
+
+                if let Some(overrides) = self.overrides.as_ref() {
+                    if overrides.use_agent_orchestrator.unwrap_or_default() {
+                        metadata.insert("use_agent_orchestrator".to_string(), "true".to_string());
+                    }
+                }
+
+                let messages = self.construct_llm_messages(&callout_context);
+
+                let chat_completion_request = ChatCompletionsRequest {
+                    model: callout_context.request_body.model.clone(),
+                    messages,
+                    tools: None,
+                    stream: callout_context.request_body.stream,
+                    stream_options: callout_context.request_body.stream_options.clone(),
+                    metadata: Some(metadata),
+                };
+
+                let body_str = serde_json::to_string(&chat_completion_request).unwrap();
+                debug!("sending request to llm agent: {}", body_str);
+                self.set_http_request_body(0, self.request_body_size, body_str.as_bytes());
+                self.resume_http_request();
+                return;
+            }
+        }
+
        self.schedule_api_call_request(callout_context);
    }

    fn schedule_api_call_request(&mut self, mut callout_context: StreamCallContext) {
+        // Clone the prompt target (below) instead of borrowing it, to avoid borrow conflicts with `self`
+
        let tools_call_name = self.tool_calls.as_ref().unwrap()[0].function.name.clone();
-        let prompt_target = self.prompt_targets.get(&tools_call_name).unwrap();
+        let prompt_target = self.prompt_targets.get(&tools_call_name).unwrap().clone();
        let tool_params = &self.tool_calls.as_ref().unwrap()[0].function.arguments;
        let endpoint_details = prompt_target.endpoint.as_ref().unwrap();
        let endpoint_path: String = endpoint_details
@ -329,7 +379,7 @@ impl StreamContext {
        let http_method = endpoint_details.method.clone().unwrap_or_default();
        let prompt_target_params = prompt_target.parameters.clone().unwrap_or_default();

-        let (path, body) = match compute_request_path_body(
+        let (path, api_call_body) = match compute_request_path_body(
            &endpoint_path,
            tool_params,
            &prompt_target_params,
@ -346,6 +396,8 @@ impl StreamContext {
            }
        };

+        debug!("api call body {:?}", api_call_body);
+
        let timeout_str = API_REQUEST_TIMEOUT_MS.to_string();

        let http_method_str = http_method.to_string();
@ -375,11 +427,12 @@ impl StreamContext {
            headers.insert(key.as_str(), value.as_str());
        }

+
        let call_args = CallArgs::new(
            ARCH_INTERNAL_CLUSTER_NAME,
            &path,
            headers.into_iter().collect(),
-            body.as_deref().map(|s| s.as_bytes()),
+            api_call_body.as_deref().map(|s| s.as_bytes()),
            vec![],
            Duration::from_secs(5),
        );
@ -406,6 +459,11 @@ impl StreamContext {
            "developer api call response received: status code: {}",
            http_status
        );
+        let prompt_target = self
+            .prompt_targets
+            .get(callout_context.prompt_target_name.as_ref().unwrap())
+            .unwrap()
+            .clone();
        if http_status != StatusCode::OK.as_str() {
            warn!(
                "api server responded with non 2xx status code: {}",
@ -441,6 +499,40 @@ impl StreamContext {
            }
        };

+        if !prompt_target
+            .auto_llm_dispatch_on_response
+            .unwrap_or(true)
+        {
+            let tool_call_response = self.tool_call_response.as_ref().unwrap().clone();
+
+            let direct_response_str = if self.streaming_response {
+                let chunks = vec![
+                    ChatCompletionStreamResponse::new(
+                        None,
+                        Some(ASSISTANT_ROLE.to_string()),
+                        Some(ARCH_FC_MODEL_NAME.to_owned()),
+                        None,
+                    ),
+                    ChatCompletionStreamResponse::new(
+                        Some(tool_call_response.clone()),
+                        None,
+                        Some(ARCH_FC_MODEL_NAME.to_owned()),
+                        None,
+                    ),
+                ];
+
+                to_server_events(chunks)
+            } else {
+                tool_call_response
+            };
+
+            return self.send_http_response(
+                StatusCode::OK.as_u16().into(),
+                vec![],
+                Some(direct_response_str.as_bytes()),
+            );
+        }
+
        let final_prompt = format!(
            "{}\ncontext: {}",
            user_message.content.unwrap(),
@ -565,7 +657,7 @@ impl StreamContext {
        // check if the default target should be dispatched to the LLM provider
        if !prompt_target
            .auto_llm_dispatch_on_response
-            .unwrap_or_default()
+            .unwrap_or(true)
        {
            let default_target_response_str = if self.streaming_response {
                let chat_completion_response =
--- a/crates/prompt_gateway/src/tools.rs
+++ b/crates/prompt_gateway/src/tools.rs
@ -4,8 +4,13 @@ use std::collections::HashMap;
 use serde_yaml::Value;

 // only add params that are of string, number and bool type
-pub fn filter_tool_params(tool_params: &HashMap<String, Value>) -> HashMap<String, String> {
+pub fn filter_tool_params(tool_params: &Option<HashMap<String, Value>>) -> HashMap<String, String> {
+    if tool_params.is_none() {
+        return HashMap::new();
+    }
    tool_params
+        .as_ref()
+        .unwrap()
        .iter()
        .filter(|(_, value)| value.is_number() || value.is_string() || value.is_bool())
        .map(|(key, value)| match value {
@ -22,7 +27,7 @@ pub fn filter_tool_params(tool_params: &HashMap<String, Value>) -> HashMap<Strin

 pub fn compute_request_path_body(
    endpoint_path: &str,
-    tool_params: &HashMap<String, Value>,
+    tool_params: &Option<HashMap<String, Value>>,
    prompt_target_params: &[Parameter],
    http_method: &HttpMethod,
 ) -> Result<(String, Option<String>), String> {
--- a/crates/prompt_gateway/tests/integration.rs
+++ b/crates/prompt_gateway/tests/integration.rs
@ -352,10 +352,10 @@ fn prompt_gateway_request_to_llm_gateway() {
                    tool_type: ToolType::Function,
                    function: FunctionCallDetail {
                        name: String::from("weather_forecast"),
-                        arguments: HashMap::from([(
+                        arguments: Some(HashMap::from([(
                            String::from("city"),
                            Value::String(String::from("seattle")),
-                        )]),
+                        )])),
                    },
                }]),
                model: None,
@ -381,8 +381,8 @@ fn prompt_gateway_request_to_llm_gateway() {
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Trace), None)
        .expect_log(Some(LogLevel::Debug), None)
-        .expect_log(Some(LogLevel::Trace), None)
        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Trace), None)
        .expect_http_call(
            Some("arch_internal"),
            Some(vec![
@ -400,6 +400,7 @@ fn prompt_gateway_request_to_llm_gateway() {
        )
        .returning(Some(2))
        .expect_metric_increment("active_http_calls", 1)
+        .expect_log(Some(LogLevel::Trace), None)
        .execute_and_expect(ReturnType::None)
        .unwrap();

@ -499,10 +500,10 @@ fn prompt_gateway_request_no_intent_match() {
                    tool_type: ToolType::Function,
                    function: FunctionCallDetail {
                        name: String::from("weather_forecast"),
-                        arguments: HashMap::from([(
+                        arguments: Some(HashMap::from([(
                            String::from("city"),
                            Value::String(String::from("seattle")),
-                        )]),
+                        )])),
                    },
                }]),
                model: None,
@ -655,10 +656,10 @@ fn prompt_gateway_request_no_intent_match_default_target() {
                    tool_type: ToolType::Function,
                    function: FunctionCallDetail {
                        name: String::from("weather_forecast"),
-                        arguments: HashMap::from([(
+                        arguments: Some(HashMap::from([(
                            String::from("city"),
                            Value::String(String::from("seattle")),
-                        )]),
+                        )])),
                    },
                }]),
                model: None,