pending changes

2026-06-17 15:25:17 +02:00 · 2025-12-15 18:17:15 -08:00 · 2025-12-15 18:17:15 -08:00 · 358fa856c4
commit 358fa856c4
parent afffa11e91
21 changed files with 1195 additions and 403 deletions
--- a/arch/supervisord.conf
+++ b/arch/supervisord.conf
@ -2,7 +2,7 @@
 nodaemon=true

 [program:brightstaff]
-command=sh -c "RUST_LOG=info /app/brightstaff 2>&1 | tee /var/log/brightstaff.log | while IFS= read -r line; do echo '[brightstaff]' \"$line\"; done"
+command=sh -c "RUST_LOG=debug /app/brightstaff 2>&1 | tee /var/log/brightstaff.log | while IFS= read -r line; do echo '[brightstaff]' \"$line\"; done"
 stdout_logfile=/dev/stdout
 redirect_stderr=true
 stdout_logfile_maxbytes=0
--- a/arch/tools/cli/config_generator.py
+++ b/arch/tools/cli/config_generator.py
@ -101,8 +101,17 @@ def validate_and_render_schema():

    # Process agents section and convert to endpoints
    agents = config_yaml.get("agents", [])
-    for agent in agents:
+    agent_filters = config_yaml.get("agent_filters", [])
+    agents_combined = agents + agent_filters
+    agent_id_keys = set()
+
+    for agent in agents_combined:
        agent_id = agent.get("id")
+        if agent_id in agent_id_keys:
+            raise Exception(
+                f"Duplicate agent id {agent_id}, please provide unique id for each agent"
+            )
+        agent_id_keys.add(agent_id)
        agent_endpoint = agent.get("url")

        if agent_id and agent_endpoint:
--- a/build_filter_image.sh
+++ b/build_filter_image.sh
@ -1 +1 @@
-docker build  -f arch/Dockerfile . -t katanemo/archgw -t katanemo/archgw:0.3.2
+docker build  -f arch/Dockerfile . -t katanemo/archgw -t katanemo/archgw:0.3.21
--- a/crates/brightstaff/src/handlers/agent_chat_completions.rs
+++ b/crates/brightstaff/src/handlers/agent_chat_completions.rs
@ -81,7 +81,7 @@ async fn handle_agent_chat(
 ) -> Result<Response<BoxBody<Bytes, hyper::Error>>, AgentFilterChainError> {
    // Initialize services
    let agent_selector = AgentSelector::new(router_service);
-    let pipeline_processor = PipelineProcessor::default();
+    let mut pipeline_processor = PipelineProcessor::default();
    let response_handler = ResponseHandler::new();

    // Extract listener name from headers
@ -144,9 +144,9 @@ async fn handle_agent_chat(
    debug!("Processing agent pipeline: {}", selected_agent.id);

    // Process the filter chain
-    let processed_messages = pipeline_processor
+    let chat_history = pipeline_processor
        .process_filter_chain(
-            &chat_completions_request,
+            &chat_completions_request.messages,
            &selected_agent,
            &agent_map,
            &request_headers,
@ -161,8 +161,8 @@ async fn handle_agent_chat(
    debug!("Terminal agent details: {:?}", terminal_agent);

    let llm_response = pipeline_processor
-        .invoke_upstream_agent(
-            &processed_messages,
+        .invoke_terminal_agent(
+            &chat_history,
            &chat_completions_request,
            terminal_agent,
            &request_headers,
--- a/crates/brightstaff/src/handlers/agent_selector.rs
+++ b/crates/brightstaff/src/handlers/agent_selector.rs
@ -8,7 +8,6 @@ use hermesllm::apis::openai::Message;
 use tracing::{debug, warn};

 use crate::router::llm_router::RouterService;
-use crate::utils::mcp_client::McpClient;

 /// Errors that can occur during agent selection
 #[derive(Debug, thiserror::Error)]
@ -28,14 +27,12 @@ pub enum AgentSelectionError {
 /// Service for selecting agents based on routing preferences and listener configuration
 pub struct AgentSelector {
    router_service: Arc<RouterService>,
-    mcp_client: McpClient,
 }

 impl AgentSelector {
    pub fn new(router_service: Arc<RouterService>) -> Self {
        Self {
            router_service,
-            mcp_client: McpClient::new(),
        }
    }

@ -152,7 +149,7 @@ impl AgentSelector {
        for agent_chain in agents {
            // Get the actual agent from the agent_map
            let agent = agent_map.get(&agent_chain.id);
-            
+
            // Determine the description to use
            let description = if let Some(agent) = agent {
                // Check if this is an MCP agent (URL starts with mcp://)
@ -161,36 +158,10 @@ impl AgentSelector {
                        "Agent {} is an MCP agent, fetching tool description from: {}",
                        agent.id, agent.url
                    );
-                    
-                    // Fetch description from MCP endpoint
-                    match self
-                        .mcp_client
-                        .fetch_tool_description(&agent.url, agent.tool.as_deref())
-                        .await
-                    {
-                        Ok(mcp_description) => {
-                            if !mcp_description.is_empty() {
-                                debug!(
-                                    "Fetched MCP description for agent {}: {}",
-                                    agent.id, mcp_description
-                                );
-                                mcp_description
-                            } else {
-                                warn!(
-                                    "MCP tool description is empty for agent {}, using config description",
-                                    agent.id
-                                );
-                                agent_chain.description.clone().unwrap_or_default()
-                            }
-                        }
-                        Err(e) => {
-                            warn!(
-                                "Failed to fetch MCP description for agent {}: {}, using config description",
-                                agent.id, e
-                            );
-                            agent_chain.description.clone().unwrap_or_default()
-                        }
-                    }
+
+                    //TODO: fetch description from mcp server
+
+                    "MCP tool description placeholder from config".to_string()
                } else {
                    // Not an MCP agent, use description from config
                    agent_chain.description.clone().unwrap_or_default()
--- a/crates/brightstaff/src/handlers/jsonrpc.rs
+++ b/crates/brightstaff/src/handlers/jsonrpc.rs
@ -0,0 +1,44 @@
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+
+/// Identifier used to pair a JSON-RPC response with the request that caused it.
+///
+/// Because of `#[serde(untagged)]` the value is (de)serialized as a bare JSON string
+/// or a bare unsigned integer, with no variant name wrapped around it. Any other JSON
+/// value (for example `null` or a negative number) matches neither variant and fails
+/// to deserialize.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(untagged)]
+pub enum JsonRpcId {
+  String(String),
+  Number(u64),
+}
+
+/// A JSON-RPC request: a method call that carries an `id`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct JsonRpcRequest {
+  /// Protocol version string (e.g. `"2.0"`).
+  pub jsonrpc: String,
+  /// Request identifier.
+  pub id: JsonRpcId,
+  /// Name of the method to invoke.
+  pub method: String,
+  /// Named parameters; the field is left out of the serialized JSON when `None`.
+  #[serde(skip_serializing_if = "Option::is_none")]
+  pub params: Option<HashMap<String, serde_json::Value>>,
+}
+
+/// A JSON-RPC notification: same shape as a request but without an `id` field.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct JsonRpcNotification {
+  /// Protocol version string (e.g. `"2.0"`).
+  pub jsonrpc: String,
+  /// Name of the notification method.
+  pub method: String,
+  /// Named parameters; the field is left out of the serialized JSON when `None`.
+  #[serde(skip_serializing_if = "Option::is_none")]
+  pub params: Option<HashMap<String, serde_json::Value>>,
+}
+
+/// Error object carried in the `error` field of a [`JsonRpcResponse`].
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct JsonRpcError {
+  /// Numeric error code.
+  pub code: i32,
+  /// Human-readable description of the error.
+  pub message: String,
+  /// Optional extra payload of any JSON type; left out of the serialized JSON when `None`.
+  #[serde(skip_serializing_if = "Option::is_none")]
+  pub data: Option<serde_json::Value>,
+}
+
+/// A JSON-RPC response.
+///
+/// `result` and `error` are both optional here; each is left out of the serialized
+/// JSON when it is `None`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct JsonRpcResponse {
+  /// Protocol version string (e.g. `"2.0"`).
+  pub jsonrpc: String,
+  /// Identifier of the request being answered.
+  pub id: JsonRpcId,
+  /// Result payload. Only a JSON object can be deserialized into this map type.
+  #[serde(skip_serializing_if = "Option::is_none")]
+  pub result: Option<HashMap<String, serde_json::Value>>,
+  /// Error details, when present.
+  #[serde(skip_serializing_if = "Option::is_none")]
+  pub error: Option<JsonRpcError>,
+}
--- a/crates/brightstaff/src/handlers/mod.rs
+++ b/crates/brightstaff/src/handlers/mod.rs
@ -6,6 +6,7 @@ pub mod function_calling;
 pub mod pipeline_processor;
 pub mod response_handler;
 pub mod utils;
+pub mod jsonrpc;

 #[cfg(test)]
 mod integration_tests;
--- a/crates/brightstaff/src/handlers/pipeline_processor.rs
+++ b/crates/brightstaff/src/handlers/pipeline_processor.rs
@ -4,7 +4,10 @@ use common::configuration::{Agent, AgentFilterChain};
 use common::consts::{ARCH_UPSTREAM_HOST_HEADER, ENVOY_RETRY_HEADER};
 use hermesllm::apis::openai::{ChatCompletionsRequest, Message};
 use hyper::header::HeaderMap;
-use tracing::{debug, warn};
+use tracing::{debug, info, warn};
+
+use crate::handlers::jsonrpc::{JsonRpcId, JsonRpcNotification, JsonRpcRequest, JsonRpcResponse};
+use uuid::Uuid;

 /// Errors that can occur during pipeline processing
 #[derive(Debug, thiserror::Error)]
@ -25,13 +28,17 @@ pub enum PipelineError {
 pub struct PipelineProcessor {
    client: reqwest::Client,
    url: String,
+    agent_id_session_map: HashMap<String, String>,
 }

+const ENVOY_API_ROUTER_ADDRESS: &str = "http://localhost:11000";
+
 impl Default for PipelineProcessor {
    fn default() -> Self {
        Self {
            client: reqwest::Client::new(),
-            url: "http://localhost:11000/v1/chat/completions".to_string(),
+            url: ENVOY_API_ROUTER_ADDRESS.to_string(),
+            agent_id_session_map: HashMap::new(),
        }
    }
 }
@ -41,18 +48,20 @@ impl PipelineProcessor {
        Self {
            client: reqwest::Client::new(),
            url,
+            agent_id_session_map: HashMap::new(),
        }
    }

    /// Process the filter chain of agents (all except the terminal agent)
    pub async fn process_filter_chain(
-        &self,
-        initial_request: &ChatCompletionsRequest,
+        &mut self,
+        chat_history: &[Message],
        agent_filter_chain: &AgentFilterChain,
        agent_map: &HashMap<String, Agent>,
        request_headers: &HeaderMap,
    ) -> Result<Vec<Message>, PipelineError> {
-        let mut chat_completions_history = initial_request.messages.clone();
+
+        let mut chat_history_updated = chat_history.to_vec();

        for agent_name in &agent_filter_chain.filter_chain {
            debug!("Processing filter agent: {}", agent_name);
@ -61,47 +70,83 @@ impl PipelineProcessor {
                .get(agent_name)
                .ok_or_else(|| PipelineError::AgentNotFound(agent_name.clone()))?;

-            debug!("Agent details: {:?}", agent);
+            let tool_name = agent.tool.as_deref().unwrap_or(&agent.id);

-            let response_content = self
-                .send_agent_filter_chain_request(
-                    &chat_completions_history,
-                    initial_request,
+            info!("executing filter: {}/{}, url: {}, conversation length: {}", agent_name, tool_name, agent.url, chat_history.len());
+
+            chat_history_updated = self
+                .execute_filter(
+                    &chat_history_updated,
                    agent,
                    request_headers,
                )
                .await?;

-            debug!("Received response from filter agent {}", agent_name);
-
-            // Parse the response content as new message history
-            chat_completions_history =
-                serde_json::from_str(&response_content).inspect_err(|err| {
-                    warn!(
-                        "Failed to parse response from agent {}, err: {}, response: {}",
-                        agent_name, err, response_content
-                    )
-                })?;
+            info!("Received response: updated conversation length: {}", chat_history.len());
        }

-        Ok(chat_completions_history)
+        Ok(chat_history_updated)
    }

    /// Send request to a specific agent and return the response content
-    async fn send_agent_filter_chain_request(
-        &self,
+    async fn execute_filter(
+        &mut self,
        messages: &[Message],
-        original_request: &ChatCompletionsRequest,
        agent: &Agent,
        request_headers: &HeaderMap,
-    ) -> Result<String, PipelineError> {
-        let mut request = original_request.clone();
-        request.messages = messages.to_vec();
+    ) -> Result<Vec<Message>, PipelineError> {

-        let request_body = serde_json::to_string(&request)?;
-        debug!("Sending request to agent {}", agent.id);
+        let mcp_session_id = if let Some(session_id) = self.agent_id_session_map.get(&agent.id) {
+            session_id.clone()
+        } else {
+            let session_id = self.get_new_session_id(&agent.id).await;
+            self.agent_id_session_map
+                .insert(agent.id.clone(), session_id.clone());
+            session_id
+        };
+
+        // let mut request = original_request.clone();
+        // request.messages = messages.to_vec();
+
+        let tool_name = agent.tool.as_deref().unwrap_or(&agent.id);
+
+        let arguments = serde_json::json!({
+            "messages": messages
+        });
+
+        let params = serde_json::json!({
+            "name": tool_name,
+            "arguments": arguments
+        });
+
+        let json_rpc_request = JsonRpcRequest {
+            jsonrpc: "2.0".to_string(),
+            id: JsonRpcId::String(Uuid::new_v4().to_string()),
+            method: "tools/call".to_string(),
+            params: Some(serde_json::from_value(params)?),
+        };
+
+        let request_body = serde_json::to_string(&json_rpc_request)?;
+        info!("Sending request to agent {}", agent.id);
+        info!("Request body: {}", request_body);
+
+        // Pretty print for debugging
+        let pretty_body = serde_json::to_string_pretty(&json_rpc_request)?;
+        info!("Request body (pretty):\n{}", pretty_body);

        let mut agent_headers = request_headers.clone();
+        info!("Using MCP session ID {} for agent {}", mcp_session_id, agent.id);
+
+        // Log all headers being sent
+        info!("Headers being sent:");
+        for (key, value) in agent_headers.iter() {
+            info!("  {}: {:?}", key, value);
+        }
+
+        agent_headers.insert(
+            "mcp-session-id",
+            hyper::header::HeaderValue::from_str(&mcp_session_id).unwrap(),
+        );
        agent_headers.remove(hyper::header::CONTENT_LENGTH);
        agent_headers.insert(
            ARCH_UPSTREAM_HOST_HEADER,
@ -114,9 +159,24 @@ impl PipelineProcessor {
            hyper::header::HeaderValue::from_str("3").unwrap(),
        );

+        agent_headers.insert(
+            "Accept",
+            hyper::header::HeaderValue::from_static("application/json, text/event-stream"),
+        );
+
+        agent_headers.insert(
+            "Content-Type",
+            hyper::header::HeaderValue::from_static("application/json"),
+        );
+
+        info!("Final headers being sent:");
+        for (key, value) in agent_headers.iter() {
+            info!("  {}: {:?}", key, value);
+        }
+
        let response = self
            .client
-            .post(&self.url)
+            .post(format!("{}/mcp", self.url))
            .headers(agent_headers)
            .body(request_body)
            .send()
@ -124,24 +184,149 @@ impl PipelineProcessor {

        let response_bytes = response.bytes().await?;

-        // Parse the response as JSON to extract the content
-        let response_json: serde_json::Value = serde_json::from_slice(&response_bytes)?;
+        info!(
+            "response bytes in str: {}",
+            String::from_utf8_lossy(&response_bytes)
+        );

-        let content = response_json
-            .get("choices")
-            .and_then(|choices| choices.as_array())
-            .and_then(|choices| choices.first())
-            .and_then(|choice| choice.get("message"))
-            .and_then(|message| message.get("content"))
-            .and_then(|content| content.as_str())
+        let response_str = String::from_utf8_lossy(&response_bytes);
+        let lines: Vec<&str> = response_str.lines().collect();
+
+        // Validate SSE format: first line should be "event: message"
+        if lines.is_empty() || lines[0] != "event: message" {
+            warn!("Invalid SSE response format from agent {}: expected 'event: message' as first line, got: {:?}", agent.id, lines.first());
+            return Err(PipelineError::NoContentInResponse(format!(
+                "Invalid SSE response format from agent {}: expected 'event: message' as first line",
+                agent.id
+            )));
+        }
+
+        // Find the data line
+        let data_lines: Vec<&str> = lines
+            .iter()
+            .filter(|line| line.starts_with("data: "))
+            .copied()
+            .collect();
+
+        if data_lines.len() != 1 {
+            warn!(
+                "Expected exactly one 'data:' line from agent {}, found {}",
+                agent.id,
+                data_lines.len()
+            );
+            return Err(PipelineError::NoContentInResponse(format!(
+                "Expected exactly one 'data:' line from agent {}, found {}",
+                agent.id,
+                data_lines.len()
+            )));
+        }
+
+        let data_chunk = &data_lines[0][6..]; // Skip "data: " prefix
+
+        let response: JsonRpcResponse = serde_json::from_str(data_chunk)?;
+        let response_result = response
+            .result
+            .ok_or_else(|| PipelineError::NoChoicesInResponse(agent.id.clone()))?;
+
+        let response_json = response_result
+            .get("structuredContent")
+            .ok_or_else(|| PipelineError::NoChoicesInResponse(agent.id.clone()))?;
+        // Parse the response as JSON to extract the content
+        // let response_json: serde_json::Value = serde_json::from_slice(&response_bytes)?;
+
+        let messages: Vec<Message> = response_json
+            .get("result")
+            .and_then(|v| v.as_array())
            .ok_or_else(|| PipelineError::NoContentInResponse(agent.id.clone()))?
+            .iter()
+            .map(|msg_value| serde_json::from_value(msg_value.clone()))
+            .collect::<Result<Vec<Message>, _>>()
+            .map_err(PipelineError::ParseError)?;
+
+        Ok(messages)
+    }
+
+    async fn get_new_session_id(&self, agent_id: &str) -> String {
+        let initialize_request = JsonRpcRequest {
+            jsonrpc: "2.0".to_string(),
+            id: JsonRpcId::Number(1),
+            method: "initialize".to_string(),
+            params: Some({
+                let mut params = HashMap::new();
+                params.insert(
+                    "protocolVersion".to_string(),
+                    serde_json::Value::String("2024-11-05".to_string()),
+                );
+                params.insert("capabilities".to_string(), serde_json::json!({}));
+                params.insert(
+                    "clientInfo".to_string(),
+                    serde_json::json!({
+                        "name": "brightstaff",
+                        "version": "1.0.0"
+                    }),
+                );
+                params
+            }),
+        };
+
+        let request_body = serde_json::to_string(&initialize_request).unwrap();
+
+        info!("Initializing MCP session for agent {}", agent_id);
+        info!("Initialize request body: {}", request_body);
+
+        let response = self
+            .client
+            .post(format!("{}/mcp", self.url))
+            .header("Content-Type", "application/json")
+            .header("Accept", "application/json, text/event-stream")
+            .header(ARCH_UPSTREAM_HOST_HEADER, agent_id)
+            .body(request_body)
+            .send()
+            .await
+            .expect("Failed to initialize MCP session");
+
+        info!("Initialize response status: {}", response.status());
+        info!("Initialize response headers: {:?}", response.headers());
+
+        let session_id = response
+            .headers()
+            .get("mcp-session-id")
+            .and_then(|v| v.to_str().ok())
+            .expect("No mcp-session-id in response")
            .to_string();

-        Ok(content)
+        info!("Created new MCP session for agent {}: {}", agent_id, session_id);
+
+        // Send initialized notification (without id field per JSON-RPC 2.0 spec)
+        let initialized_notification = JsonRpcNotification {
+            jsonrpc: "2.0".to_string(),
+            method: "notifications/initialized".to_string(),
+            params: None,
+        };
+
+        let notification_body = serde_json::to_string(&initialized_notification).unwrap();
+
+        info!("Sending initialized notification: {}", notification_body);
+
+        let notif_response = self
+            .client
+            .post(format!("{}/mcp", self.url))
+            .header("Content-Type", "application/json")
+            .header("Accept", "application/json, text/event-stream")
+            .header("mcp-session-id", &session_id)
+            .header(ARCH_UPSTREAM_HOST_HEADER, agent_id)
+            .body(notification_body)
+            .send()
+            .await
+            .expect("Failed to send initialized notification");
+
+        info!("Initialized notification response status: {}", notif_response.status());
+
+        session_id
    }

    /// Send request to terminal agent and return the raw response for streaming
-    pub async fn invoke_upstream_agent(
+    pub async fn invoke_terminal_agent(
        &self,
        messages: &[Message],
        original_request: &ChatCompletionsRequest,
@ -169,7 +354,7 @@ impl PipelineProcessor {

        let response = self
            .client
-            .post(&self.url)
+            .post(format!("{}/v1/chat/completions", self.url))
            .headers(agent_headers)
            .body(request_body)
            .send()
--- a/crates/brightstaff/src/main.rs
+++ b/crates/brightstaff/src/main.rs
@ -5,7 +5,7 @@ use brightstaff::handlers::function_calling::{function_calling_chat_handler};
 use brightstaff::router::llm_router::RouterService;
 use brightstaff::utils::tracing::init_tracer;
 use bytes::Bytes;
-use common::configuration::Configuration;
+use common::configuration::{Agent, Configuration};
 use common::consts::{CHAT_COMPLETIONS_PATH, MESSAGES_PATH, OPENAI_RESPONSES_API_PATH};
 use http_body_util::{combinators::BoxBody, BodyExt, Empty};
 use hyper::body::Incoming;
@ -63,9 +63,18 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {

    let arch_config = Arc::new(config);

+    // combine agents and agent_filters into a single list of agents
+    let all_agents: Vec<Agent> = arch_config
+        .agents
+        .as_deref()
+        .unwrap_or_default()
+        .iter()
+        .chain(arch_config.agent_filters.as_deref().unwrap_or_default())
+        .cloned()
+        .collect();
+
    let llm_providers = Arc::new(RwLock::new(arch_config.model_providers.clone()));
-    let agents_list = Arc::new(RwLock::new(arch_config.agents.clone()));
-    let agent_filters = Arc::new(RwLock::new(arch_config.agent_filters.clone()));
+    let agents_list = Arc::new(RwLock::new(Some(all_agents)));
    let listeners = Arc::new(RwLock::new(arch_config.listeners.clone()));

    debug!(
@ -112,7 +121,6 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {

        let llm_providers = llm_providers.clone();
        let agents_list = agents_list.clone();
-        let agent_filters = agent_filters.clone();
        let listeners = listeners.clone();
        let service = service_fn(move |req| {
            let router_service = Arc::clone(&router_service);
@ -121,7 +129,6 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
            let llm_providers = llm_providers.clone();
            let model_aliases = Arc::clone(&model_aliases);
            let agents_list = agents_list.clone();
-            let agent_filters = agent_filters.clone();
            let listeners = listeners.clone();

            async move {
--- a/crates/build.sh
+++ b/crates/build.sh
@ -0,0 +1 @@
+cargo build --release --target wasm32-wasip1 -p prompt_gateway -p llm_gateway && cargo build --release -p brightstaff
--- a/crates/common/src/configuration.rs
+++ b/crates/common/src/configuration.rs
@ -21,16 +21,10 @@ pub struct ModelAlias {
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct Agent {
    pub id: String,
+    pub transport: Option<String>,
+    pub tool: Option<String>,
+    pub url: String,
    pub kind: Option<String>,
-    pub url: String,
-    pub tool: Option<String>,
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct AgentFilter {
-    pub id: String,
-    pub url: String,
-    pub tool: Option<String>,
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
@ -65,7 +59,7 @@ pub struct Configuration {
    pub mode: Option<GatewayMode>,
    pub routing: Option<Routing>,
    pub agents: Option<Vec<Agent>>,
-    pub agent_filters: Option<Vec<AgentFilter>>,
+    pub agent_filters: Option<Vec<Agent>>,
    pub listeners: Vec<Listener>,
 }

--- a/demos/use_cases/rag_agent/arch_config.yaml
+++ b/demos/use_cases/rag_agent/arch_config.yaml
@ -2,23 +2,21 @@ version: v0.3.0

 agents:
  - id: rag_agent
-    url: mcp://host.docker.internal:10501
-    # only sse is supported
-    # transport: sse or stdio
-    # optional tool name, defaults to "invoke"
-    # tool: invoke
+    url: mcp://host.docker.internal:10505
  - id: travel_agent
-    url: mcp://host.docker.internal:10502
+    transport: streamable-http
+    tool: invoke
+    url: mcp://host.docker.internal:10401

 agent_filters:
  - id: query_rewriter
-    url: mcp://host.docker.internal:10500
-    # tool is optional, defaults to id
-    # tool: query_rewriter
+    transport: streamable-http
+    tool: query_rewriter
+    url: mcp://host.docker.internal:10501
  - id: context_builder
-    url: mcp://host.docker.internal:10500
-  - id: input_guards
-    url: mcp://host.docker.internal:10500
+    transport: streamable-http
+    tool: context_builder
+    url: mcp://host.docker.internal:10502

 model_providers:
  - model: openai/gpt-4o-mini
@ -35,20 +33,20 @@ model_aliases:

 listeners:
  - type: agent
+    name: agent_1
    port: 8001
    router: arch_agent_router
    agents:
      - id: rag_agent
        description: virtual assistant for retrieval augmented generation tasks
        filter_chain:
-          - input_guards
          - query_rewriter
          - context_builder

-      - id: travel_agent
-        description: virtual assistant for travel bookings and recommendations
-        filter_chain:
-          - input_guards
+      # - id: travel_agent
+      #   description: virtual assistant for travel bookings and recommendations
+      #   filter_chain:
+      #     - input_guards

 tracing:
  random_sampling: 100
--- a/demos/use_cases/rag_agent/mcp_query.rest
+++ b/demos/use_cases/rag_agent/mcp_query.rest
@ -0,0 +1,86 @@
+### Initialize MCP Session (SSE)
+POST http://localhost:10501/mcp
+Content-Type: application/json
+Accept: application/json, text/event-stream
+
+{"jsonrpc":"2.0","id":1,"method":"initialize","params":{"capabilities":{},"protocolVersion":"2024-11-05","clientInfo":{"name":"test","version":"1.0.0"}}}
+
+### Send Initialized Notification
+POST http://localhost:10501/mcp
+Content-Type: application/json
+Accept: application/json, text/event-stream
+mcp-session-id: e4ec1ae904e14e06b7d194da10e5f74c
+
+{
+  "jsonrpc": "2.0",
+  "method": "notifications/initialized"
+}
+
+### List Tools
+POST http://localhost:10501/mcp
+Content-Type: application/json
+Accept: application/json, text/event-stream
+mcp-session-id: eb10a691b36e4547b6c93c5dc5b47e11
+
+{
+  "jsonrpc": "2.0",
+  "id": "list-tools-1",
+  "method": "tools/list"
+}
+
+### Call Query Rewriter Tool
+POST http://localhost:10501/mcp
+Content-Type: application/json
+Accept: application/json, text/event-stream
+mcp-session-id: 6b95ff75825a402b90eb3ea07e23fbce
+
+{
+  "jsonrpc": "2.0",
+  "id": "3d3b886a-6216-4a26-a422-7a972529c0e7",
+  "method": "tools/call",
+  "params": {
+    "arguments": {
+      "messages": [
+        {
+          "content": "What is the guaranteed uptime percentage for TechCorp's cloud services?",
+          "role": "user"
+        }
+      ]
+    },
+    "name": "query_rewriter"
+  }
+}
+
+### another test
+
+# Content-Type: application/json
+# Accept: application/json, text/event-stream
+# mcp-session-id: ed7a81a1d39549ecaadb867a6b2daf1e
+
+POST http://localhost:10501/mcp
+content-type: application/json
+mcp-session-id: e4ec1ae904e14e06b7d194da10e5f74c
+accept: application/json, text/event-stream
+
+{"jsonrpc":"2.0","id":"4bb1043a-2953-4bcd-b801-f270b0ae8c39","method":"tools/call","params":{"arguments":{"messages":[{"content":"What is the guaranteed uptime percentage for TechCorp's cloud services?","role":"user"}]},"name":"query_rewriter"}}
+
+
+
+### stream test
+
+POST http://localhost:10501/mcp
+content-type: application/json
+mcp-session-id: 60be9fb816304cb6b9ecdb91d89cd91f
+accept: application/json, text/event-stream
+
+{
+  "jsonrpc": "2.0",
+  "id": 1,
+  "method": "tools/call",
+  "params": {
+    "name": "long_job",
+    "arguments": {
+      "n": 3
+    }
+  }
+}
--- a/demos/use_cases/rag_agent/pyproject.toml
+++ b/demos/use_cases/rag_agent/pyproject.toml
@ -7,7 +7,7 @@ requires-python = ">=3.10"
 dependencies = [
    "click>=8.2.1",
    "mcp>=1.13.1",
-    "fastmcp>=2.12.2",
+    "fastmcp>=2.14",
    "pydantic>=2.11.7",
    "fastapi>=0.104.1",
    "uvicorn>=0.24.0",
--- a/demos/use_cases/rag_agent/src/rag_agent/init.py
+++ b/demos/use_cases/rag_agent/src/rag_agent/init.py
@ -1,50 +1,88 @@
 import click
-from mcp.server.fastmcp import FastMCP
+from fastmcp import FastMCP

 mcp = None


@click.command()
-@click.option("--transport", "transport", default="sse", help="Transport type: stdio or sse")
+@click.option(
+    "--transport",
+    "transport",
+    default="streamable-http",
+    help="Transport type: stdio, sse, or streamable-http",
+)
@click.option("--host", "host", default="localhost", help="Host to bind MCP server to")
@click.option("--port", "port", type=int, default=10500, help="Port for MCP server")
-@click.option("--agent", "agent", required=True, help="Agent name: query_rewriter, context_builder, or response_generator")
-@click.option("--name", "agent_name", default=None, help="Custom MCP server name (defaults to agent type)")
-def main(host, port, agent, transport, agent_name):
+@click.option(
+    "--agent",
+    "agent",
+    required=True,
+    help="Agent name: query_rewriter, context_builder, or response_generator",
+)
+@click.option(
+    "--name",
+    "agent_name",
+    default=None,
+    help="Custom MCP server name (defaults to agent type)",
+)
+@click.option(
+    "--rest-server",
+    "rest_server",
+    is_flag=True,
+    help="Start REST server instead of MCP server",
+)
+@click.option("--rest-port", "rest_port", default=8000, help="Port for REST server")
+def main(host, port, agent, transport, agent_name, rest_server, rest_port):
-    """Start a RAG agent as an MCP server."""
-    
+    """Start a RAG agent as an MCP server (or as a REST server with --rest-server)."""
+
     # Map friendly names to agent modules
     agent_map = {
         "query_rewriter": ("rag_agent.query_rewriter", "Query Rewriter Agent"),
-        "context_builder": ("rag_agent.context_builder_agent", "Context Builder Agent"),
-        "response_generator": ("rag_agent.response_generator", "Response Generator Agent"),
+        "context_builder": ("rag_agent.context_builder", "Context Builder Agent"),
+        "response_generator": (
+            "rag_agent.rag_agent",
+            "Response Generator Agent",
+        ),
     }
-    
+
     if agent not in agent_map:
         print(f"Error: Unknown agent '{agent}'")
         print(f"Available agents: {', '.join(agent_map.keys())}")
         return
-    
+
+    # Look the agent up only after validating it, so an unknown name reaches the
+    # error message above instead of raising KeyError.
     module_name, default_name = agent_map[agent]
     mcp_name = agent_name or default_name
-    
-    print(f"Starting MCP server: {mcp_name}")
-    print(f"  Agent: {agent}")
-    print(f"  Transport: {transport}")
-    print(f"  Host: {host}")
-    print(f"  Port: {port}")
-    
-    global mcp
-    mcp = FastMCP(mcp_name, host=host, port=port)
-    
-    # Import the agent module to register its tools
-    import importlib
-    importlib.import_module(module_name)
-    
-    print(f"Agent '{agent}' loaded successfully")
-    print(f"MCP server ready on {transport}://{host}:{port}")
-    
-    mcp.run(transport=transport)
+
+    global mcp
+    mcp = FastMCP(mcp_name, host=host, port=port)
+
+    if rest_server:
+        print(f"Starting REST server on {host}:{rest_port} for agent: {agent}")
+
+        if agent == "response_generator":
+            from rag_agent.rag_agent import start_server
+
+            start_server(host=host, port=rest_port)
+            return
+        else:
+            print("Please specify an agent to start with --agent option.")
+            return
+    else:
+        print(f"Starting MCP server: {mcp_name}")
+        print(f"  Agent: {agent}")
+        print(f"  Transport: {transport}")
+        print(f"  Host: {host}")
+        print(f"  Port: {port}")
+
+        # Import the agent module to register its tools
+        import importlib
+
+        importlib.import_module(module_name)
+
+        print(f"Agent '{agent}' loaded successfully")
+        print(f"MCP server ready on {transport}://{host}:{port}")
+
+        mcp.run(transport=transport)


 if __name__ == "__main__":
--- a/demos/use_cases/rag_agent/src/rag_agent/context_builder.py
+++ b/demos/use_cases/rag_agent/src/rag_agent/context_builder.py
@ -191,54 +191,30 @@ class Response(BaseModel):
 # FastAPI app for REST server
 app = FastAPI(title="RAG Content Builder Agent", version="1.0.0")

+
@mcp.tool()
@app.post("/v1/chat/completions")
-async def context_builder(
-    request_body: ChatCompletionRequest
-) -> ChatCompletionResponse:
-    """ chat completions endpoint that augments user queries with relevant context from the knowledge base."""
+async def context_builder(messages: List[ChatMessage]) -> List[ChatMessage]:
+    """Chat completions endpoint that augments user queries with relevant context from the knowledge base."""
    import time
    import uuid

-    logger.info(
-        f"Received chat completion request with {len(request_body.messages)} messages"
-    )
+    logger.info(f"Received chat completion request with {len(messages)} messages")

    # Get traceparent header from HTTP request using FastMCP's dependency function
    headers = get_http_headers()
    traceparent_header = headers.get("traceparent")
-    
+
    if traceparent_header:
        logger.info(f"Received traceparent header: {traceparent_header}")
    else:
        logger.info("No traceparent header found")

    # Augment the user query with relevant context
-    updated_messages = await augment_query_with_context(
-        request_body.messages, traceparent_header
-    )
-    messages_history_json = json.dumps([msg.dict() for msg in updated_messages])
+    updated_messages = await augment_query_with_context(messages, traceparent_header)

-    response = ChatCompletionResponse(
-        id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
-        created=int(time.time()),
-        model=request_body.model,
-        choices=[
-            {
-                "index": 0,
-                "message": {"role": "user", "content": messages_history_json},
-                "finish_reason": "stop",
-            }
-        ],
-        usage={
-            "prompt_tokens": sum(len(msg.content.split()) for msg in updated_messages),
-            "completion_tokens": len("Context added to user query.".split()),
-            "total_tokens": sum(len(msg.content.split()) for msg in updated_messages)
-            + len("Context added to user query.".split()),
-        },
-    )
-
-    return response
+    # Return a list of plain role/content dicts (one per message) to minimize text serialization
+    return [{"role": msg.role, "content": msg.content} for msg in updated_messages]


 def main():
--- a/demos/use_cases/rag_agent/src/rag_agent/query_rewriter.py
+++ b/demos/use_cases/rag_agent/src/rag_agent/query_rewriter.py
@ -1,3 +1,4 @@
+import asyncio
 import json
 from pydantic import BaseModel
 from typing import List, Optional, Dict, Any
@ -11,6 +12,9 @@ from .api import ChatMessage, ChatCompletionRequest, ChatCompletionResponse
 from . import mcp
 from fastmcp.server.dependencies import get_http_headers

+from fastmcp.dependencies import CurrentContext
+from fastmcp.server.context import Context
+
 # Set up logging
 logging.basicConfig(
    level=logging.INFO,
@ -29,10 +33,11 @@ archgw_client = AsyncOpenAI(
    api_key="EMPTY",  # archgw doesn't require a real API key
 )

+
 async def rewrite_query_with_archgw(
    messages: List[ChatMessage], traceparent_header: str
 ) -> str:
-    """ Rewrite the user query using LLM for better retrieval. """
+    """Rewrite the user query using LLM for better retrieval."""
    system_prompt = """You are a query rewriter that improves user queries for better retrieval.

    Given a conversation history, rewrite the last user message to be more specific and context-aware.
@ -89,33 +94,31 @@ class Response(BaseModel):
 app = FastAPI(title="RAG Agent Query Parser", version="1.0.0")


-@app.post("/v1/chat/completions")
@mcp.tool()
-async def query_rewriter(request_body: ChatCompletionRequest):
-    """Chat completions endpoint that rewrites the last user query using archgw."""
+async def query_rewriter(messages: List[ChatMessage]) -> List[ChatMessage]:
+    """Chat completions endpoint that rewrites the last user query using archgw.
+
+    Returns the updated message list as plain role/content dicts, one per message.
+    """
    import time
    import uuid

-    logger.info(
-        f"Received chat completion request with {len(request_body.messages)} messages"
-    )
+    logger.info(f"Received chat completion request with {len(messages)} messages")

    # Get traceparent header from HTTP request using FastMCP's dependency function
    headers = get_http_headers()
    traceparent_header = headers.get("traceparent")
-    
+
    if traceparent_header:
        logger.info(f"Received traceparent header: {traceparent_header}")
    else:
        logger.info("No traceparent header found")

    # Call archgw to rewrite the last user query
-    rewritten_query = await rewrite_query_with_archgw(
-        request_body.messages, traceparent_header
-    )
+    rewritten_query = await rewrite_query_with_archgw(messages, traceparent_header)

    # Create updated messages with the rewritten query
-    updated_messages = request_body.messages.copy()
+    updated_messages = messages.copy()

    # Find and update the last user message with the rewritten query
    for i in range(len(updated_messages) - 1, -1, -1):
@ -127,28 +130,8 @@ async def query_rewriter(request_body: ChatCompletionRequest):
            )
            break

-    messages_history_json = json.dumps([msg.dict() for msg in updated_messages])
-
-    response = ChatCompletionResponse(
-        id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
-        created=int(time.time()),
-        model=request_body.model,
-        choices=[
-            {
-                "index": 0,
-                "message": {"role": "user", "content": messages_history_json},
-                "finish_reason": "stop",
-            }
-        ],
-        usage={
-            "prompt_tokens": sum(len(msg.content.split()) for msg in updated_messages),
-            "completion_tokens": len("Updated query for better retrieval.".split()),
-            "total_tokens": sum(len(msg.content.split()) for msg in updated_messages)
-            + len("Updated query for better retrieval.".split()),
-        },
-    )
-
-    return response
+    # Return a list of plain role/content dicts (one per message) to minimize text serialization
+    return [{"role": msg.role, "content": msg.content} for msg in updated_messages]


@app.get("/health")
--- a/demos/use_cases/rag_agent/src/rag_agent/rag_agent.py
+++ b/demos/use_cases/rag_agent/src/rag_agent/rag_agent.py
@ -63,9 +63,8 @@ def prepare_response_messages(request_body: ChatCompletionRequest):


@app.post("/v1/chat/completions")
-@mcp.tool(name="invoke")
-async def chat_completion(request_body: ChatCompletionRequest):
-    """Chat completions endpoint that generates a coherent response based on all context."""
+async def chat_completion_http(request_body: ChatCompletionRequest):
+    """HTTP endpoint for chat completions with streaming support."""
    logger.info(
        f"Received chat completion request with {len(request_body.messages)} messages"
    )
@ -73,7 +72,7 @@ async def chat_completion(request_body: ChatCompletionRequest):
    # Get traceparent header from HTTP request using FastMCP's dependency function
    headers = get_http_headers()
    traceparent_header = headers.get("traceparent")
-    
+
    if traceparent_header:
        logger.info(f"Received traceparent header: {traceparent_header}")
    else:
@ -92,6 +91,23 @@ async def chat_completion(request_body: ChatCompletionRequest):
        return await non_streaming_chat_completions(request_body, traceparent_header)


+@mcp.tool(name="invoke")
+async def chat_completion(request_body: ChatCompletionRequest):
+    """Chat completions endpoint that generates a coherent response based on all context.
+
+    For MCP calls, the non-streaming path is always used so a complete response is returned.
+    """
+    logger.info(
+        f"[MCP] Received chat completion request with {len(request_body.messages)} messages"
+    )
+
+    # For MCP, always use non-streaming to return a complete response
+    response = await non_streaming_chat_completions(
+        request_body, traceparent_header=None
+    )
+    return response
+
+
 async def stream_chat_completions(
    request_body: ChatCompletionRequest, traceparent_header: str = None
 ):
--- a/demos/use_cases/rag_agent/start_agents.sh
+++ b/demos/use_cases/rag_agent/start_agents.sh
@ -21,16 +21,25 @@ cleanup() {

 trap cleanup EXIT

-log "Starting query_parser agent on port 10500..."
-uv run python -m rag_agent --rest-server --host 0.0.0.0 --rest-port 10500 --agent query_parser &
+# log "Starting input guards filter on port 10500..."
+# uv run python -m rag_agent --host 0.0.0.0 --port 10500 --agent input_guards &
+# WAIT_FOR_PIDS+=($!)
+
+
+log "Starting query_parser agent on port 10501..."
+uv run python -m rag_agent --host 0.0.0.0 --port 10501 --agent query_rewriter &
 WAIT_FOR_PIDS+=($!)

-log "Starting context_builder agent on port 10501..."
-uv run python -m rag_agent --rest-server --host 0.0.0.0 --rest-port 10501 --agent context_builder &
+log "Starting context_builder agent on port 10502..."
+uv run python -m rag_agent --host 0.0.0.0 --port 10502 --agent context_builder &
 WAIT_FOR_PIDS+=($!)

-log "Starting response_generator agent on port 10502..."
-uv run python -m rag_agent --rest-server --host 0.0.0.0 --rest-port 10502 --agent response_generator &
+# log "Starting response_generator agent on port 10400..."
+# uv run python -m rag_agent --host 0.0.0.0 --port 10400 --agent response_generator &
+# WAIT_FOR_PIDS+=($!)
+
+log "Starting response_generator agent on port 10505..."
+uv run python -m rag_agent --rest-server --host 0.0.0.0 --rest-port 10505 --agent response_generator &
 WAIT_FOR_PIDS+=($!)

 for PID in "${WAIT_FOR_PIDS[@]}"; do
--- a/demos/use_cases/rag_agent/test.rest
+++ b/demos/use_cases/rag_agent/test.rest
@ -49,7 +49,7 @@ Content-Type: application/json
      "content": "What is the guaranteed uptime percentage for TechCorp's cloud services?"
    }
  ],
-  "stream": false
+  "stream": true
 }

 ### send request to context builder agent
--- a/demos/use_cases/rag_agent/uv.lock
+++ b/demos/use_cases/rag_agent/uv.lock
				`@ -0,0 +1 @@`
				`cargo build --release --target wasm32-wasip1 -p prompt_gateway -p llm_gateway && cargo build --release -p brightstaff`