mirror of
https://github.com/katanemo/plano.git
synced 2026-04-25 00:36:34 +02:00
Use mcp tools for filter chain (#621)
* agents framework demo * more changes * add more changes * pending changes * fix tests * fix more * rebase with main and better handle error from mcp * add trace for filters * add test for client error, server error and for mcp error * update schema validate code and rename kind => type in agent_filter * fix agent description and pre-commit * fix tests * add provider specific request parsing in agents chat * fix precommit and tests * cleanup demo * update readme * fix pre-commit * refactor tracing * fix fmt * fix: handle MessageContent enum in responses API conversion - Update request.rs to handle new MessageContent enum structure from main - MessageContent can now be Text(String) or Items(Vec<InputContent>) - Handle new InputItem variants (ItemReference, FunctionCallOutput) - Fixes compilation error after merging latest main (#632) * address pr feedback * fix span * fix build * update openai version
This commit is contained in:
parent
cb82a83c7b
commit
2f9121407b
40 changed files with 4886 additions and 190 deletions
|
|
@ -14,6 +14,38 @@ properties:
|
|||
type: array
|
||||
items:
|
||||
type: object
|
||||
properties:
|
||||
id:
|
||||
type: string
|
||||
url:
|
||||
type: string
|
||||
additionalProperties: false
|
||||
required:
|
||||
- id
|
||||
- url
|
||||
filters:
|
||||
type: array
|
||||
items:
|
||||
type: object
|
||||
properties:
|
||||
id:
|
||||
type: string
|
||||
url:
|
||||
type: string
|
||||
type:
|
||||
type: string
|
||||
enum:
|
||||
- mcp
|
||||
transport:
|
||||
type: string
|
||||
enum:
|
||||
- streamable-http
|
||||
tool:
|
||||
type: string
|
||||
additionalProperties: false
|
||||
required:
|
||||
- id
|
||||
- url
|
||||
listeners:
|
||||
oneOf:
|
||||
- type: array
|
||||
|
|
|
|||
|
|
@ -214,21 +214,21 @@ static_resources:
|
|||
- name: envoy.filters.network.http_connection_manager
|
||||
typed_config:
|
||||
"@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
|
||||
{% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %}
|
||||
generate_request_id: true
|
||||
tracing:
|
||||
provider:
|
||||
name: envoy.tracers.opentelemetry
|
||||
typed_config:
|
||||
"@type": type.googleapis.com/envoy.config.trace.v3.OpenTelemetryConfig
|
||||
grpc_service:
|
||||
envoy_grpc:
|
||||
cluster_name: opentelemetry_collector
|
||||
timeout: 0.250s
|
||||
service_name: tools
|
||||
random_sampling:
|
||||
value: {{ arch_tracing.random_sampling }}
|
||||
{% endif %}
|
||||
# {% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %}
|
||||
# generate_request_id: true
|
||||
# tracing:
|
||||
# provider:
|
||||
# name: envoy.tracers.opentelemetry
|
||||
# typed_config:
|
||||
# "@type": type.googleapis.com/envoy.config.trace.v3.OpenTelemetryConfig
|
||||
# grpc_service:
|
||||
# envoy_grpc:
|
||||
# cluster_name: opentelemetry_collector
|
||||
# timeout: 0.250s
|
||||
# service_name: tools
|
||||
# random_sampling:
|
||||
# value: {{ arch_tracing.random_sampling }}
|
||||
# {% endif %}
|
||||
stat_prefix: outbound_api_traffic
|
||||
codec_type: AUTO
|
||||
scheme_header_transformation:
|
||||
|
|
@ -299,7 +299,7 @@ static_resources:
|
|||
envoy_grpc:
|
||||
cluster_name: opentelemetry_collector
|
||||
timeout: 0.250s
|
||||
service_name: arch_gateway
|
||||
service_name: plano(inbound)
|
||||
random_sampling:
|
||||
value: {{ arch_tracing.random_sampling }}
|
||||
{% endif %}
|
||||
|
|
|
|||
|
|
@ -101,8 +101,17 @@ def validate_and_render_schema():
|
|||
|
||||
# Process agents section and convert to endpoints
|
||||
agents = config_yaml.get("agents", [])
|
||||
for agent in agents:
|
||||
filters = config_yaml.get("filters", [])
|
||||
agents_combined = agents + filters
|
||||
agent_id_keys = set()
|
||||
|
||||
for agent in agents_combined:
|
||||
agent_id = agent.get("id")
|
||||
if agent_id in agent_id_keys:
|
||||
raise Exception(
|
||||
f"Duplicate agent id {agent_id}, please provide unique id for each agent"
|
||||
)
|
||||
agent_id_keys.add(agent_id)
|
||||
agent_endpoint = agent.get("url")
|
||||
|
||||
if agent_id and agent_endpoint:
|
||||
|
|
|
|||
|
|
@ -57,6 +57,10 @@ def convert_legacy_listeners(
|
|||
"timeout": "30s",
|
||||
}
|
||||
|
||||
# Handle None case
|
||||
if listeners is None:
|
||||
return [llm_gateway_listener], llm_gateway_listener, prompt_gateway_listener
|
||||
|
||||
if isinstance(listeners, dict):
|
||||
# legacy listeners
|
||||
# check if type is array or object
|
||||
|
|
|
|||
|
|
@ -94,21 +94,16 @@ def test_validate_and_render_happy_path_agent_config(monkeypatch):
|
|||
version: v0.3.0
|
||||
|
||||
agents:
|
||||
- name: query_rewriter
|
||||
kind: openai
|
||||
endpoint: http://localhost:10500
|
||||
- name: context_builder
|
||||
kind: openai
|
||||
endpoint: http://localhost:10501
|
||||
- name: response_generator
|
||||
kind: openai
|
||||
endpoint: http://localhost:10502
|
||||
- name: research_agent
|
||||
kind: openai
|
||||
endpoint: http://localhost:10500
|
||||
- name: input_guard_rails
|
||||
kind: openai
|
||||
endpoint: http://localhost:10503
|
||||
- id: query_rewriter
|
||||
url: http://localhost:10500
|
||||
- id: context_builder
|
||||
url: http://localhost:10501
|
||||
- id: response_generator
|
||||
url: http://localhost:10502
|
||||
- id: research_agent
|
||||
url: http://localhost:10500
|
||||
- id: input_guard_rails
|
||||
url: http://localhost:10503
|
||||
|
||||
listeners:
|
||||
- name: tmobile
|
||||
|
|
@ -156,7 +151,7 @@ listeners:
|
|||
mock.mock_open().return_value, # ARCH_CONFIG_FILE_RENDERED (write)
|
||||
]
|
||||
with mock.patch("builtins.open", m_open):
|
||||
with mock.patch("config_generator.Environment"):
|
||||
with mock.patch("cli.config_generator.Environment"):
|
||||
validate_and_render_schema()
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,16 +1,24 @@
|
|||
use std::sync::Arc;
|
||||
use std::time::{Instant, SystemTime};
|
||||
|
||||
use bytes::Bytes;
|
||||
use hermesllm::apis::openai::ChatCompletionsRequest;
|
||||
use common::consts::TRACE_PARENT_HEADER;
|
||||
use common::traces::{SpanBuilder, SpanKind, parse_traceparent, generate_random_span_id};
|
||||
use hermesllm::apis::OpenAIMessage;
|
||||
use hermesllm::clients::SupportedAPIsFromClient;
|
||||
use hermesllm::providers::request::ProviderRequest;
|
||||
use hermesllm::ProviderRequestType;
|
||||
use http_body_util::combinators::BoxBody;
|
||||
use http_body_util::BodyExt;
|
||||
use hyper::{Request, Response};
|
||||
use serde::ser::Error as SerError;
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
use super::agent_selector::{AgentSelectionError, AgentSelector};
|
||||
use super::pipeline_processor::{PipelineError, PipelineProcessor};
|
||||
use super::response_handler::ResponseHandler;
|
||||
use crate::router::llm_router::RouterService;
|
||||
use crate::tracing::{OperationNameBuilder, operation_component, http};
|
||||
|
||||
/// Main errors for agent chat completions
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
|
|
@ -33,8 +41,17 @@ pub async fn agent_chat(
|
|||
_: String,
|
||||
agents_list: Arc<tokio::sync::RwLock<Option<Vec<common::configuration::Agent>>>>,
|
||||
listeners: Arc<tokio::sync::RwLock<Vec<common::configuration::Listener>>>,
|
||||
trace_collector: Arc<common::traces::TraceCollector>,
|
||||
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
|
||||
match handle_agent_chat(request, router_service, agents_list, listeners).await {
|
||||
match handle_agent_chat(
|
||||
request,
|
||||
router_service,
|
||||
agents_list,
|
||||
listeners,
|
||||
trace_collector,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(response) => Ok(response),
|
||||
Err(err) => {
|
||||
// Check if this is a client error from the pipeline that should be cascaded
|
||||
|
|
@ -109,10 +126,11 @@ async fn handle_agent_chat(
|
|||
router_service: Arc<RouterService>,
|
||||
agents_list: Arc<tokio::sync::RwLock<Option<Vec<common::configuration::Agent>>>>,
|
||||
listeners: Arc<tokio::sync::RwLock<Vec<common::configuration::Listener>>>,
|
||||
trace_collector: Arc<common::traces::TraceCollector>,
|
||||
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, AgentFilterChainError> {
|
||||
// Initialize services
|
||||
let agent_selector = AgentSelector::new(router_service);
|
||||
let pipeline_processor = PipelineProcessor::default();
|
||||
let mut pipeline_processor = PipelineProcessor::default();
|
||||
let response_handler = ResponseHandler::new();
|
||||
|
||||
// Extract listener name from headers
|
||||
|
|
@ -132,6 +150,13 @@ async fn handle_agent_chat(
|
|||
info!("Handling request for listener: {}", listener.name);
|
||||
|
||||
// Parse request body
|
||||
let request_path = request
|
||||
.uri()
|
||||
.path()
|
||||
.to_string()
|
||||
.strip_prefix("/agents")
|
||||
.unwrap()
|
||||
.to_string();
|
||||
let request_headers = request.headers().clone();
|
||||
let chat_request_bytes = request.collect().await?.to_bytes();
|
||||
|
||||
|
|
@ -140,61 +165,141 @@ async fn handle_agent_chat(
|
|||
String::from_utf8_lossy(&chat_request_bytes)
|
||||
);
|
||||
|
||||
let chat_completions_request: ChatCompletionsRequest =
|
||||
serde_json::from_slice(&chat_request_bytes).map_err(|err| {
|
||||
warn!(
|
||||
"Failed to parse request body as ChatCompletionsRequest: {}",
|
||||
err
|
||||
);
|
||||
AgentFilterChainError::RequestParsing(err)
|
||||
// Determine the API type from the endpoint
|
||||
let api_type =
|
||||
SupportedAPIsFromClient::from_endpoint(request_path.as_str()).ok_or_else(|| {
|
||||
let err_msg = format!("Unsupported endpoint: {}", request_path);
|
||||
warn!("{}", err_msg);
|
||||
AgentFilterChainError::RequestParsing(serde_json::Error::custom(err_msg))
|
||||
})?;
|
||||
|
||||
let client_request = match ProviderRequestType::try_from((&chat_request_bytes[..], &api_type)) {
|
||||
Ok(request) => request,
|
||||
Err(err) => {
|
||||
warn!("Failed to parse request as ProviderRequestType: {}", err);
|
||||
let err_msg = format!("Failed to parse request: {}", err);
|
||||
return Err(AgentFilterChainError::RequestParsing(
|
||||
serde_json::Error::custom(err_msg),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
let message: Vec<OpenAIMessage> = client_request.get_messages();
|
||||
|
||||
// let chat_completions_request: ChatCompletionsRequest =
|
||||
// serde_json::from_slice(&chat_request_bytes).map_err(|err| {
|
||||
// warn!(
|
||||
// "Failed to parse request body as ChatCompletionsRequest: {}",
|
||||
// err
|
||||
// );
|
||||
// AgentFilterChainError::RequestParsing(err)
|
||||
// })?;
|
||||
|
||||
// Extract trace parent for routing
|
||||
let trace_parent = request_headers
|
||||
.iter()
|
||||
.find(|(key, _)| key.as_str() == "traceparent")
|
||||
.find(|(key, _)| key.as_str() == TRACE_PARENT_HEADER)
|
||||
.map(|(_, value)| value.to_str().unwrap_or_default().to_string());
|
||||
|
||||
// Select appropriate agent using arch router llm model
|
||||
let selected_agent = agent_selector
|
||||
.select_agent(&chat_completions_request.messages, &listener, trace_parent)
|
||||
.await?;
|
||||
|
||||
debug!("Processing agent pipeline: {}", selected_agent.id);
|
||||
|
||||
// Create agent map for pipeline processing
|
||||
// Create agent map for pipeline processing and agent selection
|
||||
let agent_map = {
|
||||
let agents = agents_list.read().await;
|
||||
let agents = agents.as_ref().unwrap();
|
||||
agent_selector.create_agent_map(agents)
|
||||
};
|
||||
|
||||
// Parse trace parent to get trace_id and parent_span_id
|
||||
let (trace_id, parent_span_id) = if let Some(ref tp) = trace_parent {
|
||||
parse_traceparent(tp)
|
||||
} else {
|
||||
(String::new(), None)
|
||||
};
|
||||
|
||||
// Select appropriate agent using arch router llm model
|
||||
let selected_agent = agent_selector
|
||||
.select_agent(&message, &listener, trace_parent.clone())
|
||||
.await?;
|
||||
|
||||
debug!("Processing agent pipeline: {}", selected_agent.id);
|
||||
|
||||
// Record the start time for agent span
|
||||
let agent_start_time = SystemTime::now();
|
||||
let agent_start_instant = Instant::now();
|
||||
// let (span_id, trace_id) = trace_collector.start_span(
|
||||
// trace_parent.clone(),
|
||||
// operation_component::AGENT,
|
||||
// &format!("/agents{}", request_path),
|
||||
// &selected_agent.id,
|
||||
// );
|
||||
|
||||
let span_id = generate_random_span_id();
|
||||
|
||||
// Process the filter chain
|
||||
let processed_messages = pipeline_processor
|
||||
let chat_history = pipeline_processor
|
||||
.process_filter_chain(
|
||||
&chat_completions_request,
|
||||
&message,
|
||||
&selected_agent,
|
||||
&agent_map,
|
||||
&request_headers,
|
||||
Some(&trace_collector),
|
||||
trace_id.clone(),
|
||||
span_id.clone(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Get terminal agent and send final response
|
||||
let terminal_agent_name = selected_agent.id;
|
||||
let terminal_agent_name = selected_agent.id.clone();
|
||||
let terminal_agent = agent_map.get(&terminal_agent_name).unwrap();
|
||||
|
||||
debug!("Processing terminal agent: {}", terminal_agent_name);
|
||||
debug!("Terminal agent details: {:?}", terminal_agent);
|
||||
|
||||
let llm_response = pipeline_processor
|
||||
.invoke_upstream_agent(
|
||||
&processed_messages,
|
||||
&chat_completions_request,
|
||||
.invoke_agent(
|
||||
&chat_history,
|
||||
client_request,
|
||||
terminal_agent,
|
||||
&request_headers,
|
||||
trace_id.clone(),
|
||||
span_id.clone(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Record agent span after processing is complete
|
||||
let agent_end_time = SystemTime::now();
|
||||
let agent_elapsed = agent_start_instant.elapsed();
|
||||
|
||||
// Build full path with /agents prefix
|
||||
let full_path = format!("/agents{}", request_path);
|
||||
|
||||
// Build operation name: POST {full_path} {agent_name}
|
||||
let operation_name = OperationNameBuilder::new()
|
||||
.with_method("POST")
|
||||
.with_path(&full_path)
|
||||
.with_target(&terminal_agent_name)
|
||||
.build();
|
||||
|
||||
let mut span_builder = SpanBuilder::new(&operation_name)
|
||||
.with_span_id(span_id)
|
||||
.with_kind(SpanKind::Internal)
|
||||
.with_start_time(agent_start_time)
|
||||
.with_end_time(agent_end_time)
|
||||
.with_attribute(http::METHOD, "POST")
|
||||
.with_attribute(http::TARGET, full_path)
|
||||
.with_attribute("agent.name", terminal_agent_name.clone())
|
||||
.with_attribute("duration_ms", format!("{:.2}", agent_elapsed.as_secs_f64() * 1000.0));
|
||||
|
||||
if !trace_id.is_empty() {
|
||||
span_builder = span_builder.with_trace_id(trace_id);
|
||||
}
|
||||
if let Some(parent_id) = parent_span_id {
|
||||
span_builder = span_builder.with_parent_span_id(parent_id);
|
||||
}
|
||||
|
||||
let span = span_builder.build();
|
||||
// Use plano(agent) as service name for the agent processing span
|
||||
trace_collector.record_span(operation_component::AGENT, span);
|
||||
|
||||
// Create streaming response
|
||||
response_handler
|
||||
.create_streaming_response(llm_response)
|
||||
|
|
|
|||
|
|
@ -20,6 +20,8 @@ pub enum AgentSelectionError {
|
|||
RoutingError(String),
|
||||
#[error("Default agent not found for listener: {0}")]
|
||||
DefaultAgentNotFound(String),
|
||||
#[error("MCP client error: {0}")]
|
||||
McpError(String),
|
||||
}
|
||||
|
||||
/// Service for selecting agents based on routing preferences and listener configuration
|
||||
|
|
@ -29,7 +31,9 @@ pub struct AgentSelector {
|
|||
|
||||
impl AgentSelector {
|
||||
pub fn new(router_service: Arc<RouterService>) -> Self {
|
||||
Self { router_service }
|
||||
Self {
|
||||
router_service,
|
||||
}
|
||||
}
|
||||
|
||||
/// Find listener by name from the request headers
|
||||
|
|
@ -77,7 +81,9 @@ impl AgentSelector {
|
|||
return Ok(agents[0].clone());
|
||||
}
|
||||
|
||||
let usage_preferences = self.convert_agent_description_to_routing_preferences(agents);
|
||||
let usage_preferences = self
|
||||
.convert_agent_description_to_routing_preferences(agents)
|
||||
.await;
|
||||
debug!(
|
||||
"Agents usage preferences for agent routing str: {}",
|
||||
serde_json::to_string(&usage_preferences).unwrap_or_default()
|
||||
|
|
@ -131,20 +137,23 @@ impl AgentSelector {
|
|||
}
|
||||
|
||||
/// Convert agent descriptions to routing preferences
|
||||
fn convert_agent_description_to_routing_preferences(
|
||||
async fn convert_agent_description_to_routing_preferences(
|
||||
&self,
|
||||
agents: &[AgentFilterChain],
|
||||
) -> Vec<ModelUsagePreference> {
|
||||
agents
|
||||
.iter()
|
||||
.map(|agent| ModelUsagePreference {
|
||||
model: agent.id.clone(),
|
||||
let mut preferences = Vec::new();
|
||||
|
||||
for agent_chain in agents {
|
||||
preferences.push(ModelUsagePreference {
|
||||
model: agent_chain.id.clone(),
|
||||
routing_preferences: vec![RoutingPreference {
|
||||
name: agent.id.clone(),
|
||||
description: agent.description.as_ref().unwrap_or(&String::new()).clone(),
|
||||
name: agent_chain.id.clone(),
|
||||
description: agent_chain.description.clone().unwrap_or_default(),
|
||||
}],
|
||||
})
|
||||
.collect()
|
||||
});
|
||||
}
|
||||
|
||||
preferences
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -183,8 +192,10 @@ mod tests {
|
|||
fn create_test_agent_struct(name: &str) -> Agent {
|
||||
Agent {
|
||||
id: name.to_string(),
|
||||
kind: Some("test".to_string()),
|
||||
agent_type: Some("test".to_string()),
|
||||
url: "http://localhost:8080".to_string(),
|
||||
tool: None,
|
||||
transport: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -240,8 +251,8 @@ mod tests {
|
|||
assert!(agent_map.contains_key("agent2"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_convert_agent_description_to_routing_preferences() {
|
||||
#[tokio::test]
|
||||
async fn test_convert_agent_description_to_routing_preferences() {
|
||||
let router_service = create_test_router_service();
|
||||
let selector = AgentSelector::new(router_service);
|
||||
|
||||
|
|
@ -250,7 +261,9 @@ mod tests {
|
|||
create_test_agent("agent2", "Second agent description", false),
|
||||
];
|
||||
|
||||
let preferences = selector.convert_agent_description_to_routing_preferences(&agents);
|
||||
let preferences = selector
|
||||
.convert_agent_description_to_routing_preferences(&agents)
|
||||
.await;
|
||||
|
||||
assert_eq!(preferences.len(), 2);
|
||||
assert_eq!(preferences[0].model, "agent1");
|
||||
|
|
|
|||
|
|
@ -42,19 +42,23 @@ mod integration_tests {
|
|||
// Setup services
|
||||
let router_service = create_test_router_service();
|
||||
let agent_selector = AgentSelector::new(router_service);
|
||||
let pipeline_processor = PipelineProcessor::default();
|
||||
let mut pipeline_processor = PipelineProcessor::default();
|
||||
|
||||
// Create test data
|
||||
let agents = vec![
|
||||
Agent {
|
||||
id: "filter-agent".to_string(),
|
||||
kind: Some("filter".to_string()),
|
||||
agent_type: Some("filter".to_string()),
|
||||
url: "http://localhost:8081".to_string(),
|
||||
tool: None,
|
||||
transport: None,
|
||||
},
|
||||
Agent {
|
||||
id: "terminal-agent".to_string(),
|
||||
kind: Some("terminal".to_string()),
|
||||
agent_type: Some("terminal".to_string()),
|
||||
url: "http://localhost:8082".to_string(),
|
||||
tool: None,
|
||||
transport: None,
|
||||
},
|
||||
];
|
||||
|
||||
|
|
@ -107,7 +111,15 @@ mod integration_tests {
|
|||
|
||||
let headers = HeaderMap::new();
|
||||
let result = pipeline_processor
|
||||
.process_filter_chain(&request, &test_pipeline, &agent_map, &headers)
|
||||
.process_filter_chain(
|
||||
&request.messages,
|
||||
&test_pipeline,
|
||||
&agent_map,
|
||||
&headers,
|
||||
None,
|
||||
String::new(),
|
||||
String::new(),
|
||||
)
|
||||
.await;
|
||||
|
||||
println!("Pipeline processing result: {:?}", result);
|
||||
|
|
|
|||
49
crates/brightstaff/src/handlers/jsonrpc.rs
Normal file
49
crates/brightstaff/src/handlers/jsonrpc.rs
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
|
||||
pub const JSON_RPC_VERSION: &str = "2.0";
|
||||
pub const TOOL_CALL_METHOD : &str = "tools/call";
|
||||
pub const MCP_INITIALIZE: &str = "initialize";
|
||||
pub const MCP_INITIALIZE_NOTIFICATION: &str = "initialize/notification";
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
pub enum JsonRpcId {
|
||||
String(String),
|
||||
Number(u64),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct JsonRpcRequest {
|
||||
pub jsonrpc: String,
|
||||
pub id: JsonRpcId,
|
||||
pub method: String,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub params: Option<HashMap<String, serde_json::Value>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct JsonRpcNotification {
|
||||
pub jsonrpc: String,
|
||||
pub method: String,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub params: Option<HashMap<String, serde_json::Value>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct JsonRpcError {
|
||||
pub code: i32,
|
||||
pub message: String,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub data: Option<serde_json::Value>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct JsonRpcResponse {
|
||||
pub jsonrpc: String,
|
||||
pub id: JsonRpcId,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub result: Option<HashMap<String, serde_json::Value>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub error: Option<JsonRpcError>,
|
||||
}
|
||||
|
|
@ -7,6 +7,7 @@ pub mod function_calling;
|
|||
pub mod pipeline_processor;
|
||||
pub mod response_handler;
|
||||
pub mod utils;
|
||||
pub mod jsonrpc;
|
||||
|
||||
#[cfg(test)]
|
||||
mod integration_tests;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,24 @@
|
|||
use std::collections::HashMap;
|
||||
|
||||
use common::configuration::{Agent, AgentFilterChain};
|
||||
use common::consts::{ARCH_UPSTREAM_HOST_HEADER, ENVOY_RETRY_HEADER};
|
||||
use hermesllm::apis::openai::{ChatCompletionsRequest, Message};
|
||||
use common::consts::{
|
||||
ARCH_UPSTREAM_HOST_HEADER, BRIGHT_STAFF_SERVICE_NAME, ENVOY_RETRY_HEADER, TRACE_PARENT_HEADER,
|
||||
};
|
||||
use common::traces::{SpanBuilder, SpanKind, generate_random_span_id};
|
||||
use hermesllm::apis::openai::Message;
|
||||
use hermesllm::{ProviderRequest, ProviderRequestType};
|
||||
use hyper::header::HeaderMap;
|
||||
use tracing::{debug, warn};
|
||||
use std::time::{Instant, SystemTime};
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
use crate::tracing::operation_component::{self};
|
||||
use crate::tracing::{http, OperationNameBuilder};
|
||||
|
||||
use crate::handlers::jsonrpc::{
|
||||
JsonRpcId, JsonRpcNotification, JsonRpcRequest, JsonRpcResponse, JSON_RPC_VERSION,
|
||||
MCP_INITIALIZE, MCP_INITIALIZE_NOTIFICATION, TOOL_CALL_METHOD,
|
||||
};
|
||||
use uuid::Uuid;
|
||||
|
||||
/// Errors that can occur during pipeline processing
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
|
|
@ -19,6 +33,12 @@ pub enum PipelineError {
|
|||
NoChoicesInResponse(String),
|
||||
#[error("No content in response from agent '{0}'")]
|
||||
NoContentInResponse(String),
|
||||
#[error("No result in response from agent '{0}'")]
|
||||
NoResultInResponse(String),
|
||||
#[error("No structured content in response from agent '{0}'")]
|
||||
NoStructuredContentInResponse(String),
|
||||
#[error("No messages in response from agent '{0}'")]
|
||||
NoMessagesInResponse(String),
|
||||
#[error("Client error from agent '{agent}' (HTTP {status}): {body}")]
|
||||
ClientError {
|
||||
agent: String,
|
||||
|
|
@ -37,13 +57,17 @@ pub enum PipelineError {
|
|||
pub struct PipelineProcessor {
|
||||
client: reqwest::Client,
|
||||
url: String,
|
||||
agent_id_session_map: HashMap<String, String>,
|
||||
}
|
||||
|
||||
const ENVOY_API_ROUTER_ADDRESS: &str = "http://localhost:11000";
|
||||
|
||||
impl Default for PipelineProcessor {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
client: reqwest::Client::new(),
|
||||
url: "http://localhost:11000/v1/chat/completions".to_string(),
|
||||
url: ENVOY_API_ROUTER_ADDRESS.to_string(),
|
||||
agent_id_session_map: HashMap::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -53,18 +77,128 @@ impl PipelineProcessor {
|
|||
Self {
|
||||
client: reqwest::Client::new(),
|
||||
url,
|
||||
agent_id_session_map: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Record a span for filter execution
|
||||
fn record_filter_span(
|
||||
&self,
|
||||
collector: &std::sync::Arc<common::traces::TraceCollector>,
|
||||
agent_name: &str,
|
||||
tool_name: &str,
|
||||
start_time: SystemTime,
|
||||
end_time: SystemTime,
|
||||
elapsed: std::time::Duration,
|
||||
trace_id: String,
|
||||
parent_span_id: String,
|
||||
span_id: String,
|
||||
) -> String {
|
||||
// let (trace_id, parent_span_id) = self.extract_trace_context();
|
||||
|
||||
// Build operation name: POST /agents/* {filter_name}
|
||||
// Using generic path since we don't have access to specific endpoint here
|
||||
let operation_name = OperationNameBuilder::new()
|
||||
.with_method("POST")
|
||||
.with_path("/agents/*")
|
||||
.with_target(agent_name)
|
||||
.build();
|
||||
|
||||
let mut span_builder = SpanBuilder::new(&operation_name)
|
||||
.with_span_id(span_id.clone())
|
||||
.with_kind(SpanKind::Client)
|
||||
.with_start_time(start_time)
|
||||
.with_end_time(end_time)
|
||||
.with_attribute(http::METHOD, "POST")
|
||||
.with_attribute(http::TARGET, "/agents/*")
|
||||
.with_attribute("filter.name", agent_name.to_string())
|
||||
.with_attribute("filter.tool_name", tool_name.to_string())
|
||||
.with_attribute(
|
||||
"duration_ms",
|
||||
format!("{:.2}", elapsed.as_secs_f64() * 1000.0),
|
||||
);
|
||||
|
||||
if !trace_id.is_empty() {
|
||||
span_builder = span_builder.with_trace_id(trace_id);
|
||||
}
|
||||
if !parent_span_id.is_empty() {
|
||||
span_builder = span_builder.with_parent_span_id(parent_span_id);
|
||||
}
|
||||
|
||||
let span = span_builder.build();
|
||||
// Use plano(filter) as service name for filter execution spans
|
||||
collector.record_span(operation_component::AGENT_FILTER, span);
|
||||
span_id.clone()
|
||||
}
|
||||
|
||||
/// Record a span for MCP protocol interactions
|
||||
fn record_mcp_span(
|
||||
&self,
|
||||
collector: &std::sync::Arc<common::traces::TraceCollector>,
|
||||
operation: &str,
|
||||
agent_id: &str,
|
||||
start_time: SystemTime,
|
||||
end_time: SystemTime,
|
||||
elapsed: std::time::Duration,
|
||||
additional_attrs: Option<HashMap<&str, String>>,
|
||||
trace_id: String,
|
||||
parent_span_id: String,
|
||||
span_id: Option<String>,
|
||||
) {
|
||||
// let (trace_id, parent_span_id) = self.extract_trace_context();
|
||||
|
||||
// Build operation name: POST /mcp {agent_id}
|
||||
let operation_name = OperationNameBuilder::new()
|
||||
.with_method("POST")
|
||||
.with_path("/mcp")
|
||||
.with_operation(operation)
|
||||
.with_target(agent_id)
|
||||
.build();
|
||||
|
||||
let mut span_builder = SpanBuilder::new(&operation_name)
|
||||
.with_span_id(span_id.unwrap_or_else(|| generate_random_span_id()))
|
||||
.with_kind(SpanKind::Client)
|
||||
.with_start_time(start_time)
|
||||
.with_end_time(end_time)
|
||||
.with_attribute(http::METHOD, "POST")
|
||||
.with_attribute(http::TARGET, &format!("/mcp ({})", operation.to_string()))
|
||||
.with_attribute("mcp.operation", operation.to_string())
|
||||
.with_attribute("mcp.agent_id", agent_id.to_string())
|
||||
.with_attribute(
|
||||
"duration_ms",
|
||||
format!("{:.2}", elapsed.as_secs_f64() * 1000.0),
|
||||
);
|
||||
|
||||
if let Some(attrs) = additional_attrs {
|
||||
for (key, value) in attrs {
|
||||
span_builder = span_builder.with_attribute(key, value);
|
||||
}
|
||||
}
|
||||
|
||||
if !trace_id.is_empty() {
|
||||
span_builder = span_builder.with_trace_id(trace_id);
|
||||
}
|
||||
if !parent_span_id.is_empty() {
|
||||
span_builder = span_builder.with_parent_span_id(parent_span_id);
|
||||
}
|
||||
|
||||
let span = span_builder.build();
|
||||
// MCP spans also use plano(filter) service name as they are part of filter operations
|
||||
collector.record_span(operation_component::AGENT_FILTER, span);
|
||||
}
|
||||
|
||||
/// Process the filter chain of agents (all except the terminal agent)
|
||||
pub async fn process_filter_chain(
|
||||
&self,
|
||||
initial_request: &ChatCompletionsRequest,
|
||||
&mut self,
|
||||
chat_history: &[Message],
|
||||
agent_filter_chain: &AgentFilterChain,
|
||||
agent_map: &HashMap<String, Agent>,
|
||||
request_headers: &HeaderMap,
|
||||
trace_collector: Option<&std::sync::Arc<common::traces::TraceCollector>>,
|
||||
trace_id: String,
|
||||
parent_span_id: String,
|
||||
) -> Result<Vec<Message>, PipelineError> {
|
||||
let mut chat_completions_history = initial_request.messages.clone();
|
||||
let mut chat_history_updated = chat_history.to_vec();
|
||||
|
||||
for agent_name in &agent_filter_chain.filter_chain {
|
||||
debug!("Processing filter agent: {}", agent_name);
|
||||
|
|
@ -73,123 +207,490 @@ impl PipelineProcessor {
|
|||
.get(agent_name)
|
||||
.ok_or_else(|| PipelineError::AgentNotFound(agent_name.clone()))?;
|
||||
|
||||
debug!("Agent details: {:?}", agent);
|
||||
let tool_name = agent.tool.as_deref().unwrap_or(&agent.id);
|
||||
|
||||
let response_content = self
|
||||
.send_agent_filter_chain_request(
|
||||
&chat_completions_history,
|
||||
initial_request,
|
||||
info!(
|
||||
"executing filter: {}/{}, url: {}, conversation length: {}",
|
||||
agent_name,
|
||||
tool_name,
|
||||
agent.url,
|
||||
chat_history.len()
|
||||
);
|
||||
|
||||
let start_time = SystemTime::now();
|
||||
let start_instant = Instant::now();
|
||||
|
||||
// Generate filter span ID before execution so MCP spans can use it as parent
|
||||
let filter_span_id = generate_random_span_id();
|
||||
|
||||
chat_history_updated = self
|
||||
.execute_filter(
|
||||
&chat_history_updated,
|
||||
agent,
|
||||
request_headers,
|
||||
trace_collector,
|
||||
trace_id.clone(),
|
||||
filter_span_id.clone(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
debug!("Received response from filter agent {}", agent_name);
|
||||
let end_time = SystemTime::now();
|
||||
let elapsed = start_instant.elapsed();
|
||||
|
||||
// Parse the response content as new message history
|
||||
chat_completions_history =
|
||||
serde_json::from_str(&response_content).inspect_err(|err| {
|
||||
warn!(
|
||||
"Failed to parse response from agent {}, err: {}, response: {}",
|
||||
agent_name, err, response_content
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
Ok(chat_completions_history)
|
||||
}
|
||||
|
||||
/// Send request to a specific agent and return the response content
|
||||
async fn send_agent_filter_chain_request(
|
||||
&self,
|
||||
messages: &[Message],
|
||||
original_request: &ChatCompletionsRequest,
|
||||
agent: &Agent,
|
||||
request_headers: &HeaderMap,
|
||||
) -> Result<String, PipelineError> {
|
||||
let mut request = original_request.clone();
|
||||
request.messages = messages.to_vec();
|
||||
|
||||
let request_body = serde_json::to_string(&request)?;
|
||||
debug!("Sending request to agent {}", agent.id);
|
||||
|
||||
let mut agent_headers = request_headers.clone();
|
||||
agent_headers.remove(hyper::header::CONTENT_LENGTH);
|
||||
agent_headers.insert(
|
||||
ARCH_UPSTREAM_HOST_HEADER,
|
||||
hyper::header::HeaderValue::from_str(&agent.id)
|
||||
.map_err(|_| PipelineError::AgentNotFound(agent.id.clone()))?,
|
||||
info!(
|
||||
"Filter '{}' completed in {:.2}ms, updated conversation length: {}",
|
||||
agent_name,
|
||||
elapsed.as_secs_f64() * 1000.0,
|
||||
chat_history_updated.len()
|
||||
);
|
||||
|
||||
agent_headers.insert(
|
||||
// Record span for this filter execution
|
||||
if let Some(collector) = trace_collector {
|
||||
self.record_filter_span(
|
||||
collector,
|
||||
agent_name,
|
||||
tool_name,
|
||||
start_time,
|
||||
end_time,
|
||||
elapsed,
|
||||
trace_id.clone(),
|
||||
parent_span_id.clone(),
|
||||
filter_span_id,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(chat_history_updated)
|
||||
}
|
||||
|
||||
/// Build common MCP headers for requests
|
||||
fn build_mcp_headers(
|
||||
&self,
|
||||
request_headers: &HeaderMap,
|
||||
agent_id: &str,
|
||||
session_id: Option<&str>,
|
||||
trace_id: String,
|
||||
parent_span_id: String,
|
||||
) -> Result<HeaderMap, PipelineError> {
|
||||
let trace_parent = format!("00-{}-{}-01", trace_id, parent_span_id);
|
||||
let mut headers = request_headers.clone();
|
||||
headers.remove(hyper::header::CONTENT_LENGTH);
|
||||
|
||||
headers.remove(TRACE_PARENT_HEADER);
|
||||
headers.insert(
|
||||
TRACE_PARENT_HEADER,
|
||||
hyper::header::HeaderValue::from_str(&trace_parent).unwrap(),
|
||||
);
|
||||
|
||||
headers.insert(
|
||||
ARCH_UPSTREAM_HOST_HEADER,
|
||||
hyper::header::HeaderValue::from_str(agent_id)
|
||||
.map_err(|_| PipelineError::AgentNotFound(agent_id.to_string()))?,
|
||||
);
|
||||
|
||||
headers.insert(
|
||||
ENVOY_RETRY_HEADER,
|
||||
hyper::header::HeaderValue::from_str("3").unwrap(),
|
||||
);
|
||||
|
||||
headers.insert(
|
||||
"Accept",
|
||||
hyper::header::HeaderValue::from_static("application/json, text/event-stream"),
|
||||
);
|
||||
|
||||
headers.insert(
|
||||
"Content-Type",
|
||||
hyper::header::HeaderValue::from_static("application/json"),
|
||||
);
|
||||
|
||||
if let Some(sid) = session_id {
|
||||
headers.insert(
|
||||
"mcp-session-id",
|
||||
hyper::header::HeaderValue::from_str(sid).unwrap(),
|
||||
);
|
||||
}
|
||||
|
||||
Ok(headers)
|
||||
}
|
||||
|
||||
/// Parse SSE formatted response and extract JSON-RPC data
|
||||
fn parse_sse_response(
|
||||
&self,
|
||||
response_bytes: &[u8],
|
||||
agent_id: &str,
|
||||
) -> Result<String, PipelineError> {
|
||||
let response_str = String::from_utf8_lossy(response_bytes);
|
||||
let lines: Vec<&str> = response_str.lines().collect();
|
||||
|
||||
// Validate SSE format: first line should be "event: message"
|
||||
if lines.is_empty() || lines[0] != "event: message" {
|
||||
warn!(
|
||||
"Invalid SSE response format from agent {}: expected 'event: message' as first line, got: {:?}",
|
||||
agent_id,
|
||||
lines.first()
|
||||
);
|
||||
return Err(PipelineError::NoContentInResponse(format!(
|
||||
"Invalid SSE response format from agent {}: expected 'event: message' as first line",
|
||||
agent_id
|
||||
)));
|
||||
}
|
||||
|
||||
// Find the data line
|
||||
let data_lines: Vec<&str> = lines
|
||||
.iter()
|
||||
.filter(|line| line.starts_with("data: "))
|
||||
.copied()
|
||||
.collect();
|
||||
|
||||
if data_lines.len() != 1 {
|
||||
warn!(
|
||||
"Expected exactly one 'data:' line from agent {}, found {}",
|
||||
agent_id,
|
||||
data_lines.len()
|
||||
);
|
||||
return Err(PipelineError::NoContentInResponse(format!(
|
||||
"Expected exactly one 'data:' line from agent {}, found {}",
|
||||
agent_id,
|
||||
data_lines.len()
|
||||
)));
|
||||
}
|
||||
|
||||
// Skip "data: " prefix
|
||||
Ok(data_lines[0][6..].to_string())
|
||||
}
|
||||
|
||||
/// Send an MCP request and return the response
|
||||
async fn send_mcp_request(
|
||||
&self,
|
||||
json_rpc_request: &JsonRpcRequest,
|
||||
headers: HeaderMap,
|
||||
agent_id: &str,
|
||||
) -> Result<reqwest::Response, PipelineError> {
|
||||
let request_body = serde_json::to_string(json_rpc_request)?;
|
||||
|
||||
debug!(
|
||||
"Sending MCP request to agent {}: {}",
|
||||
agent_id, request_body
|
||||
);
|
||||
|
||||
let response = self
|
||||
.client
|
||||
.post(&self.url)
|
||||
.headers(agent_headers)
|
||||
.post(format!("{}/mcp", self.url))
|
||||
.headers(headers)
|
||||
.body(request_body)
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
let status = response.status();
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
/// Build a tools/call JSON-RPC request
|
||||
fn build_tool_call_request(
|
||||
&self,
|
||||
tool_name: &str,
|
||||
messages: &[Message],
|
||||
) -> Result<JsonRpcRequest, PipelineError> {
|
||||
let mut arguments = HashMap::new();
|
||||
arguments.insert("messages".to_string(), serde_json::to_value(messages)?);
|
||||
|
||||
let mut params = HashMap::new();
|
||||
params.insert("name".to_string(), serde_json::to_value(tool_name)?);
|
||||
params.insert("arguments".to_string(), serde_json::to_value(arguments)?);
|
||||
|
||||
Ok(JsonRpcRequest {
|
||||
jsonrpc: JSON_RPC_VERSION.to_string(),
|
||||
id: JsonRpcId::String(Uuid::new_v4().to_string()),
|
||||
method: TOOL_CALL_METHOD.to_string(),
|
||||
params: Some(params),
|
||||
})
|
||||
}
|
||||
|
||||
/// Send request to a specific agent and return the response content
|
||||
async fn execute_filter(
|
||||
&mut self,
|
||||
messages: &[Message],
|
||||
agent: &Agent,
|
||||
request_headers: &HeaderMap,
|
||||
trace_collector: Option<&std::sync::Arc<common::traces::TraceCollector>>,
|
||||
trace_id: String,
|
||||
filter_span_id: String,
|
||||
) -> Result<Vec<Message>, PipelineError> {
|
||||
// Get or create MCP session
|
||||
let mcp_session_id = if let Some(session_id) = self.agent_id_session_map.get(&agent.id) {
|
||||
session_id.clone()
|
||||
} else {
|
||||
let session_id = self
|
||||
.get_new_session_id(
|
||||
&agent.id,
|
||||
trace_id.clone(),
|
||||
filter_span_id.clone(),
|
||||
)
|
||||
.await;
|
||||
self.agent_id_session_map
|
||||
.insert(agent.id.clone(), session_id.clone());
|
||||
session_id
|
||||
};
|
||||
|
||||
info!(
|
||||
"Using MCP session ID {} for agent {}",
|
||||
mcp_session_id, agent.id
|
||||
);
|
||||
|
||||
// Build JSON-RPC request
|
||||
let tool_name = agent.tool.as_deref().unwrap_or(&agent.id);
|
||||
let json_rpc_request = self.build_tool_call_request(tool_name, messages)?;
|
||||
|
||||
// Generate span ID for this MCP tool call (child of filter span)
|
||||
let mcp_span_id = generate_random_span_id();
|
||||
|
||||
// Build headers
|
||||
let agent_headers =
|
||||
self.build_mcp_headers(request_headers, &agent.id, Some(&mcp_session_id), trace_id.clone(), mcp_span_id.clone())?;
|
||||
|
||||
// Send request with tracing
|
||||
let start_time = SystemTime::now();
|
||||
let start_instant = Instant::now();
|
||||
|
||||
let response = self
|
||||
.send_mcp_request(
|
||||
&json_rpc_request,
|
||||
agent_headers,
|
||||
&agent.id,
|
||||
)
|
||||
.await?;
|
||||
let http_status = response.status();
|
||||
let response_bytes = response.bytes().await?;
|
||||
|
||||
// Check for HTTP errors and handle them appropriately
|
||||
if !status.is_success() {
|
||||
let end_time = SystemTime::now();
|
||||
let elapsed = start_instant.elapsed();
|
||||
|
||||
// Record MCP tool call span
|
||||
if let Some(collector) = trace_collector {
|
||||
let mut attrs = HashMap::new();
|
||||
attrs.insert("mcp.method", "tools/call".to_string());
|
||||
attrs.insert("mcp.tool_name", tool_name.to_string());
|
||||
attrs.insert("mcp.session_id", mcp_session_id.clone());
|
||||
attrs.insert("http.status_code", http_status.as_u16().to_string());
|
||||
|
||||
self.record_mcp_span(
|
||||
collector,
|
||||
"tool_call",
|
||||
&agent.id,
|
||||
start_time,
|
||||
end_time,
|
||||
elapsed,
|
||||
Some(attrs),
|
||||
trace_id.clone(),
|
||||
filter_span_id.clone(),
|
||||
Some(mcp_span_id),
|
||||
);
|
||||
}
|
||||
|
||||
// Handle HTTP errors
|
||||
if !http_status.is_success() {
|
||||
let error_body = String::from_utf8_lossy(&response_bytes).to_string();
|
||||
|
||||
if status.is_client_error() {
|
||||
// 4xx errors - cascade back to developer
|
||||
return Err(PipelineError::ClientError {
|
||||
return Err(if http_status.is_client_error() {
|
||||
PipelineError::ClientError {
|
||||
agent: agent.id.clone(),
|
||||
status: status.as_u16(),
|
||||
status: http_status.as_u16(),
|
||||
body: error_body,
|
||||
});
|
||||
} else if status.is_server_error() {
|
||||
// 5xx errors - server/agent error
|
||||
return Err(PipelineError::ServerError {
|
||||
}
|
||||
} else {
|
||||
PipelineError::ServerError {
|
||||
agent: agent.id.clone(),
|
||||
status: status.as_u16(),
|
||||
status: http_status.as_u16(),
|
||||
body: error_body,
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Parse the response as JSON to extract the content
|
||||
let response_json: serde_json::Value = serde_json::from_slice(&response_bytes)?;
|
||||
info!(
|
||||
"Response from agent {}: {}",
|
||||
agent.id,
|
||||
String::from_utf8_lossy(&response_bytes)
|
||||
);
|
||||
|
||||
let content = response_json
|
||||
.get("choices")
|
||||
.and_then(|choices| choices.as_array())
|
||||
.and_then(|choices| choices.first())
|
||||
.and_then(|choice| choice.get("message"))
|
||||
.and_then(|message| message.get("content"))
|
||||
.and_then(|content| content.as_str())
|
||||
.ok_or_else(|| PipelineError::NoContentInResponse(agent.id.clone()))?
|
||||
// Parse SSE response
|
||||
let data_chunk = self.parse_sse_response(&response_bytes, &agent.id)?;
|
||||
let response: JsonRpcResponse = serde_json::from_str(&data_chunk)?;
|
||||
let response_result = response
|
||||
.result
|
||||
.ok_or_else(|| PipelineError::NoResultInResponse(agent.id.clone()))?;
|
||||
|
||||
// Check if error field is set in response result
|
||||
if response_result
|
||||
.get("isError")
|
||||
.and_then(|v| v.as_bool())
|
||||
.unwrap_or(false)
|
||||
{
|
||||
let error_message = response_result
|
||||
.get("content")
|
||||
.and_then(|v| v.as_array())
|
||||
.and_then(|arr| arr.first())
|
||||
.and_then(|v| v.get("text"))
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("unknown_error")
|
||||
.to_string();
|
||||
|
||||
Ok(content)
|
||||
return Err(PipelineError::ClientError {
|
||||
agent: agent.id.clone(),
|
||||
status: http_status.as_u16(),
|
||||
body: error_message,
|
||||
});
|
||||
}
|
||||
|
||||
// Extract structured content and parse messages
|
||||
let response_json = response_result
|
||||
.get("structuredContent")
|
||||
.ok_or_else(|| PipelineError::NoStructuredContentInResponse(agent.id.clone()))?;
|
||||
|
||||
let messages: Vec<Message> = response_json
|
||||
.get("result")
|
||||
.and_then(|v| v.as_array())
|
||||
.ok_or_else(|| PipelineError::NoMessagesInResponse(agent.id.clone()))?
|
||||
.iter()
|
||||
.map(|msg_value| serde_json::from_value(msg_value.clone()))
|
||||
.collect::<Result<Vec<Message>, _>>()
|
||||
.map_err(PipelineError::ParseError)?;
|
||||
|
||||
Ok(messages)
|
||||
}
|
||||
|
||||
/// Build an initialize JSON-RPC request
|
||||
fn build_initialize_request(&self) -> JsonRpcRequest {
|
||||
JsonRpcRequest {
|
||||
jsonrpc: JSON_RPC_VERSION.to_string(),
|
||||
id: JsonRpcId::String(Uuid::new_v4().to_string()),
|
||||
method: MCP_INITIALIZE.to_string(),
|
||||
params: Some({
|
||||
let mut params = HashMap::new();
|
||||
params.insert(
|
||||
"protocolVersion".to_string(),
|
||||
serde_json::Value::String("2024-11-05".to_string()),
|
||||
);
|
||||
params.insert("capabilities".to_string(), serde_json::json!({}));
|
||||
params.insert(
|
||||
"clientInfo".to_string(),
|
||||
serde_json::json!({
|
||||
"name": BRIGHT_STAFF_SERVICE_NAME,
|
||||
"version": "1.0.0"
|
||||
}),
|
||||
);
|
||||
params
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
/// Send initialized notification after session creation
|
||||
async fn send_initialized_notification(
|
||||
&self,
|
||||
agent_id: &str,
|
||||
session_id: &str,
|
||||
trace_id: String,
|
||||
parent_span_id: String,
|
||||
) -> Result<(), PipelineError> {
|
||||
let initialized_notification = JsonRpcNotification {
|
||||
jsonrpc: JSON_RPC_VERSION.to_string(),
|
||||
method: MCP_INITIALIZE_NOTIFICATION.to_string(),
|
||||
params: None,
|
||||
};
|
||||
|
||||
let notification_body = serde_json::to_string(&initialized_notification)?;
|
||||
debug!("Sending initialized notification for agent {}", agent_id);
|
||||
|
||||
let headers = self.build_mcp_headers(&HeaderMap::new(), agent_id, Some(session_id), trace_id.clone(), parent_span_id.clone())?;
|
||||
|
||||
let response = self
|
||||
.client
|
||||
.post(format!("{}/mcp", self.url))
|
||||
.headers(headers)
|
||||
.body(notification_body)
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
info!(
|
||||
"Initialized notification response status: {}",
|
||||
response.status()
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn get_new_session_id(
|
||||
&self,
|
||||
agent_id: &str,
|
||||
trace_id: String,
|
||||
parent_span_id: String,
|
||||
) -> String {
|
||||
info!("Initializing MCP session for agent {}", agent_id);
|
||||
|
||||
let initialize_request = self.build_initialize_request();
|
||||
let headers = self
|
||||
.build_mcp_headers(&HeaderMap::new(), agent_id, None, trace_id.clone(), parent_span_id.clone())
|
||||
.expect("Failed to build headers for initialization");
|
||||
|
||||
let response = self
|
||||
.send_mcp_request(&initialize_request, headers, agent_id)
|
||||
.await
|
||||
.expect("Failed to initialize MCP session");
|
||||
|
||||
info!("Initialize response status: {}", response.status());
|
||||
|
||||
let session_id = response
|
||||
.headers()
|
||||
.get("mcp-session-id")
|
||||
.and_then(|v| v.to_str().ok())
|
||||
.expect("No mcp-session-id in response")
|
||||
.to_string();
|
||||
|
||||
info!(
|
||||
"Created new MCP session for agent {}: {}",
|
||||
agent_id, session_id
|
||||
);
|
||||
|
||||
// Send initialized notification
|
||||
self.send_initialized_notification(
|
||||
agent_id,
|
||||
&session_id,
|
||||
trace_id.clone(),
|
||||
parent_span_id.clone(),
|
||||
)
|
||||
.await
|
||||
.expect("Failed to send initialized notification");
|
||||
|
||||
session_id
|
||||
}
|
||||
|
||||
/// Send request to terminal agent and return the raw response for streaming
|
||||
pub async fn invoke_upstream_agent(
|
||||
pub async fn invoke_agent(
|
||||
&self,
|
||||
messages: &[Message],
|
||||
original_request: &ChatCompletionsRequest,
|
||||
mut original_request: ProviderRequestType,
|
||||
terminal_agent: &Agent,
|
||||
request_headers: &HeaderMap,
|
||||
trace_id: String,
|
||||
agent_span_id: String,
|
||||
) -> Result<reqwest::Response, PipelineError> {
|
||||
let mut request = original_request.clone();
|
||||
request.messages = messages.to_vec();
|
||||
// let mut request = original_request.clone();
|
||||
original_request.set_messages(messages);
|
||||
|
||||
let request_body = serde_json::to_string(&request)?;
|
||||
let request_body = ProviderRequestType::to_bytes(&original_request).unwrap();
|
||||
// let request_body = serde_json::to_string(&request)?;
|
||||
debug!("Sending request to terminal agent {}", terminal_agent.id);
|
||||
|
||||
let mut agent_headers = request_headers.clone();
|
||||
agent_headers.remove(hyper::header::CONTENT_LENGTH);
|
||||
|
||||
// Set traceparent header to make the egress span a child of the agent span
|
||||
if !trace_id.is_empty() && !agent_span_id.is_empty() {
|
||||
let trace_parent = format!("00-{}-{}-01", trace_id, agent_span_id);
|
||||
agent_headers.remove(TRACE_PARENT_HEADER);
|
||||
agent_headers.insert(
|
||||
TRACE_PARENT_HEADER,
|
||||
hyper::header::HeaderValue::from_str(&trace_parent).unwrap(),
|
||||
);
|
||||
}
|
||||
|
||||
agent_headers.insert(
|
||||
ARCH_UPSTREAM_HOST_HEADER,
|
||||
hyper::header::HeaderValue::from_str(&terminal_agent.id)
|
||||
|
|
@ -203,7 +704,7 @@ impl PipelineProcessor {
|
|||
|
||||
let response = self
|
||||
.client
|
||||
.post(&self.url)
|
||||
.post(format!("{}/v1/chat/completions", self.url))
|
||||
.headers(agent_headers)
|
||||
.body(request_body)
|
||||
.send()
|
||||
|
|
@ -217,6 +718,7 @@ impl PipelineProcessor {
|
|||
mod tests {
|
||||
use super::*;
|
||||
use hermesllm::apis::openai::{Message, MessageContent, Role};
|
||||
use mockito::Server;
|
||||
use std::collections::HashMap;
|
||||
|
||||
fn create_test_message(role: Role, content: &str) -> Message {
|
||||
|
|
@ -240,23 +742,149 @@ mod tests {
|
|||
|
||||
#[tokio::test]
|
||||
async fn test_agent_not_found_error() {
|
||||
let processor = PipelineProcessor::default();
|
||||
let mut processor = PipelineProcessor::default();
|
||||
let agent_map = HashMap::new();
|
||||
let request_headers = HeaderMap::new();
|
||||
|
||||
let initial_request = ChatCompletionsRequest {
|
||||
messages: vec![create_test_message(Role::User, "Hello")],
|
||||
model: "test-model".to_string(),
|
||||
..Default::default()
|
||||
};
|
||||
let messages = vec![create_test_message(Role::User, "Hello")];
|
||||
|
||||
let pipeline = create_test_pipeline(vec!["nonexistent-agent", "terminal-agent"]);
|
||||
|
||||
let result = processor
|
||||
.process_filter_chain(&initial_request, &pipeline, &agent_map, &request_headers)
|
||||
.process_filter_chain(&messages, &pipeline, &agent_map, &request_headers, None, String::new(), String::new())
|
||||
.await;
|
||||
|
||||
assert!(result.is_err());
|
||||
matches!(result.unwrap_err(), PipelineError::AgentNotFound(_));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_execute_filter_http_status_error() {
|
||||
let mut server = Server::new_async().await;
|
||||
let _m = server
|
||||
.mock("POST", "/mcp")
|
||||
.with_status(500)
|
||||
.with_body("boom")
|
||||
.create();
|
||||
|
||||
let server_url = server.url();
|
||||
let mut processor = PipelineProcessor::new(server_url.clone());
|
||||
processor
|
||||
.agent_id_session_map
|
||||
.insert("agent-1".to_string(), "session-1".to_string());
|
||||
|
||||
let agent = Agent {
|
||||
id: "agent-1".to_string(),
|
||||
transport: None,
|
||||
tool: None,
|
||||
url: server_url,
|
||||
agent_type: None,
|
||||
};
|
||||
|
||||
let messages = vec![create_test_message(Role::User, "Hello")];
|
||||
let request_headers = HeaderMap::new();
|
||||
|
||||
let result = processor
|
||||
.execute_filter(&messages, &agent, &request_headers, None, "trace-123".to_string(), "span-123".to_string())
|
||||
.await;
|
||||
|
||||
match result {
|
||||
Err(PipelineError::ServerError { status, body, .. }) => {
|
||||
assert_eq!(status, 500);
|
||||
assert_eq!(body, "boom");
|
||||
}
|
||||
_ => panic!("Expected server error for 500 status"),
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_execute_filter_http_client_error() {
|
||||
let mut server = Server::new_async().await;
|
||||
let _m = server
|
||||
.mock("POST", "/mcp")
|
||||
.with_status(400)
|
||||
.with_body("bad request")
|
||||
.create();
|
||||
|
||||
let server_url = server.url();
|
||||
let mut processor = PipelineProcessor::new(server_url.clone());
|
||||
processor
|
||||
.agent_id_session_map
|
||||
.insert("agent-3".to_string(), "session-3".to_string());
|
||||
|
||||
let agent = Agent {
|
||||
id: "agent-3".to_string(),
|
||||
transport: None,
|
||||
tool: None,
|
||||
url: server_url,
|
||||
agent_type: None,
|
||||
};
|
||||
|
||||
let messages = vec![create_test_message(Role::User, "Ping")];
|
||||
let request_headers = HeaderMap::new();
|
||||
|
||||
let result = processor
|
||||
.execute_filter(&messages, &agent, &request_headers, None, "trace-456".to_string(), "span-456".to_string())
|
||||
.await;
|
||||
|
||||
match result {
|
||||
Err(PipelineError::ClientError { status, body, .. }) => {
|
||||
assert_eq!(status, 400);
|
||||
assert_eq!(body, "bad request");
|
||||
}
|
||||
_ => panic!("Expected client error for 400 status"),
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_execute_filter_mcp_error_flag() {
|
||||
let rpc_body = serde_json::json!({
|
||||
"jsonrpc": JSON_RPC_VERSION,
|
||||
"id": "1",
|
||||
"result": {
|
||||
"isError": true,
|
||||
"content": [
|
||||
{ "text": "bad tool call" }
|
||||
]
|
||||
}
|
||||
});
|
||||
|
||||
let sse_body = format!("event: message\ndata: {}\n\n", rpc_body.to_string());
|
||||
|
||||
let mut server = Server::new_async().await;
|
||||
let _m = server
|
||||
.mock("POST", "/mcp")
|
||||
.with_status(200)
|
||||
.with_body(sse_body)
|
||||
.create();
|
||||
|
||||
let server_url = server.url();
|
||||
let mut processor = PipelineProcessor::new(server_url.clone());
|
||||
processor
|
||||
.agent_id_session_map
|
||||
.insert("agent-2".to_string(), "session-2".to_string());
|
||||
|
||||
let agent = Agent {
|
||||
id: "agent-2".to_string(),
|
||||
transport: None,
|
||||
tool: None,
|
||||
url: server_url,
|
||||
agent_type: None,
|
||||
};
|
||||
|
||||
let messages = vec![create_test_message(Role::User, "Hi")];
|
||||
let request_headers = HeaderMap::new();
|
||||
|
||||
let result = processor
|
||||
.execute_filter(&messages, &agent, &request_headers, None, "trace-789".to_string(), "span-789".to_string())
|
||||
.await;
|
||||
|
||||
match result {
|
||||
Err(PipelineError::ClientError { status, body, .. }) => {
|
||||
assert_eq!(status, 200);
|
||||
assert_eq!(body, "bad tool call");
|
||||
}
|
||||
_ => panic!("Expected client error when isError flag is set"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,14 +1,14 @@
|
|||
use brightstaff::handlers::agent_chat_completions::agent_chat;
|
||||
use brightstaff::handlers::function_calling::function_calling_chat_handler;
|
||||
use brightstaff::handlers::llm::llm_chat;
|
||||
use brightstaff::handlers::models::list_models;
|
||||
use brightstaff::handlers::function_calling::{function_calling_chat_handler};
|
||||
use brightstaff::router::llm_router::RouterService;
|
||||
use brightstaff::state::StateStorage;
|
||||
use brightstaff::state::postgresql::PostgreSQLConversationStorage;
|
||||
use brightstaff::state::memory::MemoryConversationalStorage;
|
||||
use brightstaff::utils::tracing::init_tracer;
|
||||
use bytes::Bytes;
|
||||
use common::configuration::Configuration;
|
||||
use common::configuration::{Agent, Configuration};
|
||||
use common::consts::{CHAT_COMPLETIONS_PATH, MESSAGES_PATH, OPENAI_RESPONSES_API_PATH};
|
||||
use common::traces::TraceCollector;
|
||||
use http_body_util::{combinators::BoxBody, BodyExt, Empty};
|
||||
|
|
@ -63,8 +63,18 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
|||
|
||||
let arch_config = Arc::new(config);
|
||||
|
||||
// combine agents and filters into a single list of agents
|
||||
let all_agents: Vec<Agent> = arch_config
|
||||
.agents
|
||||
.as_deref()
|
||||
.unwrap_or_default()
|
||||
.iter()
|
||||
.chain(arch_config.filters.as_deref().unwrap_or_default())
|
||||
.cloned()
|
||||
.collect();
|
||||
|
||||
let llm_providers = Arc::new(RwLock::new(arch_config.model_providers.clone()));
|
||||
let agents_list = Arc::new(RwLock::new(arch_config.agents.clone()));
|
||||
let combined_agents_filters_list = Arc::new(RwLock::new(Some(all_agents)));
|
||||
let listeners = Arc::new(RwLock::new(arch_config.listeners.clone()));
|
||||
let llm_provider_url =
|
||||
env::var("LLM_PROVIDER_ENDPOINT").unwrap_or_else(|_| "http://localhost:12001".to_string());
|
||||
|
|
@ -98,7 +108,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
|||
info!("Tracing configuration found in arch_config.yaml");
|
||||
Some(true)
|
||||
} else {
|
||||
info!("No tracing configuration in arch_config.yaml, will check OTEL_TRACING_ENABLED env var");
|
||||
info!(
|
||||
"No tracing configuration in arch_config.yaml, will check OTEL_TRACING_ENABLED env var"
|
||||
);
|
||||
None
|
||||
};
|
||||
let trace_collector = Arc::new(TraceCollector::new(tracing_enabled));
|
||||
|
|
@ -142,11 +154,13 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
|||
let io = TokioIo::new(stream);
|
||||
|
||||
let router_service: Arc<RouterService> = Arc::clone(&router_service);
|
||||
let model_aliases: Arc<Option<std::collections::HashMap<String, common::configuration::ModelAlias>>> = Arc::clone(&model_aliases);
|
||||
let model_aliases: Arc<
|
||||
Option<std::collections::HashMap<String, common::configuration::ModelAlias>>,
|
||||
> = Arc::clone(&model_aliases);
|
||||
let llm_provider_url = llm_provider_url.clone();
|
||||
|
||||
let llm_providers = llm_providers.clone();
|
||||
let agents_list = agents_list.clone();
|
||||
let agents_list = combined_agents_filters_list.clone();
|
||||
let listeners = listeners.clone();
|
||||
let trace_collector = trace_collector.clone();
|
||||
let state_storage = state_storage.clone();
|
||||
|
|
@ -162,28 +176,36 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
|||
let state_storage = state_storage.clone();
|
||||
|
||||
async move {
|
||||
match (req.method(), req.uri().path()) {
|
||||
(&Method::POST, CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH) => {
|
||||
let fully_qualified_url =
|
||||
format!("{}{}", llm_provider_url, req.uri().path());
|
||||
llm_chat(req, router_service, fully_qualified_url, model_aliases, llm_providers, trace_collector, state_storage)
|
||||
.with_context(parent_cx)
|
||||
.await
|
||||
}
|
||||
(&Method::POST, "/agents/v1/chat/completions") => {
|
||||
let fully_qualified_url =
|
||||
format!("{}{}", llm_provider_url, req.uri().path());
|
||||
agent_chat(
|
||||
let path = req.uri().path();
|
||||
// Check if path starts with /agents
|
||||
if path.starts_with("/agents") {
|
||||
// Check if it matches one of the agent API paths
|
||||
let stripped_path = path.strip_prefix("/agents").unwrap();
|
||||
if matches!(
|
||||
stripped_path,
|
||||
CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH
|
||||
) {
|
||||
let fully_qualified_url = format!("{}{}", llm_provider_url, stripped_path);
|
||||
return agent_chat(
|
||||
req,
|
||||
router_service,
|
||||
fully_qualified_url,
|
||||
agents_list,
|
||||
listeners,
|
||||
trace_collector,
|
||||
)
|
||||
.with_context(parent_cx)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
match (req.method(), path) {
|
||||
(&Method::POST, CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH) => {
|
||||
let fully_qualified_url =
|
||||
format!("{}{}", llm_provider_url, path);
|
||||
llm_chat(req, router_service, fully_qualified_url, model_aliases, llm_providers, trace_collector, state_storage)
|
||||
.with_context(parent_cx)
|
||||
.await
|
||||
}
|
||||
|
||||
(&Method::POST, "/function_calling") => {
|
||||
let fully_qualified_url =
|
||||
format!("{}{}", llm_provider_url, "/v1/chat/completions");
|
||||
|
|
|
|||
|
|
@ -157,7 +157,7 @@ pub mod operation_component {
|
|||
pub const HANDOFF: &str = "plano(handoff)";
|
||||
|
||||
/// Agent filter execution
|
||||
pub const AGENT_FILTER: &str = "plano(agent filter)";
|
||||
pub const AGENT_FILTER: &str = "plano(filter)";
|
||||
|
||||
/// Agent execution
|
||||
pub const AGENT: &str = "plano(agent)";
|
||||
|
|
@ -203,6 +203,7 @@ pub mod operation_component {
|
|||
pub struct OperationNameBuilder {
|
||||
method: Option<String>,
|
||||
path: Option<String>,
|
||||
operation: Option<String>,
|
||||
target: Option<String>,
|
||||
}
|
||||
|
||||
|
|
@ -212,6 +213,7 @@ impl OperationNameBuilder {
|
|||
Self {
|
||||
method: None,
|
||||
path: None,
|
||||
operation: None,
|
||||
target: None,
|
||||
}
|
||||
}
|
||||
|
|
@ -234,6 +236,15 @@ impl OperationNameBuilder {
|
|||
self
|
||||
}
|
||||
|
||||
/// Set the operation type (optional, for MCP operations)
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `operation` - Operation type (e.g., "tool_call", "session_init", "notification")
|
||||
pub fn with_operation(mut self, operation: impl Into<String>) -> Self {
|
||||
self.operation = Some(operation.into());
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the target (model name, agent name, or filter name)
|
||||
///
|
||||
/// # Arguments
|
||||
|
|
@ -246,7 +257,8 @@ impl OperationNameBuilder {
|
|||
/// Build the operation name string
|
||||
///
|
||||
/// # Format
|
||||
/// - With all components: `{method} {path} {target}`
|
||||
/// - With all components: `{method} {path} ({operation}) {target}`
|
||||
/// - Without operation: `{method} {path} {target}`
|
||||
/// - Without target: `{method} {path}`
|
||||
/// - Without path: `{method}`
|
||||
/// - Empty: returns empty string
|
||||
|
|
@ -258,8 +270,12 @@ impl OperationNameBuilder {
|
|||
}
|
||||
|
||||
if let Some(path) = self.path {
|
||||
if let Some(operation) = self.operation {
|
||||
parts.push(format!("{} ({})", path, operation));
|
||||
} else {
|
||||
parts.push(path);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(target) = self.target {
|
||||
parts.push(target);
|
||||
|
|
|
|||
1
crates/build.sh
Normal file
1
crates/build.sh
Normal file
|
|
@ -0,0 +1 @@
|
|||
cargo build --release --target wasm32-wasip1 -p prompt_gateway -p llm_gateway && cargo build --release -p brightstaff
|
||||
|
|
@ -21,8 +21,11 @@ pub struct ModelAlias {
|
|||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Agent {
|
||||
pub id: String,
|
||||
pub kind: Option<String>,
|
||||
pub transport: Option<String>,
|
||||
pub tool: Option<String>,
|
||||
pub url: String,
|
||||
#[serde(rename = "type")]
|
||||
pub agent_type: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
|
|
@ -71,6 +74,7 @@ pub struct Configuration {
|
|||
pub mode: Option<GatewayMode>,
|
||||
pub routing: Option<Routing>,
|
||||
pub agents: Option<Vec<Agent>>,
|
||||
pub filters: Option<Vec<Agent>>,
|
||||
pub listeners: Vec<Listener>,
|
||||
pub state_storage: Option<StateStorageConfig>,
|
||||
}
|
||||
|
|
|
|||
|
|
@ -32,3 +32,4 @@ pub const OTEL_COLLECTOR_HTTP: &str = "opentelemetry_collector_http";
|
|||
pub const OTEL_POST_PATH: &str = "/v1/traces";
|
||||
pub const LLM_ROUTE_HEADER: &str = "x-arch-llm-route";
|
||||
pub const ENVOY_RETRY_HEADER: &str = "x-envoy-max-retries";
|
||||
pub const BRIGHT_STAFF_SERVICE_NAME : &str = "brightstaff";
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ pub use shapes::{
|
|||
};
|
||||
|
||||
// Re-export new utilities
|
||||
pub use span_builder::{SpanBuilder, SpanKind};
|
||||
pub use span_builder::{SpanBuilder, SpanKind, generate_random_span_id};
|
||||
pub use resource_span_builder::ResourceSpanBuilder;
|
||||
pub use constants::*;
|
||||
|
||||
|
|
|
|||
|
|
@ -37,6 +37,7 @@ pub struct SpanBuilder {
|
|||
end_time: Option<SystemTime>,
|
||||
kind: SpanKind,
|
||||
attributes: HashMap<String, String>,
|
||||
span_id: Option<String>,
|
||||
}
|
||||
|
||||
impl SpanBuilder {
|
||||
|
|
@ -53,6 +54,7 @@ impl SpanBuilder {
|
|||
end_time: None,
|
||||
kind: SpanKind::Internal,
|
||||
attributes: HashMap::new(),
|
||||
span_id: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -62,6 +64,11 @@ impl SpanBuilder {
|
|||
self
|
||||
}
|
||||
|
||||
pub fn with_span_id(mut self, span_id: impl Into<String>) -> Self {
|
||||
self.span_id = Some(span_id.into());
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the parent span ID to link this span to its parent
|
||||
pub fn with_parent_span_id(mut self, parent_span_id: impl Into<String>) -> Self {
|
||||
self.parent_span_id = Some(parent_span_id.into());
|
||||
|
|
@ -125,7 +132,7 @@ impl SpanBuilder {
|
|||
// Build span directly without going through Span::new()
|
||||
Span {
|
||||
trace_id,
|
||||
span_id: generate_random_span_id(),
|
||||
span_id: self.span_id.unwrap_or_else(|| generate_random_span_id()),
|
||||
parent_span_id: self.parent_span_id,
|
||||
name: self.name,
|
||||
start_time_unix_nano: format!("{}", start_nanos),
|
||||
|
|
@ -145,7 +152,7 @@ fn system_time_to_nanos(time: SystemTime) -> u128 {
|
|||
}
|
||||
|
||||
/// Generate a random span ID (16 hex characters = 8 bytes)
|
||||
fn generate_random_span_id() -> String {
|
||||
pub fn generate_random_span_id() -> String {
|
||||
use rand::RngCore;
|
||||
let mut rng = rand::thread_rng();
|
||||
let mut random_bytes = [0u8; 8];
|
||||
|
|
|
|||
|
|
@ -233,6 +233,104 @@ impl ProviderRequest for ConverseRequest {
|
|||
fn get_temperature(&self) -> Option<f32> {
|
||||
self.inference_config.as_ref()?.temperature
|
||||
}
|
||||
|
||||
fn get_messages(&self) -> Vec<crate::apis::openai::Message> {
|
||||
use crate::apis::openai::{Message, MessageContent, Role};
|
||||
|
||||
let mut openai_messages = Vec::new();
|
||||
|
||||
// Add system messages if present
|
||||
if let Some(system) = &self.system {
|
||||
for sys_block in system {
|
||||
match sys_block {
|
||||
SystemContentBlock::Text { text } => {
|
||||
openai_messages.push(Message {
|
||||
role: Role::System,
|
||||
content: MessageContent::Text(text.clone()),
|
||||
name: None,
|
||||
tool_calls: None,
|
||||
tool_call_id: None,
|
||||
});
|
||||
}
|
||||
_ => {} // Skip other system content types
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Convert conversation messages
|
||||
if let Some(messages) = &self.messages {
|
||||
for msg in messages {
|
||||
let role = match msg.role {
|
||||
ConversationRole::User => Role::User,
|
||||
ConversationRole::Assistant => Role::Assistant,
|
||||
};
|
||||
|
||||
// Extract text from content blocks
|
||||
let content = msg.content.iter()
|
||||
.filter_map(|block| {
|
||||
if let ContentBlock::Text { text } = block {
|
||||
Some(text.clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
|
||||
openai_messages.push(Message {
|
||||
role,
|
||||
content: MessageContent::Text(content),
|
||||
name: None,
|
||||
tool_calls: None,
|
||||
tool_call_id: None,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
openai_messages
|
||||
}
|
||||
|
||||
fn set_messages(&mut self, messages: &[crate::apis::openai::Message]) {
|
||||
// Convert OpenAI messages to Bedrock format
|
||||
use crate::apis::amazon_bedrock::{ContentBlock, ConversationRole, SystemContentBlock};
|
||||
|
||||
let mut system_blocks = Vec::new();
|
||||
let mut bedrock_messages = Vec::new();
|
||||
|
||||
for msg in messages {
|
||||
match msg.role {
|
||||
crate::apis::openai::Role::System => {
|
||||
if let crate::apis::openai::MessageContent::Text(text) = &msg.content {
|
||||
system_blocks.push(SystemContentBlock::Text { text: text.clone() });
|
||||
}
|
||||
}
|
||||
crate::apis::openai::Role::User | crate::apis::openai::Role::Assistant => {
|
||||
let role = match msg.role {
|
||||
crate::apis::openai::Role::User => ConversationRole::User,
|
||||
crate::apis::openai::Role::Assistant => ConversationRole::Assistant,
|
||||
_ => continue,
|
||||
};
|
||||
|
||||
let content = if let crate::apis::openai::MessageContent::Text(text) = &msg.content {
|
||||
vec![ContentBlock::Text { text: text.clone() }]
|
||||
} else {
|
||||
vec![]
|
||||
};
|
||||
|
||||
bedrock_messages.push(crate::apis::amazon_bedrock::Message {
|
||||
role,
|
||||
content,
|
||||
});
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
if !system_blocks.is_empty() {
|
||||
self.system = Some(system_blocks);
|
||||
}
|
||||
self.messages = Some(bedrock_messages);
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
|
|
|
|||
|
|
@ -541,6 +541,65 @@ impl ProviderRequest for MessagesRequest {
|
|||
fn get_temperature(&self) -> Option<f32> {
|
||||
self.temperature
|
||||
}
|
||||
|
||||
fn get_messages(&self) -> Vec<crate::apis::openai::Message> {
|
||||
use crate::apis::openai::Message;
|
||||
|
||||
let mut openai_messages = Vec::new();
|
||||
|
||||
// Add system prompt as system message if present
|
||||
if let Some(system) = &self.system {
|
||||
openai_messages.push(system.clone().into());
|
||||
}
|
||||
|
||||
// Convert each Anthropic message to OpenAI format
|
||||
for msg in &self.messages {
|
||||
if let Ok(converted_msgs) = TryInto::<Vec<Message>>::try_into(msg.clone()) {
|
||||
openai_messages.extend(converted_msgs);
|
||||
}
|
||||
}
|
||||
|
||||
openai_messages
|
||||
}
|
||||
|
||||
fn set_messages(&mut self, messages: &[crate::apis::openai::Message]) {
|
||||
// Convert OpenAI messages to Anthropic format
|
||||
// Separate system messages from regular messages
|
||||
let mut system_messages = Vec::new();
|
||||
let mut regular_messages = Vec::new();
|
||||
|
||||
for msg in messages {
|
||||
if msg.role == crate::apis::openai::Role::System {
|
||||
system_messages.push(msg.clone());
|
||||
} else {
|
||||
regular_messages.push(msg.clone());
|
||||
}
|
||||
}
|
||||
|
||||
// Set system prompt if there are system messages
|
||||
if !system_messages.is_empty() {
|
||||
// Combine all system messages into one
|
||||
let system_text = system_messages.iter()
|
||||
.filter_map(|msg| {
|
||||
if let crate::apis::openai::MessageContent::Text(text) = &msg.content {
|
||||
Some(text.as_str())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
|
||||
self.system = Some(crate::apis::anthropic::MessagesSystemPrompt::Single(system_text));
|
||||
}
|
||||
|
||||
// Convert regular messages
|
||||
self.messages = regular_messages.iter()
|
||||
.filter_map(|msg| {
|
||||
msg.clone().try_into().ok()
|
||||
})
|
||||
.collect();
|
||||
}
|
||||
}
|
||||
|
||||
impl MessagesResponse {
|
||||
|
|
|
|||
|
|
@ -735,6 +735,14 @@ impl ProviderRequest for ChatCompletionsRequest {
|
|||
fn get_temperature(&self) -> Option<f32> {
|
||||
self.temperature
|
||||
}
|
||||
|
||||
fn get_messages(&self) -> Vec<crate::apis::openai::Message> {
|
||||
self.messages.clone()
|
||||
}
|
||||
|
||||
fn set_messages(&mut self, messages: &[crate::apis::openai::Message]) {
|
||||
self.messages = messages.to_vec();
|
||||
}
|
||||
}
|
||||
|
||||
/// Implementation of ProviderResponse for ChatCompletionsResponse
|
||||
|
|
|
|||
|
|
@ -1134,6 +1134,140 @@ impl ProviderRequest for ResponsesAPIRequest {
|
|||
fn get_temperature(&self) -> Option<f32> {
|
||||
self.temperature
|
||||
}
|
||||
|
||||
fn get_messages(&self) -> Vec<crate::apis::openai::Message> {
|
||||
use crate::apis::openai::{Message, MessageContent, Role};
|
||||
|
||||
let mut openai_messages = Vec::new();
|
||||
|
||||
// Add instructions as system message if present
|
||||
if let Some(instructions) = &self.instructions {
|
||||
openai_messages.push(Message {
|
||||
role: Role::System,
|
||||
content: MessageContent::Text(instructions.clone()),
|
||||
name: None,
|
||||
tool_calls: None,
|
||||
tool_call_id: None,
|
||||
});
|
||||
}
|
||||
|
||||
// Convert input to messages
|
||||
match &self.input {
|
||||
InputParam::Text(text) => {
|
||||
openai_messages.push(Message {
|
||||
role: Role::User,
|
||||
content: MessageContent::Text(text.clone()),
|
||||
name: None,
|
||||
tool_calls: None,
|
||||
tool_call_id: None,
|
||||
});
|
||||
}
|
||||
InputParam::Items(items) => {
|
||||
for item in items {
|
||||
match item {
|
||||
InputItem::Message(msg) => {
|
||||
// Convert message role
|
||||
let role = match msg.role {
|
||||
MessageRole::User => Role::User,
|
||||
MessageRole::Assistant => Role::Assistant,
|
||||
MessageRole::System => Role::System,
|
||||
MessageRole::Developer => Role::System, // Map developer to system
|
||||
};
|
||||
|
||||
// Extract text from message content
|
||||
let content = match &msg.content {
|
||||
crate::apis::openai_responses::MessageContent::Text(text) => text.clone(),
|
||||
crate::apis::openai_responses::MessageContent::Items(items) => {
|
||||
items.iter()
|
||||
.filter_map(|c| {
|
||||
if let InputContent::InputText { text } = c {
|
||||
Some(text.clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n")
|
||||
}
|
||||
};
|
||||
|
||||
openai_messages.push(Message {
|
||||
role,
|
||||
content: MessageContent::Text(content),
|
||||
name: None,
|
||||
tool_calls: None,
|
||||
tool_call_id: None,
|
||||
});
|
||||
}
|
||||
// Skip other input item types for now
|
||||
InputItem::ItemReference { .. } | InputItem::FunctionCallOutput { .. } => {
|
||||
// These are not yet supported in agent framework
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
openai_messages
|
||||
}
|
||||
|
||||
fn set_messages(&mut self, messages: &[crate::apis::openai::Message]) {
|
||||
// For ResponsesAPI, we need to convert messages back to input format
|
||||
// Extract system messages as instructions
|
||||
let system_text = messages.iter()
|
||||
.filter(|msg| msg.role == crate::apis::openai::Role::System)
|
||||
.filter_map(|msg| {
|
||||
if let crate::apis::openai::MessageContent::Text(text) = &msg.content {
|
||||
Some(text.as_str())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
|
||||
if !system_text.is_empty() {
|
||||
self.instructions = Some(system_text);
|
||||
}
|
||||
|
||||
// Convert user/assistant messages to InputParam
|
||||
// For simplicity, we'll use the last user message as the input
|
||||
// or combine all non-system messages
|
||||
let input_messages: Vec<_> = messages.iter()
|
||||
.filter(|msg| msg.role != crate::apis::openai::Role::System)
|
||||
.collect();
|
||||
|
||||
if !input_messages.is_empty() {
|
||||
// If there's only one message, use Text format
|
||||
if input_messages.len() == 1 {
|
||||
if let crate::apis::openai::MessageContent::Text(text) = &input_messages[0].content {
|
||||
self.input = crate::apis::openai_responses::InputParam::Text(text.clone());
|
||||
}
|
||||
} else {
|
||||
// Multiple messages - combine them as text for now
|
||||
// A more sophisticated approach would use InputParam::Items
|
||||
let combined_text = input_messages.iter()
|
||||
.filter_map(|msg| {
|
||||
if let crate::apis::openai::MessageContent::Text(text) = &msg.content {
|
||||
Some(format!("{}: {}",
|
||||
match msg.role {
|
||||
crate::apis::openai::Role::User => "User",
|
||||
crate::apis::openai::Role::Assistant => "Assistant",
|
||||
_ => "Unknown",
|
||||
},
|
||||
text
|
||||
))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
|
||||
self.input = crate::apis::openai_responses::InputParam::Text(combined_text);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
|
|
|
|||
|
|
@ -47,6 +47,28 @@ pub trait ProviderRequest: Send + Sync {
|
|||
fn remove_metadata_key(&mut self, key: &str) -> bool;
|
||||
|
||||
fn get_temperature(&self) -> Option<f32>;
|
||||
|
||||
/// Get message history as OpenAI Message format
|
||||
/// This is useful for processing chat history across different provider formats
|
||||
fn get_messages(&self) -> Vec<crate::apis::openai::Message>;
|
||||
|
||||
/// Set message history from OpenAI Message format
|
||||
/// This converts OpenAI messages to the appropriate format for each provider type
|
||||
fn set_messages(&mut self, messages: &[crate::apis::openai::Message]);
|
||||
}
|
||||
|
||||
impl ProviderRequestType {
|
||||
/// Set message history from OpenAI Message format
|
||||
/// This converts OpenAI messages to the appropriate format for each provider type
|
||||
pub fn set_messages(&mut self, messages: &[crate::apis::openai::Message]) {
|
||||
match self {
|
||||
Self::ChatCompletionsRequest(r) => r.set_messages(messages),
|
||||
Self::MessagesRequest(r) => r.set_messages(messages),
|
||||
Self::BedrockConverse(r) => r.set_messages(messages),
|
||||
Self::BedrockConverseStream(r) => r.set_messages(messages),
|
||||
Self::ResponsesAPIRequest(r) => r.set_messages(messages),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ProviderRequest for ProviderRequestType {
|
||||
|
|
@ -149,6 +171,26 @@ impl ProviderRequest for ProviderRequestType {
|
|||
Self::ResponsesAPIRequest(r) => r.get_temperature(),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_messages(&self) -> Vec<crate::apis::openai::Message> {
|
||||
match self {
|
||||
Self::ChatCompletionsRequest(r) => r.get_messages(),
|
||||
Self::MessagesRequest(r) => r.get_messages(),
|
||||
Self::BedrockConverse(r) => r.get_messages(),
|
||||
Self::BedrockConverseStream(r) => r.get_messages(),
|
||||
Self::ResponsesAPIRequest(r) => r.get_messages(),
|
||||
}
|
||||
}
|
||||
|
||||
fn set_messages(&mut self, messages: &[crate::apis::openai::Message]) {
|
||||
match self {
|
||||
Self::ChatCompletionsRequest(r) => r.set_messages(messages),
|
||||
Self::MessagesRequest(r) => r.set_messages(messages),
|
||||
Self::BedrockConverse(r) => r.set_messages(messages),
|
||||
Self::BedrockConverseStream(r) => r.set_messages(messages),
|
||||
Self::ResponsesAPIRequest(r) => r.set_messages(messages),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse the client API from a byte slice.
|
||||
|
|
@ -934,4 +976,131 @@ mod tests {
|
|||
.message
|
||||
.contains("OpenAI ChatCompletions, Anthropic Messages, and OpenAI Responses"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_get_message_history_chat_completions() {
|
||||
use crate::apis::openai::{Message, MessageContent, Role};
|
||||
|
||||
let chat_req = ChatCompletionsRequest {
|
||||
model: "gpt-4".to_string(),
|
||||
messages: vec![
|
||||
Message {
|
||||
role: Role::System,
|
||||
content: MessageContent::Text("You are helpful".to_string()),
|
||||
name: None,
|
||||
tool_calls: None,
|
||||
tool_call_id: None,
|
||||
},
|
||||
Message {
|
||||
role: Role::User,
|
||||
content: MessageContent::Text("Hello!".to_string()),
|
||||
name: None,
|
||||
tool_calls: None,
|
||||
tool_call_id: None,
|
||||
},
|
||||
],
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let provider_req = ProviderRequestType::ChatCompletionsRequest(chat_req);
|
||||
let messages = provider_req.get_messages();
|
||||
|
||||
assert_eq!(messages.len(), 2);
|
||||
assert_eq!(messages[0].role, Role::System);
|
||||
assert_eq!(messages[1].role, Role::User);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_get_message_history_anthropic_messages() {
|
||||
use crate::apis::anthropic::{
|
||||
MessagesMessage, MessagesMessageContent, MessagesRequest, MessagesRole,
|
||||
MessagesSystemPrompt,
|
||||
};
|
||||
|
||||
let anthropic_req = MessagesRequest {
|
||||
model: "claude-3-sonnet".to_string(),
|
||||
messages: vec![MessagesMessage {
|
||||
role: MessagesRole::User,
|
||||
content: MessagesMessageContent::Single("Hello!".to_string()),
|
||||
}],
|
||||
system: Some(MessagesSystemPrompt::Single(
|
||||
"You are helpful".to_string(),
|
||||
)),
|
||||
max_tokens: 100,
|
||||
container: None,
|
||||
mcp_servers: None,
|
||||
metadata: None,
|
||||
service_tier: None,
|
||||
thinking: None,
|
||||
temperature: None,
|
||||
top_p: None,
|
||||
top_k: None,
|
||||
stream: None,
|
||||
stop_sequences: None,
|
||||
tools: None,
|
||||
tool_choice: None,
|
||||
};
|
||||
|
||||
let provider_req = ProviderRequestType::MessagesRequest(anthropic_req);
|
||||
let messages = provider_req.get_messages();
|
||||
|
||||
// Should have system message + user message
|
||||
assert_eq!(messages.len(), 2);
|
||||
assert_eq!(
|
||||
messages[0].role,
|
||||
crate::apis::openai::Role::System
|
||||
);
|
||||
assert_eq!(
|
||||
messages[1].role,
|
||||
crate::apis::openai::Role::User
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_get_message_history_responses_api() {
|
||||
use crate::apis::openai_responses::{InputParam, ResponsesAPIRequest};
|
||||
|
||||
let responses_req = ResponsesAPIRequest {
|
||||
model: "gpt-4o".to_string(),
|
||||
input: InputParam::Text("Hello, world!".to_string()),
|
||||
instructions: Some("Be helpful".to_string()),
|
||||
temperature: None,
|
||||
max_output_tokens: None,
|
||||
stream: None,
|
||||
metadata: None,
|
||||
tools: None,
|
||||
tool_choice: None,
|
||||
parallel_tool_calls: None,
|
||||
modalities: None,
|
||||
user: None,
|
||||
store: None,
|
||||
reasoning_effort: None,
|
||||
include: None,
|
||||
audio: None,
|
||||
text: None,
|
||||
service_tier: None,
|
||||
top_p: None,
|
||||
top_logprobs: None,
|
||||
stream_options: None,
|
||||
truncation: None,
|
||||
conversation: None,
|
||||
previous_response_id: None,
|
||||
max_tool_calls: None,
|
||||
background: None,
|
||||
};
|
||||
|
||||
let provider_req = ProviderRequestType::ResponsesAPIRequest(responses_req);
|
||||
let messages = provider_req.get_messages();
|
||||
|
||||
// Should have system message (instructions) + user message (input)
|
||||
assert_eq!(messages.len(), 2);
|
||||
assert_eq!(
|
||||
messages[0].role,
|
||||
crate::apis::openai::Role::System
|
||||
);
|
||||
assert_eq!(
|
||||
messages[1].role,
|
||||
crate::apis::openai::Role::User
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
106
demos/use_cases/mcp_filter/README.md
Normal file
106
demos/use_cases/mcp_filter/README.md
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
# RAG Agent Demo
|
||||
|
||||
A multi-agent RAG system demonstrating archgw's agent filter chain with MCP protocol.
|
||||
|
||||
## Architecture
|
||||
|
||||
This demo consists of three components:
|
||||
1. **Query Rewriter** (MCP filter) - Rewrites user queries for better retrieval
|
||||
2. **Context Builder** (MCP filter) - Retrieves relevant context from knowledge base
|
||||
3. **RAG Agent** (REST) - Generates final responses based on augmented context
|
||||
|
||||
## Components
|
||||
|
||||
### Query Rewriter Filter (MCP)
|
||||
- **Port**: 10501
|
||||
- **Tool**: `query_rewriter`
|
||||
- Improves queries using LLM before retrieval
|
||||
|
||||
### Context Builder Filter (MCP)
|
||||
- **Port**: 10502
|
||||
- **Tool**: `context_builder`
|
||||
- Augments queries with relevant passages from knowledge base
|
||||
|
||||
### RAG Agent (REST/OpenAI)
|
||||
- **Port**: 10505
|
||||
- **Endpoint**: `/v1/chat/completions`
|
||||
- Generates responses using OpenAI-compatible API
|
||||
|
||||
## Quick Start
|
||||
|
||||
### 1. Start all agents
|
||||
```bash
|
||||
./start_agents.sh
|
||||
```
|
||||
|
||||
This starts:
|
||||
- Query Rewriter MCP server on port 10501
|
||||
- Context Builder MCP server on port 10502
|
||||
- RAG Agent REST server on port 10505
|
||||
|
||||
### 2. Start archgw
|
||||
```bash
|
||||
archgw up --foreground
|
||||
```
|
||||
|
||||
### 3. Test the system
|
||||
```bash
|
||||
curl -X POST http://localhost:8001/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "gpt-4o",
|
||||
"messages": [{"role": "user", "content": "What is the guaranteed uptime for TechCorp?"}]
|
||||
}'
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
The `arch_config.yaml` defines how agents are connected:
|
||||
|
||||
```yaml
|
||||
filters:
|
||||
- id: query_rewriter
|
||||
url: mcp://host.docker.internal:10500
|
||||
tool: rewrite_query_with_archgw # MCP tool name
|
||||
|
||||
- id: context_builder
|
||||
url: mcp://host.docker.internal:10501
|
||||
tool: chat_completions
|
||||
```
|
||||
How It Works
|
||||
|
||||
1. User sends request to archgw listener on port 8001
|
||||
2. Request passes through MCP filter chain:
|
||||
- **Query Rewriter** rewrites the query for better retrieval
|
||||
- **Context Builder** augments query with relevant knowledge base passages
|
||||
3. Augmented request is forwarded to **RAG Agent** REST endpoint
|
||||
4. RAG Agent generates final response using LLM
|
||||
|
||||
## Configuration
|
||||
|
||||
See `arch_config.yaml` for the complete filter chain setup. The MCP filters use default settings:
|
||||
- `type: mcp` (default)
|
||||
- `transport: streamable-http` (default)
|
||||
- Tool name defaults to filter ID `sample_queries.md` for example queries to test the RAG system.
|
||||
|
||||
Example request:
|
||||
```bash
|
||||
curl -X POST http://localhost:8001/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "gpt-4o",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What is the guaranteed uptime for TechCorp?"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
- `LLM_GATEWAY_ENDPOINT` - archgw endpoint (default: `http://localhost:12000/v1`)
|
||||
- `OPENAI_API_KEY` - OpenAI API key for model providers
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- See `sample_queries.md` for more example queries
|
||||
- See `arch_config.yaml` for complete configuration details
|
||||
41
demos/use_cases/mcp_filter/arch_config.yaml
Normal file
41
demos/use_cases/mcp_filter/arch_config.yaml
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
version: v0.3.0
|
||||
|
||||
agents:
|
||||
- id: rag_agent
|
||||
url: http://host.docker.internal:10505
|
||||
|
||||
filters:
|
||||
- id: query_rewriter
|
||||
url: http://host.docker.internal:10501
|
||||
# type: mcp # default is mcp
|
||||
# transport: streamable-http # default is streamable-http
|
||||
# tool: query_rewriter # default name is the filter id
|
||||
- id: context_builder
|
||||
url: http://host.docker.internal:10502
|
||||
|
||||
model_providers:
|
||||
- model: openai/gpt-4o-mini
|
||||
access_key: $OPENAI_API_KEY
|
||||
default: true
|
||||
- model: openai/gpt-4o
|
||||
access_key: $OPENAI_API_KEY
|
||||
|
||||
model_aliases:
|
||||
fast-llm:
|
||||
target: gpt-4o-mini
|
||||
smart-llm:
|
||||
target: gpt-4o
|
||||
|
||||
listeners:
|
||||
- type: agent
|
||||
name: agent_1
|
||||
port: 8001
|
||||
router: arch_agent_router
|
||||
agents:
|
||||
- id: rag_agent
|
||||
description: virtual assistant for retrieval augmented generation tasks
|
||||
filter_chain:
|
||||
- query_rewriter
|
||||
- context_builder
|
||||
tracing:
|
||||
random_sampling: 100
|
||||
17
demos/use_cases/mcp_filter/docker-compose.yaml
Normal file
17
demos/use_cases/mcp_filter/docker-compose.yaml
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
services:
|
||||
jaeger:
|
||||
build:
|
||||
context: ../../shared/jaeger
|
||||
ports:
|
||||
- "16686:16686"
|
||||
- "4317:4317"
|
||||
- "4318:4318"
|
||||
open-web-ui:
|
||||
image: dyrnq/open-webui:main
|
||||
restart: always
|
||||
ports:
|
||||
- "8080:8080"
|
||||
environment:
|
||||
- DEFAULT_MODEL=gpt-4o-mini
|
||||
- ENABLE_OPENAI_API=true
|
||||
- OPENAI_API_BASE_URL=http://host.docker.internal:8001/v1
|
||||
86
demos/use_cases/mcp_filter/mcp_query.rest
Normal file
86
demos/use_cases/mcp_filter/mcp_query.rest
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
### Initialize MCP Session (SSE)
|
||||
POST http://localhost:10501/mcp
|
||||
Content-Type: application/json
|
||||
Accept: application/json, text/event-stream
|
||||
|
||||
{"jsonrpc":"2.0","id":1,"method":"initialize","params":{"capabilities":{},"protocolVersion":"2024-11-05","clientInfo":{"name":"test","version":"1.0.0"}}}
|
||||
|
||||
### Send Initialized Notification
|
||||
POST http://localhost:10501/mcp
|
||||
Content-Type: application/json
|
||||
Accept: application/json, text/event-stream
|
||||
mcp-session-id: 35d455dc07b8400887f86668590f12bb
|
||||
|
||||
{
|
||||
"jsonrpc": "2.0",
|
||||
"method": "notifications/initialized"
|
||||
}
|
||||
|
||||
### List Tools
|
||||
POST http://localhost:10501/mcp
|
||||
Content-Type: application/json
|
||||
Accept: application/json, text/event-stream
|
||||
mcp-session-id: eb10a691b36e4547b6c93c5dc5b47e11
|
||||
|
||||
{
|
||||
"jsonrpc": "2.0",
|
||||
"id": "list-tools-1",
|
||||
"method": "tools/list"
|
||||
}
|
||||
|
||||
### Call Query Rewriter Tool
|
||||
POST http://localhost:10501/mcp
|
||||
Content-Type: application/json
|
||||
Accept: application/json, text/event-stream
|
||||
mcp-session-id: 6b95ff75825a402b90eb3ea07e23fbce
|
||||
|
||||
{
|
||||
"jsonrpc": "2.0",
|
||||
"id": "3d3b886a-6216-4a26-a422-7a972529c0e7",
|
||||
"method": "tools/call",
|
||||
"params": {
|
||||
"arguments": {
|
||||
"messages": [
|
||||
{
|
||||
"content": "What is the guaranteed uptime percentage for TechCorp's cloud services?",
|
||||
"role": "user"
|
||||
}
|
||||
]
|
||||
},
|
||||
"name": "query_rewriter"
|
||||
}
|
||||
}
|
||||
|
||||
### another test
|
||||
|
||||
# Content-Type: application/json
|
||||
# Accept: application/json, text/event-stream
|
||||
# mcp-session-id: ed7a81a1d39549ecaadb867a6b2daf1e
|
||||
|
||||
POST http://localhost:10501/mcp
|
||||
content-type: application/json
|
||||
mcp-session-id: e4ec1ae904e14e06b7d194da10e5f74c
|
||||
accept: application/json, text/event-stream
|
||||
|
||||
{"jsonrpc":"2.0","id":"4bb1043a-2953-4bcd-b801-f270b0ae8c39","method":"tools/call","params":{"arguments":{"messages":[{"content":"What is the guaranteed uptime percentage for TechCorp's cloud services?","role":"user"}]},"name":"query_rewriter"}}
|
||||
|
||||
|
||||
|
||||
### stream test
|
||||
|
||||
POST http://localhost:10501/mcp
|
||||
content-type: application/json
|
||||
mcp-session-id: 35d455dc07b8400887f86668590f12bb
|
||||
accept: application/json, text/event-stream
|
||||
|
||||
{
|
||||
"jsonrpc": "2.0",
|
||||
"id": 1,
|
||||
"method": "tools/call",
|
||||
"params": {
|
||||
"name": "long_job",
|
||||
"arguments": {
|
||||
"n": 3
|
||||
}
|
||||
}
|
||||
}
|
||||
22
demos/use_cases/mcp_filter/pyproject.toml
Normal file
22
demos/use_cases/mcp_filter/pyproject.toml
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
[project]
|
||||
name = "rag_agent"
|
||||
version = "0.1.0"
|
||||
description = "RAG Agent"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"click>=8.2.1",
|
||||
"mcp>=1.13.1",
|
||||
"fastmcp>=2.14",
|
||||
"pydantic>=2.11.7",
|
||||
"fastapi>=0.104.1",
|
||||
"uvicorn>=0.24.0",
|
||||
"openai==2.13.0",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
rag_agent = "rag_agent:main"
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
64
demos/use_cases/mcp_filter/sample_queries.md
Normal file
64
demos/use_cases/mcp_filter/sample_queries.md
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
# Sample Queries for Knowledge Base RAG Agent
|
||||
|
||||
## Service Level Agreement Queries
|
||||
- What is the guaranteed uptime percentage for TechCorp's cloud services?
|
||||
- What remedies are available if the API response time exceeds the agreed threshold?
|
||||
- How quickly must TechCorp respond to critical support issues?
|
||||
- What monitoring and reporting requirements are specified in the SLA?
|
||||
- When was the TechCorp service agreement signed and by whom?
|
||||
|
||||
## Privacy Policy Queries
|
||||
- What encryption methods does DataSecure use to protect data?
|
||||
- How long does DataSecure retain personal data after account deletion?
|
||||
- What rights do users have regarding their personal information?
|
||||
- Can DataSecure sell user data to third parties for marketing?
|
||||
- Who should be contacted for privacy-related concerns at DataSecure?
|
||||
|
||||
## Supply Chain Agreement Queries
|
||||
- What types of automotive components does PrecisionParts supply?
|
||||
- What are the payment terms and volume discount structure?
|
||||
- What quality standards must the supplied components meet?
|
||||
- What are the penalties for late delivery?
|
||||
- What insurance coverage requirements apply to the supplier?
|
||||
|
||||
## Student Data Management Queries
|
||||
- What federal laws must EduTech comply with regarding student data?
|
||||
- What security measures are in place to protect student information?
|
||||
- How long are student records retained after graduation?
|
||||
- What consent is required for students under 13 years old?
|
||||
- Who can access student educational records?
|
||||
|
||||
## Investment Advisory Queries
|
||||
- What is FinanceFirst's management fee structure?
|
||||
- What types of investments are included in the advisory services?
|
||||
- What regulatory body oversees FinanceFirst Advisors?
|
||||
- How often are portfolio reviews conducted?
|
||||
- What are the client's responsibilities under this agreement?
|
||||
|
||||
## Healthcare Standards Queries
|
||||
- What is the target response time for emergency code teams?
|
||||
- What hand hygiene compliance rate is required?
|
||||
- How quickly must medical records be completed after patient encounters?
|
||||
- What continuing education requirements apply to nursing staff?
|
||||
- What patient safety protocols are mandatory upon admission?
|
||||
|
||||
## Cross-Document Queries
|
||||
- Which agreements include confidentiality or data protection provisions?
|
||||
- What are the common termination notice periods across different contract types?
|
||||
- Which documents specify insurance or liability coverage requirements?
|
||||
- What compliance and regulatory requirements are mentioned across agreements?
|
||||
- Which contracts include performance metrics or service level commitments?
|
||||
|
||||
## Complex Analysis Queries
|
||||
- Compare the data retention policies across the privacy policy and student data management documents.
|
||||
- What are the different approaches to risk management across the supply chain and investment advisory agreements?
|
||||
- How do the security measures in the healthcare standards compare to those in the privacy policy?
|
||||
- Which agreements provide the most detailed compliance and regulatory frameworks?
|
||||
- What common themes exist in the quality assurance requirements across different industries?
|
||||
|
||||
## Document-Specific Detail Queries
|
||||
- List all the specific percentages, timeframes, and numerical requirements mentioned in the SLA.
|
||||
- What are all the contact persons and their roles mentioned across the documents?
|
||||
- Identify all the compliance standards and certifications referenced in the supply chain agreement.
|
||||
- What are the specific consequences or penalties mentioned for non-compliance across agreements?
|
||||
- List all the third-party systems, tools, or services mentioned in the documents.
|
||||
98
demos/use_cases/mcp_filter/src/rag_agent/__init__.py
Normal file
98
demos/use_cases/mcp_filter/src/rag_agent/__init__.py
Normal file
|
|
@ -0,0 +1,98 @@
|
|||
import click
|
||||
from fastmcp import FastMCP
|
||||
|
||||
mcp = None
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option(
|
||||
"--transport",
|
||||
"transport",
|
||||
default="streamable-http",
|
||||
help="Transport type: stdio or sse",
|
||||
)
|
||||
@click.option("--host", "host", default="localhost", help="Host to bind MCP server to")
|
||||
@click.option("--port", "port", type=int, default=10500, help="Port for MCP server")
|
||||
@click.option(
|
||||
"--agent",
|
||||
"agent",
|
||||
required=True,
|
||||
help="Agent name: query_rewriter, context_builder, or response_generator",
|
||||
)
|
||||
@click.option(
|
||||
"--name",
|
||||
"agent_name",
|
||||
default=None,
|
||||
help="Custom MCP server name (defaults to agent type)",
|
||||
)
|
||||
@click.option(
|
||||
"--rest-server",
|
||||
"rest_server",
|
||||
is_flag=True,
|
||||
help="Start REST server instead of MCP server",
|
||||
)
|
||||
@click.option("--rest-port", "rest_port", default=8000, help="Port for REST server")
|
||||
def main(host, port, agent, transport, agent_name, rest_server, rest_port):
    """Start a RAG agent as an MCP server or REST server.

    Dispatches on ``agent``:
      * ``response_generator`` can only run as a REST server (``--rest-server``).
      * ``query_rewriter`` / ``context_builder`` can only run as MCP servers.
    Invalid combinations print an error and return without starting anything.
    """

    # Map friendly names to (importable module, default display name).
    agent_map = {
        "query_rewriter": ("rag_agent.query_rewriter", "Query Rewriter Agent"),
        "context_builder": ("rag_agent.context_builder", "Context Builder Agent"),
        "response_generator": (
            "rag_agent.rag_agent",
            "Response Generator Agent",
        ),
    }

    if agent not in agent_map:
        print(f"Error: Unknown agent '{agent}'")
        print(f"Available agents: {', '.join(agent_map.keys())}")
        return

    module_name, default_name = agent_map[agent]
    mcp_name = agent_name or default_name

    if rest_server:
        # Only response_generator supports REST server mode.
        if agent != "response_generator":
            print(f"Error: Agent '{agent}' does not support REST server mode.")
            # No placeholders here, so plain strings (not f-strings) suffice.
            print("REST server is only supported for: response_generator")
            print(f"Remove --rest-server flag to start {agent} as an MCP server.")
            return

        print(f"Starting REST server on {host}:{rest_port} for agent: {agent}")
        from rag_agent.rag_agent import start_server

        start_server(host=host, port=rest_port)
        return

    # MCP mode: only query_rewriter and context_builder register MCP tools.
    if agent not in ["query_rewriter", "context_builder"]:
        print(f"Error: Agent '{agent}' does not support MCP mode.")
        print("MCP is only supported for: query_rewriter, context_builder")
        print(f"Use --rest-server flag to start {agent} as a REST server.")
        return

    # The agent modules register their tools against the module-level `mcp`
    # instance at import time, so it must exist before the import below.
    global mcp
    mcp = FastMCP(mcp_name, host=host, port=port)

    print(f"Starting MCP server: {mcp_name}")
    print(f"  Agent: {agent}")
    print(f"  Transport: {transport}")
    print(f"  Host: {host}")
    print(f"  Port: {port}")

    # Import the agent module to register its tools.
    import importlib

    importlib.import_module(module_name)

    print(f"Agent '{agent}' loaded successfully")
    print(f"MCP server ready on {transport}://{host}:{port}")

    mcp.run(transport=transport)
|
||||
|
||||
|
||||
# Allow running this CLI module directly as a script.
if __name__ == "__main__":
    main()
|
||||
4
demos/use_cases/mcp_filter/src/rag_agent/__main__.py
Normal file
4
demos/use_cases/mcp_filter/src/rag_agent/__main__.py
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
# Package entry point: `python -m rag_agent` dispatches to the package's main().
from . import main

if __name__ == "__main__":
    main()
|
||||
36
demos/use_cases/mcp_filter/src/rag_agent/api.py
Normal file
36
demos/use_cases/mcp_filter/src/rag_agent/api.py
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
from pydantic import BaseModel
|
||||
from typing import List, Optional, Dict, Any
|
||||
|
||||
|
||||
class ChatMessage(BaseModel):
    """A single chat message in OpenAI chat-completions format."""

    role: str  # e.g. "system", "user", or "assistant"
    content: str  # message text
|
||||
|
||||
|
||||
class ChatCompletionRequest(BaseModel):
    """OpenAI-compatible request body for POST /v1/chat/completions."""

    model: str
    messages: List[ChatMessage]
    temperature: Optional[float] = 1.0
    max_tokens: Optional[int] = None
    top_p: Optional[float] = 1.0
    frequency_penalty: Optional[float] = 0.0
    presence_penalty: Optional[float] = 0.0
    stream: Optional[bool] = False  # when True, the server streams SSE chunks
    stop: Optional[List[str]] = None
|
||||
|
||||
|
||||
class ChatCompletionResponse(BaseModel):
    """OpenAI-compatible non-streaming chat completion response."""

    id: str
    object: str = "chat.completion"
    created: int  # unix timestamp (seconds)
    model: str
    choices: List[Dict[str, Any]]
    usage: Dict[str, int]  # prompt_tokens / completion_tokens / total_tokens
|
||||
|
||||
|
||||
class ChatCompletionStreamResponse(BaseModel):
    """OpenAI-compatible streaming chat completion chunk (SSE payload)."""

    id: str
    object: str = "chat.completion.chunk"
    created: int  # unix timestamp (seconds)
    model: str
    choices: List[Dict[str, Any]]
|
||||
205
demos/use_cases/mcp_filter/src/rag_agent/context_builder.py
Normal file
205
demos/use_cases/mcp_filter/src/rag_agent/context_builder.py
Normal file
|
|
@ -0,0 +1,205 @@
|
|||
import json
|
||||
from typing import List, Optional, Dict, Any
|
||||
from openai import AsyncOpenAI
|
||||
import os
|
||||
import logging
|
||||
import csv
|
||||
from pathlib import Path
|
||||
|
||||
from .api import ChatMessage
|
||||
from . import mcp
|
||||
from fastmcp.server.dependencies import get_http_headers
|
||||
|
||||
# Set up logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - [CONTEXT_BUILDER] - %(levelname)s - %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Configuration for archgw LLM gateway
|
||||
LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:12000/v1")
|
||||
RAG_MODEL = "gpt-4o-mini"
|
||||
|
||||
# Initialize OpenAI client for archgw
|
||||
archgw_client = AsyncOpenAI(
|
||||
base_url=LLM_GATEWAY_ENDPOINT,
|
||||
api_key="EMPTY", # archgw doesn't require a real API key
|
||||
)
|
||||
|
||||
# Global variable to store the knowledge base
|
||||
knowledge_base = []
|
||||
|
||||
|
||||
def load_knowledge_base():
    """Load the sample_knowledge_base.csv file into memory on startup.

    Populates the module-level ``knowledge_base`` list with dicts of the
    form ``{"path": ..., "content": ...}``.  On any error the knowledge
    base is left empty so the service can still start.
    """
    global knowledge_base

    # Get the path to the CSV file relative to this script
    current_dir = Path(__file__).parent
    csv_path = current_dir / "sample_knowledge_base.csv"

    # Use the module logger (not print) so output matches the rest of this file.
    logger.info(f"Loading knowledge base from {csv_path}")

    try:
        knowledge_base = []
        with open(csv_path, "r", encoding="utf-8-sig") as file:
            csv_reader = csv.DictReader(file)
            for row in csv_reader:
                knowledge_base.append({"path": row["path"], "content": row["content"]})

        logger.info(f"Loaded {len(knowledge_base)} documents from knowledge base")

    except Exception as e:
        # Deliberate best-effort: an unreadable/missing CSV must not crash startup.
        logger.error(f"Error loading knowledge base: {e}")
        knowledge_base = []
|
||||
|
||||
|
||||
async def find_relevant_passages(
    query: str, traceparent: Optional[str] = None, top_k: int = 3
) -> List[Dict[str, str]]:
    """Use the LLM to find the most relevant passages from the knowledge base.

    Args:
        query: User query to match passages against.
        traceparent: Optional W3C trace-context header to propagate to archgw.
        top_k: Maximum number of passages the LLM is asked to select.

    Returns:
        A list of ``{"path": ..., "content": ...}`` dicts drawn from the
        module-level ``knowledge_base``; empty on error or when nothing matches.
    """

    if not knowledge_base:
        logger.warning("Knowledge base is empty")
        return []

    # Create a system prompt for passage selection
    system_prompt = f"""You are a retrieval assistant that selects the most relevant document passages for a given query.

Given a user query and a list of document passages, identify the {top_k} most relevant passages that would help answer the query.

Query: {query}

Available passages:
"""

    # Add all passages with indices; content is truncated to 500 chars per passage
    # to keep the prompt bounded.
    for i, doc in enumerate(knowledge_base):
        system_prompt += (
            f"\n[{i}] Path: {doc['path']}\nContent: {doc['content'][:500]}...\n"
        )

    system_prompt += f"""

Please respond with ONLY the indices of the {top_k} most relevant passages, separated by commas (e.g., "0,3,7").
If fewer than {top_k} passages are relevant, return only the relevant ones.
If no passages are relevant, return "NONE"."""

    try:
        # Call archgw to select relevant passages
        logger.info(f"Calling archgw to find relevant passages for query: '{query}'")

        # Prepare extra headers if traceparent is provided
        extra_headers = {"x-envoy-max-retries": "3"}
        if traceparent:
            extra_headers["traceparent"] = traceparent

        response = await archgw_client.chat.completions.create(
            model=RAG_MODEL,
            messages=[{"role": "system", "content": system_prompt}],
            temperature=0.1,
            max_tokens=50,
            extra_headers=extra_headers,
        )

        result = response.choices[0].message.content.strip()
        logger.info(f"LLM selected passages: {result}")

        # Parse the indices
        if result.upper() == "NONE":
            return []

        # Non-numeric tokens from the LLM are silently dropped by isdigit().
        selected_passages = []
        indices = [
            int(idx.strip()) for idx in result.split(",") if idx.strip().isdigit()
        ]

        # Guard against out-of-range indices hallucinated by the model.
        for idx in indices:
            if 0 <= idx < len(knowledge_base):
                selected_passages.append(knowledge_base[idx])

        logger.info(f"Selected {len(selected_passages)} relevant passages")
        return selected_passages

    except Exception as e:
        # Best-effort retrieval: callers treat an empty list as "no context".
        logger.error(f"Error finding relevant passages: {e}")
        return []
|
||||
|
||||
|
||||
async def augment_query_with_context(
    messages: List[ChatMessage], traceparent: Optional[str] = None
) -> List[ChatMessage]:
    """Augment the latest user message with relevant knowledge-base context.

    Locates the most recent user turn, retrieves relevant passages for it,
    and returns a copy of ``messages`` where that turn is replaced by the
    original query followed by the retrieved context.  Returns the input
    unchanged when there is no user message or no relevant passage.
    """

    # Scan backwards for the most recent user turn.
    last_user_index = next(
        (i for i in range(len(messages) - 1, -1, -1) if messages[i].role == "user"),
        -1,
    )
    last_user_message = (
        messages[last_user_index].content if last_user_index >= 0 else None
    )

    if not last_user_message:
        logger.warning("No user message found in conversation")
        return messages

    logger.info(f"Processing user query: '{last_user_message}'")

    relevant_passages = await find_relevant_passages(last_user_message, traceparent)
    if not relevant_passages:
        logger.info("No relevant passages found, returning original messages")
        return messages

    # Render each selected passage as a numbered document section.
    context = "\n\n".join(
        f"Document {i+1} ({passage['path']}):\n{passage['content']}"
        for i, passage in enumerate(relevant_passages)
    )

    # Keep the original query text and append the retrieved context to it.
    augmented_content = f"""{last_user_message} RELEVANT CONTEXT:
{context}"""

    updated_messages = messages.copy()
    updated_messages[last_user_index] = ChatMessage(
        role="user", content=augmented_content
    )

    logger.info(f"Augmented user query with {len(relevant_passages)} relevant passages")

    return updated_messages
|
||||
|
||||
|
||||
# Load knowledge base on module import.
# NOTE(review): runs at import time; failures are logged and leave the KB empty.
load_knowledge_base()
|
||||
|
||||
|
||||
@mcp.tool()
async def context_builder(messages: List[ChatMessage]) -> List[ChatMessage]:
    """MCP tool that augments user queries with relevant context from the knowledge base."""
    logger.info(f"Received chat completion request with {len(messages)} messages")

    # Propagate the caller's trace context (if any) to downstream LLM calls.
    trace_ctx = get_http_headers().get("traceparent")
    if trace_ctx:
        logger.info(f"Received traceparent header: {trace_ctx}")
    else:
        logger.info("No traceparent header found")

    enriched = await augment_query_with_context(messages, trace_ctx)

    # Return plain role/content dicts to minimize text serialization over MCP.
    return [{"role": msg.role, "content": msg.content} for msg in enriched]
|
||||
119
demos/use_cases/mcp_filter/src/rag_agent/query_rewriter.py
Normal file
119
demos/use_cases/mcp_filter/src/rag_agent/query_rewriter.py
Normal file
|
|
@ -0,0 +1,119 @@
|
|||
import asyncio
|
||||
import json
|
||||
from typing import List, Optional, Dict, Any
|
||||
from openai import AsyncOpenAI
|
||||
import os
|
||||
import logging
|
||||
|
||||
from .api import ChatMessage
|
||||
from . import mcp
|
||||
from fastmcp.server.dependencies import get_http_headers
|
||||
|
||||
# Set up logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - [QUERY_REWRITER] - %(levelname)s - %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Configuration for archgw LLM gateway
|
||||
LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:12000/v1")
|
||||
QUERY_REWRITE_MODEL = "gpt-4o-mini"
|
||||
|
||||
# Initialize OpenAI client for archgw
|
||||
archgw_client = AsyncOpenAI(
|
||||
base_url=LLM_GATEWAY_ENDPOINT,
|
||||
api_key="EMPTY", # archgw doesn't require a real API key
|
||||
)
|
||||
|
||||
|
||||
async def rewrite_query_with_archgw(
    messages: List[ChatMessage], traceparent_header: str
) -> str:
    """Rewrite the user query using LLM for better retrieval.

    Args:
        messages: Full conversation history; the LLM rewrites the last user turn.
        traceparent_header: W3C trace-context header to propagate, or None/empty.

    Returns:
        The rewritten query, or — on any LLM error — the original last user
        message (empty string if the conversation has no user turn).
    """
    system_prompt = """You are a query rewriter that improves user queries for better retrieval.

Given a conversation history, rewrite the last user message to be more specific and context-aware.
The rewritten query should:
1. Include relevant context from previous messages
2. Be clear and specific for information retrieval
3. Maintain the user's intent
4. Be concise but comprehensive

Return only the rewritten query, nothing else."""

    # Prepare messages for the query rewriter - just add system prompt to existing messages
    rewrite_messages = [{"role": "system", "content": system_prompt}]

    # Add conversation history
    for msg in messages:
        rewrite_messages.append({"role": msg.role, "content": msg.content})

    try:
        # Call archgw using OpenAI client
        extra_headers = {"x-envoy-max-retries": "3"}
        if traceparent_header:
            extra_headers["traceparent"] = traceparent_header
        logger.info(f"Calling archgw at {LLM_GATEWAY_ENDPOINT} to rewrite query")
        response = await archgw_client.chat.completions.create(
            model=QUERY_REWRITE_MODEL,
            messages=rewrite_messages,
            temperature=0.3,
            max_tokens=200,
            extra_headers=extra_headers,
        )

        rewritten_query = response.choices[0].message.content.strip()
        logger.info(f"Query rewritten successfully: '{rewritten_query}'")
        return rewritten_query

    except Exception as e:
        logger.error(f"Error rewriting query: {e}")

        # If rewriting fails, return the original last user message
        logger.info("Falling back to original user message")
        for message in reversed(messages):
            if message.role == "user":
                return message.content
        return ""
|
||||
|
||||
|
||||
@mcp.tool()
async def query_rewriter(messages: List[ChatMessage]) -> List[ChatMessage]:
    """Chat completions endpoint that rewrites the last user query using archgw.

    Returns the updated message list (as plain role/content dicts) with the
    last user message replaced by the rewritten query; the rest of the
    conversation is passed through unchanged.
    """
    logger.info(f"Received chat completion request with {len(messages)} messages")

    # Get traceparent header from HTTP request using FastMCP's dependency function
    headers = get_http_headers()
    traceparent_header = headers.get("traceparent")

    if traceparent_header:
        logger.info(f"Received traceparent header: {traceparent_header}")
    else:
        logger.info("No traceparent header found")

    # Call archgw to rewrite the last user query
    rewritten_query = await rewrite_query_with_archgw(messages, traceparent_header)

    # Create updated messages with the rewritten query
    updated_messages = messages.copy()

    # Find and update the last user message with the rewritten query
    for i in range(len(updated_messages) - 1, -1, -1):
        if updated_messages[i].role == "user":
            original_query = updated_messages[i].content
            updated_messages[i] = ChatMessage(role="user", content=rewritten_query)
            logger.info(
                f"Updated user query from '{original_query}' to '{rewritten_query}'"
            )
            break

    # Return as dict to minimize text serialization
    return [{"role": msg.role, "content": msg.content} for msg in updated_messages]
|
||||
303
demos/use_cases/mcp_filter/src/rag_agent/rag_agent.py
Normal file
303
demos/use_cases/mcp_filter/src/rag_agent/rag_agent.py
Normal file
|
|
@ -0,0 +1,303 @@
|
|||
import json
|
||||
from fastapi import FastAPI, Request
|
||||
from fastapi.responses import StreamingResponse
|
||||
from openai import AsyncOpenAI
|
||||
import os
|
||||
import logging
|
||||
import time
|
||||
import uuid
|
||||
import uvicorn
|
||||
import asyncio
|
||||
|
||||
from .api import (
|
||||
ChatCompletionRequest,
|
||||
ChatCompletionResponse,
|
||||
ChatCompletionStreamResponse,
|
||||
)
|
||||
|
||||
# Set up logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - [RESPONSE_GENERATOR] - %(levelname)s - %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Configuration for archgw LLM gateway
|
||||
LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:12000/v1")
|
||||
RESPONSE_MODEL = "gpt-4o"
|
||||
|
||||
# System prompt for response generation
|
||||
SYSTEM_PROMPT = """You are a helpful assistant that generates coherent, contextual responses.
|
||||
|
||||
Given a conversation history, generate a helpful and relevant response based on all the context available in the messages.
|
||||
Your response should:
|
||||
1. Be contextually aware of the entire conversation
|
||||
2. Address the user's needs appropriately
|
||||
3. Be helpful and informative
|
||||
4. Maintain a natural conversational tone
|
||||
|
||||
Generate a complete response to assist the user."""
|
||||
|
||||
# Initialize OpenAI client for archgw
|
||||
archgw_client = AsyncOpenAI(
|
||||
base_url=LLM_GATEWAY_ENDPOINT,
|
||||
api_key="EMPTY", # archgw doesn't require a real API key
|
||||
)
|
||||
|
||||
# FastAPI app for REST server
|
||||
app = FastAPI(title="RAG Agent Response Generator", version="1.0.0")
|
||||
|
||||
|
||||
def prepare_response_messages(request_body: ChatCompletionRequest):
    """Prepare messages for response generation by adding system prompt."""
    # System prompt first, then the full conversation history as plain dicts.
    history = [
        {"role": msg.role, "content": msg.content} for msg in request_body.messages
    ]
    return [{"role": "system", "content": SYSTEM_PROMPT}] + history
|
||||
|
||||
|
||||
@app.post("/v1/chat/completions")
|
||||
async def chat_completion_http(request: Request, request_body: ChatCompletionRequest):
|
||||
"""HTTP endpoint for chat completions with streaming support."""
|
||||
logger.info(
|
||||
f"Received chat completion request with {len(request_body.messages)} messages"
|
||||
)
|
||||
|
||||
# Get traceparent header from HTTP request
|
||||
traceparent_header = request.headers.get("traceparent")
|
||||
|
||||
if traceparent_header:
|
||||
logger.info(f"Received traceparent header: {traceparent_header}")
|
||||
else:
|
||||
logger.info("No traceparent header found")
|
||||
|
||||
# Check if streaming is requested
|
||||
if request_body.stream:
|
||||
return StreamingResponse(
|
||||
stream_chat_completions(request_body, traceparent_header),
|
||||
media_type="text/plain",
|
||||
headers={
|
||||
"content-type": "text/event-stream",
|
||||
},
|
||||
)
|
||||
else:
|
||||
return await non_streaming_chat_completions(request_body, traceparent_header)
|
||||
|
||||
|
||||
async def stream_chat_completions(
    request_body: ChatCompletionRequest, traceparent_header: str = None
):
    """Generate streaming chat completions.

    Async generator yielding OpenAI-style SSE lines (``data: {...}\\n\\n``).
    Content deltas are forwarded as they arrive; the final chunk additionally
    carries the full assistant reply (JSON-encoded) in a ``message`` field,
    followed by ``data: [DONE]``.  Errors are surfaced as a single apology
    chunk rather than raised, so the HTTP stream always terminates cleanly.
    """
    # Prepare messages for response generation
    response_messages = prepare_response_messages(request_body)

    try:
        # Call archgw using OpenAI client for streaming
        logger.info(
            f"Calling archgw at {LLM_GATEWAY_ENDPOINT} to generate streaming response"
        )

        # Prepare extra headers if traceparent is provided
        extra_headers = {"x-envoy-max-retries": "3"}
        if traceparent_header:
            extra_headers["traceparent"] = traceparent_header

        response_stream = await archgw_client.chat.completions.create(
            model=RESPONSE_MODEL,
            messages=response_messages,
            temperature=request_body.temperature or 0.7,
            max_tokens=request_body.max_tokens or 1000,
            stream=True,
            extra_headers=extra_headers,
        )

        completion_id = f"chatcmpl-{uuid.uuid4().hex[:8]}"
        created_time = int(time.time())
        # Accumulates every delta so the final chunk can carry the full reply.
        collected_content = []

        async for chunk in response_stream:
            if chunk.choices and chunk.choices[0].delta.content:
                content = chunk.choices[0].delta.content
                collected_content.append(content)

                # Create streaming response chunk
                stream_chunk = ChatCompletionStreamResponse(
                    id=completion_id,
                    created=created_time,
                    model=request_body.model,
                    choices=[
                        {
                            "index": 0,
                            "delta": {"content": content},
                            "finish_reason": None,
                        }
                    ],
                )

                yield f"data: {stream_chunk.model_dump_json()}\n\n"

        # Send final chunk with complete response in expected format
        full_response = "".join(collected_content)
        updated_history = [{"role": "assistant", "content": full_response}]

        # NOTE(review): the "message" field on a stream chunk is non-standard
        # OpenAI — presumably consumed by the MCP filter chain; confirm callers.
        final_chunk = ChatCompletionStreamResponse(
            id=completion_id,
            created=created_time,
            model=request_body.model,
            choices=[
                {
                    "index": 0,
                    "delta": {},
                    "finish_reason": "stop",
                    "message": {
                        "role": "assistant",
                        "content": json.dumps(updated_history),
                    },
                }
            ],
        )

        yield f"data: {final_chunk.model_dump_json()}\n\n"
        yield "data: [DONE]\n\n"

    except Exception as e:
        logger.error(f"Error generating streaming response: {e}")

        # Send error as streaming response
        error_chunk = ChatCompletionStreamResponse(
            id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
            created=int(time.time()),
            model=request_body.model,
            choices=[
                {
                    "index": 0,
                    "delta": {
                        "content": "I apologize, but I'm having trouble generating a response right now. Please try again."
                    },
                    "finish_reason": "stop",
                }
            ],
        )

        yield f"data: {error_chunk.model_dump_json()}\n\n"
        yield "data: [DONE]\n\n"
|
||||
|
||||
|
||||
def _token_usage(messages, completion_text: str) -> dict:
    """Approximate a usage dict by whitespace word counts (not a real tokenizer)."""
    prompt_tokens = sum(len(msg.content.split()) for msg in messages)
    completion_tokens = len(completion_text.split())
    return {
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "total_tokens": prompt_tokens + completion_tokens,
    }


async def non_streaming_chat_completions(
    request_body: ChatCompletionRequest, traceparent_header: str = None
):
    """Generate non-streaming chat completions.

    Calls archgw for a complete assistant reply; on any failure returns a
    canned fallback response instead of raising, so the HTTP endpoint always
    answers with a well-formed ChatCompletionResponse.
    """
    # Prepare messages for response generation
    response_messages = prepare_response_messages(request_body)

    try:
        # Call archgw using OpenAI client
        logger.info(f"Calling archgw at {LLM_GATEWAY_ENDPOINT} to generate response")

        # Prepare extra headers if traceparent is provided
        extra_headers = {"x-envoy-max-retries": "3"}
        if traceparent_header:
            extra_headers["traceparent"] = traceparent_header

        response = await archgw_client.chat.completions.create(
            model=RESPONSE_MODEL,
            messages=response_messages,
            temperature=request_body.temperature or 0.7,
            max_tokens=request_body.max_tokens or 1000,
            extra_headers=extra_headers,
        )

        generated_response = response.choices[0].message.content.strip()
        logger.info("Response generated successfully")

        return ChatCompletionResponse(
            id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
            created=int(time.time()),
            model=request_body.model,
            choices=[
                {
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": generated_response,
                    },
                    "finish_reason": "stop",
                }
            ],
            usage=_token_usage(request_body.messages, generated_response),
        )

    except Exception as e:
        logger.error(f"Error generating response: {e}")

        # Fallback response
        fallback_message = "I apologize, but I'm having trouble generating a response right now. Please try again."
        return ChatCompletionResponse(
            id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
            created=int(time.time()),
            model=request_body.model,
            choices=[
                {
                    "index": 0,
                    "message": {"role": "assistant", "content": fallback_message},
                    "finish_reason": "stop",
                }
            ],
            usage=_token_usage(request_body.messages, fallback_message),
        )
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health_check():
|
||||
"""Health check endpoint."""
|
||||
return {"status": "healthy"}
|
||||
|
||||
|
||||
def start_server(host: str = "localhost", port: int = 8000):
    """Start the REST server."""
    # Route uvicorn's logging through a single stdout handler whose format
    # matches this module's own "[RESPONSE_GENERATOR]" log lines.
    log_config = {
        "version": 1,
        "disable_existing_loggers": False,
        "formatters": {
            "default": {
                "format": "%(asctime)s - [RESPONSE_GENERATOR] - %(levelname)s - %(message)s",
            },
        },
        "handlers": {
            "default": {
                "formatter": "default",
                "class": "logging.StreamHandler",
                "stream": "ext://sys.stdout",
            },
        },
        "root": {
            "level": "INFO",
            "handlers": ["default"],
        },
    }
    uvicorn.run(app, host=host, port=port, log_config=log_config)
|
||||
|
|
@ -0,0 +1,257 @@
|
|||
path,content
|
||||
TechCorp_CloudServices_SLA_Agreement_2024,"SERVICE LEVEL AGREEMENT
|
||||
This Service Level Agreement (""SLA"") is entered into on March 15, 2024, between TechCorp Solutions Inc., a Delaware corporation (""Provider""), and CloudFirst Enterprises LLC (""Customer"").
|
||||
|
||||
DEFINITIONS
|
||||
Service Availability: The percentage of time during which the cloud services are operational and accessible.
|
||||
Downtime: Any period when the services are unavailable or inaccessible to Customer.
|
||||
Response Time: The time between service request submission and initial response from Provider.
|
||||
|
||||
SERVICE COMMITMENTS
|
||||
Provider guarantees 99.9% uptime for all cloud infrastructure services during any calendar month.
|
||||
Average response time for API calls shall not exceed 200 milliseconds under normal operating conditions.
|
||||
Customer support response times: Critical issues within 1 hour, Standard issues within 4 hours.
|
||||
|
||||
REMEDIES
|
||||
For each full percentage point below 99.9% availability, Customer receives 10% credit on monthly fees.
|
||||
If response times exceed 500ms for more than 5 minutes in any hour, Customer receives 5% monthly credit.
|
||||
|
||||
MONITORING AND REPORTING
|
||||
Provider will maintain real-time monitoring systems and provide monthly performance reports.
|
||||
All metrics will be measured from Provider's monitoring systems located in primary data centers.
|
||||
|
||||
This SLA remains in effect for the duration of the underlying service agreement.
|
||||
|
||||
Executed by:
|
||||
TechCorp Solutions Inc.
|
||||
Sarah Mitchell, VP Operations
|
||||
Date: March 15, 2024
|
||||
|
||||
CloudFirst Enterprises LLC
|
||||
Robert Chen, CTO
|
||||
Date: March 16, 2024"
|
||||
|
||||
DataSecure_Privacy_Policy_v3.2,"PRIVACY POLICY
|
||||
DataSecure Analytics, Inc. (""Company"") Privacy Policy
|
||||
Effective Date: January 1, 2024
|
||||
Last Updated: February 28, 2024
|
||||
|
||||
INFORMATION COLLECTION
|
||||
We collect information you provide directly, such as account details, usage preferences, and communication records.
|
||||
Automatically collected data includes IP addresses, browser types, device information, and service interaction logs.
|
||||
Third-party integrations may provide additional user behavior and demographic information with consent.
|
||||
|
||||
DATA USAGE
|
||||
Personal information is used to provide services, improve user experience, and communicate service updates.
|
||||
Aggregated, non-identifiable data may be used for analytics, research, and service enhancement.
|
||||
We do not sell personal information to third parties for marketing purposes.
|
||||
|
||||
DATA PROTECTION
|
||||
All data is encrypted in transit using TLS 1.3 and at rest using AES-256 encryption.
|
||||
Access controls limit data access to authorized personnel only on a need-to-know basis.
|
||||
Regular security audits and penetration testing ensure ongoing protection measures.
|
||||
|
||||
DATA RETENTION
|
||||
Personal data is retained for the duration of active service plus 24 months.
|
||||
Logs and analytics data are retained for 12 months unless legally required otherwise.
|
||||
Upon account deletion, personal data is permanently removed within 30 days.
|
||||
|
||||
USER RIGHTS
|
||||
Users may request access to, correction of, or deletion of their personal information.
|
||||
Data portability requests will be fulfilled in standard formats within 30 days.
|
||||
Marketing communications can be opted out of at any time.
|
||||
|
||||
CONTACT
|
||||
For privacy concerns, contact: privacy@datasecure.com
|
||||
Data Protection Officer: Jennifer Walsh, jwalsh@datasecure.com"
|
||||
|
||||
GlobalManufacturing_SupplyChain_Contract_Q2_2024,"SUPPLY CHAIN AGREEMENT
|
||||
This Supply Chain Agreement is entered into between GlobalManufacturing Corp (""Buyer"") and PrecisionParts Ltd (""Supplier"") effective April 1, 2024.
|
||||
|
||||
SCOPE OF SERVICES
|
||||
Supplier will provide automotive components including brake assemblies, suspension parts, and electrical harnesses.
|
||||
All products must meet ISO 9001 quality standards and automotive industry specifications.
|
||||
Delivery schedule: Weekly shipments every Tuesday, with 48-hour advance shipping notifications.
|
||||
|
||||
PRICING AND PAYMENT
|
||||
Component pricing is fixed for initial 6-month term with quarterly price review thereafter.
|
||||
Payment terms: Net 45 days from invoice date via electronic transfer.
|
||||
Volume discounts apply: 5% for orders exceeding 10,000 units per month, 8% for orders exceeding 25,000 units.
|
||||
|
||||
QUALITY REQUIREMENTS
|
||||
All components must pass incoming inspection with less than 0.1% defect rate.
|
||||
Supplier maintains quality certifications including IATF 16949 and environmental compliance.
|
||||
Batch tracking and traceability required for all delivered components.
|
||||
|
||||
LOGISTICS AND DELIVERY
|
||||
Supplier responsible for packaging, labeling, and delivery to Buyer's distribution centers.
|
||||
Delivery windows: 8 AM - 4 PM, Monday through Friday, with advance appointment scheduling.
|
||||
Late delivery penalties: 2% of shipment value for each day beyond scheduled delivery.
|
||||
|
||||
RISK MANAGEMENT
|
||||
Supplier maintains business continuity plans and alternative sourcing strategies.
|
||||
Force majeure events must be reported within 24 hours with mitigation plans.
|
||||
Insurance requirements: $5M general liability, $2M product liability coverage.
|
||||
|
||||
INTELLECTUAL PROPERTY
|
||||
All custom tooling and specifications remain property of Buyer.
|
||||
Supplier grants license to use necessary patents for component manufacturing.
|
||||
|
||||
This agreement shall remain in effect for 24 months with automatic renewal unless terminated.
|
||||
|
||||
GlobalManufacturing Corp
|
||||
Michael Rodriguez, Supply Chain Director
|
||||
Date: April 1, 2024
|
||||
|
||||
PrecisionParts Ltd
|
||||
Amanda Foster, VP Sales
|
||||
Date: April 2, 2024"
|
||||
|
||||
EduTech_StudentData_Management_Policy_2024,"STUDENT DATA MANAGEMENT POLICY
|
||||
EduTech Learning Platform - Data Management and Protection Policy
|
||||
Document Version: 2.1
|
||||
Effective Date: August 15, 2024
|
||||
|
||||
SCOPE AND PURPOSE
|
||||
This policy governs the collection, use, storage, and protection of student educational records and personal information.
|
||||
Applies to all employees, contractors, and third-party service providers accessing student data.
|
||||
Compliance with FERPA, COPPA, and state student privacy laws is mandatory.
|
||||
|
||||
DATA CLASSIFICATION
|
||||
Educational Records: Grades, attendance, assignments, and academic progress information.
|
||||
Personal Information: Names, addresses, contact details, and demographic information.
|
||||
Behavioral Data: Learning patterns, platform usage, and engagement metrics.
|
||||
|
||||
COLLECTION PRINCIPLES
|
||||
Data collection is limited to educational purposes and service improvement only.
|
||||
Parental consent required for students under 13 years of age.
|
||||
Students and parents have right to review and request corrections to educational records.
|
||||
|
||||
ACCESS CONTROLS
|
||||
Role-based access ensures personnel see only data necessary for their functions.
|
||||
Multi-factor authentication required for all system access.
|
||||
Access logs maintained and reviewed monthly for unauthorized activity.
|
||||
|
||||
DATA SHARING
|
||||
Educational records shared only with authorized school personnel and parents/students.
|
||||
No data sharing with third parties for commercial purposes without explicit consent.
|
||||
Research data must be de-identified and aggregated before external sharing.
|
||||
|
||||
SECURITY MEASURES
|
||||
Data encrypted using industry-standard protocols during transmission and storage.
|
||||
Regular security assessments and vulnerability testing conducted quarterly.
|
||||
Incident response plan includes notification procedures for data breaches.
|
||||
|
||||
RETENTION AND DISPOSAL
|
||||
Student records retained according to school district policies, typically 5-7 years post-graduation.
|
||||
Inactive accounts and associated data purged after 2 years of non-use.
|
||||
Secure data destruction protocols ensure complete removal of sensitive information.
|
||||
|
||||
COMPLIANCE MONITORING
|
||||
Annual privacy training required for all staff handling student data.
|
||||
Regular audits ensure ongoing compliance with applicable privacy regulations.
|
||||
Privacy impact assessments conducted for new features or data uses.
|
||||
|
||||
Contact: Dr. Lisa Thompson, Chief Privacy Officer
|
||||
Email: privacy@edutech-learning.com
|
||||
Phone: (555) 123-4567"
|
||||
|
||||
FinanceFirst_Investment_Advisory_Agreement_2024,"INVESTMENT ADVISORY AGREEMENT
|
||||
This Investment Advisory Agreement is entered into between FinanceFirst Advisors LLC (""Advisor"") and Madison Investment Group (""Client"") on May 20, 2024.
|
||||
|
||||
ADVISORY SERVICES
|
||||
Advisor will provide comprehensive investment management and financial planning services.
|
||||
Services include portfolio construction, asset allocation, risk assessment, and performance monitoring.
|
||||
Regular portfolio reviews conducted quarterly with detailed performance reporting.
|
||||
|
||||
INVESTMENT AUTHORITY
|
||||
Client grants Advisor discretionary authority to make investment decisions within agreed parameters.
|
||||
Investment universe includes stocks, bonds, ETFs, mutual funds, and alternative investments as appropriate.
|
||||
All trades executed through qualified broker-dealers with best execution practices.
|
||||
|
||||
FEE STRUCTURE
|
||||
Management fee: 1.25% annually on assets under management, calculated and billed quarterly.
|
||||
Performance fee: 15% of returns exceeding S&P 500 benchmark, calculated annually.
|
||||
Additional fees may apply for specialized services such as tax planning or estate planning.
|
||||
|
||||
CLIENT RESPONSIBILITIES
|
||||
Client must provide accurate financial information and promptly communicate changes in circumstances.
|
||||
Investment objectives and risk tolerance should be reviewed and updated annually.
|
||||
Client responsible for reviewing and approving investment policy statement.
|
||||
|
||||
RISK DISCLOSURE
|
||||
All investments carry risk of loss, and past performance does not guarantee future results.
|
||||
Diversification does not ensure profit or protect against loss in declining markets.
|
||||
Alternative investments may have limited liquidity and higher volatility.
|
||||
|
||||
REGULATORY COMPLIANCE
|
||||
Advisor is registered with the Securities and Exchange Commission as an investment advisor.
|
||||
All activities conducted in accordance with Investment Advisers Act of 1940 and applicable regulations.
|
||||
Form ADV Part 2 brochure provided annually with material updates.
|
||||
|
||||
CONFIDENTIALITY
|
||||
All client information treated as confidential and shared only as necessary for service provision.
|
||||
Third-party service providers bound by confidentiality agreements.
|
||||
Client data protected through secure systems and access controls.
|
||||
|
||||
TERMINATION
|
||||
Either party may terminate agreement with 30 days written notice.
|
||||
Upon termination, Advisor will assist with orderly transfer of assets to new custodian or advisor.
|
||||
Final fee calculation prorated to date of termination.
|
||||
|
||||
FinanceFirst Advisors LLC
|
||||
Thomas Anderson, Managing Partner
|
||||
Date: May 20, 2024
|
||||
|
||||
Madison Investment Group
|
||||
Rebecca Martinez, Chief Investment Officer
|
||||
Date: May 21, 2024"
|
||||
|
||||
HealthSystem_PatientCare_Standards_2024,"PATIENT CARE STANDARDS AND PROTOCOLS
|
||||
Metropolitan Health System - Clinical Care Standards
|
||||
Document ID: MHS-PCS-2024-001
|
||||
Effective Date: June 1, 2024
|
||||
|
||||
PATIENT SAFETY PROTOCOLS
|
||||
All patients must have proper identification verification using two unique identifiers.
|
||||
Medication administration requires independent double-check for high-risk medications.
|
||||
Fall risk assessments completed within 4 hours of admission with appropriate interventions.
|
||||
|
||||
CLINICAL DOCUMENTATION
|
||||
Medical records must be completed within 24 hours of patient encounter.
|
||||
All entries require electronic signature with timestamp and provider identification.
|
||||
Critical values and abnormal results must be communicated and documented immediately.
|
||||
|
||||
INFECTION CONTROL
|
||||
Hand hygiene compliance monitored with target rate of 95% or higher.
|
||||
Personal protective equipment used according to transmission-based precautions.
|
||||
Isolation procedures implemented within 2 hours of identification of infectious conditions.
|
||||
|
||||
EMERGENCY RESPONSE
|
||||
Code team response time target: 3 minutes from activation to arrival.
|
||||
Crash cart and emergency equipment checks performed daily and documented.
|
||||
All staff required to maintain current CPR and emergency response certifications.
|
||||
|
||||
PATIENT COMMUNICATION
|
||||
Patient rights and responsibilities communicated upon admission.
|
||||
Informed consent obtained and documented prior to procedures and treatments.
|
||||
Family involvement encouraged with respect for patient privacy preferences.
|
||||
|
||||
QUALITY MEASURES
|
||||
Patient satisfaction scores monitored monthly with target of 4.5/5.0 or higher.
|
||||
Medication error rates tracked with goal of less than 1 per 1000 patient days.
|
||||
Hospital-acquired infection rates measured and benchmarked against national standards.
|
||||
|
||||
STAFF COMPETENCY
|
||||
Annual competency assessments required for all clinical staff.
|
||||
Continuing education requirements: 24 hours annually for nurses, 40 hours for physicians.
|
||||
Specialty certifications maintained according to department and role requirements.
|
||||
|
||||
TECHNOLOGY STANDARDS
|
||||
Electronic health record system used for all patient documentation.
|
||||
Telemedicine capabilities available for remote consultations and monitoring.
|
||||
Clinical decision support tools integrated to assist with diagnosis and treatment decisions.
|
||||
|
||||
Contact: Dr. Patricia Williams, Chief Medical Officer
|
||||
Email: pwilliams@metrohealthsystem.org
|
||||
Phone: (555) 987-6543"
|
||||
|
47
demos/use_cases/mcp_filter/start_agents.sh
Normal file
47
demos/use_cases/mcp_filter/start_agents.sh
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
WAIT_FOR_PIDS=()
|
||||
|
||||
log() {
|
||||
timestamp=$(python3 -c 'from datetime import datetime; print(datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:23])')
|
||||
message="$*"
|
||||
echo "$timestamp - $message"
|
||||
}
|
||||
|
||||
cleanup() {
|
||||
log "Caught signal, terminating all user processes ..."
|
||||
for PID in "${WAIT_FOR_PIDS[@]}"; do
|
||||
if kill $PID 2> /dev/null; then
|
||||
log "killed process: $PID"
|
||||
fi
|
||||
done
|
||||
exit 1
|
||||
}
|
||||
|
||||
trap cleanup EXIT
|
||||
|
||||
# log "Starting input guards filter on port 10500..."
|
||||
# uv run python -m rag_agent --host 0.0.0.0 --port 10500 --agent input_guards &
|
||||
# WAIT_FOR_PIDS+=($!)
|
||||
|
||||
|
||||
log "Starting query_parser agent on port 10501..."
|
||||
uv run python -m rag_agent --host 0.0.0.0 --port 10501 --agent query_rewriter &
|
||||
WAIT_FOR_PIDS+=($!)
|
||||
|
||||
log "Starting context_builder agent on port 10502..."
|
||||
uv run python -m rag_agent --host 0.0.0.0 --port 10502 --agent context_builder &
|
||||
WAIT_FOR_PIDS+=($!)
|
||||
|
||||
# log "Starting response_generator agent on port 10400..."
|
||||
# uv run python -m rag_agent --host 0.0.0.0 --port 10400 --agent response_generator &
|
||||
# WAIT_FOR_PIDS+=($!)
|
||||
|
||||
log "Starting response_generator agent on port 10505..."
|
||||
uv run python -m rag_agent --rest-server --host 0.0.0.0 --rest-port 10505 --agent response_generator &
|
||||
WAIT_FOR_PIDS+=($!)
|
||||
|
||||
for PID in "${WAIT_FOR_PIDS[@]}"; do
|
||||
wait "$PID"
|
||||
done
|
||||
95
demos/use_cases/mcp_filter/test.rest
Normal file
95
demos/use_cases/mcp_filter/test.rest
Normal file
|
|
@ -0,0 +1,95 @@
|
|||
@baseUrl = http://0.0.0.0:10502
|
||||
@model = gpt-4o
|
||||
|
||||
# Health Check
|
||||
GET {{baseUrl}}/health
|
||||
|
||||
###
|
||||
|
||||
# Test 1: Simple Non-Streaming Chat Completion
|
||||
POST {{baseUrl}}/v1/chat/completions
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"model": "{{model}}",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hello! Can you help me understand what machine learning is?"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
###
|
||||
|
||||
# Test 2: Simple Streaming Chat Completion
|
||||
POST {{baseUrl}}/v1/chat/completions
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"model": "{{model}}",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Explain the concept of artificial intelligence in simple terms."
|
||||
}
|
||||
],
|
||||
"stream": true
|
||||
}
|
||||
|
||||
### Test 3
|
||||
POST http://localhost:8001/v1/chat/completions
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"model": "{{model}}",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What is the guaranteed uptime percentage for TechCorp's cloud services?"
|
||||
}
|
||||
],
|
||||
"stream": true
|
||||
}
|
||||
|
||||
### send request to context builder agent
|
||||
POST http://localhost:10501/v1/chat/completions
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"model": "gpt-4o-mini",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What is the guaranteed uptime percentage for TechCorp's cloud services?"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
### test fast-llm
|
||||
POST http://localhost:12000/v1/chat/completions
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"model": "fast-llm",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "hello"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
### test smart-llm
|
||||
POST http://localhost:12000/v1/chat/completions
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"model": "smart-llm",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "hello"
|
||||
}
|
||||
]
|
||||
}
|
||||
1830
demos/use_cases/mcp_filter/uv.lock
generated
Normal file
1830
demos/use_cases/mcp_filter/uv.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -10,7 +10,6 @@ services:
|
|||
volumes:
|
||||
- ../../demos/samples_python/weather_forecast/arch_config.yaml:/app/arch_config.yaml
|
||||
- /etc/ssl/cert.pem:/etc/ssl/cert.pem
|
||||
- ~/archgw_logs:/var/log/
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
environment:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue