agents framework demo

2026-06-17 15:25:17 +02:00 · 2025-11-24 15:02:52 -08:00 · 2025-11-24 15:02:52 -08:00 · dcfc85ca74
commit dcfc85ca74
parent b01a81927d
15 changed files with 2787 additions and 0 deletions
--- a/demos/use_cases/rag_agent/README.md
+++ b/demos/use_cases/rag_agent/README.md
@ -0,0 +1,28 @@
+# RAG Agent Query Parser
+
+A FastAPI service that rewrites user queries using archgw and gpt-4o-mini for better retrieval accuracy.
+
+## How it Works
+
+1. Receives a chat completion request with conversation history
+2. Calls archgw's LLM gateway with gpt-4o-mini to rewrite the last user query
+3. Returns the rewritten query as the assistant response
+
+## Setup and Running
+
+1. **Start archgw**:
+   ```bash
+   archgw up --foreground
+   ```
+
+2. **Start the query parser service**:
+   ```bash
+   uv run python -m rag_agent.query_parser
+   ```
+
+## Configuration
+
+```bash
+# archgw LLM Gateway base URL (default: http://localhost:12000/v1)
+export LLM_GATEWAY_ENDPOINT="http://localhost:12000/v1"
+```
--- a/demos/use_cases/rag_agent/arch_config.yaml
+++ b/demos/use_cases/rag_agent/arch_config.yaml
@ -0,0 +1,37 @@
+# Gateway configuration for the RAG agent demo.
+version: v0.3.0
+
+# HTTP endpoints of the agent services; the ids are referenced by the listener below.
+agents:
+  - id: query_rewriter
+    url: http://host.docker.internal:10500/v1/chat/completions
+  - id: context_builder
+    url: http://host.docker.internal:10501/v1/chat/completions
+  - id: rag_agent
+    url: http://host.docker.internal:10502/v1/chat/completions
+
+# Upstream models; both entries reference $OPENAI_API_KEY for their access key.
+model_providers:
+  - model: openai/gpt-4o-mini
+    access_key: $OPENAI_API_KEY
+    default: true
+  - model: openai/gpt-4o
+    access_key: $OPENAI_API_KEY
+
+# Logical model names mapped onto concrete target models.
+model_aliases:
+  fast-llm:
+    target: gpt-4o-mini
+  smart-llm:
+    target: gpt-4o
+
+# Agent listener on port 8001. rag_agent declares a filter_chain of
+# query_rewriter followed by context_builder.
+# NOTE(review): presumably the chain runs in the listed order before rag_agent
+# is called - confirm against the gateway documentation.
+listeners:
+  - type: agent
+    name: agent_1
+    port: 8001
+    router: arch_agent_router
+    agents:
+      - id: rag_agent
+        description: virtual assistant for device contracts for simple queries
+        filter_chain:
+          - query_rewriter
+          - context_builder
+
+# NOTE(review): random_sampling is presumably a percentage (100 = trace every
+# request) - confirm.
+tracing:
+  random_sampling: 100
--- a/demos/use_cases/rag_agent/docker-compose.yaml
+++ b/demos/use_cases/rag_agent/docker-compose.yaml
@ -0,0 +1,17 @@
+# Supporting containers for the demo: a Jaeger build and an Open WebUI front end.
+services:
+  jaeger:
+    build:
+      context: ../../shared/jaeger
+    ports:
+      # NOTE(review): presumably 16686 is the Jaeger UI and 4317/4318 are the
+      # OTLP gRPC/HTTP receivers - confirm against the shared/jaeger image.
+      - "16686:16686"
+      - "4317:4317"
+      - "4318:4318"
+  open-web-ui:
+    image: dyrnq/open-webui:main
+    restart: always
+    ports:
+      - "8080:8080"
+    environment:
+      - DEFAULT_MODEL=gpt-4o-mini
+      - ENABLE_OPENAI_API=true
+      # Port 8001 is the agent listener port declared in arch_config.yaml.
+      - OPENAI_API_BASE_URL=http://host.docker.internal:8001/v1
--- a/demos/use_cases/rag_agent/pyproject.toml
+++ b/demos/use_cases/rag_agent/pyproject.toml
@ -0,0 +1,22 @@
+# Package metadata for the RAG agent demo.
+[project]
+name = "rag_agent"
+version = "0.1.0"
+description = "RAG Agent"
+readme = "README.md"
+requires-python = ">=3.10"
+# Runtime dependencies: CLI (click), MCP server (mcp, fastmcp), REST agents
+# (fastapi, uvicorn, pydantic) and the OpenAI client used to call the gateway.
+dependencies = [
+    "click>=8.2.1",
+    "mcp>=1.13.1",
+    "fastmcp>=2.12.2",
+    "pydantic>=2.11.7",
+    "fastapi>=0.104.1",
+    "uvicorn>=0.24.0",
+    "openai>=1.0.0",
+]
+
+# Console script: the `rag_agent` command calls `main` from the rag_agent package.
+[project.scripts]
+rag_agent = "rag_agent:main"
+
+# The package is built with hatchling.
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
--- a/demos/use_cases/rag_agent/sample_queries.md
+++ b/demos/use_cases/rag_agent/sample_queries.md
@ -0,0 +1,64 @@
+# Sample Queries for Knowledge Base RAG Agent
+
+## Service Level Agreement Queries
+- What is the guaranteed uptime percentage for TechCorp's cloud services?
+- What remedies are available if the API response time exceeds the agreed threshold?
+- How quickly must TechCorp respond to critical support issues?
+- What monitoring and reporting requirements are specified in the SLA?
+- When was the TechCorp service agreement signed and by whom?
+
+## Privacy Policy Queries
+- What encryption methods does DataSecure use to protect data?
+- How long does DataSecure retain personal data after account deletion?
+- What rights do users have regarding their personal information?
+- Can DataSecure sell user data to third parties for marketing?
+- Who should be contacted for privacy-related concerns at DataSecure?
+
+## Supply Chain Agreement Queries
+- What types of automotive components does PrecisionParts supply?
+- What are the payment terms and volume discount structure?
+- What quality standards must the supplied components meet?
+- What are the penalties for late delivery?
+- What insurance coverage requirements apply to the supplier?
+
+## Student Data Management Queries
+- What federal laws must EduTech comply with regarding student data?
+- What security measures are in place to protect student information?
+- How long are student records retained after graduation?
+- What consent is required for students under 13 years old?
+- Who can access student educational records?
+
+## Investment Advisory Queries
+- What is FinanceFirst's management fee structure?
+- What types of investments are included in the advisory services?
+- What regulatory body oversees FinanceFirst Advisors?
+- How often are portfolio reviews conducted?
+- What are the client's responsibilities under this agreement?
+
+## Healthcare Standards Queries
+- What is the target response time for emergency code teams?
+- What hand hygiene compliance rate is required?
+- How quickly must medical records be completed after patient encounters?
+- What continuing education requirements apply to nursing staff?
+- What patient safety protocols are mandatory upon admission?
+
+## Cross-Document Queries
+- Which agreements include confidentiality or data protection provisions?
+- What are the common termination notice periods across different contract types?
+- Which documents specify insurance or liability coverage requirements?
+- What compliance and regulatory requirements are mentioned across agreements?
+- Which contracts include performance metrics or service level commitments?
+
+## Complex Analysis Queries
+- Compare the data retention policies across the privacy policy and student data management documents.
+- What are the different approaches to risk management across the supply chain and investment advisory agreements?
+- How do the security measures in the healthcare standards compare to those in the privacy policy?
+- Which agreements provide the most detailed compliance and regulatory frameworks?
+- What common themes exist in the quality assurance requirements across different industries?
+
+## Document-Specific Detail Queries
+- List all the specific percentages, timeframes, and numerical requirements mentioned in the SLA.
+- What are all the contact persons and their roles mentioned across the documents?
+- Identify all the compliance standards and certifications referenced in the supply chain agreement.
+- What are the specific consequences or penalties mentioned for non-compliance across agreements?
+- List all the third-party systems, tools, or services mentioned in the documents.
--- a/demos/use_cases/rag_agent/src/rag_agent/__init__.py
+++ b/demos/use_cases/rag_agent/src/rag_agent/__init__.py
@ -0,0 +1,63 @@
+import click
+from mcp.server.fastmcp import FastMCP
+
+# Module-level FastMCP handle; remains None until main() creates it in MCP mode.
+mcp = None
+
+
+@click.command()
+@click.option("--transport", "transport", default="stdio")
+@click.option("--host", "host", default="localhost")
+@click.option("--port", "port", default=10101)
+@click.option("--agent", "agent", default=None)
+@click.option(
+    "--rest-server",
+    "rest_server",
+    is_flag=True,
+    help="Start REST server instead of MCP server",
+)
+@click.option("--rest-port", "rest_port", default=8000, help="Port for REST server")
+def main(host, port, agent, transport, rest_server, rest_port):
+    # CLI entry point with two modes:
+    #   * default      - create the shared FastMCP server, import the selected
+    #                    agent module(s) and run the server on `transport`;
+    #   * --rest-server - start the FastAPI server of exactly one agent on
+    #                    `rest_port`.
+    # Kept as comments on purpose: click would surface a docstring as --help text.
+    global mcp
+
+    if not rest_server:
+        print(f"Starting agent(s): {agent if agent else 'all'}")
+        # The server object is created before the agent modules are imported.
+        mcp = FastMCP("RAG Agent Demo", host=host, port=port)
+
+        mcp_agents = ("query_parser", "document_store", "response_generator")
+        # Any value outside the known names (including None) loads every agent.
+        load_everything = agent not in mcp_agents
+        if load_everything or agent == "query_parser":
+            import rag_agent.query_parser
+        if load_everything or agent == "document_store":
+            import rag_agent.document_store
+        if load_everything or agent == "response_generator":
+            import rag_agent.response_generator
+        print("All agents loaded.")
+        mcp.run(transport=transport)
+        return
+
+    print(f"Starting REST server on {host}:{rest_port} for agent: {agent}")
+
+    # Import lazily so that only the requested agent module is loaded.
+    if agent == "query_parser":
+        from rag_agent.query_rewriter_agent import start_server
+    elif agent == "context_builder":
+        from rag_agent.context_builder_agent import (
+            start_server,
+        )
+    elif agent == "response_generator":
+        from rag_agent.response_generator_agent import start_server
+    else:
+        print("Please specify an agent to start with --agent option.")
+        return
+
+    start_server(host=host, port=rest_port)
+
+
+if __name__ == "__main__":
+    main()
--- a/demos/use_cases/rag_agent/src/rag_agent/__main__.py
+++ b/demos/use_cases/rag_agent/src/rag_agent/__main__.py
@ -0,0 +1,4 @@
+# Pull in the `main` callable exported by the enclosing package.
+from . import main
+
+# Run it only when this module is executed directly, not when it is imported.
+if __name__ == "__main__":
+    main()
--- a/demos/use_cases/rag_agent/src/rag_agent/api.py
+++ b/demos/use_cases/rag_agent/src/rag_agent/api.py
@ -0,0 +1,36 @@
+from pydantic import BaseModel
+from typing import List, Optional, Dict, Any
+
+
+# One chat message: the speaker `role` and its plain-text `content`.
+class ChatMessage(BaseModel):
+    role: str
+    content: str
+
+
+# Request body accepted by the agents' /v1/chat/completions endpoints.
+# Only `model` and `messages` are required; every sampling field has a default.
+class ChatCompletionRequest(BaseModel):
+    model: str
+    messages: List[ChatMessage]
+    temperature: Optional[float] = 1.0
+    max_tokens: Optional[int] = None
+    top_p: Optional[float] = 1.0
+    frequency_penalty: Optional[float] = 0.0
+    presence_penalty: Optional[float] = 0.0
+    # When true, the response generator agent answers with a streamed response.
+    stream: Optional[bool] = False
+    stop: Optional[List[str]] = None
+
+
+# Non-streaming response envelope; `object` defaults to "chat.completion".
+# `choices` and `usage` are free-form dicts filled in by each agent.
+class ChatCompletionResponse(BaseModel):
+    id: str
+    object: str = "chat.completion"
+    created: int
+    model: str
+    choices: List[Dict[str, Any]]
+    usage: Dict[str, int]
+
+
+# One chunk of a streamed response; `object` defaults to "chat.completion.chunk".
+# Unlike ChatCompletionResponse it carries no `usage` field.
+class ChatCompletionStreamResponse(BaseModel):
+    id: str
+    object: str = "chat.completion.chunk"
+    created: int
+    model: str
+    choices: List[Dict[str, Any]]
--- a/demos/use_cases/rag_agent/src/rag_agent/context_builder_agent.py
+++ b/demos/use_cases/rag_agent/src/rag_agent/context_builder_agent.py
@ -0,0 +1,280 @@
+import json
+from pydantic import BaseModel
+from typing import List, Optional, Dict, Any
+from fastapi import FastAPI, HTTPException, Request
+from openai import AsyncOpenAI
+import os
+import logging
+import csv
+from pathlib import Path
+import uvicorn
+
+from .api import ChatMessage, ChatCompletionRequest, ChatCompletionResponse
+
+
+# Set up logging; the [CONTEXT_BUILDER] tag identifies this agent's log lines.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - [CONTEXT_BUILDER]    - %(levelname)s - %(message)s",
+)
+logger = logging.getLogger(__name__)
+
+
+# Configuration for archgw LLM gateway; the endpoint can be overridden through
+# the LLM_GATEWAY_ENDPOINT environment variable.
+LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:12000/v1")
+# Model name sent to the gateway when selecting passages.
+RAG_MODEL = "gpt-4o-mini"
+
+# Initialize OpenAI client for archgw
+archgw_client = AsyncOpenAI(
+    base_url=LLM_GATEWAY_ENDPOINT,
+    api_key="EMPTY",  # archgw doesn't require a real API key
+)
+
+# Global variable to store the knowledge base: a list of
+# {"path": ..., "content": ...} dicts filled by load_knowledge_base().
+knowledge_base = []
+
+
+def load_knowledge_base():
+    """Load sample_knowledge_base.csv (next to this module) into ``knowledge_base``.
+
+    Every CSV row must provide ``path`` and ``content`` columns; each row is
+    stored as a ``{"path": ..., "content": ...}`` dict.
+
+    The function never raises: on any failure (missing file, missing column,
+    decoding error) the error is logged and ``knowledge_base`` is left empty.
+    """
+    global knowledge_base
+
+    # The CSV file lives in the same directory as this module.
+    csv_path = Path(__file__).parent / "sample_knowledge_base.csv"
+    logger.info(f"Loading knowledge base from {csv_path}")
+
+    try:
+        # newline="" hands line-ending handling to the csv module, which keeps
+        # quoted fields with embedded newlines intact; utf-8-sig drops a
+        # leading byte-order mark if the file has one.
+        with open(csv_path, "r", encoding="utf-8-sig", newline="") as file:
+            documents = [
+                {"path": row["path"], "content": row["content"]}
+                for row in csv.DictReader(file)
+            ]
+    except Exception as e:
+        logger.error(f"Error loading knowledge base: {e}")
+        knowledge_base = []
+    else:
+        knowledge_base = documents
+        logger.info(f"Loaded {len(knowledge_base)} documents from knowledge base")
+
+
+async def find_relevant_passages(
+    query: str, traceparent: Optional[str] = None, top_k: int = 3
+) -> List[Dict[str, str]]:
+    """Use the LLM to find the most relevant passages from the knowledge base.
+
+    The prompt lists every knowledge-base entry (content truncated to 500
+    characters) with its index and asks the model to answer with a
+    comma-separated list of indices, or "NONE".
+
+    Args:
+        query: The user query to match passages against.
+        traceparent: Optional trace header value forwarded to the gateway.
+        top_k: Maximum number of passages to return.
+
+    Returns:
+        Up to ``top_k`` distinct knowledge-base entries, in the order the model
+        named them. Returns an empty list when the knowledge base is empty,
+        the model answers "NONE" or nothing usable, or the gateway call fails.
+    """
+
+    if not knowledge_base:
+        logger.warning("Knowledge base is empty")
+        return []
+
+    # Create a system prompt for passage selection
+    system_prompt = f"""You are a retrieval assistant that selects the most relevant document passages for a given query.
+
+                    Given a user query and a list of document passages, identify the {top_k} most relevant passages that would help answer the query.
+
+                    Query: {query}
+
+                    Available passages:
+                    """
+
+    # Add all passages with indices
+    for i, doc in enumerate(knowledge_base):
+        system_prompt += (
+            f"\n[{i}] Path: {doc['path']}\nContent: {doc['content'][:500]}...\n"
+        )
+
+    system_prompt += f"""
+
+        Please respond with ONLY the indices of the {top_k} most relevant passages, separated by commas (e.g., "0,3,7").
+        If fewer than {top_k} passages are relevant, return only the relevant ones.
+        If no passages are relevant, return "NONE"."""
+
+    try:
+        # Call archgw to select relevant passages
+        logger.info(f"Calling archgw to find relevant passages for query: '{query}'")
+
+        # Prepare extra headers if traceparent is provided
+        extra_headers = {"x-envoy-max-retries": "3"}
+        if traceparent:
+            extra_headers["traceparent"] = traceparent
+
+        response = await archgw_client.chat.completions.create(
+            model=RAG_MODEL,
+            messages=[{"role": "system", "content": system_prompt}],
+            temperature=0.1,
+            max_tokens=50,
+            extra_headers=extra_headers,
+        )
+
+        # The message content can be None; treat that like an empty answer
+        # instead of relying on the broad except below.
+        result = (response.choices[0].message.content or "").strip()
+        logger.info(f"LLM selected passages: {result}")
+
+        if not result or result.upper() == "NONE":
+            return []
+
+        selected_passages = []
+        seen_indices = set()
+        for token in result.split(","):
+            # Enforce the limit: the model may list more indices than requested.
+            if len(selected_passages) >= top_k:
+                break
+            # Accept bracketed answers such as "[0], [3]", which mirror the
+            # "[i]" labels used in the prompt.
+            token = token.strip().strip("[]")
+            # isdecimal() only accepts characters that int() can parse.
+            if not token.isdecimal():
+                continue
+            idx = int(token)
+            # Skip repeated and out-of-range indices.
+            if idx in seen_indices or not 0 <= idx < len(knowledge_base):
+                continue
+            seen_indices.add(idx)
+            selected_passages.append(knowledge_base[idx])
+
+        logger.info(f"Selected {len(selected_passages)} relevant passages")
+        return selected_passages
+
+    except Exception as e:
+        # Best effort: a failed lookup degrades to "no context" rather than an error.
+        logger.error(f"Error finding relevant passages: {e}")
+        return []
+
+
+async def augment_query_with_context(
+    messages: List[ChatMessage], traceparent: Optional[str] = None
+) -> List[ChatMessage]:
+    """Append retrieved knowledge-base context to the most recent user message.
+
+    Returns the original ``messages`` object unchanged when there is no
+    (non-empty) user message or no relevant passage is found; otherwise returns
+    a shallow copy in which the last user message is replaced by an augmented one.
+    """
+
+    # Index of the most recent user message, or -1 when there is none.
+    last_user_index = next(
+        (
+            position
+            for position in range(len(messages) - 1, -1, -1)
+            if messages[position].role == "user"
+        ),
+        -1,
+    )
+    last_user_message = (
+        messages[last_user_index].content if last_user_index >= 0 else None
+    )
+
+    if not last_user_message:
+        logger.warning("No user message found in conversation")
+        return messages
+
+    logger.info(f"Processing user query: '{last_user_message}'")
+
+    relevant_passages = await find_relevant_passages(last_user_message, traceparent)
+    if not relevant_passages:
+        logger.info("No relevant passages found, returning original messages")
+        return messages
+
+    # One "Document N (path):" section per passage, separated by blank lines.
+    context = "\n\n".join(
+        f"Document {number} ({passage['path']}):\n{passage['content']}"
+        for number, passage in enumerate(relevant_passages, start=1)
+    )
+
+    # Original query followed by the retrieved context.
+    augmented_content = f"{last_user_message} RELEVANT CONTEXT:\n    {context}"
+
+    # Copy the list so the caller's list is not modified.
+    updated_messages = list(messages)
+    updated_messages[last_user_index] = ChatMessage(
+        role="user", content=augmented_content
+    )
+
+    logger.info(f"Augmented user query with {len(relevant_passages)} relevant passages")
+
+    return updated_messages
+
+
+# Query plus free-form metadata. Not referenced elsewhere in this module.
+class Response(BaseModel):
+    query: str
+    metadata: dict
+
+
+# FastAPI app for REST server
+app = FastAPI(title="RAG Context Builder Agent", version="1.0.0")
+
+
+@app.post("/v1/chat/completions")
+async def chat_completions(
+    request_body: ChatCompletionRequest, request: Request
+) -> ChatCompletionResponse:
+    """Chat completions endpoint that augments user queries with relevant context from the knowledge base.
+
+    The reply is not a model answer: its single choice carries the whole
+    (possibly augmented) message list, JSON-encoded, in ``message.content``.
+    The ``usage`` numbers are whitespace-split word counts, not real token counts.
+    """
+    import time
+    import uuid
+
+    logger.info(
+        f"Received chat completion request with {len(request_body.messages)} messages"
+    )
+
+    # Read traceparent header if present
+    traceparent_header = request.headers.get("traceparent")
+    if traceparent_header:
+        logger.info(f"Received traceparent header: {traceparent_header}")
+    else:
+        logger.info("No traceparent header found")
+
+    # Augment the user query with relevant context
+    updated_messages = await augment_query_with_context(
+        request_body.messages, traceparent_header
+    )
+    # model_dump() is the Pydantic v2 replacement for the deprecated .dict().
+    messages_history_json = json.dumps([msg.model_dump() for msg in updated_messages])
+
+    # Compute the word counts once and reuse them for the total.
+    prompt_tokens = sum(len(msg.content.split()) for msg in updated_messages)
+    completion_tokens = len("Context added to user query.".split())
+
+    response = ChatCompletionResponse(
+        id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
+        created=int(time.time()),
+        model=request_body.model,
+        choices=[
+            {
+                "index": 0,
+                "message": {"role": "user", "content": messages_history_json},
+                "finish_reason": "stop",
+            }
+        ],
+        usage={
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": prompt_tokens + completion_tokens,
+        },
+    )
+
+    return response
+
+
+def main():
+    """Load the knowledge base, then serve the app on all interfaces, port 8000."""
+    load_knowledge_base()
+
+    bind_host, bind_port = "0.0.0.0", 8000
+    uvicorn.run(app, host=bind_host, port=bind_port)
+
+
+# Direct execution uses main(); start_server() below is the separate entry
+# point that takes host/port arguments and installs a custom log format.
+if __name__ == "__main__":
+    main()
+
+
+def start_server(host: str = "localhost", port: int = 8000):
+    """Load the knowledge base and serve the app with uvicorn on ``host``:``port``.
+
+    A logging config is passed to uvicorn so that everything reaching the root
+    logger is written to stdout in this agent's tagged format.
+    """
+    load_knowledge_base()
+
+    log_format = "%(asctime)s - [CONTEXT_BUILDER]    - %(levelname)s - %(message)s"
+    stdout_handler = {
+        "formatter": "default",
+        "class": "logging.StreamHandler",
+        "stream": "ext://sys.stdout",
+    }
+    logging_config = {
+        "version": 1,
+        # Keep loggers created before uvicorn applies this configuration.
+        "disable_existing_loggers": False,
+        "formatters": {"default": {"format": log_format}},
+        "handlers": {"default": stdout_handler},
+        "root": {"level": "INFO", "handlers": ["default"]},
+    }
+
+    uvicorn.run(app, host=host, port=port, log_config=logging_config)
--- a/demos/use_cases/rag_agent/src/rag_agent/query_rewriter_agent.py
+++ b/demos/use_cases/rag_agent/src/rag_agent/query_rewriter_agent.py
@ -0,0 +1,188 @@
+import json
+from pydantic import BaseModel
+from typing import List, Optional, Dict, Any
+from fastapi import FastAPI, HTTPException, Request
+from openai import AsyncOpenAI
+import os
+import logging
+import uvicorn
+
+from .api import ChatMessage, ChatCompletionRequest, ChatCompletionResponse
+
+
+# Set up logging; the [QUERY_REWRITER] tag identifies this agent's log lines.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - [QUERY_REWRITER]     - %(levelname)s - %(message)s",
+)
+logger = logging.getLogger(__name__)
+
+
+# Configuration for archgw LLM gateway; the endpoint can be overridden through
+# the LLM_GATEWAY_ENDPOINT environment variable.
+LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:12000/v1")
+# Model name sent to the gateway for query rewriting.
+QUERY_REWRITE_MODEL = "gpt-4o-mini"
+
+# Initialize OpenAI client for archgw
+archgw_client = AsyncOpenAI(
+    base_url=LLM_GATEWAY_ENDPOINT,
+    api_key="EMPTY",  # archgw doesn't require a real API key
+)
+
+
+async def rewrite_query_with_archgw(
+    messages: List[ChatMessage], traceparent_header: Optional[str]
+) -> str:
+    """Ask the gateway model to rewrite the last user message for retrieval.
+
+    Args:
+        messages: Full conversation history; it is sent to the model after a
+            system prompt describing the rewriting task.
+        traceparent_header: Trace header value to forward, or None.
+
+    Returns:
+        The rewritten query. If the call fails or the model returns no text,
+        the content of the last user message is returned instead; if there is
+        no user message at all, an empty string.
+    """
+    # Prepare the system prompt for query rewriting
+    system_prompt = """You are a query rewriter that improves user queries for better retrieval.
+
+    Given a conversation history, rewrite the last user message to be more specific and context-aware.
+    The rewritten query should:
+    1. Include relevant context from previous messages
+    2. Be clear and specific for information retrieval
+    3. Maintain the user's intent
+    4. Be concise but comprehensive
+
+    Return only the rewritten query, nothing else."""
+
+    # System prompt first, then the conversation history as-is.
+    rewrite_messages = [{"role": "system", "content": system_prompt}]
+    rewrite_messages.extend(
+        {"role": msg.role, "content": msg.content} for msg in messages
+    )
+
+    try:
+        # Call archgw using OpenAI client
+        extra_headers = {"x-envoy-max-retries": "3"}
+        if traceparent_header:
+            extra_headers["traceparent"] = traceparent_header
+        logger.info(f"Calling archgw at {LLM_GATEWAY_ENDPOINT} to rewrite query")
+        response = await archgw_client.chat.completions.create(
+            model=QUERY_REWRITE_MODEL,
+            messages=rewrite_messages,
+            temperature=0.3,
+            max_tokens=200,
+            extra_headers=extra_headers,
+        )
+
+        # The content can be None or blank; never replace the user's query
+        # with an empty string.
+        rewritten_query = (response.choices[0].message.content or "").strip()
+        if rewritten_query:
+            logger.info(f"Query rewritten successfully: '{rewritten_query}'")
+            return rewritten_query
+        logger.warning("archgw returned an empty rewritten query")
+
+    except Exception as e:
+        # Best effort: a failed rewrite must not fail the request.
+        logger.error(f"Error rewriting query: {e}")
+
+    # If rewriting fails, return the original last user message
+    logger.info("Falling back to original user message")
+    for message in reversed(messages):
+        if message.role == "user":
+            return message.content
+    return ""
+
+
+# Query plus free-form metadata; this is the type returned by parse_query().
+class Response(BaseModel):
+    query: str
+    metadata: dict
+
+
+# FastAPI app for REST server
+app = FastAPI(title="RAG Agent Query Parser", version="1.0.0")
+
+
+@app.post("/v1/chat/completions")
+async def chat_completions(request_body: ChatCompletionRequest, request: Request):
+    """Chat completions endpoint that rewrites the last user query using archgw.
+
+    The reply is not a model answer: its single choice carries the whole
+    message list, with the last user message replaced by the rewritten query,
+    JSON-encoded in ``message.content``. The ``usage`` numbers are
+    whitespace-split word counts, not real token counts.
+    """
+    import time
+    import uuid
+
+    logger.info(
+        f"Received chat completion request with {len(request_body.messages)} messages"
+    )
+
+    # Read traceparent header if present
+    traceparent_header = request.headers.get("traceparent")
+    if traceparent_header:
+        logger.info(f"Received traceparent header: {traceparent_header}")
+    else:
+        logger.info("No traceparent header found")
+
+    # Call archgw to rewrite the last user query
+    rewritten_query = await rewrite_query_with_archgw(
+        request_body.messages, traceparent_header
+    )
+
+    # Work on a copy so the parsed request body is left untouched.
+    updated_messages = request_body.messages.copy()
+
+    # Find and update the last user message with the rewritten query
+    for i in range(len(updated_messages) - 1, -1, -1):
+        if updated_messages[i].role == "user":
+            original_query = updated_messages[i].content
+            updated_messages[i] = ChatMessage(role="user", content=rewritten_query)
+            logger.info(
+                f"Updated user query from '{original_query}' to '{rewritten_query}'"
+            )
+            break
+
+    # model_dump() is the Pydantic v2 replacement for the deprecated .dict().
+    messages_history_json = json.dumps([msg.model_dump() for msg in updated_messages])
+
+    # Compute the word counts once and reuse them for the total.
+    prompt_tokens = sum(len(msg.content.split()) for msg in updated_messages)
+    completion_tokens = len("Updated query for better retrieval.".split())
+
+    response = ChatCompletionResponse(
+        id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
+        created=int(time.time()),
+        model=request_body.model,
+        choices=[
+            {
+                "index": 0,
+                "message": {"role": "user", "content": messages_history_json},
+                "finish_reason": "stop",
+            }
+        ],
+        usage={
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": prompt_tokens + completion_tokens,
+        },
+    )
+
+    return response
+
+
+@app.get("/health")
+async def health_check():
+    """Health check endpoint."""
+    # Fixed payload; nothing is probed.
+    status_payload = {"status": "healthy"}
+    return status_payload
+
+
+def parse_query(query):
+    """Wrap ``query`` in a Response whose metadata marks it as valid."""
+    # No parsing happens here: the metadata is always {"is_valid": True}.
+    fixed_metadata = {"is_valid": True}
+    return Response(query=query, metadata=fixed_metadata)
+
+
+def start_server(host: str = "localhost", port: int = 8000):
+    """Serve the app with uvicorn on ``host``:``port``.
+
+    A logging config is passed to uvicorn so that everything reaching the root
+    logger is written to stdout in this agent's tagged format.
+    """
+    log_format = "%(asctime)s - [QUERY_REWRITER]     - %(levelname)s - %(message)s"
+    stdout_handler = {
+        "formatter": "default",
+        "class": "logging.StreamHandler",
+        "stream": "ext://sys.stdout",
+    }
+    logging_config = {
+        "version": 1,
+        # Keep loggers created before uvicorn applies this configuration.
+        "disable_existing_loggers": False,
+        "formatters": {"default": {"format": log_format}},
+        "handlers": {"default": stdout_handler},
+        "root": {"level": "INFO", "handlers": ["default"]},
+    }
+
+    uvicorn.run(app, host=host, port=port, log_config=logging_config)
--- a/demos/use_cases/rag_agent/src/rag_agent/response_generator_agent.py
+++ b/demos/use_cases/rag_agent/src/rag_agent/response_generator_agent.py
@ -0,0 +1,302 @@
+import json
+from fastapi import FastAPI, Request
+from fastapi.responses import StreamingResponse
+from openai import AsyncOpenAI
+import os
+import logging
+import time
+import uuid
+import uvicorn
+import asyncio
+
+from .api import (
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+    ChatCompletionStreamResponse,
+)
+
+# Set up logging; the [RESPONSE_GENERATOR] tag identifies this agent's log lines.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - [RESPONSE_GENERATOR] - %(levelname)s - %(message)s",
+)
+logger = logging.getLogger(__name__)
+
+# Configuration for archgw LLM gateway; the endpoint can be overridden through
+# the LLM_GATEWAY_ENDPOINT environment variable.
+LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:12000/v1")
+# Model name sent to the gateway for answer generation.
+RESPONSE_MODEL = "gpt-4o"
+
+# System prompt for response generation; prepare_response_messages() puts it
+# in front of the conversation history.
+SYSTEM_PROMPT = """You are a helpful assistant that generates coherent, contextual responses.
+
+Given a conversation history, generate a helpful and relevant response based on all the context available in the messages.
+Your response should:
+1. Be contextually aware of the entire conversation
+2. Address the user's needs appropriately
+3. Be helpful and informative
+4. Maintain a natural conversational tone
+
+Generate a complete response to assist the user."""
+
+# Initialize OpenAI client for archgw
+archgw_client = AsyncOpenAI(
+    base_url=LLM_GATEWAY_ENDPOINT,
+    api_key="EMPTY",  # archgw doesn't require a real API key
+)
+
+# FastAPI app for REST server
+app = FastAPI(title="RAG Agent Response Generator", version="1.0.0")
+
+
+def prepare_response_messages(request_body: ChatCompletionRequest):
+    """Return the request's messages as plain dicts, preceded by the system prompt."""
+    conversation = [
+        {"role": entry.role, "content": entry.content}
+        for entry in request_body.messages
+    ]
+    return [{"role": "system", "content": SYSTEM_PROMPT}, *conversation]
+
+
+@app.post("/v1/chat/completions")
+async def chat_completions(request_body: ChatCompletionRequest, request: Request):
+    """Chat completions endpoint that generates a coherent response based on all context.
+
+    Returns a server-sent-event stream when ``request_body.stream`` is truthy,
+    otherwise the result of the non-streaming handler.
+    """
+    logger.info(
+        f"Received chat completion request with {len(request_body.messages)} messages"
+    )
+
+    # Read traceparent header if present
+    traceparent_header = request.headers.get("traceparent")
+    if traceparent_header:
+        logger.info(f"Received traceparent header: {traceparent_header}")
+    else:
+        logger.info("No traceparent header found")
+
+    if not request_body.stream:
+        return await non_streaming_chat_completions(request_body, traceparent_header)
+
+    # media_type now agrees with the explicit header (it used to say
+    # "text/plain"). The explicit content-type header is kept because it is
+    # what goes on the wire, so clients see the same value as before.
+    return StreamingResponse(
+        stream_chat_completions(request_body, traceparent_header),
+        media_type="text/event-stream",
+        headers={
+            "content-type": "text/event-stream",
+        },
+    )
+
+
+async def stream_chat_completions(
+    request_body: ChatCompletionRequest, traceparent_header: str = None
+):
+    """Generate streaming chat completions.
+
+    Async generator yielding server-sent-event lines (``data: <json>`` followed
+    by a blank line):
+
+    * one chunk per non-empty content delta received from the gateway;
+    * a final chunk with ``finish_reason="stop"`` whose extra ``message`` field
+      holds the complete answer as a JSON-encoded one-element message list;
+    * the ``data: [DONE]`` terminator.
+
+    If the gateway call fails, a single apology chunk and ``[DONE]`` are
+    yielded instead of raising.
+
+    Args:
+        request_body: The incoming request; supplies the messages, the model
+            name echoed in each chunk, ``temperature`` and ``max_tokens``.
+        traceparent_header: Trace header value to forward, or None.
+    """
+    # Prepare messages for response generation
+    response_messages = prepare_response_messages(request_body)
+
+    # Created up front so that every chunk of this response, including an
+    # error chunk sent after a mid-stream failure, shares one id and timestamp.
+    completion_id = f"chatcmpl-{uuid.uuid4().hex[:8]}"
+    created_time = int(time.time())
+
+    try:
+        # Call archgw using OpenAI client for streaming
+        logger.info(
+            f"Calling archgw at {LLM_GATEWAY_ENDPOINT} to generate streaming response"
+        )
+
+        # Prepare extra headers if traceparent is provided
+        extra_headers = {"x-envoy-max-retries": "3"}
+        if traceparent_header:
+            extra_headers["traceparent"] = traceparent_header
+
+        # Test against None: `temperature or 0.7` would turn an explicit 0.0
+        # into 0.7.
+        temperature = (
+            request_body.temperature if request_body.temperature is not None else 0.7
+        )
+
+        response_stream = await archgw_client.chat.completions.create(
+            model=RESPONSE_MODEL,
+            messages=response_messages,
+            temperature=temperature,
+            max_tokens=request_body.max_tokens or 1000,
+            stream=True,
+            extra_headers=extra_headers,
+        )
+
+        collected_content = []
+
+        async for chunk in response_stream:
+            # Skip chunks without choices or without text.
+            if chunk.choices and chunk.choices[0].delta.content:
+                content = chunk.choices[0].delta.content
+                collected_content.append(content)
+
+                # Create streaming response chunk
+                stream_chunk = ChatCompletionStreamResponse(
+                    id=completion_id,
+                    created=created_time,
+                    model=request_body.model,
+                    choices=[
+                        {
+                            "index": 0,
+                            "delta": {"content": content},
+                            "finish_reason": None,
+                        }
+                    ],
+                )
+
+                yield f"data: {stream_chunk.model_dump_json()}\n\n"
+
+        # Send final chunk with complete response in expected format
+        full_response = "".join(collected_content)
+        updated_history = [{"role": "assistant", "content": full_response}]
+
+        final_chunk = ChatCompletionStreamResponse(
+            id=completion_id,
+            created=created_time,
+            model=request_body.model,
+            choices=[
+                {
+                    "index": 0,
+                    "delta": {},
+                    "finish_reason": "stop",
+                    "message": {
+                        "role": "assistant",
+                        "content": json.dumps(updated_history),
+                    },
+                }
+            ],
+        )
+
+        yield f"data: {final_chunk.model_dump_json()}\n\n"
+        yield "data: [DONE]\n\n"
+
+    except Exception as e:
+        logger.error(f"Error generating streaming response: {e}")
+
+        # Send error as streaming response
+        error_chunk = ChatCompletionStreamResponse(
+            id=completion_id,
+            created=created_time,
+            model=request_body.model,
+            choices=[
+                {
+                    "index": 0,
+                    "delta": {
+                        "content": "I apologize, but I'm having trouble generating a response right now. Please try again."
+                    },
+                    "finish_reason": "stop",
+                }
+            ],
+        )
+
+        yield f"data: {error_chunk.model_dump_json()}\n\n"
+        yield "data: [DONE]\n\n"
+
+
+async def non_streaming_chat_completions(
+    request_body: ChatCompletionRequest, traceparent_header: str = None
+):
+    """Generate non-streaming chat completions."""
+    # Prepare messages for response generation
+    response_messages = prepare_response_messages(request_body)
+
+    try:
+        # Call archgw using OpenAI client
+        logger.info(f"Calling archgw at {LLM_GATEWAY_ENDPOINT} to generate response")
+
+        # Prepare extra headers if traceparent is provided
+        extra_headers = {"x-envoy-max-retries": "3"}
+        if traceparent_header:
+            extra_headers["traceparent"] = traceparent_header
+
+        response = await archgw_client.chat.completions.create(
+            model=RESPONSE_MODEL,
+            messages=response_messages,
+            temperature=request_body.temperature or 0.7,
+            max_tokens=request_body.max_tokens or 1000,
+            extra_headers=extra_headers,
+        )
+
+        generated_response = response.choices[0].message.content.strip()
+        logger.info(f"Response generated successfully")
+
+        return ChatCompletionResponse(
+            id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
+            created=int(time.time()),
+            model=request_body.model,
+            choices=[
+                {
+                    "index": 0,
+                    "message": {
+                        "role": "assistant",
+                        "content": generated_response,
+                    },
+                    "finish_reason": "stop",
+                }
+            ],
+            usage={
+                "prompt_tokens": sum(
+                    len(msg.content.split()) for msg in request_body.messages
+                ),
+                "completion_tokens": len(generated_response.split()),
+                "total_tokens": sum(
+                    len(msg.content.split()) for msg in request_body.messages
+                )
+                + len(generated_response.split()),
+            },
+        )
+
+    except Exception as e:
+        logger.error(f"Error generating response: {e}")
+
+        # Fallback response
+        fallback_message = "I apologize, but I'm having trouble generating a response right now. Please try again."
+        return ChatCompletionResponse(
+            id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
+            created=int(time.time()),
+            model=request_body.model,
+            choices=[
+                {
+                    "index": 0,
+                    "message": {"role": "assistant", "content": fallback_message},
+                    "finish_reason": "stop",
+                }
+            ],
+            usage={
+                "prompt_tokens": sum(
+                    len(msg.content.split()) for msg in request_body.messages
+                ),
+                "completion_tokens": len(fallback_message.split()),
+                "total_tokens": sum(
+                    len(msg.content.split()) for msg in request_body.messages
+                )
+                + len(fallback_message.split()),
+            },
+        )
+
+
+@app.get("/health")
+async def health_check():
+    """Health check endpoint."""
+    return {"status": "healthy"}
+
+
+def start_server(host: str = "localhost", port: int = 8000):
+    """Start the REST server."""
+    uvicorn.run(
+        app,
+        host=host,
+        port=port,
+        log_config={
+            "version": 1,
+            "disable_existing_loggers": False,
+            "formatters": {
+                "default": {
+                    "format": "%(asctime)s - [RESPONSE_GENERATOR] - %(levelname)s - %(message)s",
+                },
+            },
+            "handlers": {
+                "default": {
+                    "formatter": "default",
+                    "class": "logging.StreamHandler",
+                    "stream": "ext://sys.stdout",
+                },
+            },
+            "root": {
+                "level": "INFO",
+                "handlers": ["default"],
+            },
+        },
+    )
--- a/demos/use_cases/rag_agent/src/rag_agent/sample_knowledge_base.csv
+++ b/demos/use_cases/rag_agent/src/rag_agent/sample_knowledge_base.csv
@ -0,0 +1,257 @@
+path,content
+TechCorp_CloudServices_SLA_Agreement_2024,"SERVICE LEVEL AGREEMENT
+This Service Level Agreement (""SLA"") is entered into on March 15, 2024, between TechCorp Solutions Inc., a Delaware corporation (""Provider""), and CloudFirst Enterprises LLC (""Customer"").
+
+DEFINITIONS
+Service Availability: The percentage of time during which the cloud services are operational and accessible.
+Downtime: Any period when the services are unavailable or inaccessible to Customer.
+Response Time: The time between service request submission and initial response from Provider.
+
+SERVICE COMMITMENTS
+Provider guarantees 99.9% uptime for all cloud infrastructure services during any calendar month.
+Average response time for API calls shall not exceed 200 milliseconds under normal operating conditions.
+Customer support response times: Critical issues within 1 hour, Standard issues within 4 hours.
+
+REMEDIES
+For each full percentage point below 99.9% availability, Customer receives 10% credit on monthly fees.
+If response times exceed 500ms for more than 5 minutes in any hour, Customer receives 5% monthly credit.
+
+MONITORING AND REPORTING
+Provider will maintain real-time monitoring systems and provide monthly performance reports.
+All metrics will be measured from Provider's monitoring systems located in primary data centers.
+
+This SLA remains in effect for the duration of the underlying service agreement.
+
+Executed by:
+TechCorp Solutions Inc.
+Sarah Mitchell, VP Operations
+Date: March 15, 2024
+
+CloudFirst Enterprises LLC
+Robert Chen, CTO
+Date: March 16, 2024"
+
+DataSecure_Privacy_Policy_v3.2,"PRIVACY POLICY
+DataSecure Analytics, Inc. (""Company"") Privacy Policy
+Effective Date: January 1, 2024
+Last Updated: February 28, 2024
+
+INFORMATION COLLECTION
+We collect information you provide directly, such as account details, usage preferences, and communication records.
+Automatically collected data includes IP addresses, browser types, device information, and service interaction logs.
+Third-party integrations may provide additional user behavior and demographic information with consent.
+
+DATA USAGE
+Personal information is used to provide services, improve user experience, and communicate service updates.
+Aggregated, non-identifiable data may be used for analytics, research, and service enhancement.
+We do not sell personal information to third parties for marketing purposes.
+
+DATA PROTECTION
+All data is encrypted in transit using TLS 1.3 and at rest using AES-256 encryption.
+Access controls limit data access to authorized personnel only on a need-to-know basis.
+Regular security audits and penetration testing ensure ongoing protection measures.
+
+DATA RETENTION
+Personal data is retained for the duration of active service plus 24 months.
+Logs and analytics data are retained for 12 months unless legally required otherwise.
+Upon account deletion, personal data is permanently removed within 30 days.
+
+USER RIGHTS
+Users may request access to, correction of, or deletion of their personal information.
+Data portability requests will be fulfilled in standard formats within 30 days.
+Marketing communications can be opted out of at any time.
+
+CONTACT
+For privacy concerns, contact: privacy@datasecure.com
+Data Protection Officer: Jennifer Walsh, jwalsh@datasecure.com"
+
+GlobalManufacturing_SupplyChain_Contract_Q2_2024,"SUPPLY CHAIN AGREEMENT
+This Supply Chain Agreement is entered into between GlobalManufacturing Corp (""Buyer"") and PrecisionParts Ltd (""Supplier"") effective April 1, 2024.
+
+SCOPE OF SERVICES
+Supplier will provide automotive components including brake assemblies, suspension parts, and electrical harnesses.
+All products must meet ISO 9001 quality standards and automotive industry specifications.
+Delivery schedule: Weekly shipments every Tuesday, with 48-hour advance shipping notifications.
+
+PRICING AND PAYMENT
+Component pricing is fixed for initial 6-month term with quarterly price review thereafter.
+Payment terms: Net 45 days from invoice date via electronic transfer.
+Volume discounts apply: 5% for orders exceeding 10,000 units per month, 8% for orders exceeding 25,000 units.
+
+QUALITY REQUIREMENTS
+All components must pass incoming inspection with less than 0.1% defect rate.
+Supplier maintains quality certifications including IATF 16949 and environmental compliance.
+Batch tracking and traceability required for all delivered components.
+
+LOGISTICS AND DELIVERY
+Supplier responsible for packaging, labeling, and delivery to Buyer's distribution centers.
+Delivery windows: 8 AM - 4 PM, Monday through Friday, with advance appointment scheduling.
+Late delivery penalties: 2% of shipment value for each day beyond scheduled delivery.
+
+RISK MANAGEMENT
+Supplier maintains business continuity plans and alternative sourcing strategies.
+Force majeure events must be reported within 24 hours with mitigation plans.
+Insurance requirements: $5M general liability, $2M product liability coverage.
+
+INTELLECTUAL PROPERTY
+All custom tooling and specifications remain property of Buyer.
+Supplier grants license to use necessary patents for component manufacturing.
+
+This agreement shall remain in effect for 24 months with automatic renewal unless terminated.
+
+GlobalManufacturing Corp
+Michael Rodriguez, Supply Chain Director
+Date: April 1, 2024
+
+PrecisionParts Ltd
+Amanda Foster, VP Sales
+Date: April 2, 2024"
+
+EduTech_StudentData_Management_Policy_2024,"STUDENT DATA MANAGEMENT POLICY
+EduTech Learning Platform - Data Management and Protection Policy
+Document Version: 2.1
+Effective Date: August 15, 2024
+
+SCOPE AND PURPOSE
+This policy governs the collection, use, storage, and protection of student educational records and personal information.
+Applies to all employees, contractors, and third-party service providers accessing student data.
+Compliance with FERPA, COPPA, and state student privacy laws is mandatory.
+
+DATA CLASSIFICATION
+Educational Records: Grades, attendance, assignments, and academic progress information.
+Personal Information: Names, addresses, contact details, and demographic information.
+Behavioral Data: Learning patterns, platform usage, and engagement metrics.
+
+COLLECTION PRINCIPLES
+Data collection is limited to educational purposes and service improvement only.
+Parental consent required for students under 13 years of age.
+Students and parents have right to review and request corrections to educational records.
+
+ACCESS CONTROLS
+Role-based access ensures personnel see only data necessary for their functions.
+Multi-factor authentication required for all system access.
+Access logs maintained and reviewed monthly for unauthorized activity.
+
+DATA SHARING
+Educational records shared only with authorized school personnel and parents/students.
+No data sharing with third parties for commercial purposes without explicit consent.
+Research data must be de-identified and aggregated before external sharing.
+
+SECURITY MEASURES
+Data encrypted using industry-standard protocols during transmission and storage.
+Regular security assessments and vulnerability testing conducted quarterly.
+Incident response plan includes notification procedures for data breaches.
+
+RETENTION AND DISPOSAL
+Student records retained according to school district policies, typically 5-7 years post-graduation.
+Inactive accounts and associated data purged after 2 years of non-use.
+Secure data destruction protocols ensure complete removal of sensitive information.
+
+COMPLIANCE MONITORING
+Annual privacy training required for all staff handling student data.
+Regular audits ensure ongoing compliance with applicable privacy regulations.
+Privacy impact assessments conducted for new features or data uses.
+
+Contact: Dr. Lisa Thompson, Chief Privacy Officer
+Email: privacy@edutech-learning.com
+Phone: (555) 123-4567"
+
+FinanceFirst_Investment_Advisory_Agreement_2024,"INVESTMENT ADVISORY AGREEMENT
+This Investment Advisory Agreement is entered into between FinanceFirst Advisors LLC (""Advisor"") and Madison Investment Group (""Client"") on May 20, 2024.
+
+ADVISORY SERVICES
+Advisor will provide comprehensive investment management and financial planning services.
+Services include portfolio construction, asset allocation, risk assessment, and performance monitoring.
+Regular portfolio reviews conducted quarterly with detailed performance reporting.
+
+INVESTMENT AUTHORITY
+Client grants Advisor discretionary authority to make investment decisions within agreed parameters.
+Investment universe includes stocks, bonds, ETFs, mutual funds, and alternative investments as appropriate.
+All trades executed through qualified broker-dealers with best execution practices.
+
+FEE STRUCTURE
+Management fee: 1.25% annually on assets under management, calculated and billed quarterly.
+Performance fee: 15% of returns exceeding S&P 500 benchmark, calculated annually.
+Additional fees may apply for specialized services such as tax planning or estate planning.
+
+CLIENT RESPONSIBILITIES
+Client must provide accurate financial information and promptly communicate changes in circumstances.
+Investment objectives and risk tolerance should be reviewed and updated annually.
+Client responsible for reviewing and approving investment policy statement.
+
+RISK DISCLOSURE
+All investments carry risk of loss, and past performance does not guarantee future results.
+Diversification does not ensure profit or protect against loss in declining markets.
+Alternative investments may have limited liquidity and higher volatility.
+
+REGULATORY COMPLIANCE
+Advisor is registered with the Securities and Exchange Commission as an investment advisor.
+All activities conducted in accordance with Investment Advisers Act of 1940 and applicable regulations.
+Form ADV Part 2 brochure provided annually with material updates.
+
+CONFIDENTIALITY
+All client information treated as confidential and shared only as necessary for service provision.
+Third-party service providers bound by confidentiality agreements.
+Client data protected through secure systems and access controls.
+
+TERMINATION
+Either party may terminate agreement with 30 days written notice.
+Upon termination, Advisor will assist with orderly transfer of assets to new custodian or advisor.
+Final fee calculation prorated to date of termination.
+
+FinanceFirst Advisors LLC
+Thomas Anderson, Managing Partner
+Date: May 20, 2024
+
+Madison Investment Group
+Rebecca Martinez, Chief Investment Officer
+Date: May 21, 2024"
+
+HealthSystem_PatientCare_Standards_2024,"PATIENT CARE STANDARDS AND PROTOCOLS
+Metropolitan Health System - Clinical Care Standards
+Document ID: MHS-PCS-2024-001
+Effective Date: June 1, 2024
+
+PATIENT SAFETY PROTOCOLS
+All patients must have proper identification verification using two unique identifiers.
+Medication administration requires independent double-check for high-risk medications.
+Fall risk assessments completed within 4 hours of admission with appropriate interventions.
+
+CLINICAL DOCUMENTATION
+Medical records must be completed within 24 hours of patient encounter.
+All entries require electronic signature with timestamp and provider identification.
+Critical values and abnormal results must be communicated and documented immediately.
+
+INFECTION CONTROL
+Hand hygiene compliance monitored with target rate of 95% or higher.
+Personal protective equipment used according to transmission-based precautions.
+Isolation procedures implemented within 2 hours of identification of infectious conditions.
+
+EMERGENCY RESPONSE
+Code team response time target: 3 minutes from activation to arrival.
+Crash cart and emergency equipment checks performed daily and documented.
+All staff required to maintain current CPR and emergency response certifications.
+
+PATIENT COMMUNICATION
+Patient rights and responsibilities communicated upon admission.
+Informed consent obtained and documented prior to procedures and treatments.
+Family involvement encouraged with respect for patient privacy preferences.
+
+QUALITY MEASURES
+Patient satisfaction scores monitored monthly with target of 4.5/5.0 or higher.
+Medication error rates tracked with goal of less than 1 per 1000 patient days.
+Hospital-acquired infection rates measured and benchmarked against national standards.
+
+STAFF COMPETENCY
+Annual competency assessments required for all clinical staff.
+Continuing education requirements: 24 hours annually for nurses, 40 hours for physicians.
+Specialty certifications maintained according to department and role requirements.
+
+TECHNOLOGY STANDARDS
+Electronic health record system used for all patient documentation.
+Telemedicine capabilities available for remote consultations and monitoring.
+Clinical decision support tools integrated to assist with diagnosis and treatment decisions.
+
+Contact: Dr. Patricia Williams, Chief Medical Officer
+Email: pwilliams@metrohealthsystem.org
+Phone: (555) 987-6543"
--- a/demos/use_cases/rag_agent/start_agents.sh
+++ b/demos/use_cases/rag_agent/start_agents.sh
@ -0,0 +1,38 @@
+#!/bin/bash
+# Launches the rag_agent REST servers in the background and waits on them.
+# Abort on the first command that exits non-zero.
+set -e
+
+# PIDs of the background agent processes; read by cleanup() and the final wait loop.
+WAIT_FOR_PIDS=()
+
+# log MESSAGE...
+# Prints "<timestamp> - <message>" to stdout, where the timestamp has the form
+# "YYYY-MM-DD HH:MM:SS,mmm".
+log() {
+  # Keep these out of the global scope of the script.
+  local timestamp message
+  # python3 gives millisecond precision portably. If it is missing or fails,
+  # fall back to `date` (second precision) so that a logging problem does not
+  # abort the script under `set -e` - in particular not in the middle of cleanup.
+  timestamp=$(python3 -c 'from datetime import datetime; print(datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:23])' 2> /dev/null) \
+    || timestamp=$(date '+%Y-%m-%d %H:%M:%S')
+  message="$*"
+  echo "$timestamp - $message"
+}
+
+# EXIT-trap handler: terminates every background agent recorded in
+# WAIT_FOR_PIDS, then exits with the status the script was already exiting with.
+cleanup() {
+    # Must be the first command: anything else would overwrite $?.
+    local exit_code=$?
+    local pid
+    log "Caught signal, terminating all user processes ..."
+    for pid in "${WAIT_FOR_PIDS[@]}"; do
+        # kill fails for processes that are already gone; that is not an error here.
+        if kill "$pid" 2> /dev/null; then
+            log "killed process: $pid"
+        fi
+    done
+    # Propagate the real status instead of a hard-coded 1, so a clean shutdown
+    # reports 0 and a failed agent reports its own exit code.
+    exit "$exit_code"
+}
+
+trap cleanup EXIT
+
+log "Starting query_parser agent on port 10500..."
+uv run python -m rag_agent --rest-server --host 0.0.0.0 --rest-port 10500 --agent query_parser &
+WAIT_FOR_PIDS+=($!)
+
+log "Starting context_builder agent on port 10501..."
+uv run python -m rag_agent --rest-server --host 0.0.0.0 --rest-port 10501 --agent context_builder &
+WAIT_FOR_PIDS+=($!)
+
+log "Starting response_generator agent on port 10502..."
+uv run python -m rag_agent --rest-server --host 0.0.0.0 --rest-port 10502 --agent response_generator &
+WAIT_FOR_PIDS+=($!)
+
+for PID in "${WAIT_FOR_PIDS[@]}"; do
+    wait "$PID"
+done
--- a/demos/use_cases/rag_agent/test.rest
+++ b/demos/use_cases/rag_agent/test.rest
@ -0,0 +1,95 @@
+# Address of the response_generator agent (port 10502 in start_agents.sh).
+# Use localhost rather than 0.0.0.0: 0.0.0.0 is a bind address, is not a valid
+# destination on every platform, and the other requests in this file already
+# use localhost.
+@baseUrl = http://localhost:10502
+@model = gpt-4o
+
+# Health Check
+GET {{baseUrl}}/health
+
+###
+
+# Test 1: Simple Non-Streaming Chat Completion
+# Sent directly to the agent behind @baseUrl (port 10502, the response_generator
+# started by start_agents.sh). The body does not set a "stream" field.
+POST {{baseUrl}}/v1/chat/completions
+Content-Type: application/json
+
+{
+  "model": "{{model}}",
+  "messages": [
+    {
+      "role": "user",
+      "content": "Hello! Can you help me understand what machine learning is?"
+    }
+  ]
+}
+
+###
+
+# Test 2: Simple Streaming Chat Completion
+# Same endpoint as Test 1, but with "stream": true in the body.
+POST {{baseUrl}}/v1/chat/completions
+Content-Type: application/json
+
+{
+  "model": "{{model}}",
+  "messages": [
+    {
+      "role": "user",
+      "content": "Explain the concept of artificial intelligence in simple terms."
+    }
+  ],
+  "stream": true
+}
+
+### Test 3
+# Goes through the archgw agent listener on port 8001 (see arch_config.yaml)
+# instead of calling an agent directly. The question is answerable from the
+# TechCorp SLA entry in sample_knowledge_base.csv.
+POST http://localhost:8001/v1/chat/completions
+Content-Type: application/json
+
+{
+  "model": "{{model}}",
+  "messages": [
+    {
+      "role": "user",
+      "content": "What is the guaranteed uptime percentage for TechCorp's cloud services?"
+    }
+  ],
+  "stream": false
+}
+
+### send request to context builder agent
+# Calls the context_builder agent directly on port 10501 (the port it is given
+# in start_agents.sh), without going through archgw.
+POST http://localhost:10501/v1/chat/completions
+Content-Type: application/json
+
+{
+  "model": "gpt-4o-mini",
+  "messages": [
+    {
+      "role": "user",
+      "content": "What is the guaranteed uptime percentage for TechCorp's cloud services?"
+    }
+  ]
+}
+
+### test fast-llm
+# Calls the archgw LLM gateway on port 12000 using the "fast-llm" model alias,
+# which arch_config.yaml maps to gpt-4o-mini.
+POST http://localhost:12000/v1/chat/completions
+Content-Type: application/json
+
+{
+  "model": "fast-llm",
+  "messages": [
+    {
+      "role": "user",
+      "content": "hello"
+    }
+  ]
+}
+
+### test smart-llm
+# Calls the archgw LLM gateway on port 12000 using the "smart-llm" model alias,
+# which arch_config.yaml maps to gpt-4o.
+POST http://localhost:12000/v1/chat/completions
+Content-Type: application/json
+
+{
+  "model": "smart-llm",
+  "messages": [
+    {
+      "role": "user",
+      "content": "hello"
+    }
+  ]
+}
--- a/demos/use_cases/rag_agent/uv.lock
+++ b/demos/use_cases/rag_agent/uv.lock