delete rag agent demo

2026-06-17 15:25:17 +02:00 · 2025-10-14 12:51:45 -07:00 · 2025-10-14 12:51:45 -07:00 · eed776fc94
commit eed776fc94
parent d3595aeee1
15 changed files with 0 additions and 2765 deletions
--- a/demos/use_cases/rag_agent/README.md
+++ b/demos/use_cases/rag_agent/README.md
@ -1,28 +0,0 @@
-# RAG Agent Query Parser
-
-A FastAPI service that rewrites user queries using archgw and gpt-4o-mini for better retrieval accuracy.
-
-## How it Works
-
-1. Receives a chat completion request with conversation history
-2. Calls archgw's LLM gateway with gpt-4o-mini to rewrite the last user query
-3. Returns the conversation history, with the last user query replaced by the rewritten one, JSON-encoded in the response message
-
-## Setup and Running
-
-1. **Start archgw**:
-   ```bash
-   archgw up --foreground
-   ```
-
-2. **Start the query parser service**:
-   ```bash
-   uv run python -m rag_agent.query_parser
-   ```
-
-## Configuration
-
-```bash
-# archgw LLM Gateway base URL (default: http://localhost:12000/v1)
-export LLM_GATEWAY_ENDPOINT="http://localhost:12000/v1"
-```
--- a/demos/use_cases/rag_agent/arch_config.yaml
+++ b/demos/use_cases/rag_agent/arch_config.yaml
@ -1,43 +0,0 @@
-version: v0.3.0
-
-agents:
-  - id: query_rewriter
-    url: http://host.docker.internal:10500/v1/chat/completions
-  - id: context_builder
-    url: http://host.docker.internal:10501/v1/chat/completions
-  - id: rag_agent
-    url: http://host.docker.internal:10502/v1/chat/completions
-  - id: research_agent
-    url: http://host.docker.internal:10503/v1/chat/completions
-  - id: weather_forecast_agent
-    url: http://host.docker.internal:10504/process
-
-model_providers:
-  - model: openai/gpt-4o-mini
-    access_key: $OPENAI_API_KEY
-    default: true
-  - model: openai/gpt-4o
-    access_key: $OPENAI_API_KEY
-  - model: ollama/llama3.1
-    base_url: http://host.docker.internal:11434
-
-model_aliases:
-  fast-llm:
-    target: gpt-4o-mini
-  smart-llm:
-    target: gpt-4o
-
-listeners:
-  - type: agent
-    name: agent_1
-    port: 8001
-    router: arch_agent_router
-    agents:
-      - id: rag_agent
-        description: virtual assistant for device contracts for simple queries
-        filter_chain:
-          - query_rewriter
-          - context_builder
-
-tracing:
-  random_sampling: 100
--- a/demos/use_cases/rag_agent/docker-compose.yaml
+++ b/demos/use_cases/rag_agent/docker-compose.yaml
@ -1,17 +0,0 @@
-services:
-  jaeger:
-    build:
-      context: ../../shared/jaeger
-    ports:
-      - "16686:16686"
-      - "4317:4317"
-      - "4318:4318"
-  open-web-ui:
-    image: dyrnq/open-webui:main
-    restart: always
-    ports:
-      - "8080:8080"
-    environment:
-      - DEFAULT_MODEL=gpt-4o-mini
-      - ENABLE_OPENAI_API=true
-      - OPENAI_API_BASE_URL=http://host.docker.internal:8001/v1
--- a/demos/use_cases/rag_agent/pyproject.toml
+++ b/demos/use_cases/rag_agent/pyproject.toml
@ -1,22 +0,0 @@
-[project]
-name = "rag_agent"
-version = "0.1.0"
-description = "RAG Agent"
-readme = "README.md"
-requires-python = ">=3.10"
-dependencies = [
-    "click>=8.2.1",
-    "mcp>=1.13.1",
-    "fastmcp>=2.12.2",
-    "pydantic>=2.11.7",
-    "fastapi>=0.104.1",
-    "uvicorn>=0.24.0",
-    "openai>=1.0.0",
-]
-
-[project.scripts]
-rag_agent = "rag_agent:main"
-
-[build-system]
-requires = ["hatchling"]
-build-backend = "hatchling.build"
--- a/demos/use_cases/rag_agent/sample_queries.md
+++ b/demos/use_cases/rag_agent/sample_queries.md
@ -1,64 +0,0 @@
-# Sample Queries for Knowledge Base RAG Agent
-
-## Service Level Agreement Queries
- What is the guaranteed uptime percentage for TechCorp's cloud services?
- What remedies are available if the API response time exceeds the agreed threshold?
- How quickly must TechCorp respond to critical support issues?
- What monitoring and reporting requirements are specified in the SLA?
- When was the TechCorp service agreement signed and by whom?
-
-## Privacy Policy Queries
- What encryption methods does DataSecure use to protect data?
- How long does DataSecure retain personal data after account deletion?
- What rights do users have regarding their personal information?
- Can DataSecure sell user data to third parties for marketing?
- Who should be contacted for privacy-related concerns at DataSecure?
-
-## Supply Chain Agreement Queries
- What types of automotive components does PrecisionParts supply?
- What are the payment terms and volume discount structure?
- What quality standards must the supplied components meet?
- What are the penalties for late delivery?
- What insurance coverage requirements apply to the supplier?
-
-## Student Data Management Queries
- What federal laws must EduTech comply with regarding student data?
- What security measures are in place to protect student information?
- How long are student records retained after graduation?
- What consent is required for students under 13 years old?
- Who can access student educational records?
-
-## Investment Advisory Queries
- What is FinanceFirst's management fee structure?
- What types of investments are included in the advisory services?
- What regulatory body oversees FinanceFirst Advisors?
- How often are portfolio reviews conducted?
- What are the client's responsibilities under this agreement?
-
-## Healthcare Standards Queries
- What is the target response time for emergency code teams?
- What hand hygiene compliance rate is required?
- How quickly must medical records be completed after patient encounters?
- What continuing education requirements apply to nursing staff?
- What patient safety protocols are mandatory upon admission?
-
-## Cross-Document Queries
- Which agreements include confidentiality or data protection provisions?
- What are the common termination notice periods across different contract types?
- Which documents specify insurance or liability coverage requirements?
- What compliance and regulatory requirements are mentioned across agreements?
- Which contracts include performance metrics or service level commitments?
-
-## Complex Analysis Queries
- Compare the data retention policies across the privacy policy and student data management documents.
- What are the different approaches to risk management across the supply chain and investment advisory agreements?
- How do the security measures in the healthcare standards compare to those in the privacy policy?
- Which agreements provide the most detailed compliance and regulatory frameworks?
- What common themes exist in the quality assurance requirements across different industries?
-
-## Document-Specific Detail Queries
- List all the specific percentages, timeframes, and numerical requirements mentioned in the SLA.
- What are all the contact persons and their roles mentioned across the documents?
- Identify all the compliance standards and certifications referenced in the supply chain agreement.
- What are the specific consequences or penalties mentioned for non-compliance across agreements?
- List all the third-party systems, tools, or services mentioned in the documents.
--- a/demos/use_cases/rag_agent/src/rag_agent/init.py
+++ b/demos/use_cases/rag_agent/src/rag_agent/init.py
@ -1,63 +0,0 @@
-import click
-from mcp.server.fastmcp import FastMCP
-
-# Module-level handle to the FastMCP server. It stays None until main() takes
-# the MCP (non-REST) path and assigns a FastMCP instance to it.
-mcp = None
-
-
-@click.command()
-@click.option("--transport", "transport", default="stdio")
-@click.option("--host", "host", default="localhost")
-@click.option("--port", "port", default=10101)
-@click.option("--agent", "agent", default=None)
-@click.option(
-    "--rest-server",
-    "rest_server",
-    is_flag=True,
-    help="Start REST server instead of MCP server",
-)
-@click.option("--rest-port", "rest_port", default=8000, help="Port for REST server")
-def main(host, port, agent, transport, rest_server, rest_port):
-    if rest_server:
-        print(f"Starting REST server on {host}:{rest_port} for agent: {agent}")
-
-        if agent == "query_parser":
-            from rag_agent.query_rewriter_agent import start_server
-
-            start_server(host=host, port=rest_port)
-            return
-        elif agent == "context_builder":
-            from rag_agent.context_builder_agent import (
-                start_server,
-            )
-
-            start_server(host=host, port=rest_port)
-            return
-        elif agent == "response_generator":
-            from rag_agent.response_generator_agent import start_server
-
-            start_server(host=host, port=rest_port)
-            return
-        else:
-            print("Please specify an agent to start with --agent option.")
-            return
-
-    print(f"Starting agent(s): {agent if agent else 'all'}")
-    global mcp
-    mcp = FastMCP("RAG Agent Demo", host=host, port=port)
-
-    if agent == "query_parser":
-        import rag_agent.query_parser
-    elif agent == "document_store":
-        import rag_agent.document_store
-    elif agent == "response_generator":
-        import rag_agent.response_generator
-    else:
-        import rag_agent.query_parser
-        import rag_agent.document_store
-        import rag_agent.response_generator
-    print("All agents loaded.")
-    mcp.run(transport=transport)
-
-
-if __name__ == "__main__":
-    main()
--- a/demos/use_cases/rag_agent/src/rag_agent/main.py
+++ b/demos/use_cases/rag_agent/src/rag_agent/main.py
@ -1,4 +0,0 @@
-# Entry-point shim: pulls `main` from the enclosing package and invokes it
-# when this file is executed as a script.
-from . import main
-
-if __name__ == "__main__":
-    main()
--- a/demos/use_cases/rag_agent/src/rag_agent/api.py
+++ b/demos/use_cases/rag_agent/src/rag_agent/api.py
@ -1,36 +0,0 @@
-from pydantic import BaseModel
-from typing import List, Optional, Dict, Any
-
-
-class ChatMessage(BaseModel):
-    # One turn of a conversation.
-    role: str  # who produced the turn, as a plain string
-    content: str  # text of the turn
-
-
-class ChatCompletionRequest(BaseModel):
-    # Request body for a chat completion call.
-    # NOTE(review): the field names look like the OpenAI chat-completions
-    # request shape - confirm before relying on full compatibility.
-    model: str  # required model name
-    messages: List[ChatMessage]  # required conversation history
-    temperature: Optional[float] = 1.0
-    max_tokens: Optional[int] = None  # None means the caller gave no limit
-    top_p: Optional[float] = 1.0
-    frequency_penalty: Optional[float] = 0.0
-    presence_penalty: Optional[float] = 0.0
-    stream: Optional[bool] = False  # non-streaming unless explicitly requested
-    stop: Optional[List[str]] = None
-
-
-class ChatCompletionResponse(BaseModel):
-    # Body of a complete (non-streamed) chat completion reply.
-    id: str
-    object: str = "chat.completion"  # fixed type tag unless overridden
-    created: int  # NOTE(review): presumably a Unix timestamp in seconds - confirm
-    model: str
-    choices: List[Dict[str, Any]]  # free-form choice dicts, not validated further
-    usage: Dict[str, int]  # integer counters keyed by name
-
-
-class ChatCompletionStreamResponse(BaseModel):
-    # One chunk of a streamed chat completion reply; unlike the non-streamed
-    # response model it carries no usage field.
-    id: str
-    object: str = "chat.completion.chunk"  # fixed type tag unless overridden
-    created: int  # NOTE(review): presumably a Unix timestamp in seconds - confirm
-    model: str
-    choices: List[Dict[str, Any]]  # free-form choice dicts, not validated further
--- a/demos/use_cases/rag_agent/src/rag_agent/context_builder_agent.py
+++ b/demos/use_cases/rag_agent/src/rag_agent/context_builder_agent.py
@ -1,280 +0,0 @@
-import json
-from pydantic import BaseModel
-from typing import List, Optional, Dict, Any
-from fastapi import FastAPI, HTTPException, Request
-from openai import AsyncOpenAI
-import os
-import logging
-import csv
-from pathlib import Path
-import uvicorn
-
-from .api import ChatMessage, ChatCompletionRequest, ChatCompletionResponse
-
-
-# Set up logging: INFO level, every line tagged with [CONTEXT_BUILDER]
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s - [CONTEXT_BUILDER]    - %(levelname)s - %(message)s",
-)
-logger = logging.getLogger(__name__)
-
-
-# Configuration for archgw LLM gateway; the endpoint can be overridden through
-# the LLM_GATEWAY_ENDPOINT environment variable.
-LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:12000/v1")
-# Model name sent with every passage-selection request
-RAG_MODEL = "gpt-4o-mini"
-
-# Initialize OpenAI client for archgw (all requests go to LLM_GATEWAY_ENDPOINT)
-archgw_client = AsyncOpenAI(
-    base_url=LLM_GATEWAY_ENDPOINT,
-    api_key="EMPTY",  # archgw doesn't require a real API key
-)
-
-# Global variable to store the knowledge base: a list of
-# {"path": ..., "content": ...} dicts filled in by load_knowledge_base()
-knowledge_base = []
-
-
-def load_knowledge_base():
-    """Load the sample_knowledge_base.csv file into memory on startup."""
-    global knowledge_base
-
-    # Get the path to the CSV file relative to this script
-    current_dir = Path(__file__).parent
-    csv_path = current_dir / "sample_knowledge_base.csv"
-
-    print(f"Loading knowledge base from {csv_path}")
-
-    try:
-        knowledge_base = []
-        with open(csv_path, "r", encoding="utf-8-sig") as file:
-            csv_reader = csv.DictReader(file)
-            for row in csv_reader:
-                knowledge_base.append({"path": row["path"], "content": row["content"]})
-
-        logger.info(f"Loaded {len(knowledge_base)} documents from knowledge base")
-
-    except Exception as e:
-        logger.error(f"Error loading knowledge base: {e}")
-        knowledge_base = []
-
-
-async def find_relevant_passages(
-    query: str, traceparent: Optional[str] = None, top_k: int = 3
-) -> List[Dict[str, str]]:
-    """Use the LLM to find the most relevant passages from the knowledge base."""
-
-    if not knowledge_base:
-        logger.warning("Knowledge base is empty")
-        return []
-
-    # Create a system prompt for passage selection
-    system_prompt = f"""You are a retrieval assistant that selects the most relevant document passages for a given query.
-
-                    Given a user query and a list of document passages, identify the {top_k} most relevant passages that would help answer the query.
-
-                    Query: {query}
-
-                    Available passages:
-                    """
-
-    # Add all passages with indices
-    for i, doc in enumerate(knowledge_base):
-        system_prompt += (
-            f"\n[{i}] Path: {doc['path']}\nContent: {doc['content'][:500]}...\n"
-        )
-
-    system_prompt += f"""
-
-        Please respond with ONLY the indices of the {top_k} most relevant passages, separated by commas (e.g., "0,3,7").
-        If fewer than {top_k} passages are relevant, return only the relevant ones.
-        If no passages are relevant, return "NONE"."""
-
-    try:
-        # Call archgw to select relevant passages
-        logger.info(f"Calling archgw to find relevant passages for query: '{query}'")
-
-        # Prepare extra headers if traceparent is provided
-        extra_headers = {"x-envoy-max-retries": "3"}
-        if traceparent:
-            extra_headers["traceparent"] = traceparent
-
-        response = await archgw_client.chat.completions.create(
-            model=RAG_MODEL,
-            messages=[{"role": "system", "content": system_prompt}],
-            temperature=0.1,
-            max_tokens=50,
-            extra_headers=extra_headers,
-        )
-
-        result = response.choices[0].message.content.strip()
-        logger.info(f"LLM selected passages: {result}")
-
-        # Parse the indices
-        if result.upper() == "NONE":
-            return []
-
-        selected_passages = []
-        indices = [
-            int(idx.strip()) for idx in result.split(",") if idx.strip().isdigit()
-        ]
-
-        for idx in indices:
-            if 0 <= idx < len(knowledge_base):
-                selected_passages.append(knowledge_base[idx])
-
-        logger.info(f"Selected {len(selected_passages)} relevant passages")
-        return selected_passages
-
-    except Exception as e:
-        logger.error(f"Error finding relevant passages: {e}")
-        return []
-
-
-async def augment_query_with_context(
-    messages: List[ChatMessage], traceparent: Optional[str] = None
-) -> List[ChatMessage]:
-    """Extract user query, find relevant context, and augment the messages."""
-
-    # Find the last user message
-    last_user_message = None
-    last_user_index = -1
-
-    for i in range(len(messages) - 1, -1, -1):
-        if messages[i].role == "user":
-            last_user_message = messages[i].content
-            last_user_index = i
-            break
-
-    if not last_user_message:
-        logger.warning("No user message found in conversation")
-        return messages
-
-    logger.info(f"Processing user query: '{last_user_message}'")
-
-    # Find relevant passages
-    relevant_passages = await find_relevant_passages(last_user_message, traceparent)
-
-    if not relevant_passages:
-        logger.info("No relevant passages found, returning original messages")
-        return messages
-
-    # Build context from relevant passages
-    context_parts = []
-    for i, passage in enumerate(relevant_passages):
-        context_parts.append(
-            f"Document {i+1} ({passage['path']}):\n{passage['content']}"
-        )
-
-    context = "\n\n".join(context_parts)
-
-    # Create augmented content with original query and context
-    augmented_content = f"""{last_user_message} RELEVANT CONTEXT:
-    {context}"""
-
-    # Create updated messages with the augmented query
-    updated_messages = messages.copy()
-    updated_messages[last_user_index] = ChatMessage(
-        role="user", content=augmented_content
-    )
-
-    logger.info(f"Augmented user query with {len(relevant_passages)} relevant passages")
-
-    return updated_messages
-
-
-class Response(BaseModel):
-    # Query text plus a free-form metadata dict.
-    # NOTE(review): nothing else in this module references this model.
-    query: str
-    metadata: dict
-
-
-# FastAPI app for REST server; the route handlers below are registered on it
-# and main()/start_server() hand it to uvicorn.
-app = FastAPI(title="RAG Content Builder Agent", version="1.0.0")
-
-
-@app.post("/v1/chat/completions")
-async def chat_completions(
-    request_body: ChatCompletionRequest, request: Request
-) -> ChatCompletionResponse:
-    """Chat completions endpoint that augments user queries with relevant context from the knowledge base."""
-    import time
-    import uuid
-
-    logger.info(
-        f"Received chat completion request with {len(request_body.messages)} messages"
-    )
-
-    # Read traceparent header if present
-    traceparent_header = request.headers.get("traceparent")
-    if traceparent_header:
-        logger.info(f"Received traceparent header: {traceparent_header}")
-    else:
-        logger.info("No traceparent header found")
-
-    # Augment the user query with relevant context
-    updated_messages = await augment_query_with_context(
-        request_body.messages, traceparent_header
-    )
-    messages_history_json = json.dumps([msg.dict() for msg in updated_messages])
-
-    response = ChatCompletionResponse(
-        id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
-        created=int(time.time()),
-        model=request_body.model,
-        choices=[
-            {
-                "index": 0,
-                "message": {"role": "user", "content": messages_history_json},
-                "finish_reason": "stop",
-            }
-        ],
-        usage={
-            "prompt_tokens": sum(len(msg.content.split()) for msg in updated_messages),
-            "completion_tokens": len("Context added to user query.".split()),
-            "total_tokens": sum(len(msg.content.split()) for msg in updated_messages)
-            + len("Context added to user query.".split()),
-        },
-    )
-
-    return response
-
-
-def main():
-    """Main function to initialize the knowledge base and start the server."""
-    load_knowledge_base()
-
-    uvicorn.run(app, host="0.0.0.0", port=8000)
-
-
-if __name__ == "__main__":
-    main()
-
-
-def start_server(host: str = "localhost", port: int = 8000):
-    """Start the REST server."""
-    load_knowledge_base()
-    # Rename the uvicorn.error logger
-    uvicorn.run(
-        app,
-        host=host,
-        port=port,
-        log_config={
-            "version": 1,
-            "disable_existing_loggers": False,
-            "formatters": {
-                "default": {
-                    "format": "%(asctime)s - [CONTEXT_BUILDER]    - %(levelname)s - %(message)s",
-                },
-            },
-            "handlers": {
-                "default": {
-                    "formatter": "default",
-                    "class": "logging.StreamHandler",
-                    "stream": "ext://sys.stdout",
-                },
-            },
-            "root": {
-                "level": "INFO",
-                "handlers": ["default"],
-            },
-        },
-    )
--- a/demos/use_cases/rag_agent/src/rag_agent/query_rewriter_agent.py
+++ b/demos/use_cases/rag_agent/src/rag_agent/query_rewriter_agent.py
@ -1,188 +0,0 @@
-import json
-from pydantic import BaseModel
-from typing import List, Optional, Dict, Any
-from fastapi import FastAPI, HTTPException, Request
-from openai import AsyncOpenAI
-import os
-import logging
-import uvicorn
-
-from .api import ChatMessage, ChatCompletionRequest, ChatCompletionResponse
-
-
-# Set up logging: INFO level, every line tagged with [QUERY_REWRITER]
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s - [QUERY_REWRITER]     - %(levelname)s - %(message)s",
-)
-logger = logging.getLogger(__name__)
-
-
-# Configuration for archgw LLM gateway; the endpoint can be overridden through
-# the LLM_GATEWAY_ENDPOINT environment variable.
-LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:12000/v1")
-# Model name sent with every rewrite request
-QUERY_REWRITE_MODEL = "gpt-4o-mini"
-
-# Initialize OpenAI client for archgw (all requests go to LLM_GATEWAY_ENDPOINT)
-archgw_client = AsyncOpenAI(
-    base_url=LLM_GATEWAY_ENDPOINT,
-    api_key="EMPTY",  # archgw doesn't require a real API key
-)
-
-
-async def rewrite_query_with_archgw(
-    messages: List[ChatMessage], traceparent_header: str
-) -> str:
-    # Prepare the system prompt for query rewriting
-    system_prompt = """You are a query rewriter that improves user queries for better retrieval.
-
-    Given a conversation history, rewrite the last user message to be more specific and context-aware.
-    The rewritten query should:
-    1. Include relevant context from previous messages
-    2. Be clear and specific for information retrieval
-    3. Maintain the user's intent
-    4. Be concise but comprehensive
-
-    Return only the rewritten query, nothing else."""
-
-    # Prepare messages for the query rewriter - just add system prompt to existing messages
-    rewrite_messages = [{"role": "system", "content": system_prompt}]
-
-    # Add conversation history
-    for msg in messages:
-        rewrite_messages.append({"role": msg.role, "content": msg.content})
-
-    try:
-        # Call archgw using OpenAI client
-        extra_headers = {"x-envoy-max-retries": "3"}
-        if traceparent_header:
-            extra_headers["traceparent"] = traceparent_header
-        logger.info(f"Calling archgw at {LLM_GATEWAY_ENDPOINT} to rewrite query")
-        response = await archgw_client.chat.completions.create(
-            model=QUERY_REWRITE_MODEL,
-            messages=rewrite_messages,
-            temperature=0.3,
-            max_tokens=200,
-            extra_headers=extra_headers,
-        )
-
-        rewritten_query = response.choices[0].message.content.strip()
-        logger.info(f"Query rewritten successfully: '{rewritten_query}'")
-        return rewritten_query
-
-    except Exception as e:
-        logger.error(f"Error rewriting query: {e}")
-
-    # If rewriting fails, return the original last user message
-    logger.info("Falling back to original user message")
-    for message in reversed(messages):
-        if message.role == "user":
-            return message.content
-    return ""
-
-
-class Response(BaseModel):
-    # Result type of parse_query(): the query text plus a free-form metadata dict.
-    query: str
-    metadata: dict
-
-
-# FastAPI app for REST server; the route handlers below are registered on it
-# and start_server() hands it to uvicorn.
-app = FastAPI(title="RAG Agent Query Parser", version="1.0.0")
-
-
-@app.post("/v1/chat/completions")
-async def chat_completions(request_body: ChatCompletionRequest, request: Request):
-    """Chat completions endpoint that rewrites the last user query using archgw."""
-    import time
-    import uuid
-
-    logger.info(
-        f"Received chat completion request with {len(request_body.messages)} messages"
-    )
-
-    # Read traceparent header if present
-    traceparent_header = request.headers.get("traceparent")
-    if traceparent_header:
-        logger.info(f"Received traceparent header: {traceparent_header}")
-    else:
-        logger.info("No traceparent header found")
-
-    # Call archgw to rewrite the last user query
-    rewritten_query = await rewrite_query_with_archgw(
-        request_body.messages, traceparent_header
-    )
-
-    # Create updated messages with the rewritten query
-    updated_messages = request_body.messages.copy()
-
-    # Find and update the last user message with the rewritten query
-    for i in range(len(updated_messages) - 1, -1, -1):
-        if updated_messages[i].role == "user":
-            original_query = updated_messages[i].content
-            updated_messages[i] = ChatMessage(role="user", content=rewritten_query)
-            logger.info(
-                f"Updated user query from '{original_query}' to '{rewritten_query}'"
-            )
-            break
-
-    messages_history_json = json.dumps([msg.dict() for msg in updated_messages])
-
-    response = ChatCompletionResponse(
-        id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
-        created=int(time.time()),
-        model=request_body.model,
-        choices=[
-            {
-                "index": 0,
-                "message": {"role": "user", "content": messages_history_json},
-                "finish_reason": "stop",
-            }
-        ],
-        usage={
-            "prompt_tokens": sum(len(msg.content.split()) for msg in updated_messages),
-            "completion_tokens": len("Updated query for better retrieval.".split()),
-            "total_tokens": sum(len(msg.content.split()) for msg in updated_messages)
-            + len("Updated query for better retrieval.".split()),
-        },
-    )
-
-    return response
-
-
-@app.get("/health")
-async def health_check():
-    """Health check endpoint."""
-    return {"status": "healthy"}
-
-
-def parse_query(query):
-    """Parse the user query and returns metadata extracted from query."""
-    return Response(query=query, metadata={"is_valid": True})
-
-
-def start_server(host: str = "localhost", port: int = 8000):
-    """Start the REST server."""
-    uvicorn.run(
-        app,
-        host=host,
-        port=port,
-        log_config={
-            "version": 1,
-            "disable_existing_loggers": False,
-            "formatters": {
-                "default": {
-                    "format": "%(asctime)s - [QUERY_REWRITER]     - %(levelname)s - %(message)s",
-                },
-            },
-            "handlers": {
-                "default": {
-                    "formatter": "default",
-                    "class": "logging.StreamHandler",
-                    "stream": "ext://sys.stdout",
-                },
-            },
-            "root": {
-                "level": "INFO",
-                "handlers": ["default"],
-            },
-        },
-    )
--- a/demos/use_cases/rag_agent/src/rag_agent/response_generator_agent.py
+++ b/demos/use_cases/rag_agent/src/rag_agent/response_generator_agent.py
@ -1,302 +0,0 @@
-import json
-from fastapi import FastAPI, Request
-from fastapi.responses import StreamingResponse
-from openai import AsyncOpenAI
-import os
-import logging
-import time
-import uuid
-import uvicorn
-import asyncio
-
-from .api import (
-    ChatCompletionRequest,
-    ChatCompletionResponse,
-    ChatCompletionStreamResponse,
-)
-
-# Set up logging: INFO level, every line tagged with [RESPONSE_GENERATOR]
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s - [RESPONSE_GENERATOR] - %(levelname)s - %(message)s",
-)
-logger = logging.getLogger(__name__)
-
-# Configuration for archgw LLM gateway; the endpoint can be overridden through
-# the LLM_GATEWAY_ENDPOINT environment variable.
-LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:12000/v1")
-# Model name sent with every response-generation request
-RESPONSE_MODEL = "gpt-4o"
-
-# System prompt for response generation; prepare_response_messages() puts it
-# in front of the conversation history.
-SYSTEM_PROMPT = """You are a helpful assistant that generates coherent, contextual responses.
-
-Given a conversation history, generate a helpful and relevant response based on all the context available in the messages.
-Your response should:
-1. Be contextually aware of the entire conversation
-2. Address the user's needs appropriately
-3. Be helpful and informative
-4. Maintain a natural conversational tone
-
-Generate a complete response to assist the user."""
-
-# Initialize OpenAI client for archgw (all requests go to LLM_GATEWAY_ENDPOINT)
-archgw_client = AsyncOpenAI(
-    base_url=LLM_GATEWAY_ENDPOINT,
-    api_key="EMPTY",  # archgw doesn't require a real API key
-)
-
-# FastAPI app for REST server; the route handlers below are registered on it
-app = FastAPI(title="RAG Agent Response Generator", version="1.0.0")
-
-
-def prepare_response_messages(request_body: ChatCompletionRequest):
-    """Prepare messages for response generation by adding system prompt."""
-    response_messages = [{"role": "system", "content": SYSTEM_PROMPT}]
-
-    # Add conversation history
-    for msg in request_body.messages:
-        response_messages.append({"role": msg.role, "content": msg.content})
-
-    return response_messages
-
-
-@app.post("/v1/chat/completions")
-async def chat_completions(request_body: ChatCompletionRequest, request: Request):
-    """Chat completions endpoint that generates a coherent response based on all context."""
-    logger.info(
-        f"Received chat completion request with {len(request_body.messages)} messages"
-    )
-
-    # Read traceparent header if present
-    traceparent_header = request.headers.get("traceparent")
-    if traceparent_header:
-        logger.info(f"Received traceparent header: {traceparent_header}")
-    else:
-        logger.info("No traceparent header found")
-
-    # Check if streaming is requested
-    if request_body.stream:
-        return StreamingResponse(
-            stream_chat_completions(request_body, traceparent_header),
-            media_type="text/plain",
-            headers={
-                "content-type": "text/event-stream",
-            },
-        )
-    else:
-        return await non_streaming_chat_completions(request_body, traceparent_header)
-
-
-async def stream_chat_completions(
-    request_body: ChatCompletionRequest, traceparent_header: str = None
-):
-    """Generate streaming chat completions."""
-    # Prepare messages for response generation
-    response_messages = prepare_response_messages(request_body)
-
-    try:
-        # Call archgw using OpenAI client for streaming
-        logger.info(
-            f"Calling archgw at {LLM_GATEWAY_ENDPOINT} to generate streaming response"
-        )
-
-        # Prepare extra headers if traceparent is provided
-        extra_headers = {"x-envoy-max-retries": "3"}
-        if traceparent_header:
-            extra_headers["traceparent"] = traceparent_header
-
-        response_stream = await archgw_client.chat.completions.create(
-            model=RESPONSE_MODEL,
-            messages=response_messages,
-            temperature=request_body.temperature or 0.7,
-            max_tokens=request_body.max_tokens or 1000,
-            stream=True,
-            extra_headers=extra_headers,
-        )
-
-        completion_id = f"chatcmpl-{uuid.uuid4().hex[:8]}"
-        created_time = int(time.time())
-        collected_content = []
-
-        async for chunk in response_stream:
-            if chunk.choices and chunk.choices[0].delta.content:
-                content = chunk.choices[0].delta.content
-                collected_content.append(content)
-
-                # Create streaming response chunk
-                stream_chunk = ChatCompletionStreamResponse(
-                    id=completion_id,
-                    created=created_time,
-                    model=request_body.model,
-                    choices=[
-                        {
-                            "index": 0,
-                            "delta": {"content": content},
-                            "finish_reason": None,
-                        }
-                    ],
-                )
-
-                yield f"data: {stream_chunk.model_dump_json()}\n\n"
-
-        # Send final chunk with complete response in expected format
-        full_response = "".join(collected_content)
-        updated_history = [{"role": "assistant", "content": full_response}]
-
-        final_chunk = ChatCompletionStreamResponse(
-            id=completion_id,
-            created=created_time,
-            model=request_body.model,
-            choices=[
-                {
-                    "index": 0,
-                    "delta": {},
-                    "finish_reason": "stop",
-                    "message": {
-                        "role": "assistant",
-                        "content": json.dumps(updated_history),
-                    },
-                }
-            ],
-        )
-
-        yield f"data: {final_chunk.model_dump_json()}\n\n"
-        yield "data: [DONE]\n\n"
-
-    except Exception as e:
-        logger.error(f"Error generating streaming response: {e}")
-
-        # Send error as streaming response
-        error_chunk = ChatCompletionStreamResponse(
-            id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
-            created=int(time.time()),
-            model=request_body.model,
-            choices=[
-                {
-                    "index": 0,
-                    "delta": {
-                        "content": "I apologize, but I'm having trouble generating a response right now. Please try again."
-                    },
-                    "finish_reason": "stop",
-                }
-            ],
-        )
-
-        yield f"data: {error_chunk.model_dump_json()}\n\n"
-        yield "data: [DONE]\n\n"
-
-
-async def non_streaming_chat_completions(
-    request_body: ChatCompletionRequest, traceparent_header: str = None
-):
-    """Generate non-streaming chat completions."""
-    # Prepare messages for response generation
-    response_messages = prepare_response_messages(request_body)
-
-    try:
-        # Call archgw using OpenAI client
-        logger.info(f"Calling archgw at {LLM_GATEWAY_ENDPOINT} to generate response")
-
-        # Prepare extra headers if traceparent is provided
-        extra_headers = {"x-envoy-max-retries": "3"}
-        if traceparent_header:
-            extra_headers["traceparent"] = traceparent_header
-
-        response = await archgw_client.chat.completions.create(
-            model=RESPONSE_MODEL,
-            messages=response_messages,
-            temperature=request_body.temperature or 0.7,
-            max_tokens=request_body.max_tokens or 1000,
-            extra_headers=extra_headers,
-        )
-
-        generated_response = response.choices[0].message.content.strip()
-        logger.info(f"Response generated successfully")
-
-        return ChatCompletionResponse(
-            id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
-            created=int(time.time()),
-            model=request_body.model,
-            choices=[
-                {
-                    "index": 0,
-                    "message": {
-                        "role": "assistant",
-                        "content": generated_response,
-                    },
-                    "finish_reason": "stop",
-                }
-            ],
-            usage={
-                "prompt_tokens": sum(
-                    len(msg.content.split()) for msg in request_body.messages
-                ),
-                "completion_tokens": len(generated_response.split()),
-                "total_tokens": sum(
-                    len(msg.content.split()) for msg in request_body.messages
-                )
-                + len(generated_response.split()),
-            },
-        )
-
-    except Exception as e:
-        logger.error(f"Error generating response: {e}")
-
-        # Fallback response
-        fallback_message = "I apologize, but I'm having trouble generating a response right now. Please try again."
-        return ChatCompletionResponse(
-            id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
-            created=int(time.time()),
-            model=request_body.model,
-            choices=[
-                {
-                    "index": 0,
-                    "message": {"role": "assistant", "content": fallback_message},
-                    "finish_reason": "stop",
-                }
-            ],
-            usage={
-                "prompt_tokens": sum(
-                    len(msg.content.split()) for msg in request_body.messages
-                ),
-                "completion_tokens": len(fallback_message.split()),
-                "total_tokens": sum(
-                    len(msg.content.split()) for msg in request_body.messages
-                )
-                + len(fallback_message.split()),
-            },
-        )
-
-
-@app.get("/health")
-async def health_check():
-    """Health check endpoint."""
-    return {"status": "healthy"}
-
-
-def start_server(host: str = "localhost", port: int = 8000):
-    """Start the REST server."""
-    uvicorn.run(
-        app,
-        host=host,
-        port=port,
-        log_config={
-            "version": 1,
-            "disable_existing_loggers": False,
-            "formatters": {
-                "default": {
-                    "format": "%(asctime)s - [RESPONSE_GENERATOR] - %(levelname)s - %(message)s",
-                },
-            },
-            "handlers": {
-                "default": {
-                    "formatter": "default",
-                    "class": "logging.StreamHandler",
-                    "stream": "ext://sys.stdout",
-                },
-            },
-            "root": {
-                "level": "INFO",
-                "handlers": ["default"],
-            },
-        },
-    )
--- a/demos/use_cases/rag_agent/src/rag_agent/sample_knowledge_base.csv
+++ b/demos/use_cases/rag_agent/src/rag_agent/sample_knowledge_base.csv
@ -1,257 +0,0 @@
-path,content
-TechCorp_CloudServices_SLA_Agreement_2024,"SERVICE LEVEL AGREEMENT
-This Service Level Agreement (""SLA"") is entered into on March 15, 2024, between TechCorp Solutions Inc., a Delaware corporation (""Provider""), and CloudFirst Enterprises LLC (""Customer"").
-
-DEFINITIONS
-Service Availability: The percentage of time during which the cloud services are operational and accessible.
-Downtime: Any period when the services are unavailable or inaccessible to Customer.
-Response Time: The time between service request submission and initial response from Provider.
-
-SERVICE COMMITMENTS
-Provider guarantees 99.9% uptime for all cloud infrastructure services during any calendar month.
-Average response time for API calls shall not exceed 200 milliseconds under normal operating conditions.
-Customer support response times: Critical issues within 1 hour, Standard issues within 4 hours.
-
-REMEDIES
-For each full percentage point below 99.9% availability, Customer receives 10% credit on monthly fees.
-If response times exceed 500ms for more than 5 minutes in any hour, Customer receives 5% monthly credit.
-
-MONITORING AND REPORTING
-Provider will maintain real-time monitoring systems and provide monthly performance reports.
-All metrics will be measured from Provider's monitoring systems located in primary data centers.
-
-This SLA remains in effect for the duration of the underlying service agreement.
-
-Executed by:
-TechCorp Solutions Inc.
-Sarah Mitchell, VP Operations
-Date: March 15, 2024
-
-CloudFirst Enterprises LLC
-Robert Chen, CTO
-Date: March 16, 2024"
-
-DataSecure_Privacy_Policy_v3.2,"PRIVACY POLICY
-DataSecure Analytics, Inc. (""Company"") Privacy Policy
-Effective Date: January 1, 2024
-Last Updated: February 28, 2024
-
-INFORMATION COLLECTION
-We collect information you provide directly, such as account details, usage preferences, and communication records.
-Automatically collected data includes IP addresses, browser types, device information, and service interaction logs.
-Third-party integrations may provide additional user behavior and demographic information with consent.
-
-DATA USAGE
-Personal information is used to provide services, improve user experience, and communicate service updates.
-Aggregated, non-identifiable data may be used for analytics, research, and service enhancement.
-We do not sell personal information to third parties for marketing purposes.
-
-DATA PROTECTION
-All data is encrypted in transit using TLS 1.3 and at rest using AES-256 encryption.
-Access controls limit data access to authorized personnel only on a need-to-know basis.
-Regular security audits and penetration testing ensure ongoing protection measures.
-
-DATA RETENTION
-Personal data is retained for the duration of active service plus 24 months.
-Logs and analytics data are retained for 12 months unless legally required otherwise.
-Upon account deletion, personal data is permanently removed within 30 days.
-
-USER RIGHTS
-Users may request access to, correction of, or deletion of their personal information.
-Data portability requests will be fulfilled in standard formats within 30 days.
-Marketing communications can be opted out of at any time.
-
-CONTACT
-For privacy concerns, contact: privacy@datasecure.com
-Data Protection Officer: Jennifer Walsh, jwalsh@datasecure.com"
-
-GlobalManufacturing_SupplyChain_Contract_Q2_2024,"SUPPLY CHAIN AGREEMENT
-This Supply Chain Agreement is entered into between GlobalManufacturing Corp (""Buyer"") and PrecisionParts Ltd (""Supplier"") effective April 1, 2024.
-
-SCOPE OF SERVICES
-Supplier will provide automotive components including brake assemblies, suspension parts, and electrical harnesses.
-All products must meet ISO 9001 quality standards and automotive industry specifications.
-Delivery schedule: Weekly shipments every Tuesday, with 48-hour advance shipping notifications.
-
-PRICING AND PAYMENT
-Component pricing is fixed for initial 6-month term with quarterly price review thereafter.
-Payment terms: Net 45 days from invoice date via electronic transfer.
-Volume discounts apply: 5% for orders exceeding 10,000 units per month, 8% for orders exceeding 25,000 units.
-
-QUALITY REQUIREMENTS
-All components must pass incoming inspection with less than 0.1% defect rate.
-Supplier maintains quality certifications including IATF 16949 and environmental compliance.
-Batch tracking and traceability required for all delivered components.
-
-LOGISTICS AND DELIVERY
-Supplier responsible for packaging, labeling, and delivery to Buyer's distribution centers.
-Delivery windows: 8 AM - 4 PM, Monday through Friday, with advance appointment scheduling.
-Late delivery penalties: 2% of shipment value for each day beyond scheduled delivery.
-
-RISK MANAGEMENT
-Supplier maintains business continuity plans and alternative sourcing strategies.
-Force majeure events must be reported within 24 hours with mitigation plans.
-Insurance requirements: $5M general liability, $2M product liability coverage.
-
-INTELLECTUAL PROPERTY
-All custom tooling and specifications remain property of Buyer.
-Supplier grants license to use necessary patents for component manufacturing.
-
-This agreement shall remain in effect for 24 months with automatic renewal unless terminated.
-
-GlobalManufacturing Corp
-Michael Rodriguez, Supply Chain Director
-Date: April 1, 2024
-
-PrecisionParts Ltd
-Amanda Foster, VP Sales
-Date: April 2, 2024"
-
-EduTech_StudentData_Management_Policy_2024,"STUDENT DATA MANAGEMENT POLICY
-EduTech Learning Platform - Data Management and Protection Policy
-Document Version: 2.1
-Effective Date: August 15, 2024
-
-SCOPE AND PURPOSE
-This policy governs the collection, use, storage, and protection of student educational records and personal information.
-Applies to all employees, contractors, and third-party service providers accessing student data.
-Compliance with FERPA, COPPA, and state student privacy laws is mandatory.
-
-DATA CLASSIFICATION
-Educational Records: Grades, attendance, assignments, and academic progress information.
-Personal Information: Names, addresses, contact details, and demographic information.
-Behavioral Data: Learning patterns, platform usage, and engagement metrics.
-
-COLLECTION PRINCIPLES
-Data collection is limited to educational purposes and service improvement only.
-Parental consent required for students under 13 years of age.
-Students and parents have right to review and request corrections to educational records.
-
-ACCESS CONTROLS
-Role-based access ensures personnel see only data necessary for their functions.
-Multi-factor authentication required for all system access.
-Access logs maintained and reviewed monthly for unauthorized activity.
-
-DATA SHARING
-Educational records shared only with authorized school personnel and parents/students.
-No data sharing with third parties for commercial purposes without explicit consent.
-Research data must be de-identified and aggregated before external sharing.
-
-SECURITY MEASURES
-Data encrypted using industry-standard protocols during transmission and storage.
-Regular security assessments and vulnerability testing conducted quarterly.
-Incident response plan includes notification procedures for data breaches.
-
-RETENTION AND DISPOSAL
-Student records retained according to school district policies, typically 5-7 years post-graduation.
-Inactive accounts and associated data purged after 2 years of non-use.
-Secure data destruction protocols ensure complete removal of sensitive information.
-
-COMPLIANCE MONITORING
-Annual privacy training required for all staff handling student data.
-Regular audits ensure ongoing compliance with applicable privacy regulations.
-Privacy impact assessments conducted for new features or data uses.
-
-Contact: Dr. Lisa Thompson, Chief Privacy Officer
-Email: privacy@edutech-learning.com
-Phone: (555) 123-4567"
-
-FinanceFirst_Investment_Advisory_Agreement_2024,"INVESTMENT ADVISORY AGREEMENT
-This Investment Advisory Agreement is entered into between FinanceFirst Advisors LLC (""Advisor"") and Madison Investment Group (""Client"") on May 20, 2024.
-
-ADVISORY SERVICES
-Advisor will provide comprehensive investment management and financial planning services.
-Services include portfolio construction, asset allocation, risk assessment, and performance monitoring.
-Regular portfolio reviews conducted quarterly with detailed performance reporting.
-
-INVESTMENT AUTHORITY
-Client grants Advisor discretionary authority to make investment decisions within agreed parameters.
-Investment universe includes stocks, bonds, ETFs, mutual funds, and alternative investments as appropriate.
-All trades executed through qualified broker-dealers with best execution practices.
-
-FEE STRUCTURE
-Management fee: 1.25% annually on assets under management, calculated and billed quarterly.
-Performance fee: 15% of returns exceeding S&P 500 benchmark, calculated annually.
-Additional fees may apply for specialized services such as tax planning or estate planning.
-
-CLIENT RESPONSIBILITIES
-Client must provide accurate financial information and promptly communicate changes in circumstances.
-Investment objectives and risk tolerance should be reviewed and updated annually.
-Client responsible for reviewing and approving investment policy statement.
-
-RISK DISCLOSURE
-All investments carry risk of loss, and past performance does not guarantee future results.
-Diversification does not ensure profit or protect against loss in declining markets.
-Alternative investments may have limited liquidity and higher volatility.
-
-REGULATORY COMPLIANCE
-Advisor is registered with the Securities and Exchange Commission as an investment advisor.
-All activities conducted in accordance with Investment Advisers Act of 1940 and applicable regulations.
-Form ADV Part 2 brochure provided annually with material updates.
-
-CONFIDENTIALITY
-All client information treated as confidential and shared only as necessary for service provision.
-Third-party service providers bound by confidentiality agreements.
-Client data protected through secure systems and access controls.
-
-TERMINATION
-Either party may terminate agreement with 30 days written notice.
-Upon termination, Advisor will assist with orderly transfer of assets to new custodian or advisor.
-Final fee calculation prorated to date of termination.
-
-FinanceFirst Advisors LLC
-Thomas Anderson, Managing Partner
-Date: May 20, 2024
-
-Madison Investment Group
-Rebecca Martinez, Chief Investment Officer
-Date: May 21, 2024"
-
-HealthSystem_PatientCare_Standards_2024,"PATIENT CARE STANDARDS AND PROTOCOLS
-Metropolitan Health System - Clinical Care Standards
-Document ID: MHS-PCS-2024-001
-Effective Date: June 1, 2024
-
-PATIENT SAFETY PROTOCOLS
-All patients must have proper identification verification using two unique identifiers.
-Medication administration requires independent double-check for high-risk medications.
-Fall risk assessments completed within 4 hours of admission with appropriate interventions.
-
-CLINICAL DOCUMENTATION
-Medical records must be completed within 24 hours of patient encounter.
-All entries require electronic signature with timestamp and provider identification.
-Critical values and abnormal results must be communicated and documented immediately.
-
-INFECTION CONTROL
-Hand hygiene compliance monitored with target rate of 95% or higher.
-Personal protective equipment used according to transmission-based precautions.
-Isolation procedures implemented within 2 hours of identification of infectious conditions.
-
-EMERGENCY RESPONSE
-Code team response time target: 3 minutes from activation to arrival.
-Crash cart and emergency equipment checks performed daily and documented.
-All staff required to maintain current CPR and emergency response certifications.
-
-PATIENT COMMUNICATION
-Patient rights and responsibilities communicated upon admission.
-Informed consent obtained and documented prior to procedures and treatments.
-Family involvement encouraged with respect for patient privacy preferences.
-
-QUALITY MEASURES
-Patient satisfaction scores monitored monthly with target of 4.5/5.0 or higher.
-Medication error rates tracked with goal of less than 1 per 1000 patient days.
-Hospital-acquired infection rates measured and benchmarked against national standards.
-
-STAFF COMPETENCY
-Annual competency assessments required for all clinical staff.
-Continuing education requirements: 24 hours annually for nurses, 40 hours for physicians.
-Specialty certifications maintained according to department and role requirements.
-
-TECHNOLOGY STANDARDS
-Electronic health record system used for all patient documentation.
-Telemedicine capabilities available for remote consultations and monitoring.
-Clinical decision support tools integrated to assist with diagnosis and treatment decisions.
-
-Contact: Dr. Patricia Williams, Chief Medical Officer
-Email: pwilliams@metrohealthsystem.org
-Phone: (555) 987-6543"
--- a/demos/use_cases/rag_agent/start_agents.sh
+++ b/demos/use_cases/rag_agent/start_agents.sh
@ -1,38 +0,0 @@
-#!/bin/bash
-set -e
-
-WAIT_FOR_PIDS=()
-
-log() {
-  timestamp=$(python3 -c 'from datetime import datetime; print(datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:23])')
-  message="$*"
-  echo "$timestamp - $message"
-}
-
-cleanup() {
-    log "Caught signal, terminating all user processes ..."
-    for PID in "${WAIT_FOR_PIDS[@]}"; do
-        if kill $PID 2> /dev/null; then
-            log "killed process: $PID"
-        fi
-    done
-    exit 1
-}
-
-trap cleanup EXIT
-
-log "Starting query_parser agent on port 10500..."
-uv run python -m rag_agent --rest-server --host 0.0.0.0 --rest-port 10500 --agent query_parser &
-WAIT_FOR_PIDS+=($!)
-
-log "Starting context_builder agent on port 10501..."
-uv run python -m rag_agent --rest-server --host 0.0.0.0 --rest-port 10501 --agent context_builder &
-WAIT_FOR_PIDS+=($!)
-
-log "Starting response_generator agent on port 10502..."
-uv run python -m rag_agent --rest-server --host 0.0.0.0 --rest-port 10502 --agent response_generator &
-WAIT_FOR_PIDS+=($!)
-
-for PID in "${WAIT_FOR_PIDS[@]}"; do
-    wait "$PID"
-done
--- a/demos/use_cases/rag_agent/test.rest
+++ b/demos/use_cases/rag_agent/test.rest
@ -1,67 +0,0 @@
-@baseUrl = http://0.0.0.0:10502
-@model = gpt-4o
-
-# Health Check
-GET {{baseUrl}}/health
-
-###
-
-# Test 1: Simple Non-Streaming Chat Completion
-POST {{baseUrl}}/v1/chat/completions
-Content-Type: application/json
-
-{
-  "model": "{{model}}",
-  "messages": [
-    {
-      "role": "user",
-      "content": "Hello! Can you help me understand what machine learning is?"
-    }
-  ]
-}
-
-###
-
-# Test 2: Simple Streaming Chat Completion
-POST {{baseUrl}}/v1/chat/completions
-Content-Type: application/json
-
-{
-  "model": "{{model}}",
-  "messages": [
-    {
-      "role": "user",
-      "content": "Explain the concept of artificial intelligence in simple terms."
-    }
-  ],
-  "stream": true
-}
-
-### Test 3: Non-streaming chat completion sent to the listener on port 8001
-POST http://localhost:8001/v1/chat/completions
-Content-Type: application/json
-
-{
-  "model": "{{model}}",
-  "messages": [
-    {
-      "role": "user",
-      "content": "What is the guaranteed uptime percentage for TechCorp's cloud services?"
-    }
-  ],
-  "stream": false
-}
-
-### send request to context builder agent
-POST http://localhost:10501/v1/chat/completions
-Content-Type: application/json
-
-{
-  "model": "gpt-4o-mini",
-  "messages": [
-    {
-      "role": "user",
-      "content": "What is the guaranteed uptime percentage for TechCorp's cloud services?"
-    }
-  ]
-}
--- a/demos/use_cases/rag_agent/uv.lock
+++ b/demos/use_cases/rag_agent/uv.lock