merge main into model-listener-filter-chain

2026-06-17 15:25:17 +02:00 · 2026-03-10 06:52:19 +00:00 · 2026-03-10 06:52:19 +00:00 · aeb8aa9a54
commit aeb8aa9a54
parent 3d2be4f8b7 028a2cd196
99 changed files with 5792 additions and 655 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -20,8 +20,8 @@ jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
+      - uses: actions/checkout@v6
+      - uses: actions/setup-python@v6
      - uses: pre-commit/action@v3.0.1

  # ──────────────────────────────────────────────
@ -33,10 +33,10 @@ jobs:
      run:
        working-directory: ./cli
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6

      - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
        with:
          python-version: "3.14"

@ -53,6 +53,60 @@ jobs:
      - name: Run tests
        run: uv run pytest

+  # ──────────────────────────────────────────────
+  # Native mode smoke test — build from source & start natively
+  # ──────────────────────────────────────────────
+  native-smoke-test:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v6
+
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: "3.12"
+
+      # Pinned action (as in the e2e jobs) rather than an unpinned install script.
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@stable
+        with:
+          targets: wasm32-wasip1
+
+      - name: Install planoai CLI
+        working-directory: ./cli
+        run: |
+          uv sync
+          uv tool install .
+
+      - name: Build native binaries
+        run: planoai build
+
+      - name: Start plano natively
+        env:
+          OPENAI_API_KEY: test-key-not-used
+        run: planoai up tests/e2e/config_native_smoke.yaml
+
+      - name: Health check
+        run: |
+          for i in $(seq 1 30); do
+            if curl -sf http://localhost:12000/healthz > /dev/null 2>&1; then
+              echo "Health check passed"
+              exit 0
+            fi
+            sleep 1
+          done
+          echo "Health check failed after 30s"
+          cat ~/.plano/run/logs/envoy.log ~/.plano/run/logs/brightstaff.log || true
+          exit 1
+
+      - name: Stop plano
+        if: always()
+        run: planoai down || true
+
  # ──────────────────────────────────────────────
  # Single Docker build — shared by all downstream jobs
  # ──────────────────────────────────────────────
@ -60,7 +114,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Free disk space on runner
        run: |
@ -79,16 +133,16 @@ jobs:
          load: true
          tags: |
            ${{ env.PLANO_DOCKER_IMAGE }}
-            ${{ env.DOCKER_IMAGE }}:0.4.8
+            ${{ env.DOCKER_IMAGE }}:0.4.11
            ${{ env.DOCKER_IMAGE }}:latest
          cache-from: type=gha
          cache-to: type=gha,mode=max

      - name: Save image as artifact
-        run: docker save ${{ env.PLANO_DOCKER_IMAGE }} ${{ env.DOCKER_IMAGE }}:0.4.8 ${{ env.DOCKER_IMAGE }}:latest -o /tmp/plano-image.tar
+        run: docker save ${{ env.PLANO_DOCKER_IMAGE }} ${{ env.DOCKER_IMAGE }}:0.4.11 ${{ env.DOCKER_IMAGE }}:latest -o /tmp/plano-image.tar

      - name: Upload image artifact
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v6
        with:
          name: plano-image
          path: /tmp/plano-image.tar
@ -98,25 +152,18 @@ jobs:
  # Validate plano config
  # ──────────────────────────────────────────────
  validate-config:
-    needs: docker-build
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
        with:
          python-version: "3.14"

-      - name: Download plano image
-        uses: actions/download-artifact@v4
-        with:
-          name: plano-image
-          path: /tmp
-
-      - name: Load plano image
-        run: docker load -i /tmp/plano-image.tar
+      - name: Install planoai
+        run: pip install -e ./cli

      - name: Validate plano config
        run: bash config/validate_plano_config.sh
@ -129,10 +176,10 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Download plano image
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v7
        with:
          name: plano-image
          path: /tmp
@ -173,7 +220,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Free disk space on runner
        run: |
@ -182,7 +229,7 @@ jobs:
          docker volume prune -f || true

      - name: Download plano image
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v7
        with:
          name: plano-image
          path: /tmp
@ -191,12 +238,12 @@ jobs:
        run: docker load -i /tmp/plano-image.tar

      - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
        with:
          python-version: "3.14"

      - name: Install uv
-        uses: astral-sh/setup-uv@v5
+        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          cache-dependency-glob: |
@ -223,7 +270,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Free disk space on runner
        run: |
@ -232,7 +279,7 @@ jobs:
          docker volume prune -f || true

      - name: Download plano image
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v7
        with:
          name: plano-image
          path: /tmp
@ -241,12 +288,12 @@ jobs:
        run: docker load -i /tmp/plano-image.tar

      - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
        with:
          python-version: "3.14"

      - name: Install uv
-        uses: astral-sh/setup-uv@v5
+        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          cache-dependency-glob: |
@ -273,7 +320,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Free disk space on runner
        run: |
@ -282,7 +329,7 @@ jobs:
          docker volume prune -f || true

      - name: Download plano image
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v7
        with:
          name: plano-image
          path: /tmp
@ -291,12 +338,12 @@ jobs:
        run: docker load -i /tmp/plano-image.tar

      - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
        with:
          python-version: "3.14"

      - name: Install uv
-        uses: astral-sh/setup-uv@v5
+        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          cache-dependency-glob: |
@ -330,15 +377,15 @@ jobs:
        working-directory: ./tests/archgw
    steps:
      - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.python-version }}

      - name: Download plano image
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v7
        with:
          name: plano-image
          path: /tmp
@ -388,15 +435,15 @@ jobs:
    runs-on: ubuntu-latest-m
    steps:
      - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
        with:
          python-version: "3.14"

      - name: Download plano image
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v7
        with:
          name: plano-image
          path: /tmp
@ -440,15 +487,15 @@ jobs:
    runs-on: ubuntu-latest-m
    steps:
      - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
        with:
          python-version: "3.14"

      - name: Download plano image
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v7
        with:
          name: plano-image
          path: /tmp
--- a/.github/workflows/docker-push-main.yml
+++ b/.github/workflows/docker-push-main.yml
@ -19,7 +19,7 @@ jobs:
    runs-on: [linux-arm64]
    steps:
      - name: Checkout Repository
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
@ -35,7 +35,7 @@ jobs:
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build and Push ARM64 Image
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
        with:
          context: .
          file: ./Dockerfile
@ -50,7 +50,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repository
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
@ -66,7 +66,7 @@ jobs:
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build and Push AMD64 Image
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
        with:
          context: .
          file: ./Dockerfile
--- a/.github/workflows/docker-push-release.yml
+++ b/.github/workflows/docker-push-release.yml
@ -18,7 +18,7 @@ jobs:
    runs-on: [linux-arm64]
    steps:
      - name: Checkout Repository
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
@ -42,7 +42,7 @@ jobs:
            type=raw,value={{tag}}

      - name: Build and Push ARM64 Image
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
        with:
          context: .
          file: ./Dockerfile
@ -57,7 +57,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repository
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
@ -81,7 +81,7 @@ jobs:
            type=raw,value={{tag}}

      - name: Build and Push AMD64 Image
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
        with:
          context: .
          file: ./Dockerfile
--- a/.github/workflows/publish-binaries.yml
+++ b/.github/workflows/publish-binaries.yml
@ -0,0 +1,111 @@
+name: Publish pre-compiled binaries (release)
+
+on:
+  release:
+    types: [published]
+  workflow_dispatch:
+    inputs:
+      tag:
+        description: "Release tag to upload binaries to (e.g. 0.4.9)"
+        required: true
+
+permissions:
+  contents: write
+
+jobs:
+  build-wasm-plugins:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v6
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@stable
+        with:
+          targets: wasm32-wasip1
+
+      - name: Build WASM plugins
+        working-directory: crates
+        run: cargo build --release --target wasm32-wasip1 -p llm_gateway -p prompt_gateway
+
+      - name: Compress and upload WASM plugins to release
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          # Pass the tag through env so it is never interpolated into the script.
+          RELEASE_TAG: ${{ github.event.release.tag_name || inputs.tag }}
+        run: |
+          cp crates/target/wasm32-wasip1/release/prompt_gateway.wasm prompt_gateway.wasm
+          cp crates/target/wasm32-wasip1/release/llm_gateway.wasm llm_gateway.wasm
+          gzip prompt_gateway.wasm llm_gateway.wasm
+          gh release upload "$RELEASE_TAG" \
+            prompt_gateway.wasm.gz llm_gateway.wasm.gz \
+            --clobber
+
+  build-brightstaff-linux-amd64:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v6
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@stable
+
+      - name: Build brightstaff
+        working-directory: crates
+        run: cargo build --release -p brightstaff
+
+      - name: Compress and upload brightstaff to release
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          # Pass the tag through env so it is never interpolated into the script.
+          RELEASE_TAG: ${{ github.event.release.tag_name || inputs.tag }}
+        run: |
+          cp crates/target/release/brightstaff brightstaff-linux-amd64
+          gzip brightstaff-linux-amd64
+          gh release upload "$RELEASE_TAG" brightstaff-linux-amd64.gz --clobber
+
+  build-brightstaff-linux-arm64:
+    runs-on: [linux-arm64]
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v6
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@stable
+
+      - name: Build brightstaff
+        working-directory: crates
+        run: cargo build --release -p brightstaff
+
+      - name: Compress and upload brightstaff to release
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          # Pass the tag through env so it is never interpolated into the script.
+          RELEASE_TAG: ${{ github.event.release.tag_name || inputs.tag }}
+        run: |
+          cp crates/target/release/brightstaff brightstaff-linux-arm64
+          gzip brightstaff-linux-arm64
+          gh release upload "$RELEASE_TAG" brightstaff-linux-arm64.gz --clobber
+
+  build-brightstaff-darwin-arm64:
+    runs-on: macos-latest
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v6
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@stable
+
+      - name: Build brightstaff
+        working-directory: crates
+        run: cargo build --release -p brightstaff
+
+      - name: Compress and upload brightstaff to release
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          # Pass the tag through env so it is never interpolated into the script.
+          RELEASE_TAG: ${{ github.event.release.tag_name || inputs.tag }}
+        run: |
+          cp crates/target/release/brightstaff brightstaff-darwin-arm64
+          gzip brightstaff-darwin-arm64
+          gh release upload "$RELEASE_TAG" brightstaff-darwin-arm64.gz --clobber
--- a/.github/workflows/publish-pypi.yml
+++ b/.github/workflows/publish-pypi.yml
@ -17,20 +17,20 @@ jobs:

    steps:
      - name: Checkout code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v6

      - name: Set up Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v6
        with:
          python-version: "3.14"

      - name: Install uv
-        uses: astral-sh/setup-uv@v4
+        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true

      - name: Build package
-        run: uv build
+        run: uv build --wheel

      - name: Publish to PyPI
        env:
--- a/.github/workflows/static.yml
+++ b/.github/workflows/static.yml
@ -13,11 +13,11 @@ jobs:
    steps:
      # Check out the code from the repository
      - name: Checkout repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v6

      # Set up Docker
      - name: Set up Docker
-        uses: docker/setup-buildx-action@v2
+        uses: docker/setup-buildx-action@v3

      # Build and run the Docker container to generate the documentation
      - name: Build documentation using Docker
@ -30,7 +30,7 @@ jobs:

      # Deploy the docs to GitHub Pages
      - name: Deploy to GitHub Pages
-        uses: peaceiris/actions-gh-pages@v3
+        uses: peaceiris/actions-gh-pages@v4
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          publish_dir: ./docs/build/html  # Adjust this path based on where the HTML is generated
--- a/Dockerfile
+++ b/Dockerfile
@ -1,3 +1,6 @@
+# Envoy version — keep in sync with cli/planoai/consts.py ENVOY_VERSION
+ARG ENVOY_VERSION=v1.37.0
+
 # --- Dependency cache ---
 FROM rust:1.93.0 AS deps
 RUN rustup -v target add wasm32-wasip1
@ -40,7 +43,7 @@ COPY crates/brightstaff/src    brightstaff/src
 RUN find common hermesllm brightstaff -name "*.rs" -exec touch {} +
 RUN cargo build --release -p brightstaff

-FROM docker.io/envoyproxy/envoy:v1.37.0 AS envoy
+FROM docker.io/envoyproxy/envoy:${ENVOY_VERSION} AS envoy

 FROM python:3.14-slim AS arch

@ -66,8 +69,10 @@ RUN pip install --no-cache-dir uv
 COPY cli/pyproject.toml ./
 COPY cli/uv.lock ./
 COPY cli/README.md ./
+COPY config/plano_config_schema.yaml /config/plano_config_schema.yaml
+COPY config/envoy.template.yaml /config/envoy.template.yaml

-RUN uv run pip install --no-cache-dir .
+RUN pip install --no-cache-dir -e .

 COPY cli/planoai planoai/
 COPY config/envoy.template.yaml .
--- a/apps/www/src/components/Hero.tsx
+++ b/apps/www/src/components/Hero.tsx
@ -24,7 +24,7 @@ export function Hero() {
            >
              <div className="inline-flex flex-wrap items-center gap-1.5 sm:gap-2 px-3 sm:px-4 py-1 rounded-full bg-[rgba(185,191,255,0.4)] border border-[var(--secondary)] shadow backdrop-blur hover:bg-[rgba(185,191,255,0.6)] transition-colors cursor-pointer">
                <span className="text-xs sm:text-sm font-medium text-black/65">
-                  v0.4.8
+                  v0.4.11
                </span>
                <span className="text-xs sm:text-sm font-medium text-black ">
                  —
--- a/build_filter_image.sh
+++ b/build_filter_image.sh
@ -1 +1 @@
-docker build  -f Dockerfile . -t katanemo/plano -t katanemo/plano:0.4.8
+docker build  -f Dockerfile . -t katanemo/plano -t katanemo/plano:0.4.11
--- a/cli/planoai/__init__.py
+++ b/cli/planoai/__init__.py
@ -1,3 +1,3 @@
 """Plano CLI - Intelligent Prompt Gateway."""

-__version__ = "0.4.8"
+__version__ = "0.4.11"
--- a/cli/planoai/consts.py
+++ b/cli/planoai/consts.py
@ -5,5 +5,17 @@ PLANO_COLOR = "#969FF4"

 SERVICE_NAME_ARCHGW = "plano"
 PLANO_DOCKER_NAME = "plano"
-PLANO_DOCKER_IMAGE = os.getenv("PLANO_DOCKER_IMAGE", "katanemo/plano:0.4.8")
-DEFAULT_OTEL_TRACING_GRPC_ENDPOINT = "http://host.docker.internal:4317"
+PLANO_DOCKER_IMAGE = os.getenv("PLANO_DOCKER_IMAGE", "katanemo/plano:0.4.11")
+DEFAULT_OTEL_TRACING_GRPC_ENDPOINT = "http://localhost:4317"
+
+# Native mode constants
+PLANO_HOME = os.path.join(os.path.expanduser("~"), ".plano")
+PLANO_RUN_DIR = os.path.join(PLANO_HOME, "run")
+PLANO_BIN_DIR = os.path.join(PLANO_HOME, "bin")
+PLANO_PLUGINS_DIR = os.path.join(PLANO_HOME, "plugins")
+ENVOY_VERSION = "v1.37.0"  # keep in sync with Dockerfile ARG ENVOY_VERSION
+NATIVE_PID_FILE = os.path.join(PLANO_RUN_DIR, "plano.pid")
+DEFAULT_NATIVE_OTEL_TRACING_GRPC_ENDPOINT = "http://localhost:4317"
+
+PLANO_GITHUB_REPO = "katanemo/archgw"
+PLANO_RELEASE_BASE_URL = f"https://github.com/{PLANO_GITHUB_REPO}/releases/download"
--- a/cli/planoai/core.py
+++ b/cli/planoai/core.py
@ -33,8 +33,11 @@ def _get_gateway_ports(plano_config_file: str) -> list[int]:
    with open(plano_config_file) as f:
        plano_config_dict = yaml.safe_load(f)

+    model_providers = plano_config_dict.get("llm_providers") or plano_config_dict.get(
+        "model_providers"
+    )
    listeners, _, _ = convert_legacy_listeners(
-        plano_config_dict.get("listeners"), plano_config_dict.get("llm_providers")
+        plano_config_dict.get("listeners"), model_providers
    )

    all_ports = [listener.get("port") for listener in listeners]
--- a/cli/planoai/docker_cli.py
+++ b/cli/planoai/docker_cli.py
@ -40,11 +40,35 @@ def docker_remove_container(container: str) -> str:
    return result.returncode


+def _prepare_docker_config(plano_config_file: str) -> str:
+    """Copy config to a temp file, replacing localhost with host.docker.internal.
+
+    Configs use localhost for native-first mode, but Docker containers need
+    host.docker.internal to reach services on the host.
+    """
+    import tempfile
+
+    with open(plano_config_file, "r") as f:
+        content = f.read()
+
+    if "localhost" not in content:
+        return plano_config_file
+
+    content = content.replace("localhost", "host.docker.internal")
+    tmp = tempfile.NamedTemporaryFile(
+        mode="w", suffix=".yaml", prefix="plano_config_", delete=False
+    )
+    tmp.write(content)
+    tmp.close()
+    return tmp.name
+
+
 def docker_start_plano_detached(
    plano_config_file: str,
    env: dict,
    gateway_ports: list[int],
 ) -> str:
+    docker_config = _prepare_docker_config(plano_config_file)
    env_args = [item for key, value in env.items() for item in ["-e", f"{key}={value}"]]

    port_mappings = [
@ -58,7 +82,7 @@ def docker_start_plano_detached(
    port_mappings_args = [item for port in port_mappings for item in ("-p", port)]

    volume_mappings = [
-        f"{plano_config_file}:/app/plano_config.yaml:ro",
+        f"{docker_config}:/app/plano_config.yaml:ro",
    ]
    volume_mappings_args = [
        item for volume in volume_mappings for item in ("-v", volume)
--- a/cli/planoai/main.py
+++ b/cli/planoai/main.py
@ -30,6 +30,7 @@ from planoai.init_cmd import init as init_cmd
 from planoai.trace_cmd import trace as trace_cmd, start_trace_listener_background
 from planoai.consts import (
    DEFAULT_OTEL_TRACING_GRPC_ENDPOINT,
+    DEFAULT_NATIVE_OTEL_TRACING_GRPC_ENDPOINT,
    PLANO_DOCKER_IMAGE,
    PLANO_DOCKER_NAME,
 )
@ -130,7 +131,13 @@ def main(ctx, version):


@click.command()
-def build():
+@click.option(
+    "--docker",
+    default=False,
+    help="Build the Docker image instead of native binaries.",
+    is_flag=True,
+)
+def build(docker):
    """Build Plano from source. Works from any directory within the repo."""

    # Find the repo root
@ -141,6 +148,68 @@ def build():
        )
        sys.exit(1)

+    if not docker:
+        import shutil
+
+        crates_dir = os.path.join(repo_root, "crates")
+        console = _console()
+        _print_cli_header(console)
+
+        if not shutil.which("cargo"):
+            console.print(
+                "[red]✗[/red] [bold]cargo[/bold] not found. "
+                "Install Rust: [cyan]https://rustup.rs[/cyan]"
+            )
+            sys.exit(1)
+
+        console.print("[dim]Building WASM plugins (wasm32-wasip1)...[/dim]")
+        try:
+            subprocess.run(
+                [
+                    "cargo",
+                    "build",
+                    "--release",
+                    "--target",
+                    "wasm32-wasip1",
+                    "-p",
+                    "llm_gateway",
+                    "-p",
+                    "prompt_gateway",
+                ],
+                cwd=crates_dir,
+                check=True,
+            )
+            log.info("WASM plugins built")
+        except subprocess.CalledProcessError as e:
+            console.print(f"[red]✗[/red] WASM build failed: {e}")
+            sys.exit(1)
+
+        console.print("[dim]Building brightstaff (native)...[/dim]")
+        try:
+            subprocess.run(
+                [
+                    "cargo",
+                    "build",
+                    "--release",
+                    "-p",
+                    "brightstaff",
+                ],
+                cwd=crates_dir,
+                check=True,
+            )
+            log.info("brightstaff built")
+        except subprocess.CalledProcessError as e:
+            console.print(f"[red]✗[/red] brightstaff build failed: {e}")
+            sys.exit(1)
+
+        wasm_dir = os.path.join(crates_dir, "target", "wasm32-wasip1", "release")
+        native_dir = os.path.join(crates_dir, "target", "release")
+        console.print(f"\n[bold]Build artifacts:[/bold]")
+        console.print(f"  {os.path.join(wasm_dir, 'prompt_gateway.wasm')}")
+        console.print(f"  {os.path.join(wasm_dir, 'llm_gateway.wasm')}")
+        console.print(f"  {os.path.join(native_dir, 'brightstaff')}")
+        return
+
    dockerfile_path = os.path.join(repo_root, "Dockerfile")

    if not os.path.exists(dockerfile_path):
@ -192,7 +261,13 @@ def build():
    help="Port for the OTLP trace collector (default: 4317).",
    show_default=True,
 )
-def up(file, path, foreground, with_tracing, tracing_port):
+@click.option(
+    "--docker",
+    default=False,
+    help="Run Plano inside Docker instead of natively.",
+    is_flag=True,
+)
+def up(file, path, foreground, with_tracing, tracing_port, docker):
    """Starts Plano."""
    from rich.status import Status

@ -209,26 +284,51 @@ def up(file, path, foreground, with_tracing, tracing_port):
        )
        sys.exit(1)

-    with Status(
-        "[dim]Validating configuration[/dim]", spinner="dots", spinner_style="dim"
-    ):
-        (
-            validation_return_code,
-            _,
-            validation_stderr,
-        ) = docker_validate_plano_schema(plano_config_file)
+    if not docker:
+        from planoai.native_runner import native_validate_config

-    if validation_return_code != 0:
-        console.print(f"[red]✗[/red] Validation failed")
-        if validation_stderr:
-            console.print(f"  [dim]{validation_stderr.strip()}[/dim]")
-        sys.exit(1)
+        with Status(
+            "[dim]Validating configuration[/dim]",
+            spinner="dots",
+            spinner_style="dim",
+        ):
+            try:
+                native_validate_config(plano_config_file)
+            except SystemExit:
+                console.print(f"[red]✗[/red] Validation failed")
+                sys.exit(1)
+            except Exception as e:
+                console.print(f"[red]✗[/red] Validation failed")
+                console.print(f"  [dim]{str(e).strip()}[/dim]")
+                sys.exit(1)
+    else:
+        with Status(
+            "[dim]Validating configuration (Docker)[/dim]",
+            spinner="dots",
+            spinner_style="dim",
+        ):
+            (
+                validation_return_code,
+                _,
+                validation_stderr,
+            ) = docker_validate_plano_schema(plano_config_file)

-    console.print(f"[green]✓[/green] Configuration valid")
+        if validation_return_code != 0:
+            console.print(f"[red]✗[/red] Validation failed")
+            if validation_stderr:
+                console.print(f"  [dim]{validation_stderr.strip()}[/dim]")
+            sys.exit(1)
+
+    log.info("Configuration valid")

    # Set up environment
+    default_otel = (
+        DEFAULT_OTEL_TRACING_GRPC_ENDPOINT
+        if docker
+        else DEFAULT_NATIVE_OTEL_TRACING_GRPC_ENDPOINT
+    )
    env_stage = {
-        "OTEL_TRACING_GRPC_ENDPOINT": DEFAULT_OTEL_TRACING_GRPC_ENDPOINT,
+        "OTEL_TRACING_GRPC_ENDPOINT": default_otel,
    }
    env = os.environ.copy()
    env.pop("PATH", None)
@ -296,13 +396,20 @@ def up(file, path, foreground, with_tracing, tracing_port):
                sys.exit(1)

        # Update the OTEL endpoint so the gateway sends traces to the right port
-        env_stage[
-            "OTEL_TRACING_GRPC_ENDPOINT"
-        ] = f"http://host.docker.internal:{tracing_port}"
+        tracing_host = "host.docker.internal" if docker else "localhost"
+        otel_endpoint = f"http://{tracing_host}:{tracing_port}"
+        env_stage["OTEL_TRACING_GRPC_ENDPOINT"] = otel_endpoint

    env.update(env_stage)
    try:
-        start_plano(plano_config_file, env, foreground=foreground)
+        if not docker:
+            from planoai.native_runner import start_native
+
+            start_native(
+                plano_config_file, env, foreground=foreground, with_tracing=with_tracing
+            )
+        else:
+            start_plano(plano_config_file, env, foreground=foreground)

        # When tracing is enabled but --foreground is not, keep the process
        # alive so the OTLP collector continues to receive spans.
@ -320,15 +427,31 @@ def up(file, path, foreground, with_tracing, tracing_port):


@click.command()
-def down():
+@click.option(
+    "--docker",
+    default=False,
+    help="Stop a Docker-based Plano instance.",
+    is_flag=True,
+)
+def down(docker):
    """Stops Plano."""
    console = _console()
    _print_cli_header(console)

-    with console.status(
-        f"[{PLANO_COLOR}]Shutting down Plano...[/{PLANO_COLOR}]", spinner="dots"
-    ):
-        stop_docker_container()
+    if not docker:
+        from planoai.native_runner import stop_native
+
+        with console.status(
+            f"[{PLANO_COLOR}]Shutting down Plano...[/{PLANO_COLOR}]",
+            spinner="dots",
+        ):
+            stop_native()
+    else:
+        with console.status(
+            f"[{PLANO_COLOR}]Shutting down Plano (Docker)...[/{PLANO_COLOR}]",
+            spinner="dots",
+        ):
+            stop_docker_container()


@click.command()
@ -360,9 +483,21 @@ def generate_prompt_targets(file):
    is_flag=True,
 )
@click.option("--follow", help="Follow the logs", is_flag=True)
-def logs(debug, follow):
+@click.option(
+    "--docker",
+    default=False,
+    help="Stream logs from a Docker-based Plano instance.",
+    is_flag=True,
+)
+def logs(debug, follow, docker):
    """Stream logs from access logs services."""

+    if not docker:
+        from planoai.native_runner import native_logs
+
+        native_logs(debug=debug, follow=follow)
+        return
+
    plano_process = None
    try:
        if debug:
--- a/cli/planoai/native_binaries.py
+++ b/cli/planoai/native_binaries.py
@ -0,0 +1,325 @@
+import gzip
+import os
+import platform
+import shutil
+import sys
+import tarfile
+import tempfile
+
+import planoai
+from planoai.consts import (
+    ENVOY_VERSION,
+    PLANO_BIN_DIR,
+    PLANO_PLUGINS_DIR,
+    PLANO_RELEASE_BASE_URL,
+)
+from planoai.utils import find_repo_root, getLogger
+
+log = getLogger(__name__)
+
+
+def _get_platform_slug():
+    """Map the host OS/CPU pair to the slug used in release asset names.
+
+    Returns one of ``linux-amd64``, ``linux-arm64`` or ``darwin-arm64``.
+    For any other platform an error is printed and the process exits with
+    status 1.
+    """
+    os_name = platform.system().lower()
+    arch = platform.machine().lower()
+    host = (os_name, arch)
+
+    if host == ("linux", "x86_64"):
+        return "linux-amd64"
+    if host == ("linux", "aarch64"):
+        return "linux-arm64"
+    if host == ("darwin", "arm64"):
+        return "darwin-arm64"
+
+    # Intel Macs get a dedicated explanation; everything else a generic one.
+    if host == ("darwin", "x86_64"):
+        message = (
+            "Error: macOS x86_64 (Intel) is not supported. "
+            "Pre-built binaries are only available for Apple Silicon (arm64)."
+        )
+    else:
+        message = (
+            f"Error: Unsupported platform {os_name}/{arch}. "
+            "Supported platforms: linux-amd64, linux-arm64, darwin-arm64"
+        )
+    print(message)
+    sys.exit(1)
+
+
+def _download_file(url, dest, label=None):
+    """Download a file from *url* to *dest* with a progress bar.
+
+    Args:
+        url: Location to fetch; anything ``urllib.request.urlopen`` accepts.
+        dest: Path of the file to write (created or truncated).
+        label: Text shown in front of the progress bar. Defaults to the
+            base name of *dest*.
+
+    On ``urllib.error.URLError`` (which also covers HTTP error statuses) an
+    error message is printed and the process exits with status 1.
+    """
+    import urllib.request
+    import urllib.error
+
+    if label is None:
+        label = os.path.basename(dest)
+
+    try:
+        # The context manager closes the response (and its socket) even when
+        # reading or writing fails part-way through.
+        with urllib.request.urlopen(url) as response:
+            # 0 means "size unknown": the progress bar is then skipped.
+            total = int(response.headers.get("Content-Length", 0))
+            downloaded = 0
+            block_size = 64 * 1024
+
+            with open(dest, "wb") as f:
+                while True:
+                    chunk = response.read(block_size)
+                    if not chunk:
+                        break
+                    f.write(chunk)
+                    downloaded += len(chunk)
+                    if total > 0:
+                        pct = downloaded * 100 // total
+                        bar_len = 30
+                        filled = bar_len * downloaded // total
+                        bar = "█" * filled + "░" * (bar_len - filled)
+                        mb = downloaded / (1024 * 1024)
+                        total_mb = total / (1024 * 1024)
+                        # "\r" redraws the bar in place on the same line.
+                        print(
+                            f"\r  {label} {bar} {pct}% ({mb:.1f}/{total_mb:.1f} MB)",
+                            end="",
+                            flush=True,
+                        )
+
+        print()  # newline after progress bar
+
+    except urllib.error.URLError as e:
+        print(f"\nError downloading {label}: {e}")
+        print(f"  URL: {url}")
+        print("Please check your internet connection and try again.")
+        sys.exit(1)
+
+
+def ensure_envoy_binary():
+    """Download Envoy binary if not already present or version changed.
+
+    The cached binary in PLANO_BIN_DIR is reused only when it is executable
+    and the ``envoy.version`` file next to it equals ENVOY_VERSION. Otherwise
+    the release archive for the current platform is downloaded and the
+    ``bin/envoy`` member is extracted.
+
+    Returns:
+        Path to the envoy binary.
+
+    Exits with status 1 if the archive does not contain an envoy binary.
+    """
+    envoy_path = os.path.join(PLANO_BIN_DIR, "envoy")
+    version_path = os.path.join(PLANO_BIN_DIR, "envoy.version")
+
+    if os.path.exists(envoy_path) and os.access(envoy_path, os.X_OK):
+        # Check if cached binary matches the pinned version
+        if os.path.exists(version_path):
+            with open(version_path, "r") as f:
+                cached_version = f.read().strip()
+            if cached_version == ENVOY_VERSION:
+                log.info(f"Envoy {ENVOY_VERSION} (cached)")
+                return envoy_path
+            log.info(
+                f"Envoy version changed ({cached_version} → {ENVOY_VERSION}), re-downloading..."
+            )
+        else:
+            log.info("Envoy binary found (unknown version, re-downloading...)")
+
+    slug = _get_platform_slug()
+    url = (
+        f"https://github.com/tetratelabs/archive-envoy/releases/download/"
+        f"{ENVOY_VERSION}/envoy-{ENVOY_VERSION}-{slug}.tar.xz"
+    )
+
+    os.makedirs(PLANO_BIN_DIR, exist_ok=True)
+
+    # Only the name is needed; the file is reopened by _download_file and
+    # removed in the finally block below.
+    with tempfile.NamedTemporaryFile(suffix=".tar.xz", delete=False) as tmp:
+        tmp_path = tmp.name
+
+    try:
+        _download_file(url, tmp_path, label=f"Envoy {ENVOY_VERSION}")
+        log.info(f"Extracting Envoy {ENVOY_VERSION}...")
+        with tarfile.open(tmp_path, "r:xz") as tar:
+            # Find the envoy binary inside the archive
+            envoy_member = None
+            for member in tar.getmembers():
+                if member.name.endswith("/bin/envoy") or member.name == "bin/envoy":
+                    envoy_member = member
+                    break
+
+            if envoy_member is None:
+                print("Error: Could not find envoy binary in the downloaded archive.")
+                print("Archive contents:")
+                for member in tar.getmembers():
+                    print(f"  {member.name}")
+                sys.exit(1)
+
+            # Extract just the binary
+            src = tar.extractfile(envoy_member)
+            if src is None:
+                print("Error: Could not extract envoy binary from archive.")
+                sys.exit(1)
+
+            # Stream to disk rather than buffering the whole binary in
+            # memory, and close the archive member handle afterwards.
+            with src, open(envoy_path, "wb") as out:
+                shutil.copyfileobj(src, out)
+
+        os.chmod(envoy_path, 0o755)
+        # Written last, so an interrupted extraction is never recorded as a
+        # valid cached version.
+        with open(version_path, "w") as f:
+            f.write(ENVOY_VERSION)
+        return envoy_path
+
+    finally:
+        if os.path.exists(tmp_path):
+            os.unlink(tmp_path)
+
+
+def _find_local_wasm_plugins():
+    """Look for source-built WASM plugins in the repository checkout.
+
+    Returns a ``(prompt_gateway, llm_gateway)`` path tuple when both files
+    exist under ``crates/target/wasm32-wasip1/release``; otherwise ``None``
+    (also when no repository root is found).
+    """
+    root = find_repo_root()
+    if not root:
+        return None
+    release_dir = os.path.join(root, "crates", "target", "wasm32-wasip1", "release")
+    candidates = tuple(
+        os.path.join(release_dir, filename)
+        for filename in ("prompt_gateway.wasm", "llm_gateway.wasm")
+    )
+    # Both plugins must be present; one alone is treated as "not built".
+    if all(os.path.exists(candidate) for candidate in candidates):
+        return candidates
+    return None
+
+
+def _find_local_brightstaff():
+    """Look for a source-built brightstaff binary in the repository checkout.
+
+    Returns the path when ``crates/target/release/brightstaff`` exists and is
+    executable; otherwise ``None`` (also when no repository root is found).
+    """
+    root = find_repo_root()
+    if root:
+        candidate = os.path.join(root, "crates", "target", "release", "brightstaff")
+        if os.path.exists(candidate) and os.access(candidate, os.X_OK):
+            return candidate
+    return None
+
+
+def ensure_wasm_plugins():
+    """Find or download WASM plugins. Checks: local build → cached download → fresh download.
+
+    Returns:
+        ``(prompt_gateway_path, llm_gateway_path)``.
+
+    Cached plugins in PLANO_PLUGINS_DIR are reused only when ``wasm.version``
+    equals ``planoai.__version__``; otherwise both plugins are downloaded
+    again from the release matching that version.
+    """
+    # 1. Local source build (inside repo)
+    local = _find_local_wasm_plugins()
+    if local:
+        log.info("Using locally-built WASM plugins")
+        return local
+
+    # 2. Cached download
+    version = planoai.__version__
+    version_path = os.path.join(PLANO_PLUGINS_DIR, "wasm.version")
+    prompt_gw_path = os.path.join(PLANO_PLUGINS_DIR, "prompt_gateway.wasm")
+    llm_gw_path = os.path.join(PLANO_PLUGINS_DIR, "llm_gateway.wasm")
+
+    if os.path.exists(prompt_gw_path) and os.path.exists(llm_gw_path):
+        if os.path.exists(version_path):
+            with open(version_path, "r") as f:
+                cached_version = f.read().strip()
+            if cached_version == version:
+                log.info(f"WASM plugins {version} (cached)")
+                return prompt_gw_path, llm_gw_path
+            log.info(
+                f"WASM plugins version changed ({cached_version} → {version}), re-downloading..."
+            )
+        else:
+            log.info("WASM plugins found (unknown version, re-downloading...)")
+
+    # 3. Download from GitHub releases (gzipped)
+    os.makedirs(PLANO_PLUGINS_DIR, exist_ok=True)
+
+    for name, dest in [
+        ("prompt_gateway.wasm", prompt_gw_path),
+        ("llm_gateway.wasm", llm_gw_path),
+    ]:
+        gz_name = f"{name}.gz"
+        url = f"{PLANO_RELEASE_BASE_URL}/{version}/{gz_name}"
+        gz_dest = dest + ".gz"
+        try:
+            _download_file(url, gz_dest, label=f"{name} ({version})")
+            log.info(f"Decompressing {name}...")
+            with gzip.open(gz_dest, "rb") as f_in, open(dest, "wb") as f_out:
+                shutil.copyfileobj(f_in, f_out)
+        finally:
+            # Remove the archive even when the download exits or the
+            # decompression fails, so no partial .gz is left behind.
+            if os.path.exists(gz_dest):
+                os.unlink(gz_dest)
+
+    # Written last, so a failed download is never recorded as a valid cache.
+    with open(version_path, "w") as f:
+        f.write(version)
+
+    return prompt_gw_path, llm_gw_path
+
+
+def ensure_brightstaff_binary():
+    """Find or download brightstaff binary. Checks: local build → cached download → fresh download.
+
+    Returns:
+        Path to the brightstaff binary.
+
+    A cached binary in PLANO_BIN_DIR is reused only when it is executable and
+    ``brightstaff.version`` equals ``planoai.__version__``; otherwise the
+    binary for the current platform is downloaded again.
+    """
+    # 1. Local source build (inside repo)
+    local = _find_local_brightstaff()
+    if local:
+        log.info("Using locally-built brightstaff")
+        return local
+
+    # 2. Cached download
+    version = planoai.__version__
+    brightstaff_path = os.path.join(PLANO_BIN_DIR, "brightstaff")
+    version_path = os.path.join(PLANO_BIN_DIR, "brightstaff.version")
+
+    if os.path.exists(brightstaff_path) and os.access(brightstaff_path, os.X_OK):
+        if os.path.exists(version_path):
+            with open(version_path, "r") as f:
+                cached_version = f.read().strip()
+            if cached_version == version:
+                log.info(f"brightstaff {version} (cached)")
+                return brightstaff_path
+            log.info(
+                f"brightstaff version changed ({cached_version} → {version}), re-downloading..."
+            )
+        else:
+            log.info("brightstaff found (unknown version, re-downloading...)")
+
+    # 3. Download from GitHub releases (gzipped)
+    slug = _get_platform_slug()
+    filename = f"brightstaff-{slug}.gz"
+    url = f"{PLANO_RELEASE_BASE_URL}/{version}/{filename}"
+
+    os.makedirs(PLANO_BIN_DIR, exist_ok=True)
+
+    gz_path = brightstaff_path + ".gz"
+    try:
+        _download_file(url, gz_path, label=f"brightstaff ({version}, {slug})")
+        log.info("Decompressing brightstaff...")
+        with gzip.open(gz_path, "rb") as f_in, open(brightstaff_path, "wb") as f_out:
+            shutil.copyfileobj(f_in, f_out)
+    finally:
+        # Remove the archive even when the download exits or the
+        # decompression fails, so no partial .gz is left behind.
+        if os.path.exists(gz_path):
+            os.unlink(gz_path)
+
+    os.chmod(brightstaff_path, 0o755)
+    # Written last, so a failed download is never recorded as a valid cache.
+    with open(version_path, "w") as f:
+        f.write(version)
+    return brightstaff_path
+
+
+def find_wasm_plugins():
+    """Locate the source-built WASM plugins, exiting if they are missing.
+
+    Returns ``(prompt_gateway_path, llm_gateway_path)``. Prints an error and
+    exits with status 1 when the repository root cannot be found or either
+    plugin file does not exist.
+    """
+    root = find_repo_root()
+    if not root:
+        print(
+            "Error: Could not find repository root. "
+            "Make sure you're inside the plano repository."
+        )
+        sys.exit(1)
+
+    wasm_dir = os.path.join(root, "crates", "target", "wasm32-wasip1", "release")
+    plugin_names = ("prompt_gateway.wasm", "llm_gateway.wasm")
+    plugin_paths = [os.path.join(wasm_dir, plugin) for plugin in plugin_names]
+
+    # Report every missing plugin at once rather than stopping at the first.
+    absent = [
+        plugin
+        for plugin, location in zip(plugin_names, plugin_paths)
+        if not os.path.exists(location)
+    ]
+    if absent:
+        print(f"Error: WASM plugins not found: {', '.join(absent)}")
+        print(f"  Expected at: {wasm_dir}/")
+        print("  Run 'planoai build' first to build them.")
+        sys.exit(1)
+
+    prompt_gw, llm_gw = plugin_paths
+    return prompt_gw, llm_gw
+
+
+def find_brightstaff_binary():
+    """Locate the source-built brightstaff binary, exiting if it is missing.
+
+    Returns the path to ``crates/target/release/brightstaff`` inside the
+    repository. Prints an error and exits with status 1 when the repository
+    root cannot be found or the binary does not exist.
+    """
+    root = find_repo_root()
+    if not root:
+        print(
+            "Error: Could not find repository root. "
+            "Make sure you're inside the plano repository."
+        )
+        sys.exit(1)
+
+    binary = os.path.join(root, "crates", "target", "release", "brightstaff")
+    if os.path.exists(binary):
+        return binary
+
+    print(f"Error: brightstaff binary not found at {binary}")
+    print("  Run 'planoai build' first to build it.")
+    sys.exit(1)
--- a/cli/planoai/native_runner.py
+++ b/cli/planoai/native_runner.py
@ -0,0 +1,463 @@
+import contextlib
+import io
+import json
+import os
+import signal
+import subprocess
+import sys
+import time
+
+from planoai.consts import (
+    NATIVE_PID_FILE,
+    PLANO_RUN_DIR,
+)
+from planoai.docker_cli import health_check_endpoint
+from planoai.native_binaries import (
+    ensure_brightstaff_binary,
+    ensure_envoy_binary,
+    ensure_wasm_plugins,
+)
+from planoai.utils import find_repo_root, getLogger
+
+log = getLogger(__name__)
+
+
+def _find_config_dir():
+    """Return the directory that holds plano_config_schema.yaml and the envoy template.
+
+    The ``data`` directory inside the installed ``planoai`` package wins when
+    it contains the schema file; otherwise the ``config`` directory of the
+    repository checkout is used. If neither exists, an error is printed and
+    the process exits with status 1.
+    """
+    import planoai
+
+    packaged = os.path.join(os.path.dirname(planoai.__file__), "data")
+    packaged_schema = os.path.join(packaged, "plano_config_schema.yaml")
+    if os.path.isdir(packaged) and os.path.exists(packaged_schema):
+        return packaged
+
+    root = find_repo_root()
+    checkout_config = os.path.join(root, "config") if root else None
+    if checkout_config and os.path.isdir(checkout_config):
+        return checkout_config
+
+    print(
+        "Error: Could not find config templates. "
+        "Make sure you're inside the plano repository or have the planoai package installed."
+    )
+    sys.exit(1)
+
+
+@contextlib.contextmanager
+def _temporary_env(overrides):
+    """Context manager that sets env vars from *overrides* and restores originals on exit.
+
+    Args:
+        overrides: Mapping of variable name to string value.
+
+    Variables that did not exist before are removed again on exit; the
+    others get their previous value back. Restoration also happens when
+    applying the overrides fails part-way (for example on a non-string
+    value), so no partial state leaks into ``os.environ``.
+    """
+    saved = {}
+    try:
+        # Applied inside the try block so the finally clause can undo a
+        # partially applied set of overrides.
+        for key, value in overrides.items():
+            saved[key] = os.environ.get(key)
+            os.environ[key] = value
+        yield
+    finally:
+        for key, original in saved.items():
+            if original is None:
+                os.environ.pop(key, None)
+            else:
+                os.environ[key] = original
+
+
+def render_native_config(plano_config_file, env, with_tracing=False):
+    """Render envoy and plano configs for native mode.
+
+    Args:
+        plano_config_file: Path to the user's Plano config file.
+        env: Mapping of caller environment variables (API keys, OTEL
+            endpoint, ...) made visible during rendering and substitution.
+        with_tracing: When true and the config has no truthy
+            ``tracing.random_sampling``, a copy of the config with
+            ``random_sampling: 100`` is written to PLANO_RUN_DIR and rendered
+            instead of the original.
+
+    Returns:
+        ``(envoy_config_path, plano_config_rendered_path)``, both inside
+        PLANO_RUN_DIR.
+    """
+    import yaml
+
+    os.makedirs(PLANO_RUN_DIR, exist_ok=True)
+
+    prompt_gw_path, llm_gw_path = ensure_wasm_plugins()
+
+    # If --with-tracing, inject tracing config if not already present
+    effective_config_file = os.path.abspath(plano_config_file)
+    if with_tracing:
+        with open(plano_config_file, "r") as f:
+            # safe_load returns None for an empty document
+            config_data = yaml.safe_load(f) or {}
+        # A bare "tracing:" key loads as None, so fall back to an empty dict
+        tracing = config_data.get("tracing") or {}
+        if not tracing.get("random_sampling"):
+            tracing["random_sampling"] = 100
+            config_data["tracing"] = tracing
+            effective_config_file = os.path.join(
+                PLANO_RUN_DIR, "config_with_tracing.yaml"
+            )
+            with open(effective_config_file, "w") as f:
+                yaml.dump(config_data, f, default_flow_style=False)
+
+    envoy_config_path = os.path.join(PLANO_RUN_DIR, "envoy.yaml")
+    plano_config_rendered_path = os.path.join(
+        PLANO_RUN_DIR, "plano_config_rendered.yaml"
+    )
+
+    # Set environment variables that config_generator.validate_and_render_schema() reads
+    config_dir = _find_config_dir()
+    overrides = {
+        "PLANO_CONFIG_FILE": effective_config_file,
+        "PLANO_CONFIG_SCHEMA_FILE": os.path.join(
+            config_dir, "plano_config_schema.yaml"
+        ),
+        "TEMPLATE_ROOT": config_dir,
+        "ENVOY_CONFIG_TEMPLATE_FILE": "envoy.template.yaml",
+        "PLANO_CONFIG_FILE_RENDERED": plano_config_rendered_path,
+        "ENVOY_CONFIG_FILE_RENDERED": envoy_config_path,
+    }
+
+    # Also propagate caller env vars (API keys, OTEL endpoint, etc.);
+    # the rendering paths above take precedence over same-named caller keys.
+    for key, value in env.items():
+        if key not in overrides:
+            overrides[key] = value
+
+    with _temporary_env(overrides):
+        from planoai.config_generator import validate_and_render_schema
+
+        # Suppress verbose print output from config_generator
+        with contextlib.redirect_stdout(io.StringIO()):
+            validate_and_render_schema()
+
+    # Post-process envoy.yaml: replace Docker WASM plugin paths with local paths
+    with open(envoy_config_path, "r") as f:
+        envoy_content = f.read()
+
+    envoy_content = envoy_content.replace(
+        "/etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm", prompt_gw_path
+    )
+    envoy_content = envoy_content.replace(
+        "/etc/envoy/proxy-wasm-plugins/llm_gateway.wasm", llm_gw_path
+    )
+
+    # Replace /var/log/ paths with local log directory (non-root friendly)
+    log_dir = os.path.join(PLANO_RUN_DIR, "logs")
+    os.makedirs(log_dir, exist_ok=True)
+    envoy_content = envoy_content.replace("/var/log/", log_dir + "/")
+
+    # Replace Linux CA cert path with platform-appropriate path
+    import platform
+
+    if platform.system() == "Darwin":
+        envoy_content = envoy_content.replace(
+            "/etc/ssl/certs/ca-certificates.crt", "/etc/ssl/cert.pem"
+        )
+
+    with open(envoy_config_path, "w") as f:
+        f.write(envoy_content)
+
+    # Run envsubst-equivalent on both rendered files using the caller's env.
+    # Unlike envsubst, os.path.expandvars leaves references to unset
+    # variables unchanged instead of replacing them with an empty string.
+    with _temporary_env(env):
+        for filepath in [envoy_config_path, plano_config_rendered_path]:
+            with open(filepath, "r") as f:
+                content = f.read()
+            content = os.path.expandvars(content)
+            with open(filepath, "w") as f:
+                f.write(content)
+
+    return envoy_config_path, plano_config_rendered_path
+
+
+def start_native(plano_config_file, env, foreground=False, with_tracing=False):
+    """Start Envoy and brightstaff natively.
+
+    Args:
+        plano_config_file: Path to the user's Plano config file.
+        env: Mapping of extra environment variables (API keys, LOG_LEVEL,
+            ...) used while rendering config and passed to both daemons.
+        foreground: When true, tail the log files until Ctrl+C and then
+            stop both daemons; otherwise return once they are healthy.
+        with_tracing: Forwarded to render_native_config().
+
+    Exits with status 1 if either daemon dies during startup or the
+    listeners do not become healthy within 60 seconds.
+    """
+    from planoai.core import _get_gateway_ports
+
+    # Stop any existing instance first
+    if os.path.exists(NATIVE_PID_FILE):
+        log.info("Stopping existing Plano instance...")
+        stop_native()
+
+    envoy_path = ensure_envoy_binary()
+    # render_native_config() resolves the plugins again; this call makes
+    # sure they are available before the brightstaff binary is fetched.
+    ensure_wasm_plugins()
+    brightstaff_path = ensure_brightstaff_binary()
+    envoy_config_path, plano_config_rendered_path = render_native_config(
+        plano_config_file, env, with_tracing=with_tracing
+    )
+
+    log.info("Configuration rendered")
+
+    log_dir = os.path.join(PLANO_RUN_DIR, "logs")
+    os.makedirs(log_dir, exist_ok=True)
+
+    log_level = env.get("LOG_LEVEL", "info")
+
+    # Start brightstaff
+    brightstaff_env = os.environ.copy()
+    brightstaff_env["RUST_LOG"] = log_level
+    brightstaff_env["PLANO_CONFIG_PATH_RENDERED"] = plano_config_rendered_path
+    # Propagate API keys and other env vars
+    for key, value in env.items():
+        brightstaff_env[key] = value
+
+    brightstaff_pid = _daemon_exec(
+        [brightstaff_path],
+        brightstaff_env,
+        os.path.join(log_dir, "brightstaff.log"),
+    )
+    log.info(f"Started brightstaff (PID {brightstaff_pid})")
+
+    # Start envoy (it is given the same environment as brightstaff)
+    try:
+        envoy_pid = _daemon_exec(
+            [
+                envoy_path,
+                "-c",
+                envoy_config_path,
+                "--component-log-level",
+                f"wasm:{log_level}",
+                "--log-format",
+                "[%Y-%m-%d %T.%e][%l] %v",
+            ],
+            brightstaff_env,
+            os.path.join(log_dir, "envoy.log"),
+        )
+    except Exception:
+        # No PID file has been written yet, so nothing else could find
+        # brightstaff again; do not leave it running without envoy.
+        _kill_pid(brightstaff_pid)
+        raise
+    log.info(f"Started envoy (PID {envoy_pid})")
+
+    # Save PIDs
+    os.makedirs(PLANO_RUN_DIR, exist_ok=True)
+    with open(NATIVE_PID_FILE, "w") as f:
+        json.dump(
+            {
+                "envoy_pid": envoy_pid,
+                "brightstaff_pid": brightstaff_pid,
+            },
+            f,
+        )
+
+    # Health check
+    gateway_ports = _get_gateway_ports(plano_config_file)
+    log.info("Waiting for listeners to become healthy...")
+
+    start_time = time.time()
+    timeout = 60
+    while True:
+        all_healthy = True
+        for port in gateway_ports:
+            if not health_check_endpoint(f"http://localhost:{port}/healthz"):
+                all_healthy = False
+
+        if all_healthy:
+            log.info("Plano is running (native mode)")
+            for port in gateway_ports:
+                log.info(f"  http://localhost:{port}")
+            break
+
+        # Check if processes are still alive. stop_native() terminates the
+        # surviving daemon and removes the PID file, so no stale PIDs are
+        # left behind for a later 'up' or 'down' to signal.
+        if not _is_pid_alive(brightstaff_pid):
+            log.error("brightstaff exited unexpectedly")
+            log.error(f"  Check logs: {os.path.join(log_dir, 'brightstaff.log')}")
+            stop_native()
+            sys.exit(1)
+
+        if not _is_pid_alive(envoy_pid):
+            log.error("envoy exited unexpectedly")
+            log.error(f"  Check logs: {os.path.join(log_dir, 'envoy.log')}")
+            stop_native()
+            sys.exit(1)
+
+        if time.time() - start_time > timeout:
+            log.error(f"Health check timed out after {timeout}s")
+            log.error(f"  Check logs in: {log_dir}")
+            stop_native()
+            sys.exit(1)
+
+        time.sleep(1)
+
+    if foreground:
+        log.info("Running in foreground. Press Ctrl+C to stop.")
+        log.info(f"Logs: {log_dir}")
+        # Bound before the try block so the handler below can always test it,
+        # even if Ctrl+C arrives before Popen() returns.
+        tail_proc = None
+        try:
+            import glob
+
+            access_logs = sorted(glob.glob(os.path.join(log_dir, "access_*.log")))
+            tail_proc = subprocess.Popen(
+                [
+                    "tail",
+                    "-f",
+                    os.path.join(log_dir, "envoy.log"),
+                    os.path.join(log_dir, "brightstaff.log"),
+                ]
+                + access_logs,
+                stdout=sys.stdout,
+                stderr=sys.stderr,
+            )
+            tail_proc.wait()
+        except KeyboardInterrupt:
+            log.info("Stopping Plano...")
+            if tail_proc is not None and tail_proc.poll() is None:
+                tail_proc.terminate()
+            stop_native()
+    else:
+        log.info(f"Logs: {log_dir}")
+        log.info("Run 'planoai down' to stop.")
+
+
+def _daemon_exec(args, env, log_path):
+    """Start a fully daemonized process via double-fork. Returns the child PID.
+
+    Args:
+        args: Command line; ``args[0]`` is the path of the executable.
+        env: Environment mapping for the new process.
+        log_path: File that receives the daemon's stdout and stderr
+            (truncated on every start).
+
+    Returns:
+        PID of the daemon (the grandchild of the calling process).
+
+    Raises:
+        RuntimeError: If the daemon PID is not reported within 5 seconds.
+    """
+    log_fd = os.open(log_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o644)
+
+    pid = os.fork()
+    if pid > 0:
+        # Parent: close our copy of the log fd and wait for intermediate child
+        os.close(log_fd)
+        os.waitpid(pid, 0)
+        # The intermediate child reports the grandchild PID through a file
+        # named after its own PID.
+        grandchild_pid_path = os.path.join(PLANO_RUN_DIR, f".daemon_pid_{pid}")
+        deadline = time.time() + 5
+        while time.time() < deadline:
+            if os.path.exists(grandchild_pid_path):
+                with open(grandchild_pid_path, "r") as f:
+                    grandchild_pid = int(f.read().strip())
+                os.unlink(grandchild_pid_path)
+                return grandchild_pid
+            time.sleep(0.05)
+        raise RuntimeError(f"Timed out waiting for daemon PID from {args[0]}")
+
+    # From here on this is a forked copy of the CLI process. It must never
+    # return or raise into the caller's stack (that would run the rest of the
+    # CLI a second time), so every path ends in os._exit() or execve().
+    try:
+        # First child: create new session and fork again
+        os.setsid()
+        grandchild_pid = os.fork()
+        if grandchild_pid > 0:
+            # Intermediate child: write grandchild PID and exit
+            pid_path = os.path.join(PLANO_RUN_DIR, f".daemon_pid_{os.getpid()}")
+            with open(pid_path, "w") as f:
+                f.write(str(grandchild_pid))
+            os._exit(0)
+
+        # Grandchild: this is the actual daemon
+        os.dup2(log_fd, 1)  # stdout -> log
+        os.dup2(log_fd, 2)  # stderr -> log
+        os.close(log_fd)
+        # Close stdin
+        devnull = os.open(os.devnull, os.O_RDONLY)
+        os.dup2(devnull, 0)
+        os.close(devnull)
+
+        # Replaces this process image; only returns (by raising) on failure.
+        os.execve(args[0], args, env)
+    except BaseException as exc:
+        # Best effort: leave a trace on stderr, which is the log file once
+        # the redirection above has happened.
+        try:
+            os.write(2, f"planoai: failed to start {args[0]}: {exc}\n".encode())
+        except OSError:
+            pass
+    # Only reached on failure; exit without running the parent's cleanup
+    # handlers or flushing its inherited buffers.
+    os._exit(127)
+
+
+def _is_pid_alive(pid):
+    """Report whether a process with the given PID currently exists.
+
+    Signal 0 is sent, which only performs the existence and permission
+    checks without delivering anything to the process.
+    """
+    alive = True
+    try:
+        os.kill(pid, 0)
+    except ProcessLookupError:
+        alive = False
+    except PermissionError:
+        # The process exists, we just are not allowed to signal it.
+        pass
+    return alive
+
+
+def _kill_pid(pid):
+    """Ask the process *pid* to terminate (SIGTERM); never raises if it is gone or not ours."""
+    with contextlib.suppress(ProcessLookupError, PermissionError):
+        os.kill(pid, signal.SIGTERM)
+
+
+def stop_native():
+    """Stop natively-running Envoy and brightstaff processes.
+
+    Reads the PIDs from NATIVE_PID_FILE, sends SIGTERM to each process,
+    waits up to 10 seconds per process for it to exit, escalates to SIGKILL
+    if it has not, and finally removes the PID file. Does nothing when the
+    PID file is missing. An unreadable PID file is removed so it cannot
+    block later 'up' or 'down' runs.
+    """
+    if not os.path.exists(NATIVE_PID_FILE):
+        log.info("No native Plano instance found (PID file missing).")
+        return
+
+    try:
+        with open(NATIVE_PID_FILE, "r") as f:
+            pids = json.load(f)
+    except ValueError:
+        # json.JSONDecodeError is a ValueError: truncated or corrupt file
+        pids = None
+
+    if not isinstance(pids, dict):
+        log.error(f"PID file {NATIVE_PID_FILE} is corrupt; removing it.")
+        os.unlink(NATIVE_PID_FILE)
+        return
+
+    envoy_pid = pids.get("envoy_pid")
+    brightstaff_pid = pids.get("brightstaff_pid")
+
+    for name, pid in [("envoy", envoy_pid), ("brightstaff", brightstaff_pid)]:
+        if pid is None:
+            continue
+        try:
+            os.kill(pid, signal.SIGTERM)
+            log.info(f"Sent SIGTERM to {name} (PID {pid})")
+        except ProcessLookupError:
+            log.info(f"{name} (PID {pid}) already stopped")
+            continue
+        except PermissionError:
+            log.error(f"Permission denied stopping {name} (PID {pid})")
+            continue
+
+        # Wait for graceful shutdown
+        deadline = time.time() + 10
+        while time.time() < deadline:
+            try:
+                os.kill(pid, 0)  # Check if still alive
+                time.sleep(0.5)
+            except ProcessLookupError:
+                break
+        else:
+            # Still alive after timeout, force kill
+            try:
+                os.kill(pid, signal.SIGKILL)
+                log.info(f"Sent SIGKILL to {name} (PID {pid})")
+            except ProcessLookupError:
+                pass
+            except PermissionError:
+                log.error(f"Permission denied killing {name} (PID {pid})")
+
+    os.unlink(NATIVE_PID_FILE)
+    log.info("Plano stopped (native mode).")
+
+
+def native_validate_config(plano_config_file):
+    """Validate *plano_config_file* in-process, without Docker.
+
+    Runs config_generator.validate_and_render_schema() with the environment
+    variables it reads pointed at the given config, the bundled schema and
+    template, and output paths inside PLANO_RUN_DIR. Its stdout is discarded.
+    """
+    templates_dir = _find_config_dir()
+
+    # The rendered files are only a by-product here; they go to the run dir.
+    os.makedirs(PLANO_RUN_DIR, exist_ok=True)
+
+    schema_file = os.path.join(templates_dir, "plano_config_schema.yaml")
+    rendered_plano = os.path.join(PLANO_RUN_DIR, "plano_config_rendered.yaml")
+    rendered_envoy = os.path.join(PLANO_RUN_DIR, "envoy.yaml")
+
+    overrides = dict(
+        PLANO_CONFIG_FILE=os.path.abspath(plano_config_file),
+        PLANO_CONFIG_SCHEMA_FILE=schema_file,
+        TEMPLATE_ROOT=templates_dir,
+        ENVOY_CONFIG_TEMPLATE_FILE="envoy.template.yaml",
+        PLANO_CONFIG_FILE_RENDERED=rendered_plano,
+        ENVOY_CONFIG_FILE_RENDERED=rendered_envoy,
+    )
+
+    with _temporary_env(overrides):
+        from planoai.config_generator import validate_and_render_schema
+
+        # Swallow the generator's verbose print output
+        sink = io.StringIO()
+        with contextlib.redirect_stdout(sink):
+            validate_and_render_schema()
+
+
+def native_logs(debug=False, follow=False):
+    """Stream logs from native-mode Plano.
+
+    Args:
+        debug: Also include envoy.log and brightstaff.log, not only the
+            ``access_*.log`` files.
+        follow: Pass ``-f`` to ``tail`` so new lines keep streaming.
+
+    Exits with status 1 when the log directory or all log files are missing.
+    """
+    import glob as glob_mod
+
+    log_dir = os.path.join(PLANO_RUN_DIR, "logs")
+    if not os.path.isdir(log_dir):
+        log.error(f"No native log directory found at {log_dir}")
+        log.error("Is Plano running? Start it with: planoai up <config.yaml>")
+        sys.exit(1)
+
+    log_files = sorted(glob_mod.glob(os.path.join(log_dir, "access_*.log")))
+    if debug:
+        log_files.extend(
+            [
+                os.path.join(log_dir, "envoy.log"),
+                os.path.join(log_dir, "brightstaff.log"),
+            ]
+        )
+
+    # Filter to files that exist
+    log_files = [f for f in log_files if os.path.exists(f)]
+    if not log_files:
+        log.error(f"No log files found in {log_dir}")
+        sys.exit(1)
+
+    tail_args = ["tail"]
+    if follow:
+        tail_args.append("-f")
+    tail_args.extend(log_files)
+
+    # Bound before the try block so the handler can always test it, even if
+    # Ctrl+C arrives before Popen() returns.
+    proc = None
+    try:
+        proc = subprocess.Popen(tail_args, stdout=sys.stdout, stderr=sys.stderr)
+        proc.wait()
+    except KeyboardInterrupt:
+        if proc is not None and proc.poll() is None:
+            proc.terminate()
--- a/cli/planoai/templates/coding_agent_routing.yaml
+++ b/cli/planoai/templates/coding_agent_routing.yaml
@ -23,7 +23,7 @@ model_providers:

  # Ollama Models
  - model: ollama/llama3.1
-    base_url: http://host.docker.internal:11434
+    base_url: http://localhost:11434


 # Model aliases - friendly names that map to actual provider names
--- a/cli/planoai/trace_cmd.py
+++ b/cli/planoai/trace_cmd.py
@ -2,8 +2,11 @@ import json
 import os
 import re
 import string
+import subprocess
+import sys
 import threading
 import time
+from http import HTTPStatus
 from collections import OrderedDict
 from concurrent import futures
 from dataclasses import dataclass
@ -22,6 +25,7 @@ from rich.text import Text
 from rich.tree import Tree

 from planoai.consts import PLANO_COLOR
+from planoai import trace_listener_runtime

 DEFAULT_GRPC_PORT = 4317
 MAX_TRACES = 50
@ -35,7 +39,7 @@ class TraceListenerBindError(RuntimeError):
 def _trace_listener_bind_error_message(address: str) -> str:
+    # Builds the user-facing text for an "address already in use" failure
+    # when starting the OTLP listener on *address*.
    return (
        f"Failed to start OTLP listener on {address}: address is already in use.\n"
-        "Stop the process using that port or run `planoai trace listen --port <PORT>`."
+        "Stop the process using that port or run `planoai trace listen`."
    )


@ -57,6 +61,25 @@ class TraceSummary:
        return dt.astimezone().strftime("%Y-%m-%d %H:%M:%S")


+def _is_port_in_use(host: str, port: int) -> bool:
+    """Return True when a TCP connection to host:port succeeds within 0.2 seconds."""
+    import socket
+
+    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    try:
+        probe.settimeout(0.2)
+        # connect_ex reports failure as an error code instead of raising;
+        # 0 means the connection was accepted.
+        status = probe.connect_ex((host, port))
+    finally:
+        probe.close()
+    return status == 0
+
+
+def _get_listener_pid() -> int | None:
+    """Return persisted listener PID if process is alive."""
+    # Thin wrapper: the lookup is delegated entirely to trace_listener_runtime.
+    # NOTE(review): the liveness check is assumed to happen inside
+    # get_listener_pid(); confirm there.
+    return trace_listener_runtime.get_listener_pid()
+
+
+def _stop_background_listener() -> bool:
+    """Stop persisted listener process if one is running."""
+    # Thin wrapper: the work is delegated entirely to trace_listener_runtime.
+    # NOTE(review): presumably the returned bool says whether a process was
+    # stopped; confirm against stop_listener_process().
+    return trace_listener_runtime.stop_listener_process()
+
+
 def _parse_filter_patterns(filter_patterns: tuple[str, ...]) -> list[str]:
    parts: list[str] = []
    for raw in filter_patterns:
@ -437,8 +460,6 @@ class _OTLPTraceServicer(trace_service_pb2_grpc.TraceServiceServicer):
    """gRPC servicer that receives OTLP ExportTraceServiceRequest and
    merges incoming spans into the global _TRACE_STORE by trace_id."""

-    _console = Console(stderr=True)
-
    def Export(self, request, context):  # noqa: N802
        for resource_spans in request.resource_spans:
            service_name = "unknown"
@ -456,27 +477,6 @@ class _OTLPTraceServicer(trace_service_pb2_grpc.TraceServiceServicer):
                        continue
                    span_dict = _proto_span_to_dict(span, service_name)
                    _TRACE_STORE.merge_spans(trace_id, [span_dict])
-                    short_id = trace_id[:8]
-                    short_span = span.span_id.hex()[:8]
-                    span_start = (
-                        datetime.fromtimestamp(
-                            span.start_time_unix_nano / 1_000_000_000, tz=timezone.utc
-                        )
-                        .astimezone()
-                        .strftime("%H:%M:%S.%f")[:-3]
-                    )
-                    dur_ns = span.end_time_unix_nano - span.start_time_unix_nano
-                    dur_s = dur_ns / 1_000_000_000
-                    dur_str = f"{dur_s:.3f}".rstrip("0").rstrip(".")
-                    dur_str = f"{dur_str}s"
-                    self._console.print(
-                        f"[dim]{span_start}[/dim], "
-                        f"trace=[yellow]{short_id}[/yellow], "
-                        f"span=[yellow]{short_span}[/yellow], "
-                        f"[bold {_service_color(service_name)}]{service_name}[/bold {_service_color(service_name)}] "
-                        f"[cyan]{span.name}[/cyan] "
-                        f"[dim]({dur_str})[/dim]"
-                    )

        return trace_service_pb2.ExportTraceServiceResponse()

@ -499,12 +499,8 @@ class _TraceQueryHandler(grpc.GenericRpcHandler):
        return json.dumps({"traces": traces}, separators=(",", ":")).encode("utf-8")


-def _create_trace_server(host: str, grpc_port: int) -> grpc.Server:
-    """Create, bind, and start an OTLP/gRPC trace-collection server.
-
-    Returns the running ``grpc.Server``.  The caller is responsible
-    for calling ``server.stop()`` when done.
-    """
+def _start_trace_server(host: str, grpc_port: int) -> grpc.Server:
+    """Create, bind, and start an OTLP/gRPC trace server."""
    grpc_server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=4),
        handlers=[_TraceQueryHandler()],
@ -525,38 +521,88 @@ def _create_trace_server(host: str, grpc_port: int) -> grpc.Server:
    return grpc_server


-def _start_trace_listener(host: str, grpc_port: int) -> None:
-    """Start the OTLP/gRPC listener and block until interrupted."""
-    console = Console()
-    try:
-        grpc_server = _create_trace_server(host, grpc_port)
-    except TraceListenerBindError as exc:
-        raise click.ClickException(str(exc)) from exc
+def _serve_trace_listener(host: str, grpc_port: int) -> None:
+    """Run the trace server in the current process until it terminates.
+
+    Writes the current PID via ``trace_listener_runtime.write_listener_pid``,
+    starts the server with ``_start_trace_server`` and blocks in
+    ``wait_for_termination()``. On the way out (normal return,
+    ``KeyboardInterrupt`` or a startup failure) the server, if one was
+    created, is stopped with a 2 second grace period and the PID file is
+    removed. Exceptions other than ``KeyboardInterrupt`` still propagate.
+    """
+    # Persist the PID before server startup so the PID file exists even if
+    # server initialization fails.
+    trace_listener_runtime.write_listener_pid(os.getpid())
+    # Explicit sentinel: lets cleanup tell "startup failed" apart from
+    # "server running" without catching NameError as control flow.
+    grpc_server = None

-    console.print()
-    console.print(f"[bold {PLANO_COLOR}]Listening for traces...[/bold {PLANO_COLOR}]")
-    console.print(
-        f"[green]●[/green] gRPC (OTLP receiver) on [cyan]{host}:{grpc_port}[/cyan]"
-    )
-    console.print("[dim]Press Ctrl+C to stop.[/dim]")
-    console.print()
     try:
+        grpc_server = _start_trace_server(host, grpc_port)
         grpc_server.wait_for_termination()
     except KeyboardInterrupt:
         pass
     finally:
-        grpc_server.stop(grace=2)
+        # Best-effort cleanup; the server does not exist if startup failed.
+        if grpc_server is not None:
+            grpc_server.stop(grace=2)
+        trace_listener_runtime.remove_listener_pid()
+
+
+def _start_trace_listener(host: str, grpc_port: int) -> None:
+    """Launch the OTLP/gRPC trace listener as a background daemon and report the outcome.
+
+    If the port is already occupied, prints a notice and returns without
+    forking. Otherwise forks through ``trace_listener_runtime.daemonize_and_run``
+    and, after a short wait, prints whether the port became bound.
+    """
+    console = Console()
+
+    # Nothing to start when something already occupies the requested port.
+    if _is_port_in_use(host, grpc_port):
+        tracked_pid = _get_listener_pid()
+        if not tracked_pid:
+            # Port is taken but no tracked listener PID exists: unknown conflict.
+            console.print(
+                f"[red]✗[/red] Port [cyan]{grpc_port}[/cyan] is already in use by another process"
+            )
+            console.print(f"\n[dim]Check what's using the port:[/dim]")
+            console.print(f"  [cyan]lsof -i :{grpc_port}[/cyan]")
+            return
+        # A tracked PID is known, so report our own listener as already running.
+        console.print(
+            f"[yellow]⚠[/yellow] Trace listener already running on port [cyan]{grpc_port}[/cyan] (PID: {tracked_pid})"
+        )
+        return
+
+    def _run_listener() -> None:
+        # Entry point executed inside the forked process.
+        _serve_trace_listener(host, grpc_port)
+
+    # Fork/daemonize and run the trace server in the background.
+    try:
+        pid = trace_listener_runtime.daemonize_and_run(_run_listener)
+    except OSError as exc:
+        console.print(f"[red]✗[/red] Failed to start trace listener: {exc}")
+        return
+
+    if pid is None:
+        # No child PID to report on; nothing further to do in this process.
+        return
+
+    # Parent process: give the child a moment to start and bind the port.
+    time.sleep(0.5)
+
+    if not _is_port_in_use(host, grpc_port):
+        # Failure: the port never became bound after the wait.
+        console.print(f"[red]✗[/red] Failed to start trace listener")
+        return
+
+    # Success: the trace listener started and bound the port.
+    console.print()
+    console.print(
+        f"[bold {PLANO_COLOR}]Trace listener started[/bold {PLANO_COLOR}]"
+    )
+    console.print(
+        f"[green]●[/green] gRPC (OTLP receiver) on [cyan]{host}:{grpc_port}[/cyan]"
+    )
+    console.print(f"[dim]Process ID: {pid}[/dim]")
+    console.print(
+        "[dim]Use [cyan]planoai trace[/cyan] to view collected traces.[/dim]"
+    )
+    console.print()


 def start_trace_listener_background(
     host: str = "0.0.0.0", grpc_port: int = DEFAULT_GRPC_PORT
 ) -> grpc.Server:
-    """Start the trace listener in the background (non-blocking).
-
-    Returns the running ``grpc.Server`` so the caller can call
-    ``server.stop()`` later.
-    """
-    return _create_trace_server(host, grpc_port)
+    """Start the trace server in-process and return ``grpc.Server`` handle.
+
+    Thin wrapper that forwards ``host`` and ``grpc_port`` to
+    ``_start_trace_server`` and returns its result unchanged. This function
+    does not wait on the server or stop it itself, so the returned handle is
+    what a caller would use to stop the server later.
+    """
+    return _start_trace_server(host, grpc_port)


 def _span_time_ns(span: dict[str, Any], key: str) -> int:
@ -584,13 +630,13 @@ def _trace_summary(trace: dict[str, Any]) -> TraceSummary:
 def _service_color(service: str) -> str:
    service = service.lower()
    if "inbound" in service:
-        return "white"
+        return "#4860fa"
    if "outbound" in service:
-        return "white"
+        return "#57d9a9"
    if "orchestrator" in service:
        return PLANO_COLOR
    if "routing" in service:
-        return "magenta"
+        return "#e3a2fa"
    if "agent" in service:
        return "cyan"
    if "llm" in service:
@ -598,6 +644,63 @@ def _service_color(service: str) -> str:
    return "white"


+def _error_symbol(status_code: str) -> str:
+    """Return an emoji summarising the class of an HTTP error status.
+
+    Args:
+        status_code: Status code as text. Anything that is not a plain
+            decimal number is treated as ``0``.
+
+    Returns:
+        A symbol for 5xx, for 429/404/403/401 individually, for any other
+        4xx, or the generic "unknown" symbol for everything else.
+    """
+    # isdecimal() (not isdigit()) so that every accepted string is also
+    # parseable by int(); isdigit() admits e.g. superscript digits, which
+    # make int() raise ValueError.
+    code = int(status_code) if status_code.isdecimal() else 0
+
+    if code >= 500:
+        return "💥"  # Server error - something broke
+
+    specific = {
+        429: "🚦",  # Rate limited
+        404: "🔍",  # Not found
+        403: "🚫",  # Forbidden
+        401: "🔐",  # Unauthorized
+    }
+    if code in specific:
+        return specific[code]
+
+    if code >= 400:
+        return "⚠️"  # Client error
+    return "❓"  # Generic error
+
+
+def _error_description(status_code: str) -> str:
+    """Return a developer-friendly description of the error.
+
+    Args:
+        status_code: Status code as text. Anything that is not a plain
+            decimal number is treated as ``0``.
+
+    Returns:
+        ``"Error"`` for codes below 400, the standard ``HTTPStatus`` phrase
+        for known codes, and ``"Server Error"`` / ``"Client Error"`` for
+        codes ``HTTPStatus`` does not define.
+    """
+    # isdecimal() (not isdigit()) so that every accepted string is also
+    # parseable by int(); isdigit() admits e.g. superscript digits, which
+    # make int() raise ValueError.
+    code = int(status_code) if status_code.isdecimal() else 0
+
+    if code < 400:
+        return "Error"
+    try:
+        return HTTPStatus(code).phrase
+    except ValueError:
+        # Not a registered status code; fall back on the status class.
+        return "Server Error" if code >= 500 else "Client Error"
+
+
+def _detect_error(span: dict[str, Any]) -> tuple[bool, str, str]:
+    """Report whether a span represents an error.
+
+    A span counts as an error when its ``http.status_code`` attribute is a
+    number >= 400, or when it carries an ``error.message`` or
+    ``exception.message`` attribute (checked in that order).
+
+    Returns:
+        tuple: ``(has_error, status_code, error_description)``. For
+        message-based errors the status code falls back to ``"unknown"``
+        when absent; for non-error spans the result is ``(False, "", "")``.
+    """
+    attrs = _attrs(span)
+    status_code = attrs.get("http.status_code", "")
+
+    # HTTP-level failure takes precedence over explicit error attributes.
+    if status_code and status_code.isdigit() and int(status_code) >= 400:
+        return True, status_code, _error_description(status_code)
+
+    # Otherwise fall back on explicit error attributes.
+    for message_key in ("error.message", "exception.message"):
+        if message_key in attrs:
+            return True, status_code or "unknown", attrs[message_key]
+
+    return False, "", ""
+
+
 # Attributes to show for inbound/outbound spans when not verbose (trimmed view).
 _INBOUND_OUTBOUND_ATTR_KEYS = (
    "http.method",
@ -621,10 +724,20 @@ def _trim_attrs_for_display(


 def _sorted_attr_items(attrs: dict[str, str]) -> list[tuple[str, str]]:
+    # Error attributes always come first
+    error_priority = [
+        "http.status_code",
+        "error.type",
+        "error.message",
+        "error.stack",
+        "exception.type",
+        "exception.message",
+    ]
+
+    # Then regular priority attributes
    priority = [
        "http.method",
        "http.target",
-        "http.status_code",
        "guid:x-request-id",
        "request_size",
        "response_size",
@ -641,7 +754,10 @@ def _sorted_attr_items(attrs: dict[str, str]) -> list[tuple[str, str]]:
        "llm.duration_ms",
        "llm.response_bytes",
    ]
-    prioritized = [(k, attrs[k]) for k in priority if k in attrs]
+
+    # Combine error priority with regular priority
+    full_priority = error_priority + priority
+    prioritized = [(k, attrs[k]) for k in full_priority if k in attrs]
    prioritized_keys = {k for k, _ in prioritized}
    remaining = [(k, v) for k, v in attrs.items() if k not in prioritized_keys]
    remaining.sort(key=lambda item: item[0])
@ -649,8 +765,14 @@ def _sorted_attr_items(attrs: dict[str, str]) -> list[tuple[str, str]]:


 def _display_attr_value(key: str, value: str) -> str:
-    if key == "http.status_code" and value != "200":
-        return f"{value} ⚠️"
+    """Return ``value`` as shown in the tree, decorating error status codes.
+
+    Only a numeric ``http.status_code`` of 400 or above is changed: it gets
+    the matching ``_error_symbol`` appended. Every other value is returned
+    as-is.
+    """
+    is_status = key == "http.status_code"
+    if is_status and value.isdigit() and int(value) >= 400:
+        return f"{value} {_error_symbol(value)}"
     return value


@ -670,7 +792,7 @@ def _build_tree(trace: dict[str, Any], console: Console, verbose: bool = False)
    )

    spans.sort(key=lambda s: _span_time_ns(s, "startTimeUnixNano"))
-    tree = Tree("", guide_style="dim")
+    tree = Tree("", guide_style="dim #5b5a5c bold")

    for span in spans:
        service = span.get("service", "plano(unknown)")
@ -678,22 +800,52 @@ def _build_tree(trace: dict[str, Any], console: Console, verbose: bool = False)
        offset_ms = max(
            0, (_span_time_ns(span, "startTimeUnixNano") - start_ns) / 1_000_000
        )
-        color = _service_color(service)
-        label = Text(f"{offset_ms:.0f}ms ", style="yellow")
-        label.append(service, style=f"bold {color}")
-        if name:
-            label.append(f" {name}", style="dim white")
+
+        # Check for errors in this span
+        has_error, error_code, error_desc = _detect_error(span)
+
+        if has_error:
+            # Create error banner above the span
+            error_banner = Text()
+            error_banner.append(error_desc, style="bright_red")
+            tree.add(error_banner)
+
+            # Style the span label in light red
+            label = Text(f"{offset_ms:.0f}ms ", style="#ff6b6b")
+            label.append(service, style="bold #ff6b6b")
+            if name:
+                label.append(f" {name}", style="#ff6b6b italic")
+        else:
+            # Normal styling
+            color = _service_color(service)
+            label = Text(f"{offset_ms:.0f}ms ", style="#949c99")
+            label.append(service, style=f"bold {color}")
+            if name:
+                label.append(f" {name}", style="dim white bold italic")

        node = tree.add(label)
        attrs = _trim_attrs_for_display(_attrs(span), service, verbose)
        sorted_items = list(_sorted_attr_items(attrs))
        for idx, (key, value) in enumerate(sorted_items):
            attr_line = Text()
-            attr_line.append(f"{key}: ", style="white")
-            attr_line.append(
-                _display_attr_value(key, str(value)),
-                style=f"{PLANO_COLOR}",
-            )
+            # attribute key
+            attr_line.append(f"{key}: ", style="#a4a9aa")
+            # attribute value
+            if key == "http.status_code" and value.isdigit():
+                val_int = int(value)
+                val_style = "bold red" if val_int >= 400 else "green"
+                attr_line.append(_display_attr_value(key, str(value)), style=val_style)
+            elif key in [
+                "error.message",
+                "exception.message",
+                "error.type",
+                "exception.type",
+            ]:
+                attr_line.append(_display_attr_value(key, str(value)), style="red")
+            else:
+                attr_line.append(
+                    _display_attr_value(key, str(value)), style=f"{PLANO_COLOR} bold"
+                )
            if idx == len(sorted_items) - 1:
                attr_line.append("\n")
            node.add(attr_line)
@ -904,7 +1056,7 @@ def _run_trace_show(
    _build_tree(trace_obj, console, verbose=verbose)


-@click.group(invoke_without_command=True)
+@click.command()
@click.argument("target", required=False)
@click.option(
    "--filter",
@ -950,9 +1102,8 @@ def trace(
    verbose,
 ):
    """Trace requests from the local OTLP listener."""
-    if ctx.invoked_subcommand:
-        return
-    if target == "listen" and not any(
+    # Handle operational shortcuts when invoked as target values.
+    has_show_options = any(
        [
            filter_patterns,
            where_filters,
@ -963,9 +1114,20 @@ def trace(
            json_out,
            verbose,
        ]
-    ):
+    )
+
+    if target == "listen" and not has_show_options:
        _start_trace_listener("0.0.0.0", DEFAULT_GRPC_PORT)
        return
+
+    if target in ("stop", "down") and not has_show_options:
+        console = Console()
+        if _stop_background_listener():
+            console.print(f"[green]✓[/green] Trace listener stopped")
+        else:
+            console.print(f"[dim]No background trace listener running[/dim]")
+        return
+
    _run_trace_show(
        target,
        filter_patterns,
@ -977,17 +1139,3 @@ def trace(
        json_out,
        verbose,
    )
-
-
-@trace.command("listen")
-@click.option("--host", default="0.0.0.0", show_default=True)
-@click.option(
-    "--port",
-    type=int,
-    default=DEFAULT_GRPC_PORT,
-    show_default=True,
-    help="gRPC port for receiving OTLP traces.",
-)
-def trace_listen(host: str, port: int) -> None:
-    """Listen for OTLP/gRPC traces."""
-    _start_trace_listener(host, port)
--- a/cli/planoai/trace_listener_runtime.py
+++ b/cli/planoai/trace_listener_runtime.py
@ -0,0 +1,127 @@
+"""
+Trace listener process runtime utilities.
+"""
+
+import os
+import signal
+import time
+import logging
+from collections.abc import Callable
+
+# Canonical PID file used by `planoai trace listen/down`.
+TRACE_LISTENER_PID_PATH = os.path.expanduser("~/.plano/run/trace_listener.pid")
+TRACE_LISTENER_LOG_PATH = os.path.expanduser("~/.plano/run/trace_listener.log")
+LOGGER = logging.getLogger(__name__)
+
+
+def write_listener_pid(pid: int) -> None:
+    """Write ``pid`` to ``TRACE_LISTENER_PID_PATH`` for later management commands.
+
+    The containing directory is created when missing, and the file is opened
+    in ``"w"`` mode, so any previous content is replaced.
+    """
+    pid_dir = os.path.dirname(TRACE_LISTENER_PID_PATH)
+    # The run directory may not exist yet on a first-time install.
+    os.makedirs(pid_dir, exist_ok=True)
+    with open(TRACE_LISTENER_PID_PATH, "w") as pid_file:
+        pid_file.write(str(pid))
+
+
+def remove_listener_pid() -> None:
+    """Remove the persisted listener PID file if present.
+
+    Best-effort cleanup: a missing file is not an error. The removal is
+    attempted directly instead of checking ``os.path.exists`` first, so a
+    file deleted by another process between a check and the removal cannot
+    raise ``FileNotFoundError`` here.
+    """
+    try:
+        os.remove(TRACE_LISTENER_PID_PATH)
+    except FileNotFoundError:
+        # Already gone (never written, or removed by another process).
+        pass
+
+
+def get_listener_pid() -> int | None:
+    """Return the persisted listener PID if it refers to a live process.
+
+    Returns:
+        The PID read from ``TRACE_LISTENER_PID_PATH`` when the file exists,
+        holds a positive integer and signalling that PID succeeds; otherwise
+        ``None``. A stale or malformed PID file is removed as a side effect.
+    """
+    if not os.path.exists(TRACE_LISTENER_PID_PATH):
+        return None
+
+    try:
+        # Parse persisted PID.
+        with open(TRACE_LISTENER_PID_PATH, "r") as f:
+            pid = int(f.read().strip())
+        # PIDs <= 0 address process groups in kill(); accepting one here
+        # would let a later stop command signal unrelated processes.
+        if pid <= 0:
+            raise ValueError(f"invalid listener PID: {pid}")
+        # Signal 0 performs liveness check without sending a real signal.
+        os.kill(pid, 0)
+        return pid
+    except (ValueError, OSError):
+        # Stale or malformed PID file: clean it up to prevent repeated confusion.
+        # (ProcessLookupError is a subclass of OSError.)
+        LOGGER.warning(
+            "Removing stale or malformed trace listener PID file at %s",
+            TRACE_LISTENER_PID_PATH,
+        )
+        remove_listener_pid()
+        return None
+
+
+def stop_listener_process(grace_seconds: float = 0.5) -> bool:
+    """Stop persisted listener process, returning True if one was stopped.
+
+    Sends ``SIGTERM`` and polls for the process to exit for up to
+    ``grace_seconds``; if it is still alive after that window, sends
+    ``SIGKILL``. The PID file is removed in every case where a PID was found.
+
+    Args:
+        grace_seconds: Maximum time to wait for a clean exit after ``SIGTERM``.
+
+    Returns:
+        ``True`` if a live listener was signalled, ``False`` if no listener
+        was tracked or it had already disappeared before ``SIGTERM``.
+    """
+    pid = get_listener_pid()
+    if pid is None:
+        return False
+
+    try:
+        # Try graceful shutdown first.
+        os.kill(pid, signal.SIGTERM)
+    except ProcessLookupError:
+        # Process disappeared between checks; treat as already stopped.
+        remove_listener_pid()
+        return False
+
+    # Poll instead of sleeping for the whole window so a listener that exits
+    # promptly does not make the caller wait the full grace period.
+    deadline = time.monotonic() + grace_seconds
+    try:
+        while True:
+            # Signal 0 raises ProcessLookupError once the process is gone.
+            os.kill(pid, 0)
+            remaining = deadline - time.monotonic()
+            if remaining <= 0:
+                # Still alive after the grace window: force terminate.
+                os.kill(pid, signal.SIGKILL)
+                break
+            time.sleep(min(0.05, remaining))
+    except ProcessLookupError:
+        # Already exited after SIGTERM.
+        pass
+
+    remove_listener_pid()
+    return True
+
+
+def daemonize_and_run(run_forever: Callable[[], None]) -> int | None:
+    """
+    Fork and detach process to create a Unix daemon.
+
+    Returns:
+    - Parent process: child PID (> 0), allowing caller to report startup.
+    - Child process: never returns; runs callback in daemon context until
+      termination and then exits via ``os._exit``, whether the callback
+      returned or raised (exit status 1 when it raised an ``Exception``).
+
+    Raises:
+    - OSError: if fork fails (e.g., resource limits exceeded).
+    """
+    # Duplicate current process. Raises OSError if fork fails.
+    pid = os.fork()
+    if pid > 0:
+        # Parent returns child PID to caller.
+        return pid
+
+    # Everything below runs in the child. It is wrapped so that no exception
+    # can unwind past this function: otherwise the forked child would keep
+    # executing the caller's (parent's) code path.
+    exit_code = 0
+    try:
+        # Detach from controlling terminal/session.
+        # This prevents SIGHUP when parent terminal closes and ensures
+        # the daemon cannot reacquire a controlling terminal.
+        os.setsid()
+
+        # Redirect stdin to /dev/null and stdout/stderr to a persistent log file.
+        # This keeps the daemon terminal-independent while preserving diagnostics.
+        os.makedirs(os.path.dirname(TRACE_LISTENER_LOG_PATH), exist_ok=True)
+        devnull_in = os.open(os.devnull, os.O_RDONLY)
+        try:
+            log_fd = os.open(
+                TRACE_LISTENER_LOG_PATH,
+                os.O_WRONLY | os.O_CREAT | os.O_APPEND,
+                0o644,
+            )
+        except OSError:
+            # If logging cannot be initialized, keep running with output discarded.
+            log_fd = os.open(os.devnull, os.O_WRONLY)
+        os.dup2(devnull_in, 0)  # stdin
+        os.dup2(log_fd, 1)  # stdout
+        os.dup2(log_fd, 2)  # stderr
+        # Close the originals unless they already are one of the std descriptors.
+        if devnull_in > 2:
+            os.close(devnull_in)
+        if log_fd > 2:
+            os.close(log_fd)
+
+        # Run the daemon main loop (expected to block until process termination).
+        run_forever()
+    except Exception:
+        # Record the failure before exiting.
+        LOGGER.exception("Trace listener daemon terminated with an error")
+        exit_code = 1
+    finally:
+        # Always leave through os._exit so the child never returns into the
+        # parent's context, even if the callback returns or raises.
+        os._exit(exit_code)
--- a/cli/pyproject.toml
+++ b/cli/pyproject.toml
@ -1,6 +1,6 @@
 [project]
 name = "planoai"
-version = "0.4.8"
+version = "0.4.11"
 description = "Python-based CLI tool to manage Plano."
 authors = [{name = "Katanemo Labs, Inc."}]
 readme = "README.md"
@ -37,6 +37,10 @@ path = "planoai/__init__.py"
 [tool.hatch.build.targets.wheel]
 packages = ["planoai"]

+[tool.hatch.build.targets.wheel.force-include]
+"../config/plano_config_schema.yaml" = "planoai/data/plano_config_schema.yaml"
+"../config/envoy.template.yaml" = "planoai/data/envoy.template.yaml"
+
 [tool.hatch.build.targets.sdist]
 include = ["planoai/**"]

--- a/cli/test/source/failure.json
+++ b/cli/test/source/failure.json
--- a/cli/test/source/success.json
+++ b/cli/test/source/success.json
@ -0,0 +1,803 @@
+{
+    "traces": [
+      {
+        "trace_id": "86f21585168a31a23578d77096cc143b",
+        "spans": [
+          {
+            "traceId": "86f21585168a31a23578d77096cc143b",
+            "spanId": "1d6159b920daf4e9",
+            "parentSpanId": "c5d6cd3cfb32b551",
+            "name": "POST archfc.katanemo.dev/v1/chat/completions",
+            "startTimeUnixNano": "1770937700292451000",
+            "endTimeUnixNano": "1770937700552403000",
+            "service": "plano(outbound)",
+            "attributes": [
+              {
+                "key": "node_id",
+                "value": {
+                  "stringValue": ""
+                }
+              },
+              {
+                "key": "zone",
+                "value": {
+                  "stringValue": ""
+                }
+              },
+              {
+                "key": "guid:x-request-id",
+                "value": {
+                  "stringValue": "0e1acd44-41ea-9681-9944-f2f1bec65faf"
+                }
+              },
+              {
+                "key": "http.url",
+                "value": {
+                  "stringValue": "https://archfc.katanemo.dev/v1/chat/completions"
+                }
+              },
+              {
+                "key": "http.method",
+                "value": {
+                  "stringValue": "POST"
+                }
+              },
+              {
+                "key": "downstream_cluster",
+                "value": {
+                  "stringValue": "-"
+                }
+              },
+              {
+                "key": "user_agent",
+                "value": {
+                  "stringValue": "-"
+                }
+              },
+              {
+                "key": "http.protocol",
+                "value": {
+                  "stringValue": "HTTP/1.1"
+                }
+              },
+              {
+                "key": "peer.address",
+                "value": {
+                  "stringValue": "127.0.0.1"
+                }
+              },
+              {
+                "key": "request_size",
+                "value": {
+                  "stringValue": "3293"
+                }
+              },
+              {
+                "key": "response_size",
+                "value": {
+                  "stringValue": "341"
+                }
+              },
+              {
+                "key": "component",
+                "value": {
+                  "stringValue": "proxy"
+                }
+              },
+              {
+                "key": "upstream_cluster",
+                "value": {
+                  "stringValue": "arch"
+                }
+              },
+              {
+                "key": "upstream_cluster.name",
+                "value": {
+                  "stringValue": "arch"
+                }
+              },
+              {
+                "key": "http.status_code",
+                "value": {
+                  "stringValue": "200"
+                }
+              },
+              {
+                "key": "response_flags",
+                "value": {
+                  "stringValue": "-"
+                }
+              }
+            ]
+          },
+          {
+            "traceId": "86f21585168a31a23578d77096cc143b",
+            "spanId": "4234f793a77a40c8",
+            "parentSpanId": "445f868c5c36294e",
+            "name": "routing",
+            "startTimeUnixNano": "1770937700576995630",
+            "endTimeUnixNano": "1770937700577104880",
+            "service": "plano(routing)",
+            "attributes": [
+              {
+                "key": "component",
+                "value": {
+                  "stringValue": "routing"
+                }
+              },
+              {
+                "key": "http.method",
+                "value": {
+                  "stringValue": "POST"
+                }
+              },
+              {
+                "key": "http.target",
+                "value": {
+                  "stringValue": "/v1/chat/completions"
+                }
+              },
+              {
+                "key": "model.requested",
+                "value": {
+                  "stringValue": "openai/gpt-4o-mini"
+                }
+              },
+              {
+                "key": "model.alias_resolved",
+                "value": {
+                  "stringValue": "openai/gpt-4o-mini"
+                }
+              },
+              {
+                "key": "service.name.override",
+                "value": {
+                  "stringValue": "plano(routing)"
+                }
+              },
+              {
+                "key": "routing.determination_ms",
+                "value": {
+                  "intValue": "0"
+                }
+              },
+              {
+                "key": "route.selected_model",
+                "value": {
+                  "stringValue": "none"
+                }
+              }
+            ]
+          },
+          {
+            "traceId": "86f21585168a31a23578d77096cc143b",
+            "spanId": "445f868c5c36294e",
+            "parentSpanId": "8311d2245d859e71",
+            "name": "POST /v1/chat/completions openai/gpt-4o-mini",
+            "startTimeUnixNano": "1770937700576869630",
+            "endTimeUnixNano": "1770937701151370214",
+            "service": "plano(llm)",
+            "attributes": [
+              {
+                "key": "component",
+                "value": {
+                  "stringValue": "llm"
+                }
+              },
+              {
+                "key": "request_id",
+                "value": {
+                  "stringValue": "0e1acd44-41ea-9681-9944-f2f1bec65faf"
+                }
+              },
+              {
+                "key": "http.method",
+                "value": {
+                  "stringValue": "POST"
+                }
+              },
+              {
+                "key": "http.path",
+                "value": {
+                  "stringValue": "/v1/chat/completions"
+                }
+              },
+              {
+                "key": "service.name.override",
+                "value": {
+                  "stringValue": "plano(llm)"
+                }
+              },
+              {
+                "key": "llm.temperature",
+                "value": {
+                  "stringValue": "0.1"
+                }
+              },
+              {
+                "key": "llm.user_message_preview",
+                "value": {
+                  "stringValue": "What’s the weather in Seattle?"
+                }
+              },
+              {
+                "key": "llm.model",
+                "value": {
+                  "stringValue": "openai/gpt-4o-mini"
+                }
+              },
+              {
+                "key": "service.name.override",
+                "value": {
+                  "stringValue": "plano(llm)"
+                }
+              },
+              {
+                "key": "llm.time_to_first_token",
+                "value": {
+                  "intValue": "572"
+                }
+              },
+              {
+                "key": "signals.quality",
+                "value": {
+                  "stringValue": "Good"
+                }
+              }
+            ]
+          },
+          {
+            "traceId": "86f21585168a31a23578d77096cc143b",
+            "spanId": "da348b97890a6c9b",
+            "parentSpanId": "",
+            "name": "POST /v1/chat/completions",
+            "startTimeUnixNano": "1770937700183402000",
+            "endTimeUnixNano": "1770937704394122000",
+            "service": "plano(inbound)",
+            "attributes": [
+              {
+                "key": "node_id",
+                "value": {
+                  "stringValue": ""
+                }
+              },
+              {
+                "key": "zone",
+                "value": {
+                  "stringValue": ""
+                }
+              },
+              {
+                "key": "guid:x-request-id",
+                "value": {
+                  "stringValue": "0e1acd44-41ea-9681-9944-f2f1bec65faf"
+                }
+              },
+              {
+                "key": "http.url",
+                "value": {
+                  "stringValue": "https://localhost/v1/chat/completions"
+                }
+              },
+              {
+                "key": "http.method",
+                "value": {
+                  "stringValue": "POST"
+                }
+              },
+              {
+                "key": "downstream_cluster",
+                "value": {
+                  "stringValue": "-"
+                }
+              },
+              {
+                "key": "user_agent",
+                "value": {
+                  "stringValue": "Python/3.11 aiohttp/3.13.2"
+                }
+              },
+              {
+                "key": "http.protocol",
+                "value": {
+                  "stringValue": "HTTP/1.1"
+                }
+              },
+              {
+                "key": "peer.address",
+                "value": {
+                  "stringValue": "172.18.0.1"
+                }
+              },
+              {
+                "key": "request_size",
+                "value": {
+                  "stringValue": "125"
+                }
+              },
+              {
+                "key": "response_size",
+                "value": {
+                  "stringValue": "34401"
+                }
+              },
+              {
+                "key": "component",
+                "value": {
+                  "stringValue": "proxy"
+                }
+              },
+              {
+                "key": "upstream_cluster",
+                "value": {
+                  "stringValue": "bright_staff"
+                }
+              },
+              {
+                "key": "upstream_cluster.name",
+                "value": {
+                  "stringValue": "bright_staff"
+                }
+              },
+              {
+                "key": "http.status_code",
+                "value": {
+                  "stringValue": "200"
+                }
+              },
+              {
+                "key": "response_flags",
+                "value": {
+                  "stringValue": "-"
+                }
+              }
+            ]
+          },
+          {
+            "traceId": "86f21585168a31a23578d77096cc143b",
+            "spanId": "79a116cf7d63602a",
+            "parentSpanId": "8b6345129425cf4a",
+            "name": "POST api.openai.com/v1/chat/completions",
+            "startTimeUnixNano": "1770937702607128000",
+            "endTimeUnixNano": "1770937704391625000",
+            "service": "plano(outbound)",
+            "attributes": [
+              {
+                "key": "node_id",
+                "value": {
+                  "stringValue": ""
+                }
+              },
+              {
+                "key": "zone",
+                "value": {
+                  "stringValue": ""
+                }
+              },
+              {
+                "key": "guid:x-request-id",
+                "value": {
+                  "stringValue": "0e1acd44-41ea-9681-9944-f2f1bec65faf"
+                }
+              },
+              {
+                "key": "http.url",
+                "value": {
+                  "stringValue": "https://api.openai.com/v1/chat/completions"
+                }
+              },
+              {
+                "key": "http.method",
+                "value": {
+                  "stringValue": "POST"
+                }
+              },
+              {
+                "key": "downstream_cluster",
+                "value": {
+                  "stringValue": "-"
+                }
+              },
+              {
+                "key": "user_agent",
+                "value": {
+                  "stringValue": "AsyncOpenAI/Python 2.17.0"
+                }
+              },
+              {
+                "key": "http.protocol",
+                "value": {
+                  "stringValue": "HTTP/1.1"
+                }
+              },
+              {
+                "key": "peer.address",
+                "value": {
+                  "stringValue": "127.0.0.1"
+                }
+              },
+              {
+                "key": "request_size",
+                "value": {
+                  "stringValue": "1927"
+                }
+              },
+              {
+                "key": "response_size",
+                "value": {
+                  "stringValue": "20646"
+                }
+              },
+              {
+                "key": "component",
+                "value": {
+                  "stringValue": "proxy"
+                }
+              },
+              {
+                "key": "upstream_cluster",
+                "value": {
+                  "stringValue": "openai"
+                }
+              },
+              {
+                "key": "upstream_cluster.name",
+                "value": {
+                  "stringValue": "openai"
+                }
+              },
+              {
+                "key": "http.status_code",
+                "value": {
+                  "stringValue": "200"
+                }
+              },
+              {
+                "key": "response_flags",
+                "value": {
+                  "stringValue": "-"
+                }
+              }
+            ]
+          },
+          {
+            "traceId": "86f21585168a31a23578d77096cc143b",
+            "spanId": "60508ba7960d51bc",
+            "parentSpanId": "445f868c5c36294e",
+            "name": "POST api.openai.com/v1/chat/completions",
+            "startTimeUnixNano": "1770937700589205000",
+            "endTimeUnixNano": "1770937701149191000",
+            "service": "plano(outbound)",
+            "attributes": [
+              {
+                "key": "node_id",
+                "value": {
+                  "stringValue": ""
+                }
+              },
+              {
+                "key": "zone",
+                "value": {
+                  "stringValue": ""
+                }
+              },
+              {
+                "key": "guid:x-request-id",
+                "value": {
+                  "stringValue": "0e1acd44-41ea-9681-9944-f2f1bec65faf"
+                }
+              },
+              {
+                "key": "http.url",
+                "value": {
+                  "stringValue": "https://api.openai.com/v1/chat/completions"
+                }
+              },
+              {
+                "key": "http.method",
+                "value": {
+                  "stringValue": "POST"
+                }
+              },
+              {
+                "key": "downstream_cluster",
+                "value": {
+                  "stringValue": "-"
+                }
+              },
+              {
+                "key": "user_agent",
+                "value": {
+                  "stringValue": "AsyncOpenAI/Python 2.17.0"
+                }
+              },
+              {
+                "key": "http.protocol",
+                "value": {
+                  "stringValue": "HTTP/1.1"
+                }
+              },
+              {
+                "key": "peer.address",
+                "value": {
+                  "stringValue": "127.0.0.1"
+                }
+              },
+              {
+                "key": "request_size",
+                "value": {
+                  "stringValue": "930"
+                }
+              },
+              {
+                "key": "response_size",
+                "value": {
+                  "stringValue": "346"
+                }
+              },
+              {
+                "key": "component",
+                "value": {
+                  "stringValue": "proxy"
+                }
+              },
+              {
+                "key": "upstream_cluster",
+                "value": {
+                  "stringValue": "openai"
+                }
+              },
+              {
+                "key": "upstream_cluster.name",
+                "value": {
+                  "stringValue": "openai"
+                }
+              },
+              {
+                "key": "http.status_code",
+                "value": {
+                  "stringValue": "200"
+                }
+              },
+              {
+                "key": "response_flags",
+                "value": {
+                  "stringValue": "-"
+                }
+              }
+            ]
+          },
+          {
+            "traceId": "86f21585168a31a23578d77096cc143b",
+            "spanId": "8311d2245d859e71",
+            "parentSpanId": "c5d6cd3cfb32b551",
+            "name": "weather_agent /v1/chat/completions",
+            "startTimeUnixNano": "1770937700553490130",
+            "endTimeUnixNano": "1770937704393946299",
+            "service": "plano(agent)",
+            "attributes": [
+              {
+                "key": "agent_id",
+                "value": {
+                  "stringValue": "weather_agent"
+                }
+              },
+              {
+                "key": "message_count",
+                "value": {
+                  "stringValue": "1"
+                }
+              },
+              {
+                "key": "service.name.override",
+                "value": {
+                  "stringValue": "plano(agent)"
+                }
+              }
+            ]
+          },
+          {
+            "traceId": "86f21585168a31a23578d77096cc143b",
+            "spanId": "9eb8a70a8c167f85",
+            "parentSpanId": "8b6345129425cf4a",
+            "name": "routing",
+            "startTimeUnixNano": "1770937702591610381",
+            "endTimeUnixNano": "1770937702592150423",
+            "service": "plano(routing)",
+            "attributes": [
+              {
+                "key": "component",
+                "value": {
+                  "stringValue": "routing"
+                }
+              },
+              {
+                "key": "http.method",
+                "value": {
+                  "stringValue": "POST"
+                }
+              },
+              {
+                "key": "http.target",
+                "value": {
+                  "stringValue": "/v1/chat/completions"
+                }
+              },
+              {
+                "key": "model.requested",
+                "value": {
+                  "stringValue": "openai/gpt-5.2"
+                }
+              },
+              {
+                "key": "model.alias_resolved",
+                "value": {
+                  "stringValue": "openai/gpt-5.2"
+                }
+              },
+              {
+                "key": "service.name.override",
+                "value": {
+                  "stringValue": "plano(routing)"
+                }
+              },
+              {
+                "key": "routing.determination_ms",
+                "value": {
+                  "intValue": "0"
+                }
+              },
+              {
+                "key": "route.selected_model",
+                "value": {
+                  "stringValue": "none"
+                }
+              }
+            ]
+          },
+          {
+            "traceId": "86f21585168a31a23578d77096cc143b",
+            "spanId": "c5d6cd3cfb32b551",
+            "parentSpanId": "da348b97890a6c9b",
+            "name": "travel_booking_service",
+            "startTimeUnixNano": "1770937700188669630",
+            "endTimeUnixNano": "1770937704393949091",
+            "service": "plano(orchestrator)",
+            "attributes": [
+              {
+                "key": "component",
+                "value": {
+                  "stringValue": "orchestrator"
+                }
+              },
+              {
+                "key": "request_id",
+                "value": {
+                  "stringValue": "0e1acd44-41ea-9681-9944-f2f1bec65faf"
+                }
+              },
+              {
+                "key": "http.method",
+                "value": {
+                  "stringValue": "POST"
+                }
+              },
+              {
+                "key": "http.path",
+                "value": {
+                  "stringValue": "/agents/v1/chat/completions"
+                }
+              },
+              {
+                "key": "service.name.override",
+                "value": {
+                  "stringValue": "plano(orchestrator)"
+                }
+              },
+              {
+                "key": "selection.listener",
+                "value": {
+                  "stringValue": "travel_booking_service"
+                }
+              },
+              {
+                "key": "selection.agent_count",
+                "value": {
+                  "intValue": "1"
+                }
+              },
+              {
+                "key": "selection.agents",
+                "value": {
+                  "stringValue": "weather_agent"
+                }
+              },
+              {
+                "key": "selection.determination_ms",
+                "value": {
+                  "stringValue": "264.48"
+                }
+              }
+            ]
+          },
+          {
+            "traceId": "86f21585168a31a23578d77096cc143b",
+            "spanId": "8b6345129425cf4a",
+            "parentSpanId": "8311d2245d859e71",
+            "name": "POST /v1/chat/completions openai/gpt-5.2",
+            "startTimeUnixNano": "1770937702591499256",
+            "endTimeUnixNano": "1770937704393043174",
+            "service": "plano(llm)",
+            "attributes": [
+              {
+                "key": "component",
+                "value": {
+                  "stringValue": "llm"
+                }
+              },
+              {
+                "key": "request_id",
+                "value": {
+                  "stringValue": "0e1acd44-41ea-9681-9944-f2f1bec65faf"
+                }
+              },
+              {
+                "key": "http.method",
+                "value": {
+                  "stringValue": "POST"
+                }
+              },
+              {
+                "key": "http.path",
+                "value": {
+                  "stringValue": "/v1/chat/completions"
+                }
+              },
+              {
+                "key": "service.name.override",
+                "value": {
+                  "stringValue": "plano(llm)"
+                }
+              },
+              {
+                "key": "llm.temperature",
+                "value": {
+                  "stringValue": "0.7"
+                }
+              },
+              {
+                "key": "llm.user_message_preview",
+                "value": {
+                  "stringValue": "What’s the weather in Seattle?\n\nWeather data for S..."
+                }
+              },
+              {
+                "key": "llm.model",
+                "value": {
+                  "stringValue": "openai/gpt-5.2"
+                }
+              },
+              {
+                "key": "service.name.override",
+                "value": {
+                  "stringValue": "plano(llm)"
+                }
+              },
+              {
+                "key": "llm.time_to_first_token",
+                "value": {
+                  "intValue": "506"
+                }
+              },
+              {
+                "key": "signals.quality",
+                "value": {
+                  "stringValue": "Good"
+                }
+              }
+            ]
+          }
+        ]
+      }
+    ]
+  }
--- a/cli/test/test_trace_cmd.py
+++ b/cli/test/test_trace_cmd.py
@ -1,7 +1,70 @@
-import pytest
-import rich_click as click
+import copy
+import json
+import re
+from pathlib import Path

-from planoai import trace_cmd
+import pytest
+from click.testing import CliRunner
+
+from planoai.trace_cmd import trace
+import planoai.trace_cmd as trace_cmd
+
+
+def _load_success_traces() -> list[dict]:
+    """Return the ``traces`` list stored in ``source/success.json`` beside this file."""
+    fixture_file = Path(__file__).parent / "source" / "success.json"
+    with fixture_file.open(encoding="utf-8") as handle:
+        return json.load(handle)["traces"]
+
+
+def _load_failure_traces() -> list[dict]:
+    """Return the ``traces`` list stored in ``source/failure.json`` beside this file."""
+    fixture_file = Path(__file__).parent / "source" / "failure.json"
+    with fixture_file.open(encoding="utf-8") as handle:
+        return json.load(handle)["traces"]
+
+
+def _build_trace_set() -> list[dict]:
+    """Build a two-trace data set from the success fixture.
+
+    The first fixture trace is returned as loaded. The second entry is a deep
+    copy of it that carries a different trace ID and whose numeric span
+    timestamps are moved 1_000_000_000 ns (one second) into the past.
+    """
+    one_second_ns = 1_000_000_000
+    loaded = copy.deepcopy(_load_success_traces())
+    primary = loaded[0]
+
+    secondary = copy.deepcopy(primary)
+    secondary["trace_id"] = "1234567890abcdef1234567890abcdef"
+    for span in secondary.get("spans", []):
+        span["traceId"] = secondary["trace_id"]
+        for field in ("startTimeUnixNano", "endTimeUnixNano"):
+            # Only plain decimal strings are shifted; missing or non-numeric
+            # values are left untouched.
+            if span.get(field, "").isdigit():
+                span[field] = str(int(span[field]) - one_second_ns)
+
+    return [primary, secondary]
+
+
+def _json_from_output(output: str) -> dict:
+    """Parse the JSON document that starts at the first ``{`` in ``output``.
+
+    Text before the first ``{`` (for example a warning line) is skipped, and
+    any text following the end of the JSON document is ignored instead of
+    making the parse fail.
+
+    Raises:
+        AssertionError: if ``output`` contains no ``{`` at all.
+        json.JSONDecodeError: if the text at the first ``{`` is not valid JSON.
+    """
+    start = output.find("{")
+    if start == -1:
+        raise AssertionError(f"No JSON object found in output:\n{output}")
+    # raw_decode stops at the end of the first complete document, so trailing
+    # CLI output does not trigger the "Extra data" error json.loads would raise.
+    payload, _end = json.JSONDecoder().raw_decode(output, start)
+    return payload
+
+
+def _plain_output(output: str) -> str:
+    """Return ``output`` with ANSI color/style (SGR) escape sequences removed.
+
+    rich-click emits these sequences in CI terminals; removing them lets the
+    tests match on the plain message text.
+    """
+    sgr_sequence = re.compile(r"\x1b\[[0-9;]*m")
+    return sgr_sequence.sub("", output)
+
+
+@pytest.fixture
+def runner() -> CliRunner:
+    """Provide a fresh Click ``CliRunner`` for invoking the ``trace`` command."""
+    cli_runner = CliRunner()
+    return cli_runner
+
+
+@pytest.fixture
+def traces() -> list[dict]:
+    """Provide the two-trace data set produced by ``_build_trace_set``."""
+    trace_set = _build_trace_set()
+    return trace_set
+
+
+@pytest.fixture
+def failure_traces() -> list[dict]:
+    """Provide a private deep copy of the traces loaded from the failure fixture."""
+    loaded = _load_failure_traces()
+    return copy.deepcopy(loaded)


 class _FakeGrpcServer:
@ -12,7 +75,7 @@ class _FakeGrpcServer:
        return None


-def test_create_trace_server_raises_bind_error(monkeypatch):
+def test_start_trace_server_raises_bind_error(monkeypatch):
    monkeypatch.setattr(
        trace_cmd.grpc, "server", lambda *_args, **_kwargs: _FakeGrpcServer()
    )
@ -23,22 +86,305 @@ def test_create_trace_server_raises_bind_error(monkeypatch):
    )

    with pytest.raises(trace_cmd.TraceListenerBindError) as excinfo:
-        trace_cmd._create_trace_server("0.0.0.0", 4317)
+        trace_cmd._start_trace_server("0.0.0.0", 4317)

    assert "already in use" in str(excinfo.value)
-    assert "planoai trace listen --port" in str(excinfo.value)
+    assert "planoai trace listen" in str(excinfo.value)


-def test_start_trace_listener_converts_bind_error_to_click_exception(monkeypatch):
-    monkeypatch.setattr(
-        trace_cmd,
-        "_create_trace_server",
-        lambda *_args, **_kwargs: (_ for _ in ()).throw(
-            trace_cmd.TraceListenerBindError("port in use")
-        ),
+def test_trace_listen_starts_listener_with_defaults(runner, monkeypatch):
+    """``trace listen`` with no options starts the listener on the default host/port."""
+    captured = {}
+
+    def fake_start(host: str, port: int) -> None:
+        # Record what the command passes instead of starting a real listener.
+        captured.update(host=host, port=port)
+
+    monkeypatch.setattr(trace_cmd, "_start_trace_listener", fake_start)
+
+    outcome = runner.invoke(trace, ["listen"])
+
+    assert outcome.exit_code == 0, outcome.output
+    assert captured == {"host": "0.0.0.0", "port": trace_cmd.DEFAULT_GRPC_PORT}
+
+
+def test_trace_down_prints_success_when_listener_stopped(runner, monkeypatch):
+    """``trace down`` reports success when the stop helper returns ``True``."""
+
+    def listener_was_stopped() -> bool:
+        return True
+
+    monkeypatch.setattr(trace_cmd, "_stop_background_listener", listener_was_stopped)
+
+    outcome = runner.invoke(trace, ["down"])
+
+    assert outcome.exit_code == 0, outcome.output
+    assert "Trace listener stopped" in outcome.output
+
+
+def test_trace_down_prints_no_listener_when_not_running(runner, monkeypatch):
+    """``trace down`` says nothing was running when the stop helper returns ``False``."""
+
+    def nothing_to_stop() -> bool:
+        return False
+
+    monkeypatch.setattr(trace_cmd, "_stop_background_listener", nothing_to_stop)
+
+    outcome = runner.invoke(trace, ["down"])
+
+    assert outcome.exit_code == 0, outcome.output
+    assert "No background trace listener running" in outcome.output
+
+
+def test_trace_default_target_uses_last_and_builds_first_trace(
+    runner, monkeypatch, traces
+):
+    """With no target, the first fetched trace is rendered with ``verbose=False``."""
+    recorded = {}
+
+    def fake_build_tree(trace_obj, _console, verbose=False):
+        # Capture the rendering arguments instead of drawing the tree.
+        recorded.update(trace_id=trace_obj["trace_id"], verbose=verbose)
+
+    monkeypatch.setattr(trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(traces))
+    monkeypatch.setattr(trace_cmd, "_build_tree", fake_build_tree)
+
+    outcome = runner.invoke(trace, [])
+
+    assert outcome.exit_code == 0, outcome.output
+    assert recorded["trace_id"] == traces[0]["trace_id"]
+    assert recorded["verbose"] is False
+
+
+def test_trace_list_any_prints_short_trace_ids(runner, monkeypatch, traces):
+    """``--list any`` prints a header plus the 8-character prefix of both trace IDs."""
+    monkeypatch.setattr(trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(traces))
+
+    outcome = runner.invoke(trace, ["--list", "--no-interactive", "any"])
+
+    assert outcome.exit_code == 0, outcome.output
+    assert "Trace IDs:" in outcome.output
+    for index in (0, 1):
+        assert traces[index]["trace_id"][:8] in outcome.output
+
+
+def test_trace_list_target_conflict_errors(runner, traces, monkeypatch):
+    """Combining an explicit trace ID with ``--list`` is rejected with an error."""
+    monkeypatch.setattr(trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(traces))
+    full_trace_id = traces[0]["trace_id"]
+
+    outcome = runner.invoke(trace, ["--list", full_trace_id])
+    plain = _plain_output(outcome.output)
+
+    assert outcome.exit_code != 0
+    assert "Target and --list cannot be used together." in plain
+
+
+def test_trace_json_list_with_limit_outputs_trace_ids(runner, monkeypatch, traces):
+    """``--list --json --limit 1`` emits a JSON object with only the first trace ID."""
+    monkeypatch.setattr(trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(traces))
+    cli_args = ["--list", "any", "--json", "--limit", "1"]
+
+    outcome = runner.invoke(trace, cli_args)
+
+    assert outcome.exit_code == 0, outcome.output
+    parsed = _json_from_output(outcome.output)
+    expected = {"trace_ids": [traces[0]["trace_id"]]}
+    assert parsed == expected
+
+
+def test_trace_json_for_short_target_returns_one_trace(runner, monkeypatch, traces):
+    """An 8-character trace ID prefix selects exactly the matching trace in JSON mode."""
+    monkeypatch.setattr(trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(traces))
+    full_id = traces[0]["trace_id"]
+
+    outcome = runner.invoke(trace, [full_id[:8], "--json"])
+
+    assert outcome.exit_code == 0, outcome.output
+    returned = _json_from_output(outcome.output)["traces"]
+    assert len(returned) == 1
+    assert returned[0]["trace_id"] == full_id
+
+
+@pytest.mark.parametrize(
+    ("target", "message"),
+    [
+        ("abc", "Trace ID must be 8 or 32 hex characters."),
+        ("00000000", "Short trace ID must be 8 hex characters."),
+        ("0" * 32, "Trace ID must be 32 hex characters."),
+    ],
+)
+def test_trace_target_validation_errors(runner, target, message):
+    """Wrong-length and all-zero trace ID targets fail with the expected message."""
+    outcome = runner.invoke(trace, [target])
+    plain = _plain_output(outcome.output)
+
+    assert outcome.exit_code != 0
+    assert message in plain
+
+
+def test_trace_where_invalid_format_errors(runner):
+    """A ``--where`` value without ``key=value`` form makes the command fail."""
+    result = runner.invoke(trace, ["any", "--where", "bad-format"])
+
+    assert result.exit_code != 0
+    # Compare against the ANSI-stripped output so styling does not break the match.
+    assert "Invalid --where filter(s): bad-format. Use key=value." in _plain_output(
+        result.output
     )

-    with pytest.raises(click.ClickException) as excinfo:
-        trace_cmd._start_trace_listener("0.0.0.0", 4317)

-    assert "port in use" in str(excinfo.value)
+def test_trace_where_unknown_key_errors(runner, monkeypatch, traces):
+    """An unrecognized ``--where`` key is reported as unknown and the command fails."""
+    monkeypatch.setattr(trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(traces))
+    cli_args = ["any", "--where", "not.a.real.key=value"]
+
+    outcome = runner.invoke(trace, cli_args)
+    plain = _plain_output(outcome.output)
+
+    assert outcome.exit_code != 0
+    assert "Unknown --where key(s): not.a.real.key" in plain
+
+
+def test_trace_where_filters_to_matching_trace(runner, monkeypatch, traces):
+    """``--where agent_id=weather_agent`` returns both fixture traces in fetch order."""
+    monkeypatch.setattr(trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(traces))
+    cli_args = ["any", "--where", "agent_id=weather_agent", "--json"]
+
+    outcome = runner.invoke(trace, cli_args)
+
+    assert outcome.exit_code == 0, outcome.output
+    returned = _json_from_output(outcome.output)["traces"]
+    returned_ids = [trace_item["trace_id"] for trace_item in returned]
+    assert returned_ids == [traces[0]["trace_id"], traces[1]["trace_id"]]
+
+
+def test_trace_where_and_filters_can_exclude_all(runner, monkeypatch, traces):
+    """Two ``--where`` clauses given together can leave no trace in the JSON output."""
+    monkeypatch.setattr(trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(traces))
+    where_clauses = ["agent_id=weather_agent", "http.status_code=500"]
+    cli_args = ["any"]
+    for clause in where_clauses:
+        cli_args += ["--where", clause]
+    cli_args.append("--json")
+
+    outcome = runner.invoke(trace, cli_args)
+
+    assert outcome.exit_code == 0, outcome.output
+    assert _json_from_output(outcome.output) == {"traces": []}
+
+
+def test_trace_filter_restricts_attributes_by_pattern(runner, monkeypatch, traces):
+    """``--filter http.*`` leaves only attributes whose key starts with ``http.``.
+
+    The returned trace list must be non-empty; otherwise the per-attribute
+    loop below would pass without checking anything.
+    """
+    monkeypatch.setattr(trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(traces))
+
+    result = runner.invoke(trace, ["any", "--filter", "http.*", "--json"])
+
+    assert result.exit_code == 0, result.output
+    payload = _json_from_output(result.output)
+    # Guard against a vacuous pass when the command returns no traces at all.
+    assert payload["traces"], "Expected traces in success fixture"
+    for trace_item in payload["traces"]:
+        for span in trace_item["spans"]:
+            for attr in span.get("attributes", []):
+                assert attr["key"].startswith("http.")
+
+
+def test_trace_filter_unmatched_warns_and_returns_unfiltered(
+    runner, monkeypatch, traces
+):
+    """A ``--filter`` pattern matching no key prints a warning and keeps every trace."""
+    expected_warning = (
+        "Filter key(s) not found: not-found-*. Returning unfiltered traces."
+    )
+    monkeypatch.setattr(trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(traces))
+
+    outcome = runner.invoke(trace, ["any", "--filter", "not-found-*", "--json"])
+
+    assert outcome.exit_code == 0, outcome.output
+    assert expected_warning in outcome.output
+    returned = _json_from_output(outcome.output)["traces"]
+    assert len(returned) == len(traces)
+
+
+def test_trace_since_can_filter_out_old_traces(runner, monkeypatch, traces):
+    """With ``time.time`` pinned to 1_999_999_999, ``--since 1m`` returns no traces."""
+    frozen_now = 1_999_999_999.0
+    monkeypatch.setattr(trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(traces))
+    monkeypatch.setattr(trace_cmd.time, "time", lambda: frozen_now)
+
+    outcome = runner.invoke(trace, ["any", "--since", "1m", "--json"])
+
+    assert outcome.exit_code == 0, outcome.output
+    assert _json_from_output(outcome.output) == {"traces": []}
+
+
+def test_trace_negative_limit_errors(runner):
+    """A negative ``--limit`` value makes the command fail with a validation message."""
+    outcome = runner.invoke(trace, ["any", "--limit", "-1"])
+    plain = _plain_output(outcome.output)
+
+    assert outcome.exit_code != 0
+    assert "Limit must be greater than or equal to 0." in plain
+
+
+def test_trace_empty_data_prints_no_traces_found(runner, monkeypatch):
+    """When the fetch returns an empty list, the command exits 0 with a notice."""
+
+    def fetch_nothing() -> list:
+        return []
+
+    monkeypatch.setattr(trace_cmd, "_fetch_traces_raw", fetch_nothing)
+
+    outcome = runner.invoke(trace, [])
+
+    assert outcome.exit_code == 0, outcome.output
+    assert "No traces found." in outcome.output
+
+
+def test_trace_invalid_filter_token_errors(runner):
+    """A ``--filter`` value with a trailing comma is rejected as having empty tokens."""
+    outcome = runner.invoke(trace, ["any", "--filter", "http.method,"])
+    plain = _plain_output(outcome.output)
+
+    assert outcome.exit_code != 0
+    assert "Filter contains empty tokens." in plain
+
+
+def test_trace_failure_json_any_contains_all_fixture_trace_ids(
+    runner, monkeypatch, failure_traces
+):
+    """``any --json`` on the failure fixture returns its three trace IDs in order."""
+    expected_ids = [
+        "f7a31829c4b5d6e8a9f0b1c2d3e4f5a6",
+        "a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6",
+        "b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7",
+    ]
+    monkeypatch.setattr(
+        trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(failure_traces)
+    )
+
+    outcome = runner.invoke(trace, ["any", "--json"])
+
+    assert outcome.exit_code == 0, outcome.output
+    returned = _json_from_output(outcome.output)["traces"]
+    assert [item["trace_id"] for item in returned] == expected_ids
+
+
+@pytest.mark.parametrize(
+    ("status_code", "expected_trace_ids"),
+    [
+        ("503", ["f7a31829c4b5d6e8a9f0b1c2d3e4f5a6"]),
+        ("429", ["a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6"]),
+        ("500", ["b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7"]),
+    ],
+)
+def test_trace_failure_where_status_filters_expected_traces(
+    runner, monkeypatch, failure_traces, status_code, expected_trace_ids
+):
+    """Filtering the failure fixture by ``http.status_code`` selects the expected trace."""
+    monkeypatch.setattr(
+        trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(failure_traces)
+    )
+    where_clause = f"http.status_code={status_code}"
+
+    outcome = runner.invoke(trace, ["any", "--where", where_clause, "--json"])
+
+    assert outcome.exit_code == 0, outcome.output
+    returned = _json_from_output(outcome.output)["traces"]
+    assert [item["trace_id"] for item in returned] == expected_trace_ids
+
+
+def test_trace_failure_default_render_shows_service_unavailable_banner(
+    runner, monkeypatch, failure_traces
+):
+    """The default render of the failure fixture mentions the 503 failure."""
+    monkeypatch.setattr(
+        trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(failure_traces)
+    )
+
+    outcome = runner.invoke(trace, [])
+
+    assert outcome.exit_code == 0, outcome.output
+    for expected_text in ("Service Unavailable", "503"):
+        assert expected_text in outcome.output
+
+
+def test_trace_failure_filter_keeps_http_status_code_attributes(
+    runner, monkeypatch, failure_traces
+):
+    """``--filter http.status_code`` leaves only ``http.status_code`` attributes.
+
+    Besides checking that no other attribute key survives, the test requires
+    at least one surviving attribute, so it cannot pass when the filter strips
+    every attribute from every span.
+    """
+    monkeypatch.setattr(
+        trace_cmd, "_fetch_traces_raw", lambda: copy.deepcopy(failure_traces)
+    )
+
+    result = runner.invoke(trace, ["any", "--filter", "http.status_code", "--json"])
+
+    assert result.exit_code == 0, result.output
+    payload = _json_from_output(result.output)
+    assert payload["traces"], "Expected traces in failure fixture"
+    kept_keys = []
+    for trace_item in payload["traces"]:
+        for span in trace_item["spans"]:
+            keys = [attr["key"] for attr in span.get("attributes", [])]
+            assert set(keys).issubset({"http.status_code"})
+            kept_keys.extend(keys)
+    # The subset check alone is satisfied by empty attribute lists.
+    assert kept_keys, "Expected at least one http.status_code attribute to be kept"
--- a/config/plano_config_schema.yaml
+++ b/config/plano_config_schema.yaml
@ -394,7 +394,19 @@ properties:
        type: boolean
      opentracing_grpc_endpoint:
        type: string
-      additionalProperties: false
+      span_attributes:
+        type: object
+        properties:
+          header_prefixes:
+            type: array
+            items:
+              type: string
+          static:
+            type: object
+            additionalProperties:
+              type: string
+        additionalProperties: false
+    additionalProperties: false
  mode:
    type: string
    enum:
@ -407,7 +419,7 @@ properties:
        type: string
      model:
        type: string
-      additionalProperties: false
+    additionalProperties: false
  state_storage:
    type: object
    properties:
--- a/config/test_passthrough.yaml
+++ b/config/test_passthrough.yaml
@ -6,8 +6,8 @@
 # that manage their own API key validation.
 #
 # To test:
-#   docker build -t plano-passthrough-test .
-#   docker run -d -p 10000:10000 -v $(pwd)/config/test_passthrough.yaml:/app/plano_config.yaml plano-passthrough-test
+#   pip install planoai
+#   planoai up config/test_passthrough.yaml
 #
 #   curl http://localhost:10000/v1/chat/completions \
 #     -H "Authorization: Bearer sk-your-virtual-key" \
--- a/config/validate_plano_config.sh
+++ b/config/validate_plano_config.sh
@ -1,20 +1,32 @@
 #!/bin/bash

+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 failed_files=()

 for file in $(find . -name config.yaml -o -name plano_config_full_reference.yaml); do
  echo "Validating ${file}..."
-  touch $(pwd)/${file}_rendered
-  if ! docker run --rm -v "$(pwd)/${file}:/app/plano_config.yaml:ro" -v "$(pwd)/${file}_rendered:/app/plano_config_rendered.yaml:rw" --entrypoint /bin/sh ${PLANO_DOCKER_IMAGE:-katanemo/plano:0.4.8} -c "python -m planoai.config_generator" 2>&1 > /dev/null ; then
+  rendered_file="$(pwd)/${file}_rendered"
+  touch "$rendered_file"
+
+  PLANO_CONFIG_FILE="$(pwd)/${file}" \
+  PLANO_CONFIG_SCHEMA_FILE="${SCRIPT_DIR}/plano_config_schema.yaml" \
+  TEMPLATE_ROOT="${SCRIPT_DIR}" \
+  ENVOY_CONFIG_TEMPLATE_FILE="envoy.template.yaml" \
+  PLANO_CONFIG_FILE_RENDERED="$rendered_file" \
+  ENVOY_CONFIG_FILE_RENDERED="/dev/null" \
+  python -m planoai.config_generator 2>&1 > /dev/null
+
+  if [ $? -ne 0 ]; then
    echo "Validation failed for $file"
    failed_files+=("$file")
  fi
+
  RENDERED_CHECKED_IN_FILE=$(echo $file | sed 's/\.yaml$/_rendered.yaml/')
  if [ -f "$RENDERED_CHECKED_IN_FILE" ]; then
    echo "Checking rendered file against checked-in version..."
-    if ! diff -q "${file}_rendered" "$RENDERED_CHECKED_IN_FILE" > /dev/null; then
-      echo "Rendered file ${file}_rendered does not match checked-in version ${RENDERED_CHECKED_IN_FILE}"
-      failed_files+=("${file}_rendered")
+    if ! diff -q "$rendered_file" "$RENDERED_CHECKED_IN_FILE" > /dev/null; then
+      echo "Rendered file $rendered_file does not match checked-in version ${RENDERED_CHECKED_IN_FILE}"
+      failed_files+=("$rendered_file")
    else
      echo "Rendered file matches checked-in version."
    fi
--- a/crates/Cargo.lock
+++ b/crates/Cargo.lock
@ -436,11 +436,14 @@ name = "common"
 version = "0.1.0"
 dependencies = [
 "axum",
+ "bytes",
 "derivative",
 "duration-string",
 "governor",
 "hermesllm",
 "hex",
+ "http-body-util",
+ "hyper 1.6.0",
 "log",
 "pretty_assertions",
 "proxy-wasm",
--- a/crates/brightstaff/src/handlers/agent_chat_completions.rs
+++ b/crates/brightstaff/src/handlers/agent_chat_completions.rs
@ -2,6 +2,8 @@ use std::sync::Arc;
 use std::time::Instant;

 use bytes::Bytes;
+use common::configuration::SpanAttributes;
+use common::errors::BrightStaffError;
 use common::llm_providers::LlmProviders;
 use hermesllm::apis::OpenAIMessage;
 use hermesllm::clients::SupportedAPIsFromClient;
@ -19,17 +21,17 @@ use super::agent_selector::{AgentSelectionError, AgentSelector};
 use super::pipeline_processor::{PipelineError, PipelineProcessor};
 use super::response_handler::ResponseHandler;
 use crate::router::plano_orchestrator::OrchestratorService;
-use crate::tracing::{operation_component, set_service_name};
+use crate::tracing::{collect_custom_trace_attributes, operation_component, set_service_name};

 /// Main errors for agent chat completions
 #[derive(Debug, thiserror::Error)]
 pub enum AgentFilterChainError {
+    #[error("Forwarded error: {0}")]
+    Brightstaff(#[from] BrightStaffError),
    #[error("Agent selection error: {0}")]
    Selection(#[from] AgentSelectionError),
    #[error("Pipeline processing error: {0}")]
    Pipeline(#[from] PipelineError),
-    #[error("Response handling error: {0}")]
-    Response(#[from] super::response_handler::ResponseError),
    #[error("Request parsing error: {0}")]
    RequestParsing(#[from] serde_json::Error),
    #[error("HTTP error: {0}")]
@ -42,8 +44,11 @@ pub async fn agent_chat(
    _: String,
    agents_list: Arc<tokio::sync::RwLock<Option<Vec<common::configuration::Agent>>>>,
    listeners: Arc<tokio::sync::RwLock<Vec<common::configuration::Listener>>>,
+    span_attributes: Arc<Option<SpanAttributes>>,
    llm_providers: Arc<RwLock<LlmProviders>>,
 ) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
+    let custom_attrs =
+        collect_custom_trace_attributes(request.headers(), span_attributes.as_ref().as_ref());
    // Extract request_id from headers or generate a new one
    let request_id: String = match request
        .headers()
@ -76,6 +81,7 @@ pub async fn agent_chat(
            listeners,
            llm_providers,
            request_id,
+            custom_attrs,
        )
        .await
        {
@ -103,16 +109,15 @@ pub async fn agent_chat(
                        "agent_response": body
                    });

+                    let status_code = hyper::StatusCode::from_u16(*status)
+                        .unwrap_or(hyper::StatusCode::INTERNAL_SERVER_ERROR);
+
                    let json_string = error_json.to_string();
-                    let mut response =
-                        Response::new(ResponseHandler::create_full_body(json_string));
-                    *response.status_mut() = hyper::StatusCode::from_u16(*status)
-                        .unwrap_or(hyper::StatusCode::BAD_REQUEST);
-                    response.headers_mut().insert(
-                        hyper::header::CONTENT_TYPE,
-                        "application/json".parse().unwrap(),
-                    );
-                    return Ok(response);
+                    return Ok(BrightStaffError::ForwardedError {
+                        status_code,
+                        message: json_string,
+                    }
+                    .into_response());
                }

                // Print detailed error information with full error chain for other errors
@ -145,8 +150,11 @@ pub async fn agent_chat(
                // Log the error for debugging
                info!(error = %error_json, "structured error info");

-                // Return JSON error response
-                Ok(ResponseHandler::create_json_error_response(&error_json))
+                Ok(BrightStaffError::ForwardedError {
+                    status_code: StatusCode::BAD_REQUEST,
+                    message: error_json.to_string(),
+                }
+                .into_response())
            }
        }
    }
@ -161,6 +169,7 @@ async fn handle_agent_chat_inner(
    listeners: Arc<tokio::sync::RwLock<Vec<common::configuration::Listener>>>,
    llm_providers: Arc<RwLock<LlmProviders>>,
    request_id: String,
+    custom_attrs: std::collections::HashMap<String, String>,
 ) -> Result<Response<BoxBody<Bytes, hyper::Error>>, AgentFilterChainError> {
    // Initialize services
    let agent_selector = AgentSelector::new(orchestrator_service);
@ -183,6 +192,9 @@ async fn handle_agent_chat_inner(

    get_active_span(|span| {
        span.update_name(listener.name.to_string());
+        for (key, value) in &custom_attrs {
+            span.set_attribute(opentelemetry::KeyValue::new(key.clone(), value.clone()));
+        }
    });

    info!(listener = %listener.name, "handling request");
@ -249,10 +261,7 @@ async fn handle_agent_chat_inner(
            None => {
                let err_msg = "No model specified in request and no default provider configured";
                warn!("{}", err_msg);
-                let mut bad_request =
-                    Response::new(ResponseHandler::create_full_body(err_msg.to_string()));
-                *bad_request.status_mut() = StatusCode::BAD_REQUEST;
-                return Ok(bad_request);
+                return Ok(BrightStaffError::NoModelSpecified.into_response());
            }
        }
    }
@ -348,6 +357,9 @@ async fn handle_agent_chat_inner(
            set_service_name(operation_component::AGENT);
            get_active_span(|span| {
                span.update_name(format!("{} /v1/chat/completions", agent_name));
+                for (key, value) in &custom_attrs {
+                    span.set_attribute(opentelemetry::KeyValue::new(key.clone(), value.clone()));
+                }
            });

            pipeline_processor
--- a/crates/brightstaff/src/handlers/integration_tests.rs
+++ b/crates/brightstaff/src/handlers/integration_tests.rs
@ -5,9 +5,10 @@ use hyper::header::HeaderMap;

 use crate::handlers::agent_selector::{AgentSelectionError, AgentSelector};
 use crate::handlers::pipeline_processor::PipelineProcessor;
-use crate::handlers::response_handler::ResponseHandler;
 use crate::router::plano_orchestrator::OrchestratorService;
-
+use common::errors::BrightStaffError;
+use http_body_util::BodyExt;
+use hyper::StatusCode;
 /// Integration test that demonstrates the modular agent chat flow
 /// This test shows how the three main components work together:
 /// 1. AgentSelector - selects the appropriate agents based on orchestration
@ -129,8 +130,24 @@ mod tests {
        }

        // Test 4: Error Response Creation
-        let error_response = ResponseHandler::create_bad_request("Test error");
-        assert_eq!(error_response.status(), hyper::StatusCode::BAD_REQUEST);
+        let err = BrightStaffError::ModelNotFound("gpt-5-secret".to_string());
+        let response = err.into_response();
+
+        assert_eq!(response.status(), StatusCode::NOT_FOUND);
+
+        // Collect the response body and parse it as JSON
+        let body_bytes = response.into_body().collect().await.unwrap().to_bytes();
+        let body: serde_json::Value = serde_json::from_slice(&body_bytes).unwrap();
+
+        assert_eq!(body["error"]["code"], "ModelNotFound");
+        assert_eq!(
+            body["error"]["details"]["rejected_model_id"],
+            "gpt-5-secret"
+        );
+        assert!(body["error"]["message"]
+            .as_str()
+            .unwrap()
+            .contains("gpt-5-secret"));

        println!("✅ All modular components working correctly!");
    }
@ -149,12 +166,21 @@ mod tests {
            AgentSelectionError::ListenerNotFound(_)
        ));

-        // Test error response creation
-        let error_response = ResponseHandler::create_internal_error("Pipeline failed");
-        assert_eq!(
-            error_response.status(),
-            hyper::StatusCode::INTERNAL_SERVER_ERROR
-        );
+        let technical_reason = "Database connection timed out";
+        let err = BrightStaffError::InternalServerError(technical_reason.to_string());
+
+        let response = err.into_response();
+
+        // Collect the response body into bytes
+        let body_bytes = response.into_body().collect().await.unwrap().to_bytes();
+
+        // Parse the body bytes as JSON
+        let body_json: serde_json::Value =
+            serde_json::from_slice(&body_bytes).expect("Failed to parse JSON body");
+
+        // Check the error code and the reason reported in the details
+        assert_eq!(body_json["error"]["code"], "InternalServerError");
+        assert_eq!(body_json["error"]["details"]["reason"], technical_reason);

        println!("✅ Error handling working correctly!");
    }
--- a/crates/brightstaff/src/handlers/llm.rs
+++ b/crates/brightstaff/src/handlers/llm.rs
@ -1,5 +1,5 @@
 use bytes::Bytes;
-use common::configuration::{Agent, AgentFilterChain, Listener, ModelAlias};
+use common::configuration::{Agent, AgentFilterChain, Listener, ModelAlias, SpanAttributes};
 use common::consts::{
    ARCH_IS_STREAMING_HEADER, ARCH_PROVIDER_HINT_HEADER, REQUEST_ID_HEADER, TRACE_PARENT_HEADER,
 };
@ -30,13 +30,11 @@ use crate::state::response_state_processor::ResponsesStateProcessor;
 use crate::state::{
    extract_input_items, retrieve_and_combine_input, StateStorage, StateStorageError,
 };
-use crate::tracing::{llm as tracing_llm, operation_component, set_service_name};
+use crate::tracing::{
+    collect_custom_trace_attributes, llm as tracing_llm, operation_component, set_service_name,
+};

-fn full<T: Into<Bytes>>(chunk: T) -> BoxBody<Bytes, hyper::Error> {
-    Full::new(chunk.into())
-        .map_err(|never| match never {})
-        .boxed()
-}
+use common::errors::BrightStaffError;

 #[allow(clippy::too_many_arguments)]
 pub async fn llm_chat(
@ -45,6 +43,7 @@ pub async fn llm_chat(
    full_qualified_llm_provider_url: String,
    model_aliases: Arc<Option<HashMap<String, ModelAlias>>>,
    llm_providers: Arc<RwLock<LlmProviders>>,
+    span_attributes: Arc<Option<SpanAttributes>>,
    state_storage: Option<Arc<dyn StateStorage>>,
    listeners: Arc<RwLock<Vec<Listener>>>,
    agents_list: Arc<RwLock<Option<Vec<Agent>>>>,
@ -59,6 +58,8 @@ pub async fn llm_chat(
        Some(id) => id,
        None => uuid::Uuid::new_v4().to_string(),
    };
+    let custom_attrs =
+        collect_custom_trace_attributes(&request_headers, span_attributes.as_ref().as_ref());

    // Create a span with request_id that will be included in all log lines
    let request_span = info_span!(
@ -80,6 +81,7 @@ pub async fn llm_chat(
        full_qualified_llm_provider_url,
        model_aliases,
        llm_providers,
+        custom_attrs,
        state_storage,
        request_id,
        request_path,
@ -98,6 +100,7 @@ async fn llm_chat_inner(
    full_qualified_llm_provider_url: String,
    model_aliases: Arc<Option<HashMap<String, ModelAlias>>>,
    llm_providers: Arc<RwLock<LlmProviders>>,
+    custom_attrs: HashMap<String, String>,
    state_storage: Option<Arc<dyn StateStorage>>,
    request_id: String,
    request_path: String,
@ -107,6 +110,11 @@ async fn llm_chat_inner(
 ) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
    // Set service name for LLM operations
    set_service_name(operation_component::LLM);
+    get_active_span(|span| {
+        for (key, value) in &custom_attrs {
+            span.set_attribute(opentelemetry::KeyValue::new(key.clone(), value.clone()));
+        }
+    });

    // Extract or generate traceparent - this establishes the trace context for all spans
    let traceparent: String = match request_headers
@ -144,10 +152,11 @@ async fn llm_chat_inner(
                error = %err,
                "failed to parse request as ProviderRequestType"
            );
-            let err_msg = format!("Failed to parse request: {}", err);
-            let mut bad_request = Response::new(full(err_msg));
-            *bad_request.status_mut() = StatusCode::BAD_REQUEST;
-            return Ok(bad_request);
+            return Ok(BrightStaffError::InvalidRequest(format!(
+                "Failed to parse request: {}",
+                err
+            ))
+            .into_response());
        }
    };

@ -172,9 +181,7 @@ async fn llm_chat_inner(
            None => {
                let err_msg = "No model specified in request and no default provider configured";
                warn!("{}", err_msg);
-                let mut bad_request = Response::new(full(err_msg.to_string()));
-                *bad_request.status_mut() = StatusCode::BAD_REQUEST;
-                return Ok(bad_request);
+                return Ok(BrightStaffError::NoModelSpecified.into_response());
            }
        }
    } else {
@ -195,14 +202,8 @@ async fn llm_chat_inner(
        .get(&alias_resolved_model)
        .is_none()
    {
-        let err_msg = format!(
-            "Model '{}' not found in configured providers",
-            alias_resolved_model
-        );
        warn!(model = %alias_resolved_model, "model not found in configured providers");
-        let mut bad_request = Response::new(full(err_msg));
-        *bad_request.status_mut() = StatusCode::BAD_REQUEST;
-        return Ok(bad_request);
+        return Ok(BrightStaffError::ModelNotFound(alias_resolved_model).into_response());
    }

    // Handle provider/model slug format (e.g., "openai/gpt-4")
@ -261,12 +262,7 @@ async fn llm_chat_inner(
                let agents_guard = agents_list.read().await;
                let agent_map: HashMap<String, Agent> = agents_guard
                    .as_ref()
-                    .map(|agents| {
-                        agents
-                            .iter()
-                            .map(|a| (a.id.clone(), a.clone()))
-                            .collect()
-                    })
+                    .map(|agents| agents.iter().map(|a| (a.id.clone(), a.clone())).collect())
                    .unwrap_or_default();

                // Create a temporary AgentFilterChain to reuse PipelineProcessor
@ -387,13 +383,10 @@ async fn llm_chat_inner(
                        Err(StateStorageError::NotFound(_)) => {
                            // Return 409 Conflict when previous_response_id not found
                            warn!(previous_response_id = %prev_resp_id, "previous response_id not found");
-                            let err_msg = format!(
-                                "Conversation state not found for previous_response_id: {}",
-                                prev_resp_id
-                            );
-                            let mut conflict_response = Response::new(full(err_msg));
-                            *conflict_response.status_mut() = StatusCode::CONFLICT;
-                            return Ok(conflict_response);
+                            return Ok(BrightStaffError::ConversationStateNotFound(
+                                prev_resp_id.to_string(),
+                            )
+                            .into_response());
                        }
                        Err(e) => {
                            // Log warning but continue on other storage errors
@ -444,9 +437,11 @@ async fn llm_chat_inner(
    {
        Ok(result) => result,
        Err(err) => {
-            let mut internal_error = Response::new(full(err.message));
-            *internal_error.status_mut() = err.status_code;
-            return Ok(internal_error);
+            return Ok(BrightStaffError::ForwardedError {
+                status_code: err.status_code,
+                message: err.message,
+            }
+            .into_response());
        }
    };

@ -512,10 +507,11 @@ async fn llm_chat_inner(
    {
        Ok(res) => res,
        Err(err) => {
-            let err_msg = format!("Failed to send request: {}", err);
-            let mut internal_error = Response::new(full(err_msg));
-            *internal_error.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
-            return Ok(internal_error);
+            return Ok(BrightStaffError::InternalServerError(format!(
+                "Failed to send request: {}",
+                err
+            ))
+            .into_response());
        }
    };

@ -572,12 +568,11 @@ async fn llm_chat_inner(

    match response.body(streaming_response.body) {
        Ok(response) => Ok(response),
-        Err(err) => {
-            let err_msg = format!("Failed to create response: {}", err);
-            let mut internal_error = Response::new(full(err_msg));
-            *internal_error.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
-            Ok(internal_error)
-        }
+        Err(err) => Ok(BrightStaffError::InternalServerError(format!(
+            "Failed to create response: {}",
+            err
+        ))
+        .into_response()),
    }
 }

@ -650,3 +645,9 @@ async fn get_provider_info(
        (hermesllm::ProviderId::OpenAI, None)
    }
 }
+
+/// Wraps `chunk` in a `Full` body and boxes it; the empty `match` converts
+/// `Full`'s uninhabited error type into the `hyper::Error` the signature needs.
+fn full<T: Into<Bytes>>(chunk: T) -> BoxBody<Bytes, hyper::Error> {
+    let body = Full::new(chunk.into());
+    body.map_err(|never| match never {}).boxed()
+}
--- a/crates/brightstaff/src/handlers/mod.rs
+++ b/crates/brightstaff/src/handlers/mod.rs
@ -7,6 +7,7 @@ pub mod models;
 pub mod pipeline_processor;
 pub mod response_handler;
 pub mod router_chat;
+pub mod routing_service;
 pub mod utils;

 #[cfg(test)]
--- a/crates/brightstaff/src/handlers/response_handler.rs
+++ b/crates/brightstaff/src/handlers/response_handler.rs
@ -1,25 +1,17 @@
 use bytes::Bytes;
+use common::errors::BrightStaffError;
 use hermesllm::apis::OpenAIApi;
 use hermesllm::clients::{SupportedAPIsFromClient, SupportedUpstreamAPIs};
 use hermesllm::SseEvent;
 use http_body_util::combinators::BoxBody;
 use http_body_util::{BodyExt, Full, StreamBody};
 use hyper::body::Frame;
-use hyper::{Response, StatusCode};
+use hyper::Response;
 use tokio::sync::mpsc;
 use tokio_stream::wrappers::ReceiverStream;
 use tokio_stream::StreamExt;
 use tracing::{info, warn, Instrument};

-/// Errors that can occur during response handling
-#[derive(Debug, thiserror::Error)]
-pub enum ResponseError {
-    #[error("Failed to create response: {0}")]
-    ResponseCreationFailed(#[from] hyper::http::Error),
-    #[error("Stream error: {0}")]
-    StreamError(String),
-}
-
 /// Service for handling HTTP responses and streaming
 pub struct ResponseHandler;

@ -35,40 +27,6 @@ impl ResponseHandler {
            .boxed()
    }

-    /// Create an error response with a given status code and message
-    pub fn create_error_response(
-        status: StatusCode,
-        message: &str,
-    ) -> Response<BoxBody<Bytes, hyper::Error>> {
-        let mut response = Response::new(Self::create_full_body(message.to_string()));
-        *response.status_mut() = status;
-        response
-    }
-
-    /// Create a bad request response
-    pub fn create_bad_request(message: &str) -> Response<BoxBody<Bytes, hyper::Error>> {
-        Self::create_error_response(StatusCode::BAD_REQUEST, message)
-    }
-
-    /// Create an internal server error response
-    pub fn create_internal_error(message: &str) -> Response<BoxBody<Bytes, hyper::Error>> {
-        Self::create_error_response(StatusCode::INTERNAL_SERVER_ERROR, message)
-    }
-
-    /// Create a JSON error response
-    pub fn create_json_error_response(
-        error_json: &serde_json::Value,
-    ) -> Response<BoxBody<Bytes, hyper::Error>> {
-        let json_string = error_json.to_string();
-        let mut response = Response::new(Self::create_full_body(json_string));
-        *response.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
-        response.headers_mut().insert(
-            hyper::header::CONTENT_TYPE,
-            "application/json".parse().unwrap(),
-        );
-        response
-    }
-
    /// Create a streaming response from a reqwest response.
    /// The spawned streaming task is instrumented with both `agent_span` and `orchestrator_span`
    /// so their durations reflect the actual time spent streaming to the client.
@ -77,13 +35,13 @@ impl ResponseHandler {
        llm_response: reqwest::Response,
        agent_span: tracing::Span,
        orchestrator_span: tracing::Span,
-    ) -> Result<Response<BoxBody<Bytes, hyper::Error>>, ResponseError> {
+    ) -> Result<Response<BoxBody<Bytes, hyper::Error>>, BrightStaffError> {
        // Copy headers from the original response
        let response_headers = llm_response.headers();
        let mut response_builder = Response::builder();

        let headers = response_builder.headers_mut().ok_or_else(|| {
-            ResponseError::StreamError("Failed to get mutable headers".to_string())
+            BrightStaffError::StreamError("Failed to get mutable headers".to_string())
        })?;

        for (header_name, header_value) in response_headers.iter() {
@ -123,7 +81,7 @@ impl ResponseHandler {

        response_builder
            .body(stream_body)
-            .map_err(ResponseError::from)
+            .map_err(BrightStaffError::from)
    }

    /// Collect the full response body as a string
@ -136,7 +94,7 @@ impl ResponseHandler {
    pub async fn collect_full_response(
        &self,
        llm_response: reqwest::Response,
-    ) -> Result<String, ResponseError> {
+    ) -> Result<String, BrightStaffError> {
        use hermesllm::apis::streaming_shapes::sse::SseStreamIter;

        let response_headers = llm_response.headers();
@ -144,10 +102,9 @@ impl ResponseHandler {
            .get(hyper::header::CONTENT_TYPE)
            .is_some_and(|v| v.to_str().unwrap_or("").contains("text/event-stream"));

-        let response_bytes = llm_response
-            .bytes()
-            .await
-            .map_err(|e| ResponseError::StreamError(format!("Failed to read response: {}", e)))?;
+        let response_bytes = llm_response.bytes().await.map_err(|e| {
+            BrightStaffError::StreamError(format!("Failed to read response: {}", e))
+        })?;

        if is_sse_streaming {
            let client_api =
@ -185,7 +142,7 @@ impl ResponseHandler {
        } else {
            // If not SSE, treat as regular text response
            let response_text = String::from_utf8(response_bytes.to_vec()).map_err(|e| {
-                ResponseError::StreamError(format!("Failed to decode response: {}", e))
+                BrightStaffError::StreamError(format!("Failed to decode response: {}", e))
            })?;

            Ok(response_text)
@ -204,42 +161,6 @@ mod tests {
    use super::*;
    use hyper::StatusCode;

-    #[test]
-    fn test_create_bad_request() {
-        let response = ResponseHandler::create_bad_request("Invalid request");
-        assert_eq!(response.status(), StatusCode::BAD_REQUEST);
-    }
-
-    #[test]
-    fn test_create_internal_error() {
-        let response = ResponseHandler::create_internal_error("Server error");
-        assert_eq!(response.status(), StatusCode::INTERNAL_SERVER_ERROR);
-    }
-
-    #[test]
-    fn test_create_error_response() {
-        let response =
-            ResponseHandler::create_error_response(StatusCode::NOT_FOUND, "Resource not found");
-        assert_eq!(response.status(), StatusCode::NOT_FOUND);
-    }
-
-    #[test]
-    fn test_create_json_error_response() {
-        let error_json = serde_json::json!({
-            "error": {
-                "type": "TestError",
-                "message": "Test error message"
-            }
-        });
-
-        let response = ResponseHandler::create_json_error_response(&error_json);
-        assert_eq!(response.status(), StatusCode::INTERNAL_SERVER_ERROR);
-        assert_eq!(
-            response.headers().get("content-type").unwrap(),
-            "application/json"
-        );
-    }
-
    #[tokio::test]
    async fn test_create_streaming_response_with_mock() {
        use mockito::Server;
--- a/crates/brightstaff/src/handlers/router_chat.rs
+++ b/crates/brightstaff/src/handlers/router_chat.rs
@ -10,6 +10,7 @@ use crate::tracing::routing;

 pub struct RoutingResult {
    pub model_name: String,
+    pub route_name: Option<String>,
 }

 pub struct RoutingError {
@ -133,9 +134,12 @@ pub async fn router_chat_get_upstream_model(

    match routing_result {
        Ok(route) => match route {
-            Some((_, model_name)) => {
+            Some((route_name, model_name)) => {
                current_span.record("route.selected_model", model_name.as_str());
-                Ok(RoutingResult { model_name })
+                Ok(RoutingResult {
+                    model_name,
+                    route_name: Some(route_name),
+                })
            }
            None => {
                // No route determined, return sentinel value "none"
@ -145,6 +149,7 @@ pub async fn router_chat_get_upstream_model(

                Ok(RoutingResult {
                    model_name: "none".to_string(),
+                    route_name: None,
                })
            }
        },
--- a/crates/brightstaff/src/handlers/routing_service.rs
+++ b/crates/brightstaff/src/handlers/routing_service.rs
@ -0,0 +1,163 @@
+use bytes::Bytes;
+use common::configuration::SpanAttributes;
+use common::consts::{REQUEST_ID_HEADER, TRACE_PARENT_HEADER};
+use common::errors::BrightStaffError;
+use hermesllm::clients::SupportedAPIsFromClient;
+use hermesllm::ProviderRequestType;
+use http_body_util::combinators::BoxBody;
+use http_body_util::{BodyExt, Full};
+use hyper::{Request, Response, StatusCode};
+use std::sync::Arc;
+use tracing::{debug, info, info_span, warn, Instrument};
+
+use crate::handlers::router_chat::router_chat_get_upstream_model;
+use crate::router::llm_router::RouterService;
+use crate::tracing::{collect_custom_trace_attributes, operation_component, set_service_name};
+
+/// JSON payload returned by `routing_decision` when routing succeeds.
+///
+/// Serialized with serde, so the field names below are the JSON keys.
+#[derive(serde::Serialize)]
+struct RoutingDecisionResponse {
+    /// Model name reported by the routing logic (`RoutingResult::model_name`).
+    model: String,
+    /// Route name reported by the routing logic, if any (`RoutingResult::route_name`).
+    route: Option<String>,
+    /// Second `-`-separated field of the traceparent used for this request.
+    trace_id: String,
+}
+
+/// Entry point for the routing-decision endpoint.
+///
+/// Resolves a request id (the `REQUEST_ID_HEADER` value, or a freshly generated
+/// UUID when that header is absent or not valid text), collects the configured
+/// custom trace attributes, and runs `routing_decision_inner` inside a
+/// `routing_decision` tracing span.
+pub async fn routing_decision(
+    request: Request<hyper::body::Incoming>,
+    router_service: Arc<RouterService>,
+    request_path: String,
+    span_attributes: Arc<Option<SpanAttributes>>,
+) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
+    // Clone the headers up front: `request` itself is moved into the inner handler.
+    let request_headers = request.headers().clone();
+    let request_id = match request_headers
+        .get(REQUEST_ID_HEADER)
+        .and_then(|value| value.to_str().ok())
+    {
+        Some(id) => id.to_owned(),
+        None => uuid::Uuid::new_v4().to_string(),
+    };
+
+    let custom_attrs =
+        collect_custom_trace_attributes(&request_headers, span_attributes.as_ref().as_ref());
+
+    // Build the span while `request` is still available: it records the HTTP method.
+    let request_span = info_span!(
+        "routing_decision",
+        component = "routing",
+        request_id = %request_id,
+        http.method = %request.method(),
+        http.path = %request_path,
+    );
+
+    let decision = routing_decision_inner(
+        request,
+        router_service,
+        request_id,
+        request_path,
+        request_headers,
+        custom_attrs,
+    );
+    decision.instrument(request_span).await
+}
+
+/// Computes a routing decision for one request and returns it as JSON.
+///
+/// Steps: tag the active span with `custom_attrs`, resolve a traceparent (from the
+/// `TRACE_PARENT_HEADER` header, or a generated one), buffer the request body,
+/// parse it for the client API identified by `request_path`, and delegate to
+/// `router_chat_get_upstream_model`.
+///
+/// Returns `200 OK` with a serialized `RoutingDecisionResponse` on success.
+/// Unsupported paths, unparsable bodies and routing failures are turned into
+/// responses via `BrightStaffError::into_response`; only a failure while reading
+/// the request body is propagated as `Err(hyper::Error)`.
+async fn routing_decision_inner(
+    request: Request<hyper::body::Incoming>,
+    router_service: Arc<RouterService>,
+    request_id: String,
+    request_path: String,
+    request_headers: hyper::HeaderMap,
+    custom_attrs: std::collections::HashMap<String, String>,
+) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
+    set_service_name(operation_component::ROUTING);
+    opentelemetry::trace::get_active_span(|span| {
+        for (key, value) in &custom_attrs {
+            span.set_attribute(opentelemetry::KeyValue::new(key.clone(), value.clone()));
+        }
+    });
+
+    // Extract or generate traceparent
+    let traceparent: String = match request_headers
+        .get(TRACE_PARENT_HEADER)
+        .and_then(|h| h.to_str().ok())
+        .map(|s| s.to_string())
+    {
+        Some(tp) => tp,
+        None => {
+            // A v4 UUID without dashes is 32 hex chars: one is used as the trace id and
+            // the first 16 chars of another as the parent id. W3C Trace Context treats
+            // an all-zero parent id as invalid; the UUID version digit falls inside the
+            // first 16 chars, so the generated id is never all zeros.
+            let trace_id = uuid::Uuid::new_v4().to_string().replace("-", "");
+            let span_id = uuid::Uuid::new_v4().to_string().replace("-", "");
+            let generated_tp = format!("00-{}-{}-01", trace_id, &span_id[..16]);
+            warn!(
+                generated_traceparent = %generated_tp,
+                "TRACE_PARENT header missing, generated new traceparent"
+            );
+            generated_tp
+        }
+    };
+
+    // Extract trace_id from traceparent (format: 00-{trace_id}-{span_id}-{flags})
+    let trace_id = traceparent
+        .split('-')
+        .nth(1)
+        .unwrap_or("unknown")
+        .to_string();
+
+    // Answer with an error response instead of panicking when the path does not map
+    // to a known client API.
+    let Some(client_api) = SupportedAPIsFromClient::from_endpoint(request_path.as_str()) else {
+        warn!(path = %request_path, "unsupported path for routing decision");
+        return Ok(BrightStaffError::InvalidRequest(format!(
+            "Unsupported path for routing decision: {}",
+            request_path
+        ))
+        .into_response());
+    };
+
+    // Parse request body
+    let chat_request_bytes = request.collect().await?.to_bytes();
+
+    debug!(
+        body = %String::from_utf8_lossy(&chat_request_bytes),
+        "routing decision request body received"
+    );
+
+    let client_request =
+        match ProviderRequestType::try_from((&chat_request_bytes[..], &client_api)) {
+            Ok(request) => request,
+            Err(err) => {
+                warn!(error = %err, "failed to parse request for routing decision");
+                return Ok(BrightStaffError::InvalidRequest(format!(
+                    "Failed to parse request: {}",
+                    err
+                ))
+                .into_response());
+            }
+        };
+
+    // Call the existing routing logic
+    let routing_result = router_chat_get_upstream_model(
+        router_service,
+        client_request,
+        &traceparent,
+        &request_path,
+        &request_id,
+    )
+    .await;
+
+    match routing_result {
+        Ok(result) => {
+            let response = RoutingDecisionResponse {
+                model: result.model_name,
+                route: result.route_name,
+                trace_id,
+            };
+
+            info!(
+                model = %response.model,
+                route = ?response.route,
+                "routing decision completed"
+            );
+
+            // The payload holds only strings, and the status/header below are fixed
+            // values, so these unwraps are not expected to fail.
+            let json = serde_json::to_string(&response).unwrap();
+            let body = Full::new(Bytes::from(json))
+                .map_err(|never| match never {})
+                .boxed();
+
+            Ok(Response::builder()
+                .status(StatusCode::OK)
+                .header("Content-Type", "application/json")
+                .body(body)
+                .unwrap())
+        }
+        Err(err) => {
+            warn!(error = %err.message, "routing decision failed");
+            Ok(BrightStaffError::InternalServerError(err.message).into_response())
+        }
+    }
+}
--- a/crates/brightstaff/src/main.rs
+++ b/crates/brightstaff/src/main.rs
@ -2,6 +2,7 @@ use brightstaff::handlers::agent_chat_completions::agent_chat;
 use brightstaff::handlers::function_calling::function_calling_chat_handler;
 use brightstaff::handlers::llm::llm_chat;
 use brightstaff::handlers::models::list_models;
+use brightstaff::handlers::routing_service::routing_decision;
 use brightstaff::router::llm_router::RouterService;
 use brightstaff::router::plano_orchestrator::OrchestratorService;
 use brightstaff::state::memory::MemoryConversationalStorage;
@ -114,6 +115,12 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    ));

    let model_aliases = Arc::new(plano_config.model_aliases.clone());
+    let span_attributes = Arc::new(
+        plano_config
+            .tracing
+            .as_ref()
+            .and_then(|tracing| tracing.span_attributes.clone()),
+    );

    // Initialize trace collector and start background flusher
    // Tracing is enabled if the tracing config is present in plano_config.yaml
@ -173,6 +180,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
        let llm_providers = llm_providers.clone();
        let agents_list = combined_agents_filters_list.clone();
        let listeners = listeners.clone();
+        let span_attributes = span_attributes.clone();
        let state_storage = state_storage.clone();
        let service = service_fn(move |req| {
            let router_service = Arc::clone(&router_service);
@ -183,10 +191,11 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
            let model_aliases = Arc::clone(&model_aliases);
            let agents_list = agents_list.clone();
            let listeners = listeners.clone();
+            let span_attributes = span_attributes.clone();
            let state_storage = state_storage.clone();

            async move {
-                let path = req.uri().path();
+                let path = req.uri().path().to_string();
                // Check if path starts with /agents
                if path.starts_with("/agents") {
                    // Check if it matches one of the agent API paths
@ -202,13 +211,30 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
                            fully_qualified_url,
                            agents_list,
                            listeners,
+                            span_attributes,
                            llm_providers,
                        )
                        .with_context(parent_cx)
                        .await;
                    }
                }
-                match (req.method(), path) {
+                if let Some(stripped_path) = path.strip_prefix("/routing") {
+                    let stripped_path = stripped_path.to_string();
+                    if matches!(
+                        stripped_path.as_str(),
+                        CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH
+                    ) {
+                        return routing_decision(
+                            req,
+                            router_service,
+                            stripped_path,
+                            span_attributes,
+                        )
+                        .with_context(parent_cx)
+                        .await;
+                    }
+                }
+                match (req.method(), path.as_str()) {
                    (
                        &Method::POST,
                        CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH,
@ -220,6 +246,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
                            fully_qualified_url,
                            model_aliases,
                            llm_providers,
+                            span_attributes,
                            state_storage,
                            listeners,
                            agents_list,
@ -262,7 +289,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
                        Ok(response)
                    }
                    _ => {
-                        debug!(method = %req.method(), path = %req.uri().path(), "no route found");
+                        debug!(method = %req.method(), path = %path, "no route found");
                        let mut not_found = Response::new(empty());
                        *not_found.status_mut() = StatusCode::NOT_FOUND;
                        Ok(not_found)
--- a/crates/brightstaff/src/tracing/custom_attributes.rs
+++ b/crates/brightstaff/src/tracing/custom_attributes.rs
@ -0,0 +1,156 @@
+use std::collections::HashMap;
+
+use common::configuration::SpanAttributes;
+use common::traces::SpanBuilder;
+use hyper::header::HeaderMap;
+
+/// Collects the custom attributes to attach to a request's trace spans.
+///
+/// Two sources from `span_attributes` are combined:
+/// * `static_attributes` are copied in first, unchanged.
+/// * Each request header whose name starts with one of `header_prefixes`
+///   adds an attribute. The key is the header name with the prefix removed,
+///   leading `-` trimmed and remaining `-` replaced by `.`; the value is the
+///   trimmed header value. For example, with prefix `x-katanemo-` the header
+///   `x-katanemo-tenant-id: ten_456` yields `tenant.id = ten_456`.
+///
+/// Header-derived attributes overwrite static attributes with the same key.
+/// When several prefixes match a header, the first one listed in
+/// `header_prefixes` is used.
+///
+/// Prefixes are compared ASCII case-insensitively: HTTP header names are
+/// case-insensitive and `HeaderMap` exposes them in lowercase, so a prefix
+/// configured as `X-Katanemo-` must still match `x-katanemo-tenant-id`.
+///
+/// Returns an empty map when `span_attributes` is `None`. Headers whose value
+/// `HeaderValue::to_str` rejects, or whose name has nothing left after the
+/// prefix, are skipped.
+pub fn collect_custom_trace_attributes(
+    headers: &HeaderMap,
+    span_attributes: Option<&SpanAttributes>,
+) -> HashMap<String, String> {
+    let mut attributes = HashMap::new();
+    let Some(span_attributes) = span_attributes else {
+        return attributes;
+    };
+
+    if let Some(static_attributes) = span_attributes.static_attributes.as_ref() {
+        attributes.extend(
+            static_attributes
+                .iter()
+                .map(|(key, value)| (key.clone(), value.clone())),
+        );
+    }
+
+    let Some(header_prefixes) = span_attributes.header_prefixes.as_deref() else {
+        return attributes;
+    };
+    if header_prefixes.is_empty() {
+        return attributes;
+    }
+
+    for (name, value) in headers.iter() {
+        let header_name = name.as_str();
+
+        // Remainder of the header name after the first configured prefix
+        // that matches, ignoring ASCII case.
+        let matched_suffix = header_prefixes.iter().find_map(|prefix| {
+            // `get` yields `None` when the name is shorter than the prefix.
+            let head = header_name.get(..prefix.len())?;
+            head.eq_ignore_ascii_case(prefix)
+                .then(|| &header_name[prefix.len()..])
+        });
+        let Some(suffix) = matched_suffix else {
+            continue;
+        };
+
+        let Some(raw_value) = value.to_str().ok().map(str::trim) else {
+            continue;
+        };
+
+        let suffix_key = suffix.trim_start_matches('-').replace('-', ".");
+        if suffix_key.is_empty() {
+            continue;
+        }
+
+        attributes.insert(suffix_key, raw_value.to_string());
+    }
+
+    attributes
+}
+
+/// Adds every key/value pair in `attributes` to `span_builder` via
+/// `with_attribute` and returns the resulting builder.
+pub fn append_span_attributes(
+    span_builder: SpanBuilder,
+    attributes: &HashMap<String, String>,
+) -> SpanBuilder {
+    attributes
+        .iter()
+        .fold(span_builder, |builder, (name, val)| {
+            builder.with_attribute(name, val)
+        })
+}
+
+/// Unit tests for `collect_custom_trace_attributes`.
+#[cfg(test)]
+mod tests {
+    use super::collect_custom_trace_attributes;
+    use common::configuration::SpanAttributes;
+    use hyper::header::{HeaderMap, HeaderValue};
+    use std::collections::HashMap;
+
+    #[test]
+    fn extracts_headers_by_prefix() {
+        let mut headers = HeaderMap::new();
+        headers.insert("x-katanemo-tenant-id", HeaderValue::from_static("ten_456"));
+        headers.insert("x-katanemo-user-id", HeaderValue::from_static("usr_789"));
+        headers.insert("x-katanemo-admin-level", HeaderValue::from_static("3"));
+        headers.insert("x-other-id", HeaderValue::from_static("ignored"));
+
+        let attrs = collect_custom_trace_attributes(
+            &headers,
+            Some(&SpanAttributes {
+                header_prefixes: Some(vec!["x-katanemo-".to_string()]),
+                static_attributes: None,
+            }),
+        );
+
+        assert_eq!(attrs.get("tenant.id"), Some(&"ten_456".to_string()));
+        assert_eq!(attrs.get("user.id"), Some(&"usr_789".to_string()));
+        assert_eq!(attrs.get("admin.level"), Some(&"3".to_string()));
+        assert!(!attrs.contains_key("other.id"));
+    }
+
+    #[test]
+    fn returns_empty_when_prefixes_missing_or_empty() {
+        let mut headers = HeaderMap::new();
+        headers.insert("x-katanemo-tenant-id", HeaderValue::from_static("ten_456"));
+
+        let attrs_none = collect_custom_trace_attributes(
+            &headers,
+            Some(&SpanAttributes {
+                header_prefixes: None,
+                static_attributes: None,
+            }),
+        );
+        assert!(attrs_none.is_empty());
+
+        let attrs_empty = collect_custom_trace_attributes(
+            &headers,
+            Some(&SpanAttributes {
+                header_prefixes: Some(Vec::new()),
+                static_attributes: None,
+            }),
+        );
+        assert!(attrs_empty.is_empty());
+    }
+
+    #[test]
+    fn returns_empty_when_span_attributes_not_configured() {
+        let mut headers = HeaderMap::new();
+        headers.insert("x-katanemo-tenant-id", HeaderValue::from_static("ten_456"));
+
+        let attrs = collect_custom_trace_attributes(&headers, None);
+
+        assert!(attrs.is_empty());
+    }
+
+    #[test]
+    fn supports_multiple_prefixes() {
+        let mut headers = HeaderMap::new();
+        headers.insert("x-katanemo-tenant-id", HeaderValue::from_static("ten_456"));
+        headers.insert("x-tenant-user-id", HeaderValue::from_static("usr_789"));
+
+        let attrs = collect_custom_trace_attributes(
+            &headers,
+            Some(&SpanAttributes {
+                header_prefixes: Some(vec!["x-katanemo-".to_string(), "x-tenant-".to_string()]),
+                static_attributes: None,
+            }),
+        );
+
+        assert_eq!(attrs.get("tenant.id"), Some(&"ten_456".to_string()));
+        assert_eq!(attrs.get("user.id"), Some(&"usr_789".to_string()));
+    }
+
+    #[test]
+    fn header_attributes_override_static_attributes() {
+        let mut headers = HeaderMap::new();
+        headers.insert("x-katanemo-tenant-id", HeaderValue::from_static("ten_456"));
+
+        let mut static_attributes = HashMap::new();
+        static_attributes.insert("tenant.id".to_string(), "ten_static".to_string());
+        static_attributes.insert("environment".to_string(), "prod".to_string());
+
+        let attrs = collect_custom_trace_attributes(
+            &headers,
+            Some(&SpanAttributes {
+                header_prefixes: Some(vec!["x-katanemo-".to_string()]),
+                static_attributes: Some(static_attributes),
+            }),
+        );
+
+        assert_eq!(attrs.get("tenant.id"), Some(&"ten_456".to_string()));
+        assert_eq!(attrs.get("environment"), Some(&"prod".to_string()));
+    }
+
+    #[test]
+    fn static_attributes_returned_without_header_prefixes() {
+        let mut headers = HeaderMap::new();
+        headers.insert("x-katanemo-tenant-id", HeaderValue::from_static("ten_456"));
+
+        let mut static_attributes = HashMap::new();
+        static_attributes.insert("environment".to_string(), "prod".to_string());
+
+        let attrs = collect_custom_trace_attributes(
+            &headers,
+            Some(&SpanAttributes {
+                header_prefixes: None,
+                static_attributes: Some(static_attributes),
+            }),
+        );
+
+        // Static attributes are still emitted; no header is picked up.
+        assert_eq!(attrs.len(), 1);
+        assert_eq!(attrs.get("environment"), Some(&"prod".to_string()));
+    }
+
+    #[test]
+    fn skips_header_with_nothing_after_prefix() {
+        let mut headers = HeaderMap::new();
+        headers.insert("x-katanemo-", HeaderValue::from_static("orphan"));
+
+        let attrs = collect_custom_trace_attributes(
+            &headers,
+            Some(&SpanAttributes {
+                header_prefixes: Some(vec!["x-katanemo-".to_string()]),
+                static_attributes: None,
+            }),
+        );
+
+        // An empty attribute key is never inserted.
+        assert!(attrs.is_empty());
+    }
+}
--- a/crates/brightstaff/src/tracing/mod.rs
+++ b/crates/brightstaff/src/tracing/mod.rs
@ -1,9 +1,11 @@
 mod constants;
+mod custom_attributes;
 mod service_name_exporter;

 pub use constants::{
    error, http, llm, operation_component, routing, signals, OperationNameBuilder,
 };
+pub use custom_attributes::{append_span_attributes, collect_custom_trace_attributes};
 pub use service_name_exporter::{ServiceNameOverrideExporter, SERVICE_NAME_OVERRIDE_KEY};

 use opentelemetry::trace::get_active_span;
--- a/crates/common/Cargo.toml
+++ b/crates/common/Cargo.toml
@ -20,6 +20,9 @@ urlencoding = "2.1.3"
 url = "2.5.4"
 hermesllm = { version = "0.1.0", path = "../hermesllm" }
 serde_with = "3.13.0"
+hyper = "1.0"
+bytes = "1.0"
+http-body-util = "0.1"

 [features]
 default = []
@ -30,3 +33,6 @@ serde_json = "1.0.64"
 serial_test = "3.2"
 axum = "0.7"
 tokio = { version = "1.44", features = ["sync", "time", "macros", "rt"] }
+hyper = { version = "1.0", features = ["full"] }
+bytes = "1.0"
+http-body-util = "0.1"
--- a/crates/common/src/configuration.rs
+++ b/crates/common/src/configuration.rs
@ -93,6 +93,14 @@ pub struct Tracing {
    pub trace_arch_internal: Option<bool>,
    pub random_sampling: Option<u32>,
    pub opentracing_grpc_endpoint: Option<String>,
+    pub span_attributes: Option<SpanAttributes>,
+}
+
+/// Configuration for custom attributes added to trace spans; held by the
+/// `span_attributes` field of `Tracing`.
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
+pub struct SpanAttributes {
+    /// Header-name prefixes. Consumed by `collect_custom_trace_attributes`,
+    /// which turns request headers starting with one of these prefixes into
+    /// span attributes.
+    pub header_prefixes: Option<Vec<String>>,
+    /// Fixed key/value attributes. (De)serialized under the key `static`.
+    #[serde(rename = "static")]
+    pub static_attributes: Option<HashMap<String, String>>,
 }

 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Default)]
--- a/crates/common/src/errors.rs
+++ b/crates/common/src/errors.rs
@ -1,9 +1,13 @@
-use proxy_wasm::types::Status;
-
 use crate::{api::open_ai::ChatCompletionChunkResponseError, ratelimit};
+use bytes::Bytes;
 use hermesllm::apis::openai::OpenAIError;
+use http_body_util::{combinators::BoxBody, BodyExt, Full};
+use hyper::{Error as HyperError, Response, StatusCode};
+use proxy_wasm::types::Status;
+use serde_json::json;
+use thiserror::Error;

-#[derive(thiserror::Error, Debug)]
+#[derive(Error, Debug)]
 pub enum ClientError {
    #[error("Error dispatching HTTP call to `{upstream_name}/{path}`, error: {internal_status:?}")]
    DispatchError {
@ -13,7 +17,7 @@ pub enum ClientError {
    },
 }

-#[derive(thiserror::Error, Debug)]
+#[derive(Error, Debug)]
 pub enum ServerError {
    #[error(transparent)]
    HttpDispatch(ClientError),
@ -43,3 +47,174 @@ pub enum ServerError {
    #[error("error parsing openai message: {0}")]
    OpenAIPError(#[from] OpenAIError),
 }
+// -----------------------------------------------------------------------------
+// BrightStaff Errors (Standardized)
+// -----------------------------------------------------------------------------
+/// Standardized BrightStaff errors. `into_response` converts each variant
+/// into a JSON error response; the `#[error]` text becomes the response's
+/// `error.message`.
+#[derive(Debug, Error)]
+pub enum BrightStaffError {
+    /// The requested model name is unknown. Rendered as `404 Not Found`.
+    #[error("The requested model '{0}' does not exist")]
+    ModelNotFound(String),
+
+    /// No model in the request and no default provider. Rendered as
+    /// `400 Bad Request`.
+    #[error("No model specified in request and no default provider configured")]
+    NoModelSpecified,
+
+    /// Carries the `previous_response_id` that had no stored state. Rendered
+    /// as `409 Conflict`.
+    #[error("Conversation state not found for previous_response_id: {0}")]
+    ConversationStateNotFound(String),
+
+    /// Rendered as `500 Internal Server Error`. The message is fixed; the
+    /// wrapped reason is only exposed through `details.reason`.
+    #[error("Internal server error")]
+    InternalServerError(String),
+
+    /// Rendered as `400 Bad Request`. The message is fixed; the wrapped
+    /// reason is only exposed through `details.reason`.
+    #[error("Invalid request")]
+    InvalidRequest(String),
+
+    /// An error relayed with its own status code; `into_response` reuses
+    /// `status_code` for the response.
+    #[error("{message}")]
+    ForwardedError {
+        status_code: StatusCode,
+        message: String,
+    },
+
+    /// Rendered as `400 Bad Request`.
+    #[error("Stream error: {0}")]
+    StreamError(String),
+
+    /// Wraps a `hyper::http::Error` (convertible via `?` thanks to
+    /// `#[from]`). Rendered as `400 Bad Request`.
+    #[error("Failed to create response: {0}")]
+    ResponseCreationFailed(#[from] hyper::http::Error),
+}
+
+impl BrightStaffError {
+    /// Converts the error into an HTTP response with a JSON body of the form
+    /// `{"error": {"code": ..., "message": ..., "details": ...}}`.
+    ///
+    /// * `code` is the variant name.
+    /// * `message` is the error's `Display` text.
+    /// * `details` holds variant-specific fields (for example
+    ///   `rejected_model_id` or `reason`).
+    ///
+    /// The status code is chosen per variant; `ForwardedError` reuses the
+    /// status code it carries. If the response cannot be built, a plain
+    /// `500 Internal Server Error` response is returned instead.
+    pub fn into_response(self) -> Response<BoxBody<Bytes, HyperError>> {
+        let (status, code, details) = match &self {
+            BrightStaffError::ModelNotFound(model_name) => (
+                StatusCode::NOT_FOUND,
+                "ModelNotFound",
+                json!({ "rejected_model_id": model_name }),
+            ),
+
+            BrightStaffError::NoModelSpecified => {
+                (StatusCode::BAD_REQUEST, "NoModelSpecified", json!({}))
+            }
+
+            BrightStaffError::ConversationStateNotFound(prev_resp_id) => (
+                StatusCode::CONFLICT,
+                "ConversationStateNotFound",
+                json!({ "previous_response_id": prev_resp_id }),
+            ),
+
+            BrightStaffError::InternalServerError(reason) => (
+                StatusCode::INTERNAL_SERVER_ERROR,
+                "InternalServerError",
+                // Passing the reason into details for easier debugging
+                json!({ "reason": reason }),
+            ),
+
+            BrightStaffError::InvalidRequest(reason) => (
+                StatusCode::BAD_REQUEST,
+                "InvalidRequest",
+                json!({ "reason": reason }),
+            ),
+
+            BrightStaffError::ForwardedError {
+                status_code,
+                message,
+            } => (*status_code, "ForwardedError", json!({ "reason": message })),
+
+            BrightStaffError::StreamError(reason) => (
+                StatusCode::BAD_REQUEST,
+                "StreamError",
+                json!({ "reason": reason }),
+            ),
+
+            BrightStaffError::ResponseCreationFailed(reason) => (
+                StatusCode::BAD_REQUEST,
+                "ResponseCreationFailed",
+                json!({ "reason": reason.to_string() }),
+            ),
+        };
+
+        let body_json = json!({
+            "error": {
+                "code": code,
+                "message": self.to_string(),
+                "details": details
+            }
+        });
+
+        // `Full` never fails, but `BoxBody` is typed with `HyperError`, so the
+        // uninhabited error type is mapped away before boxing.
+        let boxed_body = Full::new(Bytes::from(body_json.to_string()))
+            .map_err(|never| match never {})
+            .boxed();
+
+        Response::builder()
+            .status(status)
+            .header("content-type", "application/json")
+            .body(boxed_body)
+            .unwrap_or_else(|_| {
+                // `Response::new` defaults to 200 OK; a failure to build the
+                // error response must not be reported as success.
+                let mut fallback = Response::new(
+                    Full::new(Bytes::from("Internal Error"))
+                        .map_err(|never| match never {})
+                        .boxed(),
+                );
+                *fallback.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                fallback
+            })
+    }
+}
+
+// Tests for `BrightStaffError::into_response`: status code and JSON body.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use http_body_util::BodyExt; // For .collect().await
+
+    #[tokio::test]
+    async fn test_model_not_found_format() {
+        let err = BrightStaffError::ModelNotFound("gpt-5-secret".to_string());
+        let response = err.into_response();
+
+        assert_eq!(response.status(), StatusCode::NOT_FOUND);
+
+        // Collect the whole body and parse it as JSON
+        let body_bytes = response.into_body().collect().await.unwrap().to_bytes();
+        let body: serde_json::Value = serde_json::from_slice(&body_bytes).unwrap();
+
+        assert_eq!(body["error"]["code"], "ModelNotFound");
+        assert_eq!(
+            body["error"]["details"]["rejected_model_id"],
+            "gpt-5-secret"
+        );
+        assert!(body["error"]["message"]
+            .as_str()
+            .unwrap()
+            .contains("gpt-5-secret"));
+    }
+
+    #[tokio::test]
+    async fn test_forwarded_error_preserves_status() {
+        let err = BrightStaffError::ForwardedError {
+            status_code: StatusCode::TOO_MANY_REQUESTS,
+            message: "Rate limit exceeded on agent side".to_string(),
+        };
+
+        let response = err.into_response();
+        assert_eq!(response.status(), StatusCode::TOO_MANY_REQUESTS);
+
+        let body_bytes = response.into_body().collect().await.unwrap().to_bytes();
+        let body: serde_json::Value = serde_json::from_slice(&body_bytes).unwrap();
+
+        assert_eq!(body["error"]["code"], "ForwardedError");
+    }
+
+    #[tokio::test]
+    async fn test_hyper_error_wrapping() {
+        // Obtain a `hyper::http::Error` by giving the response builder a
+        // status code it rejects
+        let hyper_err = hyper::http::Response::builder()
+            .status(1000) // Invalid status
+            .body(())
+            .unwrap_err();
+
+        let err = BrightStaffError::ResponseCreationFailed(hyper_err);
+        let response = err.into_response();
+
+        assert_eq!(response.status(), StatusCode::BAD_REQUEST);
+    }
+}
--- a/demos/advanced/multi_turn_rag/config.yaml
+++ b/demos/advanced/multi_turn_rag/config.yaml
@ -7,7 +7,7 @@ listeners:

 endpoints:
  rag_energy_source_agent:
-    endpoint: host.docker.internal:18083
+    endpoint: localhost:18083
    connect_timeout: 0.005s

 model_providers:
--- a/demos/agent_orchestration/multi_agent_crewai_langchain/README.md
+++ b/demos/agent_orchestration/multi_agent_crewai_langchain/README.md
@ -38,18 +38,17 @@ Plano acts as a **framework-agnostic proxy and data plane** that:
 ```bash
 # From the demo directory
 cd demos/agent_orchestration/multi_agent_crewai_langchain
-
-# Build and start all services
-docker-compose up -d
+./run_demo.sh
 ```

-This starts:
- **Plano** (ports 12000, 8001) - routing and orchestration
+This starts Plano natively on the host and brings up the following services via Docker Compose:
 - **CrewAI Flight Agent** (port 10520) - flight search
 - **LangChain Weather Agent** (port 10510) - weather forecasts
 - **AnythingLLM** (port 3001) - chat interface
 - **Jaeger** (port 16686) - distributed tracing

+Plano runs natively on the host (ports 12000, 8001).
+
 ### Try It Out

 1. **Open the Chat Interface**
@ -116,7 +115,7 @@ This starts:
 ## Cleanup

 ```bash
-docker-compose down
+./run_demo.sh down
 ```

 ## Next Steps
--- a/demos/agent_orchestration/multi_agent_crewai_langchain/docker-compose.yaml
+++ b/demos/agent_orchestration/multi_agent_crewai_langchain/docker-compose.yaml
@ -1,21 +1,5 @@

 services:
-  plano:
-    build:
-      context: ../../../
-      dockerfile: Dockerfile
-    ports:
-      - "8001:8001"
-      - "12000:12000"
-    environment:
-      - PLANO_CONFIG_PATH=/app/plano_config.yaml
-      - OPENAI_API_KEY=${OPENAI_API_KEY:?OPENAI_API_KEY environment variable is required but not set}
-      - OTEL_TRACING_GRPC_ENDPOINT=http://jaeger:4317
-      - LOG_LEVEL=${LOG_LEVEL:-info}
-    volumes:
-      - ./config.yaml:/app/plano_config.yaml:ro
-      - /etc/ssl/cert.pem:/etc/ssl/cert.pem
-
  crewai-flight-agent:
    build:
      dockerfile: Dockerfile
@ -23,7 +7,7 @@ services:
    ports:
      - "10520:10520"
    environment:
-      - LLM_GATEWAY_ENDPOINT=http://plano:12000/v1
+      - LLM_GATEWAY_ENDPOINT=http://host.docker.internal:12000/v1
      - AEROAPI_KEY=${AEROAPI_KEY:?AEROAPI_KEY environment variable is required but not set}
      - PYTHONUNBUFFERED=1
    command: ["python", "-u", "crewai/flight_agent.py"]
@ -35,7 +19,7 @@ services:
    ports:
      - "10510:10510"
    environment:
-      - LLM_GATEWAY_ENDPOINT=http://plano:12000/v1
+      - LLM_GATEWAY_ENDPOINT=http://host.docker.internal:12000/v1
    command: ["python", "-u", "langchain/weather_agent.py"]

  anythingllm:
@ -48,7 +32,7 @@ services:
    environment:
      - STORAGE_DIR=/app/server/storage
      - LLM_PROVIDER=generic-openai
-      - GENERIC_OPEN_AI_BASE_PATH=http://plano:8001/v1
+      - GENERIC_OPEN_AI_BASE_PATH=http://host.docker.internal:8001/v1
      - GENERIC_OPEN_AI_MODEL_PREF=gpt-4o-mini
      - GENERIC_OPEN_AI_MODEL_TOKEN_LIMIT=128000
      - GENERIC_OPEN_AI_API_KEY=sk-placeholder
--- a/demos/agent_orchestration/multi_agent_crewai_langchain/run_demo.sh
+++ b/demos/agent_orchestration/multi_agent_crewai_langchain/run_demo.sh
@ -0,0 +1,51 @@
+#!/bin/bash
+set -e
+
+# Run from the directory that holds this script so that .env, config.yaml and
+# the Docker Compose file are found no matter where the script is invoked from.
+cd "$(dirname "${BASH_SOURCE[0]}")"
+
+# Start the demo: make sure .env exists, start Plano, then start the
+# Docker Compose services.
+start_demo() {
+  # Step 1: Check if .env file exists
+  if [ -f ".env" ]; then
+    echo ".env file already exists. Skipping creation."
+  else
+    # Step 2: Create `.env` file and set API keys
+    if [ -z "$OPENAI_API_KEY" ]; then
+      echo "Error: OPENAI_API_KEY environment variable is not set for the demo." >&2
+      exit 1
+    fi
+    if [ -z "$AEROAPI_KEY" ]; then
+      echo "Error: AEROAPI_KEY environment variable is not set for the demo." >&2
+      exit 1
+    fi
+
+    echo "Creating .env file..."
+    # The file holds secrets: create it readable by the current user only.
+    (
+      umask 077
+      {
+        echo "OPENAI_API_KEY=$OPENAI_API_KEY"
+        echo "AEROAPI_KEY=$AEROAPI_KEY"
+      } > .env
+    )
+    echo ".env file created with API keys."
+  fi
+
+  # Step 3: Start Plano
+  echo "Starting Plano with config.yaml..."
+  planoai up config.yaml
+
+  # Step 4: Start agents and services
+  echo "Starting agents using Docker Compose..."
+  docker compose up -d
+}
+
+# Stop the demo: bring down the Docker Compose services, then stop Plano.
+stop_demo() {
+  # Step 1: Stop Docker Compose services
+  echo "Stopping Docker Compose services..."
+  docker compose down
+
+  # Step 2: Stop Plano
+  echo "Stopping Plano..."
+  planoai down
+}
+
+# Main script logic: `down` stops the demo, anything else starts it.
+if [ "$1" == "down" ]; then
+  stop_demo
+else
+  start_demo
+fi
--- a/demos/agent_orchestration/travel_agents/README.md
+++ b/demos/agent_orchestration/travel_agents/README.md
@ -9,7 +9,7 @@ This demo consists of two intelligent agents that work together seamlessly:
 - **Weather Agent** - Real-time weather conditions and multi-day forecasts for any city worldwide
 - **Flight Agent** - Live flight information between airports with real-time tracking

-All agents use Plano's agent orchestration LLM to intelligently route user requests to the appropriate specialized agent based on conversation context and user intent. Both agents run as Docker containers for easy deployment.
+All agents use Plano's agent orchestration LLM to intelligently route user requests to the appropriate specialized agent based on conversation context and user intent.

 ## Features

@ -22,8 +22,8 @@ All agents use Plano's agent orchestration LLM to intelligently route user reque

 ## Prerequisites

- Docker and Docker Compose
- [Plano CLI](https://docs.planoai.dev/get_started/quickstart.html#prerequisites) installed
+- [Plano CLI](https://docs.planoai.dev/get_started/quickstart.html#prerequisites) installed (`pip install planoai`)
+- Docker and Docker Compose (for agent services)
 - [OpenAI API key](https://platform.openai.com/api-keys)
 - [FlightAware AeroAPI key](https://www.flightaware.com/aeroapi/portal)

@ -40,17 +40,18 @@ export AEROAPI_KEY="your-flightaware-api-key"
 export OPENAI_API_KEY="your OpenAI api key"
 ```

-### 2. Start All Agents & Plano with Docker
+### 2. Start the Demo

 ```bash
-docker compose up --build
+./run_demo.sh
 ```

-This starts:
+This starts Plano natively on the host and brings up the following services via Docker Compose:
 - Weather Agent on port 10510
 - Flight Agent on port 10520
 - Open WebUI on port 8080
- Plano Proxy on port 8001
+
+Plano runs natively on the host (port 8001).

 ### 4. Test the System

@ -92,7 +93,7 @@ Assistant: [Both weather_agent and flight_agent respond simultaneously]
 Weather     Flight
  Agent       Agent
 (10510)     (10520)
- [Docker]    [Docker]
+ (10510)     (10520)
 ```

 Each agent:
@ -101,7 +102,7 @@ Each agent:
 3. Generates response using GPT-5.2
 4. Streams response back to user

-Both agents run as Docker containers and communicate with Plano via `host.docker.internal`.
+Both agents run as Docker containers and communicate with Plano running natively on the host.

 ## Observability

--- a/demos/agent_orchestration/travel_agents/config.yaml
+++ b/demos/agent_orchestration/travel_agents/config.yaml
@ -2,9 +2,9 @@ version: v0.3.0

 agents:
  - id: weather_agent
-    url: http://host.docker.internal:10510
+    url: http://localhost:10510
  - id: flight_agent
-    url: http://host.docker.internal:10520
+    url: http://localhost:10520

 model_providers:
  - model: openai/gpt-5.2
@ -55,3 +55,6 @@ listeners:

 tracing:
  random_sampling: 100
+  span_attributes:
+    header_prefixes:
+      - x-acme-
--- a/demos/agent_orchestration/travel_agents/docker-compose.yaml
+++ b/demos/agent_orchestration/travel_agents/docker-compose.yaml
@ -1,18 +1,5 @@

 services:
-  plano:
-    build:
-      context: ../../../
-      dockerfile: Dockerfile
-    ports:
-      - "12000:12000"
-      - "8001:8001"
-    environment:
-      - PLANO_CONFIG_PATH=/config/config.yaml
-      - OPENAI_API_KEY=${OPENAI_API_KEY:?OPENAI_API_KEY environment variable is required but not set}
-    volumes:
-      - ./config.yaml:/app/plano_config.yaml
-      - /etc/ssl/cert.pem:/etc/ssl/cert.pem
  weather-agent:
    build:
      context: .
--- a/demos/agent_orchestration/travel_agents/run_demo.sh
+++ b/demos/agent_orchestration/travel_agents/run_demo.sh
@ -0,0 +1,51 @@
+#!/bin/bash
+set -e
+
+# Run from the directory that holds this script so that .env, config.yaml and
+# the Docker Compose file are found no matter where the script is invoked from.
+cd "$(dirname "${BASH_SOURCE[0]}")"
+
+# Start the demo: make sure .env exists, start Plano, then start the
+# Docker Compose services.
+start_demo() {
+  # Step 1: Check if .env file exists
+  if [ -f ".env" ]; then
+    echo ".env file already exists. Skipping creation."
+  else
+    # Step 2: Create `.env` file and set API keys
+    if [ -z "$OPENAI_API_KEY" ]; then
+      echo "Error: OPENAI_API_KEY environment variable is not set for the demo." >&2
+      exit 1
+    fi
+    if [ -z "$AEROAPI_KEY" ]; then
+      echo "Error: AEROAPI_KEY environment variable is not set for the demo." >&2
+      exit 1
+    fi
+
+    echo "Creating .env file..."
+    # The file holds secrets: create it readable by the current user only.
+    (
+      umask 077
+      {
+        echo "OPENAI_API_KEY=$OPENAI_API_KEY"
+        echo "AEROAPI_KEY=$AEROAPI_KEY"
+      } > .env
+    )
+    echo ".env file created with API keys."
+  fi
+
+  # Step 3: Start Plano
+  echo "Starting Plano with config.yaml..."
+  planoai up config.yaml
+
+  # Step 4: Start agents and services
+  echo "Starting agents using Docker Compose..."
+  docker compose up -d
+}
+
+# Stop the demo: bring down the Docker Compose services, then stop Plano.
+stop_demo() {
+  # Step 1: Stop Docker Compose services
+  echo "Stopping Docker Compose services..."
+  docker compose down
+
+  # Step 2: Stop Plano
+  echo "Stopping Plano..."
+  planoai down
+}
+
+# Main script logic: `down` stops the demo, anything else starts it.
+if [ "$1" == "down" ]; then
+  stop_demo
+else
+  start_demo
+fi
--- a/demos/agent_orchestration/travel_agents/test.rest
+++ b/demos/agent_orchestration/travel_agents/test.rest
@ -3,9 +3,16 @@
 ### Travel Agent Chat Completion Request
 POST {{llm_endpoint}}/v1/chat/completions HTTP/1.1
 Content-Type: application/json
+X-Acme-Workspace-Id: ws_7e2c5d91b4224f59b0e6a4e0125c21b3
+X-Acme-Tenant-Id: ten_4102a8c7fa6542b084b395d2df184a9a
+X-Acme-User-Id: usr_19df7e6751b846f9ba026776e3c12abe
+X-Acme-Admin-Level: 3
+X-Acme-Environment: production
+X-Acme-Is-Internal: false
+X-Acme-Cost-Center: HD100

 {
-  "model": "gpt-4o",
+  "model": "gpt-5.2",
  "messages": [
    {
      "role": "user",
@ -20,7 +27,28 @@ Content-Type: application/json
      "content": "What is one Alaska flight that goes direct to Atlanta from Seattle?"
    }
  ],
-  "max_tokens": 1000,
+  "max_completion_tokens": 1000,
+  "stream": false,
+  "temperature": 1.0
+}
+
+
+### Travel Agent Request (prefix mismatch - ignored)
+POST {{llm_endpoint}}/v1/chat/completions HTTP/1.1
+Content-Type: application/json
+X-Other-Workspace-Id: ws_7e2c5d91b4224f59b0e6a4e0125c21b3
+X-Other-Tenant-Id: ten_4102a8c7fa6542b084b395d2df184a9a
+X-Other-User-Id: usr_19df7e6751b846f9ba026776e3c12abe
+
+{
+  "model": "gpt-5.2",
+  "messages": [
+    {
+      "role": "user",
+      "content": "What's the weather in Seattle?"
+    }
+  ],
+  "max_completion_tokens": 1000,
  "stream": false,
  "temperature": 1.0
 }
--- a/demos/filter_chains/http_filter/README.md
+++ b/demos/filter_chains/http_filter/README.md
@ -35,21 +35,21 @@ This demo consists of four components:

 ## Quick Start

-### 1. Start everything with Docker Compose
+### 1. Start the demo
 ```bash
-docker compose up --build
+export OPENAI_API_KEY="your-key"
+./run_demo.sh
 ```

-This brings up:
+This starts Plano natively on the host and brings up the following services via Docker Compose:
 - Input Guards MCP server on port 10500
 - Query Rewriter MCP server on port 10501
 - Context Builder MCP server on port 10502
 - RAG Agent REST server on port 10505
- Plano listener on port 8001 (and gateway on 12000)
 - Jaeger UI for viewing traces at http://localhost:16686
 - AnythingLLM at http://localhost:3001 for interactive queries

-> Set `OPENAI_API_KEY` in your environment before running; `LLM_GATEWAY_ENDPOINT` defaults to `http://host.docker.internal:12000/v1`.
+Plano runs natively on the host (ports 8001 and 12000).

 ### 2. Test the system

@ -74,16 +74,16 @@ The `config.yaml` defines how agents are connected:
 ```yaml
 filters:
  - id: input_guards
-    url: http://host.docker.internal:10500
+    url: http://localhost:10500
    # type: mcp (default)
    # tool: input_guards (default - same as filter id)

  - id: query_rewriter
-    url: http://host.docker.internal:10501
+    url: http://localhost:10501
    # type: mcp (default)

  - id: context_builder
-    url: http://host.docker.internal:10502
+    url: http://localhost:10502
 ```

 ## How It Works
--- a/demos/filter_chains/http_filter/docker-compose.yaml
+++ b/demos/filter_chains/http_filter/docker-compose.yaml
@ -11,19 +11,6 @@ services:
    environment:
      - LLM_GATEWAY_ENDPOINT=${LLM_GATEWAY_ENDPOINT:-http://host.docker.internal:12000/v1}
      - OPENAI_API_KEY=${OPENAI_API_KEY:?OPENAI_API_KEY environment variable is required but not set}
-  plano:
-    build:
-      context: ../../../
-      dockerfile: Dockerfile
-    ports:
-      - "12000:12000"
-      - "8001:8001"
-    environment:
-      - PLANO_CONFIG_PATH=/config/config.yaml
-      - OPENAI_API_KEY=${OPENAI_API_KEY:?OPENAI_API_KEY environment variable is required but not set}
-    volumes:
-      - ./config.yaml:/app/plano_config.yaml
-      - /etc/ssl/cert.pem:/etc/ssl/cert.pem
  jaeger:
    build:
      context: ../../shared/jaeger
@ -41,7 +28,7 @@ services:
    environment:
      - STORAGE_DIR=/app/server/storage
      - LLM_PROVIDER=generic-openai
-      - GENERIC_OPEN_AI_BASE_PATH=http://plano:8001/v1
+      - GENERIC_OPEN_AI_BASE_PATH=http://host.docker.internal:8001/v1
      - GENERIC_OPEN_AI_MODEL_PREF=gpt-4o-mini
      - GENERIC_OPEN_AI_MODEL_TOKEN_LIMIT=128000
      - GENERIC_OPEN_AI_API_KEY=sk-placeholder
--- a/demos/filter_chains/http_filter/run_demo.sh
+++ b/demos/filter_chains/http_filter/run_demo.sh
@ -0,0 +1,46 @@
+#!/bin/bash
+set -e
+
+# Run from the directory that holds this script so that .env, config.yaml and
+# the Docker Compose file are found no matter where the script is invoked from.
+cd "$(dirname "${BASH_SOURCE[0]}")"
+
+# Start the demo: make sure .env exists, start Plano, then start the
+# Docker Compose services.
+start_demo() {
+  # Step 1: Check if .env file exists
+  if [ -f ".env" ]; then
+    echo ".env file already exists. Skipping creation."
+  else
+    # Step 2: Create `.env` file and set OpenAI key
+    if [ -z "$OPENAI_API_KEY" ]; then
+      echo "Error: OPENAI_API_KEY environment variable is not set for the demo." >&2
+      exit 1
+    fi
+
+    echo "Creating .env file..."
+    # The file holds a secret: create it readable by the current user only.
+    (
+      umask 077
+      echo "OPENAI_API_KEY=$OPENAI_API_KEY" > .env
+    )
+    echo ".env file created with OPENAI_API_KEY."
+  fi
+
+  # Step 3: Start Plano
+  echo "Starting Plano with config.yaml..."
+  planoai up config.yaml
+
+  # Step 4: Start services
+  echo "Starting services using Docker Compose..."
+  docker compose up -d
+}
+
+# Stop the demo: bring down the Docker Compose services, then stop Plano.
+stop_demo() {
+  # Step 1: Stop Docker Compose services
+  echo "Stopping Docker Compose services..."
+  docker compose down
+
+  # Step 2: Stop Plano
+  echo "Stopping Plano..."
+  planoai down
+}
+
+# Main script logic: `down` stops the demo, anything else starts it.
+if [ "$1" == "down" ]; then
+  stop_demo
+else
+  start_demo
+fi
--- a/demos/filter_chains/mcp_filter/README.md
+++ b/demos/filter_chains/mcp_filter/README.md
@ -35,21 +35,21 @@ This demo consists of four components:

 ## Quick Start

-### 1. Start everything with Docker Compose
+### 1. Start the demo
 ```bash
-docker compose up --build
+export OPENAI_API_KEY="your-key"
+./run_demo.sh
 ```

-This brings up:
+This starts Plano natively on the host and brings up the following services via Docker Compose:
 - Input Guards MCP server on port 10500
 - Query Rewriter MCP server on port 10501
 - Context Builder MCP server on port 10502
 - RAG Agent REST server on port 10505
- Plano listener on port 8001 (and gateway on 12000)
 - Jaeger UI for viewing traces at http://localhost:16686
 - AnythingLLM at http://localhost:3001 for interactive queries

-> Set `OPENAI_API_KEY` in your environment before running; `LLM_GATEWAY_ENDPOINT` defaults to `http://host.docker.internal:12000/v1`.
+Plano runs natively on the host (ports 8001 and 12000).

 ### 2. Test the system

@ -74,16 +74,16 @@ The `config.yaml` defines how agents are connected:
 ```yaml
 filters:
  - id: input_guards
-    url: http://host.docker.internal:10500
+    url: http://localhost:10500
    # type: mcp (default)
    # tool: input_guards (default - same as filter id)

  - id: query_rewriter
-    url: http://host.docker.internal:10501
+    url: http://localhost:10501
    # type: mcp (default)

  - id: context_builder
-    url: http://host.docker.internal:10502
+    url: http://localhost:10502
 ```

 ## How It Works
--- a/demos/filter_chains/mcp_filter/config.yaml
+++ b/demos/filter_chains/mcp_filter/config.yaml
@ -2,21 +2,21 @@ version: v0.3.0

 agents:
  - id: rag_agent
-    url: http://host.docker.internal:10505
+    url: http://localhost:10505

 filters:
  - id: input_guards
-    url: http://host.docker.internal:10500
+    url: http://localhost:10500
    # type: mcp (default)
    # transport: streamable-http (default)
    # tool: input_guards (default - same as filter id)
  - id: query_rewriter
-    url: http://host.docker.internal:10501
+    url: http://localhost:10501
    # type: mcp (default)
    # transport: streamable-http (default)
    # tool: query_rewriter (default - same as filter id)
  - id: context_builder
-    url: http://host.docker.internal:10502
+    url: http://localhost:10502

 model_providers:
  - model: openai/gpt-4o-mini
--- a/demos/filter_chains/mcp_filter/docker-compose.yaml
+++ b/demos/filter_chains/mcp_filter/docker-compose.yaml
@ -11,21 +11,6 @@ services:
    environment:
      - LLM_GATEWAY_ENDPOINT=${LLM_GATEWAY_ENDPOINT:-http://host.docker.internal:12000/v1}
      - OPENAI_API_KEY=${OPENAI_API_KEY:?OPENAI_API_KEY environment variable is required but not set}
-  plano:
-    build:
-      context: ../../../
-      dockerfile: Dockerfile
-    ports:
-      - "11000:11000"
-      - "12001:12001"
-      - "12000:12000"
-      - "8001:8001"
-    environment:
-      - PLANO_CONFIG_PATH=/config/config.yaml
-      - OPENAI_API_KEY=${OPENAI_API_KEY:?OPENAI_API_KEY environment variable is required but not set}
-    volumes:
-      - ./config.yaml:/app/plano_config.yaml
-      - /etc/ssl/cert.pem:/etc/ssl/cert.pem
  jaeger:
    build:
      context: ../../shared/jaeger
@ -43,7 +28,7 @@ services:
    environment:
      - STORAGE_DIR=/app/server/storage
      - LLM_PROVIDER=generic-openai
-      - GENERIC_OPEN_AI_BASE_PATH=http://plano:8001/v1
+      - GENERIC_OPEN_AI_BASE_PATH=http://host.docker.internal:8001/v1
      - GENERIC_OPEN_AI_MODEL_PREF=gpt-4o-mini
      - GENERIC_OPEN_AI_MODEL_TOKEN_LIMIT=128000
      - GENERIC_OPEN_AI_API_KEY=sk-placeholder
--- a/demos/filter_chains/mcp_filter/run_demo.sh
+++ b/demos/filter_chains/mcp_filter/run_demo.sh
@ -0,0 +1,46 @@
+#!/bin/bash
+set -e
+
+# Function to start the demo
+start_demo() {
+  # Step 1: Check if .env file exists
+  if [ -f ".env" ]; then
+    echo ".env file already exists. Skipping creation."
+  else
+    # Step 2: Create `.env` file and set OpenAI key
+    if [ -z "$OPENAI_API_KEY" ]; then
+      echo "Error: OPENAI_API_KEY environment variable is not set for the demo."
+      exit 1
+    fi
+
+    echo "Creating .env file..."
+    echo "OPENAI_API_KEY=$OPENAI_API_KEY" > .env
+    echo ".env file created with OPENAI_API_KEY."
+  fi
+
+  # Step 3: Start Plano
+  echo "Starting Plano with config.yaml..."
+  planoai up config.yaml
+
+  # Step 4: Start services
+  echo "Starting services using Docker Compose..."
+  docker compose up -d
+}
+
+# Function to stop the demo
+stop_demo() {
+  # Step 1: Stop Docker Compose services
+  echo "Stopping Docker Compose services..."
+  docker compose down
+
+  # Step 2: Stop Plano
+  echo "Stopping Plano..."
+  planoai down
+}
+
+# Main script logic
+if [ "$1" == "down" ]; then
+  stop_demo
+else
+  start_demo
+fi
--- a/demos/getting_started/llm_gateway/config.yaml
+++ b/demos/getting_started/llm_gateway/config.yaml
@ -44,7 +44,7 @@ model_providers:
    access_key: $TOGETHER_API_KEY

  - model: custom/test-model
-    base_url: http://host.docker.internal:11223
+    base_url: http://localhost:11223
    provider_interface: openai

 tracing:
--- a/demos/getting_started/llm_gateway/docker-compose.yaml
+++ b/demos/getting_started/llm_gateway/docker-compose.yaml
@ -1,20 +1,5 @@
 services:

-  plano:
-    build:
-      context: ../../../
-      dockerfile: Dockerfile
-    ports:
-      - "12000:12000"
-      - "12001:12001"
-    environment:
-      - PLANO_CONFIG_PATH=/app/plano_config.yaml
-      - OPENAI_API_KEY=${OPENAI_API_KEY:?OPENAI_API_KEY environment variable is required but not set}
-      - OTEL_TRACING_GRPC_ENDPOINT=http://host.docker.internal:4317
-    volumes:
-      - ./config.yaml:/app/plano_config.yaml:ro
-      - /etc/ssl/cert.pem:/etc/ssl/cert.pem
-
  anythingllm:
    image: mintplexlabs/anythingllm
    restart: always
@ -25,7 +10,7 @@ services:
    environment:
      - STORAGE_DIR=/app/server/storage
      - LLM_PROVIDER=generic-openai
-      - GENERIC_OPEN_AI_BASE_PATH=http://plano:12000/v1
+      - GENERIC_OPEN_AI_BASE_PATH=http://host.docker.internal:12000/v1
      - GENERIC_OPEN_AI_MODEL_PREF=gpt-4o-mini
      - GENERIC_OPEN_AI_MODEL_TOKEN_LIMIT=128000
      - GENERIC_OPEN_AI_API_KEY=sk-placeholder
--- a/demos/getting_started/weather_forecast/config.yaml
+++ b/demos/getting_started/weather_forecast/config.yaml
@ -11,7 +11,7 @@ listeners:

 endpoints:
  weather_forecast_service:
-    endpoint: host.docker.internal:18083
+    endpoint: localhost:18083
    connect_timeout: 0.005s

 overrides:
--- a/demos/integrations/ollama/config.yaml
+++ b/demos/integrations/ollama/config.yaml
@ -9,7 +9,7 @@ model_providers:

  - model: my_llm_provider/llama3.2
    provider_interface: openai
-    base_url: http://host.docker.internal:11434
+    base_url: http://localhost:11434
    default: true

 system_prompt: |
--- a/demos/llm_routing/claude_code_router/README.md
+++ b/demos/llm_routing/claude_code_router/README.md
@ -39,8 +39,8 @@ Your Request → Plano → Suitable Model → Response
 # Install Claude Code if you haven't already
 npm install -g @anthropic-ai/claude-code

-# Ensure Docker is running
-docker --version
+# Install Plano CLI
+pip install planoai
 ```

 ### Step 1: Get Configuration
--- a/demos/llm_routing/claude_code_router/config.yaml
+++ b/demos/llm_routing/claude_code_router/config.yaml
@ -28,7 +28,7 @@ model_providers:

  # Ollama Models
  - model: ollama/llama3.1
-    base_url: http://host.docker.internal:11434
+    base_url: http://localhost:11434


 # Model aliases - friendly names that map to actual provider names
--- a/demos/llm_routing/model_alias_routing/config_with_aliases.yaml
+++ b/demos/llm_routing/model_alias_routing/config_with_aliases.yaml
@ -49,7 +49,7 @@ model_providers:

  # Ollama Models
  - model: ollama/llama3.1
-    base_url: http://host.docker.internal:11434
+    base_url: http://localhost:11434

  # Grok (xAI) Models
  - model: xai/grok-4-0709
--- a/demos/llm_routing/model_routing_service/README.md
+++ b/demos/llm_routing/model_routing_service/README.md
@ -0,0 +1,92 @@
+# Model Routing Service Demo
+
+This demo shows how to use the `/routing/v1/*` endpoints to get routing decisions without proxying requests to an LLM. The endpoint accepts standard LLM request formats and returns which model Plano's router would select.
+
+## Setup
+
+Make sure you have Plano CLI installed (`pip install planoai` or `uv tool install planoai`).
+
+```bash
+export OPENAI_API_KEY=<your-key>
+export ANTHROPIC_API_KEY=<your-key>
+```
+
+Start Plano:
+```bash
+cd demos/llm_routing/model_routing_service
+planoai up config.yaml
+```
+
+## Run the demo
+
+```bash
+./demo.sh
+```
+
+## Endpoints
+
+All three LLM API formats are supported:
+
+| Endpoint | Format |
+|---|---|
+| `POST /routing/v1/chat/completions` | OpenAI Chat Completions |
+| `POST /routing/v1/messages` | Anthropic Messages |
+| `POST /routing/v1/responses` | OpenAI Responses API |
+
+## Example
+
+```bash
+curl http://localhost:12000/routing/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "gpt-4o-mini",
+    "messages": [{"role": "user", "content": "Write a Python function for binary search"}]
+  }'
+```
+
+Response:
+```json
+{
+    "model": "anthropic/claude-sonnet-4-20250514",
+    "route": "code_generation",
+    "trace_id": "c16d1096c1af4a17abb48fb182918a88"
+}
+```
+
+The response tells you which model would handle this request and which route was matched, without actually making the LLM call.
+
+## Demo Output
+
+```
+=== Model Routing Service Demo ===
+
+--- 1. Code generation query (OpenAI format) ---
+{
+    "model": "anthropic/claude-sonnet-4-20250514",
+    "route": "code_generation",
+    "trace_id": "c16d1096c1af4a17abb48fb182918a88"
+}
+
+--- 2. Complex reasoning query (OpenAI format) ---
+{
+    "model": "openai/gpt-4o",
+    "route": "complex_reasoning",
+    "trace_id": "30795e228aff4d7696f082ed01b75ad4"
+}
+
+--- 3. Simple query - no routing match (OpenAI format) ---
+{
+    "model": "none",
+    "route": null,
+    "trace_id": "ae0b6c3b220d499fb5298ac63f4eac0e"
+}
+
+--- 4. Code generation query (Anthropic format) ---
+{
+    "model": "anthropic/claude-sonnet-4-20250514",
+    "route": "code_generation",
+    "trace_id": "26be822bbdf14a3ba19fe198e55ea4a9"
+}
+
+=== Demo Complete ===
+```
--- a/demos/llm_routing/model_routing_service/config.yaml
+++ b/demos/llm_routing/model_routing_service/config.yaml
@ -0,0 +1,27 @@
+version: v0.3.0
+
+listeners:
+  - type: model
+    name: model_listener
+    port: 12000
+
+model_providers:
+
+  - model: openai/gpt-4o-mini
+    access_key: $OPENAI_API_KEY
+    default: true
+
+  - model: openai/gpt-4o
+    access_key: $OPENAI_API_KEY
+    routing_preferences:
+      - name: complex_reasoning
+        description: complex reasoning tasks, multi-step analysis, or detailed explanations
+
+  - model: anthropic/claude-sonnet-4-20250514
+    access_key: $ANTHROPIC_API_KEY
+    routing_preferences:
+      - name: code_generation
+        description: generating new code, writing functions, or creating boilerplate
+
+tracing:
+  random_sampling: 100
--- a/demos/llm_routing/model_routing_service/demo.sh
+++ b/demos/llm_routing/model_routing_service/demo.sh
@ -0,0 +1,65 @@
+#!/bin/bash
+set -e
+
+PLANO_URL="${PLANO_URL:-http://localhost:12000}"
+
+echo "=== Model Routing Service Demo ==="
+echo ""
+echo "This demo shows how to use the /routing/v1/* endpoints to get"
+echo "routing decisions without actually proxying the request to an LLM."
+echo ""
+
+# --- Example 1: OpenAI Chat Completions format ---
+echo "--- 1. Code generation query (OpenAI format) ---"
+echo ""
+curl -s "$PLANO_URL/routing/v1/chat/completions" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "gpt-4o-mini",
+    "messages": [
+      {"role": "user", "content": "Write a Python function that implements binary search on a sorted array"}
+    ]
+  }' | python3 -m json.tool
+echo ""
+
+# --- Example 2: Complex reasoning query ---
+echo "--- 2. Complex reasoning query (OpenAI format) ---"
+echo ""
+curl -s "$PLANO_URL/routing/v1/chat/completions" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "gpt-4o-mini",
+    "messages": [
+      {"role": "user", "content": "Explain the trade-offs between microservices and monolithic architectures, considering scalability, team structure, and operational complexity"}
+    ]
+  }' | python3 -m json.tool
+echo ""
+
+# --- Example 3: Simple query (no routing match) ---
+echo "--- 3. Simple query - no routing match (OpenAI format) ---"
+echo ""
+curl -s "$PLANO_URL/routing/v1/chat/completions" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "gpt-4o-mini",
+    "messages": [
+      {"role": "user", "content": "What is the capital of France?"}
+    ]
+  }' | python3 -m json.tool
+echo ""
+
+# --- Example 4: Anthropic Messages format ---
+echo "--- 4. Code generation query (Anthropic format) ---"
+echo ""
+curl -s "$PLANO_URL/routing/v1/messages" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "gpt-4o-mini",
+    "max_tokens": 1024,
+    "messages": [
+      {"role": "user", "content": "Create a REST API endpoint in Rust using actix-web that handles user registration"}
+    ]
+  }' | python3 -m json.tool
+echo ""
+
+echo "=== Demo Complete ==="
--- a/demos/llm_routing/openclaw_routing/README.md
+++ b/demos/llm_routing/openclaw_routing/README.md
@ -23,7 +23,6 @@ Plano uses a [preference-aligned router](https://arxiv.org/abs/2506.16655) to an

 ## Prerequisites

- **Docker** running
 - **Plano CLI**: `uv tool install planoai` or `pip install planoai`
 - **OpenClaw**: `npm install -g openclaw@latest`
 - **API keys**:
@ -43,7 +42,7 @@ export ANTHROPIC_API_KEY="your-anthropic-key"

 ```bash
 cd demos/llm_routing/openclaw_routing
-planoai up --service plano --foreground
+planoai up config.yaml
 ```

 ### 3. Set Up OpenClaw
--- a/demos/llm_routing/preference_based_routing/README.md
+++ b/demos/llm_routing/preference_based_routing/README.md
@ -3,25 +3,23 @@ This demo shows how you can use user preferences to route user prompts to approp

 ## How to start the demo

-Make sure your machine is up to date with [latest version of plano]([url](https://github.com/katanemo/plano/tree/main?tab=readme-ov-file#prerequisites)). And you have activated the virtual environment.
+Make sure you have Plano CLI installed (`pip install planoai` or `uv tool install planoai`).

-
-1. start anythingllm
 ```bash
-(venv) $ cd demos/llm_routing/preference_based_routing
-(venv) $ docker compose up -d
+cd demos/llm_routing/preference_based_routing
+./run_demo.sh
 ```
-2. start plano in the foreground
+
+Or manually:
+
+1. Start Plano
 ```bash
-(venv) $ planoai up --service plano --foreground
-# Or if installed with uv: uvx planoai up --service plano --foreground
-2025-05-30 18:00:09,953 - planoai.main - INFO - Starting plano cli version: 0.4.8
-2025-05-30 18:00:09,953 - planoai.main - INFO - Validating /Users/adilhafeez/src/intelligent-prompt-gateway/demos/llm_routing/preference_based_routing/config.yaml
-2025-05-30 18:00:10,422 - cli.core - INFO - Starting plano gateway, image name: plano, tag: katanemo/plano:0.4.8
-2025-05-30 18:00:10,662 - cli.core - INFO - plano status: running, health status: starting
-2025-05-30 18:00:11,712 - cli.core - INFO - plano status: running, health status: starting
-2025-05-30 18:00:12,761 - cli.core - INFO - plano is running and is healthy!
-...
+planoai up config.yaml
+```
+
+2. Start AnythingLLM
+```bash
+docker compose up -d
 ```

 3. open AnythingLLM http://localhost:3001/
--- a/demos/llm_routing/preference_based_routing/docker-compose.yaml
+++ b/demos/llm_routing/preference_based_routing/docker-compose.yaml
@ -1,23 +1,5 @@
 services:

-  plano:
-    build:
-      context: ../../../
-      dockerfile: Dockerfile
-    ports:
-      - "12000:12000"
-      - "12001:12001"
-    environment:
-      - PLANO_CONFIG_PATH=/app/plano_config.yaml
-      - OPENAI_API_KEY=${OPENAI_API_KEY:?OPENAI_API_KEY environment variable is required but not set}
-      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:?ANTHROPIC_API_KEY environment variable is required but not set}
-      - OTEL_TRACING_GRPC_ENDPOINT=http://host.docker.internal:4317
-      - OTEL_TRACING_ENABLED=true
-      - RUST_LOG=debug
-    volumes:
-      - ./config.yaml:/app/plano_config.yaml:ro
-      - /etc/ssl/cert.pem:/etc/ssl/cert.pem
-
  anythingllm:
    image: mintplexlabs/anythingllm
    restart: always
@ -28,7 +10,7 @@ services:
    environment:
      - STORAGE_DIR=/app/server/storage
      - LLM_PROVIDER=generic-openai
-      - GENERIC_OPEN_AI_BASE_PATH=http://plano:12000/v1
+      - GENERIC_OPEN_AI_BASE_PATH=http://host.docker.internal:12000/v1
      - GENERIC_OPEN_AI_MODEL_PREF=gpt-4o-mini
      - GENERIC_OPEN_AI_MODEL_TOKEN_LIMIT=128000
      - GENERIC_OPEN_AI_API_KEY=sk-placeholder
--- a/demos/llm_routing/preference_based_routing/plano_config_local.yaml
+++ b/demos/llm_routing/preference_based_routing/plano_config_local.yaml
@ -13,7 +13,7 @@ model_providers:

  - name: arch-router
    model: arch/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
-    base_url: http://host.docker.internal:11434
+    base_url: http://localhost:11434

  - model: openai/gpt-4o-mini
    access_key: $OPENAI_API_KEY
--- a/demos/llm_routing/preference_based_routing/run_demo.sh
+++ b/demos/llm_routing/preference_based_routing/run_demo.sh
@ -0,0 +1,52 @@
+#!/bin/bash
+set -e
+
+# Function to start the demo
+start_demo() {
+  # Step 1: Check if .env file exists
+  if [ -f ".env" ]; then
+    echo ".env file already exists. Skipping creation."
+  else
+    # Step 2: Create `.env` file and set API keys
+    if [ -z "$OPENAI_API_KEY" ]; then
+      echo "Error: OPENAI_API_KEY environment variable is not set for the demo."
+      exit 1
+    fi
+    if [ -z "$ANTHROPIC_API_KEY" ]; then
+      echo "Warning: ANTHROPIC_API_KEY environment variable is not set. Anthropic features may not work."
+    fi
+
+    echo "Creating .env file..."
+    echo "OPENAI_API_KEY=$OPENAI_API_KEY" > .env
+    if [ -n "$ANTHROPIC_API_KEY" ]; then
+      echo "ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY" >> .env
+    fi
+    echo ".env file created with API keys."
+  fi
+
+  # Step 3: Start Plano
+  echo "Starting Plano with config.yaml..."
+  planoai up config.yaml
+
+  # Step 4: Start services
+  echo "Starting services using Docker Compose..."
+  docker compose up -d
+}
+
+# Function to stop the demo
+stop_demo() {
+  # Step 1: Stop Docker Compose services
+  echo "Stopping Docker Compose services..."
+  docker compose down
+
+  # Step 2: Stop Plano
+  echo "Stopping Plano..."
+  planoai down
+}
+
+# Main script logic
+if [ "$1" == "down" ]; then
+  stop_demo
+else
+  start_demo
+fi
--- a/demos/shared/test_runner/run_demo_tests.sh
+++ b/demos/shared/test_runner/run_demo_tests.sh
@ -21,7 +21,7 @@ do
  echo "****************************************"
  cd ../../$demo
  echo "starting plano"
-  planoai up config.yaml
+  planoai up --docker config.yaml
  echo "starting docker containers"
  # only execute docker compose if demo is llm_routing/preference_based_routing
  if [ "$demo" == "llm_routing/preference_based_routing" ]; then
@ -38,7 +38,7 @@ do
    exit 1
  fi
  echo "stopping docker containers and plano"
-  planoai down
+  planoai down --docker
  docker compose down -v
  cd ../../shared/test_runner
 done
--- a/docs/source/_static/img/cli-default-command.png
+++ b/docs/source/_static/img/cli-default-command.png
--- a/docs/source/_static/img/cli-init-command.png
+++ b/docs/source/_static/img/cli-init-command.png
--- a/docs/source/_static/img/cli-trace-command.png
+++ b/docs/source/_static/img/cli-trace-command.png
--- a/docs/source/_static/js/fix-copy.js
+++ b/docs/source/_static/js/fix-copy.js
@ -0,0 +1,18 @@
+/* Fix: Prevent "Copy code" button label from appearing in clipboard content.
+ *
+ * sphinxawesome_theme inserts a copy button inside <pre> elements. When
+ * clipboard.js selects all children of the <pre> to copy, the button's
+ * sr-only text ("Copy code") is included in the selection. This listener
+ * intercepts the copy event and strips that trailing label from the data
+ * written to the clipboard.
+ */
+document.addEventListener('copy', function (e) {
+    if (!e.clipboardData) { return; }
+    var selection = window.getSelection();
+    if (!selection) { return; }
+    var text = selection.toString();
+    var clean = text.replace(/\nCopy code\s*$/, '');
+    if (clean === text) { return; }
+    e.clipboardData.setData('text/plain', clean);
+    e.preventDefault();
+}, true);
--- a/docs/source/build_with_plano/includes/agent/function-calling-agent.yaml
+++ b/docs/source/build_with_plano/includes/agent/function-calling-agent.yaml
@ -54,6 +54,6 @@ endpoints:
    # value could be ip address or a hostname with port
    # this could also be a list of endpoints for load balancing
    # for example endpoint: [ ip1:port, ip2:port ]
-    endpoint: host.docker.internal:18083
+    endpoint: localhost:18083
    # max time to wait for a connection to be established
    connect_timeout: 0.005s
--- a/docs/source/concepts/llm_providers/model_aliases.rst
+++ b/docs/source/concepts/llm_providers/model_aliases.rst
@ -32,7 +32,7 @@ Basic Configuration
        access_key: $ANTHROPIC_API_KEY

      - model: ollama/llama3.1
-        base_url: http://host.docker.internal:11434
+        base_url: http://localhost:11434

    # Define aliases that map to the models above
    model_aliases:
--- a/docs/source/concepts/llm_providers/supported_providers.rst
+++ b/docs/source/concepts/llm_providers/supported_providers.rst
@ -598,9 +598,9 @@ Ollama
      - model: ollama/llama3.1
        base_url: http://localhost:11434

-      # Ollama in Docker (from host)
+      # Ollama running locally
      - model: ollama/codellama
-        base_url: http://host.docker.internal:11434
+        base_url: http://localhost:11434


 OpenAI-Compatible Providers
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -17,7 +17,7 @@ from sphinxawesome_theme.postprocess import Icons
 project = "Plano Docs"
 copyright = "2025, Katanemo Labs, Inc"
 author = "Katanemo Labs, Inc"
-release = " v0.4.8"
+release = " v0.4.11"

 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
@ -116,6 +116,7 @@ html_theme_options = asdict(theme_options)
 # so a file named "default.css" will overwrite the builtin "default.css".
 html_static_path = ["_static"]
 html_css_files = ["css/custom.css"]
+html_js_files = ["js/fix-copy.js"]

 pygments_style = "lovelace"
 pygments_style_dark = "github-dark"
--- a/docs/source/get_started/quickstart.rst
+++ b/docs/source/get_started/quickstart.rst
@ -17,11 +17,17 @@ Follow this guide to learn how to quickly set up Plano and integrate it into you
 Prerequisites
 -------------

-Before you begin, ensure you have the following:
+Plano runs **natively** by default — no Docker or Rust toolchain required. Pre-compiled binaries are downloaded automatically on first run.
+
+1. `Python <https://www.python.org/downloads/>`_ (v3.10+)
+2. Supported platforms: Linux (x86_64, aarch64), macOS (Apple Silicon)
+
+**Docker mode** (optional):
+
+If you prefer to run inside Docker, add ``--docker`` to ``planoai up`` / ``planoai down``. This requires:

 1. `Docker System <https://docs.docker.com/get-started/get-docker/>`_ (v24)
 2. `Docker Compose <https://docs.docker.com/compose/install/>`_ (v2.29)
-3. `Python <https://www.python.org/downloads/>`_ (v3.10+)

 Plano's CLI allows you to manage and interact with the Plano efficiently. To install the CLI, simply run the following command:

@ -37,7 +43,7 @@ Plano's CLI allows you to manage and interact with the Plano efficiently. To ins

 .. code-block:: console

-   $ uv tool install planoai==0.4.8
+   $ uv tool install planoai==0.4.11

 **Option 2: Install with pip (Traditional)**

@ -45,7 +51,7 @@ Plano's CLI allows you to manage and interact with the Plano efficiently. To ins

   $ python -m venv venv
   $ source venv/bin/activate   # On Windows, use: venv\Scripts\activate
-   $ pip install planoai==0.4.8
+   $ pip install planoai==0.4.11


 .. _llm_routing_quickstart:
@ -84,17 +90,20 @@ Step 2. Start plano

 Once the config file is created, ensure that you have environment variables set up for ``ANTHROPIC_API_KEY`` and ``OPENAI_API_KEY`` (or these are defined in a ``.env`` file).

-Start Plano:
-
 .. code-block:: console

   $ planoai up plano_config.yaml
-   # Or if installed with uv tool: uvx planoai up plano_config.yaml
-   2024-12-05 11:24:51,288 - planoai.main - INFO - Starting plano cli version: 0.4.8
-   2024-12-05 11:24:51,825 - planoai.utils - INFO - Schema validation successful!
-   2024-12-05 11:24:51,825 - planoai.main - INFO - Starting plano
-   ...
-   2024-12-05 11:25:16,131 - planoai.core - INFO - Container is healthy!
+
+On the first run, Plano automatically downloads Envoy, WASM plugins, and brightstaff and caches them at ``~/.plano/``.
+
+To stop Plano, run ``planoai down``.
+
+**Docker mode** (optional):
+
+.. code-block:: console
+
+   $ planoai up plano_config.yaml --docker
+   $ planoai down --docker

 Step 3: Interact with LLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~
@ -185,9 +194,9 @@ Here is a minimal configuration that wires Plano-Orchestrator to two HTTP servic

  agents:
    - id: flight_agent
-      url: http://host.docker.internal:10520  # your flights service
+      url: http://localhost:10520  # your flights service
    - id: hotel_agent
-      url: http://host.docker.internal:10530  # your hotels service
+      url: http://localhost:10530  # your hotels service

  model_providers:
    - model: openai/gpt-4o
--- a/docs/source/guides/observability/monitoring.rst
+++ b/docs/source/guides/observability/monitoring.rst
@ -59,7 +59,7 @@ are some sample configuration files for both, respectively.
        scheme: http
        static_configs:
        - targets:
-            - host.docker.internal:19901
+            - localhost:19901
        params:
        format: ["prometheus"]

--- a/docs/source/guides/observability/tracing.rst
+++ b/docs/source/guides/observability/tracing.rst
@ -142,6 +142,109 @@ In your observability platform (Jaeger, Grafana Tempo, Datadog, etc.), filter tr
 For complete details on all available signals, detection methods, and best practices, see the :doc:`../../concepts/signals` guide.


+Custom Span Attributes
+-------------------------------------------
+
+Plano can automatically attach **custom span attributes** derived from request headers and **static** attributes
+defined in configuration. This lets you stamp
+traces with identifiers like workspace, tenant, or user IDs without changing application code or adding
+custom instrumentation.
+
+**Why This Is Useful**
+
+- **Tenant-aware debugging**: Filter traces by ``workspace.id`` or ``tenant.id``.
+- **Customer-specific visibility**: Attribute performance or errors to a specific customer.
+- **Low overhead**: No code changes in agents or clients—just headers.
+
+How It Works
+~~~~~~~~~~~~
+
+You configure one or more header prefixes. Any incoming HTTP header whose name starts with one of these
+prefixes is captured as a span attribute. You can also provide static attributes that are always injected.
+
+- The **prefix is only for matching**, not the resulting attribute key.
+- The attribute key is the header name **with the prefix removed**, then hyphens converted to dots.
+
+.. note::
+
+   Custom span attributes are attached to LLM spans when handling ``/v1/...`` requests via ``llm_chat``. For orchestrator requests to ``/agents/...``,
+   these attributes are added to both the orchestrator selection span and to each agent span created by ``agent_chat``.
+
+**Example**
+
+Configured prefix::
+
+  tracing:
+    span_attributes:
+      header_prefixes:
+        - x-katanemo-
+
+Incoming headers::
+
+  X-Katanemo-Workspace-Id: ws_123
+  X-Katanemo-Tenant-Id: ten_456
+
+Resulting span attributes::
+
+  workspace.id = "ws_123"
+  tenant.id = "ten_456"
+
+Configuration
+~~~~~~~~~~~~~
+
+Add the prefix list under ``tracing`` in your config:
+
+.. code-block:: yaml
+
+  tracing:
+    random_sampling: 100
+    span_attributes:
+      header_prefixes:
+        - x-katanemo-
+      static:
+        environment: production
+        service.version: "1.0.0"
+
+Static attributes are always injected alongside any header-derived attributes. If a header-derived
+attribute key matches a static key, the header value overrides the static value.
+
+You can provide multiple prefixes:
+
+.. code-block:: yaml
+
+  tracing:
+    span_attributes:
+      header_prefixes:
+        - x-katanemo-
+        - x-tenant-
+      static:
+        environment: production
+        service.version: "1.0.0"
+
+Notes and Examples
+~~~~~~~~~~~~~~~~~~
+
+- **Prefix must match exactly**: ``katanemo-`` does not match ``x-katanemo-`` headers.
+- **Trailing dash is recommended**: Without it, the prefix ``x-katanemo`` would match not only
+  ``x-katanemo-foo`` but also unrelated headers such as ``x-katanemofoo``.
+- **Values are always strings**: Header values are captured as string attributes.
+
+**Prefix mismatch example**
+
+Config::
+
+  tracing:
+    span_attributes:
+      header_prefixes:
+        - x-katanemo-
+
+Request headers::
+
+  X-Other-User-Id: usr_999
+
+Result: no attributes are captured from ``X-Other-User-Id``.
+
+
 Benefits of Using ``Traceparent`` Headers
 -----------------------------------------

@ -497,55 +600,7 @@ tools like AWS X-Ray and Datadog, enhancing observability and facilitating faste
 Additional Resources
 --------------------

-CLI Reference
-~~~~~~~~~~~~~
-
-``planoai trace``
-  Trace requests captured by the local OTLP listener.
-
-  **Synopsis**
-
-  .. code-block:: console
-
-     $ planoai trace [TARGET] [OPTIONS]
-
-  **Targets**
-
-  - ``last`` (default): show the most recent trace.
-  - ``any``: allow interactive selection when available.
-  - ``<trace-id>``: full 32-hex trace ID.
-  - ``<short-id>``: first 8 hex characters.
-
-  **Options**
-
-  - ``--filter <pattern>``: limit displayed attributes to matching keys (supports ``*``).
-  - ``--where <key=value>``: match traces containing a specific attribute (repeatable, AND).
-  - ``--list``: list trace IDs only.
-  - ``--no-interactive``: disable interactive prompts/selections.
-  - ``--limit <n>``: limit the number of traces returned.
-  - ``--since <window>``: look back window (``5m``, ``2h``, ``1d``).
-  - ``--json``: output raw JSON instead of formatted output.
-  - ``--verbose, -v``: show all span attributes. By default, inbound/outbound
-    spans are displayed in a compact view.
-
-  **Environment**
-
-  - ``PLANO_TRACE_PORT``: gRPC port used by ``planoai trace`` to query traces
-    (defaults to ``4317``).
-
-``planoai trace listen``
-  Start a local OTLP/gRPC listener.
-
-  **Synopsis**
-
-  .. code-block:: console
-
-     $ planoai trace listen [OPTIONS]
-
-  **Options**
-
-  - ``--host <host>``: bind address (default: ``0.0.0.0``).
-  - ``--port <port>``: gRPC listener port (default: ``4317``).
+For full command documentation (including ``planoai trace`` and all other CLI commands), see :ref:`cli_reference`.

 External References
 ~~~~~~~~~~~~~~~~~~~
--- a/docs/source/guides/state.rst
+++ b/docs/source/guides/state.rst
@ -165,7 +165,7 @@ Then set the environment variable before running Plano:
   ./plano

 .. warning::
-   **Special Characters in Passwords**: If your password contains special characters like ``#``, ``@``, or ``&``, you must URL-encode them in the connection string. For example, ``MyPass#123`` becomes ``MyPass%23123``.
+   **Special Characters in Passwords**: If your password contains special characters like ``#``, ``@``, or ``&``, you must URL-encode them in the connection string. For example, ``P@ss#123`` becomes ``P%40ss%23123``.

 Supabase Connection Strings
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@ -202,14 +202,14 @@ Use the direct connection (port 5432):

   state_storage:
     type: postgres
-     connection_string: "postgresql://postgres.myproject:$DB_PASSWORD@aws-0-us-west-2.pooler.supabase.com:5432/postgres"
+     connection_string: "postgresql://postgres.[YOUR-PROJECT-REF]:$DB_PASSWORD@aws-0-[REGION].pooler.supabase.com:5432/postgres"

 Then set the environment variable:

 .. code-block:: bash

-   # If your password is "MyPass#123", encode it as "MyPass%23123"
-   export DB_PASSWORD="MyPass%23123"
+   # If your password is "P@ss#123", encode it as "P%40ss%23123"
+   export DB_PASSWORD="<your-url-encoded-password>"

 Troubleshooting
 ---------------
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@ -62,4 +62,5 @@ Built by contributors to the widely adopted `Envoy Proxy <https://www.envoyproxy
      resources/tech_overview/tech_overview
      resources/deployment
      resources/configuration_reference
+      resources/cli_reference
      resources/llms_txt
--- a/docs/source/resources/cli_reference.rst
+++ b/docs/source/resources/cli_reference.rst
@ -0,0 +1,302 @@
+.. _cli_reference:
+
+CLI Reference
+=============
+
+This reference documents the full ``planoai`` command-line interface for day-to-day development, local testing, and operational workflows.
+Use this page as the canonical source for command syntax, options, and recommended usage patterns.
+
+Quick Navigation
+----------------
+
+- :ref:`cli_reference_global`
+- :ref:`cli_reference_up`
+- :ref:`cli_reference_down`
+- :ref:`cli_reference_build`
+- :ref:`cli_reference_logs`
+- :ref:`cli_reference_init`
+- :ref:`cli_reference_trace`
+- :ref:`cli_reference_prompt_targets`
+- :ref:`cli_reference_cli_agent`
+
+
+.. _cli_reference_global:
+
+Global CLI Usage
+----------------
+
+**Command**
+
+.. code-block:: console
+
+   $ planoai [COMMAND] [OPTIONS]
+
+**Common global options**
+
+- ``--help``: Show the top-level command menu.
+- ``--version``: Show installed CLI version and update status.
+
+**Help patterns**
+
+.. code-block:: console
+
+   $ planoai --help
+   $ planoai trace --help
+   $ planoai init --help
+
+.. figure:: /_static/img/cli-default-command.png
+   :width: 100%
+   :alt: planoai default command screenshot
+
+   ``planoai`` command showing the top-level command menu.
+
+
+
+.. _cli_reference_up:
+
+planoai up
+----------
+
+Start Plano using a configuration file.
+
+**Synopsis**
+
+.. code-block:: console
+
+   $ planoai up [FILE] [--path <dir>] [--foreground] [--with-tracing] [--tracing-port <port>] [--docker]
+
+**Arguments**
+
+- ``FILE`` (optional): explicit path to config file.
+
+**Options**
+
+- ``--path <dir>``: directory to search for config (default ``.``).
+- ``--foreground``: run Plano in foreground.
+- ``--with-tracing``: start local OTLP/gRPC trace collector.
+- ``--tracing-port <port>``: collector port (default ``4317``).
+- ``--docker``: run Plano inside a Docker container instead of natively on the host.
+
+.. note::
+
+   If you use ``--with-tracing``, ensure that the collector port (``4317`` by default, or the value passed to ``--tracing-port``) is free and not already in use by Jaeger or another observability service. If the port is occupied, the command will fail to start the trace collector.
+
+**Examples**
+
+.. code-block:: console
+
+   $ planoai up config.yaml
+   $ planoai up --path ./deploy
+   $ planoai up --with-tracing
+   $ planoai up --with-tracing --tracing-port 4318
+
+
+.. _cli_reference_down:
+
+planoai down
+------------
+
+Stop Plano (container/process stack managed by the CLI). Pass ``--docker`` to stop a deployment that was started in Docker mode.
+
+**Synopsis**
+
+.. code-block:: console
+
+   $ planoai down [--docker]
+
+
+.. _cli_reference_build:
+
+planoai build
+-------------
+
+Build Plano from repository source (native binaries by default; run ``planoai build --help`` for Docker image builds).
+
+**Synopsis**
+
+.. code-block:: console
+
+   $ planoai build
+
+
+.. _cli_reference_logs:
+
+planoai logs
+------------
+
+Stream Plano logs.
+
+**Synopsis**
+
+.. code-block:: console
+
+   $ planoai logs [--follow] [--debug]
+
+**Options**
+
+- ``--follow``: stream logs continuously.
+- ``--debug``: include additional gateway/debug streams.
+
+**Examples**
+
+.. code-block:: console
+
+   $ planoai logs
+   $ planoai logs --follow
+   $ planoai logs --follow --debug
+
+
+.. _cli_reference_init:
+
+planoai init
+------------
+
+Generate a new ``config.yaml`` using an interactive wizard, built-in templates, or a clean empty file.
+
+**Synopsis**
+
+.. code-block:: console
+
+   $ planoai init [--template <id> | --clean] [--output <path>] [--force] [--list-templates]
+
+**Options**
+
+- ``--template <id>``: create config from a built-in template id.
+- ``--clean``: create an empty config file.
+- ``--output, -o <path>``: output path (default ``config.yaml``).
+- ``--force``: overwrite existing output file.
+- ``--list-templates``: print available template IDs and exit.
+
+**Examples**
+
+.. code-block:: console
+
+   $ planoai init
+   $ planoai init --list-templates
+   $ planoai init --template coding_agent_routing
+   $ planoai init --clean --output ./config/config.yaml
+
+.. figure:: /_static/img/cli-init-command.png
+   :width: 100%
+   :alt: planoai init command screenshot
+
+   ``planoai init --list-templates`` showing built-in starter templates.
+
+
+.. _cli_reference_trace:
+
+planoai trace
+-------------
+
+Inspect request traces from the local OTLP listener.
+
+**Synopsis**
+
+.. code-block:: console
+
+   $ planoai trace [TARGET] [OPTIONS]
+
+**Targets**
+
+- ``last`` (default): show most recent trace.
+- ``any``: consider all traces (interactive selection when terminal supports it).
+- ``listen``: start local OTLP listener.
+- ``down``: stop background listener.
+- ``<trace-id>``: full 32-hex trace id.
+- ``<short-id>``: first 8 hex chars of trace id.
+
+**Display options**
+
+- ``--filter <pattern>``: keep only matching attribute keys (supports ``*`` via "glob" syntax).
+- ``--where <key=value>``: locate traces containing key/value (repeatable, AND semantics).
+- ``--list``: list trace IDs instead of full trace output (use with ``--no-interactive`` to fetch plain-text trace IDs only).
+- ``--no-interactive``: disable interactive selection prompts.
+- ``--limit <n>``: limit returned traces.
+- ``--since <window>``: lookback window such as ``5m``, ``2h``, ``1d``.
+- ``--json``: emit JSON payloads.
+- ``--verbose``, ``-v``: show full attribute output (disable compact trimming). Useful for debugging internal attributes.
+
+**Listener options (for ``TARGET=listen``)**
+
+- ``--host <host>``: bind host (default ``0.0.0.0``).
+- ``--port <port>``: bind port (default ``4317``).
+
+.. note::
+
+   When using ``listen``, ensure that the bind port (``4317`` by default, or the value passed to ``--port``) is free and not already in use by Jaeger or another observability service. If the port is occupied, the command will fail to start the trace collector. No other service can share that port while the listener is running.
+
+
+**Environment**
+
+- ``PLANO_TRACE_PORT``: query port used by ``planoai trace`` when reading traces (default ``4317``).
+
+**Examples**
+
+.. code-block:: console
+
+   # Start/stop listener
+   $ planoai trace listen
+   $ planoai trace down
+
+   # Basic inspection
+   $ planoai trace
+   $ planoai trace 7f4e9a1c
+   $ planoai trace 7f4e9a1c0d9d4a0bb9bf5a8a7d13f62a
+
+   # Filtering and automation
+   $ planoai trace --where llm.model=openai/gpt-5.2 --since 30m
+   $ planoai trace --filter "http.*"
+   $ planoai trace --list --limit 5
+   $ planoai trace --where http.status_code=500 --json
+
+.. figure:: /_static/img/cli-trace-command.png
+   :width: 100%
+   :alt: planoai trace command screenshot
+
+   ``planoai trace`` command showing trace inspection and filtering capabilities.
+
+**Operational notes**
+
+- ``--host`` and ``--port`` are valid only when ``TARGET`` is ``listen``.
+- ``--list`` cannot be combined with a specific trace-id target.
+
+
+.. _cli_reference_prompt_targets:
+
+planoai prompt_targets
+----------------------
+
+Generate prompt-target metadata from Python methods.
+
+**Synopsis**
+
+.. code-block:: console
+
+   $ planoai prompt_targets --file <python-file>
+
+**Options**
+
+- ``--file, --f <python-file>``: required path to a ``.py`` source file.
+
+
+.. _cli_reference_cli_agent:
+
+planoai cli_agent
+-----------------
+
+Start an interactive CLI agent session against a running Plano deployment.
+
+**Synopsis**
+
+.. code-block:: console
+
+   $ planoai cli_agent claude [FILE] [--path <dir>] [--settings '<json>']
+
+**Arguments**
+
+- ``type``: currently ``claude``.
+- ``FILE`` (optional): config file path.
+
+**Options**
+
+- ``--path <dir>``: directory containing config file.
+- ``--settings <json>``: JSON settings payload for agent startup.
--- a/docs/source/resources/db_setup/README.md
+++ b/docs/source/resources/db_setup/README.md
@ -64,8 +64,8 @@ After setting up the database table, configure your application to use Supabase

 **Example:**
 ```bash
-# If your password is "MyPass#123", encode it as "MyPass%23123"
-export DATABASE_URL="postgresql://postgres.myproject:MyPass%23123@aws-0-us-west-2.pooler.supabase.com:5432/postgres"
+# If your password is "P@ss#123", encode it as "P%40ss%23123"
+export DATABASE_URL="postgresql://postgres.[YOUR-PROJECT-REF]:<your-url-encoded-password>@aws-0-[REGION].pooler.supabase.com:5432/postgres"
 ```

 ### Testing the Connection
--- a/docs/source/resources/deployment.rst
+++ b/docs/source/resources/deployment.rst
@ -3,7 +3,47 @@
 Deployment
 ==========

-This guide shows how to deploy Plano directly using Docker without the ``plano`` CLI, including basic runtime checks for routing and health monitoring.
+Plano can be deployed in two ways: **natively** on the host (default) or inside a **Docker container**.
+
+Native Deployment (Default)
+---------------------------
+
+Plano runs natively by default. Pre-compiled binaries (Envoy, WASM plugins, brightstaff) are automatically downloaded on the first run and cached at ``~/.plano/``.
+
+Supported platforms: Linux (x86_64, aarch64), macOS (Apple Silicon).
+
+Start Plano
+~~~~~~~~~~~~
+
+.. code-block:: bash
+
+   planoai up plano_config.yaml
+
+Options:
+
+- ``--foreground`` — stay attached and stream logs (Ctrl+C to stop)
+- ``--with-tracing`` — start a local OTLP trace collector
+
+Runtime files (rendered configs, logs, PID file) are stored in ``~/.plano/run/``.
+
+Stop Plano
+~~~~~~~~~~
+
+.. code-block:: bash
+
+   planoai down
+
+Build from Source (Developer)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you want to build from source instead of using pre-compiled binaries, you need:
+
+- `Rust <https://rustup.rs>`_ with the ``wasm32-wasip1`` target
+- OpenSSL dev headers (``libssl-dev`` on Debian/Ubuntu, ``openssl`` on macOS)
+
+.. code-block:: bash
+
+   planoai build

 Docker Deployment
 -----------------
@ -25,7 +65,7 @@ Create a ``docker-compose.yml`` file with the following configuration:
   # docker-compose.yml
   services:
     plano:
-       image: katanemo/plano:0.4.8
+       image: katanemo/plano:0.4.11
       container_name: plano
       ports:
         - "10000:10000" # ingress (client -> plano)
@ -53,6 +93,13 @@ Check container health and logs:
   docker compose ps
   docker compose logs -f plano

+You can also use the CLI with Docker mode:
+
+.. code-block:: bash
+
+   planoai up plano_config.yaml --docker
+   planoai down --docker
+
 Runtime Tests
 -------------

--- a/docs/source/resources/includes/agents/agents_config.yaml
+++ b/docs/source/resources/includes/agents/agents_config.yaml
@ -2,9 +2,9 @@ version: v0.3.0

 agents:
  - id: weather_agent
-    url: http://host.docker.internal:10510
+    url: http://localhost:10510
  - id: flight_agent
-    url: http://host.docker.internal:10520
+    url: http://localhost:10520

 model_providers:
  - model: openai/gpt-4o
--- a/docs/source/resources/includes/agents/flights.py
+++ b/docs/source/resources/includes/agents/flights.py
@ -28,7 +28,7 @@ EXTRACTION_MODEL = "openai/gpt-4o-mini"

 # FlightAware AeroAPI configuration
 AEROAPI_BASE_URL = "https://aeroapi.flightaware.com/aeroapi"
-AEROAPI_KEY = os.getenv("AEROAPI_KEY", "ESVFX7TJLxB7OTuayUv0zTQBryA3tOPr")
+AEROAPI_KEY = os.getenv("AEROAPI_KEY")

 # HTTP client for API calls
 http_client = httpx.AsyncClient(timeout=30.0)
--- a/docs/source/resources/includes/plano_config_agents_filters.yaml
+++ b/docs/source/resources/includes/plano_config_agents_filters.yaml
@ -2,16 +2,16 @@ version: v0.3.0

 agents:
  - id: rag_agent
-    url: http://host.docker.internal:10505
+    url: http://localhost:10505

 filters:
  - id: query_rewriter
-    url: http://host.docker.internal:10501
+    url: http://localhost:10501
    # type: mcp # default is mcp
    # transport: streamable-http # default is streamable-http
    # tool: query_rewriter # default name is the filter id
  - id: context_builder
-    url: http://host.docker.internal:10502
+    url: http://localhost:10502

 model_providers:
  - model: openai/gpt-4o-mini
--- a/docs/source/resources/includes/plano_config_full_reference.yaml
+++ b/docs/source/resources/includes/plano_config_full_reference.yaml
@ -4,15 +4,15 @@ version: v0.3.0
 # External HTTP agents - API type is controlled by request path (/v1/responses, /v1/messages, /v1/chat/completions)
 agents:
  - id: weather_agent # Example agent for weather
-    url: http://host.docker.internal:10510
+    url: http://localhost:10510

  - id: flight_agent # Example agent for flights
-    url: http://host.docker.internal:10520
+    url: http://localhost:10520

 # MCP filters applied to requests/responses (e.g., input validation, query rewriting)
 filters:
  - id: input_guards # Example filter for input validation
-    url: http://host.docker.internal:10500
+    url: http://localhost:10500
    # type: mcp (default)
    # transport: streamable-http (default)
    # tool: input_guards (default - same as filter id)
--- a/docs/source/resources/includes/plano_config_full_reference_rendered.yaml
+++ b/docs/source/resources/includes/plano_config_full_reference_rendered.yaml
@ -1,31 +1,31 @@
 agents:
 - id: weather_agent
-  url: http://host.docker.internal:10510
+  url: http://localhost:10510
 - id: flight_agent
-  url: http://host.docker.internal:10520
+  url: http://localhost:10520
 endpoints:
  app_server:
    connect_timeout: 0.005s
    endpoint: 127.0.0.1
    port: 80
  flight_agent:
-    endpoint: host.docker.internal
+    endpoint: localhost
    port: 10520
    protocol: http
  input_guards:
-    endpoint: host.docker.internal
+    endpoint: localhost
    port: 10500
    protocol: http
  mistral_local:
    endpoint: 127.0.0.1
    port: 8001
  weather_agent:
-    endpoint: host.docker.internal
+    endpoint: localhost
    port: 10510
    protocol: http
 filters:
 - id: input_guards
-  url: http://host.docker.internal:10500
+  url: http://localhost:10500
 listeners:
 - address: 0.0.0.0
  agents:
@ -65,8 +65,6 @@ listeners:
    port: 443
    protocol: https
    provider_interface: openai
-  filter_chain:
-  - input_guards
  name: model_1
  port: 12000
  type: model
@ -132,6 +130,6 @@ prompt_targets:
    required: true
    type: int
 tracing:
-  opentracing_grpc_endpoint: http://host.docker.internal:4317
+  opentracing_grpc_endpoint: http://localhost:4317
  random_sampling: 100
 version: v0.3.0
--- a/docs/source/resources/tech_overview/request_lifecycle.rst
+++ b/docs/source/resources/tech_overview/request_lifecycle.rst
@ -46,6 +46,117 @@ Also, Plano utilizes `Envoy event-based thread model <https://blog.envoyproxy.io
 Worker threads rarely share state and operate in a trivially parallel fashion. This threading model
 enables scaling to very high core count CPUs.

+.. code-block:: text
+
+   ┌─────────────────────────────────────────────────────────────────────────────────────┐
+   │                                    P L A N O                                        │
+   │                  AI-native proxy and data plane for agentic applications            │
+   │                                                                                     │
+   │                              ┌─────────────────────┐                                │
+   │                              │    YOUR CLIENTS     │                                │
+   │                              │ (apps· agents · UI) │                                │
+   │                              └──────────┬──────────┘                                │
+   │                                         │                                           │
+   │          ┌──────────────────────────────┼──────────────────────────┐                │
+   │          │                              │                          │                │
+   │   ┌──────▼──────────┐         ┌─────────▼────────┐       ┌────────▼─────────┐       │
+   │   │  Agent Port(s)  │         │   Model Port     │       │  Function-Call   │       │
+   │   │  :8001+         │         │   :12000         │       │  Port  :10000    │       │
+   │   │                 │         │                  │       │                  │       │
+   │   │  route your     │         │  direct LLM      │       │  prompt-target / │       │
+   │   │  prompts to     │         │  calls with      │       │  tool dispatch   │       │
+   │   │  the right      │         │  model-alias     │       │  with parameter  │       │
+   │   │  agent          │         │  translation     │       │  extraction      │       │
+   │   └──────┬──────────┘         └─────────┬────────┘       └────────┬─────────┘       │
+   │          └──────────────────────────────┼─────────────────────────┘                 │
+   │                                         │                                           │
+   │  ╔══════════════════════════════════════▼══════════════════════════════════════╗    │
+   │  ║            BRIGHTSTAFF (SUBSYSTEM) —  Agentic Control Plane                 ║    │
+   │  ║            Async · non-blocking · parallel per-request Tokio tasks          ║    │
+   │  ║                                                                             ║    │
+   │  ║   ┌─────────────────────────────────────────────────────────────────────┐   ║    │
+   │  ║   │  Agentic ROUTER                                                     │   ║    │
+   │  ║   │  Reads listener config · maps incoming request to execution path    │   ║    │
+   │  ║   │                                                                     │   ║    │
+   │  ║   │   /agents/*  ──────────────────────►  AGENT PATH                    │   ║    │
+   │  ║   │   /v1/chat|messages|responses ──────►  LLM PATH                     │   ║    │
+   │  ║   └─────────────────────────────────────────────────────────────────────┘   ║    │
+   │  ║                                                                             ║    │
+   │  ║   ─────────────────────── AGENT PATH ────────────────────────────────────   ║    │
+   │  ║                                                                             ║    │
+   │  ║   ┌──────────────────────────────────────────────────────────────────────┐  ║    │
+   │  ║   │  FILTER CHAIN                        (pipeline_processor.rs)         │  ║    │
+   │  ║   │                                                                      │  ║    │
+   │  ║   │  prompt ──► [input_guards] ──► [query_rewrite] ──► [context_builder] │  ║    │
+   │  ║   │             guardrails       prompt mutation      RAG / enrichment   │  ║    │
+   │  ║   │                                                                      │  ║    │
+   │  ║   │  Each filter: HTTP or MCP · can mutate, enrich, or short-circuit     │  ║    │
+   │  ║   └──────────────────────────────────┬───────────────────────────────────┘  ║    │
+   │  ║                                      │                                      ║    │
+   │  ║   ┌──────────────────────────────────▼───────────────────────────────────┐  ║    │
+   │  ║   │  AGENT ORCHESTRATOR               (agent_chat_completions.rs)        │  ║    │
+   │  ║   │  Select agent · forward enriched request · manage conversation state │  ║    │
+   │  ║   │  Stream response back · multi-turn aware                             │  ║    │
+   │  ║   └──────────────────────────────────────────────────────────────────────┘  ║    │
+   │  ║                                                                             ║    │
+   │  ║   ─────────────────────── LLM PATH ──────────────────────────────────────   ║    │
+   │  ║                                                                             ║    │
+   │  ║   ┌──────────────────────────────────────────────────────────────────────┐  ║    │
+   │  ║   │  MODEL ROUTER                       (llm_router.rs + router_chat.rs) │  ║    │
+   │  ║   │  Model alias resolution · preference-based provider selection        │  ║    │
+   │  ║   │  "fast-llm" → gpt-4o-mini  ·  "smart-llm" → gpt-4o                   │  ║    │
+   │  ║   └──────────────────────────────────────────────────────────────────────┘  ║    │
+   │  ║                                                                             ║    │
+   │  ║   ─────────────────── ALWAYS ON (every request) ─────────────────────────   ║    │
+   │  ║                                                                             ║    │
+   │  ║   ┌────────────────────┐  ┌─────────────────────┐  ┌──────────────────┐     ║    │
+   │  ║   │  SIGNALS ANALYZER  │  │   STATE STORAGE     │  │  OTEL TRACING    │     ║    │
+   │  ║   │  loop detection    │  │   memory / postgres │  │  traceparent     │     ║    │
+   │  ║   │  repetition score  │  │   /v1/responses     │  │  span injection  │     ║    │
+   │  ║   │  quality indicators│  │   stateful API      │  │  trace export    │     ║    │
+   │  ║   └────────────────────┘  └─────────────────────┘  └──────────────────┘     ║    │
+   │  ╚═════════════════════════════════════╤═══════════════════════════════════════╝    │
+   │                                        │                                            │
+   │  ┌─────────────────────────────────────▼──────────────────────────────────────┐     │
+   │  │  LLM GATEWAY   (llm_gateway.wasm — embedded in Envoy egress filter chain)  │     │
+   │  │                                                                            │     │
+   │  │  Rate limiting  ·  Provider format translation  ·  TTFT metrics            │     │
+   │  │  OpenAI → Anthropic · Gemini · Mistral · Groq · DeepSeek · xAI · Bedrock   │     │
+   │  │                                                                            │     │
+   │  │  Envoy handles beneath this: TLS origination · SNI · retry + backoff       │     │
+   │  │  connection pooling · LOGICAL_DNS · structured access logs                 │     │
+   │  └─────────────────────────────────────┬──────────────────────────────────────┘     │
+   │                                         │                                           │
+   └─────────────────────────────────────────┼───────────────────────────────────────────┘
+                                             │
+                 ┌───────────────────────────┼────────────────────────────┐
+                 │                           │                             │
+       ┌─────────▼──────────┐   ┌────────────▼──────────┐   ┌────────────▼──────────┐
+       │  LLM PROVIDERS     │   │  EXTERNAL AGENTS      │   │  TOOL / API BACKENDS  │
+       │  OpenAI · Anthropic│   │  (filter chain svc)   │   │  (endpoint clusters)  │
+       │  Gemini · Mistral  │   │  HTTP / MCP  :10500+  │   │  user-defined hosts   │
+       │  Groq · DeepSeek   │   │  input_guards         │   │                       │
+       │  xAI · Together.ai │   │  query_rewriter       │   │                       │
+       └────────────────────┘   │  context_builder      │   └───────────────────────┘
+                                └───────────────────────┘
+
+
+     HOW PLANO IS DIFFERENT
+     ─────────────────────────────────────────────────────────────────────────────────
+     Brightstaff is the entire agentic brain — one async Rust binary that handles
+     agent selection, filter chain orchestration, model routing, state, and signals
+     without blocking a thread per request.
+
+     Filter chains are programmable dataplane steps — reusable HTTP/MCP services
+     you wire into any agent, executing in-path before the agent ever sees the prompt.
+
+     The LLM gateway is a zero-overhead WASM plugin inside Envoy — format translation
+     and rate limiting happen in-process with the proxy, not as a separate service hop.
+
+     Envoy provides the transport substrate (TLS, HTTP codecs, retries, connection
+     pools, access logs) so Plano never reimplements solved infrastructure problems.
+
+
 Request Flow (Ingress)
 ----------------------

--- a/tests/e2e/config_native_smoke.yaml
+++ b/tests/e2e/config_native_smoke.yaml
@ -0,0 +1,11 @@
+version: v0.3.0
+
+listeners:
+  - type: model
+    name: model_listener
+    port: 12000
+
+model_providers:
+  - model: openai/gpt-4o
+    access_key: $OPENAI_API_KEY
+    default: true
--- a/tests/e2e/run_e2e_tests.sh
+++ b/tests/e2e/run_e2e_tests.sh
@ -45,8 +45,8 @@ uv sync

 log startup plano gateway with function calling demo
 cd ../../
-planoai down
-planoai up demos/getting_started/weather_forecast/config.yaml
+planoai down --docker
+planoai up --docker demos/getting_started/weather_forecast/config.yaml
 cd -

 log running e2e tests for prompt gateway
@ -55,11 +55,11 @@ uv run pytest test_prompt_gateway.py

 log shutting down the plano gateway service for prompt_gateway demo
 log ===============================================================
-planoai down
+planoai down --docker

 log startup plano gateway with model alias routing demo
 cd ../../
-planoai up demos/llm_routing/model_alias_routing/config_with_aliases.yaml
+planoai up --docker demos/llm_routing/model_alias_routing/config_with_aliases.yaml
 cd -

 log running e2e tests for model alias routing
@ -71,8 +71,8 @@ log ========================================
 uv run pytest test_openai_responses_api_client.py

 log startup plano gateway with state storage for openai responses api client demo
-planoai down
-planoai up config_memory_state_v1_responses.yaml
+planoai down --docker
+planoai up --docker config_memory_state_v1_responses.yaml

 log running e2e tests for openai responses api client
 log ========================================
--- a/tests/e2e/run_model_alias_tests.sh
+++ b/tests/e2e/run_model_alias_tests.sh
@ -36,8 +36,8 @@ uv sync
 # Start gateway with model alias routing config
 log "startup plano gateway with model alias routing demo"
 cd ../../
-planoai down || true
-planoai up demos/llm_routing/model_alias_routing/config_with_aliases.yaml
+planoai down --docker || true
+planoai up --docker demos/llm_routing/model_alias_routing/config_with_aliases.yaml
 cd -

 # Run both test suites that share this config in a single pytest invocation
@ -46,4 +46,4 @@ uv run pytest -n auto test_model_alias_routing.py test_openai_responses_api_clie

 # Cleanup
 log "shutting down"
-planoai down || true
+planoai down --docker || true
--- a/tests/e2e/run_prompt_gateway_tests.sh
+++ b/tests/e2e/run_prompt_gateway_tests.sh
@ -41,8 +41,8 @@ cd -
 # Start gateway with prompt_gateway config
 log "startup plano gateway with function calling demo"
 cd ../../
-planoai down || true
-planoai up demos/getting_started/weather_forecast/config.yaml
+planoai down --docker || true
+planoai up --docker demos/getting_started/weather_forecast/config.yaml
 cd -

 # Run tests
@ -51,7 +51,7 @@ uv run pytest test_prompt_gateway.py

 # Cleanup
 log "shutting down"
-planoai down || true
+planoai down --docker || true
 cd ../../demos/getting_started/weather_forecast
 docker compose down
 cd -
--- a/tests/e2e/run_responses_state_tests.sh
+++ b/tests/e2e/run_responses_state_tests.sh
@ -35,8 +35,8 @@ uv sync
 # Start gateway with state storage config
 log "startup plano gateway with state storage config"
 cd ../../
-planoai down || true
-planoai up tests/e2e/config_memory_state_v1_responses.yaml
+planoai down --docker || true
+planoai up --docker tests/e2e/config_memory_state_v1_responses.yaml
 cd -

 # Run tests
@ -45,4 +45,4 @@ uv run pytest test_openai_responses_api_client_with_state.py

 # Cleanup
 log "shutting down"
-planoai down || true
+planoai down --docker || true