From 05a85e9a85375a24a038793071258e90d6c8b4ff Mon Sep 17 00:00:00 2001 From: Spherrrical Date: Mon, 13 Apr 2026 19:29:59 -0700 Subject: [PATCH] remove session_affinity_redis and session_affinity_redis_k8s demos --- .../session_affinity_redis/.env.example | 1 - .../session_affinity_redis/README.md | 247 ----------- .../session_affinity_redis/config.yaml | 36 -- .../docker-compose.yaml | 23 - .../session_affinity_redis/run_demo.sh | 94 ---- .../session_affinity_redis/verify_affinity.py | 146 ------ .../session_affinity_redis_k8s/.env.example | 1 - .../session_affinity_redis_k8s/Dockerfile | 95 ---- .../session_affinity_redis_k8s/README.md | 287 ------------ .../build-and-push.sh | 47 -- .../config_k8s.yaml | 38 -- .../session_affinity_redis_k8s/deploy.sh | 136 ------ .../k8s/jaeger.yaml | 56 --- .../k8s/namespace.yaml | 6 - .../k8s/plano-config.yaml | 50 --- .../k8s/plano-secrets.example.yaml | 19 - .../session_affinity_redis_k8s/k8s/plano.yaml | 130 ------ .../session_affinity_redis_k8s/k8s/redis.yaml | 96 ---- .../session_affinity_redis_k8s/run-local.sh | 154 ------- .../verify_affinity.py | 418 ------------------ 20 files changed, 2080 deletions(-) delete mode 100644 demos/llm_routing/session_affinity_redis/.env.example delete mode 100644 demos/llm_routing/session_affinity_redis/README.md delete mode 100644 demos/llm_routing/session_affinity_redis/config.yaml delete mode 100644 demos/llm_routing/session_affinity_redis/docker-compose.yaml delete mode 100755 demos/llm_routing/session_affinity_redis/run_demo.sh delete mode 100644 demos/llm_routing/session_affinity_redis/verify_affinity.py delete mode 100644 demos/llm_routing/session_affinity_redis_k8s/.env.example delete mode 100644 demos/llm_routing/session_affinity_redis_k8s/Dockerfile delete mode 100644 demos/llm_routing/session_affinity_redis_k8s/README.md delete mode 100755 demos/llm_routing/session_affinity_redis_k8s/build-and-push.sh delete mode 100644 demos/llm_routing/session_affinity_redis_k8s/config_k8s.yaml delete mode 100755 demos/llm_routing/session_affinity_redis_k8s/deploy.sh delete mode 100644 demos/llm_routing/session_affinity_redis_k8s/k8s/jaeger.yaml delete mode 100644 demos/llm_routing/session_affinity_redis_k8s/k8s/namespace.yaml delete mode 100644 demos/llm_routing/session_affinity_redis_k8s/k8s/plano-config.yaml delete mode 100644 demos/llm_routing/session_affinity_redis_k8s/k8s/plano-secrets.example.yaml delete mode 100644 demos/llm_routing/session_affinity_redis_k8s/k8s/plano.yaml delete mode 100644 demos/llm_routing/session_affinity_redis_k8s/k8s/redis.yaml delete mode 100755 demos/llm_routing/session_affinity_redis_k8s/run-local.sh delete mode 100644 demos/llm_routing/session_affinity_redis_k8s/verify_affinity.py diff --git a/demos/llm_routing/session_affinity_redis/.env.example b/demos/llm_routing/session_affinity_redis/.env.example deleted file mode 100644 index f045319d..00000000 --- a/demos/llm_routing/session_affinity_redis/.env.example +++ /dev/null @@ -1 +0,0 @@ -OPENAI_API_KEY=sk-replace-me diff --git a/demos/llm_routing/session_affinity_redis/README.md b/demos/llm_routing/session_affinity_redis/README.md deleted file mode 100644 index d74cf35b..00000000 --- a/demos/llm_routing/session_affinity_redis/README.md +++ /dev/null @@ -1,247 +0,0 @@ -# Session Affinity with Redis — Multi-Replica Model Pinning - -This demo shows Plano's **session affinity** (`X-Model-Affinity` header) backed by a **Redis session cache** instead of the default in-memory store. - -## The Problem - -By default, model affinity stores routing decisions in a per-process `HashMap`. -This works for single-instance deployments, but breaks when you run multiple -Plano replicas behind a load balancer: - -``` -Client ──► Load Balancer ──► Replica A (session pinned here) - └──► Replica B (knows nothing about the session) -``` - -A request that was pinned to `gpt-4o` on Replica A will be re-routed from -scratch on Replica B, defeating the purpose of affinity. - -## The Solution - -Plano's `session_cache` config key accepts a `type: redis` backend that is -shared across all replicas: - -```yaml -routing: - session_ttl_seconds: 300 - session_cache: - type: redis - url: redis://localhost:6379 -``` - -All replicas read and write the same Redis keyspace. A session pinned on any -replica is immediately visible to all others. - -## What to Look For - -| What | Expected behaviour | -|------|--------------------| -| First request with a session ID | Plano routes normally (via Arch-Router) and writes the result to Redis (`SET session-id ... EX 300`) | -| Subsequent requests with the **same** session ID | Plano reads from Redis and skips the router — same model every time | -| Requests with a **different** session ID | Routed independently; may land on a different model | -| After `session_ttl_seconds` elapses | Redis key expires; next request re-routes and sets a new pin | -| `x-plano-pinned: true` response header | Tells you the response was served from the session cache | - -## Architecture - -``` -Client - │ X-Model-Affinity: my-session-id - ▼ -Plano (brightstaff) - ├── GET redis://localhost:6379/my-session-id - │ hit? → return pinned model immediately (no Arch-Router call) - │ miss? → call Arch-Router → SET key EX 300 → return routed model - ▼ -Redis (shared across replicas) -``` - -## Prerequisites - -| Requirement | Notes | -|-------------|-------| -| `planoai` CLI | `pip install planoai` | -| Docker + Docker Compose | For Redis and Jaeger | -| `OPENAI_API_KEY` | Required for routing model (Arch-Router) and downstream LLMs | -| Python 3.11+ | Only needed to run `verify_affinity.py` | - -## Quick Start - -```bash -# 1. Set your API key -export OPENAI_API_KEY=sk-... -# or copy and edit: -cp .env.example .env - -# 2. Start Redis, Jaeger, and Plano -./run_demo.sh up - -# 3. Verify session pinning works -python verify_affinity.py -``` - -## Manual Verification with curl - -### Step 1 — Pin a session (first request sets the affinity) - -```bash -curl -s http://localhost:12000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -H "x-model-affinity: my-session-abc" \ - -d '{"model":"openai/gpt-4o-mini","messages":[{"role":"user","content":"Write a short poem about the ocean."}]}' \ - | jq '{model, pinned: .x_plano_pinned}' -``` - -Expected output (first request — not yet pinned, Arch-Router picks the model): - -```json -{ - "model": "openai/gpt-5.2", - "pinned": null -} -``` - -### Step 2 — Confirm the pin is held on subsequent requests - -```bash -for i in 1 2 3 4; do - curl -s http://localhost:12000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -H "x-model-affinity: my-session-abc" \ - -d "{\"model\":\"openai/gpt-4o-mini\",\"messages\":[{\"role\":\"user\",\"content\":\"Request $i\"}]}" \ - | jq -r '"\(.model)"' -done -``` - -Expected output (same model for every request): - -``` -openai/gpt-5.2 -openai/gpt-5.2 -openai/gpt-5.2 -openai/gpt-5.2 -``` - -### Step 3 — Inspect the Redis key directly - -```bash -docker exec plano-session-redis redis-cli \ - GET my-session-abc | python3 -m json.tool -``` - -Expected output: - -```json -{ - "model_name": "openai/gpt-5.2", - "route_name": "deep_reasoning" -} -``` - -```bash -# Check the TTL (seconds remaining) -docker exec plano-session-redis redis-cli TTL my-session-abc -# e.g. 287 -``` - -### Step 4 — Different sessions may get different models - -```bash -for session in session-A session-B session-C; do - model=$(curl -s http://localhost:12000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -H "x-model-affinity: $session" \ - -d '{"model":"openai/gpt-4o-mini","messages":[{"role":"user","content":"Explain quantum entanglement in detail with equations."}]}' \ - | jq -r '.model') - echo "$session -> $model" -done -``` - -Sessions with content matched to `deep_reasoning` will pin to `openai/gpt-5.2`; -sessions matched to `fast_responses` will pin to `openai/gpt-4o-mini`. - -## Verification Script Output - -Running `python verify_affinity.py` produces output like: - -``` -Plano endpoint : http://localhost:12000/v1/chat/completions -Sessions : 3 -Rounds/session : 4 - -============================================================ -Phase 1: Requests WITHOUT X-Model-Affinity header - (model may vary between requests — that is expected) -============================================================ - Request 1: model = openai/gpt-4o-mini - Request 2: model = openai/gpt-5.2 - Request 3: model = openai/gpt-4o-mini - Models seen across 3 requests: {'openai/gpt-4o-mini', 'openai/gpt-5.2'} - -============================================================ -Phase 2: Requests WITH X-Model-Affinity (session pinning) - Each session should be pinned to exactly one model. -============================================================ - - Session 'demo-session-001': - Round 1: model = openai/gpt-4o-mini [FIRST — sets affinity] - Round 2: model = openai/gpt-4o-mini [PINNED] - Round 3: model = openai/gpt-4o-mini [PINNED] - Round 4: model = openai/gpt-4o-mini [PINNED] - - Session 'demo-session-002': - Round 1: model = openai/gpt-5.2 [FIRST — sets affinity] - Round 2: model = openai/gpt-5.2 [PINNED] - Round 3: model = openai/gpt-5.2 [PINNED] - Round 4: model = openai/gpt-5.2 [PINNED] - - Session 'demo-session-003': - Round 1: model = openai/gpt-4o-mini [FIRST — sets affinity] - Round 2: model = openai/gpt-4o-mini [PINNED] - Round 3: model = openai/gpt-4o-mini [PINNED] - Round 4: model = openai/gpt-4o-mini [PINNED] - -============================================================ -Results -============================================================ - PASS demo-session-001 -> always routed to 'openai/gpt-4o-mini' - PASS demo-session-002 -> always routed to 'openai/gpt-5.2' - PASS demo-session-003 -> always routed to 'openai/gpt-4o-mini' - -All sessions were pinned consistently. -Redis session cache is working correctly. -``` - -## Observability - -Open Jaeger at **http://localhost:16686** and select service `plano`. - -- Requests **without** affinity: look for a span to the Arch-Router service -- Requests **with** affinity (pinned): the Arch-Router span will be absent — - the decision was served from Redis without calling the router at all - -This is the clearest observable signal that the cache is working: pinned -requests are noticeably faster and produce fewer spans. - -## Switching to the In-Memory Backend - -To compare against the default in-memory backend, change `config.yaml`: - -```yaml -routing: - session_ttl_seconds: 300 - session_cache: - type: memory # ← change this -``` - -In-memory mode does **not** require Redis and works identically for a -single Plano process. The difference only becomes visible when you run -multiple replicas. - -## Teardown - -```bash -./run_demo.sh down -``` - -This stops Plano, Redis, and Jaeger. diff --git a/demos/llm_routing/session_affinity_redis/config.yaml b/demos/llm_routing/session_affinity_redis/config.yaml deleted file mode 100644 index bd413582..00000000 --- a/demos/llm_routing/session_affinity_redis/config.yaml +++ /dev/null @@ -1,36 +0,0 @@ -version: v0.4.0 - -listeners: - - type: model - name: model_listener - port: 12000 - -model_providers: - - model: openai/gpt-4o-mini - access_key: $OPENAI_API_KEY - default: true - - - model: openai/gpt-5.2 - access_key: $OPENAI_API_KEY - -routing_preferences: - - name: fast_responses - description: short factual questions, quick lookups, simple summarization, or greetings - models: - - openai/gpt-4o-mini - - - name: deep_reasoning - description: multi-step reasoning, complex analysis, code review, or detailed explanations - models: - - openai/gpt-5.2 - - openai/gpt-4o-mini - -routing: - session_ttl_seconds: 300 - session_cache: - type: redis - url: redis://localhost:6379 - -tracing: - random_sampling: 100 - trace_arch_internal: true diff --git a/demos/llm_routing/session_affinity_redis/docker-compose.yaml b/demos/llm_routing/session_affinity_redis/docker-compose.yaml deleted file mode 100644 index 011fe6c9..00000000 --- a/demos/llm_routing/session_affinity_redis/docker-compose.yaml +++ /dev/null @@ -1,23 +0,0 @@ -services: - redis: - image: redis:7-alpine - container_name: plano-session-redis - restart: unless-stopped - ports: - - "6379:6379" - command: redis-server --save "" --appendonly no - healthcheck: - test: ["CMD", "redis-cli", "ping"] - interval: 1s - timeout: 1s - retries: 10 - - jaeger: - build: - context: ../../shared/jaeger - container_name: plano-session-jaeger - restart: unless-stopped - ports: - - "16686:16686" - - "4317:4317" - - "4318:4318" diff --git a/demos/llm_routing/session_affinity_redis/run_demo.sh b/demos/llm_routing/session_affinity_redis/run_demo.sh deleted file mode 100755 index ca84d44d..00000000 --- a/demos/llm_routing/session_affinity_redis/run_demo.sh +++ /dev/null @@ -1,94 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -DEMO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -load_env() { - if [ -f "$DEMO_DIR/.env" ]; then - set -a - # shellcheck disable=SC1091 - source "$DEMO_DIR/.env" - set +a - fi -} - -check_prereqs() { - local missing=() - command -v docker >/dev/null 2>&1 || missing+=("docker") - command -v planoai >/dev/null 2>&1 || missing+=("planoai (pip install planoai)") - if [ ${#missing[@]} -gt 0 ]; then - echo "ERROR: missing required tools: ${missing[*]}" - exit 1 - fi - - if [ -z "${OPENAI_API_KEY:-}" ]; then - echo "ERROR: OPENAI_API_KEY is not set." - echo " Create a .env file or export the variable before running." - exit 1 - fi -} - -start_demo() { - echo "==> Starting Redis + Jaeger..." - docker compose -f "$DEMO_DIR/docker-compose.yaml" up -d - - echo "==> Waiting for Redis to be ready..." - local retries=0 - until docker exec plano-session-redis redis-cli ping 2>/dev/null | grep -q PONG; do - retries=$((retries + 1)) - if [ $retries -ge 15 ]; then - echo "ERROR: Redis did not become ready in time" - exit 1 - fi - sleep 1 - done - echo " Redis is ready." - - echo "==> Starting Plano..." - planoai up "$DEMO_DIR/config.yaml" - - echo "" - echo "Demo is running!" - echo "" - echo " Model endpoint: http://localhost:12000/v1/chat/completions" - echo " Jaeger UI: http://localhost:16686" - echo " Redis: localhost:6379" - echo "" - echo "Run the verification script to confirm session pinning:" - echo " python $DEMO_DIR/verify_affinity.py" - echo "" - echo "Stop the demo with: $0 down" -} - -stop_demo() { - echo "==> Stopping Plano..." - planoai down 2>/dev/null || true - - echo "==> Stopping Docker services..." - docker compose -f "$DEMO_DIR/docker-compose.yaml" down - - echo "Demo stopped." -} - -usage() { - echo "Usage: $0 [up|down]" - echo "" - echo " up Start Redis, Jaeger, and Plano (default)" - echo " down Stop all services" -} - -load_env - -case "${1:-up}" in - up) - check_prereqs - start_demo - ;; - down) - stop_demo - ;; - *) - usage - exit 1 - ;; -esac diff --git a/demos/llm_routing/session_affinity_redis/verify_affinity.py b/demos/llm_routing/session_affinity_redis/verify_affinity.py deleted file mode 100644 index 213c6fbe..00000000 --- a/demos/llm_routing/session_affinity_redis/verify_affinity.py +++ /dev/null @@ -1,146 +0,0 @@ -#!/usr/bin/env python3 -""" -verify_affinity.py — Verify that model affinity (session pinning) works correctly. - -Sends multiple requests with the same X-Model-Affinity session ID and asserts -that every response is served by the same model, demonstrating that Plano's -session cache is working as expected. - -Usage: - python verify_affinity.py [--url URL] [--rounds N] [--sessions N] -""" - -import argparse -import json -import sys -import urllib.error -import urllib.request -from collections import defaultdict - -PLANO_URL = "http://localhost:12000/v1/chat/completions" - -PROMPTS = [ - "What is 2 + 2?", - "Name the capital of France.", - "How many days in a week?", - "What color is the sky?", - "Who wrote Romeo and Juliet?", -] - -MESSAGES_PER_SESSION = [{"role": "user", "content": prompt} for prompt in PROMPTS] - - -def chat(url: str, session_id: str | None, message: str) -> dict: - payload = json.dumps( - { - "model": "openai/gpt-4o-mini", - "messages": [{"role": "user", "content": message}], - } - ).encode() - - headers = {"Content-Type": "application/json"} - if session_id: - headers["x-model-affinity"] = session_id - - req = urllib.request.Request(url, data=payload, headers=headers, method="POST") - try: - with urllib.request.urlopen(req, timeout=30) as resp: - return json.loads(resp.read()) - except urllib.error.URLError as e: - print(f" ERROR: could not reach Plano at {url}: {e}", file=sys.stderr) - print(" Is the demo running? Start it with: ./run_demo.sh up", file=sys.stderr) - sys.exit(1) - - -def extract_model(response: dict) -> str: - return response.get("model", "") - - -def run_verification(url: str, rounds: int, num_sessions: int) -> bool: - print(f"Plano endpoint : {url}") - print(f"Sessions : {num_sessions}") - print(f"Rounds/session : {rounds}") - print() - - all_passed = True - - # --- Phase 1: Requests without session ID --- - print("=" * 60) - print("Phase 1: Requests WITHOUT X-Model-Affinity header") - print(" (model may vary between requests — that is expected)") - print("=" * 60) - models_seen: set[str] = set() - for i in range(min(rounds, 3)): - resp = chat(url, None, PROMPTS[i % len(PROMPTS)]) - model = extract_model(resp) - models_seen.add(model) - print(f" Request {i + 1}: model = {model}") - print(f" Models seen across {min(rounds, 3)} requests: {models_seen}") - print() - - # --- Phase 2: Each session should always get the same model --- - print("=" * 60) - print("Phase 2: Requests WITH X-Model-Affinity (session pinning)") - print(" Each session should be pinned to exactly one model.") - print("=" * 60) - - session_results: dict[str, list[str]] = defaultdict(list) - - for s in range(num_sessions): - session_id = f"demo-session-{s + 1:03d}" - print(f"\n Session '{session_id}':") - - for r in range(rounds): - resp = chat(url, session_id, PROMPTS[r % len(PROMPTS)]) - model = extract_model(resp) - session_results[session_id].append(model) - pinned = " [PINNED]" if r > 0 else " [FIRST — sets affinity]" - print(f" Round {r + 1}: model = {model}{pinned}") - - print() - print("=" * 60) - print("Results") - print("=" * 60) - - for session_id, models in session_results.items(): - unique_models = set(models) - if len(unique_models) == 1: - print(f" PASS {session_id} -> always routed to '{models[0]}'") - else: - print( - f" FAIL {session_id} -> inconsistent models across rounds: {unique_models}" - ) - all_passed = False - - print() - if all_passed: - print("All sessions were pinned consistently.") - print("Redis session cache is working correctly.") - else: - print("One or more sessions were NOT pinned consistently.") - print("Check that Redis is running and Plano is configured with:") - print(" routing:") - print(" session_cache:") - print(" type: redis") - print(" url: redis://localhost:6379") - - return all_passed - - -def main() -> None: - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--url", default=PLANO_URL, help="Plano chat completions URL") - parser.add_argument( - "--rounds", type=int, default=4, help="Requests per session (default 4)" - ) - parser.add_argument( - "--sessions", type=int, default=3, help="Number of sessions to test (default 3)" - ) - args = parser.parse_args() - - passed = run_verification(args.url, args.rounds, args.sessions) - sys.exit(0 if passed else 1) - - -if __name__ == "__main__": - main() diff --git a/demos/llm_routing/session_affinity_redis_k8s/.env.example b/demos/llm_routing/session_affinity_redis_k8s/.env.example deleted file mode 100644 index f045319d..00000000 --- a/demos/llm_routing/session_affinity_redis_k8s/.env.example +++ /dev/null @@ -1 +0,0 @@ -OPENAI_API_KEY=sk-replace-me diff --git a/demos/llm_routing/session_affinity_redis_k8s/Dockerfile b/demos/llm_routing/session_affinity_redis_k8s/Dockerfile deleted file mode 100644 index 877b2246..00000000 --- a/demos/llm_routing/session_affinity_redis_k8s/Dockerfile +++ /dev/null @@ -1,95 +0,0 @@ -# Plano image for Redis-backed session affinity demo. -# Build context must be the repository root: -# docker build -f demos/llm_routing/session_affinity_redis_k8s/Dockerfile -t . - -# Envoy version — keep in sync with cli/planoai/consts.py ENVOY_VERSION -ARG ENVOY_VERSION=v1.37.0 - -# --- Dependency cache --- -FROM rust:1.93.0 AS deps -RUN rustup -v target add wasm32-wasip1 -WORKDIR /arch - -COPY crates/Cargo.toml crates/Cargo.lock ./ -COPY crates/common/Cargo.toml common/Cargo.toml -COPY crates/hermesllm/Cargo.toml hermesllm/Cargo.toml -COPY crates/prompt_gateway/Cargo.toml prompt_gateway/Cargo.toml -COPY crates/llm_gateway/Cargo.toml llm_gateway/Cargo.toml -COPY crates/brightstaff/Cargo.toml brightstaff/Cargo.toml - -RUN mkdir -p common/src && echo "" > common/src/lib.rs && \ - mkdir -p hermesllm/src && echo "" > hermesllm/src/lib.rs && \ - mkdir -p hermesllm/src/bin && echo "fn main() {}" > hermesllm/src/bin/fetch_models.rs && \ - mkdir -p prompt_gateway/src && echo "#[no_mangle] pub fn _start() {}" > prompt_gateway/src/lib.rs && \ - mkdir -p llm_gateway/src && echo "#[no_mangle] pub fn _start() {}" > llm_gateway/src/lib.rs && \ - mkdir -p brightstaff/src && echo "fn main() {}" > brightstaff/src/main.rs && echo "" > brightstaff/src/lib.rs - -RUN cargo build --release --target wasm32-wasip1 -p prompt_gateway -p llm_gateway || true -RUN cargo build --release -p brightstaff || true - -# --- WASM plugins --- -FROM deps AS wasm-builder -RUN rm -rf common/src hermesllm/src prompt_gateway/src llm_gateway/src -COPY crates/common/src common/src -COPY crates/hermesllm/src hermesllm/src -COPY crates/prompt_gateway/src prompt_gateway/src -COPY crates/llm_gateway/src llm_gateway/src -RUN find common hermesllm prompt_gateway llm_gateway -name "*.rs" -exec touch {} + -RUN cargo build --release --target wasm32-wasip1 -p prompt_gateway -p llm_gateway - -# --- Brightstaff binary --- -FROM deps AS brightstaff-builder -RUN rm -rf common/src hermesllm/src brightstaff/src -COPY crates/common/src common/src -COPY crates/hermesllm/src hermesllm/src -COPY crates/brightstaff/src brightstaff/src -RUN find common hermesllm brightstaff -name "*.rs" -exec touch {} + -RUN cargo build --release -p brightstaff - -FROM docker.io/envoyproxy/envoy:${ENVOY_VERSION} AS envoy - -FROM python:3.14-slim AS arch - -RUN set -eux; \ - apt-get update; \ - apt-get upgrade -y; \ - apt-get install -y --no-install-recommends gettext-base curl procps; \ - apt-get clean; rm -rf /var/lib/apt/lists/* - -RUN pip install --no-cache-dir supervisor - -RUN set -eux; \ - dpkg -r --force-depends libpam-modules libpam-modules-bin libpam-runtime libpam0g || true; \ - dpkg -P --force-all libpam-modules libpam-modules-bin libpam-runtime libpam0g || true; \ - rm -rf /etc/pam.d /lib/*/security /usr/lib/security || true - -COPY --from=envoy /usr/local/bin/envoy /usr/local/bin/envoy - -WORKDIR /app - -RUN pip install --no-cache-dir uv - -COPY cli/pyproject.toml ./ -COPY cli/uv.lock ./ -COPY cli/README.md ./ -COPY config/plano_config_schema.yaml /config/plano_config_schema.yaml -COPY config/envoy.template.yaml /config/envoy.template.yaml - -RUN pip install --no-cache-dir -e . - -COPY cli/planoai planoai/ -COPY config/envoy.template.yaml . -COPY config/plano_config_schema.yaml . -RUN mkdir -p /etc/supervisor/conf.d -COPY config/supervisord.conf /etc/supervisor/conf.d/supervisord.conf - -COPY --from=wasm-builder /arch/target/wasm32-wasip1/release/prompt_gateway.wasm /etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm -COPY --from=wasm-builder /arch/target/wasm32-wasip1/release/llm_gateway.wasm /etc/envoy/proxy-wasm-plugins/llm_gateway.wasm -COPY --from=brightstaff-builder /arch/target/release/brightstaff /app/brightstaff - -RUN mkdir -p /var/log/supervisor && \ - touch /var/log/envoy.log /var/log/supervisor/supervisord.log \ - /var/log/access_ingress.log /var/log/access_ingress_prompt.log \ - /var/log/access_internal.log /var/log/access_llm.log /var/log/access_agent.log - -ENTRYPOINT ["/usr/local/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] diff --git a/demos/llm_routing/session_affinity_redis_k8s/README.md b/demos/llm_routing/session_affinity_redis_k8s/README.md deleted file mode 100644 index 57905bc7..00000000 --- a/demos/llm_routing/session_affinity_redis_k8s/README.md +++ /dev/null @@ -1,287 +0,0 @@ -# Session Affinity — Multi-Replica Kubernetes Deployment - -Production-style Kubernetes demo that proves Redis-backed session affinity -(`X-Model-Affinity`) works correctly when Plano runs as multiple replicas -behind a load balancer. - -## Architecture - -``` - ┌─────────────────────────────────────────┐ - │ Kubernetes Cluster │ - │ │ - Client ──────────►│ LoadBalancer Service (port 12000) │ - │ │ │ │ - │ ┌────▼────┐ ┌─────▼───┐ │ - │ │ Plano │ │ Plano │ (replicas) │ - │ │ Pod 0 │ │ Pod 1 │ │ - │ └────┬────┘ └────┬────┘ │ - │ └──────┬───────┘ │ - │ ┌────▼────┐ │ - │ │ Redis │ (StatefulSet) │ - │ │ Pod │ shared session store │ - │ └─────────┘ │ - │ │ - │ ┌──────────┐ │ - │ │ Jaeger │ distributed tracing │ - │ └──────────┘ │ - └─────────────────────────────────────────┘ -``` - -**What makes this production-like:** - -| Feature | Detail | -|---------|--------| -| 2 Plano replicas | `replicas: 2` with HPA (scales 2–5 on CPU) | -| Shared Redis | StatefulSet with PVC — sessions survive pod restarts | -| Session TTL | 600 s, enforced natively by Redis `EX` | -| Eviction policy | `allkeys-lru` — Redis auto-evicts oldest sessions under memory pressure | -| Distributed tracing | Jaeger collects spans from both pods | -| Health probes | Readiness + liveness gates traffic away from unhealthy pods | - -## Quick Start (local — no registry needed) - -```bash -# 1. Install kind if needed -# https://kind.sigs.k8s.io/docs/user/quick-start/#installation -# brew install kind (macOS) - -# 2. Set your API key -export OPENAI_API_KEY=sk-... -# or copy and edit: -cp .env.example .env - -# 3. Build, deploy, and verify in one command -./run-local.sh -``` - -`run-local.sh` creates a kind cluster named `plano-demo` (if it doesn't exist), -builds the image locally, loads it into the cluster with `kind load docker-image` -— **no registry, no push required**. - -Individual steps: - -```bash -./run-local.sh --build-only # (re-)build and reload image into kind -./run-local.sh --deploy-only # (re-)apply k8s manifests -./run-local.sh --verify # run verify_affinity.py -./run-local.sh --down # delete k8s resources (keeps kind cluster) -./run-local.sh --delete-cluster # delete k8s resources + kind cluster -``` - ---- - -## Prerequisites - -| Tool | Notes | -|------|-------| -| `kubectl` | Configured to reach a Kubernetes cluster | -| `docker` | To build and push the custom image | -| Container registry (optional) | Needed only when you are not using the local kind flow | -| `OPENAI_API_KEY` | For model inference | -| Python 3.11+ | Only for `verify_affinity.py` | - -**Cluster:** `run-local.sh` creates and manages a kind cluster named `plano-demo` automatically. Install kind from https://kind.sigs.k8s.io or `brew install kind`. - -## Step 1 — Build the Image - -Build a custom image from the repo root: - -```bash -# From this demo directory: -./build-and-push.sh ghcr.io/yourorg/plano-redis:latest - -# Or manually from the repo root: -docker build \ - -f demos/llm_routing/session_affinity_redis_k8s/Dockerfile \ - -t ghcr.io/yourorg/plano-redis:latest \ - . -docker push ghcr.io/yourorg/plano-redis:latest -``` - -Then update the image reference in `k8s/plano.yaml` (skip this when using `run-local.sh`, which uses `plano-redis:local` automatically): - -```yaml -image: ghcr.io/yourorg/plano-redis:latest # ← replace YOUR_REGISTRY/plano-redis:latest -``` - -## Step 2 — Deploy - -```bash -./deploy.sh -``` - -The script: -1. Creates the `plano-demo` namespace -2. Prompts for `OPENAI_API_KEY` and creates a Kubernetes Secret -3. Applies Redis, Jaeger, ConfigMap, and Plano manifests in order -4. Waits for rollouts to complete - -Expected output: - -``` -==> Applying namespace... -==> Creating API key secret... - OPENAI_API_KEY: [hidden] -==> Applying Redis (StatefulSet + Services)... -==> Applying Jaeger... -==> Applying Plano config (ConfigMap)... -==> Applying Plano deployment + HPA... -==> Waiting for Redis to be ready... -==> Waiting for Plano pods to be ready... - -Deployment complete! - -=== Pods === -NAME READY STATUS NODE -redis-0 1/1 Running node-1 -plano-6d8f9b-xk2pq 1/1 Running node-1 -plano-6d8f9b-r7nlw 1/1 Running node-2 -jaeger-5c7d8f-q9mnb 1/1 Running node-1 - -=== Services === -NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) -plano LoadBalancer 10.96.12.50 203.0.113.42 12000:32000/TCP -redis ClusterIP None 6379/TCP -jaeger ClusterIP 10.96.8.71 16686/TCP,... -``` - -## Step 3 — Verify Session Affinity Across Replicas - -```bash -python verify_affinity.py -``` - -The script opens a dedicated `kubectl port-forward` tunnel to **each pod -individually**. This is the definitive test: it routes requests to specific -pods rather than relying on random load-balancer assignment. - -``` -Mode: per-pod port-forward (full cross-replica proof) - -Found 2 Plano pod(s): plano-6d8f9b-xk2pq, plano-6d8f9b-r7nlw -Opening per-pod port-forward tunnels... - - plano-6d8f9b-xk2pq → localhost:19100 - plano-6d8f9b-r7nlw → localhost:19101 - -================================================================== -Phase 1: Cross-replica session pinning - Pods under test : plano-6d8f9b-xk2pq, plano-6d8f9b-r7nlw - Sessions : 4 - Rounds/session : 4 - - Each session is PINNED via one pod and VERIFIED via another. - If Redis is shared, every round must return the same model. -================================================================== - - PASS k8s-session-001 - model : gpt-4o-mini-2024-07-18 - pod order : plano-6d8f9b-xk2pq → plano-6d8f9b-r7nlw → plano-6d8f9b-xk2pq → plano-6d8f9b-r7nlw - - PASS k8s-session-002 - model : gpt-5.2 - pod order : plano-6d8f9b-r7nlw → plano-6d8f9b-xk2pq → plano-6d8f9b-r7nlw → plano-6d8f9b-xk2pq - - PASS k8s-session-003 - model : gpt-4o-mini-2024-07-18 - pod order : plano-6d8f9b-xk2pq → plano-6d8f9b-r7nlw → plano-6d8f9b-xk2pq → plano-6d8f9b-r7nlw - - PASS k8s-session-004 - model : gpt-5.2 - pod order : plano-6d8f9b-r7nlw → plano-6d8f9b-xk2pq → plano-6d8f9b-r7nlw → plano-6d8f9b-xk2pq - -================================================================== -Phase 2: Redis key inspection -================================================================== - k8s-session-001 - model_name : gpt-4o-mini-2024-07-18 - route_name : fast_responses - TTL : 587s remaining - k8s-session-002 - model_name : gpt-5.2 - route_name : deep_reasoning - TTL : 581s remaining - ... - -================================================================== -Summary -================================================================== -All sessions were pinned consistently across replicas. -Redis session cache is working correctly in Kubernetes. -``` - -## What to Look For - -### The cross-replica proof - -Each session's `pod order` line shows it alternating between the two pods: - -``` -pod order: pod-A → pod-B → pod-A → pod-B -``` - -Round 1 sets the Redis key (via pod-A). Rounds 2, 3, 4 read from Redis on -alternating pods. If the model stays the same across all rounds, Redis is the -shared source of truth — **not** any in-process state. - -### Redis keys - -```bash -kubectl exec -it redis-0 -n plano-demo -- redis-cli - -127.0.0.1:6379> KEYS * -1) "k8s-session-001" -2) "k8s-session-002" - -127.0.0.1:6379> GET k8s-session-001 -{"model_name":"gpt-4o-mini-2024-07-18","route_name":"fast_responses"} - -127.0.0.1:6379> TTL k8s-session-001 -(integer) 587 -``` - -### Jaeger traces - -```bash -kubectl port-forward svc/jaeger 16686:16686 -n plano-demo -``` - -Open **http://localhost:16686**, select service `plano`. - -- **Pinned requests** — no span to the Arch-Router (decision served from Redis) -- **First request** per session — spans include the router call + a Redis `SET` -- Both Plano pods appear as separate instances in the trace list - -### Scaling up (HPA in action) - -```bash -# Scale to 3 replicas manually -kubectl scale deployment/plano --replicas=3 -n plano-demo - -# Run verification again — now 3 pods alternate -python verify_affinity.py --sessions 6 -``` - -Existing sessions in Redis are unaffected by the scale event. New pods -immediately participate in the shared session pool. - -## Teardown - -```bash -./deploy.sh --destroy -# Then optionally: -kubectl delete namespace plano-demo -``` - -## Notes - -- The Redis StatefulSet uses a `PersistentVolumeClaim`. Session data survives - pod restarts within a TTL window but is not HA. For production, replace with - Redis Sentinel, Redis Cluster, or a managed service (ElastiCache, MemoryStore). -- `session_max_entries` is not enforced by this backend — Redis uses - `maxmemory-policy: allkeys-lru` instead, which is a global limit across all - keys rather than a per-application cap. -- On **minikube**, run `minikube tunnel` in a separate terminal to get an - external IP for the LoadBalancer service. -- On **kind**, switch to `NodePort` (see the comment in `k8s/plano.yaml`). diff --git a/demos/llm_routing/session_affinity_redis_k8s/build-and-push.sh b/demos/llm_routing/session_affinity_redis_k8s/build-and-push.sh deleted file mode 100755 index 12632ed9..00000000 --- a/demos/llm_routing/session_affinity_redis_k8s/build-and-push.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env bash -# build-and-push.sh — Build the Plano demo image and push it to your registry. -# -# Usage: -# ./build-and-push.sh -# -# Example: -# ./build-and-push.sh ghcr.io/yourorg/plano-redis:latest -# ./build-and-push.sh docker.io/youruser/plano-redis:0.4.17 -# -# The build context is the repository root. Run this script from anywhere — -# it resolves the repo root automatically. - -set -euo pipefail - -IMAGE="${1:-}" -if [ -z "$IMAGE" ]; then - echo "Usage: $0 " - echo "" - echo "Example:" - echo " $0 ghcr.io/yourorg/plano-redis:latest" - exit 1 -fi - -REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)" -DOCKERFILE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/Dockerfile" - -echo "Repository root : $REPO_ROOT" -echo "Dockerfile : $DOCKERFILE" -echo "Image : $IMAGE" -echo "" - -echo "==> Building image (this takes a few minutes — Rust compile from scratch)..." -docker build \ - --file "$DOCKERFILE" \ - --tag "$IMAGE" \ - --progress=plain \ - "$REPO_ROOT" - -echo "==> Pushing $IMAGE..." -docker push "$IMAGE" - -echo "" -echo "Done. Update k8s/plano.yaml:" -echo " image: $IMAGE" -echo "" -echo "Then deploy with: ./deploy.sh" diff --git a/demos/llm_routing/session_affinity_redis_k8s/config_k8s.yaml b/demos/llm_routing/session_affinity_redis_k8s/config_k8s.yaml deleted file mode 100644 index da142762..00000000 --- a/demos/llm_routing/session_affinity_redis_k8s/config_k8s.yaml +++ /dev/null @@ -1,38 +0,0 @@ -version: v0.4.0 - -listeners: - - type: model - name: model_listener - port: 12000 - -model_providers: - - model: openai/gpt-4o-mini - access_key: $OPENAI_API_KEY - default: true - - - model: openai/gpt-5.2 - access_key: $OPENAI_API_KEY - -routing_preferences: - - name: fast_responses - description: short factual questions, quick lookups, simple summarization, or greetings - models: - - openai/gpt-4o-mini - - - name: deep_reasoning - description: multi-step reasoning, complex analysis, code review, or detailed explanations - models: - - openai/gpt-5.2 - - openai/gpt-4o-mini - -# Redis is reachable inside the cluster via the service name. -routing: - session_ttl_seconds: 600 - session_cache: - type: redis - url: redis://redis.plano-demo.svc.cluster.local:6379 - -tracing: - random_sampling: 100 - trace_arch_internal: true - opentracing_grpc_endpoint: http://jaeger.plano-demo.svc.cluster.local:4317 diff --git a/demos/llm_routing/session_affinity_redis_k8s/deploy.sh b/demos/llm_routing/session_affinity_redis_k8s/deploy.sh deleted file mode 100755 index 2a3ae295..00000000 --- a/demos/llm_routing/session_affinity_redis_k8s/deploy.sh +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env bash -# deploy.sh — Apply all Kubernetes manifests in the correct order. -# -# Usage: -# ./deploy.sh # deploy everything -# ./deploy.sh --destroy # tear down everything (keeps namespace for safety) -# ./deploy.sh --status # show pod and service status - -set -euo pipefail - -DEMO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -K8S_DIR="$DEMO_DIR/k8s" -NS="plano-demo" - -check_prereqs() { - local missing=() - command -v kubectl >/dev/null 2>&1 || missing+=("kubectl") - if [ ${#missing[@]} -gt 0 ]; then - echo "ERROR: missing required tools: ${missing[*]}" - exit 1 - fi - - if ! kubectl cluster-info &>/dev/null; then - echo "ERROR: no Kubernetes cluster reachable. Start minikube/kind or configure kubeconfig." - exit 1 - fi -} - -create_secret() { - if kubectl get secret plano-secrets -n "$NS" &>/dev/null; then - echo " Secret plano-secrets already exists, skipping." - return - fi - - local openai_api_key="${OPENAI_API_KEY:-}" - if [ -z "$openai_api_key" ]; then - echo "" - echo "No 'plano-secrets' secret found in namespace '$NS'." - echo "Enter API keys (input is hidden):" - echo "" - read -r -s -p " OPENAI_API_KEY: " openai_api_key - echo "" - else - echo " Using OPENAI_API_KEY from environment." - fi - - if [ -z "$openai_api_key" ]; then - echo "ERROR: OPENAI_API_KEY cannot be empty." - exit 1 - fi - - kubectl create secret generic plano-secrets \ - --from-literal=OPENAI_API_KEY="$openai_api_key" \ - -n "$NS" - - echo " Secret created." -} - -deploy() { - echo "==> Applying namespace..." - kubectl apply -f "$K8S_DIR/namespace.yaml" - - echo "==> Creating API key secret..." - create_secret - - echo "==> Applying Redis (StatefulSet + Services)..." - kubectl apply -f "$K8S_DIR/redis.yaml" - - echo "==> Applying Jaeger..." - kubectl apply -f "$K8S_DIR/jaeger.yaml" - - echo "==> Applying Plano config (ConfigMap)..." - kubectl apply -f "$K8S_DIR/plano-config.yaml" - - echo "==> Applying Plano deployment + HPA..." - kubectl apply -f "$K8S_DIR/plano.yaml" - - echo "" - echo "==> Waiting for Redis to be ready..." - kubectl rollout status statefulset/redis -n "$NS" --timeout=120s - - echo "==> Waiting for Plano pods to be ready..." - kubectl rollout status deployment/plano -n "$NS" --timeout=120s - - echo "" - echo "Deployment complete!" - show_status - echo "" - echo "Useful commands:" - echo " # Tail logs from all Plano pods:" - echo " kubectl logs -l app=plano -n $NS -f" - echo "" - echo " # Open Jaeger UI:" - echo " kubectl port-forward svc/jaeger 16686:16686 -n $NS &" - echo " open http://localhost:16686" - echo "" - echo " # Access Redis CLI:" - echo " kubectl exec -it redis-0 -n $NS -- redis-cli" - echo "" - echo " # Run the verification script:" - echo " python $DEMO_DIR/verify_affinity.py" -} - -destroy() { - echo "==> Deleting Plano, Jaeger, and Redis resources..." - kubectl delete -f "$K8S_DIR/plano.yaml" --ignore-not-found - kubectl delete -f "$K8S_DIR/jaeger.yaml" --ignore-not-found - kubectl delete -f "$K8S_DIR/redis.yaml" --ignore-not-found - kubectl delete -f "$K8S_DIR/plano-config.yaml" --ignore-not-found - kubectl delete secret plano-secrets -n "$NS" --ignore-not-found - - echo "" - echo "Resources deleted." - echo "Namespace '$NS' was kept. Remove it manually if desired:" - echo " kubectl delete namespace $NS" -} - -show_status() { - echo "" - echo "=== Pods ===" - kubectl get pods -n "$NS" -o wide - echo "" - echo "=== Services ===" - kubectl get svc -n "$NS" - echo "" - echo "=== HPA ===" - kubectl get hpa -n "$NS" 2>/dev/null || true -} - -check_prereqs - -case "${1:-}" in - --destroy) destroy ;; - --status) show_status ;; - *) deploy ;; -esac diff --git a/demos/llm_routing/session_affinity_redis_k8s/k8s/jaeger.yaml b/demos/llm_routing/session_affinity_redis_k8s/k8s/jaeger.yaml deleted file mode 100644 index 7d5c2d62..00000000 --- a/demos/llm_routing/session_affinity_redis_k8s/k8s/jaeger.yaml +++ /dev/null @@ -1,56 +0,0 @@ ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: jaeger - namespace: plano-demo - labels: - app: jaeger -spec: - replicas: 1 - selector: - matchLabels: - app: jaeger - template: - metadata: - labels: - app: jaeger - spec: - containers: - - name: jaeger - image: jaegertracing/jaeger:2.3.0 - ports: - - containerPort: 16686 # UI - - containerPort: 4317 # OTLP gRPC - - containerPort: 4318 # OTLP HTTP - resources: - requests: - memory: "128Mi" - cpu: "100m" - limits: - memory: "512Mi" - cpu: "500m" ---- -apiVersion: v1 -kind: Service -metadata: - name: jaeger - namespace: plano-demo - labels: - app: jaeger -spec: - selector: - app: jaeger - ports: - - name: ui - port: 16686 - targetPort: 16686 - - name: otlp-grpc - port: 4317 - targetPort: 4317 - - name: otlp-http - port: 4318 - targetPort: 4318 ---- -# NodePort for UI access from your laptop. -# Access at: http://localhost:16686 after: kubectl port-forward svc/jaeger 16686:16686 -n plano-demo diff --git a/demos/llm_routing/session_affinity_redis_k8s/k8s/namespace.yaml b/demos/llm_routing/session_affinity_redis_k8s/k8s/namespace.yaml deleted file mode 100644 index 4992eaa4..00000000 --- a/demos/llm_routing/session_affinity_redis_k8s/k8s/namespace.yaml +++ /dev/null @@ -1,6 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: plano-demo - labels: - app.kubernetes.io/part-of: plano-session-affinity-demo diff --git a/demos/llm_routing/session_affinity_redis_k8s/k8s/plano-config.yaml b/demos/llm_routing/session_affinity_redis_k8s/k8s/plano-config.yaml deleted file mode 100644 index 6c3d5339..00000000 --- a/demos/llm_routing/session_affinity_redis_k8s/k8s/plano-config.yaml +++ /dev/null @@ -1,50 +0,0 @@ ---- -# ConfigMap wrapping the Plano config file. -# Regenerate after editing config_k8s.yaml: -# kubectl create configmap plano-config \ -# --from-file=plano_config.yaml=../config_k8s.yaml \ -# -n plano-demo --dry-run=client -o yaml | kubectl apply -f - -apiVersion: v1 -kind: ConfigMap -metadata: - name: plano-config - namespace: plano-demo -data: - plano_config.yaml: | - version: v0.4.0 - - listeners: - - type: model - name: model_listener - port: 12000 - - model_providers: - - model: openai/gpt-4o-mini - access_key: $OPENAI_API_KEY - default: true - - - model: openai/gpt-5.2 - access_key: $OPENAI_API_KEY - - routing_preferences: - - name: fast_responses - description: short factual questions, quick lookups, simple summarization, or greetings - models: - - openai/gpt-4o-mini - - - name: deep_reasoning - description: multi-step reasoning, complex analysis, code review, or detailed explanations - models: - - openai/gpt-5.2 - - openai/gpt-4o-mini - - routing: - session_ttl_seconds: 600 - session_cache: - type: redis - url: redis://redis.plano-demo.svc.cluster.local:6379 - - tracing: - random_sampling: 100 - trace_arch_internal: true - opentracing_grpc_endpoint: http://jaeger.plano-demo.svc.cluster.local:4317 diff --git a/demos/llm_routing/session_affinity_redis_k8s/k8s/plano-secrets.example.yaml b/demos/llm_routing/session_affinity_redis_k8s/k8s/plano-secrets.example.yaml deleted file mode 100644 index 32ab8681..00000000 --- a/demos/llm_routing/session_affinity_redis_k8s/k8s/plano-secrets.example.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# EXAMPLE — do NOT apply this file directly. -# Create the real secret with: -# -# kubectl create secret generic plano-secrets \ -# --from-literal=OPENAI_API_KEY=sk-... \ -# -n plano-demo -# -# Or use the deploy.sh script, which prompts for keys and creates the secret. -# -# If you use a secrets manager (AWS Secrets Manager, GCP Secret Manager, Vault) -# replace this with an ExternalSecret or a CSI driver volume mount instead. -apiVersion: v1 -kind: Secret -metadata: - name: plano-secrets - namespace: plano-demo -type: Opaque -stringData: - OPENAI_API_KEY: "sk-replace-me" diff --git a/demos/llm_routing/session_affinity_redis_k8s/k8s/plano.yaml b/demos/llm_routing/session_affinity_redis_k8s/k8s/plano.yaml deleted file mode 100644 index 624a58f3..00000000 --- a/demos/llm_routing/session_affinity_redis_k8s/k8s/plano.yaml +++ /dev/null @@ -1,130 +0,0 @@ ---- -# Plano Deployment — 2 replicas sharing one Redis instance. -# All replicas are stateless; routing state lives entirely in Redis. -apiVersion: apps/v1 -kind: Deployment -metadata: - name: plano - namespace: plano-demo - labels: - app: plano -spec: - replicas: 2 - selector: - matchLabels: - app: plano - template: - metadata: - labels: - app: plano - spec: - containers: - - name: plano - # Local dev: run-local.sh sets this to plano-redis:local and loads it - # into minikube/kind so no registry is needed. - # Production: replace with your registry image and use imagePullPolicy: Always. - image: plano-redis:local - imagePullPolicy: IfNotPresent - ports: - - containerPort: 12000 - name: llm-gateway - envFrom: - - secretRef: - name: plano-secrets - env: - - name: LOG_LEVEL - value: "info" - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - volumeMounts: - - name: plano-config - mountPath: /app/plano_config.yaml - subPath: plano_config.yaml - readOnly: true - readinessProbe: - httpGet: - path: /healthz - port: 12000 - initialDelaySeconds: 5 - periodSeconds: 10 - failureThreshold: 3 - livenessProbe: - httpGet: - path: /healthz - port: 12000 - initialDelaySeconds: 15 - periodSeconds: 30 - failureThreshold: 3 - resources: - requests: - memory: "512Mi" - cpu: "250m" - limits: - memory: "1Gi" - cpu: "1000m" - volumes: - - name: plano-config - configMap: - name: plano-config ---- -# LoadBalancer Service — exposes Plano externally. -# On minikube, run: minikube tunnel -# On kind, use NodePort instead (see comment below). -# On cloud providers (GKE, EKS, AKS), an external IP is assigned automatically. -apiVersion: v1 -kind: Service -metadata: - name: plano - namespace: plano-demo - labels: - app: plano -spec: - type: LoadBalancer - selector: - app: plano - ports: - - name: llm-gateway - port: 12000 - targetPort: 12000 ---- -# Uncomment and use instead of the LoadBalancer above when running on kind/minikube -# without tunnel: -# -# apiVersion: v1 -# kind: Service -# metadata: -# name: plano -# namespace: plano-demo -# spec: -# type: NodePort -# selector: -# app: plano -# ports: -# - name: llm-gateway -# port: 12000 -# targetPort: 12000 -# nodePort: 32000 ---- -# HorizontalPodAutoscaler — scales 2 to 5 replicas based on CPU. -# Demonstrates that new replicas join the existing session state seamlessly. -apiVersion: autoscaling/v2 -kind: HorizontalPodAutoscaler -metadata: - name: plano - namespace: plano-demo -spec: - scaleTargetRef: - apiVersion: apps/v1 - kind: Deployment - name: plano - minReplicas: 2 - maxReplicas: 5 - metrics: - - type: Resource - resource: - name: cpu - target: - type: Utilization - averageUtilization: 70 diff --git a/demos/llm_routing/session_affinity_redis_k8s/k8s/redis.yaml b/demos/llm_routing/session_affinity_redis_k8s/k8s/redis.yaml deleted file mode 100644 index c47ad09c..00000000 --- a/demos/llm_routing/session_affinity_redis_k8s/k8s/redis.yaml +++ /dev/null @@ -1,96 +0,0 @@ ---- -# Redis StatefulSet — single-shard, persistence enabled. -# For production, replace with Redis Cluster or a managed service (ElastiCache, MemoryStore, etc.). -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: redis - namespace: plano-demo - labels: - app: redis -spec: - serviceName: redis - replicas: 1 - selector: - matchLabels: - app: redis - template: - metadata: - labels: - app: redis - spec: - containers: - - name: redis - image: redis:7-alpine - ports: - - containerPort: 6379 - name: redis - command: - - redis-server - - --appendonly - - "yes" - - --maxmemory - - "256mb" - - --maxmemory-policy - - allkeys-lru - readinessProbe: - exec: - command: ["redis-cli", "ping"] - initialDelaySeconds: 5 - periodSeconds: 5 - livenessProbe: - exec: - command: ["redis-cli", "ping"] - initialDelaySeconds: 15 - periodSeconds: 20 - resources: - requests: - memory: "64Mi" - cpu: "100m" - limits: - memory: "320Mi" - cpu: "500m" - volumeMounts: - - name: redis-data - mountPath: /data - volumeClaimTemplates: - - metadata: - name: redis-data - spec: - accessModes: ["ReadWriteOnce"] - resources: - requests: - storage: 1Gi ---- -# Stable DNS name: redis.plano-demo.svc.cluster.local:6379 -apiVersion: v1 -kind: Service -metadata: - name: redis - namespace: plano-demo - labels: - app: redis -spec: - selector: - app: redis - ports: - - name: redis - port: 6379 - targetPort: 6379 - clusterIP: None # headless — StatefulSet pods get stable DNS ---- -# Regular ClusterIP for application code (redis://redis:6379) -apiVersion: v1 -kind: Service -metadata: - name: redis-service - namespace: plano-demo - labels: - app: redis -spec: - selector: - app: redis - ports: - - name: redis - port: 6379 - targetPort: 6379 diff --git a/demos/llm_routing/session_affinity_redis_k8s/run-local.sh b/demos/llm_routing/session_affinity_redis_k8s/run-local.sh deleted file mode 100755 index 80808a88..00000000 --- a/demos/llm_routing/session_affinity_redis_k8s/run-local.sh +++ /dev/null @@ -1,154 +0,0 @@ -#!/usr/bin/env bash -# run-local.sh — Build and run the k8s session affinity demo entirely locally with kind. -# No registry, no image push required. -# -# Usage: -# ./run-local.sh # create cluster (if needed), build, deploy, verify -# ./run-local.sh --build-only # build and load the image into kind -# ./run-local.sh --deploy-only # skip build, re-apply k8s manifests -# ./run-local.sh --verify # run verify_affinity.py against the running cluster -# ./run-local.sh --down # tear down k8s resources (keeps kind cluster) -# ./run-local.sh --delete-cluster # also delete the kind cluster - -set -euo pipefail - -DEMO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "$DEMO_DIR/../../.." && pwd)" -IMAGE_NAME="plano-redis:local" -KIND_CLUSTER="plano-demo" - -# --------------------------------------------------------------------------- -# Prereq check -# --------------------------------------------------------------------------- - -check_prereqs() { - local missing=() - command -v docker >/dev/null 2>&1 || missing+=("docker") - command -v kubectl >/dev/null 2>&1 || missing+=("kubectl") - command -v kind >/dev/null 2>&1 || missing+=("kind (https://kind.sigs.k8s.io/docs/user/quick-start/#installation)") - command -v python3 >/dev/null 2>&1 || missing+=("python3") - - if [ ${#missing[@]} -gt 0 ]; then - echo "ERROR: missing required tools:" - for t in "${missing[@]}"; do echo " - $t"; done - exit 1 - fi -} - -load_env() { - if [ -f "$DEMO_DIR/.env" ]; then - set -a - # shellcheck disable=SC1091 - source "$DEMO_DIR/.env" - set +a - fi -} - -# --------------------------------------------------------------------------- -# Cluster lifecycle -# --------------------------------------------------------------------------- - -ensure_cluster() { - if kind get clusters 2>/dev/null | grep -q "^${KIND_CLUSTER}$"; then - echo "==> kind cluster '$KIND_CLUSTER' already exists, reusing." - else - echo "==> Creating kind cluster '$KIND_CLUSTER'..." - kind create cluster --name "$KIND_CLUSTER" - echo " Cluster created." - fi - - # Point kubectl at this cluster - kubectl config use-context "kind-${KIND_CLUSTER}" >/dev/null -} - -# --------------------------------------------------------------------------- -# Build and load -# --------------------------------------------------------------------------- - -build() { - echo "==> Building image '$IMAGE_NAME' from repo root..." - docker build \ - --file "$DEMO_DIR/Dockerfile" \ - --tag "$IMAGE_NAME" \ - --progress=plain \ - "$REPO_ROOT" - - echo "==> Loading '$IMAGE_NAME' into kind cluster '$KIND_CLUSTER'..." - kind load docker-image "$IMAGE_NAME" --name "$KIND_CLUSTER" - echo " Image loaded." -} - -# --------------------------------------------------------------------------- -# Deploy / verify / teardown -# --------------------------------------------------------------------------- - -deploy() { - echo "" - echo "==> Deploying to Kubernetes..." - "$DEMO_DIR/deploy.sh" -} - -verify() { - echo "" - echo "==> Running cross-replica verification..." - python3 "$DEMO_DIR/verify_affinity.py" -} - -down() { - "$DEMO_DIR/deploy.sh" --destroy -} - -delete_cluster() { - echo "==> Deleting kind cluster '$KIND_CLUSTER'..." - kind delete cluster --name "$KIND_CLUSTER" - echo " Cluster deleted." -} - -# --------------------------------------------------------------------------- -# Main -# --------------------------------------------------------------------------- - -case "${1:-}" in - --build-only) - check_prereqs - load_env - ensure_cluster - build - ;; - --deploy-only) - check_prereqs - load_env - ensure_cluster - deploy - ;; - --verify) - check_prereqs - verify - ;; - --down) - check_prereqs - down - ;; - --delete-cluster) - check_prereqs - down - delete_cluster - ;; - "") - check_prereqs - load_env - ensure_cluster - echo "" - build - deploy - echo "" - echo "==> Everything is up. Running verification in 5 seconds..." - echo " (Ctrl-C to skip — run manually with: ./run-local.sh --verify)" - sleep 5 - verify - ;; - *) - echo "Usage: $0 [--build-only | --deploy-only | --verify | --down | --delete-cluster]" - exit 1 - ;; -esac diff --git a/demos/llm_routing/session_affinity_redis_k8s/verify_affinity.py b/demos/llm_routing/session_affinity_redis_k8s/verify_affinity.py deleted file mode 100644 index 4c368ed7..00000000 --- a/demos/llm_routing/session_affinity_redis_k8s/verify_affinity.py +++ /dev/null @@ -1,418 +0,0 @@ -#!/usr/bin/env python3 -""" -verify_affinity.py — Prove that Redis-backed session affinity works across Plano replicas. - -Strategy --------- -Kubernetes round-robin is non-deterministic, so simply hammering the LoadBalancer -service is not a reliable proof. Instead this script: - - 1. Discovers the two (or more) Plano pod names with kubectl. - 2. Opens a kubectl port-forward tunnel to EACH pod on a separate local port. - 3. Pins a session via Pod 0 (writes the Redis key). - 4. Reads the same session via Pod 1 (must return the same model — reads Redis). - 5. Repeats across N sessions, round-robining which pod sets vs. reads the pin. - -If every round returns the same model, Redis is the shared source of truth and -multi-replica affinity is proven. - -Usage ------ - # From inside the cluster network (e.g. CI job or jumpbox): - python verify_affinity.py --url http://:12000 - - # From your laptop (uses kubectl port-forward automatically): - python verify_affinity.py - - # More sessions / rounds: - python verify_affinity.py --sessions 5 --rounds 6 - -Requirements ------------- - kubectl — configured to reach the plano-demo namespace - Python 3.11+ -""" - -import argparse -import http.client -import json -import signal -import subprocess -import sys -import time -import urllib.error -import urllib.request -from contextlib import contextmanager - -NAMESPACE = "plano-demo" -BASE_LOCAL_PORT = 19100 # port-forward starts here, increments per pod - -PROMPTS = [ - "Explain the difference between TCP and UDP in detail.", - "Write a merge sort implementation in Python.", - "What is quantum entanglement?", - "Describe the CAP theorem with examples.", - "How does gradient descent work in neural networks?", - "What is the time complexity of Dijkstra's algorithm?", -] - - -# --------------------------------------------------------------------------- -# kubectl helpers -# --------------------------------------------------------------------------- - - -def get_pod_names() -> list[str]: - """Return running Plano pod names in the plano-demo namespace.""" - result = subprocess.run( - [ - "kubectl", - "get", - "pods", - "-n", - NAMESPACE, - "-l", - "app=plano", - "--field-selector=status.phase=Running", - "-o", - "jsonpath={.items[*].metadata.name}", - ], - capture_output=True, - text=True, - check=True, - ) - pods = result.stdout.strip().split() - if not pods or pods == [""]: - raise RuntimeError( - f"No running Plano pods found in namespace '{NAMESPACE}'.\n" - "Is the cluster deployed? Run: ./deploy.sh" - ) - return pods - - -@contextmanager -def port_forward(pod_name: str, local_port: int, remote_port: int = 12000): - """Context manager that starts and stops a kubectl port-forward.""" - proc = subprocess.Popen( - [ - "kubectl", - "port-forward", - f"pod/{pod_name}", - f"{local_port}:{remote_port}", - "-n", - NAMESPACE, - ], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - # Give the tunnel a moment to establish - time.sleep(1.5) - try: - yield f"http://localhost:{local_port}" - finally: - proc.send_signal(signal.SIGTERM) - try: - proc.wait(timeout=3) - except subprocess.TimeoutExpired: - proc.kill() - - -# --------------------------------------------------------------------------- -# HTTP helpers -# --------------------------------------------------------------------------- - - -def chat( - base_url: str, - session_id: str | None, - message: str, - model: str = "openai/gpt-4o-mini", - retries: int = 3, - retry_delay: float = 5.0, -) -> dict: - payload = json.dumps( - { - "model": model, - "messages": [{"role": "user", "content": message}], - } - ).encode() - - headers = {"Content-Type": "application/json"} - if session_id: - headers["x-model-affinity"] = session_id - - req = urllib.request.Request( - f"{base_url}/v1/chat/completions", - data=payload, - headers=headers, - method="POST", - ) - last_err: Exception | None = None - for attempt in range(retries): - try: - with urllib.request.urlopen(req, timeout=60) as resp: - body = resp.read() - if not body: - raise RuntimeError(f"Empty response body from {base_url}") - return json.loads(body) - except urllib.error.HTTPError as e: - if e.code in (503, 502, 429) and attempt < retries - 1: - time.sleep(retry_delay * (attempt + 1)) - last_err = e - continue - raise RuntimeError(f"Request to {base_url} failed: {e}") from e - except ( - urllib.error.URLError, - http.client.RemoteDisconnected, - RuntimeError, - ) as e: - if attempt < retries - 1: - time.sleep(retry_delay * (attempt + 1)) - last_err = e - continue - raise RuntimeError(f"Request to {base_url} failed: {e}") from e - except json.JSONDecodeError as e: - raise RuntimeError(f"Invalid JSON from {base_url}: {e}") from e - raise RuntimeError( - f"Request to {base_url} failed after {retries} attempts: {last_err}" - ) - - -def extract_model(response: dict) -> str: - return response.get("model", "") - - -# --------------------------------------------------------------------------- -# Verification phases -# --------------------------------------------------------------------------- - - -def phase_loadbalancer(url: str, rounds: int) -> None: - """Phase 0: quick smoke test against the LoadBalancer / provided URL.""" - print("=" * 66) - print(f"Phase 0: Smoke test against {url}") - print("=" * 66) - for i in range(rounds): - resp = chat(url, None, PROMPTS[i % len(PROMPTS)]) - print(f" Request {i + 1}: model = {extract_model(resp)}") - print() - - -def phase_cross_replica( - pod_urls: dict[str, str], num_sessions: int, rounds: int -) -> bool: - """ - Phase 1 — Cross-replica pinning. - - For each session: - • Round 1: send to pod_A (sets the Redis key) - • Rounds 2+: alternate between pod_A and pod_B - • Assert every round returns the same model. - """ - pod_names = list(pod_urls.keys()) - all_passed = True - session_results: dict[str, dict] = {} - - print("=" * 66) - print("Phase 1: Cross-replica session pinning") - print(f" Pods under test : {', '.join(pod_names)}") - print(f" Sessions : {num_sessions}") - print(f" Rounds/session : {rounds}") - print() - print(" Each session is PINNED via one pod and VERIFIED via another.") - print(" If Redis is shared, every round must return the same model.") - print("=" * 66) - - for s in range(num_sessions): - session_id = f"k8s-session-{s + 1:03d}" - models_seen = [] - pod_sequence = [] - - for r in range(rounds): - # Alternate which pod handles each round - pod_name = pod_names[r % len(pod_names)] - url = pod_urls[pod_name] - - try: - resp = chat(url, session_id, PROMPTS[(s + r) % len(PROMPTS)]) - model = extract_model(resp) - except RuntimeError as e: - print(f" ERROR on {pod_name} round {r + 1}: {e}") - all_passed = False - continue - - models_seen.append(model) - pod_sequence.append(pod_name) - - unique_models = set(models_seen) - passed = len(unique_models) == 1 - - session_results[session_id] = { - "passed": passed, - "model": models_seen[0] if models_seen else "", - "unique_models": unique_models, - "pod_sequence": pod_sequence, - } - - status = "PASS" if passed else "FAIL" - detail = models_seen[0] if passed else str(unique_models) - print(f"\n {status} {session_id}") - print(f" model : {detail}") - print(f" pod order : {' → '.join(pod_sequence)}") - - if not passed: - all_passed = False - - return all_passed - - -def phase_redis_inspect(num_sessions: int) -> None: - """Phase 2: read keys directly from Redis to show what's stored.""" - print() - print("=" * 66) - print("Phase 2: Redis key inspection") - print("=" * 66) - for s in range(num_sessions): - session_id = f"k8s-session-{s + 1:03d}" - result = subprocess.run( - [ - "kubectl", - "exec", - "-n", - NAMESPACE, - "redis-0", - "--", - "redis-cli", - "GET", - session_id, - ], - capture_output=True, - text=True, - ) - raw = result.stdout.strip() - ttl_result = subprocess.run( - [ - "kubectl", - "exec", - "-n", - NAMESPACE, - "redis-0", - "--", - "redis-cli", - "TTL", - session_id, - ], - capture_output=True, - text=True, - ) - ttl = ttl_result.stdout.strip() - - if raw and raw != "(nil)": - try: - data = json.loads(raw) - print(f" {session_id}") - print(f" model_name : {data.get('model_name', '?')}") - print(f" route_name : {data.get('route_name', 'null')}") - print(f" TTL : {ttl}s remaining") - except json.JSONDecodeError: - print(f" {session_id}: (raw) {raw}") - else: - print(f" {session_id}: key not found or expired") - - -# --------------------------------------------------------------------------- -# Main -# --------------------------------------------------------------------------- - - -def main() -> None: - parser = argparse.ArgumentParser( - description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter - ) - parser.add_argument( - "--url", - default=None, - help="LoadBalancer URL to use instead of per-pod port-forwards. " - "When set, cross-replica proof is skipped (no pod targeting).", - ) - parser.add_argument( - "--sessions", type=int, default=4, help="Number of sessions (default 4)" - ) - parser.add_argument( - "--rounds", type=int, default=4, help="Rounds per session (default 4)" - ) - parser.add_argument( - "--skip-redis-inspect", action="store_true", help="Skip Redis key inspection" - ) - args = parser.parse_args() - - if args.url: - # Simple mode: hit the LoadBalancer directly - print(f"Mode: LoadBalancer ({args.url})") - print() - phase_loadbalancer(args.url, args.rounds) - print("To get the full cross-replica proof, run without --url.") - sys.exit(0) - - # Full mode: port-forward to each pod individually - print("Mode: per-pod port-forward (full cross-replica proof)") - print() - - try: - pod_names = get_pod_names() - except (subprocess.CalledProcessError, RuntimeError) as e: - print(f"ERROR: {e}", file=sys.stderr) - sys.exit(1) - - if len(pod_names) < 2: - print(f"WARNING: only {len(pod_names)} Plano pod(s) running.") - print(" For a true cross-replica test you need at least 2.") - print(" Scale up: kubectl scale deployment/plano --replicas=2 -n plano-demo") - print() - - print(f"Found {len(pod_names)} Plano pod(s): {', '.join(pod_names)}") - print("Opening per-pod port-forward tunnels...") - print() - - pod_urls: dict[str, str] = {} - contexts = [] - - for i, pod in enumerate(pod_names): - local_port = BASE_LOCAL_PORT + i - ctx = port_forward(pod, local_port) - url = ctx.__enter__() - pod_urls[pod] = url - contexts.append((ctx, url)) - print(f" {pod} → localhost:{local_port}") - - print() - - try: - passed = phase_cross_replica(pod_urls, args.sessions, args.rounds) - - if not args.skip_redis_inspect: - phase_redis_inspect(args.sessions) - - print() - print("=" * 66) - print("Summary") - print("=" * 66) - if passed: - print("All sessions were pinned consistently across replicas.") - print("Redis session cache is working correctly in Kubernetes.") - else: - print("One or more sessions were NOT consistent across replicas.") - print("Check brightstaff logs: kubectl logs -l app=plano -n plano-demo") - - finally: - for ctx, _ in contexts: - try: - ctx.__exit__(None, None, None) - except Exception: - pass - - sys.exit(0 if passed else 1) - - -if __name__ == "__main__": - main()