diff --git a/demos/llm_routing/session_affinity_redis/.env.example b/demos/llm_routing/session_affinity_redis/.env.example new file mode 100644 index 00000000..f045319d --- /dev/null +++ b/demos/llm_routing/session_affinity_redis/.env.example @@ -0,0 +1 @@ +OPENAI_API_KEY=sk-replace-me diff --git a/demos/llm_routing/session_affinity_redis/README.md b/demos/llm_routing/session_affinity_redis/README.md new file mode 100644 index 00000000..d74cf35b --- /dev/null +++ b/demos/llm_routing/session_affinity_redis/README.md @@ -0,0 +1,247 @@ +# Session Affinity with Redis — Multi-Replica Model Pinning + +This demo shows Plano's **session affinity** (`X-Model-Affinity` header) backed by a **Redis session cache** instead of the default in-memory store. + +## The Problem + +By default, model affinity stores routing decisions in a per-process `HashMap`. +This works for single-instance deployments, but breaks when you run multiple +Plano replicas behind a load balancer: + +``` +Client ──► Load Balancer ──► Replica A (session pinned here) + └──► Replica B (knows nothing about the session) +``` + +A request that was pinned to `gpt-4o` on Replica A will be re-routed from +scratch on Replica B, defeating the purpose of affinity. + +## The Solution + +Plano's `session_cache` config key accepts a `type: redis` backend that is +shared across all replicas: + +```yaml +routing: + session_ttl_seconds: 300 + session_cache: + type: redis + url: redis://localhost:6379 +``` + +All replicas read and write the same Redis keyspace. A session pinned on any +replica is immediately visible to all others. + +## What to Look For + +| What | Expected behaviour | +|------|--------------------| +| First request with a session ID | Plano routes normally (via Arch-Router) and writes the result to Redis (`SET session-id ... EX 300`) | +| Subsequent requests with the **same** session ID | Plano reads from Redis and skips the router — same model every time | +| Requests with a **different** session ID | Routed independently; may land on a different model | +| After `session_ttl_seconds` elapses | Redis key expires; next request re-routes and sets a new pin | +| `x-plano-pinned: true` response header | Tells you the response was served from the session cache | + +## Architecture + +``` +Client + │ X-Model-Affinity: my-session-id + ▼ +Plano (brightstaff) + ├── GET redis://localhost:6379/my-session-id + │ hit? → return pinned model immediately (no Arch-Router call) + │ miss? → call Arch-Router → SET key EX 300 → return routed model + ▼ +Redis (shared across replicas) +``` + +## Prerequisites + +| Requirement | Notes | +|-------------|-------| +| `planoai` CLI | `pip install planoai` | +| Docker + Docker Compose | For Redis and Jaeger | +| `OPENAI_API_KEY` | Required for routing model (Arch-Router) and downstream LLMs | +| Python 3.11+ | Only needed to run `verify_affinity.py` | + +## Quick Start + +```bash +# 1. Set your API key +export OPENAI_API_KEY=sk-... +# or copy and edit: +cp .env.example .env + +# 2. Start Redis, Jaeger, and Plano +./run_demo.sh up + +# 3. 
Verify session pinning works +python verify_affinity.py +``` + +## Manual Verification with curl + +### Step 1 — Pin a session (first request sets the affinity) + +```bash +curl -s http://localhost:12000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "x-model-affinity: my-session-abc" \ + -d '{"model":"openai/gpt-4o-mini","messages":[{"role":"user","content":"Write a short poem about the ocean."}]}' \ + | jq '{model, pinned: .x_plano_pinned}' +``` + +Expected output (first request — not yet pinned, Arch-Router picks the model): + +```json +{ + "model": "openai/gpt-5.2", + "pinned": null +} +``` + +### Step 2 — Confirm the pin is held on subsequent requests + +```bash +for i in 1 2 3 4; do + curl -s http://localhost:12000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "x-model-affinity: my-session-abc" \ + -d "{\"model\":\"openai/gpt-4o-mini\",\"messages\":[{\"role\":\"user\",\"content\":\"Request $i\"}]}" \ + | jq -r '"\(.model)"' +done +``` + +Expected output (same model for every request): + +``` +openai/gpt-5.2 +openai/gpt-5.2 +openai/gpt-5.2 +openai/gpt-5.2 +``` + +### Step 3 — Inspect the Redis key directly + +```bash +docker exec plano-session-redis redis-cli \ + GET my-session-abc | python3 -m json.tool +``` + +Expected output: + +```json +{ + "model_name": "openai/gpt-5.2", + "route_name": "deep_reasoning" +} +``` + +```bash +# Check the TTL (seconds remaining) +docker exec plano-session-redis redis-cli TTL my-session-abc +# e.g. 287 +``` + +### Step 4 — Different sessions may get different models + +```bash +for session in session-A session-B session-C; do + model=$(curl -s http://localhost:12000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "x-model-affinity: $session" \ + -d '{"model":"openai/gpt-4o-mini","messages":[{"role":"user","content":"Explain quantum entanglement in detail with equations."}]}' \ + | jq -r '.model') + echo "$session -> $model" +done +``` + +Sessions with content matched to `deep_reasoning` will pin to `openai/gpt-5.2`; +sessions matched to `fast_responses` will pin to `openai/gpt-4o-mini`. + +## Verification Script Output + +Running `python verify_affinity.py` produces output like: + +``` +Plano endpoint : http://localhost:12000/v1/chat/completions +Sessions : 3 +Rounds/session : 4 + +============================================================ +Phase 1: Requests WITHOUT X-Model-Affinity header + (model may vary between requests — that is expected) +============================================================ + Request 1: model = openai/gpt-4o-mini + Request 2: model = openai/gpt-5.2 + Request 3: model = openai/gpt-4o-mini + Models seen across 3 requests: {'openai/gpt-4o-mini', 'openai/gpt-5.2'} + +============================================================ +Phase 2: Requests WITH X-Model-Affinity (session pinning) + Each session should be pinned to exactly one model. 
+============================================================ + + Session 'demo-session-001': + Round 1: model = openai/gpt-4o-mini [FIRST — sets affinity] + Round 2: model = openai/gpt-4o-mini [PINNED] + Round 3: model = openai/gpt-4o-mini [PINNED] + Round 4: model = openai/gpt-4o-mini [PINNED] + + Session 'demo-session-002': + Round 1: model = openai/gpt-5.2 [FIRST — sets affinity] + Round 2: model = openai/gpt-5.2 [PINNED] + Round 3: model = openai/gpt-5.2 [PINNED] + Round 4: model = openai/gpt-5.2 [PINNED] + + Session 'demo-session-003': + Round 1: model = openai/gpt-4o-mini [FIRST — sets affinity] + Round 2: model = openai/gpt-4o-mini [PINNED] + Round 3: model = openai/gpt-4o-mini [PINNED] + Round 4: model = openai/gpt-4o-mini [PINNED] + +============================================================ +Results +============================================================ + PASS demo-session-001 -> always routed to 'openai/gpt-4o-mini' + PASS demo-session-002 -> always routed to 'openai/gpt-5.2' + PASS demo-session-003 -> always routed to 'openai/gpt-4o-mini' + +All sessions were pinned consistently. +Redis session cache is working correctly. +``` + +## Observability + +Open Jaeger at **http://localhost:16686** and select service `plano`. + +- Requests **without** affinity: look for a span to the Arch-Router service +- Requests **with** affinity (pinned): the Arch-Router span will be absent — + the decision was served from Redis without calling the router at all + +This is the clearest observable signal that the cache is working: pinned +requests are noticeably faster and produce fewer spans. + +## Switching to the In-Memory Backend + +To compare against the default in-memory backend, change `config.yaml`: + +```yaml +routing: + session_ttl_seconds: 300 + session_cache: + type: memory # ← change this +``` + +In-memory mode does **not** require Redis and works identically for a +single Plano process. The difference only becomes visible when you run +multiple replicas. + +## Teardown + +```bash +./run_demo.sh down +``` + +This stops Plano, Redis, and Jaeger. 
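## Watching the Session Cache Live

For a closer look at what Plano writes to Redis while the demo is up, you can stream every command the Redis container receives with `MONITOR` (a standard `redis-cli` debugging command — fine for a demo, too noisy for production). The exact command sequence below is an assumption based on the Architecture section above, but you should see a read on every request and a write with `EX 300` only when a session is first pinned:

```bash
# Terminal 1 — stream all commands hitting the demo Redis container
docker exec -it plano-session-redis redis-cli MONITOR

# Terminal 2 — send a request with a fresh session ID and watch the GET/SET appear
curl -s http://localhost:12000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "x-model-affinity: monitor-demo" \
  -d '{"model":"openai/gpt-4o-mini","messages":[{"role":"user","content":"Hello"}]}' > /dev/null
```

Repeat the curl with the same session ID: the write disappears and only the read remains — the cache hit described in the Architecture section.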
diff --git a/demos/llm_routing/session_affinity_redis/config.yaml b/demos/llm_routing/session_affinity_redis/config.yaml new file mode 100644 index 00000000..bd413582 --- /dev/null +++ b/demos/llm_routing/session_affinity_redis/config.yaml @@ -0,0 +1,36 @@ +version: v0.4.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true + + - model: openai/gpt-5.2 + access_key: $OPENAI_API_KEY + +routing_preferences: + - name: fast_responses + description: short factual questions, quick lookups, simple summarization, or greetings + models: + - openai/gpt-4o-mini + + - name: deep_reasoning + description: multi-step reasoning, complex analysis, code review, or detailed explanations + models: + - openai/gpt-5.2 + - openai/gpt-4o-mini + +routing: + session_ttl_seconds: 300 + session_cache: + type: redis + url: redis://localhost:6379 + +tracing: + random_sampling: 100 + trace_arch_internal: true diff --git a/demos/llm_routing/session_affinity_redis/docker-compose.yaml b/demos/llm_routing/session_affinity_redis/docker-compose.yaml new file mode 100644 index 00000000..011fe6c9 --- /dev/null +++ b/demos/llm_routing/session_affinity_redis/docker-compose.yaml @@ -0,0 +1,23 @@ +services: + redis: + image: redis:7-alpine + container_name: plano-session-redis + restart: unless-stopped + ports: + - "6379:6379" + command: redis-server --save "" --appendonly no + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 1s + timeout: 1s + retries: 10 + + jaeger: + build: + context: ../../shared/jaeger + container_name: plano-session-jaeger + restart: unless-stopped + ports: + - "16686:16686" + - "4317:4317" + - "4318:4318" diff --git a/demos/llm_routing/session_affinity_redis/run_demo.sh b/demos/llm_routing/session_affinity_redis/run_demo.sh new file mode 100755 index 00000000..ca84d44d --- /dev/null +++ b/demos/llm_routing/session_affinity_redis/run_demo.sh @@ -0,0 +1,94 @@ +#!/usr/bin/env bash +set -euo pipefail + +DEMO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +load_env() { + if [ -f "$DEMO_DIR/.env" ]; then + set -a + # shellcheck disable=SC1091 + source "$DEMO_DIR/.env" + set +a + fi +} + +check_prereqs() { + local missing=() + command -v docker >/dev/null 2>&1 || missing+=("docker") + command -v planoai >/dev/null 2>&1 || missing+=("planoai (pip install planoai)") + if [ ${#missing[@]} -gt 0 ]; then + echo "ERROR: missing required tools: ${missing[*]}" + exit 1 + fi + + if [ -z "${OPENAI_API_KEY:-}" ]; then + echo "ERROR: OPENAI_API_KEY is not set." + echo " Create a .env file or export the variable before running." + exit 1 + fi +} + +start_demo() { + echo "==> Starting Redis + Jaeger..." + docker compose -f "$DEMO_DIR/docker-compose.yaml" up -d + + echo "==> Waiting for Redis to be ready..." + local retries=0 + until docker exec plano-session-redis redis-cli ping 2>/dev/null | grep -q PONG; do + retries=$((retries + 1)) + if [ $retries -ge 15 ]; then + echo "ERROR: Redis did not become ready in time" + exit 1 + fi + sleep 1 + done + echo " Redis is ready." + + echo "==> Starting Plano..." + planoai up "$DEMO_DIR/config.yaml" + + echo "" + echo "Demo is running!" 
+ echo "" + echo " Model endpoint: http://localhost:12000/v1/chat/completions" + echo " Jaeger UI: http://localhost:16686" + echo " Redis: localhost:6379" + echo "" + echo "Run the verification script to confirm session pinning:" + echo " python $DEMO_DIR/verify_affinity.py" + echo "" + echo "Stop the demo with: $0 down" +} + +stop_demo() { + echo "==> Stopping Plano..." + planoai down 2>/dev/null || true + + echo "==> Stopping Docker services..." + docker compose -f "$DEMO_DIR/docker-compose.yaml" down + + echo "Demo stopped." +} + +usage() { + echo "Usage: $0 [up|down]" + echo "" + echo " up Start Redis, Jaeger, and Plano (default)" + echo " down Stop all services" +} + +load_env + +case "${1:-up}" in + up) + check_prereqs + start_demo + ;; + down) + stop_demo + ;; + *) + usage + exit 1 + ;; +esac diff --git a/demos/llm_routing/session_affinity_redis/verify_affinity.py b/demos/llm_routing/session_affinity_redis/verify_affinity.py new file mode 100644 index 00000000..213c6fbe --- /dev/null +++ b/demos/llm_routing/session_affinity_redis/verify_affinity.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +""" +verify_affinity.py — Verify that model affinity (session pinning) works correctly. + +Sends multiple requests with the same X-Model-Affinity session ID and asserts +that every response is served by the same model, demonstrating that Plano's +session cache is working as expected. + +Usage: + python verify_affinity.py [--url URL] [--rounds N] [--sessions N] +""" + +import argparse +import json +import sys +import urllib.error +import urllib.request +from collections import defaultdict + +PLANO_URL = "http://localhost:12000/v1/chat/completions" + +PROMPTS = [ + "What is 2 + 2?", + "Name the capital of France.", + "How many days in a week?", + "What color is the sky?", + "Who wrote Romeo and Juliet?", +] + +MESSAGES_PER_SESSION = [{"role": "user", "content": prompt} for prompt in PROMPTS] + + +def chat(url: str, session_id: str | None, message: str) -> dict: + payload = json.dumps( + { + "model": "openai/gpt-4o-mini", + "messages": [{"role": "user", "content": message}], + } + ).encode() + + headers = {"Content-Type": "application/json"} + if session_id: + headers["x-model-affinity"] = session_id + + req = urllib.request.Request(url, data=payload, headers=headers, method="POST") + try: + with urllib.request.urlopen(req, timeout=30) as resp: + return json.loads(resp.read()) + except urllib.error.URLError as e: + print(f" ERROR: could not reach Plano at {url}: {e}", file=sys.stderr) + print(" Is the demo running? 
Start it with: ./run_demo.sh up", file=sys.stderr) + sys.exit(1) + + +def extract_model(response: dict) -> str: + return response.get("model", "") + + +def run_verification(url: str, rounds: int, num_sessions: int) -> bool: + print(f"Plano endpoint : {url}") + print(f"Sessions : {num_sessions}") + print(f"Rounds/session : {rounds}") + print() + + all_passed = True + + # --- Phase 1: Requests without session ID --- + print("=" * 60) + print("Phase 1: Requests WITHOUT X-Model-Affinity header") + print(" (model may vary between requests — that is expected)") + print("=" * 60) + models_seen: set[str] = set() + for i in range(min(rounds, 3)): + resp = chat(url, None, PROMPTS[i % len(PROMPTS)]) + model = extract_model(resp) + models_seen.add(model) + print(f" Request {i + 1}: model = {model}") + print(f" Models seen across {min(rounds, 3)} requests: {models_seen}") + print() + + # --- Phase 2: Each session should always get the same model --- + print("=" * 60) + print("Phase 2: Requests WITH X-Model-Affinity (session pinning)") + print(" Each session should be pinned to exactly one model.") + print("=" * 60) + + session_results: dict[str, list[str]] = defaultdict(list) + + for s in range(num_sessions): + session_id = f"demo-session-{s + 1:03d}" + print(f"\n Session '{session_id}':") + + for r in range(rounds): + resp = chat(url, session_id, PROMPTS[r % len(PROMPTS)]) + model = extract_model(resp) + session_results[session_id].append(model) + pinned = " [PINNED]" if r > 0 else " [FIRST — sets affinity]" + print(f" Round {r + 1}: model = {model}{pinned}") + + print() + print("=" * 60) + print("Results") + print("=" * 60) + + for session_id, models in session_results.items(): + unique_models = set(models) + if len(unique_models) == 1: + print(f" PASS {session_id} -> always routed to '{models[0]}'") + else: + print( + f" FAIL {session_id} -> inconsistent models across rounds: {unique_models}" + ) + all_passed = False + + print() + if all_passed: + print("All sessions were pinned consistently.") + print("Redis session cache is working correctly.") + else: + print("One or more sessions were NOT pinned consistently.") + print("Check that Redis is running and Plano is configured with:") + print(" routing:") + print(" session_cache:") + print(" type: redis") + print(" url: redis://localhost:6379") + + return all_passed + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--url", default=PLANO_URL, help="Plano chat completions URL") + parser.add_argument( + "--rounds", type=int, default=4, help="Requests per session (default 4)" + ) + parser.add_argument( + "--sessions", type=int, default=3, help="Number of sessions to test (default 3)" + ) + args = parser.parse_args() + + passed = run_verification(args.url, args.rounds, args.sessions) + sys.exit(0 if passed else 1) + + +if __name__ == "__main__": + main() diff --git a/demos/llm_routing/session_affinity_redis_k8s/.env.example b/demos/llm_routing/session_affinity_redis_k8s/.env.example new file mode 100644 index 00000000..f045319d --- /dev/null +++ b/demos/llm_routing/session_affinity_redis_k8s/.env.example @@ -0,0 +1 @@ +OPENAI_API_KEY=sk-replace-me diff --git a/demos/llm_routing/session_affinity_redis_k8s/Dockerfile b/demos/llm_routing/session_affinity_redis_k8s/Dockerfile new file mode 100644 index 00000000..877b2246 --- /dev/null +++ b/demos/llm_routing/session_affinity_redis_k8s/Dockerfile @@ -0,0 +1,95 @@ +# Plano image for Redis-backed session affinity demo. 
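# The final image bundles Envoy, the brightstaff router binary, both proxy-wasm
# plugins, and the planoai Python package, with supervisord as the entrypoint
# (see the build stages below).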
+# Build context must be the repository root: +# docker build -f demos/llm_routing/session_affinity_redis_k8s/Dockerfile -t . + +# Envoy version — keep in sync with cli/planoai/consts.py ENVOY_VERSION +ARG ENVOY_VERSION=v1.37.0 + +# --- Dependency cache --- +FROM rust:1.93.0 AS deps +RUN rustup -v target add wasm32-wasip1 +WORKDIR /arch + +COPY crates/Cargo.toml crates/Cargo.lock ./ +COPY crates/common/Cargo.toml common/Cargo.toml +COPY crates/hermesllm/Cargo.toml hermesllm/Cargo.toml +COPY crates/prompt_gateway/Cargo.toml prompt_gateway/Cargo.toml +COPY crates/llm_gateway/Cargo.toml llm_gateway/Cargo.toml +COPY crates/brightstaff/Cargo.toml brightstaff/Cargo.toml + +RUN mkdir -p common/src && echo "" > common/src/lib.rs && \ + mkdir -p hermesllm/src && echo "" > hermesllm/src/lib.rs && \ + mkdir -p hermesllm/src/bin && echo "fn main() {}" > hermesllm/src/bin/fetch_models.rs && \ + mkdir -p prompt_gateway/src && echo "#[no_mangle] pub fn _start() {}" > prompt_gateway/src/lib.rs && \ + mkdir -p llm_gateway/src && echo "#[no_mangle] pub fn _start() {}" > llm_gateway/src/lib.rs && \ + mkdir -p brightstaff/src && echo "fn main() {}" > brightstaff/src/main.rs && echo "" > brightstaff/src/lib.rs + +RUN cargo build --release --target wasm32-wasip1 -p prompt_gateway -p llm_gateway || true +RUN cargo build --release -p brightstaff || true + +# --- WASM plugins --- +FROM deps AS wasm-builder +RUN rm -rf common/src hermesllm/src prompt_gateway/src llm_gateway/src +COPY crates/common/src common/src +COPY crates/hermesllm/src hermesllm/src +COPY crates/prompt_gateway/src prompt_gateway/src +COPY crates/llm_gateway/src llm_gateway/src +RUN find common hermesllm prompt_gateway llm_gateway -name "*.rs" -exec touch {} + +RUN cargo build --release --target wasm32-wasip1 -p prompt_gateway -p llm_gateway + +# --- Brightstaff binary --- +FROM deps AS brightstaff-builder +RUN rm -rf common/src hermesllm/src brightstaff/src +COPY crates/common/src common/src +COPY crates/hermesllm/src hermesllm/src +COPY crates/brightstaff/src brightstaff/src +RUN find common hermesllm brightstaff -name "*.rs" -exec touch {} + +RUN cargo build --release -p brightstaff + +FROM docker.io/envoyproxy/envoy:${ENVOY_VERSION} AS envoy + +FROM python:3.14-slim AS arch + +RUN set -eux; \ + apt-get update; \ + apt-get upgrade -y; \ + apt-get install -y --no-install-recommends gettext-base curl procps; \ + apt-get clean; rm -rf /var/lib/apt/lists/* + +RUN pip install --no-cache-dir supervisor + +RUN set -eux; \ + dpkg -r --force-depends libpam-modules libpam-modules-bin libpam-runtime libpam0g || true; \ + dpkg -P --force-all libpam-modules libpam-modules-bin libpam-runtime libpam0g || true; \ + rm -rf /etc/pam.d /lib/*/security /usr/lib/security || true + +COPY --from=envoy /usr/local/bin/envoy /usr/local/bin/envoy + +WORKDIR /app + +RUN pip install --no-cache-dir uv + +COPY cli/pyproject.toml ./ +COPY cli/uv.lock ./ +COPY cli/README.md ./ +COPY config/plano_config_schema.yaml /config/plano_config_schema.yaml +COPY config/envoy.template.yaml /config/envoy.template.yaml + +RUN pip install --no-cache-dir -e . + +COPY cli/planoai planoai/ +COPY config/envoy.template.yaml . +COPY config/plano_config_schema.yaml . 
+RUN mkdir -p /etc/supervisor/conf.d +COPY config/supervisord.conf /etc/supervisor/conf.d/supervisord.conf + +COPY --from=wasm-builder /arch/target/wasm32-wasip1/release/prompt_gateway.wasm /etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm +COPY --from=wasm-builder /arch/target/wasm32-wasip1/release/llm_gateway.wasm /etc/envoy/proxy-wasm-plugins/llm_gateway.wasm +COPY --from=brightstaff-builder /arch/target/release/brightstaff /app/brightstaff + +RUN mkdir -p /var/log/supervisor && \ + touch /var/log/envoy.log /var/log/supervisor/supervisord.log \ + /var/log/access_ingress.log /var/log/access_ingress_prompt.log \ + /var/log/access_internal.log /var/log/access_llm.log /var/log/access_agent.log + +ENTRYPOINT ["/usr/local/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] diff --git a/demos/llm_routing/session_affinity_redis_k8s/README.md b/demos/llm_routing/session_affinity_redis_k8s/README.md new file mode 100644 index 00000000..57905bc7 --- /dev/null +++ b/demos/llm_routing/session_affinity_redis_k8s/README.md @@ -0,0 +1,287 @@ +# Session Affinity — Multi-Replica Kubernetes Deployment + +Production-style Kubernetes demo that proves Redis-backed session affinity +(`X-Model-Affinity`) works correctly when Plano runs as multiple replicas +behind a load balancer. + +## Architecture + +``` + ┌─────────────────────────────────────────┐ + │ Kubernetes Cluster │ + │ │ + Client ──────────►│ LoadBalancer Service (port 12000) │ + │ │ │ │ + │ ┌────▼────┐ ┌─────▼───┐ │ + │ │ Plano │ │ Plano │ (replicas) │ + │ │ Pod 0 │ │ Pod 1 │ │ + │ └────┬────┘ └────┬────┘ │ + │ └──────┬───────┘ │ + │ ┌────▼────┐ │ + │ │ Redis │ (StatefulSet) │ + │ │ Pod │ shared session store │ + │ └─────────┘ │ + │ │ + │ ┌──────────┐ │ + │ │ Jaeger │ distributed tracing │ + │ └──────────┘ │ + └─────────────────────────────────────────┘ +``` + +**What makes this production-like:** + +| Feature | Detail | +|---------|--------| +| 2 Plano replicas | `replicas: 2` with HPA (scales 2–5 on CPU) | +| Shared Redis | StatefulSet with PVC — sessions survive pod restarts | +| Session TTL | 600 s, enforced natively by Redis `EX` | +| Eviction policy | `allkeys-lru` — Redis auto-evicts oldest sessions under memory pressure | +| Distributed tracing | Jaeger collects spans from both pods | +| Health probes | Readiness + liveness gates traffic away from unhealthy pods | + +## Quick Start (local — no registry needed) + +```bash +# 1. Install kind if needed +# https://kind.sigs.k8s.io/docs/user/quick-start/#installation +# brew install kind (macOS) + +# 2. Set your API key +export OPENAI_API_KEY=sk-... +# or copy and edit: +cp .env.example .env + +# 3. Build, deploy, and verify in one command +./run-local.sh +``` + +`run-local.sh` creates a kind cluster named `plano-demo` (if it doesn't exist), +builds the image locally, loads it into the cluster with `kind load docker-image` +— **no registry, no push required**. 
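Roughly, the one-command flow is equivalent to the following sketch (paths simplified; the image tag and cluster name match what `run-local.sh` actually uses):

```bash
kind create cluster --name plano-demo                     # skipped if it already exists
docker build -f Dockerfile -t plano-redis:local ../../..  # build context = repo root
kind load docker-image plano-redis:local --name plano-demo
./deploy.sh                                               # namespace, secret, Redis, Jaeger, Plano
python3 verify_affinity.py                                # cross-replica verification
```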
+ +Individual steps: + +```bash +./run-local.sh --build-only # (re-)build and reload image into kind +./run-local.sh --deploy-only # (re-)apply k8s manifests +./run-local.sh --verify # run verify_affinity.py +./run-local.sh --down # delete k8s resources (keeps kind cluster) +./run-local.sh --delete-cluster # delete k8s resources + kind cluster +``` + +--- + +## Prerequisites + +| Tool | Notes | +|------|-------| +| `kubectl` | Configured to reach a Kubernetes cluster | +| `docker` | To build and push the custom image | +| Container registry (optional) | Needed only when you are not using the local kind flow | +| `OPENAI_API_KEY` | For model inference | +| Python 3.11+ | Only for `verify_affinity.py` | + +**Cluster:** `run-local.sh` creates and manages a kind cluster named `plano-demo` automatically. Install kind from https://kind.sigs.k8s.io or `brew install kind`. + +## Step 1 — Build the Image + +Build a custom image from the repo root: + +```bash +# From this demo directory: +./build-and-push.sh ghcr.io/yourorg/plano-redis:latest + +# Or manually from the repo root: +docker build \ + -f demos/llm_routing/session_affinity_redis_k8s/Dockerfile \ + -t ghcr.io/yourorg/plano-redis:latest \ + . +docker push ghcr.io/yourorg/plano-redis:latest +``` + +Then update the image reference in `k8s/plano.yaml` (skip this when using `run-local.sh`, which uses `plano-redis:local` automatically): + +```yaml +image: ghcr.io/yourorg/plano-redis:latest # ← replace YOUR_REGISTRY/plano-redis:latest +``` + +## Step 2 — Deploy + +```bash +./deploy.sh +``` + +The script: +1. Creates the `plano-demo` namespace +2. Prompts for `OPENAI_API_KEY` and creates a Kubernetes Secret +3. Applies Redis, Jaeger, ConfigMap, and Plano manifests in order +4. Waits for rollouts to complete + +Expected output: + +``` +==> Applying namespace... +==> Creating API key secret... + OPENAI_API_KEY: [hidden] +==> Applying Redis (StatefulSet + Services)... +==> Applying Jaeger... +==> Applying Plano config (ConfigMap)... +==> Applying Plano deployment + HPA... +==> Waiting for Redis to be ready... +==> Waiting for Plano pods to be ready... + +Deployment complete! + +=== Pods === +NAME READY STATUS NODE +redis-0 1/1 Running node-1 +plano-6d8f9b-xk2pq 1/1 Running node-1 +plano-6d8f9b-r7nlw 1/1 Running node-2 +jaeger-5c7d8f-q9mnb 1/1 Running node-1 + +=== Services === +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) +plano LoadBalancer 10.96.12.50 203.0.113.42 12000:32000/TCP +redis ClusterIP None 6379/TCP +jaeger ClusterIP 10.96.8.71 16686/TCP,... +``` + +## Step 3 — Verify Session Affinity Across Replicas + +```bash +python verify_affinity.py +``` + +The script opens a dedicated `kubectl port-forward` tunnel to **each pod +individually**. This is the definitive test: it routes requests to specific +pods rather than relying on random load-balancer assignment. + +``` +Mode: per-pod port-forward (full cross-replica proof) + +Found 2 Plano pod(s): plano-6d8f9b-xk2pq, plano-6d8f9b-r7nlw +Opening per-pod port-forward tunnels... + + plano-6d8f9b-xk2pq → localhost:19100 + plano-6d8f9b-r7nlw → localhost:19101 + +================================================================== +Phase 1: Cross-replica session pinning + Pods under test : plano-6d8f9b-xk2pq, plano-6d8f9b-r7nlw + Sessions : 4 + Rounds/session : 4 + + Each session is PINNED via one pod and VERIFIED via another. + If Redis is shared, every round must return the same model. 
+================================================================== + + PASS k8s-session-001 + model : gpt-4o-mini-2024-07-18 + pod order : plano-6d8f9b-xk2pq → plano-6d8f9b-r7nlw → plano-6d8f9b-xk2pq → plano-6d8f9b-r7nlw + + PASS k8s-session-002 + model : gpt-5.2 + pod order : plano-6d8f9b-r7nlw → plano-6d8f9b-xk2pq → plano-6d8f9b-r7nlw → plano-6d8f9b-xk2pq + + PASS k8s-session-003 + model : gpt-4o-mini-2024-07-18 + pod order : plano-6d8f9b-xk2pq → plano-6d8f9b-r7nlw → plano-6d8f9b-xk2pq → plano-6d8f9b-r7nlw + + PASS k8s-session-004 + model : gpt-5.2 + pod order : plano-6d8f9b-r7nlw → plano-6d8f9b-xk2pq → plano-6d8f9b-r7nlw → plano-6d8f9b-xk2pq + +================================================================== +Phase 2: Redis key inspection +================================================================== + k8s-session-001 + model_name : gpt-4o-mini-2024-07-18 + route_name : fast_responses + TTL : 587s remaining + k8s-session-002 + model_name : gpt-5.2 + route_name : deep_reasoning + TTL : 581s remaining + ... + +================================================================== +Summary +================================================================== +All sessions were pinned consistently across replicas. +Redis session cache is working correctly in Kubernetes. +``` + +## What to Look For + +### The cross-replica proof + +Each session's `pod order` line shows it alternating between the two pods: + +``` +pod order: pod-A → pod-B → pod-A → pod-B +``` + +Round 1 sets the Redis key (via pod-A). Rounds 2, 3, 4 read from Redis on +alternating pods. If the model stays the same across all rounds, Redis is the +shared source of truth — **not** any in-process state. + +### Redis keys + +```bash +kubectl exec -it redis-0 -n plano-demo -- redis-cli + +127.0.0.1:6379> KEYS * +1) "k8s-session-001" +2) "k8s-session-002" + +127.0.0.1:6379> GET k8s-session-001 +{"model_name":"gpt-4o-mini-2024-07-18","route_name":"fast_responses"} + +127.0.0.1:6379> TTL k8s-session-001 +(integer) 587 +``` + +### Jaeger traces + +```bash +kubectl port-forward svc/jaeger 16686:16686 -n plano-demo +``` + +Open **http://localhost:16686**, select service `plano`. + +- **Pinned requests** — no span to the Arch-Router (decision served from Redis) +- **First request** per session — spans include the router call + a Redis `SET` +- Both Plano pods appear as separate instances in the trace list + +### Scaling up (HPA in action) + +```bash +# Scale to 3 replicas manually +kubectl scale deployment/plano --replicas=3 -n plano-demo + +# Run verification again — now 3 pods alternate +python verify_affinity.py --sessions 6 +``` + +Existing sessions in Redis are unaffected by the scale event. New pods +immediately participate in the shared session pool. + +## Teardown + +```bash +./deploy.sh --destroy +# Then optionally: +kubectl delete namespace plano-demo +``` + +## Notes + +- The Redis StatefulSet uses a `PersistentVolumeClaim`. Session data survives + pod restarts within a TTL window but is not HA. For production, replace with + Redis Sentinel, Redis Cluster, or a managed service (ElastiCache, MemoryStore). +- `session_max_entries` is not enforced by this backend — Redis uses + `maxmemory-policy: allkeys-lru` instead, which is a global limit across all + keys rather than a per-application cap. +- On **minikube**, run `minikube tunnel` in a separate terminal to get an + external IP for the LoadBalancer service. +- On **kind**, switch to `NodePort` (see the comment in `k8s/plano.yaml`). 
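For quick manual checks from a laptop you can also skip both of those options and port-forward the `plano` Service directly. Note that a Service port-forward sticks to a single backing pod for the life of the tunnel, so it does not exercise the cross-replica path — `verify_affinity.py` remains the real test:

```bash
kubectl port-forward svc/plano 12000:12000 -n plano-demo &

curl -s http://localhost:12000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "x-model-affinity: laptop-session" \
  -d '{"model":"openai/gpt-4o-mini","messages":[{"role":"user","content":"Hello"}]}' \
  | jq -r '.model'
```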
diff --git a/demos/llm_routing/session_affinity_redis_k8s/build-and-push.sh b/demos/llm_routing/session_affinity_redis_k8s/build-and-push.sh new file mode 100755 index 00000000..12632ed9 --- /dev/null +++ b/demos/llm_routing/session_affinity_redis_k8s/build-and-push.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +# build-and-push.sh — Build the Plano demo image and push it to your registry. +# +# Usage: +# ./build-and-push.sh +# +# Example: +# ./build-and-push.sh ghcr.io/yourorg/plano-redis:latest +# ./build-and-push.sh docker.io/youruser/plano-redis:0.4.17 +# +# The build context is the repository root. Run this script from anywhere — +# it resolves the repo root automatically. + +set -euo pipefail + +IMAGE="${1:-}" +if [ -z "$IMAGE" ]; then + echo "Usage: $0 " + echo "" + echo "Example:" + echo " $0 ghcr.io/yourorg/plano-redis:latest" + exit 1 +fi + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)" +DOCKERFILE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/Dockerfile" + +echo "Repository root : $REPO_ROOT" +echo "Dockerfile : $DOCKERFILE" +echo "Image : $IMAGE" +echo "" + +echo "==> Building image (this takes a few minutes — Rust compile from scratch)..." +docker build \ + --file "$DOCKERFILE" \ + --tag "$IMAGE" \ + --progress=plain \ + "$REPO_ROOT" + +echo "==> Pushing $IMAGE..." +docker push "$IMAGE" + +echo "" +echo "Done. Update k8s/plano.yaml:" +echo " image: $IMAGE" +echo "" +echo "Then deploy with: ./deploy.sh" diff --git a/demos/llm_routing/session_affinity_redis_k8s/config_k8s.yaml b/demos/llm_routing/session_affinity_redis_k8s/config_k8s.yaml new file mode 100644 index 00000000..da142762 --- /dev/null +++ b/demos/llm_routing/session_affinity_redis_k8s/config_k8s.yaml @@ -0,0 +1,38 @@ +version: v0.4.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true + + - model: openai/gpt-5.2 + access_key: $OPENAI_API_KEY + +routing_preferences: + - name: fast_responses + description: short factual questions, quick lookups, simple summarization, or greetings + models: + - openai/gpt-4o-mini + + - name: deep_reasoning + description: multi-step reasoning, complex analysis, code review, or detailed explanations + models: + - openai/gpt-5.2 + - openai/gpt-4o-mini + +# Redis is reachable inside the cluster via the service name. +routing: + session_ttl_seconds: 600 + session_cache: + type: redis + url: redis://redis.plano-demo.svc.cluster.local:6379 + +tracing: + random_sampling: 100 + trace_arch_internal: true + opentracing_grpc_endpoint: http://jaeger.plano-demo.svc.cluster.local:4317 diff --git a/demos/llm_routing/session_affinity_redis_k8s/deploy.sh b/demos/llm_routing/session_affinity_redis_k8s/deploy.sh new file mode 100755 index 00000000..2a3ae295 --- /dev/null +++ b/demos/llm_routing/session_affinity_redis_k8s/deploy.sh @@ -0,0 +1,136 @@ +#!/usr/bin/env bash +# deploy.sh — Apply all Kubernetes manifests in the correct order. +# +# Usage: +# ./deploy.sh # deploy everything +# ./deploy.sh --destroy # tear down everything (keeps namespace for safety) +# ./deploy.sh --status # show pod and service status + +set -euo pipefail + +DEMO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +K8S_DIR="$DEMO_DIR/k8s" +NS="plano-demo" + +check_prereqs() { + local missing=() + command -v kubectl >/dev/null 2>&1 || missing+=("kubectl") + if [ ${#missing[@]} -gt 0 ]; then + echo "ERROR: missing required tools: ${missing[*]}" + exit 1 + fi + + if ! 
kubectl cluster-info &>/dev/null; then + echo "ERROR: no Kubernetes cluster reachable. Start minikube/kind or configure kubeconfig." + exit 1 + fi +} + +create_secret() { + if kubectl get secret plano-secrets -n "$NS" &>/dev/null; then + echo " Secret plano-secrets already exists, skipping." + return + fi + + local openai_api_key="${OPENAI_API_KEY:-}" + if [ -z "$openai_api_key" ]; then + echo "" + echo "No 'plano-secrets' secret found in namespace '$NS'." + echo "Enter API keys (input is hidden):" + echo "" + read -r -s -p " OPENAI_API_KEY: " openai_api_key + echo "" + else + echo " Using OPENAI_API_KEY from environment." + fi + + if [ -z "$openai_api_key" ]; then + echo "ERROR: OPENAI_API_KEY cannot be empty." + exit 1 + fi + + kubectl create secret generic plano-secrets \ + --from-literal=OPENAI_API_KEY="$openai_api_key" \ + -n "$NS" + + echo " Secret created." +} + +deploy() { + echo "==> Applying namespace..." + kubectl apply -f "$K8S_DIR/namespace.yaml" + + echo "==> Creating API key secret..." + create_secret + + echo "==> Applying Redis (StatefulSet + Services)..." + kubectl apply -f "$K8S_DIR/redis.yaml" + + echo "==> Applying Jaeger..." + kubectl apply -f "$K8S_DIR/jaeger.yaml" + + echo "==> Applying Plano config (ConfigMap)..." + kubectl apply -f "$K8S_DIR/plano-config.yaml" + + echo "==> Applying Plano deployment + HPA..." + kubectl apply -f "$K8S_DIR/plano.yaml" + + echo "" + echo "==> Waiting for Redis to be ready..." + kubectl rollout status statefulset/redis -n "$NS" --timeout=120s + + echo "==> Waiting for Plano pods to be ready..." + kubectl rollout status deployment/plano -n "$NS" --timeout=120s + + echo "" + echo "Deployment complete!" + show_status + echo "" + echo "Useful commands:" + echo " # Tail logs from all Plano pods:" + echo " kubectl logs -l app=plano -n $NS -f" + echo "" + echo " # Open Jaeger UI:" + echo " kubectl port-forward svc/jaeger 16686:16686 -n $NS &" + echo " open http://localhost:16686" + echo "" + echo " # Access Redis CLI:" + echo " kubectl exec -it redis-0 -n $NS -- redis-cli" + echo "" + echo " # Run the verification script:" + echo " python $DEMO_DIR/verify_affinity.py" +} + +destroy() { + echo "==> Deleting Plano, Jaeger, and Redis resources..." + kubectl delete -f "$K8S_DIR/plano.yaml" --ignore-not-found + kubectl delete -f "$K8S_DIR/jaeger.yaml" --ignore-not-found + kubectl delete -f "$K8S_DIR/redis.yaml" --ignore-not-found + kubectl delete -f "$K8S_DIR/plano-config.yaml" --ignore-not-found + kubectl delete secret plano-secrets -n "$NS" --ignore-not-found + + echo "" + echo "Resources deleted." + echo "Namespace '$NS' was kept. 
Remove it manually if desired:" + echo " kubectl delete namespace $NS" +} + +show_status() { + echo "" + echo "=== Pods ===" + kubectl get pods -n "$NS" -o wide + echo "" + echo "=== Services ===" + kubectl get svc -n "$NS" + echo "" + echo "=== HPA ===" + kubectl get hpa -n "$NS" 2>/dev/null || true +} + +check_prereqs + +case "${1:-}" in + --destroy) destroy ;; + --status) show_status ;; + *) deploy ;; +esac diff --git a/demos/llm_routing/session_affinity_redis_k8s/k8s/jaeger.yaml b/demos/llm_routing/session_affinity_redis_k8s/k8s/jaeger.yaml new file mode 100644 index 00000000..7d5c2d62 --- /dev/null +++ b/demos/llm_routing/session_affinity_redis_k8s/k8s/jaeger.yaml @@ -0,0 +1,56 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: jaeger + namespace: plano-demo + labels: + app: jaeger +spec: + replicas: 1 + selector: + matchLabels: + app: jaeger + template: + metadata: + labels: + app: jaeger + spec: + containers: + - name: jaeger + image: jaegertracing/jaeger:2.3.0 + ports: + - containerPort: 16686 # UI + - containerPort: 4317 # OTLP gRPC + - containerPort: 4318 # OTLP HTTP + resources: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "512Mi" + cpu: "500m" +--- +apiVersion: v1 +kind: Service +metadata: + name: jaeger + namespace: plano-demo + labels: + app: jaeger +spec: + selector: + app: jaeger + ports: + - name: ui + port: 16686 + targetPort: 16686 + - name: otlp-grpc + port: 4317 + targetPort: 4317 + - name: otlp-http + port: 4318 + targetPort: 4318 +--- +# NodePort for UI access from your laptop. +# Access at: http://localhost:16686 after: kubectl port-forward svc/jaeger 16686:16686 -n plano-demo diff --git a/demos/llm_routing/session_affinity_redis_k8s/k8s/namespace.yaml b/demos/llm_routing/session_affinity_redis_k8s/k8s/namespace.yaml new file mode 100644 index 00000000..4992eaa4 --- /dev/null +++ b/demos/llm_routing/session_affinity_redis_k8s/k8s/namespace.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: plano-demo + labels: + app.kubernetes.io/part-of: plano-session-affinity-demo diff --git a/demos/llm_routing/session_affinity_redis_k8s/k8s/plano-config.yaml b/demos/llm_routing/session_affinity_redis_k8s/k8s/plano-config.yaml new file mode 100644 index 00000000..6c3d5339 --- /dev/null +++ b/demos/llm_routing/session_affinity_redis_k8s/k8s/plano-config.yaml @@ -0,0 +1,50 @@ +--- +# ConfigMap wrapping the Plano config file. 
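# It is mounted read-only into every Plano pod at /app/plano_config.yaml
# (see the volumeMounts in k8s/plano.yaml).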
+# Regenerate after editing config_k8s.yaml: +# kubectl create configmap plano-config \ +# --from-file=plano_config.yaml=../config_k8s.yaml \ +# -n plano-demo --dry-run=client -o yaml | kubectl apply -f - +apiVersion: v1 +kind: ConfigMap +metadata: + name: plano-config + namespace: plano-demo +data: + plano_config.yaml: | + version: v0.4.0 + + listeners: + - type: model + name: model_listener + port: 12000 + + model_providers: + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true + + - model: openai/gpt-5.2 + access_key: $OPENAI_API_KEY + + routing_preferences: + - name: fast_responses + description: short factual questions, quick lookups, simple summarization, or greetings + models: + - openai/gpt-4o-mini + + - name: deep_reasoning + description: multi-step reasoning, complex analysis, code review, or detailed explanations + models: + - openai/gpt-5.2 + - openai/gpt-4o-mini + + routing: + session_ttl_seconds: 600 + session_cache: + type: redis + url: redis://redis.plano-demo.svc.cluster.local:6379 + + tracing: + random_sampling: 100 + trace_arch_internal: true + opentracing_grpc_endpoint: http://jaeger.plano-demo.svc.cluster.local:4317 diff --git a/demos/llm_routing/session_affinity_redis_k8s/k8s/plano-secrets.example.yaml b/demos/llm_routing/session_affinity_redis_k8s/k8s/plano-secrets.example.yaml new file mode 100644 index 00000000..32ab8681 --- /dev/null +++ b/demos/llm_routing/session_affinity_redis_k8s/k8s/plano-secrets.example.yaml @@ -0,0 +1,19 @@ +# EXAMPLE — do NOT apply this file directly. +# Create the real secret with: +# +# kubectl create secret generic plano-secrets \ +# --from-literal=OPENAI_API_KEY=sk-... \ +# -n plano-demo +# +# Or use the deploy.sh script, which prompts for keys and creates the secret. +# +# If you use a secrets manager (AWS Secrets Manager, GCP Secret Manager, Vault) +# replace this with an ExternalSecret or a CSI driver volume mount instead. +apiVersion: v1 +kind: Secret +metadata: + name: plano-secrets + namespace: plano-demo +type: Opaque +stringData: + OPENAI_API_KEY: "sk-replace-me" diff --git a/demos/llm_routing/session_affinity_redis_k8s/k8s/plano.yaml b/demos/llm_routing/session_affinity_redis_k8s/k8s/plano.yaml new file mode 100644 index 00000000..624a58f3 --- /dev/null +++ b/demos/llm_routing/session_affinity_redis_k8s/k8s/plano.yaml @@ -0,0 +1,130 @@ +--- +# Plano Deployment — 2 replicas sharing one Redis instance. +# All replicas are stateless; routing state lives entirely in Redis. +apiVersion: apps/v1 +kind: Deployment +metadata: + name: plano + namespace: plano-demo + labels: + app: plano +spec: + replicas: 2 + selector: + matchLabels: + app: plano + template: + metadata: + labels: + app: plano + spec: + containers: + - name: plano + # Local dev: run-local.sh sets this to plano-redis:local and loads it + # into minikube/kind so no registry is needed. + # Production: replace with your registry image and use imagePullPolicy: Always. 
+ image: plano-redis:local + imagePullPolicy: IfNotPresent + ports: + - containerPort: 12000 + name: llm-gateway + envFrom: + - secretRef: + name: plano-secrets + env: + - name: LOG_LEVEL + value: "info" + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + volumeMounts: + - name: plano-config + mountPath: /app/plano_config.yaml + subPath: plano_config.yaml + readOnly: true + readinessProbe: + httpGet: + path: /healthz + port: 12000 + initialDelaySeconds: 5 + periodSeconds: 10 + failureThreshold: 3 + livenessProbe: + httpGet: + path: /healthz + port: 12000 + initialDelaySeconds: 15 + periodSeconds: 30 + failureThreshold: 3 + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "1Gi" + cpu: "1000m" + volumes: + - name: plano-config + configMap: + name: plano-config +--- +# LoadBalancer Service — exposes Plano externally. +# On minikube, run: minikube tunnel +# On kind, use NodePort instead (see comment below). +# On cloud providers (GKE, EKS, AKS), an external IP is assigned automatically. +apiVersion: v1 +kind: Service +metadata: + name: plano + namespace: plano-demo + labels: + app: plano +spec: + type: LoadBalancer + selector: + app: plano + ports: + - name: llm-gateway + port: 12000 + targetPort: 12000 +--- +# Uncomment and use instead of the LoadBalancer above when running on kind/minikube +# without tunnel: +# +# apiVersion: v1 +# kind: Service +# metadata: +# name: plano +# namespace: plano-demo +# spec: +# type: NodePort +# selector: +# app: plano +# ports: +# - name: llm-gateway +# port: 12000 +# targetPort: 12000 +# nodePort: 32000 +--- +# HorizontalPodAutoscaler — scales 2 to 5 replicas based on CPU. +# Demonstrates that new replicas join the existing session state seamlessly. +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: plano + namespace: plano-demo +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: plano + minReplicas: 2 + maxReplicas: 5 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 70 diff --git a/demos/llm_routing/session_affinity_redis_k8s/k8s/redis.yaml b/demos/llm_routing/session_affinity_redis_k8s/k8s/redis.yaml new file mode 100644 index 00000000..c47ad09c --- /dev/null +++ b/demos/llm_routing/session_affinity_redis_k8s/k8s/redis.yaml @@ -0,0 +1,96 @@ +--- +# Redis StatefulSet — single-shard, persistence enabled. +# For production, replace with Redis Cluster or a managed service (ElastiCache, MemoryStore, etc.). 
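# maxmemory is capped at 256mb with an allkeys-lru eviction policy (see the
# container args below), so the oldest session keys are evicted first under
# memory pressure.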
+apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: redis + namespace: plano-demo + labels: + app: redis +spec: + serviceName: redis + replicas: 1 + selector: + matchLabels: + app: redis + template: + metadata: + labels: + app: redis + spec: + containers: + - name: redis + image: redis:7-alpine + ports: + - containerPort: 6379 + name: redis + command: + - redis-server + - --appendonly + - "yes" + - --maxmemory + - "256mb" + - --maxmemory-policy + - allkeys-lru + readinessProbe: + exec: + command: ["redis-cli", "ping"] + initialDelaySeconds: 5 + periodSeconds: 5 + livenessProbe: + exec: + command: ["redis-cli", "ping"] + initialDelaySeconds: 15 + periodSeconds: 20 + resources: + requests: + memory: "64Mi" + cpu: "100m" + limits: + memory: "320Mi" + cpu: "500m" + volumeMounts: + - name: redis-data + mountPath: /data + volumeClaimTemplates: + - metadata: + name: redis-data + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 1Gi +--- +# Stable DNS name: redis.plano-demo.svc.cluster.local:6379 +apiVersion: v1 +kind: Service +metadata: + name: redis + namespace: plano-demo + labels: + app: redis +spec: + selector: + app: redis + ports: + - name: redis + port: 6379 + targetPort: 6379 + clusterIP: None # headless — StatefulSet pods get stable DNS +--- +# Regular ClusterIP for application code (redis://redis:6379) +apiVersion: v1 +kind: Service +metadata: + name: redis-service + namespace: plano-demo + labels: + app: redis +spec: + selector: + app: redis + ports: + - name: redis + port: 6379 + targetPort: 6379 diff --git a/demos/llm_routing/session_affinity_redis_k8s/run-local.sh b/demos/llm_routing/session_affinity_redis_k8s/run-local.sh new file mode 100755 index 00000000..80808a88 --- /dev/null +++ b/demos/llm_routing/session_affinity_redis_k8s/run-local.sh @@ -0,0 +1,154 @@ +#!/usr/bin/env bash +# run-local.sh — Build and run the k8s session affinity demo entirely locally with kind. +# No registry, no image push required. +# +# Usage: +# ./run-local.sh # create cluster (if needed), build, deploy, verify +# ./run-local.sh --build-only # build and load the image into kind +# ./run-local.sh --deploy-only # skip build, re-apply k8s manifests +# ./run-local.sh --verify # run verify_affinity.py against the running cluster +# ./run-local.sh --down # tear down k8s resources (keeps kind cluster) +# ./run-local.sh --delete-cluster # also delete the kind cluster + +set -euo pipefail + +DEMO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$DEMO_DIR/../../.." 
&& pwd)" +IMAGE_NAME="plano-redis:local" +KIND_CLUSTER="plano-demo" + +# --------------------------------------------------------------------------- +# Prereq check +# --------------------------------------------------------------------------- + +check_prereqs() { + local missing=() + command -v docker >/dev/null 2>&1 || missing+=("docker") + command -v kubectl >/dev/null 2>&1 || missing+=("kubectl") + command -v kind >/dev/null 2>&1 || missing+=("kind (https://kind.sigs.k8s.io/docs/user/quick-start/#installation)") + command -v python3 >/dev/null 2>&1 || missing+=("python3") + + if [ ${#missing[@]} -gt 0 ]; then + echo "ERROR: missing required tools:" + for t in "${missing[@]}"; do echo " - $t"; done + exit 1 + fi +} + +load_env() { + if [ -f "$DEMO_DIR/.env" ]; then + set -a + # shellcheck disable=SC1091 + source "$DEMO_DIR/.env" + set +a + fi +} + +# --------------------------------------------------------------------------- +# Cluster lifecycle +# --------------------------------------------------------------------------- + +ensure_cluster() { + if kind get clusters 2>/dev/null | grep -q "^${KIND_CLUSTER}$"; then + echo "==> kind cluster '$KIND_CLUSTER' already exists, reusing." + else + echo "==> Creating kind cluster '$KIND_CLUSTER'..." + kind create cluster --name "$KIND_CLUSTER" + echo " Cluster created." + fi + + # Point kubectl at this cluster + kubectl config use-context "kind-${KIND_CLUSTER}" >/dev/null +} + +# --------------------------------------------------------------------------- +# Build and load +# --------------------------------------------------------------------------- + +build() { + echo "==> Building image '$IMAGE_NAME' from repo root..." + docker build \ + --file "$DEMO_DIR/Dockerfile" \ + --tag "$IMAGE_NAME" \ + --progress=plain \ + "$REPO_ROOT" + + echo "==> Loading '$IMAGE_NAME' into kind cluster '$KIND_CLUSTER'..." + kind load docker-image "$IMAGE_NAME" --name "$KIND_CLUSTER" + echo " Image loaded." +} + +# --------------------------------------------------------------------------- +# Deploy / verify / teardown +# --------------------------------------------------------------------------- + +deploy() { + echo "" + echo "==> Deploying to Kubernetes..." + "$DEMO_DIR/deploy.sh" +} + +verify() { + echo "" + echo "==> Running cross-replica verification..." + python3 "$DEMO_DIR/verify_affinity.py" +} + +down() { + "$DEMO_DIR/deploy.sh" --destroy +} + +delete_cluster() { + echo "==> Deleting kind cluster '$KIND_CLUSTER'..." + kind delete cluster --name "$KIND_CLUSTER" + echo " Cluster deleted." +} + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +case "${1:-}" in + --build-only) + check_prereqs + load_env + ensure_cluster + build + ;; + --deploy-only) + check_prereqs + load_env + ensure_cluster + deploy + ;; + --verify) + check_prereqs + verify + ;; + --down) + check_prereqs + down + ;; + --delete-cluster) + check_prereqs + down + delete_cluster + ;; + "") + check_prereqs + load_env + ensure_cluster + echo "" + build + deploy + echo "" + echo "==> Everything is up. Running verification in 5 seconds..." 
+ echo " (Ctrl-C to skip — run manually with: ./run-local.sh --verify)" + sleep 5 + verify + ;; + *) + echo "Usage: $0 [--build-only | --deploy-only | --verify | --down | --delete-cluster]" + exit 1 + ;; +esac diff --git a/demos/llm_routing/session_affinity_redis_k8s/verify_affinity.py b/demos/llm_routing/session_affinity_redis_k8s/verify_affinity.py new file mode 100644 index 00000000..4c368ed7 --- /dev/null +++ b/demos/llm_routing/session_affinity_redis_k8s/verify_affinity.py @@ -0,0 +1,418 @@ +#!/usr/bin/env python3 +""" +verify_affinity.py — Prove that Redis-backed session affinity works across Plano replicas. + +Strategy +-------- +Kubernetes round-robin is non-deterministic, so simply hammering the LoadBalancer +service is not a reliable proof. Instead this script: + + 1. Discovers the two (or more) Plano pod names with kubectl. + 2. Opens a kubectl port-forward tunnel to EACH pod on a separate local port. + 3. Pins a session via Pod 0 (writes the Redis key). + 4. Reads the same session via Pod 1 (must return the same model — reads Redis). + 5. Repeats across N sessions, round-robining which pod sets vs. reads the pin. + +If every round returns the same model, Redis is the shared source of truth and +multi-replica affinity is proven. + +Usage +----- + # From inside the cluster network (e.g. CI job or jumpbox): + python verify_affinity.py --url http://:12000 + + # From your laptop (uses kubectl port-forward automatically): + python verify_affinity.py + + # More sessions / rounds: + python verify_affinity.py --sessions 5 --rounds 6 + +Requirements +------------ + kubectl — configured to reach the plano-demo namespace + Python 3.11+ +""" + +import argparse +import http.client +import json +import signal +import subprocess +import sys +import time +import urllib.error +import urllib.request +from contextlib import contextmanager + +NAMESPACE = "plano-demo" +BASE_LOCAL_PORT = 19100 # port-forward starts here, increments per pod + +PROMPTS = [ + "Explain the difference between TCP and UDP in detail.", + "Write a merge sort implementation in Python.", + "What is quantum entanglement?", + "Describe the CAP theorem with examples.", + "How does gradient descent work in neural networks?", + "What is the time complexity of Dijkstra's algorithm?", +] + + +# --------------------------------------------------------------------------- +# kubectl helpers +# --------------------------------------------------------------------------- + + +def get_pod_names() -> list[str]: + """Return running Plano pod names in the plano-demo namespace.""" + result = subprocess.run( + [ + "kubectl", + "get", + "pods", + "-n", + NAMESPACE, + "-l", + "app=plano", + "--field-selector=status.phase=Running", + "-o", + "jsonpath={.items[*].metadata.name}", + ], + capture_output=True, + text=True, + check=True, + ) + pods = result.stdout.strip().split() + if not pods or pods == [""]: + raise RuntimeError( + f"No running Plano pods found in namespace '{NAMESPACE}'.\n" + "Is the cluster deployed? 
Run: ./deploy.sh" + ) + return pods + + +@contextmanager +def port_forward(pod_name: str, local_port: int, remote_port: int = 12000): + """Context manager that starts and stops a kubectl port-forward.""" + proc = subprocess.Popen( + [ + "kubectl", + "port-forward", + f"pod/{pod_name}", + f"{local_port}:{remote_port}", + "-n", + NAMESPACE, + ], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + # Give the tunnel a moment to establish + time.sleep(1.5) + try: + yield f"http://localhost:{local_port}" + finally: + proc.send_signal(signal.SIGTERM) + try: + proc.wait(timeout=3) + except subprocess.TimeoutExpired: + proc.kill() + + +# --------------------------------------------------------------------------- +# HTTP helpers +# --------------------------------------------------------------------------- + + +def chat( + base_url: str, + session_id: str | None, + message: str, + model: str = "openai/gpt-4o-mini", + retries: int = 3, + retry_delay: float = 5.0, +) -> dict: + payload = json.dumps( + { + "model": model, + "messages": [{"role": "user", "content": message}], + } + ).encode() + + headers = {"Content-Type": "application/json"} + if session_id: + headers["x-model-affinity"] = session_id + + req = urllib.request.Request( + f"{base_url}/v1/chat/completions", + data=payload, + headers=headers, + method="POST", + ) + last_err: Exception | None = None + for attempt in range(retries): + try: + with urllib.request.urlopen(req, timeout=60) as resp: + body = resp.read() + if not body: + raise RuntimeError(f"Empty response body from {base_url}") + return json.loads(body) + except urllib.error.HTTPError as e: + if e.code in (503, 502, 429) and attempt < retries - 1: + time.sleep(retry_delay * (attempt + 1)) + last_err = e + continue + raise RuntimeError(f"Request to {base_url} failed: {e}") from e + except ( + urllib.error.URLError, + http.client.RemoteDisconnected, + RuntimeError, + ) as e: + if attempt < retries - 1: + time.sleep(retry_delay * (attempt + 1)) + last_err = e + continue + raise RuntimeError(f"Request to {base_url} failed: {e}") from e + except json.JSONDecodeError as e: + raise RuntimeError(f"Invalid JSON from {base_url}: {e}") from e + raise RuntimeError( + f"Request to {base_url} failed after {retries} attempts: {last_err}" + ) + + +def extract_model(response: dict) -> str: + return response.get("model", "") + + +# --------------------------------------------------------------------------- +# Verification phases +# --------------------------------------------------------------------------- + + +def phase_loadbalancer(url: str, rounds: int) -> None: + """Phase 0: quick smoke test against the LoadBalancer / provided URL.""" + print("=" * 66) + print(f"Phase 0: Smoke test against {url}") + print("=" * 66) + for i in range(rounds): + resp = chat(url, None, PROMPTS[i % len(PROMPTS)]) + print(f" Request {i + 1}: model = {extract_model(resp)}") + print() + + +def phase_cross_replica( + pod_urls: dict[str, str], num_sessions: int, rounds: int +) -> bool: + """ + Phase 1 — Cross-replica pinning. + + For each session: + • Round 1: send to pod_A (sets the Redis key) + • Rounds 2+: alternate between pod_A and pod_B + • Assert every round returns the same model. 
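
    With two pods and the default four rounds this yields the
    pod order A → B → A → B shown in the README.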
+ """ + pod_names = list(pod_urls.keys()) + all_passed = True + session_results: dict[str, dict] = {} + + print("=" * 66) + print("Phase 1: Cross-replica session pinning") + print(f" Pods under test : {', '.join(pod_names)}") + print(f" Sessions : {num_sessions}") + print(f" Rounds/session : {rounds}") + print() + print(" Each session is PINNED via one pod and VERIFIED via another.") + print(" If Redis is shared, every round must return the same model.") + print("=" * 66) + + for s in range(num_sessions): + session_id = f"k8s-session-{s + 1:03d}" + models_seen = [] + pod_sequence = [] + + for r in range(rounds): + # Alternate which pod handles each round + pod_name = pod_names[r % len(pod_names)] + url = pod_urls[pod_name] + + try: + resp = chat(url, session_id, PROMPTS[(s + r) % len(PROMPTS)]) + model = extract_model(resp) + except RuntimeError as e: + print(f" ERROR on {pod_name} round {r + 1}: {e}") + all_passed = False + continue + + models_seen.append(model) + pod_sequence.append(pod_name) + + unique_models = set(models_seen) + passed = len(unique_models) == 1 + + session_results[session_id] = { + "passed": passed, + "model": models_seen[0] if models_seen else "", + "unique_models": unique_models, + "pod_sequence": pod_sequence, + } + + status = "PASS" if passed else "FAIL" + detail = models_seen[0] if passed else str(unique_models) + print(f"\n {status} {session_id}") + print(f" model : {detail}") + print(f" pod order : {' → '.join(pod_sequence)}") + + if not passed: + all_passed = False + + return all_passed + + +def phase_redis_inspect(num_sessions: int) -> None: + """Phase 2: read keys directly from Redis to show what's stored.""" + print() + print("=" * 66) + print("Phase 2: Redis key inspection") + print("=" * 66) + for s in range(num_sessions): + session_id = f"k8s-session-{s + 1:03d}" + result = subprocess.run( + [ + "kubectl", + "exec", + "-n", + NAMESPACE, + "redis-0", + "--", + "redis-cli", + "GET", + session_id, + ], + capture_output=True, + text=True, + ) + raw = result.stdout.strip() + ttl_result = subprocess.run( + [ + "kubectl", + "exec", + "-n", + NAMESPACE, + "redis-0", + "--", + "redis-cli", + "TTL", + session_id, + ], + capture_output=True, + text=True, + ) + ttl = ttl_result.stdout.strip() + + if raw and raw != "(nil)": + try: + data = json.loads(raw) + print(f" {session_id}") + print(f" model_name : {data.get('model_name', '?')}") + print(f" route_name : {data.get('route_name', 'null')}") + print(f" TTL : {ttl}s remaining") + except json.JSONDecodeError: + print(f" {session_id}: (raw) {raw}") + else: + print(f" {session_id}: key not found or expired") + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main() -> None: + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument( + "--url", + default=None, + help="LoadBalancer URL to use instead of per-pod port-forwards. 
" + "When set, cross-replica proof is skipped (no pod targeting).", + ) + parser.add_argument( + "--sessions", type=int, default=4, help="Number of sessions (default 4)" + ) + parser.add_argument( + "--rounds", type=int, default=4, help="Rounds per session (default 4)" + ) + parser.add_argument( + "--skip-redis-inspect", action="store_true", help="Skip Redis key inspection" + ) + args = parser.parse_args() + + if args.url: + # Simple mode: hit the LoadBalancer directly + print(f"Mode: LoadBalancer ({args.url})") + print() + phase_loadbalancer(args.url, args.rounds) + print("To get the full cross-replica proof, run without --url.") + sys.exit(0) + + # Full mode: port-forward to each pod individually + print("Mode: per-pod port-forward (full cross-replica proof)") + print() + + try: + pod_names = get_pod_names() + except (subprocess.CalledProcessError, RuntimeError) as e: + print(f"ERROR: {e}", file=sys.stderr) + sys.exit(1) + + if len(pod_names) < 2: + print(f"WARNING: only {len(pod_names)} Plano pod(s) running.") + print(" For a true cross-replica test you need at least 2.") + print(" Scale up: kubectl scale deployment/plano --replicas=2 -n plano-demo") + print() + + print(f"Found {len(pod_names)} Plano pod(s): {', '.join(pod_names)}") + print("Opening per-pod port-forward tunnels...") + print() + + pod_urls: dict[str, str] = {} + contexts = [] + + for i, pod in enumerate(pod_names): + local_port = BASE_LOCAL_PORT + i + ctx = port_forward(pod, local_port) + url = ctx.__enter__() + pod_urls[pod] = url + contexts.append((ctx, url)) + print(f" {pod} → localhost:{local_port}") + + print() + + try: + passed = phase_cross_replica(pod_urls, args.sessions, args.rounds) + + if not args.skip_redis_inspect: + phase_redis_inspect(args.sessions) + + print() + print("=" * 66) + print("Summary") + print("=" * 66) + if passed: + print("All sessions were pinned consistently across replicas.") + print("Redis session cache is working correctly in Kubernetes.") + else: + print("One or more sessions were NOT consistent across replicas.") + print("Check brightstaff logs: kubectl logs -l app=plano -n plano-demo") + + finally: + for ctx, _ in contexts: + try: + ctx.__exit__(None, None, None) + except Exception: + pass + + sys.exit(0 if passed else 1) + + +if __name__ == "__main__": + main()