diff --git a/.claude/skills/release/SKILL.md b/.claude/skills/release/SKILL.md index 80510004..ba101bd3 100644 --- a/.claude/skills/release/SKILL.md +++ b/.claude/skills/release/SKILL.md @@ -25,4 +25,6 @@ Update the version string in ALL of these files: Do NOT change version strings in `*.lock` files or `Cargo.lock`. +After updating all version strings, run `cd cli && uv lock` to update the lock file with the new version. + After making changes, show a summary of all files modified and the old → new version. diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 25e6f99d..01d5c33f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -133,13 +133,13 @@ jobs: load: true tags: | ${{ env.PLANO_DOCKER_IMAGE }} - ${{ env.DOCKER_IMAGE }}:0.4.11 + ${{ env.DOCKER_IMAGE }}:0.4.12 ${{ env.DOCKER_IMAGE }}:latest cache-from: type=gha cache-to: type=gha,mode=max - name: Save image as artifact - run: docker save ${{ env.PLANO_DOCKER_IMAGE }} ${{ env.DOCKER_IMAGE }}:0.4.11 ${{ env.DOCKER_IMAGE }}:latest -o /tmp/plano-image.tar + run: docker save ${{ env.PLANO_DOCKER_IMAGE }} ${{ env.DOCKER_IMAGE }}:0.4.12 ${{ env.DOCKER_IMAGE }}:latest -o /tmp/plano-image.tar - name: Upload image artifact uses: actions/upload-artifact@v6 diff --git a/.gitignore b/.gitignore index af706ea4..391c17fa 100644 --- a/.gitignore +++ b/.gitignore @@ -152,3 +152,4 @@ apps/*/dist/ .cursor/ .agents +docs/do/ diff --git a/apps/www/src/components/Hero.tsx b/apps/www/src/components/Hero.tsx index 7952c68f..fcfe5f01 100644 --- a/apps/www/src/components/Hero.tsx +++ b/apps/www/src/components/Hero.tsx @@ -24,7 +24,7 @@ export function Hero() { >
- v0.4.11 + v0.4.12 — diff --git a/build_filter_image.sh b/build_filter_image.sh index 8e041894..15d3d10e 100644 --- a/build_filter_image.sh +++ b/build_filter_image.sh @@ -1 +1 @@ -docker build -f Dockerfile . -t katanemo/plano -t katanemo/plano:0.4.11 +docker build -f Dockerfile . -t katanemo/plano -t katanemo/plano:0.4.12 diff --git a/cli/planoai/__init__.py b/cli/planoai/__init__.py index b94eadc2..e69352e8 100644 --- a/cli/planoai/__init__.py +++ b/cli/planoai/__init__.py @@ -1,3 +1,3 @@ """Plano CLI - Intelligent Prompt Gateway.""" -__version__ = "0.4.11" +__version__ = "0.4.12" diff --git a/cli/planoai/consts.py b/cli/planoai/consts.py index 145fb640..9c330caa 100644 --- a/cli/planoai/consts.py +++ b/cli/planoai/consts.py @@ -5,7 +5,7 @@ PLANO_COLOR = "#969FF4" SERVICE_NAME_ARCHGW = "plano" PLANO_DOCKER_NAME = "plano" -PLANO_DOCKER_IMAGE = os.getenv("PLANO_DOCKER_IMAGE", "katanemo/plano:0.4.11") +PLANO_DOCKER_IMAGE = os.getenv("PLANO_DOCKER_IMAGE", "katanemo/plano:0.4.12") DEFAULT_OTEL_TRACING_GRPC_ENDPOINT = "http://localhost:4317" # Native mode constants diff --git a/cli/pyproject.toml b/cli/pyproject.toml index 3f9be272..25cc81a4 100644 --- a/cli/pyproject.toml +++ b/cli/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "planoai" -version = "0.4.11" +version = "0.4.12" description = "Python-based CLI tool to manage Plano." authors = [{name = "Katanemo Labs, Inc."}] readme = "README.md" diff --git a/cli/uv.lock b/cli/uv.lock index 9d85bf85..dfca2484 100644 --- a/cli/uv.lock +++ b/cli/uv.lock @@ -337,7 +337,7 @@ wheels = [ [[package]] name = "planoai" -version = "0.4.9" +version = "0.4.12" source = { editable = "." 
} dependencies = [ { name = "click" }, diff --git a/demos/llm_routing/model_routing_service/DEMO.md b/demos/llm_routing/model_routing_service/DEMO.md new file mode 100644 index 00000000..a64604a8 --- /dev/null +++ b/demos/llm_routing/model_routing_service/DEMO.md @@ -0,0 +1,341 @@ +# Plano: Intelligent LLM Routing as Infrastructure + +--- + +## Plano + +An AI-native proxy and data plane for agentic apps — with built-in orchestration, safety, observability, and smart LLM routing so you stay focused on your agent's core logic. + +- **One endpoint, many models** — apps call Plano using standard OpenAI/Anthropic APIs; Plano handles provider selection, keys, and failover +- **Intelligent routing** — a lightweight 1.5B router model classifies user intent and picks the best model per request +- **Platform governance** — centralize API keys, rate limits, guardrails, and observability without touching app code +- **Runs anywhere** — single binary, no dependencies; self-host the router for full data privacy + +``` +┌───────────┐ ┌─────────────────────────────────┐ ┌──────────────┐ +│ Client │ ──── │ Plano │ ──── │ OpenAI │ +│ (any │ │ │ │ Anthropic │ +│ language)│ │ Arch-Router (1.5B model) │ │ Any Provider│ +└───────────┘ │ analyzes intent → picks model │ └──────────────┘ + └─────────────────────────────────┘ +``` + +--- + +## Live Demo: Routing Decision Service + +The `/routing/v1/*` endpoints return **routing decisions without calling the LLM** — perfect for inspecting, testing, and validating routing behavior. 
+ +--- + +### Demo 1 — Code Generation Request + +```bash +curl -s http://localhost:12000/routing/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "messages": [ + {"role": "user", "content": "Write a Python function that implements binary search"} + ] + }' +``` + +**Response:** +```json +{ + "model": "anthropic/claude-sonnet-4-20250514", + "route": "code_generation" +} +``` + +Plano recognized the coding intent and routed to Claude. + +--- + +### Demo 2 — Complex Reasoning Request + +```bash +curl -s http://localhost:12000/routing/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "messages": [ + {"role": "user", "content": "Explain the trade-offs between microservices and monolithic architectures"} + ] + }' +``` + +**Response:** +```json +{ + "model": "openai/gpt-4o", + "route": "complex_reasoning" +} +``` + +Same endpoint — Plano routed to GPT-4o for reasoning. + +--- + +### Demo 3 — Simple Question (No Match) + +```bash +curl -s http://localhost:12000/routing/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "messages": [ + {"role": "user", "content": "What is the capital of France?"} + ] + }' +``` + +**Response:** +```json +{ + "model": "none", + "route": "null" +} +``` + +No preference matched (hence `"model": "none"` in the response) — Plano falls back to the default (cheapest) model. + +--- + +### Demo 4 — Anthropic Messages Format + +```bash +curl -s http://localhost:12000/routing/v1/messages \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "max_tokens": 1024, + "messages": [ + {"role": "user", "content": "Create a REST API endpoint in Rust using actix-web that handles user registration"} + ] + }' +``` + +**Response:** +```json +{ + "model": "anthropic/claude-sonnet-4-20250514", + "route": "code_generation" +} +``` + +Same routing, Anthropic request format. 
+ +--- + +### Demo 5 — OpenAI Responses API Format + +```bash +curl -s http://localhost:12000/routing/v1/responses \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gpt-4o-mini", + "input": "Build a React component that renders a sortable data table" + }' +``` + +**Response:** +```json +{ + "model": "anthropic/claude-sonnet-4-20250514", + "route": "code_generation" +} +``` + +Same routing engine, works with the OpenAI Responses API format too. + +--- + +## How Did That Work? + +10 lines of YAML. No code. + +```yaml +model_providers: + + - model: openai/gpt-4o-mini + default: true # fallback for unmatched requests + + - model: openai/gpt-4o + routing_preferences: + - name: complex_reasoning + description: complex reasoning tasks, multi-step analysis + + - model: anthropic/claude-sonnet-4-20250514 + routing_preferences: + - name: code_generation + description: generating new code, writing functions +``` + +That's the entire routing configuration. + +--- + +## Under the Hood: How Routing Preferences Work + +### Writing Good Preferences + +Each `routing_preference` has two fields: + +| Field | Purpose | Example | +|---|---|---| +| `name` | Route identifier (returned in responses) | `code_generation` | +| `description` | Natural language — tells the router **when** to pick this model | `generating new code, writing functions, or creating boilerplate` | + +The `description` is the key lever. 
Write it like you're explaining to a colleague when to use this model: + +```yaml +# Good — specific, descriptive +routing_preferences: + - name: code_generation + description: generating new code snippets, writing functions, creating boilerplate, or refactoring existing code + +# Too vague — overlaps with everything +routing_preferences: + - name: code + description: anything related to code +``` + +Tips: +- **Be specific** — "multi-step mathematical proofs and formal logic" beats "hard questions" +- **Describe the task, not the model** — focus on what the user is asking for +- **Avoid overlap** — if two preferences match the same request, the router has to guess +- **One model can have multiple preferences** — good at both code and math? List both + +--- + +### How Arch-Router Uses Them + +When a request arrives, Plano constructs a prompt for the 1.5B Arch-Router model: + +```xml +You are a helpful assistant designed to find the best suited route. + + +[ + {"name": "complex_reasoning", "description": "complex reasoning tasks, multi-step analysis"}, + {"name": "code_generation", "description": "generating new code, writing functions"} +] + + + +[{"role": "user", "content": "Write a Python function that implements binary search"}] + + +Your task is to decide which route best suits the user intent... +``` + +The router classifies the intent and responds: +```json +{"route": "code_generation"} +``` + +Plano maps `code_generation` back to the model that owns it → `anthropic/claude-sonnet-4-20250514`. + +--- + +### The Full Flow + +``` +1. Request arrives → "Write binary search in Python" +2. Preferences serialized → [{"name":"code_generation", ...}, {"name":"complex_reasoning", ...}] +3. Arch-Router classifies → {"route": "code_generation"} +4. Route → Model lookup → code_generation → anthropic/claude-sonnet-4-20250514 +5. Request forwarded → Claude generates the response +``` + +No match? 
Arch-Router returns `{"route": "other"}` → Plano falls back to the default model. + +--- + +### What Powers the Routing + +**Arch-Router** — a purpose-built 1.5B parameter model for intent classification. + +- Runs locally (Ollama) or hosted — when self-hosted, no data leaves your network +- Sub-100ms routing decisions +- Handles multi-turn conversations (automatically truncates to fit context) +- Based on preference-aligned routing research + +--- + +## Multi-Format Support + +Same routing engine, any API format: + +| Endpoint | Format | +|---|---| +| `/routing/v1/chat/completions` | OpenAI Chat Completions | +| `/routing/v1/messages` | Anthropic Messages | +| `/routing/v1/responses` | OpenAI Responses API | + +--- + +## Inline Routing Policy + +Clients can override routing at request time — no config change needed: + +```json +{ + "model": "gpt-4o-mini", + "messages": [{"role": "user", "content": "Write quicksort in Go"}], + "routing_policy": [ + { + "model": "openai/gpt-4o", + "routing_preferences": [ + {"name": "coding", "description": "code generation and debugging"} + ] + }, + { + "model": "openai/gpt-4o-mini", + "routing_preferences": [ + {"name": "general", "description": "simple questions and conversation"} + ] + } + ] +} +``` + +Platform sets defaults. Teams override when needed. + +--- + +## Beyond Routing + +Plano is a full AI data plane: + +- **Guardrails** — prompt/response filtering, PII detection +- **Observability** — OpenTelemetry tracing, per-request metrics +- **Rate Limiting** — token-aware rate limiting per model +- **Multi-Provider** — OpenAI, Anthropic, Azure, Gemini, Groq, DeepSeek, Ollama, and more +- **Model Aliases** — `arch.fast.v1` → `gpt-4o-mini` (swap providers without client changes) + +--- + +## Key Takeaways + +1. **No SDK required** — standard API, any language, any framework +2. **Semantic routing** — plain English preferences, not hand-coded rules +3. **Self-hosted router** — 1.5B model runs locally, no data leaves the network +4. 
**Inspect before you route** — decision-only endpoints for testing and CI/CD +5. **Platform governance** — centralized keys, aliases, and routing policies + +--- + +## Try It + +```bash +pip install planoai +export OPENAI_API_KEY=... +export ANTHROPIC_API_KEY=... +plano up -f config.yaml +bash demo.sh +``` + +**GitHub:** github.com/katanemo/plano diff --git a/docs/source/conf.py b/docs/source/conf.py index ec476136..e554329f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -17,7 +17,7 @@ from sphinxawesome_theme.postprocess import Icons project = "Plano Docs" copyright = "2025, Katanemo Labs, Inc" author = "Katanemo Labs, Inc" -release = " v0.4.11" +release = " v0.4.12" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/docs/source/get_started/quickstart.rst b/docs/source/get_started/quickstart.rst index 279fde2d..9d51d1c4 100644 --- a/docs/source/get_started/quickstart.rst +++ b/docs/source/get_started/quickstart.rst @@ -43,7 +43,7 @@ Plano's CLI allows you to manage and interact with the Plano efficiently. To ins .. code-block:: console - $ uv tool install planoai==0.4.11 + $ uv tool install planoai==0.4.12 **Option 2: Install with pip (Traditional)** @@ -51,7 +51,7 @@ Plano's CLI allows you to manage and interact with the Plano efficiently. To ins $ python -m venv venv $ source venv/bin/activate # On Windows, use: venv\Scripts\activate - $ pip install planoai==0.4.11 + $ pip install planoai==0.4.12 .. 
_llm_routing_quickstart: diff --git a/docs/source/resources/deployment.rst b/docs/source/resources/deployment.rst index 7b8b0554..2689384e 100644 --- a/docs/source/resources/deployment.rst +++ b/docs/source/resources/deployment.rst @@ -65,7 +65,7 @@ Create a ``docker-compose.yml`` file with the following configuration: # docker-compose.yml services: plano: - image: katanemo/plano:0.4.11 + image: katanemo/plano:0.4.12 container_name: plano ports: - "10000:10000" # ingress (client -> plano) @@ -153,7 +153,7 @@ Create a ``plano-deployment.yaml``: spec: containers: - name: plano - image: katanemo/plano:0.4.11 + image: katanemo/plano:0.4.12 ports: - containerPort: 12000 # LLM gateway (chat completions, model routing) name: llm-gateway