mirror of
https://github.com/katanemo/plano.git
synced 2026-04-25 00:36:34 +02:00
feat(demos): add affinity testing demo for model pinning
This commit is contained in:
parent
fbc247ab05
commit
92f6015165
5 changed files with 137 additions and 18 deletions
|
|
@ -1,5 +1,5 @@
|
|||
use hermesllm::apis::openai::ChatCompletionsResponse;
|
||||
use hyper::header;
|
||||
use serde::Deserialize;
|
||||
use thiserror::Error;
|
||||
use tracing::warn;
|
||||
|
||||
|
|
@ -12,23 +12,8 @@ pub enum HttpError {
|
|||
Json(serde_json::Error, String),
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct RouterChatCompletionResponse {
|
||||
choices: Vec<RouterChoice>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct RouterChoice {
|
||||
message: RouterMessage,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct RouterMessage {
|
||||
content: Option<String>,
|
||||
}
|
||||
|
||||
/// Sends a POST request to the given URL and extracts the text content
|
||||
/// from the first choice of a chat-completions-like response.
|
||||
/// from the first choice of the `ChatCompletionsResponse`.
|
||||
///
|
||||
/// Returns `Some((content, elapsed))` on success, or `None` if the response
|
||||
/// had no choices or the first choice had no content.
|
||||
|
|
@ -45,7 +30,7 @@ pub async fn post_and_extract_content(
|
|||
let body = res.text().await?;
|
||||
let elapsed = start_time.elapsed();
|
||||
|
||||
let response: RouterChatCompletionResponse = serde_json::from_str(&body).map_err(|err| {
|
||||
let response: ChatCompletionsResponse = serde_json::from_str(&body).map_err(|err| {
|
||||
warn!(error = %err, body = %body, "failed to parse json response");
|
||||
HttpError::Json(err, format!("Failed to parse JSON: {}", body))
|
||||
})?;
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ This directory contains demos showcasing Plano's capabilities as an AI-native pr
|
|||
|------|-------------|
|
||||
| [Preference-Based Routing](llm_routing/preference_based_routing/) | Routes prompts to LLMs based on user-defined preferences and task type (e.g. code generation vs. understanding) |
|
||||
| [Model Alias Routing](llm_routing/model_alias_routing/) | Maps semantic aliases (`arch.summarize.v1`) to provider-specific models for centralized governance |
|
||||
| [Affinity Testing](affinity_testing/) | Verifies `X-Model-Affinity` pinning behavior using standard OpenAI SDK requests |
|
||||
| [Claude Code Router](llm_routing/claude_code_router/) | Extends Claude Code with multi-provider access and preference-aligned routing for coding tasks |
|
||||
| [Codex Router](llm_routing/codex_router/) | Extends Codex CLI with multi-provider access and preference-aligned routing for coding tasks |
|
||||
|
||||
|
|
|
|||
46
demos/affinity_testing/README.md
Normal file
46
demos/affinity_testing/README.md
Normal file
|
|
@ -0,0 +1,46 @@
|
|||
# Affinity Testing (OpenAI SDK)
|
||||
|
||||
Quick demo to verify model affinity pinning using standard OpenAI SDK calls against Plano.
|
||||
|
||||
## 1) Start Plano with affinity config
|
||||
|
||||
```bash
|
||||
export OPENAI_API_KEY=<your-key>
|
||||
|
||||
planoai up demos/affinity_testing/config.yaml
|
||||
```
|
||||
|
||||
`config.yaml` enables affinity cache settings:
|
||||
|
||||
```yaml
|
||||
routing:
|
||||
session_ttl_seconds: 600
|
||||
session_max_entries: 1000
|
||||
```
|
||||
|
||||
## 2) Run the demo script
|
||||
|
||||
```bash
|
||||
python demos/affinity_testing/demo.py
|
||||
```
|
||||
|
||||
The script uses this exact SDK pattern:
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
import uuid
|
||||
|
||||
client = OpenAI(base_url="http://localhost:12000/v1", api_key="EMPTY")
|
||||
affinity_id = str(uuid.uuid4())
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="gpt-5.2",
|
||||
messages=messages,
|
||||
extra_headers={"X-Model-Affinity": affinity_id},
|
||||
)
|
||||
```
|
||||
|
||||
## Expected behavior
|
||||
|
||||
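- Call 1 and call 2 share the same affinity ID and should stay on the same selected model, even though they send different kinds of prompts (code generation vs. complex reasoning) whose routing preferences in `config.yaml` rank the two models in opposite order. Compare the `model:` lines the script prints for calls 1 and 2.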
|
||||
- Call 3 uses a new affinity ID and should be free to route independently.
|
||||
31
demos/affinity_testing/config.yaml
Normal file
31
demos/affinity_testing/config.yaml
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
version: v0.4.0
|
||||
|
||||
listeners:
|
||||
- type: model
|
||||
name: model_listener
|
||||
port: 12000
|
||||
|
||||
model_providers:
|
||||
- model: openai/gpt-5.2
|
||||
access_key: $OPENAI_API_KEY
|
||||
default: true
|
||||
|
||||
- model: openai/gpt-5.2-chat-latest
|
||||
access_key: $OPENAI_API_KEY
|
||||
|
||||
routing_preferences:
|
||||
- name: complex_reasoning
|
||||
description: complex reasoning tasks, multi-step analysis, or detailed explanations
|
||||
models:
|
||||
- openai/gpt-5.2
|
||||
- openai/gpt-5.2-chat-latest
|
||||
|
||||
- name: code_generation
|
||||
description: generating new code, writing functions, or creating boilerplate
|
||||
models:
|
||||
- openai/gpt-5.2-chat-latest
|
||||
- openai/gpt-5.2
|
||||
|
||||
routing:
|
||||
session_ttl_seconds: 600
|
||||
session_max_entries: 1000
|
||||
56
demos/affinity_testing/demo.py
Normal file
56
demos/affinity_testing/demo.py
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
import uuid
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(base_url="http://localhost:12000/v1", api_key="EMPTY")
|
||||
|
||||
|
||||
def chat_with_affinity(messages: list[dict[str, str]], affinity_id: str):
    """Send one chat-completions request tagged with an affinity ID.

    Args:
        messages: Chat messages in OpenAI format (``role`` / ``content`` dicts).
        affinity_id: Value sent in the ``X-Model-Affinity`` request header.

    Returns:
        The object returned by ``client.chat.completions.create``.
    """
    # The affinity ID travels as an extra HTTP header; the rest of the call
    # intentionally stays a plain OpenAI SDK chat-completions request.
    affinity_headers = {"X-Model-Affinity": affinity_id}
    return client.chat.completions.create(
        model="gpt-5.2",
        messages=messages,
        extra_headers=affinity_headers,
    )
|
||||
|
||||
|
||||
def show(label: str, response):
    """Print a short, human-readable summary of one chat completion.

    Args:
        label: Heading printed on the first line.
        response: Chat completion object exposing ``model`` and ``choices``;
            each choice is expected to carry ``message.content``.

    Output is four lines: the label, the model name, the first 120 characters
    of the first choice's text (newlines replaced by spaces), and a blank
    line. A response with no choices, or whose first choice has no content,
    is shown with empty text instead of raising.
    """
    choices = response.choices
    # Guard the index: an empty (or missing) choices list would otherwise
    # raise IndexError and abort the demo before later calls are shown.
    content = (choices[0].message.content or "") if choices else ""
    print(f"{label}")
    print(f" model: {response.model}")
    # chr(10) is "\n"; flatten newlines so the preview stays on one line.
    print(f" text : {content[:120].replace(chr(10), ' ')}")
    print()
|
||||
|
||||
|
||||
def main():
    """Run the three-call affinity demo and print each result.

    Calls 1 and 2 send different prompts under the same affinity ID; call 3
    repeats the second prompt under a newly generated ID. Each response is
    printed with ``show`` so the reported model names can be compared.
    """
    pinned_id = str(uuid.uuid4())
    print("== Affinity Demo (OpenAI SDK) ==")
    print(f"affinity id: {pinned_id}")
    print()

    code_prompt = [
        {"role": "user", "content": "Write Python code for binary search."},
    ]
    reasoning_prompt = [
        {
            "role": "user",
            "content": "Explain whether free will can exist with determinism.",
        },
    ]

    # Calls 1 and 2: same affinity ID, different kinds of prompt.
    show(
        "1) first call (new affinity, routes and caches)",
        chat_with_affinity(code_prompt, pinned_id),
    )
    show(
        "2) second call (same affinity, should stay pinned)",
        chat_with_affinity(reasoning_prompt, pinned_id),
    )

    # Call 3: a fresh affinity ID, generated only after the pinned calls.
    fresh_id = str(uuid.uuid4())
    show(
        "3) third call (new affinity, fresh routing)",
        chat_with_affinity(reasoning_prompt, fresh_id),
    )

    print("If 1 and 2 use the same model, affinity pinning is working.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue