feat(demos): add affinity testing demo for model pinning

This commit is contained in:
Spherrrical 2026-04-09 11:18:06 -07:00
parent fbc247ab05
commit 92f6015165
5 changed files with 137 additions and 18 deletions

View file

@ -1,5 +1,5 @@
use hermesllm::apis::openai::ChatCompletionsResponse;
use hyper::header;
use serde::Deserialize;
use thiserror::Error;
use tracing::warn;
@ -12,23 +12,8 @@ pub enum HttpError {
Json(serde_json::Error, String),
}
/// Minimal deserialization target for a chat-completions-style JSON body.
/// Only the fields this module actually reads are modeled; serde ignores
/// the rest of the payload.
#[derive(Debug, Deserialize)]
struct RouterChatCompletionResponse {
    // Field names must match the JSON keys exactly — do not rename.
    choices: Vec<RouterChoice>,
}
/// One entry of the `choices` array; carries only the assistant message.
#[derive(Debug, Deserialize)]
struct RouterChoice {
    message: RouterMessage,
}
/// Assistant message payload. `content` is `Option` because a choice may
/// legitimately arrive with no text content (see the caller's None path).
#[derive(Debug, Deserialize)]
struct RouterMessage {
    content: Option<String>,
}
/// Sends a POST request to the given URL and extracts the text content
/// from the first choice of a chat-completions-like response.
/// from the first choice of the `ChatCompletionsResponse`.
///
/// Returns `Some((content, elapsed))` on success, or `None` if the response
/// had no choices or the first choice had no content.
@ -45,7 +30,7 @@ pub async fn post_and_extract_content(
let body = res.text().await?;
let elapsed = start_time.elapsed();
let response: RouterChatCompletionResponse = serde_json::from_str(&body).map_err(|err| {
let response: ChatCompletionsResponse = serde_json::from_str(&body).map_err(|err| {
warn!(error = %err, body = %body, "failed to parse json response");
HttpError::Json(err, format!("Failed to parse JSON: {}", body))
})?;

View file

@ -15,6 +15,7 @@ This directory contains demos showcasing Plano's capabilities as an AI-native pr
|------|-------------|
| [Preference-Based Routing](llm_routing/preference_based_routing/) | Routes prompts to LLMs based on user-defined preferences and task type (e.g. code generation vs. understanding) |
| [Model Alias Routing](llm_routing/model_alias_routing/) | Maps semantic aliases (`arch.summarize.v1`) to provider-specific models for centralized governance |
| [Affinity Testing](affinity_testing/) | Verifies `X-Model-Affinity` pinning behavior using standard OpenAI SDK requests |
| [Claude Code Router](llm_routing/claude_code_router/) | Extends Claude Code with multi-provider access and preference-aligned routing for coding tasks |
| [Codex Router](llm_routing/codex_router/) | Extends Codex CLI with multi-provider access and preference-aligned routing for coding tasks |

View file

@ -0,0 +1,46 @@
# Affinity Testing (OpenAI SDK)
Quick demo to verify model affinity pinning using standard OpenAI SDK calls against Plano.
## 1) Start Plano with affinity config
```bash
export OPENAI_API_KEY=<your-key>
planoai up demos/affinity_testing/config.yaml
```
`config.yaml` enables the session-affinity cache settings:
```yaml
routing:
session_ttl_seconds: 600
session_max_entries: 1000
```
## 2) Run the demo script
```bash
python demos/affinity_testing/demo.py
```
The script uses this exact SDK pattern:
```python
from openai import OpenAI
import uuid
client = OpenAI(base_url="http://localhost:12000/v1", api_key="EMPTY")
affinity_id = str(uuid.uuid4())
response = client.chat.completions.create(
model="gpt-5.2",
messages=messages,
extra_headers={"X-Model-Affinity": affinity_id},
)
```
## Expected behavior
- Call 1 and call 2 share the same affinity ID and should stay on the same selected model.
- Call 3 uses a new affinity ID and should be free to route independently.

View file

@ -0,0 +1,31 @@
version: v0.4.0
listeners:
- type: model
name: model_listener
port: 12000
model_providers:
- model: openai/gpt-5.2
access_key: $OPENAI_API_KEY
default: true
- model: openai/gpt-5.2-chat-latest
access_key: $OPENAI_API_KEY
routing_preferences:
- name: complex_reasoning
description: complex reasoning tasks, multi-step analysis, or detailed explanations
models:
- openai/gpt-5.2
- openai/gpt-5.2-chat-latest
- name: code_generation
description: generating new code, writing functions, or creating boilerplate
models:
- openai/gpt-5.2-chat-latest
- openai/gpt-5.2
routing:
session_ttl_seconds: 600
session_max_entries: 1000

View file

@ -0,0 +1,56 @@
import uuid
from openai import OpenAI
client = OpenAI(base_url="http://localhost:12000/v1", api_key="EMPTY")
def chat_with_affinity(messages: list[dict[str, str]], affinity_id: str):
    """Send one chat completion request tagged with an affinity id.

    Deliberately mirrors the plain OpenAI SDK call pattern this demo
    documents; the X-Model-Affinity header is the only non-standard part.
    """
    return client.chat.completions.create(
        model="gpt-5.2",
        messages=messages,
        extra_headers={"X-Model-Affinity": affinity_id},
    )
def show(label: str, response):
    """Print a short, single-screen summary of a chat completion.

    Shows the label, the model that answered, and the first 120 characters
    of the reply with newlines flattened to spaces.
    """
    text = (response.choices[0].message.content or "")[:120]
    flattened = text.replace("\n", " ")
    print(label)
    print(f"  model: {response.model}")
    print(f"  text : {flattened}")
    print()
def main():
    """Run three chat calls to demonstrate affinity-based model pinning."""
    affinity_id = str(uuid.uuid4())
    print("== Affinity Demo (OpenAI SDK) ==")
    print(f"affinity id: {affinity_id}")
    print()

    code_messages = [
        {"role": "user", "content": "Write Python code for binary search."},
    ]
    reasoning_messages = [
        {
            "role": "user",
            "content": "Explain whether free will can exist with determinism.",
        },
    ]

    # Calls 1 and 2 share one affinity id: the second call should stay on
    # whatever model the first call was routed to.
    show(
        "1) first call (new affinity, routes and caches)",
        chat_with_affinity(code_messages, affinity_id),
    )
    show(
        "2) second call (same affinity, should stay pinned)",
        chat_with_affinity(reasoning_messages, affinity_id),
    )

    # Call 3 carries a fresh id, so it is free to route independently.
    show(
        "3) third call (new affinity, fresh routing)",
        chat_with_affinity(reasoning_messages, str(uuid.uuid4())),
    )

    print("If 1 and 2 use the same model, affinity pinning is working.")


if __name__ == "__main__":
    main()