feat(demos): add affinity testing demo for model pinning

This commit is contained in:
Spherrrical 2026-04-09 11:18:06 -07:00
parent fbc247ab05
commit 92f6015165
5 changed files with 137 additions and 18 deletions

View file

@ -1,5 +1,5 @@
use hermesllm::apis::openai::ChatCompletionsResponse;
use hyper::header;
use serde::Deserialize;
use thiserror::Error;
use tracing::warn;
@ -12,23 +12,8 @@ pub enum HttpError {
Json(serde_json::Error, String),
}
/// Minimal deserialization target for a chat-completions-style JSON body.
/// Only the fields this module actually reads are modeled; serde ignores
/// the rest of the payload.
#[derive(Debug, Deserialize)]
struct RouterChatCompletionResponse {
    // Field names must match the JSON keys exactly — do not rename.
    choices: Vec<RouterChoice>,
}
/// One entry of the `choices` array; carries only the assistant message.
#[derive(Debug, Deserialize)]
struct RouterChoice {
    message: RouterMessage,
}
/// Assistant message payload. `content` is `Option` because a choice may
/// legitimately arrive with no text content (see the caller's None path).
#[derive(Debug, Deserialize)]
struct RouterMessage {
    content: Option<String>,
}
/// Sends a POST request to the given URL and extracts the text content
/// from the first choice of a chat-completions-like response.
/// from the first choice of the `ChatCompletionsResponse`.
///
/// Returns `Some((content, elapsed))` on success, or `None` if the response
/// had no choices or the first choice had no content.
@ -45,7 +30,7 @@ pub async fn post_and_extract_content(
let body = res.text().await?;
let elapsed = start_time.elapsed();
let response: RouterChatCompletionResponse = serde_json::from_str(&body).map_err(|err| {
let response: ChatCompletionsResponse = serde_json::from_str(&body).map_err(|err| {
warn!(error = %err, body = %body, "failed to parse json response");
HttpError::Json(err, format!("Failed to parse JSON: {}", body))
})?;

View file

@ -15,6 +15,7 @@ This directory contains demos showcasing Plano's capabilities as an AI-native pr
|------|-------------|
| [Preference-Based Routing](llm_routing/preference_based_routing/) | Routes prompts to LLMs based on user-defined preferences and task type (e.g. code generation vs. understanding) |
| [Model Alias Routing](llm_routing/model_alias_routing/) | Maps semantic aliases (`arch.summarize.v1`) to provider-specific models for centralized governance |
| [Affinity Testing](affinity_testing/) | Verifies `X-Model-Affinity` pinning behavior using standard OpenAI SDK requests |
| [Claude Code Router](llm_routing/claude_code_router/) | Extends Claude Code with multi-provider access and preference-aligned routing for coding tasks |
| [Codex Router](llm_routing/codex_router/) | Extends Codex CLI with multi-provider access and preference-aligned routing for coding tasks |

View file

@ -0,0 +1,46 @@
# Affinity Testing (OpenAI SDK)
Quick demo to verify model affinity pinning using standard OpenAI SDK calls against Plano.
## 1) Start Plano with affinity config
```bash
export OPENAI_API_KEY=<your-key>
planoai up demos/affinity_testing/config.yaml
```
`config.yaml` enables the session-affinity cache settings:
```yaml
routing:
session_ttl_seconds: 600
session_max_entries: 1000
```
## 2) Run the demo script
```bash
python demos/affinity_testing/demo.py
```
The script uses this exact SDK pattern:
```python
from openai import OpenAI
import uuid
client = OpenAI(base_url="http://localhost:12000/v1", api_key="EMPTY")
affinity_id = str(uuid.uuid4())
response = client.chat.completions.create(
model="gpt-5.2",
messages=messages,
extra_headers={"X-Model-Affinity": affinity_id},
)
```
## Expected behavior
- Call 1 and call 2 share the same affinity ID and should stay on the same selected model.
- Call 3 uses a new affinity ID and should be free to route independently.

View file

@ -0,0 +1,31 @@
version: v0.4.0
listeners:
- type: model
name: model_listener
port: 12000
model_providers:
- model: openai/gpt-5.2
access_key: $OPENAI_API_KEY
default: true
- model: openai/gpt-5.2-chat-latest
access_key: $OPENAI_API_KEY
routing_preferences:
- name: complex_reasoning
description: complex reasoning tasks, multi-step analysis, or detailed explanations
models:
- openai/gpt-5.2
- openai/gpt-5.2-chat-latest
- name: code_generation
description: generating new code, writing functions, or creating boilerplate
models:
- openai/gpt-5.2-chat-latest
- openai/gpt-5.2
routing:
session_ttl_seconds: 600
session_max_entries: 1000

View file

@ -0,0 +1,56 @@
import uuid
from openai import OpenAI
client = OpenAI(base_url="http://localhost:12000/v1", api_key="EMPTY")
def chat_with_affinity(messages: list[dict[str, str]], affinity_id: str):
    """Send one chat completion request tagged with an affinity id.

    Deliberately mirrors the plain OpenAI SDK call pattern this demo
    documents; the X-Model-Affinity header is the only non-standard part.
    """
    return client.chat.completions.create(
        model="gpt-5.2",
        messages=messages,
        extra_headers={"X-Model-Affinity": affinity_id},
    )
def show(label: str, response):
    """Print a short, single-screen summary of a chat completion.

    Shows the label, the model that answered, and the first 120 characters
    of the reply with newlines flattened to spaces.
    """
    text = (response.choices[0].message.content or "")[:120]
    flattened = text.replace("\n", " ")
    print(label)
    print(f"  model: {response.model}")
    print(f"  text : {flattened}")
    print()
def main():
    """Run three chat calls to demonstrate affinity-based model pinning."""
    affinity_id = str(uuid.uuid4())
    print("== Affinity Demo (OpenAI SDK) ==")
    print(f"affinity id: {affinity_id}")
    print()

    code_messages = [
        {"role": "user", "content": "Write Python code for binary search."},
    ]
    reasoning_messages = [
        {
            "role": "user",
            "content": "Explain whether free will can exist with determinism.",
        },
    ]

    # Calls 1 and 2 share one affinity id: the second call should stay on
    # whatever model the first call was routed to.
    show(
        "1) first call (new affinity, routes and caches)",
        chat_with_affinity(code_messages, affinity_id),
    )
    show(
        "2) second call (same affinity, should stay pinned)",
        chat_with_affinity(reasoning_messages, affinity_id),
    )

    # Call 3 carries a fresh id, so it is free to route independently.
    show(
        "3) third call (new affinity, fresh routing)",
        chat_with_affinity(reasoning_messages, str(uuid.uuid4())),
    )

    print("If 1 and 2 use the same model, affinity pinning is working.")


if __name__ == "__main__":
    main()