diff --git a/crates/brightstaff/src/router/http.rs b/crates/brightstaff/src/router/http.rs index cdd700d1..ad1b711c 100644 --- a/crates/brightstaff/src/router/http.rs +++ b/crates/brightstaff/src/router/http.rs @@ -1,5 +1,5 @@ +use hermesllm::apis::openai::ChatCompletionsResponse; use hyper::header; -use serde::Deserialize; use thiserror::Error; use tracing::warn; @@ -12,23 +12,8 @@ pub enum HttpError { Json(serde_json::Error, String), } -#[derive(Debug, Deserialize)] -struct RouterChatCompletionResponse { - choices: Vec<RouterChoice>, -} - -#[derive(Debug, Deserialize)] -struct RouterChoice { - message: RouterMessage, -} - -#[derive(Debug, Deserialize)] -struct RouterMessage { - content: Option<String>, -} - /// Sends a POST request to the given URL and extracts the text content -/// from the first choice of a chat-completions-like response. +/// from the first choice of the `ChatCompletionsResponse`. /// /// Returns `Some((content, elapsed))` on success, or `None` if the response /// had no choices or the first choice had no content. @@ -45,7 +30,7 @@ pub async fn post_and_extract_content( let body = res.text().await?; let elapsed = start_time.elapsed(); - let response: ChatCompletionsResponse = serde_json::from_str(&body).map_err(|err| { + let response: ChatCompletionsResponse = serde_json::from_str(&body).map_err(|err| { warn!(error = %err, body = %body, "failed to parse json response"); HttpError::Json(err, format!("Failed to parse JSON: {}", body)) })?; diff --git a/demos/README.md b/demos/README.md index 6e467a33..883c3e51 100644 --- a/demos/README.md +++ b/demos/README.md @@ -15,6 +15,7 @@ This directory contains demos showcasing Plano's capabilities as an AI-native pr |------|-------------| | [Preference-Based Routing](llm_routing/preference_based_routing/) | Routes prompts to LLMs based on user-defined preferences and task type (e.g. code generation vs. 
understanding) | | [Model Alias Routing](llm_routing/model_alias_routing/) | Maps semantic aliases (`arch.summarize.v1`) to provider-specific models for centralized governance | +| [Affinity Testing](affinity_testing/) | Verifies `X-Model-Affinity` pinning behavior using standard OpenAI SDK requests | | [Claude Code Router](llm_routing/claude_code_router/) | Extends Claude Code with multi-provider access and preference-aligned routing for coding tasks | | [Codex Router](llm_routing/codex_router/) | Extends Codex CLI with multi-provider access and preference-aligned routing for coding tasks | diff --git a/demos/affinity_testing/README.md b/demos/affinity_testing/README.md new file mode 100644 index 00000000..2056c873 --- /dev/null +++ b/demos/affinity_testing/README.md @@ -0,0 +1,46 @@ +# Affinity Testing (OpenAI SDK) + +Quick demo to verify model affinity pinning using standard OpenAI SDK calls against Plano. + +## 1) Start Plano with affinity config + +```bash +export OPENAI_API_KEY=<your-openai-api-key> + +planoai up demos/affinity_testing/config.yaml +``` + +`config.yaml` enables affinity cache settings: + +```yaml +routing: + session_ttl_seconds: 600 + session_max_entries: 1000 +``` + +## 2) Run the demo script + +```bash +python demos/affinity_testing/demo.py +``` + +The script uses this exact SDK pattern: + +```python +from openai import OpenAI +import uuid + +client = OpenAI(base_url="http://localhost:12000/v1", api_key="EMPTY") +affinity_id = str(uuid.uuid4()) + +response = client.chat.completions.create( + model="gpt-5.2", + messages=messages, + extra_headers={"X-Model-Affinity": affinity_id}, +) +``` + +## Expected behavior + +- Call 1 and call 2 share the same affinity ID and should stay on the same selected model. +- Call 3 uses a new affinity ID and should be free to route independently. 
diff --git a/demos/affinity_testing/config.yaml b/demos/affinity_testing/config.yaml new file mode 100644 index 00000000..5773d286 --- /dev/null +++ b/demos/affinity_testing/config.yaml @@ -0,0 +1,31 @@ +version: v0.4.0 + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + - model: openai/gpt-5.2 + access_key: $OPENAI_API_KEY + default: true + + - model: openai/gpt-5.2-chat-latest + access_key: $OPENAI_API_KEY + +routing_preferences: + - name: complex_reasoning + description: complex reasoning tasks, multi-step analysis, or detailed explanations + models: + - openai/gpt-5.2 + - openai/gpt-5.2-chat-latest + + - name: code_generation + description: generating new code, writing functions, or creating boilerplate + models: + - openai/gpt-5.2-chat-latest + - openai/gpt-5.2 + +routing: + session_ttl_seconds: 600 + session_max_entries: 1000 diff --git a/demos/affinity_testing/demo.py b/demos/affinity_testing/demo.py new file mode 100644 index 00000000..9c3a294e --- /dev/null +++ b/demos/affinity_testing/demo.py @@ -0,0 +1,56 @@ +import uuid + +from openai import OpenAI + +client = OpenAI(base_url="http://localhost:12000/v1", api_key="EMPTY") + + +def chat_with_affinity(messages: list[dict[str, str]], affinity_id: str): + # Intentionally matches the OpenAI SDK usage expected by this demo. 
+ response = client.chat.completions.create( + model="gpt-5.2", + messages=messages, + extra_headers={"X-Model-Affinity": affinity_id}, + ) + return response + + +def show(label: str, response): + content = response.choices[0].message.content or "" + print(f"{label}") + print(f" model: {response.model}") + print(f" text : {content[:120].replace(chr(10), ' ')}") + print() + + +def main(): + affinity_id = str(uuid.uuid4()) + print("== Affinity Demo (OpenAI SDK) ==") + print(f"affinity id: {affinity_id}") + print() + + code_messages = [ + {"role": "user", "content": "Write Python code for binary search."}, + ] + reasoning_messages = [ + { + "role": "user", + "content": "Explain whether free will can exist with determinism.", + }, + ] + + first = chat_with_affinity(code_messages, affinity_id) + show("1) first call (new affinity, routes and caches)", first) + + second = chat_with_affinity(reasoning_messages, affinity_id) + show("2) second call (same affinity, should stay pinned)", second) + + new_affinity_id = str(uuid.uuid4()) + third = chat_with_affinity(reasoning_messages, new_affinity_id) + show("3) third call (new affinity, fresh routing)", third) + + print("If 1 and 2 use the same model, affinity pinning is working.") + + +if __name__ == "__main__": + main()