Introduce brightstaff, a new terminal service for LLM routing (#477)

Adil Hafeez 2025-05-19 09:59:22 -07:00 committed by GitHub
parent 1f95fac4af
commit 27c0f2fdce
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
36 changed files with 2817 additions and 150 deletions


@@ -0,0 +1,2 @@
# Usage-based LLM Routing
This demo shows how you can use usage preferences to route user prompts to the appropriate LLM. See [arch_config.yaml](arch_config.yaml) for details on how to define these preferences.
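A minimal client-side sketch of the request shape the gateway accepts (assuming the egress listener on port 12000 from the config below; `build_chat_request` is a hypothetical helper, not part of this repo). Since the gateway picks the upstream model, the client omits the `model` field:

```python
import json

# Hedged sketch: builds the OpenAI-style payload that the gateway's egress
# listener (port 12000 in arch_config.yaml) accepts. The gateway chooses the
# upstream model itself, so the client omits the "model" field.
def build_chat_request(prompt: str) -> str:
    payload = {"messages": [{"role": "user", "content": prompt}]}
    return json.dumps(payload)

body = build_chat_request("How should I keep a tab on my expenses?")
print(body)
```

This body would be POSTed to `http://localhost:12000/v1/chat/completions`, as the hurl tests below do.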


@@ -0,0 +1,39 @@
version: "0.1-beta"

routing:
  model: gpt-4o

listeners:
  egress_traffic:
    address: 0.0.0.0
    port: 12000
    message_format: openai
    timeout: 30s

llm_providers:
  - name: archgw-v1-router-model
    provider_interface: openai
    model: cotran2/llama-1b-4-26
    base_url: http://35.192.87.187:8000/v1

  - name: gpt-4o-mini
    provider_interface: openai
    access_key: $OPENAI_API_KEY
    model: gpt-4o-mini
    default: true

  - name: gpt-4o
    provider_interface: openai
    access_key: $OPENAI_API_KEY
    model: gpt-4o
    usage: Generating original content such as scripts, articles, or creative materials.

  - name: o4-mini
    provider_interface: openai
    access_key: $OPENAI_API_KEY
    model: o4-mini
    usage: Requesting topic ideas specifically related to personal finance and budgeting.

tracing:
  random_sampling: 100
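The `usage` fields above describe when each provider should be chosen; the router model maps a prompt to the best-matching description. A toy illustration of the idea (a naive word-overlap matcher, NOT the real archgw router model, which is an LLM):

```python
# Toy illustration (not the real archgw router): pick the provider whose
# "usage" description shares the most words with the user's prompt, falling
# back to the default provider when nothing matches.
providers = {
    "gpt-4o": "Generating original content such as scripts, articles, or creative materials.",
    "o4-mini": "Requesting topic ideas specifically related to personal finance and budgeting.",
}

def route(prompt: str, default: str = "gpt-4o-mini") -> str:
    words = set(prompt.lower().split())
    scores = {name: len(words & set(desc.lower().split()))
              for name, desc in providers.items()}
    best = max(scores, key=scores.get)
    return best if scores[best] > 0 else default

print(route("Give me topic ideas about budgeting my personal finance"))
```

A finance/budgeting prompt scores highest against the `o4-mini` usage description, mirroring what the hurl tests below assert for the actual gateway.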


@@ -0,0 +1,32 @@
services:
  chatbot_ui:
    build:
      context: ../../shared/chatbot_ui
      dockerfile: Dockerfile
    ports:
      - "18080:8080"
    environment:
      - CHAT_COMPLETION_ENDPOINT=http://host.docker.internal:12000/v1
    extra_hosts:
      - "host.docker.internal:host-gateway"
    volumes:
      - ./arch_config.yaml:/app/arch_config.yaml

  jaeger:
    build:
      context: ../../shared/jaeger
    ports:
      - "16686:16686"
      - "4317:4317"
      - "4318:4318"

  prometheus:
    build:
      context: ../../shared/prometheus

  grafana:
    build:
      context: ../../shared/grafana
    ports:
      - "3000:3000"


@@ -0,0 +1,18 @@
POST http://localhost:12000/v1/chat/completions
Content-Type: application/json
{
  "messages": [
    {
      "role": "user",
      "content": "I am running under debt, how should I keep a tab on my expenses?"
    }
  ]
}

HTTP 200
[Asserts]
header "content-type" == "application/json"
jsonpath "$.model" matches /^o4-mini/
jsonpath "$.usage" != null
jsonpath "$.choices[0].message.content" != null
jsonpath "$.choices[0].message.role" == "assistant"


@@ -0,0 +1,16 @@
POST http://localhost:12000/v1/chat/completions
Content-Type: application/json
{
  "messages": [
    {
      "role": "user",
      "content": "I am running under debt, how should I keep a tab on my expenses?"
    }
  ],
  "stream": true
}

HTTP 200
[Asserts]
header "content-type" matches /text\/event-stream/
body matches /^data: .*?o4-mini.*?\n/


@@ -0,0 +1,24 @@
@arch_llm_router_endpoint = http://35.192.87.187:8000

### test 1
POST {{arch_llm_router_endpoint}}/v1/chat/completions HTTP/1.1
Content-Type: application/json
{
  "model": "cotran2/llama-1b-4-26",
  "messages": [
    {
      "role": "user",
      "content": "You are an advanced Routing Assistant designed to select the optimal route based on user requests. \nYour task is to analyze conversations and match them to the most appropriate predefined route.\nReview the available routes config:\n\n# ROUTES CONFIG START\n- name: gpt-4o()\n description: \"complex reasoning problem, require multi step answer\\n\"\n- name: o4-mini()\n description: \"simple requests, basic fact retrieval, easy to answer\\n\"\n\n# ROUTES CONFIG END\n\nExamine the following conversation between a user and an assistant:\n\n# CONVERSATION START\n\nuser: Hello\nassistant: Hi! How can I assist you today?\nuser: List us presidents who are born in odd years and are still alive. Order them by their age and I also know what is their home city they were born. And what year they became president. Also give me summary of which president was the best for economy of the US.\n\n# CONVERSATION END\n\nYour goal is to identify the most appropriate route that matches the user's LATEST intent. Follow these steps:\n\n1. Carefully read and analyze the provided conversation, focusing on the user's latest request and the conversation scenario.\n2. Check if the user's request and scenario matches any of the routes in the routing configuration (focus on the description).\n3. Find the route that best matches.\n4. Use context clues from the entire conversation to determine the best fit.\n5. Return the best match possible. You only response the name of the route that best matches the user's request, use the exact name in the routes config.\n6. If no route relatively close to matches the user's latest intent or user last message is thank you or greeting, return an empty route ''. \n\n\n# OUTPUT FORMAT\nYour final output must follow this JSON format:\n{\n \"route\": \"route_name\" # The matched route name, or empty string '' if no match\n}\n\nBased on your analysis, provide only the JSON object as your final output with no additional text, explanations, or whitespace."
    }
  ]
}
### test 2
POST {{arch_llm_router_endpoint}}/v1/chat/completions HTTP/1.1
Content-Type: application/json
{"model":"cotran2/llama-1b-4-26","messages":[{"role":"user","content":"\nYou are an advanced Routing Assistant designed to select the optimal route based on user requests. \nYour task is to analyze conversations and match them to the most appropriate predefined route.\nReview the available routes config:\n\n# ROUTES CONFIG START\n- name: gpt-4o\n description: simple requests, basic fact retrieval, easy to answer\n- name: o4-mini()\n description: complex reasoning problem, require multi step answer\n# ROUTES CONFIG END\n\nExamine the following conversation between a user and an assistant:\n\n# CONVERSATION START\n[{\"role\":\"user\",\"content\":\"What is the capital of France?\"}]\n# CONVERSATION END\n\nYour goal is to identify the most appropriate route that matches the user's LATEST intent. Follow these steps:\n\n1. Carefully read and analyze the provided conversation, focusing on the user's latest request and the conversation scenario.\n2. Check if the user's request and scenario matches any of the routes in the routing configuration (focus on the description).\n3. Find the route that best matches.\n4. Use context clues from the entire conversation to determine the best fit.\n5. Return the best match possible. You only response the name of the route that best matches the user's request, use the exact name in the routes config.\n6. If no route relatively close to matches the user's latest intent or user last message is thank you or greeting, return an empty route ''. \n\n# OUTPUT FORMAT\nYour final output must follow this JSON format:\n{\n \"route\": \"route_name\" # The matched route name, or empty string '' if no match\n}\n\nBased on your analysis, provide only the JSON object as your final output with no additional text, explanations, or whitespace.\n"}],"stream":false}
### get model list
GET http://34.46.85.85:8000/v1/models HTTP/1.1
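Per the OUTPUT FORMAT contract in the prompts above, the router model replies with a JSON object of the form `{"route": "route_name"}`, using an empty string when nothing matches. A sketch of how a caller might consume that reply (`pick_target` is a hypothetical helper; the default-provider fallback mirrors `default: true` in arch_config.yaml):

```python
import json

def pick_target(router_output: str, default: str = "gpt-4o-mini") -> str:
    """Extract the routed provider name from the router model's JSON reply.

    Falls back to the default provider on an empty route (no match, or the
    user's last message was a greeting/thanks) or on malformed output.
    """
    try:
        route = json.loads(router_output).get("route", "")
    except json.JSONDecodeError:
        route = ""
    return route or default

print(pick_target('{"route": "gpt-4o"}'))
print(pick_target('{"route": ""}'))
```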