wip

2026-06-17 15:25:17 +02:00 · 2024-11-07 10:13:18 -08:00 · 2024-11-07 10:13:18 -08:00 · 7b99379275
commit 7b99379275
parent 23ed25342a
11 changed files with 761 additions and 392 deletions
--- a/api_prompt_gateway.rest
+++ b/api_prompt_gateway.rest
@ -13,6 +13,20 @@ Content-Type: application/json
  ]
 }

+### prompt gateway request default target
+POST {{prompt_endpoint}}/v1/chat/completions HTTP/1.1
+Content-Type: application/json
+
+{
+  "messages": [
+    {
+      "role": "user",
+      "content": "hello"
+    }
+  ]
+}
+
+
 ### prompt gateway request (streaming)
 POST {{prompt_endpoint}}/v1/chat/completions HTTP/1.1
 Content-Type: application/json
--- a/arch/Dockerfile
+++ b/arch/Dockerfile
@ -26,4 +26,4 @@ COPY arch/envoy.template.yaml .
 COPY arch/arch_config_schema.yaml .


-ENTRYPOINT ["sh", "-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:trace"]
+ENTRYPOINT ["sh", "-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:debug"]
--- a/arch/docker-compose.dev.yaml
+++ b/arch/docker-compose.dev.yaml
@ -3,6 +3,7 @@ services:
    image: katanemo/archgw:latest
    ports:
      - "10000:10000"
+      - "10001:10001"
      - "11000:11000"
      - "12000:12000"
      - "19901:9901"
--- a/arch/docker-compose.e2e.yaml
+++ b/arch/docker-compose.e2e.yaml
@ -3,6 +3,7 @@ services:
    image: katanemo/archgw:latest
    ports:
      - "10000:10000"
+      - "10001:10001"
      - "11000:11000"
      - "12000:12000"
      - "19901:9901"
--- a/arch/docker-compose.yaml
+++ b/arch/docker-compose.yaml
@ -3,6 +3,7 @@ services:
    image: katanemo/archgw:latest
    ports:
      - "10000:10000"
+      - "10001:10001"
      - "11000:11000"
      - "12000:12000"
      - "19901:9901"
--- a/arch/envoy.template.yaml
+++ b/arch/envoy.template.yaml
@ -98,12 +98,18 @@ static_resources:
                      domains:
                        - "*"
                      routes:
+                      {% for provider in arch_llm_providers %}
                        - match:
                            prefix: "/"
+                            headers:
+                              - name: "x-arch-llm-provider"
+                                string_match:
+                                  exact: {{ provider.name }}
                          route:
                            auto_host_rewrite: true
-                            cluster: arch_llm_listener
+                            cluster: {{ provider.provider }}
                            timeout: 60s
+                      {% endfor %}
                http_filters:
                  - name: envoy.filters.http.compressor
                    typed_config:
@ -131,6 +137,23 @@ static_resources:
                            code:
                              local:
                                filename: "/etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm"
+                  - name: envoy.filters.http.wasm
+                    typed_config:
+                      "@type": type.googleapis.com/udpa.type.v1.TypedStruct
+                      type_url: type.googleapis.com/envoy.extensions.filters.http.wasm.v3.Wasm
+                      value:
+                        config:
+                          name: "http_config"
+                          root_id: llm_gateway
+                          configuration:
+                            "@type": "type.googleapis.com/google.protobuf.StringValue"
+                            value: |
+                                {{ arch_llm_config | indent(32) }}
+                          vm_config:
+                            runtime: "envoy.wasm.runtime.v8"
+                            code:
+                              local:
+                                filename: "/etc/envoy/proxy-wasm-plugins/llm_gateway.wasm"
                  - name: envoy.filters.http.decompressor
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.decompressor.v3.Decompressor
@ -146,7 +169,6 @@ static_resources:
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router

-
    - name: arch_internal
      address:
        socket_address:
@ -457,22 +479,6 @@ static_resources:
                      port_value: 10001
                  hostname: arch_prompt_gateway_listener

-    - name: arch_llm_listener
-      connect_timeout: 5s
-      type: LOGICAL_DNS
-      dns_lookup_family: V4_ONLY
-      lb_policy: ROUND_ROBIN
-      load_assignment:
-        cluster_name: arch_llm_listener
-        endpoints:
-          - lb_endpoints:
-              - endpoint:
-                  address:
-                    socket_address:
-                      address: 0.0.0.0
-                      port_value: 12000
-                  hostname: arch_llm_listener
-
 {% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %}
    - name: opentelemetry_collector
      type: STRICT_DNS
--- a/crates/prompt_gateway/src/stream_context.rs
+++ b/crates/prompt_gateway/src/stream_context.rs
@ -1095,7 +1095,17 @@ impl StreamContext {
        {
            let default_target_response_str = if self.streaming_response {
                let chat_completion_response =
-                    serde_json::from_slice::<ChatCompletionsResponse>(&body).unwrap();
+                    match serde_json::from_slice::<ChatCompletionsResponse>(&body) {
+                        Ok(chat_completion_response) => chat_completion_response,
+                        Err(e) => {
+                            warn!(
+                                "error deserializing default target response: {}, body str: {}",
+                                e,
+                                String::from_utf8_lossy(&body) // lossy: never panic on bad UTF-8
+                            );
+                            return self.send_server_error(ServerError::Deserialization(e), None);
+                        }
+                    };

                let chunks = vec![
                    ChatCompletionStreamResponse::new(
--- a/demos/weather_forecast/arch_config.yaml
+++ b/demos/weather_forecast/arch_config.yaml
@ -74,4 +74,4 @@ prompt_targets:

 tracing:
  random_sampling: 100
-  # trace_arch: true
+  trace_arch: true
--- a/model_server/app/main.py
+++ b/model_server/app/main.py
@ -1,3 +1,4 @@
+import os
 import time
 import torch
 import app.commons.utilities as utils
@ -15,12 +16,38 @@ from app.function_calling.model_utils import (
 )
 from unittest.mock import patch

+from opentelemetry import trace
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
+from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
+from opentelemetry.sdk.trace.export import BatchSpanProcessor
+from opentelemetry.sdk.resources import Resource
+
+resource = Resource.create(
+    {
+        "service.name": "model-server",
+    }
+)
+
+# Initialize the tracer provider
+trace.set_tracer_provider(TracerProvider(resource=resource))
+tracer = trace.get_tracer(__name__)
+
+
 logger = utils.get_model_server_logger()

 logger.info(f"Ready to serve traffic. available device: {glb.DEVICE}")

 app = FastAPI()

+FastAPIInstrumentor().instrument_app(app)
+
+# Configure the OTLP exporter (Jaeger, Zipkin, etc.); endpoint env var: OLTP_HOST (sic)
+otlp_exporter = OTLPSpanExporter(
+    endpoint=os.getenv("OLTP_HOST", "http://localhost:4317")  # noqa: F821
+)
+trace.get_tracer_provider().add_span_processor(BatchSpanProcessor(otlp_exporter))
+

 class EmbeddingRequest(BaseModel):
    input: str
--- a/model_server/poetry.lock
+++ b/model_server/poetry.lock
--- a/model_server/pyproject.toml
+++ b/model_server/pyproject.toml
@ -32,6 +32,10 @@ onnxruntime = "1.19.2"
 httpx = "*"
 pytest-asyncio = "*"
 pytest = "*"
+opentelemetry-api = "^1.28.0"
+opentelemetry-sdk = "^1.28.0"
+opentelemetry-exporter-otlp = "^1.28.0"
+opentelemetry-instrumentation-fastapi = "^0.49b0"

 [tool.poetry.scripts]
 archgw_modelserver = "app.cli:run_server"