Add support for streaming and fix a few issues (see description) (#202)

2026-07-02 15:51:02 +02:00 · 2024-10-28 20:05:06 -04:00 · 2024-10-28 20:05:06 -04:00 · 662a840ac5
commit 662a840ac5
parent 29ff8da60f
45 changed files with 2266 additions and 477 deletions
--- a/arch/Dockerfile
+++ b/arch/Dockerfile
@ -12,6 +12,9 @@ FROM envoyproxy/envoy:v1.31-latest as envoy

 #Build config generator, so that we have a single build image for both Rust and Python
 FROM python:3-slim as arch
+
+RUN apt-get update && apt-get install -y gettext-base && apt-get clean && rm -rf /var/lib/apt/lists/*
+
 COPY --from=builder /arch/target/wasm32-wasi/release/prompt_gateway.wasm /etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm
 COPY --from=builder /arch/target/wasm32-wasi/release/llm_gateway.wasm /etc/envoy/proxy-wasm-plugins/llm_gateway.wasm
 COPY --from=envoy /usr/local/bin/envoy /usr/local/bin/envoy
@ -22,4 +25,5 @@ COPY arch/tools/cli/config_generator.py .
 COPY arch/envoy.template.yaml .
 COPY arch/arch_config_schema.yaml .

-CMD ["sh", "-c", "python config_generator.py && envoy -c /etc/envoy/envoy.yaml --component-log-level wasm:debug"]
+
+ENTRYPOINT ["sh", "-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:debug"]
--- a/arch/arch_config_schema.yaml
+++ b/arch/arch_config_schema.yaml
@ -160,4 +160,3 @@ required:
  - version
  - listener
  - llm_providers
-  - prompt_targets
--- a/arch/build_filter_image.sh
+++ b/arch/build_filter_image.sh
@ -1 +1 @@
-docker build -t archgw .. -f Dockerfile
+docker build  -f Dockerfile .. -t katanemo/archgw
--- a/arch/docker-compose.dev.yaml
+++ b/arch/docker-compose.dev.yaml
@ -1,6 +1,6 @@
 services:
  archgw:
-    image: archgw:latest
+    image: katanemo/archgw:latest
    ports:
      - "10000:10000"
      - "11000:11000"
@ -10,9 +10,13 @@ services:
      - ${ARCH_CONFIG_FILE:-../demos/function_calling/arch_config.yaml}:/config/arch_config.yaml
      - /etc/ssl/cert.pem:/etc/ssl/cert.pem
      - ./envoy.template.yaml:/config/envoy.template.yaml
-      - ./target/wasm32-wasi/release/intelligent_prompt_gateway.wasm:/etc/envoy/proxy-wasm-plugins/intelligent_prompt_gateway.wasm
      - ./arch_config_schema.yaml:/config/arch_config_schema.yaml
-      - ./tools/config_generator.py:/config/config_generator.py
-      - ./arch_logs:/var/log/
-    env_file:
-      - stage.env
+      - ./tools/cli/config_generator.py:/config/config_generator.py
+      - ../crates/target/wasm32-wasi/release/llm_gateway.wasm:/etc/envoy/proxy-wasm-plugins/llm_gateway.wasm
+      - ../crates/target/wasm32-wasi/release/prompt_gateway.wasm:/etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm
+      - ~/archgw_logs:/var/log/
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+    environment:
+      - OPENAI_API_KEY=${OPENAI_API_KEY:?error}
+      - MISTRAL_API_KEY=${MISTRAL_API_KEY:?error}
--- a/arch/docker-compose.e2e.yaml
+++ b/arch/docker-compose.e2e.yaml
@ -0,0 +1,17 @@
+services:
+  archgw:
+    image: katanemo/archgw:latest
+    ports:
+      - "10000:10000"
+      - "11000:11000"
+      - "12000:12000"
+      - "19901:9901"
+    volumes:
+      - ${ARCH_CONFIG_FILE:-../demos/function_calling/arch_config.yaml}:/config/arch_config.yaml
+      - /etc/ssl/cert.pem:/etc/ssl/cert.pem
+      - ~/archgw_logs:/var/log/
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+    environment:
+      - OPENAI_API_KEY=${OPENAI_API_KEY:?error}
+      - MISTRAL_API_KEY=${MISTRAL_API_KEY:?error}
--- a/arch/docker-compose.yaml
+++ b/arch/docker-compose.yaml
@ -7,7 +7,7 @@ services:
      - "12000:12000"
      - "19901:9901"
    volumes:
-      - ${ARCH_CONFIG_FILE:-./demos/function_calling/arch_confg.yaml}:/config/arch_config.yaml
+      - ${ARCH_CONFIG_FILE:-../demos/function_calling/arch_config.yaml}:/config/arch_config.yaml
      - /etc/ssl/cert.pem:/etc/ssl/cert.pem
      - ~/archgw_logs:/var/log/
    env_file:
--- a/arch/envoy.template.yaml
+++ b/arch/envoy.template.yaml
@ -52,6 +52,15 @@ static_resources:
                            cluster: arch_llm_listener
                            timeout: 60s
                http_filters:
+                  - name: envoy.filters.http.compressor
+                    typed_config:
+                      "@type": type.googleapis.com/envoy.extensions.filters.http.compressor.v3.Compressor
+                      compressor_library:
+                        name: compress
+                        typed_config:
+                          "@type": type.googleapis.com/envoy.extensions.compression.gzip.compressor.v3.Gzip
+                          memory_level: 3
+                          window_bits: 10
                  - name: envoy.filters.http.wasm
                    typed_config:
                      "@type": type.googleapis.com/udpa.type.v1.TypedStruct
@ -69,6 +78,17 @@ static_resources:
                            code:
                              local:
                                filename: "/etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm"
+                  - name: envoy.filters.http.decompressor
+                    typed_config:
+                      "@type": type.googleapis.com/envoy.extensions.filters.http.decompressor.v3.Decompressor
+                      decompressor_library:
+                        name: decompress
+                        typed_config:
+                          "@type": "type.googleapis.com/envoy.extensions.compression.gzip.decompressor.v3.Gzip"
+                          window_bits: 9
+                          chunk_size: 8192
+                          # If this ratio is set too low, then body data will not be decompressed completely.
+                          max_inflate_ratio: 1000
                  - name: envoy.filters.http.router
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
@ -187,6 +207,12 @@ static_resources:
                      domains:
                        - "*"
                      routes:
+                        - match:
+                            prefix: "/healthz"
+                          route:
+                            auto_host_rewrite: true
+                            cluster: openai
+                            timeout: 60s
                      {% for provider in arch_llm_providers %}
                        - match:
                            prefix: "/"
@ -206,6 +232,15 @@ static_resources:
                            body:
                              inline_string: "x-arch-llm-provider header not set, llm gateway cannot perform routing\n"
                http_filters:
+                  - name: envoy.filters.http.compressor
+                    typed_config:
+                      "@type": type.googleapis.com/envoy.extensions.filters.http.compressor.v3.Compressor
+                      compressor_library:
+                        name: compress
+                        typed_config:
+                          "@type": type.googleapis.com/envoy.extensions.compression.gzip.compressor.v3.Gzip
+                          memory_level: 3
+                          window_bits: 10
                  - name: envoy.filters.http.wasm
                    typed_config:
                      "@type": type.googleapis.com/udpa.type.v1.TypedStruct
@ -223,6 +258,17 @@ static_resources:
                            code:
                              local:
                                filename: "/etc/envoy/proxy-wasm-plugins/llm_gateway.wasm"
+                  - name: envoy.filters.http.decompressor
+                    typed_config:
+                      "@type": type.googleapis.com/envoy.extensions.filters.http.decompressor.v3.Decompressor
+                      decompressor_library:
+                        name: decompress
+                        typed_config:
+                          "@type": "type.googleapis.com/envoy.extensions.compression.gzip.decompressor.v3.Gzip"
+                          window_bits: 9
+                          chunk_size: 8192
+                          # If this ratio is set too low, then body data will not be decompressed completely.
+                          max_inflate_ratio: 1000
                  - name: envoy.filters.http.router
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
--- a/arch/tools/cli/config_generator.py
+++ b/arch/tools/cli/config_generator.py
@ -47,13 +47,14 @@ def validate_and_render_schema():
    config_schema_yaml = yaml.safe_load(arch_config_schema)
    inferred_clusters = {}

-    for prompt_target in config_yaml["prompt_targets"]:
-        name = prompt_target.get("endpoint", {}).get("name", "")
-        if name not in inferred_clusters:
-            inferred_clusters[name] = {
-                "name": name,
-                "port": 80,  # default port
-            }
+    if "prompt_targets" in config_yaml:
+        for prompt_target in config_yaml["prompt_targets"]:
+            name = prompt_target.get("endpoint", {}).get("name", "")
+            if name not in inferred_clusters:
+                inferred_clusters[name] = {
+                    "name": name,
+                    "port": 80,  # default port
+                }

    print(inferred_clusters)
    endpoints = config_yaml.get("endpoints", {})