diff --git a/arch/envoy.template.yaml b/arch/envoy.template.yaml index 8d6f528d..c18987fd 100644 --- a/arch/envoy.template.yaml +++ b/arch/envoy.template.yaml @@ -29,6 +29,7 @@ stats_config: - 180000 static_resources: listeners: + ## begin - legacy listeners - name: ingress_traffic address: socket_address: @@ -214,7 +215,10 @@ static_resources: typed_config: "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router - - name: egress_api_traffic + ## end - legacy listeners + + # Listener for outbound API traffic to services and clusters + - name: outbound_api_traffic address: socket_address: address: 0.0.0.0 @@ -236,11 +240,11 @@ static_resources: envoy_grpc: cluster_name: opentelemetry_collector timeout: 0.250s - service_name: egress_api_traffic + service_name: outbound_api_traffic random_sampling: value: {{ arch_tracing.random_sampling }} {% endif %} - stat_prefix: egress_api_traffic + stat_prefix: outbound_api_traffic codec_type: AUTO scheme_header_transformation: scheme_to_overwrite: https @@ -288,12 +292,16 @@ static_resources: typed_config: "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router - - name: agents_traffic + # Listeners for LLM agents + {% for listener in listeners %} + + {% if listener.agents %} + + - name: {{ listener.name | replace(" ", "_") }} address: socket_address: address: 0.0.0.0 - port_value: 8001 - traffic_direction: OUTBOUND + port_value: {{ listener.port }} filter_chains: - filters: - name: envoy.filters.network.http_connection_manager @@ -325,6 +333,10 @@ static_resources: path: "/var/log/access_llm.log" route_config: name: local_routes + request_headers_to_add: + - header: + key: "x-arch-agent-listener-name" + value: "{{ listener.name }}" virtual_hosts: - name: local_service domains: @@ -380,7 +392,141 @@ static_resources: typed_config: "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + {% endif %} + {% endfor %} + # Listeners for LLMs + {% for listener in listeners %} + + {% if listener.llm_providers %} + + - name: {{ listener.name | replace(" ", "_") }} + address: + socket_address: + address: {{ listener.address }} + port_value: {{ listener.port }} + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + {% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %} + generate_request_id: true + tracing: + provider: + name: envoy.tracers.opentelemetry + typed_config: + "@type": type.googleapis.com/envoy.config.trace.v3.OpenTelemetryConfig + grpc_service: + envoy_grpc: + cluster_name: opentelemetry_collector + timeout: 0.250s + service_name: egress_traffic_llm + random_sampling: + value: {{ arch_tracing.random_sampling }} + {% endif %} + stat_prefix: egress_traffic + codec_type: AUTO + scheme_header_transformation: + scheme_to_overwrite: https + access_log: + - name: envoy.access_loggers.file + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog + path: "/var/log/access_llm.log" + route_config: + name: local_routes + virtual_hosts: + - name: local_service + domains: + - "*" + routes: + {% for provider in listener.llm_providers %} + # if endpoint is set then use custom cluster for upstream llm + {% if provider.endpoint %} + {% set llm_cluster_name = provider.name %} + {% else %} + {% set llm_cluster_name = provider.provider_interface %} + {% endif %} + - match: + prefix: "/" + headers: + - name: "x-arch-llm-provider" + string_match: + exact: {{ llm_cluster_name }} + route: + auto_host_rewrite: true + cluster: {{ llm_cluster_name }} + timeout: 60s + {% endfor %} + - match: + prefix: "/" + direct_response: + status: 400 + body: + inline_string: "x-arch-llm-provider header not set, llm gateway cannot perform routing\n" + http_filters: + - name: envoy.filters.http.compressor + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.compressor.v3.Compressor + compressor_library: + name: envoy.compression.brotli.compressor + typed_config: + "@type": type.googleapis.com/envoy.extensions.compression.brotli.compressor.v3.Brotli + chunk_size: 8192 + - name: envoy.filters.http.compressor + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.compressor.v3.Compressor + compressor_library: + name: compress + typed_config: + "@type": type.googleapis.com/envoy.extensions.compression.gzip.compressor.v3.Gzip + memory_level: 3 + window_bits: 10 + - name: envoy.filters.http.wasm + typed_config: + "@type": type.googleapis.com/udpa.type.v1.TypedStruct + type_url: type.googleapis.com/envoy.extensions.filters.http.wasm.v3.Wasm + value: + config: + name: "http_config" + root_id: llm_gateway + configuration: + "@type": "type.googleapis.com/google.protobuf.StringValue" + value: | + {{ arch_llm_config | indent(32) }} + vm_config: + runtime: "envoy.wasm.runtime.v8" + code: + local: + filename: "/etc/envoy/proxy-wasm-plugins/llm_gateway.wasm" + - name: envoy.filters.http.decompressor + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.decompressor.v3.Decompressor + decompressor_library: + name: decompress + typed_config: + "@type": "type.googleapis.com/envoy.extensions.compression.gzip.decompressor.v3.Gzip" + chunk_size: 8192 + # If this ratio is set too low, then body data will not be decompressed completely. + max_inflate_ratio: 1000 + - name: envoy.filters.http.decompressor + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.decompressor.v3.Decompressor + decompressor_library: + name: envoy.compression.brotli.decompressor + typed_config: + "@type": type.googleapis.com/envoy.extensions.compression.brotli.decompressor.v3.Brotli + chunk_size: 8192 + - name: envoy.filters.http.router + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + + + {% endif %} + {% endfor %} + + # begin - legacy llm listeners - name: egress_traffic address: @@ -595,6 +741,7 @@ static_resources: typed_config: "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + # end - legacy llm listeners clusters: - name: arch diff --git a/arch/tools/cli/config_generator.py b/arch/tools/cli/config_generator.py index baae7e0c..a24a1530 100644 --- a/arch/tools/cli/config_generator.py +++ b/arch/tools/cli/config_generator.py @@ -126,6 +126,7 @@ def validate_and_render_schema(): model_name_keys = set() model_usage_name_keys = set() + # legacy listeners # check if type is array or object # if its dict its legacy format let's convert it to array prompt_gateway_listener = { @@ -344,6 +345,7 @@ def validate_and_render_schema(): "arch_tracing": arch_tracing, "local_llms": llms_with_endpoint, "agent_orchestrator": agent_orchestrator, + "listeners": config_yaml["listeners"].copy(), } rendered = template.render(data) diff --git a/crates/brightstaff/src/handlers/agent_chat_completions.rs b/crates/brightstaff/src/handlers/agent_chat_completions.rs index 8d86dea1..4d591c9c 100644 --- a/crates/brightstaff/src/handlers/agent_chat_completions.rs +++ b/crates/brightstaff/src/handlers/agent_chat_completions.rs @@ -35,12 +35,20 @@ pub async fn agent_chat( listeners: Arc>>, ) -> Result>, hyper::Error> { // find listener that is running at port 8001 for agents + let listener_name = request.headers().get("x-arch-agent-listener-name"); let listener = { let listeners = listeners.read().await; - listeners.iter().find(|l| l.port == 8001).cloned() + listeners.iter().find(|l| { + listener_name + .and_then(|name| name.to_str().ok()) + .map(|name| l.name == name) + .unwrap_or(false) + }).cloned() } .unwrap(); + info!("Handling request for listener: {}", listener.name); + let request_path = request.uri().path().to_string(); let mut request_headers = request.headers().clone(); let chat_request_bytes = request.collect().await?.to_bytes(); diff --git a/demos/use_cases/rag_agent/arch_config.yaml b/demos/use_cases/rag_agent/arch_config.yaml index 287d040a..f12577b5 100644 --- a/demos/use_cases/rag_agent/arch_config.yaml +++ b/demos/use_cases/rag_agent/arch_config.yaml @@ -42,8 +42,7 @@ listeners: - access_key: $OPENAI_API_KEY model: openai/gpt-4o-mini address: 0.0.0.0 - port: 12000 + port: 9000 tracing: random_sampling: 100 - trace_arch_internal: true diff --git a/demos/use_cases/rag_agent/src/rag_agent/content_builder_agent.py b/demos/use_cases/rag_agent/src/rag_agent/content_builder_agent.py index 98a73671..54a162d5 100644 --- a/demos/use_cases/rag_agent/src/rag_agent/content_builder_agent.py +++ b/demos/use_cases/rag_agent/src/rag_agent/content_builder_agent.py @@ -18,7 +18,7 @@ logger = logging.getLogger(__name__) # Configuration for archgw LLM gateway -LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:12000/v1") +LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:9000/v1") RAG_MODEL = "gpt-4o-mini" # Initialize OpenAI client for archgw diff --git a/demos/use_cases/rag_agent/src/rag_agent/query_rewriter_agent.py b/demos/use_cases/rag_agent/src/rag_agent/query_rewriter_agent.py index eda70794..c0166d66 100644 --- a/demos/use_cases/rag_agent/src/rag_agent/query_rewriter_agent.py +++ b/demos/use_cases/rag_agent/src/rag_agent/query_rewriter_agent.py @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) # Configuration for archgw LLM gateway -LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:12000/v1") +LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:9000/v1") QUERY_REWRITE_MODEL = "gpt-4o-mini" # Initialize OpenAI client for archgw diff --git a/demos/use_cases/rag_agent/src/rag_agent/response_generator_agent.py b/demos/use_cases/rag_agent/src/rag_agent/response_generator_agent.py index ce17d7a3..3faaf4ef 100644 --- a/demos/use_cases/rag_agent/src/rag_agent/response_generator_agent.py +++ b/demos/use_cases/rag_agent/src/rag_agent/response_generator_agent.py @@ -14,7 +14,7 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # Configuration for archgw LLM gateway -LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:12000/v1") +LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:9000/v1") RESPONSE_MODEL = "gpt-4o" # Initialize OpenAI client for archgw