fix: make upstream timeout configurable and increase default to 300s (#787)

Hardcoded 30s timeouts in envoy config caused premature termination of long-running LLM requests (tool-use, agentic workflows). Make timeouts configurable via upstream_timeout_ms override and default to 300s.
2026-04-27 09:46:28 +02:00 · 2026-03-04 18:53:32 -06:00 · 2026-03-04 18:53:32 -06:00 · 00bd11061e
commit 00bd11061e
parent 0c7b999770
2 changed files with 12 additions and 3 deletions
--- a/config/envoy.template.yaml
+++ b/config/envoy.template.yaml
@@ -336,7 +336,7 @@ static_resources:
                            auto_host_rewrite: true
                            prefix_rewrite: "/agents/"
                            cluster: bright_staff
-                            timeout: {{ listener.timeout | default('30s') }}
+                            timeout: {{ listener.timeout | default('300s') }}
                http_filters:
                  - name: envoy.filters.http.compressor
                    typed_config:
@@ -517,12 +517,12 @@ static_resources:
                          route:
                            auto_host_rewrite: true
                            cluster: {{ llm_cluster_name }}
-                            timeout: 300s
+                            timeout: {{ llm_gateway_listener.timeout }}
                            {% if llm_gateway_listener.max_retries %}
                            retry_policy:
                              retry_on: "5xx,connect-failure,refused-stream,reset,retriable-status-codes"
                              num_retries: {{ llm_gateway_listener.max_retries }}
-                              per_try_timeout: 30s
+                              per_try_timeout: {{ llm_gateway_listener.timeout }}
                              retriable_status_codes: [429, 500, 502, 503, 504]
                              retry_back_off:
                                base_interval: 0.5s