fix: make upstream timeout configurable and increase default to 300s (#787)

Hardcoded 30s timeouts in the Envoy config (the agent route default and
the retry per_try_timeout) caused premature termination of long-running
LLM requests (tool-use, agentic workflows). Make timeouts configurable
via the upstream_timeout_ms override and default to 300s.
This commit is contained in:
Syed Hashmi 2026-03-04 18:53:32 -06:00
parent 0c7b999770
commit 00bd11061e
2 changed files with 12 additions and 3 deletions

View file

@ -466,6 +466,15 @@ def validate_and_render_schema():
"upstream_tls_ca_path", "/etc/ssl/certs/ca-certificates.crt"
)
upstream_timeout_ms = overrides.get("upstream_timeout_ms")
if upstream_timeout_ms is not None:
timeout_s = f"{int(upstream_timeout_ms) // 1000}s"
llm_gateway["timeout"] = timeout_s
prompt_gateway["timeout"] = timeout_s
for listener in listeners:
if listener.get("type") == "agent" and "timeout" not in listener:
listener["timeout"] = timeout_s
data = {
"prompt_gateway_listener": prompt_gateway,
"llm_gateway_listener": llm_gateway,

View file

@ -336,7 +336,7 @@ static_resources:
auto_host_rewrite: true
prefix_rewrite: "/agents/"
cluster: bright_staff
timeout: {{ listener.timeout | default('30s') }}
timeout: {{ listener.timeout | default('300s') }}
http_filters:
- name: envoy.filters.http.compressor
typed_config:
@ -517,12 +517,12 @@ static_resources:
route:
auto_host_rewrite: true
cluster: {{ llm_cluster_name }}
timeout: 300s
timeout: {{ llm_gateway_listener.timeout }}
{% if llm_gateway_listener.max_retries %}
retry_policy:
retry_on: "5xx,connect-failure,refused-stream,reset,retriable-status-codes"
num_retries: {{ llm_gateway_listener.max_retries }}
per_try_timeout: 30s
per_try_timeout: {{ llm_gateway_listener.timeout }}
retriable_status_codes: [429, 500, 502, 503, 504]
retry_back_off:
base_interval: 0.5s