diff --git a/crates/brightstaff/src/handlers/pipeline_processor.rs b/crates/brightstaff/src/handlers/pipeline_processor.rs index dfa710b6..c7a2e458 100644 --- a/crates/brightstaff/src/handlers/pipeline_processor.rs +++ b/crates/brightstaff/src/handlers/pipeline_processor.rs @@ -1,7 +1,7 @@ use std::collections::HashMap; use common::configuration::{Agent, AgentPipeline}; -use common::consts::ARCH_UPSTREAM_HOST_HEADER; +use common::consts::{ARCH_UPSTREAM_HOST_HEADER, ENVOY_RETRY_HEADER}; use hermesllm::apis::openai::{ChatCompletionsRequest, Message}; use hyper::header::HeaderMap; use tracing::{debug, warn}; @@ -112,6 +112,11 @@ impl PipelineProcessor { .map_err(|_| PipelineError::AgentNotFound(agent.name.clone()))?, ); + agent_headers.insert( + ENVOY_RETRY_HEADER, + hyper::header::HeaderValue::from_str("3").unwrap(), + ); + let response = self .client .post(&self.llm_endpoint) @@ -160,6 +165,11 @@ impl PipelineProcessor { .map_err(|_| PipelineError::AgentNotFound(terminal_agent.name.clone()))?, ); + agent_headers.insert( + ENVOY_RETRY_HEADER, + hyper::header::HeaderValue::from_str("3").unwrap(), + ); + let response = self .client .post(&self.llm_endpoint) diff --git a/crates/common/src/consts.rs b/crates/common/src/consts.rs index 0eb5a036..14972485 100644 --- a/crates/common/src/consts.rs +++ b/crates/common/src/consts.rs @@ -29,3 +29,4 @@ pub const HALLUCINATION_TEMPLATE: &str = pub const OTEL_COLLECTOR_HTTP: &str = "opentelemetry_collector_http"; pub const OTEL_POST_PATH: &str = "/v1/traces"; pub const LLM_ROUTE_HEADER: &str = "x-arch-llm-route"; +pub const ENVOY_RETRY_HEADER: &str = "x-envoy-max-retries"; diff --git a/demos/use_cases/rag_agent/arch_config.yaml b/demos/use_cases/rag_agent/arch_config.yaml index efa03697..1a50f36d 100644 --- a/demos/use_cases/rag_agent/arch_config.yaml +++ b/demos/use_cases/rag_agent/arch_config.yaml @@ -44,7 +44,7 @@ listeners: - access_key: $OPENAI_API_KEY model: openai/gpt-4o-mini address: 0.0.0.0 - port: 9000 + port: 12000 tracing: random_sampling: 100 diff --git a/demos/use_cases/rag_agent/docker-compose.yaml b/demos/use_cases/rag_agent/docker-compose.yaml index 9828cd17..78b27316 100644 --- a/demos/use_cases/rag_agent/docker-compose.yaml +++ b/demos/use_cases/rag_agent/docker-compose.yaml @@ -6,3 +6,12 @@ services: - "16686:16686" - "4317:4317" - "4318:4318" + open-web-ui: + image: ghcr.io/open-webui/open-webui:main + restart: always + ports: + - "8080:8080" + environment: + - DEFAULT_MODEL=gpt-4o-mini + - ENABLE_OPENAI_API=true + - OPENAI_API_BASE_URL=http://host.docker.internal:8001/v1 diff --git a/demos/use_cases/rag_agent/src/rag_agent/content_builder_agent.py b/demos/use_cases/rag_agent/src/rag_agent/content_builder_agent.py index 54a162d5..6db2319f 100644 --- a/demos/use_cases/rag_agent/src/rag_agent/content_builder_agent.py +++ b/demos/use_cases/rag_agent/src/rag_agent/content_builder_agent.py @@ -18,7 +18,7 @@ logger = logging.getLogger(__name__) # Configuration for archgw LLM gateway -LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:9000/v1") +LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:12000/v1") RAG_MODEL = "gpt-4o-mini" # Initialize OpenAI client for archgw @@ -91,7 +91,7 @@ async def find_relevant_passages( logger.info(f"Calling archgw to find relevant passages for query: '{query}'") # Prepare extra headers if traceparent is provided - extra_headers = {} + extra_headers = {"x-envoy-max-retries": "3"} if traceparent: extra_headers["traceparent"] = traceparent diff --git a/demos/use_cases/rag_agent/src/rag_agent/query_rewriter_agent.py b/demos/use_cases/rag_agent/src/rag_agent/query_rewriter_agent.py index c0166d66..9820a274 100644 --- a/demos/use_cases/rag_agent/src/rag_agent/query_rewriter_agent.py +++ b/demos/use_cases/rag_agent/src/rag_agent/query_rewriter_agent.py @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) # Configuration for archgw LLM gateway -LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:9000/v1") +LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:12000/v1") QUERY_REWRITE_MODEL = "gpt-4o-mini" # Initialize OpenAI client for archgw @@ -50,7 +50,7 @@ async def rewrite_query_with_archgw( try: # Call archgw using OpenAI client - extra_headers = {} + extra_headers = {"x-envoy-max-retries": "3"} if traceparent_header: extra_headers["traceparent"] = traceparent_header logger.info(f"Calling archgw at {LLM_GATEWAY_ENDPOINT} to rewrite query") diff --git a/demos/use_cases/rag_agent/src/rag_agent/response_generator_agent.py b/demos/use_cases/rag_agent/src/rag_agent/response_generator_agent.py index f3f0c72d..f5967105 100644 --- a/demos/use_cases/rag_agent/src/rag_agent/response_generator_agent.py +++ b/demos/use_cases/rag_agent/src/rag_agent/response_generator_agent.py @@ -20,7 +20,7 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # Configuration for archgw LLM gateway -LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:9000/v1") +LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:12000/v1") RESPONSE_MODEL = "gpt-4o" # System prompt for response generation @@ -94,7 +94,7 @@ async def stream_chat_completions( ) # Prepare extra headers if traceparent is provided - extra_headers = {} + extra_headers = {"x-envoy-max-retries": "3"} if traceparent_header: extra_headers["traceparent"] = traceparent_header @@ -191,7 +191,7 @@ async def non_streaming_chat_completions( logger.info(f"Calling archgw at {LLM_GATEWAY_ENDPOINT} to generate response") # Prepare extra headers if traceparent is provided - extra_headers = {} + extra_headers = {"x-envoy-max-retries": "3"} if traceparent_header: extra_headers["traceparent"] = traceparent_header