add retry

2026-06-23 15:38:07 +02:00 · 2025-09-17 17:53:19 -07:00 · 2025-09-17 17:53:19 -07:00 · a3f93de85d
commit a3f93de85d
parent 71658ddbd9
7 changed files with 29 additions and 9 deletions
--- a/demos/use_cases/rag_agent/arch_config.yaml
+++ b/demos/use_cases/rag_agent/arch_config.yaml
@@ -44,7 +44,7 @@ listeners:
      - access_key: $OPENAI_API_KEY
        model: openai/gpt-4o-mini
    address: 0.0.0.0
-    port: 9000
+    port: 12000

 tracing:
  random_sampling: 100
--- a/demos/use_cases/rag_agent/docker-compose.yaml
+++ b/demos/use_cases/rag_agent/docker-compose.yaml
@@ -6,3 +6,12 @@ services:
      - "16686:16686"
      - "4317:4317"
      - "4318:4318"
+  open-web-ui:
+    image: ghcr.io/open-webui/open-webui:main
+    restart: always
+    ports:
+      - "8080:8080"
+    environment:
+      - DEFAULT_MODEL=gpt-4o-mini
+      - ENABLE_OPENAI_API=true
+      - OPENAI_API_BASE_URL=http://host.docker.internal:8001/v1
--- a/demos/use_cases/rag_agent/src/rag_agent/content_builder_agent.py
+++ b/demos/use_cases/rag_agent/src/rag_agent/content_builder_agent.py
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)


 # Configuration for archgw LLM gateway
-LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:9000/v1")
+LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:12000/v1")
 RAG_MODEL = "gpt-4o-mini"

 # Initialize OpenAI client for archgw
@@ -91,7 +91,7 @@ async def find_relevant_passages(
        logger.info(f"Calling archgw to find relevant passages for query: '{query}'")

        # Prepare extra headers if traceparent is provided
-        extra_headers = {}
+        extra_headers = {"x-envoy-max-retries": "3"}
        if traceparent:
            extra_headers["traceparent"] = traceparent

--- a/demos/use_cases/rag_agent/src/rag_agent/query_rewriter_agent.py
+++ b/demos/use_cases/rag_agent/src/rag_agent/query_rewriter_agent.py
@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)


 # Configuration for archgw LLM gateway
-LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:9000/v1")
+LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:12000/v1")
 QUERY_REWRITE_MODEL = "gpt-4o-mini"

 # Initialize OpenAI client for archgw
@@ -50,7 +50,7 @@ async def rewrite_query_with_archgw(

    try:
        # Call archgw using OpenAI client
-        extra_headers = {}
+        extra_headers = {"x-envoy-max-retries": "3"}
        if traceparent_header:
            extra_headers["traceparent"] = traceparent_header
        logger.info(f"Calling archgw at {LLM_GATEWAY_ENDPOINT} to rewrite query")
--- a/demos/use_cases/rag_agent/src/rag_agent/response_generator_agent.py
+++ b/demos/use_cases/rag_agent/src/rag_agent/response_generator_agent.py
@@ -20,7 +20,7 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

 # Configuration for archgw LLM gateway
-LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:9000/v1")
+LLM_GATEWAY_ENDPOINT = os.getenv("LLM_GATEWAY_ENDPOINT", "http://localhost:12000/v1")
 RESPONSE_MODEL = "gpt-4o"

 # System prompt for response generation
@@ -94,7 +94,7 @@ async def stream_chat_completions(
        )

        # Prepare extra headers if traceparent is provided
-        extra_headers = {}
+        extra_headers = {"x-envoy-max-retries": "3"}
        if traceparent_header:
            extra_headers["traceparent"] = traceparent_header

@@ -191,7 +191,7 @@ async def non_streaming_chat_completions(
        logger.info(f"Calling archgw at {LLM_GATEWAY_ENDPOINT} to generate response")

        # Prepare extra headers if traceparent is provided
-        extra_headers = {}
+        extra_headers = {"x-envoy-max-retries": "3"}
        if traceparent_header:
            extra_headers["traceparent"] = traceparent_header