feat: transparent openai responses api integration

2026-06-10 18:48:26 +02:00 · 2026-06-10 18:48:26 +02:00 · b28f175b61
commit b28f175b61
parent e7407b86b3
7 changed files with 1674 additions and 86 deletions
--- a/api/openai.py
+++ b/api/openai.py
@@ -46,6 +46,110 @@ from routing import choose_endpoint, decrement_usage
 router = APIRouter()


+async def create_chat_with_retries(oclient, send_params, endpoint, model, tracking_model):
+    """Call ``chat.completions.create`` with the router's resilience retries.
+
+    Encapsulates the recovery ladder shared by the chat-completions handler and
+    the translated ``/v1/responses`` path:
+
+      * ``does not support tools`` → retry without ``tools``/``tool_choice``
+      * llama-server context exhaustion → sliding-window message trim, with a
+        second retry that also strips ``tools``/``tool_choice``
+      * backend connection failure → mark (endpoint, model) unhealthy so the next
+        request reroutes, then re-raise
+      * ``image input is not supported`` → strip images and retry
+
+    On unrecoverable failure the endpoint usage counter is decremented and the
+    exception is re-raised. Returns the established async generator / response.
+    ``tracking_model`` keys the usage counter; ``model`` keys health / n_ctx state.
+    """
+    try:
+        async_gen = await oclient.chat.completions.create(**send_params)
+    except Exception as e:
+        _e_str = str(e)
+        _is_ctx_err = "exceed_context_size_error" in _e_str or "exceeds the available context size" in _e_str
+        print(f"[ochat] caught={type(e).__name__} ctx={_is_ctx_err} msg={_e_str[:120]}", flush=True)
+        if "does not support tools" in _e_str:
+            # No tool support — drop tool_choice too; a dangling tool_choice can be rejected without tools
+            print(f"[ochat] retry: no tools", flush=True)
+            try:
+                params_without_tools = {k: v for k, v in send_params.items() if k not in ("tools", "tool_choice")}
+                async_gen = await oclient.chat.completions.create(**params_without_tools)
+            except Exception:
+                await decrement_usage(endpoint, tracking_model)
+                raise
+        elif _is_ctx_err:
+            # Backend context limit hit — apply sliding-window trim (context-shift at message level)
+            err_body = getattr(e, "body", {}) or {}
+            err_detail = err_body.get("error", {}) if isinstance(err_body, dict) else {}
+            n_ctx_limit = err_detail.get("n_ctx", 0)
+            actual_tokens = err_detail.get("n_prompt_tokens", 0)
+            # Fallback: parse from string if body parsing yielded nothing (SDK may not parse llama-server errors)
+            if not n_ctx_limit:
+                import re as _re
+                _m = _re.search(r"'n_ctx':\s*(\d+)", _e_str)
+                if _m:
+                    n_ctx_limit = int(_m.group(1))
+                _m = _re.search(r"'n_prompt_tokens':\s*(\d+)", _e_str)
+                if _m:
+                    actual_tokens = int(_m.group(1))
+            print(f"[ctx-trim] n_ctx={n_ctx_limit} actual={actual_tokens}", flush=True)
+            if not n_ctx_limit:
+                await decrement_usage(endpoint, tracking_model)
+                raise
+            if n_ctx_limit <= _CTX_TRIM_SMALL_LIMIT:
+                _endpoint_nctx[(endpoint, model)] = n_ctx_limit
+
+            msgs_to_trim = send_params.get("messages", [])
+            try:
+                cal_target = _calibrated_trim_target(msgs_to_trim, n_ctx_limit, actual_tokens)
+                trimmed_messages = _trim_messages_for_context(msgs_to_trim, n_ctx_limit, target_tokens=cal_target)
+            except Exception as _helper_exc:
+                print(f"[ctx-trim] helper crash: {type(_helper_exc).__name__}: {str(_helper_exc)[:100]}", flush=True)
+                await decrement_usage(endpoint, tracking_model)
+                raise
+            dropped = len(msgs_to_trim) - len(trimmed_messages)
+            print(f"[ctx-trim] target={cal_target} dropped={dropped} remaining={len(trimmed_messages)} retrying-1", flush=True)
+            try:
+                async_gen = await oclient.chat.completions.create(**{**send_params, "messages": trimmed_messages})
+                print(f"[ctx-trim] retry-1 ok", flush=True)
+            except Exception as e2:
+                _e2_str = str(e2)
+                if "exceed_context_size_error" in _e2_str or "exceeds the available context size" in _e2_str:
+                    # Still too large — tool definitions likely consuming too many tokens, strip them too
+                    print(f"[ctx-trim] retry-1 still exceeded, stripping tools retrying-2", flush=True)
+                    params_no_tools = {k: v for k, v in send_params.items() if k not in ("tools", "tool_choice")}
+                    try:
+                        async_gen = await oclient.chat.completions.create(**{**params_no_tools, "messages": trimmed_messages})
+                        print(f"[ctx-trim] retry-2 ok", flush=True)
+                    except Exception:
+                        await decrement_usage(endpoint, tracking_model)
+                        raise
+                else:
+                    await decrement_usage(endpoint, tracking_model)
+                    raise
+        elif _is_backend_connection_error(e):
+            # Upstream connection failed (e.g. llama-server in router mode
+            # whose delegated worker died). Mark (endpoint, model) so the
+            # next request reroutes; the client will retry this one.
+            print(f"[ochat] backend connection error → marking ({endpoint}, {model}) unhealthy", flush=True)
+            await _mark_backend_unhealthy(endpoint, model, _e_str)
+            await decrement_usage(endpoint, tracking_model)
+            raise
+        elif "image input is not supported" in _e_str:
+            # Model doesn't support images — strip and retry
+            print(f"[openai_chat_completions_proxy] Model {model} doesn't support images, retrying with text-only messages", flush=True)
+            try:
+                async_gen = await oclient.chat.completions.create(**{**send_params, "messages": _strip_images_from_messages(send_params.get("messages", []))})
+            except Exception:
+                await decrement_usage(endpoint, tracking_model)
+                raise
+        else:
+            await decrement_usage(endpoint, tracking_model)
+            raise
+    return async_gen
+
+
@router.post("/v1/embeddings")
 async def openai_embedding_proxy(request: Request):
    """
@@ -260,90 +364,7 @@ async def openai_chat_completions_proxy(request: Request):
            _dropped = len(_pre_msgs) - len(_pre_trimmed)
            print(f"[ctx-pre] n_ctx={_known_nctx} est={_pre_est} target={_pre_target} dropped={_dropped}", flush=True)
            send_params = {**send_params, "messages": _pre_trimmed}
-    try:
-        async_gen = await oclient.chat.completions.create(**send_params)
-    except Exception as e:
-        _e_str = str(e)
-        _is_ctx_err = "exceed_context_size_error" in _e_str or "exceeds the available context size" in _e_str
-        print(f"[ochat] caught={type(e).__name__} ctx={_is_ctx_err} msg={_e_str[:120]}", flush=True)
-        if "does not support tools" in _e_str:
-            # Model doesn't support tools — retry without them
-            print(f"[ochat] retry: no tools", flush=True)
-            try:
-                params_without_tools = {k: v for k, v in send_params.items() if k != "tools"}
-                async_gen = await oclient.chat.completions.create(**params_without_tools)
-            except Exception:
-                await decrement_usage(endpoint, tracking_model)
-                raise
-        elif _is_ctx_err:
-            # Backend context limit hit — apply sliding-window trim (context-shift at message level)
-            err_body = getattr(e, "body", {}) or {}
-            err_detail = err_body.get("error", {}) if isinstance(err_body, dict) else {}
-            n_ctx_limit = err_detail.get("n_ctx", 0)
-            actual_tokens = err_detail.get("n_prompt_tokens", 0)
-            # Fallback: parse from string if body parsing yielded nothing (SDK may not parse llama-server errors)
-            if not n_ctx_limit:
-                import re as _re
-                _m = _re.search(r"'n_ctx':\s*(\d+)", _e_str)
-                if _m:
-                    n_ctx_limit = int(_m.group(1))
-                _m = _re.search(r"'n_prompt_tokens':\s*(\d+)", _e_str)
-                if _m:
-                    actual_tokens = int(_m.group(1))
-            print(f"[ctx-trim] n_ctx={n_ctx_limit} actual={actual_tokens}", flush=True)
-            if not n_ctx_limit:
-                await decrement_usage(endpoint, tracking_model)
-                raise
-            if n_ctx_limit <= _CTX_TRIM_SMALL_LIMIT:
-                _endpoint_nctx[(endpoint, model)] = n_ctx_limit
-
-            msgs_to_trim = send_params.get("messages", [])
-            try:
-                cal_target = _calibrated_trim_target(msgs_to_trim, n_ctx_limit, actual_tokens)
-                trimmed_messages = _trim_messages_for_context(msgs_to_trim, n_ctx_limit, target_tokens=cal_target)
-            except Exception as _helper_exc:
-                print(f"[ctx-trim] helper crash: {type(_helper_exc).__name__}: {str(_helper_exc)[:100]}", flush=True)
-                await decrement_usage(endpoint, tracking_model)
-                raise
-            dropped = len(msgs_to_trim) - len(trimmed_messages)
-            print(f"[ctx-trim] target={cal_target} dropped={dropped} remaining={len(trimmed_messages)} retrying-1", flush=True)
-            try:
-                async_gen = await oclient.chat.completions.create(**{**send_params, "messages": trimmed_messages})
-                print(f"[ctx-trim] retry-1 ok", flush=True)
-            except Exception as e2:
-                _e2_str = str(e2)
-                if "exceed_context_size_error" in _e2_str or "exceeds the available context size" in _e2_str:
-                    # Still too large — tool definitions likely consuming too many tokens, strip them too
-                    print(f"[ctx-trim] retry-1 still exceeded, stripping tools retrying-2", flush=True)
-                    params_no_tools = {k: v for k, v in send_params.items() if k not in ("tools", "tool_choice")}
-                    try:
-                        async_gen = await oclient.chat.completions.create(**{**params_no_tools, "messages": trimmed_messages})
-                        print(f"[ctx-trim] retry-2 ok", flush=True)
-                    except Exception:
-                        await decrement_usage(endpoint, tracking_model)
-                        raise
-                else:
-                    await decrement_usage(endpoint, tracking_model)
-                    raise
-        elif _is_backend_connection_error(e):
-            # Upstream connection failed (e.g. llama-server in router mode
-            # whose delegated worker died). Mark (endpoint, model) so the
-            # next request reroutes; the client will retry this one.
-            print(f"[ochat] backend connection error → marking ({endpoint}, {model}) unhealthy", flush=True)
-            await _mark_backend_unhealthy(endpoint, model, _e_str)
-            await decrement_usage(endpoint, tracking_model)
-            raise
-        elif "image input is not supported" in _e_str:
-            # Model doesn't support images — strip and retry
-            print(f"[openai_chat_completions_proxy] Model {model} doesn't support images, retrying with text-only messages")
-            try:
-                async_gen = await oclient.chat.completions.create(**{**send_params, "messages": _strip_images_from_messages(send_params.get("messages", []))})
-            except Exception:
-                await decrement_usage(endpoint, tracking_model)
-                raise
-        else:
-            await decrement_usage(endpoint, tracking_model)
-            raise
+    async_gen = await create_chat_with_retries(oclient, send_params, endpoint, model, tracking_model)

    # 4. Async generator — only streams the already-established async_gen
    async def stream_ochat_response():