fix: stale reservation counters by releasing each reservation only once

2026-06-23 14:56:04 +02:00 · 2026-06-23 14:56:04 +02:00 · 4f42f350a3
commit 4f42f350a3
parent cef71df3df
5 changed files with 319 additions and 262 deletions
--- a/api/ollama.py
+++ b/api/ollama.py
@ -167,31 +167,38 @@ async def proxy(request: Request):

    _affinity_key = _conversation_fingerprint(model, None, prompt)
    endpoint, tracking_model = await choose_endpoint(model, affinity_key=_affinity_key)
-    use_openai = is_openai_compatible(endpoint)
-    if use_openai:
-        if ":latest" in model:
-            model = model.split(":latest")
-            model = model[0]
-        params = {
-            "prompt": prompt,
-            "model": model,
-        }
-
-        optional_params = {
-            "stream": stream,
-            "max_tokens": options.get("num_predict") if options and "num_predict" in options else None,
-            "frequency_penalty": options.get("frequency_penalty") if options and "frequency_penalty" in options else None,
-            "presence_penalty": options.get("presence_penalty") if options and "presence_penalty" in options else None,
-            "seed": options.get("seed") if options and "seed" in options else None,
-            "stop": options.get("stop") if options and "stop" in options else None,
-            "top_p": options.get("top_p") if options and "top_p" in options else None,
-            "temperature": options.get("temperature") if options and "temperature" in options else None,
-            "suffix": suffix,
+    # _guarded_stream's finally releases the reservation once we hand off; until
+    # then any failure during request building / client construction (including
+    # CancelledError on client disconnect) must release it or the counter leaks.
+    try:
+        use_openai = is_openai_compatible(endpoint)
+        if use_openai:
+            if ":latest" in model:
+                model = model.split(":latest")
+                model = model[0]
+            params = {
+                "prompt": prompt,
+                "model": model,
            }
-        params.update({k: v for k, v in optional_params.items() if v is not None})
-        oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
-    else:
-        client = get_ollama_client(endpoint)
+
+            optional_params = {
+                "stream": stream,
+                "max_tokens": options.get("num_predict") if options and "num_predict" in options else None,
+                "frequency_penalty": options.get("frequency_penalty") if options and "frequency_penalty" in options else None,
+                "presence_penalty": options.get("presence_penalty") if options and "presence_penalty" in options else None,
+                "seed": options.get("seed") if options and "seed" in options else None,
+                "stop": options.get("stop") if options and "stop" in options else None,
+                "top_p": options.get("top_p") if options and "top_p" in options else None,
+                "temperature": options.get("temperature") if options and "temperature" in options else None,
+                "suffix": suffix,
+                }
+            params.update({k: v for k, v in optional_params.items() if v is not None})
+            oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
+        else:
+            client = get_ollama_client(endpoint)
+    except BaseException:
+        await decrement_usage(endpoint, tracking_model)
+        raise

    # 4. Async generator body (error handling + cleanup handled by _guarded_stream)
    async def stream_generate_response():
@ -336,59 +343,70 @@ async def chat_proxy(request: Request):
        opt = False
    _affinity_key = _conversation_fingerprint(model, messages, None)
    endpoint, tracking_model = await choose_endpoint(model, affinity_key=_affinity_key)
-    use_openai = is_openai_compatible(endpoint)
-    if use_openai:
-        if ":latest" in model:
-            model = model.split(":latest")
-            model = model[0]
-        if messages:
-            if any("images" in m for m in messages):
-                messages = await asyncio.to_thread(transform_images_to_data_urls, messages)
-            messages = transform_tool_calls_to_openai(messages)
-            messages = _strip_assistant_prefill(messages)
-        params = {
-            "messages": messages,
-            "model": model,
-            }
-        optional_params = {
-            "tools": tools,
-            "stream": stream,
-            "stream_options": {"include_usage": True} if stream else None,
-            "max_tokens": options.get("num_predict") if options and "num_predict" in options else None,
-            "frequency_penalty": options.get("frequency_penalty") if options and "frequency_penalty" in options else None,
-            "presence_penalty": options.get("presence_penalty") if options and "presence_penalty" in options else None,
-            "seed": options.get("seed") if options and "seed" in options else None,
-            "stop": options.get("stop") if options and "stop" in options else None,
-            "top_p": options.get("top_p") if options and "top_p" in options else None,
-            "temperature": options.get("temperature") if options and "temperature" in options else None,
-            "logprobs": logprobs if logprobs is not None else (options.get("logprobs") if options and "logprobs" in options else None),
-            "top_logprobs": top_logprobs if top_logprobs is not None else (options.get("top_logprobs") if options and "top_logprobs" in options else None),
-            "response_format": {"type": "json_schema", "json_schema": _format} if _format is not None else None
-            }
-        params.update({k: v for k, v in optional_params.items() if v is not None})
-        oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
-    else:
-        client = get_ollama_client(endpoint)
+    # Releasing the reservation is owned by _guarded_stream's finally once we hand
+    # off to the streaming generator. Until then, any failure during request
+    # building / client construction (including CancelledError on client
+    # disconnect) must release it here or the usage counter leaks.
+    try:
+        use_openai = is_openai_compatible(endpoint)
+        if use_openai:
+            if ":latest" in model:
+                model = model.split(":latest")
+                model = model[0]
+            if messages:
+                if any("images" in m for m in messages):
+                    messages = await asyncio.to_thread(transform_images_to_data_urls, messages)
+                messages = transform_tool_calls_to_openai(messages)
+                messages = _strip_assistant_prefill(messages)
+            params = {
+                "messages": messages,
+                "model": model,
+                }
+            optional_params = {
+                "tools": tools,
+                "stream": stream,
+                "stream_options": {"include_usage": True} if stream else None,
+                "max_tokens": options.get("num_predict") if options and "num_predict" in options else None,
+                "frequency_penalty": options.get("frequency_penalty") if options and "frequency_penalty" in options else None,
+                "presence_penalty": options.get("presence_penalty") if options and "presence_penalty" in options else None,
+                "seed": options.get("seed") if options and "seed" in options else None,
+                "stop": options.get("stop") if options and "stop" in options else None,
+                "top_p": options.get("top_p") if options and "top_p" in options else None,
+                "temperature": options.get("temperature") if options and "temperature" in options else None,
+                "logprobs": logprobs if logprobs is not None else (options.get("logprobs") if options and "logprobs" in options else None),
+                "top_logprobs": top_logprobs if top_logprobs is not None else (options.get("top_logprobs") if options and "top_logprobs" in options else None),
+                "response_format": {"type": "json_schema", "json_schema": _format} if _format is not None else None
+                }
+            params.update({k: v for k, v in optional_params.items() if v is not None})
+            oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
+        else:
+            client = get_ollama_client(endpoint)
+    except BaseException:
+        await decrement_usage(endpoint, tracking_model)
+        raise
    # For OpenAI endpoints: make the API call in handler scope
    # (try/except inside async generators is unreliable with Starlette's streaming)
    start_ts = None
    async_gen = None
    if use_openai:
        start_ts = time.perf_counter()
-        # Proactive trim: only for small-ctx models we've already seen run out of space
-        _lookup_model = _normalize_llama_model_name(model) if is_llama_server(endpoint) else model
-        _known_nctx = _endpoint_nctx.get((endpoint, _lookup_model))
-        if _known_nctx and _known_nctx <= _CTX_TRIM_SMALL_LIMIT:
-            _pre_target = int((_known_nctx - _known_nctx // 4) / 1.2)
-            _pre_est = _count_message_tokens(params.get("messages", []))
-            if _pre_est > _pre_target:
-                _pre_msgs = params.get("messages", [])
-                _pre_trimmed = _trim_messages_for_context(_pre_msgs, _known_nctx, target_tokens=_pre_target)
-                _dropped = len(_pre_msgs) - len(_pre_trimmed)
-                print(f"[ctx-pre] n_ctx={_known_nctx} est={_pre_est} target={_pre_target} dropped={_dropped}", flush=True)
-                params = {**params, "messages": _pre_trimmed}
        try:
+            # Proactive trim: only for small-ctx models we've already seen run out of space
+            _lookup_model = _normalize_llama_model_name(model) if is_llama_server(endpoint) else model
+            _known_nctx = _endpoint_nctx.get((endpoint, _lookup_model))
+            if _known_nctx and _known_nctx <= _CTX_TRIM_SMALL_LIMIT:
+                _pre_target = int((_known_nctx - _known_nctx // 4) / 1.2)
+                _pre_est = _count_message_tokens(params.get("messages", []))
+                if _pre_est > _pre_target:
+                    _pre_msgs = params.get("messages", [])
+                    _pre_trimmed = _trim_messages_for_context(_pre_msgs, _known_nctx, target_tokens=_pre_target)
+                    _dropped = len(_pre_msgs) - len(_pre_trimmed)
+                    print(f"[ctx-pre] n_ctx={_known_nctx} est={_pre_est} target={_pre_target} dropped={_dropped}", flush=True)
+                    params = {**params, "messages": _pre_trimmed}
            async_gen = await oclient.chat.completions.create(**params)
+        except asyncio.CancelledError:
+            await decrement_usage(endpoint, tracking_model)
+            raise
        except Exception as e:
            _e_str = str(e)
            print(f"[chat_proxy] caught {type(e).__name__}: {_e_str[:200]}")
@ -595,14 +613,21 @@ async def _handle_embedding_request(

    # 2. Endpoint logic
    endpoint, tracking_model = await choose_endpoint(model)
-    use_openai = is_openai_compatible(endpoint)
-    if use_openai:
-        if ":latest" in model:
-            model = model.split(":latest")
-            model = model[0]
-        client = _make_openai_client(endpoint, api_key=config.api_keys.get(endpoint, "no-key"))
-    else:
-        client = get_ollama_client(endpoint)
+    # _guarded_stream's finally releases the reservation once we hand off; until
+    # then any failure during client construction (including CancelledError on
+    # client disconnect) must release it or the counter leaks.
+    try:
+        use_openai = is_openai_compatible(endpoint)
+        if use_openai:
+            if ":latest" in model:
+                model = model.split(":latest")
+                model = model[0]
+            client = _make_openai_client(endpoint, api_key=config.api_keys.get(endpoint, "no-key"))
+        else:
+            client = get_ollama_client(endpoint)
+    except BaseException:
+        await decrement_usage(endpoint, tracking_model)
+        raise

    # 3. Async generator body (error handling + cleanup handled by _guarded_stream)
    async def stream_embedding_response():
--- a/api/openai.py
+++ b/api/openai.py
@ -61,8 +61,10 @@ async def create_chat_with_retries(oclient, send_params, endpoint, model, tracki
        request reroutes, then re-raise
      * ``image input is not supported`` → strip images and retry

-    On unrecoverable failure the endpoint usage counter is decremented and the
-    exception is re-raised. Returns the established async generator / response.
+    The caller owns the usage reservation taken by ``choose_endpoint``: this
+    function never decrements it. On unrecoverable failure the exception is
+    re-raised so the caller's guard releases the slot exactly once. Returns the
+    established async generator / response.
    """
    config = get_config()
    try:
@ -74,12 +76,8 @@ async def create_chat_with_retries(oclient, send_params, endpoint, model, tracki
        if "does not support tools" in _e_str:
            # Model doesn't support tools — retry without them
            print(f"[ochat] retry: no tools", flush=True)
-            try:
-                params_without_tools = {k: v for k, v in send_params.items() if k != "tools"}
-                async_gen = await oclient.chat.completions.create(**params_without_tools)
-            except Exception:
-                await decrement_usage(endpoint, tracking_model)
-                raise
+            params_without_tools = {k: v for k, v in send_params.items() if k != "tools"}
+            async_gen = await oclient.chat.completions.create(**params_without_tools)
        elif _is_ctx_err:
            # Backend context limit hit — apply sliding-window trim (context-shift at message level)
            err_body = getattr(e, "body", {}) or {}
@ -97,7 +95,6 @@ async def create_chat_with_retries(oclient, send_params, endpoint, model, tracki
                    actual_tokens = int(_m.group(1))
            print(f"[ctx-trim] n_ctx={n_ctx_limit} actual={actual_tokens}", flush=True)
            if not n_ctx_limit:
-                await decrement_usage(endpoint, tracking_model)
                raise
            if n_ctx_limit <= _CTX_TRIM_SMALL_LIMIT:
                _endpoint_nctx[(endpoint, model)] = n_ctx_limit
@ -108,7 +105,6 @@ async def create_chat_with_retries(oclient, send_params, endpoint, model, tracki
                trimmed_messages = _trim_messages_for_context(msgs_to_trim, n_ctx_limit, target_tokens=cal_target)
            except Exception as _helper_exc:
                print(f"[ctx-trim] helper crash: {type(_helper_exc).__name__}: {str(_helper_exc)[:100]}", flush=True)
-                await decrement_usage(endpoint, tracking_model)
                raise
            dropped = len(msgs_to_trim) - len(trimmed_messages)
            print(f"[ctx-trim] target={cal_target} dropped={dropped} remaining={len(trimmed_messages)} retrying-1", flush=True)
@ -121,14 +117,9 @@ async def create_chat_with_retries(oclient, send_params, endpoint, model, tracki
                    # Still too large — tool definitions likely consuming too many tokens, strip them too
                    print(f"[ctx-trim] retry-1 still exceeded, stripping tools retrying-2", flush=True)
                    params_no_tools = {k: v for k, v in send_params.items() if k not in ("tools", "tool_choice")}
-                    try:
-                        async_gen = await oclient.chat.completions.create(**{**params_no_tools, "messages": trimmed_messages})
-                        print(f"[ctx-trim] retry-2 ok", flush=True)
-                    except Exception:
-                        await decrement_usage(endpoint, tracking_model)
-                        raise
+                    async_gen = await oclient.chat.completions.create(**{**params_no_tools, "messages": trimmed_messages})
+                    print(f"[ctx-trim] retry-2 ok", flush=True)
                else:
-                    await decrement_usage(endpoint, tracking_model)
                    raise
        elif _is_backend_connection_error(e):
            # Upstream connection failed (e.g. llama-server in router mode
@ -136,18 +127,12 @@ async def create_chat_with_retries(oclient, send_params, endpoint, model, tracki
            # next request reroutes; the client will retry this one.
            print(f"[ochat] backend connection error → marking ({endpoint}, {model}) unhealthy", flush=True)
            await _mark_backend_unhealthy(endpoint, model, _e_str)
-            await decrement_usage(endpoint, tracking_model)
            raise
        elif "image input is not supported" in _e_str:
            # Model doesn't support images — strip and retry
            print(f"[openai_chat_completions_proxy] Model {model} doesn't support images, retrying with text-only messages")
-            try:
-                async_gen = await oclient.chat.completions.create(**{**send_params, "messages": _strip_images_from_messages(send_params.get("messages", []))})
-            except Exception:
-                await decrement_usage(endpoint, tracking_model)
-                raise
+            async_gen = await oclient.chat.completions.create(**{**send_params, "messages": _strip_images_from_messages(send_params.get("messages", []))})
        else:
-            await decrement_usage(endpoint, tracking_model)
            raise
    return async_gen

@ -195,13 +180,14 @@ async def openai_embedding_proxy(request: Request):

    # 2. Endpoint logic
    endpoint, tracking_model = await choose_endpoint(model)
-    if is_openai_compatible(endpoint):
-        api_key = config.api_keys.get(endpoint, "no-key")
-    else:
-        api_key = "ollama"
-    oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=api_key)
-
+    # The finally below releases the reservation for every exit — success, error,
+    # or CancelledError — so client construction is kept inside the guarded block.
    try:
+        if is_openai_compatible(endpoint):
+            api_key = config.api_keys.get(endpoint, "no-key")
+        else:
+            api_key = "ollama"
+        oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=api_key)
        async_gen = await oclient.embeddings.create(input=doc, model=model)
        result = async_gen.model_dump()
        for item in result.get("data", []):
@ -350,23 +336,30 @@ async def openai_chat_completions_proxy(request: Request):

    # Make the API call in handler scope — try/except inside async generators is unreliable
    # with Starlette's streaming machinery, so we resolve errors here before the generator starts.
-    send_params = params
-    if not is_ext_openai_endpoint(endpoint):
-        resolved_msgs = await _normalize_images_in_messages(params.get("messages", []))
-        send_params = {**params, "messages": resolved_msgs}
-    # Proactive trim: only for small-ctx models we've already seen run out of space
-    _lookup_model = _normalize_llama_model_name(model) if is_llama_server(endpoint) else model
-    _known_nctx = _endpoint_nctx.get((endpoint, _lookup_model))
-    if _known_nctx and _known_nctx <= _CTX_TRIM_SMALL_LIMIT:
-        _pre_target = int(((_known_nctx - _known_nctx // 4)) / 1.2)
-        _pre_est = _count_message_tokens(send_params.get("messages", []))
-        if _pre_est > _pre_target:
-            _pre_msgs = send_params.get("messages", [])
-            _pre_trimmed = _trim_messages_for_context(_pre_msgs, _known_nctx, target_tokens=_pre_target)
-            _dropped = len(_pre_msgs) - len(_pre_trimmed)
-            print(f"[ctx-pre] n_ctx={_known_nctx} est={_pre_est} target={_pre_target} dropped={_dropped}", flush=True)
-            send_params = {**send_params, "messages": _pre_trimmed}
-    async_gen = await create_chat_with_retries(oclient, send_params, endpoint, model, tracking_model)
+    # The reservation taken by choose_endpoint is released by stream_ochat_response's finally
+    # once we hand off; until then, any failure here (including CancelledError on client
+    # disconnect during a cold model load) must release it or the counter leaks.
+    try:
+        send_params = params
+        if not is_ext_openai_endpoint(endpoint):
+            resolved_msgs = await _normalize_images_in_messages(params.get("messages", []))
+            send_params = {**params, "messages": resolved_msgs}
+        # Proactive trim: only for small-ctx models we've already seen run out of space
+        _lookup_model = _normalize_llama_model_name(model) if is_llama_server(endpoint) else model
+        _known_nctx = _endpoint_nctx.get((endpoint, _lookup_model))
+        if _known_nctx and _known_nctx <= _CTX_TRIM_SMALL_LIMIT:
+            _pre_target = int(((_known_nctx - _known_nctx // 4)) / 1.2)
+            _pre_est = _count_message_tokens(send_params.get("messages", []))
+            if _pre_est > _pre_target:
+                _pre_msgs = send_params.get("messages", [])
+                _pre_trimmed = _trim_messages_for_context(_pre_msgs, _known_nctx, target_tokens=_pre_target)
+                _dropped = len(_pre_msgs) - len(_pre_trimmed)
+                print(f"[ctx-pre] n_ctx={_known_nctx} est={_pre_est} target={_pre_target} dropped={_dropped}", flush=True)
+                send_params = {**send_params, "messages": _pre_trimmed}
+        async_gen = await create_chat_with_retries(oclient, send_params, endpoint, model, tracking_model)
+    except BaseException:
+        await decrement_usage(endpoint, tracking_model)
+        raise

    # 4. Async generator — only streams the already-established async_gen
    async def stream_ochat_response():
@ -547,12 +540,17 @@ async def openai_completions_proxy(request: Request):
    # 2. Endpoint logic
    _affinity_key = _conversation_fingerprint(model, None, prompt)
    endpoint, tracking_model = await choose_endpoint(model, affinity_key=_affinity_key)
-    oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))

    # 3. Async generator that streams completions data and decrements the counter
-    # Make the API call in handler scope (try/except inside async generators is unreliable)
+    # Make the API call in handler scope (try/except inside async generators is unreliable).
+    # The reservation is released by stream_ocompletions_response's finally once we hand off;
+    # until then any failure here — including CancelledError on client disconnect — releases it.
    try:
+        oclient = _make_openai_client(endpoint, default_headers=default_headers, api_key=config.api_keys.get(endpoint, "no-key"))
        async_gen = await oclient.completions.create(**params)
+    except asyncio.CancelledError:
+        await decrement_usage(endpoint, tracking_model)
+        raise
    except Exception as e:
        if _is_backend_connection_error(e):
            print(f"[ocompl] backend connection error → marking ({endpoint}, {model}) unhealthy", flush=True)
@ -775,36 +773,38 @@ async def rerank_proxy(request: Request):
            ),
        )

-    if ":latest" in model:
-        model = model.split(":latest")[0]
-
-    # Build upstream rerank request body – forward only recognised fields
-    upstream_payload: dict = {"model": model, "query": query, "documents": documents}
-    for optional_key in ("top_n", "return_documents", "max_tokens_per_doc"):
-        if optional_key in payload:
-            upstream_payload[optional_key] = payload[optional_key]
-
-    # Determine upstream URL:
-    #   llama-server / llama-swap expose /v1/rerank (base already contains /v1)
-    #   External OpenAI endpoints expose /rerank under their /v1 base
-    if is_llama_server(endpoint):
-        # llama-server / llama-swap: endpoint may or may not already contain /v1
-        if "/v1" in endpoint:
-            rerank_url = f"{endpoint}/rerank"
-        else:
-            rerank_url = f"{endpoint}/v1/rerank"
-    else:
-        # External OpenAI-compatible: ep2base gives us the /v1 base
-        rerank_url = f"{ep2base(endpoint)}/rerank"
-
-    api_key = config.api_keys.get(endpoint, "no-key")
-    headers = {
-        "Content-Type": "application/json",
-        "Authorization": f"Bearer {api_key}",
-    }
-
-    client: aiohttp.ClientSession = get_session(endpoint)
+    # The finally below releases the reservation for every exit (success, error,
+    # or CancelledError), so request building and session lookup stay inside it.
    try:
+        if ":latest" in model:
+            model = model.split(":latest")[0]
+
+        # Build upstream rerank request body – forward only recognised fields
+        upstream_payload: dict = {"model": model, "query": query, "documents": documents}
+        for optional_key in ("top_n", "return_documents", "max_tokens_per_doc"):
+            if optional_key in payload:
+                upstream_payload[optional_key] = payload[optional_key]
+
+        # Determine upstream URL:
+        #   llama-server / llama-swap expose /v1/rerank (base already contains /v1)
+        #   External OpenAI endpoints expose /rerank under their /v1 base
+        if is_llama_server(endpoint):
+            # llama-server / llama-swap: endpoint may or may not already contain /v1
+            if "/v1" in endpoint:
+                rerank_url = f"{endpoint}/rerank"
+            else:
+                rerank_url = f"{endpoint}/v1/rerank"
+        else:
+            # External OpenAI-compatible: ep2base gives us the /v1 base
+            rerank_url = f"{ep2base(endpoint)}/rerank"
+
+        api_key = config.api_keys.get(endpoint, "no-key")
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {api_key}",
+        }
+
+        client: aiohttp.ClientSession = get_session(endpoint)
        async with client.post(rerank_url, json=upstream_payload, headers=headers) as resp:
            response_bytes = await resp.read()
            if resp.status >= 400:
--- a/api/responses.py
+++ b/api/responses.py
@ -140,9 +140,9 @@ async def _run_to_completion(*, native, oclient, endpoint, model, tracking_model
                             send_params, native_params):
    """Drive the backend to completion (no client streaming).

-    Returns ``(output_items, usage)`` where usage is responses-shaped. Caller is
-    responsible for ``decrement_usage`` (translated failures self-decrement inside
-    ``create_chat_with_retries``)."""
+    Returns ``(output_items, usage)`` where usage is responses-shaped. The caller
+    owns the usage reservation and must release it (this function and
+    ``create_chat_with_retries`` never decrement)."""
    if native:
        resp_obj = await oclient.responses.create(stream=False, **native_params)
        data = resp_obj.model_dump()
@ -209,38 +209,46 @@ async def openai_responses_proxy(request: Request):
                return StreamingResponse(_served_cached(), media_type="text/event-stream")
            return JSONResponse(content=resp_obj)

-    # Endpoint selection (reserves a slot — must be released exactly once).
+    # Endpoint selection (reserves a slot — must be released exactly once). The
+    # release is owned by the per-branch finally (_bg_run / _stream / the
+    # non-streaming try) once we hand off; any failure during client/param
+    # construction (including CancelledError on client disconnect) must release
+    # it here or the usage counter leaks.
    _affinity_key = _conversation_fingerprint(model, messages, None)
    endpoint, tracking_model = await choose_endpoint(model, affinity_key=_affinity_key)
-    oclient = _make_openai_client(endpoint, default_headers=default_headers,
-                                  api_key=config.api_keys.get(endpoint, "no-key"))
-    native = is_ext_openai_endpoint(endpoint)
+    try:
+        oclient = _make_openai_client(endpoint, default_headers=default_headers,
+                                      api_key=config.api_keys.get(endpoint, "no-key"))
+        native = is_ext_openai_endpoint(endpoint)

-    # Build backend params for both shapes.
-    send_params = {"messages": messages, "model": model}
-    _opt = {
-        "temperature": payload.get("temperature"),
-        "top_p": payload.get("top_p"),
-        "max_tokens": payload.get("max_output_tokens"),
-        "tools": tools_responses_to_chat(tools),
-        "tool_choice": payload.get("tool_choice"),
-        "response_format": _text_format_to_response_format(payload.get("text")),
-    }
-    send_params.update({k: v for k, v in _opt.items() if v is not None})
+        # Build backend params for both shapes.
+        send_params = {"messages": messages, "model": model}
+        _opt = {
+            "temperature": payload.get("temperature"),
+            "top_p": payload.get("top_p"),
+            "max_tokens": payload.get("max_output_tokens"),
+            "tools": tools_responses_to_chat(tools),
+            "tool_choice": payload.get("tool_choice"),
+            "response_format": _text_format_to_response_format(payload.get("text")),
+        }
+        send_params.update({k: v for k, v in _opt.items() if v is not None})

-    native_instructions, native_input = messages_to_responses_input(messages)
-    native_params = {"model": model, "input": native_input, "store": False}
-    _nopt = {
-        "instructions": native_instructions,
-        "temperature": payload.get("temperature"),
-        "top_p": payload.get("top_p"),
-        "max_output_tokens": payload.get("max_output_tokens"),
-        "tools": tools,
-        "tool_choice": payload.get("tool_choice"),
-        "text": payload.get("text"),
-        "reasoning": payload.get("reasoning"),
-    }
-    native_params.update({k: v for k, v in _nopt.items() if v is not None})
+        native_instructions, native_input = messages_to_responses_input(messages)
+        native_params = {"model": model, "input": native_input, "store": False}
+        _nopt = {
+            "instructions": native_instructions,
+            "temperature": payload.get("temperature"),
+            "top_p": payload.get("top_p"),
+            "max_output_tokens": payload.get("max_output_tokens"),
+            "tools": tools,
+            "tool_choice": payload.get("tool_choice"),
+            "text": payload.get("text"),
+            "reasoning": payload.get("reasoning"),
+        }
+        native_params.update({k: v for k, v in _nopt.items() if v is not None})
+    except BaseException:
+        await decrement_usage(endpoint, tracking_model)
+        raise

    async def _persist(status, output_items=None, usage=None, error=None, insert=False):
        if not store:
@ -275,30 +283,37 @@ async def openai_responses_proxy(request: Request):

    # ---- background: run detached, return queued immediately --------------
    if background:
-        await _persist("queued", insert=True)
+        # Once the task is created, _bg_run's finally owns the release. Guard the
+        # pre-task setup so a failure there (queued persist, task creation, or a
+        # client disconnect) still releases the reservation.
+        try:
+            await _persist("queued", insert=True)

-        async def _bg_run():
-            try:
-                await get_db().update_response_status(response_id, "in_progress")
-                output_items, usage = await _run_to_completion(
-                    native=native, oclient=oclient, endpoint=endpoint, model=model,
-                    tracking_model=tracking_model, send_params=send_params,
-                    native_params=native_params)
-                await _track(usage)
-                await _persist("completed", output_items=output_items, usage=usage)
-                await _cache_store(output_items, usage)
-            except asyncio.CancelledError:
-                await get_db().update_response_status(response_id, "cancelled")
-                raise
-            except Exception as e:
-                await get_db().update_response_status(
-                    response_id, "failed",
-                    error={"message": str(e)[:500], "type": type(e).__name__})
-            finally:
-                await decrement_usage(endpoint, tracking_model)
-                _background_tasks.pop(response_id, None)
+            async def _bg_run():  # detached worker: runs the request and records its final status
+                try:
+                    await get_db().update_response_status(response_id, "in_progress")
+                    output_items, usage = await _run_to_completion(
+                        native=native, oclient=oclient, endpoint=endpoint, model=model,
+                        tracking_model=tracking_model, send_params=send_params,
+                        native_params=native_params)
+                    await _track(usage)
+                    await _persist("completed", output_items=output_items, usage=usage)
+                    await _cache_store(output_items, usage)
+                except asyncio.CancelledError:
+                    await get_db().update_response_status(response_id, "cancelled")
+                    raise  # propagate so the task itself still ends as cancelled
+                except Exception as e:  # not re-raised: the failure is only recorded via update_response_status
+                    await get_db().update_response_status(
+                        response_id, "failed",
+                        error={"message": str(e)[:500], "type": type(e).__name__})  # message capped at 500 chars
+                finally:  # runs on every exit path: release the reservation, then drop the task entry
+                    await decrement_usage(endpoint, tracking_model)  # NOTE(review): if this raises or is cancelled, the pop below is skipped
+                    _background_tasks.pop(response_id, None)

-        task = asyncio.create_task(_bg_run())
+            task = asyncio.create_task(_bg_run())
+        except BaseException:
+            await decrement_usage(endpoint, tracking_model)
+            raise
        _background_tasks[response_id] = task
        queued = build_response_object(response_id=response_id, model=model, output_items=[],
                                       status="queued", created_at=created_at,
@ -308,18 +323,25 @@ async def openai_responses_proxy(request: Request):

    # ---- streaming sync ----------------------------------------------------
    if stream:
-        if native:
-            source = await oclient.responses.create(stream=True, **native_params)
-            translator = _NativeStream(response_id)
-        else:
-            source = await create_chat_with_retries(
-                oclient, {**send_params, "stream": True,
-                          "stream_options": {"include_usage": True}},
-                endpoint, model, tracking_model)
-            translator = ChatToResponsesStream(
-                response_id, model, created_at=created_at,
-                previous_response_id=previous_response_id, instructions=instructions,
-                metadata=metadata)
+        # _stream's finally owns the release once iteration starts. Establishing
+        # the source can fail (or be cancelled) before that — release here, since
+        # create_chat_with_retries no longer self-decrements.
+        try:
+            if native:
+                source = await oclient.responses.create(stream=True, **native_params)
+                translator = _NativeStream(response_id)
+            else:
+                source = await create_chat_with_retries(
+                    oclient, {**send_params, "stream": True,
+                              "stream_options": {"include_usage": True}},
+                    endpoint, model, tracking_model)
+                translator = ChatToResponsesStream(
+                    response_id, model, created_at=created_at,
+                    previous_response_id=previous_response_id, instructions=instructions,
+                    metadata=metadata)
+        except BaseException:
+            await decrement_usage(endpoint, tracking_model)
+            raise

        async def _stream():
            await _persist("in_progress", insert=True)