perf(chat): kill auto-pin preflight + speculative build, rely on reactive 429 recovery

The preflight pattern probed the LLM with a 1-token ping before each
cold turn (when requested_llm_config_id==0, llm_config_id<0, and the
45s healthy TTL had expired) to detect 429s before fanning out into
planner/classifier/title-gen. To absorb its ~1-5s RTT cost we built the
agent speculatively in parallel; on 429 we discarded the build and
repinned.

Three problems with that design:

1. False security. Provider rate limits are token-bucket. A 1-token
   ping consumes ~5 tokens; the real request consumes 10-50K. The
   probe can return 200 while the real call still 429s.
2. Pure overhead in the common case. On warm-agent-cache turns the
   probe dominates wall time: ~2.5s of TTFT pure tax for ~99% of users
   who never see a 429.
3. The in-stream recovery loop (catch of _is_provider_rate_limited
   gated by not _first_event_logged) already does the right thing
   reactively: mark_runtime_cooldown -> resolve_or_get_pinned_llm_config_id
   with exclude_config_ids={previous} -> rebuild agent -> retry the
   stream. Preflight was never the only safety net; it was a redundant
   probe in front of one.

Changes:
- Delete _preflight_llm, _settle_speculative_agent_build, and the
  _PREFLIGHT_TIMEOUT_SEC / _PREFLIGHT_MAX_TOKENS constants.
- Drop the parallel agent_build_task / preflight_task plumbing in
  both stream_new_chat and stream_resume_chat; build the agent inline
  with await _build_main_agent_for_thread(...).
- Drop the unused is_recently_healthy / mark_healthy imports here
  (still exported from auto_model_pin_service since OpenRouter
  catalogue refresh and a few tests reference clear_healthy).
- Remove the obsolete preflight + settle-speculative tests from
  test_stream_new_chat_contract.py.

Net: -447 LOC. ~2.5s removed from TTFT on every cold preflight-eligible
turn. 429 recovery path is unchanged - same repin/rebuild/retry, just
not paid in advance on the healthy path.
This commit is contained in:
CREDO23 2026-05-20 11:03:08 +02:00
parent 1791241c0c
commit c3db25302b
2 changed files with 40 additions and 487 deletions

View file

@ -209,128 +209,6 @@ def test_stream_exception_classifies_openrouter_429_payload():
assert extra is None
@pytest.mark.asyncio
async def test_preflight_swallows_non_rate_limit_errors_and_re_raises_429(monkeypatch):
"""``_preflight_llm`` is best-effort.
- On rate-limit shaped exceptions (provider 429) it MUST re-raise so the
caller can drive the cooldown/repin branch.
- On any other transient failure it MUST swallow the error so the normal
stream path continues without surfacing preflight noise to the user.
"""
from types import SimpleNamespace
from app.tasks.chat.stream_new_chat import _preflight_llm
class _RateLimitedError(Exception):
"""Class-name carries 'RateLimit' so _is_provider_rate_limited triggers."""
rate_calls: list[dict] = []
other_calls: list[dict] = []
async def _fake_acompletion_429(**kwargs):
rate_calls.append(kwargs)
raise _RateLimitedError("simulated 429")
async def _fake_acompletion_other(**kwargs):
other_calls.append(kwargs)
raise RuntimeError("some unrelated transient failure")
fake_llm = SimpleNamespace(
model="openrouter/google/gemma-4-31b-it:free",
api_key="test",
api_base=None,
)
import litellm # type: ignore[import-not-found]
monkeypatch.setattr(litellm, "acompletion", _fake_acompletion_429)
with pytest.raises(_RateLimitedError):
await _preflight_llm(fake_llm)
assert len(rate_calls) == 1
assert rate_calls[0]["max_tokens"] == 1
assert rate_calls[0]["stream"] is False
monkeypatch.setattr(litellm, "acompletion", _fake_acompletion_other)
# MUST NOT raise: non-rate-limit failures are swallowed.
await _preflight_llm(fake_llm)
assert len(other_calls) == 1
@pytest.mark.asyncio
async def test_preflight_skipped_for_auto_router_model():
"""Router-mode ``model='auto'`` has no single deployment to ping; the
LiteLLM router itself owns per-deployment rate-limit accounting, so the
preflight helper must short-circuit instead of issuing a probe."""
from types import SimpleNamespace
from app.tasks.chat.stream_new_chat import _preflight_llm
fake_llm = SimpleNamespace(model="auto", api_key="x", api_base=None)
# Should return without raising or making any LiteLLM call.
await _preflight_llm(fake_llm)
@pytest.mark.asyncio
async def test_settle_speculative_agent_build_swallows_exceptions():
"""``_settle_speculative_agent_build`` MUST always return cleanly so the
caller can safely re-touch the request-scoped session afterwards.
The helper guards the parallel preflight + agent-build path: when the
speculative build is being discarded (429 or non-429 preflight failure)
we await it solely to release any in-flight ``AsyncSession`` usage
the build's outcome is irrelevant. Any exception (including
``CancelledError``) leaking out would skip the caller's recovery flow
and re-introduce the very session-concurrency hazard the helper exists
to prevent.
"""
import asyncio
from app.tasks.chat.stream_new_chat import _settle_speculative_agent_build
async def _raises() -> None:
raise RuntimeError("speculative build crashed")
async def _succeeds() -> str:
return "agent"
async def _slow() -> None:
await asyncio.sleep(0.05)
for coro in (_raises(), _succeeds(), _slow()):
task = asyncio.create_task(coro)
await _settle_speculative_agent_build(task)
assert task.done()
@pytest.mark.asyncio
async def test_settle_speculative_agent_build_handles_already_done_task():
"""Done tasks (success or failure) must still be settled without raising."""
import asyncio
from app.tasks.chat.stream_new_chat import _settle_speculative_agent_build
async def _ok() -> str:
return "ok"
async def _bad() -> None:
raise ValueError("nope")
ok_task = asyncio.create_task(_ok())
bad_task = asyncio.create_task(_bad())
# Drive both to completion before settling.
await asyncio.sleep(0)
await asyncio.sleep(0)
await _settle_speculative_agent_build(ok_task)
await _settle_speculative_agent_build(bad_task)
assert ok_task.result() == "ok"
# ``bad_task`` exception was consumed by the settle helper; calling
# ``.exception()`` after the fact must still return the original error
# (the helper observes it but doesn't clear it).
assert isinstance(bad_task.exception(), ValueError)
def test_stream_exception_classifies_thread_busy():
exc = BusyError(request_id="thread-123")
kind, code, severity, is_expected, user_message, extra = _classify_stream_exception(