mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-25 19:15:18 +02:00
perf(chat): kill auto-pin preflight + speculative build, rely on reactive 429 recovery
The preflight pattern probed the LLM with a 1-token ping before each
cold turn (when requested_llm_config_id==0, llm_config_id<0, and the
45s healthy TTL had expired) to detect 429s before fanning out into
planner/classifier/title-gen. To absorb its ~1-5s RTT cost we built the
agent speculatively in parallel; on 429 we discarded the build and
repinned.
Three problems with that design:
1. False security. Provider rate limits are token-bucket. A 1-token
ping consumes ~5 tokens; the real request consumes 10-50K. The
probe can return 200 while the real call still 429s.
2. Pure overhead in the common case. On warm-agent-cache turns the
probe dominates wall time: ~2.5s of TTFT pure tax for ~99% of users
who never see a 429.
3. The in-stream recovery loop (catch of _is_provider_rate_limited
gated by not _first_event_logged) already does the right thing
reactively: mark_runtime_cooldown -> resolve_or_get_pinned_llm_config_id
with exclude_config_ids={previous} -> rebuild agent -> retry the
stream. Preflight was never the only safety net; it was a redundant
probe in front of one.
Changes:
- Delete _preflight_llm, _settle_speculative_agent_build, and the
_PREFLIGHT_TIMEOUT_SEC / _PREFLIGHT_MAX_TOKENS constants.
- Drop the parallel agent_build_task / preflight_task plumbing in
both stream_new_chat and stream_resume_chat; build the agent inline
with await _build_main_agent_for_thread(...).
- Drop the unused is_recently_healthy / mark_healthy imports here
(still exported from auto_model_pin_service since OpenRouter
catalogue refresh and a few tests reference clear_healthy).
- Remove the obsolete preflight + settle-speculative tests from
test_stream_new_chat_contract.py.
Net: -447 LOC. ~2.5s removed from TTFT on every cold preflight-eligible
turn. 429 recovery path is unchanged - same repin/rebuild/retry, just
not paid in advance on the healthy path.
This commit is contained in:
parent
1791241c0c
commit
c3db25302b
2 changed files with 40 additions and 487 deletions
|
|
@ -209,128 +209,6 @@ def test_stream_exception_classifies_openrouter_429_payload():
|
|||
assert extra is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_preflight_swallows_non_rate_limit_errors_and_re_raises_429(monkeypatch):
|
||||
"""``_preflight_llm`` is best-effort.
|
||||
|
||||
- On rate-limit shaped exceptions (provider 429) it MUST re-raise so the
|
||||
caller can drive the cooldown/repin branch.
|
||||
- On any other transient failure it MUST swallow the error so the normal
|
||||
stream path continues without surfacing preflight noise to the user.
|
||||
"""
|
||||
from types import SimpleNamespace
|
||||
|
||||
from app.tasks.chat.stream_new_chat import _preflight_llm
|
||||
|
||||
class _RateLimitedError(Exception):
|
||||
"""Class-name carries 'RateLimit' so _is_provider_rate_limited triggers."""
|
||||
|
||||
rate_calls: list[dict] = []
|
||||
other_calls: list[dict] = []
|
||||
|
||||
async def _fake_acompletion_429(**kwargs):
|
||||
rate_calls.append(kwargs)
|
||||
raise _RateLimitedError("simulated 429")
|
||||
|
||||
async def _fake_acompletion_other(**kwargs):
|
||||
other_calls.append(kwargs)
|
||||
raise RuntimeError("some unrelated transient failure")
|
||||
|
||||
fake_llm = SimpleNamespace(
|
||||
model="openrouter/google/gemma-4-31b-it:free",
|
||||
api_key="test",
|
||||
api_base=None,
|
||||
)
|
||||
|
||||
import litellm # type: ignore[import-not-found]
|
||||
|
||||
monkeypatch.setattr(litellm, "acompletion", _fake_acompletion_429)
|
||||
with pytest.raises(_RateLimitedError):
|
||||
await _preflight_llm(fake_llm)
|
||||
assert len(rate_calls) == 1
|
||||
assert rate_calls[0]["max_tokens"] == 1
|
||||
assert rate_calls[0]["stream"] is False
|
||||
|
||||
monkeypatch.setattr(litellm, "acompletion", _fake_acompletion_other)
|
||||
# MUST NOT raise: non-rate-limit failures are swallowed.
|
||||
await _preflight_llm(fake_llm)
|
||||
assert len(other_calls) == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_preflight_skipped_for_auto_router_model():
|
||||
"""Router-mode ``model='auto'`` has no single deployment to ping; the
|
||||
LiteLLM router itself owns per-deployment rate-limit accounting, so the
|
||||
preflight helper must short-circuit instead of issuing a probe."""
|
||||
from types import SimpleNamespace
|
||||
|
||||
from app.tasks.chat.stream_new_chat import _preflight_llm
|
||||
|
||||
fake_llm = SimpleNamespace(model="auto", api_key="x", api_base=None)
|
||||
# Should return without raising or making any LiteLLM call.
|
||||
await _preflight_llm(fake_llm)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_settle_speculative_agent_build_swallows_exceptions():
|
||||
"""``_settle_speculative_agent_build`` MUST always return cleanly so the
|
||||
caller can safely re-touch the request-scoped session afterwards.
|
||||
|
||||
The helper guards the parallel preflight + agent-build path: when the
|
||||
speculative build is being discarded (429 or non-429 preflight failure)
|
||||
we await it solely to release any in-flight ``AsyncSession`` usage —
|
||||
the build's outcome is irrelevant. Any exception (including
|
||||
``CancelledError``) leaking out would skip the caller's recovery flow
|
||||
and re-introduce the very session-concurrency hazard the helper exists
|
||||
to prevent.
|
||||
"""
|
||||
import asyncio
|
||||
|
||||
from app.tasks.chat.stream_new_chat import _settle_speculative_agent_build
|
||||
|
||||
async def _raises() -> None:
|
||||
raise RuntimeError("speculative build crashed")
|
||||
|
||||
async def _succeeds() -> str:
|
||||
return "agent"
|
||||
|
||||
async def _slow() -> None:
|
||||
await asyncio.sleep(0.05)
|
||||
|
||||
for coro in (_raises(), _succeeds(), _slow()):
|
||||
task = asyncio.create_task(coro)
|
||||
await _settle_speculative_agent_build(task)
|
||||
assert task.done()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_settle_speculative_agent_build_handles_already_done_task():
|
||||
"""Done tasks (success or failure) must still be settled without raising."""
|
||||
import asyncio
|
||||
|
||||
from app.tasks.chat.stream_new_chat import _settle_speculative_agent_build
|
||||
|
||||
async def _ok() -> str:
|
||||
return "ok"
|
||||
|
||||
async def _bad() -> None:
|
||||
raise ValueError("nope")
|
||||
|
||||
ok_task = asyncio.create_task(_ok())
|
||||
bad_task = asyncio.create_task(_bad())
|
||||
# Drive both to completion before settling.
|
||||
await asyncio.sleep(0)
|
||||
await asyncio.sleep(0)
|
||||
|
||||
await _settle_speculative_agent_build(ok_task)
|
||||
await _settle_speculative_agent_build(bad_task)
|
||||
assert ok_task.result() == "ok"
|
||||
# ``bad_task`` exception was consumed by the settle helper; calling
|
||||
# ``.exception()`` after the fact must still return the original error
|
||||
# (the helper observes it but doesn't clear it).
|
||||
assert isinstance(bad_task.exception(), ValueError)
|
||||
|
||||
|
||||
def test_stream_exception_classifies_thread_busy():
|
||||
exc = BusyError(request_id="thread-123")
|
||||
kind, code, severity, is_expected, user_message, extra = _classify_stream_exception(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue