From 7c1c394fe4768c05babc0330e2f8955e82167046 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Sat, 2 May 2026 02:07:44 +0530
Subject: [PATCH] feat(stream_new_chat): add lightweight LLM preflight probe
 for auto-pin

---
 .../unit/test_stream_new_chat_contract.py     | 62 +++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/surfsense_backend/tests/unit/test_stream_new_chat_contract.py b/surfsense_backend/tests/unit/test_stream_new_chat_contract.py
index ed69ca348..6a1b4c13b 100644
--- a/surfsense_backend/tests/unit/test_stream_new_chat_contract.py
+++ b/surfsense_backend/tests/unit/test_stream_new_chat_contract.py
@@ -175,6 +175,68 @@ def test_stream_exception_classifies_openrouter_429_payload():
     assert extra is None
 
 
+@pytest.mark.asyncio
+async def test_preflight_swallows_non_rate_limit_errors_and_re_raises_429(monkeypatch):
+    """``_preflight_llm`` is a best-effort probe; exercise both failure modes.
+
+    - A rate-limit shaped exception (provider 429) MUST propagate so the
+      caller can drive the cooldown/repin branch.
+    - Any other failure MUST be swallowed so the normal stream path
+      continues without surfacing preflight noise to the user.
+    """
+    from types import SimpleNamespace
+
+    from app.tasks.chat.stream_new_chat import _preflight_llm
+
+    class _RateLimitedExc(Exception):
+        """Class-name carries 'RateLimit' so _is_provider_rate_limited triggers."""
+
+    rate_calls: list[dict] = []
+    other_calls: list[dict] = []
+
+    async def _fake_acompletion_429(**kwargs):
+        rate_calls.append(kwargs)
+        raise _RateLimitedExc("simulated 429")
+
+    async def _fake_acompletion_other(**kwargs):
+        other_calls.append(kwargs)
+        raise RuntimeError("some unrelated transient failure")
+
+    fake_llm = SimpleNamespace(
+        model="openrouter/google/gemma-4-31b-it:free",
+        api_key="test",
+        api_base=None,
+    )
+
+    import litellm  # type: ignore[import-not-found]
+
+    monkeypatch.setattr(litellm, "acompletion", _fake_acompletion_429)
+    with pytest.raises(_RateLimitedExc):
+        await _preflight_llm(fake_llm)
+    assert len(rate_calls) == 1  # exactly one probe was issued
+    assert rate_calls[0]["max_tokens"] == 1  # probe asks for a single token
+    assert rate_calls[0]["stream"] is False  # and is non-streaming
+
+    monkeypatch.setattr(litellm, "acompletion", _fake_acompletion_other)
+    # MUST NOT raise: non-rate-limit failures are swallowed.
+    await _preflight_llm(fake_llm)
+    assert len(other_calls) == 1  # the probe did run; its error was absorbed
+
+
+@pytest.mark.asyncio
+async def test_preflight_skipped_for_auto_router_model(monkeypatch):
+    """Router-mode ``model='auto'`` has no single deployment to ping and the
+    LiteLLM router owns per-deployment rate limiting, so no probe may be sent."""
+    from types import SimpleNamespace
+    from unittest.mock import AsyncMock
+
+    from app.tasks.chat.stream_new_chat import _preflight_llm
+
+    monkeypatch.setattr("litellm.acompletion", probe := AsyncMock())
+    await _preflight_llm(SimpleNamespace(model="auto", api_key="x", api_base=None))
+    probe.assert_not_called()  # else a swallowed probe error would pass silently
+
+
 def test_stream_exception_classifies_thread_busy():
     exc = BusyError(request_id="thread-123")
     kind, code, severity, is_expected, user_message, extra = _classify_stream_exception(