diff --git a/surfsense_backend/app/services/auto_model_pin_service.py b/surfsense_backend/app/services/auto_model_pin_service.py
index f6a223866..b2acd6f56 100644
--- a/surfsense_backend/app/services/auto_model_pin_service.py
+++ b/surfsense_backend/app/services/auto_model_pin_service.py
@@ -34,6 +34,7 @@ logger = logging.getLogger(__name__)
 AUTO_FASTEST_ID = 0
 AUTO_FASTEST_MODE = "auto_fastest"
 _RUNTIME_COOLDOWN_SECONDS = 600
+_HEALTHY_TTL_SECONDS = 45
 
 # In-memory runtime cooldown map for configs that recently hard-failed at
 # provider runtime (e.g. OpenRouter 429 on a pinned free model). This keeps
@@ -41,6 +42,13 @@ _RUNTIME_COOLDOWN_SECONDS = 600
 _runtime_cooldown_until: dict[int, float] = {}
 _runtime_cooldown_lock = threading.Lock()
 
+# Short-TTL "recently healthy" cache for configs that just passed a runtime
+# preflight ping. Lets back-to-back turns on the same model skip the probe
+# without eroding correctness — entries auto-expire and are wiped any time
+# the same config is cooled down or the OR catalogue is refreshed.
+_healthy_until: dict[int, float] = {}
+_healthy_lock = threading.Lock()
+
 
 @dataclass
 class AutoPinResolution:
@@ -89,6 +97,9 @@ def mark_runtime_cooldown(
     with _runtime_cooldown_lock:
         _runtime_cooldown_until[int(config_id)] = until
         _prune_runtime_cooldowns()
+    # A cooled cfg can never be "recently healthy"; drop any stale credit so
+    # the next turn that resolves to it (after cooldown) re-runs preflight.
+    clear_healthy(int(config_id))
     logger.info(
         "auto_pin_runtime_cooled_down config_id=%s reason=%s cooldown_seconds=%s",
         config_id,
@@ -106,6 +117,61 @@ def clear_runtime_cooldown(config_id: int | None = None) -> None:
         _runtime_cooldown_until.pop(int(config_id), None)
 
 
+def _prune_healthy(now_ts: float | None = None) -> None:
+    """Drop every healthy-cache entry whose expiry is at or before ``now_ts``.
+
+    ``now_ts`` defaults to ``time.time()``. Callers in this module invoke this
+    while holding ``_healthy_lock``; the function does not take the lock itself.
+    """
+    now = time.time() if now_ts is None else now_ts
+    stale = [cid for cid, until in _healthy_until.items() if until <= now]
+    for cid in stale:
+        _healthy_until.pop(cid, None)
+
+
+def is_recently_healthy(config_id: int) -> bool:
+    """Return True if ``config_id`` passed preflight within the TTL window."""
+    with _healthy_lock:
+        _prune_healthy()
+        return int(config_id) in _healthy_until
+
+
+def mark_healthy(
+    config_id: int,
+    *,
+    ttl_seconds: float = _HEALTHY_TTL_SECONDS,
+) -> None:
+    """Record that ``config_id`` just passed a preflight probe.
+
+    Subsequent calls within ``ttl_seconds`` can skip the preflight ping. The
+    healthy state is intentionally process-local — it's a latency hint, not a
+    correctness primitive — so multi-worker drift is acceptable.
+
+    A non-positive ``ttl_seconds`` falls back to ``_HEALTHY_TTL_SECONDS``.
+    Fractional values are honoured as-is rather than truncated, so a
+    sub-second TTL does not produce an entry that is already expired.
+    """
+    if ttl_seconds <= 0:
+        ttl_seconds = _HEALTHY_TTL_SECONDS
+    until = time.time() + float(ttl_seconds)
+    with _healthy_lock:
+        _healthy_until[int(config_id)] = until
+        _prune_healthy()
+
+
+def clear_healthy(config_id: int | None = None) -> None:
+    """Drop one (or all) healthy-cache entries.
+
+    Called from runtime cooldown and OR catalogue refresh so a freshly cooled
+    or replaced config never carries stale "healthy" credit.
+    """
+    with _healthy_lock:
+        if config_id is None:
+            _healthy_until.clear()
+            return
+        _healthy_until.pop(int(config_id), None)
+
+
 def _global_candidates() -> list[dict]:
     """Return Auto-eligible global cfgs.
 
diff --git a/surfsense_backend/tests/unit/services/test_auto_model_pin_service.py b/surfsense_backend/tests/unit/services/test_auto_model_pin_service.py
index 8696a8829..d333f0b7a 100644
--- a/surfsense_backend/tests/unit/services/test_auto_model_pin_service.py
+++ b/surfsense_backend/tests/unit/services/test_auto_model_pin_service.py
@@ -6,7 +6,10 @@ from types import SimpleNamespace
 import pytest
 
 from app.services.auto_model_pin_service import (
+    clear_healthy,
     clear_runtime_cooldown,
+    is_recently_healthy,
+    mark_healthy,
     mark_runtime_cooldown,
     resolve_or_get_pinned_llm_config_id,
 )
@@ -17,8 +20,10 @@ pytestmark = pytest.mark.unit
 @pytest.fixture(autouse=True)
 def _clear_runtime_cooldown_map():
     clear_runtime_cooldown()
+    clear_healthy()
     yield
     clear_runtime_cooldown()
+    clear_healthy()
 
 
 @dataclass
@@ -866,3 +871,51 @@ async def test_auto_pin_repin_excludes_previous_config_on_runtime_retry(monkeypa
     )
     assert result.resolved_llm_config_id == -2
     assert result.from_existing_pin is False
+
+
+# ---------------------------------------------------------------------------
+# Healthy-status cache (preflight TTL companion)
+# ---------------------------------------------------------------------------
+
+
+def test_mark_healthy_then_is_recently_healthy_true_within_ttl():
+    mark_healthy(-42, ttl_seconds=60)
+    assert is_recently_healthy(-42) is True
+
+
+def test_healthy_expires_after_ttl(monkeypatch):
+    import app.services.auto_model_pin_service as svc
+
+    real_time = svc.time.time
+    base = real_time()
+
+    monkeypatch.setattr(svc.time, "time", lambda: base)
+    mark_healthy(-7, ttl_seconds=10)
+    assert is_recently_healthy(-7) is True
+
+    monkeypatch.setattr(svc.time, "time", lambda: base + 11)
+    assert is_recently_healthy(-7) is False
+
+
+def test_mark_runtime_cooldown_invalidates_healthy_cache():
+    mark_healthy(-9, ttl_seconds=60)
+    assert is_recently_healthy(-9) is True
+
+    mark_runtime_cooldown(-9, reason="test", cooldown_seconds=60)
+    assert is_recently_healthy(-9) is False
+
+
+def test_clear_healthy_removes_single_entry():
+    mark_healthy(-11, ttl_seconds=60)
+    mark_healthy(-12, ttl_seconds=60)
+    clear_healthy(-11)
+    assert is_recently_healthy(-11) is False
+    assert is_recently_healthy(-12) is True
+
+
+def test_clear_healthy_no_args_drops_all_entries():
+    mark_healthy(-21, ttl_seconds=60)
+    mark_healthy(-22, ttl_seconds=60)
+    clear_healthy()
+    assert is_recently_healthy(-21) is False
+    assert is_recently_healthy(-22) is False