feat(auto_pin): add short-TTL healthy-status cache for preflight reuse

This commit is contained in:
Anish Sarkar 2026-05-02 02:07:16 +05:30
parent 25ccc959cf
commit 14686cdf82
2 changed files with 110 additions and 0 deletions

View file

@ -34,6 +34,7 @@ logger = logging.getLogger(__name__)
AUTO_FASTEST_ID = 0
AUTO_FASTEST_MODE = "auto_fastest"
_RUNTIME_COOLDOWN_SECONDS = 600
# Default lifetime, in seconds, of a "recently healthy" cache entry; this is
# the default ``ttl_seconds`` of ``mark_healthy`` and its fallback when a
# non-positive TTL is passed.
_HEALTHY_TTL_SECONDS = 45
# In-memory runtime cooldown map for configs that recently hard-failed at
# provider runtime (e.g. OpenRouter 429 on a pinned free model). This keeps
@ -41,6 +42,13 @@ _RUNTIME_COOLDOWN_SECONDS = 600
# Keyed by int config id; the value is that config's cooldown ``until`` mark.
# ``mark_runtime_cooldown`` writes to it while holding
# ``_runtime_cooldown_lock``.
_runtime_cooldown_until: dict[int, float] = {}
_runtime_cooldown_lock = threading.Lock()
# Short-TTL "recently healthy" cache for configs that just passed a runtime
# preflight ping. Lets back-to-back turns on the same model skip the probe
# without eroding correctness — entries auto-expire and are wiped any time
# the same config is cooled down or the OR catalogue is refreshed.
#
# Maps int config id -> expiry timestamp on the ``time.time()`` scale. The
# public helpers (``is_recently_healthy``, ``mark_healthy``, ``clear_healthy``)
# take ``_healthy_lock`` before reading or mutating the map.
_healthy_until: dict[int, float] = {}
_healthy_lock = threading.Lock()
@dataclass
class AutoPinResolution:
@ -89,6 +97,9 @@ def mark_runtime_cooldown(
with _runtime_cooldown_lock:
_runtime_cooldown_until[int(config_id)] = until
_prune_runtime_cooldowns()
# A cooled cfg can never be "recently healthy"; drop any stale credit so
# the next turn that resolves to it (after cooldown) re-runs preflight.
clear_healthy(int(config_id))
logger.info(
"auto_pin_runtime_cooled_down config_id=%s reason=%s cooldown_seconds=%s",
config_id,
@ -106,6 +117,52 @@ def clear_runtime_cooldown(config_id: int | None = None) -> None:
_runtime_cooldown_until.pop(int(config_id), None)
def _prune_healthy(now_ts: float | None = None) -> None:
    """Evict every healthy-cache entry whose expiry is at or before "now".

    ``now_ts`` overrides the reference time; when it is ``None`` the current
    ``time.time()`` is used. This helper takes no lock itself: the callers in
    this module invoke it while already holding ``_healthy_lock``.
    """
    cutoff = now_ts if now_ts is not None else time.time()
    # Walk a snapshot of the keys so entries can be deleted without mutating
    # the dict that is being iterated.
    for cached_id in tuple(_healthy_until):
        if _healthy_until[cached_id] <= cutoff:
            del _healthy_until[cached_id]
def is_recently_healthy(config_id: int) -> bool:
    """Report whether ``config_id`` still holds an unexpired healthy mark.

    Expired entries are pruned first (under ``_healthy_lock``), so a ``True``
    result means the config was marked healthy and its TTL has not yet run
    out.
    """
    with _healthy_lock:
        _prune_healthy()
        still_cached = int(config_id) in _healthy_until
    return still_cached
def mark_healthy(
    config_id: int,
    *,
    ttl_seconds: int = _HEALTHY_TTL_SECONDS,
) -> None:
    """Remember that ``config_id`` has just passed a preflight probe.

    Until ``ttl_seconds`` elapse, ``is_recently_healthy`` returns ``True`` for
    this config so callers can skip the preflight ping. A non-positive
    ``ttl_seconds`` falls back to ``_HEALTHY_TTL_SECONDS``.

    The healthy state is intentionally process-local — it's a latency hint,
    not a correctness primitive — so multi-worker drift is acceptable.
    """
    effective_ttl = _HEALTHY_TTL_SECONDS if ttl_seconds <= 0 else ttl_seconds
    expires_at = time.time() + int(effective_ttl)
    with _healthy_lock:
        cache_key = int(config_id)
        _healthy_until[cache_key] = expires_at
        # Use the write as an opportunity to drop entries that have expired.
        _prune_healthy()
def clear_healthy(config_id: int | None = None) -> None:
    """Forget the healthy mark for one config, or for all of them.

    With ``config_id=None`` (the default) the whole cache is emptied;
    otherwise only that config's entry is removed, and a missing entry is
    ignored. ``mark_runtime_cooldown`` calls this so a freshly cooled config
    never keeps stale "healthy" credit; per the original notes it is also
    meant to run on OR catalogue refresh.
    """
    with _healthy_lock:
        if config_id is not None:
            _healthy_until.pop(int(config_id), None)
        else:
            _healthy_until.clear()
def _global_candidates() -> list[dict]:
"""Return Auto-eligible global cfgs.

View file

@ -6,7 +6,10 @@ from types import SimpleNamespace
import pytest
from app.services.auto_model_pin_service import (
clear_healthy,
clear_runtime_cooldown,
is_recently_healthy,
mark_healthy,
mark_runtime_cooldown,
resolve_or_get_pinned_llm_config_id,
)
@ -17,8 +20,10 @@ pytestmark = pytest.mark.unit
@pytest.fixture(autouse=True)
def _clear_runtime_cooldown_map():
    """Reset the cooldown map and the healthy cache around every test."""
    resetters = (clear_runtime_cooldown, clear_healthy)
    # Start from a clean slate ...
    for reset in resetters:
        reset()
    yield
    # ... and leave one behind for the next test.
    for reset in resetters:
        reset()
@dataclass
@ -866,3 +871,51 @@ async def test_auto_pin_repin_excludes_previous_config_on_runtime_retry(monkeypa
)
assert result.resolved_llm_config_id == -2
assert result.from_existing_pin is False
# ---------------------------------------------------------------------------
# Healthy-status cache (preflight TTL companion)
# ---------------------------------------------------------------------------
def test_mark_healthy_then_is_recently_healthy_true_within_ttl():
    """A config marked healthy is reported healthy while its TTL is live."""
    config_id = -42
    mark_healthy(config_id, ttl_seconds=60)
    reported = is_recently_healthy(config_id)
    assert reported is True
def test_healthy_expires_after_ttl(monkeypatch):
    """The healthy mark disappears once the clock moves past the TTL."""
    import app.services.auto_model_pin_service as svc

    start = svc.time.time()
    # One patched clock whose reading is advanced by mutating ``fake_clock``.
    fake_clock = {"now": start}
    monkeypatch.setattr(svc.time, "time", lambda: fake_clock["now"])

    config_id = -7
    mark_healthy(config_id, ttl_seconds=10)
    assert is_recently_healthy(config_id) is True

    # Jump one second beyond the 10-second TTL.
    fake_clock["now"] = start + 11
    assert is_recently_healthy(config_id) is False
def test_mark_runtime_cooldown_invalidates_healthy_cache():
    """Cooling a config down must strip its healthy mark."""
    config_id = -9
    mark_healthy(config_id, ttl_seconds=60)
    healthy_before = is_recently_healthy(config_id)
    assert healthy_before is True

    mark_runtime_cooldown(config_id, reason="test", cooldown_seconds=60)
    healthy_after = is_recently_healthy(config_id)
    assert healthy_after is False
def test_clear_healthy_removes_single_entry():
    """Clearing one id leaves every other healthy entry in place."""
    cleared_id, kept_id = -11, -12
    for config_id in (cleared_id, kept_id):
        mark_healthy(config_id, ttl_seconds=60)

    clear_healthy(cleared_id)

    assert is_recently_healthy(cleared_id) is False
    assert is_recently_healthy(kept_id) is True
def test_clear_healthy_no_args_drops_all_entries():
    """Calling ``clear_healthy()`` without an id empties the whole cache."""
    seeded_ids = (-21, -22)
    for config_id in seeded_ids:
        mark_healthy(config_id, ttl_seconds=60)

    clear_healthy()

    for config_id in seeded_ids:
        assert is_recently_healthy(config_id) is False