5 changed files with 40 additions and 100381 deletions
--- a/context_window.py
+++ b/context_window.py
@ -7,18 +7,6 @@ fits inside (n_ctx - safety_margin).
 Also owns the per-(endpoint, model) n_ctx cache that the routes populate from
 exceed_context_size_error bodies and from finish_reason=="length" signals.
 """
 import os
 # Point tiktoken at the vendored cl100k_base vocab so the encoding loads offline,
 # without a network download. The download would otherwise fail anyway: this repo
 # has a top-level `requests` package that shadows the pip `requests` tiktoken's
 # downloader imports, so get_encoding() would silently fall back to char/4. See
 # vendor/tiktoken/. setdefault lets an explicit env override win.
 os.environ.setdefault(
    "TIKTOKEN_CACHE_DIR",
    os.path.join(os.path.dirname(os.path.abspath(__file__)), "vendor", "tiktoken"),
 )
 try:
    import tiktoken as _tiktoken
    _tiktoken_enc = _tiktoken.get_encoding("cl100k_base")
--- a/requirements.txt
+++ b/requirements.txt
@ -1,5 +1,5 @@
 aiohappyeyeballs==2.6.1
-aiohttp==3.14.0
+aiohttp==3.13.5
 aiosignal==1.4.0
 annotated-types==0.7.0
 anyio==4.13.0
--- a/test/requirements_test.txt
+++ b/test/requirements_test.txt
@ -1,3 +1,4 @@
 pytest>=8.0
 pytest-asyncio>=0.24
 pytest-cov>=5.0
 aioresponses>=0.7
--- a/test/test_fetch.py
+++ b/test/test_fetch.py
@ -1,19 +1,11 @@
-"""Tests for fetch.available_models and fetch.loaded_models.
+"""Tests for fetch.available_models and fetch.loaded_models using aioresponses mocking."""
 The backend probes obtain their HTTP client via ``backends.probe.get_probe_session``
 and only ever call ``async with client.get(url, headers=...) as resp``. We patch that
 seam with a tiny fake session instead of mocking aiohttp's internals (aioresponses),
 so the suite stays independent of aiohttp's private ClientResponse/ConnectionKey
 structure across version bumps.
 """
 import time
 from contextlib import contextmanager
 from unittest.mock import patch, MagicMock
 import pytest
 from aioresponses import aioresponses
 import router
 import backends.probe as probe
 from conftest import TEST_OLLAMA, TEST_LLAMA
 MOCK_OLLAMA_EP = "http://mock-ollama:11434"
@ -30,73 +22,6 @@ def _make_cfg(ollama_eps=None, llama_eps=None, api_keys=None):
    return cfg
 # ── Fake probe session ────────────────────────────────────────────────────────
 class _MockResponse:
    """Minimal stand-in for the aiohttp response used by the probes."""
    def __init__(self, *, status=200, payload=None, text=None):
        self.status = status
        self._payload = payload
        self._text = text if text is not None else ""
    async def json(self):
        return self._payload
    async def text(self):
        return self._text
    async def __aenter__(self):
        return self
    async def __aexit__(self, *exc):
        return False
 class _RaisingCtx:
    """``async with client.get(...)`` that raises on entry — mimics a failed connection."""
    def __init__(self, exc):
        self._exc = exc
    async def __aenter__(self):
        raise self._exc
    async def __aexit__(self, *exc):
        return False
 class _MockProbeSession:
    """Stand-in for the aiohttp ClientSession returned by ``get_probe_session``.
    Routes are registered by exact URL via :meth:`add_get`. A registered exception
    is raised when the route is entered; otherwise a :class:`_MockResponse` is yielded.
    An unregistered GET fails loudly so tests can't silently pass on a wrong URL.
    """
    def __init__(self):
        self._routes = {}
    def add_get(self, url, *, status=200, payload=None, text=None, exception=None):
        self._routes[url] = exception if exception is not None else _MockResponse(
            status=status, payload=payload, text=text
        )
    def get(self, url, **kwargs):
        if url not in self._routes:
            raise AssertionError(f"unexpected probe GET {url}")
        entry = self._routes[url]
        return _RaisingCtx(entry) if isinstance(entry, Exception) else entry
@contextmanager
 def mock_probe():
    """Patch the probe's session factory to return a fresh :class:`_MockProbeSession`."""
    session = _MockProbeSession()
    with patch.object(probe, "get_probe_session", lambda endpoint: session):
        yield session
@pytest.fixture(autouse=True)
 def clear_caches(aio_session):
    """aio_session fixture already clears caches and sets up app_state."""
@ -106,8 +31,8 @@ def clear_caches(aio_session):
 class TestFetchAvailableModels:
    async def test_ollama_tags(self):
        cfg = _make_cfg(ollama_eps=[MOCK_OLLAMA_EP], llama_eps=[])
-        with patch.object(router, "config", cfg), mock_probe() as m:
+        with patch.object(router, "config", cfg), aioresponses() as m:
-            m.add_get(
+            m.get(
                f"{MOCK_OLLAMA_EP}/api/tags",
                payload={"models": [
                    {"name": "llama3.2:latest"},
@ -119,8 +44,8 @@ class TestFetchAvailableModels:
    async def test_openai_compatible_models_endpoint(self):
        cfg = _make_cfg(llama_eps=[MOCK_LLAMA_EP])
-        with patch.object(router, "config", cfg), mock_probe() as m:
+        with patch.object(router, "config", cfg), aioresponses() as m:
-            m.add_get(
+            m.get(
                f"{MOCK_LLAMA_EP}/models",
                payload={"data": [{"id": "unsloth/model:Q8_0"}]},
            )
@ -129,8 +54,8 @@ class TestFetchAvailableModels:
    async def test_caches_successful_result(self):
        cfg = _make_cfg(ollama_eps=[MOCK_OLLAMA_EP], llama_eps=[])
-        with patch.object(router, "config", cfg), mock_probe() as m:
+        with patch.object(router, "config", cfg), aioresponses() as m:
-            m.add_get(
+            m.get(
                f"{MOCK_OLLAMA_EP}/api/tags",
                payload={"models": [{"name": "llama3.2:latest"}]},
            )
@ -141,19 +66,20 @@ class TestFetchAvailableModels:
    async def test_returns_empty_on_http_500(self):
        cfg = _make_cfg(ollama_eps=[MOCK_OLLAMA_EP], llama_eps=[])
-        with patch.object(router, "config", cfg), mock_probe() as m:
+        with patch.object(router, "config", cfg), aioresponses() as m:
-            m.add_get(f"{MOCK_OLLAMA_EP}/api/tags", status=500, payload={"error": "oops"})
+            m.get(f"{MOCK_OLLAMA_EP}/api/tags", status=500, payload={"error": "oops"})
            models = await router.fetch.available_models(MOCK_OLLAMA_EP)
        assert models == set()
    async def test_returns_empty_on_connection_error(self):
        import aiohttp
        cfg = _make_cfg(ollama_eps=[MOCK_OLLAMA_EP], llama_eps=[])
-        with patch.object(router, "config", cfg), mock_probe() as m:
+        import aiohttp
-            m.add_get(
+        with patch.object(router, "config", cfg), aioresponses() as m:
            m.get(
                f"{MOCK_OLLAMA_EP}/api/tags",
-                exception=aiohttp.ClientConnectionError(
+                exception=aiohttp.ClientConnectorError(
-                    "Cannot connect to host mock-ollama:11434 [Connection refused]"
+                    connection_key=MagicMock(host="mock-ollama", port=11434),
                    os_error=OSError(111, "refused"),
                ),
            )
            models = await router.fetch.available_models(MOCK_OLLAMA_EP)
@ -161,8 +87,8 @@ class TestFetchAvailableModels:
    async def test_stale_cache_returned_while_refresh_runs(self):
        cfg = _make_cfg(ollama_eps=[MOCK_OLLAMA_EP], llama_eps=[])
-        with patch.object(router, "config", cfg), mock_probe() as m:
+        with patch.object(router, "config", cfg), aioresponses() as m:
-            m.add_get(
+            m.get(
                f"{MOCK_OLLAMA_EP}/api/tags",
                payload={"models": [{"name": "llama3.2:latest"}]},
            )
@ -173,8 +99,8 @@ class TestFetchAvailableModels:
            models, _ = router._models_cache[MOCK_OLLAMA_EP]
            router._models_cache[MOCK_OLLAMA_EP] = (models, time.time() - 400)
-        with patch.object(router, "config", cfg), mock_probe() as m:
+        with patch.object(router, "config", cfg), aioresponses() as m:
-            m.add_get(
+            m.get(
                f"{MOCK_OLLAMA_EP}/api/tags",
                payload={"models": [{"name": "llama3.2:latest"}]},
            )
@ -188,8 +114,8 @@ class TestFetchAvailableModels:
        async with router._available_error_cache_lock:
            router._available_error_cache[MOCK_OLLAMA_EP] = time.time()
-        with patch.object(router, "config", cfg), mock_probe():
+        with patch.object(router, "config", cfg), aioresponses():
-            # No route registered — if a call happens it raises AssertionError
+            # No HTTP mock registered — if a call happens it will raise
            models = await router.fetch.available_models(MOCK_OLLAMA_EP)
        assert models == set()
@ -197,8 +123,8 @@ class TestFetchAvailableModels:
 class TestFetchLoadedModels:
    async def test_ollama_ps(self):
        cfg = _make_cfg(ollama_eps=[MOCK_OLLAMA_EP], llama_eps=[])
-        with patch.object(router, "config", cfg), mock_probe() as m:
+        with patch.object(router, "config", cfg), aioresponses() as m:
-            m.add_get(
+            m.get(
                f"{MOCK_OLLAMA_EP}/api/ps",
                payload={"models": [{"name": "llama3.2:latest"}]},
            )
@ -207,8 +133,8 @@ class TestFetchLoadedModels:
    async def test_llama_server_filters_loaded(self):
        cfg = _make_cfg(llama_eps=[MOCK_LLAMA_EP])
-        with patch.object(router, "config", cfg), mock_probe() as m:
+        with patch.object(router, "config", cfg), aioresponses() as m:
-            m.add_get(
+            m.get(
                f"{MOCK_LLAMA_EP}/models",
                payload={"data": [
                    {"id": "model-a", "status": {"value": "loaded"}},
@ -220,8 +146,8 @@ class TestFetchLoadedModels:
    async def test_llama_server_no_status_field_always_loaded(self):
        cfg = _make_cfg(llama_eps=[MOCK_LLAMA_EP])
-        with patch.object(router, "config", cfg), mock_probe() as m:
+        with patch.object(router, "config", cfg), aioresponses() as m:
-            m.add_get(
+            m.get(
                f"{MOCK_LLAMA_EP}/models",
                payload={"data": [{"id": "always-on-model"}]},
            )
@ -230,8 +156,8 @@ class TestFetchLoadedModels:
    async def test_returns_empty_on_error(self):
        cfg = _make_cfg(ollama_eps=[MOCK_OLLAMA_EP], llama_eps=[])
-        with patch.object(router, "config", cfg), mock_probe() as m:
+        with patch.object(router, "config", cfg), aioresponses() as m:
-            m.add_get(f"{MOCK_OLLAMA_EP}/api/ps", status=503, payload={})
+            m.get(f"{MOCK_OLLAMA_EP}/api/ps", status=503, payload={})
            models = await router.fetch.loaded_models(MOCK_OLLAMA_EP)
        assert models == set()
@ -244,8 +170,8 @@ class TestFetchLoadedModels:
    async def test_caches_result(self):
        cfg = _make_cfg(ollama_eps=[MOCK_OLLAMA_EP], llama_eps=[])
-        with patch.object(router, "config", cfg), mock_probe() as m:
+        with patch.object(router, "config", cfg), aioresponses() as m:
-            m.add_get(
+            m.get(
                f"{MOCK_OLLAMA_EP}/api/ps",
                payload={"models": [{"name": "qwen:7b"}]},
            )
@ -257,15 +183,15 @@ class TestFetchLoadedModels:
        # Regression: issue #83 — /api/ps failures must be recorded so
        # `choose_endpoint` can exclude unhealthy backends from routing.
        cfg = _make_cfg(ollama_eps=[MOCK_OLLAMA_EP], llama_eps=[])
-        with patch.object(router, "config", cfg), mock_probe() as m:
+        with patch.object(router, "config", cfg), aioresponses() as m:
-            m.add_get(f"{MOCK_OLLAMA_EP}/api/ps", status=502, payload={})
+            m.get(f"{MOCK_OLLAMA_EP}/api/ps", status=502, payload={})
            await router.fetch.loaded_models(MOCK_OLLAMA_EP)
        assert MOCK_OLLAMA_EP in router._loaded_error_cache
    async def test_records_error_for_llama_server_on_failure(self):
        cfg = _make_cfg(ollama_eps=[], llama_eps=[MOCK_LLAMA_EP])
-        with patch.object(router, "config", cfg), mock_probe() as m:
+        with patch.object(router, "config", cfg), aioresponses() as m:
-            m.add_get(f"{MOCK_LLAMA_EP}/models", status=502, payload={})
+            m.get(f"{MOCK_LLAMA_EP}/models", status=502, payload={})
            await router.fetch.loaded_models(MOCK_LLAMA_EP)
        assert MOCK_LLAMA_EP in router._loaded_error_cache
@ -275,8 +201,8 @@ class TestFetchLoadedModels:
        # network probe instead of short-circuiting on the error cache.
        async with router._loaded_error_cache_lock:
            router._loaded_error_cache[MOCK_OLLAMA_EP] = time.time() - 301
-        with patch.object(router, "config", cfg), mock_probe() as m:
+        with patch.object(router, "config", cfg), aioresponses() as m:
-            m.add_get(
+            m.get(
                f"{MOCK_OLLAMA_EP}/api/ps",
                payload={"models": [{"name": "qwen:7b"}]},
            )
--- a/vendor/tiktoken/9b5ad71b2ce5302211f9c61530b329a4922fc6a4
+++ b/vendor/tiktoken/9b5ad71b2ce5302211f9c61530b329a4922fc6a4