From 9bfba34e8e76554314dc857cce1e2757a7910993 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Tue, 19 May 2026 21:29:56 +0200
Subject: [PATCH 01/34] perf(mcp): add per-call, discovery, and oauth-refresh
 timing logs

---
 .../app/agents/new_chat/tools/mcp_tool.py     | 76 ++++++++++++++++++-
 1 file changed, 72 insertions(+), 4 deletions(-)

diff --git a/surfsense_backend/app/agents/new_chat/tools/mcp_tool.py b/surfsense_backend/app/agents/new_chat/tools/mcp_tool.py
index 64368a878..b3c26f331 100644
--- a/surfsense_backend/app/agents/new_chat/tools/mcp_tool.py
+++ b/surfsense_backend/app/agents/new_chat/tools/mcp_tool.py
@@ -38,6 +38,9 @@ from app.agents.new_chat.tools.hitl import request_approval
 from app.agents.new_chat.tools.mcp_client import MCPClient
 from app.db import SearchSourceConnector
 from app.services.mcp_oauth.registry import MCP_SERVICES, get_service_by_connector_type
+from app.utils.perf import get_perf_logger
+
+_perf_log = get_perf_logger()
 
 logger = logging.getLogger(__name__)
 
@@ -293,15 +296,21 @@ async def _create_mcp_tool_from_definition_http(
         timeout: float = 60.0,
     ) -> str:
         """Execute a single MCP HTTP call with the given headers."""
+        call_start = time.perf_counter()
         async with (
             streamablehttp_client(url, headers=call_headers) as (read, write, _),
             ClientSession(read, write) as session,
         ):
+            init_start = time.perf_counter()
             await session.initialize()
+            init_elapsed = time.perf_counter() - init_start
+
+            tool_start = time.perf_counter()
             response = await asyncio.wait_for(
                 session.call_tool(original_tool_name, arguments=call_kwargs),
                 timeout=timeout,
             )
+            tool_elapsed = time.perf_counter() - tool_start
 
             result = []
             for content in response.content:
@@ -312,7 +321,18 @@ async def _create_mcp_tool_from_definition_http(
                 else:
                     result.append(str(content))
 
-            return "\n".join(result) if result else ""
+            payload = "\n".join(result) if result else ""
+
+        _perf_log.info(
+            "[mcp_http_call] connector=%s tool=%s init=%.3fs call=%.3fs total=%.3fs out_chars=%d",
+            connector_id,
+            original_tool_name,
+            init_elapsed,
+            tool_elapsed,
+            time.perf_counter() - call_start,
+            len(payload),
+        )
+        return payload
 
     async def mcp_http_tool_call(**kwargs) -> str:
         """Execute the MCP tool call via HTTP transport."""
@@ -792,14 +812,25 @@ async def _maybe_refresh_mcp_oauth_token(
     except (ValueError, TypeError):
         return server_config
 
+    refresh_start = time.perf_counter()
     try:
         new_access = await _refresh_connector_token(session, connector)
         if not new_access:
+            _perf_log.info(
+                "[mcp_oauth_refresh] connector=%s elapsed=%.3fs outcome=no_token",
+                connector.id,
+                time.perf_counter() - refresh_start,
+            )
             return server_config
 
         logger.info(
             "Proactively refreshed MCP OAuth token for connector %s", connector.id
         )
+        _perf_log.info(
+            "[mcp_oauth_refresh] connector=%s elapsed=%.3fs outcome=refreshed",
+            connector.id,
+            time.perf_counter() - refresh_start,
+        )
 
         refreshed_config = dict(server_config)
         refreshed_config["headers"] = {
@@ -809,6 +840,11 @@ async def _maybe_refresh_mcp_oauth_token(
         return refreshed_config
 
     except Exception:
+        _perf_log.info(
+            "[mcp_oauth_refresh] connector=%s elapsed=%.3fs outcome=failed",
+            connector.id,
+            time.perf_counter() - refresh_start,
+        )
         logger.warning(
             "Failed to refresh MCP OAuth token for connector %s",
             connector.id,
@@ -1074,9 +1110,11 @@ async def load_mcp_tools(
                 )
 
         async def _discover_one(task: dict[str, Any]) -> list[StructuredTool]:
+            discover_start = time.perf_counter()
+            transport = task["transport"]
             try:
-                if task["transport"] in ("streamable-http", "http", "sse"):
-                    return await asyncio.wait_for(
+                if transport in ("streamable-http", "http", "sse"):
+                    result = await asyncio.wait_for(
                         _load_http_mcp_tools(
                             task["connector_id"],
                             task["connector_name"],
@@ -1091,7 +1129,7 @@ async def load_mcp_tools(
                         timeout=_MCP_DISCOVERY_TIMEOUT_SECONDS,
                     )
                 else:
-                    return await asyncio.wait_for(
+                    result = await asyncio.wait_for(
                         _load_stdio_mcp_tools(
                             task["connector_id"],
                             task["connector_name"],
@@ -1101,7 +1139,23 @@ async def load_mcp_tools(
                         ),
                         timeout=_MCP_DISCOVERY_TIMEOUT_SECONDS,
                     )
+                _perf_log.info(
+                    "[mcp_discover] connector=%s name=%r transport=%s tools=%d elapsed=%.3fs",
+                    task["connector_id"],
+                    task["connector_name"],
+                    transport,
+                    len(result),
+                    time.perf_counter() - discover_start,
+                )
+                return result
             except TimeoutError:
+                _perf_log.info(
+                    "[mcp_discover] connector=%s name=%r transport=%s elapsed=%.3fs outcome=timeout",
+                    task["connector_id"],
+                    task["connector_name"],
+                    transport,
+                    time.perf_counter() - discover_start,
+                )
                 logger.error(
                     "MCP connector %d timed out after %ds during discovery",
                     task["connector_id"],
@@ -1109,6 +1163,13 @@ async def load_mcp_tools(
                 )
                 return []
             except Exception as e:
+                _perf_log.info(
+                    "[mcp_discover] connector=%s name=%r transport=%s elapsed=%.3fs outcome=error",
+                    task["connector_id"],
+                    task["connector_name"],
+                    transport,
+                    time.perf_counter() - discover_start,
+                )
                 logger.exception(
                     "Failed to load tools from MCP connector %d: %s",
                     task["connector_id"],
@@ -1116,7 +1177,14 @@ async def load_mcp_tools(
                 )
                 return []
 
+        gather_start = time.perf_counter()
         results = await asyncio.gather(*[_discover_one(t) for t in discovery_tasks])
+        _perf_log.info(
+            "[mcp_discover] gather_wall=%.3fs connectors=%d total_tools=%d",
+            time.perf_counter() - gather_start,
+            len(discovery_tasks),
+            sum(len(r) for r in results),
+        )
         tools: list[StructuredTool] = [tool for sublist in results for tool in sublist]
 
         _mcp_tools_cache[cache_key] = (now, tools)

From 9e81f2a35bfeb6d2893b1707423cd3186d8b0bb6 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Tue, 19 May 2026 21:30:01 +0200
Subject: [PATCH 02/34] perf(subagent): add subagent compile timing log

---
 .../middleware.py                             | 51 ++++++++++++++++---
 1 file changed, 43 insertions(+), 8 deletions(-)

diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/middleware.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/middleware.py
index 8f51ffed7..0119752c1 100644
--- a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/middleware.py
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/middleware.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import time
 from typing import Any, cast
 
 from deepagents.backends.protocol import BackendFactory, BackendProtocol
@@ -15,8 +16,12 @@ from langchain.agents import create_agent
 from langchain.chat_models import init_chat_model
 from langgraph.types import Checkpointer
 
+from app.utils.perf import get_perf_logger
+
 from .task_tool import build_task_tool_with_parent_config
 
+_perf_log = get_perf_logger()
+
 
 class SurfSenseCheckpointedSubAgentMiddleware(SubAgentMiddleware):
     """``SubAgentMiddleware`` variant that compiles each subagent against the parent checkpointer."""
@@ -54,8 +59,11 @@ class SurfSenseCheckpointedSubAgentMiddleware(SubAgentMiddleware):
     def _surf_compile_subagent_graphs(self) -> list[dict[str, Any]]:
         """Mirror of ``SubAgentMiddleware._get_subagents`` that threads the parent checkpointer."""
         specs: list[dict[str, Any]] = []
+        loop_start = time.perf_counter()
+        timings: list[tuple[str, float, str]] = []  # (name, elapsed, source)
 
         for spec in self._subagents:
+            spec_start = time.perf_counter()
             if "runnable" in spec:
                 compiled = cast(CompiledSubAgent, spec)
                 specs.append(
@@ -65,6 +73,9 @@ class SurfSenseCheckpointedSubAgentMiddleware(SubAgentMiddleware):
                         "runnable": compiled["runnable"],
                     }
                 )
+                timings.append(
+                    (compiled["name"], time.perf_counter() - spec_start, "precompiled")
+                )
                 continue
 
             if "model" not in spec:
@@ -79,20 +90,44 @@ class SurfSenseCheckpointedSubAgentMiddleware(SubAgentMiddleware):
                 model = init_chat_model(model)
 
             middleware: list[Any] = list(spec.get("middleware", []))
+            tools_count = len(spec.get("tools") or [])
+            mw_count = len(middleware)
 
+            compile_start = time.perf_counter()
+            runnable = create_agent(
+                model,
+                system_prompt=spec["system_prompt"],
+                tools=spec["tools"],
+                middleware=middleware,
+                name=spec["name"],
+                checkpointer=self._surf_checkpointer,
+            )
+            compile_elapsed = time.perf_counter() - compile_start
             specs.append(
                 {
                     "name": spec["name"],
                     "description": spec["description"],
-                    "runnable": create_agent(
-                        model,
-                        system_prompt=spec["system_prompt"],
-                        tools=spec["tools"],
-                        middleware=middleware,
-                        name=spec["name"],
-                        checkpointer=self._surf_checkpointer,
-                    ),
+                    "runnable": runnable,
                 }
             )
+            timings.append(
+                (
+                    spec["name"],
+                    compile_elapsed,
+                    f"compiled tools={tools_count} mw={mw_count}",
+                )
+            )
+
+        total_elapsed = time.perf_counter() - loop_start
+        per_subagent = ", ".join(
+            f"{name}={elapsed * 1000:.0f}ms[{source}]"
+            for name, elapsed, source in timings
+        )
+        _perf_log.info(
+            "[subagent_compile] total=%.3fs count=%d details=[%s]",
+            total_elapsed,
+            len(timings),
+            per_subagent,
+        )
 
         return specs

From 33bfce4406b906191649792d61e3763520ff28e4 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Tue, 19 May 2026 21:30:05 +0200
Subject: [PATCH 03/34] perf(subagent): add atask EXIT breakdown timing log

---
 .../task_tool.py                              | 103 +++++++++++++-----
 1 file changed, 78 insertions(+), 25 deletions(-)

diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/task_tool.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/task_tool.py
index f9b316e23..f6a9ff146 100644
--- a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/task_tool.py
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/task_tool.py
@@ -9,6 +9,7 @@ re-raises any new pending interrupt back to the parent.
 from __future__ import annotations
 
 import logging
+import time
 from typing import Annotated, Any, NoReturn
 
 from deepagents.middleware.subagents import TASK_TOOL_DESCRIPTION
@@ -19,6 +20,8 @@ from langchain_core.tools import StructuredTool
 from langgraph.errors import GraphInterrupt
 from langgraph.types import Command, Interrupt
 
+from app.utils.perf import get_perf_logger
+
 from .config import (
     consume_surfsense_resume,
     drain_parent_null_resume,
@@ -35,6 +38,7 @@ from .resume import (
 )
 
 logger = logging.getLogger(__name__)
+_perf_log = get_perf_logger()
 
 
 def _reraise_stamped_subagent_interrupt(
@@ -209,6 +213,7 @@ def build_task_tool_with_parent_config(
         ],
         runtime: ToolRuntime,
     ) -> str | Command:
+        atask_start = time.perf_counter()
         logger.info(
             "[hitl_route] atask ENTRY: subagent_type=%r tool_call_id=%s",
             subagent_type,
@@ -230,8 +235,10 @@ def build_task_tool_with_parent_config(
         # Resume bridge — see ``task`` above.
         pending_id: str | None = None
         pending_value: Any = None
+        aget_state_elapsed = 0.0
         aget_state = getattr(subagent, "aget_state", None)
         if callable(aget_state):
+            aget_state_start = time.perf_counter()
             try:
                 snapshot = await aget_state(sub_config)
                 pending_id, pending_value = get_first_pending_subagent_interrupt(
@@ -248,32 +255,78 @@ def build_task_tool_with_parent_config(
                     "Subagent aget_state failed; falling back to fresh ainvoke",
                     exc_info=True,
                 )
+            finally:
+                aget_state_elapsed = time.perf_counter() - aget_state_start
 
-        if pending_value is not None:
-            resume_value = consume_surfsense_resume(runtime)
-            if resume_value is None:
-                raise RuntimeError(
-                    f"Subagent {subagent_type!r} has a pending interrupt but no "
-                    "surfsense_resume_value on config; resume bridge is broken."
-                )
-            expected = hitlrequest_action_count(pending_value)
-            resume_value = fan_out_decisions_to_match(resume_value, expected)
-            # Prevent the parent's resume payload from leaking into subagent
-            # interrupts via langgraph's parent_scratchpad fallback.
-            drain_parent_null_resume(runtime)
-            try:
-                result = await subagent.ainvoke(
-                    build_resume_command(resume_value, pending_id),
-                    config=sub_config,
-                )
-            except GraphInterrupt as gi:
-                _reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
-        else:
-            try:
-                result = await subagent.ainvoke(subagent_state, config=sub_config)
-            except GraphInterrupt as gi:
-                _reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
-        return _return_command_with_state_update(result, runtime.tool_call_id)
+        invoke_path = "resume" if pending_value is not None else "fresh"
+        ainvoke_start = time.perf_counter()
+        ainvoke_outcome = "ok"
+        try:
+            if pending_value is not None:
+                resume_value = consume_surfsense_resume(runtime)
+                if resume_value is None:
+                    raise RuntimeError(
+                        f"Subagent {subagent_type!r} has a pending interrupt but no "
+                        "surfsense_resume_value on config; resume bridge is broken."
+                    )
+                expected = hitlrequest_action_count(pending_value)
+                resume_value = fan_out_decisions_to_match(resume_value, expected)
+                # Prevent the parent's resume payload from leaking into subagent
+                # interrupts via langgraph's parent_scratchpad fallback.
+                drain_parent_null_resume(runtime)
+                try:
+                    result = await subagent.ainvoke(
+                        build_resume_command(resume_value, pending_id),
+                        config=sub_config,
+                    )
+                except GraphInterrupt as gi:
+                    ainvoke_outcome = "interrupted"
+                    _perf_log.info(
+                        "[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s "
+                        "aget_state=%.3fs ainvoke=%.3fs total=%.3fs",
+                        subagent_type,
+                        invoke_path,
+                        ainvoke_outcome,
+                        aget_state_elapsed,
+                        time.perf_counter() - ainvoke_start,
+                        time.perf_counter() - atask_start,
+                    )
+                    _reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
+            else:
+                try:
+                    result = await subagent.ainvoke(subagent_state, config=sub_config)
+                except GraphInterrupt as gi:
+                    ainvoke_outcome = "interrupted"
+                    _perf_log.info(
+                        "[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s "
+                        "aget_state=%.3fs ainvoke=%.3fs total=%.3fs",
+                        subagent_type,
+                        invoke_path,
+                        ainvoke_outcome,
+                        aget_state_elapsed,
+                        time.perf_counter() - ainvoke_start,
+                        time.perf_counter() - atask_start,
+                    )
+                    _reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
+            ainvoke_elapsed = time.perf_counter() - ainvoke_start
+        except GraphInterrupt:
+            raise
+
+        merge_start = time.perf_counter()
+        cmd = _return_command_with_state_update(result, runtime.tool_call_id)
+        merge_elapsed = time.perf_counter() - merge_start
+        _perf_log.info(
+            "[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s "
+            "aget_state=%.3fs ainvoke=%.3fs merge=%.3fs total=%.3fs",
+            subagent_type,
+            invoke_path,
+            ainvoke_outcome,
+            aget_state_elapsed,
+            ainvoke_elapsed,
+            merge_elapsed,
+            time.perf_counter() - atask_start,
+        )
+        return cmd
 
     return StructuredTool.from_function(
         name="task",

From bd153d3cdb9d0a4cf6086754189c01627a2d61f3 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Tue, 19 May 2026 21:30:09 +0200
Subject: [PATCH 04/34] perf(multi-agent): add kb_context_projection timing log

---
 .../shared/kb_context_projection.py           | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/kb_context_projection.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/kb_context_projection.py
index 01c77bb84..e8a4c9899 100644
--- a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/kb_context_projection.py
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/kb_context_projection.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import time
 from typing import Any
 
 from langchain.agents.middleware import AgentMiddleware, AgentState
@@ -10,6 +11,9 @@ from langgraph.runtime import Runtime
 
 from app.agents.new_chat.filesystem_state import SurfSenseFilesystemState
 from app.agents.new_chat.middleware.knowledge_search import _render_priority_message
+from app.utils.perf import get_perf_logger
+
+_perf_log = get_perf_logger()
 
 
 class KbContextProjectionMiddleware(AgentMiddleware):  # type: ignore[type-arg]
@@ -30,17 +34,34 @@ class KbContextProjectionMiddleware(AgentMiddleware):  # type: ignore[type-arg]
         runtime: Runtime[Any],
     ) -> dict[str, Any] | None:
         del runtime
+        start = time.perf_counter()
         tree_text = state.get("workspace_tree_text")
         priority = state.get("kb_priority")
         if not tree_text and not priority:
+            _perf_log.info(
+                "[kb_context_projection] tree=0 priority=0 elapsed=%.3fs",
+                time.perf_counter() - start,
+            )
             return None
 
         messages = list(state.get("messages") or [])
         insert_at = max(len(messages) - 1, 0)
+        tree_chars = 0
         if tree_text:
+            tree_chars = len(tree_text)
             messages.insert(insert_at, SystemMessage(content=tree_text))
+        priority_count = 0
         if priority:
+            priority_count = (
+                len(priority) if hasattr(priority, "__len__") else 1
+            )
             messages.insert(insert_at, _render_priority_message(priority))
+        _perf_log.info(
+            "[kb_context_projection] tree_chars=%d priority_items=%d elapsed=%.3fs",
+            tree_chars,
+            priority_count,
+            time.perf_counter() - start,
+        )
         return {"messages": messages}
 
 

From 1df40fbe3136c2563558b76c10a705292f50193a Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Tue, 19 May 2026 21:30:14 +0200
Subject: [PATCH 05/34] perf(new-chat): add knowledge_tree middleware timing
 log

---
 .../new_chat/middleware/knowledge_tree.py       | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/surfsense_backend/app/agents/new_chat/middleware/knowledge_tree.py b/surfsense_backend/app/agents/new_chat/middleware/knowledge_tree.py
index 4ed1e53f4..6bd6430d1 100644
--- a/surfsense_backend/app/agents/new_chat/middleware/knowledge_tree.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/knowledge_tree.py
@@ -24,6 +24,7 @@ from __future__ import annotations
 
 import asyncio
 import logging
+import time
 from typing import Any
 
 from langchain.agents.middleware import AgentMiddleware, AgentState
@@ -41,6 +42,9 @@ from app.agents.new_chat.path_resolver import (
     doc_to_virtual_path,
 )
 from app.db import Document, shielded_async_session
+from app.utils.perf import get_perf_logger
+
+_perf_log = get_perf_logger()
 
 try:
     from litellm import token_counter
@@ -124,6 +128,7 @@ class KnowledgeTreeMiddleware(AgentMiddleware):  # type: ignore[type-arg]
         if self.filesystem_mode != FilesystemMode.CLOUD:
             return None
 
+        start = time.perf_counter()
         update: dict[str, Any] = {}
         if not state.get("cwd"):
             update["cwd"] = DOCUMENTS_ROOT
@@ -131,7 +136,11 @@ class KnowledgeTreeMiddleware(AgentMiddleware):  # type: ignore[type-arg]
         anon_doc = state.get("kb_anon_doc")
         if anon_doc:
             tree_msg = self._render_anon_tree(anon_doc)
+            cache_outcome = "anon"
         else:
+            version = int(state.get("tree_version") or 0)
+            cache_key = (self.search_space_id, version, False)
+            cache_outcome = "hit" if cache_key in self._cache else "miss"
             tree_msg = await self._render_kb_tree(state)
 
         update["workspace_tree_text"] = tree_msg
@@ -141,6 +150,14 @@ class KnowledgeTreeMiddleware(AgentMiddleware):  # type: ignore[type-arg]
             insert_at = max(len(messages) - 1, 0)
             messages.insert(insert_at, SystemMessage(content=tree_msg))
             update["messages"] = messages
+
+        _perf_log.info(
+            "[knowledge_tree] cache=%s chars=%d elapsed=%.3fs space=%d",
+            cache_outcome,
+            len(tree_msg),
+            time.perf_counter() - start,
+            self.search_space_id,
+        )
         return update
 
     def before_agent(  # type: ignore[override]

From b3b66e4c487bc899a0e58efca8b72b127861a976 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Tue, 19 May 2026 21:30:19 +0200
Subject: [PATCH 06/34] perf(new-chat): add memory_injection middleware timing
 log

---
 .../new_chat/middleware/memory_injection.py   | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/surfsense_backend/app/agents/new_chat/middleware/memory_injection.py b/surfsense_backend/app/agents/new_chat/middleware/memory_injection.py
index 6179adccd..659d1bea2 100644
--- a/surfsense_backend/app/agents/new_chat/middleware/memory_injection.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/memory_injection.py
@@ -8,6 +8,7 @@ Injects memory markdown into the system prompt on every turn:
 from __future__ import annotations
 
 import logging
+import time
 from typing import Any
 from uuid import UUID
 
@@ -19,8 +20,10 @@ from sqlalchemy.ext.asyncio import AsyncSession
 
 from app.agents.new_chat.tools.update_memory import MEMORY_HARD_LIMIT, MEMORY_SOFT_LIMIT
 from app.db import ChatVisibility, SearchSpace, User, shielded_async_session
+from app.utils.perf import get_perf_logger
 
 logger = logging.getLogger(__name__)
+_perf_log = get_perf_logger()
 
 
 class MemoryInjectionMiddleware(AgentMiddleware):  # type: ignore[type-arg]
@@ -53,9 +56,13 @@ class MemoryInjectionMiddleware(AgentMiddleware):  # type: ignore[type-arg]
         if not isinstance(last_message, HumanMessage):
             return None
 
+        start = time.perf_counter()
+        db_elapsed = 0.0
         memory_blocks: list[str] = []
+        scope = "team" if self.visibility == ChatVisibility.SEARCH_SPACE else "user"
 
         async with shielded_async_session() as session:
+            db_start = time.perf_counter()
             if self.visibility == ChatVisibility.SEARCH_SPACE:
                 team_memory = await self._load_team_memory(session)
                 if team_memory:
@@ -96,7 +103,15 @@ class MemoryInjectionMiddleware(AgentMiddleware):  # type: ignore[type-arg]
                             f"</memory_warning>"
                         )
 
+        db_elapsed = time.perf_counter() - db_start
+
         if not memory_blocks:
+            _perf_log.info(
+                "[memory_injection] scope=%s injected=0 db=%.3fs total=%.3fs",
+                scope,
+                db_elapsed,
+                time.perf_counter() - start,
+            )
             return None
 
         memory_text = "\n\n".join(memory_blocks)
@@ -106,6 +121,13 @@ class MemoryInjectionMiddleware(AgentMiddleware):  # type: ignore[type-arg]
         insert_idx = 1 if len(new_messages) > 1 else 0
         new_messages.insert(insert_idx, memory_msg)
 
+        _perf_log.info(
+            "[memory_injection] scope=%s injected=1 chars=%d db=%.3fs total=%.3fs",
+            scope,
+            len(memory_text),
+            db_elapsed,
+            time.perf_counter() - start,
+        )
         return {"messages": new_messages}
 
     async def _load_user_memory(

From 581bbfb5c1301d84afa1f6c402ac724657894cd1 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Tue, 19 May 2026 21:30:25 +0200
Subject: [PATCH 07/34] perf(tokens): add per-call latency to capture log

---
 .../app/services/token_tracking_service.py          | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/surfsense_backend/app/services/token_tracking_service.py b/surfsense_backend/app/services/token_tracking_service.py
index 9406d9be4..58d06ba31 100644
--- a/surfsense_backend/app/services/token_tracking_service.py
+++ b/surfsense_backend/app/services/token_tracking_service.py
@@ -357,9 +357,19 @@ class TokenTrackingCallback(CustomLogger):
             cost_micros=cost_micros,
             call_kind=call_kind,
         )
+
+        # Per-LLM-call wall-clock latency (LiteLLM passes datetime objects).
+        call_latency_s: float | None = None
+        try:
+            if start_time is not None and end_time is not None:
+                delta = end_time - start_time
+                call_latency_s = getattr(delta, "total_seconds", lambda: float(delta))()
+        except Exception:
+            call_latency_s = None
+
         logger.info(
             "[TokenTracking] Captured: model=%s kind=%s prompt=%d completion=%d total=%d "
-            "cost=$%.6f (%d micros) (accumulator now has %d calls)",
+            "cost=$%.6f (%d micros) (accumulator now has %d calls)%s",
             model,
             call_kind,
             prompt_tokens,
@@ -368,6 +378,7 @@ class TokenTrackingCallback(CustomLogger):
             cost_usd,
             cost_micros,
             len(acc.calls),
+            f" latency={call_latency_s:.3f}s" if call_latency_s is not None else "",
         )
 
 

From 3a5e16e868b152fbfe895290f99f9b8078c6a2ae Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Tue, 19 May 2026 21:30:28 +0200
Subject: [PATCH 08/34] perf(calendar): stop echoing raw events into
 evidence.items

---
 .../subagents/connectors/calendar/system_prompt.md            | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/system_prompt.md
index 5b27c18ba..a663f5b37 100644
--- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/system_prompt.md
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/system_prompt.md
@@ -29,7 +29,7 @@ You are a Google Calendar specialist for the user's connected calendar.
 | `error`                     | `error`       | Relay the tool's `message` verbatim as `next_step`.                                                                          |
 | tool raises / unknown       | `error`       | `"Calendar tool failed unexpectedly. Ask the user to retry shortly."`                                                        |
 
-Surface the tool's `event_id`, `title` / `summary`, `start_at`, `end_at`, and `html_link` inside `evidence` when the tool returned them. For `search_calendar_events`, place the raw `events` array inside `evidence.items`. Never invent a field the tool did not return.
+Surface the tool's `event_id`, `title` / `summary`, `start_at`, `end_at`, and `html_link` inside `evidence` when the tool returned them. For `search_calendar_events`, set `evidence.items` to `{ "total": N }` (N = total number of matched events) and list the matched events in `action_summary` (title, date, start time; one line per event; up to 10 entries, then `"...and M more"`, where M is the number of matched events not listed). Never invent a field the tool did not return.
 
 ## Examples
 
@@ -115,7 +115,7 @@ Rules:
 - `status=success` → `next_step=null`, `missing_fields=null`.
 - `status=partial|blocked|error` → `next_step` must be non-null.
 - `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
-- For `search_calendar_events` results, populate `evidence.items` with `{ "events": [...], "total": N }`.
+- For `search_calendar_events` results, set `evidence.items` to `{ "total": N }` and list the matched events in `action_summary` (title, date, start time; up to 10 entries, then `"...and M more"` with M = N - 10).
 - For ambiguous matches across `update_calendar_event` / `delete_calendar_event`, populate `evidence.matched_candidates` with up to 5 options (`id` + `label`, where `label` should include the event title and start time for human readability).
 
 Infer before you call; map every tool outcome faithfully.

From 148139401722f05efe960ac676d974ed92a6d803 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Tue, 19 May 2026 21:30:34 +0200
Subject: [PATCH 09/34] chore(scripts): add MCP session lifetime probe

---
 .../scripts/probe_mcp_session_lifetime.py     | 563 ++++++++++++++++++
 1 file changed, 563 insertions(+)
 create mode 100644 surfsense_backend/scripts/probe_mcp_session_lifetime.py

diff --git a/surfsense_backend/scripts/probe_mcp_session_lifetime.py b/surfsense_backend/scripts/probe_mcp_session_lifetime.py
new file mode 100644
index 000000000..66be5bc14
--- /dev/null
+++ b/surfsense_backend/scripts/probe_mcp_session_lifetime.py
@@ -0,0 +1,563 @@
+"""Probe MCP server session lifetime / staleness behavior — read-only.
+
+Goal
+----
+Empirically answer two questions for our actual third-party MCP servers
+(Atlassian, Linear, Slack, ClickUp, Airtable, ...):
+
+1. How expensive is the initial ``initialize`` handshake (``init=`` cost)?
+2. How long can a ``ClientSession`` sit idle and still survive a
+   subsequent ``list_tools()`` call?
+
+This script informs the design choice between
+
+* per-call sessions (current, ~1s init tax per call),
+* per-turn session reuse (LangChain-style, holds a session for the
+  duration of a chat turn),
+* a long-lived session pool (IBM-style, sessions reused across turns).
+
+The probe is read-only: it only ever calls ``session.list_tools()``,
+which is the safest MCP method. No tool calls against user data are
+performed.
+
+Usage
+-----
+Run from the repo root or from ``surfsense_backend/``::
+
+    uv run python -m scripts.probe_mcp_session_lifetime
+    uv run python -m scripts.probe_mcp_session_lifetime --quick
+    uv run python -m scripts.probe_mcp_session_lifetime --connectors 7,19,20
+    uv run python -m scripts.probe_mcp_session_lifetime --intervals 5,30,60,300
+
+Output
+------
+* Live progress to stderr (``[connector=7 t=+30s] OK 0.142s``).
+* Final per-connector table to stdout.
+* Raw results JSON to ``./mcp_session_probe_<timestamp>.json``.
+
+Default checkpoints fall 5s to 1800s (~30 min) after the first call, so the
+largest gap between calls is 900s; ``--quick`` stops at 60s. Connectors probe
+concurrently, so wall-clock time equals the longest interval, not the sum.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+import sys
+import time
+from dataclasses import asdict, dataclass, field
+from datetime import datetime
+from typing import Any
+
+_HERE = os.path.dirname(os.path.abspath(__file__))  # directory holding this script
+_BACKEND_ROOT = os.path.dirname(_HERE)  # its parent directory
+if _BACKEND_ROOT not in sys.path:
+    sys.path.insert(0, _BACKEND_ROOT)  # put it first so the ``app.*`` imports below resolve
+
+import httpx  # noqa: E402
+from mcp import ClientSession  # noqa: E402
+from mcp.client.streamable_http import streamable_http_client  # noqa: E402
+from sqlalchemy import cast, select  # noqa: E402
+from sqlalchemy.dialects.postgresql import JSONB  # noqa: E402
+
+from app.agents.new_chat.tools.mcp_tool import (  # noqa: E402
+    _inject_oauth_headers,
+    _maybe_refresh_mcp_oauth_token,
+)
+from app.db import SearchSourceConnector, async_session_maker  # noqa: E402
+
+logging.basicConfig(
+    level=logging.WARNING,  # root logger: warnings and above only
+    format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
+    stream=sys.stderr,  # stdout is kept for the final results table
+)
+logging.getLogger("httpx").setLevel(logging.ERROR)  # third-party loggers: errors only
+logging.getLogger("mcp").setLevel(logging.ERROR)
+logger = logging.getLogger("mcp_probe")
+logger.setLevel(logging.INFO)  # the probe's own progress lines stay visible
+
+
+DEFAULT_INTERVALS_SECONDS = [5, 30, 60, 300, 900, 1800]  # used without --intervals/--quick
+QUICK_INTERVALS_SECONDS = [5, 30, 60]  # selected by --quick
+PER_CALL_TIMEOUT_SECONDS = 60.0  # wait_for cap on initialize()/list_tools(); also the httpx timeout
+
+
+@dataclass
+class CheckpointResult:
+    """One ``list_tools()`` call against a long-lived session."""
+
+    idle_seconds_target: int  # the requested interval this checkpoint was scheduled for
+    elapsed_since_open_seconds: float  # call start minus probe start, rounded to 3 decimals
+    elapsed_since_last_call_seconds: float  # call start minus end of the previous call
+    success: bool  # False when list_tools() raised or timed out
+    latency_seconds: float | None  # call duration; on failure, time until the error
+    tools_returned: int | None  # len(response.tools); None on failure
+    error_type: str | None  # short label from _classify_error; None on success
+    error_message: str | None  # error text cut to 300 chars; None on success
+
+
+@dataclass
+class ConnectorProbeResult:
+    """Per-connector aggregated probe outcome."""
+
+    connector_id: int
+    connector_name: str
+    connector_type: str  # enum ``.value`` when available, else ``str()`` of the type
+    url: str  # server URL, or the "(unresolved)" / "(missing)" placeholder
+    init_latency_seconds: float | None  # duration of session.initialize(); None if not reached
+    first_call_latency_seconds: float | None  # first list_tools() right after init
+    checkpoints: list[CheckpointResult] = field(default_factory=list)  # one per interval attempted
+    fatal_error: str | None = None  # set when the probe could not start or open/init failed
+
+
+# ---------------------------------------------------------------------------
+# Connector loading + auth
+# ---------------------------------------------------------------------------
+
+
+async def _fetch_connectors(
+    connector_ids: list[int] | None,
+) -> list[SearchSourceConnector]:
+    """Pull every MCP-shaped connector (or only the requested IDs)."""
+    async with async_session_maker() as session:
+        stmt = select(SearchSourceConnector).filter(
+            cast(SearchSourceConnector.config, JSONB).has_key("server_config"),  # "MCP-shaped"
+        )
+        if connector_ids:  # None or an empty list means no ID filter
+            stmt = stmt.filter(SearchSourceConnector.id.in_(connector_ids))
+        result = await session.execute(stmt)
+        connectors = list(result.scalars())  # materialized inside the session block
+
+    if connector_ids:
+        found_ids = {c.id for c in connectors}
+        missing = [cid for cid in connector_ids if cid not in found_ids]
+        if missing:  # also covers IDs that exist but lack "server_config"
+            logger.warning("Requested connector IDs not found: %s", missing)
+    return connectors
+
+
+async def _resolve_authed_server_config(
+    connector: SearchSourceConnector,
+) -> dict[str, Any] | None:
+    """Refresh OAuth (if needed) and return a server_config with auth headers.
+
+    Returns ``None`` if the connector cannot be probed (missing url,
+    decrypt failure, no refresh token, etc.).
+    """
+    cfg = connector.config or {}
+    server_config = cfg.get("server_config", {})
+    if not isinstance(server_config, dict):  # malformed config: cannot probe
+        return None
+
+    if cfg.get("mcp_oauth"):  # connectors without this key return server_config as-is
+        async with async_session_maker() as session:
+            attached = await session.get(SearchSourceConnector, connector.id)  # re-load in this session
+            if attached is None:  # row no longer exists
+                return None
+            refreshed = await _maybe_refresh_mcp_oauth_token(
+                session,
+                attached,
+                attached.config or {},
+                server_config,
+            )
+            attached_cfg = attached.config or {}  # read after the refresh call — presumably may have changed
+        server_config = _inject_oauth_headers(attached_cfg, refreshed)
+        if server_config is None:  # helper signalled that headers could not be built
+            return None
+    return server_config
+
+
+# ---------------------------------------------------------------------------
+# The actual probe
+# ---------------------------------------------------------------------------
+
+
+def _classify_error(exc: BaseException) -> tuple[str, str]:
+    """Return ``(short_label, human_message)`` for a failed call."""
+    name = type(exc).__name__
+    msg = str(exc) or repr(exc)  # fall back to repr when the exception has no message
+    if isinstance(exc, asyncio.TimeoutError):  # type check first; the rest are text heuristics
+        return "timeout", f"call exceeded {PER_CALL_TIMEOUT_SECONDS}s"
+    if "404" in msg or "Not Found" in msg or "session" in msg.lower():  # broad; runs before 401
+        return "session_expired", msg
+    if "401" in msg or "Unauthorized" in msg:
+        return "auth_401", msg
+    if "ClosedResourceError" in name or "Closed" in name:  # matched on the class name
+        return "stream_closed", msg
+    if "Connection" in name or "ConnectError" in name:
+        return "connection_error", msg
+    return name, msg  # unrecognized: the exception class name is the label
+
+
+async def _probe_one_connector(
+    connector: SearchSourceConnector,
+    intervals: list[int],
+) -> ConnectorProbeResult:
+    """Open a single long-lived session, call ``list_tools`` at each interval."""
+    connector_type = (
+        connector.connector_type.value
+        if hasattr(connector.connector_type, "value")
+        else str(connector.connector_type)
+    )
+    server_config = await _resolve_authed_server_config(connector)  # awaited outside the try below
+    if server_config is None:
+        return ConnectorProbeResult(
+            connector_id=connector.id,
+            connector_name=connector.name,
+            connector_type=connector_type,
+            url="(unresolved)",
+            init_latency_seconds=None,
+            first_call_latency_seconds=None,
+            fatal_error="failed_to_resolve_server_config",
+        )
+
+    url = server_config.get("url")
+    headers = server_config.get("headers", {})
+    if not url:
+        return ConnectorProbeResult(
+            connector_id=connector.id,
+            connector_name=connector.name,
+            connector_type=connector_type,
+            url="(missing)",
+            init_latency_seconds=None,
+            first_call_latency_seconds=None,
+            fatal_error="missing_url",
+        )
+
+    transport = server_config.get("transport", "streamable-http")
+    if transport not in ("streamable-http", "http", "sse"):  # NOTE(review): "sse" passes, but only streamable_http_client is used below
+        return ConnectorProbeResult(
+            connector_id=connector.id,
+            connector_name=connector.name,
+            connector_type=connector_type,
+            url=url,
+            init_latency_seconds=None,
+            first_call_latency_seconds=None,
+            fatal_error=f"unsupported_transport:{transport}",
+        )
+
+    result = ConnectorProbeResult(
+        connector_id=connector.id,
+        connector_name=connector.name,
+        connector_type=connector_type,
+        url=url,
+        init_latency_seconds=None,
+        first_call_latency_seconds=None,
+    )
+
+    open_started = time.perf_counter()  # reference point for elapsed_since_open and scheduling
+    last_call_at: float | None = None
+
+    # Manually drive the context-manager protocol so the session lives
+    # across our sleep intervals. ``streamable_http_client`` spawns a
+    # background task for the SSE receive loop; ``ClientSession`` spawns
+    # another for request multiplexing. We must close them in reverse order.
+    http_client = httpx.AsyncClient(headers=headers, timeout=PER_CALL_TIMEOUT_SECONDS)  # NOTE(review): 60s read timeout too — ok for idle streams?
+    transport_cm = None
+    session_cm = None
+    session = None
+    try:
+        transport_cm = streamable_http_client(url, http_client=http_client)
+        read, write, _ = await transport_cm.__aenter__()
+        session_cm = ClientSession(read, write)
+        session = await session_cm.__aenter__()
+
+        init_start = time.perf_counter()
+        await asyncio.wait_for(session.initialize(), timeout=PER_CALL_TIMEOUT_SECONDS)
+        result.init_latency_seconds = time.perf_counter() - init_start
+        logger.info(
+            "[connector=%s name=%r] init=%.3fs",
+            connector.id,
+            connector.name,
+            result.init_latency_seconds,
+        )
+
+        first_call_start = time.perf_counter()
+        first_response = await asyncio.wait_for(
+            session.list_tools(), timeout=PER_CALL_TIMEOUT_SECONDS
+        )
+        result.first_call_latency_seconds = time.perf_counter() - first_call_start
+        last_call_at = time.perf_counter()
+        logger.info(
+            "[connector=%s name=%r] first_call=%.3fs tools=%d",
+            connector.id,
+            connector.name,
+            result.first_call_latency_seconds,
+            len(first_response.tools),
+        )
+
+        for interval in intervals:  # each target is measured from start-up, not from the previous call
+            target_elapsed = open_started + (
+                result.init_latency_seconds + result.first_call_latency_seconds + interval
+            )
+            sleep_for = max(0.0, target_elapsed - time.perf_counter())  # target already passed: no sleep
+            await asyncio.sleep(sleep_for)
+
+            call_start = time.perf_counter()
+            elapsed_since_open = call_start - open_started
+            elapsed_since_last = call_start - (last_call_at or call_start)  # the real gap since the last call
+            try:
+                response = await asyncio.wait_for(
+                    session.list_tools(), timeout=PER_CALL_TIMEOUT_SECONDS
+                )
+                latency = time.perf_counter() - call_start
+                last_call_at = time.perf_counter()
+                checkpoint = CheckpointResult(
+                    idle_seconds_target=interval,
+                    elapsed_since_open_seconds=round(elapsed_since_open, 3),
+                    elapsed_since_last_call_seconds=round(elapsed_since_last, 3),
+                    success=True,
+                    latency_seconds=round(latency, 3),
+                    tools_returned=len(response.tools),
+                    error_type=None,
+                    error_message=None,
+                )
+                logger.info(
+                    "[connector=%s t=+%ds] OK %.3fs (tools=%d)",
+                    connector.id,
+                    interval,
+                    latency,
+                    len(response.tools),
+                )
+                result.checkpoints.append(checkpoint)
+            except Exception as exc:  # noqa: BLE001
+                label, msg = _classify_error(exc)
+                latency_at_failure = time.perf_counter() - call_start
+                checkpoint = CheckpointResult(
+                    idle_seconds_target=interval,
+                    elapsed_since_open_seconds=round(elapsed_since_open, 3),
+                    elapsed_since_last_call_seconds=round(elapsed_since_last, 3),
+                    success=False,
+                    latency_seconds=round(latency_at_failure, 3),
+                    tools_returned=None,
+                    error_type=label,
+                    error_message=msg[:300],
+                )
+                logger.warning(
+                    "[connector=%s t=+%ds] FAILED %s after %.3fs: %s",
+                    connector.id,
+                    interval,
+                    label,
+                    latency_at_failure,
+                    msg[:200],
+                )
+                result.checkpoints.append(checkpoint)
+                # Session is presumed dead — further checkpoints would all
+                # fail the same way and just waste wall time.
+                break
+
+    except Exception as exc:  # noqa: BLE001
+        label, msg = _classify_error(exc)
+        result.fatal_error = f"{label}: {msg[:200]}"  # checkpoints recorded so far are kept
+        logger.exception(
+            "[connector=%s] fatal during open/init: %s",
+            connector.id,
+            exc,
+        )
+    finally:
+        if session_cm is not None:  # entered (or attempted): close the session before the transport
+            try:
+                await session_cm.__aexit__(None, None, None)
+            except Exception:
+                pass  # best-effort cleanup: close errors are ignored without logging
+        if transport_cm is not None:
+            try:
+                await transport_cm.__aexit__(None, None, None)
+            except Exception:
+                pass  # same best-effort policy as above
+        try:
+            await http_client.aclose()  # we created this client, so we close it ourselves
+        except Exception:
+            pass
+
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Reporting
+# ---------------------------------------------------------------------------
+
+
+def _render_table(results: list[ConnectorProbeResult]) -> str:
+    """Pretty-print a per-connector summary suitable for the terminal."""
+    lines: list[str] = []
+    lines.append("=" * 100)
+    lines.append("MCP Session Lifetime Probe Results")
+    lines.append("=" * 100)
+
+    for result in results:
+        lines.append("")
+        lines.append(
+            f"Connector {result.connector_id} | {result.connector_type} | "
+            f"{result.connector_name!r}"
+        )
+        lines.append(f"  url: {result.url}")
+        if result.fatal_error:  # nothing else is printed for this connector
+            lines.append(f"  FATAL: {result.fatal_error}")
+            continue
+        lines.append(  # the two adjacent f-strings join first; the if/else then picks a branch
+            f"  init handshake: "
+            f"{result.init_latency_seconds:.3f}s"
+            if result.init_latency_seconds is not None
+            else "  init handshake: (failed)"
+        )
+        lines.append(  # same pattern as the init line above
+            f"  first list_tools (cold): "
+            f"{result.first_call_latency_seconds:.3f}s"
+            if result.first_call_latency_seconds is not None
+            else "  first list_tools: (failed)"
+        )
+        if not result.checkpoints:
+            lines.append("  (no idle checkpoints recorded)")
+            continue
+        lines.append(  # column header; widths match the row format below
+            f"  {'idle_s':>8} | {'since_last':>10} | {'outcome':>16} | "
+            f"{'latency':>9} | {'tools':>5}"
+        )
+        for cp in result.checkpoints:
+            outcome = "OK" if cp.success else (cp.error_type or "FAIL")
+            latency = f"{cp.latency_seconds:.3f}s" if cp.latency_seconds is not None else "-"
+            tools = str(cp.tools_returned) if cp.tools_returned is not None else "-"
+            lines.append(
+                f"  {cp.idle_seconds_target:>8} | "
+                f"{cp.elapsed_since_last_call_seconds:>10.1f} | "
+                f"{outcome:>16} | "
+                f"{latency:>9} | "
+                f"{tools:>5}"
+            )
+
+    lines.append("")
+    lines.append("=" * 100)
+    lines.append("Summary")
+    lines.append("=" * 100)
+    survived: dict[int, list[int]] = {}  # interval target -> ids of connectors that passed it
+    for result in results:
+        for cp in result.checkpoints:
+            if cp.success:
+                survived.setdefault(cp.idle_seconds_target, []).append(
+                    result.connector_id
+                )
+    if survived:
+        for interval in sorted(survived):
+            ids = sorted(survived[interval])
+            lines.append(  # denominator counts every probed connector, fatal ones included
+                f"  Idle {interval:>5}s: {len(ids)}/{len(results)} connectors "
+                f"survived ({ids})"
+            )
+    else:
+        lines.append("  (no successful checkpoints)")
+    return "\n".join(lines)
+
+
+# ---------------------------------------------------------------------------
+# Entry point
+# ---------------------------------------------------------------------------
+
+
+def _parse_int_list(value: str) -> list[int]:  # e.g. "5,30,60" -> [5, 30, 60]
+    return [int(x) for x in value.split(",") if x.strip()]  # blank items are skipped
+
+
+def _parse_args() -> argparse.Namespace:  # reads the process command line
+    parser = argparse.ArgumentParser(
+        description="Probe MCP server session lifetime (read-only)",
+    )
+    parser.add_argument(
+        "--connectors",
+        type=_parse_int_list,
+        default=None,  # None means "probe every MCP connector"
+        help="Comma-separated connector IDs to probe. Default: all MCP connectors.",
+    )
+    parser.add_argument(
+        "--intervals",
+        type=_parse_int_list,
+        default=None,  # None lets _async_main choose --quick or the default list
+        help="Comma-separated idle intervals in seconds. "
+        f"Default: {DEFAULT_INTERVALS_SECONDS}",
+    )
+    parser.add_argument(
+        "--quick",
+        action="store_true",
+        help=f"Short run (intervals={QUICK_INTERVALS_SECONDS}) for fast iteration.",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default=None,  # None means a timestamped file in the current directory
+        help="Optional path for the raw JSON results.",
+    )
+    return parser.parse_args()
+
+
+async def _async_main() -> int:  # returns the process exit code (0 ok, 2 nothing to probe)
+    args = _parse_args()
+    if args.intervals is not None:  # an explicit --intervals wins over --quick
+        intervals = args.intervals
+    elif args.quick:
+        intervals = QUICK_INTERVALS_SECONDS
+    else:
+        intervals = DEFAULT_INTERVALS_SECONDS
+
+    longest = max(intervals) if intervals else 0  # an empty --intervals value yields []
+    logger.info(
+        "Probing intervals=%s (longest=%ds, ~%dmin total wall time)",
+        intervals,
+        longest,
+        (longest + 30) // 60,  # seconds to minutes, rounded to nearest
+    )
+
+    connectors = await _fetch_connectors(args.connectors)
+    if not connectors:
+        logger.error("No MCP connectors found to probe.")
+        return 2
+    logger.info(
+        "Probing %d connector(s): %s",
+        len(connectors),
+        [f"{c.id}:{c.name}" for c in connectors],
+    )
+
+    started_at = time.time()  # wall-clock; also written to the JSON report
+    results = await asyncio.gather(
+        *[_probe_one_connector(c, intervals) for c in connectors],
+        return_exceptions=False,  # an exception escaping any probe propagates; no report is written
+    )
+    elapsed = time.time() - started_at
+    logger.info("All probes complete in %.1fs", elapsed)
+
+    table = _render_table(results)
+    print(table)  # the only stdout output; logging goes to stderr
+
+    output_path = (
+        args.output
+        or f"mcp_session_probe_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"  # naive local time
+    )
+    with open(output_path, "w", encoding="utf-8") as fp:
+        json.dump(
+            {
+                "started_at": datetime.fromtimestamp(started_at).isoformat(),
+                "elapsed_seconds": round(elapsed, 1),
+                "intervals_tested": intervals,
+                "results": [asdict(r) for r in results],  # nested checkpoints become dicts too
+            },
+            fp,
+            indent=2,
+        )
+    logger.info("Raw results saved to %s", output_path)
+    return 0
+
+
+def main() -> None:  # sync entry point: runs the probe and exits with its status
+    try:
+        exit_code = asyncio.run(_async_main())
+    except KeyboardInterrupt:
+        logger.warning("Interrupted by user")
+        exit_code = 130  # conventional shell status for Ctrl-C (128 + SIGINT)
+    sys.exit(exit_code)
+
+
+if __name__ == "__main__":
+    main()

From 553becea280853312cac55f6cf1a28c89c023973 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 09:40:00 +0200
Subject: [PATCH 10/34] perf(gmail subagent): stop echoing raw emails array
 into evidence.items

---
 .../subagents/connectors/gmail/system_prompt.md               | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/system_prompt.md
index d74e9bdc4..c04d69ad0 100644
--- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/system_prompt.md
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/system_prompt.md
@@ -33,7 +33,7 @@ You are a Gmail specialist for the user's connected Gmail mailbox.
 | `error`                     | `error`       | Relay the tool's `message` verbatim as `next_step`.                                                                          |
 | tool raises / unknown       | `error`       | `"Gmail tool failed unexpectedly. Ask the user to retry shortly."`                                                           |
 
-Surface the tool's `message_id`, `thread_id`, `draft_id`, `subject`, and recipient fields inside `evidence` when the tool returned them. For `search_gmail`, place the raw `emails` array inside `evidence.items`. Never invent a field the tool did not return.
+Surface the tool's `message_id`, `thread_id`, `draft_id`, `subject`, and recipient fields inside `evidence` when the tool returned them. For `search_gmail`, set `evidence.items` to `{ "total": N }` and list the matched emails in `action_summary` (sender, subject, date; one line per email; up to 10 entries, then `"...and M more"` with M = N - 10). Never invent a field the tool did not return.
 
 ## Examples
 
@@ -114,7 +114,7 @@ Rules:
 - `status=success` → `next_step=null`, `missing_fields=null`.
 - `status=partial|blocked|error` → `next_step` must be non-null.
 - `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
-- For `search_gmail` results, populate `evidence.items` with `{ "emails": [...], "total": N }`.
+- For `search_gmail` results, set `evidence.items` to `{ "total": N }` and list the matched emails in `action_summary` (sender, subject, date; up to 10 entries, then `"...and M more"` with M = N - 10).
 - For ambiguous matches across `update_gmail_draft` / `trash_gmail_email` / `read_gmail_email`, populate `evidence.matched_candidates` with up to 5 options (`id` + `label`).
 
 Infer before you call; verify before you send; map every tool outcome faithfully.

From d3d396a473b90b6940e8c21dcf5c1967137f78bf Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 09:40:18 +0200
Subject: [PATCH 11/34] perf(linear subagent): stop echoing raw issues list
 into evidence.items

---
 .../subagents/connectors/linear/system_prompt.md              | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/system_prompt.md
index f7dbeb9a9..1d96a4105 100644
--- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/system_prompt.md
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/system_prompt.md
@@ -32,7 +32,7 @@ Failure handling:
 <example>
 Supervisor: "Find issues assigned to me with priority Urgent."
 1. Discovery: list issues with filters `{assignee: "me", priority: 1}`.
-2. Return `status=success` with the matched issues in `evidence.items`.
+2. Return `status=success` with `evidence.items` set to `{ "total": N }` and the matched issues listed in `action_summary` (identifier, title, state, assignee; one line per issue; up to 10 entries, then `"...and M more"` with M = N - 10).
 </example>
 
 <example>
@@ -106,7 +106,7 @@ Rules:
 - `status=partial|blocked|error` → `next_step` must be non-null.
 - `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
 - For blocked ambiguity, populate `evidence.matched_candidates` with up to 5 options (`id` + `label` — works for any kind of candidate: issue, user, project, state, etc.).
-- For discovery-only queries (lists), populate `evidence.items` with the structured list.
+- For discovery-only queries (lists), set `evidence.items` to `{ "total": N }` and list the matched items in `action_summary` (identifier, title, state, assignee; up to 10 entries, then `"...and M more"` with M = N - 10).
 </output_contract>
 
 Discover before you mutate; never guess identifiers.

From 6e5dd54bbfad4cdd72d4ab0ac1332bd1aa46545f Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 09:40:33 +0200
Subject: [PATCH 12/34] perf(slack subagent): stop echoing raw messages list
 into evidence.items

---
 .../subagents/connectors/slack/system_prompt.md               | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/system_prompt.md
index c8edfc1db..3c24b19c9 100644
--- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/system_prompt.md
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/system_prompt.md
@@ -37,7 +37,7 @@ Failure handling:
 Supervisor: "Summarize the latest discussion in #marketing."
 1. Search channels for "marketing" → one strong match. Capture the channel ID.
 2. Read that channel's recent message history.
-3. Return `status=success` with the message list in `evidence.items`.
+3. Return `status=success` with `evidence.items` set to `{ "total": N }` and the messages listed in `action_summary` (sender, timestamp, text snippet; one line per message; up to 10 entries, then `"...and M more"` with M = N - 10).
 </example>
 
 <example>
@@ -92,7 +92,7 @@ Rules:
 - `status=partial|blocked|error` → `next_step` must be non-null.
 - `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
 - For blocked ambiguity, populate `evidence.matched_candidates` with up to 5 options (`id` + `label` — works for any kind of candidate: channel, user, message, thread).
-- For discovery-only queries (lists), populate `evidence.items` with the structured list.
+- For discovery-only queries (lists), set `evidence.items` to `{ "total": N }` and list the matched items in `action_summary` (channel/user, key identifier, timestamp, short snippet; up to 10 entries, then `"...and M more"` with M = N - 10).
 </output_contract>
 
 Discover before you post; never guess channel, user, or thread targets.

From 6be1b22ef6d1c62c80b3696ec5281725047468aa Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 09:40:48 +0200
Subject: [PATCH 13/34] perf(jira subagent): stop echoing raw issues list into
 evidence.items

---
 .../subagents/connectors/jira/system_prompt.md                | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/system_prompt.md
index 79c46f8a0..4dcc56454 100644
--- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/system_prompt.md
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/system_prompt.md
@@ -39,7 +39,7 @@ Failure handling:
 <example>
 Supervisor: "Find issues assigned to me with status 'In Progress'."
 1. JQL search with `assignee = currentUser() AND status = "In Progress"`.
-2. Return `status=success` with the matched issues in `evidence.items`.
+2. Return `status=success` with `evidence.items` set to `{ "total": N }` and the matched issues listed in `action_summary` (issue key, summary, status, assignee; one line per issue; up to 10 entries, then `"...and M more"` with M = N - 10).
 </example>
 
 <example>
@@ -116,7 +116,7 @@ Rules:
 - `status=partial|blocked|error` → `next_step` must be non-null.
 - `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
 - For blocked ambiguity, populate `evidence.matched_candidates` with up to 5 options (`id` + `label` — works for any kind of candidate: site, project, issue, user, transition, etc.).
-- For discovery-only queries (lists), populate `evidence.items` with the structured list.
+- For discovery-only queries (lists), set `evidence.items` to `{ "total": N }` and list the matched items in `action_summary` (issue key, summary, status, assignee; up to 10 entries, then `"...and M more"` with M = N - 10).
 </output_contract>
 
 Discover before you mutate; never guess identifiers, transitions, or required fields.

From 1b2f13e25ca5e455f9e5019d0b5ad98e3c203d6a Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 09:41:04 +0200
Subject: [PATCH 14/34] perf(clickup subagent): stop echoing raw tasks list
 into evidence.items

---
 .../subagents/connectors/clickup/system_prompt.md             | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/system_prompt.md
index eaea5827b..898197f14 100644
--- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/system_prompt.md
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/system_prompt.md
@@ -36,7 +36,7 @@ Failure handling:
 <example>
 Supervisor: "Find tasks about the homepage redesign."
 1. Workspace search for "homepage redesign" → matched tasks.
-2. Return `status=success` with the matched tasks in `evidence.items`.
+2. Return `status=success` with `evidence.items` set to `{ "total": N }` and the matched tasks listed in `action_summary` (task id, title, status, assignees; one line per task; up to 10 entries, then `"...and M more"` with M = N - 10).
 </example>
 
 <example>
@@ -98,7 +98,7 @@ Rules:
 - `status=partial|blocked|error` → `next_step` must be non-null.
 - `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
 - For blocked ambiguity, populate `evidence.matched_candidates` with up to 5 options (`id` + `label` — works for any kind of candidate: task, list, member, status, custom-field choice, etc.).
-- For discovery-only queries (lists), populate `evidence.items` with the structured list.
+- For discovery-only queries (lists), set `evidence.items` to `{ "total": N }` and list the matched items in `action_summary` (task id, title, status, assignees; up to 10 entries, then `"...and N more"`).
 </output_contract>
 
 Discover before you mutate; never guess identifiers, list statuses, or assignees.

From 56d8ff89e2d46e890fe9c11999479af0abda8c16 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 09:41:18 +0200
Subject: [PATCH 15/34] perf(airtable subagent): stop echoing raw records list
 into evidence.items

---
 .../subagents/connectors/airtable/system_prompt.md            | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/system_prompt.md
index 1b7e84710..9434db7a1 100644
--- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/system_prompt.md
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/system_prompt.md
@@ -38,7 +38,7 @@ Supervisor: "List open tasks in the Project Tracker base."
 2. List tables in that base → identify the Tasks table; capture its table ID.
 3. Get table schema → identify the status field and the choice IDs that represent "open" states.
 4. List records with a typed filter on the status field for those choice IDs.
-5. Return `status=success` with the matched records in `evidence.items`.
+5. Return `status=success` with `evidence.items` set to `{ "total": N }` and the matched records listed in `action_summary` (record id, primary-field value, and 1-2 most relevant fields; one line per record; up to 10 entries, then `"...and N more"`).
 </example>
 
 <example>
@@ -97,7 +97,7 @@ Rules:
 - `status=partial|blocked|error` → `next_step` must be non-null.
 - `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
 - For blocked ambiguity, populate `evidence.matched_candidates` with up to 5 options (`id` + `label` — works for any kind of candidate: base, table, field, choice, record, etc.).
-- For discovery-only queries (lists), populate `evidence.items` with the structured list.
+- For discovery-only queries (lists), set `evidence.items` to `{ "total": N }` and list the matched items in `action_summary` (record id, primary-field value, and 1-2 most relevant fields; up to 10 entries, then `"...and N more"`).
 </output_contract>
 
 Discover before you mutate; never guess identifiers, choice IDs, or required fields.

From f4e66718be1d81541fb0e576da0e80ff90f3a46d Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 09:41:36 +0200
Subject: [PATCH 16/34] perf(discord subagent): stop echoing raw
 channels/messages payload into evidence.items

---
 .../subagents/connectors/discord/system_prompt.md               | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/system_prompt.md
index a0ba6d87e..249f9ec8b 100644
--- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/system_prompt.md
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/system_prompt.md
@@ -24,7 +24,7 @@ You are a Discord specialist for the user's connected Discord server.
 | `error`                                               | `error`       | Relay the tool's `message` verbatim as `next_step`. |
 | tool raises / unknown                                 | `error`       | `"Discord tool failed unexpectedly. Ask the user to retry shortly."` |
 
-Surface the tool's `message`, `channel_id`, `message_id`, and the listed channels/messages payload inside `evidence` when the tool returned them. Never invent a field the tool did not return.
+Surface the tool's `message`, `channel_id`, and `message_id` inside `evidence` when the tool returned them. For `list_discord_channels` and `read_discord_messages`, set `evidence.items` to `{ "total": N }` and list the matched entries in `action_summary` (channel name or sender + timestamp + short text snippet; one line per entry; up to 10 entries, then `"...and N more"`). Never invent a field the tool did not return.
 
 ## Examples
 

From 20f7896a99d42892c9d02ab19e7a4613964e0dc0 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 09:41:47 +0200
Subject: [PATCH 17/34] perf(luma subagent): stop echoing raw events list into
 evidence.items

---
 .../multi_agent_chat/subagents/connectors/luma/system_prompt.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/system_prompt.md
index 953dbff58..0f42161b3 100644
--- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/system_prompt.md
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/system_prompt.md
@@ -26,7 +26,7 @@ You are a Luma specialist for the user's connected Luma account.
 | `error`                                            | `error`       | Relay the tool's `message` verbatim as `next_step` (this covers Luma Plus 403s and other API errors). |
 | tool raises / unknown                              | `error`       | `"Luma tool failed unexpectedly. Ask the user to retry shortly."` |
 
-Surface the tool's `message`, `event_id`, `name`, `start_at`, and `url` inside `evidence` when the tool returned them. Never invent a field the tool did not return.
+Surface the tool's `message`, `event_id`, `name`, `start_at`, and `url` inside `evidence` when the tool returned them. For `list_luma_events`, set `evidence.items` to `{ "total": N }` and list the matched events in `action_summary` (event name, start date/time, location if present; one line per event; up to 10 entries, then `"...and N more"`). Never invent a field the tool did not return.
 
 ## Examples
 

From 6c173dc2a7df14077b34d58ee5a9a5c83b37d740 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 09:42:03 +0200
Subject: [PATCH 18/34] perf(teams subagent): stop echoing raw
 teams/channels/messages payload into evidence.items

---
 .../subagents/connectors/teams/system_prompt.md                 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/system_prompt.md
index b79c59f90..c3a280f79 100644
--- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/system_prompt.md
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/system_prompt.md
@@ -26,7 +26,7 @@ You are a Microsoft Teams specialist for the user's connected Teams account.
 | `error`                                                                   | `error`       | Relay the tool's `message` verbatim as `next_step`. |
 | tool raises / unknown                                                     | `error`       | `"Teams tool failed unexpectedly. Ask the user to retry shortly."` |
 
-Surface the tool's `message`, `team_id`, `team_name`, `channel_id`, `channel_name`, and `message_id` inside `evidence` when the tool returned them. Never invent a field the tool did not return.
+Surface the tool's `message`, `team_id`, `team_name`, `channel_id`, `channel_name`, and `message_id` inside `evidence` when the tool returned them. For `list_teams_channels` and `read_teams_messages`, set `evidence.items` to `{ "total": N }` and list the matched entries in `action_summary` (team › channel, or sender + timestamp + short text snippet; one line per entry; up to 10 entries, then `"...and N more"`). Never invent a field the tool did not return.
 
 ## Examples
 

From b554c600bba5f5e9ffc414c74c609ac5b9475205 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 09:42:57 +0200
Subject: [PATCH 19/34] perf(research subagent): cap evidence.findings and
 evidence.sources to bound output

---
 .../subagents/builtins/research/system_prompt.md                | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/system_prompt.md
index cf558db62..f1a22ddf1 100644
--- a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/system_prompt.md
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/system_prompt.md
@@ -50,4 +50,6 @@ Rules:
 - `status=success` -> `next_step=null`, `missing_fields=null`.
 - `status=partial|blocked|error` -> `next_step` must be non-null.
 - `status=blocked` due to missing required inputs -> `missing_fields` must be non-null.
+- `evidence.findings`: max 10 entries, each a single sentence stating one distinct fact. Do not paste raw paragraphs, scraped pages, or quote blocks.
+- `evidence.sources`: max 10 URLs, one per finding when applicable. List each URL once.
 </output_contract>

From 5edf0520c4d44780c73e94e4cce83e461c6070a8 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 09:43:32 +0200
Subject: [PATCH 20/34] perf(kb subagent, cloud): cap evidence.content_excerpt
 to 500 chars

---
 .../subagents/builtins/knowledge_base/system_prompt_cloud.md     | 1 +
 1 file changed, 1 insertion(+)

diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md
index 60cafb30c..514ec6639 100644
--- a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md
@@ -118,5 +118,6 @@ Rules:
 - `status=success` → `next_step=null`, `missing_fields=null`.
 - `status=partial|blocked|error` → `next_step` must be non-null.
 - `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
+- `evidence.content_excerpt`: max ~500 characters. Surface a short excerpt or a one-sentence summary, not the full file body. The supervisor already sees the tool's raw output.
 
 Infer before you call; map every tool outcome faithfully.

From 0cdda14922f6ac07deae581d6448a30074887f50 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 09:43:36 +0200
Subject: [PATCH 21/34] perf(kb subagent, desktop): cap
 evidence.content_excerpt to 500 chars

---
 .../subagents/builtins/knowledge_base/system_prompt_desktop.md   | 1 +
 1 file changed, 1 insertion(+)

diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md
index 8f64f2eb6..bfa96ee5b 100644
--- a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md
+++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md
@@ -118,5 +118,6 @@ Rules:
 - `status=success` → `next_step=null`, `missing_fields=null`.
 - `status=partial|blocked|error` → `next_step` must be non-null.
 - `status=blocked` due to missing required inputs → `missing_fields` must be non-null.
+- `evidence.content_excerpt`: max ~500 characters. Surface a short excerpt or a one-sentence summary, not the full file body. The supervisor already sees the tool's raw output.
 
 Infer before you call; map every tool outcome faithfully.

From 6090980c5e314c1dae6d4e8a354088b75a79e94d Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 09:51:44 +0200
Subject: [PATCH 22/34] obs(tokens): log prompt-cache read/write counts and hit
 ratio per LLM call

---
 .../app/services/token_tracking_service.py    | 32 ++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/surfsense_backend/app/services/token_tracking_service.py b/surfsense_backend/app/services/token_tracking_service.py
index 58d06ba31..986e6bf05 100644
--- a/surfsense_backend/app/services/token_tracking_service.py
+++ b/surfsense_backend/app/services/token_tracking_service.py
@@ -325,6 +325,22 @@ class TokenTrackingCallback(CustomLogger):
             total_tokens = getattr(usage, "total_tokens", 0) or 0
             call_kind = "chat"
 
+        # Prompt-cache accounting. Field shapes differ by provider:
+        # - OpenAI / Azure: ``usage.prompt_tokens_details.cached_tokens``
+        # - Anthropic:      ``usage.cache_read_input_tokens`` + ``usage.cache_creation_input_tokens``
+        # LiteLLM normalizes both; we read both shapes and prefer whichever is set.
+        cached_tokens = 0
+        cache_creation_tokens = 0
+        if not is_image:
+            prompt_details = getattr(usage, "prompt_tokens_details", None)
+            if prompt_details is not None:
+                cached_tokens = getattr(prompt_details, "cached_tokens", 0) or 0
+            if cached_tokens == 0:
+                cached_tokens = getattr(usage, "cache_read_input_tokens", 0) or 0
+            cache_creation_tokens = (
+                getattr(usage, "cache_creation_input_tokens", 0) or 0
+            )
+
         model = kwargs.get("model", "unknown")
 
         cost_usd = _extract_cost_usd(
@@ -367,9 +383,13 @@ class TokenTrackingCallback(CustomLogger):
         except Exception:
             call_latency_s = None
 
+        cache_hit_ratio: float | None = None
+        if prompt_tokens > 0 and (cached_tokens > 0 or cache_creation_tokens > 0):
+            cache_hit_ratio = cached_tokens / prompt_tokens
+
         logger.info(
             "[TokenTracking] Captured: model=%s kind=%s prompt=%d completion=%d total=%d "
-            "cost=$%.6f (%d micros) (accumulator now has %d calls)%s",
+            "cost=$%.6f (%d micros) (accumulator now has %d calls)%s%s",
             model,
             call_kind,
             prompt_tokens,
@@ -379,6 +399,16 @@ class TokenTrackingCallback(CustomLogger):
             cost_micros,
             len(acc.calls),
             f" latency={call_latency_s:.3f}s" if call_latency_s is not None else "",
+            (
+                f" cache_read={cached_tokens} cache_write={cache_creation_tokens}"
+                f" hit_ratio={cache_hit_ratio:.1%}"
+                if cache_hit_ratio is not None
+                else (
+                    f" cache_read={cached_tokens} cache_write={cache_creation_tokens}"
+                    if (cached_tokens or cache_creation_tokens)
+                    else ""
+                )
+            ),
         )
 
 

From 32f6766cb6c5787fd56607fb020117e21681a1e3 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 09:55:39 +0200
Subject: [PATCH 23/34] fix(tokens): use canonical prompt_tokens_details path
 for cache fields

LiteLLM normalizes every provider's cache fields onto
usage.prompt_tokens_details (cached_tokens + cache_creation_tokens).
The earlier fallback to usage.cache_read_input_tokens /
usage.cache_creation_input_tokens was wrong: Anthropic-shaped fields
only live there via a trailing setattr loop, and the canonical field
name on the wrapper is cache_creation_tokens (not cache_creation_input_tokens).
---
 .../app/services/token_tracking_service.py    | 20 ++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/surfsense_backend/app/services/token_tracking_service.py b/surfsense_backend/app/services/token_tracking_service.py
index 986e6bf05..3f07e6f9e 100644
--- a/surfsense_backend/app/services/token_tracking_service.py
+++ b/surfsense_backend/app/services/token_tracking_service.py
@@ -325,21 +325,23 @@ class TokenTrackingCallback(CustomLogger):
             total_tokens = getattr(usage, "total_tokens", 0) or 0
             call_kind = "chat"
 
-        # Prompt-cache accounting. Field shapes differ by provider:
-        # - OpenAI / Azure: ``usage.prompt_tokens_details.cached_tokens``
-        # - Anthropic:      ``usage.cache_read_input_tokens`` + ``usage.cache_creation_input_tokens``
-        # LiteLLM normalizes both; we read both shapes and prefer whichever is set.
+        # Prompt-cache accounting. LiteLLM normalizes every provider's cache
+        # fields onto ``usage.prompt_tokens_details``:
+        # - ``cached_tokens``         — cache reads (OpenAI/Azure native, DeepSeek
+        #                               mapped from ``prompt_cache_hit_tokens``,
+        #                               Anthropic mapped from ``cache_read_input_tokens``).
+        # - ``cache_creation_tokens`` — cache writes (Anthropic only; OpenAI/Azure
+        #                               do not expose a write count).
+        # See ``litellm.types.utils.Usage.__init__`` for the mapping.
         cached_tokens = 0
         cache_creation_tokens = 0
         if not is_image:
             prompt_details = getattr(usage, "prompt_tokens_details", None)
             if prompt_details is not None:
                 cached_tokens = getattr(prompt_details, "cached_tokens", 0) or 0
-            if cached_tokens == 0:
-                cached_tokens = getattr(usage, "cache_read_input_tokens", 0) or 0
-            cache_creation_tokens = (
-                getattr(usage, "cache_creation_input_tokens", 0) or 0
-            )
+                cache_creation_tokens = (
+                    getattr(prompt_details, "cache_creation_tokens", 0) or 0
+                )
 
         model = kwargs.get("model", "unknown")
 

From 4fa85a9a947461581ced1ff0c0d65ba6e743ef19 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 10:02:38 +0200
Subject: [PATCH 24/34] perf(kb-search): offload sync embed_texts to thread

embed_texts holds a threading.Lock and runs a sync embedding call inside
search_knowledge_base, an async coroutine on the KB priority middleware
critical path. Blocking the event loop here stalls every other coroutine
on the worker (SSE keepalives, concurrent chat requests, background
tasks). Wrap in asyncio.to_thread so the embed runs on the default
executor pool while the loop keeps serving.
---
 .../app/agents/new_chat/middleware/knowledge_search.py          | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py b/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py
index dc06f8763..98bbf3bd7 100644
--- a/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py
@@ -457,7 +457,7 @@ async def search_knowledge_base(
     if not query:
         return []
 
-    [embedding] = embed_texts([query])
+    [embedding] = await asyncio.to_thread(embed_texts, [query])
     doc_types = _resolve_search_types(available_connectors, available_document_types)
     retriever_top_k = min(top_k * 3, 30)
 

From 52d425f17054a9fd5ee8eccf74459f238f5664da Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 10:03:14 +0200
Subject: [PATCH 25/34] perf(kb-persistence): offload sync embed_texts to
 thread

_create_document and _update_document run on the chat critical path
when the filesystem subagent writes via the user's chat turn. Both
called embed_texts synchronously inside an async coroutine, blocking
the event loop for the duration of the embed.
---
 .../app/agents/new_chat/middleware/kb_persistence.py     | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/surfsense_backend/app/agents/new_chat/middleware/kb_persistence.py b/surfsense_backend/app/agents/new_chat/middleware/kb_persistence.py
index d577441dd..cc30f4897 100644
--- a/surfsense_backend/app/agents/new_chat/middleware/kb_persistence.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/kb_persistence.py
@@ -32,6 +32,7 @@ exact same routine when ``aafter_agent`` was skipped (e.g. client disconnect).
 
 from __future__ import annotations
 
+import asyncio
 import logging
 from datetime import UTC, datetime
 from typing import Any
@@ -249,11 +250,11 @@ async def _create_document(
     session.add(doc)
     await session.flush()
 
-    summary_embedding = embed_texts([content])[0]
+    summary_embedding = (await asyncio.to_thread(embed_texts, [content]))[0]
     doc.embedding = summary_embedding
     chunks = chunk_text(content)
     if chunks:
-        chunk_embeddings = embed_texts(chunks)
+        chunk_embeddings = await asyncio.to_thread(embed_texts, chunks)
         session.add_all(
             [
                 Chunk(document_id=doc.id, content=text, embedding=embedding)
@@ -295,13 +296,13 @@ async def _update_document(
         search_space_id,
     )
 
-    summary_embedding = embed_texts([content])[0]
+    summary_embedding = (await asyncio.to_thread(embed_texts, [content]))[0]
     document.embedding = summary_embedding
 
     await session.execute(delete(Chunk).where(Chunk.document_id == document.id))
     chunks = chunk_text(content)
     if chunks:
-        chunk_embeddings = embed_texts(chunks)
+        chunk_embeddings = await asyncio.to_thread(embed_texts, chunks)
         session.add_all(
             [
                 Chunk(document_id=document.id, content=text, embedding=embedding)

From a3d6fa6196f1871fc2e580f82c335305778362ca Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 10:03:42 +0200
Subject: [PATCH 26/34] perf(document-converters): offload sync
 embed_text/embed_texts to thread

generate_document_summary and create_document_chunks are async helpers
called from the chat path and from many connector indexers. Both wrapped
embed_text/embed_texts directly inside the coroutine, blocking the event
loop for the full duration of the embedding call.
---
 surfsense_backend/app/utils/document_converters.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/surfsense_backend/app/utils/document_converters.py b/surfsense_backend/app/utils/document_converters.py
index ed52c1b7b..9bc8103c5 100644
--- a/surfsense_backend/app/utils/document_converters.py
+++ b/surfsense_backend/app/utils/document_converters.py
@@ -1,3 +1,4 @@
+import asyncio
 import hashlib
 import logging
 import threading
@@ -221,7 +222,9 @@ async def generate_document_summary(
     else:
         enhanced_summary_content = summary_content
 
-    summary_embedding = embed_text(enhanced_summary_content)
+    summary_embedding = await asyncio.to_thread(
+        embed_text, enhanced_summary_content
+    )
 
     return enhanced_summary_content, summary_embedding
 
@@ -237,7 +240,7 @@ async def create_document_chunks(content: str) -> list[Chunk]:
         List of Chunk objects with embeddings
     """
     chunk_texts = [c.text for c in config.chunker_instance.chunk(content)]
-    chunk_embeddings = embed_texts(chunk_texts)
+    chunk_embeddings = await asyncio.to_thread(embed_texts, chunk_texts)
     return [
         Chunk(content=text, embedding=emb)
         for text, emb in zip(chunk_texts, chunk_embeddings, strict=False)

From a8de98895a93fe376c21ab3478f5f1871526a7c1 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 10:04:26 +0200
Subject: [PATCH 27/34] perf(revert-service): offload sync embed_texts to
 thread

_restore_in_place_document and _reinsert_document_from_revision are
async helpers invoked by the synchronous-feeling POST /api/threads/.../revert
route; both ran embed_texts inline, blocking the event loop while the
HTTP client waited.
---
 surfsense_backend/app/services/revert_service.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/surfsense_backend/app/services/revert_service.py b/surfsense_backend/app/services/revert_service.py
index d02a31345..60f6503aa 100644
--- a/surfsense_backend/app/services/revert_service.py
+++ b/surfsense_backend/app/services/revert_service.py
@@ -29,6 +29,7 @@ same trap waiting to happen).
 
 from __future__ import annotations
 
+import asyncio
 import logging
 from dataclasses import dataclass
 from datetime import UTC, datetime
@@ -234,7 +235,7 @@ async def _restore_in_place_document(
             if isinstance(c, dict) and isinstance(c.get("content"), str)
         ]
         if chunk_texts:
-            chunk_embeddings = embed_texts(chunk_texts)
+            chunk_embeddings = await asyncio.to_thread(embed_texts, chunk_texts)
             session.add_all(
                 [
                     Chunk(document_id=doc.id, content=text, embedding=embedding)
@@ -244,7 +245,9 @@ async def _restore_in_place_document(
                 ]
             )
             if isinstance(revision.content_before, str):
-                doc.embedding = embed_texts([revision.content_before])[0]
+                doc.embedding = (
+                    await asyncio.to_thread(embed_texts, [revision.content_before])
+                )[0]
 
     doc.updated_at = datetime.now(UTC)
     return RevertOutcome(status="ok", message="Document restored from snapshot.")
@@ -320,7 +323,7 @@ async def _reinsert_document_from_revision(
     session.add(new_doc)
     await session.flush()
 
-    new_doc.embedding = embed_texts([content])[0]
+    new_doc.embedding = (await asyncio.to_thread(embed_texts, [content]))[0]
     chunk_texts = []
     chunks_before = revision.chunks_before
     if isinstance(chunks_before, list):
@@ -330,7 +333,7 @@ async def _reinsert_document_from_revision(
             if isinstance(c, dict) and isinstance(c.get("content"), str)
         ]
     if chunk_texts:
-        chunk_embeddings = embed_texts(chunk_texts)
+        chunk_embeddings = await asyncio.to_thread(embed_texts, chunk_texts)
         session.add_all(
             [
                 Chunk(document_id=new_doc.id, content=text, embedding=embedding)

From 1791241c0c8c451b7c82ec6e688b4264498da3fb Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 10:09:38 +0200
Subject: [PATCH 28/34] perf(indexers): offload sync embed_text to thread
 across background workers

Connector kb_sync_services (gmail, onedrive, google_calendar, jira),
streaming indexers (discord, luma, teams) and the file-processor save
path all called embed_text inside async coroutines, blocking the
background worker's event loop for the duration of the embed. Wrap each
call site in asyncio.to_thread so concurrent indexing tasks stop
serialising on the embed.
---
 surfsense_backend/app/services/gmail/kb_sync_service.py   | 5 ++++-
 .../app/services/google_calendar/kb_sync_service.py       | 8 ++++++--
 surfsense_backend/app/services/jira/kb_sync_service.py    | 8 ++++++--
 .../app/services/onedrive/kb_sync_service.py              | 5 ++++-
 .../app/tasks/connector_indexers/discord_indexer.py       | 4 +++-
 .../app/tasks/connector_indexers/luma_indexer.py          | 5 ++++-
 .../app/tasks/connector_indexers/teams_indexer.py         | 5 ++++-
 surfsense_backend/app/tasks/document_processors/_save.py  | 5 +++--
 8 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/surfsense_backend/app/services/gmail/kb_sync_service.py b/surfsense_backend/app/services/gmail/kb_sync_service.py
index 885ee4b94..6ff5f3c2b 100644
--- a/surfsense_backend/app/services/gmail/kb_sync_service.py
+++ b/surfsense_backend/app/services/gmail/kb_sync_service.py
@@ -1,3 +1,4 @@
+import asyncio
 import logging
 from datetime import datetime
 
@@ -100,7 +101,9 @@ class GmailKBSyncService:
             else:
                 logger.warning("No LLM configured -- using fallback summary")
                 summary_content = f"Gmail Message: {subject}\n\n{indexable_content}"
-                summary_embedding = embed_text(summary_content)
+                summary_embedding = await asyncio.to_thread(
+                    embed_text, summary_content
+                )
 
             chunks = await create_document_chunks(indexable_content)
             now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
diff --git a/surfsense_backend/app/services/google_calendar/kb_sync_service.py b/surfsense_backend/app/services/google_calendar/kb_sync_service.py
index 602a55738..1f017ec4d 100644
--- a/surfsense_backend/app/services/google_calendar/kb_sync_service.py
+++ b/surfsense_backend/app/services/google_calendar/kb_sync_service.py
@@ -116,7 +116,9 @@ class GoogleCalendarKBSyncService:
                 summary_content = (
                     f"Google Calendar Event: {event_summary}\n\n{indexable_content}"
                 )
-                summary_embedding = embed_text(summary_content)
+                summary_embedding = await asyncio.to_thread(
+                    embed_text, summary_content
+                )
 
             chunks = await create_document_chunks(indexable_content)
             now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
@@ -295,7 +297,9 @@ class GoogleCalendarKBSyncService:
                 summary_content = (
                     f"Google Calendar Event: {event_summary}\n\n{indexable_content}"
                 )
-                summary_embedding = embed_text(summary_content)
+                summary_embedding = await asyncio.to_thread(
+                    embed_text, summary_content
+                )
 
             chunks = await create_document_chunks(indexable_content)
             now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
diff --git a/surfsense_backend/app/services/jira/kb_sync_service.py b/surfsense_backend/app/services/jira/kb_sync_service.py
index 8e88bee81..5f6668377 100644
--- a/surfsense_backend/app/services/jira/kb_sync_service.py
+++ b/surfsense_backend/app/services/jira/kb_sync_service.py
@@ -98,7 +98,9 @@ class JiraKBSyncService:
                 summary_content = (
                     f"Jira Issue {issue_identifier}: {issue_title}\n\n{issue_content}"
                 )
-                summary_embedding = embed_text(summary_content)
+                summary_embedding = await asyncio.to_thread(
+                    embed_text, summary_content
+                )
 
             chunks = await create_document_chunks(issue_content)
             now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
@@ -212,7 +214,9 @@ class JiraKBSyncService:
                 summary_content = (
                     f"Jira Issue {issue_identifier}: {issue_title}\n\n{issue_content}"
                 )
-                summary_embedding = embed_text(summary_content)
+                summary_embedding = await asyncio.to_thread(
+                    embed_text, summary_content
+                )
 
             chunks = await create_document_chunks(issue_content)
 
diff --git a/surfsense_backend/app/services/onedrive/kb_sync_service.py b/surfsense_backend/app/services/onedrive/kb_sync_service.py
index e9b2e38ea..e1da3b4a1 100644
--- a/surfsense_backend/app/services/onedrive/kb_sync_service.py
+++ b/surfsense_backend/app/services/onedrive/kb_sync_service.py
@@ -1,3 +1,4 @@
+import asyncio
 import logging
 from datetime import datetime
 
@@ -95,7 +96,9 @@ class OneDriveKBSyncService:
             else:
                 logger.warning("No LLM configured — using fallback summary")
                 summary_content = f"OneDrive File: {file_name}\n\n{indexable_content}"
-                summary_embedding = embed_text(summary_content)
+                summary_embedding = await asyncio.to_thread(
+                    embed_text, summary_content
+                )
 
             chunks = await create_document_chunks(indexable_content)
             now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
diff --git a/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py b/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py
index 5e784cb4f..180f21412 100644
--- a/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py
@@ -670,7 +670,9 @@ async def index_discord_messages(
 
                 # Heavy processing (embeddings, chunks)
                 chunks = await create_document_chunks(item["combined_document_string"])
-                doc_embedding = embed_text(item["combined_document_string"])
+                doc_embedding = await asyncio.to_thread(
+                    embed_text, item["combined_document_string"]
+                )
 
                 # Update document to READY with actual content
                 document.title = f"{item['guild_name']}#{item['channel_name']}"
diff --git a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py
index a698bfd46..555d60273 100644
--- a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py
@@ -6,6 +6,7 @@ Implements 2-phase document status updates for real-time UI feedback:
 - Phase 2: Process each event: pending → processing → ready/failed
 """
 
+import asyncio
 import time
 from collections.abc import Awaitable, Callable
 from datetime import datetime, timedelta
@@ -465,7 +466,9 @@ async def index_luma_events(
                     summary_content = (
                         f"Luma Event: {item['event_name']}\n\n{item['event_markdown']}"
                     )
-                    summary_embedding = embed_text(summary_content)
+                    summary_embedding = await asyncio.to_thread(
+                        embed_text, summary_content
+                    )
 
                 chunks = await create_document_chunks(item["event_markdown"])
 
diff --git a/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py b/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py
index 12cdf384e..25994895a 100644
--- a/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py
@@ -9,6 +9,7 @@ Uses 2-phase document status updates for real-time UI feedback:
 - Phase 2: Process each document: pending → processing → ready/failed
 """
 
+import asyncio
 import time
 from collections.abc import Awaitable, Callable
 from datetime import UTC, datetime
@@ -581,7 +582,9 @@ async def index_teams_messages(
 
                 # Heavy processing (embeddings, chunks)
                 chunks = await create_document_chunks(item["combined_document_string"])
-                doc_embedding = embed_text(item["combined_document_string"])
+                doc_embedding = await asyncio.to_thread(
+                    embed_text, item["combined_document_string"]
+                )
 
                 # Update document to READY with actual content
                 document.title = f"{item['team_name']} - {item['channel_name']}"
diff --git a/surfsense_backend/app/tasks/document_processors/_save.py b/surfsense_backend/app/tasks/document_processors/_save.py
index ae45f7a69..d633dd4f6 100644
--- a/surfsense_backend/app/tasks/document_processors/_save.py
+++ b/surfsense_backend/app/tasks/document_processors/_save.py
@@ -2,6 +2,7 @@
 Unified document save/update logic for file processors.
 """
 
+import asyncio
 import logging
 
 from sqlalchemy.exc import SQLAlchemyError
@@ -43,7 +44,7 @@ async def _generate_summary(
     """
     if not enable_summary:
         summary = f"File: {file_name}\n\n{markdown_content[:4000]}"
-        return summary, embed_text(summary)
+        return summary, await asyncio.to_thread(embed_text, summary)
 
     if etl_service == "DOCLING":
         from app.services.docling_service import create_docling_service
@@ -65,7 +66,7 @@ async def _generate_summary(
                 parts.append(f"**{formatted_key}:** {value}")
 
         enhanced = "\n".join(parts) + "\n\n# DOCUMENT SUMMARY\n\n" + summary_text
-        return enhanced, embed_text(enhanced)
+        return enhanced, await asyncio.to_thread(embed_text, enhanced)
 
     # Standard summary (Unstructured / LlamaCloud / others)
     meta = {

From c3db25302b50ff84bd0971697b0841a0071b118d Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 11:03:08 +0200
Subject: [PATCH 29/34] perf(chat): kill auto-pin preflight + speculative
 build, rely on reactive 429 recovery

The preflight pattern probed the LLM with a 1-token ping before each
cold turn (when requested_llm_config_id==0, llm_config_id<0, and the
45s healthy TTL had expired) to detect 429s before fanning out into
planner/classifier/title-gen. To absorb its ~1-5s RTT cost we built the
agent speculatively in parallel; on 429 we discarded the build and
repinned.

Three problems with that design:

1. False security. Provider rate limits are token-bucket. A 1-token
   ping consumes ~5 tokens; the real request consumes 10-50K. The
   probe can return 200 while the real call still 429s.
2. Pure overhead in the common case. On warm-agent-cache turns the
   probe dominates wall time: ~2.5s of pure TTFT tax for the ~99% of
   users who never see a 429.
3. The in-stream recovery loop (catch of _is_provider_rate_limited
   gated by not _first_event_logged) already does the right thing
   reactively: mark_runtime_cooldown -> resolve_or_get_pinned_llm_config_id
   with exclude_config_ids={previous} -> rebuild agent -> retry the
   stream. Preflight was never the only safety net; it was a redundant
   probe in front of one.

Changes:
- Delete _preflight_llm, _settle_speculative_agent_build, and the
  _PREFLIGHT_TIMEOUT_SEC / _PREFLIGHT_MAX_TOKENS constants.
- Drop the parallel agent_build_task / preflight_task plumbing in
  both stream_new_chat and stream_resume_chat; build the agent inline
  with await _build_main_agent_for_thread(...).
- Drop the unused is_recently_healthy / mark_healthy imports here
  (they are still exported from auto_model_pin_service, since the
  OpenRouter catalogue refresh and a few tests reference clear_healthy).
- Remove the obsolete preflight + settle-speculative tests from
  test_stream_new_chat_contract.py.

Net: -447 LOC. ~2.5s removed from TTFT on every cold preflight-eligible
turn. 429 recovery path is unchanged - same repin/rebuild/retry, just
not paid in advance on the healthy path.
---
 .../app/tasks/chat/stream_new_chat.py         | 405 ++----------------
 .../unit/test_stream_new_chat_contract.py     | 122 ------
 2 files changed, 40 insertions(+), 487 deletions(-)

diff --git a/surfsense_backend/app/tasks/chat/stream_new_chat.py b/surfsense_backend/app/tasks/chat/stream_new_chat.py
index 3d639affb..da84e7350 100644
--- a/surfsense_backend/app/tasks/chat/stream_new_chat.py
+++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py
@@ -64,8 +64,6 @@ from app.db import (
 )
 from app.prompts import TITLE_GENERATION_PROMPT
 from app.services.auto_model_pin_service import (
-    is_recently_healthy,
-    mark_healthy,
     mark_runtime_cooldown,
     resolve_or_get_pinned_llm_config_id,
 )
@@ -502,54 +500,6 @@ def _is_provider_rate_limited(exc: BaseException) -> bool:
     )
 
 
-_PREFLIGHT_TIMEOUT_SEC: float = 2.5
-_PREFLIGHT_MAX_TOKENS: int = 1
-
-
-async def _preflight_llm(llm: Any) -> None:
-    """Issue a minimal completion to confirm the pinned model isn't 429'ing.
-
-    Used before agent build / planner / classifier / title-gen so a known-bad
-    free OpenRouter deployment is detected and repinned before it cascades
-    into multiple wasted internal calls. The probe is intentionally cheap:
-    one token, low timeout, tagged ``surfsense:internal`` so token tracking
-    and SSE pipelines treat it as overhead rather than user output.
-
-    Raises the original exception when the provider responds with a
-    rate-limit-shaped error so the caller can drive the cooldown/repin
-    branch via :func:`_is_provider_rate_limited`. Other transient failures
-    are swallowed — the caller continues to the normal stream path and the
-    in-stream recovery loop remains the safety net.
-    """
-    from litellm import acompletion
-
-    model = getattr(llm, "model", None)
-    if not model or model == "auto":
-        # Auto-mode router doesn't have a single deployment to ping; the
-        # router itself handles per-deployment rate-limit accounting.
-        return
-
-    try:
-        await acompletion(
-            model=model,
-            messages=[{"role": "user", "content": "ping"}],
-            api_key=getattr(llm, "api_key", None),
-            api_base=getattr(llm, "api_base", None),
-            max_tokens=_PREFLIGHT_MAX_TOKENS,
-            timeout=_PREFLIGHT_TIMEOUT_SEC,
-            stream=False,
-            metadata={"tags": ["surfsense:internal", "auto-pin-preflight"]},
-        )
-    except Exception as exc:
-        if _is_provider_rate_limited(exc):
-            raise
-        logging.getLogger(__name__).debug(
-            "auto_pin_preflight non_rate_limit_error model=%s err=%s",
-            model,
-            exc,
-        )
-
-
 async def _build_main_agent_for_thread(
     agent_factory: Any,
     *,
@@ -567,9 +517,9 @@ async def _build_main_agent_for_thread(
     disabled_tools: list[str] | None = None,
     mentioned_document_ids: list[int] | None = None,
 ) -> Any:
-    """Single (re)build path so the agent factory cannot drift across
-    initial build, preflight repin, and mid-stream 429 recovery for one
-    ``thread_id``: a graph swap mid-turn would corrupt checkpointer state."""
+    """Single (re)build path so the agent factory cannot drift across the
+    initial build and mid-stream 429 recovery for one ``thread_id``: a
+    graph swap mid-turn would corrupt checkpointer state."""
     return await agent_factory(
         llm=llm,
         search_space_id=search_space_id,
@@ -587,29 +537,6 @@ async def _build_main_agent_for_thread(
     )
 
 
-async def _settle_speculative_agent_build(task: asyncio.Task[Any]) -> None:
-    """Wait for a discarded speculative agent build to release shared state.
-
-    Used by the parallel preflight + agent-build path. The speculative build
-    closes over the request-scoped ``AsyncSession`` (for the brief connector
-    discovery / tool-factory window before its CPU work moves into a worker
-    thread). If preflight reports a 429 we want to fall back to the original
-    repin → reload → rebuild path, but we MUST NOT touch ``session`` again
-    until any in-flight session work owned by the speculative build has
-    fully settled — :class:`sqlalchemy.ext.asyncio.AsyncSession` is not
-    concurrency-safe and the same hazard cost us a hard ``InvalidRequestError``
-    earlier in this PR (see ``connector_service`` parallel-gather revert).
-
-    We simply ``await`` the task and swallow any exception: in this path the
-    build's outcome is irrelevant — success populates the agent cache (a free
-    side effect), failure is discarded. The wasted CPU is acceptable since
-    429 fallbacks are rare and the original sequential code also paid the
-    full build cost on the same path.
-    """
-    with contextlib.suppress(BaseException):
-        await task
-
-
 def _classify_stream_exception(
     exc: Exception,
     *,
@@ -1237,39 +1164,6 @@ async def stream_new_chat(
             yield streaming_service.format_done()
             return
 
-        # Auto-mode preflight ping. Runs ONLY for thread-pinned auto cfgs
-        # (negative ids selected via ``resolve_or_get_pinned_llm_config_id``)
-        # whose health hasn't already been confirmed within the TTL window.
-        # Detecting a 429 here lets us repin BEFORE the planner/classifier/
-        # title-generation LLM calls fan out and each independently hit the
-        # same upstream rate limit.
-        #
-        # PERF: preflight is a network round-trip to the LLM provider (~1-5s)
-        # and is independent of the agent build (CPU-bound, ~5-7s). They used
-        # to run sequentially → ``preflight + build`` on cold cache = 11.5s.
-        # We now kick off preflight as a background task FIRST, then run the
-        # synchronous setup work and the agent build in parallel. In the
-        # success path (the common case) total wall time drops to roughly
-        # ``max(preflight, build)`` — the preflight finishes during the
-        # agent compile and we just consume its result. In the rare 429
-        # path the speculative build is awaited to completion (so its
-        # session usage is fully released) via
-        # :func:`_settle_speculative_agent_build`, then discarded, and
-        # we fall back to the original repin-and-rebuild flow.
-        preflight_needed = (
-            requested_llm_config_id == 0
-            and llm_config_id < 0
-            and not is_recently_healthy(llm_config_id)
-        )
-        preflight_task: asyncio.Task[None] | None = None
-        _t_preflight = 0.0
-        if preflight_needed:
-            _t_preflight = time.perf_counter()
-            preflight_task = asyncio.create_task(
-                _preflight_llm(llm),
-                name=f"auto_pin_preflight:{llm_config_id}",
-            )
-
         # Create connector service
         _t0 = time.perf_counter()
         connector_service = ConnectorService(session, search_space_id=search_space_id)
@@ -1303,136 +1197,26 @@ async def stream_new_chat(
             if use_multi_agent
             else create_surfsense_deep_agent
         )
-        # Speculative agent build — runs in parallel with the preflight
-        # task (if any). Built with the *current* ``llm`` / ``agent_config``;
-        # if preflight reports 429 we will discard this future and rebuild
-        # against the freshly pinned config below.
-        agent_build_task = asyncio.create_task(
-            _build_main_agent_for_thread(
-                agent_factory,
-                llm=llm,
-                search_space_id=search_space_id,
-                db_session=session,
-                connector_service=connector_service,
-                checkpointer=checkpointer,
-                user_id=user_id,
-                thread_id=chat_id,
-                agent_config=agent_config,
-                firecrawl_api_key=firecrawl_api_key,
-                thread_visibility=visibility,
-                filesystem_selection=filesystem_selection,
-                disabled_tools=disabled_tools,
-                mentioned_document_ids=mentioned_document_ids,
-            ),
-            name="agent_build:stream_new_chat",
+        # Build the agent inline. Provider 429s surface through the
+        # in-stream recovery loop below (``_is_provider_rate_limited``),
+        # which repins the thread to an eligible alternative config and
+        # rebuilds the agent before the user sees any output.
+        agent = await _build_main_agent_for_thread(
+            agent_factory,
+            llm=llm,
+            search_space_id=search_space_id,
+            db_session=session,
+            connector_service=connector_service,
+            checkpointer=checkpointer,
+            user_id=user_id,
+            thread_id=chat_id,
+            agent_config=agent_config,
+            firecrawl_api_key=firecrawl_api_key,
+            thread_visibility=visibility,
+            filesystem_selection=filesystem_selection,
+            disabled_tools=disabled_tools,
+            mentioned_document_ids=mentioned_document_ids,
         )
-
-        agent: Any = None
-        if preflight_task is not None:
-            try:
-                await preflight_task
-                mark_healthy(llm_config_id)
-                _perf_log.info(
-                    "[stream_new_chat] auto_pin_preflight ok config_id=%s took=%.3fs (parallel)",
-                    llm_config_id,
-                    time.perf_counter() - _t_preflight,
-                )
-            except Exception as preflight_exc:
-                # Both branches below need the session: the non-429 path
-                # may unwind via cleanup that uses ``session``, and the
-                # 429 path explicitly calls ``resolve_or_get_pinned_llm_config_id``
-                # against it. Wait for the speculative build to release its
-                # session usage before we proceed.
-                await _settle_speculative_agent_build(agent_build_task)
-                if not _is_provider_rate_limited(preflight_exc):
-                    raise
-                # 429: speculative agent is discarded; run the original
-                # repin → reload → rebuild path against the freshly
-                # pinned config.
-                previous_config_id = llm_config_id
-                mark_runtime_cooldown(
-                    previous_config_id, reason="preflight_rate_limited"
-                )
-                try:
-                    llm_config_id = (
-                        await resolve_or_get_pinned_llm_config_id(
-                            session,
-                            thread_id=chat_id,
-                            search_space_id=search_space_id,
-                            user_id=user_id,
-                            selected_llm_config_id=0,
-                            exclude_config_ids={previous_config_id},
-                            requires_image_input=_requires_image_input,
-                        )
-                    ).resolved_llm_config_id
-                except ValueError as pin_error:
-                    yield _emit_stream_error(
-                        message=str(pin_error),
-                        error_kind="server_error",
-                        error_code="SERVER_ERROR",
-                    )
-                    yield streaming_service.format_done()
-                    return
-
-                llm, agent_config, llm_load_error = await _load_llm_bundle(
-                    llm_config_id
-                )
-                if llm_load_error or not llm:
-                    yield _emit_stream_error(
-                        message=llm_load_error or "Failed to create LLM instance",
-                        error_kind="server_error",
-                        error_code="SERVER_ERROR",
-                    )
-                    yield streaming_service.format_done()
-                    return
-                # Trust the freshly-resolved cfg for the remainder of this
-                # turn rather than recursing into another preflight; the
-                # in-stream 429 recovery loop is still in place as the
-                # safety net if even this fallback hits an upstream cap.
-                mark_healthy(llm_config_id)
-                _log_chat_stream_error(
-                    flow=flow,
-                    error_kind="rate_limited",
-                    error_code="RATE_LIMITED",
-                    severity="info",
-                    is_expected=True,
-                    request_id=request_id,
-                    thread_id=chat_id,
-                    search_space_id=search_space_id,
-                    user_id=user_id,
-                    message=(
-                        "Auto-pinned model failed preflight; switched to another "
-                        "eligible model and continuing."
-                    ),
-                    extra={
-                        "auto_runtime_recover": True,
-                        "preflight": True,
-                        "previous_config_id": previous_config_id,
-                        "fallback_config_id": llm_config_id,
-                    },
-                )
-                # Rebuild against the new llm/agent_config. Sequential
-                # here because we no longer have anything to overlap with.
-                agent = await agent_factory(
-                    llm=llm,
-                    search_space_id=search_space_id,
-                    db_session=session,
-                    connector_service=connector_service,
-                    checkpointer=checkpointer,
-                    user_id=user_id,
-                    thread_id=chat_id,
-                    agent_config=agent_config,
-                    firecrawl_api_key=firecrawl_api_key,
-                    thread_visibility=visibility,
-                    disabled_tools=disabled_tools,
-                    mentioned_document_ids=mentioned_document_ids,
-                    filesystem_selection=filesystem_selection,
-                )
-
-        if agent is None:
-            # Either no preflight was needed, or preflight succeeded —
-            # in both cases the speculative build is the agent we want.
-            agent = await agent_build_task
         _perf_log.info(
             "[stream_new_chat] Agent created in %.3fs", time.perf_counter() - _t0
         )
@@ -2678,25 +2462,6 @@ async def stream_resume_chat(
             yield streaming_service.format_done()
             return
 
-        # Auto-mode preflight ping (resume path). Mirrors ``stream_new_chat``:
-        # one cheap probe before the agent is rebuilt so a 429'd pin gets
-        # repinned without burning planner/classifier/title calls first.
-        # See ``stream_new_chat`` for the full rationale on the speculative
-        # parallel build pattern below.
-        preflight_needed = (
-            requested_llm_config_id == 0
-            and llm_config_id < 0
-            and not is_recently_healthy(llm_config_id)
-        )
-        preflight_task: asyncio.Task[None] | None = None
-        _t_preflight = 0.0
-        if preflight_needed:
-            _t_preflight = time.perf_counter()
-            preflight_task = asyncio.create_task(
-                _preflight_llm(llm),
-                name=f"auto_pin_preflight_resume:{llm_config_id}",
-            )
-
         _t0 = time.perf_counter()
         connector_service = ConnectorService(session, search_space_id=search_space_id)
 
@@ -2726,115 +2491,25 @@ async def stream_resume_chat(
             if _app_config.MULTI_AGENT_CHAT_ENABLED
             else create_surfsense_deep_agent
         )
-        agent_build_task = asyncio.create_task(
-            _build_main_agent_for_thread(
-                agent_factory,
-                llm=llm,
-                search_space_id=search_space_id,
-                db_session=session,
-                connector_service=connector_service,
-                checkpointer=checkpointer,
-                user_id=user_id,
-                thread_id=chat_id,
-                agent_config=agent_config,
-                firecrawl_api_key=firecrawl_api_key,
-                thread_visibility=visibility,
-                filesystem_selection=filesystem_selection,
-                disabled_tools=disabled_tools,
-            ),
-            name="agent_build:stream_resume",
+        # Build the agent inline. Provider 429s are handled by the
+        # in-stream recovery loop, which repins to an eligible
+        # alternative config and rebuilds the agent before the user sees
+        # any output.
+        agent = await _build_main_agent_for_thread(
+            agent_factory,
+            llm=llm,
+            search_space_id=search_space_id,
+            db_session=session,
+            connector_service=connector_service,
+            checkpointer=checkpointer,
+            user_id=user_id,
+            thread_id=chat_id,
+            agent_config=agent_config,
+            firecrawl_api_key=firecrawl_api_key,
+            thread_visibility=visibility,
+            filesystem_selection=filesystem_selection,
+            disabled_tools=disabled_tools,
         )
-
-        agent: Any = None
-        if preflight_task is not None:
-            try:
-                await preflight_task
-                mark_healthy(llm_config_id)
-                _perf_log.info(
-                    "[stream_resume] auto_pin_preflight ok config_id=%s took=%.3fs (parallel)",
-                    llm_config_id,
-                    time.perf_counter() - _t_preflight,
-                )
-            except Exception as preflight_exc:
-                # Same session-safety rationale as ``stream_new_chat``.
-                await _settle_speculative_agent_build(agent_build_task)
-                if not _is_provider_rate_limited(preflight_exc):
-                    raise
-                previous_config_id = llm_config_id
-                mark_runtime_cooldown(
-                    previous_config_id, reason="preflight_rate_limited"
-                )
-                try:
-                    llm_config_id = (
-                        await resolve_or_get_pinned_llm_config_id(
-                            session,
-                            thread_id=chat_id,
-                            search_space_id=search_space_id,
-                            user_id=user_id,
-                            selected_llm_config_id=0,
-                            exclude_config_ids={previous_config_id},
-                        )
-                    ).resolved_llm_config_id
-                except ValueError as pin_error:
-                    yield _emit_stream_error(
-                        message=str(pin_error),
-                        error_kind="server_error",
-                        error_code="SERVER_ERROR",
-                    )
-                    yield streaming_service.format_done()
-                    return
-
-                llm, agent_config, llm_load_error = await _load_llm_bundle(
-                    llm_config_id
-                )
-                if llm_load_error or not llm:
-                    yield _emit_stream_error(
-                        message=llm_load_error or "Failed to create LLM instance",
-                        error_kind="server_error",
-                        error_code="SERVER_ERROR",
-                    )
-                    yield streaming_service.format_done()
-                    return
-                mark_healthy(llm_config_id)
-                _log_chat_stream_error(
-                    flow="resume",
-                    error_kind="rate_limited",
-                    error_code="RATE_LIMITED",
-                    severity="info",
-                    is_expected=True,
-                    request_id=request_id,
-                    thread_id=chat_id,
-                    search_space_id=search_space_id,
-                    user_id=user_id,
-                    message=(
-                        "Auto-pinned model failed preflight; switched to another "
-                        "eligible model and continuing."
-                    ),
-                    extra={
-                        "auto_runtime_recover": True,
-                        "preflight": True,
-                        "previous_config_id": previous_config_id,
-                        "fallback_config_id": llm_config_id,
-                    },
-                )
-                agent = await _build_main_agent_for_thread(
-                    agent_factory,
-                    llm=llm,
-                    search_space_id=search_space_id,
-                    db_session=session,
-                    connector_service=connector_service,
-                    checkpointer=checkpointer,
-                    user_id=user_id,
-                    thread_id=chat_id,
-                    agent_config=agent_config,
-                    firecrawl_api_key=firecrawl_api_key,
-                    thread_visibility=visibility,
-                    filesystem_selection=filesystem_selection,
-                    disabled_tools=disabled_tools,
-                )
-
-        if agent is None:
-            agent = await agent_build_task
         _perf_log.info(
             "[stream_resume] Agent created in %.3fs", time.perf_counter() - _t0
         )
diff --git a/surfsense_backend/tests/unit/test_stream_new_chat_contract.py b/surfsense_backend/tests/unit/test_stream_new_chat_contract.py
index 208204ca9..19b06201f 100644
--- a/surfsense_backend/tests/unit/test_stream_new_chat_contract.py
+++ b/surfsense_backend/tests/unit/test_stream_new_chat_contract.py
@@ -209,128 +209,6 @@ def test_stream_exception_classifies_openrouter_429_payload():
     assert extra is None
 
 
-@pytest.mark.asyncio
-async def test_preflight_swallows_non_rate_limit_errors_and_re_raises_429(monkeypatch):
-    """``_preflight_llm`` is best-effort.
-
-    - On rate-limit shaped exceptions (provider 429) it MUST re-raise so the
-      caller can drive the cooldown/repin branch.
-    - On any other transient failure it MUST swallow the error so the normal
-      stream path continues without surfacing preflight noise to the user.
-    """
-    from types import SimpleNamespace
-
-    from app.tasks.chat.stream_new_chat import _preflight_llm
-
-    class _RateLimitedError(Exception):
-        """Class-name carries 'RateLimit' so _is_provider_rate_limited triggers."""
-
-    rate_calls: list[dict] = []
-    other_calls: list[dict] = []
-
-    async def _fake_acompletion_429(**kwargs):
-        rate_calls.append(kwargs)
-        raise _RateLimitedError("simulated 429")
-
-    async def _fake_acompletion_other(**kwargs):
-        other_calls.append(kwargs)
-        raise RuntimeError("some unrelated transient failure")
-
-    fake_llm = SimpleNamespace(
-        model="openrouter/google/gemma-4-31b-it:free",
-        api_key="test",
-        api_base=None,
-    )
-
-    import litellm  # type: ignore[import-not-found]
-
-    monkeypatch.setattr(litellm, "acompletion", _fake_acompletion_429)
-    with pytest.raises(_RateLimitedError):
-        await _preflight_llm(fake_llm)
-    assert len(rate_calls) == 1
-    assert rate_calls[0]["max_tokens"] == 1
-    assert rate_calls[0]["stream"] is False
-
-    monkeypatch.setattr(litellm, "acompletion", _fake_acompletion_other)
-    # MUST NOT raise: non-rate-limit failures are swallowed.
-    await _preflight_llm(fake_llm)
-    assert len(other_calls) == 1
-
-
-@pytest.mark.asyncio
-async def test_preflight_skipped_for_auto_router_model():
-    """Router-mode ``model='auto'`` has no single deployment to ping; the
-    LiteLLM router itself owns per-deployment rate-limit accounting, so the
-    preflight helper must short-circuit instead of issuing a probe."""
-    from types import SimpleNamespace
-
-    from app.tasks.chat.stream_new_chat import _preflight_llm
-
-    fake_llm = SimpleNamespace(model="auto", api_key="x", api_base=None)
-    # Should return without raising or making any LiteLLM call.
-    await _preflight_llm(fake_llm)
-
-
-@pytest.mark.asyncio
-async def test_settle_speculative_agent_build_swallows_exceptions():
-    """``_settle_speculative_agent_build`` MUST always return cleanly so the
-    caller can safely re-touch the request-scoped session afterwards.
-
-    The helper guards the parallel preflight + agent-build path: when the
-    speculative build is being discarded (429 or non-429 preflight failure)
-    we await it solely to release any in-flight ``AsyncSession`` usage —
-    the build's outcome is irrelevant. Any exception (including
-    ``CancelledError``) leaking out would skip the caller's recovery flow
-    and re-introduce the very session-concurrency hazard the helper exists
-    to prevent.
-    """
-    import asyncio
-
-    from app.tasks.chat.stream_new_chat import _settle_speculative_agent_build
-
-    async def _raises() -> None:
-        raise RuntimeError("speculative build crashed")
-
-    async def _succeeds() -> str:
-        return "agent"
-
-    async def _slow() -> None:
-        await asyncio.sleep(0.05)
-
-    for coro in (_raises(), _succeeds(), _slow()):
-        task = asyncio.create_task(coro)
-        await _settle_speculative_agent_build(task)
-        assert task.done()
-
-
-@pytest.mark.asyncio
-async def test_settle_speculative_agent_build_handles_already_done_task():
-    """Done tasks (success or failure) must still be settled without raising."""
-    import asyncio
-
-    from app.tasks.chat.stream_new_chat import _settle_speculative_agent_build
-
-    async def _ok() -> str:
-        return "ok"
-
-    async def _bad() -> None:
-        raise ValueError("nope")
-
-    ok_task = asyncio.create_task(_ok())
-    bad_task = asyncio.create_task(_bad())
-    # Drive both to completion before settling.
-    await asyncio.sleep(0)
-    await asyncio.sleep(0)
-
-    await _settle_speculative_agent_build(ok_task)
-    await _settle_speculative_agent_build(bad_task)
-    assert ok_task.result() == "ok"
-    # ``bad_task`` exception was consumed by the settle helper; calling
-    # ``.exception()`` after the fact must still return the original error
-    # (the helper observes it but doesn't clear it).
-    assert isinstance(bad_task.exception(), ValueError)
-
-
 def test_stream_exception_classifies_thread_busy():
     exc = BusyError(request_id="thread-123")
     kind, code, severity, is_expected, user_message, extra = _classify_stream_exception(

From 71dead0406cf93a53029e08c3f6433d056bc6998 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 11:42:52 +0200
Subject: [PATCH 30/34] perf(kb-planner): route internal planner calls to
 dedicated small/fast LLM

Adds an optional planner LLM role wired through KnowledgePriorityMiddleware
so KB query rewriting, date extraction, and recency classification run on a
cheap model (e.g. gpt-4o-mini, Haiku, Azure nano) instead of the user's
chat LLM. Operators opt in by setting is_planner: true on exactly one
global config; without it, behavior is unchanged.
---
 .../main_agent/knowledge_priority.py          |  2 +
 .../app/agents/new_chat/chat_deepagent.py     |  2 +
 .../new_chat/middleware/knowledge_search.py   | 30 +++++++----
 surfsense_backend/app/config/__init__.py      | 13 +++++
 .../app/config/global_llm_config.example.yaml | 53 +++++++++++++++++++
 surfsense_backend/app/services/llm_service.py | 33 ++++++++++++
 6 files changed, 123 insertions(+), 10 deletions(-)

diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/knowledge_priority.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/knowledge_priority.py
index fcdb1c61e..27cee8b37 100644
--- a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/knowledge_priority.py
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/knowledge_priority.py
@@ -6,6 +6,7 @@ from langchain_core.language_models import BaseChatModel
 
 from app.agents.new_chat.filesystem_selection import FilesystemMode
 from app.agents.new_chat.middleware import KnowledgePriorityMiddleware
+from app.services.llm_service import get_planner_llm
 
 
 def build_knowledge_priority_mw(
@@ -19,6 +20,7 @@ def build_knowledge_priority_mw(
 ) -> KnowledgePriorityMiddleware:
     return KnowledgePriorityMiddleware(
         llm=llm,
+        planner_llm=get_planner_llm(),
         search_space_id=search_space_id,
         filesystem_mode=filesystem_mode,
         available_connectors=available_connectors,
diff --git a/surfsense_backend/app/agents/new_chat/chat_deepagent.py b/surfsense_backend/app/agents/new_chat/chat_deepagent.py
index 605c31416..f8db333ba 100644
--- a/surfsense_backend/app/agents/new_chat/chat_deepagent.py
+++ b/surfsense_backend/app/agents/new_chat/chat_deepagent.py
@@ -102,6 +102,7 @@ from app.agents.new_chat.tools.registry import (
 )
 from app.db import ChatVisibility
 from app.services.connector_service import ConnectorService
+from app.services.llm_service import get_planner_llm
 from app.utils.perf import get_perf_logger
 
 _perf_log = get_perf_logger()
@@ -1077,6 +1078,7 @@ def _build_compiled_agent_blocking(
         else None,
         KnowledgePriorityMiddleware(
             llm=llm,
+            planner_llm=get_planner_llm(),
             search_space_id=search_space_id,
             filesystem_mode=filesystem_mode,
             available_connectors=available_connectors,
diff --git a/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py b/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py
index 98bbf3bd7..77b413940 100644
--- a/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py
@@ -579,6 +579,7 @@ class KnowledgePriorityMiddleware(AgentMiddleware):  # type: ignore[type-arg]
         self,
         *,
         llm: BaseChatModel | None = None,
+        planner_llm: BaseChatModel | None = None,
         search_space_id: int,
         filesystem_mode: FilesystemMode = FilesystemMode.CLOUD,
         available_connectors: list[str] | None = None,
@@ -588,6 +589,15 @@ class KnowledgePriorityMiddleware(AgentMiddleware):  # type: ignore[type-arg]
         inject_system_message: bool = True,  # For backwards compatibility
     ) -> None:
         self.llm = llm
+        # The planner LLM handles short, structured internal tasks (query
+        # rewriting, date extraction, recency classification). When an
+        # operator marks a global config ``is_planner: true`` we route
+        # those calls to a cheap/fast model (e.g. gpt-4o-mini, Haiku, Azure
+        # gpt-5.x-nano) instead of the user's chat LLM — those classification
+        # tasks don't need frontier-tier capability. Falls back to the chat
+        # LLM when no planner config is wired up so deployments without one
+        # keep working unchanged.
+        self.planner_llm = planner_llm or llm
         self.search_space_id = search_space_id
         self.filesystem_mode = filesystem_mode
         self.available_connectors = available_connectors
@@ -598,7 +608,7 @@ class KnowledgePriorityMiddleware(AgentMiddleware):  # type: ignore[type-arg]
         # Build the kb-planner private Runnable ONCE here so we don't pay
         # the ``create_agent`` compile cost (50-200ms) on every turn.
         # Disabled by default behind ``enable_kb_planner_runnable``; when
-        # off the planner falls back to the legacy ``self.llm.ainvoke``
+        # off the planner falls back to the legacy ``planner_llm.ainvoke``
         # path.
         self._planner: Runnable | None = None
         self._planner_compile_failed = False
@@ -608,7 +618,7 @@ class KnowledgePriorityMiddleware(AgentMiddleware):  # type: ignore[type-arg]
 
         Returns ``None`` when the feature flag is disabled, when the LLM is
         unavailable, or when ``create_agent`` raises (we fall back to the
-        legacy ``self.llm.ainvoke`` path in that case). Compilation happens
+        legacy ``planner_llm.ainvoke`` path in that case). Compilation happens
         lazily on first call, then memoized via ``self._planner``.
 
         The compiled agent is constructed without tools — the planner's
@@ -618,7 +628,7 @@ class KnowledgePriorityMiddleware(AgentMiddleware):  # type: ignore[type-arg]
         """
         if self._planner is not None or self._planner_compile_failed:
             return self._planner
-        if self.llm is None:
+        if self.planner_llm is None:
             return None
         flags = get_flags()
         if not flags.enable_kb_planner_runnable or flags.disable_new_agent_stack:
@@ -628,13 +638,13 @@ class KnowledgePriorityMiddleware(AgentMiddleware):  # type: ignore[type-arg]
 
         try:
             self._planner = create_agent(
-                self.llm,
+                self.planner_llm,
                 tools=[],
                 middleware=[RetryAfterMiddleware(max_retries=2)],
             )
         except Exception as exc:  # pragma: no cover - defensive
             logger.warning(
-                "kb-planner Runnable compile failed; falling back to llm.ainvoke: %s",
+                "kb-planner Runnable compile failed; falling back to planner_llm.ainvoke: %s",
                 exc,
             )
             self._planner_compile_failed = True
@@ -647,12 +657,12 @@ class KnowledgePriorityMiddleware(AgentMiddleware):  # type: ignore[type-arg]
         messages: Sequence[BaseMessage],
         user_text: str,
     ) -> tuple[str, datetime | None, datetime | None, bool]:
-        if self.llm is None:
+        if self.planner_llm is None:
             return user_text, None, None, False
 
         recent_conversation = _render_recent_conversation(
             messages,
-            llm=self.llm,
+            llm=self.planner_llm,
             user_text=user_text,
         )
         prompt = _build_kb_planner_prompt(
@@ -663,8 +673,8 @@ class KnowledgePriorityMiddleware(AgentMiddleware):  # type: ignore[type-arg]
         t0 = loop.time()
 
         # Prefer the compiled-once planner Runnable when enabled; otherwise
-        # fall back to ``self.llm.ainvoke``. The ``surfsense:internal`` tag
-        # is preserved on both paths so ``_stream_agent_events`` still
+        # fall back to ``planner_llm.ainvoke``. The ``surfsense:internal``
+        # tag is preserved on both paths so ``_stream_agent_events`` still
         # suppresses the planner's intermediate events from the UI.
         planner = self._build_kb_planner_runnable()
         try:
@@ -684,7 +694,7 @@ class KnowledgePriorityMiddleware(AgentMiddleware):  # type: ignore[type-arg]
                     else AIMessage(content="")
                 )
             else:
-                response = await self.llm.ainvoke(
+                response = await self.planner_llm.ainvoke(
                     [HumanMessage(content=prompt)],
                     config={"tags": ["surfsense:internal"]},
                 )
diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py
index 448818e88..5643c048b 100644
--- a/surfsense_backend/app/config/__init__.py
+++ b/surfsense_backend/app/config/__init__.py
@@ -110,6 +110,19 @@ def load_global_llm_configs():
         except Exception as e:
             print(f"Warning: Failed to score global LLM configs: {e}")
 
+        # Planner LLM is a singleton role. If an operator accidentally
+        # marks multiple configs ``is_planner: true``, only the first one
+        # is used at runtime — surface the others at startup so the
+        # mistake is caught before traffic, not silently buried.
+        planner_cfgs = [c for c in configs if c.get("is_planner") is True]
+        if len(planner_cfgs) > 1:
+            extra_ids = [c.get("id") for c in planner_cfgs[1:]]
+            print(
+                "Warning: Multiple global LLM configs marked is_planner=true "
+                f"(ids {[c.get('id') for c in planner_cfgs]}); using id "
+                f"{planner_cfgs[0].get('id')} and ignoring {extra_ids}"
+            )
+
         return configs
     except Exception as e:
         print(f"Warning: Failed to load global LLM configs: {e}")
diff --git a/surfsense_backend/app/config/global_llm_config.example.yaml b/surfsense_backend/app/config/global_llm_config.example.yaml
index d92640c8d..83d556754 100644
--- a/surfsense_backend/app/config/global_llm_config.example.yaml
+++ b/surfsense_backend/app/config/global_llm_config.example.yaml
@@ -258,6 +258,45 @@ global_llm_configs:
     use_default_system_instructions: true
     citations_enabled: true
 
+  # Example: Planner LLM - small, fast model used for internal utility tasks
+  #
+  # The PLANNER role handles short, structured internal calls (KB query
+  # rewriting, date extraction, recency classification, etc.) that don't
+  # need frontier-tier capability. Pointing the planner at a cheap+fast
+  # model (gpt-4o-mini, Claude Haiku, Azure gpt-5.x-nano, Groq Llama, ...)
+  # typically saves 500ms-1.5s per turn vs. routing those same internal
+  # calls through the user's chat model.
+  #
+  # Activation:
+  #   - Mark EXACTLY ONE global config with ``is_planner: true``.
+  #   - If multiple are marked, the first one wins and a startup warning is printed.
+  #   - If none is marked, every internal call falls back to the user's
+  #     chat LLM (same behavior as before this flag existed).
+  #
+  # This config is operator-only — it is NOT exposed in the user-facing
+  # model selector, never billed against premium quota, and the
+  # billing_tier / anonymous_enabled fields below are ignored.
+  - id: -9
+    name: "Global Planner (GPT-4o mini)"
+    description: "Internal-only planner LLM for query rewriting and classification"
+    is_planner: true
+    billing_tier: "free"
+    anonymous_enabled: false
+    seo_enabled: false
+    quota_reserve_tokens: 1000
+    provider: "OPENAI"
+    model_name: "gpt-4o-mini"
+    api_key: "sk-your-openai-api-key-here"
+    api_base: ""
+    rpm: 3500
+    tpm: 200000
+    litellm_params:
+      temperature: 0
+      max_tokens: 1000
+    system_instructions: ""
+    use_default_system_instructions: true
+    citations_enabled: false
+
 # =============================================================================
 # OpenRouter Integration
 # =============================================================================
@@ -493,6 +532,20 @@ global_vision_llm_configs:
 # - Lower temperature (0.3) is recommended for accurate screenshot analysis
 # - Lower max_tokens (1000) is sufficient since autocomplete produces short suggestions
 #
+# PLANNER LLM NOTES:
+# - is_planner: true marks a config as the internal-only planner LLM (small,
+#   fast model used for KB query rewriting, date extraction, recency
+#   classification, etc.). Only one config may carry this flag — if
+#   multiple do, the first one wins and a startup warning is printed.
+# - When no config is marked is_planner, every internal utility call falls
+#   back to the user's chat LLM (the historical behavior).
+# - Planner configs are NOT shown in the user-facing model selector and
+#   are NOT billed against the user's premium quota. Their billing_tier,
+#   anonymous_enabled, seo_* fields are ignored.
+# - Recommended models: gpt-4o-mini, claude-3-5-haiku, gemini-1.5-flash,
+#   azure gpt-5.x-nano, groq llama3-8b — anything <200ms p50 on a 1-2k
+#   prompt. Frontier models here defeat the purpose of the flag.
+#
 # TOKEN QUOTA & ANONYMOUS ACCESS NOTES:
 # - billing_tier: "free" or "premium". Controls whether registered users need premium token quota.
 # - anonymous_enabled: true/false. Whether the model appears in the public no-login catalog.
diff --git a/surfsense_backend/app/services/llm_service.py b/surfsense_backend/app/services/llm_service.py
index ade202c72..fa97fb33a 100644
--- a/surfsense_backend/app/services/llm_service.py
+++ b/surfsense_backend/app/services/llm_service.py
@@ -659,3 +659,36 @@ async def get_user_long_context_llm(
     return await get_document_summary_llm(
         session, search_space_id, disable_streaming=disable_streaming
     )
+
+
+def get_planner_llm() -> ChatLiteLLM | None:
+    """Return a planner LLM instance from the first global config marked
+    ``is_planner: true``, or ``None`` if no planner config is defined.
+
+    The planner role handles short, structured internal tasks (KB search
+    planning: query rewriting, date extraction, recency classification).
+    These tasks are well-served by small/fast models (e.g. gpt-4o-mini,
+    Claude Haiku, Azure gpt-5.x-nano) — using the user's chat LLM for them
+    is unnecessarily expensive and slow.
+
+    This helper reads from ``config.GLOBAL_LLM_CONFIGS`` (loaded at import
+    time from ``global_llm_config.yaml``) so it has no DB cost and can be
+    called synchronously from middleware/factory code. It returns the same
+    instance shape as the global path of ``get_search_space_llm_instance``.
+
+    Callers MUST fall back to their chat LLM when this returns ``None`` so
+    deployments without a planner config keep working unchanged.
+    """
+    from app.agents.new_chat.llm_config import create_chat_litellm_from_config
+
+    planner_cfg = next(
+        (
+            cfg
+            for cfg in config.GLOBAL_LLM_CONFIGS
+            if cfg.get("is_planner") is True
+        ),
+        None,
+    )
+    if not planner_cfg:
+        return None
+    return create_chat_litellm_from_config(planner_cfg)

From db8bffab38bc1eb3b39e0c805d29ea960a7c3e08 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 11:58:15 +0200
Subject: [PATCH 31/34] perf(prompt-cache): enable Azure prompt_cache_key
 routing hint

Splits the OpenAI-family gate into per-param predicates so AZURE and
AZURE_OPENAI configs now receive prompt_cache_key for backend routing
affinity (Microsoft auto-caches GPT-4o+ deployments at >=1024 tokens;
the key clusters same-prefix requests on the same GPU pool and raises
hit rate on turn 2+). prompt_cache_retention is still not sent to Azure
because litellm 1.83.14's Azure transformer would drop it silently;
revisit when LiteLLM's Azure supported-params list includes it.
---
 .../app/agents/new_chat/prompt_caching.py     | 101 +++++++++++++-----
 .../agents/new_chat/test_prompt_caching.py    |  63 +++++++++--
 2 files changed, 129 insertions(+), 35 deletions(-)

diff --git a/surfsense_backend/app/agents/new_chat/prompt_caching.py b/surfsense_backend/app/agents/new_chat/prompt_caching.py
index 9fe47cdac..b58a48266 100644
--- a/surfsense_backend/app/agents/new_chat/prompt_caching.py
+++ b/surfsense_backend/app/agents/new_chat/prompt_caching.py
@@ -39,9 +39,19 @@ For OpenAI-family configs we additionally pass:
 
 - ``prompt_cache_key=f"surfsense-thread-{thread_id}"`` — routing hint that
   raises hit rate by sending requests with a shared prefix to the same
-  backend.
+  backend. Supported by ``openai/``, ``deepseek/``, ``xai/``, and
+  ``azure/`` (added to LiteLLM's Azure transformer in
+  https://github.com/BerriAI/litellm/pull/20989, Feb 2026; verified
+  against ``AzureOpenAIConfig.get_supported_openai_params`` in our
+  installed litellm 1.83.14 for ``azure/gpt-4o``, ``azure/gpt-4o-mini``,
+  ``azure/gpt-5.4``, ``azure/gpt-5.4-mini``).
 - ``prompt_cache_retention="24h"`` — extends cache TTL beyond the default
-  5-10 min in-memory cache.
+  5-10 min in-memory cache. Set ONLY for OpenAI/DeepSeek/xAI: Azure's
+  server-side support landed in Microsoft's docs on 2026-05-13 but
+  LiteLLM 1.83.14's Azure transformer still omits it from its supported
+  params list, so it gets silently dropped by ``litellm.drop_params``.
+  Azure's default in-memory retention (5-10 min, max 1 h) already
+  bridges intra-conversation turns; revisit when LiteLLM bumps Azure.
 
 Safety net: ``litellm.drop_params=True`` is set globally in
 ``app.services.llm_service`` at module-load time. Any kwarg the destination
@@ -81,13 +91,31 @@ _DEFAULT_INJECTION_POINTS: tuple[dict[str, Any], ...] = (
     {"location": "message", "index": -1},
 )
 
-# Providers (uppercase ``AgentConfig.provider`` values) that natively expose
-# OpenAI-style automatic prompt caching with ``prompt_cache_key`` and
-# ``prompt_cache_retention`` kwargs. Strict whitelist — many other providers
-# in ``PROVIDER_MAP`` route through litellm's ``openai`` prefix without
-# implementing the OpenAI prompt-cache surface (e.g. MOONSHOT, ZHIPU,
-# MINIMAX), so we can't infer family from the litellm prefix alone.
-_OPENAI_FAMILY_PROVIDERS: frozenset[str] = frozenset({"OPENAI", "DEEPSEEK", "XAI"})
+# Providers (uppercase ``AgentConfig.provider`` values) that accept the
+# OpenAI ``prompt_cache_key`` routing hint. Microsoft's Azure OpenAI docs
+# (2026-05-13) confirm automatic prompt caching applies to every GPT-4o
+# or newer Azure deployment at ≥1024 tokens with no configuration needed,
+# and that ``prompt_cache_key`` is combined with the prefix hash to
+# improve routing affinity and therefore cache hit rate. LiteLLM's Azure
+# transformer ships ``prompt_cache_key`` in its supported params as of
+# https://github.com/BerriAI/litellm/pull/20989.
+#
+# Strict whitelist — many other providers in ``PROVIDER_MAP`` route
+# through litellm's ``openai`` prefix without implementing the OpenAI
+# prompt-cache surface (e.g. MOONSHOT, ZHIPU, MINIMAX), so we can't infer
+# family from the litellm prefix alone.
+_PROMPT_CACHE_KEY_PROVIDERS: frozenset[str] = frozenset(
+    {"OPENAI", "DEEPSEEK", "XAI", "AZURE", "AZURE_OPENAI"}
+)
+
+# Subset of ``_PROMPT_CACHE_KEY_PROVIDERS`` that also accept
+# ``prompt_cache_retention="24h"``. Azure is excluded: see module
+# docstring — LiteLLM 1.83.14's Azure transformer omits the param so
+# ``drop_params`` silently strips it. Re-add Azure once a future LiteLLM
+# release wires it into ``AzureOpenAIConfig.get_supported_openai_params``.
+_PROMPT_CACHE_RETENTION_PROVIDERS: frozenset[str] = frozenset(
+    {"OPENAI", "DEEPSEEK", "XAI"}
+)
 
 
 def _is_router_llm(llm: BaseChatModel) -> bool:
@@ -101,13 +129,13 @@ def _is_router_llm(llm: BaseChatModel) -> bool:
     return type(llm).__name__ == "ChatLiteLLMRouter"
 
 
-def _is_openai_family_config(agent_config: AgentConfig | None) -> bool:
-    """Whether the config targets an OpenAI-style prompt-cache surface.
+def _provider_supports_prompt_cache_key(agent_config: AgentConfig | None) -> bool:
+    """Whether the config targets a provider that accepts ``prompt_cache_key``.
 
-    Strict — only returns True when the user explicitly chose OPENAI,
-    DEEPSEEK, or XAI as the provider in their ``NewLLMConfig`` /
-    ``YAMLConfig``. Auto-mode and custom providers return False because
-    we can't statically know the destination.
+    Strict — only returns True for explicitly chosen OPENAI, DEEPSEEK,
+    XAI, AZURE, or AZURE_OPENAI providers. Auto-mode and custom
+    providers return False because we can't statically know the
+    destination and the router fans out across mixed providers.
     """
     if agent_config is None or not agent_config.provider:
         return False
@@ -115,7 +143,25 @@ def _is_openai_family_config(agent_config: AgentConfig | None) -> bool:
         return False
     if agent_config.custom_provider:
         return False
-    return agent_config.provider.upper() in _OPENAI_FAMILY_PROVIDERS
+    return agent_config.provider.upper() in _PROMPT_CACHE_KEY_PROVIDERS
+
+
+def _provider_supports_prompt_cache_retention(
+    agent_config: AgentConfig | None,
+) -> bool:
+    """Whether the config targets a provider that accepts ``prompt_cache_retention``.
+
+    Tighter than :func:`_provider_supports_prompt_cache_key` — Azure
+    deployments are excluded until LiteLLM ships the param in its Azure
+    transformer (see module docstring).
+    """
+    if agent_config is None or not agent_config.provider:
+        return False
+    if agent_config.is_auto_mode:
+        return False
+    if agent_config.custom_provider:
+        return False
+    return agent_config.provider.upper() in _PROMPT_CACHE_RETENTION_PROVIDERS
 
 
 def _get_or_init_model_kwargs(llm: BaseChatModel) -> dict[str, Any] | None:
@@ -173,16 +219,23 @@ def apply_litellm_prompt_caching(
             dict(point) for point in _DEFAULT_INJECTION_POINTS
         ]
 
-    # OpenAI-family extras only when we statically know the destination is
-    # OpenAI / DeepSeek / xAI. Auto-mode router fans out across providers
-    # so we can't safely set OpenAI-only kwargs there (drop_params would
-    # strip them but it's wasteful to set them in the first place).
+    # OpenAI-style extras only when we statically know the destination
+    # accepts them. Auto-mode router fans out across mixed providers so
+    # we can't safely set destination-specific kwargs there (drop_params
+    # would strip them but it's wasteful to set them in the first
+    # place).
     if _is_router_llm(llm):
         return
-    if not _is_openai_family_config(agent_config):
-        return
 
-    if thread_id is not None and "prompt_cache_key" not in model_kwargs:
+    if (
+        thread_id is not None
+        and "prompt_cache_key" not in model_kwargs
+        and _provider_supports_prompt_cache_key(agent_config)
+    ):
         model_kwargs["prompt_cache_key"] = f"surfsense-thread-{thread_id}"
-    if "prompt_cache_retention" not in model_kwargs:
+
+    if (
+        "prompt_cache_retention" not in model_kwargs
+        and _provider_supports_prompt_cache_retention(agent_config)
+    ):
         model_kwargs["prompt_cache_retention"] = "24h"
diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_prompt_caching.py b/surfsense_backend/tests/unit/agents/new_chat/test_prompt_caching.py
index 4cf53969d..c3de15c58 100644
--- a/surfsense_backend/tests/unit/agents/new_chat/test_prompt_caching.py
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_prompt_caching.py
@@ -12,13 +12,19 @@ prompt caching. It mutates ``llm.model_kwargs`` so the kwargs flow to
    the deepagent stack accumulates multiple ``SystemMessage``\ s in
    ``state["messages"]`` and ``role: system`` would tag every one of
    them, blowing past Anthropic's 4-block ``cache_control`` cap.
-2. Adds ``prompt_cache_key``/``prompt_cache_retention`` only for
-   single-model OPENAI/DEEPSEEK/XAI configs (where OpenAI's automatic
-   prompt-cache surface is available).
-3. Treats ``ChatLiteLLMRouter`` (auto-mode) as universal-only — no
-   OpenAI-only kwargs because the router fans out across providers.
-4. Idempotent: user-supplied values in ``model_kwargs`` are preserved.
-5. Defensive: LLMs without a writable ``model_kwargs`` are silently
+2. Adds ``prompt_cache_key`` for OPENAI/DEEPSEEK/XAI/AZURE/AZURE_OPENAI
+   configs (LiteLLM's Azure transformer gained ``prompt_cache_key`` in
+   https://github.com/BerriAI/litellm/pull/20989, Feb 2026).
+3. Adds ``prompt_cache_retention="24h"`` ONLY for OPENAI/DEEPSEEK/XAI.
+   Azure's server-side support landed in Microsoft's docs on 2026-05-13
+   but LiteLLM 1.83.14 hasn't wired it through yet, so we let Azure use
+   its default in-memory retention rather than send a param that
+   ``litellm.drop_params`` would silently strip.
+4. Treats ``ChatLiteLLMRouter`` (auto-mode) as universal-only — no
+   destination-specific kwargs because the router fans out across
+   providers.
+5. Idempotent: user-supplied values in ``model_kwargs`` are preserved.
+6. Defensive: LLMs without a writable ``model_kwargs`` are silently
    skipped rather than raising.
 """
 
@@ -191,9 +197,9 @@ def test_does_not_overwrite_user_supplied_prompt_cache_key() -> None:
 
 @pytest.mark.parametrize("provider", ["OPENAI", "DEEPSEEK", "XAI"])
 def test_sets_openai_family_extras(provider: str) -> None:
-    """OpenAI-style providers gain ``prompt_cache_key`` (raises hit rate
-    via routing affinity) and ``prompt_cache_retention="24h"`` (extends
-    cache TTL beyond the default 5-10 min)."""
+    """Native OpenAI-style providers gain ``prompt_cache_key`` (raises
+    hit rate via routing affinity) and ``prompt_cache_retention="24h"``
+    (extends cache TTL beyond the default 5-10 min)."""
     cfg = _make_cfg(provider=provider)
     llm = _FakeLLM()
 
@@ -203,6 +209,27 @@ def test_sets_openai_family_extras(provider: str) -> None:
     assert llm.model_kwargs["prompt_cache_retention"] == "24h"
 
 
+@pytest.mark.parametrize("provider", ["AZURE", "AZURE_OPENAI"])
+def test_azure_gets_prompt_cache_key_only(provider: str) -> None:
+    """Azure configs gain ``prompt_cache_key`` for routing affinity
+    (Microsoft auto-caches every GPT-4o+ deployment at ≥1024 tokens;
+    the key clusters same-prefix requests on the same backend GPU pool
+    so hit rate climbs). They DO NOT get ``prompt_cache_retention``
+    because LiteLLM 1.83.14's Azure transformer omits it from its
+    supported params list — ``drop_params`` would silently strip it.
+    Azure's default in-memory retention (5-10 min, max 1 h) is already
+    enough to cover intra-conversation turns; revisit when LiteLLM
+    bumps Azure to match its OpenAI surface."""
+    cfg = _make_cfg(provider=provider, model_name="gpt-5.4")
+    llm = _FakeLLM(model="azure/gpt-5.4")
+
+    apply_litellm_prompt_caching(llm, agent_config=cfg, thread_id=42)
+
+    assert llm.model_kwargs["prompt_cache_key"] == "surfsense-thread-42"
+    assert "prompt_cache_retention" not in llm.model_kwargs
+    assert "cache_control_injection_points" in llm.model_kwargs
+
+
 def test_skips_prompt_cache_key_when_no_thread_id() -> None:
     """Without a thread id we can't construct a per-thread key. Retention
     is still useful so we set it (it's free)."""
@@ -215,12 +242,26 @@ def test_skips_prompt_cache_key_when_no_thread_id() -> None:
     assert llm.model_kwargs["prompt_cache_retention"] == "24h"
 
 
+def test_azure_skips_prompt_cache_key_when_no_thread_id() -> None:
+    """Azure without a thread id ends up with no extras (retention is
+    Azure-skipped, key needs a thread id) — universal injection points
+    still land."""
+    cfg = _make_cfg(provider="AZURE", model_name="gpt-5.4")
+    llm = _FakeLLM(model="azure/gpt-5.4")
+
+    apply_litellm_prompt_caching(llm, agent_config=cfg, thread_id=None)
+
+    assert "prompt_cache_key" not in llm.model_kwargs
+    assert "prompt_cache_retention" not in llm.model_kwargs
+    assert "cache_control_injection_points" in llm.model_kwargs
+
+
 @pytest.mark.parametrize(
     "provider",
     ["ANTHROPIC", "BEDROCK", "VERTEX_AI", "GOOGLE_AI_STUDIO", "GROQ", "MOONSHOT"],
 )
 def test_no_openai_extras_for_other_providers(provider: str) -> None:
-    """Non-OpenAI-family providers don't expose ``prompt_cache_key`` —
+    """Non-OpenAI-style providers don't expose ``prompt_cache_key`` —
     skip it. ``cache_control_injection_points`` is still set (universal)."""
     cfg = _make_cfg(provider=provider)
     llm = _FakeLLM()

From c0aa4261ac0f8b9444e26ff9888a7c1e22815903 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 16:11:07 +0200
Subject: [PATCH 32/34] perf(mcp): persist list_tools discovery in
 connector.config.cached_tools

Skip the ~1-3s MCP initialize + list_tools handshake on every cache miss
by reading tool definitions from the connector row we already load. Lazy
populate on first miss, self-heal on corrupt cache, zero schema migration.
---
 .../app/agents/new_chat/tools/mcp_tool.py     | 122 ++++++++++------
 .../agents/new_chat/tools/mcp_tools_cache.py  |  94 +++++++++++++
 .../new_chat/tools/test_mcp_tools_cache.py    | 130 ++++++++++++++++++
 3 files changed, 304 insertions(+), 42 deletions(-)
 create mode 100644 surfsense_backend/app/agents/new_chat/tools/mcp_tools_cache.py
 create mode 100644 surfsense_backend/tests/unit/agents/new_chat/tools/test_mcp_tools_cache.py

diff --git a/surfsense_backend/app/agents/new_chat/tools/mcp_tool.py b/surfsense_backend/app/agents/new_chat/tools/mcp_tool.py
index b3c26f331..3d4679fb8 100644
--- a/surfsense_backend/app/agents/new_chat/tools/mcp_tool.py
+++ b/surfsense_backend/app/agents/new_chat/tools/mcp_tool.py
@@ -36,6 +36,11 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from app.agents.new_chat.middleware.dedup_tool_calls import dedup_key_full_args
 from app.agents.new_chat.tools.hitl import request_approval
 from app.agents.new_chat.tools.mcp_client import MCPClient
+from app.agents.new_chat.tools.mcp_tools_cache import (
+    CachedMCPTools,
+    read_cached_tools,
+    write_cached_tools,
+)
 from app.db import SearchSourceConnector
 from app.services.mcp_oauth.registry import MCP_SERVICES, get_service_by_connector_type
 from app.utils.perf import get_perf_logger
@@ -516,6 +521,7 @@ async def _load_http_mcp_tools(
     is_generic_mcp: bool = False,
     *,
     bypass_internal_hitl: bool = False,
+    cached_tools: CachedMCPTools | None = None,
 ) -> list[StructuredTool]:
     """Load tools from an HTTP-based MCP server.
 
@@ -526,6 +532,8 @@ async def _load_http_mcp_tools(
         readonly_tools: Tool names that skip HITL approval (read-only operations).
         tool_name_prefix: If set, each tool name is prefixed for multi-account
             disambiguation (e.g. ``linear_25``).
+        cached_tools: If provided, skip live discovery and rebuild wrappers
+            from the persisted definitions.
     """
     tools: list[StructuredTool] = []
 
@@ -549,15 +557,23 @@ async def _load_http_mcp_tools(
 
     allowed_set = set(allowed_tools) if allowed_tools else None
 
-    async def _discover(disc_headers: dict[str, str]) -> list[dict[str, Any]]:
-        """Connect, initialize, and list tools from the MCP server."""
+    async def _discover(
+        disc_headers: dict[str, str],
+    ) -> tuple[dict[str, str | None], list[dict[str, Any]]]:
+        """Connect, initialize, and list tools — returns (serverInfo, tools)."""
         async with (
             streamablehttp_client(url, headers=disc_headers) as (read, write, _),
             ClientSession(read, write) as session,
         ):
-            await session.initialize()
+            init_result = await session.initialize()
+            server_info: dict[str, str | None] = {"name": None, "version": None}
+            si = getattr(init_result, "serverInfo", None)
+            if si is not None:
+                server_info["name"] = getattr(si, "name", None)
+                server_info["version"] = getattr(si, "version", None)
+
             response = await session.list_tools()
-            return [
+            return server_info, [
                 {
                     "name": tool.name,
                     "description": tool.description or "",
@@ -568,47 +584,65 @@ async def _load_http_mcp_tools(
                 for tool in response.tools
             ]
 
-    try:
-        tool_definitions = await _discover(headers)
-    except Exception as first_err:
-        if not _is_auth_error(first_err) or connector_id is None:
-            logger.exception(
-                "Failed to connect to HTTP MCP server at '%s' (connector %d): %s",
-                url,
-                connector_id,
-                first_err,
-            )
-            return tools
-
-        logger.warning(
-            "HTTP MCP discovery for connector %d got 401 — attempting token refresh",
-            connector_id,
-        )
-        fresh_headers = await _force_refresh_and_get_headers(connector_id)
-        if fresh_headers is None:
-            await _mark_connector_auth_expired(connector_id)
-            logger.error(
-                "HTTP MCP discovery for connector %d: token refresh failed, marking auth_expired",
-                connector_id,
-            )
-            return tools
-
+    if cached_tools is not None:
+        tool_definitions = [
+            {
+                "name": td.name,
+                "description": td.description,
+                "input_schema": td.input_schema,
+            }
+            for td in cached_tools.tools
+        ]
+    else:
         try:
-            tool_definitions = await _discover(fresh_headers)
-            headers = fresh_headers
-            logger.info(
-                "HTTP MCP discovery for connector %d succeeded after 401 recovery",
+            server_info, tool_definitions = await _discover(headers)
+        except Exception as first_err:
+            if not _is_auth_error(first_err) or connector_id is None:
+                logger.exception(
+                    "Failed to connect to HTTP MCP server at '%s' (connector %d): %s",
+                    url,
+                    connector_id,
+                    first_err,
+                )
+                return tools
+
+            logger.warning(
+                "HTTP MCP discovery for connector %d got 401 — attempting token refresh",
                 connector_id,
             )
-        except Exception as retry_err:
-            logger.exception(
-                "HTTP MCP discovery for connector %d still failing after refresh: %s",
-                connector_id,
-                retry_err,
-            )
-            if _is_auth_error(retry_err):
+            fresh_headers = await _force_refresh_and_get_headers(connector_id)
+            if fresh_headers is None:
                 await _mark_connector_auth_expired(connector_id)
-            return tools
+                logger.error(
+                    "HTTP MCP discovery for connector %d: token refresh failed, marking auth_expired",
+                    connector_id,
+                )
+                return tools
+
+            try:
+                server_info, tool_definitions = await _discover(fresh_headers)
+                headers = fresh_headers
+                logger.info(
+                    "HTTP MCP discovery for connector %d succeeded after 401 recovery",
+                    connector_id,
+                )
+            except Exception as retry_err:
+                logger.exception(
+                    "HTTP MCP discovery for connector %d still failing after refresh: %s",
+                    connector_id,
+                    retry_err,
+                )
+                if _is_auth_error(retry_err):
+                    await _mark_connector_auth_expired(connector_id)
+                return tools
+
+        await write_cached_tools(
+            connector_id,
+            tool_definitions,
+            server_name=server_info.get("name"),
+            server_version=server_info.get("version"),
+            transport=server_config.get("transport", "streamable-http"),
+        )
 
     total_discovered = len(tool_definitions)
 
@@ -1099,6 +1133,7 @@ async def load_mcp_tools(
                         "tool_name_prefix": tool_name_prefix,
                         "transport": server_config.get("transport", "stdio"),
                         "is_generic_mcp": svc_cfg is None,
+                        "cached_tools": read_cached_tools(connector),
                     }
                 )
 
@@ -1112,6 +1147,7 @@ async def load_mcp_tools(
         async def _discover_one(task: dict[str, Any]) -> list[StructuredTool]:
             discover_start = time.perf_counter()
             transport = task["transport"]
+            cached_tools = task.get("cached_tools")
             try:
                 if transport in ("streamable-http", "http", "sse"):
                     result = await asyncio.wait_for(
@@ -1125,6 +1161,7 @@ async def load_mcp_tools(
                             tool_name_prefix=task["tool_name_prefix"],
                             is_generic_mcp=task.get("is_generic_mcp", False),
                             bypass_internal_hitl=bypass_internal_hitl,
+                            cached_tools=cached_tools,
                         ),
                         timeout=_MCP_DISCOVERY_TIMEOUT_SECONDS,
                     )
@@ -1140,12 +1177,13 @@ async def load_mcp_tools(
                         timeout=_MCP_DISCOVERY_TIMEOUT_SECONDS,
                     )
                 _perf_log.info(
-                    "[mcp_discover] connector=%s name=%r transport=%s tools=%d elapsed=%.3fs",
+                    "[mcp_discover] connector=%s name=%r transport=%s tools=%d elapsed=%.3fs cache=%s",
                     task["connector_id"],
                     task["connector_name"],
                     transport,
                     len(result),
                     time.perf_counter() - discover_start,
+                    "hit" if cached_tools is not None else "miss",
                 )
                 return result
             except TimeoutError:
diff --git a/surfsense_backend/app/agents/new_chat/tools/mcp_tools_cache.py b/surfsense_backend/app/agents/new_chat/tools/mcp_tools_cache.py
new file mode 100644
index 000000000..3c79ed1d3
--- /dev/null
+++ b/surfsense_backend/app/agents/new_chat/tools/mcp_tools_cache.py
@@ -0,0 +1,94 @@
+"""Persist MCP ``list_tools`` results in ``SearchSourceConnector.config.cached_tools``."""
+
+from __future__ import annotations
+
+import logging
+from datetime import UTC, datetime
+from typing import Any
+
+from pydantic import BaseModel, Field, ValidationError
+from sqlalchemy import select
+from sqlalchemy.orm.attributes import flag_modified
+
+from app.db import SearchSourceConnector, async_session_maker
+
+logger = logging.getLogger(__name__)
+
+
+class CachedMCPToolDef(BaseModel):
+    name: str
+    description: str = ""
+    input_schema: dict[str, Any] = Field(default_factory=dict)
+
+
+class CachedMCPTools(BaseModel):
+    discovered_at: datetime
+    server_version: str | None = None
+    server_name: str | None = None
+    transport: str | None = None
+    tools: list[CachedMCPToolDef]
+
+
+def read_cached_tools(connector: SearchSourceConnector) -> CachedMCPTools | None:
+    """Return parsed cached tools or ``None`` if missing / corrupt (caller falls back to live discovery)."""
+    cfg = connector.config or {}
+    raw = cfg.get("cached_tools")
+    if not raw or not isinstance(raw, dict):
+        return None
+
+    try:
+        return CachedMCPTools.model_validate(raw)
+    except ValidationError as exc:
+        logger.warning(
+            "MCP connector %d has corrupt cached_tools — falling back to live discovery: %s",
+            connector.id,
+            exc,
+        )
+        return None
+
+
+async def write_cached_tools(
+    connector_id: int,
+    tool_definitions: list[dict[str, Any]],
+    *,
+    server_name: str | None = None,
+    server_version: str | None = None,
+    transport: str | None = None,
+) -> None:
+    """Best-effort persist; uses its own session so a write failure cannot poison the caller's transaction."""
+    try:
+        # Validated inside the try so a malformed definition is logged, not raised.
+        payload = CachedMCPTools(
+            discovered_at=datetime.now(UTC),
+            server_version=server_version,
+            server_name=server_name,
+            transport=transport,
+            tools=[CachedMCPToolDef.model_validate(td) for td in tool_definitions],
+        )
+        async with async_session_maker() as session:
+            result = await session.execute(
+                select(SearchSourceConnector).filter(
+                    SearchSourceConnector.id == connector_id,
+                )
+            )
+            connector = result.scalars().first()
+            if connector is None:
+                return
+            # Reassign a copy and flag the column so the JSON change is persisted.
+            cfg = dict(connector.config or {})
+            cfg["cached_tools"] = payload.model_dump(mode="json")
+            connector.config = cfg
+            flag_modified(connector, "config")
+            await session.commit()
+
+            logger.info(
+                "Persisted cached_tools for MCP connector %d (%d tools)",
+                connector_id,
+                len(payload.tools),
+            )
+    except Exception:
+        logger.warning(
+            "Failed to persist cached_tools for MCP connector %d",
+            connector_id,
+            exc_info=True,
+        )
diff --git a/surfsense_backend/tests/unit/agents/new_chat/tools/test_mcp_tools_cache.py b/surfsense_backend/tests/unit/agents/new_chat/tools/test_mcp_tools_cache.py
new file mode 100644
index 000000000..bae97ba9f
--- /dev/null
+++ b/surfsense_backend/tests/unit/agents/new_chat/tools/test_mcp_tools_cache.py
@@ -0,0 +1,130 @@
+"""Unit tests for ``mcp_tools_cache``."""
+
+from __future__ import annotations
+
+from datetime import UTC, datetime
+from types import SimpleNamespace
+
+import pytest
+
+from app.agents.new_chat.tools.mcp_tools_cache import (
+    CachedMCPToolDef,
+    CachedMCPTools,
+    read_cached_tools,
+)
+
+pytestmark = pytest.mark.unit
+
+
+def _make_connector(config: dict | None) -> SimpleNamespace:
+    return SimpleNamespace(id=42, config=config)
+
+
+def test_read_returns_none_when_config_is_none() -> None:
+    assert read_cached_tools(_make_connector(None)) is None
+
+
+def test_read_returns_none_when_cached_tools_missing() -> None:
+    assert read_cached_tools(_make_connector({"server_config": {}})) is None
+
+
+def test_read_returns_none_when_cached_tools_is_not_a_dict() -> None:
+    assert read_cached_tools(_make_connector({"cached_tools": []})) is None
+    assert read_cached_tools(_make_connector({"cached_tools": "stale"})) is None
+
+
+def test_read_parses_minimal_valid_payload() -> None:
+    parsed = read_cached_tools(
+        _make_connector(
+            {
+                "cached_tools": {
+                    "discovered_at": "2026-05-20T10:00:00+00:00",
+                    "tools": [
+                        {
+                            "name": "list_issues",
+                            "description": "List Linear issues",
+                            "input_schema": {"type": "object"},
+                        }
+                    ],
+                }
+            }
+        )
+    )
+    assert parsed is not None
+    assert parsed.server_version is None
+    assert parsed.server_name is None
+    assert parsed.transport is None
+    assert len(parsed.tools) == 1
+    assert parsed.tools[0].name == "list_issues"
+
+
+def test_read_parses_full_payload_with_serverinfo() -> None:
+    parsed = read_cached_tools(
+        _make_connector(
+            {
+                "cached_tools": {
+                    "discovered_at": "2026-05-20T10:00:00+00:00",
+                    "server_version": "1.2.3",
+                    "server_name": "atlassian-mcp",
+                    "transport": "streamable-http",
+                    "tools": [
+                        {"name": "create_issue", "input_schema": {}},
+                        {"name": "list_issues", "input_schema": {}},
+                    ],
+                }
+            }
+        )
+    )
+    assert parsed is not None
+    assert parsed.server_version == "1.2.3"
+    assert parsed.server_name == "atlassian-mcp"
+    assert parsed.transport == "streamable-http"
+    assert [t.name for t in parsed.tools] == ["create_issue", "list_issues"]
+
+
+def test_read_returns_none_for_corrupt_payload(caplog) -> None:
+    parsed = read_cached_tools(
+        _make_connector(
+            {
+                "cached_tools": {
+                    "discovered_at": "not-a-date",
+                    "tools": "should-be-a-list",
+                }
+            }
+        )
+    )
+    assert parsed is None
+    assert any("corrupt cached_tools" in r.getMessage() for r in caplog.records)
+
+
+def test_read_returns_none_when_tools_missing() -> None:
+    parsed = read_cached_tools(
+        _make_connector(
+            {"cached_tools": {"discovered_at": "2026-05-20T10:00:00+00:00"}}
+        )
+    )
+    assert parsed is None
+
+
+def test_tool_def_defaults_description_and_schema() -> None:
+    td = CachedMCPToolDef.model_validate({"name": "ping"})
+    assert td.description == ""
+    assert td.input_schema == {}
+
+
+def test_model_dump_json_mode_is_round_trippable() -> None:
+    original = CachedMCPTools(
+        discovered_at=datetime(2026, 5, 20, 10, 0, 0, tzinfo=UTC),
+        server_version="1.2.3",
+        server_name="atlassian-mcp",
+        transport="streamable-http",
+        tools=[CachedMCPToolDef(name="list_issues")],
+    )
+    payload = original.model_dump(mode="json")
+
+    assert payload["discovered_at"] == "2026-05-20T10:00:00Z"
+    assert payload["tools"][0]["name"] == "list_issues"
+
+    reparsed = CachedMCPTools.model_validate(payload)
+    assert reparsed.discovered_at == original.discovered_at
+    assert reparsed.tools[0].name == "list_issues"

From 704d1bf18f507025220c269d412fc0da72151f2e Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 17:43:27 +0200
Subject: [PATCH 33/34] refactor(mcp): per-connector cache refresh on lifecycle
 events

Collapse the invalidate + warmup pair into a single
refresh_mcp_tools_cache_for_connector(connector_id, search_space_id)
helper and scope live discovery to the one connector that changed
instead of the whole search space.

- new mcp_tool.discover_single_mcp_connector: load one connector,
  refresh OAuth if needed, force live MCP discovery so its cached_tools
  row is rewritten; returned wrappers are discarded since the in-process
  LRU is rebuilt lazily on the next user query
- mcp_tools_cache.refresh_mcp_tools_cache_for_connector: synchronously
  evicts the per-space LRU (LRU keys cannot scope finer) and schedules
  the per-connector prefetch via loop.create_task
- routes (OAuth callback, MCP POST, MCP PUT) collapse their two
  back-to-back calls into a single refresh call; DELETE handlers keep
  using bare invalidate_mcp_tools_cache (nothing to prefetch)

No new automated tests: the new functions are I/O glue (DB + network)
where mocked unit tests would test implementation rather than behavior.
The existing 9 unit tests for the cached_tools data shape are unchanged.
---
 .../app/agents/new_chat/tools/mcp_tool.py     | 88 +++++++++++++++++++
 .../agents/new_chat/tools/mcp_tools_cache.py  | 51 +++++++++++
 .../app/routes/mcp_oauth_route.py             | 21 +++--
 .../routes/search_source_connectors_routes.py | 12 ++-
 4 files changed, 161 insertions(+), 11 deletions(-)

diff --git a/surfsense_backend/app/agents/new_chat/tools/mcp_tool.py b/surfsense_backend/app/agents/new_chat/tools/mcp_tool.py
index 3d4679fb8..6c4cfb6be 100644
--- a/surfsense_backend/app/agents/new_chat/tools/mcp_tool.py
+++ b/surfsense_backend/app/agents/new_chat/tools/mcp_tool.py
@@ -1007,6 +1007,94 @@ def invalidate_mcp_tools_cache(search_space_id: int | None = None) -> None:
         _mcp_tools_cache.clear()
 
 
+async def discover_single_mcp_connector(connector_id: int) -> None:
+    """Force live MCP discovery for one connector so its ``cached_tools`` row is fresh.
+
+    ``_load_http_mcp_tools`` persists ``cached_tools`` as a side effect of any
+    live discovery; passing ``cached_tools=None`` here guarantees we go to the
+    network. The returned wrappers are discarded — the in-process LRU is
+    rebuilt lazily on the next user query. Stdio connectors are not cached and
+    are skipped.
+    """
+    from app.db import async_session_maker
+
+    started = time.perf_counter()
+    try:
+        async with async_session_maker() as session:
+            connector = await session.get(SearchSourceConnector, connector_id)
+            if connector is None:
+                logger.info(
+                    "discover_single_mcp_connector: connector %d not found",
+                    connector_id,
+                )
+                return
+
+            cfg = connector.config or {}
+            server_config = cfg.get("server_config", {})
+            if not server_config or not isinstance(server_config, dict):
+                return
+
+            transport = server_config.get("transport", "stdio")
+            if transport not in ("streamable-http", "http", "sse"):
+                return
+
+            if cfg.get("mcp_oauth"):
+                server_config = await _maybe_refresh_mcp_oauth_token(
+                    session, connector, cfg, server_config
+                )
+                cfg = connector.config or {}
+                server_config = _inject_oauth_headers(cfg, server_config)
+                if server_config is None:
+                    logger.info(
+                        "discover_single_mcp_connector: OAuth token unavailable for connector %d",
+                        connector_id,
+                    )
+                    return
+
+            ct = (
+                connector.connector_type.value
+                if hasattr(connector.connector_type, "value")
+                else str(connector.connector_type)
+            )
+            svc_cfg = get_service_by_connector_type(ct)
+            allowed_tools = svc_cfg.allowed_tools if svc_cfg else []
+            readonly_tools = svc_cfg.readonly_tools if svc_cfg else frozenset()
+
+            await asyncio.wait_for(
+                _load_http_mcp_tools(
+                    connector.id,
+                    connector.name,
+                    server_config,
+                    trusted_tools=cfg.get("trusted_tools", []),
+                    allowed_tools=allowed_tools,
+                    readonly_tools=readonly_tools,
+                    tool_name_prefix=None,
+                    is_generic_mcp=svc_cfg is None,
+                    bypass_internal_hitl=True,
+                    cached_tools=None,
+                ),
+                timeout=_MCP_DISCOVERY_TIMEOUT_SECONDS,
+            )
+
+            _perf_log.info(
+                "[mcp_prefetch] connector=%s elapsed=%.3fs",
+                connector_id,
+                time.perf_counter() - started,
+            )
+    except TimeoutError:
+        logger.warning(
+            "discover_single_mcp_connector: connector %d timed out after %ds",
+            connector_id,
+            _MCP_DISCOVERY_TIMEOUT_SECONDS,
+        )
+    except Exception:
+        logger.warning(
+            "discover_single_mcp_connector: failed for connector %d",
+            connector_id,
+            exc_info=True,
+        )
+
+
 async def load_mcp_tools(
     session: AsyncSession,
     search_space_id: int,
diff --git a/surfsense_backend/app/agents/new_chat/tools/mcp_tools_cache.py b/surfsense_backend/app/agents/new_chat/tools/mcp_tools_cache.py
index 3c79ed1d3..81027e1c4 100644
--- a/surfsense_backend/app/agents/new_chat/tools/mcp_tools_cache.py
+++ b/surfsense_backend/app/agents/new_chat/tools/mcp_tools_cache.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import asyncio
 import logging
 from datetime import UTC, datetime
 from typing import Any
@@ -14,6 +15,8 @@ from app.db import SearchSourceConnector, async_session_maker
 
 logger = logging.getLogger(__name__)
 
+_pending_prefetch_tasks: set[asyncio.Task[None]] = set()
+
 
 class CachedMCPToolDef(BaseModel):
     name: str
@@ -92,3 +95,51 @@ async def write_cached_tools(
             connector_id,
             exc_info=True,
         )
+
+
+def refresh_mcp_tools_cache_for_connector(
+    connector_id: int,
+    search_space_id: int,
+) -> None:
+    """Maintain the MCP tool cache after a single-connector lifecycle event.
+
+    Synchronously evicts the in-process LRU for the connector's search space
+    (LRU keys are per-space, so eviction cannot be scoped finer), then schedules
+    a background live discovery for this connector alone so its persisted
+    ``cached_tools`` row is refreshed before the next user query.
+
+    Idempotent. Eviction is best-effort; prefetch is best-effort and only runs
+    when an event loop is available. Neither path raises.
+    """
+    try:
+        from app.agents.new_chat.tools.mcp_tool import invalidate_mcp_tools_cache
+
+        invalidate_mcp_tools_cache(search_space_id)
+    except Exception:
+        logger.debug(
+            "MCP in-process cache eviction skipped for space %d",
+            search_space_id,
+            exc_info=True,
+        )
+
+    try:
+        loop = asyncio.get_running_loop()
+    except RuntimeError:
+        return
+
+    task = loop.create_task(_run_connector_prefetch(connector_id))
+    _pending_prefetch_tasks.add(task)
+    task.add_done_callback(_pending_prefetch_tasks.discard)
+
+
+async def _run_connector_prefetch(connector_id: int) -> None:
+    from app.agents.new_chat.tools.mcp_tool import discover_single_mcp_connector
+
+    try:
+        await discover_single_mcp_connector(connector_id)
+    except Exception:
+        logger.warning(
+            "MCP background prefetch failed for connector_id=%d",
+            connector_id,
+            exc_info=True,
+        )
diff --git a/surfsense_backend/app/routes/mcp_oauth_route.py b/surfsense_backend/app/routes/mcp_oauth_route.py
index 1abc1f1ec..57248d631 100644
--- a/surfsense_backend/app/routes/mcp_oauth_route.py
+++ b/surfsense_backend/app/routes/mcp_oauth_route.py
@@ -428,7 +428,7 @@ async def mcp_oauth_callback(
             await session.commit()
             await session.refresh(db_connector)
 
-            _invalidate_cache(space_id)
+            _refresh_mcp_cache(db_connector.id, space_id)
 
             logger.info(
                 "Re-authenticated %s MCP connector %s for user %s",
@@ -481,7 +481,7 @@ async def mcp_oauth_callback(
                 detail="A connector for this service already exists.",
             ) from e
 
-        _invalidate_cache(space_id)
+        _refresh_mcp_cache(new_connector.id, space_id)
 
         logger.info(
             "Created %s MCP connector %s for user %s in space %s",
@@ -658,10 +658,17 @@ async def reauth_mcp_service(
 # ---------------------------------------------------------------------------
 
 
-def _invalidate_cache(space_id: int) -> None:
-    try:
-        from app.agents.new_chat.tools.mcp_tool import invalidate_mcp_tools_cache
+def _refresh_mcp_cache(connector_id: int, space_id: int) -> None:
+    """Evict the in-process MCP tool LRU and schedule background prefetch.
 
-        invalidate_mcp_tools_cache(space_id)
+    Wraps :func:`refresh_mcp_tools_cache_for_connector` so any failure is
+    isolated from the OAuth response flow.
+    """
+    try:
+        from app.agents.new_chat.tools.mcp_tools_cache import (
+            refresh_mcp_tools_cache_for_connector,
+        )
+
+        refresh_mcp_tools_cache_for_connector(connector_id, space_id)
     except Exception:
-        logger.debug("MCP cache invalidation skipped", exc_info=True)
+        logger.debug("MCP cache refresh skipped", exc_info=True)
diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py
index e9ffb7050..1338fe16b 100644
--- a/surfsense_backend/app/routes/search_source_connectors_routes.py
+++ b/surfsense_backend/app/routes/search_source_connectors_routes.py
@@ -2650,9 +2650,11 @@ async def create_mcp_connector(
             f"for user {user.id} in search space {search_space_id}"
         )
 
-        from app.agents.new_chat.tools.mcp_tool import invalidate_mcp_tools_cache
+        from app.agents.new_chat.tools.mcp_tools_cache import (
+            refresh_mcp_tools_cache_for_connector,
+        )
 
-        invalidate_mcp_tools_cache(search_space_id)
+        refresh_mcp_tools_cache_for_connector(db_connector.id, search_space_id)
 
         connector_read = SearchSourceConnectorRead.model_validate(db_connector)
         return MCPConnectorRead.from_connector(connector_read)
@@ -2828,9 +2830,11 @@ async def update_mcp_connector(
 
         logger.info(f"Updated MCP connector {connector_id}")
 
-        from app.agents.new_chat.tools.mcp_tool import invalidate_mcp_tools_cache
+        from app.agents.new_chat.tools.mcp_tools_cache import (
+            refresh_mcp_tools_cache_for_connector,
+        )
 
-        invalidate_mcp_tools_cache(connector.search_space_id)
+        refresh_mcp_tools_cache_for_connector(connector.id, connector.search_space_id)
 
         connector_read = SearchSourceConnectorRead.model_validate(connector)
         return MCPConnectorRead.from_connector(connector_read)

From 2be3f04df50e00c980c6bf388ec8d16cdf7f6aa8 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Wed, 20 May 2026 19:11:00 +0200
Subject: [PATCH 34/34] chore(scripts): drop one-off MCP session lifetime probe

The probe answered its question (informing the cached_tools persistence
design). Future MCP session-pooling work, if revived, can recreate it.
---
 .../scripts/probe_mcp_session_lifetime.py     | 563 ------------------
 1 file changed, 563 deletions(-)
 delete mode 100644 surfsense_backend/scripts/probe_mcp_session_lifetime.py

diff --git a/surfsense_backend/scripts/probe_mcp_session_lifetime.py b/surfsense_backend/scripts/probe_mcp_session_lifetime.py
deleted file mode 100644
index 66be5bc14..000000000
--- a/surfsense_backend/scripts/probe_mcp_session_lifetime.py
+++ /dev/null
@@ -1,563 +0,0 @@
-"""Probe MCP server session lifetime / staleness behavior — read-only.
-
-Goal
-----
-Empirically answer two questions for our actual third-party MCP servers
-(Atlassian, Linear, Slack, ClickUp, Airtable, ...):
-
-1. How expensive is the initial ``initialize`` handshake (``init=`` cost)?
-2. How long can a ``ClientSession`` sit idle and still survive a
-   subsequent ``list_tools()`` call?
-
-This script informs the design choice between
-
-* per-call sessions (current, ~1s init tax per call),
-* per-turn session reuse (LangChain-style, holds a session for the
-  duration of a chat turn),
-* a long-lived session pool (IBM-style, sessions reused across turns).
-
-The probe is read-only: it only ever calls ``session.list_tools()``,
-which is the safest MCP method. No tool calls against user data are
-performed.
-
-Usage
------
-Run from the repo root or from ``surfsense_backend/``::
-
-    uv run python -m scripts.probe_mcp_session_lifetime
-    uv run python -m scripts.probe_mcp_session_lifetime --quick
-    uv run python -m scripts.probe_mcp_session_lifetime --connectors 7,19,20
-    uv run python -m scripts.probe_mcp_session_lifetime --intervals 5,30,60,300
-
-Output
-------
-* Live progress to stderr (``[connector=7 t=+30s] OK 0.142s``).
-* Final per-connector table to stdout.
-* Raw results JSON to ``./mcp_session_probe_<timestamp>.json``.
-
-The default test reaches 1800s of idle (~30 min). Use ``--quick`` to
-stop at 60s for fast iteration. All connectors probe concurrently so
-total wall-clock time equals the longest interval, not the sum.
-"""
-
-from __future__ import annotations
-
-import argparse
-import asyncio
-import json
-import logging
-import os
-import sys
-import time
-from dataclasses import asdict, dataclass, field
-from datetime import datetime
-from typing import Any
-
-_HERE = os.path.dirname(os.path.abspath(__file__))
-_BACKEND_ROOT = os.path.dirname(_HERE)
-if _BACKEND_ROOT not in sys.path:
-    sys.path.insert(0, _BACKEND_ROOT)
-
-import httpx  # noqa: E402
-from mcp import ClientSession  # noqa: E402
-from mcp.client.streamable_http import streamable_http_client  # noqa: E402
-from sqlalchemy import cast, select  # noqa: E402
-from sqlalchemy.dialects.postgresql import JSONB  # noqa: E402
-
-from app.agents.new_chat.tools.mcp_tool import (  # noqa: E402
-    _inject_oauth_headers,
-    _maybe_refresh_mcp_oauth_token,
-)
-from app.db import SearchSourceConnector, async_session_maker  # noqa: E402
-
-logging.basicConfig(
-    level=logging.WARNING,
-    format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
-    stream=sys.stderr,
-)
-logging.getLogger("httpx").setLevel(logging.ERROR)
-logging.getLogger("mcp").setLevel(logging.ERROR)
-logger = logging.getLogger("mcp_probe")
-logger.setLevel(logging.INFO)
-
-
-DEFAULT_INTERVALS_SECONDS = [5, 30, 60, 300, 900, 1800]
-QUICK_INTERVALS_SECONDS = [5, 30, 60]
-PER_CALL_TIMEOUT_SECONDS = 60.0
-
-
-@dataclass
-class CheckpointResult:
-    """One ``list_tools()`` call against a long-lived session."""
-
-    idle_seconds_target: int
-    elapsed_since_open_seconds: float
-    elapsed_since_last_call_seconds: float
-    success: bool
-    latency_seconds: float | None
-    tools_returned: int | None
-    error_type: str | None
-    error_message: str | None
-
-
-@dataclass
-class ConnectorProbeResult:
-    """Per-connector aggregated probe outcome."""
-
-    connector_id: int
-    connector_name: str
-    connector_type: str
-    url: str
-    init_latency_seconds: float | None
-    first_call_latency_seconds: float | None
-    checkpoints: list[CheckpointResult] = field(default_factory=list)
-    fatal_error: str | None = None
-
-
-# ---------------------------------------------------------------------------
-# Connector loading + auth
-# ---------------------------------------------------------------------------
-
-
-async def _fetch_connectors(
-    connector_ids: list[int] | None,
-) -> list[SearchSourceConnector]:
-    """Pull every MCP-shaped connector (or only the requested IDs)."""
-    async with async_session_maker() as session:
-        stmt = select(SearchSourceConnector).filter(
-            cast(SearchSourceConnector.config, JSONB).has_key("server_config"),
-        )
-        if connector_ids:
-            stmt = stmt.filter(SearchSourceConnector.id.in_(connector_ids))
-        result = await session.execute(stmt)
-        connectors = list(result.scalars())
-
-    if connector_ids:
-        found_ids = {c.id for c in connectors}
-        missing = [cid for cid in connector_ids if cid not in found_ids]
-        if missing:
-            logger.warning("Requested connector IDs not found: %s", missing)
-    return connectors
-
-
-async def _resolve_authed_server_config(
-    connector: SearchSourceConnector,
-) -> dict[str, Any] | None:
-    """Refresh OAuth (if needed) and return a server_config with auth headers.
-
-    Returns ``None`` if the connector cannot be probed (missing url,
-    decrypt failure, no refresh token, etc.).
-    """
-    cfg = connector.config or {}
-    server_config = cfg.get("server_config", {})
-    if not isinstance(server_config, dict):
-        return None
-
-    if cfg.get("mcp_oauth"):
-        async with async_session_maker() as session:
-            attached = await session.get(SearchSourceConnector, connector.id)
-            if attached is None:
-                return None
-            refreshed = await _maybe_refresh_mcp_oauth_token(
-                session,
-                attached,
-                attached.config or {},
-                server_config,
-            )
-            attached_cfg = attached.config or {}
-        server_config = _inject_oauth_headers(attached_cfg, refreshed)
-        if server_config is None:
-            return None
-    return server_config
-
-
-# ---------------------------------------------------------------------------
-# The actual probe
-# ---------------------------------------------------------------------------
-
-
-def _classify_error(exc: BaseException) -> tuple[str, str]:
-    """Return ``(short_label, human_message)`` for a failed call."""
-    name = type(exc).__name__
-    msg = str(exc) or repr(exc)
-    if isinstance(exc, asyncio.TimeoutError):
-        return "timeout", f"call exceeded {PER_CALL_TIMEOUT_SECONDS}s"
-    if "404" in msg or "Not Found" in msg or "session" in msg.lower():
-        return "session_expired", msg
-    if "401" in msg or "Unauthorized" in msg:
-        return "auth_401", msg
-    if "ClosedResourceError" in name or "Closed" in name:
-        return "stream_closed", msg
-    if "Connection" in name or "ConnectError" in name:
-        return "connection_error", msg
-    return name, msg
-
-
-async def _probe_one_connector(
-    connector: SearchSourceConnector,
-    intervals: list[int],
-) -> ConnectorProbeResult:
-    """Open a single long-lived session, call ``list_tools`` at each interval."""
-    connector_type = (
-        connector.connector_type.value
-        if hasattr(connector.connector_type, "value")
-        else str(connector.connector_type)
-    )
-    server_config = await _resolve_authed_server_config(connector)
-    if server_config is None:
-        return ConnectorProbeResult(
-            connector_id=connector.id,
-            connector_name=connector.name,
-            connector_type=connector_type,
-            url="(unresolved)",
-            init_latency_seconds=None,
-            first_call_latency_seconds=None,
-            fatal_error="failed_to_resolve_server_config",
-        )
-
-    url = server_config.get("url")
-    headers = server_config.get("headers", {})
-    if not url:
-        return ConnectorProbeResult(
-            connector_id=connector.id,
-            connector_name=connector.name,
-            connector_type=connector_type,
-            url="(missing)",
-            init_latency_seconds=None,
-            first_call_latency_seconds=None,
-            fatal_error="missing_url",
-        )
-
-    transport = server_config.get("transport", "streamable-http")
-    if transport not in ("streamable-http", "http", "sse"):
-        return ConnectorProbeResult(
-            connector_id=connector.id,
-            connector_name=connector.name,
-            connector_type=connector_type,
-            url=url,
-            init_latency_seconds=None,
-            first_call_latency_seconds=None,
-            fatal_error=f"unsupported_transport:{transport}",
-        )
-
-    result = ConnectorProbeResult(
-        connector_id=connector.id,
-        connector_name=connector.name,
-        connector_type=connector_type,
-        url=url,
-        init_latency_seconds=None,
-        first_call_latency_seconds=None,
-    )
-
-    open_started = time.perf_counter()
-    last_call_at: float | None = None
-
-    # Manually drive the context-manager protocol so the session lives
-    # across our sleep intervals. ``streamable_http_client`` spawns a
-    # background task for the SSE receive loop; ``ClientSession`` spawns
-    # another for request multiplexing. We must close them in reverse order.
-    http_client = httpx.AsyncClient(headers=headers, timeout=PER_CALL_TIMEOUT_SECONDS)
-    transport_cm = None
-    session_cm = None
-    session = None
-    try:
-        transport_cm = streamable_http_client(url, http_client=http_client)
-        read, write, _ = await transport_cm.__aenter__()
-        session_cm = ClientSession(read, write)
-        session = await session_cm.__aenter__()
-
-        init_start = time.perf_counter()
-        await asyncio.wait_for(session.initialize(), timeout=PER_CALL_TIMEOUT_SECONDS)
-        result.init_latency_seconds = time.perf_counter() - init_start
-        logger.info(
-            "[connector=%s name=%r] init=%.3fs",
-            connector.id,
-            connector.name,
-            result.init_latency_seconds,
-        )
-
-        first_call_start = time.perf_counter()
-        first_response = await asyncio.wait_for(
-            session.list_tools(), timeout=PER_CALL_TIMEOUT_SECONDS
-        )
-        result.first_call_latency_seconds = time.perf_counter() - first_call_start
-        last_call_at = time.perf_counter()
-        logger.info(
-            "[connector=%s name=%r] first_call=%.3fs tools=%d",
-            connector.id,
-            connector.name,
-            result.first_call_latency_seconds,
-            len(first_response.tools),
-        )
-
-        for interval in intervals:
-            target_elapsed = open_started + (
-                result.init_latency_seconds + result.first_call_latency_seconds + interval
-            )
-            sleep_for = max(0.0, target_elapsed - time.perf_counter())
-            await asyncio.sleep(sleep_for)
-
-            call_start = time.perf_counter()
-            elapsed_since_open = call_start - open_started
-            elapsed_since_last = call_start - (last_call_at or call_start)
-            try:
-                response = await asyncio.wait_for(
-                    session.list_tools(), timeout=PER_CALL_TIMEOUT_SECONDS
-                )
-                latency = time.perf_counter() - call_start
-                last_call_at = time.perf_counter()
-                checkpoint = CheckpointResult(
-                    idle_seconds_target=interval,
-                    elapsed_since_open_seconds=round(elapsed_since_open, 3),
-                    elapsed_since_last_call_seconds=round(elapsed_since_last, 3),
-                    success=True,
-                    latency_seconds=round(latency, 3),
-                    tools_returned=len(response.tools),
-                    error_type=None,
-                    error_message=None,
-                )
-                logger.info(
-                    "[connector=%s t=+%ds] OK %.3fs (tools=%d)",
-                    connector.id,
-                    interval,
-                    latency,
-                    len(response.tools),
-                )
-                result.checkpoints.append(checkpoint)
-            except Exception as exc:  # noqa: BLE001
-                label, msg = _classify_error(exc)
-                latency_at_failure = time.perf_counter() - call_start
-                checkpoint = CheckpointResult(
-                    idle_seconds_target=interval,
-                    elapsed_since_open_seconds=round(elapsed_since_open, 3),
-                    elapsed_since_last_call_seconds=round(elapsed_since_last, 3),
-                    success=False,
-                    latency_seconds=round(latency_at_failure, 3),
-                    tools_returned=None,
-                    error_type=label,
-                    error_message=msg[:300],
-                )
-                logger.warning(
-                    "[connector=%s t=+%ds] FAILED %s after %.3fs: %s",
-                    connector.id,
-                    interval,
-                    label,
-                    latency_at_failure,
-                    msg[:200],
-                )
-                result.checkpoints.append(checkpoint)
-                # Session is presumed dead — further checkpoints would all
-                # fail the same way and just waste wall time.
-                break
-
-    except Exception as exc:  # noqa: BLE001
-        label, msg = _classify_error(exc)
-        result.fatal_error = f"{label}: {msg[:200]}"
-        logger.exception(
-            "[connector=%s] fatal during open/init: %s",
-            connector.id,
-            exc,
-        )
-    finally:
-        if session_cm is not None:
-            try:
-                await session_cm.__aexit__(None, None, None)
-            except Exception:
-                pass
-        if transport_cm is not None:
-            try:
-                await transport_cm.__aexit__(None, None, None)
-            except Exception:
-                pass
-        try:
-            await http_client.aclose()
-        except Exception:
-            pass
-
-    return result
-
-
-# ---------------------------------------------------------------------------
-# Reporting
-# ---------------------------------------------------------------------------
-
-
-def _render_table(results: list[ConnectorProbeResult]) -> str:
-    """Pretty-print a per-connector summary suitable for the terminal."""
-    lines: list[str] = []
-    lines.append("=" * 100)
-    lines.append("MCP Session Lifetime Probe Results")
-    lines.append("=" * 100)
-
-    for result in results:
-        lines.append("")
-        lines.append(
-            f"Connector {result.connector_id} | {result.connector_type} | "
-            f"{result.connector_name!r}"
-        )
-        lines.append(f"  url: {result.url}")
-        if result.fatal_error:
-            lines.append(f"  FATAL: {result.fatal_error}")
-            continue
-        lines.append(
-            f"  init handshake: "
-            f"{result.init_latency_seconds:.3f}s"
-            if result.init_latency_seconds is not None
-            else "  init handshake: (failed)"
-        )
-        lines.append(
-            f"  first list_tools (cold): "
-            f"{result.first_call_latency_seconds:.3f}s"
-            if result.first_call_latency_seconds is not None
-            else "  first list_tools: (failed)"
-        )
-        if not result.checkpoints:
-            lines.append("  (no idle checkpoints recorded)")
-            continue
-        lines.append(
-            f"  {'idle_s':>8} | {'since_last':>10} | {'outcome':>16} | "
-            f"{'latency':>9} | {'tools':>5}"
-        )
-        for cp in result.checkpoints:
-            outcome = "OK" if cp.success else (cp.error_type or "FAIL")
-            latency = f"{cp.latency_seconds:.3f}s" if cp.latency_seconds is not None else "-"
-            tools = str(cp.tools_returned) if cp.tools_returned is not None else "-"
-            lines.append(
-                f"  {cp.idle_seconds_target:>8} | "
-                f"{cp.elapsed_since_last_call_seconds:>10.1f} | "
-                f"{outcome:>16} | "
-                f"{latency:>9} | "
-                f"{tools:>5}"
-            )
-
-    lines.append("")
-    lines.append("=" * 100)
-    lines.append("Summary")
-    lines.append("=" * 100)
-    survived: dict[int, list[int]] = {}
-    for result in results:
-        for cp in result.checkpoints:
-            if cp.success:
-                survived.setdefault(cp.idle_seconds_target, []).append(
-                    result.connector_id
-                )
-    if survived:
-        for interval in sorted(survived):
-            ids = sorted(survived[interval])
-            lines.append(
-                f"  Idle {interval:>5}s: {len(ids)}/{len(results)} connectors "
-                f"survived ({ids})"
-            )
-    else:
-        lines.append("  (no successful checkpoints)")
-    return "\n".join(lines)
-
-
-# ---------------------------------------------------------------------------
-# Entry point
-# ---------------------------------------------------------------------------
-
-
-def _parse_int_list(value: str) -> list[int]:
-    return [int(x) for x in value.split(",") if x.strip()]
-
-
-def _parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        description="Probe MCP server session lifetime (read-only)",
-    )
-    parser.add_argument(
-        "--connectors",
-        type=_parse_int_list,
-        default=None,
-        help="Comma-separated connector IDs to probe. Default: all MCP connectors.",
-    )
-    parser.add_argument(
-        "--intervals",
-        type=_parse_int_list,
-        default=None,
-        help="Comma-separated idle intervals in seconds. "
-        f"Default: {DEFAULT_INTERVALS_SECONDS}",
-    )
-    parser.add_argument(
-        "--quick",
-        action="store_true",
-        help=f"Short run (intervals={QUICK_INTERVALS_SECONDS}) for fast iteration.",
-    )
-    parser.add_argument(
-        "--output",
-        type=str,
-        default=None,
-        help="Optional path for the raw JSON results.",
-    )
-    return parser.parse_args()
-
-
-async def _async_main() -> int:
-    args = _parse_args()
-    if args.intervals is not None:
-        intervals = args.intervals
-    elif args.quick:
-        intervals = QUICK_INTERVALS_SECONDS
-    else:
-        intervals = DEFAULT_INTERVALS_SECONDS
-
-    longest = max(intervals) if intervals else 0
-    logger.info(
-        "Probing intervals=%s (longest=%ds, ~%dmin total wall time)",
-        intervals,
-        longest,
-        (longest + 30) // 60,
-    )
-
-    connectors = await _fetch_connectors(args.connectors)
-    if not connectors:
-        logger.error("No MCP connectors found to probe.")
-        return 2
-    logger.info(
-        "Probing %d connector(s): %s",
-        len(connectors),
-        [f"{c.id}:{c.name}" for c in connectors],
-    )
-
-    started_at = time.time()
-    results = await asyncio.gather(
-        *[_probe_one_connector(c, intervals) for c in connectors],
-        return_exceptions=False,
-    )
-    elapsed = time.time() - started_at
-    logger.info("All probes complete in %.1fs", elapsed)
-
-    table = _render_table(results)
-    print(table)
-
-    output_path = (
-        args.output
-        or f"mcp_session_probe_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
-    )
-    with open(output_path, "w", encoding="utf-8") as fp:
-        json.dump(
-            {
-                "started_at": datetime.fromtimestamp(started_at).isoformat(),
-                "elapsed_seconds": round(elapsed, 1),
-                "intervals_tested": intervals,
-                "results": [asdict(r) for r in results],
-            },
-            fp,
-            indent=2,
-        )
-    logger.info("Raw results saved to %s", output_path)
-    return 0
-
-
-def main() -> None:
-    try:
-        exit_code = asyncio.run(_async_main())
-    except KeyboardInterrupt:
-        logger.warning("Interrupted by user")
-        exit_code = 130
-    sys.exit(exit_code)
-
-
-if __name__ == "__main__":
-    main()