Harden multi-agent for production: resume cleanup, busy-mutex race, deny propagation, disabled-tools.

2026-05-07 23:02:39 +02:00 · 2026-05-04 20:48:55 +02:00 · 2026-05-04 20:48:55 +02:00 · 65f1f8f73c
commit 65f1f8f73c
parent 7735becd02
7 changed files with 128 additions and 29 deletions
--- a/surfsense_backend/app/tasks/chat/stream_new_chat.py
+++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py
@@ -46,6 +46,7 @@ from app.agents.new_chat.memory_extraction import (
    extract_and_save_memory,
    extract_and_save_team_memory,
 )
+from app.agents.new_chat.errors import BusyError
 from app.agents.new_chat.middleware.busy_mutex import release_lock as _release_busy_lock
 from app.agents.new_chat.middleware.kb_persistence import (
    commit_staged_filesystem_state,
@@ -1977,6 +1978,11 @@ async def stream_new_chat(
    _premium_reserved = 0
    _premium_request_id: str | None = None

+    # ``BusyMutexMiddleware.abefore_agent`` raises ``BusyError`` *before*
+    # acquiring the lock, so a concurrent caller must not release the
+    # in-flight caller's lock from its own ``finally`` block.
+    _busy_error_raised = False
+
    session = async_session_maker()
    try:
        # Mark AI as responding to this user for live collaboration
@@ -2094,10 +2100,6 @@ async def stream_new_chat(

        _t0 = time.perf_counter()
        if use_multi_agent:
-            # TODO: Propagate ``disabled_tools`` into registry subagents. Today only the main
-            # agent honors UI disables; ``task`` delegates still get full specialist tool sets.
-            # Deliverables (and similar) are user-disableable but implemented on subagents, so
-            # disabling them in the UI does not fully apply until subagents filter too.
            agent = await create_registry_deep_agent(
                llm=llm,
                search_space_id=search_space_id,
@@ -2620,6 +2622,13 @@ async def stream_new_chat(
        yield streaming_service.format_finish()
        yield streaming_service.format_done()

+    except BusyError as e:
+        _busy_error_raised = True
+        yield streaming_service.format_error(str(e))
+        yield streaming_service.format_finish_step()
+        yield streaming_service.format_finish()
+        yield streaming_service.format_done()
+
    except Exception as e:
        # Handle any errors
        import traceback
@@ -2697,12 +2706,15 @@ async def stream_new_chat(

        # Release the busy lock here too: ``aafter_agent`` does not fire if the
        # graph paused on ``interrupt()`` or the stream bailed out early.
-        with contextlib.suppress(Exception):
-            if _release_busy_lock(str(chat_id)):
-                _perf_log.info(
-                    "[stream_new_chat] released stale busy lock (chat_id=%s)",
-                    chat_id,
-                )
+        # Skip on ``BusyError``: this caller never acquired the lock, so a
+        # release here would steal the in-flight caller's lock.
+        if not _busy_error_raised:
+            with contextlib.suppress(Exception):
+                if _release_busy_lock(str(chat_id)):
+                    _perf_log.info(
+                        "[stream_new_chat] released stale busy lock (chat_id=%s)",
+                        chat_id,
+                    )

        # Break circular refs held by the agent graph, tools, and LLM
        # wrappers so the GC can reclaim them in a single pass.
@@ -2754,6 +2766,10 @@ async def stream_resume_chat(

    accumulator = start_turn()

+    # See ``stream_new_chat``: skip the finally release when ``BusyError``
+    # short-circuited before this caller acquired the lock.
+    _busy_error_raised = False
+
    session = async_session_maker()
    try:
        if user_id:
@@ -3036,6 +3052,13 @@ async def stream_resume_chat(
        yield streaming_service.format_finish()
        yield streaming_service.format_done()

+    except BusyError as e:
+        _busy_error_raised = True
+        yield streaming_service.format_error(str(e))
+        yield streaming_service.format_finish_step()
+        yield streaming_service.format_finish()
+        yield streaming_service.format_done()
+
    except Exception as e:
        import traceback

@@ -3086,12 +3109,14 @@ async def stream_resume_chat(

        # Release the busy lock left held by the originally-interrupted turn,
        # and any re-interrupt or early bailout from this resume.
-        with contextlib.suppress(Exception):
-            if _release_busy_lock(str(chat_id)):
-                _perf_log.info(
-                    "[stream_resume] released stale busy lock (chat_id=%s)",
-                    chat_id,
-                )
+        # Skip on ``BusyError``: this caller never acquired the lock.
+        if not _busy_error_raised:
+            with contextlib.suppress(Exception):
+                if _release_busy_lock(str(chat_id)):
+                    _perf_log.info(
+                        "[stream_resume] released stale busy lock (chat_id=%s)",
+                        chat_id,
+                    )

        agent = llm = connector_service = None
        stream_result = None