chat: fix mixed-decision HITL crash and fold resumed assistant messages into the interrupted bubble.

2026-05-12 09:12:40 +02:00 · 2026-05-09 22:54:07 +02:00 · 2026-05-09 22:54:07 +02:00 · 932bf22a34
commit 932bf22a34
parent 2e132513be
6 changed files with 208 additions and 40 deletions
--- a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/config.py
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/config.py
@ -6,12 +6,19 @@ exposes the side-channel ``stream_resume_chat`` uses to ferry resume payloads.

 from __future__ import annotations

+import logging
 from typing import Any

 from langchain.tools import ToolRuntime

 from .constants import DEFAULT_SUBAGENT_RECURSION_LIMIT

+logger = logging.getLogger(__name__)
+
+# langgraph stores the parent task's scratchpad under this ``configurable`` key;
+# subagents inherit the chain via the ``parent_scratchpad`` fallback.
+_LANGGRAPH_SCRATCHPAD_KEY = "__pregel_scratchpad"
+

 def subagent_invoke_config(runtime: ToolRuntime) -> dict[str, Any]:
    """RunnableConfig for the nested invoke; raises ``recursion_limit`` to the parent's budget."""
@ -42,3 +49,42 @@ def has_surfsense_resume(runtime: ToolRuntime) -> bool:
    if not isinstance(configurable, dict):
        return False
    return "surfsense_resume_value" in configurable
+
+
+def drain_parent_null_resume(runtime: ToolRuntime) -> None:
+    """Consume the parent's lingering ``NULL_TASK_ID/RESUME`` write before delegating.
+
+    ``stream_resume_chat`` wakes the main agent with
+    ``Command(resume={"decisions": [...]})`` so the propagated
+    ``_lg_interrupt(...)`` can return. langgraph stores that payload as the
+    parent task's ``null_resume`` pending write, which only gets consumed
+    *after* ``subagent.[a]invoke`` returns (when the post-call propagation
+    re-fires). While the subagent is mid-execution, any *new* ``interrupt()``
+    inside it (e.g. a follow-up tool call after a mixed approve/reject) walks
+    ``subagent_scratchpad → parent_scratchpad.get_null_resume`` and picks up
+    the parent's still-live decisions — mismatching against a different number
+    of hanging tool calls and crashing ``HumanInTheLoopMiddleware``.
+
+    Draining the write here closes that cross-graph leak so subagent
+    interrupts pause cleanly and re-propagate as a fresh approval card.
+    """
+    cfg = runtime.config or {}
+    configurable = cfg.get("configurable") if isinstance(cfg, dict) else None
+    if not isinstance(configurable, dict):
+        return
+    scratchpad = configurable.get(_LANGGRAPH_SCRATCHPAD_KEY)
+    if scratchpad is None:
+        return
+    consume = getattr(scratchpad, "get_null_resume", None)
+    if not callable(consume):
+        return
+    try:
+        consume(True)
+    except Exception:
+        # Defensive: if langgraph's internal scratchpad shape changes we don't
+        # want to break the resume path. Worst case the decision-count mismatch
+        # ValueError still surfaces — same behavior as before this fix.
+        logger.debug(
+            "drain_parent_null_resume: scratchpad.get_null_resume raised",
+            exc_info=True,
+        )
--- a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/task_tool.py
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/task_tool.py
@ -20,6 +20,7 @@ from langgraph.types import Command

 from .config import (
    consume_surfsense_resume,
+    drain_parent_null_resume,
    has_surfsense_resume,
    subagent_invoke_config,
 )
@ -157,6 +158,9 @@ def build_task_tool_with_parent_config(
                )
            expected = hitlrequest_action_count(pending_value)
            resume_value = fan_out_decisions_to_match(resume_value, expected)
+            # Prevent the parent's resume payload from leaking into subagent
+            # interrupts via langgraph's parent_scratchpad fallback.
+            drain_parent_null_resume(runtime)
            result = subagent.invoke(
                build_resume_command(resume_value, pending_id),
                config=sub_config,
@ -221,6 +225,9 @@ def build_task_tool_with_parent_config(
                )
            expected = hitlrequest_action_count(pending_value)
            resume_value = fan_out_decisions_to_match(resume_value, expected)
+            # Prevent the parent's resume payload from leaking into subagent
+            # interrupts via langgraph's parent_scratchpad fallback.
+            drain_parent_null_resume(runtime)
            result = await subagent.ainvoke(
                build_resume_command(resume_value, pending_id),
                config=sub_config,