chat: fix mixed-decision HITL crash and fold resumed assistant messages into the interrupted bubble.

This commit is contained in:
CREDO23 2026-05-09 22:54:07 +02:00
parent 2e132513be
commit 932bf22a34
6 changed files with 208 additions and 40 deletions

View file

@ -6,12 +6,19 @@ exposes the side-channel ``stream_resume_chat`` uses to ferry resume payloads.
from __future__ import annotations
import logging
from typing import Any
from langchain.tools import ToolRuntime
from .constants import DEFAULT_SUBAGENT_RECURSION_LIMIT
# Module-level logger; ``drain_parent_null_resume`` uses it to report (at DEBUG
# level) errors raised while consuming the parent's pending resume write.
logger = logging.getLogger(__name__)
# langgraph stores the parent task's scratchpad under this configurable key;
# subagents inherit the chain via ``parent_scratchpad`` fallback.
# ``drain_parent_null_resume`` looks the scratchpad up in
# ``runtime.config["configurable"]`` under this key.
_LANGGRAPH_SCRATCHPAD_KEY = "__pregel_scratchpad"
def subagent_invoke_config(runtime: ToolRuntime) -> dict[str, Any]:
"""RunnableConfig for the nested invoke; raises ``recursion_limit`` to the parent's budget."""
@ -42,3 +49,42 @@ def has_surfsense_resume(runtime: ToolRuntime) -> bool:
if not isinstance(configurable, dict):
return False
return "surfsense_resume_value" in configurable
def drain_parent_null_resume(runtime: ToolRuntime) -> None:
    """Discard the parent task's pending ``NULL_TASK_ID/RESUME`` write before delegating.

    Background: ``stream_resume_chat`` wakes the main agent with
    ``Command(resume={"decisions": [...]})`` so that the propagated
    ``_lg_interrupt(...)`` can return. langgraph keeps that payload as the
    parent task's ``null_resume`` pending write, and it is only consumed
    *after* ``subagent.[a]invoke`` returns (when the post-call propagation
    re-fires).

    The problem: while the subagent is still running, any *new*
    ``interrupt()`` raised inside it (e.g. a follow-up tool call after a
    mixed approve/reject) falls back from the subagent's scratchpad to the
    parent's ``get_null_resume`` and picks up the parent's still-live
    decisions. Those decisions were made for a different number of hanging
    tool calls, so the mismatch crashes ``HumanInTheLoopMiddleware``.

    Consuming the write here closes that cross-graph leak: subagent
    interrupts pause cleanly and re-propagate as a fresh approval card.

    Args:
        runtime: Tool runtime whose ``config["configurable"]`` may hold the
            langgraph scratchpad under ``_LANGGRAPH_SCRATCHPAD_KEY``.

    Returns:
        None. The call is a silent no-op when the config, the
        ``configurable`` mapping, the scratchpad, or a callable
        ``get_null_resume`` attribute is missing.
    """
    config_map = runtime.config or {}
    if not isinstance(config_map, dict):
        return

    settings = config_map.get("configurable")
    if not isinstance(settings, dict):
        return

    # A missing scratchpad comes back as ``None``; ``getattr`` on ``None``
    # yields the ``None`` default, so the single ``callable`` guard below
    # covers both "no scratchpad" and "scratchpad without the accessor".
    pad = settings.get(_LANGGRAPH_SCRATCHPAD_KEY)
    null_resume_getter = getattr(pad, "get_null_resume", None)
    if not callable(null_resume_getter):
        return

    try:
        # ``True`` asks the accessor to consume the pending write rather
        # than merely peek at it.
        null_resume_getter(True)
    except Exception:
        # Deliberately best-effort: should langgraph's internal scratchpad
        # shape change, the resume path must keep working. In the worst
        # case the original ValueError still surfaces, which is the same
        # behavior as before this drain existed.
        logger.debug(
            "drain_parent_null_resume: scratchpad.get_null_resume raised",
            exc_info=True,
        )

View file

@ -20,6 +20,7 @@ from langgraph.types import Command
from .config import (
consume_surfsense_resume,
drain_parent_null_resume,
has_surfsense_resume,
subagent_invoke_config,
)
@ -157,6 +158,9 @@ def build_task_tool_with_parent_config(
)
expected = hitlrequest_action_count(pending_value)
resume_value = fan_out_decisions_to_match(resume_value, expected)
# Prevent the parent's resume payload from leaking into subagent
# interrupts via langgraph's parent_scratchpad fallback.
drain_parent_null_resume(runtime)
result = subagent.invoke(
build_resume_command(resume_value, pending_id),
config=sub_config,
@ -221,6 +225,9 @@ def build_task_tool_with_parent_config(
)
expected = hitlrequest_action_count(pending_value)
resume_value = fan_out_decisions_to_match(resume_value, expected)
# Prevent the parent's resume payload from leaking into subagent
# interrupts via langgraph's parent_scratchpad fallback.
drain_parent_null_resume(runtime)
result = await subagent.ainvoke(
build_resume_command(resume_value, pending_id),
config=sub_config,