multi_agent_chat/middleware: real-graph regression tests for interrupt stamping

2026-05-17 18:35:19 +02:00 · 2026-05-13 19:57:09 +02:00 · 2026-05-13 19:57:09 +02:00 · 6fb011c95c
commit 6fb011c95c
parent e27883e88c
1 changed files with 284 additions and 0 deletions
--- a/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_subagent_interrupt_stamping.py
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_subagent_interrupt_stamping.py
@ -0,0 +1,284 @@
 """Production-shape regression tests for ``tool_call_id`` stamping on subagent interrupts.
 The production bug we're pinning here: when the orchestrator dispatches one or
 more ``task`` tool calls and the targeted subagents hit a HITL ``interrupt(...)``,
 the parent's persisted ``state.interrupts`` must carry the parent's
 ``tool_call_id`` on each interrupt value. Without that stamp,
 ``stream_resume_chat`` cannot route a flat ``decisions`` list back to the right
 paused subagent and resume fails with ``Decision count mismatch``.
 The tests in this module:
 - Build a **real** ``StateGraph`` subagent that calls real ``interrupt(...)``
  (no MagicMock, no patch of langgraph internals — those are exactly the kind
  of fakes that hid this bug).
 - Invoke the ``task`` tool from **inside a parent pregel** (via a tiny parent
  ``StateGraph`` node) so the subagent invocation happens in the
  production-shape "subgraph called from a parent tool node" context.
 - Assert on ``parent.state.interrupts[*].value["tool_call_id"]`` — the
  observable that ``stream_resume_chat`` reads.
 """
 from __future__ import annotations
 import pytest
 from langchain.tools import ToolRuntime
 from langchain_core.messages import AIMessage, HumanMessage
 from langchain_core.runnables import RunnableConfig
 from langgraph.checkpoint.memory import InMemorySaver
 from langgraph.graph import END, START, StateGraph
 from langgraph.types import Send, interrupt
 from typing_extensions import TypedDict
 from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.task_tool import (
    build_task_tool_with_parent_config,
 )
 class _S(TypedDict, total=False):
    messages: list
 def _build_single_interrupt_subagent(checkpointer: InMemorySaver):
    """Subagent that fires one HITL-bundle-shaped interrupt and waits for a decision."""
    def approve_node(_state):
        decision = interrupt(
            {
                "action_requests": [
                    {"name": "do_thing", "args": {"x": 1}, "description": ""}
                ],
                "review_configs": [{}],
            }
        )
        return {"messages": [AIMessage(content=f"got:{decision}")]}
    g = StateGraph(_S)
    g.add_node("approve", approve_node)
    g.add_edge(START, "approve")
    g.add_edge("approve", END)
    return g.compile(checkpointer=checkpointer)
 def _build_bundle_subagent(checkpointer: InMemorySaver):
    """Subagent that fires one interrupt carrying a 3-action bundle."""
    def bundle_node(_state):
        decision = interrupt(
            {
                "action_requests": [
                    {"name": "a", "args": {}, "description": ""},
                    {"name": "b", "args": {}, "description": ""},
                    {"name": "c", "args": {}, "description": ""},
                ],
                "review_configs": [{}, {}, {}],
            }
        )
        return {"messages": [AIMessage(content=f"bundle:{decision}")]}
    g = StateGraph(_S)
    g.add_node("bundle", bundle_node)
    g.add_edge(START, "bundle")
    g.add_edge("bundle", END)
    return g.compile(checkpointer=checkpointer)
 def _parent_graph_calling_task(task_tool, *, tool_call_id: str, checkpointer):
    """A tiny parent graph whose only node invokes ``task_tool`` from inside the pregel runtime.
    This is the minimal reproduction of production's "subagent invoked from
    inside a parent tool node" context — the *only* context where langgraph
    treats the subagent as a subgraph and routes its interrupts back to the
    parent's checkpoint.
    """
    async def call_task(state, config: RunnableConfig):
        rt = ToolRuntime(
            state=state,
            config=config,
            context=None,
            stream_writer=None,
            tool_call_id=tool_call_id,
            store=None,
        )
        return await task_tool.coroutine(
            description="please approve",
            subagent_type="approver",
            runtime=rt,
        )
    g = StateGraph(_S)
    g.add_node("call_task", call_task)
    g.add_edge(START, "call_task")
    g.add_edge("call_task", END)
    return g.compile(checkpointer=checkpointer)
 class _DispatchState(TypedDict, total=False):
    messages: list
    tcid: str
    desc: str
 def _parent_graph_dispatching_two_tasks_via_send(
    task_tool, *, tool_call_id_a: str, tool_call_id_b: str, checkpointer
 ):
    """A parent graph that dispatches two ``task`` calls as parallel pregel
    tasks via :class:`~langgraph.types.Send`.
    This mirrors the production dispatch mechanism: when the orchestrator's
    LLM emits two ``task`` tool calls in one turn, langchain's tool node
    fans them out as parallel pregel tasks (the same primitive as ``Send``)
    so each tool call gets its own pregel task that can raise
    ``GraphInterrupt`` independently — and pregel collects *all* of them
    into the parent's snapshot at the end of the superstep.
    """
    def fanout_edge(_state) -> list[Send]:
        return [
            Send("call_task", {"tcid": tool_call_id_a, "desc": "approve A"}),
            Send("call_task", {"tcid": tool_call_id_b, "desc": "approve B"}),
        ]
    async def call_task(state: _DispatchState, config: RunnableConfig):
        rt = ToolRuntime(
            state=state,
            config=config,
            context=None,
            stream_writer=None,
            tool_call_id=state["tcid"],
            store=None,
        )
        return await task_tool.coroutine(
            description=state["desc"], subagent_type="approver", runtime=rt
        )
    g = StateGraph(_DispatchState)
    g.add_node("call_task", call_task)
    g.add_conditional_edges(START, fanout_edge, ["call_task"])
    g.add_edge("call_task", END)
    return g.compile(checkpointer=checkpointer)
 def _parent_interrupt_values(snapshot) -> list[dict]:
    """Extract ``state.interrupts[*].value`` for assertions."""
    return [i.value for i in (snapshot.interrupts or ())]
@pytest.mark.asyncio
 async def test_single_subagent_interrupt_stamps_parent_tool_call_id():
    """A single paused subagent must surface to the parent with ``tool_call_id`` stamped.
    Production bug regression: was producing
    ``value={"action_requests": [...], "review_configs": [...]}`` (no
    ``tool_call_id``), causing ``stream_resume_chat`` to skip the interrupt
    and raise ``Decision count mismatch``.
    """
    checkpointer = InMemorySaver()
    subagent = _build_single_interrupt_subagent(checkpointer)
    task_tool = build_task_tool_with_parent_config(
        [{"name": "approver", "description": "approves", "runnable": subagent}]
    )
    parent = _parent_graph_calling_task(
        task_tool, tool_call_id="parent-tcid-A", checkpointer=checkpointer
    )
    parent_config = {
        "configurable": {"thread_id": "parent-thread"},
        "recursion_limit": 100,
    }
    await parent.ainvoke({"messages": [HumanMessage(content="seed")]}, parent_config)
    snap = await parent.aget_state(parent_config)
    values = _parent_interrupt_values(snap)
    assert len(values) == 1, (
        f"expected exactly 1 parent interrupt, got {len(values)}: {values!r}"
    )
    value = values[0]
    assert isinstance(value, dict)
    assert value.get("tool_call_id") == "parent-tcid-A", (
        f"REGRESSION: parent interrupt missing/wrong tool_call_id stamp. "
        f"Expected 'parent-tcid-A', got {value.get('tool_call_id')!r}. "
        f"Keys present: {sorted(value.keys())}"
    )
    # The original HITL payload must still be intact alongside the stamp.
    assert value.get("action_requests") == [
        {"name": "do_thing", "args": {"x": 1}, "description": ""}
    ]
@pytest.mark.asyncio
 async def test_two_parallel_subagents_each_stamp_their_own_tool_call_id():
    """Two ``task`` calls dispatched in parallel must each carry their own ``tool_call_id``.
    This is the actual production scenario (Linear + Jira ticket creation):
    two parallel ``task`` tool calls, both subagents hit HITL, parent must
    end up with two interrupts whose ``tool_call_id``s match the two
    distinct parent-level ``tool_call_id``s the LLM emitted.
    """
    checkpointer = InMemorySaver()
    subagent = _build_single_interrupt_subagent(checkpointer)
    task_tool = build_task_tool_with_parent_config(
        [{"name": "approver", "description": "approves", "runnable": subagent}]
    )
    parent = _parent_graph_dispatching_two_tasks_via_send(
        task_tool,
        tool_call_id_a="parent-tcid-A",
        tool_call_id_b="parent-tcid-B",
        checkpointer=checkpointer,
    )
    parent_config = {
        "configurable": {"thread_id": "parent-thread-parallel"},
        "recursion_limit": 100,
    }
    await parent.ainvoke({"messages": [HumanMessage(content="seed")]}, parent_config)
    snap = await parent.aget_state(parent_config)
    values = _parent_interrupt_values(snap)
    assert len(values) == 2, (
        f"expected 2 parent interrupts (one per parallel task call), "
        f"got {len(values)}: {values!r}"
    )
    stamps = {v.get("tool_call_id") for v in values}
    assert stamps == {"parent-tcid-A", "parent-tcid-B"}, (
        f"REGRESSION: parallel parent interrupts missing/wrong tool_call_id stamps. "
        f"Expected {{'parent-tcid-A', 'parent-tcid-B'}}, got {stamps!r}. "
        f"Values: {values!r}"
    )
@pytest.mark.asyncio
 async def test_bundle_subagent_interrupt_stamps_tool_call_id_preserving_actions():
    """A subagent emitting a multi-action bundle must surface stamped, with all actions intact.
    The bundle shape (``action_requests=[3 items]``) drives the
    ``slice_decisions_by_tool_call`` accounting in ``stream_resume_chat`` —
    if either the stamp or the action count is lost, resume routing
    miscounts and crashes.
    """
    checkpointer = InMemorySaver()
    subagent = _build_bundle_subagent(checkpointer)
    task_tool = build_task_tool_with_parent_config(
        [{"name": "approver", "description": "approves", "runnable": subagent}]
    )
    parent = _parent_graph_calling_task(
        task_tool, tool_call_id="parent-tcid-bundle", checkpointer=checkpointer
    )
    parent_config = {
        "configurable": {"thread_id": "parent-thread-bundle"},
        "recursion_limit": 100,
    }
    await parent.ainvoke({"messages": [HumanMessage(content="seed")]}, parent_config)
    snap = await parent.aget_state(parent_config)
    values = _parent_interrupt_values(snap)
    assert len(values) == 1
    value = values[0]
    assert value.get("tool_call_id") == "parent-tcid-bundle"
    assert isinstance(value.get("action_requests"), list)
    assert len(value["action_requests"]) == 3, (
        f"REGRESSION: bundle action_requests count changed during stamping; "
        f"got {len(value['action_requests'])} actions: {value['action_requests']!r}"
    )