multi_agent_chat/middleware: real-graph regression test for all-reject parallel routing

2026-05-17 18:35:19 +02:00 · 2026-05-14 09:36:03 +02:00 · 2026-05-14 09:36:03 +02:00 · 8e10f38f32
commit 8e10f38f32
parent ca57b2106e
1 changed files with 216 additions and 0 deletions
--- a/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_parallel_reject_only_routing.py
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_parallel_reject_only_routing.py
@ -0,0 +1,216 @@
+"""Real-graph contract: all-reject decisions route correctly across parallel subagents.
+
+Heterogeneous routing is covered by ``test_parallel_heterogeneous_decisions``.
+This module pins the narrower edge case where **every** card on **every**
+paused subagent is rejected.
+
+Why a separate pin:
+
+1. **No approval-bias in the slicer.** A future "if no approvals, short-circuit
+   resume" optimization would be tempting (skips a langgraph round-trip) and
+   would also silently break this scenario. Pin it.
+2. **``message`` metadata pass-through across a run of rejects.** The reject
+   ``message`` is the user-visible reason ("looks suspicious", "duplicate",
+   etc.). Losing it would silently swallow user intent — the worst UX
+   failure mode for HITL. Heterogeneous covers one reject; here we verify a
+   sequence of rejects survives the slicer + bridge with distinct messages
+   intact and in order.
+3. **All branches complete with no leftover pending.** Even when nothing was
+   approved, the parent must drain every paused subagent so the SSE stream
+   can close cleanly. A bug that left one ``Interrupt.id`` un-keyed would
+   strand the conversation in "pending" forever.
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Annotated
+
+import pytest
+from langchain.tools import ToolRuntime
+from langchain_core.messages import AIMessage, HumanMessage
+from langchain_core.runnables import RunnableConfig
+from langgraph.checkpoint.memory import InMemorySaver
+from langgraph.graph import END, START, StateGraph
+from langgraph.graph.message import add_messages
+from langgraph.types import Command, Send, interrupt
+from typing_extensions import TypedDict
+
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.resume_routing import (
+    build_lg_resume_map,
+    collect_pending_tool_calls,
+    slice_decisions_by_tool_call,
+)
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.task_tool import (
+    build_task_tool_with_parent_config,
+)
+
+
+class _SubState(TypedDict, total=False):
+    """State schema for the recording subagent graph compiled in this module."""
+
+    # Plain list with no reducer annotation; the subagent's ``hitl`` node
+    # writes one ``AIMessage`` holding the JSON-encoded resume payload here.
+    messages: list
+
+
+class _DispatchState(TypedDict, total=False):
+    """State schema for the parent fan-out graph and for each ``Send`` payload."""
+
+    # ``add_messages`` is attached as the annotation metadata for this key.
+    messages: Annotated[list, add_messages]
+    # Tool-call id handed to ``ToolRuntime`` for this dispatch.
+    tcid: str
+    # Forwarded to the task tool as ``description``.
+    desc: str
+    # Forwarded to the task tool as ``subagent_type``.
+    subtype: str
+
+
+def _build_recording_subagent(checkpointer: InMemorySaver, *, action_count: int):
+    """Compile a one-node subagent that interrupts for review and echoes its resume value.
+
+    The single ``hitl`` node raises an interrupt advertising ``action_count``
+    actions named ``act_0``, ``act_1``, ... each of which allows the
+    ``approve``, ``reject`` and ``edit`` decisions. The value the interrupt
+    returns is dumped to JSON with sorted keys and stored as the content of
+    one ``AIMessage``, so the test can recognise each subagent's slice purely
+    by message content.
+    """
+
+    def hitl_node(_state):
+        requests = []
+        configs = []
+        for idx in range(action_count):
+            action_name = f"act_{idx}"
+            requests.append(
+                {"name": action_name, "args": {"i": idx}, "description": ""}
+            )
+            configs.append(
+                {
+                    "action_name": action_name,
+                    "allowed_decisions": ["approve", "reject", "edit"],
+                }
+            )
+
+        # Whatever ``interrupt`` hands back is recorded verbatim below.
+        resume_value = interrupt(
+            {"action_requests": requests, "review_configs": configs}
+        )
+        recorded = AIMessage(content=json.dumps(resume_value, sort_keys=True))
+        return {"messages": [recorded]}
+
+    builder = StateGraph(_SubState)
+    builder.add_node("hitl", hitl_node)
+    builder.add_edge(START, "hitl")
+    builder.add_edge("hitl", END)
+    return builder.compile(checkpointer=checkpointer)
+
+
+def _parent_two_branches(task_tool, *, dispatches, checkpointer):
+    """Compile a parent graph that fans out one ``call_task`` run per dispatch.
+
+    Every entry of ``dispatches`` is sent to the ``call_task`` node as that
+    run's state. The node wraps the state in a ``ToolRuntime`` (using the
+    dispatch's ``tcid`` as the tool-call id) and awaits the task tool's
+    coroutine with the dispatch's ``desc`` and ``subtype``.
+    """
+
+    async def call_task(state: _DispatchState, config: RunnableConfig):
+        tool_call_id = state["tcid"]
+        runtime = ToolRuntime(
+            state=state,
+            tool_call_id=tool_call_id,
+            config=config,
+            context=None,
+            store=None,
+            stream_writer=None,
+        )
+        outcome = await task_tool.coroutine(
+            description=state["desc"],
+            subagent_type=state["subtype"],
+            runtime=runtime,
+        )
+        return outcome
+
+    def fanout(_state) -> list[Send]:
+        sends = []
+        for dispatch in dispatches:
+            sends.append(Send("call_task", dispatch))
+        return sends
+
+    builder = StateGraph(_DispatchState)
+    builder.add_node("call_task", call_task)
+    builder.add_conditional_edges(START, fanout, ["call_task"])
+    builder.add_edge("call_task", END)
+    return builder.compile(checkpointer=checkpointer)
+
+
+@pytest.mark.asyncio
+async def test_all_reject_decisions_route_to_each_subagent_with_messages_intact():
+    """All cards rejected across two parallel subagents — order and messages preserved.
+
+    Setup mirrors a real "user reviews two parallel ticket creations and
+    rejects everything with distinct reasons":
+
+    - Sub-A pauses with 2 actions.
+    - Sub-B pauses with 1 action.
+    - Flat decisions: 3 rejects, each with a unique ``message``.
+
+    Asserts each subagent receives only its slice, in original order,
+    with every ``message`` intact and no ``edited_action`` fields fabricated.
+    """
+    # One checkpointer is shared by the parent graph and both subagents.
+    checkpointer = InMemorySaver()
+
+    sub_a = _build_recording_subagent(checkpointer, action_count=2)
+    sub_b = _build_recording_subagent(checkpointer, action_count=1)
+
+    task_tool = build_task_tool_with_parent_config(
+        [
+            {"name": "agent-a", "description": "first", "runnable": sub_a},
+            {"name": "agent-b", "description": "second", "runnable": sub_b},
+        ]
+    )
+
+    parent = _parent_two_branches(
+        task_tool,
+        dispatches=[
+            {"tcid": "tcid-A", "subtype": "agent-a", "desc": "do A"},
+            {"tcid": "tcid-B", "subtype": "agent-b", "desc": "do B"},
+        ],
+        checkpointer=checkpointer,
+    )
+
+    config: dict = {
+        "configurable": {"thread_id": "all-reject-thread"},
+        "recursion_limit": 100,
+    }
+    # First pass: the test expects both branches to pause on their interrupt.
+    await parent.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+
+    paused_state = await parent.aget_state(config)
+    assert len(paused_state.interrupts) == 2, (
+        f"fixture broken: expected 2 paused subagents, got {len(paused_state.interrupts)}"
+    )
+
+    a_reject_0 = {"type": "reject", "message": "A[0] looks suspicious"}
+    a_reject_1 = {"type": "reject", "message": "A[1] duplicates A[0]"}
+    b_reject_0 = {"type": "reject", "message": "B[0] needs more context"}
+    flat_decisions = [a_reject_0, a_reject_1, b_reject_0]
+
+    pending = collect_pending_tool_calls(paused_state)
+    by_tool_call_id = slice_decisions_by_tool_call(flat_decisions, pending)
+
+    assert by_tool_call_id == {
+        "tcid-A": {"decisions": [a_reject_0, a_reject_1]},
+        "tcid-B": {"decisions": [b_reject_0]},
+    }, f"REGRESSION: slicer mis-routed all-reject decisions: {by_tool_call_id!r}"
+
+    # The sliced decisions are exposed twice: through the config under
+    # ``surfsense_resume_value`` and through the resume map given to langgraph.
+    config["configurable"]["surfsense_resume_value"] = by_tool_call_id
+    lg_resume_map = build_lg_resume_map(paused_state, by_tool_call_id)
+
+    await parent.ainvoke(Command(resume=lg_resume_map), config)
+
+    final_state = await parent.aget_state(config)
+    assert not final_state.interrupts, (
+        f"REGRESSION: leftover pending interrupts after all-reject resume: "
+        f"{final_state.interrupts!r}"
+    )
+
+    # Keep only message bodies that decode to a JSON *object*. Non-JSON text
+    # (such as the "seed" human message) is expected and skipped. JSON scalars
+    # and arrays are skipped as well, so the ``.get`` calls further down never
+    # hit a non-dict and bury a real failure under an ``AttributeError``.
+    payloads: list[dict] = []
+    for msg in final_state.values.get("messages", []) or []:
+        content = getattr(msg, "content", None)
+        if not isinstance(content, str):
+            continue
+        try:
+            decoded = json.loads(content)
+        except json.JSONDecodeError:
+            continue
+        if isinstance(decoded, dict):
+            payloads.append(decoded)
+
+    expected_a = {"decisions": [a_reject_0, a_reject_1]}
+    expected_b = {"decisions": [b_reject_0]}
+
+    assert expected_a in payloads, (
+        f"REGRESSION: sub-A did not receive its 2-reject slice in order; "
+        f"payloads seen: {payloads!r}"
+    )
+    assert expected_b in payloads, (
+        f"REGRESSION: sub-B did not receive its single reject; "
+        f"payloads seen: {payloads!r}"
+    )
+
+    # A reject must travel untouched: no decision may gain an ``edited_action``.
+    for p in payloads:
+        for d in p.get("decisions", []):
+            assert "edited_action" not in d, (
+                f"REGRESSION: spurious ``edited_action`` on a reject — "
+                f"slicer/bridge mutated payload: {d!r}"
+            )