mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-04 05:12:38 +02:00
feat: improved agent streaming
This commit is contained in:
parent
afb4b09cde
commit
c110f5b955
60 changed files with 8068 additions and 303 deletions
|
|
@ -15,6 +15,17 @@ from app.agents.new_chat.middleware.action_log import ActionLogMiddleware
|
|||
from app.agents.new_chat.tools.registry import ToolDefinition
|
||||
|
||||
|
||||
@dataclass
class _FakeRuntime:
    """Minimal stand-in for ``ToolRuntime`` used in unit tests.

    ``ActionLogMiddleware`` reads ``runtime.config['configurable']['turn_id']``
    to populate the new ``chat_turn_id`` column (see migration 135).
    """

    # LangGraph-style config mapping; tests set
    # ``{"configurable": {"turn_id": ...}}`` or leave it None.
    config: dict[str, Any] | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class _FakeRequest:
|
||||
"""Minimal stand-in for ToolCallRequest used in unit tests."""
|
||||
|
|
@ -120,6 +131,9 @@ class TestActionLogMiddlewarePersistence:
|
|||
"args": {"color": "red", "size": 3},
|
||||
"id": "tc-abc",
|
||||
},
|
||||
runtime=_FakeRuntime(
|
||||
config={"configurable": {"turn_id": "42:1700000000000"}}
|
||||
),
|
||||
)
|
||||
result_msg = ToolMessage(content="ok", tool_call_id="tc-abc", id="msg-1")
|
||||
handler = AsyncMock(return_value=result_msg)
|
||||
|
|
@ -142,6 +156,32 @@ class TestActionLogMiddlewarePersistence:
|
|||
assert row.error is None
|
||||
assert row.reverse_descriptor is None
|
||||
assert row.reversible is False
|
||||
# Migration 135: ``turn_id`` is the deprecated alias of ``tool_call_id``;
|
||||
# ``chat_turn_id`` comes from ``runtime.config['configurable']['turn_id']``.
|
||||
assert row.tool_call_id == "tc-abc"
|
||||
assert row.turn_id == "tc-abc"
|
||||
assert row.chat_turn_id == "42:1700000000000"
|
||||
|
||||
@pytest.mark.asyncio
async def test_chat_turn_id_none_when_runtime_missing(
    self, patch_get_flags, fake_session_factory
) -> None:
    """``chat_turn_id`` falls back to NULL when ``runtime.config`` is absent."""
    captured, factory = fake_session_factory
    mw = ActionLogMiddleware(thread_id=1, search_space_id=1, user_id=None)
    # runtime=None exercises the "no runtime at all" branch — not merely
    # a runtime with an empty config.
    request = _FakeRequest(
        tool_call={"name": "make_widget", "args": {}, "id": "tc-1"},
        runtime=None,
    )
    handler = AsyncMock(return_value=ToolMessage(content="ok", tool_call_id="tc-1"))
    with (
        patch_get_flags(_enabled_flags()),
        patch("app.db.shielded_async_session", side_effect=lambda: factory()),
    ):
        await mw.awrap_tool_call(request, handler)
    # The fake session factory captures every row added; exactly one
    # action-log row is expected for this tool call.
    row = captured["rows"][0]
    assert row.tool_call_id == "tc-1"
    assert row.chat_turn_id is None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_writes_row_on_failure_and_reraises(
|
||||
|
|
@ -293,6 +333,76 @@ class TestReverseDescriptor:
|
|||
assert row.reversible is False
|
||||
|
||||
|
||||
class TestActionLogDispatch:
    """Verify ``adispatch_custom_event`` fires after commit."""

    @pytest.mark.asyncio
    async def test_dispatches_action_log_event_on_success(
        self, patch_get_flags, fake_session_factory
    ) -> None:
        """A successful tool call dispatches one ``action_log`` event with the row payload."""
        _captured, factory = fake_session_factory
        mw = ActionLogMiddleware(thread_id=42, search_space_id=7, user_id="u1")
        request = _FakeRequest(
            tool_call={
                "name": "make_widget",
                "args": {"color": "red"},
                "id": "tc-evt",
            },
            runtime=_FakeRuntime(
                config={"configurable": {"turn_id": "42:1700000000000"}}
            ),
        )
        result_msg = ToolMessage(content="ok", tool_call_id="tc-evt", id="msg-42")
        handler = AsyncMock(return_value=result_msg)

        dispatch_mock = AsyncMock()
        with (
            patch_get_flags(_enabled_flags()),
            patch("app.db.shielded_async_session", side_effect=lambda: factory()),
            # Patch the name as imported inside the middleware module so the
            # call site actually hits the mock.
            patch(
                "app.agents.new_chat.middleware.action_log.adispatch_custom_event",
                dispatch_mock,
            ),
        ):
            await mw.awrap_tool_call(request, handler)

        dispatch_mock.assert_awaited_once()
        call_args = dispatch_mock.await_args
        assert call_args is not None
        # Positional args: (event_name, payload).
        assert call_args.args[0] == "action_log"
        payload = call_args.args[1]
        assert payload["lc_tool_call_id"] == "tc-evt"
        assert payload["chat_turn_id"] == "42:1700000000000"
        assert payload["tool_name"] == "make_widget"
        assert payload["reversible"] is False
        assert payload["reverse_descriptor_present"] is False
        assert payload["error"] is False

    @pytest.mark.asyncio
    async def test_no_dispatch_when_persistence_fails(self, patch_get_flags) -> None:
        """If commit fails the dispatch is suppressed (no row to surface)."""
        mw = ActionLogMiddleware(thread_id=1, search_space_id=1, user_id=None)
        request = _FakeRequest(
            tool_call={"name": "make_widget", "args": {}, "id": "tc1"}
        )
        handler = AsyncMock(return_value=ToolMessage(content="ok", tool_call_id="tc1"))
        dispatch_mock = AsyncMock()

        # Session factory that fails immediately, simulating a DB outage.
        def _exploding_session():
            raise RuntimeError("DB is down")

        with (
            patch_get_flags(_enabled_flags()),
            patch("app.db.shielded_async_session", side_effect=_exploding_session),
            patch(
                "app.agents.new_chat.middleware.action_log.adispatch_custom_event",
                dispatch_mock,
            ),
        ):
            # Must not raise: persistence failure is swallowed by the
            # middleware, and no event is emitted.
            await mw.awrap_tool_call(request, handler)
        dispatch_mock.assert_not_awaited()
|
||||
|
||||
|
||||
class TestArgsTruncation:
|
||||
@pytest.mark.asyncio
|
||||
async def test_huge_args_payload_is_truncated(
|
||||
|
|
|
|||
|
|
@ -0,0 +1,122 @@
|
|||
"""Tests for the desktop-mode safety ruleset.
|
||||
|
||||
In desktop mode the agent operates against the user's real disk with no
|
||||
revision history, so destructive filesystem operations must require
|
||||
explicit approval. These tests pin the set of tools that get the ``ask``
|
||||
gate so it cannot silently regress.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from app.agents.new_chat.middleware.permission import PermissionMiddleware
|
||||
from app.agents.new_chat.permissions import (
|
||||
Rule,
|
||||
Ruleset,
|
||||
aggregate_action,
|
||||
evaluate_many,
|
||||
)
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
# Mirror the ruleset built inside ``chat_deepagent._build_compiled_agent_blocking``
# when ``filesystem_mode == FilesystemMode.DESKTOP_LOCAL_FOLDER``. Keeping a
# copy here means the rule contract has a focused regression test even when
# the larger graph-build helper is hard to instantiate in unit tests.
DESKTOP_SAFETY_RULESET = Ruleset(
    rules=[
        # Each destructive filesystem op gets the "ask" gate on every path.
        Rule(permission="rm", pattern="*", action="ask"),
        Rule(permission="rmdir", pattern="*", action="ask"),
        Rule(permission="move_file", pattern="*", action="ask"),
        Rule(permission="edit_file", pattern="*", action="ask"),
        Rule(permission="write_file", pattern="*", action="ask"),
    ],
    origin="desktop_safety",
)

# Broad allow-everything baseline; the tests layer it before or after the
# safety ruleset to exercise ordering.
SURFSENSE_DEFAULTS = Ruleset(
    rules=[Rule(permission="*", pattern="*", action="allow")],
    origin="surfsense_defaults",
)
|
||||
|
||||
|
||||
def _action_for(tool_name: str, *rulesets: Ruleset) -> str:
    """Resolve the effective permission action for *tool_name* across layered rulesets."""
    return aggregate_action(evaluate_many(tool_name, [tool_name], *rulesets))
|
||||
|
||||
|
||||
class TestDesktopSafetyRulesGateDestructiveOps:
    """Pin which tools the desktop safety ruleset gates (and which it leaves alone)."""

    @pytest.mark.parametrize(
        "tool_name",
        ["rm", "rmdir", "move_file", "edit_file", "write_file"],
    )
    def test_destructive_op_resolves_to_ask(self, tool_name: str) -> None:
        # surfsense_defaults says "allow */*"; desktop_safety must override
        # because it's layered later (last-match-wins).
        action = _action_for(tool_name, SURFSENSE_DEFAULTS, DESKTOP_SAFETY_RULESET)
        assert action == "ask", (
            f"{tool_name} must require approval in desktop mode "
            f"(no revert path on real disk); got {action!r}"
        )

    @pytest.mark.parametrize(
        "tool_name",
        ["read_file", "ls", "list_tree", "grep", "glob", "cd", "pwd", "mkdir"],
    )
    def test_safe_ops_remain_allowed(self, tool_name: str) -> None:
        # Read-only and trivially-reversible tools must NOT get gated —
        # otherwise every navigation in desktop mode pops an interrupt.
        action = _action_for(tool_name, SURFSENSE_DEFAULTS, DESKTOP_SAFETY_RULESET)
        assert action == "allow", (
            f"{tool_name} should not be gated in desktop mode; got {action!r}"
        )
|
||||
|
||||
|
||||
class TestDesktopSafetyOverridesAllowDefault:
    """Guard the layering order assumed by ``_build_compiled_agent_blocking``."""

    def test_layer_order_last_match_wins(self) -> None:
        # Swapped order (safety first, defaults last): the broad allow rule
        # matches last, so the safety net is inert.
        wrong_order = _action_for("rm", DESKTOP_SAFETY_RULESET, SURFSENSE_DEFAULTS)
        assert wrong_order == "allow"

        # Intended order (defaults first, safety last): "ask" matches last
        # and therefore wins.
        right_order = _action_for("rm", SURFSENSE_DEFAULTS, DESKTOP_SAFETY_RULESET)
        assert right_order == "ask"
|
||||
|
||||
|
||||
class TestPermissionMiddlewareIntegration:
    """End-to-end-ish check that the middleware takes the "ask" path for ``rm``."""

    def test_middleware_raises_interrupt_for_rm_in_desktop_mode(self) -> None:
        # Imported locally to keep module import cheap for the pure-ruleset
        # tests above.
        from langchain_core.messages import AIMessage

        from app.agents.new_chat.errors import RejectedError

        mw = PermissionMiddleware(rulesets=[SURFSENSE_DEFAULTS, DESKTOP_SAFETY_RULESET])
        # Stub the interrupt to a "reject" decision so we can assert the
        # ask path was taken without spinning up the LangGraph runtime.
        mw._raise_interrupt = lambda **kw: {"decision_type": "reject"}  # type: ignore[assignment]

        # Minimal agent state: one AI message carrying the rm tool call.
        state = {
            "messages": [
                AIMessage(
                    content="",
                    tool_calls=[
                        {
                            "name": "rm",
                            "args": {"path": "/Users/me/Documents/important.docx"},
                            "id": "tc-rm",
                        }
                    ],
                )
            ]
        }

        class _FakeRuntime:
            # NOTE(review): presumably only ``config`` is read by
            # after_model here — confirm against PermissionMiddleware.
            config: dict = {"configurable": {"thread_id": "test"}}

        # A rejected "ask" decision surfaces as RejectedError to the caller.
        with pytest.raises(RejectedError):
            mw.after_model(state, _FakeRuntime())
|
||||
|
|
@ -0,0 +1,111 @@
|
|||
"""Tests for the default auto-approval list in ``hitl.request_approval``.
|
||||
|
||||
These pin the policy that low-stakes connector creation tools (drafts,
|
||||
new-file creates) skip the HITL interrupt by default. Without this set,
|
||||
every "draft my newsletter" turn used to fire ~3 interrupts before any
|
||||
useful work happened.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from app.agents.new_chat.tools.hitl import (
|
||||
DEFAULT_AUTO_APPROVED_TOOLS,
|
||||
HITLResult,
|
||||
request_approval,
|
||||
)
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
class TestDefaultAutoApprovedToolsList:
    """Pin the contents and immutability of the default auto-approval set."""

    def test_set_contains_expected_creation_tools(self) -> None:
        # One explicit test to touch whenever the policy list changes;
        # keep in sync with ``hitl.DEFAULT_AUTO_APPROVED_TOOLS``.
        assert DEFAULT_AUTO_APPROVED_TOOLS == {
            "create_gmail_draft",
            "update_gmail_draft",
            "create_notion_page",
            "create_confluence_page",
            "create_google_drive_file",
            "create_dropbox_file",
            "create_onedrive_file",
        }

    def test_set_is_immutable(self) -> None:
        # A frozenset cannot be widened at runtime, so the auto-approval
        # surface cannot grow by accidental mutation.
        assert isinstance(DEFAULT_AUTO_APPROVED_TOOLS, frozenset)

    def test_send_tools_are_not_auto_approved(self) -> None:
        # Tools that broadcast externally (or delete data) must always prompt.
        always_gated = (
            "send_gmail_email",
            "send_discord_message",
            "send_teams_message",
            "delete_notion_page",
            "create_calendar_event",
            "delete_calendar_event",
        )
        for tool_name in always_gated:
            assert tool_name not in DEFAULT_AUTO_APPROVED_TOOLS, (
                f"{tool_name} must remain HITL-gated"
            )
|
||||
|
||||
|
||||
class TestRequestApprovalAutoBypass:
    """Exercise the bypass paths of ``request_approval`` outside a graph runtime."""

    def test_auto_approved_tool_skips_interrupt(self) -> None:
        # No interrupt mock set up — if the function attempted to call
        # ``langgraph.types.interrupt`` it would raise GraphInterrupt.
        # The fact that we get a clean HITLResult proves the bypass.
        result = request_approval(
            action_type="gmail_draft_creation",
            tool_name="create_gmail_draft",
            params={"to": "alice@example.com", "subject": "hi", "body": "hey"},
        )
        assert isinstance(result, HITLResult)
        assert result.rejected is False
        assert result.decision_type == "auto_approved"
        # Original params are preserved untouched (no user edits possible).
        assert result.params == {
            "to": "alice@example.com",
            "subject": "hi",
            "body": "hey",
        }

    def test_non_listed_tool_still_attempts_interrupt(self) -> None:
        # A tool NOT in the default list must reach ``langgraph.interrupt``.
        # Outside a runnable context that call raises a RuntimeError —
        # which is exactly the signal we want: the bypass did NOT fire.
        with pytest.raises(RuntimeError, match="runnable context"):
            request_approval(
                action_type="gmail_email_send",
                tool_name="send_gmail_email",
                params={"to": "alice@example.com", "subject": "hi", "body": "hey"},
            )

    def test_user_trusted_tools_still_take_precedence(self) -> None:
        # ``trusted_tools`` (per-connector "always allow" from MCP/UI)
        # was checked BEFORE the default list and must keep working
        # for tools outside the default list.
        result = request_approval(
            action_type="mcp_tool_call",
            tool_name="my_custom_mcp_tool",
            params={"x": 1},
            trusted_tools=["my_custom_mcp_tool"],
        )
        assert result.decision_type == "trusted"
        assert result.rejected is False

    def test_auto_approved_overrides_no_trusted_tools(self) -> None:
        # When trusted_tools is empty and tool is in the default list,
        # we should still bypass — proves the order in request_approval.
        result = request_approval(
            action_type="notion_page_creation",
            tool_name="create_notion_page",
            params={"title": "Plan"},
            trusted_tools=[],
        )
        assert result.decision_type == "auto_approved"
|
||||
|
|
@ -0,0 +1,333 @@
|
|||
"""Cloud-mode behavior tests for the new ``rm`` and ``rmdir`` filesystem tools.
|
||||
|
||||
The tools build ``Command(update=...)`` payloads that the persistence
|
||||
middleware applies at end of turn. These tests stub out the backend and
|
||||
runtime to assert the staging payload shape:
|
||||
|
||||
* ``rm`` queues into ``pending_deletes`` and tombstones state files.
|
||||
* ``rm`` rejects directories, ``/documents``, root, and the anonymous doc.
|
||||
* ``rmdir`` queues into ``pending_dir_deletes`` and rejects non-empty dirs.
|
||||
* ``rmdir`` un-stages a same-turn ``mkdir`` rather than queuing a delete.
|
||||
* ``rmdir`` refuses to drop the cwd or any of its ancestors.
|
||||
* ``KBPostgresBackend`` view-helpers honor staged deletes.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from types import SimpleNamespace
|
||||
from typing import Any
|
||||
from unittest.mock import AsyncMock
|
||||
|
||||
import pytest
|
||||
|
||||
from app.agents.new_chat.filesystem_selection import FilesystemMode
|
||||
from app.agents.new_chat.middleware.filesystem import SurfSenseFilesystemMiddleware
|
||||
from app.agents.new_chat.middleware.kb_postgres_backend import KBPostgresBackend
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
def _make_middleware(mode: FilesystemMode = FilesystemMode.CLOUD):
    """Build a bare ``SurfSenseFilesystemMiddleware`` without running ``__init__``."""
    # ``__new__`` skips the real constructor (which needs a runtime/session);
    # only the two attributes the rm/rmdir tools read are populated.
    mw = SurfSenseFilesystemMiddleware.__new__(SurfSenseFilesystemMiddleware)
    mw._filesystem_mode = mode
    mw._custom_tool_descriptions = {}
    return mw
|
||||
|
||||
|
||||
def _runtime(state: dict[str, Any] | None = None, *, tool_call_id: str = "tc-abc"):
|
||||
state = state or {}
|
||||
state.setdefault("cwd", "/documents")
|
||||
return SimpleNamespace(state=state, tool_call_id=tool_call_id)
|
||||
|
||||
|
||||
class _KBBackendStub(KBPostgresBackend):
    """Construct-able subclass of :class:`KBPostgresBackend` for tests.

    We bypass the real ``__init__`` (which expects a runtime + DB session)
    and inject just the methods the rm/rmdir tools touch. The class
    inheritance keeps ``isinstance(backend, KBPostgresBackend)`` checks
    inside the tools happy, which is what gates them from the desktop
    code path.
    """

    def __init__(self, *, children=None, file_data=None) -> None:
        # Directory listing stub; ``children`` is a list of entry dicts.
        self.als_info = AsyncMock(return_value=children or [])
        # Returns a (file_data, doc_id) pair when the path is a file, else
        # None. 17 is an arbitrary doc id — presumably any int works here.
        self._load_file_data = AsyncMock(
            return_value=(file_data, 17) if file_data is not None else None
        )
|
||||
|
||||
|
||||
def _make_backend_stub(*, children=None, file_data=None) -> KBPostgresBackend:
    """Shorthand factory around :class:`_KBBackendStub`."""
    stub = _KBBackendStub(children=children, file_data=file_data)
    return stub
|
||||
|
||||
|
||||
def _bind_backend(middleware, backend):
|
||||
"""Inject a backend resolver onto the middleware test instance."""
|
||||
middleware._get_backend = lambda runtime: backend
|
||||
return backend
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# rm
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRmStaging:
    """Cloud-mode ``rm``: staging payload shape and refusal paths."""

    @pytest.mark.asyncio
    async def test_stages_delete_and_tombstones_state(self):
        m = _make_middleware()
        _bind_backend(m, _make_backend_stub(children=[], file_data={"content": ["x"]}))
        runtime = _runtime(
            {
                "cwd": "/documents",
                "files": {"/documents/notes.md": {"content": ["hello"]}},
                "doc_id_by_path": {"/documents/notes.md": 17},
            },
            tool_call_id="tc-1",
        )

        tool = m._create_rm_tool()
        result = await tool.coroutine("/documents/notes.md", runtime=runtime)

        # Success path returns a Command-like object carrying state updates.
        assert hasattr(result, "update"), f"expected Command, got {result!r}"
        update = result.update
        assert update["pending_deletes"] == [
            {"path": "/documents/notes.md", "tool_call_id": "tc-1"}
        ]
        # None values tombstone the path in both state maps.
        assert update["files"] == {"/documents/notes.md": None}
        assert update["doc_id_by_path"] == {"/documents/notes.md": None}

    @pytest.mark.asyncio
    async def test_rejects_documents_root(self):
        # Refusal paths return a plain string message, not a Command.
        m = _make_middleware()
        runtime = _runtime()
        tool = m._create_rm_tool()
        result = await tool.coroutine("/documents", runtime=runtime)
        assert isinstance(result, str)
        assert "refusing to rm" in result

    @pytest.mark.asyncio
    async def test_rejects_root(self):
        m = _make_middleware()
        runtime = _runtime()
        tool = m._create_rm_tool()
        result = await tool.coroutine("/", runtime=runtime)
        assert isinstance(result, str)
        assert "refusing to rm" in result

    @pytest.mark.asyncio
    async def test_rejects_directory_via_staged_dirs(self):
        # Directory created earlier in the same turn (in ``staged_dirs``).
        m = _make_middleware()
        runtime = _runtime(
            {
                "staged_dirs": ["/documents/team-x"],
            }
        )
        tool = m._create_rm_tool()
        result = await tool.coroutine("/documents/team-x", runtime=runtime)
        assert isinstance(result, str)
        assert "directory" in result.lower()
        # Error message must steer the agent to the right tool.
        assert "rmdir" in result

    @pytest.mark.asyncio
    async def test_rejects_directory_via_listing(self):
        # Directory detected by having children in the backend listing.
        m = _make_middleware()
        _bind_backend(
            m,
            _make_backend_stub(
                children=[{"path": "/documents/foo/x.md", "is_dir": False}]
            ),
        )
        runtime = _runtime()
        tool = m._create_rm_tool()
        result = await tool.coroutine("/documents/foo", runtime=runtime)
        assert isinstance(result, str)
        assert "directory" in result.lower()

    @pytest.mark.asyncio
    async def test_rejects_anonymous_doc(self):
        # The anonymous upload doc is read-only and must not be deletable.
        m = _make_middleware()
        runtime = _runtime(
            {
                "kb_anon_doc": {
                    "path": "/documents/uploaded.xml",
                    "title": "uploaded",
                    "content": "",
                    "chunks": [],
                }
            }
        )
        tool = m._create_rm_tool()
        result = await tool.coroutine("/documents/uploaded.xml", runtime=runtime)
        assert isinstance(result, str)
        assert "read-only" in result

    @pytest.mark.asyncio
    async def test_drops_path_from_dirty_paths(self):
        m = _make_middleware()
        _bind_backend(m, _make_backend_stub(children=[], file_data={"content": ["x"]}))
        runtime = _runtime(
            {
                "files": {"/documents/notes.md": {"content": ["x"]}},
                "doc_id_by_path": {"/documents/notes.md": 17},
                "dirty_paths": ["/documents/notes.md"],
            }
        )
        tool = m._create_rm_tool()
        result = await tool.coroutine("/documents/notes.md", runtime=runtime)
        update = result.update
        # First element is _CLEAR sentinel; the rest must NOT contain the
        # rm'd path.
        dirty = update.get("dirty_paths") or []
        assert "/documents/notes.md" not in dirty[1:]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# rmdir
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRmdirStaging:
    """Cloud-mode ``rmdir``: staging, un-staging, and refusal paths."""

    @pytest.mark.asyncio
    async def test_stages_dir_delete_when_empty_and_db_backed(self):
        m = _make_middleware()
        backend = _bind_backend(m, _make_backend_stub(children=[]))
        # Override _load_file_data to return None (folder, not a file) and
        # parent listing to claim the folder exists.
        backend._load_file_data = AsyncMock(return_value=None)
        # side_effect list: the tool calls als_info twice, in this order.
        backend.als_info = AsyncMock(
            side_effect=[
                [],  # children of /documents/proj
                [
                    {"path": "/documents/proj", "is_dir": True},
                ],  # parent listing
            ]
        )
        runtime = _runtime(
            {
                "cwd": "/documents",
            },
            tool_call_id="tc-rd",
        )

        tool = m._create_rmdir_tool()
        result = await tool.coroutine("/documents/proj", runtime=runtime)

        assert hasattr(result, "update")
        update = result.update
        assert update["pending_dir_deletes"] == [
            {"path": "/documents/proj", "tool_call_id": "tc-rd"}
        ]

    @pytest.mark.asyncio
    async def test_rejects_non_empty(self):
        m = _make_middleware()
        _bind_backend(
            m,
            _make_backend_stub(
                children=[{"path": "/documents/proj/x.md", "is_dir": False}]
            ),
        )
        runtime = _runtime()
        tool = m._create_rmdir_tool()
        result = await tool.coroutine("/documents/proj", runtime=runtime)
        assert isinstance(result, str)
        assert "not empty" in result

    @pytest.mark.asyncio
    async def test_unstages_same_turn_mkdir(self):
        # A dir created by mkdir earlier in the SAME turn is only staged;
        # rmdir should un-stage it rather than queue a real delete.
        m = _make_middleware()
        _bind_backend(m, _make_backend_stub(children=[]))
        runtime = _runtime(
            {
                "cwd": "/documents",
                "staged_dirs": ["/documents/scratch"],
            },
            tool_call_id="tc-rd",
        )
        tool = m._create_rmdir_tool()
        result = await tool.coroutine("/documents/scratch", runtime=runtime)

        assert hasattr(result, "update")
        update = result.update
        assert "pending_dir_deletes" not in update
        # _CLEAR sentinel + remaining items (in this case, none).
        staged_after = update["staged_dirs"]
        assert staged_after[0] == "\x00__SURFSENSE_FILESYSTEM_CLEAR__\x00"
        assert "/documents/scratch" not in staged_after[1:]

    @pytest.mark.asyncio
    async def test_rejects_root(self):
        m = _make_middleware()
        runtime = _runtime()
        tool = m._create_rmdir_tool()
        for victim in ("/", "/documents"):
            result = await tool.coroutine(victim, runtime=runtime)
            assert isinstance(result, str)
            assert "refusing to rmdir" in result

    @pytest.mark.asyncio
    async def test_rejects_cwd(self):
        m = _make_middleware()
        runtime = _runtime({"cwd": "/documents/proj"})
        tool = m._create_rmdir_tool()
        result = await tool.coroutine("/documents/proj", runtime=runtime)
        assert isinstance(result, str)
        assert "cwd" in result.lower()

    @pytest.mark.asyncio
    async def test_rejects_ancestor_of_cwd(self):
        # Deleting an ancestor would orphan the cwd; must refuse.
        m = _make_middleware()
        runtime = _runtime({"cwd": "/documents/proj/sub"})
        tool = m._create_rmdir_tool()
        result = await tool.coroutine("/documents/proj", runtime=runtime)
        assert isinstance(result, str)
        assert "cwd" in result.lower()

    @pytest.mark.asyncio
    async def test_rejects_files(self):
        # file_data set => _load_file_data reports a file at the path.
        m = _make_middleware()
        _bind_backend(m, _make_backend_stub(children=[], file_data={"content": ["x"]}))
        runtime = _runtime()
        tool = m._create_rmdir_tool()
        result = await tool.coroutine("/documents/notes.md", runtime=runtime)
        assert isinstance(result, str)
        assert "is a file" in result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# KBPostgresBackend view filter
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestKBPostgresBackendDeleteFilter:
    """als_info / glob / grep should suppress paths queued for delete."""

    def _make_backend(self, state: dict[str, Any]) -> KBPostgresBackend:
        # Real backend; only ``runtime.state`` is needed for the view helpers.
        runtime = SimpleNamespace(state=state)
        backend = KBPostgresBackend(search_space_id=1, runtime=runtime)
        return backend

    def test_pending_filesystem_view_returns_deleted_paths(self):
        backend = self._make_backend(
            {
                "pending_deletes": [
                    {"path": "/documents/x.md", "tool_call_id": "t1"},
                ],
                "pending_dir_deletes": [
                    {"path": "/documents/d1", "tool_call_id": "t2"},
                ],
            }
        )
        # Returns (removed file paths, alias map, deleted dir paths).
        removed, alias, deleted_dirs = backend._pending_filesystem_view({})
        assert "/documents/x.md" in removed
        assert "/documents/d1" in deleted_dirs
        assert alias == {}

    def test_dir_suppressed_covers_descendants(self):
        backend = self._make_backend({})
        deleted_dirs = {"/documents/d"}
        # The dir itself and everything under it (any depth) is suppressed.
        assert backend._is_dir_suppressed("/documents/d", deleted_dirs)
        assert backend._is_dir_suppressed("/documents/d/x.md", deleted_dirs)
        assert backend._is_dir_suppressed("/documents/d/sub/y.md", deleted_dirs)
        assert not backend._is_dir_suppressed("/documents/other.md", deleted_dirs)
|
||||
|
|
@ -98,10 +98,54 @@ class TestInitialFilesystemState:
|
|||
state = _initial_filesystem_state()
|
||||
assert state["cwd"] == "/documents"
|
||||
assert state["staged_dirs"] == []
|
||||
assert state["staged_dir_tool_calls"] == {}
|
||||
assert state["pending_moves"] == []
|
||||
assert state["pending_deletes"] == []
|
||||
assert state["pending_dir_deletes"] == []
|
||||
assert state["doc_id_by_path"] == {}
|
||||
assert state["dirty_paths"] == []
|
||||
assert state["dirty_path_tool_calls"] == {}
|
||||
assert state["kb_priority"] == []
|
||||
assert state["kb_matched_chunk_ids"] == {}
|
||||
assert state["kb_anon_doc"] is None
|
||||
assert state["tree_version"] == 0
|
||||
|
||||
|
||||
class TestMultiEditSamePathCoalescing:
    """Multi-edit-same-path turns must coalesce into ONE binding record.

    The persistence body uses ``dirty_path_tool_calls[path]`` to find the
    tool_call_id that produced the current state on disk. Because
    ``dirty_paths`` dedupes via :func:`_add_unique_reducer` the second
    edit doesn't append a new path entry — and because
    ``_dict_merge_with_tombstones_reducer`` lets the right-hand side
    overwrite, the LATEST tool_call_id wins. That's the correct behavior
    for snapshotting: revert restores to the pre-mutation state, and
    multiple back-to-back edits in one turn coalesce into a single
    revisible op (the user sees ONE Revert button per turn-per-path,
    not N).
    """

    def test_dirty_paths_dedupes_repeated_writes(self):
        # Writing the same path twice through the unique reducer must
        # yield a single entry.
        after_first_write = _add_unique_reducer([], ["/documents/a.md"])
        after_second_write = _add_unique_reducer(after_first_write, ["/documents/a.md"])
        assert after_second_write == ["/documents/a.md"]

    def test_dirty_path_tool_calls_keeps_latest_tool_call_id(self):
        # Two consecutive merges for the same key: the right-hand side of
        # the later merge wins, so the newest tool_call_id is kept.
        bindings = _dict_merge_with_tombstones_reducer(
            {}, {"/documents/a.md": "tcid-1"}
        )
        bindings = _dict_merge_with_tombstones_reducer(
            bindings, {"/documents/a.md": "tcid-2"}
        )
        assert bindings == {"/documents/a.md": "tcid-2"}

    def test_rm_tombstones_dirty_path_tool_call(self):
        # A ``None`` value acts as a tombstone and removes the binding,
        # so a stale entry cannot outlive the delete.
        bindings = _dict_merge_with_tombstones_reducer(
            {"/documents/a.md": "tcid-1"}, {"/documents/a.md": None}
        )
        assert bindings == {}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue