Merge remote-tracking branch 'upstream/dev' into feature/multi-agent

2026-05-07 23:02:39 +02:00 · 2026-05-01 00:05:20 +02:00 · 2026-05-01 00:05:20 +02:00 · 5d3b8b9ca9
commit 5d3b8b9ca9
parent d157ceaabc b2f487bf36
83 changed files with 10514 additions and 638 deletions
--- a/surfsense_backend/app/services/new_streaming_service.py
+++ b/surfsense_backend/app/services/new_streaming_service.py
@ -584,13 +584,33 @@ class VercelStreamingService:
    # Tool Parts
    # =========================================================================

-    def format_tool_input_start(self, tool_call_id: str, tool_name: str) -> str:
+    def format_tool_input_start(
+        self,
+        tool_call_id: str,
+        tool_name: str,
+        *,
+        langchain_tool_call_id: str | None = None,
+    ) -> str:
        """
        Format the start of tool input streaming.

        Args:
-            tool_call_id: The unique tool call identifier
-            tool_name: The name of the tool being called
+            tool_call_id: The unique tool call identifier. May be EITHER the
+                synthetic ``call_<run_id>`` id derived from LangGraph
+                ``run_id`` (legacy / ``SURFSENSE_ENABLE_STREAM_PARITY_V2``
+                OFF, or the unmatched-fallback path under parity_v2) OR
+                the authoritative LangChain ``tool_call.id`` (parity_v2
+                path: when the provider streams ``tool_call_chunks`` we
+                register the ``index`` and reuse the lc-id as the card
+                id so live ``tool-input-delta`` events can be routed
+                without a downstream join). Either way, the same id is
+                preserved across ``tool-input-start`` / ``-delta`` /
+                ``-available`` / ``tool-output-available`` for one call.
+            tool_name: The name of the tool being called.
+            langchain_tool_call_id: Optional authoritative LangChain
+                ``tool_call.id``. When set, surfaces as
+                ``langchainToolCallId`` so the frontend can join this card
+                to the action-log row written by ``ActionLogMiddleware``.

        Returns:
            str: SSE formatted tool input start part
@ -598,13 +618,14 @@ class VercelStreamingService:
        Example output:
            data: {"type":"tool-input-start","toolCallId":"call_abc123","toolName":"getWeather"}
        """
-        return self._format_sse(
-            {
-                "type": "tool-input-start",
-                "toolCallId": tool_call_id,
-                "toolName": tool_name,
-            }
-        )
+        payload: dict[str, Any] = {
+            "type": "tool-input-start",
+            "toolCallId": tool_call_id,
+            "toolName": tool_name,
+        }
+        if langchain_tool_call_id:
+            payload["langchainToolCallId"] = langchain_tool_call_id
+        return self._format_sse(payload)

    def format_tool_input_delta(self, tool_call_id: str, input_text_delta: str) -> str:
        """
@ -629,7 +650,12 @@ class VercelStreamingService:
        )

    def format_tool_input_available(
-        self, tool_call_id: str, tool_name: str, input_data: dict[str, Any]
+        self,
+        tool_call_id: str,
+        tool_name: str,
+        input_data: dict[str, Any],
+        *,
+        langchain_tool_call_id: str | None = None,
    ) -> str:
        """
        Format the completion of tool input.
@ -638,6 +664,8 @@ class VercelStreamingService:
            tool_call_id: The tool call identifier
            tool_name: The name of the tool
            input_data: The complete tool input parameters
+            langchain_tool_call_id: Optional authoritative LangChain
+                ``tool_call.id`` (see ``format_tool_input_start``).

        Returns:
            str: SSE formatted tool input available part
@ -645,22 +673,34 @@ class VercelStreamingService:
        Example output:
            data: {"type":"tool-input-available","toolCallId":"call_abc123","toolName":"getWeather","input":{"city":"SF"}}
        """
-        return self._format_sse(
-            {
-                "type": "tool-input-available",
-                "toolCallId": tool_call_id,
-                "toolName": tool_name,
-                "input": input_data,
-            }
-        )
+        payload: dict[str, Any] = {
+            "type": "tool-input-available",
+            "toolCallId": tool_call_id,
+            "toolName": tool_name,
+            "input": input_data,
+        }
+        if langchain_tool_call_id:
+            payload["langchainToolCallId"] = langchain_tool_call_id
+        return self._format_sse(payload)

-    def format_tool_output_available(self, tool_call_id: str, output: Any) -> str:
+    def format_tool_output_available(
+        self,
+        tool_call_id: str,
+        output: Any,
+        *,
+        langchain_tool_call_id: str | None = None,
+    ) -> str:
        """
        Format tool execution output.

        Args:
            tool_call_id: The tool call identifier
            output: The tool execution result
+            langchain_tool_call_id: Optional authoritative LangChain
+                ``tool_call.id`` extracted from ``ToolMessage.tool_call_id``.
+                When set, the frontend can backfill any card whose
+                ``langchainToolCallId`` was not yet known at
+                ``tool-input-start`` time.

        Returns:
            str: SSE formatted tool output available part
@ -668,13 +708,14 @@ class VercelStreamingService:
        Example output:
            data: {"type":"tool-output-available","toolCallId":"call_abc123","output":{"weather":"sunny"}}
        """
-        return self._format_sse(
-            {
-                "type": "tool-output-available",
-                "toolCallId": tool_call_id,
-                "output": output,
-            }
-        )
+        payload: dict[str, Any] = {
+            "type": "tool-output-available",
+            "toolCallId": tool_call_id,
+            "output": output,
+        }
+        if langchain_tool_call_id:
+            payload["langchainToolCallId"] = langchain_tool_call_id
+        return self._format_sse(payload)

    # =========================================================================
    # Step Parts
--- a/surfsense_backend/app/services/revert_service.py
+++ b/surfsense_backend/app/services/revert_service.py
@ -8,7 +8,9 @@ Operation outcomes mirror the plan:

 * **KB-owned actions** (NOTE / FILE / FOLDER mutations): restore from
  :class:`app.db.DocumentRevision` / :class:`app.db.FolderRevision` rows
-  written before the original mutation.
+  written before the original mutation. ``rm``/``rmdir`` re-INSERT a fresh
+  row from the snapshot; ``write_file`` create / ``mkdir`` DELETE the row
+  that was created; everything else is an in-place restore.
 * **Connector-owned actions with a declared ``reverse_descriptor``**: invoke
  the inverse tool through the agent's normal permission stack (NOT
  bypassed). Out of scope for this PR — returns ``REVERSE_NOT_IMPLEMENTED``.
@ -18,6 +20,11 @@ Operation outcomes mirror the plan:
 A successful revert appends a NEW row to ``agent_action_log`` with
 ``reverse_of=<original_action_id>`` and the requesting user's
 ``user_id``, preserving an auditable chain.
+
+Dispatch must be exact-match (``tool_name == name``), NOT prefix matching.
+``"rmdir".startswith("rm")`` would otherwise mis-route directory revert
+to the document branch (and ``delete_note`` vs ``delete_folder`` is the
+same trap waiting to happen).
 """

 from __future__ import annotations
@ -25,17 +32,31 @@ from __future__ import annotations
 import logging
 from dataclasses import dataclass
 from datetime import UTC, datetime
-from typing import Literal
+from typing import Any, Literal

-from sqlalchemy import select
+from sqlalchemy import delete, select
 from sqlalchemy.ext.asyncio import AsyncSession

+from app.agents.new_chat.path_resolver import (
+    DOCUMENTS_ROOT,
+    safe_filename,
+    safe_folder_segment,
+)
 from app.db import (
    AgentActionLog,
+    Chunk,
+    Document,
    DocumentRevision,
+    DocumentType,
+    Folder,
    FolderRevision,
    NewChatThread,
 )
+from app.utils.document_converters import (
+    embed_texts,
+    generate_content_hash,
+    generate_unique_identifier_hash,
+)

 logger = logging.getLogger(__name__)

@ -110,14 +131,244 @@ def can_revert(


 # ---------------------------------------------------------------------------
-# Revert paths
+# Helper: reconstruct virtual path from a snapshot
 # ---------------------------------------------------------------------------


+async def _virtual_path_from_snapshot(
+    session: AsyncSession,
+    revision: DocumentRevision,
+) -> str | None:
+    """Reconstruct the virtual_path the document was at before mutation.
+
+    Preference order:
+    1. ``metadata_before["virtual_path"]`` — used when it is a string that
+       starts with ``DOCUMENTS_ROOT``.
+    2. Compose ``"<folder_path>/<title_before>"`` from
+       ``folder_id_before`` + ``title_before``, walking the folder chain
+       via ``parent_id`` until a folder without a parent is reached.
+
+    Args:
+        session: Async session used to load each :class:`Folder` in the
+            ancestor chain.
+        revision: Snapshot row describing the pre-mutation document.
+
+    Returns:
+        The reconstructed path, or ``None`` when the snapshot has no usable
+        title, an ancestor folder no longer exists, or the ancestor chain
+        loops back on itself.
+    """
+    metadata = revision.metadata_before or {}
+    candidate = metadata.get("virtual_path") if isinstance(metadata, dict) else None
+    if isinstance(candidate, str) and candidate.startswith(DOCUMENTS_ROOT):
+        return candidate
+
+    title = revision.title_before
+    if not isinstance(title, str) or not title:
+        return None
+
+    # Collect folder segments leaf-first, then flip them into root-first order.
+    parts: list[str] = []
+    cursor: int | None = revision.folder_id_before
+    visited: set[int] = set()
+    while cursor is not None:
+        if cursor in visited:
+            # A cyclic parent chain has no well-defined path; report it as
+            # unresolvable instead of returning a truncated, misleading one
+            # that would then be hashed into ``unique_identifier_hash``.
+            return None
+        visited.add(cursor)
+        folder = await session.get(Folder, cursor)
+        if folder is None:
+            return None
+        parts.append(safe_folder_segment(str(folder.name or "")))
+        cursor = folder.parent_id
+    parts.reverse()
+
+    # With no folders this yields "<DOCUMENTS_ROOT>/<filename>".
+    return "/".join([DOCUMENTS_ROOT, *parts, safe_filename(title)])
+
+
+# ---------------------------------------------------------------------------
+# Document revision restore (write/edit/move/rm)
+# ---------------------------------------------------------------------------
+
+
+def _set_field(target: Any, field: str, value: Any) -> None:
+    """Copy ``value`` onto ``target.<field>``, skipping ``None`` values.
+
+    A ``None`` value leaves the attribute untouched, so this helper never
+    resets an attribute back to ``None``.
+    """
+    if value is None:
+        return
+    setattr(target, field, value)
+
+
+async def _restore_in_place_document(
+    session: AsyncSession,
+    *,
+    revision: DocumentRevision,
+) -> RevertOutcome:
+    """Apply an in-place restore to an existing :class:`Document`.
+
+    Loads the row referenced by ``revision.document_id``, copies the
+    ``*_before`` snapshot fields back onto it, recomputes the derived
+    hashes and, when a chunk snapshot exists, rebuilds the chunk rows.
+    Nothing is committed here; changes are only staged on ``session``.
+
+    Args:
+        session: Async session used to load and mutate the document.
+        revision: Snapshot row holding the pre-mutation state.
+
+    Returns:
+        ``RevertOutcome`` with status ``"ok"`` on success, or
+        ``"tool_unavailable"`` when the snapshot has no ``document_id`` or
+        the referenced row no longer exists.
+    """
+    if revision.document_id is None:
+        return RevertOutcome(
+            status="tool_unavailable",
+            message=(
+                "Original document was hard-deleted; in-place restore is not possible."
+            ),
+        )
+    doc = await session.get(Document, revision.document_id)
+    if doc is None:
+        return RevertOutcome(
+            status="tool_unavailable",
+            message="Original document has been deleted; revert cannot proceed.",
+        )
+
+    # ``_set_field`` skips ``None`` values, so a snapshot field that is
+    # ``None`` leaves the live value as-is.
+    # NOTE(review): this means a document whose ``folder_id_before`` was
+    # ``None`` is not moved back out of its current folder — confirm that
+    # is acceptable for ``move_file`` reverts.
+    _set_field(doc, "content", revision.content_before)
+    _set_field(doc, "source_markdown", revision.content_before)
+    _set_field(doc, "title", revision.title_before)
+    _set_field(doc, "folder_id", revision.folder_id_before)
+    # Metadata is replaced wholesale, but only when the snapshot holds a
+    # non-empty dict.
+    metadata_before = revision.metadata_before or {}
+    if isinstance(metadata_before, dict) and metadata_before:
+        doc.document_metadata = dict(metadata_before)
+
+    # Keep the content hash in step with the restored content.
+    if isinstance(revision.content_before, str):
+        doc.content_hash = generate_content_hash(
+            revision.content_before, doc.search_space_id
+        )
+
+    # NOTE(review): the identifier hash is always derived with
+    # ``DocumentType.NOTE`` and no collision check is made here (unlike
+    # ``_reinsert_document_from_revision``) — confirm both are intended.
+    virtual_path = await _virtual_path_from_snapshot(session, revision)
+    if virtual_path:
+        doc.unique_identifier_hash = generate_unique_identifier_hash(
+            DocumentType.NOTE,
+            virtual_path,
+            doc.search_space_id,
+        )
+
+    # A list-valued chunk snapshot replaces the current chunks entirely;
+    # entries that are not dicts with a string ``content`` are dropped.
+    chunks_before = revision.chunks_before
+    if isinstance(chunks_before, list):
+        await session.execute(delete(Chunk).where(Chunk.document_id == doc.id))
+        chunk_texts = [
+            str(c.get("content"))
+            for c in chunks_before
+            if isinstance(c, dict) and isinstance(c.get("content"), str)
+        ]
+        if chunk_texts:
+            chunk_embeddings = embed_texts(chunk_texts)
+            session.add_all(
+                [
+                    Chunk(document_id=doc.id, content=text, embedding=embedding)
+                    for text, embedding in zip(
+                        chunk_texts, chunk_embeddings, strict=True
+                    )
+                ]
+            )
+            # NOTE(review): the document-level embedding is refreshed only
+            # when at least one chunk text was restored; with an empty or
+            # missing chunk snapshot it keeps its current value — confirm
+            # this nesting is intentional.
+            if isinstance(revision.content_before, str):
+                doc.embedding = embed_texts([revision.content_before])[0]
+
+    doc.updated_at = datetime.now(UTC)
+    return RevertOutcome(status="ok", message="Document restored from snapshot.")
+
+
+async def _reinsert_document_from_revision(
+    session: AsyncSession,
+    *,
+    revision: DocumentRevision,
+) -> RevertOutcome:
+    """Re-INSERT a deleted :class:`Document` from a snapshot row (``rm`` revert).
+
+    Builds a brand-new ``Document`` (always ``DocumentType.NOTE``) from the
+    snapshot's title, content, metadata and folder, re-creates its chunks
+    from ``chunks_before``, and repoints ``revision.document_id`` at the
+    new row. Nothing is committed here.
+
+    Args:
+        session: Async session used for the collision lookup and inserts.
+        revision: Snapshot row captured before the document was removed.
+
+    Returns:
+        ``RevertOutcome`` with status ``"ok"`` on success;
+        ``"not_reversible"`` when the snapshot lacks a title, string
+        content, or a resolvable virtual path; ``"tool_unavailable"`` when
+        a live document already occupies the same identifier hash.
+    """
+    if not isinstance(revision.title_before, str) or not revision.title_before:
+        return RevertOutcome(
+            status="not_reversible",
+            message="Snapshot lacks title_before; cannot recreate document.",
+        )
+    if not isinstance(revision.content_before, str):
+        return RevertOutcome(
+            status="not_reversible",
+            message="Snapshot lacks content_before; cannot recreate document.",
+        )
+
+    virtual_path = await _virtual_path_from_snapshot(session, revision)
+    if not virtual_path:
+        return RevertOutcome(
+            status="not_reversible",
+            message=(
+                "Snapshot is missing both metadata_before['virtual_path'] AND "
+                "a resolvable (folder_id_before, title_before) pair."
+            ),
+        )
+
+    # Refuse to recreate the document if another row in the same search
+    # space already carries the identifier hash for this path.
+    search_space_id = revision.search_space_id
+    unique_identifier_hash = generate_unique_identifier_hash(
+        DocumentType.NOTE,
+        virtual_path,
+        search_space_id,
+    )
+    collision = await session.execute(
+        select(Document.id).where(
+            Document.search_space_id == search_space_id,
+            Document.unique_identifier_hash == unique_identifier_hash,
+        )
+    )
+    if collision.scalar_one_or_none() is not None:
+        return RevertOutcome(
+            status="tool_unavailable",
+            message=(
+                f"A document already exists at '{virtual_path}'; revert would "
+                "collide. Move the live doc out of the way first."
+            ),
+        )
+
+    # Work on a copy so the snapshot's own metadata is not mutated, and make
+    # sure the recreated row records the path it was restored to.
+    metadata = revision.metadata_before or {}
+    if not isinstance(metadata, dict):
+        metadata = {}
+    metadata = dict(metadata)
+    metadata["virtual_path"] = virtual_path
+
+    content = revision.content_before
+    new_doc = Document(
+        title=revision.title_before,
+        document_type=DocumentType.NOTE,
+        document_metadata=metadata,
+        content=content,
+        content_hash=generate_content_hash(content, search_space_id),
+        unique_identifier_hash=unique_identifier_hash,
+        source_markdown=content,
+        search_space_id=search_space_id,
+        folder_id=revision.folder_id_before,
+        updated_at=datetime.now(UTC),
+    )
+    session.add(new_doc)
+    # Flush before ``new_doc.id`` is used for the chunk rows below.
+    await session.flush()
+
+    # Unlike the in-place path, the document embedding is always computed
+    # here, whether or not any chunks are restored.
+    new_doc.embedding = embed_texts([content])[0]
+    chunk_texts = []
+    chunks_before = revision.chunks_before
+    if isinstance(chunks_before, list):
+        # Entries that are not dicts with a string ``content`` are dropped.
+        chunk_texts = [
+            str(c.get("content"))
+            for c in chunks_before
+            if isinstance(c, dict) and isinstance(c.get("content"), str)
+        ]
+    if chunk_texts:
+        chunk_embeddings = embed_texts(chunk_texts)
+        session.add_all(
+            [
+                Chunk(document_id=new_doc.id, content=text, embedding=embedding)
+                for text, embedding in zip(chunk_texts, chunk_embeddings, strict=True)
+            ]
+        )
+
+    # Repoint the snapshot at the recreated row so a follow-up revert of
+    # the same row works as expected.
+    revision.document_id = new_doc.id
+    return RevertOutcome(
+        status="ok",
+        message=f"Re-inserted document '{revision.title_before}' from snapshot.",
+    )
+
+
+async def _delete_created_document(
+    session: AsyncSession,
+    *,
+    revision: DocumentRevision,
+) -> RevertOutcome:
+    """Delete the document that ``write_file`` created (``content_before IS NULL``).
+
+    Args:
+        session: Async session on which the DELETE is executed (not
+            committed here).
+        revision: Snapshot row whose ``document_id`` names the created row.
+
+    Returns:
+        ``RevertOutcome`` with status ``"ok"`` in both branches: when the
+        snapshot has no ``document_id`` (nothing to delete) and after the
+        DELETE statement has been issued.
+    """
+    if revision.document_id is None:
+        return RevertOutcome(
+            status="ok",
+            message="No live row to delete (already removed elsewhere).",
+        )
+    # The DELETE is issued without first loading the row, so the "ok"
+    # outcome below does not distinguish "deleted" from "row already gone".
+    await session.execute(delete(Document).where(Document.id == revision.document_id))
+    return RevertOutcome(
+        status="ok",
+        message="Deleted the document that was created by this action.",
+    )
+
+
 async def _restore_document_revision(
    session: AsyncSession, *, action: AgentActionLog
 ) -> RevertOutcome:
-    """Restore the most recent :class:`DocumentRevision` for ``action``."""
+    """Dispatch document-level revert based on ``action.tool_name``."""
    stmt = (
        select(DocumentRevision)
        .where(DocumentRevision.agent_action_id == action.id)
@ -132,23 +383,111 @@ async def _restore_document_revision(
            message="No document_revisions row tied to this action.",
        )

-    from app.db import Document  # late import to avoid cycles at module load
+    tool_name = (action.tool_name or "").lower()

-    doc = await session.get(Document, revision.document_id)
-    if doc is None:
+    if tool_name == "rm":
+        return await _reinsert_document_from_revision(session, revision=revision)
+
+    if tool_name == "write_file" and revision.content_before is None:
+        return await _delete_created_document(session, revision=revision)
+
+    return await _restore_in_place_document(session, revision=revision)
+
+
+# ---------------------------------------------------------------------------
+# Folder revision restore (mkdir/rmdir/rename/move)
+# ---------------------------------------------------------------------------
+
+
+async def _restore_in_place_folder(
+    session: AsyncSession,
+    *,
+    revision: FolderRevision,
+) -> RevertOutcome:
+    """Apply an in-place restore to an existing :class:`Folder`.
+
+    Copies ``name_before`` / ``parent_id_before`` / ``position_before``
+    back onto the live row via ``_set_field``, which skips ``None`` values,
+    and bumps ``updated_at``. Nothing is committed here.
+
+    Returns:
+        ``RevertOutcome`` with status ``"ok"`` on success, or
+        ``"tool_unavailable"`` when the snapshot has no ``folder_id`` or
+        the referenced folder no longer exists.
+    """
+    if revision.folder_id is None:
        return RevertOutcome(
            status="tool_unavailable",
-            message="Original document has been deleted; revert cannot proceed.",
+            message="Original folder was hard-deleted; in-place restore is impossible.",
+        )
+    folder = await session.get(Folder, revision.folder_id)
+    if folder is None:
+        return RevertOutcome(
+            status="tool_unavailable",
+            message="Original folder has been deleted; revert cannot proceed.",
+        )
+    # NOTE(review): because ``None`` is skipped, a folder whose
+    # ``parent_id_before`` was ``None`` is not moved back to the top level
+    # — confirm that is acceptable.
+    _set_field(folder, "name", revision.name_before)
+    _set_field(folder, "parent_id", revision.parent_id_before)
+    _set_field(folder, "position", revision.position_before)
+    folder.updated_at = datetime.now(UTC)
+    return RevertOutcome(status="ok", message="Folder restored from snapshot.")
+
+
+async def _reinsert_folder_from_revision(
+    session: AsyncSession,
+    *,
+    revision: FolderRevision,
+) -> RevertOutcome:
+    """Re-INSERT a deleted :class:`Folder` from a snapshot row (``rmdir`` revert).
+
+    Creates a new ``Folder`` from the snapshot's name, parent, position and
+    search space, then repoints ``revision.folder_id`` at the new row.
+    Nothing is committed here.
+
+    Returns:
+        ``RevertOutcome`` with status ``"ok"`` on success, or
+        ``"not_reversible"`` when ``name_before`` is missing or empty.
+    """
+    if not isinstance(revision.name_before, str) or not revision.name_before:
+        return RevertOutcome(
+            status="not_reversible",
+            message="Snapshot lacks name_before; cannot recreate folder.",
+        )
+    # NOTE(review): no check is made that ``parent_id_before`` still exists
+    # or that a same-named sibling has not been created since — confirm the
+    # database constraints make such a check unnecessary.
+    new_folder = Folder(
+        name=revision.name_before,
+        parent_id=revision.parent_id_before,
+        position=revision.position_before,
+        search_space_id=revision.search_space_id,
+        updated_at=datetime.now(UTC),
+    )
+    session.add(new_folder)
+    # Flush before ``new_folder.id`` is read for the snapshot repoint below.
+    await session.flush()
+    revision.folder_id = new_folder.id
+    return RevertOutcome(
+        status="ok",
+        message=f"Re-inserted folder '{revision.name_before}' from snapshot.",
+    )
+
+
+async def _delete_created_folder(
+    session: AsyncSession,
+    *,
+    revision: FolderRevision,
+) -> RevertOutcome:
+    """Delete the folder that ``mkdir`` created, provided it is still empty.
+
+    The folder is only removed when no :class:`Document` has it as
+    ``folder_id`` and no :class:`Folder` has it as ``parent_id``. Nothing
+    is committed here.
+
+    Returns:
+        ``RevertOutcome`` with status ``"ok"`` when there is nothing to
+        delete or the DELETE was issued, or ``"tool_unavailable"`` when the
+        folder now contains documents or sub-folders.
+    """
+    if revision.folder_id is None:
+        return RevertOutcome(
+            status="ok",
+            message="No live folder row to delete (already removed elsewhere).",
+        )
+    folder_id = revision.folder_id
+
+    # Existence probes: ``limit(1)`` because only "any row at all" matters.
+    has_doc = await session.execute(
+        select(Document.id).where(Document.folder_id == folder_id).limit(1)
+    )
+    if has_doc.scalar_one_or_none() is not None:
+        return RevertOutcome(
+            status="tool_unavailable",
+            message=(
+                "Folder is no longer empty (documents have been added since "
+                "mkdir); cannot revert."
+            ),
+        )
+    has_child = await session.execute(
+        select(Folder.id).where(Folder.parent_id == folder_id).limit(1)
+    )
+    if has_child.scalar_one_or_none() is not None:
+        return RevertOutcome(
+            status="tool_unavailable",
+            message=(
+                "Folder is no longer empty (sub-folders have been added "
+                "since mkdir); cannot revert."
+            ),
        )

-    if revision.content_before is not None:
-        doc.content = revision.content_before
-    if revision.title_before is not None:
-        doc.title = revision.title_before
-    if revision.folder_id_before is not None:
-        doc.folder_id = revision.folder_id_before
-    doc.updated_at = datetime.now(UTC)
-    return RevertOutcome(status="ok", message="Document restored from snapshot.")
+    await session.execute(delete(Folder).where(Folder.id == folder_id))
+    return RevertOutcome(
+        status="ok",
+        message="Deleted the folder that was created by this action.",
+    )


 async def _restore_folder_revision(
@ -168,41 +507,44 @@ async def _restore_folder_revision(
            message="No folder_revisions row tied to this action.",
        )

-    from app.db import Folder
+    tool_name = (action.tool_name or "").lower()

-    folder = await session.get(Folder, revision.folder_id)
-    if folder is None:
-        return RevertOutcome(
-            status="tool_unavailable",
-            message="Original folder has been deleted; revert cannot proceed.",
-        )
+    if tool_name == "rmdir":
+        return await _reinsert_folder_from_revision(session, revision=revision)

-    if revision.name_before is not None:
-        folder.name = revision.name_before
-    if revision.parent_id_before is not None:
-        folder.parent_id = revision.parent_id_before
-    if revision.position_before is not None:
-        folder.position = revision.position_before
-    folder.updated_at = datetime.now(UTC)
-    return RevertOutcome(status="ok", message="Folder restored from snapshot.")
+    if tool_name == "mkdir":
+        return await _delete_created_folder(session, revision=revision)
+
+    return await _restore_in_place_folder(session, revision=revision)


-# Tool-name prefixes that route to KB document / folder revert paths. Kept
-# as data so a future PR adding new KB-owned tools doesn't have to touch
-# this module's control flow.
-_DOC_TOOL_PREFIXES: tuple[str, ...] = (
-    "edit_file",
-    "write_file",
-    "update_memory",
-    "create_note",
-    "update_note",
-    "delete_note",
+# ---------------------------------------------------------------------------
+# Dispatch
+# ---------------------------------------------------------------------------
+#
+# Exact-name dispatch: ``tool_name == name``, NOT ``startswith(...)``.
+# Prefix-matching mis-routes pairs like ``rm``/``rmdir`` and
+# ``delete_note``/``delete_folder``.
+
+# Tool names routed to ``_restore_document_revision``.
+# NOTE(review): that dispatcher only special-cases ``rm`` (re-insert) and a
+# creating ``write_file`` (delete); ``create_note`` and ``delete_note``
+# therefore take the in-place restore path — confirm that is intended.
+_DOC_TOOLS: frozenset[str] = frozenset(
+    {
+        "edit_file",
+        "write_file",
+        "move_file",
+        "rm",
+        "update_memory",
+        "create_note",
+        "update_note",
+        "delete_note",
+    }
 )
-_FOLDER_TOOL_PREFIXES: tuple[str, ...] = (
-    "mkdir",
-    "move_file",
-    "rename_folder",
-    "delete_folder",
+# Tool names routed to ``_restore_folder_revision``.
+# NOTE(review): that dispatcher only special-cases ``rmdir`` (re-insert) and
+# ``mkdir`` (delete); ``delete_folder`` therefore takes the in-place restore
+# path — confirm that is intended.
+_FOLDER_TOOLS: frozenset[str] = frozenset(
+    {
+        "mkdir",
+        "rmdir",
+        "rename_folder",
+        "delete_folder",
+    }
 )


@ -220,9 +562,9 @@ async def revert_action(
    """
    tool_name = (action.tool_name or "").lower()

-    if tool_name.startswith(_DOC_TOOL_PREFIXES):
+    if tool_name in _DOC_TOOLS:
        outcome = await _restore_document_revision(session, action=action)
-    elif tool_name.startswith(_FOLDER_TOOL_PREFIXES):
+    elif tool_name in _FOLDER_TOOLS:
        outcome = await _restore_folder_revision(session, action=action)
    elif action.reverse_descriptor:
        # Connector-owned reversibles run through the normal permission