add execute_code tool with sandbox integration

2026-05-11 16:52:38 +02:00 · 2026-04-13 16:23:47 +02:00 · 2026-04-13 16:23:47 +02:00 · a7e70020b1
commit a7e70020b1
parent 9e8ea1fd1c
1 changed files with 132 additions and 3 deletions
--- a/surfsense_backend/app/agents/new_chat/middleware/filesystem.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/filesystem.py
@ -7,10 +7,12 @@ This middleware customizes prompts and persists write/edit operations for
 from __future__ import annotations
 import asyncio
 import logging
 import re
 from datetime import UTC, datetime
 from typing import Annotated, Any
 from daytona.common.errors import DaytonaError
 from deepagents import FilesystemMiddleware
 from deepagents.backends.protocol import EditResult, WriteResult
 from deepagents.backends.utils import validate_path
@ -23,6 +25,12 @@ from langchain_core.tools import BaseTool, StructuredTool
 from langgraph.types import Command
 from sqlalchemy import delete, select
 from app.agents.new_chat.sandbox import (
    _evict_sandbox_cache,
    get_or_create_sandbox,
    is_sandbox_enabled,
    sync_files_to_sandbox,
 )
 from app.db import Chunk, Document, DocumentType, Folder, shielded_async_session
 from app.indexing_pipeline.document_chunker import chunk_text
 from app.utils.document_converters import (
@ -31,6 +39,8 @@ from app.utils.document_converters import (
    generate_unique_identifier_hash,
 )
 logger = logging.getLogger(__name__)
 # =============================================================================
 # System Prompt (injected into every model call by wrap_model_call)
 # =============================================================================
@ -40,7 +50,7 @@ SURFSENSE_FILESYSTEM_SYSTEM_PROMPT = """## Following Conventions
 - Read files before editing — understand existing content before making changes.
 - Mimic existing style, naming conventions, and patterns.
-## Filesystem Tools `ls`, `read_file`, `write_file`, `edit_file`, `glob`, `grep`, `save_document`
+## Filesystem Tools
 All file paths must start with a `/`.
 - ls: list files and directories at a given path.
@ -128,6 +138,21 @@ SURFSENSE_GREP_TOOL_DESCRIPTION = """Search for a literal text pattern across fi
 Use this to locate relevant document files/chunks before reading full files.
 """
 SURFSENSE_EXECUTE_CODE_TOOL_DESCRIPTION = """Executes a shell command in an isolated sandbox environment.
 The sandbox runs Python with common data-science packages pre-installed
 (pandas, numpy, matplotlib, scipy, scikit-learn).
 Knowledge base documents from your conversation are automatically available
 as XML files under /home/daytona/documents/.
 Usage notes:
 - Commands run in an isolated sandbox with no outbound network access.
 - Returns combined stdout/stderr output with exit code.
 - Use the optional timeout parameter to override the default timeout.
 - When issuing multiple commands, use ';' or '&&' to chain them.
 """
 SURFSENSE_SAVE_DOCUMENT_TOOL_DESCRIPTION = """Permanently saves a document to the user's knowledge base.
 This is an expensive operation — it creates a new Document record in the
@ -148,17 +173,29 @@ Args:
 class SurfSenseFilesystemMiddleware(FilesystemMiddleware):
    """SurfSense-specific filesystem middleware with DB persistence for docs."""
    _MAX_EXECUTE_TIMEOUT = 300
    def __init__(
        self,
        *,
        search_space_id: int | None = None,
        created_by_id: str | None = None,
        thread_id: int | str | None = None,
        tool_token_limit_before_evict: int | None = 20000,
    ) -> None:
        self._search_space_id = search_space_id
        self._created_by_id = created_by_id
        self._thread_id = thread_id
        self._sandbox_available = is_sandbox_enabled() and thread_id is not None
        system_prompt = SURFSENSE_FILESYSTEM_SYSTEM_PROMPT
        if self._sandbox_available:
            system_prompt += (
                "\n- execute_code: run shell commands in an isolated Python sandbox."
            )
        super().__init__(
-            system_prompt=SURFSENSE_FILESYSTEM_SYSTEM_PROMPT,
+            system_prompt=system_prompt,
            custom_tool_descriptions={
                "ls": SURFSENSE_LIST_FILES_TOOL_DESCRIPTION,
                "read_file": SURFSENSE_READ_FILE_TOOL_DESCRIPTION,
@ -168,10 +205,12 @@ class SurfSenseFilesystemMiddleware(FilesystemMiddleware):
                "grep": SURFSENSE_GREP_TOOL_DESCRIPTION,
            },
            tool_token_limit_before_evict=tool_token_limit_before_evict,
            max_execute_timeout=self._MAX_EXECUTE_TIMEOUT,
        )
        # Remove the execute tool (no sandbox backend)
        self.tools = [t for t in self.tools if t.name != "execute"]
        self.tools.append(self._create_save_document_tool())
        if self._sandbox_available:
            self.tools.append(self._create_execute_code_tool())
    @staticmethod
    def _run_async_blocking(coro: Any) -> Any:
@ -455,6 +494,96 @@ class SurfSenseFilesystemMiddleware(FilesystemMiddleware):
            coroutine=async_save_document,
        )
    def _create_execute_code_tool(self) -> BaseTool:
        """Create execute_code tool backed by a Daytona sandbox."""
        def sync_execute_code(
            command: Annotated[
                str, "Shell command to execute in the sandbox environment."
            ],
            runtime: ToolRuntime[None, FilesystemState],
            timeout: Annotated[
                int | None,
                "Optional timeout in seconds for this command.",
            ] = None,
        ) -> str:
            if timeout is not None:
                if timeout < 0:
                    return f"Error: timeout must be non-negative, got {timeout}."
                if timeout > self._MAX_EXECUTE_TIMEOUT:
                    return f"Error: timeout {timeout}s exceeds maximum ({self._MAX_EXECUTE_TIMEOUT}s)."
            return self._run_async_blocking(
                self._execute_in_sandbox(command, runtime, timeout)
            )
        async def async_execute_code(
            command: Annotated[
                str, "Shell command to execute in the sandbox environment."
            ],
            runtime: ToolRuntime[None, FilesystemState],
            timeout: Annotated[
                int | None,
                "Optional timeout in seconds for this command.",
            ] = None,
        ) -> str:
            if timeout is not None:
                if timeout < 0:
                    return f"Error: timeout must be non-negative, got {timeout}."
                if timeout > self._MAX_EXECUTE_TIMEOUT:
                    return f"Error: timeout {timeout}s exceeds maximum ({self._MAX_EXECUTE_TIMEOUT}s)."
            return await self._execute_in_sandbox(command, runtime, timeout)
        return StructuredTool.from_function(
            name="execute_code",
            description=SURFSENSE_EXECUTE_CODE_TOOL_DESCRIPTION,
            func=sync_execute_code,
            coroutine=async_execute_code,
        )
    async def _execute_in_sandbox(
        self,
        command: str,
        runtime: ToolRuntime[None, FilesystemState],
        timeout: int | None,
    ) -> str:
        """Core logic: get sandbox, sync files, run command, handle retries."""
        assert self._thread_id is not None
        try:
            return await self._try_sandbox_execute(command, runtime, timeout)
        except (DaytonaError, Exception) as first_err:
            logger.warning(
                "Sandbox execute failed for thread %s, retrying: %s",
                self._thread_id,
                first_err,
            )
            _evict_sandbox_cache(self._thread_id)
            try:
                return await self._try_sandbox_execute(command, runtime, timeout)
            except Exception:
                logger.exception(
                    "Sandbox retry also failed for thread %s", self._thread_id
                )
                return "Error: Code execution is temporarily unavailable. Please try again."
    async def _try_sandbox_execute(
        self,
        command: str,
        runtime: ToolRuntime[None, FilesystemState],
        timeout: int | None,
    ) -> str:
        sandbox, is_new = await get_or_create_sandbox(self._thread_id)
        files = runtime.state.get("files") or {}
        await sync_files_to_sandbox(self._thread_id, files, sandbox, is_new)
        result = await sandbox.aexecute(command, timeout=timeout)
        parts = [result.output]
        if result.exit_code is not None:
            status = "succeeded" if result.exit_code == 0 else "failed"
            parts.append(f"\n[Command {status} with exit code {result.exit_code}]")
        if result.truncated:
            parts.append("\n[Output was truncated due to size limits]")
        return "".join(parts)
    def _create_write_file_tool(self) -> BaseTool:
        """Create write_file — ephemeral for /documents/*, persisted otherwise."""
        tool_description = (