add execute_code tool with sandbox integration

This commit is contained in:
CREDO23 2026-04-13 16:23:47 +02:00
parent 9e8ea1fd1c
commit a7e70020b1

View file

@@ -7,10 +7,12 @@ This middleware customizes prompts and persists write/edit operations for
from __future__ import annotations
import asyncio
import logging
import re
from datetime import UTC, datetime
from typing import Annotated, Any
from daytona.common.errors import DaytonaError
from deepagents import FilesystemMiddleware
from deepagents.backends.protocol import EditResult, WriteResult
from deepagents.backends.utils import validate_path
@@ -23,6 +25,12 @@ from langchain_core.tools import BaseTool, StructuredTool
from langgraph.types import Command
from sqlalchemy import delete, select
from app.agents.new_chat.sandbox import (
_evict_sandbox_cache,
get_or_create_sandbox,
is_sandbox_enabled,
sync_files_to_sandbox,
)
from app.db import Chunk, Document, DocumentType, Folder, shielded_async_session
from app.indexing_pipeline.document_chunker import chunk_text
from app.utils.document_converters import (
@@ -31,6 +39,8 @@ from app.utils.document_converters import (
generate_unique_identifier_hash,
)
logger = logging.getLogger(__name__)
# =============================================================================
# System Prompt (injected into every model call by wrap_model_call)
# =============================================================================
@@ -40,7 +50,7 @@ SURFSENSE_FILESYSTEM_SYSTEM_PROMPT = """## Following Conventions
- Read files before editing understand existing content before making changes.
- Mimic existing style, naming conventions, and patterns.
## Filesystem Tools `ls`, `read_file`, `write_file`, `edit_file`, `glob`, `grep`, `save_document`
## Filesystem Tools
All file paths must start with a `/`.
- ls: list files and directories at a given path.
@@ -128,6 +138,21 @@ SURFSENSE_GREP_TOOL_DESCRIPTION = """Search for a literal text pattern across fi
Use this to locate relevant document files/chunks before reading full files.
"""
SURFSENSE_EXECUTE_CODE_TOOL_DESCRIPTION = """Executes a shell command in an isolated sandbox environment.
The sandbox runs Python with common data-science packages pre-installed
(pandas, numpy, matplotlib, scipy, scikit-learn).
Knowledge base documents from your conversation are automatically available
as XML files under /home/daytona/documents/.
Usage notes:
- Commands run in an isolated sandbox with no outbound network access.
- Returns combined stdout/stderr output with exit code.
- Use the optional timeout parameter to override the default timeout.
- When issuing multiple commands, use ';' or '&&' to chain them.
"""
SURFSENSE_SAVE_DOCUMENT_TOOL_DESCRIPTION = """Permanently saves a document to the user's knowledge base.
This is an expensive operation it creates a new Document record in the
@@ -148,17 +173,29 @@ Args:
class SurfSenseFilesystemMiddleware(FilesystemMiddleware):
"""SurfSense-specific filesystem middleware with DB persistence for docs."""
_MAX_EXECUTE_TIMEOUT = 300
def __init__(
self,
*,
search_space_id: int | None = None,
created_by_id: str | None = None,
thread_id: int | str | None = None,
tool_token_limit_before_evict: int | None = 20000,
) -> None:
self._search_space_id = search_space_id
self._created_by_id = created_by_id
self._thread_id = thread_id
self._sandbox_available = is_sandbox_enabled() and thread_id is not None
system_prompt = SURFSENSE_FILESYSTEM_SYSTEM_PROMPT
if self._sandbox_available:
system_prompt += (
"\n- execute_code: run shell commands in an isolated Python sandbox."
)
super().__init__(
system_prompt=SURFSENSE_FILESYSTEM_SYSTEM_PROMPT,
system_prompt=system_prompt,
custom_tool_descriptions={
"ls": SURFSENSE_LIST_FILES_TOOL_DESCRIPTION,
"read_file": SURFSENSE_READ_FILE_TOOL_DESCRIPTION,
@@ -168,10 +205,12 @@ class SurfSenseFilesystemMiddleware(FilesystemMiddleware):
"grep": SURFSENSE_GREP_TOOL_DESCRIPTION,
},
tool_token_limit_before_evict=tool_token_limit_before_evict,
max_execute_timeout=self._MAX_EXECUTE_TIMEOUT,
)
# Remove the execute tool (no sandbox backend)
self.tools = [t for t in self.tools if t.name != "execute"]
self.tools.append(self._create_save_document_tool())
if self._sandbox_available:
self.tools.append(self._create_execute_code_tool())
@staticmethod
def _run_async_blocking(coro: Any) -> Any:
@@ -455,6 +494,96 @@ class SurfSenseFilesystemMiddleware(FilesystemMiddleware):
coroutine=async_save_document,
)
def _create_execute_code_tool(self) -> BaseTool:
"""Create execute_code tool backed by a Daytona sandbox."""
def sync_execute_code(
command: Annotated[
str, "Shell command to execute in the sandbox environment."
],
runtime: ToolRuntime[None, FilesystemState],
timeout: Annotated[
int | None,
"Optional timeout in seconds for this command.",
] = None,
) -> str:
if timeout is not None:
if timeout < 0:
return f"Error: timeout must be non-negative, got {timeout}."
if timeout > self._MAX_EXECUTE_TIMEOUT:
return f"Error: timeout {timeout}s exceeds maximum ({self._MAX_EXECUTE_TIMEOUT}s)."
return self._run_async_blocking(
self._execute_in_sandbox(command, runtime, timeout)
)
async def async_execute_code(
command: Annotated[
str, "Shell command to execute in the sandbox environment."
],
runtime: ToolRuntime[None, FilesystemState],
timeout: Annotated[
int | None,
"Optional timeout in seconds for this command.",
] = None,
) -> str:
if timeout is not None:
if timeout < 0:
return f"Error: timeout must be non-negative, got {timeout}."
if timeout > self._MAX_EXECUTE_TIMEOUT:
return f"Error: timeout {timeout}s exceeds maximum ({self._MAX_EXECUTE_TIMEOUT}s)."
return await self._execute_in_sandbox(command, runtime, timeout)
return StructuredTool.from_function(
name="execute_code",
description=SURFSENSE_EXECUTE_CODE_TOOL_DESCRIPTION,
func=sync_execute_code,
coroutine=async_execute_code,
)
async def _execute_in_sandbox(
self,
command: str,
runtime: ToolRuntime[None, FilesystemState],
timeout: int | None,
) -> str:
"""Core logic: get sandbox, sync files, run command, handle retries."""
assert self._thread_id is not None
try:
return await self._try_sandbox_execute(command, runtime, timeout)
except (DaytonaError, Exception) as first_err:
logger.warning(
"Sandbox execute failed for thread %s, retrying: %s",
self._thread_id,
first_err,
)
_evict_sandbox_cache(self._thread_id)
try:
return await self._try_sandbox_execute(command, runtime, timeout)
except Exception:
logger.exception(
"Sandbox retry also failed for thread %s", self._thread_id
)
return "Error: Code execution is temporarily unavailable. Please try again."
async def _try_sandbox_execute(
self,
command: str,
runtime: ToolRuntime[None, FilesystemState],
timeout: int | None,
) -> str:
sandbox, is_new = await get_or_create_sandbox(self._thread_id)
files = runtime.state.get("files") or {}
await sync_files_to_sandbox(self._thread_id, files, sandbox, is_new)
result = await sandbox.aexecute(command, timeout=timeout)
parts = [result.output]
if result.exit_code is not None:
status = "succeeded" if result.exit_code == 0 else "failed"
parts.append(f"\n[Command {status} with exit code {result.exit_code}]")
if result.truncated:
parts.append("\n[Output was truncated due to size limits]")
return "".join(parts)
def _create_write_file_tool(self) -> BaseTool:
"""Create write_file — ephemeral for /documents/*, persisted otherwise."""
tool_description = (