feat: enhance task management and timeout configurations in multi-agent chat

- Added new environment variables for controlling task execution limits, including `SURFSENSE_SUBAGENT_INVOKE_TIMEOUT_SECONDS`, `SURFSENSE_TASK_BATCH_CONCURRENCY`, and `SURFSENSE_TASK_BATCH_MAX_SIZE`.
- Updated documentation to reflect new batch processing capabilities for `task` calls, allowing for concurrent execution of multiple subagent tasks.
- Improved error handling and receipt generation for deliverables, ensuring consistent feedback on task status.
- Refactored middleware to incorporate search space ID for better task management.
2026-05-29 19:35:20 +02:00 · 2026-05-27 14:58:10 -07:00 · 2026-05-27 14:58:10 -07:00 · 9d6e9b7e2d
commit 9d6e9b7e2d
parent 820f541f08
66 changed files with 2561 additions and 380 deletions
--- a/surfsense_backend/app/agents/new_chat/filesystem_state.py
+++ b/surfsense_backend/app/agents/new_chat/filesystem_state.py
@ -33,9 +33,11 @@ from typing_extensions import TypedDict
 from app.agents.new_chat.state_reducers import (
    _add_unique_reducer,
    _dict_merge_with_tombstones_reducer,
+    _int_counter_merge_reducer,
    _list_append_reducer,
    _replace_reducer,
 )
+from app.agents.shared.receipt import Receipt


 class PendingMove(TypedDict, total=False):
@ -172,6 +174,35 @@ class SurfSenseFilesystemState(FilesystemState):
    workspace_tree_text: NotRequired[Annotated[str, _replace_reducer]]
    """Pre-rendered ``<workspace_tree>`` body; shared with subagents to skip re-render."""

+    billable_calls: NotRequired[Annotated[dict[str, int], _int_counter_merge_reducer]]
+    """Per-subagent ``task(...)`` invocation counter, summed across the turn.
+
+    Incremented by ``task_tool.py`` each time a subagent invocation
+    completes (single- or batch-mode). The orchestrator can read this map
+    to self-limit when a runaway loop sends the same specialist 20 calls
+    in a row; the runtime emits a soft warning ToolMessage once the
+    cumulative count crosses :data:`DEFAULT_SUBAGENT_BILLABLE_THRESHOLD`.
+    Cleared by checkpoint rollover (i.e. per turn).
+    """
+
+    receipts: NotRequired[Annotated[list[Receipt], _list_append_reducer]]
+    """Structured Receipt handles emitted by mutating subagent tools this turn.
+
+    Each mutating tool (deliverables, every connector, KB writes via the
+    persistence middleware) wraps its native return into a
+    :class:`~app.agents.shared.receipt.Receipt`
+    and returns it under the ``"receipt"`` key alongside its existing
+    payload. The subagent's tool-call middleware folds the receipt into
+    this list, and ``_return_command_with_state_update`` in
+    ``checkpointed_subagent_middleware/task_tool.py`` carries the list up
+    to the parent automatically (``"receipts"`` is not in
+    ``EXCLUDED_STATE_KEYS``).
+
+    Append-only across the turn; cleared by checkpoint rollover. The
+    orchestrator reads it via the ``<verification>`` teaching to confirm
+    side-effecting subagent claims (see ``shared/snippets/verifiable_handle.md``).
+    """
+

 __all__ = [
    "KbAnonDoc",
--- a/surfsense_backend/app/agents/new_chat/middleware/compaction.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/compaction.py
@ -34,8 +34,7 @@ from deepagents.middleware.summarization import (
 )
 from langchain_core.messages import SystemMessage

-from app.observability import metrics as ot_metrics
-from app.observability import otel as ot
+from app.observability import metrics as ot_metrics, otel as ot

 if TYPE_CHECKING:
    from deepagents.backends.protocol import BACKEND_TYPES
--- a/surfsense_backend/app/agents/new_chat/middleware/doom_loop.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/doom_loop.py
@ -47,8 +47,7 @@ from langgraph.config import get_config
 from langgraph.runtime import Runtime
 from langgraph.types import interrupt

-from app.observability import metrics as ot_metrics
-from app.observability import otel as ot
+from app.observability import metrics as ot_metrics, otel as ot

 logger = logging.getLogger(__name__)

--- a/surfsense_backend/app/agents/new_chat/middleware/kb_persistence.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/kb_persistence.py
@ -55,6 +55,7 @@ from app.agents.new_chat.path_resolver import (
    virtual_path_to_doc,
 )
 from app.agents.new_chat.state_reducers import _CLEAR
+from app.agents.shared.receipt import Receipt, make_receipt
 from app.db import (
    AgentActionLog,
    Chunk,
@ -1392,6 +1393,81 @@ async def commit_staged_filesystem_state(
        "pending_dir_deletes": [_CLEAR],
        "dirty_path_tool_calls": {_CLEAR: True},
    }
+
+    # Emit one Receipt per committed mutation, folded into ``state['receipts']``
+    # via ``_list_append_reducer``. The receipts surface what actually committed
+    # (post-savepoint) rather than what the LLM intended; the orchestrator uses
+    # them as ground truth in the ``<verification>`` teaching. KB writes do not
+    # have public verifiable URLs, so ``verifiable_url`` stays unset.
+    receipts: list[Receipt] = []
+
+    def _kb_receipt(
+        *,
+        type: str,
+        operation: str,
+        path: str,
+        external_id: int | None = None,
+    ) -> None:
+        """Record a success receipt for one KB mutation; skip empty paths."""
+        if not path:
+            return
+        handle = path if external_id is None else str(external_id)
+        receipt = make_receipt(
+            route="knowledge_base",
+            type=type,
+            operation=operation,
+            status="success",
+            external_id=handle,
+            preview=path.rsplit("/", 1)[-1] or path,
+        )
+        receipts.append(receipt)
+
+    for payload in committed_creates:
+        path = str(payload.get("virtualPath") or "")
+        _kb_receipt(
+            type="file",
+            operation="write_file",
+            path=path,
+            external_id=payload.get("id"),
+        )
+    for payload in committed_updates:
+        path = str(payload.get("virtualPath") or "")
+        _kb_receipt(
+            type="file",
+            operation="edit_file",
+            path=path,
+            external_id=payload.get("id"),
+        )
+    for payload in applied_moves:
+        # ``applied_moves`` rows carry the destination ``virtualPath`` because
+        # the move has already landed in the DB by the time we reach this code.
+        path = str(payload.get("virtualPath") or "")
+        _kb_receipt(
+            type="file",
+            operation="move_file",
+            path=path,
+            external_id=payload.get("id"),
+        )
+    for path in staged_dirs:
+        _kb_receipt(type="folder", operation="mkdir", path=path)
+    for payload in committed_deletes:
+        path = str(payload.get("virtualPath") or "")
+        _kb_receipt(
+            type="file",
+            operation="rm",
+            path=path,
+            external_id=payload.get("id"),
+        )
+    for payload in committed_folder_deletes:
+        path = str(payload.get("virtualPath") or "")
+        _kb_receipt(
+            type="folder",
+            operation="rmdir",
+            path=path,
+            external_id=payload.get("id"),
+        )
+    if receipts:
+        delta["receipts"] = receipts
    files_delta: dict[str, Any] = {}
    if temp_paths:
        files_delta.update(dict.fromkeys(temp_paths))
--- a/surfsense_backend/app/agents/new_chat/middleware/permission.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/permission.py
@ -61,8 +61,7 @@ from app.agents.new_chat.permissions import (
    aggregate_action,
    evaluate_many,
 )
-from app.observability import metrics as ot_metrics
-from app.observability import otel as ot
+from app.observability import metrics as ot_metrics, otel as ot

 logger = logging.getLogger(__name__)

--- a/surfsense_backend/app/agents/new_chat/state_reducers.py
+++ b/surfsense_backend/app/agents/new_chat/state_reducers.py
@ -171,6 +171,39 @@ def _dict_merge_with_tombstones_reducer(
    return result


+def _int_counter_merge_reducer(
+    left: dict[str, int] | None,
+    right: dict[str, int] | None,
+) -> dict[str, int]:
+    """Sum the integer counters in ``right`` onto those in ``left``, key by key.
+
+    Backs state fields that accumulate counts over several updates in one
+    turn (e.g. per-subagent ``billable_calls``). Keys missing from ``left``
+    start at zero; values that are not ``int`` are skipped.
+
+    When ``right`` carries a ``_CLEAR`` sentinel key, ``left`` is discarded
+    and only the remaining ``right`` entries are tallied, which lets a
+    caller reset the accumulator. A new dict is returned in every case;
+    neither input is mutated.
+    """
+    if right is None:
+        return dict(left or {})
+
+    # A clear sentinel anywhere in the update restarts the tally from empty.
+    wipe = _CLEAR in right or any(_is_clear(k) for k in right)
+    totals: dict[str, int] = {} if wipe else dict(left or {})
+
+    for name, amount in right.items():
+        # Sentinel keys are control markers, never counters.
+        if wipe and _is_clear(name):
+            continue
+        if not isinstance(amount, int):
+            continue
+        totals[name] = totals.get(name, 0) + amount
+
+    return totals
+
+
 def _initial_filesystem_state() -> dict[str, Any]:
    """Default empty values for SurfSense filesystem state fields.

@ -200,6 +233,7 @@ __all__ = [
    "_add_unique_reducer",
    "_dict_merge_with_tombstones_reducer",
    "_initial_filesystem_state",
+    "_int_counter_merge_reducer",
    "_list_append_reducer",
    "_replace_reducer",
 ]
--- a/surfsense_backend/app/agents/new_chat/tools/podcast.py
+++ b/surfsense_backend/app/agents/new_chat/tools/podcast.py
@ -2,17 +2,23 @@
 Podcast generation tool for the SurfSense agent.

 This module provides a factory function for creating the generate_podcast tool
-that submits a Celery task for background podcast generation. The frontend
-polls for completion and auto-updates when the podcast is ready.
+that submits a Celery task for background podcast generation. The tool then
+polls the podcast row until it reaches a terminal status (READY/FAILED) and
+returns that status. The wait is bounded by the chat's HTTP / process
+lifetime; see app.agents.shared.deliverable_wait for details.
 """

+import logging
 from typing import Any

 from langchain_core.tools import tool
 from sqlalchemy.ext.asyncio import AsyncSession

+from app.agents.shared.deliverable_wait import wait_for_deliverable
 from app.db import Podcast, PodcastStatus, shielded_async_session

+logger = logging.getLogger(__name__)
+

 def create_generate_podcast_tool(
    search_space_id: int,
@ -97,18 +103,57 @@ def create_generate_podcast_tool(
                user_prompt=user_prompt,
            )

-            print(f"[generate_podcast] Created podcast {podcast_id}, task: {task.id}")
+            logger.info(
+                "[generate_podcast] Created podcast %s, task: %s",
+                podcast_id,
+                task.id,
+            )

+            # Wait until the Celery worker flips the row to a terminal
+            # state. No internal budget — see deliverable_wait module.
+            terminal_status, columns, elapsed = await wait_for_deliverable(
+                model=Podcast,
+                row_id=podcast_id,
+                columns=[Podcast.status, Podcast.file_location],
+                terminal_statuses={PodcastStatus.READY, PodcastStatus.FAILED},
+            )
+
+            if terminal_status == PodcastStatus.READY:
+                file_location = columns[1] if columns else None
+                logger.info(
+                    "[generate_podcast] Podcast %s READY in %.2fs (file=%s)",
+                    podcast_id,
+                    elapsed,
+                    file_location,
+                )
+                return {
+                    "status": PodcastStatus.READY.value,
+                    "podcast_id": podcast_id,
+                    "title": podcast_title,
+                    "file_location": file_location,
+                    "message": (
+                        "Podcast generated and saved to your podcast panel."
+                    ),
+                }
+
+            # Only other terminal state is FAILED.
+            logger.warning(
+                "[generate_podcast] Podcast %s FAILED in %.2fs",
+                podcast_id,
+                elapsed,
+            )
            return {
-                "status": PodcastStatus.PENDING.value,
+                "status": PodcastStatus.FAILED.value,
                "podcast_id": podcast_id,
                "title": podcast_title,
-                "message": "Podcast generation started. This may take a few minutes.",
+                "error": (
+                    "Background worker reported FAILED status for this podcast."
+                ),
            }

        except Exception as e:
            error_message = str(e)
-            print(f"[generate_podcast] Error: {error_message}")
+            logger.exception("[generate_podcast] Error: %s", error_message)
            return {
                "status": PodcastStatus.FAILED.value,
                "error": error_message,
--- a/surfsense_backend/app/agents/new_chat/tools/video_presentation.py
+++ b/surfsense_backend/app/agents/new_chat/tools/video_presentation.py
@ -2,17 +2,23 @@
 Video presentation generation tool for the SurfSense agent.

 This module provides a factory function for creating the generate_video_presentation
-tool that submits a Celery task for background video presentation generation.
-The frontend polls for completion and auto-updates when the presentation is ready.
+tool that submits a Celery task for background video presentation generation. The
+tool then polls the row until it reaches a terminal status (READY/FAILED) and
+returns that status. The wait is bounded by the chat's HTTP / process lifetime;
+see app.agents.shared.deliverable_wait for details.
 """

+import logging
 from typing import Any

 from langchain_core.tools import tool
 from sqlalchemy.ext.asyncio import AsyncSession

+from app.agents.shared.deliverable_wait import wait_for_deliverable
 from app.db import VideoPresentation, VideoPresentationStatus, shielded_async_session

+logger = logging.getLogger(__name__)
+

 def create_generate_video_presentation_tool(
    search_space_id: int,
@ -72,20 +78,58 @@ def create_generate_video_presentation_tool(
                user_prompt=user_prompt,
            )

-            print(
-                f"[generate_video_presentation] Created video presentation {video_pres_id}, task: {task.id}"
+            logger.info(
+                "[generate_video_presentation] Created video presentation %s, task: %s",
+                video_pres_id,
+                task.id,
            )

+            # Wait until the Celery worker flips the row to a terminal
+            # state. No internal budget — see deliverable_wait module.
+            terminal_status, _columns, elapsed = await wait_for_deliverable(
+                model=VideoPresentation,
+                row_id=video_pres_id,
+                columns=[VideoPresentation.status],
+                terminal_statuses={
+                    VideoPresentationStatus.READY,
+                    VideoPresentationStatus.FAILED,
+                },
+            )
+
+            if terminal_status == VideoPresentationStatus.READY:
+                logger.info(
+                    "[generate_video_presentation] %s READY in %.2fs",
+                    video_pres_id,
+                    elapsed,
+                )
+                return {
+                    "status": VideoPresentationStatus.READY.value,
+                    "video_presentation_id": video_pres_id,
+                    "title": video_title,
+                    "message": "Video presentation generated and saved.",
+                }
+
+            # Only other terminal state is FAILED.
+            logger.warning(
+                "[generate_video_presentation] %s FAILED in %.2fs",
+                video_pres_id,
+                elapsed,
+            )
            return {
-                "status": VideoPresentationStatus.PENDING.value,
+                "status": VideoPresentationStatus.FAILED.value,
                "video_presentation_id": video_pres_id,
                "title": video_title,
-                "message": "Video presentation generation started. This may take a few minutes.",
+                "error": (
+                    "Background worker reported FAILED status for this "
+                    "video presentation."
+                ),
            }

        except Exception as e:
            error_message = str(e)
-            print(f"[generate_video_presentation] Error: {error_message}")
+            logger.exception(
+                "[generate_video_presentation] Error: %s", error_message
+            )
            return {
                "status": VideoPresentationStatus.FAILED.value,
                "error": error_message,