feat: enhance task management and timeout configurations in multi-agent chat

- Added new environment variables for controlling task execution limits, including `SURFSENSE_SUBAGENT_INVOKE_TIMEOUT_SECONDS`, `SURFSENSE_TASK_BATCH_CONCURRENCY`, and `SURFSENSE_TASK_BATCH_MAX_SIZE`.
- Updated documentation to reflect new batch processing capabilities for `task` calls, allowing for concurrent execution of multiple subagent tasks.
- Improved error handling and receipt generation for deliverables, ensuring consistent feedback on task status.
- Refactored middleware to incorporate search space ID for better task management.
2026-07-18 23:11:12 +02:00 · 2026-05-27 14:58:10 -07:00 · 2026-05-27 14:58:10 -07:00 · 9d6e9b7e2d
commit 9d6e9b7e2d
parent 820f541f08
66 changed files with 2561 additions and 380 deletions
--- a/surfsense_backend/app/agents/shared/__init__.py
+++ b/surfsense_backend/app/agents/shared/__init__.py
@ -0,0 +1,9 @@
+"""Cross-package agent contracts.
+
+Symbols here are intentionally framework-light (no LangGraph / deepagents
+internals) so they can be imported from both ``app.agents.new_chat`` and
+``app.agents.multi_agent_chat`` without creating a circular dependency
+between the two packages. See ``receipt.py`` for the rationale.
+"""
+
+from __future__ import annotations
--- a/surfsense_backend/app/agents/shared/deliverable_wait.py
+++ b/surfsense_backend/app/agents/shared/deliverable_wait.py
@ -0,0 +1,123 @@
+"""Shared poll-until-terminal helper for Celery-backed deliverables.
+
+Lives in ``app.agents.shared`` (neutral package, no dependencies on either
+``new_chat`` or ``multi_agent_chat``) so both the flat single-agent tools
+under ``app/agents/new_chat/tools/`` and the multi-agent subagent tools
+under ``app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/``
+can import it without creating a circular dependency.
+
+Background
+----------
+Tools like ``generate_podcast`` and ``generate_video_presentation`` enqueue
+the heavy work to Celery and historically returned immediately with a
+"pending" status. That works for very-long deliverables but hurts UX for
+the common case (most podcasts finish in 10-30 seconds): the agent sends
+a "kicked off, check back in a minute" reply *before* the worker is done,
+so the user never gets a "ready" confirmation.
+
+This helper bridges that gap. The tool dispatches the Celery task as
+before, then polls the artefact row's ``status`` column **until it
+reaches a terminal value** (READY / FAILED). The tool then returns a
+real terminal outcome — never a pending one.
+
+No wall-clock budget here on purpose
+------------------------------------
+Layering a second budget on top of the existing per-invocation safety
+nets just confused the UX. The real ceilings are:
+
+* **Multi-agent mode** — ``SURFSENSE_SUBAGENT_INVOKE_TIMEOUT_SECONDS``
+  (default ``300.0``, ``0`` to disable) caps how long any single
+  ``task(subagent, ...)`` invocation can run. If a deliverable needs
+  longer than this, the subagent invocation is cancelled and the
+  orchestrator surfaces a "subagent timed out" ToolMessage. Operators
+  who routinely generate long videos should raise that ceiling (or set
+  it to ``0`` for true unbounded waits).
+* **Single-agent mode** — the chat's HTTP stream / process lifetime is
+  the only ceiling. Truly indefinite waits work here, but a dead Celery
+  worker will leave the row in PENDING/GENERATING forever; treat that
+  as an operational concern, not a UX concern.
+
+Configuration
+-------------
+None (no environment variables or settings). The poll cadence defaults
+to 1.5s — small enough to feel responsive (~6 polls per typical 10s
+podcast), large enough to avoid hammering the DB under burst traffic.
+Pass ``poll_interval_s`` at the call site if a specific tool needs a
+different cadence.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import time
+from enum import Enum
+from typing import Any
+
+from sqlalchemy import select
+from sqlalchemy.orm import InstrumentedAttribute
+
+from app.db import shielded_async_session
+
+logger = logging.getLogger(__name__)
+
+
+# Default delay between two status polls, in seconds. Used as the default
+# value of ``wait_for_deliverable(poll_interval_s=...)``.
+_DEFAULT_POLL_INTERVAL_SECONDS: float = 1.5
+
+
+async def wait_for_deliverable(
+    *,
+    model: type,
+    row_id: int,
+    columns: list[InstrumentedAttribute[Any]],
+    terminal_statuses: set[Enum],
+    poll_interval_s: float = _DEFAULT_POLL_INTERVAL_SECONDS,
+) -> tuple[Enum, tuple[Any, ...], float]:
+    """Poll ``model`` row ``row_id`` until ``columns[0]`` reaches a terminal status.
+
+    Blocks until the row's status column matches one of
+    ``terminal_statuses``. There is no internal wall-clock budget; cancel
+    from the outside (subagent timeout, HTTP disconnect, task
+    cancellation) if you need a ceiling. See module docstring.
+
+    The first entry of ``columns`` must be the status column; additional
+    columns (e.g. ``Podcast.file_location``) are returned alongside the
+    final status so callers can build their payload without a second
+    roundtrip.
+
+    A fresh ``shielded_async_session`` is opened per poll so we never
+    hold a transaction across the wait, and a failed poll is logged but
+    does not abort the wait — transient DB hiccups should not collapse
+    the tool call.
+
+    Returns
+    -------
+    ``(terminal_status, columns, elapsed_seconds)``
+        ``columns`` mirrors the requested ``columns`` (including the
+        status itself in position 0).
+    """
+    if not columns:
+        raise ValueError("wait_for_deliverable requires at least the status column")
+
+    start = time.monotonic()
+
+    while True:
+        await asyncio.sleep(poll_interval_s)
+        row: tuple[Any, ...] | None = None
+        try:
+            async with shielded_async_session() as session:
+                result = await session.execute(
+                    select(*columns).where(model.id == row_id)
+                )
+                row = result.first()
+        except Exception as exc:
+            logger.warning(
+                "[deliverable_wait] poll failed model=%s id=%s err=%r",
+                getattr(model, "__name__", str(model)),
+                row_id,
+                exc,
+            )
+
+        if row is not None:
+            status_val = row[0]
+            if status_val in terminal_statuses:
+                return status_val, tuple(row), time.monotonic() - start
--- a/surfsense_backend/app/agents/shared/receipt.py
+++ b/surfsense_backend/app/agents/shared/receipt.py
@ -0,0 +1,161 @@
+"""Receipt: structured handle returned by every mutating subagent tool.
+
+Generalises the Hermes ``entry`` dict (see ``references/hermes-agent/tools/
+delegate_tool.py:1663-1697``) for our 5 deliverable types + 15 connectors +
+KB writes. The supervisor reads the Receipt to verify what actually happened
+without round-tripping through LLM paraphrase.
+
+**Why this lives under ``app.agents.shared`` and not under either of the
+two agent packages:** the Receipt is a *contract* shared between
+``multi_agent_chat`` (where mutating tools emit it) and ``new_chat``
+(where ``filesystem_state.SurfSenseFilesystemState`` declares the
+``receipts`` reducer that accumulates it, and where
+``middleware.kb_persistence`` emits its own KB-write receipts). Putting
+the contract in either package would create a bidirectional import
+between the two — see the commit that introduced this module for the
+``ImportError`` chain it broke.
+
+Each mutating tool wraps its native return shape into a Receipt via
+:func:`make_receipt` (or builds one directly) and returns it under the
+``"receipt"`` key alongside its existing payload. The subagent boundary
+machinery in ``checkpointed_subagent_middleware.task_tool`` then folds
+the receipt into the parent's ``receipts`` state via the append reducer.
+
+The KB write path is the one exception: file-tool calls cannot emit a
+durable receipt because the actual DB writes happen end-of-turn inside
+:class:`app.agents.new_chat.middleware.kb_persistence.KnowledgeBasePersistenceMiddleware`.
+KB tools therefore emit a *provisional* receipt with ``status="pending"``;
+the persistence middleware flips it to ``"success"`` or ``"failed"``
+before returning control to the parent.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Literal, TypedDict
+
+# Name of the subagent (route) that emitted this receipt.
+ReceiptRoute = Literal[
+    "deliverables",
+    "knowledge_base",
+    "notion",
+    "slack",
+    "gmail",
+    "linear",
+    "jira",
+    "clickup",
+    "confluence",
+    "calendar",
+    "luma",
+    "airtable",
+    "google_drive",
+    "dropbox",
+    "onedrive",
+    "discord",
+    "teams",
+]
+
+# Within-route kind of artefact / external resource the operation touched.
+# Left as ``str`` rather than a giant union so each route file documents
+# its own enum next to its tools.
+ReceiptType = str
+
+# Operation verb. Kept open for the same reason as ``ReceiptType``.
+ReceiptOperation = str
+
+# ``pending`` marks an operation that has not reached a terminal state yet:
+# either an async backend job (Celery podcast / video) whose progress the
+# orchestrator surfaces out of band, or a provisional KB-write receipt that
+# the persistence middleware later flips to ``success`` or ``failed``.
+ReceiptStatus = Literal["success", "pending", "failed"]
+
+
+class Receipt(TypedDict, total=False):
+    """Structured per-mutation handle returned to the parent subagent.
+
+    All fields are ``NotRequired`` (TypedDict ``total=False``) so each
+    route's tool can populate only the fields it actually has — e.g. Gmail
+    never sets ``verifiable_url`` because Gmail doesn't expose per-message
+    URLs. The receipts state reducer treats missing keys as missing rather
+    than ``null`` so we don't double-count.
+
+    :func:`make_receipt` builds a value of this type and leaves out the
+    optional fields whose argument is ``None``.
+    """
+
+    # The bare string literal after each field below is an attribute
+    # docstring: it documents the field and has no runtime effect.
+
+    route: ReceiptRoute
+    """Subagent name. Lets the orchestrator filter ``state['receipts']``
+    by route without re-deriving from ``type``."""
+
+    type: ReceiptType
+    """Within-route kind. e.g. for ``deliverables`` one of ``{report,
+    podcast, video_presentation, resume, image}``; for ``notion`` ``page``;
+    for ``slack`` ``message``."""
+
+    operation: ReceiptOperation
+    """Verb. e.g. ``generate`` (deliverables), ``create`` / ``update`` /
+    ``delete`` (most connectors), ``send`` / ``post`` (chat), ``write_file``
+    / ``edit_file`` / ``rm`` / ``rmdir`` / ``move_file`` / ``mkdir`` (KB)."""
+
+    status: ReceiptStatus
+    """``success`` / ``pending`` / ``failed``. The verification teaching
+    in ``shared/snippets/verifiable_handle.md`` keys off this field."""
+
+    external_id: str | None
+    """Backend identifier. Report row id, Notion ``page_id``, Slack ``ts``,
+    Gmail ``message_id``, Linear identifier, KB ``virtualPath``, etc.
+    ``None`` only when the operation failed before the backend assigned one."""
+
+    verifiable_url: str | None
+    """URL the parent can pass to ``scrape_webpage`` to verify the
+    operation. ``None`` when no public URL exists (Gmail, KB, raw images
+    stored in the DB)."""
+
+    preview: str | None
+    """Short snippet (~200 chars) of what was produced. First lines of
+    a generated report's markdown, transcript opener for a podcast,
+    thumbnail URL for an image. Lets the orchestrator decide whether to
+    re-render in the UI without re-loading the artefact."""
+
+    # NOTE(review): the "iff" below is a convention for callers;
+    # ``make_receipt`` in this module does not check ``error`` against
+    # ``status``.
+    error: str | None
+    """Filled iff ``status == "failed"``. Plain-text reason; the parent
+    surfaces it in its own ``next_step``."""
+
+
+def make_receipt(
+    *,
+    route: ReceiptRoute,
+    type: str,
+    operation: str,
+    status: ReceiptStatus,
+    external_id: str | None = None,
+    verifiable_url: str | None = None,
+    preview: str | None = None,
+    error: str | None = None,
+) -> Receipt:
+    """Construct a :class:`Receipt` with non-``None`` fields only.
+
+    Drops keys whose value is ``None`` so downstream consumers can use
+    ``"verifiable_url" in receipt`` to distinguish "tool returned no URL"
+    from "tool deliberately surfaced ``null``".
+    """
+    out: dict[str, Any] = {
+        "route": route,
+        "type": type,
+        "operation": operation,
+        "status": status,
+    }
+    if external_id is not None:
+        out["external_id"] = external_id
+    if verifiable_url is not None:
+        out["verifiable_url"] = verifiable_url
+    if preview is not None:
+        out["preview"] = preview
+    if error is not None:
+        out["error"] = error
+    return out  # type: ignore[return-value]
+
+
+__all__ = [
+    "Receipt",
+    "ReceiptOperation",
+    "ReceiptRoute",
+    "ReceiptStatus",
+    "ReceiptType",
+    "make_receipt",
+]
--- a/surfsense_backend/app/agents/shared/receipt_command.py
+++ b/surfsense_backend/app/agents/shared/receipt_command.py
@ -0,0 +1,71 @@
+"""Helper for wrapping a tool result with a Receipt in a ``Command(update=...)``.
+
+Most mutating subagent tools historically returned a plain ``dict`` payload
+which deepagents serialised straight into the ``ToolMessage`` content. To
+participate in the verification teaching from
+``multi_agent_chat/subagents/shared/snippets/verifiable_handle.md`` those
+tools now also need to write a :class:`Receipt` into the parent's
+``state['receipts']`` list (declared on
+:class:`~app.agents.new_chat.filesystem_state.SurfSenseFilesystemState`
+and backed by the append reducer).
+
+:func:`with_receipt` wraps both behaviours: it returns the tool payload as
+a JSON-encoded ``ToolMessage`` AND appends the receipt to state in a single
+:class:`~langgraph.types.Command`. Use it at every ``return`` site of a
+mutating tool — including failure paths (emit a receipt with
+``status="failed"`` and the error message in ``error``).
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+from langchain_core.messages import ToolMessage
+from langgraph.types import Command
+
+from app.agents.shared.receipt import Receipt
+
+
+def _content_to_text(payload: dict[str, Any] | str) -> str:
+    """Serialise a tool payload to ``ToolMessage`` content.
+
+    Dicts go through ``json.dumps`` (matching deepagents' default tool-result
+    serialisation); strings are passed through. Anything else is coerced via
+    ``str`` so we never raise here — a mis-typed tool return would already
+    have failed inside the tool body.
+    """
+    if isinstance(payload, str):
+        return payload
+    if isinstance(payload, dict):
+        return json.dumps(payload, default=str)
+    return str(payload)
+
+
+def with_receipt(
+    *,
+    payload: dict[str, Any] | str,
+    receipt: Receipt,
+    tool_call_id: str,
+) -> Command:
+    """Return a Command that ships ``payload`` as a ToolMessage AND appends ``receipt``.
+
+    The append happens via the ``_list_append_reducer`` on the ``receipts``
+    field of :class:`~app.agents.new_chat.filesystem_state.SurfSenseFilesystemState`,
+    so concurrent subagent batches (item 4 in the plan) won't clobber each
+    other's receipts.
+    """
+    return Command(
+        update={
+            "messages": [
+                ToolMessage(
+                    content=_content_to_text(payload),
+                    tool_call_id=tool_call_id,
+                )
+            ],
+            "receipts": [receipt],
+        }
+    )
+
+
+__all__ = ["with_receipt"]