From 9d6e9b7e2d4edf97cd68e3f7fbbd535ba398c378 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Wed, 27 May 2026 14:58:10 -0700 Subject: [PATCH] feat: enhance task management and timeout configurations in multi-agent chat - Added new environment variables for controlling task execution limits, including `SURFSENSE_SUBAGENT_INVOKE_TIMEOUT_SECONDS`, `SURFSENSE_TASK_BATCH_CONCURRENCY`, and `SURFSENSE_TASK_BATCH_MAX_SIZE`. - Updated documentation to reflect new batch processing capabilities for `task` calls, allowing for concurrent execution of multiple subagent tasks. - Improved error handling and receipt generation for deliverables, ensuring consistent feedback on task status. - Refactored middleware to incorporate search space ID for better task management. --- surfsense_backend/.env.example | 47 ++ .../system_prompt/prompts/routing.md | 70 ++ .../prompts/tools/task/description.md | 59 +- .../constants.py | 71 ++ .../middleware.py | 21 +- .../spawn_paused.py | 84 +++ .../task_tool.py | 674 +++++++++++++++++- .../shared/kb_context_projection.py | 4 +- .../shared/permissions/ask/request.py | 3 +- .../multi_agent_chat/middleware/stack.py | 1 + .../builtins/deliverables/system_prompt.md | 12 +- .../deliverables/tools/generate_image.py | 78 +- .../builtins/deliverables/tools/podcast.py | 111 ++- .../builtins/deliverables/tools/report.py | 100 ++- .../builtins/deliverables/tools/resume.py | 202 ++++-- .../deliverables/tools/video_presentation.py | 114 ++- .../knowledge_base/system_prompt_cloud.md | 9 +- .../knowledge_base/system_prompt_desktop.md | 9 +- .../builtins/memory/system_prompt.md | 8 +- .../builtins/research/system_prompt.md | 6 +- .../connectors/airtable/system_prompt.md | 8 +- .../connectors/calendar/system_prompt.md | 9 +- .../connectors/clickup/system_prompt.md | 8 +- .../connectors/confluence/system_prompt.md | 7 +- .../connectors/discord/system_prompt.md | 7 +- .../connectors/dropbox/system_prompt.md | 7 +- .../connectors/gmail/system_prompt.md 
| 9 +- .../connectors/gmail/tools/send_email.py | 148 ++-- .../connectors/google_drive/system_prompt.md | 7 +- .../connectors/jira/system_prompt.md | 8 +- .../connectors/linear/system_prompt.md | 8 +- .../connectors/luma/system_prompt.md | 7 +- .../connectors/notion/system_prompt.md | 7 +- .../connectors/notion/tools/delete_page.py | 140 +++- .../connectors/onedrive/system_prompt.md | 7 +- .../connectors/slack/system_prompt.md | 8 +- .../connectors/teams/system_prompt.md | 7 +- .../subagents/shared/md_file_reader.py | 13 + .../subagents/shared/snippets/__init__.py | 6 + .../shared/snippets/output_contract_base.md | 6 + .../shared/snippets/verifiable_handle.md | 10 + .../multi_agent_chat/subagents/shared/spec.py | 32 +- .../subagents/shared/subagent_builder.py | 56 +- .../app/agents/new_chat/filesystem_state.py | 31 + .../agents/new_chat/middleware/compaction.py | 3 +- .../agents/new_chat/middleware/doom_loop.py | 3 +- .../new_chat/middleware/kb_persistence.py | 76 ++ .../agents/new_chat/middleware/permission.py | 3 +- .../app/agents/new_chat/state_reducers.py | 34 + .../app/agents/new_chat/tools/podcast.py | 57 +- .../new_chat/tools/video_presentation.py | 58 +- .../app/agents/shared/__init__.py | 9 + .../app/agents/shared/deliverable_wait.py | 123 ++++ .../app/agents/shared/receipt.py | 161 +++++ .../app/agents/shared/receipt_command.py | 71 ++ .../app/etl_pipeline/etl_pipeline_service.py | 4 +- .../app/services/composio_service.py | 9 +- .../app/services/gmail/kb_sync_service.py | 4 +- .../google_calendar/kb_sync_service.py | 8 +- .../app/services/jira/kb_sync_service.py | 8 +- surfsense_backend/app/services/llm_service.py | 6 +- .../app/services/onedrive/kb_sync_service.py | 4 +- .../app/tasks/chat/stream_new_chat.py | 4 +- .../tasks/chat/streaming/handlers/tool_end.py | 37 +- .../generate_video_presentation/emission.py | 16 +- .../app/utils/document_converters.py | 4 +- 66 files changed, 2561 insertions(+), 380 deletions(-) create mode 100644 
surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/spawn_paused.py create mode 100644 surfsense_backend/app/agents/multi_agent_chat/subagents/shared/snippets/__init__.py create mode 100644 surfsense_backend/app/agents/multi_agent_chat/subagents/shared/snippets/output_contract_base.md create mode 100644 surfsense_backend/app/agents/multi_agent_chat/subagents/shared/snippets/verifiable_handle.md create mode 100644 surfsense_backend/app/agents/shared/__init__.py create mode 100644 surfsense_backend/app/agents/shared/deliverable_wait.py create mode 100644 surfsense_backend/app/agents/shared/receipt.py create mode 100644 surfsense_backend/app/agents/shared/receipt_command.py diff --git a/surfsense_backend/.env.example b/surfsense_backend/.env.example index b05369412..70cf687d8 100644 --- a/surfsense_backend/.env.example +++ b/surfsense_backend/.env.example @@ -357,3 +357,50 @@ LANGSMITH_PROJECT=surfsense # updates and deletes — the TTL only bounds staleness for bulk-import # paths that bypass the ORM. Set to 0 to disable the cache. # SURFSENSE_CONNECTOR_DISCOVERY_TTL_SECONDS=30 + +# ----------------------------------------------------------------------------- +# `task` boundary controls (Hermes-inspired improvements) +# ----------------------------------------------------------------------------- +# Wall-clock budget for a single ``task(subagent, ...)`` invocation in +# seconds. Subagents that run hot (slow image vendors, sluggish embedders, +# wedged MCP servers) would otherwise pin the orchestrator until the next +# checkpoint heartbeat fires. On timeout the runtime cancels the underlying +# coroutine and synthesizes a ToolMessage telling the orchestrator to treat +# the result as ``status=error``. Set to 0 to disable the cap entirely. +# Default: 300.0 +# SURFSENSE_SUBAGENT_INVOKE_TIMEOUT_SECONDS=300 + +# Batch-mode (``task(tasks=[...])``) concurrency cap and max batch size. 
+# Concurrency is enforced via an ``asyncio.Semaphore`` so a runaway fanout +# cannot starve unrelated subagents (each child still owns an LLM call and +# its own DB session). Max-size is a hard safety net for prompt-injection / +# runaway loops; the orchestrator rarely needs more than a handful of +# concurrent specialists. Set concurrency to 1 to effectively serialise +# batches without changing the schema. +# SURFSENSE_TASK_BATCH_CONCURRENCY=3 +# SURFSENSE_TASK_BATCH_MAX_SIZE=8 + +# Soft per-turn cap on cumulative ``task(...)`` invocations across all +# subagents. Once the sum of ``state['billable_calls']`` crosses this +# number, the runtime appends a one-shot warning ToolMessage telling the +# orchestrator to wrap up rather than launching more specialists. Tunable +# so heavy-research turns (15+ legitimate specialist calls) don't trip the +# alarm in production. Set to 0 to disable the warning entirely. +# SURFSENSE_SUBAGENT_BILLABLE_THRESHOLD=15 + +# Per-workspace spawn-paused kill switch — set via Redis at runtime, not +# this env var. The env var below only disables the check itself (useful +# for local dev without Redis). To pause a workspace in production: +# redis-cli SET surfsense:spawn_paused:<search_space_id> 1 EX 600 +# redis-cli DEL surfsense:spawn_paused:<search_space_id> +# The check is fail-open: a Redis blip never blocks ``task(...)``. +# SURFSENSE_TASK_SPAWN_PAUSED_DISABLED=false + +# Note on Celery-backed deliverables (generate_podcast, +# generate_video_presentation): these tools poll the artefact row until +# it reaches a terminal status — they do NOT use an internal wall-clock +# budget. The effective ceiling is SURFSENSE_SUBAGENT_INVOKE_TIMEOUT_SECONDS +# (above, default 300s) in multi-agent mode and the chat's HTTP / process +# lifetime in single-agent mode. If your podcasts or videos routinely +# exceed 5 minutes, raise SURFSENSE_SUBAGENT_INVOKE_TIMEOUT_SECONDS (or +# set it to 0 to disable that ceiling entirely). 
diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/routing.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/routing.md index 4e27381d3..1038dde3d 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/routing.md +++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/routing.md @@ -33,6 +33,15 @@ Rules for `task`: - Neither's prompt references the other's output, and - They target different specialists, OR the same specialist with non-overlapping scopes (e.g. reading two unrelated paths). +- **Batch shape for many-shot fanout.** When a single user request expands + to **3 or more independent specialist calls** (e.g. "create five issues + from this list"), prefer the batch shape: + `task(tasks=[{description, subagent_type}, ...])`. The runtime fans them + out concurrently under a small semaphore and aggregates one ToolMessage + per child prefixed with `[task ]`. Batched children **do not + support human-in-the-loop interrupts** — if one needs approval it surfaces + an error and you re-dispatch it as a single (non-batched) `task(...)` call. + For 1–2 independent calls, just emit two separate `task(...)` calls. - **Serialise dependent work across turns.** If one specialist's output must inform another's input (e.g. "find the roadmap in my KB, then email it to Maya"), invoke them on consecutive turns — first finishes, @@ -93,4 +102,65 @@ user: "Find my Q2 roadmap doc in the KB and email a summary to Maya." task(gmail, "Send an email to Maya with subject 'Q2 roadmap summary' and the following body: .") + + +user: "Create issues in Linear for each of these five bugs: " +→ Many-shot independent fanout — use the batch shape: + task(tasks=[ + {subagent_type: "linear", description: "Create a Linear issue titled + '' with body ''. 
Return the issue URL."}, + {subagent_type: "linear", description: "Create a Linear issue titled + '' with body ''. Return the issue URL."}, + {subagent_type: "linear", description: "Create a Linear issue titled + '' with body ''. Return the issue URL."}, + {subagent_type: "linear", description: "Create a Linear issue titled + '' with body ''. Return the issue URL."}, + {subagent_type: "linear", description: "Create a Linear issue titled + '' with body ''. Return the issue URL."}, + ]) + Read back the `[task 0]`…`[task 4]` blocks in the combined ToolMessage and + verify each via its Receipt's `verifiable_url` per the `` + teaching before confirming to the user. + + + +user: "Make a 30-second podcast of this conversation." +→ Celery-backed deliverable. The `deliverables` subagent dispatches the + Celery job and then **waits for it to finish** before returning. The + call may take 10-60 seconds (or longer for video presentations) — + that is intentional, not a hang. You always get back one of two + Receipt shapes: + task(deliverables, "Generate a podcast titled '' from the + following content. Use a 30-second style brief. Return the podcast + id and title.\n\n<source content>") + Outcomes: + - **`status="success"`**: the audio is saved. Tell the user the + podcast is **ready** and quote the `external_id` / `preview` so + they can find it in the podcast panel. + - **`status="failed"`**: surface the Receipt's `error` field + verbatim. Do NOT silently re-dispatch — the backend already tried + and reported a real error. + Same two-way pattern applies to video presentations (which take + longer to render, but still return a terminal status). If a + `task(deliverables, ...)` invocation itself times out at the subagent + layer (separate from the Receipt), that's an operator-side problem + with the subagent invoke timeout, not a deliverable failure — pass + the message through and stop. 
+</example> + +<example> +user: "Post the launch announcement to #general and let me know when it's up." +→ Mutating subagent + user wants external confirmation. Apply the + `<verification>` teaching: the slack subagent's reply is a self-report; + check its `evidence.receipts` for a Receipt with `status="success"` and + a `verifiable_url`, then fetch that URL to confirm before reporting back. + This turn: + task(slack, "Post '<launch announcement text>' to #general. + Return the message permalink.") + Next turn (with the receipt's `verifiable_url` in hand): + scrape_webpage(url=<verifiable_url from slack receipt>) + → confirm the post is live, then tell the user it's up with the URL. + If the slack reply has NO Receipt with `status="success"`, treat it as a + silent failure: surface the error verbatim, do not retry. +</example> </routing> diff --git a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/task/description.md b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/task/description.md index 2f47d4df1..d6a81d8d3 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/task/description.md +++ b/surfsense_backend/app/agents/multi_agent_chat/main_agent/system_prompt/prompts/tools/task/description.md @@ -4,12 +4,69 @@ `<specialists>` for the live roster. - Each subagent runs in isolation with its own tool stack and context, and returns a single synthesized result. - - Args: + - Args (single mode): - `subagent_type` — name of the specialist to invoke (must match an entry in `<specialists>`). - `description` — the FULL task prompt. The specialist cannot see this thread, so include all context and constraints, plus what you need back. The specialist will respond in its own format — don't dictate one. + - Args (batch mode): + - `tasks` — array of `{description, subagent_type}` objects to fan out + concurrently. Mutually exclusive with single-mode args. 
Use when a + single request expands to **3 or more independent specialist calls** + (e.g. "create five issues from this list"). Children run under a + small concurrency cap and the runtime returns one ToolMessage block + per child, prefixed with `[task <index>]`. **Batched children do not + support human-in-the-loop interrupts** — if any child needs approval + it surfaces an error and you must re-dispatch that single task as a + non-batched `task(...)` call. - Routing rules (when to call, how often, how to scope) live in `<routing>`. + <verification> + A subagent's natural-language reply is a **self-report**, not proof. The + specialist might claim a Slack message was posted, a Jira issue was + created, or a report was generated even when the underlying tool call + failed silently or was rate-limited. Treat success language ("Done", + "Posted to #general", "Created ENG-42") as a hypothesis, not a fact. + + Two ground-truth signals are always available to verify a mutating + subagent's claim: + + 1. **`state['receipts']`** — every mutating tool emits a structured + `Receipt` (route, type, operation, status, external_id, + verifiable_url, preview) into this append-only list. The supervisor + never sees the raw list directly, but each subagent's + `<output_contract>` carries the matching Receipt(s) under + `evidence.receipts`. If a subagent reports success with NO matching + Receipt at `status="success"` (or `"pending"` for async deliverables + like podcasts/videos), the operation did not happen — treat as + failure and surface that to the user verbatim, do not retry blindly. + + 2. **`scrape_webpage`** — when a Receipt carries a `verifiable_url` + (Notion page URL, Slack permalink, Jira issue URL, Linear identifier + URL, etc.), you can fetch that URL and confirm the operation + externally. Use this for high-stakes mutations the user explicitly + called out (e.g. 
"send the launch email to the whole team") or when + the subagent's self-report contradicts what the user expected. + + **Receipt status semantics — read carefully:** + + - `status="success"`: the mutation already committed in the backend. + If a `verifiable_url` is present and the request was high-stakes, + you may `scrape_webpage` it to externally confirm. Otherwise trust + the Receipt and tell the user it is done. Celery-backed deliverables + (podcasts, video presentations) also land here — the subagent + already waited for the worker to finish, so a `success` Receipt + means the artefact really is saved. + - `status="failed"`: a Receipt with this status carries the backend's + error in its `error` field. Surface that text verbatim to the user; + re-routing or retrying is only appropriate when the user explicitly + asks for it. + - `status="pending"`: rare today — current mutating tools wait for + their backend before returning. If you ever do see a pending + Receipt, tell the user the work has been **kicked off** (quote the + `external_id` / `preview` so they can find it later), do not + `scrape_webpage` it, and do not re-dispatch the same + `task(...)` call hoping it will be done "this time". + </verification> diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/constants.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/constants.py index 6c4519f3a..e11f3c3ec 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/constants.py +++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/constants.py @@ -2,6 +2,8 @@ from __future__ import annotations +import os + # Mirror of deepagents.middleware.subagents._EXCLUDED_STATE_KEYS. 
def _read_timeout_env(name: str, default: float) -> float:
    """Parse a non-negative timeout (in seconds) from the environment.

    Kept as a free function so the module-level constants stay constants
    after import; tests can monkeypatch this and re-evaluate via
    ``importlib.reload`` if they need a different value mid-process.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset, empty, not a
            number, negative, or NaN.

    Returns:
        The parsed value. ``0`` is returned as-is because it is the
        documented "disable the timeout" sentinel.
    """
    raw = os.environ.get(name)
    if not raw:
        return default
    try:
        value = float(raw)
    except (TypeError, ValueError):
        return default
    # ``>= 0`` (not ``> 0``): an explicit ``0`` must survive so operators can
    # turn the cap off. NaN fails the comparison and falls back to the
    # default, exactly like a negative number.
    return value if value >= 0 else default


# Wall-clock budget for a single ``task(subagent, ...)`` invocation.
# Subagents that run hot (image generation with slow vendors, KB writes
# behind a sluggish embedder) can otherwise wedge the orchestrator until
# the next checkpoint heartbeat. ``0`` disables the timeout entirely.
DEFAULT_SUBAGENT_INVOKE_TIMEOUT_SECONDS: float = _read_timeout_env(
    "SURFSENSE_SUBAGENT_INVOKE_TIMEOUT_SECONDS",
    default=300.0,
)


def _read_int_env(name: str, default: int, *, minimum: int = 1) -> int:
    """Parse an integer setting from the environment.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset, empty, not an
            integer, or below ``minimum``.
        minimum: Smallest accepted value. The default of ``1`` keeps the
            historical "strictly positive" behaviour; pass ``0`` for
            settings where ``0`` is a meaningful "disabled" sentinel.

    Returns:
        The parsed integer, or ``default`` when the raw value is rejected.
    """
    raw = os.environ.get(name)
    if not raw:
        return default
    try:
        value = int(raw)
    except (TypeError, ValueError):
        return default
    return value if value >= minimum else default


# Maximum number of children that ``task(..., tasks=[...])`` runs in
# parallel via ``asyncio.gather`` + ``Semaphore``. Bounded so a runaway
# fanout cannot starve unrelated subagents (each child still owns an
# LLM call + DB session). Set ``SURFSENSE_TASK_BATCH_CONCURRENCY=1`` to
# effectively serialise batches without changing the schema.
DEFAULT_SUBAGENT_BATCH_CONCURRENCY: int = _read_int_env(
    "SURFSENSE_TASK_BATCH_CONCURRENCY",
    default=3,
)

# Max number of children in a single batched ``task`` call. Hard upper
# bound is a safety net for prompt-injection / runaway loops; the orchestrator
# rarely needs more than a handful of concurrent specialists.
MAX_SUBAGENT_BATCH_SIZE: int = _read_int_env(
    "SURFSENSE_TASK_BATCH_MAX_SIZE",
    default=8,
)


# Soft threshold for per-turn cumulative ``task(...)`` invocations across
# **all** subagents. Once the sum of ``state['billable_calls']`` values
# crosses this number, the runtime appends a one-shot warning ToolMessage
# instructing the orchestrator to wrap up the turn. Tunable so heavy-research
# turns (which legitimately need 15+ specialist calls) don't trip the alarm
# in production. Set to ``0`` to disable the warning entirely.
DEFAULT_SUBAGENT_BILLABLE_THRESHOLD: int = _read_int_env(
    "SURFSENSE_SUBAGENT_BILLABLE_THRESHOLD",
    default=15,
    # ``0`` must be accepted here, otherwise the documented "disable" value
    # silently turns back into the default threshold.
    minimum=0,
)
super(SubAgentMiddleware, self).__init__() @@ -43,8 +47,17 @@ class SurfSenseCheckpointedSubAgentMiddleware(SubAgentMiddleware): ) self._backend = backend self._subagents = subagents + # Search-space id is captured at build time (the orchestrator runs in + # exactly one search space for its lifetime). The spawn-paused kill + # switch keys on it so an operator can quarantine one workspace + # without affecting the rest of the deployment. + self._search_space_id = search_space_id subagent_specs = self._surf_compile_subagent_graphs() - task_tool = build_task_tool_with_parent_config(subagent_specs, task_description) + task_tool = build_task_tool_with_parent_config( + subagent_specs, + task_description, + search_space_id=search_space_id, + ) if system_prompt and subagent_specs: agents_desc = "\n".join( f"- {s['name']}: {s['description']}" for s in subagent_specs @@ -64,6 +77,10 @@ class SurfSenseCheckpointedSubAgentMiddleware(SubAgentMiddleware): for spec in self._subagents: spec_start = time.perf_counter() + # Provider may be ``None`` (no hint), in which case task_tool + # skips the prepend step. We forward the key unconditionally so + # the registry shape is uniform. 
+ hint_provider = cast(dict, spec).get(SURF_CONTEXT_HINT_PROVIDER_KEY) if "runnable" in spec: compiled = cast(CompiledSubAgent, spec) specs.append( @@ -71,6 +88,7 @@ class SurfSenseCheckpointedSubAgentMiddleware(SubAgentMiddleware): "name": compiled["name"], "description": compiled["description"], "runnable": compiled["runnable"], + SURF_CONTEXT_HINT_PROVIDER_KEY: hint_provider, } ) timings.append( @@ -108,6 +126,7 @@ class SurfSenseCheckpointedSubAgentMiddleware(SubAgentMiddleware): "name": spec["name"], "description": spec["description"], "runnable": runnable, + SURF_CONTEXT_HINT_PROVIDER_KEY: hint_provider, } ) timings.append( diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/spawn_paused.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/spawn_paused.py new file mode 100644 index 000000000..2c9e114e0 --- /dev/null +++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/spawn_paused.py @@ -0,0 +1,84 @@ +"""Per-search-space spawn-paused kill switch for the ``task`` boundary. + +When operators see a runaway loop, a vendor outage, or a billing event +that requires immediate cessation of subagent traffic for a specific +workspace, they flip a Redis flag and the ``task`` tool short-circuits +without touching downstream services. The flag is **per-search-space** +so one tenant's incident never silences the rest of the deployment. + +Flag key: ``surfsense:spawn_paused:{search_space_id}`` +Flag value: any string-truthy value (we read presence, not contents). +TTL: set by whoever toggles the flag — this module never expires + keys on its own, since "the flag is on" is itself the signal + that a human (or alert) needs to investigate. + +The check is best-effort: Redis errors are logged but do not block the +``task`` invocation. 
async def is_spawn_paused(search_space_id: int | None) -> bool:
    """Report whether subagent spawning is paused for a search space.

    Looks up the per-search-space flag key in Redis and returns the
    truthiness of the stored value. The check is skipped (``False``) when
    it is disabled via the module-level switch or when no search-space id
    was supplied. Any failure while talking to Redis is logged and treated
    as "not paused" so that a Redis problem never blocks ``task``.

    Args:
        search_space_id: Workspace to check, or ``None`` when the caller
            has no id to offer.

    Returns:
        ``True`` only when Redis returned a truthy value for the flag key.
    """
    if _DISABLED:
        return False
    if search_space_id is None:
        return False

    try:
        # Imported lazily so code paths that never reach this function do
        # not pay for (or require) the redis import.
        import redis.asyncio as aioredis  # type: ignore[import-not-found]

        redis_client = aioredis.from_url(
            config.REDIS_APP_URL, decode_responses=True
        )
        try:
            flag_value = await redis_client.get(_flag_key(search_space_id))
        finally:
            # Prefer ``aclose`` and fall back to ``close`` when the client
            # does not expose it; errors while closing are ignored.
            closer = getattr(redis_client, "aclose", None)
            if not closer:
                closer = getattr(redis_client, "close", None)
            if callable(closer):
                with contextlib.suppress(Exception):
                    await closer()  # type: ignore[misc]
        paused = bool(flag_value)
    except Exception:
        logger.warning(
            "spawn_paused check failed for search_space_id=%s; failing open.",
            search_space_id,
            exc_info=True,
        )
        return False
    return paused
class SubagentInvokeTimeoutError(Exception):
    """Raised when ``subagent.ainvoke`` exceeds the configured wall-clock budget.

    Carries the subagent name and the elapsed seconds so the caller can
    synthesize a ToolMessage that the orchestrator can act on (re-route,
    surface to the user, or retry with a smaller scope).
    """

    def __init__(self, subagent_type: str, elapsed_seconds: float) -> None:
        super().__init__(
            f"subagent {subagent_type!r} exceeded "
            f"{DEFAULT_SUBAGENT_INVOKE_TIMEOUT_SECONDS:.0f}s budget "
            f"(elapsed={elapsed_seconds:.1f}s)"
        )
        self.subagent_type = subagent_type
        self.elapsed_seconds = elapsed_seconds


_T = TypeVar("_T")


async def _ainvoke_with_timeout(
    coro: Awaitable[_T], *, subagent_type: str, started_at: float
) -> _T:
    """Await ``coro`` under :data:`DEFAULT_SUBAGENT_INVOKE_TIMEOUT_SECONDS`.

    A non-positive timeout disables the cap and ``coro`` is awaited
    directly. Otherwise the awaitable is run through
    :func:`asyncio.wait_for`, which cancels it on expiry.

    Args:
        coro: The awaitable to run (typically ``subagent.ainvoke(...)``).
        subagent_type: Name recorded on the raised error.
        started_at: ``time.perf_counter()`` reading taken by the caller
            before the invocation began; used to compute elapsed seconds.

    Returns:
        Whatever ``coro`` resolves to; the return type follows the
        awaitable's type through the single ``_T`` type variable.

    Raises:
        SubagentInvokeTimeoutError: When the budget is exceeded. The
            original timeout exception is chained as ``__cause__``.
    """
    timeout = DEFAULT_SUBAGENT_INVOKE_TIMEOUT_SECONDS
    if timeout <= 0:
        return await coro
    try:
        return await asyncio.wait_for(coro, timeout=timeout)
    # ``asyncio.TimeoutError`` is the builtin ``TimeoutError`` on 3.11+ and
    # a distinct class before that, so this spelling is correct on both.
    # NOTE: a ``TimeoutError`` raised by the awaitable itself cannot be
    # told apart here and is reported as a budget overrun as well.
    except asyncio.TimeoutError as exc:
        elapsed = time.perf_counter() - started_at
        raise SubagentInvokeTimeoutError(subagent_type, elapsed) from exc
+ subagent_hint_providers: dict[str, ContextHintProvider] = { + spec["name"]: provider + for spec in subagents + if (provider := spec.get(SURF_CONTEXT_HINT_PROVIDER_KEY)) is not None + } subagent_description_str = "\n".join( f"- {s['name']}: {s['description']}" for s in subagents ) @@ -88,6 +173,120 @@ def build_task_tool_with_parent_config( else: description = task_description + def _billable_call_update( + subagent_type: str, runtime: ToolRuntime + ) -> dict[str, Any]: + """Build the per-call ``billable_calls`` delta + an optional warning. + + The orchestrator's ``billable_calls`` map is summed by + :func:`_int_counter_merge_reducer`, so we always emit + ``{subagent_type: 1}`` and let the reducer accumulate. If the + cumulative count *after* this call would cross the configured + threshold, we also slip a soft ``messages`` entry into the update + so the orchestrator can read it on its next step and self-limit. + Returning a plain ``dict`` (vs. an extra :class:`Command`) keeps + the helper composable with the existing single/batch return paths. + """ + delta: dict[str, Any] = {"billable_calls": {subagent_type: 1}} + threshold = DEFAULT_SUBAGENT_BILLABLE_THRESHOLD + if threshold <= 0: + return delta + prior = runtime.state.get("billable_calls") or {} + # ``prior`` may be a plain dict or a reducer-managed mapping; only + # int values are counted so a malformed checkpoint can't crash us. + prior_total = sum(v for v in prior.values() if isinstance(v, int)) + new_total = prior_total + 1 + if prior_total < threshold <= new_total: + warn = ( + f"[budget warning] This turn has dispatched {new_total} " + f"subagent calls (soft cap = {threshold}). Wrap up the " + "user's request with what you have rather than launching " + "more specialists; surface a partial answer if needed." 
+ ) + delta["_billable_warn_text"] = warn + return delta + + def _attach_billable( + cmd: Command, subagent_type: str, runtime: ToolRuntime + ) -> Command: + """Merge the per-call billable counter (and warning) into ``cmd``.""" + delta = _billable_call_update(subagent_type, runtime) + warn_text = delta.pop("_billable_warn_text", None) + # ``cmd.update`` may be a dict or LangGraph ``UpdateDict``; defensively + # copy so we don't mutate state shared across other tool returns. + update = dict(getattr(cmd, "update", {}) or {}) + for key, value in delta.items(): + update[key] = value + if warn_text: + existing_msgs = list(update.get("messages") or []) + existing_msgs.append( + ToolMessage(content=warn_text, tool_call_id=runtime.tool_call_id) + ) + update["messages"] = existing_msgs + return Command(update=update) + + def _safe_message_text(msg: Any) -> str: + """Pull text out of a BaseMessage without trusting the ``.text`` property. + + ``BaseMessage.text`` walks ``content_blocks`` and crashes with + ``TypeError: 'NoneType' object is not iterable`` when ``content`` is + ``None`` (common for tool-call AIMessages whose payload is purely + structured). ``getattr(msg, "text", None)`` does not catch this + because Python evaluates the property body before falling back to + the default. Read ``content`` directly and coerce defensively. + """ + try: + content = getattr(msg, "content", None) + except Exception: + content = None + if content is None: + return "" + if isinstance(content, str): + return content + if isinstance(content, list): + parts: list[str] = [] + for block in content: + if isinstance(block, str): + parts.append(block) + elif isinstance(block, dict): + block_text = block.get("text") or block.get("content") + if isinstance(block_text, str): + parts.append(block_text) + return " ".join(parts) + return str(content) + + def _build_tool_trace(messages: list[Any]) -> list[dict[str, Any]]: + """Compress the subagent's message stream into a compact tool trace. 
+ + Each entry is ``{"tool": <name>, "status": <msg.status or "ok">, "preview": + <≤120 chars>}`` so the orchestrator can show "this is what your + specialist actually did" without dumping the full message stream + back through the prompt. The list is attached to the returned + ToolMessage's ``additional_kwargs`` (under ``"surf_tool_trace"``); + the LLM never sees it, but UI / observability code can pluck it + out of the checkpoint. + """ + trace: list[dict[str, Any]] = [] + for msg in messages: + tool_name = getattr(msg, "name", None) + tool_call_id_attr = getattr(msg, "tool_call_id", None) + if not tool_name and not tool_call_id_attr: + # Only ToolMessages have either field; skip AIMessage / + # HumanMessage / SystemMessage frames. + continue + status = getattr(msg, "status", None) or "ok" + preview = _safe_message_text(msg).strip().replace("\n", " ") + if len(preview) > 120: + preview = preview[:117] + "..." + trace.append( + { + "tool": tool_name or "<unknown>", + "status": status, + "preview": preview, + } + ) + return trace + def _return_command_with_state_update(result: dict, tool_call_id: str) -> Command: if "messages" not in result: msg = ( @@ -106,15 +305,51 @@ def build_task_tool_with_parent_config( "output to forward back to the user." ) raise ValueError(msg) - last_text = getattr(messages[-1], "text", None) or "" - message_text = last_text.rstrip() + message_text = _safe_message_text(messages[-1]).rstrip() + # Tool-trace is purely observability — wrap defensively so a single + # malformed frame never bubbles up and kills the whole user turn. + try: + tool_trace = _build_tool_trace(messages) + except Exception: + logger.exception( + "Failed to build tool_trace for subagent return; " + "continuing without trace." + ) + tool_trace = [] + tool_msg = ToolMessage(message_text, tool_call_id=tool_call_id) + if tool_trace: + # ``additional_kwargs`` is a free-form dict on BaseMessage; using + # a ``surf_`` prefix avoids collision with provider-specific keys + # (e.g.
Anthropic's ``cache_control``). The LLM doesn't see it; + # consumers (UI, observability) read it off the checkpoint. + tool_msg.additional_kwargs["surf_tool_trace"] = tool_trace return Command( update={ **state_update, - "messages": [ToolMessage(message_text, tool_call_id=tool_call_id)], + "messages": [tool_msg], } ) + def _resolve_context_hint( + subagent_type: str, description: str, runtime: ToolRuntime + ) -> str | None: + """Run the per-subagent hint provider; swallow & log any exception.""" + provider = subagent_hint_providers.get(subagent_type) + if provider is None: + return None + try: + hint = provider(runtime.state, description) + except Exception: + logger.exception( + "Context-hint provider for subagent %r raised; skipping hint.", + subagent_type, + ) + return None + if not hint or not isinstance(hint, str): + return None + cleaned = hint.strip() + return cleaned or None + def _validate_and_prepare_state( subagent_type: str, description: str, runtime: ToolRuntime ) -> tuple[Runnable, dict]: @@ -122,20 +357,308 @@ def build_task_tool_with_parent_config( subagent_state = { k: v for k, v in runtime.state.items() if k not in EXCLUDED_STATE_KEYS } - subagent_state["messages"] = [HumanMessage(content=description)] + hint = _resolve_context_hint(subagent_type, description, runtime) + if hint: + # Prepend as a tagged block so the subagent prompt can pattern-match + # on the section (and a future change can lift it into its own + # ``SystemMessage`` if needed). + payload = f"<context_hint>\n{hint}\n</context_hint>\n\n{description}" + else: + payload = description + subagent_state["messages"] = [HumanMessage(content=payload)] return subagent, subagent_state + def _merge_batch_results( + results: list[tuple[int, str, dict | str, dict | None]], + runtime: ToolRuntime, + ) -> Command: + """Combine per-child results into one Command with a combined ToolMessage. 
+ + ``results`` is a list of ``(task_index, subagent_type, + payload_or_error_text, child_state_update)`` tuples; it is sorted by + ``task_index`` in place so each block maps back to the task that was + dispatched. Child state updates are folded together with a plain + ``dict.update`` (a later task overwrites an earlier one on the same + key), and ``messages`` is set to the synthesized aggregate ToolMessage. + Every child, including one that failed, adds a ``billable_calls`` + increment for its ``subagent_type``. + """ + results.sort(key=lambda r: r[0]) + merged_state: dict[str, Any] = {} + billable_delta: dict[str, int] = {} + message_blocks: list[str] = [] + batch_trace: list[dict[str, Any]] = [] + for task_index, subagent_type, payload, state_update in results: + billable_delta[subagent_type] = billable_delta.get(subagent_type, 0) + 1 + if isinstance(payload, str): + # Pre-flight error or per-task exception text. + message_blocks.append(f"[task {task_index}] {payload}") + batch_trace.append( + { + "task_index": task_index, + "subagent_type": subagent_type, + "status": "error", + "tool_trace": [], + } + ) + continue + messages = payload.get("messages") or [] + last_text = _safe_message_text(messages[-1]).rstrip() if messages else "" + message_blocks.append( + f"[task {task_index}] {last_text or '<empty>'}" + ) + try: + child_trace = _build_tool_trace(messages) + except Exception: + logger.exception( + "Failed to build tool_trace for batch task_index=%d; continuing.", + task_index, + ) + child_trace = [] + batch_trace.append( + { + "task_index": task_index, + "subagent_type": subagent_type, + "status": "ok", + "tool_trace": child_trace, + } + ) + if state_update: + # Naive merge: a later task wins on ANY key collision, so a reducer + # only ever sees the last child's value — NOTE(review): confirm intended.
+ merged_state.update(state_update) + aggregate = "\n\n".join(message_blocks) + aggregate_msg = ToolMessage( + content=aggregate, tool_call_id=runtime.tool_call_id + ) + if batch_trace: + aggregate_msg.additional_kwargs["surf_tool_trace"] = batch_trace + update: dict[str, Any] = { + **merged_state, + "billable_calls": billable_delta, + "messages": [aggregate_msg], + } + # Soft-cap warning: check the cumulative count after attribution. + threshold = DEFAULT_SUBAGENT_BILLABLE_THRESHOLD + if threshold > 0: + prior = runtime.state.get("billable_calls") or {} + prior_total = sum(v for v in prior.values() if isinstance(v, int)) + new_total = prior_total + sum(billable_delta.values()) + if prior_total < threshold <= new_total: + update["messages"].append( + ToolMessage( + content=( + f"[budget warning] This turn has dispatched " + f"{new_total} subagent calls (soft cap = " + f"{threshold}). Wrap up the user's request with " + "what you have rather than launching more " + "specialists; surface a partial answer if needed." + ), + tool_call_id=runtime.tool_call_id, + ) + ) + return Command(update=update) + + async def _ainvoke_one_batch_child( + *, + task_index: int, + subagent_type: str, + description: str, + runtime: ToolRuntime, + semaphore: asyncio.Semaphore, + ) -> tuple[int, str, dict | str, dict | None]: + """Run one child of a batched ``task`` call under the concurrency cap. + + Errors are returned as plain text in slot 2 so a single child's + failure does not abort the whole batch. ``GraphInterrupt`` from a + batched child is currently treated as a hard failure for that child + only — batched HITL is intentionally out of scope for the v1 + rollout (see plan tier 2 item 4 risks). 
+ """ + async with semaphore: + if subagent_type not in subagent_graphs: + allowed_types = ", ".join([f"`{k}`" for k in subagent_graphs]) + return ( + task_index, + subagent_type, + ( + f"Subagent {subagent_type!r} does not exist; " + f"allowed: {allowed_types}" + ), + None, + ) + subagent, subagent_state = _validate_and_prepare_state( + subagent_type, description, runtime + ) + sub_config = subagent_invoke_config(runtime) + started_at = time.perf_counter() + try: + result = await _ainvoke_with_timeout( + subagent.ainvoke(subagent_state, config=sub_config), + subagent_type=subagent_type, + started_at=started_at, + ) + except SubagentInvokeTimeoutError as exc: + logger.warning( + "Batch child %d (%s) timed out after %.1fs", + task_index, + subagent_type, + exc.elapsed_seconds, + ) + return (task_index, subagent_type, str(exc), None) + except GraphInterrupt: + # Batched HITL is unsupported in v1 — surface as a failure + # for this child so the rest of the batch still completes. + logger.warning( + "Batch child %d (%s) raised GraphInterrupt; batched HITL " + "is not supported. Re-dispatch this task as a single " + "(non-batched) `task(...)` call to get the HITL prompt.", + task_index, + subagent_type, + ) + return ( + task_index, + subagent_type, + ( + f"Subagent {subagent_type!r} needs human approval. " + "Re-dispatch this task as a single (non-batched) " + "`task(...)` call so the approval card can be shown." + ), + None, + ) + except Exception as exc: + logger.exception( + "Batch child %d (%s) raised: %s", + task_index, + subagent_type, + exc, + ) + return ( + task_index, + subagent_type, + f"Subagent {subagent_type!r} error: {exc}", + None, + ) + child_state_update = { + k: v for k, v in result.items() if k not in EXCLUDED_STATE_KEYS + } + return (task_index, subagent_type, result, child_state_update) + + def _coerce_batch_arg(tasks: Any) -> list[dict] | str: + """Rescue common LLM-side malformations of the ``tasks`` argument. 
+ + Some providers serialise an array argument as a JSON-encoded string, + and small models occasionally hand back a single ``{description, + subagent_type}`` dict instead of a one-element array. Both are + recovered here with a WARN log so the issue is visible in metrics + but the user's turn still completes; truly broken shapes return a + plain string that the caller surfaces as the tool error. + """ + if isinstance(tasks, list): + return tasks + if isinstance(tasks, dict): + logger.warning( + "task: `tasks` was a single dict; coercing to a 1-element list. " + "Orchestrators should send `tasks=[{...}]` directly." + ) + return [tasks] + if isinstance(tasks, str): + stripped = tasks.strip() + if not stripped: + return "tasks: argument is empty." + try: + parsed = json.loads(stripped) + except json.JSONDecodeError as exc: + return ( + f"tasks: argument is a string but not valid JSON ({exc.msg}). " + "Send a JSON array of `{description, subagent_type}` objects." + ) + logger.warning( + "task: `tasks` was a JSON-encoded string; parsed to %s. " + "Orchestrators should send a JSON array directly.", + type(parsed).__name__, + ) + return _coerce_batch_arg(parsed) + return ( + f"tasks: unsupported type {type(tasks).__name__}; expected an array " + "of `{description, subagent_type}` objects." + ) + + async def _adispatch_batch( + tasks: list[dict], runtime: ToolRuntime + ) -> Command | str: + """Fan-out helper for the ``tasks`` array shape. + + Bounded by :data:`MAX_SUBAGENT_BATCH_SIZE` and concurrency-capped + at :data:`DEFAULT_SUBAGENT_BATCH_CONCURRENCY`. Returns a single + :class:`Command` whose one aggregate ToolMessage holds a block per + child, each prefixed with ``[task <index>]`` so the LLM can map it + back to the input order. + """ + if not tasks: + return "tasks: array is empty; nothing to dispatch." + if len(tasks) > MAX_SUBAGENT_BATCH_SIZE: + return ( + f"tasks: too many children ({len(tasks)}); " + f"max is {MAX_SUBAGENT_BATCH_SIZE}. Split the batch."
+ ) + normalized: list[tuple[int, str, str]] = [] + for idx, item in enumerate(tasks): + if not isinstance(item, dict): + return ( + f"tasks[{idx}]: must be an object with description+subagent_type." + ) + description = item.get("description") + subagent_type = item.get("subagent_type") + if not isinstance(description, str) or not description.strip(): + return f"tasks[{idx}]: missing or empty 'description'." + if not isinstance(subagent_type, str) or not subagent_type.strip(): + return f"tasks[{idx}]: missing or empty 'subagent_type'." + normalized.append((idx, subagent_type.strip(), description)) + semaphore = asyncio.Semaphore(DEFAULT_SUBAGENT_BATCH_CONCURRENCY) + coros = [ + _ainvoke_one_batch_child( + task_index=idx, + subagent_type=subagent_type, + description=description, + runtime=runtime, + semaphore=semaphore, + ) + for idx, subagent_type, description in normalized + ] + results = await asyncio.gather(*coros) + return _merge_batch_results(list(results), runtime) + def task( description: Annotated[ - str, - "A detailed description of the task for the subagent to perform autonomously. Include all necessary context and specify the expected output format.", - ], + str | None, + "Single-mode: a detailed task description for the subagent. Required unless `tasks` is provided.", + ] = None, subagent_type: Annotated[ - str, - "The type of subagent to use. Must be one of the available agent types listed in the tool description.", - ], - runtime: ToolRuntime, + str | None, + "Single-mode: the type of subagent to use. Required unless `tasks` is provided.", + ] = None, + runtime: ToolRuntime = None, # type: ignore[assignment] + tasks: Annotated[ + list[dict] | None, + ( + "Batch-mode: array of `{description, subagent_type}` objects. " + "Synchronous path does not support batch mode; orchestrators " + "must use the async event loop to fan out." 
+ ), + ] = None, ) -> str | Command: + if tasks is not None: + return ( + "task: batch mode (`tasks=[...]`) is only supported on the async " + "path. SurfSense orchestrators always run in an event loop, so " + "this should never fire — file a bug if you see it." + ) + if not description or not subagent_type: + return ( + "task: must provide either single-mode (`description`+`subagent_type`) " + "or batch-mode (`tasks`)." + ) if subagent_type not in subagent_graphs: allowed_types = ", ".join([f"`{k}`" for k in subagent_graphs]) return ( @@ -284,16 +807,65 @@ def build_task_tool_with_parent_config( async def atask( description: Annotated[ - str, - "A detailed description of the task for the subagent to perform autonomously. Include all necessary context and specify the expected output format.", - ], + str | None, + "Single-mode: a detailed task description for the subagent. Required unless `tasks` is provided.", + ] = None, subagent_type: Annotated[ - str, - "The type of subagent to use. Must be one of the available agent types listed in the tool description.", - ], - runtime: ToolRuntime, + str | None, + "Single-mode: the type of subagent to use. Required unless `tasks` is provided.", + ] = None, + runtime: ToolRuntime = None, # type: ignore[assignment] + tasks: Annotated[ + list[dict] | None, + ( + "Batch-mode: array of `{description, subagent_type}` objects " + "to fan out concurrently (max " + f"{MAX_SUBAGENT_BATCH_SIZE}, concurrency " + f"{DEFAULT_SUBAGENT_BATCH_CONCURRENCY}). Mutually exclusive " + "with single-mode args. Batched children do not support " + "human-in-the-loop interrupts; re-dispatch as single mode " + "if a child needs approval." 
+ ), + ] = None, ) -> str | Command: atask_start = time.perf_counter() + # Kill switch: when ops flips the spawn-paused flag for this + # workspace, every ``task(...)`` invocation (single- or batch-mode) + # short-circuits with a clear ToolMessage so the orchestrator can + # tell the user what happened and stop hammering downstream APIs. + if await is_spawn_paused(search_space_id): + logger.warning( + "[hitl_route] atask SPAWN_PAUSED: search_space_id=%s tool_call_id=%s", + search_space_id, + runtime.tool_call_id, + ) + return ( + "task: subagent dispatch is currently paused for this workspace. " + "Acknowledge to the user that delegation is temporarily disabled " + "(ops kill switch); do not retry until the pause is lifted." + ) + if tasks is not None: + if description or subagent_type: + return ( + "task: cannot combine `tasks` with `description`/`subagent_type`. " + "Use either single-mode (description+subagent_type) or batch-mode (tasks)." + ) + if not runtime.tool_call_id: + raise ValueError("Tool call ID is required for subagent invocation") + coerced = _coerce_batch_arg(tasks) + if isinstance(coerced, str): + return coerced + logger.info( + "[hitl_route] atask BATCH ENTRY: size=%d tool_call_id=%s", + len(coerced), + runtime.tool_call_id, + ) + return await _adispatch_batch(coerced, runtime) + if not description or not subagent_type: + return ( + "task: must provide either single-mode (`description`+`subagent_type`) " + "or batch-mode (`tasks`)." 
+ ) logger.info( "[hitl_route] atask ENTRY: subagent_type=%r tool_call_id=%s", subagent_type, @@ -358,11 +930,37 @@ def build_task_tool_with_parent_config( subagent_type=subagent_type, path=invoke_path ) as sp: try: - result = await subagent.ainvoke( - build_resume_command(resume_value, pending_id), - config=sub_config, + result = await _ainvoke_with_timeout( + subagent.ainvoke( + build_resume_command(resume_value, pending_id), + config=sub_config, + ), + subagent_type=subagent_type, + started_at=ainvoke_start, ) sp.set_attribute("subagent.outcome", ainvoke_outcome) + except SubagentInvokeTimeoutError as exc: + ainvoke_outcome = "timeout" + sp.set_attribute("subagent.outcome", ainvoke_outcome) + ot_metrics.record_subagent_invoke_duration( + (time.perf_counter() - ainvoke_start) * 1000, + subagent_type=subagent_type, + path=invoke_path, + outcome=ainvoke_outcome, + ) + ot_metrics.record_subagent_invoke_outcome( + subagent_type=subagent_type, + path=invoke_path, + outcome=ainvoke_outcome, + ) + logger.warning( + "Subagent %r ainvoke (resume) timed out after %.1fs", + subagent_type, + exc.elapsed_seconds, + ) + return _synthesize_timeout_command( + exc, tool_call_id=runtime.tool_call_id + ) except GraphInterrupt as gi: ainvoke_outcome = "interrupted" sp.set_attribute("subagent.outcome", ainvoke_outcome) @@ -408,10 +1006,34 @@ def build_task_tool_with_parent_config( subagent_type=subagent_type, path=invoke_path ) as sp: try: - result = await subagent.ainvoke( - subagent_state, config=sub_config + result = await _ainvoke_with_timeout( + subagent.ainvoke(subagent_state, config=sub_config), + subagent_type=subagent_type, + started_at=ainvoke_start, ) sp.set_attribute("subagent.outcome", ainvoke_outcome) + except SubagentInvokeTimeoutError as exc: + ainvoke_outcome = "timeout" + sp.set_attribute("subagent.outcome", ainvoke_outcome) + ot_metrics.record_subagent_invoke_duration( + (time.perf_counter() - ainvoke_start) * 1000, + subagent_type=subagent_type, + 
path=invoke_path, + outcome=ainvoke_outcome, + ) + ot_metrics.record_subagent_invoke_outcome( + subagent_type=subagent_type, + path=invoke_path, + outcome=ainvoke_outcome, + ) + logger.warning( + "Subagent %r ainvoke (fresh) timed out after %.1fs", + subagent_type, + exc.elapsed_seconds, + ) + return _synthesize_timeout_command( + exc, tool_call_id=runtime.tool_call_id + ) except GraphInterrupt as gi: ainvoke_outcome = "interrupted" sp.set_attribute("subagent.outcome", ainvoke_outcome) @@ -481,7 +1103,7 @@ def build_task_tool_with_parent_config( path=invoke_path, outcome=ainvoke_outcome, ) - return cmd + return _attach_billable(cmd, subagent_type, runtime) return StructuredTool.from_function( name="task", diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/kb_context_projection.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/kb_context_projection.py index e8a4c9899..2685d8a9b 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/kb_context_projection.py +++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/kb_context_projection.py @@ -52,9 +52,7 @@ class KbContextProjectionMiddleware(AgentMiddleware): # type: ignore[type-arg] messages.insert(insert_at, SystemMessage(content=tree_text)) priority_count = 0 if priority: - priority_count = ( - len(priority) if hasattr(priority, "__len__") else 1 - ) + priority_count = len(priority) if hasattr(priority, "__len__") else 1 messages.insert(insert_at, _render_priority_message(priority)) _perf_log.info( "[kb_context_projection] tree_chars=%d priority_items=%d elapsed=%.3fs", diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/request.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/request.py index a725bfee1..3db51883d 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/request.py +++ 
b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/request.py @@ -17,8 +17,7 @@ from langchain_core.tools import BaseTool from langgraph.types import interrupt from app.agents.new_chat.permissions import Rule -from app.observability import metrics as ot_metrics -from app.observability import otel as ot +from app.observability import metrics as ot_metrics, otel as ot from .decision import normalize_permission_decision from .payload import PERMISSION_ASK_INTERRUPT_TYPE, build_permission_ask_payload diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/stack.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/stack.py index c1ebe31ca..3b20d8915 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/middleware/stack.py +++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/stack.py @@ -173,6 +173,7 @@ def build_main_agent_deepagent_middleware( subagents=subagents, system_prompt=None, task_description=TASK_TOOL_DESCRIPTION, + search_space_id=search_space_id, ), resilience.model_call_limit, resilience.tool_call_limit, diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/system_prompt.md index c44f131bb..413791037 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/system_prompt.md +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/system_prompt.md @@ -42,14 +42,16 @@ Return **only** one JSON object (no markdown/prose): "evidence": { "artifact_type": "report" | "podcast" | "video_presentation" | "resume" | "image" | null, "artifact_id": string | null, - "artifact_location": string | null + "artifact_location": string | null, + "receipts": Receipt[] | null }, "next_step": string | null, "missing_fields": string[] | null, "assumptions": string[] | null } -Rules: -- `status=success` -> `next_step=null`, 
`missing_fields=null`. -- `status=partial|blocked|error` -> `next_step` must be non-null. -- `status=blocked` due to missing required inputs -> `missing_fields` must be non-null. +Route-specific rules: +- `evidence.receipts` quotes the Receipt(s) returned by `generate_report` / `generate_podcast` / `generate_video_presentation` / `generate_resume` / `generate_image` this turn, verbatim. The Receipt's `type` enum is one of `report` | `podcast` | `video_presentation` | `resume` | `image`. +<include snippet="output_contract_base"/> </output_contract> + +<include snippet="verifiable_handle"/> diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/generate_image.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/generate_image.py index ab9dbc0ea..f170a35db 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/generate_image.py +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/generate_image.py @@ -4,11 +4,15 @@ import hashlib import logging from typing import Any +from langchain.tools import ToolRuntime from langchain_core.tools import tool +from langgraph.types import Command from litellm import aimage_generation from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession +from app.agents.shared.receipt import make_receipt +from app.agents.shared.receipt_command import with_receipt from app.config import config from app.db import ( ImageGeneration, @@ -66,8 +70,9 @@ def create_generate_image_tool( @tool async def generate_image( prompt: str, + runtime: ToolRuntime, n: int = 1, - ) -> dict[str, Any]: + ) -> Command: """ Generate an image from a text description using AI image models. @@ -82,6 +87,21 @@ def create_generate_image_tool( Returns: A dictionary containing the generated image(s) for display in the chat. 
""" + + def _failed(payload: dict[str, Any], *, error: str) -> Command: + return with_receipt( + payload=payload, + receipt=make_receipt( + route="deliverables", + type="image", + operation="generate", + status="failed", + preview=prompt[:200] if prompt else None, + error=error, + ), + tool_call_id=runtime.tool_call_id, + ) + try: # Use a per-call session so concurrent tool calls don't share an # AsyncSession (which is not concurrency-safe). The streaming @@ -93,7 +113,10 @@ def create_generate_image_tool( ) search_space = result.scalars().first() if not search_space: - return {"error": "Search space not found"} + return _failed( + {"error": "Search space not found"}, + error="Search space not found", + ) config_id = ( search_space.image_generation_config_id or IMAGE_GEN_AUTO_MODE_ID @@ -112,19 +135,19 @@ def create_generate_image_tool( # Call litellm based on config type if is_image_gen_auto_mode(config_id): if not ImageGenRouterService.is_initialized(): - return { - "error": "No image generation models configured. " + err = ( + "No image generation models configured. " "Please add an image model in Settings > Image Models." 
- } + ) + return _failed({"error": err}, error=err) response = await ImageGenRouterService.aimage_generation( prompt=prompt, model="auto", **gen_kwargs ) elif config_id < 0: cfg = _get_global_image_gen_config(config_id) if not cfg: - return { - "error": f"Image generation config {config_id} not found" - } + err = f"Image generation config {config_id} not found" + return _failed({"error": err}, error=err) model_string = _build_model_string( cfg.get("provider", ""), @@ -151,9 +174,8 @@ def create_generate_image_tool( ) db_cfg = cfg_result.scalars().first() if not db_cfg: - return { - "error": f"Image generation config {config_id} not found" - } + err = f"Image generation config {config_id} not found" + return _failed({"error": err}, error=err) model_string = _build_model_string( db_cfg.provider.value, @@ -200,7 +222,10 @@ def create_generate_image_tool( # Extract image URLs from response images = response_dict.get("data", []) if not images: - return {"error": "No images were generated"} + return _failed( + {"error": "No images were generated"}, + error="No images were generated", + ) first_image = images[0] revised_prompt = first_image.get("revised_prompt", prompt) @@ -219,11 +244,14 @@ def create_generate_image_tool( f"{db_image_gen_id}/image?token={access_token}" ) else: - return {"error": "No displayable image data in the response"} + return _failed( + {"error": "No displayable image data in the response"}, + error="No displayable image data in the response", + ) image_id = f"image-{hashlib.md5(image_url.encode()).hexdigest()[:12]}" - return { + payload = { "id": image_id, "assetId": image_url, "src": image_url, @@ -236,12 +264,26 @@ def create_generate_image_tool( "prompt": prompt, "image_count": len(images), } + return with_receipt( + payload=payload, + receipt=make_receipt( + route="deliverables", + type="image", + operation="generate", + status="success", + external_id=str(db_image_gen_id), + verifiable_url=image_url, + preview=(revised_prompt or 
prompt)[:200], + ), + tool_call_id=runtime.tool_call_id, + ) except Exception as e: logger.exception("Image generation failed in tool") - return { - "error": f"Image generation failed: {e!s}", - "prompt": prompt, - } + err = f"Image generation failed: {e!s}" + return _failed( + {"error": err, "prompt": prompt}, + error=err, + ) return generate_image diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/podcast.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/podcast.py index 55d9b3565..84617d38b 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/podcast.py +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/podcast.py @@ -1,12 +1,28 @@ -"""Factory for a podcast-generation tool that queues background work and returns an ID for polling.""" +"""Factory for a podcast-generation tool. +Dispatches the heavy generation to Celery and then polls the podcast row +until it reaches a terminal status (READY/FAILED). The tool always +returns a real terminal ``Receipt`` — never a pending one. The wait is +bounded by the existing per-invocation safety net +(``SURFSENSE_SUBAGENT_INVOKE_TIMEOUT_SECONDS`` in multi-agent mode, +HTTP / process lifetime in single-agent mode). 
+""" + +import logging from typing import Any +from langchain.tools import ToolRuntime from langchain_core.tools import tool +from langgraph.types import Command from sqlalchemy.ext.asyncio import AsyncSession +from app.agents.shared.deliverable_wait import wait_for_deliverable +from app.agents.shared.receipt import make_receipt +from app.agents.shared.receipt_command import with_receipt from app.db import Podcast, PodcastStatus, shielded_async_session +logger = logging.getLogger(__name__) + def create_generate_podcast_tool( search_space_id: int, @@ -19,9 +35,10 @@ def create_generate_podcast_tool( @tool async def generate_podcast( source_content: str, + runtime: ToolRuntime, podcast_title: str = "SurfSense Podcast", user_prompt: str | None = None, - ) -> dict[str, Any]: + ) -> Command: """ Generate a podcast from the provided content. @@ -70,23 +87,101 @@ def create_generate_podcast_tool( user_prompt=user_prompt, ) - print(f"[generate_podcast] Created podcast {podcast_id}, task: {task.id}") + logger.info( + "[generate_podcast] Created podcast %s, task: %s", + podcast_id, + task.id, + ) - return { - "status": PodcastStatus.PENDING.value, + # Wait until the Celery worker flips the row to a terminal + # state. The wait is bounded only by the subagent invoke + # timeout (multi-agent) or HTTP lifetime (single-agent) — + # see app.agents.shared.deliverable_wait for details. 
+ terminal_status, columns, elapsed = await wait_for_deliverable( + model=Podcast, + row_id=podcast_id, + columns=[Podcast.status, Podcast.file_location], + terminal_statuses={PodcastStatus.READY, PodcastStatus.FAILED}, + ) + + if terminal_status == PodcastStatus.READY: + file_location = columns[1] if columns else None + logger.info( + "[generate_podcast] Podcast %s READY in %.2fs (file=%s)", + podcast_id, + elapsed, + file_location, + ) + payload: dict[str, Any] = { + "status": PodcastStatus.READY.value, + "podcast_id": podcast_id, + "title": podcast_title, + "file_location": file_location, + "message": ( + "Podcast generated and saved to your podcast panel." + ), + } + return with_receipt( + payload=payload, + receipt=make_receipt( + route="deliverables", + type="podcast", + operation="generate", + status="success", + external_id=str(podcast_id), + preview=podcast_title, + ), + tool_call_id=runtime.tool_call_id, + ) + + # Only other terminal state is FAILED. + logger.warning( + "[generate_podcast] Podcast %s FAILED in %.2fs", + podcast_id, + elapsed, + ) + err = "Background worker reported FAILED status for this podcast." + payload = { + "status": PodcastStatus.FAILED.value, "podcast_id": podcast_id, "title": podcast_title, - "message": "Podcast generation started. 
This may take a few minutes.", + "error": err, } + return with_receipt( + payload=payload, + receipt=make_receipt( + route="deliverables", + type="podcast", + operation="generate", + status="failed", + external_id=str(podcast_id), + preview=podcast_title, + error=err, + ), + tool_call_id=runtime.tool_call_id, + ) except Exception as e: error_message = str(e) - print(f"[generate_podcast] Error: {error_message}") - return { + logger.exception("[generate_podcast] Error: %s", error_message) + payload = { "status": PodcastStatus.FAILED.value, "error": error_message, "title": podcast_title, "podcast_id": None, } + receipt = make_receipt( + route="deliverables", + type="podcast", + operation="generate", + status="failed", + preview=podcast_title, + error=error_message, + ) + return with_receipt( + payload=payload, + receipt=receipt, + tool_call_id=runtime.tool_call_id, + ) return generate_podcast diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/report.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/report.py index 385100c62..f12ca8a90 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/report.py +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/report.py @@ -6,10 +6,14 @@ import logging import re from typing import Any +from langchain.tools import ToolRuntime from langchain_core.callbacks import dispatch_custom_event from langchain_core.messages import HumanMessage from langchain_core.tools import tool +from langgraph.types import Command +from app.agents.shared.receipt import make_receipt +from app.agents.shared.receipt_command import with_receipt from app.db import Report, shielded_async_session from app.services.connector_service import ConnectorService from app.services.llm_service import get_document_summary_llm @@ -573,13 +577,14 @@ def create_generate_report_tool( @tool async def 
generate_report( topic: str, + runtime: ToolRuntime, source_content: str = "", source_strategy: str = "provided", search_queries: list[str] | None = None, report_style: str = "detailed", user_instructions: str | None = None, parent_report_id: int | None = None, - ) -> dict[str, Any]: + ) -> Command: """ Generate a structured Markdown report artifact from provided content. @@ -692,6 +697,23 @@ def create_generate_report_tool( parent_report_content: str | None = None report_group_id: int | None = None + def _failed(payload: dict[str, Any], *, error: str) -> Command: + return with_receipt( + payload=payload, + receipt=make_receipt( + route="deliverables", + type="report", + operation="generate", + status="failed", + external_id=str(payload.get("report_id")) + if payload.get("report_id") is not None + else None, + preview=topic, + error=error, + ), + tool_call_id=runtime.tool_call_id, + ) + async def _save_failed_report(error_msg: str) -> int | None: """Persist a failed report row using a short-lived session.""" try: @@ -753,12 +775,15 @@ def create_generate_report_tool( "No LLM configured. Please configure a language model in Settings." 
) report_id = await _save_failed_report(error_msg) - return { - "status": "failed", - "error": error_msg, - "report_id": report_id, - "title": topic, - } + return _failed( + { + "status": "failed", + "error": error_msg, + "report_id": report_id, + "title": topic, + }, + error=error_msg, + ) # Build the user instructions string user_instructions_section = "" @@ -971,12 +996,15 @@ def create_generate_report_tool( if not report_content or not isinstance(report_content, str): error_msg = "LLM returned empty or invalid content" report_id = await _save_failed_report(error_msg) - return { - "status": "failed", - "error": error_msg, - "report_id": report_id, - "title": topic, - } + return _failed( + { + "status": "failed", + "error": error_msg, + "report_id": report_id, + "title": topic, + }, + error=error_msg, + ) # LLMs often wrap output in ```markdown ... ``` fences — strip them report_content = _strip_wrapping_code_fences(report_content) @@ -984,12 +1012,15 @@ def create_generate_report_tool( if not report_content: error_msg = "LLM returned empty or invalid content" report_id = await _save_failed_report(error_msg) - return { - "status": "failed", - "error": error_msg, - "report_id": report_id, - "title": topic, - } + return _failed( + { + "status": "failed", + "error": error_msg, + "report_id": report_id, + "title": topic, + }, + error=error_msg, + ) # Strip any existing footer(s) carried over from parent version(s) while report_content.rstrip().endswith(_REPORT_FOOTER): @@ -1036,7 +1067,7 @@ def create_generate_report_tool( f"{metadata.get('section_count', 0)} sections" ) - return { + payload: dict[str, Any] = { "status": "ready", "report_id": saved_report_id, "title": topic, @@ -1045,17 +1076,32 @@ def create_generate_report_tool( "report_markdown": report_content, "message": f"Report generated successfully: {topic}", } + receipt = make_receipt( + route="deliverables", + type="report", + operation="generate", + status="success", + external_id=str(saved_report_id), + 
preview=topic, + ) + return with_receipt( + payload=payload, + receipt=receipt, + tool_call_id=runtime.tool_call_id, + ) except Exception as e: error_message = str(e) logger.exception(f"[generate_report] Error: {error_message}") report_id = await _save_failed_report(error_message) - - return { - "status": "failed", - "error": error_message, - "report_id": report_id, - "title": topic, - } + return _failed( + { + "status": "failed", + "error": error_message, + "report_id": report_id, + "title": topic, + }, + error=error_message, + ) return generate_report diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/resume.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/resume.py index ece3ce241..ad16b7ba7 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/resume.py +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/resume.py @@ -8,10 +8,14 @@ from typing import Any import pypdf import typst +from langchain.tools import ToolRuntime from langchain_core.callbacks import dispatch_custom_event from langchain_core.messages import HumanMessage from langchain_core.tools import tool +from langgraph.types import Command +from app.agents.shared.receipt import make_receipt +from app.agents.shared.receipt_command import with_receipt from app.db import Report, shielded_async_session from app.services.llm_service import get_document_summary_llm @@ -429,10 +433,11 @@ def create_generate_resume_tool( @tool async def generate_resume( user_info: str, + runtime: ToolRuntime, user_instructions: str | None = None, parent_report_id: int | None = None, max_pages: int = 1, - ) -> dict[str, Any]: + ) -> Command: """ Generate a professional resume as a Typst document. 
@@ -476,6 +481,41 @@ def create_generate_resume_tool( template = _get_template() llm_reference = _build_llm_reference(template) + def _success(payload: dict[str, Any], *, report_id: int, title: str) -> Command: + return with_receipt( + payload=payload, + receipt=make_receipt( + route="deliverables", + type="resume", + operation="generate", + status="success", + external_id=str(report_id), + preview=title, + ), + tool_call_id=runtime.tool_call_id, + ) + + def _failed( + payload: dict[str, Any], + *, + report_id: int | None, + error: str, + title: str = "Resume", + ) -> Command: + return with_receipt( + payload=payload, + receipt=make_receipt( + route="deliverables", + type="resume", + operation="generate", + status="failed", + external_id=str(report_id) if report_id is not None else None, + preview=title, + error=error, + ), + tool_call_id=runtime.tool_call_id, + ) + async def _save_failed_report(error_msg: str) -> int | None: try: async with shielded_async_session() as session: @@ -514,13 +554,17 @@ def create_generate_resume_tool( except ValueError as e: error_msg = str(e) report_id = await _save_failed_report(error_msg) - return { - "status": "failed", - "error": error_msg, - "report_id": report_id, - "title": "Resume", - "content_type": "typst", - } + return _failed( + { + "status": "failed", + "error": error_msg, + "report_id": report_id, + "title": "Resume", + "content_type": "typst", + }, + report_id=report_id, + error=error_msg, + ) # ── Phase 1: READ ───────────────────────────────────────────── async with shielded_async_session() as read_session: @@ -541,13 +585,17 @@ def create_generate_resume_tool( "No LLM configured. Please configure a language model in Settings." 
) report_id = await _save_failed_report(error_msg) - return { - "status": "failed", - "error": error_msg, - "report_id": report_id, - "title": "Resume", - "content_type": "typst", - } + return _failed( + { + "status": "failed", + "error": error_msg, + "report_id": report_id, + "title": "Resume", + "content_type": "typst", + }, + report_id=report_id, + error=error_msg, + ) # ── Phase 2: LLM GENERATION ─────────────────────────────────── @@ -588,13 +636,17 @@ def create_generate_resume_tool( if not body or not isinstance(body, str): error_msg = "LLM returned empty or invalid content" report_id = await _save_failed_report(error_msg) - return { - "status": "failed", - "error": error_msg, - "report_id": report_id, - "title": "Resume", - "content_type": "typst", - } + return _failed( + { + "status": "failed", + "error": error_msg, + "report_id": report_id, + "title": "Resume", + "content_type": "typst", + }, + report_id=report_id, + error=error_msg, + ) body = _strip_typst_fences(body) body = _strip_imports(body) @@ -661,13 +713,17 @@ def create_generate_resume_tool( f"{compile_error or 'Unknown compile error'}" ) report_id = await _save_failed_report(error_msg) - return { - "status": "failed", - "error": error_msg, - "report_id": report_id, - "title": "Resume", - "content_type": "typst", - } + return _failed( + { + "status": "failed", + "error": error_msg, + "report_id": report_id, + "title": "Resume", + "content_type": "typst", + }, + report_id=report_id, + error=error_msg, + ) actual_pages = _count_pdf_pages(pdf_bytes) if actual_pages <= validated_max_pages: @@ -700,13 +756,17 @@ def create_generate_resume_tool( ): error_msg = "LLM returned empty content while compressing resume" report_id = await _save_failed_report(error_msg) - return { - "status": "failed", - "error": error_msg, - "report_id": report_id, - "title": "Resume", - "content_type": "typst", - } + return _failed( + { + "status": "failed", + "error": error_msg, + "report_id": report_id, + "title": 
"Resume", + "content_type": "typst", + }, + report_id=report_id, + error=error_msg, + ) body = _strip_typst_fences(compress_response.content) body = _strip_imports(body) @@ -718,13 +778,17 @@ def create_generate_resume_tool( f"Hard limit: <= {MAX_RESUME_PAGES} page(s), actual: {actual_pages}." ) report_id = await _save_failed_report(error_msg) - return { - "status": "failed", - "error": error_msg, - "report_id": report_id, - "title": "Resume", - "content_type": "typst", - } + return _failed( + { + "status": "failed", + "error": error_msg, + "report_id": report_id, + "title": "Resume", + "content_type": "typst", + }, + report_id=report_id, + error=error_msg, + ) # ── Phase 4: SAVE ───────────────────────────────────────────── dispatch_custom_event( @@ -768,32 +832,40 @@ def create_generate_resume_tool( logger.info(f"[generate_resume] Created resume {saved_id}: {resume_title}") - return { - "status": "ready", - "report_id": saved_id, - "title": resume_title, - "content_type": "typst", - "is_revision": bool(parent_content), - "message": ( - f"Resume generated successfully: {resume_title}" - if target_page_met - else ( - f"Resume generated, but could not fit the target of <= {validated_max_pages} " - f"page(s). Final length: {actual_pages} page(s)." - ) - ), - } + return _success( + { + "status": "ready", + "report_id": saved_id, + "title": resume_title, + "content_type": "typst", + "is_revision": bool(parent_content), + "message": ( + f"Resume generated successfully: {resume_title}" + if target_page_met + else ( + f"Resume generated, but could not fit the target of <= {validated_max_pages} " + f"page(s). Final length: {actual_pages} page(s)." 
+ ) + ), + }, + report_id=saved_id, + title=resume_title, + ) except Exception as e: error_message = str(e) logger.exception(f"[generate_resume] Error: {error_message}") report_id = await _save_failed_report(error_message) - return { - "status": "failed", - "error": error_message, - "report_id": report_id, - "title": "Resume", - "content_type": "typst", - } + return _failed( + { + "status": "failed", + "error": error_message, + "report_id": report_id, + "title": "Resume", + "content_type": "typst", + }, + report_id=report_id, + error=error_message, + ) return generate_resume diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/video_presentation.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/video_presentation.py index a9f3447ab..8c52293de 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/video_presentation.py +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/video_presentation.py @@ -1,12 +1,29 @@ -"""Factory for a video-presentation tool that queues background work and returns an ID for polling.""" +"""Factory for a video-presentation tool. +Dispatches the heavy generation to Celery and then polls the +video-presentation row until it reaches a terminal status (READY/FAILED). +The tool always returns a real terminal ``Receipt`` — never a pending +one. The wait is bounded by the existing per-invocation safety net +(``SURFSENSE_SUBAGENT_INVOKE_TIMEOUT_SECONDS`` in multi-agent mode, +HTTP / process lifetime in single-agent mode). Video rendering can be +heavy; raise that ceiling if your generations routinely exceed it. 
+""" + +import logging from typing import Any +from langchain.tools import ToolRuntime from langchain_core.tools import tool +from langgraph.types import Command from sqlalchemy.ext.asyncio import AsyncSession +from app.agents.shared.deliverable_wait import wait_for_deliverable +from app.agents.shared.receipt import make_receipt +from app.agents.shared.receipt_command import with_receipt from app.db import VideoPresentation, VideoPresentationStatus, shielded_async_session +logger = logging.getLogger(__name__) + def create_generate_video_presentation_tool( search_space_id: int, @@ -19,9 +36,10 @@ def create_generate_video_presentation_tool( @tool async def generate_video_presentation( source_content: str, + runtime: ToolRuntime, video_title: str = "SurfSense Presentation", user_prompt: str | None = None, - ) -> dict[str, Any]: + ) -> Command: """Generate a video presentation from the provided content. Use this tool when the user asks to create a video, presentation, slides, or slide deck. @@ -56,25 +74,103 @@ def create_generate_video_presentation_tool( user_prompt=user_prompt, ) - print( - f"[generate_video_presentation] Created video presentation {video_pres_id}, task: {task.id}" + logger.info( + "[generate_video_presentation] Created video presentation %s, task: %s", + video_pres_id, + task.id, ) - return { - "status": VideoPresentationStatus.PENDING.value, + # Wait until the Celery worker flips the row to a terminal + # state. The wait is bounded only by the subagent invoke + # timeout (multi-agent) or HTTP lifetime (single-agent) — + # see app.agents.shared.deliverable_wait for details. 
+ terminal_status, _columns, elapsed = await wait_for_deliverable( + model=VideoPresentation, + row_id=video_pres_id, + columns=[VideoPresentation.status], + terminal_statuses={ + VideoPresentationStatus.READY, + VideoPresentationStatus.FAILED, + }, + ) + + if terminal_status == VideoPresentationStatus.READY: + logger.info( + "[generate_video_presentation] %s READY in %.2fs", + video_pres_id, + elapsed, + ) + payload: dict[str, Any] = { + "status": VideoPresentationStatus.READY.value, + "video_presentation_id": video_pres_id, + "title": video_title, + "message": "Video presentation generated and saved.", + } + return with_receipt( + payload=payload, + receipt=make_receipt( + route="deliverables", + type="video_presentation", + operation="generate", + status="success", + external_id=str(video_pres_id), + preview=video_title, + ), + tool_call_id=runtime.tool_call_id, + ) + + # Only other terminal state is FAILED. + logger.warning( + "[generate_video_presentation] %s FAILED in %.2fs", + video_pres_id, + elapsed, + ) + err = ( + "Background worker reported FAILED status for this " + "video presentation." + ) + payload = { + "status": VideoPresentationStatus.FAILED.value, "video_presentation_id": video_pres_id, "title": video_title, - "message": "Video presentation generation started. 
This may take a few minutes.", + "error": err, } + return with_receipt( + payload=payload, + receipt=make_receipt( + route="deliverables", + type="video_presentation", + operation="generate", + status="failed", + external_id=str(video_pres_id), + preview=video_title, + error=err, + ), + tool_call_id=runtime.tool_call_id, + ) except Exception as e: error_message = str(e) - print(f"[generate_video_presentation] Error: {error_message}") - return { + logger.exception( + "[generate_video_presentation] Error: %s", error_message + ) + payload = { "status": VideoPresentationStatus.FAILED.value, "error": error_message, "title": video_title, "video_presentation_id": None, } + return with_receipt( + payload=payload, + receipt=make_receipt( + route="deliverables", + type="video_presentation", + operation="generate", + status="failed", + preview=video_title, + error=error_message, + ), + tool_call_id=runtime.tool_call_id, + ) return generate_video_presentation diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md index 2ae21c271..c4e36fc73 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md @@ -150,11 +150,12 @@ Return **only** one JSON object (no markdown or prose outside it): } ``` -Rules: +<include snippet="output_contract_base"/> + +Route-specific rules: -- `status=success` → `next_step=null`, `missing_fields=null`. -- `status=partial|blocked|error` → `next_step` must be non-null. -- `status=blocked` due to missing required inputs → `missing_fields` must be non-null. - `evidence.content_excerpt`: max ~500 characters. Surface a short excerpt or a one-sentence summary, not the full file body. The supervisor already sees the tool's raw output. 
+<include snippet="verifiable_handle"/> + Infer before you call; map every tool outcome faithfully. diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md index b0f2dacb2..25dafa3df 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md @@ -117,11 +117,12 @@ Return **only** one JSON object (no markdown or prose outside it): } ``` -Rules: +<include snippet="output_contract_base"/> + +Route-specific rules: -- `status=success` → `next_step=null`, `missing_fields=null`. -- `status=partial|blocked|error` → `next_step` must be non-null. -- `status=blocked` due to missing required inputs → `missing_fields` must be non-null. - `evidence.content_excerpt`: max ~500 characters. Surface a short excerpt or a one-sentence summary, not the full file body. The supervisor already sees the tool's raw output. +<include snippet="verifiable_handle"/> + Infer before you call; map every tool outcome faithfully. diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/memory/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/memory/system_prompt.md index 13f7b68a5..b656c5019 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/memory/system_prompt.md +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/memory/system_prompt.md @@ -6,7 +6,7 @@ Persist durable preferences/facts/instructions with `update_memory` while avoidi </goal> <visibility_scope> -{{MEMORY_VISIBILITY_POLICY}} +Memory is search-space-scoped; do not assume cross-workspace visibility. 
</visibility_scope> <available_tools> @@ -53,10 +53,8 @@ Return **only** one JSON object (no markdown/prose): "missing_fields": string[] | null, "assumptions": string[] | null } -Rules: -- `status=success` -> `next_step=null`, `missing_fields=null`. -- `status=partial|blocked|error` -> `next_step` must be non-null. -- `status=blocked` due to missing required inputs -> `missing_fields` must be non-null. +<include snippet="output_contract_base"/> +Route-specific rules: - `evidence.memory_category` is a semantic classification for supervisor logs only. It is not the persisted storage format and must not force inline `[fact|preference|instruction]` markers into saved memory. diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/system_prompt.md index f1a22ddf1..3eabd8ee0 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/system_prompt.md +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/builtins/research/system_prompt.md @@ -46,10 +46,8 @@ Return **only** one JSON object (no markdown/prose): "missing_fields": string[] | null, "assumptions": string[] | null } -Rules: -- `status=success` -> `next_step=null`, `missing_fields=null`. -- `status=partial|blocked|error` -> `next_step` must be non-null. -- `status=blocked` due to missing required inputs -> `missing_fields` must be non-null. +<include snippet="output_contract_base"/> +Route-specific rules: - `evidence.findings`: max 10 entries, each a single sentence stating one distinct fact. Do not paste raw paragraphs, scraped pages, or quote blocks. - `evidence.sources`: max 10 URLs, one per finding when applicable. List each URL once. 
</output_contract> diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/system_prompt.md index 9434db7a1..e6a639af3 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/system_prompt.md +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/airtable/system_prompt.md @@ -92,12 +92,12 @@ Return **only** one JSON object (no markdown, no prose): "missing_fields": string[] | null, "assumptions": string[] | null } -Rules: -- `status=success` → `next_step=null`, `missing_fields=null`. -- `status=partial|blocked|error` → `next_step` must be non-null. -- `status=blocked` due to missing required inputs → `missing_fields` must be non-null. +<include snippet="output_contract_base"/> +Route-specific rules: - For blocked ambiguity, populate `evidence.matched_candidates` with up to 5 options (`id` + `label` — works for any kind of candidate: base, table, field, choice, record, etc.). - For discovery-only queries (lists), set `evidence.items` to `{ "total": N }` and list the matched items in `action_summary` (record id, primary-field value, and 1-2 most relevant fields; up to 10 entries, then `"...and N more"`). </output_contract> +<include snippet="verifiable_handle"/> + Discover before you mutate; never guess identifiers, choice IDs, or required fields. 
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/system_prompt.md index a663f5b37..9168f4d2b 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/system_prompt.md +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/calendar/system_prompt.md @@ -111,11 +111,12 @@ Return **only** one JSON object (no markdown or prose outside it): } ``` -Rules: -- `status=success` → `next_step=null`, `missing_fields=null`. -- `status=partial|blocked|error` → `next_step` must be non-null. -- `status=blocked` due to missing required inputs → `missing_fields` must be non-null. +<include snippet="output_contract_base"/> + +Route-specific rules: - For `search_calendar_events` results, set `evidence.items` to `{ "total": N }` and list the matched events in `action_summary` (title, date, start time; up to 10 entries, then `"...and N more"`). - For ambiguous matches across `update_calendar_event` / `delete_calendar_event`, populate `evidence.matched_candidates` with up to 5 options (`id` + `label`, where `label` should include the event title and start time for human readability). +<include snippet="verifiable_handle"/> + Infer before you call; map every tool outcome faithfully. 
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/system_prompt.md index 898197f14..029609670 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/system_prompt.md +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/clickup/system_prompt.md @@ -93,12 +93,12 @@ Return **only** one JSON object (no markdown, no prose): "missing_fields": string[] | null, "assumptions": string[] | null } -Rules: -- `status=success` → `next_step=null`, `missing_fields=null`. -- `status=partial|blocked|error` → `next_step` must be non-null. -- `status=blocked` due to missing required inputs → `missing_fields` must be non-null. +<include snippet="output_contract_base"/> +Route-specific rules: - For blocked ambiguity, populate `evidence.matched_candidates` with up to 5 options (`id` + `label` — works for any kind of candidate: task, list, member, status, custom-field choice, etc.). - For discovery-only queries (lists), set `evidence.items` to `{ "total": N }` and list the matched items in `action_summary` (task id, title, status, assignees; up to 10 entries, then `"...and N more"`). </output_contract> +<include snippet="verifiable_handle"/> + Discover before you mutate; never guess identifiers, list statuses, or assignees. 
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/system_prompt.md index 991ec3d03..5aa687cd0 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/system_prompt.md +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/confluence/system_prompt.md @@ -100,9 +100,8 @@ Return **only** one JSON object (no markdown or prose outside it): } ``` -Rules: -- `status=success` → `next_step=null`, `missing_fields=null`. -- `status=partial|blocked|error` → `next_step` must be non-null. -- `status=blocked` due to missing required inputs → `missing_fields` must be non-null. +<include snippet="output_contract_base"/> + +<include snippet="verifiable_handle"/> Infer before you call; map every tool outcome faithfully. diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/system_prompt.md index 249f9ec8b..aaabd2ac3 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/system_prompt.md +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/discord/system_prompt.md @@ -108,9 +108,8 @@ Return **only** one JSON object (no markdown or prose outside it): } ``` -Rules: -- `status=success` → `next_step=null`, `missing_fields=null`. -- `status=partial|blocked|error` → `next_step` must be non-null. -- `status=blocked` due to missing required inputs → `missing_fields` must be non-null. +<include snippet="output_contract_base"/> + +<include snippet="verifiable_handle"/> Resolve before you call; verify before you send; map every tool outcome faithfully. 
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/system_prompt.md index a963b0ec6..8e498dfdf 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/system_prompt.md +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/dropbox/system_prompt.md @@ -98,9 +98,8 @@ Return **only** one JSON object (no markdown or prose outside it): } ``` -Rules: -- `status=success` → `next_step=null`, `missing_fields=null`. -- `status=partial|blocked|error` → `next_step` must be non-null. -- `status=blocked` due to missing required inputs → `missing_fields` must be non-null. +<include snippet="output_contract_base"/> + +<include snippet="verifiable_handle"/> Infer before you call; map every tool outcome faithfully. diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/system_prompt.md index c04d69ad0..02aff5589 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/system_prompt.md +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/system_prompt.md @@ -110,11 +110,12 @@ Return **only** one JSON object (no markdown or prose outside it): } ``` -Rules: -- `status=success` → `next_step=null`, `missing_fields=null`. -- `status=partial|blocked|error` → `next_step` must be non-null. -- `status=blocked` due to missing required inputs → `missing_fields` must be non-null. +<include snippet="output_contract_base"/> + +Route-specific rules: - For `search_gmail` results, set `evidence.items` to `{ "total": N }` and list the matched emails in `action_summary` (sender, subject, date; up to 10 entries, then `"...and N more"`). 
- For ambiguous matches across `update_gmail_draft` / `trash_gmail_email` / `read_gmail_email`, populate `evidence.matched_candidates` with up to 5 options (`id` + `label`). +<include snippet="verifiable_handle"/> + Infer before you call; verify before you send; map every tool outcome faithfully. diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/send_email.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/send_email.py index 578233b57..0680e51cb 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/send_email.py +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/gmail/tools/send_email.py @@ -5,12 +5,16 @@ from datetime import datetime from email.mime.text import MIMEText from typing import Any +from langchain.tools import ToolRuntime from langchain_core.tools import tool +from langgraph.types import Command from sqlalchemy.ext.asyncio import AsyncSession from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import ( request_approval, ) +from app.agents.shared.receipt import make_receipt +from app.agents.shared.receipt_command import with_receipt from app.services.gmail import GmailToolMetadataService logger = logging.getLogger(__name__) @@ -26,9 +30,10 @@ def create_send_gmail_email_tool( to: str, subject: str, body: str, + runtime: ToolRuntime, cc: str | None = None, bcc: str | None = None, - ) -> dict[str, Any]: + ) -> Command: """Send an email via Gmail. Use when the user explicitly asks to send an email. 
This sends the @@ -60,11 +65,34 @@ def create_send_gmail_email_tool( """ logger.info(f"send_gmail_email called: to='{to}', subject='{subject}'") + def _emit( + payload: dict[str, Any], + *, + success: bool, + external_id: str | None = None, + error: str | None = None, + ) -> Command: + return with_receipt( + payload=payload, + receipt=make_receipt( + route="gmail", + type="message", + operation="send", + status="success" if success else "failed", + external_id=external_id, + preview=f"to={to}: {subject}"[:200], + error=error, + ), + tool_call_id=runtime.tool_call_id, + ) + if db_session is None or search_space_id is None or user_id is None: - return { - "status": "error", - "message": "Gmail tool not properly configured. Please contact support.", - } + msg = "Gmail tool not properly configured. Please contact support." + return _emit( + {"status": "error", "message": msg}, + success=False, + error=msg, + ) try: metadata_service = GmailToolMetadataService(db_session) @@ -74,16 +102,24 @@ def create_send_gmail_email_tool( if "error" in context: logger.error(f"Failed to fetch creation context: {context['error']}") - return {"status": "error", "message": context["error"]} + return _emit( + {"status": "error", "message": context["error"]}, + success=False, + error=context["error"], + ) accounts = context.get("accounts", []) if accounts and all(a.get("auth_expired") for a in accounts): logger.warning("All Gmail accounts have expired authentication") - return { - "status": "auth_error", - "message": "All connected Gmail accounts need re-authentication. Please re-authenticate in your connector settings.", - "connector_type": "gmail", - } + return _emit( + { + "status": "auth_error", + "message": "All connected Gmail accounts need re-authentication. 
Please re-authenticate in your connector settings.", + "connector_type": "gmail", + }, + success=False, + error="auth_expired", + ) logger.info( f"Requesting approval for sending Gmail email: to='{to}', subject='{subject}'" @@ -103,10 +139,14 @@ def create_send_gmail_email_tool( ) if result.rejected: - return { - "status": "rejected", - "message": "User declined. The email was not sent. Do not ask again or suggest alternatives.", - } + return _emit( + { + "status": "rejected", + "message": "User declined. The email was not sent. Do not ask again or suggest alternatives.", + }, + success=False, + error="user_rejected", + ) final_to = result.params.get("to", to) final_subject = result.params.get("subject", subject) @@ -135,10 +175,14 @@ def create_send_gmail_email_tool( ) connector = result.scalars().first() if not connector: - return { - "status": "error", - "message": "Selected Gmail connector is invalid or has been disconnected.", - } + msg = ( + "Selected Gmail connector is invalid or has been disconnected." + ) + return _emit( + {"status": "error", "message": msg}, + success=False, + error=msg, + ) actual_connector_id = connector.id else: result = await db_session.execute( @@ -150,10 +194,12 @@ def create_send_gmail_email_tool( ) connector = result.scalars().first() if not connector: - return { - "status": "error", - "message": "No Gmail connector found. Please connect Gmail in your workspace settings.", - } + msg = "No Gmail connector found. Please connect Gmail in your workspace settings." + return _emit( + {"status": "error", "message": msg}, + success=False, + error=msg, + ) actual_connector_id = connector.id logger.info( @@ -166,10 +212,12 @@ def create_send_gmail_email_tool( ): cca_id = connector.config.get("composio_connected_account_id") if not cca_id: - return { - "status": "error", - "message": "Composio connected account ID not found for this Gmail connector.", - } + msg = "Composio connected account ID not found for this Gmail connector." 
+ return _emit( + {"status": "error", "message": msg}, + success=False, + error=msg, + ) from app.services.composio_service import ComposioService @@ -187,7 +235,11 @@ def create_send_gmail_email_tool( bcc=final_bcc, ) if error: - return {"status": "error", "message": error} + return _emit( + {"status": "error", "message": error}, + success=False, + error=error, + ) sent = {"id": sent_message_id, "threadId": sent_thread_id} else: from google.oauth2.credentials import Credentials @@ -275,11 +327,15 @@ def create_send_gmail_email_tool( actual_connector_id, exc_info=True, ) - return { - "status": "insufficient_permissions", - "connector_id": actual_connector_id, - "message": "This Gmail account needs additional permissions. Please re-authenticate in connector settings.", - } + return _emit( + { + "status": "insufficient_permissions", + "connector_id": actual_connector_id, + "message": "This Gmail account needs additional permissions. Please re-authenticate in connector settings.", + }, + success=False, + error="insufficient_permissions", + ) raise logger.info( @@ -310,12 +366,16 @@ def create_send_gmail_email_tool( logger.warning(f"KB sync after send failed: {kb_err}") kb_message_suffix = " This email will be added to your knowledge base in the next scheduled sync." 
- return { - "status": "success", - "message_id": sent.get("id"), - "thread_id": sent.get("threadId"), - "message": f"Successfully sent email to '{final_to}' with subject '{final_subject}'.{kb_message_suffix}", - } + return _emit( + { + "status": "success", + "message_id": sent.get("id"), + "thread_id": sent.get("threadId"), + "message": f"Successfully sent email to '{final_to}' with subject '{final_subject}'.{kb_message_suffix}", + }, + success=True, + external_id=sent.get("id"), + ) except Exception as e: from langgraph.errors import GraphInterrupt @@ -324,9 +384,11 @@ def create_send_gmail_email_tool( raise logger.error(f"Error sending Gmail email: {e}", exc_info=True) - return { - "status": "error", - "message": "Something went wrong while sending the email. Please try again.", - } + msg = "Something went wrong while sending the email. Please try again." + return _emit( + {"status": "error", "message": msg}, + success=False, + error=str(e), + ) return send_gmail_email diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/system_prompt.md index b78e1f7c6..10140d842 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/system_prompt.md +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/google_drive/system_prompt.md @@ -100,9 +100,8 @@ Return **only** one JSON object (no markdown or prose outside it): } ``` -Rules: -- `status=success` → `next_step=null`, `missing_fields=null`. -- `status=partial|blocked|error` → `next_step` must be non-null. -- `status=blocked` due to missing required inputs → `missing_fields` must be non-null. +<include snippet="output_contract_base"/> + +<include snippet="verifiable_handle"/> Infer before you call; map every tool outcome faithfully. 
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/system_prompt.md index 4dcc56454..d7816dead 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/system_prompt.md +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/jira/system_prompt.md @@ -111,12 +111,12 @@ Return **only** one JSON object (no markdown, no prose): "missing_fields": string[] | null, "assumptions": string[] | null } -Rules: -- `status=success` → `next_step=null`, `missing_fields=null`. -- `status=partial|blocked|error` → `next_step` must be non-null. -- `status=blocked` due to missing required inputs → `missing_fields` must be non-null. +<include snippet="output_contract_base"/> +Route-specific rules: - For blocked ambiguity, populate `evidence.matched_candidates` with up to 5 options (`id` + `label` — works for any kind of candidate: site, project, issue, user, transition, etc.). - For discovery-only queries (lists), set `evidence.items` to `{ "total": N }` and list the matched items in `action_summary` (issue key, summary, status, assignee; up to 10 entries, then `"...and N more"`). </output_contract> +<include snippet="verifiable_handle"/> + Discover before you mutate; never guess identifiers, transitions, or required fields. 
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/system_prompt.md index 1d96a4105..5dfd29112 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/system_prompt.md +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/linear/system_prompt.md @@ -101,12 +101,12 @@ Return **only** one JSON object (no markdown, no prose): "missing_fields": string[] | null, "assumptions": string[] | null } -Rules: -- `status=success` → `next_step=null`, `missing_fields=null`. -- `status=partial|blocked|error` → `next_step` must be non-null. -- `status=blocked` due to missing required inputs → `missing_fields` must be non-null. +<include snippet="output_contract_base"/> +Route-specific rules: - For blocked ambiguity, populate `evidence.matched_candidates` with up to 5 options (`id` + `label` — works for any kind of candidate: issue, user, project, state, etc.). - For discovery-only queries (lists), set `evidence.items` to `{ "total": N }` and list the matched items in `action_summary` (identifier, title, state, assignee; up to 10 entries, then `"...and N more"`). </output_contract> +<include snippet="verifiable_handle"/> + Discover before you mutate; never guess identifiers. diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/system_prompt.md index 0f42161b3..e483789d5 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/system_prompt.md +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/luma/system_prompt.md @@ -101,9 +101,8 @@ Return **only** one JSON object (no markdown or prose outside it): } ``` -Rules: -- `status=success` → `next_step=null`, `missing_fields=null`. 
-- `status=partial|blocked|error` → `next_step` must be non-null. -- `status=blocked` due to missing required inputs → `missing_fields` must be non-null. +<include snippet="output_contract_base"/> + +<include snippet="verifiable_handle"/> Infer before you call; verify before you create; map every tool outcome faithfully. diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/system_prompt.md index b38c30167..909c72471 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/system_prompt.md +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/system_prompt.md @@ -99,9 +99,8 @@ Return **only** one JSON object (no markdown or prose outside it): } ``` -Rules: -- `status=success` → `next_step=null`, `missing_fields=null`. -- `status=partial|blocked|error` → `next_step` must be non-null. -- `status=blocked` due to missing required inputs → `missing_fields` must be non-null. +<include snippet="output_contract_base"/> + +<include snippet="verifiable_handle"/> Infer before you call; map every tool outcome faithfully. 
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/tools/delete_page.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/tools/delete_page.py index 85d0ef22e..c98b25811 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/tools/delete_page.py +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/notion/tools/delete_page.py @@ -1,12 +1,16 @@ import logging from typing import Any +from langchain.tools import ToolRuntime from langchain_core.tools import tool +from langgraph.types import Command from sqlalchemy.ext.asyncio import AsyncSession from app.agents.multi_agent_chat.subagents.shared.hitl.approvals.self_gated import ( request_approval, ) +from app.agents.shared.receipt import make_receipt +from app.agents.shared.receipt_command import with_receipt from app.connectors.notion_history import NotionAPIError, NotionHistoryConnector from app.services.notion.tool_metadata_service import NotionToolMetadataService @@ -35,8 +39,9 @@ def create_delete_notion_page_tool( @tool async def delete_notion_page( page_title: str, + runtime: ToolRuntime, delete_from_kb: bool = False, - ) -> dict[str, Any]: + ) -> Command: """Delete (archive) a Notion page. 
Use this tool when the user asks you to delete, remove, or archive @@ -65,14 +70,39 @@ def create_delete_notion_page_tool( f"delete_notion_page called: page_title='{page_title}', delete_from_kb={delete_from_kb}" ) + def _emit( + payload: dict[str, Any], + *, + status: str, + external_id: str | None = None, + error: str | None = None, + ) -> Command: + return with_receipt( + payload=payload, + receipt=make_receipt( + route="notion", + type="page", + operation="delete", + status="success" if status == "success" else "failed", + external_id=external_id, + preview=page_title, + error=error, + ), + tool_call_id=runtime.tool_call_id, + ) + if db_session is None or search_space_id is None or user_id is None: logger.error( "Notion tool not properly configured - missing required parameters" ) - return { - "status": "error", - "message": "Notion tool not properly configured. Please contact support.", - } + return _emit( + { + "status": "error", + "message": "Notion tool not properly configured. Please contact support.", + }, + status="error", + error="Notion tool not properly configured. 
Please contact support.", + ) try: # Get page context (page_id, account, title) from indexed data @@ -86,16 +116,18 @@ def create_delete_notion_page_tool( # Check if it's a "not found" error (softer handling for LLM) if "not found" in error_msg.lower(): logger.warning(f"Page not found: {error_msg}") - return { - "status": "not_found", - "message": error_msg, - } + return _emit( + {"status": "not_found", "message": error_msg}, + status="error", + error=error_msg, + ) else: logger.error(f"Failed to fetch delete context: {error_msg}") - return { - "status": "error", - "message": error_msg, - } + return _emit( + {"status": "error", "message": error_msg}, + status="error", + error=error_msg, + ) account = context.get("account", {}) if account.get("auth_expired"): @@ -103,10 +135,14 @@ def create_delete_notion_page_tool( "Notion account %s has expired authentication", account.get("id"), ) - return { - "status": "auth_error", - "message": "The Notion account for this page needs re-authentication. Please re-authenticate in your connector settings.", - } + return _emit( + { + "status": "auth_error", + "message": "The Notion account for this page needs re-authentication. Please re-authenticate in your connector settings.", + }, + status="error", + error="auth_expired", + ) page_id = context.get("page_id") connector_id_from_context = account.get("id") @@ -129,10 +165,14 @@ def create_delete_notion_page_tool( if result.rejected: logger.info("Notion page deletion rejected by user") - return { - "status": "rejected", - "message": "User declined. Do not retry or suggest alternatives.", - } + return _emit( + { + "status": "rejected", + "message": "User declined. 
Do not retry or suggest alternatives.", + }, + status="error", + error="user_rejected", + ) final_page_id = result.params.get("page_id", page_id) final_connector_id = result.params.get( @@ -165,18 +205,26 @@ def create_delete_notion_page_tool( logger.error( f"Invalid connector_id={final_connector_id} for search_space_id={search_space_id}" ) - return { - "status": "error", - "message": "Selected Notion account is invalid or has been disconnected. Please select a valid account.", - } + return _emit( + { + "status": "error", + "message": "Selected Notion account is invalid or has been disconnected. Please select a valid account.", + }, + status="error", + error="invalid_connector", + ) actual_connector_id = connector.id logger.info(f"Validated Notion connector: id={actual_connector_id}") else: logger.error("No connector found for this page") - return { - "status": "error", - "message": "No connector found for this page.", - } + return _emit( + { + "status": "error", + "message": "No connector found for this page.", + }, + status="error", + error="no_connector", + ) # Create connector instance notion_connector = NotionHistoryConnector( @@ -232,7 +280,13 @@ def create_delete_notion_page_tool( f"{result.get('message', '')} (also removed from knowledge base)" ) - return result + status = result.get("status", "error") + return _emit( + result, + status=status, + external_id=str(final_page_id) if final_page_id else None, + error=None if status == "success" else result.get("message"), + ) except Exception as e: from langgraph.errors import GraphInterrupt @@ -245,20 +299,28 @@ def create_delete_notion_page_tool( if isinstance(e, NotionAPIError) and ( "401" in error_str or "unauthorized" in error_str ): - return { - "status": "auth_error", - "message": str(e), - "connector_id": connector_id_from_context - if "connector_id_from_context" in dir() - else None, - "connector_type": "notion", - } + return _emit( + { + "status": "auth_error", + "message": str(e), + "connector_id": 
connector_id_from_context + if "connector_id_from_context" in dir() + else None, + "connector_type": "notion", + }, + status="error", + error=str(e), + ) if isinstance(e, ValueError | NotionAPIError): message = str(e) else: message = ( "Something went wrong while deleting the page. Please try again." ) - return {"status": "error", "message": message} + return _emit( + {"status": "error", "message": message}, + status="error", + error=message, + ) return delete_notion_page diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/system_prompt.md index 8ae444a58..4b45b05a9 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/system_prompt.md +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/onedrive/system_prompt.md @@ -97,9 +97,8 @@ Return **only** one JSON object (no markdown or prose outside it): } ``` -Rules: -- `status=success` → `next_step=null`, `missing_fields=null`. -- `status=partial|blocked|error` → `next_step` must be non-null. -- `status=blocked` due to missing required inputs → `missing_fields` must be non-null. +<include snippet="output_contract_base"/> + +<include snippet="verifiable_handle"/> Infer before you call; map every tool outcome faithfully. 
diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/system_prompt.md index 3c24b19c9..e4e0d1f6f 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/system_prompt.md +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/slack/system_prompt.md @@ -87,12 +87,12 @@ Return **only** one JSON object (no markdown, no prose): "missing_fields": string[] | null, "assumptions": string[] | null } -Rules: -- `status=success` → `next_step=null`, `missing_fields=null`. -- `status=partial|blocked|error` → `next_step` must be non-null. -- `status=blocked` due to missing required inputs → `missing_fields` must be non-null. +<include snippet="output_contract_base"/> +Route-specific rules: - For blocked ambiguity, populate `evidence.matched_candidates` with up to 5 options (`id` + `label` — works for any kind of candidate: channel, user, message, thread). - For discovery-only queries (lists), set `evidence.items` to `{ "total": N }` and list the matched items in `action_summary` (channel/user, key identifier, timestamp, short snippet; up to 10 entries, then `"...and N more"`). </output_contract> +<include snippet="verifiable_handle"/> + Discover before you post; never guess channel, user, or thread targets. diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/system_prompt.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/system_prompt.md index c3a280f79..9b283acf5 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/system_prompt.md +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/connectors/teams/system_prompt.md @@ -115,9 +115,8 @@ Return **only** one JSON object (no markdown or prose outside it): } ``` -Rules: -- `status=success` → `next_step=null`, `missing_fields=null`. 
-- `status=partial|blocked|error` → `next_step` must be non-null. -- `status=blocked` due to missing required inputs → `missing_fields` must be non-null. +<include snippet="output_contract_base"/> + +<include snippet="verifiable_handle"/> Resolve before you call; verify before you send; map every tool outcome faithfully. diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/md_file_reader.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/md_file_reader.py index 2fce413a6..5694e4326 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/md_file_reader.py +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/md_file_reader.py @@ -2,8 +2,11 @@ from __future__ import annotations +from functools import lru_cache from importlib import resources +_SHARED_SNIPPETS_PACKAGE = "app.agents.multi_agent_chat.subagents.shared.snippets" + def read_md_file(package: str, stem: str) -> str: """Load ``{stem}.md`` from ``package`` via importlib resources, or return empty.""" @@ -12,3 +15,13 @@ def read_md_file(package: str, stem: str) -> str: return "" text = ref.read_text(encoding="utf-8") return text.rstrip("\n") + + +@lru_cache(maxsize=64) +def read_shared_snippet(name: str) -> str: + """Load a shared markdown snippet from the snippets package. + + Cached because snippets are static at runtime and resolved many times + (once per subagent build, plus per-subagent-per-route). + """ + return read_md_file(_SHARED_SNIPPETS_PACKAGE, name) diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/snippets/__init__.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/snippets/__init__.py new file mode 100644 index 000000000..802a8e241 --- /dev/null +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/snippets/__init__.py @@ -0,0 +1,6 @@ +"""Shared markdown snippets composed into every subagent system prompt. 
+ +Resolved at build time by :func:`pack_subagent` in ``subagent_builder.py`` +via the ``<include snippet="NAME"/>`` directive. See ``output_contract_base.md`` +and ``verifiable_handle.md`` for the included content. +""" diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/snippets/output_contract_base.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/snippets/output_contract_base.md new file mode 100644 index 000000000..100daae75 --- /dev/null +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/snippets/output_contract_base.md @@ -0,0 +1,6 @@ +Rules (universal): +- `status=success` -> `next_step=null`, `missing_fields=null`. +- `status=partial|blocked|error` -> `next_step` must be non-null. +- `status=blocked` due to missing required inputs -> `missing_fields` must be non-null. +- `assumptions`: any inferences you made about the user's intent; `null` when no inferences were needed. +- The `evidence` object's fields are documented in your route-specific `<output_contract>` above; never invent fields the tool did not return. diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/snippets/verifiable_handle.md b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/snippets/verifiable_handle.md new file mode 100644 index 000000000..bea070ce9 --- /dev/null +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/snippets/verifiable_handle.md @@ -0,0 +1,10 @@ +<verifiable_handle> +Mutating tools you call return a structured `Receipt` object alongside their normal payload (see `evidence.receipts` in your `<output_contract>`). The supervisor uses the Receipt's `verifiable_url` and `external_id` to independently confirm the operation succeeded - do not paraphrase, shorten, or guess these values. + +Rules: +- Quote each Receipt's `verifiable_url` and `external_id` **verbatim** in `evidence.receipts`. Copy character-for-character; never retype from memory. 
+- If a Receipt has `status="failed"`, set your own `status="error"` and put the Receipt's `error` field in `next_step`. +- If a Receipt has `status="pending"` (async backends — podcasts, video presentations, anything queued through Celery), report `status=success`, surface the pending Receipt as-is, and tell the supervisor in `action_summary` that the artefact is **being generated in the background** (e.g. "Podcast 38 queued; orchestrator should report it as kicked off, not yet ready"). A pending Receipt almost always lacks `verifiable_url` because the artefact does not exist yet — that is expected, not a defect. Do **not** wait, poll, or retry; control returns to the supervisor immediately and the asset becomes visible to the user out of band via its own UI surface. +- Never claim a mutation succeeded without a matching Receipt with `status="success"` or `"pending"` in your tool results this turn. +- For tools that do not return a Receipt (read-only operations, search, lookup), the receipt rules do not apply; only the route-specific `evidence` fields matter. +</verifiable_handle> diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/spec.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/spec.py index 797ab535b..f891f94d2 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/spec.py +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/spec.py @@ -2,12 +2,30 @@ from __future__ import annotations +from collections.abc import Callable, Mapping from dataclasses import dataclass +from typing import Any from deepagents import SubAgent from app.agents.new_chat.permissions import Ruleset +# A context-hint provider receives the parent-agent ``runtime.state`` mapping +# and the ``description`` the orchestrator wrote, and returns a short string +# the runtime prepends to the subagent's first ``HumanMessage``. 
Used for +# things like "current search-space id is X" or "the user is in workspace Y" — +# never for full corpora, since the prepended text consumes the subagent's +# prompt budget on every invocation. Return ``None`` (or an empty string) to +# skip the hint for this call. +ContextHintProvider = Callable[[Mapping[str, Any], str], str | None] + +# Custom key stashed on the deepagents ``SubAgent`` dict so the provider +# survives the trip from ``pack_subagent`` → registry → middleware → +# task_tool. ``deepagents.create_agent`` only extracts the keys it +# recognises, so an extra key here is dropped silently at compile time. +# The prefix avoids any collision with future deepagents fields. +SURF_CONTEXT_HINT_PROVIDER_KEY = "surf_context_hint_provider" + @dataclass(frozen=True, slots=True) class SurfSenseSubagentSpec: @@ -20,10 +38,22 @@ class SurfSenseSubagentSpec: layers them into the subagent's :class:`PermissionMiddleware`, so each subagent owns its own ruleset without aliasing the shared rule engine. + context_hint_provider: Optional callback invoked once per ``task(...)`` + invocation, immediately before the subagent runs. Its return + value is prepended to the subagent's first ``HumanMessage`` so + the subagent can see things it would otherwise have to discover + (active search space, KB root, current user timezone, etc.). + Exposed as a typed field here; ``pack_subagent`` also mirrors it onto + ``spec`` under ``SURF_CONTEXT_HINT_PROVIDER_KEY`` (ignored by deepagents).
""" spec: SubAgent ruleset: Ruleset + context_hint_provider: ContextHintProvider | None = None -__all__ = ["SurfSenseSubagentSpec"] +__all__ = [ + "SURF_CONTEXT_HINT_PROVIDER_KEY", + "ContextHintProvider", + "SurfSenseSubagentSpec", +] diff --git a/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/subagent_builder.py b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/subagent_builder.py index 7173901f9..5025b32e7 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/subagent_builder.py +++ b/surfsense_backend/app/agents/multi_agent_chat/subagents/shared/subagent_builder.py @@ -2,6 +2,8 @@ from __future__ import annotations +import logging +import re from typing import Any, cast from deepagents import SubAgent @@ -12,9 +14,48 @@ from langchain_core.tools import BaseTool from app.agents.multi_agent_chat.middleware.shared.permissions import ( build_permission_mw, ) -from app.agents.multi_agent_chat.subagents.shared.spec import SurfSenseSubagentSpec +from app.agents.multi_agent_chat.subagents.shared.md_file_reader import ( + read_shared_snippet, +) +from app.agents.multi_agent_chat.subagents.shared.spec import ( + SURF_CONTEXT_HINT_PROVIDER_KEY, + ContextHintProvider, + SurfSenseSubagentSpec, +) from app.agents.new_chat.permissions import Ruleset +logger = logging.getLogger(__name__) + +# ``<include snippet="NAME"/>`` directive. Matches an XML-style self-closing +# tag whose ``snippet`` attribute names a file in ``shared/snippets/``. +# Whitespace around the attribute and self-close is tolerated; the snippet +# name itself must be a bare identifier (letters / digits / underscores) so +# we never pull a path-traversal value into ``read_shared_snippet``. +_INCLUDE_DIRECTIVE_RE = re.compile( + r"<include\s+snippet=\"(?P<name>[A-Za-z0-9_]+)\"\s*/>" +) + + +def _resolve_includes(prompt: str, *, subagent_name: str) -> str: + """Replace ``<include snippet="X"/>`` directives with the snippet body. 
+ + Unknown snippet names raise; an empty body is treated as unknown so a + typo or missing file fails loudly at startup instead of silently + shipping a broken prompt to the LLM. + """ + + def _replace(match: re.Match[str]) -> str: + name = match.group("name") + body = read_shared_snippet(name) + if not body.strip(): + raise ValueError( + f"Subagent {subagent_name!r}: unknown or empty shared " + f"snippet {name!r} referenced via <include>." + ) + return body + + return _INCLUDE_DIRECTIVE_RE.sub(_replace, prompt) + def _user_allowlist_for( dependencies: dict[str, Any], subagent_name: str @@ -43,6 +84,7 @@ def pack_subagent( dependencies: dict[str, Any], model: BaseChatModel | None = None, middleware_stack: dict[str, Any] | None = None, + context_hint_provider: ContextHintProvider | None = None, ) -> SurfSenseSubagentSpec: """Pack the route-local pieces into one sub-agent spec + its Ruleset. @@ -68,6 +110,8 @@ def pack_subagent( msg = f"Subagent {name!r}: system_prompt is empty" raise ValueError(msg) + system_prompt = _resolve_includes(system_prompt, subagent_name=name) + flags = dependencies["flags"] user_allowlist = _user_allowlist_for(dependencies, name) subagent_rulesets: list[Ruleset] = [ruleset] @@ -99,4 +143,12 @@ def pack_subagent( } if model is not None: spec_dict["model"] = model - return SurfSenseSubagentSpec(spec=cast(SubAgent, spec_dict), ruleset=ruleset) + if context_hint_provider is not None: + # Stash the callback on the dict so it survives the trip through + # registry / middleware unpacking (both treat the spec as opaque). 
+ spec_dict[SURF_CONTEXT_HINT_PROVIDER_KEY] = context_hint_provider + return SurfSenseSubagentSpec( + spec=cast(SubAgent, spec_dict), + ruleset=ruleset, + context_hint_provider=context_hint_provider, + ) diff --git a/surfsense_backend/app/agents/new_chat/filesystem_state.py b/surfsense_backend/app/agents/new_chat/filesystem_state.py index cc674be76..de2c94b41 100644 --- a/surfsense_backend/app/agents/new_chat/filesystem_state.py +++ b/surfsense_backend/app/agents/new_chat/filesystem_state.py @@ -33,9 +33,11 @@ from typing_extensions import TypedDict from app.agents.new_chat.state_reducers import ( _add_unique_reducer, _dict_merge_with_tombstones_reducer, + _int_counter_merge_reducer, _list_append_reducer, _replace_reducer, ) +from app.agents.shared.receipt import Receipt class PendingMove(TypedDict, total=False): @@ -172,6 +174,35 @@ class SurfSenseFilesystemState(FilesystemState): workspace_tree_text: NotRequired[Annotated[str, _replace_reducer]] """Pre-rendered ``<workspace_tree>`` body; shared with subagents to skip re-render.""" + billable_calls: NotRequired[Annotated[dict[str, int], _int_counter_merge_reducer]] + """Per-subagent ``task(...)`` invocation counter, summed across the turn. + + Incremented by ``task_tool.py`` each time a subagent invocation + completes (single- or batch-mode). The orchestrator can read this map + to self-limit when a runaway loop sends the same specialist 20 calls + in a row; the runtime emits a soft warning ToolMessage once the + cumulative count crosses :data:`DEFAULT_SUBAGENT_BILLABLE_THRESHOLD`. + Cleared by checkpoint rollover (i.e. per turn). + """ + + receipts: NotRequired[Annotated[list[Receipt], _list_append_reducer]] + """Structured Receipt handles emitted by mutating subagent tools this turn. 
+ + Each mutating tool (deliverables, every connector, KB writes via the + persistence middleware) wraps its native return into a + :class:`~app.agents.shared.receipt.Receipt` + and returns it under the ``"receipt"`` key alongside its existing + payload. The subagent's tool-call middleware folds the receipt into + this list, and ``_return_command_with_state_update`` in + ``checkpointed_subagent_middleware/task_tool.py`` carries the list up + to the parent automatically (``"receipts"`` is not in + ``EXCLUDED_STATE_KEYS``). + + Append-only across the turn; cleared by checkpoint rollover. The + orchestrator reads it via the ``<verification>`` teaching to confirm + side-effecting subagent claims (see ``shared/snippets/verifiable_handle.md``). + """ + __all__ = [ "KbAnonDoc", diff --git a/surfsense_backend/app/agents/new_chat/middleware/compaction.py b/surfsense_backend/app/agents/new_chat/middleware/compaction.py index 8173976fe..f8d340e5d 100644 --- a/surfsense_backend/app/agents/new_chat/middleware/compaction.py +++ b/surfsense_backend/app/agents/new_chat/middleware/compaction.py @@ -34,8 +34,7 @@ from deepagents.middleware.summarization import ( ) from langchain_core.messages import SystemMessage -from app.observability import metrics as ot_metrics -from app.observability import otel as ot +from app.observability import metrics as ot_metrics, otel as ot if TYPE_CHECKING: from deepagents.backends.protocol import BACKEND_TYPES diff --git a/surfsense_backend/app/agents/new_chat/middleware/doom_loop.py b/surfsense_backend/app/agents/new_chat/middleware/doom_loop.py index d50cadc0e..a7901c010 100644 --- a/surfsense_backend/app/agents/new_chat/middleware/doom_loop.py +++ b/surfsense_backend/app/agents/new_chat/middleware/doom_loop.py @@ -47,8 +47,7 @@ from langgraph.config import get_config from langgraph.runtime import Runtime from langgraph.types import interrupt -from app.observability import metrics as ot_metrics -from app.observability import otel as ot +from 
app.observability import metrics as ot_metrics, otel as ot logger = logging.getLogger(__name__) diff --git a/surfsense_backend/app/agents/new_chat/middleware/kb_persistence.py b/surfsense_backend/app/agents/new_chat/middleware/kb_persistence.py index cc30f4897..c88dced85 100644 --- a/surfsense_backend/app/agents/new_chat/middleware/kb_persistence.py +++ b/surfsense_backend/app/agents/new_chat/middleware/kb_persistence.py @@ -55,6 +55,7 @@ from app.agents.new_chat.path_resolver import ( virtual_path_to_doc, ) from app.agents.new_chat.state_reducers import _CLEAR +from app.agents.shared.receipt import Receipt, make_receipt from app.db import ( AgentActionLog, Chunk, @@ -1392,6 +1393,81 @@ async def commit_staged_filesystem_state( "pending_dir_deletes": [_CLEAR], "dirty_path_tool_calls": {_CLEAR: True}, } + + # Emit one Receipt per committed mutation, folded into ``state['receipts']`` + # via ``_list_append_reducer``. The receipts surface what actually committed + # (post-savepoint) rather than what the LLM intended; the orchestrator uses + # them as ground truth in the ``<verification>`` teaching. KB writes do not + # have public verifiable URLs, so ``verifiable_url`` stays unset. 
+ receipts: list[Receipt] = [] + + def _kb_receipt( + *, + type: str, + operation: str, + path: str, + external_id: int | None = None, + ) -> None: + if not path: + return + preview = path.rsplit("/", 1)[-1] or path + receipts.append( + make_receipt( + route="knowledge_base", + type=type, + operation=operation, + status="success", + external_id=str(external_id) if external_id is not None else path, + preview=preview, + ) + ) + + for payload in committed_creates: + path = str(payload.get("virtualPath") or "") + _kb_receipt( + type="file", + operation="write_file", + path=path, + external_id=payload.get("id"), + ) + for payload in committed_updates: + path = str(payload.get("virtualPath") or "") + _kb_receipt( + type="file", + operation="edit_file", + path=path, + external_id=payload.get("id"), + ) + for payload in applied_moves: + # ``applied_moves`` rows carry the destination ``virtualPath`` because + # the move has already landed in the DB by the time we reach this code. + path = str(payload.get("virtualPath") or "") + _kb_receipt( + type="file", + operation="move_file", + path=path, + external_id=payload.get("id"), + ) + for path in staged_dirs: + _kb_receipt(type="folder", operation="mkdir", path=path) + for payload in committed_deletes: + path = str(payload.get("virtualPath") or "") + _kb_receipt( + type="file", + operation="rm", + path=path, + external_id=payload.get("id"), + ) + for payload in committed_folder_deletes: + path = str(payload.get("virtualPath") or "") + _kb_receipt( + type="folder", + operation="rmdir", + path=path, + external_id=payload.get("id"), + ) + if receipts: + delta["receipts"] = receipts files_delta: dict[str, Any] = {} if temp_paths: files_delta.update(dict.fromkeys(temp_paths)) diff --git a/surfsense_backend/app/agents/new_chat/middleware/permission.py b/surfsense_backend/app/agents/new_chat/middleware/permission.py index e174ab0bd..07549bedb 100644 --- a/surfsense_backend/app/agents/new_chat/middleware/permission.py +++ 
b/surfsense_backend/app/agents/new_chat/middleware/permission.py @@ -61,8 +61,7 @@ from app.agents.new_chat.permissions import ( aggregate_action, evaluate_many, ) -from app.observability import metrics as ot_metrics -from app.observability import otel as ot +from app.observability import metrics as ot_metrics, otel as ot logger = logging.getLogger(__name__) diff --git a/surfsense_backend/app/agents/new_chat/state_reducers.py b/surfsense_backend/app/agents/new_chat/state_reducers.py index 89fc86367..c7b7685f0 100644 --- a/surfsense_backend/app/agents/new_chat/state_reducers.py +++ b/surfsense_backend/app/agents/new_chat/state_reducers.py @@ -171,6 +171,39 @@ def _dict_merge_with_tombstones_reducer( return result +def _int_counter_merge_reducer( + left: dict[str, int] | None, + right: dict[str, int] | None, +) -> dict[str, int]: + """Merge ``right`` into ``left`` by **summing** per-key integer counters. + + Used for state fields that accumulate counts across multiple updates + within the same turn (e.g. per-subagent ``billable_calls``). Unknown + keys are added; existing keys are summed. ``_CLEAR`` sentinels reset + the accumulator the same way the other reducers do, so the orchestrator + can wipe the counter at end-of-turn if needed. + """ + if right is None: + return dict(left or {}) + + if _CLEAR in right or any(_is_clear(k) for k in right): + result: dict[str, int] = {} + for key, value in right.items(): + if _is_clear(key): + continue + if not isinstance(value, int): + continue + result[key] = result.get(key, 0) + value + return result + + base = dict(left or {}) + for key, value in right.items(): + if not isinstance(value, int): + continue + base[key] = base.get(key, 0) + value + return base + + def _initial_filesystem_state() -> dict[str, Any]: """Default empty values for SurfSense filesystem state fields. 
@@ -200,6 +233,7 @@ __all__ = [ "_add_unique_reducer", "_dict_merge_with_tombstones_reducer", "_initial_filesystem_state", + "_int_counter_merge_reducer", "_list_append_reducer", "_replace_reducer", ] diff --git a/surfsense_backend/app/agents/new_chat/tools/podcast.py b/surfsense_backend/app/agents/new_chat/tools/podcast.py index 2c9b7fa0c..36aecfe49 100644 --- a/surfsense_backend/app/agents/new_chat/tools/podcast.py +++ b/surfsense_backend/app/agents/new_chat/tools/podcast.py @@ -2,17 +2,23 @@ Podcast generation tool for the SurfSense agent. This module provides a factory function for creating the generate_podcast tool -that submits a Celery task for background podcast generation. The frontend -polls for completion and auto-updates when the podcast is ready. +that submits a Celery task for background podcast generation. The tool then +polls the podcast row until it reaches a terminal status (READY/FAILED) and +returns that status. The wait is bounded by the chat's HTTP / process +lifetime; see app.agents.shared.deliverable_wait for details. """ +import logging from typing import Any from langchain_core.tools import tool from sqlalchemy.ext.asyncio import AsyncSession +from app.agents.shared.deliverable_wait import wait_for_deliverable from app.db import Podcast, PodcastStatus, shielded_async_session +logger = logging.getLogger(__name__) + def create_generate_podcast_tool( search_space_id: int, @@ -97,18 +103,57 @@ def create_generate_podcast_tool( user_prompt=user_prompt, ) - print(f"[generate_podcast] Created podcast {podcast_id}, task: {task.id}") + logger.info( + "[generate_podcast] Created podcast %s, task: %s", + podcast_id, + task.id, + ) + # Wait until the Celery worker flips the row to a terminal + # state. No internal budget — see deliverable_wait module. 
+ terminal_status, columns, elapsed = await wait_for_deliverable( + model=Podcast, + row_id=podcast_id, + columns=[Podcast.status, Podcast.file_location], + terminal_statuses={PodcastStatus.READY, PodcastStatus.FAILED}, + ) + + if terminal_status == PodcastStatus.READY: + file_location = columns[1] if columns else None + logger.info( + "[generate_podcast] Podcast %s READY in %.2fs (file=%s)", + podcast_id, + elapsed, + file_location, + ) + return { + "status": PodcastStatus.READY.value, + "podcast_id": podcast_id, + "title": podcast_title, + "file_location": file_location, + "message": ( + "Podcast generated and saved to your podcast panel." + ), + } + + # Only other terminal state is FAILED. + logger.warning( + "[generate_podcast] Podcast %s FAILED in %.2fs", + podcast_id, + elapsed, + ) return { - "status": PodcastStatus.PENDING.value, + "status": PodcastStatus.FAILED.value, "podcast_id": podcast_id, "title": podcast_title, - "message": "Podcast generation started. This may take a few minutes.", + "error": ( + "Background worker reported FAILED status for this podcast." + ), } except Exception as e: error_message = str(e) - print(f"[generate_podcast] Error: {error_message}") + logger.exception("[generate_podcast] Error: %s", error_message) return { "status": PodcastStatus.FAILED.value, "error": error_message, diff --git a/surfsense_backend/app/agents/new_chat/tools/video_presentation.py b/surfsense_backend/app/agents/new_chat/tools/video_presentation.py index 7bf9a1c3b..4bf13b28e 100644 --- a/surfsense_backend/app/agents/new_chat/tools/video_presentation.py +++ b/surfsense_backend/app/agents/new_chat/tools/video_presentation.py @@ -2,17 +2,23 @@ Video presentation generation tool for the SurfSense agent. This module provides a factory function for creating the generate_video_presentation -tool that submits a Celery task for background video presentation generation. -The frontend polls for completion and auto-updates when the presentation is ready. 
+tool that submits a Celery task for background video presentation generation. The +tool then polls the row until it reaches a terminal status (READY/FAILED) and +returns that status. The wait is bounded by the chat's HTTP / process lifetime; +see app.agents.shared.deliverable_wait for details. """ +import logging from typing import Any from langchain_core.tools import tool from sqlalchemy.ext.asyncio import AsyncSession +from app.agents.shared.deliverable_wait import wait_for_deliverable from app.db import VideoPresentation, VideoPresentationStatus, shielded_async_session +logger = logging.getLogger(__name__) + def create_generate_video_presentation_tool( search_space_id: int, @@ -72,20 +78,58 @@ def create_generate_video_presentation_tool( user_prompt=user_prompt, ) - print( - f"[generate_video_presentation] Created video presentation {video_pres_id}, task: {task.id}" + logger.info( + "[generate_video_presentation] Created video presentation %s, task: %s", + video_pres_id, + task.id, ) + # Wait until the Celery worker flips the row to a terminal + # state. No internal budget — see deliverable_wait module. + terminal_status, _columns, elapsed = await wait_for_deliverable( + model=VideoPresentation, + row_id=video_pres_id, + columns=[VideoPresentation.status], + terminal_statuses={ + VideoPresentationStatus.READY, + VideoPresentationStatus.FAILED, + }, + ) + + if terminal_status == VideoPresentationStatus.READY: + logger.info( + "[generate_video_presentation] %s READY in %.2fs", + video_pres_id, + elapsed, + ) + return { + "status": VideoPresentationStatus.READY.value, + "video_presentation_id": video_pres_id, + "title": video_title, + "message": "Video presentation generated and saved.", + } + + # Only other terminal state is FAILED. 
+ logger.warning( + "[generate_video_presentation] %s FAILED in %.2fs", + video_pres_id, + elapsed, + ) return { - "status": VideoPresentationStatus.PENDING.value, + "status": VideoPresentationStatus.FAILED.value, "video_presentation_id": video_pres_id, "title": video_title, - "message": "Video presentation generation started. This may take a few minutes.", + "error": ( + "Background worker reported FAILED status for this " + "video presentation." + ), } except Exception as e: error_message = str(e) - print(f"[generate_video_presentation] Error: {error_message}") + logger.exception( + "[generate_video_presentation] Error: %s", error_message + ) return { "status": VideoPresentationStatus.FAILED.value, "error": error_message, diff --git a/surfsense_backend/app/agents/shared/__init__.py b/surfsense_backend/app/agents/shared/__init__.py new file mode 100644 index 000000000..7c46c65ff --- /dev/null +++ b/surfsense_backend/app/agents/shared/__init__.py @@ -0,0 +1,9 @@ +"""Cross-package agent contracts. + +Symbols here are intentionally framework-light (no LangGraph / deepagents +internals) so they can be imported from both ``app.agents.new_chat`` and +``app.agents.multi_agent_chat`` without creating a circular dependency +between the two packages. See ``receipt.py`` for the rationale. +""" + +from __future__ import annotations diff --git a/surfsense_backend/app/agents/shared/deliverable_wait.py b/surfsense_backend/app/agents/shared/deliverable_wait.py new file mode 100644 index 000000000..abaa017ea --- /dev/null +++ b/surfsense_backend/app/agents/shared/deliverable_wait.py @@ -0,0 +1,123 @@ +"""Shared poll-until-terminal helper for Celery-backed deliverables. 
+ +Lives in ``app.agents.shared`` (neutral package, no dependencies on either +``new_chat`` or ``multi_agent_chat``) so both the flat single-agent tools +under ``app/agents/new_chat/tools/`` and the multi-agent subagent tools +under ``app/agents/multi_agent_chat/subagents/builtins/deliverables/tools/`` +can import it without creating a circular dependency. + +Background +---------- +Tools like ``generate_podcast`` and ``generate_video_presentation`` enqueue +the heavy work to Celery and historically returned immediately with a +"pending" status. That works for very-long deliverables but hurts UX for +the common case (most podcasts finish in 10-30 seconds): the agent sends +a "kicked off, check back in a minute" reply *before* the worker is done, +so the user never gets a "ready" confirmation. + +This helper bridges that gap. The tool dispatches the Celery task as +before, then polls the artefact row's ``status`` column **until it +reaches a terminal value** (READY / FAILED). The tool then returns a +real terminal outcome — never a pending one. + +No wall-clock budget here on purpose +------------------------------------ +Layering a second budget on top of the existing per-invocation safety +nets just confused the UX. The real ceilings are: + +* **Multi-agent mode** — ``SURFSENSE_SUBAGENT_INVOKE_TIMEOUT_SECONDS`` + (default ``300.0``, ``0`` to disable) caps how long any single + ``task(subagent, ...)`` invocation can run. If a deliverable needs + longer than this, the subagent invocation is cancelled and the + orchestrator surfaces a "subagent timed out" ToolMessage. Operators + who routinely generate long videos should raise that ceiling (or set + it to ``0`` for true unbounded waits). +* **Single-agent mode** — the chat's HTTP stream / process lifetime is + the only ceiling. Truly indefinite waits work here, but a dead Celery + worker will leave the row in PENDING/GENERATING forever; treat that + as an operational concern, not a UX concern. 
logger = logging.getLogger(__name__)


_DEFAULT_POLL_INTERVAL_SECONDS: float = 1.5

# Consecutive polls that succeed but find no row before we give up. A row
# that stays absent this long was deleted (or never existed), so waiting
# longer can never produce a terminal status.
_MAX_CONSECUTIVE_MISSING_ROW_POLLS: int = 20


async def wait_for_deliverable(
    *,
    model: type,
    row_id: int,
    columns: list[InstrumentedAttribute[Any]],
    terminal_statuses: set[Enum],
    poll_interval_s: float = _DEFAULT_POLL_INTERVAL_SECONDS,
) -> tuple[Enum, tuple[Any, ...], float]:
    """Poll ``model`` row ``row_id`` until ``columns[0]`` reaches a terminal status.

    The row is queried immediately and then once every ``poll_interval_s``
    seconds until its status column matches one of ``terminal_statuses``.
    There is no internal wall-clock budget; cancel from the outside
    (subagent timeout, HTTP disconnect, task cancellation) if you need a
    ceiling.

    The first entry of ``columns`` must be the status column; additional
    columns (e.g. ``Podcast.file_location``) are returned alongside the
    final status so callers can build their payload without a second
    roundtrip.

    A fresh ``shielded_async_session`` is opened per poll so no transaction
    is held across the wait. A poll that raises is logged and retried —
    transient DB hiccups should not collapse the tool call.

    Returns
    -------
    ``(terminal_status, columns, elapsed_seconds)``
        ``columns`` mirrors the requested ``columns`` (including the
        status itself in position 0).

    Raises
    ------
    ValueError
        If ``columns`` or ``terminal_statuses`` is empty, or if
        ``poll_interval_s`` is not positive (a non-positive interval would
        turn the wait into a tight loop against the database).
    LookupError
        If the row is absent for ``_MAX_CONSECUTIVE_MISSING_ROW_POLLS``
        successful polls in a row. Failed polls do not count towards this
        limit.
    """
    if not columns:
        raise ValueError("wait_for_deliverable requires at least the status column")
    if not terminal_statuses:
        raise ValueError("wait_for_deliverable requires at least one terminal status")
    if poll_interval_s <= 0:
        raise ValueError("wait_for_deliverable requires a positive poll_interval_s")

    start = time.monotonic()
    model_name = getattr(model, "__name__", str(model))
    missing_polls = 0

    while True:
        row: tuple[Any, ...] | None = None
        poll_succeeded = False
        try:
            async with shielded_async_session() as session:
                result = await session.execute(
                    select(*columns).where(model.id == row_id)
                )
                row = result.first()
            poll_succeeded = True
        except Exception as exc:
            # Best-effort: keep waiting through transient DB errors.
            logger.warning(
                "[deliverable_wait] poll failed model=%s id=%s err=%r",
                model_name,
                row_id,
                exc,
            )

        if row is not None:
            missing_polls = 0
            status_val = row[0]
            if status_val in terminal_statuses:
                return status_val, tuple(row), time.monotonic() - start
        elif poll_succeeded:
            # The query ran but matched nothing: the row is gone, not busy.
            missing_polls += 1
            if missing_polls >= _MAX_CONSECUTIVE_MISSING_ROW_POLLS:
                raise LookupError(
                    f"{model_name} row {row_id} not found after "
                    f"{missing_polls} consecutive polls"
                )

        await asyncio.sleep(poll_interval_s)
# Name of the subagent route that produced the receipt.
ReceiptRoute = Literal[
    "deliverables",
    "knowledge_base",
    "notion",
    "slack",
    "gmail",
    "linear",
    "jira",
    "clickup",
    "confluence",
    "calendar",
    "luma",
    "airtable",
    "google_drive",
    "dropbox",
    "onedrive",
    "discord",
    "teams",
]

# Route-scoped kind of artefact / external resource that was touched. Kept as
# a plain ``str`` (not a large union) so every route can document its own
# values next to its tools.
ReceiptType = str

# Verb describing the operation. Open-ended for the same reason as
# ``ReceiptType``.
ReceiptOperation = str

# Outcome of the operation as recorded on the receipt.
ReceiptStatus = Literal["success", "pending", "failed"]


class Receipt(TypedDict, total=False):
    """Structured handle describing one mutation performed by a subagent tool.

    Declared with ``total=False``: every key is optional, so a tool fills in
    only what it actually knows. Consumers should test for key presence
    rather than expecting ``None`` placeholders.
    """

    route: ReceiptRoute
    """Subagent route that emitted the receipt; lets readers filter a
    receipt list by route."""

    type: ReceiptType
    """Kind of artefact within the route, e.g. ``report`` / ``podcast`` for
    ``deliverables``, ``page`` for ``notion``, ``message`` for ``slack``."""

    operation: ReceiptOperation
    """Operation verb, e.g. ``generate``, ``create`` / ``update`` /
    ``delete``, ``send`` / ``post``, or the KB verbs ``write_file`` /
    ``edit_file`` / ``rm`` / ``rmdir`` / ``move_file`` / ``mkdir``."""

    status: ReceiptStatus
    """One of ``success``, ``pending`` or ``failed``."""

    external_id: str | None
    """Identifier assigned by the backend (row id, page id, message id,
    KB ``virtualPath``, ...). Absent or ``None`` when the operation failed
    before an identifier existed."""

    verifiable_url: str | None
    """URL that can be fetched to confirm the operation, when a public one
    exists."""

    preview: str | None
    """Short excerpt of what was produced."""

    error: str | None
    """Plain-text failure reason; meaningful when ``status == "failed"``."""


def make_receipt(
    *,
    route: ReceiptRoute,
    type: str,
    operation: str,
    status: ReceiptStatus,
    external_id: str | None = None,
    verifiable_url: str | None = None,
    preview: str | None = None,
    error: str | None = None,
) -> Receipt:
    """Build a :class:`Receipt`, leaving out optional fields that are ``None``.

    The four mandatory fields are always present. An optional field is
    emitted only when its argument is not ``None`` (an empty string is
    kept), so ``"verifiable_url" in receipt`` tells "not provided" apart
    from a deliberately supplied value.
    """
    optional_fields = {
        "external_id": external_id,
        "verifiable_url": verifiable_url,
        "preview": preview,
        "error": error,
    }
    receipt: dict[str, Any] = {
        "route": route,
        "type": type,
        "operation": operation,
        "status": status,
    }
    receipt.update(
        (field, value) for field, value in optional_fields.items() if value is not None
    )
    return receipt  # type: ignore[return-value]


__all__ = [
    "Receipt",
    "ReceiptOperation",
    "ReceiptRoute",
    "ReceiptStatus",
    "ReceiptType",
    "make_receipt",
]
+ +:func:`with_receipt` wraps both behaviours: it returns the tool payload as +a JSON-encoded ``ToolMessage`` AND appends the receipt to state in a single +:class:`~langgraph.types.Command`. Use it at every ``return`` site of a +mutating tool — including failure paths (emit a receipt with +``status="failed"`` and the error message in ``error``). +""" + +from __future__ import annotations + +import json +from typing import Any + +from langchain_core.messages import ToolMessage +from langgraph.types import Command + +from app.agents.shared.receipt import Receipt + + +def _content_to_text(payload: dict[str, Any] | str) -> str: + """Serialise a tool payload to ``ToolMessage`` content. + + Dicts go through ``json.dumps`` (matching deepagents' default tool-result + serialisation); strings are passed through. Anything else is coerced via + ``str`` so we never raise here — a mis-typed tool return would already + have failed inside the tool body. + """ + if isinstance(payload, str): + return payload + if isinstance(payload, dict): + return json.dumps(payload, default=str) + return str(payload) + + +def with_receipt( + *, + payload: dict[str, Any] | str, + receipt: Receipt, + tool_call_id: str, +) -> Command: + """Return a Command that ships ``payload`` as a ToolMessage AND appends ``receipt``. + + The append happens via the ``_list_append_reducer`` on the ``receipts`` + field of :class:`~app.agents.new_chat.filesystem_state.SurfSenseFilesystemState`, + so concurrent subagent batches (item 4 in the plan) won't clobber each + other's receipts. 
+ """ + return Command( + update={ + "messages": [ + ToolMessage( + content=_content_to_text(payload), + tool_call_id=tool_call_id, + ) + ], + "receipts": [receipt], + } + ) + + +__all__ = ["with_receipt"] diff --git a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py index 496c6d0c3..a2f4d0bbd 100644 --- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py +++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py @@ -62,7 +62,9 @@ class EtlPipelineService: return result if category == FileCategory.AUDIO: - content = await transcribe_audio(request.file_path, request.filename) + content = await transcribe_audio( + request.file_path, request.filename + ) result = EtlResult( markdown_content=content, etl_service="AUDIO", diff --git a/surfsense_backend/app/services/composio_service.py b/surfsense_backend/app/services/composio_service.py index d73a0d4ce..920f51d84 100644 --- a/surfsense_backend/app/services/composio_service.py +++ b/surfsense_backend/app/services/composio_service.py @@ -835,7 +835,14 @@ class ComposioService: ) if not result.get("success"): - return [], None, result.get("error", "Unknown error") + # 4-tuple to match this function's declared return shape + # ``(messages, next_page_token, result_size_estimate, error)``. + # The error branch previously dropped the + # ``result_size_estimate`` slot, which crashed the caller's + # unpack with ``ValueError: not enough values to unpack + # (expected 4, got 3)`` and hid the real Composio error + # (e.g. expired connected account / invalid API key). 
+ return [], None, None, result.get("error", "Unknown error") data = result.get("data", {}) diff --git a/surfsense_backend/app/services/gmail/kb_sync_service.py b/surfsense_backend/app/services/gmail/kb_sync_service.py index 6ff5f3c2b..85e25fcb6 100644 --- a/surfsense_backend/app/services/gmail/kb_sync_service.py +++ b/surfsense_backend/app/services/gmail/kb_sync_service.py @@ -101,9 +101,7 @@ class GmailKBSyncService: else: logger.warning("No LLM configured -- using fallback summary") summary_content = f"Gmail Message: {subject}\n\n{indexable_content}" - summary_embedding = await asyncio.to_thread( - embed_text, summary_content - ) + summary_embedding = await asyncio.to_thread(embed_text, summary_content) chunks = await create_document_chunks(indexable_content) now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S") diff --git a/surfsense_backend/app/services/google_calendar/kb_sync_service.py b/surfsense_backend/app/services/google_calendar/kb_sync_service.py index 1f017ec4d..e59868aff 100644 --- a/surfsense_backend/app/services/google_calendar/kb_sync_service.py +++ b/surfsense_backend/app/services/google_calendar/kb_sync_service.py @@ -116,9 +116,7 @@ class GoogleCalendarKBSyncService: summary_content = ( f"Google Calendar Event: {event_summary}\n\n{indexable_content}" ) - summary_embedding = await asyncio.to_thread( - embed_text, summary_content - ) + summary_embedding = await asyncio.to_thread(embed_text, summary_content) chunks = await create_document_chunks(indexable_content) now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S") @@ -297,9 +295,7 @@ class GoogleCalendarKBSyncService: summary_content = ( f"Google Calendar Event: {event_summary}\n\n{indexable_content}" ) - summary_embedding = await asyncio.to_thread( - embed_text, summary_content - ) + summary_embedding = await asyncio.to_thread(embed_text, summary_content) chunks = await create_document_chunks(indexable_content) now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S") diff --git 
a/surfsense_backend/app/services/jira/kb_sync_service.py b/surfsense_backend/app/services/jira/kb_sync_service.py index 5f6668377..37001a476 100644 --- a/surfsense_backend/app/services/jira/kb_sync_service.py +++ b/surfsense_backend/app/services/jira/kb_sync_service.py @@ -98,9 +98,7 @@ class JiraKBSyncService: summary_content = ( f"Jira Issue {issue_identifier}: {issue_title}\n\n{issue_content}" ) - summary_embedding = await asyncio.to_thread( - embed_text, summary_content - ) + summary_embedding = await asyncio.to_thread(embed_text, summary_content) chunks = await create_document_chunks(issue_content) now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S") @@ -214,9 +212,7 @@ class JiraKBSyncService: summary_content = ( f"Jira Issue {issue_identifier}: {issue_title}\n\n{issue_content}" ) - summary_embedding = await asyncio.to_thread( - embed_text, summary_content - ) + summary_embedding = await asyncio.to_thread(embed_text, summary_content) chunks = await create_document_chunks(issue_content) diff --git a/surfsense_backend/app/services/llm_service.py b/surfsense_backend/app/services/llm_service.py index fa97fb33a..aadb60cde 100644 --- a/surfsense_backend/app/services/llm_service.py +++ b/surfsense_backend/app/services/llm_service.py @@ -682,11 +682,7 @@ def get_planner_llm() -> ChatLiteLLM | None: from app.agents.new_chat.llm_config import create_chat_litellm_from_config planner_cfg = next( - ( - cfg - for cfg in config.GLOBAL_LLM_CONFIGS - if cfg.get("is_planner") is True - ), + (cfg for cfg in config.GLOBAL_LLM_CONFIGS if cfg.get("is_planner") is True), None, ) if not planner_cfg: diff --git a/surfsense_backend/app/services/onedrive/kb_sync_service.py b/surfsense_backend/app/services/onedrive/kb_sync_service.py index e1da3b4a1..731f081dd 100644 --- a/surfsense_backend/app/services/onedrive/kb_sync_service.py +++ b/surfsense_backend/app/services/onedrive/kb_sync_service.py @@ -96,9 +96,7 @@ class OneDriveKBSyncService: else: logger.warning("No LLM configured — 
using fallback summary") summary_content = f"OneDrive File: {file_name}\n\n{indexable_content}" - summary_embedding = await asyncio.to_thread( - embed_text, summary_content - ) + summary_embedding = await asyncio.to_thread(embed_text, summary_content) chunks = await create_document_chunks(indexable_content) now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S") diff --git a/surfsense_backend/app/tasks/chat/stream_new_chat.py b/surfsense_backend/app/tasks/chat/stream_new_chat.py index 1b2a4cfbb..78f80c955 100644 --- a/surfsense_backend/app/tasks/chat/stream_new_chat.py +++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py @@ -2608,9 +2608,7 @@ async def stream_resume_chat( visibility = thread_visibility or ChatVisibility.PRIVATE from app.config import config as _app_config - chat_agent_mode = ( - "multi" if _app_config.MULTI_AGENT_CHAT_ENABLED else "single" - ) + chat_agent_mode = "multi" if _app_config.MULTI_AGENT_CHAT_ENABLED else "single" with contextlib.suppress(Exception): chat_span.set_attribute("agent.mode", chat_agent_mode) _t0 = time.perf_counter() diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tool_end.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tool_end.py index ad4a17d08..2ff810447 100644 --- a/surfsense_backend/app/tasks/chat/streaming/handlers/tool_end.py +++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tool_end.py @@ -6,6 +6,9 @@ import json from collections.abc import Iterator from typing import Any +from langchain_core.messages import ToolMessage +from langgraph.types import Command + from app.tasks.chat.streaming.handlers.tools import ( ToolCompletionEmissionContext, iter_tool_completion_emission_frames, @@ -19,6 +22,38 @@ from app.tasks.chat.streaming.relay.task_span import ( from app.tasks.chat.streaming.relay.thinking_step_sse import emit_thinking_step_frame +def _unwrap_command_output(raw_output: Any) -> Any: + """Replace a ``Command`` from a tool return with its inner ``ToolMessage``. 
+ + Tools that participate in receipt-style state writes (see + ``app.agents.shared.receipt_command.with_receipt``) return a + ``Command(update={"messages": [ToolMessage(...)], "receipts": [...]})``. + LangChain's ``on_tool_end`` event surfaces that ``Command`` verbatim as + ``data.output``, which the rest of this handler can't introspect: it has + no ``.content``, isn't a ``dict``, and stringifies to ``"Command(...)"``. + That stringified payload reaches the frontend and breaks tool-specific + UI components (e.g. the podcast card) that look for ``status`` / + ``podcast_id`` at the top level. + + We extract the first ``ToolMessage`` from the Command's ``messages`` list + so downstream code can read ``.content`` normally. Commands that don't + contain a ``ToolMessage`` (rare, e.g. pure state updates) are returned + unchanged — the existing ``str(raw_output)`` fallback handles them. + """ + if not isinstance(raw_output, Command): + return raw_output + update = raw_output.update + if not isinstance(update, dict): + return raw_output + messages = update.get("messages") + if not isinstance(messages, list): + return raw_output + for msg in messages: + if isinstance(msg, ToolMessage): + return msg + return raw_output + + def iter_tool_end_frames( event: dict[str, Any], *, @@ -33,7 +68,7 @@ def iter_tool_end_frames( state.active_tool_depth = max(0, state.active_tool_depth - 1) run_id = event.get("run_id", "") tool_name = event.get("name", "unknown_tool") - raw_output = event.get("data", {}).get("output", "") + raw_output = _unwrap_command_output(event.get("data", {}).get("output", "")) staged_file_path = state.file_path_by_run.pop(run_id, None) if run_id else None if hasattr(raw_output, "content"): diff --git a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_video_presentation/emission.py b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_video_presentation/emission.py index 21e27d4c3..51a67f369 100644 --- 
a/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_video_presentation/emission.py +++ b/surfsense_backend/app/tasks/chat/streaming/handlers/tools/deliverables/generate_video_presentation/emission.py @@ -15,12 +15,24 @@ def iter_completion_emission_frames( out = ctx.tool_output payload = out if isinstance(out, dict) else {"result": out} yield ctx.emit_tool_output_card(payload) - if isinstance(out, dict) and out.get("status") == "pending": + if not isinstance(out, dict): + return + status = out.get("status") + # ``ready`` is the live success status now that the tool waits for the + # Celery worker to reach a terminal state. ``pending`` is retained as a + # legacy branch for old saved chats that pre-date the wait-for-terminal + # change (see ``app.agents.shared.deliverable_wait``). + if status == "ready": + yield ctx.streaming_service.format_terminal_info( + f"Video presentation generated successfully: {out.get('title', 'Presentation')}", + "success", + ) + elif status == "pending": yield ctx.streaming_service.format_terminal_info( f"Video presentation queued: {out.get('title', 'Presentation')}", "success", ) - elif isinstance(out, dict) and out.get("status") == "failed": + elif status == "failed": error_msg = out.get("error", "Unknown error") yield ctx.streaming_service.format_terminal_info( f"Presentation generation failed: {error_msg}", diff --git a/surfsense_backend/app/utils/document_converters.py b/surfsense_backend/app/utils/document_converters.py index 9bc8103c5..059d91806 100644 --- a/surfsense_backend/app/utils/document_converters.py +++ b/surfsense_backend/app/utils/document_converters.py @@ -222,9 +222,7 @@ async def generate_document_summary( else: enhanced_summary_content = summary_content - summary_embedding = await asyncio.to_thread( - embed_text, enhanced_summary_content - ) + summary_embedding = await asyncio.to_thread(embed_text, enhanced_summary_content) return enhanced_summary_content, summary_embedding