Merge upstream/dev into feature/multi-agent

2026-05-07 14:52:39 +02:00 · 2026-05-05 01:44:46 +02:00 · 2026-05-05 01:44:46 +02:00 · 5119915f4f
commit 5119915f4f
parent 9e35cdaec7 b2373c1ba3
278 changed files with 34669 additions and 8970 deletions
--- a/surfsense_backend/app/tasks/chat/content_builder.py
+++ b/surfsense_backend/app/tasks/chat/content_builder.py
@ -0,0 +1,515 @@
+"""Server-side mirror of the frontend's assistant-ui ``ContentPart`` projection.
+
+Background
+----------
+The streaming chat task in ``stream_new_chat`` / ``stream_resume_chat`` yields
+SSE events that the frontend folds into a ``ContentPartsState`` (see
+``surfsense_web/lib/chat/streaming-state.ts`` and the matching pipeline in
+``stream-pipeline.ts``). When a turn ends, the frontend calls
+``buildContentForPersistence(...)`` and round-trips that ``ContentPart[]``
+JSONB to ``POST /threads/{id}/messages``, which is what was historically
+written to ``new_chat_messages.content``.
+
+After the ghost-thread fix moved persistence server-side, the assistant
+row is written by ``finalize_assistant_turn`` in the streaming finally
+block. The frontend's later ``appendMessage`` is now a no-op (recovers
+via the ``(thread_id, turn_id, role)`` partial unique index added in
+migration 141), which means the *server* is now responsible for
+producing the rich ``ContentPart[]`` shape the FE expects on history
+reload — text + reasoning + tool-call cards (with ``args``, ``argsText``,
+``result``, ``langchainToolCallId``) + thinking-step buckets +
+step-separators.
+
+This module is the in-memory accumulator that mirrors the FE state for
+exactly that purpose. The streaming code calls ``on_text_*`` / ``on_reasoning_*``
+/ ``on_tool_*`` / ``on_thinking_step`` / ``on_step_separator`` /
+``mark_interrupted`` at the same call sites it yields the matching
+``streaming_service.format_*`` SSE event, so the in-memory ``parts`` list
+stays in lockstep with what the FE's pipeline would have produced live.
+``snapshot()`` is then taken once in the ``finally`` block and persisted
+in a single UPDATE.
+
+Pure synchronous state — no DB I/O, no async, no flush callbacks. The
+streaming code is responsible for driving lifecycle methods; this class
+is a thin projection helper.
+"""
+
+from __future__ import annotations
+
+import copy
+import json
+import logging
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+# Part types that count as real message content. Same filter the FE applies
+# in ``buildContentForPersistence`` / ``buildContentForUI``: text, reasoning
+# and tool-call parts are "meaningful", whereas ``data-thinking-steps`` and
+# ``data-step-separator`` only decorate them and never stand alone in a
+# successful turn.
+_MEANINGFUL_PART_TYPES: frozenset[str] = frozenset(
+    ("text", "reasoning", "tool-call")
+)
+
+
+class AssistantContentBuilder:
+    """Server-side projection of ``surfsense_web/lib/chat/streaming-state.ts``.
+
+    Output shape (deep copy of ``self.parts`` via ``snapshot()``) strictly
+    matches the FE ``ContentPart`` union::
+
+        | { type: "text"; text: string }
+        | { type: "reasoning"; text: string }
+        | { type: "tool-call"; toolCallId: str; toolName: str;
+            args: dict; result?: any; argsText?: str; langchainToolCallId?: str;
+            state?: "aborted" }
+        | { type: "data-thinking-steps"; data: { steps: ThinkingStepData[] } }
+        | { type: "data-step-separator"; data: { stepIndex: int } }
+
+    Order matches the wire order of the SSE events that drive the lifecycle
+    methods, with two FE-mirrored exceptions:
+
+    1. ``data-thinking-steps`` is a *singleton* and pinned at index 0 the
+       first time we see a ``data-thinking-step`` SSE event (the FE's
+       ``updateThinkingSteps`` does ``unshift`` on first sight). Subsequent
+       thinking-step updates mutate that singleton in place.
+    2. ``data-step-separator`` is appended only when the message already has
+       meaningful content and the previous part isn't itself a separator
+       (so the FIRST step of a turn doesn't generate a leading divider).
+    """
+
+    def __init__(self) -> None:
+        """Start empty: no parts, no open text/reasoning block, no tool cards."""
+        # Ordered ``ContentPart`` dicts; ``snapshot()`` deep-copies this list.
+        self.parts: list[dict[str, Any]] = []
+        # Position inside ``parts`` of the text / reasoning part that is
+        # currently being streamed into. -1 means nothing is open, so the
+        # next delta creates a new part. Counterpart of the FE's
+        # ``ContentPartsState.currentTextPartIndex``.
+        self._current_text_idx: int = -1
+        self._current_reasoning_idx: int = -1
+        # Position of every tool-call part, keyed by ``ui_id``. ``ui_id`` is
+        # either the synthetic ``call_<run_id>`` (legacy) or the LangChain
+        # ``tool_call.id`` (parity_v2) -- the key the streaming layer passes
+        # with every ``tool-input-*`` / ``tool-output-*`` event.
+        self._tool_call_idx_by_ui_id: dict[str, int] = {}
+        # Running concatenation of ``tool-input-delta`` chunks per ``ui_id``,
+        # reproducing the FE's ``appendToolInputDelta`` until
+        # ``tool-input-available`` replaces ``argsText`` with the
+        # pretty-printed final JSON.
+        self._args_text_by_ui_id: dict[str, str] = {}
+
+    # ------------------------------------------------------------------
+    # Text
+    # ------------------------------------------------------------------
+
+    def on_text_start(self, text_id: str) -> None:
+        """Handle ``text-start``: end any reasoning block that is still open.
+
+        Like FE ``appendText``, starting text closes active reasoning so
+        the two render as separate parts. No text part is created at this
+        point; ``on_text_delta`` creates it lazily on the first non-empty
+        delta, which means an empty start/end pair leaves nothing behind.
+        (The FE pipeline has no explicit ``text-start`` handler at all.)
+        ``text_id`` is accepted for call-site symmetry and is not used.
+        """
+        if self._current_reasoning_idx < 0:
+            return
+        self._current_reasoning_idx = -1
+
+    def on_text_delta(self, text_id: str, delta: str) -> None:
+        """Append streamed text to the open text part, opening one if needed.
+
+        Empty deltas are ignored. A text delta implicitly closes any open
+        reasoning block (FE ``appendText`` behaviour). ``text_id`` is not
+        used; the open part is tracked by index only.
+        """
+        if not delta:
+            return
+        if self._current_reasoning_idx >= 0:
+            self._current_reasoning_idx = -1
+
+        open_idx = self._current_text_idx
+        if 0 <= open_idx < len(self.parts):
+            open_part = self.parts[open_idx]
+            if open_part.get("type") == "text":
+                open_part["text"] += delta
+                return
+
+        # Nothing valid is open: start a new text part and track it.
+        self.parts.append({"type": "text", "text": delta})
+        self._current_text_idx = len(self.parts) - 1
+
+    def on_text_end(self, text_id: str) -> None:
+        """Handle ``text-end``: record that no text part is open any more.
+
+        The next ``on_text_delta`` will therefore start a new text part
+        rather than extend the previous one. The streaming layer emits
+        ``text-end`` before tool calls / reasoning / step boundaries. The
+        FE gets the same effect implicitly (``currentTextPartIndex = -1``
+        inside ``addToolCall`` / ``appendReasoning`` /
+        ``addStepSeparator``); doing it here on the explicit wire event
+        spares each call site from maintaining that invariant.
+        ``text_id`` is not used.
+        """
+        self._current_text_idx = -1
+
+    # ------------------------------------------------------------------
+    # Reasoning
+    # ------------------------------------------------------------------
+
+    def on_reasoning_start(self, reasoning_id: str) -> None:
+        """Handle ``reasoning-start``: end any text block that is still open.
+
+        No reasoning part is created here; ``on_reasoning_delta`` creates
+        it on the first non-empty delta. ``reasoning_id`` is not used.
+        """
+        if self._current_text_idx < 0:
+            return
+        self._current_text_idx = -1
+
+    def on_reasoning_delta(self, reasoning_id: str, delta: str) -> None:
+        """Append streamed reasoning to the open reasoning part, or open one.
+
+        Empty deltas are ignored. A reasoning delta closes any open text
+        block, so text and reasoning never share a part. ``reasoning_id``
+        is not used; the open part is tracked by index only.
+        """
+        if not delta:
+            return
+        if self._current_text_idx >= 0:
+            self._current_text_idx = -1
+
+        open_idx = self._current_reasoning_idx
+        if 0 <= open_idx < len(self.parts):
+            open_part = self.parts[open_idx]
+            if open_part.get("type") == "reasoning":
+                open_part["text"] += delta
+                return
+
+        # Nothing valid is open: start a new reasoning part and track it.
+        self.parts.append({"type": "reasoning", "text": delta})
+        self._current_reasoning_idx = len(self.parts) - 1
+
+    def on_reasoning_end(self, reasoning_id: str) -> None:
+        """Handle ``reasoning-end``: record that no reasoning part is open.
+
+        The next ``on_reasoning_delta`` will start a new reasoning part.
+        ``reasoning_id`` is not used.
+        """
+        self._current_reasoning_idx = -1
+
+    # ------------------------------------------------------------------
+    # Tool calls
+    # ------------------------------------------------------------------
+
+    def on_tool_input_start(
+        self,
+        ui_id: str,
+        tool_name: str,
+        langchain_tool_call_id: str | None,
+    ) -> None:
+        """Register a tool-call card with empty ``args``.
+
+        Later ``tool-input-*`` / ``tool-output-*`` events fill the card in.
+        A falsy ``ui_id`` is ignored. Registering closes any open text or
+        reasoning block.
+
+        Duplicate registrations are collapsed: parity_v2 may emit
+        ``tool-input-start`` from both ``on_chat_model_stream`` (when
+        tool_call_chunks register a name) and ``on_tool_start`` (the
+        canonical path). The FE de-dupes via ``toolCallIndices``; here a
+        repeat only backfills ``langchainToolCallId`` if the card does not
+        have one yet, and leaves everything else untouched.
+        """
+        if not ui_id:
+            return
+
+        known_idx = self._tool_call_idx_by_ui_id.get(ui_id)
+        if known_idx is not None:
+            if langchain_tool_call_id:
+                existing = self.parts[known_idx]
+                if not existing.get("langchainToolCallId"):
+                    existing["langchainToolCallId"] = langchain_tool_call_id
+            return
+
+        card: dict[str, Any] = {
+            "type": "tool-call",
+            "toolCallId": ui_id,
+            "toolName": tool_name,
+            "args": {},
+        }
+        if langchain_tool_call_id:
+            card["langchainToolCallId"] = langchain_tool_call_id
+        # The card lands at the current end of ``parts``.
+        self._tool_call_idx_by_ui_id[ui_id] = len(self.parts)
+        self.parts.append(card)
+
+        self._current_text_idx = -1
+        self._current_reasoning_idx = -1
+
+    def on_tool_input_delta(self, ui_id: str, args_chunk: str) -> None:
+        """Concatenate a streamed args chunk onto the card's ``argsText``.
+
+        Like FE ``appendToolInputDelta``, this does nothing when no card is
+        registered for ``ui_id`` -- the chunk has nowhere safe to land. It
+        also does nothing for a falsy ``ui_id`` / ``args_chunk``, a stale
+        index, or an index that no longer points at a tool-call part.
+        """
+        if not (ui_id and args_chunk):
+            return
+        card_idx = self._tool_call_idx_by_ui_id.get(ui_id)
+        if card_idx is None or not 0 <= card_idx < len(self.parts):
+            return
+        card = self.parts[card_idx]
+        if card.get("type") != "tool-call":
+            return
+        accumulated = (card.get("argsText") or "") + args_chunk
+        card["argsText"] = accumulated
+        self._args_text_by_ui_id[ui_id] = accumulated
+
+    def on_tool_input_available(
+        self,
+        ui_id: str,
+        tool_name: str,
+        args: dict[str, Any],
+        langchain_tool_call_id: str | None,
+    ) -> None:
+        """Store the final tool input on the card, creating the card if needed.
+
+        Mirrors the FE ``stream-pipeline.ts`` handling of
+        ``tool-input-available``: ``args`` becomes the full dict,
+        ``argsText`` is replaced by ``json.dumps(args, indent=2)`` so the
+        post-stream card shows pretty-printed JSON, and
+        ``langchainToolCallId`` is backfilled only when still unset.
+
+        If no usable card is registered for ``ui_id`` (no earlier
+        ``tool-input-start``: legacy parity_v2-OFF / late-registration
+        paths), a new card is appended and any open text/reasoning block
+        is closed. A falsy ``ui_id`` is ignored.
+        """
+        if not ui_id:
+            return
+
+        final_args = args or {}
+        try:
+            pretty_args = json.dumps(final_args, indent=2, ensure_ascii=False)
+        except (TypeError, ValueError):
+            # ``args`` should already be JSON-safe (the streaming layer
+            # sanitizes it before emitting). If it is not, keep a readable
+            # fallback so the call is still recorded and the snapshot
+            # is not broken.
+            pretty_args = str(args)
+
+        card_idx = self._tool_call_idx_by_ui_id.get(ui_id)
+        if card_idx is not None and 0 <= card_idx < len(self.parts):
+            card = self.parts[card_idx]
+            if card.get("type") == "tool-call":
+                card["args"] = final_args
+                card["argsText"] = pretty_args
+                if langchain_tool_call_id and not card.get("langchainToolCallId"):
+                    card["langchainToolCallId"] = langchain_tool_call_id
+                return
+
+        # No usable card for this ``ui_id``: register one now.
+        late_card: dict[str, Any] = {
+            "type": "tool-call",
+            "toolCallId": ui_id,
+            "toolName": tool_name,
+            "args": final_args,
+            "argsText": pretty_args,
+        }
+        if langchain_tool_call_id:
+            late_card["langchainToolCallId"] = langchain_tool_call_id
+        self._tool_call_idx_by_ui_id[ui_id] = len(self.parts)
+        self.parts.append(late_card)
+
+        self._current_text_idx = -1
+        self._current_reasoning_idx = -1
+
+    def on_tool_output_available(
+        self,
+        ui_id: str,
+        output: Any,
+        langchain_tool_call_id: str | None,
+    ) -> None:
+        """Store the tool's output as ``result`` on the matching card.
+
+        Like FE ``updateToolCall``, ``langchainToolCallId`` is only
+        backfilled when the card has none yet, so a late or missing value
+        never overwrites an earlier good one. The call is ignored when
+        ``ui_id`` is falsy, unknown, stale, or no longer points at a
+        tool-call part.
+        """
+        if not ui_id:
+            return
+        card_idx = self._tool_call_idx_by_ui_id.get(ui_id)
+        if card_idx is None:
+            return
+        if card_idx < 0 or card_idx >= len(self.parts):
+            return
+        card = self.parts[card_idx]
+        if card.get("type") != "tool-call":
+            return
+        card["result"] = output
+        if langchain_tool_call_id and not card.get("langchainToolCallId"):
+            card["langchainToolCallId"] = langchain_tool_call_id
+
+    # ------------------------------------------------------------------
+    # Thinking steps & step separators
+    # ------------------------------------------------------------------
+
+    def on_thinking_step(
+        self,
+        step_id: str,
+        title: str,
+        status: str,
+        items: list[str] | None,
+    ) -> None:
+        """Insert or update one step in the single ``data-thinking-steps`` part.
+
+        Mirrors FE ``updateThinkingSteps``. The message holds at most one
+        ``data-thinking-steps`` part. The first step seen inserts that part
+        at index 0 (the FE does ``unshift``); later calls replace the step
+        with the same ``step_id`` or append a new one. A falsy ``step_id``
+        is ignored. Missing values default to ``""`` (title),
+        ``"in_progress"`` (status) and ``[]`` (items).
+        """
+        if not step_id:
+            return
+
+        step_entry = {
+            "id": step_id,
+            "title": title or "",
+            "status": status or "in_progress",
+            "items": list(items) if items else [],
+        }
+
+        # Locate the existing thinking-steps part, if there is one.
+        bucket_idx = next(
+            (
+                pos
+                for pos, part in enumerate(self.parts)
+                if part.get("type") == "data-thinking-steps"
+            ),
+            -1,
+        )
+
+        if bucket_idx < 0:
+            # First sight: put the part at the head (FE parity) ...
+            self.parts.insert(
+                0,
+                {
+                    "type": "data-thinking-steps",
+                    "data": {"steps": [step_entry]},
+                },
+            )
+            # ... and shift every tracked index by one to compensate.
+            if self._current_text_idx >= 0:
+                self._current_text_idx += 1
+            if self._current_reasoning_idx >= 0:
+                self._current_reasoning_idx += 1
+            for tracked_id in self._tool_call_idx_by_ui_id:
+                self._tool_call_idx_by_ui_id[tracked_id] += 1
+            return
+
+        steps = self.parts[bucket_idx].get("data", {}).get("steps") or []
+        for pos, step in enumerate(steps):
+            if step.get("id") == step_id:
+                steps[pos] = step_entry
+                break
+        else:
+            # No step with this id yet.
+            steps.append(step_entry)
+        self.parts[bucket_idx] = {
+            "type": "data-thinking-steps",
+            "data": {"steps": steps},
+        }
+
+    def on_step_separator(self) -> None:
+        """Append a ``data-step-separator`` between consecutive model steps.
+
+        Mirrors FE ``addStepSeparator``. Nothing is appended unless the
+        message already holds meaningful content (text, reasoning or a
+        tool call) and the last part is not itself a separator, so the
+        first step of a turn gets no leading divider and dividers never
+        double up. ``stepIndex`` is the number of separators already in
+        ``parts``. Appending one closes any open text/reasoning block.
+        """
+        kinds = [part.get("type") for part in self.parts]
+        if not any(kind in _MEANINGFUL_PART_TYPES for kind in kinds):
+            return
+        if kinds[-1] == "data-step-separator":
+            return
+        self.parts.append(
+            {
+                "type": "data-step-separator",
+                "data": {"stepIndex": kinds.count("data-step-separator")},
+            }
+        )
+        self._current_text_idx = -1
+        self._current_reasoning_idx = -1
+
+    # ------------------------------------------------------------------
+    # Interruption handling
+    # ------------------------------------------------------------------
+
+    def mark_interrupted(self) -> None:
+        """Bring the parts to a coherent end-state after an interrupted turn.
+
+        Called from the streaming ``finally`` block before ``snapshot()``
+        so the persisted JSONB makes sense even if the client disconnected
+        mid-turn or the agent hit a fatal error.
+
+        - Open text/reasoning blocks are simply closed; no synthetic
+          content is appended and whatever was streamed stays as-is.
+        - Every tool-call part without a ``result`` key gets
+          ``state="aborted"`` so the FE history loader can render it as
+          "interrupted" rather than "still running".
+        """
+        self._current_text_idx = -1
+        self._current_reasoning_idx = -1
+        for part in self.parts:
+            # Key presence decides, so a stored ``None`` result still
+            # counts as completed.
+            if part.get("type") == "tool-call" and "result" not in part:
+                part["state"] = "aborted"
+
+    # ------------------------------------------------------------------
+    # Snapshot & introspection
+    # ------------------------------------------------------------------
+
+    def snapshot(self) -> list[dict[str, Any]]:
+        """Return a deep copy of ``parts`` for the SQL UPDATE / ``json.dumps``.
+
+        The copy is independent of the builder, so nothing that finalizes
+        from the shielded ``finally`` block can mutate the payload while
+        the UPDATE is in flight. The streaming layer does not touch the
+        builder after this call; the copy is a cheap extra safeguard.
+        """
+        frozen_parts = copy.deepcopy(self.parts)
+        return frozen_parts
+
+    def is_empty(self) -> bool:
+        """Report whether the turn captured no meaningful content.
+
+        Only text, reasoning and tool-call parts count.
+        ``data-thinking-steps`` and ``data-step-separator`` merely
+        decorate meaningful content, so a turn that emitted nothing but a
+        thinking step before being interrupted is still empty for the
+        status-marker fallback.
+        """
+        for part in self.parts:
+            if part.get("type") in _MEANINGFUL_PART_TYPES:
+                return False
+        return True
+
+    def stats(self) -> dict[str, int]:
+        """Return per-type part counts plus the payload size in UTF-8 bytes.
+
+        Used by the streaming layer's perf logger so an ops dashboard
+        can correlate finalize latency with payload size, and so a
+        regression that quietly stops emitting tool-call parts (or
+        starts emitting hundreds) shows up in [PERF] grep rather than
+        only as a "history reload looks weird" bug report.
+
+        Returned keys:
+
+        - ``parts``: total number of parts.
+        - ``bytes``: length in bytes of the UTF-8 encoded JSON
+          serialisation of ``parts`` (``ensure_ascii=False``), or ``-1``
+          if serialisation fails. This is a back-of-the-envelope figure
+          for what is sent to the JSONB column, not an exact on-disk size.
+        - ``text`` / ``reasoning`` / ``tool_calls`` /
+          ``thinking_step_parts`` / ``step_separators``: independent
+          counts per part type, so any one can spike without the others.
+        - ``tool_calls_completed``: tool calls that have a ``result`` key
+          and are not marked aborted.
+        - ``tool_calls_aborted``: tool calls with ``state == "aborted"``.
+
+        Never raises for serialisation problems: perf logging must not be
+        the thing that breaks the streaming ``finally`` block.
+        """
+        text_blocks = 0
+        reasoning_blocks = 0
+        tool_calls = 0
+        tool_calls_completed = 0
+        tool_calls_aborted = 0
+        thinking_step_parts = 0
+        step_separators = 0
+
+        for part in self.parts:
+            kind = part.get("type")
+            if kind == "text":
+                text_blocks += 1
+            elif kind == "reasoning":
+                reasoning_blocks += 1
+            elif kind == "tool-call":
+                tool_calls += 1
+                # "aborted" wins over "completed": a card is counted once.
+                if part.get("state") == "aborted":
+                    tool_calls_aborted += 1
+                elif "result" in part:
+                    tool_calls_completed += 1
+            elif kind == "data-thinking-steps":
+                thinking_step_parts += 1
+            elif kind == "data-step-separator":
+                step_separators += 1
+
+        try:
+            serialized = json.dumps(self.parts, ensure_ascii=False, default=str)
+            # ``len(serialized)`` would count code points, which
+            # under-reports every non-ASCII character; measure the encoded
+            # length instead. ``surrogatepass`` keeps a stray lone surrogate
+            # from turning the measurement into a failure.
+            byte_size = len(serialized.encode("utf-8", errors="surrogatepass"))
+        except (TypeError, ValueError):
+            # Covers non-serialisable values and circular references.
+            byte_size = -1
+
+        return {
+            "parts": len(self.parts),
+            "bytes": byte_size,
+            "text": text_blocks,
+            "reasoning": reasoning_blocks,
+            "tool_calls": tool_calls,
+            "tool_calls_completed": tool_calls_completed,
+            "tool_calls_aborted": tool_calls_aborted,
+            "thinking_step_parts": thinking_step_parts,
+            "step_separators": step_separators,
+        }
--- a/surfsense_backend/app/tasks/chat/persistence.py
+++ b/surfsense_backend/app/tasks/chat/persistence.py
@ -0,0 +1,534 @@
+"""Server-side message persistence helpers for the streaming chat agent.
+
+Historically the streaming task (``stream_new_chat``/``stream_resume_chat``)
+left ``new_chat_messages`` empty and relied on the frontend to round-trip
+``POST /threads/{id}/messages`` afterwards. That gave authenticated clients
+a "ghost-thread" abuse vector: skip the round-trip and burn LLM tokens
+without leaving an audit trail. These helpers move both writes (the user
+turn that triggered the stream and the assistant turn the stream produced)
+into the server itself, idempotent against the partial unique index
+``uq_new_chat_messages_thread_turn_role`` so legacy frontends that *do*
+keep posting via ``appendMessage`` simply hit the unique-index recovery
+path on the second writer instead of creating duplicates.
+
+Assistant turn lifecycle
+------------------------
+The assistant side is split into two helpers so we can capture the row id
+*before* the stream produces any output:
+
+* ``persist_assistant_shell`` runs immediately after ``persist_user_turn``
+  and INSERTs an empty assistant row anchored to ``(thread_id, turn_id,
+  ASSISTANT)``. Returns the row id so the streaming layer can correlate
+  later writes (token_usage, AgentActionLog future-correlation) against
+  a stable PK from the start of the turn.
+* ``finalize_assistant_turn`` runs from the streaming ``finally`` block.
+  It UPDATEs the row's ``content`` to the rich ``ContentPart[]`` snapshot
+  produced server-side by ``AssistantContentBuilder`` and writes the
+  ``token_usage`` row using ``INSERT ... ON CONFLICT DO NOTHING`` against
+  the ``uq_token_usage_message_id`` partial unique index from migration
+  142, hard-eliminating any race against ``append_message``'s recovery
+  branch.
+
+Defensive contract
+------------------
+
+* Every helper runs inside ``shielded_async_session()`` so ``session.close()``
+  survives starlette's mid-stream cancel scope on client disconnect.
+* ``persist_user_turn`` and ``persist_assistant_shell`` use ``INSERT ... ON
+  CONFLICT DO NOTHING ... RETURNING id`` keyed on the ``(thread_id, turn_id,
+  role)`` partial unique index. On conflict the insert silently no-ops at
+  the DB level — no Python ``IntegrityError`` is constructed, which
+  eliminates spurious debugger pauses and keeps logs clean. On conflict a
+  follow-up ``SELECT`` resolves the existing row id so the streaming layer
+  can correlate writes against a stable PK.
+* ``finalize_assistant_turn`` is best-effort: it never raises. The
+  streaming ``finally`` block calls it from within
+  ``anyio.CancelScope(shield=True)`` and any raised exception there
+  would mask the real error.
+"""
+
+from __future__ import annotations
+
+import logging
+import time
+from datetime import UTC, datetime
+from typing import Any
+from uuid import UUID
+
+from sqlalchemy import text as sa_text
+from sqlalchemy.dialects.postgresql import insert as pg_insert
+from sqlalchemy.future import select
+
+from app.db import (
+    NewChatMessage,
+    NewChatMessageRole,
+    NewChatThread,
+    TokenUsage,
+    shielded_async_session,
+)
+from app.services.token_tracking_service import (
+    TurnTokenAccumulator,
+)
+from app.utils.perf import get_perf_logger
+
+logger = logging.getLogger(__name__)
+_perf_log = get_perf_logger()
+
+
+# Placeholder content for a freshly inserted assistant row.
+# ``finalize_assistant_turn`` replaces it in a single UPDATE at
+# end-of-stream with the full ``ContentPart[]`` snapshot produced by
+# ``AssistantContentBuilder``. It is a one-element list holding an empty
+# text part so that a crash between the shell INSERT and finalize leaves
+# the row in a FE-renderable shape (blank bubble) instead of blowing up
+# the history loader.
+_EMPTY_SHELL_CONTENT: list[dict[str, Any]] = [
+    {"type": "text", "text": ""},
+]
+
+# Content substituted for genuinely empty turns (no text, no reasoning,
+# no tool calls). The streaming layer switches to this when
+# ``AssistantContentBuilder.is_empty()`` returns True, so the persisted
+# row is at least somewhat self-describing instead of an empty text
+# bubble. The FE's ``ContentPart`` union doesn't include ``status`` yet,
+# so the history loader silently drops this part and renders a blank
+# bubble (same as today's behaviour for empty turns); a follow-up FE PR
+# adds the explicit "no response" rendering.
+_STATUS_NO_RESPONSE: list[dict[str, Any]] = [
+    {
+        "type": "status",
+        "text": "(no text response)",
+    },
+]
+
+
+def _build_user_content(
+    user_query: str,
+    user_image_data_urls: list[str] | None,
+    mentioned_documents: list[dict[str, Any]] | None = None,
+) -> list[dict[str, Any]]:
+    """Assemble the persisted user-message ``content`` (assistant-ui v2 parts).
+
+    The result has the shape the existing frontend posts via
+    ``appendMessage`` (see ``surfsense_web/.../new-chat/[[...chat_id]]/page.tsx``):
+
+        [{"type": "text", "text": "..."},
+         {"type": "image", "image": "data:..."},
+         {"type": "mentioned-documents", "documents": [{"id": int,
+            "title": str, "document_type": str}, ...]}]
+
+    ``app.utils.user_message_multimodal.split_persisted_user_content_parts``
+    is the companion reader and expects exactly this shape -- keep the two
+    in sync.
+
+    Part order and filtering:
+
+    * One text part always comes first; a falsy ``user_query`` is stored
+      as ``""``.
+    * One image part follows per non-empty string in
+      ``user_image_data_urls``; other entries are skipped.
+    * A single ``mentioned-documents`` part is appended last when at least
+      one entry of ``mentioned_documents`` is a dict whose ``id``,
+      ``title`` and ``document_type`` are all non-``None``. ``title`` and
+      ``document_type`` are stringified and ``id`` is kept as given.
+      Mirrors the FE injection in ``page.tsx`` (``persistUserTurn``).
+    """
+    content: list[dict[str, Any]] = [{"type": "text", "text": user_query or ""}]
+    content.extend(
+        {"type": "image", "image": data_url}
+        for data_url in user_image_data_urls or ()
+        if isinstance(data_url, str) and data_url
+    )
+    if not mentioned_documents:
+        return content
+
+    kept_docs: list[dict[str, Any]] = []
+    for entry in mentioned_documents:
+        if not isinstance(entry, dict):
+            continue
+        doc_id = entry.get("id")
+        title = entry.get("title")
+        document_type = entry.get("document_type")
+        # Drop entries that lack any of the three required fields.
+        if None in (doc_id, title, document_type) and any(
+            field is None for field in (doc_id, title, document_type)
+        ):
+            continue
+        kept_docs.append(
+            {
+                "id": doc_id,
+                "title": str(title),
+                "document_type": str(document_type),
+            }
+        )
+    if kept_docs:
+        content.append({"type": "mentioned-documents", "documents": kept_docs})
+    return content
+
+
+async def persist_user_turn(
+    *,
+    chat_id: int,
+    user_id: str | None,
+    turn_id: str,
+    user_query: str,
+    user_image_data_urls: list[str] | None = None,
+    mentioned_documents: list[dict[str, Any]] | None = None,
+) -> int | None:
+    """Persist the user-side row for a chat turn and return its ``id``.
+
+    Uses ``INSERT ... ON CONFLICT DO NOTHING ... RETURNING id`` keyed on the
+    ``(thread_id, turn_id, role)`` partial unique index from migration 141
+    (``WHERE turn_id IS NOT NULL``). On conflict the insert silently no-ops
+    at the DB level — no Python ``IntegrityError`` is constructed, which
+    eliminates the debugger pause that ``justMyCode=false`` + async greenlet
+    interactions used to produce, and keeps production logs clean.
+
+    Returns the ``id`` of the row that exists for this turn after the call:
+    the freshly inserted ``id`` on the happy path, or the existing ``id``
+    when a previous writer (legacy FE ``appendMessage`` racing the SSE
+    stream, redelivered request, etc.) already wrote it. Returns ``None``
+    only on genuine DB failure; the caller should yield a streaming error
+    and abort the turn so we never produce a title/assistant row that
+    isn't anchored to a persisted user message.
+
+    Other constraint violations (FK, NOT NULL, etc.) still raise
+    ``IntegrityError`` — only the ``(thread_id, turn_id, role)`` collision
+    is silenced.
+    """
+    if not turn_id:
+        # Defensive: turn_id is always populated by the streaming path
+        # before this helper is called. If it isn't, we cannot be
+        # idempotent against the unique index — refuse to write rather
+        # than create a row the unique index can't dedupe.
+        logger.error(
+            "persist_user_turn called without a turn_id (chat_id=%s); skipping",
+            chat_id,
+        )
+        return None
+
+    t0 = time.perf_counter()
+    outcome = "failed"
+    resolved_id: int | None = None
+    try:
+        async with shielded_async_session() as ws:
+            # Re-attach the thread row so we can also bump updated_at
+            # in the same write — keeps the sidebar ordering accurate
+            # when a user fires off a turn but never reaches the
+            # legacy appendMessage.
+            thread = await ws.get(NewChatThread, chat_id)
+            author_uuid: UUID | None = None
+            if user_id:
+                try:
+                    author_uuid = UUID(user_id)
+                except (TypeError, ValueError):
+                    logger.warning(
+                        "persist_user_turn: invalid user_id=%r, persisting as anonymous",
+                        user_id,
+                    )
+
+            content_payload = _build_user_content(
+                user_query, user_image_data_urls, mentioned_documents
+            )
+            insert_stmt = (
+                pg_insert(NewChatMessage)
+                .values(
+                    thread_id=chat_id,
+                    role=NewChatMessageRole.USER,
+                    content=content_payload,
+                    author_id=author_uuid,
+                    turn_id=turn_id,
+                )
+                .on_conflict_do_nothing(
+                    index_elements=["thread_id", "turn_id", "role"],
+                    index_where=sa_text("turn_id IS NOT NULL"),
+                )
+                .returning(NewChatMessage.id)
+            )
+            inserted_id = (await ws.execute(insert_stmt)).scalar()
+
+            if inserted_id is None:
+                # Conflict on partial unique index — another writer
+                # (legacy FE appendMessage, redelivered request, etc.)
+                # already persisted this row. Look it up and reuse.
+                lookup = await ws.execute(
+                    select(NewChatMessage.id).where(
+                        NewChatMessage.thread_id == chat_id,
+                        NewChatMessage.turn_id == turn_id,
+                        NewChatMessage.role == NewChatMessageRole.USER,
+                    )
+                )
+                existing_id = lookup.scalars().first()
+                if existing_id is None:
+                    # Conflict reported but no row found — extremely
+                    # unlikely (concurrent DELETE). Surface as failure.
+                    logger.warning(
+                        "persist_user_turn: conflict but no matching row "
+                        "(chat_id=%s, turn_id=%s)",
+                        chat_id,
+                        turn_id,
+                    )
+                    outcome = "integrity_no_match"
+                    return None
+                resolved_id = int(existing_id)
+                outcome = "race_recovered"
+            else:
+                resolved_id = int(inserted_id)
+                outcome = "inserted"
+                # Bump thread.updated_at only on a real insert — when
+                # we recovered an existing row the prior writer
+                # already touched the thread.
+                if thread is not None:
+                    thread.updated_at = datetime.now(UTC)
+
+            await ws.commit()
+            return resolved_id
+    except Exception:
+        logger.exception(
+            "persist_user_turn failed (chat_id=%s, turn_id=%s)",
+            chat_id,
+            turn_id,
+        )
+        return None
+    finally:
+        _perf_log.info(
+            "[persist_user_turn] outcome=%s chat_id=%s turn_id=%s "
+            "message_id=%s query_len=%d images=%d mentioned_docs=%d "
+            "in %.3fs",
+            outcome,
+            chat_id,
+            turn_id,
+            resolved_id,
+            len(user_query or ""),
+            len(user_image_data_urls or ()),
+            len(mentioned_documents or ()),
+            time.perf_counter() - t0,
+        )
+
+
+async def persist_assistant_shell(
+    *,
+    chat_id: int,
+    user_id: str | None,
+    turn_id: str,
+) -> int | None:
+    """Pre-write an empty assistant row for the turn and return its id.
+
+    Inserts a placeholder ``new_chat_messages`` row (empty text content) so
+    the streaming layer has a stable ``message_id`` to correlate against
+    for the rest of the turn. ``finalize_assistant_turn`` overwrites the
+    ``content`` field at end-of-stream with the rich ``ContentPart[]``
+    snapshot produced by ``AssistantContentBuilder``.
+
+    Returns the row id on success, ``None`` on a genuine DB failure (caller
+    should abort the turn rather than stream into a void).
+
+    Idempotent against the ``(thread_id, turn_id, ASSISTANT)`` partial unique
+    index from migration 141: if a row already exists (resume retry, racing
+    legacy frontend, redelivered request, etc.) we look it up by
+    ``(thread_id, turn_id, role)`` and return its existing id. The streaming
+    layer is then free to UPDATE that row at finalize time.
+    """
+    if not turn_id:
+        logger.error(
+            "persist_assistant_shell called without a turn_id (chat_id=%s); skipping",
+            chat_id,
+        )
+        return None
+
+    t0 = time.perf_counter()
+    outcome = "failed"
+    resolved_id: int | None = None
+    try:
+        async with shielded_async_session() as ws:
+            insert_stmt = (
+                pg_insert(NewChatMessage)
+                .values(
+                    thread_id=chat_id,
+                    role=NewChatMessageRole.ASSISTANT,
+                    content=_EMPTY_SHELL_CONTENT,
+                    author_id=None,
+                    turn_id=turn_id,
+                )
+                .on_conflict_do_nothing(
+                    index_elements=["thread_id", "turn_id", "role"],
+                    index_where=sa_text("turn_id IS NOT NULL"),
+                )
+                .returning(NewChatMessage.id)
+            )
+            inserted_id = (await ws.execute(insert_stmt)).scalar()
+
+            if inserted_id is None:
+                # Conflict — another writer (legacy FE appendMessage,
+                # resume retry, redelivered request) wrote the
+                # (thread_id, turn_id, ASSISTANT) row first. Look it up
+                # so the streaming layer can UPDATE the same row at
+                # finalize time.
+                lookup = await ws.execute(
+                    select(NewChatMessage.id).where(
+                        NewChatMessage.thread_id == chat_id,
+                        NewChatMessage.turn_id == turn_id,
+                        NewChatMessage.role == NewChatMessageRole.ASSISTANT,
+                    )
+                )
+                existing_id = lookup.scalars().first()
+                if existing_id is None:
+                    logger.warning(
+                        "persist_assistant_shell: conflict but no matching "
+                        "(thread_id, turn_id, role) row found "
+                        "(chat_id=%s, turn_id=%s)",
+                        chat_id,
+                        turn_id,
+                    )
+                    outcome = "integrity_no_match"
+                    return None
+                resolved_id = int(existing_id)
+                outcome = "race_recovered"
+            else:
+                resolved_id = int(inserted_id)
+                outcome = "inserted"
+
+            await ws.commit()
+            return resolved_id
+    except Exception:
+        logger.exception(
+            "persist_assistant_shell failed (chat_id=%s, turn_id=%s)",
+            chat_id,
+            turn_id,
+        )
+        return None
+    finally:
+        _perf_log.info(
+            "[persist_assistant_shell] outcome=%s chat_id=%s turn_id=%s "
+            "message_id=%s in %.3fs",
+            outcome,
+            chat_id,
+            turn_id,
+            resolved_id,
+            time.perf_counter() - t0,
+        )
+
+
+async def finalize_assistant_turn(
+    *,
+    message_id: int,
+    chat_id: int,
+    search_space_id: int,
+    user_id: str | None,
+    turn_id: str,
+    content: list[dict[str, Any]],
+    accumulator: TurnTokenAccumulator | None,
+) -> None:
+    """Finalize the assistant row and write its token_usage.
+
+    Two writes in a single shielded session:
+
+    1. ``UPDATE new_chat_messages SET content = :c, updated_at = now()
+       WHERE id = :id`` — overwrites the placeholder ``persist_assistant_shell``
+       wrote with the full ``ContentPart[]`` snapshot produced server-side.
+    2. ``INSERT INTO token_usage (...) VALUES (...) ON CONFLICT (message_id)
+       WHERE message_id IS NOT NULL DO NOTHING`` — uses the partial unique
+       index ``uq_token_usage_message_id`` from migration 142 to make the
+       insert idempotent against ``append_message``'s recovery branch
+       (which uses the same ON CONFLICT clause).
+
+    Substitutes the status-marker payload when ``content`` is empty
+    (pure tool-call turn that aborted before any output, or interrupt
+    before any event arrived). The status marker is preferable to a
+    blank text bubble because token accounting still runs and an ops
+    dashboard can flag the row.
+
+    Best-effort — never raises. The streaming ``finally`` calls this
+    from within ``anyio.CancelScope(shield=True)``; any raised exception
+    here would mask the real error that triggered the cleanup.
+    """
+    if not turn_id:
+        logger.error(
+            "finalize_assistant_turn called without turn_id "
+            "(chat_id=%s, message_id=%s); skipping",
+            chat_id,
+            message_id,
+        )
+        return
+    if not message_id:
+        logger.error(
+            "finalize_assistant_turn called without message_id "
+            "(chat_id=%s, turn_id=%s); skipping",
+            chat_id,
+            turn_id,
+        )
+        return
+
+    payload: list[dict[str, Any]]
+    is_status_marker = False
+    if content:
+        payload = content
+    else:
+        payload = _STATUS_NO_RESPONSE
+        is_status_marker = True
+
+    t0 = time.perf_counter()
+    outcome = "failed"
+    token_usage_attempted = bool(
+        accumulator is not None and accumulator.calls and user_id
+    )
+    try:
+        async with shielded_async_session() as ws:
+            assistant_row = await ws.get(NewChatMessage, message_id)
+            if assistant_row is None:
+                logger.warning(
+                    "finalize_assistant_turn: row not found "
+                    "(chat_id=%s, message_id=%s, turn_id=%s); skipping",
+                    chat_id,
+                    message_id,
+                    turn_id,
+                )
+                outcome = "row_missing"
+                return
+
+            assistant_row.content = payload
+            assistant_row.updated_at = datetime.now(UTC)
+
+            # Token usage. ``record_token_usage`` (used elsewhere) does
+            # SELECT-then-INSERT in two statements which races with
+            # ``append_message``. Switch to a single INSERT ... ON
+            # CONFLICT DO NOTHING keyed on the migration-142 partial
+            # unique index so the loser silently drops its write at
+            # the DB level — exactly one row per ``message_id``,
+            # regardless of which session committed first.
+            if accumulator is not None and accumulator.calls and user_id:
+                try:
+                    user_uuid = UUID(user_id)
+                except (TypeError, ValueError):
+                    logger.warning(
+                        "finalize_assistant_turn: invalid user_id=%r, "
+                        "skipping token_usage row",
+                        user_id,
+                    )
+                else:
+                    insert_stmt = (
+                        pg_insert(TokenUsage)
+                        .values(
+                            usage_type="chat",
+                            prompt_tokens=accumulator.total_prompt_tokens,
+                            completion_tokens=accumulator.total_completion_tokens,
+                            total_tokens=accumulator.grand_total,
+                            cost_micros=accumulator.total_cost_micros,
+                            model_breakdown=accumulator.per_message_summary(),
+                            call_details={"calls": accumulator.serialized_calls()},
+                            thread_id=chat_id,
+                            message_id=message_id,
+                            search_space_id=search_space_id,
+                            user_id=user_uuid,
+                        )
+                        .on_conflict_do_nothing(
+                            index_elements=["message_id"],
+                            index_where=sa_text("message_id IS NOT NULL"),
+                        )
+                    )
+                    await ws.execute(insert_stmt)
+
+            await ws.commit()
+            outcome = "ok"
+    except Exception:
+        logger.exception(
+            "finalize_assistant_turn failed (chat_id=%s, message_id=%s, turn_id=%s)",
+            chat_id,
+            message_id,
+            turn_id,
+        )
+    finally:
+        _perf_log.info(
+            "[finalize_assistant_turn] outcome=%s chat_id=%s message_id=%s "
+            "turn_id=%s parts=%d status_marker=%s "
+            "token_usage_attempted=%s in %.3fs",
+            outcome,
+            chat_id,
+            message_id,
+            turn_id,
+            len(payload),
+            is_status_marker,
+            token_usage_attempted,
+            time.perf_counter() - t0,
+        )
--- a/surfsense_backend/app/tasks/chat/stream_new_chat.py
+++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py