mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-12 17:22:38 +02:00
feat: improved document, folder mentions rendering
Some checks are pending
Build and Push Docker Images / tag_release (push) Waiting to run
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (backend, surfsense-backend) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (web, surfsense-web) (push) Blocked by required conditions
Some checks are pending
Build and Push Docker Images / tag_release (push) Waiting to run
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (backend, surfsense-backend) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (web, surfsense-web) (push) Blocked by required conditions
This commit is contained in:
parent
28a02a9143
commit
c8374e6c5b
59 changed files with 1725 additions and 361 deletions
|
|
@ -46,6 +46,10 @@ class SurfSenseContextSchema:
|
|||
Read by ``KnowledgePriorityMiddleware`` to seed its priority
|
||||
list. Stays out of the compiled-agent cache key — that's the
|
||||
whole point of putting it here.
|
||||
mentioned_folder_ids: KB folders the user @-mentioned this turn
|
||||
(cloud filesystem mode). Surfaced as ``[USER-MENTIONED]``
|
||||
entries in ``<priority_documents>`` so the agent prioritises
|
||||
walking those folders with ``ls`` / ``find_documents``.
|
||||
file_operation_contract: One-shot file operation contract emitted
|
||||
by ``FileIntentMiddleware`` for the upcoming turn.
|
||||
turn_id / request_id: Correlation IDs surfaced by the streaming
|
||||
|
|
@ -59,6 +63,7 @@ class SurfSenseContextSchema:
|
|||
|
||||
search_space_id: int | None = None
|
||||
mentioned_document_ids: list[int] = field(default_factory=list)
|
||||
mentioned_folder_ids: list[int] = field(default_factory=list)
|
||||
file_operation_contract: FileOperationContractState | None = None
|
||||
turn_id: str | None = None
|
||||
request_id: str | None = None
|
||||
|
|
|
|||
281
surfsense_backend/app/agents/new_chat/mention_resolver.py
Normal file
281
surfsense_backend/app/agents/new_chat/mention_resolver.py
Normal file
|
|
@ -0,0 +1,281 @@
|
|||
"""Resolve @-mention chips to canonical virtual paths and substitute the
|
||||
user-visible ``@title`` tokens with backtick-wrapped paths in the prompt
|
||||
the agent sees.
|
||||
|
||||
The frontend's mention seam is a single discriminated-union list of
|
||||
``{kind: "doc" | "folder", id, title, document_type?}`` chips (see
|
||||
``surfsense_web/atoms/chat/mentioned-documents.atom.ts``). When a turn
|
||||
reaches the backend stream task we have three needs that this module
|
||||
centralises:
|
||||
|
||||
1. Map each chip to its canonical virtual path
|
||||
(``/documents/.../file.xml`` for docs, ``/documents/MyFolder/`` for
|
||||
folders) so the agent sees concrete filesystem locations instead of
|
||||
ambiguous ``@``-titles.
|
||||
2. Substitute ``@title`` tokens in the user-typed text with backtick-
|
||||
wrapped paths so the path becomes part of the ``HumanMessage`` body
|
||||
the LLM consumes — without rewriting the persisted user message
|
||||
text (which keeps ``@title`` so chip rendering on reload is
|
||||
unchanged).
|
||||
3. Surface the resolved id sets (docs + folders) to the priority
|
||||
middleware so it can render ``[USER-MENTIONED]`` priority entries
|
||||
without re-doing path resolution.
|
||||
|
||||
This is intentionally one module — see the architectural note in
|
||||
``mention-paths-and-folders`` plan: previously the doc-resolution lived
|
||||
inline in ``stream_new_chat`` and the folder mention had no resolution
|
||||
at all. Centralising both behind a single ``resolve_mentions`` call
|
||||
turns a leaky multi-field seam into a single deeper interface.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.agents.new_chat.path_resolver import (
|
||||
DOCUMENTS_ROOT,
|
||||
build_path_index,
|
||||
doc_to_virtual_path,
|
||||
)
|
||||
from app.db import Document, Folder
|
||||
from app.schemas.new_chat import MentionedDocumentInfo
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ResolvedMention:
    """Canonical view of a single @-mention chip.

    ``virtual_path`` is the path the agent will see (no trailing slash
    for documents, trailing ``/`` for folders to match the convention
    used by ``KnowledgeTreeMiddleware``).

    Frozen because instances are shared read-only between the resolver
    and downstream consumers — nothing should mutate a resolved chip.
    """

    kind: str  # discriminator: "doc" | "folder"
    id: int  # primary key of the Document or Folder row
    title: str  # chip title as emitted by the frontend editor
    virtual_path: str  # canonical path, e.g. "/documents/A/f.xml" or "/documents/A/"
|
||||
|
||||
|
||||
@dataclass
class ResolvedMentionSet:
    """Aggregate result of resolving a turn's mention chips.

    ``token_to_path`` maps ``@title`` (the literal token the user typed
    and the editor emitted) to the canonical virtual path for that
    chip. It is produced longest-token-first so substitution mirrors
    ``parseMentionSegments`` on the frontend (a longer title like
    ``@Project Roadmap`` is never shadowed by a shorter prefix
    ``@Project``).

    ``mentioned_document_ids`` collapses doc + surfsense_doc chips into
    a single ordered, deduped list because the priority middleware
    treats them uniformly downstream — see
    ``KnowledgePriorityMiddleware._compute_priority_paths``.
    """

    # Every successfully resolved chip; docs precede folders.
    mentions: list[ResolvedMention] = field(default_factory=list)
    # (token, path) pairs sorted longest-token-first for substitution.
    token_to_path: list[tuple[str, str]] = field(default_factory=list)
    # Deduped doc ids that actually exist in the search space.
    mentioned_document_ids: list[int] = field(default_factory=list)
    # Deduped folder ids that actually exist in the search space.
    mentioned_folder_ids: list[int] = field(default_factory=list)
|
||||
|
||||
|
||||
def _folder_virtual_path(folder_id: int, folder_paths: dict[int, str]) -> str:
    """Return the canonical ``/documents/Folder/Sub/`` path for a folder id.

    When the folder id is absent from the index (deleted, or it lives
    in another search space) the documents root is used instead. The
    trailing slash follows the ``KnowledgeTreeMiddleware`` convention
    (``/documents/MyFolder/``) so the agent's ``ls`` can treat the
    value as a directory.
    """
    base = folder_paths.get(folder_id, DOCUMENTS_ROOT)
    if base.endswith("/"):
        return base
    return f"{base}/"
|
||||
|
||||
|
||||
async def resolve_mentions(
    session: AsyncSession,
    *,
    search_space_id: int,
    mentioned_documents: list[MentionedDocumentInfo] | None,
    mentioned_document_ids: list[int] | None = None,
    mentioned_surfsense_doc_ids: list[int] | None = None,
    mentioned_folder_ids: list[int] | None = None,
) -> ResolvedMentionSet:
    """Resolve every @-mention chip on a turn into virtual paths.

    Accepts both the unified ``mentioned_documents`` chip list (metadata
    used for token substitution + persistence) and the parallel id
    arrays (``mentioned_document_ids``, ``mentioned_surfsense_doc_ids``,
    ``mentioned_folder_ids``) because:

    * Legacy clients that haven't migrated to the unified chip list
      still send the id arrays — the union of both sources is treated
      as authoritative.
    * The id arrays are the canonical input to
      ``KnowledgePriorityMiddleware`` (via ``SurfSenseContextSchema``);
      returning the deduped, validated lists lets the route forward
      them unchanged.

    Resolution is best-effort: a chip whose id no longer exists (e.g.
    the document was deleted between mention and submit) is silently
    dropped. The agent still sees the user's original text, just
    without a backtick-path substitution for that chip.
    """
    chip_docs: list[int] = []
    chip_folders: list[int] = []
    # Titles keyed by (kind, id) exactly as sent; lookups below use the
    # canonical ("doc" | "folder") kinds.
    titles: dict[tuple[str, int], str] = {}
    for chip in mentioned_documents or []:
        if chip.kind == "folder":
            chip_folders.append(chip.id)
        else:
            chip_docs.append(chip.id)
        titles[(chip.kind, chip.id)] = chip.title

    # Union of legacy id arrays and chip ids, order-preserving dedupe.
    doc_pool: list[int] = list(
        dict.fromkeys(
            [
                *(mentioned_document_ids or []),
                *(mentioned_surfsense_doc_ids or []),
                *chip_docs,
            ]
        )
    )
    folder_pool: list[int] = list(
        dict.fromkeys([*(mentioned_folder_ids or []), *chip_folders])
    )

    if not doc_pool and not folder_pool:
        return ResolvedMentionSet()

    index = await build_path_index(session, search_space_id)

    # Batch-fetch rows scoped to the search space; ids outside the
    # space simply won't come back and get dropped below.
    docs_by_id: dict[int, Document] = {}
    if doc_pool:
        doc_result = await session.execute(
            select(Document).where(
                Document.search_space_id == search_space_id,
                Document.id.in_(doc_pool),
            )
        )
        docs_by_id = {d.id: d for d in doc_result.scalars().all()}

    folders_by_id: dict[int, Folder] = {}
    if folder_pool:
        folder_result = await session.execute(
            select(Folder).where(
                Folder.search_space_id == search_space_id,
                Folder.id.in_(folder_pool),
            )
        )
        folders_by_id = {f.id: f for f in folder_result.scalars().all()}

    mentions: list[ResolvedMention] = []
    kept_doc_ids: list[int] = []
    kept_folder_ids: list[int] = []

    for doc_id in doc_pool:
        doc = docs_by_id.get(doc_id)
        if doc is None:
            logger.debug(
                "mention_resolver: dropping doc id=%s (not found in space=%s)",
                doc_id,
                search_space_id,
            )
            continue
        mentions.append(
            ResolvedMention(
                kind="doc",
                id=doc.id,
                # Chip title wins (it's the literal token in the text);
                # fall back to the stored title for id-array-only input.
                title=titles.get(("doc", doc_id), str(doc.title or "")),
                virtual_path=doc_to_virtual_path(
                    doc_id=doc.id,
                    title=str(doc.title or "untitled"),
                    folder_id=doc.folder_id,
                    index=index,
                ),
            )
        )
        kept_doc_ids.append(doc.id)

    for folder_id in folder_pool:
        folder = folders_by_id.get(folder_id)
        if folder is None:
            logger.debug(
                "mention_resolver: dropping folder id=%s (not found in space=%s)",
                folder_id,
                search_space_id,
            )
            continue
        mentions.append(
            ResolvedMention(
                kind="folder",
                id=folder.id,
                title=titles.get(("folder", folder_id), str(folder.name or "")),
                virtual_path=_folder_virtual_path(folder.id, index.folder_paths),
            )
        )
        kept_folder_ids.append(folder.id)

    # First resolution of a duplicate token wins; empty titles can't
    # form a token. Stable sort puts longer tokens first so prefixes
    # never shadow longer titles during substitution.
    pairs: dict[str, str] = {}
    for mention in mentions:
        if mention.title:
            pairs.setdefault(f"@{mention.title}", mention.virtual_path)
    token_to_path: list[tuple[str, str]] = sorted(
        pairs.items(), key=lambda kv: len(kv[0]), reverse=True
    )

    return ResolvedMentionSet(
        mentions=mentions,
        token_to_path=token_to_path,
        mentioned_document_ids=kept_doc_ids,
        mentioned_folder_ids=kept_folder_ids,
    )
|
||||
|
||||
|
||||
def substitute_in_text(text: str, token_to_path: list[tuple[str, str]]) -> str:
    """Replace each ``@title`` token with a backtick-wrapped virtual path.

    Mirrors ``parseMentionSegments`` on the frontend: a single forward
    pass, trying tokens in the given (longest-first) order, with no
    regex — titles may contain regex metacharacters. Re-running the
    substitution on already-substituted text is a no-op because the
    backtick-wrapped path no longer begins with ``@``.

    Empty text or an empty token list returns the input unchanged so
    callers can apply this unconditionally without paying for a scan.
    """
    if not text or not token_to_path:
        return text

    pieces: list[str] = []
    pos = 0
    end = len(text)
    while pos < end:
        for token, path in token_to_path:
            if text.startswith(token, pos):
                pieces.append(f"`{path}`")
                pos += len(token)
                break
        else:
            # No token starts here — copy one character and move on.
            pieces.append(text[pos])
            pos += 1
    return "".join(pieces)
|
||||
|
||||
|
||||
# Public API of this module; keep in sync with the definitions above.
__all__ = [
    "ResolvedMention",
    "ResolvedMentionSet",
    "resolve_mentions",
    "substitute_in_text",
]
|
||||
|
|
@ -54,6 +54,7 @@ from app.db import (
|
|||
NATIVE_TO_LEGACY_DOCTYPE,
|
||||
Chunk,
|
||||
Document,
|
||||
Folder,
|
||||
shielded_async_session,
|
||||
)
|
||||
from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever
|
||||
|
|
@ -832,6 +833,22 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
|
|||
mention_ids = list(self.mentioned_document_ids)
|
||||
self.mentioned_document_ids = []
|
||||
|
||||
# Folder mentions live alongside doc mentions on the runtime
|
||||
# context. They never feed hybrid search (folders aren't
|
||||
# embedded) — they're surfaced purely as ``[USER-MENTIONED]``
|
||||
# priority entries so the agent walks the folder with ``ls`` /
|
||||
# ``find_documents`` instead of ignoring it. Cloud filesystem
|
||||
# mode only.
|
||||
folder_mention_ids: list[int] = []
|
||||
if (
|
||||
ctx is not None
|
||||
and getattr(self, "filesystem_mode", FilesystemMode.CLOUD)
|
||||
== FilesystemMode.CLOUD
|
||||
):
|
||||
ctx_folders = getattr(ctx, "mentioned_folder_ids", None)
|
||||
if ctx_folders:
|
||||
folder_mention_ids = list(ctx_folders)
|
||||
|
||||
mentioned_results: list[dict[str, Any]] = []
|
||||
if mention_ids:
|
||||
mentioned_results = await fetch_mentioned_documents(
|
||||
|
|
@ -876,16 +893,21 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
|
|||
|
||||
priority, matched_chunk_ids = await self._materialize_priority(merged)
|
||||
|
||||
if folder_mention_ids:
|
||||
folder_entries = await self._materialize_folder_priority(folder_mention_ids)
|
||||
priority = folder_entries + priority
|
||||
|
||||
new_messages = list(messages)
|
||||
insert_at = max(len(new_messages) - 1, 0)
|
||||
new_messages.insert(insert_at, _render_priority_message(priority))
|
||||
|
||||
_perf_log.info(
|
||||
"[kb_priority] completed in %.3fs query=%r priority=%d mentioned=%d",
|
||||
"[kb_priority] completed in %.3fs query=%r priority=%d mentioned=%d folders=%d",
|
||||
asyncio.get_event_loop().time() - t0,
|
||||
user_text[:80],
|
||||
len(priority),
|
||||
len(mentioned_results),
|
||||
len(folder_mention_ids),
|
||||
)
|
||||
|
||||
return {
|
||||
|
|
@ -894,6 +916,58 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
|
|||
"messages": new_messages,
|
||||
}
|
||||
|
||||
async def _materialize_folder_priority(
|
||||
self, folder_ids: list[int]
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Resolve user-mentioned folder ids to ``<priority_documents>`` entries.
|
||||
|
||||
Each entry uses the canonical ``/documents/Folder/Sub/`` virtual
|
||||
path (matching ``KnowledgeTreeMiddleware`` and the agent's
|
||||
``ls`` adapter) and is flagged ``mentioned=True`` so the
|
||||
rendered line carries ``[USER-MENTIONED]``. ``score`` is left
|
||||
``None`` so the renderer prints ``n/a`` — folders aren't
|
||||
ranked, the agent decides which children to read.
|
||||
"""
|
||||
if not folder_ids:
|
||||
return []
|
||||
async with shielded_async_session() as session:
|
||||
index: PathIndex = await build_path_index(session, self.search_space_id)
|
||||
folder_rows = await session.execute(
|
||||
select(Folder.id, Folder.name).where(
|
||||
Folder.search_space_id == self.search_space_id,
|
||||
Folder.id.in_(folder_ids),
|
||||
)
|
||||
)
|
||||
folder_titles: dict[int, str] = {
|
||||
row.id: row.name for row in folder_rows.all()
|
||||
}
|
||||
|
||||
entries: list[dict[str, Any]] = []
|
||||
seen: set[int] = set()
|
||||
for folder_id in folder_ids:
|
||||
if folder_id in seen:
|
||||
continue
|
||||
seen.add(folder_id)
|
||||
base = index.folder_paths.get(folder_id)
|
||||
if base is None:
|
||||
logger.debug(
|
||||
"kb_priority: dropping folder id=%s (missing from path index)",
|
||||
folder_id,
|
||||
)
|
||||
continue
|
||||
path = base if base.endswith("/") else f"{base}/"
|
||||
entries.append(
|
||||
{
|
||||
"path": path,
|
||||
"score": None,
|
||||
"document_id": None,
|
||||
"folder_id": folder_id,
|
||||
"title": folder_titles.get(folder_id, ""),
|
||||
"mentioned": True,
|
||||
}
|
||||
)
|
||||
return entries
|
||||
|
||||
async def _materialize_priority(
|
||||
self, merged: list[dict[str, Any]]
|
||||
) -> tuple[list[dict[str, Any]], dict[int, list[int]]]:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue