SurfSense/surfsense_backend/app/agents/new_chat/mention_resolver.py
DESKTOP-RTLN3BA\$punk c8374e6c5b
Some checks are pending
Build and Push Docker Images / tag_release (push) Waiting to run
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (backend, surfsense-backend) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (web, surfsense-web) (push) Blocked by required conditions
feat: improved document, folder mentions rendering
2026-05-09 22:15:51 -07:00

281 lines
9.7 KiB
Python

"""Resolve @-mention chips to canonical virtual paths and substitute the
user-visible ``@title`` tokens with backtick-wrapped paths in the prompt
the agent sees.
The frontend's mention seam is a single discriminated-union list of
``{kind: "doc" | "folder", id, title, document_type?}`` chips (see
``surfsense_web/atoms/chat/mentioned-documents.atom.ts``). When a turn
reaches the backend stream task we have three needs that this module
centralises:
1. Map each chip to its canonical virtual path
(``/documents/.../file.xml`` for docs, ``/documents/MyFolder/`` for
folders) so the agent sees concrete filesystem locations instead of
ambiguous ``@``-titles.
2. Substitute ``@title`` tokens in the user-typed text with backtick-
wrapped paths so the path becomes part of the ``HumanMessage`` body
the LLM consumes — without rewriting the persisted user message
text (which keeps ``@title`` so chip rendering on reload is
unchanged).
3. Surface the resolved id sets (docs + folders) to the priority
middleware so it can render ``[USER-MENTIONED]`` priority entries
without re-doing path resolution.
This is intentionally one module — see the architectural note in
``mention-paths-and-folders`` plan: previously the doc-resolution lived
inline in ``stream_new_chat`` and the folder mention had no resolution
at all. Centralising both behind a single ``resolve_mentions`` call
turns a leaky multi-field seam into a single deeper interface.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.agents.new_chat.path_resolver import (
DOCUMENTS_ROOT,
build_path_index,
doc_to_virtual_path,
)
from app.db import Document, Folder
from app.schemas.new_chat import MentionedDocumentInfo
logger = logging.getLogger(__name__)
@dataclass(frozen=True)
class ResolvedMention:
"""Canonical view of a single @-mention chip.
``virtual_path`` is the path the agent will see (no trailing slash
for documents, trailing ``/`` for folders to match the convention
used by ``KnowledgeTreeMiddleware``).
"""
kind: str # "doc" | "folder"
id: int
title: str
virtual_path: str
@dataclass
class ResolvedMentionSet:
"""Aggregate result of resolving a turn's mention chips.
``token_to_path`` maps ``@title`` (the literal token the user typed
and the editor emitted) to the canonical virtual path for that
chip. It is produced longest-token-first so substitution mirrors
``parseMentionSegments`` on the frontend (a longer title like
``@Project Roadmap`` is never shadowed by a shorter prefix
``@Project``).
``mentioned_document_ids`` collapses doc + surfsense_doc chips into
a single ordered, deduped list because the priority middleware
treats them uniformly downstream — see
``KnowledgePriorityMiddleware._compute_priority_paths``.
"""
mentions: list[ResolvedMention] = field(default_factory=list)
token_to_path: list[tuple[str, str]] = field(default_factory=list)
mentioned_document_ids: list[int] = field(default_factory=list)
mentioned_folder_ids: list[int] = field(default_factory=list)
def _folder_virtual_path(folder_id: int, folder_paths: dict[int, str]) -> str:
"""Return ``/documents/Folder/Sub/`` for a folder id.
Falls back to the documents root when the folder is missing from
the index (deleted or in a different search space). Trailing slash
matches ``KnowledgeTreeMiddleware`` (``/documents/MyFolder/``) so
the agent's ``ls`` can dispatch on it as a directory.
"""
base = folder_paths.get(folder_id, DOCUMENTS_ROOT)
return f"{base}/" if not base.endswith("/") else base
async def resolve_mentions(
session: AsyncSession,
*,
search_space_id: int,
mentioned_documents: list[MentionedDocumentInfo] | None,
mentioned_document_ids: list[int] | None = None,
mentioned_surfsense_doc_ids: list[int] | None = None,
mentioned_folder_ids: list[int] | None = None,
) -> ResolvedMentionSet:
"""Resolve every @-mention chip on a turn into virtual paths.
The function takes both the ``mentioned_documents`` discriminated
list (chip metadata used for substitution + persistence) and the
parallel id arrays (``mentioned_document_ids``,
``mentioned_surfsense_doc_ids``, ``mentioned_folder_ids``) for two
reasons:
* Legacy clients that haven't migrated to the unified chip list
still send the id arrays — we treat the union as authoritative.
* The id arrays are the canonical input to
``KnowledgePriorityMiddleware`` (via ``SurfSenseContextSchema``);
returning the deduped, validated lists lets the route forward
them unchanged.
Resolution is best-effort: a chip whose id no longer exists (e.g.
document was deleted between mention and submit) is silently
dropped. The agent still sees the user's original text, just
without a backtick-path substitution for that chip.
"""
chip_doc_ids: list[int] = []
chip_folder_ids: list[int] = []
chip_titles_by_id: dict[tuple[str, int], str] = {}
if mentioned_documents:
for chip in mentioned_documents:
kind = chip.kind
if kind == "folder":
chip_folder_ids.append(chip.id)
else:
chip_doc_ids.append(chip.id)
chip_titles_by_id[(kind, chip.id)] = chip.title
doc_id_pool: list[int] = list(
dict.fromkeys(
[
*(mentioned_document_ids or []),
*(mentioned_surfsense_doc_ids or []),
*chip_doc_ids,
]
)
)
folder_id_pool: list[int] = list(
dict.fromkeys([*(mentioned_folder_ids or []), *chip_folder_ids])
)
if not doc_id_pool and not folder_id_pool:
return ResolvedMentionSet()
index = await build_path_index(session, search_space_id)
doc_rows: dict[int, Document] = {}
if doc_id_pool:
result = await session.execute(
select(Document).where(
Document.search_space_id == search_space_id,
Document.id.in_(doc_id_pool),
)
)
for row in result.scalars().all():
doc_rows[row.id] = row
folder_rows: dict[int, Folder] = {}
if folder_id_pool:
result = await session.execute(
select(Folder).where(
Folder.search_space_id == search_space_id,
Folder.id.in_(folder_id_pool),
)
)
for row in result.scalars().all():
folder_rows[row.id] = row
resolved: list[ResolvedMention] = []
accepted_doc_ids: list[int] = []
accepted_folder_ids: list[int] = []
for doc_id in doc_id_pool:
row = doc_rows.get(doc_id)
if row is None:
logger.debug(
"mention_resolver: dropping doc id=%s (not found in space=%s)",
doc_id,
search_space_id,
)
continue
title = chip_titles_by_id.get(("doc", doc_id), str(row.title or ""))
path = doc_to_virtual_path(
doc_id=row.id,
title=str(row.title or "untitled"),
folder_id=row.folder_id,
index=index,
)
resolved.append(
ResolvedMention(kind="doc", id=row.id, title=title, virtual_path=path)
)
accepted_doc_ids.append(row.id)
for folder_id in folder_id_pool:
row = folder_rows.get(folder_id)
if row is None:
logger.debug(
"mention_resolver: dropping folder id=%s (not found in space=%s)",
folder_id,
search_space_id,
)
continue
title = chip_titles_by_id.get(("folder", folder_id), str(row.name or ""))
path = _folder_virtual_path(row.id, index.folder_paths)
resolved.append(
ResolvedMention(kind="folder", id=row.id, title=title, virtual_path=path)
)
accepted_folder_ids.append(row.id)
token_to_path: list[tuple[str, str]] = []
seen_tokens: set[str] = set()
for mention in resolved:
if not mention.title:
continue
token = f"@{mention.title}"
if token in seen_tokens:
continue
seen_tokens.add(token)
token_to_path.append((token, mention.virtual_path))
token_to_path.sort(key=lambda pair: len(pair[0]), reverse=True)
return ResolvedMentionSet(
mentions=resolved,
token_to_path=token_to_path,
mentioned_document_ids=accepted_doc_ids,
mentioned_folder_ids=accepted_folder_ids,
)
def substitute_in_text(text: str, token_to_path: list[tuple[str, str]]) -> str:
"""Replace each ``@title`` token with a backtick-wrapped virtual path.
Mirrors ``parseMentionSegments`` on the frontend: longest token
first, single forward pass, no regex (titles can contain regex
metacharacters). The substitution is idempotent for already-
substituted text because the backtick-wrapped path no longer
starts with ``@``.
Empty / no-op cases short-circuit so callers can pass this through
unconditionally without paying for a scan.
"""
if not text or not token_to_path:
return text
out: list[str] = []
i = 0
n = len(text)
while i < n:
matched: tuple[str, str] | None = None
for token, path in token_to_path:
if text.startswith(token, i):
matched = (token, path)
break
if matched is None:
out.append(text[i])
i += 1
continue
token, path = matched
out.append(f"`{path}`")
i += len(token)
return "".join(out)
__all__ = [
"ResolvedMention",
"ResolvedMentionSet",
"resolve_mentions",
"substitute_in_text",
]