# SurfSense/surfsense_backend/app/agents/new_chat/mention_resolver.py

"""Resolve @-mention chips to canonical virtual paths and substitute the
user-visible ``@title`` tokens with backtick-wrapped paths in the prompt
the agent sees.

The frontend's mention seam is a single discriminated-union list of
``{kind: "doc" | "folder", id, title, document_type?}`` chips (see
``surfsense_web/atoms/chat/mentioned-documents.atom.ts``). When a turn
reaches the backend stream task we have three needs that this module
centralises:

1. Map each chip to its canonical virtual path
   (``/documents/.../file.xml`` for docs, ``/documents/MyFolder/`` for
   folders) so the agent sees concrete filesystem locations instead of
   ambiguous ``@``-titles.
2. Substitute ``@title`` tokens in the user-typed text with backtick-
   wrapped paths so the path becomes part of the ``HumanMessage`` body
   the LLM consumes — without rewriting the persisted user message
   text (which keeps ``@title`` so chip rendering on reload is
   unchanged).
3. Surface the resolved id sets (docs + folders) to the priority
   middleware so it can render ``[USER-MENTIONED]`` priority entries
   without re-doing path resolution.

This is intentionally one module — see the architectural note in
``mention-paths-and-folders`` plan: previously the doc-resolution lived
inline in ``stream_new_chat`` and the folder mention had no resolution
at all. Centralising both behind a single ``resolve_mentions`` call
turns a leaky multi-field seam into a single deeper interface.
"""

from __future__ import annotations

import logging
from dataclasses import dataclass, field

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from app.agents.new_chat.path_resolver import (
    DOCUMENTS_ROOT,
    build_path_index,
    doc_to_virtual_path,
)
from app.db import Document, Folder
from app.schemas.new_chat import MentionedDocumentInfo

# Module-level logger; used below for debug traces when a mentioned id is
# dropped because it was not found in the search space.
logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class ResolvedMention:
    """One @-mention chip paired with the virtual path it resolved to.

    Instances are immutable. By this module's convention a document's
    ``virtual_path`` carries no trailing slash, whereas a folder's ends
    in ``/`` (the directory shape ``KnowledgeTreeMiddleware`` uses).
    """

    # Chip discriminator: "doc" or "folder".
    kind: str
    # Id of the document or folder row the chip resolved to.
    id: int
    # Title used to build the ``@title`` substitution token.
    title: str
    # Path the agent will see in place of the ``@title`` token.
    virtual_path: str


@dataclass
class ResolvedMentionSet:
    """Everything ``resolve_mentions`` produces for a single turn.

    Every field defaults to a fresh empty list, so a bare
    ``ResolvedMentionSet()`` represents "nothing was mentioned".

    ``token_to_path`` pairs each literal ``@title`` token with the
    canonical virtual path of its chip. ``resolve_mentions`` emits it
    ordered longest token first, so that substitution (like
    ``parseMentionSegments`` on the frontend) never lets a short prefix
    such as ``@Project`` shadow a longer title such as
    ``@Project Roadmap``.

    ``mentioned_document_ids`` merges doc and surfsense_doc ids into one
    ordered, de-duplicated list; see
    ``KnowledgePriorityMiddleware._compute_priority_paths`` for the
    downstream consumer.
    """

    # Resolved chips: documents first, then folders.
    mentions: list[ResolvedMention] = field(default_factory=list)
    # (``@title`` token, virtual path) pairs, longest token first.
    token_to_path: list[tuple[str, str]] = field(default_factory=list)
    # Ids of the documents that were found in the search space.
    mentioned_document_ids: list[int] = field(default_factory=list)
    # Ids of the folders that were found in the search space.
    mentioned_folder_ids: list[int] = field(default_factory=list)


def _folder_virtual_path(folder_id: int, folder_paths: dict[int, str]) -> str:
    """Build the directory-style virtual path for ``folder_id``.

    The path is looked up in ``folder_paths``; an id that is absent from
    that mapping resolves to ``DOCUMENTS_ROOT`` instead. The result
    always ends in exactly the trailing ``/`` already present or one
    appended here, matching the ``/documents/MyFolder/`` convention of
    ``KnowledgeTreeMiddleware``.
    """
    resolved = folder_paths.get(folder_id, DOCUMENTS_ROOT)
    if resolved.endswith("/"):
        return resolved
    return resolved + "/"


async def resolve_mentions(
    session: AsyncSession,
    *,
    search_space_id: int,
    mentioned_documents: list[MentionedDocumentInfo] | None,
    mentioned_document_ids: list[int] | None = None,
    mentioned_surfsense_doc_ids: list[int] | None = None,
    mentioned_folder_ids: list[int] | None = None,
) -> ResolvedMentionSet:
    """Resolve every @-mention chip on a turn into virtual paths.

    The function takes both the ``mentioned_documents`` discriminated
    list (chip metadata used for substitution + persistence) and the
    parallel id arrays (``mentioned_document_ids``,
    ``mentioned_surfsense_doc_ids``, ``mentioned_folder_ids``) for two
    reasons:

    * Legacy clients that haven't migrated to the unified chip list
      still send the id arrays — we treat the union as authoritative.
    * The id arrays are the canonical input to
      ``KnowledgePriorityMiddleware`` (via ``SurfSenseContextSchema``);
      returning the deduped, validated lists lets the route forward
      them unchanged.

    Resolution is best-effort: a chip whose id no longer exists (e.g.
    document was deleted between mention and submit) is silently
    dropped. The agent still sees the user's original text, just
    without a backtick-path substitution for that chip.

    Args:
        session: Async database session used for the lookups.
        search_space_id: Only documents/folders in this search space are
            accepted; ids from other spaces are dropped.
        mentioned_documents: Unified chip list, or ``None``.
        mentioned_document_ids: Legacy document id array.
        mentioned_surfsense_doc_ids: Legacy surfsense-doc id array; merged
            into the same document pool.
        mentioned_folder_ids: Legacy folder id array.

    Returns:
        A ``ResolvedMentionSet``. It is empty when no ids were supplied at
        all; otherwise it lists documents first, then folders, each in
        first-seen order, with ``token_to_path`` sorted longest token first.
    """
    chip_doc_ids: list[int] = []
    chip_folder_ids: list[int] = []
    chip_titles_by_id: dict[tuple[str, int], str] = {}
    for chip in mentioned_documents or []:
        # Every chip that is not a folder is pooled as a document, so its
        # title must be stored under the same normalised "doc" key that the
        # lookup below uses; keying on the raw ``chip.kind`` would lose the
        # user-typed title for any other non-folder kind.
        if chip.kind == "folder":
            chip_folder_ids.append(chip.id)
            chip_titles_by_id[("folder", chip.id)] = chip.title
        else:
            chip_doc_ids.append(chip.id)
            chip_titles_by_id[("doc", chip.id)] = chip.title

    # ``dict.fromkeys`` dedupes while preserving first-seen order.
    doc_id_pool: list[int] = list(
        dict.fromkeys(
            [
                *(mentioned_document_ids or []),
                *(mentioned_surfsense_doc_ids or []),
                *chip_doc_ids,
            ]
        )
    )
    folder_id_pool: list[int] = list(
        dict.fromkeys([*(mentioned_folder_ids or []), *chip_folder_ids])
    )

    # Nothing mentioned: skip the index build and the queries entirely.
    if not doc_id_pool and not folder_id_pool:
        return ResolvedMentionSet()

    index = await build_path_index(session, search_space_id)

    doc_rows: dict[int, Document] = {}
    if doc_id_pool:
        result = await session.execute(
            select(Document).where(
                Document.search_space_id == search_space_id,
                Document.id.in_(doc_id_pool),
            )
        )
        doc_rows = {row.id: row for row in result.scalars().all()}

    folder_rows: dict[int, Folder] = {}
    if folder_id_pool:
        result = await session.execute(
            select(Folder).where(
                Folder.search_space_id == search_space_id,
                Folder.id.in_(folder_id_pool),
            )
        )
        folder_rows = {row.id: row for row in result.scalars().all()}

    resolved: list[ResolvedMention] = []
    accepted_doc_ids: list[int] = []
    accepted_folder_ids: list[int] = []

    # Iterate the pools (not the row dicts) so output keeps request order.
    for doc_id in doc_id_pool:
        row = doc_rows.get(doc_id)
        if row is None:
            logger.debug(
                "mention_resolver: dropping doc id=%s (not found in space=%s)",
                doc_id,
                search_space_id,
            )
            continue
        # Prefer the chip's title (what the user typed) over the stored one.
        title = chip_titles_by_id.get(("doc", doc_id), str(row.title or ""))
        path = doc_to_virtual_path(
            doc_id=row.id,
            title=str(row.title or "untitled"),
            folder_id=row.folder_id,
            index=index,
        )
        resolved.append(
            ResolvedMention(kind="doc", id=row.id, title=title, virtual_path=path)
        )
        accepted_doc_ids.append(row.id)

    for folder_id in folder_id_pool:
        row = folder_rows.get(folder_id)
        if row is None:
            logger.debug(
                "mention_resolver: dropping folder id=%s (not found in space=%s)",
                folder_id,
                search_space_id,
            )
            continue
        title = chip_titles_by_id.get(("folder", folder_id), str(row.name or ""))
        path = _folder_virtual_path(row.id, index.folder_paths)
        resolved.append(
            ResolvedMention(kind="folder", id=row.id, title=title, virtual_path=path)
        )
        accepted_folder_ids.append(row.id)

    token_to_path: list[tuple[str, str]] = []
    seen_tokens: set[str] = set()
    for mention in resolved:
        # A blank title would yield a bare "@" token; skip it.
        if not mention.title:
            continue
        token = f"@{mention.title}"
        # First mention wins when two chips share a title.
        if token in seen_tokens:
            continue
        seen_tokens.add(token)
        token_to_path.append((token, mention.virtual_path))
    # Longest first so a short title never shadows a longer one it prefixes.
    token_to_path.sort(key=lambda pair: len(pair[0]), reverse=True)

    return ResolvedMentionSet(
        mentions=resolved,
        token_to_path=token_to_path,
        mentioned_document_ids=accepted_doc_ids,
        mentioned_folder_ids=accepted_folder_ids,
    )


def substitute_in_text(text: str, token_to_path: list[tuple[str, str]]) -> str:
    """Replace each ``@title`` token with a backtick-wrapped virtual path.

    Mirrors ``parseMentionSegments`` on the frontend: a single forward
    pass with plain string matching and no regex (titles can contain
    regex metacharacters). At each position the first entry of
    ``token_to_path`` that matches wins, so callers should pass the
    pairs longest token first. Substituted output is never rescanned.

    Args:
        text: The user-typed message text.
        token_to_path: ``(token, virtual_path)`` pairs in priority order.
            Pairs with an empty token are ignored.

    Returns:
        ``text`` with every matched token replaced by `` `path` ``. The
        input is returned unchanged when ``text`` is empty or there are
        no usable tokens.
    """
    if not text or not token_to_path:
        return text

    # An empty token matches at every offset without consuming any input,
    # which would keep the cursor in place forever; drop such pairs.
    candidates = [(token, path) for token, path in token_to_path if token]
    if not candidates:
        return text

    out: list[str] = []
    i = 0
    n = len(text)
    while i < n:
        for token, path in candidates:
            if text.startswith(token, i):
                out.append(f"`{path}`")
                i += len(token)
                break
        else:
            # No token starts here: copy the character through verbatim.
            out.append(text[i])
            i += 1
    return "".join(out)


# Public API of this module; ``_folder_virtual_path`` is intentionally
# left out as a private helper.
__all__ = [
    "ResolvedMention",
    "ResolvedMentionSet",
    "resolve_mentions",
    "substitute_in_text",
]