diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py index 9236e9121..9c667c9fe 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py @@ -1,12 +1,11 @@ -"""On-demand ``search_knowledge_base`` main-agent tool (OpenCode-style lazy RAG). +"""On-demand ``search_knowledge_base`` main-agent tool (citation-spine RAG). -The main agent no longer receives eagerly pre-injected KB context on every -turn (see :class:`KnowledgePriorityMiddleware`, now gated off by default). -Instead it calls this tool only when it decides it needs knowledge-base -content. The tool runs a single hybrid search (embed + DB search, ~0.5s), -formats the top matches for the model, and writes ``kb_matched_chunk_ids`` -into graph state so matched-section highlighting is preserved when the agent -later reads a document via ``task(knowledge_base)``. +The main agent calls this when it decides it needs knowledge-base content. The +tool runs one hybrid search, renders the matched passages as a +```` block whose passages carry server-assigned ``[n]`` +labels, and persists the conversation's ``CitationRegistry`` onto graph state so +the ``[n]`` -> ``[citation:]`` normalizer can resolve them after the +turn. 
""" from __future__ import annotations @@ -18,153 +17,70 @@ from langchain.tools import ToolRuntime from langchain_core.messages import ToolMessage from langchain_core.tools import BaseTool, StructuredTool from langgraph.types import Command -from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession -from app.agents.chat.multi_agent_chat.shared.middleware.knowledge_search import ( - search_knowledge_base as _hybrid_search_kb, +from app.agents.chat.multi_agent_chat.shared.citations import load_registry +from app.agents.chat.multi_agent_chat.shared.retrieval import SearchScope, build_context +from app.agents.chat.multi_agent_chat.shared.retrieval.hybrid_search import ( + search_chunks, ) from app.agents.chat.multi_agent_chat.shared.state.filesystem_state import ( SurfSenseFilesystemState, ) -from app.agents.chat.runtime.path_resolver import ( - PathIndex, - build_path_index, - doc_to_virtual_path, -) -from app.db import Document, shielded_async_session +from app.agents.chat.runtime.references import referenced_document_ids +from app.db import shielded_async_session from app.utils.perf import get_perf_logger _perf_log = get_perf_logger() _DEFAULT_TOP_K = 5 _MAX_TOP_K = 20 -_PER_DOC_SNIPPET_CHARS = 1200 -_MAX_TOTAL_CHARS = 16_000 _TOOL_DESCRIPTION = ( "Search the user's knowledge base (their indexed documents, files, and " "connector content) for passages relevant to a query, using hybrid " "semantic + keyword retrieval.\n\n" "Use this FIRST to ground any factual or informational answer about the " - "user's own documents, notes, or connected sources. The workspace tree " - "shows which files exist; this tool pulls the actual relevant content. " - "Each hit returns the document's virtual path, a relevance score, and the " - "matched snippets. If you need a document's full text, delegate a read to " - "the knowledge_base specialist via `task` using the returned path.\n\n" + "user's own documents, notes, or connected sources. 
It returns a " + " block: each matched passage is labelled [n]. Cite a " + "passage by writing that [n] after the statement it supports.\n\n" "Write a focused, specific query containing the concrete entities, " "acronyms, people, projects, or terms you are looking for." ) -async def _resolve_virtual_paths( - results: list[dict[str, Any]], +def _search_types( + available_connectors: list[str] | None, + available_document_types: list[str] | None, +) -> tuple[str, ...] | None: + """Merge connector + document-type filters into a scope; ``None`` if unrestricted.""" + types: set[str] = set() + if available_document_types: + types.update(available_document_types) + if available_connectors: + types.update(available_connectors) + return tuple(sorted(types)) or None + + +async def _build_search_scope( + session: AsyncSession, *, search_space_id: int, -) -> dict[int, str]: - """Resolve ``Document.id`` -> canonical virtual path for the search hits.""" - doc_ids = [ - doc_id - for doc_id in ( - (doc.get("document") or {}).get("id") - for doc in results - if isinstance(doc, dict) - ) - if isinstance(doc_id, int) - ] - if not doc_ids: - return {} - - async with shielded_async_session() as session: - index: PathIndex = await build_path_index(session, search_space_id) - folder_rows = await session.execute( - select(Document.id, Document.folder_id).where( - Document.search_space_id == search_space_id, - Document.id.in_(doc_ids), - ) - ) - folder_by_doc_id = {row.id: row.folder_id for row in folder_rows.all()} - - paths: dict[int, str] = {} - for doc in results: - doc_meta = doc.get("document") or {} - doc_id = doc_meta.get("id") - if not isinstance(doc_id, int): - continue - folder_id = folder_by_doc_id.get(doc_id, doc_meta.get("folder_id")) - paths[doc_id] = doc_to_virtual_path( - doc_id=doc_id, - title=str(doc_meta.get("title") or "untitled"), - folder_id=folder_id if isinstance(folder_id, int) else None, - index=index, - ) - return paths - - -def _format_hits( - results: 
list[dict[str, Any]], - *, - paths: dict[int, str], - query: str, -) -> str: - """Render search hits as a compact, model-readable block.""" - if not results: - return ( - f"No knowledge-base matches found for query: {query!r}.\n" - "Tell the user nothing relevant was found in their workspace, or " - "try a different query." - ) - - lines: list[str] = [f""] - total = len(lines[0]) - for rank, doc in enumerate(results, start=1): - doc_meta = doc.get("document") or {} - doc_id = doc_meta.get("id") - title = str(doc_meta.get("title") or "untitled") - doc_type = doc_meta.get("document_type") or doc.get("source") or "document" - score = doc.get("score") - score_str = f"{score:.3f}" if isinstance(score, int | float) else "n/a" - path = paths.get(doc_id) if isinstance(doc_id, int) else None - - header = f"\n{rank}. {title} (type={doc_type}, score={score_str})" + ( - f"\n path: {path}" if path else "" - ) - - content = (doc.get("content") or "").strip() - if content: - snippet = content[:_PER_DOC_SNIPPET_CHARS].strip() - if len(content) > _PER_DOC_SNIPPET_CHARS: - snippet += " ..." - body = "\n " + snippet.replace("\n", "\n ") - else: - body = "\n (no preview available; read the document for details)" - - entry = header + body - if total + len(entry) > _MAX_TOTAL_CHARS: - lines.append("\n") - break - lines.append(entry) - total += len(entry) - - lines.append( - "\n\nTo read a full document, delegate to the knowledge_base specialist " - "with `task`, referencing the path above." + document_types: tuple[str, ...] 
| None, + runtime: ToolRuntime[None, SurfSenseFilesystemState], +) -> SearchScope: + """Assemble the retrieval scope: workspace document-type filter + @-mention pins.""" + ctx = getattr(runtime, "context", None) + document_ids = await referenced_document_ids( + session, + search_space_id=search_space_id, + document_ids=getattr(ctx, "mentioned_document_ids", None), + folder_ids=getattr(ctx, "mentioned_folder_ids", None), + ) + return SearchScope( + document_types=document_types, + document_ids=document_ids or None, ) - lines.append("\n") - return "".join(lines) - - -def _matched_chunk_ids(results: list[dict[str, Any]]) -> dict[int, list[int]]: - """Extract ``Document.id`` -> matched chunk ids for state hand-off.""" - matched: dict[int, list[int]] = {} - for doc in results: - doc_id = (doc.get("document") or {}).get("id") - if not isinstance(doc_id, int): - continue - chunk_ids = doc.get("matched_chunk_ids") or [] - normalized = [int(cid) for cid in chunk_ids if isinstance(cid, int | str)] - if normalized: - matched[doc_id] = normalized - return matched def create_search_knowledge_base_tool( @@ -176,8 +92,7 @@ def create_search_knowledge_base_tool( """Factory for the on-demand ``search_knowledge_base`` tool.""" _space_id = search_space_id - _connectors = available_connectors - _doc_types = available_document_types + _document_types = _search_types(available_connectors, available_document_types) async def _impl( query: Annotated[ @@ -195,34 +110,45 @@ def create_search_knowledge_base_tool( return "Error: provide a non-empty search query." 
clamped_top_k = min(max(1, top_k), _MAX_TOP_K) - t0 = time.perf_counter() - results = await _hybrid_search_kb( - query=cleaned_query, - search_space_id=_space_id, - available_connectors=_connectors, - available_document_types=_doc_types, - top_k=clamped_top_k, - ) + registry = load_registry(getattr(runtime, "state", None)) - paths = await _resolve_virtual_paths(results, search_space_id=_space_id) - rendered = _format_hits(results, paths=paths, query=cleaned_query) - matched = _matched_chunk_ids(results) + t0 = time.perf_counter() + async with shielded_async_session() as session: + scope = await _build_search_scope( + session, + search_space_id=_space_id, + document_types=_document_types, + runtime=runtime, + ) + hits = await search_chunks( + session, + search_space_id=_space_id, + query=cleaned_query, + scope=scope, + top_k=clamped_top_k, + ) + rendered = build_context(cleaned_query, hits, registry) _perf_log.info( - "[search_knowledge_base] tool query=%r results=%d chars=%d in %.3fs", + "[search_knowledge_base] tool query=%r docs=%d in %.3fs", cleaned_query[:60], - len(results), - len(rendered), + len(hits), time.perf_counter() - t0, ) + if rendered is None: + return ( + f"No knowledge-base matches found for query: {cleaned_query!r}.\n" + "Tell the user nothing relevant was found in their workspace, or " + "try a different query." 
+ ) + update: dict[str, Any] = { "messages": [ ToolMessage(content=rendered, tool_call_id=runtime.tool_call_id) ], + "citation_registry": registry, } - if matched: - update["kb_matched_chunk_ids"] = matched return Command(update=update) return StructuredTool.from_function( diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/knowledge_search.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/knowledge_search.py index 9ef601791..efb85a785 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/knowledge_search.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/knowledge_search.py @@ -5,11 +5,6 @@ This middleware runs ``before_agent`` on every turn and writes: * ``state["kb_priority"]`` — the top-K most relevant documents for the current user message, used to render a ```` system message immediately before the user turn. -* ``state["kb_matched_chunk_ids"]`` — internal hand-off mapping - (``Document.id`` → matched chunk IDs) consumed by - :class:`KBPostgresBackend._load_file_data` when the agent first reads each - document, so the XML wrapper can flag matched sections in - ````. 
The previous "scoped filesystem" behaviour (synthetic ``ls`` + state ``files`` seeding) is intentionally removed: documents are now lazy-loaded @@ -816,7 +811,6 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg] ] update: dict[str, Any] = { "kb_priority": priority, - "kb_matched_chunk_ids": {}, } if self.inject_system_message: new_messages = list(state.get("messages") or []) @@ -930,7 +924,7 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg] merged.append(doc) _t_materialize = time.perf_counter() - priority, matched_chunk_ids = await self._materialize_priority(merged) + priority = await self._materialize_priority(merged) if folder_mention_ids: folder_entries = await self._materialize_folder_priority(folder_mention_ids) @@ -957,7 +951,6 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg] update: dict[str, Any] = { "kb_priority": priority, - "kb_matched_chunk_ids": matched_chunk_ids, } if self.inject_system_message: new_messages = list(messages) @@ -1016,13 +1009,12 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg] async def _materialize_priority( self, merged: list[dict[str, Any]] - ) -> tuple[list[dict[str, Any]], dict[int, list[int]]]: - """Resolve canonical paths and matched chunk ids for the priority list.""" + ) -> list[dict[str, Any]]: + """Resolve canonical paths for the priority list.""" priority: list[dict[str, Any]] = [] - matched_chunk_ids: dict[int, list[int]] = {} if not merged: - return priority, matched_chunk_ids + return priority _t0 = time.perf_counter() async with shielded_async_session() as session: @@ -1067,18 +1059,12 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg] "mentioned": bool(doc.get("_user_mentioned")), } ) - if isinstance(doc_id, int): - chunk_ids = doc.get("matched_chunk_ids") or [] - if chunk_ids: - matched_chunk_ids[doc_id] = [ - int(cid) for cid in chunk_ids if isinstance(cid, int | str) 
- ] _perf_log.info( "[kb_priority.materialize] db=%.3fs docs=%d", time.perf_counter() - _t0, len(merged), ) - return priority, matched_chunk_ids + return priority __all__ = [ diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/state/filesystem_state.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/state/filesystem_state.py index 41bed9d62..f0708ccaf 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/state/filesystem_state.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/state/filesystem_state.py @@ -14,8 +14,8 @@ extra fields needed to implement Postgres-backed virtual filesystem semantics: * ``dirty_path_tool_calls`` — sidecar map ``path -> latest tool_call_id`` for dirty paths; used to bind the per-path snapshot to an action_id. * ``kb_priority`` — top-K priority hints rendered into a system message. -* ``kb_matched_chunk_ids`` — internal hand-off for matched-chunk highlighting. * ``kb_anon_doc`` — Redis-loaded anonymous document (if any). +* ``citation_registry`` — per-conversation ``[n]`` -> source map for citations. * ``tree_version`` — bumped by persistence; invalidates the tree render cache. * ``workspace_tree_text`` — pre-rendered ```` body for the turn. 
@@ -30,9 +30,11 @@ from typing import Annotated, Any, NotRequired from deepagents.middleware.filesystem import FilesystemState from typing_extensions import TypedDict +from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry from app.agents.chat.multi_agent_chat.shared.receipts.receipt import Receipt from app.agents.chat.multi_agent_chat.shared.state.reducers import ( _add_unique_reducer, + _citation_registry_merge_reducer, _dict_merge_with_tombstones_reducer, _int_counter_merge_reducer, _list_append_reducer, @@ -162,12 +164,16 @@ class SurfSenseFilesystemState(FilesystemState): kb_priority: NotRequired[Annotated[list[KbPriorityEntry], _replace_reducer]] """Top-K priority hints rendered as a system message before the user turn.""" - kb_matched_chunk_ids: NotRequired[Annotated[dict[int, list[int]], _replace_reducer]] - """Internal: ``Document.id`` -> list of matched chunk IDs from hybrid search.""" - kb_anon_doc: NotRequired[Annotated[KbAnonDoc | None, _replace_reducer]] """Anonymous-session document loaded from Redis (read-only, no DB row).""" + citation_registry: NotRequired[ + Annotated[CitationRegistry, _citation_registry_merge_reducer] + ] + """Per-conversation ``[n]`` -> source map; written by retrieval, read by the + normalizer. 
Merges (union, find-or-create) so parallel/subagent registrations + stay globally consistent instead of clobbering each other.""" + tree_version: NotRequired[Annotated[int, _replace_reducer]] """Monotonically increasing counter; bumped when commits change the KB tree.""" diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/state/reducers.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/state/reducers.py index c7b7685f0..8a9590723 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/state/reducers.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/state/reducers.py @@ -2,7 +2,7 @@ These reducers back the extra state fields used by the cloud-mode filesystem agent (`cwd`, `staged_dirs`, `pending_moves`, `dirty_paths`, `doc_id_by_path`, -`kb_priority`, `kb_matched_chunk_ids`, `kb_anon_doc`, `tree_version`). +`kb_priority`, `kb_anon_doc`, `tree_version`). Tools mutate these fields ONLY via `Command(update={...})` returns; the reducers are responsible for merging successive updates atomically and for @@ -20,6 +20,8 @@ from __future__ import annotations from typing import Any, Final, TypeVar +from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry + _CLEAR: Final[str] = "\x00__SURFSENSE_FILESYSTEM_CLEAR__\x00" """Reset sentinel; pass it inside a list/dict update to request a reset. @@ -204,6 +206,41 @@ def _int_counter_merge_reducer( return base +def _as_registry(value: Any) -> CitationRegistry | None: + """Coerce a state value into a ``CitationRegistry``. + + The checkpointer serializes ``Command.update`` via ``ormsgpack`` *before* + reducers run, so an update can arrive as a plain ``dict`` rather than a model. 
+ """ + if value is None: + return None + if isinstance(value, CitationRegistry): + return value + if isinstance(value, dict): + return CitationRegistry.model_validate(value) + return None + + +def _citation_registry_merge_reducer( + left: Any, + right: Any, +) -> CitationRegistry | None: + """Union two citation registries instead of replacing. + + Find-or-create across both sides so ``[n]`` stays globally consistent when + branches (parent + subagents, parallel tool calls) each register into a + registry forked from the same base. Collisions re-mint rather than drop. See + :meth:`CitationRegistry.merge`. + """ + right_reg = _as_registry(right) + left_reg = _as_registry(left) + if right_reg is None: + return left_reg + if left_reg is None: + return right_reg + return left_reg.merge(right_reg) + + def _initial_filesystem_state() -> dict[str, Any]: """Default empty values for SurfSense filesystem state fields. @@ -222,7 +259,6 @@ def _initial_filesystem_state() -> dict[str, Any]: "dirty_paths": [], "dirty_path_tool_calls": {}, "kb_priority": [], - "kb_matched_chunk_ids": {}, "kb_anon_doc": None, "tree_version": 0, } @@ -231,6 +267,7 @@ def _initial_filesystem_state() -> dict[str, Any]: __all__ = [ "_CLEAR", "_add_unique_reducer", + "_citation_registry_merge_reducer", "_dict_merge_with_tombstones_reducer", "_initial_filesystem_state", "_int_counter_merge_reducer", diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/description_readonly.md b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/description_readonly.md index e989e3ee6..11dcc5d11 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/description_readonly.md +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/description_readonly.md @@ -2,4 +2,4 @@ Read-only specialist for the user's workspace (documents and folders). 
Use to fi Pass your full question as one string. The specialist runs in isolation: it cannot see this thread, so include any path hints, filters, or constraints it needs. -The specialist returns plain prose with absolute paths and `[citation:]` markers when claims came from KB-indexed chunks. Preserve those markers verbatim if you forward the answer. +The specialist returns plain prose with absolute paths and `[n]` citation labels when claims came from KB-indexed documents. Preserve those labels verbatim if you forward the answer. diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md index c4e36fc73..c77bd5bb4 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md @@ -35,42 +35,31 @@ Map outcomes to your `status`: You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. -## Chunk citations in your prose +## Citations in your prose -When `read_file` returns a KB-indexed document under `/documents/`, the response includes `` blocks. Whenever a fact in your `action_summary` or `evidence.content_excerpt` came from a specific chunk, append `[citation:]` to the sentence stating that fact, using the **exact** id from the `` tag. The caller relays these markers to the end user verbatim, and the UI resolves each id by exact match against the database, so a wrong id silently breaks the citation. +When `read_file` returns a KB-indexed document under `/documents/`, it comes back as a `` block whose passages are each prefixed with a bracketed label — `[1]`, `[2]`, `[3]`. 
That `[n]` is the citation label. Whenever a fact in your `action_summary` or `evidence.content_excerpt` came from a specific passage, append its `[n]` to the sentence stating that fact, copying the label **exactly** as shown. The caller relays these labels verbatim and the server resolves each one, so a wrong number silently breaks the citation. -### Where chunk ids live in `read_file` output +### Where the labels live in `read_file` output -A KB document's XML has three numeric attributes — only **one** is a citation source: +A KB document reads back like this — only the bracketed `[n]` is a citation label: ``` - - - 42 ← NOT a citation. Parent doc id; ignore for citations. - ... - - - ← Index hint; the same id also appears below. - - - - ← This is the citation source. - - + + [3] First milestone is … + [4] Second milestone is … ``` ### Rules -- Use the **exact** id from a `` tag whose content you actually quoted or paraphrased. Copy digit-for-digit; do **not** retype from memory. -- Before emitting `[citation:N]`, confirm the literal substring `` (or its index twin `chunk_id="N"`) appears in the tool result you are summarising this turn. If you can't see it, omit the citation. -- Never cite `` — that's the parent doc, not a chunk. -- Never invent, normalise, shorten, or guess at adjacent ids. If unsure between two candidates, omit rather than pick. +- Use the **exact** `[n]` shown next to the passage you actually quoted or paraphrased. Copy it digit-for-digit; do **not** retype from memory or renumber. +- Before emitting an `[n]`, confirm that bracketed label appears in the `read_file` output you are summarising this turn. If you can't see it, omit the citation. +- Labels are **not** sequential by position — a passage may be `[7]` while the one above it is `[3]` (numbering is shared across the whole conversation). Copy what you see; never guess an adjacent number. 
+- Write the bare label `[n]` only — no `[citation:…]` wrapper, no markdown links, no parentheses, no footnote numbers. +- Several passages behind one point → each in its own brackets with nothing between: `[3][4]`. Never `[3, 4]` and never a range like `[3-4]`. - Prefer **fewer accurate citations** over many speculative ones. -- Multiple chunks supporting the same point → comma-separated and copied individually: `[citation:128], [citation:129]`. -- Plain square brackets only — no markdown links, no parentheses, no footnote numbers. -- Tool results without `` (write/edit/move confirmations, `ls` / `glob` / `grep` listings, error strings) carry no chunk id and need none. -- Populate `evidence.chunk_ids` with **only** ids you actually emitted in `[citation:…]` markers — same set, same digits. +- Tool results without `[n]` labels (write/edit/move confirmations, `ls` / `glob` / `grep` listings, error strings) carry no label and need none. +- Populate `evidence.citations` with **only** the labels you actually emitted — same numbers. 
## Examples @@ -89,7 +78,7 @@ A KB document's XML has three numeric attributes — only **one** is a citation "path": "/documents/meetings/2026-05-11-meeting.md", "matched_candidates": null, "content_excerpt": null, - "chunk_ids": null + "citations": null }, "next_step": null, "missing_fields": null, @@ -121,7 +110,7 @@ A KB document's XML has three numeric attributes — only **one** is a citation { "id": "/documents/design/auth-rework.md", "label": "Auth Rework" } ], "content_excerpt": null, - "chunk_ids": null + "citations": null }, "next_step": "Ask the user which design doc to update.", "missing_fields": ["path"], @@ -142,7 +131,7 @@ Return **only** one JSON object (no markdown or prose outside it): "path": string | null, "matched_candidates": [ { "id": string, "label": string } ] | null, "content_excerpt": string | null, - "chunk_ids": string[] | null + "citations": number[] | null }, "next_step": string | null, "missing_fields": string[] | null, diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md index 25dafa3df..d10a08282 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md @@ -33,11 +33,11 @@ Map outcomes to your `status`: - Any other `"Error: …"` → `status=error` and relay the tool's message verbatim as `next_step`. - HITL rejection → `status=blocked` with `next_step="User declined this filesystem action. Do not retry."`. -You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. 
(`chunk_ids` is always `null` in desktop mode — see "Chunk citations in your prose" below.) +You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. (`citations` is always `null` in desktop mode — see "Citations in your prose" below.) -## Chunk citations in your prose +## Citations in your prose -In desktop mode your filesystem tools read local files only, and local-file tool results do **not** carry `` tags. Do not emit `[citation:…]` markers in `action_summary` or `evidence.content_excerpt`, and leave `evidence.chunk_ids` `null` — the absolute path is the only reference for local-file work. +In desktop mode your filesystem tools read local files only, which are not KB-indexed and carry no `[n]` citation labels. Do not emit `[n]` or `[citation:…]` markers in `action_summary` or `evidence.content_excerpt`, and leave `evidence.citations` `null` — the absolute path is the only reference for local-file work. 
## Examples @@ -56,7 +56,7 @@ In desktop mode your filesystem tools read local files only, and local-file tool "path": "/notes/meetings/2026-05-11-meeting.md", "matched_candidates": null, "content_excerpt": null, - "chunk_ids": null + "citations": null }, "next_step": null, "missing_fields": null, @@ -88,7 +88,7 @@ In desktop mode your filesystem tools read local files only, and local-file tool { "id": "/projects/web/design/auth-rework.md", "label": "Auth Rework" } ], "content_excerpt": null, - "chunk_ids": null + "citations": null }, "next_step": "Ask the user which design doc to update.", "missing_fields": ["path"], @@ -109,7 +109,7 @@ Return **only** one JSON object (no markdown or prose outside it): "path": string | null, "matched_candidates": [ { "id": string, "label": string } ] | null, "content_excerpt": string | null, - "chunk_ids": string[] | null + "citations": number[] | null }, "next_step": string | null, "missing_fields": string[] | null, diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_cloud.md b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_cloud.md index c7813e71d..ae6ba3cfb 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_cloud.md +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_cloud.md @@ -28,41 +28,30 @@ Reply in plain prose: - If the workspace does not contain the requested information, say so explicitly. Do not fabricate paths or content. - If the question is genuinely ambiguous after a thorough lookup, list the candidates with their paths and stop. -## Chunk citations +## Citations -When the evidence for a claim came from a `read_file` response that included `` blocks (i.e. a KB-indexed document under `/documents/`), append `[citation:]` to the sentence stating that claim. 
The caller passes these markers through to the end user verbatim, and the UI resolves each id by exact match against the database, so a wrong id silently breaks the citation. +When the evidence for a claim came from a `read_file` response for a KB-indexed document under `/documents/`, the document reads back as a `` block whose passages are each prefixed with a bracketed label — `[1]`, `[2]`, `[3]`. That `[n]` is the citation label. Append the relevant `[n]` to the sentence stating the claim, copying it **exactly** as shown. The caller passes these labels through verbatim and the server resolves each one, so a wrong number silently breaks the citation. -### Where chunk ids live in `read_file` output +### Where the labels live in `read_file` output -A KB document's XML has three numeric attributes — only **one** is a citation source: +A KB document reads back like this — only the bracketed `[n]` is a citation label: ``` - - - 42 ← NOT a citation. Parent doc id; ignore for citations. - ... - - - ← Index hint; the same id also appears below. - - - - ← This is the citation source. - - + + [3] First milestone is … + [4] Second milestone is … ``` ### Rules -- Use the **exact** id from a `` tag whose content you actually quoted or paraphrased. Copy digit-for-digit; do **not** retype from memory. -- Before emitting `[citation:N]`, confirm the literal substring `` (or its index twin `chunk_id="N"`) appears in the tool result you are summarising this turn. If you can't see it, omit the citation. -- Never cite `` — that's the parent doc, not a chunk. -- Never invent, normalise, shorten, or guess at adjacent ids. If unsure between two candidates, omit rather than pick. -- Prefer **fewer accurate citations** over many speculative ones. One correct `[citation:128]` is more useful than a string of wrong ids. -- Multiple chunks supporting the same point → comma-separated and copied individually: `[citation:128], [citation:129]`. 
-- Plain square brackets only — no markdown links, no parentheses, no footnote numbers. -- If a claim came from a tool result that did **not** carry a chunk id (`ls`, `glob`, `grep` listings, error strings, or files without ``), skip the citation. -- The absolute path under `/documents/` is always required; chunk citations are additive, they do not replace the path reference. +- Use the **exact** `[n]` shown next to the passage you actually quoted or paraphrased. Copy it digit-for-digit; do **not** retype from memory or renumber. +- Before emitting an `[n]`, confirm that bracketed label appears in the `read_file` output you are summarising this turn. If you can't see it, omit the citation. +- Labels are **not** sequential by position — a passage may be `[7]` while the one above it is `[3]` (numbering is shared across the whole conversation). Copy what you see; never guess an adjacent number. +- Prefer **fewer accurate citations** over many speculative ones. One correct `[3]` is more useful than a string of wrong numbers. +- Several passages behind one point → each in its own brackets with nothing between: `[3][4]`. Never `[3, 4]` and never a range like `[3-4]`. +- Write the bare label `[n]` only — no `[citation:…]` wrapper, no markdown links, no parentheses, no footnote numbers. +- If a claim came from a tool result that did **not** carry `[n]` labels (`ls`, `glob`, `grep` listings, error strings), skip the citation. +- The absolute path under `/documents/` is always required; `[n]` labels are additive, they do not replace the path reference. 
-Example: `The Q2 roadmap lists three milestones (/documents/planning/q2-roadmap.md) [citation:128], [citation:129].` +Example: `The Q2 roadmap lists three milestones (/documents/planning/q2-roadmap.md) [3][4].` diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_desktop.md b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_desktop.md index 2ea711e44..8704754a2 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_desktop.md +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_desktop.md @@ -29,6 +29,6 @@ Reply in plain prose: - If the workspace does not contain the requested information, say so explicitly. Do not fabricate paths or content. - If the question is genuinely ambiguous after a thorough lookup, list the candidates with their paths and stop. -## Chunk citations +## Citations -In desktop mode your filesystem tools read local files only, and local-file `read_file` responses do **not** carry `` tags. Cite each claim with the absolute local path; do not emit `[citation:…]` markers — your caller has nothing to resolve them against. +In desktop mode your filesystem tools read local files only, which are not KB-indexed and carry no `[n]` citation labels. Cite each claim with the absolute local path; do not emit `[n]` or `[citation:…]` markers — your caller has nothing to resolve them against. 
diff --git a/surfsense_backend/app/agents/chat/runtime/references/__init__.py b/surfsense_backend/app/agents/chat/runtime/references/__init__.py index 51e543ccc..62530fd71 100644 --- a/surfsense_backend/app/agents/chat/runtime/references/__init__.py +++ b/surfsense_backend/app/agents/chat/runtime/references/__init__.py @@ -13,7 +13,7 @@ from app.schemas.new_chat import MentionedDocumentInfo from .chat import resolve_chat_references from .connectors import resolve_connector_references -from .documents import resolve_document_references +from .documents import referenced_document_ids, resolve_document_references from .folders import resolve_folder_references from .models import ( ChatReference, @@ -89,6 +89,7 @@ __all__ = [ "FolderReference", "Reference", "ReferenceKind", + "referenced_document_ids", "render_reference_pointers", "resolve_references", ] diff --git a/surfsense_backend/app/agents/chat/runtime/references/documents/__init__.py b/surfsense_backend/app/agents/chat/runtime/references/documents/__init__.py new file mode 100644 index 000000000..4250ee119 --- /dev/null +++ b/surfsense_backend/app/agents/chat/runtime/references/documents/__init__.py @@ -0,0 +1,13 @@ +"""Resolve ``@document`` references. + +Two concerns, one subject: ``resolver`` turns document ids into pointer +references for the model, ``referenced`` turns ``@document`` / ``@folder`` +mentions into the document ids a retrieval is confined to. 
+""" + +from __future__ import annotations + +from .referenced import referenced_document_ids +from .resolver import resolve_document_references + +__all__ = ["referenced_document_ids", "resolve_document_references"] diff --git a/surfsense_backend/app/agents/chat/runtime/references/documents/referenced.py b/surfsense_backend/app/agents/chat/runtime/references/documents/referenced.py new file mode 100644 index 000000000..4e05fd324 --- /dev/null +++ b/surfsense_backend/app/agents/chat/runtime/references/documents/referenced.py @@ -0,0 +1,39 @@ +"""Resolve ``@document`` / ``@folder`` mentions to the documents they point at. + +Reference resolution, not retrieval: this answers "which knowledge-base +documents did the user point at this turn?". ``@document`` ids pass through; +``@folder`` ids expand to the documents directly inside each folder within this +search space (direct children only, not nested subfolders). The caller turns the +returned ids into a retrieval ``SearchScope``. +""" + +from __future__ import annotations + +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db import Document + + +async def referenced_document_ids( + session: AsyncSession, + *, + search_space_id: int, + document_ids: list[int] | None = None, + folder_ids: list[int] | None = None, +) -> tuple[int, ...]: + """Sorted document ids the user pointed at (empty = nothing referenced).""" + doc_ids = set(document_ids or []) + folders = list(folder_ids or []) + if folders: + rows = await session.execute( + select(Document.id).where( + Document.search_space_id == search_space_id, + Document.folder_id.in_(folders), + ) + ) + doc_ids.update(rows.scalars().all()) + return tuple(sorted(doc_ids)) + + +__all__ = ["referenced_document_ids"] diff --git a/surfsense_backend/app/agents/chat/runtime/references/documents.py b/surfsense_backend/app/agents/chat/runtime/references/documents/resolver.py similarity index 97% rename from 
surfsense_backend/app/agents/chat/runtime/references/documents.py rename to surfsense_backend/app/agents/chat/runtime/references/documents/resolver.py index b2a3b1fe4..72a459eb9 100644 --- a/surfsense_backend/app/agents/chat/runtime/references/documents.py +++ b/surfsense_backend/app/agents/chat/runtime/references/documents/resolver.py @@ -8,7 +8,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.agents.chat.runtime.path_resolver import PathIndex, doc_to_virtual_path from app.db import Document -from .models import DocumentReference +from ..models import DocumentReference async def resolve_document_references( diff --git a/surfsense_backend/tests/integration/agents/multi_agent_chat/main_agent/tools/test_search_knowledge_base.py b/surfsense_backend/tests/integration/agents/multi_agent_chat/main_agent/tools/test_search_knowledge_base.py new file mode 100644 index 000000000..b25e8eeeb --- /dev/null +++ b/surfsense_backend/tests/integration/agents/multi_agent_chat/main_agent/tools/test_search_knowledge_base.py @@ -0,0 +1,237 @@ +"""Behavior tests for the ``search_knowledge_base`` main-agent tool. + +These exercise the tool through its public contract: seed a real document, +invoke the tool, and assert on the ``Command`` it returns — the rendered +retrieved-context block carries ``[n]`` labels and the citation registry handed +back on state is populated. +The tool's own DB session is redirected to the test session, and the embedding +leg is pinned so the search is deterministic without a live model. 
+""" + +from __future__ import annotations + +import contextlib +import uuid +from types import SimpleNamespace + +import pytest +from langchain_core.messages import ToolMessage +from langgraph.types import Command + +from app.agents.chat.multi_agent_chat.main_agent.tools import search_knowledge_base +from app.agents.chat.multi_agent_chat.main_agent.tools.search_knowledge_base import ( + create_search_knowledge_base_tool, +) +from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry +from app.config import config +from app.db import Chunk, Document, DocumentType, Folder + +pytestmark = pytest.mark.integration + +_DIM = config.embedding_model_instance.dimension + + +def _axis(index: int) -> list[float]: + vector = [0.0] * _DIM + vector[index] = 1.0 + return vector + + +async def _add_document( + db_session, + *, + search_space_id: int, + title: str, + text: str, + folder_id: int | None = None, +): + document = Document( + title=title, + document_type=DocumentType.FILE, + content=text, + content_hash=uuid.uuid4().hex, + search_space_id=search_space_id, + folder_id=folder_id, + status={"state": "ready"}, + ) + db_session.add(document) + await db_session.flush() + db_session.add( + Chunk(content=text, document_id=document.id, position=0, embedding=_axis(0)) + ) + await db_session.flush() + return document + + +async def _add_folder(db_session, *, search_space_id: int, name: str = "Folder"): + folder = Folder(name=name, position="0", search_space_id=search_space_id) + db_session.add(folder) + await db_session.flush() + return folder + + +@pytest.fixture +def _tool_uses_test_session(db_session, monkeypatch): + """Redirect the tool's ``shielded_async_session`` to the test transaction.""" + + @contextlib.asynccontextmanager + async def _session(): + yield db_session + + monkeypatch.setattr(search_knowledge_base, "shielded_async_session", _session) + + +@pytest.fixture +def _pinned_embedding(monkeypatch): + monkeypatch.setattr( + 
config.embedding_model_instance, "embed", lambda _query: _axis(0) + ) + + +async def _invoke(tool, query: str, state: dict | None = None, context=None): + runtime = SimpleNamespace( + state=state or {}, tool_call_id="call-1", context=context + ) + return await tool.coroutine(query, runtime) + + +def _mentions(*, document_ids=(), folder_ids=()): + return SimpleNamespace( + mentioned_document_ids=list(document_ids), + mentioned_folder_ids=list(folder_ids), + ) + + +async def test_tool_returns_retrieved_context_with_numbered_passages( + db_session, db_search_space, _tool_uses_test_session, _pinned_embedding +): + await _add_document( + db_session, + search_space_id=db_search_space.id, + title="Asyncio Guide", + text="The asyncio library enables concurrency.", + ) + tool = create_search_knowledge_base_tool(search_space_id=db_search_space.id) + + result = await _invoke(tool, "asyncio") + + assert isinstance(result, Command) + message = result.update["messages"][0] + assert isinstance(message, ToolMessage) + assert "" in message.content + assert "[1]" in message.content + + +async def test_tool_populates_citation_registry_on_state( + db_session, db_search_space, _tool_uses_test_session, _pinned_embedding +): + await _add_document( + db_session, + search_space_id=db_search_space.id, + title="Asyncio Guide", + text="The asyncio library enables concurrency.", + ) + tool = create_search_knowledge_base_tool(search_space_id=db_search_space.id) + + result = await _invoke(tool, "asyncio") + + registry = result.update["citation_registry"] + assert isinstance(registry, CitationRegistry) + assert registry.by_n # at least one passage was registered as [n] + + +async def test_tool_reuses_existing_registry_numbering( + db_session, db_search_space, _tool_uses_test_session, _pinned_embedding +): + await _add_document( + db_session, + search_space_id=db_search_space.id, + title="Asyncio Guide", + text="The asyncio library enables concurrency.", + ) + tool = 
create_search_knowledge_base_tool(search_space_id=db_search_space.id) + + first = await _invoke(tool, "asyncio") + carried = first.update["citation_registry"] + second = await _invoke(tool, "asyncio", state={"citation_registry": carried}) + + # Same passage searched twice keeps a single [n] (find-or-create). + assert len(second.update["citation_registry"].by_n) == 1 + + +async def test_tool_reports_no_matches_without_touching_state( + db_session, db_search_space, _tool_uses_test_session, _pinned_embedding +): + tool = create_search_knowledge_base_tool(search_space_id=db_search_space.id) + + result = await _invoke(tool, "nonexistent-term-zzz") + + assert isinstance(result, str) + assert "No knowledge-base matches" in result + + +async def test_tool_rejects_empty_query( + db_search_space, _tool_uses_test_session, _pinned_embedding +): + tool = create_search_knowledge_base_tool(search_space_id=db_search_space.id) + + result = await _invoke(tool, " ") + + assert isinstance(result, str) + assert "non-empty" in result + + +async def test_document_mention_confines_search_to_pinned_doc( + db_session, db_search_space, _tool_uses_test_session, _pinned_embedding +): + pinned = await _add_document( + db_session, + search_space_id=db_search_space.id, + title="Pinned", + text="asyncio appears in the pinned doc.", + ) + await _add_document( + db_session, + search_space_id=db_search_space.id, + title="Other", + text="asyncio appears in the other doc.", + ) + tool = create_search_knowledge_base_tool(search_space_id=db_search_space.id) + + result = await _invoke( + tool, "asyncio", context=_mentions(document_ids=[pinned.id]) + ) + + # Search is confined to the pinned doc: only its content is rendered. 
+ content = result.update["messages"][0].content + assert "Pinned" in content + assert "Other" not in content + + +async def test_folder_mention_confines_search_to_folder_documents( + db_session, db_search_space, _tool_uses_test_session, _pinned_embedding +): + folder = await _add_folder(db_session, search_space_id=db_search_space.id) + await _add_document( + db_session, + search_space_id=db_search_space.id, + title="Inside", + text="asyncio appears inside the folder.", + folder_id=folder.id, + ) + await _add_document( + db_session, + search_space_id=db_search_space.id, + title="Outside", + text="asyncio appears outside the folder.", + ) + tool = create_search_knowledge_base_tool(search_space_id=db_search_space.id) + + result = await _invoke( + tool, "asyncio", context=_mentions(folder_ids=[folder.id]) + ) + + # Search is confined to the folder's document: only its content is rendered. + content = result.update["messages"][0].content + assert "Inside" in content + assert "Outside" not in content diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_state_reducers.py b/surfsense_backend/tests/unit/agents/new_chat/test_state_reducers.py index 637a10704..7398fce6a 100644 --- a/surfsense_backend/tests/unit/agents/new_chat/test_state_reducers.py +++ b/surfsense_backend/tests/unit/agents/new_chat/test_state_reducers.py @@ -4,9 +4,14 @@ from __future__ import annotations import pytest +from app.agents.chat.multi_agent_chat.shared.citations import ( + CitationRegistry, + CitationSourceType, +) from app.agents.chat.multi_agent_chat.shared.state.reducers import ( _CLEAR, _add_unique_reducer, + _citation_registry_merge_reducer, _dict_merge_with_tombstones_reducer, _initial_filesystem_state, _list_append_reducer, @@ -93,6 +98,57 @@ class TestDictMergeWithTombstones: } +def _kb_registry(chunk_id: int) -> CitationRegistry: + registry = CitationRegistry() + registry.register( + CitationSourceType.KB_CHUNK, {"document_id": 1, "chunk_id": chunk_id} + ) + return registry + 
+ +class TestCitationRegistryMergeReducer: + def test_none_left_returns_right(self): + right = _kb_registry(10) + assert _citation_registry_merge_reducer(None, right) is right + + def test_none_right_returns_left(self): + left = _kb_registry(10) + assert _citation_registry_merge_reducer(left, None) is left + + def test_both_none_returns_none(self): + assert _citation_registry_merge_reducer(None, None) is None + + def test_unions_two_registries(self): + left = _kb_registry(10) + right = _kb_registry(11) + + merged = _citation_registry_merge_reducer(left, right) + + chunk_ids = {entry.locator["chunk_id"] for entry in merged.by_n.values()} + assert chunk_ids == {10, 11} + + def test_coerces_serialized_dict_update(self): + # The checkpointer serializes Command.update via ormsgpack before the + # reducer runs, so `right` can arrive as a plain dict. + left = _kb_registry(10) + right = _kb_registry(11).model_dump() + + merged = _citation_registry_merge_reducer(left, right) + + chunk_ids = {entry.locator["chunk_id"] for entry in merged.by_n.values()} + assert chunk_ids == {10, 11} + + def test_coerces_both_sides_from_dict(self): + left = _kb_registry(10).model_dump() + right = _kb_registry(11).model_dump() + + merged = _citation_registry_merge_reducer(left, right) + + assert isinstance(merged, CitationRegistry) + chunk_ids = {entry.locator["chunk_id"] for entry in merged.by_n.values()} + assert chunk_ids == {10, 11} + + class TestInitialFilesystemState: def test_default_shape(self): state = _initial_filesystem_state() @@ -106,7 +162,6 @@ class TestInitialFilesystemState: assert state["dirty_paths"] == [] assert state["dirty_path_tool_calls"] == {} assert state["kb_priority"] == [] - assert state["kb_matched_chunk_ids"] == {} assert state["kb_anon_doc"] is None assert state["tree_version"] == 0 diff --git a/surfsense_backend/tests/unit/middleware/test_knowledge_search.py b/surfsense_backend/tests/unit/middleware/test_knowledge_search.py index 027738fba..b128c35e7 100644 --- 
a/surfsense_backend/tests/unit/middleware/test_knowledge_search.py +++ b/surfsense_backend/tests/unit/middleware/test_knowledge_search.py @@ -6,9 +6,6 @@ import pytest from langchain_core.messages import AIMessage, HumanMessage from app.agents.chat.multi_agent_chat.shared.middleware import knowledge_search as ks -from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.document_xml import ( - build_document_xml as _build_document_xml, -) from app.agents.chat.multi_agent_chat.shared.middleware.knowledge_search import ( KBSearchPlan, KnowledgePriorityMiddleware, @@ -59,88 +56,6 @@ class TestResolveSearchTypes: assert result.count("FILE") == 1 -# ── _build_document_xml ──────────────────────────────────────────────── - - -class TestBuildDocumentXml: - @pytest.fixture - def sample_document(self): - return { - "document_id": 42, - "document": { - "id": 42, - "document_type": "FILE", - "title": "Test Doc", - "metadata": {"url": "https://example.com"}, - }, - "chunks": [ - {"chunk_id": 101, "content": "First chunk content"}, - {"chunk_id": 102, "content": "Second chunk content"}, - {"chunk_id": 103, "content": "Third chunk content"}, - ], - } - - def test_contains_document_metadata(self, sample_document): - xml = _build_document_xml(sample_document) - assert "42" in xml - assert "FILE" in xml - assert "Test Doc" in xml - - def test_contains_chunk_index(self, sample_document): - xml = _build_document_xml(sample_document) - assert "" in xml - assert "" in xml - assert 'chunk_id="101"' in xml - assert 'chunk_id="102"' in xml - assert 'chunk_id="103"' in xml - - def test_matched_chunks_flagged_in_index(self, sample_document): - xml = _build_document_xml(sample_document, matched_chunk_ids={101, 103}) - lines = xml.split("\n") - for line in lines: - if 'chunk_id="101"' in line: - assert 'matched="true"' in line - if 'chunk_id="102"' in line: - assert 'matched="true"' not in line - if 'chunk_id="103"' in line: - assert 'matched="true"' in line - - def 
test_chunk_content_in_document_content_section(self, sample_document): - xml = _build_document_xml(sample_document) - assert "" in xml - assert "First chunk content" in xml - assert "Second chunk content" in xml - assert "Third chunk content" in xml - - def test_line_numbers_in_chunk_index_are_accurate(self, sample_document): - """Verify that the line ranges in chunk_index actually point to the right content.""" - xml = _build_document_xml(sample_document, matched_chunk_ids={101}) - xml_lines = xml.split("\n") - - for line in xml_lines: - if 'chunk_id="101"' in line and "lines=" in line: - import re - - m = re.search(r'lines="(\d+)-(\d+)"', line) - assert m, f"No lines= attribute found in: {line}" - start, _end = int(m.group(1)), int(m.group(2)) - target_line = xml_lines[start - 1] - assert "101" in target_line - assert "First chunk content" in target_line - break - else: - pytest.fail("chunk_id=101 entry not found in chunk_index") - - def test_splits_into_lines_correctly(self, sample_document): - """Each chunk occupies exactly one line (no embedded newlines).""" - xml = _build_document_xml(sample_document) - lines = xml.split("\n") - chunk_lines = [ - line for line in lines if "