mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-26 21:39:43 +02:00
search-kb: on-demand KB tool on the [n] spine; drop kb_matched_chunk_ids
The main agent's search_knowledge_base tool runs the hybrid spine, renders a <retrieved_context> of numbered [n] passages, and persists the registry. KB subagent prompts teach citing [n] from <document view="full"> reads (evidence.chunk_ids -> evidence.citations). Delete the now-unused search->read highlighting hand-off: the kb_matched_chunk_ids state field, its reducer default, the tool's _matched_chunk_ids writer, and the dead KnowledgePriorityMiddleware writes.
This commit is contained in:
parent
04a76b163b
commit
c98bdea5cf
16 changed files with 518 additions and 325 deletions
|
|
@ -1,12 +1,11 @@
|
||||||
"""On-demand ``search_knowledge_base`` main-agent tool (OpenCode-style lazy RAG).
|
"""On-demand ``search_knowledge_base`` main-agent tool (citation-spine RAG).
|
||||||
|
|
||||||
The main agent no longer receives eagerly pre-injected KB context on every
|
The main agent calls this when it decides it needs knowledge-base content. The
|
||||||
turn (see :class:`KnowledgePriorityMiddleware`, now gated off by default).
|
tool runs one hybrid search, renders the matched passages as a
|
||||||
Instead it calls this tool only when it decides it needs knowledge-base
|
``<retrieved_context>`` block whose passages carry server-assigned ``[n]``
|
||||||
content. The tool runs a single hybrid search (embed + DB search, ~0.5s),
|
labels, and persists the conversation's ``CitationRegistry`` onto graph state so
|
||||||
formats the top matches for the model, and writes ``kb_matched_chunk_ids``
|
the ``[n]`` -> ``[citation:<payload>]`` normalizer can resolve them after the
|
||||||
into graph state so matched-section highlighting is preserved when the agent
|
turn.
|
||||||
later reads a document via ``task(knowledge_base)``.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
@ -18,153 +17,70 @@ from langchain.tools import ToolRuntime
|
||||||
from langchain_core.messages import ToolMessage
|
from langchain_core.messages import ToolMessage
|
||||||
from langchain_core.tools import BaseTool, StructuredTool
|
from langchain_core.tools import BaseTool, StructuredTool
|
||||||
from langgraph.types import Command
|
from langgraph.types import Command
|
||||||
from sqlalchemy import select
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from app.agents.chat.multi_agent_chat.shared.middleware.knowledge_search import (
|
from app.agents.chat.multi_agent_chat.shared.citations import load_registry
|
||||||
search_knowledge_base as _hybrid_search_kb,
|
from app.agents.chat.multi_agent_chat.shared.retrieval import SearchScope, build_context
|
||||||
|
from app.agents.chat.multi_agent_chat.shared.retrieval.hybrid_search import (
|
||||||
|
search_chunks,
|
||||||
)
|
)
|
||||||
from app.agents.chat.multi_agent_chat.shared.state.filesystem_state import (
|
from app.agents.chat.multi_agent_chat.shared.state.filesystem_state import (
|
||||||
SurfSenseFilesystemState,
|
SurfSenseFilesystemState,
|
||||||
)
|
)
|
||||||
from app.agents.chat.runtime.path_resolver import (
|
from app.agents.chat.runtime.references import referenced_document_ids
|
||||||
PathIndex,
|
from app.db import shielded_async_session
|
||||||
build_path_index,
|
|
||||||
doc_to_virtual_path,
|
|
||||||
)
|
|
||||||
from app.db import Document, shielded_async_session
|
|
||||||
from app.utils.perf import get_perf_logger
|
from app.utils.perf import get_perf_logger
|
||||||
|
|
||||||
_perf_log = get_perf_logger()
|
_perf_log = get_perf_logger()
|
||||||
|
|
||||||
_DEFAULT_TOP_K = 5
|
_DEFAULT_TOP_K = 5
|
||||||
_MAX_TOP_K = 20
|
_MAX_TOP_K = 20
|
||||||
_PER_DOC_SNIPPET_CHARS = 1200
|
|
||||||
_MAX_TOTAL_CHARS = 16_000
|
|
||||||
|
|
||||||
_TOOL_DESCRIPTION = (
|
_TOOL_DESCRIPTION = (
|
||||||
"Search the user's knowledge base (their indexed documents, files, and "
|
"Search the user's knowledge base (their indexed documents, files, and "
|
||||||
"connector content) for passages relevant to a query, using hybrid "
|
"connector content) for passages relevant to a query, using hybrid "
|
||||||
"semantic + keyword retrieval.\n\n"
|
"semantic + keyword retrieval.\n\n"
|
||||||
"Use this FIRST to ground any factual or informational answer about the "
|
"Use this FIRST to ground any factual or informational answer about the "
|
||||||
"user's own documents, notes, or connected sources. The workspace tree "
|
"user's own documents, notes, or connected sources. It returns a "
|
||||||
"shows which files exist; this tool pulls the actual relevant content. "
|
"<retrieved_context> block: each matched passage is labelled [n]. Cite a "
|
||||||
"Each hit returns the document's virtual path, a relevance score, and the "
|
"passage by writing that [n] after the statement it supports.\n\n"
|
||||||
"matched snippets. If you need a document's full text, delegate a read to "
|
|
||||||
"the knowledge_base specialist via `task` using the returned path.\n\n"
|
|
||||||
"Write a focused, specific query containing the concrete entities, "
|
"Write a focused, specific query containing the concrete entities, "
|
||||||
"acronyms, people, projects, or terms you are looking for."
|
"acronyms, people, projects, or terms you are looking for."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
async def _resolve_virtual_paths(
|
def _search_types(
|
||||||
results: list[dict[str, Any]],
|
available_connectors: list[str] | None,
|
||||||
|
available_document_types: list[str] | None,
|
||||||
|
) -> tuple[str, ...] | None:
|
||||||
|
"""Merge connector + document-type filters into a scope; ``None`` if unrestricted."""
|
||||||
|
types: set[str] = set()
|
||||||
|
if available_document_types:
|
||||||
|
types.update(available_document_types)
|
||||||
|
if available_connectors:
|
||||||
|
types.update(available_connectors)
|
||||||
|
return tuple(sorted(types)) or None
|
||||||
|
|
||||||
|
|
||||||
|
async def _build_search_scope(
|
||||||
|
session: AsyncSession,
|
||||||
*,
|
*,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
) -> dict[int, str]:
|
document_types: tuple[str, ...] | None,
|
||||||
"""Resolve ``Document.id`` -> canonical virtual path for the search hits."""
|
runtime: ToolRuntime[None, SurfSenseFilesystemState],
|
||||||
doc_ids = [
|
) -> SearchScope:
|
||||||
doc_id
|
"""Assemble the retrieval scope: workspace document-type filter + @-mention pins."""
|
||||||
for doc_id in (
|
ctx = getattr(runtime, "context", None)
|
||||||
(doc.get("document") or {}).get("id")
|
document_ids = await referenced_document_ids(
|
||||||
for doc in results
|
session,
|
||||||
if isinstance(doc, dict)
|
search_space_id=search_space_id,
|
||||||
)
|
document_ids=getattr(ctx, "mentioned_document_ids", None),
|
||||||
if isinstance(doc_id, int)
|
folder_ids=getattr(ctx, "mentioned_folder_ids", None),
|
||||||
]
|
)
|
||||||
if not doc_ids:
|
return SearchScope(
|
||||||
return {}
|
document_types=document_types,
|
||||||
|
document_ids=document_ids or None,
|
||||||
async with shielded_async_session() as session:
|
|
||||||
index: PathIndex = await build_path_index(session, search_space_id)
|
|
||||||
folder_rows = await session.execute(
|
|
||||||
select(Document.id, Document.folder_id).where(
|
|
||||||
Document.search_space_id == search_space_id,
|
|
||||||
Document.id.in_(doc_ids),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
folder_by_doc_id = {row.id: row.folder_id for row in folder_rows.all()}
|
|
||||||
|
|
||||||
paths: dict[int, str] = {}
|
|
||||||
for doc in results:
|
|
||||||
doc_meta = doc.get("document") or {}
|
|
||||||
doc_id = doc_meta.get("id")
|
|
||||||
if not isinstance(doc_id, int):
|
|
||||||
continue
|
|
||||||
folder_id = folder_by_doc_id.get(doc_id, doc_meta.get("folder_id"))
|
|
||||||
paths[doc_id] = doc_to_virtual_path(
|
|
||||||
doc_id=doc_id,
|
|
||||||
title=str(doc_meta.get("title") or "untitled"),
|
|
||||||
folder_id=folder_id if isinstance(folder_id, int) else None,
|
|
||||||
index=index,
|
|
||||||
)
|
|
||||||
return paths
|
|
||||||
|
|
||||||
|
|
||||||
def _format_hits(
|
|
||||||
results: list[dict[str, Any]],
|
|
||||||
*,
|
|
||||||
paths: dict[int, str],
|
|
||||||
query: str,
|
|
||||||
) -> str:
|
|
||||||
"""Render search hits as a compact, model-readable block."""
|
|
||||||
if not results:
|
|
||||||
return (
|
|
||||||
f"No knowledge-base matches found for query: {query!r}.\n"
|
|
||||||
"Tell the user nothing relevant was found in their workspace, or "
|
|
||||||
"try a different query."
|
|
||||||
)
|
|
||||||
|
|
||||||
lines: list[str] = [f"<knowledge_base_results query={query!r}>"]
|
|
||||||
total = len(lines[0])
|
|
||||||
for rank, doc in enumerate(results, start=1):
|
|
||||||
doc_meta = doc.get("document") or {}
|
|
||||||
doc_id = doc_meta.get("id")
|
|
||||||
title = str(doc_meta.get("title") or "untitled")
|
|
||||||
doc_type = doc_meta.get("document_type") or doc.get("source") or "document"
|
|
||||||
score = doc.get("score")
|
|
||||||
score_str = f"{score:.3f}" if isinstance(score, int | float) else "n/a"
|
|
||||||
path = paths.get(doc_id) if isinstance(doc_id, int) else None
|
|
||||||
|
|
||||||
header = f"\n{rank}. {title} (type={doc_type}, score={score_str})" + (
|
|
||||||
f"\n path: {path}" if path else ""
|
|
||||||
)
|
|
||||||
|
|
||||||
content = (doc.get("content") or "").strip()
|
|
||||||
if content:
|
|
||||||
snippet = content[:_PER_DOC_SNIPPET_CHARS].strip()
|
|
||||||
if len(content) > _PER_DOC_SNIPPET_CHARS:
|
|
||||||
snippet += " ..."
|
|
||||||
body = "\n " + snippet.replace("\n", "\n ")
|
|
||||||
else:
|
|
||||||
body = "\n (no preview available; read the document for details)"
|
|
||||||
|
|
||||||
entry = header + body
|
|
||||||
if total + len(entry) > _MAX_TOTAL_CHARS:
|
|
||||||
lines.append("\n<!-- additional matches truncated to fit context -->")
|
|
||||||
break
|
|
||||||
lines.append(entry)
|
|
||||||
total += len(entry)
|
|
||||||
|
|
||||||
lines.append(
|
|
||||||
"\n\nTo read a full document, delegate to the knowledge_base specialist "
|
|
||||||
"with `task`, referencing the path above."
|
|
||||||
)
|
)
|
||||||
lines.append("\n</knowledge_base_results>")
|
|
||||||
return "".join(lines)
|
|
||||||
|
|
||||||
|
|
||||||
def _matched_chunk_ids(results: list[dict[str, Any]]) -> dict[int, list[int]]:
|
|
||||||
"""Extract ``Document.id`` -> matched chunk ids for state hand-off."""
|
|
||||||
matched: dict[int, list[int]] = {}
|
|
||||||
for doc in results:
|
|
||||||
doc_id = (doc.get("document") or {}).get("id")
|
|
||||||
if not isinstance(doc_id, int):
|
|
||||||
continue
|
|
||||||
chunk_ids = doc.get("matched_chunk_ids") or []
|
|
||||||
normalized = [int(cid) for cid in chunk_ids if isinstance(cid, int | str)]
|
|
||||||
if normalized:
|
|
||||||
matched[doc_id] = normalized
|
|
||||||
return matched
|
|
||||||
|
|
||||||
|
|
||||||
def create_search_knowledge_base_tool(
|
def create_search_knowledge_base_tool(
|
||||||
|
|
@ -176,8 +92,7 @@ def create_search_knowledge_base_tool(
|
||||||
"""Factory for the on-demand ``search_knowledge_base`` tool."""
|
"""Factory for the on-demand ``search_knowledge_base`` tool."""
|
||||||
|
|
||||||
_space_id = search_space_id
|
_space_id = search_space_id
|
||||||
_connectors = available_connectors
|
_document_types = _search_types(available_connectors, available_document_types)
|
||||||
_doc_types = available_document_types
|
|
||||||
|
|
||||||
async def _impl(
|
async def _impl(
|
||||||
query: Annotated[
|
query: Annotated[
|
||||||
|
|
@ -195,34 +110,45 @@ def create_search_knowledge_base_tool(
|
||||||
return "Error: provide a non-empty search query."
|
return "Error: provide a non-empty search query."
|
||||||
|
|
||||||
clamped_top_k = min(max(1, top_k), _MAX_TOP_K)
|
clamped_top_k = min(max(1, top_k), _MAX_TOP_K)
|
||||||
t0 = time.perf_counter()
|
registry = load_registry(getattr(runtime, "state", None))
|
||||||
results = await _hybrid_search_kb(
|
|
||||||
query=cleaned_query,
|
|
||||||
search_space_id=_space_id,
|
|
||||||
available_connectors=_connectors,
|
|
||||||
available_document_types=_doc_types,
|
|
||||||
top_k=clamped_top_k,
|
|
||||||
)
|
|
||||||
|
|
||||||
paths = await _resolve_virtual_paths(results, search_space_id=_space_id)
|
t0 = time.perf_counter()
|
||||||
rendered = _format_hits(results, paths=paths, query=cleaned_query)
|
async with shielded_async_session() as session:
|
||||||
matched = _matched_chunk_ids(results)
|
scope = await _build_search_scope(
|
||||||
|
session,
|
||||||
|
search_space_id=_space_id,
|
||||||
|
document_types=_document_types,
|
||||||
|
runtime=runtime,
|
||||||
|
)
|
||||||
|
hits = await search_chunks(
|
||||||
|
session,
|
||||||
|
search_space_id=_space_id,
|
||||||
|
query=cleaned_query,
|
||||||
|
scope=scope,
|
||||||
|
top_k=clamped_top_k,
|
||||||
|
)
|
||||||
|
rendered = build_context(cleaned_query, hits, registry)
|
||||||
|
|
||||||
_perf_log.info(
|
_perf_log.info(
|
||||||
"[search_knowledge_base] tool query=%r results=%d chars=%d in %.3fs",
|
"[search_knowledge_base] tool query=%r docs=%d in %.3fs",
|
||||||
cleaned_query[:60],
|
cleaned_query[:60],
|
||||||
len(results),
|
len(hits),
|
||||||
len(rendered),
|
|
||||||
time.perf_counter() - t0,
|
time.perf_counter() - t0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if rendered is None:
|
||||||
|
return (
|
||||||
|
f"No knowledge-base matches found for query: {cleaned_query!r}.\n"
|
||||||
|
"Tell the user nothing relevant was found in their workspace, or "
|
||||||
|
"try a different query."
|
||||||
|
)
|
||||||
|
|
||||||
update: dict[str, Any] = {
|
update: dict[str, Any] = {
|
||||||
"messages": [
|
"messages": [
|
||||||
ToolMessage(content=rendered, tool_call_id=runtime.tool_call_id)
|
ToolMessage(content=rendered, tool_call_id=runtime.tool_call_id)
|
||||||
],
|
],
|
||||||
|
"citation_registry": registry,
|
||||||
}
|
}
|
||||||
if matched:
|
|
||||||
update["kb_matched_chunk_ids"] = matched
|
|
||||||
return Command(update=update)
|
return Command(update=update)
|
||||||
|
|
||||||
return StructuredTool.from_function(
|
return StructuredTool.from_function(
|
||||||
|
|
|
||||||
|
|
@ -5,11 +5,6 @@ This middleware runs ``before_agent`` on every turn and writes:
|
||||||
* ``state["kb_priority"]`` — the top-K most relevant documents for the
|
* ``state["kb_priority"]`` — the top-K most relevant documents for the
|
||||||
current user message, used to render a ``<priority_documents>`` system
|
current user message, used to render a ``<priority_documents>`` system
|
||||||
message immediately before the user turn.
|
message immediately before the user turn.
|
||||||
* ``state["kb_matched_chunk_ids"]`` — internal hand-off mapping
|
|
||||||
(``Document.id`` → matched chunk IDs) consumed by
|
|
||||||
:class:`KBPostgresBackend._load_file_data` when the agent first reads each
|
|
||||||
document, so the XML wrapper can flag matched sections in
|
|
||||||
``<chunk_index>``.
|
|
||||||
|
|
||||||
The previous "scoped filesystem" behaviour (synthetic ``ls`` + state
|
The previous "scoped filesystem" behaviour (synthetic ``ls`` + state
|
||||||
``files`` seeding) is intentionally removed: documents are now lazy-loaded
|
``files`` seeding) is intentionally removed: documents are now lazy-loaded
|
||||||
|
|
@ -816,7 +811,6 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
|
||||||
]
|
]
|
||||||
update: dict[str, Any] = {
|
update: dict[str, Any] = {
|
||||||
"kb_priority": priority,
|
"kb_priority": priority,
|
||||||
"kb_matched_chunk_ids": {},
|
|
||||||
}
|
}
|
||||||
if self.inject_system_message:
|
if self.inject_system_message:
|
||||||
new_messages = list(state.get("messages") or [])
|
new_messages = list(state.get("messages") or [])
|
||||||
|
|
@ -930,7 +924,7 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
|
||||||
merged.append(doc)
|
merged.append(doc)
|
||||||
|
|
||||||
_t_materialize = time.perf_counter()
|
_t_materialize = time.perf_counter()
|
||||||
priority, matched_chunk_ids = await self._materialize_priority(merged)
|
priority = await self._materialize_priority(merged)
|
||||||
|
|
||||||
if folder_mention_ids:
|
if folder_mention_ids:
|
||||||
folder_entries = await self._materialize_folder_priority(folder_mention_ids)
|
folder_entries = await self._materialize_folder_priority(folder_mention_ids)
|
||||||
|
|
@ -957,7 +951,6 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
|
||||||
|
|
||||||
update: dict[str, Any] = {
|
update: dict[str, Any] = {
|
||||||
"kb_priority": priority,
|
"kb_priority": priority,
|
||||||
"kb_matched_chunk_ids": matched_chunk_ids,
|
|
||||||
}
|
}
|
||||||
if self.inject_system_message:
|
if self.inject_system_message:
|
||||||
new_messages = list(messages)
|
new_messages = list(messages)
|
||||||
|
|
@ -1016,13 +1009,12 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
|
||||||
|
|
||||||
async def _materialize_priority(
|
async def _materialize_priority(
|
||||||
self, merged: list[dict[str, Any]]
|
self, merged: list[dict[str, Any]]
|
||||||
) -> tuple[list[dict[str, Any]], dict[int, list[int]]]:
|
) -> list[dict[str, Any]]:
|
||||||
"""Resolve canonical paths and matched chunk ids for the priority list."""
|
"""Resolve canonical paths for the priority list."""
|
||||||
priority: list[dict[str, Any]] = []
|
priority: list[dict[str, Any]] = []
|
||||||
matched_chunk_ids: dict[int, list[int]] = {}
|
|
||||||
|
|
||||||
if not merged:
|
if not merged:
|
||||||
return priority, matched_chunk_ids
|
return priority
|
||||||
|
|
||||||
_t0 = time.perf_counter()
|
_t0 = time.perf_counter()
|
||||||
async with shielded_async_session() as session:
|
async with shielded_async_session() as session:
|
||||||
|
|
@ -1067,18 +1059,12 @@ class KnowledgePriorityMiddleware(AgentMiddleware): # type: ignore[type-arg]
|
||||||
"mentioned": bool(doc.get("_user_mentioned")),
|
"mentioned": bool(doc.get("_user_mentioned")),
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
if isinstance(doc_id, int):
|
|
||||||
chunk_ids = doc.get("matched_chunk_ids") or []
|
|
||||||
if chunk_ids:
|
|
||||||
matched_chunk_ids[doc_id] = [
|
|
||||||
int(cid) for cid in chunk_ids if isinstance(cid, int | str)
|
|
||||||
]
|
|
||||||
_perf_log.info(
|
_perf_log.info(
|
||||||
"[kb_priority.materialize] db=%.3fs docs=%d",
|
"[kb_priority.materialize] db=%.3fs docs=%d",
|
||||||
time.perf_counter() - _t0,
|
time.perf_counter() - _t0,
|
||||||
len(merged),
|
len(merged),
|
||||||
)
|
)
|
||||||
return priority, matched_chunk_ids
|
return priority
|
||||||
|
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
|
|
|
||||||
|
|
@ -14,8 +14,8 @@ extra fields needed to implement Postgres-backed virtual filesystem semantics:
|
||||||
* ``dirty_path_tool_calls`` — sidecar map ``path -> latest tool_call_id`` for
|
* ``dirty_path_tool_calls`` — sidecar map ``path -> latest tool_call_id`` for
|
||||||
dirty paths; used to bind the per-path snapshot to an action_id.
|
dirty paths; used to bind the per-path snapshot to an action_id.
|
||||||
* ``kb_priority`` — top-K priority hints rendered into a system message.
|
* ``kb_priority`` — top-K priority hints rendered into a system message.
|
||||||
* ``kb_matched_chunk_ids`` — internal hand-off for matched-chunk highlighting.
|
|
||||||
* ``kb_anon_doc`` — Redis-loaded anonymous document (if any).
|
* ``kb_anon_doc`` — Redis-loaded anonymous document (if any).
|
||||||
|
* ``citation_registry`` — per-conversation ``[n]`` -> source map for citations.
|
||||||
* ``tree_version`` — bumped by persistence; invalidates the tree render cache.
|
* ``tree_version`` — bumped by persistence; invalidates the tree render cache.
|
||||||
* ``workspace_tree_text`` — pre-rendered ``<workspace_tree>`` body for the turn.
|
* ``workspace_tree_text`` — pre-rendered ``<workspace_tree>`` body for the turn.
|
||||||
|
|
||||||
|
|
@ -30,9 +30,11 @@ from typing import Annotated, Any, NotRequired
|
||||||
from deepagents.middleware.filesystem import FilesystemState
|
from deepagents.middleware.filesystem import FilesystemState
|
||||||
from typing_extensions import TypedDict
|
from typing_extensions import TypedDict
|
||||||
|
|
||||||
|
from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry
|
||||||
from app.agents.chat.multi_agent_chat.shared.receipts.receipt import Receipt
|
from app.agents.chat.multi_agent_chat.shared.receipts.receipt import Receipt
|
||||||
from app.agents.chat.multi_agent_chat.shared.state.reducers import (
|
from app.agents.chat.multi_agent_chat.shared.state.reducers import (
|
||||||
_add_unique_reducer,
|
_add_unique_reducer,
|
||||||
|
_citation_registry_merge_reducer,
|
||||||
_dict_merge_with_tombstones_reducer,
|
_dict_merge_with_tombstones_reducer,
|
||||||
_int_counter_merge_reducer,
|
_int_counter_merge_reducer,
|
||||||
_list_append_reducer,
|
_list_append_reducer,
|
||||||
|
|
@ -162,12 +164,16 @@ class SurfSenseFilesystemState(FilesystemState):
|
||||||
kb_priority: NotRequired[Annotated[list[KbPriorityEntry], _replace_reducer]]
|
kb_priority: NotRequired[Annotated[list[KbPriorityEntry], _replace_reducer]]
|
||||||
"""Top-K priority hints rendered as a system message before the user turn."""
|
"""Top-K priority hints rendered as a system message before the user turn."""
|
||||||
|
|
||||||
kb_matched_chunk_ids: NotRequired[Annotated[dict[int, list[int]], _replace_reducer]]
|
|
||||||
"""Internal: ``Document.id`` -> list of matched chunk IDs from hybrid search."""
|
|
||||||
|
|
||||||
kb_anon_doc: NotRequired[Annotated[KbAnonDoc | None, _replace_reducer]]
|
kb_anon_doc: NotRequired[Annotated[KbAnonDoc | None, _replace_reducer]]
|
||||||
"""Anonymous-session document loaded from Redis (read-only, no DB row)."""
|
"""Anonymous-session document loaded from Redis (read-only, no DB row)."""
|
||||||
|
|
||||||
|
citation_registry: NotRequired[
|
||||||
|
Annotated[CitationRegistry, _citation_registry_merge_reducer]
|
||||||
|
]
|
||||||
|
"""Per-conversation ``[n]`` -> source map; written by retrieval, read by the
|
||||||
|
normalizer. Merges (union, find-or-create) so parallel/subagent registrations
|
||||||
|
stay globally consistent instead of clobbering each other."""
|
||||||
|
|
||||||
tree_version: NotRequired[Annotated[int, _replace_reducer]]
|
tree_version: NotRequired[Annotated[int, _replace_reducer]]
|
||||||
"""Monotonically increasing counter; bumped when commits change the KB tree."""
|
"""Monotonically increasing counter; bumped when commits change the KB tree."""
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
These reducers back the extra state fields used by the cloud-mode filesystem
|
These reducers back the extra state fields used by the cloud-mode filesystem
|
||||||
agent (`cwd`, `staged_dirs`, `pending_moves`, `dirty_paths`, `doc_id_by_path`,
|
agent (`cwd`, `staged_dirs`, `pending_moves`, `dirty_paths`, `doc_id_by_path`,
|
||||||
`kb_priority`, `kb_matched_chunk_ids`, `kb_anon_doc`, `tree_version`).
|
`kb_priority`, `kb_anon_doc`, `citation_registry`, `tree_version`).
|
||||||
|
|
||||||
Tools mutate these fields ONLY via `Command(update={...})` returns; the
|
Tools mutate these fields ONLY via `Command(update={...})` returns; the
|
||||||
reducers are responsible for merging successive updates atomically and for
|
reducers are responsible for merging successive updates atomically and for
|
||||||
|
|
@ -20,6 +20,8 @@ from __future__ import annotations
|
||||||
|
|
||||||
from typing import Any, Final, TypeVar
|
from typing import Any, Final, TypeVar
|
||||||
|
|
||||||
|
from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry
|
||||||
|
|
||||||
_CLEAR: Final[str] = "\x00__SURFSENSE_FILESYSTEM_CLEAR__\x00"
|
_CLEAR: Final[str] = "\x00__SURFSENSE_FILESYSTEM_CLEAR__\x00"
|
||||||
"""Reset sentinel; pass it inside a list/dict update to request a reset.
|
"""Reset sentinel; pass it inside a list/dict update to request a reset.
|
||||||
|
|
||||||
|
|
@ -204,6 +206,41 @@ def _int_counter_merge_reducer(
|
||||||
return base
|
return base
|
||||||
|
|
||||||
|
|
||||||
|
def _as_registry(value: Any) -> CitationRegistry | None:
    """Return ``value`` as a ``CitationRegistry``, or ``None`` if it is not one.

    A registry can reach a reducer in two shapes: as a model instance, or as a
    plain ``dict``, because the checkpointer serializes ``Command.update`` via
    ``ormsgpack`` *before* reducers run. Instances pass through untouched and
    dicts are re-validated into a model; ``None`` and every other type yield
    ``None``.
    """
    if value is None:
        return None
    if isinstance(value, CitationRegistry):
        return value
    # Serialized form: rebuild the model from its dict representation.
    return CitationRegistry.model_validate(value) if isinstance(value, dict) else None
|
||||||
|
|
||||||
|
|
||||||
|
def _citation_registry_merge_reducer(
    left: Any,
    right: Any,
) -> CitationRegistry | None:
    """Combine the stored registry with an incoming one rather than replacing it.

    Both sides are first coerced with ``_as_registry``. If only one side is a
    usable registry, that side is returned as-is (``None`` when neither is).
    When both are present the result is ``left.merge(right)``; see
    :meth:`CitationRegistry.merge`, which unions the two find-or-create style
    and re-mints colliding labels instead of dropping them. That keeps ``[n]``
    globally consistent when branches (parent + subagents, parallel tool calls)
    each register into a registry forked from the same base.
    """
    # Coerce the update before the existing value, as the original does.
    incoming = _as_registry(right)
    existing = _as_registry(left)
    if existing is None or incoming is None:
        return existing if incoming is None else incoming
    return existing.merge(incoming)
|
||||||
|
|
||||||
|
|
||||||
def _initial_filesystem_state() -> dict[str, Any]:
|
def _initial_filesystem_state() -> dict[str, Any]:
|
||||||
"""Default empty values for SurfSense filesystem state fields.
|
"""Default empty values for SurfSense filesystem state fields.
|
||||||
|
|
||||||
|
|
@ -222,7 +259,6 @@ def _initial_filesystem_state() -> dict[str, Any]:
|
||||||
"dirty_paths": [],
|
"dirty_paths": [],
|
||||||
"dirty_path_tool_calls": {},
|
"dirty_path_tool_calls": {},
|
||||||
"kb_priority": [],
|
"kb_priority": [],
|
||||||
"kb_matched_chunk_ids": {},
|
|
||||||
"kb_anon_doc": None,
|
"kb_anon_doc": None,
|
||||||
"tree_version": 0,
|
"tree_version": 0,
|
||||||
}
|
}
|
||||||
|
|
@ -231,6 +267,7 @@ def _initial_filesystem_state() -> dict[str, Any]:
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"_CLEAR",
|
"_CLEAR",
|
||||||
"_add_unique_reducer",
|
"_add_unique_reducer",
|
||||||
|
"_citation_registry_merge_reducer",
|
||||||
"_dict_merge_with_tombstones_reducer",
|
"_dict_merge_with_tombstones_reducer",
|
||||||
"_initial_filesystem_state",
|
"_initial_filesystem_state",
|
||||||
"_int_counter_merge_reducer",
|
"_int_counter_merge_reducer",
|
||||||
|
|
|
||||||
|
|
@ -2,4 +2,4 @@ Read-only specialist for the user's workspace (documents and folders). Use to fi
|
||||||
|
|
||||||
Pass your full question as one string. The specialist runs in isolation: it cannot see this thread, so include any path hints, filters, or constraints it needs.
|
Pass your full question as one string. The specialist runs in isolation: it cannot see this thread, so include any path hints, filters, or constraints it needs.
|
||||||
|
|
||||||
The specialist returns plain prose with absolute paths and `[citation:<chunk_id>]` markers when claims came from KB-indexed chunks. Preserve those markers verbatim if you forward the answer.
|
The specialist returns plain prose with absolute paths and `[n]` citation labels when claims came from KB-indexed documents. Preserve those labels verbatim if you forward the answer.
|
||||||
|
|
|
||||||
|
|
@ -35,42 +35,31 @@ Map outcomes to your `status`:
|
||||||
|
|
||||||
You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see.
|
You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see.
|
||||||
|
|
||||||
## Chunk citations in your prose
|
## Citations in your prose
|
||||||
|
|
||||||
When `read_file` returns a KB-indexed document under `/documents/`, the response includes `<chunk id='…'>` blocks. Whenever a fact in your `action_summary` or `evidence.content_excerpt` came from a specific chunk, append `[citation:<chunk_id>]` to the sentence stating that fact, using the **exact** id from the `<chunk id='…'>` tag. The caller relays these markers to the end user verbatim, and the UI resolves each id by exact match against the database, so a wrong id silently breaks the citation.
|
When `read_file` returns a KB-indexed document under `/documents/`, it comes back as a `<document … view="full">` block whose passages are each prefixed with a bracketed label — `[1]`, `[2]`, `[3]`. That `[n]` is the citation label. Whenever a fact in your `action_summary` or `evidence.content_excerpt` came from a specific passage, append its `[n]` to the sentence stating that fact, copying the label **exactly** as shown. The caller relays these labels verbatim and the server resolves each one, so a wrong number silently breaks the citation.
|
||||||
|
|
||||||
### Where chunk ids live in `read_file` output
|
### Where the labels live in `read_file` output
|
||||||
|
|
||||||
A KB document's XML has three numeric attributes — only **one** is a citation source:
|
A KB document reads back like this — only the bracketed `[n]` is a citation label:
|
||||||
|
|
||||||
```
|
```
|
||||||
<document>
|
<document title="Q2 Roadmap" source="File" view="full">
|
||||||
<document_metadata>
|
[3] First milestone is …
|
||||||
<document_id>42</document_id> ← NOT a citation. Parent doc id; ignore for citations.
|
[4] Second milestone is …
|
||||||
...
|
|
||||||
</document_metadata>
|
|
||||||
<chunk_index>
|
|
||||||
<entry chunk_id="128" lines="14-22"/> ← Index hint; the same id also appears below.
|
|
||||||
<entry chunk_id="129" lines="23-30" matched="true"/>
|
|
||||||
</chunk_index>
|
|
||||||
<document_content>
|
|
||||||
<chunk id='128'><![CDATA[…]]></chunk> ← This is the citation source.
|
|
||||||
<chunk id='129'><![CDATA[…]]></chunk>
|
|
||||||
</document_content>
|
|
||||||
</document>
|
</document>
|
||||||
```
|
```
|
||||||
|
|
||||||
### Rules
|
### Rules
|
||||||
|
|
||||||
- Use the **exact** id from a `<chunk id='…'>` tag whose content you actually quoted or paraphrased. Copy digit-for-digit; do **not** retype from memory.
|
- Use the **exact** `[n]` shown next to the passage you actually quoted or paraphrased. Copy it digit-for-digit; do **not** retype from memory or renumber.
|
||||||
- Before emitting `[citation:N]`, confirm the literal substring `<chunk id='N'>` (or its index twin `chunk_id="N"`) appears in the tool result you are summarising this turn. If you can't see it, omit the citation.
|
- Before emitting an `[n]`, confirm that bracketed label appears in the `read_file` output you are summarising this turn. If you can't see it, omit the citation.
|
||||||
- Never cite `<document_id>` — that's the parent doc, not a chunk.
|
- Labels are **not** sequential by position — a passage may be `[7]` while the one above it is `[3]` (numbering is shared across the whole conversation). Copy what you see; never guess an adjacent number.
|
||||||
- Never invent, normalise, shorten, or guess at adjacent ids. If unsure between two candidates, omit rather than pick.
|
- Write the bare label `[n]` only — no `[citation:…]` wrapper, no markdown links, no parentheses, no footnote numbers.
|
||||||
|
- Several passages behind one point → each in its own brackets with nothing between: `[3][4]`. Never `[3, 4]` and never a range like `[3-4]`.
|
||||||
- Prefer **fewer accurate citations** over many speculative ones.
|
- Prefer **fewer accurate citations** over many speculative ones.
|
||||||
- Multiple chunks supporting the same point → comma-separated and copied individually: `[citation:128], [citation:129]`.
|
- Tool results that contain no `[n]` labels (write/edit/move confirmations, `ls` / `glob` / `grep` listings, error strings) have nothing to cite and need no citation.
|
||||||
- Plain square brackets only — no markdown links, no parentheses, no footnote numbers.
|
- Populate `evidence.citations` with **only** the labels you actually emitted — same numbers.
|
||||||
- Tool results without `<chunk id='…'>` (write/edit/move confirmations, `ls` / `glob` / `grep` listings, error strings) carry no chunk id and need none.
|
|
||||||
- Populate `evidence.chunk_ids` with **only** ids you actually emitted in `[citation:…]` markers — same set, same digits.
|
|
||||||
|
|
||||||
## Examples
|
## Examples
|
||||||
|
|
||||||
|
|
@ -89,7 +78,7 @@ A KB document's XML has three numeric attributes — only **one** is a citation
|
||||||
"path": "/documents/meetings/2026-05-11-meeting.md",
|
"path": "/documents/meetings/2026-05-11-meeting.md",
|
||||||
"matched_candidates": null,
|
"matched_candidates": null,
|
||||||
"content_excerpt": null,
|
"content_excerpt": null,
|
||||||
"chunk_ids": null
|
"citations": null
|
||||||
},
|
},
|
||||||
"next_step": null,
|
"next_step": null,
|
||||||
"missing_fields": null,
|
"missing_fields": null,
|
||||||
|
|
@ -121,7 +110,7 @@ A KB document's XML has three numeric attributes — only **one** is a citation
|
||||||
{ "id": "/documents/design/auth-rework.md", "label": "Auth Rework" }
|
{ "id": "/documents/design/auth-rework.md", "label": "Auth Rework" }
|
||||||
],
|
],
|
||||||
"content_excerpt": null,
|
"content_excerpt": null,
|
||||||
"chunk_ids": null
|
"citations": null
|
||||||
},
|
},
|
||||||
"next_step": "Ask the user which design doc to update.",
|
"next_step": "Ask the user which design doc to update.",
|
||||||
"missing_fields": ["path"],
|
"missing_fields": ["path"],
|
||||||
|
|
@ -142,7 +131,7 @@ Return **only** one JSON object (no markdown or prose outside it):
|
||||||
"path": string | null,
|
"path": string | null,
|
||||||
"matched_candidates": [ { "id": string, "label": string } ] | null,
|
"matched_candidates": [ { "id": string, "label": string } ] | null,
|
||||||
"content_excerpt": string | null,
|
"content_excerpt": string | null,
|
||||||
"chunk_ids": string[] | null
|
"citations": number[] | null
|
||||||
},
|
},
|
||||||
"next_step": string | null,
|
"next_step": string | null,
|
||||||
"missing_fields": string[] | null,
|
"missing_fields": string[] | null,
|
||||||
|
|
|
||||||
|
|
@ -33,11 +33,11 @@ Map outcomes to your `status`:
|
||||||
- Any other `"Error: …"` → `status=error` and relay the tool's message verbatim as `next_step`.
|
- Any other `"Error: …"` → `status=error` and relay the tool's message verbatim as `next_step`.
|
||||||
- HITL rejection → `status=blocked` with `next_step="User declined this filesystem action. Do not retry."`.
|
- HITL rejection → `status=blocked` with `next_step="User declined this filesystem action. Do not retry."`.
|
||||||
|
|
||||||
You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. (`chunk_ids` is always `null` in desktop mode — see "Chunk citations in your prose" below.)
|
You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. (`citations` is always `null` in desktop mode — see "Citations in your prose" below.)
|
||||||
|
|
||||||
## Chunk citations in your prose
|
## Citations in your prose
|
||||||
|
|
||||||
In desktop mode your filesystem tools read local files only, and local-file tool results do **not** carry `<chunk id='…'>` tags. Do not emit `[citation:…]` markers in `action_summary` or `evidence.content_excerpt`, and leave `evidence.chunk_ids` `null` — the absolute path is the only reference for local-file work.
|
In desktop mode your filesystem tools read local files only, which are not KB-indexed and carry no `[n]` citation labels. Do not emit `[n]` or `[citation:…]` markers in `action_summary` or `evidence.content_excerpt`, and leave `evidence.citations` `null` — the absolute path is the only reference for local-file work.
|
||||||
|
|
||||||
## Examples
|
## Examples
|
||||||
|
|
||||||
|
|
@ -56,7 +56,7 @@ In desktop mode your filesystem tools read local files only, and local-file tool
|
||||||
"path": "/notes/meetings/2026-05-11-meeting.md",
|
"path": "/notes/meetings/2026-05-11-meeting.md",
|
||||||
"matched_candidates": null,
|
"matched_candidates": null,
|
||||||
"content_excerpt": null,
|
"content_excerpt": null,
|
||||||
"chunk_ids": null
|
"citations": null
|
||||||
},
|
},
|
||||||
"next_step": null,
|
"next_step": null,
|
||||||
"missing_fields": null,
|
"missing_fields": null,
|
||||||
|
|
@ -88,7 +88,7 @@ In desktop mode your filesystem tools read local files only, and local-file tool
|
||||||
{ "id": "/projects/web/design/auth-rework.md", "label": "Auth Rework" }
|
{ "id": "/projects/web/design/auth-rework.md", "label": "Auth Rework" }
|
||||||
],
|
],
|
||||||
"content_excerpt": null,
|
"content_excerpt": null,
|
||||||
"chunk_ids": null
|
"citations": null
|
||||||
},
|
},
|
||||||
"next_step": "Ask the user which design doc to update.",
|
"next_step": "Ask the user which design doc to update.",
|
||||||
"missing_fields": ["path"],
|
"missing_fields": ["path"],
|
||||||
|
|
@ -109,7 +109,7 @@ Return **only** one JSON object (no markdown or prose outside it):
|
||||||
"path": string | null,
|
"path": string | null,
|
||||||
"matched_candidates": [ { "id": string, "label": string } ] | null,
|
"matched_candidates": [ { "id": string, "label": string } ] | null,
|
||||||
"content_excerpt": string | null,
|
"content_excerpt": string | null,
|
||||||
"chunk_ids": string[] | null
|
"citations": number[] | null
|
||||||
},
|
},
|
||||||
"next_step": string | null,
|
"next_step": string | null,
|
||||||
"missing_fields": string[] | null,
|
"missing_fields": string[] | null,
|
||||||
|
|
|
||||||
|
|
@ -28,41 +28,30 @@ Reply in plain prose:
|
||||||
- If the workspace does not contain the requested information, say so explicitly. Do not fabricate paths or content.
|
- If the workspace does not contain the requested information, say so explicitly. Do not fabricate paths or content.
|
||||||
- If the question is genuinely ambiguous after a thorough lookup, list the candidates with their paths and stop.
|
- If the question is genuinely ambiguous after a thorough lookup, list the candidates with their paths and stop.
|
||||||
|
|
||||||
## Chunk citations
|
## Citations
|
||||||
|
|
||||||
When the evidence for a claim came from a `read_file` response that included `<chunk id='…'>` blocks (i.e. a KB-indexed document under `/documents/`), append `[citation:<chunk_id>]` to the sentence stating that claim. The caller passes these markers through to the end user verbatim, and the UI resolves each id by exact match against the database, so a wrong id silently breaks the citation.
|
When the evidence for a claim came from a `read_file` response for a KB-indexed document under `/documents/`, the document reads back as a `<document … view="full">` block whose passages are each prefixed with a bracketed label — `[1]`, `[2]`, `[3]`. That `[n]` is the citation label. Append the relevant `[n]` to the sentence stating the claim, copying it **exactly** as shown. The caller passes these labels through verbatim and the server resolves each one, so a wrong number silently breaks the citation.
|
||||||
|
|
||||||
### Where chunk ids live in `read_file` output
|
### Where the labels live in `read_file` output
|
||||||
|
|
||||||
A KB document's XML has three numeric attributes — only **one** is a citation source:
|
A KB document reads back like this — only the bracketed `[n]` is a citation label:
|
||||||
|
|
||||||
```
|
```
|
||||||
<document>
|
<document title="Q2 Roadmap" source="File" view="full">
|
||||||
<document_metadata>
|
[3] First milestone is …
|
||||||
<document_id>42</document_id> ← NOT a citation. Parent doc id; ignore for citations.
|
[4] Second milestone is …
|
||||||
...
|
|
||||||
</document_metadata>
|
|
||||||
<chunk_index>
|
|
||||||
<entry chunk_id="128" lines="14-22"/> ← Index hint; the same id also appears below.
|
|
||||||
<entry chunk_id="129" lines="23-30" matched="true"/>
|
|
||||||
</chunk_index>
|
|
||||||
<document_content>
|
|
||||||
<chunk id='128'><![CDATA[…]]></chunk> ← This is the citation source.
|
|
||||||
<chunk id='129'><![CDATA[…]]></chunk>
|
|
||||||
</document_content>
|
|
||||||
</document>
|
</document>
|
||||||
```
|
```
|
||||||
|
|
||||||
### Rules
|
### Rules
|
||||||
|
|
||||||
- Use the **exact** id from a `<chunk id='…'>` tag whose content you actually quoted or paraphrased. Copy digit-for-digit; do **not** retype from memory.
|
- Use the **exact** `[n]` shown next to the passage you actually quoted or paraphrased. Copy it digit-for-digit; do **not** retype from memory or renumber.
|
||||||
- Before emitting `[citation:N]`, confirm the literal substring `<chunk id='N'>` (or its index twin `chunk_id="N"`) appears in the tool result you are summarising this turn. If you can't see it, omit the citation.
|
- Before emitting an `[n]`, confirm that bracketed label appears in the `read_file` output you are summarising this turn. If you can't see it, omit the citation.
|
||||||
- Never cite `<document_id>` — that's the parent doc, not a chunk.
|
- Labels are **not** sequential by position — a passage may be `[7]` while the one above it is `[3]` (numbering is shared across the whole conversation). Copy what you see; never guess an adjacent number.
|
||||||
- Never invent, normalise, shorten, or guess at adjacent ids. If unsure between two candidates, omit rather than pick.
|
- Prefer **fewer accurate citations** over many speculative ones. One correct `[3]` is more useful than a string of wrong numbers.
|
||||||
- Prefer **fewer accurate citations** over many speculative ones. One correct `[citation:128]` is more useful than a string of wrong ids.
|
- Several passages behind one point → each in its own brackets with nothing between: `[3][4]`. Never `[3, 4]` and never a range like `[3-4]`.
|
||||||
- Multiple chunks supporting the same point → comma-separated and copied individually: `[citation:128], [citation:129]`.
|
- Write the bare label `[n]` only — no `[citation:…]` wrapper, no markdown links, no parentheses, no footnote numbers.
|
||||||
- Plain square brackets only — no markdown links, no parentheses, no footnote numbers.
|
- If a claim came from a tool result that did **not** carry `[n]` labels (`ls`, `glob`, `grep` listings, error strings), skip the citation.
|
||||||
- If a claim came from a tool result that did **not** carry a chunk id (`ls`, `glob`, `grep` listings, error strings, or files without `<chunk id='…'>`), skip the citation.
|
- The absolute path under `/documents/` is always required; `[n]` labels are additive, they do not replace the path reference.
|
||||||
- The absolute path under `/documents/` is always required; chunk citations are additive, they do not replace the path reference.
|
|
||||||
|
|
||||||
Example: `The Q2 roadmap lists three milestones (/documents/planning/q2-roadmap.md) [citation:128], [citation:129].`
|
Example: `The Q2 roadmap lists three milestones (/documents/planning/q2-roadmap.md) [3][4].`
|
||||||
|
|
|
||||||
|
|
@ -29,6 +29,6 @@ Reply in plain prose:
|
||||||
- If the workspace does not contain the requested information, say so explicitly. Do not fabricate paths or content.
|
- If the workspace does not contain the requested information, say so explicitly. Do not fabricate paths or content.
|
||||||
- If the question is genuinely ambiguous after a thorough lookup, list the candidates with their paths and stop.
|
- If the question is genuinely ambiguous after a thorough lookup, list the candidates with their paths and stop.
|
||||||
|
|
||||||
## Chunk citations
|
## Citations
|
||||||
|
|
||||||
In desktop mode your filesystem tools read local files only, and local-file `read_file` responses do **not** carry `<chunk id='…'>` tags. Cite each claim with the absolute local path; do not emit `[citation:…]` markers — your caller has nothing to resolve them against.
|
In desktop mode your filesystem tools read local files only, which are not KB-indexed and carry no `[n]` citation labels. Cite each claim with the absolute local path; do not emit `[n]` or `[citation:…]` markers — your caller has nothing to resolve them against.
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,7 @@ from app.schemas.new_chat import MentionedDocumentInfo
|
||||||
|
|
||||||
from .chat import resolve_chat_references
|
from .chat import resolve_chat_references
|
||||||
from .connectors import resolve_connector_references
|
from .connectors import resolve_connector_references
|
||||||
from .documents import resolve_document_references
|
from .documents import referenced_document_ids, resolve_document_references
|
||||||
from .folders import resolve_folder_references
|
from .folders import resolve_folder_references
|
||||||
from .models import (
|
from .models import (
|
||||||
ChatReference,
|
ChatReference,
|
||||||
|
|
@ -89,6 +89,7 @@ __all__ = [
|
||||||
"FolderReference",
|
"FolderReference",
|
||||||
"Reference",
|
"Reference",
|
||||||
"ReferenceKind",
|
"ReferenceKind",
|
||||||
|
"referenced_document_ids",
|
||||||
"render_reference_pointers",
|
"render_reference_pointers",
|
||||||
"resolve_references",
|
"resolve_references",
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,13 @@
|
||||||
|
"""Resolve ``@document`` references.
|
||||||
|
|
||||||
|
Two concerns, one subject: ``resolver`` turns document ids into pointer
|
||||||
|
references for the model, ``referenced`` turns ``@document`` / ``@folder``
|
||||||
|
mentions into the document ids a retrieval is confined to.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from .referenced import referenced_document_ids
|
||||||
|
from .resolver import resolve_document_references
|
||||||
|
|
||||||
|
__all__ = ["referenced_document_ids", "resolve_document_references"]
|
||||||
|
|
@ -0,0 +1,39 @@
|
||||||
|
"""Resolve ``@document`` / ``@folder`` mentions to the documents they point at.
|
||||||
|
|
||||||
|
Reference resolution, not retrieval: this answers "which knowledge-base
|
||||||
|
documents did the user point at this turn?". ``@document`` ids pass through;
|
||||||
|
``@folder`` ids expand to the documents directly inside each folder within this
|
||||||
|
search space (direct children only, not nested subfolders). The caller turns the
|
||||||
|
returned ids into a retrieval ``SearchScope``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from sqlalchemy import select
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
|
from app.db import Document
|
||||||
|
|
||||||
|
|
||||||
|
async def referenced_document_ids(
|
||||||
|
session: AsyncSession,
|
||||||
|
*,
|
||||||
|
search_space_id: int,
|
||||||
|
document_ids: list[int] | None = None,
|
||||||
|
folder_ids: list[int] | None = None,
|
||||||
|
) -> tuple[int, ...]:
|
||||||
|
"""Sorted document ids the user pointed at (empty = nothing referenced)."""
|
||||||
|
doc_ids = set(document_ids or [])
|
||||||
|
folders = list(folder_ids or [])
|
||||||
|
if folders:
|
||||||
|
rows = await session.execute(
|
||||||
|
select(Document.id).where(
|
||||||
|
Document.search_space_id == search_space_id,
|
||||||
|
Document.folder_id.in_(folders),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
doc_ids.update(rows.scalars().all())
|
||||||
|
return tuple(sorted(doc_ids))
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["referenced_document_ids"]
|
||||||
|
|
@ -8,7 +8,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
from app.agents.chat.runtime.path_resolver import PathIndex, doc_to_virtual_path
|
from app.agents.chat.runtime.path_resolver import PathIndex, doc_to_virtual_path
|
||||||
from app.db import Document
|
from app.db import Document
|
||||||
|
|
||||||
from .models import DocumentReference
|
from ..models import DocumentReference
|
||||||
|
|
||||||
|
|
||||||
async def resolve_document_references(
|
async def resolve_document_references(
|
||||||
|
|
@ -0,0 +1,237 @@
|
||||||
|
"""Behavior tests for the ``search_knowledge_base`` main-agent tool.
|
||||||
|
|
||||||
|
These exercise the tool through its public contract: seed a real document,
|
||||||
|
invoke the tool, and assert on the ``Command`` it returns — the rendered
|
||||||
|
``<retrieved_context>`` carries ``[n]`` labels and the citation registry handed
|
||||||
|
back on state is populated.
|
||||||
|
The tool's own DB session is redirected to the test session, and the embedding
|
||||||
|
leg is pinned so the search is deterministic without a live model.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import contextlib
|
||||||
|
import uuid
|
||||||
|
from types import SimpleNamespace
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from langchain_core.messages import ToolMessage
|
||||||
|
from langgraph.types import Command
|
||||||
|
|
||||||
|
from app.agents.chat.multi_agent_chat.main_agent.tools import search_knowledge_base
|
||||||
|
from app.agents.chat.multi_agent_chat.main_agent.tools.search_knowledge_base import (
|
||||||
|
create_search_knowledge_base_tool,
|
||||||
|
)
|
||||||
|
from app.agents.chat.multi_agent_chat.shared.citations import CitationRegistry
|
||||||
|
from app.config import config
|
||||||
|
from app.db import Chunk, Document, DocumentType, Folder
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.integration
|
||||||
|
|
||||||
|
_DIM = config.embedding_model_instance.dimension
|
||||||
|
|
||||||
|
|
||||||
|
def _axis(index: int) -> list[float]:
|
||||||
|
vector = [0.0] * _DIM
|
||||||
|
vector[index] = 1.0
|
||||||
|
return vector
|
||||||
|
|
||||||
|
|
||||||
|
async def _add_document(
|
||||||
|
db_session,
|
||||||
|
*,
|
||||||
|
search_space_id: int,
|
||||||
|
title: str,
|
||||||
|
text: str,
|
||||||
|
folder_id: int | None = None,
|
||||||
|
):
|
||||||
|
document = Document(
|
||||||
|
title=title,
|
||||||
|
document_type=DocumentType.FILE,
|
||||||
|
content=text,
|
||||||
|
content_hash=uuid.uuid4().hex,
|
||||||
|
search_space_id=search_space_id,
|
||||||
|
folder_id=folder_id,
|
||||||
|
status={"state": "ready"},
|
||||||
|
)
|
||||||
|
db_session.add(document)
|
||||||
|
await db_session.flush()
|
||||||
|
db_session.add(
|
||||||
|
Chunk(content=text, document_id=document.id, position=0, embedding=_axis(0))
|
||||||
|
)
|
||||||
|
await db_session.flush()
|
||||||
|
return document
|
||||||
|
|
||||||
|
|
||||||
|
async def _add_folder(db_session, *, search_space_id: int, name: str = "Folder"):
|
||||||
|
folder = Folder(name=name, position="0", search_space_id=search_space_id)
|
||||||
|
db_session.add(folder)
|
||||||
|
await db_session.flush()
|
||||||
|
return folder
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def _tool_uses_test_session(db_session, monkeypatch):
|
||||||
|
"""Redirect the tool's ``shielded_async_session`` to the test transaction."""
|
||||||
|
|
||||||
|
@contextlib.asynccontextmanager
|
||||||
|
async def _session():
|
||||||
|
yield db_session
|
||||||
|
|
||||||
|
monkeypatch.setattr(search_knowledge_base, "shielded_async_session", _session)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def _pinned_embedding(monkeypatch):
|
||||||
|
monkeypatch.setattr(
|
||||||
|
config.embedding_model_instance, "embed", lambda _query: _axis(0)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def _invoke(tool, query: str, state: dict | None = None, context=None):
|
||||||
|
runtime = SimpleNamespace(
|
||||||
|
state=state or {}, tool_call_id="call-1", context=context
|
||||||
|
)
|
||||||
|
return await tool.coroutine(query, runtime)
|
||||||
|
|
||||||
|
|
||||||
|
def _mentions(*, document_ids=(), folder_ids=()):
|
||||||
|
return SimpleNamespace(
|
||||||
|
mentioned_document_ids=list(document_ids),
|
||||||
|
mentioned_folder_ids=list(folder_ids),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def test_tool_returns_retrieved_context_with_numbered_passages(
|
||||||
|
db_session, db_search_space, _tool_uses_test_session, _pinned_embedding
|
||||||
|
):
|
||||||
|
await _add_document(
|
||||||
|
db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
title="Asyncio Guide",
|
||||||
|
text="The asyncio library enables concurrency.",
|
||||||
|
)
|
||||||
|
tool = create_search_knowledge_base_tool(search_space_id=db_search_space.id)
|
||||||
|
|
||||||
|
result = await _invoke(tool, "asyncio")
|
||||||
|
|
||||||
|
assert isinstance(result, Command)
|
||||||
|
message = result.update["messages"][0]
|
||||||
|
assert isinstance(message, ToolMessage)
|
||||||
|
assert "<retrieved_context>" in message.content
|
||||||
|
assert "[1]" in message.content
|
||||||
|
|
||||||
|
|
||||||
|
async def test_tool_populates_citation_registry_on_state(
|
||||||
|
db_session, db_search_space, _tool_uses_test_session, _pinned_embedding
|
||||||
|
):
|
||||||
|
await _add_document(
|
||||||
|
db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
title="Asyncio Guide",
|
||||||
|
text="The asyncio library enables concurrency.",
|
||||||
|
)
|
||||||
|
tool = create_search_knowledge_base_tool(search_space_id=db_search_space.id)
|
||||||
|
|
||||||
|
result = await _invoke(tool, "asyncio")
|
||||||
|
|
||||||
|
registry = result.update["citation_registry"]
|
||||||
|
assert isinstance(registry, CitationRegistry)
|
||||||
|
assert registry.by_n # at least one passage was registered as [n]
|
||||||
|
|
||||||
|
|
||||||
|
async def test_tool_reuses_existing_registry_numbering(
|
||||||
|
db_session, db_search_space, _tool_uses_test_session, _pinned_embedding
|
||||||
|
):
|
||||||
|
await _add_document(
|
||||||
|
db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
title="Asyncio Guide",
|
||||||
|
text="The asyncio library enables concurrency.",
|
||||||
|
)
|
||||||
|
tool = create_search_knowledge_base_tool(search_space_id=db_search_space.id)
|
||||||
|
|
||||||
|
first = await _invoke(tool, "asyncio")
|
||||||
|
carried = first.update["citation_registry"]
|
||||||
|
second = await _invoke(tool, "asyncio", state={"citation_registry": carried})
|
||||||
|
|
||||||
|
# Same passage searched twice keeps a single [n] (find-or-create).
|
||||||
|
assert len(second.update["citation_registry"].by_n) == 1
|
||||||
|
|
||||||
|
|
||||||
|
async def test_tool_reports_no_matches_without_touching_state(
|
||||||
|
db_session, db_search_space, _tool_uses_test_session, _pinned_embedding
|
||||||
|
):
|
||||||
|
tool = create_search_knowledge_base_tool(search_space_id=db_search_space.id)
|
||||||
|
|
||||||
|
result = await _invoke(tool, "nonexistent-term-zzz")
|
||||||
|
|
||||||
|
assert isinstance(result, str)
|
||||||
|
assert "No knowledge-base matches" in result
|
||||||
|
|
||||||
|
|
||||||
|
async def test_tool_rejects_empty_query(
|
||||||
|
db_search_space, _tool_uses_test_session, _pinned_embedding
|
||||||
|
):
|
||||||
|
tool = create_search_knowledge_base_tool(search_space_id=db_search_space.id)
|
||||||
|
|
||||||
|
result = await _invoke(tool, " ")
|
||||||
|
|
||||||
|
assert isinstance(result, str)
|
||||||
|
assert "non-empty" in result
|
||||||
|
|
||||||
|
|
||||||
|
async def test_document_mention_confines_search_to_pinned_doc(
|
||||||
|
db_session, db_search_space, _tool_uses_test_session, _pinned_embedding
|
||||||
|
):
|
||||||
|
pinned = await _add_document(
|
||||||
|
db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
title="Pinned",
|
||||||
|
text="asyncio appears in the pinned doc.",
|
||||||
|
)
|
||||||
|
await _add_document(
|
||||||
|
db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
title="Other",
|
||||||
|
text="asyncio appears in the other doc.",
|
||||||
|
)
|
||||||
|
tool = create_search_knowledge_base_tool(search_space_id=db_search_space.id)
|
||||||
|
|
||||||
|
result = await _invoke(
|
||||||
|
tool, "asyncio", context=_mentions(document_ids=[pinned.id])
|
||||||
|
)
|
||||||
|
|
||||||
|
# Search is confined to the pinned doc: only its content is rendered.
|
||||||
|
content = result.update["messages"][0].content
|
||||||
|
assert "Pinned" in content
|
||||||
|
assert "Other" not in content
|
||||||
|
|
||||||
|
|
||||||
|
async def test_folder_mention_confines_search_to_folder_documents(
|
||||||
|
db_session, db_search_space, _tool_uses_test_session, _pinned_embedding
|
||||||
|
):
|
||||||
|
folder = await _add_folder(db_session, search_space_id=db_search_space.id)
|
||||||
|
await _add_document(
|
||||||
|
db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
title="Inside",
|
||||||
|
text="asyncio appears inside the folder.",
|
||||||
|
folder_id=folder.id,
|
||||||
|
)
|
||||||
|
await _add_document(
|
||||||
|
db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
title="Outside",
|
||||||
|
text="asyncio appears outside the folder.",
|
||||||
|
)
|
||||||
|
tool = create_search_knowledge_base_tool(search_space_id=db_search_space.id)
|
||||||
|
|
||||||
|
result = await _invoke(
|
||||||
|
tool, "asyncio", context=_mentions(folder_ids=[folder.id])
|
||||||
|
)
|
||||||
|
|
||||||
|
# Search is confined to the folder's document: only its content is rendered.
|
||||||
|
content = result.update["messages"][0].content
|
||||||
|
assert "Inside" in content
|
||||||
|
assert "Outside" not in content
|
||||||
|
|
@ -4,9 +4,14 @@ from __future__ import annotations
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from app.agents.chat.multi_agent_chat.shared.citations import (
|
||||||
|
CitationRegistry,
|
||||||
|
CitationSourceType,
|
||||||
|
)
|
||||||
from app.agents.chat.multi_agent_chat.shared.state.reducers import (
|
from app.agents.chat.multi_agent_chat.shared.state.reducers import (
|
||||||
_CLEAR,
|
_CLEAR,
|
||||||
_add_unique_reducer,
|
_add_unique_reducer,
|
||||||
|
_citation_registry_merge_reducer,
|
||||||
_dict_merge_with_tombstones_reducer,
|
_dict_merge_with_tombstones_reducer,
|
||||||
_initial_filesystem_state,
|
_initial_filesystem_state,
|
||||||
_list_append_reducer,
|
_list_append_reducer,
|
||||||
|
|
@ -93,6 +98,57 @@ class TestDictMergeWithTombstones:
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _kb_registry(chunk_id: int) -> CitationRegistry:
|
||||||
|
registry = CitationRegistry()
|
||||||
|
registry.register(
|
||||||
|
CitationSourceType.KB_CHUNK, {"document_id": 1, "chunk_id": chunk_id}
|
||||||
|
)
|
||||||
|
return registry
|
||||||
|
|
||||||
|
|
||||||
|
class TestCitationRegistryMergeReducer:
|
||||||
|
def test_none_left_returns_right(self):
|
||||||
|
right = _kb_registry(10)
|
||||||
|
assert _citation_registry_merge_reducer(None, right) is right
|
||||||
|
|
||||||
|
def test_none_right_returns_left(self):
|
||||||
|
left = _kb_registry(10)
|
||||||
|
assert _citation_registry_merge_reducer(left, None) is left
|
||||||
|
|
||||||
|
def test_both_none_returns_none(self):
|
||||||
|
assert _citation_registry_merge_reducer(None, None) is None
|
||||||
|
|
||||||
|
def test_unions_two_registries(self):
|
||||||
|
left = _kb_registry(10)
|
||||||
|
right = _kb_registry(11)
|
||||||
|
|
||||||
|
merged = _citation_registry_merge_reducer(left, right)
|
||||||
|
|
||||||
|
chunk_ids = {entry.locator["chunk_id"] for entry in merged.by_n.values()}
|
||||||
|
assert chunk_ids == {10, 11}
|
||||||
|
|
||||||
|
def test_coerces_serialized_dict_update(self):
|
||||||
|
# The checkpointer serializes Command.update via ormsgpack before the
|
||||||
|
# reducer runs, so `right` can arrive as a plain dict.
|
||||||
|
left = _kb_registry(10)
|
||||||
|
right = _kb_registry(11).model_dump()
|
||||||
|
|
||||||
|
merged = _citation_registry_merge_reducer(left, right)
|
||||||
|
|
||||||
|
chunk_ids = {entry.locator["chunk_id"] for entry in merged.by_n.values()}
|
||||||
|
assert chunk_ids == {10, 11}
|
||||||
|
|
||||||
|
def test_coerces_both_sides_from_dict(self):
|
||||||
|
left = _kb_registry(10).model_dump()
|
||||||
|
right = _kb_registry(11).model_dump()
|
||||||
|
|
||||||
|
merged = _citation_registry_merge_reducer(left, right)
|
||||||
|
|
||||||
|
assert isinstance(merged, CitationRegistry)
|
||||||
|
chunk_ids = {entry.locator["chunk_id"] for entry in merged.by_n.values()}
|
||||||
|
assert chunk_ids == {10, 11}
|
||||||
|
|
||||||
|
|
||||||
class TestInitialFilesystemState:
|
class TestInitialFilesystemState:
|
||||||
def test_default_shape(self):
|
def test_default_shape(self):
|
||||||
state = _initial_filesystem_state()
|
state = _initial_filesystem_state()
|
||||||
|
|
@ -106,7 +162,6 @@ class TestInitialFilesystemState:
|
||||||
assert state["dirty_paths"] == []
|
assert state["dirty_paths"] == []
|
||||||
assert state["dirty_path_tool_calls"] == {}
|
assert state["dirty_path_tool_calls"] == {}
|
||||||
assert state["kb_priority"] == []
|
assert state["kb_priority"] == []
|
||||||
assert state["kb_matched_chunk_ids"] == {}
|
|
||||||
assert state["kb_anon_doc"] is None
|
assert state["kb_anon_doc"] is None
|
||||||
assert state["tree_version"] == 0
|
assert state["tree_version"] == 0
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -6,9 +6,6 @@ import pytest
|
||||||
from langchain_core.messages import AIMessage, HumanMessage
|
from langchain_core.messages import AIMessage, HumanMessage
|
||||||
|
|
||||||
from app.agents.chat.multi_agent_chat.shared.middleware import knowledge_search as ks
|
from app.agents.chat.multi_agent_chat.shared.middleware import knowledge_search as ks
|
||||||
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.document_xml import (
|
|
||||||
build_document_xml as _build_document_xml,
|
|
||||||
)
|
|
||||||
from app.agents.chat.multi_agent_chat.shared.middleware.knowledge_search import (
|
from app.agents.chat.multi_agent_chat.shared.middleware.knowledge_search import (
|
||||||
KBSearchPlan,
|
KBSearchPlan,
|
||||||
KnowledgePriorityMiddleware,
|
KnowledgePriorityMiddleware,
|
||||||
|
|
@ -59,88 +56,6 @@ class TestResolveSearchTypes:
|
||||||
assert result.count("FILE") == 1
|
assert result.count("FILE") == 1
|
||||||
|
|
||||||
|
|
||||||
# ── _build_document_xml ────────────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
class TestBuildDocumentXml:
|
|
||||||
@pytest.fixture
|
|
||||||
def sample_document(self):
|
|
||||||
return {
|
|
||||||
"document_id": 42,
|
|
||||||
"document": {
|
|
||||||
"id": 42,
|
|
||||||
"document_type": "FILE",
|
|
||||||
"title": "Test Doc",
|
|
||||||
"metadata": {"url": "https://example.com"},
|
|
||||||
},
|
|
||||||
"chunks": [
|
|
||||||
{"chunk_id": 101, "content": "First chunk content"},
|
|
||||||
{"chunk_id": 102, "content": "Second chunk content"},
|
|
||||||
{"chunk_id": 103, "content": "Third chunk content"},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
|
|
||||||
def test_contains_document_metadata(self, sample_document):
|
|
||||||
xml = _build_document_xml(sample_document)
|
|
||||||
assert "<document_id>42</document_id>" in xml
|
|
||||||
assert "<document_type>FILE</document_type>" in xml
|
|
||||||
assert "Test Doc" in xml
|
|
||||||
|
|
||||||
def test_contains_chunk_index(self, sample_document):
|
|
||||||
xml = _build_document_xml(sample_document)
|
|
||||||
assert "<chunk_index>" in xml
|
|
||||||
assert "</chunk_index>" in xml
|
|
||||||
assert 'chunk_id="101"' in xml
|
|
||||||
assert 'chunk_id="102"' in xml
|
|
||||||
assert 'chunk_id="103"' in xml
|
|
||||||
|
|
||||||
def test_matched_chunks_flagged_in_index(self, sample_document):
|
|
||||||
xml = _build_document_xml(sample_document, matched_chunk_ids={101, 103})
|
|
||||||
lines = xml.split("\n")
|
|
||||||
for line in lines:
|
|
||||||
if 'chunk_id="101"' in line:
|
|
||||||
assert 'matched="true"' in line
|
|
||||||
if 'chunk_id="102"' in line:
|
|
||||||
assert 'matched="true"' not in line
|
|
||||||
if 'chunk_id="103"' in line:
|
|
||||||
assert 'matched="true"' in line
|
|
||||||
|
|
||||||
def test_chunk_content_in_document_content_section(self, sample_document):
|
|
||||||
xml = _build_document_xml(sample_document)
|
|
||||||
assert "<document_content>" in xml
|
|
||||||
assert "First chunk content" in xml
|
|
||||||
assert "Second chunk content" in xml
|
|
||||||
assert "Third chunk content" in xml
|
|
||||||
|
|
||||||
def test_line_numbers_in_chunk_index_are_accurate(self, sample_document):
|
|
||||||
"""Verify that the line ranges in chunk_index actually point to the right content."""
|
|
||||||
xml = _build_document_xml(sample_document, matched_chunk_ids={101})
|
|
||||||
xml_lines = xml.split("\n")
|
|
||||||
|
|
||||||
for line in xml_lines:
|
|
||||||
if 'chunk_id="101"' in line and "lines=" in line:
|
|
||||||
import re
|
|
||||||
|
|
||||||
m = re.search(r'lines="(\d+)-(\d+)"', line)
|
|
||||||
assert m, f"No lines= attribute found in: {line}"
|
|
||||||
start, _end = int(m.group(1)), int(m.group(2))
|
|
||||||
target_line = xml_lines[start - 1]
|
|
||||||
assert "101" in target_line
|
|
||||||
assert "First chunk content" in target_line
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
pytest.fail("chunk_id=101 entry not found in chunk_index")
|
|
||||||
|
|
||||||
def test_splits_into_lines_correctly(self, sample_document):
|
|
||||||
"""Each chunk occupies exactly one line (no embedded newlines)."""
|
|
||||||
xml = _build_document_xml(sample_document)
|
|
||||||
lines = xml.split("\n")
|
|
||||||
chunk_lines = [
|
|
||||||
line for line in lines if "<![CDATA[" in line and "<chunk" in line
|
|
||||||
]
|
|
||||||
assert len(chunk_lines) == 3
|
|
||||||
|
|
||||||
|
|
||||||
# ── planner parsing / date normalization ───────────────────────────────
|
# ── planner parsing / date normalization ───────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue