Merge pull request #1523 from CREDO23/fix/chat-citations

[Feat] Line-level KB citations
This commit is contained in:
Rohan Verma 2026-06-19 21:01:15 -07:00 committed by GitHub
commit cd2242147a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
53 changed files with 1793 additions and 415 deletions

View file

@ -0,0 +1,31 @@
"""add chunks.start_char/end_char for citation offsets
Char offsets into the document's source_markdown (half-open span) let citations
resolve the exact passage a chunk came from. Nullable because historical rows
have no span; they populate on the next connector sync or user edit/reindex.
No backfill: a bulk UPDATE of every chunk on a large HNSW-indexed table rewrites
every secondary index per row (see migration 165 for the same reasoning).
Revision ID: 166
Revises: 165
"""
from collections.abc import Sequence
from alembic import op
# Alembic revision identifiers: this migration is "166" and applies on top of "165".
revision: str = "166"
down_revision: str | None = "165"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Add the ``start_char`` / ``end_char`` INTEGER columns to ``chunks``.

    Each statement uses ``ADD COLUMN IF NOT EXISTS`` and declares no default
    or NOT NULL constraint; no existing rows are updated here.
    """
    statements = (
        "ALTER TABLE chunks ADD COLUMN IF NOT EXISTS start_char INTEGER;",
        "ALTER TABLE chunks ADD COLUMN IF NOT EXISTS end_char INTEGER;",
    )
    # Same order as before: start_char first, then end_char.
    for statement in statements:
        op.execute(statement)
def downgrade() -> None:
    """Drop the citation offset columns from ``chunks``.

    Columns are removed in the reverse order of ``upgrade`` and each statement
    uses ``DROP COLUMN IF EXISTS``.
    """
    statements = (
        "ALTER TABLE chunks DROP COLUMN IF EXISTS end_char;",
        "ALTER TABLE chunks DROP COLUMN IF EXISTS start_char;",
    )
    for statement in statements:
        op.execute(statement)

View file

@ -18,7 +18,6 @@ skipped (e.g. client disconnect).
from __future__ import annotations from __future__ import annotations
import asyncio
import logging import logging
from datetime import UTC, datetime from datetime import UTC, datetime
from typing import Any from typing import Any
@ -58,9 +57,8 @@ from app.db import (
FolderRevision, FolderRevision,
shielded_async_session, shielded_async_session,
) )
from app.indexing_pipeline.document_chunker import chunk_text from app.indexing_pipeline.cache.cached_indexing import build_chunk_embeddings
from app.utils.document_converters import ( from app.utils.document_converters import (
embed_texts,
generate_content_hash, generate_content_hash,
generate_unique_identifier_hash, generate_unique_identifier_hash,
) )
@ -234,24 +232,23 @@ async def _create_document(
session.add(doc) session.add(doc)
await session.flush() await session.flush()
summary_embedding = (await asyncio.to_thread(embed_texts, [content]))[0] summary_embedding, chunk_embeddings = await build_chunk_embeddings(
content, use_code_chunker=False
)
doc.embedding = summary_embedding doc.embedding = summary_embedding
chunks = chunk_text(content) session.add_all(
if chunks: [
chunk_embeddings = await asyncio.to_thread(embed_texts, chunks) Chunk(
session.add_all( document_id=doc.id,
[ content=sl.text,
Chunk( embedding=embedding,
document_id=doc.id, position=i,
content=text, start_char=sl.start_char,
embedding=embedding, end_char=sl.end_char,
position=i, )
) for i, (sl, embedding) in enumerate(chunk_embeddings)
for i, (text, embedding) in enumerate( ]
zip(chunks, chunk_embeddings, strict=True) )
)
]
)
return doc return doc
@ -287,26 +284,25 @@ async def _update_document(
search_space_id, search_space_id,
) )
summary_embedding = (await asyncio.to_thread(embed_texts, [content]))[0] summary_embedding, chunk_embeddings = await build_chunk_embeddings(
content, use_code_chunker=False
)
document.embedding = summary_embedding document.embedding = summary_embedding
await session.execute(delete(Chunk).where(Chunk.document_id == document.id)) await session.execute(delete(Chunk).where(Chunk.document_id == document.id))
chunks = chunk_text(content) session.add_all(
if chunks: [
chunk_embeddings = await asyncio.to_thread(embed_texts, chunks) Chunk(
session.add_all( document_id=document.id,
[ content=sl.text,
Chunk( embedding=embedding,
document_id=document.id, position=i,
content=text, start_char=sl.start_char,
embedding=embedding, end_char=sl.end_char,
position=i, )
) for i, (sl, embedding) in enumerate(chunk_embeddings)
for i, (text, embedding) in enumerate( ]
zip(chunks, chunk_embeddings, strict=True) )
)
]
)
return document return document

View file

@ -1,42 +1,58 @@
<citations> <citations>
Citations reach the answer through two channels. Use whichever applies — and Citations reach the answer through three channels. Use whichever applies, and
never invent ids you didn't see. Citation ids are resolved by exact-match never invent ids you didn't see: ids are matched exactly, so a wrong one
lookup; a wrong id silently breaks the link, so when in doubt, omit. silently breaks the link — when in doubt, omit. Always write a citation as
plain `[citation:…]` brackets — no markdown links, no footnote numbers, no
parentheses.
### Channel A — chunk blocks injected this turn ### Channel A — web_search chunk blocks injected this turn
When `web_search` returns `<document>` / `<chunk id='…'>` blocks in this When `web_search` returns `<document>` / `<chunk id='…'>` blocks in this
turn: turn, the chunk `id` is the result's URL:
1. For each factual statement taken from those chunks, add 1. For each factual statement taken from a chunk, add `[citation:<url>]`
`[citation:chunk_id]` using the **exact** id from a visible using the **exact** id from a visible `<chunk id='…'>` tag. Copy the
`<chunk id='…'>` tag. Copy digit-for-digit (or the URL verbatim); URL verbatim; do not retype it from memory.
do not retype from memory. 2. Multiple chunks → `[citation:url1], [citation:url2]` (comma-separated,
2. `<document_id>` is the parent doc id, **not** a citation source —
only ids inside `<chunk id='…'>` count.
3. Multiple chunks → `[citation:id1], [citation:id2]` (comma-separated,
each id copied individually). each id copied individually).
4. Never invent, normalise, or guess at adjacent ids; if unsure, omit. 3. Never invent, normalise, or guess at a URL; if unsure, omit.
5. Plain brackets only — no markdown links, no footnote numbering.
### Channel B — citations relayed by a `task` specialist ### Channel B — citations relayed by a `task` specialist
A `task(...)` tool message may contain `[citation:<chunk_id>]` markers A `task(...)` tool message may contain `[citation:…]` markers the
the specialist already attached to its prose. The specialist saw the specialist already attached to its prose — line citations
underlying `<chunk id='…'>` blocks; you didn't. So: (`[citation:d<id>#L<a>-<b>]`) or chunk ids (`[citation:N]`). The
specialist read the underlying document and tied each marker to a
passage; you didn't. So:
1. **Preserve those markers verbatim** in your final answer — do not 1. **Preserve those markers verbatim** in your final answer — do not
reformat, renumber, drop, or wrap them in markdown links. When you reformat, renumber, drop, or wrap them in markdown links. When you
paraphrase a specialist sentence, copy the marker character-for- paraphrase a specialist sentence, copy the marker character-for-
character; do not regenerate the id from memory (LLMs reliably character; do not regenerate it from memory (LLMs reliably corrupt
corrupt nearby digits). nearby digits).
2. Keep each marker attached to the sentence the specialist attached 2. Keep each marker attached to the sentence the specialist attached
it to. it to.
3. Do **not** add new `[citation:…]` markers of your own to a 3. Do **not** add new `[citation:…]` markers of your own to a
specialist's prose; if a fact has no marker, the specialist specialist's prose; if a fact has no marker, the specialist
couldn't tie it to a chunk and neither can you. couldn't tie it to a source and neither can you.
4. When a specialist returns JSON, the citation markers live inside 4. When a specialist returns JSON, the citation markers live inside
the prose-bearing fields (e.g. a summary or excerpt). Pull them the prose-bearing fields (e.g. a summary or excerpt). Pull them
along with the surrounding sentence when you quote. along with the surrounding sentence when you quote.
If neither channel surfaces citation markers this turn, do not fabricate ### Channel C — your knowledge base (search hits and `read_file`)
them. Knowledge-base facts are cited by line range using the document id:
`[citation:d<document_id>#L<start>-<end>]` (a single line is `#L<n>-<n>`).
1. `search_knowledge_base` prints a ready `[citation:d…#L…-…]` token above each
matched passage. When that passage supports your point, copy the token
verbatim — that is the entire citation.
2. When you `read_file` a `/documents/...` path, its header gives the
`<document_id>` and an optional `<matched_lines>` pointer, and the body is
shown with line numbers; cite the lines you actually used. Use `read_file`
when you need more context than a search passage shows.
3. Copy document ids and line numbers exactly as shown — never estimate,
shift, or invent them.
4. Older documents without a numbered body instead show `<chunk id='N'>`
blocks; cite those with `[citation:N]`, copying the id exactly.
If none of these channels surfaces a citable source this turn, do not
fabricate citations.
</citations> </citations>

View file

@ -33,6 +33,7 @@ from app.agents.chat.runtime.path_resolver import (
) )
from app.db import Document, shielded_async_session from app.db import Document, shielded_async_session
from app.utils.perf import get_perf_logger from app.utils.perf import get_perf_logger
from app.utils.text_spans import char_span_to_line_range
_perf_log = get_perf_logger() _perf_log = get_perf_logger()
@ -56,12 +57,16 @@ _TOOL_DESCRIPTION = (
) )
async def _resolve_virtual_paths( async def _resolve_doc_context(
results: list[dict[str, Any]], results: list[dict[str, Any]],
*, *,
search_space_id: int, search_space_id: int,
) -> dict[int, str]: ) -> tuple[dict[int, str], dict[int, str]]:
"""Resolve ``Document.id`` -> canonical virtual path for the search hits.""" """Resolve ``Document.id`` -> (canonical virtual path, source_markdown).
``source_markdown`` is the canonical body the chunk spans index into; the
renderer uses it to turn a chunk's char span into a line range.
"""
doc_ids = [ doc_ids = [
doc_id doc_id
for doc_id in ( for doc_id in (
@ -72,17 +77,24 @@ async def _resolve_virtual_paths(
if isinstance(doc_id, int) if isinstance(doc_id, int)
] ]
if not doc_ids: if not doc_ids:
return {} return {}, {}
async with shielded_async_session() as session: async with shielded_async_session() as session:
index: PathIndex = await build_path_index(session, search_space_id) index: PathIndex = await build_path_index(session, search_space_id)
folder_rows = await session.execute( rows = await session.execute(
select(Document.id, Document.folder_id).where( select(
Document.id, Document.folder_id, Document.source_markdown
).where(
Document.search_space_id == search_space_id, Document.search_space_id == search_space_id,
Document.id.in_(doc_ids), Document.id.in_(doc_ids),
) )
) )
folder_by_doc_id = {row.id: row.folder_id for row in folder_rows.all()} folder_by_doc_id: dict[int, int | None] = {}
bodies: dict[int, str] = {}
for row in rows.all():
folder_by_doc_id[row.id] = row.folder_id
if row.source_markdown:
bodies[row.id] = row.source_markdown
paths: dict[int, str] = {} paths: dict[int, str] = {}
for doc in results: for doc in results:
@ -97,13 +109,76 @@ async def _resolve_virtual_paths(
folder_id=folder_id if isinstance(folder_id, int) else None, folder_id=folder_id if isinstance(folder_id, int) else None,
index=index, index=index,
) )
return paths return paths, bodies
def _citation_token(chunk: dict[str, Any], body: str | None, doc_id: int | None) -> str:
"""Ready-to-copy ``[citation:dID#Lstart-end]`` token, or '' without spans."""
start = chunk.get("start_char")
end = chunk.get("end_char")
if (
not body
or not isinstance(doc_id, int)
or not isinstance(start, int)
or not isinstance(end, int)
):
return ""
start_line, end_line = char_span_to_line_range(body, start, end)
return f"[citation:d{doc_id}#L{start_line}-{end_line}]"
def _render_passage(
chunk: dict[str, Any], body: str | None, doc_id: int | None
) -> str | None:
"""Render one matched chunk as an indented passage tagged with its token."""
content = (chunk.get("content") or "").strip()
if not content:
return None
snippet = content[:_PER_DOC_SNIPPET_CHARS].strip()
if len(content) > _PER_DOC_SNIPPET_CHARS:
snippet += " ..."
indented = snippet.replace("\n", "\n ")
token = _citation_token(chunk, body, doc_id)
head = f"\n {token}" if token else ""
return f"{head}\n {indented}"
def _matched_passages(
doc: dict[str, Any], body: str | None, doc_id: int | None
) -> str:
"""Render the RRF-matched chunks; '' when none can be rendered."""
by_id = {
c.get("chunk_id"): c
for c in (doc.get("chunks") or [])
if isinstance(c, dict)
}
rendered: list[str] = []
for chunk_id in doc.get("matched_chunk_ids") or []:
chunk = by_id.get(chunk_id)
if chunk is None:
continue
passage = _render_passage(chunk, body, doc_id)
if passage:
rendered.append(passage)
return "".join(rendered)
def _fallback_snippet(doc: dict[str, Any]) -> str:
"""Top-of-document preview, used only when no matched chunk is available."""
content = (doc.get("content") or "").strip()
if not content:
return "\n (no preview available; read the document for details)"
snippet = content[:_PER_DOC_SNIPPET_CHARS].strip()
if len(content) > _PER_DOC_SNIPPET_CHARS:
snippet += " ..."
return "\n " + snippet.replace("\n", "\n ")
def _format_hits( def _format_hits(
results: list[dict[str, Any]], results: list[dict[str, Any]],
*, *,
paths: dict[int, str], paths: dict[int, str],
bodies: dict[int, str],
query: str, query: str,
) -> str: ) -> str:
"""Render search hits as a compact, model-readable block.""" """Render search hits as a compact, model-readable block."""
@ -124,21 +199,15 @@ def _format_hits(
score = doc.get("score") score = doc.get("score")
score_str = f"{score:.3f}" if isinstance(score, int | float) else "n/a" score_str = f"{score:.3f}" if isinstance(score, int | float) else "n/a"
path = paths.get(doc_id) if isinstance(doc_id, int) else None path = paths.get(doc_id) if isinstance(doc_id, int) else None
body = bodies.get(doc_id) if isinstance(doc_id, int) else None
header = f"\n{rank}. {title} (type={doc_type}, score={score_str})" + ( id_str = f"id={doc_id}, " if isinstance(doc_id, int) else ""
header = f"\n{rank}. {title} ({id_str}type={doc_type}, score={score_str})" + (
f"\n path: {path}" if path else "" f"\n path: {path}" if path else ""
) )
content = (doc.get("content") or "").strip() passages = _matched_passages(doc, body, doc_id if isinstance(doc_id, int) else None)
if content: entry = header + (passages or _fallback_snippet(doc))
snippet = content[:_PER_DOC_SNIPPET_CHARS].strip()
if len(content) > _PER_DOC_SNIPPET_CHARS:
snippet += " ..."
body = "\n " + snippet.replace("\n", "\n ")
else:
body = "\n (no preview available; read the document for details)"
entry = header + body
if total + len(entry) > _MAX_TOTAL_CHARS: if total + len(entry) > _MAX_TOTAL_CHARS:
lines.append("\n<!-- additional matches truncated to fit context -->") lines.append("\n<!-- additional matches truncated to fit context -->")
break break
@ -146,8 +215,9 @@ def _format_hits(
total += len(entry) total += len(entry)
lines.append( lines.append(
"\n\nTo read a full document, delegate to the knowledge_base specialist " "\n\nTo cite a matched passage, copy its [citation:dID#Lstart-end] token "
"with `task`, referencing the path above." "verbatim. To quote more context or read the full document, delegate to "
"the knowledge_base specialist with `task` using the path above."
) )
lines.append("\n</knowledge_base_results>") lines.append("\n</knowledge_base_results>")
return "".join(lines) return "".join(lines)
@ -204,8 +274,10 @@ def create_search_knowledge_base_tool(
top_k=clamped_top_k, top_k=clamped_top_k,
) )
paths = await _resolve_virtual_paths(results, search_space_id=_space_id) paths, bodies = await _resolve_doc_context(results, search_space_id=_space_id)
rendered = _format_hits(results, paths=paths, query=cleaned_query) rendered = _format_hits(
results, paths=paths, bodies=bodies, query=cleaned_query
)
matched = _matched_chunk_ids(results) matched = _matched_chunk_ids(results)
_perf_log.info( _perf_log.info(

View file

@ -45,6 +45,10 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.document_xml import ( from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.document_xml import (
build_document_xml, build_document_xml,
) )
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.numbered_document import (
build_read_preamble,
compute_matched_line_ranges,
)
from app.agents.chat.runtime.path_resolver import ( from app.agents.chat.runtime.path_resolver import (
DOCUMENTS_ROOT, DOCUMENTS_ROOT,
build_path_index, build_path_index,
@ -64,6 +68,12 @@ def _basename(path: str) -> str:
return path.rsplit("/", 1)[-1] return path.rsplit("/", 1)[-1]
def _metadata_url(metadata: dict[str, Any]) -> str:
return (
metadata.get("url") or metadata.get("source") or metadata.get("page_url") or ""
)
def _is_under(child: str, parent: str) -> bool: def _is_under(child: str, parent: str) -> bool:
"""Return True iff ``child`` is at-or-under ``parent`` (directory semantics).""" """Return True iff ``child`` is at-or-under ``parent`` (directory semantics)."""
if parent == "/": if parent == "/":
@ -460,8 +470,11 @@ class KBPostgresBackend(BackendProtocol):
loaded = await self._load_file_data(file_path) loaded = await self._load_file_data(file_path)
if loaded is None: if loaded is None:
return f"Error: File '{file_path}' not found" return f"Error: File '{file_path}' not found"
file_data, _ = loaded file_data, _, preamble = loaded
return format_read_response(file_data, offset, limit) body = format_read_response(file_data, offset, limit)
if preamble and offset == 0:
return preamble + body
return body
def read(self, file_path: str, offset: int = 0, limit: int = 2000) -> str: # type: ignore[override] def read(self, file_path: str, offset: int = 0, limit: int = 2000) -> str: # type: ignore[override]
return asyncio.run(self.aread(file_path, offset, limit)) return asyncio.run(self.aread(file_path, offset, limit))
@ -469,12 +482,14 @@ class KBPostgresBackend(BackendProtocol):
async def _load_file_data( async def _load_file_data(
self, self,
path: str, path: str,
) -> tuple[dict[str, Any], int | None] | None: ) -> tuple[dict[str, Any], int | None, str | None] | None:
"""Lazy-load a virtual KB document into a deepagents ``FileData``. """Lazy-load a virtual KB document into a deepagents ``FileData``.
Returns ``(file_data, doc_id)`` or ``None`` if the path doesn't map Returns ``(file_data, doc_id, preamble)`` or ``None`` if the path
to any known document. ``doc_id`` is ``None`` for the synthetic doesn't map to any known document. ``doc_id`` is ``None`` for the
anonymous document so the caller doesn't track it as a DB-backed file. synthetic anonymous document. ``preamble`` is the metadata header to
show above a numbered ``source_markdown`` body (``None`` for the legacy
chunk-reconstructed XML reads used when a document has no body).
""" """
anon = self._kb_anon_doc() anon = self._kb_anon_doc()
if anon and str(anon.get("path") or "") == path: if anon and str(anon.get("path") or "") == path:
@ -492,7 +507,7 @@ class KBPostgresBackend(BackendProtocol):
} }
xml = build_document_xml(doc_payload, matched_chunk_ids=set()) xml = build_document_xml(doc_payload, matched_chunk_ids=set())
file_data = create_file_data(xml) file_data = create_file_data(xml)
return file_data, None return file_data, None, None
if not path.startswith(DOCUMENTS_ROOT): if not path.startswith(DOCUMENTS_ROOT):
return None return None
@ -505,41 +520,58 @@ class KBPostgresBackend(BackendProtocol):
) )
if document is None: if document is None:
return None return None
chunk_rows = await session.execute( source_markdown = document.source_markdown or ""
select(Chunk.id, Chunk.content) document_type = (
.where(Chunk.document_id == document.id)
.order_by(Chunk.position, Chunk.id)
)
chunks = [
{"chunk_id": row.id, "content": row.content} for row in chunk_rows.all()
]
doc_payload = {
"document_id": document.id,
"chunks": chunks,
"matched_chunk_ids": list(self._matched_chunk_ids(document.id)),
"document": {
"id": document.id,
"title": document.title,
"document_type": (
document.document_type.value
if getattr(document, "document_type", None) is not None
else "UNKNOWN"
),
"metadata": dict(document.document_metadata or {}),
},
"source": (
document.document_type.value document.document_type.value
if getattr(document, "document_type", None) is not None if getattr(document, "document_type", None) is not None
else "UNKNOWN" else "UNKNOWN"
), )
metadata = dict(document.document_metadata or {})
chunk_rows = await session.execute(
select(Chunk.id, Chunk.content, Chunk.start_char, Chunk.end_char)
.where(Chunk.document_id == document.id)
.order_by(Chunk.position, Chunk.id)
)
chunk_records = chunk_rows.all()
document_id = document.id
document_title = document.title
matched = self._matched_chunk_ids(document_id)
# Canonical read: serve the verbatim body with cat -n line numbers that
# line up with chunk char spans, so the agent cites real source lines.
if source_markdown:
ranges = compute_matched_line_ranges(
source_markdown,
[(r.id, r.start_char, r.end_char) for r in chunk_records],
matched,
)
preamble = build_read_preamble(
document_id=document_id,
document_type=document_type,
title=document_title,
url=_metadata_url(metadata),
matched_line_ranges=ranges,
)
return create_file_data(source_markdown), document_id, preamble
# Legacy fallback: no canonical body, reconstruct from chunks as XML.
doc_payload = {
"document_id": document_id,
"chunks": [
{"chunk_id": r.id, "content": r.content} for r in chunk_records
],
"matched_chunk_ids": list(matched),
"document": {
"id": document_id,
"title": document_title,
"document_type": document_type,
"metadata": metadata,
},
"source": document_type,
} }
xml = build_document_xml( xml = build_document_xml(doc_payload, matched_chunk_ids=matched)
doc_payload, return create_file_data(xml), document_id, None
matched_chunk_ids=self._matched_chunk_ids(document.id),
)
file_data = create_file_data(xml)
return file_data, document.id
# ------------------------------------------------------------------ writes # ------------------------------------------------------------------ writes
@ -571,7 +603,7 @@ class KBPostgresBackend(BackendProtocol):
loaded = await self._load_file_data(file_path) loaded = await self._load_file_data(file_path)
if loaded is None: if loaded is None:
return EditResult(error=f"Error: File '{file_path}' not found") return EditResult(error=f"Error: File '{file_path}' not found")
file_data, _ = loaded file_data, _, _ = loaded
content = file_data_to_string(file_data) content = file_data_to_string(file_data)
result = perform_string_replacement( result = perform_string_replacement(

View file

@ -0,0 +1,73 @@
"""Read preamble for canonical (numbered ``source_markdown``) KB reads.
The KB read tool numbers the body lines ``cat -n`` style, so serving the raw
``source_markdown`` makes those line numbers line up exactly with the chunk
char spans and the editor highlight. This module renders the small header the
agent sees above that body: document identity plus the matched line ranges to
seek to, and a concrete reminder of the line-citation token shape.
"""
from __future__ import annotations
from collections.abc import Iterable
from app.utils.text_spans import char_span_to_line_range
def _format_range(start: int, end: int) -> str:
return f"{start}" if start == end else f"{start}-{end}"
def compute_matched_line_ranges(
source_markdown: str,
chunks: Iterable[tuple[int, int | None, int | None]],
matched_chunk_ids: set[int],
) -> list[tuple[int, int]]:
"""Map matched chunks to sorted, de-duplicated 1-based line ranges.
``chunks`` are ``(chunk_id, start_char, end_char)`` triples. Chunks without
spans (legacy rows) are skipped they have no resolvable location.
"""
ranges: set[tuple[int, int]] = set()
for chunk_id, start_char, end_char in chunks:
if chunk_id not in matched_chunk_ids:
continue
if start_char is None or end_char is None:
continue
ranges.add(char_span_to_line_range(source_markdown, start_char, end_char))
return sorted(ranges)
def build_read_preamble(
    *,
    document_id: int,
    document_type: str,
    title: str,
    url: str,
    matched_line_ranges: list[tuple[int, int]],
) -> str:
    """Render the metadata header shown above a numbered ``source_markdown`` body.

    The header is a ``<document_metadata>`` block (id, type, title, url and,
    when any are given, a ``<matched_lines>`` list) followed by a one-line
    reminder of the line-citation token shape and a trailing newline.

    ``matched_line_ranges`` are 1-based inclusive line ranges (already derived
    from chunk char spans) to point the agent at the relevant lines.

    ``title`` and ``url`` are wrapped in CDATA. Any ``]]>`` inside them is split
    across two CDATA sections so the value cannot terminate the section early
    and have its remainder read as header markup.
    """

    def _cdata(text: str) -> str:
        # "]]>" ends a CDATA section; splitting it keeps the value literal.
        # str() mirrors the f-string interpolation for non-str values.
        return "<![CDATA[" + str(text).replace("]]>", "]]]]><![CDATA[>") + "]]>"

    lines = [
        "<document_metadata>",
        f" <document_id>{document_id}</document_id>",
        f" <document_type>{document_type}</document_type>",
        f" <title>{_cdata(title)}</title>",
        f" <url>{_cdata(url)}</url>",
    ]
    if matched_line_ranges:
        ranges = ", ".join(_format_range(s, e) for s, e in matched_line_ranges)
        lines.append(f" <matched_lines>{ranges}</matched_lines>")
    lines.append("</document_metadata>")
    lines.append(
        f"Cite lines from this document as [citation:d{document_id}#L<start>-<end>] "
        "using the line numbers shown below."
    )
    # Trailing empty element so the joined header ends with a newline.
    lines.append("")
    return "\n".join(lines)
__all__ = ["build_read_preamble", "compute_matched_line_ranges"]

View file

@ -73,7 +73,7 @@ def create_edit_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
loaded = await backend._load_file_data(validated) loaded = await backend._load_file_data(validated)
if loaded is None: if loaded is None:
return f"Error: File '{validated}' not found" return f"Error: File '{validated}' not found"
_, doc_id_to_attach = loaded _, doc_id_to_attach, _ = loaded
res: EditResult = await backend.aedit( res: EditResult = await backend.aedit(
validated, old_string, new_string, replace_all=replace_all validated, old_string, new_string, replace_all=replace_all

View file

@ -75,7 +75,7 @@ async def cloud_move_file(
loaded = await backend._load_file_data(source) loaded = await backend._load_file_data(source)
if loaded is None: if loaded is None:
return f"Error: source '{source}' not found." return f"Error: source '{source}' not found."
source_file_data, loaded_doc_id = loaded source_file_data, loaded_doc_id, _ = loaded
if source_doc_id is None: if source_doc_id is None:
source_doc_id = loaded_doc_id source_doc_id = loaded_doc_id

View file

@ -58,8 +58,10 @@ def create_read_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
loaded = await backend._load_file_data(validated) loaded = await backend._load_file_data(validated)
if loaded is None: if loaded is None:
return f"Error: File '{validated}' not found" return f"Error: File '{validated}' not found"
file_data, doc_id = loaded file_data, doc_id, preamble = loaded
rendered = format_read_response(file_data, offset, limit) rendered = format_read_response(file_data, offset, limit)
if preamble and offset == 0:
rendered = preamble + rendered
update: dict[str, Any] = { update: dict[str, Any] = {
"files": {validated: file_data}, "files": {validated: file_data},
"messages": [ "messages": [

View file

@ -74,7 +74,7 @@ async def cloud_rm(
loaded = await backend._load_file_data(validated) loaded = await backend._load_file_data(validated)
if loaded is None: if loaded is None:
return f"Error: file '{validated}' not found." return f"Error: file '{validated}' not found."
_, resolved_doc_id = loaded _, resolved_doc_id, _ = loaded
files_update: dict[str, Any] = {validated: None} files_update: dict[str, Any] = {validated: None}
update: dict[str, Any] = { update: dict[str, Any] = {

View file

@ -35,42 +35,24 @@ Map outcomes to your `status`:
You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see.
## Chunk citations in your prose ## Citations in your prose
When `read_file` returns a KB-indexed document under `/documents/`, the response includes `<chunk id='…'>` blocks. Whenever a fact in your `action_summary` or `evidence.content_excerpt` came from a specific chunk, append `[citation:<chunk_id>]` to the sentence stating that fact, using the **exact** id from the `<chunk id='…'>` tag. The caller relays these markers to the end user verbatim, and the UI resolves each id by exact match against the database, so a wrong id silently breaks the citation. `read_file` on a KB document under `/documents/` serves it in one of two forms. Cite from whichever you actually see, attach the marker to the sentence in `action_summary` or `evidence.content_excerpt` stating that fact, and list every marker you emit in `evidence.citations`. The caller relays these markers to the end user verbatim, and the UI resolves each by exact match, so a wrong id or line number silently breaks the citation.
### Where chunk ids live in `read_file` output **Numbered body (default).** A `<document_metadata>` header gives the `<document_id>` and an optional `<matched_lines>` pointer, then the body is shown with line numbers. Cite the lines a fact came from as `[citation:d<document_id>#L<start>-<end>]` (a single line is `#L<n>-<n>`).
A KB document's XML has three numeric attributes — only **one** is a citation source: **Legacy chunk blocks (older docs without a stored body).** The response is XML with `<chunk id='N'>` blocks. Cite the chunk a fact came from as `[citation:N]`, using the **exact** id from a `<chunk id='…'>` tag.
```
<document>
<document_metadata>
<document_id>42</document_id> ← NOT a citation. Parent doc id; ignore for citations.
...
</document_metadata>
<chunk_index>
<entry chunk_id="128" lines="14-22"/> ← Index hint; the same id also appears below.
<entry chunk_id="129" lines="23-30" matched="true"/>
</chunk_index>
<document_content>
<chunk id='128'><![CDATA[…]]></chunk> ← This is the citation source.
<chunk id='129'><![CDATA[…]]></chunk>
</document_content>
</document>
```
### Rules ### Rules
- Use the **exact** id from a `<chunk id='…'>` tag whose content you actually quoted or paraphrased. Copy digit-for-digit; do **not** retype from memory. - Cite only from a passage you actually quoted or paraphrased this turn. Copy document ids, line numbers, and chunk ids character-for-character; never retype from memory.
- Before emitting `[citation:N]`, confirm the literal substring `<chunk id='N'>` (or its index twin `chunk_id="N"`) appears in the tool result you are summarising this turn. If you can't see it, omit the citation. - Never cite `<document_id>` on its own — it identifies the document, not a passage. In the numbered form it is only the `d<document_id>` prefix of a line citation.
- Never cite `<document_id>` — that's the parent doc, not a chunk. - Never invent, normalise, shorten, shift, or guess at ids or line numbers. If unsure, omit rather than pick.
- Never invent, normalise, shorten, or guess at adjacent ids. If unsure between two candidates, omit rather than pick.
- Prefer **fewer accurate citations** over many speculative ones. - Prefer **fewer accurate citations** over many speculative ones.
- Multiple chunks supporting the same point → comma-separated and copied individually: `[citation:128], [citation:129]`. - Multiple passages supporting the same point → comma-separated and copied individually: `[citation:d42#L14-22], [citation:d42#L31-39]`.
- Plain square brackets only — no markdown links, no parentheses, no footnote numbers. - Plain square brackets only — no markdown links, no parentheses, no footnote numbers.
- Tool results without `<chunk id='…'>` (write/edit/move confirmations, `ls` / `glob` / `grep` listings, error strings) carry no chunk id and need none. - Tool results with no body passage (write/edit/move confirmations, `ls` / `glob` / `grep` listings, error strings) carry nothing to cite.
- Populate `evidence.chunk_ids` with **only** ids you actually emitted in `[citation:…]` markers — same set, same digits. - Populate `evidence.citations` with **only** the markers you actually emitted — same set, same characters.
## Examples ## Examples
@ -89,7 +71,7 @@ A KB document's XML has three numeric attributes — only **one** is a citation
"path": "/documents/meetings/2026-05-11-meeting.md", "path": "/documents/meetings/2026-05-11-meeting.md",
"matched_candidates": null, "matched_candidates": null,
"content_excerpt": null, "content_excerpt": null,
"chunk_ids": null "citations": null
}, },
"next_step": null, "next_step": null,
"missing_fields": null, "missing_fields": null,
@ -121,7 +103,7 @@ A KB document's XML has three numeric attributes — only **one** is a citation
{ "id": "/documents/design/auth-rework.md", "label": "Auth Rework" } { "id": "/documents/design/auth-rework.md", "label": "Auth Rework" }
], ],
"content_excerpt": null, "content_excerpt": null,
"chunk_ids": null "citations": null
}, },
"next_step": "Ask the user which design doc to update.", "next_step": "Ask the user which design doc to update.",
"missing_fields": ["path"], "missing_fields": ["path"],
@ -142,7 +124,7 @@ Return **only** one JSON object (no markdown or prose outside it):
"path": string | null, "path": string | null,
"matched_candidates": [ { "id": string, "label": string } ] | null, "matched_candidates": [ { "id": string, "label": string } ] | null,
"content_excerpt": string | null, "content_excerpt": string | null,
"chunk_ids": string[] | null "citations": string[] | null
}, },
"next_step": string | null, "next_step": string | null,
"missing_fields": string[] | null, "missing_fields": string[] | null,

View file

@ -33,11 +33,11 @@ Map outcomes to your `status`:
- Any other `"Error: …"` → `status=error` and relay the tool's message verbatim as `next_step`. - Any other `"Error: …"` → `status=error` and relay the tool's message verbatim as `next_step`.
- HITL rejection → `status=blocked` with `next_step="User declined this filesystem action. Do not retry."`. - HITL rejection → `status=blocked` with `next_step="User declined this filesystem action. Do not retry."`.
You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. (`chunk_ids` is always `null` in desktop mode — see "Chunk citations in your prose" below.) You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. (`citations` is always `null` in desktop mode — see "Citations in your prose" below.)
## Chunk citations in your prose ## Citations in your prose
In desktop mode your filesystem tools read local files only, and local-file tool results do **not** carry `<chunk id='…'>` tags. Do not emit `[citation:…]` markers in `action_summary` or `evidence.content_excerpt`, and leave `evidence.chunk_ids` `null` — the absolute path is the only reference for local-file work. In desktop mode your filesystem tools read local files only, and local-file tool results do **not** carry chunk ids or numbered KB bodies. Do not emit `[citation:…]` markers in `action_summary` or `evidence.content_excerpt`, and leave `evidence.citations` `null` — the absolute path is the only reference for local-file work.
## Examples ## Examples
@ -56,7 +56,7 @@ In desktop mode your filesystem tools read local files only, and local-file tool
"path": "/notes/meetings/2026-05-11-meeting.md", "path": "/notes/meetings/2026-05-11-meeting.md",
"matched_candidates": null, "matched_candidates": null,
"content_excerpt": null, "content_excerpt": null,
"chunk_ids": null "citations": null
}, },
"next_step": null, "next_step": null,
"missing_fields": null, "missing_fields": null,
@ -88,7 +88,7 @@ In desktop mode your filesystem tools read local files only, and local-file tool
{ "id": "/projects/web/design/auth-rework.md", "label": "Auth Rework" } { "id": "/projects/web/design/auth-rework.md", "label": "Auth Rework" }
], ],
"content_excerpt": null, "content_excerpt": null,
"chunk_ids": null "citations": null
}, },
"next_step": "Ask the user which design doc to update.", "next_step": "Ask the user which design doc to update.",
"missing_fields": ["path"], "missing_fields": ["path"],
@ -109,7 +109,7 @@ Return **only** one JSON object (no markdown or prose outside it):
"path": string | null, "path": string | null,
"matched_candidates": [ { "id": string, "label": string } ] | null, "matched_candidates": [ { "id": string, "label": string } ] | null,
"content_excerpt": string | null, "content_excerpt": string | null,
"chunk_ids": string[] | null "citations": string[] | null
}, },
"next_step": string | null, "next_step": string | null,
"missing_fields": string[] | null, "missing_fields": string[] | null,

View file

@ -28,41 +28,21 @@ Reply in plain prose:
- If the workspace does not contain the requested information, say so explicitly. Do not fabricate paths or content. - If the workspace does not contain the requested information, say so explicitly. Do not fabricate paths or content.
- If the question is genuinely ambiguous after a thorough lookup, list the candidates with their paths and stop. - If the question is genuinely ambiguous after a thorough lookup, list the candidates with their paths and stop.
## Chunk citations ## Citations
When the evidence for a claim came from a `read_file` response that included `<chunk id='…'>` blocks (i.e. a KB-indexed document under `/documents/`), append `[citation:<chunk_id>]` to the sentence stating that claim. The caller passes these markers through to the end user verbatim, and the UI resolves each id by exact match against the database, so a wrong id silently breaks the citation. `read_file` on a KB document under `/documents/` serves it in one of two forms; cite a claim from whichever you actually see, alongside the path. The caller passes these markers through to the end user verbatim, and the UI resolves each by exact match, so a wrong id or line number silently breaks the citation.
### Where chunk ids live in `read_file` output - **Numbered body (default).** A `<document_metadata>` header gives the `<document_id>`, and the body is shown with line numbers. Cite the lines a claim came from as `[citation:d<document_id>#L<start>-<end>]` (a single line is `#L<n>-<n>`).
- **Legacy chunk blocks (older docs).** XML with `<chunk id='N'>` blocks. Cite the chunk a claim came from as `[citation:N]`.
A KB document's XML has three numeric attributes — only **one** is a citation source:
```
<document>
<document_metadata>
<document_id>42</document_id> ← NOT a citation. Parent doc id; ignore for citations.
...
</document_metadata>
<chunk_index>
<entry chunk_id="128" lines="14-22"/> ← Index hint; the same id also appears below.
<entry chunk_id="129" lines="23-30" matched="true"/>
</chunk_index>
<document_content>
<chunk id='128'><![CDATA[…]]></chunk> ← This is the citation source.
<chunk id='129'><![CDATA[…]]></chunk>
</document_content>
</document>
```
### Rules ### Rules
- Use the **exact** id from a `<chunk id='…'>` tag whose content you actually quoted or paraphrased. Copy digit-for-digit; do **not** retype from memory. - Copy document ids, line numbers, and chunk ids character-for-character; never retype from memory. If you cannot see the id/lines for a claim, omit the citation.
- Before emitting `[citation:N]`, confirm the literal substring `<chunk id='N'>` (or its index twin `chunk_id="N"`) appears in the tool result you are summarising this turn. If you can't see it, omit the citation. - Never cite `<document_id>` on its own — in the numbered form it is only the `d<document_id>` prefix of a line citation.
- Never cite `<document_id>` — that's the parent doc, not a chunk. - Never invent, normalise, shorten, shift, or guess. Prefer **fewer accurate citations** over many speculative ones.
- Never invent, normalise, shorten, or guess at adjacent ids. If unsure between two candidates, omit rather than pick. - Multiple passages supporting the same point → comma-separated and copied individually.
- Prefer **fewer accurate citations** over many speculative ones. One correct `[citation:128]` is more useful than a string of wrong ids.
- Multiple chunks supporting the same point → comma-separated and copied individually: `[citation:128], [citation:129]`.
- Plain square brackets only — no markdown links, no parentheses, no footnote numbers. - Plain square brackets only — no markdown links, no parentheses, no footnote numbers.
- If a claim came from a tool result that did **not** carry a chunk id (`ls`, `glob`, `grep` listings, error strings, or files without `<chunk id='…'>`), skip the citation. - Listings (`ls` / `glob` / `grep`), error strings, and files without either form carry nothing to cite.
- The absolute path under `/documents/` is always required; chunk citations are additive, they do not replace the path reference. - The absolute path under `/documents/` is always required; citations are additive, they do not replace the path reference.
Example: `The Q2 roadmap lists three milestones (/documents/planning/q2-roadmap.md) [citation:128], [citation:129].` Example: `The Q2 roadmap lists three milestones (/documents/planning/q2-roadmap.md) [citation:d42#L3-9].`

View file

@ -953,8 +953,9 @@ class Config:
os.getenv("EMBEDDING_CACHE_ENABLED", "false").strip().lower() == "true" os.getenv("EMBEDDING_CACHE_ENABLED", "false").strip().lower() == "true"
) )
# Bump to invalidate every cached embedding set after a chunker change. # Bump to invalidate every cached embedding set after a chunker change.
# v2: chunks became exact (raw) slices of source_markdown for citation spans.
EMBEDDING_CACHE_CHUNKER_VERSION = int( EMBEDDING_CACHE_CHUNKER_VERSION = int(
os.getenv("EMBEDDING_CACHE_CHUNKER_VERSION", "1") os.getenv("EMBEDDING_CACHE_CHUNKER_VERSION", "2")
) )
EMBEDDING_CACHE_TTL_DAYS = int(os.getenv("EMBEDDING_CACHE_TTL_DAYS", "90")) EMBEDDING_CACHE_TTL_DAYS = int(os.getenv("EMBEDDING_CACHE_TTL_DAYS", "90"))
EMBEDDING_CACHE_MAX_TOTAL_MB = int( EMBEDDING_CACHE_MAX_TOTAL_MB = int(

View file

@ -1467,6 +1467,11 @@ class Chunk(BaseModel, TimestampMixin):
# ordering reads are document-scoped (covered by ix_chunks_document_id) and # ordering reads are document-scoped (covered by ix_chunks_document_id) and
# building a position index on the large chunks table is not worth it. # building a position index on the large chunks table is not worth it.
position = Column(Integer, nullable=False, server_default="0") position = Column(Integer, nullable=False, server_default="0")
# Half-open char span into the document's source_markdown the chunk was cut
# from. Nullable: historical rows predate spans and populate on reindex.
# Invariant for span-aware rows: source_markdown[start_char:end_char] == content.
start_char = Column(Integer, nullable=True)
end_char = Column(Integer, nullable=True)
document_id = Column( document_id = Column(
Integer, Integer,

View file

@ -18,23 +18,26 @@ from app.indexing_pipeline.cache.eligibility import is_embedding_cacheable
from app.indexing_pipeline.cache.schemas import CachedChunk, EmbeddingKey, EmbeddingSet from app.indexing_pipeline.cache.schemas import CachedChunk, EmbeddingKey, EmbeddingSet
from app.indexing_pipeline.cache.service import EmbeddingCacheService from app.indexing_pipeline.cache.service import EmbeddingCacheService
from app.indexing_pipeline.cache.settings import load_embedding_cache_settings from app.indexing_pipeline.cache.settings import load_embedding_cache_settings
from app.indexing_pipeline.document_chunker import chunk_text, chunk_text_hybrid from app.indexing_pipeline.document_chunker import ChunkSlice, chunk_markdown_with_spans
from app.indexing_pipeline.document_embedder import embed_texts from app.indexing_pipeline.document_embedder import embed_texts
from app.observability import metrics from app.observability import metrics
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
ChunkPair = tuple[str, np.ndarray] SliceEmbedding = tuple[ChunkSlice, np.ndarray]
async def build_chunk_embeddings( async def build_chunk_embeddings(
markdown: str, *, use_code_chunker: bool markdown: str, *, use_code_chunker: bool
) -> tuple[np.ndarray, list[ChunkPair]]: ) -> tuple[np.ndarray, list[SliceEmbedding]]:
"""Return the document-level vector and ordered ``(chunk_text, vector)`` pairs. """Return the document-level vector and ordered ``(ChunkSlice, vector)`` pairs.
Drop-in for the inline chunk+embed step; reuses prior output when the same Slices are always recomputed (cheap) so their char spans are exact; only the
markdown has already been embedded with the current model and chunker. embeddings are cached, reused when the same markdown was embedded with the
current model and chunker.
""" """
slices = await chunk_slices(markdown, use_code_chunker=use_code_chunker)
settings = load_embedding_cache_settings() settings = load_embedding_cache_settings()
chunker_kind = "code" if use_code_chunker else "hybrid" chunker_kind = "code" if use_code_chunker else "hybrid"
embedding_dim = getattr(config.embedding_model_instance, "dimension", None) embedding_dim = getattr(config.embedding_model_instance, "dimension", None)
@ -45,7 +48,7 @@ async def build_chunk_embeddings(
embedding_dim=embedding_dim, embedding_dim=embedding_dim,
) )
if not cacheable: if not cacheable:
return await _compute(markdown, use_code_chunker=use_code_chunker) return await _compute(markdown, slices)
key = EmbeddingKey( key = EmbeddingKey(
markdown_sha256=_hash_text(markdown), markdown_sha256=_hash_text(markdown),
@ -56,31 +59,30 @@ async def build_chunk_embeddings(
) )
cached = await _recall(key) cached = await _recall(key)
if cached is not None: if cached is not None and _aligns(cached, slices):
metrics.record_embedding_cache_lookup( metrics.record_embedding_cache_lookup(
embedding_model=key.embedding_model, embedding_model=key.embedding_model,
chunker_kind=chunker_kind, chunker_kind=chunker_kind,
outcome="hit", outcome="hit",
) )
logger.debug("Embedding cache hit for %s", key.markdown_sha256) logger.debug("Embedding cache hit for %s", key.markdown_sha256)
return cached.summary_embedding, [(c.text, c.embedding) for c in cached.chunks] return cached.summary_embedding, list(
zip(slices, (c.embedding for c in cached.chunks), strict=True)
)
metrics.record_embedding_cache_lookup( metrics.record_embedding_cache_lookup(
embedding_model=key.embedding_model, chunker_kind=chunker_kind, outcome="miss" embedding_model=key.embedding_model, chunker_kind=chunker_kind, outcome="miss"
) )
summary_embedding, chunk_pairs = await _compute( summary_embedding, pairs = await _compute(markdown, slices)
markdown, use_code_chunker=use_code_chunker await _remember(key, summary_embedding, pairs)
return summary_embedding, pairs
async def chunk_slices(markdown: str, *, use_code_chunker: bool) -> list[ChunkSlice]:
"""Chunk markdown into ordered, char-addressed slices off the event loop."""
return await asyncio.to_thread(
chunk_markdown_with_spans, markdown, use_code_chunker
) )
await _remember(key, summary_embedding, chunk_pairs)
return summary_embedding, chunk_pairs
async def chunk_markdown(markdown: str, *, use_code_chunker: bool) -> list[str]:
"""Chunk markdown into ordered texts with the pipeline's chunker selection."""
if use_code_chunker:
return await asyncio.to_thread(chunk_text, markdown, use_code_chunker=True)
# Table-aware hybrid chunker keeps Markdown tables intact (issue #1334).
return await asyncio.to_thread(chunk_text_hybrid, markdown)
async def embed_batch(texts: list[str]) -> list[np.ndarray]: async def embed_batch(texts: list[str]) -> list[np.ndarray]:
@ -88,13 +90,19 @@ async def embed_batch(texts: list[str]) -> list[np.ndarray]:
return await asyncio.to_thread(embed_texts, texts) return await asyncio.to_thread(embed_texts, texts)
def _aligns(cached: EmbeddingSet, slices: list[ChunkSlice]) -> bool:
"""A hit is only usable if its texts still match the current chunking."""
return len(cached.chunks) == len(slices) and all(
c.text == s.text for c, s in zip(cached.chunks, slices, strict=True)
)
async def _compute( async def _compute(
markdown: str, *, use_code_chunker: bool markdown: str, slices: list[ChunkSlice]
) -> tuple[np.ndarray, list[ChunkPair]]: ) -> tuple[np.ndarray, list[SliceEmbedding]]:
chunk_texts = await chunk_markdown(markdown, use_code_chunker=use_code_chunker) embeddings = await embed_batch([markdown, *(s.text for s in slices)])
embeddings = await embed_batch([markdown, *chunk_texts])
summary_embedding, *chunk_embeddings = embeddings summary_embedding, *chunk_embeddings = embeddings
return summary_embedding, list(zip(chunk_texts, chunk_embeddings, strict=False)) return summary_embedding, list(zip(slices, chunk_embeddings, strict=True))
async def _recall(key: EmbeddingKey) -> EmbeddingSet | None: async def _recall(key: EmbeddingKey) -> EmbeddingSet | None:
@ -110,14 +118,14 @@ async def _recall(key: EmbeddingKey) -> EmbeddingSet | None:
async def _remember( async def _remember(
key: EmbeddingKey, summary_embedding: np.ndarray, chunk_pairs: list[ChunkPair] key: EmbeddingKey, summary_embedding: np.ndarray, pairs: list[SliceEmbedding]
) -> None: ) -> None:
try: try:
from app.tasks.celery_tasks import get_celery_session_maker from app.tasks.celery_tasks import get_celery_session_maker
embedding_set = EmbeddingSet( embedding_set = EmbeddingSet(
summary_embedding=summary_embedding, summary_embedding=summary_embedding,
chunks=[CachedChunk(text=text, embedding=vec) for text, vec in chunk_pairs], chunks=[CachedChunk(text=s.text, embedding=vec) for s, vec in pairs],
) )
async with get_celery_session_maker()() as session: async with get_celery_session_maker()() as session:
await EmbeddingCacheService(session).remember(key, embedding_set) await EmbeddingCacheService(session).remember(key, embedding_set)

View file

@ -19,6 +19,9 @@ class ExistingChunk:
id: int id: int
content: str content: str
position: int position: int
# Stored char span; None for legacy rows indexed before spans existed.
start_char: int | None = None
end_char: int | None = None
@dataclass(frozen=True, slots=True) @dataclass(frozen=True, slots=True)

View file

@ -1,16 +1,30 @@
import re import re
from dataclasses import dataclass
from app.config import config from app.config import config
# Regex that matches a Markdown table block (header + separator + one or more rows) # Regex that matches a Markdown table block (header + separator + one or more rows)
# A table block starts with a | at the beginning of a line and ends when a # A table block starts with a | at the beginning of a line and ends when a
# non-table line (or end of string) is encountered. # non-table line (or end of string) is encountered. The final row may end at EOF
# without a trailing newline, so the whole table stays one slice.
_TABLE_BLOCK_RE = re.compile( _TABLE_BLOCK_RE = re.compile(
r"(?:(?:^|\n)(?=[ \t]*\|)(?:[ \t]*\|[^\n]*\n)+)", r"(?:(?:^|\n)(?=[ \t]*\|)(?:[ \t]*\|[^\n]*(?:\n|$))+)",
re.MULTILINE, re.MULTILINE,
) )
@dataclass(frozen=True, slots=True)
class ChunkSlice:
"""A chunk paired with its half-open char span into the source markdown.
Invariant: ``markdown[start_char:end_char] == text``.
"""
text: str
start_char: int
end_char: int
def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]: def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
"""Chunk a text string using the configured chunker and return the chunk texts.""" """Chunk a text string using the configured chunker and return the chunk texts."""
chunker = ( chunker = (
@ -19,41 +33,63 @@ def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
return [c.text for c in chunker.chunk(text)] return [c.text for c in chunker.chunk(text)]
def chunk_text_hybrid(text: str) -> list[str]: def chunk_markdown_with_spans(
"""Table-aware chunker that prevents Markdown tables from being split mid-row. text: str, use_code_chunker: bool = False
) -> list[ChunkSlice]:
"""Chunk markdown into a lossless, contiguous partition of char-addressed slices.
Algorithm: Tables stay whole (issue #1334) and every slice is an exact substring of
1. Scan the document for Markdown table blocks. ``text``, so ``"".join(s.text) == text`` and ``text[s:e] == s.text``. This is
2. Each table block is emitted as a single, unmodified chunk so that its the offset record citations resolve against.
header, separator row, and data rows always stay together.
3. The non-table prose segments between (and around) tables are passed through
the normal ``chunk_text`` chunker and their sub-chunks are interleaved in
document order.
This ensures that table data is never sliced in the middle by the token-based
chunker, which would otherwise produce garbled rows that are useless for RAG.
Fixes #1334.
""" """
chunks: list[str] = [] if not text:
return []
slices: list[ChunkSlice] = []
cursor = 0 cursor = 0
for match in _TABLE_BLOCK_RE.finditer(text): for match in _TABLE_BLOCK_RE.finditer(text):
# Prose before this table if match.start() > cursor:
prose = text[cursor : match.start()].strip() slices.extend(
if prose: _segment_slices(text, cursor, match.start(), use_code_chunker)
chunks.extend(chunk_text(prose)) )
slices.append(ChunkSlice(match.group(0), match.start(), match.end()))
# The table itself is kept as one indivisible chunk
table_block = match.group(0).strip()
if table_block:
chunks.append(table_block)
cursor = match.end() cursor = match.end()
# Remaining prose after the last table (or entire text if no tables) if len(text) > cursor:
trailing = text[cursor:].strip() slices.extend(_segment_slices(text, cursor, len(text), use_code_chunker))
if trailing:
chunks.extend(chunk_text(trailing))
return chunks return slices
def _segment_slices(
text: str, start: int, end: int, use_code_chunker: bool
) -> list[ChunkSlice]:
"""Sub-chunk one non-table segment into contiguous, char-addressed slices."""
chunker = (
config.code_chunker_instance if use_code_chunker else config.chunker_instance
)
segment = text[start:end]
chunks = chunker.chunk(segment)
slices: list[ChunkSlice] = []
local = 0
for chunk in chunks:
# Use the chunker's end offset only as a cut point, then re-slice the
# segment ourselves so the result is an exact, gap-free substring.
local_end = min(max(chunk.end_index, local), len(segment))
if local_end <= local:
continue
slices.append(
ChunkSlice(segment[local:local_end], start + local, start + local_end)
)
local = local_end
if local < len(segment):
if slices:
last = slices[-1]
slices[-1] = ChunkSlice(text[last.start_char : end], last.start_char, end)
else:
slices.append(ChunkSlice(segment[local:], start + local, end))
return slices

View file

@ -20,9 +20,10 @@ from app.db import (
DocumentType, DocumentType,
) )
from app.indexing_pipeline.cache import build_chunk_embeddings from app.indexing_pipeline.cache import build_chunk_embeddings
from app.indexing_pipeline.cache.cached_indexing import chunk_markdown, embed_batch from app.indexing_pipeline.cache.cached_indexing import chunk_slices, embed_batch
from app.indexing_pipeline.chunk_reconciler import ExistingChunk, reconcile from app.indexing_pipeline.chunk_reconciler import ChunkPlan, ExistingChunk, reconcile
from app.indexing_pipeline.connector_document import ConnectorDocument from app.indexing_pipeline.connector_document import ConnectorDocument
from app.indexing_pipeline.document_chunker import ChunkSlice
from app.indexing_pipeline.document_hashing import ( from app.indexing_pipeline.document_hashing import (
compute_content_hash, compute_content_hash,
compute_identifier_hash, compute_identifier_hash,
@ -489,12 +490,22 @@ class IndexingPipelineService:
async def _load_existing_chunks(self, document_id: int) -> list[ExistingChunk]: async def _load_existing_chunks(self, document_id: int) -> list[ExistingChunk]:
result = await self.session.execute( result = await self.session.execute(
select(Chunk.id, Chunk.content, Chunk.position).where( select(
Chunk.document_id == document_id Chunk.id,
) Chunk.content,
Chunk.position,
Chunk.start_char,
Chunk.end_char,
).where(Chunk.document_id == document_id)
) )
return [ return [
ExistingChunk(id=row.id, content=row.content, position=row.position) ExistingChunk(
id=row.id,
content=row.content,
position=row.position,
start_char=row.start_char,
end_char=row.end_char,
)
for row in result for row in result
] ]
@ -505,15 +516,21 @@ class IndexingPipelineService:
delete(Chunk).where(Chunk.document_id == document.id) delete(Chunk).where(Chunk.document_id == document.id)
) )
summary_embedding, chunk_pairs = await build_chunk_embeddings( summary_embedding, slice_pairs = await build_chunk_embeddings(
content, content,
use_code_chunker=connector_doc.should_use_code_chunker, use_code_chunker=connector_doc.should_use_code_chunker,
) )
document.embedding = summary_embedding document.embedding = summary_embedding
return [ return [
Chunk(content=text, embedding=emb, position=i) Chunk(
for i, (text, emb) in enumerate(chunk_pairs) content=chunk_slice.text,
embedding=emb,
position=i,
start_char=chunk_slice.start_char,
end_char=chunk_slice.end_char,
)
for i, (chunk_slice, emb) in enumerate(slice_pairs)
] ]
async def _reindex_incrementally( async def _reindex_incrementally(
@ -525,35 +542,39 @@ class IndexingPipelineService:
) -> int: ) -> int:
"""Edit path: keep rows whose text survived, embed only new texts. """Edit path: keep rows whose text survived, embed only new texts.
Unchanged rows keep their embedding and their HNSW/GIN index entries; Unchanged rows keep their embedding and their HNSW/GIN index entries. An
moved rows get a position-only UPDATE, which touches neither index. edit can shift a kept chunk's char span without changing its text, so
every kept row's position and span are refreshed whenever they drift.
""" """
new_texts = await chunk_markdown( slices = await chunk_slices(
content, use_code_chunker=connector_doc.should_use_code_chunker content, use_code_chunker=connector_doc.should_use_code_chunker
) )
new_texts = [s.text for s in slices]
plan = reconcile(existing, new_texts) plan = reconcile(existing, new_texts)
# One batch: the document-level summary vector plus the missing chunks. # One batch: the document-level summary vector plus the missing chunks.
embeddings = await embed_batch([content, *[t for _, t in plan.to_embed]]) embeddings = await embed_batch([content, *[t for _, t in plan.to_embed]])
summary_embedding, *new_embeddings = embeddings summary_embedding, *new_embeddings = embeddings
if plan.reused:
await self.session.execute(
update(Chunk),
[{"id": cid, "position": pos} for cid, pos in plan.reused],
)
if plan.to_delete: if plan.to_delete:
await self.session.execute( await self.session.execute(
delete(Chunk).where(Chunk.id.in_(plan.to_delete)) delete(Chunk).where(Chunk.id.in_(plan.to_delete))
) )
span_updates = self._kept_row_span_updates(existing, slices, plan)
if span_updates:
await self.session.execute(update(Chunk), span_updates)
self.session.add_all( self.session.add_all(
Chunk( Chunk(
content=text, content=slices[pos].text,
embedding=emb, embedding=emb,
position=pos, position=pos,
start_char=slices[pos].start_char,
end_char=slices[pos].end_char,
document_id=document.id, document_id=document.id,
) )
for (pos, text), emb in zip(plan.to_embed, new_embeddings, strict=True) for (pos, _text), emb in zip(plan.to_embed, new_embeddings, strict=True)
) )
document.embedding = summary_embedding document.embedding = summary_embedding
@ -564,6 +585,36 @@ class IndexingPipelineService:
) )
return len(new_texts) return len(new_texts)
@staticmethod
def _kept_row_span_updates(
existing: list[ExistingChunk],
slices: list[ChunkSlice],
plan: ChunkPlan,
) -> list[dict]:
"""Position/span writes for kept rows, emitted only where a value drifts."""
deleted = set(plan.to_delete)
moved = dict(plan.reused)
updates: list[dict] = []
for chunk in existing:
if chunk.id in deleted:
continue
new_position = moved.get(chunk.id, chunk.position)
target = slices[new_position]
if (
chunk.position != new_position
or chunk.start_char != target.start_char
or chunk.end_char != target.end_char
):
updates.append(
{
"id": chunk.id,
"position": new_position,
"start_char": target.start_char,
"end_char": target.end_char,
}
)
return updates
async def _enqueue_ai_sort_if_enabled(self, document: Document) -> None: async def _enqueue_ai_sort_if_enabled(self, document: Document) -> None:
"""Fire-and-forget: enqueue incremental AI sort if the search space has it enabled.""" """Fire-and-forget: enqueue incremental AI sort if the search space has it enabled."""
try: try:

View file

@ -440,8 +440,15 @@ class ChucksHybridSearchRetriever:
chunk_filter = numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC chunk_filter = numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC
# Select only the columns we need (skip Chunk.embedding ~12KB/row). # Select only the columns we need (skip Chunk.embedding ~12KB/row).
# start_char/end_char carry the citation span; None for legacy rows.
chunk_query = ( chunk_query = (
select(Chunk.id, Chunk.content, Chunk.document_id) select(
Chunk.id,
Chunk.content,
Chunk.document_id,
Chunk.start_char,
Chunk.end_char,
)
.join(numbered, Chunk.id == numbered.c.chunk_id) .join(numbered, Chunk.id == numbered.c.chunk_id)
.where(chunk_filter) .where(chunk_filter)
.order_by(Chunk.document_id, Chunk.position, Chunk.id) .order_by(Chunk.document_id, Chunk.position, Chunk.id)
@ -476,7 +483,14 @@ class ChucksHybridSearchRetriever:
if doc_id not in doc_map: if doc_id not in doc_map:
continue continue
doc_entry = doc_map[doc_id] doc_entry = doc_map[doc_id]
doc_entry["chunks"].append({"chunk_id": row.id, "content": row.content}) doc_entry["chunks"].append(
{
"chunk_id": row.id,
"content": row.content,
"start_char": row.start_char,
"end_char": row.end_char,
}
)
if row.id in matched_chunk_ids: if row.id in matched_chunk_ids:
doc_entry["matched_chunk_ids"].append(row.id) doc_entry["matched_chunk_ids"].append(row.id)

View file

@ -37,6 +37,7 @@ from app.schemas import (
from app.services.task_dispatcher import TaskDispatcher, get_task_dispatcher from app.services.task_dispatcher import TaskDispatcher, get_task_dispatcher
from app.users import current_active_user from app.users import current_active_user
from app.utils.rbac import check_permission from app.utils.rbac import check_permission
from app.utils.text_spans import char_span_to_line_range
try: try:
asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy()) asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
@ -967,9 +968,12 @@ async def get_document_by_chunk_id(
session: AsyncSession = Depends(get_async_session), session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user), user: User = Depends(current_active_user),
): ):
""" """Resolve a chunk id to its document plus a window of surrounding chunks.
Retrieves a document based on a chunk ID, including a window of chunks around the cited one.
Uses SQL-level pagination to avoid loading all chunks into memory. Returns the cited chunk's 1-based line range (cited_start_line/
cited_end_line) when char spans exist, so callers can anchor the citation
to exact source lines. Uses SQL-level pagination to avoid loading all
chunks into memory.
""" """
try: try:
from sqlalchemy import and_, func, or_ from sqlalchemy import and_, func, or_
@ -1033,6 +1037,17 @@ async def get_document_by_chunk_id(
) )
windowed_chunks = windowed_result.scalars().all() windowed_chunks = windowed_result.scalars().all()
cited_start_line: int | None = None
cited_end_line: int | None = None
if (
chunk.start_char is not None
and chunk.end_char is not None
and document.source_markdown
):
cited_start_line, cited_end_line = char_span_to_line_range(
document.source_markdown, chunk.start_char, chunk.end_char
)
return DocumentWithChunksRead( return DocumentWithChunksRead(
id=document.id, id=document.id,
title=document.title, title=document.title,
@ -1047,6 +1062,8 @@ async def get_document_by_chunk_id(
chunks=windowed_chunks, chunks=windowed_chunks,
total_chunks=total_chunks, total_chunks=total_chunks,
chunk_start_index=start, chunk_start_index=start,
cited_start_line=cited_start_line,
cited_end_line=cited_end_line,
) )
except HTTPException: except HTTPException:
raise raise

View file

@ -42,6 +42,34 @@ EDITOR_PLATE_MAX_BYTES = 1 * 1024 * 1024
EDITOR_PLATE_MAX_LINES = 5000 EDITOR_PLATE_MAX_LINES = 5000
def _raise_no_canonical_body(document: Document) -> None:
    """Raise the HTTP error that explains why ``source_markdown`` is missing.

    Always raises; the status code is chosen from ``document.status``:

    * 409 when the state is ``pending`` or ``processing``
    * 422 when the state is ``failed`` (the stored reason is echoed back)
    * 400 for every other state

    A status that is falsy or not a dict is treated as state ``ready``.
    """
    raw_status = document.status or {}
    # Normalise once: anything that is not a dict carries no usable fields.
    status_fields = raw_status if isinstance(raw_status, dict) else {}
    state = status_fields.get("state", "ready")

    if state == "failed":
        reason = status_fields.get("reason", "Unknown error")
        raise HTTPException(
            status_code=422,
            detail=f"Processing failed: {reason}. You can delete this document and re-upload it.",
        )

    if state in ("pending", "processing"):
        raise HTTPException(
            status_code=409,
            detail="This document is still being processed. Please wait a moment and try again.",
        )

    raise HTTPException(
        status_code=400,
        detail="This document has no editable content. It may not have been processed correctly. Try re-indexing or re-uploading it.",
    )
@router.get("/search-spaces/{search_space_id}/documents/{document_id}/editor-content") @router.get("/search-spaces/{search_space_id}/documents/{document_id}/editor-content")
async def get_editor_content( async def get_editor_content(
search_space_id: int, search_space_id: int,
@ -52,8 +80,9 @@ async def get_editor_content(
""" """
Get document content for editing. Get document content for editing.
Returns source_markdown for the Plate.js editor. Returns source_markdown (the canonical body) for the Plate.js editor, with a
Falls back to blocknote_document markdown conversion, then chunk reconstruction. one-time migration from legacy blocknote_document. Never reconstructs the
body from chunks.
Requires DOCUMENTS_READ permission. Requires DOCUMENTS_READ permission.
""" """
@ -123,52 +152,9 @@ async def get_editor_content(
await session.commit() await session.commit()
return _build_response(empty_markdown) return _build_response(empty_markdown)
chunk_contents_result = await session.execute( # No canonical body. Chunks are an index artifact, never the source of
select(Chunk.content) # truth, so surface the processing state instead of rebuilding from them.
.filter(Chunk.document_id == document_id) _raise_no_canonical_body(document)
.order_by(Chunk.position, Chunk.id)
)
chunk_contents = chunk_contents_result.scalars().all()
if not chunk_contents:
doc_status = document.status or {}
state = (
doc_status.get("state", "ready")
if isinstance(doc_status, dict)
else "ready"
)
if state in ("pending", "processing"):
raise HTTPException(
status_code=409,
detail="This document is still being processed. Please wait a moment and try again.",
)
if state == "failed":
reason = (
doc_status.get("reason", "Unknown error")
if isinstance(doc_status, dict)
else "Unknown error"
)
raise HTTPException(
status_code=422,
detail=f"Processing failed: {reason}. You can delete this document and re-upload it.",
)
raise HTTPException(
status_code=400,
detail="This document has no content. It may not have been processed correctly. Try deleting and re-uploading it.",
)
markdown_content = "\n\n".join(chunk_contents)
if not markdown_content.strip():
raise HTTPException(
status_code=400,
detail="This document appears to be empty. Try re-uploading or editing it to add content.",
)
document.source_markdown = markdown_content
await session.commit()
return _build_response(markdown_content)
@router.get( @router.get(
@ -181,8 +167,9 @@ async def download_document_markdown(
user: User = Depends(current_active_user), user: User = Depends(current_active_user),
): ):
""" """
Download the full document content as a .md file. Download the canonical document body as a .md file.
Reconstructs markdown from source_markdown or chunks.
Serves source_markdown, migrating legacy blocknote_document when present.
""" """
await check_permission( await check_permission(
session, session,
@ -208,15 +195,6 @@ async def download_document_markdown(
from app.utils.blocknote_to_markdown import blocknote_to_markdown from app.utils.blocknote_to_markdown import blocknote_to_markdown
markdown = blocknote_to_markdown(document.blocknote_document) markdown = blocknote_to_markdown(document.blocknote_document)
if markdown is None:
chunk_contents_result = await session.execute(
select(Chunk.content)
.filter(Chunk.document_id == document_id)
.order_by(Chunk.position, Chunk.id)
)
chunk_contents = chunk_contents_result.scalars().all()
if chunk_contents:
markdown = "\n\n".join(chunk_contents)
if not markdown or not markdown.strip(): if not markdown or not markdown.strip():
raise HTTPException( raise HTTPException(
@ -357,15 +335,6 @@ async def export_document(
from app.utils.blocknote_to_markdown import blocknote_to_markdown from app.utils.blocknote_to_markdown import blocknote_to_markdown
markdown_content = blocknote_to_markdown(document.blocknote_document) markdown_content = blocknote_to_markdown(document.blocknote_document)
if markdown_content is None:
chunk_contents_result = await session.execute(
select(Chunk.content)
.filter(Chunk.document_id == document_id)
.order_by(Chunk.position, Chunk.id)
)
chunk_contents = chunk_contents_result.scalars().all()
if chunk_contents:
markdown_content = "\n\n".join(chunk_contents)
if not markdown_content or not markdown_content.strip(): if not markdown_content or not markdown_content.strip():
raise HTTPException(status_code=400, detail="Document has no content to export") raise HTTPException(status_code=400, detail="Document has no content to export")

View file

@ -17,4 +17,7 @@ class ChunkUpdate(ChunkBase):
class ChunkRead(ChunkBase, IDModel, TimestampModel): class ChunkRead(ChunkBase, IDModel, TimestampModel):
start_char: int | None = None
end_char: int | None = None
model_config = ConfigDict(from_attributes=True) model_config = ConfigDict(from_attributes=True)

View file

@ -73,6 +73,10 @@ class DocumentWithChunksRead(DocumentRead):
chunks: list[ChunkRead] = [] chunks: list[ChunkRead] = []
total_chunks: int = 0 total_chunks: int = 0
chunk_start_index: int = 0 chunk_start_index: int = 0
# 1-based inclusive line range of the cited chunk within source_markdown;
# None when the chunk predates char spans or the body is unavailable.
cited_start_line: int | None = None
cited_end_line: int | None = None
model_config = ConfigDict(from_attributes=True) model_config = ConfigDict(from_attributes=True)

View file

@ -0,0 +1,23 @@
"""Convert char spans into document-relative line ranges.
Chunks store half-open char spans into ``source_markdown``; citations and the
editor speak in line numbers. This is the single shared conversion so search,
the resolve API, and highlighting all agree on what "lines X-Y" means.
"""
from __future__ import annotations
def char_span_to_line_range(text: str, start_char: int, end_char: int) -> tuple[int, int]:
    """Return the 1-based inclusive line range covering ``[start_char, end_char)``.

    Args:
        text: The full body the offsets index into.
        start_char: Inclusive start offset of the span.
        end_char: Exclusive end offset of the span.

    Returns:
        ``(start_line, end_line)``, both 1-based and inclusive.

    Offsets are clamped to ``text`` bounds, and an ``end_char`` smaller than
    ``start_char`` is treated as an empty span. An empty span resolves to the
    single line containing it. A newline character counts as part of the line
    it terminates, so a span ending in ``"\\n"`` does not spill onto the next
    line.
    """
    text_len = len(text)
    start = max(0, min(start_char, text_len))
    end = max(start, min(end_char, text_len))

    start_line = text.count("\n", 0, start) + 1

    # The span is half-open, so the last covered character sits at end - 1;
    # for an empty span fall back to the start position itself.
    last_char_index = max(start, end - 1)

    # Only count newlines inside the span: the prefix [0, start) is already
    # reflected in start_line, so there is no need to scan it a second time.
    end_line = start_line + text.count("\n", start, last_char_index)
    return start_line, end_line

View file

@ -0,0 +1,80 @@
"""NOTE writes must carry the same char spans as the indexing pipeline.
``_create_document`` / ``_update_document`` are the cloud agent's KB write
paths. They must chunk through the shared span chunker so every persisted
chunk resolves back to an exact slice of ``source_markdown`` for citations.
"""
from __future__ import annotations
import pytest
from sqlalchemy import select
from app.agents.chat.multi_agent_chat.main_agent.middleware.kb_persistence import (
middleware as kb,
)
from app.db import Chunk
pytestmark = [pytest.mark.integration, pytest.mark.asyncio]
_BODY = "Intro paragraph.\n\nBody paragraph with detail.\n\nOutro paragraph."
_NEW_BODY = "Rewritten intro.\n\nFresh body content.\n\nNew closing line."
async def _ordered_chunks(session, doc_id: int) -> list[Chunk]:
    """Load every chunk row of ``doc_id``, sorted by ``Chunk.position``."""
    query = select(Chunk).where(Chunk.document_id == doc_id).order_by(Chunk.position)
    result = await session.execute(query)
    return list(result.scalars().all())
def _assert_spans_resolve(source_markdown: str, chunks: list[Chunk]) -> None:
    """Assert there is at least one chunk and each span slices back to its content."""
    assert chunks
    for row in chunks:
        assert row.start_char is not None
        assert row.end_char is not None
        addressed = source_markdown[row.start_char : row.end_char]
        assert addressed == row.content
@pytest.mark.usefixtures("patched_embed_texts")
async def test_note_create_populates_chunk_spans(
    db_session, db_search_space, db_user
) -> None:
    """A freshly created note persists chunks whose spans address its body."""
    created = await kb._create_document(
        db_session,
        virtual_path="/documents/note.md",
        content=_BODY,
        search_space_id=db_search_space.id,
        created_by_id=str(db_user.id),
    )
    await db_session.flush()

    persisted = await _ordered_chunks(db_session, created.id)
    _assert_spans_resolve(created.source_markdown, persisted)
@pytest.mark.usefixtures("patched_embed_texts")
async def test_note_update_refreshes_chunk_spans(
    db_session, db_search_space, db_user
) -> None:
    """Rewriting a note's body leaves chunks whose spans address the new body."""
    note_path = "/documents/note.md"
    space_id = db_search_space.id

    original = await kb._create_document(
        db_session,
        virtual_path=note_path,
        content=_BODY,
        search_space_id=space_id,
        created_by_id=str(db_user.id),
    )
    await db_session.flush()

    updated = await kb._update_document(
        db_session,
        doc_id=original.id,
        content=_NEW_BODY,
        virtual_path=note_path,
        search_space_id=space_id,
    )
    await db_session.flush()

    assert updated is not None
    refreshed = await _ordered_chunks(db_session, updated.id)
    _assert_spans_resolve(updated.source_markdown, refreshed)

View file

@ -158,13 +158,12 @@ def patched_embed_texts_raises(monkeypatch) -> MagicMock:
@pytest.fixture @pytest.fixture
def patched_chunk_text(monkeypatch) -> MagicMock: def patched_chunk_text(monkeypatch) -> MagicMock:
mock = MagicMock(return_value=["Test chunk content."]) from app.indexing_pipeline.document_chunker import ChunkSlice
text = "Test chunk content."
mock = MagicMock(return_value=[ChunkSlice(text, 0, len(text))])
monkeypatch.setattr( monkeypatch.setattr(
"app.indexing_pipeline.cache.cached_indexing.chunk_text", "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
mock,
)
monkeypatch.setattr(
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
mock, mock,
) )
return mock return mock

View file

@ -286,9 +286,12 @@ def _mock_external_apis(monkeypatch):
"app.indexing_pipeline.cache.cached_indexing.embed_texts", "app.indexing_pipeline.cache.cached_indexing.embed_texts",
MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]), MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]),
) )
from app.indexing_pipeline.document_chunker import ChunkSlice
chunk = "Test chunk content."
monkeypatch.setattr( monkeypatch.setattr(
"app.indexing_pipeline.cache.cached_indexing.chunk_text", "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
MagicMock(return_value=["Test chunk content."]), MagicMock(return_value=[ChunkSlice(chunk, 0, len(chunk))]),
) )

View file

@ -176,9 +176,14 @@ async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, m
@pytest.mark.usefixtures("patched_embed_texts") @pytest.mark.usefixtures("patched_embed_texts")
async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, mocker): async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, mocker):
"""Reindexing replaces old chunks with new content rather than appending.""" """Reindexing replaces old chunks with new content rather than appending."""
from app.indexing_pipeline.document_chunker import ChunkSlice
mocker.patch( mocker.patch(
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
side_effect=[["Original chunk."], ["Updated chunk."]], side_effect=[
[ChunkSlice("Original chunk.", 0, len("Original chunk."))],
[ChunkSlice("Updated chunk.", 0, len("Updated chunk."))],
],
) )
adapter = UploadDocumentAdapter(db_session) adapter = UploadDocumentAdapter(db_session)

View file

@ -18,16 +18,22 @@ _V1 = "Intro paragraph.\n\nBody paragraph.\n\nOutro paragraph."
@pytest.fixture @pytest.fixture
def paragraph_chunker(monkeypatch): def paragraph_chunker(monkeypatch):
"""One chunk per markdown paragraph, so edits map to chunk-level diffs.""" """One slice per markdown paragraph, so edits map to chunk-level diffs."""
from app.indexing_pipeline.document_chunker import ChunkSlice
def _split(markdown, **_kwargs): def _split(markdown, *_args, **_kwargs):
return [p for p in markdown.split("\n\n") if p.strip()] slices = []
cursor = 0
for para in markdown.split("\n\n"):
start = markdown.index(para, cursor)
cursor = start + len(para)
if para.strip():
slices.append(ChunkSlice(para, start, cursor))
return slices
monkeypatch.setattr( monkeypatch.setattr(
"app.indexing_pipeline.cache.cached_indexing.chunk_text", _split "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
) _split,
monkeypatch.setattr(
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", _split
) )

View file

@ -0,0 +1,96 @@
"""Indexing records char spans so a chunk addresses its exact slice of the body.
Uses the real chunker (only embeddings are faked) so the span/partition
invariants are exercised end to end.
"""
import pytest
from sqlalchemy import select
from app.db import Chunk, Document
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
pytestmark = pytest.mark.integration
_BODY = (
"# Report\n\n"
+ "Intro paragraph that is reasonably long and descriptive. " * 8
+ "\n\n| col a | col b |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |\n\n"
+ "Closing paragraph with a different shape and more words to chunk. " * 8
)
async def _ordered_chunks(session, document_id) -> list[Chunk]:
    """Load the document's chunks ordered by ``position`` and then ``id``."""
    query = (
        select(Chunk)
        .filter(Chunk.document_id == document_id)
        .order_by(Chunk.position, Chunk.id)
    )
    rows = await session.execute(query)
    return list(rows.scalars().all())
def _assert_spans_address_body(chunks: list[Chunk], body: str) -> None:
    """Assert every span slices to its chunk and the chunks concatenate to ``body``."""
    for row in chunks:
        assert row.start_char is not None and row.end_char is not None
        assert body[row.start_char : row.end_char] == row.content
    rebuilt = "".join(row.content for row in chunks)
    assert rebuilt == body
async def _index(session, connector_doc) -> int:
    """Prepare and index ``connector_doc``; return the id of the indexed document."""
    pipeline = IndexingPipelineService(session=session)
    queued = await pipeline.prepare_for_indexing([connector_doc])
    target = queued[0]
    await pipeline.index(target, connector_doc)
    return target.id
async def _reload_body(session, document_id) -> str:
    """Query the document by id and return its ``source_markdown``."""
    query = select(Document).filter(Document.id == document_id)
    rows = await session.execute(query)
    document = rows.scalars().first()
    return document.source_markdown
@pytest.mark.usefixtures("patched_embed_texts")
async def test_scratch_index_records_spans_addressing_body(
    db_session, db_search_space, make_connector_document
):
    """A first-time index yields several chunks whose spans partition the body."""
    source = make_connector_document(
        search_space_id=db_search_space.id, source_markdown=_BODY
    )
    doc_id = await _index(db_session, source)

    stored_body = await _reload_body(db_session, doc_id)
    stored_chunks = await _ordered_chunks(db_session, doc_id)

    assert len(stored_chunks) > 1
    _assert_spans_address_body(stored_chunks, stored_body)
@pytest.mark.usefixtures("patched_embed_texts")
async def test_incremental_reindex_refreshes_shifted_spans(
    db_session, db_search_space, make_connector_document
):
    """Inserting text at the top shifts every later chunk's span; kept rows must
    have their spans refreshed, not left pointing at the old offsets."""
    pipeline = IndexingPipelineService(session=db_session)
    space_id = db_search_space.id

    # First pass: index the original body.
    first_version = make_connector_document(
        search_space_id=space_id, source_markdown=_BODY
    )
    first_batch = await pipeline.prepare_for_indexing([first_version])
    document_id = first_batch[0].id
    await pipeline.index(first_batch[0], first_version)

    # Second pass: the same body with new content prepended.
    edited_body = "# Prepended heading\n\nA brand new opening paragraph.\n\n" + _BODY
    second_version = make_connector_document(
        search_space_id=space_id, source_markdown=edited_body
    )
    second_batch = await pipeline.prepare_for_indexing([second_version])
    assert second_batch, "edited content should requeue the document"
    await pipeline.index(second_batch[0], second_version)

    stored_body = await _reload_body(db_session, document_id)
    stored_chunks = await _ordered_chunks(db_session, document_id)
    assert stored_body == edited_body
    _assert_spans_address_body(stored_chunks, stored_body)

View file

@ -40,11 +40,19 @@ def _make_document(
) )
def _make_chunk(*, content: str, document_id: int) -> Chunk: def _make_chunk(
*,
content: str,
document_id: int,
start_char: int | None = None,
end_char: int | None = None,
) -> Chunk:
return Chunk( return Chunk(
content=content, content=content,
document_id=document_id, document_id=document_id,
embedding=DUMMY_EMBEDDING, embedding=DUMMY_EMBEDDING,
start_char=start_char,
end_char=end_char,
) )
@ -91,6 +99,8 @@ async def seed_large_doc(
_make_chunk( _make_chunk(
content="quarterly performance review summary note content", content="quarterly performance review summary note content",
document_id=small_doc.id, document_id=small_doc.id,
start_char=0,
end_char=10,
), ),
] ]

View file

@ -98,6 +98,32 @@ async def test_chunks_ordered_by_id(db_session, seed_large_doc):
assert chunk_ids == sorted(chunk_ids), "Chunks not ordered by ID" assert chunk_ids == sorted(chunk_ids), "Chunks not ordered by ID"
async def test_chunk_spans_returned(db_session, seed_large_doc):
    """Each chunk dict carries start_char/end_char (the citation span)."""
    target_doc_id = seed_large_doc["small_doc"].id
    searcher = ChucksHybridSearchRetriever(db_session)

    hits = await searcher.hybrid_search(
        query_text="quarterly performance review summary",
        top_k=10,
        search_space_id=seed_large_doc["search_space"].id,
        query_embedding=DUMMY_EMBEDDING,
    )

    for hit in hits:
        for chunk in hit["chunks"]:
            assert "start_char" in chunk
            assert "end_char" in chunk
        if hit["document"].get("id") == target_doc_id:
            # The seeded small document carries the span (0, 10).
            first_chunk = hit["chunks"][0]
            assert first_chunk["start_char"] == 0
            assert first_chunk["end_char"] == 10
            break
    else:
        pytest.fail("Small doc not found in search results")
async def test_score_is_positive_float(db_session, seed_large_doc): async def test_score_is_positive_float(db_session, seed_large_doc):
"""Each result should have a positive float score from RRF.""" """Each result should have a positive float score from RRF."""
space_id = seed_large_doc["search_space"].id space_id = seed_large_doc["search_space"].id

View file

@ -0,0 +1,127 @@
"""Phase E.1 contract: the by-chunk resolve API exposes chunk char spans and
derives the cited chunk's line range from source_markdown."""
import pytest
import pytest_asyncio
from sqlalchemy.ext.asyncio import AsyncSession
from app.db import Chunk, Document, DocumentStatus, DocumentType, SearchSpace, User
pytestmark = pytest.mark.integration
_BODY = "alpha\nbravo\ncharlie\ndelta"
async def _make_document(
    session: AsyncSession,
    search_space: SearchSpace,
    user: User,
    *,
    source_markdown: str = _BODY,
) -> Document:
    """Add and flush a ready FILE document whose content equals ``source_markdown``."""
    fields = {
        "title": "Doc",
        "document_type": DocumentType.FILE,
        "document_metadata": {},
        "content": source_markdown,
        "content_hash": "hash-by-chunk",
        "source_markdown": source_markdown,
        "search_space_id": search_space.id,
        "created_by_id": user.id,
        "status": DocumentStatus.ready(),
    }
    document = Document(**fields)
    session.add(document)
    await session.flush()
    return document
async def _add_chunk(
    session: AsyncSession,
    document: Document,
    *,
    content: str,
    position: int,
    start_char: int | None,
    end_char: int | None,
) -> Chunk:
    """Add and flush one chunk row for ``document`` with the given span."""
    row = Chunk(
        content=content,
        position=position,
        document_id=document.id,
        start_char=start_char,
        end_char=end_char,
    )
    session.add(row)
    await session.flush()
    return row
@pytest_asyncio.fixture
async def make_document(db_session, db_search_space, db_user):
    """Factory fixture: ``_make_document`` pre-bound to the session, space and user."""

    async def _factory(**overrides):
        document = await _make_document(
            db_session, db_search_space, db_user, **overrides
        )
        return document

    return _factory
async def test_cited_line_range_derived_from_spans(
    db_session, db_search_space, db_user, make_document
):
    """The cited chunk's char span is reported as a 1-based line range."""
    from app.routes.documents_routes import get_document_by_chunk_id

    document = await make_document()

    # First chunk covers "alpha\nbravo\n" (lines 1-2); it is not the cited one.
    await _add_chunk(
        db_session,
        document,
        content="alpha\nbravo\n",
        position=0,
        start_char=0,
        end_char=12,
    )
    # Second chunk covers the rest of the body (lines 3-4).
    cited_chunk = await _add_chunk(
        db_session,
        document,
        content="charlie\ndelta",
        position=1,
        start_char=12,
        end_char=len(_BODY),
    )

    response = await get_document_by_chunk_id(
        cited_chunk.id, chunk_window=5, session=db_session, user=db_user
    )

    assert response.cited_start_line == 3
    assert response.cited_end_line == 4
async def test_chunk_spans_exposed_in_response(
    db_session, db_search_space, db_user, make_document
):
    """The windowed chunks in the response carry their start_char/end_char."""
    from app.routes.documents_routes import get_document_by_chunk_id

    document = await make_document()
    cited_chunk = await _add_chunk(
        db_session,
        document,
        content="alpha\nbravo\n",
        position=0,
        start_char=0,
        end_char=12,
    )

    response = await get_document_by_chunk_id(
        cited_chunk.id, chunk_window=5, session=db_session, user=db_user
    )

    returned = next(item for item in response.chunks if item.id == cited_chunk.id)
    assert returned.start_char == 0
    assert returned.end_char == 12
async def test_cited_line_range_null_without_spans(
    db_session, db_search_space, db_user, make_document
):
    """A chunk stored without a char span yields no cited line range."""
    from app.routes.documents_routes import get_document_by_chunk_id

    document = await make_document()
    spanless_chunk = await _add_chunk(
        db_session,
        document,
        content="alpha",
        position=0,
        start_char=None,
        end_char=None,
    )

    response = await get_document_by_chunk_id(
        spanless_chunk.id, chunk_window=5, session=db_session, user=db_user
    )

    assert response.cited_start_line is None
    assert response.cited_end_line is None

View file

@ -0,0 +1,175 @@
"""Phase A contract: editor read paths serve source_markdown and never
reconstruct or mutate the body from chunks."""
import pytest
import pytest_asyncio
from fastapi import HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from app.db import (
Chunk,
Document,
DocumentStatus,
DocumentType,
SearchSpace,
User,
)
pytestmark = pytest.mark.integration
async def _make_document(
    session: AsyncSession,
    search_space: SearchSpace,
    user: User,
    *,
    document_type: DocumentType = DocumentType.FILE,
    source_markdown: str | None = "# Title\n\nBody line.",
    content: str = "Body line.",
    status: dict | None = None,
) -> Document:
    """Add and flush a document; a falsy ``status`` falls back to ``DocumentStatus.ready()``."""
    effective_status = status or DocumentStatus.ready()
    document = Document(
        title="Doc",
        document_type=document_type,
        document_metadata={},
        content=content,
        content_hash="hash-001",
        source_markdown=source_markdown,
        search_space_id=search_space.id,
        created_by_id=user.id,
        status=effective_status,
    )
    session.add(document)
    await session.flush()
    return document
async def _add_chunks(session: AsyncSession, document: Document, texts: list[str]):
    """Attach one span-less chunk per text, positioned by list order, then flush."""
    position = 0
    for text in texts:
        session.add(Chunk(content=text, position=position, document_id=document.id))
        position += 1
    await session.flush()
@pytest_asyncio.fixture
async def make_document(db_session, db_search_space, db_user):
    """Factory fixture: ``_make_document`` pre-bound to the session, space and user."""

    async def _factory(**overrides):
        document = await _make_document(
            db_session, db_search_space, db_user, **overrides
        )
        return document

    return _factory
class TestGetEditorContent:
    """``get_editor_content`` serves source_markdown and never rebuilds it from chunks."""

    async def test_returns_source_markdown_verbatim(
        self, db_session, db_search_space, db_user, make_document
    ):
        """An existing body is returned unchanged."""
        from app.routes.editor_routes import get_editor_content

        document = await make_document(source_markdown="# Real\n\nCanonical body.")

        payload = await get_editor_content(
            db_search_space.id, document.id, session=db_session, user=db_user
        )

        assert payload["source_markdown"] == "# Real\n\nCanonical body."

    async def test_does_not_reconstruct_body_from_chunks(
        self, db_session, db_search_space, db_user, make_document
    ):
        """A ready document without source_markdown must not be rebuilt from chunks."""
        from app.routes.editor_routes import get_editor_content

        document = await make_document(source_markdown=None)
        await _add_chunks(db_session, document, ["chunk one", "chunk two"])

        with pytest.raises(HTTPException) as raised:
            await get_editor_content(
                db_search_space.id, document.id, session=db_session, user=db_user
            )

        assert raised.value.status_code == 400
        # The read path must not have written a body back onto the row.
        await db_session.refresh(document)
        assert document.source_markdown is None

    async def test_processing_document_without_body_returns_409(
        self, db_session, db_search_space, db_user, make_document
    ):
        """A body-less document that is still processing answers 409."""
        from app.routes.editor_routes import get_editor_content

        document = await make_document(
            source_markdown=None, status=DocumentStatus.processing()
        )

        with pytest.raises(HTTPException) as raised:
            await get_editor_content(
                db_search_space.id, document.id, session=db_session, user=db_user
            )

        assert raised.value.status_code == 409

    async def test_failed_document_without_body_returns_422(
        self, db_session, db_search_space, db_user, make_document
    ):
        """A body-less document whose processing failed answers 422."""
        from app.routes.editor_routes import get_editor_content

        document = await make_document(
            source_markdown=None, status=DocumentStatus.failed("boom")
        )

        with pytest.raises(HTTPException) as raised:
            await get_editor_content(
                db_search_space.id, document.id, session=db_session, user=db_user
            )

        assert raised.value.status_code == 422

    async def test_empty_note_initializes_to_empty_markdown(
        self, db_session, db_search_space, db_user, make_document
    ):
        """A NOTE without a body is served as an empty string rather than an error."""
        from app.routes.editor_routes import get_editor_content

        note = await make_document(document_type=DocumentType.NOTE, source_markdown=None)

        payload = await get_editor_content(
            db_search_space.id, note.id, session=db_session, user=db_user
        )

        assert payload["source_markdown"] == ""
class TestDownloadMarkdown:
    """``download_document_markdown`` must not rebuild a missing body from chunks."""

    async def test_does_not_reconstruct_body_from_chunks(
        self, db_session, db_search_space, db_user, make_document
    ):
        """A body-less document with chunks answers 400 on download."""
        from app.routes.editor_routes import download_document_markdown

        document = await make_document(source_markdown=None)
        await _add_chunks(db_session, document, ["chunk one", "chunk two"])

        with pytest.raises(HTTPException) as raised:
            await download_document_markdown(
                db_search_space.id, document.id, session=db_session, user=db_user
            )

        assert raised.value.status_code == 400
class TestExportDocument:
    """``export_document`` must not rebuild a missing body from chunks."""

    async def test_does_not_reconstruct_body_from_chunks(
        self, db_session, db_search_space, db_user, make_document
    ):
        """A body-less document with chunks answers 400 on a PLAIN export."""
        from app.routes.editor_routes import export_document
        from app.routes.reports_routes import ExportFormat

        document = await make_document(source_markdown=None)
        await _add_chunks(db_session, document, ["chunk one", "chunk two"])

        with pytest.raises(HTTPException) as raised:
            await export_document(
                db_search_space.id,
                document.id,
                format=ExportFormat.PLAIN,
                session=db_session,
                user=db_user,
            )

        assert raised.value.status_code == 400

View file

@ -0,0 +1,87 @@
"""Unit tests for search_knowledge_base hit rendering.
The tool must surface the passage that actually matched (the RRF-ranked
chunk), not the top of the document, and annotate it with its line range
when the chunk carries a char span.
"""
from __future__ import annotations
import pytest
from app.agents.chat.multi_agent_chat.main_agent.tools.search_knowledge_base import (
_format_hits,
)
pytestmark = pytest.mark.unit
_BODY = "Intro paragraph.\n\nMatched passage here.\n\nClosing paragraph."
def _hit() -> dict:
    """Build one search hit for document 7 with two span-carrying chunks.

    Chunk 101 is the intro at the top of ``_BODY``; chunk 102 is the passage
    listed in ``matched_chunk_ids``. Both spans are derived from ``_BODY`` so
    they address their own text exactly. A fresh dict is returned on every
    call, so tests may mutate the result freely.
    """
    intro = "Intro paragraph."
    matched = "Matched passage here."
    matched_start = _BODY.index(matched)
    return {
        "document": {"id": 7, "title": "note.md", "document_type": "NOTE"},
        "score": 0.42,
        # The hit content is the document body as-is.
        "content": _BODY,
        "matched_chunk_ids": [102],
        "chunks": [
            {
                "chunk_id": 101,
                "content": intro,
                "start_char": 0,
                "end_char": len(intro),
            },
            {
                "chunk_id": 102,
                "content": matched,
                "start_char": matched_start,
                "end_char": matched_start + len(matched),
            },
        ],
    }
def test_renders_matched_passage_not_top_of_doc() -> None:
    """The snippet shown is the matched chunk, not the document's first chunk."""
    rendered = _format_hits(
        [_hit()],
        paths={7: "/documents/note.md"},
        bodies={7: _BODY},
        query="q",
    )
    assert "Matched passage here." in rendered
    # The intro chunk was not matched, so it must not be shown as the snippet.
    assert "Intro paragraph." not in rendered
def test_emits_copyable_line_citation_token_when_spans_present() -> None:
    """A hit whose matched chunk has a span is rendered with a line-citation token."""
    rendered = _format_hits(
        [_hit()],
        paths={7: "/documents/note.md"},
        bodies={7: _BODY},
        query="q",
    )
    # "Matched passage here." sits on line 3 of the body; the hit must surface
    # a ready-to-copy token so the agent can cite without a separate read.
    assert "[citation:d7#L3-3]" in rendered
def test_header_includes_document_id() -> None:
    """The rendered output names the document id."""
    rendered = _format_hits(
        [_hit()],
        paths={7: "/documents/note.md"},
        bodies={7: _BODY},
        query="q",
    )
    assert "id=7" in rendered
def test_omits_citation_token_when_spans_absent() -> None:
    """Without char spans the passage is still shown but no line token is emitted."""
    spanless_hit = _hit()
    for chunk in spanless_hit["chunks"]:
        chunk.update(start_char=None, end_char=None)

    rendered = _format_hits(
        [spanless_hit],
        paths={7: "/documents/note.md"},
        bodies={7: _BODY},
        query="q",
    )

    assert "Matched passage here." in rendered
    # No concrete, copyable token for this document without spans (the closing
    # instruction's placeholder template doesn't count).
    assert "[citation:d7#L" not in rendered
def test_falls_back_to_content_when_no_matched_ids() -> None:
    """With no matched chunk ids the rendering includes the intro text."""
    unmatched_hit = _hit()
    unmatched_hit["matched_chunk_ids"] = []

    rendered = _format_hits(
        [unmatched_hit],
        paths={7: "/documents/note.md"},
        bodies={7: _BODY},
        query="q",
    )

    assert "Intro paragraph." in rendered
def test_no_results_message() -> None:
    """An empty hit list renders the no-matches message."""
    rendered = _format_hits([], paths={}, bodies={}, query="missing")
    assert "No knowledge-base matches" in rendered

View file

@ -0,0 +1,72 @@
"""Span-aware chunking contract: slices form a lossless, contiguous partition
of the markdown, and every slice's char span addresses its own text."""
import pytest
from app.indexing_pipeline.document_chunker import chunk_markdown_with_spans
pytestmark = pytest.mark.unit
def _assert_lossless_partition(md: str, slices) -> None:
assert "".join(s.text for s in slices) == md
cursor = 0
for s in slices:
assert s.start_char == cursor, "slices must be contiguous"
assert s.end_char >= s.start_char
assert md[s.start_char : s.end_char] == s.text, "span must address slice text"
cursor = s.end_char
assert cursor == len(md)
def test_prose_partition_and_spans():
    """Long prose splits into several slices that still partition the input."""
    heading = "# Title\n\n"
    first_section = "First paragraph with several words here. " * 20
    second_section = "\n\nSecond section with more prose to force multiple chunks. " * 20
    md = heading + first_section + second_section

    slices = chunk_markdown_with_spans(md)

    assert len(slices) > 1
    _assert_lossless_partition(md, slices)
def test_table_kept_whole_with_exact_span():
    """Every slice that starts with a table row holds the whole table."""
    table = "| a | b |\n| - | - |\n| 1 | 2 |\n"
    md = f"Intro prose before the table.\n{table}\nClosing prose after."

    slices = chunk_markdown_with_spans(md)
    _assert_lossless_partition(md, slices)

    table_slices = [piece for piece in slices if piece.text.lstrip().startswith("|")]
    assert any("| 1 | 2 |" in piece.text for piece in table_slices)
    for piece in table_slices:
        # Header row and data row must live in the same slice.
        assert "| a | b |" in piece.text and "| 1 | 2 |" in piece.text
def test_table_at_eof_without_trailing_newline_stays_whole():
    """A table ending the document without a final newline is not split."""
    md = "Intro.\n| a | b |\n| - | - |\n| 1 | 2 |"

    slices = chunk_markdown_with_spans(md)
    _assert_lossless_partition(md, slices)

    holding_last_row = [piece for piece in slices if "| 1 | 2 |" in piece.text]
    assert len(holding_last_row) == 1
    assert "| a | b |" in holding_last_row[0].text
def test_code_chunker_partition_and_spans():
    """With ``use_code_chunker=True`` the slices still partition the input."""
    functions = [
        f"def func_{i}(x):\n total = x + {i}\n return total" for i in range(40)
    ]
    code = "\n\n".join(functions)

    slices = chunk_markdown_with_spans(code, use_code_chunker=True)

    assert len(slices) >= 1
    _assert_lossless_partition(code, slices)
def test_empty_markdown_yields_no_slices():
    """Chunking an empty string returns an empty list."""
    slices = chunk_markdown_with_spans("")
    assert slices == []

View file

@ -37,12 +37,9 @@ def _make_orm_doc(connector_doc, doc_id):
async def test_index_calls_embed_and_chunk_via_to_thread( async def test_index_calls_embed_and_chunk_via_to_thread(
pipeline, make_connector_document, monkeypatch pipeline, make_connector_document, monkeypatch
): ):
"""index() runs the chunker and embed_texts via asyncio.to_thread, not blocking the loop. """index() runs the chunker and embed_texts via asyncio.to_thread, not blocking the loop."""
from app.indexing_pipeline.document_chunker import ChunkSlice
Routing between ``chunk_text`` (code path) and ``chunk_text_hybrid`` (default
path, see issue #1334) is verified separately in
``test_non_code_documents_use_hybrid_chunker``.
"""
to_thread_calls = [] to_thread_calls = []
original_to_thread = asyncio.to_thread original_to_thread = asyncio.to_thread
@ -51,11 +48,11 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
return await original_to_thread(func, *args, **kwargs) return await original_to_thread(func, *args, **kwargs)
monkeypatch.setattr(asyncio, "to_thread", tracking_to_thread) monkeypatch.setattr(asyncio, "to_thread", tracking_to_thread)
mock_chunk_hybrid = MagicMock(return_value=["chunk1"]) mock_chunker = MagicMock(return_value=[ChunkSlice("chunk1", 0, 6)])
mock_chunk_hybrid.__name__ = "chunk_text_hybrid" mock_chunker.__name__ = "chunk_markdown_with_spans"
monkeypatch.setattr( monkeypatch.setattr(
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
mock_chunk_hybrid, mock_chunker,
) )
mock_embed = MagicMock( mock_embed = MagicMock(
side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts] side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]
@ -90,34 +87,25 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
await pipeline.index(document, connector_doc) await pipeline.index(document, connector_doc)
# Either chunker entry point satisfies the "chunking runs off the event assert "chunk_markdown_with_spans" in to_thread_calls
# loop" contract this test guards. Routing between the two is verified
# in test_non_code_documents_use_hybrid_chunker.
assert {"chunk_text", "chunk_text_hybrid"} & set(to_thread_calls)
assert "embed_texts" in to_thread_calls assert "embed_texts" in to_thread_calls
assert document.status == DocumentStatus.ready() assert document.status == DocumentStatus.ready()
async def test_non_code_documents_use_hybrid_chunker( async def test_non_code_documents_use_prose_chunker(
pipeline, make_connector_document, monkeypatch pipeline, make_connector_document, monkeypatch
): ):
"""Non-code documents route through ``chunk_text_hybrid`` (issue #1334). """Non-code documents chunk with use_code_chunker=False (issue #1334).
The hybrid chunker preserves Markdown table integrity by avoiding splits The table-aware prose path keeps Markdown tables intact; only documents
mid-row. Only documents flagged with ``should_use_code_chunker=True`` flagged with ``should_use_code_chunker=True`` request the code chunker.
should take the ``chunk_text`` path.
""" """
mock_chunk_hybrid = MagicMock(return_value=["chunk1"]) from app.indexing_pipeline.document_chunker import ChunkSlice
mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
mock_chunker = MagicMock(return_value=[ChunkSlice("chunk1", 0, 6)])
monkeypatch.setattr( monkeypatch.setattr(
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
mock_chunk_hybrid, mock_chunker,
)
mock_chunk_code = MagicMock(return_value=["chunk1"])
mock_chunk_code.__name__ = "chunk_text"
monkeypatch.setattr(
"app.indexing_pipeline.cache.cached_indexing.chunk_text",
mock_chunk_code,
) )
monkeypatch.setattr( monkeypatch.setattr(
"app.indexing_pipeline.cache.cached_indexing.embed_texts", "app.indexing_pipeline.cache.cached_indexing.embed_texts",
@ -149,8 +137,49 @@ async def test_non_code_documents_use_hybrid_chunker(
await pipeline.index(document, connector_doc) await pipeline.index(document, connector_doc)
mock_chunk_hybrid.assert_called_once() mock_chunker.assert_called_once()
mock_chunk_code.assert_not_called() assert mock_chunker.call_args.args[1] is False
async def test_code_documents_request_code_chunker(
    pipeline, make_connector_document, monkeypatch
):
    """A document flagged for the code chunker passes True as the chunker's second positional argument."""
    from app.indexing_pipeline.document_chunker import ChunkSlice

    def _fake_embed(texts):
        # One constant vector per input text.
        return [[0.1] * _EMBEDDING_DIM for _ in texts]

    async def _noop_persist(_session, doc, *_args, **_kwargs):
        # Stand-in for persistence: only flips the document to ready.
        doc.status = DocumentStatus.ready()

    mock_chunker = MagicMock(return_value=[ChunkSlice("chunk1", 0, 6)])
    monkeypatch.setattr(
        "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
        mock_chunker,
    )
    monkeypatch.setattr(
        "app.indexing_pipeline.cache.cached_indexing.embed_texts",
        MagicMock(side_effect=_fake_embed),
    )
    monkeypatch.setattr(pipeline, "_load_existing_chunks", AsyncMock(return_value=[]))
    monkeypatch.setattr(
        "app.indexing_pipeline.indexing_pipeline_service.persist_scratch_index",
        _noop_persist,
    )

    document = MagicMock(spec=Document)
    document.id = 1
    document.status = DocumentStatus.pending()

    connector_doc = make_connector_document(
        document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
        unique_id="repo-1",
        search_space_id=1,
        should_use_code_chunker=True,
    )

    await pipeline.index(document, connector_doc)

    mock_chunker.assert_called_once()
    chunker_args = mock_chunker.call_args.args
    assert chunker_args[1] is True
def _mock_session_factory(orm_docs_by_id): def _mock_session_factory(orm_docs_by_id):

View file

@ -71,7 +71,7 @@ class _KBBackendStub(KBPostgresBackend):
def __init__(self, *, children=None, file_data=None) -> None: def __init__(self, *, children=None, file_data=None) -> None:
self.als_info = AsyncMock(return_value=children or []) self.als_info = AsyncMock(return_value=children or [])
self._load_file_data = AsyncMock( self._load_file_data = AsyncMock(
return_value=(file_data, 17) if file_data is not None else None return_value=(file_data, 17, None) if file_data is not None else None
) )

View file

@ -69,13 +69,25 @@ class _FakeSession:
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def _stub_embeddings_and_chunks(monkeypatch: pytest.MonkeyPatch) -> None: def _stub_embeddings_and_chunks(monkeypatch: pytest.MonkeyPatch) -> None:
"""Avoid loading the embedding model in unit tests.""" """Avoid loading the embedding model in unit tests.
Mirrors the legacy stub: one chunk spanning the whole content, with a
zero summary/chunk vector, routed through the shared span builder.
"""
from app.indexing_pipeline.document_chunker import ChunkSlice
async def _fake_build_chunk_embeddings(content: str, *, use_code_chunker: bool):
summary = np.zeros(8, dtype=np.float32)
pairs = (
[(ChunkSlice(content, 0, len(content)), np.zeros(8, dtype=np.float32))]
if content
else []
)
return summary, pairs
monkeypatch.setattr( monkeypatch.setattr(
kb_persistence, kb_persistence, "build_chunk_embeddings", _fake_build_chunk_embeddings
"embed_texts",
lambda texts: [np.zeros(8, dtype=np.float32) for _ in texts],
) )
monkeypatch.setattr(kb_persistence, "chunk_text", lambda content: [content])
@pytest.mark.asyncio @pytest.mark.asyncio

View file

@ -0,0 +1,92 @@
"""Unit tests for the numbered-document read preamble."""
import pytest
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.numbered_document import (
build_read_preamble,
compute_matched_line_ranges,
)
pytestmark = pytest.mark.unit
_BODY = "alpha\nbravo\ncharlie\ndelta"
class TestComputeMatchedLineRanges:
    """Chunk spans are given as (chunk_id, start_char, end_char) tuples over ``_BODY``."""

    def test_maps_matched_chunk_spans_to_line_ranges(self):
        # Offset 12 is the first character of "charlie", i.e. line 3.
        spans = [(1, 0, 12), (2, 12, len(_BODY))]
        assert compute_matched_line_ranges(_BODY, spans, {2}) == [(3, 4)]

    def test_includes_only_matched_chunks(self):
        spans = [(1, 0, 5), (2, 6, 11)]
        matched_ids = {1}
        assert compute_matched_line_ranges(_BODY, spans, matched_ids) == [(1, 1)]

    def test_skips_chunks_without_spans(self):
        # A matched chunk with no recorded offsets contributes nothing.
        spans = [(1, None, None)]
        assert compute_matched_line_ranges(_BODY, spans, {1}) == []

    def test_sorted_and_deduplicated(self):
        # Chunks 2 and 3 share a span; chunk 1 comes first but covers later lines.
        spans = [(1, 12, len(_BODY)), (2, 0, 5), (3, 0, 5)]
        result = compute_matched_line_ranges(_BODY, spans, {1, 2, 3})
        assert result == [(1, 1), (3, 4)]
class TestBuildReadPreamble:
    """Checks the text returned by ``build_read_preamble`` for various inputs."""

    @staticmethod
    def _render(document_id, document_type, title, *, url="", ranges=None):
        # Single call site for build_read_preamble; defaults to no URL and no matches.
        return build_read_preamble(
            document_id=document_id,
            document_type=document_type,
            title=title,
            url=url,
            matched_line_ranges=[] if ranges is None else ranges,
        )

    def test_contains_document_metadata(self):
        text = self._render(42, "FILE", "Test Doc", url="https://example.com")
        expected_fragments = (
            "<document_id>42</document_id>",
            "<document_type>FILE</document_type>",
            "Test Doc",
            "https://example.com",
        )
        for fragment in expected_fragments:
            assert fragment in text

    def test_citation_hint_uses_document_id(self):
        text = self._render(42, "FILE", "Test Doc")
        assert "[citation:d42#L" in text

    def test_lists_matched_line_ranges(self):
        text = self._render(7, "NOTE", "Notes", ranges=[(12, 18), (40, 40)])
        assert "<matched_lines>" in text
        assert "12-18" in text
        assert "40" in text

    def test_omits_matched_lines_block_when_empty(self):
        text = self._render(7, "NOTE", "Notes")
        assert "<matched_lines>" not in text

    def test_ends_with_trailing_newline_so_body_follows_cleanly(self):
        text = self._render(1, "FILE", "t")
        assert text.endswith("\n")

View file

@ -0,0 +1,39 @@
"""Unit tests for char-span -> line-range conversion."""
from __future__ import annotations
import pytest
from app.utils.text_spans import char_span_to_line_range
pytestmark = pytest.mark.unit
_TEXT = "line1\nline2\nline3"
def test_single_line_span() -> None:
    """A span covering exactly "line2" maps to line 2 only."""
    needle = "line2"
    begin = _TEXT.index(needle)
    finish = begin + len(needle)
    assert char_span_to_line_range(_TEXT, begin, finish) == (2, 2)
def test_first_line_span() -> None:
    """A span covering exactly "line1" maps to line 1 only."""
    finish = len("line1")
    line_range = char_span_to_line_range(_TEXT, 0, finish)
    assert line_range == (1, 1)
def test_last_line_span() -> None:
    """A span from the start of "line3" to the end of the text maps to line 3."""
    begin = _TEXT.index("line3")
    line_range = char_span_to_line_range(_TEXT, begin, len(_TEXT))
    assert line_range == (3, 3)
def test_multi_line_span() -> None:
    """A span running from offset 0 through the end of "line2" covers lines 1-2."""
    second = "line2"
    finish = _TEXT.index(second) + len(second)
    assert char_span_to_line_range(_TEXT, 0, finish) == (1, 2)
def test_empty_span_resolves_to_its_line() -> None:
    """A zero-length span at the start of "line2" still reports line 2."""
    offset = _TEXT.index("line2")
    line_range = char_span_to_line_range(_TEXT, offset, offset)
    assert line_range == (2, 2)
def test_offsets_clamped_to_text_bounds() -> None:
    """Out-of-range offsets on both sides resolve to the full line range of the text."""
    below_start, beyond_end = -5, 10_000
    line_range = char_span_to_line_range(_TEXT, below_start, beyond_end)
    assert line_range == (1, 3)

View file

@ -270,6 +270,12 @@ button {
contain-intrinsic-size: 0 40px; contain-intrinsic-size: 0 40px;
} }
/* Monaco whole-line highlight for a cited source span (Phase E). */
/* Tints the line with a 16% mix of --primary over transparent and draws a
   2px inset bar in --primary along the line's left edge. */
.citation-line-highlight {
background-color: color-mix(in srgb, var(--primary) 16%, transparent);
box-shadow: inset 2px 0 0 0 var(--primary);
}
@source "../node_modules/@llamaindex/chat-ui/**/*.{ts,tsx}"; @source "../node_modules/@llamaindex/chat-ui/**/*.{ts,tsx}";
@source "../node_modules/streamdown/dist/*.js"; @source "../node_modules/streamdown/dist/*.js";
@source "../node_modules/@streamdown/code/dist/*.js"; @source "../node_modules/@streamdown/code/dist/*.js";

View file

@ -1,6 +1,11 @@
import { atom } from "jotai"; import { atom } from "jotai";
import { rightPanelCollapsedAtom, rightPanelTabAtom } from "@/atoms/layout/right-panel.atom"; import { rightPanelCollapsedAtom, rightPanelTabAtom } from "@/atoms/layout/right-panel.atom";
/**
 * Line range the editor panel scrolls to and highlights when opened from a
 * citation. Both bounds are 1-based and inclusive.
 */
export interface EditorLineRange {
/** First line of the range. */
start: number;
/** Last line of the range. */
end: number;
}
interface EditorPanelState { interface EditorPanelState {
isOpen: boolean; isOpen: boolean;
kind: "document" | "local_file" | "memory"; kind: "document" | "local_file" | "memory";
@ -9,6 +14,10 @@ interface EditorPanelState {
searchSpaceId: number | null; searchSpaceId: number | null;
memoryScope: "user" | "team" | null; memoryScope: "user" | "team" | null;
title: string | null; title: string | null;
// Citation line anchor: when set, the editor opens the raw source view
// scrolled to and highlighting this 1-based inclusive line range.
highlightLines: EditorLineRange | null;
forceSourceView: boolean;
} }
const initialState: EditorPanelState = { const initialState: EditorPanelState = {
@ -19,6 +28,8 @@ const initialState: EditorPanelState = {
searchSpaceId: null, searchSpaceId: null,
memoryScope: null, memoryScope: null,
title: null, title: null,
highlightLines: null,
forceSourceView: false,
}; };
export const editorPanelAtom = atom<EditorPanelState>(initialState); export const editorPanelAtom = atom<EditorPanelState>(initialState);
@ -33,7 +44,14 @@ export const openEditorPanelAtom = atom(
get, get,
set, set,
payload: payload:
| { documentId: number; searchSpaceId: number; title?: string; kind?: "document" } | {
documentId: number;
searchSpaceId: number;
title?: string;
kind?: "document";
highlightLines?: EditorLineRange | null;
forceSourceView?: boolean;
}
| { | {
kind: "local_file"; kind: "local_file";
localFilePath: string; localFilePath: string;
@ -59,6 +77,8 @@ export const openEditorPanelAtom = atom(
searchSpaceId: payload.searchSpaceId ?? null, searchSpaceId: payload.searchSpaceId ?? null,
memoryScope: null, memoryScope: null,
title: payload.title ?? null, title: payload.title ?? null,
highlightLines: null,
forceSourceView: false,
}); });
set(rightPanelTabAtom, "editor"); set(rightPanelTabAtom, "editor");
set(rightPanelCollapsedAtom, false); set(rightPanelCollapsedAtom, false);
@ -73,6 +93,8 @@ export const openEditorPanelAtom = atom(
searchSpaceId: payload.searchSpaceId ?? null, searchSpaceId: payload.searchSpaceId ?? null,
memoryScope: payload.memoryScope, memoryScope: payload.memoryScope,
title: payload.title ?? null, title: payload.title ?? null,
highlightLines: null,
forceSourceView: false,
}); });
set(rightPanelTabAtom, "editor"); set(rightPanelTabAtom, "editor");
set(rightPanelCollapsedAtom, false); set(rightPanelCollapsedAtom, false);
@ -86,6 +108,8 @@ export const openEditorPanelAtom = atom(
searchSpaceId: payload.searchSpaceId, searchSpaceId: payload.searchSpaceId,
memoryScope: null, memoryScope: null,
title: payload.title ?? null, title: payload.title ?? null,
highlightLines: payload.highlightLines ?? null,
forceSourceView: payload.forceSourceView ?? false,
}); });
set(rightPanelTabAtom, "editor"); set(rightPanelTabAtom, "editor");
set(rightPanelCollapsedAtom, false); set(rightPanelCollapsedAtom, false);

View file

@ -2,9 +2,11 @@
import { useSetAtom } from "jotai"; import { useSetAtom } from "jotai";
import { FileText } from "lucide-react"; import { FileText } from "lucide-react";
import { useParams } from "next/navigation";
import type { FC } from "react"; import type { FC } from "react";
import { useId, useState } from "react"; import { useId, useState } from "react";
import { openCitationPanelAtom } from "@/atoms/citation/citation-panel.atom"; import { openCitationPanelAtom } from "@/atoms/citation/citation-panel.atom";
import { openEditorPanelAtom } from "@/atoms/editor/editor-panel.atom";
import { useCitationMetadata } from "@/components/assistant-ui/citation-metadata-context"; import { useCitationMetadata } from "@/components/assistant-ui/citation-metadata-context";
import { CitationPanelContent } from "@/components/citation-panel/citation-panel"; import { CitationPanelContent } from "@/components/citation-panel/citation-panel";
import { Citation } from "@/components/tool-ui/citation"; import { Citation } from "@/components/tool-ui/citation";
@ -108,6 +110,50 @@ const NumericChunkCitation: FC<{ chunkId: number }> = ({ chunkId }) => {
); );
}; };
interface LineCitationProps {
	/** Knowledge-base document the citation points into. */
	documentId: number;
	/** First cited line (1-based, inclusive). */
	startLine: number;
	/** Last cited line (1-based, inclusive). */
	endLine: number;
}

/**
 * Inline citation for a knowledge-base document line range
 * (`[citation:d<documentId>#L<start>-<end>]`). Clicking opens the document in
 * the editor's read-only source view, scrolled to and highlighting the cited
 * lines; this is the same anchor the citation panel uses for chunk citations.
 *
 * The search space id is read from the `search_space_id` route param. When it
 * is missing or not numeric, the click does nothing.
 */
export const LineCitation: FC<LineCitationProps> = ({ documentId, startLine, endLine }) => {
	const openEditorPanel = useSetAtom(openEditorPanelAtom);
	const params = useParams();
	const searchSpaceId = Number(params?.search_space_id);

	const isSingleLine = startLine === endLine;
	const label = isSingleLine ? `L${startLine}` : `L${startLine}-${endLine}`;
	// The bounds need an explicit separator: without one, lines 12 to 18 read as "1218".
	const title = isSingleLine
		? `View cited line ${startLine}`
		: `View cited lines ${startLine}-${endLine}`;
	const ariaLabel = isSingleLine
		? `View cited document line ${startLine}`
		: `View cited document lines ${startLine} to ${endLine}`;

	const handleClick = () => {
		// Number(undefined) is NaN, so a missing route param ends up here.
		if (!Number.isFinite(searchSpaceId)) return;
		openEditorPanel({
			documentId,
			searchSpaceId,
			highlightLines: { start: startLine, end: endLine },
			forceSourceView: true,
		});
	};

	return (
		<Button
			type="button"
			variant="ghost"
			onClick={handleClick}
			className="ml-0.5 inline-flex h-5 min-w-5 items-center justify-center gap-0.5 rounded-md bg-popover px-1.5 text-[11px] font-medium text-popover-foreground/80 align-baseline"
			title={title}
			aria-label={ariaLabel}
		>
			<FileText className="size-3" />
			{label}
		</Button>
	);
};
import { tryGetHostname } from "@/lib/url"; import { tryGetHostname } from "@/lib/url";
interface UrlCitationProps { interface UrlCitationProps {

View file

@ -46,6 +46,13 @@ export const CitationPanelContent: FC<CitationPanelContentProps> = ({
const cited = useMemo(() => data?.chunks.find((c) => c.id === chunkId) ?? null, [data, chunkId]); const cited = useMemo(() => data?.chunks.find((c) => c.id === chunkId) ?? null, [data, chunkId]);
// Human-readable label for the cited line range, or null when the response
// carries no line anchor for the cited chunk.
const citedLineLabel = useMemo(() => {
	const start = data?.cited_start_line;
	const end = data?.cited_end_line;
	if (start == null || end == null) return null;
	// The bounds need an explicit separator: without one, 12 and 18 render as "Lines 1218".
	return start === end ? `Line ${start}` : `Lines ${start}-${end}`;
}, [data?.cited_start_line, data?.cited_end_line]);
const totalChunks = data?.total_chunks ?? data?.chunks.length ?? 0; const totalChunks = data?.total_chunks ?? data?.chunks.length ?? 0;
const startIndex = data?.chunk_start_index ?? 0; const startIndex = data?.chunk_start_index ?? 0;
const hasMoreAbove = startIndex > 0; const hasMoreAbove = startIndex > 0;
@ -75,10 +82,15 @@ export const CitationPanelContent: FC<CitationPanelContentProps> = ({
const handleOpenFullDocument = () => { const handleOpenFullDocument = () => {
if (!data) return; if (!data) return;
const hasLineAnchor = data.cited_start_line != null && data.cited_end_line != null;
openEditorPanel({ openEditorPanel({
documentId: data.id, documentId: data.id,
searchSpaceId: data.search_space_id, searchSpaceId: data.search_space_id,
title: data.title, title: data.title,
highlightLines: hasLineAnchor
? { start: data.cited_start_line as number, end: data.cited_end_line as number }
: null,
forceSourceView: hasLineAnchor,
}); });
}; };
@ -110,6 +122,7 @@ export const CitationPanelContent: FC<CitationPanelContentProps> = ({
</p> </p>
</div> </div>
<div className="flex items-center gap-3 shrink-0 text-[11px] text-muted-foreground"> <div className="flex items-center gap-3 shrink-0 text-[11px] text-muted-foreground">
{citedLineLabel && <span>{citedLineLabel}</span>}
{totalChunks > 0 && <span>{totalChunks} chunks</span>} {totalChunks > 0 && <span>{totalChunks} chunks</span>}
{!isLoading && !error && data && ( {!isLoading && !error && data && (
<Button <Button
@ -172,7 +185,9 @@ export const CitationPanelContent: FC<CitationPanelContentProps> = ({
Chunk #{chunk.id} Chunk #{chunk.id}
</span> </span>
{isCited && ( {isCited && (
<span className="text-[11px] font-semibold text-primary">Cited chunk</span> <span className="text-[11px] font-semibold text-primary">
{citedLineLabel ? `Cited chunk · ${citedLineLabel}` : "Cited chunk"}
</span>
)} )}
</div> </div>
<div className="text-sm"> <div className="text-sm">

View file

@ -1,7 +1,7 @@
"use client"; "use client";
import type { ReactNode } from "react"; import type { ReactNode } from "react";
import { InlineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation"; import { InlineCitation, LineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation";
import { import {
type CitationToken, type CitationToken,
type CitationUrlMap, type CitationUrlMap,
@ -21,6 +21,16 @@ export function renderCitationToken(token: CitationToken, ordinalKey: number): R
if (token.kind === "url") { if (token.kind === "url") {
return <UrlCitation key={`citation-url-${ordinalKey}`} url={token.url} />; return <UrlCitation key={`citation-url-${ordinalKey}`} url={token.url} />;
} }
if (token.kind === "line") {
return (
<LineCitation
key={`citation-line-${token.documentId}-${token.startLine}-${ordinalKey}`}
documentId={token.documentId}
startLine={token.startLine}
endLine={token.endLine}
/>
);
}
return ( return (
<InlineCitation <InlineCitation
key={`citation-${token.isDocsChunk ? "doc-" : ""}${token.chunkId}-${ordinalKey}`} key={`citation-${token.isDocsChunk ? "doc-" : ""}${token.chunkId}-${ordinalKey}`}

View file

@ -149,6 +149,8 @@ export function EditorPanelContent({
searchSpaceId, searchSpaceId,
title, title,
onClose, onClose,
highlightLines = null,
forceSourceView = false,
}: { }: {
kind?: "document" | "local_file" | "memory"; kind?: "document" | "local_file" | "memory";
documentId?: number; documentId?: number;
@ -157,6 +159,8 @@ export function EditorPanelContent({
searchSpaceId?: number; searchSpaceId?: number;
title: string | null; title: string | null;
onClose?: () => void; onClose?: () => void;
highlightLines?: { start: number; end: number } | null;
forceSourceView?: boolean;
}) { }) {
const electronAPI = useElectronAPI(); const electronAPI = useElectronAPI();
const [editorDoc, setEditorDoc] = useState<EditorContent | null>(null); const [editorDoc, setEditorDoc] = useState<EditorContent | null>(null);
@ -205,7 +209,7 @@ export function EditorPanelContent({
const isLargeDocument = docSizeBytes > plateMaxBytes || docLineCount > plateMaxLines; const isLargeDocument = docSizeBytes > plateMaxBytes || docLineCount > plateMaxLines;
const viewerMode: ViewerMode = isMemoryMode const viewerMode: ViewerMode = isMemoryMode
? "plate" ? "plate"
: editorDoc?.viewer_mode === "monaco" || isLargeDocument : editorDoc?.viewer_mode === "monaco" || isLargeDocument || forceSourceView
? "monaco" ? "monaco"
: "plate"; : "plate";
@ -828,6 +832,7 @@ export function EditorPanelContent({
value={editorDoc.source_markdown} value={editorDoc.source_markdown}
readOnly readOnly
onChange={() => {}} onChange={() => {}}
highlightLines={highlightLines}
/> />
</div> </div>
</div> </div>
@ -918,6 +923,8 @@ function DesktopEditorPanel() {
searchSpaceId={panelState.searchSpaceId ?? undefined} searchSpaceId={panelState.searchSpaceId ?? undefined}
title={panelState.title} title={panelState.title}
onClose={closePanel} onClose={closePanel}
highlightLines={panelState.highlightLines}
forceSourceView={panelState.forceSourceView}
/> />
</div> </div>
); );
@ -957,6 +964,8 @@ function MobileEditorDrawer() {
memoryScope={panelState.memoryScope ?? undefined} memoryScope={panelState.memoryScope ?? undefined}
searchSpaceId={panelState.searchSpaceId ?? undefined} searchSpaceId={panelState.searchSpaceId ?? undefined}
title={panelState.title} title={panelState.title}
highlightLines={panelState.highlightLines}
forceSourceView={panelState.forceSourceView}
/> />
</div> </div>
</DrawerContent> </DrawerContent>

View file

@ -3,9 +3,10 @@
import { type Descendant, KEYS } from "platejs"; import { type Descendant, KEYS } from "platejs";
import { createPlatePlugin, type PlateElementProps } from "platejs/react"; import { createPlatePlugin, type PlateElementProps } from "platejs/react";
import type { FC } from "react"; import type { FC } from "react";
import { InlineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation"; import { InlineCitation, LineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation";
import { import {
CITATION_REGEX, CITATION_REGEX,
type CitationToken,
type CitationUrlMap, type CitationUrlMap,
parseTextWithCitations, parseTextWithCitations,
} from "@/lib/citations/citation-parser"; } from "@/lib/citations/citation-parser";
@ -17,9 +18,12 @@ import {
*/ */
export type CitationElementNode = { export type CitationElementNode = {
type: "citation"; type: "citation";
kind: "chunk" | "doc" | "url"; kind: "chunk" | "doc" | "url" | "line";
chunkId?: number; chunkId?: number;
url?: string; url?: string;
documentId?: number;
startLine?: number;
endLine?: number;
/** Original literal token that produced this citation node. */ /** Original literal token that produced this citation node. */
rawText: string; rawText: string;
children: [{ text: "" }]; children: [{ text: "" }];
@ -33,11 +37,22 @@ const CitationElement: FC<PlateElementProps<CitationElementNode>> = ({
element, element,
}) => { }) => {
const isUrl = element.kind === "url"; const isUrl = element.kind === "url";
const isLine =
element.kind === "line" &&
element.documentId !== undefined &&
element.startLine !== undefined &&
element.endLine !== undefined;
return ( return (
<span {...attributes} className="inline-flex align-baseline"> <span {...attributes} className="inline-flex align-baseline">
<span contentEditable={false}> <span contentEditable={false}>
{isUrl && element.url ? ( {isUrl && element.url ? (
<UrlCitation url={element.url} /> <UrlCitation url={element.url} />
) : isLine ? (
<LineCitation
documentId={element.documentId as number}
startLine={element.startLine as number}
endLine={element.endLine as number}
/>
) : element.chunkId !== undefined ? ( ) : element.chunkId !== undefined ? (
<InlineCitation chunkId={element.chunkId} isDocsChunk={element.kind === "doc"} /> <InlineCitation chunkId={element.chunkId} isDocsChunk={element.kind === "doc"} />
) : null} ) : null}
@ -97,10 +112,7 @@ function copyMarks(textNode: SlateText): Record<string, unknown> {
return marks; return marks;
} }
function makeCitationElement( function makeCitationElement(rawText: string, segment: CitationToken): CitationElementNode {
rawText: string,
segment: { kind: "url"; url: string } | { kind: "chunk"; chunkId: number; isDocsChunk: boolean }
): CitationElementNode {
if (segment.kind === "url") { if (segment.kind === "url") {
return { return {
type: CITATION_TYPE, type: CITATION_TYPE,
@ -110,6 +122,17 @@ function makeCitationElement(
children: [{ text: "" }], children: [{ text: "" }],
}; };
} }
if (segment.kind === "line") {
return {
type: CITATION_TYPE,
kind: "line",
documentId: segment.documentId,
startLine: segment.startLine,
endLine: segment.endLine,
rawText,
children: [{ text: "" }],
};
}
return { return {
type: CITATION_TYPE, type: CITATION_TYPE,
kind: segment.isDocsChunk ? "doc" : "chunk", kind: segment.isDocsChunk ? "doc" : "chunk",

View file

@ -2,7 +2,7 @@
import dynamic from "next/dynamic"; import dynamic from "next/dynamic";
import { useTheme } from "next-themes"; import { useTheme } from "next-themes";
import { useEffect, useRef } from "react"; import { useCallback, useEffect, useRef } from "react";
import { Spinner } from "@/components/ui/spinner"; import { Spinner } from "@/components/ui/spinner";
const MonacoEditor = dynamic(() => import("@monaco-editor/react"), { const MonacoEditor = dynamic(() => import("@monaco-editor/react"), {
@ -17,6 +17,8 @@ interface SourceCodeEditorProps {
readOnly?: boolean; readOnly?: boolean;
fontSize?: number; fontSize?: number;
onSave?: () => Promise<void> | void; onSave?: () => Promise<void> | void;
/** 1-based inclusive line range to reveal and highlight (e.g. a citation). */
highlightLines?: { start: number; end: number } | null;
} }
export function SourceCodeEditor({ export function SourceCodeEditor({
@ -27,10 +29,45 @@ export function SourceCodeEditor({
readOnly = false, readOnly = false,
fontSize = 12, fontSize = 12,
onSave, onSave,
highlightLines = null,
}: SourceCodeEditorProps) { }: SourceCodeEditorProps) {
const { resolvedTheme } = useTheme(); const { resolvedTheme } = useTheme();
const onSaveRef = useRef(onSave); const onSaveRef = useRef(onSave);
const monacoRef = useRef<any>(null); const monacoRef = useRef<any>(null);
const editorRef = useRef<any>(null);
const decorationsRef = useRef<any>(null);
const highlightLinesRef = useRef(highlightLines);
highlightLinesRef.current = highlightLines;
// Drops any previous citation decoration, then highlights and centers the
// line range currently held in highlightLinesRef. All inputs are read through
// refs, so the callback needs no dependencies.
const applyHighlight = useCallback(() => {
	const editorInstance = editorRef.current;
	const monacoApi = monacoRef.current;
	if (!(editorInstance && monacoApi)) return;

	const stale = decorationsRef.current;
	if (stale) {
		stale.clear();
		decorationsRef.current = null;
	}

	const requested = highlightLinesRef.current;
	if (!requested) return;

	// Fall back to the requested end when the editor has no model to ask.
	const lastLine = editorInstance.getModel()?.getLineCount() ?? requested.end;
	// Floor the value, raise it to `floor`, then cap it at the last line.
	const clampLine = (line: number, floor: number): number =>
		Math.min(Math.max(floor, Math.floor(line)), lastLine);
	const firstLine = clampLine(requested.start, 1);
	const finalLine = clampLine(requested.end, firstLine);

	try {
		const decoration = {
			range: new monacoApi.Range(firstLine, 1, finalLine, 1),
			options: { isWholeLine: true, className: "citation-line-highlight" },
		};
		decorationsRef.current = editorInstance.createDecorationsCollection([decoration]);
	} catch {
		// Decoration failure must not block the reveal below.
	}
	editorInstance.revealLinesInCenter(firstLine, finalLine, monacoApi.editor.ScrollType.Immediate);
}, []);

// Re-apply whenever either bound of the requested range changes.
useEffect(() => {
	applyHighlight();
}, [applyHighlight, highlightLines?.start, highlightLines?.end]);
const normalizedModelPath = (() => { const normalizedModelPath = (() => {
const raw = (path || "local-file.txt").trim(); const raw = (path || "local-file.txt").trim();
const withLeadingSlash = raw.startsWith("/") ? raw : `/${raw}`; const withLeadingSlash = raw.startsWith("/") ? raw : `/${raw}`;
@ -104,7 +141,16 @@ export function SourceCodeEditor({
}} }}
onMount={(editor, monaco) => { onMount={(editor, monaco) => {
monacoRef.current = monaco; monacoRef.current = monaco;
editorRef.current = editor;
applySidebarTheme(monaco); applySidebarTheme(monaco);
// Reveal now, then once more after the first layout settles:
// the panel slide-in animation means the editor often has no
// usable viewport height on the initial frame.
applyHighlight();
const layoutSub = editor.onDidLayoutChange(() => {
applyHighlight();
layoutSub.dispose();
});
if (!isManualSaveEnabled) return; if (!isManualSaveEnabled) return;
editor.addCommand(monaco.KeyMod.CtrlCmd | monaco.KeyCode.KeyS, () => { editor.addCommand(monaco.KeyMod.CtrlCmd | monaco.KeyCode.KeyS, () => {
void onSaveRef.current?.(); void onSaveRef.current?.();

View file

@ -12,6 +12,7 @@ import { rightPanelCollapsedAtom, rightPanelTabAtom } from "@/atoms/layout/right
import { Button } from "@/components/ui/button"; import { Button } from "@/components/ui/button";
import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip"; import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
import { closeHitlEditPanelAtom, hitlEditPanelAtom } from "@/features/chat-messages/hitl"; import { closeHitlEditPanelAtom, hitlEditPanelAtom } from "@/features/chat-messages/hitl";
import { useMediaQuery } from "@/hooks/use-media-query";
import { cn } from "@/lib/utils"; import { cn } from "@/lib/utils";
import { DocumentsSidebar } from "../sidebar"; import { DocumentsSidebar } from "../sidebar";
@ -196,6 +197,9 @@ export function RightPanel({
const citationState = useAtomValue(citationPanelAtom); const citationState = useAtomValue(citationPanelAtom);
const closeCitation = useSetAtom(closeCitationPanelAtom); const closeCitation = useSetAtom(closeCitationPanelAtom);
const [collapsed, setCollapsed] = useAtom(rightPanelCollapsedAtom); const [collapsed, setCollapsed] = useAtom(rightPanelCollapsedAtom);
// Desktop-only surface; mobile uses the dedicated Mobile* drawers. Without
// this guard both render together and two editors fight over one model.
const isDesktop = useMediaQuery("(min-width: 1024px)");
const documentsOpen = documentsPanel?.open ?? false; const documentsOpen = documentsPanel?.open ?? false;
const reportOpen = reportState.isOpen && !!reportState.reportId; const reportOpen = reportState.isOpen && !!reportState.reportId;
@ -267,7 +271,7 @@ export function RightPanel({
<CollapseButton onClick={() => setCollapsed(true)} /> <CollapseButton onClick={() => setCollapsed(true)} />
) : null; ) : null;
if (!isVisible) return null; if (!isVisible || !isDesktop) return null;
return ( return (
<aside <aside
@ -308,6 +312,8 @@ export function RightPanel({
searchSpaceId={editorState.searchSpaceId ?? undefined} searchSpaceId={editorState.searchSpaceId ?? undefined}
title={editorState.title} title={editorState.title}
onClose={closeEditor} onClose={closeEditor}
highlightLines={editorState.highlightLines}
forceSourceView={editorState.forceSourceView}
/> />
</div> </div>
)} )}

View file

@ -70,10 +70,15 @@ export const documentWithChunks = document.extend({
id: z.number(), id: z.number(),
content: z.string(), content: z.string(),
created_at: z.string(), created_at: z.string(),
start_char: z.number().nullable().optional(),
end_char: z.number().nullable().optional(),
}) })
), ),
total_chunks: z.number().optional().default(0), total_chunks: z.number().optional().default(0),
chunk_start_index: z.number().optional().default(0), chunk_start_index: z.number().optional().default(0),
// 1-based inclusive line range of the cited chunk within source_markdown.
cited_start_line: z.number().nullable().optional(),
cited_end_line: z.number().nullable().optional(),
}); });
/** /**

View file

@ -18,12 +18,16 @@ import { FENCED_OR_INLINE_CODE } from "@/lib/markdown/code-regions";
* sometimes emit. * sometimes emit.
*/ */
export const CITATION_REGEX = export const CITATION_REGEX =
/[[【]\u200B?citation:\s*(https?:\/\/[^\]】\u200B]+|urlcite\d+|(?:doc-)?-?\d+(?:\s*,\s*(?:doc-)?-?\d+)*)\s*\u200B?[\]】]/g; /[[【]\u200B?citation:\s*(https?:\/\/[^\]】\u200B]+|urlcite\d+|d\d+#L\d+-\d+|(?:doc-)?-?\d+(?:\s*,\s*(?:doc-)?-?\d+)*)\s*\u200B?[\]】]/g;
/**
 * Matches the knowledge-base line-citation form `d<documentId>#L<start>-<end>`
 * (for example `d42#L10-18`).
 *
 * The pattern is anchored at both ends, so the entire input string must be a
 * line citation; it is meant to be run against an already-extracted citation
 * payload rather than against free text.
 *
 * Capture groups (each a run of one or more decimal digits):
 *   1. document id
 *   2. start line
 *   3. end line
 *
 * The pattern only checks the shape of the reference: it does not verify that
 * the start line is less than or equal to the end line.
 */
const LINE_CITATION_REGEX = /^d(\d+)#L(\d+)-(\d+)$/;
/** A single parsed citation reference. */ /** A single parsed citation reference. */
export type CitationToken = export type CitationToken =
| { kind: "url"; url: string } | { kind: "url"; url: string }
| { kind: "chunk"; chunkId: number; isDocsChunk: boolean }; | { kind: "chunk"; chunkId: number; isDocsChunk: boolean }
| { kind: "line"; documentId: number; startLine: number; endLine: number };
/** Output of `parseTextWithCitations` — interleaved text + citation tokens. */ /** Output of `parseTextWithCitations` — interleaved text + citation tokens. */
export type ParsedSegment = string | CitationToken; export type ParsedSegment = string | CitationToken;
@ -95,7 +99,15 @@ export function parseTextWithCitations(text: string, urlMap: CitationUrlMap): Pa
const captured = match[1]; const captured = match[1];
if (captured.startsWith("http://") || captured.startsWith("https://")) { const lineMatch = LINE_CITATION_REGEX.exec(captured);
if (lineMatch) {
segments.push({
kind: "line",
documentId: Number.parseInt(lineMatch[1], 10),
startLine: Number.parseInt(lineMatch[2], 10),
endLine: Number.parseInt(lineMatch[3], 10),
});
} else if (captured.startsWith("http://") || captured.startsWith("https://")) {
segments.push({ kind: "url", url: captured.trim() }); segments.push({ kind: "url", url: captured.trim() });
} else if (captured.startsWith("urlcite")) { } else if (captured.startsWith("urlcite")) {
const url = urlMap.get(captured); const url = urlMap.get(captured);