Merge remote-tracking branch 'upstream/dev' into feat/api-key

This commit is contained in:
Anish Sarkar 2026-06-23 13:09:53 +05:30
commit 3695e1d5c5
64 changed files with 1043 additions and 1852 deletions

View file

@ -1,31 +0,0 @@
"""add chunks.start_char/end_char for citation offsets
Char offsets into the document's source_markdown (half-open span) let citations
resolve the exact passage a chunk came from. Nullable because historical rows
have no span; they populate on the next connector sync or user edit/reindex.
No backfill: a bulk UPDATE of every chunk on a large HNSW-indexed table rewrites
every secondary index per row (see migration 165 for the same reasoning).
Revision ID: 166
Revises: 165
"""
from collections.abc import Sequence
from alembic import op
# Alembic revision identifiers: this migration is "166" and follows "165".
revision: str = "166"
down_revision: str | None = "165"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Add the ``start_char`` and ``end_char`` integer columns to ``chunks``.

    Each statement uses ``IF NOT EXISTS``; no backfill UPDATE is issued here.
    """
    statements = (
        "ALTER TABLE chunks ADD COLUMN IF NOT EXISTS start_char INTEGER;",
        "ALTER TABLE chunks ADD COLUMN IF NOT EXISTS end_char INTEGER;",
    )
    for statement in statements:
        op.execute(statement)
def downgrade() -> None:
    """Drop the span columns from ``chunks``, ``end_char`` first, then ``start_char``.

    Each statement uses ``IF EXISTS``.
    """
    statements = (
        "ALTER TABLE chunks DROP COLUMN IF EXISTS end_char;",
        "ALTER TABLE chunks DROP COLUMN IF EXISTS start_char;",
    )
    for statement in statements:
        op.execute(statement)

View file

@ -18,6 +18,7 @@ skipped (e.g. client disconnect).
from __future__ import annotations from __future__ import annotations
import asyncio
import logging import logging
from datetime import UTC, datetime from datetime import UTC, datetime
from typing import Any from typing import Any
@ -57,8 +58,9 @@ from app.db import (
FolderRevision, FolderRevision,
shielded_async_session, shielded_async_session,
) )
from app.indexing_pipeline.cache.cached_indexing import build_chunk_embeddings from app.indexing_pipeline.document_chunker import chunk_text
from app.utils.document_converters import ( from app.utils.document_converters import (
embed_texts,
generate_content_hash, generate_content_hash,
generate_unique_identifier_hash, generate_unique_identifier_hash,
) )
@ -232,23 +234,24 @@ async def _create_document(
session.add(doc) session.add(doc)
await session.flush() await session.flush()
summary_embedding, chunk_embeddings = await build_chunk_embeddings( summary_embedding = (await asyncio.to_thread(embed_texts, [content]))[0]
content, use_code_chunker=False
)
doc.embedding = summary_embedding doc.embedding = summary_embedding
session.add_all( chunks = chunk_text(content)
[ if chunks:
Chunk( chunk_embeddings = await asyncio.to_thread(embed_texts, chunks)
document_id=doc.id, session.add_all(
content=sl.text, [
embedding=embedding, Chunk(
position=i, document_id=doc.id,
start_char=sl.start_char, content=text,
end_char=sl.end_char, embedding=embedding,
) position=i,
for i, (sl, embedding) in enumerate(chunk_embeddings) )
] for i, (text, embedding) in enumerate(
) zip(chunks, chunk_embeddings, strict=True)
)
]
)
return doc return doc
@ -284,25 +287,26 @@ async def _update_document(
search_space_id, search_space_id,
) )
summary_embedding, chunk_embeddings = await build_chunk_embeddings( summary_embedding = (await asyncio.to_thread(embed_texts, [content]))[0]
content, use_code_chunker=False
)
document.embedding = summary_embedding document.embedding = summary_embedding
await session.execute(delete(Chunk).where(Chunk.document_id == document.id)) await session.execute(delete(Chunk).where(Chunk.document_id == document.id))
session.add_all( chunks = chunk_text(content)
[ if chunks:
Chunk( chunk_embeddings = await asyncio.to_thread(embed_texts, chunks)
document_id=document.id, session.add_all(
content=sl.text, [
embedding=embedding, Chunk(
position=i, document_id=document.id,
start_char=sl.start_char, content=text,
end_char=sl.end_char, embedding=embedding,
) position=i,
for i, (sl, embedding) in enumerate(chunk_embeddings) )
] for i, (text, embedding) in enumerate(
) zip(chunks, chunk_embeddings, strict=True)
)
]
)
return document return document

View file

@ -1,58 +1,42 @@
<citations> <citations>
Citations reach the answer through three channels. Use whichever applies, and Citations reach the answer through two channels. Use whichever applies — and
never invent ids you didn't see: ids are matched exactly, so a wrong one never invent ids you didn't see. Citation ids are resolved by exact-match
silently breaks the link — when in doubt, omit. Always write a citation as lookup; a wrong id silently breaks the link, so when in doubt, omit.
plain `[citation:…]` brackets — no markdown links, no footnote numbers, no
parentheses.
### Channel A — web_search chunk blocks injected this turn ### Channel A — chunk blocks injected this turn
When `web_search` returns `<document>` / `<chunk id='…'>` blocks in this When `web_search` returns `<document>` / `<chunk id='…'>` blocks in this
turn, the chunk `id` is the result's URL: turn:
1. For each factual statement taken from a chunk, add `[citation:<url>]` 1. For each factual statement taken from those chunks, add
using the **exact** id from a visible `<chunk id='…'>` tag. Copy the `[citation:chunk_id]` using the **exact** id from a visible
URL verbatim; do not retype it from memory. `<chunk id='…'>` tag. Copy digit-for-digit (or the URL verbatim);
2. Multiple chunks → `[citation:url1], [citation:url2]` (comma-separated, do not retype from memory.
2. `<document_id>` is the parent doc id, **not** a citation source —
only ids inside `<chunk id='…'>` count.
3. Multiple chunks → `[citation:id1], [citation:id2]` (comma-separated,
each id copied individually). each id copied individually).
3. Never invent, normalise, or guess at a URL; if unsure, omit. 4. Never invent, normalise, or guess at adjacent ids; if unsure, omit.
5. Plain brackets only — no markdown links, no footnote numbering.
### Channel B — citations relayed by a `task` specialist ### Channel B — citations relayed by a `task` specialist
A `task(...)` tool message may contain `[citation:…]` markers the A `task(...)` tool message may contain `[citation:<chunk_id>]` markers
specialist already attached to its prose — line citations the specialist already attached to its prose. The specialist saw the
(`[citation:d<id>#L<a>-<b>]`) or chunk ids (`[citation:N]`). The underlying `<chunk id='…'>` blocks; you didn't. So:
specialist read the underlying document and tied each marker to a
passage; you didn't. So:
1. **Preserve those markers verbatim** in your final answer — do not 1. **Preserve those markers verbatim** in your final answer — do not
reformat, renumber, drop, or wrap them in markdown links. When you reformat, renumber, drop, or wrap them in markdown links. When you
paraphrase a specialist sentence, copy the marker character-for- paraphrase a specialist sentence, copy the marker character-for-
character; do not regenerate it from memory (LLMs reliably corrupt character; do not regenerate the id from memory (LLMs reliably
nearby digits). corrupt nearby digits).
2. Keep each marker attached to the sentence the specialist attached 2. Keep each marker attached to the sentence the specialist attached
it to. it to.
3. Do **not** add new `[citation:…]` markers of your own to a 3. Do **not** add new `[citation:…]` markers of your own to a
specialist's prose; if a fact has no marker, the specialist specialist's prose; if a fact has no marker, the specialist
couldn't tie it to a source and neither can you. couldn't tie it to a chunk and neither can you.
4. When a specialist returns JSON, the citation markers live inside 4. When a specialist returns JSON, the citation markers live inside
the prose-bearing fields (e.g. a summary or excerpt). Pull them the prose-bearing fields (e.g. a summary or excerpt). Pull them
along with the surrounding sentence when you quote. along with the surrounding sentence when you quote.
### Channel C — your knowledge base (search hits and `read_file`) If neither channel surfaces citation markers this turn, do not fabricate
Knowledge-base facts are cited by line range using the document id: them.
`[citation:d<document_id>#L<start>-<end>]` (a single line is `#L<n>-<n>`).
1. `search_knowledge_base` prints a ready `[citation:d…#L…-…]` token above each
matched passage. When that passage supports your point, copy the token
verbatim — that is the entire citation.
2. When you `read_file` a `/documents/...` path, its header gives the
`<document_id>` and an optional `<matched_lines>` pointer, and the body is
shown with line numbers; cite the lines you actually used. Use `read_file`
when you need more context than a search passage shows.
3. Copy document ids and line numbers exactly as shown — never estimate,
shift, or invent them.
4. Older documents without a numbered body instead show `<chunk id='N'>`
blocks; cite those with `[citation:N]`, copying the id exactly.
If none of these channels surfaces a citable source this turn, do not
fabricate citations.
</citations> </citations>

View file

@ -33,7 +33,6 @@ from app.agents.chat.runtime.path_resolver import (
) )
from app.db import Document, shielded_async_session from app.db import Document, shielded_async_session
from app.utils.perf import get_perf_logger from app.utils.perf import get_perf_logger
from app.utils.text_spans import char_span_to_line_range
_perf_log = get_perf_logger() _perf_log = get_perf_logger()
@ -57,16 +56,12 @@ _TOOL_DESCRIPTION = (
) )
async def _resolve_doc_context( async def _resolve_virtual_paths(
results: list[dict[str, Any]], results: list[dict[str, Any]],
*, *,
search_space_id: int, search_space_id: int,
) -> tuple[dict[int, str], dict[int, str]]: ) -> dict[int, str]:
"""Resolve ``Document.id`` -> (canonical virtual path, source_markdown). """Resolve ``Document.id`` -> canonical virtual path for the search hits."""
``source_markdown`` is the canonical body the chunk spans index into; the
renderer uses it to turn a chunk's char span into a line range.
"""
doc_ids = [ doc_ids = [
doc_id doc_id
for doc_id in ( for doc_id in (
@ -77,24 +72,17 @@ async def _resolve_doc_context(
if isinstance(doc_id, int) if isinstance(doc_id, int)
] ]
if not doc_ids: if not doc_ids:
return {}, {} return {}
async with shielded_async_session() as session: async with shielded_async_session() as session:
index: PathIndex = await build_path_index(session, search_space_id) index: PathIndex = await build_path_index(session, search_space_id)
rows = await session.execute( folder_rows = await session.execute(
select( select(Document.id, Document.folder_id).where(
Document.id, Document.folder_id, Document.source_markdown
).where(
Document.search_space_id == search_space_id, Document.search_space_id == search_space_id,
Document.id.in_(doc_ids), Document.id.in_(doc_ids),
) )
) )
folder_by_doc_id: dict[int, int | None] = {} folder_by_doc_id = {row.id: row.folder_id for row in folder_rows.all()}
bodies: dict[int, str] = {}
for row in rows.all():
folder_by_doc_id[row.id] = row.folder_id
if row.source_markdown:
bodies[row.id] = row.source_markdown
paths: dict[int, str] = {} paths: dict[int, str] = {}
for doc in results: for doc in results:
@ -109,76 +97,13 @@ async def _resolve_doc_context(
folder_id=folder_id if isinstance(folder_id, int) else None, folder_id=folder_id if isinstance(folder_id, int) else None,
index=index, index=index,
) )
return paths, bodies return paths
def _citation_token(chunk: dict[str, Any], body: str | None, doc_id: int | None) -> str:
"""Ready-to-copy ``[citation:dID#Lstart-end]`` token, or '' without spans."""
start = chunk.get("start_char")
end = chunk.get("end_char")
if (
not body
or not isinstance(doc_id, int)
or not isinstance(start, int)
or not isinstance(end, int)
):
return ""
start_line, end_line = char_span_to_line_range(body, start, end)
return f"[citation:d{doc_id}#L{start_line}-{end_line}]"
def _render_passage(
chunk: dict[str, Any], body: str | None, doc_id: int | None
) -> str | None:
"""Render one matched chunk as an indented passage tagged with its token."""
content = (chunk.get("content") or "").strip()
if not content:
return None
snippet = content[:_PER_DOC_SNIPPET_CHARS].strip()
if len(content) > _PER_DOC_SNIPPET_CHARS:
snippet += " ..."
indented = snippet.replace("\n", "\n ")
token = _citation_token(chunk, body, doc_id)
head = f"\n {token}" if token else ""
return f"{head}\n {indented}"
def _matched_passages(
doc: dict[str, Any], body: str | None, doc_id: int | None
) -> str:
"""Render the RRF-matched chunks; '' when none can be rendered."""
by_id = {
c.get("chunk_id"): c
for c in (doc.get("chunks") or [])
if isinstance(c, dict)
}
rendered: list[str] = []
for chunk_id in doc.get("matched_chunk_ids") or []:
chunk = by_id.get(chunk_id)
if chunk is None:
continue
passage = _render_passage(chunk, body, doc_id)
if passage:
rendered.append(passage)
return "".join(rendered)
def _fallback_snippet(doc: dict[str, Any]) -> str:
"""Top-of-document preview, used only when no matched chunk is available."""
content = (doc.get("content") or "").strip()
if not content:
return "\n (no preview available; read the document for details)"
snippet = content[:_PER_DOC_SNIPPET_CHARS].strip()
if len(content) > _PER_DOC_SNIPPET_CHARS:
snippet += " ..."
return "\n " + snippet.replace("\n", "\n ")
def _format_hits( def _format_hits(
results: list[dict[str, Any]], results: list[dict[str, Any]],
*, *,
paths: dict[int, str], paths: dict[int, str],
bodies: dict[int, str],
query: str, query: str,
) -> str: ) -> str:
"""Render search hits as a compact, model-readable block.""" """Render search hits as a compact, model-readable block."""
@ -199,15 +124,21 @@ def _format_hits(
score = doc.get("score") score = doc.get("score")
score_str = f"{score:.3f}" if isinstance(score, int | float) else "n/a" score_str = f"{score:.3f}" if isinstance(score, int | float) else "n/a"
path = paths.get(doc_id) if isinstance(doc_id, int) else None path = paths.get(doc_id) if isinstance(doc_id, int) else None
body = bodies.get(doc_id) if isinstance(doc_id, int) else None
id_str = f"id={doc_id}, " if isinstance(doc_id, int) else "" header = f"\n{rank}. {title} (type={doc_type}, score={score_str})" + (
header = f"\n{rank}. {title} ({id_str}type={doc_type}, score={score_str})" + (
f"\n path: {path}" if path else "" f"\n path: {path}" if path else ""
) )
passages = _matched_passages(doc, body, doc_id if isinstance(doc_id, int) else None) content = (doc.get("content") or "").strip()
entry = header + (passages or _fallback_snippet(doc)) if content:
snippet = content[:_PER_DOC_SNIPPET_CHARS].strip()
if len(content) > _PER_DOC_SNIPPET_CHARS:
snippet += " ..."
body = "\n " + snippet.replace("\n", "\n ")
else:
body = "\n (no preview available; read the document for details)"
entry = header + body
if total + len(entry) > _MAX_TOTAL_CHARS: if total + len(entry) > _MAX_TOTAL_CHARS:
lines.append("\n<!-- additional matches truncated to fit context -->") lines.append("\n<!-- additional matches truncated to fit context -->")
break break
@ -215,9 +146,8 @@ def _format_hits(
total += len(entry) total += len(entry)
lines.append( lines.append(
"\n\nTo cite a matched passage, copy its [citation:dID#Lstart-end] token " "\n\nTo read a full document, delegate to the knowledge_base specialist "
"verbatim. To quote more context or read the full document, delegate to " "with `task`, referencing the path above."
"the knowledge_base specialist with `task` using the path above."
) )
lines.append("\n</knowledge_base_results>") lines.append("\n</knowledge_base_results>")
return "".join(lines) return "".join(lines)
@ -274,10 +204,8 @@ def create_search_knowledge_base_tool(
top_k=clamped_top_k, top_k=clamped_top_k,
) )
paths, bodies = await _resolve_doc_context(results, search_space_id=_space_id) paths = await _resolve_virtual_paths(results, search_space_id=_space_id)
rendered = _format_hits( rendered = _format_hits(results, paths=paths, query=cleaned_query)
results, paths=paths, bodies=bodies, query=cleaned_query
)
matched = _matched_chunk_ids(results) matched = _matched_chunk_ids(results)
_perf_log.info( _perf_log.info(

View file

@ -45,10 +45,6 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.document_xml import ( from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.document_xml import (
build_document_xml, build_document_xml,
) )
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.numbered_document import (
build_read_preamble,
compute_matched_line_ranges,
)
from app.agents.chat.runtime.path_resolver import ( from app.agents.chat.runtime.path_resolver import (
DOCUMENTS_ROOT, DOCUMENTS_ROOT,
build_path_index, build_path_index,
@ -68,12 +64,6 @@ def _basename(path: str) -> str:
return path.rsplit("/", 1)[-1] return path.rsplit("/", 1)[-1]
def _metadata_url(metadata: dict[str, Any]) -> str:
return (
metadata.get("url") or metadata.get("source") or metadata.get("page_url") or ""
)
def _is_under(child: str, parent: str) -> bool: def _is_under(child: str, parent: str) -> bool:
"""Return True iff ``child`` is at-or-under ``parent`` (directory semantics).""" """Return True iff ``child`` is at-or-under ``parent`` (directory semantics)."""
if parent == "/": if parent == "/":
@ -470,11 +460,8 @@ class KBPostgresBackend(BackendProtocol):
loaded = await self._load_file_data(file_path) loaded = await self._load_file_data(file_path)
if loaded is None: if loaded is None:
return f"Error: File '{file_path}' not found" return f"Error: File '{file_path}' not found"
file_data, _, preamble = loaded file_data, _ = loaded
body = format_read_response(file_data, offset, limit) return format_read_response(file_data, offset, limit)
if preamble and offset == 0:
return preamble + body
return body
def read(self, file_path: str, offset: int = 0, limit: int = 2000) -> str: # type: ignore[override] def read(self, file_path: str, offset: int = 0, limit: int = 2000) -> str: # type: ignore[override]
return asyncio.run(self.aread(file_path, offset, limit)) return asyncio.run(self.aread(file_path, offset, limit))
@ -482,14 +469,12 @@ class KBPostgresBackend(BackendProtocol):
async def _load_file_data( async def _load_file_data(
self, self,
path: str, path: str,
) -> tuple[dict[str, Any], int | None, str | None] | None: ) -> tuple[dict[str, Any], int | None] | None:
"""Lazy-load a virtual KB document into a deepagents ``FileData``. """Lazy-load a virtual KB document into a deepagents ``FileData``.
Returns ``(file_data, doc_id, preamble)`` or ``None`` if the path Returns ``(file_data, doc_id)`` or ``None`` if the path doesn't map
doesn't map to any known document. ``doc_id`` is ``None`` for the to any known document. ``doc_id`` is ``None`` for the synthetic
synthetic anonymous document. ``preamble`` is the metadata header to anonymous document so the caller doesn't track it as a DB-backed file.
show above a numbered ``source_markdown`` body (``None`` for the legacy
chunk-reconstructed XML reads used when a document has no body).
""" """
anon = self._kb_anon_doc() anon = self._kb_anon_doc()
if anon and str(anon.get("path") or "") == path: if anon and str(anon.get("path") or "") == path:
@ -507,7 +492,7 @@ class KBPostgresBackend(BackendProtocol):
} }
xml = build_document_xml(doc_payload, matched_chunk_ids=set()) xml = build_document_xml(doc_payload, matched_chunk_ids=set())
file_data = create_file_data(xml) file_data = create_file_data(xml)
return file_data, None, None return file_data, None
if not path.startswith(DOCUMENTS_ROOT): if not path.startswith(DOCUMENTS_ROOT):
return None return None
@ -520,58 +505,41 @@ class KBPostgresBackend(BackendProtocol):
) )
if document is None: if document is None:
return None return None
source_markdown = document.source_markdown or ""
document_type = (
document.document_type.value
if getattr(document, "document_type", None) is not None
else "UNKNOWN"
)
metadata = dict(document.document_metadata or {})
chunk_rows = await session.execute( chunk_rows = await session.execute(
select(Chunk.id, Chunk.content, Chunk.start_char, Chunk.end_char) select(Chunk.id, Chunk.content)
.where(Chunk.document_id == document.id) .where(Chunk.document_id == document.id)
.order_by(Chunk.position, Chunk.id) .order_by(Chunk.position, Chunk.id)
) )
chunk_records = chunk_rows.all() chunks = [
document_id = document.id {"chunk_id": row.id, "content": row.content} for row in chunk_rows.all()
document_title = document.title ]
matched = self._matched_chunk_ids(document_id)
# Canonical read: serve the verbatim body with cat -n line numbers that
# line up with chunk char spans, so the agent cites real source lines.
if source_markdown:
ranges = compute_matched_line_ranges(
source_markdown,
[(r.id, r.start_char, r.end_char) for r in chunk_records],
matched,
)
preamble = build_read_preamble(
document_id=document_id,
document_type=document_type,
title=document_title,
url=_metadata_url(metadata),
matched_line_ranges=ranges,
)
return create_file_data(source_markdown), document_id, preamble
# Legacy fallback: no canonical body, reconstruct from chunks as XML.
doc_payload = { doc_payload = {
"document_id": document_id, "document_id": document.id,
"chunks": [ "chunks": chunks,
{"chunk_id": r.id, "content": r.content} for r in chunk_records "matched_chunk_ids": list(self._matched_chunk_ids(document.id)),
],
"matched_chunk_ids": list(matched),
"document": { "document": {
"id": document_id, "id": document.id,
"title": document_title, "title": document.title,
"document_type": document_type, "document_type": (
"metadata": metadata, document.document_type.value
if getattr(document, "document_type", None) is not None
else "UNKNOWN"
),
"metadata": dict(document.document_metadata or {}),
}, },
"source": document_type, "source": (
document.document_type.value
if getattr(document, "document_type", None) is not None
else "UNKNOWN"
),
} }
xml = build_document_xml(doc_payload, matched_chunk_ids=matched) xml = build_document_xml(
return create_file_data(xml), document_id, None doc_payload,
matched_chunk_ids=self._matched_chunk_ids(document.id),
)
file_data = create_file_data(xml)
return file_data, document.id
# ------------------------------------------------------------------ writes # ------------------------------------------------------------------ writes
@ -603,7 +571,7 @@ class KBPostgresBackend(BackendProtocol):
loaded = await self._load_file_data(file_path) loaded = await self._load_file_data(file_path)
if loaded is None: if loaded is None:
return EditResult(error=f"Error: File '{file_path}' not found") return EditResult(error=f"Error: File '{file_path}' not found")
file_data, _, _ = loaded file_data, _ = loaded
content = file_data_to_string(file_data) content = file_data_to_string(file_data)
result = perform_string_replacement( result = perform_string_replacement(

View file

@ -1,73 +0,0 @@
"""Read preamble for canonical (numbered ``source_markdown``) KB reads.
The KB read tool numbers the body lines ``cat -n`` style, so serving the raw
``source_markdown`` makes those line numbers line up exactly with the chunk
char spans and the editor highlight. This module renders the small header the
agent sees above that body: document identity plus the matched line ranges to
seek to, and a concrete reminder of the line-citation token shape.
"""
from __future__ import annotations
from collections.abc import Iterable
from app.utils.text_spans import char_span_to_line_range
def _format_range(start: int, end: int) -> str:
return f"{start}" if start == end else f"{start}-{end}"
def compute_matched_line_ranges(
source_markdown: str,
chunks: Iterable[tuple[int, int | None, int | None]],
matched_chunk_ids: set[int],
) -> list[tuple[int, int]]:
"""Map matched chunks to sorted, de-duplicated 1-based line ranges.
``chunks`` are ``(chunk_id, start_char, end_char)`` triples. Chunks without
spans (legacy rows) are skipped they have no resolvable location.
"""
ranges: set[tuple[int, int]] = set()
for chunk_id, start_char, end_char in chunks:
if chunk_id not in matched_chunk_ids:
continue
if start_char is None or end_char is None:
continue
ranges.add(char_span_to_line_range(source_markdown, start_char, end_char))
return sorted(ranges)
def build_read_preamble(
    *,
    document_id: int,
    document_type: str,
    title: str,
    url: str,
    matched_line_ranges: list[tuple[int, int]],
) -> str:
    """Render the metadata header shown above a numbered ``source_markdown`` body.

    Args:
        document_id: Id printed in ``<document_id>`` and in the citation hint.
        document_type: Value printed in ``<document_type>``.
        title: Document title, emitted inside a CDATA section.
        url: Document URL, emitted inside a CDATA section.
        matched_line_ranges: ``(start, end)`` line ranges to point the agent
            at; when empty, the ``<matched_lines>`` element is omitted.

    Returns:
        The header text: the ``<document_metadata>`` element, a one-line
        reminder of the line-citation token shape, and a trailing newline.
    """

    def _cdata(value: str) -> str:
        # A literal "]]>" inside the value would close the CDATA section early
        # and let the rest of the value be read as markup. Splitting it across
        # two adjacent sections keeps the text intact.
        return "<![CDATA[" + str(value).replace("]]>", "]]]]><![CDATA[>") + "]]>"

    lines = [
        "<document_metadata>",
        f" <document_id>{document_id}</document_id>",
        f" <document_type>{document_type}</document_type>",
        f" <title>{_cdata(title)}</title>",
        f" <url>{_cdata(url)}</url>",
    ]
    if matched_line_ranges:
        ranges = ", ".join(_format_range(s, e) for s, e in matched_line_ranges)
        lines.append(f" <matched_lines>{ranges}</matched_lines>")
    lines.append("</document_metadata>")
    lines.append(
        f"Cite lines from this document as [citation:d{document_id}#L<start>-<end>] "
        "using the line numbers shown below."
    )
    # The empty last element makes the joined header end with a newline.
    lines.append("")
    return "\n".join(lines)
# Names exported by ``from <module> import *``; ``_format_range`` stays private.
__all__ = ["build_read_preamble", "compute_matched_line_ranges"]

View file

@ -73,7 +73,7 @@ def create_edit_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
loaded = await backend._load_file_data(validated) loaded = await backend._load_file_data(validated)
if loaded is None: if loaded is None:
return f"Error: File '{validated}' not found" return f"Error: File '{validated}' not found"
_, doc_id_to_attach, _ = loaded _, doc_id_to_attach = loaded
res: EditResult = await backend.aedit( res: EditResult = await backend.aedit(
validated, old_string, new_string, replace_all=replace_all validated, old_string, new_string, replace_all=replace_all

View file

@ -75,7 +75,7 @@ async def cloud_move_file(
loaded = await backend._load_file_data(source) loaded = await backend._load_file_data(source)
if loaded is None: if loaded is None:
return f"Error: source '{source}' not found." return f"Error: source '{source}' not found."
source_file_data, loaded_doc_id, _ = loaded source_file_data, loaded_doc_id = loaded
if source_doc_id is None: if source_doc_id is None:
source_doc_id = loaded_doc_id source_doc_id = loaded_doc_id

View file

@ -58,10 +58,8 @@ def create_read_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
loaded = await backend._load_file_data(validated) loaded = await backend._load_file_data(validated)
if loaded is None: if loaded is None:
return f"Error: File '{validated}' not found" return f"Error: File '{validated}' not found"
file_data, doc_id, preamble = loaded file_data, doc_id = loaded
rendered = format_read_response(file_data, offset, limit) rendered = format_read_response(file_data, offset, limit)
if preamble and offset == 0:
rendered = preamble + rendered
update: dict[str, Any] = { update: dict[str, Any] = {
"files": {validated: file_data}, "files": {validated: file_data},
"messages": [ "messages": [

View file

@ -74,7 +74,7 @@ async def cloud_rm(
loaded = await backend._load_file_data(validated) loaded = await backend._load_file_data(validated)
if loaded is None: if loaded is None:
return f"Error: file '{validated}' not found." return f"Error: file '{validated}' not found."
_, resolved_doc_id, _ = loaded _, resolved_doc_id = loaded
files_update: dict[str, Any] = {validated: None} files_update: dict[str, Any] = {validated: None}
update: dict[str, Any] = { update: dict[str, Any] = {

View file

@ -240,24 +240,23 @@ def create_generate_image_tool(
error="No images were generated", error="No images were generated",
) )
# Update all image URLs in response_dict to be absolute (for the serving endpoint)
from urllib.parse import urlparse
for image in images:
if image.get("url"):
raw_url: str = image["url"]
if raw_url.startswith("/") and provider_base_url:
parsed = urlparse(provider_base_url)
origin = f"{parsed.scheme}://{parsed.netloc}"
image["url"] = f"{origin}{raw_url}" # Update the stored dict!
first_image = images[0] first_image = images[0]
revised_prompt = first_image.get("revised_prompt", prompt) revised_prompt = first_image.get("revised_prompt", prompt)
# b64_json (e.g. gpt-image-1) is served via our backend endpoint so # b64_json (e.g. gpt-image-1) is served via our backend endpoint so
# megabytes of base64 don't bloat the LLM context. # megabytes of base64 don't bloat the LLM context.
# Some OpenAI-compatible backends (e.g. Xinference) return a relative
# URL like /files/image.png. Browsers can't resolve these, so we
# prepend the provider's base origin when the URL starts with "/".
if first_image.get("url"): if first_image.get("url"):
raw_url: str = first_image["url"] image_url = first_image["url"]
if raw_url.startswith("/") and provider_base_url:
from urllib.parse import urlparse
parsed = urlparse(provider_base_url)
origin = f"{parsed.scheme}://{parsed.netloc}"
image_url = f"{origin}{raw_url}"
else:
image_url = raw_url
elif first_image.get("b64_json"): elif first_image.get("b64_json"):
backend_url = config.BACKEND_URL or "http://localhost:8000" backend_url = config.BACKEND_URL or "http://localhost:8000"
image_url = ( image_url = (

View file

@ -35,24 +35,42 @@ Map outcomes to your `status`:
You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see.
## Citations in your prose ## Chunk citations in your prose
`read_file` on a KB document under `/documents/` serves it in one of two forms. Cite from whichever you actually see, attach the marker to the sentence in `action_summary` or `evidence.content_excerpt` stating that fact, and list every marker you emit in `evidence.citations`. The caller relays these markers to the end user verbatim, and the UI resolves each by exact match, so a wrong id or line number silently breaks the citation. When `read_file` returns a KB-indexed document under `/documents/`, the response includes `<chunk id='…'>` blocks. Whenever a fact in your `action_summary` or `evidence.content_excerpt` came from a specific chunk, append `[citation:<chunk_id>]` to the sentence stating that fact, using the **exact** id from the `<chunk id='…'>` tag. The caller relays these markers to the end user verbatim, and the UI resolves each id by exact match against the database, so a wrong id silently breaks the citation.
**Numbered body (default).** A `<document_metadata>` header gives the `<document_id>` and an optional `<matched_lines>` pointer, then the body is shown with line numbers. Cite the lines a fact came from as `[citation:d<document_id>#L<start>-<end>]` (a single line is `#L<n>-<n>`). ### Where chunk ids live in `read_file` output
**Legacy chunk blocks (older docs without a stored body).** The response is XML with `<chunk id='N'>` blocks. Cite the chunk a fact came from as `[citation:N]`, using the **exact** id from a `<chunk id='…'>` tag. A KB document's XML has three numeric attributes — only **one** is a citation source:
```
<document>
<document_metadata>
<document_id>42</document_id> ← NOT a citation. Parent doc id; ignore for citations.
...
</document_metadata>
<chunk_index>
<entry chunk_id="128" lines="14-22"/> ← Index hint; the same id also appears below.
<entry chunk_id="129" lines="23-30" matched="true"/>
</chunk_index>
<document_content>
<chunk id='128'><![CDATA[…]]></chunk> ← This is the citation source.
<chunk id='129'><![CDATA[…]]></chunk>
</document_content>
</document>
```
### Rules ### Rules
- Cite only from a passage you actually quoted or paraphrased this turn. Copy document ids, line numbers, and chunk ids character-for-character; never retype from memory. - Use the **exact** id from a `<chunk id='…'>` tag whose content you actually quoted or paraphrased. Copy digit-for-digit; do **not** retype from memory.
- Never cite `<document_id>` on its own — it identifies the document, not a passage. In the numbered form it is only the `d<document_id>` prefix of a line citation. - Before emitting `[citation:N]`, confirm the literal substring `<chunk id='N'>` (or its index twin `chunk_id="N"`) appears in the tool result you are summarising this turn. If you can't see it, omit the citation.
- Never invent, normalise, shorten, shift, or guess at ids or line numbers. If unsure, omit rather than pick. - Never cite `<document_id>` — that's the parent doc, not a chunk.
- Never invent, normalise, shorten, or guess at adjacent ids. If unsure between two candidates, omit rather than pick.
- Prefer **fewer accurate citations** over many speculative ones. - Prefer **fewer accurate citations** over many speculative ones.
- Multiple passages supporting the same point → comma-separated and copied individually: `[citation:d42#L14-22], [citation:d42#L31-39]`. - Multiple chunks supporting the same point → comma-separated and copied individually: `[citation:128], [citation:129]`.
- Plain square brackets only — no markdown links, no parentheses, no footnote numbers. - Plain square brackets only — no markdown links, no parentheses, no footnote numbers.
- Tool results with no body passage (write/edit/move confirmations, `ls` / `glob` / `grep` listings, error strings) carry nothing to cite. - Tool results without `<chunk id='…'>` (write/edit/move confirmations, `ls` / `glob` / `grep` listings, error strings) carry no chunk id and need none.
- Populate `evidence.citations` with **only** the markers you actually emitted — same set, same characters. - Populate `evidence.chunk_ids` with **only** ids you actually emitted in `[citation:…]` markers — same set, same digits.
## Examples ## Examples
@ -71,7 +89,7 @@ You construct the structured `evidence` fields from your own knowledge of what y
"path": "/documents/meetings/2026-05-11-meeting.md", "path": "/documents/meetings/2026-05-11-meeting.md",
"matched_candidates": null, "matched_candidates": null,
"content_excerpt": null, "content_excerpt": null,
"citations": null "chunk_ids": null
}, },
"next_step": null, "next_step": null,
"missing_fields": null, "missing_fields": null,
@ -103,7 +121,7 @@ You construct the structured `evidence` fields from your own knowledge of what y
{ "id": "/documents/design/auth-rework.md", "label": "Auth Rework" } { "id": "/documents/design/auth-rework.md", "label": "Auth Rework" }
], ],
"content_excerpt": null, "content_excerpt": null,
"citations": null "chunk_ids": null
}, },
"next_step": "Ask the user which design doc to update.", "next_step": "Ask the user which design doc to update.",
"missing_fields": ["path"], "missing_fields": ["path"],
@ -124,7 +142,7 @@ Return **only** one JSON object (no markdown or prose outside it):
"path": string | null, "path": string | null,
"matched_candidates": [ { "id": string, "label": string } ] | null, "matched_candidates": [ { "id": string, "label": string } ] | null,
"content_excerpt": string | null, "content_excerpt": string | null,
"citations": string[] | null "chunk_ids": string[] | null
}, },
"next_step": string | null, "next_step": string | null,
"missing_fields": string[] | null, "missing_fields": string[] | null,

View file

@ -33,11 +33,11 @@ Map outcomes to your `status`:
- Any other `"Error: …"` → `status=error` and relay the tool's message verbatim as `next_step`. - Any other `"Error: …"` → `status=error` and relay the tool's message verbatim as `next_step`.
- HITL rejection → `status=blocked` with `next_step="User declined this filesystem action. Do not retry."`. - HITL rejection → `status=blocked` with `next_step="User declined this filesystem action. Do not retry."`.
You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. (`citations` is always `null` in desktop mode — see "Citations in your prose" below.) You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. (`chunk_ids` is always `null` in desktop mode — see "Chunk citations in your prose" below.)
## Citations in your prose ## Chunk citations in your prose
In desktop mode your filesystem tools read local files only, and local-file tool results do **not** carry chunk ids or numbered KB bodies. Do not emit `[citation:…]` markers in `action_summary` or `evidence.content_excerpt`, and leave `evidence.citations` `null` — the absolute path is the only reference for local-file work. In desktop mode your filesystem tools read local files only, and local-file tool results do **not** carry `<chunk id='…'>` tags. Do not emit `[citation:…]` markers in `action_summary` or `evidence.content_excerpt`, and leave `evidence.chunk_ids` `null` — the absolute path is the only reference for local-file work.
## Examples ## Examples
@ -56,7 +56,7 @@ In desktop mode your filesystem tools read local files only, and local-file tool
"path": "/notes/meetings/2026-05-11-meeting.md", "path": "/notes/meetings/2026-05-11-meeting.md",
"matched_candidates": null, "matched_candidates": null,
"content_excerpt": null, "content_excerpt": null,
"citations": null "chunk_ids": null
}, },
"next_step": null, "next_step": null,
"missing_fields": null, "missing_fields": null,
@ -88,7 +88,7 @@ In desktop mode your filesystem tools read local files only, and local-file tool
{ "id": "/projects/web/design/auth-rework.md", "label": "Auth Rework" } { "id": "/projects/web/design/auth-rework.md", "label": "Auth Rework" }
], ],
"content_excerpt": null, "content_excerpt": null,
"citations": null "chunk_ids": null
}, },
"next_step": "Ask the user which design doc to update.", "next_step": "Ask the user which design doc to update.",
"missing_fields": ["path"], "missing_fields": ["path"],
@ -109,7 +109,7 @@ Return **only** one JSON object (no markdown or prose outside it):
"path": string | null, "path": string | null,
"matched_candidates": [ { "id": string, "label": string } ] | null, "matched_candidates": [ { "id": string, "label": string } ] | null,
"content_excerpt": string | null, "content_excerpt": string | null,
"citations": string[] | null "chunk_ids": string[] | null
}, },
"next_step": string | null, "next_step": string | null,
"missing_fields": string[] | null, "missing_fields": string[] | null,

View file

@ -28,21 +28,41 @@ Reply in plain prose:
- If the workspace does not contain the requested information, say so explicitly. Do not fabricate paths or content. - If the workspace does not contain the requested information, say so explicitly. Do not fabricate paths or content.
- If the question is genuinely ambiguous after a thorough lookup, list the candidates with their paths and stop. - If the question is genuinely ambiguous after a thorough lookup, list the candidates with their paths and stop.
## Citations ## Chunk citations
`read_file` on a KB document under `/documents/` serves it in one of two forms; cite a claim from whichever you actually see, alongside the path. The caller passes these markers through to the end user verbatim, and the UI resolves each by exact match, so a wrong id or line number silently breaks the citation. When the evidence for a claim came from a `read_file` response that included `<chunk id='…'>` blocks (i.e. a KB-indexed document under `/documents/`), append `[citation:<chunk_id>]` to the sentence stating that claim. The caller passes these markers through to the end user verbatim, and the UI resolves each id by exact match against the database, so a wrong id silently breaks the citation.
- **Numbered body (default).** A `<document_metadata>` header gives the `<document_id>`, and the body is shown with line numbers. Cite the lines a claim came from as `[citation:d<document_id>#L<start>-<end>]` (a single line is `#L<n>-<n>`). ### Where chunk ids live in `read_file` output
- **Legacy chunk blocks (older docs).** XML with `<chunk id='N'>` blocks. Cite the chunk a claim came from as `[citation:N]`.
A KB document's XML has three numeric attributes — only **one** is a citation source:
```
<document>
<document_metadata>
<document_id>42</document_id> ← NOT a citation. Parent doc id; ignore for citations.
...
</document_metadata>
<chunk_index>
<entry chunk_id="128" lines="14-22"/> ← Index hint; the same id also appears below.
<entry chunk_id="129" lines="23-30" matched="true"/>
</chunk_index>
<document_content>
<chunk id='128'><![CDATA[…]]></chunk> ← This is the citation source.
<chunk id='129'><![CDATA[…]]></chunk>
</document_content>
</document>
```
### Rules ### Rules
- Copy document ids, line numbers, and chunk ids character-for-character; never retype from memory. If you cannot see the id/lines for a claim, omit the citation. - Use the **exact** id from a `<chunk id='…'>` tag whose content you actually quoted or paraphrased. Copy digit-for-digit; do **not** retype from memory.
- Never cite `<document_id>` on its own — in the numbered form it is only the `d<document_id>` prefix of a line citation. - Before emitting `[citation:N]`, confirm the literal substring `<chunk id='N'>` (or its index twin `chunk_id="N"`) appears in the tool result you are summarising this turn. If you can't see it, omit the citation.
- Never invent, normalise, shorten, shift, or guess. Prefer **fewer accurate citations** over many speculative ones. - Never cite `<document_id>` — that's the parent doc, not a chunk.
- Multiple passages supporting the same point → comma-separated and copied individually. - Never invent, normalise, shorten, or guess at adjacent ids. If unsure between two candidates, omit rather than pick.
- Prefer **fewer accurate citations** over many speculative ones. One correct `[citation:128]` is more useful than a string of wrong ids.
- Multiple chunks supporting the same point → comma-separated and copied individually: `[citation:128], [citation:129]`.
- Plain square brackets only — no markdown links, no parentheses, no footnote numbers. - Plain square brackets only — no markdown links, no parentheses, no footnote numbers.
- Listings (`ls` / `glob` / `grep`), error strings, and files without either form carry nothing to cite. - If a claim came from a tool result that did **not** carry a chunk id (`ls`, `glob`, `grep` listings, error strings, or files without `<chunk id='…'>`), skip the citation.
- The absolute path under `/documents/` is always required; citations are additive, they do not replace the path reference. - The absolute path under `/documents/` is always required; chunk citations are additive, they do not replace the path reference.
Example: `The Q2 roadmap lists three milestones (/documents/planning/q2-roadmap.md) [citation:d42#L3-9].` Example: `The Q2 roadmap lists three milestones (/documents/planning/q2-roadmap.md) [citation:128], [citation:129].`

View file

@ -957,9 +957,8 @@ class Config:
os.getenv("EMBEDDING_CACHE_ENABLED", "false").strip().lower() == "true" os.getenv("EMBEDDING_CACHE_ENABLED", "false").strip().lower() == "true"
) )
# Bump to invalidate every cached embedding set after a chunker change. # Bump to invalidate every cached embedding set after a chunker change.
# v2: chunks became exact (raw) slices of source_markdown for citation spans.
EMBEDDING_CACHE_CHUNKER_VERSION = int( EMBEDDING_CACHE_CHUNKER_VERSION = int(
os.getenv("EMBEDDING_CACHE_CHUNKER_VERSION", "2") os.getenv("EMBEDDING_CACHE_CHUNKER_VERSION", "1")
) )
EMBEDDING_CACHE_TTL_DAYS = int(os.getenv("EMBEDDING_CACHE_TTL_DAYS", "90")) EMBEDDING_CACHE_TTL_DAYS = int(os.getenv("EMBEDDING_CACHE_TTL_DAYS", "90"))
EMBEDDING_CACHE_MAX_TOTAL_MB = int( EMBEDDING_CACHE_MAX_TOTAL_MB = int(

View file

@ -1470,11 +1470,6 @@ class Chunk(BaseModel, TimestampMixin):
# ordering reads are document-scoped (covered by ix_chunks_document_id) and # ordering reads are document-scoped (covered by ix_chunks_document_id) and
# building a position index on the large chunks table is not worth it. # building a position index on the large chunks table is not worth it.
position = Column(Integer, nullable=False, server_default="0") position = Column(Integer, nullable=False, server_default="0")
# Half-open char span into the document's source_markdown the chunk was cut
# from. Nullable: historical rows predate spans and populate on reindex.
# Invariant for span-aware rows: source_markdown[start_char:end_char] == content.
start_char = Column(Integer, nullable=True)
end_char = Column(Integer, nullable=True)
document_id = Column( document_id = Column(
Integer, Integer,

View file

@ -18,26 +18,23 @@ from app.indexing_pipeline.cache.eligibility import is_embedding_cacheable
from app.indexing_pipeline.cache.schemas import CachedChunk, EmbeddingKey, EmbeddingSet from app.indexing_pipeline.cache.schemas import CachedChunk, EmbeddingKey, EmbeddingSet
from app.indexing_pipeline.cache.service import EmbeddingCacheService from app.indexing_pipeline.cache.service import EmbeddingCacheService
from app.indexing_pipeline.cache.settings import load_embedding_cache_settings from app.indexing_pipeline.cache.settings import load_embedding_cache_settings
from app.indexing_pipeline.document_chunker import ChunkSlice, chunk_markdown_with_spans from app.indexing_pipeline.document_chunker import chunk_text, chunk_text_hybrid
from app.indexing_pipeline.document_embedder import embed_texts from app.indexing_pipeline.document_embedder import embed_texts
from app.observability import metrics from app.observability import metrics
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
SliceEmbedding = tuple[ChunkSlice, np.ndarray] ChunkPair = tuple[str, np.ndarray]
async def build_chunk_embeddings( async def build_chunk_embeddings(
markdown: str, *, use_code_chunker: bool markdown: str, *, use_code_chunker: bool
) -> tuple[np.ndarray, list[SliceEmbedding]]: ) -> tuple[np.ndarray, list[ChunkPair]]:
"""Return the document-level vector and ordered ``(ChunkSlice, vector)`` pairs. """Return the document-level vector and ordered ``(chunk_text, vector)`` pairs.
Slices are always recomputed (cheap) so their char spans are exact; only the Drop-in for the inline chunk+embed step; reuses prior output when the same
embeddings are cached, reused when the same markdown was embedded with the markdown has already been embedded with the current model and chunker.
current model and chunker.
""" """
slices = await chunk_slices(markdown, use_code_chunker=use_code_chunker)
settings = load_embedding_cache_settings() settings = load_embedding_cache_settings()
chunker_kind = "code" if use_code_chunker else "hybrid" chunker_kind = "code" if use_code_chunker else "hybrid"
embedding_dim = getattr(config.embedding_model_instance, "dimension", None) embedding_dim = getattr(config.embedding_model_instance, "dimension", None)
@ -48,7 +45,7 @@ async def build_chunk_embeddings(
embedding_dim=embedding_dim, embedding_dim=embedding_dim,
) )
if not cacheable: if not cacheable:
return await _compute(markdown, slices) return await _compute(markdown, use_code_chunker=use_code_chunker)
key = EmbeddingKey( key = EmbeddingKey(
markdown_sha256=_hash_text(markdown), markdown_sha256=_hash_text(markdown),
@ -59,30 +56,31 @@ async def build_chunk_embeddings(
) )
cached = await _recall(key) cached = await _recall(key)
if cached is not None and _aligns(cached, slices): if cached is not None:
metrics.record_embedding_cache_lookup( metrics.record_embedding_cache_lookup(
embedding_model=key.embedding_model, embedding_model=key.embedding_model,
chunker_kind=chunker_kind, chunker_kind=chunker_kind,
outcome="hit", outcome="hit",
) )
logger.debug("Embedding cache hit for %s", key.markdown_sha256) logger.debug("Embedding cache hit for %s", key.markdown_sha256)
return cached.summary_embedding, list( return cached.summary_embedding, [(c.text, c.embedding) for c in cached.chunks]
zip(slices, (c.embedding for c in cached.chunks), strict=True)
)
metrics.record_embedding_cache_lookup( metrics.record_embedding_cache_lookup(
embedding_model=key.embedding_model, chunker_kind=chunker_kind, outcome="miss" embedding_model=key.embedding_model, chunker_kind=chunker_kind, outcome="miss"
) )
summary_embedding, pairs = await _compute(markdown, slices) summary_embedding, chunk_pairs = await _compute(
await _remember(key, summary_embedding, pairs) markdown, use_code_chunker=use_code_chunker
return summary_embedding, pairs
async def chunk_slices(markdown: str, *, use_code_chunker: bool) -> list[ChunkSlice]:
"""Chunk markdown into ordered, char-addressed slices off the event loop."""
return await asyncio.to_thread(
chunk_markdown_with_spans, markdown, use_code_chunker
) )
await _remember(key, summary_embedding, chunk_pairs)
return summary_embedding, chunk_pairs
async def chunk_markdown(markdown: str, *, use_code_chunker: bool) -> list[str]:
"""Chunk markdown into ordered texts with the pipeline's chunker selection."""
if use_code_chunker:
return await asyncio.to_thread(chunk_text, markdown, use_code_chunker=True)
# Table-aware hybrid chunker keeps Markdown tables intact (issue #1334).
return await asyncio.to_thread(chunk_text_hybrid, markdown)
async def embed_batch(texts: list[str]) -> list[np.ndarray]: async def embed_batch(texts: list[str]) -> list[np.ndarray]:
@ -90,19 +88,13 @@ async def embed_batch(texts: list[str]) -> list[np.ndarray]:
return await asyncio.to_thread(embed_texts, texts) return await asyncio.to_thread(embed_texts, texts)
def _aligns(cached: EmbeddingSet, slices: list[ChunkSlice]) -> bool:
"""A hit is only usable if its texts still match the current chunking."""
return len(cached.chunks) == len(slices) and all(
c.text == s.text for c, s in zip(cached.chunks, slices, strict=True)
)
async def _compute( async def _compute(
markdown: str, slices: list[ChunkSlice] markdown: str, *, use_code_chunker: bool
) -> tuple[np.ndarray, list[SliceEmbedding]]: ) -> tuple[np.ndarray, list[ChunkPair]]:
embeddings = await embed_batch([markdown, *(s.text for s in slices)]) chunk_texts = await chunk_markdown(markdown, use_code_chunker=use_code_chunker)
embeddings = await embed_batch([markdown, *chunk_texts])
summary_embedding, *chunk_embeddings = embeddings summary_embedding, *chunk_embeddings = embeddings
return summary_embedding, list(zip(slices, chunk_embeddings, strict=True)) return summary_embedding, list(zip(chunk_texts, chunk_embeddings, strict=False))
async def _recall(key: EmbeddingKey) -> EmbeddingSet | None: async def _recall(key: EmbeddingKey) -> EmbeddingSet | None:
@ -118,14 +110,14 @@ async def _recall(key: EmbeddingKey) -> EmbeddingSet | None:
async def _remember( async def _remember(
key: EmbeddingKey, summary_embedding: np.ndarray, pairs: list[SliceEmbedding] key: EmbeddingKey, summary_embedding: np.ndarray, chunk_pairs: list[ChunkPair]
) -> None: ) -> None:
try: try:
from app.tasks.celery_tasks import get_celery_session_maker from app.tasks.celery_tasks import get_celery_session_maker
embedding_set = EmbeddingSet( embedding_set = EmbeddingSet(
summary_embedding=summary_embedding, summary_embedding=summary_embedding,
chunks=[CachedChunk(text=s.text, embedding=vec) for s, vec in pairs], chunks=[CachedChunk(text=text, embedding=vec) for text, vec in chunk_pairs],
) )
async with get_celery_session_maker()() as session: async with get_celery_session_maker()() as session:
await EmbeddingCacheService(session).remember(key, embedding_set) await EmbeddingCacheService(session).remember(key, embedding_set)

View file

@ -19,9 +19,6 @@ class ExistingChunk:
id: int id: int
content: str content: str
position: int position: int
# Stored char span; None for legacy rows indexed before spans existed.
start_char: int | None = None
end_char: int | None = None
@dataclass(frozen=True, slots=True) @dataclass(frozen=True, slots=True)

View file

@ -1,30 +1,16 @@
import re import re
from dataclasses import dataclass
from app.config import config from app.config import config
# Regex that matches a Markdown table block (header + separator + one or more rows) # Regex that matches a Markdown table block (header + separator + one or more rows)
# A table block starts with a | at the beginning of a line and ends when a # A table block starts with a | at the beginning of a line and ends when a
# non-table line (or end of string) is encountered. The final row may end at EOF # non-table line (or end of string) is encountered.
# without a trailing newline, so the whole table stays one slice.
_TABLE_BLOCK_RE = re.compile( _TABLE_BLOCK_RE = re.compile(
r"(?:(?:^|\n)(?=[ \t]*\|)(?:[ \t]*\|[^\n]*(?:\n|$))+)", r"(?:(?:^|\n)(?=[ \t]*\|)(?:[ \t]*\|[^\n]*\n)+)",
re.MULTILINE, re.MULTILINE,
) )
@dataclass(frozen=True, slots=True)
class ChunkSlice:
"""A chunk paired with its half-open char span into the source markdown.
Invariant: ``markdown[start_char:end_char] == text``.
"""
text: str
start_char: int
end_char: int
def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]: def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
"""Chunk a text string using the configured chunker and return the chunk texts.""" """Chunk a text string using the configured chunker and return the chunk texts."""
chunker = ( chunker = (
@ -33,63 +19,41 @@ def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
return [c.text for c in chunker.chunk(text)] return [c.text for c in chunker.chunk(text)]
def chunk_markdown_with_spans( def chunk_text_hybrid(text: str) -> list[str]:
text: str, use_code_chunker: bool = False """Table-aware chunker that prevents Markdown tables from being split mid-row.
) -> list[ChunkSlice]:
"""Chunk markdown into a lossless, contiguous partition of char-addressed slices.
Tables stay whole (issue #1334) and every slice is an exact substring of Algorithm:
``text``, so ``"".join(s.text) == text`` and ``text[s:e] == s.text``. This is 1. Scan the document for Markdown table blocks.
the offset record citations resolve against. 2. Each table block is emitted as a single, unmodified chunk so that its
header, separator row, and data rows always stay together.
3. The non-table prose segments between (and around) tables are passed through
the normal ``chunk_text`` chunker and their sub-chunks are interleaved in
document order.
This ensures that table data is never sliced in the middle by the token-based
chunker, which would otherwise produce garbled rows that are useless for RAG.
Fixes #1334.
""" """
if not text: chunks: list[str] = []
return []
slices: list[ChunkSlice] = []
cursor = 0 cursor = 0
for match in _TABLE_BLOCK_RE.finditer(text): for match in _TABLE_BLOCK_RE.finditer(text):
if match.start() > cursor: # Prose before this table
slices.extend( prose = text[cursor : match.start()].strip()
_segment_slices(text, cursor, match.start(), use_code_chunker) if prose:
) chunks.extend(chunk_text(prose))
slices.append(ChunkSlice(match.group(0), match.start(), match.end()))
# The table itself is kept as one indivisible chunk
table_block = match.group(0).strip()
if table_block:
chunks.append(table_block)
cursor = match.end() cursor = match.end()
if len(text) > cursor: # Remaining prose after the last table (or entire text if no tables)
slices.extend(_segment_slices(text, cursor, len(text), use_code_chunker)) trailing = text[cursor:].strip()
if trailing:
chunks.extend(chunk_text(trailing))
return slices return chunks
def _segment_slices(
text: str, start: int, end: int, use_code_chunker: bool
) -> list[ChunkSlice]:
"""Sub-chunk one non-table segment into contiguous, char-addressed slices."""
chunker = (
config.code_chunker_instance if use_code_chunker else config.chunker_instance
)
segment = text[start:end]
chunks = chunker.chunk(segment)
slices: list[ChunkSlice] = []
local = 0
for chunk in chunks:
# Use the chunker's end offset only as a cut point, then re-slice the
# segment ourselves so the result is an exact, gap-free substring.
local_end = min(max(chunk.end_index, local), len(segment))
if local_end <= local:
continue
slices.append(
ChunkSlice(segment[local:local_end], start + local, start + local_end)
)
local = local_end
if local < len(segment):
if slices:
last = slices[-1]
slices[-1] = ChunkSlice(text[last.start_char : end], last.start_char, end)
else:
slices.append(ChunkSlice(segment[local:], start + local, end))
return slices

View file

@ -20,10 +20,9 @@ from app.db import (
DocumentType, DocumentType,
) )
from app.indexing_pipeline.cache import build_chunk_embeddings from app.indexing_pipeline.cache import build_chunk_embeddings
from app.indexing_pipeline.cache.cached_indexing import chunk_slices, embed_batch from app.indexing_pipeline.cache.cached_indexing import chunk_markdown, embed_batch
from app.indexing_pipeline.chunk_reconciler import ChunkPlan, ExistingChunk, reconcile from app.indexing_pipeline.chunk_reconciler import ExistingChunk, reconcile
from app.indexing_pipeline.connector_document import ConnectorDocument from app.indexing_pipeline.connector_document import ConnectorDocument
from app.indexing_pipeline.document_chunker import ChunkSlice
from app.indexing_pipeline.document_hashing import ( from app.indexing_pipeline.document_hashing import (
compute_content_hash, compute_content_hash,
compute_identifier_hash, compute_identifier_hash,
@ -490,22 +489,12 @@ class IndexingPipelineService:
async def _load_existing_chunks(self, document_id: int) -> list[ExistingChunk]: async def _load_existing_chunks(self, document_id: int) -> list[ExistingChunk]:
result = await self.session.execute( result = await self.session.execute(
select( select(Chunk.id, Chunk.content, Chunk.position).where(
Chunk.id, Chunk.document_id == document_id
Chunk.content, )
Chunk.position,
Chunk.start_char,
Chunk.end_char,
).where(Chunk.document_id == document_id)
) )
return [ return [
ExistingChunk( ExistingChunk(id=row.id, content=row.content, position=row.position)
id=row.id,
content=row.content,
position=row.position,
start_char=row.start_char,
end_char=row.end_char,
)
for row in result for row in result
] ]
@ -516,21 +505,15 @@ class IndexingPipelineService:
delete(Chunk).where(Chunk.document_id == document.id) delete(Chunk).where(Chunk.document_id == document.id)
) )
summary_embedding, slice_pairs = await build_chunk_embeddings( summary_embedding, chunk_pairs = await build_chunk_embeddings(
content, content,
use_code_chunker=connector_doc.should_use_code_chunker, use_code_chunker=connector_doc.should_use_code_chunker,
) )
document.embedding = summary_embedding document.embedding = summary_embedding
return [ return [
Chunk( Chunk(content=text, embedding=emb, position=i)
content=chunk_slice.text, for i, (text, emb) in enumerate(chunk_pairs)
embedding=emb,
position=i,
start_char=chunk_slice.start_char,
end_char=chunk_slice.end_char,
)
for i, (chunk_slice, emb) in enumerate(slice_pairs)
] ]
async def _reindex_incrementally( async def _reindex_incrementally(
@ -542,39 +525,35 @@ class IndexingPipelineService:
) -> int: ) -> int:
"""Edit path: keep rows whose text survived, embed only new texts. """Edit path: keep rows whose text survived, embed only new texts.
Unchanged rows keep their embedding and their HNSW/GIN index entries. An Unchanged rows keep their embedding and their HNSW/GIN index entries;
edit can shift a kept chunk's char span without changing its text, so moved rows get a position-only UPDATE, which touches neither index.
every kept row's position and span are refreshed whenever they drift.
""" """
slices = await chunk_slices( new_texts = await chunk_markdown(
content, use_code_chunker=connector_doc.should_use_code_chunker content, use_code_chunker=connector_doc.should_use_code_chunker
) )
new_texts = [s.text for s in slices]
plan = reconcile(existing, new_texts) plan = reconcile(existing, new_texts)
# One batch: the document-level summary vector plus the missing chunks. # One batch: the document-level summary vector plus the missing chunks.
embeddings = await embed_batch([content, *[t for _, t in plan.to_embed]]) embeddings = await embed_batch([content, *[t for _, t in plan.to_embed]])
summary_embedding, *new_embeddings = embeddings summary_embedding, *new_embeddings = embeddings
if plan.reused:
await self.session.execute(
update(Chunk),
[{"id": cid, "position": pos} for cid, pos in plan.reused],
)
if plan.to_delete: if plan.to_delete:
await self.session.execute( await self.session.execute(
delete(Chunk).where(Chunk.id.in_(plan.to_delete)) delete(Chunk).where(Chunk.id.in_(plan.to_delete))
) )
span_updates = self._kept_row_span_updates(existing, slices, plan)
if span_updates:
await self.session.execute(update(Chunk), span_updates)
self.session.add_all( self.session.add_all(
Chunk( Chunk(
content=slices[pos].text, content=text,
embedding=emb, embedding=emb,
position=pos, position=pos,
start_char=slices[pos].start_char,
end_char=slices[pos].end_char,
document_id=document.id, document_id=document.id,
) )
for (pos, _text), emb in zip(plan.to_embed, new_embeddings, strict=True) for (pos, text), emb in zip(plan.to_embed, new_embeddings, strict=True)
) )
document.embedding = summary_embedding document.embedding = summary_embedding
@ -585,36 +564,6 @@ class IndexingPipelineService:
) )
return len(new_texts) return len(new_texts)
@staticmethod
def _kept_row_span_updates(
existing: list[ExistingChunk],
slices: list[ChunkSlice],
plan: ChunkPlan,
) -> list[dict]:
"""Position/span writes for kept rows, emitted only where a value drifts."""
deleted = set(plan.to_delete)
moved = dict(plan.reused)
updates: list[dict] = []
for chunk in existing:
if chunk.id in deleted:
continue
new_position = moved.get(chunk.id, chunk.position)
target = slices[new_position]
if (
chunk.position != new_position
or chunk.start_char != target.start_char
or chunk.end_char != target.end_char
):
updates.append(
{
"id": chunk.id,
"position": new_position,
"start_char": target.start_char,
"end_char": target.end_char,
}
)
return updates
async def _enqueue_ai_sort_if_enabled(self, document: Document) -> None: async def _enqueue_ai_sort_if_enabled(self, document: Document) -> None:
"""Fire-and-forget: enqueue incremental AI sort if the search space has it enabled.""" """Fire-and-forget: enqueue incremental AI sort if the search space has it enabled."""
try: try:

View file

@ -440,15 +440,8 @@ class ChucksHybridSearchRetriever:
chunk_filter = numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC chunk_filter = numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC
# Select only the columns we need (skip Chunk.embedding ~12KB/row). # Select only the columns we need (skip Chunk.embedding ~12KB/row).
# start_char/end_char carry the citation span; None for legacy rows.
chunk_query = ( chunk_query = (
select( select(Chunk.id, Chunk.content, Chunk.document_id)
Chunk.id,
Chunk.content,
Chunk.document_id,
Chunk.start_char,
Chunk.end_char,
)
.join(numbered, Chunk.id == numbered.c.chunk_id) .join(numbered, Chunk.id == numbered.c.chunk_id)
.where(chunk_filter) .where(chunk_filter)
.order_by(Chunk.document_id, Chunk.position, Chunk.id) .order_by(Chunk.document_id, Chunk.position, Chunk.id)
@ -483,14 +476,7 @@ class ChucksHybridSearchRetriever:
if doc_id not in doc_map: if doc_id not in doc_map:
continue continue
doc_entry = doc_map[doc_id] doc_entry = doc_map[doc_id]
doc_entry["chunks"].append( doc_entry["chunks"].append({"chunk_id": row.id, "content": row.content})
{
"chunk_id": row.id,
"content": row.content,
"start_char": row.start_char,
"end_char": row.end_char,
}
)
if row.id in matched_chunk_ids: if row.id in matched_chunk_ids:
doc_entry["matched_chunk_ids"].append(row.id) doc_entry["matched_chunk_ids"].append(row.id)

View file

@ -38,7 +38,6 @@ from app.schemas import (
from app.services.task_dispatcher import TaskDispatcher, get_task_dispatcher from app.services.task_dispatcher import TaskDispatcher, get_task_dispatcher
from app.users import get_auth_context from app.users import get_auth_context
from app.utils.rbac import check_permission from app.utils.rbac import check_permission
from app.utils.text_spans import char_span_to_line_range
try: try:
asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy()) asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
@ -977,12 +976,9 @@ async def get_document_by_chunk_id(
session: AsyncSession = Depends(get_async_session), session: AsyncSession = Depends(get_async_session),
auth: AuthContext = Depends(get_auth_context), auth: AuthContext = Depends(get_auth_context),
): ):
"""Resolve a chunk id to its document plus a window of surrounding chunks. """
Retrieves a document based on a chunk ID, including a window of chunks around the cited one.
Returns the cited chunk's 1-based line range (cited_start_line/ Uses SQL-level pagination to avoid loading all chunks into memory.
cited_end_line) when char spans exist, so callers can anchor the citation
to exact source lines. Uses SQL-level pagination to avoid loading all
chunks into memory.
""" """
try: try:
from sqlalchemy import and_, func, or_ from sqlalchemy import and_, func, or_
@ -1046,17 +1042,6 @@ async def get_document_by_chunk_id(
) )
windowed_chunks = windowed_result.scalars().all() windowed_chunks = windowed_result.scalars().all()
cited_start_line: int | None = None
cited_end_line: int | None = None
if (
chunk.start_char is not None
and chunk.end_char is not None
and document.source_markdown
):
cited_start_line, cited_end_line = char_span_to_line_range(
document.source_markdown, chunk.start_char, chunk.end_char
)
return DocumentWithChunksRead( return DocumentWithChunksRead(
id=document.id, id=document.id,
title=document.title, title=document.title,
@ -1071,8 +1056,6 @@ async def get_document_by_chunk_id(
chunks=windowed_chunks, chunks=windowed_chunks,
total_chunks=total_chunks, total_chunks=total_chunks,
chunk_start_index=start, chunk_start_index=start,
cited_start_line=cited_start_line,
cited_end_line=cited_end_line,
) )
except HTTPException: except HTTPException:
raise raise

View file

@ -43,34 +43,6 @@ EDITOR_PLATE_MAX_BYTES = 1 * 1024 * 1024
EDITOR_PLATE_MAX_LINES = 5000 EDITOR_PLATE_MAX_LINES = 5000
def _raise_no_canonical_body(document: Document) -> None:
    """Raise the HTTP error that best explains a missing ``source_markdown``.

    This function always raises; it never returns normally.

    Raises:
        HTTPException: 409 when the status state is "pending" or "processing",
            422 (carrying the recorded reason) when it is "failed", and 400
            for any other state.
    """
    # A missing or non-dict status is treated as an empty dict, which makes
    # the state default to "ready" and the reason to "Unknown error".
    raw_status = document.status or {}
    status_info = raw_status if isinstance(raw_status, dict) else {}
    state = status_info.get("state", "ready")

    if state in ("pending", "processing"):
        raise HTTPException(
            status_code=409,
            detail="This document is still being processed. Please wait a moment and try again.",
        )

    if state == "failed":
        reason = status_info.get("reason", "Unknown error")
        raise HTTPException(
            status_code=422,
            detail=f"Processing failed: {reason}. You can delete this document and re-upload it.",
        )

    raise HTTPException(
        status_code=400,
        detail="This document has no editable content. It may not have been processed correctly. Try re-indexing or re-uploading it.",
    )
@router.get("/search-spaces/{search_space_id}/documents/{document_id}/editor-content") @router.get("/search-spaces/{search_space_id}/documents/{document_id}/editor-content")
async def get_editor_content( async def get_editor_content(
search_space_id: int, search_space_id: int,
@ -82,9 +54,8 @@ async def get_editor_content(
""" """
Get document content for editing. Get document content for editing.
Returns source_markdown (the canonical body) for the Plate.js editor, with a Returns source_markdown for the Plate.js editor.
one-time migration from legacy blocknote_document. Never reconstructs the Falls back to blocknote_document markdown conversion, then chunk reconstruction.
body from chunks.
Requires DOCUMENTS_READ permission. Requires DOCUMENTS_READ permission.
""" """
@ -154,9 +125,52 @@ async def get_editor_content(
await session.commit() await session.commit()
return _build_response(empty_markdown) return _build_response(empty_markdown)
# No canonical body. Chunks are an index artifact, never the source of chunk_contents_result = await session.execute(
# truth, so surface the processing state instead of rebuilding from them. select(Chunk.content)
_raise_no_canonical_body(document) .filter(Chunk.document_id == document_id)
.order_by(Chunk.position, Chunk.id)
)
chunk_contents = chunk_contents_result.scalars().all()
if not chunk_contents:
doc_status = document.status or {}
state = (
doc_status.get("state", "ready")
if isinstance(doc_status, dict)
else "ready"
)
if state in ("pending", "processing"):
raise HTTPException(
status_code=409,
detail="This document is still being processed. Please wait a moment and try again.",
)
if state == "failed":
reason = (
doc_status.get("reason", "Unknown error")
if isinstance(doc_status, dict)
else "Unknown error"
)
raise HTTPException(
status_code=422,
detail=f"Processing failed: {reason}. You can delete this document and re-upload it.",
)
raise HTTPException(
status_code=400,
detail="This document has no content. It may not have been processed correctly. Try deleting and re-uploading it.",
)
markdown_content = "\n\n".join(chunk_contents)
if not markdown_content.strip():
raise HTTPException(
status_code=400,
detail="This document appears to be empty. Try re-uploading or editing it to add content.",
)
document.source_markdown = markdown_content
await session.commit()
return _build_response(markdown_content)
@router.get( @router.get(
@ -170,9 +184,8 @@ async def download_document_markdown(
): ):
user = auth.user user = auth.user
""" """
Download the canonical document body as a .md file. Download the full document content as a .md file.
Reconstructs markdown from source_markdown or chunks.
Serves source_markdown, migrating legacy blocknote_document when present.
""" """
await check_permission( await check_permission(
session, session,
@ -198,6 +211,15 @@ async def download_document_markdown(
from app.utils.blocknote_to_markdown import blocknote_to_markdown from app.utils.blocknote_to_markdown import blocknote_to_markdown
markdown = blocknote_to_markdown(document.blocknote_document) markdown = blocknote_to_markdown(document.blocknote_document)
if markdown is None:
chunk_contents_result = await session.execute(
select(Chunk.content)
.filter(Chunk.document_id == document_id)
.order_by(Chunk.position, Chunk.id)
)
chunk_contents = chunk_contents_result.scalars().all()
if chunk_contents:
markdown = "\n\n".join(chunk_contents)
if not markdown or not markdown.strip(): if not markdown or not markdown.strip():
raise HTTPException( raise HTTPException(
@ -340,6 +362,15 @@ async def export_document(
from app.utils.blocknote_to_markdown import blocknote_to_markdown from app.utils.blocknote_to_markdown import blocknote_to_markdown
markdown_content = blocknote_to_markdown(document.blocknote_document) markdown_content = blocknote_to_markdown(document.blocknote_document)
if markdown_content is None:
chunk_contents_result = await session.execute(
select(Chunk.content)
.filter(Chunk.document_id == document_id)
.order_by(Chunk.position, Chunk.id)
)
chunk_contents = chunk_contents_result.scalars().all()
if chunk_contents:
markdown_content = "\n\n".join(chunk_contents)
if not markdown_content or not markdown_content.strip(): if not markdown_content or not markdown_content.strip():
raise HTTPException(status_code=400, detail="Document has no content to export") raise HTTPException(status_code=400, detail="Document has no content to export")

View file

@ -214,7 +214,7 @@ async def _execute_image_generation(
) )
# Store response # Store response
image_gen.response_data = ( response_dict = (
response.model_dump() if hasattr(response, "model_dump") else dict(response) response.model_dump() if hasattr(response, "model_dump") else dict(response)
) )
if not image_gen.model and hasattr(response, "_hidden_params"): if not image_gen.model and hasattr(response, "_hidden_params"):
@ -222,6 +222,20 @@ async def _execute_image_generation(
if isinstance(hidden, dict) and hidden.get("model"): if isinstance(hidden, dict) and hidden.get("model"):
image_gen.model = hidden["model"] image_gen.model = hidden["model"]
# Fix relative URLs in response data (for the serving endpoint)
from urllib.parse import urlparse
images = response_dict.get("data", [])
provider_base_url = resolved_kwargs.get("api_base")
for image in images:
if image.get("url"):
raw_url: str = image["url"]
if raw_url.startswith("/") and provider_base_url:
parsed = urlparse(provider_base_url)
origin = f"{parsed.scheme}://{parsed.netloc}"
image["url"] = f"{origin}{raw_url}"
image_gen.response_data = response_dict
# ============================================================================= # =============================================================================
# Image Generation Execution + Results CRUD # Image Generation Execution + Results CRUD

View file

@ -17,7 +17,4 @@ class ChunkUpdate(ChunkBase):
class ChunkRead(ChunkBase, IDModel, TimestampModel): class ChunkRead(ChunkBase, IDModel, TimestampModel):
start_char: int | None = None
end_char: int | None = None
model_config = ConfigDict(from_attributes=True) model_config = ConfigDict(from_attributes=True)

View file

@ -73,10 +73,6 @@ class DocumentWithChunksRead(DocumentRead):
chunks: list[ChunkRead] = [] chunks: list[ChunkRead] = []
total_chunks: int = 0 total_chunks: int = 0
chunk_start_index: int = 0 chunk_start_index: int = 0
# 1-based inclusive line range of the cited chunk within source_markdown;
# None when the chunk predates char spans or the body is unavailable.
cited_start_line: int | None = None
cited_end_line: int | None = None
model_config = ConfigDict(from_attributes=True) model_config = ConfigDict(from_attributes=True)

View file

@ -1,23 +0,0 @@
"""Convert char spans into document-relative line ranges.
Chunks store half-open char spans into ``source_markdown``; citations and the
editor speak in line numbers. This is the single shared conversion so search,
the resolve API, and highlighting all agree on what "lines X-Y" means.
"""
from __future__ import annotations
def char_span_to_line_range(text: str, start_char: int, end_char: int) -> tuple[int, int]:
    """Map the half-open char span ``[start_char, end_char)`` to line numbers.

    Both offsets are clamped into ``[0, len(text)]`` and the end is never
    allowed to precede the start. The result is a 1-based, inclusive
    ``(start_line, end_line)`` pair; an empty span yields the single line
    that contains its position.
    """

    def _line_of(offset: int) -> int:
        # A position's line number is one more than the newlines before it.
        return text.count("\n", 0, offset) + 1

    length = len(text)
    lo = max(0, min(start_char, length))
    hi = max(lo, min(end_char, length))

    # The span is half-open, so its last covered character sits at hi - 1
    # (or at lo itself when the span is empty).
    final_index = max(lo, hi - 1)
    return _line_of(lo), _line_of(final_index)

View file

@ -1,80 +0,0 @@
"""NOTE writes must carry the same char spans as the indexing pipeline.
``_create_document`` / ``_update_document`` are the cloud agent's KB write
paths. They must chunk through the shared span chunker so every persisted
chunk resolves back to an exact slice of ``source_markdown`` for citations.
"""
from __future__ import annotations
import pytest
from sqlalchemy import select
from app.agents.chat.multi_agent_chat.main_agent.middleware.kb_persistence import (
middleware as kb,
)
from app.db import Chunk
pytestmark = [pytest.mark.integration, pytest.mark.asyncio]
_BODY = "Intro paragraph.\n\nBody paragraph with detail.\n\nOutro paragraph."
_NEW_BODY = "Rewritten intro.\n\nFresh body content.\n\nNew closing line."
async def _ordered_chunks(session, doc_id: int) -> list[Chunk]:
    """Return all chunks belonging to ``doc_id``, sorted by ``position``."""
    query = select(Chunk).where(Chunk.document_id == doc_id).order_by(Chunk.position)
    result = await session.execute(query)
    return list(result.scalars().all())
def _assert_spans_resolve(source_markdown: str, chunks: list[Chunk]) -> None:
assert chunks
for chunk in chunks:
assert chunk.start_char is not None
assert chunk.end_char is not None
assert source_markdown[chunk.start_char : chunk.end_char] == chunk.content
@pytest.mark.usefixtures("patched_embed_texts")
async def test_note_create_populates_chunk_spans(
    db_session, db_search_space, db_user
) -> None:
    """Chunks written by ``_create_document`` carry spans that slice the body."""
    create_kwargs = {
        "virtual_path": "/documents/note.md",
        "content": _BODY,
        "search_space_id": db_search_space.id,
        "created_by_id": str(db_user.id),
    }
    note = await kb._create_document(db_session, **create_kwargs)
    await db_session.flush()

    persisted = await _ordered_chunks(db_session, note.id)
    _assert_spans_resolve(note.source_markdown, persisted)
@pytest.mark.usefixtures("patched_embed_texts")
async def test_note_update_refreshes_chunk_spans(
    db_session, db_search_space, db_user
) -> None:
    """After ``_update_document`` the chunk spans must slice the new body."""
    note_path = "/documents/note.md"
    space_id = db_search_space.id

    # Seed the note with the original body first.
    original = await kb._create_document(
        db_session,
        virtual_path=note_path,
        content=_BODY,
        search_space_id=space_id,
        created_by_id=str(db_user.id),
    )
    await db_session.flush()

    # Rewrite the body and check the chunks that result from the update.
    rewritten = await kb._update_document(
        db_session,
        doc_id=original.id,
        content=_NEW_BODY,
        virtual_path=note_path,
        search_space_id=space_id,
    )
    await db_session.flush()

    assert rewritten is not None
    persisted = await _ordered_chunks(db_session, rewritten.id)
    _assert_spans_resolve(rewritten.source_markdown, persisted)

View file

@ -158,12 +158,13 @@ def patched_embed_texts_raises(monkeypatch) -> MagicMock:
@pytest.fixture @pytest.fixture
def patched_chunk_text(monkeypatch) -> MagicMock: def patched_chunk_text(monkeypatch) -> MagicMock:
from app.indexing_pipeline.document_chunker import ChunkSlice mock = MagicMock(return_value=["Test chunk content."])
text = "Test chunk content."
mock = MagicMock(return_value=[ChunkSlice(text, 0, len(text))])
monkeypatch.setattr( monkeypatch.setattr(
"app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans", "app.indexing_pipeline.cache.cached_indexing.chunk_text",
mock,
)
monkeypatch.setattr(
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
mock, mock,
) )
return mock return mock

View file

@ -286,12 +286,9 @@ def _mock_external_apis(monkeypatch):
"app.indexing_pipeline.cache.cached_indexing.embed_texts", "app.indexing_pipeline.cache.cached_indexing.embed_texts",
MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]), MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]),
) )
from app.indexing_pipeline.document_chunker import ChunkSlice
chunk = "Test chunk content."
monkeypatch.setattr( monkeypatch.setattr(
"app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans", "app.indexing_pipeline.cache.cached_indexing.chunk_text",
MagicMock(return_value=[ChunkSlice(chunk, 0, len(chunk))]), MagicMock(return_value=["Test chunk content."]),
) )

View file

@ -176,14 +176,9 @@ async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, m
@pytest.mark.usefixtures("patched_embed_texts") @pytest.mark.usefixtures("patched_embed_texts")
async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, mocker): async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, mocker):
"""Reindexing replaces old chunks with new content rather than appending.""" """Reindexing replaces old chunks with new content rather than appending."""
from app.indexing_pipeline.document_chunker import ChunkSlice
mocker.patch( mocker.patch(
"app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans", "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
side_effect=[ side_effect=[["Original chunk."], ["Updated chunk."]],
[ChunkSlice("Original chunk.", 0, len("Original chunk."))],
[ChunkSlice("Updated chunk.", 0, len("Updated chunk."))],
],
) )
adapter = UploadDocumentAdapter(db_session) adapter = UploadDocumentAdapter(db_session)

View file

@ -18,22 +18,16 @@ _V1 = "Intro paragraph.\n\nBody paragraph.\n\nOutro paragraph."
@pytest.fixture @pytest.fixture
def paragraph_chunker(monkeypatch): def paragraph_chunker(monkeypatch):
"""One slice per markdown paragraph, so edits map to chunk-level diffs.""" """One chunk per markdown paragraph, so edits map to chunk-level diffs."""
from app.indexing_pipeline.document_chunker import ChunkSlice
def _split(markdown, *_args, **_kwargs): def _split(markdown, **_kwargs):
slices = [] return [p for p in markdown.split("\n\n") if p.strip()]
cursor = 0
for para in markdown.split("\n\n"):
start = markdown.index(para, cursor)
cursor = start + len(para)
if para.strip():
slices.append(ChunkSlice(para, start, cursor))
return slices
monkeypatch.setattr( monkeypatch.setattr(
"app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans", "app.indexing_pipeline.cache.cached_indexing.chunk_text", _split
_split, )
monkeypatch.setattr(
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", _split
) )

View file

@ -1,96 +0,0 @@
"""Indexing records char spans so a chunk addresses its exact slice of the body.
Uses the real chunker (only embeddings are faked) so the span/partition
invariants are exercised end to end.
"""
import pytest
from sqlalchemy import select
from app.db import Chunk, Document
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
pytestmark = pytest.mark.integration
_BODY = (
"# Report\n\n"
+ "Intro paragraph that is reasonably long and descriptive. " * 8
+ "\n\n| col a | col b |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |\n\n"
+ "Closing paragraph with a different shape and more words to chunk. " * 8
)
async def _ordered_chunks(session, document_id) -> list[Chunk]:
    """Return the document's chunks sorted by ``position`` and then ``id``."""
    query = select(Chunk).filter(Chunk.document_id == document_id)
    query = query.order_by(Chunk.position, Chunk.id)
    rows = await session.execute(query)
    return list(rows.scalars().all())
def _assert_spans_address_body(chunks: list[Chunk], body: str) -> None:
for chunk in chunks:
assert chunk.start_char is not None and chunk.end_char is not None
assert body[chunk.start_char : chunk.end_char] == chunk.content
assert "".join(c.content for c in chunks) == body
async def _index(session, connector_doc) -> int:
    """Prepare and index ``connector_doc``; return the indexed document's id."""
    pipeline = IndexingPipelineService(session=session)
    queued = await pipeline.prepare_for_indexing([connector_doc])
    target = queued[0]
    await pipeline.index(target, connector_doc)
    return target.id
async def _reload_body(session, document_id) -> str:
    """Query the document by id and return its ``source_markdown``."""
    query = select(Document).filter(Document.id == document_id)
    rows = await session.execute(query)
    record = rows.scalars().first()
    return record.source_markdown
@pytest.mark.usefixtures("patched_embed_texts")
async def test_scratch_index_records_spans_addressing_body(
    db_session, db_search_space, make_connector_document
):
    """A first-time index yields several chunks whose spans partition the body."""
    source = make_connector_document(
        search_space_id=db_search_space.id, source_markdown=_BODY
    )
    doc_id = await _index(db_session, source)

    stored_body = await _reload_body(db_session, doc_id)
    stored_chunks = await _ordered_chunks(db_session, doc_id)

    assert len(stored_chunks) > 1
    _assert_spans_address_body(stored_chunks, stored_body)
@pytest.mark.usefixtures("patched_embed_texts")
async def test_incremental_reindex_refreshes_shifted_spans(
    db_session, db_search_space, make_connector_document
):
    """Prepending text moves every later span, so a reindex must rewrite the
    spans of kept rows instead of leaving them at their old offsets."""
    pipeline = IndexingPipelineService(session=db_session)
    space_id = db_search_space.id

    # First pass: index the original body.
    first_version = make_connector_document(
        search_space_id=space_id, source_markdown=_BODY
    )
    first_queue = await pipeline.prepare_for_indexing([first_version])
    document_id = first_queue[0].id
    await pipeline.index(first_queue[0], first_version)

    # Second pass: same document with new text inserted at the top.
    edited_body = "# Prepended heading\n\nA brand new opening paragraph.\n\n" + _BODY
    second_version = make_connector_document(
        search_space_id=space_id, source_markdown=edited_body
    )
    second_queue = await pipeline.prepare_for_indexing([second_version])
    assert second_queue, "edited content should requeue the document"
    await pipeline.index(second_queue[0], second_version)

    stored_body = await _reload_body(db_session, document_id)
    stored_chunks = await _ordered_chunks(db_session, document_id)
    assert stored_body == edited_body
    _assert_spans_address_body(stored_chunks, stored_body)

View file

@ -40,19 +40,11 @@ def _make_document(
) )
def _make_chunk( def _make_chunk(*, content: str, document_id: int) -> Chunk:
*,
content: str,
document_id: int,
start_char: int | None = None,
end_char: int | None = None,
) -> Chunk:
return Chunk( return Chunk(
content=content, content=content,
document_id=document_id, document_id=document_id,
embedding=DUMMY_EMBEDDING, embedding=DUMMY_EMBEDDING,
start_char=start_char,
end_char=end_char,
) )
@ -99,8 +91,6 @@ async def seed_large_doc(
_make_chunk( _make_chunk(
content="quarterly performance review summary note content", content="quarterly performance review summary note content",
document_id=small_doc.id, document_id=small_doc.id,
start_char=0,
end_char=10,
), ),
] ]

View file

@ -98,32 +98,6 @@ async def test_chunks_ordered_by_id(db_session, seed_large_doc):
assert chunk_ids == sorted(chunk_ids), "Chunks not ordered by ID" assert chunk_ids == sorted(chunk_ids), "Chunks not ordered by ID"
async def test_chunk_spans_returned(db_session, seed_large_doc):
    """Search results expose ``start_char``/``end_char`` on every chunk dict."""
    target_doc_id = seed_large_doc["small_doc"].id

    search = ChucksHybridSearchRetriever(db_session)
    hits = await search.hybrid_search(
        query_text="quarterly performance review summary",
        top_k=10,
        search_space_id=seed_large_doc["search_space"].id,
        query_embedding=DUMMY_EMBEDDING,
    )

    # Walk the hits in order, checking the keys on each one, and stop at the
    # seeded small document once its span values have been verified.
    found_small_doc = False
    for hit in hits:
        for chunk_dict in hit["chunks"]:
            assert "start_char" in chunk_dict
            assert "end_char" in chunk_dict
        if hit["document"].get("id") == target_doc_id:
            first_chunk = hit["chunks"][0]
            assert first_chunk["start_char"] == 0
            assert first_chunk["end_char"] == 10
            found_small_doc = True
            break

    if not found_small_doc:
        pytest.fail("Small doc not found in search results")
async def test_score_is_positive_float(db_session, seed_large_doc): async def test_score_is_positive_float(db_session, seed_large_doc):
"""Each result should have a positive float score from RRF.""" """Each result should have a positive float score from RRF."""
space_id = seed_large_doc["search_space"].id space_id = seed_large_doc["search_space"].id

View file

@ -1,127 +0,0 @@
"""Phase E.1 contract: the by-chunk resolve API exposes chunk char spans and
derives the cited chunk's line range from source_markdown."""
import pytest
import pytest_asyncio
from sqlalchemy.ext.asyncio import AsyncSession
from app.db import Chunk, Document, DocumentStatus, DocumentType, SearchSpace, User
pytestmark = pytest.mark.integration
_BODY = "alpha\nbravo\ncharlie\ndelta"
async def _make_document(
    session: AsyncSession,
    search_space: SearchSpace,
    user: User,
    *,
    source_markdown: str = _BODY,
) -> Document:
    """Add and flush a ready FILE document whose body is ``source_markdown``.

    The same text is stored as both ``content`` and ``source_markdown``.
    """
    document = Document(
        title="Doc",
        document_type=DocumentType.FILE,
        document_metadata={},
        content=source_markdown,
        content_hash="hash-by-chunk",
        source_markdown=source_markdown,
        search_space_id=search_space.id,
        created_by_id=user.id,
        status=DocumentStatus.ready(),
    )
    session.add(document)
    await session.flush()
    return document
async def _add_chunk(
    session: AsyncSession,
    document: Document,
    *,
    content: str,
    position: int,
    start_char: int | None,
    end_char: int | None,
) -> Chunk:
    """Add and flush one chunk on ``document`` with the given span (may be None)."""
    new_chunk = Chunk(
        content=content,
        position=position,
        document_id=document.id,
        start_char=start_char,
        end_char=end_char,
    )
    session.add(new_chunk)
    await session.flush()
    return new_chunk
@pytest_asyncio.fixture
async def make_document(db_session, db_search_space, db_user):
    """Provide an async factory that calls ``_make_document`` with the test
    session, search space and user already bound."""

    async def _factory(**overrides):
        document = await _make_document(
            db_session, db_search_space, db_user, **overrides
        )
        return document

    return _factory
async def test_cited_line_range_derived_from_spans(
    db_session, db_search_space, db_user, make_document
):
    """The cited chunk's char span is reported as lines 3-4 of the body."""
    from app.routes.documents_routes import get_document_by_chunk_id

    document = await make_document()
    split_at = 12  # "alpha\nbravo\n" occupies the first 12 characters
    await _add_chunk(
        db_session,
        document,
        content="alpha\nbravo\n",
        position=0,
        start_char=0,
        end_char=split_at,
    )
    cited_chunk = await _add_chunk(
        db_session,
        document,
        content="charlie\ndelta",
        position=1,
        start_char=split_at,
        end_char=len(_BODY),
    )

    # NOTE(review): the handler is called with ``user=``; confirm its current
    # signature still accepts that keyword rather than an auth context.
    response = await get_document_by_chunk_id(
        cited_chunk.id, chunk_window=5, session=db_session, user=db_user
    )

    assert response.cited_start_line == 3
    assert response.cited_end_line == 4
async def test_chunk_spans_exposed_in_response(
    db_session, db_search_space, db_user, make_document
):
    """The returned chunk entry carries the stored ``start_char``/``end_char``."""
    from app.routes.documents_routes import get_document_by_chunk_id

    document = await make_document()
    cited_chunk = await _add_chunk(
        db_session,
        document,
        content="alpha\nbravo\n",
        position=0,
        start_char=0,
        end_char=12,
    )

    # NOTE(review): the handler is called with ``user=``; confirm its current
    # signature still accepts that keyword rather than an auth context.
    response = await get_document_by_chunk_id(
        cited_chunk.id, chunk_window=5, session=db_session, user=db_user
    )

    returned = next(c for c in response.chunks if c.id == cited_chunk.id)
    assert returned.start_char == 0
    assert returned.end_char == 12
async def test_cited_line_range_null_without_spans(
    db_session, db_search_space, db_user, make_document
):
    """A chunk stored without a span yields no cited line range."""
    from app.routes.documents_routes import get_document_by_chunk_id

    document = await make_document()
    spanless_chunk = await _add_chunk(
        db_session,
        document,
        content="alpha",
        position=0,
        start_char=None,
        end_char=None,
    )

    # NOTE(review): the handler is called with ``user=``; confirm its current
    # signature still accepts that keyword rather than an auth context.
    response = await get_document_by_chunk_id(
        spanless_chunk.id, chunk_window=5, session=db_session, user=db_user
    )

    assert response.cited_start_line is None
    assert response.cited_end_line is None

View file

@ -1,175 +0,0 @@
"""Phase A contract: editor read paths serve source_markdown and never
reconstruct or mutate the body from chunks."""
import pytest
import pytest_asyncio
from fastapi import HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from app.db import (
Chunk,
Document,
DocumentStatus,
DocumentType,
SearchSpace,
User,
)
pytestmark = pytest.mark.integration
async def _make_document(
    session: AsyncSession,
    search_space: SearchSpace,
    user: User,
    *,
    document_type: DocumentType = DocumentType.FILE,
    source_markdown: str | None = "# Title\n\nBody line.",
    content: str = "Body line.",
    status: dict | None = None,
) -> Document:
    """Add and flush a document built from the given overrides.

    A falsy ``status`` is replaced by ``DocumentStatus.ready()``.
    """
    effective_status = status or DocumentStatus.ready()
    document = Document(
        title="Doc",
        document_type=document_type,
        document_metadata={},
        content=content,
        content_hash="hash-001",
        source_markdown=source_markdown,
        search_space_id=search_space.id,
        created_by_id=user.id,
        status=effective_status,
    )
    session.add(document)
    await session.flush()
    return document
async def _add_chunks(session: AsyncSession, document: Document, texts: list[str]):
    """Add one chunk per entry of ``texts`` (position = list index), then flush."""
    for index, body_text in enumerate(texts):
        piece = Chunk(content=body_text, position=index, document_id=document.id)
        session.add(piece)
    await session.flush()
@pytest_asyncio.fixture
async def make_document(db_session, db_search_space, db_user):
    """Provide an async factory that calls ``_make_document`` with the test
    session, search space and user already bound."""

    async def _factory(**overrides):
        document = await _make_document(
            db_session, db_search_space, db_user, **overrides
        )
        return document

    return _factory
class TestGetEditorContent:
    """Behaviour of ``get_editor_content`` with and without a stored body."""

    async def test_returns_source_markdown_verbatim(
        self, db_session, db_search_space, db_user, make_document
    ):
        """The stored ``source_markdown`` comes back unchanged."""
        from app.routes.editor_routes import get_editor_content

        canonical = "# Real\n\nCanonical body."
        document = await make_document(source_markdown="# Real\n\nCanonical body.")

        payload = await get_editor_content(
            db_search_space.id, document.id, session=db_session, user=db_user
        )

        assert payload["source_markdown"] == canonical

    async def test_does_not_reconstruct_body_from_chunks(
        self, db_session, db_search_space, db_user, make_document
    ):
        """A ready document without source_markdown must not be rebuilt from chunks."""
        from app.routes.editor_routes import get_editor_content

        document = await make_document(source_markdown=None)
        await _add_chunks(db_session, document, ["chunk one", "chunk two"])

        with pytest.raises(HTTPException) as raised:
            await get_editor_content(
                db_search_space.id, document.id, session=db_session, user=db_user
            )

        assert raised.value.status_code == 400
        # The failed read must leave the body unset.
        await db_session.refresh(document)
        assert document.source_markdown is None

    async def test_processing_document_without_body_returns_409(
        self, db_session, db_search_space, db_user, make_document
    ):
        """A document still processing answers with 409."""
        from app.routes.editor_routes import get_editor_content

        in_flight = DocumentStatus.processing()
        document = await make_document(source_markdown=None, status=in_flight)

        with pytest.raises(HTTPException) as raised:
            await get_editor_content(
                db_search_space.id, document.id, session=db_session, user=db_user
            )

        assert raised.value.status_code == 409

    async def test_failed_document_without_body_returns_422(
        self, db_session, db_search_space, db_user, make_document
    ):
        """A document whose processing failed answers with 422."""
        from app.routes.editor_routes import get_editor_content

        broken = DocumentStatus.failed("boom")
        document = await make_document(source_markdown=None, status=broken)

        with pytest.raises(HTTPException) as raised:
            await get_editor_content(
                db_search_space.id, document.id, session=db_session, user=db_user
            )

        assert raised.value.status_code == 422

    async def test_empty_note_initializes_to_empty_markdown(
        self, db_session, db_search_space, db_user, make_document
    ):
        """A NOTE with no body is served as an empty markdown string."""
        from app.routes.editor_routes import get_editor_content

        note = await make_document(
            document_type=DocumentType.NOTE, source_markdown=None
        )

        payload = await get_editor_content(
            db_search_space.id, note.id, session=db_session, user=db_user
        )

        assert payload["source_markdown"] == ""
class TestDownloadMarkdown:
    """Behaviour of ``download_document_markdown`` when no body is stored."""

    async def test_does_not_reconstruct_body_from_chunks(
        self, db_session, db_search_space, db_user, make_document
    ):
        """With only chunks and no ``source_markdown`` the download answers 400."""
        from app.routes.editor_routes import download_document_markdown

        document = await make_document(source_markdown=None)
        await _add_chunks(db_session, document, ["chunk one", "chunk two"])

        # NOTE(review): the handler is called with ``user=``; confirm its
        # current signature still accepts that keyword rather than an auth
        # context.
        with pytest.raises(HTTPException) as raised:
            await download_document_markdown(
                db_search_space.id, document.id, session=db_session, user=db_user
            )

        assert raised.value.status_code == 400
class TestExportDocument:
    """Behaviour of ``export_document`` when no body is stored."""

    async def test_does_not_reconstruct_body_from_chunks(
        self, db_session, db_search_space, db_user, make_document
    ):
        """With only chunks and no ``source_markdown`` a PLAIN export answers 400."""
        from app.routes.editor_routes import export_document
        from app.routes.reports_routes import ExportFormat

        document = await make_document(source_markdown=None)
        await _add_chunks(db_session, document, ["chunk one", "chunk two"])

        export_kwargs = {
            "format": ExportFormat.PLAIN,
            "session": db_session,
            "user": db_user,
        }
        with pytest.raises(HTTPException) as raised:
            await export_document(db_search_space.id, document.id, **export_kwargs)

        assert raised.value.status_code == 400

View file

@ -1,87 +0,0 @@
"""Unit tests for search_knowledge_base hit rendering.
The tool must surface the passage that actually matched (the RRF-ranked
chunk), not the top of the document, and annotate it with its line range
when the chunk carries a char span.
"""
from __future__ import annotations
import pytest
from app.agents.chat.multi_agent_chat.main_agent.tools.search_knowledge_base import (
_format_hits,
)
pytestmark = pytest.mark.unit
_BODY = "Intro paragraph.\n\nMatched passage here.\n\nClosing paragraph."
def _hit() -> dict:
    """Build one search hit for document 7 with two chunks.

    Chunk 101 is the intro at offset 0; chunk 102 is the matched passage,
    located inside ``_BODY`` by search, and is the only id listed in
    ``matched_chunk_ids``.
    """
    intro_text = "Intro paragraph."
    matched_text = "Matched passage here."
    matched_offset = _BODY.index(matched_text)

    intro_chunk = {
        "chunk_id": 101,
        "content": intro_text,
        "start_char": 0,
        "end_char": len(intro_text),
    }
    matched_chunk = {
        "chunk_id": 102,
        "content": matched_text,
        "start_char": matched_offset,
        "end_char": matched_offset + len(matched_text),
    }

    return {
        "document": {"id": 7, "title": "note.md", "document_type": "NOTE"},
        "score": 0.42,
        "content": _BODY,
        "matched_chunk_ids": [102],
        "chunks": [intro_chunk, matched_chunk],
    }
def test_renders_matched_passage_not_top_of_doc() -> None:
    """The rendered hit shows the matched chunk's text, not the intro chunk's."""
    rendered = _format_hits(
        [_hit()],
        paths={7: "/documents/note.md"},
        bodies={7: _BODY},
        query="q",
    )
    assert "Matched passage here." in rendered
    # The intro chunk is not in matched_chunk_ids, so it must stay out of the snippet.
    assert "Intro paragraph." not in rendered
def test_emits_copyable_line_citation_token_when_spans_present() -> None:
    """With spans present, the output contains a ready-made citation token."""
    rendered = _format_hits(
        [_hit()],
        paths={7: "/documents/note.md"},
        bodies={7: _BODY},
        query="q",
    )
    # "Matched passage here." is on line 3 of the body, so the token for
    # document 7 must read L3-3 and be usable without a separate read.
    expected_token = "[citation:d7#L3-3]"
    assert expected_token in rendered
def test_header_includes_document_id() -> None:
    """The rendered output mentions the document id as ``id=7``."""
    rendered = _format_hits(
        [_hit()],
        paths={7: "/documents/note.md"},
        bodies={7: _BODY},
        query="q",
    )
    assert "id=7" in rendered
def test_omits_citation_token_when_spans_absent() -> None:
    """Without char spans the passage is still shown but no line token is emitted."""
    spanless_hit = _hit()
    for chunk_dict in spanless_hit["chunks"]:
        chunk_dict.update(start_char=None, end_char=None)

    rendered = _format_hits(
        [spanless_hit],
        paths={7: "/documents/note.md"},
        bodies={7: _BODY},
        query="q",
    )

    assert "Matched passage here." in rendered
    # Only a concrete token for document 7 counts; the placeholder template
    # in the closing instruction is not matched by this prefix.
    assert "[citation:d7#L" not in rendered
def test_falls_back_to_content_when_no_matched_ids() -> None:
    """With no matched chunk ids the output includes the hit's intro text."""
    unmatched_hit = _hit()
    unmatched_hit["matched_chunk_ids"] = []

    rendered = _format_hits(
        [unmatched_hit],
        paths={7: "/documents/note.md"},
        bodies={7: _BODY},
        query="q",
    )

    assert "Intro paragraph." in rendered
def test_no_results_message() -> None:
    """An empty hit list renders the "no matches" message."""
    rendered = _format_hits([], paths={}, bodies={}, query="missing")
    assert "No knowledge-base matches" in rendered

View file

@ -1,72 +0,0 @@
"""Span-aware chunking contract: slices form a lossless, contiguous partition
of the markdown, and every slice's char span addresses its own text."""
import pytest
from app.indexing_pipeline.document_chunker import chunk_markdown_with_spans
pytestmark = pytest.mark.unit
def _assert_lossless_partition(md: str, slices) -> None:
assert "".join(s.text for s in slices) == md
cursor = 0
for s in slices:
assert s.start_char == cursor, "slices must be contiguous"
assert s.end_char >= s.start_char
assert md[s.start_char : s.end_char] == s.text, "span must address slice text"
cursor = s.end_char
assert cursor == len(md)
def test_prose_partition_and_spans():
    """Long prose splits into several slices that still partition the input."""
    heading = "# Title\n\n"
    first_section = "First paragraph with several words here. " * 20
    second_section = (
        "\n\nSecond section with more prose to force multiple chunks. " * 20
    )
    md = heading + first_section + second_section

    pieces = chunk_markdown_with_spans(md)

    assert len(pieces) > 1
    _assert_lossless_partition(md, pieces)
def test_table_kept_whole_with_exact_span():
    """Every slice that starts with a table row contains the whole table."""
    table = "| a | b |\n| - | - |\n| 1 | 2 |\n"
    md = f"Intro prose before the table.\n{table}\nClosing prose after."

    pieces = chunk_markdown_with_spans(md)
    _assert_lossless_partition(md, pieces)

    header_row, data_row = "| a | b |", "| 1 | 2 |"
    table_pieces = [p for p in pieces if p.text.lstrip().startswith("|")]
    assert any(data_row in p.text for p in table_pieces)
    for p in table_pieces:
        assert header_row in p.text and data_row in p.text
def test_table_at_eof_without_trailing_newline_stays_whole():
    """A table ending the input with no final newline lands in exactly one slice."""
    md = "Intro.\n| a | b |\n| - | - |\n| 1 | 2 |"

    pieces = chunk_markdown_with_spans(md)
    _assert_lossless_partition(md, pieces)

    holding_data_row = [p for p in pieces if "| 1 | 2 |" in p.text]
    assert len(holding_data_row) == 1
    (only_piece,) = holding_data_row
    assert "| a | b |" in only_piece.text
def test_code_chunker_partition_and_spans():
    """With ``use_code_chunker=True`` the slices still form a lossless partition."""
    functions = [
        f"def func_{i}(x):\n total = x + {i}\n return total" for i in range(40)
    ]
    code = "\n\n".join(functions)

    pieces = chunk_markdown_with_spans(code, use_code_chunker=True)

    assert len(pieces) >= 1
    _assert_lossless_partition(code, pieces)
def test_empty_markdown_yields_no_slices():
    """Chunking the empty string returns an empty list."""
    pieces = chunk_markdown_with_spans("")
    assert pieces == []

View file

@ -37,9 +37,12 @@ def _make_orm_doc(connector_doc, doc_id):
async def test_index_calls_embed_and_chunk_via_to_thread( async def test_index_calls_embed_and_chunk_via_to_thread(
pipeline, make_connector_document, monkeypatch pipeline, make_connector_document, monkeypatch
): ):
"""index() runs the chunker and embed_texts via asyncio.to_thread, not blocking the loop.""" """index() runs the chunker and embed_texts via asyncio.to_thread, not blocking the loop.
from app.indexing_pipeline.document_chunker import ChunkSlice
Routing between ``chunk_text`` (code path) and ``chunk_text_hybrid`` (default
path, see issue #1334) is verified separately in
``test_non_code_documents_use_hybrid_chunker``.
"""
to_thread_calls = [] to_thread_calls = []
original_to_thread = asyncio.to_thread original_to_thread = asyncio.to_thread
@ -48,11 +51,11 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
return await original_to_thread(func, *args, **kwargs) return await original_to_thread(func, *args, **kwargs)
monkeypatch.setattr(asyncio, "to_thread", tracking_to_thread) monkeypatch.setattr(asyncio, "to_thread", tracking_to_thread)
mock_chunker = MagicMock(return_value=[ChunkSlice("chunk1", 0, 6)]) mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
mock_chunker.__name__ = "chunk_markdown_with_spans" mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
monkeypatch.setattr( monkeypatch.setattr(
"app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans", "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
mock_chunker, mock_chunk_hybrid,
) )
mock_embed = MagicMock( mock_embed = MagicMock(
side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts] side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]
@ -87,25 +90,34 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
await pipeline.index(document, connector_doc) await pipeline.index(document, connector_doc)
assert "chunk_markdown_with_spans" in to_thread_calls # Either chunker entry point satisfies the "chunking runs off the event
# loop" contract this test guards. Routing between the two is verified
# in test_non_code_documents_use_hybrid_chunker.
assert {"chunk_text", "chunk_text_hybrid"} & set(to_thread_calls)
assert "embed_texts" in to_thread_calls assert "embed_texts" in to_thread_calls
assert document.status == DocumentStatus.ready() assert document.status == DocumentStatus.ready()
async def test_non_code_documents_use_prose_chunker( async def test_non_code_documents_use_hybrid_chunker(
pipeline, make_connector_document, monkeypatch pipeline, make_connector_document, monkeypatch
): ):
"""Non-code documents chunk with use_code_chunker=False (issue #1334). """Non-code documents route through ``chunk_text_hybrid`` (issue #1334).
The table-aware prose path keeps Markdown tables intact; only documents The hybrid chunker preserves Markdown table integrity by avoiding splits
flagged with ``should_use_code_chunker=True`` request the code chunker. mid-row. Only documents flagged with ``should_use_code_chunker=True``
should take the ``chunk_text`` path.
""" """
from app.indexing_pipeline.document_chunker import ChunkSlice mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
mock_chunker = MagicMock(return_value=[ChunkSlice("chunk1", 0, 6)])
monkeypatch.setattr( monkeypatch.setattr(
"app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans", "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
mock_chunker, mock_chunk_hybrid,
)
mock_chunk_code = MagicMock(return_value=["chunk1"])
mock_chunk_code.__name__ = "chunk_text"
monkeypatch.setattr(
"app.indexing_pipeline.cache.cached_indexing.chunk_text",
mock_chunk_code,
) )
monkeypatch.setattr( monkeypatch.setattr(
"app.indexing_pipeline.cache.cached_indexing.embed_texts", "app.indexing_pipeline.cache.cached_indexing.embed_texts",
@ -137,49 +149,8 @@ async def test_non_code_documents_use_prose_chunker(
await pipeline.index(document, connector_doc) await pipeline.index(document, connector_doc)
mock_chunker.assert_called_once() mock_chunk_hybrid.assert_called_once()
assert mock_chunker.call_args.args[1] is False mock_chunk_code.assert_not_called()
async def test_code_documents_request_code_chunker(
    pipeline, make_connector_document, monkeypatch
):
    """A document flagged ``should_use_code_chunker=True`` passes ``True`` as the
    chunker's second positional argument."""
    from app.indexing_pipeline.document_chunker import ChunkSlice

    def _fake_embed(texts):
        # One fixed-size dummy vector per input text.
        return [[0.1] * _EMBEDDING_DIM for _ in texts]

    async def _noop_persist(_session, doc, *_args, **_kwargs):
        # Stand-in for persistence: only flips the document to ready.
        doc.status = DocumentStatus.ready()

    chunker_spy = MagicMock(return_value=[ChunkSlice("chunk1", 0, 6)])
    monkeypatch.setattr(
        "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
        chunker_spy,
    )
    monkeypatch.setattr(
        "app.indexing_pipeline.cache.cached_indexing.embed_texts",
        MagicMock(side_effect=_fake_embed),
    )
    monkeypatch.setattr(pipeline, "_load_existing_chunks", AsyncMock(return_value=[]))
    monkeypatch.setattr(
        "app.indexing_pipeline.indexing_pipeline_service.persist_scratch_index",
        _noop_persist,
    )

    connector_doc = make_connector_document(
        document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
        unique_id="repo-1",
        search_space_id=1,
        should_use_code_chunker=True,
    )
    orm_document = MagicMock(spec=Document)
    orm_document.id = 1
    orm_document.status = DocumentStatus.pending()

    await pipeline.index(orm_document, connector_doc)

    chunker_spy.assert_called_once()
    code_flag = chunker_spy.call_args.args[1]
    assert code_flag is True
def _mock_session_factory(orm_docs_by_id): def _mock_session_factory(orm_docs_by_id):

View file

@ -71,7 +71,7 @@ class _KBBackendStub(KBPostgresBackend):
def __init__(self, *, children=None, file_data=None) -> None: def __init__(self, *, children=None, file_data=None) -> None:
self.als_info = AsyncMock(return_value=children or []) self.als_info = AsyncMock(return_value=children or [])
self._load_file_data = AsyncMock( self._load_file_data = AsyncMock(
return_value=(file_data, 17, None) if file_data is not None else None return_value=(file_data, 17) if file_data is not None else None
) )

View file

@ -69,25 +69,13 @@ class _FakeSession:
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def _stub_embeddings_and_chunks(monkeypatch: pytest.MonkeyPatch) -> None: def _stub_embeddings_and_chunks(monkeypatch: pytest.MonkeyPatch) -> None:
"""Avoid loading the embedding model in unit tests. """Avoid loading the embedding model in unit tests."""
Mirrors the legacy stub: one chunk spanning the whole content, with a
zero summary/chunk vector, routed through the shared span builder.
"""
from app.indexing_pipeline.document_chunker import ChunkSlice
async def _fake_build_chunk_embeddings(content: str, *, use_code_chunker: bool):
summary = np.zeros(8, dtype=np.float32)
pairs = (
[(ChunkSlice(content, 0, len(content)), np.zeros(8, dtype=np.float32))]
if content
else []
)
return summary, pairs
monkeypatch.setattr( monkeypatch.setattr(
kb_persistence, "build_chunk_embeddings", _fake_build_chunk_embeddings kb_persistence,
"embed_texts",
lambda texts: [np.zeros(8, dtype=np.float32) for _ in texts],
) )
monkeypatch.setattr(kb_persistence, "chunk_text", lambda content: [content])
@pytest.mark.asyncio @pytest.mark.asyncio

View file

@ -1,92 +0,0 @@
"""Unit tests for the numbered-document read preamble."""
import pytest
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.numbered_document import (
build_read_preamble,
compute_matched_line_ranges,
)
pytestmark = pytest.mark.unit
_BODY = "alpha\nbravo\ncharlie\ndelta"
class TestComputeMatchedLineRanges:
    """``compute_matched_line_ranges`` turns chunk char spans over ``_BODY`` into
    1-based ``(first_line, last_line)`` pairs for the requested chunk ids."""

    def test_maps_matched_chunk_spans_to_line_ranges(self):
        spans = [(1, 0, 12), (2, 12, len(_BODY))]
        assert compute_matched_line_ranges(_BODY, spans, {2}) == [(3, 4)]

    def test_includes_only_matched_chunks(self):
        spans = [(1, 0, 5), (2, 6, 11)]
        assert compute_matched_line_ranges(_BODY, spans, {1}) == [(1, 1)]

    def test_skips_chunks_without_spans(self):
        # A matched chunk whose offsets are both None produces no range.
        spans = [(1, None, None)]
        assert compute_matched_line_ranges(_BODY, spans, {1}) == []

    def test_sorted_and_deduplicated(self):
        # Chunks 2 and 3 share one span; chunk 1 is listed first but sorts last.
        spans = [(1, 12, len(_BODY)), (2, 0, 5), (3, 0, 5)]
        result = compute_matched_line_ranges(_BODY, spans, {1, 2, 3})
        assert result == [(1, 1), (3, 4)]
class TestBuildReadPreamble:
    """Checks on the text returned by ``build_read_preamble``."""

    @staticmethod
    def _render(document_id, document_type, title, url, matched_line_ranges):
        """Call ``build_read_preamble`` with every argument passed by keyword."""
        return build_read_preamble(
            document_id=document_id,
            document_type=document_type,
            title=title,
            url=url,
            matched_line_ranges=matched_line_ranges,
        )

    def test_contains_document_metadata(self):
        text = self._render(42, "FILE", "Test Doc", "https://example.com", [])
        for fragment in (
            "<document_id>42</document_id>",
            "<document_type>FILE</document_type>",
            "Test Doc",
            "https://example.com",
        ):
            assert fragment in text

    def test_citation_hint_uses_document_id(self):
        text = self._render(42, "FILE", "Test Doc", "", [])
        assert "[citation:d42#L" in text

    def test_lists_matched_line_ranges(self):
        text = self._render(7, "NOTE", "Notes", "", [(12, 18), (40, 40)])
        assert "<matched_lines>" in text
        assert "12-18" in text
        assert "40" in text

    def test_omits_matched_lines_block_when_empty(self):
        text = self._render(7, "NOTE", "Notes", "", [])
        assert "<matched_lines>" not in text

    def test_ends_with_trailing_newline_so_body_follows_cleanly(self):
        text = self._render(1, "FILE", "t", "", [])
        assert text.endswith("\n")

View file

@ -0,0 +1,162 @@
"""Tests for async_retry utilities."""
import httpx
import pytest
from app.connectors.exceptions import (
ConnectorAPIError,
ConnectorAuthError,
ConnectorError,
ConnectorRateLimitError,
ConnectorTimeoutError,
)
from app.utils.async_retry import _is_retryable, raise_for_status
pytestmark = pytest.mark.unit
def make_response(
    status_code: int,
    *,
    headers: dict[str, str] | None = None,
    json_body=None,
    text_body: str = "",
):
    """Build an ``httpx.Response`` bound to a dummy ``GET https://x`` request.

    The body is ``json_body`` when one is supplied (anything other than
    ``None``); otherwise it is the plain-text ``text_body``.
    """
    request = httpx.Request("GET", "https://x")
    if json_body is not None:
        body_kwargs = {"json": json_body}
    else:
        body_kwargs = {"text": text_body}
    return httpx.Response(
        status_code=status_code,
        headers=headers,
        request=request,
        **body_kwargs,
    )
def test_raise_for_status_does_not_raise_for_success():
    """A 200 response passes through ``raise_for_status`` without raising."""
    ok_response = make_response(200)
    raise_for_status(ok_response)
@pytest.mark.parametrize(
    ("retry_after_header", "expected"),
    [
        ("5", 5.0),
        (None, None),
        ("abc", None),
    ],
)
def test_raise_for_status_429(retry_after_header, expected):
    """HTTP 429 raises ``ConnectorRateLimitError`` exposing ``retry_after`` and the JSON body.

    A numeric ``Retry-After`` header is expected as a float; a missing or
    non-numeric header is expected as ``None``.
    """
    headers = {} if retry_after_header is None else {"Retry-After": retry_after_header}
    response = make_response(
        429,
        headers=headers,
        json_body={"detail": "rate limited"},
    )

    with pytest.raises(ConnectorRateLimitError) as caught:
        raise_for_status(response)

    error = caught.value
    assert error.retry_after == expected
    assert error.response_body == {"detail": "rate limited"}
@pytest.mark.parametrize("status_code", [401, 403])
def test_raise_for_status_auth_errors(status_code):
    """401 and 403 raise ``ConnectorAuthError`` carrying the status code and JSON body."""
    response = make_response(status_code, json_body={"error": "unauthorized"})

    with pytest.raises(ConnectorAuthError) as caught:
        raise_for_status(response)

    error = caught.value
    assert error.status_code == status_code
    assert error.response_body == {"error": "unauthorized"}
def test_raise_for_status_gateway_timeout():
    """HTTP 504 raises ``ConnectorTimeoutError``."""
    response = make_response(504, json_body={"error": "timeout"})
    with pytest.raises(ConnectorTimeoutError):
        raise_for_status(response)
@pytest.mark.parametrize("status_code", [500, 502])
def test_raise_for_status_server_errors(status_code):
    """500 and 502 raise ``ConnectorAPIError`` that records the status code."""
    response = make_response(status_code, json_body={"error": "server"})

    with pytest.raises(ConnectorAPIError) as caught:
        raise_for_status(response)

    assert caught.value.status_code == status_code
@pytest.mark.parametrize("status_code", [400, 404])
def test_raise_for_status_client_errors(status_code):
    """400 and 404 raise ``ConnectorAPIError`` that records the status code."""
    response = make_response(status_code, json_body={"error": "client"})

    with pytest.raises(ConnectorAPIError) as caught:
        raise_for_status(response)

    assert caught.value.status_code == status_code
def test_raise_for_status_uses_text_when_json_parsing_fails():
    """With a plain-text 500 body, ``response_body`` on the error is that raw text."""
    raw_text = "Internal server error"
    response = make_response(500, text_body=raw_text)

    with pytest.raises(ConnectorAPIError) as caught:
        raise_for_status(response)

    assert caught.value.response_body == "Internal server error"
def test_connector_error_retryable_false():
    """A bare ``ConnectorError`` is reported as not retryable."""
    assert _is_retryable(ConnectorError("boom")) is False
def test_rate_limit_error_is_retryable():
    """``ConnectorRateLimitError`` is reported as retryable."""
    assert _is_retryable(ConnectorRateLimitError()) is True
def test_timeout_exception_is_retryable():
    """``httpx.TimeoutException`` is reported as retryable."""
    assert _is_retryable(httpx.TimeoutException("timeout")) is True
def test_connect_error_is_retryable():
    """``httpx.ConnectError`` is reported as retryable."""
    assert _is_retryable(httpx.ConnectError("connection failed")) is True
def test_unrelated_exception_is_not_retryable():
    """An exception outside the connector/httpx families (``ValueError``) is not retryable."""
    assert _is_retryable(ValueError("boom")) is False

View file

@ -0,0 +1,293 @@
"""Tests for strip_markdown_fences() and extract_text_content() in
app/utils/content_utils.py.
Out of scope: bootstrap_history_from_db() async + DB, belongs in
integration tests.
Run:
uv run pytest -m unit tests/unit/utils/test_content_utils.py
"""
import pytest
pytestmark = pytest.mark.unit
# ===========================================================================
# strip_markdown_fences()
# ===========================================================================
class TestStripMarkdownFences:
    """Behavior of ``strip_markdown_fences(text: str) -> str``.

    Documented pattern: r"^```(?:\\w+)?\\s*\\n(.*?)```\\s*$" (re.DOTALL),
    applied to ``text.strip()`` so whitespace around the fence is consumed
    first. The captured body is ``.strip()``-ped again before it is returned.
    """

    # ------------------------------------------------------------------
    # Fenced with a language tag
    # ------------------------------------------------------------------
    def test_json_fence_returns_inner_content(self):
        from app.utils.content_utils import strip_markdown_fences

        fenced = '```json\n{"key": "value"}\n```'
        assert strip_markdown_fences(fenced) == '{"key": "value"}'

    def test_python_fence_returns_inner_content(self):
        from app.utils.content_utils import strip_markdown_fences

        fenced = "```python\ndef hello():\n return 'hi'\n```"
        assert strip_markdown_fences(fenced) == "def hello():\n return 'hi'"

    def test_yaml_fence_returns_inner_content(self):
        from app.utils.content_utils import strip_markdown_fences

        fenced = "```yaml\nkey: value\n```"
        assert strip_markdown_fences(fenced) == "key: value"

    def test_sql_multiline_fence_returns_inner_content(self):
        from app.utils.content_utils import strip_markdown_fences

        fenced = "```sql\nSELECT *\nFROM users\nWHERE id = 1;\n```"
        assert strip_markdown_fences(fenced) == "SELECT *\nFROM users\nWHERE id = 1;"

    # ------------------------------------------------------------------
    # Fenced without a language tag
    # ------------------------------------------------------------------
    def test_no_lang_tag_single_line_returns_inner_content(self):
        from app.utils.content_utils import strip_markdown_fences

        fenced = "```\nhello world\n```"
        assert strip_markdown_fences(fenced) == "hello world"

    def test_no_lang_tag_multiline_returns_inner_content(self):
        from app.utils.content_utils import strip_markdown_fences

        fenced = "```\nline one\nline two\n```"
        assert strip_markdown_fences(fenced) == "line one\nline two"

    # ------------------------------------------------------------------
    # Plain text (no fences) is returned unchanged
    # ------------------------------------------------------------------
    def test_plain_text_returned_unchanged(self):
        from app.utils.content_utils import strip_markdown_fences

        plain = "just plain text with no fences"
        assert strip_markdown_fences(plain) == plain

    def test_plain_text_with_newlines_returned_unchanged(self):
        from app.utils.content_utils import strip_markdown_fences

        plain = "line one\nline two\nline three"
        assert strip_markdown_fences(plain) == plain

    def test_empty_string_returned_unchanged(self):
        from app.utils.content_utils import strip_markdown_fences

        assert strip_markdown_fences("") == ""

    # ------------------------------------------------------------------
    # Surrounding whitespace handling: whitespace outside the fence is
    # consumed by the initial strip, and whitespace between the fence
    # markers and the content is removed by the strip on the captured body.
    # ------------------------------------------------------------------
    def test_leading_whitespace_around_fence_stripped(self):
        from app.utils.content_utils import strip_markdown_fences

        fenced = " ```json\n{}\n```"
        assert strip_markdown_fences(fenced) == "{}"

    def test_trailing_whitespace_around_fence_stripped(self):
        from app.utils.content_utils import strip_markdown_fences

        fenced = "```json\n{}\n``` "
        assert strip_markdown_fences(fenced) == "{}"

    def test_surrounding_newlines_stripped(self):
        from app.utils.content_utils import strip_markdown_fences

        fenced = '\n\n```json\n{"a": 1}\n```\n\n'
        assert strip_markdown_fences(fenced) == '{"a": 1}'

    def test_inner_indentation_preserved(self):
        """The captured body is ``.strip()``-ped, so leading whitespace on the
        *first* line is removed, while indentation on later lines is kept."""
        from app.utils.content_utils import strip_markdown_fences

        fenced = "```\n indented line\n deeper indent\n```"
        result = strip_markdown_fences(fenced)
        assert "indented line" in result
        # Pin the documented strip: the result must begin with the first
        # line's text, not with its leading whitespace.
        assert result.startswith("indented line")
        # Indentation on the second line is preserved.
        assert " deeper indent" in result
# ===========================================================================
# extract_text_content()
# ===========================================================================
class TestExtractTextContent:
    """Behavior of ``extract_text_content(content: str | dict | list) -> str``."""

    # ------------------------------------------------------------------
    # str input: returned as-is
    # ------------------------------------------------------------------
    def test_str_input_returned_as_is(self):
        from app.utils.content_utils import extract_text_content

        payload = "hello world"
        assert extract_text_content(payload) == "hello world"

    def test_str_empty_returned_as_is(self):
        from app.utils.content_utils import extract_text_content

        assert extract_text_content("") == ""

    def test_str_with_internal_whitespace_returned_as_is(self):
        from app.utils.content_utils import extract_text_content

        payload = " spaced "
        assert extract_text_content(payload) == " spaced "

    # ------------------------------------------------------------------
    # dict with a "text" key: that value is returned
    # ------------------------------------------------------------------
    def test_dict_with_text_key_returns_its_value(self):
        from app.utils.content_utils import extract_text_content

        assert extract_text_content({"text": "from dict"}) == "from dict"

    def test_dict_with_text_key_empty_value(self):
        from app.utils.content_utils import extract_text_content

        assert extract_text_content({"text": ""}) == ""

    def test_dict_with_text_key_ignores_other_keys(self):
        from app.utils.content_utils import extract_text_content

        message = {"text": "important", "role": "assistant", "extra": 99}
        assert extract_text_content(message) == "important"

    # ------------------------------------------------------------------
    # dict without a "text" key: str(dict)
    # ------------------------------------------------------------------
    def test_dict_without_text_key_returns_str_repr(self):
        from app.utils.content_utils import extract_text_content

        message = {"role": "assistant", "value": 42}
        assert extract_text_content(message) == str(message)

    def test_empty_dict_returns_str_repr(self):
        from app.utils.content_utils import extract_text_content

        assert extract_text_content({}) == str({})

    # ------------------------------------------------------------------
    # list of parts (text dicts and plain strings), joined with "\n"
    # ------------------------------------------------------------------
    def test_list_text_type_parts_joined_with_newline(self):
        from app.utils.content_utils import extract_text_content

        blocks = [
            {"type": "text", "text": "Hello"},
            {"type": "text", "text": "world"},
        ]
        assert extract_text_content(blocks) == "Hello\nworld"

    def test_list_plain_strings_joined_with_newline(self):
        from app.utils.content_utils import extract_text_content

        assert extract_text_content(["foo", "bar"]) == "foo\nbar"

    def test_list_mixed_text_dicts_and_plain_strings(self):
        from app.utils.content_utils import extract_text_content

        blocks = [
            {"type": "text", "text": "Hello"},
            "plain",
            {"type": "text", "text": "world"},
        ]
        result = extract_text_content(blocks)
        for fragment in ("Hello", "plain", "world"):
            assert fragment in result

    def test_list_non_text_type_parts_ignored(self):
        """tool_use, image, and other non-text blocks must not leak into output."""
        from app.utils.content_utils import extract_text_content

        blocks = [
            {"type": "tool_use", "id": "abc", "name": "search_kb"},
            {"type": "text", "text": "visible text"},
            {"type": "image", "source": {"url": "https://example.com/img.png"}},
        ]
        result = extract_text_content(blocks)
        assert result == "visible text"
        for leaked in ("tool_use", "search_kb", "image"):
            assert leaked not in result

    def test_list_only_non_text_parts_returns_empty_string(self):
        from app.utils.content_utils import extract_text_content

        blocks = [
            {"type": "tool_use", "id": "x"},
            {"type": "image", "source": {}},
        ]
        assert extract_text_content(blocks) == ""

    def test_list_single_text_part(self):
        from app.utils.content_utils import extract_text_content

        assert extract_text_content([{"type": "text", "text": "only me"}]) == "only me"

    def test_list_text_part_missing_text_key_contributes_empty_string(self):
        """A text-typed dict with no "text" key must not hide later text parts."""
        from app.utils.content_utils import extract_text_content

        blocks = [{"type": "text"}, {"type": "text", "text": "after"}]
        result = extract_text_content(blocks)
        # Only the survival of the later text is pinned; the exact joined form
        # ("\nafter" vs "after") is deliberately left open by this test.
        assert "after" in result

    # ------------------------------------------------------------------
    # Empty list: empty string
    # ------------------------------------------------------------------
    def test_empty_list_returns_empty_string(self):
        from app.utils.content_utils import extract_text_content

        assert extract_text_content([]) == ""

    # ------------------------------------------------------------------
    # Unsupported types: empty string
    # ------------------------------------------------------------------
    def test_none_returns_empty_string(self):
        from app.utils.content_utils import extract_text_content

        assert extract_text_content(None) == ""

    def test_integer_returns_empty_string(self):
        from app.utils.content_utils import extract_text_content

        assert extract_text_content(42) == ""

    def test_boolean_returns_empty_string(self):
        from app.utils.content_utils import extract_text_content

        assert extract_text_content(True) == ""

View file

@ -1,39 +0,0 @@
"""Unit tests for char-span -> line-range conversion."""
from __future__ import annotations
import pytest
from app.utils.text_spans import char_span_to_line_range
pytestmark = pytest.mark.unit
_TEXT = "line1\nline2\nline3"
def test_single_line_span() -> None:
    """A span covering exactly "line2" resolves to line 2 only."""
    needle = "line2"
    begin = _TEXT.index(needle)
    end = begin + len(needle)
    assert char_span_to_line_range(_TEXT, begin, end) == (2, 2)
def test_first_line_span() -> None:
    """A span covering exactly "line1" resolves to line 1 only."""
    end = len("line1")
    assert char_span_to_line_range(_TEXT, 0, end) == (1, 1)
def test_last_line_span() -> None:
    """A span from the start of "line3" to the end of the text resolves to line 3."""
    begin = _TEXT.index("line3")
    end = len(_TEXT)
    assert char_span_to_line_range(_TEXT, begin, end) == (3, 3)
def test_multi_line_span() -> None:
    """A span running from offset 0 through the end of "line2" covers lines 1-2."""
    end = _TEXT.index("line2") + 5
    assert char_span_to_line_range(_TEXT, 0, end) == (1, 2)
def test_empty_span_resolves_to_its_line() -> None:
    """A zero-length span at the start of "line2" still resolves to line 2."""
    offset = _TEXT.index("line2")
    assert char_span_to_line_range(_TEXT, offset, offset) == (2, 2)
def test_offsets_clamped_to_text_bounds() -> None:
    """Out-of-range offsets (negative start, huge end) resolve to the full 1-3 line range."""
    before_start, past_end = -5, 10_000
    assert char_span_to_line_range(_TEXT, before_start, past_end) == (1, 3)

View file

@ -14,7 +14,10 @@ SURFSENSE_BACKEND_INTERNAL_URL=http://backend:8000
# ───────────────────────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────────────────────
# Runtime configuration (read at runtime by the server, no rebuild needed) # Runtime configuration (read at runtime by the server, no rebuild needed)
# ───────────────────────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────────────────────
# Configure these plain variables for runtime behavior. They are read by server
# code when the app starts/serves requests, so changing them requires restarting
# the web process but not rebuilding the frontend bundle.
#
# Authentication method: LOCAL (email/password) or GOOGLE (OAuth). # Authentication method: LOCAL (email/password) or GOOGLE (OAuth).
AUTH_TYPE=LOCAL AUTH_TYPE=LOCAL
# Document parsing backend: DOCLING, LLAMACLOUD, etc. # Document parsing backend: DOCLING, LLAMACLOUD, etc.
@ -22,16 +25,6 @@ ETL_SERVICE=DOCLING
# Deployment mode: self-hosted or cloud. # Deployment mode: self-hosted or cloud.
DEPLOYMENT_MODE=self-hosted DEPLOYMENT_MODE=self-hosted
# ─────────────────────────────────────────────────────────────────────────────
# Build-time fallbacks for packaged clients (e.g. Electron) without a runtime
# config provider. Optional; Docker reads the plain runtime vars above first.
# ─────────────────────────────────────────────────────────────────────────────
# NEXT_PUBLIC_AUTH_TYPE=GOOGLE
# NEXT_PUBLIC_ETL_SERVICE=DOCLING
# NEXT_PUBLIC_DEPLOYMENT_MODE=self-hosted
# Overrides the app version shown in the UI (defaults to package.json version).
# NEXT_PUBLIC_APP_VERSION=
# ───────────────────────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────────────────────
# Database (Contact Form, optional) # Database (Contact Form, optional)
# ───────────────────────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────────────────────
@ -72,3 +65,20 @@ NEXT_PUBLIC_GOOGLE_ADSENSE_SLOT_FREE_HUB_BEFORE_FAQ=
# ───────────────────────────────────────────────────────────────────────────── # ─────────────────────────────────────────────────────────────────────────────
NEXT_PUBLIC_GLOBAL_ANNOUNCEMENT_ENABLED=false NEXT_PUBLIC_GLOBAL_ANNOUNCEMENT_ENABLED=false
NEXT_PUBLIC_GLOBAL_ANNOUNCEMENT_MESSAGE= NEXT_PUBLIC_GLOBAL_ANNOUNCEMENT_MESSAGE=
# ─────────────────────────────────────────────────────────────────────────────
# Internal build-time fallbacks
# ─────────────────────────────────────────────────────────────────────────────
#
# Most deployments should leave these unset.
#
# These are only for SurfSense-managed production/cloud builds or packaged
# clients that do not have the normal server runtime config available.
#
# NEXT_PUBLIC_* values are embedded into the browser bundle during `next build`.
# Changing them after the bundle is built has no effect.
# NEXT_PUBLIC_AUTH_TYPE=GOOGLE
# NEXT_PUBLIC_ETL_SERVICE=DOCLING
# NEXT_PUBLIC_DEPLOYMENT_MODE=self-hosted
# NEXT_PUBLIC_APP_VERSION=

View file

@ -58,6 +58,11 @@
--highlight: oklch(0.852 0.199 91.936); --highlight: oklch(0.852 0.199 91.936);
} }
/* Hide the auth UI variant that does not match the auth type on <html>:
   .runtime-auth-local is hidden when data-surfsense-auth-type is "GOOGLE",
   .runtime-auth-google is hidden when it is "LOCAL". */
html[data-surfsense-auth-type="GOOGLE"] .runtime-auth-local,
html[data-surfsense-auth-type="LOCAL"] .runtime-auth-google {
display: none;
}
.dark { .dark {
--background: oklch(0.145 0 0); --background: oklch(0.145 0 0);
--foreground: oklch(0.985 0 0); --foreground: oklch(0.985 0 0);
@ -270,12 +275,6 @@ button {
contain-intrinsic-size: 0 40px; contain-intrinsic-size: 0 40px;
} }
/* Monaco whole-line highlight for a cited source span (Phase E). */
.citation-line-highlight {
background-color: color-mix(in srgb, var(--primary) 16%, transparent);
box-shadow: inset 2px 0 0 0 var(--primary);
}
@source "../node_modules/@llamaindex/chat-ui/**/*.{ts,tsx}"; @source "../node_modules/@llamaindex/chat-ui/**/*.{ts,tsx}";
@source "../node_modules/streamdown/dist/*.js"; @source "../node_modules/streamdown/dist/*.js";
@source "../node_modules/@streamdown/code/dist/*.js"; @source "../node_modules/@streamdown/code/dist/*.js";

View file

@ -2,6 +2,7 @@ import type { Metadata, Viewport } from "next";
import "./globals.css"; import "./globals.css";
import { RootProvider } from "fumadocs-ui/provider/next"; import { RootProvider } from "fumadocs-ui/provider/next";
import { Roboto } from "next/font/google"; import { Roboto } from "next/font/google";
import Script from "next/script";
import { AnnouncementToastProvider } from "@/components/announcements/AnnouncementToastProvider"; import { AnnouncementToastProvider } from "@/components/announcements/AnnouncementToastProvider";
import { DesktopUpdateToast } from "@/components/desktop/desktop-update-toast"; import { DesktopUpdateToast } from "@/components/desktop/desktop-update-toast";
import { GlobalLoadingProvider } from "@/components/providers/GlobalLoadingProvider"; import { GlobalLoadingProvider } from "@/components/providers/GlobalLoadingProvider";
@ -16,8 +17,13 @@ import {
import { ThemeProvider } from "@/components/theme/theme-provider"; import { ThemeProvider } from "@/components/theme/theme-provider";
import { Toaster } from "@/components/ui/sonner"; import { Toaster } from "@/components/ui/sonner";
import { LocaleProvider } from "@/contexts/LocaleContext"; import { LocaleProvider } from "@/contexts/LocaleContext";
import { BUILD_TIME_AUTH_TYPE } from "@/lib/env-config";
import { PlatformProvider } from "@/contexts/platform-context"; import { PlatformProvider } from "@/contexts/platform-context";
import { ReactQueryClientProvider } from "@/lib/query-client/query-client.provider"; import { ReactQueryClientProvider } from "@/lib/query-client/query-client.provider";
import {
getRuntimeAuthInitScript,
resolveRuntimeAuthUiMode,
} from "@/lib/runtime-auth-config";
import { cn } from "@/lib/utils"; import { cn } from "@/lib/utils";
const roboto = Roboto({ const roboto = Roboto({
@ -131,8 +137,15 @@ export default function RootLayout({
// Language can be switched dynamically through LanguageSwitcher component // Language can be switched dynamically through LanguageSwitcher component
// Locale state is managed by LocaleContext and persisted in localStorage // Locale state is managed by LocaleContext and persisted in localStorage
return ( return (
<html lang="en" suppressHydrationWarning> <html
lang="en"
data-surfsense-auth-type={resolveRuntimeAuthUiMode(BUILD_TIME_AUTH_TYPE)}
suppressHydrationWarning
>
<head> <head>
<Script id="surfsense-runtime-auth-init" strategy="beforeInteractive">
{getRuntimeAuthInitScript(BUILD_TIME_AUTH_TYPE)}
</Script>
<link rel="preconnect" href="https://api.github.com" /> <link rel="preconnect" href="https://api.github.com" />
<OrganizationJsonLd /> <OrganizationJsonLd />
<WebSiteJsonLd /> <WebSiteJsonLd />

View file

@ -1,11 +1,6 @@
import { atom } from "jotai"; import { atom } from "jotai";
import { rightPanelCollapsedAtom, rightPanelTabAtom } from "@/atoms/layout/right-panel.atom"; import { rightPanelCollapsedAtom, rightPanelTabAtom } from "@/atoms/layout/right-panel.atom";
export interface EditorLineRange {
start: number;
end: number;
}
interface EditorPanelState { interface EditorPanelState {
isOpen: boolean; isOpen: boolean;
kind: "document" | "local_file" | "memory"; kind: "document" | "local_file" | "memory";
@ -14,10 +9,6 @@ interface EditorPanelState {
searchSpaceId: number | null; searchSpaceId: number | null;
memoryScope: "user" | "team" | null; memoryScope: "user" | "team" | null;
title: string | null; title: string | null;
// Citation line anchor: when set, the editor opens the raw source view
// scrolled to and highlighting this 1-based inclusive line range.
highlightLines: EditorLineRange | null;
forceSourceView: boolean;
} }
const initialState: EditorPanelState = { const initialState: EditorPanelState = {
@ -28,8 +19,6 @@ const initialState: EditorPanelState = {
searchSpaceId: null, searchSpaceId: null,
memoryScope: null, memoryScope: null,
title: null, title: null,
highlightLines: null,
forceSourceView: false,
}; };
export const editorPanelAtom = atom<EditorPanelState>(initialState); export const editorPanelAtom = atom<EditorPanelState>(initialState);
@ -44,14 +33,7 @@ export const openEditorPanelAtom = atom(
get, get,
set, set,
payload: payload:
| { | { documentId: number; searchSpaceId: number; title?: string; kind?: "document" }
documentId: number;
searchSpaceId: number;
title?: string;
kind?: "document";
highlightLines?: EditorLineRange | null;
forceSourceView?: boolean;
}
| { | {
kind: "local_file"; kind: "local_file";
localFilePath: string; localFilePath: string;
@ -77,8 +59,6 @@ export const openEditorPanelAtom = atom(
searchSpaceId: payload.searchSpaceId ?? null, searchSpaceId: payload.searchSpaceId ?? null,
memoryScope: null, memoryScope: null,
title: payload.title ?? null, title: payload.title ?? null,
highlightLines: null,
forceSourceView: false,
}); });
set(rightPanelTabAtom, "editor"); set(rightPanelTabAtom, "editor");
set(rightPanelCollapsedAtom, false); set(rightPanelCollapsedAtom, false);
@ -93,8 +73,6 @@ export const openEditorPanelAtom = atom(
searchSpaceId: payload.searchSpaceId ?? null, searchSpaceId: payload.searchSpaceId ?? null,
memoryScope: payload.memoryScope, memoryScope: payload.memoryScope,
title: payload.title ?? null, title: payload.title ?? null,
highlightLines: null,
forceSourceView: false,
}); });
set(rightPanelTabAtom, "editor"); set(rightPanelTabAtom, "editor");
set(rightPanelCollapsedAtom, false); set(rightPanelCollapsedAtom, false);
@ -108,8 +86,6 @@ export const openEditorPanelAtom = atom(
searchSpaceId: payload.searchSpaceId, searchSpaceId: payload.searchSpaceId,
memoryScope: null, memoryScope: null,
title: payload.title ?? null, title: payload.title ?? null,
highlightLines: payload.highlightLines ?? null,
forceSourceView: payload.forceSourceView ?? false,
}); });
set(rightPanelTabAtom, "editor"); set(rightPanelTabAtom, "editor");
set(rightPanelCollapsedAtom, false); set(rightPanelCollapsedAtom, false);

View file

@ -2,11 +2,9 @@
import { useSetAtom } from "jotai"; import { useSetAtom } from "jotai";
import { FileText } from "lucide-react"; import { FileText } from "lucide-react";
import { useParams } from "next/navigation";
import type { FC } from "react"; import type { FC } from "react";
import { useId, useState } from "react"; import { useId, useState } from "react";
import { openCitationPanelAtom } from "@/atoms/citation/citation-panel.atom"; import { openCitationPanelAtom } from "@/atoms/citation/citation-panel.atom";
import { openEditorPanelAtom } from "@/atoms/editor/editor-panel.atom";
import { useCitationMetadata } from "@/components/assistant-ui/citation-metadata-context"; import { useCitationMetadata } from "@/components/assistant-ui/citation-metadata-context";
import { CitationPanelContent } from "@/components/citation-panel/citation-panel"; import { CitationPanelContent } from "@/components/citation-panel/citation-panel";
import { Citation } from "@/components/tool-ui/citation"; import { Citation } from "@/components/tool-ui/citation";
@ -110,50 +108,6 @@ const NumericChunkCitation: FC<{ chunkId: number }> = ({ chunkId }) => {
); );
}; };
interface LineCitationProps {
documentId: number;
startLine: number;
endLine: number;
}
/**
* Inline citation for a knowledge-base document line range
* (`[citation:d<documentId>#L<start>-<end>]`). Clicking opens the document in
* the editor's read-only source view, scrolled to and highlighting the cited
* lines — the same anchor the citation panel uses for chunk citations.
*/
export const LineCitation: FC<LineCitationProps> = ({ documentId, startLine, endLine }) => {
const openEditorPanel = useSetAtom(openEditorPanelAtom);
const params = useParams();
const searchSpaceId = Number(params?.search_space_id);
const label = startLine === endLine ? `L${startLine}` : `L${startLine}-${endLine}`;
const handleClick = () => {
if (!Number.isFinite(searchSpaceId)) return;
openEditorPanel({
documentId,
searchSpaceId,
highlightLines: { start: startLine, end: endLine },
forceSourceView: true,
});
};
return (
<Button
type="button"
variant="ghost"
onClick={handleClick}
className="ml-0.5 inline-flex h-5 min-w-5 items-center justify-center gap-0.5 rounded-md bg-popover px-1.5 text-[11px] font-medium text-popover-foreground/80 align-baseline"
title={`View cited lines ${startLine}${endLine}`}
aria-label={`View cited document lines ${startLine} to ${endLine}`}
>
<FileText className="size-3" />
{label}
</Button>
);
};
import { tryGetHostname } from "@/lib/url"; import { tryGetHostname } from "@/lib/url";
interface UrlCitationProps { interface UrlCitationProps {

View file

@ -3,7 +3,7 @@
import Link from "next/link"; import Link from "next/link";
import { useState } from "react"; import { useState } from "react";
import { Button } from "@/components/ui/button"; import { Button } from "@/components/ui/button";
import { BUILD_TIME_AUTH_TYPE, buildBackendUrl } from "@/lib/env-config"; import { buildBackendUrl } from "@/lib/env-config";
import { trackLoginAttempt } from "@/lib/posthog/events"; import { trackLoginAttempt } from "@/lib/posthog/events";
import { cn } from "@/lib/utils"; import { cn } from "@/lib/utils";
@ -46,7 +46,6 @@ interface SignInButtonProps {
} }
export const SignInButton = ({ variant = "desktop" }: SignInButtonProps) => { export const SignInButton = ({ variant = "desktop" }: SignInButtonProps) => {
const isGoogleAuth = BUILD_TIME_AUTH_TYPE === "GOOGLE";
const [isRedirecting, setIsRedirecting] = useState(false); const [isRedirecting, setIsRedirecting] = useState(false);
const handleGoogleLogin = () => { const handleGoogleLogin = () => {
@ -56,44 +55,45 @@ export const SignInButton = ({ variant = "desktop" }: SignInButtonProps) => {
window.location.href = buildBackendUrl("/auth/google/authorize-redirect"); window.location.href = buildBackendUrl("/auth/google/authorize-redirect");
}; };
const getClassName = () => { const getGoogleClassName = () => {
if (variant === "desktop") { if (variant === "desktop") {
return isGoogleAuth return "hidden rounded-full border border-white bg-white px-5 py-2 text-sm font-medium text-[#1f1f1f] shadow-sm hover:bg-zinc-100 hover:text-[#1f1f1f] md:flex dark:border-white";
? "hidden rounded-full border border-white bg-white px-5 py-2 text-sm font-medium text-[#1f1f1f] shadow-sm hover:bg-zinc-100 hover:text-[#1f1f1f] md:flex dark:border-white"
: "hidden rounded-full bg-black px-8 py-2 text-sm font-bold text-white shadow-[0px_-2px_0px_0px_rgba(255,255,255,0.4)_inset] md:block dark:bg-white dark:text-black";
} }
if (variant === "compact") { if (variant === "compact") {
return isGoogleAuth return "rounded-full border border-white bg-white px-4 py-1.5 text-sm font-medium text-[#1f1f1f] shadow-sm hover:bg-zinc-100 hover:text-[#1f1f1f] dark:border-white";
? "rounded-full border border-white bg-white px-4 py-1.5 text-sm font-medium text-[#1f1f1f] shadow-sm hover:bg-zinc-100 hover:text-[#1f1f1f] dark:border-white"
: "rounded-full bg-black px-6 py-1.5 text-sm font-bold text-white shadow-[0px_-2px_0px_0px_rgba(255,255,255,0.4)_inset] dark:bg-white dark:text-black";
} }
// mobile // mobile
return isGoogleAuth return "w-full rounded-lg border border-white bg-white px-8 py-2.5 font-medium text-[#1f1f1f] shadow-sm hover:bg-zinc-100 hover:text-[#1f1f1f] dark:border-white touch-manipulation";
? "w-full rounded-lg border border-white bg-white px-8 py-2.5 font-medium text-[#1f1f1f] shadow-sm hover:bg-zinc-100 hover:text-[#1f1f1f] dark:border-white touch-manipulation"
: "w-full rounded-lg bg-black px-8 py-2 font-medium text-white shadow-[0px_-2px_0px_0px_rgba(255,255,255,0.4)_inset] dark:bg-white dark:text-black text-center touch-manipulation";
}; };
if (isGoogleAuth) { const getLocalClassName = () => {
return ( if (variant === "desktop") {
return "hidden rounded-full bg-black px-8 py-2 text-sm font-bold text-white shadow-[0px_-2px_0px_0px_rgba(255,255,255,0.4)_inset] md:block dark:bg-white dark:text-black";
}
if (variant === "compact") {
return "rounded-full bg-black px-6 py-1.5 text-sm font-bold text-white shadow-[0px_-2px_0px_0px_rgba(255,255,255,0.4)_inset] dark:bg-white dark:text-black";
}
return "w-full rounded-lg bg-black px-8 py-2 font-medium text-white shadow-[0px_-2px_0px_0px_rgba(255,255,255,0.4)_inset] dark:bg-white dark:text-black text-center touch-manipulation";
};
return (
<>
<Button <Button
type="button" type="button"
variant="ghost" variant="ghost"
onClick={handleGoogleLogin} onClick={handleGoogleLogin}
disabled={isRedirecting} disabled={isRedirecting}
className={cn( className={cn(
"flex items-center justify-center gap-2 transition-colors duration-200 disabled:cursor-not-allowed disabled:opacity-50", "runtime-auth-google flex items-center justify-center gap-2 transition-colors duration-200 disabled:cursor-not-allowed disabled:opacity-50",
getClassName() getGoogleClassName()
)} )}
> >
<GoogleLogo className="h-4 w-4" /> <GoogleLogo className="h-4 w-4" />
<span>Sign In</span> <span>Sign In</span>
</Button> </Button>
); <Link href="/login" className={cn("runtime-auth-local", getLocalClassName())}>
} Sign In
</Link>
return ( </>
<Link href="/login" className={getClassName()}>
Sign In
</Link>
); );
}; };

View file

@ -46,13 +46,6 @@ export const CitationPanelContent: FC<CitationPanelContentProps> = ({
const cited = useMemo(() => data?.chunks.find((c) => c.id === chunkId) ?? null, [data, chunkId]); const cited = useMemo(() => data?.chunks.find((c) => c.id === chunkId) ?? null, [data, chunkId]);
const citedLineLabel = useMemo(() => {
const start = data?.cited_start_line;
const end = data?.cited_end_line;
if (start == null || end == null) return null;
return start === end ? `Line ${start}` : `Lines ${start}${end}`;
}, [data?.cited_start_line, data?.cited_end_line]);
const totalChunks = data?.total_chunks ?? data?.chunks.length ?? 0; const totalChunks = data?.total_chunks ?? data?.chunks.length ?? 0;
const startIndex = data?.chunk_start_index ?? 0; const startIndex = data?.chunk_start_index ?? 0;
const hasMoreAbove = startIndex > 0; const hasMoreAbove = startIndex > 0;
@ -82,15 +75,10 @@ export const CitationPanelContent: FC<CitationPanelContentProps> = ({
const handleOpenFullDocument = () => { const handleOpenFullDocument = () => {
if (!data) return; if (!data) return;
const hasLineAnchor = data.cited_start_line != null && data.cited_end_line != null;
openEditorPanel({ openEditorPanel({
documentId: data.id, documentId: data.id,
searchSpaceId: data.search_space_id, searchSpaceId: data.search_space_id,
title: data.title, title: data.title,
highlightLines: hasLineAnchor
? { start: data.cited_start_line as number, end: data.cited_end_line as number }
: null,
forceSourceView: hasLineAnchor,
}); });
}; };
@ -122,7 +110,6 @@ export const CitationPanelContent: FC<CitationPanelContentProps> = ({
</p> </p>
</div> </div>
<div className="flex items-center gap-3 shrink-0 text-[11px] text-muted-foreground"> <div className="flex items-center gap-3 shrink-0 text-[11px] text-muted-foreground">
{citedLineLabel && <span>{citedLineLabel}</span>}
{totalChunks > 0 && <span>{totalChunks} chunks</span>} {totalChunks > 0 && <span>{totalChunks} chunks</span>}
{!isLoading && !error && data && ( {!isLoading && !error && data && (
<Button <Button
@ -185,9 +172,7 @@ export const CitationPanelContent: FC<CitationPanelContentProps> = ({
Chunk #{chunk.id} Chunk #{chunk.id}
</span> </span>
{isCited && ( {isCited && (
<span className="text-[11px] font-semibold text-primary"> <span className="text-[11px] font-semibold text-primary">Cited chunk</span>
{citedLineLabel ? `Cited chunk · ${citedLineLabel}` : "Cited chunk"}
</span>
)} )}
</div> </div>
<div className="text-sm"> <div className="text-sm">

View file

@ -1,7 +1,7 @@
"use client"; "use client";
import type { ReactNode } from "react"; import type { ReactNode } from "react";
import { InlineCitation, LineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation"; import { InlineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation";
import { import {
type CitationToken, type CitationToken,
type CitationUrlMap, type CitationUrlMap,
@ -21,16 +21,6 @@ export function renderCitationToken(token: CitationToken, ordinalKey: number): R
if (token.kind === "url") { if (token.kind === "url") {
return <UrlCitation key={`citation-url-${ordinalKey}`} url={token.url} />; return <UrlCitation key={`citation-url-${ordinalKey}`} url={token.url} />;
} }
if (token.kind === "line") {
return (
<LineCitation
key={`citation-line-${token.documentId}-${token.startLine}-${ordinalKey}`}
documentId={token.documentId}
startLine={token.startLine}
endLine={token.endLine}
/>
);
}
return ( return (
<InlineCitation <InlineCitation
key={`citation-${token.isDocsChunk ? "doc-" : ""}${token.chunkId}-${ordinalKey}`} key={`citation-${token.isDocsChunk ? "doc-" : ""}${token.chunkId}-${ordinalKey}`}

View file

@ -149,8 +149,6 @@ export function EditorPanelContent({
searchSpaceId, searchSpaceId,
title, title,
onClose, onClose,
highlightLines = null,
forceSourceView = false,
}: { }: {
kind?: "document" | "local_file" | "memory"; kind?: "document" | "local_file" | "memory";
documentId?: number; documentId?: number;
@ -159,8 +157,6 @@ export function EditorPanelContent({
searchSpaceId?: number; searchSpaceId?: number;
title: string | null; title: string | null;
onClose?: () => void; onClose?: () => void;
highlightLines?: { start: number; end: number } | null;
forceSourceView?: boolean;
}) { }) {
const electronAPI = useElectronAPI(); const electronAPI = useElectronAPI();
const [editorDoc, setEditorDoc] = useState<EditorContent | null>(null); const [editorDoc, setEditorDoc] = useState<EditorContent | null>(null);
@ -209,7 +205,7 @@ export function EditorPanelContent({
const isLargeDocument = docSizeBytes > plateMaxBytes || docLineCount > plateMaxLines; const isLargeDocument = docSizeBytes > plateMaxBytes || docLineCount > plateMaxLines;
const viewerMode: ViewerMode = isMemoryMode const viewerMode: ViewerMode = isMemoryMode
? "plate" ? "plate"
: editorDoc?.viewer_mode === "monaco" || isLargeDocument || forceSourceView : editorDoc?.viewer_mode === "monaco" || isLargeDocument
? "monaco" ? "monaco"
: "plate"; : "plate";
@ -832,7 +828,6 @@ export function EditorPanelContent({
value={editorDoc.source_markdown} value={editorDoc.source_markdown}
readOnly readOnly
onChange={() => {}} onChange={() => {}}
highlightLines={highlightLines}
/> />
</div> </div>
</div> </div>
@ -923,8 +918,6 @@ function DesktopEditorPanel() {
searchSpaceId={panelState.searchSpaceId ?? undefined} searchSpaceId={panelState.searchSpaceId ?? undefined}
title={panelState.title} title={panelState.title}
onClose={closePanel} onClose={closePanel}
highlightLines={panelState.highlightLines}
forceSourceView={panelState.forceSourceView}
/> />
</div> </div>
); );
@ -964,8 +957,6 @@ function MobileEditorDrawer() {
memoryScope={panelState.memoryScope ?? undefined} memoryScope={panelState.memoryScope ?? undefined}
searchSpaceId={panelState.searchSpaceId ?? undefined} searchSpaceId={panelState.searchSpaceId ?? undefined}
title={panelState.title} title={panelState.title}
highlightLines={panelState.highlightLines}
forceSourceView={panelState.forceSourceView}
/> />
</div> </div>
</DrawerContent> </DrawerContent>

View file

@ -3,10 +3,9 @@
import { type Descendant, KEYS } from "platejs"; import { type Descendant, KEYS } from "platejs";
import { createPlatePlugin, type PlateElementProps } from "platejs/react"; import { createPlatePlugin, type PlateElementProps } from "platejs/react";
import type { FC } from "react"; import type { FC } from "react";
import { InlineCitation, LineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation"; import { InlineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation";
import { import {
CITATION_REGEX, CITATION_REGEX,
type CitationToken,
type CitationUrlMap, type CitationUrlMap,
parseTextWithCitations, parseTextWithCitations,
} from "@/lib/citations/citation-parser"; } from "@/lib/citations/citation-parser";
@ -18,12 +17,9 @@ import {
*/ */
export type CitationElementNode = { export type CitationElementNode = {
type: "citation"; type: "citation";
kind: "chunk" | "doc" | "url" | "line"; kind: "chunk" | "doc" | "url";
chunkId?: number; chunkId?: number;
url?: string; url?: string;
documentId?: number;
startLine?: number;
endLine?: number;
/** Original literal token that produced this citation node. */ /** Original literal token that produced this citation node. */
rawText: string; rawText: string;
children: [{ text: "" }]; children: [{ text: "" }];
@ -37,22 +33,11 @@ const CitationElement: FC<PlateElementProps<CitationElementNode>> = ({
element, element,
}) => { }) => {
const isUrl = element.kind === "url"; const isUrl = element.kind === "url";
const isLine =
element.kind === "line" &&
element.documentId !== undefined &&
element.startLine !== undefined &&
element.endLine !== undefined;
return ( return (
<span {...attributes} className="inline-flex align-baseline"> <span {...attributes} className="inline-flex align-baseline">
<span contentEditable={false}> <span contentEditable={false}>
{isUrl && element.url ? ( {isUrl && element.url ? (
<UrlCitation url={element.url} /> <UrlCitation url={element.url} />
) : isLine ? (
<LineCitation
documentId={element.documentId as number}
startLine={element.startLine as number}
endLine={element.endLine as number}
/>
) : element.chunkId !== undefined ? ( ) : element.chunkId !== undefined ? (
<InlineCitation chunkId={element.chunkId} isDocsChunk={element.kind === "doc"} /> <InlineCitation chunkId={element.chunkId} isDocsChunk={element.kind === "doc"} />
) : null} ) : null}
@ -112,7 +97,10 @@ function copyMarks(textNode: SlateText): Record<string, unknown> {
return marks; return marks;
} }
function makeCitationElement(rawText: string, segment: CitationToken): CitationElementNode { function makeCitationElement(
rawText: string,
segment: { kind: "url"; url: string } | { kind: "chunk"; chunkId: number; isDocsChunk: boolean }
): CitationElementNode {
if (segment.kind === "url") { if (segment.kind === "url") {
return { return {
type: CITATION_TYPE, type: CITATION_TYPE,
@ -122,17 +110,6 @@ function makeCitationElement(rawText: string, segment: CitationToken): CitationE
children: [{ text: "" }], children: [{ text: "" }],
}; };
} }
if (segment.kind === "line") {
return {
type: CITATION_TYPE,
kind: "line",
documentId: segment.documentId,
startLine: segment.startLine,
endLine: segment.endLine,
rawText,
children: [{ text: "" }],
};
}
return { return {
type: CITATION_TYPE, type: CITATION_TYPE,
kind: segment.isDocsChunk ? "doc" : "chunk", kind: segment.isDocsChunk ? "doc" : "chunk",

View file

@ -2,7 +2,7 @@
import dynamic from "next/dynamic"; import dynamic from "next/dynamic";
import { useTheme } from "next-themes"; import { useTheme } from "next-themes";
import { useCallback, useEffect, useRef } from "react"; import { useEffect, useRef } from "react";
import { Spinner } from "@/components/ui/spinner"; import { Spinner } from "@/components/ui/spinner";
const MonacoEditor = dynamic(() => import("@monaco-editor/react"), { const MonacoEditor = dynamic(() => import("@monaco-editor/react"), {
@ -17,8 +17,6 @@ interface SourceCodeEditorProps {
readOnly?: boolean; readOnly?: boolean;
fontSize?: number; fontSize?: number;
onSave?: () => Promise<void> | void; onSave?: () => Promise<void> | void;
/** 1-based inclusive line range to reveal and highlight (e.g. a citation). */
highlightLines?: { start: number; end: number } | null;
} }
export function SourceCodeEditor({ export function SourceCodeEditor({
@ -29,45 +27,10 @@ export function SourceCodeEditor({
readOnly = false, readOnly = false,
fontSize = 12, fontSize = 12,
onSave, onSave,
highlightLines = null,
}: SourceCodeEditorProps) { }: SourceCodeEditorProps) {
const { resolvedTheme } = useTheme(); const { resolvedTheme } = useTheme();
const onSaveRef = useRef(onSave); const onSaveRef = useRef(onSave);
const monacoRef = useRef<any>(null); const monacoRef = useRef<any>(null);
const editorRef = useRef<any>(null);
const decorationsRef = useRef<any>(null);
const highlightLinesRef = useRef(highlightLines);
highlightLinesRef.current = highlightLines;
const applyHighlight = useCallback(() => {
const editor = editorRef.current;
const monaco = monacoRef.current;
if (!editor || !monaco) return;
if (decorationsRef.current) {
decorationsRef.current.clear();
decorationsRef.current = null;
}
const range = highlightLinesRef.current;
if (!range) return;
const lineCount = editor.getModel()?.getLineCount() ?? range.end;
const start = Math.min(Math.max(1, Math.floor(range.start)), lineCount);
const end = Math.min(Math.max(start, Math.floor(range.end)), lineCount);
try {
decorationsRef.current = editor.createDecorationsCollection([
{
range: new monaco.Range(start, 1, end, 1),
options: { isWholeLine: true, className: "citation-line-highlight" },
},
]);
} catch {
// Decoration failure must not block the reveal below.
}
editor.revealLinesInCenter(start, end, monaco.editor.ScrollType.Immediate);
}, []);
useEffect(() => {
applyHighlight();
}, [applyHighlight, highlightLines?.start, highlightLines?.end]);
const normalizedModelPath = (() => { const normalizedModelPath = (() => {
const raw = (path || "local-file.txt").trim(); const raw = (path || "local-file.txt").trim();
const withLeadingSlash = raw.startsWith("/") ? raw : `/${raw}`; const withLeadingSlash = raw.startsWith("/") ? raw : `/${raw}`;
@ -141,16 +104,7 @@ export function SourceCodeEditor({
}} }}
onMount={(editor, monaco) => { onMount={(editor, monaco) => {
monacoRef.current = monaco; monacoRef.current = monaco;
editorRef.current = editor;
applySidebarTheme(monaco); applySidebarTheme(monaco);
// Reveal now, then once more after the first layout settles:
// the panel slide-in animation means the editor often has no
// usable viewport height on the initial frame.
applyHighlight();
const layoutSub = editor.onDidLayoutChange(() => {
applyHighlight();
layoutSub.dispose();
});
if (!isManualSaveEnabled) return; if (!isManualSaveEnabled) return;
editor.addCommand(monaco.KeyMod.CtrlCmd | monaco.KeyCode.KeyS, () => { editor.addCommand(monaco.KeyMod.CtrlCmd | monaco.KeyCode.KeyS, () => {
void onSaveRef.current?.(); void onSaveRef.current?.();

View file

@ -37,7 +37,7 @@ import {
getAssetLabel, getAssetLabel,
usePrimaryDownload, usePrimaryDownload,
} from "@/lib/desktop-download-utils"; } from "@/lib/desktop-download-utils";
import { BUILD_TIME_AUTH_TYPE, buildBackendUrl } from "@/lib/env-config"; import { buildBackendUrl } from "@/lib/env-config";
import { trackLoginAttempt } from "@/lib/posthog/events"; import { trackLoginAttempt } from "@/lib/posthog/events";
import { cn } from "@/lib/utils"; import { cn } from "@/lib/utils";
@ -314,7 +314,6 @@ export function HeroSection() {
} }
function GetStartedButton() { function GetStartedButton() {
const isGoogleAuth = BUILD_TIME_AUTH_TYPE === "GOOGLE";
const [isRedirecting, setIsRedirecting] = useState(false); const [isRedirecting, setIsRedirecting] = useState(false);
const handleGoogleLogin = () => { const handleGoogleLogin = () => {
@ -324,29 +323,26 @@ function GetStartedButton() {
window.location.href = buildBackendUrl("/auth/google/authorize-redirect"); window.location.href = buildBackendUrl("/auth/google/authorize-redirect");
}; };
if (isGoogleAuth) { return (
return ( <>
<Button <Button
type="button" type="button"
variant="ghost" variant="ghost"
onClick={handleGoogleLogin} onClick={handleGoogleLogin}
disabled={isRedirecting} disabled={isRedirecting}
className="h-14 w-full cursor-pointer gap-3 rounded-lg border border-white bg-white text-center text-base font-medium text-[#1f1f1f] shadow-sm transition duration-150 hover:bg-zinc-100 hover:text-[#1f1f1f] sm:w-56 dark:border-white" className="runtime-auth-google h-14 w-full cursor-pointer gap-3 rounded-lg border border-white bg-white text-center text-base font-medium text-[#1f1f1f] shadow-sm transition duration-150 hover:bg-zinc-100 hover:text-[#1f1f1f] sm:w-56 dark:border-white"
> >
<GoogleLogo className="h-5 w-5" /> <GoogleLogo className="h-5 w-5" />
<span>Continue with Google</span> <span>Continue with Google</span>
</Button> </Button>
); <Button
} asChild
variant="ghost"
return ( className="runtime-auth-local h-14 w-full rounded-lg bg-black text-center text-base font-medium text-white shadow-sm ring-1 shadow-black/10 ring-black/10 transition duration-150 active:scale-98 hover:bg-black sm:w-52 dark:bg-white dark:text-black dark:hover:bg-white"
<Button >
asChild <Link href="/login">Get Started</Link>
variant="ghost" </Button>
className="h-14 w-full rounded-lg bg-black text-center text-base font-medium text-white shadow-sm ring-1 shadow-black/10 ring-black/10 transition duration-150 active:scale-98 hover:bg-black sm:w-52 dark:bg-white dark:text-black dark:hover:bg-white" </>
>
<Link href="/login">Get Started</Link>
</Button>
); );
} }

View file

@ -12,7 +12,6 @@ import { rightPanelCollapsedAtom, rightPanelTabAtom } from "@/atoms/layout/right
import { Button } from "@/components/ui/button"; import { Button } from "@/components/ui/button";
import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip"; import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
import { closeHitlEditPanelAtom, hitlEditPanelAtom } from "@/features/chat-messages/hitl"; import { closeHitlEditPanelAtom, hitlEditPanelAtom } from "@/features/chat-messages/hitl";
import { useMediaQuery } from "@/hooks/use-media-query";
import { cn } from "@/lib/utils"; import { cn } from "@/lib/utils";
import { DocumentsSidebar } from "../sidebar"; import { DocumentsSidebar } from "../sidebar";
@ -197,9 +196,6 @@ export function RightPanel({
const citationState = useAtomValue(citationPanelAtom); const citationState = useAtomValue(citationPanelAtom);
const closeCitation = useSetAtom(closeCitationPanelAtom); const closeCitation = useSetAtom(closeCitationPanelAtom);
const [collapsed, setCollapsed] = useAtom(rightPanelCollapsedAtom); const [collapsed, setCollapsed] = useAtom(rightPanelCollapsedAtom);
// Desktop-only surface; mobile uses the dedicated Mobile* drawers. Without
// this guard both render together and two editors fight over one model.
const isDesktop = useMediaQuery("(min-width: 1024px)");
const documentsOpen = documentsPanel?.open ?? false; const documentsOpen = documentsPanel?.open ?? false;
const reportOpen = reportState.isOpen && !!reportState.reportId; const reportOpen = reportState.isOpen && !!reportState.reportId;
@ -271,7 +267,7 @@ export function RightPanel({
<CollapseButton onClick={() => setCollapsed(true)} /> <CollapseButton onClick={() => setCollapsed(true)} />
) : null; ) : null;
if (!isVisible || !isDesktop) return null; if (!isVisible) return null;
return ( return (
<aside <aside
@ -312,8 +308,6 @@ export function RightPanel({
searchSpaceId={editorState.searchSpaceId ?? undefined} searchSpaceId={editorState.searchSpaceId ?? undefined}
title={editorState.title} title={editorState.title}
onClose={closeEditor} onClose={closeEditor}
highlightLines={editorState.highlightLines}
forceSourceView={editorState.forceSourceView}
/> />
</div> </div>
)} )}

View file

@ -272,6 +272,7 @@ export function ModelSelector({
type="button" type="button"
variant="ghost" variant="ghost"
size="sm" size="sm"
aria-label="Select chat model"
className={cn( className={cn(
"h-8 min-w-0 gap-2 rounded-md px-3 text-muted-foreground transition-colors", "h-8 min-w-0 gap-2 rounded-md px-3 text-muted-foreground transition-colors",
"select-none", "select-none",

View file

@ -70,15 +70,10 @@ export const documentWithChunks = document.extend({
id: z.number(), id: z.number(),
content: z.string(), content: z.string(),
created_at: z.string(), created_at: z.string(),
start_char: z.number().nullable().optional(),
end_char: z.number().nullable().optional(),
}) })
), ),
total_chunks: z.number().optional().default(0), total_chunks: z.number().optional().default(0),
chunk_start_index: z.number().optional().default(0), chunk_start_index: z.number().optional().default(0),
// 1-based inclusive line range of the cited chunk within source_markdown.
cited_start_line: z.number().nullable().optional(),
cited_end_line: z.number().nullable().optional(),
}); });
/** /**

View file

@ -18,16 +18,12 @@ import { FENCED_OR_INLINE_CODE } from "@/lib/markdown/code-regions";
* sometimes emit. * sometimes emit.
*/ */
export const CITATION_REGEX = export const CITATION_REGEX =
/[[【]\u200B?citation:\s*(https?:\/\/[^\]】\u200B]+|urlcite\d+|d\d+#L\d+-\d+|(?:doc-)?-?\d+(?:\s*,\s*(?:doc-)?-?\d+)*)\s*\u200B?[\]】]/g; /[[【]\u200B?citation:\s*(https?:\/\/[^\]】\u200B]+|urlcite\d+|(?:doc-)?-?\d+(?:\s*,\s*(?:doc-)?-?\d+)*)\s*\u200B?[\]】]/g;
/** Matches the knowledge-base line-citation form `d<documentId>#L<start>-<end>`. */
const LINE_CITATION_REGEX = /^d(\d+)#L(\d+)-(\d+)$/;
/** A single parsed citation reference. */ /** A single parsed citation reference. */
export type CitationToken = export type CitationToken =
| { kind: "url"; url: string } | { kind: "url"; url: string }
| { kind: "chunk"; chunkId: number; isDocsChunk: boolean } | { kind: "chunk"; chunkId: number; isDocsChunk: boolean };
| { kind: "line"; documentId: number; startLine: number; endLine: number };
/** Output of `parseTextWithCitations` — interleaved text + citation tokens. */ /** Output of `parseTextWithCitations` — interleaved text + citation tokens. */
export type ParsedSegment = string | CitationToken; export type ParsedSegment = string | CitationToken;
@ -99,15 +95,7 @@ export function parseTextWithCitations(text: string, urlMap: CitationUrlMap): Pa
const captured = match[1]; const captured = match[1];
const lineMatch = LINE_CITATION_REGEX.exec(captured); if (captured.startsWith("http://") || captured.startsWith("https://")) {
if (lineMatch) {
segments.push({
kind: "line",
documentId: Number.parseInt(lineMatch[1], 10),
startLine: Number.parseInt(lineMatch[2], 10),
endLine: Number.parseInt(lineMatch[3], 10),
});
} else if (captured.startsWith("http://") || captured.startsWith("https://")) {
segments.push({ kind: "url", url: captured.trim() }); segments.push({ kind: "url", url: captured.trim() });
} else if (captured.startsWith("urlcite")) { } else if (captured.startsWith("urlcite")) {
const url = urlMap.get(captured); const url = urlMap.get(captured);

View file

@ -0,0 +1,52 @@
/** Name of the cookie through which the auth UI mode is handed to the browser. */
export const RUNTIME_AUTH_TYPE_COOKIE_NAME = "surfsense_auth_type";

/** The two auth UI modes the frontend distinguishes. */
export type RuntimeAuthUiMode = "GOOGLE" | "LOCAL";

/**
 * Normalizes a loosely-typed auth type string into a `RuntimeAuthUiMode`.
 *
 * Matching is case-insensitive and ignores surrounding whitespace.
 *
 * @param value - Preferred auth type; wins whenever it is a recognized mode.
 * @param fallback - Consulted only when `value` is not recognized. Defaults to
 *   "GOOGLE" when omitted (an explicit `null` does not trigger the default).
 * @returns "GOOGLE" or "LOCAL"; when neither argument is recognized the
 *   result is "LOCAL".
 */
export function resolveRuntimeAuthUiMode(
	value: string | null | undefined,
	fallback: string | null | undefined = "GOOGLE"
): RuntimeAuthUiMode {
	const normalize = (raw: string | null | undefined): string | undefined =>
		raw?.trim().toUpperCase();

	switch (normalize(value)) {
		case "GOOGLE":
			return "GOOGLE";
		case "LOCAL":
			return "LOCAL";
		default:
			// Unrecognized primary value: only an explicit GOOGLE fallback yields GOOGLE.
			return normalize(fallback) === "GOOGLE" ? "GOOGLE" : "LOCAL";
	}
}
/**
 * Builds the source of the inline bootstrap script that publishes the auth UI
 * mode to the page.
 *
 * The returned text is an IIFE that looks up the cookie named by
 * `RUNTIME_AUTH_TYPE_COOKIE_NAME`, normalizes its value, and stores the result
 * on `window.__SURFSENSE_AUTH_TYPE__` and on the `data-surfsense-auth-type`
 * attribute of the document element.
 *
 * Cookie handling mirrors `resolveRuntimeAuthUiMode`: the value is trimmed and
 * upper-cased, and anything other than GOOGLE/LOCAL resolves to the fallback
 * mode rather than being coerced to LOCAL.
 *
 * @param fallbackAuthType - Auth type applied when the cookie is missing,
 *   cannot be decoded, or holds an unrecognized value.
 * @returns JavaScript source text intended to be inlined into the page.
 */
export function getRuntimeAuthInitScript(fallbackAuthType: string): string {
	const fallback = resolveRuntimeAuthUiMode(fallbackAuthType);
	// JSON.stringify yields valid JavaScript string literals for embedding.
	const cookieName = JSON.stringify(RUNTIME_AUTH_TYPE_COOKIE_NAME);
	const fallbackValue = JSON.stringify(fallback);

	// The catch branch covers decodeURIComponent throwing on malformed
	// percent-encoding (and any other failure while reading the cookie).
	return `
(function() {
	try {
		var cookieName = ${cookieName};
		var fallback = ${fallbackValue};
		var prefix = cookieName + "=";
		var normalized = fallback;
		var cookies = document.cookie ? document.cookie.split(";") : [];
		for (var i = 0; i < cookies.length; i++) {
			var cookie = cookies[i].trim();
			if (cookie.indexOf(prefix) === 0) {
				var candidate = decodeURIComponent(cookie.slice(prefix.length)).trim().toUpperCase();
				if (candidate === "GOOGLE" || candidate === "LOCAL") {
					normalized = candidate;
				}
				break;
			}
		}
		window.__SURFSENSE_AUTH_TYPE__ = normalized;
		document.documentElement.setAttribute("data-surfsense-auth-type", normalized);
	} catch (_) {
		window.__SURFSENSE_AUTH_TYPE__ = ${fallbackValue};
		document.documentElement.setAttribute("data-surfsense-auth-type", ${fallbackValue});
	}
})();
`;
}
// Ambient typing for the global that the runtime auth init script assigns on
// `window`; declared optional.
declare global {
interface Window {
__SURFSENSE_AUTH_TYPE__?: RuntimeAuthUiMode;
}
}

24
surfsense_web/proxy.ts Normal file
View file

@ -0,0 +1,24 @@
import { NextResponse, type NextRequest } from "next/server";
import { BUILD_TIME_AUTH_TYPE } from "@/lib/env-config";
import {
RUNTIME_AUTH_TYPE_COOKIE_NAME,
resolveRuntimeAuthUiMode,
} from "@/lib/runtime-auth-config";
/**
 * Attaches the runtime auth UI mode cookie to the pass-through response.
 *
 * The mode comes from the `AUTH_TYPE` environment variable, with
 * `BUILD_TIME_AUTH_TYPE` as the fallback when that variable is unset or not a
 * recognized mode.
 *
 * @param request - Incoming request; only its URL protocol is inspected, to
 *   decide whether the cookie is marked `secure`.
 * @returns The `NextResponse.next()` response with the cookie set.
 */
export function proxy(request: NextRequest) {
	const passthrough = NextResponse.next();
	const uiMode = resolveRuntimeAuthUiMode(process.env.AUTH_TYPE, BUILD_TIME_AUTH_TYPE);

	const secondsPerYear = 60 * 60 * 24 * 365;
	const servedOverHttps = request.nextUrl.protocol === "https:";

	passthrough.cookies.set(RUNTIME_AUTH_TYPE_COOKIE_NAME, uiMode, {
		path: "/",
		maxAge: secondsPerYear,
		sameSite: "lax",
		secure: servedOverHttps,
	});

	return passthrough;
}
// Route matcher for `proxy`. The negative lookahead excludes paths that begin
// with `api`, `auth`, `_next/static`, `_next/image` or `favicon.ico`, as well as
// any path containing a dot; every other path matches.
// NOTE(review): kept as an inline literal — presumably Next.js needs to read
// this export statically; confirm before refactoring into variables.
export const config = {
matcher: ["/((?!api|auth|_next/static|_next/image|favicon.ico|.*\\..*).*)"],
};