diff --git a/surfsense_backend/alembic/versions/166_add_chunk_char_spans.py b/surfsense_backend/alembic/versions/166_add_chunk_char_spans.py
deleted file mode 100644
index 336711612..000000000
--- a/surfsense_backend/alembic/versions/166_add_chunk_char_spans.py
+++ /dev/null
@@ -1,31 +0,0 @@
-"""add chunks.start_char/end_char for citation offsets
-
-Char offsets into the document's source_markdown (half-open span) let citations
-resolve the exact passage a chunk came from. Nullable because historical rows
-have no span; they populate on the next connector sync or user edit/reindex.
-
-No backfill: a bulk UPDATE of every chunk on a large HNSW-indexed table rewrites
-every secondary index per row (see migration 165 for the same reasoning).
-
-Revision ID: 166
-Revises: 165
-"""
-
-from collections.abc import Sequence
-
-from alembic import op
-
-revision: str = "166"
-down_revision: str | None = "165"
-branch_labels: str | Sequence[str] | None = None
-depends_on: str | Sequence[str] | None = None
-
-
-def upgrade() -> None:
- op.execute("ALTER TABLE chunks ADD COLUMN IF NOT EXISTS start_char INTEGER;")
- op.execute("ALTER TABLE chunks ADD COLUMN IF NOT EXISTS end_char INTEGER;")
-
-
-def downgrade() -> None:
- op.execute("ALTER TABLE chunks DROP COLUMN IF EXISTS end_char;")
- op.execute("ALTER TABLE chunks DROP COLUMN IF EXISTS start_char;")
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py
index d66e9073c..a6c83a7d4 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py
@@ -18,6 +18,7 @@ skipped (e.g. client disconnect).
from __future__ import annotations
+import asyncio
import logging
from datetime import UTC, datetime
from typing import Any
@@ -57,8 +58,9 @@ from app.db import (
FolderRevision,
shielded_async_session,
)
-from app.indexing_pipeline.cache.cached_indexing import build_chunk_embeddings
+from app.indexing_pipeline.document_chunker import chunk_text
from app.utils.document_converters import (
+ embed_texts,
generate_content_hash,
generate_unique_identifier_hash,
)
@@ -232,23 +234,24 @@ async def _create_document(
session.add(doc)
await session.flush()
- summary_embedding, chunk_embeddings = await build_chunk_embeddings(
- content, use_code_chunker=False
- )
+ summary_embedding = (await asyncio.to_thread(embed_texts, [content]))[0]
doc.embedding = summary_embedding
- session.add_all(
- [
- Chunk(
- document_id=doc.id,
- content=sl.text,
- embedding=embedding,
- position=i,
- start_char=sl.start_char,
- end_char=sl.end_char,
- )
- for i, (sl, embedding) in enumerate(chunk_embeddings)
- ]
- )
+ chunks = chunk_text(content)
+ if chunks:
+ chunk_embeddings = await asyncio.to_thread(embed_texts, chunks)
+ session.add_all(
+ [
+ Chunk(
+ document_id=doc.id,
+ content=text,
+ embedding=embedding,
+ position=i,
+ )
+ for i, (text, embedding) in enumerate(
+ zip(chunks, chunk_embeddings, strict=True)
+ )
+ ]
+ )
return doc
@@ -284,25 +287,26 @@ async def _update_document(
search_space_id,
)
- summary_embedding, chunk_embeddings = await build_chunk_embeddings(
- content, use_code_chunker=False
- )
+ summary_embedding = (await asyncio.to_thread(embed_texts, [content]))[0]
document.embedding = summary_embedding
await session.execute(delete(Chunk).where(Chunk.document_id == document.id))
- session.add_all(
- [
- Chunk(
- document_id=document.id,
- content=sl.text,
- embedding=embedding,
- position=i,
- start_char=sl.start_char,
- end_char=sl.end_char,
- )
- for i, (sl, embedding) in enumerate(chunk_embeddings)
- ]
- )
+ chunks = chunk_text(content)
+ if chunks:
+ chunk_embeddings = await asyncio.to_thread(embed_texts, chunks)
+ session.add_all(
+ [
+ Chunk(
+ document_id=document.id,
+ content=text,
+ embedding=embedding,
+ position=i,
+ )
+ for i, (text, embedding) in enumerate(
+ zip(chunks, chunk_embeddings, strict=True)
+ )
+ ]
+ )
return document
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/on.md b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/on.md
index 8e67615d0..2abd95d5a 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/on.md
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/on.md
@@ -1,58 +1,42 @@
-Citations reach the answer through three channels. Use whichever applies, and
-never invent ids you didn't see: ids are matched exactly, so a wrong one
-silently breaks the link — when in doubt, omit. Always write a citation as
-plain `[citation:…]` brackets — no markdown links, no footnote numbers, no
-parentheses.
+Citations reach the answer through two channels. Use whichever applies — and
+never invent ids you didn't see. Citation ids are resolved by exact-match
+lookup; a wrong id silently breaks the link, so when in doubt, omit.
-### Channel A — web_search chunk blocks injected this turn
+### Channel A — chunk blocks injected this turn
When `web_search` returns `` / `` blocks in this
-turn, the chunk `id` is the result's URL:
+turn:
-1. For each factual statement taken from a chunk, add `[citation:]`
- using the **exact** id from a visible `` tag. Copy the
- URL verbatim; do not retype it from memory.
-2. Multiple chunks → `[citation:url1], [citation:url2]` (comma-separated,
+1. For each factual statement taken from those chunks, add
+ `[citation:chunk_id]` using the **exact** id from a visible
+ `<chunk id='…'>` tag. Copy digit-for-digit (or the URL verbatim);
+ do not retype from memory.
+2. `<document_id>` is the parent doc id, **not** a citation source —
+ only ids inside `<chunk id='…'>` count.
+3. Multiple chunks → `[citation:id1], [citation:id2]` (comma-separated,
each id copied individually).
-3. Never invent, normalise, or guess at a URL; if unsure, omit.
+4. Never invent, normalise, or guess at adjacent ids; if unsure, omit.
+5. Plain brackets only — no markdown links, no footnote numbering.
### Channel B — citations relayed by a `task` specialist
-A `task(...)` tool message may contain `[citation:…]` markers the
-specialist already attached to its prose — line citations
-(`[citation:d#L-]`) or chunk ids (`[citation:N]`). The
-specialist read the underlying document and tied each marker to a
-passage; you didn't. So:
+A `task(...)` tool message may contain `[citation:<chunk_id>]` markers
+the specialist already attached to its prose. The specialist saw the
+underlying `<chunk id='…'>` blocks; you didn't. So:
1. **Preserve those markers verbatim** in your final answer — do not
reformat, renumber, drop, or wrap them in markdown links. When you
paraphrase a specialist sentence, copy the marker character-for-
- character; do not regenerate it from memory (LLMs reliably corrupt
- nearby digits).
+ character; do not regenerate the id from memory (LLMs reliably
+ corrupt nearby digits).
2. Keep each marker attached to the sentence the specialist attached
it to.
3. Do **not** add new `[citation:…]` markers of your own to a
specialist's prose; if a fact has no marker, the specialist
- couldn't tie it to a source and neither can you.
+ couldn't tie it to a chunk and neither can you.
4. When a specialist returns JSON, the citation markers live inside
the prose-bearing fields (e.g. a summary or excerpt). Pull them
along with the surrounding sentence when you quote.
-### Channel C — your knowledge base (search hits and `read_file`)
-Knowledge-base facts are cited by line range using the document id:
-`[citation:d#L-]` (a single line is `#L-`).
-
-1. `search_knowledge_base` prints a ready `[citation:d…#L…-…]` token above each
- matched passage. When that passage supports your point, copy the token
- verbatim — that is the entire citation.
-2. When you `read_file` a `/documents/...` path, its header gives the
- `` and an optional `` pointer, and the body is
- shown with line numbers; cite the lines you actually used. Use `read_file`
- when you need more context than a search passage shows.
-3. Copy document ids and line numbers exactly as shown — never estimate,
- shift, or invent them.
-4. Older documents without a numbered body instead show ``
- blocks; cite those with `[citation:N]`, copying the id exactly.
-
-If none of these channels surfaces a citable source this turn, do not
-fabricate citations.
+If neither channel surfaces citation markers this turn, do not fabricate
+them.
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py
index 0696dc92e..9236e9121 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py
@@ -33,7 +33,6 @@ from app.agents.chat.runtime.path_resolver import (
)
from app.db import Document, shielded_async_session
from app.utils.perf import get_perf_logger
-from app.utils.text_spans import char_span_to_line_range
_perf_log = get_perf_logger()
@@ -57,16 +56,12 @@ _TOOL_DESCRIPTION = (
)
-async def _resolve_doc_context(
+async def _resolve_virtual_paths(
results: list[dict[str, Any]],
*,
search_space_id: int,
-) -> tuple[dict[int, str], dict[int, str]]:
- """Resolve ``Document.id`` -> (canonical virtual path, source_markdown).
-
- ``source_markdown`` is the canonical body the chunk spans index into; the
- renderer uses it to turn a chunk's char span into a line range.
- """
+) -> dict[int, str]:
+ """Resolve ``Document.id`` -> canonical virtual path for the search hits."""
doc_ids = [
doc_id
for doc_id in (
@@ -77,24 +72,17 @@ async def _resolve_doc_context(
if isinstance(doc_id, int)
]
if not doc_ids:
- return {}, {}
+ return {}
async with shielded_async_session() as session:
index: PathIndex = await build_path_index(session, search_space_id)
- rows = await session.execute(
- select(
- Document.id, Document.folder_id, Document.source_markdown
- ).where(
+ folder_rows = await session.execute(
+ select(Document.id, Document.folder_id).where(
Document.search_space_id == search_space_id,
Document.id.in_(doc_ids),
)
)
- folder_by_doc_id: dict[int, int | None] = {}
- bodies: dict[int, str] = {}
- for row in rows.all():
- folder_by_doc_id[row.id] = row.folder_id
- if row.source_markdown:
- bodies[row.id] = row.source_markdown
+ folder_by_doc_id = {row.id: row.folder_id for row in folder_rows.all()}
paths: dict[int, str] = {}
for doc in results:
@@ -109,76 +97,13 @@ async def _resolve_doc_context(
folder_id=folder_id if isinstance(folder_id, int) else None,
index=index,
)
- return paths, bodies
-
-
-def _citation_token(chunk: dict[str, Any], body: str | None, doc_id: int | None) -> str:
- """Ready-to-copy ``[citation:dID#Lstart-end]`` token, or '' without spans."""
- start = chunk.get("start_char")
- end = chunk.get("end_char")
- if (
- not body
- or not isinstance(doc_id, int)
- or not isinstance(start, int)
- or not isinstance(end, int)
- ):
- return ""
- start_line, end_line = char_span_to_line_range(body, start, end)
- return f"[citation:d{doc_id}#L{start_line}-{end_line}]"
-
-
-def _render_passage(
- chunk: dict[str, Any], body: str | None, doc_id: int | None
-) -> str | None:
- """Render one matched chunk as an indented passage tagged with its token."""
- content = (chunk.get("content") or "").strip()
- if not content:
- return None
- snippet = content[:_PER_DOC_SNIPPET_CHARS].strip()
- if len(content) > _PER_DOC_SNIPPET_CHARS:
- snippet += " ..."
- indented = snippet.replace("\n", "\n ")
- token = _citation_token(chunk, body, doc_id)
- head = f"\n {token}" if token else ""
- return f"{head}\n {indented}"
-
-
-def _matched_passages(
- doc: dict[str, Any], body: str | None, doc_id: int | None
-) -> str:
- """Render the RRF-matched chunks; '' when none can be rendered."""
- by_id = {
- c.get("chunk_id"): c
- for c in (doc.get("chunks") or [])
- if isinstance(c, dict)
- }
- rendered: list[str] = []
- for chunk_id in doc.get("matched_chunk_ids") or []:
- chunk = by_id.get(chunk_id)
- if chunk is None:
- continue
- passage = _render_passage(chunk, body, doc_id)
- if passage:
- rendered.append(passage)
- return "".join(rendered)
-
-
-def _fallback_snippet(doc: dict[str, Any]) -> str:
- """Top-of-document preview, used only when no matched chunk is available."""
- content = (doc.get("content") or "").strip()
- if not content:
- return "\n (no preview available; read the document for details)"
- snippet = content[:_PER_DOC_SNIPPET_CHARS].strip()
- if len(content) > _PER_DOC_SNIPPET_CHARS:
- snippet += " ..."
- return "\n " + snippet.replace("\n", "\n ")
+ return paths
def _format_hits(
results: list[dict[str, Any]],
*,
paths: dict[int, str],
- bodies: dict[int, str],
query: str,
) -> str:
"""Render search hits as a compact, model-readable block."""
@@ -199,15 +124,21 @@ def _format_hits(
score = doc.get("score")
score_str = f"{score:.3f}" if isinstance(score, int | float) else "n/a"
path = paths.get(doc_id) if isinstance(doc_id, int) else None
- body = bodies.get(doc_id) if isinstance(doc_id, int) else None
- id_str = f"id={doc_id}, " if isinstance(doc_id, int) else ""
- header = f"\n{rank}. {title} ({id_str}type={doc_type}, score={score_str})" + (
+ header = f"\n{rank}. {title} (type={doc_type}, score={score_str})" + (
f"\n path: {path}" if path else ""
)
- passages = _matched_passages(doc, body, doc_id if isinstance(doc_id, int) else None)
- entry = header + (passages or _fallback_snippet(doc))
+ content = (doc.get("content") or "").strip()
+ if content:
+ snippet = content[:_PER_DOC_SNIPPET_CHARS].strip()
+ if len(content) > _PER_DOC_SNIPPET_CHARS:
+ snippet += " ..."
+ body = "\n " + snippet.replace("\n", "\n ")
+ else:
+ body = "\n (no preview available; read the document for details)"
+
+ entry = header + body
if total + len(entry) > _MAX_TOTAL_CHARS:
lines.append("\n")
break
@@ -215,9 +146,8 @@ def _format_hits(
total += len(entry)
lines.append(
- "\n\nTo cite a matched passage, copy its [citation:dID#Lstart-end] token "
- "verbatim. To quote more context or read the full document, delegate to "
- "the knowledge_base specialist with `task` using the path above."
+ "\n\nTo read a full document, delegate to the knowledge_base specialist "
+ "with `task`, referencing the path above."
)
lines.append("\n")
return "".join(lines)
@@ -274,10 +204,8 @@ def create_search_knowledge_base_tool(
top_k=clamped_top_k,
)
- paths, bodies = await _resolve_doc_context(results, search_space_id=_space_id)
- rendered = _format_hits(
- results, paths=paths, bodies=bodies, query=cleaned_query
- )
+ paths = await _resolve_virtual_paths(results, search_space_id=_space_id)
+ rendered = _format_hits(results, paths=paths, query=cleaned_query)
matched = _matched_chunk_ids(results)
_perf_log.info(
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py
index e704d5599..e13196537 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py
@@ -45,10 +45,6 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.document_xml import (
build_document_xml,
)
-from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.numbered_document import (
- build_read_preamble,
- compute_matched_line_ranges,
-)
from app.agents.chat.runtime.path_resolver import (
DOCUMENTS_ROOT,
build_path_index,
@@ -68,12 +64,6 @@ def _basename(path: str) -> str:
return path.rsplit("/", 1)[-1]
-def _metadata_url(metadata: dict[str, Any]) -> str:
- return (
- metadata.get("url") or metadata.get("source") or metadata.get("page_url") or ""
- )
-
-
def _is_under(child: str, parent: str) -> bool:
"""Return True iff ``child`` is at-or-under ``parent`` (directory semantics)."""
if parent == "/":
@@ -470,11 +460,8 @@ class KBPostgresBackend(BackendProtocol):
loaded = await self._load_file_data(file_path)
if loaded is None:
return f"Error: File '{file_path}' not found"
- file_data, _, preamble = loaded
- body = format_read_response(file_data, offset, limit)
- if preamble and offset == 0:
- return preamble + body
- return body
+ file_data, _ = loaded
+ return format_read_response(file_data, offset, limit)
def read(self, file_path: str, offset: int = 0, limit: int = 2000) -> str: # type: ignore[override]
return asyncio.run(self.aread(file_path, offset, limit))
@@ -482,14 +469,12 @@ class KBPostgresBackend(BackendProtocol):
async def _load_file_data(
self,
path: str,
- ) -> tuple[dict[str, Any], int | None, str | None] | None:
+ ) -> tuple[dict[str, Any], int | None] | None:
"""Lazy-load a virtual KB document into a deepagents ``FileData``.
- Returns ``(file_data, doc_id, preamble)`` or ``None`` if the path
- doesn't map to any known document. ``doc_id`` is ``None`` for the
- synthetic anonymous document. ``preamble`` is the metadata header to
- show above a numbered ``source_markdown`` body (``None`` for the legacy
- chunk-reconstructed XML reads used when a document has no body).
+ Returns ``(file_data, doc_id)`` or ``None`` if the path doesn't map
+ to any known document. ``doc_id`` is ``None`` for the synthetic
+ anonymous document so the caller doesn't track it as a DB-backed file.
"""
anon = self._kb_anon_doc()
if anon and str(anon.get("path") or "") == path:
@@ -507,7 +492,7 @@ class KBPostgresBackend(BackendProtocol):
}
xml = build_document_xml(doc_payload, matched_chunk_ids=set())
file_data = create_file_data(xml)
- return file_data, None, None
+ return file_data, None
if not path.startswith(DOCUMENTS_ROOT):
return None
@@ -520,58 +505,41 @@ class KBPostgresBackend(BackendProtocol):
)
if document is None:
return None
- source_markdown = document.source_markdown or ""
- document_type = (
- document.document_type.value
- if getattr(document, "document_type", None) is not None
- else "UNKNOWN"
- )
- metadata = dict(document.document_metadata or {})
chunk_rows = await session.execute(
- select(Chunk.id, Chunk.content, Chunk.start_char, Chunk.end_char)
+ select(Chunk.id, Chunk.content)
.where(Chunk.document_id == document.id)
.order_by(Chunk.position, Chunk.id)
)
- chunk_records = chunk_rows.all()
- document_id = document.id
- document_title = document.title
+ chunks = [
+ {"chunk_id": row.id, "content": row.content} for row in chunk_rows.all()
+ ]
- matched = self._matched_chunk_ids(document_id)
-
- # Canonical read: serve the verbatim body with cat -n line numbers that
- # line up with chunk char spans, so the agent cites real source lines.
- if source_markdown:
- ranges = compute_matched_line_ranges(
- source_markdown,
- [(r.id, r.start_char, r.end_char) for r in chunk_records],
- matched,
- )
- preamble = build_read_preamble(
- document_id=document_id,
- document_type=document_type,
- title=document_title,
- url=_metadata_url(metadata),
- matched_line_ranges=ranges,
- )
- return create_file_data(source_markdown), document_id, preamble
-
- # Legacy fallback: no canonical body, reconstruct from chunks as XML.
doc_payload = {
- "document_id": document_id,
- "chunks": [
- {"chunk_id": r.id, "content": r.content} for r in chunk_records
- ],
- "matched_chunk_ids": list(matched),
+ "document_id": document.id,
+ "chunks": chunks,
+ "matched_chunk_ids": list(self._matched_chunk_ids(document.id)),
"document": {
- "id": document_id,
- "title": document_title,
- "document_type": document_type,
- "metadata": metadata,
+ "id": document.id,
+ "title": document.title,
+ "document_type": (
+ document.document_type.value
+ if getattr(document, "document_type", None) is not None
+ else "UNKNOWN"
+ ),
+ "metadata": dict(document.document_metadata or {}),
},
- "source": document_type,
+ "source": (
+ document.document_type.value
+ if getattr(document, "document_type", None) is not None
+ else "UNKNOWN"
+ ),
}
- xml = build_document_xml(doc_payload, matched_chunk_ids=matched)
- return create_file_data(xml), document_id, None
+ xml = build_document_xml(
+ doc_payload,
+ matched_chunk_ids=self._matched_chunk_ids(document.id),
+ )
+ file_data = create_file_data(xml)
+ return file_data, document.id
# ------------------------------------------------------------------ writes
@@ -603,7 +571,7 @@ class KBPostgresBackend(BackendProtocol):
loaded = await self._load_file_data(file_path)
if loaded is None:
return EditResult(error=f"Error: File '{file_path}' not found")
- file_data, _, _ = loaded
+ file_data, _ = loaded
content = file_data_to_string(file_data)
result = perform_string_replacement(
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/numbered_document.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/numbered_document.py
deleted file mode 100644
index ced77096f..000000000
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/numbered_document.py
+++ /dev/null
@@ -1,73 +0,0 @@
-"""Read preamble for canonical (numbered ``source_markdown``) KB reads.
-
-The KB read tool numbers the body lines ``cat -n`` style, so serving the raw
-``source_markdown`` makes those line numbers line up exactly with the chunk
-char spans and the editor highlight. This module renders the small header the
-agent sees above that body: document identity plus the matched line ranges to
-seek to, and a concrete reminder of the line-citation token shape.
-"""
-
-from __future__ import annotations
-
-from collections.abc import Iterable
-
-from app.utils.text_spans import char_span_to_line_range
-
-
-def _format_range(start: int, end: int) -> str:
- return f"{start}" if start == end else f"{start}-{end}"
-
-
-def compute_matched_line_ranges(
- source_markdown: str,
- chunks: Iterable[tuple[int, int | None, int | None]],
- matched_chunk_ids: set[int],
-) -> list[tuple[int, int]]:
- """Map matched chunks to sorted, de-duplicated 1-based line ranges.
-
- ``chunks`` are ``(chunk_id, start_char, end_char)`` triples. Chunks without
- spans (legacy rows) are skipped — they have no resolvable location.
- """
- ranges: set[tuple[int, int]] = set()
- for chunk_id, start_char, end_char in chunks:
- if chunk_id not in matched_chunk_ids:
- continue
- if start_char is None or end_char is None:
- continue
- ranges.add(char_span_to_line_range(source_markdown, start_char, end_char))
- return sorted(ranges)
-
-
-def build_read_preamble(
- *,
- document_id: int,
- document_type: str,
- title: str,
- url: str,
- matched_line_ranges: list[tuple[int, int]],
-) -> str:
- """Render the metadata header shown above a numbered ``source_markdown`` body.
-
- ``matched_line_ranges`` are 1-based inclusive line ranges (already derived
- from chunk char spans) to point the agent at the relevant lines.
- """
- lines = [
- "",
- f" {document_id}",
- f" {document_type}",
- f" ",
- f" ",
- ]
- if matched_line_ranges:
- ranges = ", ".join(_format_range(s, e) for s, e in matched_line_ranges)
- lines.append(f" {ranges}")
- lines.append("")
- lines.append(
- f"Cite lines from this document as [citation:d{document_id}#L-] "
- "using the line numbers shown below."
- )
- lines.append("")
- return "\n".join(lines)
-
-
-__all__ = ["build_read_preamble", "compute_matched_line_ranges"]
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/edit_file/index.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/edit_file/index.py
index 036617d8d..775469531 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/edit_file/index.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/edit_file/index.py
@@ -73,7 +73,7 @@ def create_edit_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
loaded = await backend._load_file_data(validated)
if loaded is None:
return f"Error: File '{validated}' not found"
- _, doc_id_to_attach, _ = loaded
+ _, doc_id_to_attach = loaded
res: EditResult = await backend.aedit(
validated, old_string, new_string, replace_all=replace_all
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/move_file/helpers.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/move_file/helpers.py
index be61ca94f..ded4701f9 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/move_file/helpers.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/move_file/helpers.py
@@ -75,7 +75,7 @@ async def cloud_move_file(
loaded = await backend._load_file_data(source)
if loaded is None:
return f"Error: source '{source}' not found."
- source_file_data, loaded_doc_id, _ = loaded
+ source_file_data, loaded_doc_id = loaded
if source_doc_id is None:
source_doc_id = loaded_doc_id
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/read_file/index.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/read_file/index.py
index 6cbbe6ae5..5c20619d6 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/read_file/index.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/read_file/index.py
@@ -58,10 +58,8 @@ def create_read_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool:
loaded = await backend._load_file_data(validated)
if loaded is None:
return f"Error: File '{validated}' not found"
- file_data, doc_id, preamble = loaded
+ file_data, doc_id = loaded
rendered = format_read_response(file_data, offset, limit)
- if preamble and offset == 0:
- rendered = preamble + rendered
update: dict[str, Any] = {
"files": {validated: file_data},
"messages": [
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/rm/helpers.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/rm/helpers.py
index 020200cbd..e2e445d08 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/rm/helpers.py
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/rm/helpers.py
@@ -74,7 +74,7 @@ async def cloud_rm(
loaded = await backend._load_file_data(validated)
if loaded is None:
return f"Error: file '{validated}' not found."
- _, resolved_doc_id, _ = loaded
+ _, resolved_doc_id = loaded
files_update: dict[str, Any] = {validated: None}
update: dict[str, Any] = {
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md
index f377db311..c4e36fc73 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md
@@ -35,24 +35,42 @@ Map outcomes to your `status`:
You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see.
-## Citations in your prose
+## Chunk citations in your prose
-`read_file` on a KB document under `/documents/` serves it in one of two forms. Cite from whichever you actually see, attach the marker to the sentence in `action_summary` or `evidence.content_excerpt` stating that fact, and list every marker you emit in `evidence.citations`. The caller relays these markers to the end user verbatim, and the UI resolves each by exact match, so a wrong id or line number silently breaks the citation.
+When `read_file` returns a KB-indexed document under `/documents/`, the response includes `<chunk id='…'>` blocks. Whenever a fact in your `action_summary` or `evidence.content_excerpt` came from a specific chunk, append `[citation:<chunk_id>]` to the sentence stating that fact, using the **exact** id from the `<chunk id='…'>` tag. The caller relays these markers to the end user verbatim, and the UI resolves each id by exact match against the database, so a wrong id silently breaks the citation.
-**Numbered body (default).** A `` header gives the `` and an optional `` pointer, then the body is shown with line numbers. Cite the lines a fact came from as `[citation:d#L-]` (a single line is `#L-`).
+### Where chunk ids live in `read_file` output
-**Legacy chunk blocks (older docs without a stored body).** The response is XML with `` blocks. Cite the chunk a fact came from as `[citation:N]`, using the **exact** id from a `` tag.
+A KB document's XML has three numeric attributes — only **one** is a citation source:
+
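+```
+<document>
+<document_metadata>
+ <document_id>42</document_id> ← NOT a citation. Parent doc id; ignore for citations.
+ ...
+</document_metadata>
+<chunk_index>
+ <entry chunk_id="128" .../> ← Index hint; the same id also appears below.
+</chunk_index>
+
+<document_content>
+ <chunk id='128'>…</chunk> ← This is the citation source.
+</document_content>
+</document>
+
+```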
### Rules
-- Cite only from a passage you actually quoted or paraphrased this turn. Copy document ids, line numbers, and chunk ids character-for-character; never retype from memory.
-- Never cite `` on its own — it identifies the document, not a passage. In the numbered form it is only the `d` prefix of a line citation.
-- Never invent, normalise, shorten, shift, or guess at ids or line numbers. If unsure, omit rather than pick.
+- Use the **exact** id from a `<chunk id='…'>` tag whose content you actually quoted or paraphrased. Copy digit-for-digit; do **not** retype from memory.
+- Before emitting `[citation:N]`, confirm the literal substring `<chunk id='N'>` (or its index twin `chunk_id="N"`) appears in the tool result you are summarising this turn. If you can't see it, omit the citation.
+- Never cite `<document_id>` — that's the parent doc, not a chunk.
+- Never invent, normalise, shorten, or guess at adjacent ids. If unsure between two candidates, omit rather than pick.
- Prefer **fewer accurate citations** over many speculative ones.
-- Multiple passages supporting the same point → comma-separated and copied individually: `[citation:d42#L14-22], [citation:d42#L31-39]`.
+- Multiple chunks supporting the same point → comma-separated and copied individually: `[citation:128], [citation:129]`.
- Plain square brackets only — no markdown links, no parentheses, no footnote numbers.
-- Tool results with no body passage (write/edit/move confirmations, `ls` / `glob` / `grep` listings, error strings) carry nothing to cite.
-- Populate `evidence.citations` with **only** the markers you actually emitted — same set, same characters.
+- Tool results without `<chunk id='…'>` (write/edit/move confirmations, `ls` / `glob` / `grep` listings, error strings) carry no chunk id and need none.
+- Populate `evidence.chunk_ids` with **only** ids you actually emitted in `[citation:…]` markers — same set, same digits.
## Examples
@@ -71,7 +89,7 @@ You construct the structured `evidence` fields from your own knowledge of what y
"path": "/documents/meetings/2026-05-11-meeting.md",
"matched_candidates": null,
"content_excerpt": null,
- "citations": null
+ "chunk_ids": null
},
"next_step": null,
"missing_fields": null,
@@ -103,7 +121,7 @@ You construct the structured `evidence` fields from your own knowledge of what y
{ "id": "/documents/design/auth-rework.md", "label": "Auth Rework" }
],
"content_excerpt": null,
- "citations": null
+ "chunk_ids": null
},
"next_step": "Ask the user which design doc to update.",
"missing_fields": ["path"],
@@ -124,7 +142,7 @@ Return **only** one JSON object (no markdown or prose outside it):
"path": string | null,
"matched_candidates": [ { "id": string, "label": string } ] | null,
"content_excerpt": string | null,
- "citations": string[] | null
+ "chunk_ids": string[] | null
},
"next_step": string | null,
"missing_fields": string[] | null,
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md
index 72a921c4f..25dafa3df 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md
@@ -33,11 +33,11 @@ Map outcomes to your `status`:
- Any other `"Error: …"` → `status=error` and relay the tool's message verbatim as `next_step`.
- HITL rejection → `status=blocked` with `next_step="User declined this filesystem action. Do not retry."`.
-You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. (`citations` is always `null` in desktop mode — see "Citations in your prose" below.)
+You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. (`chunk_ids` is always `null` in desktop mode — see "Chunk citations in your prose" below.)
-## Citations in your prose
+## Chunk citations in your prose
-In desktop mode your filesystem tools read local files only, and local-file tool results do **not** carry chunk ids or numbered KB bodies. Do not emit `[citation:…]` markers in `action_summary` or `evidence.content_excerpt`, and leave `evidence.citations` `null` — the absolute path is the only reference for local-file work.
+In desktop mode your filesystem tools read local files only, and local-file tool results do **not** carry `<chunk id='…'>` tags. Do not emit `[citation:…]` markers in `action_summary` or `evidence.content_excerpt`, and leave `evidence.chunk_ids` `null` — the absolute path is the only reference for local-file work.
## Examples
@@ -56,7 +56,7 @@ In desktop mode your filesystem tools read local files only, and local-file tool
"path": "/notes/meetings/2026-05-11-meeting.md",
"matched_candidates": null,
"content_excerpt": null,
- "citations": null
+ "chunk_ids": null
},
"next_step": null,
"missing_fields": null,
@@ -88,7 +88,7 @@ In desktop mode your filesystem tools read local files only, and local-file tool
{ "id": "/projects/web/design/auth-rework.md", "label": "Auth Rework" }
],
"content_excerpt": null,
- "citations": null
+ "chunk_ids": null
},
"next_step": "Ask the user which design doc to update.",
"missing_fields": ["path"],
@@ -109,7 +109,7 @@ Return **only** one JSON object (no markdown or prose outside it):
"path": string | null,
"matched_candidates": [ { "id": string, "label": string } ] | null,
"content_excerpt": string | null,
- "citations": string[] | null
+ "chunk_ids": string[] | null
},
"next_step": string | null,
"missing_fields": string[] | null,
diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_cloud.md b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_cloud.md
index f0aa8403e..c7813e71d 100644
--- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_cloud.md
+++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_cloud.md
@@ -28,21 +28,41 @@ Reply in plain prose:
- If the workspace does not contain the requested information, say so explicitly. Do not fabricate paths or content.
- If the question is genuinely ambiguous after a thorough lookup, list the candidates with their paths and stop.
-## Citations
+## Chunk citations
-`read_file` on a KB document under `/documents/` serves it in one of two forms; cite a claim from whichever you actually see, alongside the path. The caller passes these markers through to the end user verbatim, and the UI resolves each by exact match, so a wrong id or line number silently breaks the citation.
+When the evidence for a claim came from a `read_file` response that included `<chunk id='…'>` blocks (i.e. a KB-indexed document under `/documents/`), append `[citation:<chunk_id>]` to the sentence stating that claim. The caller passes these markers through to the end user verbatim, and the UI resolves each id by exact match against the database, so a wrong id silently breaks the citation.
-- **Numbered body (default).** A `` header gives the ``, and the body is shown with line numbers. Cite the lines a claim came from as `[citation:d#L-]` (a single line is `#L-`).
-- **Legacy chunk blocks (older docs).** XML with `` blocks. Cite the chunk a claim came from as `[citation:N]`.
+### Where chunk ids live in `read_file` output
+
+A KB document's XML has three numeric attributes — only **one** is a citation source:
+
+```
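+<document>
+<document_metadata>
+  <document_id>42</document_id>            ← NOT a citation. Parent doc id; ignore for citations.
+  ...
+</document_metadata>
+
+<chunk_index>
+  <entry chunk_id="128" .../>              ← Index hint; the same id also appears below.
+</chunk_index>
+
+<document_content>
+  <chunk id='128'>...</chunk>              ← This is the citation source.
+</document_content>
+</document>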
+```
### Rules
-- Copy document ids, line numbers, and chunk ids character-for-character; never retype from memory. If you cannot see the id/lines for a claim, omit the citation.
-- Never cite `` on its own — in the numbered form it is only the `d` prefix of a line citation.
-- Never invent, normalise, shorten, shift, or guess. Prefer **fewer accurate citations** over many speculative ones.
-- Multiple passages supporting the same point → comma-separated and copied individually.
+- Use the **exact** id from a `<chunk id='…'>` tag whose content you actually quoted or paraphrased. Copy digit-for-digit; do **not** retype from memory.
+- Before emitting `[citation:N]`, confirm the literal substring `<chunk id='N'>` (or its index twin `chunk_id="N"`) appears in the tool result you are summarising this turn. If you can't see it, omit the citation.
+- Never cite `<document_id>` — that's the parent doc, not a chunk.
+- Never invent, normalise, shorten, or guess at adjacent ids. If unsure between two candidates, omit rather than pick.
+- Prefer **fewer accurate citations** over many speculative ones. One correct `[citation:128]` is more useful than a string of wrong ids.
+- Multiple chunks supporting the same point → comma-separated and copied individually: `[citation:128], [citation:129]`.
- Plain square brackets only — no markdown links, no parentheses, no footnote numbers.
-- Listings (`ls` / `glob` / `grep`), error strings, and files without either form carry nothing to cite.
-- The absolute path under `/documents/` is always required; citations are additive, they do not replace the path reference.
+- If a claim came from a tool result that did **not** carry a chunk id (`ls`, `glob`, `grep` listings, error strings, or files without `<chunk id='…'>`), skip the citation.
+- The absolute path under `/documents/` is always required; chunk citations are additive, they do not replace the path reference.
-Example: `The Q2 roadmap lists three milestones (/documents/planning/q2-roadmap.md) [citation:d42#L3-9].`
+Example: `The Q2 roadmap lists three milestones (/documents/planning/q2-roadmap.md) [citation:128], [citation:129].`
diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py
index c8eb33b8f..63be54654 100644
--- a/surfsense_backend/app/config/__init__.py
+++ b/surfsense_backend/app/config/__init__.py
@@ -953,9 +953,8 @@ class Config:
os.getenv("EMBEDDING_CACHE_ENABLED", "false").strip().lower() == "true"
)
# Bump to invalidate every cached embedding set after a chunker change.
- # v2: chunks became exact (raw) slices of source_markdown for citation spans.
EMBEDDING_CACHE_CHUNKER_VERSION = int(
- os.getenv("EMBEDDING_CACHE_CHUNKER_VERSION", "2")
+ os.getenv("EMBEDDING_CACHE_CHUNKER_VERSION", "1")
)
EMBEDDING_CACHE_TTL_DAYS = int(os.getenv("EMBEDDING_CACHE_TTL_DAYS", "90"))
EMBEDDING_CACHE_MAX_TOTAL_MB = int(
diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py
index 9aa217d2c..3f098d5d2 100644
--- a/surfsense_backend/app/db.py
+++ b/surfsense_backend/app/db.py
@@ -1467,11 +1467,6 @@ class Chunk(BaseModel, TimestampMixin):
# ordering reads are document-scoped (covered by ix_chunks_document_id) and
# building a position index on the large chunks table is not worth it.
position = Column(Integer, nullable=False, server_default="0")
- # Half-open char span into the document's source_markdown the chunk was cut
- # from. Nullable: historical rows predate spans and populate on reindex.
- # Invariant for span-aware rows: source_markdown[start_char:end_char] == content.
- start_char = Column(Integer, nullable=True)
- end_char = Column(Integer, nullable=True)
document_id = Column(
Integer,
diff --git a/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py b/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py
index 58872a219..95321a229 100644
--- a/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py
+++ b/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py
@@ -18,26 +18,23 @@ from app.indexing_pipeline.cache.eligibility import is_embedding_cacheable
from app.indexing_pipeline.cache.schemas import CachedChunk, EmbeddingKey, EmbeddingSet
from app.indexing_pipeline.cache.service import EmbeddingCacheService
from app.indexing_pipeline.cache.settings import load_embedding_cache_settings
-from app.indexing_pipeline.document_chunker import ChunkSlice, chunk_markdown_with_spans
+from app.indexing_pipeline.document_chunker import chunk_text, chunk_text_hybrid
from app.indexing_pipeline.document_embedder import embed_texts
from app.observability import metrics
logger = logging.getLogger(__name__)
-SliceEmbedding = tuple[ChunkSlice, np.ndarray]
+ChunkPair = tuple[str, np.ndarray]
async def build_chunk_embeddings(
markdown: str, *, use_code_chunker: bool
-) -> tuple[np.ndarray, list[SliceEmbedding]]:
- """Return the document-level vector and ordered ``(ChunkSlice, vector)`` pairs.
+) -> tuple[np.ndarray, list[ChunkPair]]:
+ """Return the document-level vector and ordered ``(chunk_text, vector)`` pairs.
- Slices are always recomputed (cheap) so their char spans are exact; only the
- embeddings are cached, reused when the same markdown was embedded with the
- current model and chunker.
+ Drop-in for the inline chunk+embed step; reuses prior output when the same
+ markdown has already been embedded with the current model and chunker.
"""
- slices = await chunk_slices(markdown, use_code_chunker=use_code_chunker)
-
settings = load_embedding_cache_settings()
chunker_kind = "code" if use_code_chunker else "hybrid"
embedding_dim = getattr(config.embedding_model_instance, "dimension", None)
@@ -48,7 +45,7 @@ async def build_chunk_embeddings(
embedding_dim=embedding_dim,
)
if not cacheable:
- return await _compute(markdown, slices)
+ return await _compute(markdown, use_code_chunker=use_code_chunker)
key = EmbeddingKey(
markdown_sha256=_hash_text(markdown),
@@ -59,30 +56,31 @@ async def build_chunk_embeddings(
)
cached = await _recall(key)
- if cached is not None and _aligns(cached, slices):
+ if cached is not None:
metrics.record_embedding_cache_lookup(
embedding_model=key.embedding_model,
chunker_kind=chunker_kind,
outcome="hit",
)
logger.debug("Embedding cache hit for %s", key.markdown_sha256)
- return cached.summary_embedding, list(
- zip(slices, (c.embedding for c in cached.chunks), strict=True)
- )
+ return cached.summary_embedding, [(c.text, c.embedding) for c in cached.chunks]
metrics.record_embedding_cache_lookup(
embedding_model=key.embedding_model, chunker_kind=chunker_kind, outcome="miss"
)
- summary_embedding, pairs = await _compute(markdown, slices)
- await _remember(key, summary_embedding, pairs)
- return summary_embedding, pairs
-
-
-async def chunk_slices(markdown: str, *, use_code_chunker: bool) -> list[ChunkSlice]:
- """Chunk markdown into ordered, char-addressed slices off the event loop."""
- return await asyncio.to_thread(
- chunk_markdown_with_spans, markdown, use_code_chunker
+ summary_embedding, chunk_pairs = await _compute(
+ markdown, use_code_chunker=use_code_chunker
)
+ await _remember(key, summary_embedding, chunk_pairs)
+ return summary_embedding, chunk_pairs
+
+
+async def chunk_markdown(markdown: str, *, use_code_chunker: bool) -> list[str]:
+ """Chunk markdown into ordered texts with the pipeline's chunker selection."""
+ if use_code_chunker:
+ return await asyncio.to_thread(chunk_text, markdown, use_code_chunker=True)
+ # Table-aware hybrid chunker keeps Markdown tables intact (issue #1334).
+ return await asyncio.to_thread(chunk_text_hybrid, markdown)
async def embed_batch(texts: list[str]) -> list[np.ndarray]:
@@ -90,19 +88,13 @@ async def embed_batch(texts: list[str]) -> list[np.ndarray]:
return await asyncio.to_thread(embed_texts, texts)
-def _aligns(cached: EmbeddingSet, slices: list[ChunkSlice]) -> bool:
- """A hit is only usable if its texts still match the current chunking."""
- return len(cached.chunks) == len(slices) and all(
- c.text == s.text for c, s in zip(cached.chunks, slices, strict=True)
- )
-
-
async def _compute(
- markdown: str, slices: list[ChunkSlice]
-) -> tuple[np.ndarray, list[SliceEmbedding]]:
- embeddings = await embed_batch([markdown, *(s.text for s in slices)])
+ markdown: str, *, use_code_chunker: bool
+) -> tuple[np.ndarray, list[ChunkPair]]:
+ chunk_texts = await chunk_markdown(markdown, use_code_chunker=use_code_chunker)
+ embeddings = await embed_batch([markdown, *chunk_texts])
summary_embedding, *chunk_embeddings = embeddings
- return summary_embedding, list(zip(slices, chunk_embeddings, strict=True))
+ return summary_embedding, list(zip(chunk_texts, chunk_embeddings, strict=False))
async def _recall(key: EmbeddingKey) -> EmbeddingSet | None:
@@ -118,14 +110,14 @@ async def _recall(key: EmbeddingKey) -> EmbeddingSet | None:
async def _remember(
- key: EmbeddingKey, summary_embedding: np.ndarray, pairs: list[SliceEmbedding]
+ key: EmbeddingKey, summary_embedding: np.ndarray, chunk_pairs: list[ChunkPair]
) -> None:
try:
from app.tasks.celery_tasks import get_celery_session_maker
embedding_set = EmbeddingSet(
summary_embedding=summary_embedding,
- chunks=[CachedChunk(text=s.text, embedding=vec) for s, vec in pairs],
+ chunks=[CachedChunk(text=text, embedding=vec) for text, vec in chunk_pairs],
)
async with get_celery_session_maker()() as session:
await EmbeddingCacheService(session).remember(key, embedding_set)
diff --git a/surfsense_backend/app/indexing_pipeline/chunk_reconciler.py b/surfsense_backend/app/indexing_pipeline/chunk_reconciler.py
index dd57a44d1..9354aeb9f 100644
--- a/surfsense_backend/app/indexing_pipeline/chunk_reconciler.py
+++ b/surfsense_backend/app/indexing_pipeline/chunk_reconciler.py
@@ -19,9 +19,6 @@ class ExistingChunk:
id: int
content: str
position: int
- # Stored char span; None for legacy rows indexed before spans existed.
- start_char: int | None = None
- end_char: int | None = None
@dataclass(frozen=True, slots=True)
diff --git a/surfsense_backend/app/indexing_pipeline/document_chunker.py b/surfsense_backend/app/indexing_pipeline/document_chunker.py
index 096624109..6ae81b7a8 100644
--- a/surfsense_backend/app/indexing_pipeline/document_chunker.py
+++ b/surfsense_backend/app/indexing_pipeline/document_chunker.py
@@ -1,30 +1,16 @@
import re
-from dataclasses import dataclass
from app.config import config
# Regex that matches a Markdown table block (header + separator + one or more rows)
# A table block starts with a | at the beginning of a line and ends when a
-# non-table line (or end of string) is encountered. The final row may end at EOF
-# without a trailing newline, so the whole table stays one slice.
+# non-table line (or end of string) is encountered.
_TABLE_BLOCK_RE = re.compile(
- r"(?:(?:^|\n)(?=[ \t]*\|)(?:[ \t]*\|[^\n]*(?:\n|$))+)",
+ r"(?:(?:^|\n)(?=[ \t]*\|)(?:[ \t]*\|[^\n]*\n)+)",
re.MULTILINE,
)
-@dataclass(frozen=True, slots=True)
-class ChunkSlice:
- """A chunk paired with its half-open char span into the source markdown.
-
- Invariant: ``markdown[start_char:end_char] == text``.
- """
-
- text: str
- start_char: int
- end_char: int
-
-
def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
"""Chunk a text string using the configured chunker and return the chunk texts."""
chunker = (
@@ -33,63 +19,41 @@ def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]:
return [c.text for c in chunker.chunk(text)]
-def chunk_markdown_with_spans(
- text: str, use_code_chunker: bool = False
-) -> list[ChunkSlice]:
- """Chunk markdown into a lossless, contiguous partition of char-addressed slices.
+def chunk_text_hybrid(text: str) -> list[str]:
+ """Table-aware chunker that prevents Markdown tables from being split mid-row.
- Tables stay whole (issue #1334) and every slice is an exact substring of
- ``text``, so ``"".join(s.text) == text`` and ``text[s:e] == s.text``. This is
- the offset record citations resolve against.
+ Algorithm:
+ 1. Scan the document for Markdown table blocks.
+ 2. Each table block is emitted as a single, unmodified chunk so that its
+ header, separator row, and data rows always stay together.
+ 3. The non-table prose segments between (and around) tables are passed through
+ the normal ``chunk_text`` chunker and their sub-chunks are interleaved in
+ document order.
+
+ This ensures that table data is never sliced in the middle by the token-based
+ chunker, which would otherwise produce garbled rows that are useless for RAG.
+
+ Fixes #1334.
"""
- if not text:
- return []
-
- slices: list[ChunkSlice] = []
+ chunks: list[str] = []
cursor = 0
for match in _TABLE_BLOCK_RE.finditer(text):
- if match.start() > cursor:
- slices.extend(
- _segment_slices(text, cursor, match.start(), use_code_chunker)
- )
- slices.append(ChunkSlice(match.group(0), match.start(), match.end()))
+ # Prose before this table
+ prose = text[cursor : match.start()].strip()
+ if prose:
+ chunks.extend(chunk_text(prose))
+
+ # The table itself is kept as one indivisible chunk
+ table_block = match.group(0).strip()
+ if table_block:
+ chunks.append(table_block)
+
cursor = match.end()
- if len(text) > cursor:
- slices.extend(_segment_slices(text, cursor, len(text), use_code_chunker))
+ # Remaining prose after the last table (or entire text if no tables)
+ trailing = text[cursor:].strip()
+ if trailing:
+ chunks.extend(chunk_text(trailing))
- return slices
-
-
-def _segment_slices(
- text: str, start: int, end: int, use_code_chunker: bool
-) -> list[ChunkSlice]:
- """Sub-chunk one non-table segment into contiguous, char-addressed slices."""
- chunker = (
- config.code_chunker_instance if use_code_chunker else config.chunker_instance
- )
- segment = text[start:end]
- chunks = chunker.chunk(segment)
-
- slices: list[ChunkSlice] = []
- local = 0
- for chunk in chunks:
- # Use the chunker's end offset only as a cut point, then re-slice the
- # segment ourselves so the result is an exact, gap-free substring.
- local_end = min(max(chunk.end_index, local), len(segment))
- if local_end <= local:
- continue
- slices.append(
- ChunkSlice(segment[local:local_end], start + local, start + local_end)
- )
- local = local_end
-
- if local < len(segment):
- if slices:
- last = slices[-1]
- slices[-1] = ChunkSlice(text[last.start_char : end], last.start_char, end)
- else:
- slices.append(ChunkSlice(segment[local:], start + local, end))
-
- return slices
+ return chunks
diff --git a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
index 0cb74089b..30ea9d5d6 100644
--- a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
+++ b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
@@ -20,10 +20,9 @@ from app.db import (
DocumentType,
)
from app.indexing_pipeline.cache import build_chunk_embeddings
-from app.indexing_pipeline.cache.cached_indexing import chunk_slices, embed_batch
-from app.indexing_pipeline.chunk_reconciler import ChunkPlan, ExistingChunk, reconcile
+from app.indexing_pipeline.cache.cached_indexing import chunk_markdown, embed_batch
+from app.indexing_pipeline.chunk_reconciler import ExistingChunk, reconcile
from app.indexing_pipeline.connector_document import ConnectorDocument
-from app.indexing_pipeline.document_chunker import ChunkSlice
from app.indexing_pipeline.document_hashing import (
compute_content_hash,
compute_identifier_hash,
@@ -490,22 +489,12 @@ class IndexingPipelineService:
async def _load_existing_chunks(self, document_id: int) -> list[ExistingChunk]:
result = await self.session.execute(
- select(
- Chunk.id,
- Chunk.content,
- Chunk.position,
- Chunk.start_char,
- Chunk.end_char,
- ).where(Chunk.document_id == document_id)
+ select(Chunk.id, Chunk.content, Chunk.position).where(
+ Chunk.document_id == document_id
+ )
)
return [
- ExistingChunk(
- id=row.id,
- content=row.content,
- position=row.position,
- start_char=row.start_char,
- end_char=row.end_char,
- )
+ ExistingChunk(id=row.id, content=row.content, position=row.position)
for row in result
]
@@ -516,21 +505,15 @@ class IndexingPipelineService:
delete(Chunk).where(Chunk.document_id == document.id)
)
- summary_embedding, slice_pairs = await build_chunk_embeddings(
+ summary_embedding, chunk_pairs = await build_chunk_embeddings(
content,
use_code_chunker=connector_doc.should_use_code_chunker,
)
document.embedding = summary_embedding
return [
- Chunk(
- content=chunk_slice.text,
- embedding=emb,
- position=i,
- start_char=chunk_slice.start_char,
- end_char=chunk_slice.end_char,
- )
- for i, (chunk_slice, emb) in enumerate(slice_pairs)
+ Chunk(content=text, embedding=emb, position=i)
+ for i, (text, emb) in enumerate(chunk_pairs)
]
async def _reindex_incrementally(
@@ -542,39 +525,35 @@ class IndexingPipelineService:
) -> int:
"""Edit path: keep rows whose text survived, embed only new texts.
- Unchanged rows keep their embedding and their HNSW/GIN index entries. An
- edit can shift a kept chunk's char span without changing its text, so
- every kept row's position and span are refreshed whenever they drift.
+ Unchanged rows keep their embedding and their HNSW/GIN index entries;
+ moved rows get a position-only UPDATE, which touches neither index.
"""
- slices = await chunk_slices(
+ new_texts = await chunk_markdown(
content, use_code_chunker=connector_doc.should_use_code_chunker
)
- new_texts = [s.text for s in slices]
plan = reconcile(existing, new_texts)
# One batch: the document-level summary vector plus the missing chunks.
embeddings = await embed_batch([content, *[t for _, t in plan.to_embed]])
summary_embedding, *new_embeddings = embeddings
+ if plan.reused:
+ await self.session.execute(
+ update(Chunk),
+ [{"id": cid, "position": pos} for cid, pos in plan.reused],
+ )
if plan.to_delete:
await self.session.execute(
delete(Chunk).where(Chunk.id.in_(plan.to_delete))
)
-
- span_updates = self._kept_row_span_updates(existing, slices, plan)
- if span_updates:
- await self.session.execute(update(Chunk), span_updates)
-
self.session.add_all(
Chunk(
- content=slices[pos].text,
+ content=text,
embedding=emb,
position=pos,
- start_char=slices[pos].start_char,
- end_char=slices[pos].end_char,
document_id=document.id,
)
- for (pos, _text), emb in zip(plan.to_embed, new_embeddings, strict=True)
+ for (pos, text), emb in zip(plan.to_embed, new_embeddings, strict=True)
)
document.embedding = summary_embedding
@@ -585,36 +564,6 @@ class IndexingPipelineService:
)
return len(new_texts)
- @staticmethod
- def _kept_row_span_updates(
- existing: list[ExistingChunk],
- slices: list[ChunkSlice],
- plan: ChunkPlan,
- ) -> list[dict]:
- """Position/span writes for kept rows, emitted only where a value drifts."""
- deleted = set(plan.to_delete)
- moved = dict(plan.reused)
- updates: list[dict] = []
- for chunk in existing:
- if chunk.id in deleted:
- continue
- new_position = moved.get(chunk.id, chunk.position)
- target = slices[new_position]
- if (
- chunk.position != new_position
- or chunk.start_char != target.start_char
- or chunk.end_char != target.end_char
- ):
- updates.append(
- {
- "id": chunk.id,
- "position": new_position,
- "start_char": target.start_char,
- "end_char": target.end_char,
- }
- )
- return updates
-
async def _enqueue_ai_sort_if_enabled(self, document: Document) -> None:
"""Fire-and-forget: enqueue incremental AI sort if the search space has it enabled."""
try:
diff --git a/surfsense_backend/app/retriever/chunks_hybrid_search.py b/surfsense_backend/app/retriever/chunks_hybrid_search.py
index adce14e53..5e5edec2e 100644
--- a/surfsense_backend/app/retriever/chunks_hybrid_search.py
+++ b/surfsense_backend/app/retriever/chunks_hybrid_search.py
@@ -440,15 +440,8 @@ class ChucksHybridSearchRetriever:
chunk_filter = numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC
# Select only the columns we need (skip Chunk.embedding ~12KB/row).
- # start_char/end_char carry the citation span; None for legacy rows.
chunk_query = (
- select(
- Chunk.id,
- Chunk.content,
- Chunk.document_id,
- Chunk.start_char,
- Chunk.end_char,
- )
+ select(Chunk.id, Chunk.content, Chunk.document_id)
.join(numbered, Chunk.id == numbered.c.chunk_id)
.where(chunk_filter)
.order_by(Chunk.document_id, Chunk.position, Chunk.id)
@@ -483,14 +476,7 @@ class ChucksHybridSearchRetriever:
if doc_id not in doc_map:
continue
doc_entry = doc_map[doc_id]
- doc_entry["chunks"].append(
- {
- "chunk_id": row.id,
- "content": row.content,
- "start_char": row.start_char,
- "end_char": row.end_char,
- }
- )
+ doc_entry["chunks"].append({"chunk_id": row.id, "content": row.content})
if row.id in matched_chunk_ids:
doc_entry["matched_chunk_ids"].append(row.id)
diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py
index ea6b0d4fa..53f03a0ca 100644
--- a/surfsense_backend/app/routes/documents_routes.py
+++ b/surfsense_backend/app/routes/documents_routes.py
@@ -37,7 +37,6 @@ from app.schemas import (
from app.services.task_dispatcher import TaskDispatcher, get_task_dispatcher
from app.users import current_active_user
from app.utils.rbac import check_permission
-from app.utils.text_spans import char_span_to_line_range
try:
asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
@@ -968,12 +967,9 @@ async def get_document_by_chunk_id(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
- """Resolve a chunk id to its document plus a window of surrounding chunks.
-
- Returns the cited chunk's 1-based line range (cited_start_line/
- cited_end_line) when char spans exist, so callers can anchor the citation
- to exact source lines. Uses SQL-level pagination to avoid loading all
- chunks into memory.
+ """
+ Retrieves a document based on a chunk ID, including a window of chunks around the cited one.
+ Uses SQL-level pagination to avoid loading all chunks into memory.
"""
try:
from sqlalchemy import and_, func, or_
@@ -1037,17 +1033,6 @@ async def get_document_by_chunk_id(
)
windowed_chunks = windowed_result.scalars().all()
- cited_start_line: int | None = None
- cited_end_line: int | None = None
- if (
- chunk.start_char is not None
- and chunk.end_char is not None
- and document.source_markdown
- ):
- cited_start_line, cited_end_line = char_span_to_line_range(
- document.source_markdown, chunk.start_char, chunk.end_char
- )
-
return DocumentWithChunksRead(
id=document.id,
title=document.title,
@@ -1062,8 +1047,6 @@ async def get_document_by_chunk_id(
chunks=windowed_chunks,
total_chunks=total_chunks,
chunk_start_index=start,
- cited_start_line=cited_start_line,
- cited_end_line=cited_end_line,
)
except HTTPException:
raise
diff --git a/surfsense_backend/app/routes/editor_routes.py b/surfsense_backend/app/routes/editor_routes.py
index db46e4ee0..8250fff98 100644
--- a/surfsense_backend/app/routes/editor_routes.py
+++ b/surfsense_backend/app/routes/editor_routes.py
@@ -42,34 +42,6 @@ EDITOR_PLATE_MAX_BYTES = 1 * 1024 * 1024
EDITOR_PLATE_MAX_LINES = 5000
-def _raise_no_canonical_body(document: Document) -> None:
- """Translate a missing source_markdown into a status-aware HTTP error."""
- doc_status = document.status or {}
- state = (
- doc_status.get("state", "ready") if isinstance(doc_status, dict) else "ready"
- )
-
- if state in ("pending", "processing"):
- raise HTTPException(
- status_code=409,
- detail="This document is still being processed. Please wait a moment and try again.",
- )
- if state == "failed":
- reason = (
- doc_status.get("reason", "Unknown error")
- if isinstance(doc_status, dict)
- else "Unknown error"
- )
- raise HTTPException(
- status_code=422,
- detail=f"Processing failed: {reason}. You can delete this document and re-upload it.",
- )
- raise HTTPException(
- status_code=400,
- detail="This document has no editable content. It may not have been processed correctly. Try re-indexing or re-uploading it.",
- )
-
-
@router.get("/search-spaces/{search_space_id}/documents/{document_id}/editor-content")
async def get_editor_content(
search_space_id: int,
@@ -80,9 +52,8 @@ async def get_editor_content(
"""
Get document content for editing.
- Returns source_markdown (the canonical body) for the Plate.js editor, with a
- one-time migration from legacy blocknote_document. Never reconstructs the
- body from chunks.
+ Returns source_markdown for the Plate.js editor.
+ Falls back to blocknote_document → markdown conversion, then chunk reconstruction.
Requires DOCUMENTS_READ permission.
"""
@@ -152,9 +123,52 @@ async def get_editor_content(
await session.commit()
return _build_response(empty_markdown)
- # No canonical body. Chunks are an index artifact, never the source of
- # truth, so surface the processing state instead of rebuilding from them.
- _raise_no_canonical_body(document)
+ chunk_contents_result = await session.execute(
+ select(Chunk.content)
+ .filter(Chunk.document_id == document_id)
+ .order_by(Chunk.position, Chunk.id)
+ )
+ chunk_contents = chunk_contents_result.scalars().all()
+
+ if not chunk_contents:
+ doc_status = document.status or {}
+ state = (
+ doc_status.get("state", "ready")
+ if isinstance(doc_status, dict)
+ else "ready"
+ )
+ if state in ("pending", "processing"):
+ raise HTTPException(
+ status_code=409,
+ detail="This document is still being processed. Please wait a moment and try again.",
+ )
+ if state == "failed":
+ reason = (
+ doc_status.get("reason", "Unknown error")
+ if isinstance(doc_status, dict)
+ else "Unknown error"
+ )
+ raise HTTPException(
+ status_code=422,
+ detail=f"Processing failed: {reason}. You can delete this document and re-upload it.",
+ )
+ raise HTTPException(
+ status_code=400,
+ detail="This document has no content. It may not have been processed correctly. Try deleting and re-uploading it.",
+ )
+
+ markdown_content = "\n\n".join(chunk_contents)
+
+ if not markdown_content.strip():
+ raise HTTPException(
+ status_code=400,
+ detail="This document appears to be empty. Try re-uploading or editing it to add content.",
+ )
+
+ document.source_markdown = markdown_content
+ await session.commit()
+
+ return _build_response(markdown_content)
@router.get(
@@ -167,9 +181,8 @@ async def download_document_markdown(
user: User = Depends(current_active_user),
):
"""
- Download the canonical document body as a .md file.
-
- Serves source_markdown, migrating legacy blocknote_document when present.
+ Download the full document content as a .md file.
+ Reconstructs markdown from source_markdown or chunks.
"""
await check_permission(
session,
@@ -195,6 +208,15 @@ async def download_document_markdown(
from app.utils.blocknote_to_markdown import blocknote_to_markdown
markdown = blocknote_to_markdown(document.blocknote_document)
+ if markdown is None:
+ chunk_contents_result = await session.execute(
+ select(Chunk.content)
+ .filter(Chunk.document_id == document_id)
+ .order_by(Chunk.position, Chunk.id)
+ )
+ chunk_contents = chunk_contents_result.scalars().all()
+ if chunk_contents:
+ markdown = "\n\n".join(chunk_contents)
if not markdown or not markdown.strip():
raise HTTPException(
@@ -335,6 +357,15 @@ async def export_document(
from app.utils.blocknote_to_markdown import blocknote_to_markdown
markdown_content = blocknote_to_markdown(document.blocknote_document)
+ if markdown_content is None:
+ chunk_contents_result = await session.execute(
+ select(Chunk.content)
+ .filter(Chunk.document_id == document_id)
+ .order_by(Chunk.position, Chunk.id)
+ )
+ chunk_contents = chunk_contents_result.scalars().all()
+ if chunk_contents:
+ markdown_content = "\n\n".join(chunk_contents)
if not markdown_content or not markdown_content.strip():
raise HTTPException(status_code=400, detail="Document has no content to export")
diff --git a/surfsense_backend/app/schemas/chunks.py b/surfsense_backend/app/schemas/chunks.py
index 685aa4762..7fec0d445 100644
--- a/surfsense_backend/app/schemas/chunks.py
+++ b/surfsense_backend/app/schemas/chunks.py
@@ -17,7 +17,4 @@ class ChunkUpdate(ChunkBase):
class ChunkRead(ChunkBase, IDModel, TimestampModel):
- start_char: int | None = None
- end_char: int | None = None
-
model_config = ConfigDict(from_attributes=True)
diff --git a/surfsense_backend/app/schemas/documents.py b/surfsense_backend/app/schemas/documents.py
index 162dd6882..49d2836b2 100644
--- a/surfsense_backend/app/schemas/documents.py
+++ b/surfsense_backend/app/schemas/documents.py
@@ -73,10 +73,6 @@ class DocumentWithChunksRead(DocumentRead):
chunks: list[ChunkRead] = []
total_chunks: int = 0
chunk_start_index: int = 0
- # 1-based inclusive line range of the cited chunk within source_markdown;
- # None when the chunk predates char spans or the body is unavailable.
- cited_start_line: int | None = None
- cited_end_line: int | None = None
model_config = ConfigDict(from_attributes=True)
diff --git a/surfsense_backend/app/utils/text_spans.py b/surfsense_backend/app/utils/text_spans.py
deleted file mode 100644
index c12201174..000000000
--- a/surfsense_backend/app/utils/text_spans.py
+++ /dev/null
@@ -1,23 +0,0 @@
-"""Convert char spans into document-relative line ranges.
-
-Chunks store half-open char spans into ``source_markdown``; citations and the
-editor speak in line numbers. This is the single shared conversion so search,
-the resolve API, and highlighting all agree on what "lines X-Y" means.
-"""
-
-from __future__ import annotations
-
-
-def char_span_to_line_range(text: str, start_char: int, end_char: int) -> tuple[int, int]:
- """Return the 1-based inclusive line range covering ``[start_char, end_char)``.
-
- Offsets are clamped to ``text`` bounds. An empty span resolves to the single
- line containing it.
- """
- n = len(text)
- start = max(0, min(start_char, n))
- end = max(start, min(end_char, n))
- start_line = text.count("\n", 0, start) + 1
- last_char_index = max(start, end - 1)
- end_line = text.count("\n", 0, last_char_index) + 1
- return start_line, end_line
diff --git a/surfsense_backend/tests/integration/agents/multi_agent_chat/test_kb_persistence_spans.py b/surfsense_backend/tests/integration/agents/multi_agent_chat/test_kb_persistence_spans.py
deleted file mode 100644
index 77e2e5f18..000000000
--- a/surfsense_backend/tests/integration/agents/multi_agent_chat/test_kb_persistence_spans.py
+++ /dev/null
@@ -1,80 +0,0 @@
-"""NOTE writes must carry the same char spans as the indexing pipeline.
-
-``_create_document`` / ``_update_document`` are the cloud agent's KB write
-paths. They must chunk through the shared span chunker so every persisted
-chunk resolves back to an exact slice of ``source_markdown`` for citations.
-"""
-
-from __future__ import annotations
-
-import pytest
-from sqlalchemy import select
-
-from app.agents.chat.multi_agent_chat.main_agent.middleware.kb_persistence import (
- middleware as kb,
-)
-from app.db import Chunk
-
-pytestmark = [pytest.mark.integration, pytest.mark.asyncio]
-
-_BODY = "Intro paragraph.\n\nBody paragraph with detail.\n\nOutro paragraph."
-_NEW_BODY = "Rewritten intro.\n\nFresh body content.\n\nNew closing line."
-
-
-async def _ordered_chunks(session, doc_id: int) -> list[Chunk]:
- rows = await session.execute(
- select(Chunk).where(Chunk.document_id == doc_id).order_by(Chunk.position)
- )
- return list(rows.scalars().all())
-
-
-def _assert_spans_resolve(source_markdown: str, chunks: list[Chunk]) -> None:
- assert chunks
- for chunk in chunks:
- assert chunk.start_char is not None
- assert chunk.end_char is not None
- assert source_markdown[chunk.start_char : chunk.end_char] == chunk.content
-
-
-@pytest.mark.usefixtures("patched_embed_texts")
-async def test_note_create_populates_chunk_spans(
- db_session, db_search_space, db_user
-) -> None:
- doc = await kb._create_document(
- db_session,
- virtual_path="/documents/note.md",
- content=_BODY,
- search_space_id=db_search_space.id,
- created_by_id=str(db_user.id),
- )
- await db_session.flush()
-
- chunks = await _ordered_chunks(db_session, doc.id)
- _assert_spans_resolve(doc.source_markdown, chunks)
-
-
-@pytest.mark.usefixtures("patched_embed_texts")
-async def test_note_update_refreshes_chunk_spans(
- db_session, db_search_space, db_user
-) -> None:
- doc = await kb._create_document(
- db_session,
- virtual_path="/documents/note.md",
- content=_BODY,
- search_space_id=db_search_space.id,
- created_by_id=str(db_user.id),
- )
- await db_session.flush()
-
- updated = await kb._update_document(
- db_session,
- doc_id=doc.id,
- content=_NEW_BODY,
- virtual_path="/documents/note.md",
- search_space_id=db_search_space.id,
- )
- await db_session.flush()
-
- assert updated is not None
- chunks = await _ordered_chunks(db_session, updated.id)
- _assert_spans_resolve(updated.source_markdown, chunks)
diff --git a/surfsense_backend/tests/integration/conftest.py b/surfsense_backend/tests/integration/conftest.py
index e67a025cc..6b8aa3cdb 100644
--- a/surfsense_backend/tests/integration/conftest.py
+++ b/surfsense_backend/tests/integration/conftest.py
@@ -158,12 +158,13 @@ def patched_embed_texts_raises(monkeypatch) -> MagicMock:
@pytest.fixture
def patched_chunk_text(monkeypatch) -> MagicMock:
- from app.indexing_pipeline.document_chunker import ChunkSlice
-
- text = "Test chunk content."
- mock = MagicMock(return_value=[ChunkSlice(text, 0, len(text))])
+ mock = MagicMock(return_value=["Test chunk content."])
monkeypatch.setattr(
- "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
+ "app.indexing_pipeline.cache.cached_indexing.chunk_text",
+ mock,
+ )
+ monkeypatch.setattr(
+ "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
mock,
)
return mock
diff --git a/surfsense_backend/tests/integration/document_upload/conftest.py b/surfsense_backend/tests/integration/document_upload/conftest.py
index f73c4eaaf..bd889360f 100644
--- a/surfsense_backend/tests/integration/document_upload/conftest.py
+++ b/surfsense_backend/tests/integration/document_upload/conftest.py
@@ -286,12 +286,9 @@ def _mock_external_apis(monkeypatch):
"app.indexing_pipeline.cache.cached_indexing.embed_texts",
MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]),
)
- from app.indexing_pipeline.document_chunker import ChunkSlice
-
- chunk = "Test chunk content."
monkeypatch.setattr(
- "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
- MagicMock(return_value=[ChunkSlice(chunk, 0, len(chunk))]),
+ "app.indexing_pipeline.cache.cached_indexing.chunk_text",
+ MagicMock(return_value=["Test chunk content."]),
)
diff --git a/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py b/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py
index e89d7592b..814129c8d 100644
--- a/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py
+++ b/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py
@@ -176,14 +176,9 @@ async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, m
@pytest.mark.usefixtures("patched_embed_texts")
async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, mocker):
"""Reindexing replaces old chunks with new content rather than appending."""
- from app.indexing_pipeline.document_chunker import ChunkSlice
-
mocker.patch(
- "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
- side_effect=[
- [ChunkSlice("Original chunk.", 0, len("Original chunk."))],
- [ChunkSlice("Updated chunk.", 0, len("Updated chunk."))],
- ],
+ "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
+ side_effect=[["Original chunk."], ["Updated chunk."]],
)
adapter = UploadDocumentAdapter(db_session)
diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_index_editions.py b/surfsense_backend/tests/integration/indexing_pipeline/test_index_editions.py
index f86ee8e4f..68d5ec0af 100644
--- a/surfsense_backend/tests/integration/indexing_pipeline/test_index_editions.py
+++ b/surfsense_backend/tests/integration/indexing_pipeline/test_index_editions.py
@@ -18,22 +18,16 @@ _V1 = "Intro paragraph.\n\nBody paragraph.\n\nOutro paragraph."
@pytest.fixture
def paragraph_chunker(monkeypatch):
- """One slice per markdown paragraph, so edits map to chunk-level diffs."""
- from app.indexing_pipeline.document_chunker import ChunkSlice
+ """One chunk per markdown paragraph, so edits map to chunk-level diffs."""
- def _split(markdown, *_args, **_kwargs):
- slices = []
- cursor = 0
- for para in markdown.split("\n\n"):
- start = markdown.index(para, cursor)
- cursor = start + len(para)
- if para.strip():
- slices.append(ChunkSlice(para, start, cursor))
- return slices
+ def _split(markdown, **_kwargs):
+ return [p for p in markdown.split("\n\n") if p.strip()]
monkeypatch.setattr(
- "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
- _split,
+ "app.indexing_pipeline.cache.cached_indexing.chunk_text", _split
+ )
+ monkeypatch.setattr(
+ "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", _split
)
diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_index_spans.py b/surfsense_backend/tests/integration/indexing_pipeline/test_index_spans.py
deleted file mode 100644
index 869045bf6..000000000
--- a/surfsense_backend/tests/integration/indexing_pipeline/test_index_spans.py
+++ /dev/null
@@ -1,96 +0,0 @@
-"""Indexing records char spans so a chunk addresses its exact slice of the body.
-
-Uses the real chunker (only embeddings are faked) so the span/partition
-invariants are exercised end to end.
-"""
-
-import pytest
-from sqlalchemy import select
-
-from app.db import Chunk, Document
-from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
-
-pytestmark = pytest.mark.integration
-
-_BODY = (
- "# Report\n\n"
- + "Intro paragraph that is reasonably long and descriptive. " * 8
- + "\n\n| col a | col b |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |\n\n"
- + "Closing paragraph with a different shape and more words to chunk. " * 8
-)
-
-
-async def _ordered_chunks(session, document_id) -> list[Chunk]:
- result = await session.execute(
- select(Chunk)
- .filter(Chunk.document_id == document_id)
- .order_by(Chunk.position, Chunk.id)
- )
- return list(result.scalars().all())
-
-
-def _assert_spans_address_body(chunks: list[Chunk], body: str) -> None:
- for chunk in chunks:
- assert chunk.start_char is not None and chunk.end_char is not None
- assert body[chunk.start_char : chunk.end_char] == chunk.content
- assert "".join(c.content for c in chunks) == body
-
-
-async def _index(session, connector_doc) -> int:
- service = IndexingPipelineService(session=session)
- prepared = await service.prepare_for_indexing([connector_doc])
- document = prepared[0]
- await service.index(document, connector_doc)
- return document.id
-
-
-async def _reload_body(session, document_id) -> str:
- result = await session.execute(select(Document).filter(Document.id == document_id))
- return result.scalars().first().source_markdown
-
-
-@pytest.mark.usefixtures("patched_embed_texts")
-async def test_scratch_index_records_spans_addressing_body(
- db_session, db_search_space, make_connector_document
-):
- connector_doc = make_connector_document(
- search_space_id=db_search_space.id, source_markdown=_BODY
- )
-
- document_id = await _index(db_session, connector_doc)
-
- body = await _reload_body(db_session, document_id)
- chunks = await _ordered_chunks(db_session, document_id)
-
- assert len(chunks) > 1
- _assert_spans_address_body(chunks, body)
-
-
-@pytest.mark.usefixtures("patched_embed_texts")
-async def test_incremental_reindex_refreshes_shifted_spans(
- db_session, db_search_space, make_connector_document
-):
- """Inserting text at the top shifts every later chunk's span; kept rows must
- have their spans refreshed, not left pointing at the old offsets."""
- service = IndexingPipelineService(session=db_session)
-
- original = make_connector_document(
- search_space_id=db_search_space.id, source_markdown=_BODY
- )
- prepared = await service.prepare_for_indexing([original])
- document_id = prepared[0].id
- await service.index(prepared[0], original)
-
- edited_body = "# Prepended heading\n\nA brand new opening paragraph.\n\n" + _BODY
- edited = make_connector_document(
- search_space_id=db_search_space.id, source_markdown=edited_body
- )
- prepared_again = await service.prepare_for_indexing([edited])
- assert prepared_again, "edited content should requeue the document"
- await service.index(prepared_again[0], edited)
-
- body = await _reload_body(db_session, document_id)
- chunks = await _ordered_chunks(db_session, document_id)
-
- assert body == edited_body
- _assert_spans_address_body(chunks, body)
diff --git a/surfsense_backend/tests/integration/retriever/conftest.py b/surfsense_backend/tests/integration/retriever/conftest.py
index 96c6297bb..d2443723c 100644
--- a/surfsense_backend/tests/integration/retriever/conftest.py
+++ b/surfsense_backend/tests/integration/retriever/conftest.py
@@ -40,19 +40,11 @@ def _make_document(
)
-def _make_chunk(
- *,
- content: str,
- document_id: int,
- start_char: int | None = None,
- end_char: int | None = None,
-) -> Chunk:
+def _make_chunk(*, content: str, document_id: int) -> Chunk:
return Chunk(
content=content,
document_id=document_id,
embedding=DUMMY_EMBEDDING,
- start_char=start_char,
- end_char=end_char,
)
@@ -99,8 +91,6 @@ async def seed_large_doc(
_make_chunk(
content="quarterly performance review summary note content",
document_id=small_doc.id,
- start_char=0,
- end_char=10,
),
]
diff --git a/surfsense_backend/tests/integration/retriever/test_optimized_chunk_retriever.py b/surfsense_backend/tests/integration/retriever/test_optimized_chunk_retriever.py
index a8c85e65f..f80e59304 100644
--- a/surfsense_backend/tests/integration/retriever/test_optimized_chunk_retriever.py
+++ b/surfsense_backend/tests/integration/retriever/test_optimized_chunk_retriever.py
@@ -98,32 +98,6 @@ async def test_chunks_ordered_by_id(db_session, seed_large_doc):
assert chunk_ids == sorted(chunk_ids), "Chunks not ordered by ID"
-async def test_chunk_spans_returned(db_session, seed_large_doc):
- """Each chunk dict carries start_char/end_char (the citation span)."""
- space_id = seed_large_doc["search_space"].id
- small_doc_id = seed_large_doc["small_doc"].id
-
- retriever = ChucksHybridSearchRetriever(db_session)
- results = await retriever.hybrid_search(
- query_text="quarterly performance review summary",
- top_k=10,
- search_space_id=space_id,
- query_embedding=DUMMY_EMBEDDING,
- )
-
- for result in results:
- for chunk in result["chunks"]:
- assert "start_char" in chunk
- assert "end_char" in chunk
- if result["document"].get("id") == small_doc_id:
- seeded = result["chunks"][0]
- assert seeded["start_char"] == 0
- assert seeded["end_char"] == 10
- break
- else:
- pytest.fail("Small doc not found in search results")
-
-
async def test_score_is_positive_float(db_session, seed_large_doc):
"""Each result should have a positive float score from RRF."""
space_id = seed_large_doc["search_space"].id
diff --git a/surfsense_backend/tests/integration/test_documents_by_chunk_route.py b/surfsense_backend/tests/integration/test_documents_by_chunk_route.py
deleted file mode 100644
index f59c65d97..000000000
--- a/surfsense_backend/tests/integration/test_documents_by_chunk_route.py
+++ /dev/null
@@ -1,127 +0,0 @@
-"""Phase E.1 contract: the by-chunk resolve API exposes chunk char spans and
-derives the cited chunk's line range from source_markdown."""
-
-import pytest
-import pytest_asyncio
-from sqlalchemy.ext.asyncio import AsyncSession
-
-from app.db import Chunk, Document, DocumentStatus, DocumentType, SearchSpace, User
-
-pytestmark = pytest.mark.integration
-
-_BODY = "alpha\nbravo\ncharlie\ndelta"
-
-
-async def _make_document(
- session: AsyncSession,
- search_space: SearchSpace,
- user: User,
- *,
- source_markdown: str = _BODY,
-) -> Document:
- doc = Document(
- title="Doc",
- document_type=DocumentType.FILE,
- document_metadata={},
- content=source_markdown,
- content_hash="hash-by-chunk",
- source_markdown=source_markdown,
- search_space_id=search_space.id,
- created_by_id=user.id,
- status=DocumentStatus.ready(),
- )
- session.add(doc)
- await session.flush()
- return doc
-
-
-async def _add_chunk(
- session: AsyncSession,
- document: Document,
- *,
- content: str,
- position: int,
- start_char: int | None,
- end_char: int | None,
-) -> Chunk:
- chunk = Chunk(
- content=content,
- position=position,
- document_id=document.id,
- start_char=start_char,
- end_char=end_char,
- )
- session.add(chunk)
- await session.flush()
- return chunk
-
-
-@pytest_asyncio.fixture
-async def make_document(db_session, db_search_space, db_user):
- async def _make(**overrides):
- return await _make_document(db_session, db_search_space, db_user, **overrides)
-
- return _make
-
-
-async def test_cited_line_range_derived_from_spans(
- db_session, db_search_space, db_user, make_document
-):
- from app.routes.documents_routes import get_document_by_chunk_id
-
- doc = await make_document()
- await _add_chunk(
- db_session, doc, content="alpha\nbravo\n", position=0, start_char=0, end_char=12
- )
- cited = await _add_chunk(
- db_session,
- doc,
- content="charlie\ndelta",
- position=1,
- start_char=12,
- end_char=len(_BODY),
- )
-
- result = await get_document_by_chunk_id(
- cited.id, chunk_window=5, session=db_session, user=db_user
- )
-
- assert result.cited_start_line == 3
- assert result.cited_end_line == 4
-
-
-async def test_chunk_spans_exposed_in_response(
- db_session, db_search_space, db_user, make_document
-):
- from app.routes.documents_routes import get_document_by_chunk_id
-
- doc = await make_document()
- cited = await _add_chunk(
- db_session, doc, content="alpha\nbravo\n", position=0, start_char=0, end_char=12
- )
-
- result = await get_document_by_chunk_id(
- cited.id, chunk_window=5, session=db_session, user=db_user
- )
-
- chunk = next(c for c in result.chunks if c.id == cited.id)
- assert chunk.start_char == 0
- assert chunk.end_char == 12
-
-
-async def test_cited_line_range_null_without_spans(
- db_session, db_search_space, db_user, make_document
-):
- from app.routes.documents_routes import get_document_by_chunk_id
-
- doc = await make_document()
- cited = await _add_chunk(
- db_session, doc, content="alpha", position=0, start_char=None, end_char=None
- )
-
- result = await get_document_by_chunk_id(
- cited.id, chunk_window=5, session=db_session, user=db_user
- )
-
- assert result.cited_start_line is None
- assert result.cited_end_line is None
diff --git a/surfsense_backend/tests/integration/test_editor_routes.py b/surfsense_backend/tests/integration/test_editor_routes.py
deleted file mode 100644
index 382d4b4de..000000000
--- a/surfsense_backend/tests/integration/test_editor_routes.py
+++ /dev/null
@@ -1,175 +0,0 @@
-"""Phase A contract: editor read paths serve source_markdown and never
-reconstruct or mutate the body from chunks."""
-
-import pytest
-import pytest_asyncio
-from fastapi import HTTPException
-from sqlalchemy.ext.asyncio import AsyncSession
-
-from app.db import (
- Chunk,
- Document,
- DocumentStatus,
- DocumentType,
- SearchSpace,
- User,
-)
-
-pytestmark = pytest.mark.integration
-
-
-async def _make_document(
- session: AsyncSession,
- search_space: SearchSpace,
- user: User,
- *,
- document_type: DocumentType = DocumentType.FILE,
- source_markdown: str | None = "# Title\n\nBody line.",
- content: str = "Body line.",
- status: dict | None = None,
-) -> Document:
- doc = Document(
- title="Doc",
- document_type=document_type,
- document_metadata={},
- content=content,
- content_hash="hash-001",
- source_markdown=source_markdown,
- search_space_id=search_space.id,
- created_by_id=user.id,
- status=status or DocumentStatus.ready(),
- )
- session.add(doc)
- await session.flush()
- return doc
-
-
-async def _add_chunks(session: AsyncSession, document: Document, texts: list[str]):
- for position, text in enumerate(texts):
- session.add(Chunk(content=text, position=position, document_id=document.id))
- await session.flush()
-
-
-@pytest_asyncio.fixture
-async def make_document(db_session, db_search_space, db_user):
- async def _make(**overrides):
- return await _make_document(db_session, db_search_space, db_user, **overrides)
-
- return _make
-
-
-class TestGetEditorContent:
- async def test_returns_source_markdown_verbatim(
- self, db_session, db_search_space, db_user, make_document
- ):
- from app.routes.editor_routes import get_editor_content
-
- doc = await make_document(source_markdown="# Real\n\nCanonical body.")
-
- result = await get_editor_content(
- db_search_space.id, doc.id, session=db_session, user=db_user
- )
-
- assert result["source_markdown"] == "# Real\n\nCanonical body."
-
- async def test_does_not_reconstruct_body_from_chunks(
- self, db_session, db_search_space, db_user, make_document
- ):
- """A ready document without source_markdown must not be rebuilt from chunks."""
- from app.routes.editor_routes import get_editor_content
-
- doc = await make_document(source_markdown=None)
- await _add_chunks(db_session, doc, ["chunk one", "chunk two"])
-
- with pytest.raises(HTTPException) as exc:
- await get_editor_content(
- db_search_space.id, doc.id, session=db_session, user=db_user
- )
-
- assert exc.value.status_code == 400
- await db_session.refresh(doc)
- assert doc.source_markdown is None
-
- async def test_processing_document_without_body_returns_409(
- self, db_session, db_search_space, db_user, make_document
- ):
- from app.routes.editor_routes import get_editor_content
-
- doc = await make_document(
- source_markdown=None, status=DocumentStatus.processing()
- )
-
- with pytest.raises(HTTPException) as exc:
- await get_editor_content(
- db_search_space.id, doc.id, session=db_session, user=db_user
- )
-
- assert exc.value.status_code == 409
-
- async def test_failed_document_without_body_returns_422(
- self, db_session, db_search_space, db_user, make_document
- ):
- from app.routes.editor_routes import get_editor_content
-
- doc = await make_document(
- source_markdown=None, status=DocumentStatus.failed("boom")
- )
-
- with pytest.raises(HTTPException) as exc:
- await get_editor_content(
- db_search_space.id, doc.id, session=db_session, user=db_user
- )
-
- assert exc.value.status_code == 422
-
- async def test_empty_note_initializes_to_empty_markdown(
- self, db_session, db_search_space, db_user, make_document
- ):
- from app.routes.editor_routes import get_editor_content
-
- doc = await make_document(document_type=DocumentType.NOTE, source_markdown=None)
-
- result = await get_editor_content(
- db_search_space.id, doc.id, session=db_session, user=db_user
- )
-
- assert result["source_markdown"] == ""
-
-
-class TestDownloadMarkdown:
- async def test_does_not_reconstruct_body_from_chunks(
- self, db_session, db_search_space, db_user, make_document
- ):
- from app.routes.editor_routes import download_document_markdown
-
- doc = await make_document(source_markdown=None)
- await _add_chunks(db_session, doc, ["chunk one", "chunk two"])
-
- with pytest.raises(HTTPException) as exc:
- await download_document_markdown(
- db_search_space.id, doc.id, session=db_session, user=db_user
- )
-
- assert exc.value.status_code == 400
-
-
-class TestExportDocument:
- async def test_does_not_reconstruct_body_from_chunks(
- self, db_session, db_search_space, db_user, make_document
- ):
- from app.routes.editor_routes import export_document
- from app.routes.reports_routes import ExportFormat
-
- doc = await make_document(source_markdown=None)
- await _add_chunks(db_session, doc, ["chunk one", "chunk two"])
-
- with pytest.raises(HTTPException) as exc:
- await export_document(
- db_search_space.id,
- doc.id,
- format=ExportFormat.PLAIN,
- session=db_session,
- user=db_user,
- )
-
- assert exc.value.status_code == 400
diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py
deleted file mode 100644
index e068792b1..000000000
--- a/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py
+++ /dev/null
@@ -1,87 +0,0 @@
-"""Unit tests for search_knowledge_base hit rendering.
-
-The tool must surface the passage that actually matched (the RRF-ranked
-chunk), not the top of the document, and annotate it with its line range
-when the chunk carries a char span.
-"""
-
-from __future__ import annotations
-
-import pytest
-
-from app.agents.chat.multi_agent_chat.main_agent.tools.search_knowledge_base import (
- _format_hits,
-)
-
-pytestmark = pytest.mark.unit
-
-_BODY = "Intro paragraph.\n\nMatched passage here.\n\nClosing paragraph."
-
-
-def _hit() -> dict:
- intro = "Intro paragraph."
- matched = "Matched passage here."
- matched_start = _BODY.index(matched)
- return {
- "document": {"id": 7, "title": "note.md", "document_type": "NOTE"},
- "score": 0.42,
- "content": _BODY.replace("\n\n", "\n\n"),
- "matched_chunk_ids": [102],
- "chunks": [
- {
- "chunk_id": 101,
- "content": intro,
- "start_char": 0,
- "end_char": len(intro),
- },
- {
- "chunk_id": 102,
- "content": matched,
- "start_char": matched_start,
- "end_char": matched_start + len(matched),
- },
- ],
- }
-
-
-def test_renders_matched_passage_not_top_of_doc() -> None:
- out = _format_hits([_hit()], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
- assert "Matched passage here." in out
- # The intro chunk was not matched, so it must not be shown as the snippet.
- assert "Intro paragraph." not in out
-
-
-def test_emits_copyable_line_citation_token_when_spans_present() -> None:
- out = _format_hits([_hit()], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
- # "Matched passage here." sits on line 3 of the body; the hit must surface
- # a ready-to-copy token so the agent can cite without a separate read.
- assert "[citation:d7#L3-3]" in out
-
-
-def test_header_includes_document_id() -> None:
- out = _format_hits([_hit()], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
- assert "id=7" in out
-
-
-def test_omits_citation_token_when_spans_absent() -> None:
- hit = _hit()
- for chunk in hit["chunks"]:
- chunk["start_char"] = None
- chunk["end_char"] = None
- out = _format_hits([hit], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
- assert "Matched passage here." in out
- # No concrete, copyable token for this document without spans (the closing
- # instruction's placeholder template doesn't count).
- assert "[citation:d7#L" not in out
-
-
-def test_falls_back_to_content_when_no_matched_ids() -> None:
- hit = _hit()
- hit["matched_chunk_ids"] = []
- out = _format_hits([hit], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q")
- assert "Intro paragraph." in out
-
-
-def test_no_results_message() -> None:
- out = _format_hits([], paths={}, bodies={}, query="missing")
- assert "No knowledge-base matches" in out
diff --git a/surfsense_backend/tests/unit/indexing_pipeline/test_chunk_markdown_with_spans.py b/surfsense_backend/tests/unit/indexing_pipeline/test_chunk_markdown_with_spans.py
deleted file mode 100644
index 0ff155c3b..000000000
--- a/surfsense_backend/tests/unit/indexing_pipeline/test_chunk_markdown_with_spans.py
+++ /dev/null
@@ -1,72 +0,0 @@
-"""Span-aware chunking contract: slices form a lossless, contiguous partition
-of the markdown, and every slice's char span addresses its own text."""
-
-import pytest
-
-from app.indexing_pipeline.document_chunker import chunk_markdown_with_spans
-
-pytestmark = pytest.mark.unit
-
-
-def _assert_lossless_partition(md: str, slices) -> None:
- assert "".join(s.text for s in slices) == md
-
- cursor = 0
- for s in slices:
- assert s.start_char == cursor, "slices must be contiguous"
- assert s.end_char >= s.start_char
- assert md[s.start_char : s.end_char] == s.text, "span must address slice text"
- cursor = s.end_char
- assert cursor == len(md)
-
-
-def test_prose_partition_and_spans():
- md = (
- "# Title\n\n"
- + "First paragraph with several words here. " * 20
- + "\n\nSecond section with more prose to force multiple chunks. " * 20
- )
-
- slices = chunk_markdown_with_spans(md)
-
- assert len(slices) > 1
- _assert_lossless_partition(md, slices)
-
-
-def test_table_kept_whole_with_exact_span():
- table = "| a | b |\n| - | - |\n| 1 | 2 |\n"
- md = f"Intro prose before the table.\n{table}\nClosing prose after."
-
- slices = chunk_markdown_with_spans(md)
-
- _assert_lossless_partition(md, slices)
- table_slices = [s for s in slices if s.text.lstrip().startswith("|")]
- assert any("| 1 | 2 |" in s.text for s in table_slices)
- for s in table_slices:
- assert "| a | b |" in s.text and "| 1 | 2 |" in s.text
-
-
-def test_table_at_eof_without_trailing_newline_stays_whole():
- md = "Intro.\n| a | b |\n| - | - |\n| 1 | 2 |"
-
- slices = chunk_markdown_with_spans(md)
-
- _assert_lossless_partition(md, slices)
- table_slices = [s for s in slices if "| 1 | 2 |" in s.text]
- assert len(table_slices) == 1
- assert "| a | b |" in table_slices[0].text
-
-
-def test_code_chunker_partition_and_spans():
- code = "\n\n".join(
- f"def func_{i}(x):\n total = x + {i}\n return total" for i in range(40)
- )
-
- slices = chunk_markdown_with_spans(code, use_code_chunker=True)
-
- assert len(slices) >= 1
- _assert_lossless_partition(code, slices)
-
-
-def test_empty_markdown_yields_no_slices():
- assert chunk_markdown_with_spans("") == []
diff --git a/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py b/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py
index 8c4936648..feb7bbc52 100644
--- a/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py
+++ b/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py
@@ -37,9 +37,12 @@ def _make_orm_doc(connector_doc, doc_id):
async def test_index_calls_embed_and_chunk_via_to_thread(
pipeline, make_connector_document, monkeypatch
):
- """index() runs the chunker and embed_texts via asyncio.to_thread, not blocking the loop."""
- from app.indexing_pipeline.document_chunker import ChunkSlice
+ """index() runs the chunker and embed_texts via asyncio.to_thread, not blocking the loop.
+ Routing between ``chunk_text`` (code path) and ``chunk_text_hybrid`` (default
+ path, see issue #1334) is verified separately in
+ ``test_non_code_documents_use_hybrid_chunker``.
+ """
to_thread_calls = []
original_to_thread = asyncio.to_thread
@@ -48,11 +51,11 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
return await original_to_thread(func, *args, **kwargs)
monkeypatch.setattr(asyncio, "to_thread", tracking_to_thread)
- mock_chunker = MagicMock(return_value=[ChunkSlice("chunk1", 0, 6)])
- mock_chunker.__name__ = "chunk_markdown_with_spans"
+ mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
+ mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
monkeypatch.setattr(
- "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
- mock_chunker,
+ "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
+ mock_chunk_hybrid,
)
mock_embed = MagicMock(
side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]
@@ -87,25 +90,34 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
await pipeline.index(document, connector_doc)
- assert "chunk_markdown_with_spans" in to_thread_calls
+ # Either chunker entry point satisfies the "chunking runs off the event
+ # loop" contract this test guards. Routing between the two is verified
+ # in test_non_code_documents_use_hybrid_chunker.
+ assert {"chunk_text", "chunk_text_hybrid"} & set(to_thread_calls)
assert "embed_texts" in to_thread_calls
assert document.status == DocumentStatus.ready()
-async def test_non_code_documents_use_prose_chunker(
+async def test_non_code_documents_use_hybrid_chunker(
pipeline, make_connector_document, monkeypatch
):
- """Non-code documents chunk with use_code_chunker=False (issue #1334).
+ """Non-code documents route through ``chunk_text_hybrid`` (issue #1334).
- The table-aware prose path keeps Markdown tables intact; only documents
- flagged with ``should_use_code_chunker=True`` request the code chunker.
+ The hybrid chunker preserves Markdown table integrity by avoiding splits
+ mid-row. Only documents flagged with ``should_use_code_chunker=True``
+ should take the ``chunk_text`` path.
"""
- from app.indexing_pipeline.document_chunker import ChunkSlice
-
- mock_chunker = MagicMock(return_value=[ChunkSlice("chunk1", 0, 6)])
+ mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
+ mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
monkeypatch.setattr(
- "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
- mock_chunker,
+ "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
+ mock_chunk_hybrid,
+ )
+ mock_chunk_code = MagicMock(return_value=["chunk1"])
+ mock_chunk_code.__name__ = "chunk_text"
+ monkeypatch.setattr(
+ "app.indexing_pipeline.cache.cached_indexing.chunk_text",
+ mock_chunk_code,
)
monkeypatch.setattr(
"app.indexing_pipeline.cache.cached_indexing.embed_texts",
@@ -137,49 +149,8 @@ async def test_non_code_documents_use_prose_chunker(
await pipeline.index(document, connector_doc)
- mock_chunker.assert_called_once()
- assert mock_chunker.call_args.args[1] is False
-
-
-async def test_code_documents_request_code_chunker(
- pipeline, make_connector_document, monkeypatch
-):
- """Code-flagged documents forward use_code_chunker=True to the chunker."""
- from app.indexing_pipeline.document_chunker import ChunkSlice
-
- mock_chunker = MagicMock(return_value=[ChunkSlice("chunk1", 0, 6)])
- monkeypatch.setattr(
- "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
- mock_chunker,
- )
- monkeypatch.setattr(
- "app.indexing_pipeline.cache.cached_indexing.embed_texts",
- MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]),
- )
- monkeypatch.setattr(pipeline, "_load_existing_chunks", AsyncMock(return_value=[]))
-
- async def _noop_persist(_session, doc, *_args, **_kwargs):
- doc.status = DocumentStatus.ready()
-
- monkeypatch.setattr(
- "app.indexing_pipeline.indexing_pipeline_service.persist_scratch_index",
- _noop_persist,
- )
-
- connector_doc = make_connector_document(
- document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
- unique_id="repo-1",
- search_space_id=1,
- should_use_code_chunker=True,
- )
- document = MagicMock(spec=Document)
- document.id = 1
- document.status = DocumentStatus.pending()
-
- await pipeline.index(document, connector_doc)
-
- mock_chunker.assert_called_once()
- assert mock_chunker.call_args.args[1] is True
+ mock_chunk_hybrid.assert_called_once()
+ mock_chunk_code.assert_not_called()
def _mock_session_factory(orm_docs_by_id):
diff --git a/surfsense_backend/tests/unit/middleware/test_b_filesystem_rm_rmdir_cloud.py b/surfsense_backend/tests/unit/middleware/test_b_filesystem_rm_rmdir_cloud.py
index 27653c544..898ec3765 100644
--- a/surfsense_backend/tests/unit/middleware/test_b_filesystem_rm_rmdir_cloud.py
+++ b/surfsense_backend/tests/unit/middleware/test_b_filesystem_rm_rmdir_cloud.py
@@ -71,7 +71,7 @@ class _KBBackendStub(KBPostgresBackend):
def __init__(self, *, children=None, file_data=None) -> None:
self.als_info = AsyncMock(return_value=children or [])
self._load_file_data = AsyncMock(
- return_value=(file_data, 17, None) if file_data is not None else None
+ return_value=(file_data, 17) if file_data is not None else None
)
diff --git a/surfsense_backend/tests/unit/middleware/test_kb_persistence_filesystem_parity.py b/surfsense_backend/tests/unit/middleware/test_kb_persistence_filesystem_parity.py
index 3968eb090..e78db1e76 100644
--- a/surfsense_backend/tests/unit/middleware/test_kb_persistence_filesystem_parity.py
+++ b/surfsense_backend/tests/unit/middleware/test_kb_persistence_filesystem_parity.py
@@ -69,25 +69,13 @@ class _FakeSession:
@pytest.fixture(autouse=True)
def _stub_embeddings_and_chunks(monkeypatch: pytest.MonkeyPatch) -> None:
- """Avoid loading the embedding model in unit tests.
-
- Mirrors the legacy stub: one chunk spanning the whole content, with a
- zero summary/chunk vector, routed through the shared span builder.
- """
- from app.indexing_pipeline.document_chunker import ChunkSlice
-
- async def _fake_build_chunk_embeddings(content: str, *, use_code_chunker: bool):
- summary = np.zeros(8, dtype=np.float32)
- pairs = (
- [(ChunkSlice(content, 0, len(content)), np.zeros(8, dtype=np.float32))]
- if content
- else []
- )
- return summary, pairs
-
+ """Avoid loading the embedding model in unit tests."""
monkeypatch.setattr(
- kb_persistence, "build_chunk_embeddings", _fake_build_chunk_embeddings
+ kb_persistence,
+ "embed_texts",
+ lambda texts: [np.zeros(8, dtype=np.float32) for _ in texts],
)
+ monkeypatch.setattr(kb_persistence, "chunk_text", lambda content: [content])
@pytest.mark.asyncio
diff --git a/surfsense_backend/tests/unit/middleware/test_numbered_document.py b/surfsense_backend/tests/unit/middleware/test_numbered_document.py
deleted file mode 100644
index 955c619b5..000000000
--- a/surfsense_backend/tests/unit/middleware/test_numbered_document.py
+++ /dev/null
@@ -1,92 +0,0 @@
-"""Unit tests for the numbered-document read preamble."""
-
-import pytest
-
-from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.numbered_document import (
- build_read_preamble,
- compute_matched_line_ranges,
-)
-
-pytestmark = pytest.mark.unit
-
-
-_BODY = "alpha\nbravo\ncharlie\ndelta"
-
-
-class TestComputeMatchedLineRanges:
- def test_maps_matched_chunk_spans_to_line_ranges(self):
- chunks = [(1, 0, 12), (2, 12, len(_BODY))]
- ranges = compute_matched_line_ranges(_BODY, chunks, {2})
- assert ranges == [(3, 4)]
-
- def test_includes_only_matched_chunks(self):
- chunks = [(1, 0, 5), (2, 6, 11)]
- ranges = compute_matched_line_ranges(_BODY, chunks, {1})
- assert ranges == [(1, 1)]
-
- def test_skips_chunks_without_spans(self):
- chunks = [(1, None, None)]
- ranges = compute_matched_line_ranges(_BODY, chunks, {1})
- assert ranges == []
-
- def test_sorted_and_deduplicated(self):
- chunks = [(1, 12, len(_BODY)), (2, 0, 5), (3, 0, 5)]
- ranges = compute_matched_line_ranges(_BODY, chunks, {1, 2, 3})
- assert ranges == [(1, 1), (3, 4)]
-
-
-class TestBuildReadPreamble:
- def test_contains_document_metadata(self):
- preamble = build_read_preamble(
- document_id=42,
- document_type="FILE",
- title="Test Doc",
- url="https://example.com",
- matched_line_ranges=[],
- )
- assert "42" in preamble
- assert "FILE" in preamble
- assert "Test Doc" in preamble
- assert "https://example.com" in preamble
-
- def test_citation_hint_uses_document_id(self):
- preamble = build_read_preamble(
- document_id=42,
- document_type="FILE",
- title="Test Doc",
- url="",
- matched_line_ranges=[],
- )
- assert "[citation:d42#L" in preamble
-
- def test_lists_matched_line_ranges(self):
- preamble = build_read_preamble(
- document_id=7,
- document_type="NOTE",
- title="Notes",
- url="",
- matched_line_ranges=[(12, 18), (40, 40)],
- )
- assert "" in preamble
- assert "12-18" in preamble
- assert "40" in preamble
-
- def test_omits_matched_lines_block_when_empty(self):
- preamble = build_read_preamble(
- document_id=7,
- document_type="NOTE",
- title="Notes",
- url="",
- matched_line_ranges=[],
- )
- assert "" not in preamble
-
- def test_ends_with_trailing_newline_so_body_follows_cleanly(self):
- preamble = build_read_preamble(
- document_id=1,
- document_type="FILE",
- title="t",
- url="",
- matched_line_ranges=[],
- )
- assert preamble.endswith("\n")
diff --git a/surfsense_backend/tests/unit/utils/test_text_spans.py b/surfsense_backend/tests/unit/utils/test_text_spans.py
deleted file mode 100644
index d70418ea5..000000000
--- a/surfsense_backend/tests/unit/utils/test_text_spans.py
+++ /dev/null
@@ -1,39 +0,0 @@
-"""Unit tests for char-span -> line-range conversion."""
-
-from __future__ import annotations
-
-import pytest
-
-from app.utils.text_spans import char_span_to_line_range
-
-pytestmark = pytest.mark.unit
-
-_TEXT = "line1\nline2\nline3"
-
-
-def test_single_line_span() -> None:
- start = _TEXT.index("line2")
- assert char_span_to_line_range(_TEXT, start, start + len("line2")) == (2, 2)
-
-
-def test_first_line_span() -> None:
- assert char_span_to_line_range(_TEXT, 0, len("line1")) == (1, 1)
-
-
-def test_last_line_span() -> None:
- start = _TEXT.index("line3")
- assert char_span_to_line_range(_TEXT, start, len(_TEXT)) == (3, 3)
-
-
-def test_multi_line_span() -> None:
- # "line1\nline2" spans lines 1-2.
- assert char_span_to_line_range(_TEXT, 0, _TEXT.index("line2") + 5) == (1, 2)
-
-
-def test_empty_span_resolves_to_its_line() -> None:
- start = _TEXT.index("line2")
- assert char_span_to_line_range(_TEXT, start, start) == (2, 2)
-
-
-def test_offsets_clamped_to_text_bounds() -> None:
- assert char_span_to_line_range(_TEXT, -5, 10_000) == (1, 3)
diff --git a/surfsense_web/app/globals.css b/surfsense_web/app/globals.css
index 6950fd284..3cdb34bff 100644
--- a/surfsense_web/app/globals.css
+++ b/surfsense_web/app/globals.css
@@ -270,12 +270,6 @@ button {
contain-intrinsic-size: 0 40px;
}
-/* Monaco whole-line highlight for a cited source span (Phase E). */
-.citation-line-highlight {
- background-color: color-mix(in srgb, var(--primary) 16%, transparent);
- box-shadow: inset 2px 0 0 0 var(--primary);
-}
-
@source "../node_modules/@llamaindex/chat-ui/**/*.{ts,tsx}";
@source "../node_modules/streamdown/dist/*.js";
@source "../node_modules/@streamdown/code/dist/*.js";
diff --git a/surfsense_web/atoms/editor/editor-panel.atom.ts b/surfsense_web/atoms/editor/editor-panel.atom.ts
index ee609f519..c302c66ee 100644
--- a/surfsense_web/atoms/editor/editor-panel.atom.ts
+++ b/surfsense_web/atoms/editor/editor-panel.atom.ts
@@ -1,11 +1,6 @@
import { atom } from "jotai";
import { rightPanelCollapsedAtom, rightPanelTabAtom } from "@/atoms/layout/right-panel.atom";
-export interface EditorLineRange {
- start: number;
- end: number;
-}
-
interface EditorPanelState {
isOpen: boolean;
kind: "document" | "local_file" | "memory";
@@ -14,10 +9,6 @@ interface EditorPanelState {
searchSpaceId: number | null;
memoryScope: "user" | "team" | null;
title: string | null;
- // Citation line anchor: when set, the editor opens the raw source view
- // scrolled to and highlighting this 1-based inclusive line range.
- highlightLines: EditorLineRange | null;
- forceSourceView: boolean;
}
const initialState: EditorPanelState = {
@@ -28,8 +19,6 @@ const initialState: EditorPanelState = {
searchSpaceId: null,
memoryScope: null,
title: null,
- highlightLines: null,
- forceSourceView: false,
};
export const editorPanelAtom = atom(initialState);
@@ -44,14 +33,7 @@ export const openEditorPanelAtom = atom(
get,
set,
payload:
- | {
- documentId: number;
- searchSpaceId: number;
- title?: string;
- kind?: "document";
- highlightLines?: EditorLineRange | null;
- forceSourceView?: boolean;
- }
+ | { documentId: number; searchSpaceId: number; title?: string; kind?: "document" }
| {
kind: "local_file";
localFilePath: string;
@@ -77,8 +59,6 @@ export const openEditorPanelAtom = atom(
searchSpaceId: payload.searchSpaceId ?? null,
memoryScope: null,
title: payload.title ?? null,
- highlightLines: null,
- forceSourceView: false,
});
set(rightPanelTabAtom, "editor");
set(rightPanelCollapsedAtom, false);
@@ -93,8 +73,6 @@ export const openEditorPanelAtom = atom(
searchSpaceId: payload.searchSpaceId ?? null,
memoryScope: payload.memoryScope,
title: payload.title ?? null,
- highlightLines: null,
- forceSourceView: false,
});
set(rightPanelTabAtom, "editor");
set(rightPanelCollapsedAtom, false);
@@ -108,8 +86,6 @@ export const openEditorPanelAtom = atom(
searchSpaceId: payload.searchSpaceId,
memoryScope: null,
title: payload.title ?? null,
- highlightLines: payload.highlightLines ?? null,
- forceSourceView: payload.forceSourceView ?? false,
});
set(rightPanelTabAtom, "editor");
set(rightPanelCollapsedAtom, false);
diff --git a/surfsense_web/components/assistant-ui/inline-citation.tsx b/surfsense_web/components/assistant-ui/inline-citation.tsx
index 28f5212ae..59a10739c 100644
--- a/surfsense_web/components/assistant-ui/inline-citation.tsx
+++ b/surfsense_web/components/assistant-ui/inline-citation.tsx
@@ -2,11 +2,9 @@
import { useSetAtom } from "jotai";
import { FileText } from "lucide-react";
-import { useParams } from "next/navigation";
import type { FC } from "react";
import { useId, useState } from "react";
import { openCitationPanelAtom } from "@/atoms/citation/citation-panel.atom";
-import { openEditorPanelAtom } from "@/atoms/editor/editor-panel.atom";
import { useCitationMetadata } from "@/components/assistant-ui/citation-metadata-context";
import { CitationPanelContent } from "@/components/citation-panel/citation-panel";
import { Citation } from "@/components/tool-ui/citation";
@@ -110,50 +108,6 @@ const NumericChunkCitation: FC<{ chunkId: number }> = ({ chunkId }) => {
);
};
-interface LineCitationProps {
- documentId: number;
- startLine: number;
- endLine: number;
-}
-
-/**
- * Inline citation for a knowledge-base document line range
- * (`[citation:d#L-]`). Clicking opens the document in
- * the editor's read-only source view, scrolled to and highlighting the cited
- * lines — the same anchor the citation panel uses for chunk citations.
- */
-export const LineCitation: FC = ({ documentId, startLine, endLine }) => {
- const openEditorPanel = useSetAtom(openEditorPanelAtom);
- const params = useParams();
- const searchSpaceId = Number(params?.search_space_id);
-
- const label = startLine === endLine ? `L${startLine}` : `L${startLine}-${endLine}`;
-
- const handleClick = () => {
- if (!Number.isFinite(searchSpaceId)) return;
- openEditorPanel({
- documentId,
- searchSpaceId,
- highlightLines: { start: startLine, end: endLine },
- forceSourceView: true,
- });
- };
-
- return (
-
- );
-};
-
import { tryGetHostname } from "@/lib/url";
interface UrlCitationProps {
diff --git a/surfsense_web/components/citation-panel/citation-panel.tsx b/surfsense_web/components/citation-panel/citation-panel.tsx
index 9b9a9aaa9..890ac11ac 100644
--- a/surfsense_web/components/citation-panel/citation-panel.tsx
+++ b/surfsense_web/components/citation-panel/citation-panel.tsx
@@ -46,13 +46,6 @@ export const CitationPanelContent: FC = ({
const cited = useMemo(() => data?.chunks.find((c) => c.id === chunkId) ?? null, [data, chunkId]);
- const citedLineLabel = useMemo(() => {
- const start = data?.cited_start_line;
- const end = data?.cited_end_line;
- if (start == null || end == null) return null;
- return start === end ? `Line ${start}` : `Lines ${start}–${end}`;
- }, [data?.cited_start_line, data?.cited_end_line]);
-
const totalChunks = data?.total_chunks ?? data?.chunks.length ?? 0;
const startIndex = data?.chunk_start_index ?? 0;
const hasMoreAbove = startIndex > 0;
@@ -82,15 +75,10 @@ export const CitationPanelContent: FC = ({
const handleOpenFullDocument = () => {
if (!data) return;
- const hasLineAnchor = data.cited_start_line != null && data.cited_end_line != null;
openEditorPanel({
documentId: data.id,
searchSpaceId: data.search_space_id,
title: data.title,
- highlightLines: hasLineAnchor
- ? { start: data.cited_start_line as number, end: data.cited_end_line as number }
- : null,
- forceSourceView: hasLineAnchor,
});
};
@@ -122,7 +110,6 @@ export const CitationPanelContent: FC = ({
- {citedLineLabel && {citedLineLabel}}
{totalChunks > 0 && {totalChunks} chunks}
{!isLoading && !error && data && (
diff --git a/surfsense_web/components/citations/citation-renderer.tsx b/surfsense_web/components/citations/citation-renderer.tsx
index b0ab13f84..f2de4b27d 100644
--- a/surfsense_web/components/citations/citation-renderer.tsx
+++ b/surfsense_web/components/citations/citation-renderer.tsx
@@ -1,7 +1,7 @@
"use client";
import type { ReactNode } from "react";
-import { InlineCitation, LineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation";
+import { InlineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation";
import {
type CitationToken,
type CitationUrlMap,
@@ -21,16 +21,6 @@ export function renderCitationToken(token: CitationToken, ordinalKey: number): R
if (token.kind === "url") {
return ;
}
- if (token.kind === "line") {
- return (
-
- );
- }
return (
void;
- highlightLines?: { start: number; end: number } | null;
- forceSourceView?: boolean;
}) {
const electronAPI = useElectronAPI();
const [editorDoc, setEditorDoc] = useState(null);
@@ -209,7 +205,7 @@ export function EditorPanelContent({
const isLargeDocument = docSizeBytes > plateMaxBytes || docLineCount > plateMaxLines;
const viewerMode: ViewerMode = isMemoryMode
? "plate"
- : editorDoc?.viewer_mode === "monaco" || isLargeDocument || forceSourceView
+ : editorDoc?.viewer_mode === "monaco" || isLargeDocument
? "monaco"
: "plate";
@@ -832,7 +828,6 @@ export function EditorPanelContent({
value={editorDoc.source_markdown}
readOnly
onChange={() => {}}
- highlightLines={highlightLines}
/>
@@ -923,8 +918,6 @@ function DesktopEditorPanel() {
searchSpaceId={panelState.searchSpaceId ?? undefined}
title={panelState.title}
onClose={closePanel}
- highlightLines={panelState.highlightLines}
- forceSourceView={panelState.forceSourceView}
/>
);
@@ -964,8 +957,6 @@ function MobileEditorDrawer() {
memoryScope={panelState.memoryScope ?? undefined}
searchSpaceId={panelState.searchSpaceId ?? undefined}
title={panelState.title}
- highlightLines={panelState.highlightLines}
- forceSourceView={panelState.forceSourceView}
/>
diff --git a/surfsense_web/components/editor/plugins/citation-kit.tsx b/surfsense_web/components/editor/plugins/citation-kit.tsx
index edba9a19e..97e8ec723 100644
--- a/surfsense_web/components/editor/plugins/citation-kit.tsx
+++ b/surfsense_web/components/editor/plugins/citation-kit.tsx
@@ -3,10 +3,9 @@
import { type Descendant, KEYS } from "platejs";
import { createPlatePlugin, type PlateElementProps } from "platejs/react";
import type { FC } from "react";
-import { InlineCitation, LineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation";
+import { InlineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation";
import {
CITATION_REGEX,
- type CitationToken,
type CitationUrlMap,
parseTextWithCitations,
} from "@/lib/citations/citation-parser";
@@ -18,12 +17,9 @@ import {
*/
export type CitationElementNode = {
type: "citation";
- kind: "chunk" | "doc" | "url" | "line";
+ kind: "chunk" | "doc" | "url";
chunkId?: number;
url?: string;
- documentId?: number;
- startLine?: number;
- endLine?: number;
/** Original literal token that produced this citation node. */
rawText: string;
children: [{ text: "" }];
@@ -37,22 +33,11 @@ const CitationElement: FC> = ({
element,
}) => {
const isUrl = element.kind === "url";
- const isLine =
- element.kind === "line" &&
- element.documentId !== undefined &&
- element.startLine !== undefined &&
- element.endLine !== undefined;
return (
{isUrl && element.url ? (
- ) : isLine ? (
-
) : element.chunkId !== undefined ? (
) : null}
@@ -112,7 +97,10 @@ function copyMarks(textNode: SlateText): Record {
return marks;
}
-function makeCitationElement(rawText: string, segment: CitationToken): CitationElementNode {
+function makeCitationElement(
+ rawText: string,
+ segment: { kind: "url"; url: string } | { kind: "chunk"; chunkId: number; isDocsChunk: boolean }
+): CitationElementNode {
if (segment.kind === "url") {
return {
type: CITATION_TYPE,
@@ -122,17 +110,6 @@ function makeCitationElement(rawText: string, segment: CitationToken): CitationE
children: [{ text: "" }],
};
}
- if (segment.kind === "line") {
- return {
- type: CITATION_TYPE,
- kind: "line",
- documentId: segment.documentId,
- startLine: segment.startLine,
- endLine: segment.endLine,
- rawText,
- children: [{ text: "" }],
- };
- }
return {
type: CITATION_TYPE,
kind: segment.isDocsChunk ? "doc" : "chunk",
diff --git a/surfsense_web/components/editor/source-code-editor.tsx b/surfsense_web/components/editor/source-code-editor.tsx
index 4af4f2125..9102dffe9 100644
--- a/surfsense_web/components/editor/source-code-editor.tsx
+++ b/surfsense_web/components/editor/source-code-editor.tsx
@@ -2,7 +2,7 @@
import dynamic from "next/dynamic";
import { useTheme } from "next-themes";
-import { useCallback, useEffect, useRef } from "react";
+import { useEffect, useRef } from "react";
import { Spinner } from "@/components/ui/spinner";
const MonacoEditor = dynamic(() => import("@monaco-editor/react"), {
@@ -17,8 +17,6 @@ interface SourceCodeEditorProps {
readOnly?: boolean;
fontSize?: number;
onSave?: () => Promise | void;
- /** 1-based inclusive line range to reveal and highlight (e.g. a citation). */
- highlightLines?: { start: number; end: number } | null;
}
export function SourceCodeEditor({
@@ -29,45 +27,10 @@ export function SourceCodeEditor({
readOnly = false,
fontSize = 12,
onSave,
- highlightLines = null,
}: SourceCodeEditorProps) {
const { resolvedTheme } = useTheme();
const onSaveRef = useRef(onSave);
const monacoRef = useRef(null);
- const editorRef = useRef(null);
- const decorationsRef = useRef(null);
- const highlightLinesRef = useRef(highlightLines);
- highlightLinesRef.current = highlightLines;
-
- const applyHighlight = useCallback(() => {
- const editor = editorRef.current;
- const monaco = monacoRef.current;
- if (!editor || !monaco) return;
- if (decorationsRef.current) {
- decorationsRef.current.clear();
- decorationsRef.current = null;
- }
- const range = highlightLinesRef.current;
- if (!range) return;
- const lineCount = editor.getModel()?.getLineCount() ?? range.end;
- const start = Math.min(Math.max(1, Math.floor(range.start)), lineCount);
- const end = Math.min(Math.max(start, Math.floor(range.end)), lineCount);
- try {
- decorationsRef.current = editor.createDecorationsCollection([
- {
- range: new monaco.Range(start, 1, end, 1),
- options: { isWholeLine: true, className: "citation-line-highlight" },
- },
- ]);
- } catch {
- // Decoration failure must not block the reveal below.
- }
- editor.revealLinesInCenter(start, end, monaco.editor.ScrollType.Immediate);
- }, []);
-
- useEffect(() => {
- applyHighlight();
- }, [applyHighlight, highlightLines?.start, highlightLines?.end]);
const normalizedModelPath = (() => {
const raw = (path || "local-file.txt").trim();
const withLeadingSlash = raw.startsWith("/") ? raw : `/${raw}`;
@@ -141,16 +104,7 @@ export function SourceCodeEditor({
}}
onMount={(editor, monaco) => {
monacoRef.current = monaco;
- editorRef.current = editor;
applySidebarTheme(monaco);
- // Reveal now, then once more after the first layout settles:
- // the panel slide-in animation means the editor often has no
- // usable viewport height on the initial frame.
- applyHighlight();
- const layoutSub = editor.onDidLayoutChange(() => {
- applyHighlight();
- layoutSub.dispose();
- });
if (!isManualSaveEnabled) return;
editor.addCommand(monaco.KeyMod.CtrlCmd | monaco.KeyCode.KeyS, () => {
void onSaveRef.current?.();
diff --git a/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx b/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx
index 6662d7830..5a7588979 100644
--- a/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx
+++ b/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx
@@ -12,7 +12,6 @@ import { rightPanelCollapsedAtom, rightPanelTabAtom } from "@/atoms/layout/right
import { Button } from "@/components/ui/button";
import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
import { closeHitlEditPanelAtom, hitlEditPanelAtom } from "@/features/chat-messages/hitl";
-import { useMediaQuery } from "@/hooks/use-media-query";
import { cn } from "@/lib/utils";
import { DocumentsSidebar } from "../sidebar";
@@ -197,9 +196,6 @@ export function RightPanel({
const citationState = useAtomValue(citationPanelAtom);
const closeCitation = useSetAtom(closeCitationPanelAtom);
const [collapsed, setCollapsed] = useAtom(rightPanelCollapsedAtom);
- // Desktop-only surface; mobile uses the dedicated Mobile* drawers. Without
- // this guard both render together and two editors fight over one model.
- const isDesktop = useMediaQuery("(min-width: 1024px)");
const documentsOpen = documentsPanel?.open ?? false;
const reportOpen = reportState.isOpen && !!reportState.reportId;
@@ -271,7 +267,7 @@ export function RightPanel({
setCollapsed(true)} />
) : null;
- if (!isVisible || !isDesktop) return null;
+ if (!isVisible) return null;
return (
)}
diff --git a/surfsense_web/contracts/types/document.types.ts b/surfsense_web/contracts/types/document.types.ts
index a7fa19e18..da1dac537 100644
--- a/surfsense_web/contracts/types/document.types.ts
+++ b/surfsense_web/contracts/types/document.types.ts
@@ -70,15 +70,10 @@ export const documentWithChunks = document.extend({
id: z.number(),
content: z.string(),
created_at: z.string(),
- start_char: z.number().nullable().optional(),
- end_char: z.number().nullable().optional(),
})
),
total_chunks: z.number().optional().default(0),
chunk_start_index: z.number().optional().default(0),
- // 1-based inclusive line range of the cited chunk within source_markdown.
- cited_start_line: z.number().nullable().optional(),
- cited_end_line: z.number().nullable().optional(),
});
/**
diff --git a/surfsense_web/lib/citations/citation-parser.ts b/surfsense_web/lib/citations/citation-parser.ts
index 0d320956f..533c644c2 100644
--- a/surfsense_web/lib/citations/citation-parser.ts
+++ b/surfsense_web/lib/citations/citation-parser.ts
@@ -18,16 +18,12 @@ import { FENCED_OR_INLINE_CODE } from "@/lib/markdown/code-regions";
* sometimes emit.
*/
export const CITATION_REGEX =
- /[[【]\u200B?citation:\s*(https?:\/\/[^\]】\u200B]+|urlcite\d+|d\d+#L\d+-\d+|(?:doc-)?-?\d+(?:\s*,\s*(?:doc-)?-?\d+)*)\s*\u200B?[\]】]/g;
-
-/** Matches the knowledge-base line-citation form `d#L-`. */
-const LINE_CITATION_REGEX = /^d(\d+)#L(\d+)-(\d+)$/;
+ /[[【]\u200B?citation:\s*(https?:\/\/[^\]】\u200B]+|urlcite\d+|(?:doc-)?-?\d+(?:\s*,\s*(?:doc-)?-?\d+)*)\s*\u200B?[\]】]/g;
/** A single parsed citation reference. */
export type CitationToken =
| { kind: "url"; url: string }
- | { kind: "chunk"; chunkId: number; isDocsChunk: boolean }
- | { kind: "line"; documentId: number; startLine: number; endLine: number };
+ | { kind: "chunk"; chunkId: number; isDocsChunk: boolean };
/** Output of `parseTextWithCitations` — interleaved text + citation tokens. */
export type ParsedSegment = string | CitationToken;
@@ -99,15 +95,7 @@ export function parseTextWithCitations(text: string, urlMap: CitationUrlMap): Pa
const captured = match[1];
- const lineMatch = LINE_CITATION_REGEX.exec(captured);
- if (lineMatch) {
- segments.push({
- kind: "line",
- documentId: Number.parseInt(lineMatch[1], 10),
- startLine: Number.parseInt(lineMatch[2], 10),
- endLine: Number.parseInt(lineMatch[3], 10),
- });
- } else if (captured.startsWith("http://") || captured.startsWith("https://")) {
+ if (captured.startsWith("http://") || captured.startsWith("https://")) {
segments.push({ kind: "url", url: captured.trim() });
} else if (captured.startsWith("urlcite")) {
const url = urlMap.get(captured);