diff --git a/surfsense_backend/alembic/versions/166_add_chunk_char_spans.py b/surfsense_backend/alembic/versions/166_add_chunk_char_spans.py deleted file mode 100644 index 336711612..000000000 --- a/surfsense_backend/alembic/versions/166_add_chunk_char_spans.py +++ /dev/null @@ -1,31 +0,0 @@ -"""add chunks.start_char/end_char for citation offsets - -Char offsets into the document's source_markdown (half-open span) let citations -resolve the exact passage a chunk came from. Nullable because historical rows -have no span; they populate on the next connector sync or user edit/reindex. - -No backfill: a bulk UPDATE of every chunk on a large HNSW-indexed table rewrites -every secondary index per row (see migration 165 for the same reasoning). - -Revision ID: 166 -Revises: 165 -""" - -from collections.abc import Sequence - -from alembic import op - -revision: str = "166" -down_revision: str | None = "165" -branch_labels: str | Sequence[str] | None = None -depends_on: str | Sequence[str] | None = None - - -def upgrade() -> None: - op.execute("ALTER TABLE chunks ADD COLUMN IF NOT EXISTS start_char INTEGER;") - op.execute("ALTER TABLE chunks ADD COLUMN IF NOT EXISTS end_char INTEGER;") - - -def downgrade() -> None: - op.execute("ALTER TABLE chunks DROP COLUMN IF EXISTS end_char;") - op.execute("ALTER TABLE chunks DROP COLUMN IF EXISTS start_char;") diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py index d66e9073c..a6c83a7d4 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py @@ -18,6 +18,7 @@ skipped (e.g. client disconnect). from __future__ import annotations +import asyncio import logging from datetime import UTC, datetime from typing import Any @@ -57,8 +58,9 @@ from app.db import ( FolderRevision, shielded_async_session, ) -from app.indexing_pipeline.cache.cached_indexing import build_chunk_embeddings +from app.indexing_pipeline.document_chunker import chunk_text from app.utils.document_converters import ( + embed_texts, generate_content_hash, generate_unique_identifier_hash, ) @@ -232,23 +234,24 @@ async def _create_document( session.add(doc) await session.flush() - summary_embedding, chunk_embeddings = await build_chunk_embeddings( - content, use_code_chunker=False - ) + summary_embedding = (await asyncio.to_thread(embed_texts, [content]))[0] doc.embedding = summary_embedding - session.add_all( - [ - Chunk( - document_id=doc.id, - content=sl.text, - embedding=embedding, - position=i, - start_char=sl.start_char, - end_char=sl.end_char, - ) - for i, (sl, embedding) in enumerate(chunk_embeddings) - ] - ) + chunks = chunk_text(content) + if chunks: + chunk_embeddings = await asyncio.to_thread(embed_texts, chunks) + session.add_all( + [ + Chunk( + document_id=doc.id, + content=text, + embedding=embedding, + position=i, + ) + for i, (text, embedding) in enumerate( + zip(chunks, chunk_embeddings, strict=True) + ) + ] + ) return doc @@ -284,25 +287,26 @@ async def _update_document( search_space_id, ) - summary_embedding, chunk_embeddings = await build_chunk_embeddings( - content, use_code_chunker=False - ) + summary_embedding = (await asyncio.to_thread(embed_texts, [content]))[0] document.embedding = summary_embedding await session.execute(delete(Chunk).where(Chunk.document_id == document.id)) - session.add_all( - [ - Chunk( - document_id=document.id, - content=sl.text, - embedding=embedding, - position=i, - start_char=sl.start_char, - end_char=sl.end_char, - ) - for i, (sl, embedding) in enumerate(chunk_embeddings) - ] - ) + chunks = chunk_text(content) + if chunks: + chunk_embeddings = await asyncio.to_thread(embed_texts, chunks) + session.add_all( + [ + Chunk( + document_id=document.id, + content=text, + embedding=embedding, + position=i, + ) + for i, (text, embedding) in enumerate( + zip(chunks, chunk_embeddings, strict=True) + ) + ] + ) return document diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/on.md b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/on.md index 8e67615d0..2abd95d5a 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/on.md +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/on.md @@ -1,58 +1,42 @@ -Citations reach the answer through three channels. Use whichever applies, and -never invent ids you didn't see: ids are matched exactly, so a wrong one -silently breaks the link — when in doubt, omit. Always write a citation as -plain `[citation:…]` brackets — no markdown links, no footnote numbers, no -parentheses. +Citations reach the answer through two channels. Use whichever applies — and +never invent ids you didn't see. Citation ids are resolved by exact-match +lookup; a wrong id silently breaks the link, so when in doubt, omit. -### Channel A — web_search chunk blocks injected this turn +### Channel A — chunk blocks injected this turn When `web_search` returns `` / `` blocks in this -turn, the chunk `id` is the result's URL: +turn: -1. For each factual statement taken from a chunk, add `[citation:]` - using the **exact** id from a visible `` tag. Copy the - URL verbatim; do not retype it from memory. -2. Multiple chunks → `[citation:url1], [citation:url2]` (comma-separated, +1. For each factual statement taken from those chunks, add + `[citation:chunk_id]` using the **exact** id from a visible + `` tag. Copy digit-for-digit (or the URL verbatim); + do not retype from memory. +2. `` is the parent doc id, **not** a citation source — + only ids inside `` count. +3. Multiple chunks → `[citation:id1], [citation:id2]` (comma-separated, each id copied individually). -3. Never invent, normalise, or guess at a URL; if unsure, omit. +4. Never invent, normalise, or guess at adjacent ids; if unsure, omit. +5. Plain brackets only — no markdown links, no footnote numbering. ### Channel B — citations relayed by a `task` specialist -A `task(...)` tool message may contain `[citation:…]` markers the -specialist already attached to its prose — line citations -(`[citation:d#L-]`) or chunk ids (`[citation:N]`). The -specialist read the underlying document and tied each marker to a -passage; you didn't. So: +A `task(...)` tool message may contain `[citation:]` markers +the specialist already attached to its prose. The specialist saw the +underlying `` blocks; you didn't. So: 1. **Preserve those markers verbatim** in your final answer — do not reformat, renumber, drop, or wrap them in markdown links. When you paraphrase a specialist sentence, copy the marker character-for- - character; do not regenerate it from memory (LLMs reliably corrupt - nearby digits). + character; do not regenerate the id from memory (LLMs reliably + corrupt nearby digits). 2. Keep each marker attached to the sentence the specialist attached it to. 3. Do **not** add new `[citation:…]` markers of your own to a specialist's prose; if a fact has no marker, the specialist - couldn't tie it to a source and neither can you. + couldn't tie it to a chunk and neither can you. 4. When a specialist returns JSON, the citation markers live inside the prose-bearing fields (e.g. a summary or excerpt). Pull them along with the surrounding sentence when you quote. -### Channel C — your knowledge base (search hits and `read_file`) -Knowledge-base facts are cited by line range using the document id: -`[citation:d#L-]` (a single line is `#L-`). - -1. `search_knowledge_base` prints a ready `[citation:d…#L…-…]` token above each - matched passage. When that passage supports your point, copy the token - verbatim — that is the entire citation. -2. When you `read_file` a `/documents/...` path, its header gives the - `` and an optional `` pointer, and the body is - shown with line numbers; cite the lines you actually used. Use `read_file` - when you need more context than a search passage shows. -3. Copy document ids and line numbers exactly as shown — never estimate, - shift, or invent them. -4. Older documents without a numbered body instead show `` - blocks; cite those with `[citation:N]`, copying the id exactly. - -If none of these channels surfaces a citable source this turn, do not -fabricate citations. +If neither channel surfaces citation markers this turn, do not fabricate +them. diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py index 0696dc92e..9236e9121 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/tools/search_knowledge_base.py @@ -33,7 +33,6 @@ from app.agents.chat.runtime.path_resolver import ( ) from app.db import Document, shielded_async_session from app.utils.perf import get_perf_logger -from app.utils.text_spans import char_span_to_line_range _perf_log = get_perf_logger() @@ -57,16 +56,12 @@ _TOOL_DESCRIPTION = ( ) -async def _resolve_doc_context( +async def _resolve_virtual_paths( results: list[dict[str, Any]], *, search_space_id: int, -) -> tuple[dict[int, str], dict[int, str]]: - """Resolve ``Document.id`` -> (canonical virtual path, source_markdown). - - ``source_markdown`` is the canonical body the chunk spans index into; the - renderer uses it to turn a chunk's char span into a line range. - """ +) -> dict[int, str]: + """Resolve ``Document.id`` -> canonical virtual path for the search hits.""" doc_ids = [ doc_id for doc_id in ( @@ -77,24 +72,17 @@ async def _resolve_doc_context( if isinstance(doc_id, int) ] if not doc_ids: - return {}, {} + return {} async with shielded_async_session() as session: index: PathIndex = await build_path_index(session, search_space_id) - rows = await session.execute( - select( - Document.id, Document.folder_id, Document.source_markdown - ).where( + folder_rows = await session.execute( + select(Document.id, Document.folder_id).where( Document.search_space_id == search_space_id, Document.id.in_(doc_ids), ) ) - folder_by_doc_id: dict[int, int | None] = {} - bodies: dict[int, str] = {} - for row in rows.all(): - folder_by_doc_id[row.id] = row.folder_id - if row.source_markdown: - bodies[row.id] = row.source_markdown + folder_by_doc_id = {row.id: row.folder_id for row in folder_rows.all()} paths: dict[int, str] = {} for doc in results: @@ -109,76 +97,13 @@ async def _resolve_doc_context( folder_id=folder_id if isinstance(folder_id, int) else None, index=index, ) - return paths, bodies - - -def _citation_token(chunk: dict[str, Any], body: str | None, doc_id: int | None) -> str: - """Ready-to-copy ``[citation:dID#Lstart-end]`` token, or '' without spans.""" - start = chunk.get("start_char") - end = chunk.get("end_char") - if ( - not body - or not isinstance(doc_id, int) - or not isinstance(start, int) - or not isinstance(end, int) - ): - return "" - start_line, end_line = char_span_to_line_range(body, start, end) - return f"[citation:d{doc_id}#L{start_line}-{end_line}]" - - -def _render_passage( - chunk: dict[str, Any], body: str | None, doc_id: int | None -) -> str | None: - """Render one matched chunk as an indented passage tagged with its token.""" - content = (chunk.get("content") or "").strip() - if not content: - return None - snippet = content[:_PER_DOC_SNIPPET_CHARS].strip() - if len(content) > _PER_DOC_SNIPPET_CHARS: - snippet += " ..." - indented = snippet.replace("\n", "\n ") - token = _citation_token(chunk, body, doc_id) - head = f"\n {token}" if token else "" - return f"{head}\n {indented}" - - -def _matched_passages( - doc: dict[str, Any], body: str | None, doc_id: int | None -) -> str: - """Render the RRF-matched chunks; '' when none can be rendered.""" - by_id = { - c.get("chunk_id"): c - for c in (doc.get("chunks") or []) - if isinstance(c, dict) - } - rendered: list[str] = [] - for chunk_id in doc.get("matched_chunk_ids") or []: - chunk = by_id.get(chunk_id) - if chunk is None: - continue - passage = _render_passage(chunk, body, doc_id) - if passage: - rendered.append(passage) - return "".join(rendered) - - -def _fallback_snippet(doc: dict[str, Any]) -> str: - """Top-of-document preview, used only when no matched chunk is available.""" - content = (doc.get("content") or "").strip() - if not content: - return "\n (no preview available; read the document for details)" - snippet = content[:_PER_DOC_SNIPPET_CHARS].strip() - if len(content) > _PER_DOC_SNIPPET_CHARS: - snippet += " ..." - return "\n " + snippet.replace("\n", "\n ") + return paths def _format_hits( results: list[dict[str, Any]], *, paths: dict[int, str], - bodies: dict[int, str], query: str, ) -> str: """Render search hits as a compact, model-readable block.""" @@ -199,15 +124,21 @@ def _format_hits( score = doc.get("score") score_str = f"{score:.3f}" if isinstance(score, int | float) else "n/a" path = paths.get(doc_id) if isinstance(doc_id, int) else None - body = bodies.get(doc_id) if isinstance(doc_id, int) else None - id_str = f"id={doc_id}, " if isinstance(doc_id, int) else "" - header = f"\n{rank}. {title} ({id_str}type={doc_type}, score={score_str})" + ( + header = f"\n{rank}. {title} (type={doc_type}, score={score_str})" + ( f"\n path: {path}" if path else "" ) - passages = _matched_passages(doc, body, doc_id if isinstance(doc_id, int) else None) - entry = header + (passages or _fallback_snippet(doc)) + content = (doc.get("content") or "").strip() + if content: + snippet = content[:_PER_DOC_SNIPPET_CHARS].strip() + if len(content) > _PER_DOC_SNIPPET_CHARS: + snippet += " ..." + body = "\n " + snippet.replace("\n", "\n ") + else: + body = "\n (no preview available; read the document for details)" + + entry = header + body if total + len(entry) > _MAX_TOTAL_CHARS: lines.append("\n") break @@ -215,9 +146,8 @@ def _format_hits( total += len(entry) lines.append( - "\n\nTo cite a matched passage, copy its [citation:dID#Lstart-end] token " - "verbatim. To quote more context or read the full document, delegate to " - "the knowledge_base specialist with `task` using the path above." + "\n\nTo read a full document, delegate to the knowledge_base specialist " + "with `task`, referencing the path above." ) lines.append("\n") return "".join(lines) @@ -274,10 +204,8 @@ def create_search_knowledge_base_tool( top_k=clamped_top_k, ) - paths, bodies = await _resolve_doc_context(results, search_space_id=_space_id) - rendered = _format_hits( - results, paths=paths, bodies=bodies, query=cleaned_query - ) + paths = await _resolve_virtual_paths(results, search_space_id=_space_id) + rendered = _format_hits(results, paths=paths, query=cleaned_query) matched = _matched_chunk_ids(results) _perf_log.info( diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py index e704d5599..e13196537 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py @@ -45,10 +45,6 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.document_xml import ( build_document_xml, ) -from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.numbered_document import ( - build_read_preamble, - compute_matched_line_ranges, -) from app.agents.chat.runtime.path_resolver import ( DOCUMENTS_ROOT, build_path_index, @@ -68,12 +64,6 @@ def _basename(path: str) -> str: return path.rsplit("/", 1)[-1] -def _metadata_url(metadata: dict[str, Any]) -> str: - return ( - metadata.get("url") or metadata.get("source") or metadata.get("page_url") or "" - ) - - def _is_under(child: str, parent: str) -> bool: """Return True iff ``child`` is at-or-under ``parent`` (directory semantics).""" if parent == "/": @@ -470,11 +460,8 @@ class KBPostgresBackend(BackendProtocol): loaded = await self._load_file_data(file_path) if loaded is None: return f"Error: File '{file_path}' not found" - file_data, _, preamble = loaded - body = format_read_response(file_data, offset, limit) - if preamble and offset == 0: - return preamble + body - return body + file_data, _ = loaded + return format_read_response(file_data, offset, limit) def read(self, file_path: str, offset: int = 0, limit: int = 2000) -> str: # type: ignore[override] return asyncio.run(self.aread(file_path, offset, limit)) @@ -482,14 +469,12 @@ class KBPostgresBackend(BackendProtocol): async def _load_file_data( self, path: str, - ) -> tuple[dict[str, Any], int | None, str | None] | None: + ) -> tuple[dict[str, Any], int | None] | None: """Lazy-load a virtual KB document into a deepagents ``FileData``. - Returns ``(file_data, doc_id, preamble)`` or ``None`` if the path - doesn't map to any known document. ``doc_id`` is ``None`` for the - synthetic anonymous document. ``preamble`` is the metadata header to - show above a numbered ``source_markdown`` body (``None`` for the legacy - chunk-reconstructed XML reads used when a document has no body). + Returns ``(file_data, doc_id)`` or ``None`` if the path doesn't map + to any known document. ``doc_id`` is ``None`` for the synthetic + anonymous document so the caller doesn't track it as a DB-backed file. """ anon = self._kb_anon_doc() if anon and str(anon.get("path") or "") == path: @@ -507,7 +492,7 @@ class KBPostgresBackend(BackendProtocol): } xml = build_document_xml(doc_payload, matched_chunk_ids=set()) file_data = create_file_data(xml) - return file_data, None, None + return file_data, None if not path.startswith(DOCUMENTS_ROOT): return None @@ -520,58 +505,41 @@ class KBPostgresBackend(BackendProtocol): ) if document is None: return None - source_markdown = document.source_markdown or "" - document_type = ( - document.document_type.value - if getattr(document, "document_type", None) is not None - else "UNKNOWN" - ) - metadata = dict(document.document_metadata or {}) chunk_rows = await session.execute( - select(Chunk.id, Chunk.content, Chunk.start_char, Chunk.end_char) + select(Chunk.id, Chunk.content) .where(Chunk.document_id == document.id) .order_by(Chunk.position, Chunk.id) ) - chunk_records = chunk_rows.all() - document_id = document.id - document_title = document.title + chunks = [ + {"chunk_id": row.id, "content": row.content} for row in chunk_rows.all() + ] - matched = self._matched_chunk_ids(document_id) - - # Canonical read: serve the verbatim body with cat -n line numbers that - # line up with chunk char spans, so the agent cites real source lines. - if source_markdown: - ranges = compute_matched_line_ranges( - source_markdown, - [(r.id, r.start_char, r.end_char) for r in chunk_records], - matched, - ) - preamble = build_read_preamble( - document_id=document_id, - document_type=document_type, - title=document_title, - url=_metadata_url(metadata), - matched_line_ranges=ranges, - ) - return create_file_data(source_markdown), document_id, preamble - - # Legacy fallback: no canonical body, reconstruct from chunks as XML. doc_payload = { - "document_id": document_id, - "chunks": [ - {"chunk_id": r.id, "content": r.content} for r in chunk_records - ], - "matched_chunk_ids": list(matched), + "document_id": document.id, + "chunks": chunks, + "matched_chunk_ids": list(self._matched_chunk_ids(document.id)), "document": { - "id": document_id, - "title": document_title, - "document_type": document_type, - "metadata": metadata, + "id": document.id, + "title": document.title, + "document_type": ( + document.document_type.value + if getattr(document, "document_type", None) is not None + else "UNKNOWN" + ), + "metadata": dict(document.document_metadata or {}), }, - "source": document_type, + "source": ( + document.document_type.value + if getattr(document, "document_type", None) is not None + else "UNKNOWN" + ), } - xml = build_document_xml(doc_payload, matched_chunk_ids=matched) - return create_file_data(xml), document_id, None + xml = build_document_xml( + doc_payload, + matched_chunk_ids=self._matched_chunk_ids(document.id), + ) + file_data = create_file_data(xml) + return file_data, document.id # ------------------------------------------------------------------ writes @@ -603,7 +571,7 @@ class KBPostgresBackend(BackendProtocol): loaded = await self._load_file_data(file_path) if loaded is None: return EditResult(error=f"Error: File '{file_path}' not found") - file_data, _, _ = loaded + file_data, _ = loaded content = file_data_to_string(file_data) result = perform_string_replacement( diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/numbered_document.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/numbered_document.py deleted file mode 100644 index ced77096f..000000000 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/numbered_document.py +++ /dev/null @@ -1,73 +0,0 @@ -"""Read preamble for canonical (numbered ``source_markdown``) KB reads. - -The KB read tool numbers the body lines ``cat -n`` style, so serving the raw -``source_markdown`` makes those line numbers line up exactly with the chunk -char spans and the editor highlight. This module renders the small header the -agent sees above that body: document identity plus the matched line ranges to -seek to, and a concrete reminder of the line-citation token shape. -""" - -from __future__ import annotations - -from collections.abc import Iterable - -from app.utils.text_spans import char_span_to_line_range - - -def _format_range(start: int, end: int) -> str: - return f"{start}" if start == end else f"{start}-{end}" - - -def compute_matched_line_ranges( - source_markdown: str, - chunks: Iterable[tuple[int, int | None, int | None]], - matched_chunk_ids: set[int], -) -> list[tuple[int, int]]: - """Map matched chunks to sorted, de-duplicated 1-based line ranges. - - ``chunks`` are ``(chunk_id, start_char, end_char)`` triples. Chunks without - spans (legacy rows) are skipped — they have no resolvable location. - """ - ranges: set[tuple[int, int]] = set() - for chunk_id, start_char, end_char in chunks: - if chunk_id not in matched_chunk_ids: - continue - if start_char is None or end_char is None: - continue - ranges.add(char_span_to_line_range(source_markdown, start_char, end_char)) - return sorted(ranges) - - -def build_read_preamble( - *, - document_id: int, - document_type: str, - title: str, - url: str, - matched_line_ranges: list[tuple[int, int]], -) -> str: - """Render the metadata header shown above a numbered ``source_markdown`` body. - - ``matched_line_ranges`` are 1-based inclusive line ranges (already derived - from chunk char spans) to point the agent at the relevant lines. - """ - lines = [ - "", - f" {document_id}", - f" {document_type}", - f" <![CDATA[{title}]]>", - f" ", - ] - if matched_line_ranges: - ranges = ", ".join(_format_range(s, e) for s, e in matched_line_ranges) - lines.append(f" {ranges}") - lines.append("") - lines.append( - f"Cite lines from this document as [citation:d{document_id}#L-] " - "using the line numbers shown below." - ) - lines.append("") - return "\n".join(lines) - - -__all__ = ["build_read_preamble", "compute_matched_line_ranges"] diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/edit_file/index.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/edit_file/index.py index 036617d8d..775469531 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/edit_file/index.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/edit_file/index.py @@ -73,7 +73,7 @@ def create_edit_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool: loaded = await backend._load_file_data(validated) if loaded is None: return f"Error: File '{validated}' not found" - _, doc_id_to_attach, _ = loaded + _, doc_id_to_attach = loaded res: EditResult = await backend.aedit( validated, old_string, new_string, replace_all=replace_all diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/move_file/helpers.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/move_file/helpers.py index be61ca94f..ded4701f9 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/move_file/helpers.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/move_file/helpers.py @@ -75,7 +75,7 @@ async def cloud_move_file( loaded = await backend._load_file_data(source) if loaded is None: return f"Error: source '{source}' not found." - source_file_data, loaded_doc_id, _ = loaded + source_file_data, loaded_doc_id = loaded if source_doc_id is None: source_doc_id = loaded_doc_id diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/read_file/index.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/read_file/index.py index 6cbbe6ae5..5c20619d6 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/read_file/index.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/read_file/index.py @@ -58,10 +58,8 @@ def create_read_file_tool(mw: SurfSenseFilesystemMiddleware) -> BaseTool: loaded = await backend._load_file_data(validated) if loaded is None: return f"Error: File '{validated}' not found" - file_data, doc_id, preamble = loaded + file_data, doc_id = loaded rendered = format_read_response(file_data, offset, limit) - if preamble and offset == 0: - rendered = preamble + rendered update: dict[str, Any] = { "files": {validated: file_data}, "messages": [ diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/rm/helpers.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/rm/helpers.py index 020200cbd..e2e445d08 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/rm/helpers.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/tools/rm/helpers.py @@ -74,7 +74,7 @@ async def cloud_rm( loaded = await backend._load_file_data(validated) if loaded is None: return f"Error: file '{validated}' not found." - _, resolved_doc_id, _ = loaded + _, resolved_doc_id = loaded files_update: dict[str, Any] = {validated: None} update: dict[str, Any] = { diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/generate_image.py b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/generate_image.py index 736c508ff..ae7e33428 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/generate_image.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/generate_image.py @@ -240,24 +240,23 @@ def create_generate_image_tool( error="No images were generated", ) + # Update all image URLs in response_dict to be absolute (for the serving endpoint) + from urllib.parse import urlparse + for image in images: + if image.get("url"): + raw_url: str = image["url"] + if raw_url.startswith("/") and provider_base_url: + parsed = urlparse(provider_base_url) + origin = f"{parsed.scheme}://{parsed.netloc}" + image["url"] = f"{origin}{raw_url}" # Update the stored dict! + first_image = images[0] revised_prompt = first_image.get("revised_prompt", prompt) # b64_json (e.g. gpt-image-1) is served via our backend endpoint so # megabytes of base64 don't bloat the LLM context. - # Some OpenAI-compatible backends (e.g. Xinference) return a relative - # URL like /files/image.png. Browsers can't resolve these, so we - # prepend the provider's base origin when the URL starts with "/". if first_image.get("url"): - raw_url: str = first_image["url"] - if raw_url.startswith("/") and provider_base_url: - from urllib.parse import urlparse - - parsed = urlparse(provider_base_url) - origin = f"{parsed.scheme}://{parsed.netloc}" - image_url = f"{origin}{raw_url}" - else: - image_url = raw_url + image_url = first_image["url"] elif first_image.get("b64_json"): backend_url = config.BACKEND_URL or "http://localhost:8000" image_url = ( diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md index f377db311..c4e36fc73 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_cloud.md @@ -35,24 +35,42 @@ Map outcomes to your `status`: You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. -## Citations in your prose +## Chunk citations in your prose -`read_file` on a KB document under `/documents/` serves it in one of two forms. Cite from whichever you actually see, attach the marker to the sentence in `action_summary` or `evidence.content_excerpt` stating that fact, and list every marker you emit in `evidence.citations`. The caller relays these markers to the end user verbatim, and the UI resolves each by exact match, so a wrong id or line number silently breaks the citation. +When `read_file` returns a KB-indexed document under `/documents/`, the response includes `` blocks. Whenever a fact in your `action_summary` or `evidence.content_excerpt` came from a specific chunk, append `[citation:]` to the sentence stating that fact, using the **exact** id from the `` tag. The caller relays these markers to the end user verbatim, and the UI resolves each id by exact match against the database, so a wrong id silently breaks the citation. -**Numbered body (default).** A `` header gives the `` and an optional `` pointer, then the body is shown with line numbers. Cite the lines a fact came from as `[citation:d#L-]` (a single line is `#L-`). +### Where chunk ids live in `read_file` output -**Legacy chunk blocks (older docs without a stored body).** The response is XML with `` blocks. Cite the chunk a fact came from as `[citation:N]`, using the **exact** id from a `` tag. +A KB document's XML has three numeric attributes — only **one** is a citation source: + +``` + + + 42 ← NOT a citation. Parent doc id; ignore for citations. + ... + + + ← Index hint; the same id also appears below. + + + + ← This is the citation source. + + + +``` ### Rules -- Cite only from a passage you actually quoted or paraphrased this turn. Copy document ids, line numbers, and chunk ids character-for-character; never retype from memory. -- Never cite `` on its own — it identifies the document, not a passage. In the numbered form it is only the `d` prefix of a line citation. -- Never invent, normalise, shorten, shift, or guess at ids or line numbers. If unsure, omit rather than pick. +- Use the **exact** id from a `` tag whose content you actually quoted or paraphrased. Copy digit-for-digit; do **not** retype from memory. +- Before emitting `[citation:N]`, confirm the literal substring `` (or its index twin `chunk_id="N"`) appears in the tool result you are summarising this turn. If you can't see it, omit the citation. +- Never cite `` — that's the parent doc, not a chunk. +- Never invent, normalise, shorten, or guess at adjacent ids. If unsure between two candidates, omit rather than pick. - Prefer **fewer accurate citations** over many speculative ones. -- Multiple passages supporting the same point → comma-separated and copied individually: `[citation:d42#L14-22], [citation:d42#L31-39]`. +- Multiple chunks supporting the same point → comma-separated and copied individually: `[citation:128], [citation:129]`. - Plain square brackets only — no markdown links, no parentheses, no footnote numbers. -- Tool results with no body passage (write/edit/move confirmations, `ls` / `glob` / `grep` listings, error strings) carry nothing to cite. -- Populate `evidence.citations` with **only** the markers you actually emitted — same set, same characters. +- Tool results without `` (write/edit/move confirmations, `ls` / `glob` / `grep` listings, error strings) carry no chunk id and need none. +- Populate `evidence.chunk_ids` with **only** ids you actually emitted in `[citation:…]` markers — same set, same digits. ## Examples @@ -71,7 +89,7 @@ You construct the structured `evidence` fields from your own knowledge of what y "path": "/documents/meetings/2026-05-11-meeting.md", "matched_candidates": null, "content_excerpt": null, - "citations": null + "chunk_ids": null }, "next_step": null, "missing_fields": null, @@ -103,7 +121,7 @@ You construct the structured `evidence` fields from your own knowledge of what y { "id": "/documents/design/auth-rework.md", "label": "Auth Rework" } ], "content_excerpt": null, - "citations": null + "chunk_ids": null }, "next_step": "Ask the user which design doc to update.", "missing_fields": ["path"], @@ -124,7 +142,7 @@ Return **only** one JSON object (no markdown or prose outside it): "path": string | null, "matched_candidates": [ { "id": string, "label": string } ] | null, "content_excerpt": string | null, - "citations": string[] | null + "chunk_ids": string[] | null }, "next_step": string | null, "missing_fields": string[] | null, diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md index 72a921c4f..25dafa3df 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_desktop.md @@ -33,11 +33,11 @@ Map outcomes to your `status`: - Any other `"Error: …"` → `status=error` and relay the tool's message verbatim as `next_step`. - HITL rejection → `status=blocked` with `next_step="User declined this filesystem action. Do not retry."`. -You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. (`citations` is always `null` in desktop mode — see "Citations in your prose" below.) +You construct the structured `evidence` fields from your own knowledge of what you called and what you observed — the tools do not return them. Never report values you did not actually see. (`chunk_ids` is always `null` in desktop mode — see "Chunk citations in your prose" below.) -## Citations in your prose +## Chunk citations in your prose -In desktop mode your filesystem tools read local files only, and local-file tool results do **not** carry chunk ids or numbered KB bodies. Do not emit `[citation:…]` markers in `action_summary` or `evidence.content_excerpt`, and leave `evidence.citations` `null` — the absolute path is the only reference for local-file work. +In desktop mode your filesystem tools read local files only, and local-file tool results do **not** carry `` tags. Do not emit `[citation:…]` markers in `action_summary` or `evidence.content_excerpt`, and leave `evidence.chunk_ids` `null` — the absolute path is the only reference for local-file work. ## Examples @@ -56,7 +56,7 @@ In desktop mode your filesystem tools read local files only, and local-file tool "path": "/notes/meetings/2026-05-11-meeting.md", "matched_candidates": null, "content_excerpt": null, - "citations": null + "chunk_ids": null }, "next_step": null, "missing_fields": null, @@ -88,7 +88,7 @@ In desktop mode your filesystem tools read local files only, and local-file tool { "id": "/projects/web/design/auth-rework.md", "label": "Auth Rework" } ], "content_excerpt": null, - "citations": null + "chunk_ids": null }, "next_step": "Ask the user which design doc to update.", "missing_fields": ["path"], @@ -109,7 +109,7 @@ Return **only** one JSON object (no markdown or prose outside it): "path": string | null, "matched_candidates": [ { "id": string, "label": string } ] | null, "content_excerpt": string | null, - "citations": string[] | null + "chunk_ids": string[] | null }, "next_step": string | null, "missing_fields": string[] | null, diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_cloud.md b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_cloud.md index f0aa8403e..c7813e71d 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_cloud.md +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/knowledge_base/system_prompt_readonly_cloud.md @@ -28,21 +28,41 @@ Reply in plain prose: - If the workspace does not contain the requested information, say so explicitly. Do not fabricate paths or content. - If the question is genuinely ambiguous after a thorough lookup, list the candidates with their paths and stop. -## Citations +## Chunk citations -`read_file` on a KB document under `/documents/` serves it in one of two forms; cite a claim from whichever you actually see, alongside the path. The caller passes these markers through to the end user verbatim, and the UI resolves each by exact match, so a wrong id or line number silently breaks the citation. +When the evidence for a claim came from a `read_file` response that included `` blocks (i.e. a KB-indexed document under `/documents/`), append `[citation:]` to the sentence stating that claim. The caller passes these markers through to the end user verbatim, and the UI resolves each id by exact match against the database, so a wrong id silently breaks the citation. -- **Numbered body (default).** A `` header gives the ``, and the body is shown with line numbers. Cite the lines a claim came from as `[citation:d#L-]` (a single line is `#L-`). -- **Legacy chunk blocks (older docs).** XML with `` blocks. Cite the chunk a claim came from as `[citation:N]`. +### Where chunk ids live in `read_file` output + +A KB document's XML has three numeric attributes — only **one** is a citation source: + +``` + + + 42 ← NOT a citation. Parent doc id; ignore for citations. + ... + + + ← Index hint; the same id also appears below. + + + + ← This is the citation source. + + + +``` ### Rules -- Copy document ids, line numbers, and chunk ids character-for-character; never retype from memory. If you cannot see the id/lines for a claim, omit the citation. -- Never cite `` on its own — in the numbered form it is only the `d` prefix of a line citation. -- Never invent, normalise, shorten, shift, or guess. Prefer **fewer accurate citations** over many speculative ones. -- Multiple passages supporting the same point → comma-separated and copied individually. +- Use the **exact** id from a `` tag whose content you actually quoted or paraphrased. Copy digit-for-digit; do **not** retype from memory. +- Before emitting `[citation:N]`, confirm the literal substring `` (or its index twin `chunk_id="N"`) appears in the tool result you are summarising this turn. If you can't see it, omit the citation. +- Never cite `` — that's the parent doc, not a chunk. +- Never invent, normalise, shorten, or guess at adjacent ids. If unsure between two candidates, omit rather than pick. +- Prefer **fewer accurate citations** over many speculative ones. One correct `[citation:128]` is more useful than a string of wrong ids. +- Multiple chunks supporting the same point → comma-separated and copied individually: `[citation:128], [citation:129]`. - Plain square brackets only — no markdown links, no parentheses, no footnote numbers. -- Listings (`ls` / `glob` / `grep`), error strings, and files without either form carry nothing to cite. -- The absolute path under `/documents/` is always required; citations are additive, they do not replace the path reference. +- If a claim came from a tool result that did **not** carry a chunk id (`ls`, `glob`, `grep` listings, error strings, or files without ``), skip the citation. +- The absolute path under `/documents/` is always required; chunk citations are additive, they do not replace the path reference. -Example: `The Q2 roadmap lists three milestones (/documents/planning/q2-roadmap.md) [citation:d42#L3-9].` +Example: `The Q2 roadmap lists three milestones (/documents/planning/q2-roadmap.md) [citation:128], [citation:129].` diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index 3d8dc2aaf..b998f05cf 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -957,9 +957,8 @@ class Config: os.getenv("EMBEDDING_CACHE_ENABLED", "false").strip().lower() == "true" ) # Bump to invalidate every cached embedding set after a chunker change. - # v2: chunks became exact (raw) slices of source_markdown for citation spans. EMBEDDING_CACHE_CHUNKER_VERSION = int( - os.getenv("EMBEDDING_CACHE_CHUNKER_VERSION", "2") + os.getenv("EMBEDDING_CACHE_CHUNKER_VERSION", "1") ) EMBEDDING_CACHE_TTL_DAYS = int(os.getenv("EMBEDDING_CACHE_TTL_DAYS", "90")) EMBEDDING_CACHE_MAX_TOTAL_MB = int( diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index 5cc3cea5d..a65a964fd 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -1470,11 +1470,6 @@ class Chunk(BaseModel, TimestampMixin): # ordering reads are document-scoped (covered by ix_chunks_document_id) and # building a position index on the large chunks table is not worth it. position = Column(Integer, nullable=False, server_default="0") - # Half-open char span into the document's source_markdown the chunk was cut - # from. Nullable: historical rows predate spans and populate on reindex. - # Invariant for span-aware rows: source_markdown[start_char:end_char] == content. - start_char = Column(Integer, nullable=True) - end_char = Column(Integer, nullable=True) document_id = Column( Integer, diff --git a/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py b/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py index 58872a219..95321a229 100644 --- a/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py +++ b/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py @@ -18,26 +18,23 @@ from app.indexing_pipeline.cache.eligibility import is_embedding_cacheable from app.indexing_pipeline.cache.schemas import CachedChunk, EmbeddingKey, EmbeddingSet from app.indexing_pipeline.cache.service import EmbeddingCacheService from app.indexing_pipeline.cache.settings import load_embedding_cache_settings -from app.indexing_pipeline.document_chunker import ChunkSlice, chunk_markdown_with_spans +from app.indexing_pipeline.document_chunker import chunk_text, chunk_text_hybrid from app.indexing_pipeline.document_embedder import embed_texts from app.observability import metrics logger = logging.getLogger(__name__) -SliceEmbedding = tuple[ChunkSlice, np.ndarray] +ChunkPair = tuple[str, np.ndarray] async def build_chunk_embeddings( markdown: str, *, use_code_chunker: bool -) -> tuple[np.ndarray, list[SliceEmbedding]]: - """Return the document-level vector and ordered ``(ChunkSlice, vector)`` pairs. +) -> tuple[np.ndarray, list[ChunkPair]]: + """Return the document-level vector and ordered ``(chunk_text, vector)`` pairs. - Slices are always recomputed (cheap) so their char spans are exact; only the - embeddings are cached, reused when the same markdown was embedded with the - current model and chunker. + Drop-in for the inline chunk+embed step; reuses prior output when the same + markdown has already been embedded with the current model and chunker. """ - slices = await chunk_slices(markdown, use_code_chunker=use_code_chunker) - settings = load_embedding_cache_settings() chunker_kind = "code" if use_code_chunker else "hybrid" embedding_dim = getattr(config.embedding_model_instance, "dimension", None) @@ -48,7 +45,7 @@ async def build_chunk_embeddings( embedding_dim=embedding_dim, ) if not cacheable: - return await _compute(markdown, slices) + return await _compute(markdown, use_code_chunker=use_code_chunker) key = EmbeddingKey( markdown_sha256=_hash_text(markdown), @@ -59,30 +56,31 @@ async def build_chunk_embeddings( ) cached = await _recall(key) - if cached is not None and _aligns(cached, slices): + if cached is not None: metrics.record_embedding_cache_lookup( embedding_model=key.embedding_model, chunker_kind=chunker_kind, outcome="hit", ) logger.debug("Embedding cache hit for %s", key.markdown_sha256) - return cached.summary_embedding, list( - zip(slices, (c.embedding for c in cached.chunks), strict=True) - ) + return cached.summary_embedding, [(c.text, c.embedding) for c in cached.chunks] metrics.record_embedding_cache_lookup( embedding_model=key.embedding_model, chunker_kind=chunker_kind, outcome="miss" ) - summary_embedding, pairs = await _compute(markdown, slices) - await _remember(key, summary_embedding, pairs) - return summary_embedding, pairs - - -async def chunk_slices(markdown: str, *, use_code_chunker: bool) -> list[ChunkSlice]: - """Chunk markdown into ordered, char-addressed slices off the event loop.""" - return await asyncio.to_thread( - chunk_markdown_with_spans, markdown, use_code_chunker + summary_embedding, chunk_pairs = await _compute( + markdown, use_code_chunker=use_code_chunker ) + await _remember(key, summary_embedding, chunk_pairs) + return summary_embedding, chunk_pairs + + +async def chunk_markdown(markdown: str, *, use_code_chunker: bool) -> list[str]: + """Chunk markdown into ordered texts with the pipeline's chunker selection.""" + if use_code_chunker: + return await asyncio.to_thread(chunk_text, markdown, use_code_chunker=True) + # Table-aware hybrid chunker keeps Markdown tables intact (issue #1334). + return await asyncio.to_thread(chunk_text_hybrid, markdown) async def embed_batch(texts: list[str]) -> list[np.ndarray]: @@ -90,19 +88,13 @@ async def embed_batch(texts: list[str]) -> list[np.ndarray]: return await asyncio.to_thread(embed_texts, texts) -def _aligns(cached: EmbeddingSet, slices: list[ChunkSlice]) -> bool: - """A hit is only usable if its texts still match the current chunking.""" - return len(cached.chunks) == len(slices) and all( - c.text == s.text for c, s in zip(cached.chunks, slices, strict=True) - ) - - async def _compute( - markdown: str, slices: list[ChunkSlice] -) -> tuple[np.ndarray, list[SliceEmbedding]]: - embeddings = await embed_batch([markdown, *(s.text for s in slices)]) + markdown: str, *, use_code_chunker: bool +) -> tuple[np.ndarray, list[ChunkPair]]: + chunk_texts = await chunk_markdown(markdown, use_code_chunker=use_code_chunker) + embeddings = await embed_batch([markdown, *chunk_texts]) summary_embedding, *chunk_embeddings = embeddings - return summary_embedding, list(zip(slices, chunk_embeddings, strict=True)) + return summary_embedding, list(zip(chunk_texts, chunk_embeddings, strict=False)) async def _recall(key: EmbeddingKey) -> EmbeddingSet | None: @@ -118,14 +110,14 @@ async def _recall(key: EmbeddingKey) -> EmbeddingSet | None: async def _remember( - key: EmbeddingKey, summary_embedding: np.ndarray, pairs: list[SliceEmbedding] + key: EmbeddingKey, summary_embedding: np.ndarray, chunk_pairs: list[ChunkPair] ) -> None: try: from app.tasks.celery_tasks import get_celery_session_maker embedding_set = EmbeddingSet( summary_embedding=summary_embedding, - chunks=[CachedChunk(text=s.text, embedding=vec) for s, vec in pairs], + chunks=[CachedChunk(text=text, embedding=vec) for text, vec in chunk_pairs], ) async with get_celery_session_maker()() as session: await EmbeddingCacheService(session).remember(key, embedding_set) diff --git a/surfsense_backend/app/indexing_pipeline/chunk_reconciler.py b/surfsense_backend/app/indexing_pipeline/chunk_reconciler.py index dd57a44d1..9354aeb9f 100644 --- a/surfsense_backend/app/indexing_pipeline/chunk_reconciler.py +++ b/surfsense_backend/app/indexing_pipeline/chunk_reconciler.py @@ -19,9 +19,6 @@ class ExistingChunk: id: int content: str position: int - # Stored char span; None for legacy rows indexed before spans existed. - start_char: int | None = None - end_char: int | None = None @dataclass(frozen=True, slots=True) diff --git a/surfsense_backend/app/indexing_pipeline/document_chunker.py b/surfsense_backend/app/indexing_pipeline/document_chunker.py index 096624109..6ae81b7a8 100644 --- a/surfsense_backend/app/indexing_pipeline/document_chunker.py +++ b/surfsense_backend/app/indexing_pipeline/document_chunker.py @@ -1,30 +1,16 @@ import re -from dataclasses import dataclass from app.config import config # Regex that matches a Markdown table block (header + separator + one or more rows) # A table block starts with a | at the beginning of a line and ends when a -# non-table line (or end of string) is encountered. The final row may end at EOF -# without a trailing newline, so the whole table stays one slice. +# non-table line (or end of string) is encountered. _TABLE_BLOCK_RE = re.compile( - r"(?:(?:^|\n)(?=[ \t]*\|)(?:[ \t]*\|[^\n]*(?:\n|$))+)", + r"(?:(?:^|\n)(?=[ \t]*\|)(?:[ \t]*\|[^\n]*\n)+)", re.MULTILINE, ) -@dataclass(frozen=True, slots=True) -class ChunkSlice: - """A chunk paired with its half-open char span into the source markdown. - - Invariant: ``markdown[start_char:end_char] == text``. - """ - - text: str - start_char: int - end_char: int - - def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]: """Chunk a text string using the configured chunker and return the chunk texts.""" chunker = ( @@ -33,63 +19,41 @@ def chunk_text(text: str, use_code_chunker: bool = False) -> list[str]: return [c.text for c in chunker.chunk(text)] -def chunk_markdown_with_spans( - text: str, use_code_chunker: bool = False -) -> list[ChunkSlice]: - """Chunk markdown into a lossless, contiguous partition of char-addressed slices. +def chunk_text_hybrid(text: str) -> list[str]: + """Table-aware chunker that prevents Markdown tables from being split mid-row. - Tables stay whole (issue #1334) and every slice is an exact substring of - ``text``, so ``"".join(s.text) == text`` and ``text[s:e] == s.text``. This is - the offset record citations resolve against. + Algorithm: + 1. Scan the document for Markdown table blocks. + 2. Each table block is emitted as a single, unmodified chunk so that its + header, separator row, and data rows always stay together. + 3. The non-table prose segments between (and around) tables are passed through + the normal ``chunk_text`` chunker and their sub-chunks are interleaved in + document order. + + This ensures that table data is never sliced in the middle by the token-based + chunker, which would otherwise produce garbled rows that are useless for RAG. + + Fixes #1334. """ - if not text: - return [] - - slices: list[ChunkSlice] = [] + chunks: list[str] = [] cursor = 0 for match in _TABLE_BLOCK_RE.finditer(text): - if match.start() > cursor: - slices.extend( - _segment_slices(text, cursor, match.start(), use_code_chunker) - ) - slices.append(ChunkSlice(match.group(0), match.start(), match.end())) + # Prose before this table + prose = text[cursor : match.start()].strip() + if prose: + chunks.extend(chunk_text(prose)) + + # The table itself is kept as one indivisible chunk + table_block = match.group(0).strip() + if table_block: + chunks.append(table_block) + cursor = match.end() - if len(text) > cursor: - slices.extend(_segment_slices(text, cursor, len(text), use_code_chunker)) + # Remaining prose after the last table (or entire text if no tables) + trailing = text[cursor:].strip() + if trailing: + chunks.extend(chunk_text(trailing)) - return slices - - -def _segment_slices( - text: str, start: int, end: int, use_code_chunker: bool -) -> list[ChunkSlice]: - """Sub-chunk one non-table segment into contiguous, char-addressed slices.""" - chunker = ( - config.code_chunker_instance if use_code_chunker else config.chunker_instance - ) - segment = text[start:end] - chunks = chunker.chunk(segment) - - slices: list[ChunkSlice] = [] - local = 0 - for chunk in chunks: - # Use the chunker's end offset only as a cut point, then re-slice the - # segment ourselves so the result is an exact, gap-free substring. - local_end = min(max(chunk.end_index, local), len(segment)) - if local_end <= local: - continue - slices.append( - ChunkSlice(segment[local:local_end], start + local, start + local_end) - ) - local = local_end - - if local < len(segment): - if slices: - last = slices[-1] - slices[-1] = ChunkSlice(text[last.start_char : end], last.start_char, end) - else: - slices.append(ChunkSlice(segment[local:], start + local, end)) - - return slices + return chunks diff --git a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py index 0cb74089b..30ea9d5d6 100644 --- a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py +++ b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py @@ -20,10 +20,9 @@ from app.db import ( DocumentType, ) from app.indexing_pipeline.cache import build_chunk_embeddings -from app.indexing_pipeline.cache.cached_indexing import chunk_slices, embed_batch -from app.indexing_pipeline.chunk_reconciler import ChunkPlan, ExistingChunk, reconcile +from app.indexing_pipeline.cache.cached_indexing import chunk_markdown, embed_batch +from app.indexing_pipeline.chunk_reconciler import ExistingChunk, reconcile from app.indexing_pipeline.connector_document import ConnectorDocument -from app.indexing_pipeline.document_chunker import ChunkSlice from app.indexing_pipeline.document_hashing import ( compute_content_hash, compute_identifier_hash, @@ -490,22 +489,12 @@ class IndexingPipelineService: async def _load_existing_chunks(self, document_id: int) -> list[ExistingChunk]: result = await self.session.execute( - select( - Chunk.id, - Chunk.content, - Chunk.position, - Chunk.start_char, - Chunk.end_char, - ).where(Chunk.document_id == document_id) + select(Chunk.id, Chunk.content, Chunk.position).where( + Chunk.document_id == document_id + ) ) return [ - ExistingChunk( - id=row.id, - content=row.content, - position=row.position, - start_char=row.start_char, - end_char=row.end_char, - ) + ExistingChunk(id=row.id, content=row.content, position=row.position) for row in result ] @@ -516,21 +505,15 @@ class IndexingPipelineService: delete(Chunk).where(Chunk.document_id == document.id) ) - summary_embedding, slice_pairs = await build_chunk_embeddings( + summary_embedding, chunk_pairs = await build_chunk_embeddings( content, use_code_chunker=connector_doc.should_use_code_chunker, ) document.embedding = summary_embedding return [ - Chunk( - content=chunk_slice.text, - embedding=emb, - position=i, - start_char=chunk_slice.start_char, - end_char=chunk_slice.end_char, - ) - for i, (chunk_slice, emb) in enumerate(slice_pairs) + Chunk(content=text, embedding=emb, position=i) + for i, (text, emb) in enumerate(chunk_pairs) ] async def _reindex_incrementally( @@ -542,39 +525,35 @@ class IndexingPipelineService: ) -> int: """Edit path: keep rows whose text survived, embed only new texts. - Unchanged rows keep their embedding and their HNSW/GIN index entries. An - edit can shift a kept chunk's char span without changing its text, so - every kept row's position and span are refreshed whenever they drift. + Unchanged rows keep their embedding and their HNSW/GIN index entries; + moved rows get a position-only UPDATE, which touches neither index. """ - slices = await chunk_slices( + new_texts = await chunk_markdown( content, use_code_chunker=connector_doc.should_use_code_chunker ) - new_texts = [s.text for s in slices] plan = reconcile(existing, new_texts) # One batch: the document-level summary vector plus the missing chunks. embeddings = await embed_batch([content, *[t for _, t in plan.to_embed]]) summary_embedding, *new_embeddings = embeddings + if plan.reused: + await self.session.execute( + update(Chunk), + [{"id": cid, "position": pos} for cid, pos in plan.reused], + ) if plan.to_delete: await self.session.execute( delete(Chunk).where(Chunk.id.in_(plan.to_delete)) ) - - span_updates = self._kept_row_span_updates(existing, slices, plan) - if span_updates: - await self.session.execute(update(Chunk), span_updates) - self.session.add_all( Chunk( - content=slices[pos].text, + content=text, embedding=emb, position=pos, - start_char=slices[pos].start_char, - end_char=slices[pos].end_char, document_id=document.id, ) - for (pos, _text), emb in zip(plan.to_embed, new_embeddings, strict=True) + for (pos, text), emb in zip(plan.to_embed, new_embeddings, strict=True) ) document.embedding = summary_embedding @@ -585,36 +564,6 @@ class IndexingPipelineService: ) return len(new_texts) - @staticmethod - def _kept_row_span_updates( - existing: list[ExistingChunk], - slices: list[ChunkSlice], - plan: ChunkPlan, - ) -> list[dict]: - """Position/span writes for kept rows, emitted only where a value drifts.""" - deleted = set(plan.to_delete) - moved = dict(plan.reused) - updates: list[dict] = [] - for chunk in existing: - if chunk.id in deleted: - continue - new_position = moved.get(chunk.id, chunk.position) - target = slices[new_position] - if ( - chunk.position != new_position - or chunk.start_char != target.start_char - or chunk.end_char != target.end_char - ): - updates.append( - { - "id": chunk.id, - "position": new_position, - "start_char": target.start_char, - "end_char": target.end_char, - } - ) - return updates - async def _enqueue_ai_sort_if_enabled(self, document: Document) -> None: """Fire-and-forget: enqueue incremental AI sort if the search space has it enabled.""" try: diff --git a/surfsense_backend/app/retriever/chunks_hybrid_search.py b/surfsense_backend/app/retriever/chunks_hybrid_search.py index adce14e53..5e5edec2e 100644 --- a/surfsense_backend/app/retriever/chunks_hybrid_search.py +++ b/surfsense_backend/app/retriever/chunks_hybrid_search.py @@ -440,15 +440,8 @@ class ChucksHybridSearchRetriever: chunk_filter = numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC # Select only the columns we need (skip Chunk.embedding ~12KB/row). - # start_char/end_char carry the citation span; None for legacy rows. chunk_query = ( - select( - Chunk.id, - Chunk.content, - Chunk.document_id, - Chunk.start_char, - Chunk.end_char, - ) + select(Chunk.id, Chunk.content, Chunk.document_id) .join(numbered, Chunk.id == numbered.c.chunk_id) .where(chunk_filter) .order_by(Chunk.document_id, Chunk.position, Chunk.id) @@ -483,14 +476,7 @@ class ChucksHybridSearchRetriever: if doc_id not in doc_map: continue doc_entry = doc_map[doc_id] - doc_entry["chunks"].append( - { - "chunk_id": row.id, - "content": row.content, - "start_char": row.start_char, - "end_char": row.end_char, - } - ) + doc_entry["chunks"].append({"chunk_id": row.id, "content": row.content}) if row.id in matched_chunk_ids: doc_entry["matched_chunk_ids"].append(row.id) diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index 0de3ab9a4..9d908f4a1 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -38,7 +38,6 @@ from app.schemas import ( from app.services.task_dispatcher import TaskDispatcher, get_task_dispatcher from app.users import get_auth_context from app.utils.rbac import check_permission -from app.utils.text_spans import char_span_to_line_range try: asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy()) @@ -977,12 +976,9 @@ async def get_document_by_chunk_id( session: AsyncSession = Depends(get_async_session), auth: AuthContext = Depends(get_auth_context), ): - """Resolve a chunk id to its document plus a window of surrounding chunks. - - Returns the cited chunk's 1-based line range (cited_start_line/ - cited_end_line) when char spans exist, so callers can anchor the citation - to exact source lines. Uses SQL-level pagination to avoid loading all - chunks into memory. + """ + Retrieves a document based on a chunk ID, including a window of chunks around the cited one. + Uses SQL-level pagination to avoid loading all chunks into memory. """ try: from sqlalchemy import and_, func, or_ @@ -1046,17 +1042,6 @@ async def get_document_by_chunk_id( ) windowed_chunks = windowed_result.scalars().all() - cited_start_line: int | None = None - cited_end_line: int | None = None - if ( - chunk.start_char is not None - and chunk.end_char is not None - and document.source_markdown - ): - cited_start_line, cited_end_line = char_span_to_line_range( - document.source_markdown, chunk.start_char, chunk.end_char - ) - return DocumentWithChunksRead( id=document.id, title=document.title, @@ -1071,8 +1056,6 @@ async def get_document_by_chunk_id( chunks=windowed_chunks, total_chunks=total_chunks, chunk_start_index=start, - cited_start_line=cited_start_line, - cited_end_line=cited_end_line, ) except HTTPException: raise diff --git a/surfsense_backend/app/routes/editor_routes.py b/surfsense_backend/app/routes/editor_routes.py index 383bae80a..fe00995ea 100644 --- a/surfsense_backend/app/routes/editor_routes.py +++ b/surfsense_backend/app/routes/editor_routes.py @@ -43,34 +43,6 @@ EDITOR_PLATE_MAX_BYTES = 1 * 1024 * 1024 EDITOR_PLATE_MAX_LINES = 5000 -def _raise_no_canonical_body(document: Document) -> None: - """Translate a missing source_markdown into a status-aware HTTP error.""" - doc_status = document.status or {} - state = ( - doc_status.get("state", "ready") if isinstance(doc_status, dict) else "ready" - ) - - if state in ("pending", "processing"): - raise HTTPException( - status_code=409, - detail="This document is still being processed. Please wait a moment and try again.", - ) - if state == "failed": - reason = ( - doc_status.get("reason", "Unknown error") - if isinstance(doc_status, dict) - else "Unknown error" - ) - raise HTTPException( - status_code=422, - detail=f"Processing failed: {reason}. You can delete this document and re-upload it.", - ) - raise HTTPException( - status_code=400, - detail="This document has no editable content. It may not have been processed correctly. Try re-indexing or re-uploading it.", - ) - - @router.get("/search-spaces/{search_space_id}/documents/{document_id}/editor-content") async def get_editor_content( search_space_id: int, @@ -82,9 +54,8 @@ async def get_editor_content( """ Get document content for editing. - Returns source_markdown (the canonical body) for the Plate.js editor, with a - one-time migration from legacy blocknote_document. Never reconstructs the - body from chunks. + Returns source_markdown for the Plate.js editor. + Falls back to blocknote_document → markdown conversion, then chunk reconstruction. Requires DOCUMENTS_READ permission. """ @@ -154,9 +125,52 @@ async def get_editor_content( await session.commit() return _build_response(empty_markdown) - # No canonical body. Chunks are an index artifact, never the source of - # truth, so surface the processing state instead of rebuilding from them. - _raise_no_canonical_body(document) + chunk_contents_result = await session.execute( + select(Chunk.content) + .filter(Chunk.document_id == document_id) + .order_by(Chunk.position, Chunk.id) + ) + chunk_contents = chunk_contents_result.scalars().all() + + if not chunk_contents: + doc_status = document.status or {} + state = ( + doc_status.get("state", "ready") + if isinstance(doc_status, dict) + else "ready" + ) + if state in ("pending", "processing"): + raise HTTPException( + status_code=409, + detail="This document is still being processed. Please wait a moment and try again.", + ) + if state == "failed": + reason = ( + doc_status.get("reason", "Unknown error") + if isinstance(doc_status, dict) + else "Unknown error" + ) + raise HTTPException( + status_code=422, + detail=f"Processing failed: {reason}. You can delete this document and re-upload it.", + ) + raise HTTPException( + status_code=400, + detail="This document has no content. It may not have been processed correctly. Try deleting and re-uploading it.", + ) + + markdown_content = "\n\n".join(chunk_contents) + + if not markdown_content.strip(): + raise HTTPException( + status_code=400, + detail="This document appears to be empty. Try re-uploading or editing it to add content.", + ) + + document.source_markdown = markdown_content + await session.commit() + + return _build_response(markdown_content) @router.get( @@ -170,9 +184,8 @@ async def download_document_markdown( ): user = auth.user """ - Download the canonical document body as a .md file. - - Serves source_markdown, migrating legacy blocknote_document when present. + Download the full document content as a .md file. + Reconstructs markdown from source_markdown or chunks. """ await check_permission( session, @@ -198,6 +211,15 @@ async def download_document_markdown( from app.utils.blocknote_to_markdown import blocknote_to_markdown markdown = blocknote_to_markdown(document.blocknote_document) + if markdown is None: + chunk_contents_result = await session.execute( + select(Chunk.content) + .filter(Chunk.document_id == document_id) + .order_by(Chunk.position, Chunk.id) + ) + chunk_contents = chunk_contents_result.scalars().all() + if chunk_contents: + markdown = "\n\n".join(chunk_contents) if not markdown or not markdown.strip(): raise HTTPException( @@ -340,6 +362,15 @@ async def export_document( from app.utils.blocknote_to_markdown import blocknote_to_markdown markdown_content = blocknote_to_markdown(document.blocknote_document) + if markdown_content is None: + chunk_contents_result = await session.execute( + select(Chunk.content) + .filter(Chunk.document_id == document_id) + .order_by(Chunk.position, Chunk.id) + ) + chunk_contents = chunk_contents_result.scalars().all() + if chunk_contents: + markdown_content = "\n\n".join(chunk_contents) if not markdown_content or not markdown_content.strip(): raise HTTPException(status_code=400, detail="Document has no content to export") diff --git a/surfsense_backend/app/routes/image_generation_routes.py b/surfsense_backend/app/routes/image_generation_routes.py index 0d9841c4c..9376c8f0f 100644 --- a/surfsense_backend/app/routes/image_generation_routes.py +++ b/surfsense_backend/app/routes/image_generation_routes.py @@ -214,7 +214,7 @@ async def _execute_image_generation( ) # Store response - image_gen.response_data = ( + response_dict = ( response.model_dump() if hasattr(response, "model_dump") else dict(response) ) if not image_gen.model and hasattr(response, "_hidden_params"): @@ -222,6 +222,20 @@ async def _execute_image_generation( if isinstance(hidden, dict) and hidden.get("model"): image_gen.model = hidden["model"] + # Fix relative URLs in response data (for the serving endpoint) + from urllib.parse import urlparse + images = response_dict.get("data", []) + provider_base_url = resolved_kwargs.get("api_base") + for image in images: + if image.get("url"): + raw_url: str = image["url"] + if raw_url.startswith("/") and provider_base_url: + parsed = urlparse(provider_base_url) + origin = f"{parsed.scheme}://{parsed.netloc}" + image["url"] = f"{origin}{raw_url}" + + image_gen.response_data = response_dict + # ============================================================================= # Image Generation Execution + Results CRUD diff --git a/surfsense_backend/app/schemas/chunks.py b/surfsense_backend/app/schemas/chunks.py index 685aa4762..7fec0d445 100644 --- a/surfsense_backend/app/schemas/chunks.py +++ b/surfsense_backend/app/schemas/chunks.py @@ -17,7 +17,4 @@ class ChunkUpdate(ChunkBase): class ChunkRead(ChunkBase, IDModel, TimestampModel): - start_char: int | None = None - end_char: int | None = None - model_config = ConfigDict(from_attributes=True) diff --git a/surfsense_backend/app/schemas/documents.py b/surfsense_backend/app/schemas/documents.py index 162dd6882..49d2836b2 100644 --- a/surfsense_backend/app/schemas/documents.py +++ b/surfsense_backend/app/schemas/documents.py @@ -73,10 +73,6 @@ class DocumentWithChunksRead(DocumentRead): chunks: list[ChunkRead] = [] total_chunks: int = 0 chunk_start_index: int = 0 - # 1-based inclusive line range of the cited chunk within source_markdown; - # None when the chunk predates char spans or the body is unavailable. - cited_start_line: int | None = None - cited_end_line: int | None = None model_config = ConfigDict(from_attributes=True) diff --git a/surfsense_backend/app/utils/text_spans.py b/surfsense_backend/app/utils/text_spans.py deleted file mode 100644 index c12201174..000000000 --- a/surfsense_backend/app/utils/text_spans.py +++ /dev/null @@ -1,23 +0,0 @@ -"""Convert char spans into document-relative line ranges. - -Chunks store half-open char spans into ``source_markdown``; citations and the -editor speak in line numbers. This is the single shared conversion so search, -the resolve API, and highlighting all agree on what "lines X-Y" means. -""" - -from __future__ import annotations - - -def char_span_to_line_range(text: str, start_char: int, end_char: int) -> tuple[int, int]: - """Return the 1-based inclusive line range covering ``[start_char, end_char)``. - - Offsets are clamped to ``text`` bounds. An empty span resolves to the single - line containing it. - """ - n = len(text) - start = max(0, min(start_char, n)) - end = max(start, min(end_char, n)) - start_line = text.count("\n", 0, start) + 1 - last_char_index = max(start, end - 1) - end_line = text.count("\n", 0, last_char_index) + 1 - return start_line, end_line diff --git a/surfsense_backend/tests/integration/agents/multi_agent_chat/test_kb_persistence_spans.py b/surfsense_backend/tests/integration/agents/multi_agent_chat/test_kb_persistence_spans.py deleted file mode 100644 index 77e2e5f18..000000000 --- a/surfsense_backend/tests/integration/agents/multi_agent_chat/test_kb_persistence_spans.py +++ /dev/null @@ -1,80 +0,0 @@ -"""NOTE writes must carry the same char spans as the indexing pipeline. - -``_create_document`` / ``_update_document`` are the cloud agent's KB write -paths. They must chunk through the shared span chunker so every persisted -chunk resolves back to an exact slice of ``source_markdown`` for citations. -""" - -from __future__ import annotations - -import pytest -from sqlalchemy import select - -from app.agents.chat.multi_agent_chat.main_agent.middleware.kb_persistence import ( - middleware as kb, -) -from app.db import Chunk - -pytestmark = [pytest.mark.integration, pytest.mark.asyncio] - -_BODY = "Intro paragraph.\n\nBody paragraph with detail.\n\nOutro paragraph." -_NEW_BODY = "Rewritten intro.\n\nFresh body content.\n\nNew closing line." - - -async def _ordered_chunks(session, doc_id: int) -> list[Chunk]: - rows = await session.execute( - select(Chunk).where(Chunk.document_id == doc_id).order_by(Chunk.position) - ) - return list(rows.scalars().all()) - - -def _assert_spans_resolve(source_markdown: str, chunks: list[Chunk]) -> None: - assert chunks - for chunk in chunks: - assert chunk.start_char is not None - assert chunk.end_char is not None - assert source_markdown[chunk.start_char : chunk.end_char] == chunk.content - - -@pytest.mark.usefixtures("patched_embed_texts") -async def test_note_create_populates_chunk_spans( - db_session, db_search_space, db_user -) -> None: - doc = await kb._create_document( - db_session, - virtual_path="/documents/note.md", - content=_BODY, - search_space_id=db_search_space.id, - created_by_id=str(db_user.id), - ) - await db_session.flush() - - chunks = await _ordered_chunks(db_session, doc.id) - _assert_spans_resolve(doc.source_markdown, chunks) - - -@pytest.mark.usefixtures("patched_embed_texts") -async def test_note_update_refreshes_chunk_spans( - db_session, db_search_space, db_user -) -> None: - doc = await kb._create_document( - db_session, - virtual_path="/documents/note.md", - content=_BODY, - search_space_id=db_search_space.id, - created_by_id=str(db_user.id), - ) - await db_session.flush() - - updated = await kb._update_document( - db_session, - doc_id=doc.id, - content=_NEW_BODY, - virtual_path="/documents/note.md", - search_space_id=db_search_space.id, - ) - await db_session.flush() - - assert updated is not None - chunks = await _ordered_chunks(db_session, updated.id) - _assert_spans_resolve(updated.source_markdown, chunks) diff --git a/surfsense_backend/tests/integration/conftest.py b/surfsense_backend/tests/integration/conftest.py index e67a025cc..6b8aa3cdb 100644 --- a/surfsense_backend/tests/integration/conftest.py +++ b/surfsense_backend/tests/integration/conftest.py @@ -158,12 +158,13 @@ def patched_embed_texts_raises(monkeypatch) -> MagicMock: @pytest.fixture def patched_chunk_text(monkeypatch) -> MagicMock: - from app.indexing_pipeline.document_chunker import ChunkSlice - - text = "Test chunk content." - mock = MagicMock(return_value=[ChunkSlice(text, 0, len(text))]) + mock = MagicMock(return_value=["Test chunk content."]) monkeypatch.setattr( - "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans", + "app.indexing_pipeline.cache.cached_indexing.chunk_text", + mock, + ) + monkeypatch.setattr( + "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", mock, ) return mock diff --git a/surfsense_backend/tests/integration/document_upload/conftest.py b/surfsense_backend/tests/integration/document_upload/conftest.py index f73c4eaaf..bd889360f 100644 --- a/surfsense_backend/tests/integration/document_upload/conftest.py +++ b/surfsense_backend/tests/integration/document_upload/conftest.py @@ -286,12 +286,9 @@ def _mock_external_apis(monkeypatch): "app.indexing_pipeline.cache.cached_indexing.embed_texts", MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]), ) - from app.indexing_pipeline.document_chunker import ChunkSlice - - chunk = "Test chunk content." monkeypatch.setattr( - "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans", - MagicMock(return_value=[ChunkSlice(chunk, 0, len(chunk))]), + "app.indexing_pipeline.cache.cached_indexing.chunk_text", + MagicMock(return_value=["Test chunk content."]), ) diff --git a/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py b/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py index e89d7592b..814129c8d 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py @@ -176,14 +176,9 @@ async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, m @pytest.mark.usefixtures("patched_embed_texts") async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, mocker): """Reindexing replaces old chunks with new content rather than appending.""" - from app.indexing_pipeline.document_chunker import ChunkSlice - mocker.patch( - "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans", - side_effect=[ - [ChunkSlice("Original chunk.", 0, len("Original chunk."))], - [ChunkSlice("Updated chunk.", 0, len("Updated chunk."))], - ], + "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", + side_effect=[["Original chunk."], ["Updated chunk."]], ) adapter = UploadDocumentAdapter(db_session) diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_index_editions.py b/surfsense_backend/tests/integration/indexing_pipeline/test_index_editions.py index f86ee8e4f..68d5ec0af 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_index_editions.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_index_editions.py @@ -18,22 +18,16 @@ _V1 = "Intro paragraph.\n\nBody paragraph.\n\nOutro paragraph." @pytest.fixture def paragraph_chunker(monkeypatch): - """One slice per markdown paragraph, so edits map to chunk-level diffs.""" - from app.indexing_pipeline.document_chunker import ChunkSlice + """One chunk per markdown paragraph, so edits map to chunk-level diffs.""" - def _split(markdown, *_args, **_kwargs): - slices = [] - cursor = 0 - for para in markdown.split("\n\n"): - start = markdown.index(para, cursor) - cursor = start + len(para) - if para.strip(): - slices.append(ChunkSlice(para, start, cursor)) - return slices + def _split(markdown, **_kwargs): + return [p for p in markdown.split("\n\n") if p.strip()] monkeypatch.setattr( - "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans", - _split, + "app.indexing_pipeline.cache.cached_indexing.chunk_text", _split + ) + monkeypatch.setattr( + "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", _split ) diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_index_spans.py b/surfsense_backend/tests/integration/indexing_pipeline/test_index_spans.py deleted file mode 100644 index 869045bf6..000000000 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_index_spans.py +++ /dev/null @@ -1,96 +0,0 @@ -"""Indexing records char spans so a chunk addresses its exact slice of the body. - -Uses the real chunker (only embeddings are faked) so the span/partition -invariants are exercised end to end. -""" - -import pytest -from sqlalchemy import select - -from app.db import Chunk, Document -from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService - -pytestmark = pytest.mark.integration - -_BODY = ( - "# Report\n\n" - + "Intro paragraph that is reasonably long and descriptive. " * 8 - + "\n\n| col a | col b |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |\n\n" - + "Closing paragraph with a different shape and more words to chunk. " * 8 -) - - -async def _ordered_chunks(session, document_id) -> list[Chunk]: - result = await session.execute( - select(Chunk) - .filter(Chunk.document_id == document_id) - .order_by(Chunk.position, Chunk.id) - ) - return list(result.scalars().all()) - - -def _assert_spans_address_body(chunks: list[Chunk], body: str) -> None: - for chunk in chunks: - assert chunk.start_char is not None and chunk.end_char is not None - assert body[chunk.start_char : chunk.end_char] == chunk.content - assert "".join(c.content for c in chunks) == body - - -async def _index(session, connector_doc) -> int: - service = IndexingPipelineService(session=session) - prepared = await service.prepare_for_indexing([connector_doc]) - document = prepared[0] - await service.index(document, connector_doc) - return document.id - - -async def _reload_body(session, document_id) -> str: - result = await session.execute(select(Document).filter(Document.id == document_id)) - return result.scalars().first().source_markdown - - -@pytest.mark.usefixtures("patched_embed_texts") -async def test_scratch_index_records_spans_addressing_body( - db_session, db_search_space, make_connector_document -): - connector_doc = make_connector_document( - search_space_id=db_search_space.id, source_markdown=_BODY - ) - - document_id = await _index(db_session, connector_doc) - - body = await _reload_body(db_session, document_id) - chunks = await _ordered_chunks(db_session, document_id) - - assert len(chunks) > 1 - _assert_spans_address_body(chunks, body) - - -@pytest.mark.usefixtures("patched_embed_texts") -async def test_incremental_reindex_refreshes_shifted_spans( - db_session, db_search_space, make_connector_document -): - """Inserting text at the top shifts every later chunk's span; kept rows must - have their spans refreshed, not left pointing at the old offsets.""" - service = IndexingPipelineService(session=db_session) - - original = make_connector_document( - search_space_id=db_search_space.id, source_markdown=_BODY - ) - prepared = await service.prepare_for_indexing([original]) - document_id = prepared[0].id - await service.index(prepared[0], original) - - edited_body = "# Prepended heading\n\nA brand new opening paragraph.\n\n" + _BODY - edited = make_connector_document( - search_space_id=db_search_space.id, source_markdown=edited_body - ) - prepared_again = await service.prepare_for_indexing([edited]) - assert prepared_again, "edited content should requeue the document" - await service.index(prepared_again[0], edited) - - body = await _reload_body(db_session, document_id) - chunks = await _ordered_chunks(db_session, document_id) - - assert body == edited_body - _assert_spans_address_body(chunks, body) diff --git a/surfsense_backend/tests/integration/retriever/conftest.py b/surfsense_backend/tests/integration/retriever/conftest.py index 96c6297bb..d2443723c 100644 --- a/surfsense_backend/tests/integration/retriever/conftest.py +++ b/surfsense_backend/tests/integration/retriever/conftest.py @@ -40,19 +40,11 @@ def _make_document( ) -def _make_chunk( - *, - content: str, - document_id: int, - start_char: int | None = None, - end_char: int | None = None, -) -> Chunk: +def _make_chunk(*, content: str, document_id: int) -> Chunk: return Chunk( content=content, document_id=document_id, embedding=DUMMY_EMBEDDING, - start_char=start_char, - end_char=end_char, ) @@ -99,8 +91,6 @@ async def seed_large_doc( _make_chunk( content="quarterly performance review summary note content", document_id=small_doc.id, - start_char=0, - end_char=10, ), ] diff --git a/surfsense_backend/tests/integration/retriever/test_optimized_chunk_retriever.py b/surfsense_backend/tests/integration/retriever/test_optimized_chunk_retriever.py index a8c85e65f..f80e59304 100644 --- a/surfsense_backend/tests/integration/retriever/test_optimized_chunk_retriever.py +++ b/surfsense_backend/tests/integration/retriever/test_optimized_chunk_retriever.py @@ -98,32 +98,6 @@ async def test_chunks_ordered_by_id(db_session, seed_large_doc): assert chunk_ids == sorted(chunk_ids), "Chunks not ordered by ID" -async def test_chunk_spans_returned(db_session, seed_large_doc): - """Each chunk dict carries start_char/end_char (the citation span).""" - space_id = seed_large_doc["search_space"].id - small_doc_id = seed_large_doc["small_doc"].id - - retriever = ChucksHybridSearchRetriever(db_session) - results = await retriever.hybrid_search( - query_text="quarterly performance review summary", - top_k=10, - search_space_id=space_id, - query_embedding=DUMMY_EMBEDDING, - ) - - for result in results: - for chunk in result["chunks"]: - assert "start_char" in chunk - assert "end_char" in chunk - if result["document"].get("id") == small_doc_id: - seeded = result["chunks"][0] - assert seeded["start_char"] == 0 - assert seeded["end_char"] == 10 - break - else: - pytest.fail("Small doc not found in search results") - - async def test_score_is_positive_float(db_session, seed_large_doc): """Each result should have a positive float score from RRF.""" space_id = seed_large_doc["search_space"].id diff --git a/surfsense_backend/tests/integration/test_documents_by_chunk_route.py b/surfsense_backend/tests/integration/test_documents_by_chunk_route.py deleted file mode 100644 index f59c65d97..000000000 --- a/surfsense_backend/tests/integration/test_documents_by_chunk_route.py +++ /dev/null @@ -1,127 +0,0 @@ -"""Phase E.1 contract: the by-chunk resolve API exposes chunk char spans and -derives the cited chunk's line range from source_markdown.""" - -import pytest -import pytest_asyncio -from sqlalchemy.ext.asyncio import AsyncSession - -from app.db import Chunk, Document, DocumentStatus, DocumentType, SearchSpace, User - -pytestmark = pytest.mark.integration - -_BODY = "alpha\nbravo\ncharlie\ndelta" - - -async def _make_document( - session: AsyncSession, - search_space: SearchSpace, - user: User, - *, - source_markdown: str = _BODY, -) -> Document: - doc = Document( - title="Doc", - document_type=DocumentType.FILE, - document_metadata={}, - content=source_markdown, - content_hash="hash-by-chunk", - source_markdown=source_markdown, - search_space_id=search_space.id, - created_by_id=user.id, - status=DocumentStatus.ready(), - ) - session.add(doc) - await session.flush() - return doc - - -async def _add_chunk( - session: AsyncSession, - document: Document, - *, - content: str, - position: int, - start_char: int | None, - end_char: int | None, -) -> Chunk: - chunk = Chunk( - content=content, - position=position, - document_id=document.id, - start_char=start_char, - end_char=end_char, - ) - session.add(chunk) - await session.flush() - return chunk - - -@pytest_asyncio.fixture -async def make_document(db_session, db_search_space, db_user): - async def _make(**overrides): - return await _make_document(db_session, db_search_space, db_user, **overrides) - - return _make - - -async def test_cited_line_range_derived_from_spans( - db_session, db_search_space, db_user, make_document -): - from app.routes.documents_routes import get_document_by_chunk_id - - doc = await make_document() - await _add_chunk( - db_session, doc, content="alpha\nbravo\n", position=0, start_char=0, end_char=12 - ) - cited = await _add_chunk( - db_session, - doc, - content="charlie\ndelta", - position=1, - start_char=12, - end_char=len(_BODY), - ) - - result = await get_document_by_chunk_id( - cited.id, chunk_window=5, session=db_session, user=db_user - ) - - assert result.cited_start_line == 3 - assert result.cited_end_line == 4 - - -async def test_chunk_spans_exposed_in_response( - db_session, db_search_space, db_user, make_document -): - from app.routes.documents_routes import get_document_by_chunk_id - - doc = await make_document() - cited = await _add_chunk( - db_session, doc, content="alpha\nbravo\n", position=0, start_char=0, end_char=12 - ) - - result = await get_document_by_chunk_id( - cited.id, chunk_window=5, session=db_session, user=db_user - ) - - chunk = next(c for c in result.chunks if c.id == cited.id) - assert chunk.start_char == 0 - assert chunk.end_char == 12 - - -async def test_cited_line_range_null_without_spans( - db_session, db_search_space, db_user, make_document -): - from app.routes.documents_routes import get_document_by_chunk_id - - doc = await make_document() - cited = await _add_chunk( - db_session, doc, content="alpha", position=0, start_char=None, end_char=None - ) - - result = await get_document_by_chunk_id( - cited.id, chunk_window=5, session=db_session, user=db_user - ) - - assert result.cited_start_line is None - assert result.cited_end_line is None diff --git a/surfsense_backend/tests/integration/test_editor_routes.py b/surfsense_backend/tests/integration/test_editor_routes.py deleted file mode 100644 index 382d4b4de..000000000 --- a/surfsense_backend/tests/integration/test_editor_routes.py +++ /dev/null @@ -1,175 +0,0 @@ -"""Phase A contract: editor read paths serve source_markdown and never -reconstruct or mutate the body from chunks.""" - -import pytest -import pytest_asyncio -from fastapi import HTTPException -from sqlalchemy.ext.asyncio import AsyncSession - -from app.db import ( - Chunk, - Document, - DocumentStatus, - DocumentType, - SearchSpace, - User, -) - -pytestmark = pytest.mark.integration - - -async def _make_document( - session: AsyncSession, - search_space: SearchSpace, - user: User, - *, - document_type: DocumentType = DocumentType.FILE, - source_markdown: str | None = "# Title\n\nBody line.", - content: str = "Body line.", - status: dict | None = None, -) -> Document: - doc = Document( - title="Doc", - document_type=document_type, - document_metadata={}, - content=content, - content_hash="hash-001", - source_markdown=source_markdown, - search_space_id=search_space.id, - created_by_id=user.id, - status=status or DocumentStatus.ready(), - ) - session.add(doc) - await session.flush() - return doc - - -async def _add_chunks(session: AsyncSession, document: Document, texts: list[str]): - for position, text in enumerate(texts): - session.add(Chunk(content=text, position=position, document_id=document.id)) - await session.flush() - - -@pytest_asyncio.fixture -async def make_document(db_session, db_search_space, db_user): - async def _make(**overrides): - return await _make_document(db_session, db_search_space, db_user, **overrides) - - return _make - - -class TestGetEditorContent: - async def test_returns_source_markdown_verbatim( - self, db_session, db_search_space, db_user, make_document - ): - from app.routes.editor_routes import get_editor_content - - doc = await make_document(source_markdown="# Real\n\nCanonical body.") - - result = await get_editor_content( - db_search_space.id, doc.id, session=db_session, user=db_user - ) - - assert result["source_markdown"] == "# Real\n\nCanonical body." - - async def test_does_not_reconstruct_body_from_chunks( - self, db_session, db_search_space, db_user, make_document - ): - """A ready document without source_markdown must not be rebuilt from chunks.""" - from app.routes.editor_routes import get_editor_content - - doc = await make_document(source_markdown=None) - await _add_chunks(db_session, doc, ["chunk one", "chunk two"]) - - with pytest.raises(HTTPException) as exc: - await get_editor_content( - db_search_space.id, doc.id, session=db_session, user=db_user - ) - - assert exc.value.status_code == 400 - await db_session.refresh(doc) - assert doc.source_markdown is None - - async def test_processing_document_without_body_returns_409( - self, db_session, db_search_space, db_user, make_document - ): - from app.routes.editor_routes import get_editor_content - - doc = await make_document( - source_markdown=None, status=DocumentStatus.processing() - ) - - with pytest.raises(HTTPException) as exc: - await get_editor_content( - db_search_space.id, doc.id, session=db_session, user=db_user - ) - - assert exc.value.status_code == 409 - - async def test_failed_document_without_body_returns_422( - self, db_session, db_search_space, db_user, make_document - ): - from app.routes.editor_routes import get_editor_content - - doc = await make_document( - source_markdown=None, status=DocumentStatus.failed("boom") - ) - - with pytest.raises(HTTPException) as exc: - await get_editor_content( - db_search_space.id, doc.id, session=db_session, user=db_user - ) - - assert exc.value.status_code == 422 - - async def test_empty_note_initializes_to_empty_markdown( - self, db_session, db_search_space, db_user, make_document - ): - from app.routes.editor_routes import get_editor_content - - doc = await make_document(document_type=DocumentType.NOTE, source_markdown=None) - - result = await get_editor_content( - db_search_space.id, doc.id, session=db_session, user=db_user - ) - - assert result["source_markdown"] == "" - - -class TestDownloadMarkdown: - async def test_does_not_reconstruct_body_from_chunks( - self, db_session, db_search_space, db_user, make_document - ): - from app.routes.editor_routes import download_document_markdown - - doc = await make_document(source_markdown=None) - await _add_chunks(db_session, doc, ["chunk one", "chunk two"]) - - with pytest.raises(HTTPException) as exc: - await download_document_markdown( - db_search_space.id, doc.id, session=db_session, user=db_user - ) - - assert exc.value.status_code == 400 - - -class TestExportDocument: - async def test_does_not_reconstruct_body_from_chunks( - self, db_session, db_search_space, db_user, make_document - ): - from app.routes.editor_routes import export_document - from app.routes.reports_routes import ExportFormat - - doc = await make_document(source_markdown=None) - await _add_chunks(db_session, doc, ["chunk one", "chunk two"]) - - with pytest.raises(HTTPException) as exc: - await export_document( - db_search_space.id, - doc.id, - format=ExportFormat.PLAIN, - session=db_session, - user=db_user, - ) - - assert exc.value.status_code == 400 diff --git a/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py b/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py deleted file mode 100644 index e068792b1..000000000 --- a/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py +++ /dev/null @@ -1,87 +0,0 @@ -"""Unit tests for search_knowledge_base hit rendering. - -The tool must surface the passage that actually matched (the RRF-ranked -chunk), not the top of the document, and annotate it with its line range -when the chunk carries a char span. -""" - -from __future__ import annotations - -import pytest - -from app.agents.chat.multi_agent_chat.main_agent.tools.search_knowledge_base import ( - _format_hits, -) - -pytestmark = pytest.mark.unit - -_BODY = "Intro paragraph.\n\nMatched passage here.\n\nClosing paragraph." - - -def _hit() -> dict: - intro = "Intro paragraph." - matched = "Matched passage here." - matched_start = _BODY.index(matched) - return { - "document": {"id": 7, "title": "note.md", "document_type": "NOTE"}, - "score": 0.42, - "content": _BODY.replace("\n\n", "\n\n"), - "matched_chunk_ids": [102], - "chunks": [ - { - "chunk_id": 101, - "content": intro, - "start_char": 0, - "end_char": len(intro), - }, - { - "chunk_id": 102, - "content": matched, - "start_char": matched_start, - "end_char": matched_start + len(matched), - }, - ], - } - - -def test_renders_matched_passage_not_top_of_doc() -> None: - out = _format_hits([_hit()], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q") - assert "Matched passage here." in out - # The intro chunk was not matched, so it must not be shown as the snippet. - assert "Intro paragraph." not in out - - -def test_emits_copyable_line_citation_token_when_spans_present() -> None: - out = _format_hits([_hit()], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q") - # "Matched passage here." sits on line 3 of the body; the hit must surface - # a ready-to-copy token so the agent can cite without a separate read. - assert "[citation:d7#L3-3]" in out - - -def test_header_includes_document_id() -> None: - out = _format_hits([_hit()], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q") - assert "id=7" in out - - -def test_omits_citation_token_when_spans_absent() -> None: - hit = _hit() - for chunk in hit["chunks"]: - chunk["start_char"] = None - chunk["end_char"] = None - out = _format_hits([hit], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q") - assert "Matched passage here." in out - # No concrete, copyable token for this document without spans (the closing - # instruction's placeholder template doesn't count). - assert "[citation:d7#L" not in out - - -def test_falls_back_to_content_when_no_matched_ids() -> None: - hit = _hit() - hit["matched_chunk_ids"] = [] - out = _format_hits([hit], paths={7: "/documents/note.md"}, bodies={7: _BODY}, query="q") - assert "Intro paragraph." in out - - -def test_no_results_message() -> None: - out = _format_hits([], paths={}, bodies={}, query="missing") - assert "No knowledge-base matches" in out diff --git a/surfsense_backend/tests/unit/indexing_pipeline/test_chunk_markdown_with_spans.py b/surfsense_backend/tests/unit/indexing_pipeline/test_chunk_markdown_with_spans.py deleted file mode 100644 index 0ff155c3b..000000000 --- a/surfsense_backend/tests/unit/indexing_pipeline/test_chunk_markdown_with_spans.py +++ /dev/null @@ -1,72 +0,0 @@ -"""Span-aware chunking contract: slices form a lossless, contiguous partition -of the markdown, and every slice's char span addresses its own text.""" - -import pytest - -from app.indexing_pipeline.document_chunker import chunk_markdown_with_spans - -pytestmark = pytest.mark.unit - - -def _assert_lossless_partition(md: str, slices) -> None: - assert "".join(s.text for s in slices) == md - - cursor = 0 - for s in slices: - assert s.start_char == cursor, "slices must be contiguous" - assert s.end_char >= s.start_char - assert md[s.start_char : s.end_char] == s.text, "span must address slice text" - cursor = s.end_char - assert cursor == len(md) - - -def test_prose_partition_and_spans(): - md = ( - "# Title\n\n" - + "First paragraph with several words here. " * 20 - + "\n\nSecond section with more prose to force multiple chunks. " * 20 - ) - - slices = chunk_markdown_with_spans(md) - - assert len(slices) > 1 - _assert_lossless_partition(md, slices) - - -def test_table_kept_whole_with_exact_span(): - table = "| a | b |\n| - | - |\n| 1 | 2 |\n" - md = f"Intro prose before the table.\n{table}\nClosing prose after." - - slices = chunk_markdown_with_spans(md) - - _assert_lossless_partition(md, slices) - table_slices = [s for s in slices if s.text.lstrip().startswith("|")] - assert any("| 1 | 2 |" in s.text for s in table_slices) - for s in table_slices: - assert "| a | b |" in s.text and "| 1 | 2 |" in s.text - - -def test_table_at_eof_without_trailing_newline_stays_whole(): - md = "Intro.\n| a | b |\n| - | - |\n| 1 | 2 |" - - slices = chunk_markdown_with_spans(md) - - _assert_lossless_partition(md, slices) - table_slices = [s for s in slices if "| 1 | 2 |" in s.text] - assert len(table_slices) == 1 - assert "| a | b |" in table_slices[0].text - - -def test_code_chunker_partition_and_spans(): - code = "\n\n".join( - f"def func_{i}(x):\n total = x + {i}\n return total" for i in range(40) - ) - - slices = chunk_markdown_with_spans(code, use_code_chunker=True) - - assert len(slices) >= 1 - _assert_lossless_partition(code, slices) - - -def test_empty_markdown_yields_no_slices(): - assert chunk_markdown_with_spans("") == [] diff --git a/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py b/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py index 8c4936648..feb7bbc52 100644 --- a/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py +++ b/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py @@ -37,9 +37,12 @@ def _make_orm_doc(connector_doc, doc_id): async def test_index_calls_embed_and_chunk_via_to_thread( pipeline, make_connector_document, monkeypatch ): - """index() runs the chunker and embed_texts via asyncio.to_thread, not blocking the loop.""" - from app.indexing_pipeline.document_chunker import ChunkSlice + """index() runs the chunker and embed_texts via asyncio.to_thread, not blocking the loop. + Routing between ``chunk_text`` (code path) and ``chunk_text_hybrid`` (default + path, see issue #1334) is verified separately in + ``test_non_code_documents_use_hybrid_chunker``. + """ to_thread_calls = [] original_to_thread = asyncio.to_thread @@ -48,11 +51,11 @@ async def test_index_calls_embed_and_chunk_via_to_thread( return await original_to_thread(func, *args, **kwargs) monkeypatch.setattr(asyncio, "to_thread", tracking_to_thread) - mock_chunker = MagicMock(return_value=[ChunkSlice("chunk1", 0, 6)]) - mock_chunker.__name__ = "chunk_markdown_with_spans" + mock_chunk_hybrid = MagicMock(return_value=["chunk1"]) + mock_chunk_hybrid.__name__ = "chunk_text_hybrid" monkeypatch.setattr( - "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans", - mock_chunker, + "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", + mock_chunk_hybrid, ) mock_embed = MagicMock( side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts] @@ -87,25 +90,34 @@ async def test_index_calls_embed_and_chunk_via_to_thread( await pipeline.index(document, connector_doc) - assert "chunk_markdown_with_spans" in to_thread_calls + # Either chunker entry point satisfies the "chunking runs off the event + # loop" contract this test guards. Routing between the two is verified + # in test_non_code_documents_use_hybrid_chunker. + assert {"chunk_text", "chunk_text_hybrid"} & set(to_thread_calls) assert "embed_texts" in to_thread_calls assert document.status == DocumentStatus.ready() -async def test_non_code_documents_use_prose_chunker( +async def test_non_code_documents_use_hybrid_chunker( pipeline, make_connector_document, monkeypatch ): - """Non-code documents chunk with use_code_chunker=False (issue #1334). + """Non-code documents route through ``chunk_text_hybrid`` (issue #1334). - The table-aware prose path keeps Markdown tables intact; only documents - flagged with ``should_use_code_chunker=True`` request the code chunker. + The hybrid chunker preserves Markdown table integrity by avoiding splits + mid-row. Only documents flagged with ``should_use_code_chunker=True`` + should take the ``chunk_text`` path. """ - from app.indexing_pipeline.document_chunker import ChunkSlice - - mock_chunker = MagicMock(return_value=[ChunkSlice("chunk1", 0, 6)]) + mock_chunk_hybrid = MagicMock(return_value=["chunk1"]) + mock_chunk_hybrid.__name__ = "chunk_text_hybrid" monkeypatch.setattr( - "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans", - mock_chunker, + "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", + mock_chunk_hybrid, + ) + mock_chunk_code = MagicMock(return_value=["chunk1"]) + mock_chunk_code.__name__ = "chunk_text" + monkeypatch.setattr( + "app.indexing_pipeline.cache.cached_indexing.chunk_text", + mock_chunk_code, ) monkeypatch.setattr( "app.indexing_pipeline.cache.cached_indexing.embed_texts", @@ -137,49 +149,8 @@ async def test_non_code_documents_use_prose_chunker( await pipeline.index(document, connector_doc) - mock_chunker.assert_called_once() - assert mock_chunker.call_args.args[1] is False - - -async def test_code_documents_request_code_chunker( - pipeline, make_connector_document, monkeypatch -): - """Code-flagged documents forward use_code_chunker=True to the chunker.""" - from app.indexing_pipeline.document_chunker import ChunkSlice - - mock_chunker = MagicMock(return_value=[ChunkSlice("chunk1", 0, 6)]) - monkeypatch.setattr( - "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans", - mock_chunker, - ) - monkeypatch.setattr( - "app.indexing_pipeline.cache.cached_indexing.embed_texts", - MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]), - ) - monkeypatch.setattr(pipeline, "_load_existing_chunks", AsyncMock(return_value=[])) - - async def _noop_persist(_session, doc, *_args, **_kwargs): - doc.status = DocumentStatus.ready() - - monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.persist_scratch_index", - _noop_persist, - ) - - connector_doc = make_connector_document( - document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR, - unique_id="repo-1", - search_space_id=1, - should_use_code_chunker=True, - ) - document = MagicMock(spec=Document) - document.id = 1 - document.status = DocumentStatus.pending() - - await pipeline.index(document, connector_doc) - - mock_chunker.assert_called_once() - assert mock_chunker.call_args.args[1] is True + mock_chunk_hybrid.assert_called_once() + mock_chunk_code.assert_not_called() def _mock_session_factory(orm_docs_by_id): diff --git a/surfsense_backend/tests/unit/middleware/test_b_filesystem_rm_rmdir_cloud.py b/surfsense_backend/tests/unit/middleware/test_b_filesystem_rm_rmdir_cloud.py index 27653c544..898ec3765 100644 --- a/surfsense_backend/tests/unit/middleware/test_b_filesystem_rm_rmdir_cloud.py +++ b/surfsense_backend/tests/unit/middleware/test_b_filesystem_rm_rmdir_cloud.py @@ -71,7 +71,7 @@ class _KBBackendStub(KBPostgresBackend): def __init__(self, *, children=None, file_data=None) -> None: self.als_info = AsyncMock(return_value=children or []) self._load_file_data = AsyncMock( - return_value=(file_data, 17, None) if file_data is not None else None + return_value=(file_data, 17) if file_data is not None else None ) diff --git a/surfsense_backend/tests/unit/middleware/test_kb_persistence_filesystem_parity.py b/surfsense_backend/tests/unit/middleware/test_kb_persistence_filesystem_parity.py index 3968eb090..e78db1e76 100644 --- a/surfsense_backend/tests/unit/middleware/test_kb_persistence_filesystem_parity.py +++ b/surfsense_backend/tests/unit/middleware/test_kb_persistence_filesystem_parity.py @@ -69,25 +69,13 @@ class _FakeSession: @pytest.fixture(autouse=True) def _stub_embeddings_and_chunks(monkeypatch: pytest.MonkeyPatch) -> None: - """Avoid loading the embedding model in unit tests. - - Mirrors the legacy stub: one chunk spanning the whole content, with a - zero summary/chunk vector, routed through the shared span builder. - """ - from app.indexing_pipeline.document_chunker import ChunkSlice - - async def _fake_build_chunk_embeddings(content: str, *, use_code_chunker: bool): - summary = np.zeros(8, dtype=np.float32) - pairs = ( - [(ChunkSlice(content, 0, len(content)), np.zeros(8, dtype=np.float32))] - if content - else [] - ) - return summary, pairs - + """Avoid loading the embedding model in unit tests.""" monkeypatch.setattr( - kb_persistence, "build_chunk_embeddings", _fake_build_chunk_embeddings + kb_persistence, + "embed_texts", + lambda texts: [np.zeros(8, dtype=np.float32) for _ in texts], ) + monkeypatch.setattr(kb_persistence, "chunk_text", lambda content: [content]) @pytest.mark.asyncio diff --git a/surfsense_backend/tests/unit/middleware/test_numbered_document.py b/surfsense_backend/tests/unit/middleware/test_numbered_document.py deleted file mode 100644 index 955c619b5..000000000 --- a/surfsense_backend/tests/unit/middleware/test_numbered_document.py +++ /dev/null @@ -1,92 +0,0 @@ -"""Unit tests for the numbered-document read preamble.""" - -import pytest - -from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.numbered_document import ( - build_read_preamble, - compute_matched_line_ranges, -) - -pytestmark = pytest.mark.unit - - -_BODY = "alpha\nbravo\ncharlie\ndelta" - - -class TestComputeMatchedLineRanges: - def test_maps_matched_chunk_spans_to_line_ranges(self): - chunks = [(1, 0, 12), (2, 12, len(_BODY))] - ranges = compute_matched_line_ranges(_BODY, chunks, {2}) - assert ranges == [(3, 4)] - - def test_includes_only_matched_chunks(self): - chunks = [(1, 0, 5), (2, 6, 11)] - ranges = compute_matched_line_ranges(_BODY, chunks, {1}) - assert ranges == [(1, 1)] - - def test_skips_chunks_without_spans(self): - chunks = [(1, None, None)] - ranges = compute_matched_line_ranges(_BODY, chunks, {1}) - assert ranges == [] - - def test_sorted_and_deduplicated(self): - chunks = [(1, 12, len(_BODY)), (2, 0, 5), (3, 0, 5)] - ranges = compute_matched_line_ranges(_BODY, chunks, {1, 2, 3}) - assert ranges == [(1, 1), (3, 4)] - - -class TestBuildReadPreamble: - def test_contains_document_metadata(self): - preamble = build_read_preamble( - document_id=42, - document_type="FILE", - title="Test Doc", - url="https://example.com", - matched_line_ranges=[], - ) - assert "42" in preamble - assert "FILE" in preamble - assert "Test Doc" in preamble - assert "https://example.com" in preamble - - def test_citation_hint_uses_document_id(self): - preamble = build_read_preamble( - document_id=42, - document_type="FILE", - title="Test Doc", - url="", - matched_line_ranges=[], - ) - assert "[citation:d42#L" in preamble - - def test_lists_matched_line_ranges(self): - preamble = build_read_preamble( - document_id=7, - document_type="NOTE", - title="Notes", - url="", - matched_line_ranges=[(12, 18), (40, 40)], - ) - assert "" in preamble - assert "12-18" in preamble - assert "40" in preamble - - def test_omits_matched_lines_block_when_empty(self): - preamble = build_read_preamble( - document_id=7, - document_type="NOTE", - title="Notes", - url="", - matched_line_ranges=[], - ) - assert "" not in preamble - - def test_ends_with_trailing_newline_so_body_follows_cleanly(self): - preamble = build_read_preamble( - document_id=1, - document_type="FILE", - title="t", - url="", - matched_line_ranges=[], - ) - assert preamble.endswith("\n") diff --git a/surfsense_backend/tests/unit/utils/test_async_retry.py b/surfsense_backend/tests/unit/utils/test_async_retry.py new file mode 100644 index 000000000..3e60abe76 --- /dev/null +++ b/surfsense_backend/tests/unit/utils/test_async_retry.py @@ -0,0 +1,162 @@ +"""Tests for async_retry utilities.""" + +import httpx +import pytest + +from app.connectors.exceptions import ( + ConnectorAPIError, + ConnectorAuthError, + ConnectorError, + ConnectorRateLimitError, + ConnectorTimeoutError, +) +from app.utils.async_retry import _is_retryable, raise_for_status + +pytestmark = pytest.mark.unit + + +def make_response( + status_code: int, + *, + headers: dict[str, str] | None = None, + json_body=None, + text_body: str = "", +): + kwargs = { + "status_code": status_code, + "headers": headers, + "request": httpx.Request("GET", "https://x"), + } + + if json_body is not None: + kwargs["json"] = json_body + else: + kwargs["text"] = text_body + + return httpx.Response(**kwargs) + + +def test_raise_for_status_does_not_raise_for_success(): + response = make_response(200) + + raise_for_status(response) + + +@pytest.mark.parametrize( + ("retry_after_header", "expected"), + [ + ("5", 5.0), + (None, None), + ("abc", None), + ], +) +def test_raise_for_status_429(retry_after_header, expected): + headers = {} + if retry_after_header is not None: + headers["Retry-After"] = retry_after_header + + response = make_response( + 429, + headers=headers, + json_body={"detail": "rate limited"}, + ) + + with pytest.raises(ConnectorRateLimitError) as exc_info: + raise_for_status(response) + + exc = exc_info.value + assert exc.retry_after == expected + assert exc.response_body == {"detail": "rate limited"} + + +@pytest.mark.parametrize("status_code", [401, 403]) +def test_raise_for_status_auth_errors(status_code): + response = make_response( + status_code, + json_body={"error": "unauthorized"}, + ) + + with pytest.raises(ConnectorAuthError) as exc_info: + raise_for_status(response) + + exc = exc_info.value + assert exc.status_code == status_code + assert exc.response_body == {"error": "unauthorized"} + + +def test_raise_for_status_gateway_timeout(): + response = make_response( + 504, + json_body={"error": "timeout"}, + ) + + with pytest.raises(ConnectorTimeoutError): + raise_for_status(response) + + +@pytest.mark.parametrize("status_code", [500, 502]) +def test_raise_for_status_server_errors(status_code): + response = make_response( + status_code, + json_body={"error": "server"}, + ) + + with pytest.raises(ConnectorAPIError) as exc_info: + raise_for_status(response) + + assert exc_info.value.status_code == status_code + + +@pytest.mark.parametrize("status_code", [400, 404]) +def test_raise_for_status_client_errors(status_code): + response = make_response( + status_code, + json_body={"error": "client"}, + ) + + with pytest.raises(ConnectorAPIError) as exc_info: + raise_for_status(response) + + assert exc_info.value.status_code == status_code + + +def test_raise_for_status_uses_text_when_json_parsing_fails(): + response = make_response( + 500, + text_body="Internal server error", + ) + + with pytest.raises(ConnectorAPIError) as exc_info: + raise_for_status(response) + + assert exc_info.value.response_body == "Internal server error" + + +def test_connector_error_retryable_false(): + exc = ConnectorError("boom") + + assert _is_retryable(exc) is False + + +def test_rate_limit_error_is_retryable(): + exc = ConnectorRateLimitError() + + assert _is_retryable(exc) is True + + +def test_timeout_exception_is_retryable(): + exc = httpx.TimeoutException("timeout") + + assert _is_retryable(exc) is True + + +def test_connect_error_is_retryable(): + exc = httpx.ConnectError("connection failed") + + assert _is_retryable(exc) is True + + +def test_unrelated_exception_is_not_retryable(): + exc = ValueError("boom") + + assert _is_retryable(exc) is False diff --git a/surfsense_backend/tests/unit/utils/test_content_utils.py b/surfsense_backend/tests/unit/utils/test_content_utils.py new file mode 100644 index 000000000..db898f294 --- /dev/null +++ b/surfsense_backend/tests/unit/utils/test_content_utils.py @@ -0,0 +1,293 @@ +"""Tests for strip_markdown_fences() and extract_text_content() in +app/utils/content_utils.py. + +Out of scope: bootstrap_history_from_db() — async + DB, belongs in +integration tests. + +Run: + uv run pytest -m unit tests/unit/utils/test_content_utils.py +""" + +import pytest + +pytestmark = pytest.mark.unit + + +# =========================================================================== +# strip_markdown_fences() +# =========================================================================== + + +class TestStripMarkdownFences: + """Tests for strip_markdown_fences(text: str) -> str. + + Regex: r"^```(?:\\w+)?\\s*\\n(.*?)```\\s*$" (re.DOTALL) + Called on text.strip() — so surrounding whitespace is handled before + the regex runs. The captured group is also .strip()-ped before return. + """ + + # ------------------------------------------------------------------ + # Fenced with a language tag + # ------------------------------------------------------------------ + + def test_json_fence_returns_inner_content(self): + from app.utils.content_utils import strip_markdown_fences + + text = '```json\n{"key": "value"}\n```' + assert strip_markdown_fences(text) == '{"key": "value"}' + + def test_python_fence_returns_inner_content(self): + from app.utils.content_utils import strip_markdown_fences + + text = "```python\ndef hello():\n return 'hi'\n```" + assert strip_markdown_fences(text) == "def hello():\n return 'hi'" + + def test_yaml_fence_returns_inner_content(self): + from app.utils.content_utils import strip_markdown_fences + + text = "```yaml\nkey: value\n```" + assert strip_markdown_fences(text) == "key: value" + + def test_sql_multiline_fence_returns_inner_content(self): + from app.utils.content_utils import strip_markdown_fences + + text = "```sql\nSELECT *\nFROM users\nWHERE id = 1;\n```" + assert strip_markdown_fences(text) == "SELECT *\nFROM users\nWHERE id = 1;" + + # ------------------------------------------------------------------ + # Fenced without a language tag + # ------------------------------------------------------------------ + + def test_no_lang_tag_single_line_returns_inner_content(self): + from app.utils.content_utils import strip_markdown_fences + + text = "```\nhello world\n```" + assert strip_markdown_fences(text) == "hello world" + + def test_no_lang_tag_multiline_returns_inner_content(self): + from app.utils.content_utils import strip_markdown_fences + + text = "```\nline one\nline two\n```" + assert strip_markdown_fences(text) == "line one\nline two" + + # ------------------------------------------------------------------ + # Plain text — no fences → returned unchanged + # ------------------------------------------------------------------ + + def test_plain_text_returned_unchanged(self): + from app.utils.content_utils import strip_markdown_fences + + text = "just plain text with no fences" + assert strip_markdown_fences(text) == text + + def test_plain_text_with_newlines_returned_unchanged(self): + from app.utils.content_utils import strip_markdown_fences + + text = "line one\nline two\nline three" + assert strip_markdown_fences(text) == text + + def test_empty_string_returned_unchanged(self): + from app.utils.content_utils import strip_markdown_fences + + assert strip_markdown_fences("") == "" + + # ------------------------------------------------------------------ + # Surrounding whitespace handling + # The function calls text.strip() before matching, so leading/trailing + # whitespace outside the fence is consumed. The captured group is also + # .strip()-ped, so whitespace between the fence markers and content is + # removed too. + # ------------------------------------------------------------------ + + def test_leading_whitespace_around_fence_stripped(self): + from app.utils.content_utils import strip_markdown_fences + + text = " ```json\n{}\n```" + assert strip_markdown_fences(text) == "{}" + + def test_trailing_whitespace_around_fence_stripped(self): + from app.utils.content_utils import strip_markdown_fences + + text = "```json\n{}\n``` " + assert strip_markdown_fences(text) == "{}" + + def test_surrounding_newlines_stripped(self): + from app.utils.content_utils import strip_markdown_fences + + text = '\n\n```json\n{"a": 1}\n```\n\n' + assert strip_markdown_fences(text) == '{"a": 1}' + + def test_inner_indentation_preserved(self): + """The captured group is .strip()-ped, so leading whitespace on the + *first* line is removed, but indentation on subsequent lines is kept.""" + from app.utils.content_utils import strip_markdown_fences + + text = "```\n indented line\n deeper indent\n```" + result = strip_markdown_fences(text) + # .strip() removes the leading spaces from the first captured line + assert "indented line" in result + # indentation on the second line is preserved + assert " deeper indent" in result + + +# =========================================================================== +# extract_text_content() +# =========================================================================== + + +class TestExtractTextContent: + """Tests for extract_text_content(content: str | dict | list) -> str.""" + + # ------------------------------------------------------------------ + # str input → returned as-is + # ------------------------------------------------------------------ + + def test_str_input_returned_as_is(self): + from app.utils.content_utils import extract_text_content + + assert extract_text_content("hello world") == "hello world" + + def test_str_empty_returned_as_is(self): + from app.utils.content_utils import extract_text_content + + assert extract_text_content("") == "" + + def test_str_with_internal_whitespace_returned_as_is(self): + from app.utils.content_utils import extract_text_content + + assert extract_text_content(" spaced ") == " spaced " + + # ------------------------------------------------------------------ + # dict with "text" key → return content["text"] + # ------------------------------------------------------------------ + + def test_dict_with_text_key_returns_its_value(self): + from app.utils.content_utils import extract_text_content + + assert extract_text_content({"text": "from dict"}) == "from dict" + + def test_dict_with_text_key_empty_value(self): + from app.utils.content_utils import extract_text_content + + assert extract_text_content({"text": ""}) == "" + + def test_dict_with_text_key_ignores_other_keys(self): + from app.utils.content_utils import extract_text_content + + d = {"text": "important", "role": "assistant", "extra": 99} + assert extract_text_content(d) == "important" + + # ------------------------------------------------------------------ + # dict without "text" key → str(dict) + # ------------------------------------------------------------------ + + def test_dict_without_text_key_returns_str_repr(self): + from app.utils.content_utils import extract_text_content + + d = {"role": "assistant", "value": 42} + assert extract_text_content(d) == str(d) + + def test_empty_dict_returns_str_repr(self): + from app.utils.content_utils import extract_text_content + + assert extract_text_content({}) == str({}) + + # ------------------------------------------------------------------ + # list of parts — text dicts and plain strings + # Parts are joined with "\n" (per implementation: "\n".join(texts)) + # ------------------------------------------------------------------ + + def test_list_text_type_parts_joined_with_newline(self): + from app.utils.content_utils import extract_text_content + + parts = [ + {"type": "text", "text": "Hello"}, + {"type": "text", "text": "world"}, + ] + assert extract_text_content(parts) == "Hello\nworld" + + def test_list_plain_strings_joined_with_newline(self): + from app.utils.content_utils import extract_text_content + + parts = ["foo", "bar"] + assert extract_text_content(parts) == "foo\nbar" + + def test_list_mixed_text_dicts_and_plain_strings(self): + from app.utils.content_utils import extract_text_content + + parts = [ + {"type": "text", "text": "Hello"}, + "plain", + {"type": "text", "text": "world"}, + ] + result = extract_text_content(parts) + assert "Hello" in result + assert "plain" in result + assert "world" in result + + def test_list_non_text_type_parts_ignored(self): + """tool_use, image, and other non-text blocks must not leak into output.""" + from app.utils.content_utils import extract_text_content + + parts = [ + {"type": "tool_use", "id": "abc", "name": "search_kb"}, + {"type": "text", "text": "visible text"}, + {"type": "image", "source": {"url": "https://example.com/img.png"}}, + ] + result = extract_text_content(parts) + assert result == "visible text" + assert "tool_use" not in result + assert "search_kb" not in result + assert "image" not in result + + def test_list_only_non_text_parts_returns_empty_string(self): + from app.utils.content_utils import extract_text_content + + parts = [ + {"type": "tool_use", "id": "x"}, + {"type": "image", "source": {}}, + ] + assert extract_text_content(parts) == "" + + def test_list_single_text_part(self): + from app.utils.content_utils import extract_text_content + + parts = [{"type": "text", "text": "only me"}] + assert extract_text_content(parts) == "only me" + + def test_list_text_part_missing_text_key_contributes_empty_string(self): + """part.get("text", "") — a text-typed dict with no "text" key gives "".""" + from app.utils.content_utils import extract_text_content + + parts = [{"type": "text"}, {"type": "text", "text": "after"}] + result = extract_text_content(parts) + # both parts collected; joined → "\nafter" or "after" depending on strip + assert "after" in result + + # ------------------------------------------------------------------ + # Empty list → empty string + # ------------------------------------------------------------------ + + def test_empty_list_returns_empty_string(self): + from app.utils.content_utils import extract_text_content + + assert extract_text_content([]) == "" + + # ------------------------------------------------------------------ + # Unsupported types → empty string (the final bare `return ""`) + # ------------------------------------------------------------------ + + def test_none_returns_empty_string(self): + from app.utils.content_utils import extract_text_content + + assert extract_text_content(None) == "" + + def test_integer_returns_empty_string(self): + from app.utils.content_utils import extract_text_content + + assert extract_text_content(42) == "" + + def test_boolean_returns_empty_string(self): + from app.utils.content_utils import extract_text_content + + assert extract_text_content(True) == "" \ No newline at end of file diff --git a/surfsense_backend/tests/unit/utils/test_text_spans.py b/surfsense_backend/tests/unit/utils/test_text_spans.py deleted file mode 100644 index d70418ea5..000000000 --- a/surfsense_backend/tests/unit/utils/test_text_spans.py +++ /dev/null @@ -1,39 +0,0 @@ -"""Unit tests for char-span -> line-range conversion.""" - -from __future__ import annotations - -import pytest - -from app.utils.text_spans import char_span_to_line_range - -pytestmark = pytest.mark.unit - -_TEXT = "line1\nline2\nline3" - - -def test_single_line_span() -> None: - start = _TEXT.index("line2") - assert char_span_to_line_range(_TEXT, start, start + len("line2")) == (2, 2) - - -def test_first_line_span() -> None: - assert char_span_to_line_range(_TEXT, 0, len("line1")) == (1, 1) - - -def test_last_line_span() -> None: - start = _TEXT.index("line3") - assert char_span_to_line_range(_TEXT, start, len(_TEXT)) == (3, 3) - - -def test_multi_line_span() -> None: - # "line1\nline2" spans lines 1-2. - assert char_span_to_line_range(_TEXT, 0, _TEXT.index("line2") + 5) == (1, 2) - - -def test_empty_span_resolves_to_its_line() -> None: - start = _TEXT.index("line2") - assert char_span_to_line_range(_TEXT, start, start) == (2, 2) - - -def test_offsets_clamped_to_text_bounds() -> None: - assert char_span_to_line_range(_TEXT, -5, 10_000) == (1, 3) diff --git a/surfsense_web/.env.example b/surfsense_web/.env.example index 11646c948..7d03cf498 100644 --- a/surfsense_web/.env.example +++ b/surfsense_web/.env.example @@ -14,7 +14,10 @@ SURFSENSE_BACKEND_INTERNAL_URL=http://backend:8000 # ───────────────────────────────────────────────────────────────────────────── # Runtime configuration (read at runtime by the server, no rebuild needed) # ───────────────────────────────────────────────────────────────────────────── - +# Configure these plain variables for runtime behavior. They are read by server +# code when the app starts/serves requests, so changing them requires restarting +# the web process but not rebuilding the frontend bundle. +# # Authentication method: LOCAL (email/password) or GOOGLE (OAuth). AUTH_TYPE=LOCAL # Document parsing backend: DOCLING, LLAMACLOUD, etc. @@ -22,16 +25,6 @@ ETL_SERVICE=DOCLING # Deployment mode: self-hosted or cloud. DEPLOYMENT_MODE=self-hosted -# ───────────────────────────────────────────────────────────────────────────── -# Build-time fallbacks for packaged clients (e.g. Electron) without a runtime -# config provider. Optional; Docker reads the plain runtime vars above first. -# ───────────────────────────────────────────────────────────────────────────── -# NEXT_PUBLIC_AUTH_TYPE=GOOGLE -# NEXT_PUBLIC_ETL_SERVICE=DOCLING -# NEXT_PUBLIC_DEPLOYMENT_MODE=self-hosted -# Overrides the app version shown in the UI (defaults to package.json version). -# NEXT_PUBLIC_APP_VERSION= - # ───────────────────────────────────────────────────────────────────────────── # Database (Contact Form, optional) # ───────────────────────────────────────────────────────────────────────────── @@ -72,3 +65,20 @@ NEXT_PUBLIC_GOOGLE_ADSENSE_SLOT_FREE_HUB_BEFORE_FAQ= # ───────────────────────────────────────────────────────────────────────────── NEXT_PUBLIC_GLOBAL_ANNOUNCEMENT_ENABLED=false NEXT_PUBLIC_GLOBAL_ANNOUNCEMENT_MESSAGE= + +# ───────────────────────────────────────────────────────────────────────────── +# Internal build-time fallbacks +# ───────────────────────────────────────────────────────────────────────────── +# +# Most deployments should leave these unset. +# +# These are only for SurfSense-managed production/cloud builds or packaged +# clients that do not have the normal server runtime config available. +# +# NEXT_PUBLIC_* values are embedded into the browser bundle during `next build`. +# Changing them after the bundle is built has no effect. + +# NEXT_PUBLIC_AUTH_TYPE=GOOGLE +# NEXT_PUBLIC_ETL_SERVICE=DOCLING +# NEXT_PUBLIC_DEPLOYMENT_MODE=self-hosted +# NEXT_PUBLIC_APP_VERSION= \ No newline at end of file diff --git a/surfsense_web/app/globals.css b/surfsense_web/app/globals.css index 6950fd284..4a29edfa6 100644 --- a/surfsense_web/app/globals.css +++ b/surfsense_web/app/globals.css @@ -58,6 +58,11 @@ --highlight: oklch(0.852 0.199 91.936); } +html[data-surfsense-auth-type="GOOGLE"] .runtime-auth-local, +html[data-surfsense-auth-type="LOCAL"] .runtime-auth-google { + display: none; +} + .dark { --background: oklch(0.145 0 0); --foreground: oklch(0.985 0 0); @@ -270,12 +275,6 @@ button { contain-intrinsic-size: 0 40px; } -/* Monaco whole-line highlight for a cited source span (Phase E). */ -.citation-line-highlight { - background-color: color-mix(in srgb, var(--primary) 16%, transparent); - box-shadow: inset 2px 0 0 0 var(--primary); -} - @source "../node_modules/@llamaindex/chat-ui/**/*.{ts,tsx}"; @source "../node_modules/streamdown/dist/*.js"; @source "../node_modules/@streamdown/code/dist/*.js"; diff --git a/surfsense_web/app/layout.tsx b/surfsense_web/app/layout.tsx index 1e9c9eebe..46182f40e 100644 --- a/surfsense_web/app/layout.tsx +++ b/surfsense_web/app/layout.tsx @@ -2,6 +2,7 @@ import type { Metadata, Viewport } from "next"; import "./globals.css"; import { RootProvider } from "fumadocs-ui/provider/next"; import { Roboto } from "next/font/google"; +import Script from "next/script"; import { AnnouncementToastProvider } from "@/components/announcements/AnnouncementToastProvider"; import { DesktopUpdateToast } from "@/components/desktop/desktop-update-toast"; import { GlobalLoadingProvider } from "@/components/providers/GlobalLoadingProvider"; @@ -16,8 +17,13 @@ import { import { ThemeProvider } from "@/components/theme/theme-provider"; import { Toaster } from "@/components/ui/sonner"; import { LocaleProvider } from "@/contexts/LocaleContext"; +import { BUILD_TIME_AUTH_TYPE } from "@/lib/env-config"; import { PlatformProvider } from "@/contexts/platform-context"; import { ReactQueryClientProvider } from "@/lib/query-client/query-client.provider"; +import { + getRuntimeAuthInitScript, + resolveRuntimeAuthUiMode, +} from "@/lib/runtime-auth-config"; import { cn } from "@/lib/utils"; const roboto = Roboto({ @@ -131,8 +137,15 @@ export default function RootLayout({ // Language can be switched dynamically through LanguageSwitcher component // Locale state is managed by LocaleContext and persisted in localStorage return ( - + + diff --git a/surfsense_web/atoms/editor/editor-panel.atom.ts b/surfsense_web/atoms/editor/editor-panel.atom.ts index ee609f519..c302c66ee 100644 --- a/surfsense_web/atoms/editor/editor-panel.atom.ts +++ b/surfsense_web/atoms/editor/editor-panel.atom.ts @@ -1,11 +1,6 @@ import { atom } from "jotai"; import { rightPanelCollapsedAtom, rightPanelTabAtom } from "@/atoms/layout/right-panel.atom"; -export interface EditorLineRange { - start: number; - end: number; -} - interface EditorPanelState { isOpen: boolean; kind: "document" | "local_file" | "memory"; @@ -14,10 +9,6 @@ interface EditorPanelState { searchSpaceId: number | null; memoryScope: "user" | "team" | null; title: string | null; - // Citation line anchor: when set, the editor opens the raw source view - // scrolled to and highlighting this 1-based inclusive line range. - highlightLines: EditorLineRange | null; - forceSourceView: boolean; } const initialState: EditorPanelState = { @@ -28,8 +19,6 @@ const initialState: EditorPanelState = { searchSpaceId: null, memoryScope: null, title: null, - highlightLines: null, - forceSourceView: false, }; export const editorPanelAtom = atom(initialState); @@ -44,14 +33,7 @@ export const openEditorPanelAtom = atom( get, set, payload: - | { - documentId: number; - searchSpaceId: number; - title?: string; - kind?: "document"; - highlightLines?: EditorLineRange | null; - forceSourceView?: boolean; - } + | { documentId: number; searchSpaceId: number; title?: string; kind?: "document" } | { kind: "local_file"; localFilePath: string; @@ -77,8 +59,6 @@ export const openEditorPanelAtom = atom( searchSpaceId: payload.searchSpaceId ?? null, memoryScope: null, title: payload.title ?? null, - highlightLines: null, - forceSourceView: false, }); set(rightPanelTabAtom, "editor"); set(rightPanelCollapsedAtom, false); @@ -93,8 +73,6 @@ export const openEditorPanelAtom = atom( searchSpaceId: payload.searchSpaceId ?? null, memoryScope: payload.memoryScope, title: payload.title ?? null, - highlightLines: null, - forceSourceView: false, }); set(rightPanelTabAtom, "editor"); set(rightPanelCollapsedAtom, false); @@ -108,8 +86,6 @@ export const openEditorPanelAtom = atom( searchSpaceId: payload.searchSpaceId, memoryScope: null, title: payload.title ?? null, - highlightLines: payload.highlightLines ?? null, - forceSourceView: payload.forceSourceView ?? false, }); set(rightPanelTabAtom, "editor"); set(rightPanelCollapsedAtom, false); diff --git a/surfsense_web/components/assistant-ui/inline-citation.tsx b/surfsense_web/components/assistant-ui/inline-citation.tsx index 28f5212ae..59a10739c 100644 --- a/surfsense_web/components/assistant-ui/inline-citation.tsx +++ b/surfsense_web/components/assistant-ui/inline-citation.tsx @@ -2,11 +2,9 @@ import { useSetAtom } from "jotai"; import { FileText } from "lucide-react"; -import { useParams } from "next/navigation"; import type { FC } from "react"; import { useId, useState } from "react"; import { openCitationPanelAtom } from "@/atoms/citation/citation-panel.atom"; -import { openEditorPanelAtom } from "@/atoms/editor/editor-panel.atom"; import { useCitationMetadata } from "@/components/assistant-ui/citation-metadata-context"; import { CitationPanelContent } from "@/components/citation-panel/citation-panel"; import { Citation } from "@/components/tool-ui/citation"; @@ -110,50 +108,6 @@ const NumericChunkCitation: FC<{ chunkId: number }> = ({ chunkId }) => { ); }; -interface LineCitationProps { - documentId: number; - startLine: number; - endLine: number; -} - -/** - * Inline citation for a knowledge-base document line range - * (`[citation:d#L-]`). Clicking opens the document in - * the editor's read-only source view, scrolled to and highlighting the cited - * lines — the same anchor the citation panel uses for chunk citations. - */ -export const LineCitation: FC = ({ documentId, startLine, endLine }) => { - const openEditorPanel = useSetAtom(openEditorPanelAtom); - const params = useParams(); - const searchSpaceId = Number(params?.search_space_id); - - const label = startLine === endLine ? `L${startLine}` : `L${startLine}-${endLine}`; - - const handleClick = () => { - if (!Number.isFinite(searchSpaceId)) return; - openEditorPanel({ - documentId, - searchSpaceId, - highlightLines: { start: startLine, end: endLine }, - forceSourceView: true, - }); - }; - - return ( - - ); -}; - import { tryGetHostname } from "@/lib/url"; interface UrlCitationProps { diff --git a/surfsense_web/components/auth/sign-in-button.tsx b/surfsense_web/components/auth/sign-in-button.tsx index 581e37603..d0a563a54 100644 --- a/surfsense_web/components/auth/sign-in-button.tsx +++ b/surfsense_web/components/auth/sign-in-button.tsx @@ -3,7 +3,7 @@ import Link from "next/link"; import { useState } from "react"; import { Button } from "@/components/ui/button"; -import { BUILD_TIME_AUTH_TYPE, buildBackendUrl } from "@/lib/env-config"; +import { buildBackendUrl } from "@/lib/env-config"; import { trackLoginAttempt } from "@/lib/posthog/events"; import { cn } from "@/lib/utils"; @@ -46,7 +46,6 @@ interface SignInButtonProps { } export const SignInButton = ({ variant = "desktop" }: SignInButtonProps) => { - const isGoogleAuth = BUILD_TIME_AUTH_TYPE === "GOOGLE"; const [isRedirecting, setIsRedirecting] = useState(false); const handleGoogleLogin = () => { @@ -56,44 +55,45 @@ export const SignInButton = ({ variant = "desktop" }: SignInButtonProps) => { window.location.href = buildBackendUrl("/auth/google/authorize-redirect"); }; - const getClassName = () => { + const getGoogleClassName = () => { if (variant === "desktop") { - return isGoogleAuth - ? "hidden rounded-full border border-white bg-white px-5 py-2 text-sm font-medium text-[#1f1f1f] shadow-sm hover:bg-zinc-100 hover:text-[#1f1f1f] md:flex dark:border-white" - : "hidden rounded-full bg-black px-8 py-2 text-sm font-bold text-white shadow-[0px_-2px_0px_0px_rgba(255,255,255,0.4)_inset] md:block dark:bg-white dark:text-black"; + return "hidden rounded-full border border-white bg-white px-5 py-2 text-sm font-medium text-[#1f1f1f] shadow-sm hover:bg-zinc-100 hover:text-[#1f1f1f] md:flex dark:border-white"; } if (variant === "compact") { - return isGoogleAuth - ? "rounded-full border border-white bg-white px-4 py-1.5 text-sm font-medium text-[#1f1f1f] shadow-sm hover:bg-zinc-100 hover:text-[#1f1f1f] dark:border-white" - : "rounded-full bg-black px-6 py-1.5 text-sm font-bold text-white shadow-[0px_-2px_0px_0px_rgba(255,255,255,0.4)_inset] dark:bg-white dark:text-black"; + return "rounded-full border border-white bg-white px-4 py-1.5 text-sm font-medium text-[#1f1f1f] shadow-sm hover:bg-zinc-100 hover:text-[#1f1f1f] dark:border-white"; } // mobile - return isGoogleAuth - ? "w-full rounded-lg border border-white bg-white px-8 py-2.5 font-medium text-[#1f1f1f] shadow-sm hover:bg-zinc-100 hover:text-[#1f1f1f] dark:border-white touch-manipulation" - : "w-full rounded-lg bg-black px-8 py-2 font-medium text-white shadow-[0px_-2px_0px_0px_rgba(255,255,255,0.4)_inset] dark:bg-white dark:text-black text-center touch-manipulation"; + return "w-full rounded-lg border border-white bg-white px-8 py-2.5 font-medium text-[#1f1f1f] shadow-sm hover:bg-zinc-100 hover:text-[#1f1f1f] dark:border-white touch-manipulation"; }; - if (isGoogleAuth) { - return ( + const getLocalClassName = () => { + if (variant === "desktop") { + return "hidden rounded-full bg-black px-8 py-2 text-sm font-bold text-white shadow-[0px_-2px_0px_0px_rgba(255,255,255,0.4)_inset] md:block dark:bg-white dark:text-black"; + } + if (variant === "compact") { + return "rounded-full bg-black px-6 py-1.5 text-sm font-bold text-white shadow-[0px_-2px_0px_0px_rgba(255,255,255,0.4)_inset] dark:bg-white dark:text-black"; + } + return "w-full rounded-lg bg-black px-8 py-2 font-medium text-white shadow-[0px_-2px_0px_0px_rgba(255,255,255,0.4)_inset] dark:bg-white dark:text-black text-center touch-manipulation"; + }; + + return ( + <> - ); - } - - return ( - - Sign In - + + Sign In + + ); }; diff --git a/surfsense_web/components/citation-panel/citation-panel.tsx b/surfsense_web/components/citation-panel/citation-panel.tsx index 9b9a9aaa9..890ac11ac 100644 --- a/surfsense_web/components/citation-panel/citation-panel.tsx +++ b/surfsense_web/components/citation-panel/citation-panel.tsx @@ -46,13 +46,6 @@ export const CitationPanelContent: FC = ({ const cited = useMemo(() => data?.chunks.find((c) => c.id === chunkId) ?? null, [data, chunkId]); - const citedLineLabel = useMemo(() => { - const start = data?.cited_start_line; - const end = data?.cited_end_line; - if (start == null || end == null) return null; - return start === end ? `Line ${start}` : `Lines ${start}–${end}`; - }, [data?.cited_start_line, data?.cited_end_line]); - const totalChunks = data?.total_chunks ?? data?.chunks.length ?? 0; const startIndex = data?.chunk_start_index ?? 0; const hasMoreAbove = startIndex > 0; @@ -82,15 +75,10 @@ export const CitationPanelContent: FC = ({ const handleOpenFullDocument = () => { if (!data) return; - const hasLineAnchor = data.cited_start_line != null && data.cited_end_line != null; openEditorPanel({ documentId: data.id, searchSpaceId: data.search_space_id, title: data.title, - highlightLines: hasLineAnchor - ? { start: data.cited_start_line as number, end: data.cited_end_line as number } - : null, - forceSourceView: hasLineAnchor, }); }; @@ -122,7 +110,6 @@ export const CitationPanelContent: FC = ({

- {citedLineLabel && {citedLineLabel}} {totalChunks > 0 && {totalChunks} chunks} {!isLoading && !error && data && (
diff --git a/surfsense_web/components/citations/citation-renderer.tsx b/surfsense_web/components/citations/citation-renderer.tsx index b0ab13f84..f2de4b27d 100644 --- a/surfsense_web/components/citations/citation-renderer.tsx +++ b/surfsense_web/components/citations/citation-renderer.tsx @@ -1,7 +1,7 @@ "use client"; import type { ReactNode } from "react"; -import { InlineCitation, LineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation"; +import { InlineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation"; import { type CitationToken, type CitationUrlMap, @@ -21,16 +21,6 @@ export function renderCitationToken(token: CitationToken, ordinalKey: number): R if (token.kind === "url") { return ; } - if (token.kind === "line") { - return ( - - ); - } return ( void; - highlightLines?: { start: number; end: number } | null; - forceSourceView?: boolean; }) { const electronAPI = useElectronAPI(); const [editorDoc, setEditorDoc] = useState(null); @@ -209,7 +205,7 @@ export function EditorPanelContent({ const isLargeDocument = docSizeBytes > plateMaxBytes || docLineCount > plateMaxLines; const viewerMode: ViewerMode = isMemoryMode ? "plate" - : editorDoc?.viewer_mode === "monaco" || isLargeDocument || forceSourceView + : editorDoc?.viewer_mode === "monaco" || isLargeDocument ? "monaco" : "plate"; @@ -832,7 +828,6 @@ export function EditorPanelContent({ value={editorDoc.source_markdown} readOnly onChange={() => {}} - highlightLines={highlightLines} />
@@ -923,8 +918,6 @@ function DesktopEditorPanel() { searchSpaceId={panelState.searchSpaceId ?? undefined} title={panelState.title} onClose={closePanel} - highlightLines={panelState.highlightLines} - forceSourceView={panelState.forceSourceView} /> ); @@ -964,8 +957,6 @@ function MobileEditorDrawer() { memoryScope={panelState.memoryScope ?? undefined} searchSpaceId={panelState.searchSpaceId ?? undefined} title={panelState.title} - highlightLines={panelState.highlightLines} - forceSourceView={panelState.forceSourceView} /> diff --git a/surfsense_web/components/editor/plugins/citation-kit.tsx b/surfsense_web/components/editor/plugins/citation-kit.tsx index edba9a19e..97e8ec723 100644 --- a/surfsense_web/components/editor/plugins/citation-kit.tsx +++ b/surfsense_web/components/editor/plugins/citation-kit.tsx @@ -3,10 +3,9 @@ import { type Descendant, KEYS } from "platejs"; import { createPlatePlugin, type PlateElementProps } from "platejs/react"; import type { FC } from "react"; -import { InlineCitation, LineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation"; +import { InlineCitation, UrlCitation } from "@/components/assistant-ui/inline-citation"; import { CITATION_REGEX, - type CitationToken, type CitationUrlMap, parseTextWithCitations, } from "@/lib/citations/citation-parser"; @@ -18,12 +17,9 @@ import { */ export type CitationElementNode = { type: "citation"; - kind: "chunk" | "doc" | "url" | "line"; + kind: "chunk" | "doc" | "url"; chunkId?: number; url?: string; - documentId?: number; - startLine?: number; - endLine?: number; /** Original literal token that produced this citation node. */ rawText: string; children: [{ text: "" }]; @@ -37,22 +33,11 @@ const CitationElement: FC> = ({ element, }) => { const isUrl = element.kind === "url"; - const isLine = - element.kind === "line" && - element.documentId !== undefined && - element.startLine !== undefined && - element.endLine !== undefined; return ( {isUrl && element.url ? ( - ) : isLine ? ( - ) : element.chunkId !== undefined ? ( ) : null} @@ -112,7 +97,10 @@ function copyMarks(textNode: SlateText): Record { return marks; } -function makeCitationElement(rawText: string, segment: CitationToken): CitationElementNode { +function makeCitationElement( + rawText: string, + segment: { kind: "url"; url: string } | { kind: "chunk"; chunkId: number; isDocsChunk: boolean } +): CitationElementNode { if (segment.kind === "url") { return { type: CITATION_TYPE, @@ -122,17 +110,6 @@ function makeCitationElement(rawText: string, segment: CitationToken): CitationE children: [{ text: "" }], }; } - if (segment.kind === "line") { - return { - type: CITATION_TYPE, - kind: "line", - documentId: segment.documentId, - startLine: segment.startLine, - endLine: segment.endLine, - rawText, - children: [{ text: "" }], - }; - } return { type: CITATION_TYPE, kind: segment.isDocsChunk ? "doc" : "chunk", diff --git a/surfsense_web/components/editor/source-code-editor.tsx b/surfsense_web/components/editor/source-code-editor.tsx index 4af4f2125..9102dffe9 100644 --- a/surfsense_web/components/editor/source-code-editor.tsx +++ b/surfsense_web/components/editor/source-code-editor.tsx @@ -2,7 +2,7 @@ import dynamic from "next/dynamic"; import { useTheme } from "next-themes"; -import { useCallback, useEffect, useRef } from "react"; +import { useEffect, useRef } from "react"; import { Spinner } from "@/components/ui/spinner"; const MonacoEditor = dynamic(() => import("@monaco-editor/react"), { @@ -17,8 +17,6 @@ interface SourceCodeEditorProps { readOnly?: boolean; fontSize?: number; onSave?: () => Promise | void; - /** 1-based inclusive line range to reveal and highlight (e.g. a citation). */ - highlightLines?: { start: number; end: number } | null; } export function SourceCodeEditor({ @@ -29,45 +27,10 @@ export function SourceCodeEditor({ readOnly = false, fontSize = 12, onSave, - highlightLines = null, }: SourceCodeEditorProps) { const { resolvedTheme } = useTheme(); const onSaveRef = useRef(onSave); const monacoRef = useRef(null); - const editorRef = useRef(null); - const decorationsRef = useRef(null); - const highlightLinesRef = useRef(highlightLines); - highlightLinesRef.current = highlightLines; - - const applyHighlight = useCallback(() => { - const editor = editorRef.current; - const monaco = monacoRef.current; - if (!editor || !monaco) return; - if (decorationsRef.current) { - decorationsRef.current.clear(); - decorationsRef.current = null; - } - const range = highlightLinesRef.current; - if (!range) return; - const lineCount = editor.getModel()?.getLineCount() ?? range.end; - const start = Math.min(Math.max(1, Math.floor(range.start)), lineCount); - const end = Math.min(Math.max(start, Math.floor(range.end)), lineCount); - try { - decorationsRef.current = editor.createDecorationsCollection([ - { - range: new monaco.Range(start, 1, end, 1), - options: { isWholeLine: true, className: "citation-line-highlight" }, - }, - ]); - } catch { - // Decoration failure must not block the reveal below. - } - editor.revealLinesInCenter(start, end, monaco.editor.ScrollType.Immediate); - }, []); - - useEffect(() => { - applyHighlight(); - }, [applyHighlight, highlightLines?.start, highlightLines?.end]); const normalizedModelPath = (() => { const raw = (path || "local-file.txt").trim(); const withLeadingSlash = raw.startsWith("/") ? raw : `/${raw}`; @@ -141,16 +104,7 @@ export function SourceCodeEditor({ }} onMount={(editor, monaco) => { monacoRef.current = monaco; - editorRef.current = editor; applySidebarTheme(monaco); - // Reveal now, then once more after the first layout settles: - // the panel slide-in animation means the editor often has no - // usable viewport height on the initial frame. - applyHighlight(); - const layoutSub = editor.onDidLayoutChange(() => { - applyHighlight(); - layoutSub.dispose(); - }); if (!isManualSaveEnabled) return; editor.addCommand(monaco.KeyMod.CtrlCmd | monaco.KeyCode.KeyS, () => { void onSaveRef.current?.(); diff --git a/surfsense_web/components/homepage/hero-section.tsx b/surfsense_web/components/homepage/hero-section.tsx index 0f3bfe1aa..c9430f098 100644 --- a/surfsense_web/components/homepage/hero-section.tsx +++ b/surfsense_web/components/homepage/hero-section.tsx @@ -37,7 +37,7 @@ import { getAssetLabel, usePrimaryDownload, } from "@/lib/desktop-download-utils"; -import { BUILD_TIME_AUTH_TYPE, buildBackendUrl } from "@/lib/env-config"; +import { buildBackendUrl } from "@/lib/env-config"; import { trackLoginAttempt } from "@/lib/posthog/events"; import { cn } from "@/lib/utils"; @@ -314,7 +314,6 @@ export function HeroSection() { } function GetStartedButton() { - const isGoogleAuth = BUILD_TIME_AUTH_TYPE === "GOOGLE"; const [isRedirecting, setIsRedirecting] = useState(false); const handleGoogleLogin = () => { @@ -324,29 +323,26 @@ function GetStartedButton() { window.location.href = buildBackendUrl("/auth/google/authorize-redirect"); }; - if (isGoogleAuth) { - return ( + return ( + <> - ); - } - - return ( - + + ); } diff --git a/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx b/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx index 6662d7830..5a7588979 100644 --- a/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx +++ b/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx @@ -12,7 +12,6 @@ import { rightPanelCollapsedAtom, rightPanelTabAtom } from "@/atoms/layout/right import { Button } from "@/components/ui/button"; import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip"; import { closeHitlEditPanelAtom, hitlEditPanelAtom } from "@/features/chat-messages/hitl"; -import { useMediaQuery } from "@/hooks/use-media-query"; import { cn } from "@/lib/utils"; import { DocumentsSidebar } from "../sidebar"; @@ -197,9 +196,6 @@ export function RightPanel({ const citationState = useAtomValue(citationPanelAtom); const closeCitation = useSetAtom(closeCitationPanelAtom); const [collapsed, setCollapsed] = useAtom(rightPanelCollapsedAtom); - // Desktop-only surface; mobile uses the dedicated Mobile* drawers. Without - // this guard both render together and two editors fight over one model. - const isDesktop = useMediaQuery("(min-width: 1024px)"); const documentsOpen = documentsPanel?.open ?? false; const reportOpen = reportState.isOpen && !!reportState.reportId; @@ -271,7 +267,7 @@ export function RightPanel({ setCollapsed(true)} /> ) : null; - if (!isVisible || !isDesktop) return null; + if (!isVisible) return null; return (