Merge remote-tracking branch 'upstream/dev' into feat/api-key

2026-07-04 22:02:16 +02:00 · 2026-06-20 10:50:03 +05:30 · 2026-06-20 10:50:03 +05:30 · fd31ac34fd
commit fd31ac34fd
parent 1e8baa10ec cd2242147a
61 changed files with 1984 additions and 435 deletions
--- a/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/tools/test_search_knowledge_base.py
@ -0,0 +1,87 @@
+"""Unit tests for search_knowledge_base hit rendering.
+
+The tool must surface the passage that actually matched (the RRF-ranked
+chunk), not the top of the document, and annotate it with its line range
+when the chunk carries a char span.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from app.agents.chat.multi_agent_chat.main_agent.tools.search_knowledge_base import (
+    _format_hits,
+)
+
+pytestmark = pytest.mark.unit
+
+_BODY = "Intro paragraph.\n\nMatched passage here.\n\nClosing paragraph."
+
+
+def _hit() -> dict:
+    """Build a search hit for document 7 whose second chunk is the match.
+
+    ``content`` is the full ``_BODY``. Chunk 101 covers the intro paragraph
+    and chunk 102 covers the matched passage; both carry char spans computed
+    against ``_BODY``. Only chunk 102 is listed in ``matched_chunk_ids``.
+    """
+    intro = "Intro paragraph."
+    matched = "Matched passage here."
+    matched_start = _BODY.index(matched)
+    return {
+        "document": {"id": 7, "title": "note.md", "document_type": "NOTE"},
+        "score": 0.42,
+        # Use the body verbatim: replacing a substring with itself is a no-op.
+        "content": _BODY,
+        "matched_chunk_ids": [102],
+        "chunks": [
+            {
+                "chunk_id": 101,
+                "content": intro,
+                "start_char": 0,
+                "end_char": len(intro),
+            },
+            {
+                "chunk_id": 102,
+                "content": matched,
+                "start_char": matched_start,
+                "end_char": matched_start + len(matched),
+            },
+        ],
+    }
+
+
+def test_renders_matched_passage_not_top_of_doc() -> None:
+    """The rendered snippet is the matched chunk, never the unmatched intro."""
+    rendered = _format_hits(
+        [_hit()],
+        paths={7: "/documents/note.md"},
+        bodies={7: _BODY},
+        query="q",
+    )
+    assert "Matched passage here." in rendered
+    # The intro chunk is not in matched_chunk_ids, so it must not be the snippet.
+    assert "Intro paragraph." not in rendered
+
+
+def test_emits_copyable_line_citation_token_when_spans_present() -> None:
+    """With char spans present, the output carries a ready-to-copy citation token."""
+    rendered = _format_hits(
+        [_hit()],
+        paths={7: "/documents/note.md"},
+        bodies={7: _BODY},
+        query="q",
+    )
+    # "Matched passage here." is line 3 of the body (intro line, blank line,
+    # then the passage), so the agent can cite it without a separate read.
+    expected_token = "[citation:d7#L3-3]"
+    assert expected_token in rendered
+
+
+def test_header_includes_document_id() -> None:
+    """The rendered output exposes the document id as ``id=7``."""
+    hits = [_hit()]
+    rendered = _format_hits(
+        hits,
+        paths={7: "/documents/note.md"},
+        bodies={7: _BODY},
+        query="q",
+    )
+    assert "id=7" in rendered
+
+
+def test_omits_citation_token_when_spans_absent() -> None:
+    """Without char spans the passage is still shown, but no line token is emitted."""
+    spanless = _hit()
+    for piece in spanless["chunks"]:
+        piece.update(start_char=None, end_char=None)
+    rendered = _format_hits(
+        [spanless],
+        paths={7: "/documents/note.md"},
+        bodies={7: _BODY},
+        query="q",
+    )
+    assert "Matched passage here." in rendered
+    # Only a concrete, copyable token for this document is forbidden; the
+    # closing instruction's placeholder template doesn't count.
+    assert "[citation:d7#L" not in rendered
+
+
+def test_falls_back_to_content_when_no_matched_ids() -> None:
+    """With no matched chunk ids, the output includes the intro text."""
+    unmatched = {**_hit(), "matched_chunk_ids": []}
+    rendered = _format_hits(
+        [unmatched],
+        paths={7: "/documents/note.md"},
+        bodies={7: _BODY},
+        query="q",
+    )
+    assert "Intro paragraph." in rendered
+
+
+def test_no_results_message() -> None:
+    """An empty hit list renders the no-matches message."""
+    no_hits: list = []
+    rendered = _format_hits(no_hits, paths={}, bodies={}, query="missing")
+    assert "No knowledge-base matches" in rendered
--- a/surfsense_backend/tests/unit/indexing_pipeline/test_chunk_markdown_with_spans.py
+++ b/surfsense_backend/tests/unit/indexing_pipeline/test_chunk_markdown_with_spans.py
@ -0,0 +1,72 @@
+"""Span-aware chunking contract: slices form a lossless, contiguous partition
+of the markdown, and every slice's char span addresses its own text."""
+
+import pytest
+
+from app.indexing_pipeline.document_chunker import chunk_markdown_with_spans
+
+pytestmark = pytest.mark.unit
+
+
+def _assert_lossless_partition(md: str, slices) -> None:
+    """Assert that ``slices`` tile ``md`` exactly, in order, with no gaps.
+
+    Each slice must expose ``text``, ``start_char`` and ``end_char``. The
+    concatenated texts must equal ``md``, every slice must begin where the
+    previous one ended, and ``md[start_char:end_char]`` must equal the slice
+    text. The last slice must end at ``len(md)``.
+    """
+    rebuilt = "".join(piece.text for piece in slices)
+    assert rebuilt == md
+
+    expected_start = 0
+    for piece in slices:
+        begin, end = piece.start_char, piece.end_char
+        assert begin == expected_start, "slices must be contiguous"
+        assert end >= begin
+        assert md[begin:end] == piece.text, "span must address slice text"
+        expected_start = end
+    assert expected_start == len(md)
+
+
+def test_prose_partition_and_spans():
+    """Long prose splits into more than one slice that still partitions the input."""
+    heading = "# Title\n\n"
+    first_section = "First paragraph with several words here. " * 20
+    second_section = (
+        "\n\nSecond section with more prose to force multiple chunks. " * 20
+    )
+    md = heading + first_section + second_section
+
+    pieces = chunk_markdown_with_spans(md)
+
+    assert len(pieces) > 1
+    _assert_lossless_partition(md, pieces)
+
+
+def test_table_kept_whole_with_exact_span():
+    """Every slice that starts with a table row holds both the header and the last row."""
+    table = "| a | b |\n| - | - |\n| 1 | 2 |\n"
+    md = "Intro prose before the table.\n" + table + "\nClosing prose after."
+
+    pieces = chunk_markdown_with_spans(md)
+
+    _assert_lossless_partition(md, pieces)
+    table_pieces = [
+        piece for piece in pieces if piece.text.lstrip().startswith("|")
+    ]
+    # At least one table-leading slice must exist and carry the data row.
+    assert any("| 1 | 2 |" in piece.text for piece in table_pieces)
+    for piece in table_pieces:
+        assert "| a | b |" in piece.text
+        assert "| 1 | 2 |" in piece.text
+
+
+def test_table_at_eof_without_trailing_newline_stays_whole():
+    """A table ending the document with no final newline stays in one slice."""
+    md = "Intro.\n| a | b |\n| - | - |\n| 1 | 2 |"
+
+    pieces = chunk_markdown_with_spans(md)
+
+    _assert_lossless_partition(md, pieces)
+    with_last_row = [piece for piece in pieces if "| 1 | 2 |" in piece.text]
+    assert len(with_last_row) == 1
+    # The slice holding the last row must also hold the header row.
+    table_piece = with_last_row[0]
+    assert "| a | b |" in table_piece.text
+
+
+def test_code_chunker_partition_and_spans():
+    """With ``use_code_chunker=True`` the slices still partition the source losslessly."""
+    functions = [
+        f"def func_{i}(x):\n    total = x + {i}\n    return total" for i in range(40)
+    ]
+    code = "\n\n".join(functions)
+
+    pieces = chunk_markdown_with_spans(code, use_code_chunker=True)
+
+    assert len(pieces) >= 1
+    _assert_lossless_partition(code, pieces)
+
+
+def test_empty_markdown_yields_no_slices():
+    """Chunking the empty string returns an empty list."""
+    pieces = chunk_markdown_with_spans("")
+    assert pieces == []
--- a/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py
+++ b/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py
@ -37,12 +37,9 @@ def _make_orm_doc(connector_doc, doc_id):
 async def test_index_calls_embed_and_chunk_via_to_thread(
    pipeline, make_connector_document, monkeypatch
 ):
-    """index() runs the chunker and embed_texts via asyncio.to_thread, not blocking the loop.
+    """index() runs the chunker and embed_texts via asyncio.to_thread, not blocking the loop."""
+    from app.indexing_pipeline.document_chunker import ChunkSlice

-    Routing between ``chunk_text`` (code path) and ``chunk_text_hybrid`` (default
-    path, see issue #1334) is verified separately in
-    ``test_non_code_documents_use_hybrid_chunker``.
-    """
    to_thread_calls = []
    original_to_thread = asyncio.to_thread

@ -51,11 +48,11 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
        return await original_to_thread(func, *args, **kwargs)

    monkeypatch.setattr(asyncio, "to_thread", tracking_to_thread)
-    mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
-    mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
+    mock_chunker = MagicMock(return_value=[ChunkSlice("chunk1", 0, 6)])
+    mock_chunker.__name__ = "chunk_markdown_with_spans"
    monkeypatch.setattr(
-        "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
-        mock_chunk_hybrid,
+        "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
+        mock_chunker,
    )
    mock_embed = MagicMock(
        side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]
@ -90,34 +87,25 @@ async def test_index_calls_embed_and_chunk_via_to_thread(

    await pipeline.index(document, connector_doc)

-    # Either chunker entry point satisfies the "chunking runs off the event
-    # loop" contract this test guards. Routing between the two is verified
-    # in test_non_code_documents_use_hybrid_chunker.
-    assert {"chunk_text", "chunk_text_hybrid"} & set(to_thread_calls)
+    assert "chunk_markdown_with_spans" in to_thread_calls
    assert "embed_texts" in to_thread_calls
    assert document.status == DocumentStatus.ready()


-async def test_non_code_documents_use_hybrid_chunker(
+async def test_non_code_documents_use_prose_chunker(
    pipeline, make_connector_document, monkeypatch
 ):
-    """Non-code documents route through ``chunk_text_hybrid`` (issue #1334).
+    """Non-code documents chunk with use_code_chunker=False (issue #1334).

-    The hybrid chunker preserves Markdown table integrity by avoiding splits
-    mid-row. Only documents flagged with ``should_use_code_chunker=True``
-    should take the ``chunk_text`` path.
+    The table-aware prose path keeps Markdown tables intact; only documents
+    flagged with ``should_use_code_chunker=True`` request the code chunker.
    """
-    mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
-    mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
+    from app.indexing_pipeline.document_chunker import ChunkSlice
+
+    mock_chunker = MagicMock(return_value=[ChunkSlice("chunk1", 0, 6)])
    monkeypatch.setattr(
-        "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
-        mock_chunk_hybrid,
-    )
-    mock_chunk_code = MagicMock(return_value=["chunk1"])
-    mock_chunk_code.__name__ = "chunk_text"
-    monkeypatch.setattr(
-        "app.indexing_pipeline.cache.cached_indexing.chunk_text",
-        mock_chunk_code,
+        "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
+        mock_chunker,
    )
    monkeypatch.setattr(
        "app.indexing_pipeline.cache.cached_indexing.embed_texts",
@ -149,8 +137,49 @@ async def test_non_code_documents_use_hybrid_chunker(

    await pipeline.index(document, connector_doc)

-    mock_chunk_hybrid.assert_called_once()
-    mock_chunk_code.assert_not_called()
+    mock_chunker.assert_called_once()
+    assert mock_chunker.call_args.args[1] is False
+
+
+async def test_code_documents_request_code_chunker(
+    pipeline, make_connector_document, monkeypatch
+):
+    """Code-flagged documents forward use_code_chunker=True to the chunker.
+
+    The chunker, the embedder, existing-chunk loading and persistence are all
+    stubbed. The test then checks that the chunker was called exactly once
+    and that its second positional argument is ``True``.
+    """
+    from app.indexing_pipeline.document_chunker import ChunkSlice
+
+    def _fake_embed(texts):
+        return [[0.1] * _EMBEDDING_DIM for _ in texts]
+
+    async def _noop_persist(_session, doc, *_args, **_kwargs):
+        # Stand-in for persistence: only flips the document to ready.
+        doc.status = DocumentStatus.ready()
+
+    chunker_spy = MagicMock(return_value=[ChunkSlice("chunk1", 0, 6)])
+    monkeypatch.setattr(
+        "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
+        chunker_spy,
+    )
+    monkeypatch.setattr(
+        "app.indexing_pipeline.cache.cached_indexing.embed_texts",
+        MagicMock(side_effect=_fake_embed),
+    )
+    monkeypatch.setattr(pipeline, "_load_existing_chunks", AsyncMock(return_value=[]))
+    monkeypatch.setattr(
+        "app.indexing_pipeline.indexing_pipeline_service.persist_scratch_index",
+        _noop_persist,
+    )
+
+    code_connector_doc = make_connector_document(
+        document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
+        unique_id="repo-1",
+        search_space_id=1,
+        should_use_code_chunker=True,
+    )
+    orm_document = MagicMock(spec=Document)
+    orm_document.id = 1
+    orm_document.status = DocumentStatus.pending()
+
+    await pipeline.index(orm_document, code_connector_doc)
+
+    chunker_spy.assert_called_once()
+    code_flag = chunker_spy.call_args.args[1]
+    assert code_flag is True


 def _mock_session_factory(orm_docs_by_id):
--- a/surfsense_backend/tests/unit/middleware/test_b_filesystem_rm_rmdir_cloud.py
+++ b/surfsense_backend/tests/unit/middleware/test_b_filesystem_rm_rmdir_cloud.py
@ -71,7 +71,7 @@ class _KBBackendStub(KBPostgresBackend):
    def __init__(self, *, children=None, file_data=None) -> None:
        self.als_info = AsyncMock(return_value=children or [])
        self._load_file_data = AsyncMock(
-            return_value=(file_data, 17) if file_data is not None else None
+            return_value=(file_data, 17, None) if file_data is not None else None
        )


--- a/surfsense_backend/tests/unit/middleware/test_kb_persistence_filesystem_parity.py
+++ b/surfsense_backend/tests/unit/middleware/test_kb_persistence_filesystem_parity.py
@ -69,13 +69,25 @@ class _FakeSession:

@pytest.fixture(autouse=True)
 def _stub_embeddings_and_chunks(monkeypatch: pytest.MonkeyPatch) -> None:
-    """Avoid loading the embedding model in unit tests."""
+    """Avoid loading the embedding model in unit tests.
+
+    Mirrors the legacy stub: one chunk spanning the whole content, with a
+    zero summary/chunk vector, routed through the shared span builder.
+    """
+    from app.indexing_pipeline.document_chunker import ChunkSlice
+
+    async def _fake_build_chunk_embeddings(content: str, *, use_code_chunker: bool):
+        summary = np.zeros(8, dtype=np.float32)
+        pairs = (
+            [(ChunkSlice(content, 0, len(content)), np.zeros(8, dtype=np.float32))]
+            if content
+            else []
+        )
+        return summary, pairs
+
    monkeypatch.setattr(
-        kb_persistence,
-        "embed_texts",
-        lambda texts: [np.zeros(8, dtype=np.float32) for _ in texts],
+        kb_persistence, "build_chunk_embeddings", _fake_build_chunk_embeddings
    )
-    monkeypatch.setattr(kb_persistence, "chunk_text", lambda content: [content])


@pytest.mark.asyncio
--- a/surfsense_backend/tests/unit/middleware/test_numbered_document.py
+++ b/surfsense_backend/tests/unit/middleware/test_numbered_document.py
@ -0,0 +1,92 @@
+"""Unit tests for the numbered-document read preamble."""
+
+import pytest
+
+from app.agents.chat.multi_agent_chat.shared.middleware.filesystem.backends.numbered_document import (
+    build_read_preamble,
+    compute_matched_line_ranges,
+)
+
+pytestmark = pytest.mark.unit
+
+
+_BODY = "alpha\nbravo\ncharlie\ndelta"
+
+
+class TestComputeMatchedLineRanges:
+    """Checks on ``compute_matched_line_ranges`` against the four-line ``_BODY``.
+
+    Each chunk is passed as a 3-tuple; from the calls below these appear to be
+    ``(chunk_id, start_char, end_char)``. The third argument is the set of
+    matched chunk ids.
+    """
+
+    def test_maps_matched_chunk_spans_to_line_ranges(self):
+        # Offset 12 is the start of "charlie", so chunk 2 covers lines 3-4.
+        spans = [(1, 0, 12), (2, 12, len(_BODY))]
+        assert compute_matched_line_ranges(_BODY, spans, {2}) == [(3, 4)]
+
+    def test_includes_only_matched_chunks(self):
+        spans = [(1, 0, 5), (2, 6, 11)]
+        matched_ids = {1}
+        assert compute_matched_line_ranges(_BODY, spans, matched_ids) == [(1, 1)]
+
+    def test_skips_chunks_without_spans(self):
+        spanless = [(1, None, None)]
+        assert compute_matched_line_ranges(_BODY, spanless, {1}) == []
+
+    def test_sorted_and_deduplicated(self):
+        # Chunks 2 and 3 share a span, and the later span is listed first.
+        spans = [(1, 12, len(_BODY)), (2, 0, 5), (3, 0, 5)]
+        result = compute_matched_line_ranges(_BODY, spans, {1, 2, 3})
+        assert result == [(1, 1), (3, 4)]
+
+
+class TestBuildReadPreamble:
+    """Checks on the text returned by ``build_read_preamble``."""
+
+    @staticmethod
+    def _preamble(document_id, document_type, title, url="", matched_line_ranges=None):
+        """Call ``build_read_preamble`` by keyword; defaults are no URL and no ranges."""
+        ranges = [] if matched_line_ranges is None else matched_line_ranges
+        return build_read_preamble(
+            document_id=document_id,
+            document_type=document_type,
+            title=title,
+            url=url,
+            matched_line_ranges=ranges,
+        )
+
+    def test_contains_document_metadata(self):
+        text = self._preamble(42, "FILE", "Test Doc", url="https://example.com")
+        expected_fragments = (
+            "<document_id>42</document_id>",
+            "<document_type>FILE</document_type>",
+            "Test Doc",
+            "https://example.com",
+        )
+        for fragment in expected_fragments:
+            assert fragment in text
+
+    def test_citation_hint_uses_document_id(self):
+        text = self._preamble(42, "FILE", "Test Doc")
+        assert "[citation:d42#L" in text
+
+    def test_lists_matched_line_ranges(self):
+        text = self._preamble(
+            7, "NOTE", "Notes", matched_line_ranges=[(12, 18), (40, 40)]
+        )
+        assert "<matched_lines>" in text
+        assert "12-18" in text
+        assert "40" in text
+
+    def test_omits_matched_lines_block_when_empty(self):
+        text = self._preamble(7, "NOTE", "Notes")
+        assert "<matched_lines>" not in text
+
+    def test_ends_with_trailing_newline_so_body_follows_cleanly(self):
+        text = self._preamble(1, "FILE", "t")
+        assert text.endswith("\n")
--- a/surfsense_backend/tests/unit/tasks/chat/streaming/test_llm_bundle.py
+++ b/surfsense_backend/tests/unit/tasks/chat/streaming/test_llm_bundle.py
@ -0,0 +1,154 @@
+"""Contracts for chat LLM construction in streaming flows.
+
+``stream_new_chat`` / ``stream_resume_chat`` depend on LangChain receiving
+token chunks from ``ChatLiteLLM``. ``langchain-litellm`` defaults
+``streaming`` to ``False``, so the shared bundle loader must opt in
+explicitly for both DB-backed and global model paths.
+"""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+from typing import Any
+
+import pytest
+
+import app.tasks.chat.streaming.flows.shared.llm_bundle as llm_bundle
+
+pytestmark = pytest.mark.unit
+
+
+class _CapturedChatLiteLLM:
+    """Stand-in LLM class that records the keyword arguments of each construction."""
+
+    # Shared across instances: one entry per constructor call, in call order.
+    calls: list[dict[str, Any]] = []
+
+    def __init__(self, **kwargs: Any) -> None:
+        type(self).calls.append(kwargs)
+        self.kwargs = kwargs
+
+
+@pytest.fixture(autouse=True)
+def _patch_common_bundle_dependencies(monkeypatch: pytest.MonkeyPatch):
+    """Stub the bundle loader's collaborators so tests observe only the LLM constructor call."""
+
+    # Start every test with an empty capture list.
+    _CapturedChatLiteLLM.calls = []
+
+    async def _fake_search_space(_session: Any, _search_space_id: int) -> SimpleNamespace:
+        return SimpleNamespace(id=42, user_id="user-1")
+
+    def _ignore_usage_metadata(**_kw: Any) -> None:
+        return None
+
+    def _fake_has_capability(_model: Any, capability: str) -> bool:
+        return capability in {"chat", "vision"}
+
+    replacements = {
+        "_load_search_space": _fake_search_space,
+        "SanitizedChatLiteLLM": _CapturedChatLiteLLM,
+        "register_model_usage_metadata": _ignore_usage_metadata,
+        "has_capability": _fake_has_capability,
+    }
+    for attr_name, stub in replacements.items():
+        monkeypatch.setattr(llm_bundle, attr_name, stub)
+
+
+async def test_load_llm_bundle_enables_streaming_for_db_models(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """For ``config_id=7`` with a stubbed DB model, the LLM is built with ``streaming=True``."""
+    db_model = SimpleNamespace(
+        id=7,
+        model_id="gpt-4o-mini",
+        display_name="GPT 4o Mini",
+        connection=SimpleNamespace(
+            provider="openai",
+            api_key="sk-test",
+            base_url=None,
+            extra={"litellm_params": {"temperature": 0.1}},
+        ),
+    )
+
+    async def _fake_db_model(_session: Any, *, model_id: int, search_space: Any) -> Any:
+        # The loader must ask for the requested model inside the stubbed search space.
+        assert (model_id, search_space.id) == (7, 42)
+        return db_model
+
+    def _fake_to_litellm(_conn: Any, _model_id: Any) -> Any:
+        return "openai/gpt-4o-mini", {"api_key": "sk-test", "temperature": 0.1}
+
+    monkeypatch.setattr(llm_bundle, "_load_db_model", _fake_db_model)
+    monkeypatch.setattr(llm_bundle, "to_litellm", _fake_to_litellm)
+
+    bundle = await llm_bundle.load_llm_bundle(
+        object(),
+        config_id=7,
+        search_space_id=42,
+    )
+    llm, agent_config, error = bundle
+
+    assert error is None
+    assert llm is not None
+    assert agent_config is not None
+    expected_kwargs = {
+        "model": "openai/gpt-4o-mini",
+        "api_key": "sk-test",
+        "temperature": 0.1,
+        "streaming": True,
+    }
+    assert _CapturedChatLiteLLM.calls == [expected_kwargs]
+
+
+async def test_load_llm_bundle_enables_streaming_for_global_models(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """For ``config_id=-11`` with patched global config, the LLM is built with ``streaming=True``."""
+    global_connections = [
+        {
+            "id": -101,
+            "provider": "anthropic",
+            "api_key": "sk-ant-test",
+            "base_url": None,
+            "extra": {"litellm_params": {"temperature": 0.2}},
+        }
+    ]
+    global_models = [
+        {
+            "id": -11,
+            "connection_id": -101,
+            "model_id": "claude-sonnet-4-5",
+            "display_name": "Claude Sonnet",
+            "billing_tier": "premium",
+        }
+    ]
+
+    def _fake_to_litellm(_conn: Any, _model_id: Any) -> Any:
+        return "anthropic/claude-sonnet-4-5", {"api_key": "sk-ant-test", "temperature": 0.2}
+
+    # raising=False lets the attributes be set even if config lacks them.
+    monkeypatch.setattr(llm_bundle.config, "GLOBAL_MODELS", global_models, raising=False)
+    monkeypatch.setattr(
+        llm_bundle.config, "GLOBAL_CONNECTIONS", global_connections, raising=False
+    )
+    monkeypatch.setattr(llm_bundle, "to_litellm", _fake_to_litellm)
+
+    bundle = await llm_bundle.load_llm_bundle(
+        object(),
+        config_id=-11,
+        search_space_id=42,
+    )
+    llm, agent_config, error = bundle
+
+    assert error is None
+    assert llm is not None
+    assert agent_config is not None
+    expected_kwargs = {
+        "model": "anthropic/claude-sonnet-4-5",
+        "api_key": "sk-ant-test",
+        "temperature": 0.2,
+        "streaming": True,
+    }
+    assert _CapturedChatLiteLLM.calls == [expected_kwargs]
--- a/surfsense_backend/tests/unit/utils/test_text_spans.py
+++ b/surfsense_backend/tests/unit/utils/test_text_spans.py
@ -0,0 +1,39 @@
+"""Unit tests for char-span -> line-range conversion."""
+
+from __future__ import annotations
+
+import pytest
+
+from app.utils.text_spans import char_span_to_line_range
+
+pytestmark = pytest.mark.unit
+
+_TEXT = "line1\nline2\nline3"
+
+
+def test_single_line_span() -> None:
+    """A span covering exactly "line2" maps to line 2 only."""
+    needle = "line2"
+    begin = _TEXT.index(needle)
+    end = begin + len(needle)
+    assert char_span_to_line_range(_TEXT, begin, end) == (2, 2)
+
+
+def test_first_line_span() -> None:
+    """A span covering exactly "line1" maps to line 1 only."""
+    end = len("line1")
+    line_range = char_span_to_line_range(_TEXT, 0, end)
+    assert line_range == (1, 1)
+
+
+def test_last_line_span() -> None:
+    """A span from the start of "line3" to the end of the text maps to line 3."""
+    begin = _TEXT.index("line3")
+    line_range = char_span_to_line_range(_TEXT, begin, len(_TEXT))
+    assert line_range == (3, 3)
+
+
+def test_multi_line_span() -> None:
+    """A span from the start of the text through the end of "line2" covers lines 1-2."""
+    second = "line2"
+    end = _TEXT.index(second) + len(second)
+    assert char_span_to_line_range(_TEXT, 0, end) == (1, 2)
+
+
+def test_empty_span_resolves_to_its_line() -> None:
+    """A zero-length span at the start of "line2" still resolves to line 2."""
+    offset = _TEXT.index("line2")
+    line_range = char_span_to_line_range(_TEXT, offset, offset)
+    assert line_range == (2, 2)
+
+
+def test_offsets_clamped_to_text_bounds() -> None:
+    """A negative start and an oversized end yield the full range, lines 1-3."""
+    below_start, past_end = -5, 10_000
+    line_range = char_span_to_line_range(_TEXT, below_start, past_end)
+    assert line_range == (1, 3)