diff --git a/surfsense_backend/tests/e2e/fakes/embeddings.py b/surfsense_backend/tests/e2e/fakes/embeddings.py index ab9e24df9..9a01fb84b 100644 --- a/surfsense_backend/tests/e2e/fakes/embeddings.py +++ b/surfsense_backend/tests/e2e/fakes/embeddings.py @@ -57,9 +57,9 @@ def install(patches: list[Any]) -> None: # Consumers that did `from app.utils.document_converters import embed_text/texts` ("app.indexing_pipeline.document_embedder.embed_text", fake_embed_text), ("app.indexing_pipeline.document_embedder.embed_texts", fake_embed_texts), - # Pipeline service binding (the actual call site for indexing.index) + # Index-cache facade binding (the actual call site for indexing.index) ( - "app.indexing_pipeline.indexing_pipeline_service.embed_texts", + "app.indexing_pipeline.cache.cached_indexing.embed_texts", fake_embed_texts, ), ] diff --git a/surfsense_backend/tests/integration/conftest.py b/surfsense_backend/tests/integration/conftest.py index 19f8e3d0a..8457047ec 100644 --- a/surfsense_backend/tests/integration/conftest.py +++ b/surfsense_backend/tests/integration/conftest.py @@ -127,7 +127,7 @@ async def db_search_space(db_session: AsyncSession, db_user: User) -> SearchSpac def patched_embed_texts(monkeypatch) -> MagicMock: mock = MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]) monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.embed_texts", + "app.indexing_pipeline.cache.cached_indexing.embed_texts", mock, ) return mock @@ -137,7 +137,7 @@ def patched_embed_texts(monkeypatch) -> MagicMock: def patched_embed_texts_raises(monkeypatch) -> MagicMock: mock = MagicMock(side_effect=RuntimeError("Embedding unavailable")) monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.embed_texts", + "app.indexing_pipeline.cache.cached_indexing.embed_texts", mock, ) return mock @@ -147,11 +147,11 @@ def patched_embed_texts_raises(monkeypatch) -> MagicMock: def patched_chunk_text(monkeypatch) -> MagicMock: mock = MagicMock(return_value=["Test chunk content."]) monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.chunk_text", + "app.indexing_pipeline.cache.cached_indexing.chunk_text", mock, ) monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid", + "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", mock, ) return mock diff --git a/surfsense_backend/tests/integration/document_upload/conftest.py b/surfsense_backend/tests/integration/document_upload/conftest.py index 812140be3..bd889360f 100644 --- a/surfsense_backend/tests/integration/document_upload/conftest.py +++ b/surfsense_backend/tests/integration/document_upload/conftest.py @@ -283,11 +283,11 @@ async def credits(): def _mock_external_apis(monkeypatch): """Mock LLM, embedding, and chunking — these are external API boundaries.""" monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.embed_texts", + "app.indexing_pipeline.cache.cached_indexing.embed_texts", MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]), ) monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.chunk_text", + "app.indexing_pipeline.cache.cached_indexing.chunk_text", MagicMock(return_value=["Test chunk content."]), ) diff --git a/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py b/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py index 311716052..814129c8d 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py @@ -177,7 +177,7 @@ async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, m async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, mocker): """Reindexing replaces old chunks with new content rather than appending.""" mocker.patch( - "app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid", + "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", side_effect=[["Original chunk."], ["Updated chunk."]], ) diff --git a/surfsense_backend/tests/unit/indexing_pipeline/cache/conftest.py b/surfsense_backend/tests/unit/indexing_pipeline/cache/conftest.py new file mode 100644 index 000000000..081dddaa7 --- /dev/null +++ b/surfsense_backend/tests/unit/indexing_pipeline/cache/conftest.py @@ -0,0 +1,28 @@ +"""Stub the cache package __init__s so unit tests import only pure leaf modules. + +The real ``cache``/``storage``/``eviction``/``persistence`` __init__s eagerly +import the facade, file storage, Celery, and ``app.db`` -- none of which a pure +unit test should need. Turning those packages into bare namespace packages lets +``from app.indexing_pipeline.cache. import ...`` resolve the leaf module +without running the heavy __init__. ``schemas`` is left real (it is pure). +""" + +import sys +import types +from pathlib import Path + +_CACHE_DIR = Path(__file__).resolve().parents[4] / "app" / "indexing_pipeline" / "cache" + + +def _stub_namespace_package(dotted: str, fs_dir: Path) -> None: + if dotted in sys.modules: + return + module = types.ModuleType(dotted) + module.__path__ = [str(fs_dir)] + module.__package__ = dotted + sys.modules[dotted] = module + + +_stub_namespace_package("app.indexing_pipeline.cache", _CACHE_DIR) +_stub_namespace_package("app.indexing_pipeline.cache.storage", _CACHE_DIR / "storage") +_stub_namespace_package("app.indexing_pipeline.cache.eviction", _CACHE_DIR / "eviction") diff --git a/surfsense_backend/tests/unit/indexing_pipeline/cache/test_eligibility.py b/surfsense_backend/tests/unit/indexing_pipeline/cache/test_eligibility.py new file mode 100644 index 000000000..780a6c536 --- /dev/null +++ b/surfsense_backend/tests/unit/indexing_pipeline/cache/test_eligibility.py @@ -0,0 +1,28 @@ +from app.indexing_pipeline.cache.eligibility import is_index_cacheable + + +def test_disabled_cache_is_never_cacheable(): + assert not is_index_cacheable( + cache_enabled=False, embedding_model="m", embedding_dim=384 + ) + + +def test_missing_model_is_not_cacheable(): + assert not is_index_cacheable( + cache_enabled=True, embedding_model=None, embedding_dim=384 + ) + + +def test_missing_dimension_is_not_cacheable(): + assert not is_index_cacheable( + cache_enabled=True, embedding_model="m", embedding_dim=None + ) + assert not is_index_cacheable( + cache_enabled=True, embedding_model="m", embedding_dim=0 + ) + + +def test_enabled_with_model_and_dim_is_cacheable(): + assert is_index_cacheable( + cache_enabled=True, embedding_model="m", embedding_dim=384 + ) diff --git a/surfsense_backend/tests/unit/indexing_pipeline/cache/test_embedding_key.py b/surfsense_backend/tests/unit/indexing_pipeline/cache/test_embedding_key.py new file mode 100644 index 000000000..ce9c8672d --- /dev/null +++ b/surfsense_backend/tests/unit/indexing_pipeline/cache/test_embedding_key.py @@ -0,0 +1,31 @@ +from app.indexing_pipeline.cache.schemas import EmbeddingKey + + +def _key(**overrides) -> EmbeddingKey: + base = { + "markdown_sha256": "a" * 64, + "embedding_model": "openai://text-embedding-3-small", + "embedding_dim": 1536, + "chunker_kind": "hybrid", + "chunker_version": 1, + } + base.update(overrides) + return EmbeddingKey(**base) + + +def test_object_suffix_is_stable(): + assert _key().object_suffix == _key().object_suffix + + +def test_object_suffix_differs_by_model(): + assert _key().object_suffix != _key(embedding_model="local/minilm").object_suffix + + +def test_object_suffix_differs_by_chunker_kind_and_version(): + assert _key().object_suffix != _key(chunker_kind="code").object_suffix + assert _key().object_suffix != _key(chunker_version=2).object_suffix + + +def test_object_suffix_encodes_kind_and_version(): + suffix = _key(chunker_kind="code", chunker_version=3).object_suffix + assert suffix.endswith(".code.v3.emb") diff --git a/surfsense_backend/tests/unit/indexing_pipeline/cache/test_serialization.py b/surfsense_backend/tests/unit/indexing_pipeline/cache/test_serialization.py new file mode 100644 index 000000000..8db87bf1b --- /dev/null +++ b/surfsense_backend/tests/unit/indexing_pipeline/cache/test_serialization.py @@ -0,0 +1,52 @@ +import numpy as np +import pytest + +from app.indexing_pipeline.cache.schemas import CachedChunk, EmbeddingSet +from app.indexing_pipeline.cache.serialization import deserialize, serialize + + +def _make_set(dim: int, n_chunks: int) -> EmbeddingSet: + rng = np.random.default_rng(0) + return EmbeddingSet( + summary_embedding=rng.random(dim, dtype=np.float64), + chunks=[ + CachedChunk(text=f"chunk {i}\nwith newline", embedding=rng.random(dim)) + for i in range(n_chunks) + ], + ) + + +def test_round_trip_preserves_texts_and_vectors(): + original = _make_set(dim=8, n_chunks=3) + + restored = deserialize(serialize(original)) + + assert [c.text for c in restored.chunks] == [c.text for c in original.chunks] + assert restored.chunk_count == 3 + assert np.allclose(restored.summary_embedding, original.summary_embedding, atol=1e-6) + for got, want in zip(restored.chunks, original.chunks, strict=True): + assert np.allclose(got.embedding, want.embedding, atol=1e-6) + + +def test_round_trip_with_no_chunks(): + original = _make_set(dim=4, n_chunks=0) + + restored = deserialize(serialize(original)) + + assert restored.chunk_count == 0 + assert restored.summary_embedding.shape[0] == 4 + + +def test_serialize_rejects_mismatched_dimensions(): + bad = EmbeddingSet( + summary_embedding=np.zeros(4, dtype=np.float32), + chunks=[CachedChunk(text="x", embedding=np.zeros(8, dtype=np.float32))], + ) + + with pytest.raises(ValueError): + serialize(bad) + + +def test_deserialize_rejects_foreign_blob(): + with pytest.raises(ValueError): + deserialize(b"not-a-surfsense-blob") diff --git a/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py b/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py index 3a1b77d90..252310061 100644 --- a/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py +++ b/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py @@ -54,7 +54,7 @@ async def test_index_calls_embed_and_chunk_via_to_thread( mock_chunk_hybrid = MagicMock(return_value=["chunk1"]) mock_chunk_hybrid.__name__ = "chunk_text_hybrid" monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid", + "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", mock_chunk_hybrid, ) mock_embed = MagicMock( @@ -62,7 +62,7 @@ async def test_index_calls_embed_and_chunk_via_to_thread( ) mock_embed.__name__ = "embed_texts" monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.embed_texts", + "app.indexing_pipeline.cache.cached_indexing.embed_texts", mock_embed, ) # Bypass set_committed_value, which requires a real ORM instance (not MagicMock). @@ -102,17 +102,17 @@ async def test_non_code_documents_use_hybrid_chunker( mock_chunk_hybrid = MagicMock(return_value=["chunk1"]) mock_chunk_hybrid.__name__ = "chunk_text_hybrid" monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid", + "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", mock_chunk_hybrid, ) mock_chunk_code = MagicMock(return_value=["chunk1"]) mock_chunk_code.__name__ = "chunk_text" monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.chunk_text", + "app.indexing_pipeline.cache.cached_indexing.chunk_text", mock_chunk_code, ) monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.embed_texts", + "app.indexing_pipeline.cache.cached_indexing.embed_texts", MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]), ) monkeypatch.setattr(