test(index-cache): add unit tests and repoint embed/chunk patch targets

This commit is contained in:
CREDO23 2026-06-12 16:48:18 +02:00
parent 4e4f7f34fa
commit 8cf578d965
9 changed files with 153 additions and 14 deletions

View file

@ -57,9 +57,9 @@ def install(patches: list[Any]) -> None:
# Consumers that did `from app.utils.document_converters import embed_text/texts`
("app.indexing_pipeline.document_embedder.embed_text", fake_embed_text),
("app.indexing_pipeline.document_embedder.embed_texts", fake_embed_texts),
# Pipeline service binding (the actual call site for indexing.index)
# Index-cache facade binding (the actual call site for indexing.index)
(
"app.indexing_pipeline.indexing_pipeline_service.embed_texts",
"app.indexing_pipeline.cache.cached_indexing.embed_texts",
fake_embed_texts,
),
]

View file

@ -127,7 +127,7 @@ async def db_search_space(db_session: AsyncSession, db_user: User) -> SearchSpac
def patched_embed_texts(monkeypatch) -> MagicMock:
mock = MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts])
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.embed_texts",
"app.indexing_pipeline.cache.cached_indexing.embed_texts",
mock,
)
return mock
@ -137,7 +137,7 @@ def patched_embed_texts(monkeypatch) -> MagicMock:
def patched_embed_texts_raises(monkeypatch) -> MagicMock:
mock = MagicMock(side_effect=RuntimeError("Embedding unavailable"))
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.embed_texts",
"app.indexing_pipeline.cache.cached_indexing.embed_texts",
mock,
)
return mock
@ -147,11 +147,11 @@ def patched_embed_texts_raises(monkeypatch) -> MagicMock:
def patched_chunk_text(monkeypatch) -> MagicMock:
mock = MagicMock(return_value=["Test chunk content."])
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.chunk_text",
"app.indexing_pipeline.cache.cached_indexing.chunk_text",
mock,
)
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid",
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
mock,
)
return mock

View file

@ -283,11 +283,11 @@ async def credits():
def _mock_external_apis(monkeypatch):
"""Mock LLM, embedding, and chunking — these are external API boundaries."""
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.embed_texts",
"app.indexing_pipeline.cache.cached_indexing.embed_texts",
MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]),
)
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.chunk_text",
"app.indexing_pipeline.cache.cached_indexing.chunk_text",
MagicMock(return_value=["Test chunk content."]),
)

View file

@ -177,7 +177,7 @@ async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, m
async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, mocker):
"""Reindexing replaces old chunks with new content rather than appending."""
mocker.patch(
"app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid",
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
side_effect=[["Original chunk."], ["Updated chunk."]],
)

View file

@ -0,0 +1,28 @@
"""Stub the cache package __init__s so unit tests import only pure leaf modules.
The real ``cache``/``storage``/``eviction``/``persistence`` __init__s eagerly
import the facade, file storage, Celery, and ``app.db`` -- none of which a pure
unit test should need. Turning those packages into bare namespace packages lets
``from app.indexing_pipeline.cache.<leaf> import ...`` resolve the leaf module
without running the heavy __init__. ``schemas`` is left real (it is pure).
"""
import sys
import types
from pathlib import Path
# Absolute path of the real ``app/indexing_pipeline/cache`` directory, located
# relative to this file: ``parents[4]`` is four directories above the one that
# contains this file -- presumably the root that holds ``app/`` (TODO confirm
# against the repository layout if this file is ever moved).
_CACHE_DIR = Path(__file__).resolve().parents[4] / "app" / "indexing_pipeline" / "cache"
def _stub_namespace_package(dotted: str, fs_dir: Path) -> None:
    """Register an empty stand-in package named ``dotted`` in ``sys.modules``.

    The stand-in's ``__path__`` points at ``fs_dir`` so submodules can still
    be located on disk. A name that is already present in ``sys.modules`` is
    left untouched.
    """
    if dotted not in sys.modules:
        placeholder = types.ModuleType(dotted)
        placeholder.__package__ = dotted
        # A module only counts as a package (can have submodules) if it has __path__.
        placeholder.__path__ = [str(fs_dir)]
        sys.modules[dotted] = placeholder
# Register the stand-ins, parent package first.
# NOTE(review): the module docstring also lists ``persistence`` among the
# packages with a heavy __init__, but no stub is registered for it here --
# confirm whether that omission is intentional.
for _dotted, _fs_dir in (
    ("app.indexing_pipeline.cache", _CACHE_DIR),
    ("app.indexing_pipeline.cache.storage", _CACHE_DIR / "storage"),
    ("app.indexing_pipeline.cache.eviction", _CACHE_DIR / "eviction"),
):
    _stub_namespace_package(_dotted, _fs_dir)

View file

@ -0,0 +1,28 @@
from app.indexing_pipeline.cache.eligibility import is_index_cacheable
def test_disabled_cache_is_never_cacheable():
    """With the cache switched off, a valid model and dimension do not help."""
    cacheable = is_index_cacheable(
        embedding_dim=384, embedding_model="m", cache_enabled=False
    )
    assert not cacheable
def test_missing_model_is_not_cacheable():
    """An enabled cache without an embedding model is not cacheable."""
    cacheable = is_index_cacheable(
        embedding_dim=384, embedding_model=None, cache_enabled=True
    )
    assert not cacheable
def test_missing_dimension_is_not_cacheable():
    """Neither a ``None`` nor a zero embedding dimension is cacheable."""
    for unusable_dim in (None, 0):
        assert not is_index_cacheable(
            cache_enabled=True, embedding_model="m", embedding_dim=unusable_dim
        )
def test_enabled_with_model_and_dim_is_cacheable():
    """Enabled cache plus a model and a positive dimension is cacheable."""
    cacheable = is_index_cacheable(
        embedding_dim=384, embedding_model="m", cache_enabled=True
    )
    assert cacheable

View file

@ -0,0 +1,31 @@
from app.indexing_pipeline.cache.schemas import EmbeddingKey
def _key(**overrides) -> EmbeddingKey:
    """Build an ``EmbeddingKey`` from fixed defaults.

    Any field passed in ``overrides`` replaces the corresponding default.
    """
    defaults = dict(
        markdown_sha256="a" * 64,
        embedding_model="openai://text-embedding-3-small",
        embedding_dim=1536,
        chunker_kind="hybrid",
        chunker_version=1,
    )
    # Later entries win, so overrides take precedence over the defaults.
    return EmbeddingKey(**{**defaults, **overrides})
def test_object_suffix_is_stable():
    """Two keys built from identical fields yield the same suffix."""
    first = _key().object_suffix
    second = _key().object_suffix
    assert first == second
def test_object_suffix_differs_by_model():
    """Changing only the embedding model changes the suffix."""
    default_suffix = _key().object_suffix
    other_model_suffix = _key(embedding_model="local/minilm").object_suffix
    assert default_suffix != other_model_suffix
def test_object_suffix_differs_by_chunker_kind_and_version():
    """Changing the chunker kind or the chunker version changes the suffix."""
    for override in ({"chunker_kind": "code"}, {"chunker_version": 2}):
        assert _key().object_suffix != _key(**override).object_suffix
def test_object_suffix_encodes_kind_and_version():
    """The suffix ends with the chunker kind and version, then ``.emb``."""
    key = _key(chunker_version=3, chunker_kind="code")
    assert key.object_suffix.endswith(".code.v3.emb")

View file

@ -0,0 +1,52 @@
import numpy as np
import pytest
from app.indexing_pipeline.cache.schemas import CachedChunk, EmbeddingSet
from app.indexing_pipeline.cache.serialization import deserialize, serialize
def _make_set(dim: int, n_chunks: int) -> EmbeddingSet:
    """Build a reproducible ``EmbeddingSet`` with ``n_chunks`` chunks of width ``dim``.

    The generator is seeded with 0, so repeated calls with the same arguments
    draw the same vectors. Each chunk text contains a newline.
    """
    generator = np.random.default_rng(0)
    # Draw order is part of the fixture: the summary first, then each chunk.
    summary = generator.random(dim, dtype=np.float64)
    chunk_list = []
    for index in range(n_chunks):
        chunk_list.append(
            CachedChunk(
                text=f"chunk {index}\nwith newline",
                embedding=generator.random(dim),
            )
        )
    return EmbeddingSet(summary_embedding=summary, chunks=chunk_list)
def test_round_trip_preserves_texts_and_vectors():
    """Serialize then deserialize keeps chunk texts, count, and vectors (within 1e-6)."""
    source = _make_set(dim=8, n_chunks=3)
    blob = serialize(source)
    restored = deserialize(blob)

    restored_texts = [chunk.text for chunk in restored.chunks]
    source_texts = [chunk.text for chunk in source.chunks]
    assert restored_texts == source_texts
    assert restored.chunk_count == 3

    assert np.allclose(restored.summary_embedding, source.summary_embedding, atol=1e-6)
    # strict=True makes a chunk-count mismatch raise instead of being truncated.
    for actual, expected in zip(restored.chunks, source.chunks, strict=True):
        assert np.allclose(actual.embedding, expected.embedding, atol=1e-6)
def test_round_trip_with_no_chunks():
    """A set with zero chunks round-trips and keeps its summary vector length."""
    empty_set = _make_set(dim=4, n_chunks=0)
    blob = serialize(empty_set)
    restored = deserialize(blob)
    assert restored.chunk_count == 0
    assert restored.summary_embedding.shape[0] == 4
def test_serialize_rejects_mismatched_dimensions():
    """``serialize`` raises ``ValueError`` when a chunk's width differs from the summary's."""
    summary = np.zeros(4, dtype=np.float32)
    wide_chunk = CachedChunk(text="x", embedding=np.zeros(8, dtype=np.float32))
    inconsistent = EmbeddingSet(summary_embedding=summary, chunks=[wide_chunk])
    with pytest.raises(ValueError):
        serialize(inconsistent)
def test_deserialize_rejects_foreign_blob():
    """``deserialize`` raises ``ValueError`` for bytes it did not produce."""
    foreign_bytes = b"not-a-surfsense-blob"
    with pytest.raises(ValueError):
        deserialize(foreign_bytes)

View file

@ -54,7 +54,7 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid",
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
mock_chunk_hybrid,
)
mock_embed = MagicMock(
@ -62,7 +62,7 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
)
mock_embed.__name__ = "embed_texts"
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.embed_texts",
"app.indexing_pipeline.cache.cached_indexing.embed_texts",
mock_embed,
)
# Bypass set_committed_value, which requires a real ORM instance (not MagicMock).
@ -102,17 +102,17 @@ async def test_non_code_documents_use_hybrid_chunker(
mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid",
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
mock_chunk_hybrid,
)
mock_chunk_code = MagicMock(return_value=["chunk1"])
mock_chunk_code.__name__ = "chunk_text"
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.chunk_text",
"app.indexing_pipeline.cache.cached_indexing.chunk_text",
mock_chunk_code,
)
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.embed_texts",
"app.indexing_pipeline.cache.cached_indexing.embed_texts",
MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]),
)
monkeypatch.setattr(