mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-12 20:45:20 +02:00
test(index-cache): add unit tests and repoint embed/chunk patch targets
This commit is contained in:
parent
4e4f7f34fa
commit
8cf578d965
9 changed files with 153 additions and 14 deletions
|
|
@ -57,9 +57,9 @@ def install(patches: list[Any]) -> None:
|
|||
# Consumers that did `from app.utils.document_converters import embed_text/texts`
|
||||
("app.indexing_pipeline.document_embedder.embed_text", fake_embed_text),
|
||||
("app.indexing_pipeline.document_embedder.embed_texts", fake_embed_texts),
|
||||
# Pipeline service binding (the actual call site for indexing.index)
|
||||
# Index-cache facade binding (the actual call site for indexing.index)
|
||||
(
|
||||
"app.indexing_pipeline.indexing_pipeline_service.embed_texts",
|
||||
"app.indexing_pipeline.cache.cached_indexing.embed_texts",
|
||||
fake_embed_texts,
|
||||
),
|
||||
]
|
||||
|
|
|
|||
|
|
@ -127,7 +127,7 @@ async def db_search_space(db_session: AsyncSession, db_user: User) -> SearchSpac
|
|||
def patched_embed_texts(monkeypatch) -> MagicMock:
|
||||
mock = MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts])
|
||||
monkeypatch.setattr(
|
||||
"app.indexing_pipeline.indexing_pipeline_service.embed_texts",
|
||||
"app.indexing_pipeline.cache.cached_indexing.embed_texts",
|
||||
mock,
|
||||
)
|
||||
return mock
|
||||
|
|
@ -137,7 +137,7 @@ def patched_embed_texts(monkeypatch) -> MagicMock:
|
|||
def patched_embed_texts_raises(monkeypatch) -> MagicMock:
|
||||
mock = MagicMock(side_effect=RuntimeError("Embedding unavailable"))
|
||||
monkeypatch.setattr(
|
||||
"app.indexing_pipeline.indexing_pipeline_service.embed_texts",
|
||||
"app.indexing_pipeline.cache.cached_indexing.embed_texts",
|
||||
mock,
|
||||
)
|
||||
return mock
|
||||
|
|
@ -147,11 +147,11 @@ def patched_embed_texts_raises(monkeypatch) -> MagicMock:
|
|||
def patched_chunk_text(monkeypatch) -> MagicMock:
|
||||
mock = MagicMock(return_value=["Test chunk content."])
|
||||
monkeypatch.setattr(
|
||||
"app.indexing_pipeline.indexing_pipeline_service.chunk_text",
|
||||
"app.indexing_pipeline.cache.cached_indexing.chunk_text",
|
||||
mock,
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid",
|
||||
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
|
||||
mock,
|
||||
)
|
||||
return mock
|
||||
|
|
|
|||
|
|
@ -283,11 +283,11 @@ async def credits():
|
|||
def _mock_external_apis(monkeypatch):
|
||||
"""Mock LLM, embedding, and chunking — these are external API boundaries."""
|
||||
monkeypatch.setattr(
|
||||
"app.indexing_pipeline.indexing_pipeline_service.embed_texts",
|
||||
"app.indexing_pipeline.cache.cached_indexing.embed_texts",
|
||||
MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"app.indexing_pipeline.indexing_pipeline_service.chunk_text",
|
||||
"app.indexing_pipeline.cache.cached_indexing.chunk_text",
|
||||
MagicMock(return_value=["Test chunk content."]),
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -177,7 +177,7 @@ async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, m
|
|||
async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, mocker):
|
||||
"""Reindexing replaces old chunks with new content rather than appending."""
|
||||
mocker.patch(
|
||||
"app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid",
|
||||
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
|
||||
side_effect=[["Original chunk."], ["Updated chunk."]],
|
||||
)
|
||||
|
||||
|
|
|
|||
28
surfsense_backend/tests/unit/indexing_pipeline/cache/conftest.py
vendored
Normal file
28
surfsense_backend/tests/unit/indexing_pipeline/cache/conftest.py
vendored
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
"""Stub the cache package __init__s so unit tests import only pure leaf modules.
|
||||
|
||||
The real ``cache``/``storage``/``eviction``/``persistence`` __init__s eagerly
|
||||
import the facade, file storage, Celery, and ``app.db`` -- none of which a pure
|
||||
unit test should need. Turning those packages into bare namespace packages lets
|
||||
``from app.indexing_pipeline.cache.<leaf> import ...`` resolve the leaf module
|
||||
without running the heavy __init__. ``schemas`` is left real (it is pure).
|
||||
"""
|
||||
|
||||
import sys
|
||||
import types
|
||||
from pathlib import Path
|
||||
|
||||
_CACHE_DIR = Path(__file__).resolve().parents[4] / "app" / "indexing_pipeline" / "cache"
|
||||
|
||||
|
||||
def _stub_namespace_package(dotted: str, fs_dir: Path) -> None:
    """Register a bare, ``__init__``-free stand-in for the package ``dotted``.

    The stand-in is an empty module whose ``__path__`` points at ``fs_dir``, so
    submodules located in that directory remain importable while the package's
    real ``__init__`` is never executed.

    Args:
        dotted: Fully qualified package name to register in ``sys.modules``.
        fs_dir: Directory used as the stand-in's submodule search path.

    An entry that already exists in ``sys.modules`` is left untouched.
    """
    if dotted in sys.modules:
        return

    module = types.ModuleType(dotted)
    module.__path__ = [str(fs_dir)]
    module.__package__ = dotted
    sys.modules[dotted] = module

    # The import system binds every loaded submodule on its parent package; do
    # the same here so attribute-based lookups (e.g. dotted-path patch targets)
    # can reach the stub. Only an already-imported parent is touched: importing
    # it from here could trigger package initialisation as a side effect.
    parent_name, _, child_name = dotted.rpartition(".")
    parent = sys.modules.get(parent_name)
    if parent is not None:
        setattr(parent, child_name, module)
|
||||
|
||||
|
||||
_stub_namespace_package("app.indexing_pipeline.cache", _CACHE_DIR)
|
||||
_stub_namespace_package("app.indexing_pipeline.cache.storage", _CACHE_DIR / "storage")
|
||||
_stub_namespace_package("app.indexing_pipeline.cache.eviction", _CACHE_DIR / "eviction")
|
||||
28
surfsense_backend/tests/unit/indexing_pipeline/cache/test_eligibility.py
vendored
Normal file
28
surfsense_backend/tests/unit/indexing_pipeline/cache/test_eligibility.py
vendored
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
from app.indexing_pipeline.cache.eligibility import is_index_cacheable
|
||||
|
||||
|
||||
def test_disabled_cache_is_never_cacheable():
    """With the cache switched off, a valid model and dimension are not enough."""
    verdict = is_index_cacheable(
        cache_enabled=False,
        embedding_model="m",
        embedding_dim=384,
    )
    assert not verdict
|
||||
|
||||
|
||||
def test_missing_model_is_not_cacheable():
    """An enabled cache without an embedding model is not cacheable."""
    verdict = is_index_cacheable(
        cache_enabled=True,
        embedding_model=None,
        embedding_dim=384,
    )
    assert not verdict
|
||||
|
||||
|
||||
def test_missing_dimension_is_not_cacheable():
    """Neither a ``None`` nor a zero embedding dimension is cacheable."""
    # Checked in the same order as before: None first, then 0.
    for unusable_dim in (None, 0):
        assert not is_index_cacheable(
            cache_enabled=True, embedding_model="m", embedding_dim=unusable_dim
        )
|
||||
|
||||
|
||||
def test_enabled_with_model_and_dim_is_cacheable():
    """Enabled cache + model + positive dimension is the cacheable case."""
    verdict = is_index_cacheable(
        cache_enabled=True,
        embedding_model="m",
        embedding_dim=384,
    )
    assert verdict
|
||||
31
surfsense_backend/tests/unit/indexing_pipeline/cache/test_embedding_key.py
vendored
Normal file
31
surfsense_backend/tests/unit/indexing_pipeline/cache/test_embedding_key.py
vendored
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
from app.indexing_pipeline.cache.schemas import EmbeddingKey
|
||||
|
||||
|
||||
def _key(**overrides) -> EmbeddingKey:
    """Build an ``EmbeddingKey`` from fixed defaults, replacing any given fields."""
    defaults = {
        "markdown_sha256": "a" * 64,
        "embedding_model": "openai://text-embedding-3-small",
        "embedding_dim": 1536,
        "chunker_kind": "hybrid",
        "chunker_version": 1,
    }
    # Later entries win, so caller-supplied overrides replace the defaults.
    fields = {**defaults, **overrides}
    return EmbeddingKey(**fields)
|
||||
|
||||
|
||||
def test_object_suffix_is_stable():
    """Two keys built from identical fields yield the same object suffix."""
    first_suffix = _key().object_suffix
    second_suffix = _key().object_suffix
    assert first_suffix == second_suffix
|
||||
|
||||
|
||||
def test_object_suffix_differs_by_model():
    """Changing only the embedding model changes the object suffix."""
    default_suffix = _key().object_suffix
    other_model_suffix = _key(embedding_model="local/minilm").object_suffix
    assert default_suffix != other_model_suffix
|
||||
|
||||
|
||||
def test_object_suffix_differs_by_chunker_kind_and_version():
    """Changing the chunker kind or the chunker version changes the suffix."""
    other_kind_suffix = _key(chunker_kind="code").object_suffix
    assert _key().object_suffix != other_kind_suffix

    other_version_suffix = _key(chunker_version=2).object_suffix
    assert _key().object_suffix != other_version_suffix
|
||||
|
||||
|
||||
def test_object_suffix_encodes_kind_and_version():
    """The suffix ends with ``.<kind>.v<version>.emb`` for the given chunker."""
    code_v3_key = _key(chunker_kind="code", chunker_version=3)
    assert code_v3_key.object_suffix.endswith(".code.v3.emb")
|
||||
52
surfsense_backend/tests/unit/indexing_pipeline/cache/test_serialization.py
vendored
Normal file
52
surfsense_backend/tests/unit/indexing_pipeline/cache/test_serialization.py
vendored
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from app.indexing_pipeline.cache.schemas import CachedChunk, EmbeddingSet
|
||||
from app.indexing_pipeline.cache.serialization import deserialize, serialize
|
||||
|
||||
|
||||
def _make_set(dim: int, n_chunks: int) -> EmbeddingSet:
    """Build a deterministic ``EmbeddingSet`` with ``n_chunks`` chunks of size ``dim``.

    A generator seeded with 0 supplies every vector, so repeated calls with the
    same arguments produce the same data.
    """
    rng = np.random.default_rng(0)

    # Draw order matters for reproducibility: summary vector first, then chunks.
    summary_vector = rng.random(dim, dtype=np.float64)
    cached_chunks = []
    for i in range(n_chunks):
        # Chunk text deliberately contains a newline.
        cached_chunks.append(
            CachedChunk(text=f"chunk {i}\nwith newline", embedding=rng.random(dim))
        )

    return EmbeddingSet(summary_embedding=summary_vector, chunks=cached_chunks)
|
||||
|
||||
|
||||
def test_round_trip_preserves_texts_and_vectors():
    """Serializing then deserializing keeps chunk texts, count, and vectors."""
    source_set = _make_set(dim=8, n_chunks=3)
    round_tripped = deserialize(serialize(source_set))

    actual_texts = [chunk.text for chunk in round_tripped.chunks]
    expected_texts = [chunk.text for chunk in source_set.chunks]
    assert actual_texts == expected_texts
    assert round_tripped.chunk_count == 3

    # Vectors are compared within a tolerance rather than for exact equality.
    assert np.allclose(
        round_tripped.summary_embedding, source_set.summary_embedding, atol=1e-6
    )
    chunk_pairs = zip(round_tripped.chunks, source_set.chunks, strict=True)
    for actual_chunk, expected_chunk in chunk_pairs:
        assert np.allclose(actual_chunk.embedding, expected_chunk.embedding, atol=1e-6)
|
||||
|
||||
|
||||
def test_round_trip_with_no_chunks():
    """A set with zero chunks round-trips and keeps its summary vector length."""
    empty_set = _make_set(dim=4, n_chunks=0)
    serialized = serialize(empty_set)
    round_tripped = deserialize(serialized)

    assert round_tripped.chunk_count == 0
    assert round_tripped.summary_embedding.shape[0] == 4
|
||||
|
||||
|
||||
def test_serialize_rejects_mismatched_dimensions():
    """``serialize`` raises ``ValueError`` for a 4-wide summary with an 8-wide chunk."""
    summary_vector = np.zeros(4, dtype=np.float32)
    oversized_chunk = CachedChunk(text="x", embedding=np.zeros(8, dtype=np.float32))
    inconsistent_set = EmbeddingSet(
        summary_embedding=summary_vector,
        chunks=[oversized_chunk],
    )

    with pytest.raises(ValueError):
        serialize(inconsistent_set)
|
||||
|
||||
|
||||
def test_deserialize_rejects_foreign_blob():
    """``deserialize`` raises ``ValueError`` for bytes it did not produce."""
    foreign_payload = b"not-a-surfsense-blob"
    with pytest.raises(ValueError):
        deserialize(foreign_payload)
|
||||
|
|
@ -54,7 +54,7 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
|
|||
mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
|
||||
mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
|
||||
monkeypatch.setattr(
|
||||
"app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid",
|
||||
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
|
||||
mock_chunk_hybrid,
|
||||
)
|
||||
mock_embed = MagicMock(
|
||||
|
|
@ -62,7 +62,7 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
|
|||
)
|
||||
mock_embed.__name__ = "embed_texts"
|
||||
monkeypatch.setattr(
|
||||
"app.indexing_pipeline.indexing_pipeline_service.embed_texts",
|
||||
"app.indexing_pipeline.cache.cached_indexing.embed_texts",
|
||||
mock_embed,
|
||||
)
|
||||
# Bypass set_committed_value, which requires a real ORM instance (not MagicMock).
|
||||
|
|
@ -102,17 +102,17 @@ async def test_non_code_documents_use_hybrid_chunker(
|
|||
mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
|
||||
mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
|
||||
monkeypatch.setattr(
|
||||
"app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid",
|
||||
"app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid",
|
||||
mock_chunk_hybrid,
|
||||
)
|
||||
mock_chunk_code = MagicMock(return_value=["chunk1"])
|
||||
mock_chunk_code.__name__ = "chunk_text"
|
||||
monkeypatch.setattr(
|
||||
"app.indexing_pipeline.indexing_pipeline_service.chunk_text",
|
||||
"app.indexing_pipeline.cache.cached_indexing.chunk_text",
|
||||
mock_chunk_code,
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"app.indexing_pipeline.indexing_pipeline_service.embed_texts",
|
||||
"app.indexing_pipeline.cache.cached_indexing.embed_texts",
|
||||
MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue