SurfSense/surfsense_backend/tests/e2e/fakes/embeddings.py

80 lines
2.9 KiB
Python
Raw Normal View History

"""Deterministic embedding fakes for E2E.
Mirrors the existing `patched_embed_texts` fixture in
`surfsense_backend/tests/integration/conftest.py`:
MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts])
The dimension matches whatever `config.embedding_model_instance.dimension`
returns in the running process so the fakes are vector-compatible with
the documents.embedding pgvector column.
"""
from __future__ import annotations
import logging
from typing import Any
import numpy as np
from app.config import config
logger = logging.getLogger(__name__)
def _embedding_dim() -> int:
    """Return the configured embedding model's dimension as an ``int``.

    The value is read from ``config.embedding_model_instance.dimension`` on
    every call (nothing is cached here), so the lookup happens at call time
    rather than at import time and follows whichever embedding model the
    running process is configured with.
    """
    model = config.embedding_model_instance
    return int(model.dimension)
def fake_embed_text(text: str) -> np.ndarray:
    """Return a constant embedding vector for a single text.

    ``text`` is ignored: every call produces a new 1-D ``float32`` array of
    length ``_embedding_dim()`` whose elements are all ``0.1``.
    """
    length = _embedding_dim()
    return np.full((length,), 0.1, dtype=np.float32)
def fake_embed_texts(texts: list[str]) -> list[np.ndarray]:
    """Return one constant embedding vector per input text.

    The contents of ``texts`` are ignored; only the number of items matters.
    Each item gets its own 1-D ``float32`` array of length
    ``_embedding_dim()`` filled with ``0.1``. A falsy ``texts`` yields an
    empty list without looking up the embedding dimension at all.
    """
    if texts:
        # Resolve the dimension once per batch, not once per vector.
        width = _embedding_dim()
        return [np.full((width,), 0.1, dtype=np.float32) for _item in texts]
    return []
def install(patches: list[Any]) -> None:
    """Install the fake embedding functions at every known binding site.

    Each target is patched with ``unittest.mock.patch`` and started
    immediately. Every started patcher is appended to ``patches`` so the
    entrypoint can track them and, in principle, stop them on shutdown.
    In practice they are intentionally never stopped, because the process
    exits when the test server stops.

    Args:
        patches: Caller-owned list that receives each started patcher.

    Raises:
        RuntimeError: If a target module or attribute cannot be resolved.
            Patchers started before the failing target stay active and
            remain in ``patches``.
    """
    from unittest.mock import patch as _patch

    targets = [
        # Source binding (where the real implementation lives)
        ("app.utils.document_converters.embed_text", fake_embed_text),
        ("app.utils.document_converters.embed_texts", fake_embed_texts),
        # Consumers that did
        # `from app.utils.document_converters import embed_text/texts`
        ("app.indexing_pipeline.document_embedder.embed_text", fake_embed_text),
        ("app.indexing_pipeline.document_embedder.embed_texts", fake_embed_texts),
        # Pipeline service binding (the actual call site for indexing.index)
        (
            "app.indexing_pipeline.indexing_pipeline_service.embed_texts",
            fake_embed_texts,
        ),
    ]
    for target, replacement in targets:
        try:
            patcher = _patch(target, replacement)
            patcher.start()
        except (ModuleNotFoundError, AttributeError) as exc:
            # If a future refactor moves a binding, fail loudly — silent
            # passthrough to a real embedding model would be expensive
            # and non-deterministic.
            raise RuntimeError(
                f"Could not patch embedding binding {target!r}: {exc!s}. "
                f"Update surfsense_backend/tests/e2e/fakes/embeddings.py "
                f"to point at the new binding site."
            ) from exc
        # Bookkeeping stays outside the ``try`` so an error raised here
        # (e.g. ``patches`` has no ``append``) is not misreported as a
        # missing embedding binding.
        patches.append(patcher)
        logger.info("[fake-embeddings] patched %s", target)