From 24f824b597cb73eaddb5ccb189d62b518a5c6c62 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:22:48 +0200 Subject: [PATCH 01/52] feat(etl-cache): add ParseKey cache identity value object --- .../etl_pipeline/cache/schemas/parse_key.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 surfsense_backend/app/etl_pipeline/cache/schemas/parse_key.py diff --git a/surfsense_backend/app/etl_pipeline/cache/schemas/parse_key.py b/surfsense_backend/app/etl_pipeline/cache/schemas/parse_key.py new file mode 100644 index 000000000..65e7b08a5 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/cache/schemas/parse_key.py @@ -0,0 +1,28 @@ +"""Identity of a cacheable parse: equal keys yield identical markdown.""" + +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True, slots=True) +class ParseKey: + source_sha256: str + etl_service: str + mode: str + version: int + + @classmethod + def for_document( + cls, source_sha256: str, *, etl_service: str, mode: str, version: int + ) -> "ParseKey": + return cls( + source_sha256=source_sha256, + etl_service=etl_service, + mode=mode, + version=version, + ) + + @property + def object_suffix(self) -> str: + return f"{self.etl_service}.{self.mode}.v{self.version}.md" From 3c9ea0011d2dd0b9ccf75e2b46de279eef610d63 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:22:48 +0200 Subject: [PATCH 02/52] feat(etl-cache): add EvictionCandidate value object --- .../cache/schemas/eviction_candidate.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 surfsense_backend/app/etl_pipeline/cache/schemas/eviction_candidate.py diff --git a/surfsense_backend/app/etl_pipeline/cache/schemas/eviction_candidate.py b/surfsense_backend/app/etl_pipeline/cache/schemas/eviction_candidate.py new file mode 100644 index 000000000..13a903e7d --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/cache/schemas/eviction_candidate.py @@ -0,0 +1,15 @@ +"""Row projection handed to the eviction policy.""" + +from __future__ import annotations + +from dataclasses import dataclass +from datetime import datetime + + +@dataclass(frozen=True, slots=True) +class EvictionCandidate: + id: int + storage_key: str + size_bytes: int + last_used_at: datetime + times_reused: int From b84debd99917ecce372a207d82774002748f37a2 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:22:48 +0200 Subject: [PATCH 03/52] feat(etl-cache): expose cache schema value objects --- .../app/etl_pipeline/cache/schemas/__init__.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 surfsense_backend/app/etl_pipeline/cache/schemas/__init__.py diff --git a/surfsense_backend/app/etl_pipeline/cache/schemas/__init__.py b/surfsense_backend/app/etl_pipeline/cache/schemas/__init__.py new file mode 100644 index 000000000..c88ac0c72 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/cache/schemas/__init__.py @@ -0,0 +1,11 @@ +"""Pure value objects for the parse cache.""" + +from __future__ import annotations + +from .eviction_candidate import EvictionCandidate +from .parse_key import ParseKey + +__all__ = [ + "EvictionCandidate", + "ParseKey", +] From 205a63b9bccc3b02c11199aa6ae625f0d48026f9 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:22:48 +0200 Subject: [PATCH 04/52] feat(etl-cache): add EtlCacheSettings resolved from config --- .../app/etl_pipeline/cache/settings.py | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 surfsense_backend/app/etl_pipeline/cache/settings.py diff --git a/surfsense_backend/app/etl_pipeline/cache/settings.py b/surfsense_backend/app/etl_pipeline/cache/settings.py new file mode 100644 index 000000000..5911ea222 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/cache/settings.py @@ -0,0 +1,33 @@ +"""Cache configuration resolved from the central ``Config``.""" + +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class EtlCacheSettings: + enabled: bool + parser_version: int + ttl_days: int + max_total_bytes: int + eviction_batch: int + # None for any storage_* field means: reuse the main file_storage backend. + storage_backend: str | None + storage_container: str | None + storage_local_root: str | None + + +def load_etl_cache_settings() -> EtlCacheSettings: + from app.config import config + + return EtlCacheSettings( + enabled=config.ETL_CACHE_ENABLED, + parser_version=config.ETL_CACHE_PARSER_VERSION, + ttl_days=config.ETL_CACHE_TTL_DAYS, + max_total_bytes=config.ETL_CACHE_MAX_TOTAL_MB * 1024 * 1024, + eviction_batch=config.ETL_CACHE_EVICTION_BATCH, + storage_backend=config.ETL_CACHE_STORAGE_BACKEND or None, + storage_container=config.ETL_CACHE_STORAGE_CONTAINER or None, + storage_local_root=config.ETL_CACHE_STORAGE_LOCAL_PATH or None, + ) From c624235780fc965f8f648e85a3e2b68cfa456df4 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:22:48 +0200 Subject: [PATCH 05/52] feat(etl-cache): add CachedParse table model --- .../etl_pipeline/cache/persistence/models.py | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 surfsense_backend/app/etl_pipeline/cache/persistence/models.py diff --git a/surfsense_backend/app/etl_pipeline/cache/persistence/models.py b/surfsense_backend/app/etl_pipeline/cache/persistence/models.py new file mode 100644 index 000000000..bd20bdd12 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/cache/persistence/models.py @@ -0,0 +1,49 @@ +"""``etl_cache_parses``: one reusable parser result per (bytes + recipe).""" + +from __future__ import annotations + +from sqlalchemy import ( + BigInteger, + Column, + DateTime, + Index, + Integer, + String, + UniqueConstraint, +) + +from app.db import BaseModel, TimestampMixin + + +class CachedParse(BaseModel, TimestampMixin): + __tablename__ = "etl_cache_parses" + + # Key: raw bytes + the recipe that produced the markdown. + source_sha256 = Column(String(64), nullable=False) + etl_service = Column(String(32), nullable=False) + mode = Column(String(16), nullable=False) + parser_version = Column(Integer, nullable=False) + + # Where the markdown blob lives (kept out of the row to stay small). + storage_backend = Column(String(32), nullable=False) + storage_key = Column(String, nullable=False) + size_bytes = Column(BigInteger, nullable=False) + + # Payload needed to rebuild the EtlResult on a hit. + content_type = Column(String(32), nullable=False) + actual_pages = Column(Integer, nullable=False, default=0, server_default="0") + + # Drives eviction (popularity + recency). + times_reused = Column(BigInteger, nullable=False, default=0, server_default="0") + last_used_at = Column(DateTime(timezone=True), nullable=False) + + __table_args__ = ( + UniqueConstraint( + "source_sha256", + "etl_service", + "mode", + "parser_version", + name="uq_etl_cache_parses_key", + ), + Index("ix_etl_cache_parses_last_used_at", "last_used_at"), + ) From ea1012797911d677592172b646f73135d6bc7d98 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:22:57 +0200 Subject: [PATCH 06/52] feat(etl-cache): add CachedParseRepository data access --- .../cache/persistence/repository.py | 121 ++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 surfsense_backend/app/etl_pipeline/cache/persistence/repository.py diff --git a/surfsense_backend/app/etl_pipeline/cache/persistence/repository.py b/surfsense_backend/app/etl_pipeline/cache/persistence/repository.py new file mode 100644 index 000000000..05f40eae5 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/cache/persistence/repository.py @@ -0,0 +1,121 @@ +"""CRUD and eviction selectors for ``etl_cache_parses`` (no business rules).""" + +from __future__ import annotations + +from datetime import UTC, datetime + +from sqlalchemy import delete, func, select, update +from sqlalchemy.dialects.postgresql import insert as pg_insert +from sqlalchemy.ext.asyncio import AsyncSession + +from app.etl_pipeline.cache.schemas import EvictionCandidate, ParseKey + +from .models import CachedParse + +_EVICTION_COLUMNS = ( + CachedParse.id, + CachedParse.storage_key, + CachedParse.size_bytes, + CachedParse.last_used_at, + CachedParse.times_reused, +) + + +def _as_eviction_candidate(row) -> EvictionCandidate: + return EvictionCandidate( + id=row.id, + storage_key=row.storage_key, + size_bytes=row.size_bytes, + last_used_at=row.last_used_at, + times_reused=row.times_reused, + ) + + +class CachedParseRepository: + def __init__(self, session: AsyncSession) -> None: + self._session = session + + async def get(self, key: ParseKey) -> CachedParse | None: + result = await self._session.execute( + select(CachedParse).where( + CachedParse.source_sha256 == key.source_sha256, + CachedParse.etl_service == key.etl_service, + CachedParse.mode == key.mode, + CachedParse.parser_version == key.version, + ) + ) + return result.scalars().first() + + async def insert( + self, + *, + key: ParseKey, + content_type: str, + actual_pages: int, + storage_backend: str, + storage_key: str, + size_bytes: int, + ) -> None: + # Concurrent writers parse identical bytes, so a lost race is harmless. + now = datetime.now(UTC) + await self._session.execute( + pg_insert(CachedParse) + .values( + source_sha256=key.source_sha256, + etl_service=key.etl_service, + mode=key.mode, + parser_version=key.version, + content_type=content_type, + actual_pages=actual_pages, + storage_backend=storage_backend, + storage_key=storage_key, + size_bytes=size_bytes, + times_reused=0, + last_used_at=now, + created_at=now, + ) + .on_conflict_do_nothing(constraint="uq_etl_cache_parses_key") + ) + await self._session.commit() + + async def mark_used(self, row_id: int) -> None: + await self._session.execute( + update(CachedParse) + .where(CachedParse.id == row_id) + .values( + times_reused=CachedParse.times_reused + 1, + last_used_at=datetime.now(UTC), + ) + ) + await self._session.commit() + + async def total_size_bytes(self) -> int: + result = await self._session.execute( + select(func.coalesce(func.sum(CachedParse.size_bytes), 0)) + ) + return int(result.scalar() or 0) + + async def select_expired( + self, *, cutoff: datetime, limit: int + ) -> list[EvictionCandidate]: + result = await self._session.execute( + select(*_EVICTION_COLUMNS) + .where(CachedParse.last_used_at < cutoff) + .order_by(CachedParse.last_used_at.asc()) + .limit(limit) + ) + return [_as_eviction_candidate(row) for row in result] + + async def select_coldest(self, *, limit: int) -> list[EvictionCandidate]: + result = await self._session.execute( + select(*_EVICTION_COLUMNS) + .order_by(CachedParse.times_reused.asc(), CachedParse.last_used_at.asc()) + .limit(limit) + ) + return [_as_eviction_candidate(row) for row in result] + + async def delete_by_ids(self, ids: list[int]) -> None: + if not ids: + return + await self._session.execute(delete(CachedParse).where(CachedParse.id.in_(ids))) + await self._session.commit() From 8d3238bcd1fb770b7a624a60b90746b64dda2b0a Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:22:57 +0200 Subject: [PATCH 07/52] feat(etl-cache): expose cache persistence layer --- .../app/etl_pipeline/cache/persistence/__init__.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 surfsense_backend/app/etl_pipeline/cache/persistence/__init__.py diff --git a/surfsense_backend/app/etl_pipeline/cache/persistence/__init__.py b/surfsense_backend/app/etl_pipeline/cache/persistence/__init__.py new file mode 100644 index 000000000..666e4cfa8 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/cache/persistence/__init__.py @@ -0,0 +1,11 @@ +"""Database access for cached parse rows.""" + +from __future__ import annotations + +from .models import CachedParse +from .repository import CachedParseRepository + +__all__ = [ + "CachedParse", + "CachedParseRepository", +] From d9b1b491e921468c90f7d9e2e26174954331250e Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:22:57 +0200 Subject: [PATCH 08/52] feat(etl-cache): add cache blob object-key builder --- .../app/etl_pipeline/cache/storage/object_keys.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 surfsense_backend/app/etl_pipeline/cache/storage/object_keys.py diff --git a/surfsense_backend/app/etl_pipeline/cache/storage/object_keys.py b/surfsense_backend/app/etl_pipeline/cache/storage/object_keys.py new file mode 100644 index 000000000..7b89c3f92 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/cache/storage/object_keys.py @@ -0,0 +1,12 @@ +"""Object keys for cached markdown, namespaced under a dedicated prefix.""" + +from __future__ import annotations + +from app.etl_pipeline.cache.schemas import ParseKey + +CACHE_PREFIX = "etl_cache" + + +def build_parse_object_key(key: ParseKey) -> str: + # Content-addressed: identical bytes + recipe always map to the same key. + return f"{CACHE_PREFIX}/{key.source_sha256}/{key.object_suffix}" From 217d040e9e984586d5415e47458c7670f8a8dad2 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:22:57 +0200 Subject: [PATCH 09/52] feat(etl-cache): resolve cache blob storage backend --- .../app/etl_pipeline/cache/storage/backend.py | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 surfsense_backend/app/etl_pipeline/cache/storage/backend.py diff --git a/surfsense_backend/app/etl_pipeline/cache/storage/backend.py b/surfsense_backend/app/etl_pipeline/cache/storage/backend.py new file mode 100644 index 000000000..ac7501984 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/cache/storage/backend.py @@ -0,0 +1,46 @@ +"""Resolve the storage backend for cache blobs: shared main store or a dedicated one.""" + +from __future__ import annotations + +from functools import lru_cache + +from app.file_storage.backends.base import StorageBackend + + +@lru_cache(maxsize=1) +def resolve_cache_backend() -> StorageBackend: + from app.etl_pipeline.cache.settings import load_etl_cache_settings + + settings = load_etl_cache_settings() + + if not settings.storage_backend: + from app.file_storage.factory import get_storage_backend + + return get_storage_backend() + + backend = settings.storage_backend.strip().lower() + + if backend == "azure": + from app.config import config + + if not settings.storage_container: + raise ValueError("ETL_CACHE_STORAGE_CONTAINER is required for azure cache.") + if not config.AZURE_STORAGE_CONNECTION_STRING: + raise ValueError( + "AZURE_STORAGE_CONNECTION_STRING is required for azure cache." + ) + from app.file_storage.backends.azure import AzureBlobBackend + + return AzureBlobBackend( + connection_string=config.AZURE_STORAGE_CONNECTION_STRING, + container=settings.storage_container, + ) + + if backend == "local": + if not settings.storage_local_root: + raise ValueError("ETL_CACHE_STORAGE_LOCAL_PATH is required for local cache.") + from app.file_storage.backends.local import LocalFileBackend + + return LocalFileBackend(settings.storage_local_root) + + raise ValueError(f"Unknown ETL_CACHE_STORAGE_BACKEND: {settings.storage_backend!r}") From a6f2457c7ca73b4d51faf757f650343b52e04366 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:22:57 +0200 Subject: [PATCH 10/52] feat(etl-cache): add MarkdownCacheStore for cache blobs --- .../cache/storage/markdown_store.py | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 surfsense_backend/app/etl_pipeline/cache/storage/markdown_store.py diff --git a/surfsense_backend/app/etl_pipeline/cache/storage/markdown_store.py b/surfsense_backend/app/etl_pipeline/cache/storage/markdown_store.py new file mode 100644 index 000000000..189f3508b --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/cache/storage/markdown_store.py @@ -0,0 +1,35 @@ +"""Read and write cached markdown blobs through the resolved backend.""" + +from __future__ import annotations + +from app.etl_pipeline.cache.schemas import ParseKey +from app.etl_pipeline.cache.storage.backend import resolve_cache_backend +from app.etl_pipeline.cache.storage.object_keys import build_parse_object_key + +_MARKDOWN_CONTENT_TYPE = "text/markdown; charset=utf-8" + + +class MarkdownCacheStore: + def __init__(self) -> None: + self._backend = resolve_cache_backend() + + @property + def backend_name(self) -> str: + return self._backend.backend_name + + async def save(self, key: ParseKey, markdown: str) -> str: + """Persist the markdown and return its storage key for the index row.""" + storage_key = build_parse_object_key(key) + await self._backend.put( + storage_key, + markdown.encode("utf-8"), + content_type=_MARKDOWN_CONTENT_TYPE, + ) + return storage_key + + async def load(self, storage_key: str) -> str: + chunks = [chunk async for chunk in self._backend.open_stream(storage_key)] + return b"".join(chunks).decode("utf-8") + + async def delete(self, storage_key: str) -> None: + await self._backend.delete(storage_key) From 87fdb37fa3f19b2d618968a8647f0c7d75c09bd6 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:23:40 +0200 Subject: [PATCH 11/52] feat(etl-cache): expose storage layer --- .../app/etl_pipeline/cache/storage/__init__.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 surfsense_backend/app/etl_pipeline/cache/storage/__init__.py diff --git a/surfsense_backend/app/etl_pipeline/cache/storage/__init__.py b/surfsense_backend/app/etl_pipeline/cache/storage/__init__.py new file mode 100644 index 000000000..bed39c510 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/cache/storage/__init__.py @@ -0,0 +1,9 @@ +"""Blob storage for cached parse markdown.""" + +from __future__ import annotations + +from .markdown_store import MarkdownCacheStore + +__all__ = [ + "MarkdownCacheStore", +] From 41dea96af4bc76861825c433967b9788a62613c8 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:23:40 +0200 Subject: [PATCH 12/52] feat(etl-cache): add EtlCacheService --- .../app/etl_pipeline/cache/service.py | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 surfsense_backend/app/etl_pipeline/cache/service.py diff --git a/surfsense_backend/app/etl_pipeline/cache/service.py b/surfsense_backend/app/etl_pipeline/cache/service.py new file mode 100644 index 000000000..49398faf8 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/cache/service.py @@ -0,0 +1,53 @@ +"""Recall and remember parser output, coordinating the index and blob store.""" + +from __future__ import annotations + +import logging + +from sqlalchemy.ext.asyncio import AsyncSession + +from app.etl_pipeline.cache.persistence import CachedParseRepository +from app.etl_pipeline.cache.schemas import ParseKey +from app.etl_pipeline.cache.storage import MarkdownCacheStore +from app.etl_pipeline.etl_document import EtlResult + +logger = logging.getLogger(__name__) + + +class EtlCacheService: + def __init__(self, session: AsyncSession) -> None: + self._index = CachedParseRepository(session) + self._store = MarkdownCacheStore() + + async def recall(self, key: ParseKey) -> EtlResult | None: + """Return the cached result, or None on a miss.""" + row = await self._index.get(key) + if row is None: + return None + + try: + markdown = await self._store.load(row.storage_key) + except Exception: + # Index points at a blob that is gone; treat as a miss and re-parse. + logger.warning("Cache blob missing: %s", row.storage_key, exc_info=True) + return None + + await self._index.mark_used(row.id) + return EtlResult( + markdown_content=markdown, + etl_service=row.etl_service, + actual_pages=row.actual_pages, + content_type=row.content_type, + ) + + async def remember(self, key: ParseKey, result: EtlResult) -> None: + """Store a freshly parsed result for future reuse.""" + storage_key = await self._store.save(key, result.markdown_content) + await self._index.insert( + key=key, + content_type=result.content_type, + actual_pages=result.actual_pages, + storage_backend=self._store.backend_name, + storage_key=storage_key, + size_bytes=len(result.markdown_content.encode("utf-8")), + ) From 758da06c4f6d41d484d475012544a4c3b6558023 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:23:40 +0200 Subject: [PATCH 13/52] feat(etl-cache): add extract_with_cache --- .../etl_pipeline/cache/cached_extraction.py | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 surfsense_backend/app/etl_pipeline/cache/cached_extraction.py diff --git a/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py b/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py new file mode 100644 index 000000000..5348c5f4b --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py @@ -0,0 +1,82 @@ +"""Entry point: serve ETL parses from cache, parsing only on a miss.""" + +from __future__ import annotations + +import asyncio +import hashlib +import logging + +from app.config import config +from app.etl_pipeline.cache.schemas import ParseKey +from app.etl_pipeline.cache.service import EtlCacheService +from app.etl_pipeline.cache.settings import load_etl_cache_settings +from app.etl_pipeline.etl_document import EtlRequest, EtlResult +from app.etl_pipeline.etl_pipeline_service import EtlPipelineService +from app.etl_pipeline.file_classifier import FileCategory, classify_file + +logger = logging.getLogger(__name__) + +_HASH_CHUNK = 1024 * 1024 + + +async def extract_with_cache( + request: EtlRequest, *, vision_llm=None +) -> EtlResult: + """Drop-in for ``EtlPipelineService.extract`` that reuses prior parser output.""" + settings = load_etl_cache_settings() + + # Vision-LLM appends model-generated content not captured by the key, so its + # output must not be shared with plain parses (and vice versa): bypass cache. + cacheable = ( + settings.enabled + and vision_llm is None + and bool(config.ETL_SERVICE) + and classify_file(request.filename) == FileCategory.DOCUMENT + ) + if not cacheable: + return await EtlPipelineService(vision_llm=vision_llm).extract(request) + + key = ParseKey.for_document( + await asyncio.to_thread(_hash_file, request.file_path), + etl_service=config.ETL_SERVICE, + mode=request.processing_mode.value, + version=settings.parser_version, + ) + + cached_result = await _recall(key) + if cached_result is not None: + return cached_result + + result = await EtlPipelineService(vision_llm=vision_llm).extract(request) + await _remember(key, result) + return result + + +async def _recall(key: ParseKey) -> EtlResult | None: + # Caching is best-effort: any failure falls through to a normal parse. + try: + from app.tasks.celery_tasks import get_celery_session_maker + + async with get_celery_session_maker()() as session: + return await EtlCacheService(session).recall(key) + except Exception: + logger.warning("ETL cache recall failed; parsing fresh", exc_info=True) + return None + + +async def _remember(key: ParseKey, result: EtlResult) -> None: + try: + from app.tasks.celery_tasks import get_celery_session_maker + + async with get_celery_session_maker()() as session: + await EtlCacheService(session).remember(key, result) + except Exception: + logger.warning("ETL cache write failed; result not cached", exc_info=True) + + +def _hash_file(path: str) -> str: + digest = hashlib.sha256() + with open(path, "rb") as handle: + for chunk in iter(lambda: handle.read(_HASH_CHUNK), b""): + digest.update(chunk) + return digest.hexdigest() From 7ad39fd995d61d03448b3feb43c4df8da8af2460 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:23:40 +0200 Subject: [PATCH 14/52] feat(etl-cache): add eviction policy --- .../app/etl_pipeline/cache/eviction/policy.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 surfsense_backend/app/etl_pipeline/cache/eviction/policy.py diff --git a/surfsense_backend/app/etl_pipeline/cache/eviction/policy.py b/surfsense_backend/app/etl_pipeline/cache/eviction/policy.py new file mode 100644 index 000000000..5a80752d6 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/cache/eviction/policy.py @@ -0,0 +1,28 @@ +"""Pure selection rules for which cached entries to drop.""" + +from __future__ import annotations + +from collections.abc import Iterable + +from app.etl_pipeline.cache.schemas import EvictionCandidate + + +def select_over_budget( + coldest_first: Iterable[EvictionCandidate], + *, + current_total_bytes: int, + max_total_bytes: int, +) -> list[EvictionCandidate]: + """Pick coldest entries until the footprint drops under the budget.""" + bytes_to_free = current_total_bytes - max_total_bytes + if bytes_to_free <= 0: + return [] + + chosen: list[EvictionCandidate] = [] + bytes_freed = 0 + for candidate in coldest_first: + if bytes_freed >= bytes_to_free: + break + chosen.append(candidate) + bytes_freed += candidate.size_bytes + return chosen From 324ba141a60648dc012f9edfd53be0eafaa9e2b4 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:23:40 +0200 Subject: [PATCH 15/52] feat(etl-cache): add eviction task and public API --- .../app/etl_pipeline/cache/__init__.py | 11 ++++ .../etl_pipeline/cache/eviction/__init__.py | 9 +++ .../app/etl_pipeline/cache/eviction/task.py | 62 +++++++++++++++++++ 3 files changed, 82 insertions(+) create mode 100644 surfsense_backend/app/etl_pipeline/cache/__init__.py create mode 100644 surfsense_backend/app/etl_pipeline/cache/eviction/__init__.py create mode 100644 surfsense_backend/app/etl_pipeline/cache/eviction/task.py diff --git a/surfsense_backend/app/etl_pipeline/cache/__init__.py b/surfsense_backend/app/etl_pipeline/cache/__init__.py new file mode 100644 index 000000000..3f4585778 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/cache/__init__.py @@ -0,0 +1,11 @@ +"""Content-addressed reuse of expensive ETL parser output across workspaces.""" + +from __future__ import annotations + +from app.etl_pipeline.cache.cached_extraction import extract_with_cache +from app.etl_pipeline.cache.service import EtlCacheService + +__all__ = [ + "EtlCacheService", + "extract_with_cache", +] diff --git a/surfsense_backend/app/etl_pipeline/cache/eviction/__init__.py b/surfsense_backend/app/etl_pipeline/cache/eviction/__init__.py new file mode 100644 index 000000000..f47b9c4e0 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/cache/eviction/__init__.py @@ -0,0 +1,9 @@ +"""Background pruning of the parse cache by age and size budget.""" + +from __future__ import annotations + +from .task import evict_etl_cache_task + +__all__ = [ + "evict_etl_cache_task", +] diff --git a/surfsense_backend/app/etl_pipeline/cache/eviction/task.py b/surfsense_backend/app/etl_pipeline/cache/eviction/task.py new file mode 100644 index 000000000..98841b139 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/cache/eviction/task.py @@ -0,0 +1,62 @@ +"""Celery task that prunes the parse cache by TTL, then by size budget.""" + +from __future__ import annotations + +import contextlib +import logging +from datetime import UTC, datetime, timedelta + +from app.celery_app import celery_app +from app.etl_pipeline.cache.eviction.policy import select_over_budget +from app.etl_pipeline.cache.persistence import CachedParseRepository +from app.etl_pipeline.cache.schemas import EvictionCandidate +from app.etl_pipeline.cache.settings import load_etl_cache_settings +from app.etl_pipeline.cache.storage import MarkdownCacheStore +from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task + +logger = logging.getLogger(__name__) + + +@celery_app.task(name="evict_etl_cache") +def evict_etl_cache_task(): + return run_async_celery_task(_evict) + + +async def _evict() -> None: + """Expire stale entries, then shed the coldest overflow only if still over budget.""" + settings = load_etl_cache_settings() + if not settings.enabled: + return + + store = MarkdownCacheStore() + async with get_celery_session_maker()() as session: + index = CachedParseRepository(session) + + cutoff = datetime.now(UTC) - timedelta(days=settings.ttl_days) + expired = await index.select_expired(cutoff=cutoff, limit=settings.eviction_batch) + await _drop(index, store, expired) + + total = await index.total_size_bytes() + if total > settings.max_total_bytes: + coldest = await index.select_coldest(limit=settings.eviction_batch) + over_budget = select_over_budget( + coldest, + current_total_bytes=total, + max_total_bytes=settings.max_total_bytes, + ) + await _drop(index, store, over_budget) + + +async def _drop( + index: CachedParseRepository, + store: MarkdownCacheStore, + candidates: list[EvictionCandidate], +) -> None: + if not candidates: + return + for candidate in candidates: + # Drop the index row even if the blob delete fails (orphan blob is harmless). + with contextlib.suppress(Exception): + await store.delete(candidate.storage_key) + await index.delete_by_ids([candidate.id for candidate in candidates]) + logger.info("Evicted %d cached parses", len(candidates)) From 5c4eec26cc21bcaeb3002746fba49be89e04b35b Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:23:50 +0200 Subject: [PATCH 16/52] feat(config): add ETL_CACHE_* settings --- surfsense_backend/app/config/__init__.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index bbaf3ac55..525fe160d 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -952,6 +952,18 @@ class Config: AZURE_DI_ENDPOINT = os.getenv("AZURE_DI_ENDPOINT") AZURE_DI_KEY = os.getenv("AZURE_DI_KEY") + # ETL parse cache: reuse parser output for identical bytes across workspaces. + ETL_CACHE_ENABLED = os.getenv("ETL_CACHE_ENABLED", "false").strip().lower() == "true" + # Bump to invalidate every cached entry after a parser/behaviour change. + ETL_CACHE_PARSER_VERSION = int(os.getenv("ETL_CACHE_PARSER_VERSION", "1")) + ETL_CACHE_TTL_DAYS = int(os.getenv("ETL_CACHE_TTL_DAYS", "90")) + ETL_CACHE_MAX_TOTAL_MB = int(os.getenv("ETL_CACHE_MAX_TOTAL_MB", "5120")) + ETL_CACHE_EVICTION_BATCH = int(os.getenv("ETL_CACHE_EVICTION_BATCH", "500")) + # Optional dedicated blob storage; unset reuses the main file_storage backend. + ETL_CACHE_STORAGE_BACKEND = os.getenv("ETL_CACHE_STORAGE_BACKEND") + ETL_CACHE_STORAGE_CONTAINER = os.getenv("ETL_CACHE_STORAGE_CONTAINER") + ETL_CACHE_STORAGE_LOCAL_PATH = os.getenv("ETL_CACHE_STORAGE_LOCAL_PATH") + # Proxy provider selection. Maps to a ProxyProvider implementation registered # in app/utils/proxy/registry.py. Add new vendors there and switch via this var. PROXY_PROVIDER = os.getenv("PROXY_PROVIDER", "anonymous_proxies") From 9f29a885b1aa97cc3937c5a94c5dd3413f9988e0 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:23:50 +0200 Subject: [PATCH 17/52] feat(db): register CachedParse model --- surfsense_backend/app/db.py | 1 + 1 file changed, 1 insertion(+) diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index 2d672131b..97843d395 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -2864,6 +2864,7 @@ from app.automations.persistence import ( # noqa: E402, F401 AutomationRun, AutomationTrigger, ) +from app.etl_pipeline.cache.persistence.models import CachedParse # noqa: E402, F401 from app.file_storage.persistence import DocumentFile # noqa: E402, F401 from app.notifications.persistence import Notification # noqa: E402, F401 from app.podcasts.persistence import ( # noqa: E402, F401 From 1c05980ffbd2f89bb62c43d59886230ce931b659 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:23:50 +0200 Subject: [PATCH 18/52] feat(celery): schedule etl cache eviction --- surfsense_backend/app/celery_app.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/surfsense_backend/app/celery_app.py b/surfsense_backend/app/celery_app.py index 5eebffd65..413522189 100644 --- a/surfsense_backend/app/celery_app.py +++ b/surfsense_backend/app/celery_app.py @@ -192,6 +192,7 @@ celery_app = Celery( "app.tasks.celery_tasks.stripe_reconciliation_task", "app.tasks.celery_tasks.auto_reload_task", "app.tasks.celery_tasks.gateway_tasks", + "app.etl_pipeline.cache.eviction.task", "app.automations.tasks.execute_run", "app.automations.triggers.builtin.schedule.selector", "app.automations.triggers.builtin.event.selector", @@ -306,6 +307,12 @@ celery_app.conf.beat_schedule = { "schedule": crontab(hour="3", minute="17"), "options": {"expires": 600}, }, + # Prune the ETL parse cache (TTL + size budget) once daily, off-peak. + "evict-etl-cache": { + "task": "evict_etl_cache", + "schedule": crontab(hour="4", minute="0"), + "options": {"expires": 600}, + }, # Fire due automation schedule triggers (Beat entry owned by the schedule # trigger; see app.automations.triggers.builtin.schedule.source). **SCHEDULE_BEAT_SCHEDULE, From 0dc2ccc003aed18aa17ed9fe5486bf330a503d42 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:23:50 +0200 Subject: [PATCH 19/52] feat(tasks): route extraction through etl cache --- .../app/tasks/document_processors/file_processors.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py index a646b7aa6..0c3d30766 100644 --- a/surfsense_backend/app/tasks/document_processors/file_processors.py +++ b/surfsense_backend/app/tasks/document_processors/file_processors.py @@ -381,7 +381,6 @@ async def _extract_file_content( Tuple of (markdown_content, etl_service_name, billable_pages). """ from app.etl_pipeline.etl_document import EtlRequest, ProcessingMode - from app.etl_pipeline.etl_pipeline_service import EtlPipelineService from app.etl_pipeline.file_classifier import ( FileCategory, classify_file as etl_classify, @@ -432,13 +431,16 @@ async def _extract_file_content( vision_llm = await get_vision_llm(session, search_space_id) - result = await EtlPipelineService(vision_llm=vision_llm).extract( + from app.etl_pipeline.cache import extract_with_cache + + result = await extract_with_cache( EtlRequest( file_path=file_path, filename=filename, estimated_pages=estimated_pages, processing_mode=mode, - ) + ), + vision_llm=vision_llm, ) with contextlib.suppress(Exception): From d898716cf4c9baa62fd6767844a301187addbbe8 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:23:50 +0200 Subject: [PATCH 20/52] feat(migration): add etl_cache_parses table --- .../versions/160_add_etl_cache_parses.py | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 surfsense_backend/alembic/versions/160_add_etl_cache_parses.py diff --git a/surfsense_backend/alembic/versions/160_add_etl_cache_parses.py b/surfsense_backend/alembic/versions/160_add_etl_cache_parses.py new file mode 100644 index 000000000..f021a962a --- /dev/null +++ b/surfsense_backend/alembic/versions/160_add_etl_cache_parses.py @@ -0,0 +1,53 @@ +"""add etl_cache_parses table for content-addressed parse reuse + +Revision ID: 160 +Revises: 159 +""" + +from collections.abc import Sequence + +from alembic import op + +revision: str = "160" +down_revision: str | None = "159" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + op.execute( + """ + CREATE TABLE IF NOT EXISTS etl_cache_parses ( + id SERIAL PRIMARY KEY, + source_sha256 VARCHAR(64) NOT NULL, + etl_service VARCHAR(32) NOT NULL, + mode VARCHAR(16) NOT NULL, + parser_version INTEGER NOT NULL, + storage_backend VARCHAR(32) NOT NULL, + storage_key TEXT NOT NULL, + size_bytes BIGINT NOT NULL, + content_type VARCHAR(32) NOT NULL, + actual_pages INTEGER NOT NULL DEFAULT 0, + times_reused BIGINT NOT NULL DEFAULT 0, + last_used_at TIMESTAMP WITH TIME ZONE NOT NULL, + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), + CONSTRAINT uq_etl_cache_parses_key + UNIQUE (source_sha256, etl_service, mode, parser_version) + ); + """ + ) + + op.execute( + "CREATE INDEX IF NOT EXISTS ix_etl_cache_parses_last_used_at " + "ON etl_cache_parses(last_used_at);" + ) + op.execute( + "CREATE INDEX IF NOT EXISTS ix_etl_cache_parses_created_at " + "ON etl_cache_parses(created_at);" + ) + + +def downgrade() -> None: + op.execute("DROP INDEX IF EXISTS ix_etl_cache_parses_created_at;") + op.execute("DROP INDEX IF EXISTS ix_etl_cache_parses_last_used_at;") + op.execute("DROP TABLE IF EXISTS etl_cache_parses;") From 5af594c405b44a75d0142e7070dbdeb013d67e7d Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:23:50 +0200 Subject: [PATCH 21/52] docs(env): document ETL_CACHE_* settings --- surfsense_backend/.env.example | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/surfsense_backend/.env.example b/surfsense_backend/.env.example index b4f67328c..1924756ce 100644 --- a/surfsense_backend/.env.example +++ b/surfsense_backend/.env.example @@ -311,6 +311,23 @@ FILE_STORAGE_BACKEND=local # AZURE_STORAGE_CONNECTION_STRING=DefaultEndpointsProtocol=https;AccountName=...;AccountKey=...;EndpointSuffix=core.windows.net # AZURE_STORAGE_CONTAINER=surfsense-documents +# ETL Parse Cache +# Reuse parser output for identical file bytes across workspaces (skips paid +# re-parsing on LlamaCloud / Azure DI / Unstructured). Off by default. +ETL_CACHE_ENABLED=false +# Bump to invalidate all cached entries after a parser/behaviour change. +# ETL_CACHE_PARSER_VERSION=1 +# Prune entries unused for this many days. +# ETL_CACHE_TTL_DAYS=90 +# Soft cap on total cached markdown; coldest entries are evicted past it. +# ETL_CACHE_MAX_TOTAL_MB=5120 +# Rows deleted per eviction pass. +# ETL_CACHE_EVICTION_BATCH=500 +# Optional dedicated blob storage; unset reuses the main file storage backend. +# ETL_CACHE_STORAGE_BACKEND=azure +# ETL_CACHE_STORAGE_CONTAINER=surfsense-etl-cache +# ETL_CACHE_STORAGE_LOCAL_PATH=/var/lib/surfsense/etl-cache + # Daytona Sandbox (isolated code execution) # DAYTONA_SANDBOX_ENABLED=FALSE # DAYTONA_API_KEY=your-daytona-api-key From ce1e90386fac8437a9a855e458d54b2e212e1424 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:50:51 +0200 Subject: [PATCH 22/52] refactor(etl-cache): extract pure cacheability gate --- .../etl_pipeline/cache/cached_extraction.py | 14 ++++------ .../app/etl_pipeline/cache/eligibility.py | 28 +++++++++++++++++++ .../etl_pipeline/cache/schemas/parse_key.py | 2 +- 3 files changed, 35 insertions(+), 9 deletions(-) create mode 100644 surfsense_backend/app/etl_pipeline/cache/eligibility.py diff --git a/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py b/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py index 5348c5f4b..dba4b44da 100644 --- a/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py +++ b/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py @@ -7,12 +7,12 @@ import hashlib import logging from app.config import config +from app.etl_pipeline.cache.eligibility import is_parse_cacheable from app.etl_pipeline.cache.schemas import ParseKey from app.etl_pipeline.cache.service import EtlCacheService from app.etl_pipeline.cache.settings import load_etl_cache_settings from app.etl_pipeline.etl_document import EtlRequest, EtlResult from app.etl_pipeline.etl_pipeline_service import EtlPipelineService -from app.etl_pipeline.file_classifier import FileCategory, classify_file logger = logging.getLogger(__name__) @@ -25,13 +25,11 @@ async def extract_with_cache( """Drop-in for ``EtlPipelineService.extract`` that reuses prior parser output.""" settings = load_etl_cache_settings() - # Vision-LLM appends model-generated content not captured by the key, so its - # output must not be shared with plain parses (and vice versa): bypass cache. - cacheable = ( - settings.enabled - and vision_llm is None - and bool(config.ETL_SERVICE) - and classify_file(request.filename) == FileCategory.DOCUMENT + cacheable = is_parse_cacheable( + filename=request.filename, + etl_service=config.ETL_SERVICE, + cache_enabled=settings.enabled, + has_vision_llm=vision_llm is not None, ) if not cacheable: return await EtlPipelineService(vision_llm=vision_llm).extract(request) diff --git a/surfsense_backend/app/etl_pipeline/cache/eligibility.py b/surfsense_backend/app/etl_pipeline/cache/eligibility.py new file mode 100644 index 000000000..18f096218 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/cache/eligibility.py @@ -0,0 +1,28 @@ +"""Gating rule: may this upload be served from / written to the parse cache?""" + +from __future__ import annotations + +from app.etl_pipeline.file_classifier import FileCategory, classify_file + + +def is_parse_cacheable( + *, + filename: str, + etl_service: str | None, + cache_enabled: bool, + has_vision_llm: bool, +) -> bool: + """Only deterministic document parses are shareable across workspaces. + + Vision-LLM runs append model-generated content not captured by the cache key, + and a missing ETL service means there is no document parser to key against -- + both bypass the cache. Non-document categories (plaintext, audio, images, + direct-convert) are cheap or parser-agnostic and are handled outside it. + """ + if not cache_enabled: + return False + if has_vision_llm: + return False + if not etl_service: + return False + return classify_file(filename) == FileCategory.DOCUMENT diff --git a/surfsense_backend/app/etl_pipeline/cache/schemas/parse_key.py b/surfsense_backend/app/etl_pipeline/cache/schemas/parse_key.py index 65e7b08a5..88133a418 100644 --- a/surfsense_backend/app/etl_pipeline/cache/schemas/parse_key.py +++ b/surfsense_backend/app/etl_pipeline/cache/schemas/parse_key.py @@ -15,7 +15,7 @@ class ParseKey: @classmethod def for_document( cls, source_sha256: str, *, etl_service: str, mode: str, version: int - ) -> "ParseKey": + ) -> ParseKey: return cls( source_sha256=source_sha256, etl_service=etl_service, From dddacbe762b4c52652db79e3b1d4d46bf5c88c71 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:50:52 +0200 Subject: [PATCH 23/52] test(etl-cache): cover content-addressing dedup and key shape --- .../tests/unit/etl_pipeline/cache/conftest.py | 28 ++++++++ .../unit/etl_pipeline/cache/test_parse_key.py | 70 +++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 surfsense_backend/tests/unit/etl_pipeline/cache/conftest.py create mode 100644 surfsense_backend/tests/unit/etl_pipeline/cache/test_parse_key.py diff --git a/surfsense_backend/tests/unit/etl_pipeline/cache/conftest.py b/surfsense_backend/tests/unit/etl_pipeline/cache/conftest.py new file mode 100644 index 000000000..c6efddc09 --- /dev/null +++ b/surfsense_backend/tests/unit/etl_pipeline/cache/conftest.py @@ -0,0 +1,28 @@ +"""Stub the cache package __init__s so unit tests import only pure leaf modules. + +The real ``cache``/``storage``/``eviction``/``persistence`` __init__s eagerly +import the facade, file storage, Celery, and ``app.db`` -- none of which a pure +unit test should need. Turning those packages into bare namespace packages lets +``from app.etl_pipeline.cache.. import ...`` resolve the leaf module +without running the heavy __init__. ``schemas`` is left real (it is pure). +""" + +import sys +import types +from pathlib import Path + +_CACHE_DIR = Path(__file__).resolve().parents[4] / "app" / "etl_pipeline" / "cache" + + +def _stub_namespace_package(dotted: str, fs_dir: Path) -> None: + if dotted in sys.modules: + return + module = types.ModuleType(dotted) + module.__path__ = [str(fs_dir)] + module.__package__ = dotted + sys.modules[dotted] = module + + +_stub_namespace_package("app.etl_pipeline.cache", _CACHE_DIR) +_stub_namespace_package("app.etl_pipeline.cache.storage", _CACHE_DIR / "storage") +_stub_namespace_package("app.etl_pipeline.cache.eviction", _CACHE_DIR / "eviction") diff --git a/surfsense_backend/tests/unit/etl_pipeline/cache/test_parse_key.py b/surfsense_backend/tests/unit/etl_pipeline/cache/test_parse_key.py new file mode 100644 index 000000000..d69e74ee0 --- /dev/null +++ b/surfsense_backend/tests/unit/etl_pipeline/cache/test_parse_key.py @@ -0,0 +1,70 @@ +"""Content-addressing: equal (bytes + recipe) must map to one storage location. + +This is the dedup guarantee the whole cache rests on -- two users uploading the +same file under the same parser settings have to land on the same object key, and +any change to bytes or recipe has to land somewhere else. +""" + +from __future__ import annotations + +import pytest + +from app.etl_pipeline.cache.schemas import ParseKey +from app.etl_pipeline.cache.storage.object_keys import ( + CACHE_PREFIX, + build_parse_object_key, +) + +pytestmark = pytest.mark.unit + + +def _key(**overrides) -> ParseKey: + base = { + "source_sha256": "a" * 64, + "etl_service": "LLAMACLOUD", + "mode": "basic", + "version": 1, + } + base.update(overrides) + return ParseKey.for_document( + base["source_sha256"], + etl_service=base["etl_service"], + mode=base["mode"], + version=base["version"], + ) + + +def test_same_bytes_and_recipe_produce_the_same_object_key(): + assert build_parse_object_key(_key()) == build_parse_object_key(_key()) + + +def test_different_bytes_produce_different_object_keys(): + assert build_parse_object_key( + _key(source_sha256="a" * 64) + ) != build_parse_object_key(_key(source_sha256="b" * 64)) + + +@pytest.mark.parametrize( + "field, value", + [ + ("etl_service", "DOCLING"), + ("mode", "premium"), + ("version", 2), + ], +) +def test_any_recipe_change_produces_a_different_object_key(field, value): + # Same bytes but a different parser/mode/version must not collide: the recipe + # is part of the identity, so changing it has to re-parse, not reuse. + assert build_parse_object_key(_key()) != build_parse_object_key( + _key(**{field: value}) + ) + + +def test_object_key_is_prefixed_and_sharded_by_source_hash(): + # Shape matters operationally: a dedicated top-level prefix keeps cache blobs + # out of the normal store, and the sha directory groups every recipe variant + # of one file together. + key = _key() + assert build_parse_object_key(key) == ( + f"{CACHE_PREFIX}/{key.source_sha256}/LLAMACLOUD.basic.v1.md" + ) From a3e7047c3508cd65bd48eee68b6d9f8ca53a7a9c Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:50:52 +0200 Subject: [PATCH 24/52] test(etl-cache): cover cacheability gate rules --- .../etl_pipeline/cache/test_eligibility.py | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 surfsense_backend/tests/unit/etl_pipeline/cache/test_eligibility.py diff --git a/surfsense_backend/tests/unit/etl_pipeline/cache/test_eligibility.py b/surfsense_backend/tests/unit/etl_pipeline/cache/test_eligibility.py new file mode 100644 index 000000000..99d8e67b6 --- /dev/null +++ b/surfsense_backend/tests/unit/etl_pipeline/cache/test_eligibility.py @@ -0,0 +1,88 @@ +"""What is allowed into the cache -- the gating rules, as pure logic. + +These rules decide whether a given upload may be served from / written to the +parse cache. They live in a pure predicate so every branch (disabled, vision, +no service, file category) is covered here without touching DB, storage, or the +parser. +""" + +from __future__ import annotations + +import pytest + +from app.etl_pipeline.cache.eligibility import is_parse_cacheable + +pytestmark = pytest.mark.unit + + +def test_document_with_service_and_cache_on_is_cacheable(): + assert is_parse_cacheable( + filename="report.pdf", + etl_service="LLAMACLOUD", + cache_enabled=True, + has_vision_llm=False, + ) + + +def test_disabled_cache_is_never_cacheable(): + assert not is_parse_cacheable( + filename="report.pdf", + etl_service="LLAMACLOUD", + cache_enabled=False, + has_vision_llm=False, + ) + + +def test_vision_llm_run_is_not_cacheable(): + # Vision appends model output not captured by the key; sharing it would leak + # one run's generated text into a plain parse of the same bytes. + assert not is_parse_cacheable( + filename="report.pdf", + etl_service="LLAMACLOUD", + cache_enabled=True, + has_vision_llm=True, + ) + + +@pytest.mark.parametrize("etl_service", [None, ""]) +def test_missing_etl_service_is_not_cacheable(etl_service): + assert not is_parse_cacheable( + filename="report.pdf", + etl_service=etl_service, + cache_enabled=True, + has_vision_llm=False, + ) + + +@pytest.mark.parametrize( + "filename", + ["paper.pdf", "memo.docx", "slides.pptx", "sheet.xlsx", "book.epub"], +) +def test_document_extensions_are_cacheable(filename): + assert is_parse_cacheable( + filename=filename, + etl_service="LLAMACLOUD", + cache_enabled=True, + has_vision_llm=False, + ) + + +@pytest.mark.parametrize( + "filename", + [ + "notes.txt", # plaintext + "readme.md", # plaintext + "main.py", # plaintext + "podcast.mp3", # audio + "photo.png", # image (vision path / fallback, not a shared doc parse) + "data.csv", # direct-convert + "archive.xyz", # unsupported + ], +) +def test_non_document_categories_are_not_cacheable(filename): + assert not is_parse_cacheable( + filename=filename, + etl_service="LLAMACLOUD", + cache_enabled=True, + has_vision_llm=False, + ) From 3dec3231d0f845225a2502d8a261afcdc7d73b80 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:50:52 +0200 Subject: [PATCH 25/52] test(etl-cache): cover over-budget eviction selection --- .../cache/test_eviction_policy.py | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 surfsense_backend/tests/unit/etl_pipeline/cache/test_eviction_policy.py diff --git a/surfsense_backend/tests/unit/etl_pipeline/cache/test_eviction_policy.py b/surfsense_backend/tests/unit/etl_pipeline/cache/test_eviction_policy.py new file mode 100644 index 000000000..5113d7c42 --- /dev/null +++ b/surfsense_backend/tests/unit/etl_pipeline/cache/test_eviction_policy.py @@ -0,0 +1,76 @@ +"""Size-based eviction: drop just enough of the coldest entries to fit budget. + +The caller supplies candidates already ordered coldest-first; this pure rule only +decides how far down that list to cut. It must never over-evict (stop as soon as +the footprint fits) and never promise more than the candidates can free. +""" + +from __future__ import annotations + +from datetime import UTC, datetime + +import pytest + +from app.etl_pipeline.cache.eviction.policy import select_over_budget +from app.etl_pipeline.cache.schemas import EvictionCandidate + +pytestmark = pytest.mark.unit + + +def _candidate(id_: int, size_bytes: int) -> EvictionCandidate: + return EvictionCandidate( + id=id_, + storage_key=f"etl_cache/{id_}.md", + size_bytes=size_bytes, + last_used_at=datetime(2026, 1, 1, tzinfo=UTC), + times_reused=0, + ) + + +def test_over_budget_drops_coldest_until_it_fits(): + # 300 used, budget 100 -> must free >=200. Coldest-first [120, 90, 70]; + # 120+90=210 >=200, so the third (70) is spared. + coldest_first = [_candidate(1, 120), _candidate(2, 90), _candidate(3, 70)] + + chosen = select_over_budget( + coldest_first, current_total_bytes=300, max_total_bytes=100 + ) + + assert [c.id for c in chosen] == [1, 2] + + +@pytest.mark.parametrize("current_total_bytes", [100, 80]) +def test_within_budget_evicts_nothing(current_total_bytes): + # At or under budget there is nothing to free, so no blob is touched. + coldest_first = [_candidate(1, 50), _candidate(2, 50)] + + chosen = select_over_budget( + coldest_first, + current_total_bytes=current_total_bytes, + max_total_bytes=100, + ) + + assert chosen == [] + + +def test_stops_as_soon_as_one_entry_covers_the_overage(): + # Only 10 over budget; the first (cold) entry already frees enough. + coldest_first = [_candidate(1, 40), _candidate(2, 40)] + + chosen = select_over_budget( + coldest_first, current_total_bytes=110, max_total_bytes=100 + ) + + assert [c.id for c in chosen] == [1] + + +def test_returns_all_candidates_when_they_cannot_free_enough(): + # Deficit is 500 but candidates only total 150: return everything available + # rather than looping forever or raising. + coldest_first = [_candidate(1, 100), _candidate(2, 50)] + + chosen = select_over_budget( + coldest_first, current_total_bytes=600, max_total_bytes=100 + ) + + assert [c.id for c in chosen] == [1, 2] From c49a0f1233bbce2fe556eeaac95654ad60ec09df Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:50:57 +0200 Subject: [PATCH 26/52] test(etl-cache): cover store, service, and repository on real infra --- .../etl_pipeline/cache/conftest.py | 32 +++++++ .../cache/test_cached_parse_repository.py | 96 +++++++++++++++++++ .../cache/test_etl_cache_service.py | 67 +++++++++++++ .../etl_pipeline/cache/test_markdown_store.py | 42 ++++++++ 4 files changed, 237 insertions(+) create mode 100644 surfsense_backend/tests/integration/etl_pipeline/cache/conftest.py create mode 100644 surfsense_backend/tests/integration/etl_pipeline/cache/test_cached_parse_repository.py create mode 100644 surfsense_backend/tests/integration/etl_pipeline/cache/test_etl_cache_service.py create mode 100644 surfsense_backend/tests/integration/etl_pipeline/cache/test_markdown_store.py diff --git a/surfsense_backend/tests/integration/etl_pipeline/cache/conftest.py b/surfsense_backend/tests/integration/etl_pipeline/cache/conftest.py new file mode 100644 index 000000000..4369cc64d --- /dev/null +++ b/surfsense_backend/tests/integration/etl_pipeline/cache/conftest.py @@ -0,0 +1,32 @@ +"""Real-infra fixtures for the parse-cache integration tests. + +``cache_local_storage`` points the cache's blob store at a throwaway directory so +tests exercise the real ``LocalFileBackend`` (no cloud, no mocks). ``clean_cache_table`` +removes rows written through the facade's own committing session, which the +savepoint-rolled-back ``db_session`` cannot undo. +""" + +from __future__ import annotations + +import pytest +import pytest_asyncio +from sqlalchemy import text + + +@pytest.fixture +def cache_local_storage(tmp_path, monkeypatch): + from app.config import config + from app.etl_pipeline.cache.storage.backend import resolve_cache_backend + + monkeypatch.setattr(config, "ETL_CACHE_STORAGE_BACKEND", "local") + monkeypatch.setattr(config, "ETL_CACHE_STORAGE_LOCAL_PATH", str(tmp_path)) + resolve_cache_backend.cache_clear() + yield tmp_path + resolve_cache_backend.cache_clear() + + +@pytest_asyncio.fixture +async def clean_cache_table(async_engine): + yield + async with async_engine.begin() as conn: + await conn.execute(text("DELETE FROM etl_cache_parses")) diff --git a/surfsense_backend/tests/integration/etl_pipeline/cache/test_cached_parse_repository.py b/surfsense_backend/tests/integration/etl_pipeline/cache/test_cached_parse_repository.py new file mode 100644 index 000000000..72e977f11 --- /dev/null +++ b/surfsense_backend/tests/integration/etl_pipeline/cache/test_cached_parse_repository.py @@ -0,0 +1,96 @@ +"""CachedParseRepository against real Postgres: the SQL behind eviction & dedup. + +These verify the parts that only a real database can: coldest-first ordering by +reuse then recency, TTL cutoff selection, the size accumulator, and the +insert-once guarantee under a duplicate key. +""" + +from __future__ import annotations + +from datetime import UTC, datetime, timedelta + +import pytest + +from app.etl_pipeline.cache.persistence import CachedParseRepository +from app.etl_pipeline.cache.schemas import ParseKey + +pytestmark = pytest.mark.integration + + +def _key(sha: str) -> ParseKey: + return ParseKey.for_document( + sha, etl_service="LLAMACLOUD", mode="basic", version=1 + ) + + +async def _insert(repo, *, sha, size=100, storage_key=None): + key = _key(sha) + await repo.insert( + key=key, + content_type="application/pdf", + actual_pages=1, + storage_backend="local", + storage_key=storage_key or f"etl_cache/{sha}.md", + size_bytes=size, + ) + return key + + +async def test_total_size_bytes_sums_all_rows(db_session): + repo = CachedParseRepository(db_session) + await _insert(repo, sha="a" * 64, size=100) + await _insert(repo, sha="b" * 64, size=250) + + assert await repo.total_size_bytes() == 350 + + +async def test_select_coldest_orders_by_reuse_then_recency(db_session): + repo = CachedParseRepository(db_session) + ka = await _insert(repo, sha="a" * 64) + kb = await _insert(repo, sha="b" * 64) + kc = await _insert(repo, sha="c" * 64) + + # Warm B once and C twice; A stays untouched and should be coldest. + await repo.mark_used((await repo.get(kb)).id) + await repo.mark_used((await repo.get(kc)).id) + await repo.mark_used((await repo.get(kc)).id) + + coldest = await repo.select_coldest(limit=10) + + ids_by_reuse = [c.id for c in coldest] + assert ids_by_reuse[:3] == [ + (await repo.get(ka)).id, + (await repo.get(kb)).id, + (await repo.get(kc)).id, + ] + + +async def test_select_expired_returns_only_rows_older_than_cutoff(db_session): + repo = CachedParseRepository(db_session) + await _insert(repo, sha="a" * 64) + + future = datetime.now(UTC) + timedelta(days=1) + past = datetime.now(UTC) - timedelta(days=1) + + # Row was just used, so it's older than a future cutoff but not a past one. + assert len(await repo.select_expired(cutoff=future, limit=10)) == 1 + assert await repo.select_expired(cutoff=past, limit=10) == [] + + +async def test_duplicate_key_insert_keeps_the_first_row(db_session): + repo = CachedParseRepository(db_session) + key = await _insert(repo, sha="a" * 64, size=100, storage_key="etl_cache/first.md") + + # Same content-addressed key (a concurrent re-parse): must be a no-op. + await repo.insert( + key=key, + content_type="application/pdf", + actual_pages=1, + storage_backend="local", + storage_key="etl_cache/second.md", + size_bytes=999, + ) + + row = await repo.get(key) + assert row.storage_key == "etl_cache/first.md" + assert await repo.total_size_bytes() == 100 diff --git a/surfsense_backend/tests/integration/etl_pipeline/cache/test_etl_cache_service.py b/surfsense_backend/tests/integration/etl_pipeline/cache/test_etl_cache_service.py new file mode 100644 index 000000000..df74c97d4 --- /dev/null +++ b/surfsense_backend/tests/integration/etl_pipeline/cache/test_etl_cache_service.py @@ -0,0 +1,67 @@ +"""EtlCacheService end-to-end against real Postgres + real local storage. + +Exercises the public cache surface -- ``recall`` / ``remember`` -- with no mocks: +a miss returns nothing, and a remembered parse comes back as an equivalent +``EtlResult`` rebuilt from the row and the blob. +""" + +from __future__ import annotations + +import pytest + +from app.etl_pipeline.cache.schemas import ParseKey +from app.etl_pipeline.cache.service import EtlCacheService +from app.etl_pipeline.etl_document import EtlResult + +pytestmark = pytest.mark.integration + + +def _key(sha: str = "c" * 64) -> ParseKey: + return ParseKey.for_document( + sha, etl_service="LLAMACLOUD", mode="basic", version=1 + ) + + +async def test_recall_is_a_miss_for_an_unknown_key(db_session, cache_local_storage): + service = EtlCacheService(db_session) + assert await service.recall(_key()) is None + + +async def test_remembered_parse_recalls_as_equivalent_result( + db_session, cache_local_storage +): + service = EtlCacheService(db_session) + stored = EtlResult( + markdown_content="# Cached doc\n\nBody paragraph.\n", + etl_service="LLAMACLOUD", + actual_pages=7, + content_type="application/pdf", + ) + + await service.remember(_key(), stored) + recalled = await service.recall(_key()) + + assert recalled is not None + assert recalled.markdown_content == stored.markdown_content + assert recalled.etl_service == "LLAMACLOUD" + assert recalled.actual_pages == 7 + assert recalled.content_type == "application/pdf" + + +async def test_repeated_recall_keeps_serving_the_same_content( + db_session, cache_local_storage +): + service = EtlCacheService(db_session) + stored = EtlResult( + markdown_content="# Stable\n", + etl_service="LLAMACLOUD", + actual_pages=1, + content_type="application/pdf", + ) + await service.remember(_key(), stored) + + first = await service.recall(_key()) + second = await service.recall(_key()) + + assert first is not None and second is not None + assert first.markdown_content == second.markdown_content == "# Stable\n" diff --git a/surfsense_backend/tests/integration/etl_pipeline/cache/test_markdown_store.py b/surfsense_backend/tests/integration/etl_pipeline/cache/test_markdown_store.py new file mode 100644 index 000000000..a9d685017 --- /dev/null +++ b/surfsense_backend/tests/integration/etl_pipeline/cache/test_markdown_store.py @@ -0,0 +1,42 @@ +"""MarkdownCacheStore against a real local filesystem backend (no mocks). + +Proves the blob side of the cache: markdown written under a content-addressed key +comes back byte-for-byte, and a delete actually removes it. +""" + +from __future__ import annotations + +import pytest + +from app.etl_pipeline.cache.schemas import ParseKey +from app.etl_pipeline.cache.storage import MarkdownCacheStore +from app.etl_pipeline.cache.storage.object_keys import build_parse_object_key + +pytestmark = pytest.mark.integration + + +def _key() -> ParseKey: + return ParseKey.for_document( + "d" * 64, etl_service="LLAMACLOUD", mode="basic", version=1 + ) + + +async def test_save_then_load_round_trips_markdown(cache_local_storage): + store = MarkdownCacheStore() + markdown = "# Title\n\nBody with unicode: café, naïve, 漢字.\n" + + storage_key = await store.save(_key(), markdown) + + assert storage_key == build_parse_object_key(_key()) + assert await store.load(storage_key) == markdown + + +async def test_delete_removes_the_blob(cache_local_storage): + store = MarkdownCacheStore() + storage_key = await store.save(_key(), "to be deleted") + + await store.delete(storage_key) + + # Eviction deleted the blob; a later read must fail rather than serve stale. + with pytest.raises(FileNotFoundError): + await store.load(storage_key) From 1460173dadac5aca8faa6ee8d5e0d5fc96a64a6e Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:50:57 +0200 Subject: [PATCH 27/52] test(etl-cache): cover extract_with_cache end-to-end --- .../cache/test_cached_extraction.py | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 surfsense_backend/tests/integration/etl_pipeline/cache/test_cached_extraction.py diff --git a/surfsense_backend/tests/integration/etl_pipeline/cache/test_cached_extraction.py b/surfsense_backend/tests/integration/etl_pipeline/cache/test_cached_extraction.py new file mode 100644 index 000000000..0b4a3dcf0 --- /dev/null +++ b/surfsense_backend/tests/integration/etl_pipeline/cache/test_cached_extraction.py @@ -0,0 +1,84 @@ +"""extract_with_cache end-to-end: real DB + real local storage. + +The only seam mocked is the parser itself (``EtlPipelineService.extract``) -- the +external boundary the facade wraps. Everything else (eligibility, hashing, recall, +remember, blob I/O) runs for real, so these tests prove the actual cost saving: +identical bytes are parsed once and reused. +""" + +from __future__ import annotations + +import pytest + +from app.config import config +from app.etl_pipeline.cache.cached_extraction import extract_with_cache +from app.etl_pipeline.etl_document import EtlRequest, EtlResult, ProcessingMode + +pytestmark = pytest.mark.integration + + +class _CountingParser: + """Stand-in for the external parser; records how often it actually ran.""" + + def __init__(self, **_kwargs) -> None: + pass + + calls = 0 + + async def extract(self, request: EtlRequest) -> EtlResult: + type(self).calls += 1 + return EtlResult( + markdown_content="# Parsed once\n", + etl_service="LLAMACLOUD", + actual_pages=3, + content_type="application/pdf", + ) + + +@pytest.fixture +def counting_parser(monkeypatch): + _CountingParser.calls = 0 + monkeypatch.setattr( + "app.etl_pipeline.cache.cached_extraction.EtlPipelineService", + _CountingParser, + ) + return _CountingParser + + +async def test_identical_uploads_are_parsed_once_then_served_from_cache( + tmp_path, monkeypatch, counting_parser, cache_local_storage, clean_cache_table +): + monkeypatch.setattr(config, "ETL_CACHE_ENABLED", True) + monkeypatch.setattr(config, "ETL_SERVICE", "LLAMACLOUD") + + pdf = tmp_path / "doc.pdf" + pdf.write_bytes(b"%PDF-1.4 unique-bytes-for-this-test") + request = EtlRequest( + file_path=str(pdf), filename="doc.pdf", processing_mode=ProcessingMode.BASIC + ) + + first = await extract_with_cache(request) + second = await extract_with_cache(request) + + assert counting_parser.calls == 1 # second upload reused the cache + assert first.markdown_content == second.markdown_content == "# Parsed once\n" + assert second.actual_pages == 3 + assert second.content_type == "application/pdf" + + +async def test_disabled_cache_parses_every_time( + tmp_path, monkeypatch, counting_parser +): + monkeypatch.setattr(config, "ETL_CACHE_ENABLED", False) + monkeypatch.setattr(config, "ETL_SERVICE", "LLAMACLOUD") + + pdf = tmp_path / "doc.pdf" + pdf.write_bytes(b"%PDF-1.4 another-unique-payload") + request = EtlRequest( + file_path=str(pdf), filename="doc.pdf", processing_mode=ProcessingMode.BASIC + ) + + await extract_with_cache(request) + await extract_with_cache(request) + + assert counting_parser.calls == 2 # bypassed: no reuse From d5e0280097a643a34b1e18189c872e4fb1bdc04b Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:54:36 +0200 Subject: [PATCH 28/52] test(etl-cache): cover two-phase eviction task on real infra --- .../etl_pipeline/cache/test_eviction_task.py | 96 +++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 surfsense_backend/tests/integration/etl_pipeline/cache/test_eviction_task.py diff --git a/surfsense_backend/tests/integration/etl_pipeline/cache/test_eviction_task.py b/surfsense_backend/tests/integration/etl_pipeline/cache/test_eviction_task.py new file mode 100644 index 000000000..e25cfaef0 --- /dev/null +++ b/surfsense_backend/tests/integration/etl_pipeline/cache/test_eviction_task.py @@ -0,0 +1,96 @@ +"""The eviction task on real infra: TTL expiry first, then coldest-over-budget. + +Seeds entries through the real cache (DB rows + local blobs), runs the actual +``_evict`` coroutine, and checks what survives via ``recall`` -- no mocks. TTL and +budget are driven through config so each phase can be exercised in isolation. +""" + +from __future__ import annotations + +import pytest + +from app.config import config +from app.etl_pipeline.cache.eviction.task import _evict +from app.etl_pipeline.cache.schemas import ParseKey +from app.etl_pipeline.cache.service import EtlCacheService +from app.etl_pipeline.etl_document import EtlResult +from app.tasks.celery_tasks import get_celery_session_maker + +pytestmark = pytest.mark.integration + + +def _key(sha: str) -> ParseKey: + return ParseKey.for_document( + sha, etl_service="LLAMACLOUD", mode="basic", version=1 + ) + + +def _result(markdown: str) -> EtlResult: + return EtlResult( + markdown_content=markdown, + etl_service="LLAMACLOUD", + actual_pages=1, + content_type="application/pdf", + ) + + +async def _remember(key: ParseKey, result: EtlResult) -> None: + async with get_celery_session_maker()() as session: + await EtlCacheService(session).remember(key, result) + + +async def _recall(key: ParseKey) -> EtlResult | None: + async with get_celery_session_maker()() as session: + return await EtlCacheService(session).recall(key) + + +async def test_expired_entries_are_pruned( + monkeypatch, cache_local_storage, clean_cache_table +): + monkeypatch.setattr(config, "ETL_CACHE_ENABLED", True) + monkeypatch.setattr(config, "ETL_CACHE_TTL_DAYS", -1) # cutoff in the future -> stale + monkeypatch.setattr(config, "ETL_CACHE_MAX_TOTAL_MB", 10_000) # size phase no-op + + key = _key("a" * 64) + await _remember(key, _result("# stale doc\n")) + + await _evict() + + assert await _recall(key) is None + + +async def test_coldest_entries_are_shed_when_over_budget( + monkeypatch, cache_local_storage, clean_cache_table +): + monkeypatch.setattr(config, "ETL_CACHE_ENABLED", True) + monkeypatch.setattr(config, "ETL_CACHE_TTL_DAYS", 3650) # nothing TTL-expired + monkeypatch.setattr(config, "ETL_CACHE_MAX_TOTAL_MB", 1) # ~1 MiB budget + + cold = _key("a" * 64) + warm = _key("b" * 64) + # Two ~0.6 MiB entries together exceed the 1 MiB budget; one must go. + await _remember(cold, _result("x" * 600_000)) + await _remember(warm, _result("y" * 600_000)) + + # A reuse makes `warm` warmer than `cold`, so `cold` is the eviction target. + assert await _recall(warm) is not None + + await _evict() + + assert await _recall(cold) is None + assert await _recall(warm) is not None + + +async def test_nothing_is_evicted_within_ttl_and_budget( + monkeypatch, cache_local_storage, clean_cache_table +): + monkeypatch.setattr(config, "ETL_CACHE_ENABLED", True) + monkeypatch.setattr(config, "ETL_CACHE_TTL_DAYS", 3650) + monkeypatch.setattr(config, "ETL_CACHE_MAX_TOTAL_MB", 10_000) + + key = _key("a" * 64) + await _remember(key, _result("# keep me\n")) + + await _evict() + + assert await _recall(key) is not None From 9efe24879dca2e19c799b102ea13524a3822a962 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:57:03 +0200 Subject: [PATCH 29/52] feat(observability): add etl cache lookup and eviction metrics --- .../app/observability/metrics.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/surfsense_backend/app/observability/metrics.py b/surfsense_backend/app/observability/metrics.py index 5ba3be059..4751278a4 100644 --- a/surfsense_backend/app/observability/metrics.py +++ b/surfsense_backend/app/observability/metrics.py @@ -289,6 +289,22 @@ def _etl_extract_outcome(): ) +@lru_cache(maxsize=1) +def _etl_cache_lookups(): + return _get_meter().create_counter( + "surfsense.etl.cache.lookups", + description="Count of ETL parse-cache lookups by outcome (hit/miss).", + ) + + +@lru_cache(maxsize=1) +def _etl_cache_evictions(): + return _get_meter().create_counter( + "surfsense.etl.cache.evictions", + description="Count of ETL parse-cache entries evicted, by phase.", + ) + + @lru_cache(maxsize=1) def _celery_heartbeat_refreshes(): return _get_meter().create_counter( @@ -670,6 +686,28 @@ def record_etl_extract_outcome( ) +def record_etl_cache_lookup( + *, etl_service: str | None, mode: str | None, outcome: str +) -> None: + """Record a parse-cache lookup. ``outcome`` is ``hit`` or ``miss``.""" + _add( + _etl_cache_lookups(), + 1, + { + "etl.service": etl_service or "unknown", + "mode": mode or "unknown", + "outcome": outcome, + }, + ) + + +def record_etl_cache_eviction(count: int, *, phase: str) -> None: + """Record evicted entries. ``phase`` is ``ttl`` or ``size``.""" + if count <= 0: + return + _add(_etl_cache_evictions(), count, {"phase": phase}) + + def record_celery_heartbeat_refresh(*, heartbeat_type: str) -> None: _add(_celery_heartbeat_refreshes(), 1, {"heartbeat.type": heartbeat_type}) @@ -866,6 +904,8 @@ __all__ = [ "record_compaction_run", "record_connector_sync_duration", "record_connector_sync_outcome", + "record_etl_cache_eviction", + "record_etl_cache_lookup", "record_etl_extract_duration", "record_etl_extract_outcome", "record_indexing_document_duration", From 0808fbcdee7d16755e3410604e6b61ebe54e2a67 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:57:03 +0200 Subject: [PATCH 30/52] feat(etl-cache): emit hit/miss and eviction metrics --- .../app/etl_pipeline/cache/cached_extraction.py | 8 ++++++++ .../app/etl_pipeline/cache/eviction/task.py | 10 +++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py b/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py index dba4b44da..b6a9e5531 100644 --- a/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py +++ b/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py @@ -13,6 +13,7 @@ from app.etl_pipeline.cache.service import EtlCacheService from app.etl_pipeline.cache.settings import load_etl_cache_settings from app.etl_pipeline.etl_document import EtlRequest, EtlResult from app.etl_pipeline.etl_pipeline_service import EtlPipelineService +from app.observability import metrics logger = logging.getLogger(__name__) @@ -43,8 +44,15 @@ async def extract_with_cache( cached_result = await _recall(key) if cached_result is not None: + metrics.record_etl_cache_lookup( + etl_service=key.etl_service, mode=key.mode, outcome="hit" + ) + logger.debug("ETL cache hit for %s", key.source_sha256) return cached_result + metrics.record_etl_cache_lookup( + etl_service=key.etl_service, mode=key.mode, outcome="miss" + ) result = await EtlPipelineService(vision_llm=vision_llm).extract(request) await _remember(key, result) return result diff --git a/surfsense_backend/app/etl_pipeline/cache/eviction/task.py b/surfsense_backend/app/etl_pipeline/cache/eviction/task.py index 98841b139..dcda10f61 100644 --- a/surfsense_backend/app/etl_pipeline/cache/eviction/task.py +++ b/surfsense_backend/app/etl_pipeline/cache/eviction/task.py @@ -12,6 +12,7 @@ from app.etl_pipeline.cache.persistence import CachedParseRepository from app.etl_pipeline.cache.schemas import EvictionCandidate from app.etl_pipeline.cache.settings import load_etl_cache_settings from app.etl_pipeline.cache.storage import MarkdownCacheStore +from app.observability import metrics from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task logger = logging.getLogger(__name__) @@ -34,7 +35,7 @@ async def _evict() -> None: cutoff = datetime.now(UTC) - timedelta(days=settings.ttl_days) expired = await index.select_expired(cutoff=cutoff, limit=settings.eviction_batch) - await _drop(index, store, expired) + await _drop(index, store, expired, phase="ttl") total = await index.total_size_bytes() if total > settings.max_total_bytes: @@ -44,13 +45,15 @@ async def _evict() -> None: current_total_bytes=total, max_total_bytes=settings.max_total_bytes, ) - await _drop(index, store, over_budget) + await _drop(index, store, over_budget, phase="size") async def _drop( index: CachedParseRepository, store: MarkdownCacheStore, candidates: list[EvictionCandidate], + *, + phase: str, ) -> None: if not candidates: return @@ -59,4 +62,5 @@ async def _drop( with contextlib.suppress(Exception): await store.delete(candidate.storage_key) await index.delete_by_ids([candidate.id for candidate in candidates]) - logger.info("Evicted %d cached parses", len(candidates)) + metrics.record_etl_cache_eviction(len(candidates), phase=phase) + logger.info("Evicted %d cached parses (%s)", len(candidates), phase) From 99cf212c3160ab6e99af2e72718ed1d5e88e4c85 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 12:19:49 +0200 Subject: [PATCH 31/52] test: fix auth-mode mismatch and stale QuotaInsufficientError kwargs Pin AUTH_TYPE=LOCAL (and REGISTRATION_ENABLED=TRUE) in the test bootstrap so the email/password auth routers mount during integration tests regardless of a developer's .env=GOOGLE; without this the upload tests 404 on registration. Also update three tests to the current QuotaInsufficientError signature (balance_micros) after used_micros/limit_micros were removed. --- surfsense_backend/tests/conftest.py | 8 ++++++++ .../tests/integration/podcasts/test_draft_task.py | 3 +-- .../tests/unit/services/test_quota_checked_vision_llm.py | 3 +-- .../tests/unit/tasks/test_video_presentation_billing.py | 3 +-- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/surfsense_backend/tests/conftest.py b/surfsense_backend/tests/conftest.py index e2b586aa2..e227ed287 100644 --- a/surfsense_backend/tests/conftest.py +++ b/surfsense_backend/tests/conftest.py @@ -13,6 +13,14 @@ TEST_DATABASE_URL = os.environ.get("TEST_DATABASE_URL", _DEFAULT_TEST_DB) # DATABASE_URL in the environment (e.g. from .env or shell profile). os.environ["DATABASE_URL"] = TEST_DATABASE_URL +# Integration tests authenticate over HTTP via email/password, so the +# password-auth routers must be mounted (they are skipped under AUTH_TYPE=GOOGLE). +# setdefault (not load_dotenv, which runs later with override=False) lets a +# developer's .env=GOOGLE be overridden here while still honouring an explicitly +# exported shell AUTH_TYPE. +os.environ.setdefault("AUTH_TYPE", "LOCAL") +os.environ.setdefault("REGISTRATION_ENABLED", "TRUE") + import pytest # noqa: E402 from app.db import DocumentType # noqa: E402 diff --git a/surfsense_backend/tests/integration/podcasts/test_draft_task.py b/surfsense_backend/tests/integration/podcasts/test_draft_task.py index 7dadfc2f5..e9c9e4a9c 100644 --- a/surfsense_backend/tests/integration/podcasts/test_draft_task.py +++ b/surfsense_backend/tests/integration/podcasts/test_draft_task.py @@ -76,8 +76,7 @@ async def test_quota_denial_fails_the_podcast_without_a_transcript( async def _deny(**_kwargs): raise QuotaInsufficientError( usage_type="podcast_generation", - used_micros=5_000_000, - limit_micros=5_000_000, + balance_micros=0, remaining_micros=0, ) yield # pragma: no cover - unreachable, satisfies the CM protocol diff --git a/surfsense_backend/tests/unit/services/test_quota_checked_vision_llm.py b/surfsense_backend/tests/unit/services/test_quota_checked_vision_llm.py index 9e35b6f9c..17df89135 100644 --- a/surfsense_backend/tests/unit/services/test_quota_checked_vision_llm.py +++ b/surfsense_backend/tests/unit/services/test_quota_checked_vision_llm.py @@ -105,8 +105,7 @@ async def test_ainvoke_propagates_quota_insufficient_error(monkeypatch): async def _denying_billable_call(**_kwargs): raise QuotaInsufficientError( usage_type="vision_extraction", - used_micros=5_000_000, - limit_micros=5_000_000, + balance_micros=0, remaining_micros=0, ) yield # unreachable but required for asynccontextmanager type diff --git a/surfsense_backend/tests/unit/tasks/test_video_presentation_billing.py b/surfsense_backend/tests/unit/tasks/test_video_presentation_billing.py index 423b64ddb..97c1551a5 100644 --- a/surfsense_backend/tests/unit/tasks/test_video_presentation_billing.py +++ b/surfsense_backend/tests/unit/tasks/test_video_presentation_billing.py @@ -98,8 +98,7 @@ async def _denying_billable_call(**kwargs): _CALL_LOG.append(kwargs) raise QuotaInsufficientError( usage_type=kwargs.get("usage_type", "?"), - used_micros=5_000_000, - limit_micros=5_000_000, + balance_micros=0, remaining_micros=0, ) yield SimpleNamespace() # pragma: no cover From 0fb1d3d37b5e697bf0cf3b3286f11a8f57dfebb0 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 14:47:25 +0200 Subject: [PATCH 32/52] feat(etl-cache): route all file-based sources through the parse cache Every file ingestion path (Dropbox, Google Drive / Composio Drive, OneDrive, local folder, Obsidian, and the legacy upload handlers) now parses via the extract_with_cache facade instead of calling EtlPipelineService.extract directly, so identical bytes are deduplicated globally regardless of source. vision_llm is passed through, keeping the existing cacheability gate intact. --- .../connectors/dropbox/content_extractor.py | 7 ++++--- .../google_drive/content_extractor.py | 9 +++++---- .../connectors/onedrive/content_extractor.py | 9 +++++---- .../app/services/obsidian_plugin_indexer.py | 7 ++++--- .../local_folder_indexer.py | 7 ++++--- .../document_processors/file_processors.py | 19 +++++++++++-------- 6 files changed, 33 insertions(+), 25 deletions(-) diff --git a/surfsense_backend/app/connectors/dropbox/content_extractor.py b/surfsense_backend/app/connectors/dropbox/content_extractor.py index 372d2fc82..300010c26 100644 --- a/surfsense_backend/app/connectors/dropbox/content_extractor.py +++ b/surfsense_backend/app/connectors/dropbox/content_extractor.py @@ -90,11 +90,12 @@ async def download_and_extract_content( if error: return None, metadata, error + from app.etl_pipeline.cache import extract_with_cache from app.etl_pipeline.etl_document import EtlRequest - from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - result = await EtlPipelineService(vision_llm=vision_llm).extract( - EtlRequest(file_path=temp_file_path, filename=file_name) + result = await extract_with_cache( + EtlRequest(file_path=temp_file_path, filename=file_name), + vision_llm=vision_llm, ) markdown = result.markdown_content return markdown, metadata, None diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py index 59392831d..1ea047978 100644 --- a/surfsense_backend/app/connectors/google_drive/content_extractor.py +++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py @@ -122,12 +122,13 @@ async def download_and_extract_content( async def _parse_file_to_markdown( file_path: str, filename: str, *, vision_llm=None ) -> str: - """Parse a local file to markdown using the unified ETL pipeline.""" + """Parse a local file to markdown via the cache-aware ETL pipeline.""" + from app.etl_pipeline.cache import extract_with_cache from app.etl_pipeline.etl_document import EtlRequest - from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - result = await EtlPipelineService(vision_llm=vision_llm).extract( - EtlRequest(file_path=file_path, filename=filename) + result = await extract_with_cache( + EtlRequest(file_path=file_path, filename=filename), + vision_llm=vision_llm, ) return result.markdown_content diff --git a/surfsense_backend/app/connectors/onedrive/content_extractor.py b/surfsense_backend/app/connectors/onedrive/content_extractor.py index 3154f2eca..fb1d31fbc 100644 --- a/surfsense_backend/app/connectors/onedrive/content_extractor.py +++ b/surfsense_backend/app/connectors/onedrive/content_extractor.py @@ -84,11 +84,12 @@ async def download_and_extract_content( async def _parse_file_to_markdown( file_path: str, filename: str, *, vision_llm=None ) -> str: - """Parse a local file to markdown using the unified ETL pipeline.""" + """Parse a local file to markdown via the cache-aware ETL pipeline.""" + from app.etl_pipeline.cache import extract_with_cache from app.etl_pipeline.etl_document import EtlRequest - from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - result = await EtlPipelineService(vision_llm=vision_llm).extract( - EtlRequest(file_path=file_path, filename=filename) + result = await extract_with_cache( + EtlRequest(file_path=file_path, filename=filename), + vision_llm=vision_llm, ) return result.markdown_content diff --git a/surfsense_backend/app/services/obsidian_plugin_indexer.py b/surfsense_backend/app/services/obsidian_plugin_indexer.py index 13f43d1ee..cd05d7935 100644 --- a/surfsense_backend/app/services/obsidian_plugin_indexer.py +++ b/surfsense_backend/app/services/obsidian_plugin_indexer.py @@ -199,11 +199,12 @@ async def _extract_binary_attachment_markdown( async def _run_etl_extract(*, file_path: str, filename: str, vision_llm): """Lazy-load ETL dependencies to avoid module-import cycles.""" + from app.etl_pipeline.cache import extract_with_cache from app.etl_pipeline.etl_document import EtlRequest - from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - return await EtlPipelineService(vision_llm=vision_llm).extract( - EtlRequest(file_path=file_path, filename=filename) + return await extract_with_cache( + EtlRequest(file_path=file_path, filename=filename), + vision_llm=vision_llm, ) diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py index 1a2d4b967..2505fa7c4 100644 --- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py @@ -162,12 +162,13 @@ async def _read_file_content( All file types (plaintext, audio, direct-convert, document, image) are handled by ``EtlPipelineService``. """ + from app.etl_pipeline.cache import extract_with_cache from app.etl_pipeline.etl_document import EtlRequest, ProcessingMode - from app.etl_pipeline.etl_pipeline_service import EtlPipelineService mode = ProcessingMode.coerce(processing_mode) - result = await EtlPipelineService(vision_llm=vision_llm).extract( - EtlRequest(file_path=file_path, filename=filename, processing_mode=mode) + result = await extract_with_cache( + EtlRequest(file_path=file_path, filename=filename, processing_mode=mode), + vision_llm=vision_llm, ) return result.markdown_content diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py index 0c3d30766..174ac966d 100644 --- a/surfsense_backend/app/tasks/document_processors/file_processors.py +++ b/surfsense_backend/app/tasks/document_processors/file_processors.py @@ -1,8 +1,9 @@ """ File document processors orchestrating content extraction and indexing. -Delegates content extraction to ``app.etl_pipeline.EtlPipelineService`` and -keeps only orchestration concerns (notifications, logging, page limits, saving). +Delegates content extraction to the cache-aware ``extract_with_cache`` facade +(over ``EtlPipelineService``) and keeps only orchestration concerns +(notifications, logging, page limits, saving). """ from __future__ import annotations @@ -116,8 +117,8 @@ async def _log_page_divergence( async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | None: """Extract content from a non-document file (plaintext/direct_convert/audio/image) via the unified ETL pipeline.""" + from app.etl_pipeline.cache import extract_with_cache from app.etl_pipeline.etl_document import EtlRequest - from app.etl_pipeline.etl_pipeline_service import EtlPipelineService await _notify(ctx, "parsing", "Processing file") await ctx.task_logger.log_task_progress( @@ -136,8 +137,9 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id) - etl_result = await EtlPipelineService(vision_llm=vision_llm).extract( - EtlRequest(file_path=ctx.file_path, filename=ctx.filename) + etl_result = await extract_with_cache( + EtlRequest(file_path=ctx.file_path, filename=ctx.filename), + vision_llm=vision_llm, ) with contextlib.suppress(Exception): @@ -183,8 +185,8 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No async def _process_document_upload(ctx: _ProcessingContext) -> Document | None: """Route a document file to the configured ETL service via the unified pipeline.""" + from app.etl_pipeline.cache import extract_with_cache from app.etl_pipeline.etl_document import EtlRequest, ProcessingMode - from app.etl_pipeline.etl_pipeline_service import EtlPipelineService from app.services.etl_credit_service import ( EtlCreditService, InsufficientCreditsError, @@ -237,13 +239,14 @@ async def _process_document_upload(ctx: _ProcessingContext) -> Document | None: vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id) - etl_result = await EtlPipelineService(vision_llm=vision_llm).extract( + etl_result = await extract_with_cache( EtlRequest( file_path=ctx.file_path, filename=ctx.filename, estimated_pages=estimated_pages, processing_mode=mode, - ) + ), + vision_llm=vision_llm, ) with contextlib.suppress(Exception): From cf208365b471941b967d4f2514edc54bad20213c Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 16:48:01 +0200 Subject: [PATCH 33/52] feat(index-cache): add embedding set value objects --- .../cache/schemas/__init__.py | 12 ++++++++ .../cache/schemas/embedding_key.py | 27 +++++++++++++++++ .../cache/schemas/embedding_set.py | 29 +++++++++++++++++++ 3 files changed, 68 insertions(+) create mode 100644 surfsense_backend/app/indexing_pipeline/cache/schemas/__init__.py create mode 100644 surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_key.py create mode 100644 surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_set.py diff --git a/surfsense_backend/app/indexing_pipeline/cache/schemas/__init__.py b/surfsense_backend/app/indexing_pipeline/cache/schemas/__init__.py new file mode 100644 index 000000000..8714e2d86 --- /dev/null +++ b/surfsense_backend/app/indexing_pipeline/cache/schemas/__init__.py @@ -0,0 +1,12 @@ +"""Pure value objects for the index cache.""" + +from __future__ import annotations + +from .embedding_key import EmbeddingKey +from .embedding_set import CachedChunk, EmbeddingSet + +__all__ = [ + "CachedChunk", + "EmbeddingKey", + "EmbeddingSet", +] diff --git a/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_key.py b/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_key.py new file mode 100644 index 000000000..55d891e73 --- /dev/null +++ b/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_key.py @@ -0,0 +1,27 @@ +"""Identity of a cacheable embedding set: equal keys yield identical vectors. + +Embeddings depend on the markdown text, the embedding model, and the chunker -- +never on how the markdown was produced. So the key is the markdown's own hash +plus the model and chunker recipe, not the upstream parse identity. +""" + +from __future__ import annotations + +import hashlib +from dataclasses import dataclass + + +@dataclass(frozen=True, slots=True) +class EmbeddingKey: + markdown_sha256: str + embedding_model: str + embedding_dim: int + chunker_kind: str + chunker_version: int + + @property + def object_suffix(self) -> str: + # Fingerprint the model so distinct models never share a blob, while the + # markdown hash (the object's folder) stays human-readable. + fingerprint = hashlib.sha256(self.embedding_model.encode("utf-8")).hexdigest() + return f"{fingerprint[:16]}.{self.chunker_kind}.v{self.chunker_version}.emb" diff --git a/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_set.py b/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_set.py new file mode 100644 index 000000000..68c3a5211 --- /dev/null +++ b/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_set.py @@ -0,0 +1,29 @@ +"""The cached payload: a document's chunk texts paired with their vectors.""" + +from __future__ import annotations + +from dataclasses import dataclass + +import numpy as np + + +@dataclass(frozen=True, slots=True) +class CachedChunk: + text: str + embedding: np.ndarray + + +@dataclass(frozen=True, slots=True) +class EmbeddingSet: + """Everything the indexer needs to rebuild a document's chunks without embedding. + + ``summary_embedding`` is the document-level vector; ``chunks`` are the ordered + chunk texts and their vectors. + """ + + summary_embedding: np.ndarray + chunks: list[CachedChunk] + + @property + def chunk_count(self) -> int: + return len(self.chunks) From 59fa4c38c3613f47ae4336ed6a9f1a0d698b749a Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 16:48:01 +0200 Subject: [PATCH 34/52] feat(index-cache): add pickle-free blob serialization --- .../indexing_pipeline/cache/serialization.py | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 surfsense_backend/app/indexing_pipeline/cache/serialization.py diff --git a/surfsense_backend/app/indexing_pipeline/cache/serialization.py b/surfsense_backend/app/indexing_pipeline/cache/serialization.py new file mode 100644 index 000000000..f9d53b471 --- /dev/null +++ b/surfsense_backend/app/indexing_pipeline/cache/serialization.py @@ -0,0 +1,71 @@ +"""Serialize an EmbeddingSet to a compact, self-describing blob (no pickle). + +Layout: ``MAGIC | uint32 header_len | json header | float32 matrix``. The header +carries the dim, chunk count, and ordered chunk texts; the matrix holds the +summary vector followed by one row per chunk, all float32 for compactness. +""" + +from __future__ import annotations + +import json +import struct + +import numpy as np + +from app.indexing_pipeline.cache.schemas import CachedChunk, EmbeddingSet + +# Marker at the start of every blob: "SurfSense EMBeddings, version 1"-> SSEMB1. Lets us +# reject foreign blobs and bump the trailing digit if the layout ever changes. +_MAGIC = b"SSEMB1" +# 4-byte big-endian unsigned int written before the variable-length JSON header, +# so the reader knows where the header ends and the float matrix begins. +_HEADER_LEN = struct.Struct(">I") + + +def serialize(embedding_set: EmbeddingSet) -> bytes: + summary = np.asarray(embedding_set.summary_embedding, dtype=np.float32).reshape(-1) + dim = int(summary.shape[0]) + + rows = [summary] + texts: list[str] = [] + for chunk in embedding_set.chunks: + vector = np.asarray(chunk.embedding, dtype=np.float32).reshape(-1) + if vector.shape[0] != dim: + raise ValueError("All vectors in an embedding set must share one dimension.") + rows.append(vector) + texts.append(chunk.text) + + matrix = np.stack(rows, axis=0) + header = json.dumps( + {"dim": dim, "count": len(texts), "texts": texts}, ensure_ascii=False + ).encode("utf-8") + return b"".join( + [_MAGIC, _HEADER_LEN.pack(len(header)), header, matrix.tobytes(order="C")] + ) + + +def deserialize(blob: bytes) -> EmbeddingSet: + view = memoryview(blob) + if bytes(view[: len(_MAGIC)]) != _MAGIC: + raise ValueError("Unrecognized embedding cache blob.") + + offset = len(_MAGIC) + (header_len,) = _HEADER_LEN.unpack(view[offset : offset + _HEADER_LEN.size]) + offset += _HEADER_LEN.size + + header = json.loads(bytes(view[offset : offset + header_len]).decode("utf-8")) + offset += header_len + + dim = int(header["dim"]) + count = int(header["count"]) + texts: list[str] = header["texts"] + + matrix = np.frombuffer(view[offset:], dtype=np.float32) + if matrix.shape[0] != (count + 1) * dim: + raise ValueError("Embedding cache blob is truncated or corrupt.") + matrix = matrix.reshape(count + 1, dim) + + return EmbeddingSet( + summary_embedding=matrix[0], + chunks=[CachedChunk(text=texts[i], embedding=matrix[i + 1]) for i in range(count)], + ) From f5411145446aa253cfe21802ce1666f484a0a006 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 16:48:01 +0200 Subject: [PATCH 35/52] feat(index-cache): add cached embedding set table and repository --- .../161_add_index_cache_embedding_sets.py | 53 ++++++++ surfsense_backend/app/db.py | 3 + .../cache/persistence/__init__.py | 11 ++ .../cache/persistence/models.py | 47 +++++++ .../cache/persistence/repository.py | 126 ++++++++++++++++++ 5 files changed, 240 insertions(+) create mode 100644 surfsense_backend/alembic/versions/161_add_index_cache_embedding_sets.py create mode 100644 surfsense_backend/app/indexing_pipeline/cache/persistence/__init__.py create mode 100644 surfsense_backend/app/indexing_pipeline/cache/persistence/models.py create mode 100644 surfsense_backend/app/indexing_pipeline/cache/persistence/repository.py diff --git a/surfsense_backend/alembic/versions/161_add_index_cache_embedding_sets.py b/surfsense_backend/alembic/versions/161_add_index_cache_embedding_sets.py new file mode 100644 index 000000000..8441dcf6e --- /dev/null +++ b/surfsense_backend/alembic/versions/161_add_index_cache_embedding_sets.py @@ -0,0 +1,53 @@ +"""add index_cache_embedding_sets table for content-addressed embedding reuse + +Revision ID: 161 +Revises: 160 +""" + +from collections.abc import Sequence + +from alembic import op + +revision: str = "161" +down_revision: str | None = "160" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + op.execute( + """ + CREATE TABLE IF NOT EXISTS index_cache_embedding_sets ( + id SERIAL PRIMARY KEY, + markdown_sha256 VARCHAR(64) NOT NULL, + embedding_model VARCHAR(255) NOT NULL, + embedding_dim INTEGER NOT NULL, + chunker_kind VARCHAR(8) NOT NULL, + chunker_version INTEGER NOT NULL, + storage_backend VARCHAR(32) NOT NULL, + storage_key TEXT NOT NULL, + size_bytes BIGINT NOT NULL, + chunk_count INTEGER NOT NULL DEFAULT 0, + times_reused BIGINT NOT NULL DEFAULT 0, + last_used_at TIMESTAMP WITH TIME ZONE NOT NULL, + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), + CONSTRAINT uq_index_cache_embedding_sets_key + UNIQUE (markdown_sha256, embedding_model, chunker_kind, chunker_version) + ); + """ + ) + + op.execute( + "CREATE INDEX IF NOT EXISTS ix_index_cache_embedding_sets_last_used_at " + "ON index_cache_embedding_sets(last_used_at);" + ) + op.execute( + "CREATE INDEX IF NOT EXISTS ix_index_cache_embedding_sets_created_at " + "ON index_cache_embedding_sets(created_at);" + ) + + +def downgrade() -> None: + op.execute("DROP INDEX IF EXISTS ix_index_cache_embedding_sets_created_at;") + op.execute("DROP INDEX IF EXISTS ix_index_cache_embedding_sets_last_used_at;") + op.execute("DROP TABLE IF EXISTS index_cache_embedding_sets;") diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index 97843d395..9ec13f4e2 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -2866,6 +2866,9 @@ from app.automations.persistence import ( # noqa: E402, F401 ) from app.etl_pipeline.cache.persistence.models import CachedParse # noqa: E402, F401 from app.file_storage.persistence import DocumentFile # noqa: E402, F401 +from app.indexing_pipeline.cache.persistence.models import ( # noqa: E402, F401 + CachedEmbeddingSet, +) from app.notifications.persistence import Notification # noqa: E402, F401 from app.podcasts.persistence import ( # noqa: E402, F401 Podcast, diff --git a/surfsense_backend/app/indexing_pipeline/cache/persistence/__init__.py b/surfsense_backend/app/indexing_pipeline/cache/persistence/__init__.py new file mode 100644 index 000000000..62cde0d05 --- /dev/null +++ b/surfsense_backend/app/indexing_pipeline/cache/persistence/__init__.py @@ -0,0 +1,11 @@ +"""Database access for cached embedding sets.""" + +from __future__ import annotations + +from .models import CachedEmbeddingSet +from .repository import CachedEmbeddingSetRepository + +__all__ = [ + "CachedEmbeddingSet", + "CachedEmbeddingSetRepository", +] diff --git a/surfsense_backend/app/indexing_pipeline/cache/persistence/models.py b/surfsense_backend/app/indexing_pipeline/cache/persistence/models.py new file mode 100644 index 000000000..e33e470f0 --- /dev/null +++ b/surfsense_backend/app/indexing_pipeline/cache/persistence/models.py @@ -0,0 +1,47 @@ +"""``index_cache_embedding_sets``: one reusable chunk+embedding set per markdown.""" + +from __future__ import annotations + +from sqlalchemy import ( + BigInteger, + Column, + DateTime, + Index, + Integer, + String, + UniqueConstraint, +) + +from app.db import BaseModel, TimestampMixin + + +class CachedEmbeddingSet(BaseModel, TimestampMixin): + __tablename__ = "index_cache_embedding_sets" + + # Key: markdown text + the recipe that turned it into vectors. + markdown_sha256 = Column(String(64), nullable=False) + embedding_model = Column(String(255), nullable=False) + embedding_dim = Column(Integer, nullable=False) + chunker_kind = Column(String(8), nullable=False) + chunker_version = Column(Integer, nullable=False) + + # Where the embedding blob lives (kept out of the row to stay small). + storage_backend = Column(String(32), nullable=False) + storage_key = Column(String, nullable=False) + size_bytes = Column(BigInteger, nullable=False) + chunk_count = Column(Integer, nullable=False, default=0, server_default="0") + + # Drives eviction (popularity + recency). + times_reused = Column(BigInteger, nullable=False, default=0, server_default="0") + last_used_at = Column(DateTime(timezone=True), nullable=False) + + __table_args__ = ( + UniqueConstraint( + "markdown_sha256", + "embedding_model", + "chunker_kind", + "chunker_version", + name="uq_index_cache_embedding_sets_key", + ), + Index("ix_index_cache_embedding_sets_last_used_at", "last_used_at"), + ) diff --git a/surfsense_backend/app/indexing_pipeline/cache/persistence/repository.py b/surfsense_backend/app/indexing_pipeline/cache/persistence/repository.py new file mode 100644 index 000000000..0bb0f8f23 --- /dev/null +++ b/surfsense_backend/app/indexing_pipeline/cache/persistence/repository.py @@ -0,0 +1,126 @@ +"""CRUD and eviction selectors for ``index_cache_embedding_sets`` (no business rules).""" + +from __future__ import annotations + +from datetime import UTC, datetime + +from sqlalchemy import delete, func, select, update +from sqlalchemy.dialects.postgresql import insert as pg_insert +from sqlalchemy.ext.asyncio import AsyncSession + +from app.etl_pipeline.cache.schemas import EvictionCandidate +from app.indexing_pipeline.cache.schemas import EmbeddingKey + +from .models import CachedEmbeddingSet + +_EVICTION_COLUMNS = ( + CachedEmbeddingSet.id, + CachedEmbeddingSet.storage_key, + CachedEmbeddingSet.size_bytes, + CachedEmbeddingSet.last_used_at, + CachedEmbeddingSet.times_reused, +) + + +def _as_eviction_candidate(row) -> EvictionCandidate: + return EvictionCandidate( + id=row.id, + storage_key=row.storage_key, + size_bytes=row.size_bytes, + last_used_at=row.last_used_at, + times_reused=row.times_reused, + ) + + +class CachedEmbeddingSetRepository: + def __init__(self, session: AsyncSession) -> None: + self._session = session + + async def get(self, key: EmbeddingKey) -> CachedEmbeddingSet | None: + result = await self._session.execute( + select(CachedEmbeddingSet).where( + CachedEmbeddingSet.markdown_sha256 == key.markdown_sha256, + CachedEmbeddingSet.embedding_model == key.embedding_model, + CachedEmbeddingSet.chunker_kind == key.chunker_kind, + CachedEmbeddingSet.chunker_version == key.chunker_version, + ) + ) + return result.scalars().first() + + async def insert( + self, + *, + key: EmbeddingKey, + storage_backend: str, + storage_key: str, + size_bytes: int, + chunk_count: int, + ) -> None: + # Concurrent writers embed identical markdown, so a lost race is harmless. + now = datetime.now(UTC) + await self._session.execute( + pg_insert(CachedEmbeddingSet) + .values( + markdown_sha256=key.markdown_sha256, + embedding_model=key.embedding_model, + embedding_dim=key.embedding_dim, + chunker_kind=key.chunker_kind, + chunker_version=key.chunker_version, + storage_backend=storage_backend, + storage_key=storage_key, + size_bytes=size_bytes, + chunk_count=chunk_count, + times_reused=0, + last_used_at=now, + created_at=now, + ) + .on_conflict_do_nothing(constraint="uq_index_cache_embedding_sets_key") + ) + await self._session.commit() + + async def mark_used(self, row_id: int) -> None: + await self._session.execute( + update(CachedEmbeddingSet) + .where(CachedEmbeddingSet.id == row_id) + .values( + times_reused=CachedEmbeddingSet.times_reused + 1, + last_used_at=datetime.now(UTC), + ) + ) + await self._session.commit() + + async def total_size_bytes(self) -> int: + result = await self._session.execute( + select(func.coalesce(func.sum(CachedEmbeddingSet.size_bytes), 0)) + ) + return int(result.scalar() or 0) + + async def select_expired( + self, *, cutoff: datetime, limit: int + ) -> list[EvictionCandidate]: + result = await self._session.execute( + select(*_EVICTION_COLUMNS) + .where(CachedEmbeddingSet.last_used_at < cutoff) + .order_by(CachedEmbeddingSet.last_used_at.asc()) + .limit(limit) + ) + return [_as_eviction_candidate(row) for row in result] + + async def select_coldest(self, *, limit: int) -> list[EvictionCandidate]: + result = await self._session.execute( + select(*_EVICTION_COLUMNS) + .order_by( + CachedEmbeddingSet.times_reused.asc(), + CachedEmbeddingSet.last_used_at.asc(), + ) + .limit(limit) + ) + return [_as_eviction_candidate(row) for row in result] + + async def delete_by_ids(self, ids: list[int]) -> None: + if not ids: + return + await self._session.execute( + delete(CachedEmbeddingSet).where(CachedEmbeddingSet.id.in_(ids)) + ) + await self._session.commit() From ad6da7c6afbbef3aca89100fca0caefde47e3c0d Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 16:48:01 +0200 Subject: [PATCH 36/52] feat(index-cache): add embedding blob store sharing the cache backend --- .../cache/storage/__init__.py | 9 +++++ .../cache/storage/embedding_store.py | 39 +++++++++++++++++++ .../cache/storage/object_keys.py | 12 ++++++ 3 files changed, 60 insertions(+) create mode 100644 surfsense_backend/app/indexing_pipeline/cache/storage/__init__.py create mode 100644 surfsense_backend/app/indexing_pipeline/cache/storage/embedding_store.py create mode 100644 surfsense_backend/app/indexing_pipeline/cache/storage/object_keys.py diff --git a/surfsense_backend/app/indexing_pipeline/cache/storage/__init__.py b/surfsense_backend/app/indexing_pipeline/cache/storage/__init__.py new file mode 100644 index 000000000..72b04c34d --- /dev/null +++ b/surfsense_backend/app/indexing_pipeline/cache/storage/__init__.py @@ -0,0 +1,9 @@ +"""Blob storage for cached embedding sets.""" + +from __future__ import annotations + +from .embedding_store import EmbeddingCacheStore + +__all__ = [ + "EmbeddingCacheStore", +] diff --git a/surfsense_backend/app/indexing_pipeline/cache/storage/embedding_store.py b/surfsense_backend/app/indexing_pipeline/cache/storage/embedding_store.py new file mode 100644 index 000000000..48835a12b --- /dev/null +++ b/surfsense_backend/app/indexing_pipeline/cache/storage/embedding_store.py @@ -0,0 +1,39 @@ +"""Read and write cached embedding blobs through the shared cache backend. + +The blob backend is shared with the ETL parse cache (same bucket / root), so +markdown and its embeddings live side by side; only the object prefix differs. +""" + +from __future__ import annotations + +from app.etl_pipeline.cache.storage.backend import resolve_cache_backend +from app.indexing_pipeline.cache.serialization import deserialize, serialize +from app.indexing_pipeline.cache.schemas import EmbeddingKey, EmbeddingSet +from app.indexing_pipeline.cache.storage.object_keys import build_embedding_object_key + +_EMBEDDING_CONTENT_TYPE = "application/octet-stream" + + +class EmbeddingCacheStore: + def __init__(self) -> None: + self._backend = resolve_cache_backend() + + @property + def backend_name(self) -> str: + return self._backend.backend_name + + async def save(self, key: EmbeddingKey, embedding_set: EmbeddingSet) -> tuple[str, int]: + """Persist the embedding set and return its storage key and byte size.""" + blob = serialize(embedding_set) + storage_key = build_embedding_object_key(key) + await self._backend.put( + storage_key, blob, content_type=_EMBEDDING_CONTENT_TYPE + ) + return storage_key, len(blob) + + async def load(self, storage_key: str) -> EmbeddingSet: + chunks = [chunk async for chunk in self._backend.open_stream(storage_key)] + return deserialize(b"".join(chunks)) + + async def delete(self, storage_key: str) -> None: + await self._backend.delete(storage_key) diff --git a/surfsense_backend/app/indexing_pipeline/cache/storage/object_keys.py b/surfsense_backend/app/indexing_pipeline/cache/storage/object_keys.py new file mode 100644 index 000000000..90e0b8957 --- /dev/null +++ b/surfsense_backend/app/indexing_pipeline/cache/storage/object_keys.py @@ -0,0 +1,12 @@ +"""Object keys for cached embedding sets, namespaced under a dedicated prefix.""" + +from __future__ import annotations + +from app.indexing_pipeline.cache.schemas import EmbeddingKey + +CACHE_PREFIX = "index_cache" + + +def build_embedding_object_key(key: EmbeddingKey) -> str: + # Content-addressed: identical markdown + recipe always map to the same key. + return f"{CACHE_PREFIX}/{key.markdown_sha256}/{key.object_suffix}" From daccd304ee8bce407495964ecb0ea070f1c22716 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 16:48:10 +0200 Subject: [PATCH 37/52] feat(index-cache): add settings, eligibility, and config flags --- surfsense_backend/.env.example | 14 +++++++++ surfsense_backend/app/config/__init__.py | 11 +++++++ .../indexing_pipeline/cache/eligibility.py | 21 +++++++++++++ .../app/indexing_pipeline/cache/settings.py | 30 +++++++++++++++++++ 4 files changed, 76 insertions(+) create mode 100644 surfsense_backend/app/indexing_pipeline/cache/eligibility.py create mode 100644 surfsense_backend/app/indexing_pipeline/cache/settings.py diff --git a/surfsense_backend/.env.example b/surfsense_backend/.env.example index 1924756ce..03b8d9255 100644 --- a/surfsense_backend/.env.example +++ b/surfsense_backend/.env.example @@ -328,6 +328,20 @@ ETL_CACHE_ENABLED=false # ETL_CACHE_STORAGE_CONTAINER=surfsense-etl-cache # ETL_CACHE_STORAGE_LOCAL_PATH=/var/lib/surfsense/etl-cache +# Index Cache +# Reuse chunk+embedding output for identical markdown across workspaces (skips +# re-chunking and re-embedding). Blobs share the ETL_CACHE_STORAGE_* backend. +# Off by default. +INDEX_CACHE_ENABLED=false +# Bump to invalidate all cached embedding sets after a chunker change. +# INDEX_CACHE_CHUNKER_VERSION=1 +# Prune entries unused for this many days. +# INDEX_CACHE_TTL_DAYS=90 +# Soft cap on total cached embeddings; coldest entries are evicted past it. +# INDEX_CACHE_MAX_TOTAL_MB=5120 +# Rows deleted per eviction pass. +# INDEX_CACHE_EVICTION_BATCH=500 + # Daytona Sandbox (isolated code execution) # DAYTONA_SANDBOX_ENABLED=FALSE # DAYTONA_API_KEY=your-daytona-api-key diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index 525fe160d..0b6c05c39 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -964,6 +964,17 @@ class Config: ETL_CACHE_STORAGE_CONTAINER = os.getenv("ETL_CACHE_STORAGE_CONTAINER") ETL_CACHE_STORAGE_LOCAL_PATH = os.getenv("ETL_CACHE_STORAGE_LOCAL_PATH") + # Index cache: reuse chunk+embedding output for identical markdown across + # workspaces. Blobs share the ETL_CACHE_STORAGE_* backend. + INDEX_CACHE_ENABLED = ( + os.getenv("INDEX_CACHE_ENABLED", "false").strip().lower() == "true" + ) + # Bump to invalidate every cached embedding set after a chunker change. + INDEX_CACHE_CHUNKER_VERSION = int(os.getenv("INDEX_CACHE_CHUNKER_VERSION", "1")) + INDEX_CACHE_TTL_DAYS = int(os.getenv("INDEX_CACHE_TTL_DAYS", "90")) + INDEX_CACHE_MAX_TOTAL_MB = int(os.getenv("INDEX_CACHE_MAX_TOTAL_MB", "5120")) + INDEX_CACHE_EVICTION_BATCH = int(os.getenv("INDEX_CACHE_EVICTION_BATCH", "500")) + # Proxy provider selection. Maps to a ProxyProvider implementation registered # in app/utils/proxy/registry.py. Add new vendors there and switch via this var. PROXY_PROVIDER = os.getenv("PROXY_PROVIDER", "anonymous_proxies") diff --git a/surfsense_backend/app/indexing_pipeline/cache/eligibility.py b/surfsense_backend/app/indexing_pipeline/cache/eligibility.py new file mode 100644 index 000000000..7dbf79202 --- /dev/null +++ b/surfsense_backend/app/indexing_pipeline/cache/eligibility.py @@ -0,0 +1,21 @@ +"""Gating rule: may this document be served from / written to the index cache?""" + +from __future__ import annotations + + +def is_index_cacheable( + *, + cache_enabled: bool, + embedding_model: str | None, + embedding_dim: int | None, +) -> bool: + """Cache only when a concrete embedding model and dimension are configured. + + Without a model there is nothing to key against, and without a dimension the + blob's integrity guard cannot run -- both bypass the cache. + """ + if not cache_enabled: + return False + if not embedding_model: + return False + return bool(embedding_dim) diff --git a/surfsense_backend/app/indexing_pipeline/cache/settings.py b/surfsense_backend/app/indexing_pipeline/cache/settings.py new file mode 100644 index 000000000..2991c0980 --- /dev/null +++ b/surfsense_backend/app/indexing_pipeline/cache/settings.py @@ -0,0 +1,30 @@ +"""Index-cache configuration resolved from the central ``Config``. + +The blob backend is intentionally not configured here: it is shared with the ETL +parse cache (see ``ETL_CACHE_STORAGE_*``). +""" + +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class IndexCacheSettings: + enabled: bool + chunker_version: int + ttl_days: int + max_total_bytes: int + eviction_batch: int + + +def load_index_cache_settings() -> IndexCacheSettings: + from app.config import config + + return IndexCacheSettings( + enabled=config.INDEX_CACHE_ENABLED, + chunker_version=config.INDEX_CACHE_CHUNKER_VERSION, + ttl_days=config.INDEX_CACHE_TTL_DAYS, + max_total_bytes=config.INDEX_CACHE_MAX_TOTAL_MB * 1024 * 1024, + eviction_batch=config.INDEX_CACHE_EVICTION_BATCH, + ) From 4d6378e031ff0de284867ebe3cbf36d8f6028099 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 16:48:10 +0200 Subject: [PATCH 38/52] feat(observability): add index cache hit/miss and eviction metrics --- .../app/observability/metrics.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/surfsense_backend/app/observability/metrics.py b/surfsense_backend/app/observability/metrics.py index 4751278a4..61b380722 100644 --- a/surfsense_backend/app/observability/metrics.py +++ b/surfsense_backend/app/observability/metrics.py @@ -305,6 +305,22 @@ def _etl_cache_evictions(): ) +@lru_cache(maxsize=1) +def _index_cache_lookups(): + return _get_meter().create_counter( + "surfsense.index.cache.lookups", + description="Count of index (chunk+embedding) cache lookups by outcome (hit/miss).", + ) + + +@lru_cache(maxsize=1) +def _index_cache_evictions(): + return _get_meter().create_counter( + "surfsense.index.cache.evictions", + description="Count of index cache entries evicted, by phase.", + ) + + @lru_cache(maxsize=1) def _celery_heartbeat_refreshes(): return _get_meter().create_counter( @@ -708,6 +724,28 @@ def record_etl_cache_eviction(count: int, *, phase: str) -> None: _add(_etl_cache_evictions(), count, {"phase": phase}) +def record_index_cache_lookup( + *, embedding_model: str | None, chunker_kind: str | None, outcome: str +) -> None: + """Record an index-cache lookup. ``outcome`` is ``hit`` or ``miss``.""" + _add( + _index_cache_lookups(), + 1, + { + "embedding.model": embedding_model or "unknown", + "chunker.kind": chunker_kind or "unknown", + "outcome": outcome, + }, + ) + + +def record_index_cache_eviction(count: int, *, phase: str) -> None: + """Record evicted entries. ``phase`` is ``ttl`` or ``size``.""" + if count <= 0: + return + _add(_index_cache_evictions(), count, {"phase": phase}) + + def record_celery_heartbeat_refresh(*, heartbeat_type: str) -> None: _add(_celery_heartbeat_refreshes(), 1, {"heartbeat.type": heartbeat_type}) @@ -908,6 +946,8 @@ __all__ = [ "record_etl_cache_lookup", "record_etl_extract_duration", "record_etl_extract_outcome", + "record_index_cache_eviction", + "record_index_cache_lookup", "record_indexing_document_duration", "record_indexing_document_outcome", "record_interrupt", From e8938c119bdda6f44ac78d48011e004a5ce02294 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 16:48:10 +0200 Subject: [PATCH 39/52] feat(index-cache): add recall/remember service --- .../app/indexing_pipeline/cache/service.py | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 surfsense_backend/app/indexing_pipeline/cache/service.py diff --git a/surfsense_backend/app/indexing_pipeline/cache/service.py b/surfsense_backend/app/indexing_pipeline/cache/service.py new file mode 100644 index 000000000..942ba7d51 --- /dev/null +++ b/surfsense_backend/app/indexing_pipeline/cache/service.py @@ -0,0 +1,51 @@ +"""Recall and remember embedding sets, coordinating the index and blob store.""" + +from __future__ import annotations + +import logging + +from sqlalchemy.ext.asyncio import AsyncSession + +from app.indexing_pipeline.cache.persistence import CachedEmbeddingSetRepository +from app.indexing_pipeline.cache.schemas import EmbeddingKey, EmbeddingSet +from app.indexing_pipeline.cache.storage import EmbeddingCacheStore + +logger = logging.getLogger(__name__) + + +class IndexCacheService: + def __init__(self, session: AsyncSession) -> None: + self._index = CachedEmbeddingSetRepository(session) + self._store = EmbeddingCacheStore() + + async def recall(self, key: EmbeddingKey) -> EmbeddingSet | None: + """Return the cached embedding set, or None on a miss.""" + row = await self._index.get(key) + if row is None: + return None + + try: + embedding_set = await self._store.load(row.storage_key) + except Exception: + # Index points at a blob that is gone; treat as a miss and re-embed. + logger.warning("Cache blob missing: %s", row.storage_key, exc_info=True) + return None + + if int(embedding_set.summary_embedding.shape[0]) != key.embedding_dim: + # A model swapped its dimension under a reused name; never serve it. + logger.warning("Cached embedding dimension mismatch: %s", row.storage_key) + return None + + await self._index.mark_used(row.id) + return embedding_set + + async def remember(self, key: EmbeddingKey, embedding_set: EmbeddingSet) -> None: + """Store a freshly embedded set for future reuse.""" + storage_key, size_bytes = await self._store.save(key, embedding_set) + await self._index.insert( + key=key, + storage_backend=self._store.backend_name, + storage_key=storage_key, + size_bytes=size_bytes, + chunk_count=embedding_set.chunk_count, + ) From 019aa7bf76dd1dd68e3f54321dbf21fcd9afe43a Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 16:48:18 +0200 Subject: [PATCH 40/52] feat(index-cache): serve chunk embeddings from cache during indexing --- .../app/indexing_pipeline/cache/__init__.py | 11 ++ .../cache/cached_indexing.py | 121 ++++++++++++++++++ .../indexing_pipeline_service.py | 27 +--- 3 files changed, 138 insertions(+), 21 deletions(-) create mode 100644 surfsense_backend/app/indexing_pipeline/cache/__init__.py create mode 100644 surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py diff --git a/surfsense_backend/app/indexing_pipeline/cache/__init__.py b/surfsense_backend/app/indexing_pipeline/cache/__init__.py new file mode 100644 index 000000000..190b45f84 --- /dev/null +++ b/surfsense_backend/app/indexing_pipeline/cache/__init__.py @@ -0,0 +1,11 @@ +"""Content-addressed reuse of chunk+embedding output across workspaces.""" + +from __future__ import annotations + +from app.indexing_pipeline.cache.cached_indexing import build_chunk_embeddings +from app.indexing_pipeline.cache.service import IndexCacheService + +__all__ = [ + "IndexCacheService", + "build_chunk_embeddings", +] diff --git a/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py b/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py new file mode 100644 index 000000000..db07998a4 --- /dev/null +++ b/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py @@ -0,0 +1,121 @@ +"""Entry point: serve chunk embeddings from cache, embedding only on a miss. + +Embeddings are a pure function of the markdown, the embedding model, and the +chunker -- so identical markdown is chunked and embedded once and reused across +workspaces, even when it came from different sources. +""" + +from __future__ import annotations + +import asyncio +import hashlib +import logging + +import numpy as np + +from app.config import config +from app.indexing_pipeline.cache.eligibility import is_index_cacheable +from app.indexing_pipeline.cache.schemas import CachedChunk, EmbeddingKey, EmbeddingSet +from app.indexing_pipeline.cache.service import IndexCacheService +from app.indexing_pipeline.cache.settings import load_index_cache_settings +from app.indexing_pipeline.document_chunker import chunk_text, chunk_text_hybrid +from app.indexing_pipeline.document_embedder import embed_texts +from app.observability import metrics + +logger = logging.getLogger(__name__) + +ChunkPair = tuple[str, np.ndarray] + + +async def build_chunk_embeddings( + markdown: str, *, use_code_chunker: bool +) -> tuple[np.ndarray, list[ChunkPair]]: + """Return the document-level vector and ordered ``(chunk_text, vector)`` pairs. + + Drop-in for the inline chunk+embed step; reuses prior output when the same + markdown has already been embedded with the current model and chunker. + """ + settings = load_index_cache_settings() + chunker_kind = "code" if use_code_chunker else "hybrid" + embedding_dim = getattr(config.embedding_model_instance, "dimension", None) + + cacheable = is_index_cacheable( + cache_enabled=settings.enabled, + embedding_model=config.EMBEDDING_MODEL, + embedding_dim=embedding_dim, + ) + if not cacheable: + return await _compute(markdown, use_code_chunker=use_code_chunker) + + key = EmbeddingKey( + markdown_sha256=_hash_text(markdown), + embedding_model=config.EMBEDDING_MODEL, + embedding_dim=int(embedding_dim), + chunker_kind=chunker_kind, + chunker_version=settings.chunker_version, + ) + + cached = await _recall(key) + if cached is not None: + metrics.record_index_cache_lookup( + embedding_model=key.embedding_model, chunker_kind=chunker_kind, outcome="hit" + ) + logger.debug("Index cache hit for %s", key.markdown_sha256) + return cached.summary_embedding, [(c.text, c.embedding) for c in cached.chunks] + + metrics.record_index_cache_lookup( + embedding_model=key.embedding_model, chunker_kind=chunker_kind, outcome="miss" + ) + summary_embedding, chunk_pairs = await _compute( + markdown, use_code_chunker=use_code_chunker + ) + await _remember(key, summary_embedding, chunk_pairs) + return summary_embedding, chunk_pairs + + +async def _compute( + markdown: str, *, use_code_chunker: bool +) -> tuple[np.ndarray, list[ChunkPair]]: + if use_code_chunker: + chunk_texts = await asyncio.to_thread( + chunk_text, markdown, use_code_chunker=True + ) + else: + # Table-aware hybrid chunker keeps Markdown tables intact (issue #1334). + chunk_texts = await asyncio.to_thread(chunk_text_hybrid, markdown) + + embeddings = await asyncio.to_thread(embed_texts, [markdown, *chunk_texts]) + summary_embedding, *chunk_embeddings = embeddings + return summary_embedding, list(zip(chunk_texts, chunk_embeddings, strict=False)) + + +async def _recall(key: EmbeddingKey) -> EmbeddingSet | None: + # Caching is best-effort: any failure falls through to a normal embed. + try: + from app.tasks.celery_tasks import get_celery_session_maker + + async with get_celery_session_maker()() as session: + return await IndexCacheService(session).recall(key) + except Exception: + logger.warning("Index cache recall failed; embedding fresh", exc_info=True) + return None + + +async def _remember( + key: EmbeddingKey, summary_embedding: np.ndarray, chunk_pairs: list[ChunkPair] +) -> None: + try: + from app.tasks.celery_tasks import get_celery_session_maker + + embedding_set = EmbeddingSet( + summary_embedding=summary_embedding, + chunks=[CachedChunk(text=text, embedding=vec) for text, vec in chunk_pairs], + ) + async with get_celery_session_maker()() as session: + await IndexCacheService(session).remember(key, embedding_set) + except Exception: + logger.warning("Index cache write failed; result not cached", exc_info=True) + + +def _hash_text(text: str) -> str: + return hashlib.sha256(text.encode("utf-8")).hexdigest() diff --git a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py index 67a6778e0..271b3ee03 100644 --- a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py +++ b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py @@ -19,9 +19,8 @@ from app.db import ( DocumentStatus, DocumentType, ) +from app.indexing_pipeline.cache import build_chunk_embeddings from app.indexing_pipeline.connector_document import ConnectorDocument -from app.indexing_pipeline.document_chunker import chunk_text, chunk_text_hybrid -from app.indexing_pipeline.document_embedder import embed_texts from app.indexing_pipeline.document_hashing import ( compute_content_hash, compute_identifier_hash, @@ -385,27 +384,13 @@ class IndexingPipelineService: ) t_step = time.perf_counter() - if connector_doc.should_use_code_chunker: - chunk_texts = await asyncio.to_thread( - chunk_text, - connector_doc.source_markdown, - use_code_chunker=True, - ) - else: - # Use the table-aware hybrid chunker so Markdown tables are not - # split mid-row (see issue #1334). - chunk_texts = await asyncio.to_thread( - chunk_text_hybrid, - connector_doc.source_markdown, - ) - - texts_to_embed = [content, *chunk_texts] - embeddings = await asyncio.to_thread(embed_texts, texts_to_embed) - summary_embedding, *chunk_embeddings = embeddings + summary_embedding, chunk_pairs = await build_chunk_embeddings( + content, + use_code_chunker=connector_doc.should_use_code_chunker, + ) chunks = [ - Chunk(content=text, embedding=emb) - for text, emb in zip(chunk_texts, chunk_embeddings, strict=False) + Chunk(content=text, embedding=emb) for text, emb in chunk_pairs ] perf.info( "[indexing] chunk+embed doc=%d chunks=%d in %.3fs", From 4e4f7f34faaa40102a8201a8e19c335c8b5dcd27 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 16:48:18 +0200 Subject: [PATCH 41/52] feat(index-cache): add TTL/size eviction task and daily schedule --- surfsense_backend/app/celery_app.py | 7 ++ .../cache/eviction/__init__.py | 9 +++ .../indexing_pipeline/cache/eviction/task.py | 68 +++++++++++++++++++ 3 files changed, 84 insertions(+) create mode 100644 surfsense_backend/app/indexing_pipeline/cache/eviction/__init__.py create mode 100644 surfsense_backend/app/indexing_pipeline/cache/eviction/task.py diff --git a/surfsense_backend/app/celery_app.py b/surfsense_backend/app/celery_app.py index 413522189..38fb12a32 100644 --- a/surfsense_backend/app/celery_app.py +++ b/surfsense_backend/app/celery_app.py @@ -193,6 +193,7 @@ celery_app = Celery( "app.tasks.celery_tasks.auto_reload_task", "app.tasks.celery_tasks.gateway_tasks", "app.etl_pipeline.cache.eviction.task", + "app.indexing_pipeline.cache.eviction.task", "app.automations.tasks.execute_run", "app.automations.triggers.builtin.schedule.selector", "app.automations.triggers.builtin.event.selector", @@ -313,6 +314,12 @@ celery_app.conf.beat_schedule = { "schedule": crontab(hour="4", minute="0"), "options": {"expires": 600}, }, + # Prune the index cache (chunk+embedding sets) once daily, off-peak. + "evict-index-cache": { + "task": "evict_index_cache", + "schedule": crontab(hour="4", minute="30"), + "options": {"expires": 600}, + }, # Fire due automation schedule triggers (Beat entry owned by the schedule # trigger; see app.automations.triggers.builtin.schedule.source). **SCHEDULE_BEAT_SCHEDULE, diff --git a/surfsense_backend/app/indexing_pipeline/cache/eviction/__init__.py b/surfsense_backend/app/indexing_pipeline/cache/eviction/__init__.py new file mode 100644 index 000000000..de4df784e --- /dev/null +++ b/surfsense_backend/app/indexing_pipeline/cache/eviction/__init__.py @@ -0,0 +1,9 @@ +"""Background pruning of the index cache by age and size budget.""" + +from __future__ import annotations + +from .task import evict_index_cache_task + +__all__ = [ + "evict_index_cache_task", +] diff --git a/surfsense_backend/app/indexing_pipeline/cache/eviction/task.py b/surfsense_backend/app/indexing_pipeline/cache/eviction/task.py new file mode 100644 index 000000000..ab6885bca --- /dev/null +++ b/surfsense_backend/app/indexing_pipeline/cache/eviction/task.py @@ -0,0 +1,68 @@ +"""Celery task that prunes the index cache by TTL, then by size budget.""" + +from __future__ import annotations + +import contextlib +import logging +from datetime import UTC, datetime, timedelta + +from app.celery_app import celery_app +from app.etl_pipeline.cache.eviction.policy import select_over_budget +from app.etl_pipeline.cache.schemas import EvictionCandidate +from app.indexing_pipeline.cache.persistence import CachedEmbeddingSetRepository +from app.indexing_pipeline.cache.settings import load_index_cache_settings +from app.indexing_pipeline.cache.storage import EmbeddingCacheStore +from app.observability import metrics +from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task + +logger = logging.getLogger(__name__) + + +@celery_app.task(name="evict_index_cache") +def evict_index_cache_task(): + return run_async_celery_task(_evict) + + +async def _evict() -> None: + """Expire stale entries, then shed the coldest overflow only if still over budget.""" + settings = load_index_cache_settings() + if not settings.enabled: + return + + store = EmbeddingCacheStore() + async with get_celery_session_maker()() as session: + index = CachedEmbeddingSetRepository(session) + + cutoff = datetime.now(UTC) - timedelta(days=settings.ttl_days) + expired = await index.select_expired( + cutoff=cutoff, limit=settings.eviction_batch + ) + await _drop(index, store, expired, phase="ttl") + + total = await index.total_size_bytes() + if total > settings.max_total_bytes: + coldest = await index.select_coldest(limit=settings.eviction_batch) + over_budget = select_over_budget( + coldest, + current_total_bytes=total, + max_total_bytes=settings.max_total_bytes, + ) + await _drop(index, store, over_budget, phase="size") + + +async def _drop( + index: CachedEmbeddingSetRepository, + store: EmbeddingCacheStore, + candidates: list[EvictionCandidate], + *, + phase: str, +) -> None: + if not candidates: + return + for candidate in candidates: + # Drop the index row even if the blob delete fails (orphan blob is harmless). + with contextlib.suppress(Exception): + await store.delete(candidate.storage_key) + await index.delete_by_ids([candidate.id for candidate in candidates]) + metrics.record_index_cache_eviction(len(candidates), phase=phase) + logger.info("Evicted %d cached embedding sets (%s)", len(candidates), phase) From 8cf578d965657bbf5e7f9e2f1513ed19d8b8622c Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 16:48:18 +0200 Subject: [PATCH 42/52] test(index-cache): add unit tests and repoint embed/chunk patch targets --- .../tests/e2e/fakes/embeddings.py | 4 +- .../tests/integration/conftest.py | 8 +-- .../integration/document_upload/conftest.py | 4 +- .../adapters/test_file_upload_adapter.py | 2 +- .../unit/indexing_pipeline/cache/conftest.py | 28 ++++++++++ .../cache/test_eligibility.py | 28 ++++++++++ .../cache/test_embedding_key.py | 31 +++++++++++ .../cache/test_serialization.py | 52 +++++++++++++++++++ .../test_index_batch_parallel.py | 10 ++-- 9 files changed, 153 insertions(+), 14 deletions(-) create mode 100644 surfsense_backend/tests/unit/indexing_pipeline/cache/conftest.py create mode 100644 surfsense_backend/tests/unit/indexing_pipeline/cache/test_eligibility.py create mode 100644 surfsense_backend/tests/unit/indexing_pipeline/cache/test_embedding_key.py create mode 100644 surfsense_backend/tests/unit/indexing_pipeline/cache/test_serialization.py diff --git a/surfsense_backend/tests/e2e/fakes/embeddings.py b/surfsense_backend/tests/e2e/fakes/embeddings.py index ab9e24df9..9a01fb84b 100644 --- a/surfsense_backend/tests/e2e/fakes/embeddings.py +++ b/surfsense_backend/tests/e2e/fakes/embeddings.py @@ -57,9 +57,9 @@ def install(patches: list[Any]) -> None: # Consumers that did `from app.utils.document_converters import embed_text/texts` ("app.indexing_pipeline.document_embedder.embed_text", fake_embed_text), ("app.indexing_pipeline.document_embedder.embed_texts", fake_embed_texts), - # Pipeline service binding (the actual call site for indexing.index) + # Index-cache facade binding (the actual call site for indexing.index) ( - "app.indexing_pipeline.indexing_pipeline_service.embed_texts", + "app.indexing_pipeline.cache.cached_indexing.embed_texts", fake_embed_texts, ), ] diff --git a/surfsense_backend/tests/integration/conftest.py b/surfsense_backend/tests/integration/conftest.py index 19f8e3d0a..8457047ec 100644 --- a/surfsense_backend/tests/integration/conftest.py +++ b/surfsense_backend/tests/integration/conftest.py @@ -127,7 +127,7 @@ async def db_search_space(db_session: AsyncSession, db_user: User) -> SearchSpac def patched_embed_texts(monkeypatch) -> MagicMock: mock = MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]) monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.embed_texts", + "app.indexing_pipeline.cache.cached_indexing.embed_texts", mock, ) return mock @@ -137,7 +137,7 @@ def patched_embed_texts(monkeypatch) -> MagicMock: def patched_embed_texts_raises(monkeypatch) -> MagicMock: mock = MagicMock(side_effect=RuntimeError("Embedding unavailable")) monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.embed_texts", + "app.indexing_pipeline.cache.cached_indexing.embed_texts", mock, ) return mock @@ -147,11 +147,11 @@ def patched_embed_texts_raises(monkeypatch) -> MagicMock: def patched_chunk_text(monkeypatch) -> MagicMock: mock = MagicMock(return_value=["Test chunk content."]) monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.chunk_text", + "app.indexing_pipeline.cache.cached_indexing.chunk_text", mock, ) monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid", + "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", mock, ) return mock diff --git a/surfsense_backend/tests/integration/document_upload/conftest.py b/surfsense_backend/tests/integration/document_upload/conftest.py index 812140be3..bd889360f 100644 --- a/surfsense_backend/tests/integration/document_upload/conftest.py +++ b/surfsense_backend/tests/integration/document_upload/conftest.py @@ -283,11 +283,11 @@ async def credits(): def _mock_external_apis(monkeypatch): """Mock LLM, embedding, and chunking — these are external API boundaries.""" monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.embed_texts", + "app.indexing_pipeline.cache.cached_indexing.embed_texts", MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]), ) monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.chunk_text", + "app.indexing_pipeline.cache.cached_indexing.chunk_text", MagicMock(return_value=["Test chunk content."]), ) diff --git a/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py b/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py index 311716052..814129c8d 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py @@ -177,7 +177,7 @@ async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, m async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, mocker): """Reindexing replaces old chunks with new content rather than appending.""" mocker.patch( - "app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid", + "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", side_effect=[["Original chunk."], ["Updated chunk."]], ) diff --git a/surfsense_backend/tests/unit/indexing_pipeline/cache/conftest.py b/surfsense_backend/tests/unit/indexing_pipeline/cache/conftest.py new file mode 100644 index 000000000..081dddaa7 --- /dev/null +++ b/surfsense_backend/tests/unit/indexing_pipeline/cache/conftest.py @@ -0,0 +1,28 @@ +"""Stub the cache package __init__s so unit tests import only pure leaf modules. + +The real ``cache``/``storage``/``eviction``/``persistence`` __init__s eagerly +import the facade, file storage, Celery, and ``app.db`` -- none of which a pure +unit test should need. Turning those packages into bare namespace packages lets +``from app.indexing_pipeline.cache. import ...`` resolve the leaf module +without running the heavy __init__. ``schemas`` is left real (it is pure). +""" + +import sys +import types +from pathlib import Path + +_CACHE_DIR = Path(__file__).resolve().parents[4] / "app" / "indexing_pipeline" / "cache" + + +def _stub_namespace_package(dotted: str, fs_dir: Path) -> None: + if dotted in sys.modules: + return + module = types.ModuleType(dotted) + module.__path__ = [str(fs_dir)] + module.__package__ = dotted + sys.modules[dotted] = module + + +_stub_namespace_package("app.indexing_pipeline.cache", _CACHE_DIR) +_stub_namespace_package("app.indexing_pipeline.cache.storage", _CACHE_DIR / "storage") +_stub_namespace_package("app.indexing_pipeline.cache.eviction", _CACHE_DIR / "eviction") diff --git a/surfsense_backend/tests/unit/indexing_pipeline/cache/test_eligibility.py b/surfsense_backend/tests/unit/indexing_pipeline/cache/test_eligibility.py new file mode 100644 index 000000000..780a6c536 --- /dev/null +++ b/surfsense_backend/tests/unit/indexing_pipeline/cache/test_eligibility.py @@ -0,0 +1,28 @@ +from app.indexing_pipeline.cache.eligibility import is_index_cacheable + + +def test_disabled_cache_is_never_cacheable(): + assert not is_index_cacheable( + cache_enabled=False, embedding_model="m", embedding_dim=384 + ) + + +def test_missing_model_is_not_cacheable(): + assert not is_index_cacheable( + cache_enabled=True, embedding_model=None, embedding_dim=384 + ) + + +def test_missing_dimension_is_not_cacheable(): + assert not is_index_cacheable( + cache_enabled=True, embedding_model="m", embedding_dim=None + ) + assert not is_index_cacheable( + cache_enabled=True, embedding_model="m", embedding_dim=0 + ) + + +def test_enabled_with_model_and_dim_is_cacheable(): + assert is_index_cacheable( + cache_enabled=True, embedding_model="m", embedding_dim=384 + ) diff --git a/surfsense_backend/tests/unit/indexing_pipeline/cache/test_embedding_key.py b/surfsense_backend/tests/unit/indexing_pipeline/cache/test_embedding_key.py new file mode 100644 index 000000000..ce9c8672d --- /dev/null +++ b/surfsense_backend/tests/unit/indexing_pipeline/cache/test_embedding_key.py @@ -0,0 +1,31 @@ +from app.indexing_pipeline.cache.schemas import EmbeddingKey + + +def _key(**overrides) -> EmbeddingKey: + base = { + "markdown_sha256": "a" * 64, + "embedding_model": "openai://text-embedding-3-small", + "embedding_dim": 1536, + "chunker_kind": "hybrid", + "chunker_version": 1, + } + base.update(overrides) + return EmbeddingKey(**base) + + +def test_object_suffix_is_stable(): + assert _key().object_suffix == _key().object_suffix + + +def test_object_suffix_differs_by_model(): + assert _key().object_suffix != _key(embedding_model="local/minilm").object_suffix + + +def test_object_suffix_differs_by_chunker_kind_and_version(): + assert _key().object_suffix != _key(chunker_kind="code").object_suffix + assert _key().object_suffix != _key(chunker_version=2).object_suffix + + +def test_object_suffix_encodes_kind_and_version(): + suffix = _key(chunker_kind="code", chunker_version=3).object_suffix + assert suffix.endswith(".code.v3.emb") diff --git a/surfsense_backend/tests/unit/indexing_pipeline/cache/test_serialization.py b/surfsense_backend/tests/unit/indexing_pipeline/cache/test_serialization.py new file mode 100644 index 000000000..8db87bf1b --- /dev/null +++ b/surfsense_backend/tests/unit/indexing_pipeline/cache/test_serialization.py @@ -0,0 +1,52 @@ +import numpy as np +import pytest + +from app.indexing_pipeline.cache.schemas import CachedChunk, EmbeddingSet +from app.indexing_pipeline.cache.serialization import deserialize, serialize + + +def _make_set(dim: int, n_chunks: int) -> EmbeddingSet: + rng = np.random.default_rng(0) + return EmbeddingSet( + summary_embedding=rng.random(dim, dtype=np.float64), + chunks=[ + CachedChunk(text=f"chunk {i}\nwith newline", embedding=rng.random(dim)) + for i in range(n_chunks) + ], + ) + + +def test_round_trip_preserves_texts_and_vectors(): + original = _make_set(dim=8, n_chunks=3) + + restored = deserialize(serialize(original)) + + assert [c.text for c in restored.chunks] == [c.text for c in original.chunks] + assert restored.chunk_count == 3 + assert np.allclose(restored.summary_embedding, original.summary_embedding, atol=1e-6) + for got, want in zip(restored.chunks, original.chunks, strict=True): + assert np.allclose(got.embedding, want.embedding, atol=1e-6) + + +def test_round_trip_with_no_chunks(): + original = _make_set(dim=4, n_chunks=0) + + restored = deserialize(serialize(original)) + + assert restored.chunk_count == 0 + assert restored.summary_embedding.shape[0] == 4 + + +def test_serialize_rejects_mismatched_dimensions(): + bad = EmbeddingSet( + summary_embedding=np.zeros(4, dtype=np.float32), + chunks=[CachedChunk(text="x", embedding=np.zeros(8, dtype=np.float32))], + ) + + with pytest.raises(ValueError): + serialize(bad) + + +def test_deserialize_rejects_foreign_blob(): + with pytest.raises(ValueError): + deserialize(b"not-a-surfsense-blob") diff --git a/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py b/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py index 3a1b77d90..252310061 100644 --- a/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py +++ b/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py @@ -54,7 +54,7 @@ async def test_index_calls_embed_and_chunk_via_to_thread( mock_chunk_hybrid = MagicMock(return_value=["chunk1"]) mock_chunk_hybrid.__name__ = "chunk_text_hybrid" monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid", + "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", mock_chunk_hybrid, ) mock_embed = MagicMock( @@ -62,7 +62,7 @@ async def test_index_calls_embed_and_chunk_via_to_thread( ) mock_embed.__name__ = "embed_texts" monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.embed_texts", + "app.indexing_pipeline.cache.cached_indexing.embed_texts", mock_embed, ) # Bypass set_committed_value, which requires a real ORM instance (not MagicMock). @@ -102,17 +102,17 @@ async def test_non_code_documents_use_hybrid_chunker( mock_chunk_hybrid = MagicMock(return_value=["chunk1"]) mock_chunk_hybrid.__name__ = "chunk_text_hybrid" monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid", + "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", mock_chunk_hybrid, ) mock_chunk_code = MagicMock(return_value=["chunk1"]) mock_chunk_code.__name__ = "chunk_text" monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.chunk_text", + "app.indexing_pipeline.cache.cached_indexing.chunk_text", mock_chunk_code, ) monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.embed_texts", + "app.indexing_pipeline.cache.cached_indexing.embed_texts", MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]), ) monkeypatch.setattr( From 91d947ff79f6952d3d0c40217e118d10e09bfc8d Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 17:00:01 +0200 Subject: [PATCH 43/52] refactor(embedding-cache): rename index cache to embedding cache The cached payload is the indexing pipeline's embeddings (markdown is chunked then embedded), so "embedding cache" names the expensive output directly and removes the "index" ambiguity (DB index vs vector index vs indexing phase). Renames the service, settings, eligibility, eviction task, metrics, config flags (INDEX_CACHE_* -> EMBEDDING_CACHE_*), object prefix, and the table (index_cache_embedding_sets -> embedding_cache_sets) with its constraint and indexes. Migration 161 renamed accordingly. --- surfsense_backend/.env.example | 12 ++++----- ...ets.py => 161_add_embedding_cache_sets.py} | 20 +++++++------- surfsense_backend/app/celery_app.py | 6 ++--- surfsense_backend/app/config/__init__.py | 18 ++++++++----- .../app/indexing_pipeline/cache/__init__.py | 4 +-- .../cache/cached_indexing.py | 24 ++++++++--------- .../indexing_pipeline/cache/eligibility.py | 4 +-- .../cache/eviction/__init__.py | 6 ++--- .../indexing_pipeline/cache/eviction/task.py | 12 ++++----- .../cache/persistence/models.py | 8 +++--- .../cache/persistence/repository.py | 4 +-- .../cache/schemas/__init__.py | 2 +- .../app/indexing_pipeline/cache/service.py | 2 +- .../app/indexing_pipeline/cache/settings.py | 18 ++++++------- .../cache/storage/embedding_store.py | 2 +- .../cache/storage/object_keys.py | 2 +- .../app/observability/metrics.py | 26 +++++++++---------- .../cache/test_eligibility.py | 12 ++++----- 18 files changed, 93 insertions(+), 89 deletions(-) rename surfsense_backend/alembic/versions/{161_add_index_cache_embedding_sets.py => 161_add_embedding_cache_sets.py} (61%) diff --git a/surfsense_backend/.env.example b/surfsense_backend/.env.example index 03b8d9255..ac289c5a6 100644 --- a/surfsense_backend/.env.example +++ b/surfsense_backend/.env.example @@ -328,19 +328,19 @@ ETL_CACHE_ENABLED=false # ETL_CACHE_STORAGE_CONTAINER=surfsense-etl-cache # ETL_CACHE_STORAGE_LOCAL_PATH=/var/lib/surfsense/etl-cache -# Index Cache +# Embedding Cache # Reuse chunk+embedding output for identical markdown across workspaces (skips # re-chunking and re-embedding). Blobs share the ETL_CACHE_STORAGE_* backend. # Off by default. -INDEX_CACHE_ENABLED=false +EMBEDDING_CACHE_ENABLED=false # Bump to invalidate all cached embedding sets after a chunker change. -# INDEX_CACHE_CHUNKER_VERSION=1 +# EMBEDDING_CACHE_CHUNKER_VERSION=1 # Prune entries unused for this many days. -# INDEX_CACHE_TTL_DAYS=90 +# EMBEDDING_CACHE_TTL_DAYS=90 # Soft cap on total cached embeddings; coldest entries are evicted past it. -# INDEX_CACHE_MAX_TOTAL_MB=5120 +# EMBEDDING_CACHE_MAX_TOTAL_MB=5120 # Rows deleted per eviction pass. -# INDEX_CACHE_EVICTION_BATCH=500 +# EMBEDDING_CACHE_EVICTION_BATCH=500 # Daytona Sandbox (isolated code execution) # DAYTONA_SANDBOX_ENABLED=FALSE diff --git a/surfsense_backend/alembic/versions/161_add_index_cache_embedding_sets.py b/surfsense_backend/alembic/versions/161_add_embedding_cache_sets.py similarity index 61% rename from surfsense_backend/alembic/versions/161_add_index_cache_embedding_sets.py rename to surfsense_backend/alembic/versions/161_add_embedding_cache_sets.py index 8441dcf6e..70fb8b57a 100644 --- a/surfsense_backend/alembic/versions/161_add_index_cache_embedding_sets.py +++ b/surfsense_backend/alembic/versions/161_add_embedding_cache_sets.py @@ -1,4 +1,4 @@ -"""add index_cache_embedding_sets table for content-addressed embedding reuse +"""add embedding_cache_sets table for content-addressed embedding reuse Revision ID: 161 Revises: 160 @@ -17,7 +17,7 @@ depends_on: str | Sequence[str] | None = None def upgrade() -> None: op.execute( """ - CREATE TABLE IF NOT EXISTS index_cache_embedding_sets ( + CREATE TABLE IF NOT EXISTS embedding_cache_sets ( id SERIAL PRIMARY KEY, markdown_sha256 VARCHAR(64) NOT NULL, embedding_model VARCHAR(255) NOT NULL, @@ -31,23 +31,23 @@ def upgrade() -> None: times_reused BIGINT NOT NULL DEFAULT 0, last_used_at TIMESTAMP WITH TIME ZONE NOT NULL, created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), - CONSTRAINT uq_index_cache_embedding_sets_key + CONSTRAINT uq_embedding_cache_sets_key UNIQUE (markdown_sha256, embedding_model, chunker_kind, chunker_version) ); """ ) op.execute( - "CREATE INDEX IF NOT EXISTS ix_index_cache_embedding_sets_last_used_at " - "ON index_cache_embedding_sets(last_used_at);" + "CREATE INDEX IF NOT EXISTS ix_embedding_cache_sets_last_used_at " + "ON embedding_cache_sets(last_used_at);" ) op.execute( - "CREATE INDEX IF NOT EXISTS ix_index_cache_embedding_sets_created_at " - "ON index_cache_embedding_sets(created_at);" + "CREATE INDEX IF NOT EXISTS ix_embedding_cache_sets_created_at " + "ON embedding_cache_sets(created_at);" ) def downgrade() -> None: - op.execute("DROP INDEX IF EXISTS ix_index_cache_embedding_sets_created_at;") - op.execute("DROP INDEX IF EXISTS ix_index_cache_embedding_sets_last_used_at;") - op.execute("DROP TABLE IF EXISTS index_cache_embedding_sets;") + op.execute("DROP INDEX IF EXISTS ix_embedding_cache_sets_created_at;") + op.execute("DROP INDEX IF EXISTS ix_embedding_cache_sets_last_used_at;") + op.execute("DROP TABLE IF EXISTS embedding_cache_sets;") diff --git a/surfsense_backend/app/celery_app.py b/surfsense_backend/app/celery_app.py index 38fb12a32..9704c0312 100644 --- a/surfsense_backend/app/celery_app.py +++ b/surfsense_backend/app/celery_app.py @@ -314,9 +314,9 @@ celery_app.conf.beat_schedule = { "schedule": crontab(hour="4", minute="0"), "options": {"expires": 600}, }, - # Prune the index cache (chunk+embedding sets) once daily, off-peak. - "evict-index-cache": { - "task": "evict_index_cache", + # Prune the embedding cache (chunk+embedding sets) once daily, off-peak. + "evict-embedding-cache": { + "task": "evict_embedding_cache", "schedule": crontab(hour="4", minute="30"), "options": {"expires": 600}, }, diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index 0b6c05c39..549252cec 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -964,16 +964,20 @@ class Config: ETL_CACHE_STORAGE_CONTAINER = os.getenv("ETL_CACHE_STORAGE_CONTAINER") ETL_CACHE_STORAGE_LOCAL_PATH = os.getenv("ETL_CACHE_STORAGE_LOCAL_PATH") - # Index cache: reuse chunk+embedding output for identical markdown across + # Embedding cache: reuse chunk+embedding output for identical markdown across # workspaces. Blobs share the ETL_CACHE_STORAGE_* backend. - INDEX_CACHE_ENABLED = ( - os.getenv("INDEX_CACHE_ENABLED", "false").strip().lower() == "true" + EMBEDDING_CACHE_ENABLED = ( + os.getenv("EMBEDDING_CACHE_ENABLED", "false").strip().lower() == "true" ) # Bump to invalidate every cached embedding set after a chunker change. - INDEX_CACHE_CHUNKER_VERSION = int(os.getenv("INDEX_CACHE_CHUNKER_VERSION", "1")) - INDEX_CACHE_TTL_DAYS = int(os.getenv("INDEX_CACHE_TTL_DAYS", "90")) - INDEX_CACHE_MAX_TOTAL_MB = int(os.getenv("INDEX_CACHE_MAX_TOTAL_MB", "5120")) - INDEX_CACHE_EVICTION_BATCH = int(os.getenv("INDEX_CACHE_EVICTION_BATCH", "500")) + EMBEDDING_CACHE_CHUNKER_VERSION = int( + os.getenv("EMBEDDING_CACHE_CHUNKER_VERSION", "1") + ) + EMBEDDING_CACHE_TTL_DAYS = int(os.getenv("EMBEDDING_CACHE_TTL_DAYS", "90")) + EMBEDDING_CACHE_MAX_TOTAL_MB = int(os.getenv("EMBEDDING_CACHE_MAX_TOTAL_MB", "5120")) + EMBEDDING_CACHE_EVICTION_BATCH = int( + os.getenv("EMBEDDING_CACHE_EVICTION_BATCH", "500") + ) # Proxy provider selection. Maps to a ProxyProvider implementation registered # in app/utils/proxy/registry.py. Add new vendors there and switch via this var. diff --git a/surfsense_backend/app/indexing_pipeline/cache/__init__.py b/surfsense_backend/app/indexing_pipeline/cache/__init__.py index 190b45f84..d3b9e5f0d 100644 --- a/surfsense_backend/app/indexing_pipeline/cache/__init__.py +++ b/surfsense_backend/app/indexing_pipeline/cache/__init__.py @@ -3,9 +3,9 @@ from __future__ import annotations from app.indexing_pipeline.cache.cached_indexing import build_chunk_embeddings -from app.indexing_pipeline.cache.service import IndexCacheService +from app.indexing_pipeline.cache.service import EmbeddingCacheService __all__ = [ - "IndexCacheService", + "EmbeddingCacheService", "build_chunk_embeddings", ] diff --git a/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py b/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py index db07998a4..c93f2f133 100644 --- a/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py +++ b/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py @@ -14,10 +14,10 @@ import logging import numpy as np from app.config import config -from app.indexing_pipeline.cache.eligibility import is_index_cacheable +from app.indexing_pipeline.cache.eligibility import is_embedding_cacheable from app.indexing_pipeline.cache.schemas import CachedChunk, EmbeddingKey, EmbeddingSet -from app.indexing_pipeline.cache.service import IndexCacheService -from app.indexing_pipeline.cache.settings import load_index_cache_settings +from app.indexing_pipeline.cache.service import EmbeddingCacheService +from app.indexing_pipeline.cache.settings import load_embedding_cache_settings from app.indexing_pipeline.document_chunker import chunk_text, chunk_text_hybrid from app.indexing_pipeline.document_embedder import embed_texts from app.observability import metrics @@ -35,11 +35,11 @@ async def build_chunk_embeddings( Drop-in for the inline chunk+embed step; reuses prior output when the same markdown has already been embedded with the current model and chunker. """ - settings = load_index_cache_settings() + settings = load_embedding_cache_settings() chunker_kind = "code" if use_code_chunker else "hybrid" embedding_dim = getattr(config.embedding_model_instance, "dimension", None) - cacheable = is_index_cacheable( + cacheable = is_embedding_cacheable( cache_enabled=settings.enabled, embedding_model=config.EMBEDDING_MODEL, embedding_dim=embedding_dim, @@ -57,13 +57,13 @@ async def build_chunk_embeddings( cached = await _recall(key) if cached is not None: - metrics.record_index_cache_lookup( + metrics.record_embedding_cache_lookup( embedding_model=key.embedding_model, chunker_kind=chunker_kind, outcome="hit" ) - logger.debug("Index cache hit for %s", key.markdown_sha256) + logger.debug("Embedding cache hit for %s", key.markdown_sha256) return cached.summary_embedding, [(c.text, c.embedding) for c in cached.chunks] - metrics.record_index_cache_lookup( + metrics.record_embedding_cache_lookup( embedding_model=key.embedding_model, chunker_kind=chunker_kind, outcome="miss" ) summary_embedding, chunk_pairs = await _compute( @@ -95,9 +95,9 @@ async def _recall(key: EmbeddingKey) -> EmbeddingSet | None: from app.tasks.celery_tasks import get_celery_session_maker async with get_celery_session_maker()() as session: - return await IndexCacheService(session).recall(key) + return await EmbeddingCacheService(session).recall(key) except Exception: - logger.warning("Index cache recall failed; embedding fresh", exc_info=True) + logger.warning("Embedding cache recall failed; embedding fresh", exc_info=True) return None @@ -112,9 +112,9 @@ async def _remember( chunks=[CachedChunk(text=text, embedding=vec) for text, vec in chunk_pairs], ) async with get_celery_session_maker()() as session: - await IndexCacheService(session).remember(key, embedding_set) + await EmbeddingCacheService(session).remember(key, embedding_set) except Exception: - logger.warning("Index cache write failed; result not cached", exc_info=True) + logger.warning("Embedding cache write failed; result not cached", exc_info=True) def _hash_text(text: str) -> str: diff --git a/surfsense_backend/app/indexing_pipeline/cache/eligibility.py b/surfsense_backend/app/indexing_pipeline/cache/eligibility.py index 7dbf79202..446bea2f8 100644 --- a/surfsense_backend/app/indexing_pipeline/cache/eligibility.py +++ b/surfsense_backend/app/indexing_pipeline/cache/eligibility.py @@ -1,9 +1,9 @@ -"""Gating rule: may this document be served from / written to the index cache?""" +"""Gating rule: may this document be served from / written to the embedding cache?""" from __future__ import annotations -def is_index_cacheable( +def is_embedding_cacheable( *, cache_enabled: bool, embedding_model: str | None, diff --git a/surfsense_backend/app/indexing_pipeline/cache/eviction/__init__.py b/surfsense_backend/app/indexing_pipeline/cache/eviction/__init__.py index de4df784e..a0f74b360 100644 --- a/surfsense_backend/app/indexing_pipeline/cache/eviction/__init__.py +++ b/surfsense_backend/app/indexing_pipeline/cache/eviction/__init__.py @@ -1,9 +1,9 @@ -"""Background pruning of the index cache by age and size budget.""" +"""Background pruning of the embedding cache by age and size budget.""" from __future__ import annotations -from .task import evict_index_cache_task +from .task import evict_embedding_cache_task __all__ = [ - "evict_index_cache_task", + "evict_embedding_cache_task", ] diff --git a/surfsense_backend/app/indexing_pipeline/cache/eviction/task.py b/surfsense_backend/app/indexing_pipeline/cache/eviction/task.py index ab6885bca..70eff6ea5 100644 --- a/surfsense_backend/app/indexing_pipeline/cache/eviction/task.py +++ b/surfsense_backend/app/indexing_pipeline/cache/eviction/task.py @@ -1,4 +1,4 @@ -"""Celery task that prunes the index cache by TTL, then by size budget.""" +"""Celery task that prunes the embedding cache by TTL, then by size budget.""" from __future__ import annotations @@ -10,7 +10,7 @@ from app.celery_app import celery_app from app.etl_pipeline.cache.eviction.policy import select_over_budget from app.etl_pipeline.cache.schemas import EvictionCandidate from app.indexing_pipeline.cache.persistence import CachedEmbeddingSetRepository -from app.indexing_pipeline.cache.settings import load_index_cache_settings +from app.indexing_pipeline.cache.settings import load_embedding_cache_settings from app.indexing_pipeline.cache.storage import EmbeddingCacheStore from app.observability import metrics from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task @@ -18,14 +18,14 @@ from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_ta logger = logging.getLogger(__name__) -@celery_app.task(name="evict_index_cache") -def evict_index_cache_task(): +@celery_app.task(name="evict_embedding_cache") +def evict_embedding_cache_task(): return run_async_celery_task(_evict) async def _evict() -> None: """Expire stale entries, then shed the coldest overflow only if still over budget.""" - settings = load_index_cache_settings() + settings = load_embedding_cache_settings() if not settings.enabled: return @@ -64,5 +64,5 @@ async def _drop( with contextlib.suppress(Exception): await store.delete(candidate.storage_key) await index.delete_by_ids([candidate.id for candidate in candidates]) - metrics.record_index_cache_eviction(len(candidates), phase=phase) + metrics.record_embedding_cache_eviction(len(candidates), phase=phase) logger.info("Evicted %d cached embedding sets (%s)", len(candidates), phase) diff --git a/surfsense_backend/app/indexing_pipeline/cache/persistence/models.py b/surfsense_backend/app/indexing_pipeline/cache/persistence/models.py index e33e470f0..af34d92d2 100644 --- a/surfsense_backend/app/indexing_pipeline/cache/persistence/models.py +++ b/surfsense_backend/app/indexing_pipeline/cache/persistence/models.py @@ -1,4 +1,4 @@ -"""``index_cache_embedding_sets``: one reusable chunk+embedding set per markdown.""" +"""``embedding_cache_sets``: one reusable chunk+embedding set per markdown.""" from __future__ import annotations @@ -16,7 +16,7 @@ from app.db import BaseModel, TimestampMixin class CachedEmbeddingSet(BaseModel, TimestampMixin): - __tablename__ = "index_cache_embedding_sets" + __tablename__ = "embedding_cache_sets" # Key: markdown text + the recipe that turned it into vectors. markdown_sha256 = Column(String(64), nullable=False) @@ -41,7 +41,7 @@ class CachedEmbeddingSet(BaseModel, TimestampMixin): "embedding_model", "chunker_kind", "chunker_version", - name="uq_index_cache_embedding_sets_key", + name="uq_embedding_cache_sets_key", ), - Index("ix_index_cache_embedding_sets_last_used_at", "last_used_at"), + Index("ix_embedding_cache_sets_last_used_at", "last_used_at"), ) diff --git a/surfsense_backend/app/indexing_pipeline/cache/persistence/repository.py b/surfsense_backend/app/indexing_pipeline/cache/persistence/repository.py index 0bb0f8f23..f7f1f4345 100644 --- a/surfsense_backend/app/indexing_pipeline/cache/persistence/repository.py +++ b/surfsense_backend/app/indexing_pipeline/cache/persistence/repository.py @@ -1,4 +1,4 @@ -"""CRUD and eviction selectors for ``index_cache_embedding_sets`` (no business rules).""" +"""CRUD and eviction selectors for ``embedding_cache_sets`` (no business rules).""" from __future__ import annotations @@ -74,7 +74,7 @@ class CachedEmbeddingSetRepository: last_used_at=now, created_at=now, ) - .on_conflict_do_nothing(constraint="uq_index_cache_embedding_sets_key") + .on_conflict_do_nothing(constraint="uq_embedding_cache_sets_key") ) await self._session.commit() diff --git a/surfsense_backend/app/indexing_pipeline/cache/schemas/__init__.py b/surfsense_backend/app/indexing_pipeline/cache/schemas/__init__.py index 8714e2d86..c200ca1a6 100644 --- a/surfsense_backend/app/indexing_pipeline/cache/schemas/__init__.py +++ b/surfsense_backend/app/indexing_pipeline/cache/schemas/__init__.py @@ -1,4 +1,4 @@ -"""Pure value objects for the index cache.""" +"""Pure value objects for the embedding cache.""" from __future__ import annotations diff --git a/surfsense_backend/app/indexing_pipeline/cache/service.py b/surfsense_backend/app/indexing_pipeline/cache/service.py index 942ba7d51..b1d634782 100644 --- a/surfsense_backend/app/indexing_pipeline/cache/service.py +++ b/surfsense_backend/app/indexing_pipeline/cache/service.py @@ -13,7 +13,7 @@ from app.indexing_pipeline.cache.storage import EmbeddingCacheStore logger = logging.getLogger(__name__) -class IndexCacheService: +class EmbeddingCacheService: def __init__(self, session: AsyncSession) -> None: self._index = CachedEmbeddingSetRepository(session) self._store = EmbeddingCacheStore() diff --git a/surfsense_backend/app/indexing_pipeline/cache/settings.py b/surfsense_backend/app/indexing_pipeline/cache/settings.py index 2991c0980..9c6737445 100644 --- a/surfsense_backend/app/indexing_pipeline/cache/settings.py +++ b/surfsense_backend/app/indexing_pipeline/cache/settings.py @@ -1,4 +1,4 @@ -"""Index-cache configuration resolved from the central ``Config``. +"""Embedding-cache configuration resolved from the central ``Config``. The blob backend is intentionally not configured here: it is shared with the ETL parse cache (see ``ETL_CACHE_STORAGE_*``). @@ -10,7 +10,7 @@ from dataclasses import dataclass @dataclass(frozen=True) -class IndexCacheSettings: +class EmbeddingCacheSettings: enabled: bool chunker_version: int ttl_days: int @@ -18,13 +18,13 @@ class IndexCacheSettings: eviction_batch: int -def load_index_cache_settings() -> IndexCacheSettings: +def load_embedding_cache_settings() -> EmbeddingCacheSettings: from app.config import config - return IndexCacheSettings( - enabled=config.INDEX_CACHE_ENABLED, - chunker_version=config.INDEX_CACHE_CHUNKER_VERSION, - ttl_days=config.INDEX_CACHE_TTL_DAYS, - max_total_bytes=config.INDEX_CACHE_MAX_TOTAL_MB * 1024 * 1024, - eviction_batch=config.INDEX_CACHE_EVICTION_BATCH, + return EmbeddingCacheSettings( + enabled=config.EMBEDDING_CACHE_ENABLED, + chunker_version=config.EMBEDDING_CACHE_CHUNKER_VERSION, + ttl_days=config.EMBEDDING_CACHE_TTL_DAYS, + max_total_bytes=config.EMBEDDING_CACHE_MAX_TOTAL_MB * 1024 * 1024, + eviction_batch=config.EMBEDDING_CACHE_EVICTION_BATCH, ) diff --git a/surfsense_backend/app/indexing_pipeline/cache/storage/embedding_store.py b/surfsense_backend/app/indexing_pipeline/cache/storage/embedding_store.py index 48835a12b..58c4a6cc1 100644 --- a/surfsense_backend/app/indexing_pipeline/cache/storage/embedding_store.py +++ b/surfsense_backend/app/indexing_pipeline/cache/storage/embedding_store.py @@ -7,8 +7,8 @@ markdown and its embeddings live side by side; only the object prefix differs. from __future__ import annotations from app.etl_pipeline.cache.storage.backend import resolve_cache_backend -from app.indexing_pipeline.cache.serialization import deserialize, serialize from app.indexing_pipeline.cache.schemas import EmbeddingKey, EmbeddingSet +from app.indexing_pipeline.cache.serialization import deserialize, serialize from app.indexing_pipeline.cache.storage.object_keys import build_embedding_object_key _EMBEDDING_CONTENT_TYPE = "application/octet-stream" diff --git a/surfsense_backend/app/indexing_pipeline/cache/storage/object_keys.py b/surfsense_backend/app/indexing_pipeline/cache/storage/object_keys.py index 90e0b8957..6286ccf90 100644 --- a/surfsense_backend/app/indexing_pipeline/cache/storage/object_keys.py +++ b/surfsense_backend/app/indexing_pipeline/cache/storage/object_keys.py @@ -4,7 +4,7 @@ from __future__ import annotations from app.indexing_pipeline.cache.schemas import EmbeddingKey -CACHE_PREFIX = "index_cache" +CACHE_PREFIX = "embedding_cache" def build_embedding_object_key(key: EmbeddingKey) -> str: diff --git a/surfsense_backend/app/observability/metrics.py b/surfsense_backend/app/observability/metrics.py index 61b380722..94bb55740 100644 --- a/surfsense_backend/app/observability/metrics.py +++ b/surfsense_backend/app/observability/metrics.py @@ -306,18 +306,18 @@ def _etl_cache_evictions(): @lru_cache(maxsize=1) -def _index_cache_lookups(): +def _embedding_cache_lookups(): return _get_meter().create_counter( - "surfsense.index.cache.lookups", - description="Count of index (chunk+embedding) cache lookups by outcome (hit/miss).", + "surfsense.embedding.cache.lookups", + description="Count of embedding (chunk+embedding) cache lookups by outcome (hit/miss).", ) @lru_cache(maxsize=1) -def _index_cache_evictions(): +def _embedding_cache_evictions(): return _get_meter().create_counter( - "surfsense.index.cache.evictions", - description="Count of index cache entries evicted, by phase.", + "surfsense.embedding.cache.evictions", + description="Count of embedding cache entries evicted, by phase.", ) @@ -724,12 +724,12 @@ def record_etl_cache_eviction(count: int, *, phase: str) -> None: _add(_etl_cache_evictions(), count, {"phase": phase}) -def record_index_cache_lookup( +def record_embedding_cache_lookup( *, embedding_model: str | None, chunker_kind: str | None, outcome: str ) -> None: - """Record an index-cache lookup. ``outcome`` is ``hit`` or ``miss``.""" + """Record an embedding-cache lookup. ``outcome`` is ``hit`` or ``miss``.""" _add( - _index_cache_lookups(), + _embedding_cache_lookups(), 1, { "embedding.model": embedding_model or "unknown", @@ -739,11 +739,11 @@ def record_index_cache_lookup( ) -def record_index_cache_eviction(count: int, *, phase: str) -> None: +def record_embedding_cache_eviction(count: int, *, phase: str) -> None: """Record evicted entries. ``phase`` is ``ttl`` or ``size``.""" if count <= 0: return - _add(_index_cache_evictions(), count, {"phase": phase}) + _add(_embedding_cache_evictions(), count, {"phase": phase}) def record_celery_heartbeat_refresh(*, heartbeat_type: str) -> None: @@ -942,12 +942,12 @@ __all__ = [ "record_compaction_run", "record_connector_sync_duration", "record_connector_sync_outcome", + "record_embedding_cache_eviction", + "record_embedding_cache_lookup", "record_etl_cache_eviction", "record_etl_cache_lookup", "record_etl_extract_duration", "record_etl_extract_outcome", - "record_index_cache_eviction", - "record_index_cache_lookup", "record_indexing_document_duration", "record_indexing_document_outcome", "record_interrupt", diff --git a/surfsense_backend/tests/unit/indexing_pipeline/cache/test_eligibility.py b/surfsense_backend/tests/unit/indexing_pipeline/cache/test_eligibility.py index 780a6c536..2e488231c 100644 --- a/surfsense_backend/tests/unit/indexing_pipeline/cache/test_eligibility.py +++ b/surfsense_backend/tests/unit/indexing_pipeline/cache/test_eligibility.py @@ -1,28 +1,28 @@ -from app.indexing_pipeline.cache.eligibility import is_index_cacheable +from app.indexing_pipeline.cache.eligibility import is_embedding_cacheable def test_disabled_cache_is_never_cacheable(): - assert not is_index_cacheable( + assert not is_embedding_cacheable( cache_enabled=False, embedding_model="m", embedding_dim=384 ) def test_missing_model_is_not_cacheable(): - assert not is_index_cacheable( + assert not is_embedding_cacheable( cache_enabled=True, embedding_model=None, embedding_dim=384 ) def test_missing_dimension_is_not_cacheable(): - assert not is_index_cacheable( + assert not is_embedding_cacheable( cache_enabled=True, embedding_model="m", embedding_dim=None ) - assert not is_index_cacheable( + assert not is_embedding_cacheable( cache_enabled=True, embedding_model="m", embedding_dim=0 ) def test_enabled_with_model_and_dim_is_cacheable(): - assert is_index_cacheable( + assert is_embedding_cacheable( cache_enabled=True, embedding_model="m", embedding_dim=384 ) From 412493ae08ab6d6333ece255334fec7c67c4d53d Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 17:33:21 +0200 Subject: [PATCH 44/52] test(embedding-cache): add integration tests for service, repository, and store Covers the public cache surface against real Postgres and a real local file backend (no mocks): recall miss, remember->recall vector/text/order round-trip, the dimension-mismatch refusal, the repository SQL behind eviction and dedup (size sum, coldest ordering, TTL cutoff, duplicate-key no-op, reuse counter), and the blob store save/load round-trip and delete. --- .../indexing_pipeline/cache/conftest.py | 33 ++++++ .../cache/test_cached_embedding_repository.py | 110 ++++++++++++++++++ .../cache/test_embedding_cache_service.py | 70 +++++++++++ .../cache/test_embedding_store.py | 63 ++++++++++ 4 files changed, 276 insertions(+) create mode 100644 surfsense_backend/tests/integration/indexing_pipeline/cache/conftest.py create mode 100644 surfsense_backend/tests/integration/indexing_pipeline/cache/test_cached_embedding_repository.py create mode 100644 surfsense_backend/tests/integration/indexing_pipeline/cache/test_embedding_cache_service.py create mode 100644 surfsense_backend/tests/integration/indexing_pipeline/cache/test_embedding_store.py diff --git a/surfsense_backend/tests/integration/indexing_pipeline/cache/conftest.py b/surfsense_backend/tests/integration/indexing_pipeline/cache/conftest.py new file mode 100644 index 000000000..6acb457ee --- /dev/null +++ b/surfsense_backend/tests/integration/indexing_pipeline/cache/conftest.py @@ -0,0 +1,33 @@ +"""Real-infra fixtures for the embedding-cache integration tests. + +``cache_local_storage`` points the shared cache backend at a throwaway directory +so tests exercise the real ``LocalFileBackend`` (no cloud, no mocks); the +embedding cache reuses the ETL cache backend, hence the ``ETL_CACHE_STORAGE_*`` +knobs. ``clean_embedding_cache_table`` removes rows written through the store's +own committing session, which the savepoint-rolled-back ``db_session`` cannot undo. +""" + +from __future__ import annotations + +import pytest +import pytest_asyncio +from sqlalchemy import text + + +@pytest.fixture +def cache_local_storage(tmp_path, monkeypatch): + from app.config import config + from app.etl_pipeline.cache.storage.backend import resolve_cache_backend + + monkeypatch.setattr(config, "ETL_CACHE_STORAGE_BACKEND", "local") + monkeypatch.setattr(config, "ETL_CACHE_STORAGE_LOCAL_PATH", str(tmp_path)) + resolve_cache_backend.cache_clear() + yield tmp_path + resolve_cache_backend.cache_clear() + + +@pytest_asyncio.fixture +async def clean_embedding_cache_table(async_engine): + yield + async with async_engine.begin() as conn: + await conn.execute(text("DELETE FROM embedding_cache_sets")) diff --git a/surfsense_backend/tests/integration/indexing_pipeline/cache/test_cached_embedding_repository.py b/surfsense_backend/tests/integration/indexing_pipeline/cache/test_cached_embedding_repository.py new file mode 100644 index 000000000..446932793 --- /dev/null +++ b/surfsense_backend/tests/integration/indexing_pipeline/cache/test_cached_embedding_repository.py @@ -0,0 +1,110 @@ +"""CachedEmbeddingSetRepository against real Postgres: the SQL behind eviction & dedup. + +These verify the parts only a real database can: the size accumulator, +coldest-first ordering by reuse then recency, TTL cutoff selection, the +insert-once guarantee under a duplicate key, and the reuse counter. +""" + +from __future__ import annotations + +from datetime import UTC, datetime, timedelta + +import pytest + +from app.indexing_pipeline.cache.persistence import CachedEmbeddingSetRepository +from app.indexing_pipeline.cache.schemas import EmbeddingKey + +pytestmark = pytest.mark.integration + + +def _key(sha: str) -> EmbeddingKey: + return EmbeddingKey( + markdown_sha256=sha, + embedding_model="test-model", + embedding_dim=4, + chunker_kind="hybrid", + chunker_version=1, + ) + + +async def _insert(repo, *, sha, size=100, storage_key=None, chunk_count=1): + key = _key(sha) + await repo.insert( + key=key, + storage_backend="local", + storage_key=storage_key or f"embedding_cache/{sha}.emb", + size_bytes=size, + chunk_count=chunk_count, + ) + return key + + +async def test_total_size_bytes_sums_all_rows(db_session): + repo = CachedEmbeddingSetRepository(db_session) + await _insert(repo, sha="a" * 64, size=100) + await _insert(repo, sha="b" * 64, size=250) + + assert await repo.total_size_bytes() == 350 + + +async def test_select_coldest_orders_by_reuse_then_recency(db_session): + repo = CachedEmbeddingSetRepository(db_session) + ka = await _insert(repo, sha="a" * 64) + kb = await _insert(repo, sha="b" * 64) + kc = await _insert(repo, sha="c" * 64) + + # Warm B once and C twice; A stays untouched and should be coldest. + await repo.mark_used((await repo.get(kb)).id) + await repo.mark_used((await repo.get(kc)).id) + await repo.mark_used((await repo.get(kc)).id) + + coldest = await repo.select_coldest(limit=10) + + assert [c.id for c in coldest][:3] == [ + (await repo.get(ka)).id, + (await repo.get(kb)).id, + (await repo.get(kc)).id, + ] + + +async def test_select_expired_returns_only_rows_older_than_cutoff(db_session): + repo = CachedEmbeddingSetRepository(db_session) + await _insert(repo, sha="a" * 64) + + future = datetime.now(UTC) + timedelta(days=1) + past = datetime.now(UTC) - timedelta(days=1) + + # Row was just used, so it predates a future cutoff but not a past one. + assert len(await repo.select_expired(cutoff=future, limit=10)) == 1 + assert await repo.select_expired(cutoff=past, limit=10) == [] + + +async def test_duplicate_key_insert_keeps_the_first_row(db_session): + repo = CachedEmbeddingSetRepository(db_session) + key = await _insert( + repo, sha="a" * 64, size=100, storage_key="embedding_cache/first.emb" + ) + + # Same content-addressed key (a concurrent re-embed): must be a no-op. + await repo.insert( + key=key, + storage_backend="local", + storage_key="embedding_cache/second.emb", + size_bytes=999, + chunk_count=42, + ) + + row = await repo.get(key) + assert row.storage_key == "embedding_cache/first.emb" + assert await repo.total_size_bytes() == 100 + + +async def test_mark_used_increments_reuse_count(db_session): + repo = CachedEmbeddingSetRepository(db_session) + key = await _insert(repo, sha="a" * 64) + assert (await repo.get(key)).times_reused == 0 + + await repo.mark_used((await repo.get(key)).id) + await repo.mark_used((await repo.get(key)).id) + + assert (await repo.get(key)).times_reused == 2 diff --git a/surfsense_backend/tests/integration/indexing_pipeline/cache/test_embedding_cache_service.py b/surfsense_backend/tests/integration/indexing_pipeline/cache/test_embedding_cache_service.py new file mode 100644 index 000000000..2f4cd4a89 --- /dev/null +++ b/surfsense_backend/tests/integration/indexing_pipeline/cache/test_embedding_cache_service.py @@ -0,0 +1,70 @@ +"""EmbeddingCacheService end-to-end against real Postgres + real local storage. + +Exercises the public cache surface -- ``recall`` / ``remember`` -- with no mocks: +a miss returns nothing, a remembered set comes back as equivalent vectors, and a +dimension mismatch is refused rather than served. +""" + +from __future__ import annotations + +import numpy as np +import pytest + +from app.indexing_pipeline.cache.schemas import CachedChunk, EmbeddingKey, EmbeddingSet +from app.indexing_pipeline.cache.service import EmbeddingCacheService + +pytestmark = pytest.mark.integration + + +def _key(sha: str = "c" * 64, *, dim: int = 4) -> EmbeddingKey: + return EmbeddingKey( + markdown_sha256=sha, + embedding_model="test-model", + embedding_dim=dim, + chunker_kind="hybrid", + chunker_version=1, + ) + + +async def test_recall_is_a_miss_for_an_unknown_key(db_session, cache_local_storage): + service = EmbeddingCacheService(db_session) + assert await service.recall(_key()) is None + + +async def test_remembered_set_recalls_as_equivalent_vectors( + db_session, cache_local_storage, clean_embedding_cache_table +): + service = EmbeddingCacheService(db_session) + stored = EmbeddingSet( + summary_embedding=np.array([0.1, 0.2, 0.3, 0.4], dtype=np.float32), + chunks=[ + CachedChunk("first chunk", np.array([1.0, 0.0, 0.0, 0.0], dtype=np.float32)), + CachedChunk("second chunk", np.array([0.0, 1.0, 0.0, 0.0], dtype=np.float32)), + ], + ) + + await service.remember(_key(), stored) + recalled = await service.recall(_key()) + + assert recalled is not None + assert np.array_equal(recalled.summary_embedding, stored.summary_embedding) + assert [c.text for c in recalled.chunks] == ["first chunk", "second chunk"] + assert np.array_equal(recalled.chunks[0].embedding, stored.chunks[0].embedding) + assert np.array_equal(recalled.chunks[1].embedding, stored.chunks[1].embedding) + + +async def test_recall_refuses_a_set_whose_dimension_changed( + db_session, cache_local_storage, clean_embedding_cache_table +): + # A model kept its name but changed its output width: never serve the stale blob. + service = EmbeddingCacheService(db_session) + stored = EmbeddingSet( + summary_embedding=np.array([0.1, 0.2, 0.3, 0.4], dtype=np.float32), + chunks=[CachedChunk("c", np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32))], + ) + await service.remember(_key(dim=4), stored) + + # Same identity (model + chunker + markdown), but the caller now expects dim 8. + recalled = await service.recall(_key(dim=8)) + + assert recalled is None diff --git a/surfsense_backend/tests/integration/indexing_pipeline/cache/test_embedding_store.py b/surfsense_backend/tests/integration/indexing_pipeline/cache/test_embedding_store.py new file mode 100644 index 000000000..83becd7b5 --- /dev/null +++ b/surfsense_backend/tests/integration/indexing_pipeline/cache/test_embedding_store.py @@ -0,0 +1,63 @@ +"""EmbeddingCacheStore against a real local filesystem backend (no mocks). + +Proves the blob side of the cache: an embedding set written under a +content-addressed key comes back with identical vectors, and a delete actually +removes it. +""" + +from __future__ import annotations + +import numpy as np +import pytest + +from app.indexing_pipeline.cache.schemas import CachedChunk, EmbeddingKey, EmbeddingSet +from app.indexing_pipeline.cache.storage import EmbeddingCacheStore +from app.indexing_pipeline.cache.storage.object_keys import build_embedding_object_key + +pytestmark = pytest.mark.integration + + +def _key() -> EmbeddingKey: + return EmbeddingKey( + markdown_sha256="d" * 64, + embedding_model="test-model", + embedding_dim=4, + chunker_kind="hybrid", + chunker_version=1, + ) + + +def _set() -> EmbeddingSet: + return EmbeddingSet( + summary_embedding=np.array([0.5, 0.25, 0.125, 0.0625], dtype=np.float32), + chunks=[ + CachedChunk("café, naïve, 漢字", np.array([1, 2, 3, 4], dtype=np.float32)), + CachedChunk("second", np.array([5, 6, 7, 8], dtype=np.float32)), + ], + ) + + +async def test_save_then_load_round_trips_the_embedding_set(cache_local_storage): + store = EmbeddingCacheStore() + embedding_set = _set() + + storage_key, size_bytes = await store.save(_key(), embedding_set) + loaded = await store.load(storage_key) + + assert storage_key == build_embedding_object_key(_key()) + assert size_bytes > 0 + assert np.array_equal(loaded.summary_embedding, embedding_set.summary_embedding) + assert [c.text for c in loaded.chunks] == ["café, naïve, 漢字", "second"] + assert np.array_equal(loaded.chunks[0].embedding, embedding_set.chunks[0].embedding) + assert np.array_equal(loaded.chunks[1].embedding, embedding_set.chunks[1].embedding) + + +async def test_delete_removes_the_blob(cache_local_storage): + store = EmbeddingCacheStore() + storage_key, _ = await store.save(_key(), _set()) + + await store.delete(storage_key) + + # Eviction deleted the blob; a later read must fail rather than serve stale. + with pytest.raises(FileNotFoundError): + await store.load(storage_key) From c6e71c851cdba5323f6b84b823650fd0e079e07a Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 18:52:45 +0200 Subject: [PATCH 45/52] feat(chunks): add explicit position column with backfill migration Chunk ids stop reflecting document order once incremental re-indexing keeps unchanged rows across edits. Backfill preserves the historical id ordering so behavior is identical on day one. --- .../versions/162_add_chunk_position.py | 51 +++++++++++++++++++ surfsense_backend/app/db.py | 8 ++- 2 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 surfsense_backend/alembic/versions/162_add_chunk_position.py diff --git a/surfsense_backend/alembic/versions/162_add_chunk_position.py b/surfsense_backend/alembic/versions/162_add_chunk_position.py new file mode 100644 index 000000000..cb240e3ef --- /dev/null +++ b/surfsense_backend/alembic/versions/162_add_chunk_position.py @@ -0,0 +1,51 @@ +"""add chunks.position for explicit document order + +Incremental re-indexing keeps unchanged chunk rows, so auto-increment ids no +longer reflect document order. Backfill preserves the historical id ordering. + +Revision ID: 162 +Revises: 161 +""" + +from collections.abc import Sequence + +from alembic import op + +revision: str = "162" +down_revision: str | None = "161" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + op.execute( + "ALTER TABLE chunks ADD COLUMN IF NOT EXISTS position INTEGER NOT NULL DEFAULT 0;" + ) + + # Backfill: document order so far has been the insertion order (id). + op.execute( + """ + UPDATE chunks + SET position = numbered.rn + FROM ( + SELECT id, + ROW_NUMBER() OVER (PARTITION BY document_id ORDER BY id) - 1 AS rn + FROM chunks + ) AS numbered + WHERE chunks.id = numbered.id; + """ + ) + + op.execute( + "CREATE INDEX IF NOT EXISTS ix_chunks_position ON chunks(position);" + ) + op.execute( + "CREATE INDEX IF NOT EXISTS ix_chunks_document_id_position " + "ON chunks(document_id, position);" + ) + + +def downgrade() -> None: + op.execute("DROP INDEX IF EXISTS ix_chunks_document_id_position;") + op.execute("DROP INDEX IF EXISTS ix_chunks_position;") + op.execute("ALTER TABLE chunks DROP COLUMN IF EXISTS position;") diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index 9ec13f4e2..8d110bbf1 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -1484,7 +1484,10 @@ class Document(BaseModel, TimestampMixin): created_by = relationship("User", back_populates="documents") connector = relationship("SearchSourceConnector", back_populates="documents") chunks = relationship( - "Chunk", back_populates="document", cascade="all, delete-orphan" + "Chunk", + back_populates="document", + cascade="all, delete-orphan", + order_by="Chunk.position", ) # Original upload + future derived artifacts (redacted, filled-form). # Model lives in app.file_storage.persistence to keep that feature cohesive. @@ -1520,6 +1523,9 @@ class Chunk(BaseModel, TimestampMixin): content = Column(Text, nullable=False) embedding = Column(Vector(config.embedding_model_instance.dimension)) + # Explicit document order; ids don't follow it since incremental + # re-indexing keeps unchanged rows across edits. + position = Column(Integer, nullable=False, server_default="0", index=True) document_id = Column( Integer, From f82dedf712862b96e5748cbc3ae539ae5a071e0c Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 18:52:46 +0200 Subject: [PATCH 46/52] feat(indexing): add pure chunk reconciler for content-addressed diffs Greedy multiset match on chunk text decides which rows keep their embeddings, which texts need embedding, and which rows are deleted. No DB, no embeddings; fully unit-tested (reuse, head insert, middle edit, deletion, duplicates, reorder, full rewrite). --- .../app/indexing_pipeline/chunk_reconciler.py | 56 +++++++++++ .../test_chunk_reconciler.py | 94 +++++++++++++++++++ 2 files changed, 150 insertions(+) create mode 100644 surfsense_backend/app/indexing_pipeline/chunk_reconciler.py create mode 100644 surfsense_backend/tests/unit/indexing_pipeline/test_chunk_reconciler.py diff --git a/surfsense_backend/app/indexing_pipeline/chunk_reconciler.py b/surfsense_backend/app/indexing_pipeline/chunk_reconciler.py new file mode 100644 index 000000000..9354aeb9f --- /dev/null +++ b/surfsense_backend/app/indexing_pipeline/chunk_reconciler.py @@ -0,0 +1,56 @@ +"""Diff a document's existing chunk rows against its freshly chunked texts. + +Embeddings are a pure function of chunk text, so a row whose content reappears +in the new chunking keeps its embedding (and its HNSW/GIN index entries); only +genuinely new texts are embedded and only vanished rows are deleted. Matching +is a greedy multiset match on content in document order, so duplicate +boilerplate chunks pair up one-to-one and reordered chunks become cheap +position updates instead of delete+reinsert. +""" + +from __future__ import annotations + +from collections import defaultdict, deque +from dataclasses import dataclass + + +@dataclass(frozen=True, slots=True) +class ExistingChunk: + id: int + content: str + position: int + + +@dataclass(frozen=True, slots=True) +class ChunkPlan: + """The minimal set of writes that turns the stored chunks into the new ones. + + ``reused`` holds only kept rows whose position actually changed; rows that + match in place need no write at all. Kept-row count (for metrics) is + ``len(existing) - len(to_delete)``. + """ + + reused: list[tuple[int, int]] # (existing_chunk_id, new_position) + to_embed: list[tuple[int, str]] # (new_position, text) + to_delete: list[int] # existing chunk ids + + +def reconcile(existing: list[ExistingChunk], new_texts: list[str]) -> ChunkPlan: + available: dict[str, deque[ExistingChunk]] = defaultdict(deque) + for chunk in sorted(existing, key=lambda c: c.position): + available[chunk.content].append(chunk) + + reused: list[tuple[int, int]] = [] + to_embed: list[tuple[int, str]] = [] + + for new_position, text in enumerate(new_texts): + matches = available.get(text) + if matches: + chunk = matches.popleft() + if chunk.position != new_position: + reused.append((chunk.id, new_position)) + else: + to_embed.append((new_position, text)) + + to_delete = [chunk.id for queue in available.values() for chunk in queue] + return ChunkPlan(reused=reused, to_embed=to_embed, to_delete=to_delete) diff --git a/surfsense_backend/tests/unit/indexing_pipeline/test_chunk_reconciler.py b/surfsense_backend/tests/unit/indexing_pipeline/test_chunk_reconciler.py new file mode 100644 index 000000000..7effce840 --- /dev/null +++ b/surfsense_backend/tests/unit/indexing_pipeline/test_chunk_reconciler.py @@ -0,0 +1,94 @@ +"""reconcile(): diff existing chunk rows against new chunk texts. + +The reconciler decides which rows (and embeddings) survive an edit, which texts +must be embedded, and which rows go away -- purely from content, no DB. +""" + +from __future__ import annotations + +from app.indexing_pipeline.chunk_reconciler import ExistingChunk, reconcile + + +def _existing(*contents: str) -> list[ExistingChunk]: + return [ + ExistingChunk(id=i + 1, content=text, position=i) + for i, text in enumerate(contents) + ] + + +def test_identical_content_keeps_every_row_untouched(): + plan = reconcile(_existing("alpha", "beta", "gamma"), ["alpha", "beta", "gamma"]) + + assert plan.to_embed == [] + assert plan.to_delete == [] + assert plan.reused == [] + + +def test_head_insert_embeds_only_the_new_chunk_and_shifts_the_rest(): + plan = reconcile(_existing("alpha", "beta"), ["intro", "alpha", "beta"]) + + assert plan.to_embed == [(0, "intro")] + assert plan.to_delete == [] + # alpha: position 0 -> 1, beta: 1 -> 2; embeddings untouched. + assert plan.reused == [(1, 1), (2, 2)] + + +def test_middle_edit_swaps_exactly_one_chunk(): + plan = reconcile( + _existing("alpha", "beta", "gamma"), ["alpha", "beta EDITED", "gamma"] + ) + + assert plan.to_embed == [(1, "beta EDITED")] + assert plan.to_delete == [2] + # Neighbours did not move, so no position writes at all. + assert plan.reused == [] + + +def test_removed_chunk_is_deleted_and_followers_shift_up(): + plan = reconcile(_existing("alpha", "beta", "gamma"), ["alpha", "gamma"]) + + assert plan.to_embed == [] + assert plan.to_delete == [2] + assert plan.reused == [(3, 1)] + + +def test_duplicate_texts_pair_up_one_to_one(): + # Two identical boilerplate chunks, only one survives the edit: exactly one + # row is kept and exactly one is deleted -- never both kept or both dropped. + plan = reconcile(_existing("boiler", "boiler", "body"), ["boiler", "body"]) + + assert plan.to_embed == [] + assert plan.to_delete == [2] + assert plan.reused == [(3, 1)] + + +def test_duplicate_growth_embeds_only_the_extra_copy(): + plan = reconcile(_existing("boiler", "body"), ["boiler", "boiler", "body"]) + + assert plan.to_embed == [(1, "boiler")] + assert plan.to_delete == [] + assert plan.reused == [(2, 2)] + + +def test_reorder_becomes_position_updates_with_no_embedding(): + plan = reconcile(_existing("alpha", "beta"), ["beta", "alpha"]) + + assert plan.to_embed == [] + assert plan.to_delete == [] + assert sorted(plan.reused) == [(1, 1), (2, 0)] + + +def test_full_rewrite_replaces_everything(): + plan = reconcile(_existing("alpha", "beta"), ["new one", "new two"]) + + assert plan.to_embed == [(0, "new one"), (1, "new two")] + assert sorted(plan.to_delete) == [1, 2] + assert plan.reused == [] + + +def test_no_existing_chunks_embeds_all(): + plan = reconcile([], ["alpha", "beta"]) + + assert plan.to_embed == [(0, "alpha"), (1, "beta")] + assert plan.to_delete == [] + assert plan.reused == [] From 8d413ea5c2b644e86de2f626b8ba461cfe4baa42 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 18:52:57 +0200 Subject: [PATCH 47/52] refactor(indexing): expose chunk_markdown and embed_batch helpers Split _compute so the incremental edit path can reuse the exact same chunker selection and embedding entry points (and their test patch targets) without going through the doc-level cache. --- .../cache/cached_indexing.py | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py b/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py index c93f2f133..95321a229 100644 --- a/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py +++ b/surfsense_backend/app/indexing_pipeline/cache/cached_indexing.py @@ -58,7 +58,9 @@ async def build_chunk_embeddings( cached = await _recall(key) if cached is not None: metrics.record_embedding_cache_lookup( - embedding_model=key.embedding_model, chunker_kind=chunker_kind, outcome="hit" + embedding_model=key.embedding_model, + chunker_kind=chunker_kind, + outcome="hit", ) logger.debug("Embedding cache hit for %s", key.markdown_sha256) return cached.summary_embedding, [(c.text, c.embedding) for c in cached.chunks] @@ -73,18 +75,24 @@ async def build_chunk_embeddings( return summary_embedding, chunk_pairs +async def chunk_markdown(markdown: str, *, use_code_chunker: bool) -> list[str]: + """Chunk markdown into ordered texts with the pipeline's chunker selection.""" + if use_code_chunker: + return await asyncio.to_thread(chunk_text, markdown, use_code_chunker=True) + # Table-aware hybrid chunker keeps Markdown tables intact (issue #1334). + return await asyncio.to_thread(chunk_text_hybrid, markdown) + + +async def embed_batch(texts: list[str]) -> list[np.ndarray]: + """Embed texts in one batch off the event loop.""" + return await asyncio.to_thread(embed_texts, texts) + + async def _compute( markdown: str, *, use_code_chunker: bool ) -> tuple[np.ndarray, list[ChunkPair]]: - if use_code_chunker: - chunk_texts = await asyncio.to_thread( - chunk_text, markdown, use_code_chunker=True - ) - else: - # Table-aware hybrid chunker keeps Markdown tables intact (issue #1334). - chunk_texts = await asyncio.to_thread(chunk_text_hybrid, markdown) - - embeddings = await asyncio.to_thread(embed_texts, [markdown, *chunk_texts]) + chunk_texts = await chunk_markdown(markdown, use_code_chunker=use_code_chunker) + embeddings = await embed_batch([markdown, *chunk_texts]) summary_embedding, *chunk_embeddings = embeddings return summary_embedding, list(zip(chunk_texts, chunk_embeddings, strict=False)) From fd495e1b2f69034e038798a291300b0d2fbce7b2 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 18:52:57 +0200 Subject: [PATCH 48/52] feat(observability): add chunk reconcile metric and kill-switch flag surfsense.indexing.reconcile.chunks counts reused/embedded/deleted chunks per re-index. CHUNK_RECONCILE_ENABLED (default on) falls back to delete-all + full re-embed if the diff path ever misbehaves. --- surfsense_backend/.env.example | 5 ++++ surfsense_backend/app/config/__init__.py | 7 ++++++ .../app/observability/metrics.py | 23 +++++++++++++++++++ 3 files changed, 35 insertions(+) diff --git a/surfsense_backend/.env.example b/surfsense_backend/.env.example index ac289c5a6..1e09b266a 100644 --- a/surfsense_backend/.env.example +++ b/surfsense_backend/.env.example @@ -342,6 +342,11 @@ EMBEDDING_CACHE_ENABLED=false # Rows deleted per eviction pass. # EMBEDDING_CACHE_EVICTION_BATCH=500 +# Incremental re-indexing: on document edits, keep chunks whose text is +# unchanged (reusing their embeddings) and embed only new/changed ones. +# Set to false to fall back to delete-all + full re-embed (kill switch). +# CHUNK_RECONCILE_ENABLED=true + # Daytona Sandbox (isolated code execution) # DAYTONA_SANDBOX_ENABLED=FALSE # DAYTONA_API_KEY=your-daytona-api-key diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index 549252cec..c242419f6 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -979,6 +979,13 @@ class Config: os.getenv("EMBEDDING_CACHE_EVICTION_BATCH", "500") ) + # Incremental re-indexing: on document edits, keep chunk rows whose text is + # unchanged (reusing their embeddings) and embed only new/changed chunks. + # Kill switch -- disabling falls back to delete-all + full re-embed. + CHUNK_RECONCILE_ENABLED = ( + os.getenv("CHUNK_RECONCILE_ENABLED", "true").strip().lower() == "true" + ) + # Proxy provider selection. Maps to a ProxyProvider implementation registered # in app/utils/proxy/registry.py. Add new vendors there and switch via this var. PROXY_PROVIDER = os.getenv("PROXY_PROVIDER", "anonymous_proxies") diff --git a/surfsense_backend/app/observability/metrics.py b/surfsense_backend/app/observability/metrics.py index 94bb55740..ade43ab01 100644 --- a/surfsense_backend/app/observability/metrics.py +++ b/surfsense_backend/app/observability/metrics.py @@ -321,6 +321,17 @@ def _embedding_cache_evictions(): ) +@lru_cache(maxsize=1) +def _chunk_reconcile_chunks(): + return _get_meter().create_counter( + "surfsense.indexing.reconcile.chunks", + description=( + "Chunks handled by incremental re-indexing, by outcome " + "(reused/embedded/deleted)." + ), + ) + + @lru_cache(maxsize=1) def _celery_heartbeat_refreshes(): return _get_meter().create_counter( @@ -746,6 +757,17 @@ def record_embedding_cache_eviction(count: int, *, phase: str) -> None: _add(_embedding_cache_evictions(), count, {"phase": phase}) +def record_chunk_reconcile(*, reused: int, embedded: int, deleted: int) -> None: + """Record an incremental re-index: how many chunks were kept vs recomputed.""" + for outcome, count in ( + ("reused", reused), + ("embedded", embedded), + ("deleted", deleted), + ): + if count > 0: + _add(_chunk_reconcile_chunks(), count, {"outcome": outcome}) + + def record_celery_heartbeat_refresh(*, heartbeat_type: str) -> None: _add(_celery_heartbeat_refreshes(), 1, {"heartbeat.type": heartbeat_type}) @@ -939,6 +961,7 @@ __all__ = [ "record_celery_queue_latency", "record_chat_request_duration", "record_chat_request_outcome", + "record_chunk_reconcile", "record_compaction_run", "record_connector_sync_duration", "record_connector_sync_outcome", From 7d55aaf2c183b37b715be3c9f4d15cff5156e04a Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 18:53:08 +0200 Subject: [PATCH 49/52] feat(indexing): reconcile chunks incrementally on re-index index() now loads existing rows and applies a content diff instead of delete-all/reinsert-all: unchanged chunks keep their rows and embeddings (zero HNSW/GIN churn), moved chunks get a position-only UPDATE, and only new texts are embedded, batched with the summary embedding. First index keeps the cache-aware build_chunk_embeddings path. --- .../indexing_pipeline_service.py | 119 +++++++++++++++--- 1 file changed, 101 insertions(+), 18 deletions(-) diff --git a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py index 271b3ee03..224eb0f5d 100644 --- a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py +++ b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py @@ -8,7 +8,7 @@ from collections.abc import Awaitable, Callable from dataclasses import dataclass, field from datetime import UTC, datetime -from sqlalchemy import delete, select +from sqlalchemy import delete, select, update from sqlalchemy.exc import IntegrityError from sqlalchemy.ext.asyncio import AsyncSession @@ -20,6 +20,8 @@ from app.db import ( DocumentType, ) from app.indexing_pipeline.cache import build_chunk_embeddings +from app.indexing_pipeline.cache.cached_indexing import chunk_markdown, embed_batch +from app.indexing_pipeline.chunk_reconciler import ExistingChunk, reconcile from app.indexing_pipeline.connector_document import ConnectorDocument from app.indexing_pipeline.document_hashing import ( compute_content_hash, @@ -379,39 +381,34 @@ class IndexingPipelineService: content = connector_doc.source_markdown - await self.session.execute( - delete(Chunk).where(Chunk.document_id == document.id) - ) - t_step = time.perf_counter() - summary_embedding, chunk_pairs = await build_chunk_embeddings( - content, - use_code_chunker=connector_doc.should_use_code_chunker, - ) - - chunks = [ - Chunk(content=text, embedding=emb) for text, emb in chunk_pairs - ] + existing = await self._load_existing_chunks(document.id) + if existing and self._reconcile_enabled(): + chunk_count = await self._reindex_incrementally( + document, content, connector_doc, existing + ) + else: + chunk_count = await self._reindex_from_scratch( + document, content, connector_doc + ) perf.info( "[indexing] chunk+embed doc=%d chunks=%d in %.3fs", document.id, - len(chunks), + chunk_count, time.perf_counter() - t_step, ) document.content = content - document.embedding = summary_embedding - attach_chunks_to_document(document, chunks) document.updated_at = datetime.now(UTC) document.status = DocumentStatus.ready() await self.session.commit() perf.info( "[indexing] index TOTAL doc=%d chunks=%d in %.3fs", document.id, - len(chunks), + chunk_count, time.perf_counter() - t_index, ) - log_index_success(ctx, chunk_count=len(chunks)) + log_index_success(ctx, chunk_count=chunk_count) outcome_status = "success" await self._enqueue_ai_sort_if_enabled(document) @@ -468,6 +465,92 @@ class IndexingPipelineService: persist_span_cm.__exit__(*sys.exc_info()) return document + @staticmethod + def _reconcile_enabled() -> bool: + from app.config import config + + return config.CHUNK_RECONCILE_ENABLED + + async def _load_existing_chunks(self, document_id: int) -> list[ExistingChunk]: + result = await self.session.execute( + select(Chunk.id, Chunk.content, Chunk.position).where( + Chunk.document_id == document_id + ) + ) + return [ + ExistingChunk(id=row.id, content=row.content, position=row.position) + for row in result + ] + + async def _reindex_from_scratch( + self, document: Document, content: str, connector_doc: ConnectorDocument + ) -> int: + """First index (or kill-switched re-index): cache-aware full chunk+embed.""" + await self.session.execute( + delete(Chunk).where(Chunk.document_id == document.id) + ) + + summary_embedding, chunk_pairs = await build_chunk_embeddings( + content, + use_code_chunker=connector_doc.should_use_code_chunker, + ) + + chunks = [ + Chunk(content=text, embedding=emb, position=i) + for i, (text, emb) in enumerate(chunk_pairs) + ] + document.embedding = summary_embedding + attach_chunks_to_document(document, chunks) + return len(chunks) + + async def _reindex_incrementally( + self, + document: Document, + content: str, + connector_doc: ConnectorDocument, + existing: list[ExistingChunk], + ) -> int: + """Edit path: keep rows whose text survived, embed only new texts. + + Unchanged rows keep their embedding and their HNSW/GIN index entries; + moved rows get a position-only UPDATE, which touches neither index. + """ + new_texts = await chunk_markdown( + content, use_code_chunker=connector_doc.should_use_code_chunker + ) + plan = reconcile(existing, new_texts) + + # One batch: the document-level summary vector plus the missing chunks. + embeddings = await embed_batch([content, *[t for _, t in plan.to_embed]]) + summary_embedding, *new_embeddings = embeddings + + if plan.reused: + await self.session.execute( + update(Chunk), + [{"id": cid, "position": pos} for cid, pos in plan.reused], + ) + if plan.to_delete: + await self.session.execute( + delete(Chunk).where(Chunk.id.in_(plan.to_delete)) + ) + self.session.add_all( + Chunk( + content=text, + embedding=emb, + position=pos, + document_id=document.id, + ) + for (pos, text), emb in zip(plan.to_embed, new_embeddings, strict=True) + ) + document.embedding = summary_embedding + + ot_metrics.record_chunk_reconcile( + reused=len(existing) - len(plan.to_delete), + embedded=len(plan.to_embed), + deleted=len(plan.to_delete), + ) + return len(new_texts) + async def _enqueue_ai_sort_if_enabled(self, document: Document) -> None: """Fire-and-forget: enqueue incremental AI sort if the search space has it enabled.""" try: From 5a71769dba8b678baba29bfef0a7fc0c35d7cdd4 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 18:53:08 +0200 Subject: [PATCH 50/52] fix(chunks): set position on remaining chunk insert paths document_converters, the github size-fallback chunker, revert_service restores, and the kb-persistence middleware now write explicit positions (the middleware read path also orders by position). --- .../middleware/kb_persistence/middleware.py | 26 +++++++++++++++---- .../app/services/revert_service.py | 22 ++++++++++++---- .../connector_indexers/github_indexer.py | 1 + .../app/utils/document_converters.py | 6 +++-- 4 files changed, 43 insertions(+), 12 deletions(-) diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py index ef86eaddd..a6c83a7d4 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py @@ -241,8 +241,15 @@ async def _create_document( chunk_embeddings = await asyncio.to_thread(embed_texts, chunks) session.add_all( [ - Chunk(document_id=doc.id, content=text, embedding=embedding) - for text, embedding in zip(chunks, chunk_embeddings, strict=True) + Chunk( + document_id=doc.id, + content=text, + embedding=embedding, + position=i, + ) + for i, (text, embedding) in enumerate( + zip(chunks, chunk_embeddings, strict=True) + ) ] ) return doc @@ -289,8 +296,15 @@ async def _update_document( chunk_embeddings = await asyncio.to_thread(embed_texts, chunks) session.add_all( [ - Chunk(document_id=document.id, content=text, embedding=embedding) - for text, embedding in zip(chunks, chunk_embeddings, strict=True) + Chunk( + document_id=document.id, + content=text, + embedding=embedding, + position=i, + ) + for i, (text, embedding) in enumerate( + zip(chunks, chunk_embeddings, strict=True) + ) ] ) return document @@ -475,7 +489,9 @@ async def _load_chunks_for_snapshot( session: AsyncSession, *, doc_id: int ) -> list[dict[str, str]]: rows = await session.execute( - select(Chunk.content).where(Chunk.document_id == doc_id).order_by(Chunk.id) + select(Chunk.content) + .where(Chunk.document_id == doc_id) + .order_by(Chunk.position, Chunk.id) ) return [{"content": row.content} for row in rows.all() if row.content is not None] diff --git a/surfsense_backend/app/services/revert_service.py b/surfsense_backend/app/services/revert_service.py index 6db5e2604..0cb6cd092 100644 --- a/surfsense_backend/app/services/revert_service.py +++ b/surfsense_backend/app/services/revert_service.py @@ -238,9 +238,14 @@ async def _restore_in_place_document( chunk_embeddings = await asyncio.to_thread(embed_texts, chunk_texts) session.add_all( [ - Chunk(document_id=doc.id, content=text, embedding=embedding) - for text, embedding in zip( - chunk_texts, chunk_embeddings, strict=True + Chunk( + document_id=doc.id, + content=text, + embedding=embedding, + position=i, + ) + for i, (text, embedding) in enumerate( + zip(chunk_texts, chunk_embeddings, strict=True) ) ] ) @@ -336,8 +341,15 @@ async def _reinsert_document_from_revision( chunk_embeddings = await asyncio.to_thread(embed_texts, chunk_texts) session.add_all( [ - Chunk(document_id=new_doc.id, content=text, embedding=embedding) - for text, embedding in zip(chunk_texts, chunk_embeddings, strict=True) + Chunk( + document_id=new_doc.id, + content=text, + embedding=embedding, + position=i, + ) + for i, (text, embedding) in enumerate( + zip(chunk_texts, chunk_embeddings, strict=True) + ) ] ) diff --git a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py index ce9b80e5e..557c2ce71 100644 --- a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py @@ -525,6 +525,7 @@ async def _simple_chunk_content(content: str, chunk_size: int = 4000) -> list: Chunk( content=chunk_text, embedding=embed_text(chunk_text), + position=len(chunks), ) ) diff --git a/surfsense_backend/app/utils/document_converters.py b/surfsense_backend/app/utils/document_converters.py index 694ae22ac..fef51d692 100644 --- a/surfsense_backend/app/utils/document_converters.py +++ b/surfsense_backend/app/utils/document_converters.py @@ -188,8 +188,10 @@ async def create_document_chunks(content: str) -> list[Chunk]: chunk_texts = [c.text for c in config.chunker_instance.chunk(content)] chunk_embeddings = await asyncio.to_thread(embed_texts, chunk_texts) return [ - Chunk(content=text, embedding=emb) - for text, emb in zip(chunk_texts, chunk_embeddings, strict=False) + Chunk(content=text, embedding=emb, position=i) + for i, (text, emb) in enumerate( + zip(chunk_texts, chunk_embeddings, strict=False) + ) ] From 052e9ef4d19b16e58bef8f7c22328f64b6f94fbc Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 18:53:21 +0200 Subject: [PATCH 51/52] refactor(chunks): order chunk reads by (document_id, position) Presentation and citation ordering moves off Chunk.id/created_at to the explicit position column (id kept as tiebreaker). Vector and ts_rank ranking order_by clauses are untouched. --- .../shared/middleware/filesystem/backends/kb_postgres.py | 4 ++-- .../shared/middleware/knowledge_search.py | 9 ++++++--- .../builtins/deliverables/tools/knowledge_base.py | 2 +- surfsense_backend/app/retriever/chunks_hybrid_search.py | 7 +++++-- .../app/retriever/documents_hybrid_search.py | 7 +++++-- surfsense_backend/app/routes/documents_routes.py | 8 ++++---- surfsense_backend/app/routes/editor_routes.py | 6 +++--- surfsense_backend/app/services/ai_file_sort_service.py | 2 +- surfsense_backend/app/services/export_service.py | 2 +- 9 files changed, 28 insertions(+), 19 deletions(-) diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py index 7b8aaf2b0..e13196537 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/filesystem/backends/kb_postgres.py @@ -508,7 +508,7 @@ class KBPostgresBackend(BackendProtocol): chunk_rows = await session.execute( select(Chunk.id, Chunk.content) .where(Chunk.document_id == document.id) - .order_by(Chunk.id) + .order_by(Chunk.position, Chunk.id) ) chunks = [ {"chunk_id": row.id, "content": row.content} for row in chunk_rows.all() @@ -725,7 +725,7 @@ class KBPostgresBackend(BackendProtocol): .join(Document, Document.id == Chunk.document_id) .where(Document.search_space_id == self.search_space_id) .where(Chunk.content.ilike(f"%{pattern}%")) - .order_by(Chunk.document_id, Chunk.id) + .order_by(Chunk.document_id, Chunk.position, Chunk.id) ) chunk_rows = await session.execute(sub) per_doc: dict[int, int] = {} diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/knowledge_search.py b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/knowledge_search.py index 681e80b0e..9ef601791 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/knowledge_search.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/shared/middleware/knowledge_search.py @@ -394,7 +394,10 @@ async def browse_recent_documents( Chunk.document_id, Chunk.content, func.row_number() - .over(partition_by=Chunk.document_id, order_by=Chunk.id) + .over( + partition_by=Chunk.document_id, + order_by=(Chunk.position, Chunk.id), + ) .label("rn"), ) .where(Chunk.document_id.in_(doc_ids)) @@ -404,7 +407,7 @@ async def browse_recent_documents( chunk_query = ( select(numbered.c.chunk_id, numbered.c.document_id, numbered.c.content) .where(numbered.c.rn <= _RECENCY_MAX_CHUNKS_PER_DOC) - .order_by(numbered.c.document_id, numbered.c.chunk_id) + .order_by(numbered.c.document_id, numbered.c.rn) ) chunk_result = await session.execute(chunk_query) fetched_chunks = chunk_result.all() @@ -531,7 +534,7 @@ async def fetch_mentioned_documents( chunk_result = await session.execute( select(Chunk.id, Chunk.content, Chunk.document_id) .where(Chunk.document_id.in_(list(docs.keys()))) - .order_by(Chunk.document_id, Chunk.id) + .order_by(Chunk.document_id, Chunk.position, Chunk.id) ) chunks_by_doc: dict[int, list[dict[str, Any]]] = {doc_id: [] for doc_id in docs} for row in chunk_result.all(): diff --git a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/knowledge_base.py b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/knowledge_base.py index e99e0291a..d89124990 100644 --- a/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/knowledge_base.py +++ b/surfsense_backend/app/agents/chat/multi_agent_chat/subagents/builtins/deliverables/tools/knowledge_base.py @@ -122,7 +122,7 @@ async def _browse_recent_documents( chunk_query = ( select(Chunk) .where(Chunk.document_id.in_(doc_ids)) - .order_by(Chunk.document_id, Chunk.id) + .order_by(Chunk.document_id, Chunk.position, Chunk.id) ) chunk_result = await session.execute(chunk_query) raw_chunks = chunk_result.scalars().all() diff --git a/surfsense_backend/app/retriever/chunks_hybrid_search.py b/surfsense_backend/app/retriever/chunks_hybrid_search.py index 47f7fe6b1..5e5edec2e 100644 --- a/surfsense_backend/app/retriever/chunks_hybrid_search.py +++ b/surfsense_backend/app/retriever/chunks_hybrid_search.py @@ -420,7 +420,10 @@ class ChucksHybridSearchRetriever: select( Chunk.id.label("chunk_id"), func.row_number() - .over(partition_by=Chunk.document_id, order_by=Chunk.id) + .over( + partition_by=Chunk.document_id, + order_by=(Chunk.position, Chunk.id), + ) .label("rn"), ) .where(Chunk.document_id.in_(doc_ids)) @@ -441,7 +444,7 @@ class ChucksHybridSearchRetriever: select(Chunk.id, Chunk.content, Chunk.document_id) .join(numbered, Chunk.id == numbered.c.chunk_id) .where(chunk_filter) - .order_by(Chunk.document_id, Chunk.id) + .order_by(Chunk.document_id, Chunk.position, Chunk.id) ) t_fetch = time.perf_counter() diff --git a/surfsense_backend/app/retriever/documents_hybrid_search.py b/surfsense_backend/app/retriever/documents_hybrid_search.py index 9ce86d404..d856e93cf 100644 --- a/surfsense_backend/app/retriever/documents_hybrid_search.py +++ b/surfsense_backend/app/retriever/documents_hybrid_search.py @@ -357,7 +357,10 @@ class DocumentHybridSearchRetriever: select( Chunk.id.label("chunk_id"), func.row_number() - .over(partition_by=Chunk.document_id, order_by=Chunk.id) + .over( + partition_by=Chunk.document_id, + order_by=(Chunk.position, Chunk.id), + ) .label("rn"), ) .where(Chunk.document_id.in_(doc_ids)) @@ -369,7 +372,7 @@ class DocumentHybridSearchRetriever: select(Chunk.id, Chunk.content, Chunk.document_id) .join(numbered, Chunk.id == numbered.c.chunk_id) .where(numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC) - .order_by(Chunk.document_id, Chunk.id) + .order_by(Chunk.document_id, Chunk.position, Chunk.id) ) t_fetch = time.perf_counter() diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index 865068fba..53f03a0ca 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -1014,8 +1014,8 @@ async def get_document_by_chunk_id( .filter( Chunk.document_id == document.id, or_( - Chunk.created_at < chunk.created_at, - and_(Chunk.created_at == chunk.created_at, Chunk.id < chunk.id), + Chunk.position < chunk.position, + and_(Chunk.position == chunk.position, Chunk.id < chunk.id), ), ) ) @@ -1027,7 +1027,7 @@ async def get_document_by_chunk_id( windowed_result = await session.execute( select(Chunk) .filter(Chunk.document_id == document.id) - .order_by(Chunk.created_at, Chunk.id) + .order_by(Chunk.position, Chunk.id) .offset(start) .limit(end - start) ) @@ -1137,7 +1137,7 @@ async def get_document_chunks_paginated( chunks_result = await session.execute( select(Chunk) .filter(Chunk.document_id == document_id) - .order_by(Chunk.created_at, Chunk.id) + .order_by(Chunk.position, Chunk.id) .offset(offset) .limit(page_size) ) diff --git a/surfsense_backend/app/routes/editor_routes.py b/surfsense_backend/app/routes/editor_routes.py index 166164c50..34828964a 100644 --- a/surfsense_backend/app/routes/editor_routes.py +++ b/surfsense_backend/app/routes/editor_routes.py @@ -119,7 +119,7 @@ async def get_editor_content( chunk_contents_result = await session.execute( select(Chunk.content) .filter(Chunk.document_id == document_id) - .order_by(Chunk.id) + .order_by(Chunk.position, Chunk.id) ) chunk_contents = chunk_contents_result.scalars().all() @@ -205,7 +205,7 @@ async def download_document_markdown( chunk_contents_result = await session.execute( select(Chunk.content) .filter(Chunk.document_id == document_id) - .order_by(Chunk.id) + .order_by(Chunk.position, Chunk.id) ) chunk_contents = chunk_contents_result.scalars().all() if chunk_contents: @@ -354,7 +354,7 @@ async def export_document( chunk_contents_result = await session.execute( select(Chunk.content) .filter(Chunk.document_id == document_id) - .order_by(Chunk.id) + .order_by(Chunk.position, Chunk.id) ) chunk_contents = chunk_contents_result.scalars().all() if chunk_contents: diff --git a/surfsense_backend/app/services/ai_file_sort_service.py b/surfsense_backend/app/services/ai_file_sort_service.py index 2f04131a6..1bf4d325e 100644 --- a/surfsense_backend/app/services/ai_file_sort_service.py +++ b/surfsense_backend/app/services/ai_file_sort_service.py @@ -156,7 +156,7 @@ async def _resolve_document_text( stmt = ( select(Chunk.content) .where(Chunk.document_id == document.id) - .order_by(Chunk.id) + .order_by(Chunk.position, Chunk.id) .limit(_MAX_CHUNKS_FOR_CONTEXT) ) result = await session.execute(stmt) diff --git a/surfsense_backend/app/services/export_service.py b/surfsense_backend/app/services/export_service.py index 97f952223..9e6869fe1 100644 --- a/surfsense_backend/app/services/export_service.py +++ b/surfsense_backend/app/services/export_service.py @@ -62,7 +62,7 @@ async def _get_document_markdown( chunk_result = await session.execute( select(Chunk.content) .filter(Chunk.document_id == document.id) - .order_by(Chunk.id) + .order_by(Chunk.position, Chunk.id) ) chunks = chunk_result.scalars().all() if chunks: From 311570b4f0e097914e816630b87c1e0597418b93 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 18:53:21 +0200 Subject: [PATCH 52/52] test(indexing): cover the edit path and make integration caches hermetic Real-DB tests assert unchanged chunk rows survive edits, only new text is embedded, removed rows are deleted with positions compacted, and the kill switch restores full-replace. An autouse fixture disables the ETL/embedding caches so a developer's .env can't leak cache hits into unrelated tests. --- .../tests/integration/conftest.py | 13 ++ .../indexing_pipeline/test_index_editions.py | 193 ++++++++++++++++++ 2 files changed, 206 insertions(+) create mode 100644 surfsense_backend/tests/integration/indexing_pipeline/test_index_editions.py diff --git a/surfsense_backend/tests/integration/conftest.py b/surfsense_backend/tests/integration/conftest.py index 8457047ec..6b8aa3cdb 100644 --- a/surfsense_backend/tests/integration/conftest.py +++ b/surfsense_backend/tests/integration/conftest.py @@ -123,6 +123,19 @@ async def db_search_space(db_session: AsyncSession, db_user: User) -> SearchSpac return space +@pytest.fixture(autouse=True) +def _derivation_caches_disabled(monkeypatch): + """Keep integration tests hermetic regardless of the developer's .env. + + With the embedding cache enabled, a successful index of some markdown makes + every later index of the same markdown a cache hit -- silently bypassing + patched ``embed_texts`` fakes/failure injections in unrelated tests. Cache + tests opt back in explicitly via ``monkeypatch.setattr``. + """ + monkeypatch.setattr(app_config, "ETL_CACHE_ENABLED", False) + monkeypatch.setattr(app_config, "EMBEDDING_CACHE_ENABLED", False) + + @pytest.fixture def patched_embed_texts(monkeypatch) -> MagicMock: mock = MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]) diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_index_editions.py b/surfsense_backend/tests/integration/indexing_pipeline/test_index_editions.py new file mode 100644 index 000000000..68d5ec0af --- /dev/null +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_index_editions.py @@ -0,0 +1,193 @@ +"""Edit path: re-indexing a document diffs chunks instead of replacing them. + +Unchanged paragraphs must keep their chunk rows (ids survive -> embeddings and +HNSW entries untouched), only new text is embedded, removed text is deleted, +and (position) keeps presentation order correct throughout. +""" + +import pytest +from sqlalchemy import select + +from app.db import Chunk, DocumentStatus +from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService + +pytestmark = pytest.mark.integration + +_V1 = "Intro paragraph.\n\nBody paragraph.\n\nOutro paragraph." + + +@pytest.fixture +def paragraph_chunker(monkeypatch): + """One chunk per markdown paragraph, so edits map to chunk-level diffs.""" + + def _split(markdown, **_kwargs): + return [p for p in markdown.split("\n\n") if p.strip()] + + monkeypatch.setattr( + "app.indexing_pipeline.cache.cached_indexing.chunk_text", _split + ) + monkeypatch.setattr( + "app.indexing_pipeline.cache.cached_indexing.chunk_text_hybrid", _split + ) + + +async def _index(service, connector_doc): + prepared = await service.prepare_for_indexing([connector_doc]) + document = prepared[0] + await service.index(document, connector_doc) + return document + + +async def _load_chunks(db_session, document_id): + result = await db_session.execute( + select(Chunk) + .where(Chunk.document_id == document_id) + .order_by(Chunk.position, Chunk.id) + ) + return result.scalars().all() + + +@pytest.mark.usefixtures("paragraph_chunker") +async def test_edit_keeps_unchanged_rows_and_embeds_only_the_new_text( + db_session, + db_search_space, + make_connector_document, + patched_embed_texts, +): + service = IndexingPipelineService(session=db_session) + doc_v1 = make_connector_document( + search_space_id=db_search_space.id, source_markdown=_V1 + ) + document = await _index(service, doc_v1) + + ids_v1 = {c.content: c.id for c in await _load_chunks(db_session, document.id)} + patched_embed_texts.reset_mock() + + edited = "Intro paragraph.\n\nBody paragraph EDITED.\n\nOutro paragraph." + doc_v2 = make_connector_document( + search_space_id=db_search_space.id, source_markdown=edited + ) + await _index(service, doc_v2) + + chunks = await _load_chunks(db_session, document.id) + by_content = {c.content: c for c in chunks} + + # Untouched paragraphs keep their rows (same ids => embeddings reused, + # no HNSW/GIN churn); the edited paragraph got a fresh row. + assert by_content["Intro paragraph."].id == ids_v1["Intro paragraph."] + assert by_content["Outro paragraph."].id == ids_v1["Outro paragraph."] + assert "Body paragraph." not in by_content + assert by_content["Body paragraph EDITED."].id not in ids_v1.values() + + # Exactly one embed call: the document summary plus only the edited text. + (embedded_texts,) = patched_embed_texts.call_args.args + assert embedded_texts == [edited, "Body paragraph EDITED."] + + assert [c.position for c in chunks] == [0, 1, 2] + assert [c.content for c in chunks] == [ + "Intro paragraph.", + "Body paragraph EDITED.", + "Outro paragraph.", + ] + + +@pytest.mark.usefixtures("paragraph_chunker", "patched_embed_texts") +async def test_head_insert_shifts_positions_without_new_rows_for_old_text( + db_session, + db_search_space, + make_connector_document, +): + service = IndexingPipelineService(session=db_session) + document = await _index( + service, + make_connector_document( + search_space_id=db_search_space.id, source_markdown=_V1 + ), + ) + ids_v1 = {c.content: c.id for c in await _load_chunks(db_session, document.id)} + + await _index( + service, + make_connector_document( + search_space_id=db_search_space.id, + source_markdown="Brand new opener.\n\n" + _V1, + ), + ) + + chunks = await _load_chunks(db_session, document.id) + assert [c.content for c in chunks] == [ + "Brand new opener.", + "Intro paragraph.", + "Body paragraph.", + "Outro paragraph.", + ] + assert [c.position for c in chunks] == [0, 1, 2, 3] + # The three original rows survived the shift. + surviving = {c.content: c.id for c in chunks if c.content in ids_v1} + assert surviving == ids_v1 + + +@pytest.mark.usefixtures("paragraph_chunker", "patched_embed_texts") +async def test_removed_paragraph_is_deleted_and_order_compacts( + db_session, + db_search_space, + make_connector_document, +): + service = IndexingPipelineService(session=db_session) + document = await _index( + service, + make_connector_document( + search_space_id=db_search_space.id, source_markdown=_V1 + ), + ) + ids_v1 = {c.content: c.id for c in await _load_chunks(db_session, document.id)} + + await _index( + service, + make_connector_document( + search_space_id=db_search_space.id, + source_markdown="Intro paragraph.\n\nOutro paragraph.", + ), + ) + + chunks = await _load_chunks(db_session, document.id) + assert [(c.content, c.position) for c in chunks] == [ + ("Intro paragraph.", 0), + ("Outro paragraph.", 1), + ] + assert chunks[0].id == ids_v1["Intro paragraph."] + assert chunks[1].id == ids_v1["Outro paragraph."] + + +@pytest.mark.usefixtures("paragraph_chunker", "patched_embed_texts") +async def test_kill_switch_falls_back_to_full_replace( + db_session, + db_search_space, + make_connector_document, + monkeypatch, +): + from app.config import config + + service = IndexingPipelineService(session=db_session) + document = await _index( + service, + make_connector_document( + search_space_id=db_search_space.id, source_markdown=_V1 + ), + ) + ids_v1 = {c.id for c in await _load_chunks(db_session, document.id)} + + monkeypatch.setattr(config, "CHUNK_RECONCILE_ENABLED", False) + await _index( + service, + make_connector_document( + search_space_id=db_search_space.id, + source_markdown=_V1 + "\n\nAppended paragraph.", + ), + ) + + chunks = await _load_chunks(db_session, document.id) + # Legacy behavior: every row is recreated, even unchanged paragraphs. + assert {c.id for c in chunks}.isdisjoint(ids_v1) + assert [c.position for c in chunks] == [0, 1, 2, 3] + assert DocumentStatus.is_state(document.status, DocumentStatus.READY)