From cf208365b471941b967d4f2514edc54bad20213c Mon Sep 17 00:00:00 2001
From: CREDO23
Date: Fri, 12 Jun 2026 16:48:01 +0200
Subject: [PATCH] feat(index-cache): add embedding set value objects

---
Review note: the payload below was hand-edited during review (docstrings added,
array-aware __eq__ on CachedChunk and EmbeddingSet). Hunk sizes and the diffstat
were updated to match; the blob ids on the "index" lines are still those of the
original commit.

 .../cache/schemas/__init__.py      | 12 ++++
 .../cache/schemas/embedding_key.py | 37 ++++++++++++
 .../cache/schemas/embedding_set.py | 56 +++++++++++++++++++
 3 files changed, 105 insertions(+)
 create mode 100644 surfsense_backend/app/indexing_pipeline/cache/schemas/__init__.py
 create mode 100644 surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_key.py
 create mode 100644 surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_set.py

diff --git a/surfsense_backend/app/indexing_pipeline/cache/schemas/__init__.py b/surfsense_backend/app/indexing_pipeline/cache/schemas/__init__.py
new file mode 100644
index 000000000..8714e2d86
--- /dev/null
+++ b/surfsense_backend/app/indexing_pipeline/cache/schemas/__init__.py
@@ -0,0 +1,12 @@
+"""Pure value objects for the index cache."""
+
+from __future__ import annotations
+
+from .embedding_key import EmbeddingKey
+from .embedding_set import CachedChunk, EmbeddingSet
+
+__all__ = [
+    "CachedChunk",
+    "EmbeddingKey",
+    "EmbeddingSet",
+]
diff --git a/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_key.py b/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_key.py
new file mode 100644
index 000000000..55d891e73
--- /dev/null
+++ b/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_key.py
@@ -0,0 +1,37 @@
+"""Identity of a cacheable embedding set: equal keys yield identical vectors.
+
+Embeddings depend on the markdown text, the embedding model, and the chunker --
+never on how the markdown was produced. So the key is the markdown's own hash
+plus the model and chunker recipe, not the upstream parse identity.
+"""
+
+from __future__ import annotations
+
+import hashlib
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True, slots=True)
+class EmbeddingKey:
+    """Fields that together identify one cacheable embedding set."""
+
+    markdown_sha256: str
+    embedding_model: str
+    embedding_dim: int
+    chunker_kind: str
+    chunker_version: int
+
+    @property
+    def object_suffix(self) -> str:
+        """Return ``{fingerprint}.{chunker_kind}.v{chunker_version}.emb``.
+
+        The fingerprint is the first 16 hex digits of the SHA-256 of the UTF-8
+        encoded ``embedding_model`` name.
+        """
+        # Fingerprint the model so distinct models never share a blob, while the
+        # markdown hash (the object's folder) stays human-readable.
+        # NOTE(review): embedding_dim is part of the key but not of this suffix,
+        # so keys that differ only in dimension produce the same suffix --
+        # confirm that is intended.
+        fingerprint = hashlib.sha256(self.embedding_model.encode("utf-8")).hexdigest()
+        return f"{fingerprint[:16]}.{self.chunker_kind}.v{self.chunker_version}.emb"
diff --git a/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_set.py b/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_set.py
new file mode 100644
index 000000000..68c3a5211
--- /dev/null
+++ b/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_set.py
@@ -0,0 +1,56 @@
+"""The cached payload: a document's chunk texts paired with their vectors."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import numpy as np
+
+
+@dataclass(frozen=True, slots=True, eq=False)
+class CachedChunk:
+    """One chunk's text paired with its embedding vector.
+
+    Equality is written by hand because the dataclass-generated ``__eq__``
+    compares fields with ``==``, which is elementwise for arrays and so has no
+    single truth value. Defining ``__eq__`` leaves instances unhashable.
+    """
+
+    text: str
+    embedding: np.ndarray
+
+    def __eq__(self, other: object) -> bool:
+        """Return True when both the texts and the vectors are equal."""
+        if not isinstance(other, CachedChunk):
+            return NotImplemented
+        return self.text == other.text and np.array_equal(
+            self.embedding, other.embedding
+        )
+
+
+@dataclass(frozen=True, slots=True, eq=False)
+class EmbeddingSet:
+    """Everything the indexer needs to rebuild a document's chunks without embedding.
+
+    ``summary_embedding`` is the document-level vector; ``chunks`` are the ordered
+    chunk texts and their vectors.
+
+    Equality compares array contents rather than using ``==`` on the arrays;
+    defining ``__eq__`` leaves instances unhashable.
+    """
+
+    summary_embedding: np.ndarray
+    chunks: list[CachedChunk]
+
+    def __eq__(self, other: object) -> bool:
+        """Return True when the summary vectors and the chunk lists are equal."""
+        if not isinstance(other, EmbeddingSet):
+            return NotImplemented
+        return (
+            np.array_equal(self.summary_embedding, other.summary_embedding)
+            and self.chunks == other.chunks
+        )
+
+    @property
+    def chunk_count(self) -> int:
+        """Number of cached chunks."""
+        return len(self.chunks)