From cf208365b471941b967d4f2514edc54bad20213c Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 12 Jun 2026 16:48:01 +0200
Subject: [PATCH] feat(index-cache): add embedding set value objects

---
 .../cache/schemas/__init__.py                 | 12 ++++++++
 .../cache/schemas/embedding_key.py            | 27 +++++++++++++++++
 .../cache/schemas/embedding_set.py            | 29 +++++++++++++++++++
 3 files changed, 68 insertions(+)
 create mode 100644 surfsense_backend/app/indexing_pipeline/cache/schemas/__init__.py
 create mode 100644 surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_key.py
 create mode 100644 surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_set.py

diff --git a/surfsense_backend/app/indexing_pipeline/cache/schemas/__init__.py b/surfsense_backend/app/indexing_pipeline/cache/schemas/__init__.py
new file mode 100644
index 000000000..8714e2d86
--- /dev/null
+++ b/surfsense_backend/app/indexing_pipeline/cache/schemas/__init__.py
@@ -0,0 +1,12 @@
+"""Pure value objects for the index cache."""
+
+from __future__ import annotations
+
+from .embedding_key import EmbeddingKey
+from .embedding_set import CachedChunk, EmbeddingSet
+
+__all__ = [
+    "CachedChunk",
+    "EmbeddingKey",
+    "EmbeddingSet",
+]
diff --git a/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_key.py b/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_key.py
new file mode 100644
index 000000000..55d891e73
--- /dev/null
+++ b/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_key.py
@@ -0,0 +1,43 @@
+"""Identity of a cacheable embedding set: equal keys yield identical vectors.
+
+Embeddings depend on the markdown text, the embedding model, and the chunker --
+never on how the markdown was produced. So the key is the markdown's own hash
+plus the model and chunker recipe, not the upstream parse identity.
+"""
+
+from __future__ import annotations
+
+import hashlib
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True, slots=True)
+class EmbeddingKey:
+    """Immutable identity of one cached embedding set.
+
+    Two keys are equal only when all five fields match. The markdown hash is
+    left out of ``object_suffix``: it names the object's folder, and the
+    suffix tells apart the blobs stored for that one markdown.
+    """
+
+    markdown_sha256: str
+    embedding_model: str
+    embedding_dim: int
+    chunker_kind: str
+    chunker_version: int
+
+    @property
+    def object_suffix(self) -> str:
+        """Return the blob name suffix derived from model, dimension and chunker.
+
+        The form is ``{fingerprint}.{chunker_kind}.v{chunker_version}.emb``,
+        where ``fingerprint`` is the first 16 hex digits of the SHA-256 of
+        the embedding model name joined with the embedding dimension.
+        """
+        # Fingerprint the model together with its dimension so that neither
+        # distinct models nor one model used at two dimensions ever share a
+        # blob, while the markdown hash (the object's folder) stays
+        # human-readable. The NUL separator keeps the two parts unambiguous.
+        identity = f"{self.embedding_model}\x00{self.embedding_dim}"
+        fingerprint = hashlib.sha256(identity.encode("utf-8")).hexdigest()
+        return f"{fingerprint[:16]}.{self.chunker_kind}.v{self.chunker_version}.emb"
diff --git a/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_set.py b/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_set.py
new file mode 100644
index 000000000..68c3a5211
--- /dev/null
+++ b/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_set.py
@@ -0,0 +1,64 @@
+"""The cached payload: a document's chunk texts paired with their vectors."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import numpy as np
+
+
+@dataclass(frozen=True, slots=True, eq=False)
+class CachedChunk:
+    """One chunk's text paired with its embedding vector.
+
+    Instances compare by value: equal ``text`` and element-wise equal
+    ``embedding``. The dataclass-generated ``__eq__`` is turned off because it
+    takes the truth value of an array comparison, which NumPy rejects for
+    arrays with more than one element.
+    """
+
+    text: str
+    embedding: np.ndarray
+
+    # Arrays are unhashable, so instances are explicitly unhashable as well.
+    __hash__ = None
+
+    def __eq__(self, other: object) -> bool:
+        """Return whether ``other`` is a chunk with equal text and vector."""
+        if not isinstance(other, CachedChunk):
+            return NotImplemented
+        return self.text == other.text and np.array_equal(
+            self.embedding, other.embedding
+        )
+
+
+@dataclass(frozen=True, slots=True, eq=False)
+class EmbeddingSet:
+    """Everything the indexer needs to rebuild a document's chunks without embedding.
+
+    ``summary_embedding`` is the document-level vector; ``chunks`` are the ordered
+    chunk texts and their vectors.
+
+    Instances compare by value: equal ``chunks`` lists and element-wise equal
+    ``summary_embedding``. ``frozen`` only blocks rebinding the attributes; the
+    list and the arrays themselves remain mutable.
+    """
+
+    summary_embedding: np.ndarray
+    chunks: list[CachedChunk]
+
+    # Holds arrays and a list, neither hashable, so instances are unhashable.
+    __hash__ = None
+
+    def __eq__(self, other: object) -> bool:
+        """Return whether ``other`` holds an equal summary vector and chunks."""
+        if not isinstance(other, EmbeddingSet):
+            return NotImplemented
+        return self.chunks == other.chunks and np.array_equal(
+            self.summary_embedding, other.summary_embedding
+        )
+
+    @property
+    def chunk_count(self) -> int:
+        """Number of chunks in the set."""
+        return len(self.chunks)