feat(index-cache): add embedding set value objects

2026-06-12 20:45:20 +02:00 · 2026-06-12 16:48:01 +02:00 · 2026-06-12 16:48:01 +02:00 · cf208365b4
commit cf208365b4
parent 0fb1d3d37b
3 changed files with 68 additions and 0 deletions
--- a/surfsense_backend/app/indexing_pipeline/cache/schemas/__init__.py
+++ b/surfsense_backend/app/indexing_pipeline/cache/schemas/__init__.py
@@ -0,0 +1,12 @@
+"""Pure value objects for the index cache."""
+
+from __future__ import annotations
+
+from .embedding_key import EmbeddingKey
+from .embedding_set import CachedChunk, EmbeddingSet
+
+__all__ = [
+    "CachedChunk",
+    "EmbeddingKey",
+    "EmbeddingSet",
+]
--- a/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_key.py
+++ b/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_key.py
@@ -0,0 +1,27 @@
+"""Identity of a cacheable embedding set: equal keys yield identical vectors.
+
+Embeddings depend on the markdown text, the embedding model, and the chunker --
+never on how the markdown was produced. So the key is the markdown's own hash
+plus the model and chunker recipe, not the upstream parse identity.
+"""
+
+from __future__ import annotations
+
+import hashlib
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True, slots=True)
+class EmbeddingKey:
+    """Immutable identity of one cacheable embedding set.
+
+    Carries the markdown's SHA-256, the embedding model name and dimension,
+    and the chunker kind and version.
+    """
+
+    markdown_sha256: str
+    embedding_model: str
+    embedding_dim: int
+    chunker_kind: str
+    chunker_version: int
+
+    @property
+    def object_suffix(self) -> str:
+        """Return the blob-name suffix built from the model and chunker fields.
+
+        The model name is fingerprinted (first 16 hex characters of its
+        SHA-256) so distinct models never share a blob, while the markdown
+        hash (the object's folder) stays human-readable.
+        """
+        # NOTE(review): ``embedding_dim`` is not encoded in the suffix --
+        # presumably it is checked elsewhere; confirm against the store.
+        model_digest = hashlib.sha256(self.embedding_model.encode("utf-8")).hexdigest()
+        return "{}.{}.v{}.emb".format(
+            model_digest[:16],
+            self.chunker_kind,
+            self.chunker_version,
+        )
--- a/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_set.py
+++ b/surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_set.py
@@ -0,0 +1,29 @@
+"""The cached payload: a document's chunk texts paired with their vectors."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import numpy as np
+
+
+@dataclass(frozen=True, slots=True)
+class CachedChunk:
+    """One chunk's text paired with its embedding vector.
+
+    Equality is value-based: two chunks are equal when their texts match and
+    their embeddings have the same shape and the same element values.
+    """
+
+    text: str
+    embedding: np.ndarray
+
+    def __eq__(self, other: object) -> bool:
+        """Compare by text and by element-wise equality of the vectors."""
+        # The dataclass-generated __eq__ compares fields with plain ``==``,
+        # which on multi-element arrays does not produce a single bool, so
+        # the vectors are compared explicitly here.
+        if other.__class__ is not self.__class__:
+            return NotImplemented
+        return self.text == other.text and bool(
+            np.array_equal(self.embedding, other.embedding)
+        )
+
+
+@dataclass(frozen=True, slots=True)
+class EmbeddingSet:
+    """Everything the indexer needs to rebuild a document's chunks without embedding.
+
+    ``summary_embedding`` is the document-level vector; ``chunks`` are the ordered
+    chunk texts and their vectors.
+
+    Equality is value-based and order-sensitive: two sets are equal when their
+    summary vectors match element-wise and their chunks match pairwise by text
+    and by vector.
+    """
+
+    summary_embedding: np.ndarray
+    chunks: list[CachedChunk]
+
+    @property
+    def chunk_count(self) -> int:
+        """Return the number of cached chunks."""
+        return len(self.chunks)
+
+    def __eq__(self, other: object) -> bool:
+        """Compare the summary vector and every chunk by value."""
+        # The dataclass-generated __eq__ compares fields with plain ``==``,
+        # which on multi-element arrays does not produce a single bool, so
+        # every vector is compared explicitly here.
+        if other.__class__ is not self.__class__:
+            return NotImplemented
+        if len(self.chunks) != len(other.chunks):
+            return False
+        if not np.array_equal(self.summary_embedding, other.summary_embedding):
+            return False
+        return all(
+            mine.text == theirs.text
+            and np.array_equal(mine.embedding, theirs.embedding)
+            for mine, theirs in zip(self.chunks, other.chunks)
+        )