feat(index-cache): add embedding set value objects

This commit is contained in:
CREDO23 2026-06-12 16:48:01 +02:00
parent 0fb1d3d37b
commit cf208365b4
3 changed files with 68 additions and 0 deletions

View file

@ -0,0 +1,12 @@
"""Pure value objects for the index cache."""
from __future__ import annotations
from .embedding_key import EmbeddingKey
from .embedding_set import CachedChunk, EmbeddingSet
# Public API of this package: the value objects re-exported from the
# ``embedding_key`` and ``embedding_set`` sibling modules.
__all__ = [
"CachedChunk",
"EmbeddingKey",
"EmbeddingSet",
]

View file

@ -0,0 +1,27 @@
"""Identity of a cacheable embedding set: equal keys yield identical vectors.
Embeddings depend on the markdown text, the embedding model, and the chunker --
never on how the markdown was produced. So the key is the markdown's own hash
plus the model and chunker recipe, not the upstream parse identity.
"""
from __future__ import annotations
import hashlib
from dataclasses import dataclass
@dataclass(frozen=True, slots=True)
class EmbeddingKey:
markdown_sha256: str
embedding_model: str
embedding_dim: int
chunker_kind: str
chunker_version: int
@property
def object_suffix(self) -> str:
# Fingerprint the model so distinct models never share a blob, while the
# markdown hash (the object's folder) stays human-readable.
fingerprint = hashlib.sha256(self.embedding_model.encode("utf-8")).hexdigest()
return f"{fingerprint[:16]}.{self.chunker_kind}.v{self.chunker_version}.emb"

View file

@ -0,0 +1,29 @@
"""The cached payload: a document's chunk texts paired with their vectors."""
from __future__ import annotations
from dataclasses import dataclass
import numpy as np
@dataclass(frozen=True, slots=True)
class CachedChunk:
text: str
embedding: np.ndarray
@dataclass(frozen=True, slots=True)
class EmbeddingSet:
"""Everything the indexer needs to rebuild a document's chunks without embedding.
``summary_embedding`` is the document-level vector; ``chunks`` are the ordered
chunk texts and their vectors.
"""
summary_embedding: np.ndarray
chunks: list[CachedChunk]
@property
def chunk_count(self) -> int:
return len(self.chunks)