feat(index-cache): add embedding blob store sharing the cache backend

This commit is contained in:
CREDO23 2026-06-12 16:48:01 +02:00
parent f541114544
commit ad6da7c6af
3 changed files with 60 additions and 0 deletions

View file

@ -0,0 +1,9 @@
"""Blob storage for cached embedding sets."""
from __future__ import annotations
from .embedding_store import EmbeddingCacheStore
# Public API of this package: only the store class imported above is re-exported.
__all__ = [
"EmbeddingCacheStore",
]

View file

@ -0,0 +1,39 @@
"""Read and write cached embedding blobs through the shared cache backend.
The blob backend is shared with the ETL parse cache (same bucket / root), so
markdown and its embeddings live side by side; only the object prefix differs.
"""
from __future__ import annotations
from app.etl_pipeline.cache.storage.backend import resolve_cache_backend
from app.indexing_pipeline.cache.serialization import deserialize, serialize
from app.indexing_pipeline.cache.schemas import EmbeddingKey, EmbeddingSet
from app.indexing_pipeline.cache.storage.object_keys import build_embedding_object_key
# Content type passed to the backend with every blob written by
# EmbeddingCacheStore.save.
_EMBEDDING_CONTENT_TYPE = "application/octet-stream"
class EmbeddingCacheStore:
    """Store for cached embedding sets on top of the shared cache backend.

    The backend is obtained once, at construction time, from
    ``resolve_cache_backend()``; every operation delegates to it.
    """

    def __init__(self) -> None:
        self._backend = resolve_cache_backend()

    @property
    def backend_name(self) -> str:
        """Name reported by the underlying cache backend."""
        return self._backend.backend_name

    async def save(
        self, key: EmbeddingKey, embedding_set: EmbeddingSet
    ) -> tuple[str, int]:
        """Serialize ``embedding_set`` and write it to the backend.

        Returns:
            A ``(storage_key, size)`` pair: the object key derived from
            ``key`` and the length in bytes of the serialized payload.
        """
        payload = serialize(embedding_set)
        object_key = build_embedding_object_key(key)
        await self._backend.put(
            object_key,
            payload,
            content_type=_EMBEDDING_CONTENT_TYPE,
        )
        return object_key, len(payload)

    async def load(self, storage_key: str) -> EmbeddingSet:
        """Read the object stored at ``storage_key`` and deserialize it.

        The backend stream is drained completely before deserializing.
        """
        stream = self._backend.open_stream(storage_key)
        pieces = [piece async for piece in stream]
        payload = b"".join(pieces)
        return deserialize(payload)

    async def delete(self, storage_key: str) -> None:
        """Ask the backend to delete the object stored at ``storage_key``."""
        await self._backend.delete(storage_key)

View file

@ -0,0 +1,12 @@
"""Object keys for cached embedding sets, namespaced under a dedicated prefix."""
from __future__ import annotations
from app.indexing_pipeline.cache.schemas import EmbeddingKey
# Top-level prefix under which every embedding object key is placed.
CACHE_PREFIX = "index_cache"


def build_embedding_object_key(key: EmbeddingKey) -> str:
    """Return the object key for the embedding set identified by ``key``.

    The key is content-addressed: identical markdown + recipe always map to
    the same key, laid out as ``<CACHE_PREFIX>/<markdown_sha256>/<object_suffix>``.
    """
    digest = key.markdown_sha256
    suffix = key.object_suffix
    return f"{CACHE_PREFIX}/{digest}/{suffix}"