SurfSense/surfsense_backend/app/indexing_pipeline/cache/persistence/models.py
CREDO23 91d947ff79 refactor(embedding-cache): rename index cache to embedding cache
The cached payload is the indexing pipeline's embeddings (markdown is
chunked then embedded), so "embedding cache" names the expensive output
directly and removes the "index" ambiguity (DB index vs vector index vs
indexing phase). Renames the service, settings, eligibility, eviction
task, metrics, config flags (INDEX_CACHE_* -> EMBEDDING_CACHE_*), object
prefix, and the table (index_cache_embedding_sets -> embedding_cache_sets)
with its constraint and indexes. Migration 161 renamed accordingly.
2026-06-12 17:00:01 +02:00

47 lines
1.5 KiB
Python

"""``embedding_cache_sets``: one reusable chunk+embedding set per markdown."""
from __future__ import annotations
from sqlalchemy import (
BigInteger,
Column,
DateTime,
Index,
Integer,
String,
UniqueConstraint,
)
from app.db import BaseModel, TimestampMixin
class CachedEmbeddingSet(BaseModel, TimestampMixin):
    """Row of ``embedding_cache_sets``: metadata for one cached embedding set.

    The embedding blob is not stored in the row; the row records where the
    blob lives (``storage_backend`` / ``storage_key``), how large it is, and
    the usage counters that eviction relies on.

    Uniqueness is enforced over ``markdown_sha256``, ``embedding_model``,
    ``chunker_kind`` and ``chunker_version``.
    """

    __tablename__ = "embedding_cache_sets"

    # -- Cache key -----------------------------------------------------------
    # Hash of the markdown text plus the recipe that produced the vectors.
    # NOTE(review): ``embedding_dim`` is recorded here but is not part of the
    # unique constraint below -- presumably it is implied by
    # ``embedding_model``; confirm before relying on it as a key component.
    markdown_sha256 = Column(String(length=64), nullable=False)
    embedding_model = Column(String(length=255), nullable=False)
    embedding_dim = Column(Integer, nullable=False)
    chunker_kind = Column(String(length=8), nullable=False)
    chunker_version = Column(Integer, nullable=False)

    # -- Blob location -------------------------------------------------------
    # The payload is kept outside the table so that rows stay small.
    storage_backend = Column(String(length=32), nullable=False)
    storage_key = Column(String, nullable=False)
    size_bytes = Column(BigInteger, nullable=False)
    # Both a Python-side and a server-side default of zero are declared.
    chunk_count = Column(
        Integer,
        server_default="0",
        default=0,
        nullable=False,
    )

    # -- Eviction inputs (popularity + recency) ------------------------------
    times_reused = Column(
        BigInteger,
        server_default="0",
        default=0,
        nullable=False,
    )
    # No default is declared: a value must be supplied for every row.
    last_used_at = Column(DateTime(timezone=True), nullable=False)

    __table_args__ = (
        # One row per (markdown hash, model, chunker kind, chunker version).
        UniqueConstraint(
            "markdown_sha256",
            "embedding_model",
            "chunker_kind",
            "chunker_version",
            name="uq_embedding_cache_sets_key",
        ),
        # Index on the recency column.
        Index("ix_embedding_cache_sets_last_used_at", "last_used_at"),
    )