mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-12 20:45:20 +02:00
feat(index-cache): add embedding set value objects
This commit is contained in:
parent
0fb1d3d37b
commit
cf208365b4
3 changed files with 68 additions and 0 deletions
12
surfsense_backend/app/indexing_pipeline/cache/schemas/__init__.py
vendored
Normal file
12
surfsense_backend/app/indexing_pipeline/cache/schemas/__init__.py
vendored
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
"""Pure value objects for the index cache."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from .embedding_key import EmbeddingKey
|
||||
from .embedding_set import CachedChunk, EmbeddingSet
|
||||
|
||||
__all__ = [
|
||||
"CachedChunk",
|
||||
"EmbeddingKey",
|
||||
"EmbeddingSet",
|
||||
]
|
||||
27
surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_key.py
vendored
Normal file
27
surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_key.py
vendored
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
"""Identity of a cacheable embedding set: equal keys yield identical vectors.
|
||||
|
||||
Embeddings depend on the markdown text, the embedding model, and the chunker --
|
||||
never on how the markdown was produced. So the key is the markdown's own hash
|
||||
plus the model and chunker recipe, not the upstream parse identity.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class EmbeddingKey:
|
||||
markdown_sha256: str
|
||||
embedding_model: str
|
||||
embedding_dim: int
|
||||
chunker_kind: str
|
||||
chunker_version: int
|
||||
|
||||
@property
|
||||
def object_suffix(self) -> str:
|
||||
# Fingerprint the model so distinct models never share a blob, while the
|
||||
# markdown hash (the object's folder) stays human-readable.
|
||||
fingerprint = hashlib.sha256(self.embedding_model.encode("utf-8")).hexdigest()
|
||||
return f"{fingerprint[:16]}.{self.chunker_kind}.v{self.chunker_version}.emb"
|
||||
29
surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_set.py
vendored
Normal file
29
surfsense_backend/app/indexing_pipeline/cache/schemas/embedding_set.py
vendored
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
"""The cached payload: a document's chunk texts paired with their vectors."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class CachedChunk:
|
||||
text: str
|
||||
embedding: np.ndarray
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class EmbeddingSet:
|
||||
"""Everything the indexer needs to rebuild a document's chunks without embedding.
|
||||
|
||||
``summary_embedding`` is the document-level vector; ``chunks`` are the ordered
|
||||
chunk texts and their vectors.
|
||||
"""
|
||||
|
||||
summary_embedding: np.ndarray
|
||||
chunks: list[CachedChunk]
|
||||
|
||||
@property
|
||||
def chunk_count(self) -> int:
|
||||
return len(self.chunks)
|
||||
Loading…
Add table
Add a link
Reference in a new issue