mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-30 20:39:43 +02:00
fix(filesystem): guard summary index dimensions
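Raise a RuntimeError when the existing summary projection index's dimension does not match the configured embedding dimensions, or when that dimension cannot be read, instead of resetting the existing index.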
This commit is contained in:
parent
2297453103
commit
cb9db0bab9
2 changed files with 61 additions and 9 deletions
|
|
@ -110,15 +110,20 @@ class SummaryProjectionIndexer:
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
try:
|
try:
|
||||||
if self.index.dimension() != self.embedding_dimensions:
|
existing_dimension = self.index.dimension()
|
||||||
self.index.reset(
|
except Exception as exc:
|
||||||
dimension=self.embedding_dimensions,
|
raise RuntimeError(
|
||||||
metadata=self._index_metadata(),
|
"could not validate existing summary projection index config; "
|
||||||
)
|
f"refusing to reset {self.index.db_path}. Move the existing index "
|
||||||
except Exception:
|
"aside or rebuild it intentionally before changing embedding config."
|
||||||
self.index.reset(
|
) from exc
|
||||||
dimension=self.embedding_dimensions,
|
if existing_dimension != self.embedding_dimensions:
|
||||||
metadata=self._index_metadata(),
|
raise RuntimeError(
|
||||||
|
"summary projection index dimension mismatch: "
|
||||||
|
f"{self.index.db_path} was built with dimension {existing_dimension}, "
|
||||||
|
f"but configured embedding_dimensions is {self.embedding_dimensions}. "
|
||||||
|
"Use the matching embedding config, or rebuild the projection index "
|
||||||
|
"at a new path after preserving the existing data."
|
||||||
)
|
)
|
||||||
|
|
||||||
def _index_metadata(self) -> dict[str, Any]:
|
def _index_metadata(self) -> dict[str, Any]:
|
||||||
|
|
|
||||||
|
|
@ -91,6 +91,53 @@ def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
|
||||||
assert hits[0].metadata["department"] == "ops"
|
assert hits[0].metadata["department"] == "ops"
|
||||||
|
|
||||||
|
|
||||||
|
def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path):
|
||||||
|
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
||||||
|
|
||||||
|
class FakeEmbedder:
|
||||||
|
def embed(self, texts):
|
||||||
|
return [[1.0, 0.0, 0.0, 0.0] for _ in texts]
|
||||||
|
|
||||||
|
index_dir = tmp_path / "projection"
|
||||||
|
index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
|
||||||
|
index.reset(
|
||||||
|
dimension=3,
|
||||||
|
metadata={
|
||||||
|
"channel": "summary",
|
||||||
|
"embedding_provider": "test",
|
||||||
|
"embedding_model": "fake",
|
||||||
|
"embedding_dimensions": 3,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
index.upsert_many(
|
||||||
|
[
|
||||||
|
SemanticIndexRecord(
|
||||||
|
file_ref="file_a",
|
||||||
|
external_id="doc_a",
|
||||||
|
source_type="documents",
|
||||||
|
source_path="docs/a.pdf",
|
||||||
|
title="A",
|
||||||
|
text="summary",
|
||||||
|
vector=[1.0, 0.0, 0.0],
|
||||||
|
)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
with pytest.raises(RuntimeError, match="summary projection index dimension mismatch"):
|
||||||
|
SummaryProjectionIndexer(
|
||||||
|
index_dir,
|
||||||
|
embedder=FakeEmbedder(),
|
||||||
|
embedding_provider="test",
|
||||||
|
embedding_model="fake",
|
||||||
|
embedding_dimensions=4,
|
||||||
|
)
|
||||||
|
|
||||||
|
preserved = SQLiteVecSemanticIndex(index.db_path)
|
||||||
|
assert preserved.info()["dimension"] == 3
|
||||||
|
assert preserved.info()["document_count"] == 1
|
||||||
|
assert preserved.search([1.0, 0.0, 0.0], limit=1)[0].external_id == "doc_a"
|
||||||
|
|
||||||
|
|
||||||
def test_hash_embedding_provider_is_not_available():
|
def test_hash_embedding_provider_is_not_available():
|
||||||
from pageindex.filesystem.hybrid_projection import make_embedder
|
from pageindex.filesystem.hybrid_projection import make_embedder
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue