mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-12 19:55:17 +02:00
fix(filesystem): guard summary index dimensions
Raise on summary projection dimension mismatch instead of resetting an existing index.
This commit is contained in:
parent
2297453103
commit
cb9db0bab9
2 changed files with 61 additions and 9 deletions
|
|
@@ -110,15 +110,20 @@ class SummaryProjectionIndexer:
|
|||
)
|
||||
return
|
||||
try:
|
||||
if self.index.dimension() != self.embedding_dimensions:
|
||||
self.index.reset(
|
||||
dimension=self.embedding_dimensions,
|
||||
metadata=self._index_metadata(),
|
||||
)
|
||||
except Exception:
|
||||
self.index.reset(
|
||||
dimension=self.embedding_dimensions,
|
||||
metadata=self._index_metadata(),
|
||||
existing_dimension = self.index.dimension()
|
||||
except Exception as exc:
|
||||
raise RuntimeError(
|
||||
"could not validate existing summary projection index config; "
|
||||
f"refusing to reset {self.index.db_path}. Move the existing index "
|
||||
"aside or rebuild it intentionally before changing embedding config."
|
||||
) from exc
|
||||
if existing_dimension != self.embedding_dimensions:
|
||||
raise RuntimeError(
|
||||
"summary projection index dimension mismatch: "
|
||||
f"{self.index.db_path} was built with dimension {existing_dimension}, "
|
||||
f"but configured embedding_dimensions is {self.embedding_dimensions}. "
|
||||
"Use the matching embedding config, or rebuild the projection index "
|
||||
"at a new path after preserving the existing data."
|
||||
)
|
||||
|
||||
def _index_metadata(self) -> dict[str, Any]:
|
||||
|
|
|
|||
|
|
@@ -91,6 +91,53 @@ def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
|
|||
assert hits[0].metadata["department"] == "ops"
|
||||
|
||||
|
||||
def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path):
|
||||
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
||||
|
||||
class FakeEmbedder:
|
||||
def embed(self, texts):
|
||||
return [[1.0, 0.0, 0.0, 0.0] for _ in texts]
|
||||
|
||||
index_dir = tmp_path / "projection"
|
||||
index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
|
||||
index.reset(
|
||||
dimension=3,
|
||||
metadata={
|
||||
"channel": "summary",
|
||||
"embedding_provider": "test",
|
||||
"embedding_model": "fake",
|
||||
"embedding_dimensions": 3,
|
||||
},
|
||||
)
|
||||
index.upsert_many(
|
||||
[
|
||||
SemanticIndexRecord(
|
||||
file_ref="file_a",
|
||||
external_id="doc_a",
|
||||
source_type="documents",
|
||||
source_path="docs/a.pdf",
|
||||
title="A",
|
||||
text="summary",
|
||||
vector=[1.0, 0.0, 0.0],
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
with pytest.raises(RuntimeError, match="summary projection index dimension mismatch"):
|
||||
SummaryProjectionIndexer(
|
||||
index_dir,
|
||||
embedder=FakeEmbedder(),
|
||||
embedding_provider="test",
|
||||
embedding_model="fake",
|
||||
embedding_dimensions=4,
|
||||
)
|
||||
|
||||
preserved = SQLiteVecSemanticIndex(index.db_path)
|
||||
assert preserved.info()["dimension"] == 3
|
||||
assert preserved.info()["document_count"] == 1
|
||||
assert preserved.search([1.0, 0.0, 0.0], limit=1)[0].external_id == "doc_a"
|
||||
|
||||
|
||||
def test_hash_embedding_provider_is_not_available():
|
||||
from pageindex.filesystem.hybrid_projection import make_embedder
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue