fix(filesystem): guard summary index dimensions

Raise on summary projection dimension mismatch instead of resetting an existing index.
This commit is contained in:
Bukely_ 2026-05-26 20:28:11 +08:00 committed by BukeLy
parent 2297453103
commit cb9db0bab9
2 changed files with 61 additions and 9 deletions

View file

@ -110,15 +110,20 @@ class SummaryProjectionIndexer:
)
return
try:
if self.index.dimension() != self.embedding_dimensions:
self.index.reset(
dimension=self.embedding_dimensions,
metadata=self._index_metadata(),
)
except Exception:
self.index.reset(
dimension=self.embedding_dimensions,
metadata=self._index_metadata(),
existing_dimension = self.index.dimension()
except Exception as exc:
raise RuntimeError(
"could not validate existing summary projection index config; "
f"refusing to reset {self.index.db_path}. Move the existing index "
"aside or rebuild it intentionally before changing embedding config."
) from exc
if existing_dimension != self.embedding_dimensions:
raise RuntimeError(
"summary projection index dimension mismatch: "
f"{self.index.db_path} was built with dimension {existing_dimension}, "
f"but configured embedding_dimensions is {self.embedding_dimensions}. "
"Use the matching embedding config, or rebuild the projection index "
"at a new path after preserving the existing data."
)
def _index_metadata(self) -> dict[str, Any]:

View file

@ -91,6 +91,53 @@ def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
assert hits[0].metadata["department"] == "ops"
def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path):
    """Check that a dimension mismatch raises and the stored index survives.

    A 3-dimensional index containing a single record is created under
    ``tmp_path / "projection"``. Constructing a ``SummaryProjectionIndexer``
    on that directory with ``embedding_dimensions=4`` must raise a
    ``RuntimeError`` whose message matches
    "summary projection index dimension mismatch". Reopening the database
    afterwards must still report dimension 3, one document, and return the
    original record from a vector search.
    """
    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer

    class FakeEmbedder:
        """Embedder stub producing one fixed 4-dimensional vector per text."""

        def embed(self, texts):
            return [[1.0, 0.0, 0.0, 0.0] for _ in texts]

    projection_dir = tmp_path / "projection"
    seed_index = SQLiteVecSemanticIndex(projection_dir / "summary_only_vector.sqlite")

    # Seed the on-disk index with a dimension that differs from the one the
    # indexer is configured with further down.
    seed_metadata = {
        "channel": "summary",
        "embedding_provider": "test",
        "embedding_model": "fake",
        "embedding_dimensions": 3,
    }
    seed_index.reset(dimension=3, metadata=seed_metadata)

    seed_record = SemanticIndexRecord(
        file_ref="file_a",
        external_id="doc_a",
        source_type="documents",
        source_path="docs/a.pdf",
        title="A",
        text="summary",
        vector=[1.0, 0.0, 0.0],
    )
    seed_index.upsert_many([seed_record])

    with pytest.raises(RuntimeError, match="summary projection index dimension mismatch"):
        SummaryProjectionIndexer(
            projection_dir,
            embedder=FakeEmbedder(),
            embedding_provider="test",
            embedding_model="fake",
            embedding_dimensions=4,
        )

    # Open a fresh handle on the same database file to confirm nothing was
    # reset or dropped by the failed construction above.
    reopened = SQLiteVecSemanticIndex(seed_index.db_path)
    assert reopened.info()["dimension"] == 3
    assert reopened.info()["document_count"] == 1
    nearest = reopened.search([1.0, 0.0, 0.0], limit=1)
    assert nearest[0].external_id == "doc_a"
def test_hash_embedding_provider_is_not_available():
from pageindex.filesystem.hybrid_projection import make_embedder