fix(filesystem): reject mismatched existing projection indexes

This commit is contained in:
BukeLy 2026-05-31 21:10:23 +08:00
parent decfe29fe4
commit e293814bc0
2 changed files with 95 additions and 39 deletions

View file

@ -249,33 +249,29 @@ class PageIndexFileSystem:
"""Attach semantic retrieval to already-built projection indexes. """Attach semantic retrieval to already-built projection indexes.
Register-time generation owns building the index files. Opening an Register-time generation owns building the index files. Opening an
existing workspace should still expose semantic browse, without forcing existing workspace should still expose semantic retrieval when the
a re-register step. configured embedding dimensions match the existing index.
""" """
if self.semantic_retrieval_backend is not None: if self.semantic_retrieval_backend is not None:
return bool(self.semantic_retrieval_channels()) return bool(self.semantic_retrieval_channels())
index_config = self._existing_projection_index_config() index_config = self._existing_projection_index_config()
if index_config is None: if index_config is None:
return False return False
metadata = dict(index_config.get("metadata") or {}) existing_dimension = int(index_config.get("dimension") or 0)
embedding_provider = str( if existing_dimension != self.summary_projection_embedding_dimensions:
metadata.get("embedding_provider") raise RuntimeError(
or self.summary_projection_embedding_provider "summary projection index dimension mismatch: "
) f"{index_config.get('db_path') or self.summary_projection_index_dir} "
embedding_model = str( f"was built with dimension {existing_dimension}, but configured "
metadata.get("embedding_model") "summary_projection_embedding_dimensions is "
or self.summary_projection_embedding_model f"{self.summary_projection_embedding_dimensions}. Rebuild the "
) "projection index or use a matching embedding configuration."
embedding_dimensions = int( )
metadata.get("embedding_dimensions")
or index_config.get("dimension")
or self.summary_projection_embedding_dimensions
)
self.configure_hybrid_projection_retrieval( self.configure_hybrid_projection_retrieval(
self.summary_projection_index_dir, self.summary_projection_index_dir,
embedding_provider=embedding_provider, embedding_provider=self.summary_projection_embedding_provider,
embedding_model=embedding_model, embedding_model=self.summary_projection_embedding_model,
embedding_dimensions=embedding_dimensions, embedding_dimensions=self.summary_projection_embedding_dimensions,
embedding_timeout=self.summary_projection_embedding_timeout, embedding_timeout=self.summary_projection_embedding_timeout,
) )
return bool(self.semantic_retrieval_channels()) return bool(self.semantic_retrieval_channels())

View file

@ -786,7 +786,75 @@ def test_grep_source_file_requires_terms_on_same_line(tmp_path):
assert "alpha evidence" in matched["data"]["data"][0]["text"] assert "alpha evidence" in matched["data"]["data"][0]["text"]
def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path, monkeypatch): def test_existing_summary_projection_index_uses_current_config_when_dimensions_match(
tmp_path, monkeypatch
):
from pageindex.filesystem import PageIndexFileSystem
from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex
workspace = tmp_path / "workspace"
index_dir = workspace / "artifacts" / "projection_indexes"
summary_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
summary_index.reset(
dimension=3,
metadata={
"channel": "summary",
"embedding_provider": "stale-provider",
"embedding_model": "stale-embedding",
"embedding_dimensions": 3,
},
)
summary_index.upsert_many(
[
SemanticIndexRecord(
file_ref="file_a",
external_id="doc_a",
source_type="documents",
source_path="documents/a.pdf",
title="A",
text="summary",
vector=[1.0, 0.0, 0.0],
)
]
)
filesystem = PageIndexFileSystem(
workspace,
summary_projection_embedding_provider="current-provider",
summary_projection_embedding_model="current-embedding",
summary_projection_embedding_dimensions=3,
summary_projection_embedding_timeout=12,
)
calls = []
def fake_configure(index_dir_arg, **kwargs):
calls.append((index_dir_arg, kwargs))
filesystem.semantic_retrieval_backend = SummaryBackend("doc_a")
return filesystem.semantic_retrieval_backend
monkeypatch.setattr(
filesystem,
"configure_hybrid_projection_retrieval",
fake_configure,
)
assert filesystem.configure_existing_projection_retrieval() is True
assert calls == [
(
filesystem.summary_projection_index_dir,
{
"embedding_provider": "current-provider",
"embedding_model": "current-embedding",
"embedding_dimensions": 3,
"embedding_timeout": 12,
},
)
]
assert filesystem.semantic_retrieval_channels() == ("summary",)
def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval(
tmp_path, monkeypatch
):
from pageindex.filesystem import PageIndexFileSystem from pageindex.filesystem import PageIndexFileSystem
from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex
@ -816,32 +884,24 @@ def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path
] ]
) )
filesystem = PageIndexFileSystem(workspace) filesystem = PageIndexFileSystem(workspace)
calls = []
def fake_configure(index_dir_arg, **kwargs): def fail_configure(*args, **kwargs):
calls.append((index_dir_arg, kwargs)) raise AssertionError("retrieval backend should not be configured on dimension mismatch")
filesystem.semantic_retrieval_backend = SummaryBackend("doc_a")
return filesystem.semantic_retrieval_backend
monkeypatch.setattr( monkeypatch.setattr(
filesystem, filesystem,
"configure_hybrid_projection_retrieval", "configure_hybrid_projection_retrieval",
fake_configure, fail_configure,
) )
assert filesystem.configure_existing_projection_retrieval() is True with pytest.raises(
assert calls == [ RuntimeError,
( match=(
filesystem.summary_projection_index_dir, "summary projection index dimension mismatch: .*"
{ "dimension 3.*summary_projection_embedding_dimensions is 1024.*Rebuild"
"embedding_provider": "openai", ),
"embedding_model": "test-embedding", ):
"embedding_dimensions": 3, filesystem.configure_existing_projection_retrieval()
"embedding_timeout": 60,
},
)
]
assert filesystem.semantic_retrieval_channels() == ("summary",)
def test_default_semantic_search_uses_summary_projection_when_only_summary_available(tmp_path): def test_default_semantic_search_uses_summary_projection_when_only_summary_available(tmp_path):