diff --git a/pageindex/filesystem/core.py b/pageindex/filesystem/core.py index eec0ade..c9a5d70 100644 --- a/pageindex/filesystem/core.py +++ b/pageindex/filesystem/core.py @@ -249,33 +249,29 @@ class PageIndexFileSystem: """Attach semantic retrieval to already-built projection indexes. Register-time generation owns building the index files. Opening an - existing workspace should still expose semantic browse, without forcing - a re-register step. + existing workspace should still expose semantic retrieval when the + configured embedding dimensions match the existing index. """ if self.semantic_retrieval_backend is not None: return bool(self.semantic_retrieval_channels()) index_config = self._existing_projection_index_config() if index_config is None: return False - metadata = dict(index_config.get("metadata") or {}) - embedding_provider = str( - metadata.get("embedding_provider") - or self.summary_projection_embedding_provider - ) - embedding_model = str( - metadata.get("embedding_model") - or self.summary_projection_embedding_model - ) - embedding_dimensions = int( - metadata.get("embedding_dimensions") - or index_config.get("dimension") - or self.summary_projection_embedding_dimensions - ) + existing_dimension = int(index_config.get("dimension") or 0) + if existing_dimension != self.summary_projection_embedding_dimensions: + raise RuntimeError( + "summary projection index dimension mismatch: " + f"{index_config.get('db_path') or self.summary_projection_index_dir} " + f"was built with dimension {existing_dimension}, but configured " + "summary_projection_embedding_dimensions is " + f"{self.summary_projection_embedding_dimensions}. Rebuild the " + "projection index or use a matching embedding configuration." + ) self.configure_hybrid_projection_retrieval( self.summary_projection_index_dir, - embedding_provider=embedding_provider, - embedding_model=embedding_model, - embedding_dimensions=embedding_dimensions, + embedding_provider=self.summary_projection_embedding_provider, + embedding_model=self.summary_projection_embedding_model, + embedding_dimensions=self.summary_projection_embedding_dimensions, embedding_timeout=self.summary_projection_embedding_timeout, ) return bool(self.semantic_retrieval_channels()) diff --git a/tests/test_pageindex_filesystem_scope.py b/tests/test_pageindex_filesystem_scope.py index 9edf647..46b1161 100644 --- a/tests/test_pageindex_filesystem_scope.py +++ b/tests/test_pageindex_filesystem_scope.py @@ -786,7 +786,75 @@ def test_grep_source_file_requires_terms_on_same_line(tmp_path): assert "alpha evidence" in matched["data"]["data"][0]["text"] -def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path, monkeypatch): +def test_existing_summary_projection_index_uses_current_config_when_dimensions_match( + tmp_path, monkeypatch +): + from pageindex.filesystem import PageIndexFileSystem + from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex + + workspace = tmp_path / "workspace" + index_dir = workspace / "artifacts" / "projection_indexes" + summary_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite") + summary_index.reset( + dimension=3, + metadata={ + "channel": "summary", + "embedding_provider": "stale-provider", + "embedding_model": "stale-embedding", + "embedding_dimensions": 3, + }, + ) + summary_index.upsert_many( + [ + SemanticIndexRecord( + file_ref="file_a", + external_id="doc_a", + source_type="documents", + source_path="documents/a.pdf", + title="A", + text="summary", + vector=[1.0, 0.0, 0.0], + ) + ] + ) + filesystem = PageIndexFileSystem( + workspace, + summary_projection_embedding_provider="current-provider", + summary_projection_embedding_model="current-embedding", + summary_projection_embedding_dimensions=3, + summary_projection_embedding_timeout=12, + ) + calls = [] + + def fake_configure(index_dir_arg, **kwargs): + calls.append((index_dir_arg, kwargs)) + filesystem.semantic_retrieval_backend = SummaryBackend("doc_a") + return filesystem.semantic_retrieval_backend + + monkeypatch.setattr( + filesystem, + "configure_hybrid_projection_retrieval", + fake_configure, + ) + + assert filesystem.configure_existing_projection_retrieval() is True + assert calls == [ + ( + filesystem.summary_projection_index_dir, + { + "embedding_provider": "current-provider", + "embedding_model": "current-embedding", + "embedding_dimensions": 3, + "embedding_timeout": 12, + }, + ) + ] + assert filesystem.semantic_retrieval_channels() == ("summary",) + + +def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval( + tmp_path, monkeypatch +): from pageindex.filesystem import PageIndexFileSystem from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex @@ -816,32 +884,24 @@ def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path ] ) filesystem = PageIndexFileSystem(workspace) - calls = [] - def fake_configure(index_dir_arg, **kwargs): - calls.append((index_dir_arg, kwargs)) - filesystem.semantic_retrieval_backend = SummaryBackend("doc_a") - return filesystem.semantic_retrieval_backend + def fail_configure(*args, **kwargs): + raise AssertionError("retrieval backend should not be configured on dimension mismatch") monkeypatch.setattr( filesystem, "configure_hybrid_projection_retrieval", - fake_configure, + fail_configure, ) - assert filesystem.configure_existing_projection_retrieval() is True - assert calls == [ - ( - filesystem.summary_projection_index_dir, - { - "embedding_provider": "openai", - "embedding_model": "test-embedding", - "embedding_dimensions": 3, - "embedding_timeout": 60, - }, - ) - ] - assert filesystem.semantic_retrieval_channels() == ("summary",) + with pytest.raises( + RuntimeError, + match=( + "summary projection index dimension mismatch: .*" + "dimension 3.*summary_projection_embedding_dimensions is 1024.*Rebuild" + ), + ): + filesystem.configure_existing_projection_retrieval() def test_default_semantic_search_uses_summary_projection_when_only_summary_available(tmp_path):