mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-12 19:55:17 +02:00
fix(filesystem): reject mismatched existing projection indexes
This commit is contained in:
parent
decfe29fe4
commit
e293814bc0
2 changed files with 95 additions and 39 deletions
|
|
@ -249,33 +249,29 @@ class PageIndexFileSystem:
|
|||
"""Attach semantic retrieval to already-built projection indexes.
|
||||
|
||||
Register-time generation owns building the index files. Opening an
|
||||
existing workspace should still expose semantic browse, without forcing
|
||||
a re-register step.
|
||||
existing workspace should still expose semantic retrieval when the
|
||||
configured embedding dimensions match the existing index.
|
||||
"""
|
||||
if self.semantic_retrieval_backend is not None:
|
||||
return bool(self.semantic_retrieval_channels())
|
||||
index_config = self._existing_projection_index_config()
|
||||
if index_config is None:
|
||||
return False
|
||||
metadata = dict(index_config.get("metadata") or {})
|
||||
embedding_provider = str(
|
||||
metadata.get("embedding_provider")
|
||||
or self.summary_projection_embedding_provider
|
||||
)
|
||||
embedding_model = str(
|
||||
metadata.get("embedding_model")
|
||||
or self.summary_projection_embedding_model
|
||||
)
|
||||
embedding_dimensions = int(
|
||||
metadata.get("embedding_dimensions")
|
||||
or index_config.get("dimension")
|
||||
or self.summary_projection_embedding_dimensions
|
||||
)
|
||||
existing_dimension = int(index_config.get("dimension") or 0)
|
||||
if existing_dimension != self.summary_projection_embedding_dimensions:
|
||||
raise RuntimeError(
|
||||
"summary projection index dimension mismatch: "
|
||||
f"{index_config.get('db_path') or self.summary_projection_index_dir} "
|
||||
f"was built with dimension {existing_dimension}, but configured "
|
||||
"summary_projection_embedding_dimensions is "
|
||||
f"{self.summary_projection_embedding_dimensions}. Rebuild the "
|
||||
"projection index or use a matching embedding configuration."
|
||||
)
|
||||
self.configure_hybrid_projection_retrieval(
|
||||
self.summary_projection_index_dir,
|
||||
embedding_provider=embedding_provider,
|
||||
embedding_model=embedding_model,
|
||||
embedding_dimensions=embedding_dimensions,
|
||||
embedding_provider=self.summary_projection_embedding_provider,
|
||||
embedding_model=self.summary_projection_embedding_model,
|
||||
embedding_dimensions=self.summary_projection_embedding_dimensions,
|
||||
embedding_timeout=self.summary_projection_embedding_timeout,
|
||||
)
|
||||
return bool(self.semantic_retrieval_channels())
|
||||
|
|
|
|||
|
|
@ -786,7 +786,75 @@ def test_grep_source_file_requires_terms_on_same_line(tmp_path):
|
|||
assert "alpha evidence" in matched["data"]["data"][0]["text"]
|
||||
|
||||
|
||||
def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path, monkeypatch):
|
||||
def test_existing_summary_projection_index_uses_current_config_when_dimensions_match(
|
||||
tmp_path, monkeypatch
|
||||
):
|
||||
from pageindex.filesystem import PageIndexFileSystem
|
||||
from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex
|
||||
|
||||
workspace = tmp_path / "workspace"
|
||||
index_dir = workspace / "artifacts" / "projection_indexes"
|
||||
summary_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
|
||||
summary_index.reset(
|
||||
dimension=3,
|
||||
metadata={
|
||||
"channel": "summary",
|
||||
"embedding_provider": "stale-provider",
|
||||
"embedding_model": "stale-embedding",
|
||||
"embedding_dimensions": 3,
|
||||
},
|
||||
)
|
||||
summary_index.upsert_many(
|
||||
[
|
||||
SemanticIndexRecord(
|
||||
file_ref="file_a",
|
||||
external_id="doc_a",
|
||||
source_type="documents",
|
||||
source_path="documents/a.pdf",
|
||||
title="A",
|
||||
text="summary",
|
||||
vector=[1.0, 0.0, 0.0],
|
||||
)
|
||||
]
|
||||
)
|
||||
filesystem = PageIndexFileSystem(
|
||||
workspace,
|
||||
summary_projection_embedding_provider="current-provider",
|
||||
summary_projection_embedding_model="current-embedding",
|
||||
summary_projection_embedding_dimensions=3,
|
||||
summary_projection_embedding_timeout=12,
|
||||
)
|
||||
calls = []
|
||||
|
||||
def fake_configure(index_dir_arg, **kwargs):
|
||||
calls.append((index_dir_arg, kwargs))
|
||||
filesystem.semantic_retrieval_backend = SummaryBackend("doc_a")
|
||||
return filesystem.semantic_retrieval_backend
|
||||
|
||||
monkeypatch.setattr(
|
||||
filesystem,
|
||||
"configure_hybrid_projection_retrieval",
|
||||
fake_configure,
|
||||
)
|
||||
|
||||
assert filesystem.configure_existing_projection_retrieval() is True
|
||||
assert calls == [
|
||||
(
|
||||
filesystem.summary_projection_index_dir,
|
||||
{
|
||||
"embedding_provider": "current-provider",
|
||||
"embedding_model": "current-embedding",
|
||||
"embedding_dimensions": 3,
|
||||
"embedding_timeout": 12,
|
||||
},
|
||||
)
|
||||
]
|
||||
assert filesystem.semantic_retrieval_channels() == ("summary",)
|
||||
|
||||
|
||||
def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval(
|
||||
tmp_path, monkeypatch
|
||||
):
|
||||
from pageindex.filesystem import PageIndexFileSystem
|
||||
from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex
|
||||
|
||||
|
|
@ -816,32 +884,24 @@ def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path
|
|||
]
|
||||
)
|
||||
filesystem = PageIndexFileSystem(workspace)
|
||||
calls = []
|
||||
|
||||
def fake_configure(index_dir_arg, **kwargs):
|
||||
calls.append((index_dir_arg, kwargs))
|
||||
filesystem.semantic_retrieval_backend = SummaryBackend("doc_a")
|
||||
return filesystem.semantic_retrieval_backend
|
||||
def fail_configure(*args, **kwargs):
|
||||
raise AssertionError("retrieval backend should not be configured on dimension mismatch")
|
||||
|
||||
monkeypatch.setattr(
|
||||
filesystem,
|
||||
"configure_hybrid_projection_retrieval",
|
||||
fake_configure,
|
||||
fail_configure,
|
||||
)
|
||||
|
||||
assert filesystem.configure_existing_projection_retrieval() is True
|
||||
assert calls == [
|
||||
(
|
||||
filesystem.summary_projection_index_dir,
|
||||
{
|
||||
"embedding_provider": "openai",
|
||||
"embedding_model": "test-embedding",
|
||||
"embedding_dimensions": 3,
|
||||
"embedding_timeout": 60,
|
||||
},
|
||||
)
|
||||
]
|
||||
assert filesystem.semantic_retrieval_channels() == ("summary",)
|
||||
with pytest.raises(
|
||||
RuntimeError,
|
||||
match=(
|
||||
"summary projection index dimension mismatch: .*"
|
||||
"dimension 3.*summary_projection_embedding_dimensions is 1024.*Rebuild"
|
||||
),
|
||||
):
|
||||
filesystem.configure_existing_projection_retrieval()
|
||||
|
||||
|
||||
def test_default_semantic_search_uses_summary_projection_when_only_summary_available(tmp_path):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue