diff --git a/examples/pifs_demo.py b/examples/pifs_demo.py index 0019942..413b4a4 100644 --- a/examples/pifs_demo.py +++ b/examples/pifs_demo.py @@ -42,6 +42,7 @@ os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "true") from pageindex import PageIndexClient from pageindex.filesystem import MetadataGenerator, PageIndexFileSystem, PIFSCommandExecutor from pageindex.filesystem.agent import run_pifs_agent +from pageindex.filesystem.embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS EXAMPLES_DIR = Path(__file__).parent @@ -149,7 +150,11 @@ def parse_args() -> argparse.Namespace: default=os.environ.get("PIFS_DEMO_EMBEDDING_MODEL", "text-embedding-3-small"), help="Embedding model used for register-time summary projection.", ) - parser.add_argument("--embedding-dimensions", type=int, default=256) + parser.add_argument( + "--embedding-dimensions", + type=int, + default=DEFAULT_EMBEDDING_DIMENSIONS, + ) return parser.parse_args() diff --git a/pageindex/filesystem/cli.py b/pageindex/filesystem/cli.py index e808d32..7f91cda 100644 --- a/pageindex/filesystem/cli.py +++ b/pageindex/filesystem/cli.py @@ -162,8 +162,7 @@ def _parse_agent_command( def _filesystem_from_workspace(workspace: str) -> PageIndexFileSystem: filesystem = PageIndexFileSystem(Path(workspace).expanduser()) - with contextlib.suppress(Exception): - filesystem.configure_existing_projection_retrieval() + filesystem.configure_existing_projection_retrieval() return filesystem diff --git a/pageindex/filesystem/core.py b/pageindex/filesystem/core.py index 81b3848..557b4e1 100644 --- a/pageindex/filesystem/core.py +++ b/pageindex/filesystem/core.py @@ -14,6 +14,7 @@ from .metadata_generation import ( MetadataGenerationResult, MetadataGenerator, ) +from .embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS from .semantic_folder_policy import ( SEMANTIC_FOLDER_BASE_FIELDS, SEMANTIC_FOLDER_ROOT, @@ -76,6 +77,11 @@ PROJECTION_INDEX_STATUSES = { } SEMANTIC_RETRIEVAL_CHANNELS = ("summary", "entity", "relation") +SEMANTIC_PROJECTION_INDEX_NAMES = { + "summary": "summary_only_vector", + "entity": "entity_vectors", + "relation": "relation_vectors", +} PAGEINDEX_DOCUMENT_SUFFIXES = {".pdf", ".md", ".markdown"} PAGEINDEX_DOCUMENT_CONTENT_TYPES = { "application/pdf", @@ -103,7 +109,7 @@ class PageIndexFileSystem: summary_projection_index_dir: Union[str, Path, None] = None, summary_projection_embedding_provider: str = "openai", summary_projection_embedding_model: str = "text-embedding-3-small", - summary_projection_embedding_dimensions: int = 256, + summary_projection_embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS, summary_projection_embedding_timeout: float = 60, ): self.workspace = Path(workspace).expanduser() @@ -248,48 +254,43 @@ class PageIndexFileSystem: """Attach semantic retrieval to already-built projection indexes. Register-time generation owns building the index files. Opening an - existing workspace should still expose semantic browse, without forcing - a re-register step. + existing workspace should still expose semantic retrieval when the + configured embedding dimensions match the existing index. """ if self.semantic_retrieval_backend is not None: return bool(self.semantic_retrieval_channels()) index_config = self._existing_projection_index_config() if index_config is None: return False - metadata = dict(index_config.get("metadata") or {}) - embedding_provider = str( - metadata.get("embedding_provider") - or self.summary_projection_embedding_provider - ) - embedding_model = str( - metadata.get("embedding_model") - or self.summary_projection_embedding_model - ) - embedding_dimensions = int( - metadata.get("embedding_dimensions") - or index_config.get("dimension") - or self.summary_projection_embedding_dimensions - ) + existing_dimension = int(index_config.get("dimension") or 0) + if existing_dimension != self.summary_projection_embedding_dimensions: + raise RuntimeError( + "summary projection index dimension mismatch: " + f"{index_config.get('db_path') or self.summary_projection_index_dir} " + f"was built with dimension {existing_dimension}, but configured " + "summary_projection_embedding_dimensions is " + f"{self.summary_projection_embedding_dimensions}. Rebuild the " + "projection index or use a matching embedding configuration." + ) self.configure_hybrid_projection_retrieval( self.summary_projection_index_dir, - embedding_provider=embedding_provider, - embedding_model=embedding_model, - embedding_dimensions=embedding_dimensions, + embedding_provider=self.summary_projection_embedding_provider, + embedding_model=self.summary_projection_embedding_model, + embedding_dimensions=self.summary_projection_embedding_dimensions, embedding_timeout=self.summary_projection_embedding_timeout, ) return bool(self.semantic_retrieval_channels()) def _existing_projection_index_config(self) -> dict[str, Any] | None: - from .hybrid_projection import INDEX_BY_CHANNEL - from .semantic_index import SQLiteVecSemanticIndex - for channel in SEMANTIC_RETRIEVAL_CHANNELS: - index_name = INDEX_BY_CHANNEL.get(channel) + index_name = SEMANTIC_PROJECTION_INDEX_NAMES.get(channel) if not index_name: continue index_path = self.summary_projection_index_dir / f"{index_name}.sqlite" if not index_path.exists(): continue + from .semantic_index import SQLiteVecSemanticIndex + try: info = SQLiteVecSemanticIndex(index_path).info() except Exception: @@ -656,7 +657,7 @@ class PageIndexFileSystem: *, embedding_provider: str = "openai", embedding_model: str = "text-embedding-3-small", - embedding_dimensions: int = 256, + embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS, embedding_timeout: float = 60, per_channel_limit: int = 100, fetch_multiplier: int = 100, diff --git a/pageindex/filesystem/embedding_defaults.py b/pageindex/filesystem/embedding_defaults.py new file mode 100644 index 0000000..b329032 --- /dev/null +++ b/pageindex/filesystem/embedding_defaults.py @@ -0,0 +1 @@ +DEFAULT_EMBEDDING_DIMENSIONS = 1024 diff --git a/pageindex/filesystem/hybrid_projection.py b/pageindex/filesystem/hybrid_projection.py index b49d49a..cdb97e6 100644 --- a/pageindex/filesystem/hybrid_projection.py +++ b/pageindex/filesystem/hybrid_projection.py @@ -10,6 +10,7 @@ from dataclasses import dataclass from pathlib import Path from typing import Any +from .embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS from .semantic_index import SQLiteVecSemanticIndex, SemanticIndexError, SemanticSearchResult @@ -65,7 +66,7 @@ class HybridProjectionSearchBackend: embedder: Any, embedding_provider: str, embedding_model: str, - embedding_dimensions: int = 256, + embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS, embedding_cache_path: str | Path | None = None, per_channel_limit: int = 100, fetch_multiplier: int = 100, @@ -95,7 +96,7 @@ class HybridProjectionSearchBackend: *, embedding_provider: str = "openai", embedding_model: str = "text-embedding-3-small", - embedding_dimensions: int = 256, + embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS, embedding_timeout: float = 60, **kwargs: Any, ) -> "HybridProjectionSearchBackend": diff --git a/pageindex/filesystem/projection_indexing.py b/pageindex/filesystem/projection_indexing.py index e5d7b82..63802d5 100644 --- a/pageindex/filesystem/projection_indexing.py +++ b/pageindex/filesystem/projection_indexing.py @@ -3,6 +3,7 @@ from __future__ import annotations from pathlib import Path from typing import Any +from .embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS from .hybrid_projection import ( EmbeddingCache, INDEX_BY_CHANNEL, @@ -22,7 +23,7 @@ class SummaryProjectionIndexer: embedder: Any, embedding_provider: str, embedding_model: str, - embedding_dimensions: int = 256, + embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS, embedding_cache_path: str | Path | None = None, ) -> None: self.index_dir = Path(index_dir).expanduser() @@ -49,10 +50,11 @@ class SummaryProjectionIndexer: *, embedding_provider: str = "openai", embedding_model: str = "text-embedding-3-small", - embedding_dimensions: int = 256, + embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS, embedding_timeout: float = 60, **kwargs: Any, ) -> "SummaryProjectionIndexer": + cls._validate_existing_index_dimension(index_dir, embedding_dimensions) return cls( index_dir, embedder=make_embedder( @@ -118,12 +120,10 @@ class SummaryProjectionIndexer: "aside or rebuild it intentionally before changing embedding config." ) from exc if existing_dimension != self.embedding_dimensions: - raise RuntimeError( - "summary projection index dimension mismatch: " - f"{self.index.db_path} was built with dimension {existing_dimension}, " - f"but configured embedding_dimensions is {self.embedding_dimensions}. " - "Use the matching embedding config, or rebuild the projection index " - "at a new path after preserving the existing data." + raise self._dimension_mismatch_error( + self.index.db_path, + existing_dimension, + self.embedding_dimensions, ) def _index_metadata(self) -> dict[str, Any]: @@ -133,3 +133,44 @@ class SummaryProjectionIndexer: "embedding_model": self.embedding_model, "embedding_dimensions": self.embedding_dimensions, } + + @classmethod + def _validate_existing_index_dimension( + cls, + index_dir: str | Path, + embedding_dimensions: int, + ) -> None: + index_path = ( + Path(index_dir).expanduser() / f"{INDEX_BY_CHANNEL['summary']}.sqlite" + ) + if not index_path.exists(): + return + index = SQLiteVecSemanticIndex(index_path) + try: + existing_dimension = index.dimension() + except Exception as exc: + raise RuntimeError( + "could not validate existing summary projection index config; " + f"refusing to reset {index_path}. Move the existing index " + "aside or rebuild it intentionally before changing embedding config." + ) from exc + if existing_dimension != embedding_dimensions: + raise cls._dimension_mismatch_error( + index_path, + existing_dimension, + embedding_dimensions, + ) + + @staticmethod + def _dimension_mismatch_error( + index_path: Path, + existing_dimension: int, + embedding_dimensions: int, + ) -> RuntimeError: + return RuntimeError( + "summary projection index dimension mismatch: " + f"{index_path} was built with dimension {existing_dimension}, " + f"but configured embedding_dimensions is {embedding_dimensions}. " + "Use the matching embedding config, or rebuild the projection index " + "at a new path after preserving the existing data." + ) diff --git a/tests/test_pageindex_filesystem_scope.py b/tests/test_pageindex_filesystem_scope.py index 9edf647..46b1161 100644 --- a/tests/test_pageindex_filesystem_scope.py +++ b/tests/test_pageindex_filesystem_scope.py @@ -786,7 +786,75 @@ def test_grep_source_file_requires_terms_on_same_line(tmp_path): assert "alpha evidence" in matched["data"]["data"][0]["text"] -def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path, monkeypatch): +def test_existing_summary_projection_index_uses_current_config_when_dimensions_match( + tmp_path, monkeypatch +): + from pageindex.filesystem import PageIndexFileSystem + from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex + + workspace = tmp_path / "workspace" + index_dir = workspace / "artifacts" / "projection_indexes" + summary_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite") + summary_index.reset( + dimension=3, + metadata={ + "channel": "summary", + "embedding_provider": "stale-provider", + "embedding_model": "stale-embedding", + "embedding_dimensions": 3, + }, + ) + summary_index.upsert_many( + [ + SemanticIndexRecord( + file_ref="file_a", + external_id="doc_a", + source_type="documents", + source_path="documents/a.pdf", + title="A", + text="summary", + vector=[1.0, 0.0, 0.0], + ) + ] + ) + filesystem = PageIndexFileSystem( + workspace, + summary_projection_embedding_provider="current-provider", + summary_projection_embedding_model="current-embedding", + summary_projection_embedding_dimensions=3, + summary_projection_embedding_timeout=12, + ) + calls = [] + + def fake_configure(index_dir_arg, **kwargs): + calls.append((index_dir_arg, kwargs)) + filesystem.semantic_retrieval_backend = SummaryBackend("doc_a") + return filesystem.semantic_retrieval_backend + + monkeypatch.setattr( + filesystem, + "configure_hybrid_projection_retrieval", + fake_configure, + ) + + assert filesystem.configure_existing_projection_retrieval() is True + assert calls == [ + ( + filesystem.summary_projection_index_dir, + { + "embedding_provider": "current-provider", + "embedding_model": "current-embedding", + "embedding_dimensions": 3, + "embedding_timeout": 12, + }, + ) + ] + assert filesystem.semantic_retrieval_channels() == ("summary",) + + +def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval( + tmp_path, monkeypatch +): from pageindex.filesystem import PageIndexFileSystem from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex @@ -816,32 +884,24 @@ def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path ] ) filesystem = PageIndexFileSystem(workspace) - calls = [] - def fake_configure(index_dir_arg, **kwargs): - calls.append((index_dir_arg, kwargs)) - filesystem.semantic_retrieval_backend = SummaryBackend("doc_a") - return filesystem.semantic_retrieval_backend + def fail_configure(*args, **kwargs): + raise AssertionError("retrieval backend should not be configured on dimension mismatch") monkeypatch.setattr( filesystem, "configure_hybrid_projection_retrieval", - fake_configure, + fail_configure, ) - assert filesystem.configure_existing_projection_retrieval() is True - assert calls == [ - ( - filesystem.summary_projection_index_dir, - { - "embedding_provider": "openai", - "embedding_model": "test-embedding", - "embedding_dimensions": 3, - "embedding_timeout": 60, - }, - ) - ] - assert filesystem.semantic_retrieval_channels() == ("summary",) + with pytest.raises( + RuntimeError, + match=( + "summary projection index dimension mismatch: .*" + "dimension 3.*summary_projection_embedding_dimensions is 1024.*Rebuild" + ), + ): + filesystem.configure_existing_projection_retrieval() def test_default_semantic_search_uses_summary_projection_when_only_summary_available(tmp_path): diff --git a/tests/test_pifs_cli.py b/tests/test_pifs_cli.py index 491cbb9..3437a3c 100644 --- a/tests/test_pifs_cli.py +++ b/tests/test_pifs_cli.py @@ -1,6 +1,10 @@ +import builtins import os +import sys from pathlib import Path +import pytest + class FakeFileSystem: def __init__(self, workspace): @@ -25,6 +29,71 @@ def test_cli_workspace_configures_existing_projection_retrieval(monkeypatch, tmp assert filesystem.projection_retrieval_configured is True +def test_cli_workspace_without_projection_index_does_not_require_sqlite_vec( + monkeypatch, tmp_path +): + from pageindex.filesystem import cli + + workspace = tmp_path / "workspace" + real_import = builtins.__import__ + + monkeypatch.delitem(sys.modules, "pageindex.filesystem.hybrid_projection", raising=False) + monkeypatch.delitem(sys.modules, "pageindex.filesystem.semantic_index", raising=False) + monkeypatch.delitem(sys.modules, "sqlite_vec", raising=False) + + def block_sqlite_vec(name, globals=None, locals=None, fromlist=(), level=0): + if name.split(".", 1)[0] == "sqlite_vec": + raise ModuleNotFoundError("No module named 'sqlite_vec'", name="sqlite_vec") + return real_import(name, globals, locals, fromlist, level) + + monkeypatch.setattr(builtins, "__import__", block_sqlite_vec) + + filesystem = cli._filesystem_from_workspace(str(workspace)) + + assert filesystem.workspace == workspace + assert filesystem.semantic_retrieval_channels() == () + + +def test_cli_workspace_surfaces_projection_dimension_mismatch(tmp_path): + from pageindex.filesystem import cli + from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex + + workspace = tmp_path / "workspace" + index_dir = workspace / "artifacts" / "projection_indexes" + summary_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite") + summary_index.reset( + dimension=3, + metadata={ + "channel": "summary", + "embedding_provider": "test", + "embedding_model": "fake", + "embedding_dimensions": 3, + }, + ) + summary_index.upsert_many( + [ + SemanticIndexRecord( + file_ref="file_a", + external_id="doc_a", + source_type="documents", + source_path="documents/a.pdf", + title="A", + text="summary", + vector=[1.0, 0.0, 0.0], + ) + ] + ) + + with pytest.raises( + RuntimeError, + match=( + "summary projection index dimension mismatch: .*" + "dimension 3.*summary_projection_embedding_dimensions is 1024.*Rebuild" + ), + ): + cli._filesystem_from_workspace(str(workspace)) + + def test_cli_passthrough_invokes_pifs_command_executor(monkeypatch, capsys, tmp_path): from pageindex.filesystem import cli diff --git a/tests/test_semantic_index.py b/tests/test_semantic_index.py index d4263e1..6cdc0e1 100644 --- a/tests/test_semantic_index.py +++ b/tests/test_semantic_index.py @@ -13,6 +13,14 @@ from pageindex.filesystem.semantic_index import ( ) +class FixedDimensionEmbedder: + def __init__(self, dimensions: int): + self.dimensions = dimensions + + def embed(self, texts): + return [[1.0, *([0.0] * (self.dimensions - 1))] for _ in texts] + + def test_sqlite_vec_semantic_index_round_trip(tmp_path): index = SQLiteVecSemanticIndex(tmp_path / "semantic.sqlite") index.reset(dimension=3, metadata={"field_mode": "summary"}) @@ -96,13 +104,9 @@ def test_sqlite_vec_semantic_index_file_ref_filter_not_limited_by_global_rank(tm def test_summary_projection_indexes_unified_metadata_summary(tmp_path): from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer - class FakeEmbedder: - def embed(self, texts): - return [[1.0, 0.0, 0.0] for _ in texts] - indexer = SummaryProjectionIndexer( tmp_path / "projection", - embedder=FakeEmbedder(), + embedder=FixedDimensionEmbedder(3), embedding_provider="test", embedding_model="fake", embedding_dimensions=3, @@ -129,12 +133,159 @@ def test_summary_projection_indexes_unified_metadata_summary(tmp_path): assert hits[0].metadata["department"] == "ops" -def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path): +def test_summary_projection_indexer_defaults_to_1024_dimensions(tmp_path): from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer - class FakeEmbedder: + indexer = SummaryProjectionIndexer( + tmp_path / "projection", + embedder=FixedDimensionEmbedder(1024), + embedding_provider="test", + embedding_model="fake", + ) + + info = indexer.index.info() + + assert info["dimension"] == 1024 + assert info["metadata"]["embedding_dimensions"] == 1024 + + result = indexer.upsert_summary( + { + "file_ref": "file_a", + "external_id": "doc_a", + "source_type": "documents", + "source_path": "docs/a.pdf", + "title": "A", + "metadata": {"summary": "Default dimension summary."}, + } + ) + + assert result["status"] == "ready" + assert result["embedding_dimensions"] == 1024 + + +def test_summary_projection_indexer_allows_explicit_256_dimensions(tmp_path): + from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer + + indexer = SummaryProjectionIndexer( + tmp_path / "projection", + embedder=FixedDimensionEmbedder(256), + embedding_provider="test", + embedding_model="fake", + embedding_dimensions=256, + ) + + assert indexer.index.info()["dimension"] == 256 + assert indexer.upsert_summary( + { + "file_ref": "file_a", + "external_id": "doc_a", + "source_type": "documents", + "source_path": "docs/a.pdf", + "title": "A", + "metadata": {"summary": "Explicit 256 dimension summary."}, + } + )["status"] == "ready" + + +def test_summary_projection_default_rejects_existing_256_index_for_writes(tmp_path): + from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer + + index_dir = tmp_path / "projection" + index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite") + index.reset( + dimension=256, + metadata={ + "channel": "summary", + "embedding_provider": "test", + "embedding_model": "fake", + "embedding_dimensions": 256, + }, + ) + + with pytest.raises(RuntimeError, match="configured embedding_dimensions is 1024"): + SummaryProjectionIndexer( + index_dir, + embedder=FixedDimensionEmbedder(1024), + embedding_provider="test", + embedding_model="fake", + ) + + assert SQLiteVecSemanticIndex(index.db_path).info()["dimension"] == 256 + + +def test_summary_projection_from_provider_rejects_dimension_mismatch_before_embedder( + tmp_path, monkeypatch +): + from pageindex.filesystem import projection_indexing + from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer + + index_dir = tmp_path / "projection" + index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite") + index.reset( + dimension=256, + metadata={ + "channel": "summary", + "embedding_provider": "openai", + "embedding_model": "text-embedding-3-small", + "embedding_dimensions": 256, + }, + ) + + def fail_make_embedder(*args, **kwargs): + raise AssertionError("embedder should not be constructed before dimension validation") + + monkeypatch.setattr(projection_indexing, "make_embedder", fail_make_embedder) + + with pytest.raises(RuntimeError, match="configured embedding_dimensions is 1024"): + SummaryProjectionIndexer.from_provider(index_dir) + + +def test_embedding_cache_key_separates_model_dimensions(tmp_path): + from pageindex.filesystem.hybrid_projection import ( + EmbeddingCache, + embedding_cache_model_key, + ) + + class CountingEmbedder: + def __init__(self, dimensions: int): + self.dimensions = dimensions + self.calls = 0 + def embed(self, texts): - return [[1.0, 0.0, 0.0, 0.0] for _ in texts] + self.calls += 1 + return [[float(self.dimensions), *([0.0] * (self.dimensions - 1))] for _ in texts] + + cache = EmbeddingCache(tmp_path / "cache.sqlite") + embedder_256 = CountingEmbedder(256) + embedder_1024 = CountingEmbedder(1024) + key_256 = embedding_cache_model_key("fake", 256) + key_1024 = embedding_cache_model_key("fake", 1024) + + assert key_256 != key_1024 + + vector_256 = cache.embed_texts( + ["same text"], + provider="test", + model=key_256, + embedder=embedder_256, + batch_size=1, + )[0] + vector_1024 = cache.embed_texts( + ["same text"], + provider="test", + model=key_1024, + embedder=embedder_1024, + batch_size=1, + )[0] + + assert len(vector_256) == 256 + assert len(vector_1024) == 1024 + assert embedder_256.calls == 1 + assert embedder_1024.calls == 1 + + +def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path): + from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer index_dir = tmp_path / "projection" index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite") @@ -164,7 +315,7 @@ def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path with pytest.raises(RuntimeError, match="summary projection index dimension mismatch"): SummaryProjectionIndexer( index_dir, - embedder=FakeEmbedder(), + embedder=FixedDimensionEmbedder(4), embedding_provider="test", embedding_model="fake", embedding_dimensions=4,