diff --git a/examples/pifs_demo.py b/examples/pifs_demo.py index 0019942..4d71082 100644 --- a/examples/pifs_demo.py +++ b/examples/pifs_demo.py @@ -42,6 +42,7 @@ os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "true") from pageindex import PageIndexClient from pageindex.filesystem import MetadataGenerator, PageIndexFileSystem, PIFSCommandExecutor from pageindex.filesystem.agent import run_pifs_agent +from pageindex.filesystem.hybrid_projection import DEFAULT_EMBEDDING_DIMENSIONS EXAMPLES_DIR = Path(__file__).parent @@ -149,7 +150,11 @@ def parse_args() -> argparse.Namespace: default=os.environ.get("PIFS_DEMO_EMBEDDING_MODEL", "text-embedding-3-small"), help="Embedding model used for register-time summary projection.", ) - parser.add_argument("--embedding-dimensions", type=int, default=256) + parser.add_argument( + "--embedding-dimensions", + type=int, + default=DEFAULT_EMBEDDING_DIMENSIONS, + ) return parser.parse_args() diff --git a/pageindex/filesystem/core.py b/pageindex/filesystem/core.py index 81b3848..7bf1903 100644 --- a/pageindex/filesystem/core.py +++ b/pageindex/filesystem/core.py @@ -22,6 +22,7 @@ from .semantic_folder_policy import ( is_semantic_folder_forbidden_field, semantic_folder_allowed_extension_fields, ) +from .hybrid_projection import DEFAULT_EMBEDDING_DIMENSIONS from .store import ( SQLiteFileSystemStore, fingerprint, @@ -103,7 +104,7 @@ class PageIndexFileSystem: summary_projection_index_dir: Union[str, Path, None] = None, summary_projection_embedding_provider: str = "openai", summary_projection_embedding_model: str = "text-embedding-3-small", - summary_projection_embedding_dimensions: int = 256, + summary_projection_embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS, summary_projection_embedding_timeout: float = 60, ): self.workspace = Path(workspace).expanduser() @@ -656,7 +657,7 @@ class PageIndexFileSystem: *, embedding_provider: str = "openai", embedding_model: str = "text-embedding-3-small", - embedding_dimensions: int = 256, + embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS, embedding_timeout: float = 60, per_channel_limit: int = 100, fetch_multiplier: int = 100, diff --git a/pageindex/filesystem/hybrid_projection.py b/pageindex/filesystem/hybrid_projection.py index b49d49a..2e228a8 100644 --- a/pageindex/filesystem/hybrid_projection.py +++ b/pageindex/filesystem/hybrid_projection.py @@ -28,6 +28,7 @@ HYBRID_ENTITY_RELATION_WEIGHTS = { "relation": 0.30, "constraint": 0.20, } +DEFAULT_EMBEDDING_DIMENSIONS = 1024 @dataclass(frozen=True) @@ -65,7 +66,7 @@ class HybridProjectionSearchBackend: embedder: Any, embedding_provider: str, embedding_model: str, - embedding_dimensions: int = 256, + embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS, embedding_cache_path: str | Path | None = None, per_channel_limit: int = 100, fetch_multiplier: int = 100, @@ -95,7 +96,7 @@ class HybridProjectionSearchBackend: *, embedding_provider: str = "openai", embedding_model: str = "text-embedding-3-small", - embedding_dimensions: int = 256, + embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS, embedding_timeout: float = 60, **kwargs: Any, ) -> "HybridProjectionSearchBackend": diff --git a/pageindex/filesystem/projection_indexing.py b/pageindex/filesystem/projection_indexing.py index e5d7b82..843d158 100644 --- a/pageindex/filesystem/projection_indexing.py +++ b/pageindex/filesystem/projection_indexing.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import Any from .hybrid_projection import ( + DEFAULT_EMBEDDING_DIMENSIONS, EmbeddingCache, INDEX_BY_CHANNEL, embedding_cache_model_key, @@ -22,7 +23,7 @@ class SummaryProjectionIndexer: embedder: Any, embedding_provider: str, embedding_model: str, - embedding_dimensions: int = 256, + embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS, embedding_cache_path: str | Path | None = None, ) -> None: self.index_dir = Path(index_dir).expanduser() @@ -49,10 +50,11 @@ class SummaryProjectionIndexer: *, embedding_provider: str = "openai", embedding_model: str = "text-embedding-3-small", - embedding_dimensions: int = 256, + embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS, embedding_timeout: float = 60, **kwargs: Any, ) -> "SummaryProjectionIndexer": + cls._validate_existing_index_dimension(index_dir, embedding_dimensions) return cls( index_dir, embedder=make_embedder( @@ -118,12 +120,10 @@ class SummaryProjectionIndexer: "aside or rebuild it intentionally before changing embedding config." ) from exc if existing_dimension != self.embedding_dimensions: - raise RuntimeError( - "summary projection index dimension mismatch: " - f"{self.index.db_path} was built with dimension {existing_dimension}, " - f"but configured embedding_dimensions is {self.embedding_dimensions}. " - "Use the matching embedding config, or rebuild the projection index " - "at a new path after preserving the existing data." + raise self._dimension_mismatch_error( + self.index.db_path, + existing_dimension, + self.embedding_dimensions, ) def _index_metadata(self) -> dict[str, Any]: @@ -133,3 +133,44 @@ class SummaryProjectionIndexer: "embedding_model": self.embedding_model, "embedding_dimensions": self.embedding_dimensions, } + + @classmethod + def _validate_existing_index_dimension( + cls, + index_dir: str | Path, + embedding_dimensions: int, + ) -> None: + index_path = ( + Path(index_dir).expanduser() / f"{INDEX_BY_CHANNEL['summary']}.sqlite" + ) + if not index_path.exists(): + return + index = SQLiteVecSemanticIndex(index_path) + try: + existing_dimension = index.dimension() + except Exception as exc: + raise RuntimeError( + "could not validate existing summary projection index config; " + f"refusing to reset {index_path}. Move the existing index " + "aside or rebuild it intentionally before changing embedding config." + ) from exc + if existing_dimension != embedding_dimensions: + raise cls._dimension_mismatch_error( + index_path, + existing_dimension, + embedding_dimensions, + ) + + @staticmethod + def _dimension_mismatch_error( + index_path: Path, + existing_dimension: int, + embedding_dimensions: int, + ) -> RuntimeError: + return RuntimeError( + "summary projection index dimension mismatch: " + f"{index_path} was built with dimension {existing_dimension}, " + f"but configured embedding_dimensions is {embedding_dimensions}. " + "Use the matching embedding config, or rebuild the projection index " + "at a new path after preserving the existing data." + ) diff --git a/tests/test_pageindex_filesystem_scope.py b/tests/test_pageindex_filesystem_scope.py index 9edf647..570d485 100644 --- a/tests/test_pageindex_filesystem_scope.py +++ b/tests/test_pageindex_filesystem_scope.py @@ -844,6 +844,67 @@ def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path assert filesystem.semantic_retrieval_channels() == ("summary",) +def test_existing_256_summary_projection_index_uses_metadata_dimension_with_new_default( + tmp_path, monkeypatch +): + from pageindex.filesystem import PageIndexFileSystem + from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex + + workspace = tmp_path / "workspace" + index_dir = workspace / "artifacts" / "projection_indexes" + summary_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite") + summary_index.reset( + dimension=256, + metadata={ + "channel": "summary", + "embedding_provider": "openai", + "embedding_model": "text-embedding-3-small", + "embedding_dimensions": 256, + }, + ) + summary_index.upsert_many( + [ + SemanticIndexRecord( + file_ref="file_a", + external_id="doc_a", + source_type="documents", + source_path="documents/a.pdf", + title="A", + text="summary", + vector=[1.0, *([0.0] * 255)], + ) + ] + ) + filesystem = PageIndexFileSystem(workspace) + calls = [] + + def fake_configure(index_dir_arg, **kwargs): + calls.append((index_dir_arg, kwargs)) + filesystem.semantic_retrieval_backend = SummaryBackend("doc_a") + return filesystem.semantic_retrieval_backend + + monkeypatch.setattr( + filesystem, + "configure_hybrid_projection_retrieval", + fake_configure, + ) + + assert filesystem.summary_projection_embedding_dimensions == 1024 + assert filesystem.configure_existing_projection_retrieval() is True + assert calls == [ + ( + filesystem.summary_projection_index_dir, + { + "embedding_provider": "openai", + "embedding_model": "text-embedding-3-small", + "embedding_dimensions": 256, + "embedding_timeout": 60, + }, + ) + ] + assert summary_index.info()["dimension"] == 256 + + def test_default_semantic_search_uses_summary_projection_when_only_summary_available(tmp_path): from pageindex.filesystem import PageIndexFileSystem from pageindex.filesystem.hybrid_projection import HybridProjectionSearchBackend diff --git a/tests/test_semantic_index.py b/tests/test_semantic_index.py index d4263e1..6cdc0e1 100644 --- a/tests/test_semantic_index.py +++ b/tests/test_semantic_index.py @@ -13,6 +13,14 @@ from pageindex.filesystem.semantic_index import ( ) +class FixedDimensionEmbedder: + def __init__(self, dimensions: int): + self.dimensions = dimensions + + def embed(self, texts): + return [[1.0, *([0.0] * (self.dimensions - 1))] for _ in texts] + + def test_sqlite_vec_semantic_index_round_trip(tmp_path): index = SQLiteVecSemanticIndex(tmp_path / "semantic.sqlite") index.reset(dimension=3, metadata={"field_mode": "summary"}) @@ -96,13 +104,9 @@ def test_sqlite_vec_semantic_index_file_ref_filter_not_limited_by_global_rank(tm def test_summary_projection_indexes_unified_metadata_summary(tmp_path): from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer - class FakeEmbedder: - def embed(self, texts): - return [[1.0, 0.0, 0.0] for _ in texts] - indexer = SummaryProjectionIndexer( tmp_path / "projection", - embedder=FakeEmbedder(), + embedder=FixedDimensionEmbedder(3), embedding_provider="test", embedding_model="fake", embedding_dimensions=3, @@ -129,12 +133,159 @@ def test_summary_projection_indexes_unified_metadata_summary(tmp_path): assert hits[0].metadata["department"] == "ops" -def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path): +def test_summary_projection_indexer_defaults_to_1024_dimensions(tmp_path): from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer - class FakeEmbedder: + indexer = SummaryProjectionIndexer( + tmp_path / "projection", + embedder=FixedDimensionEmbedder(1024), + embedding_provider="test", + embedding_model="fake", + ) + + info = indexer.index.info() + + assert info["dimension"] == 1024 + assert info["metadata"]["embedding_dimensions"] == 1024 + + result = indexer.upsert_summary( + { + "file_ref": "file_a", + "external_id": "doc_a", + "source_type": "documents", + "source_path": "docs/a.pdf", + "title": "A", + "metadata": {"summary": "Default dimension summary."}, + } + ) + + assert result["status"] == "ready" + assert result["embedding_dimensions"] == 1024 + + +def test_summary_projection_indexer_allows_explicit_256_dimensions(tmp_path): + from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer + + indexer = SummaryProjectionIndexer( + tmp_path / "projection", + embedder=FixedDimensionEmbedder(256), + embedding_provider="test", + embedding_model="fake", + embedding_dimensions=256, + ) + + assert indexer.index.info()["dimension"] == 256 + assert indexer.upsert_summary( + { + "file_ref": "file_a", + "external_id": "doc_a", + "source_type": "documents", + "source_path": "docs/a.pdf", + "title": "A", + "metadata": {"summary": "Explicit 256 dimension summary."}, + } + )["status"] == "ready" + + +def test_summary_projection_default_rejects_existing_256_index_for_writes(tmp_path): + from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer + + index_dir = tmp_path / "projection" + index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite") + index.reset( + dimension=256, + metadata={ + "channel": "summary", + "embedding_provider": "test", + "embedding_model": "fake", + "embedding_dimensions": 256, + }, + ) + + with pytest.raises(RuntimeError, match="configured embedding_dimensions is 1024"): + SummaryProjectionIndexer( + index_dir, + embedder=FixedDimensionEmbedder(1024), + embedding_provider="test", + embedding_model="fake", + ) + + assert SQLiteVecSemanticIndex(index.db_path).info()["dimension"] == 256 + + +def test_summary_projection_from_provider_rejects_dimension_mismatch_before_embedder( + tmp_path, monkeypatch +): + from pageindex.filesystem import projection_indexing + from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer + + index_dir = tmp_path / "projection" + index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite") + index.reset( + dimension=256, + metadata={ + "channel": "summary", + "embedding_provider": "openai", + "embedding_model": "text-embedding-3-small", + "embedding_dimensions": 256, + }, + ) + + def fail_make_embedder(*args, **kwargs): + raise AssertionError("embedder should not be constructed before dimension validation") + + monkeypatch.setattr(projection_indexing, "make_embedder", fail_make_embedder) + + with pytest.raises(RuntimeError, match="configured embedding_dimensions is 1024"): + SummaryProjectionIndexer.from_provider(index_dir) + + +def test_embedding_cache_key_separates_model_dimensions(tmp_path): + from pageindex.filesystem.hybrid_projection import ( + EmbeddingCache, + embedding_cache_model_key, + ) + + class CountingEmbedder: + def __init__(self, dimensions: int): + self.dimensions = dimensions + self.calls = 0 + def embed(self, texts): - return [[1.0, 0.0, 0.0, 0.0] for _ in texts] + self.calls += 1 + return [[float(self.dimensions), *([0.0] * (self.dimensions - 1))] for _ in texts] + + cache = EmbeddingCache(tmp_path / "cache.sqlite") + embedder_256 = CountingEmbedder(256) + embedder_1024 = CountingEmbedder(1024) + key_256 = embedding_cache_model_key("fake", 256) + key_1024 = embedding_cache_model_key("fake", 1024) + + assert key_256 != key_1024 + + vector_256 = cache.embed_texts( + ["same text"], + provider="test", + model=key_256, + embedder=embedder_256, + batch_size=1, + )[0] + vector_1024 = cache.embed_texts( + ["same text"], + provider="test", + model=key_1024, + embedder=embedder_1024, + batch_size=1, + )[0] + + assert len(vector_256) == 256 + assert len(vector_1024) == 1024 + assert embedder_256.calls == 1 + assert embedder_1024.calls == 1 + + +def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path): + from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer index_dir = tmp_path / "projection" index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite") @@ -164,7 +315,7 @@ def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path with pytest.raises(RuntimeError, match="summary projection index dimension mismatch"): SummaryProjectionIndexer( index_dir, - embedder=FakeEmbedder(), + embedder=FixedDimensionEmbedder(4), embedding_provider="test", embedding_model="fake", embedding_dimensions=4,