Merge Goal 6: default embeddings to 1024 dimensions

Merge embedding dimension defaults and mismatch guards into feat/pageindex-filesystem.
2026-06-12 19:55:17 +02:00 · 2026-05-31 21:42:26 +08:00 · 2026-05-31 21:42:26 +08:00 · 01af0c6a22
commit 01af0c6a22
parent a7a1165c95 8f87cee6ce
9 changed files with 395 additions and 67 deletions
--- a/examples/pifs_demo.py
+++ b/examples/pifs_demo.py
@@ -42,6 +42,7 @@ os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "true")
 from pageindex import PageIndexClient
 from pageindex.filesystem import MetadataGenerator, PageIndexFileSystem, PIFSCommandExecutor
 from pageindex.filesystem.agent import run_pifs_agent
+from pageindex.filesystem.embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS


 EXAMPLES_DIR = Path(__file__).parent
@@ -149,7 +150,11 @@ def parse_args() -> argparse.Namespace:
        default=os.environ.get("PIFS_DEMO_EMBEDDING_MODEL", "text-embedding-3-small"),
        help="Embedding model used for register-time summary projection.",
    )
-    parser.add_argument("--embedding-dimensions", type=int, default=256)
+    parser.add_argument(
+        "--embedding-dimensions",
+        type=int,
+        default=DEFAULT_EMBEDDING_DIMENSIONS,
+    )
    return parser.parse_args()


--- a/pageindex/filesystem/cli.py
+++ b/pageindex/filesystem/cli.py
@@ -162,8 +162,7 @@ def _parse_agent_command(

 def _filesystem_from_workspace(workspace: str) -> PageIndexFileSystem:
    filesystem = PageIndexFileSystem(Path(workspace).expanduser())
-    with contextlib.suppress(Exception):
-        filesystem.configure_existing_projection_retrieval()
+    filesystem.configure_existing_projection_retrieval()
    return filesystem


--- a/pageindex/filesystem/core.py
+++ b/pageindex/filesystem/core.py
@@ -14,6 +14,7 @@ from .metadata_generation import (
    MetadataGenerationResult,
    MetadataGenerator,
 )
+from .embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS
 from .semantic_folder_policy import (
    SEMANTIC_FOLDER_BASE_FIELDS,
    SEMANTIC_FOLDER_ROOT,
@@ -76,6 +77,11 @@ PROJECTION_INDEX_STATUSES = {
 }

 SEMANTIC_RETRIEVAL_CHANNELS = ("summary", "entity", "relation")
+SEMANTIC_PROJECTION_INDEX_NAMES = {
+    "summary": "summary_only_vector",
+    "entity": "entity_vectors",
+    "relation": "relation_vectors",
+}
 PAGEINDEX_DOCUMENT_SUFFIXES = {".pdf", ".md", ".markdown"}
 PAGEINDEX_DOCUMENT_CONTENT_TYPES = {
    "application/pdf",
@@ -103,7 +109,7 @@ class PageIndexFileSystem:
        summary_projection_index_dir: Union[str, Path, None] = None,
        summary_projection_embedding_provider: str = "openai",
        summary_projection_embedding_model: str = "text-embedding-3-small",
-        summary_projection_embedding_dimensions: int = 256,
+        summary_projection_embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
        summary_projection_embedding_timeout: float = 60,
    ):
        self.workspace = Path(workspace).expanduser()
@@ -248,48 +254,43 @@ class PageIndexFileSystem:
        """Attach semantic retrieval to already-built projection indexes.

        Register-time generation owns building the index files. Opening an
-        existing workspace should still expose semantic browse, without forcing
-        a re-register step.
+        existing workspace should still expose semantic retrieval when the
+        configured embedding dimensions match the existing index.
        """
        if self.semantic_retrieval_backend is not None:
            return bool(self.semantic_retrieval_channels())
        index_config = self._existing_projection_index_config()
        if index_config is None:
            return False
-        metadata = dict(index_config.get("metadata") or {})
-        embedding_provider = str(
-            metadata.get("embedding_provider")
-            or self.summary_projection_embedding_provider
-        )
-        embedding_model = str(
-            metadata.get("embedding_model")
-            or self.summary_projection_embedding_model
-        )
-        embedding_dimensions = int(
-            metadata.get("embedding_dimensions")
-            or index_config.get("dimension")
-            or self.summary_projection_embedding_dimensions
-        )
+        existing_dimension = int(index_config.get("dimension") or 0)
+        if existing_dimension != self.summary_projection_embedding_dimensions:
+            raise RuntimeError(
+                "summary projection index dimension mismatch: "
+                f"{index_config.get('db_path') or self.summary_projection_index_dir} "
+                f"was built with dimension {existing_dimension}, but configured "
+                "summary_projection_embedding_dimensions is "
+                f"{self.summary_projection_embedding_dimensions}. Rebuild the "
+                "projection index or use a matching embedding configuration."
+            )
        self.configure_hybrid_projection_retrieval(
            self.summary_projection_index_dir,
-            embedding_provider=embedding_provider,
-            embedding_model=embedding_model,
-            embedding_dimensions=embedding_dimensions,
+            embedding_provider=self.summary_projection_embedding_provider,
+            embedding_model=self.summary_projection_embedding_model,
+            embedding_dimensions=self.summary_projection_embedding_dimensions,
            embedding_timeout=self.summary_projection_embedding_timeout,
        )
        return bool(self.semantic_retrieval_channels())

    def _existing_projection_index_config(self) -> dict[str, Any] | None:
-        from .hybrid_projection import INDEX_BY_CHANNEL
-        from .semantic_index import SQLiteVecSemanticIndex
-
        for channel in SEMANTIC_RETRIEVAL_CHANNELS:
-            index_name = INDEX_BY_CHANNEL.get(channel)
+            index_name = SEMANTIC_PROJECTION_INDEX_NAMES.get(channel)
            if not index_name:
                continue
            index_path = self.summary_projection_index_dir / f"{index_name}.sqlite"
            if not index_path.exists():
                continue
+            from .semantic_index import SQLiteVecSemanticIndex
+
            try:
                info = SQLiteVecSemanticIndex(index_path).info()
            except Exception:
@@ -656,7 +657,7 @@ class PageIndexFileSystem:
        *,
        embedding_provider: str = "openai",
        embedding_model: str = "text-embedding-3-small",
-        embedding_dimensions: int = 256,
+        embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
        embedding_timeout: float = 60,
        per_channel_limit: int = 100,
        fetch_multiplier: int = 100,
--- a/pageindex/filesystem/embedding_defaults.py
+++ b/pageindex/filesystem/embedding_defaults.py
@@ -0,0 +1 @@
+DEFAULT_EMBEDDING_DIMENSIONS = 1024
--- a/pageindex/filesystem/hybrid_projection.py
+++ b/pageindex/filesystem/hybrid_projection.py
@@ -10,6 +10,7 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Any

+from .embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS
 from .semantic_index import SQLiteVecSemanticIndex, SemanticIndexError, SemanticSearchResult


@@ -65,7 +66,7 @@ class HybridProjectionSearchBackend:
        embedder: Any,
        embedding_provider: str,
        embedding_model: str,
-        embedding_dimensions: int = 256,
+        embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
        embedding_cache_path: str | Path | None = None,
        per_channel_limit: int = 100,
        fetch_multiplier: int = 100,
@@ -95,7 +96,7 @@ class HybridProjectionSearchBackend:
        *,
        embedding_provider: str = "openai",
        embedding_model: str = "text-embedding-3-small",
-        embedding_dimensions: int = 256,
+        embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
        embedding_timeout: float = 60,
        **kwargs: Any,
    ) -> "HybridProjectionSearchBackend":
--- a/pageindex/filesystem/projection_indexing.py
+++ b/pageindex/filesystem/projection_indexing.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 from pathlib import Path
 from typing import Any

+from .embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS
 from .hybrid_projection import (
    EmbeddingCache,
    INDEX_BY_CHANNEL,
@@ -22,7 +23,7 @@ class SummaryProjectionIndexer:
        embedder: Any,
        embedding_provider: str,
        embedding_model: str,
-        embedding_dimensions: int = 256,
+        embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
        embedding_cache_path: str | Path | None = None,
    ) -> None:
        self.index_dir = Path(index_dir).expanduser()
@@ -49,10 +50,11 @@ class SummaryProjectionIndexer:
        *,
        embedding_provider: str = "openai",
        embedding_model: str = "text-embedding-3-small",
-        embedding_dimensions: int = 256,
+        embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
        embedding_timeout: float = 60,
        **kwargs: Any,
    ) -> "SummaryProjectionIndexer":
+        cls._validate_existing_index_dimension(index_dir, embedding_dimensions)
        return cls(
            index_dir,
            embedder=make_embedder(
@@ -118,12 +120,10 @@ class SummaryProjectionIndexer:
                "aside or rebuild it intentionally before changing embedding config."
            ) from exc
        if existing_dimension != self.embedding_dimensions:
-            raise RuntimeError(
-                "summary projection index dimension mismatch: "
-                f"{self.index.db_path} was built with dimension {existing_dimension}, "
-                f"but configured embedding_dimensions is {self.embedding_dimensions}. "
-                "Use the matching embedding config, or rebuild the projection index "
-                "at a new path after preserving the existing data."
+            raise self._dimension_mismatch_error(
+                self.index.db_path,
+                existing_dimension,
+                self.embedding_dimensions,
            )

    def _index_metadata(self) -> dict[str, Any]:
@@ -133,3 +133,44 @@ class SummaryProjectionIndexer:
            "embedding_model": self.embedding_model,
            "embedding_dimensions": self.embedding_dimensions,
        }
+
+    @classmethod
+    def _validate_existing_index_dimension(
+        cls,
+        index_dir: str | Path,
+        embedding_dimensions: int,
+    ) -> None:
+        index_path = (
+            Path(index_dir).expanduser() / f"{INDEX_BY_CHANNEL['summary']}.sqlite"
+        )
+        if not index_path.exists():
+            return
+        index = SQLiteVecSemanticIndex(index_path)
+        try:
+            existing_dimension = index.dimension()
+        except Exception as exc:
+            raise RuntimeError(
+                "could not validate existing summary projection index config; "
+                f"refusing to reset {index_path}. Move the existing index "
+                "aside or rebuild it intentionally before changing embedding config."
+            ) from exc
+        if existing_dimension != embedding_dimensions:
+            raise cls._dimension_mismatch_error(
+                index_path,
+                existing_dimension,
+                embedding_dimensions,
+            )
+
+    @staticmethod
+    def _dimension_mismatch_error(
+        index_path: Path,
+        existing_dimension: int,
+        embedding_dimensions: int,
+    ) -> RuntimeError:
+        return RuntimeError(
+            "summary projection index dimension mismatch: "
+            f"{index_path} was built with dimension {existing_dimension}, "
+            f"but configured embedding_dimensions is {embedding_dimensions}. "
+            "Use the matching embedding config, or rebuild the projection index "
+            "at a new path after preserving the existing data."
+        )
--- a/tests/test_pageindex_filesystem_scope.py
+++ b/tests/test_pageindex_filesystem_scope.py
@@ -786,7 +786,75 @@ def test_grep_source_file_requires_terms_on_same_line(tmp_path):
    assert "alpha evidence" in matched["data"]["data"][0]["text"]


-def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path, monkeypatch):
+def test_existing_summary_projection_index_uses_current_config_when_dimensions_match(
+    tmp_path, monkeypatch
+):
+    from pageindex.filesystem import PageIndexFileSystem
+    from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex
+
+    workspace = tmp_path / "workspace"
+    index_dir = workspace / "artifacts" / "projection_indexes"
+    summary_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
+    summary_index.reset(
+        dimension=3,
+        metadata={
+            "channel": "summary",
+            "embedding_provider": "stale-provider",
+            "embedding_model": "stale-embedding",
+            "embedding_dimensions": 3,
+        },
+    )
+    summary_index.upsert_many(
+        [
+            SemanticIndexRecord(
+                file_ref="file_a",
+                external_id="doc_a",
+                source_type="documents",
+                source_path="documents/a.pdf",
+                title="A",
+                text="summary",
+                vector=[1.0, 0.0, 0.0],
+            )
+        ]
+    )
+    filesystem = PageIndexFileSystem(
+        workspace,
+        summary_projection_embedding_provider="current-provider",
+        summary_projection_embedding_model="current-embedding",
+        summary_projection_embedding_dimensions=3,
+        summary_projection_embedding_timeout=12,
+    )
+    calls = []
+
+    def fake_configure(index_dir_arg, **kwargs):
+        calls.append((index_dir_arg, kwargs))
+        filesystem.semantic_retrieval_backend = SummaryBackend("doc_a")
+        return filesystem.semantic_retrieval_backend
+
+    monkeypatch.setattr(
+        filesystem,
+        "configure_hybrid_projection_retrieval",
+        fake_configure,
+    )
+
+    assert filesystem.configure_existing_projection_retrieval() is True
+    assert calls == [
+        (
+            filesystem.summary_projection_index_dir,
+            {
+                "embedding_provider": "current-provider",
+                "embedding_model": "current-embedding",
+                "embedding_dimensions": 3,
+                "embedding_timeout": 12,
+            },
+        )
+    ]
+    assert filesystem.semantic_retrieval_channels() == ("summary",)
+
+
+def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval(
+    tmp_path, monkeypatch
+):
    from pageindex.filesystem import PageIndexFileSystem
    from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex

@@ -816,32 +884,24 @@ def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path
        ]
    )
    filesystem = PageIndexFileSystem(workspace)
-    calls = []

-    def fake_configure(index_dir_arg, **kwargs):
-        calls.append((index_dir_arg, kwargs))
-        filesystem.semantic_retrieval_backend = SummaryBackend("doc_a")
-        return filesystem.semantic_retrieval_backend
+    def fail_configure(*args, **kwargs):
+        raise AssertionError("retrieval backend should not be configured on dimension mismatch")

    monkeypatch.setattr(
        filesystem,
        "configure_hybrid_projection_retrieval",
-        fake_configure,
+        fail_configure,
    )

-    assert filesystem.configure_existing_projection_retrieval() is True
-    assert calls == [
-        (
-            filesystem.summary_projection_index_dir,
-            {
-                "embedding_provider": "openai",
-                "embedding_model": "test-embedding",
-                "embedding_dimensions": 3,
-                "embedding_timeout": 60,
-            },
-        )
-    ]
-    assert filesystem.semantic_retrieval_channels() == ("summary",)
+    with pytest.raises(
+        RuntimeError,
+        match=(
+            "summary projection index dimension mismatch: .*"
+            "dimension 3.*summary_projection_embedding_dimensions is 1024.*Rebuild"
+        ),
+    ):
+        filesystem.configure_existing_projection_retrieval()


 def test_default_semantic_search_uses_summary_projection_when_only_summary_available(tmp_path):
--- a/tests/test_pifs_cli.py
+++ b/tests/test_pifs_cli.py
@@ -1,6 +1,10 @@
+import builtins
 import os
+import sys
 from pathlib import Path

+import pytest
+

 class FakeFileSystem:
    def __init__(self, workspace):
@@ -25,6 +29,71 @@ def test_cli_workspace_configures_existing_projection_retrieval(monkeypatch, tmp
    assert filesystem.projection_retrieval_configured is True


+def test_cli_workspace_without_projection_index_does_not_require_sqlite_vec(
+    monkeypatch, tmp_path
+):
+    from pageindex.filesystem import cli
+
+    workspace = tmp_path / "workspace"
+    real_import = builtins.__import__
+
+    monkeypatch.delitem(sys.modules, "pageindex.filesystem.hybrid_projection", raising=False)
+    monkeypatch.delitem(sys.modules, "pageindex.filesystem.semantic_index", raising=False)
+    monkeypatch.delitem(sys.modules, "sqlite_vec", raising=False)
+
+    def block_sqlite_vec(name, globals=None, locals=None, fromlist=(), level=0):
+        if name.split(".", 1)[0] == "sqlite_vec":
+            raise ModuleNotFoundError("No module named 'sqlite_vec'", name="sqlite_vec")
+        return real_import(name, globals, locals, fromlist, level)
+
+    monkeypatch.setattr(builtins, "__import__", block_sqlite_vec)
+
+    filesystem = cli._filesystem_from_workspace(str(workspace))
+
+    assert filesystem.workspace == workspace
+    assert filesystem.semantic_retrieval_channels() == ()
+
+
+def test_cli_workspace_surfaces_projection_dimension_mismatch(tmp_path):
+    from pageindex.filesystem import cli
+    from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex
+
+    workspace = tmp_path / "workspace"
+    index_dir = workspace / "artifacts" / "projection_indexes"
+    summary_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
+    summary_index.reset(
+        dimension=3,
+        metadata={
+            "channel": "summary",
+            "embedding_provider": "test",
+            "embedding_model": "fake",
+            "embedding_dimensions": 3,
+        },
+    )
+    summary_index.upsert_many(
+        [
+            SemanticIndexRecord(
+                file_ref="file_a",
+                external_id="doc_a",
+                source_type="documents",
+                source_path="documents/a.pdf",
+                title="A",
+                text="summary",
+                vector=[1.0, 0.0, 0.0],
+            )
+        ]
+    )
+
+    with pytest.raises(
+        RuntimeError,
+        match=(
+            "summary projection index dimension mismatch: .*"
+            "dimension 3.*summary_projection_embedding_dimensions is 1024.*Rebuild"
+        ),
+    ):
+        cli._filesystem_from_workspace(str(workspace))
+
+
 def test_cli_passthrough_invokes_pifs_command_executor(monkeypatch, capsys, tmp_path):
    from pageindex.filesystem import cli

--- a/tests/test_semantic_index.py
+++ b/tests/test_semantic_index.py
@@ -13,6 +13,14 @@ from pageindex.filesystem.semantic_index import (
 )


+class FixedDimensionEmbedder:
+    def __init__(self, dimensions: int):
+        self.dimensions = dimensions
+
+    def embed(self, texts):
+        return [[1.0, *([0.0] * (self.dimensions - 1))] for _ in texts]
+
+
 def test_sqlite_vec_semantic_index_round_trip(tmp_path):
    index = SQLiteVecSemanticIndex(tmp_path / "semantic.sqlite")
    index.reset(dimension=3, metadata={"field_mode": "summary"})
@@ -96,13 +104,9 @@ def test_sqlite_vec_semantic_index_file_ref_filter_not_limited_by_global_rank(tm
 def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer

-    class FakeEmbedder:
-        def embed(self, texts):
-            return [[1.0, 0.0, 0.0] for _ in texts]
-
    indexer = SummaryProjectionIndexer(
        tmp_path / "projection",
-        embedder=FakeEmbedder(),
+        embedder=FixedDimensionEmbedder(3),
        embedding_provider="test",
        embedding_model="fake",
        embedding_dimensions=3,
@@ -129,12 +133,159 @@ def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
    assert hits[0].metadata["department"] == "ops"


-def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path):
+def test_summary_projection_indexer_defaults_to_1024_dimensions(tmp_path):
    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer

-    class FakeEmbedder:
+    indexer = SummaryProjectionIndexer(
+        tmp_path / "projection",
+        embedder=FixedDimensionEmbedder(1024),
+        embedding_provider="test",
+        embedding_model="fake",
+    )
+
+    info = indexer.index.info()
+
+    assert info["dimension"] == 1024
+    assert info["metadata"]["embedding_dimensions"] == 1024
+
+    result = indexer.upsert_summary(
+        {
+            "file_ref": "file_a",
+            "external_id": "doc_a",
+            "source_type": "documents",
+            "source_path": "docs/a.pdf",
+            "title": "A",
+            "metadata": {"summary": "Default dimension summary."},
+        }
+    )
+
+    assert result["status"] == "ready"
+    assert result["embedding_dimensions"] == 1024
+
+
+def test_summary_projection_indexer_allows_explicit_256_dimensions(tmp_path):
+    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
+
+    indexer = SummaryProjectionIndexer(
+        tmp_path / "projection",
+        embedder=FixedDimensionEmbedder(256),
+        embedding_provider="test",
+        embedding_model="fake",
+        embedding_dimensions=256,
+    )
+
+    assert indexer.index.info()["dimension"] == 256
+    assert indexer.upsert_summary(
+        {
+            "file_ref": "file_a",
+            "external_id": "doc_a",
+            "source_type": "documents",
+            "source_path": "docs/a.pdf",
+            "title": "A",
+            "metadata": {"summary": "Explicit 256 dimension summary."},
+        }
+    )["status"] == "ready"
+
+
+def test_summary_projection_default_rejects_existing_256_index_for_writes(tmp_path):
+    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
+
+    index_dir = tmp_path / "projection"
+    index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
+    index.reset(
+        dimension=256,
+        metadata={
+            "channel": "summary",
+            "embedding_provider": "test",
+            "embedding_model": "fake",
+            "embedding_dimensions": 256,
+        },
+    )
+
+    with pytest.raises(RuntimeError, match="configured embedding_dimensions is 1024"):
+        SummaryProjectionIndexer(
+            index_dir,
+            embedder=FixedDimensionEmbedder(1024),
+            embedding_provider="test",
+            embedding_model="fake",
+        )
+
+    assert SQLiteVecSemanticIndex(index.db_path).info()["dimension"] == 256
+
+
+def test_summary_projection_from_provider_rejects_dimension_mismatch_before_embedder(
+    tmp_path, monkeypatch
+):
+    from pageindex.filesystem import projection_indexing
+    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
+
+    index_dir = tmp_path / "projection"
+    index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
+    index.reset(
+        dimension=256,
+        metadata={
+            "channel": "summary",
+            "embedding_provider": "openai",
+            "embedding_model": "text-embedding-3-small",
+            "embedding_dimensions": 256,
+        },
+    )
+
+    def fail_make_embedder(*args, **kwargs):
+        raise AssertionError("embedder should not be constructed before dimension validation")
+
+    monkeypatch.setattr(projection_indexing, "make_embedder", fail_make_embedder)
+
+    with pytest.raises(RuntimeError, match="configured embedding_dimensions is 1024"):
+        SummaryProjectionIndexer.from_provider(index_dir)
+
+
+def test_embedding_cache_key_separates_model_dimensions(tmp_path):
+    from pageindex.filesystem.hybrid_projection import (
+        EmbeddingCache,
+        embedding_cache_model_key,
+    )
+
+    class CountingEmbedder:
+        def __init__(self, dimensions: int):
+            self.dimensions = dimensions
+            self.calls = 0
+
        def embed(self, texts):
-            return [[1.0, 0.0, 0.0, 0.0] for _ in texts]
+            self.calls += 1
+            return [[float(self.dimensions), *([0.0] * (self.dimensions - 1))] for _ in texts]
+
+    cache = EmbeddingCache(tmp_path / "cache.sqlite")
+    embedder_256 = CountingEmbedder(256)
+    embedder_1024 = CountingEmbedder(1024)
+    key_256 = embedding_cache_model_key("fake", 256)
+    key_1024 = embedding_cache_model_key("fake", 1024)
+
+    assert key_256 != key_1024
+
+    vector_256 = cache.embed_texts(
+        ["same text"],
+        provider="test",
+        model=key_256,
+        embedder=embedder_256,
+        batch_size=1,
+    )[0]
+    vector_1024 = cache.embed_texts(
+        ["same text"],
+        provider="test",
+        model=key_1024,
+        embedder=embedder_1024,
+        batch_size=1,
+    )[0]
+
+    assert len(vector_256) == 256
+    assert len(vector_1024) == 1024
+    assert embedder_256.calls == 1
+    assert embedder_1024.calls == 1
+
+
+def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path):
+    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer

    index_dir = tmp_path / "projection"
    index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
@@ -164,7 +315,7 @@ def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path
    with pytest.raises(RuntimeError, match="summary projection index dimension mismatch"):
        SummaryProjectionIndexer(
            index_dir,
-            embedder=FakeEmbedder(),
+            embedder=FixedDimensionEmbedder(4),
            embedding_provider="test",
            embedding_model="fake",
            embedding_dimensions=4,