mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-12 19:55:17 +02:00
Merge Goal 6: default embeddings to 1024 dimensions
Merge embedding dimension defaults and mismatch guards into feat/pageindex-filesystem.
This commit is contained in:
commit
01af0c6a22
9 changed files with 395 additions and 67 deletions
|
|
@ -42,6 +42,7 @@ os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "true")
|
|||
from pageindex import PageIndexClient
|
||||
from pageindex.filesystem import MetadataGenerator, PageIndexFileSystem, PIFSCommandExecutor
|
||||
from pageindex.filesystem.agent import run_pifs_agent
|
||||
from pageindex.filesystem.embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS
|
||||
|
||||
|
||||
EXAMPLES_DIR = Path(__file__).parent
|
||||
|
|
@ -149,7 +150,11 @@ def parse_args() -> argparse.Namespace:
|
|||
default=os.environ.get("PIFS_DEMO_EMBEDDING_MODEL", "text-embedding-3-small"),
|
||||
help="Embedding model used for register-time summary projection.",
|
||||
)
|
||||
parser.add_argument("--embedding-dimensions", type=int, default=256)
|
||||
parser.add_argument(
|
||||
"--embedding-dimensions",
|
||||
type=int,
|
||||
default=DEFAULT_EMBEDDING_DIMENSIONS,
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -162,8 +162,7 @@ def _parse_agent_command(
|
|||
|
||||
def _filesystem_from_workspace(workspace: str) -> PageIndexFileSystem:
|
||||
filesystem = PageIndexFileSystem(Path(workspace).expanduser())
|
||||
with contextlib.suppress(Exception):
|
||||
filesystem.configure_existing_projection_retrieval()
|
||||
filesystem.configure_existing_projection_retrieval()
|
||||
return filesystem
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ from .metadata_generation import (
|
|||
MetadataGenerationResult,
|
||||
MetadataGenerator,
|
||||
)
|
||||
from .embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS
|
||||
from .semantic_folder_policy import (
|
||||
SEMANTIC_FOLDER_BASE_FIELDS,
|
||||
SEMANTIC_FOLDER_ROOT,
|
||||
|
|
@ -76,6 +77,11 @@ PROJECTION_INDEX_STATUSES = {
|
|||
}
|
||||
|
||||
SEMANTIC_RETRIEVAL_CHANNELS = ("summary", "entity", "relation")
|
||||
SEMANTIC_PROJECTION_INDEX_NAMES = {
|
||||
"summary": "summary_only_vector",
|
||||
"entity": "entity_vectors",
|
||||
"relation": "relation_vectors",
|
||||
}
|
||||
PAGEINDEX_DOCUMENT_SUFFIXES = {".pdf", ".md", ".markdown"}
|
||||
PAGEINDEX_DOCUMENT_CONTENT_TYPES = {
|
||||
"application/pdf",
|
||||
|
|
@ -103,7 +109,7 @@ class PageIndexFileSystem:
|
|||
summary_projection_index_dir: Union[str, Path, None] = None,
|
||||
summary_projection_embedding_provider: str = "openai",
|
||||
summary_projection_embedding_model: str = "text-embedding-3-small",
|
||||
summary_projection_embedding_dimensions: int = 256,
|
||||
summary_projection_embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
|
||||
summary_projection_embedding_timeout: float = 60,
|
||||
):
|
||||
self.workspace = Path(workspace).expanduser()
|
||||
|
|
@ -248,48 +254,43 @@ class PageIndexFileSystem:
|
|||
"""Attach semantic retrieval to already-built projection indexes.
|
||||
|
||||
Register-time generation owns building the index files. Opening an
|
||||
existing workspace should still expose semantic browse, without forcing
|
||||
a re-register step.
|
||||
existing workspace should still expose semantic retrieval when the
|
||||
configured embedding dimensions match the existing index.
|
||||
"""
|
||||
if self.semantic_retrieval_backend is not None:
|
||||
return bool(self.semantic_retrieval_channels())
|
||||
index_config = self._existing_projection_index_config()
|
||||
if index_config is None:
|
||||
return False
|
||||
metadata = dict(index_config.get("metadata") or {})
|
||||
embedding_provider = str(
|
||||
metadata.get("embedding_provider")
|
||||
or self.summary_projection_embedding_provider
|
||||
)
|
||||
embedding_model = str(
|
||||
metadata.get("embedding_model")
|
||||
or self.summary_projection_embedding_model
|
||||
)
|
||||
embedding_dimensions = int(
|
||||
metadata.get("embedding_dimensions")
|
||||
or index_config.get("dimension")
|
||||
or self.summary_projection_embedding_dimensions
|
||||
)
|
||||
existing_dimension = int(index_config.get("dimension") or 0)
|
||||
if existing_dimension != self.summary_projection_embedding_dimensions:
|
||||
raise RuntimeError(
|
||||
"summary projection index dimension mismatch: "
|
||||
f"{index_config.get('db_path') or self.summary_projection_index_dir} "
|
||||
f"was built with dimension {existing_dimension}, but configured "
|
||||
"summary_projection_embedding_dimensions is "
|
||||
f"{self.summary_projection_embedding_dimensions}. Rebuild the "
|
||||
"projection index or use a matching embedding configuration."
|
||||
)
|
||||
self.configure_hybrid_projection_retrieval(
|
||||
self.summary_projection_index_dir,
|
||||
embedding_provider=embedding_provider,
|
||||
embedding_model=embedding_model,
|
||||
embedding_dimensions=embedding_dimensions,
|
||||
embedding_provider=self.summary_projection_embedding_provider,
|
||||
embedding_model=self.summary_projection_embedding_model,
|
||||
embedding_dimensions=self.summary_projection_embedding_dimensions,
|
||||
embedding_timeout=self.summary_projection_embedding_timeout,
|
||||
)
|
||||
return bool(self.semantic_retrieval_channels())
|
||||
|
||||
def _existing_projection_index_config(self) -> dict[str, Any] | None:
|
||||
from .hybrid_projection import INDEX_BY_CHANNEL
|
||||
from .semantic_index import SQLiteVecSemanticIndex
|
||||
|
||||
for channel in SEMANTIC_RETRIEVAL_CHANNELS:
|
||||
index_name = INDEX_BY_CHANNEL.get(channel)
|
||||
index_name = SEMANTIC_PROJECTION_INDEX_NAMES.get(channel)
|
||||
if not index_name:
|
||||
continue
|
||||
index_path = self.summary_projection_index_dir / f"{index_name}.sqlite"
|
||||
if not index_path.exists():
|
||||
continue
|
||||
from .semantic_index import SQLiteVecSemanticIndex
|
||||
|
||||
try:
|
||||
info = SQLiteVecSemanticIndex(index_path).info()
|
||||
except Exception:
|
||||
|
|
@ -656,7 +657,7 @@ class PageIndexFileSystem:
|
|||
*,
|
||||
embedding_provider: str = "openai",
|
||||
embedding_model: str = "text-embedding-3-small",
|
||||
embedding_dimensions: int = 256,
|
||||
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
|
||||
embedding_timeout: float = 60,
|
||||
per_channel_limit: int = 100,
|
||||
fetch_multiplier: int = 100,
|
||||
|
|
|
|||
1
pageindex/filesystem/embedding_defaults.py
Normal file
1
pageindex/filesystem/embedding_defaults.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
# Default embedding vector size used as the fallback for the
# ``embedding_dimensions`` / ``summary_projection_embedding_dimensions``
# parameters across the filesystem package. Callers can still pass another
# value (for example 256) explicitly.
DEFAULT_EMBEDDING_DIMENSIONS = 1024
|
||||
|
|
@ -10,6 +10,7 @@ from dataclasses import dataclass
|
|||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS
|
||||
from .semantic_index import SQLiteVecSemanticIndex, SemanticIndexError, SemanticSearchResult
|
||||
|
||||
|
||||
|
|
@ -65,7 +66,7 @@ class HybridProjectionSearchBackend:
|
|||
embedder: Any,
|
||||
embedding_provider: str,
|
||||
embedding_model: str,
|
||||
embedding_dimensions: int = 256,
|
||||
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
|
||||
embedding_cache_path: str | Path | None = None,
|
||||
per_channel_limit: int = 100,
|
||||
fetch_multiplier: int = 100,
|
||||
|
|
@ -95,7 +96,7 @@ class HybridProjectionSearchBackend:
|
|||
*,
|
||||
embedding_provider: str = "openai",
|
||||
embedding_model: str = "text-embedding-3-small",
|
||||
embedding_dimensions: int = 256,
|
||||
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
|
||||
embedding_timeout: float = 60,
|
||||
**kwargs: Any,
|
||||
) -> "HybridProjectionSearchBackend":
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ from __future__ import annotations
|
|||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS
|
||||
from .hybrid_projection import (
|
||||
EmbeddingCache,
|
||||
INDEX_BY_CHANNEL,
|
||||
|
|
@ -22,7 +23,7 @@ class SummaryProjectionIndexer:
|
|||
embedder: Any,
|
||||
embedding_provider: str,
|
||||
embedding_model: str,
|
||||
embedding_dimensions: int = 256,
|
||||
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
|
||||
embedding_cache_path: str | Path | None = None,
|
||||
) -> None:
|
||||
self.index_dir = Path(index_dir).expanduser()
|
||||
|
|
@ -49,10 +50,11 @@ class SummaryProjectionIndexer:
|
|||
*,
|
||||
embedding_provider: str = "openai",
|
||||
embedding_model: str = "text-embedding-3-small",
|
||||
embedding_dimensions: int = 256,
|
||||
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
|
||||
embedding_timeout: float = 60,
|
||||
**kwargs: Any,
|
||||
) -> "SummaryProjectionIndexer":
|
||||
cls._validate_existing_index_dimension(index_dir, embedding_dimensions)
|
||||
return cls(
|
||||
index_dir,
|
||||
embedder=make_embedder(
|
||||
|
|
@ -118,12 +120,10 @@ class SummaryProjectionIndexer:
|
|||
"aside or rebuild it intentionally before changing embedding config."
|
||||
) from exc
|
||||
if existing_dimension != self.embedding_dimensions:
|
||||
raise RuntimeError(
|
||||
"summary projection index dimension mismatch: "
|
||||
f"{self.index.db_path} was built with dimension {existing_dimension}, "
|
||||
f"but configured embedding_dimensions is {self.embedding_dimensions}. "
|
||||
"Use the matching embedding config, or rebuild the projection index "
|
||||
"at a new path after preserving the existing data."
|
||||
raise self._dimension_mismatch_error(
|
||||
self.index.db_path,
|
||||
existing_dimension,
|
||||
self.embedding_dimensions,
|
||||
)
|
||||
|
||||
def _index_metadata(self) -> dict[str, Any]:
|
||||
|
|
@ -133,3 +133,44 @@ class SummaryProjectionIndexer:
|
|||
"embedding_model": self.embedding_model,
|
||||
"embedding_dimensions": self.embedding_dimensions,
|
||||
}
|
||||
|
||||
@classmethod
def _validate_existing_index_dimension(
    cls,
    index_dir: str | Path,
    embedding_dimensions: int,
) -> None:
    """Check an on-disk summary index against the requested dimension.

    Does nothing when the summary index file is absent under ``index_dir``.

    Raises:
        RuntimeError: if the existing index's dimension cannot be read, or
            if it differs from ``embedding_dimensions``.
    """
    index_root = Path(index_dir).expanduser()
    index_path = index_root / f"{INDEX_BY_CHANNEL['summary']}.sqlite"
    if index_path.exists():
        stored_index = SQLiteVecSemanticIndex(index_path)
        try:
            stored_dimension = stored_index.dimension()
        except Exception as exc:
            # An unreadable index is reported rather than silently replaced.
            unreadable_message = (
                "could not validate existing summary projection index config; "
                f"refusing to reset {index_path}. Move the existing index "
                "aside or rebuild it intentionally before changing embedding config."
            )
            raise RuntimeError(unreadable_message) from exc
        if stored_dimension != embedding_dimensions:
            raise cls._dimension_mismatch_error(
                index_path, stored_dimension, embedding_dimensions
            )
|
||||
|
||||
@staticmethod
|
||||
def _dimension_mismatch_error(
|
||||
index_path: Path,
|
||||
existing_dimension: int,
|
||||
embedding_dimensions: int,
|
||||
) -> RuntimeError:
|
||||
return RuntimeError(
|
||||
"summary projection index dimension mismatch: "
|
||||
f"{index_path} was built with dimension {existing_dimension}, "
|
||||
f"but configured embedding_dimensions is {embedding_dimensions}. "
|
||||
"Use the matching embedding config, or rebuild the projection index "
|
||||
"at a new path after preserving the existing data."
|
||||
)
|
||||
|
|
|
|||
|
|
@ -786,7 +786,75 @@ def test_grep_source_file_requires_terms_on_same_line(tmp_path):
|
|||
assert "alpha evidence" in matched["data"]["data"][0]["text"]
|
||||
|
||||
|
||||
def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path, monkeypatch):
|
||||
def test_existing_summary_projection_index_uses_current_config_when_dimensions_match(
    tmp_path, monkeypatch
):
    """Matching dimensions attach retrieval using the filesystem's own config.

    The seeded index metadata names a stale provider and model; the patched
    ``configure_hybrid_projection_retrieval`` must instead receive the
    provider, model, dimensions and timeout given to ``PageIndexFileSystem``.
    """
    from pageindex.filesystem import PageIndexFileSystem
    from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex

    workspace = tmp_path / "workspace"
    projection_dir = workspace / "artifacts" / "projection_indexes"

    # Seed a 3-dimensional summary index whose metadata disagrees with the
    # provider/model configured on the filesystem below.
    stale_metadata = {
        "channel": "summary",
        "embedding_provider": "stale-provider",
        "embedding_model": "stale-embedding",
        "embedding_dimensions": 3,
    }
    seeded_index = SQLiteVecSemanticIndex(projection_dir / "summary_only_vector.sqlite")
    seeded_index.reset(dimension=3, metadata=stale_metadata)
    seeded_record = SemanticIndexRecord(
        file_ref="file_a",
        external_id="doc_a",
        source_type="documents",
        source_path="documents/a.pdf",
        title="A",
        text="summary",
        vector=[1.0, 0.0, 0.0],
    )
    seeded_index.upsert_many([seeded_record])

    filesystem = PageIndexFileSystem(
        workspace,
        summary_projection_embedding_provider="current-provider",
        summary_projection_embedding_model="current-embedding",
        summary_projection_embedding_dimensions=3,
        summary_projection_embedding_timeout=12,
    )
    recorded_calls = []

    def fake_configure(index_dir_arg, **kwargs):
        # Record the call and install a stub backend instead of a real one.
        recorded_calls.append((index_dir_arg, kwargs))
        filesystem.semantic_retrieval_backend = SummaryBackend("doc_a")
        return filesystem.semantic_retrieval_backend

    monkeypatch.setattr(
        filesystem, "configure_hybrid_projection_retrieval", fake_configure
    )

    expected_kwargs = {
        "embedding_provider": "current-provider",
        "embedding_model": "current-embedding",
        "embedding_dimensions": 3,
        "embedding_timeout": 12,
    }

    assert filesystem.configure_existing_projection_retrieval() is True
    assert recorded_calls == [
        (filesystem.summary_projection_index_dir, expected_kwargs)
    ]
    assert filesystem.semantic_retrieval_channels() == ("summary",)
|
||||
|
||||
|
||||
def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval(
|
||||
tmp_path, monkeypatch
|
||||
):
|
||||
from pageindex.filesystem import PageIndexFileSystem
|
||||
from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex
|
||||
|
||||
|
|
@ -816,32 +884,24 @@ def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path
|
|||
]
|
||||
)
|
||||
filesystem = PageIndexFileSystem(workspace)
|
||||
calls = []
|
||||
|
||||
def fake_configure(index_dir_arg, **kwargs):
|
||||
calls.append((index_dir_arg, kwargs))
|
||||
filesystem.semantic_retrieval_backend = SummaryBackend("doc_a")
|
||||
return filesystem.semantic_retrieval_backend
|
||||
def fail_configure(*args, **kwargs):
|
||||
raise AssertionError("retrieval backend should not be configured on dimension mismatch")
|
||||
|
||||
monkeypatch.setattr(
|
||||
filesystem,
|
||||
"configure_hybrid_projection_retrieval",
|
||||
fake_configure,
|
||||
fail_configure,
|
||||
)
|
||||
|
||||
assert filesystem.configure_existing_projection_retrieval() is True
|
||||
assert calls == [
|
||||
(
|
||||
filesystem.summary_projection_index_dir,
|
||||
{
|
||||
"embedding_provider": "openai",
|
||||
"embedding_model": "test-embedding",
|
||||
"embedding_dimensions": 3,
|
||||
"embedding_timeout": 60,
|
||||
},
|
||||
)
|
||||
]
|
||||
assert filesystem.semantic_retrieval_channels() == ("summary",)
|
||||
with pytest.raises(
|
||||
RuntimeError,
|
||||
match=(
|
||||
"summary projection index dimension mismatch: .*"
|
||||
"dimension 3.*summary_projection_embedding_dimensions is 1024.*Rebuild"
|
||||
),
|
||||
):
|
||||
filesystem.configure_existing_projection_retrieval()
|
||||
|
||||
|
||||
def test_default_semantic_search_uses_summary_projection_when_only_summary_available(tmp_path):
|
||||
|
|
|
|||
|
|
@ -1,6 +1,10 @@
|
|||
import builtins
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
class FakeFileSystem:
|
||||
def __init__(self, workspace):
|
||||
|
|
@ -25,6 +29,71 @@ def test_cli_workspace_configures_existing_projection_retrieval(monkeypatch, tmp
|
|||
assert filesystem.projection_retrieval_configured is True
|
||||
|
||||
|
||||
def test_cli_workspace_without_projection_index_does_not_require_sqlite_vec(
    monkeypatch, tmp_path
):
    """A workspace with no projection index opens even if sqlite_vec is missing.

    Cached modules are evicted and ``builtins.__import__`` is patched so any
    import of ``sqlite_vec`` raises ``ModuleNotFoundError``.
    """
    from pageindex.filesystem import cli

    workspace = tmp_path / "workspace"
    original_import = builtins.__import__

    # Evict cached modules so the import hook below is actually consulted.
    for cached_module in (
        "pageindex.filesystem.hybrid_projection",
        "pageindex.filesystem.semantic_index",
        "sqlite_vec",
    ):
        monkeypatch.delitem(sys.modules, cached_module, raising=False)

    def import_without_sqlite_vec(name, globals=None, locals=None, fromlist=(), level=0):
        top_level_package = name.split(".", 1)[0]
        if top_level_package != "sqlite_vec":
            return original_import(name, globals, locals, fromlist, level)
        raise ModuleNotFoundError("No module named 'sqlite_vec'", name="sqlite_vec")

    monkeypatch.setattr(builtins, "__import__", import_without_sqlite_vec)

    filesystem = cli._filesystem_from_workspace(str(workspace))

    assert filesystem.workspace == workspace
    assert filesystem.semantic_retrieval_channels() == ()
|
||||
|
||||
|
||||
def test_cli_workspace_surfaces_projection_dimension_mismatch(tmp_path):
    """The CLI workspace helper propagates the dimension-mismatch error.

    A 3-dimensional summary index is seeded, then the workspace is opened
    without overriding the embedding dimensions; the expected message reports
    the configured value as 1024.
    """
    from pageindex.filesystem import cli
    from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex

    workspace = tmp_path / "workspace"
    projection_dir = workspace / "artifacts" / "projection_indexes"

    index_metadata = {
        "channel": "summary",
        "embedding_provider": "test",
        "embedding_model": "fake",
        "embedding_dimensions": 3,
    }
    seeded_index = SQLiteVecSemanticIndex(projection_dir / "summary_only_vector.sqlite")
    seeded_index.reset(dimension=3, metadata=index_metadata)
    seeded_record = SemanticIndexRecord(
        file_ref="file_a",
        external_id="doc_a",
        source_type="documents",
        source_path="documents/a.pdf",
        title="A",
        text="summary",
        vector=[1.0, 0.0, 0.0],
    )
    seeded_index.upsert_many([seeded_record])

    expected_message = (
        "summary projection index dimension mismatch: .*"
        "dimension 3.*summary_projection_embedding_dimensions is 1024.*Rebuild"
    )
    with pytest.raises(RuntimeError, match=expected_message):
        cli._filesystem_from_workspace(str(workspace))
|
||||
|
||||
|
||||
def test_cli_passthrough_invokes_pifs_command_executor(monkeypatch, capsys, tmp_path):
|
||||
from pageindex.filesystem import cli
|
||||
|
||||
|
|
|
|||
|
|
@ -13,6 +13,14 @@ from pageindex.filesystem.semantic_index import (
|
|||
)
|
||||
|
||||
|
||||
class FixedDimensionEmbedder:
    """Deterministic test embedder that emits fixed-width vectors.

    Every input text maps to the same vector: ``1.0`` in the first position
    followed by zeros, ``dimensions`` entries in total.
    """

    def __init__(self, dimensions: int):
        """Store the vector width.

        Raises:
            ValueError: if ``dimensions`` is smaller than 1. Without this
                check such a value would still yield a 1-element vector, so
                the embedder would disagree with its own ``dimensions``.
        """
        if dimensions < 1:
            raise ValueError(f"dimensions must be >= 1, got {dimensions}")
        self.dimensions = dimensions

    def embed(self, texts):
        """Return one freshly built vector per item in ``texts``."""
        zero_tail = [0.0] * (self.dimensions - 1)
        # Build a new list per text so callers never share row objects.
        return [[1.0, *zero_tail] for _ in texts]
|
||||
|
||||
|
||||
def test_sqlite_vec_semantic_index_round_trip(tmp_path):
|
||||
index = SQLiteVecSemanticIndex(tmp_path / "semantic.sqlite")
|
||||
index.reset(dimension=3, metadata={"field_mode": "summary"})
|
||||
|
|
@ -96,13 +104,9 @@ def test_sqlite_vec_semantic_index_file_ref_filter_not_limited_by_global_rank(tm
|
|||
def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
|
||||
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
||||
|
||||
class FakeEmbedder:
|
||||
def embed(self, texts):
|
||||
return [[1.0, 0.0, 0.0] for _ in texts]
|
||||
|
||||
indexer = SummaryProjectionIndexer(
|
||||
tmp_path / "projection",
|
||||
embedder=FakeEmbedder(),
|
||||
embedder=FixedDimensionEmbedder(3),
|
||||
embedding_provider="test",
|
||||
embedding_model="fake",
|
||||
embedding_dimensions=3,
|
||||
|
|
@ -129,12 +133,159 @@ def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
|
|||
assert hits[0].metadata["department"] == "ops"
|
||||
|
||||
|
||||
def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path):
|
||||
def test_summary_projection_indexer_defaults_to_1024_dimensions(tmp_path):
|
||||
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
||||
|
||||
class FakeEmbedder:
|
||||
indexer = SummaryProjectionIndexer(
|
||||
tmp_path / "projection",
|
||||
embedder=FixedDimensionEmbedder(1024),
|
||||
embedding_provider="test",
|
||||
embedding_model="fake",
|
||||
)
|
||||
|
||||
info = indexer.index.info()
|
||||
|
||||
assert info["dimension"] == 1024
|
||||
assert info["metadata"]["embedding_dimensions"] == 1024
|
||||
|
||||
result = indexer.upsert_summary(
|
||||
{
|
||||
"file_ref": "file_a",
|
||||
"external_id": "doc_a",
|
||||
"source_type": "documents",
|
||||
"source_path": "docs/a.pdf",
|
||||
"title": "A",
|
||||
"metadata": {"summary": "Default dimension summary."},
|
||||
}
|
||||
)
|
||||
|
||||
assert result["status"] == "ready"
|
||||
assert result["embedding_dimensions"] == 1024
|
||||
|
||||
|
||||
def test_summary_projection_indexer_allows_explicit_256_dimensions(tmp_path):
    """An explicit ``embedding_dimensions=256`` builds and fills a 256-wide index."""
    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer

    embedder_256 = FixedDimensionEmbedder(256)
    indexer = SummaryProjectionIndexer(
        tmp_path / "projection",
        embedder=embedder_256,
        embedding_provider="test",
        embedding_model="fake",
        embedding_dimensions=256,
    )

    assert indexer.index.info()["dimension"] == 256

    summary_payload = {
        "file_ref": "file_a",
        "external_id": "doc_a",
        "source_type": "documents",
        "source_path": "docs/a.pdf",
        "title": "A",
        "metadata": {"summary": "Explicit 256 dimension summary."},
    }
    upsert_result = indexer.upsert_summary(summary_payload)
    assert upsert_result["status"] == "ready"
|
||||
|
||||
|
||||
def test_summary_projection_default_rejects_existing_256_index_for_writes(tmp_path):
    """Default dimensions must not take over an index built at 256.

    Constructing the indexer without ``embedding_dimensions`` against an
    existing 256-dimensional index raises, and the index still reports 256.
    """
    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer

    index_dir = tmp_path / "projection"
    legacy_metadata = {
        "channel": "summary",
        "embedding_provider": "test",
        "embedding_model": "fake",
        "embedding_dimensions": 256,
    }
    legacy_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
    legacy_index.reset(dimension=256, metadata=legacy_metadata)

    with pytest.raises(RuntimeError, match="configured embedding_dimensions is 1024"):
        SummaryProjectionIndexer(
            index_dir,
            embedder=FixedDimensionEmbedder(1024),
            embedding_provider="test",
            embedding_model="fake",
        )

    # Re-open the file to confirm the failed construction left it untouched.
    reopened_info = SQLiteVecSemanticIndex(legacy_index.db_path).info()
    assert reopened_info["dimension"] == 256
|
||||
|
||||
|
||||
def test_summary_projection_from_provider_rejects_dimension_mismatch_before_embedder(
    tmp_path, monkeypatch
):
    """``from_provider`` must fail on a mismatch before building an embedder.

    ``make_embedder`` is patched to raise ``AssertionError``, so the test only
    passes if the dimension check raises ``RuntimeError`` first.
    """
    from pageindex.filesystem import projection_indexing
    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer

    index_dir = tmp_path / "projection"
    legacy_metadata = {
        "channel": "summary",
        "embedding_provider": "openai",
        "embedding_model": "text-embedding-3-small",
        "embedding_dimensions": 256,
    }
    legacy_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
    legacy_index.reset(dimension=256, metadata=legacy_metadata)

    def embedder_must_not_be_built(*args, **kwargs):
        raise AssertionError("embedder should not be constructed before dimension validation")

    monkeypatch.setattr(projection_indexing, "make_embedder", embedder_must_not_be_built)

    with pytest.raises(RuntimeError, match="configured embedding_dimensions is 1024"):
        SummaryProjectionIndexer.from_provider(index_dir)
|
||||
|
||||
|
||||
def test_embedding_cache_key_separates_model_dimensions(tmp_path):
|
||||
from pageindex.filesystem.hybrid_projection import (
|
||||
EmbeddingCache,
|
||||
embedding_cache_model_key,
|
||||
)
|
||||
|
||||
class CountingEmbedder:
|
||||
def __init__(self, dimensions: int):
|
||||
self.dimensions = dimensions
|
||||
self.calls = 0
|
||||
|
||||
def embed(self, texts):
|
||||
return [[1.0, 0.0, 0.0, 0.0] for _ in texts]
|
||||
self.calls += 1
|
||||
return [[float(self.dimensions), *([0.0] * (self.dimensions - 1))] for _ in texts]
|
||||
|
||||
cache = EmbeddingCache(tmp_path / "cache.sqlite")
|
||||
embedder_256 = CountingEmbedder(256)
|
||||
embedder_1024 = CountingEmbedder(1024)
|
||||
key_256 = embedding_cache_model_key("fake", 256)
|
||||
key_1024 = embedding_cache_model_key("fake", 1024)
|
||||
|
||||
assert key_256 != key_1024
|
||||
|
||||
vector_256 = cache.embed_texts(
|
||||
["same text"],
|
||||
provider="test",
|
||||
model=key_256,
|
||||
embedder=embedder_256,
|
||||
batch_size=1,
|
||||
)[0]
|
||||
vector_1024 = cache.embed_texts(
|
||||
["same text"],
|
||||
provider="test",
|
||||
model=key_1024,
|
||||
embedder=embedder_1024,
|
||||
batch_size=1,
|
||||
)[0]
|
||||
|
||||
assert len(vector_256) == 256
|
||||
assert len(vector_1024) == 1024
|
||||
assert embedder_256.calls == 1
|
||||
assert embedder_1024.calls == 1
|
||||
|
||||
|
||||
def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path):
|
||||
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
||||
|
||||
index_dir = tmp_path / "projection"
|
||||
index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
|
||||
|
|
@ -164,7 +315,7 @@ def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path
|
|||
with pytest.raises(RuntimeError, match="summary projection index dimension mismatch"):
|
||||
SummaryProjectionIndexer(
|
||||
index_dir,
|
||||
embedder=FakeEmbedder(),
|
||||
embedder=FixedDimensionEmbedder(4),
|
||||
embedding_provider="test",
|
||||
embedding_model="fake",
|
||||
embedding_dimensions=4,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue