feat(filesystem): default embeddings to 1024 dimensions

This commit is contained in:
BukeLy 2026-05-31 17:15:38 +08:00
parent b5cc404776
commit 58409d1ec5
6 changed files with 282 additions and 22 deletions

View file

@ -42,6 +42,7 @@ os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "true")
from pageindex import PageIndexClient
from pageindex.filesystem import MetadataGenerator, PageIndexFileSystem, PIFSCommandExecutor
from pageindex.filesystem.agent import run_pifs_agent
from pageindex.filesystem.hybrid_projection import DEFAULT_EMBEDDING_DIMENSIONS
EXAMPLES_DIR = Path(__file__).parent
@ -149,7 +150,11 @@ def parse_args() -> argparse.Namespace:
default=os.environ.get("PIFS_DEMO_EMBEDDING_MODEL", "text-embedding-3-small"),
help="Embedding model used for register-time summary projection.",
)
parser.add_argument("--embedding-dimensions", type=int, default=256)
parser.add_argument(
"--embedding-dimensions",
type=int,
default=DEFAULT_EMBEDDING_DIMENSIONS,
)
return parser.parse_args()

View file

@ -22,6 +22,7 @@ from .semantic_folder_policy import (
is_semantic_folder_forbidden_field,
semantic_folder_allowed_extension_fields,
)
from .hybrid_projection import DEFAULT_EMBEDDING_DIMENSIONS
from .store import (
SQLiteFileSystemStore,
fingerprint,
@ -103,7 +104,7 @@ class PageIndexFileSystem:
summary_projection_index_dir: Union[str, Path, None] = None,
summary_projection_embedding_provider: str = "openai",
summary_projection_embedding_model: str = "text-embedding-3-small",
summary_projection_embedding_dimensions: int = 256,
summary_projection_embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
summary_projection_embedding_timeout: float = 60,
):
self.workspace = Path(workspace).expanduser()
@ -656,7 +657,7 @@ class PageIndexFileSystem:
*,
embedding_provider: str = "openai",
embedding_model: str = "text-embedding-3-small",
embedding_dimensions: int = 256,
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
embedding_timeout: float = 60,
per_channel_limit: int = 100,
fetch_multiplier: int = 100,

View file

@ -28,6 +28,7 @@ HYBRID_ENTITY_RELATION_WEIGHTS = {
"relation": 0.30,
"constraint": 0.20,
}
DEFAULT_EMBEDDING_DIMENSIONS = 1024
@dataclass(frozen=True)
@ -65,7 +66,7 @@ class HybridProjectionSearchBackend:
embedder: Any,
embedding_provider: str,
embedding_model: str,
embedding_dimensions: int = 256,
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
embedding_cache_path: str | Path | None = None,
per_channel_limit: int = 100,
fetch_multiplier: int = 100,
@ -95,7 +96,7 @@ class HybridProjectionSearchBackend:
*,
embedding_provider: str = "openai",
embedding_model: str = "text-embedding-3-small",
embedding_dimensions: int = 256,
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
embedding_timeout: float = 60,
**kwargs: Any,
) -> "HybridProjectionSearchBackend":

View file

@ -4,6 +4,7 @@ from pathlib import Path
from typing import Any
from .hybrid_projection import (
DEFAULT_EMBEDDING_DIMENSIONS,
EmbeddingCache,
INDEX_BY_CHANNEL,
embedding_cache_model_key,
@ -22,7 +23,7 @@ class SummaryProjectionIndexer:
embedder: Any,
embedding_provider: str,
embedding_model: str,
embedding_dimensions: int = 256,
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
embedding_cache_path: str | Path | None = None,
) -> None:
self.index_dir = Path(index_dir).expanduser()
@ -49,10 +50,11 @@ class SummaryProjectionIndexer:
*,
embedding_provider: str = "openai",
embedding_model: str = "text-embedding-3-small",
embedding_dimensions: int = 256,
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
embedding_timeout: float = 60,
**kwargs: Any,
) -> "SummaryProjectionIndexer":
cls._validate_existing_index_dimension(index_dir, embedding_dimensions)
return cls(
index_dir,
embedder=make_embedder(
@ -118,12 +120,10 @@ class SummaryProjectionIndexer:
"aside or rebuild it intentionally before changing embedding config."
) from exc
if existing_dimension != self.embedding_dimensions:
raise RuntimeError(
"summary projection index dimension mismatch: "
f"{self.index.db_path} was built with dimension {existing_dimension}, "
f"but configured embedding_dimensions is {self.embedding_dimensions}. "
"Use the matching embedding config, or rebuild the projection index "
"at a new path after preserving the existing data."
raise self._dimension_mismatch_error(
self.index.db_path,
existing_dimension,
self.embedding_dimensions,
)
def _index_metadata(self) -> dict[str, Any]:
@ -133,3 +133,44 @@ class SummaryProjectionIndexer:
"embedding_model": self.embedding_model,
"embedding_dimensions": self.embedding_dimensions,
}
@classmethod
def _validate_existing_index_dimension(
    cls,
    index_dir: str | Path,
    embedding_dimensions: int,
) -> None:
    """Check an on-disk summary index against the requested dimension.

    Looks for the summary channel's ``.sqlite`` file under ``index_dir``.
    Returns without doing anything when that file does not exist.

    Raises:
        RuntimeError: if the existing index's dimension cannot be read, or
            if it differs from ``embedding_dimensions``.
    """
    summary_file_name = f"{INDEX_BY_CHANNEL['summary']}.sqlite"
    index_path = Path(index_dir).expanduser() / summary_file_name
    if not index_path.exists():
        # Nothing on disk yet, so there is no prior configuration to clash with.
        return

    # Constructed outside the try block: only a failing dimension() lookup is
    # translated into the "could not validate" error.
    existing_index = SQLiteVecSemanticIndex(index_path)
    try:
        existing_dimension = existing_index.dimension()
    except Exception as exc:
        unreadable_message = (
            "could not validate existing summary projection index config; "
            f"refusing to reset {index_path}. Move the existing index "
            "aside or rebuild it intentionally before changing embedding config."
        )
        raise RuntimeError(unreadable_message) from exc

    if existing_dimension == embedding_dimensions:
        return
    raise cls._dimension_mismatch_error(
        index_path,
        existing_dimension,
        embedding_dimensions,
    )
@staticmethod
def _dimension_mismatch_error(
index_path: Path,
existing_dimension: int,
embedding_dimensions: int,
) -> RuntimeError:
return RuntimeError(
"summary projection index dimension mismatch: "
f"{index_path} was built with dimension {existing_dimension}, "
f"but configured embedding_dimensions is {embedding_dimensions}. "
"Use the matching embedding config, or rebuild the projection index "
"at a new path after preserving the existing data."
)

View file

@ -844,6 +844,67 @@ def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path
assert filesystem.semantic_retrieval_channels() == ("summary",)
def test_existing_256_summary_projection_index_uses_metadata_dimension_with_new_default(
    tmp_path, monkeypatch
):
    """A pre-existing 256-dim index is reopened at 256 despite the 1024 default.

    The filesystem is created with default settings (1024), yet retrieval
    configuration for the existing index must be invoked with 256, and the
    index on disk must still report dimension 256 afterwards.
    """
    from pageindex.filesystem import PageIndexFileSystem
    from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex

    workspace = tmp_path / "workspace"
    index_dir = workspace / "artifacts" / "projection_indexes"

    # Seed a summary index built with the old 256-dimension configuration.
    summary_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
    legacy_metadata = {
        "channel": "summary",
        "embedding_provider": "openai",
        "embedding_model": "text-embedding-3-small",
        "embedding_dimensions": 256,
    }
    summary_index.reset(dimension=256, metadata=legacy_metadata)
    seeded_record = SemanticIndexRecord(
        file_ref="file_a",
        external_id="doc_a",
        source_type="documents",
        source_path="documents/a.pdf",
        title="A",
        text="summary",
        vector=[1.0] + [0.0] * 255,
    )
    summary_index.upsert_many([seeded_record])

    filesystem = PageIndexFileSystem(workspace)
    calls = []

    def fake_configure(index_dir_arg, **kwargs):
        # Record the arguments instead of building a real retrieval backend.
        calls.append((index_dir_arg, kwargs))
        filesystem.semantic_retrieval_backend = SummaryBackend("doc_a")
        return filesystem.semantic_retrieval_backend

    monkeypatch.setattr(
        filesystem,
        "configure_hybrid_projection_retrieval",
        fake_configure,
    )

    assert filesystem.summary_projection_embedding_dimensions == 1024
    assert filesystem.configure_existing_projection_retrieval() is True

    expected_call = (
        filesystem.summary_projection_index_dir,
        {
            "embedding_provider": "openai",
            "embedding_model": "text-embedding-3-small",
            "embedding_dimensions": 256,
            "embedding_timeout": 60,
        },
    )
    assert calls == [expected_call]
    assert summary_index.info()["dimension"] == 256
def test_default_semantic_search_uses_summary_projection_when_only_summary_available(tmp_path):
from pageindex.filesystem import PageIndexFileSystem
from pageindex.filesystem.hybrid_projection import HybridProjectionSearchBackend

View file

@ -13,6 +13,14 @@ from pageindex.filesystem.semantic_index import (
)
class FixedDimensionEmbedder:
    """Deterministic test embedder that emits vectors of a fixed length.

    Every text maps to the same vector: ``1.0`` in the first slot and ``0.0``
    in every remaining slot.
    """

    def __init__(self, dimensions: int):
        """Remember the vector length to produce for each embedded text."""
        self.dimensions = dimensions

    def embed(self, texts):
        """Return one freshly built vector of ``self.dimensions`` floats per text.

        A non-positive ``dimensions`` yields empty vectors, so the length of
        each vector never exceeds the configured size.
        """
        size = max(self.dimensions, 0)
        # The trailing slice trims the leading 1.0 when size is 0; without it
        # the result would be a 1-element vector for a 0-dimension embedder.
        return [[1.0, *([0.0] * (size - 1))][:size] for _ in texts]
def test_sqlite_vec_semantic_index_round_trip(tmp_path):
index = SQLiteVecSemanticIndex(tmp_path / "semantic.sqlite")
index.reset(dimension=3, metadata={"field_mode": "summary"})
@ -96,13 +104,9 @@ def test_sqlite_vec_semantic_index_file_ref_filter_not_limited_by_global_rank(tm
def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
class FakeEmbedder:
def embed(self, texts):
return [[1.0, 0.0, 0.0] for _ in texts]
indexer = SummaryProjectionIndexer(
tmp_path / "projection",
embedder=FakeEmbedder(),
embedder=FixedDimensionEmbedder(3),
embedding_provider="test",
embedding_model="fake",
embedding_dimensions=3,
@ -129,12 +133,159 @@ def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
assert hits[0].metadata["department"] == "ops"
def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path):
def test_summary_projection_indexer_defaults_to_1024_dimensions(tmp_path):
    """An indexer built without ``embedding_dimensions`` works at 1024.

    Checks the dimension reported by the index, the dimension stored in its
    metadata, and the dimension reported by a summary upsert.
    """
    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer

    # embedding_dimensions is deliberately omitted: the default is under test.
    indexer = SummaryProjectionIndexer(
        tmp_path / "projection",
        embedder=FixedDimensionEmbedder(1024),
        embedding_provider="test",
        embedding_model="fake",
    )

    info = indexer.index.info()
    assert info["dimension"] == 1024
    assert info["metadata"]["embedding_dimensions"] == 1024

    result = indexer.upsert_summary(
        {
            "file_ref": "file_a",
            "external_id": "doc_a",
            "source_type": "documents",
            "source_path": "docs/a.pdf",
            "title": "A",
            "metadata": {"summary": "Default dimension summary."},
        }
    )
    assert result["status"] == "ready"
    assert result["embedding_dimensions"] == 1024
def test_summary_projection_indexer_allows_explicit_256_dimensions(tmp_path):
    """Passing ``embedding_dimensions=256`` explicitly still works."""
    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer

    projection_dir = tmp_path / "projection"
    indexer = SummaryProjectionIndexer(
        projection_dir,
        embedder=FixedDimensionEmbedder(256),
        embedding_provider="test",
        embedding_model="fake",
        embedding_dimensions=256,
    )
    assert indexer.index.info()["dimension"] == 256

    summary_entry = {
        "file_ref": "file_a",
        "external_id": "doc_a",
        "source_type": "documents",
        "source_path": "docs/a.pdf",
        "title": "A",
        "metadata": {"summary": "Explicit 256 dimension summary."},
    }
    outcome = indexer.upsert_summary(summary_entry)
    assert outcome["status"] == "ready"
def test_summary_projection_default_rejects_existing_256_index_for_writes(tmp_path):
    """A default-configured indexer refuses to open an existing 256-dim index.

    Construction must raise a ``RuntimeError`` that names 1024 as the
    configured dimension, and the index must still report 256 afterwards.
    """
    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer

    index_dir = tmp_path / "projection"
    index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
    existing_metadata = {
        "channel": "summary",
        "embedding_provider": "test",
        "embedding_model": "fake",
        "embedding_dimensions": 256,
    }
    index.reset(dimension=256, metadata=existing_metadata)

    expected_message = "configured embedding_dimensions is 1024"
    with pytest.raises(RuntimeError, match=expected_message):
        SummaryProjectionIndexer(
            index_dir,
            embedder=FixedDimensionEmbedder(1024),
            embedding_provider="test",
            embedding_model="fake",
        )

    # Re-open the same database to confirm the failed attempt left it at 256.
    reopened = SQLiteVecSemanticIndex(index.db_path)
    assert reopened.info()["dimension"] == 256
def test_summary_projection_from_provider_rejects_dimension_mismatch_before_embedder(
    tmp_path, monkeypatch
):
    """``from_provider`` reports a dimension mismatch before building an embedder.

    ``make_embedder`` is replaced with a function that raises
    ``AssertionError``; the test expects the ``RuntimeError`` about the
    configured dimension instead, so the replacement must never be reached.
    """
    from pageindex.filesystem import projection_indexing
    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer

    index_dir = tmp_path / "projection"
    index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
    existing_metadata = {
        "channel": "summary",
        "embedding_provider": "openai",
        "embedding_model": "text-embedding-3-small",
        "embedding_dimensions": 256,
    }
    index.reset(dimension=256, metadata=existing_metadata)

    def embedder_factory_must_not_run(*args, **kwargs):
        raise AssertionError("embedder should not be constructed before dimension validation")

    monkeypatch.setattr(projection_indexing, "make_embedder", embedder_factory_must_not_run)

    expected_message = "configured embedding_dimensions is 1024"
    with pytest.raises(RuntimeError, match=expected_message):
        SummaryProjectionIndexer.from_provider(index_dir)
def test_embedding_cache_key_separates_model_dimensions(tmp_path):
    """Cache keys for 256 and 1024 dimensions must not collide.

    The same text is embedded under both keys. Each embedder must be called
    exactly once, and each returned vector must have its own embedder's
    length.
    """
    from pageindex.filesystem.hybrid_projection import (
        EmbeddingCache,
        embedding_cache_model_key,
    )

    class CountingEmbedder:
        """Embedder that counts ``embed`` calls and emits dimension-sized vectors."""

        def __init__(self, dimensions: int):
            self.dimensions = dimensions
            self.calls = 0

        def embed(self, texts):
            # Count first, then return: an earlier fixed-size return here
            # would leave the counter at zero and every vector at length 4.
            self.calls += 1
            return [[float(self.dimensions), *([0.0] * (self.dimensions - 1))] for _ in texts]

    cache = EmbeddingCache(tmp_path / "cache.sqlite")
    embedder_256 = CountingEmbedder(256)
    embedder_1024 = CountingEmbedder(1024)
    key_256 = embedding_cache_model_key("fake", 256)
    key_1024 = embedding_cache_model_key("fake", 1024)
    assert key_256 != key_1024

    vector_256 = cache.embed_texts(
        ["same text"],
        provider="test",
        model=key_256,
        embedder=embedder_256,
        batch_size=1,
    )[0]
    vector_1024 = cache.embed_texts(
        ["same text"],
        provider="test",
        model=key_1024,
        embedder=embedder_1024,
        batch_size=1,
    )[0]

    assert len(vector_256) == 256
    assert len(vector_1024) == 1024
    # One call each: the second lookup is expected not to reuse the first entry.
    assert embedder_256.calls == 1
    assert embedder_1024.calls == 1
def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path):
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
index_dir = tmp_path / "projection"
index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
@ -164,7 +315,7 @@ def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path
with pytest.raises(RuntimeError, match="summary projection index dimension mismatch"):
SummaryProjectionIndexer(
index_dir,
embedder=FakeEmbedder(),
embedder=FixedDimensionEmbedder(4),
embedding_provider="test",
embedding_model="fake",
embedding_dimensions=4,