mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-15 20:05:14 +02:00
feat(filesystem): default embeddings to 1024 dimensions
This commit is contained in:
parent
b5cc404776
commit
58409d1ec5
6 changed files with 282 additions and 22 deletions
|
|
@ -844,6 +844,67 @@ def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path
|
|||
assert filesystem.semantic_retrieval_channels() == ("summary",)
|
||||
|
||||
|
||||
def test_existing_256_summary_projection_index_uses_metadata_dimension_with_new_default(
|
||||
tmp_path, monkeypatch
|
||||
):
|
||||
from pageindex.filesystem import PageIndexFileSystem
|
||||
from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex
|
||||
|
||||
workspace = tmp_path / "workspace"
|
||||
index_dir = workspace / "artifacts" / "projection_indexes"
|
||||
summary_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
|
||||
summary_index.reset(
|
||||
dimension=256,
|
||||
metadata={
|
||||
"channel": "summary",
|
||||
"embedding_provider": "openai",
|
||||
"embedding_model": "text-embedding-3-small",
|
||||
"embedding_dimensions": 256,
|
||||
},
|
||||
)
|
||||
summary_index.upsert_many(
|
||||
[
|
||||
SemanticIndexRecord(
|
||||
file_ref="file_a",
|
||||
external_id="doc_a",
|
||||
source_type="documents",
|
||||
source_path="documents/a.pdf",
|
||||
title="A",
|
||||
text="summary",
|
||||
vector=[1.0, *([0.0] * 255)],
|
||||
)
|
||||
]
|
||||
)
|
||||
filesystem = PageIndexFileSystem(workspace)
|
||||
calls = []
|
||||
|
||||
def fake_configure(index_dir_arg, **kwargs):
|
||||
calls.append((index_dir_arg, kwargs))
|
||||
filesystem.semantic_retrieval_backend = SummaryBackend("doc_a")
|
||||
return filesystem.semantic_retrieval_backend
|
||||
|
||||
monkeypatch.setattr(
|
||||
filesystem,
|
||||
"configure_hybrid_projection_retrieval",
|
||||
fake_configure,
|
||||
)
|
||||
|
||||
assert filesystem.summary_projection_embedding_dimensions == 1024
|
||||
assert filesystem.configure_existing_projection_retrieval() is True
|
||||
assert calls == [
|
||||
(
|
||||
filesystem.summary_projection_index_dir,
|
||||
{
|
||||
"embedding_provider": "openai",
|
||||
"embedding_model": "text-embedding-3-small",
|
||||
"embedding_dimensions": 256,
|
||||
"embedding_timeout": 60,
|
||||
},
|
||||
)
|
||||
]
|
||||
assert summary_index.info()["dimension"] == 256
|
||||
|
||||
|
||||
def test_default_semantic_search_uses_summary_projection_when_only_summary_available(tmp_path):
|
||||
from pageindex.filesystem import PageIndexFileSystem
|
||||
from pageindex.filesystem.hybrid_projection import HybridProjectionSearchBackend
|
||||
|
|
|
|||
|
|
@ -13,6 +13,14 @@ from pageindex.filesystem.semantic_index import (
|
|||
)
|
||||
|
||||
|
||||
class FixedDimensionEmbedder:
|
||||
def __init__(self, dimensions: int):
|
||||
self.dimensions = dimensions
|
||||
|
||||
def embed(self, texts):
|
||||
return [[1.0, *([0.0] * (self.dimensions - 1))] for _ in texts]
|
||||
|
||||
|
||||
def test_sqlite_vec_semantic_index_round_trip(tmp_path):
|
||||
index = SQLiteVecSemanticIndex(tmp_path / "semantic.sqlite")
|
||||
index.reset(dimension=3, metadata={"field_mode": "summary"})
|
||||
|
|
@ -96,13 +104,9 @@ def test_sqlite_vec_semantic_index_file_ref_filter_not_limited_by_global_rank(tm
|
|||
def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
|
||||
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
||||
|
||||
class FakeEmbedder:
|
||||
def embed(self, texts):
|
||||
return [[1.0, 0.0, 0.0] for _ in texts]
|
||||
|
||||
indexer = SummaryProjectionIndexer(
|
||||
tmp_path / "projection",
|
||||
embedder=FakeEmbedder(),
|
||||
embedder=FixedDimensionEmbedder(3),
|
||||
embedding_provider="test",
|
||||
embedding_model="fake",
|
||||
embedding_dimensions=3,
|
||||
|
|
@ -129,12 +133,159 @@ def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
|
|||
assert hits[0].metadata["department"] == "ops"
|
||||
|
||||
|
||||
def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path):
|
||||
def test_summary_projection_indexer_defaults_to_1024_dimensions(tmp_path):
|
||||
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
||||
|
||||
class FakeEmbedder:
|
||||
indexer = SummaryProjectionIndexer(
|
||||
tmp_path / "projection",
|
||||
embedder=FixedDimensionEmbedder(1024),
|
||||
embedding_provider="test",
|
||||
embedding_model="fake",
|
||||
)
|
||||
|
||||
info = indexer.index.info()
|
||||
|
||||
assert info["dimension"] == 1024
|
||||
assert info["metadata"]["embedding_dimensions"] == 1024
|
||||
|
||||
result = indexer.upsert_summary(
|
||||
{
|
||||
"file_ref": "file_a",
|
||||
"external_id": "doc_a",
|
||||
"source_type": "documents",
|
||||
"source_path": "docs/a.pdf",
|
||||
"title": "A",
|
||||
"metadata": {"summary": "Default dimension summary."},
|
||||
}
|
||||
)
|
||||
|
||||
assert result["status"] == "ready"
|
||||
assert result["embedding_dimensions"] == 1024
|
||||
|
||||
|
||||
def test_summary_projection_indexer_allows_explicit_256_dimensions(tmp_path):
|
||||
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
||||
|
||||
indexer = SummaryProjectionIndexer(
|
||||
tmp_path / "projection",
|
||||
embedder=FixedDimensionEmbedder(256),
|
||||
embedding_provider="test",
|
||||
embedding_model="fake",
|
||||
embedding_dimensions=256,
|
||||
)
|
||||
|
||||
assert indexer.index.info()["dimension"] == 256
|
||||
assert indexer.upsert_summary(
|
||||
{
|
||||
"file_ref": "file_a",
|
||||
"external_id": "doc_a",
|
||||
"source_type": "documents",
|
||||
"source_path": "docs/a.pdf",
|
||||
"title": "A",
|
||||
"metadata": {"summary": "Explicit 256 dimension summary."},
|
||||
}
|
||||
)["status"] == "ready"
|
||||
|
||||
|
||||
def test_summary_projection_default_rejects_existing_256_index_for_writes(tmp_path):
|
||||
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
||||
|
||||
index_dir = tmp_path / "projection"
|
||||
index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
|
||||
index.reset(
|
||||
dimension=256,
|
||||
metadata={
|
||||
"channel": "summary",
|
||||
"embedding_provider": "test",
|
||||
"embedding_model": "fake",
|
||||
"embedding_dimensions": 256,
|
||||
},
|
||||
)
|
||||
|
||||
with pytest.raises(RuntimeError, match="configured embedding_dimensions is 1024"):
|
||||
SummaryProjectionIndexer(
|
||||
index_dir,
|
||||
embedder=FixedDimensionEmbedder(1024),
|
||||
embedding_provider="test",
|
||||
embedding_model="fake",
|
||||
)
|
||||
|
||||
assert SQLiteVecSemanticIndex(index.db_path).info()["dimension"] == 256
|
||||
|
||||
|
||||
def test_summary_projection_from_provider_rejects_dimension_mismatch_before_embedder(
|
||||
tmp_path, monkeypatch
|
||||
):
|
||||
from pageindex.filesystem import projection_indexing
|
||||
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
||||
|
||||
index_dir = tmp_path / "projection"
|
||||
index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
|
||||
index.reset(
|
||||
dimension=256,
|
||||
metadata={
|
||||
"channel": "summary",
|
||||
"embedding_provider": "openai",
|
||||
"embedding_model": "text-embedding-3-small",
|
||||
"embedding_dimensions": 256,
|
||||
},
|
||||
)
|
||||
|
||||
def fail_make_embedder(*args, **kwargs):
|
||||
raise AssertionError("embedder should not be constructed before dimension validation")
|
||||
|
||||
monkeypatch.setattr(projection_indexing, "make_embedder", fail_make_embedder)
|
||||
|
||||
with pytest.raises(RuntimeError, match="configured embedding_dimensions is 1024"):
|
||||
SummaryProjectionIndexer.from_provider(index_dir)
|
||||
|
||||
|
||||
def test_embedding_cache_key_separates_model_dimensions(tmp_path):
|
||||
from pageindex.filesystem.hybrid_projection import (
|
||||
EmbeddingCache,
|
||||
embedding_cache_model_key,
|
||||
)
|
||||
|
||||
class CountingEmbedder:
|
||||
def __init__(self, dimensions: int):
|
||||
self.dimensions = dimensions
|
||||
self.calls = 0
|
||||
|
||||
def embed(self, texts):
|
||||
return [[1.0, 0.0, 0.0, 0.0] for _ in texts]
|
||||
self.calls += 1
|
||||
return [[float(self.dimensions), *([0.0] * (self.dimensions - 1))] for _ in texts]
|
||||
|
||||
cache = EmbeddingCache(tmp_path / "cache.sqlite")
|
||||
embedder_256 = CountingEmbedder(256)
|
||||
embedder_1024 = CountingEmbedder(1024)
|
||||
key_256 = embedding_cache_model_key("fake", 256)
|
||||
key_1024 = embedding_cache_model_key("fake", 1024)
|
||||
|
||||
assert key_256 != key_1024
|
||||
|
||||
vector_256 = cache.embed_texts(
|
||||
["same text"],
|
||||
provider="test",
|
||||
model=key_256,
|
||||
embedder=embedder_256,
|
||||
batch_size=1,
|
||||
)[0]
|
||||
vector_1024 = cache.embed_texts(
|
||||
["same text"],
|
||||
provider="test",
|
||||
model=key_1024,
|
||||
embedder=embedder_1024,
|
||||
batch_size=1,
|
||||
)[0]
|
||||
|
||||
assert len(vector_256) == 256
|
||||
assert len(vector_1024) == 1024
|
||||
assert embedder_256.calls == 1
|
||||
assert embedder_1024.calls == 1
|
||||
|
||||
|
||||
def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path):
|
||||
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
||||
|
||||
index_dir = tmp_path / "projection"
|
||||
index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
|
||||
|
|
@ -164,7 +315,7 @@ def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path
|
|||
with pytest.raises(RuntimeError, match="summary projection index dimension mismatch"):
|
||||
SummaryProjectionIndexer(
|
||||
index_dir,
|
||||
embedder=FakeEmbedder(),
|
||||
embedder=FixedDimensionEmbedder(4),
|
||||
embedding_provider="test",
|
||||
embedding_model="fake",
|
||||
embedding_dimensions=4,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue