fix(filesystem): avoid eager vector imports for embedding defaults

This commit is contained in:
BukeLy 2026-05-31 21:07:58 +08:00
parent 58409d1ec5
commit decfe29fe4
6 changed files with 5 additions and 65 deletions

View file

@ -42,7 +42,7 @@ os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "true")
from pageindex import PageIndexClient
from pageindex.filesystem import MetadataGenerator, PageIndexFileSystem, PIFSCommandExecutor
from pageindex.filesystem.agent import run_pifs_agent
from pageindex.filesystem.hybrid_projection import DEFAULT_EMBEDDING_DIMENSIONS
from pageindex.filesystem.embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS
EXAMPLES_DIR = Path(__file__).parent

View file

@ -14,6 +14,7 @@ from .metadata_generation import (
MetadataGenerationResult,
MetadataGenerator,
)
from .embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS
from .semantic_folder_policy import (
SEMANTIC_FOLDER_BASE_FIELDS,
SEMANTIC_FOLDER_ROOT,
@ -22,7 +23,6 @@ from .semantic_folder_policy import (
is_semantic_folder_forbidden_field,
semantic_folder_allowed_extension_fields,
)
from .hybrid_projection import DEFAULT_EMBEDDING_DIMENSIONS
from .store import (
SQLiteFileSystemStore,
fingerprint,

View file

@ -0,0 +1 @@
# Default number of embedding dimensions shared by the filesystem package.
# NOTE(review): kept in its own import-free module, presumably so callers can
# read the default without importing the vector-index code — confirm.
DEFAULT_EMBEDDING_DIMENSIONS = 1024

View file

@ -10,6 +10,7 @@ from dataclasses import dataclass
from pathlib import Path
from typing import Any
from .embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS
from .semantic_index import SQLiteVecSemanticIndex, SemanticIndexError, SemanticSearchResult
@ -28,7 +29,6 @@ HYBRID_ENTITY_RELATION_WEIGHTS = {
"relation": 0.30,
"constraint": 0.20,
}
DEFAULT_EMBEDDING_DIMENSIONS = 1024
@dataclass(frozen=True)

View file

@ -3,8 +3,8 @@ from __future__ import annotations
from pathlib import Path
from typing import Any
from .embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS
from .hybrid_projection import (
DEFAULT_EMBEDDING_DIMENSIONS,
EmbeddingCache,
INDEX_BY_CHANNEL,
embedding_cache_model_key,

View file

@ -844,67 +844,6 @@ def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path
assert filesystem.semantic_retrieval_channels() == ("summary",)
def test_existing_256_summary_projection_index_uses_metadata_dimension_with_new_default(
    tmp_path, monkeypatch
):
    """Reopen a pre-existing 256-dimension summary index under the 1024 default.

    A summary projection index is created on disk with dimension 256 and
    matching metadata. The filesystem must still report 1024 as its own
    default, yet pass the 256 stored in the index metadata when it configures
    retrieval from the existing index, leaving the index itself untouched.
    """
    from pageindex.filesystem import PageIndexFileSystem
    from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex

    workspace = tmp_path / "workspace"
    projection_dir = workspace / "artifacts" / "projection_indexes"

    # Seed an on-disk summary index that was built with 256-dimension vectors.
    stored_metadata = {
        "channel": "summary",
        "embedding_provider": "openai",
        "embedding_model": "text-embedding-3-small",
        "embedding_dimensions": 256,
    }
    summary_index = SQLiteVecSemanticIndex(projection_dir / "summary_only_vector.sqlite")
    summary_index.reset(dimension=256, metadata=stored_metadata)

    seed_record = SemanticIndexRecord(
        file_ref="file_a",
        external_id="doc_a",
        source_type="documents",
        source_path="documents/a.pdf",
        title="A",
        text="summary",
        vector=[1.0] + [0.0] * 255,
    )
    summary_index.upsert_many([seed_record])

    filesystem = PageIndexFileSystem(workspace)
    recorded_calls = []

    def fake_configure(index_dir_arg, **kwargs):
        # Capture the arguments instead of building a real retrieval backend.
        recorded_calls.append((index_dir_arg, kwargs))
        backend = SummaryBackend("doc_a")
        filesystem.semantic_retrieval_backend = backend
        return filesystem.semantic_retrieval_backend

    monkeypatch.setattr(
        filesystem, "configure_hybrid_projection_retrieval", fake_configure
    )

    # The filesystem-level default is unaffected by what is stored on disk.
    assert filesystem.summary_projection_embedding_dimensions == 1024
    assert filesystem.configure_existing_projection_retrieval() is True

    expected_kwargs = {
        "embedding_provider": "openai",
        "embedding_model": "text-embedding-3-small",
        "embedding_dimensions": 256,
        "embedding_timeout": 60,
    }
    expected_call = (filesystem.summary_projection_index_dir, expected_kwargs)
    assert recorded_calls == [expected_call]

    # Configuring retrieval must not have rebuilt the index at a new size.
    assert summary_index.info()["dimension"] == 256
def test_default_semantic_search_uses_summary_projection_when_only_summary_available(tmp_path):
from pageindex.filesystem import PageIndexFileSystem
from pageindex.filesystem.hybrid_projection import HybridProjectionSearchBackend