From decfe29fe450418f04d12662ea5fadb9fe43feff Mon Sep 17 00:00:00 2001 From: BukeLy Date: Sun, 31 May 2026 21:07:58 +0800 Subject: [PATCH] fix(filesystem): avoid eager vector imports for embedding defaults --- examples/pifs_demo.py | 2 +- pageindex/filesystem/core.py | 2 +- pageindex/filesystem/embedding_defaults.py | 1 + pageindex/filesystem/hybrid_projection.py | 2 +- pageindex/filesystem/projection_indexing.py | 2 +- tests/test_pageindex_filesystem_scope.py | 61 --------------------- 6 files changed, 5 insertions(+), 65 deletions(-) create mode 100644 pageindex/filesystem/embedding_defaults.py diff --git a/examples/pifs_demo.py b/examples/pifs_demo.py index 4d71082..413b4a4 100644 --- a/examples/pifs_demo.py +++ b/examples/pifs_demo.py @@ -42,7 +42,7 @@ os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "true") from pageindex import PageIndexClient from pageindex.filesystem import MetadataGenerator, PageIndexFileSystem, PIFSCommandExecutor from pageindex.filesystem.agent import run_pifs_agent -from pageindex.filesystem.hybrid_projection import DEFAULT_EMBEDDING_DIMENSIONS +from pageindex.filesystem.embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS EXAMPLES_DIR = Path(__file__).parent diff --git a/pageindex/filesystem/core.py b/pageindex/filesystem/core.py index 7bf1903..eec0ade 100644 --- a/pageindex/filesystem/core.py +++ b/pageindex/filesystem/core.py @@ -14,6 +14,7 @@ from .metadata_generation import ( MetadataGenerationResult, MetadataGenerator, ) +from .embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS from .semantic_folder_policy import ( SEMANTIC_FOLDER_BASE_FIELDS, SEMANTIC_FOLDER_ROOT, @@ -22,7 +23,6 @@ from .semantic_folder_policy import ( is_semantic_folder_forbidden_field, semantic_folder_allowed_extension_fields, ) -from .hybrid_projection import DEFAULT_EMBEDDING_DIMENSIONS from .store import ( SQLiteFileSystemStore, fingerprint, diff --git a/pageindex/filesystem/embedding_defaults.py b/pageindex/filesystem/embedding_defaults.py new file mode 100644 index 0000000..b329032 --- /dev/null +++ b/pageindex/filesystem/embedding_defaults.py @@ -0,0 +1 @@ +DEFAULT_EMBEDDING_DIMENSIONS = 1024 diff --git a/pageindex/filesystem/hybrid_projection.py b/pageindex/filesystem/hybrid_projection.py index 2e228a8..cdb97e6 100644 --- a/pageindex/filesystem/hybrid_projection.py +++ b/pageindex/filesystem/hybrid_projection.py @@ -10,6 +10,7 @@ from dataclasses import dataclass from pathlib import Path from typing import Any +from .embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS from .semantic_index import SQLiteVecSemanticIndex, SemanticIndexError, SemanticSearchResult @@ -28,7 +29,6 @@ HYBRID_ENTITY_RELATION_WEIGHTS = { "relation": 0.30, "constraint": 0.20, } -DEFAULT_EMBEDDING_DIMENSIONS = 1024 @dataclass(frozen=True) diff --git a/pageindex/filesystem/projection_indexing.py b/pageindex/filesystem/projection_indexing.py index 843d158..63802d5 100644 --- a/pageindex/filesystem/projection_indexing.py +++ b/pageindex/filesystem/projection_indexing.py @@ -3,8 +3,8 @@ from __future__ import annotations from pathlib import Path from typing import Any +from .embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS from .hybrid_projection import ( - DEFAULT_EMBEDDING_DIMENSIONS, EmbeddingCache, INDEX_BY_CHANNEL, embedding_cache_model_key, diff --git a/tests/test_pageindex_filesystem_scope.py b/tests/test_pageindex_filesystem_scope.py index 570d485..9edf647 100644 --- a/tests/test_pageindex_filesystem_scope.py +++ b/tests/test_pageindex_filesystem_scope.py @@ -844,67 +844,6 @@ def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path assert filesystem.semantic_retrieval_channels() == ("summary",) -def test_existing_256_summary_projection_index_uses_metadata_dimension_with_new_default( - tmp_path, monkeypatch -): - from pageindex.filesystem import PageIndexFileSystem - from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex - - workspace = tmp_path / "workspace" - index_dir = workspace / "artifacts" / "projection_indexes" - summary_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite") - summary_index.reset( - dimension=256, - metadata={ - "channel": "summary", - "embedding_provider": "openai", - "embedding_model": "text-embedding-3-small", - "embedding_dimensions": 256, - }, - ) - summary_index.upsert_many( - [ - SemanticIndexRecord( - file_ref="file_a", - external_id="doc_a", - source_type="documents", - source_path="documents/a.pdf", - title="A", - text="summary", - vector=[1.0, *([0.0] * 255)], - ) - ] - ) - filesystem = PageIndexFileSystem(workspace) - calls = [] - - def fake_configure(index_dir_arg, **kwargs): - calls.append((index_dir_arg, kwargs)) - filesystem.semantic_retrieval_backend = SummaryBackend("doc_a") - return filesystem.semantic_retrieval_backend - - monkeypatch.setattr( - filesystem, - "configure_hybrid_projection_retrieval", - fake_configure, - ) - - assert filesystem.summary_projection_embedding_dimensions == 1024 - assert filesystem.configure_existing_projection_retrieval() is True - assert calls == [ - ( - filesystem.summary_projection_index_dir, - { - "embedding_provider": "openai", - "embedding_model": "text-embedding-3-small", - "embedding_dimensions": 256, - "embedding_timeout": 60, - }, - ) - ] - assert summary_index.info()["dimension"] == 256 - - def test_default_semantic_search_uses_summary_projection_when_only_summary_available(tmp_path): from pageindex.filesystem import PageIndexFileSystem from pageindex.filesystem.hybrid_projection import HybridProjectionSearchBackend