From e9453725b6523e57f1163c1d127b10241273082f Mon Sep 17 00:00:00 2001
From: Bukely_
Date: Tue, 26 May 2026 20:30:08 +0800
Subject: [PATCH] fix(filesystem): lazy-load pifs import surface

Avoid eager optional dependency imports when importing
PageIndexFileSystem or filesystem semantic exports.
---
Review notes (placed after the separator so they stay out of the commit
message):
* This copy of the patch was rebuilt from a line-collapsed paste. The hunk
  line counts and the diffstat totals were re-checked, but the leading
  whitespace of context lines is inferred; run `git apply --check` against
  the target tree before applying.
* The From: header carries no author address in this copy; confirm it
  before using `git am`.
* tests/test_import_surface.py pops every pageindex module from
  sys.modules, including in its finally block, and does not put back the
  module objects that were loaded before the test ran.
* The import hook in that test matches the first dotted component of
  `name` against blocked_roots without looking at `level`.
* pageindex/__init__.py's __getattr__ re-runs the import statement on each
  lookup of PageIndexFileSystem; the filesystem package's __getattr__
  stores the resolved value in globals() instead.

 pageindex/__init__.py            | 12 +++++++++--
 pageindex/filesystem/__init__.py | 32 +++++++++++++++++++++--------
 pageindex/filesystem/core.py     | 12 ++++++++---
 tests/test_import_surface.py     | 35 ++++++++++++++++++++++++++++++++
 4 files changed, 78 insertions(+), 13 deletions(-)
 create mode 100644 tests/test_import_surface.py

diff --git a/pageindex/__init__.py b/pageindex/__init__.py
index 97c3781..c3fb0b0 100644
--- a/pageindex/__init__.py
+++ b/pageindex/__init__.py
@@ -2,13 +2,21 @@ import os
 
 os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "true")
 
+_OPTIONAL_CORE_IMPORTS = {"litellm", "openai", "PyPDF2", "pymupdf"}
+
 try:
     from .page_index import *
     from .page_index_md import md_to_tree
     from .retrieve import get_document, get_document_structure, get_page_content
     from .client import PageIndexClient
 except ModuleNotFoundError as exc:
-    if exc.name != "litellm":
+    if exc.name not in _OPTIONAL_CORE_IMPORTS:
         raise
 
-from .filesystem import PageIndexFileSystem
+
+def __getattr__(name: str):
+    if name == "PageIndexFileSystem":
+        from .filesystem import PageIndexFileSystem
+
+        return PageIndexFileSystem
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/pageindex/filesystem/__init__.py b/pageindex/filesystem/__init__.py
index a6cde16..2c8fd1b 100644
--- a/pageindex/filesystem/__init__.py
+++ b/pageindex/filesystem/__init__.py
@@ -1,6 +1,7 @@
+from importlib import import_module
+
 from .commands import PIFSCommandExecutor
 from .core import PageIndexFileSystem
-from .hybrid_projection import HybridProjectionSearchBackend
 from .metadata_generation import (
     MetadataGenerationBackend,
     MetadataGenerationError,
@@ -8,15 +9,17 @@ from .metadata_generation import (
     MetadataGenerationResult,
     MetadataGenerator,
 )
-from .projection_indexing import SummaryProjectionIndexer
-from .semantic_index import (
-    RebuildableSemanticIndex,
-    SemanticIndexRecord,
-    SemanticSearchResult,
-    SQLiteVecSemanticIndex,
-)
 from .types import OpenResult, SearchResult
 
+_LAZY_EXPORTS = {
+    "HybridProjectionSearchBackend": (".hybrid_projection", "HybridProjectionSearchBackend"),
+    "RebuildableSemanticIndex": (".semantic_index", "RebuildableSemanticIndex"),
+    "SemanticIndexRecord": (".semantic_index", "SemanticIndexRecord"),
+    "SemanticSearchResult": (".semantic_index", "SemanticSearchResult"),
+    "SQLiteVecSemanticIndex": (".semantic_index", "SQLiteVecSemanticIndex"),
+    "SummaryProjectionIndexer": (".projection_indexing", "SummaryProjectionIndexer"),
+}
+
 __all__ = [
     "OpenResult",
     "HybridProjectionSearchBackend",
@@ -34,3 +37,16 @@ __all__ = [
     "SummaryProjectionIndexer",
     "SQLiteVecSemanticIndex",
 ]
+
+
+def __getattr__(name: str):
+    if name in _LAZY_EXPORTS:
+        module_name, attribute_name = _LAZY_EXPORTS[name]
+        value = getattr(import_module(module_name, __name__), attribute_name)
+        globals()[name] = value
+        return value
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+def __dir__() -> list[str]:
+    return sorted(set(globals()) | set(__all__))
diff --git a/pageindex/filesystem/core.py b/pageindex/filesystem/core.py
index dcfb72f..b6cdef6 100644
--- a/pageindex/filesystem/core.py
+++ b/pageindex/filesystem/core.py
@@ -3,10 +3,9 @@ from __future__ import annotations
 import json
 import os
 from pathlib import Path
-from typing import Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional, Union
 from urllib.parse import unquote, urlparse
 
-from ..client import PageIndexClient
 from .metadata import MetadataQueryEngine
 from .metadata_generation import (
     MetadataGenerationBackend,
@@ -15,7 +14,6 @@ from .metadata_generation import (
     MetadataGenerationResult,
     MetadataGenerator,
 )
-from .projection_indexing import SummaryProjectionIndexer
 from .semantic_folder_policy import (
     SEMANTIC_FOLDER_BASE_FIELDS,
     SEMANTIC_FOLDER_ROOT,
@@ -39,6 +37,10 @@ from .structural_read import (
 )
 from .types import OpenResult, SearchResult
 
+if TYPE_CHECKING:
+    from ..client import PageIndexClient
+    from .projection_indexing import SummaryProjectionIndexer
+
 DEFAULT_METADATA_GENERATION_FIELDS = {
     "summary": True,
     "doc_type": True,
@@ -215,6 +217,8 @@ class PageIndexFileSystem:
                 max_text_chars=self.metadata_max_text_chars,
             )
         if self.summary_projection_index and self.summary_projection_indexer is None:
+            from .projection_indexing import SummaryProjectionIndexer
+
             self.summary_projection_indexer = SummaryProjectionIndexer.from_provider(
                 self.summary_projection_index_dir,
                 embedding_provider=self.summary_projection_embedding_provider,
@@ -836,6 +840,8 @@ class PageIndexFileSystem:
         return self.workspace / "artifacts" / "pageindex_client"
 
     def _pageindex_client(self) -> PageIndexClient:
+        from ..client import PageIndexClient
+
         return PageIndexClient(workspace=str(self.pageindex_client_workspace))
 
     def _pageindex_client_doc_for_entry(self, entry: Any) -> tuple[PageIndexClient, str | None]:
diff --git a/tests/test_import_surface.py b/tests/test_import_surface.py
new file mode 100644
index 0000000..b4309cf
--- /dev/null
+++ b/tests/test_import_surface.py
@@ -0,0 +1,35 @@
+from __future__ import annotations
+
+import builtins
+import importlib
+import sys
+
+
+def test_filesystem_import_works_without_eager_optional_dependencies(monkeypatch):
+    blocked_roots = {"litellm", "openai", "PyPDF2", "pymupdf", "sqlite_vec"}
+    real_import = builtins.__import__
+
+    def clear_pageindex_modules() -> None:
+        for name in list(sys.modules):
+            if name == "pageindex" or name.startswith("pageindex."):
+                sys.modules.pop(name, None)
+
+    def import_without_optional_deps(name, globals=None, locals=None, fromlist=(), level=0):
+        root = name.split(".", 1)[0]
+        if root in blocked_roots:
+            raise ModuleNotFoundError(f"No module named '{root}'", name=root)
+        return real_import(name, globals, locals, fromlist, level)
+
+    clear_pageindex_modules()
+    try:
+        with monkeypatch.context() as patch:
+            patch.setattr(builtins, "__import__", import_without_optional_deps)
+
+            filesystem_module = importlib.import_module("pageindex.filesystem")
+            from pageindex import PageIndexFileSystem as TopLevelPageIndexFileSystem
+            from pageindex.filesystem import PageIndexFileSystem
+
+        assert filesystem_module.PageIndexFileSystem is PageIndexFileSystem
+        assert TopLevelPageIndexFileSystem is PageIndexFileSystem
+    finally:
+        clear_pageindex_modules()