fix(filesystem): lazy-load pifs import surface

Avoid eager optional dependency imports when importing PageIndexFileSystem or filesystem semantic exports.
This commit is contained in:
Bukely_ 2026-05-26 20:30:08 +08:00 committed by BukeLy
parent 70eece52e5
commit e9453725b6
4 changed files with 78 additions and 13 deletions

View file

@ -2,13 +2,21 @@ import os
os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "true")
_OPTIONAL_CORE_IMPORTS = {"litellm", "openai", "PyPDF2", "pymupdf"}
try:
from .page_index import *
from .page_index_md import md_to_tree
from .retrieve import get_document, get_document_structure, get_page_content
from .client import PageIndexClient
except ModuleNotFoundError as exc:
if exc.name != "litellm":
if exc.name not in _OPTIONAL_CORE_IMPORTS:
raise
from .filesystem import PageIndexFileSystem
def __getattr__(name: str):
    """Resolve ``PageIndexFileSystem`` lazily on first attribute access.

    The filesystem subpackage is imported only when the attribute is actually
    requested, so importing this package does not import it eagerly. The
    resolved class is stored in the module globals; subsequent lookups find it
    through normal attribute access and no longer reach this hook.

    Args:
        name: Attribute name that was not found on the module.

    Returns:
        The ``PageIndexFileSystem`` class.

    Raises:
        AttributeError: If ``name`` is anything other than
            ``"PageIndexFileSystem"``.
    """
    if name != "PageIndexFileSystem":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # Deferred import: only executed when the attribute is requested.
    from .filesystem import PageIndexFileSystem

    # Cache the class so the import machinery is not re-entered on every access.
    globals()[name] = PageIndexFileSystem
    return PageIndexFileSystem

View file

@ -1,6 +1,7 @@
from importlib import import_module
from .commands import PIFSCommandExecutor
from .core import PageIndexFileSystem
from .hybrid_projection import HybridProjectionSearchBackend
from .metadata_generation import (
MetadataGenerationBackend,
MetadataGenerationError,
@ -8,15 +9,17 @@ from .metadata_generation import (
MetadataGenerationResult,
MetadataGenerator,
)
from .projection_indexing import SummaryProjectionIndexer
from .semantic_index import (
RebuildableSemanticIndex,
SemanticIndexRecord,
SemanticSearchResult,
SQLiteVecSemanticIndex,
)
from .types import OpenResult, SearchResult
_LAZY_EXPORTS = {
"HybridProjectionSearchBackend": (".hybrid_projection", "HybridProjectionSearchBackend"),
"RebuildableSemanticIndex": (".semantic_index", "RebuildableSemanticIndex"),
"SemanticIndexRecord": (".semantic_index", "SemanticIndexRecord"),
"SemanticSearchResult": (".semantic_index", "SemanticSearchResult"),
"SQLiteVecSemanticIndex": (".semantic_index", "SQLiteVecSemanticIndex"),
"SummaryProjectionIndexer": (".projection_indexing", "SummaryProjectionIndexer"),
}
__all__ = [
"OpenResult",
"HybridProjectionSearchBackend",
@ -34,3 +37,16 @@ __all__ = [
"SummaryProjectionIndexer",
"SQLiteVecSemanticIndex",
]
def __getattr__(name: str):
    """Import a lazily exported name on first access and cache it.

    Names listed in ``_LAZY_EXPORTS`` map to a ``(relative module, attribute)``
    pair. The module is imported relative to this package, the attribute is
    read from it, stored in the module globals and returned.

    Raises:
        AttributeError: If ``name`` is not a lazy export.
    """
    target = _LAZY_EXPORTS.get(name)
    if target is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    relative_module, attribute = target
    resolved = getattr(import_module(relative_module, __name__), attribute)
    # Store the resolved object so later lookups bypass this hook.
    globals()[name] = resolved
    return resolved
def __dir__() -> list[str]:
    """Return the sorted union of the module globals and the names in ``__all__``."""
    visible = set(globals())
    # Names in __all__ are included even when they are not yet in globals().
    visible.update(__all__)
    return sorted(visible)

View file

@ -3,10 +3,9 @@ from __future__ import annotations
import json
import os
from pathlib import Path
from typing import Any, Optional, Union
from typing import TYPE_CHECKING, Any, Optional, Union
from urllib.parse import unquote, urlparse
from ..client import PageIndexClient
from .metadata import MetadataQueryEngine
from .metadata_generation import (
MetadataGenerationBackend,
@ -15,7 +14,6 @@ from .metadata_generation import (
MetadataGenerationResult,
MetadataGenerator,
)
from .projection_indexing import SummaryProjectionIndexer
from .semantic_folder_policy import (
SEMANTIC_FOLDER_BASE_FIELDS,
SEMANTIC_FOLDER_ROOT,
@ -39,6 +37,10 @@ from .structural_read import (
)
from .types import OpenResult, SearchResult
if TYPE_CHECKING:
from ..client import PageIndexClient
from .projection_indexing import SummaryProjectionIndexer
DEFAULT_METADATA_GENERATION_FIELDS = {
"summary": True,
"doc_type": True,
@ -215,6 +217,8 @@ class PageIndexFileSystem:
max_text_chars=self.metadata_max_text_chars,
)
if self.summary_projection_index and self.summary_projection_indexer is None:
from .projection_indexing import SummaryProjectionIndexer
self.summary_projection_indexer = SummaryProjectionIndexer.from_provider(
self.summary_projection_index_dir,
embedding_provider=self.summary_projection_embedding_provider,
@ -836,6 +840,8 @@ class PageIndexFileSystem:
return self.workspace / "artifacts" / "pageindex_client"
def _pageindex_client(self) -> PageIndexClient:
    """Build a ``PageIndexClient`` for ``self.pageindex_client_workspace``.

    The client class is imported inside the method rather than at module
    import time; the workspace is passed to it as a string.
    """
    from ..client import PageIndexClient

    workspace_path = str(self.pageindex_client_workspace)
    return PageIndexClient(workspace=workspace_path)
def _pageindex_client_doc_for_entry(self, entry: Any) -> tuple[PageIndexClient, str | None]:

View file

@ -0,0 +1,35 @@
from __future__ import annotations
import builtins
import importlib
import sys
def test_filesystem_import_works_without_eager_optional_dependencies(monkeypatch):
    """Check that the filesystem import surface loads without optional packages.

    Every cached ``pageindex`` module is dropped, ``builtins.__import__`` is
    patched so that absolute imports of the blocked packages raise
    ``ModuleNotFoundError``, and ``PageIndexFileSystem`` is then imported via
    both ``pageindex`` and ``pageindex.filesystem``. Both routes must yield the
    same class object. The modules that were loaded before the test are put
    back afterwards so other tests keep seeing the objects they already hold.
    """
    blocked_roots = {"litellm", "openai", "PyPDF2", "pymupdf", "sqlite_vec"}
    real_import = builtins.__import__

    def pop_pageindex_modules() -> dict:
        """Remove every cached ``pageindex`` module and return the removed ones."""
        removed = {}
        for name in list(sys.modules):
            if name == "pageindex" or name.startswith("pageindex."):
                module = sys.modules.pop(name, None)
                if module is not None:
                    removed[name] = module
        return removed

    def import_without_optional_deps(name, globals=None, locals=None, fromlist=(), level=0):
        # Only absolute imports (level == 0) can refer to a blocked top-level
        # package; a relative import with the same leading component is a
        # package-local module and must pass through.
        root = name.split(".", 1)[0]
        if level == 0 and root in blocked_roots:
            raise ModuleNotFoundError(f"No module named '{root}'", name=root)
        return real_import(name, globals, locals, fromlist, level)

    previously_loaded = pop_pageindex_modules()
    try:
        with monkeypatch.context() as patch:
            patch.setattr(builtins, "__import__", import_without_optional_deps)
            filesystem_module = importlib.import_module("pageindex.filesystem")
            from pageindex import PageIndexFileSystem as TopLevelPageIndexFileSystem
            from pageindex.filesystem import PageIndexFileSystem

        assert filesystem_module.PageIndexFileSystem is PageIndexFileSystem
        assert TopLevelPageIndexFileSystem is PageIndexFileSystem
    finally:
        # Discard the modules imported under the patched importer, then restore
        # the originals instead of leaving the package unloaded.
        pop_pageindex_modules()
        sys.modules.update(previously_loaded)