From 8f87cee6ce369af5c06212208d08d9226769a638 Mon Sep 17 00:00:00 2001 From: BukeLy Date: Sun, 31 May 2026 21:25:42 +0800 Subject: [PATCH] fix(pifs): avoid vector imports for empty workspaces --- pageindex/filesystem/core.py | 12 ++++-- tests/test_pifs_cli.py | 73 ++++++++++++++++++++++++++++++------ 2 files changed, 70 insertions(+), 15 deletions(-) diff --git a/pageindex/filesystem/core.py b/pageindex/filesystem/core.py index c9a5d70..557b4e1 100644 --- a/pageindex/filesystem/core.py +++ b/pageindex/filesystem/core.py @@ -77,6 +77,11 @@ PROJECTION_INDEX_STATUSES = { } SEMANTIC_RETRIEVAL_CHANNELS = ("summary", "entity", "relation") +SEMANTIC_PROJECTION_INDEX_NAMES = { + "summary": "summary_only_vector", + "entity": "entity_vectors", + "relation": "relation_vectors", +} PAGEINDEX_DOCUMENT_SUFFIXES = {".pdf", ".md", ".markdown"} PAGEINDEX_DOCUMENT_CONTENT_TYPES = { "application/pdf", @@ -277,16 +282,15 @@ class PageIndexFileSystem: return bool(self.semantic_retrieval_channels()) def _existing_projection_index_config(self) -> dict[str, Any] | None: - from .hybrid_projection import INDEX_BY_CHANNEL - from .semantic_index import SQLiteVecSemanticIndex - for channel in SEMANTIC_RETRIEVAL_CHANNELS: - index_name = INDEX_BY_CHANNEL.get(channel) + index_name = SEMANTIC_PROJECTION_INDEX_NAMES.get(channel) if not index_name: continue index_path = self.summary_projection_index_dir / f"{index_name}.sqlite" if not index_path.exists(): continue + from .semantic_index import SQLiteVecSemanticIndex + try: info = SQLiteVecSemanticIndex(index_path).info() except Exception: diff --git a/tests/test_pifs_cli.py b/tests/test_pifs_cli.py index 3b27156..3437a3c 100644 --- a/tests/test_pifs_cli.py +++ b/tests/test_pifs_cli.py @@ -1,6 +1,10 @@ +import builtins import os +import sys from pathlib import Path +import pytest + class FakeFileSystem: def __init__(self, workspace): @@ -25,22 +29,69 @@ def test_cli_workspace_configures_existing_projection_retrieval(monkeypatch, tmp assert filesystem.projection_retrieval_configured is True -def test_cli_workspace_surfaces_projection_dimension_mismatch(monkeypatch, tmp_path): - import pytest - +def test_cli_workspace_without_projection_index_does_not_require_sqlite_vec( + monkeypatch, tmp_path +): from pageindex.filesystem import cli - class MismatchedFileSystem: - def __init__(self, workspace): - self.workspace = Path(workspace) + workspace = tmp_path / "workspace" + real_import = builtins.__import__ - def configure_existing_projection_retrieval(self): - raise RuntimeError("summary projection index dimension mismatch: rebuild") + monkeypatch.delitem(sys.modules, "pageindex.filesystem.hybrid_projection", raising=False) + monkeypatch.delitem(sys.modules, "pageindex.filesystem.semantic_index", raising=False) + monkeypatch.delitem(sys.modules, "sqlite_vec", raising=False) - monkeypatch.setattr(cli, "PageIndexFileSystem", MismatchedFileSystem) + def block_sqlite_vec(name, globals=None, locals=None, fromlist=(), level=0): + if name.split(".", 1)[0] == "sqlite_vec": + raise ModuleNotFoundError("No module named 'sqlite_vec'", name="sqlite_vec") + return real_import(name, globals, locals, fromlist, level) - with pytest.raises(RuntimeError, match="dimension mismatch"): - cli._filesystem_from_workspace(str(tmp_path / "workspace")) + monkeypatch.setattr(builtins, "__import__", block_sqlite_vec) + + filesystem = cli._filesystem_from_workspace(str(workspace)) + + assert filesystem.workspace == workspace + assert filesystem.semantic_retrieval_channels() == () + + +def test_cli_workspace_surfaces_projection_dimension_mismatch(tmp_path): + from pageindex.filesystem import cli + from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex + + workspace = tmp_path / "workspace" + index_dir = workspace / "artifacts" / "projection_indexes" + summary_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite") + summary_index.reset( + dimension=3, + metadata={ + "channel": "summary", + "embedding_provider": "test", + "embedding_model": "fake", + "embedding_dimensions": 3, + }, + ) + summary_index.upsert_many( + [ + SemanticIndexRecord( + file_ref="file_a", + external_id="doc_a", + source_type="documents", + source_path="documents/a.pdf", + title="A", + text="summary", + vector=[1.0, 0.0, 0.0], + ) + ] + ) + + with pytest.raises( + RuntimeError, + match=( + "summary projection index dimension mismatch: .*" + "dimension 3.*summary_projection_embedding_dimensions is 1024.*Rebuild" + ), + ): + cli._filesystem_from_workspace(str(workspace)) def test_cli_passthrough_invokes_pifs_command_executor(monkeypatch, capsys, tmp_path):