From 8f87cee6ce369af5c06212208d08d9226769a638 Mon Sep 17 00:00:00 2001
From: BukeLy <bukely0119@foxmail.com>
Date: Sun, 31 May 2026 21:25:42 +0800
Subject: [PATCH] fix(pifs): avoid vector imports for empty workspaces

---
 pageindex/filesystem/core.py | 12 ++++--
 tests/test_pifs_cli.py       | 73 ++++++++++++++++++++++++++++++------
 2 files changed, 70 insertions(+), 15 deletions(-)

diff --git a/pageindex/filesystem/core.py b/pageindex/filesystem/core.py
index c9a5d70..557b4e1 100644
--- a/pageindex/filesystem/core.py
+++ b/pageindex/filesystem/core.py
@@ -77,6 +77,11 @@ PROJECTION_INDEX_STATUSES = {
 }
 
 SEMANTIC_RETRIEVAL_CHANNELS = ("summary", "entity", "relation")
+SEMANTIC_PROJECTION_INDEX_NAMES = {  # channel -> stem of its "<stem>.sqlite" index file
+    "summary": "summary_only_vector",
+    "entity": "entity_vectors",
+    "relation": "relation_vectors",
+}  # NOTE(review): presumably mirrors hybrid_projection.INDEX_BY_CHANNEL; keep in sync
 PAGEINDEX_DOCUMENT_SUFFIXES = {".pdf", ".md", ".markdown"}
 PAGEINDEX_DOCUMENT_CONTENT_TYPES = {
     "application/pdf",
@@ -277,16 +282,15 @@ class PageIndexFileSystem:
         return bool(self.semantic_retrieval_channels())
 
     def _existing_projection_index_config(self) -> dict[str, Any] | None:
-        from .hybrid_projection import INDEX_BY_CHANNEL
-        from .semantic_index import SQLiteVecSemanticIndex
-
         for channel in SEMANTIC_RETRIEVAL_CHANNELS:
-            index_name = INDEX_BY_CHANNEL.get(channel)
+            index_name = SEMANTIC_PROJECTION_INDEX_NAMES.get(channel)
             if not index_name:
                 continue
             index_path = self.summary_projection_index_dir / f"{index_name}.sqlite"
             if not index_path.exists():
                 continue
+            from .semantic_index import SQLiteVecSemanticIndex
+
             try:
                 info = SQLiteVecSemanticIndex(index_path).info()
             except Exception:
diff --git a/tests/test_pifs_cli.py b/tests/test_pifs_cli.py
index 3b27156..3437a3c 100644
--- a/tests/test_pifs_cli.py
+++ b/tests/test_pifs_cli.py
@@ -1,6 +1,10 @@
+import builtins
 import os
+import sys
 from pathlib import Path
 
+import pytest
+
 
 class FakeFileSystem:
     def __init__(self, workspace):
@@ -25,22 +29,69 @@ def test_cli_workspace_configures_existing_projection_retrieval(monkeypatch, tmp
     assert filesystem.projection_retrieval_configured is True
 
 
-def test_cli_workspace_surfaces_projection_dimension_mismatch(monkeypatch, tmp_path):
-    import pytest
-
+def test_cli_workspace_without_projection_index_does_not_require_sqlite_vec(
+    monkeypatch, tmp_path
+):
     from pageindex.filesystem import cli
 
-    class MismatchedFileSystem:
-        def __init__(self, workspace):
-            self.workspace = Path(workspace)
+    workspace = tmp_path / "workspace"
+    real_import = builtins.__import__
 
-        def configure_existing_projection_retrieval(self):
-            raise RuntimeError("summary projection index dimension mismatch: rebuild")
+    monkeypatch.delitem(sys.modules, "pageindex.filesystem.hybrid_projection", raising=False)
+    monkeypatch.delitem(sys.modules, "pageindex.filesystem.semantic_index", raising=False)
+    monkeypatch.delitem(sys.modules, "sqlite_vec", raising=False)
 
-    monkeypatch.setattr(cli, "PageIndexFileSystem", MismatchedFileSystem)
+    def block_sqlite_vec(name, globals=None, locals=None, fromlist=(), level=0):
+        if name.split(".", 1)[0] == "sqlite_vec":  # sqlite_vec itself or any submodule
+            raise ModuleNotFoundError("No module named 'sqlite_vec'", name="sqlite_vec")
+        return real_import(name, globals, locals, fromlist, level)  # all other imports
 
-    with pytest.raises(RuntimeError, match="dimension mismatch"):
-        cli._filesystem_from_workspace(str(tmp_path / "workspace"))
+    monkeypatch.setattr(builtins, "__import__", block_sqlite_vec)
+
+    filesystem = cli._filesystem_from_workspace(str(workspace))
+
+    assert filesystem.workspace == workspace
+    assert filesystem.semantic_retrieval_channels() == ()
+
+
+def test_cli_workspace_surfaces_projection_dimension_mismatch(tmp_path):
+    from pageindex.filesystem import cli
+    from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex
+
+    workspace = tmp_path / "workspace"
+    index_dir = workspace / "artifacts" / "projection_indexes"  # presumably where core.py looks
+    summary_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
+    summary_index.reset(
+        dimension=3,  # must differ from the 1024 named in the expected error below
+        metadata={
+            "channel": "summary",
+            "embedding_provider": "test",
+            "embedding_model": "fake",
+            "embedding_dimensions": 3,
+        },
+    )
+    summary_index.upsert_many(
+        [
+            SemanticIndexRecord(
+                file_ref="file_a",
+                external_id="doc_a",
+                source_type="documents",
+                source_path="documents/a.pdf",
+                title="A",
+                text="summary",
+                vector=[1.0, 0.0, 0.0],
+            )
+        ]
+    )
+
+    with pytest.raises(
+        RuntimeError,
+        match=(
+            "summary projection index dimension mismatch: .*"
+            "dimension 3.*summary_projection_embedding_dimensions is 1024.*Rebuild"
+        ),
+    ):
+        cli._filesystem_from_workspace(str(workspace))  # no fake filesystem patched in
 
 
 def test_cli_passthrough_invokes_pifs_command_executor(monkeypatch, capsys, tmp_path):