fix(filesystem): restore summary vector search in cli

2026-06-27 20:29:41 +02:00 · 2026-05-26 16:40:14 +08:00 · 2026-05-26 16:40:14 +08:00 · fc0be1aeee
commit fc0be1aeee
parent 7e70b580f0
6 changed files with 147 additions and 3 deletions
--- a/pageindex/filesystem/agent.py
+++ b/pageindex/filesystem/agent.py
@ -42,8 +42,10 @@ Run a command in the PageIndex FileSystem virtual shell. This is not a real
 operating-system shell. By default the tool is read-only: use ls, tree, find,
 grep, cat, stat, head, tail, sed, and any dynamically available semantic search
 commands described in the workspace context. grep -R is lexical evidence search;
-semantic search commands return candidate documents and do not guarantee literal
-text matches. Errors are returned as text prefixed with ERROR. Do not call
+semantic search commands such as search-summary return candidate documents and
+do not guarantee literal text matches. Use search-summary when the user asks for
+summary search, semantic search, or vector search and the command is listed as
+available. Errors are returned as text prefixed with ERROR. Do not call
 commands that are not listed as available. When evidence is required, inspect it
 with cat or grep before answering. Prefer shell-like target-first cat syntax
 with stable targets: cat <path> --structure, cat <path> --page 31-59, and
@ -64,6 +66,7 @@ Tool policy:
 - Use --where only with metadata fields shown by stat --schema.
 - grep -R performs lexical evidence search.
 - Semantic search commands are candidate-discovery tools and do not guarantee literal text matches.
+- If search-summary is available and the user asks for summary search, semantic search, vector search, or "用 summary 搜", use search-summary <query> <folder>; do not translate that request into find --where.
 - Tool errors are returned as ERROR text; recover by trying an available command.
 - Use cat or grep to gather evidence before making source-backed claims.
 - Prefer target-first cat syntax with stable targets: cat <path> --structure, cat <path> --page 31-59, cat <path> --node <node_id>.
--- a/pageindex/filesystem/cli.py
+++ b/pageindex/filesystem/cli.py
@ -122,7 +122,10 @@ def _parse_agent_command(


 def _filesystem_from_workspace(workspace: str) -> PageIndexFileSystem:
-    return PageIndexFileSystem(Path(workspace).expanduser())
+    # Open the workspace (with ~ expanded), then try to attach retrieval for
+    # projection indexes that already exist on disk.
+    filesystem = PageIndexFileSystem(Path(workspace).expanduser())
+    # Best-effort: any exception raised while attaching retrieval is discarded
+    # and the filesystem is returned without it.
+    # NOTE(review): suppress(Exception) also hides genuine bugs (e.g. a
+    # TypeError inside the configure call); consider logging before discarding.
+    with contextlib.suppress(Exception):
+        filesystem.configure_existing_projection_retrieval()
+    return filesystem


 def _agent_kwargs(args: argparse.Namespace) -> dict[str, object]:
--- a/pageindex/filesystem/core.py
+++ b/pageindex/filesystem/core.py
@ -218,6 +218,64 @@ class PageIndexFileSystem:
                embedding_timeout=self.summary_projection_embedding_timeout,
            )

+    def configure_existing_projection_retrieval(self) -> bool:
+        """Attach semantic retrieval to already-built projection indexes.
+
+        Register-time generation owns building the index files. Opening an
+        existing workspace should still expose the corresponding read commands,
+        such as search-summary, without forcing a re-register step.
+
+        Embedding provider, model, and dimensions are read from the metadata
+        of the existing index when present and truthy; otherwise the
+        instance's ``summary_projection_embedding_*`` settings are used.
+
+        Returns:
+            ``False`` when no usable existing index is found; otherwise
+            whether ``semantic_retrieval_channels()`` reports any channel.
+        """
+        # A backend is already attached: do not reconfigure, only report
+        # whether any channel is currently available.
+        if self.semantic_retrieval_backend is not None:
+            return bool(self.semantic_retrieval_channels())
+        index_config = self._existing_projection_index_config()
+        if index_config is None:
+            return False
+        metadata = dict(index_config.get("metadata") or {})
+        # Each setting falls through on any falsy value ("" or 0 included),
+        # not only on a missing key.
+        # NOTE(review): if both candidates are None, str() yields the literal
+        # "None" rather than failing — confirm the instance defaults are set.
+        embedding_provider = str(
+            metadata.get("embedding_provider")
+            or self.summary_projection_embedding_provider
+        )
+        embedding_model = str(
+            metadata.get("embedding_model")
+            or self.summary_projection_embedding_model
+        )
+        # NOTE(review): int() raises TypeError if the final fallback is None,
+        # and ValueError on a non-numeric string — verify against callers.
+        embedding_dimensions = int(
+            metadata.get("embedding_dimensions")
+            or index_config.get("dimension")
+            or self.summary_projection_embedding_dimensions
+        )
+        self.configure_hybrid_projection_retrieval(
+            self.summary_projection_index_dir,
+            embedding_provider=embedding_provider,
+            embedding_model=embedding_model,
+            embedding_dimensions=embedding_dimensions,
+            embedding_timeout=self.summary_projection_embedding_timeout,
+        )
+        return bool(self.semantic_retrieval_channels())
+
+    def _existing_projection_index_config(self) -> dict[str, Any] | None:
+        """Return the info of the first usable existing projection index.
+
+        Channels are tried in ``SEMANTIC_RETRIEVAL_CHANNELS`` order. An index
+        is accepted when its ``<index_name>.sqlite`` file exists under
+        ``summary_projection_index_dir``, its ``info()`` call succeeds, it
+        reports a positive ``document_count``, and any ``channel`` recorded in
+        its metadata equals the channel being checked.
+
+        Returns:
+            The ``info()`` result of the first accepted index, or ``None`` if
+            no channel has an acceptable index.
+        """
+        # Imported at call time rather than at module import time.
+        from .hybrid_projection import INDEX_BY_CHANNEL
+        from .semantic_index import SQLiteVecSemanticIndex
+
+        for channel in SEMANTIC_RETRIEVAL_CHANNELS:
+            index_name = INDEX_BY_CHANNEL.get(channel)
+            # Channel has no index name mapped to it.
+            if not index_name:
+                continue
+            index_path = self.summary_projection_index_dir / f"{index_name}.sqlite"
+            if not index_path.exists():
+                continue
+            # An index that cannot be opened or inspected is skipped rather
+            # than treated as an error.
+            # NOTE(review): the broad except also hides unexpected failures.
+            try:
+                info = SQLiteVecSemanticIndex(index_path).info()
+            except Exception:
+                continue
+            # Skip indexes reporting no documents (missing count counts as 0).
+            if int(info.get("document_count") or 0) <= 0:
+                continue
+            metadata = dict(info.get("metadata") or {})
+            # Reject a file whose recorded channel disagrees with the channel
+            # being checked; a missing/empty recorded channel is accepted.
+            if metadata.get("channel") and metadata.get("channel") != channel:
+                continue
+            return info
+        return None
+
    @staticmethod
    def _register_uses_deferred_metadata(policy: Any) -> bool:
        if not isinstance(policy, dict):