fix(filesystem): restore summary vector search in cli

This commit is contained in:
BukeLy 2026-05-26 16:40:14 +08:00
parent 7e70b580f0
commit fc0be1aeee
6 changed files with 147 additions and 3 deletions

View file

@ -42,8 +42,10 @@ Run a command in the PageIndex FileSystem virtual shell. This is not a real
operating-system shell. By default the tool is read-only: use ls, tree, find,
grep, cat, stat, head, tail, sed, and any dynamically available semantic search
commands described in the workspace context. grep -R is lexical evidence search;
semantic search commands, such as search-summary, return candidate documents and
do not guarantee literal text matches. Use search-summary when the user asks for
summary search, semantic search, or vector search and the command is listed as
available. Errors are returned as text prefixed with ERROR. Do not call
commands that are not listed as available. When evidence is required, inspect it
with cat or grep before answering. Prefer shell-like target-first cat syntax
with stable targets: cat <path> --structure, cat <path> --page 31-59, and
@ -64,6 +66,7 @@ Tool policy:
- Use --where only with metadata fields shown by stat --schema.
- grep -R performs lexical evidence search.
- Semantic search commands are candidate-discovery tools and do not guarantee literal text matches.
- If search-summary is available and the user asks for summary search, semantic search, vector search, or "用 summary 搜", use search-summary <query> <folder>; do not translate that request into find --where.
- Tool errors are returned as ERROR text; recover by trying an available command.
- Use cat or grep to gather evidence before making source-backed claims.
- Prefer target-first cat syntax with stable targets: cat <path> --structure, cat <path> --page 31-59, cat <path> --node <node_id>.

View file

@ -122,7 +122,10 @@ def _parse_agent_command(
def _filesystem_from_workspace(workspace: str) -> PageIndexFileSystem:
    """Open the PageIndex filesystem rooted at ``workspace``.

    ``~`` in ``workspace`` is expanded before the filesystem is created. After
    construction, semantic retrieval is attached on a best-effort basis via
    ``configure_existing_projection_retrieval()``.

    Args:
        workspace: Path to the workspace directory; may start with ``~``.

    Returns:
        The opened filesystem. It is returned even when attaching semantic
        retrieval fails.
    """
    import logging

    filesystem = PageIndexFileSystem(Path(workspace).expanduser())
    try:
        filesystem.configure_existing_projection_retrieval()
    except Exception:
        # Deliberately best-effort: a failure here must not prevent the CLI
        # from opening the workspace. Leave a debug trace so a missing
        # semantic search command can still be diagnosed.
        logging.getLogger(__name__).debug(
            "Could not attach existing projection retrieval for %s",
            workspace,
            exc_info=True,
        )
    return filesystem
def _agent_kwargs(args: argparse.Namespace) -> dict[str, object]:

View file

@ -218,6 +218,64 @@ class PageIndexFileSystem:
embedding_timeout=self.summary_projection_embedding_timeout,
)
def configure_existing_projection_retrieval(self) -> bool:
    """Hook semantic retrieval up to projection indexes that already exist.

    Index files are built during registration. This method lets a workspace
    that is merely being opened expose the matching read commands (for
    example search-summary) without a re-register step.

    When no retrieval backend is set yet, the embedding provider, model and
    dimensions are taken from the stored index metadata, falling back to this
    instance's ``summary_projection_embedding_*`` settings for any value the
    index does not record.

    Returns:
        ``True`` when at least one semantic retrieval channel is available
        afterwards; ``False`` when none is, including when no usable
        existing index was found.
    """
    if self.semantic_retrieval_backend is None:
        existing = self._existing_projection_index_config()
        if existing is None:
            return False

        recorded = dict(existing.get("metadata") or {})
        provider = str(
            recorded.get("embedding_provider")
            or self.summary_projection_embedding_provider
        )
        model = str(
            recorded.get("embedding_model")
            or self.summary_projection_embedding_model
        )
        # Prefer the metadata value, then the index's own "dimension" field,
        # and only then the configured default.
        dimensions = int(
            recorded.get("embedding_dimensions")
            or existing.get("dimension")
            or self.summary_projection_embedding_dimensions
        )
        self.configure_hybrid_projection_retrieval(
            self.summary_projection_index_dir,
            embedding_provider=provider,
            embedding_model=model,
            embedding_dimensions=dimensions,
            embedding_timeout=self.summary_projection_embedding_timeout,
        )

    # Reached both when a backend was already set and after configuring one.
    return bool(self.semantic_retrieval_channels())
def _existing_projection_index_config(self) -> dict[str, Any] | None:
    """Return the info of the first usable on-disk projection index.

    Channels in ``SEMANTIC_RETRIEVAL_CHANNELS`` are tried in order. A
    channel's index is used only if it has a name in ``INDEX_BY_CHANNEL``,
    its ``<name>.sqlite`` file exists under ``summary_projection_index_dir``,
    ``info()`` succeeds, it reports at least one document, and any channel
    recorded in its metadata matches the channel being tried.

    Returns:
        The ``info()`` mapping of the first index that passes every check,
        or ``None`` when no channel has one.
    """
    from .hybrid_projection import INDEX_BY_CHANNEL
    from .semantic_index import SQLiteVecSemanticIndex

    for channel in SEMANTIC_RETRIEVAL_CHANNELS:
        stem = INDEX_BY_CHANNEL.get(channel)
        if not stem:
            continue

        db_path = self.summary_projection_index_dir / f"{stem}.sqlite"
        if not db_path.exists():
            continue

        try:
            details = SQLiteVecSemanticIndex(db_path).info()
        except Exception:
            # An index that cannot be opened or read is treated as absent.
            continue

        document_total = int(details.get("document_count") or 0)
        if document_total <= 0:
            continue

        recorded = dict(details.get("metadata") or {})
        # A recorded channel must match; an index with none is accepted.
        if recorded.get("channel") and recorded.get("channel") != channel:
            continue

        return details

    return None
@staticmethod
def _register_uses_deferred_metadata(policy: Any) -> bool:
if not isinstance(policy, dict):