fix(filesystem): restore summary vector search in cli

This commit is contained in:
BukeLy 2026-05-26 16:40:14 +08:00
parent 7e70b580f0
commit fc0be1aeee
6 changed files with 147 additions and 3 deletions

View file

@ -42,8 +42,10 @@ Run a command in the PageIndex FileSystem virtual shell. This is not a real
operating-system shell. By default the tool is read-only: use ls, tree, find,
grep, cat, stat, head, tail, sed, and any dynamically available semantic search
commands described in the workspace context. grep -R is lexical evidence search;
semantic search commands return candidate documents and do not guarantee literal
text matches. Errors are returned as text prefixed with ERROR. Do not call
semantic search commands such as search-summary return candidate documents and
do not guarantee literal text matches. Use search-summary when the user asks for
summary search, semantic search, or vector search and the command is listed as
available. Errors are returned as text prefixed with ERROR. Do not call
commands that are not listed as available. When evidence is required, inspect it
with cat or grep before answering. Prefer shell-like target-first cat syntax
with stable targets: cat <path> --structure, cat <path> --page 31-59, and
@ -64,6 +66,7 @@ Tool policy:
- Use --where only with metadata fields shown by stat --schema.
- grep -R performs lexical evidence search.
- Semantic search commands are candidate-discovery tools and do not guarantee literal text matches.
- If search-summary is available and the user asks for summary search, semantic search, vector search, or "用 summary 搜", use search-summary <query> <folder>; do not translate that request into find --where.
- Tool errors are returned as ERROR text; recover by trying an available command.
- Use cat or grep to gather evidence before making source-backed claims.
- Prefer target-first cat syntax with stable targets: cat <path> --structure, cat <path> --page 31-59, cat <path> --node <node_id>.

View file

@ -122,7 +122,10 @@ def _parse_agent_command(
def _filesystem_from_workspace(workspace: str) -> PageIndexFileSystem:
    """Open a workspace and attach retrieval for already-built projection indexes.

    Args:
        workspace: Path to the workspace directory; a leading ``~`` is expanded.

    Returns:
        The filesystem for the workspace. Attaching retrieval is best-effort:
        if it fails, the filesystem is still returned without it.
    """
    import logging

    filesystem = PageIndexFileSystem(Path(workspace).expanduser())
    try:
        filesystem.configure_existing_projection_retrieval()
    except Exception:
        # Deliberately best-effort so the CLI keeps working when retrieval
        # cannot be attached, but leave a trace of why it was skipped.
        logging.getLogger(__name__).debug(
            "Could not attach existing projection retrieval for workspace %s",
            workspace,
            exc_info=True,
        )
    return filesystem
def _agent_kwargs(args: argparse.Namespace) -> dict[str, object]:

View file

@ -218,6 +218,64 @@ class PageIndexFileSystem:
embedding_timeout=self.summary_projection_embedding_timeout,
)
def configure_existing_projection_retrieval(self) -> bool:
    """Hook semantic retrieval up to projection indexes that already exist.

    Building the index files is the job of register-time generation. Merely
    opening a workspace that already has them should still make the matching
    read commands (for example search-summary) available, with no need to
    register again.

    Returns:
        True when at least one semantic retrieval channel is available after
        the call, False otherwise (including when no existing index is found).
    """
    if self.semantic_retrieval_backend is None:
        info = self._existing_projection_index_config()
        if info is None:
            return False

        stored = dict(info.get("metadata") or {})
        # Values recorded in the index win over this instance's settings.
        provider = (
            stored.get("embedding_provider")
            or self.summary_projection_embedding_provider
        )
        model = (
            stored.get("embedding_model")
            or self.summary_projection_embedding_model
        )
        dimensions = (
            stored.get("embedding_dimensions")
            or info.get("dimension")
            or self.summary_projection_embedding_dimensions
        )
        options = {
            "embedding_provider": str(provider),
            "embedding_model": str(model),
            "embedding_dimensions": int(dimensions),
            "embedding_timeout": self.summary_projection_embedding_timeout,
        }
        self.configure_hybrid_projection_retrieval(
            self.summary_projection_index_dir,
            **options,
        )

    # A backend is already set or was just configured: report channel availability.
    return bool(self.semantic_retrieval_channels())
def _existing_projection_index_config(self) -> dict[str, Any] | None:
    """Return the info of the first usable on-disk projection index, if any.

    Channels are probed in SEMANTIC_RETRIEVAL_CHANNELS order. An index is
    skipped when it has no mapped name, its ``.sqlite`` file is absent, its
    info cannot be read, it reports no documents, or its recorded channel
    disagrees with the channel being probed.

    Returns:
        The info mapping of the first index that passes every check, or None
        when no channel has one.
    """
    from .hybrid_projection import INDEX_BY_CHANNEL
    from .semantic_index import SQLiteVecSemanticIndex

    for channel in SEMANTIC_RETRIEVAL_CHANNELS:
        name = INDEX_BY_CHANNEL.get(channel)
        if not name:
            continue

        db_path = self.summary_projection_index_dir / f"{name}.sqlite"
        if not db_path.exists():
            continue

        try:
            details = SQLiteVecSemanticIndex(db_path).info()
        except Exception:
            # An unreadable index is treated the same as a missing one.
            continue

        if int(details.get("document_count") or 0) < 1:
            continue

        recorded_channel = dict(details.get("metadata") or {}).get("channel")
        # An index with no recorded channel is accepted as-is.
        if not recorded_channel or recorded_channel == channel:
            return details

    return None
@staticmethod
def _register_uses_deferred_metadata(policy: Any) -> bool:
if not isinstance(policy, dict):

View file

@ -58,3 +58,60 @@ def test_semantic_search_scope_filters_explicit_source_type_facets():
{"folder_path": "/documents"}
) == {}
def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path, monkeypatch):
    """A populated summary index on disk drives retrieval configuration."""
    from pageindex.filesystem import PageIndexFileSystem
    from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex

    workspace = tmp_path / "workspace"
    index_dir = workspace / "artifacts" / "projection_indexes"

    # Seed a one-document summary index whose metadata names the embedding setup.
    index_metadata = {
        "channel": "summary",
        "embedding_provider": "openai",
        "embedding_model": "test-embedding",
        "embedding_dimensions": 3,
    }
    record = SemanticIndexRecord(
        file_ref="file_a",
        external_id="doc_a",
        source_type="documents",
        source_path="documents/a.pdf",
        title="A",
        text="summary",
        vector=[1.0, 0.0, 0.0],
    )
    summary_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
    summary_index.reset(dimension=3, metadata=index_metadata)
    summary_index.upsert_many([record])

    filesystem = PageIndexFileSystem(workspace)
    recorded_calls = []

    def fake_configure(index_dir_arg, **kwargs):
        # Capture the arguments and install a backend in place of the real setup.
        recorded_calls.append((index_dir_arg, kwargs))
        filesystem.semantic_retrieval_backend = SummaryBackend("doc_a")
        return filesystem.semantic_retrieval_backend

    monkeypatch.setattr(
        filesystem,
        "configure_hybrid_projection_retrieval",
        fake_configure,
    )

    configured = filesystem.configure_existing_projection_retrieval()

    expected_kwargs = {
        "embedding_provider": "openai",
        "embedding_model": "test-embedding",
        "embedding_dimensions": 3,
        "embedding_timeout": 60,
    }
    assert configured is True
    assert recorded_calls == [
        (filesystem.summary_projection_index_dir, expected_kwargs),
    ]
    assert filesystem.semantic_retrieval_channels() == ("summary",)

View file

@ -204,6 +204,11 @@ class PIFSAgentStreamTest(unittest.TestCase):
self.assertIn("do not infer metadata presence or absence", AGENT_TOOL_POLICY)
self.assertIn("questions about metadata fields", BASH_TOOL_DESCRIPTION)
def test_prompt_routes_summary_search_to_search_summary(self):
    """Both prompts must carry the search-summary routing guidance."""
    expectations = (
        ("search-summary when the user asks for", BASH_TOOL_DESCRIPTION),
        ("use search-summary <query> <folder>", AGENT_TOOL_POLICY),
        ("do not translate that request into find --where", AGENT_TOOL_POLICY),
    )
    for phrase, prompt in expectations:
        self.assertIn(phrase, prompt)
def test_system_prompt_sets_workspace_identity_and_scope(self):
self.assertIn("PageIndex FileSystem Demo Agent", AGENT_SYSTEM_PROMPT)
self.assertIn("VectifyAI Team", AGENT_SYSTEM_PROMPT)

View file

@ -5,6 +5,24 @@ from pathlib import Path
class FakeFileSystem:
    """Test double that records whether retrieval configuration was requested."""

    def __init__(self, workspace):
        self.projection_retrieval_configured = False
        self.workspace = Path(workspace)

    def configure_existing_projection_retrieval(self):
        """Mark configuration as requested and report success."""
        self.projection_retrieval_configured = True
        return True
def test_cli_workspace_configures_existing_projection_retrieval(monkeypatch, tmp_path):
    """The CLI workspace helper should request retrieval setup on the filesystem."""
    from pageindex.filesystem import cli

    # Swap in the fake so the call can be observed.
    monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem)
    workspace = tmp_path / "workspace"

    opened = cli._filesystem_from_workspace(str(workspace))

    assert opened.workspace == workspace
    assert opened.projection_retrieval_configured is True
def test_cli_passthrough_invokes_pifs_command_executor(monkeypatch, capsys, tmp_path):