From fc0be1aeeea41b51659c39697862f3f6ffd81c68 Mon Sep 17 00:00:00 2001 From: BukeLy Date: Tue, 26 May 2026 16:40:14 +0800 Subject: [PATCH] fix(filesystem): restore summary vector search in cli --- pageindex/filesystem/agent.py | 7 ++- pageindex/filesystem/cli.py | 5 +- pageindex/filesystem/core.py | 58 ++++++++++++++++++++++++ tests/test_pageindex_filesystem_scope.py | 57 +++++++++++++++++++++++ tests/test_pifs_agent_stream.py | 5 ++ tests/test_pifs_cli.py | 18 ++++++++ 6 files changed, 147 insertions(+), 3 deletions(-) diff --git a/pageindex/filesystem/agent.py b/pageindex/filesystem/agent.py index 1b342eb..ba8645d 100644 --- a/pageindex/filesystem/agent.py +++ b/pageindex/filesystem/agent.py @@ -42,8 +42,10 @@ Run a command in the PageIndex FileSystem virtual shell. This is not a real operating-system shell. By default the tool is read-only: use ls, tree, find, grep, cat, stat, head, tail, sed, and any dynamically available semantic search commands described in the workspace context. grep -R is lexical evidence search; -semantic search commands return candidate documents and do not guarantee literal -text matches. Errors are returned as text prefixed with ERROR. Do not call +semantic search commands such as search-summary return candidate documents and +do not guarantee literal text matches. Use search-summary when the user asks for +summary search, semantic search, or vector search and the command is listed as +available. Errors are returned as text prefixed with ERROR. Do not call commands that are not listed as available. When evidence is required, inspect it with cat or grep before answering. Prefer shell-like target-first cat syntax with stable targets: cat --structure, cat --page 31-59, and @@ -64,6 +66,7 @@ Tool policy: - Use --where only with metadata fields shown by stat --schema. - grep -R performs lexical evidence search. - Semantic search commands are candidate-discovery tools and do not guarantee literal text matches. +- If search-summary is available and the user asks for summary search, semantic search, vector search, or "用 summary 搜", use search-summary ; do not translate that request into find --where. - Tool errors are returned as ERROR text; recover by trying an available command. - Use cat or grep to gather evidence before making source-backed claims. - Prefer target-first cat syntax with stable targets: cat --structure, cat --page 31-59, cat --node . diff --git a/pageindex/filesystem/cli.py b/pageindex/filesystem/cli.py index aa443a5..24a78f4 100644 --- a/pageindex/filesystem/cli.py +++ b/pageindex/filesystem/cli.py @@ -122,7 +122,10 @@ def _parse_agent_command( def _filesystem_from_workspace(workspace: str) -> PageIndexFileSystem: - return PageIndexFileSystem(Path(workspace).expanduser()) + filesystem = PageIndexFileSystem(Path(workspace).expanduser()) + with contextlib.suppress(Exception): + filesystem.configure_existing_projection_retrieval() + return filesystem def _agent_kwargs(args: argparse.Namespace) -> dict[str, object]: diff --git a/pageindex/filesystem/core.py b/pageindex/filesystem/core.py index 9a0f88b..35af513 100644 --- a/pageindex/filesystem/core.py +++ b/pageindex/filesystem/core.py @@ -218,6 +218,64 @@ class PageIndexFileSystem: embedding_timeout=self.summary_projection_embedding_timeout, ) + def configure_existing_projection_retrieval(self) -> bool: + """Attach semantic retrieval to already-built projection indexes. + + Register-time generation owns building the index files. Opening an + existing workspace should still expose the corresponding read commands, + such as search-summary, without forcing a re-register step. + """ + if self.semantic_retrieval_backend is not None: + return bool(self.semantic_retrieval_channels()) + index_config = self._existing_projection_index_config() + if index_config is None: + return False + metadata = dict(index_config.get("metadata") or {}) + embedding_provider = str( + metadata.get("embedding_provider") + or self.summary_projection_embedding_provider + ) + embedding_model = str( + metadata.get("embedding_model") + or self.summary_projection_embedding_model + ) + embedding_dimensions = int( + metadata.get("embedding_dimensions") + or index_config.get("dimension") + or self.summary_projection_embedding_dimensions + ) + self.configure_hybrid_projection_retrieval( + self.summary_projection_index_dir, + embedding_provider=embedding_provider, + embedding_model=embedding_model, + embedding_dimensions=embedding_dimensions, + embedding_timeout=self.summary_projection_embedding_timeout, + ) + return bool(self.semantic_retrieval_channels()) + + def _existing_projection_index_config(self) -> dict[str, Any] | None: + from .hybrid_projection import INDEX_BY_CHANNEL + from .semantic_index import SQLiteVecSemanticIndex + + for channel in SEMANTIC_RETRIEVAL_CHANNELS: + index_name = INDEX_BY_CHANNEL.get(channel) + if not index_name: + continue + index_path = self.summary_projection_index_dir / f"{index_name}.sqlite" + if not index_path.exists(): + continue + try: + info = SQLiteVecSemanticIndex(index_path).info() + except Exception: + continue + if int(info.get("document_count") or 0) <= 0: + continue + metadata = dict(info.get("metadata") or {}) + if metadata.get("channel") and metadata.get("channel") != channel: + continue + return info + return None + @staticmethod def _register_uses_deferred_metadata(policy: Any) -> bool: if not isinstance(policy, dict): diff --git a/tests/test_pageindex_filesystem_scope.py b/tests/test_pageindex_filesystem_scope.py index e08dabd..b421714 100644 --- a/tests/test_pageindex_filesystem_scope.py +++ b/tests/test_pageindex_filesystem_scope.py @@ -58,3 +58,60 @@ def test_semantic_search_scope_filters_explicit_source_type_facets(): {"folder_path": "/documents"} ) == {} + +def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path, monkeypatch): + from pageindex.filesystem import PageIndexFileSystem + from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex + + workspace = tmp_path / "workspace" + index_dir = workspace / "artifacts" / "projection_indexes" + summary_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite") + summary_index.reset( + dimension=3, + metadata={ + "channel": "summary", + "embedding_provider": "openai", + "embedding_model": "test-embedding", + "embedding_dimensions": 3, + }, + ) + summary_index.upsert_many( + [ + SemanticIndexRecord( + file_ref="file_a", + external_id="doc_a", + source_type="documents", + source_path="documents/a.pdf", + title="A", + text="summary", + vector=[1.0, 0.0, 0.0], + ) + ] + ) + filesystem = PageIndexFileSystem(workspace) + calls = [] + + def fake_configure(index_dir_arg, **kwargs): + calls.append((index_dir_arg, kwargs)) + filesystem.semantic_retrieval_backend = SummaryBackend("doc_a") + return filesystem.semantic_retrieval_backend + + monkeypatch.setattr( + filesystem, + "configure_hybrid_projection_retrieval", + fake_configure, + ) + + assert filesystem.configure_existing_projection_retrieval() is True + assert calls == [ + ( + filesystem.summary_projection_index_dir, + { + "embedding_provider": "openai", + "embedding_model": "test-embedding", + "embedding_dimensions": 3, + "embedding_timeout": 60, + }, + ) + ] + assert filesystem.semantic_retrieval_channels() == ("summary",) diff --git a/tests/test_pifs_agent_stream.py b/tests/test_pifs_agent_stream.py index 4beaf6c..e160850 100644 --- a/tests/test_pifs_agent_stream.py +++ b/tests/test_pifs_agent_stream.py @@ -204,6 +204,11 @@ class PIFSAgentStreamTest(unittest.TestCase): self.assertIn("do not infer metadata presence or absence", AGENT_TOOL_POLICY) self.assertIn("questions about metadata fields", BASH_TOOL_DESCRIPTION) + def test_prompt_routes_summary_search_to_search_summary(self): + self.assertIn("search-summary when the user asks for", BASH_TOOL_DESCRIPTION) + self.assertIn("use search-summary ", AGENT_TOOL_POLICY) + self.assertIn("do not translate that request into find --where", AGENT_TOOL_POLICY) + def test_system_prompt_sets_workspace_identity_and_scope(self): self.assertIn("PageIndex FileSystem Demo Agent", AGENT_SYSTEM_PROMPT) self.assertIn("VectifyAI Team", AGENT_SYSTEM_PROMPT) diff --git a/tests/test_pifs_cli.py b/tests/test_pifs_cli.py index 04717c4..74832b8 100644 --- a/tests/test_pifs_cli.py +++ b/tests/test_pifs_cli.py @@ -5,6 +5,24 @@ from pathlib import Path class FakeFileSystem: def __init__(self, workspace): self.workspace = Path(workspace) + self.projection_retrieval_configured = False + + def configure_existing_projection_retrieval(self): + self.projection_retrieval_configured = True + return True + + +def test_cli_workspace_configures_existing_projection_retrieval(monkeypatch, tmp_path): + from pageindex.filesystem import cli + + workspace = tmp_path / "workspace" + + monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem) + + filesystem = cli._filesystem_from_workspace(str(workspace)) + + assert filesystem.workspace == workspace + assert filesystem.projection_retrieval_configured is True def test_cli_passthrough_invokes_pifs_command_executor(monkeypatch, capsys, tmp_path):