From 27071cb7f5c0ab81574a1214344be642b3df49c3 Mon Sep 17 00:00:00 2001 From: BukeLy Date: Sun, 31 May 2026 17:42:57 +0800 Subject: [PATCH] refactor(pifs): converge semantic retrieval on browse --- examples/pifs_demo.py | 14 +- pageindex/filesystem/agent.py | 44 ++- pageindex/filesystem/commands.py | 326 ++--------------------- pageindex/filesystem/core.py | 13 +- tests/test_pageindex_filesystem_scope.py | 139 ++++++---- tests/test_pifs_agent_stream.py | 17 +- 6 files changed, 150 insertions(+), 403 deletions(-) diff --git a/examples/pifs_demo.py b/examples/pifs_demo.py index 7dcfd0d..a12f48d 100644 --- a/examples/pifs_demo.py +++ b/examples/pifs_demo.py @@ -4,7 +4,7 @@ PageIndex FileSystem (PIFS) agent demo. This mirrors examples/agentic_vectorless_rag_demo.py, but exposes a corpus through the PageIndex FileSystem shell instead of direct PageIndex document tools. The agent receives one read-only bash-like PIFS tool and must retrieve -evidence through commands such as ls, tree, find, grep, search-summary, +evidence through commands such as ls, tree, find, grep, browse, cat --structure, cat --page, and cat --node. The demo registers supported files under examples/documents. When a matching @@ -72,9 +72,9 @@ Retrieval strategy: or stable file_ref/document ids. Do not invent temporary ref_N aliases. - Folder paths such as /documents are positional command targets; do not put folder paths inside --where. -- Use search-summary when available to find likely documents. +- Use browse when available to find likely documents by semantic relevance. 
Quote multi-word queries and include a path, for example: - search-summary "Federal Reserve supervision regulation" /documents + browse /documents "Federal Reserve supervision regulation" - Use find --where only with JSON metadata DSL, for example: find /documents --where '{"file_format":"pdf"}' - Use grep -R only for lexical evidence; do not treat semantic candidates as @@ -642,15 +642,15 @@ def run_smoke_commands( verbose=verbose, ) - command = 'search-summary "Federal Reserve annual report supervision regulation section page range" /documents' + command = 'browse /documents "Federal Reserve annual report supervision regulation section page range"' summary = execute_json_command(json_executor, command) summary_hits = ((summary.get("data") or {}).get("data") or []) if summary_hits: - summary_result = f"{len(summary_hits)} summary-vector candidates; top={summary_hits[0].get('external_id')}" + summary_result = f"{len(summary_hits)} browse candidates; top={summary_hits[0].get('external_id')}" else: - summary_result = "summary-vector command is available, but this tiny two-doc demo returned no candidates" + summary_result = "browse is available, but this tiny two-doc demo returned no candidates" show_capability( - label="Semantic summary search", + label="Semantic browse", command=command, result=summary_result, raw=shell_executor.execute(command) if verbose else "", diff --git a/pageindex/filesystem/agent.py b/pageindex/filesystem/agent.py index b1f1625..f9b2241 100644 --- a/pageindex/filesystem/agent.py +++ b/pageindex/filesystem/agent.py @@ -35,9 +35,8 @@ document contents in the workspace. If the user asks what tools or capabilities you have, describe only the PIFS virtual shell capabilities available inside this workspace: ls, tree, find, -stat, grep, cat, and semantic search commands such as search-summary when they -are available. Do not mention host runtime tools, SDK internals, or orchestration -helpers that are not part of the PIFS shell. 
+stat, grep, cat, and browse. Do not mention host runtime tools, SDK internals, +or orchestration helpers that are not part of the PIFS shell. If the user asks a workspace-related topic question without naming a specific file, treat it as a retrieval task. Use available PIFS discovery commands to @@ -45,8 +44,8 @@ look for relevant files and inspect evidence before answering. Ask the user to clarify only after a reasonable search cannot identify relevant evidence. Do not conclude that no relevant document exists from one failed grep. If grep returns no matches for a workspace topic, verify with available semantic -candidate discovery such as search-summary, or inspect likely document -structure, before saying that the workspace lacks evidence. +candidate discovery through browse, or inspect likely document structure, +before saying that the workspace lacks evidence. Follow the task prompt for command policy, retrieval strategy, and answer format. If the caller needs stricter behavior, pass an explicit system_prompt. @@ -55,19 +54,18 @@ format. If the caller needs stricter behavior, pass an explicit system_prompt. BASH_TOOL_DESCRIPTION = """ Run a command in the PageIndex FileSystem virtual shell. This is not a real operating-system shell. By default the tool is read-only: use ls, tree, find, -grep, cat, stat, head, tail, sed, and any dynamically available semantic search -commands described in the workspace context. grep -R is lexical evidence search; +grep, cat, stat, head, tail, sed, and browse as described in the workspace +context. grep -R is lexical evidence search; grep does not support regex alternation such as "a|b"; run multiple grep -commands or use search-summary for semantic candidate discovery instead. -semantic search commands such as search-summary return candidate documents and -do not guarantee literal text matches or final answer evidence. After choosing -a likely search-summary candidate, verify the relevant claim with cat before -answering. 
Use search-summary when the user asks for summary search, semantic -search, or vector search and the command is listed as available. Quote -multi-word semantic queries, for example: -search-summary "Federal Reserve" /documents. Do not write -search-summary Federal Reserve /documents. Errors are returned as text prefixed -with ERROR. Do not call +commands or use browse for semantic candidate discovery instead. browse returns +candidate documents ranked by relevance and does not guarantee literal text +matches or final answer evidence. After choosing a likely browse candidate, +verify the relevant claim with cat before answering. Use browse when the user +asks for summary search, semantic search, or vector search and the command is +listed as available. Quote multi-word semantic queries, for example: +browse /documents "Federal Reserve". Do not write +browse /documents Federal Reserve. Errors are returned as text prefixed with +ERROR. Do not call commands that are not listed as available. When evidence is required, inspect it with cat or grep before answering. Prefer shell-like target-first cat syntax with stable targets: cat --structure, cat --page 31-59, and @@ -85,7 +83,7 @@ continue with another chunk before answering. For questions about metadata fields, available summaries, or whether metadata was provided, inspect stat --schema and stat before making claims. Do not use stat as a general content/topic discovery step. For document Q&A, -prefer search-summary/find/grep for candidates, then cat --structure and +prefer ls/tree to choose a folder, browse/find/grep for candidates, then cat --structure and cat --node or cat --page for evidence. """ @@ -97,11 +95,11 @@ Tool policy: - Folder paths such as /documents are positional command targets; never put folder paths in --where. - Use --where only with metadata fields shown by stat --schema. - grep -R performs lexical evidence search. 
-- grep does not support regex alternation such as "a|b"; run separate grep commands or use search-summary for semantic candidate discovery. -- Semantic search commands are candidate-discovery tools and do not guarantee literal text matches or final answer evidence. After selecting a likely search-summary candidate, verify the relevant facts with cat before answering. -- Do not use find | grep as an exhaustive search or as proof that no document exists; find output can be scoped or limited. Use metadata filters, search-summary, grep on a narrowed target, or cat on likely candidates instead. -- A single failed grep is not enough evidence to say there is no relevant document. If grep returns no matches for a workspace-topic question, verify with search-summary or another available semantic/vector candidate command, or inspect likely document structure, before answering no-evidence. -- If search-summary is available and the user asks for summary search, semantic search, vector search, or "用 summary 搜", use search-summary "<query>" <path>; quote multi-word queries, for example search-summary "Federal Reserve" /documents; do not translate that request into find --where. +- grep does not support regex alternation such as "a|b"; run separate grep commands or use browse for semantic candidate discovery. +- browse is the semantic candidate-discovery tool and does not guarantee literal text matches or final answer evidence. After selecting a likely browse candidate, verify the relevant facts with cat before answering. +- Do not use find | grep as an exhaustive search or as proof that no document exists; find output can be scoped or limited. Use metadata filters, browse, grep on a narrowed target, or cat on likely candidates instead. +- A single failed grep is not enough evidence to say there is no relevant document. If grep returns no matches for a workspace-topic question, verify with browse or inspect likely document structure, before answering no-evidence. 
+- If the user asks for summary search, semantic search, vector search, or "用 summary 搜", use browse <path> "<query>"; quote multi-word queries, for example browse /documents "Federal Reserve"; use browse -R when the folder choice is uncertain; do not translate that request into find --where. - Tool errors are returned as ERROR text; recover by trying an available command. - Use cat or grep to gather evidence before making source-backed claims. - Do not reconstruct a file path from a title. Use exact paths returned by PIFS commands, or use file_ref/document_id when available; quote paths that contain spaces. diff --git a/pageindex/filesystem/commands.py b/pageindex/filesystem/commands.py index 8e99321..ee6e565 100644 --- a/pageindex/filesystem/commands.py +++ b/pageindex/filesystem/commands.py @@ -8,7 +8,7 @@ from dataclasses import asdict, is_dataclass from pathlib import Path from typing import Any -from .core import SEMANTIC_GREP_CHANNELS, SEMANTIC_RETRIEVAL_CHANNELS, PageIndexFileSystem +from .core import SEMANTIC_RETRIEVAL_CHANNELS, PageIndexFileSystem class PIFSCommandError(ValueError): @@ -30,30 +30,15 @@ class PIFSCommandExecutor: "tail", "sed", } - SEMANTIC_CHANNEL_COMMANDS = { - "summary": "search-summary", - "entity": "search-entity", - "relation": "search-relation", - } - ALLOWED_COMMANDS = ( - BASE_ALLOWED_COMMANDS - | {"semantic-grep"} - | set(SEMANTIC_CHANNEL_COMMANDS.values()) - ) + ALLOWED_COMMANDS = BASE_ALLOWED_COMMANDS ALLOWED_PIPE_FILTERS = {"head", "tail", "grep", "sed"} - COMMAND_METHODS = { - "search-summary": "_cmd_search_summary", - "search-entity": "_cmd_search_entity", - "search-relation": "_cmd_search_relation", - "semantic-grep": "_cmd_semantic_grep", - } + COMMAND_METHODS = {} MAX_CHAINED_COMMANDS = 3 MAX_PIPE_COMMANDS = 3 MAX_LS_LIMIT = 100 MAX_TREE_LIMIT = 200 MAX_FIND_LIMIT = 50 MAX_GREP_LIMIT = 20 - MAX_SEMANTIC_LIMIT = 20 BROWSE_PAGE_SIZE = 10 MAX_TEXT_LINES = 100 MAX_PAGE_SPAN = 5 @@ -65,7 +50,6 @@ class PIFSCommandExecutor: MAX_TREE_DEPTH = 
4 MAX_LS_RENDER_FILES = 25 MAX_STAT_METADATA_FIELDS = 8 - SEMANTIC_GREP_VECTOR_CANDIDATE_LIMIT = 20 GREP_RECURSIVE_FOLDER_DEPTH_LIMIT = 2 GREP_RECURSIVE_FOLDER_FILE_LIMIT = 10 @@ -81,14 +65,7 @@ class PIFSCommandExecutor: self.query_context = query_context def allowed_commands(self) -> set[str]: - commands = set(self.BASE_ALLOWED_COMMANDS) - semantic_channels = set(self.filesystem.semantic_retrieval_channels()) - for channel in SEMANTIC_RETRIEVAL_CHANNELS: - if channel in semantic_channels: - commands.add(self.SEMANTIC_CHANNEL_COMMANDS[channel]) - if any(channel in semantic_channels for channel in SEMANTIC_GREP_CHANNELS): - commands.add("semantic-grep") - return commands + return set(self.BASE_ALLOWED_COMMANDS) def command_capabilities(self) -> dict[str, Any]: return { @@ -116,26 +93,10 @@ class PIFSCommandExecutor: "- cat --all: text artifact reads for txt/text files, paginated at 100 lines", "- stat --field : one metadata field across up to 20 documents", ] - if "entity" in semantic_channels: - lines.append("- find --name: entity semantic candidate discovery alias") - if "relation" in semantic_channels: - lines.append("- find --relation: relation semantic candidate discovery alias") - for channel in SEMANTIC_RETRIEVAL_CHANNELS: - if channel not in semantic_channels: - continue - lines.append( - f"- {self.SEMANTIC_CHANNEL_COMMANDS[channel]}: " - f"{channel} semantic vector candidate discovery" - ) - semantic_grep_channels = semantic.get("semantic_grep_channels") or [] - if semantic_grep_channels: - lines.append( - "- semantic-grep -R: semantic candidates from " - + ", ".join(semantic_grep_channels) - + " indexes followed by real line matching" - ) - if not semantic.get("commands"): - lines.append("- semantic vector commands: none available in this workspace") + if semantic_channels: + lines.append("- browse --space available: " + ", ".join(semantic_channels)) + else: + lines.append("- browse --space available: none in this workspace") lines.append("- grep , cat, 
stat: evidence inspection") return "\n".join(lines) @@ -207,8 +168,8 @@ class PIFSCommandExecutor: f"Unsupported pipe command: {name}. Supported pipes are: " f"{', '.join(sorted(self.ALLOWED_PIPE_FILTERS))}. " "If you meant regex alternation such as a|b, PIFS grep/search " - "does not support it; run multiple grep or search-summary " - "commands with one phrase each." + "does not support it; run multiple grep commands or browse " + "with one phrase each." ) if name == "head": return self._pipe_head_tail(input_text, tokens[1:], from_tail=False) @@ -405,24 +366,9 @@ class PIFSCommandExecutor: return [] scope["max_depth"] = max_depth if relation: - if not self.filesystem.has_semantic_channel("relation"): - raise PIFSCommandError( - "find --relation requires a relation semantic index in this workspace" - ) - return self.filesystem.search_semantic_channel( - "relation", - self._semantic_retrieval_query(relation), - scope=scope, - metadata_filter=where, - limit=limit, - ) - if name and self.filesystem.has_semantic_channel("entity"): - return self.filesystem.search_semantic_channel( - "entity", - self._semantic_retrieval_query(name), - scope=scope, - metadata_filter=where, - limit=limit, + raise PIFSCommandError( + 'find --relation is not supported; use browse "" ' + "--space relation for relation semantic file recall" ) return self.filesystem.search( query=name, @@ -769,172 +715,6 @@ class PIFSCommandExecutor: f"{start}-{end}", ) - def _cmd_search_summary(self, args: list[str]) -> Any: - return self._cmd_semantic_channel("summary", args) - - def _cmd_search_entity(self, args: list[str]) -> Any: - return self._cmd_semantic_channel("entity", args) - - def _cmd_search_relation(self, args: list[str]) -> Any: - return self._cmd_semantic_channel("relation", args) - - def _cmd_semantic_grep(self, args: list[str]) -> Any: - recursive = False - where = None - limit = 10 - positionals = [] - i = 0 - while i < len(args): - arg = args[i] - if arg in {"-R", "-r", "--recursive"}: - 
recursive = True - elif self._is_combined_grep_flag(arg): - recursive = recursive or "R" in arg or "r" in arg - elif arg in {"-n", "--line-number", "-i", "--ignore-case"}: - pass - elif arg == "--where": - i += 1 - where = args[i] - elif arg == "--limit": - i += 1 - limit = self._parse_bounded_int( - args[i], "semantic-grep --limit", max_value=self.MAX_SEMANTIC_LIMIT - ) - elif arg.startswith("-"): - raise PIFSCommandError(f"Unsupported semantic-grep option: {arg}") - else: - positionals.append(arg) - i += 1 - if not recursive: - raise PIFSCommandError("semantic-grep requires -R/--recursive") - channels = self._semantic_grep_channels() - if not channels: - raise PIFSCommandError( - "semantic-grep is not available; entity/relation semantic indexes are not configured" - ) - if not positionals: - raise PIFSCommandError("semantic-grep requires a query") - self._validate_search_positionals("semantic-grep", positionals) - query = positionals[0] - self._reject_regex_alternation_query(query, "semantic-grep") - path = positionals[1] if len(positionals) > 1 else "/" - if not self._is_folder(path): - raise PIFSCommandError("semantic-grep target must be a folder") - return self._semantic_recursive_grep( - self._normalize_folder_path(path), - query, - metadata_filter=where, - limit=limit, - channels=channels, - ) - - def _cmd_semantic_channel(self, channel: str, args: list[str]) -> Any: - if not self.filesystem.has_semantic_channel(channel): - raise PIFSCommandError( - f"search-{channel} is not available; {channel} semantic index is not configured" - ) - where = None - limit = 10 - positionals = [] - i = 0 - while i < len(args): - arg = args[i] - if arg == "--where": - i += 1 - where = args[i] - elif arg == "--limit": - i += 1 - limit = self._parse_bounded_int( - args[i], - f"search-{channel} --limit", - max_value=self.MAX_SEMANTIC_LIMIT, - ) - elif arg.startswith("-"): - raise PIFSCommandError(f"Unsupported search-{channel} option: {arg}") - else: - positionals.append(arg) - i 
+= 1 - if not positionals: - raise PIFSCommandError(f"search-{channel} requires a query") - self._validate_search_positionals(f"search-{channel}", positionals) - query = positionals[0] - self._reject_regex_alternation_query(query, f"search-{channel}") - path = positionals[1] if len(positionals) > 1 else "/" - normalized = self._normalize_folder_path(path) - results = self.filesystem.search_semantic_channel( - channel, - self._semantic_retrieval_query(query), - scope={"folder_path": normalized, "recursive": True}, - metadata_filter=where, - limit=limit, - ) - return { - "mode": "files", - "query": query, - "scope": normalized, - "retrieval": f"{channel}_vector", - "data": self._semantic_channel_hits_from_results(channel, results, query), - } - - def _semantic_recursive_grep( - self, - folder_path: str, - query: str, - *, - metadata_filter: str | None, - limit: int, - channels: tuple[str, ...], - ) -> dict[str, Any]: - vector_query = str(query or "").strip() - candidate_debug: dict[str, Any] = {} - for channel in channels: - channel_results = self.filesystem.search_semantic_channel( - channel, - vector_query, - scope={"folder_path": folder_path, "recursive": True}, - metadata_filter=metadata_filter, - limit=self.SEMANTIC_GREP_VECTOR_CANDIDATE_LIMIT, - ) - matches = self._grep_file_hits_from_results( - channel_results, - query, - require_match=True, - limit=limit, - ) - candidate_debug[channel] = { - "candidates": len(channel_results), - "line_matches": len(matches), - "candidate_doc_ids": [ - getattr(result, "external_id", None) - for result in channel_results[:5] - ], - } - if matches: - return { - "mode": "files", - "query": query, - "scope": folder_path, - "retrieval": "semantic_grep_" + "_then_".join(channels), - "candidate_limit_per_channel": self.SEMANTIC_GREP_VECTOR_CANDIDATE_LIMIT, - "matched_channel": channel, - "candidate_debug": candidate_debug, - "data": matches, - } - return { - "mode": "files", - "query": query, - "scope": folder_path, - "retrieval": 
"semantic_grep_" + "_then_".join(channels), - "candidate_limit_per_channel": self.SEMANTIC_GREP_VECTOR_CANDIDATE_LIMIT, - "matched_channel": "", - "candidate_debug": candidate_debug, - "data": [], - } - - def _semantic_grep_channels(self) -> tuple[str, ...]: - available = set(self.filesystem.semantic_retrieval_channels()) - return tuple(channel for channel in SEMANTIC_GREP_CHANNELS if channel in available) - def _bounded_text_artifact(self, target: str, location: str) -> dict[str, Any]: if str(location).strip().lower() in {"all", "full", "*"}: start, end = 1, self.MAX_TEXT_LINES @@ -1077,25 +857,10 @@ class PIFSCommandExecutor: return raise PIFSCommandError( f"{command_name} does not support regex alternation '|'. " - "Run multiple grep commands or multiple search-summary commands " + 'Run multiple grep commands or browse "" ' "with one phrase each." ) - @staticmethod - def _validate_search_positionals(command_name: str, positionals: list[str]) -> None: - if len(positionals) > 2: - raise PIFSCommandError( - f"{command_name} accepts one query and an optional folder path. " - f"Quote multi-word queries, for example: {command_name} " - '"Federal Reserve" /documents' - ) - if len(positionals) == 2 and not positionals[1].startswith("/"): - raise PIFSCommandError( - f"{command_name} target must be a PIFS folder path like /documents. 
" - f"If your query has spaces, quote it, for example: {command_name} " - '"Federal Reserve" /documents' - ) - @staticmethod def _parse_numeric_range(value: str, label: str) -> tuple[int, int]: try: @@ -1157,10 +922,8 @@ class PIFSCommandExecutor: return self._render_tree(data) if command_name == "browse": return self._render_browse(data) - if command_name in {"grep", "semantic-grep"}: + if command_name == "grep": return self._render_grep(data) - if command_name in {"search-summary", "search-entity", "search-relation"}: - return self._render_semantic_search(data) if command_name == "find": return self._render_find(data) if command_name == "stat": @@ -1283,26 +1046,6 @@ class PIFSCommandExecutor: ) return str(data) - def _render_semantic_search(self, data: Any) -> str: - if not isinstance(data, dict): - return str(data) - if data.get("mode") != "files": - return self._render_grep(data) - if not data.get("data", []): - return f"# no matches for: {data.get('query', '')}" - lines: list[str] = [] - for item in data.get("data", []): - lines.append(f"path: {item.get('path') or '-'}") - lines.append(f"summary: {self._one_line_value(item.get('summary') or '')}") - if "entity" in item: - lines.append(f"entity: {self._one_line_value(item.get('entity') or '')}") - if "relation" in item: - lines.append(f"relation: {self._one_line_value(item.get('relation') or '')}") - line_text = self._one_line_value(item.get("line_text") or "") - lines.append(f"line_text: {line_text or '-'}") - lines.append("") - return "\n".join(lines).rstrip() - def _render_browse(self, data: Any) -> str: if not isinstance(data, dict): return str(data) @@ -1560,12 +1303,12 @@ class PIFSCommandExecutor: commands = [] quoted_query = shlex.quote(query) quoted_folder = shlex.quote(folder_path) - if self._semantic_grep_channels(): - commands.append(f"semantic-grep -R {quoted_query} {quoted_folder}") for channel in SEMANTIC_RETRIEVAL_CHANNELS: if self.filesystem.has_semantic_channel(channel): - command = 
self.SEMANTIC_CHANNEL_COMMANDS[channel] - commands.append(f"{command} {quoted_query} {quoted_folder}") + command = f"browse -R {quoted_folder} {quoted_query}" + if channel != "summary": + command += f" --space {channel}" + commands.append(command) return commands def _rank_child_folders( @@ -1627,37 +1370,6 @@ class PIFSCommandExecutor: break return hits - def _semantic_channel_hits_from_results( - self, - channel: str, - results: list[Any], - query: str, - ) -> list[dict[str, Any]]: - hits = [] - for result in results: - metadata = result.metadata or {} - line, text = self._first_matching_line(result.file_ref, query) - line_text = "" - if text: - line_text = f"{line}: {self._compact_text(text, max_chars=220)}" - hit = { - "path": self._stable_file_target_path( - { - "file_ref": result.file_ref, - "title": result.title, - "folder_paths": result.folder_paths, - "source_path": result.source_path, - "external_id": result.external_id, - } - ), - "summary": metadata.get("summary") or "", - "line_text": line_text, - } - if channel in {"entity", "relation"}: - hit[channel] = metadata.get(channel) or "" - hits.append(hit) - return hits - def _rank_child_folders_from_source( self, *, diff --git a/pageindex/filesystem/core.py b/pageindex/filesystem/core.py index c977c40..81b3848 100644 --- a/pageindex/filesystem/core.py +++ b/pageindex/filesystem/core.py @@ -76,7 +76,6 @@ PROJECTION_INDEX_STATUSES = { } SEMANTIC_RETRIEVAL_CHANNELS = ("summary", "entity", "relation") -SEMANTIC_GREP_CHANNELS = ("entity", "relation") PAGEINDEX_DOCUMENT_SUFFIXES = {".pdf", ".md", ".markdown"} PAGEINDEX_DOCUMENT_CONTENT_TYPES = { "application/pdf", @@ -249,8 +248,8 @@ class PageIndexFileSystem: """Attach semantic retrieval to already-built projection indexes. Register-time generation owns building the index files. Opening an - existing workspace should still expose the corresponding read commands, - such as search-summary, without forcing a re-register step. 
+ existing workspace should still expose semantic browse, without forcing + a re-register step. """ if self.semantic_retrieval_backend is not None: return bool(self.semantic_retrieval_channels()) @@ -696,12 +695,7 @@ class PageIndexFileSystem: def retrieval_capabilities(self) -> dict[str, Any]: semantic_channels = self.semantic_retrieval_channels() - semantic_commands = [f"search-{channel}" for channel in semantic_channels] - semantic_grep_channels = [ - channel for channel in SEMANTIC_GREP_CHANNELS if channel in semantic_channels - ] - if semantic_grep_channels: - semantic_commands.append("semantic-grep") + semantic_commands = ["browse"] if semantic_channels else [] return { "lexical": { "grep_recursive": True, @@ -713,7 +707,6 @@ class PageIndexFileSystem: "backend_configured": self.semantic_retrieval_backend is not None, "channels": list(semantic_channels), "commands": semantic_commands, - "semantic_grep_channels": semantic_grep_channels, }, } diff --git a/tests/test_pageindex_filesystem_scope.py b/tests/test_pageindex_filesystem_scope.py index 6973c5a..9edf647 100644 --- a/tests/test_pageindex_filesystem_scope.py +++ b/tests/test_pageindex_filesystem_scope.py @@ -56,11 +56,13 @@ class ChannelBackend: def __init__(self, document_id, channels=("summary", "entity", "relation")): self.document_id = document_id self.channels = channels + self.calls = [] def available_channels(self): return self.channels def search_channel(self, channel, query, *, limit=10, filters=None): + self.calls.append((channel, query, limit, filters)) return [ SimpleNamespace( document_id=self.document_id, @@ -154,10 +156,30 @@ def test_browse_is_agent_visible_semantic_command(tmp_path): from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace") + filesystem.semantic_retrieval_backend = ChannelBackend("dsid_report") executor = PIFSCommandExecutor(filesystem) - assert "browse" in executor.allowed_commands() - 
assert 'browse [-R] ""' in executor.describe_available_command_surfaces() + allowed = executor.allowed_commands() + surface = executor.describe_available_command_surfaces() + + assert "browse" in allowed + assert 'browse [-R] ""' in surface + assert not { + "search-summary", + "search-entity", + "search-relation", + "semantic-grep", + } & allowed + for old_command in ( + "search-summary", + "search-entity", + "search-relation", + "semantic-grep", + "find --name: entity semantic", + "find --relation: relation semantic", + ): + assert old_command not in surface + assert executor.command_capabilities()["retrieval"]["semantic"]["commands"] == ["browse"] def test_browse_requires_positional_query_and_rejects_removed_options(tmp_path): @@ -328,7 +350,6 @@ def test_browse_scopes_semantic_search_before_candidate_limit(tmp_path): "doc_direct", ] - def test_browse_shell_output_uses_fixed_blocks_with_pagination_command(tmp_path): import re @@ -447,7 +468,7 @@ def test_browse_shell_path_falls_back_to_unique_locator_when_source_collides(tmp filesystem.store.resolve_file_ref("/shared/source.json") -def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path): +def test_browse_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path): from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem from pageindex.filesystem.metadata_generation import MetadataGenerationResult @@ -483,27 +504,17 @@ def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters executor = PIFSCommandExecutor(filesystem, json_output=True) result = json.loads( - executor.execute('search-summary "Federal Reserve annual report" /documents') + executor.execute('browse /documents "Federal Reserve annual report"') ) - assert backend.calls[0][2] == {} - assert result["data"]["data"][0] == { - "path": "/examples/documents/report.pdf", - "summary": "Federal Reserve annual report summary", - "line_text": "1: Federal Reserve supervision and 
regulation annual report.", - } + assert "source_type" not in backend.calls[0][2] + assert "source_path" not in backend.calls[0][2] + assert result["data"]["data"][0]["path"] == "/examples/documents/report.pdf" + assert result["data"]["data"][0]["summary"] == "Federal Reserve annual report summary" assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == file_ref - executor.json_output = False - rendered = executor.execute('search-summary "Federal Reserve annual report" /documents') - assert "path: /examples/documents/report.pdf" in rendered - assert "summary: Federal Reserve annual report summary" in rendered - assert "line_text: 1: Federal Reserve supervision and regulation annual report." in rendered - assert "id=dsid_report" not in rendered - assert "file_ref=" not in rendered - -def test_semantic_search_path_is_unique_source_target_when_titles_collide(tmp_path): +def test_browse_path_is_unique_source_target_when_titles_collide(tmp_path): from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem from pageindex.filesystem.metadata_generation import MetadataGenerationResult @@ -552,7 +563,7 @@ def test_semantic_search_path_is_unique_source_target_when_titles_collide(tmp_pa filesystem.semantic_retrieval_backend = SummaryBackend("dsid_first") executor = PIFSCommandExecutor(filesystem, json_output=True) - result = json.loads(executor.execute('search-summary "H200 reservations" /documents')) + result = json.loads(executor.execute('browse /documents "H200 reservations"')) assert result["data"]["data"][0]["path"] == "/slack/dsid_first.json" assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref @@ -560,7 +571,7 @@ def test_semantic_search_path_is_unique_source_target_when_titles_collide(tmp_pa filesystem.store.resolve_file_ref("/documents/announcements") -def test_semantic_search_path_falls_back_when_source_target_is_ambiguous(tmp_path): +def 
test_browse_path_falls_back_when_source_target_is_ambiguous(tmp_path): from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem from pageindex.filesystem.metadata_generation import MetadataGenerationResult @@ -609,14 +620,15 @@ def test_semantic_search_path_falls_back_when_source_target_is_ambiguous(tmp_pat filesystem.semantic_retrieval_backend = SummaryBackend("dsid_first") executor = PIFSCommandExecutor(filesystem, json_output=True) - result = json.loads(executor.execute('search-summary "first" /documents')) + result = json.loads(executor.execute('browse /documents "first"')) assert result["data"]["data"][0]["path"] == "dsid_first" assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref -def test_entity_relation_search_return_minimal_fields_with_summary(tmp_path): +def test_old_semantic_commands_are_unsupported_even_when_indexes_exist(tmp_path): from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + from pageindex.filesystem.commands import PIFSCommandError from pageindex.filesystem.metadata_generation import MetadataGenerationResult class MetadataGenerator: @@ -653,31 +665,29 @@ def test_entity_relation_search_return_minimal_fields_with_summary(tmp_path): filesystem.semantic_retrieval_backend = ChannelBackend("dsid_market_note") executor = PIFSCommandExecutor(filesystem, json_output=True) - entity = json.loads(executor.execute('search-entity "Federal Reserve" /documents')) - assert entity["data"]["data"][0] == { - "path": "/examples/documents/market-note.pdf", - "summary": "Risk and compliance summary", - "line_text": "1: Federal Reserve policy affects Disney valuation.", - "entity": "Federal Reserve; Disney", - } + for command in ( + 'search-summary "Federal Reserve" /documents', + 'search-entity "Federal Reserve" /documents', + 'search-relation "Disney valuation" /documents', + 'semantic-grep -R "Federal Reserve" /documents', + ): + with pytest.raises(PIFSCommandError, match="Unsupported 
command"): + executor.execute(command) - relation = json.loads(executor.execute('search-relation "Disney valuation" /documents')) - assert relation["data"]["data"][0] == { - "path": "/examples/documents/market-note.pdf", - "summary": "Risk and compliance summary", - "line_text": "1: Federal Reserve policy affects Disney valuation.", - "relation": "Federal Reserve affects Disney valuation", - } + entity = json.loads( + executor.execute('browse /documents "Federal Reserve" --space entity') + ) + assert entity["data"]["data"][0]["summary"] == "Risk and compliance summary" + assert entity["data"]["data"][0]["path"] == "/examples/documents/market-note.pdf" - executor.json_output = False - rendered = executor.execute('search-entity "Federal Reserve" /documents') - assert "path: /examples/documents/market-note.pdf" in rendered - assert "summary: Risk and compliance summary" in rendered - assert "entity: Federal Reserve; Disney" in rendered - assert "file_ref=" not in rendered + relation = json.loads( + executor.execute('browse /documents "Disney valuation" --space relation') + ) + assert relation["data"]["data"][0]["summary"] == "Risk and compliance summary" + assert relation["data"]["data"][0]["path"] == "/examples/documents/market-note.pdf" -def test_semantic_search_rejects_unquoted_multi_word_query(tmp_path): +def test_find_name_is_lexical_and_find_relation_is_not_semantic_alias(tmp_path): from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem from pageindex.filesystem.commands import PIFSCommandError @@ -690,17 +700,42 @@ def test_semantic_search_rejects_unquoted_multi_word_query(tmp_path): title="Annual report", content="Federal Reserve supervision and regulation annual report.", ) - filesystem.semantic_retrieval_backend = SummaryBackend("dsid_report") + backend = ChannelBackend("dsid_report", channels=("entity", "relation")) + filesystem.semantic_retrieval_backend = backend executor = PIFSCommandExecutor(filesystem, json_output=True) - with 
pytest.raises(PIFSCommandError, match="Quote multi-word queries"): - executor.execute("search-summary Federal Reserve /documents") + result = json.loads(executor.execute("find /documents --name Reserve"))["data"] - with pytest.raises(PIFSCommandError, match="quote it"): - executor.execute("search-summary Federal Reserve") + assert result[0]["external_id"] == "dsid_report" + assert backend.calls == [] - with pytest.raises(PIFSCommandError, match="does not support regex alternation"): - executor.execute('search-summary "Federal|Reserve" /documents') + with pytest.raises(PIFSCommandError, match="find --relation is not supported"): + executor.execute('find /documents --relation "Reserve regulates report"') + + +def test_broad_recursive_grep_suggests_browse_not_removed_semantic_commands(tmp_path): + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + + filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace") + _register_browse_file(filesystem, "dsid_report", "/documents") + filesystem.semantic_retrieval_backend = ChannelBackend("dsid_report") + filesystem.store.folder_subtree_thresholds = lambda *args, **kwargs: { + "depth_limit": 2, + "file_limit": 10, + "folder_depth_exceeds_limit": True, + "file_count_exceeds_limit": False, + "sampled_file_count": 11, + "sample_deep_folder_path": "/documents/deep", + } + executor = PIFSCommandExecutor(filesystem) + + rendered = executor.execute('grep -R "Federal Reserve" /documents') + + assert "# suggested: browse -R /documents 'Federal Reserve'" in rendered + assert "search-summary" not in rendered + assert "search-entity" not in rendered + assert "search-relation" not in rendered + assert "semantic-grep" not in rendered def test_semantic_search_scope_filters_explicit_source_type_facets(): diff --git a/tests/test_pifs_agent_stream.py b/tests/test_pifs_agent_stream.py index 69f62ed..f4475a7 100644 --- a/tests/test_pifs_agent_stream.py +++ b/tests/test_pifs_agent_stream.py @@ -215,10 +215,19 @@ class 
PIFSAgentStreamTest(unittest.TestCase): self.assertIn("Do not run stat merely to understand what a document says", AGENT_TOOL_POLICY) self.assertIn("Do not use stat as a general content/topic discovery step", BASH_TOOL_DESCRIPTION) - def test_prompt_routes_summary_search_to_search_summary(self): - self.assertIn("search-summary when the user asks for", BASH_TOOL_DESCRIPTION) - self.assertIn('use search-summary "" ', AGENT_TOOL_POLICY) - self.assertIn('search-summary "Federal Reserve" /documents', BASH_TOOL_DESCRIPTION) + def test_prompt_routes_semantic_search_to_browse(self): + for old_command in ( + "search-summary", + "search-entity", + "search-relation", + "semantic-grep", + ): + self.assertNotIn(old_command, BASH_TOOL_DESCRIPTION) + self.assertNotIn(old_command, AGENT_TOOL_POLICY) + self.assertIn("Use browse when the user", BASH_TOOL_DESCRIPTION) + self.assertIn('use browse ""', AGENT_TOOL_POLICY) + self.assertIn('browse /documents "Federal Reserve"', BASH_TOOL_DESCRIPTION) + self.assertIn("browse -R ", AGENT_TOOL_POLICY) self.assertIn("do not translate that request into find --where", AGENT_TOOL_POLICY) self.assertIn("verify the relevant facts with cat", AGENT_TOOL_POLICY) self.assertIn("verify the relevant claim with cat", BASH_TOOL_DESCRIPTION)