Merge Goal 1: converge semantic retrieval on browse

Merge removal of legacy semantic commands into feat/pageindex-filesystem.
This commit is contained in:
Bukely_ 2026-05-31 21:41:35 +08:00 committed by GitHub
commit 889db8cc01
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 150 additions and 403 deletions

View file

@ -4,7 +4,7 @@ PageIndex FileSystem (PIFS) agent demo.
This mirrors examples/agentic_vectorless_rag_demo.py, but exposes a corpus
through the PageIndex FileSystem shell instead of direct PageIndex document
tools. The agent receives one read-only bash-like PIFS tool and must retrieve
evidence through commands such as ls, tree, find, grep, search-summary,
evidence through commands such as ls, tree, find, grep, browse,
cat <path> --structure, cat <path> --page, and cat <path> --node.
The demo registers supported files under examples/documents. When a matching
@ -72,9 +72,9 @@ Retrieval strategy:
or stable file_ref/document ids. Do not invent temporary ref_N aliases.
- Folder paths such as /documents are positional command targets; do not put
folder paths inside --where.
- Use search-summary when available to find likely documents.
- Use browse when available to find likely documents by semantic relevance.
Quote multi-word queries and include a path, for example:
search-summary "Federal Reserve supervision regulation" /documents
browse /documents "Federal Reserve supervision regulation"
- Use find --where only with JSON metadata DSL, for example:
find /documents --where '{"file_format":"pdf"}'
- Use grep -R only for lexical evidence; do not treat semantic candidates as
@ -642,15 +642,15 @@ def run_smoke_commands(
verbose=verbose,
)
command = 'search-summary "Federal Reserve annual report supervision regulation section page range" /documents'
command = 'browse /documents "Federal Reserve annual report supervision regulation section page range"'
summary = execute_json_command(json_executor, command)
summary_hits = ((summary.get("data") or {}).get("data") or [])
if summary_hits:
summary_result = f"{len(summary_hits)} summary-vector candidates; top={summary_hits[0].get('external_id')}"
summary_result = f"{len(summary_hits)} browse candidates; top={summary_hits[0].get('external_id')}"
else:
summary_result = "summary-vector command is available, but this tiny two-doc demo returned no candidates"
summary_result = "browse is available, but this tiny two-doc demo returned no candidates"
show_capability(
label="Semantic summary search",
label="Semantic browse",
command=command,
result=summary_result,
raw=shell_executor.execute(command) if verbose else "",

View file

@ -35,9 +35,8 @@ document contents in the workspace.
If the user asks what tools or capabilities you have, describe only the PIFS
virtual shell capabilities available inside this workspace: ls, tree, find,
stat, grep, cat, and semantic search commands such as search-summary when they
are available. Do not mention host runtime tools, SDK internals, or orchestration
helpers that are not part of the PIFS shell.
stat, grep, cat, and browse. Do not mention host runtime tools, SDK internals,
or orchestration helpers that are not part of the PIFS shell.
If the user asks a workspace-related topic question without naming a specific
file, treat it as a retrieval task. Use available PIFS discovery commands to
@ -45,8 +44,8 @@ look for relevant files and inspect evidence before answering. Ask the user to
clarify only after a reasonable search cannot identify relevant evidence.
Do not conclude that no relevant document exists from one failed grep. If grep
returns no matches for a workspace topic, verify with available semantic
candidate discovery such as search-summary, or inspect likely document
structure, before saying that the workspace lacks evidence.
candidate discovery through browse, or inspect likely document structure,
before saying that the workspace lacks evidence.
Follow the task prompt for command policy, retrieval strategy, and answer
format. If the caller needs stricter behavior, pass an explicit system_prompt.
@ -55,19 +54,18 @@ format. If the caller needs stricter behavior, pass an explicit system_prompt.
BASH_TOOL_DESCRIPTION = """
Run a command in the PageIndex FileSystem virtual shell. This is not a real
operating-system shell. By default the tool is read-only: use ls, tree, find,
grep, cat, stat, head, tail, sed, and any dynamically available semantic search
commands described in the workspace context. grep -R is lexical evidence search;
grep, cat, stat, head, tail, sed, and browse as described in the workspace
context. grep -R is lexical evidence search;
grep does not support regex alternation such as "a|b"; run multiple grep
commands or use search-summary for semantic candidate discovery instead.
semantic search commands such as search-summary return candidate documents and
do not guarantee literal text matches or final answer evidence. After choosing
a likely search-summary candidate, verify the relevant claim with cat before
answering. Use search-summary when the user asks for summary search, semantic
search, or vector search and the command is listed as available. Quote
multi-word semantic queries, for example:
search-summary "Federal Reserve" /documents. Do not write
search-summary Federal Reserve /documents. Errors are returned as text prefixed
with ERROR. Do not call
commands or use browse for semantic candidate discovery instead. browse returns
candidate documents ranked by relevance and does not guarantee literal text
matches or final answer evidence. After choosing a likely browse candidate,
verify the relevant claim with cat before answering. Use browse when the user
asks for summary search, semantic search, or vector search and the command is
listed as available. Quote multi-word semantic queries, for example:
browse /documents "Federal Reserve". Do not write
browse /documents Federal Reserve. Errors are returned as text prefixed with
ERROR. Do not call
commands that are not listed as available. When evidence is required, inspect it
with cat or grep before answering. Prefer shell-like target-first cat syntax
with stable targets: cat <path> --structure, cat <path> --page 31-59, and
@ -85,7 +83,7 @@ continue with another chunk before answering.
For questions about metadata fields, available summaries, or whether metadata
was provided, inspect stat --schema and stat <target> before making claims.
Do not use stat as a general content/topic discovery step. For document Q&A,
prefer search-summary/find/grep for candidates, then cat --structure and
prefer ls/tree to choose a folder, browse/find/grep for candidates, then cat --structure and
cat --node or cat --page for evidence.
"""
@ -97,11 +95,11 @@ Tool policy:
- Folder paths such as /documents are positional command targets; never put folder paths in --where.
- Use --where only with metadata fields shown by stat --schema.
- grep -R performs lexical evidence search.
- grep does not support regex alternation such as "a|b"; run separate grep commands or use search-summary for semantic candidate discovery.
- Semantic search commands are candidate-discovery tools and do not guarantee literal text matches or final answer evidence. After selecting a likely search-summary candidate, verify the relevant facts with cat before answering.
- Do not use find | grep as an exhaustive search or as proof that no document exists; find output can be scoped or limited. Use metadata filters, search-summary, grep on a narrowed target, or cat on likely candidates instead.
- A single failed grep is not enough evidence to say there is no relevant document. If grep returns no matches for a workspace-topic question, verify with search-summary or another available semantic/vector candidate command, or inspect likely document structure, before answering no-evidence.
- If search-summary is available and the user asks for summary search, semantic search, vector search, or "用 summary 搜", use search-summary "<query>" <folder>; quote multi-word queries, for example search-summary "Federal Reserve" /documents; do not translate that request into find --where.
- grep does not support regex alternation such as "a|b"; run separate grep commands or use browse for semantic candidate discovery.
- browse is the semantic candidate-discovery tool and does not guarantee literal text matches or final answer evidence. After selecting a likely browse candidate, verify the relevant facts with cat before answering.
- Do not use find | grep as an exhaustive search or as proof that no document exists; find output can be scoped or limited. Use metadata filters, browse, grep on a narrowed target, or cat on likely candidates instead.
- A single failed grep is not enough evidence to say there is no relevant document. If grep returns no matches for a workspace-topic question, verify with browse or inspect likely document structure, before answering no-evidence.
- If the user asks for summary search, semantic search, vector search, or "用 summary 搜", use browse <folder> "<query>"; quote multi-word queries, for example browse /documents "Federal Reserve"; use browse -R <folder> when the folder choice is uncertain; do not translate that request into find --where.
- Tool errors are returned as ERROR text; recover by trying an available command.
- Use cat or grep to gather evidence before making source-backed claims.
- Do not reconstruct a file path from a title. Use exact paths returned by PIFS commands, or use file_ref/document_id when available; quote paths that contain spaces.

View file

@ -8,7 +8,7 @@ from dataclasses import asdict, is_dataclass
from pathlib import Path
from typing import Any
from .core import SEMANTIC_GREP_CHANNELS, SEMANTIC_RETRIEVAL_CHANNELS, PageIndexFileSystem
from .core import SEMANTIC_RETRIEVAL_CHANNELS, PageIndexFileSystem
class PIFSCommandError(ValueError):
@ -30,30 +30,15 @@ class PIFSCommandExecutor:
"tail",
"sed",
}
SEMANTIC_CHANNEL_COMMANDS = {
"summary": "search-summary",
"entity": "search-entity",
"relation": "search-relation",
}
ALLOWED_COMMANDS = (
BASE_ALLOWED_COMMANDS
| {"semantic-grep"}
| set(SEMANTIC_CHANNEL_COMMANDS.values())
)
ALLOWED_COMMANDS = BASE_ALLOWED_COMMANDS
ALLOWED_PIPE_FILTERS = {"head", "tail", "grep", "sed"}
COMMAND_METHODS = {
"search-summary": "_cmd_search_summary",
"search-entity": "_cmd_search_entity",
"search-relation": "_cmd_search_relation",
"semantic-grep": "_cmd_semantic_grep",
}
COMMAND_METHODS = {}
MAX_CHAINED_COMMANDS = 3
MAX_PIPE_COMMANDS = 3
MAX_LS_LIMIT = 100
MAX_TREE_LIMIT = 200
MAX_FIND_LIMIT = 50
MAX_GREP_LIMIT = 20
MAX_SEMANTIC_LIMIT = 20
BROWSE_PAGE_SIZE = 10
MAX_TEXT_LINES = 100
MAX_PAGE_SPAN = 5
@ -65,7 +50,6 @@ class PIFSCommandExecutor:
MAX_TREE_DEPTH = 4
MAX_LS_RENDER_FILES = 25
MAX_STAT_METADATA_FIELDS = 8
SEMANTIC_GREP_VECTOR_CANDIDATE_LIMIT = 20
GREP_RECURSIVE_FOLDER_DEPTH_LIMIT = 2
GREP_RECURSIVE_FOLDER_FILE_LIMIT = 10
@ -81,14 +65,7 @@ class PIFSCommandExecutor:
self.query_context = query_context
def allowed_commands(self) -> set[str]:
commands = set(self.BASE_ALLOWED_COMMANDS)
semantic_channels = set(self.filesystem.semantic_retrieval_channels())
for channel in SEMANTIC_RETRIEVAL_CHANNELS:
if channel in semantic_channels:
commands.add(self.SEMANTIC_CHANNEL_COMMANDS[channel])
if any(channel in semantic_channels for channel in SEMANTIC_GREP_CHANNELS):
commands.add("semantic-grep")
return commands
return set(self.BASE_ALLOWED_COMMANDS)
def command_capabilities(self) -> dict[str, Any]:
return {
@ -116,26 +93,10 @@ class PIFSCommandExecutor:
"- cat <path|file_ref|document_id> --all: text artifact reads for txt/text files, paginated at 100 lines",
"- stat --field <metadata_field> <target...>: one metadata field across up to 20 documents",
]
if "entity" in semantic_channels:
lines.append("- find --name: entity semantic candidate discovery alias")
if "relation" in semantic_channels:
lines.append("- find --relation: relation semantic candidate discovery alias")
for channel in SEMANTIC_RETRIEVAL_CHANNELS:
if channel not in semantic_channels:
continue
lines.append(
f"- {self.SEMANTIC_CHANNEL_COMMANDS[channel]}: "
f"{channel} semantic vector candidate discovery"
)
semantic_grep_channels = semantic.get("semantic_grep_channels") or []
if semantic_grep_channels:
lines.append(
"- semantic-grep -R: semantic candidates from "
+ ", ".join(semantic_grep_channels)
+ " indexes followed by real line matching"
)
if not semantic.get("commands"):
lines.append("- semantic vector commands: none available in this workspace")
if semantic_channels:
lines.append("- browse --space available: " + ", ".join(semantic_channels))
else:
lines.append("- browse --space available: none in this workspace")
lines.append("- grep <query> <path|file_ref|document_id>, cat, stat: evidence inspection")
return "\n".join(lines)
@ -207,8 +168,8 @@ class PIFSCommandExecutor:
f"Unsupported pipe command: {name}. Supported pipes are: "
f"{', '.join(sorted(self.ALLOWED_PIPE_FILTERS))}. "
"If you meant regex alternation such as a|b, PIFS grep/search "
"does not support it; run multiple grep or search-summary "
"commands with one phrase each."
"does not support it; run multiple grep commands or browse "
"with one phrase each."
)
if name == "head":
return self._pipe_head_tail(input_text, tokens[1:], from_tail=False)
@ -405,24 +366,9 @@ class PIFSCommandExecutor:
return []
scope["max_depth"] = max_depth
if relation:
if not self.filesystem.has_semantic_channel("relation"):
raise PIFSCommandError(
"find --relation requires a relation semantic index in this workspace"
)
return self.filesystem.search_semantic_channel(
"relation",
self._semantic_retrieval_query(relation),
scope=scope,
metadata_filter=where,
limit=limit,
)
if name and self.filesystem.has_semantic_channel("entity"):
return self.filesystem.search_semantic_channel(
"entity",
self._semantic_retrieval_query(name),
scope=scope,
metadata_filter=where,
limit=limit,
raise PIFSCommandError(
'find --relation is not supported; use browse <folder> "<query>" '
"--space relation for relation semantic file recall"
)
return self.filesystem.search(
query=name,
@ -769,172 +715,6 @@ class PIFSCommandExecutor:
f"{start}-{end}",
)
def _cmd_search_summary(self, args: list[str]) -> Any:
return self._cmd_semantic_channel("summary", args)
def _cmd_search_entity(self, args: list[str]) -> Any:
return self._cmd_semantic_channel("entity", args)
def _cmd_search_relation(self, args: list[str]) -> Any:
return self._cmd_semantic_channel("relation", args)
def _cmd_semantic_grep(self, args: list[str]) -> Any:
    """Parse and run ``semantic-grep -R <query> [<folder>]``.

    Recognised options:
        -R / -r / --recursive (or a combined grep flag containing R/r):
            required; enables the recursive search.
        -n / --line-number / -i / --ignore-case: accepted and ignored.
        --where <filter>: metadata filter forwarded to the search.
        --limit <n>: result limit, bounded by ``MAX_SEMANTIC_LIMIT``.

    Args:
        args: Command tokens that follow the command name.

    Returns:
        The result of ``_semantic_recursive_grep`` for the normalized folder.

    Raises:
        PIFSCommandError: for an unsupported option, an option that is
            missing its value, a missing ``-R``, no available semantic grep
            channels, a missing query, or a target that is not a folder.
    """
    recursive = False
    where = None
    limit = 10
    positionals = []
    i = 0
    while i < len(args):
        arg = args[i]
        if arg in {"-R", "-r", "--recursive"}:
            recursive = True
        elif self._is_combined_grep_flag(arg):
            recursive = recursive or "R" in arg or "r" in arg
        elif arg in {"-n", "--line-number", "-i", "--ignore-case"}:
            # Accepted for grep compatibility; they do not change behavior here.
            pass
        elif arg in {"--where", "--limit"}:
            i += 1
            # A dangling option used to index past the end of ``args`` and
            # surface as a raw IndexError; report it as a command error.
            if i >= len(args):
                raise PIFSCommandError(f"semantic-grep {arg} requires a value")
            if arg == "--where":
                where = args[i]
            else:
                limit = self._parse_bounded_int(
                    args[i], "semantic-grep --limit", max_value=self.MAX_SEMANTIC_LIMIT
                )
        elif arg.startswith("-"):
            raise PIFSCommandError(f"Unsupported semantic-grep option: {arg}")
        else:
            positionals.append(arg)
        i += 1
    if not recursive:
        raise PIFSCommandError("semantic-grep requires -R/--recursive")
    channels = self._semantic_grep_channels()
    if not channels:
        raise PIFSCommandError(
            "semantic-grep is not available; entity/relation semantic indexes are not configured"
        )
    if not positionals:
        raise PIFSCommandError("semantic-grep requires a query")
    self._validate_search_positionals("semantic-grep", positionals)
    query = positionals[0]
    self._reject_regex_alternation_query(query, "semantic-grep")
    # The folder defaults to the workspace root when only a query is given.
    path = positionals[1] if len(positionals) > 1 else "/"
    if not self._is_folder(path):
        raise PIFSCommandError("semantic-grep target must be a folder")
    return self._semantic_recursive_grep(
        self._normalize_folder_path(path),
        query,
        metadata_filter=where,
        limit=limit,
        channels=channels,
    )
def _cmd_semantic_channel(self, channel: str, args: list[str]) -> Any:
    """Parse and run ``search-<channel> <query> [<folder>]``.

    Recognised options:
        --where <filter>: metadata filter forwarded to the search.
        --limit <n>: result limit, bounded by ``MAX_SEMANTIC_LIMIT``.

    Args:
        channel: Semantic channel name; it also forms the command name
            ``search-<channel>`` used in error messages.
        args: Command tokens that follow the command name.

    Returns:
        A dict with ``mode`` ("files"), the original ``query``, the
        normalized ``scope`` folder, ``retrieval`` ("<channel>_vector") and
        ``data`` (hits built by ``_semantic_channel_hits_from_results``).

    Raises:
        PIFSCommandError: when the channel is not available, an option is
            unsupported or missing its value, or the query is missing.
    """
    if not self.filesystem.has_semantic_channel(channel):
        raise PIFSCommandError(
            f"search-{channel} is not available; {channel} semantic index is not configured"
        )
    where = None
    limit = 10
    positionals = []
    i = 0
    while i < len(args):
        arg = args[i]
        if arg in {"--where", "--limit"}:
            i += 1
            # A dangling option used to index past the end of ``args`` and
            # surface as a raw IndexError; report it as a command error.
            if i >= len(args):
                raise PIFSCommandError(f"search-{channel} {arg} requires a value")
            if arg == "--where":
                where = args[i]
            else:
                limit = self._parse_bounded_int(
                    args[i],
                    f"search-{channel} --limit",
                    max_value=self.MAX_SEMANTIC_LIMIT,
                )
        elif arg.startswith("-"):
            raise PIFSCommandError(f"Unsupported search-{channel} option: {arg}")
        else:
            positionals.append(arg)
        i += 1
    if not positionals:
        raise PIFSCommandError(f"search-{channel} requires a query")
    self._validate_search_positionals(f"search-{channel}", positionals)
    query = positionals[0]
    self._reject_regex_alternation_query(query, f"search-{channel}")
    # The folder defaults to the workspace root when only a query is given.
    path = positionals[1] if len(positionals) > 1 else "/"
    normalized = self._normalize_folder_path(path)
    results = self.filesystem.search_semantic_channel(
        channel,
        self._semantic_retrieval_query(query),
        scope={"folder_path": normalized, "recursive": True},
        metadata_filter=where,
        limit=limit,
    )
    return {
        "mode": "files",
        "query": query,
        "scope": normalized,
        "retrieval": f"{channel}_vector",
        "data": self._semantic_channel_hits_from_results(channel, results, query),
    }
def _semantic_recursive_grep(
self,
folder_path: str,
query: str,
*,
metadata_filter: str | None,
limit: int,
channels: tuple[str, ...],
) -> dict[str, Any]:
vector_query = str(query or "").strip()
candidate_debug: dict[str, Any] = {}
for channel in channels:
channel_results = self.filesystem.search_semantic_channel(
channel,
vector_query,
scope={"folder_path": folder_path, "recursive": True},
metadata_filter=metadata_filter,
limit=self.SEMANTIC_GREP_VECTOR_CANDIDATE_LIMIT,
)
matches = self._grep_file_hits_from_results(
channel_results,
query,
require_match=True,
limit=limit,
)
candidate_debug[channel] = {
"candidates": len(channel_results),
"line_matches": len(matches),
"candidate_doc_ids": [
getattr(result, "external_id", None)
for result in channel_results[:5]
],
}
if matches:
return {
"mode": "files",
"query": query,
"scope": folder_path,
"retrieval": "semantic_grep_" + "_then_".join(channels),
"candidate_limit_per_channel": self.SEMANTIC_GREP_VECTOR_CANDIDATE_LIMIT,
"matched_channel": channel,
"candidate_debug": candidate_debug,
"data": matches,
}
return {
"mode": "files",
"query": query,
"scope": folder_path,
"retrieval": "semantic_grep_" + "_then_".join(channels),
"candidate_limit_per_channel": self.SEMANTIC_GREP_VECTOR_CANDIDATE_LIMIT,
"matched_channel": "",
"candidate_debug": candidate_debug,
"data": [],
}
def _semantic_grep_channels(self) -> tuple[str, ...]:
    """Return the semantic-grep channels this workspace reports as available.

    The result keeps the order of ``SEMANTIC_GREP_CHANNELS`` and includes
    only entries also returned by ``filesystem.semantic_retrieval_channels()``.
    """
    configured = set(self.filesystem.semantic_retrieval_channels())
    selected = []
    for channel in SEMANTIC_GREP_CHANNELS:
        if channel in configured:
            selected.append(channel)
    return tuple(selected)
def _bounded_text_artifact(self, target: str, location: str) -> dict[str, Any]:
if str(location).strip().lower() in {"all", "full", "*"}:
start, end = 1, self.MAX_TEXT_LINES
@ -1077,25 +857,10 @@ class PIFSCommandExecutor:
return
raise PIFSCommandError(
f"{command_name} does not support regex alternation '|'. "
"Run multiple grep commands or multiple search-summary commands "
'Run multiple grep commands or browse <folder> "<query>" '
"with one phrase each."
)
@staticmethod
def _validate_search_positionals(command_name: str, positionals: list[str]) -> None:
if len(positionals) > 2:
raise PIFSCommandError(
f"{command_name} accepts one query and an optional folder path. "
f"Quote multi-word queries, for example: {command_name} "
'"Federal Reserve" /documents'
)
if len(positionals) == 2 and not positionals[1].startswith("/"):
raise PIFSCommandError(
f"{command_name} target must be a PIFS folder path like /documents. "
f"If your query has spaces, quote it, for example: {command_name} "
'"Federal Reserve" /documents'
)
@staticmethod
def _parse_numeric_range(value: str, label: str) -> tuple[int, int]:
try:
@ -1157,10 +922,8 @@ class PIFSCommandExecutor:
return self._render_tree(data)
if command_name == "browse":
return self._render_browse(data)
if command_name in {"grep", "semantic-grep"}:
if command_name == "grep":
return self._render_grep(data)
if command_name in {"search-summary", "search-entity", "search-relation"}:
return self._render_semantic_search(data)
if command_name == "find":
return self._render_find(data)
if command_name == "stat":
@ -1283,26 +1046,6 @@ class PIFSCommandExecutor:
)
return str(data)
def _render_semantic_search(self, data: Any) -> str:
if not isinstance(data, dict):
return str(data)
if data.get("mode") != "files":
return self._render_grep(data)
if not data.get("data", []):
return f"# no matches for: {data.get('query', '')}"
lines: list[str] = []
for item in data.get("data", []):
lines.append(f"path: {item.get('path') or '-'}")
lines.append(f"summary: {self._one_line_value(item.get('summary') or '')}")
if "entity" in item:
lines.append(f"entity: {self._one_line_value(item.get('entity') or '')}")
if "relation" in item:
lines.append(f"relation: {self._one_line_value(item.get('relation') or '')}")
line_text = self._one_line_value(item.get("line_text") or "")
lines.append(f"line_text: {line_text or '-'}")
lines.append("")
return "\n".join(lines).rstrip()
def _render_browse(self, data: Any) -> str:
if not isinstance(data, dict):
return str(data)
@ -1560,12 +1303,12 @@ class PIFSCommandExecutor:
commands = []
quoted_query = shlex.quote(query)
quoted_folder = shlex.quote(folder_path)
if self._semantic_grep_channels():
commands.append(f"semantic-grep -R {quoted_query} {quoted_folder}")
for channel in SEMANTIC_RETRIEVAL_CHANNELS:
if self.filesystem.has_semantic_channel(channel):
command = self.SEMANTIC_CHANNEL_COMMANDS[channel]
commands.append(f"{command} {quoted_query} {quoted_folder}")
command = f"browse -R {quoted_folder} {quoted_query}"
if channel != "summary":
command += f" --space {channel}"
commands.append(command)
return commands
def _rank_child_folders(
@ -1627,37 +1370,6 @@ class PIFSCommandExecutor:
break
return hits
def _semantic_channel_hits_from_results(
self,
channel: str,
results: list[Any],
query: str,
) -> list[dict[str, Any]]:
hits = []
for result in results:
metadata = result.metadata or {}
line, text = self._first_matching_line(result.file_ref, query)
line_text = ""
if text:
line_text = f"{line}: {self._compact_text(text, max_chars=220)}"
hit = {
"path": self._stable_file_target_path(
{
"file_ref": result.file_ref,
"title": result.title,
"folder_paths": result.folder_paths,
"source_path": result.source_path,
"external_id": result.external_id,
}
),
"summary": metadata.get("summary") or "",
"line_text": line_text,
}
if channel in {"entity", "relation"}:
hit[channel] = metadata.get(channel) or ""
hits.append(hit)
return hits
def _rank_child_folders_from_source(
self,
*,

View file

@ -76,7 +76,6 @@ PROJECTION_INDEX_STATUSES = {
}
SEMANTIC_RETRIEVAL_CHANNELS = ("summary", "entity", "relation")
SEMANTIC_GREP_CHANNELS = ("entity", "relation")
PAGEINDEX_DOCUMENT_SUFFIXES = {".pdf", ".md", ".markdown"}
PAGEINDEX_DOCUMENT_CONTENT_TYPES = {
"application/pdf",
@ -249,8 +248,8 @@ class PageIndexFileSystem:
"""Attach semantic retrieval to already-built projection indexes.
Register-time generation owns building the index files. Opening an
existing workspace should still expose the corresponding read commands,
such as search-summary, without forcing a re-register step.
existing workspace should still expose semantic browse, without forcing
a re-register step.
"""
if self.semantic_retrieval_backend is not None:
return bool(self.semantic_retrieval_channels())
@ -696,12 +695,7 @@ class PageIndexFileSystem:
def retrieval_capabilities(self) -> dict[str, Any]:
semantic_channels = self.semantic_retrieval_channels()
semantic_commands = [f"search-{channel}" for channel in semantic_channels]
semantic_grep_channels = [
channel for channel in SEMANTIC_GREP_CHANNELS if channel in semantic_channels
]
if semantic_grep_channels:
semantic_commands.append("semantic-grep")
semantic_commands = ["browse"] if semantic_channels else []
return {
"lexical": {
"grep_recursive": True,
@ -713,7 +707,6 @@ class PageIndexFileSystem:
"backend_configured": self.semantic_retrieval_backend is not None,
"channels": list(semantic_channels),
"commands": semantic_commands,
"semantic_grep_channels": semantic_grep_channels,
},
}

View file

@ -56,11 +56,13 @@ class ChannelBackend:
def __init__(self, document_id, channels=("summary", "entity", "relation")):
self.document_id = document_id
self.channels = channels
self.calls = []
def available_channels(self):
return self.channels
def search_channel(self, channel, query, *, limit=10, filters=None):
self.calls.append((channel, query, limit, filters))
return [
SimpleNamespace(
document_id=self.document_id,
@ -154,10 +156,30 @@ def test_browse_is_agent_visible_semantic_command(tmp_path):
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
filesystem.semantic_retrieval_backend = ChannelBackend("dsid_report")
executor = PIFSCommandExecutor(filesystem)
assert "browse" in executor.allowed_commands()
assert 'browse [-R] <folder> "<query>"' in executor.describe_available_command_surfaces()
allowed = executor.allowed_commands()
surface = executor.describe_available_command_surfaces()
assert "browse" in allowed
assert 'browse [-R] <folder> "<query>"' in surface
assert not {
"search-summary",
"search-entity",
"search-relation",
"semantic-grep",
} & allowed
for old_command in (
"search-summary",
"search-entity",
"search-relation",
"semantic-grep",
"find --name: entity semantic",
"find --relation: relation semantic",
):
assert old_command not in surface
assert executor.command_capabilities()["retrieval"]["semantic"]["commands"] == ["browse"]
def test_browse_requires_positional_query_and_rejects_removed_options(tmp_path):
@ -328,7 +350,6 @@ def test_browse_scopes_semantic_search_before_candidate_limit(tmp_path):
"doc_direct",
]
def test_browse_shell_output_uses_fixed_blocks_with_pagination_command(tmp_path):
import re
@ -447,7 +468,7 @@ def test_browse_shell_path_falls_back_to_unique_locator_when_source_collides(tmp
filesystem.store.resolve_file_ref("/shared/source.json")
def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
def test_browse_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
@ -483,27 +504,17 @@ def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters
executor = PIFSCommandExecutor(filesystem, json_output=True)
result = json.loads(
executor.execute('search-summary "Federal Reserve annual report" /documents')
executor.execute('browse /documents "Federal Reserve annual report"')
)
assert backend.calls[0][2] == {}
assert result["data"]["data"][0] == {
"path": "/examples/documents/report.pdf",
"summary": "Federal Reserve annual report summary",
"line_text": "1: Federal Reserve supervision and regulation annual report.",
}
assert "source_type" not in backend.calls[0][2]
assert "source_path" not in backend.calls[0][2]
assert result["data"]["data"][0]["path"] == "/examples/documents/report.pdf"
assert result["data"]["data"][0]["summary"] == "Federal Reserve annual report summary"
assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == file_ref
executor.json_output = False
rendered = executor.execute('search-summary "Federal Reserve annual report" /documents')
assert "path: /examples/documents/report.pdf" in rendered
assert "summary: Federal Reserve annual report summary" in rendered
assert "line_text: 1: Federal Reserve supervision and regulation annual report." in rendered
assert "id=dsid_report" not in rendered
assert "file_ref=" not in rendered
def test_semantic_search_path_is_unique_source_target_when_titles_collide(tmp_path):
def test_browse_path_is_unique_source_target_when_titles_collide(tmp_path):
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
@ -552,7 +563,7 @@ def test_semantic_search_path_is_unique_source_target_when_titles_collide(tmp_pa
filesystem.semantic_retrieval_backend = SummaryBackend("dsid_first")
executor = PIFSCommandExecutor(filesystem, json_output=True)
result = json.loads(executor.execute('search-summary "H200 reservations" /documents'))
result = json.loads(executor.execute('browse /documents "H200 reservations"'))
assert result["data"]["data"][0]["path"] == "/slack/dsid_first.json"
assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref
@ -560,7 +571,7 @@ def test_semantic_search_path_is_unique_source_target_when_titles_collide(tmp_pa
filesystem.store.resolve_file_ref("/documents/announcements")
def test_semantic_search_path_falls_back_when_source_target_is_ambiguous(tmp_path):
def test_browse_path_falls_back_when_source_target_is_ambiguous(tmp_path):
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
@ -609,14 +620,15 @@ def test_semantic_search_path_falls_back_when_source_target_is_ambiguous(tmp_pat
filesystem.semantic_retrieval_backend = SummaryBackend("dsid_first")
executor = PIFSCommandExecutor(filesystem, json_output=True)
result = json.loads(executor.execute('search-summary "first" /documents'))
result = json.loads(executor.execute('browse /documents "first"'))
assert result["data"]["data"][0]["path"] == "dsid_first"
assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref
def test_entity_relation_search_return_minimal_fields_with_summary(tmp_path):
def test_old_semantic_commands_are_unsupported_even_when_indexes_exist(tmp_path):
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
from pageindex.filesystem.commands import PIFSCommandError
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
class MetadataGenerator:
@ -653,31 +665,29 @@ def test_entity_relation_search_return_minimal_fields_with_summary(tmp_path):
filesystem.semantic_retrieval_backend = ChannelBackend("dsid_market_note")
executor = PIFSCommandExecutor(filesystem, json_output=True)
entity = json.loads(executor.execute('search-entity "Federal Reserve" /documents'))
assert entity["data"]["data"][0] == {
"path": "/examples/documents/market-note.pdf",
"summary": "Risk and compliance summary",
"line_text": "1: Federal Reserve policy affects Disney valuation.",
"entity": "Federal Reserve; Disney",
}
for command in (
'search-summary "Federal Reserve" /documents',
'search-entity "Federal Reserve" /documents',
'search-relation "Disney valuation" /documents',
'semantic-grep -R "Federal Reserve" /documents',
):
with pytest.raises(PIFSCommandError, match="Unsupported command"):
executor.execute(command)
relation = json.loads(executor.execute('search-relation "Disney valuation" /documents'))
assert relation["data"]["data"][0] == {
"path": "/examples/documents/market-note.pdf",
"summary": "Risk and compliance summary",
"line_text": "1: Federal Reserve policy affects Disney valuation.",
"relation": "Federal Reserve affects Disney valuation",
}
entity = json.loads(
executor.execute('browse /documents "Federal Reserve" --space entity')
)
assert entity["data"]["data"][0]["summary"] == "Risk and compliance summary"
assert entity["data"]["data"][0]["path"] == "/examples/documents/market-note.pdf"
executor.json_output = False
rendered = executor.execute('search-entity "Federal Reserve" /documents')
assert "path: /examples/documents/market-note.pdf" in rendered
assert "summary: Risk and compliance summary" in rendered
assert "entity: Federal Reserve; Disney" in rendered
assert "file_ref=" not in rendered
relation = json.loads(
executor.execute('browse /documents "Disney valuation" --space relation')
)
assert relation["data"]["data"][0]["summary"] == "Risk and compliance summary"
assert relation["data"]["data"][0]["path"] == "/examples/documents/market-note.pdf"
def test_semantic_search_rejects_unquoted_multi_word_query(tmp_path):
def test_find_name_is_lexical_and_find_relation_is_not_semantic_alias(tmp_path):
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
from pageindex.filesystem.commands import PIFSCommandError
@ -690,17 +700,42 @@ def test_semantic_search_rejects_unquoted_multi_word_query(tmp_path):
title="Annual report",
content="Federal Reserve supervision and regulation annual report.",
)
filesystem.semantic_retrieval_backend = SummaryBackend("dsid_report")
backend = ChannelBackend("dsid_report", channels=("entity", "relation"))
filesystem.semantic_retrieval_backend = backend
executor = PIFSCommandExecutor(filesystem, json_output=True)
with pytest.raises(PIFSCommandError, match="Quote multi-word queries"):
executor.execute("search-summary Federal Reserve /documents")
result = json.loads(executor.execute("find /documents --name Reserve"))["data"]
with pytest.raises(PIFSCommandError, match="quote it"):
executor.execute("search-summary Federal Reserve")
assert result[0]["external_id"] == "dsid_report"
assert backend.calls == []
with pytest.raises(PIFSCommandError, match="does not support regex alternation"):
executor.execute('search-summary "Federal|Reserve" /documents')
with pytest.raises(PIFSCommandError, match="find --relation is not supported"):
executor.execute('find /documents --relation "Reserve regulates report"')
def test_broad_recursive_grep_suggests_browse_not_removed_semantic_commands(tmp_path):
    """A recursive grep over a too-deep folder suggests ``browse -R`` only.

    The folder threshold probe is patched to report that the depth limit is
    exceeded. The rendered output must contain the browse suggestion and
    none of the removed semantic command names.
    """
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "dsid_report", "/documents")
    filesystem.semantic_retrieval_backend = ChannelBackend("dsid_report")

    thresholds = {
        "depth_limit": 2,
        "file_limit": 10,
        "folder_depth_exceeds_limit": True,
        "file_count_exceeds_limit": False,
        "sampled_file_count": 11,
        "sample_deep_folder_path": "/documents/deep",
    }
    # Hand out a fresh dict per call, as the inline literal did.
    filesystem.store.folder_subtree_thresholds = lambda *args, **kwargs: dict(thresholds)

    rendered = PIFSCommandExecutor(filesystem).execute('grep -R "Federal Reserve" /documents')

    assert "# suggested: browse -R /documents 'Federal Reserve'" in rendered
    for removed_command in (
        "search-summary",
        "search-entity",
        "search-relation",
        "semantic-grep",
    ):
        assert removed_command not in rendered
def test_semantic_search_scope_filters_explicit_source_type_facets():

View file

@ -215,10 +215,19 @@ class PIFSAgentStreamTest(unittest.TestCase):
self.assertIn("Do not run stat merely to understand what a document says", AGENT_TOOL_POLICY)
self.assertIn("Do not use stat as a general content/topic discovery step", BASH_TOOL_DESCRIPTION)
def test_prompt_routes_summary_search_to_search_summary(self):
self.assertIn("search-summary when the user asks for", BASH_TOOL_DESCRIPTION)
self.assertIn('use search-summary "<query>" <folder>', AGENT_TOOL_POLICY)
self.assertIn('search-summary "Federal Reserve" /documents', BASH_TOOL_DESCRIPTION)
def test_prompt_routes_semantic_search_to_browse(self):
for old_command in (
"search-summary",
"search-entity",
"search-relation",
"semantic-grep",
):
self.assertNotIn(old_command, BASH_TOOL_DESCRIPTION)
self.assertNotIn(old_command, AGENT_TOOL_POLICY)
self.assertIn("Use browse when the user", BASH_TOOL_DESCRIPTION)
self.assertIn('use browse <folder> "<query>"', AGENT_TOOL_POLICY)
self.assertIn('browse /documents "Federal Reserve"', BASH_TOOL_DESCRIPTION)
self.assertIn("browse -R <folder>", AGENT_TOOL_POLICY)
self.assertIn("do not translate that request into find --where", AGENT_TOOL_POLICY)
self.assertIn("verify the relevant facts with cat", AGENT_TOOL_POLICY)
self.assertIn("verify the relevant claim with cat", BASH_TOOL_DESCRIPTION)