mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-12 19:55:17 +02:00
Merge Goal 1: converge semantic retrieval on browse
Merge removal of legacy semantic commands into feat/pageindex-filesystem.
This commit is contained in:
commit
889db8cc01
6 changed files with 150 additions and 403 deletions
|
|
@ -4,7 +4,7 @@ PageIndex FileSystem (PIFS) agent demo.
|
|||
This mirrors examples/agentic_vectorless_rag_demo.py, but exposes a corpus
|
||||
through the PageIndex FileSystem shell instead of direct PageIndex document
|
||||
tools. The agent receives one read-only bash-like PIFS tool and must retrieve
|
||||
evidence through commands such as ls, tree, find, grep, search-summary,
|
||||
evidence through commands such as ls, tree, find, grep, browse,
|
||||
cat <path> --structure, cat <path> --page, and cat <path> --node.
|
||||
|
||||
The demo registers supported files under examples/documents. When a matching
|
||||
|
|
@ -72,9 +72,9 @@ Retrieval strategy:
|
|||
or stable file_ref/document ids. Do not invent temporary ref_N aliases.
|
||||
- Folder paths such as /documents are positional command targets; do not put
|
||||
folder paths inside --where.
|
||||
- Use search-summary when available to find likely documents.
|
||||
- Use browse when available to find likely documents by semantic relevance.
|
||||
Quote multi-word queries and include a path, for example:
|
||||
search-summary "Federal Reserve supervision regulation" /documents
|
||||
browse /documents "Federal Reserve supervision regulation"
|
||||
- Use find --where only with JSON metadata DSL, for example:
|
||||
find /documents --where '{"file_format":"pdf"}'
|
||||
- Use grep -R only for lexical evidence; do not treat semantic candidates as
|
||||
|
|
@ -642,15 +642,15 @@ def run_smoke_commands(
|
|||
verbose=verbose,
|
||||
)
|
||||
|
||||
command = 'search-summary "Federal Reserve annual report supervision regulation section page range" /documents'
|
||||
command = 'browse /documents "Federal Reserve annual report supervision regulation section page range"'
|
||||
summary = execute_json_command(json_executor, command)
|
||||
summary_hits = ((summary.get("data") or {}).get("data") or [])
|
||||
if summary_hits:
|
||||
summary_result = f"{len(summary_hits)} summary-vector candidates; top={summary_hits[0].get('external_id')}"
|
||||
summary_result = f"{len(summary_hits)} browse candidates; top={summary_hits[0].get('external_id')}"
|
||||
else:
|
||||
summary_result = "summary-vector command is available, but this tiny two-doc demo returned no candidates"
|
||||
summary_result = "browse is available, but this tiny two-doc demo returned no candidates"
|
||||
show_capability(
|
||||
label="Semantic summary search",
|
||||
label="Semantic browse",
|
||||
command=command,
|
||||
result=summary_result,
|
||||
raw=shell_executor.execute(command) if verbose else "",
|
||||
|
|
|
|||
|
|
@ -35,9 +35,8 @@ document contents in the workspace.
|
|||
|
||||
If the user asks what tools or capabilities you have, describe only the PIFS
|
||||
virtual shell capabilities available inside this workspace: ls, tree, find,
|
||||
stat, grep, cat, and semantic search commands such as search-summary when they
|
||||
are available. Do not mention host runtime tools, SDK internals, or orchestration
|
||||
helpers that are not part of the PIFS shell.
|
||||
stat, grep, cat, and browse. Do not mention host runtime tools, SDK internals,
|
||||
or orchestration helpers that are not part of the PIFS shell.
|
||||
|
||||
If the user asks a workspace-related topic question without naming a specific
|
||||
file, treat it as a retrieval task. Use available PIFS discovery commands to
|
||||
|
|
@ -45,8 +44,8 @@ look for relevant files and inspect evidence before answering. Ask the user to
|
|||
clarify only after a reasonable search cannot identify relevant evidence.
|
||||
Do not conclude that no relevant document exists from one failed grep. If grep
|
||||
returns no matches for a workspace topic, verify with available semantic
|
||||
candidate discovery such as search-summary, or inspect likely document
|
||||
structure, before saying that the workspace lacks evidence.
|
||||
candidate discovery through browse, or inspect likely document structure,
|
||||
before saying that the workspace lacks evidence.
|
||||
|
||||
Follow the task prompt for command policy, retrieval strategy, and answer
|
||||
format. If the caller needs stricter behavior, pass an explicit system_prompt.
|
||||
|
|
@ -55,19 +54,18 @@ format. If the caller needs stricter behavior, pass an explicit system_prompt.
|
|||
BASH_TOOL_DESCRIPTION = """
|
||||
Run a command in the PageIndex FileSystem virtual shell. This is not a real
|
||||
operating-system shell. By default the tool is read-only: use ls, tree, find,
|
||||
grep, cat, stat, head, tail, sed, and any dynamically available semantic search
|
||||
commands described in the workspace context. grep -R is lexical evidence search;
|
||||
grep, cat, stat, head, tail, sed, and browse as described in the workspace
|
||||
context. grep -R is lexical evidence search;
|
||||
grep does not support regex alternation such as "a|b"; run multiple grep
|
||||
commands or use search-summary for semantic candidate discovery instead.
|
||||
semantic search commands such as search-summary return candidate documents and
|
||||
do not guarantee literal text matches or final answer evidence. After choosing
|
||||
a likely search-summary candidate, verify the relevant claim with cat before
|
||||
answering. Use search-summary when the user asks for summary search, semantic
|
||||
search, or vector search and the command is listed as available. Quote
|
||||
multi-word semantic queries, for example:
|
||||
search-summary "Federal Reserve" /documents. Do not write
|
||||
search-summary Federal Reserve /documents. Errors are returned as text prefixed
|
||||
with ERROR. Do not call
|
||||
commands or use browse for semantic candidate discovery instead. browse returns
|
||||
candidate documents ranked by relevance and does not guarantee literal text
|
||||
matches or final answer evidence. After choosing a likely browse candidate,
|
||||
verify the relevant claim with cat before answering. Use browse when the user
|
||||
asks for summary search, semantic search, or vector search and the command is
|
||||
listed as available. Quote multi-word semantic queries, for example:
|
||||
browse /documents "Federal Reserve". Do not write
|
||||
browse /documents Federal Reserve. Errors are returned as text prefixed with
|
||||
ERROR. Do not call
|
||||
commands that are not listed as available. When evidence is required, inspect it
|
||||
with cat or grep before answering. Prefer shell-like target-first cat syntax
|
||||
with stable targets: cat <path> --structure, cat <path> --page 31-59, and
|
||||
|
|
@ -85,7 +83,7 @@ continue with another chunk before answering.
|
|||
For questions about metadata fields, available summaries, or whether metadata
|
||||
was provided, inspect stat --schema and stat <target> before making claims.
|
||||
Do not use stat as a general content/topic discovery step. For document Q&A,
|
||||
prefer search-summary/find/grep for candidates, then cat --structure and
|
||||
prefer ls/tree to choose a folder, browse/find/grep for candidates, then cat --structure and
|
||||
cat --node or cat --page for evidence.
|
||||
"""
|
||||
|
||||
|
|
@ -97,11 +95,11 @@ Tool policy:
|
|||
- Folder paths such as /documents are positional command targets; never put folder paths in --where.
|
||||
- Use --where only with metadata fields shown by stat --schema.
|
||||
- grep -R performs lexical evidence search.
|
||||
- grep does not support regex alternation such as "a|b"; run separate grep commands or use search-summary for semantic candidate discovery.
|
||||
- Semantic search commands are candidate-discovery tools and do not guarantee literal text matches or final answer evidence. After selecting a likely search-summary candidate, verify the relevant facts with cat before answering.
|
||||
- Do not use find | grep as an exhaustive search or as proof that no document exists; find output can be scoped or limited. Use metadata filters, search-summary, grep on a narrowed target, or cat on likely candidates instead.
|
||||
- A single failed grep is not enough evidence to say there is no relevant document. If grep returns no matches for a workspace-topic question, verify with search-summary or another available semantic/vector candidate command, or inspect likely document structure, before answering no-evidence.
|
||||
- If search-summary is available and the user asks for summary search, semantic search, vector search, or "用 summary 搜", use search-summary "<query>" <folder>; quote multi-word queries, for example search-summary "Federal Reserve" /documents; do not translate that request into find --where.
|
||||
- grep does not support regex alternation such as "a|b"; run separate grep commands or use browse for semantic candidate discovery.
|
||||
- browse is the semantic candidate-discovery tool and does not guarantee literal text matches or final answer evidence. After selecting a likely browse candidate, verify the relevant facts with cat before answering.
|
||||
- Do not use find | grep as an exhaustive search or as proof that no document exists; find output can be scoped or limited. Use metadata filters, browse, grep on a narrowed target, or cat on likely candidates instead.
|
||||
- A single failed grep is not enough evidence to say there is no relevant document. If grep returns no matches for a workspace-topic question, verify with browse or inspect likely document structure, before answering no-evidence.
|
||||
- If the user asks for summary search, semantic search, vector search, or "用 summary 搜", use browse <folder> "<query>"; quote multi-word queries, for example browse /documents "Federal Reserve"; use browse -R <folder> when the folder choice is uncertain; do not translate that request into find --where.
|
||||
- Tool errors are returned as ERROR text; recover by trying an available command.
|
||||
- Use cat or grep to gather evidence before making source-backed claims.
|
||||
- Do not reconstruct a file path from a title. Use exact paths returned by PIFS commands, or use file_ref/document_id when available; quote paths that contain spaces.
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ from dataclasses import asdict, is_dataclass
|
|||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .core import SEMANTIC_GREP_CHANNELS, SEMANTIC_RETRIEVAL_CHANNELS, PageIndexFileSystem
|
||||
from .core import SEMANTIC_RETRIEVAL_CHANNELS, PageIndexFileSystem
|
||||
|
||||
|
||||
class PIFSCommandError(ValueError):
|
||||
|
|
@ -30,30 +30,15 @@ class PIFSCommandExecutor:
|
|||
"tail",
|
||||
"sed",
|
||||
}
|
||||
SEMANTIC_CHANNEL_COMMANDS = {
|
||||
"summary": "search-summary",
|
||||
"entity": "search-entity",
|
||||
"relation": "search-relation",
|
||||
}
|
||||
ALLOWED_COMMANDS = (
|
||||
BASE_ALLOWED_COMMANDS
|
||||
| {"semantic-grep"}
|
||||
| set(SEMANTIC_CHANNEL_COMMANDS.values())
|
||||
)
|
||||
ALLOWED_COMMANDS = BASE_ALLOWED_COMMANDS
|
||||
ALLOWED_PIPE_FILTERS = {"head", "tail", "grep", "sed"}
|
||||
COMMAND_METHODS = {
|
||||
"search-summary": "_cmd_search_summary",
|
||||
"search-entity": "_cmd_search_entity",
|
||||
"search-relation": "_cmd_search_relation",
|
||||
"semantic-grep": "_cmd_semantic_grep",
|
||||
}
|
||||
COMMAND_METHODS = {}
|
||||
MAX_CHAINED_COMMANDS = 3
|
||||
MAX_PIPE_COMMANDS = 3
|
||||
MAX_LS_LIMIT = 100
|
||||
MAX_TREE_LIMIT = 200
|
||||
MAX_FIND_LIMIT = 50
|
||||
MAX_GREP_LIMIT = 20
|
||||
MAX_SEMANTIC_LIMIT = 20
|
||||
BROWSE_PAGE_SIZE = 10
|
||||
MAX_TEXT_LINES = 100
|
||||
MAX_PAGE_SPAN = 5
|
||||
|
|
@ -65,7 +50,6 @@ class PIFSCommandExecutor:
|
|||
MAX_TREE_DEPTH = 4
|
||||
MAX_LS_RENDER_FILES = 25
|
||||
MAX_STAT_METADATA_FIELDS = 8
|
||||
SEMANTIC_GREP_VECTOR_CANDIDATE_LIMIT = 20
|
||||
GREP_RECURSIVE_FOLDER_DEPTH_LIMIT = 2
|
||||
GREP_RECURSIVE_FOLDER_FILE_LIMIT = 10
|
||||
|
||||
|
|
@ -81,14 +65,7 @@ class PIFSCommandExecutor:
|
|||
self.query_context = query_context
|
||||
|
||||
def allowed_commands(self) -> set[str]:
|
||||
commands = set(self.BASE_ALLOWED_COMMANDS)
|
||||
semantic_channels = set(self.filesystem.semantic_retrieval_channels())
|
||||
for channel in SEMANTIC_RETRIEVAL_CHANNELS:
|
||||
if channel in semantic_channels:
|
||||
commands.add(self.SEMANTIC_CHANNEL_COMMANDS[channel])
|
||||
if any(channel in semantic_channels for channel in SEMANTIC_GREP_CHANNELS):
|
||||
commands.add("semantic-grep")
|
||||
return commands
|
||||
return set(self.BASE_ALLOWED_COMMANDS)
|
||||
|
||||
def command_capabilities(self) -> dict[str, Any]:
|
||||
return {
|
||||
|
|
@ -116,26 +93,10 @@ class PIFSCommandExecutor:
|
|||
"- cat <path|file_ref|document_id> --all: text artifact reads for txt/text files, paginated at 100 lines",
|
||||
"- stat --field <metadata_field> <target...>: one metadata field across up to 20 documents",
|
||||
]
|
||||
if "entity" in semantic_channels:
|
||||
lines.append("- find --name: entity semantic candidate discovery alias")
|
||||
if "relation" in semantic_channels:
|
||||
lines.append("- find --relation: relation semantic candidate discovery alias")
|
||||
for channel in SEMANTIC_RETRIEVAL_CHANNELS:
|
||||
if channel not in semantic_channels:
|
||||
continue
|
||||
lines.append(
|
||||
f"- {self.SEMANTIC_CHANNEL_COMMANDS[channel]}: "
|
||||
f"{channel} semantic vector candidate discovery"
|
||||
)
|
||||
semantic_grep_channels = semantic.get("semantic_grep_channels") or []
|
||||
if semantic_grep_channels:
|
||||
lines.append(
|
||||
"- semantic-grep -R: semantic candidates from "
|
||||
+ ", ".join(semantic_grep_channels)
|
||||
+ " indexes followed by real line matching"
|
||||
)
|
||||
if not semantic.get("commands"):
|
||||
lines.append("- semantic vector commands: none available in this workspace")
|
||||
if semantic_channels:
|
||||
lines.append("- browse --space available: " + ", ".join(semantic_channels))
|
||||
else:
|
||||
lines.append("- browse --space available: none in this workspace")
|
||||
lines.append("- grep <query> <path|file_ref|document_id>, cat, stat: evidence inspection")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
|
@ -207,8 +168,8 @@ class PIFSCommandExecutor:
|
|||
f"Unsupported pipe command: {name}. Supported pipes are: "
|
||||
f"{', '.join(sorted(self.ALLOWED_PIPE_FILTERS))}. "
|
||||
"If you meant regex alternation such as a|b, PIFS grep/search "
|
||||
"does not support it; run multiple grep or search-summary "
|
||||
"commands with one phrase each."
|
||||
"does not support it; run multiple grep commands or browse "
|
||||
"with one phrase each."
|
||||
)
|
||||
if name == "head":
|
||||
return self._pipe_head_tail(input_text, tokens[1:], from_tail=False)
|
||||
|
|
@ -405,24 +366,9 @@ class PIFSCommandExecutor:
|
|||
return []
|
||||
scope["max_depth"] = max_depth
|
||||
if relation:
|
||||
if not self.filesystem.has_semantic_channel("relation"):
|
||||
raise PIFSCommandError(
|
||||
"find --relation requires a relation semantic index in this workspace"
|
||||
)
|
||||
return self.filesystem.search_semantic_channel(
|
||||
"relation",
|
||||
self._semantic_retrieval_query(relation),
|
||||
scope=scope,
|
||||
metadata_filter=where,
|
||||
limit=limit,
|
||||
)
|
||||
if name and self.filesystem.has_semantic_channel("entity"):
|
||||
return self.filesystem.search_semantic_channel(
|
||||
"entity",
|
||||
self._semantic_retrieval_query(name),
|
||||
scope=scope,
|
||||
metadata_filter=where,
|
||||
limit=limit,
|
||||
raise PIFSCommandError(
|
||||
'find --relation is not supported; use browse <folder> "<query>" '
|
||||
"--space relation for relation semantic file recall"
|
||||
)
|
||||
return self.filesystem.search(
|
||||
query=name,
|
||||
|
|
@ -769,172 +715,6 @@ class PIFSCommandExecutor:
|
|||
f"{start}-{end}",
|
||||
)
|
||||
|
||||
def _cmd_search_summary(self, args: list[str]) -> Any:
|
||||
return self._cmd_semantic_channel("summary", args)
|
||||
|
||||
def _cmd_search_entity(self, args: list[str]) -> Any:
|
||||
return self._cmd_semantic_channel("entity", args)
|
||||
|
||||
def _cmd_search_relation(self, args: list[str]) -> Any:
|
||||
return self._cmd_semantic_channel("relation", args)
|
||||
|
||||
def _cmd_semantic_grep(self, args: list[str]) -> Any:
    """Parse ``semantic-grep`` arguments and run the recursive semantic grep.

    Accepted options: ``-R``/``-r``/``--recursive`` (required), combined
    grep-style flags, ``-n``/``--line-number``/``-i``/``--ignore-case``
    (accepted but ignored), ``--where <filter>`` and ``--limit <n>``.
    Positionals are the query and an optional folder, defaulting to ``/``.

    Returns:
        The result of ``_semantic_recursive_grep`` for the normalized folder.

    Raises:
        PIFSCommandError: for an unknown option, an option that is missing
            its value, a missing ``-R`` flag, no configured semantic-grep
            channels, a missing query, or a target that is not a folder.
    """
    recursive = False
    where = None
    limit = 10
    positionals = []
    i = 0
    while i < len(args):
        arg = args[i]
        if arg in {"-R", "-r", "--recursive"}:
            recursive = True
        elif self._is_combined_grep_flag(arg):
            recursive = recursive or "R" in arg or "r" in arg
        elif arg in {"-n", "--line-number", "-i", "--ignore-case"}:
            # Accepted but ignored: these flags do not alter parsing state.
            pass
        elif arg in {"--where", "--limit"}:
            i += 1
            if i >= len(args):
                # A trailing value-taking option used to raise IndexError;
                # report it as a normal command error instead.
                raise PIFSCommandError(f"semantic-grep {arg} requires a value")
            if arg == "--where":
                where = args[i]
            else:
                limit = self._parse_bounded_int(
                    args[i], "semantic-grep --limit", max_value=self.MAX_SEMANTIC_LIMIT
                )
        elif arg.startswith("-"):
            raise PIFSCommandError(f"Unsupported semantic-grep option: {arg}")
        else:
            positionals.append(arg)
        i += 1
    if not recursive:
        raise PIFSCommandError("semantic-grep requires -R/--recursive")
    channels = self._semantic_grep_channels()
    if not channels:
        raise PIFSCommandError(
            "semantic-grep is not available; entity/relation semantic indexes are not configured"
        )
    if not positionals:
        raise PIFSCommandError("semantic-grep requires a query")
    self._validate_search_positionals("semantic-grep", positionals)
    query = positionals[0]
    self._reject_regex_alternation_query(query, "semantic-grep")
    path = positionals[1] if len(positionals) > 1 else "/"
    if not self._is_folder(path):
        raise PIFSCommandError("semantic-grep target must be a folder")
    return self._semantic_recursive_grep(
        self._normalize_folder_path(path),
        query,
        metadata_filter=where,
        limit=limit,
        channels=channels,
    )
|
||||
|
||||
def _cmd_semantic_channel(self, channel: str, args: list[str]) -> Any:
    """Parse ``search-<channel>`` arguments and query one semantic channel.

    Accepted options are ``--where <filter>`` and ``--limit <n>`` (parsed
    by ``_parse_bounded_int`` with ``MAX_SEMANTIC_LIMIT`` as the cap).
    Positionals are the query and an optional folder, defaulting to ``/``.
    The search is always issued with a recursive folder scope.

    Returns:
        A dict with ``mode``, ``query``, ``scope``, ``retrieval`` and
        ``data`` keys, where ``data`` comes from
        ``_semantic_channel_hits_from_results``.

    Raises:
        PIFSCommandError: when the channel is not configured, an option is
            unknown or missing its value, or no query is given.
    """
    if not self.filesystem.has_semantic_channel(channel):
        raise PIFSCommandError(
            f"search-{channel} is not available; {channel} semantic index is not configured"
        )
    where = None
    limit = 10
    positionals = []
    i = 0
    while i < len(args):
        arg = args[i]
        if arg in {"--where", "--limit"}:
            i += 1
            if i >= len(args):
                # A trailing value-taking option used to raise IndexError;
                # report it as a normal command error instead.
                raise PIFSCommandError(f"search-{channel} {arg} requires a value")
            if arg == "--where":
                where = args[i]
            else:
                limit = self._parse_bounded_int(
                    args[i],
                    f"search-{channel} --limit",
                    max_value=self.MAX_SEMANTIC_LIMIT,
                )
        elif arg.startswith("-"):
            raise PIFSCommandError(f"Unsupported search-{channel} option: {arg}")
        else:
            positionals.append(arg)
        i += 1
    if not positionals:
        raise PIFSCommandError(f"search-{channel} requires a query")
    self._validate_search_positionals(f"search-{channel}", positionals)
    query = positionals[0]
    self._reject_regex_alternation_query(query, f"search-{channel}")
    path = positionals[1] if len(positionals) > 1 else "/"
    normalized = self._normalize_folder_path(path)
    results = self.filesystem.search_semantic_channel(
        channel,
        self._semantic_retrieval_query(query),
        scope={"folder_path": normalized, "recursive": True},
        metadata_filter=where,
        limit=limit,
    )
    return {
        "mode": "files",
        "query": query,
        "scope": normalized,
        "retrieval": f"{channel}_vector",
        "data": self._semantic_channel_hits_from_results(channel, results, query),
    }
|
||||
|
||||
def _semantic_recursive_grep(
|
||||
self,
|
||||
folder_path: str,
|
||||
query: str,
|
||||
*,
|
||||
metadata_filter: str | None,
|
||||
limit: int,
|
||||
channels: tuple[str, ...],
|
||||
) -> dict[str, Any]:
|
||||
vector_query = str(query or "").strip()
|
||||
candidate_debug: dict[str, Any] = {}
|
||||
for channel in channels:
|
||||
channel_results = self.filesystem.search_semantic_channel(
|
||||
channel,
|
||||
vector_query,
|
||||
scope={"folder_path": folder_path, "recursive": True},
|
||||
metadata_filter=metadata_filter,
|
||||
limit=self.SEMANTIC_GREP_VECTOR_CANDIDATE_LIMIT,
|
||||
)
|
||||
matches = self._grep_file_hits_from_results(
|
||||
channel_results,
|
||||
query,
|
||||
require_match=True,
|
||||
limit=limit,
|
||||
)
|
||||
candidate_debug[channel] = {
|
||||
"candidates": len(channel_results),
|
||||
"line_matches": len(matches),
|
||||
"candidate_doc_ids": [
|
||||
getattr(result, "external_id", None)
|
||||
for result in channel_results[:5]
|
||||
],
|
||||
}
|
||||
if matches:
|
||||
return {
|
||||
"mode": "files",
|
||||
"query": query,
|
||||
"scope": folder_path,
|
||||
"retrieval": "semantic_grep_" + "_then_".join(channels),
|
||||
"candidate_limit_per_channel": self.SEMANTIC_GREP_VECTOR_CANDIDATE_LIMIT,
|
||||
"matched_channel": channel,
|
||||
"candidate_debug": candidate_debug,
|
||||
"data": matches,
|
||||
}
|
||||
return {
|
||||
"mode": "files",
|
||||
"query": query,
|
||||
"scope": folder_path,
|
||||
"retrieval": "semantic_grep_" + "_then_".join(channels),
|
||||
"candidate_limit_per_channel": self.SEMANTIC_GREP_VECTOR_CANDIDATE_LIMIT,
|
||||
"matched_channel": "",
|
||||
"candidate_debug": candidate_debug,
|
||||
"data": [],
|
||||
}
|
||||
|
||||
def _semantic_grep_channels(self) -> tuple[str, ...]:
    """Return the semantic-grep channels the filesystem currently reports.

    The result follows the order of ``SEMANTIC_GREP_CHANNELS`` and keeps
    only names present in ``filesystem.semantic_retrieval_channels()``.
    """
    configured = set(self.filesystem.semantic_retrieval_channels())
    selected = []
    for name in SEMANTIC_GREP_CHANNELS:
        if name in configured:
            selected.append(name)
    return tuple(selected)
|
||||
|
||||
def _bounded_text_artifact(self, target: str, location: str) -> dict[str, Any]:
|
||||
if str(location).strip().lower() in {"all", "full", "*"}:
|
||||
start, end = 1, self.MAX_TEXT_LINES
|
||||
|
|
@ -1077,25 +857,10 @@ class PIFSCommandExecutor:
|
|||
return
|
||||
raise PIFSCommandError(
|
||||
f"{command_name} does not support regex alternation '|'. "
|
||||
"Run multiple grep commands or multiple search-summary commands "
|
||||
'Run multiple grep commands or browse <folder> "<query>" '
|
||||
"with one phrase each."
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _validate_search_positionals(command_name: str, positionals: list[str]) -> None:
|
||||
if len(positionals) > 2:
|
||||
raise PIFSCommandError(
|
||||
f"{command_name} accepts one query and an optional folder path. "
|
||||
f"Quote multi-word queries, for example: {command_name} "
|
||||
'"Federal Reserve" /documents'
|
||||
)
|
||||
if len(positionals) == 2 and not positionals[1].startswith("/"):
|
||||
raise PIFSCommandError(
|
||||
f"{command_name} target must be a PIFS folder path like /documents. "
|
||||
f"If your query has spaces, quote it, for example: {command_name} "
|
||||
'"Federal Reserve" /documents'
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _parse_numeric_range(value: str, label: str) -> tuple[int, int]:
|
||||
try:
|
||||
|
|
@ -1157,10 +922,8 @@ class PIFSCommandExecutor:
|
|||
return self._render_tree(data)
|
||||
if command_name == "browse":
|
||||
return self._render_browse(data)
|
||||
if command_name in {"grep", "semantic-grep"}:
|
||||
if command_name == "grep":
|
||||
return self._render_grep(data)
|
||||
if command_name in {"search-summary", "search-entity", "search-relation"}:
|
||||
return self._render_semantic_search(data)
|
||||
if command_name == "find":
|
||||
return self._render_find(data)
|
||||
if command_name == "stat":
|
||||
|
|
@ -1283,26 +1046,6 @@ class PIFSCommandExecutor:
|
|||
)
|
||||
return str(data)
|
||||
|
||||
def _render_semantic_search(self, data: Any) -> str:
|
||||
if not isinstance(data, dict):
|
||||
return str(data)
|
||||
if data.get("mode") != "files":
|
||||
return self._render_grep(data)
|
||||
if not data.get("data", []):
|
||||
return f"# no matches for: {data.get('query', '')}"
|
||||
lines: list[str] = []
|
||||
for item in data.get("data", []):
|
||||
lines.append(f"path: {item.get('path') or '-'}")
|
||||
lines.append(f"summary: {self._one_line_value(item.get('summary') or '')}")
|
||||
if "entity" in item:
|
||||
lines.append(f"entity: {self._one_line_value(item.get('entity') or '')}")
|
||||
if "relation" in item:
|
||||
lines.append(f"relation: {self._one_line_value(item.get('relation') or '')}")
|
||||
line_text = self._one_line_value(item.get("line_text") or "")
|
||||
lines.append(f"line_text: {line_text or '-'}")
|
||||
lines.append("")
|
||||
return "\n".join(lines).rstrip()
|
||||
|
||||
def _render_browse(self, data: Any) -> str:
|
||||
if not isinstance(data, dict):
|
||||
return str(data)
|
||||
|
|
@ -1560,12 +1303,12 @@ class PIFSCommandExecutor:
|
|||
commands = []
|
||||
quoted_query = shlex.quote(query)
|
||||
quoted_folder = shlex.quote(folder_path)
|
||||
if self._semantic_grep_channels():
|
||||
commands.append(f"semantic-grep -R {quoted_query} {quoted_folder}")
|
||||
for channel in SEMANTIC_RETRIEVAL_CHANNELS:
|
||||
if self.filesystem.has_semantic_channel(channel):
|
||||
command = self.SEMANTIC_CHANNEL_COMMANDS[channel]
|
||||
commands.append(f"{command} {quoted_query} {quoted_folder}")
|
||||
command = f"browse -R {quoted_folder} {quoted_query}"
|
||||
if channel != "summary":
|
||||
command += f" --space {channel}"
|
||||
commands.append(command)
|
||||
return commands
|
||||
|
||||
def _rank_child_folders(
|
||||
|
|
@ -1627,37 +1370,6 @@ class PIFSCommandExecutor:
|
|||
break
|
||||
return hits
|
||||
|
||||
def _semantic_channel_hits_from_results(
|
||||
self,
|
||||
channel: str,
|
||||
results: list[Any],
|
||||
query: str,
|
||||
) -> list[dict[str, Any]]:
|
||||
hits = []
|
||||
for result in results:
|
||||
metadata = result.metadata or {}
|
||||
line, text = self._first_matching_line(result.file_ref, query)
|
||||
line_text = ""
|
||||
if text:
|
||||
line_text = f"{line}: {self._compact_text(text, max_chars=220)}"
|
||||
hit = {
|
||||
"path": self._stable_file_target_path(
|
||||
{
|
||||
"file_ref": result.file_ref,
|
||||
"title": result.title,
|
||||
"folder_paths": result.folder_paths,
|
||||
"source_path": result.source_path,
|
||||
"external_id": result.external_id,
|
||||
}
|
||||
),
|
||||
"summary": metadata.get("summary") or "",
|
||||
"line_text": line_text,
|
||||
}
|
||||
if channel in {"entity", "relation"}:
|
||||
hit[channel] = metadata.get(channel) or ""
|
||||
hits.append(hit)
|
||||
return hits
|
||||
|
||||
def _rank_child_folders_from_source(
|
||||
self,
|
||||
*,
|
||||
|
|
|
|||
|
|
@ -76,7 +76,6 @@ PROJECTION_INDEX_STATUSES = {
|
|||
}
|
||||
|
||||
SEMANTIC_RETRIEVAL_CHANNELS = ("summary", "entity", "relation")
|
||||
SEMANTIC_GREP_CHANNELS = ("entity", "relation")
|
||||
PAGEINDEX_DOCUMENT_SUFFIXES = {".pdf", ".md", ".markdown"}
|
||||
PAGEINDEX_DOCUMENT_CONTENT_TYPES = {
|
||||
"application/pdf",
|
||||
|
|
@ -249,8 +248,8 @@ class PageIndexFileSystem:
|
|||
"""Attach semantic retrieval to already-built projection indexes.
|
||||
|
||||
Register-time generation owns building the index files. Opening an
|
||||
existing workspace should still expose the corresponding read commands,
|
||||
such as search-summary, without forcing a re-register step.
|
||||
existing workspace should still expose semantic browse, without forcing
|
||||
a re-register step.
|
||||
"""
|
||||
if self.semantic_retrieval_backend is not None:
|
||||
return bool(self.semantic_retrieval_channels())
|
||||
|
|
@ -696,12 +695,7 @@ class PageIndexFileSystem:
|
|||
|
||||
def retrieval_capabilities(self) -> dict[str, Any]:
|
||||
semantic_channels = self.semantic_retrieval_channels()
|
||||
semantic_commands = [f"search-{channel}" for channel in semantic_channels]
|
||||
semantic_grep_channels = [
|
||||
channel for channel in SEMANTIC_GREP_CHANNELS if channel in semantic_channels
|
||||
]
|
||||
if semantic_grep_channels:
|
||||
semantic_commands.append("semantic-grep")
|
||||
semantic_commands = ["browse"] if semantic_channels else []
|
||||
return {
|
||||
"lexical": {
|
||||
"grep_recursive": True,
|
||||
|
|
@ -713,7 +707,6 @@ class PageIndexFileSystem:
|
|||
"backend_configured": self.semantic_retrieval_backend is not None,
|
||||
"channels": list(semantic_channels),
|
||||
"commands": semantic_commands,
|
||||
"semantic_grep_channels": semantic_grep_channels,
|
||||
},
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -56,11 +56,13 @@ class ChannelBackend:
|
|||
def __init__(self, document_id, channels=("summary", "entity", "relation")):
|
||||
self.document_id = document_id
|
||||
self.channels = channels
|
||||
self.calls = []
|
||||
|
||||
def available_channels(self):
|
||||
return self.channels
|
||||
|
||||
def search_channel(self, channel, query, *, limit=10, filters=None):
|
||||
self.calls.append((channel, query, limit, filters))
|
||||
return [
|
||||
SimpleNamespace(
|
||||
document_id=self.document_id,
|
||||
|
|
@ -154,10 +156,30 @@ def test_browse_is_agent_visible_semantic_command(tmp_path):
|
|||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||
|
||||
filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
|
||||
filesystem.semantic_retrieval_backend = ChannelBackend("dsid_report")
|
||||
executor = PIFSCommandExecutor(filesystem)
|
||||
|
||||
assert "browse" in executor.allowed_commands()
|
||||
assert 'browse [-R] <folder> "<query>"' in executor.describe_available_command_surfaces()
|
||||
allowed = executor.allowed_commands()
|
||||
surface = executor.describe_available_command_surfaces()
|
||||
|
||||
assert "browse" in allowed
|
||||
assert 'browse [-R] <folder> "<query>"' in surface
|
||||
assert not {
|
||||
"search-summary",
|
||||
"search-entity",
|
||||
"search-relation",
|
||||
"semantic-grep",
|
||||
} & allowed
|
||||
for old_command in (
|
||||
"search-summary",
|
||||
"search-entity",
|
||||
"search-relation",
|
||||
"semantic-grep",
|
||||
"find --name: entity semantic",
|
||||
"find --relation: relation semantic",
|
||||
):
|
||||
assert old_command not in surface
|
||||
assert executor.command_capabilities()["retrieval"]["semantic"]["commands"] == ["browse"]
|
||||
|
||||
|
||||
def test_browse_requires_positional_query_and_rejects_removed_options(tmp_path):
|
||||
|
|
@ -328,7 +350,6 @@ def test_browse_scopes_semantic_search_before_candidate_limit(tmp_path):
|
|||
"doc_direct",
|
||||
]
|
||||
|
||||
|
||||
def test_browse_shell_output_uses_fixed_blocks_with_pagination_command(tmp_path):
|
||||
import re
|
||||
|
||||
|
|
@ -447,7 +468,7 @@ def test_browse_shell_path_falls_back_to_unique_locator_when_source_collides(tmp
|
|||
filesystem.store.resolve_file_ref("/shared/source.json")
|
||||
|
||||
|
||||
def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
|
||||
def test_browse_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
|
||||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
|
||||
|
||||
|
|
@ -483,27 +504,17 @@ def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters
|
|||
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
||||
|
||||
result = json.loads(
|
||||
executor.execute('search-summary "Federal Reserve annual report" /documents')
|
||||
executor.execute('browse /documents "Federal Reserve annual report"')
|
||||
)
|
||||
|
||||
assert backend.calls[0][2] == {}
|
||||
assert result["data"]["data"][0] == {
|
||||
"path": "/examples/documents/report.pdf",
|
||||
"summary": "Federal Reserve annual report summary",
|
||||
"line_text": "1: Federal Reserve supervision and regulation annual report.",
|
||||
}
|
||||
assert "source_type" not in backend.calls[0][2]
|
||||
assert "source_path" not in backend.calls[0][2]
|
||||
assert result["data"]["data"][0]["path"] == "/examples/documents/report.pdf"
|
||||
assert result["data"]["data"][0]["summary"] == "Federal Reserve annual report summary"
|
||||
assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == file_ref
|
||||
|
||||
executor.json_output = False
|
||||
rendered = executor.execute('search-summary "Federal Reserve annual report" /documents')
|
||||
assert "path: /examples/documents/report.pdf" in rendered
|
||||
assert "summary: Federal Reserve annual report summary" in rendered
|
||||
assert "line_text: 1: Federal Reserve supervision and regulation annual report." in rendered
|
||||
assert "id=dsid_report" not in rendered
|
||||
assert "file_ref=" not in rendered
|
||||
|
||||
|
||||
def test_semantic_search_path_is_unique_source_target_when_titles_collide(tmp_path):
|
||||
def test_browse_path_is_unique_source_target_when_titles_collide(tmp_path):
|
||||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
|
||||
|
||||
|
|
@ -552,7 +563,7 @@ def test_semantic_search_path_is_unique_source_target_when_titles_collide(tmp_pa
|
|||
filesystem.semantic_retrieval_backend = SummaryBackend("dsid_first")
|
||||
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
||||
|
||||
result = json.loads(executor.execute('search-summary "H200 reservations" /documents'))
|
||||
result = json.loads(executor.execute('browse /documents "H200 reservations"'))
|
||||
|
||||
assert result["data"]["data"][0]["path"] == "/slack/dsid_first.json"
|
||||
assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref
|
||||
|
|
@ -560,7 +571,7 @@ def test_semantic_search_path_is_unique_source_target_when_titles_collide(tmp_pa
|
|||
filesystem.store.resolve_file_ref("/documents/announcements")
|
||||
|
||||
|
||||
def test_semantic_search_path_falls_back_when_source_target_is_ambiguous(tmp_path):
|
||||
def test_browse_path_falls_back_when_source_target_is_ambiguous(tmp_path):
|
||||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
|
||||
|
||||
|
|
@ -609,14 +620,15 @@ def test_semantic_search_path_falls_back_when_source_target_is_ambiguous(tmp_pat
|
|||
filesystem.semantic_retrieval_backend = SummaryBackend("dsid_first")
|
||||
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
||||
|
||||
result = json.loads(executor.execute('search-summary "first" /documents'))
|
||||
result = json.loads(executor.execute('browse /documents "first"'))
|
||||
|
||||
assert result["data"]["data"][0]["path"] == "dsid_first"
|
||||
assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref
|
||||
|
||||
|
||||
def test_entity_relation_search_return_minimal_fields_with_summary(tmp_path):
|
||||
def test_old_semantic_commands_are_unsupported_even_when_indexes_exist(tmp_path):
|
||||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||
from pageindex.filesystem.commands import PIFSCommandError
|
||||
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
|
||||
|
||||
class MetadataGenerator:
|
||||
|
|
@ -653,31 +665,29 @@ def test_entity_relation_search_return_minimal_fields_with_summary(tmp_path):
|
|||
filesystem.semantic_retrieval_backend = ChannelBackend("dsid_market_note")
|
||||
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
||||
|
||||
entity = json.loads(executor.execute('search-entity "Federal Reserve" /documents'))
|
||||
assert entity["data"]["data"][0] == {
|
||||
"path": "/examples/documents/market-note.pdf",
|
||||
"summary": "Risk and compliance summary",
|
||||
"line_text": "1: Federal Reserve policy affects Disney valuation.",
|
||||
"entity": "Federal Reserve; Disney",
|
||||
}
|
||||
for command in (
|
||||
'search-summary "Federal Reserve" /documents',
|
||||
'search-entity "Federal Reserve" /documents',
|
||||
'search-relation "Disney valuation" /documents',
|
||||
'semantic-grep -R "Federal Reserve" /documents',
|
||||
):
|
||||
with pytest.raises(PIFSCommandError, match="Unsupported command"):
|
||||
executor.execute(command)
|
||||
|
||||
relation = json.loads(executor.execute('search-relation "Disney valuation" /documents'))
|
||||
assert relation["data"]["data"][0] == {
|
||||
"path": "/examples/documents/market-note.pdf",
|
||||
"summary": "Risk and compliance summary",
|
||||
"line_text": "1: Federal Reserve policy affects Disney valuation.",
|
||||
"relation": "Federal Reserve affects Disney valuation",
|
||||
}
|
||||
entity = json.loads(
|
||||
executor.execute('browse /documents "Federal Reserve" --space entity')
|
||||
)
|
||||
assert entity["data"]["data"][0]["summary"] == "Risk and compliance summary"
|
||||
assert entity["data"]["data"][0]["path"] == "/examples/documents/market-note.pdf"
|
||||
|
||||
executor.json_output = False
|
||||
rendered = executor.execute('search-entity "Federal Reserve" /documents')
|
||||
assert "path: /examples/documents/market-note.pdf" in rendered
|
||||
assert "summary: Risk and compliance summary" in rendered
|
||||
assert "entity: Federal Reserve; Disney" in rendered
|
||||
assert "file_ref=" not in rendered
|
||||
relation = json.loads(
|
||||
executor.execute('browse /documents "Disney valuation" --space relation')
|
||||
)
|
||||
assert relation["data"]["data"][0]["summary"] == "Risk and compliance summary"
|
||||
assert relation["data"]["data"][0]["path"] == "/examples/documents/market-note.pdf"
|
||||
|
||||
|
||||
def test_semantic_search_rejects_unquoted_multi_word_query(tmp_path):
|
||||
def test_find_name_is_lexical_and_find_relation_is_not_semantic_alias(tmp_path):
|
||||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||
from pageindex.filesystem.commands import PIFSCommandError
|
||||
|
||||
|
|
@ -690,17 +700,42 @@ def test_semantic_search_rejects_unquoted_multi_word_query(tmp_path):
|
|||
title="Annual report",
|
||||
content="Federal Reserve supervision and regulation annual report.",
|
||||
)
|
||||
filesystem.semantic_retrieval_backend = SummaryBackend("dsid_report")
|
||||
backend = ChannelBackend("dsid_report", channels=("entity", "relation"))
|
||||
filesystem.semantic_retrieval_backend = backend
|
||||
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
||||
|
||||
with pytest.raises(PIFSCommandError, match="Quote multi-word queries"):
|
||||
executor.execute("search-summary Federal Reserve /documents")
|
||||
result = json.loads(executor.execute("find /documents --name Reserve"))["data"]
|
||||
|
||||
with pytest.raises(PIFSCommandError, match="quote it"):
|
||||
executor.execute("search-summary Federal Reserve")
|
||||
assert result[0]["external_id"] == "dsid_report"
|
||||
assert backend.calls == []
|
||||
|
||||
with pytest.raises(PIFSCommandError, match="does not support regex alternation"):
|
||||
executor.execute('search-summary "Federal|Reserve" /documents')
|
||||
with pytest.raises(PIFSCommandError, match="find --relation is not supported"):
|
||||
executor.execute('find /documents --relation "Reserve regulates report"')
|
||||
|
||||
|
||||
def test_broad_recursive_grep_suggests_browse_not_removed_semantic_commands(tmp_path):
    """A broad ``grep -R`` must point the user at ``browse -R`` only.

    The folder-threshold probe on the store is stubbed so that the target
    folder is reported as exceeding the depth limit.  The rendered grep output
    is then expected to carry a ``browse -R`` suggestion and to mention none
    of the removed semantic command names.
    """
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    # Stubbed probe result: depth limit exceeded, file-count limit not.
    stub_thresholds = {
        "depth_limit": 2,
        "file_limit": 10,
        "folder_depth_exceeds_limit": True,
        "file_count_exceeds_limit": False,
        "sampled_file_count": 11,
        "sample_deep_folder_path": "/documents/deep",
    }
    removed_commands = (
        "search-summary",
        "search-entity",
        "search-relation",
        "semantic-grep",
    )

    fs = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(fs, "dsid_report", "/documents")
    fs.semantic_retrieval_backend = ChannelBackend("dsid_report")
    # Hand out a fresh dict on every call, exactly like a literal would.
    fs.store.folder_subtree_thresholds = lambda *args, **kwargs: dict(stub_thresholds)

    output = PIFSCommandExecutor(fs).execute('grep -R "Federal Reserve" /documents')

    assert "# suggested: browse -R /documents 'Federal Reserve'" in output
    for removed in removed_commands:
        assert removed not in output
|
||||
|
||||
|
||||
def test_semantic_search_scope_filters_explicit_source_type_facets():
|
||||
|
|
|
|||
|
|
@ -215,10 +215,19 @@ class PIFSAgentStreamTest(unittest.TestCase):
|
|||
self.assertIn("Do not run stat merely to understand what a document says", AGENT_TOOL_POLICY)
|
||||
self.assertIn("Do not use stat as a general content/topic discovery step", BASH_TOOL_DESCRIPTION)
|
||||
|
||||
def test_prompt_routes_summary_search_to_search_summary(self):
|
||||
self.assertIn("search-summary when the user asks for", BASH_TOOL_DESCRIPTION)
|
||||
self.assertIn('use search-summary "<query>" <folder>', AGENT_TOOL_POLICY)
|
||||
self.assertIn('search-summary "Federal Reserve" /documents', BASH_TOOL_DESCRIPTION)
|
||||
def test_prompt_routes_semantic_search_to_browse(self):
    """Prompt texts must route semantic retrieval through ``browse`` only.

    Checks that neither prompt constant mentions a removed semantic command,
    then checks that each expected ``browse`` / ``cat`` guidance phrase is
    present in the prompt constant it belongs to.
    """
    removed_commands = (
        "search-summary",
        "search-entity",
        "search-relation",
        "semantic-grep",
    )
    # (phrase, prompt text) pairs, kept in the order they are asserted.
    required_phrases = (
        ("Use browse when the user", BASH_TOOL_DESCRIPTION),
        ('use browse <folder> "<query>"', AGENT_TOOL_POLICY),
        ('browse /documents "Federal Reserve"', BASH_TOOL_DESCRIPTION),
        ("browse -R <folder>", AGENT_TOOL_POLICY),
        ("do not translate that request into find --where", AGENT_TOOL_POLICY),
        ("verify the relevant facts with cat", AGENT_TOOL_POLICY),
        ("verify the relevant claim with cat", BASH_TOOL_DESCRIPTION),
    )

    for removed in removed_commands:
        for prompt_text in (BASH_TOOL_DESCRIPTION, AGENT_TOOL_POLICY):
            self.assertNotIn(removed, prompt_text)

    for phrase, prompt_text in required_phrases:
        self.assertIn(phrase, prompt_text)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue