fix(filesystem): require target-first cat syntax

This commit is contained in:
BukeLy 2026-05-26 15:00:23 +08:00
parent 3d62293a1e
commit b9ee711087
5 changed files with 57 additions and 37 deletions

View file

@ -5,7 +5,7 @@ This mirrors examples/agentic_vectorless_rag_demo.py, but exposes a corpus
through the PageIndex FileSystem shell instead of direct PageIndex document
tools. The agent receives one read-only bash-like PIFS tool and must retrieve
evidence through commands such as ls, tree, find, grep, search-summary,
cat --structure, cat --page, and cat --node.
cat <ref> --structure, cat <ref> --page, and cat <ref> --node.
The demo uses PDFs under examples/documents. When a matching
examples/documents/results/*_structure.json file exists, it is loaded into the
@ -71,12 +71,12 @@ Retrieval strategy:
- Use grep -R only for lexical evidence; do not treat semantic candidates as
literal matches.
- Run one evidence command at a time. Do not chain large commands like
cat --structure, grep, and cat --page in one bash call.
- For PDFs, use cat --structure <ref> to inspect the PageIndex tree, then
cat --page <range> <ref> for evidence, for example:
cat --page 31-35 ref_1
- For page-range questions, use cat --structure to identify the full section
range. Then run cat --page on the smallest useful evidence range, usually the
cat <ref> --structure, grep, and cat <ref> --page in one bash call.
- For PDFs, use cat <ref> --structure to inspect the PageIndex tree, then
cat <ref> --page <range> for evidence, for example:
cat ref_1 --page 31-35
- For page-range questions, use cat <ref> --structure to identify the full section
range. Then run cat <ref> --page on the smallest useful evidence range, usually the
section start page or first 1-2 pages, before the final answer. Do not print
a broad multi-page section unless the user asks to read the whole section.
- Do not use cat <ref> --all on PDFs.
@ -646,7 +646,7 @@ def run_smoke_commands(
verbose=verbose,
)
command = f"cat --structure {first_ref}"
command = f"cat {first_ref} --structure"
structure_payload = execute_json_command(json_executor, command)
structure_data = structure_payload.get("data") or {}
structure = structure_data.get("structure") or []
@ -664,7 +664,7 @@ def run_smoke_commands(
)
evidence_range = opening_page_range_for_node(supervision_node) or "1-2"
command = f"cat --page {evidence_range} {first_ref}"
command = f"cat {first_ref} --page {evidence_range}"
page = execute_json_command(json_executor, command)
page_text = str((page.get("data") or {}).get("text") or "")
show_capability(

View file

@ -36,7 +36,8 @@ commands described in the workspace context. grep -R is lexical evidence search;
semantic search commands return candidate documents and do not guarantee literal
text matches. Errors are returned as text prefixed with ERROR. Do not call
commands that are not listed as available. When evidence is required, inspect it
with cat or grep before answering.
with cat or grep before answering. Prefer shell-like target-first cat syntax:
cat <ref> --structure, cat <ref> --page 31-59, and cat <ref> --node 0009.
"""
AGENT_TOOL_POLICY = """
@ -48,6 +49,8 @@ Tool policy:
- Semantic search commands are candidate-discovery tools and do not guarantee literal text matches.
- Tool errors are returned as ERROR text; recover by trying an available command.
- Use cat or grep to gather evidence before making source-backed claims.
- Always use target-first cat syntax (option-first forms are rejected): cat <ref> --structure, cat <ref> --page 31-59, cat <ref> --node <node_id>.
- Do not call cat --page <ref> <start> <end>; if you need a page span, use cat <ref> --page <start>-<end>.
"""
STREAM_MODE_ALIASES = {

View file

@ -90,8 +90,8 @@ class PIFSCommandExecutor:
"- ls/tree: folder browsing",
"- find --where: exact/canonical metadata DSL filtering",
"- grep -R: recursive lexical/FTS search only; semantic vector prefilter is disabled",
"- cat --structure/--node/--page: cached PageIndex reads for PDF/Markdown files",
"- cat --all: full text artifact reads for txt/text files",
"- cat <ref> --structure/--node/--page: cached PageIndex reads for PDF/Markdown files",
"- cat <ref> --all: full text artifact reads for txt/text files",
]
if "entity" in semantic_channels:
lines.append("- find --name: entity semantic candidate discovery alias")
@ -405,12 +405,17 @@ class PIFSCommandExecutor:
def _cmd_cat(self, args: list[str]) -> Any:
if not args:
raise PIFSCommandError("cat requires a file target")
target = None
target = args[0]
if target.startswith("-"):
raise PIFSCommandError(
"cat syntax is target-first: cat <ref> --structure, "
"cat <ref> --page 31-59, or cat <ref> --node 0009"
)
location = "all"
structural_mode: str | None = None
node_id: str | None = None
page_range: str | None = None
i = 0
i = 1
while i < len(args):
arg = args[i]
if arg == "--range":
@ -437,16 +442,22 @@ class PIFSCommandExecutor:
elif arg.startswith("-"):
raise PIFSCommandError(f"Unsupported cat option: {arg}")
else:
target = arg
raise PIFSCommandError(
"cat accepts one file target. Use: cat <ref> --page <page-or-range>, "
"for example: cat ref_1 --page 31-59"
)
i += 1
if not target:
raise PIFSCommandError("cat requires a file target")
if structural_mode == "structure":
return self.filesystem.pageindex_structure(target)
if structural_mode == "node":
return self.filesystem.pageindex_node(target, str(node_id))
if structural_mode == "page":
return self.filesystem.pageindex_pages(target, str(page_range))
if not page_range or not re.fullmatch(r"\d+(?:-\d+)?", page_range):
raise PIFSCommandError(
"cat --page requires one page selector like 31 or 31-59. "
"Use: cat <ref> --page <page-or-range>"
)
return self.filesystem.pageindex_pages(target, page_range)
return self.filesystem.cat_text_artifact(target, location)
def _cmd_stat(self, args: list[str]) -> Any:

View file

@ -677,7 +677,8 @@ class PageIndexFileSystem:
raise ValueError(
f"{command} is only supported for txt/text files; "
f"got source_path={entry.source_path!r}, content_type={entry.content_type!r}. "
"Use cat --structure, cat --page, or cat --node for PDF/Markdown PageIndex files."
"Use cat <ref> --structure, cat <ref> --page, or cat <ref> --node "
"for PDF/Markdown PageIndex files."
)
def _require_pageindex_document_file(self, entry: Any, command: str) -> None:
@ -686,7 +687,7 @@ class PageIndexFileSystem:
raise ValueError(
f"{command} is only supported for PDF/Markdown PageIndex files; "
f"got source_path={entry.source_path!r}, content_type={entry.content_type!r}. "
"Use cat --all for txt/text files."
"Use cat <ref> --all for txt/text files."
)
@classmethod

View file

@ -67,9 +67,9 @@ def test_pageindex_structure_options_report_failed_register_build(monkeypatch):
)
executor = PIFSCommandExecutor(filesystem, json_output=True)
structure = json.loads(executor.execute("cat --structure dsid_structural_missing"))
node = json.loads(executor.execute("cat --node 0001 dsid_structural_missing"))
pages = json.loads(executor.execute("cat --page 1-2 dsid_structural_missing"))
structure = json.loads(executor.execute("cat dsid_structural_missing --structure"))
node = json.loads(executor.execute("cat dsid_structural_missing --node 0001"))
pages = json.loads(executor.execute("cat dsid_structural_missing --page 1-2"))
stat = json.loads(executor.execute("stat dsid_structural_missing"))
assert structure["data"]["mode"] == "structure"
@ -282,6 +282,7 @@ def test_register_pdf_markdown_cache_miss_invokes_pageindex_client_index(monkeyp
def test_cat_structure_page_reuses_pageindex_client_cache_without_indexing(monkeypatch):
from pageindex import PageIndexClient
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
from pageindex.filesystem.commands import PIFSCommandError
with tempfile.TemporaryDirectory() as tmp:
source = Path(tmp) / "report.pdf"
@ -333,8 +334,8 @@ def test_cat_structure_page_reuses_pageindex_client_cache_without_indexing(monke
)
executor = PIFSCommandExecutor(filesystem, json_output=True)
structure = json.loads(executor.execute("cat --structure dsid_structural_cached"))
pages = json.loads(executor.execute("cat --page 1-2 dsid_structural_cached"))
structure = json.loads(executor.execute("cat dsid_structural_cached --structure"))
pages = json.loads(executor.execute("cat dsid_structural_cached --page 1-2"))
stat = json.loads(executor.execute("stat dsid_structural_cached"))
assert structure["data"]["available"] is True
@ -345,6 +346,10 @@ def test_cat_structure_page_reuses_pageindex_client_cache_without_indexing(monke
assert pages["data"]["available"] is True
assert pages["data"]["text"] == "Page one text\n\nPage two text"
with pytest.raises(PIFSCommandError, match="target-first"):
executor.execute("cat --page 1-2 dsid_structural_cached")
with pytest.raises(PIFSCommandError, match="one file target"):
executor.execute("cat dsid_structural_cached --page 1 2")
assert stat["data"]["pageindex_doc_id"] == "doc_cached_pdf"
assert stat["data"]["pageindex_tree_status"] == "built"
@ -387,7 +392,7 @@ def test_cat_node_reads_pageindex_client_structure_without_custom_pifs_artifact(
)
executor = PIFSCommandExecutor(filesystem, json_output=True)
node = json.loads(executor.execute("cat --node 0001 dsid_md_cached"))
node = json.loads(executor.execute("cat dsid_md_cached --node 0001"))
assert node["data"]["available"] is True
assert node["data"]["pageindex_doc_id"] == "doc_cached_md"
@ -453,7 +458,7 @@ def test_tree_does_not_read_file_internal_pageindex_structure():
with pytest.raises(PIFSCommandError):
executor.execute("tree dsid_tree_is_folder_only")
structure = json.loads(executor.execute("cat --structure dsid_tree_is_folder_only"))
structure = json.loads(executor.execute("cat dsid_tree_is_folder_only --structure"))
assert structure["data"]["structure"][0]["title"] == "Introduction"
@ -494,19 +499,19 @@ def test_cat_all_is_limited_to_text_files():
)
executor = PIFSCommandExecutor(filesystem, json_output=True)
text = json.loads(executor.execute("cat --all dsid_text_file"))
text = json.loads(executor.execute("cat dsid_text_file --all"))
assert text["data"]["text"] == "plain text body"
with pytest.raises(PIFSCommandError, match="only supported for txt/text files"):
executor.execute("cat --all dsid_pdf_file")
executor.execute("cat dsid_pdf_file --all")
with pytest.raises(ValueError, match="not supported for PDF/Markdown"):
filesystem.open("dsid_pdf_file")
with pytest.raises(PIFSCommandError, match="only supported for txt/text files"):
executor.execute("cat --all dsid_md_file")
executor.execute("cat dsid_md_file --all")
with pytest.raises(ValueError, match="not supported for PDF/Markdown"):
filesystem.open("dsid_md_file")
with pytest.raises(PIFSCommandError, match="only supported for txt/text files"):
executor.execute("cat --all dsid_json_file")
executor.execute("cat dsid_json_file --all")
assert filesystem.open("dsid_json_file").text == '{"body":"json"}'
for command in (
"head dsid_pdf_file",
@ -536,9 +541,9 @@ def test_pageindex_structure_commands_are_limited_to_pdf_and_markdown():
executor = PIFSCommandExecutor(filesystem, json_output=True)
for command in (
"cat --structure dsid_text_only",
"cat --page 1 dsid_text_only",
"cat --node 0001 dsid_text_only",
"cat dsid_text_only --structure",
"cat dsid_text_only --page 1",
"cat dsid_text_only --node 0001",
):
with pytest.raises(PIFSCommandError, match="only supported for PDF/Markdown"):
executor.execute(command)
@ -581,10 +586,10 @@ def test_existing_pageindex_status_allows_legacy_record_without_format_suffix():
)
executor = PIFSCommandExecutor(filesystem, json_output=True)
structure = json.loads(executor.execute("cat --structure dsid_legacy_pageindex"))
structure = json.loads(executor.execute("cat dsid_legacy_pageindex --structure"))
assert structure["data"]["structure"][0]["title"] == "Uploaded"
with pytest.raises(PIFSCommandError, match="only supported for txt/text files"):
executor.execute("cat --all dsid_legacy_pageindex")
executor.execute("cat dsid_legacy_pageindex --all")
def test_read_commands_do_not_link_pageindex_cache_when_pointer_is_missing(monkeypatch):
@ -624,7 +629,7 @@ def test_read_commands_do_not_link_pageindex_cache_when_pointer_is_missing(monke
)
executor = PIFSCommandExecutor(filesystem, json_output=True)
structure = json.loads(executor.execute("cat --structure dsid_late_cache"))
structure = json.loads(executor.execute("cat dsid_late_cache --structure"))
stat = json.loads(executor.execute("stat dsid_late_cache"))
assert structure["data"]["available"] is False