diff --git a/examples/pifs_demo.py b/examples/pifs_demo.py index fddec04..839d37c 100644 --- a/examples/pifs_demo.py +++ b/examples/pifs_demo.py @@ -5,7 +5,7 @@ This mirrors examples/agentic_vectorless_rag_demo.py, but exposes a corpus through the PageIndex FileSystem shell instead of direct PageIndex document tools. The agent receives one read-only bash-like PIFS tool and must retrieve evidence through commands such as ls, tree, find, grep, search-summary, -cat --structure, cat --page, and cat --node. +cat --structure, cat --page, and cat --node. The demo uses PDFs under examples/documents. When a matching examples/documents/results/*_structure.json file exists, it is loaded into the @@ -71,12 +71,12 @@ Retrieval strategy: - Use grep -R only for lexical evidence; do not treat semantic candidates as literal matches. - Run one evidence command at a time. Do not chain large commands like - cat --structure, grep, and cat --page in one bash call. -- For PDFs, use cat --structure to inspect the PageIndex tree, then - cat --page for evidence, for example: - cat --page 31-35 ref_1 -- For page-range questions, use cat --structure to identify the full section - range. Then run cat --page on the smallest useful evidence range, usually the + cat --structure, grep, and cat --page in one bash call. +- For PDFs, use cat --structure to inspect the PageIndex tree, then + cat --page for evidence, for example: + cat ref_1 --page 31-35 +- For page-range questions, use cat --structure to identify the full section + range. Then run cat --page on the smallest useful evidence range, usually the section start page or first 1-2 pages, before the final answer. Do not print a broad multi-page section unless the user asks to read the whole section. - Do not use cat --all on PDFs. 
@@ -646,7 +646,7 @@ def run_smoke_commands( verbose=verbose, ) - command = f"cat --structure {first_ref}" + command = f"cat {first_ref} --structure" structure_payload = execute_json_command(json_executor, command) structure_data = structure_payload.get("data") or {} structure = structure_data.get("structure") or [] @@ -664,7 +664,7 @@ def run_smoke_commands( ) evidence_range = opening_page_range_for_node(supervision_node) or "1-2" - command = f"cat --page {evidence_range} {first_ref}" + command = f"cat {first_ref} --page {evidence_range}" page = execute_json_command(json_executor, command) page_text = str((page.get("data") or {}).get("text") or "") show_capability( diff --git a/pageindex/filesystem/agent.py b/pageindex/filesystem/agent.py index a090b66..15e95a5 100644 --- a/pageindex/filesystem/agent.py +++ b/pageindex/filesystem/agent.py @@ -36,7 +36,8 @@ commands described in the workspace context. grep -R is lexical evidence search; semantic search commands return candidate documents and do not guarantee literal text matches. Errors are returned as text prefixed with ERROR. Do not call commands that are not listed as available. When evidence is required, inspect it -with cat or grep before answering. +with cat or grep before answering. Prefer shell-like target-first cat syntax: +cat <file> --structure, cat <file> --page 31-59, and cat <file> --node 0009. """ AGENT_TOOL_POLICY = """ @@ -48,6 +49,8 @@ Tool policy: - Semantic search commands are candidate-discovery tools and do not guarantee literal text matches. - Tool errors are returned as ERROR text; recover by trying an available command. - Use cat or grep to gather evidence before making source-backed claims. +- Prefer target-first cat syntax: cat <file> --structure, cat <file> --page 31-59, cat <file> --node <node_id>. +- Do not call cat <file> --page <start> <end>; if you need a page span, use cat <file> --page <start>-<end>.
""" STREAM_MODE_ALIASES = { diff --git a/pageindex/filesystem/commands.py b/pageindex/filesystem/commands.py index a741333..124825d 100644 --- a/pageindex/filesystem/commands.py +++ b/pageindex/filesystem/commands.py @@ -90,8 +90,8 @@ class PIFSCommandExecutor: "- ls/tree: folder browsing", "- find --where: exact/canonical metadata DSL filtering", "- grep -R: recursive lexical/FTS search only; semantic vector prefilter is disabled", - "- cat --structure/--node/--page: cached PageIndex reads for PDF/Markdown files", - "- cat --all: full text artifact reads for txt/text files", + "- cat --structure/--node/--page: cached PageIndex reads for PDF/Markdown files", + "- cat --all: full text artifact reads for txt/text files", ] if "entity" in semantic_channels: lines.append("- find --name: entity semantic candidate discovery alias") @@ -405,12 +405,17 @@ class PIFSCommandExecutor: def _cmd_cat(self, args: list[str]) -> Any: if not args: raise PIFSCommandError("cat requires a file target") - target = None + target = args[0] + if target.startswith("-"): + raise PIFSCommandError( + "cat syntax is target-first: cat <file> --structure, " + "cat <file> --page 31-59, or cat <file> --node 0009" + ) location = "all" structural_mode: str | None = None node_id: str | None = None page_range: str | None = None - i = 0 + i = 1 while i < len(args): arg = args[i] if arg == "--range": @@ -437,16 +442,22 @@ class PIFSCommandExecutor: elif arg.startswith("-"): raise PIFSCommandError(f"Unsupported cat option: {arg}") else: - target = arg + raise PIFSCommandError( + "cat accepts one file target. 
Use: cat <file> --page <range>, " + "for example: cat ref_1 --page 31-59" + ) i += 1 - if not target: - raise PIFSCommandError("cat requires a file target") if structural_mode == "structure": return self.filesystem.pageindex_structure(target) if structural_mode == "node": return self.filesystem.pageindex_node(target, str(node_id)) if structural_mode == "page": - return self.filesystem.pageindex_pages(target, str(page_range)) + if not page_range or not re.fullmatch(r"\d+(?:-\d+)?", page_range): + raise PIFSCommandError( + "cat --page requires one page selector like 31 or 31-59. " + "Use: cat <file> --page <range>" + ) + return self.filesystem.pageindex_pages(target, page_range) return self.filesystem.cat_text_artifact(target, location) def _cmd_stat(self, args: list[str]) -> Any: diff --git a/pageindex/filesystem/core.py b/pageindex/filesystem/core.py index 314a532..5fddfe3 100644 --- a/pageindex/filesystem/core.py +++ b/pageindex/filesystem/core.py @@ -677,7 +677,8 @@ class PageIndexFileSystem: raise ValueError( f"{command} is only supported for txt/text files; " f"got source_path={entry.source_path!r}, content_type={entry.content_type!r}. " - "Use cat --structure, cat --page, or cat --node for PDF/Markdown PageIndex files." + "Use cat --structure, cat --page, or cat --node " + "for PDF/Markdown PageIndex files." ) def _require_pageindex_document_file(self, entry: Any, command: str) -> None: @@ -686,7 +687,7 @@ class PageIndexFileSystem: raise ValueError( f"{command} is only supported for PDF/Markdown PageIndex files; " f"got source_path={entry.source_path!r}, content_type={entry.content_type!r}. " - "Use cat --all for txt/text files." + "Use cat --all for txt/text files." 
) @classmethod diff --git a/tests/test_pageindex_structural_read.py b/tests/test_pageindex_structural_read.py index fcb2725..5b06290 100644 --- a/tests/test_pageindex_structural_read.py +++ b/tests/test_pageindex_structural_read.py @@ -67,9 +67,9 @@ def test_pageindex_structure_options_report_failed_register_build(monkeypatch): ) executor = PIFSCommandExecutor(filesystem, json_output=True) - structure = json.loads(executor.execute("cat --structure dsid_structural_missing")) - node = json.loads(executor.execute("cat --node 0001 dsid_structural_missing")) - pages = json.loads(executor.execute("cat --page 1-2 dsid_structural_missing")) + structure = json.loads(executor.execute("cat dsid_structural_missing --structure")) + node = json.loads(executor.execute("cat dsid_structural_missing --node 0001")) + pages = json.loads(executor.execute("cat dsid_structural_missing --page 1-2")) stat = json.loads(executor.execute("stat dsid_structural_missing")) assert structure["data"]["mode"] == "structure" @@ -282,6 +282,7 @@ def test_register_pdf_markdown_cache_miss_invokes_pageindex_client_index(monkeyp def test_cat_structure_page_reuses_pageindex_client_cache_without_indexing(monkeypatch): from pageindex import PageIndexClient from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + from pageindex.filesystem.commands import PIFSCommandError with tempfile.TemporaryDirectory() as tmp: source = Path(tmp) / "report.pdf" @@ -333,8 +334,8 @@ def test_cat_structure_page_reuses_pageindex_client_cache_without_indexing(monke ) executor = PIFSCommandExecutor(filesystem, json_output=True) - structure = json.loads(executor.execute("cat --structure dsid_structural_cached")) - pages = json.loads(executor.execute("cat --page 1-2 dsid_structural_cached")) + structure = json.loads(executor.execute("cat dsid_structural_cached --structure")) + pages = json.loads(executor.execute("cat dsid_structural_cached --page 1-2")) stat = json.loads(executor.execute("stat 
dsid_structural_cached")) assert structure["data"]["available"] is True @@ -345,6 +346,10 @@ def test_cat_structure_page_reuses_pageindex_client_cache_without_indexing(monke assert pages["data"]["available"] is True assert pages["data"]["text"] == "Page one text\n\nPage two text" + with pytest.raises(PIFSCommandError, match="target-first"): + executor.execute("cat --page 1-2 dsid_structural_cached") + with pytest.raises(PIFSCommandError, match="one file target"): + executor.execute("cat dsid_structural_cached --page 1 2") assert stat["data"]["pageindex_doc_id"] == "doc_cached_pdf" assert stat["data"]["pageindex_tree_status"] == "built" @@ -387,7 +392,7 @@ def test_cat_node_reads_pageindex_client_structure_without_custom_pifs_artifact( ) executor = PIFSCommandExecutor(filesystem, json_output=True) - node = json.loads(executor.execute("cat --node 0001 dsid_md_cached")) + node = json.loads(executor.execute("cat dsid_md_cached --node 0001")) assert node["data"]["available"] is True assert node["data"]["pageindex_doc_id"] == "doc_cached_md" @@ -453,7 +458,7 @@ def test_tree_does_not_read_file_internal_pageindex_structure(): with pytest.raises(PIFSCommandError): executor.execute("tree dsid_tree_is_folder_only") - structure = json.loads(executor.execute("cat --structure dsid_tree_is_folder_only")) + structure = json.loads(executor.execute("cat dsid_tree_is_folder_only --structure")) assert structure["data"]["structure"][0]["title"] == "Introduction" @@ -494,19 +499,19 @@ def test_cat_all_is_limited_to_text_files(): ) executor = PIFSCommandExecutor(filesystem, json_output=True) - text = json.loads(executor.execute("cat --all dsid_text_file")) + text = json.loads(executor.execute("cat dsid_text_file --all")) assert text["data"]["text"] == "plain text body" with pytest.raises(PIFSCommandError, match="only supported for txt/text files"): - executor.execute("cat --all dsid_pdf_file") + executor.execute("cat dsid_pdf_file --all") with pytest.raises(ValueError, match="not 
supported for PDF/Markdown"): filesystem.open("dsid_pdf_file") with pytest.raises(PIFSCommandError, match="only supported for txt/text files"): - executor.execute("cat --all dsid_md_file") + executor.execute("cat dsid_md_file --all") with pytest.raises(ValueError, match="not supported for PDF/Markdown"): filesystem.open("dsid_md_file") with pytest.raises(PIFSCommandError, match="only supported for txt/text files"): - executor.execute("cat --all dsid_json_file") + executor.execute("cat dsid_json_file --all") assert filesystem.open("dsid_json_file").text == '{"body":"json"}' for command in ( "head dsid_pdf_file", @@ -536,9 +541,9 @@ def test_pageindex_structure_commands_are_limited_to_pdf_and_markdown(): executor = PIFSCommandExecutor(filesystem, json_output=True) for command in ( - "cat --structure dsid_text_only", - "cat --page 1 dsid_text_only", - "cat --node 0001 dsid_text_only", + "cat dsid_text_only --structure", + "cat dsid_text_only --page 1", + "cat dsid_text_only --node 0001", ): with pytest.raises(PIFSCommandError, match="only supported for PDF/Markdown"): executor.execute(command) @@ -581,10 +586,10 @@ def test_existing_pageindex_status_allows_legacy_record_without_format_suffix(): ) executor = PIFSCommandExecutor(filesystem, json_output=True) - structure = json.loads(executor.execute("cat --structure dsid_legacy_pageindex")) + structure = json.loads(executor.execute("cat dsid_legacy_pageindex --structure")) assert structure["data"]["structure"][0]["title"] == "Uploaded" with pytest.raises(PIFSCommandError, match="only supported for txt/text files"): - executor.execute("cat --all dsid_legacy_pageindex") + executor.execute("cat dsid_legacy_pageindex --all") def test_read_commands_do_not_link_pageindex_cache_when_pointer_is_missing(monkeypatch): @@ -624,7 +629,7 @@ def test_read_commands_do_not_link_pageindex_cache_when_pointer_is_missing(monke ) executor = PIFSCommandExecutor(filesystem, json_output=True) - structure = json.loads(executor.execute("cat 
--structure dsid_late_cache")) + structure = json.loads(executor.execute("cat dsid_late_cache --structure")) stat = json.loads(executor.execute("stat dsid_late_cache")) assert structure["data"]["available"] is False