diff --git a/examples/pifs_demo.py b/examples/pifs_demo.py index d220654..2434371 100644 --- a/examples/pifs_demo.py +++ b/examples/pifs_demo.py @@ -5,7 +5,7 @@ This mirrors examples/agentic_vectorless_rag_demo.py, but exposes a corpus through the PageIndex FileSystem shell instead of direct PageIndex document tools. The agent receives one read-only bash-like PIFS tool and must retrieve evidence through commands such as ls, tree, browse, find, grep, cat ---structure, cat --page, and cat --node. +--structure, and cat --page. The demo registers supported files under examples/documents. When a matching examples/documents/results/*_structure.json file exists, it is loaded into the @@ -81,7 +81,7 @@ Retrieval strategy: browse -R /documents "Federal Reserve supervision regulation" - browse returns file candidates only; it is not folder semantic recall. - After browse returns candidates, verify evidence with grep, cat - --structure, cat --node, or cat --page before answering. + --structure, or cat --page before answering. - Use find --where only with JSON metadata DSL, for example: find /documents --where '{"file_format":"pdf"}' - Use grep -R only for lexical evidence; do not treat semantic candidates as diff --git a/pageindex/filesystem/agent.py b/pageindex/filesystem/agent.py index 282f67e..d3bba68 100644 --- a/pageindex/filesystem/agent.py +++ b/pageindex/filesystem/agent.py @@ -70,22 +70,19 @@ likely browse candidate, verify the relevant claim with cat or grep before answering. Errors are returned as text prefixed with ERROR. Do not call commands that are not listed as available. When evidence is required, inspect it with cat or grep before answering. Prefer shell-like target-first cat syntax with stable -targets: cat --structure, cat --page 31-59, and cat --node -0009. You may also use file_ref or document_id when a path is ambiguous. Do not reconstruct paths from document titles; use exact targets returned by PIFS -commands and quote paths containing spaces. After structure identifies a -relevant section node, prefer -cat --node ; use cat --page when the user asks -for page-level evidence, no suitable node exists, or exact page text is needed. -cat --structure is paginated; request more with --offset if needed. Page -reads are limited to five pages at once, node reads to at most ten node ids, -and text cat --all returns only the first page of text lines. If a cat limit -error requires a smaller call, stop when the evidence is sufficient; otherwise -continue with another chunk before answering. +targets: cat --structure and cat --page 31-59. You may also use +file_ref or document_id when a path is ambiguous. Do not reconstruct paths from +document titles; use exact targets returned by PIFS commands and quote paths +containing spaces. Use cat --structure to inspect the document structure +JSON, then cat --page for exact page text evidence. Page reads +are limited to five pages at once, and text cat --all returns only the first +page of text lines. If a cat limit error requires a smaller call, stop when the +evidence is sufficient; otherwise continue with another chunk before answering. For questions about metadata fields, available summaries, or whether metadata was provided, inspect stat --schema and stat before making claims. Do not use stat as a general content/topic discovery step. For document Q&A, prefer ls/tree for folder selection, browse for file candidates, then cat ---structure and cat --node or cat --page for evidence. +--structure and cat --page for evidence. """ AGENT_TOOL_POLICY = """ @@ -110,15 +107,14 @@ Tool policy: - Do not reconstruct a file path from a title. Use exact paths returned by PIFS commands, or use file_ref/document_id when available; quote paths that contain spaces. - For broad topic, method, or "what solution" questions that are likely about the workspace, search for candidate documents before asking the user to choose a document. - Use stat only for metadata/schema/status questions or to resolve ambiguous target identity. Do not run stat merely to understand what a document says. -- Prefer target-first cat syntax with stable targets: cat --structure, cat --page 31-59, cat --node . -- cat --structure returns at most 25 nodes; use --offset and --limit for more structure pages. -- cat --page accepts at most 5 pages at once. If a larger range is needed, first inspect cat --structure and then read a smaller page range or node. -- cat --node accepts at most 10 node ids at once. Prefer relevant nodes from structure when possible. -- When recovering from cat page/node/text limit errors, stop if the evidence is sufficient; if it is not sufficient, make another smaller call before answering. +- Prefer target-first cat syntax with stable targets: cat --structure, cat --page 31-59. +- cat --structure returns the cached PageIndex structure JSON without text fields. +- cat --page accepts at most 5 pages at once. If a larger range is needed, first inspect cat --structure and then read a smaller page range. +- When recovering from cat page/text limit errors, stop if the evidence is sufficient; if it is not sufficient, make another smaller call before answering. - cat --all returns at most 100 text lines; use cat --range - for the next page. -- After cat --structure finds a relevant section/subsection with a node_id, prefer cat --node for content from that semantic unit. -- Use cat --page - when the user explicitly asks for pages/page ranges, when no suitable node_id exists, or when you need exact page text to verify page-level evidence. -- Avoid fetching a broad page span after a matching node is available unless page-level citation or verification is required. +- After cat --structure identifies a relevant section/subsection, use cat --page - for exact evidence. +- Use cat --page - when the user explicitly asks for pages/page ranges or when you need exact page text to verify evidence. +- Avoid fetching a broad page span unless page-level citation or verification is required. - Do not call cat --page ; if you need a page span, use cat --page -. - For metadata or summary-field questions, run stat --schema and stat for relevant files before answering; do not infer metadata presence or absence from ls/find output alone. - Distinguish default/register metadata from caller-provided custom metadata when the evidence supports it. diff --git a/pageindex/filesystem/commands.py b/pageindex/filesystem/commands.py index 3d1fbae..4b9a598 100644 --- a/pageindex/filesystem/commands.py +++ b/pageindex/filesystem/commands.py @@ -40,10 +40,6 @@ class PIFSCommandExecutor: BROWSE_PAGE_SIZE = 10 MAX_TEXT_LINES = 100 MAX_PAGE_SPAN = 5 - MAX_STRUCTURE_NODES = 25 - MAX_NODE_IDS = 10 - MAX_NODE_TEXT_LINES = 100 - MAX_NODE_TEXT_CHARS = 12_000 MAX_STAT_FIELD_TARGETS = 20 MAX_TREE_DEPTH = 4 MAX_LS_RENDER_FILES = 25 @@ -85,9 +81,8 @@ class PIFSCommandExecutor: "- find --where: exact/canonical metadata DSL filtering using stat --schema fields only", "- find -maxdepth N -type f|d: bounded folder traversal for find", "- grep -R: recursive lexical/FTS search only; semantic vector prefilter is disabled", - "- cat --structure: cached PageIndex node list, paginated at 25 nodes", + "- cat --structure: cached PageIndex structure JSON without text fields", "- cat --page: cached PageIndex page reads, limited to 5 pages", - "- cat --node: cached PageIndex node reads, limited to 10 node ids", "- cat --all: text artifact reads for txt/text files, paginated at 100 lines", "- stat --field : one metadata field across up to 20 documents", ] @@ -495,15 +490,11 @@ class PIFSCommandExecutor: if target.startswith("-"): raise PIFSCommandError( "cat syntax is target-first: cat --structure, " - "cat --page 31-59, or " - "cat --node 0009" + "or cat --page 31-59" ) location = "all" structural_mode: str | None = None - node_ids: list[str] = [] page_range: str | None = None - structure_offset = 0 - structure_limit = self.MAX_STRUCTURE_NODES i = 1 while i < len(args): arg = args[i] @@ -516,29 +507,6 @@ class PIFSCommandExecutor: location = "all" elif arg == "--structure": structural_mode = "structure" - elif arg == "--offset": - i += 1 - if i >= len(args): - raise PIFSCommandError("cat --structure --offset requires a value") - structure_offset = self._parse_non_negative_int(args[i], "cat --structure --offset") - elif arg == "--limit": - i += 1 - if i >= len(args): - raise PIFSCommandError("cat --structure --limit requires a value") - structure_limit = self._parse_bounded_int( - args[i], - "cat --structure --limit", - max_value=self.MAX_STRUCTURE_NODES, - ) - elif arg == "--node": - i += 1 - if i >= len(args): - raise PIFSCommandError("cat --node requires a node id") - structural_mode = "node" - while i < len(args) and not args[i].startswith("-"): - node_ids.extend(self._parse_node_ids(args[i])) - i += 1 - i -= 1 elif arg == "--page": i += 1 if i >= len(args): @@ -551,8 +519,7 @@ class PIFSCommandExecutor: raise PIFSCommandError( "cat accepts one file target. Use target-first syntax: " "cat --structure, " - "cat --node 0002 0004, or " - "cat --page 31-33. " + "or cat --page 31-33. " f"Unexpected extra argument: {arg!r}. If the target path or title contains " "spaces, quote the whole target, for example: cat \"/documents/report name.pdf\" " "--structure. If a title-derived path is ambiguous, use the file_ref or " @@ -560,47 +527,7 @@ class PIFSCommandExecutor: ) i += 1 if structural_mode == "structure": - if structure_limit < 1: - raise PIFSCommandError( - "cat --structure --limit must be at least 1 and at most " - f"{self.MAX_STRUCTURE_NODES}." - ) - data = self.filesystem.pageindex_structure( - target, - offset=structure_offset, - limit=structure_limit, - ) - self._attach_structure_next_command(data, target) - return data - if structural_mode == "node": - self._require_at_most( - len(node_ids), - "cat --node node count", - self.MAX_NODE_IDS, - ) - if not node_ids: - raise PIFSCommandError("cat --node requires a node id") - node_results = [ - self._bounded_node_result( - self.filesystem.pageindex_node(target, node_id), - target=target, - node_id=node_id, - ) - for node_id in node_ids - ] - if len(node_results) == 1: - return node_results[0] - return { - "mode": "nodes", - "target": target, - "available": all(result.get("available") is not False for result in node_results), - "node_ids": node_ids, - "nodes": node_results, - "text": "\n\n".join( - f"[node {result.get('node_id') or node_id}]\n{result.get('text', '')}" - for node_id, result in zip(node_ids, node_results) - ), - } + return self.filesystem.pageindex_structure(target) if structural_mode == "page": if not page_range or not re.fullmatch(r"\d+(?:-\d+)?", page_range): raise PIFSCommandError( @@ -752,66 +679,6 @@ class PIFSCommandExecutor: data["pagination"] = pagination return data - def _bounded_node_result( - self, - data: dict[str, Any], - *, - target: str, - node_id: str, - ) -> dict[str, Any]: - if not isinstance(data, dict) or data.get("available") is False: - return data - text = str(data.get("text") or "") - lines = text.splitlines() - truncated_by_lines = len(lines) > self.MAX_NODE_TEXT_LINES - truncated_by_chars = len(text) > self.MAX_NODE_TEXT_CHARS - if not truncated_by_lines and not truncated_by_chars: - data["node_pagination"] = { - "limit_nodes": self.MAX_NODE_IDS, - "text_truncated": False, - } - return data - - selected = "\n".join(lines[: self.MAX_NODE_TEXT_LINES]) - if len(selected) > self.MAX_NODE_TEXT_CHARS: - selected = selected[: self.MAX_NODE_TEXT_CHARS].rstrip() - data["text"] = ( - selected.rstrip() - + "\n" - + self._pagination_footer( - "cat --node", - ( - f"node text limited to {self.MAX_NODE_TEXT_LINES} lines/" - f"{self.MAX_NODE_TEXT_CHARS} chars" - ), - f"cat {shlex.quote(target)} --structure", - ) - ).strip() - data["node_pagination"] = { - "limit_nodes": self.MAX_NODE_IDS, - "line_limit": self.MAX_NODE_TEXT_LINES, - "char_limit": self.MAX_NODE_TEXT_CHARS, - "original_lines": len(lines), - "original_chars": len(text), - "text_truncated": True, - "suggested_command": f"cat {shlex.quote(target)} --structure", - "node_id": node_id, - } - return data - - def _attach_structure_next_command(self, data: dict[str, Any], target: str) -> None: - pagination = data.get("structure_pagination") - if not isinstance(pagination, dict): - return - if pagination.get("has_more") and pagination.get("next_offset") is not None: - next_command = ( - f"cat {shlex.quote(target)} --structure " - f"--offset {pagination['next_offset']} --limit {pagination['limit']}" - ) - pagination["next_command"] = next_command - else: - pagination["next_command"] = None - def _attach_page_next_command( self, data: dict[str, Any], @@ -841,10 +708,6 @@ class PIFSCommandExecutor: f"Next: {next_command}. If unsure, use cat --structure." ) - @staticmethod - def _parse_node_ids(value: str) -> list[str]: - return [part.strip() for part in value.split(",") if part.strip()] - @staticmethod def _reject_regex_alternation_query(query: str, command_name: str) -> None: if "|" not in str(query): @@ -939,7 +802,6 @@ class PIFSCommandExecutor: return json.dumps( { "structure": data.get("structure", []), - "pagination": data.get("structure_pagination", {}), }, ensure_ascii=False, indent=2, diff --git a/pageindex/filesystem/core.py b/pageindex/filesystem/core.py index 5d2fc68..67adb8f 100644 --- a/pageindex/filesystem/core.py +++ b/pageindex/filesystem/core.py @@ -1278,10 +1278,7 @@ class PageIndexFileSystem: return "" client._ensure_doc_loaded(doc_id) doc = client.documents.get(doc_id) or {} - page_text = self._pageindex_pages_text(doc.get("pages")) - if page_text: - return page_text - return self._pageindex_structure_text(doc.get("structure", [])) + return self._pageindex_pages_text(doc.get("pages")) @staticmethod def _pageindex_pages_text(pages: Any) -> str: @@ -1296,25 +1293,6 @@ class PageIndexFileSystem: parts.append(content) return "\n\n".join(parts) - @classmethod - def _pageindex_structure_text(cls, structure: Any) -> str: - parts: list[str] = [] - cls._collect_pageindex_node_text(structure, parts) - return "\n\n".join(parts) - - @classmethod - def _collect_pageindex_node_text(cls, node: Any, parts: list[str]) -> None: - if isinstance(node, list): - for item in node: - cls._collect_pageindex_node_text(item, parts) - return - if not isinstance(node, dict): - return - text = str(node.get("text") or "").strip() - if text: - parts.append(text) - cls._collect_pageindex_node_text(node.get("nodes", []), parts) - @staticmethod def _raw_artifact_payload( *, diff --git a/tests/test_pageindex_structural_read.py b/tests/test_pageindex_structural_read.py index 3994aa4..63a08dd 100644 --- a/tests/test_pageindex_structural_read.py +++ b/tests/test_pageindex_structural_read.py @@ -68,7 +68,6 @@ def test_pageindex_structure_options_report_failed_register_build(monkeypatch): executor = PIFSCommandExecutor(filesystem, json_output=True) structure = json.loads(executor.execute("cat dsid_structural_missing --structure")) - node = json.loads(executor.execute("cat dsid_structural_missing --node 0001")) pages = json.loads(executor.execute("cat dsid_structural_missing --page 1-2")) stat = json.loads(executor.execute("stat dsid_structural_missing")) @@ -85,10 +84,6 @@ def test_pageindex_structure_options_report_failed_register_build(monkeypatch): "message": "index failed: extractor unavailable", } - assert node["data"]["mode"] == "node" - assert node["data"]["available"] is False - assert node["data"]["node_id"] == "0001" - assert pages["data"]["mode"] == "page" assert pages["data"]["available"] is False assert pages["data"]["pages"] == "1-2" @@ -135,6 +130,9 @@ def test_register_pdf_markdown_uses_pageindex_extracted_text_for_metadata_and_ft "nodes": [], } ], + "pages": [ + {"page": 1, "content": "PageIndex Markdown extracted gamma text."} + ], } write_pageindex_client_doc(self.workspace, doc_id, doc) self.documents[doc_id] = doc @@ -348,10 +346,10 @@ def test_cat_structure_page_reuses_pageindex_client_cache_without_indexing(monke assert structure["data"]["available"] is True assert structure["data"]["pageindex_doc_id"] == "doc_cached_pdf" assert structure["data"]["structure"][0]["title"] == "Introduction" - assert structure["data"]["structure"][1]["title"] == "Findings" - assert structure["data"]["structure_pagination"]["limit"] == 25 + assert structure["data"]["structure"][0]["nodes"][0]["title"] == "Findings" + assert "structure_pagination" not in structure["data"] assert "text" not in structure["data"]["structure"][0] - assert "text" not in structure["data"]["structure"][1] + assert "text" not in structure["data"]["structure"][0]["nodes"][0] assert pages["data"]["available"] is True assert pages["data"]["text"] == "Page one text\n\nPage two text" @@ -364,53 +362,26 @@ def test_cat_structure_page_reuses_pageindex_client_cache_without_indexing(monke assert stat["data"]["pageindex_tree_status"] == "built" -def test_cat_node_reads_pageindex_client_structure_without_custom_pifs_artifact(): +def test_cat_node_is_not_supported(): from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + from pageindex.filesystem.commands import PIFSCommandError with tempfile.TemporaryDirectory() as tmp: - source = Path(tmp) / "notes.md" - source.write_text("# Notes\n\nBody", encoding="utf-8") filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace") - write_pageindex_client_doc( - filesystem.pageindex_client_workspace, - "doc_cached_md", - { - "id": "doc_cached_md", - "type": "md", - "path": str(source.resolve()), - "doc_name": "notes", - "doc_description": "", - "line_count": 3, - "structure": [ - { - "title": "Notes", - "node_id": "0001", - "line_num": 1, - "text": "# Notes\n\nBody", - "nodes": [], - } - ], - }, - ) filesystem.register_file( - storage_uri=source.as_uri(), + storage_uri="file:///tmp/notes.md", source_path="docs/notes.md", external_id="dsid_md_cached", title="Cached markdown notes", - content=source.read_text(encoding="utf-8"), + content="# Notes\n\nBody", ) executor = PIFSCommandExecutor(filesystem, json_output=True) - node = json.loads(executor.execute("cat dsid_md_cached --node 0001")) - - assert node["data"]["available"] is True - assert node["data"]["pageindex_doc_id"] == "doc_cached_md" - assert node["data"]["node"]["title"] == "Notes" - assert node["data"]["text"] == "# Notes\n\nBody" - assert "text" not in node["data"]["node"] + with pytest.raises(PIFSCommandError, match="Unsupported cat option: --node"): + executor.execute("cat dsid_md_cached --node 0001") -def test_cat_structure_page_node_and_text_outputs_are_hard_limited(): +def test_cat_structure_page_and_text_outputs_are_hard_limited(): from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem from pageindex.filesystem.commands import PIFSCommandError @@ -463,16 +434,13 @@ def test_cat_structure_page_node_and_text_outputs_are_hard_limited(): ) executor = PIFSCommandExecutor(filesystem, json_output=True) - first_structure = json.loads(executor.execute("cat dsid_limited_pdf --structure")) - assert len(first_structure["data"]["structure"]) == 25 - assert first_structure["data"]["structure_pagination"]["has_more"] is True - assert first_structure["data"]["structure_pagination"]["next_offset"] == 25 - - second_structure = json.loads( + structure = json.loads(executor.execute("cat dsid_limited_pdf --structure")) + assert len(structure["data"]["structure"]) == 30 + assert structure["data"]["structure"][25]["node_id"] == "0026" + assert "text" not in structure["data"]["structure"][0] + assert "structure_pagination" not in structure["data"] + with pytest.raises(PIFSCommandError, match="Unsupported cat option: --offset"): executor.execute("cat dsid_limited_pdf --structure --offset 25") - ) - assert len(second_structure["data"]["structure"]) == 5 - assert second_structure["data"]["structure"][0]["node_id"] == "0026" pages = json.loads(executor.execute("cat dsid_limited_pdf --page 1-5")) assert pages["data"]["text"] == ( @@ -484,38 +452,8 @@ def test_cat_structure_page_node_and_text_outputs_are_hard_limited(): with pytest.raises(PIFSCommandError, match="evidence is sufficient"): executor.execute("cat dsid_limited_pdf --page 1-6") - nodes = json.loads( - executor.execute( - "cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 " - "0006 0007 0008 0009 0010" - ) - ) - assert nodes["data"]["node_ids"] == [ - "0001", - "0002", - "0003", - "0004", - "0005", - "0006", - "0007", - "0008", - "0009", - "0010", - ] - comma_nodes = json.loads( - executor.execute("cat dsid_limited_pdf --node 0001,0002") - ) - assert comma_nodes["data"]["node_ids"] == ["0001", "0002"] - with pytest.raises(PIFSCommandError, match="at most 10"): - executor.execute( - "cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 " - "0006 0007 0008 0009 0010 0011" - ) - with pytest.raises(PIFSCommandError, match="continue with additional chunks"): - executor.execute( - "cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 " - "0006 0007 0008 0009 0010 0011" - ) + with pytest.raises(PIFSCommandError, match="Unsupported cat option: --node"): + executor.execute("cat dsid_limited_pdf --node 0001") with pytest.raises(PIFSCommandError, match="quote the whole target"): executor.execute("cat dsid_limited_pdf 0001") @@ -672,11 +610,13 @@ def test_pageindex_structure_commands_are_limited_to_pdf_and_markdown(): for command in ( "cat dsid_text_only --structure", "cat dsid_text_only --page 1", - "cat dsid_text_only --node 0001", ): with pytest.raises(PIFSCommandError, match="only supported for PDF/Markdown"): executor.execute(command) + with pytest.raises(PIFSCommandError, match="Unsupported cat option: --node"): + executor.execute("cat dsid_text_only --node 0001") + def test_existing_pageindex_status_allows_legacy_record_without_format_suffix(): from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem diff --git a/tests/test_pifs_agent_stream.py b/tests/test_pifs_agent_stream.py index 3c833ee..ea03976 100644 --- a/tests/test_pifs_agent_stream.py +++ b/tests/test_pifs_agent_stream.py @@ -218,13 +218,17 @@ class PIFSAgentStreamTest(unittest.TestCase): self.assertEqual(output, '{"answer":"done","document_ids":["dsid_1"]}') - def test_prompt_tells_agent_when_to_choose_node_or_page(self): - self.assertIn("prefer cat --node ", AGENT_TOOL_POLICY) - self.assertIn("page-level evidence", AGENT_TOOL_POLICY) - self.assertIn("prefer\ncat --node ", BASH_TOOL_DESCRIPTION) + def test_prompt_tells_agent_to_use_structure_then_page(self): + self.assertIn( + "cat --structure returns the cached PageIndex structure JSON", + AGENT_TOOL_POLICY, + ) + self.assertIn("exact page text", BASH_TOOL_DESCRIPTION) + self.assertIn("cat --structure and cat --page", BASH_TOOL_DESCRIPTION) self.assertIn("stop if the evidence is sufficient", AGENT_TOOL_POLICY) self.assertIn("continue with another chunk before answering", BASH_TOOL_DESCRIPTION) - self.assertIn("Do not reconstruct paths from document titles", BASH_TOOL_DESCRIPTION) + self.assertIn("Do not reconstruct paths from", BASH_TOOL_DESCRIPTION) + self.assertIn("document titles", BASH_TOOL_DESCRIPTION) self.assertIn("file_ref/document_id", AGENT_TOOL_POLICY) def test_prompt_requires_stat_for_metadata_questions(self): @@ -244,7 +248,6 @@ class PIFSAgentStreamTest(unittest.TestCase): self.assertIn("browse returns file candidates only", AGENT_TOOL_POLICY) self.assertIn("verify the relevant facts with cat or grep", AGENT_TOOL_POLICY) self.assertIn("cat --structure", AGENT_TOOL_POLICY) - self.assertIn("cat --node ", AGENT_TOOL_POLICY) self.assertIn("cat --page", AGENT_TOOL_POLICY) self.assertIn("Do not use browse as folder semantic recall", AGENT_TOOL_POLICY)