diff --git a/pageindex/filesystem/agent.py b/pageindex/filesystem/agent.py index 4f796ec..facaceb 100644 --- a/pageindex/filesystem/agent.py +++ b/pageindex/filesystem/agent.py @@ -74,8 +74,10 @@ ambiguous. After structure identifies a relevant section node, prefer cat --node ; use cat --page when the user asks for page-level evidence, no suitable node exists, or exact page text is needed. cat --structure is paginated; request more with --offset if needed. Page -reads are limited to three pages at once, node reads to at most five node ids, -and text cat --all returns only the first page of text lines. +reads are limited to five pages at once, node reads to at most ten node ids, +and text cat --all returns only the first page of text lines. If a cat limit +error requires a smaller call, stop when the evidence is sufficient; otherwise +continue with another chunk before answering. For questions about metadata fields, available summaries, or whether metadata was provided, inspect stat --schema and stat before making claims. Do not use stat as a general content/topic discovery step. For document Q&A, @@ -101,8 +103,9 @@ Tool policy: - Use stat only for metadata/schema/status questions or to resolve ambiguous target identity. Do not run stat merely to understand what a document says. - Prefer target-first cat syntax with stable targets: cat --structure, cat --page 31-59, cat --node . - cat --structure returns at most 25 nodes; use --offset and --limit for more structure pages. -- cat --page accepts at most 3 pages at once. If a larger range is needed, first inspect cat --structure and then read a smaller page range or node. -- cat --node accepts at most 5 node ids at once. Prefer one relevant node when possible. +- cat --page accepts at most 5 pages at once. If a larger range is needed, first inspect cat --structure and then read a smaller page range or node. +- cat --node accepts at most 10 node ids at once. Prefer relevant nodes from structure when possible. +- When recovering from cat page/node/text limit errors, stop if the evidence is sufficient; if it is not sufficient, make another smaller call before answering. - cat --all returns at most 100 text lines; use cat --range - for the next page. - After cat --structure finds a relevant section/subsection with a node_id, prefer cat --node for content from that semantic unit. - Use cat --page - when the user explicitly asks for pages/page ranges, when no suitable node_id exists, or when you need exact page text to verify page-level evidence. diff --git a/pageindex/filesystem/commands.py b/pageindex/filesystem/commands.py index 29ea46a..6341b8b 100644 --- a/pageindex/filesystem/commands.py +++ b/pageindex/filesystem/commands.py @@ -54,9 +54,9 @@ class PIFSCommandExecutor: MAX_GREP_LIMIT = 20 MAX_SEMANTIC_LIMIT = 20 MAX_TEXT_LINES = 100 - MAX_PAGE_SPAN = 3 + MAX_PAGE_SPAN = 5 MAX_STRUCTURE_NODES = 25 - MAX_NODE_IDS = 5 + MAX_NODE_IDS = 10 MAX_NODE_TEXT_LINES = 100 MAX_NODE_TEXT_CHARS = 12_000 MAX_STAT_FIELD_TARGETS = 20 @@ -107,8 +107,8 @@ class PIFSCommandExecutor: "- find -maxdepth N -type f|d: bounded folder traversal for find", "- grep -R: recursive lexical/FTS search only; semantic vector prefilter is disabled", "- cat --structure: cached PageIndex node list, paginated at 25 nodes", - "- cat --page: cached PageIndex page reads, limited to 3 pages", - "- cat --node: cached PageIndex node reads, limited to 5 node ids", + "- cat --page: cached PageIndex page reads, limited to 5 pages", + "- cat --node: cached PageIndex node reads, limited to 10 node ids", "- cat --all: text artifact reads for txt/text files, paginated at 100 lines", "- stat --field : one metadata field across up to 20 documents", ] @@ -2003,8 +2003,10 @@ class PIFSCommandExecutor: if value > max_value: raise PIFSCommandError( f"{label} supports at most {max_value}; requested {value}. " - "Use a smaller value. If you are unsure where to inspect, " - "use cat --structure first." + "Split it into a smaller call. If the evidence is sufficient, " + "stop; if not, continue with additional chunks before " + "answering. If you are unsure where to inspect, use cat " + "--structure first." ) return value diff --git a/tests/test_pageindex_structural_read.py b/tests/test_pageindex_structural_read.py index cd104c5..f5b6dea 100644 --- a/tests/test_pageindex_structural_read.py +++ b/tests/test_pageindex_structural_read.py @@ -474,22 +474,48 @@ def test_cat_structure_page_node_and_text_outputs_are_hard_limited(): assert len(second_structure["data"]["structure"]) == 5 assert second_structure["data"]["structure"][0]["node_id"] == "0026" - pages = json.loads(executor.execute("cat dsid_limited_pdf --page 1-3")) - assert pages["data"]["text"] == "Page 1 text\n\nPage 2 text\n\nPage 3 text" - assert pages["data"]["page_pagination"]["limit"] == 3 - with pytest.raises(PIFSCommandError, match="at most 3"): - executor.execute("cat dsid_limited_pdf --page 1-4") + pages = json.loads(executor.execute("cat dsid_limited_pdf --page 1-5")) + assert pages["data"]["text"] == ( + "Page 1 text\n\nPage 2 text\n\nPage 3 text\n\nPage 4 text\n\nPage 5 text" + ) + assert pages["data"]["page_pagination"]["limit"] == 5 + with pytest.raises(PIFSCommandError, match="at most 5"): + executor.execute("cat dsid_limited_pdf --page 1-6") + with pytest.raises(PIFSCommandError, match="evidence is sufficient"): + executor.execute("cat dsid_limited_pdf --page 1-6") nodes = json.loads( - executor.execute("cat dsid_limited_pdf --node 0001 0002 0003 0004 0005") + executor.execute( + "cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 " + "0006 0007 0008 0009 0010" + ) ) - assert nodes["data"]["node_ids"] == ["0001", "0002", "0003", "0004", "0005"] + assert nodes["data"]["node_ids"] == [ + "0001", + "0002", + "0003", + "0004", + "0005", + "0006", + "0007", + "0008", + "0009", + "0010", + ] comma_nodes = json.loads( executor.execute("cat dsid_limited_pdf --node 0001,0002") ) assert comma_nodes["data"]["node_ids"] == ["0001", "0002"] - with pytest.raises(PIFSCommandError, match="at most 5"): - executor.execute("cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 0006") + with pytest.raises(PIFSCommandError, match="at most 10"): + executor.execute( + "cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 " + "0006 0007 0008 0009 0010 0011" + ) + with pytest.raises(PIFSCommandError, match="continue with additional chunks"): + executor.execute( + "cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 " + "0006 0007 0008 0009 0010 0011" + ) with pytest.raises(PIFSCommandError, match="cat accepts one file target"): executor.execute("cat dsid_limited_pdf 0001") diff --git a/tests/test_pifs_agent_stream.py b/tests/test_pifs_agent_stream.py index 9fe62c1..5dae40b 100644 --- a/tests/test_pifs_agent_stream.py +++ b/tests/test_pifs_agent_stream.py @@ -202,6 +202,8 @@ class PIFSAgentStreamTest(unittest.TestCase): self.assertIn("prefer cat --node ", AGENT_TOOL_POLICY) self.assertIn("page-level evidence", AGENT_TOOL_POLICY) self.assertIn("prefer\ncat --node ", BASH_TOOL_DESCRIPTION) + self.assertIn("stop if the evidence is sufficient", AGENT_TOOL_POLICY) + self.assertIn("continue with another chunk before answering", BASH_TOOL_DESCRIPTION) def test_prompt_requires_stat_for_metadata_questions(self): self.assertIn("stat --schema and stat ", AGENT_TOOL_POLICY)