fix(filesystem): relax structural read limits

2026-06-12 19:55:17 +02:00 · 2026-05-26 20:57:08 +08:00 · 2026-05-26 20:57:08 +08:00 · cd571ddbaf
commit cd571ddbaf
parent 2b69719f60
4 changed files with 52 additions and 19 deletions
--- a/pageindex/filesystem/agent.py
+++ b/pageindex/filesystem/agent.py
@@ -74,8 +74,10 @@ ambiguous. After structure identifies a relevant section node, prefer
 cat <path> --node <node_id>; use cat <path> --page <range> when the user asks
 for page-level evidence, no suitable node exists, or exact page text is needed.
 cat <path> --structure is paginated; request more with --offset if needed. Page
-reads are limited to three pages at once, node reads to at most five node ids,
-and text cat --all returns only the first page of text lines.
+reads are limited to five pages at once, node reads to at most ten node ids,
+and text cat --all returns only the first page of text lines. If a cat limit
+error requires a smaller call, stop when the evidence is sufficient; otherwise
+continue with another chunk before answering.
 For questions about metadata fields, available summaries, or whether metadata
 was provided, inspect stat --schema and stat <target> before making claims.
 Do not use stat as a general content/topic discovery step. For document Q&A,
@@ -101,8 +103,9 @@ Tool policy:
 - Use stat only for metadata/schema/status questions or to resolve ambiguous target identity. Do not run stat merely to understand what a document says.
 - Prefer target-first cat syntax with stable targets: cat <path> --structure, cat <path> --page 31-59, cat <path> --node <node_id>.
 - cat <target> --structure returns at most 25 nodes; use --offset and --limit for more structure pages.
-- cat <target> --page accepts at most 3 pages at once. If a larger range is needed, first inspect cat <target> --structure and then read a smaller page range or node.
-- cat <target> --node accepts at most 5 node ids at once. Prefer one relevant node when possible.
+- cat <target> --page accepts at most 5 pages at once. If a larger range is needed, first inspect cat <target> --structure and then read a smaller page range or node.
+- cat <target> --node accepts at most 10 node ids at once. Prefer relevant nodes from structure when possible.
+- When recovering from cat page/node/text limit errors, stop if the evidence is sufficient; if it is not sufficient, make another smaller call before answering.
 - cat <target> --all returns at most 100 text lines; use cat <target> --range <start>-<end> for the next page.
 - After cat <target> --structure finds a relevant section/subsection with a node_id, prefer cat <target> --node <node_id> for content from that semantic unit.
 - Use cat <target> --page <start>-<end> when the user explicitly asks for pages/page ranges, when no suitable node_id exists, or when you need exact page text to verify page-level evidence.
--- a/pageindex/filesystem/commands.py
+++ b/pageindex/filesystem/commands.py
@@ -54,9 +54,9 @@ class PIFSCommandExecutor:
    MAX_GREP_LIMIT = 20
    MAX_SEMANTIC_LIMIT = 20
    MAX_TEXT_LINES = 100
-    MAX_PAGE_SPAN = 3
+    MAX_PAGE_SPAN = 5
    MAX_STRUCTURE_NODES = 25
-    MAX_NODE_IDS = 5
+    MAX_NODE_IDS = 10
    MAX_NODE_TEXT_LINES = 100
    MAX_NODE_TEXT_CHARS = 12_000
    MAX_STAT_FIELD_TARGETS = 20
@@ -107,8 +107,8 @@ class PIFSCommandExecutor:
            "- find <folder> -maxdepth N -type f|d: bounded folder traversal for find",
            "- grep -R: recursive lexical/FTS search only; semantic vector prefilter is disabled",
            "- cat <path|file_ref|document_id> --structure: cached PageIndex node list, paginated at 25 nodes",
-            "- cat <path|file_ref|document_id> --page: cached PageIndex page reads, limited to 3 pages",
-            "- cat <path|file_ref|document_id> --node: cached PageIndex node reads, limited to 5 node ids",
+            "- cat <path|file_ref|document_id> --page: cached PageIndex page reads, limited to 5 pages",
+            "- cat <path|file_ref|document_id> --node: cached PageIndex node reads, limited to 10 node ids",
            "- cat <path|file_ref|document_id> --all: text artifact reads for txt/text files, paginated at 100 lines",
            "- stat --field <metadata_field> <target...>: one metadata field across up to 20 documents",
        ]
@@ -2003,8 +2003,10 @@ class PIFSCommandExecutor:
        if value > max_value:
            raise PIFSCommandError(
                f"{label} supports at most {max_value}; requested {value}. "
-                "Use a smaller value. If you are unsure where to inspect, "
-                "use cat <target> --structure first."
+                "Split it into a smaller call. If the evidence is sufficient, "
+                "stop; if not, continue with additional chunks before "
+                "answering. If you are unsure where to inspect, use cat <target> "
+                "--structure first."
            )
        return value

--- a/tests/test_pageindex_structural_read.py
+++ b/tests/test_pageindex_structural_read.py
@@ -474,22 +474,48 @@ def test_cat_structure_page_node_and_text_outputs_are_hard_limited():
        assert len(second_structure["data"]["structure"]) == 5
        assert second_structure["data"]["structure"][0]["node_id"] == "0026"

-        pages = json.loads(executor.execute("cat dsid_limited_pdf --page 1-3"))
-        assert pages["data"]["text"] == "Page 1 text\n\nPage 2 text\n\nPage 3 text"
-        assert pages["data"]["page_pagination"]["limit"] == 3
-        with pytest.raises(PIFSCommandError, match="at most 3"):
-            executor.execute("cat dsid_limited_pdf --page 1-4")
+        pages = json.loads(executor.execute("cat dsid_limited_pdf --page 1-5"))
+        assert pages["data"]["text"] == (
+            "Page 1 text\n\nPage 2 text\n\nPage 3 text\n\nPage 4 text\n\nPage 5 text"
+        )
+        assert pages["data"]["page_pagination"]["limit"] == 5
+        with pytest.raises(PIFSCommandError, match="at most 5"):
+            executor.execute("cat dsid_limited_pdf --page 1-6")
+        with pytest.raises(PIFSCommandError, match="evidence is sufficient"):
+            executor.execute("cat dsid_limited_pdf --page 1-6")

        nodes = json.loads(
-            executor.execute("cat dsid_limited_pdf --node 0001 0002 0003 0004 0005")
+            executor.execute(
+                "cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 "
+                "0006 0007 0008 0009 0010"
+            )
        )
-        assert nodes["data"]["node_ids"] == ["0001", "0002", "0003", "0004", "0005"]
+        assert nodes["data"]["node_ids"] == [
+            "0001",
+            "0002",
+            "0003",
+            "0004",
+            "0005",
+            "0006",
+            "0007",
+            "0008",
+            "0009",
+            "0010",
+        ]
        comma_nodes = json.loads(
            executor.execute("cat dsid_limited_pdf --node 0001,0002")
        )
        assert comma_nodes["data"]["node_ids"] == ["0001", "0002"]
-        with pytest.raises(PIFSCommandError, match="at most 5"):
-            executor.execute("cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 0006")
+        with pytest.raises(PIFSCommandError, match="at most 10"):
+            executor.execute(
+                "cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 "
+                "0006 0007 0008 0009 0010 0011"
+            )
+        with pytest.raises(PIFSCommandError, match="continue with additional chunks"):
+            executor.execute(
+                "cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 "
+                "0006 0007 0008 0009 0010 0011"
+            )

        with pytest.raises(PIFSCommandError, match="cat accepts one file target"):
            executor.execute("cat dsid_limited_pdf 0001")
--- a/tests/test_pifs_agent_stream.py
+++ b/tests/test_pifs_agent_stream.py
@@ -202,6 +202,8 @@ class PIFSAgentStreamTest(unittest.TestCase):
        self.assertIn("prefer cat <target> --node <node_id>", AGENT_TOOL_POLICY)
        self.assertIn("page-level evidence", AGENT_TOOL_POLICY)
        self.assertIn("prefer\ncat <path> --node <node_id>", BASH_TOOL_DESCRIPTION)
+        self.assertIn("stop if the evidence is sufficient", AGENT_TOOL_POLICY)
+        self.assertIn("continue with another chunk before answering", BASH_TOOL_DESCRIPTION)

    def test_prompt_requires_stat_for_metadata_questions(self):
        self.assertIn("stat --schema and stat <target>", AGENT_TOOL_POLICY)