mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-12 19:55:17 +02:00
fix(filesystem): relax structural read limits
This commit is contained in:
parent
2b69719f60
commit
cd571ddbaf
4 changed files with 52 additions and 19 deletions
|
|
@ -74,8 +74,10 @@ ambiguous. After structure identifies a relevant section node, prefer
|
|||
cat <path> --node <node_id>; use cat <path> --page <range> when the user asks
|
||||
for page-level evidence, no suitable node exists, or exact page text is needed.
|
||||
cat <path> --structure is paginated; request more with --offset if needed. Page
|
||||
reads are limited to three pages at once, node reads to at most five node ids,
|
||||
and text cat --all returns only the first page of text lines.
|
||||
reads are limited to five pages at once, node reads to at most ten node ids,
|
||||
and text cat --all returns only the first page of text lines. If a cat limit
|
||||
error requires a smaller call, stop when the evidence is sufficient; otherwise
|
||||
continue with another chunk before answering.
|
||||
For questions about metadata fields, available summaries, or whether metadata
|
||||
was provided, inspect stat --schema and stat <target> before making claims.
|
||||
Do not use stat as a general content/topic discovery step. For document Q&A,
|
||||
|
|
@ -101,8 +103,9 @@ Tool policy:
|
|||
- Use stat only for metadata/schema/status questions or to resolve ambiguous target identity. Do not run stat merely to understand what a document says.
|
||||
- Prefer target-first cat syntax with stable targets: cat <path> --structure, cat <path> --page 31-59, cat <path> --node <node_id>.
|
||||
- cat <target> --structure returns at most 25 nodes; use --offset and --limit for more structure pages.
|
||||
- cat <target> --page accepts at most 3 pages at once. If a larger range is needed, first inspect cat <target> --structure and then read a smaller page range or node.
|
||||
- cat <target> --node accepts at most 5 node ids at once. Prefer one relevant node when possible.
|
||||
- cat <target> --page accepts at most 5 pages at once. If a larger range is needed, first inspect cat <target> --structure and then read a smaller page range or node.
|
||||
- cat <target> --node accepts at most 10 node ids at once. Prefer relevant nodes from structure when possible.
|
||||
- When recovering from cat page/node/text limit errors, stop if the evidence is sufficient; if it is not sufficient, make another smaller call before answering.
|
||||
- cat <target> --all returns at most 100 text lines; use cat <target> --range <start>-<end> for the next page.
|
||||
- After cat <target> --structure finds a relevant section/subsection with a node_id, prefer cat <target> --node <node_id> for content from that semantic unit.
|
||||
- Use cat <target> --page <start>-<end> when the user explicitly asks for pages/page ranges, when no suitable node_id exists, or when you need exact page text to verify page-level evidence.
|
||||
|
|
|
|||
|
|
@ -54,9 +54,9 @@ class PIFSCommandExecutor:
|
|||
MAX_GREP_LIMIT = 20
|
||||
MAX_SEMANTIC_LIMIT = 20
|
||||
MAX_TEXT_LINES = 100
|
||||
MAX_PAGE_SPAN = 3
|
||||
MAX_PAGE_SPAN = 5
|
||||
MAX_STRUCTURE_NODES = 25
|
||||
MAX_NODE_IDS = 5
|
||||
MAX_NODE_IDS = 10
|
||||
MAX_NODE_TEXT_LINES = 100
|
||||
MAX_NODE_TEXT_CHARS = 12_000
|
||||
MAX_STAT_FIELD_TARGETS = 20
|
||||
|
|
@ -107,8 +107,8 @@ class PIFSCommandExecutor:
|
|||
"- find <folder> -maxdepth N -type f|d: bounded folder traversal for find",
|
||||
"- grep -R: recursive lexical/FTS search only; semantic vector prefilter is disabled",
|
||||
"- cat <path|file_ref|document_id> --structure: cached PageIndex node list, paginated at 25 nodes",
|
||||
"- cat <path|file_ref|document_id> --page: cached PageIndex page reads, limited to 3 pages",
|
||||
"- cat <path|file_ref|document_id> --node: cached PageIndex node reads, limited to 5 node ids",
|
||||
"- cat <path|file_ref|document_id> --page: cached PageIndex page reads, limited to 5 pages",
|
||||
"- cat <path|file_ref|document_id> --node: cached PageIndex node reads, limited to 10 node ids",
|
||||
"- cat <path|file_ref|document_id> --all: text artifact reads for txt/text files, paginated at 100 lines",
|
||||
"- stat --field <metadata_field> <target...>: one metadata field across up to 20 documents",
|
||||
]
|
||||
|
|
@ -2003,8 +2003,10 @@ class PIFSCommandExecutor:
|
|||
if value > max_value:
|
||||
raise PIFSCommandError(
|
||||
f"{label} supports at most {max_value}; requested {value}. "
|
||||
"Use a smaller value. If you are unsure where to inspect, "
|
||||
"use cat <target> --structure first."
|
||||
"Split it into a smaller call. If the evidence is sufficient, "
|
||||
"stop; if not, continue with additional chunks before "
|
||||
"answering. If you are unsure where to inspect, use cat <target> "
|
||||
"--structure first."
|
||||
)
|
||||
return value
|
||||
|
||||
|
|
|
|||
|
|
@ -474,22 +474,48 @@ def test_cat_structure_page_node_and_text_outputs_are_hard_limited():
|
|||
assert len(second_structure["data"]["structure"]) == 5
|
||||
assert second_structure["data"]["structure"][0]["node_id"] == "0026"
|
||||
|
||||
pages = json.loads(executor.execute("cat dsid_limited_pdf --page 1-3"))
|
||||
assert pages["data"]["text"] == "Page 1 text\n\nPage 2 text\n\nPage 3 text"
|
||||
assert pages["data"]["page_pagination"]["limit"] == 3
|
||||
with pytest.raises(PIFSCommandError, match="at most 3"):
|
||||
executor.execute("cat dsid_limited_pdf --page 1-4")
|
||||
pages = json.loads(executor.execute("cat dsid_limited_pdf --page 1-5"))
|
||||
assert pages["data"]["text"] == (
|
||||
"Page 1 text\n\nPage 2 text\n\nPage 3 text\n\nPage 4 text\n\nPage 5 text"
|
||||
)
|
||||
assert pages["data"]["page_pagination"]["limit"] == 5
|
||||
with pytest.raises(PIFSCommandError, match="at most 5"):
|
||||
executor.execute("cat dsid_limited_pdf --page 1-6")
|
||||
with pytest.raises(PIFSCommandError, match="evidence is sufficient"):
|
||||
executor.execute("cat dsid_limited_pdf --page 1-6")
|
||||
|
||||
nodes = json.loads(
|
||||
executor.execute("cat dsid_limited_pdf --node 0001 0002 0003 0004 0005")
|
||||
executor.execute(
|
||||
"cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 "
|
||||
"0006 0007 0008 0009 0010"
|
||||
)
|
||||
)
|
||||
assert nodes["data"]["node_ids"] == ["0001", "0002", "0003", "0004", "0005"]
|
||||
assert nodes["data"]["node_ids"] == [
|
||||
"0001",
|
||||
"0002",
|
||||
"0003",
|
||||
"0004",
|
||||
"0005",
|
||||
"0006",
|
||||
"0007",
|
||||
"0008",
|
||||
"0009",
|
||||
"0010",
|
||||
]
|
||||
comma_nodes = json.loads(
|
||||
executor.execute("cat dsid_limited_pdf --node 0001,0002")
|
||||
)
|
||||
assert comma_nodes["data"]["node_ids"] == ["0001", "0002"]
|
||||
with pytest.raises(PIFSCommandError, match="at most 5"):
|
||||
executor.execute("cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 0006")
|
||||
with pytest.raises(PIFSCommandError, match="at most 10"):
|
||||
executor.execute(
|
||||
"cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 "
|
||||
"0006 0007 0008 0009 0010 0011"
|
||||
)
|
||||
with pytest.raises(PIFSCommandError, match="continue with additional chunks"):
|
||||
executor.execute(
|
||||
"cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 "
|
||||
"0006 0007 0008 0009 0010 0011"
|
||||
)
|
||||
|
||||
with pytest.raises(PIFSCommandError, match="cat accepts one file target"):
|
||||
executor.execute("cat dsid_limited_pdf 0001")
|
||||
|
|
|
|||
|
|
@ -202,6 +202,8 @@ class PIFSAgentStreamTest(unittest.TestCase):
|
|||
self.assertIn("prefer cat <target> --node <node_id>", AGENT_TOOL_POLICY)
|
||||
self.assertIn("page-level evidence", AGENT_TOOL_POLICY)
|
||||
self.assertIn("prefer\ncat <path> --node <node_id>", BASH_TOOL_DESCRIPTION)
|
||||
self.assertIn("stop if the evidence is sufficient", AGENT_TOOL_POLICY)
|
||||
self.assertIn("continue with another chunk before answering", BASH_TOOL_DESCRIPTION)
|
||||
|
||||
def test_prompt_requires_stat_for_metadata_questions(self):
|
||||
self.assertIn("stat --schema and stat <target>", AGENT_TOOL_POLICY)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue