mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-12 19:55:17 +02:00
fix(filesystem): require target-first cat syntax
This commit is contained in:
parent
3d62293a1e
commit
b9ee711087
5 changed files with 57 additions and 37 deletions
|
|
@ -5,7 +5,7 @@ This mirrors examples/agentic_vectorless_rag_demo.py, but exposes a corpus
|
|||
through the PageIndex FileSystem shell instead of direct PageIndex document
|
||||
tools. The agent receives one read-only bash-like PIFS tool and must retrieve
|
||||
evidence through commands such as ls, tree, find, grep, search-summary,
|
||||
cat --structure, cat --page, and cat --node.
|
||||
cat <ref> --structure, cat <ref> --page, and cat <ref> --node.
|
||||
|
||||
The demo uses PDFs under examples/documents. When a matching
|
||||
examples/documents/results/*_structure.json file exists, it is loaded into the
|
||||
|
|
@ -71,12 +71,12 @@ Retrieval strategy:
|
|||
- Use grep -R only for lexical evidence; do not treat semantic candidates as
|
||||
literal matches.
|
||||
- Run one evidence command at a time. Do not chain large commands like
|
||||
cat --structure, grep, and cat --page in one bash call.
|
||||
- For PDFs, use cat --structure <ref> to inspect the PageIndex tree, then
|
||||
cat --page <range> <ref> for evidence, for example:
|
||||
cat --page 31-35 ref_1
|
||||
- For page-range questions, use cat --structure to identify the full section
|
||||
range. Then run cat --page on the smallest useful evidence range, usually the
|
||||
cat <ref> --structure, grep, and cat <ref> --page in one bash call.
|
||||
- For PDFs, use cat <ref> --structure to inspect the PageIndex tree, then
|
||||
cat <ref> --page <range> for evidence, for example:
|
||||
cat ref_1 --page 31-35
|
||||
- For page-range questions, use cat <ref> --structure to identify the full section
|
||||
range. Then run cat <ref> --page on the smallest useful evidence range, usually the
|
||||
section start page or first 1-2 pages, before the final answer. Do not print
|
||||
a broad multi-page section unless the user asks to read the whole section.
|
||||
- Do not use cat <ref> --all on PDFs.
|
||||
|
|
@ -646,7 +646,7 @@ def run_smoke_commands(
|
|||
verbose=verbose,
|
||||
)
|
||||
|
||||
command = f"cat --structure {first_ref}"
|
||||
command = f"cat {first_ref} --structure"
|
||||
structure_payload = execute_json_command(json_executor, command)
|
||||
structure_data = structure_payload.get("data") or {}
|
||||
structure = structure_data.get("structure") or []
|
||||
|
|
@ -664,7 +664,7 @@ def run_smoke_commands(
|
|||
)
|
||||
|
||||
evidence_range = opening_page_range_for_node(supervision_node) or "1-2"
|
||||
command = f"cat --page {evidence_range} {first_ref}"
|
||||
command = f"cat {first_ref} --page {evidence_range}"
|
||||
page = execute_json_command(json_executor, command)
|
||||
page_text = str((page.get("data") or {}).get("text") or "")
|
||||
show_capability(
|
||||
|
|
|
|||
|
|
@ -36,7 +36,8 @@ commands described in the workspace context. grep -R is lexical evidence search;
|
|||
semantic search commands return candidate documents and do not guarantee literal
|
||||
text matches. Errors are returned as text prefixed with ERROR. Do not call
|
||||
commands that are not listed as available. When evidence is required, inspect it
|
||||
with cat or grep before answering.
|
||||
with cat or grep before answering. Prefer shell-like target-first cat syntax:
|
||||
cat <ref> --structure, cat <ref> --page 31-59, and cat <ref> --node 0009.
|
||||
"""
|
||||
|
||||
AGENT_TOOL_POLICY = """
|
||||
|
|
@ -48,6 +49,8 @@ Tool policy:
|
|||
- Semantic search commands are candidate-discovery tools and do not guarantee literal text matches.
|
||||
- Tool errors are returned as ERROR text; recover by trying an available command.
|
||||
- Use cat or grep to gather evidence before making source-backed claims.
|
||||
- Prefer target-first cat syntax: cat <ref> --structure, cat <ref> --page 31-59, cat <ref> --node <node_id>.
|
||||
- Do not call cat --page <ref> <start> <end>; if you need a page span, use cat <ref> --page <start>-<end>.
|
||||
"""
|
||||
|
||||
STREAM_MODE_ALIASES = {
|
||||
|
|
|
|||
|
|
@ -90,8 +90,8 @@ class PIFSCommandExecutor:
|
|||
"- ls/tree: folder browsing",
|
||||
"- find --where: exact/canonical metadata DSL filtering",
|
||||
"- grep -R: recursive lexical/FTS search only; semantic vector prefilter is disabled",
|
||||
"- cat --structure/--node/--page: cached PageIndex reads for PDF/Markdown files",
|
||||
"- cat --all: full text artifact reads for txt/text files",
|
||||
"- cat <ref> --structure/--node/--page: cached PageIndex reads for PDF/Markdown files",
|
||||
"- cat <ref> --all: full text artifact reads for txt/text files",
|
||||
]
|
||||
if "entity" in semantic_channels:
|
||||
lines.append("- find --name: entity semantic candidate discovery alias")
|
||||
|
|
@ -405,12 +405,17 @@ class PIFSCommandExecutor:
|
|||
def _cmd_cat(self, args: list[str]) -> Any:
|
||||
if not args:
|
||||
raise PIFSCommandError("cat requires a file target")
|
||||
target = None
|
||||
target = args[0]
|
||||
if target.startswith("-"):
|
||||
raise PIFSCommandError(
|
||||
"cat syntax is target-first: cat <ref> --structure, "
|
||||
"cat <ref> --page 31-59, or cat <ref> --node 0009"
|
||||
)
|
||||
location = "all"
|
||||
structural_mode: str | None = None
|
||||
node_id: str | None = None
|
||||
page_range: str | None = None
|
||||
i = 0
|
||||
i = 1
|
||||
while i < len(args):
|
||||
arg = args[i]
|
||||
if arg == "--range":
|
||||
|
|
@ -437,16 +442,22 @@ class PIFSCommandExecutor:
|
|||
elif arg.startswith("-"):
|
||||
raise PIFSCommandError(f"Unsupported cat option: {arg}")
|
||||
else:
|
||||
target = arg
|
||||
raise PIFSCommandError(
|
||||
"cat accepts one file target. Use: cat <ref> --page <page-or-range>, "
|
||||
"for example: cat ref_1 --page 31-59"
|
||||
)
|
||||
i += 1
|
||||
if not target:
|
||||
raise PIFSCommandError("cat requires a file target")
|
||||
if structural_mode == "structure":
|
||||
return self.filesystem.pageindex_structure(target)
|
||||
if structural_mode == "node":
|
||||
return self.filesystem.pageindex_node(target, str(node_id))
|
||||
if structural_mode == "page":
|
||||
return self.filesystem.pageindex_pages(target, str(page_range))
|
||||
if not page_range or not re.fullmatch(r"\d+(?:-\d+)?", page_range):
|
||||
raise PIFSCommandError(
|
||||
"cat --page requires one page selector like 31 or 31-59. "
|
||||
"Use: cat <ref> --page <page-or-range>"
|
||||
)
|
||||
return self.filesystem.pageindex_pages(target, page_range)
|
||||
return self.filesystem.cat_text_artifact(target, location)
|
||||
|
||||
def _cmd_stat(self, args: list[str]) -> Any:
|
||||
|
|
|
|||
|
|
@ -677,7 +677,8 @@ class PageIndexFileSystem:
|
|||
raise ValueError(
|
||||
f"{command} is only supported for txt/text files; "
|
||||
f"got source_path={entry.source_path!r}, content_type={entry.content_type!r}. "
|
||||
"Use cat --structure, cat --page, or cat --node for PDF/Markdown PageIndex files."
|
||||
"Use cat <ref> --structure, cat <ref> --page, or cat <ref> --node "
|
||||
"for PDF/Markdown PageIndex files."
|
||||
)
|
||||
|
||||
def _require_pageindex_document_file(self, entry: Any, command: str) -> None:
|
||||
|
|
@ -686,7 +687,7 @@ class PageIndexFileSystem:
|
|||
raise ValueError(
|
||||
f"{command} is only supported for PDF/Markdown PageIndex files; "
|
||||
f"got source_path={entry.source_path!r}, content_type={entry.content_type!r}. "
|
||||
"Use cat --all for txt/text files."
|
||||
"Use cat <ref> --all for txt/text files."
|
||||
)
|
||||
|
||||
@classmethod
|
||||
|
|
|
|||
|
|
@ -67,9 +67,9 @@ def test_pageindex_structure_options_report_failed_register_build(monkeypatch):
|
|||
)
|
||||
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
||||
|
||||
structure = json.loads(executor.execute("cat --structure dsid_structural_missing"))
|
||||
node = json.loads(executor.execute("cat --node 0001 dsid_structural_missing"))
|
||||
pages = json.loads(executor.execute("cat --page 1-2 dsid_structural_missing"))
|
||||
structure = json.loads(executor.execute("cat dsid_structural_missing --structure"))
|
||||
node = json.loads(executor.execute("cat dsid_structural_missing --node 0001"))
|
||||
pages = json.loads(executor.execute("cat dsid_structural_missing --page 1-2"))
|
||||
stat = json.loads(executor.execute("stat dsid_structural_missing"))
|
||||
|
||||
assert structure["data"]["mode"] == "structure"
|
||||
|
|
@ -282,6 +282,7 @@ def test_register_pdf_markdown_cache_miss_invokes_pageindex_client_index(monkeyp
|
|||
def test_cat_structure_page_reuses_pageindex_client_cache_without_indexing(monkeypatch):
|
||||
from pageindex import PageIndexClient
|
||||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||
from pageindex.filesystem.commands import PIFSCommandError
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
source = Path(tmp) / "report.pdf"
|
||||
|
|
@ -333,8 +334,8 @@ def test_cat_structure_page_reuses_pageindex_client_cache_without_indexing(monke
|
|||
)
|
||||
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
||||
|
||||
structure = json.loads(executor.execute("cat --structure dsid_structural_cached"))
|
||||
pages = json.loads(executor.execute("cat --page 1-2 dsid_structural_cached"))
|
||||
structure = json.loads(executor.execute("cat dsid_structural_cached --structure"))
|
||||
pages = json.loads(executor.execute("cat dsid_structural_cached --page 1-2"))
|
||||
stat = json.loads(executor.execute("stat dsid_structural_cached"))
|
||||
|
||||
assert structure["data"]["available"] is True
|
||||
|
|
@ -345,6 +346,10 @@ def test_cat_structure_page_reuses_pageindex_client_cache_without_indexing(monke
|
|||
|
||||
assert pages["data"]["available"] is True
|
||||
assert pages["data"]["text"] == "Page one text\n\nPage two text"
|
||||
with pytest.raises(PIFSCommandError, match="target-first"):
|
||||
executor.execute("cat --page 1-2 dsid_structural_cached")
|
||||
with pytest.raises(PIFSCommandError, match="one file target"):
|
||||
executor.execute("cat dsid_structural_cached --page 1 2")
|
||||
|
||||
assert stat["data"]["pageindex_doc_id"] == "doc_cached_pdf"
|
||||
assert stat["data"]["pageindex_tree_status"] == "built"
|
||||
|
|
@ -387,7 +392,7 @@ def test_cat_node_reads_pageindex_client_structure_without_custom_pifs_artifact(
|
|||
)
|
||||
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
||||
|
||||
node = json.loads(executor.execute("cat --node 0001 dsid_md_cached"))
|
||||
node = json.loads(executor.execute("cat dsid_md_cached --node 0001"))
|
||||
|
||||
assert node["data"]["available"] is True
|
||||
assert node["data"]["pageindex_doc_id"] == "doc_cached_md"
|
||||
|
|
@ -453,7 +458,7 @@ def test_tree_does_not_read_file_internal_pageindex_structure():
|
|||
with pytest.raises(PIFSCommandError):
|
||||
executor.execute("tree dsid_tree_is_folder_only")
|
||||
|
||||
structure = json.loads(executor.execute("cat --structure dsid_tree_is_folder_only"))
|
||||
structure = json.loads(executor.execute("cat dsid_tree_is_folder_only --structure"))
|
||||
assert structure["data"]["structure"][0]["title"] == "Introduction"
|
||||
|
||||
|
||||
|
|
@ -494,19 +499,19 @@ def test_cat_all_is_limited_to_text_files():
|
|||
)
|
||||
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
||||
|
||||
text = json.loads(executor.execute("cat --all dsid_text_file"))
|
||||
text = json.loads(executor.execute("cat dsid_text_file --all"))
|
||||
assert text["data"]["text"] == "plain text body"
|
||||
|
||||
with pytest.raises(PIFSCommandError, match="only supported for txt/text files"):
|
||||
executor.execute("cat --all dsid_pdf_file")
|
||||
executor.execute("cat dsid_pdf_file --all")
|
||||
with pytest.raises(ValueError, match="not supported for PDF/Markdown"):
|
||||
filesystem.open("dsid_pdf_file")
|
||||
with pytest.raises(PIFSCommandError, match="only supported for txt/text files"):
|
||||
executor.execute("cat --all dsid_md_file")
|
||||
executor.execute("cat dsid_md_file --all")
|
||||
with pytest.raises(ValueError, match="not supported for PDF/Markdown"):
|
||||
filesystem.open("dsid_md_file")
|
||||
with pytest.raises(PIFSCommandError, match="only supported for txt/text files"):
|
||||
executor.execute("cat --all dsid_json_file")
|
||||
executor.execute("cat dsid_json_file --all")
|
||||
assert filesystem.open("dsid_json_file").text == '{"body":"json"}'
|
||||
for command in (
|
||||
"head dsid_pdf_file",
|
||||
|
|
@ -536,9 +541,9 @@ def test_pageindex_structure_commands_are_limited_to_pdf_and_markdown():
|
|||
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
||||
|
||||
for command in (
|
||||
"cat --structure dsid_text_only",
|
||||
"cat --page 1 dsid_text_only",
|
||||
"cat --node 0001 dsid_text_only",
|
||||
"cat dsid_text_only --structure",
|
||||
"cat dsid_text_only --page 1",
|
||||
"cat dsid_text_only --node 0001",
|
||||
):
|
||||
with pytest.raises(PIFSCommandError, match="only supported for PDF/Markdown"):
|
||||
executor.execute(command)
|
||||
|
|
@ -581,10 +586,10 @@ def test_existing_pageindex_status_allows_legacy_record_without_format_suffix():
|
|||
)
|
||||
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
||||
|
||||
structure = json.loads(executor.execute("cat --structure dsid_legacy_pageindex"))
|
||||
structure = json.loads(executor.execute("cat dsid_legacy_pageindex --structure"))
|
||||
assert structure["data"]["structure"][0]["title"] == "Uploaded"
|
||||
with pytest.raises(PIFSCommandError, match="only supported for txt/text files"):
|
||||
executor.execute("cat --all dsid_legacy_pageindex")
|
||||
executor.execute("cat dsid_legacy_pageindex --all")
|
||||
|
||||
|
||||
def test_read_commands_do_not_link_pageindex_cache_when_pointer_is_missing(monkeypatch):
|
||||
|
|
@ -624,7 +629,7 @@ def test_read_commands_do_not_link_pageindex_cache_when_pointer_is_missing(monke
|
|||
)
|
||||
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
||||
|
||||
structure = json.loads(executor.execute("cat --structure dsid_late_cache"))
|
||||
structure = json.loads(executor.execute("cat dsid_late_cache --structure"))
|
||||
stat = json.loads(executor.execute("stat dsid_late_cache"))
|
||||
|
||||
assert structure["data"]["available"] is False
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue