diff --git a/examples/pifs_demo.py b/examples/pifs_demo.py index f5dffe7..8b80457 100644 --- a/examples/pifs_demo.py +++ b/examples/pifs_demo.py @@ -86,11 +86,16 @@ Retrieval strategy: find /documents --where '{"file_format":"pdf"}' - Use grep -R only for lexical evidence; do not treat semantic candidates as literal matches. +- Use grep for one selected file; use grep -R only with folder + targets. - Run one evidence command at a time. Do not chain large commands like cat --structure, grep, and cat --page in one bash call. - For PDFs, use cat --structure to inspect the PageIndex tree, then cat --page for evidence, for example: cat /documents/2023-annual-report.pdf --page 31-35 +- Do not use cat --page as the first inspection command for a selected PDF. + Run cat --structure for that same target first, then choose pages. +- Do not guess cat --page ranges from grep line numbers. - For page-range questions, use cat --structure to identify the full section range. Then run cat --page on the smallest useful evidence range, usually the section start page or first 1-2 pages, before the final answer. Do not print diff --git a/pageindex/filesystem/agent.py b/pageindex/filesystem/agent.py index 6691f01..7b98316 100644 --- a/pageindex/filesystem/agent.py +++ b/pageindex/filesystem/agent.py @@ -59,8 +59,9 @@ operating-system shell. By default the tool is read-only: use ls, tree, find, grep, cat, stat, and browse when listed in the workspace context. grep -R is lexical evidence search; grep does not support regex alternation such as "a|b"; run multiple grep commands or use browse for -relevance-ranked file discovery instead. Start broad workspace questions with -ls or tree to understand folders. After choosing a folder, use positional +relevance-ranked file discovery instead. Use grep for one +selected file; use grep -R only with folder targets. Start broad workspace +questions with ls or tree to understand folders. After choosing a folder, use positional browse syntax with a quoted query, for example: browse /documents "Federal Reserve". If the relevant folder is uncertain, use browse -R /documents "Federal Reserve" to retrieve file candidates across that @@ -78,6 +79,9 @@ JSON, then cat --page for exact page text evidence. Page reads are limited to five pages at once, and text cat --all returns only the first page of text lines. If a cat limit error requires a smaller call, stop when the evidence is sufficient; otherwise continue with another chunk before answering. +Do not use cat --page as the first inspection command for a selected PDF or +PageIndex document; run cat --structure for that same target first. +Do not guess PDF page numbers from grep line numbers or text offsets. For questions about metadata fields, available summaries, or whether metadata was provided, inspect stat --schema and stat before making claims. Do not use stat as a general content/topic discovery step. For document Q&A, @@ -98,6 +102,7 @@ Tool policy: - browse returns file candidates only; Do not use browse as folder semantic recall. - browse candidates are not final evidence. After selecting candidates, verify the relevant facts with cat or grep before making source-backed claims. - grep -R performs lexical evidence search. +- Use grep for a selected single file. Use grep -R only with folder targets; do not run grep -R against one file. - grep does not support regex alternation such as "a|b"; run separate grep commands or use browse for relevance-ranked file discovery. - Do not use find | grep as an exhaustive search or as proof that no document exists; find output can be scoped or limited. Use metadata filters, browse, grep on a narrowed target, or cat on likely candidates instead. - A single failed grep is not enough evidence to say there is no relevant document. If grep returns no matches for a workspace-topic question, verify with browse on a relevant folder or inspect likely document structure before answering no-evidence. @@ -109,11 +114,13 @@ Tool policy: - Use stat only for metadata/schema/status questions or to resolve ambiguous target identity. Do not run stat merely to understand what a document says. - Prefer target-first cat syntax with stable targets: cat --structure, cat --page 31-59. - cat --structure returns the cached PageIndex structure JSON without text fields. +- For PDF/PageIndex document Q&A, run cat --structure before the first cat --page call for that target; page reads without structure are blind page guessing. - cat --page accepts at most 5 pages at once. If a larger range is needed, first inspect cat --structure and then read a smaller page range. - When recovering from cat page/text limit errors, stop if the evidence is sufficient; if it is not sufficient, make another smaller call before answering. - cat --all returns at most 100 text lines; use cat --range - for the next page. - After cat --structure identifies a relevant section/subsection, use cat --page - for exact evidence. - Use cat --page - when the user explicitly asks for pages/page ranges or when you need exact page text to verify evidence. +- Do not guess cat --page ranges from grep line numbers, text offsets, or table-of-contents intuition; use cat --structure to map the document first. - Avoid fetching a broad page span unless page-level citation or verification is required. - Do not call cat --page ; if you need a page span, use cat --page -. - For metadata or summary-field questions, run stat --schema and stat for relevant files before answering; do not infer metadata presence or absence from ls/find output alone. diff --git a/pageindex/filesystem/commands.py b/pageindex/filesystem/commands.py index 3b1819d..a75c0e9 100644 --- a/pageindex/filesystem/commands.py +++ b/pageindex/filesystem/commands.py @@ -75,7 +75,8 @@ class PIFSCommandExecutor: "- find : folder path is positional; do not put paths in --where", "- find --where: exact/canonical metadata DSL filtering using stat --schema fields only", "- find -maxdepth N -type f|d: bounded folder traversal for find", - "- grep -R: recursive lexical/FTS search only; semantic vector prefilter is disabled", + "- grep : single-file lexical evidence search without path prefixes", + "- grep -R : recursive lexical/FTS search only; semantic vector prefilter is disabled", "- cat --structure: cached PageIndex structure JSON without text fields", "- cat --page: cached PageIndex page reads, limited to 5 pages", "- cat --all: text artifact reads for txt/text files, paginated at 100 lines", @@ -444,6 +445,11 @@ class PIFSCommandExecutor: require_match=True, ), } + if recursive: + raise PIFSCommandError( + "grep -R is for folder targets; use grep " + " for a single file" + ) return { "mode": "matches", "query": query, @@ -824,9 +830,10 @@ class PIFSCommandExecutor: for item in data.get("data", []) ) if mode == "matches": + if not data.get("data", []): + return f"# no matches for: {data.get('query', '')}" return "\n".join( - f"{self._file_target_path(item)}:{item['line']}: " - f"{self._compact_text(item['text'], max_chars=220)}" + f"{item['line']}: {self._compact_text(item['text'], max_chars=220)}" for item in data.get("data", []) ) return str(data) diff --git a/tests/test_pifs_agent_stream.py b/tests/test_pifs_agent_stream.py index ea03976..be5f325 100644 --- a/tests/test_pifs_agent_stream.py +++ b/tests/test_pifs_agent_stream.py @@ -227,6 +227,9 @@ class PIFSAgentStreamTest(unittest.TestCase): self.assertIn("cat --structure and cat --page", BASH_TOOL_DESCRIPTION) self.assertIn("stop if the evidence is sufficient", AGENT_TOOL_POLICY) self.assertIn("continue with another chunk before answering", BASH_TOOL_DESCRIPTION) + self.assertIn("run cat --structure before the first cat --page", AGENT_TOOL_POLICY) + self.assertIn("Do not guess cat --page ranges from grep line numbers", AGENT_TOOL_POLICY) + self.assertIn("Do not use cat --page as the first inspection command", BASH_TOOL_DESCRIPTION) self.assertIn("Do not reconstruct paths from", BASH_TOOL_DESCRIPTION) self.assertIn("document titles", BASH_TOOL_DESCRIPTION) self.assertIn("file_ref/document_id", AGENT_TOOL_POLICY) @@ -250,6 +253,8 @@ class PIFSAgentStreamTest(unittest.TestCase): self.assertIn("cat --structure", AGENT_TOOL_POLICY) self.assertIn("cat --page", AGENT_TOOL_POLICY) self.assertIn("Do not use browse as folder semantic recall", AGENT_TOOL_POLICY) + self.assertIn("Use grep for a selected single file", AGENT_TOOL_POLICY) + self.assertIn("Use grep for one\nselected file", BASH_TOOL_DESCRIPTION) def test_default_agent_prompts_do_not_suggest_legacy_semantic_commands(self): prompt_surface = "\n".join( @@ -274,6 +279,8 @@ class PIFSAgentStreamTest(unittest.TestCase): self.assertIn('browse -R /documents "Federal Reserve supervision regulation"', demo_prompt) self.assertIn("verify", demo_prompt) self.assertIn("cat --structure", demo_prompt) + self.assertIn("Use grep for one selected file", demo_prompt) + self.assertIn("Do not guess cat --page ranges from grep line numbers", demo_prompt) self.assertNotIn("search-summary", demo_prompt) def test_prompt_rejects_find_grep_as_exhaustive_search(self): diff --git a/tests/test_pifs_find_maxdepth.py b/tests/test_pifs_find_maxdepth.py index 7fbc445..614ffa2 100644 --- a/tests/test_pifs_find_maxdepth.py +++ b/tests/test_pifs_find_maxdepth.py @@ -97,6 +97,28 @@ def test_stable_path_targets_work_without_session_refs(tmp_path): assert "Root document fixture text" in text +def test_single_file_grep_shell_output_omits_file_path(tmp_path): + executor = _register_find_fixture(tmp_path) + executor.json_output = False + + output = executor.execute("grep Root '/documents/Root document'") + + assert output == "1: Root document fixture text" + assert "/documents/Root document" not in output + assert "file_ref=" not in output + assert "id=doc_root" not in output + + +def test_recursive_grep_rejects_single_file_target(tmp_path): + from pageindex.filesystem.commands import PIFSCommandError + + executor = _register_find_fixture(tmp_path) + executor.json_output = False + + with pytest.raises(PIFSCommandError, match="grep -R is for folder targets"): + executor.execute("grep -R Root '/documents/Root document'") + + def test_shell_limits_reject_context_expanding_counts(tmp_path): from pageindex.filesystem.commands import PIFSCommandError