From a80b84dae432912f6cf5a1cf18bdf332676216c2 Mon Sep 17 00:00:00 2001 From: BukeLy Date: Tue, 26 May 2026 15:39:16 +0800 Subject: [PATCH] fix(filesystem): remove session ref aliases from cli output --- examples/pifs_demo.py | 28 +++++++++--------- pageindex/filesystem/agent.py | 10 ++++--- pageindex/filesystem/commands.py | 51 +++++++++++++++++++++----------- pageindex/filesystem/core.py | 24 ++++----------- tests/test_pifs_find_maxdepth.py | 23 ++++++++++++++ 5 files changed, 82 insertions(+), 54 deletions(-) diff --git a/examples/pifs_demo.py b/examples/pifs_demo.py index fa610ad..230d586 100644 --- a/examples/pifs_demo.py +++ b/examples/pifs_demo.py @@ -5,7 +5,7 @@ This mirrors examples/agentic_vectorless_rag_demo.py, but exposes a corpus through the PageIndex FileSystem shell instead of direct PageIndex document tools. The agent receives one read-only bash-like PIFS tool and must retrieve evidence through commands such as ls, tree, find, grep, search-summary, -cat --structure, cat --page, and cat --node. +cat --structure, cat --page, and cat --node. The demo uses PDFs under examples/documents. When a matching examples/documents/results/*_structure.json file exists, it is loaded into the @@ -61,8 +61,8 @@ shell. The workspace contains registered example PDFs. Retrieval strategy: - Start with ls or tree to understand the workspace. -- Use refs exactly as listed, such as ref_1, or use a concrete file path from - ls output. Do not invent paths like /documents/ref_1. +- Use concrete PIFS paths from ls/find output, such as /documents/report.pdf, + or stable file_ref/document ids. Do not invent temporary ref_N aliases. - Folder paths such as /documents are positional command targets; do not put folder paths inside --where. - Use search-summary when available to find likely documents. @@ -73,12 +73,12 @@ Retrieval strategy: - Use grep -R only for lexical evidence; do not treat semantic candidates as literal matches. - Run one evidence command at a time. Do not chain large commands like - cat --structure, grep, and cat --page in one bash call. -- For PDFs, use cat --structure to inspect the PageIndex tree, then - cat --page for evidence, for example: - cat ref_1 --page 31-35 -- For page-range questions, use cat --structure to identify the full section - range. Then run cat --page on the smallest useful evidence range, usually the + cat --structure, grep, and cat --page in one bash call. +- For PDFs, use cat --structure to inspect the PageIndex tree, then + cat --page for evidence, for example: + cat /documents/2023-annual-report.pdf --page 31-35 +- For page-range questions, use cat --structure to identify the full section + range. Then run cat --page on the smallest useful evidence range, usually the section start page or first 1-2 pages, before the final answer. Do not print a broad multi-page section unless the user asks to read the whole section. - Do not use cat --all on PDFs. @@ -630,11 +630,11 @@ def run_smoke_commands( verbose=verbose, ) - first_ref = registered[0]["file_ref"] if registered else None - if not first_ref: + first_target = f"/documents/{Path(str(registered[0]['path'])).name}" if registered else None + if not first_target: return - command = f"stat {first_ref}" + command = f"stat {first_target}" stat = execute_json_command(json_executor, command) stat_data = stat.get("data") or {} show_capability( @@ -648,7 +648,7 @@ def run_smoke_commands( verbose=verbose, ) - command = f"cat {first_ref} --structure" + command = f"cat {first_target} --structure" structure_payload = execute_json_command(json_executor, command) structure_data = structure_payload.get("data") or {} structure = structure_data.get("structure") or [] @@ -666,7 +666,7 @@ def run_smoke_commands( ) evidence_range = opening_page_range_for_node(supervision_node) or "1-2" - command = f"cat {first_ref} --page {evidence_range}" + command = f"cat {first_target} --page {evidence_range}" page = execute_json_command(json_executor, command) page_text = str((page.get("data") or {}).get("text") or "") show_capability( diff --git a/pageindex/filesystem/agent.py b/pageindex/filesystem/agent.py index dce9aca..2fbe034 100644 --- a/pageindex/filesystem/agent.py +++ b/pageindex/filesystem/agent.py @@ -36,8 +36,10 @@ commands described in the workspace context. grep -R is lexical evidence search; semantic search commands return candidate documents and do not guarantee literal text matches. Errors are returned as text prefixed with ERROR. Do not call commands that are not listed as available. When evidence is required, inspect it -with cat or grep before answering. Prefer shell-like target-first cat syntax: -cat --structure, cat --page 31-59, and cat --node 0009. +with cat or grep before answering. Prefer shell-like target-first cat syntax +with stable targets: cat --structure, cat --page 31-59, and +cat --node 0009. You may also use file_ref or document_id when a path is +ambiguous. """ AGENT_TOOL_POLICY = """ @@ -51,8 +53,8 @@ Tool policy: - Semantic search commands are candidate-discovery tools and do not guarantee literal text matches. - Tool errors are returned as ERROR text; recover by trying an available command. - Use cat or grep to gather evidence before making source-backed claims. -- Prefer target-first cat syntax: cat --structure, cat --page 31-59, cat --node . -- Do not call cat --page ; if you need a page span, use cat --page -. +- Prefer target-first cat syntax with stable targets: cat --structure, cat --page 31-59, cat --node . +- Do not call cat --page ; if you need a page span, use cat --page -. """ STREAM_MODE_ALIASES = { diff --git a/pageindex/filesystem/commands.py b/pageindex/filesystem/commands.py index 730deee..860e1b7 100644 --- a/pageindex/filesystem/commands.py +++ b/pageindex/filesystem/commands.py @@ -92,8 +92,8 @@ class PIFSCommandExecutor: "- find --where: exact/canonical metadata DSL filtering using stat --schema fields only", "- find -maxdepth N -type f|d: bounded folder traversal for find", "- grep -R: recursive lexical/FTS search only; semantic vector prefilter is disabled", - "- cat --structure/--node/--page: cached PageIndex reads for PDF/Markdown files", - "- cat --all: full text artifact reads for txt/text files", + "- cat --structure/--node/--page: cached PageIndex reads for PDF/Markdown files", + "- cat --all: full text artifact reads for txt/text files", ] if "entity" in semantic_channels: lines.append("- find --name: entity semantic candidate discovery alias") @@ -115,7 +115,7 @@ class PIFSCommandExecutor: ) if not semantic.get("commands"): lines.append("- semantic vector commands: none available in this workspace") - lines.append("- grep , cat, stat: evidence inspection") + lines.append("- grep , cat, stat: evidence inspection") return "\n".join(lines) def execute(self, command: str) -> str: @@ -432,8 +432,9 @@ class PIFSCommandExecutor: target = args[0] if target.startswith("-"): raise PIFSCommandError( - "cat syntax is target-first: cat --structure, " - "cat --page 31-59, or cat --node 0009" + "cat syntax is target-first: cat --structure, " + "cat --page 31-59, or " + "cat --node 0009" ) location = "all" structural_mode: str | None = None @@ -467,8 +468,8 @@ class PIFSCommandExecutor: raise PIFSCommandError(f"Unsupported cat option: {arg}") else: raise PIFSCommandError( - "cat accepts one file target. Use: cat --page , " - "for example: cat ref_1 --page 31-59" + "cat accepts one file target. Use: cat --page , " + "for example: cat /documents/report.pdf --page 31-59" ) i += 1 if structural_mode == "structure": @@ -479,7 +480,7 @@ class PIFSCommandExecutor: if not page_range or not re.fullmatch(r"\d+(?:-\d+)?", page_range): raise PIFSCommandError( "cat --page requires one page selector like 31 or 31-59. " - "Use: cat --page " + "Use: cat --page " ) return self.filesystem.pageindex_pages(target, page_range) return self.filesystem.cat_text_artifact(target, location) @@ -804,7 +805,7 @@ class PIFSCommandExecutor: ) if mode == "matches": return "\n".join( - f"{item['reference_id']}:{item['line']}: " + f"{self._file_target_path(item)}:{item['line']}: " f"{self._compact_text(item['text'], max_chars=220)}" for item in data.get("data", []) ) @@ -835,7 +836,7 @@ class PIFSCommandExecutor: lines.append(f"{name}: {field.get('type', 'string')}") return "\n".join(lines) lines = [ - f"ref: {data.get('target') or data.get('file_ref')}", + f"target: {data.get('target') or data.get('file_ref')}", f"file_ref: {data.get('file_ref')}", f"document_id: {data.get('external_id') or data.get('document_id') or '-'}", f"source_path: {data.get('source_path') or '-'}", @@ -857,23 +858,37 @@ class PIFSCommandExecutor: def _file_row_text(self, item: dict[str, Any]) -> str: file_ref = item.get("file_ref") - ref = item.get("reference_id") or (self.filesystem._reference_for(file_ref) if file_ref else "-") doc_id = item.get("external_id") or item.get("document_id") or "-" title = self._compact_text(item.get("title") or item.get("name") or "", max_chars=80) source_path = item.get("source_path") or "-" folder_paths = item.get("folder_paths") or self._folder_paths_for_file(file_ref) folders = f" folders={','.join(folder_paths)}" if folder_paths else "" - return f"{ref} {doc_id} {title} {source_path}{folders}".strip() + target = self._file_target_path(item) + return f"{target} id={doc_id} file_ref={file_ref or '-'} title={title} source={source_path}{folders}".strip() def _grep_file_hit_text(self, item: dict[str, Any]) -> str: doc_id = item.get("external_id") or "-" - source_path = item.get("source_path") or "-" line = item.get("line") or 1 + target = self._file_target_path(item) return ( - f"{item['reference_id']} {doc_id} {source_path}:{line}: " + f"{target}:{line}: id={doc_id} " f"{self._compact_text(item.get('text') or '', max_chars=180)}" ) + def _file_target_path(self, item: dict[str, Any]) -> str: + file_ref = item.get("file_ref") + title = str(item.get("title") or item.get("name") or "").strip() + folder_paths = item.get("folder_paths") or [] + folder_path = item.get("folder_path") + if not folder_paths and folder_path: + folder_paths = [folder_path] + if not folder_paths: + folder_paths = self._folder_paths_for_file(file_ref) + if folder_paths and title: + folder = str(folder_paths[0] or "/").rstrip("/") + return f"{folder}/{title}" if folder else f"/{title}" + return str(item.get("source_path") or item.get("external_id") or file_ref or "-") + def _semantic_retrieval_query(self, query: str) -> str: query = str(query or "").strip() context = str(self.query_context or "").strip() @@ -1040,11 +1055,10 @@ class PIFSCommandExecutor: continue if direct_only and self._folder_path_for_source_path(file_row["source_path"]) != folder_path: continue - reference_id = self.filesystem._reference_for(file_row["file_ref"]) line_number, text = self._first_matching_source_line(path, query) hits.append( { - "reference_id": reference_id, + "reference_id": file_row["external_id"] or file_row["file_ref"], "file_ref": file_row["file_ref"], "external_id": file_row["external_id"], "title": file_row["title"], @@ -1060,17 +1074,18 @@ class PIFSCommandExecutor: def _grep_file_matches(self, target: str, query: str, *, limit: int) -> list[dict[str, Any]]: file_ref = self.filesystem._resolve_reference(target) - reference_id = self.filesystem._reference_for(file_ref) entry = self.filesystem.store.get_file(file_ref) matches = [] for line_number, line in enumerate(self.filesystem.store.read_text(file_ref).splitlines(), 1): if self._line_matches(line, query): matches.append( { - "reference_id": reference_id, + "reference_id": entry.external_id or file_ref, "file_ref": file_ref, "external_id": entry.external_id, + "title": entry.title, "source_path": entry.source_path, + "folder_paths": self._folder_paths_for_file(file_ref), "line": line_number, "text": self._compact_text(line, max_chars=220), } diff --git a/pageindex/filesystem/core.py b/pageindex/filesystem/core.py index d45e679..7110fa6 100644 --- a/pageindex/filesystem/core.py +++ b/pageindex/filesystem/core.py @@ -102,7 +102,6 @@ class PageIndexFileSystem: self.workspace = Path(workspace).expanduser() self.store = SQLiteFileSystemStore(self.workspace) self.metadata = MetadataQueryEngine(self.store) - self._references: dict[str, str] = {} self.semantic_retrieval_backend = semantic_retrieval_backend self.metadata_generator = metadata_generator self.summary_projection_indexer = summary_projection_indexer @@ -388,7 +387,6 @@ class PageIndexFileSystem: results = [] scope_path = self._scope_folder_path(scope) for row in rows: - reference_id = self._reference_for(row["file_ref"]) folder_paths = [ folder["path"] for folder in self.store.folder_memberships(row["file_ref"]) @@ -396,7 +394,7 @@ class PageIndexFileSystem: folder_path = self._preferred_folder_path(folder_paths, scope_path, row["folder_path"]) results.append( SearchResult( - reference_id=reference_id, + reference_id=row["external_id"] or row["file_ref"], file_ref=row["file_ref"], external_id=row["external_id"], title=row["title"], @@ -693,8 +691,9 @@ class PageIndexFileSystem: raise ValueError( f"{command} is only supported for txt/text files; " f"got source_path={entry.source_path!r}, content_type={entry.content_type!r}. " - "Use cat --structure, cat --page, or cat --node " - "for PDF/Markdown PageIndex files." + "Use cat --structure, " + "cat --page, or " + "cat --node for PDF/Markdown PageIndex files." ) def _require_pageindex_document_file(self, entry: Any, command: str) -> None: @@ -703,7 +702,7 @@ class PageIndexFileSystem: raise ValueError( f"{command} is only supported for PDF/Markdown PageIndex files; " f"got source_path={entry.source_path!r}, content_type={entry.content_type!r}. " - "Use cat --all for txt/text files." + "Use cat --all for txt/text files." ) @classmethod @@ -1254,8 +1253,6 @@ class PageIndexFileSystem: return result def _resolve_reference(self, reference_id: str) -> str: - if reference_id in self._references: - return self._references[reference_id] return self.store.resolve_file_ref(reference_id) def _should_use_semantic_retrieval( @@ -1315,7 +1312,6 @@ class PageIndexFileSystem: continue seen.add(file_ref) entry = self.store.get_file(file_ref) - reference_id = self._reference_for(file_ref) folder_paths = [ folder["path"] for folder in self.store.folder_memberships(file_ref) @@ -1323,7 +1319,7 @@ class PageIndexFileSystem: folder_path = self._preferred_folder_path(folder_paths, scope_path, entry.folder_path) results.append( SearchResult( - reference_id=reference_id, + reference_id=entry.external_id or file_ref, file_ref=file_ref, external_id=entry.external_id, title=entry.title, @@ -1348,14 +1344,6 @@ class PageIndexFileSystem: break return results - def _reference_for(self, file_ref: str) -> str: - for reference_id, existing in self._references.items(): - if existing == file_ref: - return reference_id - reference_id = f"ref_{len(self._references) + 1}" - self._references[reference_id] = file_ref - return reference_id - @staticmethod def _build_descriptor(title: str, metadata: dict[str, Any]) -> str: source = metadata.get("source_type") or metadata.get("repo") or metadata.get("channel") diff --git a/tests/test_pifs_find_maxdepth.py b/tests/test_pifs_find_maxdepth.py index 2635f27..580633e 100644 --- a/tests/test_pifs_find_maxdepth.py +++ b/tests/test_pifs_find_maxdepth.py @@ -75,6 +75,29 @@ def test_find_maxdepth_one_returns_direct_files_only(tmp_path): assert [row["external_id"] for row in rows] == ["doc_root"] +def test_find_output_is_path_first_without_session_refs(tmp_path): + executor = _register_find_fixture(tmp_path) + executor.json_output = False + + output = executor.execute("find /documents -maxdepth 1 -type f") + + assert output.startswith("/documents/Root document id=doc_root file_ref=file_") + assert "ref_1" not in output + assert "title=Root document" in output + + +def test_stable_path_targets_work_without_session_refs(tmp_path): + executor = _register_find_fixture(tmp_path) + executor.json_output = False + + stat = executor.execute("stat '/documents/Root document'") + text = executor.execute("cat '/documents/Root document' --all") + + assert "target: /documents/Root document" in stat + assert "document_id: doc_root" in stat + assert "Root document fixture text" in text + + def test_find_maxdepth_zero_type_directory_returns_start_folder(tmp_path): executor = _register_find_fixture(tmp_path)