fix(filesystem): remove cat node reads

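Return the nested PageIndex structure JSON from `cat --structure` and keep content reads page-based only (`cat --page`). Remove the `cat --node` command surface, related limits, prompts, and structure-text fallback. `cat --structure` is no longer paginated: `--offset` is rejected as an unsupported cat option and `structure_pagination` is dropped from the response.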
2026-06-24 20:28:12 +02:00 · 2026-06-01 00:05:19 +08:00 · 2026-06-01 00:05:19 +08:00 · d0c0c67a39
commit d0c0c67a39
parent e368562e03
6 changed files with 56 additions and 277 deletions
--- a/tests/test_pageindex_structural_read.py
+++ b/tests/test_pageindex_structural_read.py
@ -68,7 +68,6 @@ def test_pageindex_structure_options_report_failed_register_build(monkeypatch):
        executor = PIFSCommandExecutor(filesystem, json_output=True)

        structure = json.loads(executor.execute("cat dsid_structural_missing --structure"))
-        node = json.loads(executor.execute("cat dsid_structural_missing --node 0001"))
        pages = json.loads(executor.execute("cat dsid_structural_missing --page 1-2"))
        stat = json.loads(executor.execute("stat dsid_structural_missing"))

@ -85,10 +84,6 @@ def test_pageindex_structure_options_report_failed_register_build(monkeypatch):
            "message": "index failed: extractor unavailable",
        }

-        assert node["data"]["mode"] == "node"
-        assert node["data"]["available"] is False
-        assert node["data"]["node_id"] == "0001"
-
        assert pages["data"]["mode"] == "page"
        assert pages["data"]["available"] is False
        assert pages["data"]["pages"] == "1-2"
@ -135,6 +130,9 @@ def test_register_pdf_markdown_uses_pageindex_extracted_text_for_metadata_and_ft
                        "nodes": [],
                    }
                ],
+                "pages": [
+                    {"page": 1, "content": "PageIndex Markdown extracted gamma text."}
+                ],
            }
        write_pageindex_client_doc(self.workspace, doc_id, doc)
        self.documents[doc_id] = doc
@ -348,10 +346,10 @@ def test_cat_structure_page_reuses_pageindex_client_cache_without_indexing(monke
        assert structure["data"]["available"] is True
        assert structure["data"]["pageindex_doc_id"] == "doc_cached_pdf"
        assert structure["data"]["structure"][0]["title"] == "Introduction"
-        assert structure["data"]["structure"][1]["title"] == "Findings"
-        assert structure["data"]["structure_pagination"]["limit"] == 25
+        assert structure["data"]["structure"][0]["nodes"][0]["title"] == "Findings"
+        assert "structure_pagination" not in structure["data"]
        assert "text" not in structure["data"]["structure"][0]
-        assert "text" not in structure["data"]["structure"][1]
+        assert "text" not in structure["data"]["structure"][0]["nodes"][0]

        assert pages["data"]["available"] is True
        assert pages["data"]["text"] == "Page one text\n\nPage two text"
@ -364,53 +362,26 @@ def test_cat_structure_page_reuses_pageindex_client_cache_without_indexing(monke
        assert stat["data"]["pageindex_tree_status"] == "built"


-def test_cat_node_reads_pageindex_client_structure_without_custom_pifs_artifact():
+def test_cat_node_is_not_supported():
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.commands import PIFSCommandError

    with tempfile.TemporaryDirectory() as tmp:
-        source = Path(tmp) / "notes.md"
-        source.write_text("# Notes\n\nBody", encoding="utf-8")
        filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
-        write_pageindex_client_doc(
-            filesystem.pageindex_client_workspace,
-            "doc_cached_md",
-            {
-                "id": "doc_cached_md",
-                "type": "md",
-                "path": str(source.resolve()),
-                "doc_name": "notes",
-                "doc_description": "",
-                "line_count": 3,
-                "structure": [
-                    {
-                        "title": "Notes",
-                        "node_id": "0001",
-                        "line_num": 1,
-                        "text": "# Notes\n\nBody",
-                        "nodes": [],
-                    }
-                ],
-            },
-        )
        filesystem.register_file(
-            storage_uri=source.as_uri(),
+            storage_uri="file:///tmp/notes.md",
            source_path="docs/notes.md",
            external_id="dsid_md_cached",
            title="Cached markdown notes",
-            content=source.read_text(encoding="utf-8"),
+            content="# Notes\n\nBody",
        )
        executor = PIFSCommandExecutor(filesystem, json_output=True)

-        node = json.loads(executor.execute("cat dsid_md_cached --node 0001"))
-
-        assert node["data"]["available"] is True
-        assert node["data"]["pageindex_doc_id"] == "doc_cached_md"
-        assert node["data"]["node"]["title"] == "Notes"
-        assert node["data"]["text"] == "# Notes\n\nBody"
-        assert "text" not in node["data"]["node"]
+        with pytest.raises(PIFSCommandError, match="Unsupported cat option: --node"):
+            executor.execute("cat dsid_md_cached --node 0001")


-def test_cat_structure_page_node_and_text_outputs_are_hard_limited():
+def test_cat_structure_page_and_text_outputs_are_hard_limited():
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

@ -463,16 +434,13 @@ def test_cat_structure_page_node_and_text_outputs_are_hard_limited():
        )
        executor = PIFSCommandExecutor(filesystem, json_output=True)

-        first_structure = json.loads(executor.execute("cat dsid_limited_pdf --structure"))
-        assert len(first_structure["data"]["structure"]) == 25
-        assert first_structure["data"]["structure_pagination"]["has_more"] is True
-        assert first_structure["data"]["structure_pagination"]["next_offset"] == 25
-
-        second_structure = json.loads(
+        structure = json.loads(executor.execute("cat dsid_limited_pdf --structure"))
+        assert len(structure["data"]["structure"]) == 30
+        assert structure["data"]["structure"][25]["node_id"] == "0026"
+        assert "text" not in structure["data"]["structure"][0]
+        assert "structure_pagination" not in structure["data"]
+        with pytest.raises(PIFSCommandError, match="Unsupported cat option: --offset"):
            executor.execute("cat dsid_limited_pdf --structure --offset 25")
-        )
-        assert len(second_structure["data"]["structure"]) == 5
-        assert second_structure["data"]["structure"][0]["node_id"] == "0026"

        pages = json.loads(executor.execute("cat dsid_limited_pdf --page 1-5"))
        assert pages["data"]["text"] == (
@ -484,38 +452,8 @@ def test_cat_structure_page_node_and_text_outputs_are_hard_limited():
        with pytest.raises(PIFSCommandError, match="evidence is sufficient"):
            executor.execute("cat dsid_limited_pdf --page 1-6")

-        nodes = json.loads(
-            executor.execute(
-                "cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 "
-                "0006 0007 0008 0009 0010"
-            )
-        )
-        assert nodes["data"]["node_ids"] == [
-            "0001",
-            "0002",
-            "0003",
-            "0004",
-            "0005",
-            "0006",
-            "0007",
-            "0008",
-            "0009",
-            "0010",
-        ]
-        comma_nodes = json.loads(
-            executor.execute("cat dsid_limited_pdf --node 0001,0002")
-        )
-        assert comma_nodes["data"]["node_ids"] == ["0001", "0002"]
-        with pytest.raises(PIFSCommandError, match="at most 10"):
-            executor.execute(
-                "cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 "
-                "0006 0007 0008 0009 0010 0011"
-            )
-        with pytest.raises(PIFSCommandError, match="continue with additional chunks"):
-            executor.execute(
-                "cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 "
-                "0006 0007 0008 0009 0010 0011"
-            )
+        with pytest.raises(PIFSCommandError, match="Unsupported cat option: --node"):
+            executor.execute("cat dsid_limited_pdf --node 0001")

        with pytest.raises(PIFSCommandError, match="quote the whole target"):
            executor.execute("cat dsid_limited_pdf 0001")
@ -672,11 +610,13 @@ def test_pageindex_structure_commands_are_limited_to_pdf_and_markdown():
        for command in (
            "cat dsid_text_only --structure",
            "cat dsid_text_only --page 1",
-            "cat dsid_text_only --node 0001",
        ):
            with pytest.raises(PIFSCommandError, match="only supported for PDF/Markdown"):
                executor.execute(command)

+        with pytest.raises(PIFSCommandError, match="Unsupported cat option: --node"):
+            executor.execute("cat dsid_text_only --node 0001")
+

 def test_existing_pageindex_status_allows_legacy_record_without_format_suffix():
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
--- a/tests/test_pifs_agent_stream.py
+++ b/tests/test_pifs_agent_stream.py
@ -218,13 +218,17 @@ class PIFSAgentStreamTest(unittest.TestCase):

        self.assertEqual(output, '{"answer":"done","document_ids":["dsid_1"]}')

-    def test_prompt_tells_agent_when_to_choose_node_or_page(self):
-        self.assertIn("prefer cat <target> --node <node_id>", AGENT_TOOL_POLICY)
-        self.assertIn("page-level evidence", AGENT_TOOL_POLICY)
-        self.assertIn("prefer\ncat <path> --node <node_id>", BASH_TOOL_DESCRIPTION)
+    def test_prompt_tells_agent_to_use_structure_then_page(self):
+        self.assertIn(
+            "cat <target> --structure returns the cached PageIndex structure JSON",
+            AGENT_TOOL_POLICY,
+        )
+        self.assertIn("exact page text", BASH_TOOL_DESCRIPTION)
+        self.assertIn("cat <path> --structure and cat <path> --page", BASH_TOOL_DESCRIPTION)
        self.assertIn("stop if the evidence is sufficient", AGENT_TOOL_POLICY)
        self.assertIn("continue with another chunk before answering", BASH_TOOL_DESCRIPTION)
-        self.assertIn("Do not reconstruct paths from document titles", BASH_TOOL_DESCRIPTION)
+        self.assertIn("Do not reconstruct paths from", BASH_TOOL_DESCRIPTION)
+        self.assertIn("document titles", BASH_TOOL_DESCRIPTION)
        self.assertIn("file_ref/document_id", AGENT_TOOL_POLICY)

    def test_prompt_requires_stat_for_metadata_questions(self):
@ -244,7 +248,6 @@ class PIFSAgentStreamTest(unittest.TestCase):
        self.assertIn("browse returns file candidates only", AGENT_TOOL_POLICY)
        self.assertIn("verify the relevant facts with cat or grep", AGENT_TOOL_POLICY)
        self.assertIn("cat <target> --structure", AGENT_TOOL_POLICY)
-        self.assertIn("cat <target> --node <node_id>", AGENT_TOOL_POLICY)
        self.assertIn("cat <target> --page", AGENT_TOOL_POLICY)
        self.assertIn("Do not use browse as folder semantic recall", AGENT_TOOL_POLICY)