feat(filesystem): add PageIndex filesystem shell

2026-06-24 20:28:12 +02:00 · 2026-05-26 01:41:57 +08:00 · 2026-05-26 01:41:57 +08:00 · 74d0600261
commit 74d0600261
parent 7592163e2a
24 changed files with 11373 additions and 4 deletions
--- a/tests/test_pageindex_filesystem_scope.py
+++ b/tests/test_pageindex_filesystem_scope.py
@@ -0,0 +1,60 @@
+import json
+from types import SimpleNamespace
+
+
+class SummaryBackend:
+    def __init__(self, document_id):
+        self.document_id = document_id
+        self.calls = []
+
+    def available_channels(self):
+        return ("summary",)
+
+    def search_channel(self, channel, query, *, limit=10, filters=None):
+        self.calls.append((channel, query, filters))
+        return [
+            SimpleNamespace(
+                document_id=self.document_id,
+                snippet=f"summary candidate: {query}",
+            )
+        ]
+
+
+def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+
+    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
+    filesystem.register_file(
+        storage_uri="file:///tmp/report.pdf",
+        source_path="examples/documents/report.pdf",
+        folder_path="/documents",
+        external_id="dsid_report",
+        title="Annual report",
+        metadata={"source_type": "examples-documents"},
+        content="Federal Reserve supervision and regulation annual report.",
+    )
+    backend = SummaryBackend("dsid_report")
+    filesystem.semantic_retrieval_backend = backend
+    executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+    result = json.loads(
+        executor.execute('search-summary "Federal Reserve annual report" /documents')
+    )
+
+    assert backend.calls[0][2] == {}
+    assert result["data"]["data"][0]["external_id"] == "dsid_report"
+
+
+def test_semantic_search_scope_filters_explicit_source_type_facets():
+    from pageindex.filesystem import PageIndexFileSystem
+
+    assert PageIndexFileSystem._semantic_filters_for_scope(
+        {"folder_path": "/source_type=google-drive"}
+    ) == {"source_type": "google_drive"}
+    assert PageIndexFileSystem._semantic_filters_for_scope(
+        {"folder_path": "/semantic/source_type=google-drive"}
+    ) == {"source_type": "google_drive"}
+    assert PageIndexFileSystem._semantic_filters_for_scope(
+        {"folder_path": "/documents"}
+    ) == {}
+
--- a/tests/test_pageindex_structural_read.py
+++ b/tests/test_pageindex_structural_read.py
@@ -0,0 +1,632 @@
+import json
+import tempfile
+from pathlib import Path
+
+import pytest
+
+
+def write_pageindex_client_doc(workspace: Path, doc_id: str, doc: dict) -> None:
+    workspace.mkdir(parents=True, exist_ok=True)
+    (workspace / f"{doc_id}.json").write_text(
+        json.dumps(doc, ensure_ascii=False, indent=2),
+        encoding="utf-8",
+    )
+    meta = {
+        doc_id: {
+            "type": doc.get("type", ""),
+            "doc_name": doc.get("doc_name", ""),
+            "doc_description": doc.get("doc_description", ""),
+            "path": doc.get("path", ""),
+        }
+    }
+    if doc.get("type") == "pdf":
+        meta[doc_id]["page_count"] = doc.get("page_count")
+    elif doc.get("type") == "md":
+        meta[doc_id]["line_count"] = doc.get("line_count")
+    (workspace / "_meta.json").write_text(
+        json.dumps(meta, ensure_ascii=False, indent=2),
+        encoding="utf-8",
+    )
+
+
+class RecordingMetadataGenerator:
+    values = {
+        "summary": "Generated retrieval summary.",
+        "doc_type": "technical_note",
+        "domain": "documentation",
+        "topic": "pageindex extraction",
+    }
+
+    def __init__(self):
+        self.calls = []
+
+    def generate(self, request, *, fields):
+        self.calls.append((request, list(fields)))
+        return {field: self.values[field] for field in fields if field in self.values}
+
+
+def test_pageindex_structure_options_report_failed_register_build(monkeypatch):
+    from pageindex import PageIndexClient
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+
+    with tempfile.TemporaryDirectory() as tmp:
+        source = Path(tmp) / "report.md"
+        source.write_text("# Report\n\nCached structure is not built yet.", encoding="utf-8")
+        filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
+
+        def fail_index(*args, **kwargs):
+            raise RuntimeError("index failed")
+
+        monkeypatch.setattr(PageIndexClient, "index", fail_index)
+        filesystem.register_file(
+            storage_uri=source.as_uri(),
+            source_path="docs/report.md",
+            external_id="dsid_structural_missing",
+            title="Structural report",
+            content=source.read_text(encoding="utf-8"),
+        )
+        executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+        structure = json.loads(executor.execute("cat --structure dsid_structural_missing"))
+        node = json.loads(executor.execute("cat --node 0001 dsid_structural_missing"))
+        pages = json.loads(executor.execute("cat --page 1-2 dsid_structural_missing"))
+        stat = json.loads(executor.execute("stat dsid_structural_missing"))
+
+        assert structure["data"]["mode"] == "structure"
+        assert structure["data"]["available"] is False
+        assert structure["data"]["status"] == "failed"
+        assert "PageIndexClient workspace" in structure["data"]["message"]
+        assert stat["data"]["pageindex_tree_status"] == "failed"
+
+        assert node["data"]["mode"] == "node"
+        assert node["data"]["available"] is False
+        assert node["data"]["node_id"] == "0001"
+
+        assert pages["data"]["mode"] == "page"
+        assert pages["data"]["available"] is False
+        assert pages["data"]["pages"] == "1-2"
+
+        assert "cp" not in executor.allowed_commands()
+        assert "mkdir" not in executor.allowed_commands()
+
+
+def test_register_pdf_markdown_uses_pageindex_extracted_text_for_metadata_and_fts(monkeypatch):
+    from pageindex import PageIndexClient
+    from pageindex.filesystem import PageIndexFileSystem
+
+    def fake_index(self, file_path, mode="auto"):
+        suffix = Path(file_path).suffix.lower()
+        doc_id = f"doc_{suffix.lstrip('.')}"
+        if suffix == ".pdf":
+            doc = {
+                "id": doc_id,
+                "type": "pdf",
+                "path": str(Path(file_path).resolve()),
+                "doc_name": "report.pdf",
+                "doc_description": "",
+                "page_count": 2,
+                "structure": [{"title": "Report", "node_id": "0001", "nodes": []}],
+                "pages": [
+                    {"page": 1, "content": "PageIndex PDF extracted alpha text."},
+                    {"page": 2, "content": "Second PageIndex PDF extracted beta text."},
+                ],
+            }
+        else:
+            doc = {
+                "id": doc_id,
+                "type": "md",
+                "path": str(Path(file_path).resolve()),
+                "doc_name": "notes",
+                "doc_description": "",
+                "line_count": 3,
+                "structure": [
+                    {
+                        "title": "Notes",
+                        "node_id": "0001",
+                        "line_num": 1,
+                        "text": "# Notes\n\nPageIndex Markdown extracted gamma text.",
+                        "nodes": [],
+                    }
+                ],
+            }
+        write_pageindex_client_doc(self.workspace, doc_id, doc)
+        self.documents[doc_id] = doc
+        return doc_id
+
+    monkeypatch.setattr(PageIndexClient, "index", fake_index)
+    with tempfile.TemporaryDirectory() as tmp:
+        source_pdf = Path(tmp) / "report.pdf"
+        source_md = Path(tmp) / "notes.md"
+        source_pdf.write_bytes(b"%PDF-1.4\n% test fixture\n")
+        source_md.write_text("# Notes\n\nCaller markdown content", encoding="utf-8")
+        generator = RecordingMetadataGenerator()
+        filesystem = PageIndexFileSystem(
+            workspace=Path(tmp) / "workspace",
+            metadata_generator=generator,
+        )
+
+        filesystem.register_file(
+            storage_uri=source_pdf.as_uri(),
+            source_path="docs/report.pdf",
+            external_id="dsid_pdf_extracted",
+            title="PDF extracted",
+            content="CALLER PDF CONTENT MUST NOT REACH GENERATOR",
+        )
+        filesystem.register_file(
+            storage_uri=source_md.as_uri(),
+            source_path="docs/notes.md",
+            external_id="dsid_md_extracted",
+            title="Markdown extracted",
+            content="CALLER MD CONTENT MUST NOT REACH GENERATOR",
+        )
+
+        pdf_request = generator.calls[0][0]
+        md_request = generator.calls[1][0]
+        pdf_stat = filesystem.store.file_info("dsid_pdf_extracted")
+        md_stat = filesystem.store.file_info("dsid_md_extracted")
+
+        assert "PageIndex PDF extracted alpha text" in pdf_request.text
+        assert "Second PageIndex PDF extracted beta text" in pdf_request.text
+        assert "CALLER PDF CONTENT" not in pdf_request.text
+        assert "PageIndex Markdown extracted gamma text" in md_request.text
+        assert "CALLER MD CONTENT" not in md_request.text
+        assert "PageIndex PDF extracted alpha text" in Path(
+            pdf_stat["text_artifact_path"]
+        ).read_text(encoding="utf-8")
+        assert "PageIndex Markdown extracted gamma text" in Path(
+            md_stat["text_artifact_path"]
+        ).read_text(encoding="utf-8")
+        assert [r.external_id for r in filesystem.search("alpha beta", limit=5)] == [
+            "dsid_pdf_extracted"
+        ]
+        assert [r.external_id for r in filesystem.search("gamma", limit=5)] == [
+            "dsid_md_extracted"
+        ]
+        assert filesystem.search("CALLER", limit=5) == []
+
+
+def test_register_text_metadata_generation_keeps_caller_content_without_pageindex(monkeypatch):
+    from pageindex import PageIndexClient
+    from pageindex.filesystem import PageIndexFileSystem
+
+    def fail_index(*args, **kwargs):
+        raise AssertionError("PageIndexClient.index should not be called for text files")
+
+    monkeypatch.setattr(PageIndexClient, "index", fail_index)
+    with tempfile.TemporaryDirectory() as tmp:
+        generator = RecordingMetadataGenerator()
+        filesystem = PageIndexFileSystem(
+            workspace=Path(tmp) / "workspace",
+            metadata_generator=generator,
+        )
+
+        filesystem.register_file(
+            storage_uri="file:///tmp/readme.txt",
+            source_path="docs/readme.txt",
+            external_id="dsid_text_generation",
+            title="Text generation",
+            content="Plain text caller content stays authoritative.",
+            content_type="text/plain",
+        )
+
+        stat = filesystem.store.file_info("dsid_text_generation")
+
+        assert generator.calls[0][0].text == "Plain text caller content stays authoritative."
+        assert stat["pageindex_doc_id"] is None
+        assert stat["pageindex_tree_status"] == "not_built"
+        assert Path(stat["text_artifact_path"]).read_text(
+            encoding="utf-8"
+        ) == "Plain text caller content stays authoritative."
+
+
+def test_register_pdf_markdown_cache_miss_invokes_pageindex_client_index(monkeypatch):
+    from pageindex import PageIndexClient
+    from pageindex.filesystem import PageIndexFileSystem
+
+    calls: list[str] = []
+
+    def fake_index(self, file_path, mode="auto"):
+        calls.append(str(file_path))
+        doc_id = f"doc_{Path(file_path).suffix.lstrip('.')}"
+        doc_type = "pdf" if Path(file_path).suffix == ".pdf" else "md"
+        doc = {
+            "id": doc_id,
+            "type": doc_type,
+            "path": str(Path(file_path).resolve()),
+            "doc_name": Path(file_path).name,
+            "doc_description": "",
+            "structure": [{"title": Path(file_path).stem, "node_id": "0001", "nodes": []}],
+        }
+        if doc_type == "pdf":
+            doc["page_count"] = 1
+            doc["pages"] = [{"page": 1, "content": "Page one text"}]
+        else:
+            doc["line_count"] = 1
+        write_pageindex_client_doc(self.workspace, doc_id, doc)
+        self.documents[doc_id] = doc
+        return doc_id
+
+    monkeypatch.setattr(PageIndexClient, "index", fake_index)
+    with tempfile.TemporaryDirectory() as tmp:
+        source_pdf = Path(tmp) / "report.pdf"
+        source_md = Path(tmp) / "notes.md"
+        source_pdf.write_bytes(b"%PDF-1.4\n% test fixture\n")
+        source_md.write_text("# Notes", encoding="utf-8")
+        filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
+
+        filesystem.register_file(
+            storage_uri=str(source_pdf),
+            source_path="docs/report.pdf",
+            external_id="dsid_pdf_build",
+            title="PDF build",
+            content="pdf text",
+        )
+        filesystem.register_file(
+            storage_uri=source_md.as_uri(),
+            source_path="docs/notes.md",
+            external_id="dsid_md_build",
+            title="Markdown build",
+            content=source_md.read_text(encoding="utf-8"),
+        )
+
+        pdf_stat = filesystem.store.file_info("dsid_pdf_build")
+        md_stat = filesystem.store.file_info("dsid_md_build")
+
+        assert calls == [str(source_pdf.resolve()), str(source_md.resolve())]
+        assert pdf_stat["pageindex_doc_id"] == "doc_pdf"
+        assert pdf_stat["pageindex_tree_status"] == "built"
+        assert md_stat["pageindex_doc_id"] == "doc_md"
+        assert md_stat["pageindex_tree_status"] == "built"
+
+
+def test_cat_structure_page_reuses_pageindex_client_cache_without_indexing(monkeypatch):
+    from pageindex import PageIndexClient
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+
+    with tempfile.TemporaryDirectory() as tmp:
+        source = Path(tmp) / "report.pdf"
+        source.write_bytes(b"%PDF-1.4\n% test fixture\n")
+        workspace = Path(tmp) / "workspace"
+        filesystem = PageIndexFileSystem(workspace=workspace)
+        write_pageindex_client_doc(
+            filesystem.pageindex_client_workspace,
+            "doc_cached_pdf",
+            {
+                "id": "doc_cached_pdf",
+                "type": "pdf",
+                "path": str(source.resolve()),
+                "doc_name": "report.pdf",
+                "doc_description": "",
+                "page_count": 2,
+                "structure": [
+                    {
+                        "title": "Introduction",
+                        "node_id": "0001",
+                        "text": "Intro section text",
+                        "nodes": [
+                            {
+                                "title": "Findings",
+                                "node_id": "0002",
+                                "physical_index": 2,
+                                "nodes": [],
+                            }
+                        ],
+                    }
+                ],
+                "pages": [
+                    {"page": 1, "content": "Page one text"},
+                    {"page": 2, "content": "Page two text"},
+                ],
+            },
+        )
+
+        def fail_index(*args, **kwargs):
+            raise AssertionError("PageIndexClient.index should not be called on cache hit")
+
+        monkeypatch.setattr(PageIndexClient, "index", fail_index)
+        filesystem.register_file(
+            storage_uri=source.as_uri(),
+            source_path="docs/report.pdf",
+            external_id="dsid_structural_cached",
+            title="Cached structural report",
+            content="text artifact remains available for grep, not cat --all",
+        )
+        executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+        structure = json.loads(executor.execute("cat --structure dsid_structural_cached"))
+        pages = json.loads(executor.execute("cat --page 1-2 dsid_structural_cached"))
+        stat = json.loads(executor.execute("stat dsid_structural_cached"))
+
+        assert structure["data"]["available"] is True
+        assert structure["data"]["pageindex_doc_id"] == "doc_cached_pdf"
+        assert structure["data"]["structure"][0]["title"] == "Introduction"
+        assert "text" not in structure["data"]["structure"][0]
+        assert "text" not in structure["data"]["structure"][0]["nodes"][0]
+
+        assert pages["data"]["available"] is True
+        assert pages["data"]["text"] == "Page one text\n\nPage two text"
+
+        assert stat["data"]["pageindex_doc_id"] == "doc_cached_pdf"
+        assert stat["data"]["pageindex_tree_status"] == "built"
+
+
+def test_cat_node_reads_pageindex_client_structure_without_custom_pifs_artifact():
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+
+    with tempfile.TemporaryDirectory() as tmp:
+        source = Path(tmp) / "notes.md"
+        source.write_text("# Notes\n\nBody", encoding="utf-8")
+        filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
+        write_pageindex_client_doc(
+            filesystem.pageindex_client_workspace,
+            "doc_cached_md",
+            {
+                "id": "doc_cached_md",
+                "type": "md",
+                "path": str(source.resolve()),
+                "doc_name": "notes",
+                "doc_description": "",
+                "line_count": 3,
+                "structure": [
+                    {
+                        "title": "Notes",
+                        "node_id": "0001",
+                        "line_num": 1,
+                        "text": "# Notes\n\nBody",
+                        "nodes": [],
+                    }
+                ],
+            },
+        )
+        filesystem.register_file(
+            storage_uri=source.as_uri(),
+            source_path="docs/notes.md",
+            external_id="dsid_md_cached",
+            title="Cached markdown notes",
+            content=source.read_text(encoding="utf-8"),
+        )
+        executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+        node = json.loads(executor.execute("cat --node 0001 dsid_md_cached"))
+
+        assert node["data"]["available"] is True
+        assert node["data"]["pageindex_doc_id"] == "doc_cached_md"
+        assert node["data"]["node"]["title"] == "Notes"
+        assert node["data"]["text"] == "# Notes\n\nBody"
+        assert "text" not in node["data"]["node"]
+
+
+def test_tree_folder_behavior_is_preserved():
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+
+    with tempfile.TemporaryDirectory() as tmp:
+        filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
+        filesystem.register_file(
+            storage_uri="file:///tmp/report.txt",
+            source_path="docs/report.txt",
+            folder_path="/docs/reports",
+            external_id="dsid_folder_tree",
+            title="Folder report",
+            content="folder tree behavior remains intact",
+        )
+        executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+        folder_tree = json.loads(executor.execute("tree /docs --depth 2"))
+
+        assert folder_tree["data"]["path"] == "/docs"
+        assert folder_tree["data"]["folders"][0]["path"] == "/docs/reports"
+
+
+def test_tree_does_not_read_file_internal_pageindex_structure():
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.commands import PIFSCommandError
+
+    with tempfile.TemporaryDirectory() as tmp:
+        source = Path(tmp) / "report.pdf"
+        source.write_bytes(b"%PDF-1.4\n% test fixture\n")
+        filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
+        write_pageindex_client_doc(
+            filesystem.pageindex_client_workspace,
+            "doc_tree_is_folder_only",
+            {
+                "id": "doc_tree_is_folder_only",
+                "type": "pdf",
+                "path": str(source.resolve()),
+                "doc_name": "report.pdf",
+                "doc_description": "",
+                "page_count": 1,
+                "structure": [
+                    {"title": "Introduction", "node_id": "0001", "nodes": []}
+                ],
+                "pages": [{"page": 1, "content": "Page one text"}],
+            },
+        )
+        filesystem.register_file(
+            storage_uri=source.as_uri(),
+            source_path="docs/report.pdf",
+            external_id="dsid_tree_is_folder_only",
+            title="Cached structural report",
+            content="text artifact remains available",
+        )
+        executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+        with pytest.raises(PIFSCommandError):
+            executor.execute("tree dsid_tree_is_folder_only")
+
+        structure = json.loads(executor.execute("cat --structure dsid_tree_is_folder_only"))
+        assert structure["data"]["structure"][0]["title"] == "Introduction"
+
+
+def test_cat_all_is_limited_to_text_files():
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.commands import PIFSCommandError
+
+    with tempfile.TemporaryDirectory() as tmp:
+        filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
+        filesystem.register_file(
+            storage_uri="file:///tmp/readme.txt",
+            source_path="docs/readme.txt",
+            external_id="dsid_text_file",
+            title="Text readme",
+            content="plain text body",
+        )
+        filesystem.register_file(
+            storage_uri="file:///tmp/report.pdf",
+            source_path="docs/report.pdf",
+            external_id="dsid_pdf_file",
+            title="PDF report",
+            content="extracted text should not be served through cat --all",
+        )
+        filesystem.register_file(
+            storage_uri="file:///tmp/notes.md",
+            source_path="docs/notes.md",
+            external_id="dsid_md_file",
+            title="Markdown notes",
+            content="markdown text should use PageIndex structure reads",
+        )
+        filesystem.register_file(
+            storage_uri="file:///tmp/data.json",
+            source_path="docs/data.json",
+            external_id="dsid_json_file",
+            title="JSON record",
+            content='{"body":"json"}',
+            content_type="application/json",
+        )
+        executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+        text = json.loads(executor.execute("cat --all dsid_text_file"))
+        assert text["data"]["text"] == "plain text body"
+
+        with pytest.raises(PIFSCommandError, match="only supported for txt/text files"):
+            executor.execute("cat --all dsid_pdf_file")
+        with pytest.raises(ValueError, match="not supported for PDF/Markdown"):
+            filesystem.open("dsid_pdf_file")
+        with pytest.raises(PIFSCommandError, match="only supported for txt/text files"):
+            executor.execute("cat --all dsid_md_file")
+        with pytest.raises(ValueError, match="not supported for PDF/Markdown"):
+            filesystem.open("dsid_md_file")
+        with pytest.raises(PIFSCommandError, match="only supported for txt/text files"):
+            executor.execute("cat --all dsid_json_file")
+        assert filesystem.open("dsid_json_file").text == '{"body":"json"}'
+        for command in (
+            "head dsid_pdf_file",
+            "tail dsid_pdf_file",
+            "sed -n 1,1p dsid_pdf_file",
+            "head dsid_md_file",
+            "tail dsid_md_file",
+            "sed -n 1,1p dsid_md_file",
+        ):
+            with pytest.raises(PIFSCommandError, match="only supported for txt/text files"):
+                executor.execute(command)
+
+
+def test_pageindex_structure_commands_are_limited_to_pdf_and_markdown():
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.commands import PIFSCommandError
+
+    with tempfile.TemporaryDirectory() as tmp:
+        filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
+        filesystem.register_file(
+            storage_uri="file:///tmp/readme.txt",
+            source_path="docs/readme.txt",
+            external_id="dsid_text_only",
+            title="Text readme",
+            content="plain text body",
+        )
+        executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+        for command in (
+            "cat --structure dsid_text_only",
+            "cat --page 1 dsid_text_only",
+            "cat --node 0001 dsid_text_only",
+        ):
+            with pytest.raises(PIFSCommandError, match="only supported for PDF/Markdown"):
+                executor.execute(command)
+
+
+def test_existing_pageindex_status_allows_legacy_record_without_format_suffix():
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.commands import PIFSCommandError
+
+    with tempfile.TemporaryDirectory() as tmp:
+        source = Path(tmp) / "uploaded"
+        source.write_text("# Uploaded\n\nBody", encoding="utf-8")
+        filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
+        file_ref = filesystem.register_file(
+            storage_uri=source.as_uri(),
+            source_path="uploads/uploaded",
+            external_id="dsid_legacy_pageindex",
+            title="Legacy PageIndex record",
+            content="text/plain is only a weak default here",
+        )
+        write_pageindex_client_doc(
+            filesystem.pageindex_client_workspace,
+            "doc_legacy_pageindex",
+            {
+                "id": "doc_legacy_pageindex",
+                "type": "md",
+                "path": str(source.resolve()),
+                "doc_name": "uploaded",
+                "doc_description": "",
+                "line_count": 3,
+                "structure": [
+                    {"title": "Uploaded", "node_id": "0001", "text": "Body", "nodes": []}
+                ],
+            },
+        )
+        filesystem.store.update_pageindex_pointer(
+            file_ref,
+            pageindex_doc_id="doc_legacy_pageindex",
+            pageindex_tree_status="built",
+        )
+        executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+        structure = json.loads(executor.execute("cat --structure dsid_legacy_pageindex"))
+        assert structure["data"]["structure"][0]["title"] == "Uploaded"
+        with pytest.raises(PIFSCommandError, match="only supported for txt/text files"):
+            executor.execute("cat --all dsid_legacy_pageindex")
+
+
+def test_read_commands_do_not_link_pageindex_cache_when_pointer_is_missing(monkeypatch):
+    from pageindex import PageIndexClient
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+
+    with tempfile.TemporaryDirectory() as tmp:
+        source = Path(tmp) / "late.md"
+        source.write_text("# Late\n\nBody", encoding="utf-8")
+        filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
+
+        def fail_index(*args, **kwargs):
+            raise RuntimeError("index failed")
+
+        monkeypatch.setattr(PageIndexClient, "index", fail_index)
+        filesystem.register_file(
+            storage_uri=source.as_uri(),
+            source_path="docs/late.md",
+            external_id="dsid_late_cache",
+            title="Late cache",
+            content=source.read_text(encoding="utf-8"),
+        )
+        write_pageindex_client_doc(
+            filesystem.pageindex_client_workspace,
+            "doc_late_cache",
+            {
+                "id": "doc_late_cache",
+                "type": "md",
+                "path": str(source.resolve()),
+                "doc_name": "late",
+                "doc_description": "",
+                "line_count": 3,
+                "structure": [
+                    {"title": "Late", "node_id": "0001", "text": "Body", "nodes": []}
+                ],
+            },
+        )
+        executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+        structure = json.loads(executor.execute("cat --structure dsid_late_cache"))
+        stat = json.loads(executor.execute("stat dsid_late_cache"))
+
+        assert structure["data"]["available"] is False
+        assert stat["data"]["pageindex_doc_id"] is None
+        assert stat["data"]["pageindex_tree_status"] == "failed"
--- a/tests/test_pifs_agent_stream.py
+++ b/tests/test_pifs_agent_stream.py
@ -0,0 +1,185 @@
+import io
+import os
+import unittest
+from types import SimpleNamespace
+
+from pydantic import BaseModel, ConfigDict
+
+from pageindex.filesystem.agent import (
+    PIFSAgentStreamObserver,
+    build_agent_model_settings,
+    normalize_agent_stream_mode,
+    normalize_reasoning_effort,
+    normalize_reasoning_summary,
+    pifs_agent_raw_reasoning_enabled,
+    serialize_agent_final_output,
+    should_disable_pifs_agent_tracing,
+    should_use_openai_compatible_chat_model,
+)
+
+
+class StructuredAnswer(BaseModel):  # structured-output fixture for the serialization test
+    model_config = ConfigDict(extra="forbid")  # unknown fields are rejected, not ignored
+
+    answer: str  # declared first; the expected JSON in the test lists "answer" first
+    document_ids: list[str]
+
+
+class PIFSAgentStreamTest(unittest.TestCase):
+    def raw_event(self, event_type, delta):
+        return SimpleNamespace(
+            type="raw_response_event",
+            data=SimpleNamespace(type=event_type, delta=delta),
+        )
+
+    def test_model_stream_prints_output_and_think_deltas(self):
+        output = io.StringIO()
+        stream_log = []
+        observer = PIFSAgentStreamObserver("model", stream_log=stream_log, output=output)
+
+        observer.handle_event(self.raw_event("response.reasoning_summary_text.delta", "look up folder"))
+        observer.handle_event(self.raw_event("response.output_text.delta", '{"answer":'))
+        observer.handle_event(self.raw_event("response.output_text.delta", '"done"}'))
+        observer.finish()
+
+        printed = output.getvalue()
+        self.assertIn("[llm reasoning summary stream]", printed)
+        self.assertIn("look up folder", printed)
+        self.assertIn("[llm final output stream]", printed)
+        self.assertIn('{"answer":"done"}', printed.replace("\n", ""))
+        self.assertEqual(
+            stream_log,
+            [
+                {"kind": "output", "text": '{"answer":"done"}'},
+                {"kind": "think_summary", "text": "look up folder"},
+            ],
+        )
+
+    def test_tools_mode_does_not_print_model_text(self):
+        output = io.StringIO()
+        stream_log = []
+        observer = PIFSAgentStreamObserver("tools", stream_log=stream_log, output=output)
+
+        observer.handle_event(self.raw_event("response.output_text.delta", "hidden from tools mode"))
+        observer.handle_event(self.raw_event("response.function_call_arguments.delta", '{"command":"ls /"}'))
+        observer.emit_tool_call("ls /")
+        observer.emit_tool_result(ok=True, output='{"ok": true}', seconds=0.001)
+        observer.finish()
+
+        printed = output.getvalue()
+        self.assertNotIn("hidden from tools mode", printed)
+        self.assertIn("[llm -> pifs command]", printed)
+        self.assertIn("ls /", printed)
+        self.assertIn("[pifs -> llm result preview]", printed)
+        self.assertIn('{"ok": true}', printed)
+        self.assertEqual(stream_log[0], {"kind": "tool_call", "command": "ls /"})
+        self.assertEqual(stream_log[1]["kind"], "tool_result")
+        self.assertEqual(stream_log[2], {"kind": "tool_args", "text": '{"command":"ls /"}'})
+
+    def test_tool_result_preview_compacts_large_outputs(self):
+        output = io.StringIO()
+        observer = PIFSAgentStreamObserver("tools", output=output)
+
+        observer.emit_tool_result(
+            ok=True,
+            output="\n".join(f"line {index}" for index in range(50)),
+            seconds=0.001,
+        )
+
+        printed = output.getvalue()
+        self.assertIn("[large PIFS result", printed)
+        self.assertIn("line 0", printed)
+        self.assertIn("more lines omitted from preview", printed)
+        self.assertNotIn("line 49", printed)
+
+    def test_raw_reasoning_is_not_logged_by_default_but_summary_is(self):
+        output = io.StringIO()
+        stream_log = []
+        previous = os.environ.pop("PAGEINDEX_PIFS_AGENT_RAW_REASONING", None)
+        try:
+            observer = PIFSAgentStreamObserver("model", stream_log=stream_log, output=output)
+            observer.handle_event(self.raw_event("response.reasoning_text.delta", "private chain"))
+            observer.handle_event(
+                self.raw_event("response.reasoning_summary_text.delta", "visible summary")
+            )
+            observer.finish()
+        finally:
+            if previous is not None:
+                os.environ["PAGEINDEX_PIFS_AGENT_RAW_REASONING"] = previous
+
+        printed = output.getvalue()
+        self.assertNotIn("private chain", printed)
+        self.assertIn("visible summary", printed)
+        self.assertEqual(stream_log, [{"kind": "think_summary", "text": "visible summary"}])
+
+    def test_raw_reasoning_requires_debug_env_flag(self):
+        self.assertFalse(pifs_agent_raw_reasoning_enabled({}))
+        self.assertTrue(
+            pifs_agent_raw_reasoning_enabled({"PAGEINDEX_PIFS_AGENT_RAW_REASONING": "on"})
+        )
+        self.assertTrue(
+            pifs_agent_raw_reasoning_enabled({"PAGEINDEX_PIFS_AGENT_RAW_REASONING": "TRUE"})
+        )
+        self.assertFalse(
+            pifs_agent_raw_reasoning_enabled({"PAGEINDEX_PIFS_AGENT_RAW_REASONING": "0"})
+        )
+
+    def test_stream_mode_aliases(self):
+        self.assertEqual(normalize_agent_stream_mode("think"), "model")
+        self.assertEqual(normalize_agent_stream_mode("debug"), "all")
+        self.assertEqual(normalize_agent_stream_mode(""), "off")
+        with self.assertRaises(ValueError):
+            normalize_agent_stream_mode("nope")
+
+    def test_reasoning_settings_enable_effort_and_summary(self):
+        settings = build_agent_model_settings(
+            reasoning_effort="medium",
+            reasoning_summary="detailed",
+        )
+
+        self.assertIsNotNone(settings)
+        self.assertEqual(settings.reasoning.effort, "medium")
+        self.assertEqual(settings.reasoning.summary, "detailed")
+        self.assertEqual(settings.verbosity, "low")
+
+    def test_reasoning_effort_defaults_to_visible_summary(self):
+        settings = build_agent_model_settings(reasoning_effort="low")
+
+        self.assertIsNotNone(settings)
+        self.assertEqual(settings.reasoning.effort, "low")
+        self.assertEqual(settings.reasoning.summary, "auto")
+
+    def test_reasoning_and_base_url_normalization(self):
+        self.assertEqual(normalize_reasoning_effort("xhigh"), "xhigh")
+        self.assertIsNone(normalize_reasoning_summary("none"))
+        self.assertFalse(should_use_openai_compatible_chat_model(None))
+        self.assertFalse(should_use_openai_compatible_chat_model("https://api.openai.com/v1/"))
+        self.assertTrue(should_use_openai_compatible_chat_model("https://example.test/v1"))
+        with self.assertRaises(ValueError):
+            normalize_reasoning_effort("maximum")
+
+    def test_tracing_is_disabled_by_default_unless_env_enables_it(self):
+        self.assertTrue(should_disable_pifs_agent_tracing({}))
+        self.assertFalse(
+            should_disable_pifs_agent_tracing({"PAGEINDEX_PIFS_AGENT_TRACING": "1"})
+        )
+        self.assertFalse(
+            should_disable_pifs_agent_tracing({"PAGEINDEX_PIFS_AGENT_TRACING": "true"})
+        )
+        self.assertFalse(
+            should_disable_pifs_agent_tracing({"PAGEINDEX_PIFS_AGENT_TRACING": "on"})
+        )
+        self.assertTrue(
+            should_disable_pifs_agent_tracing({"PAGEINDEX_PIFS_AGENT_TRACING": "0"})
+        )
+
+    def test_structured_agent_output_serializes_to_json(self):
+        output = serialize_agent_final_output(
+            StructuredAnswer(answer="done", document_ids=["dsid_1"])
+        )
+
+        self.assertEqual(output, '{"answer":"done","document_ids":["dsid_1"]}')
+
+
+if __name__ == "__main__":  # only when this file is executed directly as a script
+    unittest.main()  # hand control to unittest's command-line test runner
--- a/tests/test_semantic_index.py
+++ b/tests/test_semantic_index.py
@ -0,0 +1,53 @@
+import sys
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parents[1]  # directory one level above this test file's folder
+if str(REPO_ROOT) not in sys.path:  # avoid inserting a duplicate entry
+    sys.path.insert(0, str(REPO_ROOT))  # runs before the pageindex import below; presumably lets it resolve from a checkout (confirm)
+
+from pageindex.filesystem.semantic_index import (
+    SemanticIndexRecord,
+    SQLiteVecSemanticIndex,
+)
+
+
+def test_sqlite_vec_semantic_index_round_trip(tmp_path):
+    index = SQLiteVecSemanticIndex(tmp_path / "semantic.sqlite")
+    index.reset(dimension=3, metadata={"field_mode": "summary"})
+
+    index.upsert_many(
+        [
+            SemanticIndexRecord(
+                file_ref="file_a",
+                external_id="doc_a",
+                source_type="github",
+                source_path="github/a.json",
+                title="Multipart upload limits",
+                text="multipart upload limits",
+                vector=[1.0, 0.0, 0.0],
+                metadata={"topic": "uploads"},
+            ),
+            SemanticIndexRecord(
+                file_ref="file_b",
+                external_id="doc_b",
+                source_type="slack",
+                source_path="slack/b.json",
+                title="GPU cache issue",
+                text="gpu cache issue",
+                vector=[0.0, 1.0, 0.0],
+                metadata={"topic": "runtime"},
+            ),
+        ]
+    )
+
+    assert index.info()["document_count"] == 2
+
+    results = index.search([0.9, 0.1, 0.0], limit=2)
+    assert [item.external_id for item in results] == ["doc_a", "doc_b"]
+
+    filtered = index.search(
+        [0.9, 0.1, 0.0],
+        limit=2,
+        filters={"source_type": "slack"},
+    )
+    assert [item.external_id for item in filtered] == ["doc_b"]