# PageIndex/tests/test_pageindex_filesystem_scope.py

import json
from types import SimpleNamespace

import pytest


def test_filesystem_lazy_exports_remain_public():
    """Check that the semantic index names stay part of the public package API.

    Each name must be importable from ``pageindex.filesystem``, be listed in the
    package's ``__all__`` and ``dir()``, and resolve to an object whose
    ``__name__`` equals the exported name.
    """
    import pageindex.filesystem as filesystem
    from pageindex.filesystem import (
        SemanticProjectionSearchBackend,
        RebuildableSemanticIndex,
        SemanticIndexRecord,
        SemanticSearchResult,
        SQLiteVecSemanticIndex,
        SummaryProjectionIndexer,
    )

    exported = {
        "SemanticProjectionSearchBackend": SemanticProjectionSearchBackend,
        "RebuildableSemanticIndex": RebuildableSemanticIndex,
        "SemanticIndexRecord": SemanticIndexRecord,
        "SemanticSearchResult": SemanticSearchResult,
        "SQLiteVecSemanticIndex": SQLiteVecSemanticIndex,
        "SummaryProjectionIndexer": SummaryProjectionIndexer,
    }

    # First pass: the names are advertised by the package.
    for public_name in exported:
        assert public_name in filesystem.__all__
        assert public_name in dir(filesystem)

    # Second pass: the imported objects carry the advertised names.
    for public_name, exported_object in exported.items():
        assert exported_object.__name__ == public_name


class SummaryBackend:
    """Stub retrieval backend that only advertises the ``summary`` channel.

    Every search is recorded in ``calls`` as ``(channel, query, filters)`` and
    answered with a single candidate for the configured document id.
    """

    def __init__(self, document_id):
        self.calls = []
        self.document_id = document_id

    def available_channels(self):
        """Return the fixed channel tuple ``("summary",)``."""
        return ("summary",)

    def search_channel(self, channel, query, *, limit=10, filters=None):
        """Record the call and return one candidate; ``limit`` is accepted but unused."""
        self.calls.append((channel, query, filters))
        candidate = SimpleNamespace(
            document_id=self.document_id,
            snippet=f"summary candidate: {query}",
        )
        return [candidate]


class ChannelBackend:
    """Stub retrieval backend with a configurable set of channels.

    Every search is recorded in ``calls`` as ``(channel, query, limit, filters)``
    and answered with a single candidate whose snippet names the channel.
    """

    def __init__(self, document_id, channels=("summary", "entity", "relation")):
        self.calls = []
        self.channels = channels
        self.document_id = document_id

    def available_channels(self):
        """Return the channels supplied at construction time."""
        return self.channels

    def search_channel(self, channel, query, *, limit=10, filters=None):
        """Record the call and return one candidate for the configured document."""
        self.calls.append((channel, query, limit, filters))
        candidate = SimpleNamespace(
            document_id=self.document_id,
            snippet=f"{channel} candidate: {query}",
        )
        return [candidate]


class BrowseBackend:
    """Stub retrieval backend that returns ranked candidates for several documents.

    Candidates follow the order of ``document_ids``. When a ``file_ref`` /
    ``file_refs`` filter is supplied and a document-id-to-file-ref mapping was
    configured, only documents whose mapped file ref is in the filter are
    returned. Calls are recorded as ``(channel, query, limit, filters)``.
    """

    def __init__(self, document_ids, channels=("summary",), file_refs_by_document_id=None):
        self.calls = []
        self.channels = channels
        self.document_ids = list(document_ids)
        self.file_refs_by_document_id = dict(file_refs_by_document_id or {})

    def available_channels(self):
        """Return the channels supplied at construction time."""
        return self.channels

    @staticmethod
    def _requested_file_refs(filters):
        """Return the set of file refs named by ``filters`` (empty when none)."""
        if not isinstance(filters, dict):
            return set()
        raw_file_refs = filters.get("file_ref") or filters.get("file_refs") or []
        if isinstance(raw_file_refs, str):
            # A lone string is one ref, not an iterable of characters.
            return {raw_file_refs}
        return {str(item) for item in raw_file_refs}

    def search_channel(self, channel, query, *, limit=10, filters=None):
        """Record the call and return up to ``limit`` ranked candidates."""
        self.calls.append((channel, query, limit, filters))
        wanted_refs = self._requested_file_refs(filters)
        known_refs = self.file_refs_by_document_id

        # Scoping only applies when both a filter and a mapping are present.
        if wanted_refs and known_refs:
            in_scope = [
                candidate_id
                for candidate_id in self.document_ids
                if known_refs.get(candidate_id) in wanted_refs
            ]
        else:
            in_scope = self.document_ids

        hits = []
        for rank, document_id in enumerate(in_scope[:limit], 1):
            hits.append(
                SimpleNamespace(
                    document_id=document_id,
                    snippet=f"{channel} candidate {rank}: {query}",
                    score=1.0 - rank * 0.01,
                    sources=[{"channel": channel, "rank": rank, "distance": rank / 10}],
                )
            )
        return hits


def _register_browse_file(
    filesystem,
    external_id,
    folder_path,
    *,
    department="ops",
    summary=None,
):
    """Register a ``.txt`` test document and return ``register_file``'s result.

    A stub metadata generator is installed on ``filesystem`` first. It supplies
    ``summary`` (the explicit ``summary`` argument, or a default derived from
    the document's external id) plus fixed ``doc_type``/``domain``/``topic``
    values, restricted to the fields that were requested.
    """
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    class SummaryGenerator:
        def generate(self, document, *, fields):
            if summary is None:
                summary_text = f"summary for {document.external_id}"
            else:
                summary_text = summary
            available = {
                "summary": summary_text,
                "doc_type": "memo",
                "domain": "finance",
                "topic": "risk",
            }
            requested = {}
            for field in fields:
                if field in available:
                    requested[field] = available[field]
            return MetadataGenerationResult(values=requested)

    # The generator stays installed on the filesystem after this call.
    filesystem.metadata_generator = SummaryGenerator()

    summary_only_policy = {
        "fields": {
            "summary": True,
            "doc_type": False,
            "domain": False,
            "topic": False,
        }
    }
    return filesystem.register_file(
        storage_uri=f"file:///tmp/{external_id}.txt",
        folder_path=folder_path,
        external_id=external_id,
        title=f"{external_id}.txt",
        content=f"{external_id} discusses vector databases and retrieval.",
        metadata={"department": department},
        metadata_policy=summary_only_policy,
    )


def test_browse_is_agent_visible_semantic_command(tmp_path):
    """``browse`` is the only semantic command listed for agents.

    The removed ``search-*`` / ``semantic-grep`` commands must be absent from
    the allowed command set, and neither they nor the old semantic ``find``
    descriptions may appear in the described command surface.
    """
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    removed_commands = (
        "search-summary",
        "search-entity",
        "search-relation",
        "semantic-grep",
    )
    removed_find_descriptions = (
        "find --name: entity semantic",
        "find --relation: relation semantic",
    )

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    filesystem.semantic_retrieval_backend = ChannelBackend("dsid_report")
    executor = PIFSCommandExecutor(filesystem)

    allowed = executor.allowed_commands()
    surface = executor.describe_available_command_surfaces()

    assert "browse" in allowed
    assert 'browse [-R] <folder> "<query>"' in surface
    assert not set(removed_commands) & allowed
    for stale_text in removed_commands + removed_find_descriptions:
        assert stale_text not in surface

    semantic_capabilities = executor.command_capabilities()["retrieval"]["semantic"]
    assert semantic_capabilities["commands"] == ["browse"]


def test_shell_text_window_commands_are_not_agent_visible(tmp_path):
    """``head``, ``tail`` and ``sed`` are neither advertised nor executable."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    executor = PIFSCommandExecutor(filesystem)
    window_commands = {"head", "tail", "sed"}

    assert not window_commands & executor.allowed_commands()
    advertised = set(executor.command_capabilities()["allowed_commands"])
    assert not window_commands & advertised

    invocations = (
        "head /documents/a.txt",
        "tail /documents/a.txt",
        "sed -n 1,1p /documents/a.txt",
    )
    for invocation in invocations:
        with pytest.raises(PIFSCommandError, match="Unsupported command"):
            executor.execute(invocation)


def test_browse_requires_positional_query_and_rejects_removed_options(tmp_path):
    """Malformed ``browse`` invocations raise ``PIFSCommandError`` with a matching message."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "doc_direct", "/documents")
    filesystem.semantic_retrieval_backend = BrowseBackend(["doc_direct"])
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    # (command, regex the error message must match), checked in this order.
    rejected = (
        ("browse /documents", "browse requires a query"),
        ('browse /documents "vector database" --query "other"', "--query"),
        ('browse /documents "vector database" --limit 10', "--limit"),
        ('browse /documents "vector database" --offset 10', "--offset"),
        (
            "browse /documents vector database",
            "browse accepts a folder and one quoted query",
        ),
    )
    for command, expected_message in rejected:
        with pytest.raises(PIFSCommandError, match=expected_message):
            executor.execute(command)


def test_browse_validates_space_availability_and_page(tmp_path):
    """``browse`` rejects an unknown space, an unavailable space, and page 0."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "doc_direct", "/documents")
    filesystem.semantic_retrieval_backend = BrowseBackend(["doc_direct"], channels=("summary",))
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    # (command, regex the error message must match), checked in this order.
    rejected = (
        (
            'browse /documents "vector database" --space hybrid',
            "Unsupported browse --space: hybrid",
        ),
        (
            'browse /documents "vector database" --space entity',
            "available spaces: summary",
        ),
        (
            'browse /documents "vector database" --page 0',
            "browse --page must be at least 1",
        ),
    )
    for command, expected_message in rejected:
        with pytest.raises(PIFSCommandError, match=expected_message):
            executor.execute(command)


def test_browse_default_summary_does_not_fallback_to_other_spaces(tmp_path):
    """Without ``--space``, ``browse`` errors when only ``entity`` is available.

    The backend must not be queried for the failed default request, while an
    explicit ``--space entity`` request reaches the ``entity`` channel.
    """
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "doc_direct", "/documents")
    backend = BrowseBackend(["doc_direct"], channels=("entity",))
    filesystem.semantic_retrieval_backend = backend
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    with pytest.raises(PIFSCommandError, match="available spaces: entity"):
        executor.execute('browse /documents "vector database"')
    assert backend.calls == []

    raw_output = executor.execute('browse /documents "vector database" --space entity')
    result = json.loads(raw_output)["data"]
    returned_ids = [item["document_id"] for item in result["data"]]
    assert returned_ids == ["doc_direct"]
    last_channel = backend.calls[-1][0]
    assert last_channel == "entity"


def test_browse_non_recursive_searches_only_direct_files_and_recursive_is_global(tmp_path):
    """Plain ``browse`` returns only direct children; ``-R`` includes nested files."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "doc_direct", "/documents")
    _register_browse_file(filesystem, "doc_deep", "/documents/reports")
    backend = BrowseBackend(["doc_deep", "doc_direct"])
    filesystem.semantic_retrieval_backend = backend
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    def browse(command):
        """Run ``command`` and return the ``data`` payload of the JSON reply."""
        return json.loads(executor.execute(command))["data"]

    direct = browse('browse /documents "vector database"')
    direct_ids = [item["document_id"] for item in direct["data"]]
    assert direct_ids == ["doc_direct"]
    assert direct["recursive"] is False
    assert direct["space"] == "summary"
    assert direct["page"] == 1
    assert direct["page_size"] == 10
    assert backend.calls[-1][0] == "summary"

    recursive = browse('browse -R /documents "vector database"')
    recursive_ids = [item["document_id"] for item in recursive["data"]]
    assert recursive_ids == [
        "doc_deep",
        "doc_direct",
    ]
    recursive_ranks = [item["rank"] for item in recursive["data"]]
    assert recursive_ranks == [1, 2]
    assert recursive["recursive"] is True


def test_browse_supports_fixed_size_one_based_pagination_and_metadata_filter(tmp_path):
    """Twelve documents page as 10 + 2, and ``--where`` filters on metadata."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    document_ids = [f"doc_{index:02d}" for index in range(12)]
    for external_id in document_ids:
        # Only doc_10 belongs to finance, so the metadata filter has one match.
        _register_browse_file(
            filesystem,
            external_id,
            "/documents",
            department="finance" if external_id == "doc_10" else "ops",
        )
    filesystem.semantic_retrieval_backend = BrowseBackend(document_ids)
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    def browse(command):
        """Run ``command`` and return the ``data`` payload of the JSON reply."""
        return json.loads(executor.execute(command))["data"]

    first_page = browse('browse /documents "vector database"')
    assert len(first_page["data"]) == 10
    assert first_page["has_more"] is True
    assert first_page["data"][0]["rank"] == 1

    second_page = browse('browse /documents "vector database" --page 2')
    second_ids = [item["document_id"] for item in second_page["data"]]
    second_ranks = [item["rank"] for item in second_page["data"]]
    assert second_ids == ["doc_10", "doc_11"]
    assert second_ranks == [11, 12]
    assert second_page["has_more"] is False

    filtered = browse(
        'browse /documents "vector database" --where \'{"department":"finance"}\''
    )
    filtered_ids = [item["document_id"] for item in filtered["data"]]
    assert filtered_ids == ["doc_10"]
    assert filtered["data"][0]["summary"] == "summary for doc_10"


def test_browse_scopes_channel_candidates_before_candidate_limit(tmp_path):
    """In-scope documents are found even when 150 out-of-scope candidates rank first."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")

    # Out-of-scope documents live in /other and precede the /documents ones.
    candidate_ids = [f"off_scope_{index:02d}" for index in range(150)]
    file_refs_by_document_id = {
        external_id: _register_browse_file(filesystem, external_id, "/other")
        for external_id in candidate_ids
    }
    for external_id, folder_path in (
        ("doc_deep", "/documents/reports"),
        ("doc_direct", "/documents"),
    ):
        file_refs_by_document_id[external_id] = _register_browse_file(
            filesystem,
            external_id,
            folder_path,
        )

    backend = BrowseBackend(
        candidate_ids + ["doc_deep", "doc_direct"],
        file_refs_by_document_id=file_refs_by_document_id,
    )
    filesystem.semantic_retrieval_backend = backend
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    def browse_ids(command):
        """Run ``command`` and return the document ids of the returned items."""
        payload = json.loads(executor.execute(command))["data"]
        return [item["document_id"] for item in payload["data"]]

    assert browse_ids('browse /documents "vector database"') == ["doc_direct"]
    assert browse_ids('browse -R /documents "vector database"') == [
        "doc_deep",
        "doc_direct",
    ]

def test_browse_shell_output_uses_fixed_blocks_with_pagination_command(tmp_path):
    """Shell-mode ``browse`` renders rank/similarity/path/summary blocks.

    The output starts with a page header, collapses whitespace in summaries,
    prints two-decimal similarities in ``[0, 1]``, and ends with a ``# next:``
    line that repeats the request with ``--page 2``.
    """
    import re

    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    document_ids = [f"doc_{index:02d}" for index in range(12)]
    for position, external_id in enumerate(document_ids):
        if position == 0:
            # Newline, tab and repeated spaces exercise summary normalisation.
            summary_text = "first line\nsecond\tline   with spaces"
        else:
            summary_text = f"summary for {external_id}"
        _register_browse_file(
            filesystem,
            external_id,
            "/documents",
            department="finance",
            summary=summary_text,
        )
    filesystem.semantic_retrieval_backend = BrowseBackend(
        document_ids,
        channels=("summary", "entity"),
    )
    executor = PIFSCommandExecutor(filesystem)

    rendered = executor.execute(
        'browse -R /documents "vector database" --space entity '
        '--where \'{"department":"finance"}\''
    )
    lines = rendered.splitlines()

    expected_first_block = [
        "# page=1 page_size=10 has_more=true",
        "rank: 1",
        "similarity: 0.91",
        "path: /documents/doc_00.txt",
        "summary: first line second line with spaces",
        "",
    ]
    expected_second_block = [
        "rank: 2",
        "similarity: 0.83",
        "path: /documents/doc_01.txt",
        "summary: summary for doc_01",
    ]
    assert lines[:6] == expected_first_block
    assert lines[6:10] == expected_second_block

    similarity_lines = [line for line in lines if line.startswith("similarity: ")]
    assert len(similarity_lines) == 10
    for similarity_line in similarity_lines:
        assert re.fullmatch(r"similarity: [01]\.\d{2}", similarity_line)
    for similarity_line in similarity_lines:
        assert 0.0 <= float(similarity_line.removeprefix("similarity: ")) <= 1.0

    expected_next_line = (
        "# next: browse -R /documents 'vector database' --space entity "
        '--where \'{"department":"finance"}\' --page 2'
    )
    assert lines[-1] == expected_next_line
    for json_only_label in ("mode:", "data:", "score:"):
        assert json_only_label not in rendered


def test_browse_shell_path_uses_virtual_locator_when_source_collides(tmp_path):
    """Shell-mode ``browse`` prints the folder/title path, which resolves to the file ref."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    class SummaryGenerator:
        def generate(self, document, *, fields):
            generated = {"summary": f"summary for {document.external_id}"}
            return MetadataGenerationResult(values=generated)

    def summary_only_policy():
        """Build a fresh policy dict per registration so nothing is shared."""
        return {
            "fields": {
                "summary": True,
                "doc_type": False,
                "domain": False,
                "topic": False,
            }
        }

    filesystem = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=SummaryGenerator(),
    )
    first_ref = filesystem.register_file(
        storage_uri="file:///tmp/first.json",
        folder_path="/documents",
        external_id="dsid_first",
        title="First",
        content="first content",
        metadata_policy=summary_only_policy(),
    )
    filesystem.register_file(
        storage_uri="file:///tmp/second.json",
        folder_path="/documents",
        external_id="dsid_second",
        title="Second",
        content="second content",
        metadata_policy=summary_only_policy(),
    )
    filesystem.semantic_retrieval_backend = BrowseBackend(["dsid_first"])
    executor = PIFSCommandExecutor(filesystem)

    rendered = executor.execute('browse /documents "first"')

    assert "path: /documents/First" in rendered
    assert "path: /shared/source.json" not in rendered
    resolved_ref = filesystem.store.resolve_file_ref("/documents/First")
    assert resolved_ref == first_ref
    with pytest.raises(KeyError, match="Unknown file target"):
        filesystem.store.resolve_file_ref("/shared/source.json")


def test_browse_shell_path_never_returns_storage_uri_path(tmp_path):
    """Shell-mode ``browse`` shows the virtual path, not the storage URI's directory."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    class SummaryGenerator:
        def generate(self, document, *, fields):
            generated = {"summary": "summary for physical source report"}
            return MetadataGenerationResult(values=generated)

    summary_only_policy = {
        "fields": {
            "summary": True,
            "doc_type": False,
            "domain": False,
            "topic": False,
        }
    }

    filesystem = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=SummaryGenerator(),
    )
    file_ref = filesystem.register_file(
        storage_uri="file:///Users/chengjie/Downloads/source/report.pdf",
        folder_path="/documents/reports",
        external_id="dsid_report",
        title="report.pdf",
        content="physical source report content",
        metadata_policy=summary_only_policy,
    )
    filesystem.semantic_retrieval_backend = BrowseBackend(["dsid_report"])
    executor = PIFSCommandExecutor(filesystem)

    rendered = executor.execute('browse /documents/reports "physical source"')

    virtual_path = "/documents/reports/report.pdf"
    assert "path: /documents/reports/report.pdf" in rendered
    assert "/Users/chengjie/Downloads" not in rendered
    assert filesystem.store.resolve_file_ref(virtual_path) == file_ref


def test_browse_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
    """Browsing ``/documents`` must not pass a ``source_type`` key in the backend filters."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    class SummaryGenerator:
        def generate(self, document, *, fields):
            generated = {"summary": "Federal Reserve annual report summary"}
            return MetadataGenerationResult(values=generated)

    summary_only_policy = {
        "fields": {
            "summary": True,
            "doc_type": False,
            "domain": False,
            "topic": False,
        }
    }

    filesystem = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=SummaryGenerator(),
    )
    file_ref = filesystem.register_file(
        storage_uri="file:///tmp/report.pdf",
        folder_path="/documents",
        external_id="dsid_report",
        title="report.pdf",
        metadata={"source_type": "examples-documents"},
        content="Federal Reserve supervision and regulation annual report.",
        metadata_policy=summary_only_policy,
    )
    backend = SummaryBackend("dsid_report")
    filesystem.semantic_retrieval_backend = backend
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    raw_output = executor.execute('browse /documents "Federal Reserve annual report"')
    result = json.loads(raw_output)

    # SummaryBackend records calls as (channel, query, filters).
    recorded_filters = backend.calls[0][2]
    assert "source_type" not in recorded_filters
    top_hit = result["data"]["data"][0]
    assert top_hit["path"] == "/documents/report.pdf"
    assert top_hit["summary"] == "Federal Reserve annual report summary"
    assert filesystem.store.resolve_file_ref(top_hit["path"]) == file_ref


def test_register_file_rejects_duplicate_title_in_folder(tmp_path):
    """Registering a second file with the same folder and title raises ``FileExistsError``."""
    from pageindex.filesystem import PageIndexFileSystem
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    class SummaryGenerator:
        def generate(self, document, *, fields):
            generated = {"summary": f"summary for {document.external_id}"}
            return MetadataGenerationResult(values=generated)

    def summary_only_policy():
        """Build a fresh policy dict per registration so nothing is shared."""
        return {
            "fields": {
                "summary": True,
                "doc_type": False,
                "domain": False,
                "topic": False,
            }
        }

    filesystem = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=SummaryGenerator(),
    )
    filesystem.register_file(
        storage_uri="file:///tmp/first.json",
        folder_path="/documents",
        external_id="dsid_first",
        title="announcements",
        content="first announcement mentions H200 reservations.",
        metadata_policy=summary_only_policy(),
    )

    # Different storage URI and external id, but the same folder and title.
    expected_message = "File already exists at /documents/announcements"
    with pytest.raises(FileExistsError, match=expected_message):
        filesystem.register_file(
            storage_uri="file:///tmp/second.json",
            folder_path="/documents",
            external_id="dsid_second",
            title="announcements",
            content="second announcement mentions unrelated maintenance.",
            metadata_policy=summary_only_policy(),
        )


def test_browse_path_uses_virtual_title_when_storage_paths_are_unrelated(tmp_path):
    """JSON-mode ``browse`` reports the folder/title path, which resolves to the file ref."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    class SummaryGenerator:
        def generate(self, document, *, fields):
            generated = {"summary": f"summary for {document.external_id}"}
            return MetadataGenerationResult(values=generated)

    def summary_only_policy():
        """Build a fresh policy dict per registration so nothing is shared."""
        return {
            "fields": {
                "summary": True,
                "doc_type": False,
                "domain": False,
                "topic": False,
            }
        }

    filesystem = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=SummaryGenerator(),
    )
    first_ref = filesystem.register_file(
        storage_uri="file:///tmp/first.json",
        folder_path="/documents",
        external_id="dsid_first",
        title="First",
        content="first content",
        metadata_policy=summary_only_policy(),
    )
    filesystem.register_file(
        storage_uri="file:///tmp/second.json",
        folder_path="/documents",
        external_id="dsid_second",
        title="Second",
        content="second content",
        metadata_policy=summary_only_policy(),
    )
    filesystem.semantic_retrieval_backend = SummaryBackend("dsid_first")
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    result = json.loads(executor.execute('browse /documents "first"'))

    reported_path = result["data"]["data"][0]["path"]
    assert reported_path == "/documents/First"
    assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref


def test_old_semantic_commands_are_unsupported_even_when_indexes_exist(tmp_path):
    """The removed semantic commands fail, while ``browse --space`` covers entity and relation."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    class MetadataGenerator:
        def generate(self, document, *, fields):
            canned = {
                "summary": "Risk and compliance summary",
                "entity": "Federal Reserve; Disney",
                "relation": "Federal Reserve affects Disney valuation",
            }
            # A requested field outside ``canned`` raises KeyError here.
            requested = {field: canned[field] for field in fields}
            return MetadataGenerationResult(values=requested)

    filesystem = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=MetadataGenerator(),
    )
    generation_policy = {
        "fields": {
            "summary": True,
            "doc_type": False,
            "domain": False,
            "topic": False,
            "entity": True,
            "relation": True,
        }
    }
    filesystem.register_file(
        storage_uri="file:///tmp/market-note.pdf",
        folder_path="/documents",
        external_id="dsid_market_note",
        title="market-note.pdf",
        content="Federal Reserve policy affects Disney valuation.",
        metadata_policy=generation_policy,
    )
    filesystem.semantic_retrieval_backend = ChannelBackend("dsid_market_note")
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    removed_invocations = (
        'search-summary "Federal Reserve" /documents',
        'search-entity "Federal Reserve" /documents',
        'search-relation "Disney valuation" /documents',
        'semantic-grep -R "Federal Reserve" /documents',
    )
    for invocation in removed_invocations:
        with pytest.raises(PIFSCommandError, match="Unsupported command"):
            executor.execute(invocation)

    # Both spaces return the document with its summary and virtual path.
    supported_invocations = (
        'browse /documents "Federal Reserve" --space entity',
        'browse /documents "Disney valuation" --space relation',
    )
    for invocation in supported_invocations:
        reply = json.loads(executor.execute(invocation))
        assert reply["data"]["data"][0]["summary"] == "Risk and compliance summary"
        assert reply["data"]["data"][0]["path"] == "/documents/market-note.pdf"


def test_find_name_is_lexical_and_find_relation_is_not_semantic_alias(tmp_path):
    """``find --name`` matches without calling the backend; ``find --relation`` is rejected."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    filesystem.register_file(
        storage_uri="file:///tmp/report.pdf",
        folder_path="/documents",
        external_id="dsid_report",
        title="Annual report",
        content="Federal Reserve supervision and regulation annual report.",
    )
    backend = ChannelBackend("dsid_report", channels=("entity", "relation"))
    filesystem.semantic_retrieval_backend = backend
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    reply = json.loads(executor.execute("find /documents --name Reserve"))
    matches = reply["data"]

    assert matches[0]["external_id"] == "dsid_report"
    # An empty call log shows the semantic backend was never consulted.
    assert backend.calls == []

    expected_message = "find --relation is not supported"
    with pytest.raises(PIFSCommandError, match=expected_message):
        executor.execute('find /documents --relation "Reserve regulates report"')


def test_broad_recursive_grep_suggests_browse_not_removed_semantic_commands(tmp_path):
    """A recursive grep over a too-deep subtree suggests ``browse -R``, not removed commands."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    def deep_subtree_thresholds(*args, **kwargs):
        """Report a subtree that exceeds the folder depth limit."""
        return {
            "depth_limit": 2,
            "file_limit": 10,
            "folder_depth_exceeds_limit": True,
            "file_count_exceeds_limit": False,
            "sampled_file_count": 11,
            "sample_deep_folder_path": "/documents/deep",
        }

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "dsid_report", "/documents")
    filesystem.semantic_retrieval_backend = ChannelBackend("dsid_report")
    filesystem.store.folder_subtree_thresholds = deep_subtree_thresholds
    executor = PIFSCommandExecutor(filesystem)

    rendered = executor.execute('grep -R "Federal Reserve" /documents')

    assert "# suggested: browse -R /documents 'Federal Reserve'" in rendered
    for removed_command in (
        "search-summary",
        "search-entity",
        "search-relation",
        "semantic-grep",
    ):
        assert removed_command not in rendered


def test_grep_file_requires_terms_on_same_line(tmp_path):
    """A multi-term grep matches only when the terms occur on one line."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    # "alpha" is on line 2 and "omega" on line 3 of the source file.
    source = tmp_path / "source" / "documents" / "split.json"
    source.parent.mkdir(parents=True)
    source.write_text(
        '{\n  "first": "alpha evidence lives here",\n'
        '  "second": "omega evidence lives there"\n}\n',
        encoding="utf-8",
    )
    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    filesystem.register_file(
        storage_uri=str(source),
        folder_path="/documents",
        external_id="doc_split_terms",
        title="Split source terms",
        content=source.read_text(encoding="utf-8"),
    )
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    split_terms = json.loads(executor.execute('grep -R "alpha omega" /documents'))

    assert split_terms["data"]["mode"] == "files"
    assert split_terms["data"]["data"] == []

    same_line = json.loads(executor.execute('grep -R "alpha evidence" /documents'))

    assert same_line["data"]["data"][0]["external_id"] == "doc_split_terms"
    assert same_line["data"]["data"][0]["line"] == 2
    assert "alpha evidence" in same_line["data"]["data"][0]["text"]


def test_existing_summary_projection_index_uses_current_config_when_dimensions_match(
    tmp_path, monkeypatch
):
    """An existing index with matching dimension is configured with current settings.

    The index metadata names a stale provider and model, yet
    ``configure_semantic_projection_retrieval`` must be called once with the
    provider, model, dimensions and timeout given to ``PageIndexFileSystem``.
    """
    from pageindex.filesystem import PageIndexFileSystem
    from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex

    workspace = tmp_path / "workspace"
    index_path = workspace / "artifacts" / "projection_indexes" / "summary_only_vector.sqlite"
    stale_metadata = {
        "channel": "summary",
        "embedding_provider": "stale-provider",
        "embedding_model": "stale-embedding",
        "embedding_dimensions": 3,
    }
    summary_index = SQLiteVecSemanticIndex(index_path)
    summary_index.reset(dimension=3, metadata=stale_metadata)
    stored_record = SemanticIndexRecord(
        file_ref="file_a",
        external_id="doc_a",
        source_type="documents",
        title="A",
        text="summary",
        vector=[1.0, 0.0, 0.0],
    )
    summary_index.upsert_many([stored_record])

    filesystem = PageIndexFileSystem(
        workspace,
        summary_projection_embedding_provider="current-provider",
        summary_projection_embedding_model="current-embedding",
        summary_projection_embedding_dimensions=3,
        summary_projection_embedding_timeout=12,
    )
    configure_calls = []

    def fake_configure(index_dir_arg, **kwargs):
        """Capture the arguments and install a stub summary backend."""
        configure_calls.append((index_dir_arg, kwargs))
        filesystem.semantic_retrieval_backend = SummaryBackend("doc_a")
        return filesystem.semantic_retrieval_backend

    monkeypatch.setattr(
        filesystem,
        "configure_semantic_projection_retrieval",
        fake_configure,
    )

    assert filesystem.configure_existing_projection_retrieval() is True
    expected_call = (
        filesystem.summary_projection_index_dir,
        {
            "embedding_provider": "current-provider",
            "embedding_model": "current-embedding",
            "embedding_dimensions": 3,
            "embedding_timeout": 12,
        },
    )
    assert configure_calls == [expected_call]
    assert filesystem.semantic_retrieval_channels() == ("summary",)


def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval(
    tmp_path, monkeypatch
):
    """An on-disk summary index with a mismatched dimension must be rejected.

    A 3-dimensional summary index is written under the workspace, then a
    filesystem is created without any embedding overrides. Calling
    ``configure_existing_projection_retrieval`` is expected to raise a
    ``RuntimeError`` whose message names both dimensions and suggests a
    rebuild, and it must never reach
    ``configure_semantic_projection_retrieval``.
    """
    from pageindex.filesystem import PageIndexFileSystem
    from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex

    workspace_root = tmp_path / "workspace"
    projection_dir = workspace_root / "artifacts" / "projection_indexes"

    # Metadata persisted alongside the stored vectors.
    stored_metadata = {
        "channel": "summary",
        "embedding_provider": "openai",
        "embedding_model": "test-embedding",
        "embedding_dimensions": 3,
    }
    stored_record = SemanticIndexRecord(
        file_ref="file_a",
        external_id="doc_a",
        source_type="documents",
        title="A",
        text="summary",
        vector=[1.0, 0.0, 0.0],
    )

    stored_index = SQLiteVecSemanticIndex(projection_dir / "summary_only_vector.sqlite")
    stored_index.reset(dimension=3, metadata=stored_metadata)
    stored_index.upsert_many([stored_record])

    fs = PageIndexFileSystem(workspace_root)

    def reject_configure(*args, **kwargs):
        # Any call here means the mismatch check did not stop configuration.
        raise AssertionError("retrieval backend should not be configured on dimension mismatch")

    monkeypatch.setattr(fs, "configure_semantic_projection_retrieval", reject_configure)

    expected_message = (
        "summary projection index dimension mismatch: .*"
        "dimension 3.*summary_projection_embedding_dimensions is 1024.*Rebuild"
    )
    with pytest.raises(RuntimeError, match=expected_message):
        fs.configure_existing_projection_retrieval()


def test_browse_semantic_files_uses_summary_projection_when_only_summary_available(tmp_path):
    """Check that semantic browsing surfaces a file through its summary projection.

    One file is registered using a stub metadata generator and a stub
    embedder. ``search`` is asserted to return nothing for the query, and
    ``browse_semantic_files`` is then asserted to return that file with a
    ``summary_vector`` snippet.
    """
    from pageindex.filesystem import PageIndexFileSystem
    from pageindex.filesystem.semantic_projection import SemanticProjectionSearchBackend
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult
    from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer

    class FixedEmbedder:
        # Stub embedder: every input text maps to the same 3-dimensional
        # vector, so anything embedded by the indexer or the backend is identical.
        def embed(self, texts):
            return [[1.0, 0.0, 0.0] for _ in texts]

    class SummaryGenerator:
        # Stub metadata generator: returns one fixed summary regardless of the
        # document or the requested fields.
        def generate(self, document, *, fields):
            return MetadataGenerationResult(
                values={"summary": "vendor renewal risk matrix"}
            )

    source = tmp_path / "source.txt"
    source.write_text("ordinary fixture body", encoding="utf-8")
    index_dir = tmp_path / "workspace" / "artifacts" / "projection_indexes"
    # Indexer and backend point at the same directory and share the same
    # embedding settings (provider, model, dimension 3).
    indexer = SummaryProjectionIndexer(
        index_dir,
        embedder=FixedEmbedder(),
        embedding_provider="test",
        embedding_model="fake",
        embedding_dimensions=3,
    )
    backend = SemanticProjectionSearchBackend(
        index_dir,
        embedder=FixedEmbedder(),
        embedding_provider="test",
        embedding_model="fake",
        embedding_dimensions=3,
    )
    filesystem = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=SummaryGenerator(),
        summary_projection_indexer=indexer,
        semantic_retrieval_backend=backend,
    )
    # Only the "summary" field is switched on in the policy; doc_type, domain
    # and topic are explicitly switched off.
    # NOTE(review): presumably this limits generation to the summary so that no
    # other projection channel gets populated - confirm against register_file.
    filesystem.register_file(
        storage_uri=source.as_uri(),
        folder_path="/documents",
        external_id="doc_summary_only",
        title="Operations note",
        content=source.read_text(encoding="utf-8"),
        metadata={"department": "ops"},
        metadata_policy={
            "fields": {
                "summary": True,
                "doc_type": False,
                "domain": False,
                "topic": False,
            }
        },
    )

    # The query shares no words with the file body, title, or generated
    # summary; plain search is expected to come back empty.
    assert filesystem.search("purchase order exposure") == []

    results = filesystem.browse_semantic_files(
        "/documents",
        "purchase order exposure",
        recursive=True,
        page_size=5,
    )

    # The single registered file must be the only hit, and its snippet must
    # identify the summary vector channel at rank 1.
    assert [item["external_id"] for item in results["data"]] == ["doc_summary_only"]
    assert results["data"][0]["snippet"] == "summary_vector rank=1"