"""Tests for the PIFS ``browse`` command and the semantic retrieval surfaces around it.

The module defines three in-memory fake retrieval backends plus a few private
fixture helpers, followed by the tests themselves.  ``pageindex`` imports are
kept function-local, as in the rest of this file, so that merely collecting the
module does not import the package.
"""

import json
import re
from types import SimpleNamespace

import pytest


def test_filesystem_lazy_exports_remain_public():
    """Each semantic export is importable, listed in ``__all__`` and in ``dir()``."""
    import pageindex.filesystem as filesystem
    from pageindex.filesystem import (
        SemanticProjectionSearchBackend,
        RebuildableSemanticIndex,
        SemanticIndexRecord,
        SemanticSearchResult,
        SQLiteVecSemanticIndex,
        SummaryProjectionIndexer,
    )

    exported = {
        "SemanticProjectionSearchBackend": SemanticProjectionSearchBackend,
        "RebuildableSemanticIndex": RebuildableSemanticIndex,
        "SemanticIndexRecord": SemanticIndexRecord,
        "SemanticSearchResult": SemanticSearchResult,
        "SQLiteVecSemanticIndex": SQLiteVecSemanticIndex,
        "SummaryProjectionIndexer": SummaryProjectionIndexer,
    }
    for name, exported_object in exported.items():
        assert name in filesystem.__all__
        assert name in dir(filesystem)
        assert exported_object.__name__ == name


class SummaryBackend:
    """Fake backend exposing only the ``summary`` channel for a single document.

    Every ``search_channel`` call is recorded in ``calls`` as
    ``(channel, query, filters)``.
    """

    def __init__(self, document_id):
        self.document_id = document_id
        self.calls = []

    def available_channels(self):
        """Return the fixed one-element channel tuple."""
        return ("summary",)

    def search_channel(self, channel, query, *, limit=10, filters=None):
        """Record the call and return one candidate for the configured document."""
        self.calls.append((channel, query, filters))
        return [
            SimpleNamespace(
                document_id=self.document_id,
                snippet=f"summary candidate: {query}",
            )
        ]


class ChannelBackend:
    """Fake backend with a configurable channel tuple for a single document.

    Every ``search_channel`` call is recorded in ``calls`` as
    ``(channel, query, limit, filters)``.
    """

    def __init__(self, document_id, channels=("summary", "entity", "relation")):
        self.document_id = document_id
        self.channels = channels
        self.calls = []

    def available_channels(self):
        """Return the channels given at construction time."""
        return self.channels

    def search_channel(self, channel, query, *, limit=10, filters=None):
        """Record the call and return one candidate labelled with ``channel``."""
        self.calls.append((channel, query, limit, filters))
        return [
            SimpleNamespace(
                document_id=self.document_id,
                snippet=f"{channel} candidate: {query}",
            )
        ]


class BrowseBackend:
    """Fake backend returning one ranked candidate per configured document id.

    When ``file_refs_by_document_id`` is supplied and the caller passes a
    ``file_ref`` / ``file_refs`` filter, candidates are restricted to the
    matching documents *before* ``limit`` is applied.
    """

    def __init__(self, document_ids, channels=("summary",), file_refs_by_document_id=None):
        self.document_ids = list(document_ids)
        self.channels = channels
        self.file_refs_by_document_id = dict(file_refs_by_document_id or {})
        self.calls = []

    def available_channels(self):
        """Return the channels given at construction time."""
        return self.channels

    def search_channel(self, channel, query, *, limit=10, filters=None):
        """Record the call and return ranked candidates (rank starts at 1)."""
        self.calls.append((channel, query, limit, filters))

        # Normalise the optional file-ref filter into a set of strings.
        file_ref_filter = set()
        if isinstance(filters, dict):
            raw_file_refs = filters.get("file_ref") or filters.get("file_refs") or []
            if isinstance(raw_file_refs, str):
                file_ref_filter = {raw_file_refs}
            else:
                file_ref_filter = {str(item) for item in raw_file_refs}

        document_ids = self.document_ids
        if file_ref_filter and self.file_refs_by_document_id:
            document_ids = [
                document_id
                for document_id in document_ids
                if self.file_refs_by_document_id.get(document_id) in file_ref_filter
            ]

        return [
            SimpleNamespace(
                document_id=document_id,
                snippet=f"{channel} candidate {rank}: {query}",
                score=1.0 - rank * 0.01,
                sources=[{"channel": channel, "rank": rank, "distance": rank / 10}],
            )
            for rank, document_id in enumerate(document_ids[:limit], 1)
        ]


def _summary_only_policy():
    """Return a fresh metadata policy that enables only the ``summary`` field."""
    # A new dict per call so that no state can leak between registrations.
    return {
        "fields": {
            "summary": True,
            "doc_type": False,
            "domain": False,
            "topic": False,
        }
    }


def _make_summary_generator(summary=None):
    """Return a metadata generator that emits only a ``summary`` value.

    With ``summary=None`` the value is ``"summary for <external_id>"``;
    otherwise the given text is used verbatim.
    """
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    class SummaryGenerator:
        def generate(self, document, *, fields):
            text = summary if summary is not None else f"summary for {document.external_id}"
            return MetadataGenerationResult(values={"summary": text})

    return SummaryGenerator()


def _register_first_and_second_documents(filesystem):
    """Register ``First`` and ``Second`` under ``/documents``; return First's file ref."""
    first_ref = filesystem.register_file(
        storage_uri="file:///tmp/first.json",
        folder_path="/documents",
        external_id="dsid_first",
        title="First",
        content="first content",
        metadata_policy=_summary_only_policy(),
    )
    filesystem.register_file(
        storage_uri="file:///tmp/second.json",
        folder_path="/documents",
        external_id="dsid_second",
        title="Second",
        content="second content",
        metadata_policy=_summary_only_policy(),
    )
    return first_ref


def _register_browse_file(
    filesystem,
    external_id,
    folder_path,
    *,
    department="ops",
    summary=None,
):
    """Register ``<external_id>.txt`` in ``folder_path`` and return the file ref.

    Side effect: replaces ``filesystem.metadata_generator`` with a generator
    that returns only the requested fields it knows about.
    """
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    class SummaryGenerator:
        def generate(self, document, *, fields):
            values = {
                "summary": (
                    summary if summary is not None else f"summary for {document.external_id}"
                ),
                "doc_type": "memo",
                "domain": "finance",
                "topic": "risk",
            }
            return MetadataGenerationResult(
                values={field: values[field] for field in fields if field in values}
            )

    filesystem.metadata_generator = SummaryGenerator()
    return filesystem.register_file(
        storage_uri=f"file:///tmp/{external_id}.txt",
        folder_path=folder_path,
        external_id=external_id,
        title=f"{external_id}.txt",
        content=f"{external_id} discusses vector databases and retrieval.",
        metadata={"department": department},
        metadata_policy=_summary_only_policy(),
    )


def test_browse_is_agent_visible_semantic_command(tmp_path):
    """``browse`` is the only semantic command in the allowed set and the surface text."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    filesystem.semantic_retrieval_backend = ChannelBackend("dsid_report")
    executor = PIFSCommandExecutor(filesystem)
    allowed = executor.allowed_commands()
    surface = executor.describe_available_command_surfaces()

    assert "browse" in allowed
    # NOTE(review): this expected usage string may have lost "<placeholder>"
    # tokens somewhere upstream - confirm against the real surface text.
    assert 'browse [-R] ""' in surface

    removed_commands = {
        "search-summary",
        "search-entity",
        "search-relation",
        "semantic-grep",
    }
    assert not removed_commands & allowed
    for old_command in (
        "search-summary",
        "search-entity",
        "search-relation",
        "semantic-grep",
        "find --name: entity semantic",
        "find --relation: relation semantic",
    ):
        assert old_command not in surface
    assert executor.command_capabilities()["retrieval"]["semantic"]["commands"] == ["browse"]


def test_shell_text_window_commands_are_not_agent_visible(tmp_path):
    """``head``, ``tail`` and ``sed`` are neither advertised nor executable."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    executor = PIFSCommandExecutor(filesystem)

    assert not {"head", "tail", "sed"} & executor.allowed_commands()
    assert not {"head", "tail", "sed"} & set(
        executor.command_capabilities()["allowed_commands"]
    )
    for command in (
        "head /documents/a.txt",
        "tail /documents/a.txt",
        "sed -n 1,1p /documents/a.txt",
    ):
        with pytest.raises(PIFSCommandError, match="Unsupported command"):
            executor.execute(command)


def test_browse_requires_positional_query_and_rejects_removed_options(tmp_path):
    """``browse`` needs one quoted query and rejects --query/--limit/--offset."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "doc_direct", "/documents")
    filesystem.semantic_retrieval_backend = BrowseBackend(["doc_direct"])
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    with pytest.raises(PIFSCommandError, match="browse requires a query"):
        executor.execute("browse /documents")
    with pytest.raises(PIFSCommandError, match="--query"):
        executor.execute('browse /documents "vector database" --query "other"')
    with pytest.raises(PIFSCommandError, match="--limit"):
        executor.execute('browse /documents "vector database" --limit 10')
    with pytest.raises(PIFSCommandError, match="--offset"):
        executor.execute('browse /documents "vector database" --offset 10')
    with pytest.raises(PIFSCommandError, match="browse accepts a folder and one quoted query"):
        executor.execute("browse /documents vector database")


def test_browse_validates_space_availability_and_page(tmp_path):
    """Unknown or unavailable ``--space`` values and ``--page 0`` are rejected."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "doc_direct", "/documents")
    filesystem.semantic_retrieval_backend = BrowseBackend(["doc_direct"], channels=("summary",))
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    with pytest.raises(PIFSCommandError, match="Unsupported browse --space: hybrid"):
        executor.execute('browse /documents "vector database" --space hybrid')
    with pytest.raises(PIFSCommandError, match="available spaces: summary"):
        executor.execute('browse /documents "vector database" --space entity')
    with pytest.raises(PIFSCommandError, match="browse --page must be at least 1"):
        executor.execute('browse /documents "vector database" --page 0')


def test_browse_default_summary_does_not_fallback_to_other_spaces(tmp_path):
    """Default browse fails without a summary channel and never queries the backend."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "doc_direct", "/documents")
    backend = BrowseBackend(["doc_direct"], channels=("entity",))
    filesystem.semantic_retrieval_backend = backend
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    with pytest.raises(PIFSCommandError, match="available spaces: entity"):
        executor.execute('browse /documents "vector database"')
    assert backend.calls == []

    result = json.loads(
        executor.execute('browse /documents "vector database" --space entity')
    )["data"]
    assert [item["document_id"] for item in result["data"]] == ["doc_direct"]
    assert backend.calls[-1][0] == "entity"


def test_browse_non_recursive_searches_only_direct_files_and_recursive_is_global(tmp_path):
    """Plain browse returns direct children only; ``-R`` also returns nested files."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "doc_direct", "/documents")
    _register_browse_file(filesystem, "doc_deep", "/documents/reports")
    backend = BrowseBackend(["doc_deep", "doc_direct"])
    filesystem.semantic_retrieval_backend = backend
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    direct = json.loads(executor.execute('browse /documents "vector database"'))["data"]
    assert [item["document_id"] for item in direct["data"]] == ["doc_direct"]
    assert direct["recursive"] is False
    assert direct["space"] == "summary"
    assert direct["page"] == 1
    assert direct["page_size"] == 10
    assert backend.calls[-1][0] == "summary"

    recursive = json.loads(executor.execute('browse -R /documents "vector database"'))["data"]
    assert [item["document_id"] for item in recursive["data"]] == [
        "doc_deep",
        "doc_direct",
    ]
    assert [item["rank"] for item in recursive["data"]] == [1, 2]
    assert recursive["recursive"] is True


def test_browse_supports_fixed_size_one_based_pagination_and_metadata_filter(tmp_path):
    """Pages hold 10 items, ranks continue across pages, ``--where`` filters metadata."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    document_ids = []
    for index in range(12):
        external_id = f"doc_{index:02d}"
        document_ids.append(external_id)
        # Only doc_10 belongs to finance so the --where filter has one match.
        department = "finance" if index == 10 else "ops"
        _register_browse_file(filesystem, external_id, "/documents", department=department)
    filesystem.semantic_retrieval_backend = BrowseBackend(document_ids)
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    first_page = json.loads(executor.execute('browse /documents "vector database"'))["data"]
    assert len(first_page["data"]) == 10
    assert first_page["has_more"] is True
    assert first_page["data"][0]["rank"] == 1

    second_page = json.loads(
        executor.execute('browse /documents "vector database" --page 2')
    )["data"]
    assert [item["document_id"] for item in second_page["data"]] == ["doc_10", "doc_11"]
    assert [item["rank"] for item in second_page["data"]] == [11, 12]
    assert second_page["has_more"] is False

    filtered = json.loads(
        executor.execute(
            'browse /documents "vector database" --where \'{"department":"finance"}\''
        )
    )["data"]
    assert [item["document_id"] for item in filtered["data"]] == ["doc_10"]
    assert filtered["data"][0]["summary"] == "summary for doc_10"


def test_browse_scopes_channel_candidates_before_candidate_limit(tmp_path):
    """In-scope documents are returned even when 150 off-scope candidates rank first."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    file_refs_by_document_id = {}
    candidate_ids = []
    for index in range(150):
        external_id = f"off_scope_{index:02d}"
        candidate_ids.append(external_id)
        file_refs_by_document_id[external_id] = _register_browse_file(
            filesystem,
            external_id,
            "/other",
        )
    file_refs_by_document_id["doc_deep"] = _register_browse_file(
        filesystem,
        "doc_deep",
        "/documents/reports",
    )
    file_refs_by_document_id["doc_direct"] = _register_browse_file(
        filesystem,
        "doc_direct",
        "/documents",
    )
    backend = BrowseBackend(
        [*candidate_ids, "doc_deep", "doc_direct"],
        file_refs_by_document_id=file_refs_by_document_id,
    )
    filesystem.semantic_retrieval_backend = backend
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    direct = json.loads(executor.execute('browse /documents "vector database"'))["data"]
    assert [item["document_id"] for item in direct["data"]] == ["doc_direct"]

    recursive = json.loads(executor.execute('browse -R /documents "vector database"'))["data"]
    assert [item["document_id"] for item in recursive["data"]] == [
        "doc_deep",
        "doc_direct",
    ]


def test_browse_shell_output_uses_fixed_blocks_with_pagination_command(tmp_path):
    """Shell output is rank/similarity/path/summary blocks plus a ``# next:`` hint."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    document_ids = []
    for index in range(12):
        external_id = f"doc_{index:02d}"
        document_ids.append(external_id)
        _register_browse_file(
            filesystem,
            external_id,
            "/documents",
            department="finance",
            # The first summary embeds a newline and a tab; the expected output
            # below shows them rendered as single spaces.
            summary=(
                "first line\nsecond\tline with spaces"
                if index == 0
                else f"summary for {external_id}"
            ),
        )
    filesystem.semantic_retrieval_backend = BrowseBackend(
        document_ids,
        channels=("summary", "entity"),
    )
    executor = PIFSCommandExecutor(filesystem)

    rendered = executor.execute(
        'browse -R /documents "vector database" --space entity '
        '--where \'{"department":"finance"}\''
    )
    lines = rendered.splitlines()

    # NOTE(review): 0.91 / 0.83 look like 1 / (1 + distance) for the fake
    # distances 0.1 and 0.2 - confirm against the renderer.
    assert lines[:6] == [
        "# page=1 page_size=10 has_more=true",
        "rank: 1",
        "similarity: 0.91",
        "path: /documents/doc_00.txt",
        "summary: first line second line with spaces",
        "",
    ]
    assert lines[6:10] == [
        "rank: 2",
        "similarity: 0.83",
        "path: /documents/doc_01.txt",
        "summary: summary for doc_01",
    ]

    similarity_lines = [line for line in lines if line.startswith("similarity: ")]
    assert len(similarity_lines) == 10
    assert all(re.fullmatch(r"similarity: [01]\.\d{2}", line) for line in similarity_lines)
    assert all(
        0.0 <= float(line.removeprefix("similarity: ")) <= 1.0 for line in similarity_lines
    )
    assert lines[-1] == (
        "# next: browse -R /documents 'vector database' --space entity "
        '--where \'{"department":"finance"}\' --page 2'
    )
    assert "mode:" not in rendered
    assert "data:" not in rendered
    assert "score:" not in rendered


def test_browse_shell_path_uses_virtual_locator_when_source_collides(tmp_path):
    """Rendered ``path:`` is the virtual ``/documents/<title>`` locator."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=_make_summary_generator(),
    )
    first_ref = _register_first_and_second_documents(filesystem)
    filesystem.semantic_retrieval_backend = BrowseBackend(["dsid_first"])
    executor = PIFSCommandExecutor(filesystem)

    rendered = executor.execute('browse /documents "first"')

    assert "path: /documents/First" in rendered
    assert "path: /shared/source.json" not in rendered
    assert filesystem.store.resolve_file_ref("/documents/First") == first_ref
    with pytest.raises(KeyError, match="Unknown file target"):
        filesystem.store.resolve_file_ref("/shared/source.json")


def test_browse_shell_path_never_returns_storage_uri_path(tmp_path):
    """The physical ``storage_uri`` path never appears in rendered output."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=_make_summary_generator("summary for physical source report"),
    )
    file_ref = filesystem.register_file(
        storage_uri="file:///Users/chengjie/Downloads/source/report.pdf",
        folder_path="/documents/reports",
        external_id="dsid_report",
        title="report.pdf",
        content="physical source report content",
        metadata_policy=_summary_only_policy(),
    )
    filesystem.semantic_retrieval_backend = BrowseBackend(["dsid_report"])
    executor = PIFSCommandExecutor(filesystem)

    rendered = executor.execute('browse /documents/reports "physical source"')

    assert "path: /documents/reports/report.pdf" in rendered
    assert "/Users/chengjie/Downloads" not in rendered
    assert filesystem.store.resolve_file_ref("/documents/reports/report.pdf") == file_ref


def test_browse_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
    """Browsing an ordinary folder passes no ``source_type`` key in backend filters."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=_make_summary_generator("Federal Reserve annual report summary"),
    )
    file_ref = filesystem.register_file(
        storage_uri="file:///tmp/report.pdf",
        folder_path="/documents",
        external_id="dsid_report",
        title="report.pdf",
        metadata={"source_type": "examples-documents"},
        content="Federal Reserve supervision and regulation annual report.",
        metadata_policy=_summary_only_policy(),
    )
    backend = SummaryBackend("dsid_report")
    filesystem.semantic_retrieval_backend = backend
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    result = json.loads(
        executor.execute('browse /documents "Federal Reserve annual report"')
    )

    # SummaryBackend records (channel, query, filters); index 2 is the filters.
    assert "source_type" not in backend.calls[0][2]
    first_hit = result["data"]["data"][0]
    assert first_hit["path"] == "/documents/report.pdf"
    assert first_hit["summary"] == "Federal Reserve annual report summary"
    assert filesystem.store.resolve_file_ref(first_hit["path"]) == file_ref


def test_register_file_rejects_duplicate_title_in_folder(tmp_path):
    """Registering a second file with the same title in a folder raises."""
    from pageindex.filesystem import PageIndexFileSystem

    filesystem = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=_make_summary_generator(),
    )
    filesystem.register_file(
        storage_uri="file:///tmp/first.json",
        folder_path="/documents",
        external_id="dsid_first",
        title="announcements",
        content="first announcement mentions H200 reservations.",
        metadata_policy=_summary_only_policy(),
    )

    with pytest.raises(FileExistsError, match="File already exists at /documents/announcements"):
        filesystem.register_file(
            storage_uri="file:///tmp/second.json",
            folder_path="/documents",
            external_id="dsid_second",
            title="announcements",
            content="second announcement mentions unrelated maintenance.",
            metadata_policy=_summary_only_policy(),
        )


def test_browse_path_uses_virtual_title_when_storage_paths_are_unrelated(tmp_path):
    """JSON browse results report the virtual ``/documents/<title>`` path."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=_make_summary_generator(),
    )
    first_ref = _register_first_and_second_documents(filesystem)
    filesystem.semantic_retrieval_backend = SummaryBackend("dsid_first")
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    result = json.loads(executor.execute('browse /documents "first"'))

    first_hit = result["data"]["data"][0]
    assert first_hit["path"] == "/documents/First"
    assert filesystem.store.resolve_file_ref(first_hit["path"]) == first_ref


def test_old_semantic_commands_are_unsupported_even_when_indexes_exist(tmp_path):
    """Removed search commands fail; ``browse --space`` covers entity and relation."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    class MetadataGenerator:
        def generate(self, document, *, fields):
            values = {
                "summary": "Risk and compliance summary",
                "entity": "Federal Reserve; Disney",
                "relation": "Federal Reserve affects Disney valuation",
            }
            return MetadataGenerationResult(values={field: values[field] for field in fields})

    filesystem = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=MetadataGenerator(),
    )
    filesystem.register_file(
        storage_uri="file:///tmp/market-note.pdf",
        folder_path="/documents",
        external_id="dsid_market_note",
        title="market-note.pdf",
        content="Federal Reserve policy affects Disney valuation.",
        metadata_policy={
            "fields": {
                "summary": True,
                "doc_type": False,
                "domain": False,
                "topic": False,
                "entity": True,
                "relation": True,
            }
        },
    )
    filesystem.semantic_retrieval_backend = ChannelBackend("dsid_market_note")
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    for command in (
        'search-summary "Federal Reserve" /documents',
        'search-entity "Federal Reserve" /documents',
        'search-relation "Disney valuation" /documents',
        'semantic-grep -R "Federal Reserve" /documents',
    ):
        with pytest.raises(PIFSCommandError, match="Unsupported command"):
            executor.execute(command)

    entity = json.loads(
        executor.execute('browse /documents "Federal Reserve" --space entity')
    )
    assert entity["data"]["data"][0]["summary"] == "Risk and compliance summary"
    assert entity["data"]["data"][0]["path"] == "/documents/market-note.pdf"

    relation = json.loads(
        executor.execute('browse /documents "Disney valuation" --space relation')
    )
    assert relation["data"]["data"][0]["summary"] == "Risk and compliance summary"
    assert relation["data"]["data"][0]["path"] == "/documents/market-note.pdf"


def test_find_name_is_lexical_and_find_relation_is_not_semantic_alias(tmp_path):
    """``find --name`` never calls the backend; ``find --relation`` is rejected."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    filesystem.register_file(
        storage_uri="file:///tmp/report.pdf",
        folder_path="/documents",
        external_id="dsid_report",
        title="Annual report",
        content="Federal Reserve supervision and regulation annual report.",
    )
    backend = ChannelBackend("dsid_report", channels=("entity", "relation"))
    filesystem.semantic_retrieval_backend = backend
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    result = json.loads(executor.execute("find /documents --name Reserve"))["data"]

    assert result[0]["external_id"] == "dsid_report"
    assert backend.calls == []
    with pytest.raises(PIFSCommandError, match="find --relation is not supported"):
        executor.execute('find /documents --relation "Reserve regulates report"')


def test_broad_recursive_grep_suggests_browse_not_removed_semantic_commands(tmp_path):
    """A broad recursive grep suggests ``browse -R``, not the removed commands."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "dsid_report", "/documents")
    filesystem.semantic_retrieval_backend = ChannelBackend("dsid_report")
    # Stub the threshold probe so the subtree is reported as exceeding the depth limit.
    filesystem.store.folder_subtree_thresholds = lambda *args, **kwargs: {
        "depth_limit": 2,
        "file_limit": 10,
        "folder_depth_exceeds_limit": True,
        "file_count_exceeds_limit": False,
        "sampled_file_count": 11,
        "sample_deep_folder_path": "/documents/deep",
    }
    executor = PIFSCommandExecutor(filesystem)

    rendered = executor.execute('grep -R "Federal Reserve" /documents')

    assert "# suggested: browse -R /documents 'Federal Reserve'" in rendered
    assert "search-summary" not in rendered
    assert "search-entity" not in rendered
    assert "search-relation" not in rendered
    assert "semantic-grep" not in rendered


def test_grep_file_requires_terms_on_same_line(tmp_path):
    """Multi-term grep matches only when all terms occur on one line."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    source_dir = tmp_path / "source" / "documents"
    source_dir.mkdir(parents=True)
    source = source_dir / "split.json"
    # "alpha" is on line 2 and "omega" on line 3, so they never share a line.
    source.write_text(
        '{\n "first": "alpha evidence lives here",\n'
        ' "second": "omega evidence lives there"\n}\n',
        encoding="utf-8",
    )
    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    filesystem.register_file(
        storage_uri=str(source),
        folder_path="/documents",
        external_id="doc_split_terms",
        title="Split source terms",
        content=source.read_text(encoding="utf-8"),
    )
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    result = json.loads(executor.execute('grep -R "alpha omega" /documents'))
    assert result["data"]["mode"] == "files"
    assert result["data"]["data"] == []

    matched = json.loads(executor.execute('grep -R "alpha evidence" /documents'))
    first_match = matched["data"]["data"][0]
    assert first_match["external_id"] == "doc_split_terms"
    assert first_match["line"] == 2
    assert "alpha evidence" in first_match["text"]


def test_existing_summary_projection_index_uses_current_config_when_dimensions_match(
    tmp_path, monkeypatch
):
    """With matching dimensions the current embedding config is passed on, not the stored one."""
    from pageindex.filesystem import PageIndexFileSystem
    from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex

    workspace = tmp_path / "workspace"
    index_dir = workspace / "artifacts" / "projection_indexes"
    summary_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
    summary_index.reset(
        dimension=3,
        metadata={
            "channel": "summary",
            "embedding_provider": "stale-provider",
            "embedding_model": "stale-embedding",
            "embedding_dimensions": 3,
        },
    )
    summary_index.upsert_many(
        [
            SemanticIndexRecord(
                file_ref="file_a",
                external_id="doc_a",
                source_type="documents",
                title="A",
                text="summary",
                vector=[1.0, 0.0, 0.0],
            )
        ]
    )
    filesystem = PageIndexFileSystem(
        workspace,
        summary_projection_embedding_provider="current-provider",
        summary_projection_embedding_model="current-embedding",
        summary_projection_embedding_dimensions=3,
        summary_projection_embedding_timeout=12,
    )
    calls = []

    def fake_configure(index_dir_arg, **kwargs):
        calls.append((index_dir_arg, kwargs))
        filesystem.semantic_retrieval_backend = SummaryBackend("doc_a")
        return filesystem.semantic_retrieval_backend

    monkeypatch.setattr(
        filesystem,
        "configure_semantic_projection_retrieval",
        fake_configure,
    )

    assert filesystem.configure_existing_projection_retrieval() is True
    assert calls == [
        (
            filesystem.summary_projection_index_dir,
            {
                "embedding_provider": "current-provider",
                "embedding_model": "current-embedding",
                "embedding_dimensions": 3,
                "embedding_timeout": 12,
            },
        )
    ]
    assert filesystem.semantic_retrieval_channels() == ("summary",)


def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval(
    tmp_path, monkeypatch
):
    """A stored dimension that differs from the configured one raises ``RuntimeError``."""
    from pageindex.filesystem import PageIndexFileSystem
    from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex

    workspace = tmp_path / "workspace"
    index_dir = workspace / "artifacts" / "projection_indexes"
    summary_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
    summary_index.reset(
        dimension=3,
        metadata={
            "channel": "summary",
            "embedding_provider": "openai",
            "embedding_model": "test-embedding",
            "embedding_dimensions": 3,
        },
    )
    summary_index.upsert_many(
        [
            SemanticIndexRecord(
                file_ref="file_a",
                external_id="doc_a",
                source_type="documents",
                title="A",
                text="summary",
                vector=[1.0, 0.0, 0.0],
            )
        ]
    )
    filesystem = PageIndexFileSystem(workspace)

    def fail_configure(*args, **kwargs):
        raise AssertionError("retrieval backend should not be configured on dimension mismatch")

    monkeypatch.setattr(
        filesystem,
        "configure_semantic_projection_retrieval",
        fail_configure,
    )

    with pytest.raises(
        RuntimeError,
        match=(
            "summary projection index dimension mismatch: .*"
            "dimension 3.*summary_projection_embedding_dimensions is 1024.*Rebuild"
        ),
    ):
        filesystem.configure_existing_projection_retrieval()


def test_browse_semantic_files_uses_summary_projection_when_only_summary_available(tmp_path):
    """``browse_semantic_files`` finds a document via its summary vector alone."""
    from pageindex.filesystem import PageIndexFileSystem
    from pageindex.filesystem.semantic_projection import (
        SemanticProjectionSearchBackend,
        SummaryProjectionIndexer,
    )

    class FixedEmbedder:
        # Every text maps to the same unit vector, so any query matches.
        def embed(self, texts):
            return [[1.0, 0.0, 0.0] for _ in texts]

    source = tmp_path / "source.txt"
    source.write_text("ordinary fixture body", encoding="utf-8")
    index_dir = tmp_path / "workspace" / "artifacts" / "projection_indexes"
    indexer = SummaryProjectionIndexer(
        index_dir,
        embedder=FixedEmbedder(),
        embedding_provider="test",
        embedding_model="fake",
        embedding_dimensions=3,
    )
    backend = SemanticProjectionSearchBackend(
        index_dir,
        embedder=FixedEmbedder(),
        embedding_provider="test",
        embedding_model="fake",
        embedding_dimensions=3,
    )
    filesystem = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=_make_summary_generator("vendor renewal risk matrix"),
        summary_projection_indexer=indexer,
        semantic_retrieval_backend=backend,
    )
    filesystem.register_file(
        storage_uri=source.as_uri(),
        folder_path="/documents",
        external_id="doc_summary_only",
        title="Operations note",
        content=source.read_text(encoding="utf-8"),
        metadata={"department": "ops"},
        metadata_policy=_summary_only_policy(),
    )

    # The query shares no words with the body, title or summary.
    assert filesystem.search("purchase order exposure") == []

    results = filesystem.browse_semantic_files(
        "/documents",
        "purchase order exposure",
        recursive=True,
        page_size=5,
    )
    assert [item["external_id"] for item in results["data"]] == ["doc_summary_only"]
    assert results["data"][0]["snippet"] == "summary_vector rank=1"