From ba821a70b9887cadb7f8d822083f9cbd72c1e4a2 Mon Sep 17 00:00:00 2001
From: BukeLy
Date: Sun, 31 May 2026 17:17:43 +0800
Subject: [PATCH] feat(pifs): add semantic browse command

---
 pageindex/filesystem/commands.py         |  84 +++++++++
 pageindex/filesystem/core.py             | 183 +++++++++++++++++++
 tests/test_pageindex_filesystem_scope.py | 206 ++++++++++++++++++++++
 3 files changed, 473 insertions(+)

diff --git a/pageindex/filesystem/commands.py b/pageindex/filesystem/commands.py
index 18a85cc..7c2d91e 100644
--- a/pageindex/filesystem/commands.py
+++ b/pageindex/filesystem/commands.py
@@ -23,6 +23,7 @@ class PIFSCommandExecutor:
         "tree",
         "find",
         "grep",
+        "browse",
         "cat",
         "stat",
         "head",
@@ -53,6 +54,7 @@ class PIFSCommandExecutor:
     MAX_FIND_LIMIT = 50
     MAX_GREP_LIMIT = 20
     MAX_SEMANTIC_LIMIT = 20
+    BROWSE_PAGE_SIZE = 10
     MAX_TEXT_LINES = 100
     MAX_PAGE_SPAN = 5
     MAX_STRUCTURE_NODES = 25
@@ -102,6 +104,8 @@ class PIFSCommandExecutor:
             "Available command surfaces for this workspace:",
             "- mode: read-only inspection",
             "- ls/tree: folder browsing",
+            '- browse [-R] "" [--space summary|entity|relation] '
+            "[--page N] [--where JSON]: semantic relevance file browsing",
             "- find : folder path is positional; do not put paths in --where",
             "- find --where: exact/canonical metadata DSL filtering using stat --schema fields only",
             "- find -maxdepth N -type f|d: bounded folder traversal for find",
@@ -264,6 +268,86 @@ class PIFSCommandExecutor:
         listing = self.filesystem.browse(path, recursive=True, limit=limit)
         return {"path": path, "depth": depth, "limit": limit, **listing}
 
+    def _cmd_browse(self, args: list[str]) -> Any:
+        """Parse ``browse`` arguments and run a semantic file search.
+
+        Accepts ``-R``/``-r``/``--recursive``, ``--where``, ``--space`` and
+        ``--page``; exactly two positionals (folder path, then query) are
+        required. Raises PIFSCommandError for malformed or unsupported input.
+        """
+        recursive = False
+        where = None
+        space = "summary"
+        page = 1
+        positionals = []
+        i = 0
+        while i < len(args):
+            arg = args[i]
+            if arg in {"-R", "-r", "--recursive"}:
+                recursive = True
+            elif arg == "--where":
+                i += 1
+                if i >= len(args):
+                    raise PIFSCommandError("browse --where requires a JSON value")
+                where = args[i]
+            elif arg == "--space":
+                i += 1
+                if i >= len(args):
+                    raise PIFSCommandError("browse --space requires a value")
+                space = args[i]
+            elif arg == "--page":
+                i += 1
+                if i >= len(args):
+                    raise PIFSCommandError("browse --page requires a value")
+                page = self._parse_non_negative_int(args[i], "browse --page")
+            elif arg in {"--limit", "--offset", "--query"}:
+                raise PIFSCommandError(
+                    f"browse does not support {arg}; use fixed page size "
+                    f"{self.BROWSE_PAGE_SIZE} and --page N"
+                )
+            elif arg.startswith("-"):
+                raise PIFSCommandError(f"Unsupported browse option: {arg}")
+            else:
+                positionals.append(arg)
+            i += 1
+        if len(positionals) < 2:
+            raise PIFSCommandError('browse requires a query: browse ""')
+        if len(positionals) > 2:
+            raise PIFSCommandError(
+                'browse accepts a folder and one quoted query, for example: '
+                'browse /documents "Federal Reserve"'
+            )
+        path, query = positionals
+        if not str(path).startswith("/"):
+            raise PIFSCommandError("browse target must be a PIFS folder path like /documents")
+        query = str(query or "").strip()
+        if not query:
+            raise PIFSCommandError('browse requires a query: browse ""')
+        if page < 1:
+            raise PIFSCommandError("browse --page must be at least 1")
+        if space not in SEMANTIC_RETRIEVAL_CHANNELS:
+            raise PIFSCommandError(
+                "Unsupported browse --space: "
+                f"{space}. Supported spaces: {', '.join(SEMANTIC_RETRIEVAL_CHANNELS)}"
+            )
+        if not self.filesystem.has_semantic_channel(space):
+            available = self.filesystem.semantic_retrieval_channels()
+            available_text = ", ".join(available) if available else "none"
+            raise PIFSCommandError(
+                f"browse --space {space} is not available; available spaces: {available_text}"
+            )
+        normalized = self._normalize_folder_path(path)
+        return self.filesystem.browse_semantic_files(
+            normalized,
+            query,
+            retrieval_query=self._semantic_retrieval_query(query),
+            recursive=recursive,
+            space=space,
+            page=page,
+            page_size=self.BROWSE_PAGE_SIZE,
+            metadata_filter=where,
+        )
+
     def _cmd_find(self, args: list[str]) -> Any:
         path = "/"
         where = None
diff --git a/pageindex/filesystem/core.py b/pageindex/filesystem/core.py
index 72833b7..c20cccf 100644
--- a/pageindex/filesystem/core.py
+++ b/pageindex/filesystem/core.py
@@ -331,6 +331,126 @@ class PageIndexFileSystem:
             max_depth=max_depth,
         )
 
+    def browse_semantic_files(
+        self,
+        path: str,
+        query: str,
+        *,
+        retrieval_query: str | None = None,
+        recursive: bool = False,
+        space: str = "summary",
+        page: int = 1,
+        page_size: int = 10,
+        metadata_filter: Optional[dict[str, Any] | str] = None,
+    ) -> dict[str, Any]:
+        """Return one page of files in a folder ranked by semantic relevance.
+
+        Candidates come from the backend's ``search_channel`` for ``space`` and
+        are de-duplicated by file ref and filtered by scope and metadata. One
+        row beyond the requested page is collected to compute ``has_more``.
+        Raises ValueError for an empty query, ``page``/``page_size`` below 1,
+        or an unsupported/unavailable ``space``.
+        """
+        path = normalize_path(path)
+        self.store.folder_info(path)
+        query_text = self._query_text(retrieval_query or query).strip()
+        if not query_text:
+            raise ValueError("browse requires a query")
+        if page < 1:
+            raise ValueError("browse --page must be at least 1")
+        if page_size < 1:
+            raise ValueError("browse page_size must be at least 1")
+        if space not in SEMANTIC_RETRIEVAL_CHANNELS:
+            raise ValueError(
+                "Unsupported browse --space: "
+                f"{space}. Supported spaces: {', '.join(SEMANTIC_RETRIEVAL_CHANNELS)}"
+            )
+        available_spaces = self.semantic_retrieval_channels()
+        if space not in available_spaces:
+            available = ", ".join(available_spaces) if available_spaces else "none"
+            raise ValueError(
+                f"browse --space {space} is not available; available spaces: {available}"
+            )
+        search_channel = getattr(self.semantic_retrieval_backend, "search_channel", None)
+        if search_channel is None:
+            available = ", ".join(available_spaces) if available_spaces else "none"
+            raise ValueError(
+                f"browse --space {space} is not available; available spaces: {available}"
+            )
+        parsed_filter = self.metadata.parse_filter(metadata_filter)
+        scope = {"folder_path": path, "recursive": recursive}
+        offset = (page - 1) * page_size
+        needed = offset + page_size + 1
+        fetch_limit = max(needed * 10, 50)
+        candidates = search_channel(
+            space,
+            query_text,
+            limit=fetch_limit,
+            filters=self._semantic_filters_for_scope(scope),
+        )
+        rows: list[dict[str, Any]] = []
+        seen: set[str] = set()
+        for candidate in candidates:
+            try:
+                file_ref = self.store.resolve_file_ref(candidate.document_id)
+            except KeyError:
+                continue
+            if file_ref in seen:
+                continue
+            if not self.store.file_matches(
+                file_ref,
+                scope=scope,
+                metadata_filter=parsed_filter,
+            ):
+                continue
+            seen.add(file_ref)
+            entry = self.store.get_file(file_ref)
+            folder_paths = [
+                folder["path"]
+                for folder in self.store.folder_memberships(file_ref)
+            ]
+            rank = len(rows) + 1
+            rows.append(
+                {
+                    "rank": rank,
+                    "similarity": self._semantic_candidate_similarity(candidate),
+                    "score": self._semantic_candidate_score(candidate),
+                    "path": self._stable_file_locator(file_ref, entry),
+                    "file_ref": file_ref,
+                    "document_id": entry.external_id,
+                    "external_id": entry.external_id,
+                    "title": entry.title,
+                    "source_path": entry.source_path,
+                    "folder_path": self._preferred_folder_path(
+                        folder_paths,
+                        path,
+                        entry.folder_path,
+                    ),
+                    "folder_paths": folder_paths,
+                    "summary": str((entry.metadata or {}).get("summary") or ""),
+                    "snippet": str(getattr(candidate, "snippet", "") or entry.descriptor),
+                    "metadata": entry.metadata,
+                    "metadata_status": entry.metadata_status,
+                    "sources": list(getattr(candidate, "sources", []) or []),
+                }
+            )
+            if len(rows) >= needed:
+                break
+        page_rows = rows[offset : offset + page_size]
+        return {
+            "mode": "files",
+            "retrieval": f"{space}_vector",
+            "query": query,
+            "scope": path,
+            "recursive": recursive,
+            "space": space,
+            "available_spaces": list(available_spaces),
+            "page": page,
+            "page_size": page_size,
+            "has_more": len(rows) > offset + page_size,
+            "data": page_rows,
+        }
+
     def folder_info(self, path: str = "/") -> dict[str, Any]:
         return self.store.folder_info(path)
 
@@ -1515,6 +1635,69 @@ class PageIndexFileSystem:
                 break
         return results
 
+    @staticmethod
+    def _semantic_candidate_score(candidate: Any) -> float | None:
+        """Return ``candidate.score`` as a float, or None when unusable.
+
+        A missing, non-numeric, or NaN score counts as unusable.
+        """
+        import math
+
+        try:
+            score = float(getattr(candidate, "score"))
+        except (AttributeError, TypeError, ValueError):
+            return None
+        # NaN would defeat min/max clamping downstream, so treat it as missing.
+        return None if math.isnan(score) else score
+
+    @classmethod
+    def _semantic_candidate_similarity(cls, candidate: Any) -> float:
+        """Map a candidate to a similarity in [0, 1], rounded to 4 places.
+
+        Uses ``1 / (1 + d)`` for the smallest source distance ``d`` (negative
+        values clamped to 0); otherwise falls back to the clamped score, or
+        0.0 when no score is available. NaN distances are ignored.
+        """
+        import math
+
+        distances: list[float] = []
+        for source in getattr(candidate, "sources", []) or []:
+            if not isinstance(source, dict) or source.get("distance") is None:
+                continue
+            try:
+                distance = float(source["distance"])
+            except (TypeError, ValueError):
+                continue
+            # NaN compares false to everything and would clamp to 1.0.
+            if not math.isnan(distance):
+                distances.append(distance)
+        if distances:
+            nearest = max(min(distances), 0.0)
+            return round(max(0.0, min(1.0, 1.0 / (1.0 + nearest))), 4)
+        score = cls._semantic_candidate_score(candidate)
+        if score is None:
+            return 0.0
+        return round(max(0.0, min(1.0, score)), 4)
+
+    def _stable_file_locator(self, file_ref: str, entry: Any) -> str:
+        """Pick the locator reported as a row's ``path``.
+
+        Prefers the slash-prefixed source path when it resolves back to
+        ``file_ref``, then the entry's external id, and finally ``file_ref``.
+        """
+        source_path = str(getattr(entry, "source_path", "") or "").strip()
+        if source_path:
+            target = "/" + source_path.strip("/")
+            try:
+                if self.store.resolve_file_ref(target) == file_ref:
+                    return target
+            except KeyError:
+                pass
+        external_id = str(getattr(entry, "external_id", "") or "").strip()
+        if external_id:
+            return external_id
+        return file_ref
+
     @staticmethod
     def _build_descriptor(title: str, metadata: dict[str, Any]) -> str:
         source = metadata.get("source_type") or metadata.get("repo") or metadata.get("channel")
diff --git a/tests/test_pageindex_filesystem_scope.py b/tests/test_pageindex_filesystem_scope.py
index 087473a..b18270c 100644
--- a/tests/test_pageindex_filesystem_scope.py
+++ b/tests/test_pageindex_filesystem_scope.py
@@ -69,6 +69,212 @@ class ChannelBackend:
         ]
 
 
+class BrowseBackend:
+    def __init__(self, document_ids, channels=("summary",)):
+        self.document_ids = list(document_ids)
+        self.channels = channels
+        self.calls = []
+
+    def available_channels(self):
+        return self.channels
+
+    def search_channel(self, channel, query, *, limit=10, filters=None):
+        self.calls.append((channel, query, limit, filters))
+        return [
+            SimpleNamespace(
+                document_id=document_id,
+                snippet=f"{channel} candidate {rank}: {query}",
+                score=1.0 - rank * 0.01,
+                sources=[{"channel": channel, "rank": rank, "distance": rank / 10}],
+            )
+            for rank, document_id in enumerate(self.document_ids[:limit], 1)
+        ]
+
+
+def _register_browse_file(filesystem, external_id, folder_path, *, department="ops"):
+    from pageindex.filesystem.metadata_generation import MetadataGenerationResult
+
+    class SummaryGenerator:
+        def generate(self, document, *, fields):
+            values = {
+                "summary": f"summary for {document.external_id}",
+                "doc_type": "memo",
+                "domain": "finance",
+                "topic": "risk",
+            }
+            return MetadataGenerationResult(
+                values={field: values[field] for field in fields if field in values}
+            )
+
+    filesystem.metadata_generator = SummaryGenerator()
+    return filesystem.register_file(
+        storage_uri=f"file:///tmp/{external_id}.pdf",
+        source_path=f"documents/{external_id}.pdf",
+        folder_path=folder_path,
+        external_id=external_id,
+        title=f"{external_id}.pdf",
+        content=f"{external_id} discusses vector databases and retrieval.",
+        metadata={"department": department},
+        metadata_policy={
+            "fields": {
+                "summary": True,
+                "doc_type": False,
+                "domain": False,
+                "topic": False,
+            }
+        },
+    )
+
+
+def test_browse_is_agent_visible_semantic_command(tmp_path):
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+
+    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
+    executor = PIFSCommandExecutor(filesystem)
+
+    assert "browse" in executor.allowed_commands()
+    assert 'browse [-R] ""' in executor.describe_available_command_surfaces()
+
+
+def test_browse_requires_positional_query_and_rejects_removed_options(tmp_path):
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.commands import PIFSCommandError
+
+    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
+    _register_browse_file(filesystem, "doc_direct", "/documents")
+    filesystem.semantic_retrieval_backend = BrowseBackend(["doc_direct"])
+    executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+    with pytest.raises(PIFSCommandError, match="browse requires a query"):
+        executor.execute("browse /documents")
+    with pytest.raises(PIFSCommandError, match="--query"):
+        executor.execute('browse /documents "vector database" --query "other"')
+    with pytest.raises(PIFSCommandError, match="--limit"):
+        executor.execute('browse /documents "vector database" --limit 10')
+    with pytest.raises(PIFSCommandError, match="--offset"):
+        executor.execute('browse /documents "vector database" --offset 10')
+    with pytest.raises(PIFSCommandError, match="browse accepts a folder and one quoted query"):
+        executor.execute("browse /documents vector database")
+
+
+def test_browse_validates_space_availability_and_page(tmp_path):
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.commands import PIFSCommandError
+
+    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
+    _register_browse_file(filesystem, "doc_direct", "/documents")
+    filesystem.semantic_retrieval_backend = BrowseBackend(["doc_direct"], channels=("summary",))
+    executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+    with pytest.raises(PIFSCommandError, match="Unsupported browse --space: hybrid"):
+        executor.execute('browse /documents "vector database" --space hybrid')
+    with pytest.raises(PIFSCommandError, match="available spaces: summary"):
+        executor.execute('browse /documents "vector database" --space entity')
+    with pytest.raises(PIFSCommandError, match="browse --page must be at least 1"):
+        executor.execute('browse /documents "vector database" --page 0')
+
+
+def test_browse_default_summary_does_not_fallback_to_other_spaces(tmp_path):
+    import json
+
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.commands import PIFSCommandError
+
+    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
+    _register_browse_file(filesystem, "doc_direct", "/documents")
+    backend = BrowseBackend(["doc_direct"], channels=("entity",))
+    filesystem.semantic_retrieval_backend = backend
+    executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+    with pytest.raises(PIFSCommandError, match="available spaces: entity"):
+        executor.execute('browse /documents "vector database"')
+    assert backend.calls == []
+
+    result = json.loads(
+        executor.execute('browse /documents "vector database" --space entity')
+    )["data"]
+    assert [item["document_id"] for item in result["data"]] == ["doc_direct"]
+    assert backend.calls[-1][0] == "entity"
+
+
+def test_browse_non_recursive_searches_only_direct_files_and_recursive_is_global(tmp_path):
+    import json
+
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+
+    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
+    _register_browse_file(filesystem, "doc_direct", "/documents")
+    _register_browse_file(filesystem, "doc_deep", "/documents/reports")
+    backend = BrowseBackend(["doc_deep", "doc_direct"])
+    filesystem.semantic_retrieval_backend = backend
+    executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+    direct = json.loads(executor.execute('browse /documents "vector database"'))["data"]
+    assert [item["document_id"] for item in direct["data"]] == ["doc_direct"]
+    assert direct["recursive"] is False
+    assert direct["space"] == "summary"
+    assert direct["page"] == 1
+    assert direct["page_size"] == 10
+    assert backend.calls[-1][0] == "summary"
+
+    recursive = json.loads(executor.execute('browse -R /documents "vector database"'))["data"]
+    assert [item["document_id"] for item in recursive["data"]] == [
+        "doc_deep",
+        "doc_direct",
+    ]
+    assert [item["rank"] for item in recursive["data"]] == [1, 2]
+    assert recursive["recursive"] is True
+
+
+def test_browse_supports_fixed_size_one_based_pagination_and_metadata_filter(tmp_path):
+    import json
+
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+
+    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
+    document_ids = []
+    for index in range(12):
+        external_id = f"doc_{index:02d}"
+        document_ids.append(external_id)
+        department = "finance" if index == 10 else "ops"
+        _register_browse_file(filesystem, external_id, "/documents", department=department)
+    filesystem.semantic_retrieval_backend = BrowseBackend(document_ids)
+    executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+    first_page = json.loads(executor.execute('browse /documents "vector database"'))["data"]
+    assert len(first_page["data"]) == 10
+    assert first_page["has_more"] is True
+    assert first_page["data"][0]["rank"] == 1
+
+    second_page = json.loads(
+        executor.execute('browse /documents "vector database" --page 2')
+    )["data"]
+    assert [item["document_id"] for item in second_page["data"]] == ["doc_10", "doc_11"]
+    assert [item["rank"] for item in second_page["data"]] == [11, 12]
+    assert second_page["has_more"] is False
+
+    filtered = json.loads(
+        executor.execute(
+            'browse /documents "vector database" --where \'{"department":"finance"}\''
+        )
+    )["data"]
+    assert [item["document_id"] for item in filtered["data"]] == ["doc_10"]
+    assert filtered["data"][0]["summary"] == "summary for doc_10"
+
+
+def test_browse_similarity_treats_nan_as_missing():
+    from pageindex.filesystem import PageIndexFileSystem
+
+    nan = float("nan")
+    score = PageIndexFileSystem._semantic_candidate_score
+    similarity = PageIndexFileSystem._semantic_candidate_similarity
+
+    assert score(SimpleNamespace(score=nan)) is None
+    assert similarity(SimpleNamespace(score=nan, sources=[])) == 0.0
+    assert similarity(SimpleNamespace(score=0.5, sources=[{"distance": nan}])) == 0.5
+    assert similarity(SimpleNamespace(score=0.9, sources=[{"distance": 1.0}])) == 0.5
+
+
 def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
     from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
     from pageindex.filesystem.metadata_generation import MetadataGenerationResult