diff --git a/pageindex/filesystem/commands.py b/pageindex/filesystem/commands.py
index 7c2d91e..8e99321 100644
--- a/pageindex/filesystem/commands.py
+++ b/pageindex/filesystem/commands.py
@@ -1155,6 +1155,8 @@ class PIFSCommandExecutor:
             return self._render_listing(data)
         if command_name == "tree":
             return self._render_tree(data)
+        if command_name == "browse":
+            return self._render_browse(data)
         if command_name in {"grep", "semantic-grep"}:
             return self._render_grep(data)
         if command_name in {"search-summary", "search-entity", "search-relation"}:
@@ -1301,6 +1303,113 @@ class PIFSCommandExecutor:
             lines.append("")
         return "\n".join(lines).rstrip()
 
+    def _render_browse(self, data: Any) -> str:
+        """Render a ``browse`` payload as fixed per-result text blocks.
+
+        The first line is a ``# page=... page_size=... has_more=...`` header.
+        Each result then contributes ``rank``/``similarity``/``path``/
+        ``summary`` lines, with a blank line between results. When more pages
+        exist, a trailing ``# next:`` line carries the command for the next
+        page. Non-dict payloads are returned as ``str(data)``.
+        """
+        if not isinstance(data, dict):
+            return str(data)
+        page = self._coerce_positive_int(data.get("page"), default=1)
+        page_size = self._coerce_positive_int(
+            data.get("page_size"),
+            default=self.BROWSE_PAGE_SIZE,
+        )
+        has_more = bool(data.get("has_more"))
+        lines = [
+            f"# page={page} page_size={page_size} "
+            f"has_more={'true' if has_more else 'false'}"
+        ]
+        results = data.get("data") or []
+        for index, item in enumerate(results):
+            if index:
+                lines.append("")
+            # Non-dict entries render as a placeholder block instead of raising.
+            item = item if isinstance(item, dict) else {}
+            lines.extend(
+                [
+                    f"rank: {item.get('rank') or index + 1}",
+                    f"similarity: {self._format_similarity(item.get('similarity'))}",
+                    f"path: {self._browse_result_path(item)}",
+                    "summary: "
+                    f"{self._compact_text(self._one_line_value(item.get('summary')), max_chars=240)}",
+                ]
+            )
+        if has_more:
+            if results:
+                lines.append("")
+            lines.append(f"# next: {self._browse_next_command(data, page=page)}")
+        return "\n".join(lines).rstrip()
+
+    @staticmethod
+    def _coerce_positive_int(value: Any, *, default: int) -> int:
+        """Return ``value`` as an int >= 1, or ``default`` if that is impossible."""
+        try:
+            parsed = int(value)
+        except (TypeError, ValueError, OverflowError):
+            # OverflowError covers ``int(float("inf"))``.
+            return default
+        return parsed if parsed >= 1 else default
+
+    @staticmethod
+    def _format_similarity(value: Any) -> str:
+        """Format a similarity score as a two-decimal string clamped to [0, 1].
+
+        Missing, non-numeric, and NaN values render as ``0.00``.
+        """
+        try:
+            similarity = float(value)
+        except (TypeError, ValueError, OverflowError):
+            similarity = 0.0
+        # NaN fails every comparison, so ``min(1.0, nan)`` would return 1.0 and
+        # present an undefined score as a perfect match.
+        if similarity != similarity:
+            similarity = 0.0
+        similarity = max(0.0, min(1.0, similarity))
+        return f"{similarity:.2f}"
+
+    @staticmethod
+    def _browse_result_path(item: dict[str, Any]) -> str:
+        """Return the first truthy locator field of a result, or ``-`` if none."""
+        return str(
+            item.get("path")
+            or item.get("document_id")
+            or item.get("external_id")
+            or item.get("file_ref")
+            or "-"
+        )
+
+    def _browse_next_command(self, data: dict[str, Any], *, page: int) -> str:
+        """Build the shell-quoted ``browse`` command for the page after ``page``.
+
+        The scope, query, ``--space`` (when not ``summary``) and ``--where``
+        values are taken from the payload so the next page repeats the same
+        search.
+        """
+        parts = ["browse"]
+        if data.get("recursive"):
+            parts.append("-R")
+        parts.append(shlex.quote(str(data.get("scope") or "/")))
+        parts.append(shlex.quote(str(data.get("query") or "")))
+        space = str(data.get("space") or "summary")
+        if space != "summary":
+            parts.extend(["--space", shlex.quote(space)])
+        if data.get("where") is not None:
+            parts.extend(["--where", shlex.quote(self._browse_where_text(data["where"]))])
+        parts.extend(["--page", str(page + 1)])
+        return " ".join(parts)
+
+    @staticmethod
+    def _browse_where_text(where: Any) -> str:
+        """Return ``where`` unchanged if it is a string, else as compact JSON."""
+        if isinstance(where, str):
+            return where
+        return json.dumps(where, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
+
     def _render_find(self, data: Any) -> str:
         if not isinstance(data, list):
             return str(data)
diff --git a/pageindex/filesystem/core.py b/pageindex/filesystem/core.py
index 78c1cec..c977c40 100644
--- a/pageindex/filesystem/core.py
+++ b/pageindex/filesystem/core.py
@@ -441,7 +441,7 @@ class PageIndexFileSystem:
                 if len(rows) >= needed:
                     break
         page_rows = rows[offset : offset + page_size]
-        return {
+        payload = {
             "mode": "files",
             "retrieval": f"{space}_vector",
             "query": query,
@@ -454,6 +454,9 @@ class PageIndexFileSystem:
             "has_more": len(rows) > offset + page_size,
             "data": page_rows,
         }
+        if metadata_filter is not None:
+            payload["where"] = self._metadata_filter_payload(metadata_filter)
+        return payload
 
     def folder_info(self, path: str = "/") -> dict[str, Any]:
         return self.store.folder_info(path)
@@ -1664,6 +1667,18 @@ class PageIndexFileSystem:
             return 0.0
         return round(max(0.0, min(1.0, score)), 4)
 
+    @staticmethod
+    def _metadata_filter_payload(metadata_filter: Any) -> str:
+        """Return the filter as text: strings as-is, anything else as compact JSON."""
+        if isinstance(metadata_filter, str):
+            return metadata_filter
+        return json.dumps(
+            metadata_filter,
+            ensure_ascii=False,
+            sort_keys=True,
+            separators=(",", ":"),
+        )
+
     def _stable_file_locator(self, file_ref: str, entry: Any) -> str:
         source_path = str(getattr(entry, "source_path", "") or "").strip()
         if source_path:
diff --git a/tests/test_pageindex_filesystem_scope.py b/tests/test_pageindex_filesystem_scope.py
index 04ce084..6973c5a 100644
--- a/tests/test_pageindex_filesystem_scope.py
+++ b/tests/test_pageindex_filesystem_scope.py
@@ -106,13 +106,22 @@ class BrowseBackend:
         ]
 
 
-def _register_browse_file(filesystem, external_id, folder_path, *, department="ops"):
+def _register_browse_file(
+    filesystem,
+    external_id,
+    folder_path,
+    *,
+    department="ops",
+    summary=None,
+):
     from pageindex.filesystem.metadata_generation import MetadataGenerationResult
 
     class SummaryGenerator:
         def generate(self, document, *, fields):
             values = {
-                "summary": f"summary for {document.external_id}",
+                "summary": summary
+                if summary is not None
+                else f"summary for {document.external_id}",
                 "doc_type": "memo",
                 "domain": "finance",
                 "topic": "risk",
@@ -320,6 +329,132 @@ def test_browse_scopes_semantic_search_before_candidate_limit(tmp_path):
     ]
 
 
+def test_browse_shell_output_uses_fixed_blocks_with_pagination_command(tmp_path):
+    import re
+
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+
+    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
+    document_ids = []
+    for index in range(12):
+        external_id = f"doc_{index:02d}"
+        document_ids.append(external_id)
+        _register_browse_file(
+            filesystem,
+            external_id,
+            "/documents",
+            department="finance",
+            summary=(
+                "first line\nsecond\tline with spaces"
+                if index == 0
+                else f"summary for {external_id}"
+            ),
+        )
+    filesystem.semantic_retrieval_backend = BrowseBackend(
+        document_ids,
+        channels=("summary", "entity"),
+    )
+    executor = PIFSCommandExecutor(filesystem)
+
+    rendered = executor.execute(
+        'browse -R /documents "vector database" --space entity '
+        '--where \'{"department":"finance"}\''
+    )
+    lines = rendered.splitlines()
+
+    assert lines[:6] == [
+        "# page=1 page_size=10 has_more=true",
+        "rank: 1",
+        "similarity: 0.91",
+        "path: /documents/doc_00.txt",
+        "summary: first line second line with spaces",
+        "",
+    ]
+    assert lines[6:10] == [
+        "rank: 2",
+        "similarity: 0.83",
+        "path: /documents/doc_01.txt",
+        "summary: summary for doc_01",
+    ]
+    similarity_lines = [line for line in lines if line.startswith("similarity: ")]
+    assert len(similarity_lines) == 10
+    assert all(re.fullmatch(r"similarity: [01]\.\d{2}", line) for line in similarity_lines)
+    assert all(0.0 <= float(line.removeprefix("similarity: ")) <= 1.0 for line in similarity_lines)
+    assert lines[-1] == (
+        "# next: browse -R /documents 'vector database' --space entity "
+        '--where \'{"department":"finance"}\' --page 2'
+    )
+    assert "mode:" not in rendered
+    assert "data:" not in rendered
+    assert "score:" not in rendered
+
+
+def test_browse_shell_formatters_tolerate_non_finite_values():
+    from pageindex.filesystem import PIFSCommandExecutor
+
+    assert PIFSCommandExecutor._format_similarity(float("nan")) == "0.00"
+    assert PIFSCommandExecutor._format_similarity(float("inf")) == "1.00"
+    assert PIFSCommandExecutor._coerce_positive_int(float("inf"), default=7) == 7
+
+
+def test_browse_shell_path_falls_back_to_unique_locator_when_source_collides(tmp_path):
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.metadata_generation import MetadataGenerationResult
+
+    class SummaryGenerator:
+        def generate(self, document, *, fields):
+            return MetadataGenerationResult(
+                values={"summary": f"summary for {document.external_id}"}
+            )
+
+    filesystem = PageIndexFileSystem(
+        workspace=tmp_path / "workspace",
+        metadata_generator=SummaryGenerator(),
+    )
+    first_ref = filesystem.register_file(
+        storage_uri="file:///tmp/first.json",
+        source_path="shared/source.json",
+        folder_path="/documents",
+        external_id="dsid_first",
+        title="First",
+        content="first content",
+        metadata_policy={
+            "fields": {
+                "summary": True,
+                "doc_type": False,
+                "domain": False,
+                "topic": False,
+            }
+        },
+    )
+    filesystem.register_file(
+        storage_uri="file:///tmp/second.json",
+        source_path="shared/source.json",
+        folder_path="/documents",
+        external_id="dsid_second",
+        title="Second",
+        content="second content",
+        metadata_policy={
+            "fields": {
+                "summary": True,
+                "doc_type": False,
+                "domain": False,
+                "topic": False,
+            }
+        },
+    )
+    filesystem.semantic_retrieval_backend = BrowseBackend(["dsid_first"])
+    executor = PIFSCommandExecutor(filesystem)
+
+    rendered = executor.execute('browse /documents "first"')
+
+    assert "path: dsid_first" in rendered
+    assert "path: /shared/source.json" not in rendered
+    assert filesystem.store.resolve_file_ref("dsid_first") == first_ref
+    with pytest.raises(KeyError, match="Ambiguous file target"):
+        filesystem.store.resolve_file_ref("/shared/source.json")
+
+
 def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
     from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
     from pageindex.filesystem.metadata_generation import MetadataGenerationResult