feat(pifs): add semantic browse command

This commit is contained in:
BukeLy 2026-05-31 17:17:43 +08:00
parent 8071889508
commit ba821a70b9
3 changed files with 422 additions and 0 deletions

View file

@ -23,6 +23,7 @@ class PIFSCommandExecutor:
"tree",
"find",
"grep",
"browse",
"cat",
"stat",
"head",
@ -53,6 +54,7 @@ class PIFSCommandExecutor:
MAX_FIND_LIMIT = 50
MAX_GREP_LIMIT = 20
MAX_SEMANTIC_LIMIT = 20
BROWSE_PAGE_SIZE = 10
MAX_TEXT_LINES = 100
MAX_PAGE_SPAN = 5
MAX_STRUCTURE_NODES = 25
@ -102,6 +104,8 @@ class PIFSCommandExecutor:
"Available command surfaces for this workspace:",
"- mode: read-only inspection",
"- ls/tree: folder browsing",
'- browse [-R] <folder> "<query>" [--space summary|entity|relation] '
"[--page N] [--where JSON]: semantic relevance file browsing",
"- find <folder>: folder path is positional; do not put paths in --where",
"- find --where: exact/canonical metadata DSL filtering using stat --schema fields only",
"- find <folder> -maxdepth N -type f|d: bounded folder traversal for find",
@ -264,6 +268,80 @@ class PIFSCommandExecutor:
listing = self.filesystem.browse(path, recursive=True, limit=limit)
return {"path": path, "depth": depth, "limit": limit, **listing}
def _cmd_browse(self, args: list[str]) -> Any:
    """Run ``browse [-R] <folder> "<query>"`` against the semantic index.

    Recognised options are ``-R``/``-r``/``--recursive``, ``--where JSON``,
    ``--space NAME`` (default ``summary``) and ``--page N`` (default 1).
    ``--limit``, ``--offset`` and ``--query`` are rejected explicitly because
    the page size is fixed at ``BROWSE_PAGE_SIZE``.

    Args:
        args: Already-tokenised command arguments (without the command name).

    Returns:
        Whatever ``self.filesystem.browse_semantic_files`` returns.

    Raises:
        PIFSCommandError: On a missing option value, an unknown option, a
            wrong number of positionals, a folder not starting with ``/``,
            a blank query, a page below 1, or an unsupported/unavailable
            space.
    """
    descend = False
    raw_filter = None
    channel = "summary"
    page_number = 1
    free_args = []

    # Sentinel distinguishes "no more tokens" from any real token value.
    exhausted = object()
    tokens = iter(args)
    for token in tokens:
        if token in {"-R", "-r", "--recursive"}:
            descend = True
            continue
        if token == "--where":
            raw_filter = next(tokens, exhausted)
            if raw_filter is exhausted:
                raise PIFSCommandError("browse --where requires a JSON value")
            continue
        if token == "--space":
            channel = next(tokens, exhausted)
            if channel is exhausted:
                raise PIFSCommandError("browse --space requires a value")
            continue
        if token == "--page":
            raw_page = next(tokens, exhausted)
            if raw_page is exhausted:
                raise PIFSCommandError("browse --page requires a value")
            page_number = self._parse_non_negative_int(raw_page, "browse --page")
            continue
        if token in {"--limit", "--offset", "--query"}:
            raise PIFSCommandError(
                f"browse does not support {token}; use fixed page size "
                f"{self.BROWSE_PAGE_SIZE} and --page N"
            )
        if token.startswith("-"):
            raise PIFSCommandError(f"Unsupported browse option: {token}")
        free_args.append(token)

    positional_count = len(free_args)
    if positional_count < 2:
        raise PIFSCommandError('browse requires a query: browse <folder> "<query>"')
    if positional_count > 2:
        # Usually an unquoted multi-word query that was split into several tokens.
        raise PIFSCommandError(
            'browse accepts a folder and one quoted query, for example: '
            'browse /documents "Federal Reserve"'
        )

    folder, query_text = free_args
    if not str(folder).startswith("/"):
        raise PIFSCommandError("browse target must be a PIFS folder path like /documents")
    query_text = str(query_text or "").strip()
    if not query_text:
        raise PIFSCommandError('browse requires a query: browse <folder> "<query>"')
    if page_number < 1:
        raise PIFSCommandError("browse --page must be at least 1")
    if channel not in SEMANTIC_RETRIEVAL_CHANNELS:
        raise PIFSCommandError(
            "Unsupported browse --space: "
            f"{channel}. Supported spaces: {', '.join(SEMANTIC_RETRIEVAL_CHANNELS)}"
        )
    # Fail here, before the filesystem call, when the space is known but not served.
    if not self.filesystem.has_semantic_channel(channel):
        served = self.filesystem.semantic_retrieval_channels()
        served_text = ", ".join(served) if served else "none"
        raise PIFSCommandError(
            f"browse --space {channel} is not available; available spaces: {served_text}"
        )

    return self.filesystem.browse_semantic_files(
        self._normalize_folder_path(folder),
        query_text,
        retrieval_query=self._semantic_retrieval_query(query_text),
        recursive=descend,
        space=channel,
        page=page_number,
        page_size=self.BROWSE_PAGE_SIZE,
        metadata_filter=raw_filter,
    )
def _cmd_find(self, args: list[str]) -> Any:
path = "/"
where = None

View file

@ -331,6 +331,118 @@ class PageIndexFileSystem:
max_depth=max_depth,
)
def browse_semantic_files(
    self,
    path: str,
    query: str,
    *,
    retrieval_query: str | None = None,
    recursive: bool = False,
    space: str = "summary",
    page: int = 1,
    page_size: int = 10,
    metadata_filter: Optional[dict[str, Any] | str] = None,
) -> dict[str, Any]:
    """Return one page of files under ``path`` ranked by a semantic channel.

    Candidates come from the backend's ``search_channel`` for ``space``; each
    is resolved to a file reference, de-duplicated, checked with
    ``store.file_matches`` against the folder scope and metadata filter, and
    numbered with a 1-based ``rank`` that counts accepted files only.

    Args:
        path: Folder to browse; normalised with ``normalize_path``.
        query: Query echoed back in the result.
        retrieval_query: Text actually sent to the backend; falls back to
            ``query`` when falsy.
        recursive: Stored in the scope passed to the backend filters and to
            ``store.file_matches``.
        space: Semantic channel name; must be in ``SEMANTIC_RETRIEVAL_CHANNELS``
            and in ``self.semantic_retrieval_channels()``.
        page: 1-based page number.
        page_size: Number of rows per page.
        metadata_filter: Raw filter handed to ``self.metadata.parse_filter``.

    Returns:
        A dict with the keys ``mode``, ``retrieval``, ``query``, ``scope``,
        ``recursive``, ``space``, ``available_spaces``, ``page``,
        ``page_size``, ``has_more`` and ``data`` (the rows of this page).

    Raises:
        ValueError: On a blank query, ``page``/``page_size`` below 1, an
            unsupported or unavailable space, or a backend without
            ``search_channel``.
    """
    path = normalize_path(path)
    # Result is discarded; presumably this raises for an unknown folder — confirm in the store.
    self.store.folder_info(path)

    query_text = self._query_text(retrieval_query or query).strip()
    if not query_text:
        raise ValueError("browse requires a query")
    if page < 1:
        raise ValueError("browse --page must be at least 1")
    if page_size < 1:
        raise ValueError("browse page_size must be at least 1")
    if space not in SEMANTIC_RETRIEVAL_CHANNELS:
        raise ValueError(
            "Unsupported browse --space: "
            f"{space}. Supported spaces: {', '.join(SEMANTIC_RETRIEVAL_CHANNELS)}"
        )

    live_spaces = self.semantic_retrieval_channels()

    def space_unavailable() -> ValueError:
        # Shared by both "not available" exits below, which use the same message.
        listing = ", ".join(live_spaces) if live_spaces else "none"
        return ValueError(
            f"browse --space {space} is not available; available spaces: {listing}"
        )

    if space not in live_spaces:
        raise space_unavailable()
    channel_search = getattr(self.semantic_retrieval_backend, "search_channel", None)
    if channel_search is None:
        raise space_unavailable()

    filter_spec = self.metadata.parse_filter(metadata_filter)
    scope = {"folder_path": path, "recursive": recursive}

    skip = (page - 1) * page_size
    page_end = skip + page_size
    # One row beyond the page is collected so ``has_more`` can be answered.
    wanted = page_end + 1
    # Over-fetch: candidates are dropped below by de-duplication and file_matches.
    hits = channel_search(
        space,
        query_text,
        limit=max(wanted * 10, 50),
        filters=self._semantic_filters_for_scope(scope),
    )

    ranked: list[dict[str, Any]] = []
    emitted: set[str] = set()
    for hit in hits:
        try:
            file_ref = self.store.resolve_file_ref(hit.document_id)
        except KeyError:
            # Candidate points at a document the store cannot resolve; skip it.
            continue
        if file_ref in emitted or not self.store.file_matches(
            file_ref,
            scope=scope,
            metadata_filter=filter_spec,
        ):
            continue
        emitted.add(file_ref)

        entry = self.store.get_file(file_ref)
        memberships = [
            membership["path"]
            for membership in self.store.folder_memberships(file_ref)
        ]
        row = {
            "rank": len(ranked) + 1,
            "similarity": self._semantic_candidate_similarity(hit),
            "score": self._semantic_candidate_score(hit),
            "path": self._stable_file_locator(file_ref, entry),
            "file_ref": file_ref,
            "document_id": entry.external_id,
            "external_id": entry.external_id,
            "title": entry.title,
            "source_path": entry.source_path,
            "folder_path": self._preferred_folder_path(
                memberships,
                path,
                entry.folder_path,
            ),
            "folder_paths": memberships,
            "summary": str((entry.metadata or {}).get("summary") or ""),
            "snippet": str(getattr(hit, "snippet", "") or entry.descriptor),
            "metadata": entry.metadata,
            "metadata_status": entry.metadata_status,
            "sources": list(getattr(hit, "sources", []) or []),
        }
        ranked.append(row)
        if len(ranked) >= wanted:
            break

    return {
        "mode": "files",
        "retrieval": f"{space}_vector",
        "query": query,
        "scope": path,
        "recursive": recursive,
        "space": space,
        "available_spaces": list(live_spaces),
        "page": page,
        "page_size": page_size,
        "has_more": len(ranked) > page_end,
        "data": ranked[skip:page_end],
    }
def folder_info(self, path: str = "/") -> dict[str, Any]:
    """Return the store's ``folder_info`` result for ``path`` (default ``/``)."""
    info = self.store.folder_info(path)
    return info
@ -1515,6 +1627,45 @@ class PageIndexFileSystem:
break
return results
@staticmethod
def _semantic_candidate_score(candidate: Any) -> float | None:
try:
return float(getattr(candidate, "score"))
except (AttributeError, TypeError, ValueError):
return None
@classmethod
def _semantic_candidate_similarity(cls, candidate: Any) -> float:
distances: list[float] = []
for source in getattr(candidate, "sources", []) or []:
if not isinstance(source, dict) or source.get("distance") is None:
continue
try:
distances.append(float(source["distance"]))
except (TypeError, ValueError):
continue
if distances:
distance = max(min(distances), 0.0)
return round(max(0.0, min(1.0, 1.0 / (1.0 + distance))), 4)
score = cls._semantic_candidate_score(candidate)
if score is None:
return 0.0
return round(max(0.0, min(1.0, score)), 4)
def _stable_file_locator(self, file_ref: str, entry: Any) -> str:
source_path = str(getattr(entry, "source_path", "") or "").strip()
if source_path:
target = "/" + source_path.strip("/")
try:
if self.store.resolve_file_ref(target) == file_ref:
return target
except KeyError:
pass
external_id = str(getattr(entry, "external_id", "") or "").strip()
if external_id:
return external_id
return file_ref
@staticmethod
def _build_descriptor(title: str, metadata: dict[str, Any]) -> str:
source = metadata.get("source_type") or metadata.get("repo") or metadata.get("channel")

View file

@ -69,6 +69,199 @@ class ChannelBackend:
]
class BrowseBackend:
    """Test double for a semantic backend: canned candidates plus a call log."""

    def __init__(self, document_ids, channels=("summary",)):
        self.document_ids = list(document_ids)
        self.channels = channels
        # Every search_channel invocation is appended as (channel, query, limit, filters).
        self.calls = []

    def available_channels(self):
        """Return the channels this backend was configured with."""
        return self.channels

    def search_channel(self, channel, query, *, limit=10, filters=None):
        """Record the call and return up to ``limit`` candidates in stored order."""
        self.calls.append((channel, query, limit, filters))
        hits = []
        for position, document_id in enumerate(self.document_ids[:limit], start=1):
            hit = SimpleNamespace(
                document_id=document_id,
                snippet=f"{channel} candidate {position}: {query}",
                score=1.0 - position * 0.01,
                sources=[
                    {"channel": channel, "rank": position, "distance": position / 10}
                ],
            )
            hits.append(hit)
        return hits
def _register_browse_file(filesystem, external_id, folder_path, *, department="ops"):
    """Register one test document in ``folder_path`` and return the result.

    Installs a stub metadata generator on ``filesystem`` first, so any
    generated field comes from fixed, predictable values.
    """
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    class SummaryGenerator:
        """Stub generator that answers from a fixed table of field values."""

        def generate(self, document, *, fields):
            canned = {
                "summary": f"summary for {document.external_id}",
                "doc_type": "memo",
                "domain": "finance",
                "topic": "risk",
            }
            # Only requested fields that the table knows about are returned.
            requested = {name: canned[name] for name in fields if name in canned}
            return MetadataGenerationResult(values=requested)

    filesystem.metadata_generator = SummaryGenerator()

    field_policy = {
        "summary": True,
        "doc_type": False,
        "domain": False,
        "topic": False,
    }
    return filesystem.register_file(
        storage_uri=f"file:///tmp/{external_id}.pdf",
        source_path=f"documents/{external_id}.pdf",
        folder_path=folder_path,
        external_id=external_id,
        title=f"{external_id}.pdf",
        content=f"{external_id} discusses vector databases and retrieval.",
        metadata={"department": department},
        metadata_policy={"fields": field_policy},
    )
def test_browse_is_agent_visible_semantic_command(tmp_path):
    """``browse`` is an allowed command and appears in the command-surface text."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    workspace = tmp_path / "workspace"
    executor = PIFSCommandExecutor(PageIndexFileSystem(workspace=workspace))

    allowed = executor.allowed_commands()
    assert "browse" in allowed
    surfaces = executor.describe_available_command_surfaces()
    assert 'browse [-R] <folder> "<query>"' in surfaces
def test_browse_requires_positional_query_and_rejects_removed_options(tmp_path):
    """Malformed ``browse`` invocations fail with a message naming the problem."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "doc_direct", "/documents")
    filesystem.semantic_retrieval_backend = BrowseBackend(["doc_direct"])
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    # (command line, pattern the error message must match), checked in order.
    rejected = (
        ("browse /documents", "browse requires a query"),
        ('browse /documents "vector database" --query "other"', "--query"),
        ('browse /documents "vector database" --limit 10', "--limit"),
        ('browse /documents "vector database" --offset 10', "--offset"),
        (
            "browse /documents vector database",
            "browse accepts a folder and one quoted query",
        ),
    )
    for command_line, pattern in rejected:
        with pytest.raises(PIFSCommandError, match=pattern):
            executor.execute(command_line)
def test_browse_validates_space_availability_and_page(tmp_path):
    """Unknown spaces, unavailable spaces and page 0 are each rejected."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "doc_direct", "/documents")
    filesystem.semantic_retrieval_backend = BrowseBackend(
        ["doc_direct"], channels=("summary",)
    )
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    base = 'browse /documents "vector database"'
    # (option suffix, pattern the error message must match), checked in order.
    cases = (
        (" --space hybrid", "Unsupported browse --space: hybrid"),
        (" --space entity", "available spaces: summary"),
        (" --page 0", "browse --page must be at least 1"),
    )
    for suffix, pattern in cases:
        with pytest.raises(PIFSCommandError, match=pattern):
            executor.execute(base + suffix)
def test_browse_default_summary_does_not_fallback_to_other_spaces(tmp_path):
    """With only ``entity`` available, the default space errors instead of switching."""
    import json

    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "doc_direct", "/documents")
    backend = BrowseBackend(["doc_direct"], channels=("entity",))
    filesystem.semantic_retrieval_backend = backend
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    with pytest.raises(PIFSCommandError, match="available spaces: entity"):
        executor.execute('browse /documents "vector database"')
    # The failed default-space request must not have reached the backend.
    assert backend.calls == []

    raw_output = executor.execute('browse /documents "vector database" --space entity')
    result = json.loads(raw_output)["data"]
    returned_ids = [item["document_id"] for item in result["data"]]
    assert returned_ids == ["doc_direct"]
    last_channel = backend.calls[-1][0]
    assert last_channel == "entity"
def test_browse_non_recursive_searches_only_direct_files_and_recursive_is_global(tmp_path):
    """Plain ``browse`` returns direct children only; ``-R`` includes nested files."""
    import json

    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "doc_direct", "/documents")
    _register_browse_file(filesystem, "doc_deep", "/documents/reports")
    backend = BrowseBackend(["doc_deep", "doc_direct"])
    filesystem.semantic_retrieval_backend = backend
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    def run(command_line):
        # Decode the JSON envelope and return its "data" payload.
        return json.loads(executor.execute(command_line))["data"]

    direct = run('browse /documents "vector database"')
    direct_ids = [item["document_id"] for item in direct["data"]]
    assert direct_ids == ["doc_direct"]
    assert direct["recursive"] is False
    assert direct["space"] == "summary"
    assert direct["page"] == 1
    assert direct["page_size"] == 10
    assert backend.calls[-1][0] == "summary"

    recursive = run('browse -R /documents "vector database"')
    recursive_ids = [item["document_id"] for item in recursive["data"]]
    assert recursive_ids == [
        "doc_deep",
        "doc_direct",
    ]
    recursive_ranks = [item["rank"] for item in recursive["data"]]
    assert recursive_ranks == [1, 2]
    assert recursive["recursive"] is True
def test_browse_supports_fixed_size_one_based_pagination_and_metadata_filter(tmp_path):
    """Twelve files page as 10 + 2 with running ranks; ``--where`` narrows to one."""
    import json

    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    document_ids = [f"doc_{index:02d}" for index in range(12)]
    for index, external_id in enumerate(document_ids):
        # Only doc_10 belongs to finance, so the filter below matches exactly one file.
        department = "finance" if index == 10 else "ops"
        _register_browse_file(filesystem, external_id, "/documents", department=department)
    filesystem.semantic_retrieval_backend = BrowseBackend(document_ids)
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    def run(command_line):
        # Decode the JSON envelope and return its "data" payload.
        return json.loads(executor.execute(command_line))["data"]

    first_page = run('browse /documents "vector database"')
    assert len(first_page["data"]) == 10
    assert first_page["has_more"] is True
    assert first_page["data"][0]["rank"] == 1

    second_page = run('browse /documents "vector database" --page 2')
    second_ids = [item["document_id"] for item in second_page["data"]]
    assert second_ids == ["doc_10", "doc_11"]
    second_ranks = [item["rank"] for item in second_page["data"]]
    assert second_ranks == [11, 12]
    assert second_page["has_more"] is False

    filtered = run(
        'browse /documents "vector database" --where \'{"department":"finance"}\''
    )
    filtered_ids = [item["document_id"] for item in filtered["data"]]
    assert filtered_ids == ["doc_10"]
    assert filtered["data"][0]["summary"] == "summary for doc_10"
def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
from pageindex.filesystem.metadata_generation import MetadataGenerationResult