mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-12 19:55:17 +02:00
feat(pifs): render browse results as records
This commit is contained in:
parent
3562d47fdb
commit
c63ebe573b
3 changed files with 226 additions and 3 deletions
|
|
@ -1155,6 +1155,8 @@ class PIFSCommandExecutor:
|
|||
return self._render_listing(data)
|
||||
if command_name == "tree":
|
||||
return self._render_tree(data)
|
||||
if command_name == "browse":
|
||||
return self._render_browse(data)
|
||||
if command_name in {"grep", "semantic-grep"}:
|
||||
return self._render_grep(data)
|
||||
if command_name in {"search-summary", "search-entity", "search-relation"}:
|
||||
|
|
@ -1301,6 +1303,86 @@ class PIFSCommandExecutor:
|
|||
lines.append("")
|
||||
return "\n".join(lines).rstrip()
|
||||
|
||||
def _render_browse(self, data: Any) -> str:
|
||||
if not isinstance(data, dict):
|
||||
return str(data)
|
||||
page = self._coerce_positive_int(data.get("page"), default=1)
|
||||
page_size = self._coerce_positive_int(
|
||||
data.get("page_size"),
|
||||
default=self.BROWSE_PAGE_SIZE,
|
||||
)
|
||||
has_more = bool(data.get("has_more"))
|
||||
lines = [
|
||||
f"# page={page} page_size={page_size} "
|
||||
f"has_more={'true' if has_more else 'false'}"
|
||||
]
|
||||
results = data.get("data") or []
|
||||
for index, item in enumerate(results):
|
||||
if index:
|
||||
lines.append("")
|
||||
item = item if isinstance(item, dict) else {}
|
||||
lines.extend(
|
||||
[
|
||||
f"rank: {item.get('rank') or index + 1}",
|
||||
f"similarity: {self._format_similarity(item.get('similarity'))}",
|
||||
f"path: {self._browse_result_path(item)}",
|
||||
"summary: "
|
||||
f"{self._compact_text(self._one_line_value(item.get('summary')), max_chars=240)}",
|
||||
]
|
||||
)
|
||||
if has_more:
|
||||
if results:
|
||||
lines.append("")
|
||||
lines.append(f"# next: {self._browse_next_command(data, page=page)}")
|
||||
return "\n".join(lines).rstrip()
|
||||
|
||||
@staticmethod
|
||||
def _coerce_positive_int(value: Any, *, default: int) -> int:
|
||||
try:
|
||||
parsed = int(value)
|
||||
except (TypeError, ValueError):
|
||||
return default
|
||||
return parsed if parsed >= 1 else default
|
||||
|
||||
@staticmethod
|
||||
def _format_similarity(value: Any) -> str:
|
||||
try:
|
||||
similarity = float(value)
|
||||
except (TypeError, ValueError):
|
||||
similarity = 0.0
|
||||
similarity = max(0.0, min(1.0, similarity))
|
||||
return f"{similarity:.2f}"
|
||||
|
||||
@staticmethod
|
||||
def _browse_result_path(item: dict[str, Any]) -> str:
|
||||
return str(
|
||||
item.get("path")
|
||||
or item.get("document_id")
|
||||
or item.get("external_id")
|
||||
or item.get("file_ref")
|
||||
or "-"
|
||||
)
|
||||
|
||||
def _browse_next_command(self, data: dict[str, Any], *, page: int) -> str:
|
||||
parts = ["browse"]
|
||||
if data.get("recursive"):
|
||||
parts.append("-R")
|
||||
parts.append(shlex.quote(str(data.get("scope") or "/")))
|
||||
parts.append(shlex.quote(str(data.get("query") or "")))
|
||||
space = str(data.get("space") or "summary")
|
||||
if space != "summary":
|
||||
parts.extend(["--space", shlex.quote(space)])
|
||||
if data.get("where") is not None:
|
||||
parts.extend(["--where", shlex.quote(self._browse_where_text(data["where"]))])
|
||||
parts.extend(["--page", str(page + 1)])
|
||||
return " ".join(parts)
|
||||
|
||||
@staticmethod
|
||||
def _browse_where_text(where: Any) -> str:
|
||||
if isinstance(where, str):
|
||||
return where
|
||||
return json.dumps(where, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
|
||||
|
||||
def _render_find(self, data: Any) -> str:
|
||||
if not isinstance(data, list):
|
||||
return str(data)
|
||||
|
|
|
|||
|
|
@ -441,7 +441,7 @@ class PageIndexFileSystem:
|
|||
if len(rows) >= needed:
|
||||
break
|
||||
page_rows = rows[offset : offset + page_size]
|
||||
return {
|
||||
payload = {
|
||||
"mode": "files",
|
||||
"retrieval": f"{space}_vector",
|
||||
"query": query,
|
||||
|
|
@ -454,6 +454,9 @@ class PageIndexFileSystem:
|
|||
"has_more": len(rows) > offset + page_size,
|
||||
"data": page_rows,
|
||||
}
|
||||
if metadata_filter is not None:
|
||||
payload["where"] = self._metadata_filter_payload(metadata_filter)
|
||||
return payload
|
||||
|
||||
def folder_info(self, path: str = "/") -> dict[str, Any]:
    """Return whatever ``self.store.folder_info`` reports for ``path``."""
    info = self.store.folder_info(path)
    return info
|
||||
|
|
@ -1664,6 +1667,17 @@ class PageIndexFileSystem:
|
|||
return 0.0
|
||||
return round(max(0.0, min(1.0, score)), 4)
|
||||
|
||||
@staticmethod
|
||||
def _metadata_filter_payload(metadata_filter: Any) -> str:
|
||||
if isinstance(metadata_filter, str):
|
||||
return metadata_filter
|
||||
return json.dumps(
|
||||
metadata_filter,
|
||||
ensure_ascii=False,
|
||||
sort_keys=True,
|
||||
separators=(",", ":"),
|
||||
)
|
||||
|
||||
def _stable_file_locator(self, file_ref: str, entry: Any) -> str:
|
||||
source_path = str(getattr(entry, "source_path", "") or "").strip()
|
||||
if source_path:
|
||||
|
|
|
|||
|
|
@ -106,13 +106,22 @@ class BrowseBackend:
|
|||
]
|
||||
|
||||
|
||||
def _register_browse_file(filesystem, external_id, folder_path, *, department="ops"):
|
||||
def _register_browse_file(
|
||||
filesystem,
|
||||
external_id,
|
||||
folder_path,
|
||||
*,
|
||||
department="ops",
|
||||
summary=None,
|
||||
):
|
||||
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
|
||||
|
||||
class SummaryGenerator:
|
||||
def generate(self, document, *, fields):
|
||||
values = {
|
||||
"summary": f"summary for {document.external_id}",
|
||||
"summary": summary
|
||||
if summary is not None
|
||||
else f"summary for {document.external_id}",
|
||||
"doc_type": "memo",
|
||||
"domain": "finance",
|
||||
"topic": "risk",
|
||||
|
|
@ -320,6 +329,124 @@ def test_browse_scopes_semantic_search_before_candidate_limit(tmp_path):
|
|||
]
|
||||
|
||||
|
||||
def test_browse_shell_output_uses_fixed_blocks_with_pagination_command(tmp_path):
    """Browse output is a header, fixed four-line records and a next-page hint."""
    import re

    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    document_ids = [f"doc_{index:02d}" for index in range(12)]
    for position, external_id in enumerate(document_ids):
        # Only the first document carries a summary with a newline and a tab.
        if position == 0:
            summary = "first line\nsecond\tline with spaces"
        else:
            summary = f"summary for {external_id}"
        _register_browse_file(
            filesystem,
            external_id,
            "/documents",
            department="finance",
            summary=summary,
        )
    filesystem.semantic_retrieval_backend = BrowseBackend(
        document_ids,
        channels=("summary", "entity"),
    )
    executor = PIFSCommandExecutor(filesystem)

    command = (
        'browse -R /documents "vector database" --space entity '
        '--where \'{"department":"finance"}\''
    )
    rendered = executor.execute(command)
    lines = rendered.splitlines()

    expected_head = [
        "# page=1 page_size=10 has_more=true",
        "rank: 1",
        "similarity: 0.91",
        "path: /documents/doc_00.txt",
        "summary: first line second line with spaces",
        "",
        "rank: 2",
        "similarity: 0.83",
        "path: /documents/doc_01.txt",
        "summary: summary for doc_01",
    ]
    assert lines[:10] == expected_head

    similarity_lines = [line for line in lines if line.startswith("similarity: ")]
    assert len(similarity_lines) == 10
    similarity_pattern = re.compile(r"similarity: [01]\.\d{2}")
    for line in similarity_lines:
        assert similarity_pattern.fullmatch(line)
        assert 0.0 <= float(line.removeprefix("similarity: ")) <= 1.0

    expected_next = (
        "# next: browse -R /documents 'vector database' --space entity "
        '--where \'{"department":"finance"}\' --page 2'
    )
    assert lines[-1] == expected_next

    # Raw payload keys must not appear in the rendered output.
    for raw_key in ("mode:", "data:", "score:"):
        assert raw_key not in rendered
|
||||
|
||||
|
||||
def test_browse_shell_path_falls_back_to_unique_locator_when_source_collides(tmp_path):
    """Two files sharing a source path are shown by external id, not by path."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    class SummaryGenerator:
        def generate(self, document, *, fields):
            summary = f"summary for {document.external_id}"
            return MetadataGenerationResult(values={"summary": summary})

    def summary_only_policy():
        # A fresh dict per registration, as in two independent literals.
        return {
            "fields": {
                "summary": True,
                "doc_type": False,
                "domain": False,
                "topic": False,
            }
        }

    filesystem = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=SummaryGenerator(),
    )
    # Both registrations deliberately point at the same source path and folder.
    colliding_location = {
        "source_path": "shared/source.json",
        "folder_path": "/documents",
    }
    first_ref = filesystem.register_file(
        storage_uri="file:///tmp/first.json",
        external_id="dsid_first",
        title="First",
        content="first content",
        metadata_policy=summary_only_policy(),
        **colliding_location,
    )
    filesystem.register_file(
        storage_uri="file:///tmp/second.json",
        external_id="dsid_second",
        title="Second",
        content="second content",
        metadata_policy=summary_only_policy(),
        **colliding_location,
    )
    filesystem.semantic_retrieval_backend = BrowseBackend(["dsid_first"])
    executor = PIFSCommandExecutor(filesystem)

    rendered = executor.execute('browse /documents "first"')

    assert "path: dsid_first" in rendered
    assert "path: /shared/source.json" not in rendered
    store = filesystem.store
    assert store.resolve_file_ref("dsid_first") == first_ref
    with pytest.raises(KeyError, match="Ambiguous file target"):
        store.resolve_file_ref("/shared/source.json")
|
||||
|
||||
|
||||
def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
|
||||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue