mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-12 19:55:17 +02:00
Merge Goal 2: semantic browse command
Merge the unified browse command implementation into feat/pageindex-filesystem.
This commit is contained in:
commit
d9ce184e50
6 changed files with 649 additions and 5 deletions
|
|
@ -23,6 +23,7 @@ class PIFSCommandExecutor:
|
|||
"tree",
|
||||
"find",
|
||||
"grep",
|
||||
"browse",
|
||||
"cat",
|
||||
"stat",
|
||||
"head",
|
||||
|
|
@ -53,6 +54,7 @@ class PIFSCommandExecutor:
|
|||
MAX_FIND_LIMIT = 50
|
||||
MAX_GREP_LIMIT = 20
|
||||
MAX_SEMANTIC_LIMIT = 20
|
||||
BROWSE_PAGE_SIZE = 10
|
||||
MAX_TEXT_LINES = 100
|
||||
MAX_PAGE_SPAN = 5
|
||||
MAX_STRUCTURE_NODES = 25
|
||||
|
|
@ -102,6 +104,8 @@ class PIFSCommandExecutor:
|
|||
"Available command surfaces for this workspace:",
|
||||
"- mode: read-only inspection",
|
||||
"- ls/tree: folder browsing",
|
||||
'- browse [-R] <folder> "<query>" [--space summary|entity|relation] '
|
||||
"[--page N] [--where JSON]: semantic relevance file browsing",
|
||||
"- find <folder>: folder path is positional; do not put paths in --where",
|
||||
"- find --where: exact/canonical metadata DSL filtering using stat --schema fields only",
|
||||
"- find <folder> -maxdepth N -type f|d: bounded folder traversal for find",
|
||||
|
|
@ -264,6 +268,80 @@ class PIFSCommandExecutor:
|
|||
listing = self.filesystem.browse(path, recursive=True, limit=limit)
|
||||
return {"path": path, "depth": depth, "limit": limit, **listing}
|
||||
|
||||
def _cmd_browse(self, args: list[str]) -> Any:
    """Handle ``browse [-R] <folder> "<query>" [--space S] [--page N] [--where JSON]``.

    Options may appear before, between, or after the two positional
    operands (folder first, then query). Any validation failure raises
    ``PIFSCommandError``. On success the request is forwarded to
    ``self.filesystem.browse_semantic_files`` with the fixed
    ``BROWSE_PAGE_SIZE`` and that call's result is returned unchanged.

    Args:
        args: Already-tokenised command arguments (command name excluded).

    Raises:
        PIFSCommandError: on a missing option value, a removed or unknown
            option, a wrong number of operands, a folder that does not
            start with ``/``, an empty query, a page below 1, or an
            unsupported or unavailable ``--space``.
    """
    recursive = False
    metadata_filter = None
    space = "summary"
    page = 1
    operands = []

    # Walking an iterator lets value-taking options consume the next token
    # directly; ``None`` signals that the argument list ended early.
    tokens = iter(args)
    for token in tokens:
        if token in {"-R", "-r", "--recursive"}:
            recursive = True
        elif token == "--where":
            metadata_filter = next(tokens, None)
            if metadata_filter is None:
                raise PIFSCommandError("browse --where requires a JSON value")
        elif token == "--space":
            space = next(tokens, None)
            if space is None:
                raise PIFSCommandError("browse --space requires a value")
        elif token == "--page":
            raw_page = next(tokens, None)
            if raw_page is None:
                raise PIFSCommandError("browse --page requires a value")
            page = self._parse_non_negative_int(raw_page, "browse --page")
        elif token in {"--limit", "--offset", "--query"}:
            # These options are rejected explicitly so the error can point
            # at the supported paging model.
            raise PIFSCommandError(
                f"browse does not support {token}; use fixed page size "
                f"{self.BROWSE_PAGE_SIZE} and --page N"
            )
        elif token.startswith("-"):
            raise PIFSCommandError(f"Unsupported browse option: {token}")
        else:
            operands.append(token)

    if len(operands) != 2:
        if len(operands) < 2:
            raise PIFSCommandError('browse requires a query: browse <folder> "<query>"')
        raise PIFSCommandError(
            'browse accepts a folder and one quoted query, for example: '
            'browse /documents "Federal Reserve"'
        )

    path, query = operands
    if not str(path).startswith("/"):
        raise PIFSCommandError("browse target must be a PIFS folder path like /documents")

    query = str(query or "").strip()
    if not query:
        raise PIFSCommandError('browse requires a query: browse <folder> "<query>"')

    # ``_parse_non_negative_int`` is applied first; the lower bound of 1 is
    # enforced here.
    if page < 1:
        raise PIFSCommandError("browse --page must be at least 1")

    if space not in SEMANTIC_RETRIEVAL_CHANNELS:
        raise PIFSCommandError(
            "Unsupported browse --space: "
            f"{space}. Supported spaces: {', '.join(SEMANTIC_RETRIEVAL_CHANNELS)}"
        )

    if not self.filesystem.has_semantic_channel(space):
        available = self.filesystem.semantic_retrieval_channels()
        available_text = ", ".join(available) if available else "none"
        raise PIFSCommandError(
            f"browse --space {space} is not available; available spaces: {available_text}"
        )

    return self.filesystem.browse_semantic_files(
        self._normalize_folder_path(path),
        query,
        retrieval_query=self._semantic_retrieval_query(query),
        recursive=recursive,
        space=space,
        page=page,
        page_size=self.BROWSE_PAGE_SIZE,
        metadata_filter=metadata_filter,
    )
|
||||
|
||||
def _cmd_find(self, args: list[str]) -> Any:
|
||||
path = "/"
|
||||
where = None
|
||||
|
|
|
|||
|
|
@ -331,6 +331,130 @@ class PageIndexFileSystem:
|
|||
max_depth=max_depth,
|
||||
)
|
||||
|
||||
def browse_semantic_files(
    self,
    path: str,
    query: str,
    *,
    retrieval_query: str | None = None,
    recursive: bool = False,
    space: str = "summary",
    page: int = 1,
    page_size: int = 10,
    metadata_filter: Optional[dict[str, Any] | str] = None,
) -> dict[str, Any]:
    """Return one page of files under ``path`` ranked by a semantic channel.

    The set of eligible files is resolved from the store first (folder scope
    plus the parsed metadata filter) and handed to the backend as a
    ``file_ref`` filter. One candidate more than the requested page needs is
    requested so that ``has_more`` can be reported.

    Args:
        path: Folder to search; normalised with ``normalize_path``.
        query: Query echoed back in the response payload.
        retrieval_query: Text actually sent to the backend; falls back to
            ``query`` when empty.
        recursive: Forwarded in the scope passed to the store.
        space: Semantic channel name; must be both supported and available.
        page: 1-based page number.
        page_size: Number of rows per page.
        metadata_filter: Raw filter handed to ``self.metadata.parse_filter``.

    Returns:
        A dict with the request echo (``query``, ``scope``, ``recursive``,
        ``space``, ``page``, ``page_size``), ``available_spaces``,
        ``has_more`` and the page rows under ``data``.

    Raises:
        ValueError: for an empty query, ``page``/``page_size`` below 1, or
            an unsupported or unavailable ``space``.
    """
    path = normalize_path(path)
    # Result is discarded; the call is kept only for its side effect
    # (presumably rejecting unknown folders - confirm against the store).
    self.store.folder_info(path)

    query_text = self._query_text(retrieval_query or query).strip()
    if not query_text:
        raise ValueError("browse requires a query")
    if page < 1:
        raise ValueError("browse --page must be at least 1")
    if page_size < 1:
        raise ValueError("browse page_size must be at least 1")
    if space not in SEMANTIC_RETRIEVAL_CHANNELS:
        raise ValueError(
            "Unsupported browse --space: "
            f"{space}. Supported spaces: {', '.join(SEMANTIC_RETRIEVAL_CHANNELS)}"
        )

    available_spaces = self.semantic_retrieval_channels()

    def space_unavailable() -> ValueError:
        # Built lazily so the message is only formatted on the error path.
        listed = ", ".join(available_spaces) if available_spaces else "none"
        return ValueError(
            f"browse --space {space} is not available; available spaces: {listed}"
        )

    if space not in available_spaces:
        raise space_unavailable()
    search_channel = getattr(self.semantic_retrieval_backend, "search_channel", None)
    if search_channel is None:
        raise space_unavailable()

    parsed_filter = self.metadata.parse_filter(metadata_filter)
    scope = {"folder_path": path, "recursive": recursive}
    in_scope_refs = self.store.file_refs_for_scope(
        scope=scope,
        metadata_filter=parsed_filter,
    )

    page_start = (page - 1) * page_size
    page_end = page_start + page_size
    # One extra row beyond the page tells us whether another page exists.
    fetch_limit = page_end + 1

    semantic_filters = self._semantic_filters_for_scope(scope)
    semantic_filters["file_ref"] = in_scope_refs

    if in_scope_refs:
        candidates = search_channel(
            space,
            query_text,
            limit=fetch_limit,
            filters=semantic_filters,
        )
    else:
        # Nothing is in scope, so the backend is not consulted at all.
        candidates = []

    allowed_refs = set(in_scope_refs)
    emitted: set[str] = set()
    ranked: list[dict[str, Any]] = []
    for candidate in candidates:
        try:
            file_ref = self.store.resolve_file_ref(candidate.document_id)
        except KeyError:
            continue
        if file_ref in emitted or file_ref not in allowed_refs:
            continue
        # Re-check against the store even though the ref was already in scope.
        if not self.store.file_matches(
            file_ref,
            scope=scope,
            metadata_filter=parsed_filter,
        ):
            continue
        emitted.add(file_ref)

        entry = self.store.get_file(file_ref)
        memberships = [
            folder["path"] for folder in self.store.folder_memberships(file_ref)
        ]
        ranked.append(
            {
                "rank": len(ranked) + 1,
                "similarity": self._semantic_candidate_similarity(candidate),
                "score": self._semantic_candidate_score(candidate),
                "path": self._stable_file_locator(file_ref, entry),
                "file_ref": file_ref,
                "document_id": entry.external_id,
                "external_id": entry.external_id,
                "title": entry.title,
                "source_path": entry.source_path,
                "folder_path": self._preferred_folder_path(
                    memberships,
                    path,
                    entry.folder_path,
                ),
                "folder_paths": memberships,
                "summary": str((entry.metadata or {}).get("summary") or ""),
                "snippet": str(getattr(candidate, "snippet", "") or entry.descriptor),
                "metadata": entry.metadata,
                "metadata_status": entry.metadata_status,
                "sources": list(getattr(candidate, "sources", []) or []),
            }
        )
        if len(ranked) >= fetch_limit:
            break

    return {
        "mode": "files",
        "retrieval": f"{space}_vector",
        "query": query,
        "scope": path,
        "recursive": recursive,
        "space": space,
        "available_spaces": list(available_spaces),
        "page": page,
        "page_size": page_size,
        "has_more": len(ranked) > page_end,
        "data": ranked[page_start:page_end],
    }
|
||||
|
||||
def folder_info(self, path: str = "/") -> dict[str, Any]:
    """Return whatever ``self.store.folder_info`` reports for ``path`` (root by default)."""
    info = self.store.folder_info(path)
    return info
|
||||
|
||||
|
|
@ -1515,6 +1639,45 @@ class PageIndexFileSystem:
|
|||
break
|
||||
return results
|
||||
|
||||
@staticmethod
|
||||
def _semantic_candidate_score(candidate: Any) -> float | None:
|
||||
try:
|
||||
return float(getattr(candidate, "score"))
|
||||
except (AttributeError, TypeError, ValueError):
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def _semantic_candidate_similarity(cls, candidate: Any) -> float:
|
||||
distances: list[float] = []
|
||||
for source in getattr(candidate, "sources", []) or []:
|
||||
if not isinstance(source, dict) or source.get("distance") is None:
|
||||
continue
|
||||
try:
|
||||
distances.append(float(source["distance"]))
|
||||
except (TypeError, ValueError):
|
||||
continue
|
||||
if distances:
|
||||
distance = max(min(distances), 0.0)
|
||||
return round(max(0.0, min(1.0, 1.0 / (1.0 + distance))), 4)
|
||||
score = cls._semantic_candidate_score(candidate)
|
||||
if score is None:
|
||||
return 0.0
|
||||
return round(max(0.0, min(1.0, score)), 4)
|
||||
|
||||
def _stable_file_locator(self, file_ref: str, entry: Any) -> str:
|
||||
source_path = str(getattr(entry, "source_path", "") or "").strip()
|
||||
if source_path:
|
||||
target = "/" + source_path.strip("/")
|
||||
try:
|
||||
if self.store.resolve_file_ref(target) == file_ref:
|
||||
return target
|
||||
except KeyError:
|
||||
pass
|
||||
external_id = str(getattr(entry, "external_id", "") or "").strip()
|
||||
if external_id:
|
||||
return external_id
|
||||
return file_ref
|
||||
|
||||
@staticmethod
|
||||
def _build_descriptor(title: str, metadata: dict[str, Any]) -> str:
|
||||
source = metadata.get("source_type") or metadata.get("repo") or metadata.get("channel")
|
||||
|
|
|
|||
|
|
@ -159,15 +159,29 @@ class SQLiteVecSemanticIndex:
|
|||
raise SemanticIndexError(
|
||||
f"query vector dimension mismatch: expected {dimension}, got {len(vector)}"
|
||||
)
|
||||
fetch_k = min(4096, max(limit, limit * max(fetch_multiplier, 1)))
|
||||
source_types = _source_type_filters(filters or {})
|
||||
raw_filters = filters or {}
|
||||
source_types = _source_type_filters(raw_filters)
|
||||
file_refs = _file_ref_filters(raw_filters)
|
||||
if file_refs == []:
|
||||
return []
|
||||
with self.connect() as conn:
|
||||
if file_refs is not None:
|
||||
_install_file_ref_filter_table(conn, file_refs)
|
||||
rows = []
|
||||
if source_types:
|
||||
for source_type in source_types:
|
||||
fetch_k = self._search_fetch_k(
|
||||
conn,
|
||||
limit,
|
||||
fetch_multiplier,
|
||||
exact_file_ref_filter=file_refs is not None,
|
||||
source_type=source_type,
|
||||
)
|
||||
if fetch_k <= 0:
|
||||
continue
|
||||
rows.extend(
|
||||
conn.execute(
|
||||
"""
|
||||
f"""
|
||||
SELECT
|
||||
d.file_ref,
|
||||
d.external_id,
|
||||
|
|
@ -180,6 +194,7 @@ class SQLiteVecSemanticIndex:
|
|||
FROM semantic_index_vec v
|
||||
JOIN semantic_index_docs d ON d.rowid = v.rowid
|
||||
WHERE v.embedding MATCH ? AND k = ? AND v.source_type = ?
|
||||
{_file_ref_filter_sql(file_refs)}
|
||||
ORDER BY v.distance
|
||||
""",
|
||||
(sqlite_vec.serialize_float32(vector), fetch_k, source_type),
|
||||
|
|
@ -187,8 +202,16 @@ class SQLiteVecSemanticIndex:
|
|||
)
|
||||
rows.sort(key=lambda row: float(row["distance"]))
|
||||
else:
|
||||
fetch_k = self._search_fetch_k(
|
||||
conn,
|
||||
limit,
|
||||
fetch_multiplier,
|
||||
exact_file_ref_filter=file_refs is not None,
|
||||
)
|
||||
if fetch_k <= 0:
|
||||
return []
|
||||
rows = conn.execute(
|
||||
"""
|
||||
f"""
|
||||
SELECT
|
||||
d.file_ref,
|
||||
d.external_id,
|
||||
|
|
@ -201,6 +224,7 @@ class SQLiteVecSemanticIndex:
|
|||
FROM semantic_index_vec v
|
||||
JOIN semantic_index_docs d ON d.rowid = v.rowid
|
||||
WHERE v.embedding MATCH ? AND k = ?
|
||||
{_file_ref_filter_sql(file_refs)}
|
||||
ORDER BY v.distance
|
||||
""",
|
||||
(sqlite_vec.serialize_float32(vector), fetch_k),
|
||||
|
|
@ -226,6 +250,30 @@ class SQLiteVecSemanticIndex:
|
|||
break
|
||||
return results
|
||||
|
||||
@staticmethod
|
||||
def _search_fetch_k(
|
||||
conn: sqlite3.Connection,
|
||||
limit: int,
|
||||
fetch_multiplier: int,
|
||||
*,
|
||||
exact_file_ref_filter: bool,
|
||||
source_type: str | None = None,
|
||||
) -> int:
|
||||
if exact_file_ref_filter:
|
||||
where = []
|
||||
params: list[Any] = []
|
||||
if source_type is not None:
|
||||
where.append("source_type = ?")
|
||||
params.append(source_type)
|
||||
where_sql = "WHERE " + " AND ".join(where) if where else ""
|
||||
return int(
|
||||
conn.execute(
|
||||
f"SELECT COUNT(*) FROM semantic_index_docs {where_sql}",
|
||||
params,
|
||||
).fetchone()[0]
|
||||
)
|
||||
return min(4096, max(limit, limit * max(fetch_multiplier, 1)))
|
||||
|
||||
def info(self) -> dict[str, Any]:
|
||||
with self.connect() as conn:
|
||||
config = {
|
||||
|
|
@ -344,7 +392,8 @@ def _matches_filters(
|
|||
filters: dict[str, Any],
|
||||
) -> bool:
|
||||
for key, expected in filters.items():
|
||||
actual = row[key] if key in row.keys() else metadata.get(key)
|
||||
actual_key = "file_ref" if key == "file_refs" else key
|
||||
actual = row[actual_key] if actual_key in row.keys() else metadata.get(actual_key)
|
||||
if isinstance(expected, list):
|
||||
if str(actual) not in {str(item) for item in expected}:
|
||||
return False
|
||||
|
|
@ -360,3 +409,41 @@ def _source_type_filters(filters: dict[str, Any]) -> list[str]:
|
|||
if isinstance(value, list):
|
||||
return [str(item) for item in value if str(item)]
|
||||
return [str(value)] if str(value) else []
|
||||
|
||||
|
||||
def _file_ref_filters(filters: dict[str, Any]) -> list[str] | None:
|
||||
if "file_ref" in filters:
|
||||
value = filters.get("file_ref")
|
||||
elif "file_refs" in filters:
|
||||
value = filters.get("file_refs")
|
||||
else:
|
||||
return None
|
||||
if isinstance(value, list):
|
||||
return [str(item) for item in value if str(item)]
|
||||
return [str(value)] if str(value) else []
|
||||
|
||||
|
||||
def _install_file_ref_filter_table(conn: sqlite3.Connection, file_refs: list[str]) -> None:
|
||||
conn.execute(
|
||||
"""
|
||||
CREATE TEMP TABLE IF NOT EXISTS semantic_index_filter_file_refs (
|
||||
file_ref TEXT PRIMARY KEY
|
||||
)
|
||||
"""
|
||||
)
|
||||
conn.execute("DELETE FROM semantic_index_filter_file_refs")
|
||||
conn.executemany(
|
||||
"INSERT OR IGNORE INTO semantic_index_filter_file_refs(file_ref) VALUES (?)",
|
||||
[(file_ref,) for file_ref in file_refs],
|
||||
)
|
||||
|
||||
|
||||
def _file_ref_filter_sql(file_refs: list[str] | None) -> str:
|
||||
if file_refs is None:
|
||||
return ""
|
||||
return (
|
||||
"AND EXISTS ("
|
||||
"SELECT 1 FROM semantic_index_filter_file_refs scope_refs "
|
||||
"WHERE scope_refs.file_ref = d.file_ref"
|
||||
")"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -753,6 +753,33 @@ class SQLiteFileSystemStore:
|
|||
return results
|
||||
return results
|
||||
|
||||
def file_refs_for_scope(
    self,
    *,
    scope: Optional[dict[str, Any]] = None,
    metadata_filter: Optional[dict[str, Any]] = None,
) -> list[str]:
    """List the distinct refs of non-deleted files matching a scope and filter.

    The WHERE clause always excludes rows with ``deleted_at`` set, then adds
    whatever ``self._scope_sql`` and ``self._metadata_filter_sql`` produce.
    Results are ordered by ``file_ref``.
    """
    clauses = ["f.deleted_at IS NULL"]
    bindings: list[Any] = []

    scope_sql, scope_params = self._scope_sql(scope)
    if scope_sql:
        clauses.append(scope_sql)
        bindings.extend(scope_params)

    metadata_sql, metadata_params = self._metadata_filter_sql(metadata_filter)
    clauses.extend(metadata_sql)
    bindings.extend(metadata_params)

    with self.connect() as conn:
        cursor = conn.execute(
            f"""
            SELECT DISTINCT f.file_ref
            FROM files f
            WHERE {" AND ".join(clauses)}
            ORDER BY f.file_ref
            """,
            bindings,
        )
        matches = cursor.fetchall()
    return [match["file_ref"] for match in matches]
|
||||
|
||||
def _search_once(
|
||||
self,
|
||||
match_query: str | None,
|
||||
|
|
|
|||
|
|
@ -69,6 +69,257 @@ class ChannelBackend:
|
|||
]
|
||||
|
||||
|
||||
class BrowseBackend:
    """Test double for a semantic retrieval backend.

    Returns the configured document ids in their given order, records every
    ``search_channel`` call in ``calls``, and applies a file-ref filter only
    when a document-id to file-ref mapping was supplied.
    """

    def __init__(self, document_ids, channels=("summary",), file_refs_by_document_id=None):
        self.document_ids = list(document_ids)
        self.channels = channels
        self.file_refs_by_document_id = dict(file_refs_by_document_id or {})
        # One (channel, query, limit, filters) tuple per search_channel call.
        self.calls = []

    def available_channels(self):
        """Return the channels this backend was configured with."""
        return self.channels

    def search_channel(self, channel, query, *, limit=10, filters=None):
        """Record the call and return up to ``limit`` synthetic candidates."""
        self.calls.append((channel, query, limit, filters))

        wanted_refs = set()
        if isinstance(filters, dict):
            requested = filters.get("file_ref") or filters.get("file_refs") or []
            if isinstance(requested, str):
                wanted_refs = {requested}
            else:
                wanted_refs = {str(item) for item in requested}

        selected = self.document_ids
        if wanted_refs and self.file_refs_by_document_id:
            ref_of = self.file_refs_by_document_id.get
            selected = [doc for doc in selected if ref_of(doc) in wanted_refs]

        hits = []
        for position, document_id in enumerate(selected[:limit], 1):
            hits.append(
                SimpleNamespace(
                    document_id=document_id,
                    snippet=f"{channel} candidate {position}: {query}",
                    score=1.0 - position * 0.01,
                    sources=[
                        {"channel": channel, "rank": position, "distance": position / 10}
                    ],
                )
            )
        return hits
|
||||
|
||||
|
||||
def _register_browse_file(filesystem, external_id, folder_path, *, department="ops"):
    """Register one text file for the browse tests and return ``register_file``'s result.

    A stub metadata generator is installed on ``filesystem`` first. It can
    supply ``summary``, ``doc_type``, ``domain`` and ``topic``, and returns
    only the fields it is asked for. The metadata policy passed to
    ``register_file`` enables ``summary`` alone.
    """
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    class SummaryGenerator:
        def generate(self, document, *, fields):
            canned = {
                "summary": f"summary for {document.external_id}",
                "doc_type": "memo",
                "domain": "finance",
                "topic": "risk",
            }
            requested = {name: canned[name] for name in fields if name in canned}
            return MetadataGenerationResult(values=requested)

    filesystem.metadata_generator = SummaryGenerator()

    policy = {
        "fields": {
            "summary": True,
            "doc_type": False,
            "domain": False,
            "topic": False,
        }
    }
    return filesystem.register_file(
        storage_uri=f"file:///tmp/{external_id}.txt",
        source_path=f"documents/{external_id}.txt",
        folder_path=folder_path,
        external_id=external_id,
        title=f"{external_id}.txt",
        content=f"{external_id} discusses vector databases and retrieval.",
        metadata={"department": department},
        metadata_policy=policy,
    )
|
||||
|
||||
|
||||
def test_browse_is_agent_visible_semantic_command(tmp_path):
    """``browse`` is listed as an allowed command and appears in the command-surface help."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    executor = PIFSCommandExecutor(filesystem)

    assert "browse" in executor.allowed_commands()
    assert 'browse [-R] <folder> "<query>"' in executor.describe_available_command_surfaces()
|
||||
|
||||
|
||||
def test_browse_requires_positional_query_and_rejects_removed_options(tmp_path):
    """A missing query, the removed options, and an unquoted multi-word query are all rejected."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "doc_direct", "/documents")
    filesystem.semantic_retrieval_backend = BrowseBackend(["doc_direct"])
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    # Folder only, no query operand.
    with pytest.raises(PIFSCommandError, match="browse requires a query"):
        executor.execute("browse /documents")
    # --query / --limit / --offset are rejected by name.
    with pytest.raises(PIFSCommandError, match="--query"):
        executor.execute('browse /documents "vector database" --query "other"')
    with pytest.raises(PIFSCommandError, match="--limit"):
        executor.execute('browse /documents "vector database" --limit 10')
    with pytest.raises(PIFSCommandError, match="--offset"):
        executor.execute('browse /documents "vector database" --offset 10')
    # An unquoted two-word query produces three operands.
    with pytest.raises(PIFSCommandError, match="browse accepts a folder and one quoted query"):
        executor.execute("browse /documents vector database")
|
||||
|
||||
|
||||
def test_browse_validates_space_availability_and_page(tmp_path):
    """Unknown spaces, unavailable spaces, and ``--page 0`` each raise a distinct error."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "doc_direct", "/documents")
    # The backend advertises only the "summary" channel.
    filesystem.semantic_retrieval_backend = BrowseBackend(["doc_direct"], channels=("summary",))
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    with pytest.raises(PIFSCommandError, match="Unsupported browse --space: hybrid"):
        executor.execute('browse /documents "vector database" --space hybrid')
    with pytest.raises(PIFSCommandError, match="available spaces: summary"):
        executor.execute('browse /documents "vector database" --space entity')
    with pytest.raises(PIFSCommandError, match="browse --page must be at least 1"):
        executor.execute('browse /documents "vector database" --page 0')
|
||||
|
||||
|
||||
def test_browse_default_summary_does_not_fallback_to_other_spaces(tmp_path):
    """With only ``entity`` available, the default space errors instead of silently switching."""
    import json

    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "doc_direct", "/documents")
    backend = BrowseBackend(["doc_direct"], channels=("entity",))
    filesystem.semantic_retrieval_backend = backend
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    with pytest.raises(PIFSCommandError, match="available spaces: entity"):
        executor.execute('browse /documents "vector database"')
    # The backend must not have been queried for the rejected request.
    assert backend.calls == []

    result = json.loads(
        executor.execute('browse /documents "vector database" --space entity')
    )["data"]
    assert [item["document_id"] for item in result["data"]] == ["doc_direct"]
    assert backend.calls[-1][0] == "entity"
|
||||
|
||||
|
||||
def test_browse_non_recursive_searches_only_direct_files_and_recursive_is_global(tmp_path):
    """Plain browse returns only files directly in the folder; ``-R`` also returns nested files."""
    import json

    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "doc_direct", "/documents")
    _register_browse_file(filesystem, "doc_deep", "/documents/reports")
    # The backend ranks the nested document ahead of the direct one.
    backend = BrowseBackend(["doc_deep", "doc_direct"])
    filesystem.semantic_retrieval_backend = backend
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    direct = json.loads(executor.execute('browse /documents "vector database"'))["data"]
    assert [item["document_id"] for item in direct["data"]] == ["doc_direct"]
    assert direct["recursive"] is False
    assert direct["space"] == "summary"
    assert direct["page"] == 1
    assert direct["page_size"] == 10
    assert backend.calls[-1][0] == "summary"

    recursive = json.loads(executor.execute('browse -R /documents "vector database"'))["data"]
    assert [item["document_id"] for item in recursive["data"]] == [
        "doc_deep",
        "doc_direct",
    ]
    assert [item["rank"] for item in recursive["data"]] == [1, 2]
    assert recursive["recursive"] is True
|
||||
|
||||
|
||||
def test_browse_supports_fixed_size_one_based_pagination_and_metadata_filter(tmp_path):
    """Twelve files split into pages of 10 and 2; ``--where`` narrows results by metadata."""
    import json

    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    document_ids = []
    for index in range(12):
        external_id = f"doc_{index:02d}"
        document_ids.append(external_id)
        # Only doc_10 belongs to "finance"; the filter assertion below relies on it.
        department = "finance" if index == 10 else "ops"
        _register_browse_file(filesystem, external_id, "/documents", department=department)
    filesystem.semantic_retrieval_backend = BrowseBackend(document_ids)
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    first_page = json.loads(executor.execute('browse /documents "vector database"'))["data"]
    assert len(first_page["data"]) == 10
    assert first_page["has_more"] is True
    assert first_page["data"][0]["rank"] == 1

    second_page = json.loads(
        executor.execute('browse /documents "vector database" --page 2')
    )["data"]
    assert [item["document_id"] for item in second_page["data"]] == ["doc_10", "doc_11"]
    # Ranks continue across pages rather than restarting at 1.
    assert [item["rank"] for item in second_page["data"]] == [11, 12]
    assert second_page["has_more"] is False

    filtered = json.loads(
        executor.execute(
            'browse /documents "vector database" --where \'{"department":"finance"}\''
        )
    )["data"]
    assert [item["document_id"] for item in filtered["data"]] == ["doc_10"]
    assert filtered["data"][0]["summary"] == "summary for doc_10"
|
||||
|
||||
|
||||
def test_browse_scopes_semantic_search_before_candidate_limit(tmp_path):
    """In-scope files are still found when 150 out-of-scope files rank ahead of them."""
    import json

    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    file_refs_by_document_id = {}
    candidate_ids = []
    # 150 files in /other, all ranked before the two /documents files.
    for index in range(150):
        external_id = f"off_scope_{index:02d}"
        candidate_ids.append(external_id)
        file_refs_by_document_id[external_id] = _register_browse_file(
            filesystem,
            external_id,
            "/other",
        )
    file_refs_by_document_id["doc_deep"] = _register_browse_file(
        filesystem,
        "doc_deep",
        "/documents/reports",
    )
    file_refs_by_document_id["doc_direct"] = _register_browse_file(
        filesystem,
        "doc_direct",
        "/documents",
    )
    # Supplying the mapping makes the fake backend honour the file_ref filter.
    backend = BrowseBackend(
        [*candidate_ids, "doc_deep", "doc_direct"],
        file_refs_by_document_id=file_refs_by_document_id,
    )
    filesystem.semantic_retrieval_backend = backend
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    direct = json.loads(executor.execute('browse /documents "vector database"'))["data"]
    assert [item["document_id"] for item in direct["data"]] == ["doc_direct"]

    recursive = json.loads(executor.execute('browse -R /documents "vector database"'))["data"]
    assert [item["document_id"] for item in recursive["data"]] == [
        "doc_deep",
        "doc_direct",
    ]
|
||||
|
||||
|
||||
def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
|
||||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
|
||||
|
|
|
|||
|
|
@ -55,6 +55,44 @@ def test_sqlite_vec_semantic_index_round_trip(tmp_path):
|
|||
assert [item.external_id for item in filtered] == ["doc_b"]
|
||||
|
||||
|
||||
def test_sqlite_vec_semantic_index_file_ref_filter_not_limited_by_global_rank(tmp_path):
    """A ``file_ref`` filter returns its record even when 30 other records match the query exactly."""
    index = SQLiteVecSemanticIndex(tmp_path / "semantic.sqlite")
    index.reset(dimension=2, metadata={"field_mode": "summary"})

    # Thirty off-scope records share the query vector exactly.
    records = [
        SemanticIndexRecord(
            file_ref=f"file_off_{item:02d}",
            external_id=f"doc_off_{item:02d}",
            source_type="documents",
            source_path=f"other/{item:02d}.pdf",
            title=f"Off scope {item:02d}",
            text="off scope",
            vector=[1.0, 0.0],
        )
        for item in range(30)
    ]
    # The single in-scope record uses a different vector from the query.
    records.append(
        SemanticIndexRecord(
            file_ref="file_in_scope",
            external_id="doc_in_scope",
            source_type="documents",
            source_path="documents/in-scope.pdf",
            title="In scope",
            text="in scope",
            vector=[0.0, 1.0],
        )
    )
    index.upsert_many(records)

    results = index.search(
        [1.0, 0.0],
        limit=1,
        filters={"file_ref": ["file_in_scope"]},
    )

    assert [item.file_ref for item in results] == ["file_in_scope"]
|
||||
|
||||
|
||||
def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
|
||||
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue