mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-21 20:18:09 +02:00
Merge Goal 2: semantic browse command
Merge the unified browse command implementation into feat/pageindex-filesystem.
This commit is contained in:
commit
d9ce184e50
6 changed files with 649 additions and 5 deletions
|
|
@ -23,6 +23,7 @@ class PIFSCommandExecutor:
|
||||||
"tree",
|
"tree",
|
||||||
"find",
|
"find",
|
||||||
"grep",
|
"grep",
|
||||||
|
"browse",
|
||||||
"cat",
|
"cat",
|
||||||
"stat",
|
"stat",
|
||||||
"head",
|
"head",
|
||||||
|
|
@ -53,6 +54,7 @@ class PIFSCommandExecutor:
|
||||||
MAX_FIND_LIMIT = 50
|
MAX_FIND_LIMIT = 50
|
||||||
MAX_GREP_LIMIT = 20
|
MAX_GREP_LIMIT = 20
|
||||||
MAX_SEMANTIC_LIMIT = 20
|
MAX_SEMANTIC_LIMIT = 20
|
||||||
|
BROWSE_PAGE_SIZE = 10
|
||||||
MAX_TEXT_LINES = 100
|
MAX_TEXT_LINES = 100
|
||||||
MAX_PAGE_SPAN = 5
|
MAX_PAGE_SPAN = 5
|
||||||
MAX_STRUCTURE_NODES = 25
|
MAX_STRUCTURE_NODES = 25
|
||||||
|
|
@ -102,6 +104,8 @@ class PIFSCommandExecutor:
|
||||||
"Available command surfaces for this workspace:",
|
"Available command surfaces for this workspace:",
|
||||||
"- mode: read-only inspection",
|
"- mode: read-only inspection",
|
||||||
"- ls/tree: folder browsing",
|
"- ls/tree: folder browsing",
|
||||||
|
'- browse [-R] <folder> "<query>" [--space summary|entity|relation] '
|
||||||
|
"[--page N] [--where JSON]: semantic relevance file browsing",
|
||||||
"- find <folder>: folder path is positional; do not put paths in --where",
|
"- find <folder>: folder path is positional; do not put paths in --where",
|
||||||
"- find --where: exact/canonical metadata DSL filtering using stat --schema fields only",
|
"- find --where: exact/canonical metadata DSL filtering using stat --schema fields only",
|
||||||
"- find <folder> -maxdepth N -type f|d: bounded folder traversal for find",
|
"- find <folder> -maxdepth N -type f|d: bounded folder traversal for find",
|
||||||
|
|
@ -264,6 +268,80 @@ class PIFSCommandExecutor:
|
||||||
listing = self.filesystem.browse(path, recursive=True, limit=limit)
|
listing = self.filesystem.browse(path, recursive=True, limit=limit)
|
||||||
return {"path": path, "depth": depth, "limit": limit, **listing}
|
return {"path": path, "depth": depth, "limit": limit, **listing}
|
||||||
|
|
||||||
|
def _cmd_browse(self, args: list[str]) -> Any:
    """Parse ``browse`` arguments and run a semantic file search.

    Accepted form::

        browse [-R] <folder> "<query>" [--space NAME] [--page N] [--where JSON]

    Exactly two positionals are required: a folder path starting with ``/``
    and a non-blank query. ``--limit``, ``--offset`` and ``--query`` are
    rejected explicitly because the page size is fixed at
    ``BROWSE_PAGE_SIZE``.

    Raises:
        PIFSCommandError: on a missing option value, an unsupported option,
            a wrong number of positionals, a blank query, a page below 1,
            or an unknown/unavailable ``--space``.
    """
    missing_value_messages = {
        "--where": "browse --where requires a JSON value",
        "--space": "browse --space requires a value",
        "--page": "browse --page requires a value",
    }
    query_usage = 'browse requires a query: browse <folder> "<query>"'
    absent = object()

    recursive = False
    settings = {"--where": None, "--space": "summary", "--page": 1}
    operands = []

    tokens = iter(args)
    for token in tokens:
        if token in ("-R", "-r", "--recursive"):
            recursive = True
        elif token in missing_value_messages:
            # Value-taking options consume the next token, whatever it is.
            value = next(tokens, absent)
            if value is absent:
                raise PIFSCommandError(missing_value_messages[token])
            if token == "--page":
                value = self._parse_non_negative_int(value, "browse --page")
            settings[token] = value
        elif token in ("--limit", "--offset", "--query"):
            raise PIFSCommandError(
                f"browse does not support {token}; use fixed page size "
                f"{self.BROWSE_PAGE_SIZE} and --page N"
            )
        elif token.startswith("-"):
            raise PIFSCommandError(f"Unsupported browse option: {token}")
        else:
            operands.append(token)

    if len(operands) < 2:
        raise PIFSCommandError(query_usage)
    if len(operands) > 2:
        raise PIFSCommandError(
            'browse accepts a folder and one quoted query, for example: '
            'browse /documents "Federal Reserve"'
        )

    folder, raw_query = operands
    if not str(folder).startswith("/"):
        raise PIFSCommandError("browse target must be a PIFS folder path like /documents")
    query = str(raw_query or "").strip()
    if not query:
        raise PIFSCommandError(query_usage)

    page = settings["--page"]
    if page < 1:
        raise PIFSCommandError("browse --page must be at least 1")

    space = settings["--space"]
    if space not in SEMANTIC_RETRIEVAL_CHANNELS:
        raise PIFSCommandError(
            "Unsupported browse --space: "
            f"{space}. Supported spaces: {', '.join(SEMANTIC_RETRIEVAL_CHANNELS)}"
        )
    if not self.filesystem.has_semantic_channel(space):
        available = self.filesystem.semantic_retrieval_channels()
        available_text = ", ".join(available) if available else "none"
        raise PIFSCommandError(
            f"browse --space {space} is not available; available spaces: {available_text}"
        )

    return self.filesystem.browse_semantic_files(
        self._normalize_folder_path(folder),
        query,
        retrieval_query=self._semantic_retrieval_query(query),
        recursive=recursive,
        space=space,
        page=page,
        page_size=self.BROWSE_PAGE_SIZE,
        metadata_filter=settings["--where"],
    )
|
||||||
|
|
||||||
def _cmd_find(self, args: list[str]) -> Any:
|
def _cmd_find(self, args: list[str]) -> Any:
|
||||||
path = "/"
|
path = "/"
|
||||||
where = None
|
where = None
|
||||||
|
|
|
||||||
|
|
@ -331,6 +331,130 @@ class PageIndexFileSystem:
|
||||||
max_depth=max_depth,
|
max_depth=max_depth,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def browse_semantic_files(
    self,
    path: str,
    query: str,
    *,
    retrieval_query: str | None = None,
    recursive: bool = False,
    space: str = "summary",
    page: int = 1,
    page_size: int = 10,
    metadata_filter: Optional[dict[str, Any] | str] = None,
) -> dict[str, Any]:
    """Return one page of files in a folder, ranked by semantic relevance.

    The folder scope (optionally recursive) and ``metadata_filter`` are
    resolved to a list of file refs first. That list is passed to the
    backend as the ``file_ref`` filter so the candidate limit applies to
    in-scope files only. One extra row beyond the requested page is
    collected to decide ``has_more``.

    Args:
        path: Folder to browse; normalized with ``normalize_path``.
        query: Query echoed back in the result.
        retrieval_query: Optional text sent to the backend instead of ``query``.
        recursive: Include files from descendant folders.
        space: Semantic channel to search.
        page: 1-based page number.
        page_size: Rows per page.
        metadata_filter: Filter handed to ``self.metadata.parse_filter``.

    Returns:
        A dict with the echoed request parameters, ``has_more`` and the
        page rows under ``data``.

    Raises:
        ValueError: for a blank query, ``page``/``page_size`` below 1, or
            an unsupported or unavailable ``space``.
    """
    path = normalize_path(path)
    # Result is discarded; presumably this raises for an unknown folder -- confirm.
    self.store.folder_info(path)

    query_text = self._query_text(retrieval_query or query).strip()
    if not query_text:
        raise ValueError("browse requires a query")
    if page < 1:
        raise ValueError("browse --page must be at least 1")
    if page_size < 1:
        raise ValueError("browse page_size must be at least 1")
    if space not in SEMANTIC_RETRIEVAL_CHANNELS:
        raise ValueError(
            "Unsupported browse --space: "
            f"{space}. Supported spaces: {', '.join(SEMANTIC_RETRIEVAL_CHANNELS)}"
        )

    available_spaces = self.semantic_retrieval_channels()

    def space_unavailable() -> ValueError:
        listed = ", ".join(available_spaces) if available_spaces else "none"
        return ValueError(
            f"browse --space {space} is not available; available spaces: {listed}"
        )

    if space not in available_spaces:
        raise space_unavailable()
    search_channel = getattr(self.semantic_retrieval_backend, "search_channel", None)
    if search_channel is None:
        raise space_unavailable()

    parsed_filter = self.metadata.parse_filter(metadata_filter)
    scope = {"folder_path": path, "recursive": recursive}
    scope_file_refs = self.store.file_refs_for_scope(
        scope=scope,
        metadata_filter=parsed_filter,
    )

    page_start = (page - 1) * page_size
    page_end = page_start + page_size
    wanted = page_end + 1  # one look-ahead row to compute has_more

    semantic_filters = self._semantic_filters_for_scope(scope)
    semantic_filters["file_ref"] = scope_file_refs
    if scope_file_refs:
        candidates = search_channel(
            space,
            query_text,
            limit=wanted,
            filters=semantic_filters,
        )
    else:
        # Nothing is in scope, so the backend is not queried at all.
        candidates = []

    in_scope = set(scope_file_refs)
    emitted: set[str] = set()
    ranked: list[dict[str, Any]] = []
    for candidate in candidates:
        try:
            file_ref = self.store.resolve_file_ref(candidate.document_id)
        except KeyError:
            continue
        if file_ref in emitted or file_ref not in in_scope:
            continue
        if not self.store.file_matches(
            file_ref,
            scope=scope,
            metadata_filter=parsed_filter,
        ):
            continue
        emitted.add(file_ref)

        entry = self.store.get_file(file_ref)
        memberships = [
            membership["path"]
            for membership in self.store.folder_memberships(file_ref)
        ]
        ranked.append(
            {
                "rank": len(ranked) + 1,
                "similarity": self._semantic_candidate_similarity(candidate),
                "score": self._semantic_candidate_score(candidate),
                "path": self._stable_file_locator(file_ref, entry),
                "file_ref": file_ref,
                "document_id": entry.external_id,
                "external_id": entry.external_id,
                "title": entry.title,
                "source_path": entry.source_path,
                "folder_path": self._preferred_folder_path(
                    memberships,
                    path,
                    entry.folder_path,
                ),
                "folder_paths": memberships,
                "summary": str((entry.metadata or {}).get("summary") or ""),
                "snippet": str(getattr(candidate, "snippet", "") or entry.descriptor),
                "metadata": entry.metadata,
                "metadata_status": entry.metadata_status,
                "sources": list(getattr(candidate, "sources", []) or []),
            }
        )
        if len(ranked) >= wanted:
            break

    return {
        "mode": "files",
        "retrieval": f"{space}_vector",
        "query": query,
        "scope": path,
        "recursive": recursive,
        "space": space,
        "available_spaces": list(available_spaces),
        "page": page,
        "page_size": page_size,
        "has_more": len(ranked) > page_end,
        "data": ranked[page_start:page_end],
    }
|
||||||
|
|
||||||
def folder_info(self, path: str = "/") -> dict[str, Any]:
    """Return the store's folder record for ``path`` (defaults to the root)."""
    info = self.store.folder_info(path)
    return info
|
||||||
|
|
||||||
|
|
@ -1515,6 +1639,45 @@ class PageIndexFileSystem:
|
||||||
break
|
break
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _semantic_candidate_score(candidate: Any) -> float | None:
|
||||||
|
try:
|
||||||
|
return float(getattr(candidate, "score"))
|
||||||
|
except (AttributeError, TypeError, ValueError):
|
||||||
|
return None
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _semantic_candidate_similarity(cls, candidate: Any) -> float:
|
||||||
|
distances: list[float] = []
|
||||||
|
for source in getattr(candidate, "sources", []) or []:
|
||||||
|
if not isinstance(source, dict) or source.get("distance") is None:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
distances.append(float(source["distance"]))
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
continue
|
||||||
|
if distances:
|
||||||
|
distance = max(min(distances), 0.0)
|
||||||
|
return round(max(0.0, min(1.0, 1.0 / (1.0 + distance))), 4)
|
||||||
|
score = cls._semantic_candidate_score(candidate)
|
||||||
|
if score is None:
|
||||||
|
return 0.0
|
||||||
|
return round(max(0.0, min(1.0, score)), 4)
|
||||||
|
|
||||||
|
def _stable_file_locator(self, file_ref: str, entry: Any) -> str:
|
||||||
|
source_path = str(getattr(entry, "source_path", "") or "").strip()
|
||||||
|
if source_path:
|
||||||
|
target = "/" + source_path.strip("/")
|
||||||
|
try:
|
||||||
|
if self.store.resolve_file_ref(target) == file_ref:
|
||||||
|
return target
|
||||||
|
except KeyError:
|
||||||
|
pass
|
||||||
|
external_id = str(getattr(entry, "external_id", "") or "").strip()
|
||||||
|
if external_id:
|
||||||
|
return external_id
|
||||||
|
return file_ref
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _build_descriptor(title: str, metadata: dict[str, Any]) -> str:
|
def _build_descriptor(title: str, metadata: dict[str, Any]) -> str:
|
||||||
source = metadata.get("source_type") or metadata.get("repo") or metadata.get("channel")
|
source = metadata.get("source_type") or metadata.get("repo") or metadata.get("channel")
|
||||||
|
|
|
||||||
|
|
@ -159,15 +159,29 @@ class SQLiteVecSemanticIndex:
|
||||||
raise SemanticIndexError(
|
raise SemanticIndexError(
|
||||||
f"query vector dimension mismatch: expected {dimension}, got {len(vector)}"
|
f"query vector dimension mismatch: expected {dimension}, got {len(vector)}"
|
||||||
)
|
)
|
||||||
fetch_k = min(4096, max(limit, limit * max(fetch_multiplier, 1)))
|
raw_filters = filters or {}
|
||||||
source_types = _source_type_filters(filters or {})
|
source_types = _source_type_filters(raw_filters)
|
||||||
|
file_refs = _file_ref_filters(raw_filters)
|
||||||
|
if file_refs == []:
|
||||||
|
return []
|
||||||
with self.connect() as conn:
|
with self.connect() as conn:
|
||||||
|
if file_refs is not None:
|
||||||
|
_install_file_ref_filter_table(conn, file_refs)
|
||||||
rows = []
|
rows = []
|
||||||
if source_types:
|
if source_types:
|
||||||
for source_type in source_types:
|
for source_type in source_types:
|
||||||
|
fetch_k = self._search_fetch_k(
|
||||||
|
conn,
|
||||||
|
limit,
|
||||||
|
fetch_multiplier,
|
||||||
|
exact_file_ref_filter=file_refs is not None,
|
||||||
|
source_type=source_type,
|
||||||
|
)
|
||||||
|
if fetch_k <= 0:
|
||||||
|
continue
|
||||||
rows.extend(
|
rows.extend(
|
||||||
conn.execute(
|
conn.execute(
|
||||||
"""
|
f"""
|
||||||
SELECT
|
SELECT
|
||||||
d.file_ref,
|
d.file_ref,
|
||||||
d.external_id,
|
d.external_id,
|
||||||
|
|
@ -180,6 +194,7 @@ class SQLiteVecSemanticIndex:
|
||||||
FROM semantic_index_vec v
|
FROM semantic_index_vec v
|
||||||
JOIN semantic_index_docs d ON d.rowid = v.rowid
|
JOIN semantic_index_docs d ON d.rowid = v.rowid
|
||||||
WHERE v.embedding MATCH ? AND k = ? AND v.source_type = ?
|
WHERE v.embedding MATCH ? AND k = ? AND v.source_type = ?
|
||||||
|
{_file_ref_filter_sql(file_refs)}
|
||||||
ORDER BY v.distance
|
ORDER BY v.distance
|
||||||
""",
|
""",
|
||||||
(sqlite_vec.serialize_float32(vector), fetch_k, source_type),
|
(sqlite_vec.serialize_float32(vector), fetch_k, source_type),
|
||||||
|
|
@ -187,8 +202,16 @@ class SQLiteVecSemanticIndex:
|
||||||
)
|
)
|
||||||
rows.sort(key=lambda row: float(row["distance"]))
|
rows.sort(key=lambda row: float(row["distance"]))
|
||||||
else:
|
else:
|
||||||
|
fetch_k = self._search_fetch_k(
|
||||||
|
conn,
|
||||||
|
limit,
|
||||||
|
fetch_multiplier,
|
||||||
|
exact_file_ref_filter=file_refs is not None,
|
||||||
|
)
|
||||||
|
if fetch_k <= 0:
|
||||||
|
return []
|
||||||
rows = conn.execute(
|
rows = conn.execute(
|
||||||
"""
|
f"""
|
||||||
SELECT
|
SELECT
|
||||||
d.file_ref,
|
d.file_ref,
|
||||||
d.external_id,
|
d.external_id,
|
||||||
|
|
@ -201,6 +224,7 @@ class SQLiteVecSemanticIndex:
|
||||||
FROM semantic_index_vec v
|
FROM semantic_index_vec v
|
||||||
JOIN semantic_index_docs d ON d.rowid = v.rowid
|
JOIN semantic_index_docs d ON d.rowid = v.rowid
|
||||||
WHERE v.embedding MATCH ? AND k = ?
|
WHERE v.embedding MATCH ? AND k = ?
|
||||||
|
{_file_ref_filter_sql(file_refs)}
|
||||||
ORDER BY v.distance
|
ORDER BY v.distance
|
||||||
""",
|
""",
|
||||||
(sqlite_vec.serialize_float32(vector), fetch_k),
|
(sqlite_vec.serialize_float32(vector), fetch_k),
|
||||||
|
|
@ -226,6 +250,30 @@ class SQLiteVecSemanticIndex:
|
||||||
break
|
break
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _search_fetch_k(
|
||||||
|
conn: sqlite3.Connection,
|
||||||
|
limit: int,
|
||||||
|
fetch_multiplier: int,
|
||||||
|
*,
|
||||||
|
exact_file_ref_filter: bool,
|
||||||
|
source_type: str | None = None,
|
||||||
|
) -> int:
|
||||||
|
if exact_file_ref_filter:
|
||||||
|
where = []
|
||||||
|
params: list[Any] = []
|
||||||
|
if source_type is not None:
|
||||||
|
where.append("source_type = ?")
|
||||||
|
params.append(source_type)
|
||||||
|
where_sql = "WHERE " + " AND ".join(where) if where else ""
|
||||||
|
return int(
|
||||||
|
conn.execute(
|
||||||
|
f"SELECT COUNT(*) FROM semantic_index_docs {where_sql}",
|
||||||
|
params,
|
||||||
|
).fetchone()[0]
|
||||||
|
)
|
||||||
|
return min(4096, max(limit, limit * max(fetch_multiplier, 1)))
|
||||||
|
|
||||||
def info(self) -> dict[str, Any]:
|
def info(self) -> dict[str, Any]:
|
||||||
with self.connect() as conn:
|
with self.connect() as conn:
|
||||||
config = {
|
config = {
|
||||||
|
|
@ -344,7 +392,8 @@ def _matches_filters(
|
||||||
filters: dict[str, Any],
|
filters: dict[str, Any],
|
||||||
) -> bool:
|
) -> bool:
|
||||||
for key, expected in filters.items():
|
for key, expected in filters.items():
|
||||||
actual = row[key] if key in row.keys() else metadata.get(key)
|
actual_key = "file_ref" if key == "file_refs" else key
|
||||||
|
actual = row[actual_key] if actual_key in row.keys() else metadata.get(actual_key)
|
||||||
if isinstance(expected, list):
|
if isinstance(expected, list):
|
||||||
if str(actual) not in {str(item) for item in expected}:
|
if str(actual) not in {str(item) for item in expected}:
|
||||||
return False
|
return False
|
||||||
|
|
@ -360,3 +409,41 @@ def _source_type_filters(filters: dict[str, Any]) -> list[str]:
|
||||||
if isinstance(value, list):
|
if isinstance(value, list):
|
||||||
return [str(item) for item in value if str(item)]
|
return [str(item) for item in value if str(item)]
|
||||||
return [str(value)] if str(value) else []
|
return [str(value)] if str(value) else []
|
||||||
|
|
||||||
|
|
||||||
|
def _file_ref_filters(filters: dict[str, Any]) -> list[str] | None:
|
||||||
|
if "file_ref" in filters:
|
||||||
|
value = filters.get("file_ref")
|
||||||
|
elif "file_refs" in filters:
|
||||||
|
value = filters.get("file_refs")
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
if isinstance(value, list):
|
||||||
|
return [str(item) for item in value if str(item)]
|
||||||
|
return [str(value)] if str(value) else []
|
||||||
|
|
||||||
|
|
||||||
|
def _install_file_ref_filter_table(conn: sqlite3.Connection, file_refs: list[str]) -> None:
|
||||||
|
conn.execute(
|
||||||
|
"""
|
||||||
|
CREATE TEMP TABLE IF NOT EXISTS semantic_index_filter_file_refs (
|
||||||
|
file_ref TEXT PRIMARY KEY
|
||||||
|
)
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
conn.execute("DELETE FROM semantic_index_filter_file_refs")
|
||||||
|
conn.executemany(
|
||||||
|
"INSERT OR IGNORE INTO semantic_index_filter_file_refs(file_ref) VALUES (?)",
|
||||||
|
[(file_ref,) for file_ref in file_refs],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _file_ref_filter_sql(file_refs: list[str] | None) -> str:
|
||||||
|
if file_refs is None:
|
||||||
|
return ""
|
||||||
|
return (
|
||||||
|
"AND EXISTS ("
|
||||||
|
"SELECT 1 FROM semantic_index_filter_file_refs scope_refs "
|
||||||
|
"WHERE scope_refs.file_ref = d.file_ref"
|
||||||
|
")"
|
||||||
|
)
|
||||||
|
|
|
||||||
|
|
@ -753,6 +753,33 @@ class SQLiteFileSystemStore:
|
||||||
return results
|
return results
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
def file_refs_for_scope(
    self,
    *,
    scope: Optional[dict[str, Any]] = None,
    metadata_filter: Optional[dict[str, Any]] = None,
) -> list[str]:
    """List the distinct refs of non-deleted files matching scope and filter.

    The WHERE clause always excludes rows with ``deleted_at`` set. It is
    extended with the fragment from ``self._scope_sql(scope)`` (when
    non-empty) and the fragments from
    ``self._metadata_filter_sql(metadata_filter)``. Refs are returned in
    ascending ``file_ref`` order.
    """
    conditions = ["f.deleted_at IS NULL"]
    bindings: list[Any] = []

    scope_clause, scope_bindings = self._scope_sql(scope)
    if scope_clause:
        conditions.append(scope_clause)
        bindings.extend(scope_bindings)

    metadata_clauses, metadata_bindings = self._metadata_filter_sql(metadata_filter)
    conditions.extend(metadata_clauses)
    bindings.extend(metadata_bindings)

    predicate = " AND ".join(conditions)
    with self.connect() as conn:
        cursor = conn.execute(
            f"""
            SELECT DISTINCT f.file_ref
            FROM files f
            WHERE {predicate}
            ORDER BY f.file_ref
            """,
            bindings,
        )
        matches = cursor.fetchall()
    return [match["file_ref"] for match in matches]
|
||||||
|
|
||||||
def _search_once(
|
def _search_once(
|
||||||
self,
|
self,
|
||||||
match_query: str | None,
|
match_query: str | None,
|
||||||
|
|
|
||||||
|
|
@ -69,6 +69,257 @@ class ChannelBackend:
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class BrowseBackend:
    """Deterministic stand-in for a semantic retrieval backend in tests.

    Candidates are returned in the order of ``document_ids``. When a
    ``file_ref``/``file_refs`` filter is supplied and a
    document-id-to-file-ref mapping is configured, only documents whose ref
    is in the filter are returned. Every call is recorded in ``calls``.
    """

    def __init__(self, document_ids, channels=("summary",), file_refs_by_document_id=None):
        self.document_ids = list(document_ids)
        self.channels = channels
        self.file_refs_by_document_id = dict(file_refs_by_document_id or {})
        self.calls = []

    def available_channels(self):
        """Return the channels this backend was configured with."""
        return self.channels

    def search_channel(self, channel, query, *, limit=10, filters=None):
        """Record the call and return up to ``limit`` ranked candidates."""
        self.calls.append((channel, query, limit, filters))

        wanted_refs = set()
        if isinstance(filters, dict):
            requested = filters.get("file_ref") or filters.get("file_refs") or []
            if isinstance(requested, str):
                wanted_refs = {requested}
            else:
                wanted_refs = {str(item) for item in requested}

        pool = self.document_ids
        if wanted_refs and self.file_refs_by_document_id:
            ref_of = self.file_refs_by_document_id.get
            pool = [document_id for document_id in pool if ref_of(document_id) in wanted_refs]

        hits = []
        for rank, document_id in enumerate(pool[:limit], 1):
            hits.append(
                SimpleNamespace(
                    document_id=document_id,
                    snippet=f"{channel} candidate {rank}: {query}",
                    score=1.0 - rank * 0.01,
                    sources=[{"channel": channel, "rank": rank, "distance": rank / 10}],
                )
            )
        return hits
|
||||||
|
|
||||||
|
|
||||||
|
def _register_browse_file(filesystem, external_id, folder_path, *, department="ops"):
    """Register one small text file for browse tests and return ``register_file``'s result.

    A stub metadata generator is installed on ``filesystem`` first. It
    answers with canned values for whichever of ``summary``, ``doc_type``,
    ``domain`` and ``topic`` are requested.
    """
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    class SummaryGenerator:
        def generate(self, document, *, fields):
            canned = {
                "summary": f"summary for {document.external_id}",
                "doc_type": "memo",
                "domain": "finance",
                "topic": "risk",
            }
            requested = {}
            for field in fields:
                if field in canned:
                    requested[field] = canned[field]
            return MetadataGenerationResult(values=requested)

    filesystem.metadata_generator = SummaryGenerator()

    policy = {
        "fields": {
            "summary": True,
            "doc_type": False,
            "domain": False,
            "topic": False,
        }
    }
    return filesystem.register_file(
        storage_uri=f"file:///tmp/{external_id}.txt",
        source_path=f"documents/{external_id}.txt",
        folder_path=folder_path,
        external_id=external_id,
        title=f"{external_id}.txt",
        content=f"{external_id} discusses vector databases and retrieval.",
        metadata={"department": department},
        metadata_policy=policy,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def test_browse_is_agent_visible_semantic_command(tmp_path):
    """``browse`` is an allowed command and is advertised in the surfaces help."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    workspace = tmp_path / "workspace"
    executor = PIFSCommandExecutor(PageIndexFileSystem(workspace=workspace))

    allowed = executor.allowed_commands()
    assert "browse" in allowed

    surfaces = executor.describe_available_command_surfaces()
    assert 'browse [-R] <folder> "<query>"' in surfaces
|
||||||
|
|
||||||
|
|
||||||
|
def test_browse_requires_positional_query_and_rejects_removed_options(tmp_path):
    """A missing query, removed options and unquoted queries are all rejected."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "doc_direct", "/documents")
    filesystem.semantic_retrieval_backend = BrowseBackend(["doc_direct"])
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    # (command, pattern expected in the error message)
    rejected = [
        ("browse /documents", "browse requires a query"),
        ('browse /documents "vector database" --query "other"', "--query"),
        ('browse /documents "vector database" --limit 10', "--limit"),
        ('browse /documents "vector database" --offset 10', "--offset"),
        (
            "browse /documents vector database",
            "browse accepts a folder and one quoted query",
        ),
    ]
    for command, pattern in rejected:
        with pytest.raises(PIFSCommandError, match=pattern):
            executor.execute(command)
|
||||||
|
|
||||||
|
|
||||||
|
def test_browse_validates_space_availability_and_page(tmp_path):
    """Unknown spaces, unavailable spaces and page 0 each raise a command error."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "doc_direct", "/documents")
    filesystem.semantic_retrieval_backend = BrowseBackend(["doc_direct"], channels=("summary",))
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    base = 'browse /documents "vector database"'
    expectations = (
        (f"{base} --space hybrid", "Unsupported browse --space: hybrid"),
        (f"{base} --space entity", "available spaces: summary"),
        (f"{base} --page 0", "browse --page must be at least 1"),
    )
    for command, pattern in expectations:
        with pytest.raises(PIFSCommandError, match=pattern):
            executor.execute(command)
|
||||||
|
|
||||||
|
|
||||||
|
def test_browse_default_summary_does_not_fallback_to_other_spaces(tmp_path):
    """With only ``entity`` available, the default ``summary`` space fails without searching."""
    import json

    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "doc_direct", "/documents")
    entity_only = BrowseBackend(["doc_direct"], channels=("entity",))
    filesystem.semantic_retrieval_backend = entity_only
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    with pytest.raises(PIFSCommandError, match="available spaces: entity"):
        executor.execute('browse /documents "vector database"')
    # The backend must not have been queried for the rejected default space.
    assert entity_only.calls == []

    raw = executor.execute('browse /documents "vector database" --space entity')
    payload = json.loads(raw)["data"]
    returned_ids = [row["document_id"] for row in payload["data"]]
    assert returned_ids == ["doc_direct"]
    assert entity_only.calls[-1][0] == "entity"
|
||||||
|
|
||||||
|
|
||||||
|
def test_browse_non_recursive_searches_only_direct_files_and_recursive_is_global(tmp_path):
    """Plain browse sees only direct children; ``-R`` also ranks nested files."""
    import json

    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    for external_id, folder in (("doc_direct", "/documents"), ("doc_deep", "/documents/reports")):
        _register_browse_file(filesystem, external_id, folder)
    backend = BrowseBackend(["doc_deep", "doc_direct"])
    filesystem.semantic_retrieval_backend = backend
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    def run(command):
        return json.loads(executor.execute(command))["data"]

    direct = run('browse /documents "vector database"')
    assert [row["document_id"] for row in direct["data"]] == ["doc_direct"]
    assert direct["recursive"] is False
    assert direct["space"] == "summary"
    assert direct["page"] == 1
    assert direct["page_size"] == 10
    assert backend.calls[-1][0] == "summary"

    recursive = run('browse -R /documents "vector database"')
    assert [row["document_id"] for row in recursive["data"]] == [
        "doc_deep",
        "doc_direct",
    ]
    assert [row["rank"] for row in recursive["data"]] == [1, 2]
    assert recursive["recursive"] is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_browse_supports_fixed_size_one_based_pagination_and_metadata_filter(tmp_path):
    """Twelve files paginate as 10 + 2, and ``--where`` narrows to the finance file."""
    import json

    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    document_ids = [f"doc_{index:02d}" for index in range(12)]
    for index, external_id in enumerate(document_ids):
        # Only doc_10 belongs to finance; it is the target of the --where check.
        department = "finance" if index == 10 else "ops"
        _register_browse_file(filesystem, external_id, "/documents", department=department)
    filesystem.semantic_retrieval_backend = BrowseBackend(document_ids)
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    def run(command):
        return json.loads(executor.execute(command))["data"]

    first_page = run('browse /documents "vector database"')
    assert len(first_page["data"]) == 10
    assert first_page["has_more"] is True
    assert first_page["data"][0]["rank"] == 1

    second_page = run('browse /documents "vector database" --page 2')
    assert [row["document_id"] for row in second_page["data"]] == ["doc_10", "doc_11"]
    assert [row["rank"] for row in second_page["data"]] == [11, 12]
    assert second_page["has_more"] is False

    filtered = run(
        'browse /documents "vector database" --where \'{"department":"finance"}\''
    )
    assert [row["document_id"] for row in filtered["data"]] == ["doc_10"]
    assert filtered["data"][0]["summary"] == "summary for doc_10"
|
||||||
|
|
||||||
|
|
||||||
|
def test_browse_scopes_semantic_search_before_candidate_limit(tmp_path):
    """In-scope files are found even when 150 out-of-scope files rank ahead of them."""
    import json

    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")

    # Registration order doubles as the backend's ranking order.
    registrations = [(f"off_scope_{index:02d}", "/other") for index in range(150)]
    registrations.append(("doc_deep", "/documents/reports"))
    registrations.append(("doc_direct", "/documents"))

    file_refs_by_document_id = {
        external_id: _register_browse_file(filesystem, external_id, folder)
        for external_id, folder in registrations
    }
    backend = BrowseBackend(
        [external_id for external_id, _folder in registrations],
        file_refs_by_document_id=file_refs_by_document_id,
    )
    filesystem.semantic_retrieval_backend = backend
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    def run(command):
        return json.loads(executor.execute(command))["data"]

    direct = run('browse /documents "vector database"')
    assert [row["document_id"] for row in direct["data"]] == ["doc_direct"]

    recursive = run('browse -R /documents "vector database"')
    assert [row["document_id"] for row in recursive["data"]] == [
        "doc_deep",
        "doc_direct",
    ]
|
||||||
|
|
||||||
|
|
||||||
def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
|
def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
|
||||||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||||
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
|
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
|
||||||
|
|
|
||||||
|
|
@ -55,6 +55,44 @@ def test_sqlite_vec_semantic_index_round_trip(tmp_path):
|
||||||
assert [item.external_id for item in filtered] == ["doc_b"]
|
assert [item.external_id for item in filtered] == ["doc_b"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_sqlite_vec_semantic_index_file_ref_filter_not_limited_by_global_rank(tmp_path):
    """A file-ref filter finds the in-scope record although 30 others are closer to the query."""
    index = SQLiteVecSemanticIndex(tmp_path / "semantic.sqlite")
    index.reset(dimension=2, metadata={"field_mode": "summary"})

    def off_scope(number):
        return SemanticIndexRecord(
            file_ref=f"file_off_{number:02d}",
            external_id=f"doc_off_{number:02d}",
            source_type="documents",
            source_path=f"other/{number:02d}.pdf",
            title=f"Off scope {number:02d}",
            text="off scope",
            vector=[1.0, 0.0],
        )

    in_scope = SemanticIndexRecord(
        file_ref="file_in_scope",
        external_id="doc_in_scope",
        source_type="documents",
        source_path="documents/in-scope.pdf",
        title="In scope",
        text="in scope",
        vector=[0.0, 1.0],
    )
    index.upsert_many([*(off_scope(number) for number in range(30)), in_scope])

    # The query vector equals the off-scope vectors, so they all outrank in_scope.
    results = index.search(
        [1.0, 0.0],
        limit=1,
        filters={"file_ref": ["file_in_scope"]},
    )

    assert [hit.file_ref for hit in results] == ["file_in_scope"]
|
||||||
|
|
||||||
|
|
||||||
def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
|
def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
|
||||||
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue