Merge Goal 2: semantic browse command

Merge the unified browse command implementation into feat/pageindex-filesystem.
This commit is contained in:
Bukely_ 2026-05-31 21:40:26 +08:00 committed by GitHub
commit d9ce184e50
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 649 additions and 5 deletions

View file

@ -23,6 +23,7 @@ class PIFSCommandExecutor:
"tree",
"find",
"grep",
"browse",
"cat",
"stat",
"head",
@ -53,6 +54,7 @@ class PIFSCommandExecutor:
MAX_FIND_LIMIT = 50
MAX_GREP_LIMIT = 20
MAX_SEMANTIC_LIMIT = 20
BROWSE_PAGE_SIZE = 10
MAX_TEXT_LINES = 100
MAX_PAGE_SPAN = 5
MAX_STRUCTURE_NODES = 25
@ -102,6 +104,8 @@ class PIFSCommandExecutor:
"Available command surfaces for this workspace:",
"- mode: read-only inspection",
"- ls/tree: folder browsing",
'- browse [-R] <folder> "<query>" [--space summary|entity|relation] '
"[--page N] [--where JSON]: semantic relevance file browsing",
"- find <folder>: folder path is positional; do not put paths in --where",
"- find --where: exact/canonical metadata DSL filtering using stat --schema fields only",
"- find <folder> -maxdepth N -type f|d: bounded folder traversal for find",
@ -264,6 +268,80 @@ class PIFSCommandExecutor:
listing = self.filesystem.browse(path, recursive=True, limit=limit)
return {"path": path, "depth": depth, "limit": limit, **listing}
def _cmd_browse(self, args: list[str]) -> Any:
    """Parse ``browse`` arguments and run a semantic file browse.

    Recognised options are ``-R``/``-r``/``--recursive``, ``--where <JSON>``,
    ``--space <name>`` and ``--page <N>``. Exactly two positionals are
    required: the folder path (must start with ``/``) and the query.

    Returns whatever ``self.filesystem.browse_semantic_files`` returns.

    Raises:
        PIFSCommandError: for a missing option value, a removed option
            (``--limit``/``--offset``/``--query``), an unknown ``-`` option,
            the wrong number of positionals, a non-absolute folder, a blank
            query, a page below 1, or an unsupported/unavailable space.
    """
    # Error text used when a value-taking option is the last token.
    missing_value_errors = {
        "--where": "browse --where requires a JSON value",
        "--space": "browse --space requires a value",
        "--page": "browse --page requires a value",
    }
    option_values: dict[str, Any] = {
        "--where": None,
        "--space": "summary",
        "--page": 1,
    }
    recursive = False
    operands: list[str] = []

    cursor = 0
    total = len(args)
    while cursor < total:
        token = args[cursor]
        cursor += 1
        if token in ("-R", "-r", "--recursive"):
            recursive = True
            continue
        if token in missing_value_errors:
            if cursor >= total:
                raise PIFSCommandError(missing_value_errors[token])
            raw_value = args[cursor]
            cursor += 1
            if token == "--page":
                option_values[token] = self._parse_non_negative_int(
                    raw_value, "browse --page"
                )
            else:
                option_values[token] = raw_value
            continue
        if token in ("--limit", "--offset", "--query"):
            raise PIFSCommandError(
                f"browse does not support {token}; use fixed page size "
                f"{self.BROWSE_PAGE_SIZE} and --page N"
            )
        if token.startswith("-"):
            raise PIFSCommandError(f"Unsupported browse option: {token}")
        operands.append(token)

    operand_count = len(operands)
    if operand_count < 2:
        raise PIFSCommandError('browse requires a query: browse <folder> "<query>"')
    if operand_count > 2:
        raise PIFSCommandError(
            'browse accepts a folder and one quoted query, for example: '
            'browse /documents "Federal Reserve"'
        )

    folder, raw_query = operands
    if not str(folder).startswith("/"):
        raise PIFSCommandError("browse target must be a PIFS folder path like /documents")
    query = str(raw_query or "").strip()
    if not query:
        raise PIFSCommandError('browse requires a query: browse <folder> "<query>"')

    page = option_values["--page"]
    if page < 1:
        raise PIFSCommandError("browse --page must be at least 1")

    space = option_values["--space"]
    if space not in SEMANTIC_RETRIEVAL_CHANNELS:
        raise PIFSCommandError(
            "Unsupported browse --space: "
            f"{space}. Supported spaces: {', '.join(SEMANTIC_RETRIEVAL_CHANNELS)}"
        )
    if not self.filesystem.has_semantic_channel(space):
        channels = self.filesystem.semantic_retrieval_channels()
        channels_text = ", ".join(channels) if channels else "none"
        raise PIFSCommandError(
            f"browse --space {space} is not available; available spaces: {channels_text}"
        )

    return self.filesystem.browse_semantic_files(
        self._normalize_folder_path(folder),
        query,
        retrieval_query=self._semantic_retrieval_query(query),
        recursive=recursive,
        space=space,
        page=page,
        page_size=self.BROWSE_PAGE_SIZE,
        metadata_filter=option_values["--where"],
    )
def _cmd_find(self, args: list[str]) -> Any:
path = "/"
where = None

View file

@ -331,6 +331,130 @@ class PageIndexFileSystem:
max_depth=max_depth,
)
def browse_semantic_files(
    self,
    path: str,
    query: str,
    *,
    retrieval_query: str | None = None,
    recursive: bool = False,
    space: str = "summary",
    page: int = 1,
    page_size: int = 10,
    metadata_filter: Optional[dict[str, Any] | str] = None,
) -> dict[str, Any]:
    """Return one page of files in a folder scope ranked by a semantic channel.

    Args:
        path: Folder path; normalised with ``normalize_path``.
        query: Query echoed back in the result under ``"query"``.
        retrieval_query: Text actually sent to the backend when truthy;
            otherwise ``query`` is used.
        recursive: Stored in the scope dict passed to the store and filters.
        space: Semantic channel name; must be in
            ``SEMANTIC_RETRIEVAL_CHANNELS`` and currently available.
        page: 1-based page number.
        page_size: Number of rows per page.
        metadata_filter: Passed through ``self.metadata.parse_filter``.

    Returns:
        A dict with ``mode``, ``retrieval``, ``query``, ``scope``,
        ``recursive``, ``space``, ``available_spaces``, ``page``,
        ``page_size``, ``has_more`` and the page rows under ``data``.

    Raises:
        ValueError: blank query text, ``page``/``page_size`` below 1, or an
            unsupported/unavailable space.
    """
    path = normalize_path(path)
    # Result is discarded; presumably this rejects unknown folders — confirm.
    self.store.folder_info(path)

    query_text = self._query_text(retrieval_query or query).strip()
    if not query_text:
        raise ValueError("browse requires a query")
    if page < 1:
        raise ValueError("browse --page must be at least 1")
    if page_size < 1:
        raise ValueError("browse page_size must be at least 1")
    if space not in SEMANTIC_RETRIEVAL_CHANNELS:
        raise ValueError(
            "Unsupported browse --space: "
            f"{space}. Supported spaces: {', '.join(SEMANTIC_RETRIEVAL_CHANNELS)}"
        )

    available_spaces = self.semantic_retrieval_channels()

    def unavailable_space_error() -> ValueError:
        # Built lazily so the label is only rendered on the error path.
        label = ", ".join(available_spaces) if available_spaces else "none"
        return ValueError(
            f"browse --space {space} is not available; available spaces: {label}"
        )

    if space not in available_spaces:
        raise unavailable_space_error()
    search_channel = getattr(self.semantic_retrieval_backend, "search_channel", None)
    if search_channel is None:
        raise unavailable_space_error()

    parsed_filter = self.metadata.parse_filter(metadata_filter)
    scope = {"folder_path": path, "recursive": recursive}
    scope_file_refs = self.store.file_refs_for_scope(
        scope=scope,
        metadata_filter=parsed_filter,
    )

    offset = (page - 1) * page_size
    page_end = offset + page_size
    # One extra row beyond the page end is what makes ``has_more`` decidable.
    needed = page_end + 1

    semantic_filters = self._semantic_filters_for_scope(scope)
    semantic_filters["file_ref"] = scope_file_refs
    if scope_file_refs:
        candidates = search_channel(
            space,
            query_text,
            limit=needed,
            filters=semantic_filters,
        )
    else:
        # Nothing is in scope, so the backend is not queried at all.
        candidates = []

    in_scope = set(scope_file_refs)
    emitted: set[str] = set()
    rows: list[dict[str, Any]] = []
    for hit in candidates:
        try:
            file_ref = self.store.resolve_file_ref(hit.document_id)
        except KeyError:
            continue
        if file_ref in emitted or file_ref not in in_scope:
            continue
        if not self.store.file_matches(
            file_ref,
            scope=scope,
            metadata_filter=parsed_filter,
        ):
            continue
        emitted.add(file_ref)

        entry = self.store.get_file(file_ref)
        folder_paths = [
            membership["path"]
            for membership in self.store.folder_memberships(file_ref)
        ]
        row = {
            "rank": len(rows) + 1,
            "similarity": self._semantic_candidate_similarity(hit),
            "score": self._semantic_candidate_score(hit),
            "path": self._stable_file_locator(file_ref, entry),
            "file_ref": file_ref,
            "document_id": entry.external_id,
            "external_id": entry.external_id,
            "title": entry.title,
            "source_path": entry.source_path,
            "folder_path": self._preferred_folder_path(
                folder_paths,
                path,
                entry.folder_path,
            ),
            "folder_paths": folder_paths,
            "summary": str((entry.metadata or {}).get("summary") or ""),
            "snippet": str(getattr(hit, "snippet", "") or entry.descriptor),
            "metadata": entry.metadata,
            "metadata_status": entry.metadata_status,
            "sources": list(getattr(hit, "sources", []) or []),
        }
        rows.append(row)
        if len(rows) >= needed:
            break

    return {
        "mode": "files",
        "retrieval": f"{space}_vector",
        "query": query,
        "scope": path,
        "recursive": recursive,
        "space": space,
        "available_spaces": list(available_spaces),
        "page": page,
        "page_size": page_size,
        "has_more": len(rows) > page_end,
        "data": rows[offset:page_end],
    }
def folder_info(self, path: str = "/") -> dict[str, Any]:
    """Return the store's ``folder_info`` result for ``path`` (default ``"/"``)."""
    info = self.store.folder_info(path)
    return info
@ -1515,6 +1639,45 @@ class PageIndexFileSystem:
break
return results
@staticmethod
def _semantic_candidate_score(candidate: Any) -> float | None:
try:
return float(getattr(candidate, "score"))
except (AttributeError, TypeError, ValueError):
return None
@classmethod
def _semantic_candidate_similarity(cls, candidate: Any) -> float:
distances: list[float] = []
for source in getattr(candidate, "sources", []) or []:
if not isinstance(source, dict) or source.get("distance") is None:
continue
try:
distances.append(float(source["distance"]))
except (TypeError, ValueError):
continue
if distances:
distance = max(min(distances), 0.0)
return round(max(0.0, min(1.0, 1.0 / (1.0 + distance))), 4)
score = cls._semantic_candidate_score(candidate)
if score is None:
return 0.0
return round(max(0.0, min(1.0, score)), 4)
def _stable_file_locator(self, file_ref: str, entry: Any) -> str:
source_path = str(getattr(entry, "source_path", "") or "").strip()
if source_path:
target = "/" + source_path.strip("/")
try:
if self.store.resolve_file_ref(target) == file_ref:
return target
except KeyError:
pass
external_id = str(getattr(entry, "external_id", "") or "").strip()
if external_id:
return external_id
return file_ref
@staticmethod
def _build_descriptor(title: str, metadata: dict[str, Any]) -> str:
source = metadata.get("source_type") or metadata.get("repo") or metadata.get("channel")

View file

@ -159,15 +159,29 @@ class SQLiteVecSemanticIndex:
raise SemanticIndexError(
f"query vector dimension mismatch: expected {dimension}, got {len(vector)}"
)
fetch_k = min(4096, max(limit, limit * max(fetch_multiplier, 1)))
source_types = _source_type_filters(filters or {})
raw_filters = filters or {}
source_types = _source_type_filters(raw_filters)
file_refs = _file_ref_filters(raw_filters)
if file_refs == []:
return []
with self.connect() as conn:
if file_refs is not None:
_install_file_ref_filter_table(conn, file_refs)
rows = []
if source_types:
for source_type in source_types:
fetch_k = self._search_fetch_k(
conn,
limit,
fetch_multiplier,
exact_file_ref_filter=file_refs is not None,
source_type=source_type,
)
if fetch_k <= 0:
continue
rows.extend(
conn.execute(
"""
f"""
SELECT
d.file_ref,
d.external_id,
@ -180,6 +194,7 @@ class SQLiteVecSemanticIndex:
FROM semantic_index_vec v
JOIN semantic_index_docs d ON d.rowid = v.rowid
WHERE v.embedding MATCH ? AND k = ? AND v.source_type = ?
{_file_ref_filter_sql(file_refs)}
ORDER BY v.distance
""",
(sqlite_vec.serialize_float32(vector), fetch_k, source_type),
@ -187,8 +202,16 @@ class SQLiteVecSemanticIndex:
)
rows.sort(key=lambda row: float(row["distance"]))
else:
fetch_k = self._search_fetch_k(
conn,
limit,
fetch_multiplier,
exact_file_ref_filter=file_refs is not None,
)
if fetch_k <= 0:
return []
rows = conn.execute(
"""
f"""
SELECT
d.file_ref,
d.external_id,
@ -201,6 +224,7 @@ class SQLiteVecSemanticIndex:
FROM semantic_index_vec v
JOIN semantic_index_docs d ON d.rowid = v.rowid
WHERE v.embedding MATCH ? AND k = ?
{_file_ref_filter_sql(file_refs)}
ORDER BY v.distance
""",
(sqlite_vec.serialize_float32(vector), fetch_k),
@ -226,6 +250,30 @@ class SQLiteVecSemanticIndex:
break
return results
@staticmethod
def _search_fetch_k(
conn: sqlite3.Connection,
limit: int,
fetch_multiplier: int,
*,
exact_file_ref_filter: bool,
source_type: str | None = None,
) -> int:
if exact_file_ref_filter:
where = []
params: list[Any] = []
if source_type is not None:
where.append("source_type = ?")
params.append(source_type)
where_sql = "WHERE " + " AND ".join(where) if where else ""
return int(
conn.execute(
f"SELECT COUNT(*) FROM semantic_index_docs {where_sql}",
params,
).fetchone()[0]
)
return min(4096, max(limit, limit * max(fetch_multiplier, 1)))
def info(self) -> dict[str, Any]:
with self.connect() as conn:
config = {
@ -344,7 +392,8 @@ def _matches_filters(
filters: dict[str, Any],
) -> bool:
for key, expected in filters.items():
actual = row[key] if key in row.keys() else metadata.get(key)
actual_key = "file_ref" if key == "file_refs" else key
actual = row[actual_key] if actual_key in row.keys() else metadata.get(actual_key)
if isinstance(expected, list):
if str(actual) not in {str(item) for item in expected}:
return False
@ -360,3 +409,41 @@ def _source_type_filters(filters: dict[str, Any]) -> list[str]:
if isinstance(value, list):
return [str(item) for item in value if str(item)]
return [str(value)] if str(value) else []
def _file_ref_filters(filters: dict[str, Any]) -> list[str] | None:
if "file_ref" in filters:
value = filters.get("file_ref")
elif "file_refs" in filters:
value = filters.get("file_refs")
else:
return None
if isinstance(value, list):
return [str(item) for item in value if str(item)]
return [str(value)] if str(value) else []
def _install_file_ref_filter_table(conn: sqlite3.Connection, file_refs: list[str]) -> None:
conn.execute(
"""
CREATE TEMP TABLE IF NOT EXISTS semantic_index_filter_file_refs (
file_ref TEXT PRIMARY KEY
)
"""
)
conn.execute("DELETE FROM semantic_index_filter_file_refs")
conn.executemany(
"INSERT OR IGNORE INTO semantic_index_filter_file_refs(file_ref) VALUES (?)",
[(file_ref,) for file_ref in file_refs],
)
def _file_ref_filter_sql(file_refs: list[str] | None) -> str:
if file_refs is None:
return ""
return (
"AND EXISTS ("
"SELECT 1 FROM semantic_index_filter_file_refs scope_refs "
"WHERE scope_refs.file_ref = d.file_ref"
")"
)

View file

@ -753,6 +753,33 @@ class SQLiteFileSystemStore:
return results
return results
def file_refs_for_scope(
    self,
    *,
    scope: Optional[dict[str, Any]] = None,
    metadata_filter: Optional[dict[str, Any]] = None,
) -> list[str]:
    """List distinct ``file_ref`` values of non-deleted files, sorted by ref.

    Rows are restricted by the clause from ``self._scope_sql(scope)`` (when
    non-empty) and by every clause from
    ``self._metadata_filter_sql(metadata_filter)``.
    """
    conditions = ["f.deleted_at IS NULL"]
    bindings: list[Any] = []

    scope_clause, scope_bindings = self._scope_sql(scope)
    if scope_clause:
        conditions.append(scope_clause)
        bindings += scope_bindings

    filter_clauses, filter_bindings = self._metadata_filter_sql(metadata_filter)
    conditions += filter_clauses
    bindings += filter_bindings

    with self.connect() as conn:
        fetched = conn.execute(
            f"""
            SELECT DISTINCT f.file_ref
            FROM files f
            WHERE {" AND ".join(conditions)}
            ORDER BY f.file_ref
            """,
            bindings,
        ).fetchall()
    return [record["file_ref"] for record in fetched]
def _search_once(
self,
match_query: str | None,

View file

@ -69,6 +69,257 @@ class ChannelBackend:
]
class BrowseBackend:
    """Test double for a semantic backend that returns documents in a fixed order.

    Every ``search_channel`` call is recorded in ``self.calls`` as
    ``(channel, query, limit, filters)``.
    """

    def __init__(self, document_ids, channels=("summary",), file_refs_by_document_id=None):
        self.calls = []
        self.channels = channels
        self.document_ids = list(document_ids)
        self.file_refs_by_document_id = dict(file_refs_by_document_id or {})

    def available_channels(self):
        """Return the channels this backend was constructed with."""
        return self.channels

    def search_channel(self, channel, query, *, limit=10, filters=None):
        """Return up to ``limit`` ranked candidates, honouring a file-ref filter.

        The ``file_ref``/``file_refs`` filter is applied only when a
        document-id to file-ref mapping was supplied at construction.
        """
        self.calls.append((channel, query, limit, filters))

        allowed_refs = set()
        if isinstance(filters, dict):
            requested = filters.get("file_ref") or filters.get("file_refs") or []
            if isinstance(requested, str):
                allowed_refs = {requested}
            else:
                allowed_refs = {str(ref) for ref in requested}

        pool = self.document_ids
        if allowed_refs and self.file_refs_by_document_id:
            ref_for = self.file_refs_by_document_id.get
            pool = [doc_id for doc_id in pool if ref_for(doc_id) in allowed_refs]

        hits = []
        for rank, document_id in enumerate(pool[:limit], 1):
            hits.append(
                SimpleNamespace(
                    document_id=document_id,
                    snippet=f"{channel} candidate {rank}: {query}",
                    score=1.0 - rank * 0.01,
                    sources=[{"channel": channel, "rank": rank, "distance": rank / 10}],
                )
            )
        return hits
def _register_browse_file(filesystem, external_id, folder_path, *, department="ops"):
    """Register one text file for the browse tests and return ``register_file``'s result.

    A stub metadata generator is installed on ``filesystem`` first; the
    metadata policy enables only the ``summary`` field.
    """
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    class SummaryGenerator:
        """Stub generator that answers requested fields from fixed values."""

        def generate(self, document, *, fields):
            canned = {
                "summary": f"summary for {document.external_id}",
                "doc_type": "memo",
                "domain": "finance",
                "topic": "risk",
            }
            selected = {}
            for field in fields:
                if field in canned:
                    selected[field] = canned[field]
            return MetadataGenerationResult(values=selected)

    filesystem.metadata_generator = SummaryGenerator()

    file_name = f"{external_id}.txt"
    policy = {
        "fields": {
            "summary": True,
            "doc_type": False,
            "domain": False,
            "topic": False,
        }
    }
    return filesystem.register_file(
        storage_uri=f"file:///tmp/{file_name}",
        source_path=f"documents/{file_name}",
        folder_path=folder_path,
        external_id=external_id,
        title=file_name,
        content=f"{external_id} discusses vector databases and retrieval.",
        metadata={"department": department},
        metadata_policy=policy,
    )
def test_browse_is_agent_visible_semantic_command(tmp_path):
    """``browse`` is an allowed command and appears in the surface description."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    workspace = tmp_path / "workspace"
    executor = PIFSCommandExecutor(PageIndexFileSystem(workspace=workspace))

    allowed = executor.allowed_commands()
    assert "browse" in allowed

    surfaces = executor.describe_available_command_surfaces()
    assert 'browse [-R] <folder> "<query>"' in surfaces
def test_browse_requires_positional_query_and_rejects_removed_options(tmp_path):
    """Missing queries, removed options and unquoted queries are all rejected."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    fs = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(fs, "doc_direct", "/documents")
    fs.semantic_retrieval_backend = BrowseBackend(["doc_direct"])
    executor = PIFSCommandExecutor(fs, json_output=True)

    # (expected error pattern, command line) pairs, checked in order.
    rejected = [
        ("browse requires a query", "browse /documents"),
        ("--query", 'browse /documents "vector database" --query "other"'),
        ("--limit", 'browse /documents "vector database" --limit 10'),
        ("--offset", 'browse /documents "vector database" --offset 10'),
        (
            "browse accepts a folder and one quoted query",
            "browse /documents vector database",
        ),
    ]
    for pattern, command in rejected:
        with pytest.raises(PIFSCommandError, match=pattern):
            executor.execute(command)
def test_browse_validates_space_availability_and_page(tmp_path):
    """Unknown spaces, unavailable spaces and page 0 each raise a command error."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    fs = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(fs, "doc_direct", "/documents")
    fs.semantic_retrieval_backend = BrowseBackend(["doc_direct"], channels=("summary",))
    executor = PIFSCommandExecutor(fs, json_output=True)

    expectations = (
        (
            "Unsupported browse --space: hybrid",
            'browse /documents "vector database" --space hybrid',
        ),
        (
            "available spaces: summary",
            'browse /documents "vector database" --space entity',
        ),
        (
            "browse --page must be at least 1",
            'browse /documents "vector database" --page 0',
        ),
    )
    for pattern, command in expectations:
        with pytest.raises(PIFSCommandError, match=pattern):
            executor.execute(command)
def test_browse_default_summary_does_not_fallback_to_other_spaces(tmp_path):
    """With only ``entity`` available, the default space fails without searching."""
    import json

    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    fs = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(fs, "doc_direct", "/documents")
    entity_backend = BrowseBackend(["doc_direct"], channels=("entity",))
    fs.semantic_retrieval_backend = entity_backend
    executor = PIFSCommandExecutor(fs, json_output=True)

    with pytest.raises(PIFSCommandError, match="available spaces: entity"):
        executor.execute('browse /documents "vector database"')
    # The failed default-space request must not have reached the backend.
    assert entity_backend.calls == []

    raw = executor.execute('browse /documents "vector database" --space entity')
    payload = json.loads(raw)["data"]
    returned_ids = [row["document_id"] for row in payload["data"]]
    assert returned_ids == ["doc_direct"]
    assert entity_backend.calls[-1][0] == "entity"
def test_browse_non_recursive_searches_only_direct_files_and_recursive_is_global(tmp_path):
    """Plain browse returns direct children only; ``-R`` also returns nested files."""
    import json

    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    fs = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(fs, "doc_direct", "/documents")
    _register_browse_file(fs, "doc_deep", "/documents/reports")
    backend = BrowseBackend(["doc_deep", "doc_direct"])
    fs.semantic_retrieval_backend = backend
    executor = PIFSCommandExecutor(fs, json_output=True)

    def run(command):
        return json.loads(executor.execute(command))["data"]

    direct = run('browse /documents "vector database"')
    assert [row["document_id"] for row in direct["data"]] == ["doc_direct"]
    assert direct["recursive"] is False
    assert direct["space"] == "summary"
    assert direct["page"] == 1
    assert direct["page_size"] == 10
    assert backend.calls[-1][0] == "summary"

    recursive = run('browse -R /documents "vector database"')
    recursive_rows = recursive["data"]
    assert [row["document_id"] for row in recursive_rows] == [
        "doc_deep",
        "doc_direct",
    ]
    assert [row["rank"] for row in recursive_rows] == [1, 2]
    assert recursive["recursive"] is True
def test_browse_supports_fixed_size_one_based_pagination_and_metadata_filter(tmp_path):
    """Twelve files paginate as 10 + 2, and ``--where`` narrows to the finance file."""
    import json

    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    fs = PageIndexFileSystem(workspace=tmp_path / "workspace")
    document_ids = [f"doc_{index:02d}" for index in range(12)]
    for index, external_id in enumerate(document_ids):
        # Only doc_10 belongs to finance; it is the metadata-filter target below.
        department = "finance" if index == 10 else "ops"
        _register_browse_file(fs, external_id, "/documents", department=department)
    fs.semantic_retrieval_backend = BrowseBackend(document_ids)
    executor = PIFSCommandExecutor(fs, json_output=True)

    def run(command):
        return json.loads(executor.execute(command))["data"]

    first_page = run('browse /documents "vector database"')
    assert len(first_page["data"]) == 10
    assert first_page["has_more"] is True
    assert first_page["data"][0]["rank"] == 1

    second_page = run('browse /documents "vector database" --page 2')
    second_rows = second_page["data"]
    assert [row["document_id"] for row in second_rows] == ["doc_10", "doc_11"]
    assert [row["rank"] for row in second_rows] == [11, 12]
    assert second_page["has_more"] is False

    filtered = run(
        'browse /documents "vector database" --where \'{"department":"finance"}\''
    )
    assert [row["document_id"] for row in filtered["data"]] == ["doc_10"]
    assert filtered["data"][0]["summary"] == "summary for doc_10"
def test_browse_scopes_semantic_search_before_candidate_limit(tmp_path):
    """In-scope files are still found when 150 off-scope files rank ahead of them."""
    import json

    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    fs = PageIndexFileSystem(workspace=tmp_path / "workspace")
    refs_by_document = {}

    off_scope_ids = []
    for index in range(150):
        external_id = f"off_scope_{index:02d}"
        off_scope_ids.append(external_id)
        refs_by_document[external_id] = _register_browse_file(fs, external_id, "/other")

    refs_by_document["doc_deep"] = _register_browse_file(
        fs, "doc_deep", "/documents/reports"
    )
    refs_by_document["doc_direct"] = _register_browse_file(
        fs, "doc_direct", "/documents"
    )

    # The in-scope documents are placed last in the backend's ranking.
    ranked_ids = off_scope_ids + ["doc_deep", "doc_direct"]
    fs.semantic_retrieval_backend = BrowseBackend(
        ranked_ids,
        file_refs_by_document_id=refs_by_document,
    )
    executor = PIFSCommandExecutor(fs, json_output=True)

    def run(command):
        return json.loads(executor.execute(command))["data"]

    direct = run('browse /documents "vector database"')
    assert [row["document_id"] for row in direct["data"]] == ["doc_direct"]

    recursive = run('browse -R /documents "vector database"')
    assert [row["document_id"] for row in recursive["data"]] == [
        "doc_deep",
        "doc_direct",
    ]
def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
from pageindex.filesystem.metadata_generation import MetadataGenerationResult

View file

@ -55,6 +55,44 @@ def test_sqlite_vec_semantic_index_round_trip(tmp_path):
assert [item.external_id for item in filtered] == ["doc_b"]
def test_sqlite_vec_semantic_index_file_ref_filter_not_limited_by_global_rank(tmp_path):
    """A ``file_ref`` filter finds its file even when 30 other records match better."""
    index = SQLiteVecSemanticIndex(tmp_path / "semantic.sqlite")
    index.reset(dimension=2, metadata={"field_mode": "summary"})

    records = []
    for item in range(30):
        # These records share the query vector exactly, so they rank first.
        records.append(
            SemanticIndexRecord(
                file_ref=f"file_off_{item:02d}",
                external_id=f"doc_off_{item:02d}",
                source_type="documents",
                source_path=f"other/{item:02d}.pdf",
                title=f"Off scope {item:02d}",
                text="off scope",
                vector=[1.0, 0.0],
            )
        )
    in_scope = SemanticIndexRecord(
        file_ref="file_in_scope",
        external_id="doc_in_scope",
        source_type="documents",
        source_path="documents/in-scope.pdf",
        title="In scope",
        text="in scope",
        vector=[0.0, 1.0],
    )
    records.append(in_scope)
    index.upsert_many(records)

    results = index.search(
        [1.0, 0.0],
        limit=1,
        filters={"file_ref": ["file_in_scope"]},
    )

    found_refs = [hit.file_ref for hit in results]
    assert found_refs == ["file_in_scope"]
def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer