def _cmd_browse(self, args: list[str]) -> Any:
    """Parse ``browse`` arguments and run a semantic file listing.

    Accepted tokens are the recursion switches (``-R``/``-r``/``--recursive``),
    the value options ``--where``, ``--space`` and ``--page``, and exactly two
    positionals: the folder path followed by the query text.

    Raises:
        PIFSCommandError: on a missing option value, a removed or unknown
            option, the wrong number of positionals, a folder that does not
            start with ``/``, a blank query, a page below 1, or a space that
            is unknown or not available on this filesystem.
    """
    # Error text used when a value-taking option is the last token.
    missing_value_errors = {
        "--where": "browse --where requires a JSON value",
        "--space": "browse --space requires a value",
        "--page": "browse --page requires a value",
    }
    absent = object()

    recursive = False
    where = None
    space = "summary"
    page = 1
    operands = []

    tokens = iter(args)
    for token in tokens:
        if token in ("-R", "-r", "--recursive"):
            recursive = True
            continue
        if token in missing_value_errors:
            # The option's value is whatever token follows, consumed here so
            # the outer loop never sees it.
            value = next(tokens, absent)
            if value is absent:
                raise PIFSCommandError(missing_value_errors[token])
            if token == "--where":
                where = value
            elif token == "--space":
                space = value
            else:
                page = self._parse_non_negative_int(value, "browse --page")
            continue
        if token in ("--limit", "--offset", "--query"):
            raise PIFSCommandError(
                f"browse does not support {token}; use fixed page size "
                f"{self.BROWSE_PAGE_SIZE} and --page N"
            )
        if token.startswith("-"):
            raise PIFSCommandError(f"Unsupported browse option: {token}")
        operands.append(token)

    if len(operands) < 2:
        raise PIFSCommandError('browse requires a query: browse ""')
    if len(operands) > 2:
        raise PIFSCommandError(
            'browse accepts a folder and one quoted query, for example: '
            'browse /documents "Federal Reserve"'
        )

    path, query = operands
    if not str(path).startswith("/"):
        raise PIFSCommandError("browse target must be a PIFS folder path like /documents")
    query = str(query or "").strip()
    if not query:
        raise PIFSCommandError('browse requires a query: browse ""')
    if page < 1:
        raise PIFSCommandError("browse --page must be at least 1")
    if space not in SEMANTIC_RETRIEVAL_CHANNELS:
        raise PIFSCommandError(
            "Unsupported browse --space: "
            f"{space}. Supported spaces: {', '.join(SEMANTIC_RETRIEVAL_CHANNELS)}"
        )
    if not self.filesystem.has_semantic_channel(space):
        channels = self.filesystem.semantic_retrieval_channels()
        available_text = ", ".join(channels) if channels else "none"
        raise PIFSCommandError(
            f"browse --space {space} is not available; available spaces: {available_text}"
        )

    folder = self._normalize_folder_path(path)
    return self.filesystem.browse_semantic_files(
        folder,
        query,
        retrieval_query=self._semantic_retrieval_query(query),
        recursive=recursive,
        space=space,
        page=page,
        page_size=self.BROWSE_PAGE_SIZE,
        metadata_filter=where,
    )
"summary", + page: int = 1, + page_size: int = 10, + metadata_filter: Optional[dict[str, Any] | str] = None, + ) -> dict[str, Any]: + path = normalize_path(path) + self.store.folder_info(path) + query_text = self._query_text(retrieval_query or query).strip() + if not query_text: + raise ValueError("browse requires a query") + if page < 1: + raise ValueError("browse --page must be at least 1") + if page_size < 1: + raise ValueError("browse page_size must be at least 1") + if space not in SEMANTIC_RETRIEVAL_CHANNELS: + raise ValueError( + "Unsupported browse --space: " + f"{space}. Supported spaces: {', '.join(SEMANTIC_RETRIEVAL_CHANNELS)}" + ) + available_spaces = self.semantic_retrieval_channels() + if space not in available_spaces: + available = ", ".join(available_spaces) if available_spaces else "none" + raise ValueError( + f"browse --space {space} is not available; available spaces: {available}" + ) + search_channel = getattr(self.semantic_retrieval_backend, "search_channel", None) + if search_channel is None: + available = ", ".join(available_spaces) if available_spaces else "none" + raise ValueError( + f"browse --space {space} is not available; available spaces: {available}" + ) + parsed_filter = self.metadata.parse_filter(metadata_filter) + scope = {"folder_path": path, "recursive": recursive} + scope_file_refs = self.store.file_refs_for_scope( + scope=scope, + metadata_filter=parsed_filter, + ) + offset = (page - 1) * page_size + needed = offset + page_size + 1 + semantic_filters = self._semantic_filters_for_scope(scope) + semantic_filters["file_ref"] = scope_file_refs + candidates = ( + search_channel( + space, + query_text, + limit=needed, + filters=semantic_filters, + ) + if scope_file_refs + else [] + ) + scope_file_ref_set = set(scope_file_refs) + rows: list[dict[str, Any]] = [] + seen: set[str] = set() + for candidate in candidates: + try: + file_ref = self.store.resolve_file_ref(candidate.document_id) + except KeyError: + continue + if file_ref in 
seen: + continue + if file_ref not in scope_file_ref_set: + continue + if not self.store.file_matches( + file_ref, + scope=scope, + metadata_filter=parsed_filter, + ): + continue + seen.add(file_ref) + entry = self.store.get_file(file_ref) + folder_paths = [ + folder["path"] + for folder in self.store.folder_memberships(file_ref) + ] + rank = len(rows) + 1 + rows.append( + { + "rank": rank, + "similarity": self._semantic_candidate_similarity(candidate), + "score": self._semantic_candidate_score(candidate), + "path": self._stable_file_locator(file_ref, entry), + "file_ref": file_ref, + "document_id": entry.external_id, + "external_id": entry.external_id, + "title": entry.title, + "source_path": entry.source_path, + "folder_path": self._preferred_folder_path( + folder_paths, + path, + entry.folder_path, + ), + "folder_paths": folder_paths, + "summary": str((entry.metadata or {}).get("summary") or ""), + "snippet": str(getattr(candidate, "snippet", "") or entry.descriptor), + "metadata": entry.metadata, + "metadata_status": entry.metadata_status, + "sources": list(getattr(candidate, "sources", []) or []), + } + ) + if len(rows) >= needed: + break + page_rows = rows[offset : offset + page_size] + return { + "mode": "files", + "retrieval": f"{space}_vector", + "query": query, + "scope": path, + "recursive": recursive, + "space": space, + "available_spaces": list(available_spaces), + "page": page, + "page_size": page_size, + "has_more": len(rows) > offset + page_size, + "data": page_rows, + } + def folder_info(self, path: str = "/") -> dict[str, Any]: return self.store.folder_info(path) @@ -1515,6 +1639,45 @@ class PageIndexFileSystem: break return results + @staticmethod + def _semantic_candidate_score(candidate: Any) -> float | None: + try: + return float(getattr(candidate, "score")) + except (AttributeError, TypeError, ValueError): + return None + + @classmethod + def _semantic_candidate_similarity(cls, candidate: Any) -> float: + distances: list[float] = [] + for 
source in getattr(candidate, "sources", []) or []: + if not isinstance(source, dict) or source.get("distance") is None: + continue + try: + distances.append(float(source["distance"])) + except (TypeError, ValueError): + continue + if distances: + distance = max(min(distances), 0.0) + return round(max(0.0, min(1.0, 1.0 / (1.0 + distance))), 4) + score = cls._semantic_candidate_score(candidate) + if score is None: + return 0.0 + return round(max(0.0, min(1.0, score)), 4) + + def _stable_file_locator(self, file_ref: str, entry: Any) -> str: + source_path = str(getattr(entry, "source_path", "") or "").strip() + if source_path: + target = "/" + source_path.strip("/") + try: + if self.store.resolve_file_ref(target) == file_ref: + return target + except KeyError: + pass + external_id = str(getattr(entry, "external_id", "") or "").strip() + if external_id: + return external_id + return file_ref + @staticmethod def _build_descriptor(title: str, metadata: dict[str, Any]) -> str: source = metadata.get("source_type") or metadata.get("repo") or metadata.get("channel") diff --git a/pageindex/filesystem/semantic_index.py b/pageindex/filesystem/semantic_index.py index 2453e1f..4a29551 100644 --- a/pageindex/filesystem/semantic_index.py +++ b/pageindex/filesystem/semantic_index.py @@ -159,15 +159,29 @@ class SQLiteVecSemanticIndex: raise SemanticIndexError( f"query vector dimension mismatch: expected {dimension}, got {len(vector)}" ) - fetch_k = min(4096, max(limit, limit * max(fetch_multiplier, 1))) - source_types = _source_type_filters(filters or {}) + raw_filters = filters or {} + source_types = _source_type_filters(raw_filters) + file_refs = _file_ref_filters(raw_filters) + if file_refs == []: + return [] with self.connect() as conn: + if file_refs is not None: + _install_file_ref_filter_table(conn, file_refs) rows = [] if source_types: for source_type in source_types: + fetch_k = self._search_fetch_k( + conn, + limit, + fetch_multiplier, + exact_file_ref_filter=file_refs is 
not None, + source_type=source_type, + ) + if fetch_k <= 0: + continue rows.extend( conn.execute( - """ + f""" SELECT d.file_ref, d.external_id, @@ -180,6 +194,7 @@ class SQLiteVecSemanticIndex: FROM semantic_index_vec v JOIN semantic_index_docs d ON d.rowid = v.rowid WHERE v.embedding MATCH ? AND k = ? AND v.source_type = ? + {_file_ref_filter_sql(file_refs)} ORDER BY v.distance """, (sqlite_vec.serialize_float32(vector), fetch_k, source_type), @@ -187,8 +202,16 @@ class SQLiteVecSemanticIndex: ) rows.sort(key=lambda row: float(row["distance"])) else: + fetch_k = self._search_fetch_k( + conn, + limit, + fetch_multiplier, + exact_file_ref_filter=file_refs is not None, + ) + if fetch_k <= 0: + return [] rows = conn.execute( - """ + f""" SELECT d.file_ref, d.external_id, @@ -201,6 +224,7 @@ class SQLiteVecSemanticIndex: FROM semantic_index_vec v JOIN semantic_index_docs d ON d.rowid = v.rowid WHERE v.embedding MATCH ? AND k = ? + {_file_ref_filter_sql(file_refs)} ORDER BY v.distance """, (sqlite_vec.serialize_float32(vector), fetch_k), @@ -226,6 +250,30 @@ class SQLiteVecSemanticIndex: break return results + @staticmethod + def _search_fetch_k( + conn: sqlite3.Connection, + limit: int, + fetch_multiplier: int, + *, + exact_file_ref_filter: bool, + source_type: str | None = None, + ) -> int: + if exact_file_ref_filter: + where = [] + params: list[Any] = [] + if source_type is not None: + where.append("source_type = ?") + params.append(source_type) + where_sql = "WHERE " + " AND ".join(where) if where else "" + return int( + conn.execute( + f"SELECT COUNT(*) FROM semantic_index_docs {where_sql}", + params, + ).fetchone()[0] + ) + return min(4096, max(limit, limit * max(fetch_multiplier, 1))) + def info(self) -> dict[str, Any]: with self.connect() as conn: config = { @@ -344,7 +392,8 @@ def _matches_filters( filters: dict[str, Any], ) -> bool: for key, expected in filters.items(): - actual = row[key] if key in row.keys() else metadata.get(key) + actual_key = "file_ref" 
if key == "file_refs" else key + actual = row[actual_key] if actual_key in row.keys() else metadata.get(actual_key) if isinstance(expected, list): if str(actual) not in {str(item) for item in expected}: return False @@ -360,3 +409,41 @@ def _source_type_filters(filters: dict[str, Any]) -> list[str]: if isinstance(value, list): return [str(item) for item in value if str(item)] return [str(value)] if str(value) else [] + + +def _file_ref_filters(filters: dict[str, Any]) -> list[str] | None: + if "file_ref" in filters: + value = filters.get("file_ref") + elif "file_refs" in filters: + value = filters.get("file_refs") + else: + return None + if isinstance(value, list): + return [str(item) for item in value if str(item)] + return [str(value)] if str(value) else [] + + +def _install_file_ref_filter_table(conn: sqlite3.Connection, file_refs: list[str]) -> None: + conn.execute( + """ + CREATE TEMP TABLE IF NOT EXISTS semantic_index_filter_file_refs ( + file_ref TEXT PRIMARY KEY + ) + """ + ) + conn.execute("DELETE FROM semantic_index_filter_file_refs") + conn.executemany( + "INSERT OR IGNORE INTO semantic_index_filter_file_refs(file_ref) VALUES (?)", + [(file_ref,) for file_ref in file_refs], + ) + + +def _file_ref_filter_sql(file_refs: list[str] | None) -> str: + if file_refs is None: + return "" + return ( + "AND EXISTS (" + "SELECT 1 FROM semantic_index_filter_file_refs scope_refs " + "WHERE scope_refs.file_ref = d.file_ref" + ")" + ) diff --git a/pageindex/filesystem/store.py b/pageindex/filesystem/store.py index 7517d70..30a7d32 100644 --- a/pageindex/filesystem/store.py +++ b/pageindex/filesystem/store.py @@ -753,6 +753,33 @@ class SQLiteFileSystemStore: return results return results + def file_refs_for_scope( + self, + *, + scope: Optional[dict[str, Any]] = None, + metadata_filter: Optional[dict[str, Any]] = None, + ) -> list[str]: + where = ["f.deleted_at IS NULL"] + params: list[Any] = [] + scope_sql, scope_params = self._scope_sql(scope) + if scope_sql: + 
class BrowseBackend:
    """Test double for a semantic backend exposing ``search_channel``.

    Candidates are produced from a fixed, pre-ranked list of document ids.
    Every call is recorded in ``calls`` as ``(channel, query, limit, filters)``.
    """

    def __init__(self, document_ids, channels=("summary",), file_refs_by_document_id=None):
        self.document_ids = list(document_ids)
        self.channels = channels
        # Optional document-id -> file_ref map; without it file-ref filters
        # are ignored by ``search_channel``.
        self.file_refs_by_document_id = dict(file_refs_by_document_id or {})
        self.calls = []

    def available_channels(self):
        """Return the channels this backend was configured with."""
        return self.channels

    @staticmethod
    def _requested_file_refs(filters):
        """Collect the ``file_ref``/``file_refs`` filter values as a set of strings."""
        if not isinstance(filters, dict):
            return set()
        raw = filters.get("file_ref") or filters.get("file_refs") or []
        if isinstance(raw, str):
            return {raw}
        return {str(item) for item in raw}

    def search_channel(self, channel, query, *, limit=10, filters=None):
        """Return up to ``limit`` ranked candidates, honouring file-ref filters."""
        self.calls.append((channel, query, limit, filters))
        wanted = self._requested_file_refs(filters)
        ranked = self.document_ids
        if wanted and self.file_refs_by_document_id:
            ranked = [
                document_id
                for document_id in ranked
                if self.file_refs_by_document_id.get(document_id) in wanted
            ]
        hits = []
        for rank, document_id in enumerate(ranked[:limit], 1):
            hits.append(
                SimpleNamespace(
                    document_id=document_id,
                    snippet=f"{channel} candidate {rank}: {query}",
                    score=1.0 - rank * 0.01,
                    sources=[{"channel": channel, "rank": rank, "distance": rank / 10}],
                )
            )
        return hits
_register_browse_file(filesystem, external_id, folder_path, *, department="ops"): + from pageindex.filesystem.metadata_generation import MetadataGenerationResult + + class SummaryGenerator: + def generate(self, document, *, fields): + values = { + "summary": f"summary for {document.external_id}", + "doc_type": "memo", + "domain": "finance", + "topic": "risk", + } + return MetadataGenerationResult( + values={field: values[field] for field in fields if field in values} + ) + + filesystem.metadata_generator = SummaryGenerator() + return filesystem.register_file( + storage_uri=f"file:///tmp/{external_id}.txt", + source_path=f"documents/{external_id}.txt", + folder_path=folder_path, + external_id=external_id, + title=f"{external_id}.txt", + content=f"{external_id} discusses vector databases and retrieval.", + metadata={"department": department}, + metadata_policy={ + "fields": { + "summary": True, + "doc_type": False, + "domain": False, + "topic": False, + } + }, + ) + + +def test_browse_is_agent_visible_semantic_command(tmp_path): + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + + filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace") + executor = PIFSCommandExecutor(filesystem) + + assert "browse" in executor.allowed_commands() + assert 'browse [-R] ""' in executor.describe_available_command_surfaces() + + +def test_browse_requires_positional_query_and_rejects_removed_options(tmp_path): + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + from pageindex.filesystem.commands import PIFSCommandError + + filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace") + _register_browse_file(filesystem, "doc_direct", "/documents") + filesystem.semantic_retrieval_backend = BrowseBackend(["doc_direct"]) + executor = PIFSCommandExecutor(filesystem, json_output=True) + + with pytest.raises(PIFSCommandError, match="browse requires a query"): + executor.execute("browse /documents") + with 
pytest.raises(PIFSCommandError, match="--query"): + executor.execute('browse /documents "vector database" --query "other"') + with pytest.raises(PIFSCommandError, match="--limit"): + executor.execute('browse /documents "vector database" --limit 10') + with pytest.raises(PIFSCommandError, match="--offset"): + executor.execute('browse /documents "vector database" --offset 10') + with pytest.raises(PIFSCommandError, match="browse accepts a folder and one quoted query"): + executor.execute("browse /documents vector database") + + +def test_browse_validates_space_availability_and_page(tmp_path): + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + from pageindex.filesystem.commands import PIFSCommandError + + filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace") + _register_browse_file(filesystem, "doc_direct", "/documents") + filesystem.semantic_retrieval_backend = BrowseBackend(["doc_direct"], channels=("summary",)) + executor = PIFSCommandExecutor(filesystem, json_output=True) + + with pytest.raises(PIFSCommandError, match="Unsupported browse --space: hybrid"): + executor.execute('browse /documents "vector database" --space hybrid') + with pytest.raises(PIFSCommandError, match="available spaces: summary"): + executor.execute('browse /documents "vector database" --space entity') + with pytest.raises(PIFSCommandError, match="browse --page must be at least 1"): + executor.execute('browse /documents "vector database" --page 0') + + +def test_browse_default_summary_does_not_fallback_to_other_spaces(tmp_path): + import json + + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + from pageindex.filesystem.commands import PIFSCommandError + + filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace") + _register_browse_file(filesystem, "doc_direct", "/documents") + backend = BrowseBackend(["doc_direct"], channels=("entity",)) + filesystem.semantic_retrieval_backend = backend + executor = 
PIFSCommandExecutor(filesystem, json_output=True) + + with pytest.raises(PIFSCommandError, match="available spaces: entity"): + executor.execute('browse /documents "vector database"') + assert backend.calls == [] + + result = json.loads( + executor.execute('browse /documents "vector database" --space entity') + )["data"] + assert [item["document_id"] for item in result["data"]] == ["doc_direct"] + assert backend.calls[-1][0] == "entity" + + +def test_browse_non_recursive_searches_only_direct_files_and_recursive_is_global(tmp_path): + import json + + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + + filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace") + _register_browse_file(filesystem, "doc_direct", "/documents") + _register_browse_file(filesystem, "doc_deep", "/documents/reports") + backend = BrowseBackend(["doc_deep", "doc_direct"]) + filesystem.semantic_retrieval_backend = backend + executor = PIFSCommandExecutor(filesystem, json_output=True) + + direct = json.loads(executor.execute('browse /documents "vector database"'))["data"] + assert [item["document_id"] for item in direct["data"]] == ["doc_direct"] + assert direct["recursive"] is False + assert direct["space"] == "summary" + assert direct["page"] == 1 + assert direct["page_size"] == 10 + assert backend.calls[-1][0] == "summary" + + recursive = json.loads(executor.execute('browse -R /documents "vector database"'))["data"] + assert [item["document_id"] for item in recursive["data"]] == [ + "doc_deep", + "doc_direct", + ] + assert [item["rank"] for item in recursive["data"]] == [1, 2] + assert recursive["recursive"] is True + + +def test_browse_supports_fixed_size_one_based_pagination_and_metadata_filter(tmp_path): + import json + + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + + filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace") + document_ids = [] + for index in range(12): + external_id = f"doc_{index:02d}" + 
document_ids.append(external_id) + department = "finance" if index == 10 else "ops" + _register_browse_file(filesystem, external_id, "/documents", department=department) + filesystem.semantic_retrieval_backend = BrowseBackend(document_ids) + executor = PIFSCommandExecutor(filesystem, json_output=True) + + first_page = json.loads(executor.execute('browse /documents "vector database"'))["data"] + assert len(first_page["data"]) == 10 + assert first_page["has_more"] is True + assert first_page["data"][0]["rank"] == 1 + + second_page = json.loads( + executor.execute('browse /documents "vector database" --page 2') + )["data"] + assert [item["document_id"] for item in second_page["data"]] == ["doc_10", "doc_11"] + assert [item["rank"] for item in second_page["data"]] == [11, 12] + assert second_page["has_more"] is False + + filtered = json.loads( + executor.execute( + 'browse /documents "vector database" --where \'{"department":"finance"}\'' + ) + )["data"] + assert [item["document_id"] for item in filtered["data"]] == ["doc_10"] + assert filtered["data"][0]["summary"] == "summary for doc_10" + + +def test_browse_scopes_semantic_search_before_candidate_limit(tmp_path): + import json + + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + + filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace") + file_refs_by_document_id = {} + candidate_ids = [] + for index in range(150): + external_id = f"off_scope_{index:02d}" + candidate_ids.append(external_id) + file_refs_by_document_id[external_id] = _register_browse_file( + filesystem, + external_id, + "/other", + ) + file_refs_by_document_id["doc_deep"] = _register_browse_file( + filesystem, + "doc_deep", + "/documents/reports", + ) + file_refs_by_document_id["doc_direct"] = _register_browse_file( + filesystem, + "doc_direct", + "/documents", + ) + backend = BrowseBackend( + [*candidate_ids, "doc_deep", "doc_direct"], + file_refs_by_document_id=file_refs_by_document_id, + ) + 
filesystem.semantic_retrieval_backend = backend + executor = PIFSCommandExecutor(filesystem, json_output=True) + + direct = json.loads(executor.execute('browse /documents "vector database"'))["data"] + assert [item["document_id"] for item in direct["data"]] == ["doc_direct"] + + recursive = json.loads(executor.execute('browse -R /documents "vector database"'))["data"] + assert [item["document_id"] for item in recursive["data"]] == [ + "doc_deep", + "doc_direct", + ] + + def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path): from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem from pageindex.filesystem.metadata_generation import MetadataGenerationResult diff --git a/tests/test_semantic_index.py b/tests/test_semantic_index.py index 324ead7..d4263e1 100644 --- a/tests/test_semantic_index.py +++ b/tests/test_semantic_index.py @@ -55,6 +55,44 @@ def test_sqlite_vec_semantic_index_round_trip(tmp_path): assert [item.external_id for item in filtered] == ["doc_b"] +def test_sqlite_vec_semantic_index_file_ref_filter_not_limited_by_global_rank(tmp_path): + index = SQLiteVecSemanticIndex(tmp_path / "semantic.sqlite") + index.reset(dimension=2, metadata={"field_mode": "summary"}) + + records = [ + SemanticIndexRecord( + file_ref=f"file_off_{item:02d}", + external_id=f"doc_off_{item:02d}", + source_type="documents", + source_path=f"other/{item:02d}.pdf", + title=f"Off scope {item:02d}", + text="off scope", + vector=[1.0, 0.0], + ) + for item in range(30) + ] + records.append( + SemanticIndexRecord( + file_ref="file_in_scope", + external_id="doc_in_scope", + source_type="documents", + source_path="documents/in-scope.pdf", + title="In scope", + text="in scope", + vector=[0.0, 1.0], + ) + ) + index.upsert_many(records) + + results = index.search( + [1.0, 0.0], + limit=1, + filters={"file_ref": ["file_in_scope"]}, + ) + + assert [item.file_ref for item in results] == ["file_in_scope"] + + def 
test_summary_projection_indexes_unified_metadata_summary(tmp_path): from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer