fix(pifs): scope browse vector search before paging

This commit is contained in:
BukeLy 2026-05-31 17:30:01 +08:00
parent ba821a70b9
commit 3562d47fdb
5 changed files with 238 additions and 16 deletions

View file

@ -70,9 +70,10 @@ class ChannelBackend:
class BrowseBackend:
def __init__(self, document_ids, channels=("summary",)):
def __init__(self, document_ids, channels=("summary",), file_refs_by_document_id=None):
self.document_ids = list(document_ids)
self.channels = channels
self.file_refs_by_document_id = dict(file_refs_by_document_id or {})
self.calls = []
def available_channels(self):
@ -80,6 +81,20 @@ class BrowseBackend:
def search_channel(self, channel, query, *, limit=10, filters=None):
self.calls.append((channel, query, limit, filters))
file_ref_filter = set()
if isinstance(filters, dict):
raw_file_refs = filters.get("file_ref") or filters.get("file_refs") or []
if isinstance(raw_file_refs, str):
file_ref_filter = {raw_file_refs}
else:
file_ref_filter = {str(item) for item in raw_file_refs}
document_ids = self.document_ids
if file_ref_filter and self.file_refs_by_document_id:
document_ids = [
document_id
for document_id in document_ids
if self.file_refs_by_document_id.get(document_id) in file_ref_filter
]
return [
SimpleNamespace(
document_id=document_id,
@ -87,7 +102,7 @@ class BrowseBackend:
score=1.0 - rank * 0.01,
sources=[{"channel": channel, "rank": rank, "distance": rank / 10}],
)
for rank, document_id in enumerate(self.document_ids[:limit], 1)
for rank, document_id in enumerate(document_ids[:limit], 1)
]
@ -108,11 +123,11 @@ def _register_browse_file(filesystem, external_id, folder_path, *, department="o
filesystem.metadata_generator = SummaryGenerator()
return filesystem.register_file(
storage_uri=f"file:///tmp/{external_id}.pdf",
source_path=f"documents/{external_id}.pdf",
storage_uri=f"file:///tmp/{external_id}.txt",
source_path=f"documents/{external_id}.txt",
folder_path=folder_path,
external_id=external_id,
title=f"{external_id}.pdf",
title=f"{external_id}.txt",
content=f"{external_id} discusses vector databases and retrieval.",
metadata={"department": department},
metadata_policy={
@ -262,6 +277,49 @@ def test_browse_supports_fixed_size_one_based_pagination_and_metadata_filter(tmp
assert filtered["data"][0]["summary"] == "summary for doc_10"
def test_browse_scopes_semantic_search_before_candidate_limit(tmp_path):
    """Check that ``browse`` scopes semantic search to the folder before limiting.

    150 files are registered under ``/other`` and listed ahead of the two
    in-scope documents in the fake backend's ranking. The backend only narrows
    its results when it is handed a ``file_ref`` filter, so the expected
    results can only appear if browse passes the folder scope down to the
    search (presumably 150 exceeds the browse candidate limit -- confirm
    against the executor).
    """
    import json

    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")

    # (external_id, folder_path) pairs, in both registration and ranking order:
    # every off-scope document comes before the two in-scope ones.
    registrations = [
        *((f"off_scope_{number:02d}", "/other") for number in range(150)),
        ("doc_deep", "/documents/reports"),
        ("doc_direct", "/documents"),
    ]
    file_refs = {
        doc_id: _register_browse_file(filesystem, doc_id, folder)
        for doc_id, folder in registrations
    }

    filesystem.semantic_retrieval_backend = BrowseBackend(
        [doc_id for doc_id, _ in registrations],
        file_refs_by_document_id=file_refs,
    )
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    def browse_ids(command):
        # Run a browse command and return the document ids of its result rows.
        payload = json.loads(executor.execute(command))["data"]
        return [entry["document_id"] for entry in payload["data"]]

    # Non-recursive browse only sees the file directly inside /documents.
    assert browse_ids('browse /documents "vector database"') == ["doc_direct"]
    # Recursive browse also reaches the file under /documents/reports.
    assert browse_ids('browse -R /documents "vector database"') == [
        "doc_deep",
        "doc_direct",
    ]
def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
from pageindex.filesystem.metadata_generation import MetadataGenerationResult

View file

@ -55,6 +55,44 @@ def test_sqlite_vec_semantic_index_round_trip(tmp_path):
assert [item.external_id for item in filtered] == ["doc_b"]
def test_sqlite_vec_semantic_index_file_ref_filter_not_limited_by_global_rank(tmp_path):
    """Check that a ``file_ref`` filter is not cut off by the global top-``limit``.

    Thirty off-scope records share the exact query vector, while the single
    in-scope record uses a different vector. With ``limit=1`` the in-scope
    record must still be returned when the search is filtered to its
    ``file_ref``.
    """
    index = SQLiteVecSemanticIndex(tmp_path / "semantic.sqlite")
    index.reset(dimension=2, metadata={"field_mode": "summary"})

    # Off-scope records: identical to the query vector used below.
    off_scope_records = []
    for number in range(30):
        tag = f"{number:02d}"
        off_scope_records.append(
            SemanticIndexRecord(
                file_ref=f"file_off_{tag}",
                external_id=f"doc_off_{tag}",
                source_type="documents",
                source_path=f"other/{tag}.pdf",
                title=f"Off scope {tag}",
                text="off scope",
                vector=[1.0, 0.0],
            )
        )

    # The only record the filter should admit; its vector differs from the query.
    in_scope_record = SemanticIndexRecord(
        file_ref="file_in_scope",
        external_id="doc_in_scope",
        source_type="documents",
        source_path="documents/in-scope.pdf",
        title="In scope",
        text="in scope",
        vector=[0.0, 1.0],
    )

    index.upsert_many([*off_scope_records, in_scope_record])

    hits = index.search(
        [1.0, 0.0],
        limit=1,
        filters={"file_ref": ["file_in_scope"]},
    )

    assert [hit.file_ref for hit in hits] == ["file_in_scope"]
def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer