mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-27 20:29:41 +02:00
fix(pifs): scope browse vector search before paging
This commit is contained in:
parent
ba821a70b9
commit
3562d47fdb
5 changed files with 238 additions and 16 deletions
|
|
@@ -70,9 +70,10 @@ class ChannelBackend:
|
|||
|
||||
|
||||
class BrowseBackend:
|
||||
def __init__(self, document_ids, channels=("summary",)):
|
||||
def __init__(self, document_ids, channels=("summary",), file_refs_by_document_id=None):
|
||||
self.document_ids = list(document_ids)
|
||||
self.channels = channels
|
||||
self.file_refs_by_document_id = dict(file_refs_by_document_id or {})
|
||||
self.calls = []
|
||||
|
||||
def available_channels(self):
|
||||
|
|
@@ -80,6 +81,20 @@ class BrowseBackend:
|
|||
|
||||
def search_channel(self, channel, query, *, limit=10, filters=None):
|
||||
self.calls.append((channel, query, limit, filters))
|
||||
file_ref_filter = set()
|
||||
if isinstance(filters, dict):
|
||||
raw_file_refs = filters.get("file_ref") or filters.get("file_refs") or []
|
||||
if isinstance(raw_file_refs, str):
|
||||
file_ref_filter = {raw_file_refs}
|
||||
else:
|
||||
file_ref_filter = {str(item) for item in raw_file_refs}
|
||||
document_ids = self.document_ids
|
||||
if file_ref_filter and self.file_refs_by_document_id:
|
||||
document_ids = [
|
||||
document_id
|
||||
for document_id in document_ids
|
||||
if self.file_refs_by_document_id.get(document_id) in file_ref_filter
|
||||
]
|
||||
return [
|
||||
SimpleNamespace(
|
||||
document_id=document_id,
|
||||
|
|
@@ -87,7 +102,7 @@ class BrowseBackend:
|
|||
score=1.0 - rank * 0.01,
|
||||
sources=[{"channel": channel, "rank": rank, "distance": rank / 10}],
|
||||
)
|
||||
for rank, document_id in enumerate(self.document_ids[:limit], 1)
|
||||
for rank, document_id in enumerate(document_ids[:limit], 1)
|
||||
]
|
||||
|
||||
|
||||
|
|
@@ -108,11 +123,11 @@ def _register_browse_file(filesystem, external_id, folder_path, *, department="o
|
|||
|
||||
filesystem.metadata_generator = SummaryGenerator()
|
||||
return filesystem.register_file(
|
||||
storage_uri=f"file:///tmp/{external_id}.pdf",
|
||||
source_path=f"documents/{external_id}.pdf",
|
||||
storage_uri=f"file:///tmp/{external_id}.txt",
|
||||
source_path=f"documents/{external_id}.txt",
|
||||
folder_path=folder_path,
|
||||
external_id=external_id,
|
||||
title=f"{external_id}.pdf",
|
||||
title=f"{external_id}.txt",
|
||||
content=f"{external_id} discusses vector databases and retrieval.",
|
||||
metadata={"department": department},
|
||||
metadata_policy={
|
||||
|
|
@@ -262,6 +277,49 @@ def test_browse_supports_fixed_size_one_based_pagination_and_metadata_filter(tmp
|
|||
assert filtered["data"][0]["summary"] == "summary for doc_10"
|
||||
|
||||
|
||||
def test_browse_scopes_semantic_search_before_candidate_limit(tmp_path):
    """Check that ``browse`` scopes semantic search to the folder before limiting.

    Registers 150 files under ``/other`` plus one file each under
    ``/documents/reports`` and ``/documents``. The fake backend ranks ids in
    list order, so every off-scope file outranks the two in-scope ones
    (presumably 150 is chosen to exceed the browse candidate limit -- confirm
    against the implementation). A plain browse of ``/documents`` must return
    only the direct child; a recursive browse must return both in-scope files.
    """
    import json

    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")

    # Off-scope files come first in the backend ranking.
    off_scope_ids = [f"off_scope_{index:02d}" for index in range(150)]
    file_refs_by_document_id = {
        external_id: _register_browse_file(filesystem, external_id, "/other")
        for external_id in off_scope_ids
    }

    # The two in-scope files are registered last and therefore rank last.
    for external_id, folder_path in (
        ("doc_deep", "/documents/reports"),
        ("doc_direct", "/documents"),
    ):
        file_refs_by_document_id[external_id] = _register_browse_file(
            filesystem,
            external_id,
            folder_path,
        )

    ranked_ids = [*off_scope_ids, "doc_deep", "doc_direct"]
    filesystem.semantic_retrieval_backend = BrowseBackend(
        ranked_ids,
        file_refs_by_document_id=file_refs_by_document_id,
    )
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    def browse_document_ids(command):
        # Run one browse command and pull the document ids out of its JSON payload.
        payload = json.loads(executor.execute(command))["data"]
        return [item["document_id"] for item in payload["data"]]

    assert browse_document_ids('browse /documents "vector database"') == ["doc_direct"]
    assert browse_document_ids('browse -R /documents "vector database"') == [
        "doc_deep",
        "doc_direct",
    ]
|
||||
|
||||
|
||||
def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
|
||||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
|
||||
|
|
|
|||
|
|
@@ -55,6 +55,44 @@ def test_sqlite_vec_semantic_index_round_trip(tmp_path):
|
|||
assert [item.external_id for item in filtered] == ["doc_b"]
|
||||
|
||||
|
||||
def test_sqlite_vec_semantic_index_file_ref_filter_not_limited_by_global_rank(tmp_path):
    """Check that a ``file_ref`` filter is not cut off by the global ranking.

    Thirty off-scope records are stored with the vector ``[1.0, 0.0]``, which
    is also the query vector; the single in-scope record is stored with
    ``[0.0, 1.0]``. Searching with ``limit=1`` and a ``file_ref`` filter for
    the in-scope record must still return that record.
    """
    index = SQLiteVecSemanticIndex(tmp_path / "semantic.sqlite")
    index.reset(dimension=2, metadata={"field_mode": "summary"})

    def off_scope_record(number):
        # Build one off-scope record whose vector equals the query vector.
        return SemanticIndexRecord(
            file_ref=f"file_off_{number:02d}",
            external_id=f"doc_off_{number:02d}",
            source_type="documents",
            source_path=f"other/{number:02d}.pdf",
            title=f"Off scope {number:02d}",
            text="off scope",
            vector=[1.0, 0.0],
        )

    records = [off_scope_record(number) for number in range(30)]
    # The only record the filter admits; its vector differs from the query.
    records.append(
        SemanticIndexRecord(
            file_ref="file_in_scope",
            external_id="doc_in_scope",
            source_type="documents",
            source_path="documents/in-scope.pdf",
            title="In scope",
            text="in scope",
            vector=[0.0, 1.0],
        )
    )
    index.upsert_many(records)

    query_vector = [1.0, 0.0]
    results = index.search(
        query_vector,
        limit=1,
        filters={"file_ref": ["file_in_scope"]},
    )

    matched_file_refs = [item.file_ref for item in results]
    assert matched_file_refs == ["file_in_scope"]
|
||||
|
||||
|
||||
def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
|
||||
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue