mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-12 19:55:17 +02:00
Route default semantic search to the summary projection when summary is the only populated semantic channel.
182 lines
6.2 KiB
Python
182 lines
6.2 KiB
Python
import json
|
|
from types import SimpleNamespace
|
|
|
|
|
|
class SummaryBackend:
    """Stub semantic retrieval backend that advertises only the ``summary`` channel.

    Each ``search_channel`` call is appended to ``calls`` so a test can inspect
    the channel, query, and filters it was invoked with.
    """

    def __init__(self, document_id):
        # Entries are (channel, query, filters) tuples in call order.
        self.calls = []
        self.document_id = document_id

    def available_channels(self):
        """Return the one channel this stub exposes."""
        return ("summary",)

    def search_channel(self, channel, query, *, limit=10, filters=None):
        """Record the call and return a single canned hit for ``document_id``.

        ``limit`` is accepted but not used by this stub.
        """
        self.calls.append((channel, query, filters))
        hit = SimpleNamespace(
            document_id=self.document_id,
            snippet=f"summary candidate: {query}",
        )
        return [hit]
|
|
|
|
|
|
def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
    """A ``search-summary`` scoped to ``/documents`` passes empty filters to the backend."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    fs = PageIndexFileSystem(workspace=tmp_path / "workspace")
    registration = {
        "storage_uri": "file:///tmp/report.pdf",
        "source_path": "examples/documents/report.pdf",
        "folder_path": "/documents",
        "external_id": "dsid_report",
        "title": "Annual report",
        "metadata": {"source_type": "examples-documents"},
        "content": "Federal Reserve supervision and regulation annual report.",
    }
    fs.register_file(**registration)

    # Swap in the recording stub so the filters handed to the backend are visible.
    stub = SummaryBackend("dsid_report")
    fs.semantic_retrieval_backend = stub
    runner = PIFSCommandExecutor(fs, json_output=True)

    command = 'search-summary "Federal Reserve annual report" /documents'
    payload = json.loads(runner.execute(command))

    recorded_filters = stub.calls[0][2]
    assert recorded_filters == {}
    first_row = payload["data"]["data"][0]
    assert first_row["external_id"] == "dsid_report"
|
|
|
|
|
|
def test_semantic_search_scope_filters_explicit_source_type_facets():
    """Check which folder paths ``_semantic_filters_for_scope`` maps to a ``source_type`` filter."""
    from pageindex.filesystem import PageIndexFileSystem

    # (folder_path, expected filters) pairs, checked in this order.
    expectations = (
        ("/source_type=google-drive", {"source_type": "google_drive"}),
        ("/semantic/source_type=google-drive", {"source_type": "google_drive"}),
        ("/documents", {}),
    )
    for folder_path, expected in expectations:
        scope = {"folder_path": folder_path}
        assert PageIndexFileSystem._semantic_filters_for_scope(scope) == expected
|
|
|
|
|
|
def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path, monkeypatch):
    """An on-disk summary index makes ``configure_existing_projection_retrieval`` set up retrieval.

    The hybrid configuration hook is replaced with a recorder so the test can
    check the directory and embedding keyword arguments it is called with.
    """
    from pageindex.filesystem import PageIndexFileSystem
    from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex

    workspace = tmp_path / "workspace"
    index_dir = workspace / "artifacts" / "projection_indexes"

    # Seed a summary-only vector index with one record.
    summary_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
    index_metadata = {
        "channel": "summary",
        "embedding_provider": "openai",
        "embedding_model": "test-embedding",
        "embedding_dimensions": 3,
    }
    summary_index.reset(dimension=3, metadata=index_metadata)
    seed_record = SemanticIndexRecord(
        file_ref="file_a",
        external_id="doc_a",
        source_type="documents",
        source_path="documents/a.pdf",
        title="A",
        text="summary",
        vector=[1.0, 0.0, 0.0],
    )
    summary_index.upsert_many([seed_record])

    filesystem = PageIndexFileSystem(workspace)
    recorded = []

    def fake_configure(index_dir_arg, **kwargs):
        # Capture the arguments, then install the stub backend in place of a real one.
        recorded.append((index_dir_arg, kwargs))
        filesystem.semantic_retrieval_backend = SummaryBackend("doc_a")
        return filesystem.semantic_retrieval_backend

    monkeypatch.setattr(filesystem, "configure_hybrid_projection_retrieval", fake_configure)

    configured = filesystem.configure_existing_projection_retrieval()
    assert configured is True

    expected_kwargs = {
        "embedding_provider": "openai",
        "embedding_model": "test-embedding",
        "embedding_dimensions": 3,
        "embedding_timeout": 60,
    }
    assert recorded == [(filesystem.summary_projection_index_dir, expected_kwargs)]
    assert filesystem.semantic_retrieval_channels() == ("summary",)
|
|
|
|
|
|
def test_default_semantic_search_uses_summary_projection_when_only_summary_available(tmp_path):
    """Semantic search finds a document via its summary projection when lexical search finds nothing.

    The document body and the generated summary share no words with the query,
    and the embedder returns the same vector for every text.
    """
    from pageindex.filesystem import PageIndexFileSystem
    from pageindex.filesystem.hybrid_projection import HybridProjectionSearchBackend
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult
    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer

    class FixedEmbedder:
        """Embedder that returns the same 3-d vector for every input text."""

        def embed(self, texts):
            vectors = []
            for _ in texts:
                vectors.append([1.0, 0.0, 0.0])
            return vectors

    class SummaryGenerator:
        """Metadata generator that always yields one fixed summary."""

        def generate(self, document, *, fields):
            generated = {"summary": "vendor renewal risk matrix"}
            return MetadataGenerationResult(values=generated)

    workspace = tmp_path / "workspace"
    source = tmp_path / "source.txt"
    source.write_text("ordinary fixture body", encoding="utf-8")
    index_dir = workspace / "artifacts" / "projection_indexes"

    # Indexer and search backend point at the same directory with matching embedding settings.
    indexer = SummaryProjectionIndexer(
        index_dir,
        embedder=FixedEmbedder(),
        embedding_provider="test",
        embedding_model="fake",
        embedding_dimensions=3,
    )
    backend = HybridProjectionSearchBackend(
        index_dir,
        embedder=FixedEmbedder(),
        embedding_provider="test",
        embedding_model="fake",
        embedding_dimensions=3,
    )
    filesystem = PageIndexFileSystem(
        workspace=workspace,
        metadata_generator=SummaryGenerator(),
        summary_projection_indexer=indexer,
        semantic_retrieval_backend=backend,
    )

    # Only the summary field is enabled in the metadata policy.
    enabled_fields = {
        "summary": True,
        "doc_type": False,
        "domain": False,
        "topic": False,
    }
    filesystem.register_file(
        storage_uri=source.as_uri(),
        source_path="docs/source.txt",
        folder_path="/documents",
        external_id="doc_summary_only",
        title="Operations note",
        content=source.read_text(encoding="utf-8"),
        metadata={"department": "ops"},
        metadata_policy={"fields": enabled_fields},
    )

    query = "purchase order exposure"
    lexical_hits = filesystem.search(query, semantic=False)
    assert lexical_hits == []

    results = filesystem.search(query, semantic=True)
    found_ids = [hit.external_id for hit in results]
    assert found_ids == ["doc_summary_only"]
    assert results[0].snippet == "summary_vector rank=1"
|