fix(filesystem): use summary projection for default semantic search

Route default semantic search to the summary projection when summary is the only populated semantic channel.
This commit is contained in:
Bukely_ 2026-05-26 20:27:29 +08:00 committed by BukeLy
parent 3519adfbd1
commit ad45f96dfa
2 changed files with 67 additions and 0 deletions

View file

@ -130,6 +130,8 @@ class HybridProjectionSearchBackend:
if self._channel_document_count(channel) > 0
)
if not channels:
if self._channel_document_count("summary") > 0:
return self.search_channel("summary", query, limit=limit, filters=filters)
return []
channel_hits = self._search_channels(
query=query,

View file

@ -115,3 +115,68 @@ def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path
)
]
assert filesystem.semantic_retrieval_channels() == ("summary",)
def test_default_semantic_search_uses_summary_projection_when_only_summary_available(tmp_path):
from pageindex.filesystem import PageIndexFileSystem
from pageindex.filesystem.hybrid_projection import HybridProjectionSearchBackend
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
class FixedEmbedder:
def embed(self, texts):
return [[1.0, 0.0, 0.0] for _ in texts]
class SummaryGenerator:
def generate(self, document, *, fields):
return MetadataGenerationResult(
values={"summary": "vendor renewal risk matrix"}
)
source = tmp_path / "source.txt"
source.write_text("ordinary fixture body", encoding="utf-8")
index_dir = tmp_path / "workspace" / "artifacts" / "projection_indexes"
indexer = SummaryProjectionIndexer(
index_dir,
embedder=FixedEmbedder(),
embedding_provider="test",
embedding_model="fake",
embedding_dimensions=3,
)
backend = HybridProjectionSearchBackend(
index_dir,
embedder=FixedEmbedder(),
embedding_provider="test",
embedding_model="fake",
embedding_dimensions=3,
)
filesystem = PageIndexFileSystem(
workspace=tmp_path / "workspace",
metadata_generator=SummaryGenerator(),
summary_projection_indexer=indexer,
semantic_retrieval_backend=backend,
)
filesystem.register_file(
storage_uri=source.as_uri(),
source_path="docs/source.txt",
folder_path="/documents",
external_id="doc_summary_only",
title="Operations note",
content=source.read_text(encoding="utf-8"),
metadata={"department": "ops"},
metadata_policy={
"fields": {
"summary": True,
"doc_type": False,
"domain": False,
"topic": False,
}
},
)
assert filesystem.search("purchase order exposure", semantic=False) == []
results = filesystem.search("purchase order exposure", semantic=True)
assert [result.external_id for result in results] == ["doc_summary_only"]
assert results[0].snippet == "summary_vector rank=1"