mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-26 21:39:43 +02:00
retrieval: add hybrid search behavior tests
This commit is contained in:
parent
e72b17fbed
commit
852ab3a576
1 changed files with 236 additions and 0 deletions
|
|
@ -0,0 +1,236 @@
|
||||||
|
"""Behavior tests for the hybrid chunk retriever against a real Postgres.
|
||||||
|
|
||||||
|
These exercise ``search_chunks`` through its public surface only: seed real
|
||||||
|
documents/chunks, run a search, and assert on the returned ``DocumentHit``s —
|
||||||
|
never on SQL shape or internal ranking math. ``query_embedding`` is supplied
|
||||||
|
directly (a public parameter) so the semantic leg is deterministic instead of
|
||||||
|
depending on a live embedding model.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from app.agents.chat.multi_agent_chat.shared.retrieval.hybrid_search import (
|
||||||
|
search_chunks,
|
||||||
|
)
|
||||||
|
from app.agents.chat.multi_agent_chat.shared.retrieval.models import SearchScope
|
||||||
|
from app.config import config
|
||||||
|
from app.db import Chunk, Document, DocumentType, SearchSpace
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.integration
|
||||||
|
|
||||||
|
_DIM = config.embedding_model_instance.dimension
|
||||||
|
|
||||||
|
|
||||||
|
def _axis(index: int) -> list[float]:
|
||||||
|
"""A unit vector pointing along one axis — orthogonal axes are dissimilar."""
|
||||||
|
vector = [0.0] * _DIM
|
||||||
|
vector[index] = 1.0
|
||||||
|
return vector
|
||||||
|
|
||||||
|
|
||||||
|
async def _add_document(
|
||||||
|
db_session,
|
||||||
|
*,
|
||||||
|
search_space_id: int,
|
||||||
|
title: str = "Doc",
|
||||||
|
document_type: DocumentType = DocumentType.FILE,
|
||||||
|
state: str = "ready",
|
||||||
|
chunks: list[tuple[str, int, list[float]]],
|
||||||
|
) -> Document:
|
||||||
|
"""Persist one document and its chunks; ``chunks`` is (content, position, embedding)."""
|
||||||
|
document = Document(
|
||||||
|
title=title,
|
||||||
|
document_type=document_type,
|
||||||
|
content="\n".join(content for content, _, _ in chunks),
|
||||||
|
content_hash=uuid.uuid4().hex,
|
||||||
|
search_space_id=search_space_id,
|
||||||
|
status={"state": state},
|
||||||
|
)
|
||||||
|
db_session.add(document)
|
||||||
|
await db_session.flush()
|
||||||
|
for content, position, embedding in chunks:
|
||||||
|
db_session.add(
|
||||||
|
Chunk(
|
||||||
|
content=content,
|
||||||
|
document_id=document.id,
|
||||||
|
position=position,
|
||||||
|
embedding=embedding,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
await db_session.flush()
|
||||||
|
return document
|
||||||
|
|
||||||
|
|
||||||
|
async def test_keyword_relevant_document_is_retrieved(db_session, db_search_space):
|
||||||
|
document = await _add_document(
|
||||||
|
db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
title="Asyncio Guide",
|
||||||
|
chunks=[("The asyncio library enables concurrency.", 0, _axis(0))],
|
||||||
|
)
|
||||||
|
|
||||||
|
results = await search_chunks(
|
||||||
|
db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
query="asyncio",
|
||||||
|
scope=SearchScope(),
|
||||||
|
top_k=5,
|
||||||
|
query_embedding=_axis(99),
|
||||||
|
)
|
||||||
|
|
||||||
|
assert document.id in {hit.document_id for hit in results}
|
||||||
|
|
||||||
|
|
||||||
|
async def test_semantically_closest_document_ranks_first(db_session, db_search_space):
|
||||||
|
aligned = await _add_document(
|
||||||
|
db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
title="Background Work",
|
||||||
|
chunks=[("Parallel execution of background work.", 0, _axis(0))],
|
||||||
|
)
|
||||||
|
await _add_document(
|
||||||
|
db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
title="Dessert",
|
||||||
|
chunks=[("Recipes for chocolate cake.", 0, _axis(1))],
|
||||||
|
)
|
||||||
|
|
||||||
|
results = await search_chunks(
|
||||||
|
db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
query="asynchronous coroutines",
|
||||||
|
scope=SearchScope(),
|
||||||
|
top_k=5,
|
||||||
|
query_embedding=_axis(0),
|
||||||
|
)
|
||||||
|
|
||||||
|
assert results[0].document_id == aligned.id
|
||||||
|
|
||||||
|
|
||||||
|
async def test_results_stay_within_the_search_space(db_session, db_search_space):
|
||||||
|
other_space = SearchSpace(name="Other Space", user_id=db_search_space.user_id)
|
||||||
|
db_session.add(other_space)
|
||||||
|
await db_session.flush()
|
||||||
|
|
||||||
|
mine = await _add_document(
|
||||||
|
db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
chunks=[("Shared keyword asyncio here.", 0, _axis(0))],
|
||||||
|
)
|
||||||
|
foreign = await _add_document(
|
||||||
|
db_session,
|
||||||
|
search_space_id=other_space.id,
|
||||||
|
chunks=[("Shared keyword asyncio here.", 0, _axis(0))],
|
||||||
|
)
|
||||||
|
|
||||||
|
results = await search_chunks(
|
||||||
|
db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
query="asyncio",
|
||||||
|
scope=SearchScope(),
|
||||||
|
top_k=5,
|
||||||
|
query_embedding=_axis(0),
|
||||||
|
)
|
||||||
|
|
||||||
|
found = {hit.document_id for hit in results}
|
||||||
|
assert mine.id in found and foreign.id not in found
|
||||||
|
|
||||||
|
|
||||||
|
async def test_document_ids_scope_pins_results(db_session, db_search_space):
|
||||||
|
pinned = await _add_document(
|
||||||
|
db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
chunks=[("asyncio appears in the pinned doc.", 0, _axis(0))],
|
||||||
|
)
|
||||||
|
await _add_document(
|
||||||
|
db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
chunks=[("asyncio appears in the other doc too.", 0, _axis(0))],
|
||||||
|
)
|
||||||
|
|
||||||
|
results = await search_chunks(
|
||||||
|
db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
query="asyncio",
|
||||||
|
scope=SearchScope(document_ids=(pinned.id,)),
|
||||||
|
top_k=5,
|
||||||
|
query_embedding=_axis(0),
|
||||||
|
)
|
||||||
|
|
||||||
|
assert {hit.document_id for hit in results} == {pinned.id}
|
||||||
|
|
||||||
|
|
||||||
|
async def test_deleting_documents_are_excluded(db_session, db_search_space):
|
||||||
|
ready = await _add_document(
|
||||||
|
db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
chunks=[("asyncio in a ready document.", 0, _axis(0))],
|
||||||
|
)
|
||||||
|
deleting = await _add_document(
|
||||||
|
db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
state="deleting",
|
||||||
|
chunks=[("asyncio in a deleting document.", 0, _axis(0))],
|
||||||
|
)
|
||||||
|
|
||||||
|
results = await search_chunks(
|
||||||
|
db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
query="asyncio",
|
||||||
|
scope=SearchScope(),
|
||||||
|
top_k=5,
|
||||||
|
query_embedding=_axis(0),
|
||||||
|
)
|
||||||
|
|
||||||
|
found = {hit.document_id for hit in results}
|
||||||
|
assert ready.id in found and deleting.id not in found
|
||||||
|
|
||||||
|
|
||||||
|
async def test_matched_chunks_are_ordered_for_reading(db_session, db_search_space):
|
||||||
|
# Insert out of order, and give the later-position chunk the stronger
|
||||||
|
# semantic score, so reading order differs from both insertion and score.
|
||||||
|
document = await _add_document(
|
||||||
|
db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
chunks=[
|
||||||
|
("asyncio paragraph two.", 1, _axis(0)),
|
||||||
|
("asyncio paragraph one.", 0, _axis(50)),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
results = await search_chunks(
|
||||||
|
db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
query="asyncio",
|
||||||
|
scope=SearchScope(),
|
||||||
|
top_k=5,
|
||||||
|
query_embedding=_axis(0),
|
||||||
|
)
|
||||||
|
|
||||||
|
hit = next(hit for hit in results if hit.document_id == document.id)
|
||||||
|
assert [chunk.position for chunk in hit.chunks] == [0, 1]
|
||||||
|
|
||||||
|
|
||||||
|
async def test_top_k_caps_the_number_of_documents(db_session, db_search_space):
|
||||||
|
for index in range(3):
|
||||||
|
await _add_document(
|
||||||
|
db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
title=f"Doc {index}",
|
||||||
|
chunks=[(f"asyncio mentioned in doc {index}.", 0, _axis(index))],
|
||||||
|
)
|
||||||
|
|
||||||
|
results = await search_chunks(
|
||||||
|
db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
query="asyncio",
|
||||||
|
scope=SearchScope(),
|
||||||
|
top_k=2,
|
||||||
|
query_embedding=_axis(0),
|
||||||
|
)
|
||||||
|
|
||||||
|
assert len(results) == 2
|
||||||
Loading…
Add table
Add a link
Reference in a new issue