mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-26 17:26:23 +02:00
Merge remote-tracking branch 'upstream/dev' into feat/onedrive-connector
This commit is contained in:
commit
5a3eece397
70 changed files with 8288 additions and 5698 deletions
|
|
@ -14,7 +14,9 @@ _EMBEDDING_DIM = app_config.embedding_model_instance.dimension
|
|||
pytestmark = pytest.mark.integration
|
||||
|
||||
|
||||
def _cal_doc(*, unique_id: str, search_space_id: int, connector_id: int, user_id: str) -> ConnectorDocument:
|
||||
def _cal_doc(
|
||||
*, unique_id: str, search_space_id: int, connector_id: int, user_id: str
|
||||
) -> ConnectorDocument:
|
||||
return ConnectorDocument(
|
||||
title=f"Event {unique_id}",
|
||||
source_markdown=f"## Calendar Event\n\nDetails for {unique_id}",
|
||||
|
|
@ -34,7 +36,9 @@ def _cal_doc(*, unique_id: str, search_space_id: int, connector_id: int, user_id
|
|||
)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_texts", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
||||
)
|
||||
async def test_calendar_pipeline_creates_ready_document(
|
||||
db_session, db_search_space, db_connector, db_user, mocker
|
||||
):
|
||||
|
|
@ -63,7 +67,9 @@ async def test_calendar_pipeline_creates_ready_document(
|
|||
assert DocumentStatus.is_state(row.status, DocumentStatus.READY)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_texts", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
||||
)
|
||||
async def test_calendar_legacy_doc_migrated(
|
||||
db_session, db_search_space, db_connector, db_user, mocker
|
||||
):
|
||||
|
|
@ -101,7 +107,9 @@ async def test_calendar_legacy_doc_migrated(
|
|||
service = IndexingPipelineService(session=db_session)
|
||||
await service.migrate_legacy_docs([connector_doc])
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.id == original_id))
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.id == original_id)
|
||||
)
|
||||
row = result.scalars().first()
|
||||
|
||||
assert row.document_type == DocumentType.GOOGLE_CALENDAR_CONNECTOR
|
||||
|
|
|
|||
|
|
@ -14,7 +14,9 @@ _EMBEDDING_DIM = app_config.embedding_model_instance.dimension
|
|||
pytestmark = pytest.mark.integration
|
||||
|
||||
|
||||
def _drive_doc(*, unique_id: str, search_space_id: int, connector_id: int, user_id: str) -> ConnectorDocument:
|
||||
def _drive_doc(
|
||||
*, unique_id: str, search_space_id: int, connector_id: int, user_id: str
|
||||
) -> ConnectorDocument:
|
||||
return ConnectorDocument(
|
||||
title=f"File {unique_id}.pdf",
|
||||
source_markdown=f"## Document Content\n\nText from file {unique_id}",
|
||||
|
|
@ -33,7 +35,9 @@ def _drive_doc(*, unique_id: str, search_space_id: int, connector_id: int, user_
|
|||
)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_texts", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
||||
)
|
||||
async def test_drive_pipeline_creates_ready_document(
|
||||
db_session, db_search_space, db_connector, db_user, mocker
|
||||
):
|
||||
|
|
@ -62,7 +66,9 @@ async def test_drive_pipeline_creates_ready_document(
|
|||
assert DocumentStatus.is_state(row.status, DocumentStatus.READY)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_texts", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
||||
)
|
||||
async def test_drive_legacy_doc_migrated(
|
||||
db_session, db_search_space, db_connector, db_user, mocker
|
||||
):
|
||||
|
|
@ -100,7 +106,9 @@ async def test_drive_legacy_doc_migrated(
|
|||
service = IndexingPipelineService(session=db_session)
|
||||
await service.migrate_legacy_docs([connector_doc])
|
||||
|
||||
result = await db_session.execute(select(Document).filter(Document.id == original_id))
|
||||
result = await db_session.execute(
|
||||
select(Document).filter(Document.id == original_id)
|
||||
)
|
||||
row = result.scalars().first()
|
||||
|
||||
assert row.document_type == DocumentType.GOOGLE_DRIVE_FILE
|
||||
|
|
@ -111,7 +119,9 @@ async def test_drive_legacy_doc_migrated(
|
|||
|
||||
|
||||
async def test_should_skip_file_skips_failed_document(
|
||||
db_session, db_search_space, db_user,
|
||||
db_session,
|
||||
db_search_space,
|
||||
db_user,
|
||||
):
|
||||
"""A FAILED document with unchanged md5 must be skipped — user can manually retry via Quick Index."""
|
||||
import importlib
|
||||
|
|
@ -162,7 +172,12 @@ async def test_should_skip_file_skips_failed_document(
|
|||
db_session.add(failed_doc)
|
||||
await db_session.flush()
|
||||
|
||||
incoming_file = {"id": file_id, "name": "Failed File.pdf", "mimeType": "application/pdf", "md5Checksum": md5}
|
||||
incoming_file = {
|
||||
"id": file_id,
|
||||
"name": "Failed File.pdf",
|
||||
"mimeType": "application/pdf",
|
||||
"md5Checksum": md5,
|
||||
}
|
||||
|
||||
should_skip, msg = await _should_skip_file(db_session, incoming_file, space_id)
|
||||
|
||||
|
|
|
|||
|
|
@ -8,7 +8,6 @@ from app.db import Document, DocumentStatus, DocumentType
|
|||
from app.indexing_pipeline.connector_document import ConnectorDocument
|
||||
from app.indexing_pipeline.document_hashing import (
|
||||
compute_identifier_hash,
|
||||
compute_unique_identifier_hash,
|
||||
)
|
||||
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
|
||||
|
||||
|
|
@ -17,7 +16,9 @@ _EMBEDDING_DIM = app_config.embedding_model_instance.dimension
|
|||
pytestmark = pytest.mark.integration
|
||||
|
||||
|
||||
def _gmail_doc(*, unique_id: str, search_space_id: int, connector_id: int, user_id: str) -> ConnectorDocument:
|
||||
def _gmail_doc(
|
||||
*, unique_id: str, search_space_id: int, connector_id: int, user_id: str
|
||||
) -> ConnectorDocument:
|
||||
"""Build a Gmail-style ConnectorDocument like the real indexer does."""
|
||||
return ConnectorDocument(
|
||||
title=f"Subject for {unique_id}",
|
||||
|
|
@ -37,7 +38,9 @@ def _gmail_doc(*, unique_id: str, search_space_id: int, connector_id: int, user_
|
|||
)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_texts", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
||||
)
|
||||
async def test_gmail_pipeline_creates_ready_document(
|
||||
db_session, db_search_space, db_connector, db_user, mocker
|
||||
):
|
||||
|
|
@ -67,7 +70,9 @@ async def test_gmail_pipeline_creates_ready_document(
|
|||
assert row.source_markdown == doc.source_markdown
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_texts", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
||||
)
|
||||
async def test_gmail_legacy_doc_migrated_then_reused(
|
||||
db_session, db_search_space, db_connector, db_user, mocker
|
||||
):
|
||||
|
|
|
|||
|
|
@ -9,7 +9,9 @@ from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineServ
|
|||
pytestmark = pytest.mark.integration
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_texts", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
||||
)
|
||||
async def test_index_batch_creates_ready_documents(
|
||||
db_session, db_search_space, make_connector_document, mocker
|
||||
):
|
||||
|
|
@ -47,7 +49,9 @@ async def test_index_batch_creates_ready_documents(
|
|||
assert row.embedding is not None
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_texts", "patched_chunk_text")
|
||||
@pytest.mark.usefixtures(
|
||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
||||
)
|
||||
async def test_index_batch_empty_returns_empty(db_session, mocker):
|
||||
"""index_batch with empty input returns an empty list."""
|
||||
service = IndexingPipelineService(session=db_session)
|
||||
|
|
|
|||
106
surfsense_backend/tests/integration/retriever/conftest.py
Normal file
106
surfsense_backend/tests/integration/retriever/conftest.py
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
"""Shared fixtures for retriever integration tests."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from datetime import UTC, datetime
|
||||
|
||||
import pytest_asyncio
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config as app_config
|
||||
from app.db import Chunk, Document, DocumentType, SearchSpace, User
|
||||
|
||||
EMBEDDING_DIM = app_config.embedding_model_instance.dimension
|
||||
DUMMY_EMBEDDING = [0.1] * EMBEDDING_DIM
|
||||
|
||||
|
||||
def _make_document(
    *,
    title: str,
    document_type: DocumentType,
    content: str,
    search_space_id: int,
    created_by_id: str,
) -> Document:
    """Build an unsaved ``Document`` in the ``ready`` state for retriever tests.

    ``content`` is stored both as the document content and as its source
    markdown. Both hash columns share one random 12-character hex suffix
    generated on each call, and the embedding is the module-level
    ``DUMMY_EMBEDDING``.
    """
    suffix = uuid.uuid4().hex[:12]
    fields = {
        "title": title,
        "document_type": document_type,
        "content": content,
        "content_hash": f"content-{suffix}",
        "unique_identifier_hash": f"uid-{suffix}",
        "source_markdown": content,
        "search_space_id": search_space_id,
        "created_by_id": created_by_id,
        "embedding": DUMMY_EMBEDDING,
        "updated_at": datetime.now(UTC),
        "status": {"state": "ready"},
    }
    return Document(**fields)
|
||||
|
||||
|
||||
def _make_chunk(*, content: str, document_id: int) -> Chunk:
    """Build an unsaved ``Chunk`` for ``document_id`` carrying ``DUMMY_EMBEDDING``."""
    chunk_fields = {
        "content": content,
        "document_id": document_id,
        "embedding": DUMMY_EMBEDDING,
    }
    return Chunk(**chunk_fields)
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def seed_large_doc(
    db_session: AsyncSession, db_user: User, db_search_space: SearchSpace
):
    """Seed one 35-chunk document and one single-chunk document.

    The large document is meant to hold more chunks than the retrievers'
    per-document fetch cap ``_MAX_FETCH_CHUNKS_PER_DOC`` (noted as 20 when
    this fixture was written -- confirm against the retriever modules), so
    tests can check that the cap is applied. The small document gives a
    search a second document to return.

    Rows are flushed but never committed by this fixture.

    Returns a dict with ``large_doc``, ``small_doc``, ``large_chunk_ids``
    (all 35 chunk IDs), ``small_chunk_ids``, ``search_space`` and ``user``.
    """
    user_id = str(db_user.id)
    space_id = db_search_space.id

    large_doc = _make_document(
        title="Large PDF Document",
        document_type=DocumentType.FILE,
        content="large document about quarterly performance reviews and budgets",
        search_space_id=space_id,
        created_by_id=user_id,
    )
    small_doc = _make_document(
        title="Small Note",
        document_type=DocumentType.NOTE,
        content="quarterly performance review summary note",
        search_space_id=space_id,
        created_by_id=user_id,
    )

    # Flush the documents first: the chunks below reference their ids.
    db_session.add_all([large_doc, small_doc])
    await db_session.flush()

    large_chunks = [
        _make_chunk(
            content=f"chunk {i} about quarterly performance review section {i}",
            document_id=large_doc.id,
        )
        for i in range(35)
    ]
    small_chunks = [
        _make_chunk(
            content="quarterly performance review summary note content",
            document_id=small_doc.id,
        ),
    ]

    # Second flush so the chunk ids returned below are populated.
    db_session.add_all(large_chunks + small_chunks)
    await db_session.flush()

    return {
        "large_doc": large_doc,
        "small_doc": small_doc,
        "large_chunk_ids": [c.id for c in large_chunks],
        "small_chunk_ids": [c.id for c in small_chunks],
        "search_space": db_search_space,
        "user": db_user,
    }
|
||||
|
|
@ -0,0 +1,116 @@
|
|||
"""Integration tests for optimized ChucksHybridSearchRetriever.
|
||||
|
||||
Verifies the SQL ROW_NUMBER per-doc chunk limit, column pruning,
|
||||
and doc metadata caching from RRF results.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from app.retriever.chunks_hybrid_search import (
|
||||
_MAX_FETCH_CHUNKS_PER_DOC,
|
||||
ChucksHybridSearchRetriever,
|
||||
)
|
||||
|
||||
from .conftest import DUMMY_EMBEDDING
|
||||
|
||||
pytestmark = pytest.mark.integration
|
||||
|
||||
|
||||
async def test_per_doc_chunk_limit_respected(db_session, seed_large_doc):
    """The large seeded document must come back with exactly _MAX_FETCH_CHUNKS_PER_DOC chunks.

    The fixture seeds more chunks than the cap, so an exact match shows both
    that the cap is enforced and that the retriever fills it.
    """
    space_id = seed_large_doc["search_space"].id

    retriever = ChucksHybridSearchRetriever(db_session)
    results = await retriever.hybrid_search(
        query_text="quarterly performance review",
        top_k=10,
        search_space_id=space_id,
        query_embedding=DUMMY_EMBEDDING,
    )

    large_doc_id = seed_large_doc["large_doc"].id
    large_result = next(
        (r for r in results if r["document"].get("id") == large_doc_id),
        None,
    )
    if large_result is None:
        pytest.fail("Large doc not found in search results")

    # Guard the premise: if the cap is ever raised above the number of seeded
    # chunks, fail with a message that points at the fixture, not the retriever.
    seeded = len(seed_large_doc["large_chunk_ids"])
    assert seeded >= _MAX_FETCH_CHUNKS_PER_DOC, (
        f"fixture seeds only {seeded} chunks; it must seed at least "
        f"_MAX_FETCH_CHUNKS_PER_DOC={_MAX_FETCH_CHUNKS_PER_DOC} to exercise the cap"
    )

    returned = len(large_result["chunks"])
    assert returned == _MAX_FETCH_CHUNKS_PER_DOC, (
        f"expected {_MAX_FETCH_CHUNKS_PER_DOC} chunks for the large doc, got {returned}"
    )
|
||||
|
||||
|
||||
async def test_doc_metadata_populated_from_rrf(db_session, seed_large_doc):
    """Every hit must carry an id, a non-empty title and a non-None document type.

    Per the original author, this metadata is expected to be present even
    without joinedload.
    """
    search_kwargs = {
        "query_text": "quarterly performance review",
        "top_k": 10,
        "search_space_id": seed_large_doc["search_space"].id,
        "query_embedding": DUMMY_EMBEDDING,
    }
    hits = await ChucksHybridSearchRetriever(db_session).hybrid_search(**search_kwargs)

    assert len(hits) >= 1
    for document in (hit["document"] for hit in hits):
        assert "id" in document
        assert "title" in document
        assert document["title"]
        assert "document_type" in document
        assert document["document_type"] is not None
|
||||
|
||||
|
||||
async def test_matched_chunk_ids_tracked(db_session, seed_large_doc):
    """Each id listed under ``matched_chunk_ids`` must also appear among that hit's chunks.

    A hit without a ``matched_chunk_ids`` key is treated as having none.
    """
    search_kwargs = {
        "query_text": "quarterly performance review",
        "top_k": 10,
        "search_space_id": seed_large_doc["search_space"].id,
        "query_embedding": DUMMY_EMBEDDING,
    }
    hits = await ChucksHybridSearchRetriever(db_session).hybrid_search(**search_kwargs)

    for hit in hits:
        matched_ids = hit.get("matched_chunk_ids", [])
        returned_ids = {chunk["chunk_id"] for chunk in hit["chunks"]}
        for mid in matched_ids:
            assert mid in returned_ids, f"matched_chunk_id {mid} not found in chunks"
|
||||
|
||||
|
||||
async def test_chunks_ordered_by_id(db_session, seed_large_doc):
    """Within every hit, chunks must be listed in ascending ``chunk_id`` order."""
    search_kwargs = {
        "query_text": "quarterly performance review",
        "top_k": 10,
        "search_space_id": seed_large_doc["search_space"].id,
        "query_embedding": DUMMY_EMBEDDING,
    }
    hits = await ChucksHybridSearchRetriever(db_session).hybrid_search(**search_kwargs)

    for hit in hits:
        ids_as_returned = [chunk["chunk_id"] for chunk in hit["chunks"]]
        ids_ascending = sorted(ids_as_returned)
        assert ids_as_returned == ids_ascending, "Chunks not ordered by ID"
|
||||
|
||||
|
||||
async def test_score_is_positive_float(db_session, seed_large_doc):
    """The search must return at least one hit, and every hit's score is a float above zero."""
    search_kwargs = {
        "query_text": "quarterly performance review",
        "top_k": 10,
        "search_space_id": seed_large_doc["search_space"].id,
        "query_embedding": DUMMY_EMBEDDING,
    }
    hits = await ChucksHybridSearchRetriever(db_session).hybrid_search(**search_kwargs)

    assert len(hits) >= 1
    for hit in hits:
        score = hit["score"]
        assert isinstance(score, float)
        assert score > 0
|
||||
|
|
@ -0,0 +1,76 @@
|
|||
"""Integration tests for optimized DocumentHybridSearchRetriever.
|
||||
|
||||
Verifies the SQL ROW_NUMBER per-doc chunk limit and column pruning.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from app.retriever.documents_hybrid_search import (
|
||||
_MAX_FETCH_CHUNKS_PER_DOC,
|
||||
DocumentHybridSearchRetriever,
|
||||
)
|
||||
|
||||
from .conftest import DUMMY_EMBEDDING
|
||||
|
||||
pytestmark = pytest.mark.integration
|
||||
|
||||
|
||||
async def test_per_doc_chunk_limit_respected(db_session, seed_large_doc):
    """The large seeded document must come back with exactly _MAX_FETCH_CHUNKS_PER_DOC chunks.

    The fixture seeds more chunks than the cap, so an exact match shows both
    that the cap is enforced and that the retriever fills it.
    """
    space_id = seed_large_doc["search_space"].id

    retriever = DocumentHybridSearchRetriever(db_session)
    results = await retriever.hybrid_search(
        query_text="quarterly performance review",
        top_k=10,
        search_space_id=space_id,
        query_embedding=DUMMY_EMBEDDING,
    )

    large_doc_id = seed_large_doc["large_doc"].id
    large_result = next(
        (r for r in results if r["document"].get("id") == large_doc_id),
        None,
    )
    if large_result is None:
        pytest.fail("Large doc not found in search results")

    # Guard the premise: if the cap is ever raised above the number of seeded
    # chunks, fail with a message that points at the fixture, not the retriever.
    seeded = len(seed_large_doc["large_chunk_ids"])
    assert seeded >= _MAX_FETCH_CHUNKS_PER_DOC, (
        f"fixture seeds only {seeded} chunks; it must seed at least "
        f"_MAX_FETCH_CHUNKS_PER_DOC={_MAX_FETCH_CHUNKS_PER_DOC} to exercise the cap"
    )

    returned = len(large_result["chunks"])
    assert returned == _MAX_FETCH_CHUNKS_PER_DOC, (
        f"expected {_MAX_FETCH_CHUNKS_PER_DOC} chunks for the large doc, got {returned}"
    )
|
||||
|
||||
|
||||
async def test_doc_metadata_populated(db_session, seed_large_doc):
    """Every hit must carry an id, a non-empty title and a non-None document type."""
    search_kwargs = {
        "query_text": "quarterly performance review",
        "top_k": 10,
        "search_space_id": seed_large_doc["search_space"].id,
        "query_embedding": DUMMY_EMBEDDING,
    }
    hits = await DocumentHybridSearchRetriever(db_session).hybrid_search(**search_kwargs)

    assert len(hits) >= 1
    for document in (hit["document"] for hit in hits):
        assert "id" in document
        assert "title" in document
        assert document["title"]
        assert "document_type" in document
        assert document["document_type"] is not None
|
||||
|
||||
|
||||
async def test_chunks_ordered_by_id(db_session, seed_large_doc):
    """Within every hit, chunks must be listed in ascending ``chunk_id`` order."""
    search_kwargs = {
        "query_text": "quarterly performance review",
        "top_k": 10,
        "search_space_id": seed_large_doc["search_space"].id,
        "query_embedding": DUMMY_EMBEDDING,
    }
    hits = await DocumentHybridSearchRetriever(db_session).hybrid_search(**search_kwargs)

    for hit in hits:
        ids_as_returned = [chunk["chunk_id"] for chunk in hit["chunks"]]
        ids_ascending = sorted(ids_as_returned)
        assert ids_as_returned == ids_ascending, "Chunks not ordered by ID"
|
||||
Loading…
Add table
Add a link
Reference in a new issue