mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-26 09:16:22 +02:00
feat: add integration tests for indexing pipeline components
- Introduced integration tests for Calendar, Drive, and Gmail indexers to ensure proper document creation and migration.
- Added tests for batch indexing functionality to validate the processing of multiple documents.
- Implemented tests for legacy document migration to verify updates to document types and hashes.
- Enhanced test coverage for the IndexingPipelineService to ensure robust functionality across various document types.
This commit is contained in:
parent
f7b52470eb
commit
8c41fd91ba
7 changed files with 693 additions and 0 deletions
|
|
@ -0,0 +1,82 @@
|
|||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from app.db import Document, DocumentType
|
||||
from app.indexing_pipeline.document_hashing import compute_unique_identifier_hash
|
||||
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
@pytest.fixture
def mock_session():
    """Provide an ``AsyncMock`` that stands in for the database session."""
    session = AsyncMock()
    return session
|
||||
|
||||
|
||||
@pytest.fixture
def pipeline(mock_session):
    """Build an ``IndexingPipelineService`` on top of the mocked session."""
    service = IndexingPipelineService(mock_session)
    return service
|
||||
|
||||
|
||||
async def test_calls_prepare_then_index_per_document(
    pipeline, make_connector_document
):
    """index_batch calls prepare_for_indexing, then index() for each returned doc.

    Besides counting the awaits, the test records which connector document
    each ORM document was indexed with, so a pipeline that indexes the right
    number of documents but pairs them incorrectly is caught.
    """
    doc1 = make_connector_document(
        document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
        unique_id="msg-1",
        search_space_id=1,
    )
    doc2 = make_connector_document(
        document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
        unique_id="msg-2",
        search_space_id=1,
    )

    # ORM documents carry the hash of the connector document they belong to.
    orm1 = MagicMock(spec=Document)
    orm1.unique_identifier_hash = compute_unique_identifier_hash(doc1)
    orm2 = MagicMock(spec=Document)
    orm2.unique_identifier_hash = compute_unique_identifier_hash(doc2)

    mock_llm = MagicMock()

    indexed_pairs = []

    def _record_and_return(doc, cdoc, llm):
        # Stub for index(): remember the pairing and echo the document back.
        indexed_pairs.append((doc, cdoc))
        return doc

    pipeline.prepare_for_indexing = AsyncMock(return_value=[orm1, orm2])
    pipeline.index = AsyncMock(side_effect=_record_and_return)

    results = await pipeline.index_batch([doc1, doc2], mock_llm)

    pipeline.prepare_for_indexing.assert_awaited_once_with([doc1, doc2])
    assert pipeline.index.await_count == 2
    # Each ORM document must be indexed with its own connector document.
    assert indexed_pairs == [(orm1, doc1), (orm2, doc2)]
    assert results == [orm1, orm2]
|
||||
|
||||
|
||||
async def test_empty_input_returns_empty(pipeline):
    """With no connector documents, index_batch yields an empty list."""
    llm_stub = MagicMock()
    pipeline.prepare_for_indexing = AsyncMock(return_value=[])

    outcome = await pipeline.index_batch([], llm_stub)

    assert outcome == []
|
||||
|
||||
|
||||
async def test_skips_document_without_matching_connector_doc(
    pipeline, make_connector_document
):
    """A prepared doc whose hash matches no ConnectorDocument is not indexed."""
    connector_doc = make_connector_document(
        document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
        unique_id="msg-1",
        search_space_id=1,
    )

    # The only prepared ORM document carries a hash unrelated to connector_doc.
    unmatched = MagicMock(spec=Document)
    unmatched.unique_identifier_hash = "nonexistent-hash"

    pipeline.index = AsyncMock()
    pipeline.prepare_for_indexing = AsyncMock(return_value=[unmatched])

    llm_stub = MagicMock()
    outcome = await pipeline.index_batch([connector_doc], llm_stub)

    assert outcome == []
    pipeline.index.assert_not_awaited()
|
||||
|
|
@ -0,0 +1,127 @@
|
|||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from app.db import Document, DocumentType
|
||||
from app.indexing_pipeline.document_hashing import compute_identifier_hash
|
||||
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
@pytest.fixture
def mock_session():
    """Provide an ``AsyncMock`` that stands in for the database session."""
    return AsyncMock()
|
||||
|
||||
|
||||
@pytest.fixture
def pipeline(mock_session):
    """Build an ``IndexingPipelineService`` on top of the mocked session."""
    service = IndexingPipelineService(mock_session)
    return service
|
||||
|
||||
|
||||
def _make_execute_side_effect(doc_by_hash: dict):
    """Return a side_effect for session.execute that resolves documents by hash.

    The returned coroutine function renders the statement it receives to SQL
    with literal binds and looks for any key of ``doc_by_hash`` in that text.
    The value of the first matching key (in dict order) is exposed through
    ``result.scalars().first()``; when nothing matches, ``first()`` returns
    ``None``.
    """

    async def _side_effect(stmt):
        result = MagicMock()
        match = None
        if doc_by_hash:
            # Render once: the SQL text does not depend on the candidate hash.
            rendered_sql = str(stmt.compile(compile_kwargs={"literal_binds": True}))
            match = next(
                (doc for h, doc in doc_by_hash.items() if h in rendered_sql),
                None,
            )
        result.scalars.return_value.first.return_value = match
        return result

    return _side_effect
|
||||
|
||||
|
||||
async def test_updates_hash_and_type_for_legacy_document(
    pipeline, mock_session, make_connector_document
):
    """A legacy Composio row is rewritten to the native hash and document type."""
    connector_doc = make_connector_document(
        document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
        unique_id="msg-abc",
        search_space_id=1,
    )

    # Hashes of the same message under the native and the legacy type name.
    native_hash = compute_identifier_hash("GOOGLE_GMAIL_CONNECTOR", "msg-abc", 1)
    legacy_hash = compute_identifier_hash("COMPOSIO_GMAIL_CONNECTOR", "msg-abc", 1)

    legacy_row = MagicMock(spec=Document)
    legacy_row.document_type = DocumentType.COMPOSIO_GMAIL_CONNECTOR
    legacy_row.unique_identifier_hash = legacy_hash

    # session.execute(...).scalars().first() hands back the legacy row.
    query_result = MagicMock()
    query_result.scalars.return_value.first.return_value = legacy_row
    mock_session.execute = AsyncMock(return_value=query_result)

    await pipeline.migrate_legacy_docs([connector_doc])

    assert legacy_row.unique_identifier_hash == native_hash
    assert legacy_row.document_type == DocumentType.GOOGLE_GMAIL_CONNECTOR
    mock_session.commit.assert_awaited_once()
|
||||
|
||||
|
||||
async def test_noop_when_no_legacy_document_exists(
    pipeline, mock_session, make_connector_document
):
    """Migration still commits once when the lookup finds no legacy row."""
    connector_doc = make_connector_document(
        document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
        unique_id="msg-xyz",
        search_space_id=1,
    )

    # session.execute(...).scalars().first() reports that nothing was found.
    query_result = MagicMock()
    query_result.scalars.return_value.first.return_value = None
    mock_session.execute = AsyncMock(return_value=query_result)

    await pipeline.migrate_legacy_docs([connector_doc])

    mock_session.commit.assert_awaited_once()
|
||||
|
||||
|
||||
async def test_skips_non_google_doc_types(
    pipeline, mock_session, make_connector_document
):
    """A Slack document triggers no DB query; the commit still happens once."""
    slack_doc = make_connector_document(
        document_type=DocumentType.SLACK_CONNECTOR,
        unique_id="slack-123",
        search_space_id=1,
    )
    batch = [slack_doc]

    await pipeline.migrate_legacy_docs(batch)

    mock_session.execute.assert_not_awaited()
    mock_session.commit.assert_awaited_once()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("native_type", "expected_legacy"),
    [
        (DocumentType.GOOGLE_GMAIL_CONNECTOR, "COMPOSIO_GMAIL_CONNECTOR"),
        (DocumentType.GOOGLE_CALENDAR_CONNECTOR, "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR"),
        (DocumentType.GOOGLE_DRIVE_FILE, "COMPOSIO_GOOGLE_DRIVE_CONNECTOR"),
    ],
)
async def test_handles_all_three_google_types(
    pipeline, mock_session, make_connector_document, native_type, expected_legacy
):
    """Each native Google type correctly maps to its Composio legacy type.

    Parametrized so every mapping runs as its own test case with fresh
    fixtures: a failure for one type no longer hides the others, and no
    mock has to be reset by hand between mappings.
    """
    doc = make_connector_document(
        document_type=native_type,
        unique_id="id-1",
        search_space_id=1,
    )

    # The row found in the DB still carries the legacy Composio type.
    existing = MagicMock(spec=Document)
    existing.document_type = DocumentType(expected_legacy)

    result_mock = MagicMock()
    result_mock.scalars.return_value.first.return_value = existing
    mock_session.execute = AsyncMock(return_value=result_mock)

    await pipeline.migrate_legacy_docs([doc])

    assert existing.document_type == native_type
    # Same commit expectation as the other migrate_legacy_docs tests.
    mock_session.commit.assert_awaited_once()
|
||||
Loading…
Add table
Add a link
Reference in a new issue