diff --git a/surfsense_backend/tests/e2e/fakes/composio_module.py b/surfsense_backend/tests/e2e/fakes/composio_module.py index 38c4d4c46..16a93d0f1 100644 --- a/surfsense_backend/tests/e2e/fakes/composio_module.py +++ b/surfsense_backend/tests/e2e/fakes/composio_module.py @@ -350,6 +350,25 @@ def _drive_list_files(args: dict[str, Any]) -> dict[str, Any]: folder id and serve the matching fixture list. """ q = args.get("q", "") + if "in owners" in q: + return { + "data": { + "files": [ + { + "id": "fake-file-owner-probe", + "name": "owner-probe", + "owners": [ + { + "me": True, + "emailAddress": "e2e-fake@surfsense.example", + } + ], + } + ], + "nextPageToken": None, + } + } + folder_id = "root" if "in parents" in q: # q looks like: '' in parents and trashed = false ... diff --git a/surfsense_backend/tests/integration/composio/conftest.py b/surfsense_backend/tests/integration/composio/conftest.py index 779e7bdb2..44d707ec3 100644 --- a/surfsense_backend/tests/integration/composio/conftest.py +++ b/surfsense_backend/tests/integration/composio/conftest.py @@ -1,13 +1,11 @@ """Composio route integration fixtures. -The sys.modules hijack happens at module import time, before importing -app.app, so production `from composio import Composio` bindings resolve to -the strict E2E fake in this pytest process too. +The `composio` sys.modules hijack lives in the parent integration conftest +so it runs before any sibling suite imports `app.routes`. """ from __future__ import annotations -import sys from collections.abc import AsyncGenerator import httpx @@ -16,19 +14,15 @@ import pytest_asyncio from httpx import ASGITransport from sqlalchemy.ext.asyncio import AsyncSession -from tests.e2e.fakes import composio_module as _fake_composio - -sys.modules["composio"] = _fake_composio - -from app.app import app, limiter # noqa: E402 -from app.config import config # noqa: E402 -from app.db import ( # noqa: E402 +from app.app import app, limiter +from app.config import config +from app.db import ( SearchSourceConnector, SearchSourceConnectorType, User, get_async_session, ) -from app.users import current_active_user # noqa: E402 +from app.users import current_active_user pytestmark = pytest.mark.integration diff --git a/surfsense_backend/tests/integration/conftest.py b/surfsense_backend/tests/integration/conftest.py index d9d7cacae..e03101e63 100644 --- a/surfsense_backend/tests/integration/conftest.py +++ b/surfsense_backend/tests/integration/conftest.py @@ -1,3 +1,5 @@ +import importlib +import sys import uuid from unittest.mock import AsyncMock, MagicMock @@ -7,17 +9,27 @@ from sqlalchemy import text from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine from sqlalchemy.pool import NullPool -from app.config import config as app_config -from app.db import ( - Base, - DocumentType, - SearchSourceConnector, - SearchSourceConnectorType, - SearchSpace, - User, -) -from app.indexing_pipeline.connector_document import ConnectorDocument -from tests.conftest import TEST_DATABASE_URL +# Hijack `composio` before any `from app.*` import; the `from composio import +# Composio` in app.services.composio_service binds once at first import. +from tests.e2e.fakes import composio_module as _fake_composio + +sys.modules["composio"] = _fake_composio + +app_config = importlib.import_module("app.config").config +app_db = importlib.import_module("app.db") +Base = app_db.Base +DocumentType = app_db.DocumentType +SearchSourceConnector = app_db.SearchSourceConnector +SearchSourceConnectorType = app_db.SearchSourceConnectorType +SearchSpace = app_db.SearchSpace +User = app_db.User +ConnectorDocument = importlib.import_module( + "app.indexing_pipeline.connector_document" +).ConnectorDocument +create_default_roles_and_membership = importlib.import_module( + "app.routes.search_spaces_routes" +).create_default_roles_and_membership +TEST_DATABASE_URL = importlib.import_module("tests.conftest").TEST_DATABASE_URL _EMBEDDING_DIM = app_config.embedding_model_instance.dimension @@ -105,6 +117,9 @@ async def db_search_space(db_session: AsyncSession, db_user: User) -> SearchSpac ) db_session.add(space) await db_session.flush() + # Mirror POST /searchspaces so routes guarded by check_permission find a membership. + await create_default_roles_and_membership(db_session, space.id, db_user.id) + await db_session.flush() return space @@ -145,6 +160,10 @@ def patched_chunk_text(monkeypatch) -> MagicMock: "app.indexing_pipeline.indexing_pipeline_service.chunk_text", mock, ) + monkeypatch.setattr( + "app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid", + mock, + ) return mock diff --git a/surfsense_backend/tests/integration/document_upload/test_stripe_page_purchases.py b/surfsense_backend/tests/integration/document_upload/test_stripe_page_purchases.py index 1c8f7f990..143c9e252 100644 --- a/surfsense_backend/tests/integration/document_upload/test_stripe_page_purchases.py +++ b/surfsense_backend/tests/integration/document_upload/test_stripe_page_purchases.py @@ -204,6 +204,7 @@ class TestStripeCheckoutSessionCreation: assert ( fake_client.last_params["success_url"] == f"http://localhost:3000/dashboard/{search_space_id}/purchase-success" + "?session_id={CHECKOUT_SESSION_ID}" ) assert ( fake_client.last_params["cancel_url"] diff --git a/surfsense_backend/tests/integration/google_unification/test_calendar_indexer_credentials.py b/surfsense_backend/tests/integration/google_unification/test_calendar_indexer_credentials.py index 795f0d564..44ff5c48a 100644 --- a/surfsense_backend/tests/integration/google_unification/test_calendar_indexer_credentials.py +++ b/surfsense_backend/tests/integration/google_unification/test_calendar_indexer_credentials.py @@ -7,7 +7,7 @@ mocked at their system boundaries. from __future__ import annotations -from unittest.mock import AsyncMock, MagicMock, patch +from unittest.mock import ANY, AsyncMock, MagicMock, patch import pytest import pytest_asyncio @@ -25,6 +25,7 @@ pytestmark = pytest.mark.integration _COMPOSIO_ACCOUNT_ID = "composio-calendar-test-789" _INDEXER_MODULE = "app.tasks.connector_indexers.google_calendar_indexer" +_GET_ACCESS_TOKEN = "app.services.composio_service.ComposioService.get_access_token" @pytest_asyncio.fixture @@ -69,32 +70,29 @@ async def native_calendar(async_engine): await cleanup_space(async_engine, data["search_space_id"]) +@patch(_GET_ACCESS_TOKEN) @patch(f"{_INDEXER_MODULE}.TaskLoggingService") @patch(f"{_INDEXER_MODULE}.GoogleCalendarConnector") -@patch(f"{_INDEXER_MODULE}.build_composio_credentials") -async def test_composio_calendar_uses_composio_credentials( - mock_build_creds, +@patch(f"{_INDEXER_MODULE}.ComposioService") +async def test_composio_calendar_uses_composio_service( + mock_composio_service_cls, mock_cal_cls, mock_tl_cls, + mock_get_access_token, async_engine, composio_calendar, ): - """Calendar indexer calls build_composio_credentials for a Composio connector.""" + """Calendar indexer uses Composio tools directly for a Composio connector.""" from app.tasks.connector_indexers.google_calendar_indexer import ( index_google_calendar_events, ) data = composio_calendar - mock_creds = MagicMock(name="composio-creds") - mock_build_creds.return_value = mock_creds + mock_composio_service = MagicMock() + mock_composio_service.get_calendar_events = AsyncMock(return_value=([], None)) + mock_composio_service_cls.return_value = mock_composio_service mock_tl_cls.return_value = mock_task_logger() - mock_cal_instance = MagicMock() - mock_cal_instance.get_all_primary_calendar_events = AsyncMock( - return_value=([], None) - ) - mock_cal_cls.return_value = mock_cal_instance - maker = make_session_factory(async_engine) async with maker() as session: await index_google_calendar_events( @@ -104,17 +102,25 @@ async def test_composio_calendar_uses_composio_credentials( user_id=data["user_id"], ) - mock_build_creds.assert_called_once_with(_COMPOSIO_ACCOUNT_ID) - mock_cal_cls.assert_called_once() - _, kwargs = mock_cal_cls.call_args - assert kwargs.get("credentials") is mock_creds + mock_composio_service_cls.assert_called_once() + mock_composio_service.get_calendar_events.assert_called_once_with( + connected_account_id=_COMPOSIO_ACCOUNT_ID, + entity_id=f"surfsense_{data['user_id']}", + time_min=ANY, + time_max=ANY, + max_results=250, + ) + mock_cal_cls.assert_not_called() + mock_get_access_token.assert_not_called() +@patch(_GET_ACCESS_TOKEN) @patch(f"{_INDEXER_MODULE}.TaskLoggingService") -@patch(f"{_INDEXER_MODULE}.build_composio_credentials") +@patch(f"{_INDEXER_MODULE}.ComposioService") async def test_composio_calendar_without_account_id_returns_error( - mock_build_creds, + mock_composio_service_cls, mock_tl_cls, + mock_get_access_token, async_engine, composio_calendar_no_id, ): @@ -138,20 +144,23 @@ async def test_composio_calendar_without_account_id_returns_error( assert count == 0 assert error is not None assert "composio" in error.lower() - mock_build_creds.assert_not_called() + mock_composio_service_cls.assert_not_called() + mock_get_access_token.assert_not_called() +@patch(_GET_ACCESS_TOKEN) @patch(f"{_INDEXER_MODULE}.TaskLoggingService") +@patch(f"{_INDEXER_MODULE}.ComposioService") @patch(f"{_INDEXER_MODULE}.GoogleCalendarConnector") -@patch(f"{_INDEXER_MODULE}.build_composio_credentials") -async def test_native_calendar_does_not_use_composio_credentials( - mock_build_creds, +async def test_native_calendar_uses_google_calendar_connector( mock_cal_cls, + mock_composio_service_cls, mock_tl_cls, + mock_get_access_token, async_engine, native_calendar, ): - """Calendar indexer does NOT call build_composio_credentials for a native connector.""" + """Native Calendar connector uses GoogleCalendarConnector with no Composio path.""" from app.tasks.connector_indexers.google_calendar_indexer import ( index_google_calendar_events, ) @@ -174,4 +183,6 @@ async def test_native_calendar_does_not_use_composio_credentials( user_id=data["user_id"], ) - mock_build_creds.assert_not_called() + mock_cal_cls.assert_called_once() + mock_composio_service_cls.assert_not_called() + mock_get_access_token.assert_not_called() diff --git a/surfsense_backend/tests/integration/google_unification/test_gmail_indexer_credentials.py b/surfsense_backend/tests/integration/google_unification/test_gmail_indexer_credentials.py index afb3e64c3..b869f5607 100644 --- a/surfsense_backend/tests/integration/google_unification/test_gmail_indexer_credentials.py +++ b/surfsense_backend/tests/integration/google_unification/test_gmail_indexer_credentials.py @@ -7,7 +7,7 @@ mocked at their system boundaries. from __future__ import annotations -from unittest.mock import AsyncMock, MagicMock, patch +from unittest.mock import ANY, AsyncMock, MagicMock, patch import pytest import pytest_asyncio @@ -25,6 +25,7 @@ pytestmark = pytest.mark.integration _COMPOSIO_ACCOUNT_ID = "composio-gmail-test-456" _INDEXER_MODULE = "app.tasks.connector_indexers.google_gmail_indexer" +_GET_ACCESS_TOKEN = "app.services.composio_service.ComposioService.get_access_token" @pytest_asyncio.fixture @@ -69,30 +70,32 @@ async def native_gmail(async_engine): await cleanup_space(async_engine, data["search_space_id"]) +@patch(_GET_ACCESS_TOKEN) @patch(f"{_INDEXER_MODULE}.TaskLoggingService") @patch(f"{_INDEXER_MODULE}.GoogleGmailConnector") -@patch(f"{_INDEXER_MODULE}.build_composio_credentials") -async def test_composio_gmail_uses_composio_credentials( - mock_build_creds, +@patch(f"{_INDEXER_MODULE}.ComposioService") +async def test_composio_gmail_uses_composio_service( + mock_composio_service_cls, mock_gmail_cls, mock_tl_cls, + mock_get_access_token, async_engine, composio_gmail, ): - """Gmail indexer calls build_composio_credentials for a Composio connector.""" + """Gmail indexer uses Composio tools directly for a Composio connector.""" from app.tasks.connector_indexers.google_gmail_indexer import ( index_google_gmail_messages, ) data = composio_gmail - mock_creds = MagicMock(name="composio-creds") - mock_build_creds.return_value = mock_creds + mock_composio_service = MagicMock() + mock_composio_service.get_gmail_messages = AsyncMock( + return_value=([], None, None, None) + ) + mock_composio_service.get_gmail_message_detail = AsyncMock(return_value=({}, None)) + mock_composio_service_cls.return_value = mock_composio_service mock_tl_cls.return_value = mock_task_logger() - mock_gmail_instance = MagicMock() - mock_gmail_instance.get_recent_messages = AsyncMock(return_value=([], None)) - mock_gmail_cls.return_value = mock_gmail_instance - maker = make_session_factory(async_engine) async with maker() as session: await index_google_gmail_messages( @@ -102,17 +105,25 @@ async def test_composio_gmail_uses_composio_credentials( user_id=data["user_id"], ) - mock_build_creds.assert_called_once_with(_COMPOSIO_ACCOUNT_ID) - mock_gmail_cls.assert_called_once() - args, _ = mock_gmail_cls.call_args - assert args[0] is mock_creds + mock_composio_service_cls.assert_called_once() + mock_composio_service.get_gmail_messages.assert_called_once_with( + connected_account_id=_COMPOSIO_ACCOUNT_ID, + entity_id=f"surfsense_{data['user_id']}", + query=ANY, + max_results=ANY, + page_token=None, + ) + mock_gmail_cls.assert_not_called() + mock_get_access_token.assert_not_called() +@patch(_GET_ACCESS_TOKEN) @patch(f"{_INDEXER_MODULE}.TaskLoggingService") -@patch(f"{_INDEXER_MODULE}.build_composio_credentials") +@patch(f"{_INDEXER_MODULE}.ComposioService") async def test_composio_gmail_without_account_id_returns_error( - mock_build_creds, + mock_composio_service_cls, mock_tl_cls, + mock_get_access_token, async_engine, composio_gmail_no_id, ): @@ -136,20 +147,23 @@ async def test_composio_gmail_without_account_id_returns_error( assert count == 0 assert error is not None assert "composio" in error.lower() - mock_build_creds.assert_not_called() + mock_composio_service_cls.assert_not_called() + mock_get_access_token.assert_not_called() +@patch(_GET_ACCESS_TOKEN) @patch(f"{_INDEXER_MODULE}.TaskLoggingService") +@patch(f"{_INDEXER_MODULE}.ComposioService") @patch(f"{_INDEXER_MODULE}.GoogleGmailConnector") -@patch(f"{_INDEXER_MODULE}.build_composio_credentials") -async def test_native_gmail_does_not_use_composio_credentials( - mock_build_creds, +async def test_native_gmail_uses_google_gmail_connector( mock_gmail_cls, + mock_composio_service_cls, mock_tl_cls, + mock_get_access_token, async_engine, native_gmail, ): - """Gmail indexer does NOT call build_composio_credentials for a native connector.""" + """Native Gmail connector uses GoogleGmailConnector with no Composio path.""" from app.tasks.connector_indexers.google_gmail_indexer import ( index_google_gmail_messages, ) @@ -170,4 +184,6 @@ async def test_native_gmail_does_not_use_composio_credentials( user_id=data["user_id"], ) - mock_build_creds.assert_not_called() + mock_gmail_cls.assert_called_once() + mock_composio_service_cls.assert_not_called() + mock_get_access_token.assert_not_called() diff --git a/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py b/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py index 9fc802aa6..6bb1d2094 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/adapters/test_file_upload_adapter.py @@ -200,7 +200,7 @@ async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, m async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, mocker): """Reindexing replaces old chunks with new content rather than appending.""" mocker.patch( - "app.indexing_pipeline.indexing_pipeline_service.chunk_text", + "app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid", side_effect=[["Original chunk."], ["Updated chunk."]], ) diff --git a/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py b/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py index 59b6cec9c..07e388836 100644 --- a/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py +++ b/surfsense_backend/tests/unit/indexing_pipeline/test_index_batch_parallel.py @@ -37,7 +37,12 @@ def _make_orm_doc(connector_doc, doc_id): async def test_index_calls_embed_and_chunk_via_to_thread( pipeline, make_connector_document, monkeypatch ): - """index() runs embed_texts and the chunker via asyncio.to_thread, not blocking the loop.""" + """index() runs the chunker and embed_texts via asyncio.to_thread, not blocking the loop. + + Routing between ``chunk_text`` (code path) and ``chunk_text_hybrid`` (default + path, see issue #1334) is verified separately in + ``test_non_code_documents_use_hybrid_chunker``. + """ to_thread_calls = [] original_to_thread = asyncio.to_thread @@ -51,12 +56,6 @@ async def test_index_calls_embed_and_chunk_via_to_thread( "app.indexing_pipeline.indexing_pipeline_service.summarize_document", AsyncMock(return_value="Summary."), ) - mock_chunk = MagicMock(return_value=["chunk1"]) - mock_chunk.__name__ = "chunk_text" - monkeypatch.setattr( - "app.indexing_pipeline.indexing_pipeline_service.chunk_text", - mock_chunk, - ) mock_chunk_hybrid = MagicMock(return_value=["chunk1"]) mock_chunk_hybrid.__name__ = "chunk_text_hybrid" monkeypatch.setattr( @@ -71,6 +70,11 @@ async def test_index_calls_embed_and_chunk_via_to_thread( "app.indexing_pipeline.indexing_pipeline_service.embed_texts", mock_embed, ) + # Bypass set_committed_value, which requires a real ORM instance (not MagicMock). + monkeypatch.setattr( + "app.indexing_pipeline.indexing_pipeline_service.attach_chunks_to_document", + MagicMock(), + ) connector_doc = make_connector_document( document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR, @@ -83,11 +87,62 @@ async def test_index_calls_embed_and_chunk_via_to_thread( await pipeline.index(document, connector_doc, llm=MagicMock()) - # Non-code documents now route through the table-aware hybrid chunker - # (see commit 2f3a33c9). Either chunker entry point satisfies the - # "chunking runs off the event loop" contract this test guards. + # Either chunker entry point satisfies the "chunking runs off the event + # loop" contract this test guards. Routing between the two is verified + # in test_non_code_documents_use_hybrid_chunker. assert {"chunk_text", "chunk_text_hybrid"} & set(to_thread_calls) assert "embed_texts" in to_thread_calls + assert document.status == DocumentStatus.ready() + + +async def test_non_code_documents_use_hybrid_chunker( + pipeline, make_connector_document, monkeypatch +): + """Non-code documents route through ``chunk_text_hybrid`` (issue #1334). + + The hybrid chunker preserves Markdown table integrity by avoiding splits + mid-row. Only documents flagged with ``should_use_code_chunker=True`` + should take the ``chunk_text`` path. + """ + monkeypatch.setattr( + "app.indexing_pipeline.indexing_pipeline_service.summarize_document", + AsyncMock(return_value="Summary."), + ) + mock_chunk_hybrid = MagicMock(return_value=["chunk1"]) + mock_chunk_hybrid.__name__ = "chunk_text_hybrid" + monkeypatch.setattr( + "app.indexing_pipeline.indexing_pipeline_service.chunk_text_hybrid", + mock_chunk_hybrid, + ) + mock_chunk_code = MagicMock(return_value=["chunk1"]) + mock_chunk_code.__name__ = "chunk_text" + monkeypatch.setattr( + "app.indexing_pipeline.indexing_pipeline_service.chunk_text", + mock_chunk_code, + ) + monkeypatch.setattr( + "app.indexing_pipeline.indexing_pipeline_service.embed_texts", + MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]), + ) + monkeypatch.setattr( + "app.indexing_pipeline.indexing_pipeline_service.attach_chunks_to_document", + MagicMock(), + ) + + connector_doc = make_connector_document( + document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR, + unique_id="msg-1", + search_space_id=1, + should_use_code_chunker=False, + ) + document = MagicMock(spec=Document) + document.id = 1 + document.status = DocumentStatus.pending() + + await pipeline.index(document, connector_doc, llm=MagicMock()) + + mock_chunk_hybrid.assert_called_once() + mock_chunk_code.assert_not_called() def _mock_session_factory(orm_docs_by_id):