SurfSense/surfsense_backend/tests/integration/google_unification/conftest.py
CREDO23 ce15016533 citations: consolidate prompts, retire eager path, refresh ADR
Rewrite the main-agent citation contract to a single [n] channel and sync
the orphaned system_prompt_composer surface to match; drop stale
[citation:chunk_id] / <chunk_index> references from dynamic_context and
provider hints. Reuse the shared hybrid search in the deliverables report
(citations omitted for now) and delete the orphaned report KB helper.
Remove the dead eager KnowledgePriorityMiddleware wiring (knowledge_priority
+ stack) and its legacy browse test. Update ADR 0001 to reflect the cutover.
2026-06-25 15:27:09 +02:00

310 lines
9.5 KiB
Python

"""Shared fixtures for Google unification integration tests."""
from __future__ import annotations
import uuid
from datetime import UTC, datetime
from unittest.mock import MagicMock
import pytest
import pytest_asyncio
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
from app.config import config as app_config
from app.db import (
Chunk,
Document,
DocumentType,
SearchSourceConnector,
SearchSourceConnectorType,
SearchSpace,
User,
)
# Vector length reported by the configured embedding model instance.
EMBEDDING_DIM = app_config.embedding_model_instance.dimension
# Constant placeholder vector of that length; the helpers and fixtures below
# use it wherever a Document/Chunk embedding or a mocked embed() result is needed.
DUMMY_EMBEDDING = [0.1] * EMBEDDING_DIM
def make_document(
    *,
    title: str,
    document_type: DocumentType,
    content: str,
    search_space_id: int,
    created_by_id: str,
) -> Document:
    """Create an unsaved ``Document`` populated with test-friendly defaults.

    A fresh 12-character hex token is embedded in both ``content_hash`` and
    ``unique_identifier_hash`` so every call produces distinct hash values.
    ``content`` is also stored as ``source_markdown``, the embedding is the
    shared ``DUMMY_EMBEDDING``, ``updated_at`` is the current UTC time, and
    ``status`` is ``{"state": "ready"}``.
    """
    token = uuid.uuid4().hex[:12]
    fields = {
        "title": title,
        "document_type": document_type,
        "content": content,
        "content_hash": f"content-{token}",
        "unique_identifier_hash": f"uid-{token}",
        "source_markdown": content,
        "search_space_id": search_space_id,
        "created_by_id": created_by_id,
        "embedding": DUMMY_EMBEDDING,
        "updated_at": datetime.now(UTC),
        "status": {"state": "ready"},
    }
    return Document(**fields)
def make_chunk(*, content: str, document_id: int) -> Chunk:
    """Create an unsaved ``Chunk`` for ``document_id`` with the dummy embedding."""
    chunk = Chunk(content=content, document_id=document_id, embedding=DUMMY_EMBEDDING)
    return chunk
# ---------------------------------------------------------------------------
# Savepoint-based fixture (used by retriever tests that receive db_session)
# ---------------------------------------------------------------------------
@pytest_asyncio.fixture
async def seed_google_docs(
    db_session: AsyncSession, db_user: User, db_search_space: SearchSpace
):
    """Seed three documents (each with one chunk) through ``db_session``.

    The documents are a native Google Drive file, a legacy Composio Google
    Drive document, and a plain FILE upload. Data is only flushed, never
    committed, by this fixture.

    Returns a dict with keys ``native_doc``, ``legacy_doc``, ``file_doc``,
    ``search_space`` and ``user``.
    """
    owner_id = str(db_user.id)
    space_pk = db_search_space.id

    # (result key, title, document type, text used for both document and chunk)
    specs = (
        (
            "native_doc",
            "Native Drive Document",
            DocumentType.GOOGLE_DRIVE_FILE,
            "quarterly report from native google drive connector",
        ),
        (
            "legacy_doc",
            "Legacy Composio Drive Document",
            DocumentType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR,
            "quarterly report from composio google drive connector",
        ),
        (
            "file_doc",
            "Uploaded PDF",
            DocumentType.FILE,
            "unrelated uploaded file about quarterly reports",
        ),
    )

    docs = {
        key: make_document(
            title=title,
            document_type=doc_type,
            content=body,
            search_space_id=space_pk,
            created_by_id=owner_id,
        )
        for key, title, doc_type, body in specs
    }
    db_session.add_all(list(docs.values()))
    # Flush before reading ``.id`` for the chunk foreign keys below.
    await db_session.flush()

    chunks = [
        make_chunk(content=body, document_id=docs[key].id)
        for key, _title, _doc_type, body in specs
    ]
    db_session.add_all(chunks)
    await db_session.flush()

    return {**docs, "search_space": db_search_space, "user": db_user}
# ---------------------------------------------------------------------------
# Committed-data fixture (used by service / browse tests that create their
# own sessions internally and therefore cannot see savepoint-scoped data)
# ---------------------------------------------------------------------------
@pytest_asyncio.fixture
async def committed_google_data(async_engine):
    """Seed a user, a search space, and three documents in a committed transaction.

    The documents (native Drive, legacy Composio Drive, plain FILE) each get
    one chunk whose content mirrors the document content.

    Yields ``{"search_space_id": int, "user_id": str}``.

    Teardown deletes the search space row only; removal of its documents and
    chunks relies on the schema cascading that delete. The seeded user row is
    not removed by this fixture.
    """
    async with async_engine.begin() as conn:
        session = AsyncSession(bind=conn, expire_on_commit=False)

        owner = User(
            id=uuid.uuid4(),
            email=f"google-test-{uuid.uuid4().hex[:6]}@surfsense.net",
            hashed_password="hashed",
            is_active=True,
            is_superuser=False,
            is_verified=True,
        )
        session.add(owner)
        await session.flush()

        space = SearchSpace(name=f"Google Test {uuid.uuid4().hex[:6]}", user_id=owner.id)
        session.add(space)
        await session.flush()

        space_id = space.id
        user_id = str(owner.id)

        # (title, document type, content)
        doc_specs = (
            (
                "Native Drive Doc",
                DocumentType.GOOGLE_DRIVE_FILE,
                "quarterly budget from native google drive",
            ),
            (
                "Legacy Composio Drive Doc",
                DocumentType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR,
                "quarterly budget from composio google drive",
            ),
            (
                "Plain File",
                DocumentType.FILE,
                "quarterly budget uploaded as file",
            ),
        )
        docs = [
            make_document(
                title=title,
                document_type=doc_type,
                content=body,
                search_space_id=space_id,
                created_by_id=user_id,
            )
            for title, doc_type, body in doc_specs
        ]
        session.add_all(docs)
        # Flush before reading ``doc.id`` for the chunk foreign keys.
        await session.flush()

        session.add_all(
            [
                Chunk(
                    content=doc.content,
                    document_id=doc.id,
                    embedding=DUMMY_EMBEDDING,
                )
                for doc in docs
            ]
        )
        await session.flush()

    yield {"search_space_id": space_id, "user_id": user_id}

    async with async_engine.begin() as conn:
        await conn.execute(
            text("DELETE FROM searchspaces WHERE id = :sid"), {"sid": space_id}
        )
# ---------------------------------------------------------------------------
# Monkeypatch fixtures for system boundaries
# ---------------------------------------------------------------------------
@pytest.fixture
def patched_session_factory(async_engine, monkeypatch):
    """Point ``connector_service.async_session_maker`` at the test engine.

    Returns the replacement session factory so tests can open sessions on the
    same engine.
    """
    factory = async_sessionmaker(async_engine, expire_on_commit=False)
    target = "app.services.connector_service.async_session_maker"
    monkeypatch.setattr(target, factory)
    return factory
@pytest.fixture
def patched_embed(monkeypatch):
    """Swap the embedding model's ``embed`` for a mock returning ``DUMMY_EMBEDDING``.

    Returns the mock so tests can inspect how it was called.
    """
    embed_stub = MagicMock()
    embed_stub.return_value = DUMMY_EMBEDDING
    monkeypatch.setattr(
        "app.config.config.embedding_model_instance.embed", embed_stub
    )
    return embed_stub
# ---------------------------------------------------------------------------
# Indexer test helpers
# ---------------------------------------------------------------------------
def make_session_factory(async_engine):
    """Build an ``async_sessionmaker`` on ``async_engine`` with ``expire_on_commit=False``."""
    factory = async_sessionmaker(async_engine, expire_on_commit=False)
    return factory
def mock_task_logger():
    """Build an ``AsyncMock`` stand-in for TaskLoggingService.

    ``log_task_start`` resolves to a ``MagicMock``; ``log_task_progress``,
    ``log_task_failure`` and ``log_task_success`` are plain ``AsyncMock``s.
    """
    from unittest.mock import AsyncMock, MagicMock

    task_logger = AsyncMock()
    task_logger.log_task_start = AsyncMock(return_value=MagicMock())
    for method_name in ("log_task_progress", "log_task_failure", "log_task_success"):
        setattr(task_logger, method_name, AsyncMock())
    return task_logger
async def seed_connector(
    async_engine,
    *,
    connector_type: SearchSourceConnectorType,
    config: dict,
    name_prefix: str = "test",
):
    """Seed a user, a search space, and an indexable connector, committed.

    ``name_prefix`` is used in the user's email, the search space name, and
    the connector name; a short random suffix is added to the first two.

    Returns ``{"connector_id", "search_space_id", "user_id"}`` with
    ``user_id`` as a string. No cleanup is performed here; callers are
    responsible for removing the data (for example via ``cleanup_space``).
    """
    async with async_engine.begin() as conn:
        session = AsyncSession(bind=conn, expire_on_commit=False)

        owner = User(
            id=uuid.uuid4(),
            email=f"{name_prefix}-{uuid.uuid4().hex[:6]}@surfsense.net",
            hashed_password="hashed",
            is_active=True,
            is_superuser=False,
            is_verified=True,
        )
        session.add(owner)
        await session.flush()

        space = SearchSpace(
            name=f"{name_prefix} {uuid.uuid4().hex[:6]}", user_id=owner.id
        )
        session.add(space)
        await session.flush()

        connector = SearchSourceConnector(
            name=f"{name_prefix} connector",
            connector_type=connector_type,
            is_indexable=True,
            config=config,
            search_space_id=space.id,
            user_id=owner.id,
        )
        session.add(connector)
        await session.flush()

        # Capture plain values while still inside the transaction.
        seeded = {
            "connector_id": connector.id,
            "search_space_id": space.id,
            "user_id": str(owner.id),
        }

    return seeded
async def cleanup_space(async_engine, space_id: int):
    """Delete the ``searchspaces`` row with id ``space_id`` in its own transaction.

    Removal of dependent connectors/documents relies on the schema cascading
    the delete.
    """
    delete_space = text("DELETE FROM searchspaces WHERE id = :sid")
    async with async_engine.begin() as conn:
        await conn.execute(delete_space, {"sid": space_id})