# SurfSense/surfsense_backend/tests/integration/retriever/conftest.py

"""Shared fixtures for retriever integration tests."""

from __future__ import annotations

import uuid
from datetime import UTC, datetime, timedelta

import pytest_asyncio
from sqlalchemy.ext.asyncio import AsyncSession

from app.config import config as app_config
from app.db import Chunk, Document, DocumentType, SearchSpace, User

# Length of the embedding vectors, read from the configured embedding model
# instance at import time.
EMBEDDING_DIM = app_config.embedding_model_instance.dimension
# Placeholder vector (every component 0.1) of that length. The helpers in this
# module assign this same list object as the embedding of every Document and
# Chunk they build, so all seeded rows carry identical embeddings.
DUMMY_EMBEDDING = [0.1] * EMBEDDING_DIM


def _make_document(
    *,
    title: str,
    document_type: DocumentType,
    content: str,
    search_space_id: int,
    created_by_id: str,
    updated_at: datetime | None = None,
) -> Document:
    """Build a ``Document`` instance filled in with test defaults.

    The instance is only constructed here; adding it to a session is left to
    the caller.

    Args:
        title: Document title.
        document_type: ``DocumentType`` member to assign.
        content: Text stored as both ``content`` and ``source_markdown``.
        search_space_id: ID of the owning search space.
        created_by_id: ID of the creating user, as a string.
        updated_at: Timestamp to store; the current UTC time is used when
            omitted.

    Returns:
        A new ``Document`` carrying ``DUMMY_EMBEDDING`` and a ``ready`` status.
    """
    # One random 12-hex-character suffix feeds both hash fields, so repeated
    # calls produce different hash values.
    suffix = uuid.uuid4().hex[:12]
    timestamp = updated_at or datetime.now(UTC)

    fields = {
        "title": title,
        "document_type": document_type,
        "content": content,
        "content_hash": f"content-{suffix}",
        "unique_identifier_hash": f"uid-{suffix}",
        "source_markdown": content,
        "search_space_id": search_space_id,
        "created_by_id": created_by_id,
        "embedding": DUMMY_EMBEDDING,
        "updated_at": timestamp,
        "status": {"state": "ready"},
    }
    return Document(**fields)


def _make_chunk(*, content: str, document_id: int) -> Chunk:
    """Build a ``Chunk`` for the given document using the dummy embedding.

    Args:
        content: Chunk text.
        document_id: ID of the document the chunk belongs to.

    Returns:
        A new ``Chunk`` instance; adding it to a session is left to the caller.
    """
    fields = {
        "content": content,
        "document_id": document_id,
        "embedding": DUMMY_EMBEDDING,
    }
    return Chunk(**fields)


@pytest_asyncio.fixture
async def seed_large_doc(
    db_session: AsyncSession, db_user: User, db_search_space: SearchSpace
):
    """Insert a 35-chunk document plus a single-chunk companion document.

    The large document is meant to hold more chunks than the retriever's
    per-document fetch cap (noted by the original author as
    ``_MAX_FETCH_CHUNKS_PER_DOC=20``; that constant is defined outside this
    module, so confirm the value there). The small document gives tests a
    second, different document to check result diversity against.

    Returns:
        A dict with the keys ``large_doc``, ``small_doc``,
        ``large_chunk_ids`` (IDs of all 35 chunks), ``small_chunk_ids``
        (ID of the single small-document chunk), ``search_space`` and
        ``user``.
    """
    large_chunk_count = 35

    user_id = str(db_user.id)
    space_id = db_search_space.id

    large_doc = _make_document(
        title="Large PDF Document",
        document_type=DocumentType.FILE,
        content="large document about quarterly performance reviews and budgets",
        search_space_id=space_id,
        created_by_id=user_id,
    )
    small_doc = _make_document(
        title="Small Note",
        document_type=DocumentType.NOTE,
        content="quarterly performance review summary note",
        search_space_id=space_id,
        created_by_id=user_id,
    )

    # The chunks below read ``large_doc.id`` / ``small_doc.id``, so the
    # documents are flushed before any chunk is built.
    db_session.add_all([large_doc, small_doc])
    await db_session.flush()

    large_chunks = [
        _make_chunk(
            content=f"chunk {i} about quarterly performance review section {i}",
            document_id=large_doc.id,
        )
        for i in range(large_chunk_count)
    ]
    small_chunks = [
        _make_chunk(
            content="quarterly performance review summary note content",
            document_id=small_doc.id,
        ),
    ]

    # Second flush: the chunk IDs are read for the returned dict.
    db_session.add_all(large_chunks + small_chunks)
    await db_session.flush()

    return {
        "large_doc": large_doc,
        "small_doc": small_doc,
        "large_chunk_ids": [c.id for c in large_chunks],
        "small_chunk_ids": [c.id for c in small_chunks],
        "search_space": db_search_space,
        "user": db_user,
    }


@pytest_asyncio.fixture
async def seed_date_filtered_docs(
    db_session: AsyncSession, db_user: User, db_search_space: SearchSpace
):
    """Seed two same-content FILE documents that differ in ``updated_at``.

    One document is stamped with the current UTC time and the other with a
    time 730 days earlier, so date-filter tests have one row on each side of
    a cutoff. Each document gets one chunk.

    Returns:
        A dict with the keys ``recent_doc``, ``old_doc``, ``search_space``
        and ``user``.
    """
    owner_id = str(db_user.id)
    search_space_pk = db_search_space.id
    reference_time = datetime.now(UTC)

    # Everything except the title and timestamp is shared by both documents.
    shared_fields = {
        "document_type": DocumentType.FILE,
        "content": "ocv meeting decisions and action items",
        "search_space_id": search_space_pk,
        "created_by_id": owner_id,
    }
    recent_doc = _make_document(
        title="Recent OCV Notes",
        updated_at=reference_time,
        **shared_fields,
    )
    old_doc = _make_document(
        title="Old OCV Notes",
        updated_at=reference_time - timedelta(days=730),
        **shared_fields,
    )

    # The chunks read each document's ``id``, so flush the documents first.
    db_session.add_all([recent_doc, old_doc])
    await db_session.flush()

    recent_chunk = _make_chunk(
        content="ocv meeting decisions and action items recent",
        document_id=recent_doc.id,
    )
    old_chunk = _make_chunk(
        content="ocv meeting decisions and action items old",
        document_id=old_doc.id,
    )
    db_session.add_all([recent_chunk, old_chunk])
    await db_session.flush()

    seeded = {}
    seeded["recent_doc"] = recent_doc
    seeded["old_doc"] = old_doc
    seeded["search_space"] = db_search_space
    seeded["user"] = db_user
    return seeded