"""Shared fixtures for retriever integration tests.""" from __future__ import annotations import uuid from datetime import UTC, datetime, timedelta import pytest_asyncio from sqlalchemy.ext.asyncio import AsyncSession from app.config import config as app_config from app.db import Chunk, Document, DocumentType, SearchSpace, User EMBEDDING_DIM = app_config.embedding_model_instance.dimension DUMMY_EMBEDDING = [0.1] * EMBEDDING_DIM def _make_document( *, title: str, document_type: DocumentType, content: str, search_space_id: int, created_by_id: str, updated_at: datetime | None = None, ) -> Document: uid = uuid.uuid4().hex[:12] return Document( title=title, document_type=document_type, content=content, content_hash=f"content-{uid}", unique_identifier_hash=f"uid-{uid}", source_markdown=content, search_space_id=search_space_id, created_by_id=created_by_id, embedding=DUMMY_EMBEDDING, updated_at=updated_at or datetime.now(UTC), status={"state": "ready"}, ) def _make_chunk(*, content: str, document_id: int) -> Chunk: return Chunk( content=content, document_id=document_id, embedding=DUMMY_EMBEDDING, ) @pytest_asyncio.fixture async def seed_large_doc( db_session: AsyncSession, db_user: User, db_search_space: SearchSpace ): """Insert a document with 35 chunks (more than _MAX_FETCH_CHUNKS_PER_DOC=20). Also inserts a small 3-chunk document for diversity testing. Returns a dict with ``large_doc``, ``small_doc``, ``search_space``, ``user``, and ``large_chunk_ids`` (all 35 chunk IDs). """ user_id = str(db_user.id) space_id = db_search_space.id large_doc = _make_document( title="Large PDF Document", document_type=DocumentType.FILE, content="large document about quarterly performance reviews and budgets", search_space_id=space_id, created_by_id=user_id, ) small_doc = _make_document( title="Small Note", document_type=DocumentType.NOTE, content="quarterly performance review summary note", search_space_id=space_id, created_by_id=user_id, ) db_session.add_all([large_doc, small_doc]) await db_session.flush() large_chunks = [] for i in range(35): chunk = _make_chunk( content=f"chunk {i} about quarterly performance review section {i}", document_id=large_doc.id, ) large_chunks.append(chunk) small_chunks = [ _make_chunk( content="quarterly performance review summary note content", document_id=small_doc.id, ), ] db_session.add_all(large_chunks + small_chunks) await db_session.flush() return { "large_doc": large_doc, "small_doc": small_doc, "large_chunk_ids": [c.id for c in large_chunks], "small_chunk_ids": [c.id for c in small_chunks], "search_space": db_search_space, "user": db_user, } @pytest_asyncio.fixture async def seed_date_filtered_docs( db_session: AsyncSession, db_user: User, db_search_space: SearchSpace ): """Insert matching docs with different timestamps for date-filter tests.""" user_id = str(db_user.id) space_id = db_search_space.id now = datetime.now(UTC) recent_doc = _make_document( title="Recent OCV Notes", document_type=DocumentType.FILE, content="ocv meeting decisions and action items", search_space_id=space_id, created_by_id=user_id, updated_at=now, ) old_doc = _make_document( title="Old OCV Notes", document_type=DocumentType.FILE, content="ocv meeting decisions and action items", search_space_id=space_id, created_by_id=user_id, updated_at=now - timedelta(days=730), ) db_session.add_all([recent_doc, old_doc]) await db_session.flush() db_session.add_all( [ _make_chunk( content="ocv meeting decisions and action items recent", document_id=recent_doc.id, ), _make_chunk( content="ocv meeting decisions and action items old", document_id=old_doc.id, ), ] ) await db_session.flush() return { "recent_doc": recent_doc, "old_doc": old_doc, "search_space": db_search_space, "user": db_user, }