mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-27 17:56:25 +02:00
158 lines
4.4 KiB
Python
158 lines
4.4 KiB
Python
"""Shared fixtures for retriever integration tests."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import uuid
|
|
from datetime import UTC, datetime, timedelta
|
|
|
|
import pytest_asyncio
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from app.config import config as app_config
|
|
from app.db import Chunk, Document, DocumentType, SearchSpace, User
|
|
|
|
# Dimension reported by the application's configured embedding model instance.
# NOTE(review): presumably this matches the vector width of the embedding
# columns on Document/Chunk — confirm against the DB models.
EMBEDDING_DIM = app_config.embedding_model_instance.dimension
|
|
# Placeholder vector: EMBEDDING_DIM entries, every one equal to 0.1.  The
# builders in this module assign this same list as the embedding of each
# document and chunk they create.
DUMMY_EMBEDDING = [0.1] * EMBEDDING_DIM
|
|
|
|
|
|
def _make_document(
    *,
    title: str,
    document_type: DocumentType,
    content: str,
    search_space_id: int,
    created_by_id: str,
    updated_at: datetime | None = None,
) -> Document:
    """Build an unsaved ``Document`` populated with test-friendly defaults.

    All arguments are keyword-only.  ``content`` is stored both as the
    document content and as its source markdown.  The two hash columns get
    distinct prefixes around one shared random 12-hex-character suffix, so
    every call yields different hash values.

    Args:
        title: Document title.
        document_type: ``DocumentType`` member for the document.
        content: Text used for ``content`` and ``source_markdown``.
        search_space_id: ID of the owning search space.
        created_by_id: ID of the creating user, as a string.
        updated_at: Timestamp to store; when omitted (or falsy) the current
            UTC time is used.

    Returns:
        A new ``Document`` instance; nothing is added to any session here.
    """
    suffix = uuid.uuid4().hex[:12]
    timestamp = updated_at if updated_at else datetime.now(UTC)

    fields = {
        "title": title,
        "document_type": document_type,
        "content": content,
        "content_hash": f"content-{suffix}",
        "unique_identifier_hash": f"uid-{suffix}",
        "source_markdown": content,
        "search_space_id": search_space_id,
        "created_by_id": created_by_id,
        "embedding": DUMMY_EMBEDDING,
        "updated_at": timestamp,
        "status": {"state": "ready"},
    }
    return Document(**fields)
|
|
|
|
|
|
def _make_chunk(*, content: str, document_id: int) -> Chunk:
    """Build an unsaved ``Chunk`` for ``document_id`` with the dummy embedding.

    Args:
        content: Text of the chunk.
        document_id: ID of the document the chunk belongs to.

    Returns:
        A new ``Chunk`` instance; nothing is added to any session here.
    """
    chunk_fields = {
        "content": content,
        "document_id": document_id,
        "embedding": DUMMY_EMBEDDING,
    }
    return Chunk(**chunk_fields)
|
|
|
|
|
|
@pytest_asyncio.fixture
async def seed_large_doc(
    db_session: AsyncSession, db_user: User, db_search_space: SearchSpace
):
    """Insert a document with 35 chunks (more than _MAX_FETCH_CHUNKS_PER_DOC=20).

    Also inserts a small single-chunk note document for diversity testing.

    Both documents are flushed before their chunks are created, because the
    chunks are built from ``large_doc.id`` / ``small_doc.id``.  The fixture
    only flushes; it never commits.

    Returns a dict with:

    - ``large_doc`` / ``small_doc``: the two ``Document`` instances,
    - ``large_chunk_ids``: IDs of all 35 chunks of the large document,
    - ``small_chunk_ids``: IDs of the small document's chunk(s),
    - ``search_space`` / ``user``: the fixtures the data was created under.
    """
    large_chunk_count = 35

    user_id = str(db_user.id)
    space_id = db_search_space.id

    large_doc = _make_document(
        title="Large PDF Document",
        document_type=DocumentType.FILE,
        content="large document about quarterly performance reviews and budgets",
        search_space_id=space_id,
        created_by_id=user_id,
    )
    small_doc = _make_document(
        title="Small Note",
        document_type=DocumentType.NOTE,
        content="quarterly performance review summary note",
        search_space_id=space_id,
        created_by_id=user_id,
    )

    db_session.add_all([large_doc, small_doc])
    # Flush before reading ``.id`` on the documents below.
    await db_session.flush()

    large_chunks = [
        _make_chunk(
            content=f"chunk {i} about quarterly performance review section {i}",
            document_id=large_doc.id,
        )
        for i in range(large_chunk_count)
    ]
    small_chunks = [
        _make_chunk(
            content="quarterly performance review summary note content",
            document_id=small_doc.id,
        ),
    ]

    db_session.add_all(large_chunks + small_chunks)
    # Flush again before the chunk IDs are collected for the return value.
    await db_session.flush()

    return {
        "large_doc": large_doc,
        "small_doc": small_doc,
        "large_chunk_ids": [c.id for c in large_chunks],
        "small_chunk_ids": [c.id for c in small_chunks],
        "search_space": db_search_space,
        "user": db_user,
    }
|
|
|
|
|
|
@pytest_asyncio.fixture
async def seed_date_filtered_docs(
    db_session: AsyncSession, db_user: User, db_search_space: SearchSpace
):
    """Seed two same-content FILE documents whose ``updated_at`` differs.

    ``recent_doc`` is stamped with the current UTC time and ``old_doc`` with
    a time 730 days earlier.  Each document gets one chunk.  The session is
    flushed after the documents and again after the chunks; nothing is
    committed here.

    Returns a dict with ``recent_doc``, ``old_doc``, ``search_space`` and
    ``user``.
    """
    owner_id = str(db_user.id)
    search_space_pk = db_search_space.id
    reference_time = datetime.now(UTC)

    # Everything except the title and timestamp is shared by both documents.
    shared_fields = {
        "document_type": DocumentType.FILE,
        "content": "ocv meeting decisions and action items",
        "search_space_id": search_space_pk,
        "created_by_id": owner_id,
    }

    recent_doc = _make_document(
        title="Recent OCV Notes",
        updated_at=reference_time,
        **shared_fields,
    )
    old_doc = _make_document(
        title="Old OCV Notes",
        updated_at=reference_time - timedelta(days=730),
        **shared_fields,
    )

    db_session.add_all([recent_doc, old_doc])
    # Flush before reading ``.id`` on the documents below.
    await db_session.flush()

    recent_chunk = _make_chunk(
        content="ocv meeting decisions and action items recent",
        document_id=recent_doc.id,
    )
    old_chunk = _make_chunk(
        content="ocv meeting decisions and action items old",
        document_id=old_doc.id,
    )
    db_session.add_all([recent_chunk, old_chunk])
    await db_session.flush()

    seeded = {
        "recent_doc": recent_doc,
        "old_doc": old_doc,
        "search_space": db_search_space,
        "user": db_user,
    }
    return seeded
|