feat: optimize agent file system

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-03-28 16:39:46 -07:00
parent ee0b59c0fa
commit 2cc2d339e6
67 changed files with 8011 additions and 5591 deletions

View file

@ -0,0 +1,106 @@
"""Shared fixtures for retriever integration tests."""
from __future__ import annotations
import uuid
from datetime import UTC, datetime
import pytest_asyncio
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config as app_config
from app.db import Chunk, Document, DocumentType, SearchSpace, User
# Dimension of the embedding model configured for the app; seeded rows must match it.
EMBEDDING_DIM = app_config.embedding_model_instance.dimension
# Constant non-zero vector reused for every document/chunk so vector-similarity
# scores are deterministic across test runs.
DUMMY_EMBEDDING = [0.1] * EMBEDDING_DIM
def _make_document(
    *,
    title: str,
    document_type: DocumentType,
    content: str,
    search_space_id: int,
    created_by_id: str,
) -> Document:
    """Build an unsaved ``Document`` with unique hashes and the dummy embedding.

    A random 12-hex-char suffix keeps ``content_hash`` and
    ``unique_identifier_hash`` distinct across fixture invocations.
    """
    uid = uuid.uuid4().hex[:12]
    document = Document(
        title=title,
        document_type=document_type,
        content=content,
        content_hash=f"content-{uid}",
        unique_identifier_hash=f"uid-{uid}",
        source_markdown=content,
        search_space_id=search_space_id,
        created_by_id=created_by_id,
        embedding=DUMMY_EMBEDDING,
        updated_at=datetime.now(UTC),
        status={"state": "ready"},
    )
    return document
def _make_chunk(*, content: str, document_id: int) -> Chunk:
    """Build an unsaved ``Chunk`` for *document_id* carrying the dummy embedding."""
    fields = {
        "content": content,
        "document_id": document_id,
        "embedding": DUMMY_EMBEDDING,
    }
    return Chunk(**fields)
@pytest_asyncio.fixture
async def seed_large_doc(
    db_session: AsyncSession, db_user: User, db_search_space: SearchSpace
):
    """Insert a document with 35 chunks (more than _MAX_FETCH_CHUNKS_PER_DOC=20).

    Also inserts a small single-chunk document for diversity testing.
    Returns a dict with ``large_doc``, ``small_doc``, ``large_chunk_ids``
    (all 35 chunk IDs), ``small_chunk_ids``, ``search_space``, and ``user``.
    """
    user_id = str(db_user.id)
    space_id = db_search_space.id
    large_doc = _make_document(
        title="Large PDF Document",
        document_type=DocumentType.FILE,
        content="large document about quarterly performance reviews and budgets",
        search_space_id=space_id,
        created_by_id=user_id,
    )
    small_doc = _make_document(
        title="Small Note",
        document_type=DocumentType.NOTE,
        content="quarterly performance review summary note",
        search_space_id=space_id,
        created_by_id=user_id,
    )
    db_session.add_all([large_doc, small_doc])
    # Flush so both documents receive primary keys before chunks reference them.
    await db_session.flush()
    large_chunks = [
        _make_chunk(
            content=f"chunk {i} about quarterly performance review section {i}",
            document_id=large_doc.id,
        )
        for i in range(35)
    ]
    small_chunks = [
        _make_chunk(
            content="quarterly performance review summary note content",
            document_id=small_doc.id,
        ),
    ]
    db_session.add_all(large_chunks + small_chunks)
    # Second flush assigns chunk IDs so they can be returned to the tests.
    await db_session.flush()
    return {
        "large_doc": large_doc,
        "small_doc": small_doc,
        "large_chunk_ids": [c.id for c in large_chunks],
        "small_chunk_ids": [c.id for c in small_chunks],
        "search_space": db_search_space,
        "user": db_user,
    }

View file

@ -0,0 +1,116 @@
"""Integration tests for optimized ChucksHybridSearchRetriever.
Verifies the SQL ROW_NUMBER per-doc chunk limit, column pruning,
and doc metadata caching from RRF results.
"""
import pytest
from app.retriever.chunks_hybrid_search import (
_MAX_FETCH_CHUNKS_PER_DOC,
ChucksHybridSearchRetriever,
)
from .conftest import DUMMY_EMBEDDING
# Mark every test in this module as an integration test (needs a real database).
pytestmark = pytest.mark.integration
async def test_per_doc_chunk_limit_respected(db_session, seed_large_doc):
    """A document with 35 chunks should return exactly _MAX_FETCH_CHUNKS_PER_DOC chunks."""
    space_id = seed_large_doc["search_space"].id
    retriever = ChucksHybridSearchRetriever(db_session)
    results = await retriever.hybrid_search(
        query_text="quarterly performance review",
        top_k=10,
        search_space_id=space_id,
        query_embedding=DUMMY_EMBEDDING,
    )
    large_doc_id = seed_large_doc["large_doc"].id
    # Find the seeded 35-chunk document in the results and verify the cap.
    for result in results:
        if result["document"].get("id") == large_doc_id:
            # 35 seeded chunks exceed the limit, so exactly the cap must come back;
            # the previous "<=" assertion was redundant with this equality check.
            assert len(result["chunks"]) == _MAX_FETCH_CHUNKS_PER_DOC
            break
    else:
        pytest.fail("Large doc not found in search results")
async def test_doc_metadata_populated_from_rrf(db_session, seed_large_doc):
    """Document metadata (title, type, etc.) should be present even without joinedload."""
    retriever = ChucksHybridSearchRetriever(db_session)
    results = await retriever.hybrid_search(
        query_text="quarterly performance review",
        top_k=10,
        search_space_id=seed_large_doc["search_space"].id,
        query_embedding=DUMMY_EMBEDDING,
    )
    assert len(results) >= 1
    for entry in results:
        metadata = entry["document"]
        # Core identity fields must be carried over from the RRF query rows.
        assert "id" in metadata
        assert "title" in metadata
        assert metadata["title"]
        assert "document_type" in metadata
        assert metadata["document_type"] is not None
async def test_matched_chunk_ids_tracked(db_session, seed_large_doc):
    """matched_chunk_ids should contain the chunk IDs that appeared in the RRF results."""
    retriever = ChucksHybridSearchRetriever(db_session)
    results = await retriever.hybrid_search(
        query_text="quarterly performance review",
        top_k=10,
        search_space_id=seed_large_doc["search_space"].id,
        query_embedding=DUMMY_EMBEDDING,
    )
    for entry in results:
        returned_ids = {chunk["chunk_id"] for chunk in entry["chunks"]}
        # Every RRF-matched chunk ID must also appear in the returned chunk list.
        for mid in entry.get("matched_chunk_ids", []):
            assert mid in returned_ids, (
                f"matched_chunk_id {mid} not found in chunks"
            )
async def test_chunks_ordered_by_id(db_session, seed_large_doc):
    """Chunks within each document should be ordered by chunk ID (original order)."""
    retriever = ChucksHybridSearchRetriever(db_session)
    results = await retriever.hybrid_search(
        query_text="quarterly performance review",
        top_k=10,
        search_space_id=seed_large_doc["search_space"].id,
        query_embedding=DUMMY_EMBEDDING,
    )
    for entry in results:
        ids = [chunk["chunk_id"] for chunk in entry["chunks"]]
        assert ids == sorted(ids), "Chunks not ordered by ID"
async def test_score_is_positive_float(db_session, seed_large_doc):
    """Each result should have a positive float score from RRF."""
    retriever = ChucksHybridSearchRetriever(db_session)
    results = await retriever.hybrid_search(
        query_text="quarterly performance review",
        top_k=10,
        search_space_id=seed_large_doc["search_space"].id,
        query_embedding=DUMMY_EMBEDDING,
    )
    assert len(results) >= 1
    for entry in results:
        score = entry["score"]
        # RRF scores are sums of positive reciprocal ranks, so always > 0.
        assert isinstance(score, float)
        assert score > 0

View file

@ -0,0 +1,76 @@
"""Integration tests for optimized DocumentHybridSearchRetriever.
Verifies the SQL ROW_NUMBER per-doc chunk limit and column pruning.
"""
import pytest
from app.retriever.documents_hybrid_search import (
_MAX_FETCH_CHUNKS_PER_DOC,
DocumentHybridSearchRetriever,
)
from .conftest import DUMMY_EMBEDDING
# Mark every test in this module as an integration test (needs a real database).
pytestmark = pytest.mark.integration
async def test_per_doc_chunk_limit_respected(db_session, seed_large_doc):
    """A document with 35 chunks should return exactly _MAX_FETCH_CHUNKS_PER_DOC chunks."""
    space_id = seed_large_doc["search_space"].id
    retriever = DocumentHybridSearchRetriever(db_session)
    results = await retriever.hybrid_search(
        query_text="quarterly performance review",
        top_k=10,
        search_space_id=space_id,
        query_embedding=DUMMY_EMBEDDING,
    )
    large_doc_id = seed_large_doc["large_doc"].id
    # Find the seeded 35-chunk document in the results and verify the cap.
    for result in results:
        if result["document"].get("id") == large_doc_id:
            # 35 seeded chunks exceed the limit, so exactly the cap must come back;
            # the previous "<=" assertion was redundant with this equality check.
            assert len(result["chunks"]) == _MAX_FETCH_CHUNKS_PER_DOC
            break
    else:
        pytest.fail("Large doc not found in search results")
async def test_doc_metadata_populated(db_session, seed_large_doc):
    """Document metadata should be present from the RRF results."""
    retriever = DocumentHybridSearchRetriever(db_session)
    results = await retriever.hybrid_search(
        query_text="quarterly performance review",
        top_k=10,
        search_space_id=seed_large_doc["search_space"].id,
        query_embedding=DUMMY_EMBEDDING,
    )
    assert len(results) >= 1
    for entry in results:
        metadata = entry["document"]
        # Core identity fields must be carried over from the RRF query rows.
        assert "id" in metadata
        assert "title" in metadata
        assert metadata["title"]
        assert "document_type" in metadata
        assert metadata["document_type"] is not None
async def test_chunks_ordered_by_id(db_session, seed_large_doc):
    """Chunks within each document should be ordered by chunk ID."""
    retriever = DocumentHybridSearchRetriever(db_session)
    results = await retriever.hybrid_search(
        query_text="quarterly performance review",
        top_k=10,
        search_space_id=seed_large_doc["search_space"].id,
        query_embedding=DUMMY_EMBEDDING,
    )
    for entry in results:
        ids = [chunk["chunk_id"] for chunk in entry["chunks"]]
        assert ids == sorted(ids), "Chunks not ordered by ID"