feat: optimize agent file system

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-03-28 16:39:46 -07:00
parent ee0b59c0fa
commit 2cc2d339e6
67 changed files with 8011 additions and 5591 deletions

View file

@ -0,0 +1,106 @@
"""Shared fixtures for retriever integration tests."""
from __future__ import annotations
import uuid
from datetime import UTC, datetime
import pytest_asyncio
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config as app_config
from app.db import Chunk, Document, DocumentType, SearchSpace, User
# Dimension of the embedding model configured for the app; seeded rows must match it.
EMBEDDING_DIM = app_config.embedding_model_instance.dimension
# Constant non-zero vector reused for every document/chunk so vector-similarity
# scores are deterministic across test runs.
DUMMY_EMBEDDING = [0.1] * EMBEDDING_DIM
def _make_document(
    *,
    title: str,
    document_type: DocumentType,
    content: str,
    search_space_id: int,
    created_by_id: str,
) -> Document:
    """Build an unsaved ``Document`` with unique hashes and the dummy embedding.

    A random 12-hex-char suffix keeps ``content_hash`` and
    ``unique_identifier_hash`` distinct across fixture invocations.
    """
    uid = uuid.uuid4().hex[:12]
    document = Document(
        title=title,
        document_type=document_type,
        content=content,
        content_hash=f"content-{uid}",
        unique_identifier_hash=f"uid-{uid}",
        source_markdown=content,
        search_space_id=search_space_id,
        created_by_id=created_by_id,
        embedding=DUMMY_EMBEDDING,
        updated_at=datetime.now(UTC),
        status={"state": "ready"},
    )
    return document
def _make_chunk(*, content: str, document_id: int) -> Chunk:
    """Build an unsaved ``Chunk`` for *document_id* carrying the dummy embedding."""
    fields = {
        "content": content,
        "document_id": document_id,
        "embedding": DUMMY_EMBEDDING,
    }
    return Chunk(**fields)
@pytest_asyncio.fixture
async def seed_large_doc(
    db_session: AsyncSession, db_user: User, db_search_space: SearchSpace
):
    """Insert a document with 35 chunks (more than _MAX_FETCH_CHUNKS_PER_DOC=20).

    Also inserts a small single-chunk document for diversity testing.
    Returns a dict with ``large_doc``, ``small_doc``, ``large_chunk_ids``
    (all 35 chunk IDs), ``small_chunk_ids``, ``search_space``, and ``user``.
    """
    user_id = str(db_user.id)
    space_id = db_search_space.id
    large_doc = _make_document(
        title="Large PDF Document",
        document_type=DocumentType.FILE,
        content="large document about quarterly performance reviews and budgets",
        search_space_id=space_id,
        created_by_id=user_id,
    )
    small_doc = _make_document(
        title="Small Note",
        document_type=DocumentType.NOTE,
        content="quarterly performance review summary note",
        search_space_id=space_id,
        created_by_id=user_id,
    )
    db_session.add_all([large_doc, small_doc])
    # Flush so both documents receive primary keys before chunks reference them.
    await db_session.flush()
    large_chunks = [
        _make_chunk(
            content=f"chunk {i} about quarterly performance review section {i}",
            document_id=large_doc.id,
        )
        for i in range(35)
    ]
    small_chunks = [
        _make_chunk(
            content="quarterly performance review summary note content",
            document_id=small_doc.id,
        ),
    ]
    db_session.add_all(large_chunks + small_chunks)
    # Second flush assigns chunk IDs so they can be returned to the tests.
    await db_session.flush()
    return {
        "large_doc": large_doc,
        "small_doc": small_doc,
        "large_chunk_ids": [c.id for c in large_chunks],
        "small_chunk_ids": [c.id for c in small_chunks],
        "search_space": db_search_space,
        "user": db_user,
    }

View file

@ -0,0 +1,116 @@
"""Integration tests for optimized ChucksHybridSearchRetriever.
Verifies the SQL ROW_NUMBER per-doc chunk limit, column pruning,
and doc metadata caching from RRF results.
"""
import pytest
from app.retriever.chunks_hybrid_search import (
_MAX_FETCH_CHUNKS_PER_DOC,
ChucksHybridSearchRetriever,
)
from .conftest import DUMMY_EMBEDDING
# Mark every test in this module as an integration test (needs a real database).
pytestmark = pytest.mark.integration
async def test_per_doc_chunk_limit_respected(db_session, seed_large_doc):
    """A document with 35 chunks should return exactly _MAX_FETCH_CHUNKS_PER_DOC chunks."""
    space_id = seed_large_doc["search_space"].id
    retriever = ChucksHybridSearchRetriever(db_session)
    results = await retriever.hybrid_search(
        query_text="quarterly performance review",
        top_k=10,
        search_space_id=space_id,
        query_embedding=DUMMY_EMBEDDING,
    )
    large_doc_id = seed_large_doc["large_doc"].id
    # Find the seeded 35-chunk document in the results and verify the cap.
    for result in results:
        if result["document"].get("id") == large_doc_id:
            # 35 seeded chunks exceed the limit, so exactly the cap must come back;
            # the previous "<=" assertion was redundant with this equality check.
            assert len(result["chunks"]) == _MAX_FETCH_CHUNKS_PER_DOC
            break
    else:
        pytest.fail("Large doc not found in search results")
async def test_doc_metadata_populated_from_rrf(db_session, seed_large_doc):
    """Document metadata (title, type, etc.) should be present even without joinedload."""
    retriever = ChucksHybridSearchRetriever(db_session)
    results = await retriever.hybrid_search(
        query_text="quarterly performance review",
        top_k=10,
        search_space_id=seed_large_doc["search_space"].id,
        query_embedding=DUMMY_EMBEDDING,
    )
    assert len(results) >= 1
    for entry in results:
        metadata = entry["document"]
        # Core identity fields must be carried over from the RRF query rows.
        assert "id" in metadata
        assert "title" in metadata
        assert metadata["title"]
        assert "document_type" in metadata
        assert metadata["document_type"] is not None
async def test_matched_chunk_ids_tracked(db_session, seed_large_doc):
    """matched_chunk_ids should contain the chunk IDs that appeared in the RRF results."""
    retriever = ChucksHybridSearchRetriever(db_session)
    results = await retriever.hybrid_search(
        query_text="quarterly performance review",
        top_k=10,
        search_space_id=seed_large_doc["search_space"].id,
        query_embedding=DUMMY_EMBEDDING,
    )
    for entry in results:
        returned_ids = {chunk["chunk_id"] for chunk in entry["chunks"]}
        # Every RRF-matched chunk ID must also appear in the returned chunk list.
        for mid in entry.get("matched_chunk_ids", []):
            assert mid in returned_ids, (
                f"matched_chunk_id {mid} not found in chunks"
            )
async def test_chunks_ordered_by_id(db_session, seed_large_doc):
    """Chunks within each document should be ordered by chunk ID (original order)."""
    retriever = ChucksHybridSearchRetriever(db_session)
    results = await retriever.hybrid_search(
        query_text="quarterly performance review",
        top_k=10,
        search_space_id=seed_large_doc["search_space"].id,
        query_embedding=DUMMY_EMBEDDING,
    )
    for entry in results:
        ids = [chunk["chunk_id"] for chunk in entry["chunks"]]
        assert ids == sorted(ids), "Chunks not ordered by ID"
async def test_score_is_positive_float(db_session, seed_large_doc):
    """Each result should have a positive float score from RRF."""
    retriever = ChucksHybridSearchRetriever(db_session)
    results = await retriever.hybrid_search(
        query_text="quarterly performance review",
        top_k=10,
        search_space_id=seed_large_doc["search_space"].id,
        query_embedding=DUMMY_EMBEDDING,
    )
    assert len(results) >= 1
    for entry in results:
        score = entry["score"]
        # RRF scores are sums of positive reciprocal ranks, so always > 0.
        assert isinstance(score, float)
        assert score > 0

View file

@ -0,0 +1,76 @@
"""Integration tests for optimized DocumentHybridSearchRetriever.
Verifies the SQL ROW_NUMBER per-doc chunk limit and column pruning.
"""
import pytest
from app.retriever.documents_hybrid_search import (
_MAX_FETCH_CHUNKS_PER_DOC,
DocumentHybridSearchRetriever,
)
from .conftest import DUMMY_EMBEDDING
# Mark every test in this module as an integration test (needs a real database).
pytestmark = pytest.mark.integration
async def test_per_doc_chunk_limit_respected(db_session, seed_large_doc):
    """A document with 35 chunks should return exactly _MAX_FETCH_CHUNKS_PER_DOC chunks."""
    space_id = seed_large_doc["search_space"].id
    retriever = DocumentHybridSearchRetriever(db_session)
    results = await retriever.hybrid_search(
        query_text="quarterly performance review",
        top_k=10,
        search_space_id=space_id,
        query_embedding=DUMMY_EMBEDDING,
    )
    large_doc_id = seed_large_doc["large_doc"].id
    # Find the seeded 35-chunk document in the results and verify the cap.
    for result in results:
        if result["document"].get("id") == large_doc_id:
            # 35 seeded chunks exceed the limit, so exactly the cap must come back;
            # the previous "<=" assertion was redundant with this equality check.
            assert len(result["chunks"]) == _MAX_FETCH_CHUNKS_PER_DOC
            break
    else:
        pytest.fail("Large doc not found in search results")
async def test_doc_metadata_populated(db_session, seed_large_doc):
    """Document metadata should be present from the RRF results."""
    retriever = DocumentHybridSearchRetriever(db_session)
    results = await retriever.hybrid_search(
        query_text="quarterly performance review",
        top_k=10,
        search_space_id=seed_large_doc["search_space"].id,
        query_embedding=DUMMY_EMBEDDING,
    )
    assert len(results) >= 1
    for entry in results:
        metadata = entry["document"]
        # Core identity fields must be carried over from the RRF query rows.
        assert "id" in metadata
        assert "title" in metadata
        assert metadata["title"]
        assert "document_type" in metadata
        assert metadata["document_type"] is not None
async def test_chunks_ordered_by_id(db_session, seed_large_doc):
    """Chunks within each document should be ordered by chunk ID."""
    retriever = DocumentHybridSearchRetriever(db_session)
    results = await retriever.hybrid_search(
        query_text="quarterly performance review",
        top_k=10,
        search_space_id=seed_large_doc["search_space"].id,
        query_embedding=DUMMY_EMBEDDING,
    )
    for entry in results:
        ids = [chunk["chunk_id"] for chunk in entry["chunks"]]
        assert ids == sorted(ids), "Chunks not ordered by ID"