Mirror of https://github.com/MODSetter/SurfSense.git
Synced 2026-04-27 01:36:30 +02:00
feat: made agent file system optimized
parent ee0b59c0fa
commit 2cc2d339e6
67 changed files with 8011 additions and 5591 deletions
@@ -4,7 +4,7 @@ from datetime import datetime
+from app.utils.perf import get_perf_logger
 
-_MAX_FETCH_CHUNKS_PER_DOC = 30
+_MAX_FETCH_CHUNKS_PER_DOC = 20
 
 
 class DocumentHybridSearchRetriever:
@@ -289,57 +289,77 @@ class DocumentHybridSearchRetriever:
         if not documents_with_scores:
             return []
 
-        # Collect document IDs for chunk fetching
+        # Collect document IDs and pre-cache metadata from the small RRF
+        # result set so the bulk chunk fetch can skip joinedload entirely.
         doc_ids: list[int] = [doc.id for doc, _score in documents_with_scores]
 
-        # Fetch chunks for these documents, capped per document to avoid
-        # loading hundreds of chunks for a single large file.
-        chunks_query = (
-            select(Chunk)
-            .options(joinedload(Chunk.document))
-            .where(Chunk.document_id.in_(doc_ids))
-            .order_by(Chunk.document_id, Chunk.id)
-        )
-        chunks_result = await self.db_session.execute(chunks_query)
-        raw_chunks = chunks_result.scalars().all()
-
-        doc_chunk_counts: dict[int, int] = {}
-        chunks: list = []
-        for chunk in raw_chunks:
-            did = chunk.document_id
-            count = doc_chunk_counts.get(did, 0)
-            if count < _MAX_FETCH_CHUNKS_PER_DOC:
-                chunks.append(chunk)
-                doc_chunk_counts[did] = count + 1
-
-        # Assemble doc-grouped results
-        doc_map: dict[int, dict] = {
-            doc.id: {
-                "document_id": doc.id,
-                "content": "",
-                "score": float(score),
-                "chunks": [],
-                "document": {
-                    "id": doc.id,
-                    "title": doc.title,
-                    "document_type": doc.document_type.value
-                    if getattr(doc, "document_type", None)
-                    else None,
-                    "metadata": doc.document_metadata or {},
-                },
-                "source": doc.document_type.value
-                if getattr(doc, "document_type", None)
-                else None,
-            }
-            for doc, score in documents_with_scores
-        }
+        doc_meta_cache: dict[int, dict] = {}
+        doc_score_cache: dict[int, float] = {}
+        doc_source_cache: dict[int, str | None] = {}
+        for doc, score in documents_with_scores:
+            doc_meta_cache[doc.id] = {
+                "id": doc.id,
+                "title": doc.title,
+                "document_type": doc.document_type.value
+                if getattr(doc, "document_type", None)
+                else None,
+                "metadata": doc.document_metadata or {},
+            }
+            doc_score_cache[doc.id] = float(score)
+            doc_source_cache[doc.id] = (
+                doc.document_type.value if getattr(doc, "document_type", None) else None
+            )
+
+        # SQL-level per-document chunk limit using ROW_NUMBER().
+        # Avoids loading hundreds of chunks per large document only to
+        # discard them in Python.
+        numbered = (
+            select(
+                Chunk.id.label("chunk_id"),
+                func.row_number()
+                .over(partition_by=Chunk.document_id, order_by=Chunk.id)
+                .label("rn"),
+            )
+            .where(Chunk.document_id.in_(doc_ids))
+            .subquery("numbered")
+        )
+
+        # Select only the columns we need (skip Chunk.embedding ~12KB/row).
+        chunks_query = (
+            select(Chunk.id, Chunk.content, Chunk.document_id)
+            .join(numbered, Chunk.id == numbered.c.chunk_id)
+            .where(numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC)
+            .order_by(Chunk.document_id, Chunk.id)
+        )
+
+        t_fetch = time.perf_counter()
+        chunks_result = await self.db_session.execute(chunks_query)
+        fetched_chunks = chunks_result.all()
+        perf.debug(
+            "[doc_search] chunk fetch in %.3fs rows=%d",
+            time.perf_counter() - t_fetch,
+            len(fetched_chunks),
+        )
+
+        # Assemble doc-grouped results using pre-cached metadata.
+        doc_map: dict[int, dict] = {
+            doc_id: {
+                "document_id": doc_id,
+                "content": "",
+                "score": doc_score_cache.get(doc_id, 0.0),
+                "chunks": [],
+                "matched_chunk_ids": [],
+                "document": doc_meta_cache.get(doc_id, {}),
+                "source": doc_source_cache.get(doc_id),
+            }
+            for doc_id in doc_ids
+        }
 
-        for chunk in chunks:
-            doc_id = chunk.document_id
+        for row in fetched_chunks:
+            doc_id = row.document_id
             if doc_id not in doc_map:
                 continue
             doc_map[doc_id]["chunks"].append(
-                {"chunk_id": chunk.id, "content": chunk.content}
+                {"chunk_id": row.id, "content": row.content}
             )
 
         # Fill concatenated content (useful for reranking)
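The heart of the change is the ROW_NUMBER() window query, which caps chunks per document inside the database instead of in Python. Below is a self-contained sketch of the same pattern; the Chunk model and in-memory SQLite engine here are simplified stand-ins (the real SurfSense model also carries the large embedding column that the narrowed select deliberately never reads).

from sqlalchemy import Integer, Text, create_engine, func, select
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column


class Base(DeclarativeBase):
    pass


class Chunk(Base):
    # Simplified stand-in; the real table also holds an embedding
    # column (~12KB/row) that we never put in the select list.
    __tablename__ = "chunks"
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    document_id: Mapped[int] = mapped_column(Integer, index=True)
    content: Mapped[str] = mapped_column(Text)


MAX_FETCH_CHUNKS_PER_DOC = 20

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    # 50 chunks each for two documents; only 20 per document should return.
    session.add_all(
        Chunk(document_id=d, content=f"doc{d}-chunk{i}")
        for d in (1, 2)
        for i in range(50)
    )
    session.commit()

    doc_ids = [1, 2]

    # Number each document's chunks 1..N inside the database ...
    numbered = (
        select(
            Chunk.id.label("chunk_id"),
            func.row_number()
            .over(partition_by=Chunk.document_id, order_by=Chunk.id)
            .label("rn"),
        )
        .where(Chunk.document_id.in_(doc_ids))
        .subquery("numbered")
    )

    # ... then keep only the first MAX_FETCH_CHUNKS_PER_DOC of each,
    # selecting just the columns the caller needs.
    chunks_query = (
        select(Chunk.id, Chunk.content, Chunk.document_id)
        .join(numbered, Chunk.id == numbered.c.chunk_id)
        .where(numbered.c.rn <= MAX_FETCH_CHUNKS_PER_DOC)
        .order_by(Chunk.document_id, Chunk.id)
    )

    rows = session.execute(chunks_query).all()
    assert len(rows) == 2 * MAX_FETCH_CHUNKS_PER_DOC

The excess rows never leave the database, which is why this beats the old fetch-everything-then-cap loop; the pattern works on any backend with window-function support (PostgreSQL, or SQLite 3.25+).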
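The companion optimization drops joinedload(Chunk.document) in favor of plain dictionaries built once from the small RRF result set, so each fetched chunk row needs only a dict lookup rather than a per-row ORM relationship. A minimal sketch of that grouping step, with a hypothetical dataclass standing in for the ORM Document model:

from dataclasses import dataclass, field
from enum import Enum


class DocumentType(Enum):
    # Hypothetical member; the real enum lives in SurfSense's models.
    FILE = "FILE"


@dataclass
class Document:
    id: int
    title: str
    document_type: DocumentType | None = DocumentType.FILE
    document_metadata: dict = field(default_factory=dict)


# Pretend output of the hybrid-search RRF stage: (document, score) pairs.
documents_with_scores = [(Document(1, "notes.md"), 0.91), (Document(2, "spec.md"), 0.64)]

# Build the per-document caches once, from the handful of RRF hits ...
doc_meta_cache: dict[int, dict] = {}
doc_score_cache: dict[int, float] = {}
doc_source_cache: dict[int, str | None] = {}
for doc, score in documents_with_scores:
    dt = doc.document_type.value if doc.document_type else None
    doc_meta_cache[doc.id] = {
        "id": doc.id,
        "title": doc.title,
        "document_type": dt,
        "metadata": doc.document_metadata or {},
    }
    doc_score_cache[doc.id] = float(score)
    doc_source_cache[doc.id] = dt

# ... so grouping the fetched (id, content, document_id) rows is pure
# dictionary work, with no lazy loads and no Document rows re-fetched.
fetched_chunks = [(10, "first chunk", 1), (11, "second chunk", 1)]
doc_map = {
    doc_id: {
        "document_id": doc_id,
        "score": doc_score_cache.get(doc_id, 0.0),
        "chunks": [],
        "document": doc_meta_cache.get(doc_id, {}),
        "source": doc_source_cache.get(doc_id),
    }
    for doc_id in doc_meta_cache
}
for chunk_id, content, doc_id in fetched_chunks:
    doc_map[doc_id]["chunks"].append({"chunk_id": chunk_id, "content": content})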
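Finally, the perf.debug calls rely on the newly imported get_perf_logger. Assuming it returns a standard logging.Logger (the stand-in below is hypothetical; the real helper lives in app.utils.perf and may configure handlers differently), the timing pattern is just perf_counter() deltas passed as deferred %-style logging arguments:

import logging
import time


def get_perf_logger(name: str) -> logging.Logger:
    # Hypothetical stand-in for app.utils.perf.get_perf_logger.
    return logging.getLogger(f"perf.{name}")


logging.basicConfig(level=logging.DEBUG)
perf = get_perf_logger(__name__)

t_fetch = time.perf_counter()
fetched_chunks = [{"chunk_id": i} for i in range(40)]  # stand-in for the awaited DB fetch
perf.debug(
    "[doc_search] chunk fetch in %.3fs rows=%d",
    time.perf_counter() - t_fetch,
    len(fetched_chunks),
)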