Mirror of https://github.com/MODSetter/SurfSense.git (synced 2026-05-08 07:12:39 +02:00)
feat: made agent file system optimized
commit 2cc2d339e6 (parent ee0b59c0fa)
67 changed files with 8011 additions and 5591 deletions
@@ -5,7 +5,7 @@ from datetime import datetime
+from app.utils.perf import get_perf_logger
 
-_MAX_FETCH_CHUNKS_PER_DOC = 30
+_MAX_FETCH_CHUNKS_PER_DOC = 20
 
 
 class ChucksHybridSearchRetriever:
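The new get_perf_logger import feeds the perf.debug timing calls added in the large hunk below. Its implementation lives in app.utils.perf and is not part of this diff; as a loose sketch (hypothetical, not the project's actual code), such a helper is often just a namespaced wrapper over the standard logging module:

# Hypothetical sketch only; the real app.utils.perf is not shown in this commit.
import logging


def get_perf_logger(name: str = "perf") -> logging.Logger:
    """Return a logger reserved for performance/timing messages."""
    logger = logging.getLogger(f"app.perf.{name}")
    # Stay silent unless the logging config explicitly enables the perf namespace.
    logger.addHandler(logging.NullHandler())
    return logger


perf = get_perf_logger("chunk_search")
perf.debug("chunk fetch in %.3fs rows=%d", 0.042, 120)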
@@ -185,7 +185,7 @@ class ChucksHybridSearchRetriever:
         - chunks: list[{chunk_id, content}] for citation-aware prompting
         - document: {id, title, document_type, metadata}
         """
-        from sqlalchemy import func, select, text
+        from sqlalchemy import func, or_, select, text
         from sqlalchemy.orm import joinedload
 
         from app.config import config
@@ -360,64 +360,81 @@ class ChucksHybridSearchRetriever:
         if not doc_ids:
             return []
 
-        # Fetch chunks for selected documents. We cap per document to avoid
-        # loading hundreds of chunks for a single large file while still
-        # ensuring the chunks that matched the RRF query are always included.
-        chunk_query = (
-            select(Chunk)
-            .options(joinedload(Chunk.document))
-            .join(Document, Chunk.document_id == Document.id)
-            .where(Document.id.in_(doc_ids))
-            .where(*base_conditions)
-            .order_by(Chunk.document_id, Chunk.id)
-        )
-        chunks_result = await self.db_session.execute(chunk_query)
-        raw_chunks = chunks_result.scalars().all()
-
+        # Collect document metadata from the small RRF result set (already
+        # loaded via joinedload) so the bulk chunk fetch can skip the expensive
+        # Document JOIN entirely.
+        matched_chunk_ids: set[int] = {
+            item["chunk_id"] for item in serialized_chunk_results
+        }
+        doc_meta_cache: dict[int, dict] = {}
+        for item in serialized_chunk_results:
+            did = item["document"]["id"]
+            if did not in doc_meta_cache:
+                doc_meta_cache[did] = item["document"]
+
-        doc_chunk_counts: dict[int, int] = {}
-        all_chunks: list = []
-        for chunk in raw_chunks:
-            did = chunk.document_id
-            count = doc_chunk_counts.get(did, 0)
-            if chunk.id in matched_chunk_ids or count < _MAX_FETCH_CHUNKS_PER_DOC:
-                all_chunks.append(chunk)
-                doc_chunk_counts[did] = count + 1
+        # SQL-level per-document chunk limit using ROW_NUMBER().
+        # Avoids loading hundreds of chunks per large document only to
+        # discard them in Python.
+        numbered = (
+            select(
+                Chunk.id.label("chunk_id"),
+                func.row_number()
+                .over(partition_by=Chunk.document_id, order_by=Chunk.id)
+                .label("rn"),
+            )
+            .where(Chunk.document_id.in_(doc_ids))
+            .subquery("numbered")
+        )
+
-        # Assemble final doc-grouped results in the same order as doc_ids
+        matched_list = list(matched_chunk_ids)
+        if matched_list:
+            chunk_filter = or_(
+                numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC,
+                Chunk.id.in_(matched_list),
+            )
+        else:
+            chunk_filter = numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC
+
+        # Select only the columns we need (skip Chunk.embedding ~12KB/row).
+        chunk_query = (
+            select(Chunk.id, Chunk.content, Chunk.document_id)
+            .join(numbered, Chunk.id == numbered.c.chunk_id)
+            .where(chunk_filter)
+            .order_by(Chunk.document_id, Chunk.id)
+        )
+
+        t_fetch = time.perf_counter()
+        chunks_result = await self.db_session.execute(chunk_query)
+        fetched_chunks = chunks_result.all()
+        perf.debug(
+            "[chunk_search] chunk fetch in %.3fs rows=%d",
+            time.perf_counter() - t_fetch,
+            len(fetched_chunks),
+        )
+
+        # Assemble final doc-grouped results in the same order as doc_ids,
+        # using pre-cached doc metadata instead of joinedload.
         doc_map: dict[int, dict] = {
             doc_id: {
                 "document_id": doc_id,
                 "content": "",
                 "score": float(doc_scores.get(doc_id, 0.0)),
                 "chunks": [],
-                "document": {},
-                "source": None,
+                "matched_chunk_ids": [],
+                "document": doc_meta_cache.get(doc_id, {}),
+                "source": (doc_meta_cache.get(doc_id) or {}).get("document_type"),
             }
             for doc_id in doc_ids
         }
 
-        for chunk in all_chunks:
-            doc = chunk.document
-            doc_id = doc.id
+        for row in fetched_chunks:
+            doc_id = row.document_id
             if doc_id not in doc_map:
                 continue
             doc_entry = doc_map[doc_id]
-            doc_entry["document"] = {
-                "id": doc.id,
-                "title": doc.title,
-                "document_type": doc.document_type.value
-                if getattr(doc, "document_type", None)
-                else None,
-                "metadata": doc.document_metadata or {},
-            }
-            doc_entry["source"] = (
-                doc.document_type.value if getattr(doc, "document_type", None) else None
-            )
-            doc_entry["chunks"].append({"chunk_id": chunk.id, "content": chunk.content})
+            doc_entry["chunks"].append({"chunk_id": row.id, "content": row.content})
+            if row.id in matched_chunk_ids:
+                doc_entry["matched_chunk_ids"].append(row.id)
 
         # Fill concatenated content (useful for reranking)
         final_docs: list[dict] = []
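The core of the change is the ROW_NUMBER() window subquery, which caps chunks per document inside the database instead of filtering rows in Python. Below is a minimal, self-contained sketch of that pattern against an in-memory SQLite database (it needs the aiosqlite driver); the Chunk model, cap value, and matched ids are simplified stand-ins, not the real SurfSense schema:

import asyncio

from sqlalchemy import func, or_, select
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column

MAX_CHUNKS_PER_DOC = 2  # stand-in for _MAX_FETCH_CHUNKS_PER_DOC


class Base(DeclarativeBase):
    pass


class Chunk(Base):
    __tablename__ = "chunks"

    id: Mapped[int] = mapped_column(primary_key=True)
    document_id: Mapped[int] = mapped_column()
    content: Mapped[str] = mapped_column()


async def main() -> None:
    engine = create_async_engine("sqlite+aiosqlite:///:memory:")
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)

    async with async_sessionmaker(engine)() as session:
        session.add_all(
            [
                Chunk(document_id=d, content=f"doc {d} chunk {i}")
                for d in (1, 2)
                for i in range(5)
            ]
        )
        await session.commit()

        # Number chunks within each document, ordered by primary key.
        numbered = select(
            Chunk.id.label("chunk_id"),
            func.row_number()
            .over(partition_by=Chunk.document_id, order_by=Chunk.id)
            .label("rn"),
        ).subquery("numbered")

        # Keep the first N chunks per document, plus any explicitly matched ids
        # (mirroring how RRF-matched chunks always survive the cap).
        matched_ids = [10]
        rows = await session.execute(
            select(Chunk.id, Chunk.document_id, Chunk.content)
            .join(numbered, Chunk.id == numbered.c.chunk_id)
            .where(
                or_(
                    numbered.c.rn <= MAX_CHUNKS_PER_DOC,
                    Chunk.id.in_(matched_ids),
                )
            )
            .order_by(Chunk.document_id, Chunk.id)
        )
        for chunk_id, doc_id, content in rows:
            print(chunk_id, doc_id, content)


asyncio.run(main())

Running it prints at most MAX_CHUNKS_PER_DOC rows per document plus the explicitly matched chunk (id 10 here), so the cap never drops a chunk that the search actually hit.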
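The fetch query also narrows its SELECT list so the wide embedding column (roughly 12 KB per row, per the comment) never leaves the database. Assuming a Chunk model like the one in the sketch above, the same effect can be had either by projecting plain columns, as the commit does, or by keeping ORM entities and restricting loaded attributes with load_only():

# Sketch only, against the Chunk model from the previous sketch.
from sqlalchemy import select
from sqlalchemy.orm import load_only

# 1. Column projection (what the commit does): only the listed columns are fetched,
#    and the result rows are lightweight tuples rather than ORM objects.
light_rows = select(Chunk.id, Chunk.content, Chunk.document_id)

# 2. Alternative: full ORM Chunk objects, but unlisted columns (e.g. a large
#    embedding) are deferred and only fetched if actually accessed.
light_entities = select(Chunk).options(
    load_only(Chunk.id, Chunk.content, Chunk.document_id)
)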
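Finally, the metadata-caching step: document titles and types are copied out of the small RRF result set up front, so the bulk chunk fetch no longer needs joinedload(Chunk.document) or a JOIN against Document at all. A compressed sketch of that cache-then-assemble pattern, using made-up sample rows shaped like the docstring above describes:

# Hypothetical sample rows; the real ones come from the earlier RRF query.
serialized_chunk_results = [
    {"chunk_id": 9, "document": {"id": 1, "title": "Spec", "document_type": "FILE"}},
    {"chunk_id": 14, "document": {"id": 2, "title": "Notes", "document_type": "NOTE"}},
]

# One pass over the small result set caches each document's metadata...
doc_meta_cache: dict[int, dict] = {}
for item in serialized_chunk_results:
    doc_meta_cache.setdefault(item["document"]["id"], item["document"])

# ...so per-document entries can be seeded without another database round trip.
doc_map = {
    doc_id: {"document": meta, "source": meta.get("document_type"), "chunks": []}
    for doc_id, meta in doc_meta_cache.items()
}
print(doc_map[1]["source"])  # -> FILE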