feat: optimize agent file system

DESKTOP-RTLN3BA\$punk 2026-03-28 16:39:46 -07:00
parent ee0b59c0fa
commit 2cc2d339e6
67 changed files with 8011 additions and 5591 deletions


@@ -5,7 +5,7 @@ from datetime import datetime
from app.utils.perf import get_perf_logger
_MAX_FETCH_CHUNKS_PER_DOC = 30
_MAX_FETCH_CHUNKS_PER_DOC = 20
class ChucksHybridSearchRetriever:
@@ -185,7 +185,7 @@ class ChucksHybridSearchRetriever:
- chunks: list[{chunk_id, content}] for citation-aware prompting
- document: {id, title, document_type, metadata}
"""
from sqlalchemy import func, select, text
from sqlalchemy import func, or_, select, text
from sqlalchemy.orm import joinedload
from app.config import config
@@ -360,64 +360,81 @@ class ChucksHybridSearchRetriever:
if not doc_ids:
return []
# Fetch chunks for selected documents. We cap per document to avoid
# loading hundreds of chunks for a single large file while still
# ensuring the chunks that matched the RRF query are always included.
chunk_query = (
select(Chunk)
.options(joinedload(Chunk.document))
.join(Document, Chunk.document_id == Document.id)
.where(Document.id.in_(doc_ids))
.where(*base_conditions)
.order_by(Chunk.document_id, Chunk.id)
)
chunks_result = await self.db_session.execute(chunk_query)
raw_chunks = chunks_result.scalars().all()
# Collect document metadata from the small RRF result set (already
# loaded via joinedload) so the bulk chunk fetch can skip the expensive
# Document JOIN entirely.
matched_chunk_ids: set[int] = {
item["chunk_id"] for item in serialized_chunk_results
}
doc_meta_cache: dict[int, dict] = {}
for item in serialized_chunk_results:
did = item["document"]["id"]
if did not in doc_meta_cache:
doc_meta_cache[did] = item["document"]
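# doc_meta_cache now maps document_id -> serialized document dict for every
# document surfaced by the RRF results.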
doc_chunk_counts: dict[int, int] = {}
all_chunks: list = []
for chunk in raw_chunks:
did = chunk.document_id
count = doc_chunk_counts.get(did, 0)
if chunk.id in matched_chunk_ids or count < _MAX_FETCH_CHUNKS_PER_DOC:
all_chunks.append(chunk)
doc_chunk_counts[did] = count + 1
# SQL-level per-document chunk limit using ROW_NUMBER().
# Avoids loading hundreds of chunks per large document only to
# discard them in Python.
numbered = (
select(
Chunk.id.label("chunk_id"),
func.row_number()
.over(partition_by=Chunk.document_id, order_by=Chunk.id)
.label("rn"),
)
.where(Chunk.document_id.in_(doc_ids))
.subquery("numbered")
)
# Assemble final doc-grouped results in the same order as doc_ids
matched_list = list(matched_chunk_ids)
if matched_list:
chunk_filter = or_(
numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC,
Chunk.id.in_(matched_list),
)
else:
chunk_filter = numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC
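# Either way, chunks that matched the RRF query are always kept, even when
# they fall past the per-document ROW_NUMBER() cap.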
# Select only the columns we need (skip Chunk.embedding ~12KB/row).
chunk_query = (
select(Chunk.id, Chunk.content, Chunk.document_id)
.join(numbered, Chunk.id == numbered.c.chunk_id)
.where(chunk_filter)
.order_by(Chunk.document_id, Chunk.id)
)
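# For reference, roughly the SQL this emits (illustrative sketch; the real
# table/column names depend on the Chunk mapping, assumed here to be "chunks"):
#   SELECT chunks.id, chunks.content, chunks.document_id
#   FROM chunks
#   JOIN (
#       SELECT chunks.id AS chunk_id,
#              ROW_NUMBER() OVER (PARTITION BY chunks.document_id
#                                 ORDER BY chunks.id) AS rn
#       FROM chunks
#       WHERE chunks.document_id IN (:doc_ids)
#   ) AS numbered ON chunks.id = numbered.chunk_id
#   WHERE numbered.rn <= :max_per_doc OR chunks.id IN (:matched_ids)
#   ORDER BY chunks.document_id, chunks.id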
t_fetch = time.perf_counter()
chunks_result = await self.db_session.execute(chunk_query)
fetched_chunks = chunks_result.all()
perf.debug(
"[chunk_search] chunk fetch in %.3fs rows=%d",
time.perf_counter() - t_fetch,
len(fetched_chunks),
)
# Assemble final doc-grouped results in the same order as doc_ids,
# using pre-cached doc metadata instead of joinedload.
doc_map: dict[int, dict] = {
doc_id: {
"document_id": doc_id,
"content": "",
"score": float(doc_scores.get(doc_id, 0.0)),
"chunks": [],
"document": {},
"source": None,
"matched_chunk_ids": [],
"document": doc_meta_cache.get(doc_id, {}),
"source": (doc_meta_cache.get(doc_id) or {}).get("document_type"),
}
for doc_id in doc_ids
}
for chunk in all_chunks:
doc = chunk.document
doc_id = doc.id
for row in fetched_chunks:
doc_id = row.document_id
if doc_id not in doc_map:
continue
doc_entry = doc_map[doc_id]
doc_entry["document"] = {
"id": doc.id,
"title": doc.title,
"document_type": doc.document_type.value
if getattr(doc, "document_type", None)
else None,
"metadata": doc.document_metadata or {},
}
doc_entry["source"] = (
doc.document_type.value if getattr(doc, "document_type", None) else None
)
doc_entry["chunks"].append({"chunk_id": chunk.id, "content": chunk.content})
doc_entry["chunks"].append({"chunk_id": row.id, "content": row.content})
if row.id in matched_chunk_ids:
doc_entry["matched_chunk_ids"].append(row.id)
# Fill concatenated content (useful for reranking)
final_docs: list[dict] = []