feat: optimize agent file system

DESKTOP-RTLN3BA\$punk 2026-03-28 16:39:46 -07:00
parent ee0b59c0fa
commit 2cc2d339e6
67 changed files with 8011 additions and 5591 deletions


@@ -4,7 +4,7 @@ from datetime import datetime
 from app.utils.perf import get_perf_logger
-_MAX_FETCH_CHUNKS_PER_DOC = 30
+_MAX_FETCH_CHUNKS_PER_DOC = 20
 class DocumentHybridSearchRetriever:
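
The perf import above pairs with the timing added in the second hunk below. A minimal sketch of that pattern, assuming get_perf_logger(__name__) returns a logging.Logger-style object (its actual signature is not shown in this diff):

import time
from app.utils.perf import get_perf_logger

perf = get_perf_logger(__name__)  # assumed call form; not shown in the diff

t0 = time.perf_counter()
rows = run_query()  # placeholder for the awaited DB call in the retriever
perf.debug("[doc_search] chunk fetch in %.3fs rows=%d", time.perf_counter() - t0, len(rows))
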
@@ -289,57 +289,77 @@ class DocumentHybridSearchRetriever:
         if not documents_with_scores:
             return []
-        # Collect document IDs for chunk fetching
+        # Collect document IDs and pre-cache metadata from the small RRF
+        # result set so the bulk chunk fetch can skip joinedload entirely.
         doc_ids: list[int] = [doc.id for doc, _score in documents_with_scores]
-        # Fetch chunks for these documents, capped per document to avoid
-        # loading hundreds of chunks for a single large file.
-        chunks_query = (
-            select(Chunk)
-            .options(joinedload(Chunk.document))
-            .where(Chunk.document_id.in_(doc_ids))
-            .order_by(Chunk.document_id, Chunk.id)
-        )
-        chunks_result = await self.db_session.execute(chunks_query)
-        raw_chunks = chunks_result.scalars().all()
-        doc_chunk_counts: dict[int, int] = {}
-        chunks: list = []
-        for chunk in raw_chunks:
-            did = chunk.document_id
-            count = doc_chunk_counts.get(did, 0)
-            if count < _MAX_FETCH_CHUNKS_PER_DOC:
-                chunks.append(chunk)
-                doc_chunk_counts[did] = count + 1
-        # Assemble doc-grouped results
-        doc_map: dict[int, dict] = {
-            doc.id: {
-                "document_id": doc.id,
-                "content": "",
-                "score": float(score),
-                "chunks": [],
-                "document": {
-                    "id": doc.id,
-                    "title": doc.title,
-                    "document_type": doc.document_type.value
-                    if getattr(doc, "document_type", None)
-                    else None,
-                    "metadata": doc.document_metadata or {},
-                },
-                "source": doc.document_type.value
+        doc_meta_cache: dict[int, dict] = {}
+        doc_score_cache: dict[int, float] = {}
+        doc_source_cache: dict[int, str | None] = {}
+        for doc, score in documents_with_scores:
+            doc_meta_cache[doc.id] = {
+                "id": doc.id,
+                "title": doc.title,
+                "document_type": doc.document_type.value
+                if getattr(doc, "document_type", None)
+                else None,
+                "metadata": doc.document_metadata or {},
+            }
-            for doc, score in documents_with_scores
+            doc_score_cache[doc.id] = float(score)
+            doc_source_cache[doc.id] = (
+                doc.document_type.value if getattr(doc, "document_type", None) else None
+            )
+        # SQL-level per-document chunk limit using ROW_NUMBER().
+        # Avoids loading hundreds of chunks per large document only to
+        # discard them in Python.
+        numbered = (
+            select(
+                Chunk.id.label("chunk_id"),
+                func.row_number()
+                .over(partition_by=Chunk.document_id, order_by=Chunk.id)
+                .label("rn"),
+            )
+            .where(Chunk.document_id.in_(doc_ids))
+            .subquery("numbered")
+        )
+        # Select only the columns we need (skip Chunk.embedding ~12KB/row).
+        chunks_query = (
+            select(Chunk.id, Chunk.content, Chunk.document_id)
+            .join(numbered, Chunk.id == numbered.c.chunk_id)
+            .where(numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC)
+            .order_by(Chunk.document_id, Chunk.id)
+        )
+        t_fetch = time.perf_counter()
+        chunks_result = await self.db_session.execute(chunks_query)
+        fetched_chunks = chunks_result.all()
+        perf.debug(
+            "[doc_search] chunk fetch in %.3fs rows=%d",
+            time.perf_counter() - t_fetch,
+            len(fetched_chunks),
+        )
+        # Assemble doc-grouped results using pre-cached metadata.
+        doc_map: dict[int, dict] = {
+            doc_id: {
+                "document_id": doc_id,
+                "content": "",
+                "score": doc_score_cache.get(doc_id, 0.0),
+                "chunks": [],
+                "matched_chunk_ids": [],
+                "document": doc_meta_cache.get(doc_id, {}),
+                "source": doc_source_cache.get(doc_id),
+            }
+            for doc_id in doc_ids
+        }
-        for chunk in chunks:
-            doc_id = chunk.document_id
+        for row in fetched_chunks:
+            doc_id = row.document_id
             if doc_id not in doc_map:
                 continue
             doc_map[doc_id]["chunks"].append(
-                {"chunk_id": chunk.id, "content": chunk.content}
+                {"chunk_id": row.id, "content": row.content}
             )
         # Fill concatenated content (useful for reranking)
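
The ROW_NUMBER() pattern behind the new chunks_query is easier to see outside the ORM. Below is a self-contained sketch using an illustrative SQLAlchemy Core table (the names are placeholders, not the project's models); it keeps at most N rows per document_id and roughly compiles to the SQL shown in the trailing comment:

from sqlalchemy import Column, Integer, MetaData, Table, Text, func, select

metadata = MetaData()
chunks = Table(
    "chunks",
    metadata,
    Column("id", Integer, primary_key=True),
    Column("document_id", Integer),
    Column("content", Text),
)

MAX_PER_DOC = 20  # mirrors _MAX_FETCH_CHUNKS_PER_DOC

# Number each chunk within its document, smallest id first.
numbered = (
    select(
        chunks.c.id.label("chunk_id"),
        func.row_number()
        .over(partition_by=chunks.c.document_id, order_by=chunks.c.id)
        .label("rn"),
    )
    .where(chunks.c.document_id.in_([1, 2, 3]))  # placeholder document ids
    .subquery("numbered")
)

# Join back to a narrow column list and keep only the first N rows per document,
# so oversized documents never ship hundreds of rows (or embeddings) to Python.
capped = (
    select(chunks.c.id, chunks.c.content, chunks.c.document_id)
    .join(numbered, chunks.c.id == numbered.c.chunk_id)
    .where(numbered.c.rn <= MAX_PER_DOC)
    .order_by(chunks.c.document_id, chunks.c.id)
)

# Roughly: SELECT chunks.id, chunks.content, chunks.document_id
#          FROM chunks JOIN (
#              SELECT chunks.id AS chunk_id,
#                     ROW_NUMBER() OVER (PARTITION BY chunks.document_id
#                                        ORDER BY chunks.id) AS rn
#              FROM chunks WHERE chunks.document_id IN (...)
#          ) AS numbered ON chunks.id = numbered.chunk_id
#          WHERE numbered.rn <= 20
#          ORDER BY chunks.document_id, chunks.id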