feat(backend): Remove LLM summaries from document indexing

This commit is contained in:
Anish Sarkar 2026-06-04 00:50:19 +05:30
parent 290a9539ef
commit 81fa219b30
17 changed files with 40 additions and 518 deletions

View file

@ -9,7 +9,6 @@ from litellm import get_model_info, token_counter
from app.config import config
from app.db import Chunk, DocumentType
from app.prompts import SUMMARY_PROMPT_TEMPLATE
logger = logging.getLogger(__name__)
@ -176,57 +175,6 @@ def optimize_content_for_context_window(
return optimized_content
async def generate_document_summary(
content: str,
user_llm,
document_metadata: dict | None = None,
) -> tuple[str, list[float]]:
"""
Generate summary and embedding for document content with metadata.
Args:
content: Document content
user_llm: User's LLM instance
document_metadata: Optional metadata dictionary to include in summary
Returns:
Tuple of (enhanced_summary_content, summary_embedding)
"""
# Get model name from user_llm for token counting
model_name = getattr(user_llm, "model", "gpt-3.5-turbo") # Fallback to default
# Optimize content to fit within context window
optimized_content = optimize_content_for_context_window(
content, document_metadata, model_name
)
summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm
content_with_metadata = f"<DOCUMENT><DOCUMENT_METADATA>\n\n{document_metadata}\n\n</DOCUMENT_METADATA>\n\n<DOCUMENT_CONTENT>\n\n{optimized_content}\n\n</DOCUMENT_CONTENT></DOCUMENT>"
summary_result = await summary_chain.ainvoke({"document": content_with_metadata})
summary_content = summary_result.content
# Combine summary with metadata if provided
if document_metadata:
metadata_parts = []
metadata_parts.append("# DOCUMENT METADATA")
for key, value in document_metadata.items():
if value: # Only include non-empty values
formatted_key = key.replace("_", " ").title()
metadata_parts.append(f"**{formatted_key}:** {value}")
metadata_section = "\n".join(metadata_parts)
enhanced_summary_content = (
f"{metadata_section}\n\n# DOCUMENT SUMMARY\n\n{summary_content}"
)
else:
enhanced_summary_content = summary_content
summary_embedding = await asyncio.to_thread(embed_text, enhanced_summary_content)
return enhanced_summary_content, summary_embedding
async def create_document_chunks(content: str) -> list[Chunk]:
"""
Create chunks from document content.