feat(backend): Remove LLM summaries from document indexing

This commit is contained in:
Anish Sarkar 2026-06-04 00:50:19 +05:30
parent 290a9539ef
commit 81fa219b30
17 changed files with 40 additions and 518 deletions

View file

@ -191,149 +191,6 @@ class DoclingService:
logger.error(f"Full traceback: {traceback.format_exc()}")
raise RuntimeError(f"Docling processing failed: {e}") from e
async def process_large_document_summary(
self, content: str, llm, document_title: str = "Document"
) -> str:
"""
Process large documents using chunked LLM summarization.
Args:
content: The full document content
llm: The language model to use for summarization
document_title: Title of the document for context
Returns:
Final summary of the document
"""
# Large document threshold (100K characters ≈ 25K tokens)
large_document_threshold = 100_000
if len(content) <= large_document_threshold:
# For smaller documents, use direct processing
logger.info(
f"📄 Document size: {len(content)} chars - using direct processing"
)
from app.prompts import SUMMARY_PROMPT_TEMPLATE
summary_chain = SUMMARY_PROMPT_TEMPLATE | llm
result = await summary_chain.ainvoke({"document": content})
return result.content
logger.info(
f"📚 Large document detected: {len(content)} chars - using chunked processing"
)
# Import chunker from config
# Create LLM-optimized chunks (8K tokens max for safety)
from chonkie import OverlapRefinery, RecursiveChunker
from langchain_core.prompts import PromptTemplate
llm_chunker = RecursiveChunker(
chunk_size=8000 # Conservative for most LLMs
)
# Apply overlap refinery for context preservation (10% overlap = 800 tokens)
overlap_refinery = OverlapRefinery(
context_size=0.1, # 10% overlap for context preservation
method="suffix", # Add next chunk context to current chunk
)
# First chunk the content, then apply overlap refinery
initial_chunks = llm_chunker.chunk(content)
chunks = overlap_refinery.refine(initial_chunks)
total_chunks = len(chunks)
logger.info(f"📄 Split into {total_chunks} chunks for LLM processing")
# Template for chunk processing
chunk_template = PromptTemplate(
input_variables=["chunk", "chunk_number", "total_chunks"],
template="""<INSTRUCTIONS>
You are summarizing chunk {chunk_number} of {total_chunks} from a large document.
Create a comprehensive summary of this document chunk. Focus on:
- Key concepts, facts, and information
- Important details and context
- Main topics and themes
Provide a clear, structured summary that captures the essential content.
Chunk {chunk_number}/{total_chunks}:
<document_chunk>
{chunk}
</document_chunk>
</INSTRUCTIONS>""",
)
# Process each chunk individually
chunk_summaries = []
for i, chunk in enumerate(chunks, 1):
try:
logger.info(
f"🔄 Processing chunk {i}/{total_chunks} ({len(chunk.text)} chars)"
)
chunk_chain = chunk_template | llm
chunk_result = await chunk_chain.ainvoke(
{
"chunk": chunk.text,
"chunk_number": i,
"total_chunks": total_chunks,
}
)
chunk_summary = chunk_result.content
chunk_summaries.append(f"=== Section {i} ===\n{chunk_summary}")
logger.info(f"✅ Completed chunk {i}/{total_chunks}")
except Exception as e:
logger.error(f"❌ Failed to process chunk {i}/{total_chunks}: {e}")
chunk_summaries.append(f"=== Section {i} ===\n[Processing failed]")
# Combine summaries into final document summary
logger.info(f"🔄 Combining {len(chunk_summaries)} chunk summaries")
try:
combine_template = PromptTemplate(
input_variables=["summaries", "document_title"],
template="""<INSTRUCTIONS>
You are combining multiple section summaries into a final comprehensive document summary.
Create a unified, coherent summary from the following section summaries of "{document_title}".
Ensure:
- Logical flow and organization
- No redundancy or repetition
- Comprehensive coverage of all key points
- Professional, objective tone
<section_summaries>
{summaries}
</section_summaries>
</INSTRUCTIONS>""",
)
combined_summaries = "\n\n".join(chunk_summaries)
combine_chain = combine_template | llm
final_result = await combine_chain.ainvoke(
{"summaries": combined_summaries, "document_title": document_title}
)
final_summary = final_result.content
logger.info(
f"✅ Large document processing complete: {len(final_summary)} chars summary"
)
return final_summary
except Exception as e:
logger.error(f"❌ Failed to combine summaries: {e}")
# Fallback: return concatenated chunk summaries
fallback_summary = "\n\n".join(chunk_summaries)
logger.warning("⚠️ Using fallback combined summary")
return fallback_summary
def create_docling_service() -> DoclingService:
"""Create a Docling service instance."""

View file

@ -233,18 +233,6 @@ async def _resolve_attachment_vision_llm(
return await get_vision_llm(session, search_space_id)
async def _resolve_summary_llm(
session: AsyncSession, *, user_id: str, search_space_id: int, should_summarize: bool
):
"""Fetch summary LLM only when indexing summary is enabled."""
if not should_summarize:
return None
from app.services.llm_service import get_user_long_context_llm
return await get_user_long_context_llm(session, user_id, search_space_id)
def _require_extracted_attachment_content(
*, content: str, etl_meta: dict[str, Any], path: str
) -> str:
@ -349,13 +337,6 @@ async def upsert_note(
path=payload.path,
)
llm = await _resolve_summary_llm(
session,
user_id=str(user_id),
search_space_id=search_space_id,
should_summarize=connector.enable_summary,
)
document_string = _build_document_string(
payload, vault_name, content_override=content_for_index
)
@ -374,8 +355,6 @@ async def upsert_note(
search_space_id=search_space_id,
connector_id=connector.id,
created_by_id=str(user_id),
should_summarize=connector.enable_summary,
fallback_summary=f"Obsidian Note: {payload.name}\n\n{content_for_index}",
metadata=metadata,
)
@ -388,7 +367,7 @@ async def upsert_note(
document = prepared[0]
return await pipeline.index(document, connector_doc, llm)
return await pipeline.index(document, connector_doc)
async def rename_note(

View file

@ -18,7 +18,6 @@ class TaskDispatcher(Protocol):
filename: str,
search_space_id: int,
user_id: str,
should_summarize: bool = False,
use_vision_llm: bool = False,
processing_mode: str = "basic",
) -> None: ...
@ -35,7 +34,6 @@ class CeleryTaskDispatcher:
filename: str,
search_space_id: int,
user_id: str,
should_summarize: bool = False,
use_vision_llm: bool = False,
processing_mode: str = "basic",
) -> None:
@ -49,7 +47,6 @@ class CeleryTaskDispatcher:
filename=filename,
search_space_id=search_space_id,
user_id=user_id,
should_summarize=should_summarize,
use_vision_llm=use_vision_llm,
processing_mode=processing_mode,
)