mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-06 20:15:17 +02:00
feat(backend): Remove LLM summaries from document indexing
This commit is contained in:
parent
290a9539ef
commit
81fa219b30
17 changed files with 40 additions and 518 deletions
|
|
@ -191,149 +191,6 @@ class DoclingService:
|
|||
logger.error(f"Full traceback: {traceback.format_exc()}")
|
||||
raise RuntimeError(f"Docling processing failed: {e}") from e
|
||||
|
||||
async def process_large_document_summary(
    self, content: str, llm, document_title: str = "Document"
) -> str:
    """
    Summarize a document with an LLM, chunking it first when it is large.

    Documents of at most 100,000 characters are summarized in one call using
    ``SUMMARY_PROMPT_TEMPLATE``. Larger documents are split with chonkie's
    ``RecursiveChunker``, given overlapping context by ``OverlapRefinery``,
    summarized chunk by chunk (sequentially), and the per-chunk summaries are
    then merged by a final LLM call.

    Failure handling in the chunked path is best-effort:

    - A chunk whose summarization raises is logged and replaced with a
      ``[Processing failed]`` placeholder section.
    - If every chunk failed, the combine call is skipped (there is nothing
      meaningful to merge) and the concatenated placeholder sections are
      returned.
    - If the combine call raises, the concatenated per-chunk sections are
      returned instead.

    Args:
        content: The full document content.
        llm: The language model to use for summarization. It is piped after a
            prompt template, and ``.content`` of the awaited result is used.
        document_title: Title of the document, interpolated into the
            combine prompt.

    Returns:
        Final summary of the document, or the concatenated
        ``=== Section N ===`` per-chunk results when falling back.
    """
    # Threshold is compared against len(content), i.e. characters.
    large_document_threshold = 100_000
    content_length = len(content)

    if content_length <= large_document_threshold:
        # Small enough to summarize in a single LLM call.
        logger.info(
            f"📄 Document size: {content_length} chars - using direct processing"
        )
        from app.prompts import SUMMARY_PROMPT_TEMPLATE

        summary_chain = SUMMARY_PROMPT_TEMPLATE | llm
        result = await summary_chain.ainvoke({"document": content})
        return result.content

    logger.info(
        f"📚 Large document detected: {content_length} chars - using chunked processing"
    )

    # Imported lazily so the small-document path does not need these packages.
    from chonkie import OverlapRefinery, RecursiveChunker
    from langchain_core.prompts import PromptTemplate

    # NOTE(review): chunk_size is measured in whatever unit the chunker's
    # tokenizer counts; no tokenizer is passed here, so this may be characters
    # rather than model tokens - confirm against the installed chonkie version.
    llm_chunker = RecursiveChunker(chunk_size=8000)

    # context_size=0.1 with method="suffix": each chunk is extended with
    # context taken from the chunk that follows it.
    overlap_refinery = OverlapRefinery(context_size=0.1, method="suffix")

    # Chunk first, then let the refinery add the overlap.
    chunks = overlap_refinery.refine(llm_chunker.chunk(content))
    total_chunks = len(chunks)

    logger.info(f"📄 Split into {total_chunks} chunks for LLM processing")

    # Template for chunk processing
    chunk_template = PromptTemplate(
        input_variables=["chunk", "chunk_number", "total_chunks"],
        template="""<INSTRUCTIONS>
You are summarizing chunk {chunk_number} of {total_chunks} from a large document.

Create a comprehensive summary of this document chunk. Focus on:
- Key concepts, facts, and information
- Important details and context
- Main topics and themes

Provide a clear, structured summary that captures the essential content.

Chunk {chunk_number}/{total_chunks}:
<document_chunk>
{chunk}
</document_chunk>
</INSTRUCTIONS>""",
    )

    # Summarize each chunk on its own; one failure must not abort the rest.
    chunk_summaries = []
    failed_chunks = 0
    for i, chunk in enumerate(chunks, 1):
        try:
            logger.info(
                f"🔄 Processing chunk {i}/{total_chunks} ({len(chunk.text)} chars)"
            )

            # Built inside the try on purpose: an unusable `llm` then degrades
            # to a per-chunk failure instead of raising out of the method.
            chunk_chain = chunk_template | llm
            chunk_result = await chunk_chain.ainvoke(
                {
                    "chunk": chunk.text,
                    "chunk_number": i,
                    "total_chunks": total_chunks,
                }
            )

            chunk_summaries.append(f"=== Section {i} ===\n{chunk_result.content}")
            logger.info(f"✅ Completed chunk {i}/{total_chunks}")

        except Exception as e:
            failed_chunks += 1
            logger.error(f"❌ Failed to process chunk {i}/{total_chunks}: {e}")
            chunk_summaries.append(f"=== Section {i} ===\n[Processing failed]")

    # Serves both as the combine-prompt input and as the fallback result.
    combined_summaries = "\n\n".join(chunk_summaries)

    if total_chunks and failed_chunks == total_chunks:
        # Nothing but placeholders: asking the LLM to merge them could only
        # yield a summary of nothing, so return the fallback directly.
        logger.error(
            f"❌ All {total_chunks} chunks failed to process - skipping combine step"
        )
        logger.warning("⚠️ Using fallback combined summary")
        return combined_summaries

    # Combine summaries into final document summary
    logger.info(f"🔄 Combining {len(chunk_summaries)} chunk summaries")

    try:
        combine_template = PromptTemplate(
            input_variables=["summaries", "document_title"],
            template="""<INSTRUCTIONS>
You are combining multiple section summaries into a final comprehensive document summary.

Create a unified, coherent summary from the following section summaries of "{document_title}".
Ensure:
- Logical flow and organization
- No redundancy or repetition
- Comprehensive coverage of all key points
- Professional, objective tone

<section_summaries>
{summaries}
</section_summaries>
</INSTRUCTIONS>""",
        )

        combine_chain = combine_template | llm

        final_result = await combine_chain.ainvoke(
            {"summaries": combined_summaries, "document_title": document_title}
        )

        final_summary = final_result.content
        logger.info(
            f"✅ Large document processing complete: {len(final_summary)} chars summary"
        )

        return final_summary

    except Exception as e:
        logger.error(f"❌ Failed to combine summaries: {e}")
        # Fallback: return concatenated chunk summaries
        logger.warning("⚠️ Using fallback combined summary")
        return combined_summaries
|
||||
|
||||
|
||||
def create_docling_service() -> DoclingService:
|
||||
"""Create a Docling service instance."""
|
||||
|
|
|
|||
|
|
@ -233,18 +233,6 @@ async def _resolve_attachment_vision_llm(
|
|||
return await get_vision_llm(session, search_space_id)
|
||||
|
||||
|
||||
async def _resolve_summary_llm(
    session: AsyncSession,
    *,
    user_id: str,
    search_space_id: int,
    should_summarize: bool,
):
    """Return the summary LLM, or ``None`` when summarization is disabled.

    The LLM service is imported lazily and only on the enabled path; the
    result is whatever ``get_user_long_context_llm`` returns for the given
    session, user and search space.
    """
    if should_summarize:
        from app.services.llm_service import get_user_long_context_llm

        summary_llm = await get_user_long_context_llm(
            session, user_id, search_space_id
        )
        return summary_llm

    return None
|
||||
|
||||
|
||||
def _require_extracted_attachment_content(
|
||||
*, content: str, etl_meta: dict[str, Any], path: str
|
||||
) -> str:
|
||||
|
|
@ -349,13 +337,6 @@ async def upsert_note(
|
|||
path=payload.path,
|
||||
)
|
||||
|
||||
llm = await _resolve_summary_llm(
|
||||
session,
|
||||
user_id=str(user_id),
|
||||
search_space_id=search_space_id,
|
||||
should_summarize=connector.enable_summary,
|
||||
)
|
||||
|
||||
document_string = _build_document_string(
|
||||
payload, vault_name, content_override=content_for_index
|
||||
)
|
||||
|
|
@ -374,8 +355,6 @@ async def upsert_note(
|
|||
search_space_id=search_space_id,
|
||||
connector_id=connector.id,
|
||||
created_by_id=str(user_id),
|
||||
should_summarize=connector.enable_summary,
|
||||
fallback_summary=f"Obsidian Note: {payload.name}\n\n{content_for_index}",
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
|
|
@ -388,7 +367,7 @@ async def upsert_note(
|
|||
|
||||
document = prepared[0]
|
||||
|
||||
return await pipeline.index(document, connector_doc, llm)
|
||||
return await pipeline.index(document, connector_doc)
|
||||
|
||||
|
||||
async def rename_note(
|
||||
|
|
|
|||
|
|
@ -18,7 +18,6 @@ class TaskDispatcher(Protocol):
|
|||
filename: str,
|
||||
search_space_id: int,
|
||||
user_id: str,
|
||||
should_summarize: bool = False,
|
||||
use_vision_llm: bool = False,
|
||||
processing_mode: str = "basic",
|
||||
) -> None: ...
|
||||
|
|
@ -35,7 +34,6 @@ class CeleryTaskDispatcher:
|
|||
filename: str,
|
||||
search_space_id: int,
|
||||
user_id: str,
|
||||
should_summarize: bool = False,
|
||||
use_vision_llm: bool = False,
|
||||
processing_mode: str = "basic",
|
||||
) -> None:
|
||||
|
|
@ -49,7 +47,6 @@ class CeleryTaskDispatcher:
|
|||
filename=filename,
|
||||
search_space_id=search_space_id,
|
||||
user_id=user_id,
|
||||
should_summarize=should_summarize,
|
||||
use_vision_llm=use_vision_llm,
|
||||
processing_mode=processing_mode,
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue