mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-06 20:15:17 +02:00
feat(backend): Remove LLM summaries from document indexing
This commit is contained in:
parent
290a9539ef
commit
81fa219b30
17 changed files with 40 additions and 518 deletions
|
|
@ -9,7 +9,6 @@ from litellm import get_model_info, token_counter
|
|||
|
||||
from app.config import config
|
||||
from app.db import Chunk, DocumentType
|
||||
from app.prompts import SUMMARY_PROMPT_TEMPLATE
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -176,57 +175,6 @@ def optimize_content_for_context_window(
|
|||
return optimized_content
|
||||
|
||||
|
||||
async def generate_document_summary(
|
||||
content: str,
|
||||
user_llm,
|
||||
document_metadata: dict | None = None,
|
||||
) -> tuple[str, list[float]]:
|
||||
"""
|
||||
Generate summary and embedding for document content with metadata.
|
||||
|
||||
Args:
|
||||
content: Document content
|
||||
user_llm: User's LLM instance
|
||||
document_metadata: Optional metadata dictionary to include in summary
|
||||
|
||||
Returns:
|
||||
Tuple of (enhanced_summary_content, summary_embedding)
|
||||
"""
|
||||
# Get model name from user_llm for token counting
|
||||
model_name = getattr(user_llm, "model", "gpt-3.5-turbo") # Fallback to default
|
||||
|
||||
# Optimize content to fit within context window
|
||||
optimized_content = optimize_content_for_context_window(
|
||||
content, document_metadata, model_name
|
||||
)
|
||||
|
||||
summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm
|
||||
content_with_metadata = f"<DOCUMENT><DOCUMENT_METADATA>\n\n{document_metadata}\n\n</DOCUMENT_METADATA>\n\n<DOCUMENT_CONTENT>\n\n{optimized_content}\n\n</DOCUMENT_CONTENT></DOCUMENT>"
|
||||
summary_result = await summary_chain.ainvoke({"document": content_with_metadata})
|
||||
summary_content = summary_result.content
|
||||
|
||||
# Combine summary with metadata if provided
|
||||
if document_metadata:
|
||||
metadata_parts = []
|
||||
metadata_parts.append("# DOCUMENT METADATA")
|
||||
|
||||
for key, value in document_metadata.items():
|
||||
if value: # Only include non-empty values
|
||||
formatted_key = key.replace("_", " ").title()
|
||||
metadata_parts.append(f"**{formatted_key}:** {value}")
|
||||
|
||||
metadata_section = "\n".join(metadata_parts)
|
||||
enhanced_summary_content = (
|
||||
f"{metadata_section}\n\n# DOCUMENT SUMMARY\n\n{summary_content}"
|
||||
)
|
||||
else:
|
||||
enhanced_summary_content = summary_content
|
||||
|
||||
summary_embedding = await asyncio.to_thread(embed_text, enhanced_summary_content)
|
||||
|
||||
return enhanced_summary_content, summary_embedding
|
||||
|
||||
|
||||
async def create_document_chunks(content: str) -> list[Chunk]:
|
||||
"""
|
||||
Create chunks from document content.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue