feat(backend): Remove LLM summaries from document indexing

2026-07-26 23:51:14 +02:00 · 2026-06-04 00:50:19 +05:30 · 2026-06-04 00:50:19 +05:30 · 81fa219b30
commit 81fa219b30
parent 290a9539ef
17 changed files with 40 additions and 518 deletions
--- a/surfsense_backend/app/tasks/document_processors/_save.py
+++ b/surfsense_backend/app/tasks/document_processors/_save.py
@ -1,20 +1,15 @@
-"""
-Unified document save/update logic for file processors.
-"""
+"""Unified document save/update logic for file processors."""

-import asyncio
 import logging

 from sqlalchemy.exc import SQLAlchemyError
 from sqlalchemy.ext.asyncio import AsyncSession

 from app.db import Document, DocumentStatus, DocumentType
-from app.services.llm_service import get_user_long_context_llm
 from app.utils.document_converters import (
    create_document_chunks,
    embed_text,
    generate_content_hash,
-    generate_document_summary,
 )

 from ._helpers import (
@ -24,59 +19,6 @@ from ._helpers import (
 )
 from .base import get_current_timestamp, safe_set_chunks

-# ---------------------------------------------------------------------------
-# Summary generation
-# ---------------------------------------------------------------------------
-
-
-async def _generate_summary(
-    markdown_content: str,
-    file_name: str,
-    etl_service: str,
-    user_llm,
-    enable_summary: bool,
-) -> tuple[str, list[float]]:
-    """
-    Generate a document summary and embedding.
-
-    Docling uses its own large-document summary strategy; other ETL services
-    use the standard ``generate_document_summary`` helper.
-    """
-    if not enable_summary:
-        summary = f"File: {file_name}\n\n{markdown_content[:4000]}"
-        return summary, await asyncio.to_thread(embed_text, summary)
-
-    if etl_service == "DOCLING":
-        from app.services.docling_service import create_docling_service
-
-        docling_service = create_docling_service()
-        summary_text = await docling_service.process_large_document_summary(
-            content=markdown_content, llm=user_llm, document_title=file_name
-        )
-
-        meta = {
-            "file_name": file_name,
-            "etl_service": etl_service,
-            "document_type": "File Document",
-        }
-        parts = ["# DOCUMENT METADATA"]
-        for key, value in meta.items():
-            if value:
-                formatted_key = key.replace("_", " ").title()
-                parts.append(f"**{formatted_key}:** {value}")
-
-        enhanced = "\n".join(parts) + "\n\n# DOCUMENT SUMMARY\n\n" + summary_text
-        return enhanced, await asyncio.to_thread(embed_text, enhanced)
-
-    # Standard summary (Unstructured / LlamaCloud / others)
-    meta = {
-        "file_name": file_name,
-        "etl_service": etl_service,
-        "document_type": "File Document",
-    }
-    return await generate_document_summary(markdown_content, user_llm, meta)
-
-
 # ---------------------------------------------------------------------------
 # Unified save function
 # ---------------------------------------------------------------------------
@ -90,7 +32,6 @@ async def save_file_document(
    user_id: str,
    etl_service: str,
    connector: dict | None = None,
-    enable_summary: bool = True,
 ) -> Document | None:
    """
    Process and store a file document with deduplication and migration support.
@ -106,7 +47,6 @@ async def save_file_document(
        user_id: ID of the user
        etl_service: Name of the ETL service (UNSTRUCTURED, LLAMACLOUD, DOCLING)
        connector: Optional connector info for Google Drive files
-        enable_summary: Whether to generate an AI summary

    Returns:
        Document object if successful, None if duplicate detected
@ -133,24 +73,16 @@ async def save_file_document(
            if should_skip:
                return doc

-        user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
-        if not user_llm:
-            raise RuntimeError(
-                f"No long context LLM configured for user {user_id} "
-                f"in search space {search_space_id}"
-            )
-
-        summary_content, summary_embedding = await _generate_summary(
-            markdown_content, file_name, etl_service, user_llm, enable_summary
-        )
+        document_content = f"File: {file_name}\n\n{markdown_content[:4000]}"
+        document_embedding = embed_text(document_content)
        chunks = await create_document_chunks(markdown_content)
        doc_metadata = {"FILE_NAME": file_name, "ETL_SERVICE": etl_service}

        if existing_document:
            existing_document.title = file_name
-            existing_document.content = summary_content
+            existing_document.content = document_content
            existing_document.content_hash = content_hash
-            existing_document.embedding = summary_embedding
+            existing_document.embedding = document_embedding
            existing_document.document_metadata = doc_metadata
            await safe_set_chunks(session, existing_document, chunks)
            existing_document.source_markdown = markdown_content
@ -171,8 +103,8 @@ async def save_file_document(
            title=file_name,
            document_type=doc_type,
            document_metadata=doc_metadata,
-            content=summary_content,
-            embedding=summary_embedding,
+            content=document_content,
+            embedding=document_embedding,
            chunks=chunks,
            content_hash=content_hash,
            unique_identifier_hash=primary_hash,
--- a/surfsense_backend/app/tasks/document_processors/circleback_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/circleback_processor.py
@ -25,11 +25,10 @@ from app.db import (
    SearchSourceConnectorType,
    SearchSpace,
 )
-from app.services.llm_service import get_document_summary_llm
 from app.utils.document_converters import (
    create_document_chunks,
+    embed_text,
    generate_content_hash,
-    generate_document_summary,
    generate_unique_identifier_hash,
 )

@ -176,34 +175,8 @@ async def add_circleback_meeting_document(
        # PHASE 3: Process the document content
        # =======================================================================

-        # Get LLM for generating summary
-        llm = await get_document_summary_llm(session, search_space_id)
-        if not llm:
-            logger.warning(
-                f"No LLM configured for search space {search_space_id}. Using content as summary."
-            )
-            # Use first 1000 chars as summary if no LLM available
-            summary_content = (
-                markdown_content[:1000] + "..."
-                if len(markdown_content) > 1000
-                else markdown_content
-            )
-            summary_embedding = None
-        else:
-            # Generate summary with metadata
-            summary_metadata = {
-                "meeting_name": meeting_name,
-                "meeting_id": meeting_id,
-                "document_type": "Circleback Meeting",
-                **{
-                    k: v
-                    for k, v in metadata.items()
-                    if isinstance(v, str | int | float | bool)
-                },
-            }
-            summary_content, summary_embedding = await generate_document_summary(
-                markdown_content, llm, summary_metadata
-            )
+        summary_content = markdown_content
+        summary_embedding = embed_text(summary_content)

        # Process chunks
        chunks = await create_document_chunks(markdown_content)
@ -224,8 +197,7 @@ async def add_circleback_meeting_document(
        document.title = meeting_name
        document.content = summary_content
        document.content_hash = content_hash
-        if summary_embedding is not None:
-            document.embedding = summary_embedding
+        document.embedding = summary_embedding
        document.document_metadata = document_metadata
        await safe_set_chunks(session, document, chunks)
        document.source_markdown = markdown_content
--- a/surfsense_backend/app/tasks/document_processors/extension_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/extension_processor.py
@ -9,12 +9,11 @@ from sqlalchemy.ext.asyncio import AsyncSession

 from app.db import Document, DocumentType
 from app.schemas import ExtensionDocumentContent
-from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
 from app.utils.document_converters import (
    create_document_chunks,
+    embed_text,
    generate_content_hash,
-    generate_document_summary,
    generate_unique_identifier_hash,
 )

@ -123,26 +122,8 @@ async def add_extension_received_document(
                    f"Content changed for URL {content.metadata.VisitedWebPageURL}. Updating document."
                )

-        # Get user's long context LLM (needed for both create and update)
-        user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
-        if not user_llm:
-            raise RuntimeError(
-                f"No long context LLM configured for user {user_id} in search space {search_space_id}"
-            )
-
-        # Generate summary with metadata
-        document_metadata = {
-            "session_id": content.metadata.BrowsingSessionId,
-            "url": content.metadata.VisitedWebPageURL,
-            "title": content.metadata.VisitedWebPageTitle,
-            "referrer": content.metadata.VisitedWebPageReffererURL,
-            "timestamp": content.metadata.VisitedWebPageDateWithTimeInISOString,
-            "duration_ms": content.metadata.VisitedWebPageVisitDurationInMilliseconds,
-            "document_type": "Browser Extension Capture",
-        }
-        summary_content, summary_embedding = await generate_document_summary(
-            combined_document_string, user_llm, document_metadata
-        )
+        summary_content = combined_document_string
+        summary_embedding = embed_text(summary_content)

        # Process chunks
        chunks = await create_document_chunks(content.pageContent)
--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@ -10,7 +10,7 @@ from __future__ import annotations
 import contextlib
 import logging
 import os
-from dataclasses import dataclass, field
+from dataclasses import dataclass

 from fastapi import HTTPException
 from sqlalchemy.ext.asyncio import AsyncSession
@ -48,12 +48,6 @@ class _ProcessingContext:
    notification: Notification | None = None
    use_vision_llm: bool = False
    processing_mode: str = "basic"
-    enable_summary: bool = field(init=False)
-
-    def __post_init__(self) -> None:
-        self.enable_summary = (
-            self.connector.get("enable_summary", True) if self.connector else True
-        )


 # ---------------------------------------------------------------------------
@ -261,7 +255,6 @@ async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
        ctx.user_id,
        etl_result.etl_service,
        ctx.connector,
-        enable_summary=ctx.enable_summary,
    )

    if result:
@ -466,7 +459,6 @@ async def process_file_in_background_with_document(
    log_entry: Log,
    connector: dict | None = None,
    notification: Notification | None = None,
-    should_summarize: bool = False,
    use_vision_llm: bool = False,
    processing_mode: str = "basic",
 ) -> Document | None:
@ -482,7 +474,6 @@ async def process_file_in_background_with_document(
    from app.indexing_pipeline.adapters.file_upload_adapter import (
        UploadDocumentAdapter,
    )
-    from app.services.llm_service import get_user_long_context_llm
    from app.utils.document_converters import generate_content_hash

    from .base import check_duplicate_document
@ -522,8 +513,6 @@ async def process_file_in_background_with_document(
                stage="chunking",
            )

-        user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
-
        adapter = UploadDocumentAdapter(session)
        await adapter.index(
            markdown_content=markdown_content,
@ -531,8 +520,6 @@ async def process_file_in_background_with_document(
            etl_service=etl_service,
            search_space_id=search_space_id,
            user_id=user_id,
-            llm=user_llm,
-            should_summarize=should_summarize,
        )

        if billable_pages > 0:
--- a/surfsense_backend/app/tasks/document_processors/markdown_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/markdown_processor.py
@ -8,12 +8,11 @@ from sqlalchemy.exc import SQLAlchemyError
 from sqlalchemy.ext.asyncio import AsyncSession

 from app.db import Document, DocumentStatus, DocumentType
-from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
 from app.utils.document_converters import (
    create_document_chunks,
+    embed_text,
    generate_content_hash,
-    generate_document_summary,
 )

 from ._helpers import (
@ -183,21 +182,8 @@ async def add_received_markdown_file_document(
                return doc
            # Content changed - continue to update

-        # Get user's long context LLM (needed for both create and update)
-        user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
-        if not user_llm:
-            raise RuntimeError(
-                f"No long context LLM configured for user {user_id} in search space {search_space_id}"
-            )
-
-        # Generate summary with metadata
-        document_metadata = {
-            "file_name": file_name,
-            "document_type": "Markdown File Document",
-        }
-        summary_content, summary_embedding = await generate_document_summary(
-            file_in_markdown, user_llm, document_metadata
-        )
+        summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}"
+        summary_embedding = embed_text(summary_content)

        # Process chunks
        chunks = await create_document_chunks(file_in_markdown)
--- a/surfsense_backend/app/tasks/document_processors/youtube_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/youtube_processor.py
@ -17,12 +17,11 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from youtube_transcript_api import YouTubeTranscriptApi

 from app.db import Document, DocumentStatus, DocumentType
-from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
 from app.utils.document_converters import (
    create_document_chunks,
+    embed_text,
    generate_content_hash,
-    generate_document_summary,
    generate_unique_identifier_hash,
 )
 from app.utils.proxy_config import get_requests_proxies
@ -355,40 +354,8 @@ async def add_youtube_video_document(
            await session.commit()
            return document

-        # Get LLM for summary generation
-        await task_logger.log_task_progress(
-            log_entry,
-            f"Preparing for summary generation: {video_data.get('title', 'YouTube Video')}",
-            {"stage": "llm_setup"},
-        )
-
-        # Get user's long context LLM
-        user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
-        if not user_llm:
-            raise RuntimeError(
-                f"No long context LLM configured for user {user_id} in search space {search_space_id}"
-            )
-
-        # Generate summary
-        await task_logger.log_task_progress(
-            log_entry,
-            f"Generating summary for video: {video_data.get('title', 'YouTube Video')}",
-            {"stage": "summary_generation"},
-        )
-
-        # Generate summary with metadata
-        document_metadata_for_summary = {
-            "url": url,
-            "video_id": video_id,
-            "title": video_data.get("title", "YouTube Video"),
-            "author": video_data.get("author_name", "Unknown"),
-            "thumbnail": video_data.get("thumbnail_url", ""),
-            "document_type": "YouTube Video Document",
-            "has_transcript": "No captions available" not in transcript_text,
-        }
-        summary_content, summary_embedding = await generate_document_summary(
-            combined_document_string, user_llm, document_metadata_for_summary
-        )
+        summary_content = combined_document_string
+        summary_embedding = embed_text(summary_content)

        # Process chunks
        await task_logger.log_task_progress(