From e9892c8fe915362a62621f5bb12c85e20c247228 Mon Sep 17 00:00:00 2001
From: "DESKTOP-RTLN3BA\\$punk"
Date: Thu, 26 Feb 2026 18:24:57 -0800
Subject: [PATCH] feat: add configurable summary generation and various improvements

- Replaced direct embedding calls with a utility function across various components to streamline embedding logic.
- Added enable_summary flag to several models and routes to control summary generation behavior.
---
 .../102_add_enable_summary_to_connectors.py | 46 ++++++++++++
 .../new_chat/tools/search_surfsense_docs.py | 4 +-
 .../agents/new_chat/tools/shared_memory.py | 6 +-
 .../app/agents/new_chat/tools/user_memory.py | 6 +-
 .../connectors/composio_gmail_connector.py | 12 ++--
 .../composio_google_calendar_connector.py | 12 ++--
 .../composio_google_drive_connector.py | 21 +++---
 surfsense_backend/app/db.py | 32 +++++----
 .../adapters/file_upload_adapter.py | 3 +-
 .../indexing_pipeline/document_embedder.py | 7 +-
 .../app/routes/documents_routes.py | 2 +
 .../app/routes/reports_routes.py | 4 +-
 surfsense_backend/app/schemas/podcasts.py | 4 +-
 .../app/schemas/search_source_connector.py | 2 +
 .../app/services/linear/kb_sync_service.py | 16 ++---
 .../app/services/llm_router_service.py | 70 +++++++++++--------
 .../app/services/notion/kb_sync_service.py | 8 +--
 .../app/tasks/celery_tasks/document_tasks.py | 11 ++-
 .../connector_indexers/airtable_indexer.py | 11 ++-
 .../connector_indexers/bookstack_indexer.py | 17 ++---
 .../connector_indexers/clickup_indexer.py | 8 +--
 .../connector_indexers/confluence_indexer.py | 18 ++---
 .../connector_indexers/discord_indexer.py | 5 +-
 .../connector_indexers/github_indexer.py | 13 ++--
 .../google_calendar_indexer.py | 21 ++----
 .../google_drive_indexer.py | 6 +-
 .../google_gmail_indexer.py | 11 ++-
 .../tasks/connector_indexers/jira_indexer.py | 14 ++--
 .../connector_indexers/linear_indexer.py | 17 ++---
 .../tasks/connector_indexers/luma_indexer.py | 29 ++------
 .../connector_indexers/notion_indexer.py | 11 ++-
 .../connector_indexers/obsidian_indexer.py | 5 +-
 .../tasks/connector_indexers/slack_indexer.py | 6 +-
 .../tasks/connector_indexers/teams_indexer.py | 6 +-
 .../connector_indexers/webcrawler_indexer.py | 24 ++-----
 .../document_processors/file_processors.py | 9 ++-
 .../app/tasks/surfsense_docs_indexer.py | 7 +-
 .../app/utils/document_converters.py | 52 +++++++++++++-
 surfsense_backend/main.py | 5 +-
 .../new-chat/[[...chat_id]]/page.tsx | 4 --
 .../assistant-ui/connector-popup.tsx | 6 ++
 .../components/summary-config.tsx | 25 +++++++
 .../views/connector-edit-view.tsx | 10 ++-
 .../views/indexing-configuration-view.tsx | 10 ++-
 .../hooks/use-connector-dialog.ts | 17 ++++-
 .../components/sources/DocumentUploadTab.tsx | 8 ++-
 .../contracts/types/connector.types.ts | 3 +
 .../contracts/types/document.types.ts | 1 +
 .../lib/apis/documents-api.service.ts | 1 +
 surfsense_web/lib/electric/client.ts | 32 +++++----
 50 files changed, 380 insertions(+), 298 deletions(-)
 create mode 100644 surfsense_backend/alembic/versions/102_add_enable_summary_to_connectors.py
 create mode 100644 surfsense_web/components/assistant-ui/connector-popup/components/summary-config.tsx

diff --git a/surfsense_backend/alembic/versions/102_add_enable_summary_to_connectors.py b/surfsense_backend/alembic/versions/102_add_enable_summary_to_connectors.py
new file mode 100644
index 000000000..ae5fc0e02
--- /dev/null
+++ b/surfsense_backend/alembic/versions/102_add_enable_summary_to_connectors.py
@@ -0,0 +1,46 @@
+"""102_add_enable_summary_to_connectors
+
+Revision ID: 102
+Revises: 
101 +Create Date: 2026-02-26 + +Adds enable_summary boolean column to search_source_connectors. +Defaults to False for all existing and new connectors so LLM-based +summary generation is opt-in. +""" + +from __future__ import annotations + +from collections.abc import Sequence + +import sqlalchemy as sa + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "102" +down_revision: str | None = "101" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + conn = op.get_bind() + existing_columns = [ + col["name"] for col in sa.inspect(conn).get_columns("search_source_connectors") + ] + + if "enable_summary" not in existing_columns: + op.add_column( + "search_source_connectors", + sa.Column( + "enable_summary", + sa.Boolean(), + nullable=False, + server_default=sa.text("false"), + ), + ) + + +def downgrade() -> None: + op.drop_column("search_source_connectors", "enable_summary") diff --git a/surfsense_backend/app/agents/new_chat/tools/search_surfsense_docs.py b/surfsense_backend/app/agents/new_chat/tools/search_surfsense_docs.py index b9b370c23..ec86c3ffa 100644 --- a/surfsense_backend/app/agents/new_chat/tools/search_surfsense_docs.py +++ b/surfsense_backend/app/agents/new_chat/tools/search_surfsense_docs.py @@ -14,8 +14,8 @@ from langchain_core.tools import tool from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession -from app.config import config from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument +from app.utils.document_converters import embed_text def format_surfsense_docs_results(results: list[tuple]) -> str: @@ -100,7 +100,7 @@ async def search_surfsense_docs_async( Formatted string with relevant documentation content """ # Get embedding for the query - query_embedding = config.embedding_model_instance.embed(query) + query_embedding = embed_text(query) # Vector similarity search on chunks, joining with documents stmt = ( diff --git a/surfsense_backend/app/agents/new_chat/tools/shared_memory.py b/surfsense_backend/app/agents/new_chat/tools/shared_memory.py index 8cd4148ba..ba69f1ce8 100644 --- a/surfsense_backend/app/agents/new_chat/tools/shared_memory.py +++ b/surfsense_backend/app/agents/new_chat/tools/shared_memory.py @@ -8,8 +8,8 @@ from langchain_core.tools import tool from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession -from app.config import config from app.db import MemoryCategory, SharedMemory, User +from app.utils.document_converters import embed_text logger = logging.getLogger(__name__) @@ -64,7 +64,7 @@ async def save_shared_memory( count = await get_shared_memory_count(db_session, search_space_id) if count >= MAX_MEMORIES_PER_SEARCH_SPACE: await delete_oldest_shared_memory(db_session, search_space_id) - embedding = config.embedding_model_instance.embed(content) + embedding = embed_text(content) row = SharedMemory( search_space_id=search_space_id, created_by_id=_to_uuid(created_by_id), @@ -108,7 +108,7 @@ async def recall_shared_memory( if category and category in valid_categories: stmt = stmt.where(SharedMemory.category == MemoryCategory(category)) if query: - query_embedding = config.embedding_model_instance.embed(query) + query_embedding = embed_text(query) stmt = stmt.order_by( SharedMemory.embedding.op("<=>")(query_embedding) ).limit(top_k) diff --git a/surfsense_backend/app/agents/new_chat/tools/user_memory.py b/surfsense_backend/app/agents/new_chat/tools/user_memory.py index 23a0b8666..8aa516454 100644 --- 
a/surfsense_backend/app/agents/new_chat/tools/user_memory.py +++ b/surfsense_backend/app/agents/new_chat/tools/user_memory.py @@ -17,8 +17,8 @@ from langchain_core.tools import tool from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession -from app.config import config from app.db import MemoryCategory, UserMemory +from app.utils.document_converters import embed_text logger = logging.getLogger(__name__) @@ -178,7 +178,7 @@ def create_save_memory_tool( await delete_oldest_memory(db_session, user_id, search_space_id) # Generate embedding for the memory - embedding = config.embedding_model_instance.embed(content) + embedding = embed_text(content) # Create new memory using ORM # The pgvector Vector column type handles embedding conversion automatically @@ -268,7 +268,7 @@ def create_recall_memory_tool( if query: # Semantic search using embeddings - query_embedding = config.embedding_model_instance.embed(query) + query_embedding = embed_text(query) # Build query with vector similarity stmt = ( diff --git a/surfsense_backend/app/connectors/composio_gmail_connector.py b/surfsense_backend/app/connectors/composio_gmail_connector.py index 4764a0a41..e83ba5cfb 100644 --- a/surfsense_backend/app/connectors/composio_gmail_connector.py +++ b/surfsense_backend/app/connectors/composio_gmail_connector.py @@ -14,7 +14,6 @@ from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select from sqlalchemy.orm import selectinload -from app.config import config from app.connectors.composio_connector import ComposioConnector from app.db import Document, DocumentStatus, DocumentType from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE @@ -27,6 +26,7 @@ from app.tasks.connector_indexers.base import ( ) from app.utils.document_converters import ( create_document_chunks, + embed_text, generate_content_hash, generate_document_summary, generate_unique_identifier_hash, @@ -383,6 +383,7 @@ async def _process_gmail_messages_phase2( connector_id: int, search_space_id: int, user_id: str, + enable_summary: bool = False, on_heartbeat_callback: HeartbeatCallbackType | None = None, ) -> tuple[int, int]: """ @@ -415,7 +416,7 @@ async def _process_gmail_messages_phase2( session, user_id, search_space_id ) - if user_llm: + if user_llm and enable_summary: document_metadata_for_summary = { "message_id": item["message_id"], "thread_id": item["thread_id"], @@ -427,10 +428,8 @@ async def _process_gmail_messages_phase2( item["markdown_content"], user_llm, document_metadata_for_summary ) else: - summary_content = f"Gmail: {item['subject']}\n\nFrom: {item['sender']}\nDate: {item['date_str']}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) + summary_content = f"Gmail: {item['subject']}\n\nFrom: {item['sender']}\nDate: {item['date_str']}\n\n{item['markdown_content']}" + summary_embedding = embed_text(summary_content) chunks = await create_document_chunks(item["markdown_content"]) @@ -646,6 +645,7 @@ async def index_composio_gmail( connector_id=connector_id, search_space_id=search_space_id, user_id=user_id, + enable_summary=getattr(connector, "enable_summary", False), on_heartbeat_callback=on_heartbeat_callback, ) diff --git a/surfsense_backend/app/connectors/composio_google_calendar_connector.py b/surfsense_backend/app/connectors/composio_google_calendar_connector.py index 6593721a1..63bade873 100644 --- a/surfsense_backend/app/connectors/composio_google_calendar_connector.py +++ b/surfsense_backend/app/connectors/composio_google_calendar_connector.py 
@@ -14,7 +14,6 @@ from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select from sqlalchemy.orm import selectinload -from app.config import config from app.connectors.composio_connector import ComposioConnector from app.db import Document, DocumentStatus, DocumentType from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE @@ -27,6 +26,7 @@ from app.tasks.connector_indexers.base import ( ) from app.utils.document_converters import ( create_document_chunks, + embed_text, generate_content_hash, generate_document_summary, generate_unique_identifier_hash, @@ -440,7 +440,7 @@ async def index_composio_google_calendar( session, user_id, search_space_id ) - if user_llm: + if user_llm and connector.enable_summary: document_metadata_for_summary = { "event_id": item["event_id"], "summary": item["summary"], @@ -456,12 +456,10 @@ async def index_composio_google_calendar( document_metadata_for_summary, ) else: - summary_content = f"Calendar: {item['summary']}\n\nStart: {item['start_time']}\nEnd: {item['end_time']}" - if item["location"]: - summary_content += f"\nLocation: {item['location']}" - summary_embedding = config.embedding_model_instance.embed( - summary_content + summary_content = ( + f"Calendar: {item['summary']}\n\n{item['markdown_content']}" ) + summary_embedding = embed_text(summary_content) chunks = await create_document_chunks(item["markdown_content"]) diff --git a/surfsense_backend/app/connectors/composio_google_drive_connector.py b/surfsense_backend/app/connectors/composio_google_drive_connector.py index 4ccd195e6..c10edb7e9 100644 --- a/surfsense_backend/app/connectors/composio_google_drive_connector.py +++ b/surfsense_backend/app/connectors/composio_google_drive_connector.py @@ -31,6 +31,7 @@ from app.tasks.connector_indexers.base import ( ) from app.utils.document_converters import ( create_document_chunks, + embed_text, generate_content_hash, generate_document_summary, generate_unique_identifier_hash, @@ -714,6 +715,7 @@ async def index_composio_google_drive( max_items=max_items, task_logger=task_logger, log_entry=log_entry, + enable_summary=getattr(connector, "enable_summary", False), on_heartbeat_callback=on_heartbeat_callback, ) else: @@ -747,6 +749,7 @@ async def index_composio_google_drive( max_items=max_items, task_logger=task_logger, log_entry=log_entry, + enable_summary=getattr(connector, "enable_summary", False), on_heartbeat_callback=on_heartbeat_callback, ) @@ -829,6 +832,7 @@ async def _index_composio_drive_delta_sync( max_items: int, task_logger: TaskLoggingService, log_entry, + enable_summary: bool = False, on_heartbeat_callback: HeartbeatCallbackType | None = None, ) -> tuple[int, int, list[str]]: """Index Google Drive files using delta sync with real-time document status updates. 
@@ -1079,7 +1083,7 @@ async def _index_composio_drive_delta_sync( session, user_id, search_space_id ) - if user_llm: + if user_llm and enable_summary: document_metadata_for_summary = { "file_id": item["file_id"], "file_name": item["file_name"], @@ -1090,10 +1094,8 @@ async def _index_composio_drive_delta_sync( markdown_content, user_llm, document_metadata_for_summary ) else: - summary_content = f"Google Drive File: {item['file_name']}\n\nType: {item['mime_type']}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) + summary_content = f"Google Drive File: {item['file_name']}\n\nType: {item['mime_type']}\n\n{markdown_content}" + summary_embedding = embed_text(summary_content) chunks = await create_document_chunks(markdown_content) @@ -1155,6 +1157,7 @@ async def _index_composio_drive_full_scan( max_items: int, task_logger: TaskLoggingService, log_entry, + enable_summary: bool = False, on_heartbeat_callback: HeartbeatCallbackType | None = None, ) -> tuple[int, int, list[str]]: """Index Google Drive files using full scan with real-time document status updates.""" @@ -1488,7 +1491,7 @@ async def _index_composio_drive_full_scan( session, user_id, search_space_id ) - if user_llm: + if user_llm and enable_summary: document_metadata_for_summary = { "file_id": item["file_id"], "file_name": item["file_name"], @@ -1499,10 +1502,8 @@ async def _index_composio_drive_full_scan( markdown_content, user_llm, document_metadata_for_summary ) else: - summary_content = f"Google Drive File: {item['file_name']}\n\nType: {item['mime_type']}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) + summary_content = f"Google Drive File: {item['file_name']}\n\nType: {item['mime_type']}\n\n{markdown_content}" + summary_embedding = embed_text(summary_content) chunks = await create_document_chunks(markdown_content) diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index 1c9181ed2..771689a13 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -1,6 +1,6 @@ from collections.abc import AsyncGenerator from datetime import UTC, datetime -from enum import Enum +from enum import StrEnum from fastapi import Depends from fastapi_users.db import SQLAlchemyBaseUserTableUUID, SQLAlchemyUserDatabase @@ -31,7 +31,7 @@ if config.AUTH_TYPE == "GOOGLE": DATABASE_URL = config.DATABASE_URL -class DocumentType(str, Enum): +class DocumentType(StrEnum): EXTENSION = "EXTENSION" CRAWLED_URL = "CRAWLED_URL" FILE = "FILE" @@ -60,7 +60,7 @@ class DocumentType(str, Enum): COMPOSIO_GOOGLE_CALENDAR_CONNECTOR = "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR" -class SearchSourceConnectorType(str, Enum): +class SearchSourceConnectorType(StrEnum): SERPER_API = "SERPER_API" # NOT IMPLEMENTED YET : DON'T REMEMBER WHY : MOST PROBABLY BECAUSE WE NEED TO CRAWL THE RESULTS RETURNED BY IT TAVILY_API = "TAVILY_API" SEARXNG_API = "SEARXNG_API" @@ -93,7 +93,7 @@ class SearchSourceConnectorType(str, Enum): COMPOSIO_GOOGLE_CALENDAR_CONNECTOR = "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR" -class PodcastStatus(str, Enum): +class PodcastStatus(StrEnum): PENDING = "pending" GENERATING = "generating" READY = "ready" @@ -177,7 +177,7 @@ class DocumentStatus: return None -class LiteLLMProvider(str, Enum): +class LiteLLMProvider(StrEnum): """ Enum for LLM providers supported by LiteLLM. 
""" @@ -215,7 +215,7 @@ class LiteLLMProvider(str, Enum): CUSTOM = "CUSTOM" -class ImageGenProvider(str, Enum): +class ImageGenProvider(StrEnum): """ Enum for image generation providers supported by LiteLLM. This is a subset of LLM providers — only those that support image generation. @@ -233,7 +233,7 @@ class ImageGenProvider(str, Enum): NSCALE = "NSCALE" -class LogLevel(str, Enum): +class LogLevel(StrEnum): DEBUG = "DEBUG" INFO = "INFO" WARNING = "WARNING" @@ -241,13 +241,13 @@ class LogLevel(str, Enum): CRITICAL = "CRITICAL" -class LogStatus(str, Enum): +class LogStatus(StrEnum): IN_PROGRESS = "IN_PROGRESS" SUCCESS = "SUCCESS" FAILED = "FAILED" -class IncentiveTaskType(str, Enum): +class IncentiveTaskType(StrEnum): """ Enum for incentive task types that users can complete to earn free pages. Each task can only be completed once per user. @@ -298,7 +298,7 @@ INCENTIVE_TASKS_CONFIG = { } -class Permission(str, Enum): +class Permission(StrEnum): """ Granular permissions for search space resources. Use '*' (FULL_ACCESS) to grant all permissions. @@ -471,7 +471,7 @@ class BaseModel(Base): id = Column(Integer, primary_key=True, index=True) -class NewChatMessageRole(str, Enum): +class NewChatMessageRole(StrEnum): """Role enum for new chat messages.""" USER = "user" @@ -479,7 +479,7 @@ class NewChatMessageRole(str, Enum): SYSTEM = "system" -class ChatVisibility(str, Enum): +class ChatVisibility(StrEnum): """ Visibility/sharing level for chat threads. @@ -788,7 +788,7 @@ class ChatSessionState(BaseModel): ai_responding_to_user = relationship("User") -class MemoryCategory(str, Enum): +class MemoryCategory(StrEnum): """Categories for user memories.""" # Using lowercase keys to match PostgreSQL enum values @@ -1317,6 +1317,12 @@ class SearchSourceConnector(BaseModel, TimestampMixin): last_indexed_at = Column(TIMESTAMP(timezone=True), nullable=True) config = Column(JSON, nullable=False) + # Summary generation (LLM-based) - disabled by default to save resources. + # When enabled, improves hybrid search quality at the cost of LLM calls. 
+ enable_summary = Column( + Boolean, nullable=False, default=False, server_default="false" + ) + # Periodic indexing fields periodic_indexing_enabled = Column(Boolean, nullable=False, default=False) indexing_frequency_minutes = Column(Integer, nullable=True) diff --git a/surfsense_backend/app/indexing_pipeline/adapters/file_upload_adapter.py b/surfsense_backend/app/indexing_pipeline/adapters/file_upload_adapter.py index 65503c60e..ab1095ee3 100644 --- a/surfsense_backend/app/indexing_pipeline/adapters/file_upload_adapter.py +++ b/surfsense_backend/app/indexing_pipeline/adapters/file_upload_adapter.py @@ -13,6 +13,7 @@ async def index_uploaded_file( user_id: str, session: AsyncSession, llm, + should_summarize: bool = False, ) -> None: connector_doc = ConnectorDocument( title=filename, @@ -22,7 +23,7 @@ async def index_uploaded_file( search_space_id=search_space_id, created_by_id=user_id, connector_id=None, - should_summarize=True, + should_summarize=should_summarize, should_use_code_chunker=False, fallback_summary=markdown_content[:4000], metadata={ diff --git a/surfsense_backend/app/indexing_pipeline/document_embedder.py b/surfsense_backend/app/indexing_pipeline/document_embedder.py index ea24a5a56..adec24434 100644 --- a/surfsense_backend/app/indexing_pipeline/document_embedder.py +++ b/surfsense_backend/app/indexing_pipeline/document_embedder.py @@ -1,6 +1,3 @@ -from app.config import config +from app.utils.document_converters import embed_text - -def embed_text(text: str) -> list[float]: - """Embed a single text string using the configured embedding model.""" - return config.embedding_model_instance.embed(text) +__all__ = ["embed_text"] diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index 8217b6dac..0aff84f6d 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -118,6 +118,7 @@ async def create_documents( async def create_documents_file_upload( files: list[UploadFile], search_space_id: int = Form(...), + should_summarize: bool = Form(False), session: AsyncSession = Depends(get_async_session), user: User = Depends(current_active_user), ): @@ -303,6 +304,7 @@ async def create_documents_file_upload( filename=filename, search_space_id=search_space_id, user_id=str(user.id), + should_summarize=should_summarize, ) return { diff --git a/surfsense_backend/app/routes/reports_routes.py b/surfsense_backend/app/routes/reports_routes.py index e32d7adcd..652d267fd 100644 --- a/surfsense_backend/app/routes/reports_routes.py +++ b/surfsense_backend/app/routes/reports_routes.py @@ -17,7 +17,7 @@ import logging import os import re import tempfile -from enum import Enum +from enum import StrEnum import pypandoc import typst @@ -46,7 +46,7 @@ router = APIRouter() MAX_REPORT_LIST_LIMIT = 500 -class ExportFormat(str, Enum): +class ExportFormat(StrEnum): PDF = "pdf" DOCX = "docx" diff --git a/surfsense_backend/app/schemas/podcasts.py b/surfsense_backend/app/schemas/podcasts.py index 60f9d7dc0..d41f1ca36 100644 --- a/surfsense_backend/app/schemas/podcasts.py +++ b/surfsense_backend/app/schemas/podcasts.py @@ -1,13 +1,13 @@ """Podcast schemas for API responses.""" from datetime import datetime -from enum import Enum +from enum import StrEnum from typing import Any from pydantic import BaseModel -class PodcastStatusEnum(str, Enum): +class PodcastStatusEnum(StrEnum): PENDING = "pending" GENERATING = "generating" READY = "ready" diff --git 
a/surfsense_backend/app/schemas/search_source_connector.py b/surfsense_backend/app/schemas/search_source_connector.py
index b8ff3e649..1b0ed0b13 100644
--- a/surfsense_backend/app/schemas/search_source_connector.py
+++ b/surfsense_backend/app/schemas/search_source_connector.py
@@ -16,6 +16,7 @@ class SearchSourceConnectorBase(BaseModel):
     is_indexable: bool
     last_indexed_at: datetime | None = None
     config: dict[str, Any]
+    enable_summary: bool = False
     periodic_indexing_enabled: bool = False
     indexing_frequency_minutes: int | None = None
     next_scheduled_at: datetime | None = None
@@ -65,6 +66,7 @@ class SearchSourceConnectorUpdate(BaseModel):
     is_indexable: bool | None = None
     last_indexed_at: datetime | None = None
     config: dict[str, Any] | None = None
+    enable_summary: bool | None = None
     periodic_indexing_enabled: bool | None = None
     indexing_frequency_minutes: int | None = None
     next_scheduled_at: datetime | None = None
diff --git a/surfsense_backend/app/services/linear/kb_sync_service.py b/surfsense_backend/app/services/linear/kb_sync_service.py
index bbae8c6e8..8d1bc47c7 100644
--- a/surfsense_backend/app/services/linear/kb_sync_service.py
+++ b/surfsense_backend/app/services/linear/kb_sync_service.py
@@ -4,12 +4,12 @@ from datetime import datetime
 from sqlalchemy import delete
 from sqlalchemy.ext.asyncio import AsyncSession
 
-from app.config import config
 from app.connectors.linear_connector import LinearConnector
 from app.db import Chunk, Document
 from app.services.llm_service import get_user_long_context_llm
 from app.utils.document_converters import (
     create_document_chunks,
+    embed_text,
     generate_content_hash,
     generate_document_summary,
 )
@@ -80,7 +80,6 @@ class LinearKBSyncService:
         state = formatted_issue.get("state", "Unknown")
         priority = issue_raw.get("priorityLabel", "Unknown")
         comment_count = len(formatted_issue.get("comments", []))
-        description = formatted_issue.get("description", "")
 
         user_llm = await get_user_long_context_llm(
             self.db_session, user_id, search_space_id, disable_streaming=True
@@ -100,18 +99,10 @@ class LinearKBSyncService:
                 issue_content, user_llm, document_metadata_for_summary
             )
         else:
-            if description and len(description) > 1000:
-                description = description[:997] + "..."
summary_content = ( - f"Linear Issue {issue_identifier}: {issue_title}\n\n" - f"Status: {state}\n\n" - ) - if description: - summary_content += f"Description: {description}\n\n" - summary_content += f"Comments: {comment_count}" - summary_embedding = config.embedding_model_instance.embed( - summary_content + f"Linear Issue {issue_identifier}: {issue_title}\n\n{issue_content}" ) + summary_embedding = embed_text(summary_content) await self.db_session.execute( delete(Chunk).where(Chunk.document_id == document.id) diff --git a/surfsense_backend/app/services/llm_router_service.py b/surfsense_backend/app/services/llm_router_service.py index 58a75cf54..23fcad69d 100644 --- a/surfsense_backend/app/services/llm_router_service.py +++ b/surfsense_backend/app/services/llm_router_service.py @@ -15,10 +15,12 @@ import logging from typing import Any from langchain_core.callbacks import CallbackManagerForLLMRun +from langchain_core.exceptions import ContextOverflowError from langchain_core.language_models import BaseChatModel from langchain_core.messages import AIMessage, AIMessageChunk, BaseMessage from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult from litellm import Router +from litellm.exceptions import ContextWindowExceededError logger = logging.getLogger(__name__) @@ -359,13 +361,15 @@ class ChatLiteLLMRouter(BaseChatModel): if self._tool_choice is not None: call_kwargs["tool_choice"] = self._tool_choice - # Call router completion - response = self._router.completion( - model=self.model, - messages=formatted_messages, - stop=stop, - **call_kwargs, - ) + try: + response = self._router.completion( + model=self.model, + messages=formatted_messages, + stop=stop, + **call_kwargs, + ) + except ContextWindowExceededError as e: + raise ContextOverflowError(str(e)) from e # Convert response to ChatResult with potential tool calls message = self._convert_response_to_message(response.choices[0].message) @@ -396,13 +400,15 @@ class ChatLiteLLMRouter(BaseChatModel): if self._tool_choice is not None: call_kwargs["tool_choice"] = self._tool_choice - # Call router async completion - response = await self._router.acompletion( - model=self.model, - messages=formatted_messages, - stop=stop, - **call_kwargs, - ) + try: + response = await self._router.acompletion( + model=self.model, + messages=formatted_messages, + stop=stop, + **call_kwargs, + ) + except ContextWindowExceededError as e: + raise ContextOverflowError(str(e)) from e # Convert response to ChatResult with potential tool calls message = self._convert_response_to_message(response.choices[0].message) @@ -432,14 +438,16 @@ class ChatLiteLLMRouter(BaseChatModel): if self._tool_choice is not None: call_kwargs["tool_choice"] = self._tool_choice - # Call router completion with streaming - response = self._router.completion( - model=self.model, - messages=formatted_messages, - stop=stop, - stream=True, - **call_kwargs, - ) + try: + response = self._router.completion( + model=self.model, + messages=formatted_messages, + stop=stop, + stream=True, + **call_kwargs, + ) + except ContextWindowExceededError as e: + raise ContextOverflowError(str(e)) from e # Yield chunks for chunk in response: @@ -471,14 +479,16 @@ class ChatLiteLLMRouter(BaseChatModel): if self._tool_choice is not None: call_kwargs["tool_choice"] = self._tool_choice - # Call router async completion with streaming - response = await self._router.acompletion( - model=self.model, - messages=formatted_messages, - stop=stop, - stream=True, - **call_kwargs, - ) + try: + response 
= await self._router.acompletion( + model=self.model, + messages=formatted_messages, + stop=stop, + stream=True, + **call_kwargs, + ) + except ContextWindowExceededError as e: + raise ContextOverflowError(str(e)) from e # Yield chunks asynchronously async for chunk in response: diff --git a/surfsense_backend/app/services/notion/kb_sync_service.py b/surfsense_backend/app/services/notion/kb_sync_service.py index 40b9f0ef1..ce31e0d35 100644 --- a/surfsense_backend/app/services/notion/kb_sync_service.py +++ b/surfsense_backend/app/services/notion/kb_sync_service.py @@ -4,11 +4,11 @@ from datetime import datetime from sqlalchemy import delete from sqlalchemy.ext.asyncio import AsyncSession -from app.config import config from app.db import Chunk, Document from app.services.llm_service import get_user_long_context_llm from app.utils.document_converters import ( create_document_chunks, + embed_text, generate_content_hash, generate_document_summary, ) @@ -127,10 +127,8 @@ class NotionKBSyncService: logger.debug(f"Generated summary length: {len(summary_content)} chars") else: logger.warning("No LLM configured - using fallback summary") - summary_content = f"Notion Page: {document.document_metadata.get('page_title')}\n\n{full_content[:500]}..." - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) + summary_content = f"Notion Page: {document.document_metadata.get('page_title')}\n\n{full_content}" + summary_embedding = embed_text(summary_content) logger.debug(f"Deleting old chunks for document {document_id}") await self.db_session.execute( diff --git a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py index 3e60b5819..60cd21f97 100644 --- a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py @@ -626,6 +626,7 @@ def process_file_upload_with_document_task( filename: str, search_space_id: int, user_id: str, + should_summarize: bool = False, ): """ Celery task to process uploaded file with existing pending document. @@ -640,6 +641,7 @@ def process_file_upload_with_document_task( filename: Original filename search_space_id: ID of the search space user_id: ID of the user + should_summarize: Whether to generate an LLM summary """ import traceback @@ -674,7 +676,12 @@ def process_file_upload_with_document_task( try: loop.run_until_complete( _process_file_with_document( - document_id, temp_path, filename, search_space_id, user_id + document_id, + temp_path, + filename, + search_space_id, + user_id, + should_summarize=should_summarize, ) ) logger.info( @@ -710,6 +717,7 @@ async def _process_file_with_document( filename: str, search_space_id: int, user_id: str, + should_summarize: bool = False, ): """ Process file and update existing pending document status. 
@@ -811,6 +819,7 @@ async def _process_file_with_document( task_logger=task_logger, log_entry=log_entry, notification=notification, + should_summarize=should_summarize, ) # Update notification on success diff --git a/surfsense_backend/app/tasks/connector_indexers/airtable_indexer.py b/surfsense_backend/app/tasks/connector_indexers/airtable_indexer.py index 46cd069c9..438a93815 100644 --- a/surfsense_backend/app/tasks/connector_indexers/airtable_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/airtable_indexer.py @@ -12,13 +12,13 @@ from collections.abc import Awaitable, Callable from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession -from app.config import config from app.connectors.airtable_history import AirtableHistoryConnector from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, + embed_text, generate_content_hash, generate_document_summary, generate_unique_identifier_hash, @@ -399,7 +399,7 @@ async def index_airtable_records( session, user_id, search_space_id ) - if user_llm: + if user_llm and connector.enable_summary: document_metadata_for_summary = { "record_id": item["record_id"], "created_time": item["record"].get("CREATED_TIME()", ""), @@ -415,11 +415,8 @@ async def index_airtable_records( document_metadata_for_summary, ) else: - # Fallback to simple summary if no LLM configured - summary_content = f"Airtable Record: {item['record_id']}\n\n" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) + summary_content = f"Airtable Record: {item['record_id']}\n\n{item['markdown_content']}" + summary_embedding = embed_text(summary_content) chunks = await create_document_chunks(item["markdown_content"]) diff --git a/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py b/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py index d60884539..bf3aaa35f 100644 --- a/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py @@ -13,13 +13,13 @@ from datetime import datetime from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession -from app.config import config from app.connectors.bookstack_connector import BookStackConnector from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, + embed_text, generate_content_hash, generate_document_summary, generate_unique_identifier_hash, @@ -403,7 +403,7 @@ async def index_bookstack_pages( "connector_id": connector_id, } - if user_llm: + if user_llm and connector.enable_summary: summary_metadata = { "page_name": item["page_name"], "page_id": item["page_id"], @@ -418,17 +418,8 @@ async def index_bookstack_pages( item["full_content"], user_llm, summary_metadata ) else: - # Fallback to simple summary if no LLM configured - summary_content = f"BookStack Page: {item['page_name']}\n\nBook ID: {item['book_id']}\n\n" - if item["page_content"]: - # Take first 1000 characters of content for summary - content_preview = item["page_content"][:1000] - if len(item["page_content"]) > 1000: - 
content_preview += "..." - summary_content += f"Content Preview: {content_preview}\n\n" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) + summary_content = f"BookStack Page: {item['page_name']}\n\nBook ID: {item['book_id']}\n\n{item['full_content']}" + summary_embedding = embed_text(summary_content) # Process chunks - using the full page content chunks = await create_document_chunks(item["full_content"]) diff --git a/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py b/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py index 47c5d8b3b..fd0233e87 100644 --- a/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py @@ -14,13 +14,13 @@ from datetime import datetime from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession -from app.config import config from app.connectors.clickup_history import ClickUpHistoryConnector from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, + embed_text, generate_content_hash, generate_document_summary, generate_unique_identifier_hash, @@ -398,7 +398,7 @@ async def index_clickup_tasks( session, user_id, search_space_id ) - if user_llm: + if user_llm and connector.enable_summary: document_metadata_for_summary = { "task_id": item["task_id"], "task_name": item["task_name"], @@ -418,9 +418,7 @@ async def index_clickup_tasks( ) else: summary_content = item["task_content"] - summary_embedding = config.embedding_model_instance.embed( - item["task_content"] - ) + summary_embedding = embed_text(item["task_content"]) chunks = await create_document_chunks(item["task_content"]) diff --git a/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py index a3a059d4e..c28e82b8f 100644 --- a/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py @@ -14,13 +14,13 @@ from datetime import datetime from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession -from app.config import config from app.connectors.confluence_history import ConfluenceHistoryConnector from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, + embed_text, generate_content_hash, generate_document_summary, generate_unique_identifier_hash, @@ -378,7 +378,7 @@ async def index_confluence_pages( session, user_id, search_space_id ) - if user_llm: + if user_llm and connector.enable_summary: document_metadata = { "page_title": item["page_title"], "page_id": item["page_id"], @@ -394,18 +394,8 @@ async def index_confluence_pages( item["full_content"], user_llm, document_metadata ) else: - # Fallback to simple summary if no LLM configured - summary_content = f"Confluence Page: {item['page_title']}\n\nSpace ID: {item['space_id']}\n\n" - if item["page_content"]: - # Take first 1000 characters of content for summary - content_preview = item["page_content"][:1000] - if len(item["page_content"]) > 1000: - 
content_preview += "..." - summary_content += f"Content Preview: {content_preview}\n\n" - summary_content += f"Comments: {item['comment_count']}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) + summary_content = f"Confluence Page: {item['page_title']}\n\nSpace ID: {item['space_id']}\n\n{item['full_content']}" + summary_embedding = embed_text(summary_content) # Process chunks - using the full page content with comments chunks = await create_document_chunks(item["full_content"]) diff --git a/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py b/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py index 8769d03c5..0421352ff 100644 --- a/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py @@ -23,6 +23,7 @@ from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnector from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, + embed_text, generate_content_hash, generate_unique_identifier_hash, ) @@ -669,9 +670,7 @@ async def index_discord_messages( # Heavy processing (embeddings, chunks) chunks = await create_document_chunks(item["combined_document_string"]) - doc_embedding = config.embedding_model_instance.embed( - item["combined_document_string"] - ) + doc_embedding = embed_text(item["combined_document_string"]) # Update document to READY with actual content document.title = f"{item['guild_name']}#{item['channel_name']}" diff --git a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py index e1a1ddd4d..fc6634024 100644 --- a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py @@ -16,13 +16,13 @@ from datetime import UTC, datetime from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession -from app.config import config from app.connectors.github_connector import GitHubConnector from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, + embed_text, generate_content_hash, generate_document_summary, generate_unique_identifier_hash, @@ -367,7 +367,7 @@ async def index_github_repos( "estimated_tokens": digest.estimated_tokens, } - if user_llm: + if user_llm and connector.enable_summary: # Prepare content for summarization summary_content = digest.full_digest if len(summary_content) > MAX_DIGEST_CHARS: @@ -381,15 +381,12 @@ async def index_github_repos( summary_content, user_llm, document_metadata_for_summary ) else: - # Fallback to simple summary if no LLM configured summary_text = ( f"# GitHub Repository: {repo_full_name}\n\n" f"## Summary\n{digest.summary}\n\n" - f"## File Structure\n{digest.tree[:3000]}" - ) - summary_embedding = config.embedding_model_instance.embed( - summary_text + f"## File Structure\n{digest.tree}" ) + summary_embedding = embed_text(summary_text) # Chunk the full digest content for granular search try: @@ -551,7 +548,7 @@ async def _simple_chunk_content(content: str, chunk_size: int = 4000) -> list: chunks.append( Chunk( content=chunk_text, - embedding=config.embedding_model_instance.embed(chunk_text), + 
embedding=embed_text(chunk_text), ) ) diff --git a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py index 1476f3f40..1407d98dd 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py @@ -20,6 +20,7 @@ from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, + embed_text, generate_content_hash, generate_document_summary, generate_unique_identifier_hash, @@ -489,7 +490,7 @@ async def index_google_calendar_events( session, user_id, search_space_id ) - if user_llm: + if user_llm and connector.enable_summary: document_metadata_for_summary = { "event_id": item["event_id"], "event_summary": item["event_summary"], @@ -507,22 +508,8 @@ async def index_google_calendar_events( item["event_markdown"], user_llm, document_metadata_for_summary ) else: - summary_content = ( - f"Google Calendar Event: {item['event_summary']}\n\n" - ) - summary_content += f"Calendar: {item['calendar_id']}\n" - summary_content += f"Start: {item['start_time']}\n" - summary_content += f"End: {item['end_time']}\n" - if item["location"]: - summary_content += f"Location: {item['location']}\n" - if item["description"]: - desc_preview = item["description"][:1000] - if len(item["description"]) > 1000: - desc_preview += "..." - summary_content += f"Description: {desc_preview}\n" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) + summary_content = f"Google Calendar Event: {item['event_summary']}\n\n{item['event_markdown']}" + summary_embedding = embed_text(summary_content) chunks = await create_document_chunks(item["event_markdown"]) diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py index f7624cffe..20c54d3fc 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py @@ -352,7 +352,7 @@ async def index_google_drive_single_file( await session.commit() # Process the file - indexed, skipped, failed = await _process_single_file( + indexed, _skipped, failed = await _process_single_file( drive_client=drive_client, session=session, file=file, @@ -608,7 +608,7 @@ async def _index_with_delta_sync( {"stage": "delta_sync", "start_token": start_page_token}, ) - changes, final_token, error = await fetch_all_changes( + changes, _final_token, error = await fetch_all_changes( drive_client, start_page_token, folder_id ) @@ -1011,7 +1011,7 @@ async def _process_single_file( pending_document.status = DocumentStatus.processing() await session.commit() - _, error, metadata = await download_and_process_file( + _, error, _metadata = await download_and_process_file( client=drive_client, file=file, search_space_id=search_space_id, diff --git a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py index c7caee4da..1a8b2b176 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py @@ -25,6 +25,7 @@ from app.services.llm_service import get_user_long_context_llm from 
app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, + embed_text, generate_content_hash, generate_document_summary, generate_unique_identifier_hash, @@ -413,7 +414,7 @@ async def index_google_gmail_messages( session, user_id, search_space_id ) - if user_llm: + if user_llm and connector.enable_summary: document_metadata_for_summary = { "message_id": item["message_id"], "thread_id": item["thread_id"], @@ -432,12 +433,8 @@ async def index_google_gmail_messages( document_metadata_for_summary, ) else: - summary_content = f"Google Gmail Message: {item['subject']}\n\n" - summary_content += f"Sender: {item['sender']}\n" - summary_content += f"Date: {item['date_str']}\n" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) + summary_content = f"Google Gmail Message: {item['subject']}\n\nFrom: {item['sender']}\nDate: {item['date_str']}\n\n{item['markdown_content']}" + summary_embedding = embed_text(summary_content) chunks = await create_document_chunks(item["markdown_content"]) diff --git a/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py b/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py index 65f56ce46..dec37428a 100644 --- a/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py @@ -14,13 +14,13 @@ from datetime import datetime from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession -from app.config import config from app.connectors.jira_history import JiraHistoryConnector from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, + embed_text, generate_content_hash, generate_document_summary, generate_unique_identifier_hash, @@ -356,7 +356,7 @@ async def index_jira_issues( session, user_id, search_space_id ) - if user_llm: + if user_llm and connector.enable_summary: document_metadata = { "issue_key": item["issue_identifier"], "issue_title": item["issue_title"], @@ -373,14 +373,8 @@ async def index_jira_issues( item["issue_content"], user_llm, document_metadata ) else: - # Fallback to simple summary if no LLM configured - summary_content = f"Jira Issue {item['issue_identifier']}: {item['issue_title']}\n\nStatus: {item['formatted_issue'].get('status', 'Unknown')}\n\n" - if item["formatted_issue"].get("description"): - summary_content += f"Description: {item['formatted_issue'].get('description')}\n\n" - summary_content += f"Comments: {item['comment_count']}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) + summary_content = f"Jira Issue {item['issue_identifier']}: {item['issue_title']}\n\n{item['issue_content']}" + summary_embedding = embed_text(summary_content) # Process chunks - using the full issue content with comments chunks = await create_document_chunks(item["issue_content"]) diff --git a/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py b/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py index 87bafb3c0..1a2254c5b 100644 --- a/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py @@ -13,13 +13,13 @@ from datetime import datetime from sqlalchemy.exc import SQLAlchemyError from 
sqlalchemy.ext.asyncio import AsyncSession -from app.config import config from app.connectors.linear_connector import LinearConnector from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, + embed_text, generate_content_hash, generate_document_summary, generate_unique_identifier_hash, @@ -395,7 +395,7 @@ async def index_linear_issues( session, user_id, search_space_id ) - if user_llm: + if user_llm and connector.enable_summary: document_metadata_for_summary = { "issue_id": item["issue_identifier"], "issue_title": item["issue_title"], @@ -412,17 +412,8 @@ async def index_linear_issues( item["issue_content"], user_llm, document_metadata_for_summary ) else: - # Fallback to simple summary if no LLM configured - description = item["description"] - if description and len(description) > 1000: - description = description[:997] + "..." - summary_content = f"Linear Issue {item['issue_identifier']}: {item['issue_title']}\n\nStatus: {item['state']}\n\n" - if description: - summary_content += f"Description: {description}\n\n" - summary_content += f"Comments: {item['comment_count']}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) + summary_content = f"Linear Issue {item['issue_identifier']}: {item['issue_title']}\n\nStatus: {item['state']}\n\n{item['issue_content']}" + summary_embedding = embed_text(summary_content) chunks = await create_document_chunks(item["issue_content"]) diff --git a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py index 04af80e53..2d86b09c1 100644 --- a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py @@ -13,13 +13,13 @@ from datetime import datetime, timedelta from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession -from app.config import config from app.connectors.luma_connector import LumaConnector from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, + embed_text, generate_content_hash, generate_document_summary, generate_unique_identifier_hash, @@ -441,7 +441,7 @@ async def index_luma_events( session, user_id, search_space_id ) - if user_llm: + if user_llm and connector.enable_summary: document_metadata_for_summary = { "event_id": item["event_id"], "event_name": item["event_name"], @@ -462,29 +462,10 @@ async def index_luma_events( item["event_markdown"], user_llm, document_metadata_for_summary ) else: - # Fallback to simple summary if no LLM configured - summary_content = f"Luma Event: {item['event_name']}\n\n" - if item["event_url"]: - summary_content += f"URL: {item['event_url']}\n" - summary_content += f"Start: {item['start_at']}\n" - summary_content += f"End: {item['end_at']}\n" - if item["timezone"]: - summary_content += f"Timezone: {item['timezone']}\n" - if item["location"]: - summary_content += f"Location: {item['location']}\n" - if item["city"]: - summary_content += f"City: {item['city']}\n" - if item["host_names"]: - summary_content += f"Hosts: {item['host_names']}\n" - if 
item["description"]: - desc_preview = item["description"][:1000] - if len(item["description"]) > 1000: - desc_preview += "..." - summary_content += f"Description: {desc_preview}\n" - - summary_embedding = config.embedding_model_instance.embed( - summary_content + summary_content = ( + f"Luma Event: {item['event_name']}\n\n{item['event_markdown']}" ) + summary_embedding = embed_text(summary_content) chunks = await create_document_chunks(item["event_markdown"]) diff --git a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py index d2b1c9137..b0c49dea5 100644 --- a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py @@ -13,13 +13,13 @@ from datetime import datetime from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession -from app.config import config from app.connectors.notion_history import NotionHistoryConnector from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, + embed_text, generate_content_hash, generate_document_summary, generate_unique_identifier_hash, @@ -447,7 +447,7 @@ async def index_notion_pages( session, user_id, search_space_id ) - if user_llm: + if user_llm and connector.enable_summary: document_metadata_for_summary = { "page_title": item["page_title"], "page_id": item["page_id"], @@ -463,11 +463,8 @@ async def index_notion_pages( document_metadata_for_summary, ) else: - # Fallback to simple summary if no LLM configured - summary_content = f"Notion Page: {item['page_title']}\n\n{item['markdown_content'][:500]}..." 
- summary_embedding = config.embedding_model_instance.embed( - summary_content - ) + summary_content = f"Notion Page: {item['page_title']}\n\n{item['markdown_content']}" + summary_embedding = embed_text(summary_content) chunks = await create_document_chunks(item["markdown_content"]) diff --git a/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py b/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py index 6dea1a730..c0eef84d5 100644 --- a/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py @@ -26,6 +26,7 @@ from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, + embed_text, generate_content_hash, generate_document_summary, generate_unique_identifier_hash, @@ -546,7 +547,7 @@ async def index_obsidian_vault( # Generate summary summary_content = "" - if long_context_llm: + if long_context_llm and connector.enable_summary: summary_content, _ = await generate_document_summary( document_string, long_context_llm, @@ -554,7 +555,7 @@ async def index_obsidian_vault( ) # Generate embedding - embedding = config.embedding_model_instance.embed(document_string) + embedding = embed_text(document_string) # Add URL and summary to metadata document_metadata["url"] = f"obsidian://{vault_name}/{relative_path}" diff --git a/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py b/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py index 01771d2ac..f83b171bc 100644 --- a/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py @@ -17,12 +17,12 @@ from slack_sdk.errors import SlackApiError from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession -from app.config import config from app.connectors.slack_history import SlackHistory from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, + embed_text, generate_content_hash, generate_unique_identifier_hash, ) @@ -542,9 +542,7 @@ async def index_slack_messages( # Heavy processing (embeddings, chunks) chunks = await create_document_chunks(item["combined_document_string"]) - doc_embedding = config.embedding_model_instance.embed( - item["combined_document_string"] - ) + doc_embedding = embed_text(item["combined_document_string"]) # Update document to READY with actual content document.title = f"{item['team_name']}#{item['channel_name']}" diff --git a/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py b/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py index cf6828268..ad34e8696 100644 --- a/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py @@ -16,12 +16,12 @@ from datetime import UTC, datetime from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession -from app.config import config from app.connectors.teams_history import TeamsHistory from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, + embed_text, generate_content_hash, 
generate_unique_identifier_hash, ) @@ -581,9 +581,7 @@ async def index_teams_messages( # Heavy processing (embeddings, chunks) chunks = await create_document_chunks(item["combined_document_string"]) - doc_embedding = config.embedding_model_instance.embed( - item["combined_document_string"] - ) + doc_embedding = embed_text(item["combined_document_string"]) # Update document to READY with actual content document.title = f"{item['team_name']} - {item['channel_name']}" diff --git a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py index 8b6005b54..94361cc27 100644 --- a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py @@ -13,13 +13,13 @@ from datetime import datetime from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession -from app.config import config from app.connectors.webcrawler_connector import WebCrawlerConnector from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, + embed_text, generate_content_hash, generate_document_summary, generate_unique_identifier_hash, @@ -377,7 +377,7 @@ async def index_crawled_urls( session, user_id, search_space_id ) - if user_llm: + if user_llm and connector.enable_summary: document_metadata_for_summary = { "url": url, "title": title, @@ -393,24 +393,8 @@ async def index_crawled_urls( structured_document, user_llm, document_metadata_for_summary ) else: - # Fallback to simple summary if no LLM configured - summary_content = f"Crawled URL: {title}\n\n" - summary_content += f"URL: {url}\n" - if description: - summary_content += f"Description: {description}\n" - if language: - summary_content += f"Language: {language}\n" - summary_content += f"Crawler: {crawler_type}\n\n" - - # Add content preview - content_preview = content[:1000] - if len(content) > 1000: - content_preview += "..." 
- summary_content += f"Content Preview:\n{content_preview}\n" - - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) + summary_content = f"Crawled URL: {title}\n\nURL: {url}\n\n{content}" + summary_embedding = embed_text(summary_content) # Process chunks chunks = await create_document_chunks(content) diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py index 181c2fb31..b77777e06 100644 --- a/surfsense_backend/app/tasks/document_processors/file_processors.py +++ b/surfsense_backend/app/tasks/document_processors/file_processors.py @@ -25,6 +25,7 @@ from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( convert_document_to_markdown, create_document_chunks, + embed_text, generate_content_hash, generate_document_summary, generate_unique_identifier_hash, @@ -760,11 +761,7 @@ async def add_received_file_document_using_docling( f"{metadata_section}\n\n# DOCUMENT SUMMARY\n\n{summary_content}" ) - from app.config import config - - summary_embedding = config.embedding_model_instance.embed( - enhanced_summary_content - ) + summary_embedding = embed_text(enhanced_summary_content) # Process chunks chunks = await create_document_chunks(file_in_markdown) @@ -1599,6 +1596,7 @@ async def process_file_in_background_with_document( log_entry: Log, connector: dict | None = None, notification: Notification | None = None, + should_summarize: bool = False, ) -> Document | None: """ Process file and update existing pending document (2-phase pattern). @@ -1881,6 +1879,7 @@ async def process_file_in_background_with_document( user_id=user_id, session=session, llm=user_llm, + should_summarize=should_summarize, ) await task_logger.log_task_success( diff --git a/surfsense_backend/app/tasks/surfsense_docs_indexer.py b/surfsense_backend/app/tasks/surfsense_docs_indexer.py index ef287bc65..ca4a83de3 100644 --- a/surfsense_backend/app/tasks/surfsense_docs_indexer.py +++ b/surfsense_backend/app/tasks/surfsense_docs_indexer.py @@ -15,6 +15,7 @@ from sqlalchemy.orm import selectinload from app.config import config from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument, async_session_maker +from app.utils.document_converters import embed_text logger = logging.getLogger(__name__) @@ -89,7 +90,7 @@ def create_surfsense_docs_chunks(content: str) -> list[SurfsenseDocsChunk]: return [ SurfsenseDocsChunk( content=chunk.text, - embedding=config.embedding_model_instance.embed(chunk.text), + embedding=embed_text(chunk.text), ) for chunk in config.chunker_instance.chunk(content) ] @@ -154,7 +155,7 @@ async def index_surfsense_docs(session: AsyncSession) -> tuple[int, int, int, in existing_doc.title = title existing_doc.content = content existing_doc.content_hash = content_hash - existing_doc.embedding = config.embedding_model_instance.embed(content) + existing_doc.embedding = embed_text(content) existing_doc.chunks = chunks existing_doc.updated_at = datetime.now(UTC) @@ -170,7 +171,7 @@ async def index_surfsense_docs(session: AsyncSession) -> tuple[int, int, int, in title=title, content=content, content_hash=content_hash, - embedding=config.embedding_model_instance.embed(content), + embedding=embed_text(content), chunks=chunks, updated_at=datetime.now(UTC), ) diff --git a/surfsense_backend/app/utils/document_converters.py b/surfsense_backend/app/utils/document_converters.py index d7e1db71d..8049b0de5 100644 --- 
a/surfsense_backend/app/utils/document_converters.py +++ b/surfsense_backend/app/utils/document_converters.py @@ -1,11 +1,59 @@ import hashlib +import logging +import warnings +import numpy as np from litellm import get_model_info, token_counter from app.config import config from app.db import Chunk, DocumentType from app.prompts import SUMMARY_PROMPT_TEMPLATE +logger = logging.getLogger(__name__) + + +def _get_embedding_max_tokens() -> int: + """Get the max token limit for the configured embedding model. + + Checks model properties in order: max_seq_length, _max_tokens. + Falls back to 8192 (OpenAI embedding default). + """ + model = config.embedding_model_instance + for attr in ("max_seq_length", "_max_tokens"): + val = getattr(model, attr, None) + if isinstance(val, int) and val > 0: + return val + return 8192 + + +def truncate_for_embedding(text: str) -> str: + """Truncate text to fit within the embedding model's context window. + + Uses the embedding model's own tokenizer for accurate token counting, + so the result is model-agnostic regardless of the underlying provider. + """ + max_tokens = _get_embedding_max_tokens() + if len(text) // 3 <= max_tokens: + return text + + tokenizer = config.embedding_model_instance.get_tokenizer() + tokens = tokenizer.encode(text) + if len(tokens) <= max_tokens: + return text + + warnings.warn( + f"Truncating text from {len(tokens)} to {max_tokens} tokens for embedding.", + stacklevel=2, + ) + return tokenizer.decode(tokens[:max_tokens]) + + +def embed_text(text: str) -> np.ndarray: + """Truncate text to fit and embed it. Drop-in replacement for + ``config.embedding_model_instance.embed(text)`` that never exceeds the + model's context window.""" + return config.embedding_model_instance.embed(truncate_for_embedding(text)) + def get_model_context_window(model_name: str) -> int: """Get the total context window size for a model (input + output tokens).""" @@ -146,7 +194,7 @@ async def generate_document_summary( else: enhanced_summary_content = summary_content - summary_embedding = config.embedding_model_instance.embed(enhanced_summary_content) + summary_embedding = embed_text(enhanced_summary_content) return enhanced_summary_content, summary_embedding @@ -164,7 +212,7 @@ async def create_document_chunks(content: str) -> list[Chunk]: return [ Chunk( content=chunk.text, - embedding=config.embedding_model_instance.embed(chunk.text), + embedding=embed_text(chunk.text), ) for chunk in config.chunker_instance.chunk(content) ] diff --git a/surfsense_backend/main.py b/surfsense_backend/main.py index 9c1b5c342..4a7a9b7b1 100644 --- a/surfsense_backend/main.py +++ b/surfsense_backend/main.py @@ -29,4 +29,7 @@ if __name__ == "__main__": config = uvicorn.Config(**config_kwargs) server = uvicorn.Server(config) - server.run() + if sys.platform == "win32": + asyncio.run(server.serve(), loop_factory=asyncio.SelectorEventLoop) + else: + server.run() diff --git a/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx index d1d98bbf6..1df9ef06c 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx @@ -26,7 +26,6 @@ import { import { clearPlanOwnerRegistry, // extractWriteTodosFromContent, - hydratePlanStateAtom, } from "@/atoms/chat/plan-state.atom"; import { closeReportPanelAtom } from "@/atoms/chat/report-panel.atom"; import { membersAtom } from 
"@/atoms/members/members-query.atoms"; @@ -73,7 +72,6 @@ import { appendText, buildContentForPersistence, buildContentForUI, - type ContentPart, type ContentPartsState, readSSEStream, type ThinkingStepData, @@ -188,7 +186,6 @@ export default function NewChatPage() { const setMentionedDocumentIds = useSetAtom(mentionedDocumentIdsAtom); const setMentionedDocuments = useSetAtom(mentionedDocumentsAtom); const setMessageDocumentsMap = useSetAtom(messageDocumentsMapAtom); - const hydratePlanState = useSetAtom(hydratePlanStateAtom); const setCurrentThreadState = useSetAtom(currentThreadAtom); const setTargetCommentId = useSetAtom(setTargetCommentIdAtom); const clearTargetCommentId = useSetAtom(clearTargetCommentIdAtom); @@ -350,7 +347,6 @@ export default function NewChatPage() { setMessageDocumentsMap, setMentionedDocumentIds, setMentionedDocuments, - hydratePlanState, closeReportPanel, ]); diff --git a/surfsense_web/components/assistant-ui/connector-popup.tsx b/surfsense_web/components/assistant-ui/connector-popup.tsx index bac5c44a3..98964013d 100644 --- a/surfsense_web/components/assistant-ui/connector-popup.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup.tsx @@ -97,6 +97,7 @@ export const ConnectorIndicator: FC<{ hideTrigger?: boolean }> = ({ hideTrigger isDisconnecting, periodicEnabled, frequencyMinutes, + enableSummary, allConnectors, viewingAccountsType, viewingMCPList, @@ -105,6 +106,7 @@ export const ConnectorIndicator: FC<{ hideTrigger?: boolean }> = ({ hideTrigger setEndDate, setPeriodicEnabled, setFrequencyMinutes, + setEnableSummary, handleOpenChange, handleTabChange, handleScroll, @@ -282,6 +284,7 @@ export const ConnectorIndicator: FC<{ hideTrigger?: boolean }> = ({ hideTrigger endDate={endDate} periodicEnabled={periodicEnabled} frequencyMinutes={frequencyMinutes} + enableSummary={enableSummary} isSaving={isSaving} isDisconnecting={isDisconnecting} isIndexing={indexingConnectorIds.has(editingConnector.id)} @@ -290,6 +293,7 @@ export const ConnectorIndicator: FC<{ hideTrigger?: boolean }> = ({ hideTrigger onEndDateChange={setEndDate} onPeriodicEnabledChange={setPeriodicEnabled} onFrequencyChange={setFrequencyMinutes} + onEnableSummaryChange={setEnableSummary} onSave={() => { startIndexing(editingConnector.id); handleSaveConnector(() => refreshConnectors()); @@ -328,11 +332,13 @@ export const ConnectorIndicator: FC<{ hideTrigger?: boolean }> = ({ hideTrigger endDate={endDate} periodicEnabled={periodicEnabled} frequencyMinutes={frequencyMinutes} + enableSummary={enableSummary} isStartingIndexing={isStartingIndexing} onStartDateChange={setStartDate} onEndDateChange={setEndDate} onPeriodicEnabledChange={setPeriodicEnabled} onFrequencyChange={setFrequencyMinutes} + onEnableSummaryChange={setEnableSummary} onConfigChange={setIndexingConnectorConfig} onStartIndexing={() => { if (indexingConfig.connectorId) { diff --git a/surfsense_web/components/assistant-ui/connector-popup/components/summary-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/components/summary-config.tsx new file mode 100644 index 000000000..b9ff69f5f --- /dev/null +++ b/surfsense_web/components/assistant-ui/connector-popup/components/summary-config.tsx @@ -0,0 +1,25 @@ +"use client"; + +import type { FC } from "react"; +import { Switch } from "@/components/ui/switch"; + +interface SummaryConfigProps { + enabled: boolean; + onEnabledChange: (enabled: boolean) => void; +} + +export const SummaryConfig: FC = ({ enabled, onEnabledChange }) => { + return ( +
		<div className="flex items-center justify-between gap-4 rounded-lg border p-3">
+			<div className="space-y-0.5">
+				<p className="text-sm font-medium">
+					Enable AI Summary
+				</p>
+				<p className="text-xs text-muted-foreground">
+					Improves search quality but adds latency during indexing
+				</p>
+			</div>
+			<Switch checked={enabled} onCheckedChange={onEnabledChange} />
+		</div>
+ ); +}; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx index cfdebee60..536044df4 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx @@ -9,6 +9,7 @@ import type { SearchSourceConnector } from "@/contracts/types/connector.types"; import { cn } from "@/lib/utils"; import { DateRangeSelector } from "../../components/date-range-selector"; import { PeriodicSyncConfig } from "../../components/periodic-sync-config"; +import { SummaryConfig } from "../../components/summary-config"; import { getConnectorDisplayName } from "../../tabs/all-connectors-tab"; import { getConnectorConfigComponent } from "../index"; @@ -18,6 +19,7 @@ interface ConnectorEditViewProps { endDate: Date | undefined; periodicEnabled: boolean; frequencyMinutes: string; + enableSummary: boolean; isSaving: boolean; isDisconnecting: boolean; isIndexing?: boolean; @@ -26,6 +28,7 @@ interface ConnectorEditViewProps { onEndDateChange: (date: Date | undefined) => void; onPeriodicEnabledChange: (enabled: boolean) => void; onFrequencyChange: (frequency: string) => void; + onEnableSummaryChange: (enabled: boolean) => void; onSave: () => void; onDisconnect: () => void; onBack: () => void; @@ -40,6 +43,7 @@ export const ConnectorEditView: FC = ({ endDate, periodicEnabled, frequencyMinutes, + enableSummary, isSaving, isDisconnecting, isIndexing = false, @@ -48,6 +52,7 @@ export const ConnectorEditView: FC = ({ onEndDateChange, onPeriodicEnabledChange, onFrequencyChange, + onEnableSummaryChange, onSave, onDisconnect, onBack, @@ -209,9 +214,12 @@ export const ConnectorEditView: FC = ({ /> )} - {/* Date range selector and periodic sync - only shown for indexable connectors */} + {/* Summary and sync settings - only shown for indexable connectors */} {connector.is_indexable && ( <> + {/* AI Summary toggle */} + + {/* Date range selector - not shown for Google Drive (regular and Composio), Webcrawler, or GitHub (indexes full repo snapshots) */} {connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" && connector.connector_type !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" && diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx index 11760bb30..69d5c8e28 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx @@ -10,6 +10,7 @@ import { getConnectorTypeDisplay } from "@/lib/connectors/utils"; import { cn } from "@/lib/utils"; import { DateRangeSelector } from "../../components/date-range-selector"; import { PeriodicSyncConfig } from "../../components/periodic-sync-config"; +import { SummaryConfig } from "../../components/summary-config"; import type { IndexingConfigState } from "../../constants/connector-constants"; import { getConnectorDisplayName } from "../../tabs/all-connectors-tab"; import { getConnectorConfigComponent } from "../index"; @@ -21,11 +22,13 @@ interface IndexingConfigurationViewProps { endDate: Date | undefined; periodicEnabled: boolean; 
frequencyMinutes: string; + enableSummary: boolean; isStartingIndexing: boolean; onStartDateChange: (date: Date | undefined) => void; onEndDateChange: (date: Date | undefined) => void; onPeriodicEnabledChange: (enabled: boolean) => void; onFrequencyChange: (frequency: string) => void; + onEnableSummaryChange: (enabled: boolean) => void; onConfigChange?: (config: Record) => void; onStartIndexing: () => void; onSkip: () => void; @@ -38,11 +41,13 @@ export const IndexingConfigurationView: FC = ({ endDate, periodicEnabled, frequencyMinutes, + enableSummary, isStartingIndexing, onStartDateChange, onEndDateChange, onPeriodicEnabledChange, onFrequencyChange, + onEnableSummaryChange, onConfigChange, onStartIndexing, onSkip, @@ -149,9 +154,12 @@ export const IndexingConfigurationView: FC = ({ )} - {/* Date range selector and periodic sync - only shown for indexable connectors */} + {/* Summary and sync settings - only shown for indexable connectors */} {connector?.is_indexable && ( <> + {/* AI Summary toggle */} + + {/* Date range selector - not shown for Google Drive (regular and Composio), Webcrawler, or GitHub (indexes full repo snapshots) */} {config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" && config.connectorType !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" && diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts index 916d0dca3..4585057ff 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts @@ -67,6 +67,7 @@ export const useConnectorDialog = () => { const [isStartingIndexing, setIsStartingIndexing] = useState(false); const [periodicEnabled, setPeriodicEnabled] = useState(false); const [frequencyMinutes, setFrequencyMinutes] = useState("1440"); + const [enableSummary, setEnableSummary] = useState(false); // Edit mode state const [editingConnector, setEditingConnector] = useState(null); @@ -240,6 +241,7 @@ export const useConnectorDialog = () => { !connector.is_indexable ? false : connector.periodic_indexing_enabled ); setFrequencyMinutes(connector.indexing_frequency_minutes?.toString() || "1440"); + setEnableSummary(connector.enable_summary ?? false); // Reset dates - user can set new ones for re-indexing setStartDate(undefined); setEndDate(undefined); @@ -257,6 +259,7 @@ export const useConnectorDialog = () => { setEndDate(undefined); setPeriodicEnabled(false); setFrequencyMinutes("1440"); + setEnableSummary(false); setIsScrolled(false); setSearchQuery(""); } @@ -269,6 +272,7 @@ export const useConnectorDialog = () => { setEndDate(undefined); setPeriodicEnabled(false); setFrequencyMinutes("1440"); + setEnableSummary(false); setIsScrolled(false); setSearchQuery(""); } @@ -722,6 +726,7 @@ export const useConnectorDialog = () => { setConnectorConfig(connector.config || {}); setPeriodicEnabled(false); setFrequencyMinutes("1440"); + setEnableSummary(connector.enable_summary ?? false); setStartDate(undefined); setEndDate(undefined); @@ -909,12 +914,13 @@ export const useConnectorDialog = () => { const startDateStr = startDate ? format(startDate, "yyyy-MM-dd") : undefined; const endDateStr = endDate ? 
format(endDate, "yyyy-MM-dd") : undefined; - // Update connector with periodic sync settings and config changes - if (periodicEnabled || indexingConnectorConfig) { + // Update connector with summary, periodic sync settings, and config changes + if (enableSummary || periodicEnabled || indexingConnectorConfig) { const frequency = periodicEnabled ? parseInt(frequencyMinutes, 10) : undefined; await updateConnector({ id: indexingConfig.connectorId, data: { + enable_summary: enableSummary, ...(periodicEnabled && { periodic_indexing_enabled: true, indexing_frequency_minutes: frequency, @@ -1042,6 +1048,7 @@ export const useConnectorDialog = () => { updateConnector, periodicEnabled, frequencyMinutes, + enableSummary, router, indexingConnectorConfig, ] @@ -1108,6 +1115,7 @@ export const useConnectorDialog = () => { // Load existing periodic sync settings (disabled for non-indexable connectors) setPeriodicEnabled(!connector.is_indexable ? false : connector.periodic_indexing_enabled); setFrequencyMinutes(connector.indexing_frequency_minutes?.toString() || "1440"); + setEnableSummary(connector.enable_summary ?? false); // Reset dates - user can set new ones for re-indexing setStartDate(undefined); setEndDate(undefined); @@ -1189,6 +1197,7 @@ export const useConnectorDialog = () => { id: editingConnector.id, data: { name: connectorName || editingConnector.name, + enable_summary: enableSummary, periodic_indexing_enabled: !editingConnector.is_indexable ? false : periodicEnabled, indexing_frequency_minutes: !editingConnector.is_indexable ? null : frequency, config: connectorConfig || editingConnector.config, @@ -1326,6 +1335,7 @@ export const useConnectorDialog = () => { updateConnector, periodicEnabled, frequencyMinutes, + enableSummary, getFrequencyLabel, router, connectorConfig, @@ -1518,6 +1528,7 @@ export const useConnectorDialog = () => { setEndDate(undefined); setPeriodicEnabled(false); setFrequencyMinutes("1440"); + setEnableSummary(false); } } }, @@ -1557,6 +1568,7 @@ export const useConnectorDialog = () => { isDisconnecting, periodicEnabled, frequencyMinutes, + enableSummary, searchSpaceId, allConnectors, viewingAccountsType, @@ -1568,6 +1580,7 @@ export const useConnectorDialog = () => { setEndDate, setPeriodicEnabled, setFrequencyMinutes, + setEnableSummary, setConnectorName, // Handlers diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index 94ba187d6..caea98890 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -8,6 +8,7 @@ import { useCallback, useMemo, useRef, useState } from "react"; import { useDropzone } from "react-dropzone"; import { toast } from "sonner"; import { uploadDocumentMutationAtom } from "@/atoms/documents/document-mutation.atoms"; +import { SummaryConfig } from "@/components/assistant-ui/connector-popup/components/summary-config"; import { Accordion, AccordionContent, @@ -124,6 +125,7 @@ export function DocumentUploadTab({ const [files, setFiles] = useState([]); const [uploadProgress, setUploadProgress] = useState(0); const [accordionValue, setAccordionValue] = useState(""); + const [shouldSummarize, setShouldSummarize] = useState(false); const [uploadDocumentMutation] = useAtom(uploadDocumentMutationAtom); const { mutate: uploadDocuments, isPending: isUploading } = uploadDocumentMutation; const fileInputRef = useRef(null); @@ -216,7 +218,7 @@ export function DocumentUploadTab({ }, 200); uploadDocuments( - { files, 
search_space_id: Number(searchSpaceId) }, + { files, search_space_id: Number(searchSpaceId), should_summarize: shouldSummarize }, { onSuccess: () => { clearInterval(progressInterval); @@ -413,6 +415,10 @@ export function DocumentUploadTab({ )} +
+					{/* AI Summary toggle */}
+					<SummaryConfig enabled={shouldSummarize} onEnabledChange={setShouldSummarize} />
+
diff --git a/surfsense_web/lib/electric/client.ts b/surfsense_web/lib/electric/client.ts
--- a/surfsense_web/lib/electric/client.ts
+++ b/surfsense_web/lib/electric/client.ts
 >();
 
 // v5: fixed duplicate key errors, stable cutoff dates, onMustRefetch handler,
 // real-time documents table with title/created_by_id/status columns,
 // consolidated single documents sync, pending state for document queue visibility
-const SYNC_VERSION = 5;
+// v6: added enable_summary column to search_source_connectors
+const SYNC_VERSION = 6;
 
 // Database name prefix for identifying SurfSense databases
 const DB_PREFIX = "surfsense-";
 
@@ -214,20 +215,21 @@ export async function initElectric(userId: string): Promise<void> {
 	// Create the search_source_connectors table schema in PGlite
 	// This matches the backend schema
 	await db.exec(`
-		CREATE TABLE IF NOT EXISTS search_source_connectors (
-			id INTEGER PRIMARY KEY,
-			search_space_id INTEGER NOT NULL,
-			user_id TEXT NOT NULL,
-			connector_type TEXT NOT NULL,
-			name TEXT NOT NULL,
-			is_indexable BOOLEAN NOT NULL DEFAULT FALSE,
-			last_indexed_at TIMESTAMPTZ,
-			config JSONB DEFAULT '{}',
-			periodic_indexing_enabled BOOLEAN NOT NULL DEFAULT FALSE,
-			indexing_frequency_minutes INTEGER,
-			next_scheduled_at TIMESTAMPTZ,
-			created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
-		);
+		CREATE TABLE IF NOT EXISTS search_source_connectors (
+			id INTEGER PRIMARY KEY,
+			search_space_id INTEGER NOT NULL,
+			user_id TEXT NOT NULL,
+			connector_type TEXT NOT NULL,
+			name TEXT NOT NULL,
+			is_indexable BOOLEAN NOT NULL DEFAULT FALSE,
+			last_indexed_at TIMESTAMPTZ,
+			config JSONB DEFAULT '{}',
+			periodic_indexing_enabled BOOLEAN NOT NULL DEFAULT FALSE,
+			indexing_frequency_minutes INTEGER,
+			next_scheduled_at TIMESTAMPTZ,
+			enable_summary BOOLEAN NOT NULL DEFAULT FALSE,
+			created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
+		);
 		CREATE INDEX IF NOT EXISTS idx_connectors_search_space_id ON search_source_connectors(search_space_id);
 		CREATE INDEX IF NOT EXISTS idx_connectors_type ON search_source_connectors(connector_type);
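
The `embed_text` helper introduced in `app/utils/document_converters.py` is the backbone of the indexer changes above: call sites that previously trimmed content by hand (`[:500]`, `[:1000]` previews) or passed it through unchecked now delegate truncation to the embedding model's own tokenizer before embedding. Below is a minimal, self-contained sketch of that truncate-then-embed pattern; `ToyTokenizer`, `ToyEmbeddingModel`, and the 32-token cap are stand-ins invented for the demo, not part of this patch (the real helper reads the model from `app.config`).

```python
# Standalone sketch of the truncate-then-embed guard from
# app/utils/document_converters.py. Model and tokenizer are toy stand-ins;
# only the control flow mirrors the patched helper.
import warnings


class ToyTokenizer:
    """Whitespace tokenizer standing in for the embedding model's own."""

    def encode(self, text: str) -> list[str]:
        return text.split()

    def decode(self, tokens: list[str]) -> str:
        return " ".join(tokens)


class ToyEmbeddingModel:
    max_seq_length = 32  # invented limit; real models expose their own

    def get_tokenizer(self) -> ToyTokenizer:
        return ToyTokenizer()

    def embed(self, text: str) -> list[float]:
        return [0.0] * 384  # placeholder vector of fixed dimension


model = ToyEmbeddingModel()


def truncate_for_embedding(text: str) -> str:
    """Mirror of the patched helper: cheap character-based fast path first,
    exact tokenizer-based truncation only when the text might be too long."""
    max_tokens = getattr(model, "max_seq_length", 8192)
    if len(text) // 3 <= max_tokens:  # ~3 chars/token is a conservative bound
        return text
    tokenizer = model.get_tokenizer()
    tokens = tokenizer.encode(text)
    if len(tokens) <= max_tokens:
        return text
    warnings.warn(f"Truncating text from {len(tokens)} to {max_tokens} tokens.")
    return tokenizer.decode(tokens[:max_tokens])


def embed_text(text: str) -> list[float]:
    """Drop-in wrapper: never feeds the model more than its context window."""
    return model.embed(truncate_for_embedding(text))


if __name__ == "__main__":
    long_doc = "word " * 500  # ~500 toy tokens, well over the 32-token cap
    truncated = truncate_for_embedding(long_doc)
    assert len(truncated.split()) == 32
    print(len(embed_text(long_doc)))  # 384
```

The character fast path only skips tokenization when the text is clearly short: at roughly four characters per English token, anything passing `len(text) // 3 <= max_tokens` is very unlikely to exceed the real token limit, so the encode/decode round-trip is reserved for documents that might actually need trimming.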