From 290a9539eff1dc7a982e4ce981671df2a68a8653 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Thu, 4 Jun 2026 00:48:53 +0530
Subject: [PATCH] feat(db): Remove document summary LLM schema

---
 .../154_remove_document_summary_llm.py        | 157 ++++++++++++++++++
 surfsense_backend/app/db.py                   |   9 --
 .../app/routes/search_spaces_routes.py        |  10 --
 .../app/schemas/new_llm_config.py             |   9 --
 .../app/schemas/search_source_connector.py    |   2 -
 surfsense_backend/app/services/llm_service.py |  33 +----
 .../contracts/types/connector.types.ts        |   3 -
 .../contracts/types/new-llm-config.types.ts   |   3 -
 surfsense_web/zero/schema/documents.ts        |   1 -
 9 files changed, 160 insertions(+), 67 deletions(-)
 create mode 100644 surfsense_backend/alembic/versions/154_remove_document_summary_llm.py

diff --git a/surfsense_backend/alembic/versions/154_remove_document_summary_llm.py b/surfsense_backend/alembic/versions/154_remove_document_summary_llm.py
new file mode 100644
index 000000000..6d0eb45cf
--- /dev/null
+++ b/surfsense_backend/alembic/versions/154_remove_document_summary_llm.py
@@ -0,0 +1,157 @@
+"""remove document summary llm settings
+
+Revision ID: 154
+Revises: 153
+"""
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+
+from alembic import op
+
+revision: str = "154"
+down_revision: str | None = "153"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+PUBLICATION_NAME = "zero_publication"
+
+# Column subsets used when a table is published with an explicit column list.
+DOCUMENT_COLS = [
+    "id",
+    "title",
+    "document_type",
+    "search_space_id",
+    "folder_id",
+    "created_by_id",
+    "status",
+    "created_at",
+    "updated_at",
+]
+
+USER_COLS = [
+    "id",
+    "pages_limit",
+    "pages_used",
+    "premium_credit_micros_limit",
+    "premium_credit_micros_used",
+]
+
+AUTOMATION_RUN_COLS = [
+    "id",
+    "automation_id",
+    "trigger_id",
+    "status",
+    "step_results",
+    "started_at",
+    "finished_at",
+    "created_at",
+]
+
+
+def _column_exists(conn, table: str, column: str) -> bool:
+    """Return True if ``table`` has ``column`` in a search-path schema.
+
+    The lookup is limited to ``current_schemas(false)`` so that a same-named
+    table in an unrelated schema cannot satisfy (or defeat) the idempotency
+    guards in ``upgrade`` and ``downgrade``.
+    """
+    return (
+        conn.execute(
+            sa.text(
+                "SELECT 1 FROM information_schema.columns "
+                "WHERE table_schema = ANY (current_schemas(false)) "
+                "AND table_name = :table AND column_name = :column"
+            ),
+            {"table": table, "column": column},
+        ).fetchone()
+        is not None
+    )
+
+
+def _has_zero_version(conn, table: str) -> bool:
+    """Return True if ``table`` has a ``_0_version`` column."""
+    return _column_exists(conn, table, "_0_version")
+
+
+def _set_table_ddl(conn) -> str:
+    """Build the ``ALTER PUBLICATION ... SET TABLE`` statement.
+
+    ``documents``, ``"user"`` and ``automation_runs`` are published with an
+    explicit column list; ``_0_version`` is appended for ``documents`` and
+    ``"user"`` only when that column exists on the table.
+    """
+    doc_cols = DOCUMENT_COLS + (['"_0_version"'] if _has_zero_version(conn, "documents") else [])
+    user_cols = USER_COLS + (['"_0_version"'] if _has_zero_version(conn, "user") else [])
+    tables = [
+        "notifications",
+        f"documents ({', '.join(doc_cols)})",
+        "folders",
+        "search_source_connectors",
+        "new_chat_messages",
+        "chat_comments",
+        "chat_session_state",
+        f'"user" ({", ".join(user_cols)})',
+        f"automation_runs ({', '.join(AUTOMATION_RUN_COLS)})",
+    ]
+    return f"ALTER PUBLICATION {PUBLICATION_NAME} SET TABLE " + ", ".join(tables)
+
+
+def _resync_zero_publication(tag: str) -> None:
+    """Re-issue the publication's table list when the publication exists.
+
+    Runs in a SAVEPOINT if a transaction is already open, otherwise in a new
+    transaction. The ``SET TABLE`` is bracketed by two ``COMMENT ON
+    PUBLICATION`` statements tagged ``pre-<tag>`` and ``post-<tag>``.
+    """
+    conn = op.get_bind()
+    exists = conn.execute(
+        sa.text("SELECT 1 FROM pg_publication WHERE pubname = :name"),
+        {"name": PUBLICATION_NAME},
+    ).fetchone()
+    if not exists:
+        return
+
+    tx = conn.begin_nested() if conn.in_transaction() else conn.begin()
+    with tx:
+        conn.execute(sa.text(f"COMMENT ON PUBLICATION {PUBLICATION_NAME} IS 'pre-{tag}'"))
+        conn.execute(sa.text(_set_table_ddl(conn)))
+        conn.execute(sa.text(f"COMMENT ON PUBLICATION {PUBLICATION_NAME} IS 'post-{tag}'"))
+
+
+def upgrade() -> None:
+    """Drop the summary columns if present, then resync the publication."""
+    conn = op.get_bind()
+
+    if _column_exists(conn, "searchspaces", "document_summary_llm_id"):
+        op.drop_column("searchspaces", "document_summary_llm_id")
+
+    if _column_exists(conn, "search_source_connectors", "enable_summary"):
+        op.drop_column("search_source_connectors", "enable_summary")
+
+    _resync_zero_publication("154-summary-removal")
+
+
+def downgrade() -> None:
+    """Re-add the summary columns if absent, then resync the publication."""
+    conn = op.get_bind()
+
+    if not _column_exists(conn, "searchspaces", "document_summary_llm_id"):
+        op.add_column(
+            "searchspaces",
+            sa.Column("document_summary_llm_id", sa.Integer(), nullable=True, server_default="0"),
+        )
+
+    if not _column_exists(conn, "search_source_connectors", "enable_summary"):
+        op.add_column(
+            "search_source_connectors",
+            sa.Column(
+                "enable_summary",
+                sa.Boolean(),
+                nullable=False,
+                server_default=sa.text("false"),
+            ),
+        )
+
+    _resync_zero_publication("154-summary-removal-downgrade")
diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py
index 5be10427f..1e011e049 100644
--- a/surfsense_backend/app/db.py
+++ b/surfsense_backend/app/db.py
@@ -1781,9 +1781,6 @@ class SearchSpace(BaseModel, TimestampMixin):
     agent_llm_id = Column(
         Integer, nullable=True, default=0
     )  # For agent/chat operations, defaults to Auto mode
-    document_summary_llm_id = Column(
-        Integer, nullable=True, default=0
-    )  # For document summarization, defaults to Auto mode
     image_generation_config_id = Column(
         Integer, nullable=True, default=0
     )  # For image generation, defaults to Auto mode
@@ -1951,12 +1948,6 @@ class SearchSourceConnector(BaseModel, TimestampMixin):
     last_indexed_at = Column(TIMESTAMP(timezone=True), nullable=True)
     config = Column(JSON, nullable=False)
 
-    # Summary generation (LLM-based) - disabled by default to save resources.
-    # When enabled, improves hybrid search quality at the cost of LLM calls.
-    enable_summary = Column(
-        Boolean, nullable=False, default=False, server_default="false"
-    )
-
     # Vision LLM for image files - disabled by default to save cost/time.
     # When enabled, images are described via a vision language model instead
     # of falling back to the document parser.
diff --git a/surfsense_backend/app/routes/search_spaces_routes.py b/surfsense_backend/app/routes/search_spaces_routes.py index db230b0f5..898077b7a 100644 --- a/surfsense_backend/app/routes/search_spaces_routes.py +++ b/surfsense_backend/app/routes/search_spaces_routes.py @@ -617,9 +617,6 @@ async def get_llm_preferences( # Get full config objects for each role agent_llm = await _get_llm_config_by_id(session, search_space.agent_llm_id) - document_summary_llm = await _get_llm_config_by_id( - session, search_space.document_summary_llm_id - ) image_generation_config = await _get_image_gen_config_by_id( session, search_space.image_generation_config_id ) @@ -629,11 +626,9 @@ async def get_llm_preferences( return LLMPreferencesRead( agent_llm_id=search_space.agent_llm_id, - document_summary_llm_id=search_space.document_summary_llm_id, image_generation_config_id=search_space.image_generation_config_id, vision_llm_config_id=search_space.vision_llm_config_id, agent_llm=agent_llm, - document_summary_llm=document_summary_llm, image_generation_config=image_generation_config, vision_llm_config=vision_llm_config, ) @@ -707,9 +702,6 @@ async def update_llm_preferences( # Get full config objects for response agent_llm = await _get_llm_config_by_id(session, search_space.agent_llm_id) - document_summary_llm = await _get_llm_config_by_id( - session, search_space.document_summary_llm_id - ) image_generation_config = await _get_image_gen_config_by_id( session, search_space.image_generation_config_id ) @@ -719,11 +711,9 @@ async def update_llm_preferences( return LLMPreferencesRead( agent_llm_id=search_space.agent_llm_id, - document_summary_llm_id=search_space.document_summary_llm_id, image_generation_config_id=search_space.image_generation_config_id, vision_llm_config_id=search_space.vision_llm_config_id, agent_llm=agent_llm, - document_summary_llm=document_summary_llm, image_generation_config=image_generation_config, vision_llm_config=vision_llm_config, ) diff --git 
a/surfsense_backend/app/schemas/new_llm_config.py b/surfsense_backend/app/schemas/new_llm_config.py index e64478d38..716aa0457 100644 --- a/surfsense_backend/app/schemas/new_llm_config.py +++ b/surfsense_backend/app/schemas/new_llm_config.py @@ -221,9 +221,6 @@ class LLMPreferencesRead(BaseModel): agent_llm_id: int | None = Field( None, description="ID of the LLM config to use for agent/chat tasks" ) - document_summary_llm_id: int | None = Field( - None, description="ID of the LLM config to use for document summarization" - ) image_generation_config_id: int | None = Field( None, description="ID of the image generation config to use" ) @@ -234,9 +231,6 @@ class LLMPreferencesRead(BaseModel): agent_llm: dict[str, Any] | None = Field( None, description="Full config for agent LLM" ) - document_summary_llm: dict[str, Any] | None = Field( - None, description="Full config for document summary LLM" - ) image_generation_config: dict[str, Any] | None = Field( None, description="Full config for image generation" ) @@ -253,9 +247,6 @@ class LLMPreferencesUpdate(BaseModel): agent_llm_id: int | None = Field( None, description="ID of the LLM config to use for agent/chat tasks" ) - document_summary_llm_id: int | None = Field( - None, description="ID of the LLM config to use for document summarization" - ) image_generation_config_id: int | None = Field( None, description="ID of the image generation config to use" ) diff --git a/surfsense_backend/app/schemas/search_source_connector.py b/surfsense_backend/app/schemas/search_source_connector.py index aac7b92d5..982931859 100644 --- a/surfsense_backend/app/schemas/search_source_connector.py +++ b/surfsense_backend/app/schemas/search_source_connector.py @@ -16,7 +16,6 @@ class SearchSourceConnectorBase(BaseModel): is_indexable: bool last_indexed_at: datetime | None = None config: dict[str, Any] - enable_summary: bool = False enable_vision_llm: bool = False periodic_indexing_enabled: bool = False indexing_frequency_minutes: int | None = 
None @@ -67,7 +66,6 @@ class SearchSourceConnectorUpdate(BaseModel): is_indexable: bool | None = None last_indexed_at: datetime | None = None config: dict[str, Any] | None = None - enable_summary: bool | None = None enable_vision_llm: bool | None = None periodic_indexing_enabled: bool | None = None indexing_frequency_minutes: int | None = None diff --git a/surfsense_backend/app/services/llm_service.py b/surfsense_backend/app/services/llm_service.py index aadb60cde..099e7c573 100644 --- a/surfsense_backend/app/services/llm_service.py +++ b/surfsense_backend/app/services/llm_service.py @@ -68,7 +68,6 @@ def _is_interactive_auth_provider( class LLMRole: AGENT = "agent" # For agent/chat operations - DOCUMENT_SUMMARY = "document_summary" # For document summarization def get_global_llm_config(llm_config_id: int) -> dict | None: @@ -266,7 +265,7 @@ async def get_search_space_llm_instance( Args: session: Database session search_space_id: Search Space ID - role: LLM role ('agent' or 'document_summary') + role: LLM role ('agent') Returns: ChatLiteLLM or ChatLiteLLMRouter instance, or None if not found @@ -283,11 +282,8 @@ async def get_search_space_llm_instance( return None # Get the appropriate LLM config ID based on role - llm_config_id = None if role == LLMRole.AGENT: llm_config_id = search_space.agent_llm_id - elif role == LLMRole.DOCUMENT_SUMMARY: - llm_config_id = search_space.document_summary_llm_id else: logger.error(f"Invalid LLM role: {role}") return None @@ -470,20 +466,13 @@ async def get_search_space_llm_instance( async def get_agent_llm( - session: AsyncSession, search_space_id: int -) -> ChatLiteLLM | ChatLiteLLMRouter | None: - """Get the search space's agent LLM instance for chat operations.""" - return await get_search_space_llm_instance(session, search_space_id, LLMRole.AGENT) - - -async def get_document_summary_llm( session: AsyncSession, search_space_id: int, disable_streaming: bool = False ) -> ChatLiteLLM | ChatLiteLLMRouter | None: - """Get the search 
space's document summary LLM instance.""" + """Get the search space's agent LLM instance for chat operations.""" return await get_search_space_llm_instance( session, search_space_id, - LLMRole.DOCUMENT_SUMMARY, + LLMRole.AGENT, disable_streaming=disable_streaming, ) @@ -645,22 +634,6 @@ async def get_vision_llm( return None -# Backward-compatible alias (LLM preferences are now per-search-space, not per-user) -async def get_user_long_context_llm( - session: AsyncSession, - user_id: str, - search_space_id: int, - disable_streaming: bool = False, -) -> ChatLiteLLM | ChatLiteLLMRouter | None: - """ - Deprecated: Use get_document_summary_llm instead. - The user_id parameter is ignored as LLM preferences are now per-search-space. - """ - return await get_document_summary_llm( - session, search_space_id, disable_streaming=disable_streaming - ) - - def get_planner_llm() -> ChatLiteLLM | None: """Return a planner LLM instance from the first global config marked ``is_planner: true``, or ``None`` if no planner config is defined. 
diff --git a/surfsense_web/contracts/types/connector.types.ts b/surfsense_web/contracts/types/connector.types.ts index 61d5ffc94..7c3dbb043 100644 --- a/surfsense_web/contracts/types/connector.types.ts +++ b/surfsense_web/contracts/types/connector.types.ts @@ -43,7 +43,6 @@ export const searchSourceConnector = z.object({ is_active: z.boolean().default(true), last_indexed_at: z.string().nullable(), config: z.record(z.string(), z.any()), - enable_summary: z.boolean().default(false), enable_vision_llm: z.boolean().default(false), periodic_indexing_enabled: z.boolean(), indexing_frequency_minutes: z.number().nullable(), @@ -98,7 +97,6 @@ export const createConnectorRequest = z.object({ is_active: true, last_indexed_at: true, config: true, - enable_summary: true, enable_vision_llm: true, periodic_indexing_enabled: true, indexing_frequency_minutes: true, @@ -124,7 +122,6 @@ export const updateConnectorRequest = z.object({ is_active: true, last_indexed_at: true, config: true, - enable_summary: true, enable_vision_llm: true, periodic_indexing_enabled: true, indexing_frequency_minutes: true, diff --git a/surfsense_web/contracts/types/new-llm-config.types.ts b/surfsense_web/contracts/types/new-llm-config.types.ts index b52b98ae4..2fa7a37be 100644 --- a/surfsense_web/contracts/types/new-llm-config.types.ts +++ b/surfsense_web/contracts/types/new-llm-config.types.ts @@ -384,11 +384,9 @@ export const getGlobalVisionLLMConfigsResponse = z.array(globalVisionLLMConfig); export const llmPreferences = z.object({ agent_llm_id: z.union([z.number(), z.null()]).optional(), - document_summary_llm_id: z.union([z.number(), z.null()]).optional(), image_generation_config_id: z.union([z.number(), z.null()]).optional(), vision_llm_config_id: z.union([z.number(), z.null()]).optional(), agent_llm: z.union([z.record(z.string(), z.unknown()), z.null()]).optional(), - document_summary_llm: z.union([z.record(z.string(), z.unknown()), z.null()]).optional(), image_generation_config: 
z.union([z.record(z.string(), z.unknown()), z.null()]).optional(), vision_llm_config: z.union([z.record(z.string(), z.unknown()), z.null()]).optional(), }); @@ -409,7 +407,6 @@ export const updateLLMPreferencesRequest = z.object({ search_space_id: z.number(), data: llmPreferences.pick({ agent_llm_id: true, - document_summary_llm_id: true, image_generation_config_id: true, vision_llm_config_id: true, }), diff --git a/surfsense_web/zero/schema/documents.ts b/surfsense_web/zero/schema/documents.ts index d1ada4bc4..988056297 100644 --- a/surfsense_web/zero/schema/documents.ts +++ b/surfsense_web/zero/schema/documents.ts @@ -21,7 +21,6 @@ export const searchSourceConnectorTable = table("search_source_connectors") isIndexable: boolean().from("is_indexable"), lastIndexedAt: number().optional().from("last_indexed_at"), config: json(), - enableSummary: boolean().from("enable_summary"), periodicIndexingEnabled: boolean().from("periodic_indexing_enabled"), indexingFrequencyMinutes: number().optional().from("indexing_frequency_minutes"), nextScheduledAt: number().optional().from("next_scheduled_at"),