mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 00:36:31 +02:00
feat: added configurable summary calculation and various improvements
- Replaced direct embedding calls with a utility function across various components to streamline embedding logic. - Added enable_summary flag to several models and routes to control summary generation behavior.
This commit is contained in:
parent
dc33a4a68f
commit
e9892c8fe9
50 changed files with 380 additions and 298 deletions
|
|
@ -0,0 +1,46 @@
|
|||
"""102_add_enable_summary_to_connectors
|
||||
|
||||
Revision ID: 102
|
||||
Revises: 101
|
||||
Create Date: 2026-02-26
|
||||
|
||||
Adds enable_summary boolean column to search_source_connectors.
|
||||
Defaults to False for all existing and new connectors so LLM-based
|
||||
summary generation is opt-in.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
import sqlalchemy as sa
|
||||
|
||||
from alembic import op
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = "102"
|
||||
down_revision: str | None = "101"
|
||||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Add the ``enable_summary`` column to ``search_source_connectors``.

    The migration is idempotent: the column is only created when it is
    not already present. It is NOT NULL with a server-side default of
    ``false`` so existing rows are backfilled and LLM-based summary
    generation stays opt-in.
    """
    inspector = sa.inspect(op.get_bind())
    column_names = {
        column["name"]
        for column in inspector.get_columns("search_source_connectors")
    }

    # Guard clause: nothing to do when the column already exists.
    if "enable_summary" in column_names:
        return

    op.add_column(
        "search_source_connectors",
        sa.Column(
            "enable_summary",
            sa.Boolean(),
            nullable=False,
            server_default=sa.text("false"),
        ),
    )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Revert the upgrade by dropping ``enable_summary`` from ``search_source_connectors``."""
    op.drop_column("search_source_connectors", "enable_summary")
|
||||
|
|
@ -14,8 +14,8 @@ from langchain_core.tools import tool
|
|||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument
|
||||
from app.utils.document_converters import embed_text
|
||||
|
||||
|
||||
def format_surfsense_docs_results(results: list[tuple]) -> str:
|
||||
|
|
@ -100,7 +100,7 @@ async def search_surfsense_docs_async(
|
|||
Formatted string with relevant documentation content
|
||||
"""
|
||||
# Get embedding for the query
|
||||
query_embedding = config.embedding_model_instance.embed(query)
|
||||
query_embedding = embed_text(query)
|
||||
|
||||
# Vector similarity search on chunks, joining with documents
|
||||
stmt = (
|
||||
|
|
|
|||
|
|
@ -8,8 +8,8 @@ from langchain_core.tools import tool
|
|||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.db import MemoryCategory, SharedMemory, User
|
||||
from app.utils.document_converters import embed_text
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -64,7 +64,7 @@ async def save_shared_memory(
|
|||
count = await get_shared_memory_count(db_session, search_space_id)
|
||||
if count >= MAX_MEMORIES_PER_SEARCH_SPACE:
|
||||
await delete_oldest_shared_memory(db_session, search_space_id)
|
||||
embedding = config.embedding_model_instance.embed(content)
|
||||
embedding = embed_text(content)
|
||||
row = SharedMemory(
|
||||
search_space_id=search_space_id,
|
||||
created_by_id=_to_uuid(created_by_id),
|
||||
|
|
@ -108,7 +108,7 @@ async def recall_shared_memory(
|
|||
if category and category in valid_categories:
|
||||
stmt = stmt.where(SharedMemory.category == MemoryCategory(category))
|
||||
if query:
|
||||
query_embedding = config.embedding_model_instance.embed(query)
|
||||
query_embedding = embed_text(query)
|
||||
stmt = stmt.order_by(
|
||||
SharedMemory.embedding.op("<=>")(query_embedding)
|
||||
).limit(top_k)
|
||||
|
|
|
|||
|
|
@ -17,8 +17,8 @@ from langchain_core.tools import tool
|
|||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.db import MemoryCategory, UserMemory
|
||||
from app.utils.document_converters import embed_text
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -178,7 +178,7 @@ def create_save_memory_tool(
|
|||
await delete_oldest_memory(db_session, user_id, search_space_id)
|
||||
|
||||
# Generate embedding for the memory
|
||||
embedding = config.embedding_model_instance.embed(content)
|
||||
embedding = embed_text(content)
|
||||
|
||||
# Create new memory using ORM
|
||||
# The pgvector Vector column type handles embedding conversion automatically
|
||||
|
|
@ -268,7 +268,7 @@ def create_recall_memory_tool(
|
|||
|
||||
if query:
|
||||
# Semantic search using embeddings
|
||||
query_embedding = config.embedding_model_instance.embed(query)
|
||||
query_embedding = embed_text(query)
|
||||
|
||||
# Build query with vector similarity
|
||||
stmt = (
|
||||
|
|
|
|||
|
|
@ -14,7 +14,6 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
from sqlalchemy.future import select
|
||||
from sqlalchemy.orm import selectinload
|
||||
|
||||
from app.config import config
|
||||
from app.connectors.composio_connector import ComposioConnector
|
||||
from app.db import Document, DocumentStatus, DocumentType
|
||||
from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE
|
||||
|
|
@ -27,6 +26,7 @@ from app.tasks.connector_indexers.base import (
|
|||
)
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
|
|
@ -383,6 +383,7 @@ async def _process_gmail_messages_phase2(
|
|||
connector_id: int,
|
||||
search_space_id: int,
|
||||
user_id: str,
|
||||
enable_summary: bool = False,
|
||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||
) -> tuple[int, int]:
|
||||
"""
|
||||
|
|
@ -415,7 +416,7 @@ async def _process_gmail_messages_phase2(
|
|||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
if user_llm and enable_summary:
|
||||
document_metadata_for_summary = {
|
||||
"message_id": item["message_id"],
|
||||
"thread_id": item["thread_id"],
|
||||
|
|
@ -427,10 +428,8 @@ async def _process_gmail_messages_phase2(
|
|||
item["markdown_content"], user_llm, document_metadata_for_summary
|
||||
)
|
||||
else:
|
||||
summary_content = f"Gmail: {item['subject']}\n\nFrom: {item['sender']}\nDate: {item['date_str']}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
summary_content = f"Gmail: {item['subject']}\n\nFrom: {item['sender']}\nDate: {item['date_str']}\n\n{item['markdown_content']}"
|
||||
summary_embedding = embed_text(summary_content)
|
||||
|
||||
chunks = await create_document_chunks(item["markdown_content"])
|
||||
|
||||
|
|
@ -646,6 +645,7 @@ async def index_composio_gmail(
|
|||
connector_id=connector_id,
|
||||
search_space_id=search_space_id,
|
||||
user_id=user_id,
|
||||
enable_summary=getattr(connector, "enable_summary", False),
|
||||
on_heartbeat_callback=on_heartbeat_callback,
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -14,7 +14,6 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
from sqlalchemy.future import select
|
||||
from sqlalchemy.orm import selectinload
|
||||
|
||||
from app.config import config
|
||||
from app.connectors.composio_connector import ComposioConnector
|
||||
from app.db import Document, DocumentStatus, DocumentType
|
||||
from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE
|
||||
|
|
@ -27,6 +26,7 @@ from app.tasks.connector_indexers.base import (
|
|||
)
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
|
|
@ -440,7 +440,7 @@ async def index_composio_google_calendar(
|
|||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
if user_llm and connector.enable_summary:
|
||||
document_metadata_for_summary = {
|
||||
"event_id": item["event_id"],
|
||||
"summary": item["summary"],
|
||||
|
|
@ -456,12 +456,10 @@ async def index_composio_google_calendar(
|
|||
document_metadata_for_summary,
|
||||
)
|
||||
else:
|
||||
summary_content = f"Calendar: {item['summary']}\n\nStart: {item['start_time']}\nEnd: {item['end_time']}"
|
||||
if item["location"]:
|
||||
summary_content += f"\nLocation: {item['location']}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
summary_content = (
|
||||
f"Calendar: {item['summary']}\n\n{item['markdown_content']}"
|
||||
)
|
||||
summary_embedding = embed_text(summary_content)
|
||||
|
||||
chunks = await create_document_chunks(item["markdown_content"])
|
||||
|
||||
|
|
|
|||
|
|
@ -31,6 +31,7 @@ from app.tasks.connector_indexers.base import (
|
|||
)
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
|
|
@ -714,6 +715,7 @@ async def index_composio_google_drive(
|
|||
max_items=max_items,
|
||||
task_logger=task_logger,
|
||||
log_entry=log_entry,
|
||||
enable_summary=getattr(connector, "enable_summary", False),
|
||||
on_heartbeat_callback=on_heartbeat_callback,
|
||||
)
|
||||
else:
|
||||
|
|
@ -747,6 +749,7 @@ async def index_composio_google_drive(
|
|||
max_items=max_items,
|
||||
task_logger=task_logger,
|
||||
log_entry=log_entry,
|
||||
enable_summary=getattr(connector, "enable_summary", False),
|
||||
on_heartbeat_callback=on_heartbeat_callback,
|
||||
)
|
||||
|
||||
|
|
@ -829,6 +832,7 @@ async def _index_composio_drive_delta_sync(
|
|||
max_items: int,
|
||||
task_logger: TaskLoggingService,
|
||||
log_entry,
|
||||
enable_summary: bool = False,
|
||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||
) -> tuple[int, int, list[str]]:
|
||||
"""Index Google Drive files using delta sync with real-time document status updates.
|
||||
|
|
@ -1079,7 +1083,7 @@ async def _index_composio_drive_delta_sync(
|
|||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
if user_llm and enable_summary:
|
||||
document_metadata_for_summary = {
|
||||
"file_id": item["file_id"],
|
||||
"file_name": item["file_name"],
|
||||
|
|
@ -1090,10 +1094,8 @@ async def _index_composio_drive_delta_sync(
|
|||
markdown_content, user_llm, document_metadata_for_summary
|
||||
)
|
||||
else:
|
||||
summary_content = f"Google Drive File: {item['file_name']}\n\nType: {item['mime_type']}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
summary_content = f"Google Drive File: {item['file_name']}\n\nType: {item['mime_type']}\n\n{markdown_content}"
|
||||
summary_embedding = embed_text(summary_content)
|
||||
|
||||
chunks = await create_document_chunks(markdown_content)
|
||||
|
||||
|
|
@ -1155,6 +1157,7 @@ async def _index_composio_drive_full_scan(
|
|||
max_items: int,
|
||||
task_logger: TaskLoggingService,
|
||||
log_entry,
|
||||
enable_summary: bool = False,
|
||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||
) -> tuple[int, int, list[str]]:
|
||||
"""Index Google Drive files using full scan with real-time document status updates."""
|
||||
|
|
@ -1488,7 +1491,7 @@ async def _index_composio_drive_full_scan(
|
|||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
if user_llm and enable_summary:
|
||||
document_metadata_for_summary = {
|
||||
"file_id": item["file_id"],
|
||||
"file_name": item["file_name"],
|
||||
|
|
@ -1499,10 +1502,8 @@ async def _index_composio_drive_full_scan(
|
|||
markdown_content, user_llm, document_metadata_for_summary
|
||||
)
|
||||
else:
|
||||
summary_content = f"Google Drive File: {item['file_name']}\n\nType: {item['mime_type']}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
summary_content = f"Google Drive File: {item['file_name']}\n\nType: {item['mime_type']}\n\n{markdown_content}"
|
||||
summary_embedding = embed_text(summary_content)
|
||||
|
||||
chunks = await create_document_chunks(markdown_content)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
from collections.abc import AsyncGenerator
|
||||
from datetime import UTC, datetime
|
||||
from enum import Enum
|
||||
from enum import StrEnum
|
||||
|
||||
from fastapi import Depends
|
||||
from fastapi_users.db import SQLAlchemyBaseUserTableUUID, SQLAlchemyUserDatabase
|
||||
|
|
@ -31,7 +31,7 @@ if config.AUTH_TYPE == "GOOGLE":
|
|||
DATABASE_URL = config.DATABASE_URL
|
||||
|
||||
|
||||
class DocumentType(str, Enum):
|
||||
class DocumentType(StrEnum):
|
||||
EXTENSION = "EXTENSION"
|
||||
CRAWLED_URL = "CRAWLED_URL"
|
||||
FILE = "FILE"
|
||||
|
|
@ -60,7 +60,7 @@ class DocumentType(str, Enum):
|
|||
COMPOSIO_GOOGLE_CALENDAR_CONNECTOR = "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR"
|
||||
|
||||
|
||||
class SearchSourceConnectorType(str, Enum):
|
||||
class SearchSourceConnectorType(StrEnum):
|
||||
SERPER_API = "SERPER_API" # NOT IMPLEMENTED YET : DON'T REMEMBER WHY : MOST PROBABLY BECAUSE WE NEED TO CRAWL THE RESULTS RETURNED BY IT
|
||||
TAVILY_API = "TAVILY_API"
|
||||
SEARXNG_API = "SEARXNG_API"
|
||||
|
|
@ -93,7 +93,7 @@ class SearchSourceConnectorType(str, Enum):
|
|||
COMPOSIO_GOOGLE_CALENDAR_CONNECTOR = "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR"
|
||||
|
||||
|
||||
class PodcastStatus(str, Enum):
|
||||
class PodcastStatus(StrEnum):
|
||||
PENDING = "pending"
|
||||
GENERATING = "generating"
|
||||
READY = "ready"
|
||||
|
|
@ -177,7 +177,7 @@ class DocumentStatus:
|
|||
return None
|
||||
|
||||
|
||||
class LiteLLMProvider(str, Enum):
|
||||
class LiteLLMProvider(StrEnum):
|
||||
"""
|
||||
Enum for LLM providers supported by LiteLLM.
|
||||
"""
|
||||
|
|
@ -215,7 +215,7 @@ class LiteLLMProvider(str, Enum):
|
|||
CUSTOM = "CUSTOM"
|
||||
|
||||
|
||||
class ImageGenProvider(str, Enum):
|
||||
class ImageGenProvider(StrEnum):
|
||||
"""
|
||||
Enum for image generation providers supported by LiteLLM.
|
||||
This is a subset of LLM providers — only those that support image generation.
|
||||
|
|
@ -233,7 +233,7 @@ class ImageGenProvider(str, Enum):
|
|||
NSCALE = "NSCALE"
|
||||
|
||||
|
||||
class LogLevel(str, Enum):
|
||||
class LogLevel(StrEnum):
|
||||
DEBUG = "DEBUG"
|
||||
INFO = "INFO"
|
||||
WARNING = "WARNING"
|
||||
|
|
@ -241,13 +241,13 @@ class LogLevel(str, Enum):
|
|||
CRITICAL = "CRITICAL"
|
||||
|
||||
|
||||
class LogStatus(str, Enum):
|
||||
class LogStatus(StrEnum):
|
||||
IN_PROGRESS = "IN_PROGRESS"
|
||||
SUCCESS = "SUCCESS"
|
||||
FAILED = "FAILED"
|
||||
|
||||
|
||||
class IncentiveTaskType(str, Enum):
|
||||
class IncentiveTaskType(StrEnum):
|
||||
"""
|
||||
Enum for incentive task types that users can complete to earn free pages.
|
||||
Each task can only be completed once per user.
|
||||
|
|
@ -298,7 +298,7 @@ INCENTIVE_TASKS_CONFIG = {
|
|||
}
|
||||
|
||||
|
||||
class Permission(str, Enum):
|
||||
class Permission(StrEnum):
|
||||
"""
|
||||
Granular permissions for search space resources.
|
||||
Use '*' (FULL_ACCESS) to grant all permissions.
|
||||
|
|
@ -471,7 +471,7 @@ class BaseModel(Base):
|
|||
id = Column(Integer, primary_key=True, index=True)
|
||||
|
||||
|
||||
class NewChatMessageRole(str, Enum):
|
||||
class NewChatMessageRole(StrEnum):
|
||||
"""Role enum for new chat messages."""
|
||||
|
||||
USER = "user"
|
||||
|
|
@ -479,7 +479,7 @@ class NewChatMessageRole(str, Enum):
|
|||
SYSTEM = "system"
|
||||
|
||||
|
||||
class ChatVisibility(str, Enum):
|
||||
class ChatVisibility(StrEnum):
|
||||
"""
|
||||
Visibility/sharing level for chat threads.
|
||||
|
||||
|
|
@ -788,7 +788,7 @@ class ChatSessionState(BaseModel):
|
|||
ai_responding_to_user = relationship("User")
|
||||
|
||||
|
||||
class MemoryCategory(str, Enum):
|
||||
class MemoryCategory(StrEnum):
|
||||
"""Categories for user memories."""
|
||||
|
||||
# Using lowercase keys to match PostgreSQL enum values
|
||||
|
|
@ -1317,6 +1317,12 @@ class SearchSourceConnector(BaseModel, TimestampMixin):
|
|||
last_indexed_at = Column(TIMESTAMP(timezone=True), nullable=True)
|
||||
config = Column(JSON, nullable=False)
|
||||
|
||||
# Summary generation (LLM-based) - disabled by default to save resources.
|
||||
# When enabled, improves hybrid search quality at the cost of LLM calls.
|
||||
enable_summary = Column(
|
||||
Boolean, nullable=False, default=False, server_default="false"
|
||||
)
|
||||
|
||||
# Periodic indexing fields
|
||||
periodic_indexing_enabled = Column(Boolean, nullable=False, default=False)
|
||||
indexing_frequency_minutes = Column(Integer, nullable=True)
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ async def index_uploaded_file(
|
|||
user_id: str,
|
||||
session: AsyncSession,
|
||||
llm,
|
||||
should_summarize: bool = False,
|
||||
) -> None:
|
||||
connector_doc = ConnectorDocument(
|
||||
title=filename,
|
||||
|
|
@ -22,7 +23,7 @@ async def index_uploaded_file(
|
|||
search_space_id=search_space_id,
|
||||
created_by_id=user_id,
|
||||
connector_id=None,
|
||||
should_summarize=True,
|
||||
should_summarize=should_summarize,
|
||||
should_use_code_chunker=False,
|
||||
fallback_summary=markdown_content[:4000],
|
||||
metadata={
|
||||
|
|
|
|||
|
|
@ -1,6 +1,3 @@
|
|||
from app.config import config
|
||||
from app.utils.document_converters import embed_text
|
||||
|
||||
|
||||
def embed_text(text: str) -> list[float]:
|
||||
"""Embed a single text string using the configured embedding model."""
|
||||
return config.embedding_model_instance.embed(text)
|
||||
__all__ = ["embed_text"]
|
||||
|
|
|
|||
|
|
@ -118,6 +118,7 @@ async def create_documents(
|
|||
async def create_documents_file_upload(
|
||||
files: list[UploadFile],
|
||||
search_space_id: int = Form(...),
|
||||
should_summarize: bool = Form(False),
|
||||
session: AsyncSession = Depends(get_async_session),
|
||||
user: User = Depends(current_active_user),
|
||||
):
|
||||
|
|
@ -303,6 +304,7 @@ async def create_documents_file_upload(
|
|||
filename=filename,
|
||||
search_space_id=search_space_id,
|
||||
user_id=str(user.id),
|
||||
should_summarize=should_summarize,
|
||||
)
|
||||
|
||||
return {
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ import logging
|
|||
import os
|
||||
import re
|
||||
import tempfile
|
||||
from enum import Enum
|
||||
from enum import StrEnum
|
||||
|
||||
import pypandoc
|
||||
import typst
|
||||
|
|
@ -46,7 +46,7 @@ router = APIRouter()
|
|||
MAX_REPORT_LIST_LIMIT = 500
|
||||
|
||||
|
||||
class ExportFormat(str, Enum):
|
||||
class ExportFormat(StrEnum):
|
||||
PDF = "pdf"
|
||||
DOCX = "docx"
|
||||
|
||||
|
|
|
|||
|
|
@ -1,13 +1,13 @@
|
|||
"""Podcast schemas for API responses."""
|
||||
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from enum import StrEnum
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class PodcastStatusEnum(str, Enum):
|
||||
class PodcastStatusEnum(StrEnum):
|
||||
PENDING = "pending"
|
||||
GENERATING = "generating"
|
||||
READY = "ready"
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@ class SearchSourceConnectorBase(BaseModel):
|
|||
is_indexable: bool
|
||||
last_indexed_at: datetime | None = None
|
||||
config: dict[str, Any]
|
||||
enable_summary: bool = False
|
||||
periodic_indexing_enabled: bool = False
|
||||
indexing_frequency_minutes: int | None = None
|
||||
next_scheduled_at: datetime | None = None
|
||||
|
|
@ -65,6 +66,7 @@ class SearchSourceConnectorUpdate(BaseModel):
|
|||
is_indexable: bool | None = None
|
||||
last_indexed_at: datetime | None = None
|
||||
config: dict[str, Any] | None = None
|
||||
enable_summary: bool | None = None
|
||||
periodic_indexing_enabled: bool | None = None
|
||||
indexing_frequency_minutes: int | None = None
|
||||
next_scheduled_at: datetime | None = None
|
||||
|
|
|
|||
|
|
@ -4,12 +4,12 @@ from datetime import datetime
|
|||
from sqlalchemy import delete
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.connectors.linear_connector import LinearConnector
|
||||
from app.db import Chunk, Document
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
)
|
||||
|
|
@ -80,7 +80,7 @@ class LinearKBSyncService:
|
|||
state = formatted_issue.get("state", "Unknown")
|
||||
priority = issue_raw.get("priorityLabel", "Unknown")
|
||||
comment_count = len(formatted_issue.get("comments", []))
|
||||
description = formatted_issue.get("description", "")
|
||||
formatted_issue.get("description", "")
|
||||
|
||||
user_llm = await get_user_long_context_llm(
|
||||
self.db_session, user_id, search_space_id, disable_streaming=True
|
||||
|
|
@ -100,18 +100,10 @@ class LinearKBSyncService:
|
|||
issue_content, user_llm, document_metadata_for_summary
|
||||
)
|
||||
else:
|
||||
if description and len(description) > 1000:
|
||||
description = description[:997] + "..."
|
||||
summary_content = (
|
||||
f"Linear Issue {issue_identifier}: {issue_title}\n\n"
|
||||
f"Status: {state}\n\n"
|
||||
)
|
||||
if description:
|
||||
summary_content += f"Description: {description}\n\n"
|
||||
summary_content += f"Comments: {comment_count}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
f"Linear Issue {issue_identifier}: {issue_title}\n\n{issue_content}"
|
||||
)
|
||||
summary_embedding = embed_text(summary_content)
|
||||
|
||||
await self.db_session.execute(
|
||||
delete(Chunk).where(Chunk.document_id == document.id)
|
||||
|
|
|
|||
|
|
@ -15,10 +15,12 @@ import logging
|
|||
from typing import Any
|
||||
|
||||
from langchain_core.callbacks import CallbackManagerForLLMRun
|
||||
from langchain_core.exceptions import ContextOverflowError
|
||||
from langchain_core.language_models import BaseChatModel
|
||||
from langchain_core.messages import AIMessage, AIMessageChunk, BaseMessage
|
||||
from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult
|
||||
from litellm import Router
|
||||
from litellm.exceptions import ContextWindowExceededError
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -359,13 +361,15 @@ class ChatLiteLLMRouter(BaseChatModel):
|
|||
if self._tool_choice is not None:
|
||||
call_kwargs["tool_choice"] = self._tool_choice
|
||||
|
||||
# Call router completion
|
||||
response = self._router.completion(
|
||||
model=self.model,
|
||||
messages=formatted_messages,
|
||||
stop=stop,
|
||||
**call_kwargs,
|
||||
)
|
||||
try:
|
||||
response = self._router.completion(
|
||||
model=self.model,
|
||||
messages=formatted_messages,
|
||||
stop=stop,
|
||||
**call_kwargs,
|
||||
)
|
||||
except ContextWindowExceededError as e:
|
||||
raise ContextOverflowError(str(e)) from e
|
||||
|
||||
# Convert response to ChatResult with potential tool calls
|
||||
message = self._convert_response_to_message(response.choices[0].message)
|
||||
|
|
@ -396,13 +400,15 @@ class ChatLiteLLMRouter(BaseChatModel):
|
|||
if self._tool_choice is not None:
|
||||
call_kwargs["tool_choice"] = self._tool_choice
|
||||
|
||||
# Call router async completion
|
||||
response = await self._router.acompletion(
|
||||
model=self.model,
|
||||
messages=formatted_messages,
|
||||
stop=stop,
|
||||
**call_kwargs,
|
||||
)
|
||||
try:
|
||||
response = await self._router.acompletion(
|
||||
model=self.model,
|
||||
messages=formatted_messages,
|
||||
stop=stop,
|
||||
**call_kwargs,
|
||||
)
|
||||
except ContextWindowExceededError as e:
|
||||
raise ContextOverflowError(str(e)) from e
|
||||
|
||||
# Convert response to ChatResult with potential tool calls
|
||||
message = self._convert_response_to_message(response.choices[0].message)
|
||||
|
|
@ -432,14 +438,16 @@ class ChatLiteLLMRouter(BaseChatModel):
|
|||
if self._tool_choice is not None:
|
||||
call_kwargs["tool_choice"] = self._tool_choice
|
||||
|
||||
# Call router completion with streaming
|
||||
response = self._router.completion(
|
||||
model=self.model,
|
||||
messages=formatted_messages,
|
||||
stop=stop,
|
||||
stream=True,
|
||||
**call_kwargs,
|
||||
)
|
||||
try:
|
||||
response = self._router.completion(
|
||||
model=self.model,
|
||||
messages=formatted_messages,
|
||||
stop=stop,
|
||||
stream=True,
|
||||
**call_kwargs,
|
||||
)
|
||||
except ContextWindowExceededError as e:
|
||||
raise ContextOverflowError(str(e)) from e
|
||||
|
||||
# Yield chunks
|
||||
for chunk in response:
|
||||
|
|
@ -471,14 +479,16 @@ class ChatLiteLLMRouter(BaseChatModel):
|
|||
if self._tool_choice is not None:
|
||||
call_kwargs["tool_choice"] = self._tool_choice
|
||||
|
||||
# Call router async completion with streaming
|
||||
response = await self._router.acompletion(
|
||||
model=self.model,
|
||||
messages=formatted_messages,
|
||||
stop=stop,
|
||||
stream=True,
|
||||
**call_kwargs,
|
||||
)
|
||||
try:
|
||||
response = await self._router.acompletion(
|
||||
model=self.model,
|
||||
messages=formatted_messages,
|
||||
stop=stop,
|
||||
stream=True,
|
||||
**call_kwargs,
|
||||
)
|
||||
except ContextWindowExceededError as e:
|
||||
raise ContextOverflowError(str(e)) from e
|
||||
|
||||
# Yield chunks asynchronously
|
||||
async for chunk in response:
|
||||
|
|
|
|||
|
|
@ -4,11 +4,11 @@ from datetime import datetime
|
|||
from sqlalchemy import delete
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.db import Chunk, Document
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
)
|
||||
|
|
@ -127,10 +127,8 @@ class NotionKBSyncService:
|
|||
logger.debug(f"Generated summary length: {len(summary_content)} chars")
|
||||
else:
|
||||
logger.warning("No LLM configured - using fallback summary")
|
||||
summary_content = f"Notion Page: {document.document_metadata.get('page_title')}\n\n{full_content[:500]}..."
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
summary_content = f"Notion Page: {document.document_metadata.get('page_title')}\n\n{full_content}"
|
||||
summary_embedding = embed_text(summary_content)
|
||||
|
||||
logger.debug(f"Deleting old chunks for document {document_id}")
|
||||
await self.db_session.execute(
|
||||
|
|
|
|||
|
|
@ -626,6 +626,7 @@ def process_file_upload_with_document_task(
|
|||
filename: str,
|
||||
search_space_id: int,
|
||||
user_id: str,
|
||||
should_summarize: bool = False,
|
||||
):
|
||||
"""
|
||||
Celery task to process uploaded file with existing pending document.
|
||||
|
|
@ -640,6 +641,7 @@ def process_file_upload_with_document_task(
|
|||
filename: Original filename
|
||||
search_space_id: ID of the search space
|
||||
user_id: ID of the user
|
||||
should_summarize: Whether to generate an LLM summary
|
||||
"""
|
||||
import traceback
|
||||
|
||||
|
|
@ -674,7 +676,12 @@ def process_file_upload_with_document_task(
|
|||
try:
|
||||
loop.run_until_complete(
|
||||
_process_file_with_document(
|
||||
document_id, temp_path, filename, search_space_id, user_id
|
||||
document_id,
|
||||
temp_path,
|
||||
filename,
|
||||
search_space_id,
|
||||
user_id,
|
||||
should_summarize=should_summarize,
|
||||
)
|
||||
)
|
||||
logger.info(
|
||||
|
|
@ -710,6 +717,7 @@ async def _process_file_with_document(
|
|||
filename: str,
|
||||
search_space_id: int,
|
||||
user_id: str,
|
||||
should_summarize: bool = False,
|
||||
):
|
||||
"""
|
||||
Process file and update existing pending document status.
|
||||
|
|
@ -811,6 +819,7 @@ async def _process_file_with_document(
|
|||
task_logger=task_logger,
|
||||
log_entry=log_entry,
|
||||
notification=notification,
|
||||
should_summarize=should_summarize,
|
||||
)
|
||||
|
||||
# Update notification on success
|
||||
|
|
|
|||
|
|
@ -12,13 +12,13 @@ from collections.abc import Awaitable, Callable
|
|||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.connectors.airtable_history import AirtableHistoryConnector
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
|
|
@ -399,7 +399,7 @@ async def index_airtable_records(
|
|||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
if user_llm and connector.enable_summary:
|
||||
document_metadata_for_summary = {
|
||||
"record_id": item["record_id"],
|
||||
"created_time": item["record"].get("CREATED_TIME()", ""),
|
||||
|
|
@ -415,11 +415,8 @@ async def index_airtable_records(
|
|||
document_metadata_for_summary,
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_content = f"Airtable Record: {item['record_id']}\n\n"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
summary_content = f"Airtable Record: {item['record_id']}\n\n{item['markdown_content']}"
|
||||
summary_embedding = embed_text(summary_content)
|
||||
|
||||
chunks = await create_document_chunks(item["markdown_content"])
|
||||
|
||||
|
|
|
|||
|
|
@ -13,13 +13,13 @@ from datetime import datetime
|
|||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.connectors.bookstack_connector import BookStackConnector
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
|
|
@ -403,7 +403,7 @@ async def index_bookstack_pages(
|
|||
"connector_id": connector_id,
|
||||
}
|
||||
|
||||
if user_llm:
|
||||
if user_llm and connector.enable_summary:
|
||||
summary_metadata = {
|
||||
"page_name": item["page_name"],
|
||||
"page_id": item["page_id"],
|
||||
|
|
@ -418,17 +418,8 @@ async def index_bookstack_pages(
|
|||
item["full_content"], user_llm, summary_metadata
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_content = f"BookStack Page: {item['page_name']}\n\nBook ID: {item['book_id']}\n\n"
|
||||
if item["page_content"]:
|
||||
# Take first 1000 characters of content for summary
|
||||
content_preview = item["page_content"][:1000]
|
||||
if len(item["page_content"]) > 1000:
|
||||
content_preview += "..."
|
||||
summary_content += f"Content Preview: {content_preview}\n\n"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
summary_content = f"BookStack Page: {item['page_name']}\n\nBook ID: {item['book_id']}\n\n{item['full_content']}"
|
||||
summary_embedding = embed_text(summary_content)
|
||||
|
||||
# Process chunks - using the full page content
|
||||
chunks = await create_document_chunks(item["full_content"])
|
||||
|
|
|
|||
|
|
@ -14,13 +14,13 @@ from datetime import datetime
|
|||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.connectors.clickup_history import ClickUpHistoryConnector
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
|
|
@ -398,7 +398,7 @@ async def index_clickup_tasks(
|
|||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
if user_llm and connector.enable_summary:
|
||||
document_metadata_for_summary = {
|
||||
"task_id": item["task_id"],
|
||||
"task_name": item["task_name"],
|
||||
|
|
@ -418,9 +418,7 @@ async def index_clickup_tasks(
|
|||
)
|
||||
else:
|
||||
summary_content = item["task_content"]
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
item["task_content"]
|
||||
)
|
||||
summary_embedding = embed_text(item["task_content"])
|
||||
|
||||
chunks = await create_document_chunks(item["task_content"])
|
||||
|
||||
|
|
|
|||
|
|
@ -14,13 +14,13 @@ from datetime import datetime
|
|||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.connectors.confluence_history import ConfluenceHistoryConnector
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
|
|
@ -378,7 +378,7 @@ async def index_confluence_pages(
|
|||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
if user_llm and connector.enable_summary:
|
||||
document_metadata = {
|
||||
"page_title": item["page_title"],
|
||||
"page_id": item["page_id"],
|
||||
|
|
@ -394,18 +394,8 @@ async def index_confluence_pages(
|
|||
item["full_content"], user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_content = f"Confluence Page: {item['page_title']}\n\nSpace ID: {item['space_id']}\n\n"
|
||||
if item["page_content"]:
|
||||
# Take first 1000 characters of content for summary
|
||||
content_preview = item["page_content"][:1000]
|
||||
if len(item["page_content"]) > 1000:
|
||||
content_preview += "..."
|
||||
summary_content += f"Content Preview: {content_preview}\n\n"
|
||||
summary_content += f"Comments: {item['comment_count']}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
summary_content = f"Confluence Page: {item['page_title']}\n\nSpace ID: {item['space_id']}\n\n{item['full_content']}"
|
||||
summary_embedding = embed_text(summary_content)
|
||||
|
||||
# Process chunks - using the full page content with comments
|
||||
chunks = await create_document_chunks(item["full_content"])
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnector
|
|||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_unique_identifier_hash,
|
||||
)
|
||||
|
|
@ -669,9 +670,7 @@ async def index_discord_messages(
|
|||
|
||||
# Heavy processing (embeddings, chunks)
|
||||
chunks = await create_document_chunks(item["combined_document_string"])
|
||||
doc_embedding = config.embedding_model_instance.embed(
|
||||
item["combined_document_string"]
|
||||
)
|
||||
doc_embedding = embed_text(item["combined_document_string"])
|
||||
|
||||
# Update document to READY with actual content
|
||||
document.title = f"{item['guild_name']}#{item['channel_name']}"
|
||||
|
|
|
|||
|
|
@ -16,13 +16,13 @@ from datetime import UTC, datetime
|
|||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.connectors.github_connector import GitHubConnector
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
|
|
@ -367,7 +367,7 @@ async def index_github_repos(
|
|||
"estimated_tokens": digest.estimated_tokens,
|
||||
}
|
||||
|
||||
if user_llm:
|
||||
if user_llm and connector.enable_summary:
|
||||
# Prepare content for summarization
|
||||
summary_content = digest.full_digest
|
||||
if len(summary_content) > MAX_DIGEST_CHARS:
|
||||
|
|
@ -381,15 +381,12 @@ async def index_github_repos(
|
|||
summary_content, user_llm, document_metadata_for_summary
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_text = (
|
||||
f"# GitHub Repository: {repo_full_name}\n\n"
|
||||
f"## Summary\n{digest.summary}\n\n"
|
||||
f"## File Structure\n{digest.tree[:3000]}"
|
||||
)
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_text
|
||||
f"## File Structure\n{digest.tree}"
|
||||
)
|
||||
summary_embedding = embed_text(summary_text)
|
||||
|
||||
# Chunk the full digest content for granular search
|
||||
try:
|
||||
|
|
@ -551,7 +548,7 @@ async def _simple_chunk_content(content: str, chunk_size: int = 4000) -> list:
|
|||
chunks.append(
|
||||
Chunk(
|
||||
content=chunk_text,
|
||||
embedding=config.embedding_model_instance.embed(chunk_text),
|
||||
embedding=embed_text(chunk_text),
|
||||
)
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ from app.services.llm_service import get_user_long_context_llm
|
|||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
|
|
@ -489,7 +490,7 @@ async def index_google_calendar_events(
|
|||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
if user_llm and connector.enable_summary:
|
||||
document_metadata_for_summary = {
|
||||
"event_id": item["event_id"],
|
||||
"event_summary": item["event_summary"],
|
||||
|
|
@ -507,22 +508,8 @@ async def index_google_calendar_events(
|
|||
item["event_markdown"], user_llm, document_metadata_for_summary
|
||||
)
|
||||
else:
|
||||
summary_content = (
|
||||
f"Google Calendar Event: {item['event_summary']}\n\n"
|
||||
)
|
||||
summary_content += f"Calendar: {item['calendar_id']}\n"
|
||||
summary_content += f"Start: {item['start_time']}\n"
|
||||
summary_content += f"End: {item['end_time']}\n"
|
||||
if item["location"]:
|
||||
summary_content += f"Location: {item['location']}\n"
|
||||
if item["description"]:
|
||||
desc_preview = item["description"][:1000]
|
||||
if len(item["description"]) > 1000:
|
||||
desc_preview += "..."
|
||||
summary_content += f"Description: {desc_preview}\n"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
summary_content = f"Google Calendar Event: {item['event_summary']}\n\n{item['event_markdown']}"
|
||||
summary_embedding = embed_text(summary_content)
|
||||
|
||||
chunks = await create_document_chunks(item["event_markdown"])
|
||||
|
||||
|
|
|
|||
|
|
@ -352,7 +352,7 @@ async def index_google_drive_single_file(
|
|||
await session.commit()
|
||||
|
||||
# Process the file
|
||||
indexed, skipped, failed = await _process_single_file(
|
||||
indexed, _skipped, failed = await _process_single_file(
|
||||
drive_client=drive_client,
|
||||
session=session,
|
||||
file=file,
|
||||
|
|
@ -608,7 +608,7 @@ async def _index_with_delta_sync(
|
|||
{"stage": "delta_sync", "start_token": start_page_token},
|
||||
)
|
||||
|
||||
changes, final_token, error = await fetch_all_changes(
|
||||
changes, _final_token, error = await fetch_all_changes(
|
||||
drive_client, start_page_token, folder_id
|
||||
)
|
||||
|
||||
|
|
@ -1011,7 +1011,7 @@ async def _process_single_file(
|
|||
pending_document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
_, error, metadata = await download_and_process_file(
|
||||
_, error, _metadata = await download_and_process_file(
|
||||
client=drive_client,
|
||||
file=file,
|
||||
search_space_id=search_space_id,
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@ from app.services.llm_service import get_user_long_context_llm
|
|||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
|
|
@ -413,7 +414,7 @@ async def index_google_gmail_messages(
|
|||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
if user_llm and connector.enable_summary:
|
||||
document_metadata_for_summary = {
|
||||
"message_id": item["message_id"],
|
||||
"thread_id": item["thread_id"],
|
||||
|
|
@ -432,12 +433,8 @@ async def index_google_gmail_messages(
|
|||
document_metadata_for_summary,
|
||||
)
|
||||
else:
|
||||
summary_content = f"Google Gmail Message: {item['subject']}\n\n"
|
||||
summary_content += f"Sender: {item['sender']}\n"
|
||||
summary_content += f"Date: {item['date_str']}\n"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
summary_content = f"Google Gmail Message: {item['subject']}\n\nFrom: {item['sender']}\nDate: {item['date_str']}\n\n{item['markdown_content']}"
|
||||
summary_embedding = embed_text(summary_content)
|
||||
|
||||
chunks = await create_document_chunks(item["markdown_content"])
|
||||
|
||||
|
|
|
|||
|
|
@ -14,13 +14,13 @@ from datetime import datetime
|
|||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.connectors.jira_history import JiraHistoryConnector
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
|
|
@ -356,7 +356,7 @@ async def index_jira_issues(
|
|||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
if user_llm and connector.enable_summary:
|
||||
document_metadata = {
|
||||
"issue_key": item["issue_identifier"],
|
||||
"issue_title": item["issue_title"],
|
||||
|
|
@ -373,14 +373,8 @@ async def index_jira_issues(
|
|||
item["issue_content"], user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_content = f"Jira Issue {item['issue_identifier']}: {item['issue_title']}\n\nStatus: {item['formatted_issue'].get('status', 'Unknown')}\n\n"
|
||||
if item["formatted_issue"].get("description"):
|
||||
summary_content += f"Description: {item['formatted_issue'].get('description')}\n\n"
|
||||
summary_content += f"Comments: {item['comment_count']}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
summary_content = f"Jira Issue {item['issue_identifier']}: {item['issue_title']}\n\n{item['issue_content']}"
|
||||
summary_embedding = embed_text(summary_content)
|
||||
|
||||
# Process chunks - using the full issue content with comments
|
||||
chunks = await create_document_chunks(item["issue_content"])
|
||||
|
|
|
|||
|
|
@ -13,13 +13,13 @@ from datetime import datetime
|
|||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.connectors.linear_connector import LinearConnector
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
|
|
@ -395,7 +395,7 @@ async def index_linear_issues(
|
|||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
if user_llm and connector.enable_summary:
|
||||
document_metadata_for_summary = {
|
||||
"issue_id": item["issue_identifier"],
|
||||
"issue_title": item["issue_title"],
|
||||
|
|
@ -412,17 +412,8 @@ async def index_linear_issues(
|
|||
item["issue_content"], user_llm, document_metadata_for_summary
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
description = item["description"]
|
||||
if description and len(description) > 1000:
|
||||
description = description[:997] + "..."
|
||||
summary_content = f"Linear Issue {item['issue_identifier']}: {item['issue_title']}\n\nStatus: {item['state']}\n\n"
|
||||
if description:
|
||||
summary_content += f"Description: {description}\n\n"
|
||||
summary_content += f"Comments: {item['comment_count']}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
summary_content = f"Linear Issue {item['issue_identifier']}: {item['issue_title']}\n\nStatus: {item['state']}\n\n{item['issue_content']}"
|
||||
summary_embedding = embed_text(summary_content)
|
||||
|
||||
chunks = await create_document_chunks(item["issue_content"])
|
||||
|
||||
|
|
|
|||
|
|
@ -13,13 +13,13 @@ from datetime import datetime, timedelta
|
|||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.connectors.luma_connector import LumaConnector
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
|
|
@ -441,7 +441,7 @@ async def index_luma_events(
|
|||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
if user_llm and connector.enable_summary:
|
||||
document_metadata_for_summary = {
|
||||
"event_id": item["event_id"],
|
||||
"event_name": item["event_name"],
|
||||
|
|
@ -462,29 +462,10 @@ async def index_luma_events(
|
|||
item["event_markdown"], user_llm, document_metadata_for_summary
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_content = f"Luma Event: {item['event_name']}\n\n"
|
||||
if item["event_url"]:
|
||||
summary_content += f"URL: {item['event_url']}\n"
|
||||
summary_content += f"Start: {item['start_at']}\n"
|
||||
summary_content += f"End: {item['end_at']}\n"
|
||||
if item["timezone"]:
|
||||
summary_content += f"Timezone: {item['timezone']}\n"
|
||||
if item["location"]:
|
||||
summary_content += f"Location: {item['location']}\n"
|
||||
if item["city"]:
|
||||
summary_content += f"City: {item['city']}\n"
|
||||
if item["host_names"]:
|
||||
summary_content += f"Hosts: {item['host_names']}\n"
|
||||
if item["description"]:
|
||||
desc_preview = item["description"][:1000]
|
||||
if len(item["description"]) > 1000:
|
||||
desc_preview += "..."
|
||||
summary_content += f"Description: {desc_preview}\n"
|
||||
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
summary_content = (
|
||||
f"Luma Event: {item['event_name']}\n\n{item['event_markdown']}"
|
||||
)
|
||||
summary_embedding = embed_text(summary_content)
|
||||
|
||||
chunks = await create_document_chunks(item["event_markdown"])
|
||||
|
||||
|
|
|
|||
|
|
@ -13,13 +13,13 @@ from datetime import datetime
|
|||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.connectors.notion_history import NotionHistoryConnector
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
|
|
@ -447,7 +447,7 @@ async def index_notion_pages(
|
|||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
if user_llm and connector.enable_summary:
|
||||
document_metadata_for_summary = {
|
||||
"page_title": item["page_title"],
|
||||
"page_id": item["page_id"],
|
||||
|
|
@ -463,11 +463,8 @@ async def index_notion_pages(
|
|||
document_metadata_for_summary,
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_content = f"Notion Page: {item['page_title']}\n\n{item['markdown_content'][:500]}..."
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
summary_content = f"Notion Page: {item['page_title']}\n\n{item['markdown_content']}"
|
||||
summary_embedding = embed_text(summary_content)
|
||||
|
||||
chunks = await create_document_chunks(item["markdown_content"])
|
||||
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ from app.services.llm_service import get_user_long_context_llm
|
|||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
|
|
@ -546,7 +547,7 @@ async def index_obsidian_vault(
|
|||
|
||||
# Generate summary
|
||||
summary_content = ""
|
||||
if long_context_llm:
|
||||
if long_context_llm and connector.enable_summary:
|
||||
summary_content, _ = await generate_document_summary(
|
||||
document_string,
|
||||
long_context_llm,
|
||||
|
|
@ -554,7 +555,7 @@ async def index_obsidian_vault(
|
|||
)
|
||||
|
||||
# Generate embedding
|
||||
embedding = config.embedding_model_instance.embed(document_string)
|
||||
embedding = embed_text(document_string)
|
||||
|
||||
# Add URL and summary to metadata
|
||||
document_metadata["url"] = f"obsidian://{vault_name}/{relative_path}"
|
||||
|
|
|
|||
|
|
@ -17,12 +17,12 @@ from slack_sdk.errors import SlackApiError
|
|||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.connectors.slack_history import SlackHistory
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_unique_identifier_hash,
|
||||
)
|
||||
|
|
@ -542,9 +542,7 @@ async def index_slack_messages(
|
|||
|
||||
# Heavy processing (embeddings, chunks)
|
||||
chunks = await create_document_chunks(item["combined_document_string"])
|
||||
doc_embedding = config.embedding_model_instance.embed(
|
||||
item["combined_document_string"]
|
||||
)
|
||||
doc_embedding = embed_text(item["combined_document_string"])
|
||||
|
||||
# Update document to READY with actual content
|
||||
document.title = f"{item['team_name']}#{item['channel_name']}"
|
||||
|
|
|
|||
|
|
@ -16,12 +16,12 @@ from datetime import UTC, datetime
|
|||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.connectors.teams_history import TeamsHistory
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_unique_identifier_hash,
|
||||
)
|
||||
|
|
@ -581,9 +581,7 @@ async def index_teams_messages(
|
|||
|
||||
# Heavy processing (embeddings, chunks)
|
||||
chunks = await create_document_chunks(item["combined_document_string"])
|
||||
doc_embedding = config.embedding_model_instance.embed(
|
||||
item["combined_document_string"]
|
||||
)
|
||||
doc_embedding = embed_text(item["combined_document_string"])
|
||||
|
||||
# Update document to READY with actual content
|
||||
document.title = f"{item['team_name']} - {item['channel_name']}"
|
||||
|
|
|
|||
|
|
@ -13,13 +13,13 @@ from datetime import datetime
|
|||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.connectors.webcrawler_connector import WebCrawlerConnector
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
|
|
@ -377,7 +377,7 @@ async def index_crawled_urls(
|
|||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
if user_llm and connector.enable_summary:
|
||||
document_metadata_for_summary = {
|
||||
"url": url,
|
||||
"title": title,
|
||||
|
|
@ -393,24 +393,8 @@ async def index_crawled_urls(
|
|||
structured_document, user_llm, document_metadata_for_summary
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_content = f"Crawled URL: {title}\n\n"
|
||||
summary_content += f"URL: {url}\n"
|
||||
if description:
|
||||
summary_content += f"Description: {description}\n"
|
||||
if language:
|
||||
summary_content += f"Language: {language}\n"
|
||||
summary_content += f"Crawler: {crawler_type}\n\n"
|
||||
|
||||
# Add content preview
|
||||
content_preview = content[:1000]
|
||||
if len(content) > 1000:
|
||||
content_preview += "..."
|
||||
summary_content += f"Content Preview:\n{content_preview}\n"
|
||||
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
summary_content = f"Crawled URL: {title}\n\nURL: {url}\n\n{content}"
|
||||
summary_embedding = embed_text(summary_content)
|
||||
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(content)
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@ from app.services.task_logging_service import TaskLoggingService
|
|||
from app.utils.document_converters import (
|
||||
convert_document_to_markdown,
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
|
|
@ -760,11 +761,7 @@ async def add_received_file_document_using_docling(
|
|||
f"{metadata_section}\n\n# DOCUMENT SUMMARY\n\n{summary_content}"
|
||||
)
|
||||
|
||||
from app.config import config
|
||||
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
enhanced_summary_content
|
||||
)
|
||||
summary_embedding = embed_text(enhanced_summary_content)
|
||||
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(file_in_markdown)
|
||||
|
|
@ -1599,6 +1596,7 @@ async def process_file_in_background_with_document(
|
|||
log_entry: Log,
|
||||
connector: dict | None = None,
|
||||
notification: Notification | None = None,
|
||||
should_summarize: bool = False,
|
||||
) -> Document | None:
|
||||
"""
|
||||
Process file and update existing pending document (2-phase pattern).
|
||||
|
|
@ -1881,6 +1879,7 @@ async def process_file_in_background_with_document(
|
|||
user_id=user_id,
|
||||
session=session,
|
||||
llm=user_llm,
|
||||
should_summarize=should_summarize,
|
||||
)
|
||||
|
||||
await task_logger.log_task_success(
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ from sqlalchemy.orm import selectinload
|
|||
|
||||
from app.config import config
|
||||
from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument, async_session_maker
|
||||
from app.utils.document_converters import embed_text
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -89,7 +90,7 @@ def create_surfsense_docs_chunks(content: str) -> list[SurfsenseDocsChunk]:
|
|||
return [
|
||||
SurfsenseDocsChunk(
|
||||
content=chunk.text,
|
||||
embedding=config.embedding_model_instance.embed(chunk.text),
|
||||
embedding=embed_text(chunk.text),
|
||||
)
|
||||
for chunk in config.chunker_instance.chunk(content)
|
||||
]
|
||||
|
|
@ -154,7 +155,7 @@ async def index_surfsense_docs(session: AsyncSession) -> tuple[int, int, int, in
|
|||
existing_doc.title = title
|
||||
existing_doc.content = content
|
||||
existing_doc.content_hash = content_hash
|
||||
existing_doc.embedding = config.embedding_model_instance.embed(content)
|
||||
existing_doc.embedding = embed_text(content)
|
||||
existing_doc.chunks = chunks
|
||||
existing_doc.updated_at = datetime.now(UTC)
|
||||
|
||||
|
|
@ -170,7 +171,7 @@ async def index_surfsense_docs(session: AsyncSession) -> tuple[int, int, int, in
|
|||
title=title,
|
||||
content=content,
|
||||
content_hash=content_hash,
|
||||
embedding=config.embedding_model_instance.embed(content),
|
||||
embedding=embed_text(content),
|
||||
chunks=chunks,
|
||||
updated_at=datetime.now(UTC),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1,11 +1,59 @@
|
|||
import hashlib
|
||||
import logging
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
from litellm import get_model_info, token_counter
|
||||
|
||||
from app.config import config
|
||||
from app.db import Chunk, DocumentType
|
||||
from app.prompts import SUMMARY_PROMPT_TEMPLATE
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _get_embedding_max_tokens() -> int:
|
||||
"""Get the max token limit for the configured embedding model.
|
||||
|
||||
Checks model properties in order: max_seq_length, _max_tokens.
|
||||
Falls back to 8192 (OpenAI embedding default).
|
||||
"""
|
||||
model = config.embedding_model_instance
|
||||
for attr in ("max_seq_length", "_max_tokens"):
|
||||
val = getattr(model, attr, None)
|
||||
if isinstance(val, int) and val > 0:
|
||||
return val
|
||||
return 8192
|
||||
|
||||
|
||||
def truncate_for_embedding(text: str) -> str:
|
||||
"""Truncate text to fit within the embedding model's context window.
|
||||
|
||||
Uses the embedding model's own tokenizer for accurate token counting,
|
||||
so the result is model-agnostic regardless of the underlying provider.
|
||||
"""
|
||||
max_tokens = _get_embedding_max_tokens()
|
||||
if len(text) // 3 <= max_tokens:
|
||||
return text
|
||||
|
||||
tokenizer = config.embedding_model_instance.get_tokenizer()
|
||||
tokens = tokenizer.encode(text)
|
||||
if len(tokens) <= max_tokens:
|
||||
return text
|
||||
|
||||
warnings.warn(
|
||||
f"Truncating text from {len(tokens)} to {max_tokens} tokens for embedding.",
|
||||
stacklevel=2,
|
||||
)
|
||||
return tokenizer.decode(tokens[:max_tokens])
|
||||
|
||||
|
||||
def embed_text(text: str) -> np.ndarray:
|
||||
"""Truncate text to fit and embed it. Drop-in replacement for
|
||||
``config.embedding_model_instance.embed(text)`` that never exceeds the
|
||||
model's context window."""
|
||||
return config.embedding_model_instance.embed(truncate_for_embedding(text))
|
||||
|
||||
|
||||
def get_model_context_window(model_name: str) -> int:
|
||||
"""Get the total context window size for a model (input + output tokens)."""
|
||||
|
|
@ -146,7 +194,7 @@ async def generate_document_summary(
|
|||
else:
|
||||
enhanced_summary_content = summary_content
|
||||
|
||||
summary_embedding = config.embedding_model_instance.embed(enhanced_summary_content)
|
||||
summary_embedding = embed_text(enhanced_summary_content)
|
||||
|
||||
return enhanced_summary_content, summary_embedding
|
||||
|
||||
|
|
@ -164,7 +212,7 @@ async def create_document_chunks(content: str) -> list[Chunk]:
|
|||
return [
|
||||
Chunk(
|
||||
content=chunk.text,
|
||||
embedding=config.embedding_model_instance.embed(chunk.text),
|
||||
embedding=embed_text(chunk.text),
|
||||
)
|
||||
for chunk in config.chunker_instance.chunk(content)
|
||||
]
|
||||
|
|
|
|||
|
|
@ -29,4 +29,7 @@ if __name__ == "__main__":
|
|||
config = uvicorn.Config(**config_kwargs)
|
||||
server = uvicorn.Server(config)
|
||||
|
||||
server.run()
|
||||
if sys.platform == "win32":
|
||||
asyncio.run(server.serve(), loop_factory=asyncio.SelectorEventLoop)
|
||||
else:
|
||||
server.run()
|
||||
|
|
|
|||
|
|
@ -26,7 +26,6 @@ import {
|
|||
import {
|
||||
clearPlanOwnerRegistry,
|
||||
// extractWriteTodosFromContent,
|
||||
hydratePlanStateAtom,
|
||||
} from "@/atoms/chat/plan-state.atom";
|
||||
import { closeReportPanelAtom } from "@/atoms/chat/report-panel.atom";
|
||||
import { membersAtom } from "@/atoms/members/members-query.atoms";
|
||||
|
|
@ -73,7 +72,6 @@ import {
|
|||
appendText,
|
||||
buildContentForPersistence,
|
||||
buildContentForUI,
|
||||
type ContentPart,
|
||||
type ContentPartsState,
|
||||
readSSEStream,
|
||||
type ThinkingStepData,
|
||||
|
|
@ -188,7 +186,6 @@ export default function NewChatPage() {
|
|||
const setMentionedDocumentIds = useSetAtom(mentionedDocumentIdsAtom);
|
||||
const setMentionedDocuments = useSetAtom(mentionedDocumentsAtom);
|
||||
const setMessageDocumentsMap = useSetAtom(messageDocumentsMapAtom);
|
||||
const hydratePlanState = useSetAtom(hydratePlanStateAtom);
|
||||
const setCurrentThreadState = useSetAtom(currentThreadAtom);
|
||||
const setTargetCommentId = useSetAtom(setTargetCommentIdAtom);
|
||||
const clearTargetCommentId = useSetAtom(clearTargetCommentIdAtom);
|
||||
|
|
@ -350,7 +347,6 @@ export default function NewChatPage() {
|
|||
setMessageDocumentsMap,
|
||||
setMentionedDocumentIds,
|
||||
setMentionedDocuments,
|
||||
hydratePlanState,
|
||||
closeReportPanel,
|
||||
]);
|
||||
|
||||
|
|
|
|||
|
|
@ -97,6 +97,7 @@ export const ConnectorIndicator: FC<{ hideTrigger?: boolean }> = ({ hideTrigger
|
|||
isDisconnecting,
|
||||
periodicEnabled,
|
||||
frequencyMinutes,
|
||||
enableSummary,
|
||||
allConnectors,
|
||||
viewingAccountsType,
|
||||
viewingMCPList,
|
||||
|
|
@ -105,6 +106,7 @@ export const ConnectorIndicator: FC<{ hideTrigger?: boolean }> = ({ hideTrigger
|
|||
setEndDate,
|
||||
setPeriodicEnabled,
|
||||
setFrequencyMinutes,
|
||||
setEnableSummary,
|
||||
handleOpenChange,
|
||||
handleTabChange,
|
||||
handleScroll,
|
||||
|
|
@ -282,6 +284,7 @@ export const ConnectorIndicator: FC<{ hideTrigger?: boolean }> = ({ hideTrigger
|
|||
endDate={endDate}
|
||||
periodicEnabled={periodicEnabled}
|
||||
frequencyMinutes={frequencyMinutes}
|
||||
enableSummary={enableSummary}
|
||||
isSaving={isSaving}
|
||||
isDisconnecting={isDisconnecting}
|
||||
isIndexing={indexingConnectorIds.has(editingConnector.id)}
|
||||
|
|
@ -290,6 +293,7 @@ export const ConnectorIndicator: FC<{ hideTrigger?: boolean }> = ({ hideTrigger
|
|||
onEndDateChange={setEndDate}
|
||||
onPeriodicEnabledChange={setPeriodicEnabled}
|
||||
onFrequencyChange={setFrequencyMinutes}
|
||||
onEnableSummaryChange={setEnableSummary}
|
||||
onSave={() => {
|
||||
startIndexing(editingConnector.id);
|
||||
handleSaveConnector(() => refreshConnectors());
|
||||
|
|
@ -328,11 +332,13 @@ export const ConnectorIndicator: FC<{ hideTrigger?: boolean }> = ({ hideTrigger
|
|||
endDate={endDate}
|
||||
periodicEnabled={periodicEnabled}
|
||||
frequencyMinutes={frequencyMinutes}
|
||||
enableSummary={enableSummary}
|
||||
isStartingIndexing={isStartingIndexing}
|
||||
onStartDateChange={setStartDate}
|
||||
onEndDateChange={setEndDate}
|
||||
onPeriodicEnabledChange={setPeriodicEnabled}
|
||||
onFrequencyChange={setFrequencyMinutes}
|
||||
onEnableSummaryChange={setEnableSummary}
|
||||
onConfigChange={setIndexingConnectorConfig}
|
||||
onStartIndexing={() => {
|
||||
if (indexingConfig.connectorId) {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,25 @@
|
|||
"use client";
|
||||
|
||||
import type { FC } from "react";
|
||||
import { Switch } from "@/components/ui/switch";
|
||||
|
||||
interface SummaryConfigProps {
|
||||
enabled: boolean;
|
||||
onEnabledChange: (enabled: boolean) => void;
|
||||
}
|
||||
|
||||
export const SummaryConfig: FC<SummaryConfigProps> = ({ enabled, onEnabledChange }) => {
|
||||
return (
|
||||
<div className="rounded-xl bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6">
|
||||
<div className="flex items-center justify-between">
|
||||
<div className="space-y-1">
|
||||
<h3 className="font-medium text-sm sm:text-base">Enable AI Summary</h3>
|
||||
<p className="text-xs sm:text-sm text-muted-foreground">
|
||||
Improves search quality but adds latency during indexing
|
||||
</p>
|
||||
</div>
|
||||
<Switch checked={enabled} onCheckedChange={onEnabledChange} />
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
};
|
||||
|
|
@ -9,6 +9,7 @@ import type { SearchSourceConnector } from "@/contracts/types/connector.types";
|
|||
import { cn } from "@/lib/utils";
|
||||
import { DateRangeSelector } from "../../components/date-range-selector";
|
||||
import { PeriodicSyncConfig } from "../../components/periodic-sync-config";
|
||||
import { SummaryConfig } from "../../components/summary-config";
|
||||
import { getConnectorDisplayName } from "../../tabs/all-connectors-tab";
|
||||
import { getConnectorConfigComponent } from "../index";
|
||||
|
||||
|
|
@ -18,6 +19,7 @@ interface ConnectorEditViewProps {
|
|||
endDate: Date | undefined;
|
||||
periodicEnabled: boolean;
|
||||
frequencyMinutes: string;
|
||||
enableSummary: boolean;
|
||||
isSaving: boolean;
|
||||
isDisconnecting: boolean;
|
||||
isIndexing?: boolean;
|
||||
|
|
@ -26,6 +28,7 @@ interface ConnectorEditViewProps {
|
|||
onEndDateChange: (date: Date | undefined) => void;
|
||||
onPeriodicEnabledChange: (enabled: boolean) => void;
|
||||
onFrequencyChange: (frequency: string) => void;
|
||||
onEnableSummaryChange: (enabled: boolean) => void;
|
||||
onSave: () => void;
|
||||
onDisconnect: () => void;
|
||||
onBack: () => void;
|
||||
|
|
@ -40,6 +43,7 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
|
|||
endDate,
|
||||
periodicEnabled,
|
||||
frequencyMinutes,
|
||||
enableSummary,
|
||||
isSaving,
|
||||
isDisconnecting,
|
||||
isIndexing = false,
|
||||
|
|
@ -48,6 +52,7 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
|
|||
onEndDateChange,
|
||||
onPeriodicEnabledChange,
|
||||
onFrequencyChange,
|
||||
onEnableSummaryChange,
|
||||
onSave,
|
||||
onDisconnect,
|
||||
onBack,
|
||||
|
|
@ -209,9 +214,12 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
|
|||
/>
|
||||
)}
|
||||
|
||||
{/* Date range selector and periodic sync - only shown for indexable connectors */}
|
||||
{/* Summary and sync settings - only shown for indexable connectors */}
|
||||
{connector.is_indexable && (
|
||||
<>
|
||||
{/* AI Summary toggle */}
|
||||
<SummaryConfig enabled={enableSummary} onEnabledChange={onEnableSummaryChange} />
|
||||
|
||||
{/* Date range selector - not shown for Google Drive (regular and Composio), Webcrawler, or GitHub (indexes full repo snapshots) */}
|
||||
{connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" &&
|
||||
connector.connector_type !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" &&
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ import { getConnectorTypeDisplay } from "@/lib/connectors/utils";
|
|||
import { cn } from "@/lib/utils";
|
||||
import { DateRangeSelector } from "../../components/date-range-selector";
|
||||
import { PeriodicSyncConfig } from "../../components/periodic-sync-config";
|
||||
import { SummaryConfig } from "../../components/summary-config";
|
||||
import type { IndexingConfigState } from "../../constants/connector-constants";
|
||||
import { getConnectorDisplayName } from "../../tabs/all-connectors-tab";
|
||||
import { getConnectorConfigComponent } from "../index";
|
||||
|
|
@ -21,11 +22,13 @@ interface IndexingConfigurationViewProps {
|
|||
endDate: Date | undefined;
|
||||
periodicEnabled: boolean;
|
||||
frequencyMinutes: string;
|
||||
enableSummary: boolean;
|
||||
isStartingIndexing: boolean;
|
||||
onStartDateChange: (date: Date | undefined) => void;
|
||||
onEndDateChange: (date: Date | undefined) => void;
|
||||
onPeriodicEnabledChange: (enabled: boolean) => void;
|
||||
onFrequencyChange: (frequency: string) => void;
|
||||
onEnableSummaryChange: (enabled: boolean) => void;
|
||||
onConfigChange?: (config: Record<string, unknown>) => void;
|
||||
onStartIndexing: () => void;
|
||||
onSkip: () => void;
|
||||
|
|
@ -38,11 +41,13 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
|
|||
endDate,
|
||||
periodicEnabled,
|
||||
frequencyMinutes,
|
||||
enableSummary,
|
||||
isStartingIndexing,
|
||||
onStartDateChange,
|
||||
onEndDateChange,
|
||||
onPeriodicEnabledChange,
|
||||
onFrequencyChange,
|
||||
onEnableSummaryChange,
|
||||
onConfigChange,
|
||||
onStartIndexing,
|
||||
onSkip,
|
||||
|
|
@ -149,9 +154,12 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
|
|||
<ConnectorConfigComponent connector={connector} onConfigChange={onConfigChange} />
|
||||
)}
|
||||
|
||||
{/* Date range selector and periodic sync - only shown for indexable connectors */}
|
||||
{/* Summary and sync settings - only shown for indexable connectors */}
|
||||
{connector?.is_indexable && (
|
||||
<>
|
||||
{/* AI Summary toggle */}
|
||||
<SummaryConfig enabled={enableSummary} onEnabledChange={onEnableSummaryChange} />
|
||||
|
||||
{/* Date range selector - not shown for Google Drive (regular and Composio), Webcrawler, or GitHub (indexes full repo snapshots) */}
|
||||
{config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" &&
|
||||
config.connectorType !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" &&
|
||||
|
|
|
|||
|
|
@ -67,6 +67,7 @@ export const useConnectorDialog = () => {
|
|||
const [isStartingIndexing, setIsStartingIndexing] = useState(false);
|
||||
const [periodicEnabled, setPeriodicEnabled] = useState(false);
|
||||
const [frequencyMinutes, setFrequencyMinutes] = useState("1440");
|
||||
const [enableSummary, setEnableSummary] = useState(false);
|
||||
|
||||
// Edit mode state
|
||||
const [editingConnector, setEditingConnector] = useState<SearchSourceConnector | null>(null);
|
||||
|
|
@ -240,6 +241,7 @@ export const useConnectorDialog = () => {
|
|||
!connector.is_indexable ? false : connector.periodic_indexing_enabled
|
||||
);
|
||||
setFrequencyMinutes(connector.indexing_frequency_minutes?.toString() || "1440");
|
||||
setEnableSummary(connector.enable_summary ?? false);
|
||||
// Reset dates - user can set new ones for re-indexing
|
||||
setStartDate(undefined);
|
||||
setEndDate(undefined);
|
||||
|
|
@ -257,6 +259,7 @@ export const useConnectorDialog = () => {
|
|||
setEndDate(undefined);
|
||||
setPeriodicEnabled(false);
|
||||
setFrequencyMinutes("1440");
|
||||
setEnableSummary(false);
|
||||
setIsScrolled(false);
|
||||
setSearchQuery("");
|
||||
}
|
||||
|
|
@ -269,6 +272,7 @@ export const useConnectorDialog = () => {
|
|||
setEndDate(undefined);
|
||||
setPeriodicEnabled(false);
|
||||
setFrequencyMinutes("1440");
|
||||
setEnableSummary(false);
|
||||
setIsScrolled(false);
|
||||
setSearchQuery("");
|
||||
}
|
||||
|
|
@ -722,6 +726,7 @@ export const useConnectorDialog = () => {
|
|||
setConnectorConfig(connector.config || {});
|
||||
setPeriodicEnabled(false);
|
||||
setFrequencyMinutes("1440");
|
||||
setEnableSummary(connector.enable_summary ?? false);
|
||||
setStartDate(undefined);
|
||||
setEndDate(undefined);
|
||||
|
||||
|
|
@ -909,12 +914,13 @@ export const useConnectorDialog = () => {
|
|||
const startDateStr = startDate ? format(startDate, "yyyy-MM-dd") : undefined;
|
||||
const endDateStr = endDate ? format(endDate, "yyyy-MM-dd") : undefined;
|
||||
|
||||
// Update connector with periodic sync settings and config changes
|
||||
if (periodicEnabled || indexingConnectorConfig) {
|
||||
// Update connector with summary, periodic sync settings, and config changes
|
||||
if (enableSummary || periodicEnabled || indexingConnectorConfig) {
|
||||
const frequency = periodicEnabled ? parseInt(frequencyMinutes, 10) : undefined;
|
||||
await updateConnector({
|
||||
id: indexingConfig.connectorId,
|
||||
data: {
|
||||
enable_summary: enableSummary,
|
||||
...(periodicEnabled && {
|
||||
periodic_indexing_enabled: true,
|
||||
indexing_frequency_minutes: frequency,
|
||||
|
|
@ -1042,6 +1048,7 @@ export const useConnectorDialog = () => {
|
|||
updateConnector,
|
||||
periodicEnabled,
|
||||
frequencyMinutes,
|
||||
enableSummary,
|
||||
router,
|
||||
indexingConnectorConfig,
|
||||
]
|
||||
|
|
@ -1108,6 +1115,7 @@ export const useConnectorDialog = () => {
|
|||
// Load existing periodic sync settings (disabled for non-indexable connectors)
|
||||
setPeriodicEnabled(!connector.is_indexable ? false : connector.periodic_indexing_enabled);
|
||||
setFrequencyMinutes(connector.indexing_frequency_minutes?.toString() || "1440");
|
||||
setEnableSummary(connector.enable_summary ?? false);
|
||||
// Reset dates - user can set new ones for re-indexing
|
||||
setStartDate(undefined);
|
||||
setEndDate(undefined);
|
||||
|
|
@ -1189,6 +1197,7 @@ export const useConnectorDialog = () => {
|
|||
id: editingConnector.id,
|
||||
data: {
|
||||
name: connectorName || editingConnector.name,
|
||||
enable_summary: enableSummary,
|
||||
periodic_indexing_enabled: !editingConnector.is_indexable ? false : periodicEnabled,
|
||||
indexing_frequency_minutes: !editingConnector.is_indexable ? null : frequency,
|
||||
config: connectorConfig || editingConnector.config,
|
||||
|
|
@ -1326,6 +1335,7 @@ export const useConnectorDialog = () => {
|
|||
updateConnector,
|
||||
periodicEnabled,
|
||||
frequencyMinutes,
|
||||
enableSummary,
|
||||
getFrequencyLabel,
|
||||
router,
|
||||
connectorConfig,
|
||||
|
|
@ -1518,6 +1528,7 @@ export const useConnectorDialog = () => {
|
|||
setEndDate(undefined);
|
||||
setPeriodicEnabled(false);
|
||||
setFrequencyMinutes("1440");
|
||||
setEnableSummary(false);
|
||||
}
|
||||
}
|
||||
},
|
||||
|
|
@ -1557,6 +1568,7 @@ export const useConnectorDialog = () => {
|
|||
isDisconnecting,
|
||||
periodicEnabled,
|
||||
frequencyMinutes,
|
||||
enableSummary,
|
||||
searchSpaceId,
|
||||
allConnectors,
|
||||
viewingAccountsType,
|
||||
|
|
@ -1568,6 +1580,7 @@ export const useConnectorDialog = () => {
|
|||
setEndDate,
|
||||
setPeriodicEnabled,
|
||||
setFrequencyMinutes,
|
||||
setEnableSummary,
|
||||
setConnectorName,
|
||||
|
||||
// Handlers
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ import { useCallback, useMemo, useRef, useState } from "react";
|
|||
import { useDropzone } from "react-dropzone";
|
||||
import { toast } from "sonner";
|
||||
import { uploadDocumentMutationAtom } from "@/atoms/documents/document-mutation.atoms";
|
||||
import { SummaryConfig } from "@/components/assistant-ui/connector-popup/components/summary-config";
|
||||
import {
|
||||
Accordion,
|
||||
AccordionContent,
|
||||
|
|
@ -124,6 +125,7 @@ export function DocumentUploadTab({
|
|||
const [files, setFiles] = useState<File[]>([]);
|
||||
const [uploadProgress, setUploadProgress] = useState(0);
|
||||
const [accordionValue, setAccordionValue] = useState<string>("");
|
||||
const [shouldSummarize, setShouldSummarize] = useState(false);
|
||||
const [uploadDocumentMutation] = useAtom(uploadDocumentMutationAtom);
|
||||
const { mutate: uploadDocuments, isPending: isUploading } = uploadDocumentMutation;
|
||||
const fileInputRef = useRef<HTMLInputElement>(null);
|
||||
|
|
@ -216,7 +218,7 @@ export function DocumentUploadTab({
|
|||
}, 200);
|
||||
|
||||
uploadDocuments(
|
||||
{ files, search_space_id: Number(searchSpaceId) },
|
||||
{ files, search_space_id: Number(searchSpaceId), should_summarize: shouldSummarize },
|
||||
{
|
||||
onSuccess: () => {
|
||||
clearInterval(progressInterval);
|
||||
|
|
@ -413,6 +415,10 @@ export function DocumentUploadTab({
|
|||
</motion.div>
|
||||
)}
|
||||
|
||||
<div className="mt-3 sm:mt-6">
|
||||
<SummaryConfig enabled={shouldSummarize} onEnabledChange={setShouldSummarize} />
|
||||
</div>
|
||||
|
||||
<motion.div
|
||||
className="mt-3 sm:mt-6"
|
||||
initial={{ opacity: 0, y: 10 }}
|
||||
|
|
|
|||
|
|
@ -41,6 +41,7 @@ export const searchSourceConnector = z.object({
|
|||
is_active: z.boolean().default(true),
|
||||
last_indexed_at: z.string().nullable(),
|
||||
config: z.record(z.string(), z.any()),
|
||||
enable_summary: z.boolean().default(false),
|
||||
periodic_indexing_enabled: z.boolean(),
|
||||
indexing_frequency_minutes: z.number().nullable(),
|
||||
next_scheduled_at: z.string().nullable(),
|
||||
|
|
@ -94,6 +95,7 @@ export const createConnectorRequest = z.object({
|
|||
is_active: true,
|
||||
last_indexed_at: true,
|
||||
config: true,
|
||||
enable_summary: true,
|
||||
periodic_indexing_enabled: true,
|
||||
indexing_frequency_minutes: true,
|
||||
next_scheduled_at: true,
|
||||
|
|
@ -118,6 +120,7 @@ export const updateConnectorRequest = z.object({
|
|||
is_active: true,
|
||||
last_indexed_at: true,
|
||||
config: true,
|
||||
enable_summary: true,
|
||||
periodic_indexing_enabled: true,
|
||||
indexing_frequency_minutes: true,
|
||||
next_scheduled_at: true,
|
||||
|
|
|
|||
|
|
@ -135,6 +135,7 @@ export const createDocumentResponse = z.object({
|
|||
export const uploadDocumentRequest = z.object({
|
||||
files: z.array(z.instanceof(File)),
|
||||
search_space_id: z.number(),
|
||||
should_summarize: z.boolean().default(false),
|
||||
});
|
||||
|
||||
export const uploadDocumentResponse = z.object({
|
||||
|
|
|
|||
|
|
@ -127,6 +127,7 @@ class DocumentsApiService {
|
|||
formData.append("files", file);
|
||||
});
|
||||
formData.append("search_space_id", String(parsedRequest.data.search_space_id));
|
||||
formData.append("should_summarize", String(parsedRequest.data.should_summarize));
|
||||
|
||||
return baseApiService.postFormData(`/api/v1/documents/fileupload`, uploadDocumentResponse, {
|
||||
body: formData,
|
||||
|
|
|
|||
|
|
@ -70,7 +70,8 @@ const pendingSyncs = new Map<string, Promise<SyncHandle>>();
|
|||
// v5: fixed duplicate key errors, stable cutoff dates, onMustRefetch handler,
|
||||
// real-time documents table with title/created_by_id/status columns,
|
||||
// consolidated single documents sync, pending state for document queue visibility
|
||||
const SYNC_VERSION = 5;
|
||||
// v6: added enable_summary column to search_source_connectors
|
||||
const SYNC_VERSION = 6;
|
||||
|
||||
// Database name prefix for identifying SurfSense databases
|
||||
const DB_PREFIX = "surfsense-";
|
||||
|
|
@ -214,20 +215,21 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
// Create the search_source_connectors table schema in PGlite
|
||||
// This matches the backend schema
|
||||
await db.exec(`
|
||||
CREATE TABLE IF NOT EXISTS search_source_connectors (
|
||||
id INTEGER PRIMARY KEY,
|
||||
search_space_id INTEGER NOT NULL,
|
||||
user_id TEXT NOT NULL,
|
||||
connector_type TEXT NOT NULL,
|
||||
name TEXT NOT NULL,
|
||||
is_indexable BOOLEAN NOT NULL DEFAULT FALSE,
|
||||
last_indexed_at TIMESTAMPTZ,
|
||||
config JSONB DEFAULT '{}',
|
||||
periodic_indexing_enabled BOOLEAN NOT NULL DEFAULT FALSE,
|
||||
indexing_frequency_minutes INTEGER,
|
||||
next_scheduled_at TIMESTAMPTZ,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS search_source_connectors (
|
||||
id INTEGER PRIMARY KEY,
|
||||
search_space_id INTEGER NOT NULL,
|
||||
user_id TEXT NOT NULL,
|
||||
connector_type TEXT NOT NULL,
|
||||
name TEXT NOT NULL,
|
||||
is_indexable BOOLEAN NOT NULL DEFAULT FALSE,
|
||||
last_indexed_at TIMESTAMPTZ,
|
||||
config JSONB DEFAULT '{}',
|
||||
periodic_indexing_enabled BOOLEAN NOT NULL DEFAULT FALSE,
|
||||
indexing_frequency_minutes INTEGER,
|
||||
next_scheduled_at TIMESTAMPTZ,
|
||||
enable_summary BOOLEAN NOT NULL DEFAULT FALSE,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_connectors_search_space_id ON search_source_connectors(search_space_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_connectors_type ON search_source_connectors(connector_type);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue