diff --git a/README.md b/README.md index 7a10a71a7..acd900588 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,9 @@ # SurfSense -While tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base. It is a highly customizable AI research agent, connected to external sources such as Search Engines (SearxNG, Tavily, LinkUp), Google Drive, Slack, Linear, Jira, ClickUp, Confluence, BookStack, Gmail, Notion, YouTube, GitHub, Discord, Airtable, Google Calendar, Luma, Circleback, Elasticsearch and more to come. +Connect any LLM to your internal knowledge sources and chat with it in real time alongside your team. OSS alternative to NotebookLM, Perplexity, and Glean. + +SurfSense is a highly customizable AI research agent, connected to external sources such as Search Engines (SearxNG, Tavily, LinkUp), Google Drive, Slack, Linear, Jira, ClickUp, Confluence, BookStack, Gmail, Notion, YouTube, GitHub, Discord, Airtable, Google Calendar, Luma, Circleback, Elasticsearch and more to come.
MODSetter%2FSurfSense | Trendshift @@ -38,7 +40,7 @@ https://github.com/user-attachments/assets/a0a16566-6967-4374-ac51-9b3e07fbecd7 ## Key Features ### 💡 **Idea**: -- Have your own highly customizable private NotebookLM and Perplexity integrated with external sources. +- Open source alternative to NotebookLM, Perplexity, and Glean. Connect any LLM to your internal knowledge sources and collaborate with your team in real time. ### 📁 **Multiple File Format Uploading Support** - Save content from your own personal files *(Documents, images, videos and supports **50+ file extensions**)* to your own personal knowledge base . ### 🔍 **Powerful Search** diff --git a/README.zh-CN.md b/README.zh-CN.md index e4322ca27..4e4b0174b 100644 --- a/README.zh-CN.md +++ b/README.zh-CN.md @@ -16,7 +16,9 @@ # SurfSense -虽然像 NotebookLM 和 Perplexity 这样的工具在对任何主题/查询进行研究时令人印象深刻且非常有效,但 SurfSense 通过与您的个人知识库集成,将这一能力提升到了新的高度。它是一个高度可定制的 AI 研究助手,可以连接外部数据源,如搜索引擎(SearxNG、Tavily、LinkUp、Google Drive、Slack、Linear、Jira、ClickUp、Confluence、BookStack、Gmail、Notion、YouTube、GitHub、Discord、Airtable、Google Calendar、Luma、Circleback、Elasticsearch 等,未来还会支持更多。 +将任何 LLM 连接到您的内部知识源,并与团队成员实时聊天。NotebookLM、Perplexity 和 Glean 的开源替代方案。 + +SurfSense 是一个高度可定制的 AI 研究助手,可以连接外部数据源,如搜索引擎(SearxNG、Tavily、LinkUp)、Google Drive、Slack、Linear、Jira、ClickUp、Confluence、BookStack、Gmail、Notion、YouTube、GitHub、Discord、Airtable、Google Calendar、Luma、Circleback、Elasticsearch 等,未来还会支持更多。
MODSetter%2FSurfSense | Trendshift @@ -38,7 +40,7 @@ https://github.com/user-attachments/assets/a0a16566-6967-4374-ac51-9b3e07fbecd7 ## 核心功能 ### 💡 **理念**: -- 拥有您自己的高度可定制的私有 NotebookLM 和 Perplexity,并与外部数据源集成。 +- NotebookLM、Perplexity 和 Glean 的开源替代方案。将任何 LLM 连接到您的内部知识源,并与团队实时协作。 ### 📁 **支持多种文件格式上传** - 将您个人文件中的内容(文档、图像、视频,支持 **50+ 种文件扩展名**)保存到您自己的个人知识库。 diff --git a/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py b/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py index b3de1f4b5..5c92d2601 100644 --- a/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py @@ -11,17 +11,15 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.config import config from app.connectors.discord_connector import DiscordConnector from app.db import Document, DocumentType, SearchSourceConnectorType -from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, generate_content_hash, - generate_document_summary, generate_unique_identifier_hash, ) from .base import ( - build_document_metadata_string, + build_document_metadata_markdown, check_document_by_unique_identifier, get_connector_by_id, get_current_timestamp, @@ -336,207 +334,155 @@ async def index_discord_messages( documents_skipped += 1 continue - # Convert messages to markdown format - channel_content = ( - f"# Discord Channel: {guild_name} / {channel_name}\n\n" - ) + # Process each message as an individual document (like Slack) for msg in formatted_messages: - user_name = msg.get("author_name", "Unknown User") - timestamp = msg.get("created_at", "Unknown Time") - text = msg.get("content", "") - channel_content += ( - f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n" + msg_id = msg.get("id", "") + msg_user_name = msg.get("author_name", "Unknown User") + msg_timestamp = msg.get("created_at", "Unknown Time") + msg_text = msg.get("content", "") + + # Format document metadata (similar to Slack) + metadata_sections = [ + ( + "METADATA", + [ + f"GUILD_NAME: {guild_name}", + f"GUILD_ID: {guild_id}", + f"CHANNEL_NAME: {channel_name}", + f"CHANNEL_ID: {channel_id}", + f"MESSAGE_TIMESTAMP: {msg_timestamp}", + f"MESSAGE_USER_NAME: {msg_user_name}", + ], + ), + ( + "CONTENT", + [ + "FORMAT: markdown", + "TEXT_START", + msg_text, + "TEXT_END", + ], + ), + ] + + # Build the document string + combined_document_string = build_document_metadata_markdown( + metadata_sections ) - # Metadata sections - metadata_sections = [ - ( - "METADATA", - [ - f"GUILD_NAME: {guild_name}", - f"GUILD_ID: {guild_id}", - f"CHANNEL_NAME: {channel_name}", - f"CHANNEL_ID: {channel_id}", - f"MESSAGE_COUNT: {len(formatted_messages)}", - ], - ), - ( - "CONTENT", - [ - "FORMAT: markdown", - "TEXT_START", - channel_content, - "TEXT_END", - ], - ), - ] + # Generate unique identifier hash for this Discord message + unique_identifier = f"{channel_id}_{msg_id}" + unique_identifier_hash = generate_unique_identifier_hash( + DocumentType.DISCORD_CONNECTOR, + unique_identifier, + search_space_id, + ) - combined_document_string = build_document_metadata_string( - metadata_sections - ) + # Generate content hash + content_hash = generate_content_hash( + combined_document_string, search_space_id + ) - # Generate unique identifier hash for this Discord channel - unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.DISCORD_CONNECTOR, channel_id, search_space_id - ) + # Check if document with this unique identifier already exists + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) - # Generate content hash - content_hash = generate_content_hash( - combined_document_string, search_space_id - ) - - # Check if document with this unique identifier already exists - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - - if existing_document: - # Document exists - check if content has changed - if existing_document.content_hash == content_hash: - logger.info( - f"Document for Discord channel {guild_name}#{channel_name} unchanged. Skipping." - ) - documents_skipped += 1 - continue - else: - # Content has changed - update the existing document - logger.info( - f"Content changed for Discord channel {guild_name}#{channel_name}. Updating document." - ) - - # Get user's long context LLM - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - if not user_llm: - logger.error( - f"No long context LLM configured for user {user_id}" - ) - skipped_channels.append( - f"{guild_name}#{channel_name} (no LLM configured)" + if existing_document: + # Document exists - check if content has changed + if existing_document.content_hash == content_hash: + logger.info( + f"Document for Discord message {msg_id} in {guild_name}#{channel_name} unchanged. Skipping." ) documents_skipped += 1 continue + else: + # Content has changed - update the existing document + logger.info( + f"Content changed for Discord message {msg_id} in {guild_name}#{channel_name}. Updating document." + ) - # Generate summary with metadata - document_metadata = { - "guild_name": guild_name, - "channel_name": channel_name, - "message_count": len(formatted_messages), - "document_type": "Discord Channel Messages", - "connector_type": "Discord", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - combined_document_string, - user_llm, - document_metadata, - ) + # Update chunks and embedding + chunks = await create_document_chunks( + combined_document_string + ) + doc_embedding = config.embedding_model_instance.embed( + combined_document_string + ) - # Chunks from channel content - chunks = await create_document_chunks(channel_content) + # Update existing document + existing_document.content = combined_document_string + existing_document.content_hash = content_hash + existing_document.embedding = doc_embedding + existing_document.document_metadata = { + "guild_name": guild_name, + "guild_id": guild_id, + "channel_name": channel_name, + "channel_id": channel_id, + "message_id": msg_id, + "message_timestamp": msg_timestamp, + "message_user_name": msg_user_name, + "indexed_at": datetime.now(UTC).strftime( + "%Y-%m-%d %H:%M:%S" + ), + } - # Update existing document - existing_document.title = ( - f"Discord - {guild_name}#{channel_name}" - ) - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { + # Delete old chunks and add new ones + existing_document.chunks = chunks + existing_document.updated_at = get_current_timestamp() + + documents_indexed += 1 + logger.info( + f"Successfully updated Discord message {msg_id}" + ) + continue + + # Document doesn't exist - create new one + # Process chunks + chunks = await create_document_chunks(combined_document_string) + doc_embedding = config.embedding_model_instance.embed( + combined_document_string + ) + + # Create and store new document + document = Document( + search_space_id=search_space_id, + title=f"Discord - {guild_name}#{channel_name}", + document_type=DocumentType.DISCORD_CONNECTOR, + document_metadata={ "guild_name": guild_name, "guild_id": guild_id, "channel_name": channel_name, "channel_id": channel_id, - "message_count": len(formatted_messages), - "start_date": start_date_iso, - "end_date": end_date_iso, + "message_id": msg_id, + "message_timestamp": msg_timestamp, + "message_user_name": msg_user_name, "indexed_at": datetime.now(UTC).strftime( "%Y-%m-%d %H:%M:%S" ), - } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() + }, + content=combined_document_string, + embedding=doc_embedding, + chunks=chunks, + content_hash=content_hash, + unique_identifier_hash=unique_identifier_hash, + updated_at=get_current_timestamp(), + ) - documents_indexed += 1 + session.add(document) + documents_indexed += 1 + + # Batch commit every 10 documents + if documents_indexed % 10 == 0: logger.info( - f"Successfully updated Discord channel {guild_name}#{channel_name}" + f"Committing batch: {documents_indexed} Discord messages processed so far" ) - continue + await session.commit() - # Document doesn't exist - create new one - # Get user's long context LLM - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - if not user_llm: - logger.error( - f"No long context LLM configured for user {user_id}" - ) - skipped_channels.append( - f"{guild_name}#{channel_name} (no LLM configured)" - ) - documents_skipped += 1 - continue - - # Generate summary with metadata - document_metadata = { - "guild_name": guild_name, - "channel_name": channel_name, - "message_count": len(formatted_messages), - "document_type": "Discord Channel Messages", - "connector_type": "Discord", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - combined_document_string, user_llm, document_metadata - ) - - # Chunks from channel content - chunks = await create_document_chunks(channel_content) - - # Create and store new document - document = Document( - search_space_id=search_space_id, - title=f"Discord - {guild_name}#{channel_name}", - document_type=DocumentType.DISCORD_CONNECTOR, - document_metadata={ - "guild_name": guild_name, - "guild_id": guild_id, - "channel_name": channel_name, - "channel_id": channel_id, - "message_count": len(formatted_messages), - "start_date": start_date_iso, - "end_date": end_date_iso, - "indexed_at": datetime.now(UTC).strftime( - "%Y-%m-%d %H:%M:%S" - ), - }, - content=summary_content, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, - updated_at=get_current_timestamp(), - ) - - session.add(document) - documents_indexed += 1 logger.info( - f"Successfully indexed new channel {guild_name}#{channel_name} with {len(formatted_messages)} messages" + f"Successfully indexed channel {guild_name}#{channel_name} with {len(formatted_messages)} messages" ) - # Batch commit every 10 documents - if documents_indexed % 10 == 0: - logger.info( - f"Committing batch: {documents_indexed} Discord channels processed so far" - ) - await session.commit() - except Exception as e: logger.error( f"Error processing guild {guild_name}: {e!s}", exc_info=True @@ -553,7 +499,7 @@ async def index_discord_messages( # Final commit for any remaining documents not yet committed in batches logger.info( - f"Final commit: Total {documents_indexed} Discord channels processed" + f"Final commit: Total {documents_indexed} Discord messages processed" ) await session.commit() @@ -561,18 +507,18 @@ async def index_discord_messages( result_message = None if skipped_channels: result_message = ( - f"Processed {documents_indexed} channels. Skipped {len(skipped_channels)} channels: " + f"Processed {documents_indexed} messages. Skipped {len(skipped_channels)} channels: " + ", ".join(skipped_channels) ) else: - result_message = f"Processed {documents_indexed} channels." + result_message = f"Processed {documents_indexed} messages." # Log success await task_logger.log_task_success( log_entry, f"Successfully completed Discord indexing for connector {connector_id}", { - "channels_processed": documents_indexed, + "messages_processed": documents_indexed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, "skipped_channels_count": len(skipped_channels), @@ -582,7 +528,7 @@ async def index_discord_messages( ) logger.info( - f"Discord indexing completed: {documents_indexed} new channels, {documents_skipped} skipped" + f"Discord indexing completed: {documents_indexed} new messages, {documents_skipped} skipped" ) return documents_indexed, result_message diff --git a/surfsense_web/app/dashboard/layout.tsx b/surfsense_web/app/dashboard/layout.tsx index 8763a622f..71cd6275f 100644 --- a/surfsense_web/app/dashboard/layout.tsx +++ b/surfsense_web/app/dashboard/layout.tsx @@ -2,7 +2,6 @@ import { Loader2 } from "lucide-react"; import { useEffect, useState } from "react"; -import { AnnouncementBanner } from "@/components/announcement-banner"; import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card"; import { getBearerToken, redirectToLogin } from "@/lib/auth-utils"; @@ -43,7 +42,6 @@ export default function DashboardLayout({ children }: DashboardLayoutProps) { return (
-
{children}
); diff --git a/surfsense_web/app/dashboard/searchspaces/page.tsx b/surfsense_web/app/dashboard/searchspaces/page.tsx index d7d24ae03..b40eb5d82 100644 --- a/surfsense_web/app/dashboard/searchspaces/page.tsx +++ b/surfsense_web/app/dashboard/searchspaces/page.tsx @@ -28,7 +28,7 @@ export default function SearchSpacesPage() { return ( { - setIsDismissed(true); - }; - - if (isDismissed) return null; - - return ( -
-
-
- -

- SurfSense is a work in progress.{" "} - - Report issues on GitHub - - -

- -
-
-
- ); -} diff --git a/surfsense_web/components/search-space-form.tsx b/surfsense_web/components/search-space-form.tsx index a43370303..bcea48e3d 100644 --- a/surfsense_web/components/search-space-form.tsx +++ b/surfsense_web/components/search-space-form.tsx @@ -118,7 +118,7 @@ export function SearchSpaceForm({ >
-

+

{isEditing ? "Edit Search Space" : "Create Search Space"}

@@ -157,7 +157,7 @@ export function SearchSpaceForm({ mass: 0.2, }} /> -
+