mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-03 04:42:39 +02:00
refactor: update Discord message indexing logic
- Enhanced the indexing process for Discord messages to treat each message as an individual document, improving metadata handling and content management. - Replaced the announcement banner component and related state management with a more streamlined approach, removing unnecessary files and simplifying the dashboard layout. - Updated logging messages for clarity and accuracy regarding processed messages.
This commit is contained in:
parent
afe63943f2
commit
aac0432023
4 changed files with 129 additions and 237 deletions
|
|
@ -11,17 +11,15 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
from app.config import config
|
from app.config import config
|
||||||
from app.connectors.discord_connector import DiscordConnector
|
from app.connectors.discord_connector import DiscordConnector
|
||||||
from app.db import Document, DocumentType, SearchSourceConnectorType
|
from app.db import Document, DocumentType, SearchSourceConnectorType
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
from app.utils.document_converters import (
|
from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
|
||||||
generate_unique_identifier_hash,
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
build_document_metadata_string,
|
build_document_metadata_markdown,
|
||||||
check_document_by_unique_identifier,
|
check_document_by_unique_identifier,
|
||||||
get_connector_by_id,
|
get_connector_by_id,
|
||||||
get_current_timestamp,
|
get_current_timestamp,
|
||||||
|
|
@ -336,207 +334,155 @@ async def index_discord_messages(
|
||||||
documents_skipped += 1
|
documents_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Convert messages to markdown format
|
# Process each message as an individual document (like Slack)
|
||||||
channel_content = (
|
|
||||||
f"# Discord Channel: {guild_name} / {channel_name}\n\n"
|
|
||||||
)
|
|
||||||
for msg in formatted_messages:
|
for msg in formatted_messages:
|
||||||
user_name = msg.get("author_name", "Unknown User")
|
msg_id = msg.get("id", "")
|
||||||
timestamp = msg.get("created_at", "Unknown Time")
|
msg_user_name = msg.get("author_name", "Unknown User")
|
||||||
text = msg.get("content", "")
|
msg_timestamp = msg.get("created_at", "Unknown Time")
|
||||||
channel_content += (
|
msg_text = msg.get("content", "")
|
||||||
f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n"
|
|
||||||
|
# Format document metadata (similar to Slack)
|
||||||
|
metadata_sections = [
|
||||||
|
(
|
||||||
|
"METADATA",
|
||||||
|
[
|
||||||
|
f"GUILD_NAME: {guild_name}",
|
||||||
|
f"GUILD_ID: {guild_id}",
|
||||||
|
f"CHANNEL_NAME: {channel_name}",
|
||||||
|
f"CHANNEL_ID: {channel_id}",
|
||||||
|
f"MESSAGE_TIMESTAMP: {msg_timestamp}",
|
||||||
|
f"MESSAGE_USER_NAME: {msg_user_name}",
|
||||||
|
],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"CONTENT",
|
||||||
|
[
|
||||||
|
"FORMAT: markdown",
|
||||||
|
"TEXT_START",
|
||||||
|
msg_text,
|
||||||
|
"TEXT_END",
|
||||||
|
],
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
# Build the document string
|
||||||
|
combined_document_string = build_document_metadata_markdown(
|
||||||
|
metadata_sections
|
||||||
)
|
)
|
||||||
|
|
||||||
# Metadata sections
|
# Generate unique identifier hash for this Discord message
|
||||||
metadata_sections = [
|
unique_identifier = f"{channel_id}_{msg_id}"
|
||||||
(
|
unique_identifier_hash = generate_unique_identifier_hash(
|
||||||
"METADATA",
|
DocumentType.DISCORD_CONNECTOR,
|
||||||
[
|
unique_identifier,
|
||||||
f"GUILD_NAME: {guild_name}",
|
search_space_id,
|
||||||
f"GUILD_ID: {guild_id}",
|
)
|
||||||
f"CHANNEL_NAME: {channel_name}",
|
|
||||||
f"CHANNEL_ID: {channel_id}",
|
|
||||||
f"MESSAGE_COUNT: {len(formatted_messages)}",
|
|
||||||
],
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"CONTENT",
|
|
||||||
[
|
|
||||||
"FORMAT: markdown",
|
|
||||||
"TEXT_START",
|
|
||||||
channel_content,
|
|
||||||
"TEXT_END",
|
|
||||||
],
|
|
||||||
),
|
|
||||||
]
|
|
||||||
|
|
||||||
combined_document_string = build_document_metadata_string(
|
# Generate content hash
|
||||||
metadata_sections
|
content_hash = generate_content_hash(
|
||||||
)
|
combined_document_string, search_space_id
|
||||||
|
)
|
||||||
|
|
||||||
# Generate unique identifier hash for this Discord channel
|
# Check if document with this unique identifier already exists
|
||||||
unique_identifier_hash = generate_unique_identifier_hash(
|
existing_document = await check_document_by_unique_identifier(
|
||||||
DocumentType.DISCORD_CONNECTOR, channel_id, search_space_id
|
session, unique_identifier_hash
|
||||||
)
|
)
|
||||||
|
|
||||||
# Generate content hash
|
if existing_document:
|
||||||
content_hash = generate_content_hash(
|
# Document exists - check if content has changed
|
||||||
combined_document_string, search_space_id
|
if existing_document.content_hash == content_hash:
|
||||||
)
|
logger.info(
|
||||||
|
f"Document for Discord message {msg_id} in {guild_name}#{channel_name} unchanged. Skipping."
|
||||||
# Check if document with this unique identifier already exists
|
|
||||||
existing_document = await check_document_by_unique_identifier(
|
|
||||||
session, unique_identifier_hash
|
|
||||||
)
|
|
||||||
|
|
||||||
if existing_document:
|
|
||||||
# Document exists - check if content has changed
|
|
||||||
if existing_document.content_hash == content_hash:
|
|
||||||
logger.info(
|
|
||||||
f"Document for Discord channel {guild_name}#{channel_name} unchanged. Skipping."
|
|
||||||
)
|
|
||||||
documents_skipped += 1
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
# Content has changed - update the existing document
|
|
||||||
logger.info(
|
|
||||||
f"Content changed for Discord channel {guild_name}#{channel_name}. Updating document."
|
|
||||||
)
|
|
||||||
|
|
||||||
# Get user's long context LLM
|
|
||||||
user_llm = await get_user_long_context_llm(
|
|
||||||
session, user_id, search_space_id
|
|
||||||
)
|
|
||||||
if not user_llm:
|
|
||||||
logger.error(
|
|
||||||
f"No long context LLM configured for user {user_id}"
|
|
||||||
)
|
|
||||||
skipped_channels.append(
|
|
||||||
f"{guild_name}#{channel_name} (no LLM configured)"
|
|
||||||
)
|
)
|
||||||
documents_skipped += 1
|
documents_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
else:
|
||||||
|
# Content has changed - update the existing document
|
||||||
|
logger.info(
|
||||||
|
f"Content changed for Discord message {msg_id} in {guild_name}#{channel_name}. Updating document."
|
||||||
|
)
|
||||||
|
|
||||||
# Generate summary with metadata
|
# Update chunks and embedding
|
||||||
document_metadata = {
|
chunks = await create_document_chunks(
|
||||||
"guild_name": guild_name,
|
combined_document_string
|
||||||
"channel_name": channel_name,
|
)
|
||||||
"message_count": len(formatted_messages),
|
doc_embedding = config.embedding_model_instance.embed(
|
||||||
"document_type": "Discord Channel Messages",
|
combined_document_string
|
||||||
"connector_type": "Discord",
|
)
|
||||||
}
|
|
||||||
(
|
|
||||||
summary_content,
|
|
||||||
summary_embedding,
|
|
||||||
) = await generate_document_summary(
|
|
||||||
combined_document_string,
|
|
||||||
user_llm,
|
|
||||||
document_metadata,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Chunks from channel content
|
# Update existing document
|
||||||
chunks = await create_document_chunks(channel_content)
|
existing_document.content = combined_document_string
|
||||||
|
existing_document.content_hash = content_hash
|
||||||
|
existing_document.embedding = doc_embedding
|
||||||
|
existing_document.document_metadata = {
|
||||||
|
"guild_name": guild_name,
|
||||||
|
"guild_id": guild_id,
|
||||||
|
"channel_name": channel_name,
|
||||||
|
"channel_id": channel_id,
|
||||||
|
"message_id": msg_id,
|
||||||
|
"message_timestamp": msg_timestamp,
|
||||||
|
"message_user_name": msg_user_name,
|
||||||
|
"indexed_at": datetime.now(UTC).strftime(
|
||||||
|
"%Y-%m-%d %H:%M:%S"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
# Update existing document
|
# Delete old chunks and add new ones
|
||||||
existing_document.title = (
|
existing_document.chunks = chunks
|
||||||
f"Discord - {guild_name}#{channel_name}"
|
existing_document.updated_at = get_current_timestamp()
|
||||||
)
|
|
||||||
existing_document.content = summary_content
|
documents_indexed += 1
|
||||||
existing_document.content_hash = content_hash
|
logger.info(
|
||||||
existing_document.embedding = summary_embedding
|
f"Successfully updated Discord message {msg_id}"
|
||||||
existing_document.document_metadata = {
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Document doesn't exist - create new one
|
||||||
|
# Process chunks
|
||||||
|
chunks = await create_document_chunks(combined_document_string)
|
||||||
|
doc_embedding = config.embedding_model_instance.embed(
|
||||||
|
combined_document_string
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create and store new document
|
||||||
|
document = Document(
|
||||||
|
search_space_id=search_space_id,
|
||||||
|
title=f"Discord - {guild_name}#{channel_name}",
|
||||||
|
document_type=DocumentType.DISCORD_CONNECTOR,
|
||||||
|
document_metadata={
|
||||||
"guild_name": guild_name,
|
"guild_name": guild_name,
|
||||||
"guild_id": guild_id,
|
"guild_id": guild_id,
|
||||||
"channel_name": channel_name,
|
"channel_name": channel_name,
|
||||||
"channel_id": channel_id,
|
"channel_id": channel_id,
|
||||||
"message_count": len(formatted_messages),
|
"message_id": msg_id,
|
||||||
"start_date": start_date_iso,
|
"message_timestamp": msg_timestamp,
|
||||||
"end_date": end_date_iso,
|
"message_user_name": msg_user_name,
|
||||||
"indexed_at": datetime.now(UTC).strftime(
|
"indexed_at": datetime.now(UTC).strftime(
|
||||||
"%Y-%m-%d %H:%M:%S"
|
"%Y-%m-%d %H:%M:%S"
|
||||||
),
|
),
|
||||||
}
|
},
|
||||||
existing_document.chunks = chunks
|
content=combined_document_string,
|
||||||
existing_document.updated_at = get_current_timestamp()
|
embedding=doc_embedding,
|
||||||
|
chunks=chunks,
|
||||||
|
content_hash=content_hash,
|
||||||
|
unique_identifier_hash=unique_identifier_hash,
|
||||||
|
updated_at=get_current_timestamp(),
|
||||||
|
)
|
||||||
|
|
||||||
documents_indexed += 1
|
session.add(document)
|
||||||
|
documents_indexed += 1
|
||||||
|
|
||||||
|
# Batch commit every 10 documents
|
||||||
|
if documents_indexed % 10 == 0:
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Successfully updated Discord channel {guild_name}#{channel_name}"
|
f"Committing batch: {documents_indexed} Discord messages processed so far"
|
||||||
)
|
)
|
||||||
continue
|
await session.commit()
|
||||||
|
|
||||||
# Document doesn't exist - create new one
|
|
||||||
# Get user's long context LLM
|
|
||||||
user_llm = await get_user_long_context_llm(
|
|
||||||
session, user_id, search_space_id
|
|
||||||
)
|
|
||||||
if not user_llm:
|
|
||||||
logger.error(
|
|
||||||
f"No long context LLM configured for user {user_id}"
|
|
||||||
)
|
|
||||||
skipped_channels.append(
|
|
||||||
f"{guild_name}#{channel_name} (no LLM configured)"
|
|
||||||
)
|
|
||||||
documents_skipped += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Generate summary with metadata
|
|
||||||
document_metadata = {
|
|
||||||
"guild_name": guild_name,
|
|
||||||
"channel_name": channel_name,
|
|
||||||
"message_count": len(formatted_messages),
|
|
||||||
"document_type": "Discord Channel Messages",
|
|
||||||
"connector_type": "Discord",
|
|
||||||
}
|
|
||||||
(
|
|
||||||
summary_content,
|
|
||||||
summary_embedding,
|
|
||||||
) = await generate_document_summary(
|
|
||||||
combined_document_string, user_llm, document_metadata
|
|
||||||
)
|
|
||||||
|
|
||||||
# Chunks from channel content
|
|
||||||
chunks = await create_document_chunks(channel_content)
|
|
||||||
|
|
||||||
# Create and store new document
|
|
||||||
document = Document(
|
|
||||||
search_space_id=search_space_id,
|
|
||||||
title=f"Discord - {guild_name}#{channel_name}",
|
|
||||||
document_type=DocumentType.DISCORD_CONNECTOR,
|
|
||||||
document_metadata={
|
|
||||||
"guild_name": guild_name,
|
|
||||||
"guild_id": guild_id,
|
|
||||||
"channel_name": channel_name,
|
|
||||||
"channel_id": channel_id,
|
|
||||||
"message_count": len(formatted_messages),
|
|
||||||
"start_date": start_date_iso,
|
|
||||||
"end_date": end_date_iso,
|
|
||||||
"indexed_at": datetime.now(UTC).strftime(
|
|
||||||
"%Y-%m-%d %H:%M:%S"
|
|
||||||
),
|
|
||||||
},
|
|
||||||
content=summary_content,
|
|
||||||
content_hash=content_hash,
|
|
||||||
unique_identifier_hash=unique_identifier_hash,
|
|
||||||
embedding=summary_embedding,
|
|
||||||
chunks=chunks,
|
|
||||||
updated_at=get_current_timestamp(),
|
|
||||||
)
|
|
||||||
|
|
||||||
session.add(document)
|
|
||||||
documents_indexed += 1
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Successfully indexed new channel {guild_name}#{channel_name} with {len(formatted_messages)} messages"
|
f"Successfully indexed channel {guild_name}#{channel_name} with {len(formatted_messages)} messages"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Batch commit every 10 documents
|
|
||||||
if documents_indexed % 10 == 0:
|
|
||||||
logger.info(
|
|
||||||
f"Committing batch: {documents_indexed} Discord channels processed so far"
|
|
||||||
)
|
|
||||||
await session.commit()
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(
|
logger.error(
|
||||||
f"Error processing guild {guild_name}: {e!s}", exc_info=True
|
f"Error processing guild {guild_name}: {e!s}", exc_info=True
|
||||||
|
|
@ -553,7 +499,7 @@ async def index_discord_messages(
|
||||||
|
|
||||||
# Final commit for any remaining documents not yet committed in batches
|
# Final commit for any remaining documents not yet committed in batches
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Final commit: Total {documents_indexed} Discord channels processed"
|
f"Final commit: Total {documents_indexed} Discord messages processed"
|
||||||
)
|
)
|
||||||
await session.commit()
|
await session.commit()
|
||||||
|
|
||||||
|
|
@ -561,18 +507,18 @@ async def index_discord_messages(
|
||||||
result_message = None
|
result_message = None
|
||||||
if skipped_channels:
|
if skipped_channels:
|
||||||
result_message = (
|
result_message = (
|
||||||
f"Processed {documents_indexed} channels. Skipped {len(skipped_channels)} channels: "
|
f"Processed {documents_indexed} messages. Skipped {len(skipped_channels)} channels: "
|
||||||
+ ", ".join(skipped_channels)
|
+ ", ".join(skipped_channels)
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
result_message = f"Processed {documents_indexed} channels."
|
result_message = f"Processed {documents_indexed} messages."
|
||||||
|
|
||||||
# Log success
|
# Log success
|
||||||
await task_logger.log_task_success(
|
await task_logger.log_task_success(
|
||||||
log_entry,
|
log_entry,
|
||||||
f"Successfully completed Discord indexing for connector {connector_id}",
|
f"Successfully completed Discord indexing for connector {connector_id}",
|
||||||
{
|
{
|
||||||
"channels_processed": documents_indexed,
|
"messages_processed": documents_indexed,
|
||||||
"documents_indexed": documents_indexed,
|
"documents_indexed": documents_indexed,
|
||||||
"documents_skipped": documents_skipped,
|
"documents_skipped": documents_skipped,
|
||||||
"skipped_channels_count": len(skipped_channels),
|
"skipped_channels_count": len(skipped_channels),
|
||||||
|
|
@ -582,7 +528,7 @@ async def index_discord_messages(
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Discord indexing completed: {documents_indexed} new channels, {documents_skipped} skipped"
|
f"Discord indexing completed: {documents_indexed} new messages, {documents_skipped} skipped"
|
||||||
)
|
)
|
||||||
return documents_indexed, result_message
|
return documents_indexed, result_message
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,6 @@
|
||||||
|
|
||||||
import { Loader2 } from "lucide-react";
|
import { Loader2 } from "lucide-react";
|
||||||
import { useEffect, useState } from "react";
|
import { useEffect, useState } from "react";
|
||||||
import { AnnouncementBanner } from "@/components/announcement-banner";
|
|
||||||
import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card";
|
import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card";
|
||||||
import { getBearerToken, redirectToLogin } from "@/lib/auth-utils";
|
import { getBearerToken, redirectToLogin } from "@/lib/auth-utils";
|
||||||
|
|
||||||
|
|
@ -43,7 +42,6 @@ export default function DashboardLayout({ children }: DashboardLayoutProps) {
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div className="h-full flex flex-col ">
|
<div className="h-full flex flex-col ">
|
||||||
<AnnouncementBanner />
|
|
||||||
<div className="flex-1 min-h-0">{children}</div>
|
<div className="flex-1 min-h-0">{children}</div>
|
||||||
</div>
|
</div>
|
||||||
);
|
);
|
||||||
|
|
|
||||||
|
|
@ -1,5 +0,0 @@
|
||||||
import { atomWithStorage } from "jotai/utils";
|
|
||||||
|
|
||||||
// Atom to track whether the announcement banner has been dismissed
|
|
||||||
// Persists to localStorage automatically
|
|
||||||
export const announcementDismissedAtom = atomWithStorage("surfsense_announcement_dismissed", false);
|
|
||||||
|
|
@ -1,47 +0,0 @@
|
||||||
"use client";
|
|
||||||
|
|
||||||
import { useAtom } from "jotai";
|
|
||||||
import { ExternalLink, Info, X } from "lucide-react";
|
|
||||||
import { announcementDismissedAtom } from "@/atoms/announcement.atom";
|
|
||||||
import { Button } from "@/components/ui/button";
|
|
||||||
|
|
||||||
export function AnnouncementBanner() {
|
|
||||||
const [isDismissed, setIsDismissed] = useAtom(announcementDismissedAtom);
|
|
||||||
|
|
||||||
const handleDismiss = () => {
|
|
||||||
setIsDismissed(true);
|
|
||||||
};
|
|
||||||
|
|
||||||
if (isDismissed) return null;
|
|
||||||
|
|
||||||
return (
|
|
||||||
<div className="relative h-[3rem] flex items-center justify-center border bg-gradient-to-r from-blue-600 to-blue-500 dark:from-blue-700 dark:to-blue-600 border-b border-blue-700 dark:border-blue-800">
|
|
||||||
<div className="container mx-auto px-4">
|
|
||||||
<div className="flex items-center justify-center gap-3 py-2.5">
|
|
||||||
<Info className="h-4 w-4 text-blue-50 flex-shrink-0" />
|
|
||||||
<p className="text-sm text-blue-50 text-center font-medium">
|
|
||||||
SurfSense is a work in progress.{" "}
|
|
||||||
<a
|
|
||||||
href="https://github.com/MODSetter/SurfSense/issues"
|
|
||||||
target="_blank"
|
|
||||||
rel="noopener noreferrer"
|
|
||||||
className="inline-flex items-center gap-1 underline decoration-blue-200 underline-offset-2 hover:decoration-white transition-colors"
|
|
||||||
>
|
|
||||||
Report issues on GitHub
|
|
||||||
<ExternalLink className="h-3 w-3" />
|
|
||||||
</a>
|
|
||||||
</p>
|
|
||||||
<Button
|
|
||||||
variant="ghost"
|
|
||||||
size="sm"
|
|
||||||
className="h-7 w-7 p-0 shrink-0 text-blue-100 hover:text-white hover:bg-blue-700/50 dark:hover:bg-blue-800/50 absolute right-4"
|
|
||||||
onClick={handleDismiss}
|
|
||||||
>
|
|
||||||
<X className="h-3.5 w-3.5" />
|
|
||||||
<span className="sr-only">Dismiss</span>
|
|
||||||
</Button>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
);
|
|
||||||
}
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue