refactor: update Discord message indexing logic

- Enhanced the indexing process for Discord messages to treat each message as an individual document, improving metadata handling and content management.
- Removed the announcement banner component and its dismissal-state atom, deleting the now-unused files and simplifying the dashboard layout.
- Updated logging messages for clarity and accuracy regarding processed messages.
This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-01-05 22:18:25 -08:00
parent afe63943f2
commit aac0432023
4 changed files with 129 additions and 237 deletions

View file

@ -11,17 +11,15 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config from app.config import config
from app.connectors.discord_connector import DiscordConnector from app.connectors.discord_connector import DiscordConnector
from app.db import Document, DocumentType, SearchSourceConnectorType from app.db import Document, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import ( from app.utils.document_converters import (
create_document_chunks, create_document_chunks,
generate_content_hash, generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash, generate_unique_identifier_hash,
) )
from .base import ( from .base import (
build_document_metadata_string, build_document_metadata_markdown,
check_document_by_unique_identifier, check_document_by_unique_identifier,
get_connector_by_id, get_connector_by_id,
get_current_timestamp, get_current_timestamp,
@ -336,207 +334,155 @@ async def index_discord_messages(
documents_skipped += 1 documents_skipped += 1
continue continue
# Convert messages to markdown format # Process each message as an individual document (like Slack)
channel_content = (
f"# Discord Channel: {guild_name} / {channel_name}\n\n"
)
for msg in formatted_messages: for msg in formatted_messages:
user_name = msg.get("author_name", "Unknown User") msg_id = msg.get("id", "")
timestamp = msg.get("created_at", "Unknown Time") msg_user_name = msg.get("author_name", "Unknown User")
text = msg.get("content", "") msg_timestamp = msg.get("created_at", "Unknown Time")
channel_content += ( msg_text = msg.get("content", "")
f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n"
# Format document metadata (similar to Slack)
metadata_sections = [
(
"METADATA",
[
f"GUILD_NAME: {guild_name}",
f"GUILD_ID: {guild_id}",
f"CHANNEL_NAME: {channel_name}",
f"CHANNEL_ID: {channel_id}",
f"MESSAGE_TIMESTAMP: {msg_timestamp}",
f"MESSAGE_USER_NAME: {msg_user_name}",
],
),
(
"CONTENT",
[
"FORMAT: markdown",
"TEXT_START",
msg_text,
"TEXT_END",
],
),
]
# Build the document string
combined_document_string = build_document_metadata_markdown(
metadata_sections
) )
# Metadata sections # Generate unique identifier hash for this Discord message
metadata_sections = [ unique_identifier = f"{channel_id}_{msg_id}"
( unique_identifier_hash = generate_unique_identifier_hash(
"METADATA", DocumentType.DISCORD_CONNECTOR,
[ unique_identifier,
f"GUILD_NAME: {guild_name}", search_space_id,
f"GUILD_ID: {guild_id}", )
f"CHANNEL_NAME: {channel_name}",
f"CHANNEL_ID: {channel_id}",
f"MESSAGE_COUNT: {len(formatted_messages)}",
],
),
(
"CONTENT",
[
"FORMAT: markdown",
"TEXT_START",
channel_content,
"TEXT_END",
],
),
]
combined_document_string = build_document_metadata_string( # Generate content hash
metadata_sections content_hash = generate_content_hash(
) combined_document_string, search_space_id
)
# Generate unique identifier hash for this Discord channel # Check if document with this unique identifier already exists
unique_identifier_hash = generate_unique_identifier_hash( existing_document = await check_document_by_unique_identifier(
DocumentType.DISCORD_CONNECTOR, channel_id, search_space_id session, unique_identifier_hash
) )
# Generate content hash if existing_document:
content_hash = generate_content_hash( # Document exists - check if content has changed
combined_document_string, search_space_id if existing_document.content_hash == content_hash:
) logger.info(
f"Document for Discord message {msg_id} in {guild_name}#{channel_name} unchanged. Skipping."
# Check if document with this unique identifier already exists
existing_document = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
logger.info(
f"Document for Discord channel {guild_name}#{channel_name} unchanged. Skipping."
)
documents_skipped += 1
continue
else:
# Content has changed - update the existing document
logger.info(
f"Content changed for Discord channel {guild_name}#{channel_name}. Updating document."
)
# Get user's long context LLM
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if not user_llm:
logger.error(
f"No long context LLM configured for user {user_id}"
)
skipped_channels.append(
f"{guild_name}#{channel_name} (no LLM configured)"
) )
documents_skipped += 1 documents_skipped += 1
continue continue
else:
# Content has changed - update the existing document
logger.info(
f"Content changed for Discord message {msg_id} in {guild_name}#{channel_name}. Updating document."
)
# Generate summary with metadata # Update chunks and embedding
document_metadata = { chunks = await create_document_chunks(
"guild_name": guild_name, combined_document_string
"channel_name": channel_name, )
"message_count": len(formatted_messages), doc_embedding = config.embedding_model_instance.embed(
"document_type": "Discord Channel Messages", combined_document_string
"connector_type": "Discord", )
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
combined_document_string,
user_llm,
document_metadata,
)
# Chunks from channel content # Update existing document
chunks = await create_document_chunks(channel_content) existing_document.content = combined_document_string
existing_document.content_hash = content_hash
existing_document.embedding = doc_embedding
existing_document.document_metadata = {
"guild_name": guild_name,
"guild_id": guild_id,
"channel_name": channel_name,
"channel_id": channel_id,
"message_id": msg_id,
"message_timestamp": msg_timestamp,
"message_user_name": msg_user_name,
"indexed_at": datetime.now(UTC).strftime(
"%Y-%m-%d %H:%M:%S"
),
}
# Update existing document # Delete old chunks and add new ones
existing_document.title = ( existing_document.chunks = chunks
f"Discord - {guild_name}#{channel_name}" existing_document.updated_at = get_current_timestamp()
)
existing_document.content = summary_content documents_indexed += 1
existing_document.content_hash = content_hash logger.info(
existing_document.embedding = summary_embedding f"Successfully updated Discord message {msg_id}"
existing_document.document_metadata = { )
continue
# Document doesn't exist - create new one
# Process chunks
chunks = await create_document_chunks(combined_document_string)
doc_embedding = config.embedding_model_instance.embed(
combined_document_string
)
# Create and store new document
document = Document(
search_space_id=search_space_id,
title=f"Discord - {guild_name}#{channel_name}",
document_type=DocumentType.DISCORD_CONNECTOR,
document_metadata={
"guild_name": guild_name, "guild_name": guild_name,
"guild_id": guild_id, "guild_id": guild_id,
"channel_name": channel_name, "channel_name": channel_name,
"channel_id": channel_id, "channel_id": channel_id,
"message_count": len(formatted_messages), "message_id": msg_id,
"start_date": start_date_iso, "message_timestamp": msg_timestamp,
"end_date": end_date_iso, "message_user_name": msg_user_name,
"indexed_at": datetime.now(UTC).strftime( "indexed_at": datetime.now(UTC).strftime(
"%Y-%m-%d %H:%M:%S" "%Y-%m-%d %H:%M:%S"
), ),
} },
existing_document.chunks = chunks content=combined_document_string,
existing_document.updated_at = get_current_timestamp() embedding=doc_embedding,
chunks=chunks,
content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash,
updated_at=get_current_timestamp(),
)
documents_indexed += 1 session.add(document)
documents_indexed += 1
# Batch commit every 10 documents
if documents_indexed % 10 == 0:
logger.info( logger.info(
f"Successfully updated Discord channel {guild_name}#{channel_name}" f"Committing batch: {documents_indexed} Discord messages processed so far"
) )
continue await session.commit()
# Document doesn't exist - create new one
# Get user's long context LLM
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if not user_llm:
logger.error(
f"No long context LLM configured for user {user_id}"
)
skipped_channels.append(
f"{guild_name}#{channel_name} (no LLM configured)"
)
documents_skipped += 1
continue
# Generate summary with metadata
document_metadata = {
"guild_name": guild_name,
"channel_name": channel_name,
"message_count": len(formatted_messages),
"document_type": "Discord Channel Messages",
"connector_type": "Discord",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
combined_document_string, user_llm, document_metadata
)
# Chunks from channel content
chunks = await create_document_chunks(channel_content)
# Create and store new document
document = Document(
search_space_id=search_space_id,
title=f"Discord - {guild_name}#{channel_name}",
document_type=DocumentType.DISCORD_CONNECTOR,
document_metadata={
"guild_name": guild_name,
"guild_id": guild_id,
"channel_name": channel_name,
"channel_id": channel_id,
"message_count": len(formatted_messages),
"start_date": start_date_iso,
"end_date": end_date_iso,
"indexed_at": datetime.now(UTC).strftime(
"%Y-%m-%d %H:%M:%S"
),
},
content=summary_content,
content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
updated_at=get_current_timestamp(),
)
session.add(document)
documents_indexed += 1
logger.info( logger.info(
f"Successfully indexed new channel {guild_name}#{channel_name} with {len(formatted_messages)} messages" f"Successfully indexed channel {guild_name}#{channel_name} with {len(formatted_messages)} messages"
) )
# Batch commit every 10 documents
if documents_indexed % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed} Discord channels processed so far"
)
await session.commit()
except Exception as e: except Exception as e:
logger.error( logger.error(
f"Error processing guild {guild_name}: {e!s}", exc_info=True f"Error processing guild {guild_name}: {e!s}", exc_info=True
@ -553,7 +499,7 @@ async def index_discord_messages(
# Final commit for any remaining documents not yet committed in batches # Final commit for any remaining documents not yet committed in batches
logger.info( logger.info(
f"Final commit: Total {documents_indexed} Discord channels processed" f"Final commit: Total {documents_indexed} Discord messages processed"
) )
await session.commit() await session.commit()
@ -561,18 +507,18 @@ async def index_discord_messages(
result_message = None result_message = None
if skipped_channels: if skipped_channels:
result_message = ( result_message = (
f"Processed {documents_indexed} channels. Skipped {len(skipped_channels)} channels: " f"Processed {documents_indexed} messages. Skipped {len(skipped_channels)} channels: "
+ ", ".join(skipped_channels) + ", ".join(skipped_channels)
) )
else: else:
result_message = f"Processed {documents_indexed} channels." result_message = f"Processed {documents_indexed} messages."
# Log success # Log success
await task_logger.log_task_success( await task_logger.log_task_success(
log_entry, log_entry,
f"Successfully completed Discord indexing for connector {connector_id}", f"Successfully completed Discord indexing for connector {connector_id}",
{ {
"channels_processed": documents_indexed, "messages_processed": documents_indexed,
"documents_indexed": documents_indexed, "documents_indexed": documents_indexed,
"documents_skipped": documents_skipped, "documents_skipped": documents_skipped,
"skipped_channels_count": len(skipped_channels), "skipped_channels_count": len(skipped_channels),
@ -582,7 +528,7 @@ async def index_discord_messages(
) )
logger.info( logger.info(
f"Discord indexing completed: {documents_indexed} new channels, {documents_skipped} skipped" f"Discord indexing completed: {documents_indexed} new messages, {documents_skipped} skipped"
) )
return documents_indexed, result_message return documents_indexed, result_message

View file

@ -2,7 +2,6 @@
import { Loader2 } from "lucide-react"; import { Loader2 } from "lucide-react";
import { useEffect, useState } from "react"; import { useEffect, useState } from "react";
import { AnnouncementBanner } from "@/components/announcement-banner";
import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card"; import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card";
import { getBearerToken, redirectToLogin } from "@/lib/auth-utils"; import { getBearerToken, redirectToLogin } from "@/lib/auth-utils";
@ -43,7 +42,6 @@ export default function DashboardLayout({ children }: DashboardLayoutProps) {
return ( return (
<div className="h-full flex flex-col "> <div className="h-full flex flex-col ">
<AnnouncementBanner />
<div className="flex-1 min-h-0">{children}</div> <div className="flex-1 min-h-0">{children}</div>
</div> </div>
); );

View file

@ -1,5 +0,0 @@
import { atomWithStorage } from "jotai/utils";
// Storage key under which the banner's dismissed flag is persisted.
const ANNOUNCEMENT_DISMISSED_STORAGE_KEY = "surfsense_announcement_dismissed";

// Tracks whether the announcement banner has been dismissed.
// Starts as `false`; atomWithStorage persists the value to localStorage automatically.
export const announcementDismissedAtom = atomWithStorage(
	ANNOUNCEMENT_DISMISSED_STORAGE_KEY,
	false
);

View file

@ -1,47 +0,0 @@
"use client";
import { useAtom } from "jotai";
import { ExternalLink, Info, X } from "lucide-react";
import { announcementDismissedAtom } from "@/atoms/announcement.atom";
import { Button } from "@/components/ui/button";
/**
 * Dismissible banner telling users that SurfSense is a work in progress and
 * linking to the GitHub issue tracker.
 *
 * The dismissed flag is read from and written to `announcementDismissedAtom`;
 * once it is true the component renders nothing.
 */
export function AnnouncementBanner() {
	const [dismissed, setDismissed] = useAtom(announcementDismissedAtom);

	// Nothing to render once the banner has been dismissed.
	if (dismissed) {
		return null;
	}

	return (
		<div className="relative h-[3rem] flex items-center justify-center border bg-gradient-to-r from-blue-600 to-blue-500 dark:from-blue-700 dark:to-blue-600 border-b border-blue-700 dark:border-blue-800">
			<div className="container mx-auto px-4">
				<div className="flex items-center justify-center gap-3 py-2.5">
					<Info className="h-4 w-4 text-blue-50 flex-shrink-0" />
					<p className="text-sm text-blue-50 text-center font-medium">
						SurfSense is a work in progress.{" "}
						<a
							href="https://github.com/MODSetter/SurfSense/issues"
							target="_blank"
							rel="noopener noreferrer"
							className="inline-flex items-center gap-1 underline decoration-blue-200 underline-offset-2 hover:decoration-white transition-colors"
						>
							Report issues on GitHub
							<ExternalLink className="h-3 w-3" />
						</a>
					</p>
					{/* Icon-only dismiss control; the sr-only label names it for screen readers. */}
					<Button
						variant="ghost"
						size="sm"
						className="h-7 w-7 p-0 shrink-0 text-blue-100 hover:text-white hover:bg-blue-700/50 dark:hover:bg-blue-800/50 absolute right-4"
						onClick={() => {
							setDismissed(true);
						}}
					>
						<X className="h-3.5 w-3.5" />
						<span className="sr-only">Dismiss</span>
					</Button>
				</div>
			</div>
		</div>
	);
}