refactor: update Discord message indexing logic

- Enhanced the indexing process for Discord messages to treat each message as an individual document, improving metadata handling and content management.
- Removed the announcement banner component and its related state atom, deleting the now-unused files and simplifying the dashboard layout.
- Updated logging messages for clarity and accuracy regarding processed messages.
This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-01-05 22:18:25 -08:00
parent afe63943f2
commit aac0432023
4 changed files with 129 additions and 237 deletions

View file

@ -11,17 +11,15 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config from app.config import config
from app.connectors.discord_connector import DiscordConnector from app.connectors.discord_connector import DiscordConnector
from app.db import Document, DocumentType, SearchSourceConnectorType from app.db import Document, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import ( from app.utils.document_converters import (
create_document_chunks, create_document_chunks,
generate_content_hash, generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash, generate_unique_identifier_hash,
) )
from .base import ( from .base import (
build_document_metadata_string, build_document_metadata_markdown,
check_document_by_unique_identifier, check_document_by_unique_identifier,
get_connector_by_id, get_connector_by_id,
get_current_timestamp, get_current_timestamp,
@ -336,19 +334,14 @@ async def index_discord_messages(
documents_skipped += 1 documents_skipped += 1
continue continue
# Convert messages to markdown format # Process each message as an individual document (like Slack)
channel_content = (
f"# Discord Channel: {guild_name} / {channel_name}\n\n"
)
for msg in formatted_messages: for msg in formatted_messages:
user_name = msg.get("author_name", "Unknown User") msg_id = msg.get("id", "")
timestamp = msg.get("created_at", "Unknown Time") msg_user_name = msg.get("author_name", "Unknown User")
text = msg.get("content", "") msg_timestamp = msg.get("created_at", "Unknown Time")
channel_content += ( msg_text = msg.get("content", "")
f"## {user_name} ({timestamp})\n\n{text}\n\n---\n\n"
)
# Metadata sections # Format document metadata (similar to Slack)
metadata_sections = [ metadata_sections = [
( (
"METADATA", "METADATA",
@ -357,7 +350,8 @@ async def index_discord_messages(
f"GUILD_ID: {guild_id}", f"GUILD_ID: {guild_id}",
f"CHANNEL_NAME: {channel_name}", f"CHANNEL_NAME: {channel_name}",
f"CHANNEL_ID: {channel_id}", f"CHANNEL_ID: {channel_id}",
f"MESSAGE_COUNT: {len(formatted_messages)}", f"MESSAGE_TIMESTAMP: {msg_timestamp}",
f"MESSAGE_USER_NAME: {msg_user_name}",
], ],
), ),
( (
@ -365,19 +359,23 @@ async def index_discord_messages(
[ [
"FORMAT: markdown", "FORMAT: markdown",
"TEXT_START", "TEXT_START",
channel_content, msg_text,
"TEXT_END", "TEXT_END",
], ],
), ),
] ]
combined_document_string = build_document_metadata_string( # Build the document string
combined_document_string = build_document_metadata_markdown(
metadata_sections metadata_sections
) )
# Generate unique identifier hash for this Discord channel # Generate unique identifier hash for this Discord message
unique_identifier = f"{channel_id}_{msg_id}"
unique_identifier_hash = generate_unique_identifier_hash( unique_identifier_hash = generate_unique_identifier_hash(
DocumentType.DISCORD_CONNECTOR, channel_id, search_space_id DocumentType.DISCORD_CONNECTOR,
unique_identifier,
search_space_id,
) )
# Generate content hash # Generate content hash
@ -394,110 +392,57 @@ async def index_discord_messages(
# Document exists - check if content has changed # Document exists - check if content has changed
if existing_document.content_hash == content_hash: if existing_document.content_hash == content_hash:
logger.info( logger.info(
f"Document for Discord channel {guild_name}#{channel_name} unchanged. Skipping." f"Document for Discord message {msg_id} in {guild_name}#{channel_name} unchanged. Skipping."
) )
documents_skipped += 1 documents_skipped += 1
continue continue
else: else:
# Content has changed - update the existing document # Content has changed - update the existing document
logger.info( logger.info(
f"Content changed for Discord channel {guild_name}#{channel_name}. Updating document." f"Content changed for Discord message {msg_id} in {guild_name}#{channel_name}. Updating document."
) )
# Get user's long context LLM # Update chunks and embedding
user_llm = await get_user_long_context_llm( chunks = await create_document_chunks(
session, user_id, search_space_id combined_document_string
) )
if not user_llm: doc_embedding = config.embedding_model_instance.embed(
logger.error( combined_document_string
f"No long context LLM configured for user {user_id}"
) )
skipped_channels.append(
f"{guild_name}#{channel_name} (no LLM configured)"
)
documents_skipped += 1
continue
# Generate summary with metadata
document_metadata = {
"guild_name": guild_name,
"channel_name": channel_name,
"message_count": len(formatted_messages),
"document_type": "Discord Channel Messages",
"connector_type": "Discord",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
combined_document_string,
user_llm,
document_metadata,
)
# Chunks from channel content
chunks = await create_document_chunks(channel_content)
# Update existing document # Update existing document
existing_document.title = ( existing_document.content = combined_document_string
f"Discord - {guild_name}#{channel_name}"
)
existing_document.content = summary_content
existing_document.content_hash = content_hash existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding existing_document.embedding = doc_embedding
existing_document.document_metadata = { existing_document.document_metadata = {
"guild_name": guild_name, "guild_name": guild_name,
"guild_id": guild_id, "guild_id": guild_id,
"channel_name": channel_name, "channel_name": channel_name,
"channel_id": channel_id, "channel_id": channel_id,
"message_count": len(formatted_messages), "message_id": msg_id,
"start_date": start_date_iso, "message_timestamp": msg_timestamp,
"end_date": end_date_iso, "message_user_name": msg_user_name,
"indexed_at": datetime.now(UTC).strftime( "indexed_at": datetime.now(UTC).strftime(
"%Y-%m-%d %H:%M:%S" "%Y-%m-%d %H:%M:%S"
), ),
} }
# Delete old chunks and add new ones
existing_document.chunks = chunks existing_document.chunks = chunks
existing_document.updated_at = get_current_timestamp() existing_document.updated_at = get_current_timestamp()
documents_indexed += 1 documents_indexed += 1
logger.info( logger.info(
f"Successfully updated Discord channel {guild_name}#{channel_name}" f"Successfully updated Discord message {msg_id}"
) )
continue continue
# Document doesn't exist - create new one # Document doesn't exist - create new one
# Get user's long context LLM # Process chunks
user_llm = await get_user_long_context_llm( chunks = await create_document_chunks(combined_document_string)
session, user_id, search_space_id doc_embedding = config.embedding_model_instance.embed(
combined_document_string
) )
if not user_llm:
logger.error(
f"No long context LLM configured for user {user_id}"
)
skipped_channels.append(
f"{guild_name}#{channel_name} (no LLM configured)"
)
documents_skipped += 1
continue
# Generate summary with metadata
document_metadata = {
"guild_name": guild_name,
"channel_name": channel_name,
"message_count": len(formatted_messages),
"document_type": "Discord Channel Messages",
"connector_type": "Discord",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
combined_document_string, user_llm, document_metadata
)
# Chunks from channel content
chunks = await create_document_chunks(channel_content)
# Create and store new document # Create and store new document
document = Document( document = Document(
@ -509,34 +454,35 @@ async def index_discord_messages(
"guild_id": guild_id, "guild_id": guild_id,
"channel_name": channel_name, "channel_name": channel_name,
"channel_id": channel_id, "channel_id": channel_id,
"message_count": len(formatted_messages), "message_id": msg_id,
"start_date": start_date_iso, "message_timestamp": msg_timestamp,
"end_date": end_date_iso, "message_user_name": msg_user_name,
"indexed_at": datetime.now(UTC).strftime( "indexed_at": datetime.now(UTC).strftime(
"%Y-%m-%d %H:%M:%S" "%Y-%m-%d %H:%M:%S"
), ),
}, },
content=summary_content, content=combined_document_string,
embedding=doc_embedding,
chunks=chunks,
content_hash=content_hash, content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash, unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
updated_at=get_current_timestamp(), updated_at=get_current_timestamp(),
) )
session.add(document) session.add(document)
documents_indexed += 1 documents_indexed += 1
logger.info(
f"Successfully indexed new channel {guild_name}#{channel_name} with {len(formatted_messages)} messages"
)
# Batch commit every 10 documents # Batch commit every 10 documents
if documents_indexed % 10 == 0: if documents_indexed % 10 == 0:
logger.info( logger.info(
f"Committing batch: {documents_indexed} Discord channels processed so far" f"Committing batch: {documents_indexed} Discord messages processed so far"
) )
await session.commit() await session.commit()
logger.info(
f"Successfully indexed channel {guild_name}#{channel_name} with {len(formatted_messages)} messages"
)
except Exception as e: except Exception as e:
logger.error( logger.error(
f"Error processing guild {guild_name}: {e!s}", exc_info=True f"Error processing guild {guild_name}: {e!s}", exc_info=True
@ -553,7 +499,7 @@ async def index_discord_messages(
# Final commit for any remaining documents not yet committed in batches # Final commit for any remaining documents not yet committed in batches
logger.info( logger.info(
f"Final commit: Total {documents_indexed} Discord channels processed" f"Final commit: Total {documents_indexed} Discord messages processed"
) )
await session.commit() await session.commit()
@ -561,18 +507,18 @@ async def index_discord_messages(
result_message = None result_message = None
if skipped_channels: if skipped_channels:
result_message = ( result_message = (
f"Processed {documents_indexed} channels. Skipped {len(skipped_channels)} channels: " f"Processed {documents_indexed} messages. Skipped {len(skipped_channels)} channels: "
+ ", ".join(skipped_channels) + ", ".join(skipped_channels)
) )
else: else:
result_message = f"Processed {documents_indexed} channels." result_message = f"Processed {documents_indexed} messages."
# Log success # Log success
await task_logger.log_task_success( await task_logger.log_task_success(
log_entry, log_entry,
f"Successfully completed Discord indexing for connector {connector_id}", f"Successfully completed Discord indexing for connector {connector_id}",
{ {
"channels_processed": documents_indexed, "messages_processed": documents_indexed,
"documents_indexed": documents_indexed, "documents_indexed": documents_indexed,
"documents_skipped": documents_skipped, "documents_skipped": documents_skipped,
"skipped_channels_count": len(skipped_channels), "skipped_channels_count": len(skipped_channels),
@ -582,7 +528,7 @@ async def index_discord_messages(
) )
logger.info( logger.info(
f"Discord indexing completed: {documents_indexed} new channels, {documents_skipped} skipped" f"Discord indexing completed: {documents_indexed} new messages, {documents_skipped} skipped"
) )
return documents_indexed, result_message return documents_indexed, result_message

View file

@ -2,7 +2,6 @@
import { Loader2 } from "lucide-react"; import { Loader2 } from "lucide-react";
import { useEffect, useState } from "react"; import { useEffect, useState } from "react";
import { AnnouncementBanner } from "@/components/announcement-banner";
import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card"; import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card";
import { getBearerToken, redirectToLogin } from "@/lib/auth-utils"; import { getBearerToken, redirectToLogin } from "@/lib/auth-utils";
@ -43,7 +42,6 @@ export default function DashboardLayout({ children }: DashboardLayoutProps) {
return ( return (
<div className="h-full flex flex-col "> <div className="h-full flex flex-col ">
<AnnouncementBanner />
<div className="flex-1 min-h-0">{children}</div> <div className="flex-1 min-h-0">{children}</div>
</div> </div>
); );

View file

@ -1,5 +0,0 @@
import { atomWithStorage } from "jotai/utils";
/**
 * Atom tracking whether the announcement banner has been dismissed.
 *
 * Built with `atomWithStorage` under the key "surfsense_announcement_dismissed"
 * and an initial value of `false` (banner not dismissed). The stored value
 * persists to localStorage automatically.
 */
export const announcementDismissedAtom = atomWithStorage("surfsense_announcement_dismissed", false);

View file

@ -1,47 +0,0 @@
"use client";
import { useAtom } from "jotai";
import { ExternalLink, Info, X } from "lucide-react";
import { announcementDismissedAtom } from "@/atoms/announcement.atom";
import { Button } from "@/components/ui/button";
/**
 * Dismissible "work in progress" banner.
 *
 * Reads and writes the dismissed flag through `announcementDismissedAtom`.
 * While the flag is set the component renders nothing; otherwise it shows a
 * notice with a link to the GitHub issue tracker and a close button that
 * sets the flag to `true`.
 */
export function AnnouncementBanner() {
	const [dismissed, setDismissed] = useAtom(announcementDismissedAtom);

	// Nothing to show once the user has dismissed the banner.
	if (dismissed) {
		return null;
	}

	return (
		<div className="relative h-[3rem] flex items-center justify-center border bg-gradient-to-r from-blue-600 to-blue-500 dark:from-blue-700 dark:to-blue-600 border-b border-blue-700 dark:border-blue-800">
			<div className="container mx-auto px-4">
				<div className="flex items-center justify-center gap-3 py-2.5">
					<Info className="h-4 w-4 text-blue-50 flex-shrink-0" />
					<p className="text-sm text-blue-50 text-center font-medium">
						SurfSense is a work in progress.{" "}
						<a
							href="https://github.com/MODSetter/SurfSense/issues"
							target="_blank"
							rel="noopener noreferrer"
							className="inline-flex items-center gap-1 underline decoration-blue-200 underline-offset-2 hover:decoration-white transition-colors"
						>
							Report issues on GitHub
							<ExternalLink className="h-3 w-3" />
						</a>
					</p>
					{/* Close control: flips the persisted flag so the banner stays hidden. */}
					<Button
						variant="ghost"
						size="sm"
						className="h-7 w-7 p-0 shrink-0 text-blue-100 hover:text-white hover:bg-blue-700/50 dark:hover:bg-blue-800/50 absolute right-4"
						onClick={() => {
							setDismissed(true);
						}}
					>
						<X className="h-3.5 w-3.5" />
						<span className="sr-only">Dismiss</span>
					</Button>
				</div>
			</div>
		</div>
	);
}