Merge pull request #790 from MODSetter/dev

feat: new manage docs ui/ux
This commit is contained in:
Rohan Verma 2026-02-05 23:50:42 -08:00 committed by GitHub
commit d97068882a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
65 changed files with 8215 additions and 4388 deletions

View file

@ -17,13 +17,6 @@ from collections.abc import Sequence
from alembic import context, op
# Get Electric SQL user credentials from env.py configuration
_config = context.config
ELECTRIC_DB_USER = _config.get_main_option("electric_db_user", "electric")
ELECTRIC_DB_PASSWORD = _config.get_main_option(
"electric_db_password", "electric_password"
)
# revision identifiers, used by Alembic.
revision: str = "66"
down_revision: str | None = "65"
@ -31,8 +24,21 @@ branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def _get_electric_credentials() -> tuple[str, str]:
    """Read the Electric SQL role name and password from the Alembic config.

    Call this only from within upgrade()/downgrade(), never at import time:
    ``context.config`` is only available while a migration is executing.

    Returns:
        A ``(user, password)`` pair taken from the ``electric_db_user`` and
        ``electric_db_password`` main options, with ``"electric"`` and
        ``"electric_password"`` passed as the respective defaults.
    """
    alembic_cfg = context.config
    return (
        alembic_cfg.get_main_option("electric_db_user", "electric"),
        alembic_cfg.get_main_option("electric_db_password", "electric_password"),
    )
def upgrade() -> None:
"""Upgrade schema - add notifications table and Electric SQL replication."""
electric_db_user, electric_db_password = _get_electric_credentials()
# Create notifications table
op.execute(
"""
@ -74,8 +80,8 @@ def upgrade() -> None:
f"""
DO $$
BEGIN
IF NOT EXISTS (SELECT FROM pg_user WHERE usename = '{ELECTRIC_DB_USER}') THEN
CREATE USER {ELECTRIC_DB_USER} WITH REPLICATION PASSWORD '{ELECTRIC_DB_PASSWORD}';
IF NOT EXISTS (SELECT FROM pg_user WHERE usename = '{electric_db_user}') THEN
CREATE USER {electric_db_user} WITH REPLICATION PASSWORD '{electric_db_password}';
END IF;
END
$$;
@ -89,19 +95,19 @@ def upgrade() -> None:
DECLARE
db_name TEXT := current_database();
BEGIN
EXECUTE format('GRANT CONNECT ON DATABASE %I TO {ELECTRIC_DB_USER}', db_name);
EXECUTE format('GRANT CONNECT ON DATABASE %I TO {electric_db_user}', db_name);
END
$$;
"""
)
op.execute(f"GRANT USAGE ON SCHEMA public TO {ELECTRIC_DB_USER};")
op.execute(f"GRANT SELECT ON ALL TABLES IN SCHEMA public TO {ELECTRIC_DB_USER};")
op.execute(f"GRANT SELECT ON ALL SEQUENCES IN SCHEMA public TO {ELECTRIC_DB_USER};")
op.execute(f"GRANT USAGE ON SCHEMA public TO {electric_db_user};")
op.execute(f"GRANT SELECT ON ALL TABLES IN SCHEMA public TO {electric_db_user};")
op.execute(f"GRANT SELECT ON ALL SEQUENCES IN SCHEMA public TO {electric_db_user};")
op.execute(
f"ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO {ELECTRIC_DB_USER};"
f"ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO {electric_db_user};"
)
op.execute(
f"ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON SEQUENCES TO {ELECTRIC_DB_USER};"
f"ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON SEQUENCES TO {electric_db_user};"
)
# Create the publication if not exists

View file

@ -10,8 +10,6 @@ SECRET_KEY rotation.
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
# revision identifiers, used by Alembic.
@ -23,17 +21,45 @@ depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
# Add access_token column (nullable so existing rows are unaffected)
op.add_column(
"image_generations",
sa.Column("access_token", sa.String(64), nullable=True),
)
op.create_index(
"ix_image_generations_access_token",
"image_generations",
["access_token"],
# Guard: skip entirely if image_generations table doesn't exist
op.execute(
"""
DO $$
BEGIN
IF EXISTS (
SELECT 1 FROM information_schema.tables
WHERE table_name = 'image_generations'
) THEN
-- Add column if not exists
IF NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'image_generations' AND column_name = 'access_token'
) THEN
ALTER TABLE image_generations
ADD COLUMN access_token VARCHAR(64);
END IF;
-- Create index if not exists
CREATE INDEX IF NOT EXISTS ix_image_generations_access_token
ON image_generations (access_token);
END IF;
END$$;
"""
)
def downgrade() -> None:
op.drop_index("ix_image_generations_access_token", table_name="image_generations")
op.drop_column("image_generations", "access_token")
op.execute("DROP INDEX IF EXISTS ix_image_generations_access_token")
op.execute(
"""
DO $$
BEGIN
IF EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'image_generations' AND column_name = 'access_token'
) THEN
ALTER TABLE image_generations DROP COLUMN access_token;
END IF;
END$$;
"""
)

View file

@ -0,0 +1,77 @@
"""Add status column to documents table for per-document processing status
Revision ID: 95
Revises: 94
Create Date: 2026-02-05
Changes:
1. Add status column (JSONB) to documents table
2. Default value is {"state": "ready"} for backward compatibility
3. Existing documents are set to ready status
4. Index created for efficient status filtering
"""
from collections.abc import Sequence
from alembic import op
# revision identifiers, used by Alembic.
revision: str = "95"
down_revision: str | None = "94"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Add the ``status`` JSONB column to ``documents`` and index its state.

    Both statements are idempotent, so re-running the migration against a
    database that already has the column and/or the index is a no-op.
    """
    # 1. Add the status column.
    #    ADD COLUMN IF NOT EXISTS lets PostgreSQL test the exact table being
    #    altered. A hand-written lookup in information_schema.columns keyed
    #    only on table/column name would also match a "documents.status" that
    #    lives in a different schema, skip the ALTER, and leave the index
    #    creation below failing on a missing column.
    #    The NOT NULL default gives existing rows {"state": "ready"}.
    op.execute(
        """
        ALTER TABLE documents
            ADD COLUMN IF NOT EXISTS status JSONB NOT NULL
            DEFAULT '{"state": "ready"}'::jsonb;
        """
    )

    # 2. Expression index on the state text for efficient filtering by state.
    op.execute(
        """
        CREATE INDEX IF NOT EXISTS ix_documents_status
            ON documents ((status->>'state'));
        """
    )
def downgrade() -> None:
    """Remove the ``status`` column and its index from ``documents``.

    Every statement tolerates the object already being absent, so the
    downgrade can be re-run safely.
    """
    # Drop the expression index first (no-op when it does not exist).
    op.execute(
        """
        DROP INDEX IF EXISTS ix_documents_status;
        """
    )

    # Drop the column. IF EXISTS on both the table and the column makes
    # PostgreSQL check the exact table being altered; a lookup in
    # information_schema.columns keyed only on table/column name could match
    # a "documents.status" in another schema and then fail on this one.
    op.execute(
        """
        ALTER TABLE IF EXISTS documents
            DROP COLUMN IF EXISTS status;
        """
    )

View file

@ -16,11 +16,15 @@ from sqlalchemy.orm import selectinload
from app.config import config
from app.connectors.composio_connector import ComposioConnector
from app.db import Document, DocumentType
from app.db import Document, DocumentStatus, DocumentType
from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.tasks.connector_indexers.base import calculate_date_range
from app.tasks.connector_indexers.base import (
calculate_date_range,
check_duplicate_document_by_hash,
safe_set_chunks,
)
from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
@ -206,26 +210,24 @@ class ComposioGmailConnector(ComposioConnector):
# ============ Indexer Functions ============
async def _process_gmail_message_batch(
async def _analyze_gmail_messages_phase1(
session: AsyncSession,
messages: list[dict[str, Any]],
composio_connector: ComposioGmailConnector,
connector_id: int,
search_space_id: int,
user_id: str,
total_documents_indexed: int = 0,
) -> tuple[int, int]:
) -> tuple[list[dict[str, Any]], int, int]:
"""
Process a batch of Gmail messages and index them.
Args:
total_documents_indexed: Running total of documents indexed so far (for batch commits).
Phase 1: Analyze all messages, create pending documents.
Makes ALL documents visible in the UI immediately with pending status.
Returns:
Tuple of (documents_indexed, documents_skipped)
Tuple of (messages_to_process, documents_skipped, duplicate_content_count)
"""
documents_indexed = 0
messages_to_process = []
documents_skipped = 0
duplicate_content_count = 0
for message in messages:
try:
@ -235,11 +237,7 @@ async def _process_gmail_message_batch(
documents_skipped += 1
continue
# Composio's GMAIL_FETCH_EMAILS already returns full message content
# No need for a separate detail API call
# Extract message info from Composio response
# Composio structure: messageId, messageText, messageTimestamp, payload.headers, labelIds
payload = message.get("payload", {})
headers = payload.get("headers", [])
@ -262,7 +260,7 @@ async def _process_gmail_message_batch(
message
)
# Check for empty content (defensive parsing per Composio best practices)
# Check for empty content
if not markdown_content.strip():
logger.warning(f"Skipping Gmail message with no content: {subject}")
documents_skipped += 1
@ -280,102 +278,58 @@ async def _process_gmail_message_batch(
session, unique_identifier_hash
)
# Get label IDs from Composio response
# Get label IDs and thread_id from Composio response
label_ids = message.get("labelIds", [])
# Extract thread_id if available (for consistency with non-Composio implementation)
thread_id = message.get("threadId", "") or message.get("thread_id", "")
if existing_document:
if existing_document.content_hash == content_hash:
# Ensure status is ready (might have been stuck in processing/pending)
if not DocumentStatus.is_state(
existing_document.status, DocumentStatus.READY
):
existing_document.status = DocumentStatus.ready()
documents_skipped += 1
continue
# Update existing
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
# Queue existing document for update (will be set to processing in Phase 2)
messages_to_process.append(
{
"document": existing_document,
"is_new": False,
"markdown_content": markdown_content,
"content_hash": content_hash,
"message_id": message_id,
"thread_id": thread_id,
"subject": subject,
"sender": sender,
"document_type": "Gmail Message (Composio)",
"date_str": date_str,
"label_ids": label_ids,
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
markdown_content, user_llm, document_metadata
)
else:
summary_content = (
f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}"
)
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(markdown_content)
existing_document.title = f"Gmail: {subject}"
existing_document.content = summary_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = {
"message_id": message_id,
"thread_id": thread_id,
"subject": subject,
"sender": sender,
"date": date_str,
"labels": label_ids,
"connector_id": connector_id,
"source": "composio",
}
existing_document.chunks = chunks
existing_document.updated_at = get_current_timestamp()
documents_indexed += 1
# Batch commit every 10 documents
current_total = total_documents_indexed + documents_indexed
if current_total % 10 == 0:
logger.info(
f"Committing batch: {current_total} Gmail messages processed so far"
)
await session.commit()
)
continue
# Create new document
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"message_id": message_id,
"thread_id": thread_id,
"subject": subject,
"sender": sender,
"document_type": "Gmail Message (Composio)",
}
summary_content, summary_embedding = await generate_document_summary(
markdown_content, user_llm, document_metadata
)
else:
summary_content = (
f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}"
)
summary_embedding = config.embedding_model_instance.embed(
summary_content
# Document doesn't exist by unique_identifier_hash
# Check if a document with the same content_hash exists (from standard connector)
with session.no_autoflush:
duplicate_by_content = await check_duplicate_document_by_hash(
session, content_hash
)
chunks = await create_document_chunks(markdown_content)
if duplicate_by_content:
logger.info(
f"Message {subject} already indexed by another connector "
f"(existing document ID: {duplicate_by_content.id}, "
f"type: {duplicate_by_content.document_type}). Skipping."
)
duplicate_content_count += 1
documents_skipped += 1
continue
# Create new document with PENDING status (visible in UI immediately)
document = Document(
search_space_id=search_space_id,
title=f"Gmail: {subject}",
title=subject,
document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["gmail"]),
document_metadata={
"message_id": message_id,
@ -388,39 +342,140 @@ async def _process_gmail_message_batch(
"toolkit_id": "gmail",
"source": "composio",
},
content=summary_content,
content_hash=content_hash,
content="Pending...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
embedding=None,
chunks=[], # Empty at creation - safe for async
status=DocumentStatus.pending(), # Pending until processing starts
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
session.add(document)
messages_to_process.append(
{
"document": document,
"is_new": True,
"markdown_content": markdown_content,
"content_hash": content_hash,
"message_id": message_id,
"thread_id": thread_id,
"subject": subject,
"sender": sender,
"date_str": date_str,
"label_ids": label_ids,
}
)
except Exception as e:
logger.error(f"Error in Phase 1 for message: {e!s}", exc_info=True)
documents_skipped += 1
continue
return messages_to_process, documents_skipped, duplicate_content_count
async def _process_gmail_messages_phase2(
session: AsyncSession,
messages_to_process: list[dict[str, Any]],
connector_id: int,
search_space_id: int,
user_id: str,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, int]:
"""
Phase 2: Process each document one by one.
Each document transitions: pending → processing → ready/failed
Returns:
Tuple of (documents_indexed, documents_failed)
"""
documents_indexed = 0
documents_failed = 0
last_heartbeat_time = time.time()
for item in messages_to_process:
# Send heartbeat periodically
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time
document = item["document"]
try:
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
document.status = DocumentStatus.processing()
await session.commit()
# Heavy processing (LLM, embeddings, chunks)
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata_for_summary = {
"message_id": item["message_id"],
"thread_id": item["thread_id"],
"subject": item["subject"],
"sender": item["sender"],
"document_type": "Gmail Message (Composio)",
}
summary_content, summary_embedding = await generate_document_summary(
item["markdown_content"], user_llm, document_metadata_for_summary
)
else:
summary_content = f"Gmail: {item['subject']}\n\nFrom: {item['sender']}\nDate: {item['date_str']}"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(item["markdown_content"])
# Update document to READY with actual content
document.title = item["subject"]
document.content = summary_content
document.content_hash = item["content_hash"]
document.embedding = summary_embedding
document.document_metadata = {
"message_id": item["message_id"],
"thread_id": item["thread_id"],
"subject": item["subject"],
"sender": item["sender"],
"date": item["date_str"],
"labels": item["label_ids"],
"connector_id": connector_id,
"source": "composio",
}
safe_set_chunks(document, chunks)
document.updated_at = get_current_timestamp()
document.status = DocumentStatus.ready()
documents_indexed += 1
# Batch commit every 10 documents
current_total = total_documents_indexed + documents_indexed
if current_total % 10 == 0:
# Batch commit every 10 documents (for ready status updates)
if documents_indexed % 10 == 0:
logger.info(
f"Committing batch: {current_total} Gmail messages processed so far"
f"Committing batch: {documents_indexed} Gmail messages processed so far"
)
await session.commit()
except Exception as e:
logger.error(f"Error processing Gmail message: {e!s}", exc_info=True)
documents_skipped += 1
# Rollback on error to avoid partial state (per Composio best practices)
# Mark document as failed with reason (visible in UI)
try:
await session.rollback()
except Exception as rollback_error:
document.status = DocumentStatus.failed(str(e))
document.updated_at = get_current_timestamp()
except Exception as status_error:
logger.error(
f"Error during rollback: {rollback_error!s}", exc_info=True
f"Failed to update document status to failed: {status_error}"
)
documents_failed += 1
continue
return documents_indexed, documents_skipped
return documents_indexed, documents_failed
async def index_composio_gmail(
@ -437,7 +492,7 @@ async def index_composio_gmail(
max_items: int = 1000,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str]:
"""Index Gmail messages via Composio with pagination and incremental processing."""
"""Index Gmail messages via Composio with real-time document status updates."""
try:
composio_connector = ComposioGmailConnector(session, connector_id)
@ -448,14 +503,10 @@ async def index_composio_gmail(
end_date = None
# Use provided dates directly if both are provided, otherwise calculate from last_indexed_at
# This ensures user-selected dates are respected (matching non-Composio Gmail connector behavior)
if start_date is not None and end_date is not None:
# User provided both dates - use them directly
start_date_str = start_date
end_date_str = end_date
else:
# Calculate date range with defaults (uses last_indexed_at or 365 days back)
# This ensures indexing works even when user doesn't specify dates
start_date_str, end_date_str = calculate_date_range(
connector, start_date, end_date, default_days_back=365
)
@ -473,48 +524,32 @@ async def index_composio_gmail(
f"(start_date={start_date_str}, end_date={end_date_str})"
)
# Use smaller batch size to avoid 413 payload too large errors
await task_logger.log_task_progress(
log_entry,
f"Fetching Gmail messages via Composio for connector {connector_id}",
{"stage": "fetching_messages"},
)
# =======================================================================
# FETCH ALL MESSAGES FIRST
# =======================================================================
batch_size = 50
page_token = None
total_documents_indexed = 0
total_documents_skipped = 0
total_messages_fetched = 0
result_size_estimate = None # Will be set from first API response
all_messages = []
result_size_estimate = None
last_heartbeat_time = time.time()
while total_messages_fetched < max_items:
# Send heartbeat periodically to indicate task is still alive
while len(all_messages) < max_items:
# Send heartbeat periodically
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(total_documents_indexed)
await on_heartbeat_callback(len(all_messages))
last_heartbeat_time = current_time
# Calculate how many messages to fetch in this batch
remaining = max_items - total_messages_fetched
remaining = max_items - len(all_messages)
current_batch_size = min(batch_size, remaining)
# Use result_size_estimate if available, otherwise fall back to max_items
estimated_total = (
result_size_estimate if result_size_estimate is not None else max_items
)
# Cap estimated_total at max_items to avoid showing misleading progress
estimated_total = min(estimated_total, max_items)
await task_logger.log_task_progress(
log_entry,
f"Fetching Gmail messages batch via Composio for connector {connector_id} "
f"({total_messages_fetched}/{estimated_total} fetched, {total_documents_indexed} indexed)",
{
"stage": "fetching_messages",
"batch_size": current_batch_size,
"total_fetched": total_messages_fetched,
"total_indexed": total_documents_indexed,
"estimated_total": estimated_total,
},
)
# Fetch batch of messages
(
messages,
next_token,
@ -533,97 +568,136 @@ async def index_composio_gmail(
return 0, f"Failed to fetch Gmail messages: {error}"
if not messages:
# No more messages available
break
# Update result_size_estimate from first response (Gmail provides this estimate)
if result_size_estimate is None and result_size_estimate_batch is not None:
result_size_estimate = result_size_estimate_batch
logger.info(
f"Gmail API estimated {result_size_estimate} total messages for query: '{query}'"
)
total_messages_fetched += len(messages)
# Recalculate estimated_total after potentially updating result_size_estimate
estimated_total = (
result_size_estimate if result_size_estimate is not None else max_items
)
estimated_total = min(estimated_total, max_items)
all_messages.extend(messages)
logger.info(
f"Fetched batch of {len(messages)} Gmail messages "
f"(total: {total_messages_fetched}/{estimated_total})"
f"Fetched {len(messages)} messages (total: {len(all_messages)})"
)
# Process batch incrementally
batch_indexed, batch_skipped = await _process_gmail_message_batch(
session=session,
messages=messages,
composio_connector=composio_connector,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
total_documents_indexed=total_documents_indexed,
)
total_documents_indexed += batch_indexed
total_documents_skipped += batch_skipped
logger.info(
f"Processed batch: {batch_indexed} indexed, {batch_skipped} skipped "
f"(total: {total_documents_indexed} indexed, {total_documents_skipped} skipped)"
)
# Batch commits happen in _process_gmail_message_batch every 10 documents
# This ensures progress is saved incrementally, preventing data loss on crashes
# Check if we should continue
if not next_token:
# No more pages available
if not next_token or len(messages) < current_batch_size:
break
if len(messages) < current_batch_size:
# Last page had fewer items than requested, we're done
break
# Continue with next page
page_token = next_token
if total_messages_fetched == 0:
if not all_messages:
success_msg = "No Gmail messages found in the specified date range"
await task_logger.log_task_success(
log_entry, success_msg, {"messages_count": 0}
)
# CRITICAL: Update timestamp even when no messages found so Electric SQL syncs and UI shows indexed status
await update_connector_last_indexed(session, connector, update_last_indexed)
await session.commit()
return 0, None # Return None (not error) when no items found
return (
0,
None,
) # Return None (not error) when no items found - this is success with 0 items
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
# This ensures the UI shows "Last indexed" instead of "Never indexed"
logger.info(f"Found {len(all_messages)} Gmail messages to index via Composio")
# =======================================================================
# PHASE 1: Analyze all messages, create pending documents
# This makes ALL documents visible in the UI immediately with pending status
# =======================================================================
await task_logger.log_task_progress(
log_entry,
f"Phase 1: Creating pending documents for {len(all_messages)} messages",
{"stage": "phase1_pending"},
)
(
messages_to_process,
documents_skipped,
duplicate_content_count,
) = await _analyze_gmail_messages_phase1(
session=session,
messages=all_messages,
composio_connector=composio_connector,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
)
# Commit all pending documents - they all appear in UI now
new_documents_count = len([m for m in messages_to_process if m["is_new"]])
if new_documents_count > 0:
logger.info(f"Phase 1: Committing {new_documents_count} pending documents")
await session.commit()
# =======================================================================
# PHASE 2: Process each document one by one
# Each document transitions: pending → processing → ready/failed
# =======================================================================
logger.info(f"Phase 2: Processing {len(messages_to_process)} documents")
await task_logger.log_task_progress(
log_entry,
f"Phase 2: Processing {len(messages_to_process)} documents",
{"stage": "phase2_processing"},
)
documents_indexed, documents_failed = await _process_gmail_messages_phase2(
session=session,
messages_to_process=messages_to_process,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
on_heartbeat_callback=on_heartbeat_callback,
)
# CRITICAL: Always update timestamp so Electric SQL syncs
await update_connector_last_indexed(session, connector, update_last_indexed)
# Final commit to ensure all documents are persisted (safety net)
# This matches the pattern used in non-Composio Gmail indexer
logger.info(
f"Final commit: Total {total_documents_indexed} Gmail messages processed"
)
await session.commit()
logger.info(
"Successfully committed all Composio Gmail document changes to database"
)
# Final commit to ensure all documents are persisted
logger.info(f"Final commit: Total {documents_indexed} Gmail messages processed")
try:
await session.commit()
logger.info(
"Successfully committed all Composio Gmail document changes to database"
)
except Exception as e:
# Handle any remaining integrity errors gracefully
if (
"duplicate key value violates unique constraint" in str(e).lower()
or "uniqueviolationerror" in str(e).lower()
):
logger.warning(
f"Duplicate content_hash detected during final commit. "
f"Rolling back and continuing. Error: {e!s}"
)
await session.rollback()
else:
raise
# Build warning message if there were issues
warning_parts = []
if duplicate_content_count > 0:
warning_parts.append(f"{duplicate_content_count} duplicate")
if documents_failed > 0:
warning_parts.append(f"{documents_failed} failed")
warning_message = ", ".join(warning_parts) if warning_parts else None
await task_logger.log_task_success(
log_entry,
f"Successfully completed Gmail indexing via Composio for connector {connector_id}",
{
"documents_indexed": total_documents_indexed,
"documents_skipped": total_documents_skipped,
"messages_fetched": total_messages_fetched,
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
"documents_failed": documents_failed,
"duplicate_content_count": duplicate_content_count,
},
)
return total_documents_indexed, None
logger.info(
f"Composio Gmail indexing completed: {documents_indexed} ready, "
f"{documents_skipped} skipped, {documents_failed} failed "
f"({duplicate_content_count} duplicate content)"
)
return documents_indexed, warning_message
except Exception as e:
logger.error(f"Failed to index Gmail via Composio: {e!s}", exc_info=True)

View file

@ -16,13 +16,14 @@ from sqlalchemy.orm import selectinload
from app.config import config
from app.connectors.composio_connector import ComposioConnector
from app.db import Document, DocumentType
from app.db import Document, DocumentStatus, DocumentType
from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.tasks.connector_indexers.base import (
calculate_date_range,
check_duplicate_document_by_hash,
safe_set_chunks,
)
from app.utils.document_converters import (
create_document_chunks,
@ -266,18 +267,20 @@ async def index_composio_google_calendar(
documents_indexed = 0
documents_skipped = 0
documents_failed = 0 # Track events that failed processing
duplicate_content_count = (
0 # Track events skipped due to duplicate content_hash
)
last_heartbeat_time = time.time()
# =======================================================================
# PHASE 1: Analyze all events, create pending documents
# This makes ALL documents visible in the UI immediately with pending status
# =======================================================================
events_to_process = [] # List of dicts with document and event data
new_documents_created = False
for event in events:
# Send heartbeat periodically to indicate task is still alive
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time
try:
# Handle both standard Google API and potential Composio variations
event_id = event.get("id", "") or event.get("eventId", "")
@ -315,61 +318,28 @@ async def index_composio_google_calendar(
if existing_document:
if existing_document.content_hash == content_hash:
# Ensure status is ready (might have been stuck in processing/pending)
if not DocumentStatus.is_state(
existing_document.status, DocumentStatus.READY
):
existing_document.status = DocumentStatus.ready()
documents_skipped += 1
continue
# Update existing
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
# Queue existing document for update (will be set to processing in Phase 2)
events_to_process.append(
{
"document": existing_document,
"is_new": False,
"markdown_content": markdown_content,
"content_hash": content_hash,
"event_id": event_id,
"summary": summary,
"start_time": start_time,
"document_type": "Google Calendar Event (Composio)",
"end_time": end_time,
"location": location,
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
markdown_content, user_llm, document_metadata
)
else:
summary_content = f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}"
if location:
summary_content += f"\nLocation: {location}"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(markdown_content)
existing_document.title = f"Calendar: {summary}"
existing_document.content = summary_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = {
"event_id": event_id,
"summary": summary,
"start_time": start_time,
"end_time": end_time,
"location": location,
"connector_id": connector_id,
"source": "composio",
}
existing_document.chunks = chunks
existing_document.updated_at = get_current_timestamp()
documents_indexed += 1
# Batch commit every 10 documents
if documents_indexed % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed} Google Calendar events processed so far"
)
await session.commit()
)
continue
# Document doesn't exist by unique_identifier_hash
@ -380,49 +350,19 @@ async def index_composio_google_calendar(
)
if duplicate_by_content:
# A document with the same content already exists (likely from standard connector)
logger.info(
f"Event {summary} already indexed by another connector "
f"(existing document ID: {duplicate_by_content.id}, "
f"type: {duplicate_by_content.document_type}). Skipping to avoid duplicate content."
f"type: {duplicate_by_content.document_type}). Skipping."
)
duplicate_content_count += 1
documents_skipped += 1
continue
# Create new document
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"event_id": event_id,
"summary": summary,
"start_time": start_time,
"document_type": "Google Calendar Event (Composio)",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
markdown_content, user_llm, document_metadata
)
else:
summary_content = (
f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}"
)
if location:
summary_content += f"\nLocation: {location}"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(markdown_content)
# Create new document with PENDING status (visible in UI immediately)
document = Document(
search_space_id=search_space_id,
title=f"Calendar: {summary}",
title=summary,
document_type=DocumentType(
TOOLKIT_TO_DOCUMENT_TYPE["googlecalendar"]
),
@ -436,19 +376,116 @@ async def index_composio_google_calendar(
"toolkit_id": "googlecalendar",
"source": "composio",
},
content=summary_content,
content_hash=content_hash,
content="Pending...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
embedding=None,
chunks=[], # Empty at creation - safe for async
status=DocumentStatus.pending(), # Pending until processing starts
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
session.add(document)
new_documents_created = True
events_to_process.append(
{
"document": document,
"is_new": True,
"markdown_content": markdown_content,
"content_hash": content_hash,
"event_id": event_id,
"summary": summary,
"start_time": start_time,
"end_time": end_time,
"location": location,
}
)
except Exception as e:
logger.error(f"Error in Phase 1 for event: {e!s}", exc_info=True)
documents_failed += 1
continue
# Commit all pending documents - they all appear in UI now
if new_documents_created:
logger.info(
f"Phase 1: Committing {len([e for e in events_to_process if e['is_new']])} pending documents"
)
await session.commit()
# =======================================================================
# PHASE 2: Process each document one by one
# Each document transitions: pending → processing → ready/failed
# =======================================================================
logger.info(f"Phase 2: Processing {len(events_to_process)} documents")
for item in events_to_process:
# Send heartbeat periodically
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time
document = item["document"]
try:
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
document.status = DocumentStatus.processing()
await session.commit()
# Heavy processing (LLM, embeddings, chunks)
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata_for_summary = {
"event_id": item["event_id"],
"summary": item["summary"],
"start_time": item["start_time"],
"document_type": "Google Calendar Event (Composio)",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
item["markdown_content"],
user_llm,
document_metadata_for_summary,
)
else:
summary_content = f"Calendar: {item['summary']}\n\nStart: {item['start_time']}\nEnd: {item['end_time']}"
if item["location"]:
summary_content += f"\nLocation: {item['location']}"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(item["markdown_content"])
# Update document to READY with actual content
document.title = item["summary"]
document.content = summary_content
document.content_hash = item["content_hash"]
document.embedding = summary_embedding
document.document_metadata = {
"event_id": item["event_id"],
"summary": item["summary"],
"start_time": item["start_time"],
"end_time": item["end_time"],
"location": item["location"],
"connector_id": connector_id,
"source": "composio",
}
safe_set_chunks(document, chunks)
document.updated_at = get_current_timestamp()
document.status = DocumentStatus.ready()
documents_indexed += 1
# Batch commit every 10 documents
# Batch commit every 10 documents (for ready status updates)
if documents_indexed % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed} Google Calendar events processed so far"
@ -457,7 +494,15 @@ async def index_composio_google_calendar(
except Exception as e:
logger.error(f"Error processing Calendar event: {e!s}", exc_info=True)
documents_skipped += 1
# Mark document as failed with reason (visible in UI)
try:
document.status = DocumentStatus.failed(str(e))
document.updated_at = get_current_timestamp()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
documents_failed += 1
continue
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
@ -490,10 +535,13 @@ async def index_composio_google_calendar(
else:
raise
# Build warning message if duplicates were found
warning_message = None
# Build warning message if there were issues
warning_parts = []
if duplicate_content_count > 0:
warning_message = f"{duplicate_content_count} skipped (duplicate)"
warning_parts.append(f"{duplicate_content_count} duplicate")
if documents_failed > 0:
warning_parts.append(f"{documents_failed} failed")
warning_message = ", ".join(warning_parts) if warning_parts else None
await task_logger.log_task_success(
log_entry,
@ -501,13 +549,15 @@ async def index_composio_google_calendar(
{
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
"documents_failed": documents_failed,
"duplicate_content_count": duplicate_content_count,
},
)
logger.info(
f"Composio Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped "
f"({duplicate_content_count} due to duplicate content from other connectors)"
f"Composio Google Calendar indexing completed: {documents_indexed} ready, "
f"{documents_skipped} skipped, {documents_failed} failed "
f"({duplicate_content_count} duplicate content)"
)
return documents_indexed, warning_message

View file

@ -21,10 +21,14 @@ from sqlalchemy.orm.attributes import flag_modified
from app.config import config
from app.connectors.composio_connector import ComposioConnector
from app.db import Document, DocumentType, Log
from app.db import Document, DocumentStatus, DocumentType, Log
from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.tasks.connector_indexers.base import (
check_duplicate_document_by_hash,
safe_set_chunks,
)
from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
@ -537,22 +541,6 @@ async def check_document_by_unique_identifier(
return existing_doc_result.scalars().first()
async def check_document_by_content_hash(
    session: AsyncSession, content_hash: str
) -> Document | None:
    """Look up a document by its content hash.

    Used as a de-duplication guard: identical content should not be indexed
    a second time, no matter which connector indexed it first.

    Args:
        session: Async database session the query is executed on.
        content_hash: Hash value compared against ``Document.content_hash``.

    Returns:
        The first matching ``Document``, or ``None`` when nothing matches.
    """
    # Imported locally, as in the original implementation.
    from sqlalchemy.future import select

    query = select(Document).where(Document.content_hash == content_hash)
    matches = await session.execute(query)
    return matches.scalars().first()
async def check_document_by_google_drive_file_id(
session: AsyncSession, file_id: str, search_space_id: int
) -> Document | None:
@ -843,14 +831,16 @@ async def _index_composio_drive_delta_sync(
log_entry,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, int, list[str]]:
"""Index Google Drive files using delta sync (only changed files).
"""Index Google Drive files using delta sync with real-time document status updates.
Uses GOOGLEDRIVE_LIST_CHANGES to fetch only files that changed since last sync.
Handles: new files, modified files, and deleted files.
"""
documents_indexed = 0
documents_skipped = 0
documents_failed = 0
processing_errors = []
duplicate_content_count = 0
last_heartbeat_time = time.time()
# Fetch all changes with pagination
@ -881,14 +871,13 @@ async def _index_composio_drive_delta_sync(
logger.info(f"Processing {len(all_changes)} changes from delta sync")
for change in all_changes[:max_items]:
# Send heartbeat periodically to indicate task is still alive
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time
# =======================================================================
# PHASE 1: Analyze all changes, handle deletions, create pending documents
# =======================================================================
files_to_process = []
new_documents_created = False
for change in all_changes[:max_items]:
try:
# Handle removed files
is_removed = change.get("removed", False)
@ -899,9 +888,8 @@ async def _index_composio_drive_delta_sync(
documents_skipped += 1
continue
# Check if file was trashed or removed
# Check if file was trashed or removed - handle deletions immediately
if is_removed or file_info.get("trashed", False):
# Remove document from database
document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"])
unique_identifier_hash = generate_unique_identifier_hash(
document_type, f"drive_{file_id}", search_space_id
@ -923,37 +911,233 @@ async def _index_composio_drive_delta_sync(
if mime_type == "application/vnd.google-apps.folder":
continue
# Process the file
indexed, skipped, errors = await _process_single_drive_file(
session=session,
composio_connector=composio_connector,
file_id=file_id,
file_name=file_name,
mime_type=mime_type,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
task_logger=task_logger,
log_entry=log_entry,
# Check for existing document by file ID (from any connector)
existing_by_file_id = await check_document_by_google_drive_file_id(
session, file_id, search_space_id
)
documents_indexed += indexed
documents_skipped += skipped
processing_errors.extend(errors)
# Generate unique identifier hash
document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"])
unique_identifier_hash = generate_unique_identifier_hash(
document_type, f"drive_{file_id}", search_space_id
)
# Check if document exists by unique identifier
existing_document = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
if existing_by_file_id and not existing_document:
# File already indexed by different connector - skip
logger.info(
f"Skipping file {file_name} (file_id={file_id}): already indexed "
f"by {existing_by_file_id.document_type.value}"
)
documents_skipped += 1
continue
if existing_document:
# Queue existing document for update
files_to_process.append(
{
"document": existing_document,
"is_new": False,
"file_id": file_id,
"file_name": file_name,
"mime_type": mime_type,
}
)
continue
# Create new document with PENDING status
document = Document(
search_space_id=search_space_id,
title=file_name,
document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]),
document_metadata={
"file_id": file_id,
"file_name": file_name,
"FILE_NAME": file_name,
"mime_type": mime_type,
"connector_id": connector_id,
"toolkit_id": "googledrive",
"source": "composio",
},
content="Pending...",
content_hash=unique_identifier_hash,
unique_identifier_hash=unique_identifier_hash,
embedding=None,
chunks=[],
status=DocumentStatus.pending(),
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
session.add(document)
new_documents_created = True
files_to_process.append(
{
"document": document,
"is_new": True,
"file_id": file_id,
"file_name": file_name,
"mime_type": mime_type,
}
)
except Exception as e:
logger.error(f"Error in Phase 1 for change: {e!s}", exc_info=True)
documents_skipped += 1
continue
# Commit all pending documents - they all appear in UI now
if new_documents_created:
logger.info(
f"Phase 1: Committing {len([f for f in files_to_process if f['is_new']])} pending documents"
)
await session.commit()
# =======================================================================
# PHASE 2: Process each document one by one
# =======================================================================
logger.info(f"Phase 2: Processing {len(files_to_process)} documents")
for item in files_to_process:
# Send heartbeat periodically
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time
document = item["document"]
try:
# Set to PROCESSING and commit
document.status = DocumentStatus.processing()
await session.commit()
# Get file content
content, content_error = await composio_connector.get_drive_file_content(
item["file_id"], original_mime_type=item["mime_type"]
)
if content_error or not content:
logger.warning(
f"Could not get content for file {item['file_name']}: {content_error}"
)
markdown_content = f"# {item['file_name']}\n\n"
markdown_content += f"**File ID:** {item['file_id']}\n"
markdown_content += f"**Type:** {item['mime_type']}\n"
elif isinstance(content, dict):
error_msg = f"Unexpected dict content format for file {item['file_name']}: {list(content.keys())}"
logger.error(error_msg)
processing_errors.append(error_msg)
markdown_content = f"# {item['file_name']}\n\n"
markdown_content += f"**File ID:** {item['file_id']}\n"
markdown_content += f"**Type:** {item['mime_type']}\n"
else:
markdown_content = await _process_file_content(
content=content,
file_name=item["file_name"],
file_id=item["file_id"],
mime_type=item["mime_type"],
search_space_id=search_space_id,
user_id=user_id,
session=session,
task_logger=task_logger,
log_entry=log_entry,
processing_errors=processing_errors,
)
content_hash = generate_content_hash(markdown_content, search_space_id)
# For existing documents, check if content changed
if not item["is_new"] and document.content_hash == content_hash:
if not DocumentStatus.is_state(document.status, DocumentStatus.READY):
document.status = DocumentStatus.ready()
documents_skipped += 1
continue
# Check for duplicate content hash (for new documents)
if item["is_new"]:
with session.no_autoflush:
duplicate_by_content = await check_duplicate_document_by_hash(
session, content_hash
)
if duplicate_by_content:
logger.info(
f"File {item['file_name']} already indexed by another connector. Skipping."
)
await session.delete(document)
duplicate_content_count += 1
documents_skipped += 1
continue
# Heavy processing (LLM, embeddings, chunks)
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata_for_summary = {
"file_id": item["file_id"],
"file_name": item["file_name"],
"mime_type": item["mime_type"],
"document_type": "Google Drive File (Composio)",
}
summary_content, summary_embedding = await generate_document_summary(
markdown_content, user_llm, document_metadata_for_summary
)
else:
summary_content = f"Google Drive File: {item['file_name']}\n\nType: {item['mime_type']}"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(markdown_content)
# Update document to READY
document.title = item["file_name"]
document.content = summary_content
document.content_hash = content_hash
document.embedding = summary_embedding
document.document_metadata = {
"file_id": item["file_id"],
"file_name": item["file_name"],
"FILE_NAME": item["file_name"],
"mime_type": item["mime_type"],
"connector_id": connector_id,
"source": "composio",
}
safe_set_chunks(document, chunks)
document.updated_at = get_current_timestamp()
document.status = DocumentStatus.ready()
documents_indexed += 1
# Batch commit every 10 documents
if documents_indexed > 0 and documents_indexed % 10 == 0:
if documents_indexed % 10 == 0:
await session.commit()
logger.info(f"Committed batch: {documents_indexed} changes processed")
except Exception as e:
error_msg = f"Error processing change for file {file_id}: {e!s}"
error_msg = f"Error processing change for file {item['file_id']}: {e!s}"
logger.error(error_msg, exc_info=True)
processing_errors.append(error_msg)
documents_skipped += 1
try:
document.status = DocumentStatus.failed(str(e))
document.updated_at = get_current_timestamp()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
documents_failed += 1
continue
logger.info(
f"Delta sync complete: {documents_indexed} indexed, {documents_skipped} skipped"
f"Delta sync complete: {documents_indexed} indexed, {documents_skipped} skipped, "
f"{documents_failed} failed ({duplicate_content_count} duplicate content)"
)
return documents_indexed, documents_skipped, processing_errors
@ -973,10 +1157,12 @@ async def _index_composio_drive_full_scan(
log_entry,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, int, list[str]]:
"""Index Google Drive files using full scan (first sync or when no delta token)."""
"""Index Google Drive files using full scan with real-time document status updates."""
documents_indexed = 0
documents_skipped = 0
documents_failed = 0
processing_errors = []
duplicate_content_count = 0
last_heartbeat_time = time.time()
all_files = []
@ -1108,14 +1294,14 @@ async def _index_composio_drive_full_scan(
f"Found {len(all_files)} Google Drive files to index via Composio (full scan)"
)
for file_info in all_files:
# Send heartbeat periodically to indicate task is still alive
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time
# =======================================================================
# PHASE 1: Analyze all files, create pending documents
# This makes ALL documents visible in the UI immediately with pending status
# =======================================================================
files_to_process = [] # List of dicts with document and file data
new_documents_created = False
for file_info in all_files:
try:
# Handle both standard Google API and potential Composio variations
file_id = file_info.get("id", "") or file_info.get("fileId", "")
@ -1132,227 +1318,242 @@ async def _index_composio_drive_full_scan(
if mime_type == "application/vnd.google-apps.folder":
continue
# Process the file
indexed, skipped, errors = await _process_single_drive_file(
session=session,
composio_connector=composio_connector,
file_id=file_id,
file_name=file_name,
mime_type=mime_type,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
task_logger=task_logger,
log_entry=log_entry,
# ========== EARLY DUPLICATE CHECK BY FILE ID ==========
existing_by_file_id = await check_document_by_google_drive_file_id(
session, file_id, search_space_id
)
if existing_by_file_id:
logger.info(
f"Skipping file {file_name} (file_id={file_id}): already indexed "
f"by {existing_by_file_id.document_type.value}"
)
documents_skipped += 1
continue
# Generate unique identifier hash
document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"])
unique_identifier_hash = generate_unique_identifier_hash(
document_type, f"drive_{file_id}", search_space_id
)
documents_indexed += indexed
documents_skipped += skipped
processing_errors.extend(errors)
# Check if document exists by unique identifier
existing_document = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
if existing_document:
# Queue existing document for update (will be set to processing in Phase 2)
files_to_process.append(
{
"document": existing_document,
"is_new": False,
"file_id": file_id,
"file_name": file_name,
"mime_type": mime_type,
}
)
continue
# Create new document with PENDING status (visible in UI immediately)
document = Document(
search_space_id=search_space_id,
title=file_name,
document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]),
document_metadata={
"file_id": file_id,
"file_name": file_name,
"FILE_NAME": file_name,
"mime_type": mime_type,
"connector_id": connector_id,
"toolkit_id": "googledrive",
"source": "composio",
},
content="Pending...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=None,
chunks=[], # Empty at creation - safe for async
status=DocumentStatus.pending(), # Pending until processing starts
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
session.add(document)
new_documents_created = True
files_to_process.append(
{
"document": document,
"is_new": True,
"file_id": file_id,
"file_name": file_name,
"mime_type": mime_type,
}
)
except Exception as e:
logger.error(f"Error in Phase 1 for file: {e!s}", exc_info=True)
documents_skipped += 1
continue
# Commit all pending documents - they all appear in UI now
if new_documents_created:
logger.info(
f"Phase 1: Committing {len([f for f in files_to_process if f['is_new']])} pending documents"
)
await session.commit()
# =======================================================================
# PHASE 2: Process each document one by one
# Each document transitions: pending → processing → ready/failed
# =======================================================================
logger.info(f"Phase 2: Processing {len(files_to_process)} documents")
for item in files_to_process:
# Send heartbeat periodically
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time
document = item["document"]
try:
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
document.status = DocumentStatus.processing()
await session.commit()
# Get file content (pass mime_type for Google Workspace export handling)
content, content_error = await composio_connector.get_drive_file_content(
item["file_id"], original_mime_type=item["mime_type"]
)
if content_error or not content:
logger.warning(
f"Could not get content for file {item['file_name']}: {content_error}"
)
markdown_content = f"# {item['file_name']}\n\n"
markdown_content += f"**File ID:** {item['file_id']}\n"
markdown_content += f"**Type:** {item['mime_type']}\n"
elif isinstance(content, dict):
error_msg = f"Unexpected dict content format for file {item['file_name']}: {list(content.keys())}"
logger.error(error_msg)
processing_errors.append(error_msg)
markdown_content = f"# {item['file_name']}\n\n"
markdown_content += f"**File ID:** {item['file_id']}\n"
markdown_content += f"**Type:** {item['mime_type']}\n"
else:
# Process content based on file type
markdown_content = await _process_file_content(
content=content,
file_name=item["file_name"],
file_id=item["file_id"],
mime_type=item["mime_type"],
search_space_id=search_space_id,
user_id=user_id,
session=session,
task_logger=task_logger,
log_entry=log_entry,
processing_errors=processing_errors,
)
content_hash = generate_content_hash(markdown_content, search_space_id)
# For existing documents, check if content changed
if not item["is_new"] and document.content_hash == content_hash:
# Ensure status is ready
if not DocumentStatus.is_state(document.status, DocumentStatus.READY):
document.status = DocumentStatus.ready()
documents_skipped += 1
continue
# Check for duplicate content hash (for new documents)
if item["is_new"]:
with session.no_autoflush:
duplicate_by_content = await check_duplicate_document_by_hash(
session, content_hash
)
if duplicate_by_content:
logger.info(
f"File {item['file_name']} already indexed by another connector. Skipping."
)
# Remove the pending document we created
await session.delete(document)
duplicate_content_count += 1
documents_skipped += 1
continue
# Heavy processing (LLM, embeddings, chunks)
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata_for_summary = {
"file_id": item["file_id"],
"file_name": item["file_name"],
"mime_type": item["mime_type"],
"document_type": "Google Drive File (Composio)",
}
summary_content, summary_embedding = await generate_document_summary(
markdown_content, user_llm, document_metadata_for_summary
)
else:
summary_content = f"Google Drive File: {item['file_name']}\n\nType: {item['mime_type']}"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(markdown_content)
# Update document to READY with actual content
document.title = item["file_name"]
document.content = summary_content
document.content_hash = content_hash
document.embedding = summary_embedding
document.document_metadata = {
"file_id": item["file_id"],
"file_name": item["file_name"],
"FILE_NAME": item["file_name"],
"mime_type": item["mime_type"],
"connector_id": connector_id,
"source": "composio",
}
safe_set_chunks(document, chunks)
document.updated_at = get_current_timestamp()
document.status = DocumentStatus.ready()
documents_indexed += 1
# Batch commit every 10 documents
if documents_indexed > 0 and documents_indexed % 10 == 0:
if documents_indexed % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed} Google Drive files processed so far"
)
await session.commit()
except Exception as e:
error_msg = f"Error processing Drive file {file_name or 'unknown'}: {e!s}"
error_msg = f"Error processing Drive file {item['file_name']}: {e!s}"
logger.error(error_msg, exc_info=True)
processing_errors.append(error_msg)
documents_skipped += 1
# Mark document as failed with reason (visible in UI)
try:
document.status = DocumentStatus.failed(str(e))
document.updated_at = get_current_timestamp()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
documents_failed += 1
continue
logger.info(
f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped"
f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped, "
f"{documents_failed} failed ({duplicate_content_count} duplicate content)"
)
return documents_indexed, documents_skipped, processing_errors
def _drive_metadata_fallback_markdown(
    file_name: str, file_id: str, mime_type: str
) -> str:
    """Build the metadata-only markdown used when file content is unusable."""
    return (
        f"# {file_name}\n\n"
        f"**File ID:** {file_id}\n"
        f"**Type:** {mime_type}\n"
    )


async def _build_drive_file_summary(
    session: AsyncSession,
    user_id: str,
    search_space_id: int,
    markdown_content: str,
    file_id: str,
    file_name: str,
    mime_type: str,
) -> tuple:
    """Return ``(summary_content, summary_embedding)`` for a Drive file.

    Uses the user's long-context LLM when one is configured; otherwise falls
    back to a short static description embedded with the configured
    embedding model.
    """
    user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
    if user_llm:
        summary_metadata = {
            "file_id": file_id,
            "file_name": file_name,
            "mime_type": mime_type,
            "document_type": "Google Drive File (Composio)",
        }
        summary_content, summary_embedding = await generate_document_summary(
            markdown_content, user_llm, summary_metadata
        )
        return summary_content, summary_embedding

    summary_content = f"Google Drive File: {file_name}\n\nType: {mime_type}"
    summary_embedding = config.embedding_model_instance.embed(summary_content)
    return summary_content, summary_embedding


async def _process_single_drive_file(
    session: AsyncSession,
    composio_connector: ComposioGoogleDriveConnector,
    file_id: str,
    file_name: str,
    mime_type: str,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    task_logger: TaskLoggingService,
    log_entry,
) -> tuple[int, int, list[str]]:
    """Process a single Google Drive file for indexing.

    The file is skipped when it was already indexed (matched by Drive file
    ID), when its content hash is unchanged, or when identical content is
    already stored. Otherwise the existing document is updated in place or a
    new ``Document`` is added to the session. This function itself issues no
    commit; that is left to the caller.

    Args:
        session: Async database session used for lookups and ``session.add``.
        composio_connector: Connector used to download the file content.
        file_id: Google Drive file ID.
        file_name: Display name of the file.
        mime_type: MIME type reported for the file.
        connector_id: ID of the connector doing the indexing.
        search_space_id: Search space the document belongs to.
        user_id: ID of the user who owns the document.
        task_logger: Task logging service, forwarded to content processing.
        log_entry: Log entry, forwarded to content processing.

    Returns:
        Tuple of (documents_indexed, documents_skipped, processing_errors)
    """
    processing_errors: list[str] = []

    # ========== EARLY DUPLICATE CHECK BY FILE ID ==========
    # Runs BEFORE download/ETL so an already-indexed file costs no API calls.
    # NOTE(review): if check_document_by_google_drive_file_id also matches
    # documents created by this same connector, the "update existing
    # document" branch below can never be reached - confirm against the helper.
    existing_by_file_id = await check_document_by_google_drive_file_id(
        session, file_id, search_space_id
    )
    if existing_by_file_id:
        logger.info(
            f"Skipping file {file_name} (file_id={file_id}): already indexed "
            f"by {existing_by_file_id.document_type.value} as '{existing_by_file_id.title}' "
            f"(saved download & ETL cost)"
        )
        return 0, 1, processing_errors  # Skip - NO download, NO ETL!
    # ======================================================

    # Identity of this file for this connector type within the search space
    document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"])
    unique_identifier_hash = generate_unique_identifier_hash(
        document_type, f"drive_{file_id}", search_space_id
    )

    # Same connector type, same file
    existing_document = await check_document_by_unique_identifier(
        session, unique_identifier_hash
    )

    # Get file content (pass mime_type for Google Workspace export handling)
    content, content_error = await composio_connector.get_drive_file_content(
        file_id, original_mime_type=mime_type
    )

    if content_error or not content:
        logger.warning(f"Could not get content for file {file_name}: {content_error}")
        # Use metadata as content fallback
        markdown_content = _drive_metadata_fallback_markdown(
            file_name, file_id, mime_type
        )
    elif isinstance(content, dict):
        # Safety check: content should never still be a dict at this point
        error_msg = f"Unexpected dict content format for file {file_name}: {list(content.keys())}"
        logger.error(error_msg)
        processing_errors.append(error_msg)
        markdown_content = _drive_metadata_fallback_markdown(
            file_name, file_id, mime_type
        )
    else:
        # Process content based on file type
        markdown_content = await _process_file_content(
            content=content,
            file_name=file_name,
            file_id=file_id,
            mime_type=mime_type,
            search_space_id=search_space_id,
            user_id=user_id,
            session=session,
            task_logger=task_logger,
            log_entry=log_entry,
            processing_errors=processing_errors,
        )

    content_hash = generate_content_hash(markdown_content, search_space_id)

    if existing_document:
        if existing_document.content_hash == content_hash:
            return 0, 1, processing_errors  # Skipped - unchanged

        # Update existing document
        summary_content, summary_embedding = await _build_drive_file_summary(
            session,
            user_id,
            search_space_id,
            markdown_content,
            file_id,
            file_name,
            mime_type,
        )
        chunks = await create_document_chunks(markdown_content)

        existing_document.title = f"Drive: {file_name}"
        existing_document.content = summary_content
        existing_document.content_hash = content_hash
        existing_document.embedding = summary_embedding
        existing_document.document_metadata = {
            "file_id": file_id,
            "file_name": file_name,
            "FILE_NAME": file_name,  # For compatibility
            "mime_type": mime_type,
            "connector_id": connector_id,
            "source": "composio",
        }
        existing_document.chunks = chunks
        existing_document.updated_at = get_current_timestamp()
        return 1, 0, processing_errors  # Indexed - updated

    # Check if content_hash already exists (from any connector)
    # This prevents duplicate content and avoids IntegrityError on unique constraint
    existing_by_content_hash = await check_document_by_content_hash(
        session, content_hash
    )
    if existing_by_content_hash:
        logger.info(
            f"Skipping file {file_name} (file_id={file_id}): identical content "
            f"already indexed as '{existing_by_content_hash.title}'"
        )
        return 0, 1, processing_errors  # Skipped - duplicate content

    # Create new document
    summary_content, summary_embedding = await _build_drive_file_summary(
        session,
        user_id,
        search_space_id,
        markdown_content,
        file_id,
        file_name,
        mime_type,
    )
    chunks = await create_document_chunks(markdown_content)

    document = Document(
        search_space_id=search_space_id,
        title=f"Drive: {file_name}",
        document_type=document_type,
        document_metadata={
            "file_id": file_id,
            "file_name": file_name,
            "FILE_NAME": file_name,  # For compatibility
            "mime_type": mime_type,
            # Recorded on creation as well, so new and updated documents
            # carry the same connector reference in their metadata.
            "connector_id": connector_id,
            "toolkit_id": "googledrive",
            "source": "composio",
        },
        content=summary_content,
        content_hash=content_hash,
        unique_identifier_hash=unique_identifier_hash,
        embedding=summary_embedding,
        chunks=chunks,
        updated_at=get_current_timestamp(),
        created_by_id=user_id,
        connector_id=connector_id,
    )
    session.add(document)
    return 1, 0, processing_errors  # Indexed - new
async def _fetch_folder_files_recursively(
composio_connector: ComposioGoogleDriveConnector,
folder_id: str,

View file

@ -100,6 +100,83 @@ class PodcastStatus(str, Enum):
FAILED = "failed"
class DocumentStatus:
"""
Helper class for document processing status (stored as JSONB).
Status values:
- {"state": "ready"} - Document is fully processed and searchable
- {"state": "pending"} - Document is queued, waiting to be processed
- {"state": "processing"} - Document is currently being processed (only 1 at a time)
- {"state": "failed", "reason": "..."} - Processing failed with reason
Usage:
document.status = DocumentStatus.pending()
document.status = DocumentStatus.processing()
document.status = DocumentStatus.ready()
document.status = DocumentStatus.failed("LLM rate limit exceeded")
"""
# State constants
READY = "ready"
PENDING = "pending"
PROCESSING = "processing"
FAILED = "failed"
@staticmethod
def ready() -> dict:
"""Return status dict for a ready/searchable document."""
return {"state": DocumentStatus.READY}
@staticmethod
def pending() -> dict:
"""Return status dict for a document waiting to be processed."""
return {"state": DocumentStatus.PENDING}
@staticmethod
def processing() -> dict:
"""Return status dict for a document being processed."""
return {"state": DocumentStatus.PROCESSING}
@staticmethod
def failed(reason: str, **extra_details) -> dict:
"""
Return status dict for a failed document.
Args:
reason: Human-readable failure reason
**extra_details: Optional additional details (duplicate_of, error_code, etc.)
"""
status = {
"state": DocumentStatus.FAILED,
"reason": reason[:500],
} # Truncate long reasons
if extra_details:
status.update(extra_details)
return status
@staticmethod
def get_state(status: dict | None) -> str | None:
"""Extract state from status dict, returns None if invalid."""
if status is None:
return None
return status.get("state") if isinstance(status, dict) else None
@staticmethod
def is_state(status: dict | None, state: str) -> bool:
"""Check if status matches a given state."""
return DocumentStatus.get_state(status) == state
@staticmethod
def get_failure_reason(status: dict | None) -> str | None:
"""Extract failure reason from status dict."""
if status is None or not isinstance(status, dict):
return None
if status.get("state") == DocumentStatus.FAILED:
return status.get("reason")
return None
class LiteLLMProvider(str, Enum):
"""
Enum for LLM providers supported by LiteLLM.
@ -813,6 +890,17 @@ class Document(BaseModel, TimestampMixin):
index=True,
)
# Processing status for real-time visibility (JSONB)
# Format: {"state": "ready"}, {"state": "pending"}, {"state": "processing"}
# or {"state": "failed", "reason": "..."}
# Default to {"state": "ready"} for backward compatibility with existing documents
# `default` receives the DocumentStatus.ready function itself (not its result);
# `server_default` supplies the same value at the database level.
# NOTE(review): index=True on a JSONB column presumably builds a default index over
# the whole value; filters on status->>'state' may not use it - confirm whether an
# expression index on the state key is what is actually wanted.
status = Column(
    JSONB,
    nullable=False,
    default=DocumentStatus.ready,
    server_default=text('\'{"state": "ready"}\'::jsonb'),
    index=True,
)
# Relationships
search_space = relationship("SearchSpace", back_populates="documents")
created_by = relationship("User", back_populates="documents")

View file

@ -19,6 +19,7 @@ from app.db import (
from app.schemas import (
DocumentRead,
DocumentsCreate,
DocumentStatusSchema,
DocumentTitleRead,
DocumentTitleSearchResponse,
DocumentUpdate,
@ -112,9 +113,23 @@ async def create_documents_file_upload(
user: User = Depends(current_active_user),
):
"""
Upload files as documents.
Upload files as documents with real-time status tracking.
Implements 2-phase document status updates for real-time UI feedback:
- Phase 1: Create all documents with 'pending' status (visible in UI immediately via ElectricSQL)
- Phase 2: Celery processes each file: pending → processing → ready/failed
Requires DOCUMENTS_CREATE permission.
"""
from datetime import datetime
from app.db import DocumentStatus
from app.tasks.document_processors.base import (
check_document_by_unique_identifier,
get_current_timestamp,
)
from app.utils.document_converters import generate_unique_identifier_hash
try:
# Check permission
await check_permission(
@ -128,38 +143,105 @@ async def create_documents_file_upload(
if not files:
raise HTTPException(status_code=400, detail="No files provided")
created_documents: list[Document] = []
files_to_process: list[
tuple[Document, str, str]
] = [] # (document, temp_path, filename)
skipped_duplicates = 0
# ===== PHASE 1: Create pending documents for all files =====
# This makes ALL documents visible in the UI immediately with pending status
for file in files:
try:
# Save file to a temporary location to avoid stream issues
import os
import tempfile
# Create temp file
# Save file to temp location
with tempfile.NamedTemporaryFile(
delete=False, suffix=os.path.splitext(file.filename)[1]
delete=False, suffix=os.path.splitext(file.filename or "")[1]
) as temp_file:
temp_path = temp_file.name
# Write uploaded file to temp file
content = await file.read()
with open(temp_path, "wb") as f:
f.write(content)
from app.tasks.celery_tasks.document_tasks import (
process_file_upload_task,
file_size = len(content)
# Generate unique identifier for deduplication check
unique_identifier_hash = generate_unique_identifier_hash(
DocumentType.FILE, file.filename or "unknown", search_space_id
)
process_file_upload_task.delay(
temp_path, file.filename, search_space_id, str(user.id)
# Check if document already exists (by unique identifier)
existing = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
if existing:
# Clean up temp file for duplicates
os.unlink(temp_path)
skipped_duplicates += 1
continue
# Create pending document (visible immediately in UI via ElectricSQL)
document = Document(
search_space_id=search_space_id,
title=file.filename or "Uploaded File",
document_type=DocumentType.FILE,
document_metadata={
"FILE_NAME": file.filename,
"file_size": file_size,
"upload_time": datetime.now().isoformat(),
},
content="Processing...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary, updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=None,
status=DocumentStatus.pending(), # Shows "pending" in UI
updated_at=get_current_timestamp(),
created_by_id=str(user.id),
)
session.add(document)
created_documents.append(document)
files_to_process.append(
(document, temp_path, file.filename or "unknown")
)
except Exception as e:
raise HTTPException(
status_code=422,
detail=f"Failed to process file {file.filename}: {e!s}",
) from e
await session.commit()
return {"message": "Files uploaded for processing"}
# Commit all pending documents - they appear in UI immediately via ElectricSQL
if created_documents:
await session.commit()
# Refresh to get generated IDs
for doc in created_documents:
await session.refresh(doc)
# ===== PHASE 2: Dispatch Celery tasks for each file =====
# Each task will update document status: pending → processing → ready/failed
from app.tasks.celery_tasks.document_tasks import (
process_file_upload_with_document_task,
)
for document, temp_path, filename in files_to_process:
process_file_upload_with_document_task.delay(
document_id=document.id,
temp_path=temp_path,
filename=filename,
search_space_id=search_space_id,
user_id=str(user.id),
)
return {
"message": "Files uploaded for processing",
"document_ids": [doc.id for doc in created_documents],
"total_files": len(files),
"pending_files": len(files_to_process),
"skipped_duplicates": skipped_duplicates,
}
except HTTPException:
raise
except Exception as e:
@ -211,7 +293,11 @@ async def read_documents(
Permission.DOCUMENTS_READ.value,
"You don't have permission to read documents in this search space",
)
query = select(Document).filter(Document.search_space_id == search_space_id)
query = (
select(Document)
.options(selectinload(Document.created_by))
.filter(Document.search_space_id == search_space_id)
)
count_query = (
select(func.count())
.select_from(Document)
@ -221,6 +307,7 @@ async def read_documents(
# Get documents from all search spaces user has membership in
query = (
select(Document)
.options(selectinload(Document.created_by))
.join(SearchSpace)
.join(SearchSpaceMembership)
.filter(SearchSpaceMembership.user_id == user.id)
@ -261,6 +348,19 @@ async def read_documents(
# Convert database objects to API-friendly format
api_documents = []
for doc in db_documents:
# Get user name (display_name or email fallback)
created_by_name = None
if doc.created_by:
created_by_name = doc.created_by.display_name or doc.created_by.email
# Parse status from JSONB
status_data = None
if hasattr(doc, "status") and doc.status:
status_data = DocumentStatusSchema(
state=doc.status.get("state", "ready"),
reason=doc.status.get("reason"),
)
api_documents.append(
DocumentRead(
id=doc.id,
@ -273,6 +373,9 @@ async def read_documents(
created_at=doc.created_at,
updated_at=doc.updated_at,
search_space_id=doc.search_space_id,
created_by_id=doc.created_by_id,
created_by_name=created_by_name,
status=status_data,
)
)
@ -341,7 +444,11 @@ async def search_documents(
Permission.DOCUMENTS_READ.value,
"You don't have permission to read documents in this search space",
)
query = select(Document).filter(Document.search_space_id == search_space_id)
query = (
select(Document)
.options(selectinload(Document.created_by))
.filter(Document.search_space_id == search_space_id)
)
count_query = (
select(func.count())
.select_from(Document)
@ -351,6 +458,7 @@ async def search_documents(
# Get documents from all search spaces user has membership in
query = (
select(Document)
.options(selectinload(Document.created_by))
.join(SearchSpace)
.join(SearchSpaceMembership)
.filter(SearchSpaceMembership.user_id == user.id)
@ -395,6 +503,19 @@ async def search_documents(
# Convert database objects to API-friendly format
api_documents = []
for doc in db_documents:
# Get user name (display_name or email fallback)
created_by_name = None
if doc.created_by:
created_by_name = doc.created_by.display_name or doc.created_by.email
# Parse status from JSONB
status_data = None
if hasattr(doc, "status") and doc.status:
status_data = DocumentStatusSchema(
state=doc.status.get("state", "ready"),
reason=doc.status.get("reason"),
)
api_documents.append(
DocumentRead(
id=doc.id,
@ -407,6 +528,9 @@ async def search_documents(
created_at=doc.created_at,
updated_at=doc.updated_at,
search_space_id=doc.search_space_id,
created_by_id=doc.created_by_id,
created_by_name=created_by_name,
status=status_data,
)
)
@ -782,6 +906,7 @@ async def delete_document(
"""
Delete a document.
Requires DOCUMENTS_DELETE permission for the search space.
Documents in "pending" or "processing" state cannot be deleted.
"""
try:
result = await session.execute(
@ -794,6 +919,14 @@ async def delete_document(
status_code=404, detail=f"Document with id {document_id} not found"
)
# Check if document is pending or currently being processed
doc_state = document.status.get("state") if document.status else None
if doc_state in ("pending", "processing"):
raise HTTPException(
status_code=409, # Conflict
detail="Cannot delete document while it is pending or being processed. Please wait for processing to complete.",
)
# Check permission for the search space
await check_permission(
session,

View file

@ -230,6 +230,14 @@ async def delete_note(
if not document:
raise HTTPException(status_code=404, detail="Note not found")
# Check if note is pending or currently being processed
doc_state = document.status.get("state") if document.status else None
if doc_state in ("pending", "processing"):
raise HTTPException(
status_code=409,
detail="Cannot delete note while it is pending or being processed. Please wait for processing to complete.",
)
# Delete document (chunks will be cascade deleted)
await session.delete(document)
await session.commit()

View file

@ -2127,6 +2127,7 @@ async def run_google_gmail_indexing(
start_date: str | None,
end_date: str | None,
update_last_indexed: bool,
on_heartbeat_callback=None,
) -> tuple[int, str | None]:
# Use a reasonable default for max_messages
max_messages = 1000
@ -2139,6 +2140,7 @@ async def run_google_gmail_indexing(
end_date=end_date,
update_last_indexed=update_last_indexed,
max_messages=max_messages,
on_heartbeat_callback=on_heartbeat_callback,
)
# index_google_gmail_messages returns (int, str) but we need (int, str | None)
return indexed_count, error_message if error_message else None

View file

@ -11,6 +11,7 @@ from .documents import (
DocumentBase,
DocumentRead,
DocumentsCreate,
DocumentStatusSchema,
DocumentTitleRead,
DocumentTitleSearchResponse,
DocumentUpdate,
@ -104,6 +105,7 @@ __all__ = [
# Document schemas
"DocumentBase",
"DocumentRead",
"DocumentStatusSchema",
"DocumentTitleRead",
"DocumentTitleSearchResponse",
"DocumentUpdate",

View file

@ -41,6 +41,13 @@ class DocumentUpdate(DocumentBase):
pass
class DocumentStatusSchema(BaseModel):
    """Document processing status (state plus optional failure reason)."""

    # Free-form state string; expected values are
    # "ready", "pending", "processing" and "failed".
    state: str
    # Reason text accompanying the state; None when no reason is supplied.
    reason: str | None = None
class DocumentRead(BaseModel):
id: int
title: str
@ -53,6 +60,12 @@ class DocumentRead(BaseModel):
updated_at: datetime | None
search_space_id: int
created_by_id: UUID | None = None # User who created/uploaded this document
created_by_name: str | None = (
None # Display name or email of the user who created this document
)
status: DocumentStatusSchema | None = (
None # Processing status (ready, processing, failed)
)
model_config = ConfigDict(from_attributes=True)

View file

@ -982,7 +982,7 @@ class ConnectorService:
def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
channel_name = metadata.get("channel_name", "Unknown Channel")
message_date = metadata.get("start_date", "")
title = f"Slack: {channel_name}"
title = channel_name
if message_date:
title += f" ({message_date})"
return title
@ -1056,7 +1056,7 @@ class ConnectorService:
def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
page_title = metadata.get("page_title", "Untitled Page")
indexed_at = metadata.get("indexed_at", "")
title = f"Notion: {page_title}"
title = page_title
if indexed_at:
title += f" (indexed: {indexed_at})"
return title
@ -1366,9 +1366,9 @@ class ConnectorService:
issue_title = metadata.get("issue_title", "Untitled Issue")
issue_state = metadata.get("state", "")
title = (
f"Linear: {issue_identifier} - {issue_title}"
f"{issue_identifier} - {issue_title}"
if issue_identifier
else f"Linear: {issue_title}"
else issue_title
)
if issue_state:
title += f" ({issue_state})"
@ -1465,11 +1465,7 @@ class ConnectorService:
issue_key = metadata.get("issue_key", "")
issue_title = metadata.get("issue_title", "Untitled Issue")
status = metadata.get("status", "")
title = (
f"Jira: {issue_key} - {issue_title}"
if issue_key
else f"Jira: {issue_title}"
)
title = f"{issue_key} - {issue_title}" if issue_key else issue_title
if status:
title += f" ({status})"
return title
@ -1570,7 +1566,7 @@ class ConnectorService:
def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
event_summary = metadata.get("event_summary", "Untitled Event")
start_time = metadata.get("start_time", "")
title = f"Calendar: {event_summary}"
title = event_summary
if start_time:
title += f" ({start_time})"
return title
@ -1675,7 +1671,7 @@ class ConnectorService:
def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
record_id = metadata.get("record_id", "")
return f"Airtable Record: {record_id}" if record_id else "Airtable Record"
return record_id if record_id else "Airtable Record"
def _description_fn(
_chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
@ -1952,7 +1948,7 @@ class ConnectorService:
def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
page_title = metadata.get("page_title", "Untitled Page")
space_key = metadata.get("space_key", "")
title = f"Confluence: {page_title}"
title = page_title
if space_key:
title += f" ({space_key})"
return title
@ -2238,7 +2234,7 @@ class ConnectorService:
def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
channel_name = metadata.get("channel_name", "Unknown Channel")
message_date = metadata.get("start_date", "")
title = f"Discord: {channel_name}"
title = channel_name
if message_date:
title += f" ({message_date})"
return title
@ -2314,7 +2310,7 @@ class ConnectorService:
team_name = metadata.get("team_name", "Unknown Team")
channel_name = metadata.get("channel_name", "Unknown Channel")
message_date = metadata.get("start_date", "")
title = f"Teams: {team_name} - {channel_name}"
title = f"{team_name} - {channel_name}"
if message_date:
title += f" ({message_date})"
return title
@ -2387,11 +2383,7 @@ class ConnectorService:
def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
event_name = metadata.get("event_name", "Untitled Event")
start_time = metadata.get("start_time", "")
return (
f"Luma: {event_name} ({start_time})"
if start_time
else f"Luma: {event_name}"
)
return f"{event_name} ({start_time})" if start_time else event_name
def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
return metadata.get("event_url", "") or ""
@ -2651,7 +2643,7 @@ class ConnectorService:
def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
page_name = metadata.get("page_name", "Untitled Page")
return f"BookStack: {page_name}"
return page_name
def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
page_slug = metadata.get("page_slug", "")

View file

@ -537,6 +537,304 @@ async def _process_file_upload(
raise
@celery_app.task(name="process_file_upload_with_document", bind=True)
def process_file_upload_with_document_task(
    self,
    document_id: int,
    temp_path: str,
    filename: str,
    search_space_id: int,
    user_id: str,
):
    """
    Celery entry point for Phase 2 of the two-phase upload flow.

    Phase 1 (the API route) has already created a pending document; this task
    drives it through pending → processing → ready/failed by running the async
    processing coroutine on a dedicated event loop.

    If the uploaded temp file is no longer on disk, the document is marked as
    failed and the task returns without raising. Any exception raised by the
    processing coroutine is logged with its traceback and re-raised.

    Args:
        document_id: ID of the pending document created in Phase 1
        temp_path: Path to the uploaded file
        filename: Original filename
        search_space_id: ID of the search space
        user_id: ID of the user
    """
    import asyncio
    import os
    import traceback

    logger.info(
        f"[process_file_upload_with_document] Task started - document_id: {document_id}, "
        f"file: {filename}, search_space_id: {search_space_id}"
    )

    # Decide up front which coroutine this run will execute.
    file_is_missing = not os.path.exists(temp_path)
    if file_is_missing:
        logger.error(
            f"[process_file_upload_with_document] File does not exist: {temp_path}. "
            "The temp file may have been cleaned up before the task ran."
        )

    # One fresh loop per task run, always closed on the way out.
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)
    try:
        if file_is_missing:
            # Nothing to process: surface the problem on the document itself.
            event_loop.run_until_complete(
                _mark_document_failed(
                    document_id,
                    "File not found - temp file may have been cleaned up",
                )
            )
            return

        try:
            event_loop.run_until_complete(
                _process_file_with_document(
                    document_id, temp_path, filename, search_space_id, user_id
                )
            )
            logger.info(
                f"[process_file_upload_with_document] Task completed successfully for: {filename}"
            )
        except Exception as exc:
            logger.error(
                f"[process_file_upload_with_document] Task failed for {filename}: {exc}\n"
                f"Traceback:\n{traceback.format_exc()}"
            )
            raise
    finally:
        event_loop.close()
async def _mark_document_failed(document_id: int, reason: str):
    """
    Flag a document as failed when its task cannot proceed.

    Loads the document by primary key in a fresh Celery session; if it exists,
    stores a failed status carrying ``reason``, bumps ``updated_at`` and commits.
    A missing document is silently ignored.
    """
    from app.db import Document, DocumentStatus
    from app.tasks.document_processors.base import get_current_timestamp

    session_factory = get_celery_session_maker()
    async with session_factory() as session:
        target = await session.get(Document, document_id)
        if not target:
            # Nothing to update; the row is gone or was never created.
            return

        target.status = DocumentStatus.failed(reason)
        target.updated_at = get_current_timestamp()
        await session.commit()
        logger.info(f"Marked document {document_id} as failed: {reason}")
async def _process_file_with_document(
    document_id: int,
    temp_path: str,
    filename: str,
    search_space_id: int,
    user_id: str,
):
    """
    Process file and update existing pending document status.

    This function implements Phase 2 of the 2-phase document upload:
    - Sets document status to 'processing' (shows spinner in UI)
    - Processes the file (parsing, embedding, chunking)
    - Updates document to 'failed' on error or when a duplicate is reported
      (a falsy result from the processor is treated as a duplicate)

    The uploaded temp file is removed on every exit path, including the early
    return when the document row is missing and failures that happen while the
    notification / task log entry are being created.

    Args:
        document_id: ID of the pending document created in Phase 1
        temp_path: Path to the uploaded temp file (deleted before returning)
        filename: Original filename, used for logs and notifications
        search_space_id: ID of the search space
        user_id: ID of the user, as a UUID string

    Raises:
        Exception: Any error raised while processing is re-raised after the
            document has been marked as failed and notifications were updated.
    """
    import os

    from app.db import Document, DocumentStatus
    from app.tasks.document_processors.base import get_current_timestamp
    from app.tasks.document_processors.file_processors import (
        process_file_in_background_with_document,
    )

    logger.info(
        f"[_process_file_with_document] Starting async processing for: {filename}"
    )

    # Outer try/finally guarantees temp-file cleanup no matter where we exit.
    try:
        async with get_celery_session_maker()() as session:
            logger.info(
                f"[_process_file_with_document] Database session created for: {filename}"
            )
            task_logger = TaskLoggingService(session, search_space_id)

            # Get the document
            document = await session.get(Document, document_id)
            if not document:
                logger.error(f"Document {document_id} not found")
                return

            # Get file size for notification metadata
            try:
                file_size = os.path.getsize(temp_path)
                logger.info(
                    f"[_process_file_with_document] File size: {file_size} bytes"
                )
            except Exception as e:
                logger.warning(
                    f"[_process_file_with_document] Could not get file size: {e}"
                )
                file_size = None

            # Create notification for document processing
            logger.info(
                f"[_process_file_with_document] Creating notification for: {filename}"
            )
            notification = (
                await NotificationService.document_processing.notify_processing_started(
                    session=session,
                    user_id=UUID(user_id),
                    document_type="FILE",
                    document_name=filename,
                    search_space_id=search_space_id,
                    file_size=file_size,
                )
            )

            log_entry = await task_logger.log_task_start(
                task_name="process_file_upload_with_document",
                source="document_processor",
                message=f"Starting file processing for: {filename} (document_id: {document_id})",
                metadata={
                    "document_type": "FILE",
                    "document_id": document_id,
                    "filename": filename,
                    "file_path": temp_path,
                    "user_id": user_id,
                },
            )

            try:
                # Set status to PROCESSING (shows spinner in UI via ElectricSQL)
                document.status = DocumentStatus.processing()
                await session.commit()
                logger.info(
                    f"[_process_file_with_document] Document {document_id} status set to 'processing'"
                )

                # Process the file and update document
                result = await process_file_in_background_with_document(
                    document=document,
                    file_path=temp_path,
                    filename=filename,
                    search_space_id=search_space_id,
                    user_id=user_id,
                    session=session,
                    task_logger=task_logger,
                    log_entry=log_entry,
                    notification=notification,
                )

                # Update notification on success
                if result:
                    await (
                        NotificationService.document_processing.notify_processing_completed(
                            session=session,
                            notification=notification,
                            document_id=result.id,
                            chunks_count=None,
                        )
                    )
                    logger.info(
                        f"[_process_file_with_document] Successfully processed document {document_id}"
                    )
                else:
                    # Duplicate detected - mark as failed
                    document.status = DocumentStatus.failed(
                        "Duplicate content detected"
                    )
                    document.updated_at = get_current_timestamp()
                    await session.commit()
                    await (
                        NotificationService.document_processing.notify_processing_completed(
                            session=session,
                            notification=notification,
                            error_message="Document already exists (duplicate)",
                        )
                    )

            except Exception as e:
                # Import here to avoid circular dependencies
                from fastapi import HTTPException

                from app.services.page_limit_service import PageLimitExceededError

                # Check if this is a page limit error (raised directly or
                # wrapped as the cause of an HTTPException)
                page_limit_error: PageLimitExceededError | None = None
                if isinstance(e, PageLimitExceededError):
                    page_limit_error = e
                elif (
                    isinstance(e, HTTPException)
                    and e.__cause__
                    and isinstance(e.__cause__, PageLimitExceededError)
                ):
                    page_limit_error = e.__cause__

                # Mark document as failed (shows error in UI via ElectricSQL)
                # NOTE(review): if the failure left the session needing a rollback,
                # this commit may itself raise - confirm whether a rollback is required first.
                error_message = str(e)[:500]
                document.status = DocumentStatus.failed(error_message)
                document.updated_at = get_current_timestamp()
                await session.commit()
                logger.info(
                    f"[_process_file_with_document] Document {document_id} marked as failed: {error_message[:100]}"
                )

                # Handle page limit errors with dedicated notification
                if page_limit_error is not None:
                    try:
                        await session.refresh(notification)
                        await NotificationService.document_processing.notify_processing_completed(
                            session=session,
                            notification=notification,
                            error_message="Page limit exceeded",
                        )
                        await NotificationService.page_limit.notify_page_limit_exceeded(
                            session=session,
                            user_id=UUID(user_id),
                            document_name=filename,
                            document_type="FILE",
                            search_space_id=search_space_id,
                            pages_used=page_limit_error.pages_used,
                            pages_limit=page_limit_error.pages_limit,
                            pages_to_add=page_limit_error.pages_to_add,
                        )
                    except Exception as notif_error:
                        # Notification problems must not mask the original error.
                        logger.error(
                            f"Failed to create page limit notification: {notif_error!s}"
                        )
                else:
                    # Update notification on failure
                    try:
                        await session.refresh(notification)
                        await NotificationService.document_processing.notify_processing_completed(
                            session=session,
                            notification=notification,
                            error_message=str(e)[:100],
                        )
                    except Exception as notif_error:
                        logger.error(
                            f"Failed to update notification on failure: {notif_error!s}"
                        )

                await task_logger.log_task_failure(
                    log_entry,
                    error_message[:100],
                    str(e),
                    {"error_type": type(e).__name__, "document_id": document_id},
                )
                logger.error(f"Error processing file {filename}: {e!s}")
                raise
    finally:
        # Clean up temp file (best effort: a cleanup failure is only logged)
        if os.path.exists(temp_path):
            try:
                os.unlink(temp_path)
                logger.info(
                    f"[_process_file_with_document] Cleaned up temp file: {temp_path}"
                )
            except Exception as cleanup_error:
                logger.warning(
                    f"[_process_file_with_document] Failed to clean up temp file: {cleanup_error}"
                )
@celery_app.task(name="process_circleback_meeting", bind=True)
def process_circleback_meeting_task(
self,

View file

@ -4,33 +4,41 @@ This task runs periodically (every 5 minutes by default) to find notifications
that are stuck in "in_progress" status but don't have an active Redis heartbeat key.
These are marked as "failed" to prevent the frontend from showing a perpetual "syncing" state.
Additionally, it cleans up documents stuck in pending/processing state that belong
to connectors with stale notifications.
Detection mechanism:
- Active indexing tasks set a Redis key with TTL (2 minutes) as a heartbeat
- If the task crashes, the Redis key expires automatically
- This cleanup task checks for in-progress notifications without a Redis heartbeat key
- Such notifications are marked as failed with O(1) batch UPDATE
- Documents with pending/processing status for those connectors are also marked as failed
"""
import contextlib
import json
import logging
import os
from datetime import UTC, datetime
import redis
from sqlalchemy import and_, text
from sqlalchemy import and_, or_, text
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from sqlalchemy.future import select
from sqlalchemy.pool import NullPool
from app.celery_app import celery_app
from app.config import config
from app.db import Notification
from app.db import Document, DocumentStatus, Notification
logger = logging.getLogger(__name__)
# Redis client for checking heartbeats
_redis_client: redis.Redis | None = None
# Error message shown to users when sync is interrupted
STALE_SYNC_ERROR_MESSAGE = "Sync was interrupted unexpectedly. Please retry."
def get_redis_client() -> redis.Redis:
"""Get or create Redis client for heartbeat checking."""
@ -70,6 +78,7 @@ def cleanup_stale_indexing_notifications_task():
- Do NOT have a corresponding Redis heartbeat key (meaning task crashed)
And marks them as failed with O(1) batch UPDATE.
Also marks associated pending/processing documents as failed.
"""
import asyncio
@ -86,15 +95,20 @@ async def _cleanup_stale_notifications():
"""Find and mark stale connector indexing notifications as failed.
Uses Redis TTL-based detection:
1. Find all in-progress notifications
1. Find all in-progress notifications with their connector_id
2. Check which ones are missing their Redis heartbeat key
3. Mark those as failed with O(1) batch UPDATE using JSONB || operator
4. Mark associated documents (pending/processing) as failed
"""
async with get_celery_session_maker()() as session:
try:
# Find all in-progress connector indexing notifications
# Fetch full metadata to properly extract connector_id
result = await session.execute(
select(Notification.id).where(
select(
Notification.id,
Notification.notification_metadata,
).where(
and_(
Notification.type == "connector_indexing",
Notification.notification_metadata["status"].astext
@ -102,24 +116,37 @@ async def _cleanup_stale_notifications():
)
)
)
in_progress_ids = [row[0] for row in result.fetchall()]
in_progress_rows = result.fetchall()
if not in_progress_ids:
if not in_progress_rows:
logger.debug("No in-progress connector indexing notifications found")
return
# Check which ones are missing heartbeat keys in Redis
redis_client = get_redis_client()
stale_notification_ids = []
stale_connector_ids = []
for notification_id in in_progress_ids:
for row in in_progress_rows:
notification_id = row[0]
metadata = row[1] # Full metadata dict
heartbeat_key = _get_heartbeat_key(notification_id)
if not redis_client.exists(heartbeat_key):
stale_notification_ids.append(notification_id)
# Extract connector_id from metadata dict for document cleanup
if metadata and isinstance(metadata, dict):
connector_id = metadata.get("connector_id")
logger.debug(
f"Notification {notification_id} metadata: {metadata}, "
f"connector_id: {connector_id}"
)
if connector_id is not None:
with contextlib.suppress(ValueError, TypeError):
stale_connector_ids.append(int(connector_id))
if not stale_notification_ids:
logger.debug(
f"All {len(in_progress_ids)} in-progress notifications have active Redis heartbeats"
f"All {len(in_progress_rows)} in-progress notifications have active Redis heartbeats"
)
return
@ -127,18 +154,15 @@ async def _cleanup_stale_notifications():
f"Found {len(stale_notification_ids)} stale connector indexing notifications "
f"(no Redis heartbeat key): {stale_notification_ids}"
)
logger.info(f"Connector IDs for document cleanup: {stale_connector_ids}")
# O(1) Batch UPDATE using JSONB || operator
# O(1) Batch UPDATE notifications using JSONB || operator
# This merges the update data into existing notification_metadata
# Also updates title and message for proper UI display
error_message = (
"Something went wrong while syncing your content. Please retry."
)
update_data = {
"status": "failed",
"completed_at": datetime.now(UTC).isoformat(),
"error_message": error_message,
"error_message": STALE_SYNC_ERROR_MESSAGE,
"sync_stage": "failed",
}
@ -152,16 +176,96 @@ async def _cleanup_stale_notifications():
"""),
{
"update_json": json.dumps(update_data),
"display_message": f"{error_message}",
"display_message": STALE_SYNC_ERROR_MESSAGE,
"ids": stale_notification_ids,
},
)
await session.commit()
logger.info(
f"Successfully marked {len(stale_notification_ids)} stale notifications as failed (batch UPDATE)"
f"Successfully marked {len(stale_notification_ids)} stale notifications as failed"
)
# ===== Clean up stuck documents for stale connectors =====
if stale_connector_ids:
await _cleanup_stuck_documents(session, stale_connector_ids)
await session.commit()
except Exception as e:
logger.error(f"Error cleaning up stale notifications: {e!s}", exc_info=True)
await session.rollback()
async def _cleanup_stuck_documents(session, connector_ids: list[int]):
    """
    Fail every pending/processing document that belongs to the given connectors.

    Called after a connector sync was detected as interrupted, so documents the
    sync left half-processed show an explicit error instead of staying stuck.
    The affected ids are selected first (for logging only), then all matching
    rows are updated with a single UPDATE statement. The caller is responsible
    for committing.

    Errors are logged and swallowed on purpose so that notification cleanup can
    continue even if document cleanup fails.

    Args:
        session: Database session
        connector_ids: List of connector IDs whose documents should be cleaned up
    """
    if not connector_ids:
        return

    try:
        # Predicate for "still in flight" documents of the affected connectors.
        state_column = Document.status["state"].astext
        in_flight = or_(
            state_column == DocumentStatus.PENDING,
            state_column == DocumentStatus.PROCESSING,
        )
        lookup = select(Document.id).where(
            and_(Document.connector_id.in_(connector_ids), in_flight)
        )

        # Collect affected ids up front; they are only used for log output.
        lookup_result = await session.execute(lookup)
        stuck_doc_ids = [row[0] for row in lookup_result.fetchall()]
        stuck_total = len(stuck_doc_ids)

        if stuck_total == 0:
            logger.debug(f"No stuck documents found for connector IDs: {connector_ids}")
            return

        logger.warning(
            f"Found {stuck_total} stuck documents (pending/processing) "
            f"for connector IDs {connector_ids}: {stuck_doc_ids[:20]}..."  # Log first 20
        )

        # Single batch UPDATE; the error text matches the notification message.
        update_params = {
            "failed_status": json.dumps(
                DocumentStatus.failed(STALE_SYNC_ERROR_MESSAGE)
            ),
            "now": datetime.now(UTC),
            "connector_ids": connector_ids,
            "pending_state": DocumentStatus.PENDING,
            "processing_state": DocumentStatus.PROCESSING,
        }
        await session.execute(
            text("""
                UPDATE documents
                SET status = CAST(:failed_status AS jsonb),
                    updated_at = :now
                WHERE connector_id = ANY(:connector_ids)
                  AND (
                      status->>'state' = :pending_state
                      OR status->>'state' = :processing_state
                  )
            """),
            update_params,
        )

        logger.info(
            f"Successfully marked {stuck_total} stuck documents as failed "
            f"for connector IDs: {connector_ids}"
        )

    except Exception as e:
        logger.error(
            f"Error cleaning up stuck documents for connectors {connector_ids}: {e!s}",
            exc_info=True,
        )
        # Don't raise - let the notification cleanup continue even if document cleanup fails

View file

@ -1,5 +1,9 @@
"""
Airtable connector indexer.
Implements real-time document status updates using a two-phase approach:
- Phase 1: Create all documents with PENDING status (visible in UI immediately)
- Phase 2: Process each document one by one (pending → processing → ready/failed)
"""
import time
@ -10,7 +14,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.airtable_history import AirtableHistoryConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
@ -27,6 +31,7 @@ from .base import (
get_connector_by_id,
get_current_timestamp,
logger,
safe_set_chunks,
update_connector_last_indexed,
)
@ -134,24 +139,32 @@ async def index_airtable_records(
await task_logger.log_task_success(
log_entry, success_msg, {"bases_count": 0}
)
return 0, success_msg
# CRITICAL: Update timestamp even when no bases found so Electric SQL syncs
await update_connector_last_indexed(
session, connector, update_last_indexed
)
await session.commit()
return 0, None # Return None (not error) when no items found
logger.info(f"Found {len(bases)} Airtable bases to process")
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
total_documents_indexed = 0
# Process each base
# Track overall statistics
documents_indexed = 0
documents_skipped = 0
documents_failed = 0
duplicate_content_count = 0
# =======================================================================
# PHASE 1: Collect all records and create pending documents
# This makes ALL documents visible in the UI immediately with pending status
# =======================================================================
records_to_process = [] # List of dicts with document and record data
new_documents_created = False
for base in bases:
# Check if it's time for a heartbeat update
if (
on_heartbeat_callback
and (time.time() - last_heartbeat_time)
>= HEARTBEAT_INTERVAL_SECONDS
):
await on_heartbeat_callback(total_documents_indexed)
last_heartbeat_time = time.time()
base_id = base.get("id")
base_name = base.get("name", "Unknown Base")
@ -201,7 +214,6 @@ async def index_airtable_records(
max_records=max_records,
)
)
else:
# Fetch all records
records, records_error = airtable_connector.get_all_records(
@ -222,21 +234,14 @@ async def index_airtable_records(
logger.info(f"Found {len(records)} records in table {table_name}")
documents_indexed = 0
skipped_messages = []
documents_skipped = 0
# Process each record
# Phase 1: Analyze each record and create pending documents
for record in records:
# Check if it's time for a heartbeat update
if (
on_heartbeat_callback
and (time.time() - last_heartbeat_time)
>= HEARTBEAT_INTERVAL_SECONDS
):
await on_heartbeat_callback(total_documents_indexed)
last_heartbeat_time = time.time()
try:
record_id = record.get("id", "")
if not record_id:
documents_skipped += 1
continue
# Generate markdown content
markdown_content = (
airtable_connector.format_record_to_markdown(
@ -246,16 +251,11 @@ async def index_airtable_records(
if not markdown_content.strip():
logger.warning(
f"Skipping message with no content: {record.get('id')}"
)
skipped_messages.append(
f"{record.get('id')} (no content)"
f"Skipping record with no content: {record_id}"
)
documents_skipped += 1
continue
record_id = record.get("id", "Unknown")
# Generate unique identifier hash for this Airtable record
unique_identifier_hash = generate_unique_identifier_hash(
DocumentType.AIRTABLE_CONNECTOR,
@ -278,77 +278,30 @@ async def index_airtable_records(
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
logger.info(
f"Document for Airtable record {record_id} unchanged. Skipping."
)
# Ensure status is ready (might have been stuck in processing/pending)
if not DocumentStatus.is_state(
existing_document.status, DocumentStatus.READY
):
existing_document.status = (
DocumentStatus.ready()
)
documents_skipped += 1
continue
else:
# Content has changed - update the existing document
logger.info(
f"Content changed for Airtable record {record_id}. Updating document."
)
# Generate document summary
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"record_id": record_id,
"created_time": record.get(
"CREATED_TIME()", ""
),
"document_type": "Airtable Record",
"connector_type": "Airtable",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
markdown_content,
user_llm,
document_metadata,
)
else:
summary_content = (
f"Airtable Record: {record_id}\n\n"
)
summary_embedding = (
config.embedding_model_instance.embed(
summary_content
)
)
# Process chunks
chunks = await create_document_chunks(
markdown_content
)
# Update existing document
existing_document.title = (
f"Airtable Record: {record_id}"
)
existing_document.content = summary_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = {
# Queue existing document for update (will be set to processing in Phase 2)
records_to_process.append(
{
"document": existing_document,
"is_new": False,
"markdown_content": markdown_content,
"content_hash": content_hash,
"record_id": record_id,
"created_time": record.get(
"CREATED_TIME()", ""
),
"record": record,
"base_name": base_name,
"table_name": table_name,
}
existing_document.chunks = chunks
existing_document.updated_at = (
get_current_timestamp()
)
documents_indexed += 1
logger.info(
f"Successfully updated Airtable record {record_id}"
)
continue
)
continue
# Document doesn't exist by unique_identifier_hash
# Check if a document with the same content_hash exists (from another connector)
@ -365,123 +318,210 @@ async def index_airtable_records(
f"(existing document ID: {duplicate_by_content.id}, "
f"type: {duplicate_by_content.document_type}). Skipping."
)
duplicate_content_count += 1
documents_skipped += 1
continue
# Document doesn't exist - create new one
# Generate document summary
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"record_id": record_id,
"created_time": record.get("CREATED_TIME()", ""),
"document_type": "Airtable Record",
"connector_type": "Airtable",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
markdown_content, user_llm, document_metadata
)
else:
# Fallback to simple summary if no LLM configured
summary_content = f"Airtable Record: {record_id}\n\n"
summary_embedding = (
config.embedding_model_instance.embed(
summary_content
)
)
# Process chunks
chunks = await create_document_chunks(markdown_content)
# Create and store new document
logger.info(
f"Creating new document for Airtable record: {record_id}"
)
# Create new document with PENDING status (visible in UI immediately)
document = Document(
search_space_id=search_space_id,
title=f"Airtable Record: {record_id}",
title=record_id,
document_type=DocumentType.AIRTABLE_CONNECTOR,
document_metadata={
"record_id": record_id,
"created_time": record.get("CREATED_TIME()", ""),
"base_name": base_name,
"table_name": table_name,
"connector_id": connector_id,
},
content=summary_content,
content_hash=content_hash,
content="Pending...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
embedding=None,
chunks=[], # Empty at creation - safe for async
status=DocumentStatus.pending(), # Pending until processing starts
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
session.add(document)
documents_indexed += 1
logger.info(
f"Successfully indexed new Airtable record {summary_content}"
)
new_documents_created = True
# Batch commit every 10 documents
if documents_indexed % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed} Airtable records processed so far"
)
await session.commit()
records_to_process.append(
{
"document": document,
"is_new": True,
"markdown_content": markdown_content,
"content_hash": content_hash,
"record_id": record_id,
"record": record,
"base_name": base_name,
"table_name": table_name,
}
)
except Exception as e:
logger.error(
f"Error processing the Airtable record {record.get('id', 'Unknown')}: {e!s}",
exc_info=True,
f"Error in Phase 1 for record: {e!s}", exc_info=True
)
skipped_messages.append(
f"{record.get('id', 'Unknown')} (processing error)"
)
documents_skipped += 1
continue # Skip this message and continue with others
documents_failed += 1
continue
# Accumulate total processed across all tables
total_processed += documents_indexed
# Commit all pending documents - they all appear in UI now
if new_documents_created:
logger.info(
f"Phase 1: Committing {len([r for r in records_to_process if r['is_new']])} pending documents"
)
await session.commit()
# Final commit for any remaining documents not yet committed in batches
if documents_indexed > 0:
# =======================================================================
# PHASE 2: Process each document one by one
# Each document transitions: pending → processing → ready/failed
# =======================================================================
logger.info(f"Phase 2: Processing {len(records_to_process)} documents")
for item in records_to_process:
# Send heartbeat periodically
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time
document = item["document"]
try:
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
document.status = DocumentStatus.processing()
await session.commit()
# Heavy processing (LLM, embeddings, chunks)
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata_for_summary = {
"record_id": item["record_id"],
"created_time": item["record"].get("CREATED_TIME()", ""),
"document_type": "Airtable Record",
"connector_type": "Airtable",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
item["markdown_content"],
user_llm,
document_metadata_for_summary,
)
else:
# Fallback to simple summary if no LLM configured
summary_content = f"Airtable Record: {item['record_id']}\n\n"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(item["markdown_content"])
# Update document to READY with actual content
document.title = item["record_id"]
document.content = summary_content
document.content_hash = item["content_hash"]
document.embedding = summary_embedding
document.document_metadata = {
"record_id": item["record_id"],
"created_time": item["record"].get("CREATED_TIME()", ""),
"base_name": item["base_name"],
"table_name": item["table_name"],
"connector_id": connector_id,
}
safe_set_chunks(document, chunks)
document.updated_at = get_current_timestamp()
document.status = DocumentStatus.ready()
documents_indexed += 1
# Batch commit every 10 documents (for ready status updates)
if documents_indexed % 10 == 0:
logger.info(
f"Final commit for table {table_name}: {documents_indexed} Airtable records processed"
f"Committing batch: {documents_indexed} Airtable records processed so far"
)
await session.commit()
logger.info(
f"Successfully committed all Airtable document changes for table {table_name}"
)
# Update the last_indexed_at timestamp for the connector only if requested
# (after all tables in all bases are processed)
if total_processed > 0:
await update_connector_last_indexed(
session, connector, update_last_indexed
except Exception as e:
logger.error(
f"Error processing Airtable record: {e!s}", exc_info=True
)
# Mark document as failed with reason (visible in UI)
try:
document.status = DocumentStatus.failed(str(e))
document.updated_at = get_current_timestamp()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
documents_failed += 1
continue
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
await update_connector_last_indexed(session, connector, update_last_indexed)
total_processed = documents_indexed
# Final commit to ensure all documents are persisted (safety net)
logger.info(
f"Final commit: Total {documents_indexed} Airtable records processed"
)
try:
await session.commit()
logger.info(
"Successfully committed all Airtable document changes to database"
)
except Exception as e:
# Handle any remaining integrity errors gracefully (race conditions, etc.)
if (
"duplicate key value violates unique constraint" in str(e).lower()
or "uniqueviolationerror" in str(e).lower()
):
logger.warning(
f"Duplicate content_hash detected during final commit. "
f"This may occur if the same record was indexed by multiple connectors. "
f"Rolling back and continuing. Error: {e!s}"
)
await session.rollback()
# Don't fail the entire task - some documents may have been successfully indexed
else:
raise
# Build warning message if there were issues
warning_parts = []
if duplicate_content_count > 0:
warning_parts.append(f"{duplicate_content_count} duplicate")
if documents_failed > 0:
warning_parts.append(f"{documents_failed} failed")
warning_message = ", ".join(warning_parts) if warning_parts else None
# Log success after processing all bases and tables
await task_logger.log_task_success(
log_entry,
f"Successfully completed Airtable indexing for connector {connector_id}",
{
"events_processed": total_processed,
"documents_indexed": total_processed,
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
"documents_failed": documents_failed,
"duplicate_content_count": duplicate_content_count,
},
)
logger.info(
f"Airtable indexing completed: {total_processed} total records processed"
f"Airtable indexing completed: {documents_indexed} ready, "
f"{documents_skipped} skipped, {documents_failed} failed "
f"({duplicate_content_count} duplicate content)"
)
return (
total_processed,
None,
) # Return None as the error message to indicate success
warning_message,
)
except Exception as e:
logger.error(

View file

@ -28,6 +28,35 @@ def get_current_timestamp() -> datetime:
return datetime.now(UTC)
def safe_set_chunks(document: Document, chunks: list) -> None:
    """
    Assign chunks to a document without triggering a lazy load, and make
    sure the new chunks are registered with the document's session.

    ALWAYS use this instead of `document.chunks = chunks` on documents that
    were loaded from the database, to avoid SQLAlchemy async errors
    (MissingGreenlet / greenlet_spawn).

    Why this is needed:
    - Direct assignment `document.chunks = chunks` on a persistent document
      makes SQLAlchemy load the OLD chunks first (for comparison/orphan
      detection), and that lazy load fails in async context with asyncpg.
    - `set_committed_value` bypasses the load by installing the collection
      directly, but it also records NO attribute history, so the unit of
      work does not treat the new chunks as additions. They therefore have
      to be linked and added to the session explicitly, otherwise they are
      never flushed.

    Behaviour by document state:
    - No identity yet (transient/pending): plain assignment is used. There is
      no database row to lazy-load from, so it is safe and keeps the normal
      relationship tracking/cascade.
    - Persistent: the collection is installed with `set_committed_value`,
      each chunk gets the document's primary key as its foreign key, and the
      chunks are added to the owning session.
    - Detached (identity but no session): only the collection is installed;
      there is no session to register the chunks with.

    NOTE(review): chunk rows previously linked to the document are not
    touched here; presumably the caller or the database takes care of
    removing stale chunks - confirm.

    Args:
        document: The Document object to update
        chunks: List of Chunk objects to assign

    Example:
        # Instead of: document.chunks = chunks  (DANGEROUS on loaded docs!)
        safe_set_chunks(document, chunks)  # Always safe
    """
    from sqlalchemy import inspect as sa_inspect
    from sqlalchemy.orm.attributes import set_committed_value

    state = sa_inspect(document)

    if state.key is None:
        # Never flushed: nothing exists in the database to lazy-load, so the
        # ordinary (fully tracked) assignment is safe and lets the normal
        # cascade persist the chunks together with the document.
        document.chunks = chunks
        return

    # Persistent/detached document: install the collection without emitting
    # a load of the previous chunks.
    set_committed_value(document, "chunks", chunks)

    session = state.session
    if session is None:
        return

    # `identity` is the primary-key tuple kept on the instance state; reading
    # it never emits SQL, unlike attribute access on an expired instance.
    identity = state.identity
    if identity is not None:
        document_pk = identity[0]
        for chunk in chunks:
            # assumes Chunk exposes its foreign key to the document as
            # `document_id` - TODO confirm against the Chunk model
            chunk.document_id = document_pk

    # set_committed_value fired no cascade events, so attach the chunks to
    # the session ourselves to get them INSERTed on the next flush.
    session.add_all(chunks)
def parse_date_flexible(date_str: str) -> datetime:
"""
Parse date from multiple common formats.

View file

@ -1,5 +1,9 @@
"""
BookStack connector indexer.
Implements 2-phase document status updates for real-time UI feedback:
- Phase 1: Collect all pages and create pending documents (visible in UI immediately)
- Phase 2: Process each page: pending → processing → ready/failed
"""
import time
@ -11,7 +15,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.bookstack_connector import BookStackConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
@ -28,6 +32,7 @@ from .base import (
get_connector_by_id,
get_current_timestamp,
logger,
safe_set_chunks,
update_connector_last_indexed,
)
@ -184,22 +189,22 @@ async def index_bookstack_pages(
logger.error(f"Error fetching BookStack pages: {e!s}", exc_info=True)
return 0, f"Error fetching BookStack pages: {e!s}"
# Process and index each page
# =======================================================================
# PHASE 1: Analyze all pages, create pending documents
# This makes ALL documents visible in the UI immediately with pending status
# =======================================================================
documents_indexed = 0
skipped_pages = []
documents_skipped = 0
documents_failed = 0
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
pages_to_process = [] # List of dicts with document and page data
new_documents_created = False
for page in pages:
# Check if it's time for a heartbeat update
if (
on_heartbeat_callback
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
):
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
try:
page_id = page.get("id")
page_name = page.get("name", "")
@ -218,7 +223,7 @@ async def index_bookstack_pages(
# Fetch full page content (Markdown preferred)
try:
page_detail, page_content = bookstack_client.get_page_with_content(
_, page_content = bookstack_client.get_page_with_content(
page_id, use_markdown=True
)
except Exception as e:
@ -252,82 +257,38 @@ async def index_bookstack_pages(
# Build page URL
page_url = f"{bookstack_base_url}/books/{book_slug}/page/{page_slug}"
# Build document metadata
doc_metadata = {
"page_id": page_id,
"page_name": page_name,
"page_slug": page_slug,
"book_id": book_id,
"book_slug": book_slug,
"chapter_id": chapter_id,
"base_url": bookstack_base_url,
"page_url": page_url,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
}
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
# Ensure status is ready (might have been stuck in processing/pending)
if not DocumentStatus.is_state(
existing_document.status, DocumentStatus.READY
):
existing_document.status = DocumentStatus.ready()
logger.info(
f"Document for BookStack page {page_name} unchanged. Skipping."
)
documents_skipped += 1
continue
else:
# Content has changed - update the existing document
logger.info(
f"Content changed for BookStack page {page_name}. Updating document."
)
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
summary_metadata = {
"page_name": page_name,
"page_id": page_id,
"book_id": book_id,
"document_type": "BookStack Page",
"connector_type": "BookStack",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
full_content, user_llm, summary_metadata
)
else:
summary_content = (
f"BookStack Page: {page_name}\n\nBook ID: {book_id}\n\n"
)
if page_content:
content_preview = page_content[:1000]
if len(page_content) > 1000:
content_preview += "..."
summary_content += (
f"Content Preview: {content_preview}\n\n"
)
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
# Process chunks
chunks = await create_document_chunks(full_content)
# Update existing document
existing_document.title = f"BookStack - {page_name}"
existing_document.content = summary_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = doc_metadata
existing_document.chunks = chunks
existing_document.updated_at = get_current_timestamp()
documents_indexed += 1
logger.info(f"Successfully updated BookStack page {page_name}")
continue
# Queue existing document for update (will be set to processing in Phase 2)
pages_to_process.append(
{
"document": existing_document,
"is_new": False,
"page_id": page_id,
"page_name": page_name,
"page_slug": page_slug,
"book_id": book_id,
"book_slug": book_slug,
"chapter_id": chapter_id,
"page_url": page_url,
"page_content": page_content,
"full_content": full_content,
"content_hash": content_hash,
}
)
continue
# Document doesn't exist by unique_identifier_hash
# Check if a document with the same content_hash exists (from another connector)
@ -345,17 +306,108 @@ async def index_bookstack_pages(
documents_skipped += 1
continue
# Document doesn't exist - create new one
# Generate summary with metadata
# Create new document with PENDING status (visible in UI immediately)
document = Document(
search_space_id=search_space_id,
title=page_name,
document_type=DocumentType.BOOKSTACK_CONNECTOR,
document_metadata={
"page_id": page_id,
"page_name": page_name,
"page_slug": page_slug,
"book_id": book_id,
"book_slug": book_slug,
"chapter_id": chapter_id,
"base_url": bookstack_base_url,
"page_url": page_url,
"connector_id": connector_id,
},
content="Pending...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=None,
chunks=[], # Empty at creation - safe for async
status=DocumentStatus.pending(), # Pending until processing starts
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
session.add(document)
new_documents_created = True
pages_to_process.append(
{
"document": document,
"is_new": True,
"page_id": page_id,
"page_name": page_name,
"page_slug": page_slug,
"book_id": book_id,
"book_slug": book_slug,
"chapter_id": chapter_id,
"page_url": page_url,
"page_content": page_content,
"full_content": full_content,
"content_hash": content_hash,
}
)
except Exception as e:
logger.error(f"Error in Phase 1 for page: {e!s}", exc_info=True)
documents_failed += 1
continue
# Commit all pending documents - they all appear in UI now
if new_documents_created:
logger.info(
f"Phase 1: Committing {len([p for p in pages_to_process if p['is_new']])} pending documents"
)
await session.commit()
# =======================================================================
# PHASE 2: Process each document one by one
# Each document transitions: pending → processing → ready/failed
# =======================================================================
logger.info(f"Phase 2: Processing {len(pages_to_process)} documents")
for item in pages_to_process:
# Send heartbeat periodically
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time
document = item["document"]
try:
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
document.status = DocumentStatus.processing()
await session.commit()
# Heavy processing (LLM, embeddings, chunks)
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
# Build document metadata
doc_metadata = {
"page_id": item["page_id"],
"page_name": item["page_name"],
"page_slug": item["page_slug"],
"book_id": item["book_id"],
"book_slug": item["book_slug"],
"chapter_id": item["chapter_id"],
"base_url": bookstack_base_url,
"page_url": item["page_url"],
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"connector_id": connector_id,
}
if user_llm:
summary_metadata = {
"page_name": page_name,
"page_id": page_id,
"book_id": book_id,
"page_name": item["page_name"],
"page_id": item["page_id"],
"book_id": item["book_id"],
"document_type": "BookStack Page",
"connector_type": "BookStack",
}
@ -363,17 +415,15 @@ async def index_bookstack_pages(
summary_content,
summary_embedding,
) = await generate_document_summary(
full_content, user_llm, summary_metadata
item["full_content"], user_llm, summary_metadata
)
else:
# Fallback to simple summary if no LLM configured
summary_content = (
f"BookStack Page: {page_name}\n\nBook ID: {book_id}\n\n"
)
if page_content:
summary_content = f"BookStack Page: {item['page_name']}\n\nBook ID: {item['book_id']}\n\n"
if item["page_content"]:
# Take first 1000 characters of content for summary
content_preview = page_content[:1000]
if len(page_content) > 1000:
content_preview = item["page_content"][:1000]
if len(item["page_content"]) > 1000:
content_preview += "..."
summary_content += f"Content Preview: {content_preview}\n\n"
summary_embedding = config.embedding_model_instance.embed(
@ -381,30 +431,21 @@ async def index_bookstack_pages(
)
# Process chunks - using the full page content
chunks = await create_document_chunks(full_content)
chunks = await create_document_chunks(item["full_content"])
# Create and store new document
logger.info(f"Creating new document for page {page_name}")
document = Document(
search_space_id=search_space_id,
title=f"BookStack - {page_name}",
document_type=DocumentType.BOOKSTACK_CONNECTOR,
document_metadata=doc_metadata,
content=summary_content,
content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
# Update document to READY with actual content
document.title = item["page_name"]
document.content = summary_content
document.content_hash = item["content_hash"]
document.embedding = summary_embedding
document.document_metadata = doc_metadata
safe_set_chunks(document, chunks)
document.updated_at = get_current_timestamp()
document.status = DocumentStatus.ready()
session.add(document)
documents_indexed += 1
logger.info(f"Successfully indexed new page {page_name}")
# Batch commit every 10 documents
# Batch commit every 10 documents (for ready status updates)
if documents_indexed % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed} BookStack pages processed so far"
@ -413,46 +454,76 @@ async def index_bookstack_pages(
except Exception as e:
logger.error(
f"Error processing page {page.get('name', 'Unknown')}: {e!s}",
f"Error processing page {item.get('page_name', 'Unknown')}: {e!s}",
exc_info=True,
)
# Mark document as failed with reason (visible in UI)
try:
document.status = DocumentStatus.failed(str(e))
document.updated_at = get_current_timestamp()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
skipped_pages.append(
f"{page.get('name', 'Unknown')} (processing error)"
f"{item.get('page_name', 'Unknown')} (processing error)"
)
documents_skipped += 1
continue # Skip this page and continue with others
documents_failed += 1
continue
# Update the last_indexed_at timestamp for the connector only if requested
total_processed = documents_indexed
if update_last_indexed:
await update_connector_last_indexed(session, connector, update_last_indexed)
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
# This ensures the UI shows "Last indexed" instead of "Never indexed"
await update_connector_last_indexed(session, connector, update_last_indexed)
# Final commit for any remaining documents not yet committed in batches
logger.info(
f"Final commit: Total {documents_indexed} BookStack pages processed"
)
await session.commit()
logger.info("Successfully committed all BookStack document changes to database")
try:
await session.commit()
logger.info(
"Successfully committed all BookStack document changes to database"
)
except Exception as e:
# Handle any remaining integrity errors gracefully (race conditions, etc.)
if (
"duplicate key value violates unique constraint" in str(e).lower()
or "uniqueviolationerror" in str(e).lower()
):
logger.warning(
f"Duplicate content_hash detected during final commit. "
f"This may occur if the same page was indexed by multiple connectors. "
f"Rolling back and continuing. Error: {e!s}"
)
await session.rollback()
# Don't fail the entire task - some documents may have been successfully indexed
else:
raise
# Build warning message if there were issues
warning_parts = []
if documents_failed > 0:
warning_parts.append(f"{documents_failed} failed")
warning_message = ", ".join(warning_parts) if warning_parts else None
# Log success
await task_logger.log_task_success(
log_entry,
f"Successfully completed BookStack indexing for connector {connector_id}",
{
"pages_processed": total_processed,
"pages_processed": documents_indexed,
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
"documents_failed": documents_failed,
"skipped_pages_count": len(skipped_pages),
},
)
logger.info(
f"BookStack indexing completed: {documents_indexed} new pages, {documents_skipped} skipped"
f"BookStack indexing completed: {documents_indexed} ready, "
f"{documents_skipped} skipped, {documents_failed} failed"
)
return (
total_processed,
None,
) # Return None as the error message to indicate success
return documents_indexed, warning_message
except SQLAlchemyError as db_error:
await session.rollback()

View file

@ -1,5 +1,9 @@
"""
ClickUp connector indexer.
Implements 2-phase document status updates for real-time UI feedback:
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
- Phase 2: Process each document: pending → processing → ready/failed
"""
import contextlib
@ -12,7 +16,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.clickup_history import ClickUpHistoryConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
@ -28,6 +32,7 @@ from .base import (
get_connector_by_id,
get_current_timestamp,
logger,
safe_set_chunks,
update_connector_last_indexed,
)
@ -141,10 +146,18 @@ async def index_clickup_tasks(
documents_indexed = 0
documents_skipped = 0
documents_failed = 0
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
# =======================================================================
# PHASE 1: Collect all tasks and create pending documents
# This makes ALL documents visible in the UI immediately with pending status
# =======================================================================
tasks_to_process = [] # List of dicts with document and task data
new_documents_created = False
# Iterate workspaces and fetch tasks
for workspace in workspaces:
workspace_id = workspace.get("id")
@ -183,15 +196,6 @@ async def index_clickup_tasks(
)
for task in tasks:
# Check if it's time for a heartbeat update
if (
on_heartbeat_callback
and (time.time() - last_heartbeat_time)
>= HEARTBEAT_INTERVAL_SECONDS
):
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
try:
task_id = task.get("id")
task_name = task.get("name", "Untitled Task")
@ -255,73 +259,38 @@ async def index_clickup_tasks(
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
# Ensure status is ready (might have been stuck in processing/pending)
if not DocumentStatus.is_state(
existing_document.status, DocumentStatus.READY
):
existing_document.status = DocumentStatus.ready()
logger.info(
f"Document for ClickUp task {task_name} unchanged. Skipping."
)
documents_skipped += 1
continue
else:
# Content has changed - update the existing document
# Queue existing document for update (will be set to processing in Phase 2)
logger.info(
f"Content changed for ClickUp task {task_name}. Updating document."
f"Content changed for ClickUp task {task_name}. Queuing for update."
)
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
tasks_to_process.append(
{
"document": existing_document,
"is_new": False,
"task_content": task_content,
"content_hash": content_hash,
"task_id": task_id,
"task_name": task_name,
"task_status": task_status,
"task_priority": task_priority,
"task_list": task_list_name,
"task_space": task_space_name,
"assignees": len(task_assignees),
"document_type": "ClickUp Task",
"connector_type": "ClickUp",
"task_list_name": task_list_name,
"task_space_name": task_space_name,
"task_assignees": task_assignees,
"task_due_date": task_due_date,
"task_created": task_created,
"task_updated": task_updated,
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
task_content, user_llm, document_metadata
)
else:
summary_content = task_content
summary_embedding = (
config.embedding_model_instance.embed(task_content)
)
# Process chunks
chunks = await create_document_chunks(task_content)
# Update existing document
existing_document.title = f"Task - {task_name}"
existing_document.content = summary_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = {
"task_id": task_id,
"task_name": task_name,
"task_status": task_status,
"task_priority": task_priority,
"task_assignees": task_assignees,
"task_due_date": task_due_date,
"task_created": task_created,
"task_updated": task_updated,
"indexed_at": datetime.now().strftime(
"%Y-%m-%d %H:%M:%S"
),
}
existing_document.chunks = chunks
existing_document.updated_at = get_current_timestamp()
documents_indexed += 1
logger.info(
f"Successfully updated ClickUp task {task_name}"
)
continue
@ -341,42 +310,10 @@ async def index_clickup_tasks(
documents_skipped += 1
continue
# Document doesn't exist - create new one
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"task_id": task_id,
"task_name": task_name,
"task_status": task_status,
"task_priority": task_priority,
"task_list": task_list_name,
"task_space": task_space_name,
"assignees": len(task_assignees),
"document_type": "ClickUp Task",
"connector_type": "ClickUp",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
task_content, user_llm, document_metadata
)
else:
# Fallback to simple summary if no LLM configured
summary_content = task_content
summary_embedding = config.embedding_model_instance.embed(
task_content
)
chunks = await create_document_chunks(task_content)
# Create new document with PENDING status (visible in UI immediately)
document = Document(
search_space_id=search_space_id,
title=f"Task - {task_name}",
title=task_name,
document_type=DocumentType.CLICKUP_CONNECTOR,
document_metadata={
"task_id": task_id,
@ -387,44 +324,180 @@ async def index_clickup_tasks(
"task_due_date": task_due_date,
"task_created": task_created,
"task_updated": task_updated,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"connector_id": connector_id,
},
content=summary_content,
content_hash=content_hash,
content="Pending...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
embedding=None,
chunks=[], # Empty at creation - safe for async
status=DocumentStatus.pending(), # Pending until processing starts
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
session.add(document)
documents_indexed += 1
logger.info(f"Successfully indexed new task {task_name}")
new_documents_created = True
# Batch commit every 10 documents
if documents_indexed % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed} ClickUp tasks processed so far"
)
await session.commit()
tasks_to_process.append(
{
"document": document,
"is_new": True,
"task_content": task_content,
"content_hash": content_hash,
"task_id": task_id,
"task_name": task_name,
"task_status": task_status,
"task_priority": task_priority,
"task_list_name": task_list_name,
"task_space_name": task_space_name,
"task_assignees": task_assignees,
"task_due_date": task_due_date,
"task_created": task_created,
"task_updated": task_updated,
}
)
except Exception as e:
logger.error(
f"Error processing task {task.get('name', 'Unknown')}: {e!s}",
f"Error in Phase 1 for task {task.get('name', 'Unknown')}: {e!s}",
exc_info=True,
)
documents_skipped += 1
documents_failed += 1
continue
# Commit all pending documents - they all appear in UI now
if new_documents_created:
logger.info(
f"Phase 1: Committing {len([t for t in tasks_to_process if t['is_new']])} pending documents"
)
await session.commit()
# =======================================================================
# PHASE 2: Process each document one by one
# Each document transitions: pending → processing → ready/failed
# =======================================================================
logger.info(f"Phase 2: Processing {len(tasks_to_process)} documents")
for item in tasks_to_process:
# Send heartbeat periodically
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time
document = item["document"]
try:
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
document.status = DocumentStatus.processing()
await session.commit()
# Heavy processing (LLM, embeddings, chunks)
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata_for_summary = {
"task_id": item["task_id"],
"task_name": item["task_name"],
"task_status": item["task_status"],
"task_priority": item["task_priority"],
"task_list": item["task_list_name"],
"task_space": item["task_space_name"],
"assignees": len(item["task_assignees"]),
"document_type": "ClickUp Task",
"connector_type": "ClickUp",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
item["task_content"], user_llm, document_metadata_for_summary
)
else:
summary_content = item["task_content"]
summary_embedding = config.embedding_model_instance.embed(
item["task_content"]
)
chunks = await create_document_chunks(item["task_content"])
# Update document to READY with actual content
document.title = item["task_name"]
document.content = summary_content
document.content_hash = item["content_hash"]
document.embedding = summary_embedding
document.document_metadata = {
"task_id": item["task_id"],
"task_name": item["task_name"],
"task_status": item["task_status"],
"task_priority": item["task_priority"],
"task_assignees": item["task_assignees"],
"task_due_date": item["task_due_date"],
"task_created": item["task_created"],
"task_updated": item["task_updated"],
"connector_id": connector_id,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
}
safe_set_chunks(document, chunks)
document.updated_at = get_current_timestamp()
document.status = DocumentStatus.ready()
documents_indexed += 1
# Batch commit every 10 documents (for ready status updates)
if documents_indexed % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed} ClickUp tasks processed so far"
)
await session.commit()
except Exception as e:
logger.error(
f"Error processing task {item.get('task_name', 'Unknown')}: {e!s}",
exc_info=True,
)
# Mark document as failed with reason (visible in UI)
try:
document.status = DocumentStatus.failed(str(e))
document.updated_at = get_current_timestamp()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
documents_failed += 1
continue
total_processed = documents_indexed
if total_processed > 0:
await update_connector_last_indexed(session, connector, update_last_indexed)
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
# This ensures the UI shows "Last indexed" instead of "Never indexed"
await update_connector_last_indexed(session, connector, update_last_indexed)
# Final commit for any remaining documents not yet committed in batches
logger.info(f"Final commit: Total {documents_indexed} ClickUp tasks processed")
await session.commit()
try:
await session.commit()
logger.info(
"Successfully committed all ClickUp document changes to database"
)
except Exception as e:
# Handle any remaining integrity errors gracefully (race conditions, etc.)
if (
"duplicate key value violates unique constraint" in str(e).lower()
or "uniqueviolationerror" in str(e).lower()
):
logger.warning(
f"Duplicate content_hash detected during final commit. "
f"This may occur if the same task was indexed by multiple connectors. "
f"Rolling back and continuing. Error: {e!s}"
)
await session.rollback()
# Don't fail the entire task - some documents may have been successfully indexed
else:
raise
await task_logger.log_task_success(
log_entry,
@ -433,11 +506,12 @@ async def index_clickup_tasks(
"pages_processed": total_processed,
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
"documents_failed": documents_failed,
},
)
logger.info(
f"clickup indexing completed: {documents_indexed} new tasks, {documents_skipped} skipped"
f"clickup indexing completed: {documents_indexed} ready, {documents_skipped} skipped, {documents_failed} failed"
)
# Close client connection

View file

@ -1,5 +1,9 @@
"""
Confluence connector indexer.
Provides real-time document status updates during indexing using a two-phase approach:
- Phase 1: Create all documents with PENDING status (visible in UI immediately)
- Phase 2: Process each document one by one (PENDING → PROCESSING → READY/FAILED)
"""
import contextlib
@ -12,7 +16,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.confluence_history import ConfluenceHistoryConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
@ -29,6 +33,7 @@ from .base import (
get_connector_by_id,
get_current_timestamp,
logger,
safe_set_chunks,
update_connector_last_indexed,
)
@ -180,22 +185,22 @@ async def index_confluence_pages(
await confluence_client.close()
return 0, f"Error fetching Confluence pages: {e!s}"
# Process and index each page
# =======================================================================
# PHASE 1: Analyze all pages, create pending documents
# This makes ALL documents visible in the UI immediately with pending status
# =======================================================================
documents_indexed = 0
skipped_pages = []
documents_skipped = 0
documents_failed = 0
duplicate_content_count = 0
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
pages_to_process = [] # List of dicts with document and page data
new_documents_created = False
for page in pages:
# Check if it's time for a heartbeat update
if (
on_heartbeat_callback
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
):
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
try:
page_id = page.get("id")
page_title = page.get("title", "")
@ -205,7 +210,6 @@ async def index_confluence_pages(
logger.warning(
f"Skipping page with missing ID or title: {page_id or 'Unknown'}"
)
skipped_pages.append(f"{page_title or 'Unknown'} (missing data)")
documents_skipped += 1
continue
@ -236,7 +240,6 @@ async def index_confluence_pages(
if not full_content.strip():
logger.warning(f"Skipping page with no content: {page_title}")
skipped_pages.append(f"{page_title} (no content)")
documents_skipped += 1
continue
@ -258,74 +261,29 @@ async def index_confluence_pages(
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
logger.info(
f"Document for Confluence page {page_title} unchanged. Skipping."
)
# Ensure status is ready (might have been stuck in processing/pending)
if not DocumentStatus.is_state(
existing_document.status, DocumentStatus.READY
):
existing_document.status = DocumentStatus.ready()
documents_skipped += 1
continue
else:
# Content has changed - update the existing document
logger.info(
f"Content changed for Confluence page {page_title}. Updating document."
)
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"page_title": page_title,
"page_id": page_id,
"space_id": space_id,
"comment_count": comment_count,
"document_type": "Confluence Page",
"connector_type": "Confluence",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
full_content, user_llm, document_metadata
)
else:
summary_content = f"Confluence Page: {page_title}\n\nSpace ID: {space_id}\n\n"
if page_content:
content_preview = page_content[:1000]
if len(page_content) > 1000:
content_preview += "..."
summary_content += (
f"Content Preview: {content_preview}\n\n"
)
summary_content += f"Comments: {comment_count}"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
# Process chunks
chunks = await create_document_chunks(full_content)
# Update existing document
existing_document.title = f"Confluence - {page_title}"
existing_document.content = summary_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = {
# Queue existing document for update (will be set to processing in Phase 2)
pages_to_process.append(
{
"document": existing_document,
"is_new": False,
"full_content": full_content,
"page_content": page_content,
"content_hash": content_hash,
"page_id": page_id,
"page_title": page_title,
"space_id": space_id,
"comment_count": comment_count,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
}
existing_document.chunks = chunks
existing_document.updated_at = get_current_timestamp()
documents_indexed += 1
logger.info(
f"Successfully updated Confluence page {page_title}"
)
continue
)
continue
# Document doesn't exist by unique_identifier_hash
# Check if a document with the same content_hash exists (from another connector)
@ -340,21 +298,92 @@ async def index_confluence_pages(
f"(existing document ID: {duplicate_by_content.id}, "
f"type: {duplicate_by_content.document_type}). Skipping."
)
duplicate_content_count += 1
documents_skipped += 1
continue
# Document doesn't exist - create new one
# Generate summary with metadata
# Create new document with PENDING status (visible in UI immediately)
document = Document(
search_space_id=search_space_id,
title=page_title,
document_type=DocumentType.CONFLUENCE_CONNECTOR,
document_metadata={
"page_id": page_id,
"page_title": page_title,
"space_id": space_id,
"comment_count": comment_count,
"connector_id": connector_id,
},
content="Pending...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=None,
chunks=[], # Empty at creation - safe for async
status=DocumentStatus.pending(), # Pending until processing starts
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
session.add(document)
new_documents_created = True
pages_to_process.append(
{
"document": document,
"is_new": True,
"full_content": full_content,
"page_content": page_content,
"content_hash": content_hash,
"page_id": page_id,
"page_title": page_title,
"space_id": space_id,
"comment_count": comment_count,
}
)
except Exception as e:
logger.error(f"Error in Phase 1 for page: {e!s}", exc_info=True)
documents_failed += 1
continue
# Commit all pending documents - they all appear in UI now
if new_documents_created:
logger.info(
f"Phase 1: Committing {len([p for p in pages_to_process if p['is_new']])} pending documents"
)
await session.commit()
# =======================================================================
# PHASE 2: Process each document one by one
# Each document transitions: pending → processing → ready/failed
# =======================================================================
logger.info(f"Phase 2: Processing {len(pages_to_process)} documents")
for item in pages_to_process:
# Send heartbeat periodically
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time
document = item["document"]
try:
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
document.status = DocumentStatus.processing()
await session.commit()
# Heavy processing (LLM, embeddings, chunks)
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"page_title": page_title,
"page_id": page_id,
"space_id": space_id,
"comment_count": comment_count,
"page_title": item["page_title"],
"page_id": item["page_id"],
"space_id": item["space_id"],
"comment_count": item["comment_count"],
"document_type": "Confluence Page",
"connector_type": "Confluence",
}
@ -362,55 +391,45 @@ async def index_confluence_pages(
summary_content,
summary_embedding,
) = await generate_document_summary(
full_content, user_llm, document_metadata
item["full_content"], user_llm, document_metadata
)
else:
# Fallback to simple summary if no LLM configured
summary_content = (
f"Confluence Page: {page_title}\n\nSpace ID: {space_id}\n\n"
)
if page_content:
# Take first 500 characters of content for summary
content_preview = page_content[:1000]
if len(page_content) > 1000:
summary_content = f"Confluence Page: {item['page_title']}\n\nSpace ID: {item['space_id']}\n\n"
if item["page_content"]:
# Take first 1000 characters of content for summary
content_preview = item["page_content"][:1000]
if len(item["page_content"]) > 1000:
content_preview += "..."
summary_content += f"Content Preview: {content_preview}\n\n"
summary_content += f"Comments: {comment_count}"
summary_content += f"Comments: {item['comment_count']}"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
# Process chunks - using the full page content with comments
chunks = await create_document_chunks(full_content)
chunks = await create_document_chunks(item["full_content"])
# Create and store new document
logger.info(f"Creating new document for page {page_title}")
document = Document(
search_space_id=search_space_id,
title=f"Confluence - {page_title}",
document_type=DocumentType.CONFLUENCE_CONNECTOR,
document_metadata={
"page_id": page_id,
"page_title": page_title,
"space_id": space_id,
"comment_count": comment_count,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
},
content=summary_content,
content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
# Update document to READY with actual content
document.title = item["page_title"]
document.content = summary_content
document.content_hash = item["content_hash"]
document.embedding = summary_embedding
document.document_metadata = {
"page_id": item["page_id"],
"page_title": item["page_title"],
"space_id": item["space_id"],
"comment_count": item["comment_count"],
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"connector_id": connector_id,
}
safe_set_chunks(document, chunks)
document.updated_at = get_current_timestamp()
document.status = DocumentStatus.ready()
session.add(document)
documents_indexed += 1
logger.info(f"Successfully indexed new page {page_title}")
# Batch commit every 10 documents
# Batch commit every 10 documents (for ready status updates)
if documents_indexed % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed} Confluence pages processed so far"
@ -419,53 +438,80 @@ async def index_confluence_pages(
except Exception as e:
logger.error(
f"Error processing page {page.get('title', 'Unknown')}: {e!s}",
f"Error processing page {item.get('page_title', 'Unknown')}: {e!s}",
exc_info=True,
)
skipped_pages.append(
f"{page.get('title', 'Unknown')} (processing error)"
)
documents_skipped += 1
# Mark document as failed with reason (visible in UI)
try:
document.status = DocumentStatus.failed(str(e))
document.updated_at = get_current_timestamp()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
documents_failed += 1
continue # Skip this page and continue with others
# Update the last_indexed_at timestamp for the connector only if requested
total_processed = documents_indexed
if update_last_indexed:
await update_connector_last_indexed(session, connector, update_last_indexed)
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
# This ensures the UI shows "Last indexed" instead of "Never indexed"
await update_connector_last_indexed(session, connector, update_last_indexed)
# Final commit for any remaining documents not yet committed in batches
# Final commit to ensure all documents are persisted (safety net)
logger.info(
f"Final commit: Total {documents_indexed} Confluence pages processed"
)
await session.commit()
logger.info(
"Successfully committed all Confluence document changes to database"
)
try:
await session.commit()
logger.info(
"Successfully committed all Confluence document changes to database"
)
except Exception as e:
# Handle any remaining integrity errors gracefully (race conditions, etc.)
if (
"duplicate key value violates unique constraint" in str(e).lower()
or "uniqueviolationerror" in str(e).lower()
):
logger.warning(
f"Duplicate content_hash detected during final commit. "
f"This may occur if the same page was indexed by multiple connectors. "
f"Rolling back and continuing. Error: {e!s}"
)
await session.rollback()
# Don't fail the entire task - some documents may have been successfully indexed
else:
raise
# Build warning message if there were issues
warning_parts = []
if duplicate_content_count > 0:
warning_parts.append(f"{duplicate_content_count} duplicate")
if documents_failed > 0:
warning_parts.append(f"{documents_failed} failed")
warning_message = ", ".join(warning_parts) if warning_parts else None
# Log success
await task_logger.log_task_success(
log_entry,
f"Successfully completed Confluence indexing for connector {connector_id}",
{
"pages_processed": total_processed,
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
"skipped_pages_count": len(skipped_pages),
"documents_failed": documents_failed,
"duplicate_content_count": duplicate_content_count,
},
)
logger.info(
f"Confluence indexing completed: {documents_indexed} new pages, {documents_skipped} skipped"
f"Confluence indexing completed: {documents_indexed} ready, "
f"{documents_skipped} skipped, {documents_failed} failed "
f"({duplicate_content_count} duplicate content)"
)
# Close the client connection
if confluence_client:
await confluence_client.close()
return (
total_processed,
None,
) # Return None as the error message to indicate success
return documents_indexed, warning_message
except SQLAlchemyError as db_error:
await session.rollback()

View file

@ -1,5 +1,9 @@
"""
Discord connector indexer.
Implements 2-phase document status updates for real-time UI feedback:
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
- Phase 2: Process each document: pending → processing → ready/failed
"""
import asyncio
@ -12,7 +16,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.discord_connector import DiscordConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
create_document_chunks,
@ -27,6 +31,7 @@ from .base import (
get_connector_by_id,
get_current_timestamp,
logger,
safe_set_chunks,
update_connector_last_indexed,
)
@ -48,7 +53,11 @@ async def index_discord_messages(
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
"""
Index Discord messages from all accessible channels.
Index Discord messages from the configured guild's channels.
Implements 2-phase document status updates for real-time UI feedback:
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
- Phase 2: Process each document: pending → processing → ready/failed
Args:
session: Database session
@ -113,6 +122,37 @@ async def index_discord_messages(
logger.info(f"Starting Discord indexing for connector {connector_id}")
# =======================================================================
# GUILD FILTERING: Only index the specific guild configured for this connector
# =======================================================================
# Extract guild_id from connector config (set during OAuth flow)
configured_guild_id = connector.config.get("guild_id")
configured_guild_name = connector.config.get("guild_name")
# Legacy connector check - if no guild_id, we need to warn and handle gracefully
is_legacy_connector = configured_guild_id is None
if is_legacy_connector:
logger.warning(
f"Discord connector {connector_id} has no guild_id configured. "
"This is a legacy connector. Please reconnect the Discord server to fix this. "
"For now, indexing will be skipped to prevent indexing unwanted servers."
)
await task_logger.log_task_failure(
log_entry,
f"Legacy Discord connector {connector_id} missing guild_id",
"No guild_id configured. Please reconnect this Discord server.",
{"error_type": "MissingGuildId", "is_legacy": True},
)
return (
0,
"This Discord connector needs to be reconnected. Please disconnect and reconnect your Discord server to enable indexing.",
)
logger.info(
f"Configured to index guild: {configured_guild_name} ({configured_guild_id})"
)
# Initialize Discord client with OAuth credentials support
await task_logger.log_task_progress(
log_entry,
@ -255,77 +295,66 @@ async def index_discord_messages(
try:
await task_logger.log_task_progress(
log_entry,
f"Starting Discord bot and fetching guilds for connector {connector_id}",
{"stage": "fetch_guilds"},
f"Starting Discord bot for connector {connector_id}",
{"stage": "bot_initialization"},
)
logger.info("Starting Discord bot to fetch guilds")
logger.info("Starting Discord bot")
discord_client._bot_task = asyncio.create_task(discord_client.start_bot())
await discord_client._wait_until_ready()
logger.info("Fetching Discord guilds")
guilds = await discord_client.get_guilds()
logger.info(f"Found {len(guilds)} guilds")
# We only process the configured guild, not all guilds
logger.info(
f"Processing configured guild only: {configured_guild_name} ({configured_guild_id})"
)
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to get Discord guilds for connector {connector_id}",
f"Failed to start Discord bot for connector {connector_id}",
str(e),
{"error_type": "GuildFetchError"},
{"error_type": "BotStartError"},
)
logger.error(f"Failed to get Discord guilds: {e!s}", exc_info=True)
logger.error(f"Failed to start Discord bot: {e!s}", exc_info=True)
await discord_client.close_bot()
return 0, f"Failed to get Discord guilds: {e!s}"
if not guilds:
await task_logger.log_task_success(
log_entry,
f"No Discord guilds found for connector {connector_id}",
{"guilds_found": 0},
)
logger.info("No Discord guilds found to index")
await discord_client.close_bot()
return 0, "No Discord guilds found"
return 0, f"Failed to start Discord bot: {e!s}"
# Track results
documents_indexed = 0
documents_skipped = 0
documents_failed = 0
duplicate_content_count = 0
skipped_channels: list[str] = []
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
# Process each guild and channel
# Use the configured guild info
guild_id = configured_guild_id
guild_name = configured_guild_name or "Unknown Guild"
await task_logger.log_task_progress(
log_entry,
f"Starting to process {len(guilds)} Discord guilds",
{"stage": "process_guilds", "total_guilds": len(guilds)},
f"Processing Discord guild: {guild_name}",
{"stage": "process_guild", "guild_id": guild_id, "guild_name": guild_name},
)
# =======================================================================
# PHASE 1: Collect all messages and create pending documents
# This makes ALL documents visible in the UI immediately with pending status
# =======================================================================
messages_to_process = [] # List of dicts with document and message data
new_documents_created = False
try:
for guild in guilds:
# Check if it's time for a heartbeat update
if (
on_heartbeat_callback
and (time.time() - last_heartbeat_time)
>= HEARTBEAT_INTERVAL_SECONDS
):
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
guild_id = guild["id"]
guild_name = guild["name"]
logger.info(f"Processing guild: {guild_name} ({guild_id})")
try:
channels = await discord_client.get_text_channels(guild_id)
if not channels:
logger.info(
f"No channels found in guild {guild_name}. Skipping."
)
skipped_channels.append(f"{guild_name} (no channels)")
documents_skipped += 1
continue
logger.info(f"Processing guild: {guild_name} ({guild_id})")
try:
channels = await discord_client.get_text_channels(guild_id)
if not channels:
logger.info(f"No channels found in guild {guild_name}. Skipping.")
skipped_channels.append(f"{guild_name} (no channels)")
else:
for channel in channels:
channel_id = channel["id"]
channel_name = channel["name"]
@ -343,14 +372,12 @@ async def index_discord_messages(
skipped_channels.append(
f"{guild_name}#{channel_name} (fetch error)"
)
documents_skipped += 1
continue
if not messages:
logger.info(
f"No messages found in channel {channel_name} for the specified date range."
)
documents_skipped += 1
continue
# Filter/format messages
@ -365,7 +392,6 @@ async def index_discord_messages(
logger.info(
f"No valid messages found in channel {channel_name} after filtering."
)
documents_skipped += 1
continue
# Process each message as an individual document (like Slack)
@ -427,32 +453,23 @@ async def index_discord_messages(
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
logger.info(
f"Document for Discord message {msg_id} in {guild_name}#{channel_name} unchanged. Skipping."
)
# Ensure status is ready (might have been stuck in processing/pending)
if not DocumentStatus.is_state(
existing_document.status, DocumentStatus.READY
):
existing_document.status = (
DocumentStatus.ready()
)
documents_skipped += 1
continue
else:
# Content has changed - update the existing document
logger.info(
f"Content changed for Discord message {msg_id} in {guild_name}#{channel_name}. Updating document."
)
# Update chunks and embedding
chunks = await create_document_chunks(
combined_document_string
)
doc_embedding = (
config.embedding_model_instance.embed(
combined_document_string
)
)
# Update existing document
existing_document.content = combined_document_string
existing_document.content_hash = content_hash
existing_document.embedding = doc_embedding
existing_document.document_metadata = {
# Queue existing document for update (will be set to processing in Phase 2)
messages_to_process.append(
{
"document": existing_document,
"is_new": False,
"combined_document_string": combined_document_string,
"content_hash": content_hash,
"guild_name": guild_name,
"guild_id": guild_id,
"channel_name": channel_name,
@ -460,22 +477,9 @@ async def index_discord_messages(
"message_id": msg_id,
"message_timestamp": msg_timestamp,
"message_user_name": msg_user_name,
"indexed_at": datetime.now(UTC).strftime(
"%Y-%m-%d %H:%M:%S"
),
}
# Delete old chunks and add new ones
existing_document.chunks = chunks
existing_document.updated_at = (
get_current_timestamp()
)
documents_indexed += 1
logger.info(
f"Successfully updated Discord message {msg_id}"
)
continue
)
continue
# Document doesn't exist by unique_identifier_hash
# Check if a document with the same content_hash exists (from another connector)
@ -492,22 +496,14 @@ async def index_discord_messages(
f"(existing document ID: {duplicate_by_content.id}, "
f"type: {duplicate_by_content.document_type}). Skipping."
)
duplicate_content_count += 1
documents_skipped += 1
continue
# Document doesn't exist - create new one
# Process chunks
chunks = await create_document_chunks(
combined_document_string
)
doc_embedding = config.embedding_model_instance.embed(
combined_document_string
)
# Create and store new document
# Create new document with PENDING status (visible in UI immediately)
document = Document(
search_space_id=search_space_id,
title=f"Discord - {guild_name}#{channel_name}",
title=f"{guild_name}#{channel_name}",
document_type=DocumentType.DISCORD_CONNECTOR,
document_metadata={
"guild_name": guild_name,
@ -515,87 +511,177 @@ async def index_discord_messages(
"channel_name": channel_name,
"channel_id": channel_id,
"message_id": msg_id,
"message_timestamp": msg_timestamp,
"message_user_name": msg_user_name,
"indexed_at": datetime.now(UTC).strftime(
"%Y-%m-%d %H:%M:%S"
),
"connector_id": connector_id,
},
content=combined_document_string,
embedding=doc_embedding,
chunks=chunks,
content_hash=content_hash,
content="Pending...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=None,
chunks=[], # Empty at creation - safe for async
status=DocumentStatus.pending(), # Pending until processing starts
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
session.add(document)
documents_indexed += 1
new_documents_created = True
# Batch commit every 10 documents
if documents_indexed % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed} Discord messages processed so far"
)
await session.commit()
messages_to_process.append(
{
"document": document,
"is_new": True,
"combined_document_string": combined_document_string,
"content_hash": content_hash,
"guild_name": guild_name,
"guild_id": guild_id,
"channel_name": channel_name,
"channel_id": channel_id,
"message_id": msg_id,
"message_timestamp": msg_timestamp,
"message_user_name": msg_user_name,
}
)
logger.info(
f"Successfully indexed channel {guild_name}#{channel_name} with {len(formatted_messages)} messages"
)
except Exception as e:
logger.error(
f"Error processing guild {guild_name}: {e!s}", exc_info=True
)
skipped_channels.append(f"{guild_name} (processing error)")
except Exception as e:
logger.error(
f"Error processing guild {guild_name}: {e!s}", exc_info=True
)
skipped_channels.append(f"{guild_name} (processing error)")
documents_skipped += 1
continue
finally:
await discord_client.close_bot()
# Update last_indexed_at only if we indexed at least one
if documents_indexed > 0:
await update_connector_last_indexed(session, connector, update_last_indexed)
# Commit all pending documents - they all appear in UI now
if new_documents_created:
logger.info(
f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents"
)
await session.commit()
# =======================================================================
# PHASE 2: Process each document one by one
# Each document transitions: pending → processing → ready/failed
# =======================================================================
logger.info(f"Phase 2: Processing {len(messages_to_process)} documents")
for item in messages_to_process:
# Send heartbeat periodically
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time
document = item["document"]
try:
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
document.status = DocumentStatus.processing()
await session.commit()
# Heavy processing (embeddings, chunks)
chunks = await create_document_chunks(item["combined_document_string"])
doc_embedding = config.embedding_model_instance.embed(
item["combined_document_string"]
)
# Update document to READY with actual content
document.title = f"{item['guild_name']}#{item['channel_name']}"
document.content = item["combined_document_string"]
document.content_hash = item["content_hash"]
document.embedding = doc_embedding
document.document_metadata = {
"guild_name": item["guild_name"],
"guild_id": item["guild_id"],
"channel_name": item["channel_name"],
"channel_id": item["channel_id"],
"message_id": item["message_id"],
"message_timestamp": item["message_timestamp"],
"message_user_name": item["message_user_name"],
"indexed_at": datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S"),
"connector_id": connector_id,
}
safe_set_chunks(document, chunks)
document.updated_at = get_current_timestamp()
document.status = DocumentStatus.ready()
documents_indexed += 1
# Batch commit every 10 documents (for ready status updates)
if documents_indexed % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed} Discord messages processed so far"
)
await session.commit()
except Exception as e:
logger.error(f"Error processing Discord message: {e!s}", exc_info=True)
# Mark document as failed with reason (visible in UI)
try:
document.status = DocumentStatus.failed(str(e))
document.updated_at = get_current_timestamp()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
documents_failed += 1
continue
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
await update_connector_last_indexed(session, connector, update_last_indexed)
# Final commit for any remaining documents not yet committed in batches
logger.info(
f"Final commit: Total {documents_indexed} Discord messages processed"
)
await session.commit()
# Prepare result message
result_message = None
if skipped_channels:
result_message = (
f"Processed {documents_indexed} messages. Skipped {len(skipped_channels)} channels: "
+ ", ".join(skipped_channels)
try:
await session.commit()
logger.info(
"Successfully committed all Discord document changes to database"
)
else:
result_message = f"Processed {documents_indexed} messages."
except Exception as e:
# Handle any remaining integrity errors gracefully (race conditions, etc.)
if (
"duplicate key value violates unique constraint" in str(e).lower()
or "uniqueviolationerror" in str(e).lower()
):
logger.warning(
f"Duplicate content_hash detected during final commit. "
f"Rolling back and continuing. Error: {e!s}"
)
await session.rollback()
else:
raise
# Build warning message if there were issues
warning_parts = []
if duplicate_content_count > 0:
warning_parts.append(f"{duplicate_content_count} duplicate")
if documents_failed > 0:
warning_parts.append(f"{documents_failed} failed")
if skipped_channels:
warning_parts.append(f"{len(skipped_channels)} channels skipped")
warning_message = ", ".join(warning_parts) if warning_parts else None
# Log success
await task_logger.log_task_success(
log_entry,
f"Successfully completed Discord indexing for connector {connector_id}",
{
"messages_processed": documents_indexed,
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
"documents_failed": documents_failed,
"duplicate_content_count": duplicate_content_count,
"skipped_channels_count": len(skipped_channels),
"guilds_processed": len(guilds),
"result_message": result_message,
"guild_id": guild_id,
"guild_name": guild_name,
},
)
logger.info(
f"Discord indexing completed: {documents_indexed} new messages, {documents_skipped} skipped"
f"Discord indexing completed for guild {guild_name}: {documents_indexed} ready, {documents_skipped} skipped, "
f"{documents_failed} failed ({duplicate_content_count} duplicate content)"
)
return (
documents_indexed,
None,
) # Return None on success (result_message is for logging only)
return documents_indexed, warning_message
except SQLAlchemyError as db_error:
await session.rollback()

View file

@ -1,5 +1,9 @@
"""
Elasticsearch indexer for SurfSense
Implements 2-phase document status updates for real-time UI feedback:
- Phase 1: Collect all documents and create pending documents (visible in UI immediately)
- Phase 2: Process each document: pending → processing → ready/failed
"""
import json
@ -13,7 +17,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from app.connectors.elasticsearch_connector import ElasticsearchConnector
from app.db import Document, DocumentType, SearchSourceConnector
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnector
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
create_document_chunks,
@ -25,6 +29,7 @@ from .base import (
check_document_by_unique_identifier,
check_duplicate_document_by_hash,
get_current_timestamp,
safe_set_chunks,
)
# Type hint for heartbeat callback
@ -164,6 +169,8 @@ async def index_elasticsearch_documents(
)
documents_processed = 0
documents_skipped = 0
documents_failed = 0
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
@ -178,23 +185,22 @@ async def index_elasticsearch_documents(
"max_documents": max_documents,
},
)
# Use scroll search for large result sets
# =======================================================================
# PHASE 1: Collect all documents from Elasticsearch and create pending documents
# This makes ALL documents visible in the UI immediately with pending status
# =======================================================================
docs_to_process = [] # List of dicts with document and ES data
new_documents_created = False
hits_collected = 0
async for hit in es_connector.scroll_search(
index=index_name,
query=query,
size=min(max_documents, 100), # Scroll in batches
fields=config.get("ELASTICSEARCH_FIELDS"),
):
# Check if it's time for a heartbeat update
if (
on_heartbeat_callback
and (time.time() - last_heartbeat_time)
>= HEARTBEAT_INTERVAL_SECONDS
):
await on_heartbeat_callback(documents_processed)
last_heartbeat_time = time.time()
if documents_processed >= max_documents:
if hits_collected >= max_documents:
break
try:
@ -220,26 +226,12 @@ async def index_elasticsearch_documents(
if not content.strip():
logger.warning(f"Skipping document {doc_id} - no content found")
documents_skipped += 1
continue
# Create content hash
content_hash = generate_content_hash(content, search_space_id)
# Build metadata
metadata = {
"elasticsearch_id": doc_id,
"elasticsearch_index": hit.get("_index", index_name),
"elasticsearch_score": hit.get("_score"),
"indexed_at": datetime.now().isoformat(),
"source": "ELASTICSEARCH_CONNECTOR",
}
# Add any additional metadata fields specified in config
if "ELASTICSEARCH_METADATA_FIELDS" in config:
for field in config["ELASTICSEARCH_METADATA_FIELDS"]:
if field in source:
metadata[f"es_{field}"] = source[field]
# Build source-unique identifier and hash (prefer source id dedupe)
source_identifier = f"{hit.get('_index', index_name)}:{doc_id}"
unique_identifier_hash = generate_unique_identifier_hash(
@ -258,98 +250,223 @@ async def index_elasticsearch_documents(
)
if existing_doc:
# If content is unchanged, skip. Otherwise update the existing document.
# If content is unchanged, skip. Otherwise queue for update.
if existing_doc.content_hash == content_hash:
# Ensure status is ready (might have been stuck in processing/pending)
if not DocumentStatus.is_state(
existing_doc.status, DocumentStatus.READY
):
existing_doc.status = DocumentStatus.ready()
logger.info(
f"Skipping ES doc {doc_id} — already indexed (doc id {existing_doc.id})"
)
continue
else:
logger.info(
f"Updating existing document {existing_doc.id} for ES doc {doc_id}"
)
existing_doc.title = title
existing_doc.content = content
existing_doc.content_hash = content_hash
existing_doc.document_metadata = metadata
existing_doc.unique_identifier_hash = unique_identifier_hash
chunks = await create_document_chunks(content)
existing_doc.chunks = chunks
existing_doc.updated_at = get_current_timestamp()
await session.flush()
documents_processed += 1
if documents_processed % 10 == 0:
await session.commit()
documents_skipped += 1
continue
# Create document
# Queue existing document for update (will be set to processing in Phase 2)
docs_to_process.append(
{
"document": existing_doc,
"is_new": False,
"doc_id": doc_id,
"title": title,
"content": content,
"content_hash": content_hash,
"unique_identifier_hash": unique_identifier_hash,
"hit": hit,
"source": source,
}
)
hits_collected += 1
continue
# Build metadata for new document
metadata = {
"elasticsearch_id": doc_id,
"elasticsearch_index": hit.get("_index", index_name),
"elasticsearch_score": hit.get("_score"),
"source": "ELASTICSEARCH_CONNECTOR",
"connector_id": connector_id,
}
# Add any additional metadata fields specified in config
if "ELASTICSEARCH_METADATA_FIELDS" in config:
for field in config["ELASTICSEARCH_METADATA_FIELDS"]:
if field in source:
metadata[f"es_{field}"] = source[field]
# Create new document with PENDING status (visible in UI immediately)
document = Document(
title=title,
content=content,
content_hash=content_hash,
content="Pending...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
unique_identifier_hash=unique_identifier_hash,
document_type=DocumentType.ELASTICSEARCH_CONNECTOR,
document_metadata=metadata,
search_space_id=search_space_id,
embedding=None,
chunks=[], # Empty at creation - safe for async
status=DocumentStatus.pending(), # Pending until processing starts
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
# Create chunks and attach to document (persist via relationship)
chunks = await create_document_chunks(content)
document.chunks = chunks
session.add(document)
await session.flush()
new_documents_created = True
docs_to_process.append(
{
"document": document,
"is_new": True,
"doc_id": doc_id,
"title": title,
"content": content,
"content_hash": content_hash,
"unique_identifier_hash": unique_identifier_hash,
"hit": hit,
"source": source,
}
)
hits_collected += 1
except Exception as e:
logger.error(f"Error in Phase 1 for ES doc: {e!s}", exc_info=True)
documents_failed += 1
continue
# Commit all pending documents - they all appear in UI now
if new_documents_created:
logger.info(
f"Phase 1: Committing {len([d for d in docs_to_process if d['is_new']])} pending documents"
)
await session.commit()
# =======================================================================
# PHASE 2: Process each document one by one
# Each document transitions: pending → processing → ready/failed
# =======================================================================
logger.info(f"Phase 2: Processing {len(docs_to_process)} documents")
for item in docs_to_process:
# Send heartbeat periodically
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_processed)
last_heartbeat_time = current_time
document = item["document"]
try:
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
document.status = DocumentStatus.processing()
await session.commit()
# Build metadata
metadata = {
"elasticsearch_id": item["doc_id"],
"elasticsearch_index": item["hit"].get("_index", index_name),
"elasticsearch_score": item["hit"].get("_score"),
"indexed_at": datetime.now().isoformat(),
"source": "ELASTICSEARCH_CONNECTOR",
"connector_id": connector_id,
}
# Add any additional metadata fields specified in config
if "ELASTICSEARCH_METADATA_FIELDS" in config:
for field in config["ELASTICSEARCH_METADATA_FIELDS"]:
if field in item["source"]:
metadata[f"es_{field}"] = item["source"][field]
# Create chunks
chunks = await create_document_chunks(item["content"])
# Update document to READY with actual content
document.title = item["title"]
document.content = item["content"]
document.content_hash = item["content_hash"]
document.unique_identifier_hash = item["unique_identifier_hash"]
document.document_metadata = metadata
safe_set_chunks(document, chunks)
document.updated_at = get_current_timestamp()
document.status = DocumentStatus.ready()
documents_processed += 1
# Batch commit every 10 documents (for ready status updates)
if documents_processed % 10 == 0:
logger.info(
f"Processed {documents_processed} Elasticsearch documents"
f"Committing batch: {documents_processed} Elasticsearch documents processed so far"
)
await session.commit()
except Exception as e:
msg = f"Error processing Elasticsearch document {hit.get('_id', 'unknown')}: {e}"
msg = f"Error processing Elasticsearch document {item.get('doc_id', 'unknown')}: {e}"
logger.error(msg)
await task_logger.log_task_failure(
log_entry,
"Document processing error",
msg,
{
"document_id": hit.get("_id", "unknown"),
"error_type": type(e).__name__,
},
)
# Mark document as failed with reason (visible in UI)
try:
document.status = DocumentStatus.failed(str(e))
document.updated_at = get_current_timestamp()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
documents_failed += 1
continue
# Final commit
await session.commit()
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
# This ensures the UI shows "Last indexed" instead of "Never indexed"
if update_last_indexed:
connector.last_indexed_at = (
datetime.now(UTC).isoformat().replace("+00:00", "Z")
)
# Final commit for any remaining documents not yet committed in batches
logger.info(
f"Final commit: Total {documents_processed} Elasticsearch documents processed"
)
try:
await session.commit()
logger.info(
"Successfully committed all Elasticsearch document changes to database"
)
except Exception as e:
# Handle any remaining integrity errors gracefully (race conditions, etc.)
if (
"duplicate key value violates unique constraint" in str(e).lower()
or "uniqueviolationerror" in str(e).lower()
):
logger.warning(
f"Duplicate content_hash detected during final commit. "
f"This may occur if the same document was indexed by multiple connectors. "
f"Rolling back and continuing. Error: {e!s}"
)
await session.rollback()
# Don't fail the entire task - some documents may have been successfully indexed
else:
raise
# Build warning message if there were issues
warning_parts = []
if documents_failed > 0:
warning_parts.append(f"{documents_failed} failed")
warning_message = ", ".join(warning_parts) if warning_parts else None
await task_logger.log_task_success(
log_entry,
f"Successfully indexed {documents_processed} documents from Elasticsearch",
{"documents_indexed": documents_processed, "index": index_name},
{
"documents_indexed": documents_processed,
"documents_skipped": documents_skipped,
"documents_failed": documents_failed,
"index": index_name,
},
)
logger.info(
f"Successfully indexed {documents_processed} documents from Elasticsearch"
f"Elasticsearch indexing completed: {documents_processed} ready, "
f"{documents_skipped} skipped, {documents_failed} failed"
)
# Update last indexed timestamp if requested
if update_last_indexed and documents_processed > 0:
# connector.last_indexed_at = datetime.now()
connector.last_indexed_at = (
datetime.now(UTC).isoformat().replace("+00:00", "Z")
)
await session.commit()
await task_logger.log_task_progress(
log_entry,
"Updated connector.last_indexed_at",
{"last_indexed_at": connector.last_indexed_at},
)
return documents_processed, None
return documents_processed, warning_message
finally:
# Clean up Elasticsearch connection

View file

@ -3,6 +3,10 @@ GitHub connector indexer using gitingest.
This indexer processes entire repository digests in one pass, dramatically
reducing LLM API calls compared to the previous file-by-file approach.
Implements 2-phase document status updates for real-time UI feedback:
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
- Phase 2: Process each document: pending → processing → ready/failed
"""
import time
@ -13,8 +17,8 @@ from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.github_connector import GitHubConnector, RepositoryDigest
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.connectors.github_connector import GitHubConnector
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
@ -30,6 +34,8 @@ from .base import (
get_connector_by_id,
get_current_timestamp,
logger,
safe_set_chunks,
update_connector_last_indexed,
)
# Type hint for heartbeat callback
@ -164,7 +170,7 @@ async def index_github_repos(
)
return 0, f"Failed to initialize GitHub client: {e!s}"
# 4. Process each repository with gitingest
# 4. Process each repository with gitingest using 2-phase approach
await task_logger.log_task_progress(
log_entry,
f"Starting gitingest processing for {len(repo_full_names_to_index)} repositories",
@ -181,24 +187,25 @@ async def index_github_repos(
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
documents_indexed = 0
documents_skipped = 0
documents_failed = 0
# =======================================================================
# PHASE 1: Analyze all repos and create pending documents
# This makes ALL documents visible in the UI immediately with pending status
# =======================================================================
repos_to_process = [] # List of dicts with document and digest data
new_documents_created = False
for repo_full_name in repo_full_names_to_index:
# Check if it's time for a heartbeat update
if (
on_heartbeat_callback
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
):
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
if not repo_full_name or not isinstance(repo_full_name, str):
logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
continue
logger.info(f"Ingesting repository: {repo_full_name}")
try:
logger.info(f"Phase 1: Analyzing repository: {repo_full_name}")
# Run gitingest via subprocess (isolated from event loop)
# Using to_thread to not block the async database operations
import asyncio
digest = await asyncio.to_thread(
@ -212,30 +219,266 @@ async def index_github_repos(
errors.append(f"No digest for {repo_full_name}")
continue
# Process the digest and create documents
docs_created = await _process_repository_digest(
session=session,
digest=digest,
search_space_id=search_space_id,
user_id=user_id,
task_logger=task_logger,
log_entry=log_entry,
connector_id=connector_id,
# Generate unique identifier based on repo name
unique_identifier_hash = generate_unique_identifier_hash(
DocumentType.GITHUB_CONNECTOR, repo_full_name, search_space_id
)
documents_processed += docs_created
logger.info(
f"Created {docs_created} documents from repository: {repo_full_name}"
# Generate content hash from digest
full_content = digest.full_digest
content_hash = generate_content_hash(full_content, search_space_id)
# Check if document with this unique identifier already exists
existing_document = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
# Ensure status is ready (might have been stuck in processing/pending)
if not DocumentStatus.is_state(
existing_document.status, DocumentStatus.READY
):
existing_document.status = DocumentStatus.ready()
logger.info(f"Repository {repo_full_name} unchanged. Skipping.")
documents_skipped += 1
continue
# Queue existing document for update (will be set to processing in Phase 2)
logger.info(
f"Content changed for repository {repo_full_name}. Queuing for update."
)
repos_to_process.append(
{
"document": existing_document,
"is_new": False,
"digest": digest,
"content_hash": content_hash,
"repo_full_name": repo_full_name,
"unique_identifier_hash": unique_identifier_hash,
}
)
continue
# Document doesn't exist by unique_identifier_hash
# Check if a document with the same content_hash exists (from another connector)
with session.no_autoflush:
duplicate_by_content = await check_duplicate_document_by_hash(
session, content_hash
)
if duplicate_by_content:
logger.info(
f"Repository {repo_full_name} already indexed by another connector "
f"(existing document ID: {duplicate_by_content.id}, "
f"type: {duplicate_by_content.document_type}). Skipping."
)
documents_skipped += 1
continue
# Create new document with PENDING status (visible in UI immediately)
document = Document(
search_space_id=search_space_id,
title=repo_full_name,
document_type=DocumentType.GITHUB_CONNECTOR,
document_metadata={
"repository_full_name": repo_full_name,
"url": f"https://github.com/{repo_full_name}",
"branch": digest.branch,
"ingestion_method": "gitingest",
"connector_id": connector_id,
},
content="Pending...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=None,
chunks=[], # Empty at creation - safe for async
status=DocumentStatus.pending(), # Pending until processing starts
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
session.add(document)
new_documents_created = True
repos_to_process.append(
{
"document": document,
"is_new": True,
"digest": digest,
"content_hash": content_hash,
"repo_full_name": repo_full_name,
"unique_identifier_hash": unique_identifier_hash,
}
)
except Exception as repo_err:
logger.error(
f"Failed to process repository {repo_full_name}: {repo_err}"
f"Error in Phase 1 for repository {repo_full_name}: {repo_err}",
exc_info=True,
)
errors.append(f"Phase 1 error for {repo_full_name}: {repo_err}")
documents_failed += 1
# Commit all pending documents - they all appear in UI now
if new_documents_created:
logger.info(
f"Phase 1: Committing {len([r for r in repos_to_process if r['is_new']])} pending documents"
)
await session.commit()
# =======================================================================
# PHASE 2: Process each document one by one
# Each document transitions: pending → processing → ready/failed
# =======================================================================
logger.info(f"Phase 2: Processing {len(repos_to_process)} documents")
for item in repos_to_process:
# Send heartbeat periodically
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time
document = item["document"]
digest = item["digest"]
repo_full_name = item["repo_full_name"]
try:
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
document.status = DocumentStatus.processing()
await session.commit()
# Heavy processing (LLM, embeddings, chunks)
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
document_metadata_for_summary = {
"repository": repo_full_name,
"document_type": "GitHub Repository",
"connector_type": "GitHub",
"ingestion_method": "gitingest",
"file_tree": digest.tree[:2000]
if len(digest.tree) > 2000
else digest.tree,
"estimated_tokens": digest.estimated_tokens,
}
if user_llm:
# Prepare content for summarization
summary_content = digest.full_digest
if len(summary_content) > MAX_DIGEST_CHARS:
summary_content = (
f"# Repository: {repo_full_name}\n\n"
f"## File Structure\n\n{digest.tree}\n\n"
f"## File Contents (truncated)\n\n{digest.content[: MAX_DIGEST_CHARS - len(digest.tree) - 200]}..."
)
summary_text, summary_embedding = await generate_document_summary(
summary_content, user_llm, document_metadata_for_summary
)
else:
# Fallback to simple summary if no LLM configured
summary_text = (
f"# GitHub Repository: {repo_full_name}\n\n"
f"## Summary\n{digest.summary}\n\n"
f"## File Structure\n{digest.tree[:3000]}"
)
summary_embedding = config.embedding_model_instance.embed(
summary_text
)
# Chunk the full digest content for granular search
try:
chunks_data = await create_document_chunks(digest.content)
except Exception as chunk_err:
logger.error(
f"Failed to chunk repository {repo_full_name}: {chunk_err}"
)
chunks_data = await _simple_chunk_content(digest.content)
# Update document to READY with actual content
doc_metadata = {
"repository_full_name": repo_full_name,
"url": f"https://github.com/{repo_full_name}",
"branch": digest.branch,
"ingestion_method": "gitingest",
"file_tree": digest.tree,
"gitingest_summary": digest.summary,
"estimated_tokens": digest.estimated_tokens,
"connector_id": connector_id,
"indexed_at": datetime.now(UTC).isoformat(),
}
document.title = repo_full_name
document.content = summary_text
document.content_hash = item["content_hash"]
document.embedding = summary_embedding
document.document_metadata = doc_metadata
safe_set_chunks(document, chunks_data)
document.updated_at = get_current_timestamp()
document.status = DocumentStatus.ready()
documents_processed += 1
documents_indexed += 1
logger.info(
f"Created document for repository {repo_full_name} "
f"with {len(chunks_data)} chunks"
)
# Batch commit every 5 documents (repositories are large)
if documents_indexed % 5 == 0:
logger.info(
f"Committing batch: {documents_indexed} GitHub repos processed so far"
)
await session.commit()
except Exception as repo_err:
logger.error(
f"Error processing repository {repo_full_name}: {repo_err}",
exc_info=True,
)
# Mark document as failed with reason (visible in UI)
try:
document.status = DocumentStatus.failed(str(repo_err))
document.updated_at = get_current_timestamp()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
errors.append(f"Failed processing {repo_full_name}: {repo_err}")
documents_failed += 1
continue
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
await update_connector_last_indexed(session, connector, update_last_indexed)
# Final commit
await session.commit()
logger.info(
f"Final commit: Total {documents_processed} GitHub repositories processed"
)
try:
await session.commit()
logger.info(
"Successfully committed all GitHub document changes to database"
)
except Exception as e:
if (
"duplicate key value violates unique constraint" in str(e).lower()
or "uniqueviolationerror" in str(e).lower()
):
logger.warning(
f"Duplicate content_hash detected during final commit. "
f"Rolling back and continuing. Error: {e!s}"
)
await session.rollback()
else:
raise
logger.info(
f"Finished GitHub indexing for connector {connector_id}. "
f"Created {documents_processed} documents."
@ -247,6 +490,8 @@ async def index_github_repos(
f"Successfully completed GitHub indexing for connector {connector_id}",
{
"documents_processed": documents_processed,
"documents_skipped": documents_skipped,
"documents_failed": documents_failed,
"errors_count": len(errors),
"repo_count": len(repo_full_names_to_index),
"method": "gitingest",
@ -286,163 +531,6 @@ async def index_github_repos(
return documents_processed, error_message
async def _process_repository_digest(
    session: AsyncSession,
    digest: RepositoryDigest,
    search_space_id: int,
    user_id: str,
    task_logger: TaskLoggingService,
    log_entry,
    connector_id: int,
) -> int:
    """
    Process a repository digest and create documents.

    For each repository, we create:
    1. One main document with the repository summary
    2. Chunks from the full digest content for granular search

    An existing document for the same repository is left alone when its
    content hash matches, and is deleted and re-created when it differs.
    A repository whose content hash already belongs to another document is
    skipped.

    Args:
        session: Database session
        digest: The repository digest from gitingest
        search_space_id: ID of the search space
        user_id: ID of the user
        task_logger: Task logging service (accepted but not used here)
        log_entry: Current log entry (accepted but not used here)
        connector_id: ID of the connector stored on the created document

    Returns:
        Number of documents created (0 when the repository was skipped,
        otherwise 1)
    """
    repo_full_name = digest.repo_full_name

    # The content hash is derived from the full digest, so it changes
    # whenever the repository content changes.
    content_hash = generate_content_hash(digest.full_digest, search_space_id)

    # Use repo name as the unique identifier (one document per repo)
    unique_identifier_hash = generate_unique_identifier_hash(
        DocumentType.GITHUB_CONNECTOR, repo_full_name, search_space_id
    )

    # Check if document with this unique identifier already exists
    existing_document = await check_document_by_unique_identifier(
        session, unique_identifier_hash
    )

    if existing_document:
        # Document exists - check if content has changed
        if existing_document.content_hash == content_hash:
            logger.info(f"Repository {repo_full_name} unchanged. Skipping.")
            return 0

        logger.info(
            f"Content changed for repository {repo_full_name}. Updating document."
        )
        # Delete existing document to replace with new one; flush so the
        # delete is sent before the replacement row is added below.
        await session.delete(existing_document)
        await session.flush()
    else:
        # Document doesn't exist by unique_identifier_hash
        # Check if a document with the same content_hash exists (from another connector)
        with session.no_autoflush:
            duplicate_by_content = await check_duplicate_document_by_hash(
                session, content_hash
            )

        if duplicate_by_content:
            logger.info(
                f"Repository {repo_full_name} already indexed by another connector "
                f"(existing document ID: {duplicate_by_content.id}, "
                f"type: {duplicate_by_content.document_type}). Skipping."
            )
            return 0

    # Generate summary using LLM (ONE call per repository!)
    user_llm = await get_user_long_context_llm(session, user_id, search_space_id)

    document_metadata = {
        "repository": repo_full_name,
        "document_type": "GitHub Repository",
        "connector_type": "GitHub",
        "ingestion_method": "gitingest",
        # Slicing already returns the whole tree when it is shorter than 2000.
        "file_tree": digest.tree[:2000],
        "estimated_tokens": digest.estimated_tokens,
    }

    if user_llm:
        # Prepare content for summarization
        # Include tree structure and truncated content if too large
        summary_content = digest.full_digest
        if len(summary_content) > MAX_DIGEST_CHARS:
            # Clamp to zero: when the tree alone exceeds the budget the
            # difference is negative, and a negative slice bound would cut
            # from the END of the content, keeping almost all of it.
            content_budget = max(0, MAX_DIGEST_CHARS - len(digest.tree) - 200)
            # Truncate but keep the tree and beginning of content
            summary_content = (
                f"# Repository: {repo_full_name}\n\n"
                f"## File Structure\n\n{digest.tree}\n\n"
                f"## File Contents (truncated)\n\n{digest.content[:content_budget]}..."
            )

        summary_text, summary_embedding = await generate_document_summary(
            summary_content, user_llm, document_metadata
        )
    else:
        # Fallback to simple summary if no LLM configured
        summary_text = (
            f"# GitHub Repository: {repo_full_name}\n\n"
            f"## Summary\n{digest.summary}\n\n"
            f"## File Structure\n{digest.tree[:3000]}"
        )
        summary_embedding = config.embedding_model_instance.embed(summary_text)

    # Chunk the full digest content for granular search
    try:
        # Use the content (not the summary) for chunking
        # This preserves file-level granularity in search
        chunks_data = await create_document_chunks(digest.content)
    except Exception as chunk_err:
        logger.error(f"Failed to chunk repository {repo_full_name}: {chunk_err}")
        # Fall back to a simpler chunking approach
        chunks_data = await _simple_chunk_content(digest.content)

    # Create the document
    doc_metadata = {
        "repository_full_name": repo_full_name,
        "url": f"https://github.com/{repo_full_name}",
        "branch": digest.branch,
        "ingestion_method": "gitingest",
        "file_tree": digest.tree,
        "gitingest_summary": digest.summary,
        "estimated_tokens": digest.estimated_tokens,
        "indexed_at": datetime.now(UTC).isoformat(),
    }

    document = Document(
        title=f"GitHub Repository: {repo_full_name}",
        document_type=DocumentType.GITHUB_CONNECTOR,
        document_metadata=doc_metadata,
        content=summary_text,
        content_hash=content_hash,
        unique_identifier_hash=unique_identifier_hash,
        embedding=summary_embedding,
        search_space_id=search_space_id,
        chunks=chunks_data,
        updated_at=get_current_timestamp(),
        created_by_id=user_id,
        connector_id=connector_id,
    )
    session.add(document)

    logger.info(
        f"Created document for repository {repo_full_name} "
        f"with {len(chunks_data)} chunks"
    )

    # Exactly one document is added per repository on this path.
    return 1
async def _simple_chunk_content(content: str, chunk_size: int = 4000) -> list:
"""
Simple fallback chunking when the regular chunker fails.

View file

@ -1,5 +1,9 @@
"""
Google Calendar connector indexer.
Implements 2-phase document status updates for real-time UI feedback:
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
- Phase 2: Process each document: pending → processing → ready/failed
"""
import time
@ -11,7 +15,7 @@ from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.connectors.google_calendar_connector import GoogleCalendarConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
@ -28,6 +32,7 @@ from .base import (
get_current_timestamp,
logger,
parse_date_flexible,
safe_set_chunks,
update_connector_last_indexed,
)
@ -305,7 +310,7 @@ async def index_google_calendar_events(
documents_indexed = 0
documents_skipped = 0
skipped_events = []
documents_failed = 0 # Track events that failed processing
duplicate_content_count = (
0 # Track events skipped due to duplicate content_hash
)
@ -313,14 +318,14 @@ async def index_google_calendar_events(
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
# =======================================================================
# PHASE 1: Analyze all events, create pending documents
# This makes ALL documents visible in the UI immediately with pending status
# =======================================================================
events_to_process = [] # List of dicts with document and event data
new_documents_created = False
for event in events:
# Check if it's time for a heartbeat update
if (
on_heartbeat_callback
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
):
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
try:
event_id = event.get("id")
event_summary = event.get("summary", "No Title")
@ -328,14 +333,12 @@ async def index_google_calendar_events(
if not event_id:
logger.warning(f"Skipping event with missing ID: {event_summary}")
skipped_events.append(f"{event_summary} (missing ID)")
documents_skipped += 1
continue
event_markdown = calendar_client.format_event_to_markdown(event)
if not event_markdown.strip():
logger.warning(f"Skipping event with no content: {event_summary}")
skipped_events.append(f"{event_summary} (no content)")
documents_skipped += 1
continue
@ -362,82 +365,31 @@ async def index_google_calendar_events(
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
logger.info(
f"Document for Google Calendar event {event_summary} unchanged. Skipping."
)
# Ensure status is ready (might have been stuck in processing/pending)
if not DocumentStatus.is_state(
existing_document.status, DocumentStatus.READY
):
existing_document.status = DocumentStatus.ready()
documents_skipped += 1
continue
else:
# Content has changed - update the existing document
logger.info(
f"Content changed for Google Calendar event {event_summary}. Updating document."
)
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"event_id": event_id,
"event_summary": event_summary,
"calendar_id": calendar_id,
"start_time": start_time,
"end_time": end_time,
"location": location or "No location",
"document_type": "Google Calendar Event",
"connector_type": "Google Calendar",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
event_markdown, user_llm, document_metadata
)
else:
summary_content = (
f"Google Calendar Event: {event_summary}\n\n"
)
summary_content += f"Calendar: {calendar_id}\n"
summary_content += f"Start: {start_time}\n"
summary_content += f"End: {end_time}\n"
if location:
summary_content += f"Location: {location}\n"
if description:
desc_preview = description[:1000]
if len(description) > 1000:
desc_preview += "..."
summary_content += f"Description: {desc_preview}\n"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
# Process chunks
chunks = await create_document_chunks(event_markdown)
# Update existing document
existing_document.title = f"Calendar Event - {event_summary}"
existing_document.content = summary_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = {
# Queue existing document for update (will be set to processing in Phase 2)
events_to_process.append(
{
"document": existing_document,
"is_new": False,
"event_markdown": event_markdown,
"content_hash": content_hash,
"event_id": event_id,
"event_summary": event_summary,
"calendar_id": calendar_id,
"start_time": start_time,
"end_time": end_time,
"location": location,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"description": description,
}
existing_document.chunks = chunks
existing_document.updated_at = get_current_timestamp()
documents_indexed += 1
logger.info(
f"Successfully updated Google Calendar event {event_summary}"
)
continue
)
continue
# Document doesn't exist by unique_identifier_hash
# Check if a document with the same content_hash exists (from another connector)
@ -455,55 +407,12 @@ async def index_google_calendar_events(
)
duplicate_content_count += 1
documents_skipped += 1
skipped_events.append(
f"{event_summary} (already indexed by another connector)"
)
continue
# Document doesn't exist - create new one
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"event_id": event_id,
"event_summary": event_summary,
"calendar_id": calendar_id,
"start_time": start_time,
"end_time": end_time,
"location": location or "No location",
"document_type": "Google Calendar Event",
"connector_type": "Google Calendar",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
event_markdown, user_llm, document_metadata
)
else:
# Fallback to simple summary if no LLM configured
summary_content = f"Google Calendar Event: {event_summary}\n\n"
summary_content += f"Calendar: {calendar_id}\n"
summary_content += f"Start: {start_time}\n"
summary_content += f"End: {end_time}\n"
if location:
summary_content += f"Location: {location}\n"
if description:
desc_preview = description[:1000]
if len(description) > 1000:
desc_preview += "..."
summary_content += f"Description: {desc_preview}\n"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(event_markdown)
# Create new document with PENDING status (visible in UI immediately)
document = Document(
search_space_id=search_space_id,
title=f"Calendar Event - {event_summary}",
title=event_summary,
document_type=DocumentType.GOOGLE_CALENDAR_CONNECTOR,
document_metadata={
"event_id": event_id,
@ -512,23 +421,133 @@ async def index_google_calendar_events(
"start_time": start_time,
"end_time": end_time,
"location": location,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"connector_id": connector_id,
},
content=summary_content,
content_hash=content_hash,
content="Pending...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
embedding=None,
chunks=[], # Empty at creation - safe for async
status=DocumentStatus.pending(), # Pending until processing starts
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
session.add(document)
documents_indexed += 1
logger.info(f"Successfully indexed new event {event_summary}")
new_documents_created = True
# Batch commit every 10 documents
events_to_process.append(
{
"document": document,
"is_new": True,
"event_markdown": event_markdown,
"content_hash": content_hash,
"event_id": event_id,
"event_summary": event_summary,
"calendar_id": calendar_id,
"start_time": start_time,
"end_time": end_time,
"location": location,
"description": description,
}
)
except Exception as e:
logger.error(f"Error in Phase 1 for event: {e!s}", exc_info=True)
documents_failed += 1
continue
# Commit all pending documents - they all appear in UI now
if new_documents_created:
logger.info(
f"Phase 1: Committing {len([e for e in events_to_process if e['is_new']])} pending documents"
)
await session.commit()
# =======================================================================
# PHASE 2: Process each document one by one
# Each document transitions: pending → processing → ready/failed
# =======================================================================
logger.info(f"Phase 2: Processing {len(events_to_process)} documents")
for item in events_to_process:
# Send heartbeat periodically
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time
document = item["document"]
try:
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
document.status = DocumentStatus.processing()
await session.commit()
# Heavy processing (LLM, embeddings, chunks)
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata_for_summary = {
"event_id": item["event_id"],
"event_summary": item["event_summary"],
"calendar_id": item["calendar_id"],
"start_time": item["start_time"],
"end_time": item["end_time"],
"location": item["location"] or "No location",
"document_type": "Google Calendar Event",
"connector_type": "Google Calendar",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
item["event_markdown"], user_llm, document_metadata_for_summary
)
else:
summary_content = (
f"Google Calendar Event: {item['event_summary']}\n\n"
)
summary_content += f"Calendar: {item['calendar_id']}\n"
summary_content += f"Start: {item['start_time']}\n"
summary_content += f"End: {item['end_time']}\n"
if item["location"]:
summary_content += f"Location: {item['location']}\n"
if item["description"]:
desc_preview = item["description"][:1000]
if len(item["description"]) > 1000:
desc_preview += "..."
summary_content += f"Description: {desc_preview}\n"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(item["event_markdown"])
# Update document to READY with actual content
document.title = item["event_summary"]
document.content = summary_content
document.content_hash = item["content_hash"]
document.embedding = summary_embedding
document.document_metadata = {
"event_id": item["event_id"],
"event_summary": item["event_summary"],
"calendar_id": item["calendar_id"],
"start_time": item["start_time"],
"end_time": item["end_time"],
"location": item["location"],
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"connector_id": connector_id,
}
safe_set_chunks(document, chunks)
document.updated_at = get_current_timestamp()
document.status = DocumentStatus.ready()
documents_indexed += 1
# Batch commit every 10 documents (for ready status updates)
if documents_indexed % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed} Google Calendar events processed so far"
@ -536,19 +555,20 @@ async def index_google_calendar_events(
await session.commit()
except Exception as e:
logger.error(
f"Error processing event {event.get('summary', 'Unknown')}: {e!s}",
exc_info=True,
)
skipped_events.append(
f"{event.get('summary', 'Unknown')} (processing error)"
)
documents_skipped += 1
logger.error(f"Error processing Calendar event: {e!s}", exc_info=True)
# Mark document as failed with reason (visible in UI)
try:
document.status = DocumentStatus.failed(str(e))
document.updated_at = get_current_timestamp()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
documents_failed += 1
continue
total_processed = documents_indexed
if total_processed > 0:
await update_connector_last_indexed(session, connector, update_last_indexed)
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
await update_connector_last_indexed(session, connector, update_last_indexed)
# Final commit for any remaining documents not yet committed in batches
logger.info(
@ -556,6 +576,9 @@ async def index_google_calendar_events(
)
try:
await session.commit()
logger.info(
"Successfully committed all Google Calendar document changes to database"
)
except Exception as e:
# Handle any remaining integrity errors gracefully (race conditions, etc.)
if (
@ -572,10 +595,15 @@ async def index_google_calendar_events(
else:
raise
# Build warning message if duplicates were found
warning_message = None
# Build warning message if there were issues
warning_parts = []
if duplicate_content_count > 0:
warning_message = f"{duplicate_content_count} skipped (duplicate)"
warning_parts.append(f"{duplicate_content_count} duplicate")
if documents_failed > 0:
warning_parts.append(f"{documents_failed} failed")
warning_message = ", ".join(warning_parts) if warning_parts else None
total_processed = documents_indexed
await task_logger.log_task_success(
log_entry,
@ -584,14 +612,15 @@ async def index_google_calendar_events(
"events_processed": total_processed,
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
"documents_failed": documents_failed,
"duplicate_content_count": duplicate_content_count,
"skipped_events_count": len(skipped_events),
},
)
logger.info(
f"Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped "
f"({duplicate_content_count} due to duplicate content from other connectors)"
f"Google Calendar indexing completed: {documents_indexed} ready, "
f"{documents_skipped} skipped, {documents_failed} failed "
f"({duplicate_content_count} duplicate content)"
)
return total_processed, warning_message

View file

@ -1,4 +1,9 @@
"""Google Drive indexer using Surfsense file processors."""
"""Google Drive indexer using Surfsense file processors.
Implements 2-phase document status updates for real-time UI feedback:
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
- Phase 2: Process each document: pending → processing → ready/failed
"""
import logging
import time
@ -17,11 +22,12 @@ from app.connectors.google_drive import (
get_files_in_folder,
get_start_page_token,
)
from app.db import DocumentType, SearchSourceConnectorType
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.services.task_logging_service import TaskLoggingService
from app.tasks.connector_indexers.base import (
check_document_by_unique_identifier,
get_connector_by_id,
get_current_timestamp,
update_connector_last_indexed,
)
from app.utils.document_converters import generate_unique_identifier_hash
@ -324,8 +330,29 @@ async def index_google_drive_single_file(
display_name = file_name or file.get("name", "Unknown")
logger.info(f"Indexing Google Drive file: {display_name} ({file_id})")
# Create pending document for status visibility
pending_doc, should_skip = await _create_pending_document_for_file(
session=session,
file=file,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
)
if should_skip:
await task_logger.log_task_progress(
log_entry,
f"File {display_name} is unchanged or not indexable",
{"status": "skipped"},
)
return 0, None
# Commit pending document so it appears in UI
if pending_doc and pending_doc.id is None:
await session.commit()
# Process the file
indexed, skipped = await _process_single_file(
indexed, skipped, failed = await _process_single_file(
drive_client=drive_client,
session=session,
file=file,
@ -334,6 +361,7 @@ async def index_google_drive_single_file(
user_id=user_id,
task_logger=task_logger,
log_entry=log_entry,
pending_document=pending_doc,
)
await session.commit()
@ -341,6 +369,15 @@ async def index_google_drive_single_file(
"Successfully committed Google Drive file indexing changes to database"
)
if failed > 0:
error_msg = f"Failed to index file {display_name}"
await task_logger.log_task_failure(
log_entry,
error_msg,
{"file_name": display_name, "file_id": file_id},
)
return 0, error_msg
if indexed > 0:
await task_logger.log_task_success(
log_entry,
@ -397,7 +434,12 @@ async def _index_full_scan(
include_subfolders: bool = False,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, int]:
"""Perform full scan indexing of a folder."""
"""Perform full scan indexing of a folder.
Implements 2-phase document status updates for real-time UI feedback:
- Phase 1: Collect all files and create pending documents (visible in UI immediately)
- Phase 2: Process each file: pending → processing → ready/failed
"""
await task_logger.log_task_progress(
log_entry,
f"Starting full scan of folder: {folder_name} (include_subfolders={include_subfolders})",
@ -410,29 +452,31 @@ async def _index_full_scan(
documents_indexed = 0
documents_skipped = 0
documents_failed = 0
files_processed = 0
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
# =======================================================================
# PHASE 1: Collect all files and create pending documents
# This makes ALL documents visible in the UI immediately with pending status
# =======================================================================
files_to_process = [] # List of (file, pending_document or None)
new_documents_created = False
# Queue of folders to process: (folder_id, folder_name)
folders_to_process = [(folder_id, folder_name)]
logger.info("Phase 1: Collecting files and creating pending documents")
while folders_to_process and files_processed < max_files:
# Check if it's time for a heartbeat update
if (
on_heartbeat_callback
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
):
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
current_folder_id, current_folder_name = folders_to_process.pop(0)
logger.info(f"Processing folder: {current_folder_name} ({current_folder_id})")
logger.info(f"Scanning folder: {current_folder_name} ({current_folder_id})")
page_token = None
while files_processed < max_files:
# Get files and folders in current folder
# include_subfolders=True here so we get folder items to queue them
files, next_token, error = await get_files_in_folder(
drive_client,
current_folder_id,
@ -462,35 +506,74 @@ async def _index_full_scan(
logger.debug(f"Queued subfolder: {file.get('name', 'Unknown')}")
continue
# Process the file
files_processed += 1
indexed, skipped = await _process_single_file(
drive_client=drive_client,
# Create pending document for this file
pending_doc, should_skip = await _create_pending_document_for_file(
session=session,
file=file,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
task_logger=task_logger,
log_entry=log_entry,
)
documents_indexed += indexed
documents_skipped += skipped
if should_skip:
documents_skipped += 1
continue
if documents_indexed % 10 == 0 and documents_indexed > 0:
await session.commit()
logger.info(
f"Committed batch: {documents_indexed} files indexed so far"
)
if pending_doc and pending_doc.id is None:
# New document was created
new_documents_created = True
files_to_process.append((file, pending_doc))
page_token = next_token
if not page_token:
break
# Commit all pending documents - they all appear in UI now
if new_documents_created:
logger.info(
f"Phase 1: Committing {len([f for f in files_to_process if f[1] and f[1].id is None])} pending documents"
)
await session.commit()
# =======================================================================
# PHASE 2: Process each file one by one
# Each document transitions: pending → processing → ready/failed
# =======================================================================
logger.info(f"Phase 2: Processing {len(files_to_process)} files")
for file, pending_doc in files_to_process:
# Check if it's time for a heartbeat update
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time
indexed, skipped, failed = await _process_single_file(
drive_client=drive_client,
session=session,
file=file,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
task_logger=task_logger,
log_entry=log_entry,
pending_document=pending_doc,
)
documents_indexed += indexed
documents_skipped += skipped
documents_failed += failed
if documents_indexed % 10 == 0 and documents_indexed > 0:
await session.commit()
logger.info(f"Committed batch: {documents_indexed} files indexed so far")
logger.info(
f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped"
f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped, {documents_failed} failed"
)
return documents_indexed, documents_skipped
@ -514,6 +597,10 @@ async def _index_with_delta_sync(
Note: include_subfolders is accepted for API consistency but delta sync
automatically tracks changes across all folders including subfolders.
Implements 2-phase document status updates for real-time UI feedback:
- Phase 1: Collect all changes and create pending documents (visible in UI immediately)
- Phase 2: Process each file: pending → processing → ready/failed
"""
await task_logger.log_task_progress(
log_entry,
@ -537,19 +624,21 @@ async def _index_with_delta_sync(
documents_indexed = 0
documents_skipped = 0
documents_failed = 0
files_processed = 0
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
# =======================================================================
# PHASE 1: Analyze changes and create pending documents for new/modified files
# =======================================================================
changes_to_process = [] # List of (change, file, pending_document or None)
new_documents_created = False
logger.info("Phase 1: Analyzing changes and creating pending documents")
for change in changes:
# Check if it's time for a heartbeat update
if (
on_heartbeat_callback
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
):
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
if files_processed >= max_files:
break
@ -566,7 +655,45 @@ async def _index_with_delta_sync(
if not file:
continue
indexed, skipped = await _process_single_file(
# Create pending document for this file
pending_doc, should_skip = await _create_pending_document_for_file(
session=session,
file=file,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
)
if should_skip:
documents_skipped += 1
continue
if pending_doc and pending_doc.id is None:
# New document was created
new_documents_created = True
changes_to_process.append((change, file, pending_doc))
# Commit all pending documents - they all appear in UI now
if new_documents_created:
logger.info("Phase 1: Committing pending documents")
await session.commit()
# =======================================================================
# PHASE 2: Process each file one by one
# Each document transitions: pending → processing → ready/failed
# =======================================================================
logger.info(f"Phase 2: Processing {len(changes_to_process)} changes")
for _, file, pending_doc in changes_to_process:
# Check if it's time for a heartbeat update
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time
indexed, skipped, failed = await _process_single_file(
drive_client=drive_client,
session=session,
file=file,
@ -575,21 +702,125 @@ async def _index_with_delta_sync(
user_id=user_id,
task_logger=task_logger,
log_entry=log_entry,
pending_document=pending_doc,
)
documents_indexed += indexed
documents_skipped += skipped
documents_failed += failed
if documents_indexed % 10 == 0 and documents_indexed > 0:
await session.commit()
logger.info(f"Committed batch: {documents_indexed} changes processed")
logger.info(
f"Delta sync complete: {documents_indexed} indexed, {documents_skipped} skipped"
f"Delta sync complete: {documents_indexed} indexed, {documents_skipped} skipped, {documents_failed} failed"
)
return documents_indexed, documents_skipped
async def _create_pending_document_for_file(
    session: AsyncSession,
    file: dict,
    connector_id: int,
    search_space_id: int,
    user_id: str,
) -> tuple[Document | None, bool]:
    """
    Resolve the document that Phase 2 should work on for one Google Drive file.

    Phase 1 of the 2-phase document status update pattern. The file is looked
    up by its unique identifier hash; if nothing is stored yet, a placeholder
    document with 'pending' status is added to the session. The session is
    not committed here.

    Args:
        session: Database session
        file: File metadata from Google Drive API
        connector_id: ID of the Drive connector
        search_space_id: ID of the search space
        user_id: ID of the user

    Returns:
        Tuple of (document, should_skip):
        - (None, True): nothing to process - the MIME type is rejected by
          should_skip_file, the file has no ID, or the stored document's
          content is unchanged
        - (existing_doc, False): stored document whose content changed (or
          could not be compared) and therefore needs an update
        - (new_pending_doc, False): placeholder document just added to the session
    """
    from app.connectors.google_drive.file_types import should_skip_file

    drive_file_id = file.get("id")
    drive_file_name = file.get("name", "Unknown")
    drive_mime_type = file.get("mimeType", "")

    # Bail out early: skippable MIME type first, then entries without an ID.
    if should_skip_file(drive_mime_type) or not drive_file_id:
        return None, True

    # The hash doubles as the lookup key and, for new rows, as a temporary
    # content_hash.
    identifier_hash = generate_unique_identifier_hash(
        DocumentType.GOOGLE_DRIVE_FILE, drive_file_id, search_space_id
    )
    known_document = await check_document_by_unique_identifier(
        session, identifier_hash
    )

    if not known_document:
        # First sighting of this file: register a placeholder row.
        placeholder_metadata = {
            "google_drive_file_id": drive_file_id,
            "google_drive_file_name": drive_file_name,
            "google_drive_mime_type": drive_mime_type,
            "connector_id": connector_id,
        }
        fresh_document = Document(
            search_space_id=search_space_id,
            title=drive_file_name,
            document_type=DocumentType.GOOGLE_DRIVE_FILE,
            document_metadata=placeholder_metadata,
            content="Pending...",  # Placeholder until processed
            content_hash=identifier_hash,  # Temporary unique value - updated when ready
            unique_identifier_hash=identifier_hash,
            embedding=None,
            chunks=[],  # Empty at creation
            status=DocumentStatus.pending(),  # Pending until processing starts
            updated_at=get_current_timestamp(),
            created_by_id=user_id,
            connector_id=connector_id,
        )
        session.add(fresh_document)
        return fresh_document, False

    # Compare the incoming fingerprint against what was stored last time.
    new_md5 = file.get("md5Checksum")
    new_modified_time = file.get("modifiedTime")
    saved_metadata = known_document.document_metadata or {}
    saved_md5 = saved_metadata.get("md5_checksum")
    saved_modified_time = saved_metadata.get("modified_time")

    if new_md5 and saved_md5:
        is_unchanged = new_md5 == saved_md5
    elif not new_md5 and new_modified_time and saved_modified_time:
        # No incoming checksum (Google Workspace file) - fall back to modifiedTime
        is_unchanged = new_modified_time == saved_modified_time
    else:
        # Not enough information to compare: treat as changed.
        is_unchanged = False

    if not is_unchanged:
        # Content changed - hand the stored document back for an update
        return known_document, False

    # Unchanged content: repair a status left in pending/processing, then skip.
    already_ready = DocumentStatus.is_state(
        known_document.status, DocumentStatus.READY
    )
    if not already_ready:
        known_document.status = DocumentStatus.ready()
    return None, True
async def _check_rename_only_update(
session: AsyncSession,
file: dict,
@ -725,15 +956,31 @@ async def _process_single_file(
user_id: str,
task_logger: TaskLoggingService,
log_entry: any,
) -> tuple[int, int]:
pending_document: Document | None = None,
) -> tuple[int, int, int]:
"""
Process a single file by downloading and using Surfsense's file processor.
Implements Phase 2 of the 2-phase document status update pattern.
Updates document status: pending → processing → ready/failed
Args:
drive_client: Google Drive client
session: Database session
file: File metadata from Google Drive API
connector_id: ID of the connector
search_space_id: ID of the search space
user_id: ID of the user
task_logger: Task logging service
log_entry: Log entry for tracking
pending_document: Optional pending document created in Phase 1
Returns:
Tuple of (indexed_count, skipped_count)
Tuple of (indexed_count, skipped_count, failed_count)
"""
file_name = file.get("name", "Unknown")
mime_type = file.get("mimeType", "")
file_id = file.get("id")
try:
logger.info(f"Processing file: {file_name} ({mime_type})")
@ -756,10 +1003,15 @@ async def _process_single_file(
# Return 1 for renamed files (they are "indexed" in the sense that they're updated)
# Return 0 for unchanged files
if "renamed" in (rename_message or "").lower():
return 1, 0
return 0, 1
return 1, 0, 0
return 0, 1, 0
_, error, _ = await download_and_process_file(
# Set document to PROCESSING status if we have a pending document
if pending_document:
pending_document.status = DocumentStatus.processing()
await session.commit()
_, error, metadata = await download_and_process_file(
client=drive_client,
file=file,
search_space_id=search_space_id,
@ -776,14 +1028,46 @@ async def _process_single_file(
f"Skipped {file_name}: {error}",
{"status": "skipped", "reason": error},
)
return 0, 1
# Mark pending document as failed if it exists
if pending_document:
pending_document.status = DocumentStatus.failed(error)
pending_document.updated_at = get_current_timestamp()
await session.commit()
return 0, 1, 0
# The document was created/updated by download_and_process_file
# Find the document and ensure it has READY status
if file_id:
unique_identifier_hash = generate_unique_identifier_hash(
DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
)
processed_doc = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
# Ensure status is READY
if processed_doc and not DocumentStatus.is_state(
processed_doc.status, DocumentStatus.READY
):
processed_doc.status = DocumentStatus.ready()
processed_doc.updated_at = get_current_timestamp()
await session.commit()
logger.info(f"Successfully indexed Google Drive file: {file_name}")
return 1, 0
return 1, 0, 0
except Exception as e:
logger.error(f"Error processing file {file_name}: {e!s}", exc_info=True)
return 0, 1
# Mark pending document as failed if it exists
if pending_document:
try:
pending_document.status = DocumentStatus.failed(str(e))
pending_document.updated_at = get_current_timestamp()
await session.commit()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
return 0, 0, 1
async def _remove_document(session: AsyncSession, file_id: str, search_space_id: int):

View file

@ -1,5 +1,9 @@
"""
Google Gmail connector indexer.
Implements 2-phase document status updates for real-time UI feedback:
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
- Phase 2: Process each document: pending → processing → ready/failed
"""
import time
@ -13,6 +17,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.connectors.google_gmail_connector import GoogleGmailConnector
from app.db import (
Document,
DocumentStatus,
DocumentType,
SearchSourceConnectorType,
)
@ -32,6 +37,7 @@ from .base import (
get_connector_by_id,
get_current_timestamp,
logger,
safe_set_chunks,
update_connector_last_indexed,
)
@ -220,20 +226,23 @@ async def index_google_gmail_messages(
logger.info(f"Found {len(messages)} Google gmail messages to index")
documents_indexed = 0
skipped_messages = []
documents_skipped = 0
documents_failed = 0 # Track messages that failed processing
duplicate_content_count = (
0 # Track messages skipped due to duplicate content_hash
)
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
# =======================================================================
# PHASE 1: Analyze all messages, create pending documents
# This makes ALL documents visible in the UI immediately with pending status
# =======================================================================
messages_to_process = [] # List of dicts with document and message data
new_documents_created = False
for message in messages:
# Check if it's time for a heartbeat update
if (
on_heartbeat_callback
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
):
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
try:
# Extract message information
message_id = message.get("id", "")
@ -259,7 +268,6 @@ async def index_google_gmail_messages(
if not message_id:
logger.warning(f"Skipping message with missing ID: {subject}")
skipped_messages.append(f"{subject} (missing ID)")
documents_skipped += 1
continue
@ -268,7 +276,6 @@ async def index_google_gmail_messages(
if not markdown_content.strip():
logger.warning(f"Skipping message with no content: {subject}")
skipped_messages.append(f"{subject} (no content)")
documents_skipped += 1
continue
@ -288,68 +295,29 @@ async def index_google_gmail_messages(
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
logger.info(
f"Document for Gmail message {subject} unchanged. Skipping."
)
# Ensure status is ready (might have been stuck in processing/pending)
if not DocumentStatus.is_state(
existing_document.status, DocumentStatus.READY
):
existing_document.status = DocumentStatus.ready()
documents_skipped += 1
continue
else:
# Content has changed - update the existing document
logger.info(
f"Content changed for Gmail message {subject}. Updating document."
)
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"message_id": message_id,
"thread_id": thread_id,
"subject": subject,
"sender": sender,
"date": date_str,
"document_type": "Gmail Message",
"connector_type": "Google Gmail",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
markdown_content, user_llm, document_metadata
)
else:
summary_content = f"Google Gmail Message: {subject}\n\n"
summary_content += f"Sender: {sender}\n"
summary_content += f"Date: {date_str}\n"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
# Process chunks
chunks = await create_document_chunks(markdown_content)
# Update existing document
existing_document.title = f"Gmail: {subject}"
existing_document.content = summary_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = {
# Queue existing document for update (will be set to processing in Phase 2)
messages_to_process.append(
{
"document": existing_document,
"is_new": False,
"markdown_content": markdown_content,
"content_hash": content_hash,
"message_id": message_id,
"thread_id": thread_id,
"subject": subject,
"sender": sender,
"date": date_str,
"connector_id": connector_id,
"date_str": date_str,
}
existing_document.chunks = chunks
existing_document.updated_at = get_current_timestamp()
documents_indexed += 1
logger.info(f"Successfully updated Gmail message {subject}")
continue
)
continue
# Document doesn't exist by unique_identifier_hash
# Check if a document with the same content_hash exists (from another connector)
@ -364,48 +332,14 @@ async def index_google_gmail_messages(
f"(existing document ID: {duplicate_by_content.id}, "
f"type: {duplicate_by_content.document_type}). Skipping."
)
duplicate_content_count += 1
documents_skipped += 1
continue
# Document doesn't exist - create new one
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"message_id": message_id,
"thread_id": thread_id,
"subject": subject,
"sender": sender,
"date": date_str,
"document_type": "Gmail Message",
"connector_type": "Google Gmail",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
markdown_content, user_llm, document_metadata
)
else:
# Fallback to simple summary if no LLM configured
summary_content = f"Google Gmail Message: {subject}\n\n"
summary_content += f"Sender: {sender}\n"
summary_content += f"Date: {date_str}\n"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
# Process chunks
chunks = await create_document_chunks(markdown_content)
# Create and store new document
logger.info(f"Creating new document for Gmail message: {subject}")
# Create new document with PENDING status (visible in UI immediately)
document = Document(
search_space_id=search_space_id,
title=f"Gmail: {subject}",
title=subject,
document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
document_metadata={
"message_id": message_id,
@ -413,21 +347,120 @@ async def index_google_gmail_messages(
"subject": subject,
"sender": sender,
"date": date_str,
"connector_id": connector_id,
},
content=summary_content,
content_hash=content_hash,
content="Pending...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
embedding=None,
chunks=[], # Empty at creation - safe for async
status=DocumentStatus.pending(), # Pending until processing starts
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
session.add(document)
documents_indexed += 1
logger.info(f"Successfully indexed new email {summary_content}")
new_documents_created = True
# Batch commit every 10 documents
messages_to_process.append(
{
"document": document,
"is_new": True,
"markdown_content": markdown_content,
"content_hash": content_hash,
"message_id": message_id,
"thread_id": thread_id,
"subject": subject,
"sender": sender,
"date_str": date_str,
}
)
except Exception as e:
logger.error(f"Error in Phase 1 for message: {e!s}", exc_info=True)
documents_failed += 1
continue
# Commit all pending documents - they all appear in UI now
if new_documents_created:
logger.info(
f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents"
)
await session.commit()
# =======================================================================
# PHASE 2: Process each document one by one
# Each document transitions: pending → processing → ready/failed
# =======================================================================
logger.info(f"Phase 2: Processing {len(messages_to_process)} documents")
for item in messages_to_process:
# Send heartbeat periodically
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time
document = item["document"]
try:
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
document.status = DocumentStatus.processing()
await session.commit()
# Heavy processing (LLM, embeddings, chunks)
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata_for_summary = {
"message_id": item["message_id"],
"thread_id": item["thread_id"],
"subject": item["subject"],
"sender": item["sender"],
"date": item["date_str"],
"document_type": "Gmail Message",
"connector_type": "Google Gmail",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
item["markdown_content"],
user_llm,
document_metadata_for_summary,
)
else:
summary_content = f"Google Gmail Message: {item['subject']}\n\n"
summary_content += f"Sender: {item['sender']}\n"
summary_content += f"Date: {item['date_str']}\n"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(item["markdown_content"])
# Update document to READY with actual content
document.title = item["subject"]
document.content = summary_content
document.content_hash = item["content_hash"]
document.embedding = summary_embedding
document.document_metadata = {
"message_id": item["message_id"],
"thread_id": item["thread_id"],
"subject": item["subject"],
"sender": item["sender"],
"date": item["date_str"],
"connector_id": connector_id,
}
safe_set_chunks(document, chunks)
document.updated_at = get_current_timestamp()
document.status = DocumentStatus.ready()
documents_indexed += 1
# Batch commit every 10 documents (for ready status updates)
if documents_indexed % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed} Gmail messages processed so far"
@ -435,45 +468,76 @@ async def index_google_gmail_messages(
await session.commit()
except Exception as e:
logger.error(
f"Error processing the email {message_id}: {e!s}",
exc_info=True,
)
skipped_messages.append(f"{subject} (processing error)")
documents_skipped += 1
continue # Skip this message and continue with others
logger.error(f"Error processing Gmail message: {e!s}", exc_info=True)
# Mark document as failed with reason (visible in UI)
try:
document.status = DocumentStatus.failed(str(e))
document.updated_at = get_current_timestamp()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
documents_failed += 1
continue
# Update the last_indexed_at timestamp for the connector only if requested
total_processed = documents_indexed
if total_processed > 0:
await update_connector_last_indexed(session, connector, update_last_indexed)
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
await update_connector_last_indexed(session, connector, update_last_indexed)
# Final commit for any remaining documents not yet committed in batches
logger.info(f"Final commit: Total {documents_indexed} Gmail messages processed")
await session.commit()
logger.info(
"Successfully committed all Google gmail document changes to database"
)
try:
await session.commit()
logger.info(
"Successfully committed all Google Gmail document changes to database"
)
except Exception as e:
# Handle any remaining integrity errors gracefully (race conditions, etc.)
if (
"duplicate key value violates unique constraint" in str(e).lower()
or "uniqueviolationerror" in str(e).lower()
):
logger.warning(
f"Duplicate content_hash detected during final commit. "
f"This may occur if the same message was indexed by multiple connectors. "
f"Rolling back and continuing. Error: {e!s}"
)
await session.rollback()
# Don't fail the entire task - some documents may have been successfully indexed
else:
raise
# Build warning message if there were issues
warning_parts = []
if duplicate_content_count > 0:
warning_parts.append(f"{duplicate_content_count} duplicate")
if documents_failed > 0:
warning_parts.append(f"{documents_failed} failed")
warning_message = ", ".join(warning_parts) if warning_parts else None
total_processed = documents_indexed
# Log success
await task_logger.log_task_success(
log_entry,
f"Successfully completed Google gmail indexing for connector {connector_id}",
f"Successfully completed Google Gmail indexing for connector {connector_id}",
{
"events_processed": total_processed,
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
"skipped_messages_count": len(skipped_messages),
"documents_failed": documents_failed,
"duplicate_content_count": duplicate_content_count,
},
)
logger.info(
f"Google gmail indexing completed: {documents_indexed} new emails, {documents_skipped} skipped"
f"Google Gmail indexing completed: {documents_indexed} ready, "
f"{documents_skipped} skipped, {documents_failed} failed "
f"({duplicate_content_count} duplicate content)"
)
return (
total_processed,
None,
) # Return None as the error message to indicate success
warning_message,
) # Return warning_message (None on success)
except SQLAlchemyError as db_error:
await session.rollback()

View file

@ -1,5 +1,9 @@
"""
Jira connector indexer.
Provides real-time document status updates during indexing using a two-phase approach:
- Phase 1: Create all documents with PENDING status (visible in UI immediately)
- Phase 2: Process each document one by one (PENDING → PROCESSING → READY/FAILED)
"""
import contextlib
@ -12,7 +16,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.jira_history import JiraHistoryConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
@ -29,6 +33,7 @@ from .base import (
get_connector_by_id,
get_current_timestamp,
logger,
safe_set_chunks,
update_connector_last_indexed,
)
@ -174,22 +179,22 @@ async def index_jira_issues(
logger.error(f"Error fetching Jira issues: {e!s}", exc_info=True)
return 0, f"Error fetching Jira issues: {e!s}"
# Process and index each issue
# =======================================================================
# PHASE 1: Analyze all issues, create pending documents
# This makes ALL documents visible in the UI immediately with pending status
# =======================================================================
documents_indexed = 0
skipped_issues = []
documents_skipped = 0
documents_failed = 0
duplicate_content_count = 0
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
issues_to_process = [] # List of dicts with document and issue data
new_documents_created = False
for issue in issues:
# Check if it's time for a heartbeat update
if (
on_heartbeat_callback
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
):
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
try:
issue_id = issue.get("key")
issue_identifier = issue.get("key", "")
@ -199,9 +204,6 @@ async def index_jira_issues(
logger.warning(
f"Skipping issue with missing ID or title: {issue_id or 'Unknown'}"
)
skipped_issues.append(
f"{issue_identifier or 'Unknown'} (missing data)"
)
documents_skipped += 1
continue
@ -215,7 +217,6 @@ async def index_jira_issues(
logger.warning(
f"Skipping issue with no content: {issue_identifier} - {issue_title}"
)
skipped_issues.append(f"{issue_identifier} (no content)")
documents_skipped += 1
continue
@ -237,73 +238,29 @@ async def index_jira_issues(
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
logger.info(
f"Document for Jira issue {issue_identifier} unchanged. Skipping."
)
# Ensure status is ready (might have been stuck in processing/pending)
if not DocumentStatus.is_state(
existing_document.status, DocumentStatus.READY
):
existing_document.status = DocumentStatus.ready()
documents_skipped += 1
continue
else:
# Content has changed - update the existing document
logger.info(
f"Content changed for Jira issue {issue_identifier}. Updating document."
)
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"issue_key": issue_identifier,
"issue_title": issue_title,
"status": formatted_issue.get("status", "Unknown"),
"priority": formatted_issue.get("priority", "Unknown"),
"comment_count": comment_count,
"document_type": "Jira Issue",
"connector_type": "Jira",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
issue_content, user_llm, document_metadata
)
else:
summary_content = f"Jira Issue {issue_identifier}: {issue_title}\n\nStatus: {formatted_issue.get('status', 'Unknown')}\n\n"
if formatted_issue.get("description"):
summary_content += f"Description: {formatted_issue.get('description')}\n\n"
summary_content += f"Comments: {comment_count}"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
# Process chunks
chunks = await create_document_chunks(issue_content)
# Update existing document
existing_document.title = (
f"Jira - {issue_identifier}: {issue_title}"
)
existing_document.content = summary_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = {
# Queue existing document for update (will be set to processing in Phase 2)
issues_to_process.append(
{
"document": existing_document,
"is_new": False,
"issue_content": issue_content,
"content_hash": content_hash,
"issue_id": issue_id,
"issue_identifier": issue_identifier,
"issue_title": issue_title,
"state": formatted_issue.get("status", "Unknown"),
"formatted_issue": formatted_issue,
"comment_count": comment_count,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
}
existing_document.chunks = chunks
existing_document.updated_at = get_current_timestamp()
documents_indexed += 1
logger.info(
f"Successfully updated Jira issue {issue_identifier}"
)
continue
)
continue
# Document doesn't exist by unique_identifier_hash
# Check if a document with the same content_hash exists (from another connector)
@ -318,53 +275,14 @@ async def index_jira_issues(
f"(existing document ID: {duplicate_by_content.id}, "
f"type: {duplicate_by_content.document_type}). Skipping."
)
duplicate_content_count += 1
documents_skipped += 1
continue
# Document doesn't exist - create new one
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"issue_key": issue_identifier,
"issue_title": issue_title,
"status": formatted_issue.get("status", "Unknown"),
"priority": formatted_issue.get("priority", "Unknown"),
"comment_count": comment_count,
"document_type": "Jira Issue",
"connector_type": "Jira",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
issue_content, user_llm, document_metadata
)
else:
# Fallback to simple summary if no LLM configured
summary_content = f"Jira Issue {issue_identifier}: {issue_title}\n\nStatus: {formatted_issue.get('status', 'Unknown')}\n\n"
if formatted_issue.get("description"):
summary_content += (
f"Description: {formatted_issue.get('description')}\n\n"
)
summary_content += f"Comments: {comment_count}"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
# Process chunks - using the full issue content with comments
chunks = await create_document_chunks(issue_content)
# Create and store new document
logger.info(
f"Creating new document for issue {issue_identifier} - {issue_title}"
)
# Create new document with PENDING status (visible in UI immediately)
document = Document(
search_space_id=search_space_id,
title=f"Jira - {issue_identifier}: {issue_title}",
title=f"{issue_identifier}: {issue_title}",
document_type=DocumentType.JIRA_CONNECTOR,
document_metadata={
"issue_id": issue_id,
@ -372,25 +290,122 @@ async def index_jira_issues(
"issue_title": issue_title,
"state": formatted_issue.get("status", "Unknown"),
"comment_count": comment_count,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"connector_id": connector_id,
},
content=summary_content,
content_hash=content_hash,
content="Pending...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
embedding=None,
chunks=[], # Empty at creation - safe for async
status=DocumentStatus.pending(), # Pending until processing starts
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
session.add(document)
documents_indexed += 1
logger.info(
f"Successfully indexed new issue {issue_identifier} - {issue_title}"
new_documents_created = True
issues_to_process.append(
{
"document": document,
"is_new": True,
"issue_content": issue_content,
"content_hash": content_hash,
"issue_id": issue_id,
"issue_identifier": issue_identifier,
"issue_title": issue_title,
"formatted_issue": formatted_issue,
"comment_count": comment_count,
}
)
# Batch commit every 10 documents
except Exception as e:
logger.error(f"Error in Phase 1 for issue: {e!s}", exc_info=True)
documents_failed += 1
continue
# Commit all pending documents - they all appear in UI now
if new_documents_created:
logger.info(
f"Phase 1: Committing {len([i for i in issues_to_process if i['is_new']])} pending documents"
)
await session.commit()
# =======================================================================
# PHASE 2: Process each document one by one
# Each document transitions: pending → processing → ready/failed
# =======================================================================
logger.info(f"Phase 2: Processing {len(issues_to_process)} documents")
for item in issues_to_process:
# Send heartbeat periodically
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time
document = item["document"]
try:
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
document.status = DocumentStatus.processing()
await session.commit()
# Heavy processing (LLM, embeddings, chunks)
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"issue_key": item["issue_identifier"],
"issue_title": item["issue_title"],
"status": item["formatted_issue"].get("status", "Unknown"),
"priority": item["formatted_issue"].get("priority", "Unknown"),
"comment_count": item["comment_count"],
"document_type": "Jira Issue",
"connector_type": "Jira",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
item["issue_content"], user_llm, document_metadata
)
else:
# Fallback to simple summary if no LLM configured
summary_content = f"Jira Issue {item['issue_identifier']}: {item['issue_title']}\n\nStatus: {item['formatted_issue'].get('status', 'Unknown')}\n\n"
if item["formatted_issue"].get("description"):
summary_content += f"Description: {item['formatted_issue'].get('description')}\n\n"
summary_content += f"Comments: {item['comment_count']}"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
# Process chunks - using the full issue content with comments
chunks = await create_document_chunks(item["issue_content"])
# Update document to READY with actual content
document.title = f"{item['issue_identifier']}: {item['issue_title']}"
document.content = summary_content
document.content_hash = item["content_hash"]
document.embedding = summary_embedding
document.document_metadata = {
"issue_id": item["issue_id"],
"issue_identifier": item["issue_identifier"],
"issue_title": item["issue_title"],
"state": item["formatted_issue"].get("status", "Unknown"),
"comment_count": item["comment_count"],
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"connector_id": connector_id,
}
safe_set_chunks(document, chunks)
document.updated_at = get_current_timestamp()
document.status = DocumentStatus.ready()
documents_indexed += 1
# Batch commit every 10 documents (for ready status updates)
if documents_indexed % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed} Jira issues processed so far"
@ -399,48 +414,75 @@ async def index_jira_issues(
except Exception as e:
logger.error(
f"Error processing issue {issue.get('identifier', 'Unknown')}: {e!s}",
f"Error processing issue {item.get('issue_identifier', 'Unknown')}: {e!s}",
exc_info=True,
)
skipped_issues.append(
f"{issue.get('identifier', 'Unknown')} (processing error)"
)
documents_skipped += 1
# Mark document as failed with reason (visible in UI)
try:
document.status = DocumentStatus.failed(str(e))
document.updated_at = get_current_timestamp()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
documents_failed += 1
continue # Skip this issue and continue with others
# Update the last_indexed_at timestamp for the connector only if requested
total_processed = documents_indexed
if update_last_indexed:
await update_connector_last_indexed(session, connector, update_last_indexed)
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
# This ensures the UI shows "Last indexed" instead of "Never indexed"
await update_connector_last_indexed(session, connector, update_last_indexed)
# Final commit for any remaining documents not yet committed in batches
# Final commit to ensure all documents are persisted (safety net)
logger.info(f"Final commit: Total {documents_indexed} Jira issues processed")
await session.commit()
logger.info("Successfully committed all JIRA document changes to database")
try:
await session.commit()
logger.info("Successfully committed all JIRA document changes to database")
except Exception as e:
# Handle any remaining integrity errors gracefully (race conditions, etc.)
if (
"duplicate key value violates unique constraint" in str(e).lower()
or "uniqueviolationerror" in str(e).lower()
):
logger.warning(
f"Duplicate content_hash detected during final commit. "
f"This may occur if the same issue was indexed by multiple connectors. "
f"Rolling back and continuing. Error: {e!s}"
)
await session.rollback()
# Don't fail the entire task - some documents may have been successfully indexed
else:
raise
# Build warning message if there were issues
warning_parts = []
if duplicate_content_count > 0:
warning_parts.append(f"{duplicate_content_count} duplicate")
if documents_failed > 0:
warning_parts.append(f"{documents_failed} failed")
warning_message = ", ".join(warning_parts) if warning_parts else None
# Log success
await task_logger.log_task_success(
log_entry,
f"Successfully completed JIRA indexing for connector {connector_id}",
{
"issues_processed": total_processed,
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
"skipped_issues_count": len(skipped_issues),
"documents_failed": documents_failed,
"duplicate_content_count": duplicate_content_count,
},
)
logger.info(
f"JIRA indexing completed: {documents_indexed} new issues, {documents_skipped} skipped"
f"JIRA indexing completed: {documents_indexed} ready, "
f"{documents_skipped} skipped, {documents_failed} failed "
f"({duplicate_content_count} duplicate content)"
)
# Clean up the connector
await jira_client.close()
return (
total_processed,
None,
) # Return None as the error message to indicate success
return documents_indexed, warning_message
except SQLAlchemyError as db_error:
await session.rollback()

View file

@ -1,5 +1,9 @@
"""
Linear connector indexer.
Implements 2-phase document status updates for real-time UI feedback:
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
- Phase 2: Process each document: pending → processing → ready/failed
"""
import time
@ -11,7 +15,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.linear_connector import LinearConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
@ -28,6 +32,7 @@ from .base import (
get_connector_by_id,
get_current_timestamp,
logger,
safe_set_chunks,
update_connector_last_indexed,
)
@ -196,6 +201,7 @@ async def index_linear_issues(
# Track the number of documents indexed
documents_indexed = 0
documents_skipped = 0
documents_failed = 0 # Track issues that failed processing
skipped_issues = []
# Heartbeat tracking - update notification periodically to prevent appearing stuck
@ -207,16 +213,14 @@ async def index_linear_issues(
{"stage": "process_issues", "total_issues": len(issues)},
)
# Process each issue
for issue in issues:
# Check if it's time for a heartbeat update
if (
on_heartbeat_callback
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
):
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
# =======================================================================
# PHASE 1: Analyze all issues, create pending documents
# This makes ALL documents visible in the UI immediately with pending status
# =======================================================================
issues_to_process = [] # List of dicts with document and issue data
new_documents_created = False
for issue in issues:
try:
issue_id = issue.get("id", "")
issue_identifier = issue.get("identifier", "")
@ -262,80 +266,39 @@ async def index_linear_issues(
state = formatted_issue.get("state", "Unknown")
description = formatted_issue.get("description", "")
comment_count = len(formatted_issue.get("comments", []))
priority = formatted_issue.get("priority", "Unknown")
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
# Ensure status is ready (might have been stuck in processing/pending)
if not DocumentStatus.is_state(
existing_document.status, DocumentStatus.READY
):
existing_document.status = DocumentStatus.ready()
logger.info(
f"Document for Linear issue {issue_identifier} unchanged. Skipping."
)
documents_skipped += 1
continue
else:
# Content has changed - update the existing document
logger.info(
f"Content changed for Linear issue {issue_identifier}. Updating document."
)
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"issue_id": issue_identifier,
"issue_title": issue_title,
"state": state,
"priority": formatted_issue.get("priority", "Unknown"),
"comment_count": comment_count,
"document_type": "Linear Issue",
"connector_type": "Linear",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
issue_content, user_llm, document_metadata
)
else:
# Fallback to simple summary if no LLM configured
if description and len(description) > 1000:
description = description[:997] + "..."
summary_content = f"Linear Issue {issue_identifier}: {issue_title}\n\nStatus: {state}\n\n"
if description:
summary_content += f"Description: {description}\n\n"
summary_content += f"Comments: {comment_count}"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
# Process chunks
chunks = await create_document_chunks(issue_content)
# Update existing document
existing_document.title = (
f"Linear - {issue_identifier}: {issue_title}"
)
existing_document.content = summary_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = {
# Queue existing document for update (will be set to processing in Phase 2)
issues_to_process.append(
{
"document": existing_document,
"is_new": False,
"issue_content": issue_content,
"content_hash": content_hash,
"issue_id": issue_id,
"issue_identifier": issue_identifier,
"issue_title": issue_title,
"state": state,
"description": description,
"comment_count": comment_count,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"priority": priority,
}
existing_document.chunks = chunks
existing_document.updated_at = get_current_timestamp()
documents_indexed += 1
logger.info(
f"Successfully updated Linear issue {issue_identifier}"
)
continue
)
continue
# Document doesn't exist by unique_identifier_hash
# Check if a document with the same content_hash exists (from another connector)
@ -353,51 +316,10 @@ async def index_linear_issues(
documents_skipped += 1
continue
# Document doesn't exist - create new one
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"issue_id": issue_identifier,
"issue_title": issue_title,
"state": state,
"priority": formatted_issue.get("priority", "Unknown"),
"comment_count": comment_count,
"document_type": "Linear Issue",
"connector_type": "Linear",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
issue_content, user_llm, document_metadata
)
else:
# Fallback to simple summary if no LLM configured
# Truncate description if it's too long for the summary
if description and len(description) > 1000:
description = description[:997] + "..."
summary_content = f"Linear Issue {issue_identifier}: {issue_title}\n\nStatus: {state}\n\n"
if description:
summary_content += f"Description: {description}\n\n"
summary_content += f"Comments: {comment_count}"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
# Process chunks - using the full issue content with comments
chunks = await create_document_chunks(issue_content)
# Create and store new document
logger.info(
f"Creating new document for issue {issue_identifier} - {issue_title}"
)
# Create new document with PENDING status (visible in UI immediately)
document = Document(
search_space_id=search_space_id,
title=f"Linear - {issue_identifier}: {issue_title}",
title=f"{issue_identifier}: {issue_title}",
document_type=DocumentType.LINEAR_CONNECTOR,
document_metadata={
"issue_id": issue_id,
@ -405,25 +327,126 @@ async def index_linear_issues(
"issue_title": issue_title,
"state": state,
"comment_count": comment_count,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"connector_id": connector_id,
},
content=summary_content,
content_hash=content_hash,
content="Pending...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
embedding=None,
chunks=[], # Empty at creation - safe for async
status=DocumentStatus.pending(), # Pending until processing starts
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
session.add(document)
documents_indexed += 1
logger.info(
f"Successfully indexed new issue {issue_identifier} - {issue_title}"
new_documents_created = True
issues_to_process.append(
{
"document": document,
"is_new": True,
"issue_content": issue_content,
"content_hash": content_hash,
"issue_id": issue_id,
"issue_identifier": issue_identifier,
"issue_title": issue_title,
"state": state,
"description": description,
"comment_count": comment_count,
"priority": priority,
}
)
# Batch commit every 10 documents
except Exception as e:
logger.error(f"Error in Phase 1 for issue: {e!s}", exc_info=True)
documents_failed += 1
continue
# Commit all pending documents - they all appear in UI now
if new_documents_created:
logger.info(
f"Phase 1: Committing {len([i for i in issues_to_process if i['is_new']])} pending documents"
)
await session.commit()
# =======================================================================
# PHASE 2: Process each document one by one
# Each document transitions: pending → processing → ready/failed
# =======================================================================
logger.info(f"Phase 2: Processing {len(issues_to_process)} documents")
for item in issues_to_process:
# Send heartbeat periodically
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time
document = item["document"]
try:
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
document.status = DocumentStatus.processing()
await session.commit()
# Heavy processing (LLM, embeddings, chunks)
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata_for_summary = {
"issue_id": item["issue_identifier"],
"issue_title": item["issue_title"],
"state": item["state"],
"priority": item["priority"],
"comment_count": item["comment_count"],
"document_type": "Linear Issue",
"connector_type": "Linear",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
item["issue_content"], user_llm, document_metadata_for_summary
)
else:
# Fallback to simple summary if no LLM configured
description = item["description"]
if description and len(description) > 1000:
description = description[:997] + "..."
summary_content = f"Linear Issue {item['issue_identifier']}: {item['issue_title']}\n\nStatus: {item['state']}\n\n"
if description:
summary_content += f"Description: {description}\n\n"
summary_content += f"Comments: {item['comment_count']}"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(item["issue_content"])
# Update document to READY with actual content
document.title = f"{item['issue_identifier']}: {item['issue_title']}"
document.content = summary_content
document.content_hash = item["content_hash"]
document.embedding = summary_embedding
document.document_metadata = {
"issue_id": item["issue_id"],
"issue_identifier": item["issue_identifier"],
"issue_title": item["issue_title"],
"state": item["state"],
"comment_count": item["comment_count"],
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"connector_id": connector_id,
}
safe_set_chunks(document, chunks)
document.updated_at = get_current_timestamp()
document.status = DocumentStatus.ready()
documents_indexed += 1
# Batch commit every 10 documents (for ready status updates)
if documents_indexed % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed} Linear issues processed so far"
@ -432,44 +455,72 @@ async def index_linear_issues(
except Exception as e:
logger.error(
f"Error processing issue {issue.get('identifier', 'Unknown')}: {e!s}",
f"Error processing issue {item.get('issue_identifier', 'Unknown')}: {e!s}",
exc_info=True,
)
# Mark document as failed with reason (visible in UI)
try:
document.status = DocumentStatus.failed(str(e))
document.updated_at = get_current_timestamp()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
skipped_issues.append(
f"{issue.get('identifier', 'Unknown')} (processing error)"
f"{item.get('issue_identifier', 'Unknown')} (processing error)"
)
documents_skipped += 1
continue # Skip this issue and continue with others
documents_failed += 1
continue
# Update the last_indexed_at timestamp for the connector only if requested
total_processed = documents_indexed
if update_last_indexed:
await update_connector_last_indexed(session, connector, update_last_indexed)
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
await update_connector_last_indexed(session, connector, update_last_indexed)
# Final commit for any remaining documents not yet committed in batches
logger.info(f"Final commit: Total {documents_indexed} Linear issues processed")
await session.commit()
logger.info("Successfully committed all Linear document changes to database")
try:
await session.commit()
logger.info(
"Successfully committed all Linear document changes to database"
)
except Exception as e:
# Handle any remaining integrity errors gracefully (race conditions, etc.)
if (
"duplicate key value violates unique constraint" in str(e).lower()
or "uniqueviolationerror" in str(e).lower()
):
logger.warning(
f"Duplicate content_hash detected during final commit. "
f"This may occur if the same issue was indexed by multiple connectors. "
f"Rolling back and continuing. Error: {e!s}"
)
await session.rollback()
else:
raise
# Build warning message if there were issues
warning_parts = []
if documents_failed > 0:
warning_parts.append(f"{documents_failed} failed")
warning_message = ", ".join(warning_parts) if warning_parts else None
# Log success
await task_logger.log_task_success(
log_entry,
f"Successfully completed Linear indexing for connector {connector_id}",
{
"issues_processed": total_processed,
"issues_processed": documents_indexed,
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
"documents_failed": documents_failed,
"skipped_issues_count": len(skipped_issues),
},
)
logger.info(
f"Linear indexing completed: {documents_indexed} new issues, {documents_skipped} skipped"
f"Linear indexing completed: {documents_indexed} ready, "
f"{documents_skipped} skipped, {documents_failed} failed"
)
return (
total_processed,
None,
) # Return None as the error message to indicate success
return documents_indexed, warning_message
except SQLAlchemyError as db_error:
await session.rollback()

View file

@ -1,5 +1,9 @@
"""
Luma connector indexer.
Implements 2-phase document status updates for real-time UI feedback:
- Phase 1: Collect all events and create pending documents (visible in UI immediately)
- Phase 2: Process each event: pending → processing → ready/failed
"""
import time
@ -11,7 +15,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.luma_connector import LumaConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
@ -27,6 +31,7 @@ from .base import (
get_connector_by_id,
get_current_timestamp,
logger,
safe_set_chunks,
update_connector_last_indexed,
)
@ -227,21 +232,22 @@ async def index_luma_events(
logger.error(f"Error fetching Luma events: {e!s}", exc_info=True)
return 0, f"Error fetching Luma events: {e!s}"
# =======================================================================
# PHASE 1: Analyze all events, create pending documents
# This makes ALL documents visible in the UI immediately with pending status
# =======================================================================
documents_indexed = 0
documents_skipped = 0
documents_failed = 0
skipped_events = []
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
events_to_process = [] # List of dicts with document and event data
new_documents_created = False
for event in events:
# Check if it's time for a heartbeat update
if (
on_heartbeat_callback
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
):
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
try:
# Luma event structure fields - events have nested 'event' field
event_data = event.get("event", {})
@ -298,91 +304,38 @@ async def index_luma_events(
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
# Ensure status is ready (might have been stuck in processing/pending)
if not DocumentStatus.is_state(
existing_document.status, DocumentStatus.READY
):
existing_document.status = DocumentStatus.ready()
logger.info(
f"Document for Luma event {event_name} unchanged. Skipping."
)
documents_skipped += 1
continue
else:
# Content has changed - update the existing document
logger.info(
f"Content changed for Luma event {event_name}. Updating document."
)
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"event_id": event_id,
"event_name": event_name,
"event_url": event_url,
"start_at": start_at,
"end_at": end_at,
"timezone": timezone,
"location": location or "No location",
"city": city,
"hosts": host_names,
"document_type": "Luma Event",
"connector_type": "Luma",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
event_markdown, user_llm, document_metadata
)
else:
summary_content = f"Luma Event: {event_name}\n\n"
if event_url:
summary_content += f"URL: {event_url}\n"
summary_content += f"Start: {start_at}\n"
summary_content += f"End: {end_at}\n"
if timezone:
summary_content += f"Timezone: {timezone}\n"
if location:
summary_content += f"Location: {location}\n"
if city:
summary_content += f"City: {city}\n"
if host_names:
summary_content += f"Hosts: {host_names}\n"
if description:
desc_preview = description[:1000]
if len(description) > 1000:
desc_preview += "..."
summary_content += f"Description: {desc_preview}\n"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
# Process chunks
chunks = await create_document_chunks(event_markdown)
# Update existing document
existing_document.title = f"Luma Event - {event_name}"
existing_document.content = summary_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = {
# Queue existing document for update (will be set to processing in Phase 2)
events_to_process.append(
{
"document": existing_document,
"is_new": False,
"event_id": event_id,
"event_name": event_name,
"event_url": event_url,
"event_markdown": event_markdown,
"content_hash": content_hash,
"start_at": start_at,
"end_at": end_at,
"timezone": timezone,
"location": location,
"city": city,
"hosts": host_names,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"host_names": host_names,
"description": description,
"cover_url": cover_url,
}
existing_document.chunks = chunks
existing_document.updated_at = get_current_timestamp()
documents_indexed += 1
logger.info(f"Successfully updated Luma event {event_name}")
continue
)
continue
# Document doesn't exist by unique_identifier_hash
# Check if a document with the same content_hash exists (from another connector)
@ -400,62 +353,10 @@ async def index_luma_events(
documents_skipped += 1
continue
# Document doesn't exist - create new one
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"event_id": event_id,
"event_name": event_name,
"event_url": event_url,
"start_at": start_at,
"end_at": end_at,
"timezone": timezone,
"location": location or "No location",
"city": city,
"hosts": host_names,
"document_type": "Luma Event",
"connector_type": "Luma",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
event_markdown, user_llm, document_metadata
)
else:
# Fallback to simple summary if no LLM configured
summary_content = f"Luma Event: {event_name}\n\n"
if event_url:
summary_content += f"URL: {event_url}\n"
summary_content += f"Start: {start_at}\n"
summary_content += f"End: {end_at}\n"
if timezone:
summary_content += f"Timezone: {timezone}\n"
if location:
summary_content += f"Location: {location}\n"
if city:
summary_content += f"City: {city}\n"
if host_names:
summary_content += f"Hosts: {host_names}\n"
if description:
desc_preview = description[:1000]
if len(description) > 1000:
desc_preview += "..."
summary_content += f"Description: {desc_preview}\n"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(event_markdown)
# Create new document with PENDING status (visible in UI immediately)
document = Document(
search_space_id=search_space_id,
title=f"Luma Event - {event_name}",
title=event_name,
document_type=DocumentType.LUMA_CONNECTOR,
document_metadata={
"event_id": event_id,
@ -468,23 +369,151 @@ async def index_luma_events(
"city": city,
"hosts": host_names,
"cover_url": cover_url,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"connector_id": connector_id,
},
content=summary_content,
content_hash=content_hash,
content="Pending...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
embedding=None,
chunks=[], # Empty at creation - safe for async
status=DocumentStatus.pending(), # Pending until processing starts
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
session.add(document)
documents_indexed += 1
logger.info(f"Successfully indexed new event {event_name}")
new_documents_created = True
# Batch commit every 10 documents
events_to_process.append(
{
"document": document,
"is_new": True,
"event_id": event_id,
"event_name": event_name,
"event_url": event_url,
"event_markdown": event_markdown,
"content_hash": content_hash,
"start_at": start_at,
"end_at": end_at,
"timezone": timezone,
"location": location,
"city": city,
"host_names": host_names,
"description": description,
"cover_url": cover_url,
}
)
except Exception as e:
logger.error(f"Error in Phase 1 for event: {e!s}", exc_info=True)
documents_failed += 1
continue
# Commit all pending documents - they all appear in UI now
if new_documents_created:
logger.info(
f"Phase 1: Committing {len([e for e in events_to_process if e['is_new']])} pending documents"
)
await session.commit()
# =======================================================================
# PHASE 2: Process each document one by one
# Each document transitions: pending → processing → ready/failed
# =======================================================================
logger.info(f"Phase 2: Processing {len(events_to_process)} documents")
for item in events_to_process:
# Send heartbeat periodically
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time
document = item["document"]
try:
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
document.status = DocumentStatus.processing()
await session.commit()
# Heavy processing (LLM, embeddings, chunks)
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata_for_summary = {
"event_id": item["event_id"],
"event_name": item["event_name"],
"event_url": item["event_url"],
"start_at": item["start_at"],
"end_at": item["end_at"],
"timezone": item["timezone"],
"location": item["location"] or "No location",
"city": item["city"],
"hosts": item["host_names"],
"document_type": "Luma Event",
"connector_type": "Luma",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
item["event_markdown"], user_llm, document_metadata_for_summary
)
else:
# Fallback to simple summary if no LLM configured
summary_content = f"Luma Event: {item['event_name']}\n\n"
if item["event_url"]:
summary_content += f"URL: {item['event_url']}\n"
summary_content += f"Start: {item['start_at']}\n"
summary_content += f"End: {item['end_at']}\n"
if item["timezone"]:
summary_content += f"Timezone: {item['timezone']}\n"
if item["location"]:
summary_content += f"Location: {item['location']}\n"
if item["city"]:
summary_content += f"City: {item['city']}\n"
if item["host_names"]:
summary_content += f"Hosts: {item['host_names']}\n"
if item["description"]:
desc_preview = item["description"][:1000]
if len(item["description"]) > 1000:
desc_preview += "..."
summary_content += f"Description: {desc_preview}\n"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(item["event_markdown"])
# Update document to READY with actual content
document.title = item["event_name"]
document.content = summary_content
document.content_hash = item["content_hash"]
document.embedding = summary_embedding
document.document_metadata = {
"event_id": item["event_id"],
"event_name": item["event_name"],
"event_url": item["event_url"],
"start_at": item["start_at"],
"end_at": item["end_at"],
"timezone": item["timezone"],
"location": item["location"],
"city": item["city"],
"hosts": item["host_names"],
"cover_url": item["cover_url"],
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"connector_id": connector_id,
}
safe_set_chunks(document, chunks)
document.updated_at = get_current_timestamp()
document.status = DocumentStatus.ready()
documents_indexed += 1
# Batch commit every 10 documents (for ready status updates)
if documents_indexed % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed} Luma events processed so far"
@ -493,38 +522,71 @@ async def index_luma_events(
except Exception as e:
logger.error(
f"Error processing event {event.get('name', 'Unknown')}: {e!s}",
f"Error processing event {item.get('event_name', 'Unknown')}: {e!s}",
exc_info=True,
)
# Mark document as failed with reason (visible in UI)
try:
document.status = DocumentStatus.failed(str(e))
document.updated_at = get_current_timestamp()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
skipped_events.append(
f"{event.get('name', 'Unknown')} (processing error)"
f"{item.get('event_name', 'Unknown')} (processing error)"
)
documents_skipped += 1
documents_failed += 1
continue
total_processed = documents_indexed
if total_processed > 0:
await update_connector_last_indexed(session, connector, update_last_indexed)
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
# This ensures the UI shows "Last indexed" instead of "Never indexed"
await update_connector_last_indexed(session, connector, update_last_indexed)
# Final commit for any remaining documents not yet committed in batches
logger.info(f"Final commit: Total {documents_indexed} Luma events processed")
await session.commit()
try:
await session.commit()
logger.info("Successfully committed all Luma document changes to database")
except Exception as e:
# Handle any remaining integrity errors gracefully (race conditions, etc.)
if (
"duplicate key value violates unique constraint" in str(e).lower()
or "uniqueviolationerror" in str(e).lower()
):
logger.warning(
f"Duplicate content_hash detected during final commit. "
f"This may occur if the same event was indexed by multiple connectors. "
f"Rolling back and continuing. Error: {e!s}"
)
await session.rollback()
# Don't fail the entire task - some documents may have been successfully indexed
else:
raise
# Build warning message if there were issues
warning_parts = []
if documents_failed > 0:
warning_parts.append(f"{documents_failed} failed")
warning_message = ", ".join(warning_parts) if warning_parts else None
await task_logger.log_task_success(
log_entry,
f"Successfully completed Luma indexing for connector {connector_id}",
{
"events_processed": total_processed,
"events_processed": documents_indexed,
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
"documents_failed": documents_failed,
"skipped_events_count": len(skipped_events),
},
)
logger.info(
f"Luma indexing completed: {documents_indexed} new events, {documents_skipped} skipped"
f"Luma indexing completed: {documents_indexed} ready, "
f"{documents_skipped} skipped, {documents_failed} failed"
)
return total_processed, None
return documents_indexed, warning_message
except SQLAlchemyError as db_error:
await session.rollback()

View file

@ -1,5 +1,9 @@
"""
Notion connector indexer.
Implements real-time document status updates using a two-phase approach:
- Phase 1: Create all documents with PENDING status (visible in UI immediately)
- Phase 2: Process each document one by one (pending → processing → ready/failed)
"""
import time
@ -9,8 +13,9 @@ from datetime import datetime
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.notion_history import NotionHistoryConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
@ -28,6 +33,7 @@ from .base import (
get_connector_by_id,
get_current_timestamp,
logger,
safe_set_chunks,
update_connector_last_indexed,
)
@ -245,12 +251,17 @@ async def index_notion_pages(
{"pages_found": 0},
)
logger.info("No Notion pages found to index")
# CRITICAL: Update timestamp even when no pages found so Electric SQL syncs
await update_connector_last_indexed(session, connector, update_last_indexed)
await session.commit()
await notion_client.close()
return 0, None # Success with 0 pages, not an error
# Track the number of documents indexed
documents_indexed = 0
documents_skipped = 0
documents_failed = 0
duplicate_content_count = 0
skipped_pages = []
# Heartbeat tracking - update notification periodically to prevent appearing stuck
@ -262,22 +273,69 @@ async def index_notion_pages(
{"stage": "process_pages", "total_pages": len(pages)},
)
# Process each page
for page in pages:
# Check if it's time for a heartbeat update
if (
on_heartbeat_callback
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
):
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
# =======================================================================
# PHASE 1: Analyze all pages, create pending documents
# This makes ALL documents visible in the UI immediately with pending status
# =======================================================================
pages_to_process = [] # List of dicts with document and page data
new_documents_created = False
# Helper function to convert page content to markdown
def process_blocks(blocks, level=0):
    """Render a list of Notion block dicts as markdown text.

    Each block is read with ``.get``: ``type`` selects the markdown form,
    ``content`` is the block's text, and ``children`` is a list of nested
    blocks that are rendered recursively one level deeper.

    Args:
        blocks: Iterable of block dicts.
        level: Nesting depth; every line of a block is prefixed with
            ``level`` indent units.

    Returns:
        The concatenated markdown for all blocks and their descendants
        (an empty string when ``blocks`` is empty).
    """
    # One markdown template per known block type. Types not listed here fall
    # back to a plain paragraph, and only when they actually carry content.
    templates = {
        "paragraph": "{indent}{content}\n\n",
        "text": "{indent}{content}\n\n",
        "heading_1": "{indent}# {content}\n\n",
        "header": "{indent}# {content}\n\n",
        "heading_2": "{indent}## {content}\n\n",
        "heading_3": "{indent}### {content}\n\n",
        "bulleted_list_item": "{indent}* {content}\n",
        "numbered_list_item": "{indent}1. {content}\n",
        "to_do": "{indent}- [ ] {content}\n",
        "toggle": "{indent}> {content}\n",
        "code": "{indent}```\n{content}\n```\n\n",
        "quote": "{indent}> {content}\n\n",
        "callout": "{indent}> **Note:** {content}\n\n",
        "image": "{indent}![Image]({content})\n\n",
    }

    # The indent depends only on the nesting level, so compute it once.
    indent = " " * level

    # Collect fragments and join once instead of growing a string with +=.
    parts = []
    for block in blocks:
        block_type = block.get("type")
        # Treat a missing or None content as empty text rather than
        # rendering the literal string "None" into the document.
        block_content = block.get("content") or ""
        children = block.get("children", [])

        template = templates.get(block_type)
        if template is not None:
            parts.append(template.format(indent=indent, content=block_content))
        elif block_content:
            # Default for other block types
            parts.append(f"{indent}{block_content}\n\n")

        # Process children recursively
        if children:
            parts.append(process_blocks(children, level + 1))

    return "".join(parts)
for page in pages:
try:
page_id = page.get("page_id")
page_title = page.get("title", f"Untitled page ({page_id})")
page_content = page.get("content", [])
logger.info(f"Processing Notion page: {page_title} ({page_id})")
if not page_id:
documents_skipped += 1
continue
if not page_content:
logger.info(f"No content found in page {page_title}. Skipping.")
@ -287,57 +345,6 @@ async def index_notion_pages(
# Convert page content to markdown format
markdown_content = f"# Notion Page: {page_title}\n\n"
# Process blocks recursively
def process_blocks(blocks, level=0):
    """Render a tree of Notion block dicts as Markdown text.

    Args:
        blocks: Iterable of block dicts. Each dict may carry a "type", a
            "content" value and a "children" list of nested block dicts.
            ``None`` is treated as an empty list.
        level: Nesting depth of ``blocks``; every rendered line is prefixed
            with one indent unit per level.

    Returns:
        The Markdown for all blocks followed by their descendants, in order.
    """
    # (prefix, suffix) placed around the content for each known block type.
    formats = {
        "paragraph": ("", "\n\n"),
        "text": ("", "\n\n"),
        "heading_1": ("# ", "\n\n"),
        "header": ("# ", "\n\n"),
        "heading_2": ("## ", "\n\n"),
        "heading_3": ("### ", "\n\n"),
        "bulleted_list_item": ("* ", "\n"),
        "numbered_list_item": ("1. ", "\n"),
        "to_do": ("- [ ] ", "\n"),
        "toggle": ("> ", "\n"),
        "code": ("```\n", "\n```\n\n"),
        "quote": ("> ", "\n\n"),
        "callout": ("> **Note:** ", "\n\n"),
        "image": ("![Image](", ")\n\n"),
    }
    # Add indentation based on level
    indent = " " * level
    parts = []
    for block in blocks or ():
        block_type = block.get("type")
        block_content = block.get("content", "")
        # A key that is present but null would otherwise be rendered as the
        # literal text "None" inside the Markdown.
        if block_content is None:
            block_content = ""
        children = block.get("children") or []
        if block_type in formats:
            prefix, suffix = formats[block_type]
            parts.append(f"{indent}{prefix}{block_content}{suffix}")
        elif block_content:
            # Default for other block types: emit the content as a paragraph.
            parts.append(f"{indent}{block_content}\n\n")
        # Process children recursively, one level deeper.
        if children:
            parts.append(process_blocks(children, level + 1))
    # Join once instead of growing a string with += on every block.
    return "".join(parts)
logger.debug(
f"Converting {len(page_content)} blocks to markdown for page {page_title}"
)
markdown_content += process_blocks(page_content)
# Format document metadata
@ -377,71 +384,26 @@ async def index_notion_pages(
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
logger.info(
f"Document for Notion page {page_title} unchanged. Skipping."
)
# Ensure status is ready (might have been stuck in processing/pending)
if not DocumentStatus.is_state(
existing_document.status, DocumentStatus.READY
):
existing_document.status = DocumentStatus.ready()
documents_skipped += 1
continue
else:
# Content has changed - update the existing document
logger.info(
f"Content changed for Notion page {page_title}. Updating document."
)
# Get user's long context LLM
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if not user_llm:
logger.error(
f"No long context LLM configured for user {user_id}"
)
skipped_pages.append(f"{page_title} (no LLM configured)")
documents_skipped += 1
continue
# Generate summary with metadata
document_metadata = {
"page_title": page_title,
# Queue existing document for update (will be set to processing in Phase 2)
pages_to_process.append(
{
"document": existing_document,
"is_new": False,
"markdown_content": markdown_content,
"content_hash": content_hash,
"page_id": page_id,
"document_type": "Notion Page",
"connector_type": "Notion",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
markdown_content, user_llm, document_metadata
)
# Process chunks
chunks = await create_document_chunks(markdown_content)
# Update existing document
existing_document.title = f"Notion - {page_title}"
existing_document.content = summary_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = {
"page_title": page_title,
"page_id": page_id,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
}
existing_document.chunks = chunks
existing_document.updated_at = get_current_timestamp()
existing_document.connector_id = connector_id
documents_indexed += 1
logger.info(f"Successfully updated Notion page: {page_title}")
# Batch commit every 10 documents
if documents_indexed % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed} documents processed so far"
)
await session.commit()
continue
)
continue
# Document doesn't exist by unique_identifier_hash
# Check if a document with the same content_hash exists (from another connector)
@ -456,91 +418,182 @@ async def index_notion_pages(
f"(existing document ID: {duplicate_by_content.id}, "
f"type: {duplicate_by_content.document_type}). Skipping."
)
duplicate_content_count += 1
documents_skipped += 1
continue
# Document doesn't exist - create new one
# Get user's long context LLM
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if not user_llm:
logger.error(f"No long context LLM configured for user {user_id}")
skipped_pages.append(f"{page_title} (no LLM configured)")
documents_skipped += 1
continue
# Generate summary with metadata
logger.debug(f"Generating summary for page {page_title}")
document_metadata = {
"page_title": page_title,
"page_id": page_id,
"document_type": "Notion Page",
"connector_type": "Notion",
}
summary_content, summary_embedding = await generate_document_summary(
markdown_content, user_llm, document_metadata
)
# Process chunks
logger.debug(f"Chunking content for page {page_title}")
chunks = await create_document_chunks(markdown_content)
# Create and store new document
# Create new document with PENDING status (visible in UI immediately)
document = Document(
search_space_id=search_space_id,
title=f"Notion - {page_title}",
title=page_title,
document_type=DocumentType.NOTION_CONNECTOR,
document_metadata={
"page_title": page_title,
"page_id": page_id,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"connector_id": connector_id,
},
content=summary_content,
content_hash=content_hash,
content="Pending...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
embedding=None,
chunks=[], # Empty at creation - safe for async
status=DocumentStatus.pending(), # Pending until processing starts
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
session.add(document)
documents_indexed += 1
logger.info(f"Successfully indexed new Notion page: {page_title}")
new_documents_created = True
# Batch commit every 10 documents
pages_to_process.append(
{
"document": document,
"is_new": True,
"markdown_content": markdown_content,
"content_hash": content_hash,
"page_id": page_id,
"page_title": page_title,
}
)
except Exception as e:
logger.error(f"Error in Phase 1 for page: {e!s}", exc_info=True)
documents_failed += 1
continue
# Commit all pending documents - they all appear in UI now
if new_documents_created:
logger.info(
f"Phase 1: Committing {len([p for p in pages_to_process if p['is_new']])} pending documents"
)
await session.commit()
# =======================================================================
# PHASE 2: Process each document one by one
# Each document transitions: pending → processing → ready/failed
# =======================================================================
logger.info(f"Phase 2: Processing {len(pages_to_process)} documents")
for item in pages_to_process:
# Send heartbeat periodically
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time
document = item["document"]
try:
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
document.status = DocumentStatus.processing()
await session.commit()
# Heavy processing (LLM, embeddings, chunks)
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata_for_summary = {
"page_title": item["page_title"],
"page_id": item["page_id"],
"document_type": "Notion Page",
"connector_type": "Notion",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
item["markdown_content"],
user_llm,
document_metadata_for_summary,
)
else:
# Fallback to simple summary if no LLM configured
summary_content = f"Notion Page: {item['page_title']}\n\n{item['markdown_content'][:500]}..."
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(item["markdown_content"])
# Update document to READY with actual content
document.title = item["page_title"]
document.content = summary_content
document.content_hash = item["content_hash"]
document.embedding = summary_embedding
document.document_metadata = {
"page_title": item["page_title"],
"page_id": item["page_id"],
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"connector_id": connector_id,
}
safe_set_chunks(document, chunks)
document.updated_at = get_current_timestamp()
document.status = DocumentStatus.ready()
documents_indexed += 1
# Batch commit every 10 documents (for ready status updates)
if documents_indexed % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed} documents processed so far"
f"Committing batch: {documents_indexed} Notion pages processed so far"
)
await session.commit()
except Exception as e:
logger.error(
f"Error processing Notion page {page.get('title', 'Unknown')}: {e!s}",
exc_info=True,
)
skipped_pages.append(
f"{page.get('title', 'Unknown')} (processing error)"
)
documents_skipped += 1
continue # Skip this page and continue with others
logger.error(f"Error processing Notion page: {e!s}", exc_info=True)
# Mark document as failed with reason (visible in UI)
try:
document.status = DocumentStatus.failed(str(e))
document.updated_at = get_current_timestamp()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
skipped_pages.append(f"{item['page_title']} (processing error)")
documents_failed += 1
continue
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
await update_connector_last_indexed(session, connector, update_last_indexed)
# Update the last_indexed_at timestamp for the connector only if requested
# and if we successfully indexed at least one page
total_processed = documents_indexed
if total_processed > 0:
await update_connector_last_indexed(session, connector, update_last_indexed)
# Final commit for any remaining documents not yet committed in batches
# Final commit to ensure all documents are persisted (safety net)
logger.info(f"Final commit: Total {documents_indexed} documents processed")
await session.commit()
try:
await session.commit()
logger.info(
"Successfully committed all Notion document changes to database"
)
except Exception as e:
# Handle any remaining integrity errors gracefully (race conditions, etc.)
if (
"duplicate key value violates unique constraint" in str(e).lower()
or "uniqueviolationerror" in str(e).lower()
):
logger.warning(
f"Duplicate content_hash detected during final commit. "
f"This may occur if the same page was indexed by multiple connectors. "
f"Rolling back and continuing. Error: {e!s}"
)
await session.rollback()
# Don't fail the entire task - some documents may have been successfully indexed
else:
raise
# Get final count of pages with skipped Notion AI content
pages_with_skipped_ai_content = notion_client.get_skipped_content_count()
# Build warning message if there were issues
warning_parts = []
if duplicate_content_count > 0:
warning_parts.append(f"{duplicate_content_count} duplicate")
if documents_failed > 0:
warning_parts.append(f"{documents_failed} failed")
warning_message = ", ".join(warning_parts) if warning_parts else None
# Prepare result message with user-friendly notification about skipped content
result_message = None
if skipped_pages:
@ -563,6 +616,8 @@ async def index_notion_pages(
"pages_processed": total_processed,
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
"documents_failed": documents_failed,
"duplicate_content_count": duplicate_content_count,
"skipped_pages_count": len(skipped_pages),
"pages_with_skipped_ai_content": pages_with_skipped_ai_content,
"result_message": result_message,
@ -570,7 +625,9 @@ async def index_notion_pages(
)
logger.info(
f"Notion indexing completed: {documents_indexed} new pages, {documents_skipped} skipped"
f"Notion indexing completed: {documents_indexed} ready, "
f"{documents_skipped} skipped, {documents_failed} failed "
f"({duplicate_content_count} duplicate content)"
)
# Clean up the async client
@ -590,6 +647,10 @@ async def index_notion_pages(
"Using legacy token. Reconnect with OAuth for better reliability."
)
# Include warning message if there were issues
if warning_message:
notification_parts.append(warning_message)
user_notification_message = (
" ".join(notification_parts) if notification_parts else None
)

View file

@ -3,6 +3,10 @@ Obsidian connector indexer.
Indexes markdown notes from a local Obsidian vault.
This connector is only available in self-hosted mode.
Implements 2-phase document status updates for real-time UI feedback:
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
- Phase 2: Process each document: pending → processing → ready/failed
"""
import os
@ -17,7 +21,7 @@ from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
@ -34,6 +38,7 @@ from .base import (
get_connector_by_id,
get_current_timestamp,
logger,
safe_set_chunks,
update_connector_last_indexed,
)
@ -307,25 +312,22 @@ async def index_obsidian_vault(
logger.info(f"Processing {len(files)} files after date filtering")
# Get LLM for summarization
long_context_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
indexed_count = 0
skipped_count = 0
failed_count = 0
duplicate_content_count = 0
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
# =======================================================================
# PHASE 1: Analyze all files, create pending documents
# This makes ALL documents visible in the UI immediately with pending status
# =======================================================================
files_to_process = [] # List of dicts with document and file data
new_documents_created = False
for file_info in files:
# Check if it's time for a heartbeat update
if (
on_heartbeat_callback
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
):
await on_heartbeat_callback(indexed_count)
last_heartbeat_time = time.time()
try:
file_path = file_info["path"]
relative_path = file_info["relative_path"]
@ -368,13 +370,151 @@ async def index_obsidian_vault(
search_space_id,
)
# Generate content hash
content_hash = generate_content_hash(content, search_space_id)
# Check for existing document
existing_document = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
# Generate content hash
content_hash = generate_content_hash(content, search_space_id)
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
# Ensure status is ready (might have been stuck in processing/pending)
if not DocumentStatus.is_state(
existing_document.status, DocumentStatus.READY
):
existing_document.status = DocumentStatus.ready()
logger.debug(f"Note {title} unchanged, skipping")
skipped_count += 1
continue
# Queue existing document for update (will be set to processing in Phase 2)
files_to_process.append(
{
"document": existing_document,
"is_new": False,
"file_info": file_info,
"content": content,
"body_content": body_content,
"frontmatter": frontmatter,
"wiki_links": wiki_links,
"tags": tags,
"title": title,
"relative_path": relative_path,
"content_hash": content_hash,
"unique_identifier_hash": unique_identifier_hash,
}
)
continue
# Document doesn't exist by unique_identifier_hash
# Check if a document with the same content_hash exists (from another connector)
with session.no_autoflush:
duplicate_by_content = await check_duplicate_document_by_hash(
session, content_hash
)
if duplicate_by_content:
logger.info(
f"Obsidian note {title} already indexed by another connector "
f"(existing document ID: {duplicate_by_content.id}, "
f"type: {duplicate_by_content.document_type}). Skipping."
)
duplicate_content_count += 1
skipped_count += 1
continue
# Create new document with PENDING status (visible in UI immediately)
document = Document(
search_space_id=search_space_id,
title=title,
document_type=DocumentType.OBSIDIAN_CONNECTOR,
document_metadata={
"vault_name": vault_name,
"file_path": relative_path,
"connector_id": connector_id,
},
content="Pending...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=None,
chunks=[], # Empty at creation - safe for async
status=DocumentStatus.pending(), # Pending until processing starts
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
session.add(document)
new_documents_created = True
files_to_process.append(
{
"document": document,
"is_new": True,
"file_info": file_info,
"content": content,
"body_content": body_content,
"frontmatter": frontmatter,
"wiki_links": wiki_links,
"tags": tags,
"title": title,
"relative_path": relative_path,
"content_hash": content_hash,
"unique_identifier_hash": unique_identifier_hash,
}
)
except Exception as e:
logger.exception(
f"Error in Phase 1 for file {file_info.get('path', 'unknown')}: {e}"
)
failed_count += 1
continue
# Commit all pending documents - they all appear in UI now
if new_documents_created:
logger.info(
f"Phase 1: Committing {len([f for f in files_to_process if f['is_new']])} pending documents"
)
await session.commit()
# =======================================================================
# PHASE 2: Process each document one by one
# Each document transitions: pending → processing → ready/failed
# =======================================================================
logger.info(f"Phase 2: Processing {len(files_to_process)} documents")
# Get LLM for summarization
long_context_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
for item in files_to_process:
# Send heartbeat periodically
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(indexed_count)
last_heartbeat_time = current_time
document = item["document"]
try:
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
document.status = DocumentStatus.processing()
await session.commit()
# Extract data from item
title = item["title"]
relative_path = item["relative_path"]
content = item["content"]
body_content = item["body_content"]
frontmatter = item["frontmatter"]
wiki_links = item["wiki_links"]
tags = item["tags"]
content_hash = item["content_hash"]
file_info = item["file_info"]
# Build metadata
document_metadata = {
@ -404,134 +544,114 @@ async def index_obsidian_vault(
]
document_string = build_document_metadata_string(metadata_sections)
if existing_document:
# Check if content has changed
if existing_document.content_hash == content_hash:
logger.debug(f"Note {title} unchanged, skipping")
skipped_count += 1
continue
# Update existing document
logger.info(f"Updating note: {title}")
# Generate new summary if content changed
if long_context_llm:
new_summary, _ = await generate_document_summary(
document_string,
long_context_llm,
document_metadata,
)
# Store summary in metadata
document_metadata["summary"] = new_summary
# Add URL and connector_id to metadata
document_metadata["url"] = (
f"obsidian://{vault_name}/{relative_path}"
)
document_metadata["connector_id"] = connector_id
existing_document.content = document_string
existing_document.content_hash = content_hash
existing_document.document_metadata = document_metadata
existing_document.updated_at = get_current_timestamp()
# Update embedding
embedding = config.embedding_model_instance.embed(document_string)
existing_document.embedding = embedding
# Update chunks - delete old and create new
existing_document.chunks.clear()
new_chunks = await create_document_chunks(document_string)
existing_document.chunks = new_chunks
indexed_count += 1
else:
# Document doesn't exist by unique_identifier_hash
# Check if a document with the same content_hash exists (from another connector)
with session.no_autoflush:
duplicate_by_content = await check_duplicate_document_by_hash(
session, content_hash
)
if duplicate_by_content:
logger.info(
f"Obsidian note {title} already indexed by another connector "
f"(existing document ID: {duplicate_by_content.id}, "
f"type: {duplicate_by_content.document_type}). Skipping."
)
skipped_count += 1
continue
# Create new document
logger.info(f"Indexing new note: {title}")
# Generate summary
summary_content = ""
if long_context_llm:
summary_content, _ = await generate_document_summary(
document_string,
long_context_llm,
document_metadata,
)
# Generate embedding
embedding = config.embedding_model_instance.embed(document_string)
# Add URL and summary to metadata
document_metadata["url"] = (
f"obsidian://{vault_name}/{relative_path}"
)
document_metadata["summary"] = summary_content
document_metadata["connector_id"] = connector_id
# Create chunks
chunks = await create_document_chunks(document_string)
# Create document
new_document = Document(
search_space_id=search_space_id,
title=title,
document_type=DocumentType.OBSIDIAN_CONNECTOR,
content=document_string,
content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash,
document_metadata=document_metadata,
embedding=embedding,
chunks=chunks,
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
# Generate summary
summary_content = ""
if long_context_llm:
summary_content, _ = await generate_document_summary(
document_string,
long_context_llm,
document_metadata,
)
session.add(new_document)
# Generate embedding
embedding = config.embedding_model_instance.embed(document_string)
indexed_count += 1
# Add URL and summary to metadata
document_metadata["url"] = f"obsidian://{vault_name}/{relative_path}"
document_metadata["summary"] = summary_content
document_metadata["connector_id"] = connector_id
# Create chunks
chunks = await create_document_chunks(document_string)
# Update document to READY with actual content
document.title = title
document.content = document_string
document.content_hash = content_hash
document.embedding = embedding
document.document_metadata = document_metadata
safe_set_chunks(document, chunks)
document.updated_at = get_current_timestamp()
document.status = DocumentStatus.ready()
indexed_count += 1
# Batch commit every 10 documents (for ready status updates)
if indexed_count % 10 == 0:
logger.info(
f"Committing batch: {indexed_count} Obsidian notes processed so far"
)
await session.commit()
except Exception as e:
logger.exception(
f"Error processing file {file_info.get('path', 'unknown')}: {e}"
f"Error processing file {item.get('file_info', {}).get('path', 'unknown')}: {e}"
)
skipped_count += 1
# Mark document as failed with reason (visible in UI)
try:
document.status = DocumentStatus.failed(str(e))
document.updated_at = get_current_timestamp()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
failed_count += 1
continue
# Update connector's last indexed timestamp
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
await update_connector_last_indexed(session, connector, update_last_indexed)
# Commit all changes
await session.commit()
# Final commit for any remaining documents not yet committed in batches
logger.info(f"Final commit: Total {indexed_count} Obsidian notes processed")
try:
await session.commit()
logger.info(
"Successfully committed all Obsidian document changes to database"
)
except Exception as e:
# Handle any remaining integrity errors gracefully (race conditions, etc.)
if (
"duplicate key value violates unique constraint" in str(e).lower()
or "uniqueviolationerror" in str(e).lower()
):
logger.warning(
f"Duplicate content_hash detected during final commit. "
f"This may occur if the same note was indexed by multiple connectors. "
f"Rolling back and continuing. Error: {e!s}"
)
await session.rollback()
# Don't fail the entire task - some documents may have been successfully indexed
else:
raise
# Build warning message if there were issues
warning_parts = []
if duplicate_content_count > 0:
warning_parts.append(f"{duplicate_content_count} duplicate")
if failed_count > 0:
warning_parts.append(f"{failed_count} failed")
warning_message = ", ".join(warning_parts) if warning_parts else None
total_processed = indexed_count
await task_logger.log_task_success(
log_entry,
f"Successfully indexed {indexed_count} Obsidian notes (skipped {skipped_count})",
f"Successfully completed Obsidian vault indexing for connector {connector_id}",
{
"indexed_count": indexed_count,
"skipped_count": skipped_count,
"total_files": len(files),
"notes_processed": total_processed,
"documents_indexed": indexed_count,
"documents_skipped": skipped_count,
"documents_failed": failed_count,
"duplicate_content_count": duplicate_content_count,
},
)
return indexed_count, None
logger.info(
f"Obsidian vault indexing completed: {indexed_count} ready, "
f"{skipped_count} skipped, {failed_count} failed "
f"({duplicate_content_count} duplicate content)"
)
return total_processed, warning_message
except SQLAlchemyError as e:
logger.exception(f"Database error during Obsidian indexing: {e}")

View file

@ -1,5 +1,9 @@
"""
Slack connector indexer.
Implements 2-phase document status updates for real-time UI feedback:
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
- Phase 2: Process each document: pending → processing → ready/failed
"""
import time
@ -12,7 +16,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.slack_history import SlackHistory
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
create_document_chunks,
@ -28,6 +32,7 @@ from .base import (
get_connector_by_id,
get_current_timestamp,
logger,
safe_set_chunks,
update_connector_last_indexed,
)
@ -168,11 +173,15 @@ async def index_slack_messages(
f"No Slack channels found for connector {connector_id}",
{"channels_found": 0},
)
return 0, "No Slack channels found"
# CRITICAL: Update timestamp even when no channels found so Electric SQL syncs
await update_connector_last_indexed(session, connector, update_last_indexed)
await session.commit()
return 0, None # Return None (not error) when no channels found
# Track the number of documents indexed
documents_indexed = 0
documents_skipped = 0
documents_failed = 0 # Track messages that failed processing
skipped_channels = []
# Heartbeat tracking - update notification periodically to prevent appearing stuck
@ -184,15 +193,14 @@ async def index_slack_messages(
{"stage": "process_channels", "total_channels": len(channels)},
)
# Process each channel
# =======================================================================
# PHASE 1: Collect all messages from all channels, create pending documents
# This makes ALL documents visible in the UI immediately with pending status
# =======================================================================
messages_to_process = [] # List of dicts with document and message data
new_documents_created = False
for channel_obj in channels:
# Check if it's time for a heartbeat update
if (
on_heartbeat_callback
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
):
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
channel_id = channel_obj["id"]
channel_name = channel_obj["name"]
is_private = channel_obj["is_private"]
@ -305,47 +313,33 @@ async def index_slack_messages(
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
# Ensure status is ready (might have been stuck in processing/pending)
if not DocumentStatus.is_state(
existing_document.status, DocumentStatus.READY
):
existing_document.status = DocumentStatus.ready()
logger.info(
f"Document for Slack message {msg_ts} in channel {channel_name} unchanged. Skipping."
)
documents_skipped += 1
continue
else:
# Content has changed - update the existing document
logger.info(
f"Content changed for Slack message {msg_ts} in channel {channel_name}. Updating document."
)
# Update chunks and embedding
chunks = await create_document_chunks(
combined_document_string
)
doc_embedding = config.embedding_model_instance.embed(
combined_document_string
)
# Update existing document
existing_document.content = combined_document_string
existing_document.content_hash = content_hash
existing_document.embedding = doc_embedding
existing_document.document_metadata = {
# Queue existing document for update (will be set to processing in Phase 2)
messages_to_process.append(
{
"document": existing_document,
"is_new": False,
"combined_document_string": combined_document_string,
"content_hash": content_hash,
"channel_name": channel_name,
"channel_id": channel_id,
"msg_ts": msg_ts,
"start_date": start_date_str,
"end_date": end_date_str,
"message_count": len(formatted_messages),
"indexed_at": datetime.now().strftime(
"%Y-%m-%d %H:%M:%S"
),
}
# Delete old chunks and add new ones
existing_document.chunks = chunks
existing_document.updated_at = get_current_timestamp()
documents_indexed += 1
logger.info(f"Successfully updated Slack message {msg_ts}")
continue
)
continue
# Document doesn't exist by unique_identifier_hash
# Check if a document with the same content_hash exists (from another connector)
@ -363,48 +357,47 @@ async def index_slack_messages(
documents_skipped += 1
continue
# Document doesn't exist - create new one
# Process chunks
chunks = await create_document_chunks(combined_document_string)
doc_embedding = config.embedding_model_instance.embed(
combined_document_string
)
# Create and store new document
# Create new document with PENDING status (visible in UI immediately)
document = Document(
search_space_id=search_space_id,
title=f"Slack - {channel_name}",
title=channel_name,
document_type=DocumentType.SLACK_CONNECTOR,
document_metadata={
"channel_name": channel_name,
"channel_id": channel_id,
"start_date": start_date_str,
"end_date": end_date_str,
"message_count": len(formatted_messages),
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"msg_ts": msg_ts,
"connector_id": connector_id,
},
content=combined_document_string,
embedding=doc_embedding,
chunks=chunks,
content_hash=content_hash,
content="Pending...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=None,
chunks=[], # Empty at creation - safe for async
status=DocumentStatus.pending(), # Pending until processing starts
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
session.add(document)
documents_indexed += 1
new_documents_created = True
# Batch commit every 10 documents
if documents_indexed % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed} Slack channels processed so far"
)
await session.commit()
messages_to_process.append(
{
"document": document,
"is_new": True,
"combined_document_string": combined_document_string,
"content_hash": content_hash,
"channel_name": channel_name,
"channel_id": channel_id,
"msg_ts": msg_ts,
"start_date": start_date_str,
"end_date": end_date_str,
"message_count": len(formatted_messages),
}
)
logger.info(
f"Successfully indexed new channel {channel_name} with {len(formatted_messages)} messages"
f"Phase 1: Collected {len(formatted_messages)} messages from channel {channel_name}"
)
except SlackApiError as slack_error:
@ -420,43 +413,129 @@ async def index_slack_messages(
documents_skipped += 1
continue # Skip this channel and continue with others
# Update the last_indexed_at timestamp for the connector only if requested
# and if we successfully indexed at least one channel
total_processed = documents_indexed
if total_processed > 0:
await update_connector_last_indexed(session, connector, update_last_indexed)
# Commit all pending documents - they all appear in UI now
if new_documents_created:
logger.info(
f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents"
)
await session.commit()
# =======================================================================
# PHASE 2: Process each document one by one
# Each document transitions: pending → processing → ready/failed
# =======================================================================
logger.info(f"Phase 2: Processing {len(messages_to_process)} documents")
for item in messages_to_process:
# Send heartbeat periodically
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time
document = item["document"]
try:
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
document.status = DocumentStatus.processing()
await session.commit()
# Heavy processing (embeddings, chunks)
chunks = await create_document_chunks(item["combined_document_string"])
doc_embedding = config.embedding_model_instance.embed(
item["combined_document_string"]
)
# Update document to READY with actual content
document.title = item["channel_name"]
document.content = item["combined_document_string"]
document.content_hash = item["content_hash"]
document.embedding = doc_embedding
document.document_metadata = {
"channel_name": item["channel_name"],
"channel_id": item["channel_id"],
"start_date": item["start_date"],
"end_date": item["end_date"],
"message_count": item["message_count"],
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"connector_id": connector_id,
}
safe_set_chunks(document, chunks)
document.updated_at = get_current_timestamp()
document.status = DocumentStatus.ready()
documents_indexed += 1
# Batch commit every 10 documents (for ready status updates)
if documents_indexed % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed} Slack messages processed so far"
)
await session.commit()
except Exception as e:
logger.error(
f"Error processing Slack message {item.get('msg_ts', 'Unknown')}: {e!s}",
exc_info=True,
)
# Mark document as failed with reason (visible in UI)
try:
document.status = DocumentStatus.failed(str(e))
document.updated_at = get_current_timestamp()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
documents_failed += 1
continue
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
await update_connector_last_indexed(session, connector, update_last_indexed)
# Final commit for any remaining documents not yet committed in batches
logger.info(f"Final commit: Total {documents_indexed} Slack channels processed")
await session.commit()
logger.info(f"Final commit: Total {documents_indexed} Slack messages processed")
try:
await session.commit()
logger.info("Successfully committed all Slack document changes to database")
except Exception as e:
# Handle any remaining integrity errors gracefully (race conditions, etc.)
if (
"duplicate key value violates unique constraint" in str(e).lower()
or "uniqueviolationerror" in str(e).lower()
):
logger.warning(
f"Duplicate content_hash detected during final commit. "
f"This may occur if the same message was indexed by multiple connectors. "
f"Rolling back and continuing. Error: {e!s}"
)
await session.rollback()
else:
raise
# Prepare result message
result_message = None
if skipped_channels:
result_message = f"Processed {total_processed} channels. Skipped {len(skipped_channels)} channels: {', '.join(skipped_channels)}"
else:
result_message = f"Processed {total_processed} channels."
# Build warning message if there were issues
warning_parts = []
if documents_failed > 0:
warning_parts.append(f"{documents_failed} failed")
warning_message = ", ".join(warning_parts) if warning_parts else None
# Log success
await task_logger.log_task_success(
log_entry,
f"Successfully completed Slack indexing for connector {connector_id}",
{
"channels_processed": total_processed,
"channels_processed": len(channels),
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
"documents_failed": documents_failed,
"skipped_channels_count": len(skipped_channels),
"result_message": result_message,
},
)
logger.info(
f"Slack indexing completed: {documents_indexed} new channels, {documents_skipped} skipped"
f"Slack indexing completed: {documents_indexed} ready, "
f"{documents_skipped} skipped, {documents_failed} failed"
)
return (
total_processed,
None,
) # Return None on success (result_message is for logging only)
return documents_indexed, warning_message
except SQLAlchemyError as db_error:
await session.rollback()

View file

@ -1,17 +1,21 @@
"""
Microsoft Teams connector indexer.
Implements 2-phase document status updates for real-time UI feedback:
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
- Phase 2: Process each document: pending → processing → ready/failed
"""
import time
from collections.abc import Awaitable, Callable
from datetime import UTC
from datetime import UTC, datetime
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.teams_history import TeamsHistory
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
create_document_chunks,
@ -27,6 +31,7 @@ from .base import (
get_connector_by_id,
get_current_timestamp,
logger,
safe_set_chunks,
update_connector_last_indexed,
)
@ -50,6 +55,10 @@ async def index_teams_messages(
"""
Index Microsoft Teams messages from all accessible teams and channels.
Implements 2-phase document status updates for real-time UI feedback:
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
- Phase 2: Process each document: pending → processing → ready/failed
Args:
session: Database session
connector_id: ID of the Teams connector
@ -165,11 +174,16 @@ async def index_teams_messages(
f"No Teams found for connector {connector_id}",
{"teams_found": 0},
)
return 0, "No Teams found"
# CRITICAL: Update timestamp even when no teams found so Electric SQL syncs
await update_connector_last_indexed(session, connector, update_last_indexed)
await session.commit()
return 0, None # Return None (not error) when no items found
# Track the number of documents indexed
documents_indexed = 0
documents_skipped = 0
documents_failed = 0
duplicate_content_count = 0
skipped_channels = []
# Heartbeat tracking - update notification periodically to prevent appearing stuck
@ -182,8 +196,6 @@ async def index_teams_messages(
)
# Convert date strings to datetime objects for filtering
from datetime import datetime
start_datetime = None
end_datetime = None
if start_date_str:
@ -197,16 +209,14 @@ async def index_teams_messages(
hour=23, minute=59, second=59, tzinfo=UTC
)
# Process each team
for team in teams:
# Check if it's time for a heartbeat update
if (
on_heartbeat_callback
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
):
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
# =======================================================================
# PHASE 1: Collect all messages and create pending documents
# This makes ALL documents visible in the UI immediately with pending status
# =======================================================================
messages_to_process = [] # List of dicts with document and message data
new_documents_created = False
for team in teams:
team_id = team.get("id")
team_name = team.get("displayName", "Unknown Team")
@ -239,7 +249,6 @@ async def index_teams_messages(
channel_name,
team_name,
)
documents_skipped += 1
continue
# Process each message
@ -322,60 +331,33 @@ async def index_teams_messages(
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
logger.info(
"Document for Teams message %s in channel %s unchanged. Skipping.",
message_id,
channel_name,
)
# Ensure status is ready (might have been stuck in processing/pending)
if not DocumentStatus.is_state(
existing_document.status, DocumentStatus.READY
):
existing_document.status = (
DocumentStatus.ready()
)
documents_skipped += 1
continue
else:
# Content has changed - update the existing document
logger.info(
"Content changed for Teams message %s in channel %s. Updating document.",
message_id,
channel_name,
)
# Update chunks and embedding
chunks = await create_document_chunks(
combined_document_string
)
doc_embedding = (
config.embedding_model_instance.embed(
combined_document_string
)
)
# Update existing document
existing_document.content = combined_document_string
existing_document.content_hash = content_hash
existing_document.embedding = doc_embedding
existing_document.document_metadata = {
# Queue existing document for update (will be set to processing in Phase 2)
messages_to_process.append(
{
"document": existing_document,
"is_new": False,
"combined_document_string": combined_document_string,
"content_hash": content_hash,
"team_name": team_name,
"team_id": team_id,
"channel_name": channel_name,
"channel_id": channel_id,
"message_id": message_id,
"start_date": start_date_str,
"end_date": end_date_str,
"message_count": len(messages),
"indexed_at": datetime.now().strftime(
"%Y-%m-%d %H:%M:%S"
),
}
# Delete old chunks and add new ones
existing_document.chunks = chunks
existing_document.updated_at = (
get_current_timestamp()
)
documents_indexed += 1
logger.info(
"Successfully updated Teams message %s",
message_id,
)
continue
)
continue
# Document doesn't exist by unique_identifier_hash
# Check if a document with the same content_hash exists (from another connector)
@ -395,62 +377,50 @@ async def index_teams_messages(
duplicate_by_content.id,
duplicate_by_content.document_type,
)
duplicate_content_count += 1
documents_skipped += 1
continue
# Document doesn't exist - create new one
# Process chunks
chunks = await create_document_chunks(
combined_document_string
)
doc_embedding = config.embedding_model_instance.embed(
combined_document_string
)
# Create and store new document
# Create new document with PENDING status (visible in UI immediately)
document = Document(
search_space_id=search_space_id,
title=f"Teams - {team_name} - {channel_name}",
title=f"{team_name} - {channel_name}",
document_type=DocumentType.TEAMS_CONNECTOR,
document_metadata={
"team_name": team_name,
"team_id": team_id,
"channel_name": channel_name,
"channel_id": channel_id,
"start_date": start_date_str,
"end_date": end_date_str,
"message_count": len(messages),
"indexed_at": datetime.now().strftime(
"%Y-%m-%d %H:%M:%S"
),
"connector_id": connector_id,
},
content=combined_document_string,
embedding=doc_embedding,
chunks=chunks,
content_hash=content_hash,
content="Pending...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=None,
chunks=[], # Empty at creation - safe for async
status=DocumentStatus.pending(), # Pending until processing starts
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
session.add(document)
documents_indexed += 1
new_documents_created = True
# Batch commit every 10 documents
if documents_indexed % 10 == 0:
logger.info(
"Committing batch: %s Teams messages processed so far",
documents_indexed,
)
await session.commit()
logger.info(
"Successfully indexed channel %s in team %s with %s messages",
channel_name,
team_name,
len(messages),
)
messages_to_process.append(
{
"document": document,
"is_new": True,
"combined_document_string": combined_document_string,
"content_hash": content_hash,
"team_name": team_name,
"team_id": team_id,
"channel_name": channel_name,
"channel_id": channel_id,
"message_id": message_id,
"start_date": start_date_str,
"end_date": end_date_str,
}
)
except Exception as e:
logger.error(
@ -462,54 +432,143 @@ async def index_teams_messages(
skipped_channels.append(
f"{team_name}/{channel_name} (processing error)"
)
documents_skipped += 1
continue
except Exception as e:
logger.error("Error processing team %s: %s", team_name, str(e))
continue
# Update the last_indexed_at timestamp for the connector only if requested
# and if we successfully indexed at least one document
total_processed = documents_indexed
if total_processed > 0:
await update_connector_last_indexed(session, connector, update_last_indexed)
# Commit all pending documents - they all appear in UI now
if new_documents_created:
logger.info(
f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents"
)
await session.commit()
# =======================================================================
# PHASE 2: Process each document one by one
# Each document transitions: pending → processing → ready/failed
# =======================================================================
logger.info(f"Phase 2: Processing {len(messages_to_process)} documents")
for item in messages_to_process:
# Send heartbeat periodically
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time
document = item["document"]
try:
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
document.status = DocumentStatus.processing()
await session.commit()
# Heavy processing (embeddings, chunks)
chunks = await create_document_chunks(item["combined_document_string"])
doc_embedding = config.embedding_model_instance.embed(
item["combined_document_string"]
)
# Update document to READY with actual content
document.title = f"{item['team_name']} - {item['channel_name']}"
document.content = item["combined_document_string"]
document.content_hash = item["content_hash"]
document.embedding = doc_embedding
document.document_metadata = {
"team_name": item["team_name"],
"team_id": item["team_id"],
"channel_name": item["channel_name"],
"channel_id": item["channel_id"],
"start_date": item["start_date"],
"end_date": item["end_date"],
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"connector_id": connector_id,
}
safe_set_chunks(document, chunks)
document.updated_at = get_current_timestamp()
document.status = DocumentStatus.ready()
documents_indexed += 1
# Batch commit every 10 documents (for ready status updates)
if documents_indexed % 10 == 0:
logger.info(
"Committing batch: %s Teams messages processed so far",
documents_indexed,
)
await session.commit()
except Exception as e:
logger.error(f"Error processing Teams message: {e!s}", exc_info=True)
# Mark document as failed with reason (visible in UI)
try:
document.status = DocumentStatus.failed(str(e))
document.updated_at = get_current_timestamp()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
documents_failed += 1
continue
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
await update_connector_last_indexed(session, connector, update_last_indexed)
# Final commit for any remaining documents not yet committed in batches
logger.info(
"Final commit: Total %s Teams messages processed", documents_indexed
)
await session.commit()
try:
await session.commit()
logger.info("Successfully committed all Teams document changes to database")
except Exception as e:
# Handle any remaining integrity errors gracefully (race conditions, etc.)
if (
"duplicate key value violates unique constraint" in str(e).lower()
or "uniqueviolationerror" in str(e).lower()
):
logger.warning(
f"Duplicate content_hash detected during final commit. "
f"Rolling back and continuing. Error: {e!s}"
)
await session.rollback()
else:
raise
# Prepare result message
result_message = None
# Build warning message if there were issues
warning_parts = []
if duplicate_content_count > 0:
warning_parts.append(f"{duplicate_content_count} duplicate")
if documents_failed > 0:
warning_parts.append(f"{documents_failed} failed")
if skipped_channels:
result_message = f"Processed {total_processed} messages. Skipped {len(skipped_channels)} channels: {', '.join(skipped_channels)}"
else:
result_message = f"Processed {total_processed} messages."
warning_parts.append(f"{len(skipped_channels)} channels skipped")
warning_message = ", ".join(warning_parts) if warning_parts else None
# Log success
await task_logger.log_task_success(
log_entry,
f"Successfully completed Teams indexing for connector {connector_id}",
{
"messages_processed": total_processed,
"documents_indexed": documents_indexed,
"documents_skipped": documents_skipped,
"documents_failed": documents_failed,
"duplicate_content_count": duplicate_content_count,
"skipped_channels_count": len(skipped_channels),
"result_message": result_message,
},
)
logger.info(
"Teams indexing completed: %s new messages, %s skipped",
"Teams indexing completed: %s ready, %s skipped, %s failed "
"(%s duplicate content)",
documents_indexed,
documents_skipped,
documents_failed,
duplicate_content_count,
)
return (
total_processed,
None,
) # Return None on success (result_message is for logging only)
return documents_indexed, warning_message
except SQLAlchemyError as db_error:
await session.rollback()

View file

@ -1,5 +1,9 @@
"""
Webcrawler connector indexer.
Implements 2-phase document status updates for real-time UI feedback:
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
- Phase 2: Process each document: pending → processing → ready/failed
"""
import time
@ -11,7 +15,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.webcrawler_connector import WebCrawlerConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
@ -28,6 +32,7 @@ from .base import (
get_connector_by_id,
get_current_timestamp,
logger,
safe_set_chunks,
update_connector_last_indexed,
)
@ -49,7 +54,11 @@ async def index_crawled_urls(
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
"""
Index web page URLs.
Index web page URLs with real-time document status updates.
Implements 2-phase approach for real-time UI feedback:
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
- Phase 2: Process each document: pending → processing → ready/failed
Args:
session: Database session
@ -150,9 +159,9 @@ async def index_crawled_urls(
await task_logger.log_task_progress(
log_entry,
f"Starting to crawl {len(urls)} URLs",
f"Starting to process {len(urls)} URLs",
{
"stage": "crawling",
"stage": "processing",
"total_urls": len(urls),
},
)
@ -160,28 +169,128 @@ async def index_crawled_urls(
documents_indexed = 0
documents_updated = 0
documents_skipped = 0
failed_urls = []
documents_failed = 0
duplicate_content_count = 0
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
for idx, url in enumerate(urls, 1):
# Check if it's time for a heartbeat update
if (
on_heartbeat_callback
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
):
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
# =======================================================================
# PHASE 1: Analyze all URLs, create pending documents for new ones
# This makes ALL new documents visible in the UI immediately with pending status
# =======================================================================
urls_to_process = [] # List of dicts with document and URL data
new_documents_created = False
for url in urls:
try:
logger.info(f"Processing URL {idx}/{len(urls)}: {url}")
# Generate unique identifier hash for this URL
unique_identifier_hash = generate_unique_identifier_hash(
DocumentType.CRAWLED_URL, url, search_space_id
)
# Check if document with this unique identifier already exists
existing_document = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
if existing_document:
# Document exists - check if it's already being processed
if DocumentStatus.is_state(
existing_document.status, DocumentStatus.PENDING
):
logger.info(f"URL {url} already pending. Skipping.")
documents_skipped += 1
continue
if DocumentStatus.is_state(
existing_document.status, DocumentStatus.PROCESSING
):
logger.info(f"URL {url} already processing. Skipping.")
documents_skipped += 1
continue
# Queue existing document for potential update check
urls_to_process.append(
{
"document": existing_document,
"is_new": False,
"url": url,
"unique_identifier_hash": unique_identifier_hash,
}
)
continue
# Create new document with PENDING status (visible in UI immediately)
document = Document(
search_space_id=search_space_id,
title=url[:100], # Placeholder - URL as title (truncated)
document_type=DocumentType.CRAWLED_URL,
document_metadata={
"url": url,
"connector_id": connector_id,
},
content="Pending crawl...", # Placeholder content
content_hash=unique_identifier_hash, # Temporary unique value
unique_identifier_hash=unique_identifier_hash,
embedding=None,
chunks=[], # Empty at creation - safe for async
status=DocumentStatus.pending(), # PENDING status - visible in UI
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
session.add(document)
new_documents_created = True
urls_to_process.append(
{
"document": document,
"is_new": True,
"url": url,
"unique_identifier_hash": unique_identifier_hash,
}
)
except Exception as e:
logger.error(f"Error in Phase 1 for URL {url}: {e!s}", exc_info=True)
documents_failed += 1
continue
# Commit all pending documents - they all appear in UI now
if new_documents_created:
logger.info(
f"Phase 1: Committing {len([u for u in urls_to_process if u['is_new']])} pending documents"
)
await session.commit()
# =======================================================================
# PHASE 2: Process each URL one by one
# Each document transitions: pending → processing → ready/failed
# =======================================================================
logger.info(f"Phase 2: Processing {len(urls_to_process)} URLs")
for item in urls_to_process:
# Send heartbeat periodically
if on_heartbeat_callback:
current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed + documents_updated)
last_heartbeat_time = current_time
document = item["document"]
url = item["url"]
is_new = item["is_new"]
try:
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
document.status = DocumentStatus.processing()
await session.commit()
await task_logger.log_task_progress(
log_entry,
f"Crawling URL {idx}/{len(urls)}: {url}",
f"Crawling URL: {url}",
{
"stage": "crawling_url",
"url_index": idx,
"url": url,
},
)
@ -191,7 +300,10 @@ async def index_crawled_urls(
if error or not crawl_result:
logger.warning(f"Failed to crawl URL {url}: {error}")
failed_urls.append((url, error or "Unknown error"))
document.status = DocumentStatus.failed(error or "Crawl failed")
document.updated_at = get_current_timestamp()
await session.commit()
documents_failed += 1
continue
# Extract content and metadata
@ -201,23 +313,18 @@ async def index_crawled_urls(
if not content.strip():
logger.warning(f"Skipping URL with no content: {url}")
failed_urls.append((url, "No content extracted"))
documents_skipped += 1
document.status = DocumentStatus.failed("No content extracted")
document.updated_at = get_current_timestamp()
await session.commit()
documents_failed += 1
continue
# Format content as structured document for summary generation (includes all metadata)
# Format content as structured document for summary generation
structured_document = crawler.format_to_structured_document(
crawl_result
)
# Generate unique identifier hash for this URL
unique_identifier_hash = generate_unique_identifier_hash(
DocumentType.CRAWLED_URL, url, search_space_id
)
# Generate content hash using a version WITHOUT metadata
# This ensures the hash only changes when actual content changes,
# not when metadata (which contains dynamic fields like timestamps, IDs, etc.) changes
structured_document_for_hash = crawler.format_to_structured_document(
crawl_result, exclude_metadata=True
)
@ -225,114 +332,53 @@ async def index_crawled_urls(
structured_document_for_hash, search_space_id
)
# Check if document with this unique identifier already exists
existing_document = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
# Extract useful metadata
title = metadata.get("title", url)
description = metadata.get("description", "")
language = metadata.get("language", "")
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
logger.info(f"Document for URL {url} unchanged. Skipping.")
documents_skipped += 1
continue
else:
# Content has changed - update the existing document
logger.info(
f"Content changed for URL {url}. Updating document."
)
# Update title immediately for better UX
document.title = title
await session.commit()
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"url": url,
"title": title,
"description": description,
"language": language,
"document_type": "Crawled URL",
"crawler_type": crawler_type,
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
structured_document, user_llm, document_metadata
)
else:
# Fallback to simple summary if no LLM configured
summary_content = f"Crawled URL: {title}\n\n"
summary_content += f"URL: {url}\n"
if description:
summary_content += f"Description: {description}\n"
if language:
summary_content += f"Language: {language}\n"
summary_content += f"Crawler: {crawler_type}\n\n"
# Add content preview
content_preview = content[:1000]
if len(content) > 1000:
content_preview += "..."
summary_content += f"Content Preview:\n{content_preview}\n"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
# Process chunks
chunks = await create_document_chunks(content)
# Update existing document
existing_document.title = title
existing_document.content = summary_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = {
**metadata,
"crawler_type": crawler_type,
"last_crawled_at": datetime.now().strftime(
"%Y-%m-%d %H:%M:%S"
),
}
existing_document.chunks = chunks
existing_document.updated_at = get_current_timestamp()
documents_updated += 1
logger.info(f"Successfully updated URL {url}")
continue
# Document doesn't exist by unique_identifier_hash
# Check if a document with the same content_hash exists (from another connector)
with session.no_autoflush:
duplicate_by_content = await check_duplicate_document_by_hash(
session, content_hash
)
if duplicate_by_content:
logger.info(
f"URL {url} already indexed by another connector "
f"(existing document ID: {duplicate_by_content.id}, "
f"type: {duplicate_by_content.document_type}). Skipping."
)
# For existing documents, check if content has changed
if not is_new and document.content_hash == content_hash:
logger.info(f"Document for URL {url} unchanged. Marking as ready.")
# Ensure status is ready (might have been stuck)
document.status = DocumentStatus.ready()
await session.commit()
documents_skipped += 1
continue
# Document doesn't exist - create new one
# Generate summary with metadata
# For new documents, check if duplicate content exists elsewhere
if is_new:
with session.no_autoflush:
duplicate_by_content = await check_duplicate_document_by_hash(
session, content_hash
)
if duplicate_by_content:
logger.info(
f"URL {url} already indexed by another connector "
f"(existing document ID: {duplicate_by_content.id}). "
f"Marking as failed."
)
document.status = DocumentStatus.failed(
"Duplicate content exists"
)
document.updated_at = get_current_timestamp()
await session.commit()
duplicate_content_count += 1
documents_skipped += 1
continue
# Generate summary with LLM
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
document_metadata_for_summary = {
"url": url,
"title": title,
"description": description,
@ -344,7 +390,7 @@ async def index_crawled_urls(
summary_content,
summary_embedding,
) = await generate_document_summary(
structured_document, user_llm, document_metadata
structured_document, user_llm, document_metadata_for_summary
)
else:
# Fallback to simple summary if no LLM configured
@ -366,32 +412,32 @@ async def index_crawled_urls(
summary_content
)
# Process chunks
chunks = await create_document_chunks(content)
document = Document(
search_space_id=search_space_id,
title=title,
document_type=DocumentType.CRAWLED_URL,
document_metadata={
**metadata,
"crawler_type": crawler_type,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
},
content=summary_content,
content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
# Update document to READY with actual content
document.title = title
document.content = summary_content
document.content_hash = content_hash
document.embedding = summary_embedding
document.document_metadata = {
**metadata,
"crawler_type": crawler_type,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"connector_id": connector_id,
}
safe_set_chunks(document, chunks)
document.status = DocumentStatus.ready() # READY status
document.updated_at = get_current_timestamp()
session.add(document)
documents_indexed += 1
logger.info(f"Successfully indexed new URL {url}")
if is_new:
documents_indexed += 1
else:
documents_updated += 1
# Batch commit every 10 documents
logger.info(f"Successfully processed URL {url}")
# Batch commit every 10 documents (for ready status updates)
if (documents_indexed + documents_updated) % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed + documents_updated} URLs processed so far"
@ -399,32 +445,51 @@ async def index_crawled_urls(
await session.commit()
except Exception as e:
logger.error(
f"Error processing URL {url}: {e!s}",
exc_info=True,
)
failed_urls.append((url, str(e)))
logger.error(f"Error processing URL {url}: {e!s}", exc_info=True)
# Mark document as failed with reason (visible in UI)
try:
document.status = DocumentStatus.failed(str(e)[:200])
document.updated_at = get_current_timestamp()
await session.commit()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
documents_failed += 1
continue
total_processed = documents_indexed + documents_updated
if total_processed > 0:
await update_connector_last_indexed(session, connector, update_last_indexed)
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
await update_connector_last_indexed(session, connector, update_last_indexed)
# Final commit for any remaining documents not yet committed in batches
logger.info(
f"Final commit: Total {documents_indexed} new, {documents_updated} updated URLs processed"
)
await session.commit()
# Log failed URLs if any (for debugging purposes)
if failed_urls:
failed_summary = "; ".join(
[f"{url}: {error}" for url, error in failed_urls[:5]]
try:
await session.commit()
logger.info(
"Successfully committed all webcrawler document changes to database"
)
if len(failed_urls) > 5:
failed_summary += f" (and {len(failed_urls) - 5} more)"
logger.warning(f"Some URLs failed to index: {failed_summary}")
except Exception as e:
# Handle any remaining integrity errors gracefully
if "duplicate key value violates unique constraint" in str(e).lower():
logger.warning(
f"Duplicate content_hash detected during final commit. "
f"Rolling back and continuing. Error: {e!s}"
)
await session.rollback()
else:
raise
# Build warning message if there were issues
warning_parts = []
if duplicate_content_count > 0:
warning_parts.append(f"{duplicate_content_count} duplicate")
if documents_failed > 0:
warning_parts.append(f"{documents_failed} failed")
warning_message = ", ".join(warning_parts) if warning_parts else None
await task_logger.log_task_success(
log_entry,
@ -434,19 +499,21 @@ async def index_crawled_urls(
"documents_indexed": documents_indexed,
"documents_updated": documents_updated,
"documents_skipped": documents_skipped,
"failed_urls_count": len(failed_urls),
"documents_failed": documents_failed,
"duplicate_content_count": duplicate_content_count,
},
)
logger.info(
f"Web page indexing completed: {documents_indexed} new, "
f"{documents_updated} updated, {documents_skipped} skipped, "
f"{len(failed_urls)} failed"
f"{documents_failed} failed"
)
return (
total_processed,
None,
) # Return None on success (result_message is for logging only)
if warning_message:
return total_processed, f"Completed with issues: {warning_message}"
return total_processed, None
except SQLAlchemyError as db_error:
await session.rollback()
@ -494,9 +561,7 @@ async def get_crawled_url_documents(
)
if connector_id:
# Filter by connector if needed - you might need to add a connector_id field to Document
# or filter by some other means depending on your schema
pass
query = query.filter(Document.connector_id == connector_id)
result = await session.execute(query)
documents = result.scalars().all()

View file

@ -14,6 +14,35 @@ from app.db import Document
md = MarkdownifyTransformer()
def safe_set_chunks(document: Document, chunks: list) -> None:
    """
    Attach ``chunks`` to ``document`` without touching the previously loaded value.

    Use this helper in place of a plain ``document.chunks = chunks``.

    Background:
        - A plain assignment makes SQLAlchemy fetch the OLD chunks first
          (for comparison / orphan detection).
        - That lazy load fails in an async context with the asyncpg driver
          (MissingGreenlet / greenlet_spawn errors).
        - ``set_committed_value`` writes the attribute directly and skips
          that load, so it works whether or not the document was loaded
          with ``selectinload``.

    Args:
        document: The Document object to update
        chunks: List of Chunk objects to assign

    Example:
        # Instead of: document.chunks = chunks (DANGEROUS!)
        safe_set_chunks(document, chunks)  # Always safe
    """
    from sqlalchemy.orm import attributes as orm_attributes

    # NOTE(review): set_committed_value stores the value with no change
    # history - confirm the new chunks are still flushed and the old rows
    # removed for documents that are already persistent.
    orm_attributes.set_committed_value(document, "chunks", chunks)
def get_current_timestamp() -> datetime:
"""
Get the current timestamp with timezone for updated_at field.

View file

@ -3,6 +3,11 @@ Circleback meeting document processor.
This module processes meeting data received from Circleback webhooks
and stores it as searchable documents in the database.
Implements real-time document status updates for UI feedback:
- Create document with 'pending' status (visible in UI immediately)
- Set to 'processing' while processing content
- Set to 'ready' or 'failed' when complete
"""
import logging
@ -14,6 +19,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.db import (
Document,
DocumentStatus,
DocumentType,
SearchSourceConnector,
SearchSourceConnectorType,
@ -30,6 +36,7 @@ from app.utils.document_converters import (
from .base import (
check_document_by_unique_identifier,
get_current_timestamp,
safe_set_chunks,
)
logger = logging.getLogger(__name__)
@ -47,6 +54,11 @@ async def add_circleback_meeting_document(
"""
Process and store a Circleback meeting document.
Implements real-time document status updates:
- Phase 1: Create document with 'pending' status (visible in UI immediately)
- Phase 2: Set to 'processing' while processing content
- Phase 3: Set to 'ready' or 'failed' when complete
Args:
session: Database session
meeting_id: Circleback meeting ID
@ -59,6 +71,7 @@ async def add_circleback_meeting_document(
Returns:
Document object if successful, None if failed or duplicate
"""
document = None
try:
# Generate unique identifier hash using Circleback meeting ID
unique_identifier = f"circleback_{meeting_id}"
@ -77,6 +90,12 @@ async def add_circleback_meeting_document(
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
# Ensure status is ready (might have been stuck in processing/pending)
if not DocumentStatus.is_state(
existing_document.status, DocumentStatus.READY
):
existing_document.status = DocumentStatus.ready()
await session.commit()
logger.info(f"Circleback meeting {meeting_id} unchanged. Skipping.")
return existing_document
else:
@ -84,6 +103,78 @@ async def add_circleback_meeting_document(
logger.info(
f"Content changed for Circleback meeting {meeting_id}. Updating document."
)
document = existing_document
# Set to PROCESSING status and commit - shows "processing" in UI
document.status = DocumentStatus.processing()
await session.commit()
else:
# =======================================================================
# PHASE 1: Create document with PENDING status
# This makes the document visible in the UI immediately
# =======================================================================
# Fetch the user who set up the Circleback connector (preferred)
# or fall back to search space owner if no connector found
created_by_user_id = None
# Try to find the Circleback connector for this search space
connector_result = await session.execute(
select(SearchSourceConnector.user_id).where(
SearchSourceConnector.search_space_id == search_space_id,
SearchSourceConnector.connector_type
== SearchSourceConnectorType.CIRCLEBACK_CONNECTOR,
)
)
connector_user = connector_result.scalar_one_or_none()
if connector_user:
# Use the user who set up the Circleback connector
created_by_user_id = connector_user
else:
# Fallback: use search space owner if no connector found
search_space_result = await session.execute(
select(SearchSpace.user_id).where(SearchSpace.id == search_space_id)
)
created_by_user_id = search_space_result.scalar_one_or_none()
# Create new document with PENDING status (visible in UI immediately)
document = Document(
search_space_id=search_space_id,
title=meeting_name,
document_type=DocumentType.CIRCLEBACK,
document_metadata={
"CIRCLEBACK_MEETING_ID": meeting_id,
"MEETING_NAME": meeting_name,
"SOURCE": "CIRCLEBACK_WEBHOOK",
"connector_id": connector_id,
},
content="Pending...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=None,
chunks=[], # Empty at creation - safe for async
status=DocumentStatus.pending(), # Pending until processing starts
content_needs_reindexing=False,
updated_at=get_current_timestamp(),
created_by_id=created_by_user_id,
connector_id=connector_id,
)
session.add(document)
# Commit immediately so document appears in UI with pending status
await session.commit()
logger.info(
f"Created pending Circleback meeting document {meeting_id} in search space {search_space_id}"
)
# =======================================================================
# PHASE 2: Set to PROCESSING status
# =======================================================================
document.status = DocumentStatus.processing()
await session.commit()
# =======================================================================
# PHASE 3: Process the document content
# =======================================================================
# Get LLM for generating summary
llm = await get_document_summary_llm(session, search_space_id)
@ -100,7 +191,7 @@ async def add_circleback_meeting_document(
summary_embedding = None
else:
# Generate summary with metadata
document_metadata = {
summary_metadata = {
"meeting_name": meeting_name,
"meeting_id": meeting_id,
"document_type": "Circleback Meeting",
@ -111,7 +202,7 @@ async def add_circleback_meeting_document(
},
}
summary_content, summary_embedding = await generate_document_summary(
markdown_content, llm, document_metadata
markdown_content, llm, summary_metadata
)
# Process chunks
@ -126,7 +217,7 @@ async def add_circleback_meeting_document(
f"Failed to convert Circleback meeting {meeting_id} to BlockNote JSON, document will not be editable"
)
# Prepare document metadata
# Prepare final document metadata
document_metadata = {
"CIRCLEBACK_MEETING_ID": meeting_id,
"MEETING_NAME": meeting_name,
@ -134,77 +225,34 @@ async def add_circleback_meeting_document(
**metadata,
}
# Fetch the user who set up the Circleback connector (preferred)
# or fall back to search space owner if no connector found
created_by_user_id = None
# =======================================================================
# PHASE 4: Update document to READY status with actual content
# =======================================================================
document.title = meeting_name
document.content = summary_content
document.content_hash = content_hash
if summary_embedding is not None:
document.embedding = summary_embedding
document.document_metadata = document_metadata
safe_set_chunks(document, chunks)
document.blocknote_document = blocknote_json
document.content_needs_reindexing = False
document.updated_at = get_current_timestamp()
document.status = DocumentStatus.ready()
# Ensure connector_id is set (backfill for documents created before this field)
if connector_id is not None:
document.connector_id = connector_id
# Try to find the Circleback connector for this search space
connector_result = await session.execute(
select(SearchSourceConnector.user_id).where(
SearchSourceConnector.search_space_id == search_space_id,
SearchSourceConnector.connector_type
== SearchSourceConnectorType.CIRCLEBACK_CONNECTOR,
)
)
connector_user = connector_result.scalar_one_or_none()
await session.commit()
await session.refresh(document)
if connector_user:
# Use the user who set up the Circleback connector
created_by_user_id = connector_user
else:
# Fallback: use search space owner if no connector found
search_space_result = await session.execute(
select(SearchSpace.user_id).where(SearchSpace.id == search_space_id)
)
created_by_user_id = search_space_result.scalar_one_or_none()
# Update or create document
if existing_document:
# Update existing document
existing_document.title = meeting_name
existing_document.content = summary_content
existing_document.content_hash = content_hash
if summary_embedding is not None:
existing_document.embedding = summary_embedding
existing_document.document_metadata = document_metadata
existing_document.chunks = chunks
existing_document.blocknote_document = blocknote_json
existing_document.content_needs_reindexing = False
existing_document.updated_at = get_current_timestamp()
# Ensure connector_id is set (backfill for documents created before this field)
if connector_id is not None:
existing_document.connector_id = connector_id
await session.commit()
await session.refresh(existing_document)
document = existing_document
logger.info(
f"Updated Circleback meeting document {meeting_id} in search space {search_space_id}"
)
else:
# Create new document
document = Document(
search_space_id=search_space_id,
title=meeting_name,
document_type=DocumentType.CIRCLEBACK,
document_metadata=document_metadata,
content=summary_content,
embedding=summary_embedding,
chunks=chunks,
content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash,
blocknote_document=blocknote_json,
content_needs_reindexing=False,
updated_at=get_current_timestamp(),
created_by_id=created_by_user_id,
connector_id=connector_id,
)
session.add(document)
await session.commit()
await session.refresh(document)
logger.info(
f"Created new Circleback meeting document {meeting_id} in search space {search_space_id}"
f"Processed Circleback meeting document {meeting_id} in search space {search_space_id} - now ready"
)
return document
@ -214,8 +262,28 @@ async def add_circleback_meeting_document(
logger.error(
f"Database error processing Circleback meeting {meeting_id}: {db_error}"
)
# Mark document as failed if it was created
if document is not None:
try:
document.status = DocumentStatus.failed(str(db_error))
document.updated_at = get_current_timestamp()
await session.commit()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
raise db_error
except Exception as e:
await session.rollback()
logger.error(f"Failed to process Circleback meeting {meeting_id}: {e!s}")
# Mark document as failed if it was created
if document is not None:
try:
document.status = DocumentStatus.failed(str(e))
document.updated_at = get_current_timestamp()
await session.commit()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
raise RuntimeError(f"Failed to process Circleback meeting: {e!s}") from e

View file

@ -17,7 +17,7 @@ from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config as app_config
from app.db import Document, DocumentType, Log, Notification
from app.db import Document, DocumentStatus, DocumentType, Log, Notification
from app.services.llm_service import get_user_long_context_llm
from app.services.notification_service import NotificationService
from app.services.task_logging_service import TaskLoggingService
@ -33,6 +33,7 @@ from .base import (
check_document_by_unique_identifier,
check_duplicate_document,
get_current_timestamp,
safe_set_chunks,
)
from .markdown_processor import add_received_markdown_file_document
@ -499,6 +500,7 @@ async def add_received_file_document_using_unstructured(
existing_document.blocknote_document = blocknote_json
existing_document.content_needs_reindexing = False
existing_document.updated_at = get_current_timestamp()
existing_document.status = DocumentStatus.ready() # Mark as ready
await session.commit()
await session.refresh(existing_document)
@ -528,6 +530,7 @@ async def add_received_file_document_using_unstructured(
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector.get("connector_id") if connector else None,
status=DocumentStatus.ready(), # Mark as ready
)
session.add(document)
@ -640,6 +643,7 @@ async def add_received_file_document_using_llamacloud(
existing_document.blocknote_document = blocknote_json
existing_document.content_needs_reindexing = False
existing_document.updated_at = get_current_timestamp()
existing_document.status = DocumentStatus.ready() # Mark as ready
await session.commit()
await session.refresh(existing_document)
@ -669,6 +673,7 @@ async def add_received_file_document_using_llamacloud(
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector.get("connector_id") if connector else None,
status=DocumentStatus.ready(), # Mark as ready
)
session.add(document)
@ -806,6 +811,7 @@ async def add_received_file_document_using_docling(
existing_document.blocknote_document = blocknote_json
existing_document.content_needs_reindexing = False
existing_document.updated_at = get_current_timestamp()
existing_document.status = DocumentStatus.ready() # Mark as ready
await session.commit()
await session.refresh(existing_document)
@ -835,6 +841,7 @@ async def add_received_file_document_using_docling(
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector.get("connector_id") if connector else None,
status=DocumentStatus.ready(), # Mark as ready
)
session.add(document)
@ -1606,3 +1613,372 @@ async def process_file_in_background(
logging.error(f"Error processing file in background: {error_message}")
raise # Re-raise so the wrapper can also handle it
async def process_file_in_background_with_document(
    document: Document,
    file_path: str,
    filename: str,
    search_space_id: int,
    user_id: str,
    session: AsyncSession,
    task_logger: TaskLoggingService,
    log_entry: Log,
    connector: dict | None = None,
    notification: Notification | None = None,
) -> Document | None:
    """
    Process file and update existing pending document (2-phase pattern).

    This function is Phase 2 of the real-time document status updates:
    - Phase 1 (API): Created document with pending status
    - Phase 2 (this): Process file and update document to ready/failed

    The document already exists. This function:
    1. Parses the file content (markdown/text, audio transcription, or the
       configured ETL service) and removes the temp file once it was read
    2. Returns None when another document already holds the same content
       hash (this document's status is NOT changed here)
    3. Updates the document with summary, embedding, chunks and BlockNote JSON
    4. Sets status to 'ready' and commits

    Args:
        document: Existing document to fill in
        file_path: Path to the uploaded file
        filename: Original filename (its extension selects the parser)
        search_space_id: ID of the search space
        user_id: ID of the user
        session: Database session
        task_logger: Task logging service
        log_entry: Log entry for this task
        connector: Optional connector info; accepted for signature parity but
            not referenced in this function body
        notification: Optional notification for progress updates

    Returns:
        Updated Document object if successful, None if duplicate content detected

    Raises:
        Exception: Any processing error is re-raised unchanged after the
            session has been rolled back and the failure has been logged.
    """
    import os

    from sqlalchemy import inspect as sa_inspect

    from app.config import config as app_config
    from app.services.llm_service import get_user_long_context_llm
    from app.utils.blocknote_converter import convert_markdown_to_blocknote

    # Read the primary key from the instance state instead of `document.id`.
    # commit()/rollback() expire ORM attributes, and touching an expired
    # attribute under AsyncSession triggers implicit IO (MissingGreenlet) -
    # in the except handler that would hide the original error.
    identity = sa_inspect(document).identity
    document_id = identity[0] if identity else None

    async def _notify_parsing(stage_message: str) -> None:
        """Send a 'parsing' progress update if a notification is attached."""
        if notification:
            await NotificationService.document_processing.notify_processing_progress(
                session,
                notification,
                stage="parsing",
                stage_message=stage_message,
            )

    try:
        markdown_content = None
        etl_service = None

        # ===== STEP 1: Parse file content based on type =====
        lowered_name = filename.lower()

        if lowered_name.endswith((".md", ".markdown", ".txt")):
            # Markdown / plain text: the file content is used as-is
            await _notify_parsing("Reading file")

            await task_logger.log_task_progress(
                log_entry,
                f"Processing markdown/text file: {filename}",
                {"file_type": "markdown", "processing_stage": "reading_file"},
            )

            with open(file_path, encoding="utf-8") as f:
                markdown_content = f.read()
            etl_service = "MARKDOWN"

            # Clean up temp file (best effort)
            with contextlib.suppress(Exception):
                os.unlink(file_path)

        elif lowered_name.endswith(
            (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")
        ):
            # Audio: transcribe, then wrap the transcript in a markdown heading
            await _notify_parsing("Transcribing audio")

            await task_logger.log_task_progress(
                log_entry,
                f"Processing audio file for transcription: {filename}",
                {"file_type": "audio", "processing_stage": "starting_transcription"},
            )

            # A STT_SERVICE value prefixed with "local/" selects the in-process
            # service; anything else goes through atranscription().
            stt_service_type = (
                "local"
                if app_config.STT_SERVICE
                and app_config.STT_SERVICE.startswith("local/")
                else "external"
            )

            if stt_service_type == "local":
                from app.services.stt_service import stt_service

                result = stt_service.transcribe_file(file_path)
                transcribed_text = result.get("text", "")
            else:
                with open(file_path, "rb") as audio_file:
                    transcription_kwargs = {
                        "model": app_config.STT_SERVICE,
                        "file": audio_file,
                        "api_key": app_config.STT_SERVICE_API_KEY,
                    }
                    if app_config.STT_SERVICE_API_BASE:
                        transcription_kwargs["api_base"] = (
                            app_config.STT_SERVICE_API_BASE
                        )
                    transcription_response = await atranscription(
                        **transcription_kwargs
                    )
                transcribed_text = transcription_response.get("text", "")

            if not transcribed_text:
                raise ValueError("Transcription returned empty text")
            markdown_content = f"# Transcription of {filename}\n\n{transcribed_text}"
            etl_service = "AUDIO_TRANSCRIPTION"

            # Clean up temp file (best effort)
            with contextlib.suppress(Exception):
                os.unlink(file_path)

        else:
            # Document files - use ETL service
            from app.services.page_limit_service import PageLimitService

            page_limit_service = PageLimitService(session)

            # Estimate page count; fall back to a size heuristic (one page
            # per 80 KiB, minimum 1) when the estimator fails.
            try:
                estimated_pages = page_limit_service.estimate_pages_before_processing(
                    file_path
                )
            except Exception:
                file_size = os.path.getsize(file_path)
                estimated_pages = max(1, file_size // (80 * 1024))

            # Check page limit before doing any expensive parsing
            await page_limit_service.check_page_limit(user_id, estimated_pages)

            if app_config.ETL_SERVICE == "UNSTRUCTURED":
                await _notify_parsing("Extracting content")

                from langchain_unstructured import UnstructuredLoader

                loader = UnstructuredLoader(
                    file_path,
                    mode="elements",
                    post_processors=[],
                    languages=["eng"],
                    include_orig_elements=False,
                    include_metadata=False,
                    strategy="auto",
                )
                docs = await loader.aload()
                markdown_content = await convert_document_to_markdown(docs)
                actual_pages = page_limit_service.estimate_pages_from_elements(docs)
                final_page_count = max(estimated_pages, actual_pages)
                etl_service = "UNSTRUCTURED"

                # Update page usage
                await page_limit_service.update_page_usage(
                    user_id, final_page_count, allow_exceed=True
                )

            elif app_config.ETL_SERVICE == "LLAMACLOUD":
                await _notify_parsing("Extracting content")

                result = await parse_with_llamacloud_retry(
                    file_path=file_path,
                    estimated_pages=estimated_pages,
                    task_logger=task_logger,
                    log_entry=log_entry,
                )
                markdown_documents = await result.aget_markdown_documents(
                    split_by_page=False
                )
                if not markdown_documents:
                    raise RuntimeError(
                        f"LlamaCloud parsing returned no documents: {filename}"
                    )
                markdown_content = markdown_documents[0].text
                etl_service = "LLAMACLOUD"

                # Update page usage
                await page_limit_service.update_page_usage(
                    user_id, estimated_pages, allow_exceed=True
                )

            elif app_config.ETL_SERVICE == "DOCLING":
                await _notify_parsing("Extracting content")

                # Suppress logging during Docling import
                getLogger("docling.pipeline.base_pipeline").setLevel(ERROR)
                getLogger("docling.document_converter").setLevel(ERROR)
                getLogger(
                    "docling_core.transforms.chunker.hierarchical_chunker"
                ).setLevel(ERROR)

                from docling.document_converter import DocumentConverter

                converter = DocumentConverter()
                result = converter.convert(file_path)
                markdown_content = result.document.export_to_markdown()
                etl_service = "DOCLING"

                # Update page usage
                await page_limit_service.update_page_usage(
                    user_id, estimated_pages, allow_exceed=True
                )

            else:
                raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}")

            # Clean up temp file (best effort)
            with contextlib.suppress(Exception):
                os.unlink(file_path)

        if not markdown_content:
            raise RuntimeError(f"Failed to extract content from file: {filename}")

        # ===== STEP 2: Check for duplicate content =====
        content_hash = generate_content_hash(markdown_content, search_space_id)
        existing_by_content = await check_duplicate_document(session, content_hash)
        if existing_by_content and existing_by_content.id != document_id:
            # Duplicate content found - signal it with None. The status of
            # `document` is left untouched here; the caller decides what to do.
            logging.info(
                f"Duplicate content detected for {filename}, "
                f"matches document {existing_by_content.id}"
            )
            return None

        # ===== STEP 3: Generate embeddings and chunks =====
        if notification:
            await NotificationService.document_processing.notify_processing_progress(
                session, notification, stage="chunking"
            )

        user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
        if user_llm:
            summary_metadata = {
                "file_name": filename,
                "etl_service": etl_service,
                "document_type": "File Document",
            }
            summary_content, summary_embedding = await generate_document_summary(
                markdown_content, user_llm, summary_metadata
            )
        else:
            # Fallback: use truncated content as summary
            summary_content = markdown_content[:4000]
            summary_embedding = app_config.embedding_model_instance.embed(
                summary_content
            )

        chunks = await create_document_chunks(markdown_content)

        # Convert to BlockNote for editing
        blocknote_json = await convert_markdown_to_blocknote(markdown_content)

        # ===== STEP 4: Update document to READY =====
        from sqlalchemy.orm.attributes import flag_modified

        document.title = filename
        document.content = summary_content
        document.content_hash = content_hash
        document.embedding = summary_embedding
        # Keys already present on the document win over the two defaults
        # because the existing metadata is unpacked last.
        document.document_metadata = {
            "FILE_NAME": filename,
            "ETL_SERVICE": etl_service or "UNKNOWN",
            **(document.document_metadata or {}),
        }
        flag_modified(document, "document_metadata")

        # Use safe_set_chunks to avoid async issues
        safe_set_chunks(document, chunks)

        document.blocknote_document = blocknote_json
        document.content_needs_reindexing = False
        document.updated_at = get_current_timestamp()
        document.status = DocumentStatus.ready()  # Shows checkmark in UI

        await session.commit()
        await session.refresh(document)

        await task_logger.log_task_success(
            log_entry,
            f"Successfully processed file: {filename}",
            {
                "document_id": document.id,
                "content_hash": content_hash,
                "file_type": etl_service,
                "chunks_count": len(chunks),
            },
        )

        return document

    except Exception as e:
        await session.rollback()

        from app.services.page_limit_service import PageLimitExceededError

        # Surface page-limit problems verbatim; everything else gets a
        # generic message (the full error text is logged separately).
        if isinstance(e, PageLimitExceededError):
            error_message = str(e)
        elif isinstance(e, HTTPException) and "page limit" in str(e.detail).lower():
            error_message = str(e.detail)
        else:
            error_message = f"Failed to process file: {filename}"

        await task_logger.log_task_failure(
            log_entry,
            error_message,
            str(e),
            {
                "error_type": type(e).__name__,
                "filename": filename,
                # Captured up front: `document.id` is expired after rollback
                "document_id": document_id,
            },
        )
        logging.error(f"Error processing file with document: {error_message}")
        raise

View file

@ -7,7 +7,7 @@ import logging
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.db import Document, DocumentType
from app.db import Document, DocumentStatus, DocumentType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
@ -270,6 +270,7 @@ async def add_received_markdown_file_document(
existing_document.chunks = chunks
existing_document.blocknote_document = blocknote_json
existing_document.updated_at = get_current_timestamp()
existing_document.status = DocumentStatus.ready() # Mark as ready
await session.commit()
await session.refresh(existing_document)
@ -297,6 +298,7 @@ async def add_received_markdown_file_document(
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector.get("connector_id") if connector else None,
status=DocumentStatus.ready(), # Mark as ready
)
session.add(document)

View file

@ -1,5 +1,9 @@
"""
YouTube video document processor.
Implements 2-phase document status updates for real-time UI feedback:
- Phase 1: Create document with 'pending' status (visible in UI immediately)
- Phase 2: Process document: pending → processing → ready/failed
"""
import logging
@ -12,7 +16,7 @@ from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from youtube_transcript_api import YouTubeTranscriptApi
from app.db import Document, DocumentType
from app.db import Document, DocumentStatus, DocumentType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
@ -26,6 +30,7 @@ from app.utils.proxy_config import get_requests_proxies
from .base import (
check_document_by_unique_identifier,
get_current_timestamp,
safe_set_chunks,
)
@ -61,6 +66,10 @@ async def add_youtube_video_document(
"""
Process a YouTube video URL, extract transcripts, and store as a document.
Implements 2-phase document status updates for real-time UI feedback:
- Phase 1: Create document with 'pending' status (visible in UI immediately)
- Phase 2: Process document: pending → processing → ready/failed
Args:
session: Database session for storing the document
url: YouTube video URL (supports standard, shortened, and embed formats)
@ -85,15 +94,18 @@ async def add_youtube_video_document(
metadata={"url": url, "user_id": str(user_id)},
)
document = None
video_id = None
is_new_document = False
try:
# Extract video ID from URL
# Extract video ID from URL (lightweight operation)
await task_logger.log_task_progress(
log_entry,
f"Extracting video ID from URL: {url}",
{"stage": "video_id_extraction"},
)
# Get video ID
video_id = get_youtube_video_id(url)
if not video_id:
raise ValueError(f"Could not extract video ID from URL: {url}")
@ -104,13 +116,87 @@ async def add_youtube_video_document(
{"stage": "video_id_extracted", "video_id": video_id},
)
# Get video metadata
# Generate unique identifier hash for this YouTube video
unique_identifier_hash = generate_unique_identifier_hash(
DocumentType.YOUTUBE_VIDEO, video_id, search_space_id
)
# Check if document with this unique identifier already exists
await task_logger.log_task_progress(
log_entry,
f"Checking for existing video: {video_id}",
{"stage": "duplicate_check", "video_id": video_id},
)
existing_document = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
# =======================================================================
# PHASE 1: Create pending document or prepare existing for update
# =======================================================================
if existing_document:
document = existing_document
is_new_document = False
# Check if already being processed
if DocumentStatus.is_state(
existing_document.status, DocumentStatus.PENDING
):
logging.info(
f"YouTube video {video_id} already pending. Returning existing."
)
return existing_document
if DocumentStatus.is_state(
existing_document.status, DocumentStatus.PROCESSING
):
logging.info(
f"YouTube video {video_id} already processing. Returning existing."
)
return existing_document
else:
# Create new document with PENDING status (visible in UI immediately)
await task_logger.log_task_progress(
log_entry,
f"Creating pending document for video: {video_id}",
{"stage": "pending_document_creation"},
)
document = Document(
title=f"YouTube Video: {video_id}", # Placeholder title
document_type=DocumentType.YOUTUBE_VIDEO,
document_metadata={
"url": url,
"video_id": video_id,
},
content="Processing video...", # Placeholder content
content_hash=unique_identifier_hash, # Temporary unique value
unique_identifier_hash=unique_identifier_hash,
embedding=None,
chunks=[], # Empty at creation
status=DocumentStatus.pending(), # PENDING status - visible in UI
search_space_id=search_space_id,
updated_at=get_current_timestamp(),
created_by_id=user_id,
)
session.add(document)
await session.commit() # Document visible in UI now with pending status!
is_new_document = True
logging.info(f"Created pending document for YouTube video {video_id}")
# =======================================================================
# PHASE 2: Set to PROCESSING and do heavy work
# =======================================================================
document.status = DocumentStatus.processing()
await session.commit() # UI shows "processing" status
await task_logger.log_task_progress(
log_entry,
f"Fetching video metadata for: {video_id}",
{"stage": "metadata_fetch"},
)
# Fetch video metadata
params = {
"format": "json",
"url": f"https://www.youtube.com/watch?v={video_id}",
@ -130,6 +216,10 @@ async def add_youtube_video_document(
):
video_data = await response.json()
# Update title immediately for better UX (user sees actual title sooner)
document.title = video_data.get("title", f"YouTube Video: {video_id}")
await session.commit()
await task_logger.log_task_progress(
log_entry,
f"Video metadata fetched: {video_data.get('title', 'Unknown')}",
@ -219,53 +309,28 @@ async def add_youtube_video_document(
document_parts.append("</DOCUMENT>")
combined_document_string = "\n".join(document_parts)
# Generate unique identifier hash for this YouTube video
unique_identifier_hash = generate_unique_identifier_hash(
DocumentType.YOUTUBE_VIDEO, video_id, search_space_id
)
# Generate content hash
content_hash = generate_content_hash(combined_document_string, search_space_id)
# Check if document with this unique identifier already exists
await task_logger.log_task_progress(
log_entry,
f"Checking for existing video: {video_id}",
{"stage": "duplicate_check", "video_id": video_id},
)
# For existing documents, check if content has changed
if not is_new_document and existing_document.content_hash == content_hash:
await task_logger.log_task_success(
log_entry,
f"YouTube video document unchanged: {video_data.get('title', 'YouTube Video')}",
{
"duplicate_detected": True,
"existing_document_id": existing_document.id,
"video_id": video_id,
},
)
logging.info(
f"Document for YouTube video {video_id} unchanged. Marking as ready."
)
document.status = DocumentStatus.ready()
await session.commit()
return document
existing_document = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
await task_logger.log_task_success(
log_entry,
f"YouTube video document unchanged: {video_data.get('title', 'YouTube Video')}",
{
"duplicate_detected": True,
"existing_document_id": existing_document.id,
"video_id": video_id,
},
)
logging.info(
f"Document for YouTube video {video_id} unchanged. Skipping."
)
return existing_document
else:
# Content has changed - update the existing document
logging.info(
f"Content changed for YouTube video {video_id}. Updating document."
)
await task_logger.log_task_progress(
log_entry,
f"Updating YouTube video document: {video_data.get('title', 'YouTube Video')}",
{"stage": "document_update", "video_id": video_id},
)
# Get LLM for summary generation (needed for both create and update)
# Get LLM for summary generation
await task_logger.log_task_progress(
log_entry,
f"Preparing for summary generation: {video_data.get('title', 'YouTube Video')}",
@ -287,7 +352,7 @@ async def add_youtube_video_document(
)
# Generate summary with metadata
document_metadata = {
document_metadata_for_summary = {
"url": url,
"video_id": video_id,
"title": video_data.get("title", "YouTube Video"),
@ -297,7 +362,7 @@ async def add_youtube_video_document(
"has_transcript": "No captions available" not in transcript_text,
}
summary_content, summary_embedding = await generate_document_summary(
combined_document_string, user_llm, document_metadata
combined_document_string, user_llm, document_metadata_for_summary
)
# Process chunks
@ -319,65 +384,33 @@ async def add_youtube_video_document(
chunks = await create_document_chunks(combined_document_string)
# Update or create document
if existing_document:
# Update existing document
await task_logger.log_task_progress(
log_entry,
f"Updating YouTube video document in database: {video_data.get('title', 'YouTube Video')}",
{"stage": "document_update", "chunks_count": len(chunks)},
)
# =======================================================================
# PHASE 3: Update document to READY with all content
# =======================================================================
await task_logger.log_task_progress(
log_entry,
f"Finalizing document: {video_data.get('title', 'YouTube Video')}",
{"stage": "document_finalization", "chunks_count": len(chunks)},
)
existing_document.title = video_data.get("title", "YouTube Video")
existing_document.content = summary_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = {
"url": url,
"video_id": video_id,
"video_title": video_data.get("title", "YouTube Video"),
"author": video_data.get("author_name", "Unknown"),
"thumbnail": video_data.get("thumbnail_url", ""),
}
existing_document.chunks = chunks
existing_document.blocknote_document = blocknote_json
existing_document.updated_at = get_current_timestamp()
document.title = video_data.get("title", "YouTube Video")
document.content = summary_content
document.content_hash = content_hash
document.embedding = summary_embedding
document.document_metadata = {
"url": url,
"video_id": video_id,
"video_title": video_data.get("title", "YouTube Video"),
"author": video_data.get("author_name", "Unknown"),
"thumbnail": video_data.get("thumbnail_url", ""),
}
safe_set_chunks(document, chunks)
document.blocknote_document = blocknote_json
document.status = DocumentStatus.ready() # READY status - fully processed
document.updated_at = get_current_timestamp()
await session.commit()
await session.refresh(existing_document)
document = existing_document
else:
# Create new document
await task_logger.log_task_progress(
log_entry,
f"Creating YouTube video document in database: {video_data.get('title', 'YouTube Video')}",
{"stage": "document_creation", "chunks_count": len(chunks)},
)
document = Document(
title=video_data.get("title", "YouTube Video"),
document_type=DocumentType.YOUTUBE_VIDEO,
document_metadata={
"url": url,
"video_id": video_id,
"video_title": video_data.get("title", "YouTube Video"),
"author": video_data.get("author_name", "Unknown"),
"thumbnail": video_data.get("thumbnail_url", ""),
},
content=summary_content,
embedding=summary_embedding,
chunks=chunks,
search_space_id=search_space_id,
content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash,
blocknote_document=blocknote_json,
updated_at=get_current_timestamp(),
created_by_id=user_id,
)
session.add(document)
await session.commit()
await session.refresh(document)
await session.commit()
await session.refresh(document)
# Log success
await task_logger.log_task_success(
@ -395,27 +428,51 @@ async def add_youtube_video_document(
)
return document
except SQLAlchemyError as db_error:
await session.rollback()
# Mark document as failed if it exists
if document:
try:
document.status = DocumentStatus.failed(
f"Database error: {str(db_error)[:150]}"
)
document.updated_at = get_current_timestamp()
await session.commit()
except Exception:
await session.rollback()
else:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Database error while processing YouTube video: {url}",
str(db_error),
{
"error_type": "SQLAlchemyError",
"video_id": video_id if "video_id" in locals() else None,
"video_id": video_id,
},
)
raise db_error
except Exception as e:
await session.rollback()
# Mark document as failed if it exists
if document:
try:
document.status = DocumentStatus.failed(str(e)[:200])
document.updated_at = get_current_timestamp()
await session.commit()
except Exception:
await session.rollback()
else:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Failed to process YouTube video: {url}",
str(e),
{
"error_type": type(e).__name__,
"video_id": video_id if "video_id" in locals() else None,
"video_id": video_id,
},
)
logging.error(f"Failed to process YouTube video: {e!s}")

View file

@ -13,6 +13,7 @@ import {
llmPreferencesAtom,
} from "@/atoms/new-llm-config/new-llm-config-query.atoms";
import { activeSearchSpaceIdAtom } from "@/atoms/search-spaces/search-space-query.atoms";
import { ConnectorIndicator } from "@/components/assistant-ui/connector-popup";
import { DocumentUploadDialogProvider } from "@/components/assistant-ui/document-upload-popup";
import { DashboardBreadcrumb } from "@/components/dashboard-breadcrumb";
import { LayoutDataProvider } from "@/components/layout";
@ -192,6 +193,8 @@ export function DashboardClientLayout({
<LayoutDataProvider searchSpaceId={searchSpaceId} breadcrumb={<DashboardBreadcrumb />}>
{children}
</LayoutDataProvider>
{/* Global connector dialog - triggered from documents page */}
<ConnectorIndicator hideTrigger />
</DocumentUploadDialogProvider>
);
}

View file

@ -1,10 +1,12 @@
"use client";
import type React from "react";
import { useRef, useState, useEffect } from "react";
import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
export function getDocumentTypeIcon(type: string): React.ReactNode {
return getConnectorIcon(type);
export function getDocumentTypeIcon(type: string, className?: string): React.ReactNode {
return getConnectorIcon(type, className);
}
export function getDocumentTypeLabel(type: string): string {
@ -15,16 +17,43 @@ export function getDocumentTypeLabel(type: string): string {
}
export function DocumentTypeChip({ type, className }: { type: string; className?: string }) {
const icon = getDocumentTypeIcon(type);
return (
<span
className={
"inline-flex items-center gap-1.5 rounded-full border border-border bg-primary/5 px-2 py-1 text-xs font-medium " +
(className ?? "")
const icon = getDocumentTypeIcon(type, "h-4 w-4");
const fullLabel = getDocumentTypeLabel(type);
const textRef = useRef<HTMLSpanElement>(null);
const [isTruncated, setIsTruncated] = useState(false);
useEffect(() => {
const checkTruncation = () => {
if (textRef.current) {
setIsTruncated(textRef.current.scrollWidth > textRef.current.clientWidth);
}
};
checkTruncation();
window.addEventListener("resize", checkTruncation);
return () => window.removeEventListener("resize", checkTruncation);
}, []);
const chip = (
<span
className={`inline-flex items-center gap-1.5 rounded bg-muted/40 px-2 py-1 text-xs text-muted-foreground max-w-full overflow-hidden ${className ?? ""}`}
>
<span className="text-primary">{icon}</span>
{getDocumentTypeLabel(type)}
<span className="opacity-80 flex-shrink-0">{icon}</span>
<span ref={textRef} className="truncate min-w-0">
{fullLabel}
</span>
</span>
);
if (isTruncated) {
return (
<Tooltip>
<TooltipTrigger asChild>{chip}</TooltipTrigger>
<TooltipContent side="top" className="max-w-xs">
<p>{fullLabel}</p>
</TooltipContent>
</Tooltip>
);
}
return chip;
}

View file

@ -1,9 +1,21 @@
"use client";
import { CircleAlert, CircleX, Columns3, Filter, ListFilter, Trash } from "lucide-react";
import { AnimatePresence, motion, type Variants } from "motion/react";
import { useSetAtom } from "jotai";
import {
CircleAlert,
CircleX,
FilePlus2,
FileType,
ListFilter,
Search,
SlidersHorizontal,
Trash,
} from "lucide-react";
import { motion } from "motion/react";
import { useTranslations } from "next-intl";
import React, { useMemo, useRef } from "react";
import React, { useMemo, useRef, useState } from "react";
import { connectorDialogOpenAtom } from "@/atoms/connector-dialog/connector-dialog.atoms";
import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup";
import {
AlertDialog,
AlertDialogAction,
@ -17,24 +29,10 @@ import {
} from "@/components/ui/alert-dialog";
import { Button } from "@/components/ui/button";
import { Checkbox } from "@/components/ui/checkbox";
import {
DropdownMenu,
DropdownMenuCheckboxItem,
DropdownMenuContent,
DropdownMenuLabel,
DropdownMenuTrigger,
} from "@/components/ui/dropdown-menu";
import { Input } from "@/components/ui/input";
import { Label } from "@/components/ui/label";
import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover";
import type { DocumentTypeEnum } from "@/contracts/types/document.types";
import type { ColumnVisibility } from "./types";
const fadeInScale: Variants = {
hidden: { opacity: 0, scale: 0.95 },
visible: { opacity: 1, scale: 1, transition: { type: "spring", stiffness: 300, damping: 30 } },
exit: { opacity: 0, scale: 0.95, transition: { duration: 0.15 } },
};
import { getDocumentTypeIcon, getDocumentTypeLabel } from "./DocumentTypeIcon";
export function DocumentsFilters({
typeCounts: typeCountsRecord,
@ -44,8 +42,6 @@ export function DocumentsFilters({
onBulkDelete,
onToggleType,
activeTypes,
columnVisibility,
onToggleColumn,
}: {
typeCounts: Partial<Record<DocumentTypeEnum, number>>;
selectedIds: Set<number>;
@ -54,17 +50,27 @@ export function DocumentsFilters({
onBulkDelete: () => Promise<void>;
onToggleType: (type: DocumentTypeEnum, checked: boolean) => void;
activeTypes: DocumentTypeEnum[];
columnVisibility: ColumnVisibility;
onToggleColumn: (id: keyof ColumnVisibility, checked: boolean) => void;
}) {
const t = useTranslations("documents");
const id = React.useId();
const inputRef = useRef<HTMLInputElement>(null);
// Dialog hooks for action buttons
const { openDialog: openUploadDialog } = useDocumentUploadDialog();
const setConnectorDialogOpen = useSetAtom(connectorDialogOpenAtom);
const [typeSearchQuery, setTypeSearchQuery] = useState("");
const uniqueTypes = useMemo(() => {
return Object.keys(typeCountsRecord).sort() as DocumentTypeEnum[];
}, [typeCountsRecord]);
const filteredTypes = useMemo(() => {
if (!typeSearchQuery.trim()) return uniqueTypes;
const query = typeSearchQuery.toLowerCase();
return uniqueTypes.filter((type) => getDocumentTypeLabel(type).toLowerCase().includes(query));
}, [uniqueTypes, typeSearchQuery]);
const typeCounts = useMemo(() => {
const map = new Map<string, number>();
for (const [type, count] of Object.entries(typeCountsRecord)) {
@ -75,202 +81,233 @@ export function DocumentsFilters({
return (
<motion.div
className="flex flex-wrap items-center justify-start gap-3 w-full"
className="flex flex-col gap-4"
initial={{ opacity: 0, y: 10 }}
animate={{ opacity: 1, y: 0 }}
transition={{ type: "spring", stiffness: 300, damping: 30, delay: 0.1 }}
>
<div className="flex items-center gap-3 flex-wrap w-full sm:w-auto">
{/* Main toolbar row */}
<div className="flex flex-wrap items-center gap-3">
{/* Action Buttons - Left Side */}
<div className="flex items-center gap-2">
<Button
onClick={openUploadDialog}
variant="outline"
size="sm"
className="h-9 gap-2 bg-white text-gray-700 border-white hover:bg-gray-50 dark:bg-white dark:text-gray-800 dark:hover:bg-gray-100"
>
<FilePlus2 size={16} />
<span>Upload documents</span>
</Button>
<Button
onClick={() => setConnectorDialogOpen(true)}
variant="outline"
size="sm"
className="h-9 gap-2 bg-white text-gray-700 border-white hover:bg-gray-50 dark:bg-white dark:text-gray-800 dark:hover:bg-gray-100"
>
<SlidersHorizontal size={16} />
<span>Manage connectors</span>
</Button>
</div>
{/* Spacer */}
<div className="flex-1" />
{/* Search Input */}
<motion.div
className="relative w-full sm:w-auto"
className="relative w-[180px]"
initial={{ opacity: 0, y: -10 }}
animate={{ opacity: 1, y: 0 }}
transition={{ type: "spring", stiffness: 300, damping: 30 }}
>
<div className="pointer-events-none absolute inset-y-0 left-0 flex items-center pl-3 text-muted-foreground">
<ListFilter size={14} aria-hidden="true" />
</div>
<Input
id={`${id}-input`}
ref={inputRef}
className="peer w-full sm:min-w-60 ps-9"
className="peer h-9 w-full pl-9 pr-9 text-sm bg-background border-border/60 focus-visible:ring-1 focus-visible:ring-ring/30"
value={searchValue}
onChange={(e) => onSearch(e.target.value)}
placeholder={t("filter_placeholder")}
placeholder="Filter by title"
type="text"
aria-label={t("filter_placeholder")}
/>
<motion.div
className="pointer-events-none absolute inset-y-0 start-0 flex items-center justify-center ps-3 text-muted-foreground/80 peer-disabled:opacity-50"
initial={{ scale: 0.8 }}
animate={{ scale: 1 }}
transition={{ delay: 0.1 }}
>
<ListFilter size={16} strokeWidth={2} aria-hidden="true" />
</motion.div>
{Boolean(searchValue) && (
<motion.button
className="absolute inset-y-0 end-0 flex h-full w-9 items-center justify-center rounded-e-lg text-muted-foreground/80 outline-offset-2 transition-colors hover:text-foreground focus:z-10 focus-visible:outline focus-visible:outline-ring/70"
className="absolute inset-y-0 right-0 flex h-full w-9 items-center justify-center rounded-r-md text-muted-foreground/60 hover:text-foreground transition-colors"
aria-label="Clear filter"
onClick={() => {
onSearch("");
inputRef.current?.focus();
}}
initial={{ opacity: 0, rotate: -90 }}
animate={{ opacity: 1, rotate: 0 }}
exit={{ opacity: 0, rotate: 90 }}
initial={{ opacity: 0, scale: 0.8 }}
animate={{ opacity: 1, scale: 1 }}
exit={{ opacity: 0, scale: 0.8 }}
whileHover={{ scale: 1.1 }}
whileTap={{ scale: 0.9 }}
>
<CircleX size={16} strokeWidth={2} aria-hidden="true" />
<CircleX size={14} strokeWidth={2} aria-hidden="true" />
</motion.button>
)}
</motion.div>
<Popover>
<PopoverTrigger asChild>
<motion.div
whileHover={{ scale: 1.05 }}
whileTap={{ scale: 0.95 }}
transition={{ type: "spring", stiffness: 400, damping: 17 }}
>
<Button variant="outline">
<Filter
className="-ms-1 me-2 opacity-60"
size={16}
strokeWidth={2}
aria-hidden="true"
/>
Type
{/* Filter Buttons Group */}
<div className="flex items-center gap-2 flex-wrap">
{/* Type Filter */}
<Popover>
<PopoverTrigger asChild>
<Button
variant="outline"
size="sm"
className="h-9 gap-2 border-dashed border-border/60 text-muted-foreground hover:text-foreground hover:border-border"
>
<FileType size={14} className="text-muted-foreground" />
<span className="hidden sm:inline">Type</span>
{activeTypes.length > 0 && (
<motion.span
initial={{ scale: 0.8 }}
animate={{ scale: 1 }}
className="-me-1 ms-3 inline-flex h-5 max-h-full items-center rounded border border-border bg-background px-1 text-[0.625rem] font-medium text-muted-foreground/70"
>
<span className="flex h-5 w-5 items-center justify-center rounded-full bg-primary text-[10px] font-medium text-primary-foreground">
{activeTypes.length}
</motion.span>
</span>
)}
</Button>
</motion.div>
</PopoverTrigger>
<PopoverContent className="min-w-36 p-3" align="start">
<motion.div initial="hidden" animate="visible" exit="exit" variants={fadeInScale}>
<div className="space-y-3">
<div className="text-xs font-medium text-muted-foreground">Filters</div>
<div className="space-y-3">
<AnimatePresence>
{uniqueTypes.map((value: DocumentTypeEnum, i) => (
<motion.div
</PopoverTrigger>
<PopoverContent className="w-64 !p-0 overflow-hidden" align="end">
<div>
{/* Search input */}
<div className="p-2 border-b border-border/50">
<div className="relative">
<Search className="absolute left-0.5 top-1/2 -translate-y-1/2 h-4 w-4 text-muted-foreground" />
<Input
placeholder="Search types..."
value={typeSearchQuery}
onChange={(e) => setTypeSearchQuery(e.target.value)}
className="h-6 pl-6 text-sm bg-transparent border-0 focus-visible:ring-0"
/>
</div>
</div>
<div className="max-h-[300px] overflow-y-auto overflow-x-hidden py-1.5 px-1.5">
{filteredTypes.length === 0 ? (
<div className="py-6 text-center text-sm text-muted-foreground">
No types found
</div>
) : (
filteredTypes.map((value: DocumentTypeEnum, i) => (
<div
key={value}
className="flex items-center gap-2"
initial={{ opacity: 0, y: -5 }}
animate={{ opacity: 1, y: 0 }}
exit={{ opacity: 0, y: 5 }}
transition={{ delay: i * 0.05 }}
role="button"
tabIndex={0}
className="flex w-full items-center gap-2.5 py-2 px-3 rounded-md hover:bg-muted/50 transition-colors cursor-pointer text-left"
onClick={() => onToggleType(value, !activeTypes.includes(value))}
onKeyDown={(e) => {
if (e.key === "Enter" || e.key === " ") {
e.preventDefault();
onToggleType(value, !activeTypes.includes(value));
}
}}
>
{/* Icon */}
<div className="flex h-7 w-7 shrink-0 items-center justify-center rounded-md bg-muted/50 text-foreground/80">
{getDocumentTypeIcon(value, "h-4 w-4")}
</div>
{/* Text content */}
<div className="flex flex-col min-w-0 flex-1 gap-0.5">
<span className="text-[13px] font-medium text-foreground truncate leading-tight">
{getDocumentTypeLabel(value)}
</span>
<span className="text-[11px] text-muted-foreground leading-tight">
{typeCounts.get(value)} document
{(typeCounts.get(value) ?? 0) !== 1 ? "s" : ""}
</span>
</div>
{/* Checkbox */}
<Checkbox
id={`${id}-${i}`}
checked={activeTypes.includes(value)}
onCheckedChange={(checked: boolean) => onToggleType(value, !!checked)}
className="h-4 w-4 shrink-0 rounded border-muted-foreground/30 data-[state=checked]:bg-primary data-[state=checked]:border-primary"
/>
<Label
htmlFor={`${id}-${i}`}
className="flex grow justify-between gap-2 font-normal"
>
{value}{" "}
<span className="ms-2 text-xs text-muted-foreground">
{typeCounts.get(value)}
</span>
</Label>
</motion.div>
))}
</AnimatePresence>
</div>
))
)}
</div>
{activeTypes.length > 0 && (
<div className="px-3 pt-1.5 pb-1.5 border-t border-border/50">
<Button
variant="ghost"
size="sm"
className="w-full h-7 text-[11px] text-muted-foreground hover:text-foreground"
onClick={() => {
activeTypes.forEach((t) => {
onToggleType(t, false);
});
}}
>
Clear filters
</Button>
</div>
)}
</div>
</motion.div>
</PopoverContent>
</Popover>
</PopoverContent>
</Popover>
<DropdownMenu>
<DropdownMenuTrigger asChild>
<motion.div
whileHover={{ scale: 1.05 }}
whileTap={{ scale: 0.95 }}
transition={{ type: "spring", stiffness: 400, damping: 17 }}
>
<Button variant="outline">
<Columns3
className="-ms-1 me-2 opacity-60"
size={16}
strokeWidth={2}
aria-hidden="true"
/>
View
</Button>
</motion.div>
</DropdownMenuTrigger>
<DropdownMenuContent align="end">
<DropdownMenuLabel>Toggle columns</DropdownMenuLabel>
{(
[
["title", "Title"],
["document_type", "Type"],
["content", "Content"],
["created_at", "Created At"],
] as Array<[keyof ColumnVisibility, string]>
).map(([key, label]) => (
<DropdownMenuCheckboxItem
key={key}
className="capitalize"
checked={columnVisibility[key]}
onCheckedChange={(v) => onToggleColumn(key, !!v)}
onSelect={(e) => e.preventDefault()}
>
{label}
</DropdownMenuCheckboxItem>
))}
</DropdownMenuContent>
</DropdownMenu>
</div>
<div className="flex items-center gap-3 w-full sm:w-auto sm:ml-auto">
{selectedIds.size > 0 && (
<AlertDialog>
<AlertDialogTrigger asChild>
<Button className="w-full sm:w-auto" variant="outline">
<Trash
className="-ms-1 me-2 opacity-60"
size={16}
strokeWidth={2}
aria-hidden="true"
/>
Delete
<span className="-me-1 ms-3 inline-flex h-5 max-h-full items-center rounded border border-border bg-background px-1 text-[0.625rem] font-medium text-muted-foreground/70">
{selectedIds.size}
</span>
</Button>
</AlertDialogTrigger>
<AlertDialogContent>
<div className="flex flex-col gap-2 max-sm:items-center sm:flex-row sm:gap-4">
<div
className="flex size-9 shrink-0 items-center justify-center rounded-full border border-border"
aria-hidden="true"
{/* Bulk Delete Button */}
{selectedIds.size > 0 && (
<AlertDialog>
<AlertDialogTrigger asChild>
<motion.div
initial={{ opacity: 0, scale: 0.9 }}
animate={{ opacity: 1, scale: 1 }}
exit={{ opacity: 0, scale: 0.9 }}
>
<CircleAlert className="opacity-80" size={16} strokeWidth={2} />
{/* Mobile: icon with count */}
<Button variant="destructive" size="sm" className="h-9 gap-1.5 px-2.5 md:hidden">
<Trash size={14} />
<span className="flex h-5 w-5 items-center justify-center rounded-full bg-destructive-foreground/20 text-[10px] font-medium">
{selectedIds.size}
</span>
</Button>
{/* Desktop: full button */}
<Button variant="destructive" size="sm" className="h-9 gap-2 hidden md:flex">
<Trash size={14} />
Delete
<span className="flex h-5 w-5 items-center justify-center rounded-full bg-destructive-foreground/20 text-[10px] font-medium">
{selectedIds.size}
</span>
</Button>
</motion.div>
</AlertDialogTrigger>
<AlertDialogContent className="max-w-md">
<div className="flex flex-col gap-2 sm:flex-row sm:gap-4">
<div
className="flex size-10 shrink-0 items-center justify-center rounded-full bg-destructive/10 text-destructive"
aria-hidden="true"
>
<CircleAlert size={18} strokeWidth={2} />
</div>
<AlertDialogHeader className="flex-1">
<AlertDialogTitle>
Delete {selectedIds.size} document{selectedIds.size !== 1 ? "s" : ""}?
</AlertDialogTitle>
<AlertDialogDescription>
This action cannot be undone. This will permanently delete the selected{" "}
{selectedIds.size === 1 ? "document" : "documents"} from your search space.
</AlertDialogDescription>
</AlertDialogHeader>
</div>
<AlertDialogHeader>
<AlertDialogTitle>Are you absolutely sure?</AlertDialogTitle>
<AlertDialogDescription>
This action cannot be undone. This will permanently delete {selectedIds.size}{" "}
selected {selectedIds.size === 1 ? "row" : "rows"}.
</AlertDialogDescription>
</AlertDialogHeader>
</div>
<AlertDialogFooter>
<AlertDialogCancel>Cancel</AlertDialogCancel>
<AlertDialogAction onClick={onBulkDelete}>Delete</AlertDialogAction>
</AlertDialogFooter>
</AlertDialogContent>
</AlertDialog>
)}
<AlertDialogFooter>
<AlertDialogCancel>Cancel</AlertDialogCancel>
<AlertDialogAction
onClick={onBulkDelete}
className="bg-destructive text-destructive-foreground hover:bg-destructive/90"
>
Delete
</AlertDialogAction>
</AlertDialogFooter>
</AlertDialogContent>
</AlertDialog>
)}
</div>
</div>
</motion.div>
);

View file

@ -1,14 +1,30 @@
"use client";
import { ChevronDown, ChevronUp, FileX, Plus } from "lucide-react";
import { formatDistanceToNow } from "date-fns";
import {
AlertCircle,
Calendar,
CheckCircle2,
ChevronDown,
ChevronUp,
Clock,
FileText,
FileX,
Loader2,
Network,
Plus,
User,
} from "lucide-react";
import { motion } from "motion/react";
import { useParams } from "next/navigation";
import { useTranslations } from "next-intl";
import React from "react";
import React, { useRef, useState, useEffect, useCallback } from "react";
import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup";
import { DocumentViewer } from "@/components/document-viewer";
import { JsonMetadataViewer } from "@/components/json-metadata-viewer";
import { MarkdownViewer } from "@/components/markdown-viewer";
import { Button } from "@/components/ui/button";
import { Checkbox } from "@/components/ui/checkbox";
import { Dialog, DialogContent, DialogHeader, DialogTitle } from "@/components/ui/dialog";
import { Skeleton } from "@/components/ui/skeleton";
import { Spinner } from "@/components/ui/spinner";
import {
Table,
@ -19,9 +35,64 @@ import {
TableRow,
} from "@/components/ui/table";
import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
import { DocumentTypeChip, getDocumentTypeIcon } from "./DocumentTypeIcon";
import { documentsApiService } from "@/lib/apis/documents-api.service";
import { DocumentTypeChip } from "./DocumentTypeIcon";
import { RowActions } from "./RowActions";
import type { ColumnVisibility, Document } from "./types";
import type { ColumnVisibility, Document, DocumentStatus } from "./types";
// Status indicator component for document processing status
/**
 * Shows a document's processing state as a centered icon with a tooltip.
 *
 * A missing status (or missing `state`) is treated as "ready". For the
 * "failed" state the tooltip shows `status.reason`, falling back to a
 * generic message when no reason is provided.
 */
function StatusIndicator({ status }: { status?: DocumentStatus }) {
	const state = status?.state ?? "ready";

	// All states share the same trigger shell; only the icon inside differs.
	const iconTrigger = (icon: React.ReactNode) => (
		<TooltipTrigger asChild>
			<div className="flex items-center justify-center">{icon}</div>
		</TooltipTrigger>
	);

	switch (state) {
		case "pending":
			return (
				<Tooltip>
					{iconTrigger(<Clock className="h-5 w-5 text-muted-foreground/60" />)}
					<TooltipContent side="top">Pending - waiting to be synced</TooltipContent>
				</Tooltip>
			);
		case "processing":
			return (
				<Tooltip>
					{iconTrigger(<Spinner size="sm" className="text-primary" />)}
					<TooltipContent side="top">Syncing</TooltipContent>
				</Tooltip>
			);
		case "failed":
			return (
				<Tooltip>
					{iconTrigger(<AlertCircle className="h-5 w-5 text-destructive" />)}
					{/* Failure reasons can be long, so cap the tooltip width. */}
					<TooltipContent side="top" className="max-w-xs">
						{status?.reason || "Processing failed"}
					</TooltipContent>
				</Tooltip>
			);
		case "ready":
			return (
				<Tooltip>
					{iconTrigger(<CheckCircle2 className="h-5 w-5 text-muted-foreground/60" />)}
					<TooltipContent side="top">Ready</TooltipContent>
				</Tooltip>
			);
	}
}
export type SortKey = keyof Pick<Document, "title" | "document_type" | "created_at">;
@ -36,57 +107,215 @@ function sortDocuments(docs: Document[], key: SortKey, desc: boolean): Document[
return desc ? sorted.reverse() : sorted;
}
function truncate(text: string, len = 150): string {
const plain = text
.replace(/[#*_`>\-[\]()]+/g, " ")
.replace(/\s+/g, " ")
.trim();
if (plain.length <= len) return plain;
return `${plain.slice(0, len)}...`;
/**
 * Formats a date string as a distance from now with a suffix
 * (delegates to date-fns `formatDistanceToNow` with `addSuffix: true`).
 */
function formatRelativeDate(dateStr: string): string {
	const parsed = new Date(dateStr);
	return formatDistanceToNow(parsed, { addSuffix: true });
}
/**
 * Formats a date string as a full en-US timestamp: long month name,
 * numeric day and year, and a two-digit 24-hour time.
 */
function formatAbsoluteDate(dateStr: string): string {
	const displayOptions: Intl.DateTimeFormatOptions = {
		year: "numeric",
		month: "long",
		day: "numeric",
		hour: "2-digit",
		minute: "2-digit",
		hour12: false,
	};
	return new Date(dateStr).toLocaleString("en-US", displayOptions);
}
/**
 * Renders `text` in a span and, while that span is visually clipped
 * (its scrollWidth exceeds its clientWidth), wraps it in a tooltip that
 * shows the full text.
 *
 * @param text - The string to display (and to show in the tooltip).
 * @param className - Optional classes applied to the span.
 */
function TruncatedText({ text, className }: { text: string; className?: string }) {
	const textRef = useRef<HTMLSpanElement>(null);
	const [isTruncated, setIsTruncated] = useState(false);

	// Re-measure whenever the rendered text or its styling changes, not only on
	// mount: a new `text` or `className` can change whether the span overflows,
	// and with an empty dependency list the tooltip state would go stale until
	// the next window resize.
	useEffect(() => {
		const checkTruncation = () => {
			if (textRef.current) {
				setIsTruncated(textRef.current.scrollWidth > textRef.current.clientWidth);
			}
		};
		checkTruncation();
		window.addEventListener("resize", checkTruncation);
		return () => window.removeEventListener("resize", checkTruncation);
	}, [text, className]);

	const label = (
		<span ref={textRef} className={className}>
			{text}
		</span>
	);

	// No tooltip when everything fits; the bare span is enough.
	if (!isTruncated) {
		return label;
	}

	return (
		<Tooltip>
			<TooltipTrigger asChild>{label}</TooltipTrigger>
			<TooltipContent side="top" className="max-w-xs">
				<p className="break-words">{text}</p>
			</TooltipContent>
		</Tooltip>
	);
}
/**
 * Column header button that requests sorting by `sortKey` when clicked.
 *
 * The chevron is fully visible only for the active sort column: it points
 * down when that column is sorted descending and up otherwise. Inactive
 * columns show a faint up-chevron on hover.
 */
function SortableHeader({
	children,
	sortKey,
	currentSortKey,
	sortDesc,
	onSort,
	icon,
}: {
	children: React.ReactNode;
	sortKey: SortKey;
	currentSortKey: SortKey;
	sortDesc: boolean;
	onSort: (key: SortKey) => void;
	icon?: React.ReactNode;
}) {
	const isActive = currentSortKey === sortKey;
	const Chevron = isActive && sortDesc ? ChevronDown : ChevronUp;
	const chevronVisibility = isActive ? "opacity-100" : "opacity-0 group-hover:opacity-50";

	const handleClick = () => onSort(sortKey);

	return (
		<button
			type="button"
			onClick={handleClick}
			className="flex items-center gap-1.5 text-left text-sm font-medium text-muted-foreground/70 hover:text-muted-foreground transition-colors group"
		>
			{icon && <span className="opacity-60">{icon}</span>}
			{children}
			<span className={`transition-opacity ${chevronVisibility}`}>
				<Chevron size={14} />
			</span>
		</button>
	);
}
export function DocumentsTableShell({
documents,
loading,
error,
onRefresh,
selectedIds,
setSelectedIds,
columnVisibility,
deleteDocument,
sortKey,
sortDesc,
onSortChange,
deleteDocument,
searchSpaceId,
}: {
documents: Document[];
loading: boolean;
error: boolean;
onRefresh: () => Promise<void>;
selectedIds: Set<number>;
setSelectedIds: (update: Set<number>) => void;
columnVisibility: ColumnVisibility;
deleteDocument: (id: number) => Promise<boolean>;
sortKey: SortKey;
sortDesc: boolean;
onSortChange: (key: SortKey) => void;
deleteDocument: (id: number) => Promise<boolean>;
searchSpaceId: string;
}) {
const t = useTranslations("documents");
const params = useParams();
const searchSpaceId = params.search_space_id;
const { openDialog } = useDocumentUploadDialog();
// State for metadata viewer (opened via Ctrl/Cmd+Click)
// Real-time documents don't sync metadata - we fetch on-demand when viewing
const [metadataDoc, setMetadataDoc] = useState<Document | null>(null);
const [metadataContent, setMetadataContent] = useState<any>(null);
const [metadataLoading, setMetadataLoading] = useState(false);
// State for lazy document content viewer
// Real-time documents don't sync content - we fetch on-demand when viewing
const [viewingDoc, setViewingDoc] = useState<Document | null>(null);
const [viewingContent, setViewingContent] = useState<string>("");
const [viewingLoading, setViewingLoading] = useState(false);
// Token identifying the most recent metadata request. A fetch that resolves
// after a newer request has started must not write into the viewer state,
// otherwise document A's metadata could appear under document B's title.
const metadataRequestRef = React.useRef(0);
/**
 * Open the metadata viewer for `doc`.
 *
 * Uses `doc.document_metadata` when it is already present; otherwise fetches
 * the full document through `documentsApiService.getDocument` and shows its
 * metadata. On fetch failure the error is logged and the content is set to
 * `null`. Only the latest request is allowed to update content/loading state.
 */
const handleViewMetadata = useCallback(async (doc: Document) => {
	const requestId = ++metadataRequestRef.current;
	setMetadataDoc(doc);
	// If metadata is already available (from API/search), use it directly
	if (doc.document_metadata) {
		setMetadataContent(doc.document_metadata);
		// A superseded in-flight request no longer resets the flag, so do it here.
		setMetadataLoading(false);
		return;
	}
	// Otherwise, fetch from API (lazy loading for real-time synced documents).
	// Clear content left over from a previously viewed document first.
	setMetadataContent(null);
	setMetadataLoading(true);
	try {
		const fullDoc = await documentsApiService.getDocument({ id: doc.id });
		if (requestId === metadataRequestRef.current) {
			setMetadataContent(fullDoc.document_metadata);
		}
	} catch (err) {
		console.error("[DocumentsTableShell] Failed to fetch document metadata:", err);
		if (requestId === metadataRequestRef.current) {
			setMetadataContent(null);
		}
	} finally {
		// Only the newest request owns the loading indicator.
		if (requestId === metadataRequestRef.current) {
			setMetadataLoading(false);
		}
	}
}, []);
/**
 * Dismiss the metadata viewer by resetting all of its state: the loading flag,
 * the displayed metadata, and the selected document (the viewer is open only
 * while a document is set).
 */
const handleCloseMetadata = useCallback(() => {
	setMetadataLoading(false);
	setMetadataContent(null);
	setMetadataDoc(null);
}, []);
// Token identifying the most recent content request. A fetch that resolves
// after a newer request has started must not write into the viewer state,
// otherwise document A's content could appear under document B's title.
const viewingRequestRef = React.useRef(0);
/**
 * Open the content viewer for `doc`.
 *
 * Uses `doc.content` when it is already present; otherwise fetches the full
 * document through `documentsApiService.getDocument` and shows its content.
 * On fetch failure the error is logged and a fallback message is displayed.
 * Only the latest request is allowed to update content/loading state.
 */
const handleViewDocument = useCallback(async (doc: Document) => {
	const requestId = ++viewingRequestRef.current;
	setViewingDoc(doc);
	// If content is already available (from API/search), use it directly
	if (doc.content) {
		setViewingContent(doc.content);
		// A superseded in-flight request no longer resets the flag, so do it here.
		setViewingLoading(false);
		return;
	}
	// Otherwise, fetch from API (lazy loading for real-time synced documents).
	// Clear content left over from a previously viewed document first.
	setViewingContent("");
	setViewingLoading(true);
	try {
		const fullDoc = await documentsApiService.getDocument({ id: doc.id });
		if (requestId === viewingRequestRef.current) {
			setViewingContent(fullDoc.content);
		}
	} catch (err) {
		console.error("[DocumentsTableShell] Failed to fetch document content:", err);
		if (requestId === viewingRequestRef.current) {
			setViewingContent("Failed to load document content.");
		}
	} finally {
		// Only the newest request owns the loading indicator.
		if (requestId === viewingRequestRef.current) {
			setViewingLoading(false);
		}
	}
}, []);
/**
 * Dismiss the document content viewer by resetting all of its state: the
 * loading flag, the displayed content, and the selected document (the dialog
 * is open only while a document is set).
 */
const handleCloseViewer = useCallback(() => {
	setViewingLoading(false);
	setViewingContent("");
	setViewingDoc(null);
}, []);
const sorted = React.useMemo(
() => sortDocuments(documents, sortKey, sortDesc),
[documents, sortKey, sortDesc]
);
const allSelectedOnPage = sorted.length > 0 && sorted.every((d) => selectedIds.has(d.id));
const someSelectedOnPage = sorted.some((d) => selectedIds.has(d.id)) && !allSelectedOnPage;
// A document can be selected unless its status state is "pending" or
// "processing"; a missing status counts as selectable.
const isSelectable = (doc: Document) => {
	switch (doc.status?.state) {
		case "pending":
		case "processing":
			return false;
		default:
			return true;
	}
};
// Only consider selectable documents for "select all" logic
const selectableDocs = sorted.filter(isSelectable);
const allSelectedOnPage =
selectableDocs.length > 0 && selectableDocs.every((d) => selectedIds.has(d.id));
const someSelectedOnPage =
selectableDocs.some((d) => selectedIds.has(d.id)) && !allSelectedOnPage;
const toggleAll = (checked: boolean) => {
const next = new Set(selectedIds);
if (checked)
sorted.forEach((d) => {
// Only select documents that are not processing/pending
selectableDocs.forEach((d) => {
next.add(d.id);
});
else
@ -107,39 +336,139 @@ export function DocumentsTableShell({
return (
<motion.div
className="rounded-md border mt-6 overflow-hidden"
className="rounded-lg border border-border/40 bg-background overflow-hidden"
initial={{ opacity: 0, y: 20 }}
animate={{ opacity: 1, y: 0 }}
transition={{ type: "spring", stiffness: 300, damping: 30, delay: 0.2 }}
>
{loading ? (
<div className="flex h-[400px] w-full items-center justify-center">
<div className="flex flex-col items-center gap-2">
<Spinner size="lg" className="text-primary" />
<p className="text-sm text-muted-foreground">{t("loading")}</p>
<>
{/* Desktop Skeleton View */}
<div className="hidden md:flex md:flex-col">
<Table className="table-fixed w-full">
<TableHeader>
<TableRow className="hover:bg-transparent border-b border-border/40">
<TableHead className="w-8 px-0 text-center">
<div className="flex items-center justify-center h-full">
<Skeleton className="h-4 w-4 rounded" />
</div>
</TableHead>
<TableHead className="w-[35%] max-w-0 border-r border-border/40">
<Skeleton className="h-3 w-20" />
</TableHead>
{columnVisibility.document_type && (
<TableHead className="w-[20%] min-w-[120px] max-w-[200px] border-r border-border/40">
<Skeleton className="h-3 w-14" />
</TableHead>
)}
{columnVisibility.created_by && (
<TableHead className="w-36 border-r border-border/40">
<Skeleton className="h-3 w-10" />
</TableHead>
)}
{columnVisibility.created_at && (
<TableHead className="w-32 border-r border-border/40">
<Skeleton className="h-3 w-16" />
</TableHead>
)}
{columnVisibility.status && (
<TableHead className="w-20 text-center">
<Skeleton className="h-3 w-12 mx-auto" />
</TableHead>
)}
<TableHead className="w-10">
<span className="sr-only">Actions</span>
</TableHead>
</TableRow>
</TableHeader>
</Table>
<div className="h-[50vh] overflow-auto">
<Table className="table-fixed w-full">
<TableBody>
{[65, 80, 45, 72, 55, 88, 40, 60, 50, 75].map((widthPercent, index) => (
<TableRow
key={`skeleton-${index}`}
className="border-b border-border/40 hover:bg-transparent"
>
<TableCell className="w-8 px-0 py-2.5 text-center">
<div className="flex items-center justify-center h-full">
<Skeleton className="h-4 w-4 rounded" />
</div>
</TableCell>
<TableCell className="w-[35%] py-2.5 max-w-0 border-r border-border/40">
<Skeleton className="h-4" style={{ width: `${widthPercent}%` }} />
</TableCell>
{columnVisibility.document_type && (
<TableCell className="w-[20%] min-w-[120px] max-w-[200px] py-2.5 border-r border-border/40 overflow-hidden">
<Skeleton className="h-5 w-24 rounded" />
</TableCell>
)}
{columnVisibility.created_by && (
<TableCell className="w-36 py-2.5 truncate border-r border-border/40">
<Skeleton className="h-4 w-20" />
</TableCell>
)}
{columnVisibility.created_at && (
<TableCell className="w-32 py-2.5 border-r border-border/40">
<Skeleton className="h-4 w-20" />
</TableCell>
)}
{columnVisibility.status && (
<TableCell className="w-20 py-2.5 text-center">
<Skeleton className="h-5 w-5 mx-auto rounded-full" />
</TableCell>
)}
<TableCell className="w-10 py-2.5 text-center">
<Skeleton className="h-6 w-6 mx-auto rounded" />
</TableCell>
</TableRow>
))}
</TableBody>
</Table>
</div>
</div>
</div>
{/* Mobile Skeleton View */}
<div className="md:hidden divide-y divide-border/30 h-[50vh] overflow-auto">
{[70, 85, 55, 78, 62, 90].map((widthPercent, index) => (
<div key={`skeleton-mobile-${index}`} className="px-4 py-3">
<div className="flex items-start gap-3">
<Skeleton className="h-4 w-4 mt-0.5 rounded" />
<div className="flex-1 min-w-0 space-y-2">
<Skeleton className="h-4" style={{ width: `${widthPercent}%` }} />
<div className="flex flex-wrap items-center gap-2">
<Skeleton className="h-5 w-20 rounded" />
{columnVisibility.created_by && <Skeleton className="h-3 w-14" />}
{columnVisibility.created_at && <Skeleton className="h-3 w-20" />}
</div>
</div>
<div className="flex items-center gap-2">
{columnVisibility.status && <Skeleton className="h-5 w-5 rounded-full" />}
<Skeleton className="h-7 w-7 rounded" />
</div>
</div>
</div>
))}
</div>
</>
) : error ? (
<div className="flex h-[400px] w-full items-center justify-center">
<div className="flex flex-col items-center gap-2">
<div className="flex h-[50vh] w-full items-center justify-center">
<div className="flex flex-col items-center gap-3">
<AlertCircle className="h-8 w-8 text-destructive/60" />
<p className="text-sm text-destructive">{t("error_loading")}</p>
<Button variant="outline" size="sm" onClick={() => onRefresh()} className="mt-2">
{t("retry")}
</Button>
</div>
</div>
) : sorted.length === 0 ? (
<div className="flex h-[400px] w-full items-center justify-center">
<div className="flex h-[50vh] w-full items-center justify-center">
<motion.div
initial={{ opacity: 0, y: 20 }}
animate={{ opacity: 1, y: 0 }}
transition={{ duration: 0.4 }}
className="flex flex-col items-center gap-4 max-w-md px-4 text-center"
>
<div className="rounded-full bg-muted p-4">
<FileX className="h-8 w-8 text-muted-foreground" />
<div className="rounded-full bg-muted/50 p-4">
<FileX className="h-8 w-8 text-muted-foreground/60" />
</div>
<div className="space-y-2">
<div className="space-y-1.5">
<h3 className="text-lg font-semibold">{t("no_documents")}</h3>
<p className="text-sm text-muted-foreground">
Get started by uploading your first document.
@ -153,234 +482,301 @@ export function DocumentsTableShell({
</div>
) : (
<>
<div className="hidden md:block max-h-[60vh] overflow-auto">
{/* Desktop Table View - Notion Style */}
<div className="hidden md:flex md:flex-col">
{/* Fixed Header */}
<Table className="table-fixed w-full">
<TableHeader className="sticky top-0 bg-background">
<TableRow className="hover:bg-transparent">
<TableHead style={{ width: 28 }}>
<Checkbox
checked={allSelectedOnPage || (someSelectedOnPage && "indeterminate")}
onCheckedChange={(v) => toggleAll(!!v)}
aria-label="Select all"
/>
<TableHeader>
<TableRow className="hover:bg-transparent border-b border-border/40">
<TableHead className="w-8 px-0 text-center">
<div className="flex items-center justify-center h-full">
<Checkbox
checked={allSelectedOnPage || (someSelectedOnPage && "indeterminate")}
onCheckedChange={(v) => toggleAll(!!v)}
aria-label="Select all"
className="border-foreground data-[state=checked]:bg-primary data-[state=checked]:border-primary"
/>
</div>
</TableHead>
<TableHead className="w-[35%] border-r border-border/40">
<SortableHeader
sortKey="title"
currentSortKey={sortKey}
sortDesc={sortDesc}
onSort={onSortHeader}
icon={<FileText size={14} className="text-muted-foreground" />}
>
Document
</SortableHeader>
</TableHead>
{columnVisibility.title && (
<TableHead style={{ width: 250 }}>
<Button
variant="ghost"
className="flex h-full w-full cursor-pointer select-none items-center justify-between gap-2"
onClick={() => onSortHeader("title")}
>
{t("title")}
{sortKey === "title" ? (
sortDesc ? (
<ChevronDown className="shrink-0 opacity-60" size={16} />
) : (
<ChevronUp className="shrink-0 opacity-60" size={16} />
)
) : null}
</Button>
</TableHead>
)}
{columnVisibility.document_type && (
<TableHead style={{ width: 180 }}>
<Button
variant="ghost"
className="flex h-full w-full cursor-pointer select-none items-center justify-between gap-2"
onClick={() => onSortHeader("document_type")}
<TableHead className="w-[20%] min-w-[120px] max-w-[200px] border-r border-border/40">
<SortableHeader
sortKey="document_type"
currentSortKey={sortKey}
sortDesc={sortDesc}
onSort={onSortHeader}
icon={<Network size={14} className="text-muted-foreground" />}
>
{t("type")}
{sortKey === "document_type" ? (
sortDesc ? (
<ChevronDown className="shrink-0 opacity-60" size={16} />
) : (
<ChevronUp className="shrink-0 opacity-60" size={16} />
)
) : null}
</Button>
Source
</SortableHeader>
</TableHead>
)}
{columnVisibility.content && (
<TableHead style={{ width: 300 }}>{t("content_summary")}</TableHead>
{columnVisibility.created_by && (
<TableHead className="w-36 border-r border-border/40">
<span className="flex items-center gap-1.5 text-sm font-medium text-muted-foreground/70">
<User size={14} className="opacity-60 text-muted-foreground" />
User
</span>
</TableHead>
)}
{columnVisibility.created_at && (
<TableHead style={{ width: 120 }}>
<Button
variant="ghost"
className="flex h-full w-full cursor-pointer select-none items-center justify-between gap-2"
onClick={() => onSortHeader("created_at")}
<TableHead className="w-32 border-r border-border/40">
<SortableHeader
sortKey="created_at"
currentSortKey={sortKey}
sortDesc={sortDesc}
onSort={onSortHeader}
icon={<Calendar size={14} className="text-muted-foreground" />}
>
Created At
{sortKey === "created_at" ? (
sortDesc ? (
<ChevronDown className="shrink-0 opacity-60" size={16} />
) : (
<ChevronUp className="shrink-0 opacity-60" size={16} />
)
) : null}
</Button>
Created
</SortableHeader>
</TableHead>
)}
<TableHead style={{ width: 60 }}>
{columnVisibility.status && (
<TableHead className="w-20 text-center">
<span className="text-sm font-medium text-muted-foreground/70">Status</span>
</TableHead>
)}
<TableHead className="w-10">
<span className="sr-only">Actions</span>
</TableHead>
</TableRow>
</TableHeader>
<TableBody>
{sorted.map((doc, index) => {
const icon = getDocumentTypeIcon(doc.document_type);
const title = doc.title;
const truncatedTitle = title.length > 30 ? `${title.slice(0, 30)}...` : title;
return (
<motion.tr
key={doc.id}
initial={{ opacity: 0, y: 10 }}
animate={{
opacity: 1,
y: 0,
transition: {
type: "spring",
stiffness: 300,
damping: 30,
delay: index * 0.03,
},
}}
exit={{ opacity: 0, y: -10 }}
className="border-b transition-colors hover:bg-muted/50"
>
<TableCell className="px-4 py-3">
<Checkbox
checked={selectedIds.has(doc.id)}
onCheckedChange={(v) => toggleOne(doc.id, !!v)}
aria-label="Select row"
/>
</TableCell>
{columnVisibility.title && (
<TableCell className="px-4 py-3">
<motion.div
className="flex items-center gap-2 font-medium"
whileHover={{ scale: 1.02 }}
transition={{ type: "spring", stiffness: 300 }}
style={{ display: "flex" }}
</Table>
{/* Scrollable Body */}
<div className="h-[50vh] overflow-auto">
<Table className="table-fixed w-full">
<TableBody>
{sorted.map((doc, index) => {
const title = doc.title;
const isSelected = selectedIds.has(doc.id);
const canSelect = isSelectable(doc);
return (
<motion.tr
key={doc.id}
initial={{ opacity: 0 }}
animate={{
opacity: 1,
transition: {
duration: 0.2,
delay: index * 0.02,
},
}}
className={`border-b border-border/40 transition-colors ${
isSelected ? "bg-primary/5 hover:bg-primary/8" : "hover:bg-muted/30"
}`}
>
<TableCell className="w-8 px-0 py-2.5 text-center">
<div className="flex items-center justify-center h-full">
<Checkbox
checked={isSelected}
onCheckedChange={(v) => canSelect && toggleOne(doc.id, !!v)}
disabled={!canSelect}
aria-label={
canSelect ? "Select row" : "Cannot select while processing"
}
className={`border-foreground data-[state=checked]:bg-primary data-[state=checked]:border-primary ${!canSelect ? "opacity-40 cursor-not-allowed" : ""}`}
/>
</div>
</TableCell>
<TableCell className="w-[35%] py-2.5 max-w-0 border-r border-border/40">
<button
type="button"
className="block w-full text-left text-sm text-foreground hover:text-foreground transition-colors cursor-pointer bg-transparent border-0 p-0 truncate"
onClick={(e) => {
// Ctrl (Win/Linux) or Cmd (Mac) + Click opens metadata
if (e.ctrlKey || e.metaKey) {
e.preventDefault();
e.stopPropagation();
handleViewMetadata(doc);
} else {
// Normal click opens document viewer (lazy loads content)
handleViewDocument(doc);
}
}}
onKeyDown={(e) => {
// Ctrl/Cmd + Enter opens metadata
if ((e.ctrlKey || e.metaKey) && e.key === "Enter") {
e.preventDefault();
handleViewMetadata(doc);
} else if (e.key === "Enter") {
// Enter opens document viewer
handleViewDocument(doc);
}
}}
>
<TruncatedText text={title} className="truncate block" />
</button>
</TableCell>
{columnVisibility.document_type && (
<TableCell className="w-[20%] min-w-[120px] max-w-[200px] py-2.5 border-r border-border/40 overflow-hidden">
<DocumentTypeChip type={doc.document_type} />
</TableCell>
)}
{columnVisibility.created_by && (
<TableCell className="w-36 py-2.5 text-sm text-foreground truncate border-r border-border/40">
{doc.created_by_name || "—"}
</TableCell>
)}
{columnVisibility.created_at && (
<TableCell className="w-32 py-2.5 text-sm text-foreground border-r border-border/40">
<Tooltip>
<TooltipTrigger asChild>
<span className="flex items-center gap-2">
<span className="text-muted-foreground shrink-0">{icon}</span>
<span>{truncatedTitle}</span>
<span className="cursor-default">
{formatRelativeDate(doc.created_at)}
</span>
</TooltipTrigger>
<TooltipContent>
<p>{title}</p>
<TooltipContent side="top">
{formatAbsoluteDate(doc.created_at)}
</TooltipContent>
</Tooltip>
</motion.div>
</TableCell>
)}
{columnVisibility.status && (
<TableCell className="w-20 py-2.5 text-center">
<StatusIndicator status={doc.status} />
</TableCell>
)}
<TableCell className="w-10 py-2.5 text-center">
<RowActions
document={doc}
deleteDocument={deleteDocument}
searchSpaceId={searchSpaceId}
/>
</TableCell>
)}
{columnVisibility.document_type && (
<TableCell className="px-4 py-3">
<div className="flex items-center gap-2">
<DocumentTypeChip type={doc.document_type} />
</div>
</TableCell>
)}
{columnVisibility.content && (
<TableCell className="px-4 py-3">
<div className="flex flex-col gap-2">
<div className="max-w-[300px] max-h-[60px] overflow-hidden text-sm text-muted-foreground">
{truncate(doc.content)}
</div>
<DocumentViewer
title={doc.title}
content={doc.content}
trigger={
<Button variant="ghost" size="sm" className="w-fit text-xs">
{t("view_full")}
</Button>
}
/>
</div>
</TableCell>
)}
{columnVisibility.created_at && (
<TableCell className="px-4 py-3">
{new Date(doc.created_at).toLocaleDateString()}
</TableCell>
)}
<TableCell className="px-4 py-3">
<RowActions
document={doc}
deleteDocument={deleteDocument}
refreshDocuments={async () => {
await onRefresh();
}}
searchSpaceId={searchSpaceId as string}
/>
</TableCell>
</motion.tr>
);
})}
</TableBody>
</Table>
</motion.tr>
);
})}
</TableBody>
</Table>
</div>
</div>
<div className="md:hidden divide-y">
{sorted.map((doc) => {
const icon = getDocumentTypeIcon(doc.document_type);
{/* Mobile Card View - Notion Style */}
<div className="md:hidden divide-y divide-border/40 h-[50vh] overflow-auto">
{sorted.map((doc, index) => {
const isSelected = selectedIds.has(doc.id);
const canSelect = isSelectable(doc);
return (
<div key={doc.id} className="p-3">
<motion.div
key={doc.id}
initial={{ opacity: 0 }}
animate={{ opacity: 1, transition: { delay: index * 0.03 } }}
className={`px-4 py-3 transition-colors ${
isSelected ? "bg-primary/5" : "hover:bg-muted/20"
}`}
>
<div className="flex items-center gap-3">
<Checkbox
checked={selectedIds.has(doc.id)}
onCheckedChange={(v) => toggleOne(doc.id, !!v)}
aria-label="Select row"
checked={isSelected}
onCheckedChange={(v) => canSelect && toggleOne(doc.id, !!v)}
disabled={!canSelect}
aria-label={canSelect ? "Select row" : "Cannot select while processing"}
className={`border-foreground data-[state=checked]:bg-primary data-[state=checked]:border-primary ${!canSelect ? "opacity-40 cursor-not-allowed" : ""}`}
/>
<div className="flex-1 min-w-0">
<div className="flex items-center gap-2 min-w-0">
<span className="text-muted-foreground shrink-0">{icon}</span>
<div className="font-medium truncate">{doc.title}</div>
</div>
<div className="mt-1 flex flex-wrap items-center gap-2">
<div className="flex-1 min-w-0 space-y-1.5">
<button
type="button"
className="text-left text-sm text-foreground hover:text-foreground transition-colors cursor-pointer truncate block w-full bg-transparent border-0 p-0"
onClick={(e) => {
// Ctrl (Win/Linux) or Cmd (Mac) + Click opens metadata
if (e.ctrlKey || e.metaKey) {
e.preventDefault();
e.stopPropagation();
handleViewMetadata(doc);
} else {
// Normal click opens document viewer (lazy loads content)
handleViewDocument(doc);
}
}}
onKeyDown={(e) => {
// Ctrl/Cmd + Enter opens metadata
if ((e.ctrlKey || e.metaKey) && e.key === "Enter") {
e.preventDefault();
handleViewMetadata(doc);
} else if (e.key === "Enter") {
// Enter opens document viewer
handleViewDocument(doc);
}
}}
>
{doc.title}
</button>
<div className="flex flex-wrap items-center gap-2">
<DocumentTypeChip type={doc.document_type} />
<span className="text-xs text-muted-foreground">
{new Date(doc.created_at).toLocaleDateString()}
</span>
{columnVisibility.created_by && doc.created_by_name && (
<span className="text-xs text-foreground">{doc.created_by_name}</span>
)}
{columnVisibility.created_at && (
<Tooltip>
<TooltipTrigger asChild>
<span className="text-xs text-foreground cursor-default">
{formatRelativeDate(doc.created_at)}
</span>
</TooltipTrigger>
<TooltipContent side="top">
{formatAbsoluteDate(doc.created_at)}
</TooltipContent>
</Tooltip>
)}
</div>
{columnVisibility.content && (
<div className="mt-2 text-sm text-muted-foreground">
{truncate(doc.content)}
<div className="mt-1">
<DocumentViewer
title={doc.title}
content={doc.content}
trigger={
<Button
variant="ghost"
size="sm"
className="w-fit text-xs p-0 h-auto"
>
{t("view_full")}
</Button>
}
/>
</div>
</div>
)}
</div>
<RowActions
document={doc}
deleteDocument={deleteDocument}
refreshDocuments={async () => {
await onRefresh();
}}
searchSpaceId={searchSpaceId as string}
/>
<div className="flex items-center gap-2">
{columnVisibility.status && <StatusIndicator status={doc.status} />}
<RowActions
document={doc}
deleteDocument={deleteDocument}
searchSpaceId={searchSpaceId}
/>
</div>
</div>
</div>
</motion.div>
);
})}
</div>
</>
)}
{/* Metadata Viewer - opened via Ctrl/Cmd+Click on document title */}
{/* Lazy loads metadata from API for real-time synced documents */}
<JsonMetadataViewer
title={metadataDoc?.title ?? ""}
metadata={metadataContent}
loading={metadataLoading}
open={!!metadataDoc}
onOpenChange={(open) => {
if (!open) handleCloseMetadata();
}}
/>
{/* Document Content Viewer - lazy loads content on-demand */}
<Dialog open={!!viewingDoc} onOpenChange={(open) => !open && handleCloseViewer()}>
<DialogContent className="max-w-4xl max-h-[80vh] overflow-y-auto">
<DialogHeader>
<DialogTitle>{viewingDoc?.title}</DialogTitle>
</DialogHeader>
<div className="mt-4">
{viewingLoading ? (
<div className="flex items-center justify-center py-12">
<Loader2 className="h-8 w-8 animate-spin text-muted-foreground" />
</div>
) : (
<MarkdownViewer content={viewingContent} />
)}
</div>
</DialogContent>
</Dialog>
</motion.div>
);
}

View file

@ -2,164 +2,89 @@
import { ChevronFirst, ChevronLast, ChevronLeft, ChevronRight } from "lucide-react";
import { motion } from "motion/react";
import { useTranslations } from "next-intl";
import { Button } from "@/components/ui/button";
import { Label } from "@/components/ui/label";
import { Pagination, PaginationContent, PaginationItem } from "@/components/ui/pagination";
import {
Select,
SelectContent,
SelectItem,
SelectTrigger,
SelectValue,
} from "@/components/ui/select";
const PAGE_SIZE = 50;
/**
 * Footer for the documents table: a "start-end of total" range indicator
 * followed by first / previous / next / last page buttons.
 *
 * The page size is fixed at the module-level `PAGE_SIZE`.
 *
 * @param pageIndex Zero-based index of the page currently shown.
 * @param total     Total number of rows across all pages.
 * @param onFirst   Invoked when the "first page" button is pressed.
 * @param onPrev    Invoked when the "previous page" button is pressed.
 * @param onNext    Invoked when the "next page" button is pressed.
 * @param onLast    Invoked when the "last page" button is pressed.
 * @param canPrev   When false, the first/previous buttons are disabled.
 * @param canNext   When false, the next/last buttons are disabled.
 */
export function PaginationControls({
	pageIndex,
	total,
	onFirst,
	onPrev,
	onNext,
	onLast,
	canPrev,
	canNext,
}: {
	pageIndex: number;
	total: number;
	onFirst: () => void;
	onPrev: () => void;
	onNext: () => void;
	onLast: () => void;
	canPrev: boolean;
	canNext: boolean;
}) {
	// With no rows, show "0-0 of 0" instead of the misleading "1-0 of 0".
	const start = total === 0 ? 0 : pageIndex * PAGE_SIZE + 1;
	// Clamp so a partially filled last page does not overshoot the total.
	const end = Math.min((pageIndex + 1) * PAGE_SIZE, total);

	return (
		<motion.div
			className="flex items-center justify-end gap-3 py-3 px-2"
			initial={{ opacity: 0, y: 10 }}
			animate={{ opacity: 1, y: 0 }}
			transition={{ type: "spring", stiffness: 300, damping: 30, delay: 0.3 }}
		>
			{/* Range indicator */}
			<span className="text-sm text-muted-foreground tabular-nums">
				{start}-{end} of {total}
			</span>

			{/* Navigation buttons */}
			<div className="flex items-center gap-1">
				<Button
					variant="ghost"
					size="icon"
					className="h-8 w-8 disabled:opacity-40"
					onClick={onFirst}
					disabled={!canPrev}
					aria-label="Go to first page"
				>
					<ChevronFirst size={18} strokeWidth={2} />
				</Button>
				<Button
					variant="ghost"
					size="icon"
					className="h-8 w-8 disabled:opacity-40"
					onClick={onPrev}
					disabled={!canPrev}
					aria-label="Go to previous page"
				>
					<ChevronLeft size={18} strokeWidth={2} />
				</Button>
				<Button
					variant="ghost"
					size="icon"
					className="h-8 w-8 disabled:opacity-40"
					onClick={onNext}
					disabled={!canNext}
					aria-label="Go to next page"
				>
					<ChevronRight size={18} strokeWidth={2} />
				</Button>
				<Button
					variant="ghost"
					size="icon"
					className="h-8 w-8 disabled:opacity-40"
					onClick={onLast}
					disabled={!canNext}
					aria-label="Go to last page"
				>
					<ChevronLast size={18} strokeWidth={2} />
				</Button>
			</div>
		</motion.div>
	);
}
export { PAGE_SIZE };

View file

@ -1,11 +1,9 @@
"use client";
import { FileText, MoreHorizontal, Pencil, Trash2 } from "lucide-react";
import { motion } from "motion/react";
import { MoreHorizontal, Pencil, Trash2 } from "lucide-react";
import { useRouter } from "next/navigation";
import { useState } from "react";
import { toast } from "sonner";
import { JsonMetadataViewer } from "@/components/json-metadata-viewer";
import {
AlertDialog,
AlertDialogAction,
@ -22,7 +20,6 @@ import {
DropdownMenuItem,
DropdownMenuTrigger,
} from "@/components/ui/dropdown-menu";
import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
import type { Document } from "./types";
// Only FILE and NOTE document types can be edited
@ -34,16 +31,13 @@ const NON_DELETABLE_DOCUMENT_TYPES = ["SURFSENSE_DOCS"] as const;
export function RowActions({
document,
deleteDocument,
refreshDocuments,
searchSpaceId,
}: {
document: Document;
deleteDocument: (id: number) => Promise<boolean>;
refreshDocuments: () => Promise<void>;
searchSpaceId: string;
}) {
const [isDeleteOpen, setIsDeleteOpen] = useState(false);
const [isMetadataOpen, setIsMetadataOpen] = useState(false);
const [isDeleting, setIsDeleting] = useState(false);
const router = useRouter();
@ -51,20 +45,37 @@ export function RowActions({
document.document_type as (typeof EDITABLE_DOCUMENT_TYPES)[number]
);
const isDeletable = !NON_DELETABLE_DOCUMENT_TYPES.includes(
// Documents in "pending" or "processing" state should show disabled delete
const isBeingProcessed =
document.status?.state === "pending" || document.status?.state === "processing";
// SURFSENSE_DOCS are system-managed and should not show delete at all
const shouldShowDelete = !NON_DELETABLE_DOCUMENT_TYPES.includes(
document.document_type as (typeof NON_DELETABLE_DOCUMENT_TYPES)[number]
);
// Edit and Delete are disabled while processing
const isEditDisabled = isBeingProcessed;
const isDeleteDisabled = isBeingProcessed;
const handleDelete = async () => {
setIsDeleting(true);
try {
const ok = await deleteDocument(document.id);
if (ok) toast.success("Document deleted successfully");
else toast.error("Failed to delete document");
await refreshDocuments();
} catch (error) {
if (!ok) toast.error("Failed to delete document");
// Note: Success toast is handled by the mutation atom's onSuccess callback
// Cache is updated optimistically by the mutation, no need to refresh
} catch (error: unknown) {
console.error("Error deleting document:", error);
toast.error("Failed to delete document");
// Check for 409 Conflict (document started processing after UI loaded)
const status =
(error as { response?: { status?: number } })?.response?.status ??
(error as { status?: number })?.status;
if (status === 409) {
toast.error("Document is now being processed. Please try again later.");
} else {
toast.error("Failed to delete document");
}
} finally {
setIsDeleting(false);
setIsDeleteOpen(false);
@ -76,124 +87,121 @@ export function RowActions({
};
return (
<div className="flex items-center justify-end gap-1">
<>
{/* Desktop Actions */}
<div className="hidden md:flex items-center gap-1">
{isEditable && (
<Tooltip>
<TooltipTrigger asChild>
<motion.div
whileHover={{ scale: 1.1 }}
whileTap={{ scale: 0.95 }}
transition={{ type: "spring", stiffness: 400, damping: 17 }}
>
<Button
variant="ghost"
size="icon"
className="h-8 w-8 text-muted-foreground hover:text-foreground hover:bg-muted/80"
onClick={handleEdit}
>
<Pencil className="h-4 w-4" />
<span className="sr-only">Edit Document</span>
</Button>
</motion.div>
</TooltipTrigger>
<TooltipContent side="top">
<p>Edit Document</p>
</TooltipContent>
</Tooltip>
)}
<Tooltip>
<TooltipTrigger asChild>
<motion.div
whileHover={{ scale: 1.1 }}
whileTap={{ scale: 0.95 }}
transition={{ type: "spring", stiffness: 400, damping: 17 }}
>
<div className="hidden md:inline-flex items-center justify-center">
{isEditable ? (
// Editable documents: show 3-dot dropdown with edit + delete
<DropdownMenu>
<DropdownMenuTrigger asChild>
<Button
variant="ghost"
size="icon"
className="h-8 w-8 text-muted-foreground hover:text-foreground hover:bg-muted/80"
onClick={() => setIsMetadataOpen(true)}
>
<FileText className="h-4 w-4" />
<span className="sr-only">View Metadata</span>
<MoreHorizontal className="h-4 w-4" />
<span className="sr-only">Open menu</span>
</Button>
</motion.div>
</TooltipTrigger>
<TooltipContent side="top">
<p>View Metadata</p>
</TooltipContent>
</Tooltip>
{isDeletable && (
<Tooltip>
<TooltipTrigger asChild>
<motion.div
whileHover={{ scale: 1.1 }}
whileTap={{ scale: 0.95 }}
transition={{ type: "spring", stiffness: 400, damping: 17 }}
</DropdownMenuTrigger>
<DropdownMenuContent align="end" className="w-40">
<DropdownMenuItem
onClick={() => !isEditDisabled && handleEdit()}
disabled={isEditDisabled}
className={
isEditDisabled ? "text-muted-foreground cursor-not-allowed opacity-50" : ""
}
>
<Button
variant="ghost"
size="icon"
className="h-8 w-8 text-muted-foreground hover:text-destructive hover:bg-destructive/10"
onClick={() => setIsDeleteOpen(true)}
disabled={isDeleting}
<Pencil className="mr-2 h-4 w-4" />
<span>Edit</span>
</DropdownMenuItem>
{shouldShowDelete && (
<DropdownMenuItem
onClick={() => !isDeleteDisabled && setIsDeleteOpen(true)}
disabled={isDeleteDisabled}
className={
isDeleteDisabled
? "text-muted-foreground cursor-not-allowed opacity-50"
: "text-destructive focus:text-destructive"
}
>
<Trash2 className="h-4 w-4" />
<span className="sr-only">Delete</span>
</Button>
</motion.div>
</TooltipTrigger>
<TooltipContent side="top">
<p>Delete</p>
</TooltipContent>
</Tooltip>
<Trash2 className="mr-2 h-4 w-4" />
<span>Delete</span>
</DropdownMenuItem>
)}
</DropdownMenuContent>
</DropdownMenu>
) : (
// Non-editable documents: show only delete button directly
shouldShowDelete && (
<Button
variant="ghost"
size="icon"
className={`h-8 w-8 ${isDeleteDisabled ? "text-muted-foreground cursor-not-allowed" : "text-muted-foreground hover:text-destructive hover:bg-destructive/10"}`}
onClick={() => !isDeleteDisabled && setIsDeleteOpen(true)}
disabled={isDeleting || isDeleteDisabled}
>
<Trash2 className="h-4 w-4" />
<span className="sr-only">Delete</span>
</Button>
)
)}
</div>
{/* Mobile Actions Dropdown */}
<div className="flex md:hidden">
<DropdownMenu>
<DropdownMenuTrigger asChild>
<Button variant="ghost" size="icon" className="h-8 w-8 text-muted-foreground">
<MoreHorizontal className="h-4 w-4" />
<span className="sr-only">Open menu</span>
</Button>
</DropdownMenuTrigger>
<DropdownMenuContent align="end" className="w-40">
{isEditable && (
<DropdownMenuItem onClick={handleEdit}>
<div className="inline-flex md:hidden items-center justify-center">
{isEditable ? (
// Editable documents: show 3-dot dropdown
<DropdownMenu>
<DropdownMenuTrigger asChild>
<Button variant="ghost" size="icon" className="h-8 w-8 text-muted-foreground">
<MoreHorizontal className="h-4 w-4" />
<span className="sr-only">Open menu</span>
</Button>
</DropdownMenuTrigger>
<DropdownMenuContent align="end" className="w-40">
<DropdownMenuItem
onClick={() => !isEditDisabled && handleEdit()}
disabled={isEditDisabled}
className={
isEditDisabled ? "text-muted-foreground cursor-not-allowed opacity-50" : ""
}
>
<Pencil className="mr-2 h-4 w-4" />
<span>Edit</span>
</DropdownMenuItem>
)}
<DropdownMenuItem onClick={() => setIsMetadataOpen(true)}>
<FileText className="mr-2 h-4 w-4" />
<span>Metadata</span>
</DropdownMenuItem>
{isDeletable && (
<DropdownMenuItem
onClick={() => setIsDeleteOpen(true)}
className="text-destructive focus:text-destructive"
>
<Trash2 className="mr-2 h-4 w-4" />
<span>Delete</span>
</DropdownMenuItem>
)}
</DropdownMenuContent>
</DropdownMenu>
{shouldShowDelete && (
<DropdownMenuItem
onClick={() => !isDeleteDisabled && setIsDeleteOpen(true)}
disabled={isDeleteDisabled}
className={
isDeleteDisabled
? "text-muted-foreground cursor-not-allowed opacity-50"
: "text-destructive focus:text-destructive"
}
>
<Trash2 className="mr-2 h-4 w-4" />
<span>Delete</span>
</DropdownMenuItem>
)}
</DropdownMenuContent>
</DropdownMenu>
) : (
// Non-editable documents: show only delete button directly
shouldShowDelete && (
<Button
variant="ghost"
size="icon"
className={`h-8 w-8 ${isDeleteDisabled ? "text-muted-foreground cursor-not-allowed" : "text-muted-foreground hover:text-destructive hover:bg-destructive/10"}`}
onClick={() => !isDeleteDisabled && setIsDeleteOpen(true)}
disabled={isDeleting || isDeleteDisabled}
>
<Trash2 className="h-4 w-4" />
<span className="sr-only">Delete</span>
</Button>
)
)}
</div>
<JsonMetadataViewer
title={document.title}
metadata={document.document_metadata}
open={isMetadataOpen}
onOpenChange={setIsMetadataOpen}
/>
<AlertDialog open={isDeleteOpen} onOpenChange={setIsDeleteOpen}>
<AlertDialogContent>
<AlertDialogHeader>
@ -214,6 +222,6 @@ export function RowActions({
</AlertDialogFooter>
</AlertDialogContent>
</AlertDialog>
</div>
</>
);
}

View file

@ -1,18 +1,27 @@
export type DocumentType = string;
// Processing status carried by a document row: one of four lifecycle states
// plus an optional free-text reason.
export type DocumentStatus = {
state: "ready" | "pending" | "processing" | "failed";
// Optional explanation accompanying the state.
// NOTE(review): presumably only populated for "failed" — confirm against the API.
reason?: string;
};
export type Document = {
id: number;
title: string;
document_type: DocumentType;
document_metadata: any;
content: string;
// Optional: Only needed when viewing document details (lazy loaded)
document_metadata?: any;
content?: string;
created_at: string;
search_space_id: number;
created_by_id?: string | null;
created_by_name?: string | null;
status?: DocumentStatus;
};
export type ColumnVisibility = {
title: boolean;
document_type: boolean;
content: boolean;
created_by: boolean;
created_at: boolean;
status: boolean;
};

View file

@ -2,22 +2,19 @@
import { useQuery } from "@tanstack/react-query";
import { useAtomValue } from "jotai";
import { RefreshCw, SquarePlus, Upload } from "lucide-react";
import { motion } from "motion/react";
import { useParams, useRouter } from "next/navigation";
import { useParams } from "next/navigation";
import { useTranslations } from "next-intl";
import { useCallback, useEffect, useId, useMemo, useState } from "react";
import { useCallback, useEffect, useMemo, useState } from "react";
import { toast } from "sonner";
import { deleteDocumentMutationAtom } from "@/atoms/documents/document-mutation.atoms";
import { documentTypeCountsAtom } from "@/atoms/documents/document-query.atoms";
import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup";
import { Button } from "@/components/ui/button";
import type { DocumentTypeEnum } from "@/contracts/types/document.types";
import { useDocuments } from "@/hooks/use-documents";
import { documentsApiService } from "@/lib/apis/documents-api.service";
import { cacheKeys } from "@/lib/query-client/cache-keys";
import { DocumentsFilters } from "./components/DocumentsFilters";
import { DocumentsTableShell, type SortKey } from "./components/DocumentsTableShell";
import { PaginationControls } from "./components/PaginationControls";
import { PAGE_SIZE, PaginationControls } from "./components/PaginationControls";
import type { ColumnVisibility } from "./components/types";
function useDebounced<T>(value: T, delay = 250) {
@ -31,70 +28,48 @@ function useDebounced<T>(value: T, delay = 250) {
export default function DocumentsTable() {
const t = useTranslations("documents");
const id = useId();
const params = useParams();
const router = useRouter();
const searchSpaceId = Number(params.search_space_id);
const { openDialog: openUploadDialog } = useDocumentUploadDialog();
const handleNewNote = useCallback(() => {
router.push(`/dashboard/${searchSpaceId}/editor/new`);
}, [router, searchSpaceId]);
const [search, setSearch] = useState("");
const debouncedSearch = useDebounced(search, 250);
const [activeTypes, setActiveTypes] = useState<DocumentTypeEnum[]>([]);
const [columnVisibility, setColumnVisibility] = useState<ColumnVisibility>({
title: true,
document_type: true,
content: true,
created_by: true,
created_at: true,
status: true,
});
const [pageIndex, setPageIndex] = useState(0);
const [pageSize, setPageSize] = useState(50);
const [sortKey, setSortKey] = useState<SortKey>("title");
const [sortDesc, setSortDesc] = useState(false);
const [sortKey, setSortKey] = useState<SortKey>("created_at");
const [sortDesc, setSortDesc] = useState(true);
const [selectedIds, setSelectedIds] = useState<Set<number>>(new Set());
const { data: rawTypeCounts } = useAtomValue(documentTypeCountsAtom);
const { mutateAsync: deleteDocumentMutation } = useAtomValue(deleteDocumentMutationAtom);
// Build query parameters for fetching documents
const queryParams = useMemo(
() => ({
search_space_id: searchSpaceId,
page: pageIndex,
page_size: pageSize,
...(activeTypes.length > 0 && { document_types: activeTypes }),
}),
[searchSpaceId, pageIndex, pageSize, activeTypes]
);
// REAL-TIME: Use Electric SQL hook for live document updates (when not searching)
const {
documents: realtimeDocuments,
typeCounts: realtimeTypeCounts,
loading: realtimeLoading,
error: realtimeError,
} = useDocuments(searchSpaceId, activeTypes);
// Build search query parameters
// Check if we're in search mode
const isSearchMode = !!debouncedSearch.trim();
// Build search query parameters (only used when searching)
const searchQueryParams = useMemo(
() => ({
search_space_id: searchSpaceId,
page: pageIndex,
page_size: pageSize,
page_size: PAGE_SIZE,
title: debouncedSearch.trim(),
...(activeTypes.length > 0 && { document_types: activeTypes }),
}),
[searchSpaceId, pageIndex, pageSize, activeTypes, debouncedSearch]
[searchSpaceId, pageIndex, activeTypes, debouncedSearch]
);
// Use query for fetching documents
const {
data: documentsResponse,
isLoading: isDocumentsLoading,
refetch: refetchDocuments,
error: documentsError,
} = useQuery({
queryKey: cacheKeys.documents.globalQueryParams(queryParams),
queryFn: () => documentsApiService.getDocuments({ queryParams }),
staleTime: 3 * 60 * 1000, // 3 minutes
enabled: !!searchSpaceId && !debouncedSearch.trim(),
});
// Use query for searching documents
// API search query (only enabled when searching - Electric doesn't do full-text search)
const {
data: searchResponse,
isLoading: isSearchLoading,
@ -103,134 +78,135 @@ export default function DocumentsTable() {
} = useQuery({
queryKey: cacheKeys.documents.globalQueryParams(searchQueryParams),
queryFn: () => documentsApiService.searchDocuments({ queryParams: searchQueryParams }),
staleTime: 3 * 60 * 1000, // 3 minutes
enabled: !!searchSpaceId && !!debouncedSearch.trim(),
staleTime: 30 * 1000, // 30 seconds for search (shorter since it's on-demand)
enabled: !!searchSpaceId && isSearchMode,
});
// Determine if we should show SurfSense docs (when no type filter or SURFSENSE_DOCS is selected)
const showSurfsenseDocs =
activeTypes.length === 0 || activeTypes.includes("SURFSENSE_DOCS" as DocumentTypeEnum);
// Order the real-time documents on the client by the active sort column.
const sortedRealtimeDocuments = useMemo(() => {
	// Sort a copy so the array handed out by the hook is never mutated in place.
	return [...realtimeDocuments].sort((left, right) => {
		const leftValue = left[sortKey] ?? "";
		const rightValue = right[sortKey] ?? "";
		// Dates compare by timestamp; every other column by locale-aware string order.
		const ascending: number =
			sortKey === "created_at"
				? new Date(leftValue as string).getTime() - new Date(rightValue as string).getTime()
				: String(leftValue).localeCompare(String(rightValue));
		return sortDesc ? -ascending : ascending;
	});
}, [realtimeDocuments, sortKey, sortDesc]);
// Use query for fetching SurfSense docs
const {
data: surfsenseDocsResponse,
isLoading: isSurfsenseDocsLoading,
refetch: refetchSurfsenseDocs,
} = useQuery({
queryKey: ["surfsense-docs", debouncedSearch, pageIndex, pageSize],
queryFn: () =>
documentsApiService.getSurfsenseDocs({
queryParams: {
page: pageIndex,
page_size: pageSize,
title: debouncedSearch.trim() || undefined,
},
}),
staleTime: 3 * 60 * 1000, // 3 minutes
enabled: showSurfsenseDocs,
});
// Cut the current page out of the sorted real-time documents (client-side paging).
const paginatedRealtimeDocuments = useMemo(() => {
	const offset = pageIndex * PAGE_SIZE;
	return sortedRealtimeDocuments.slice(offset, offset + PAGE_SIZE);
}, [sortedRealtimeDocuments, pageIndex]);
// Transform SurfSense docs to match the Document type
const surfsenseDocsAsDocuments: Document[] = useMemo(() => {
if (!surfsenseDocsResponse?.items) return [];
return surfsenseDocsResponse.items.map((doc) => ({
id: doc.id,
title: doc.title,
document_type: "SURFSENSE_DOCS",
document_metadata: { source: doc.source },
content: doc.content,
created_at: new Date().toISOString(),
search_space_id: -1, // Special value for global docs
}));
}, [surfsenseDocsResponse]);
// Determine what to display based on search mode:
// - search mode: rows come from the search API response, projected onto the
//   fields listed below;
// - otherwise: the current page of the client-sorted real-time documents.
const displayDocs = isSearchMode
? (searchResponse?.items || []).map((item) => ({
id: item.id,
search_space_id: item.search_space_id,
document_type: item.document_type,
title: item.title,
// Creator fields are normalised so a missing value becomes null.
created_by_id: item.created_by_id ?? null,
created_by_name: item.created_by_name ?? null,
created_at: item.created_at,
// The cast reads an optional `status` off the item; when it is absent the
// row falls back to the "ready" state.
status: (
item as {
status?: { state: "ready" | "pending" | "processing" | "failed"; reason?: string };
}
).status ?? { state: "ready" as const },
}))
: paginatedRealtimeDocuments;
// Merge type counts with SURFSENSE_DOCS count
const typeCounts = useMemo(() => {
const counts = { ...(rawTypeCounts || {}) };
if (surfsenseDocsResponse?.total) {
counts.SURFSENSE_DOCS = surfsenseDocsResponse.total;
}
return counts;
}, [rawTypeCounts, surfsenseDocsResponse?.total]);
const displayTotal = isSearchMode ? searchResponse?.total || 0 : sortedRealtimeDocuments.length;
// Extract documents and total based on search state
const documents = debouncedSearch.trim()
? searchResponse?.items || []
: documentsResponse?.items || [];
const total = debouncedSearch.trim() ? searchResponse?.total || 0 : documentsResponse?.total || 0;
const loading = isSearchMode ? isSearchLoading : realtimeLoading;
const error = isSearchMode ? searchError : realtimeError;
const loading = debouncedSearch.trim() ? isSearchLoading : isDocumentsLoading;
const error = debouncedSearch.trim() ? searchError : documentsError;
// Display results directly
const displayDocs = documents;
const displayTotal = total;
const pageStart = pageIndex * pageSize;
const pageEnd = Math.min(pageStart + pageSize, displayTotal);
const pageEnd = Math.min((pageIndex + 1) * PAGE_SIZE, displayTotal);
const onToggleType = (type: DocumentTypeEnum, checked: boolean) => {
setActiveTypes((prev) => (checked ? [...prev, type] : prev.filter((t) => t !== type)));
setActiveTypes((prev) => {
if (checked) {
return prev.includes(type) ? prev : [...prev, type];
} else {
return prev.filter((t) => t !== type);
}
});
setPageIndex(0);
};
const onToggleColumn = (id: keyof ColumnVisibility, checked: boolean) => {
setColumnVisibility((prev) => ({ ...prev, [id]: checked }));
};
const [isRefreshing, setIsRefreshing] = useState(false);
const refreshCurrentView = useCallback(async () => {
if (isRefreshing) return;
setIsRefreshing(true);
try {
if (debouncedSearch.trim()) {
await refetchSearch();
} else {
await refetchDocuments();
}
toast.success(t("refresh_success") || "Documents refreshed");
} finally {
setIsRefreshing(false);
}
}, [debouncedSearch, refetchSearch, refetchDocuments, t, isRefreshing]);
// Create a delete function for single document deletion
const deleteDocument = useCallback(
async (id: number) => {
try {
await deleteDocumentMutation({ id });
return true;
} catch (error) {
console.error("Failed to delete document:", error);
return false;
}
},
[deleteDocumentMutation]
);
const onBulkDelete = async () => {
if (selectedIds.size === 0) {
toast.error(t("no_rows_selected"));
return;
}
// Filter out pending/processing documents - they cannot be deleted
// For real-time mode, use sortedRealtimeDocuments (which has status)
// For search mode, use searchResponse items (need to safely access status)
const allDocs = isSearchMode
? (searchResponse?.items || []).map((item) => ({
id: item.id,
status: (item as { status?: { state: string } }).status,
}))
: sortedRealtimeDocuments.map((doc) => ({ id: doc.id, status: doc.status }));
const selectedDocs = allDocs.filter((doc) => selectedIds.has(doc.id));
const deletableIds = selectedDocs
.filter((doc) => doc.status?.state !== "pending" && doc.status?.state !== "processing")
.map((doc) => doc.id);
const inProgressCount = selectedIds.size - deletableIds.length;
if (inProgressCount > 0) {
toast.warning(
`${inProgressCount} document(s) are pending or processing and cannot be deleted.`
);
}
if (deletableIds.length === 0) {
return;
}
try {
// Delete documents one by one using the mutation
// Track 409 conflicts separately (document started processing after UI loaded)
let conflictCount = 0;
const results = await Promise.all(
Array.from(selectedIds).map(async (id) => {
deletableIds.map(async (id) => {
try {
await deleteDocumentMutation({ id });
return true;
} catch {
} catch (error: unknown) {
const status =
(error as { response?: { status?: number } })?.response?.status ??
(error as { status?: number })?.status;
if (status === 409) conflictCount++;
return false;
}
})
);
const okCount = results.filter((r) => r === true).length;
if (okCount === selectedIds.size)
if (okCount === deletableIds.length) {
toast.success(t("delete_success_count", { count: okCount }));
else toast.error(t("delete_partial_failed"));
// Refetch the current page with appropriate method
await refreshCurrentView();
} else if (conflictCount > 0) {
toast.error(`${conflictCount} document(s) started processing. Please try again later.`);
} else {
toast.error(t("delete_partial_failed"));
}
// If in search mode, refetch search results to reflect deletion
if (isSearchMode) {
await refetchSearch();
}
// Real-time mode: Electric will sync the deletion automatically
setSelectedIds(new Set());
} catch (e) {
console.error(e);
@ -238,10 +214,47 @@ export default function DocumentsTable() {
}
};
// Delete one document on behalf of RowActions.
// Resolves to true when the whole sequence (mutation, success toast, optional
// search refetch) completes, and to false when any step throws.
const handleDeleteDocument = useCallback(
	async (id: number): Promise<boolean> => {
		let deleted = false;
		try {
			await deleteDocumentMutation({ id });
			toast.success(t("delete_success") || "Document deleted");
			// Search results come from the API, so they are refetched to drop the
			// deleted row. Outside search mode nothing is refetched here: Electric
			// is expected to sync the deletion automatically.
			if (isSearchMode) {
				await refetchSearch();
			}
			deleted = true;
		} catch (err) {
			console.error("Error deleting document:", err);
		}
		return deleted;
	},
	[deleteDocumentMutation, isSearchMode, refetchSearch, t]
);
// Column-header sort handler.
// Clicking the column that is already active flips the direction; clicking a
// different column selects it and resets to ascending.
//
// Both setters are called directly from the handler. State updater functions
// must stay pure: React may invoke an updater twice in Strict Mode, so calling
// setSortDesc from inside the setSortKey updater could toggle the direction
// twice and cancel itself out.
const handleSortChange = useCallback(
	(key: SortKey) => {
		if (key === sortKey) {
			setSortDesc((v) => !v);
			return;
		}
		setSortKey(key);
		setSortDesc(false);
	},
	[sortKey]
);
// Reset page when search changes (type filter already resets via onToggleType)
// biome-ignore lint/correctness/useExhaustiveDependencies: Intentionally reset page on search change
useEffect(() => {
setPageIndex(0);
}, [debouncedSearch]);
useEffect(() => {
const mq = window.matchMedia("(max-width: 768px)");
const apply = (isSmall: boolean) => {
setColumnVisibility((prev) => ({ ...prev, content: !isSmall, created_at: !isSmall }));
setColumnVisibility((prev) => ({ ...prev, created_by: !isSmall, created_at: !isSmall }));
};
apply(mq.matches);
const onChange = (e: MediaQueryListEvent) => apply(e.matches);
@ -254,81 +267,44 @@ export default function DocumentsTable() {
initial={{ opacity: 0, y: 20 }}
animate={{ opacity: 1, y: 0 }}
transition={{ duration: 0.3 }}
className="w-full px-6 py-4 space-y-6 min-h-[calc(100vh-64px)]"
className="w-full max-w-7xl mx-auto px-6 pt-17 pb-6 space-y-6 min-h-[calc(100vh-64px)]"
>
<motion.div
className="flex items-center justify-between"
initial={{ opacity: 0, y: 10 }}
animate={{ opacity: 1, y: 0 }}
transition={{ delay: 0.1 }}
>
<div>
<h2 className="text-xl md:text-2xl font-bold tracking-tight">{t("title")}</h2>
<p className="text-xs md:text-sm text-muted-foreground">{t("subtitle")}</p>
</div>
<div className="flex items-center gap-2">
<Button onClick={openUploadDialog} variant="default" size="sm">
<Upload className="w-4 h-4 mr-2" />
{t("upload_documents")}
</Button>
<Button onClick={handleNewNote} variant="outline" size="sm">
<SquarePlus className="w-4 h-4 mr-2" />
{t("create_shared_note")}
</Button>
<Button onClick={refreshCurrentView} variant="outline" size="sm" disabled={isRefreshing}>
<RefreshCw className={`w-4 h-4 mr-2 ${isRefreshing ? "animate-spin" : ""}`} />
{t("refresh")}
</Button>
</div>
</motion.div>
{/* Filters - use real-time type counts */}
<DocumentsFilters
typeCounts={rawTypeCounts ?? {}}
typeCounts={realtimeTypeCounts}
selectedIds={selectedIds}
onSearch={setSearch}
searchValue={search}
onBulkDelete={onBulkDelete}
onToggleType={onToggleType}
activeTypes={activeTypes}
columnVisibility={columnVisibility}
onToggleColumn={onToggleColumn}
/>
{/* Table */}
<DocumentsTableShell
documents={displayDocs}
loading={!!loading}
error={!!error}
onRefresh={refreshCurrentView}
selectedIds={selectedIds}
setSelectedIds={setSelectedIds}
columnVisibility={columnVisibility}
deleteDocument={deleteDocument}
sortKey={sortKey}
sortDesc={sortDesc}
onSortChange={(key) => {
if (sortKey === key) setSortDesc((v) => !v);
else {
setSortKey(key);
setSortDesc(false);
}
}}
onSortChange={handleSortChange}
deleteDocument={handleDeleteDocument}
searchSpaceId={String(searchSpaceId)}
/>
{/* Pagination */}
<PaginationControls
pageIndex={pageIndex}
pageSize={pageSize}
total={displayTotal}
onPageSizeChange={(s) => {
setPageSize(s);
setPageIndex(0);
}}
onFirst={() => setPageIndex(0)}
onPrev={() => setPageIndex((i) => Math.max(0, i - 1))}
onNext={() => setPageIndex((i) => (pageEnd < displayTotal ? i + 1 : i))}
onLast={() => setPageIndex(Math.max(0, Math.ceil(displayTotal / pageSize) - 1))}
onLast={() => setPageIndex(Math.max(0, Math.ceil(displayTotal / PAGE_SIZE) - 1))}
canPrev={pageIndex > 0}
canNext={pageEnd < displayTotal}
id={id}
/>
</motion.div>
);

View file

@ -0,0 +1,4 @@
import { atom } from "jotai";
// Atom to control the connector dialog open state from anywhere in the app.
// Holds a boolean and starts as `false`.
export const connectorDialogOpenAtom = atom(false);

View file

@ -1,5 +1,4 @@
import { atomWithMutation } from "jotai-tanstack-query";
import { toast } from "sonner";
import type {
CreateConnectorRequest,
DeleteConnectorRequest,
@ -17,15 +16,16 @@ export const createConnectorMutationAtom = atomWithMutation((get) => {
const searchSpaceId = get(activeSearchSpaceIdAtom);
return {
mutationKey: cacheKeys.connectors.all(searchSpaceId!),
mutationKey: cacheKeys.connectors.all(searchSpaceId ?? ""),
enabled: !!searchSpaceId,
mutationFn: async (request: CreateConnectorRequest) => {
return connectorsApiService.createConnector(request);
},
onSuccess: () => {
if (!searchSpaceId) return;
queryClient.invalidateQueries({
queryKey: cacheKeys.connectors.all(searchSpaceId!),
queryKey: cacheKeys.connectors.all(searchSpaceId),
});
},
};
@ -35,15 +35,16 @@ export const updateConnectorMutationAtom = atomWithMutation((get) => {
const searchSpaceId = get(activeSearchSpaceIdAtom);
return {
mutationKey: cacheKeys.connectors.all(searchSpaceId!),
mutationKey: cacheKeys.connectors.all(searchSpaceId ?? ""),
enabled: !!searchSpaceId,
mutationFn: async (request: UpdateConnectorRequest) => {
return connectorsApiService.updateConnector(request);
},
onSuccess: (_, request: UpdateConnectorRequest) => {
if (!searchSpaceId) return;
queryClient.invalidateQueries({
queryKey: cacheKeys.connectors.all(searchSpaceId!),
queryKey: cacheKeys.connectors.all(searchSpaceId),
});
queryClient.invalidateQueries({
queryKey: cacheKeys.connectors.byId(String(request.id)),
@ -56,15 +57,16 @@ export const deleteConnectorMutationAtom = atomWithMutation((get) => {
const searchSpaceId = get(activeSearchSpaceIdAtom);
return {
mutationKey: cacheKeys.connectors.all(searchSpaceId!),
mutationKey: cacheKeys.connectors.all(searchSpaceId ?? ""),
enabled: !!searchSpaceId,
mutationFn: async (request: DeleteConnectorRequest) => {
return connectorsApiService.deleteConnector(request);
},
onSuccess: (_, request: DeleteConnectorRequest) => {
if (!searchSpaceId) return;
queryClient.setQueryData(
cacheKeys.connectors.all(searchSpaceId!),
cacheKeys.connectors.all(searchSpaceId),
(oldData: GetConnectorsResponse | undefined) => {
if (!oldData) return oldData;
return oldData.filter((connector) => connector.id !== request.id);
@ -88,9 +90,9 @@ export const indexConnectorMutationAtom = atomWithMutation((get) => {
},
onSuccess: (response: IndexConnectorResponse) => {
toast.success(response.message);
if (!searchSpaceId) return;
queryClient.invalidateQueries({
queryKey: cacheKeys.connectors.all(searchSpaceId!),
queryKey: cacheKeys.connectors.all(searchSpaceId),
});
queryClient.invalidateQueries({
queryKey: cacheKeys.connectors.byId(String(response.connector_id)),

View file

@ -48,7 +48,7 @@ export const uploadDocumentMutationAtom = atomWithMutation((get) => {
},
onSuccess: () => {
toast.success("Files uploaded for processing");
// Note: Toast notification is handled by the caller (DocumentUploadTab) to use i18n
// Invalidate logs summary to show new processing tasks immediately on documents page
queryClient.invalidateQueries({
queryKey: cacheKeys.logs.summary(searchSpaceId ?? undefined),
@ -95,7 +95,7 @@ export const deleteDocumentMutationAtom = atomWithMutation((get) => {
},
onSuccess: (_, request: DeleteDocumentRequest) => {
toast.success("Document deleted successfully");
// Note: Toast is handled by the caller (page.tsx onBulkDelete) to show count info
queryClient.setQueryData(
cacheKeys.documents.globalQueryParams(documentsQueryParams),
(oldData: GetDocumentsResponse | undefined) => {

View file

@ -19,7 +19,7 @@ import { Spinner } from "@/components/ui/spinner";
import { Tabs, TabsContent } from "@/components/ui/tabs";
import type { SearchSourceConnector } from "@/contracts/types/connector.types";
import { useConnectorsElectric } from "@/hooks/use-connectors-electric";
import { useDocumentsElectric } from "@/hooks/use-documents-electric";
import { useDocuments } from "@/hooks/use-documents";
import { useInbox } from "@/hooks/use-inbox";
import { cn } from "@/lib/utils";
import { ConnectorDialogHeader } from "./connector-popup/components/connector-dialog-header";
@ -37,7 +37,7 @@ import { AllConnectorsTab } from "./connector-popup/tabs/all-connectors-tab";
import { ConnectorAccountsListView } from "./connector-popup/views/connector-accounts-list-view";
import { YouTubeCrawlerView } from "./connector-popup/views/youtube-crawler-view";
export const ConnectorIndicator: FC = () => {
export const ConnectorIndicator: FC<{ hideTrigger?: boolean }> = ({ hideTrigger = false }) => {
const searchSpaceId = useAtomValue(activeSearchSpaceIdAtom);
const searchParams = useSearchParams();
const { data: currentUser } = useAtomValue(currentUserAtom);
@ -63,7 +63,9 @@ export const ConnectorIndicator: FC = () => {
const llmConfigLoading = preferencesLoading || globalConfigsLoading;
// Fetch document type counts using Electric SQL + PGlite for real-time updates
const { documentTypeCounts, loading: documentTypesLoading } = useDocumentsElectric(searchSpaceId);
const { typeCounts: documentTypeCounts, loading: documentTypesLoading } = useDocuments(
searchSpaceId ? Number(searchSpaceId) : null
);
// Fetch notifications to detect indexing failures
const { inboxItems = [] } = useInbox(
@ -186,34 +188,38 @@ export const ConnectorIndicator: FC = () => {
return (
<Dialog open={isOpen} onOpenChange={handleOpenChange}>
<TooltipIconButton
data-joyride="connector-icon"
tooltip={hasConnectors ? `Manage ${activeConnectorsCount} connectors` : "Connect your data"}
side="bottom"
className={cn(
"size-[34px] rounded-full p-1 flex items-center justify-center transition-colors relative",
"hover:bg-muted-foreground/15 dark:hover:bg-muted-foreground/30",
"outline-none focus:outline-none focus-visible:outline-none font-semibold text-xs",
"border-0 ring-0 focus:ring-0 shadow-none focus:shadow-none"
)}
aria-label={
hasConnectors ? `View ${activeConnectorsCount} connectors` : "Add your first connector"
}
onClick={() => handleOpenChange(true)}
>
{isLoading ? (
<Spinner size="sm" />
) : (
<>
<Cable className="size-4 stroke-[1.5px]" />
{activeConnectorsCount > 0 && (
<span className="absolute -top-0.5 right-0 flex items-center justify-center min-w-[16px] h-4 px-1 text-[10px] font-medium rounded-full bg-primary text-primary-foreground shadow-sm">
{activeConnectorsCount > 99 ? "99+" : activeConnectorsCount}
</span>
)}
</>
)}
</TooltipIconButton>
{!hideTrigger && (
<TooltipIconButton
data-joyride="connector-icon"
tooltip={
hasConnectors ? `Manage ${activeConnectorsCount} connectors` : "Connect your data"
}
side="bottom"
className={cn(
"size-[34px] rounded-full p-1 flex items-center justify-center transition-colors relative",
"hover:bg-muted-foreground/15 dark:hover:bg-muted-foreground/30",
"outline-none focus:outline-none focus-visible:outline-none font-semibold text-xs",
"border-0 ring-0 focus:ring-0 shadow-none focus:shadow-none"
)}
aria-label={
hasConnectors ? `View ${activeConnectorsCount} connectors` : "Add your first connector"
}
onClick={() => handleOpenChange(true)}
>
{isLoading ? (
<Spinner size="sm" />
) : (
<>
<Cable className="size-4 stroke-[1.5px]" />
{activeConnectorsCount > 0 && (
<span className="absolute -top-0.5 right-0 flex items-center justify-center min-w-[16px] h-4 px-1 text-[10px] font-medium rounded-full bg-primary text-primary-foreground shadow-sm">
{activeConnectorsCount > 99 ? "99+" : activeConnectorsCount}
</span>
)}
</>
)}
</TooltipIconButton>
)}
<DialogContent className="max-w-3xl w-[95vw] sm:w-full h-[75vh] sm:h-[85vh] flex flex-col p-0 gap-0 overflow-hidden border border-border bg-muted text-foreground focus:outline-none focus:ring-0 focus-visible:outline-none focus-visible:ring-0 [&>button]:right-4 sm:[&>button]:right-12 [&>button]:top-6 sm:[&>button]:top-10 [&>button]:opacity-80 hover:[&>button]:opacity-100 [&>button_svg]:size-5">
<DialogTitle className="sr-only">Manage Connectors</DialogTitle>

View file

@ -1,8 +1,9 @@
import { format } from "date-fns";
import { useAtomValue } from "jotai";
import { useAtom, useAtomValue } from "jotai";
import { useRouter, useSearchParams } from "next/navigation";
import { useCallback, useEffect, useRef, useState } from "react";
import { toast } from "sonner";
import { connectorDialogOpenAtom } from "@/atoms/connector-dialog/connector-dialog.atoms";
import {
createConnectorMutationAtom,
deleteConnectorMutationAtom,
@ -49,7 +50,8 @@ export const useConnectorDialog = () => {
const { mutateAsync: deleteConnector } = useAtomValue(deleteConnectorMutationAtom);
const { mutateAsync: createConnector } = useAtomValue(createConnectorMutationAtom);
const [isOpen, setIsOpen] = useState(false);
// Use global atom for dialog open state so it can be controlled from anywhere
const [isOpen, setIsOpen] = useAtom(connectorDialogOpenAtom);
const [activeTab, setActiveTab] = useState("all");
const [connectingId, setConnectingId] = useState<string | null>(null);
const [isScrolled, setIsScrolled] = useState(false);
@ -293,6 +295,7 @@ export const useConnectorDialog = () => {
connectingConnectorType,
viewingAccountsType,
viewingMCPList,
setIsOpen,
]);
// Detect OAuth success / Failure and transition to config view
@ -345,9 +348,10 @@ export const useConnectorDialog = () => {
// If we found the connector, find the matching OAuth/Composio connector by type
if (newConnector) {
const connectorType = newConnector.connector_type;
oauthConnector =
OAUTH_CONNECTORS.find((c) => c.connectorType === newConnector!.connector_type) ||
COMPOSIO_CONNECTORS.find((c) => c.connectorType === newConnector!.connector_type);
OAUTH_CONNECTORS.find((c) => c.connectorType === connectorType) ||
COMPOSIO_CONNECTORS.find((c) => c.connectorType === connectorType);
}
}
@ -358,8 +362,9 @@ export const useConnectorDialog = () => {
COMPOSIO_CONNECTORS.find((c) => c.id === params.connector);
if (oauthConnector) {
const oauthConnectorType = oauthConnector.connectorType;
newConnector = result.data.find(
(c: SearchSourceConnector) => c.connector_type === oauthConnector!.connectorType
(c: SearchSourceConnector) => c.connector_type === oauthConnectorType
);
}
}
@ -399,7 +404,7 @@ export const useConnectorDialog = () => {
// Invalid query params - log but don't crash
console.warn("Invalid connector popup query params in OAuth success handler:", error);
}
}, [searchParams, searchSpaceId, refetchAllConnectors]);
}, [searchParams, searchSpaceId, refetchAllConnectors, setIsOpen]);
// Handle OAuth connection
const handleConnectOAuth = useCallback(
@ -514,7 +519,7 @@ export const useConnectorDialog = () => {
} finally {
setConnectingId(null);
}
}, [searchSpaceId, createConnector, refetchAllConnectors]);
}, [searchSpaceId, createConnector, refetchAllConnectors, setIsOpen]);
// Handle connecting non-OAuth connectors (like Tavily API)
const handleConnectNonOAuth = useCallback(
@ -677,12 +682,8 @@ export const useConnectorDialog = () => {
const successMessage =
currentConnectorType === "MCP_CONNECTOR"
? `${connector.name} added successfully`
: `${connectorTitle} connected and indexing started!`;
toast.success(successMessage, {
description: periodicEnabledForIndexing
? `Periodic sync enabled every ${getFrequencyLabel(frequencyMinutesForIndexing)}.`
: "You can continue working while we sync your data.",
});
: `${connectorTitle} connected and syncing started!`;
toast.success(successMessage);
const url = new URL(window.location.href);
url.searchParams.delete("modal");
@ -782,7 +783,6 @@ export const useConnectorDialog = () => {
updateConnector,
indexConnector,
router,
getFrequencyLabel,
]
);
@ -1010,11 +1010,7 @@ export const useConnectorDialog = () => {
);
}
toast.success(`${indexingConfig.connectorTitle} indexing started`, {
description: periodicEnabled
? `Periodic sync enabled every ${getFrequencyLabel(frequencyMinutes)}.`
: "You can continue working while we sync your data.",
});
toast.success(`${indexingConfig.connectorTitle} indexing started`);
// Update URL - the effect will handle closing the modal and clearing state
const url = new URL(window.location.href);
@ -1045,7 +1041,6 @@ export const useConnectorDialog = () => {
updateConnector,
periodicEnabled,
frequencyMinutes,
getFrequencyLabel,
router,
indexingConnectorConfig,
]
@ -1426,9 +1421,7 @@ export const useConnectorDialog = () => {
end_date: endDateStr,
},
});
toast.success("Indexing started", {
description: "You can continue working while we sync your data.",
});
toast.success("Indexing started");
// Invalidate queries to refresh data
queryClient.invalidateQueries({
@ -1445,7 +1438,7 @@ export const useConnectorDialog = () => {
}
}
},
[searchSpaceId, indexConnector, queryClient]
[searchSpaceId, indexConnector]
);
// Handle going back from edit view
@ -1527,7 +1520,7 @@ export const useConnectorDialog = () => {
}
}
},
[activeTab, isStartingIndexing, isDisconnecting, isSaving, isCreatingConnector]
[activeTab, isStartingIndexing, isDisconnecting, isSaving, isCreatingConnector, setIsOpen]
);
// Handle tab change

View file

@ -1,4 +1,4 @@
import { FileJson } from "lucide-react";
import { FileJson, Loader2 } from "lucide-react";
import React from "react";
import { defaultStyles, JsonView } from "react-json-view-lite";
import { Button } from "@/components/ui/button";
@ -17,6 +17,7 @@ interface JsonMetadataViewerProps {
trigger?: React.ReactNode;
open?: boolean;
onOpenChange?: (open: boolean) => void;
loading?: boolean;
}
export function JsonMetadataViewer({
@ -25,6 +26,7 @@ export function JsonMetadataViewer({
trigger,
open,
onOpenChange,
loading,
}: JsonMetadataViewerProps) {
// Ensure metadata is a valid object
const jsonData = React.useMemo(() => {
@ -54,7 +56,13 @@ export function JsonMetadataViewer({
</DialogTitle>
</DialogHeader>
<div className="mt-2 sm:mt-4 p-2 sm:p-4 bg-muted/30 rounded-md text-xs sm:text-sm">
<JsonView data={jsonData} style={defaultStyles} />
{loading ? (
<div className="flex items-center justify-center py-12">
<Loader2 className="h-8 w-8 animate-spin text-muted-foreground" />
</div>
) : (
<JsonView data={jsonData} style={defaultStyles} />
)}
</div>
</DialogContent>
</Dialog>

View file

@ -90,7 +90,7 @@ export function LayoutDataProvider({
});
// Fetch threads (40 total to allow up to 20 per section - shared/private)
const { data: threadsData } = useQuery({
const { data: threadsData, isPending: isLoadingThreads } = useQuery({
queryKey: ["threads", searchSpaceId, { limit: 40 }],
queryFn: () => fetchThreads(Number(searchSpaceId), 40),
enabled: !!searchSpaceId,
@ -585,6 +585,7 @@ export function LayoutDataProvider({
theme={theme}
setTheme={setTheme}
isChatPage={isChatPage}
isLoadingChats={isLoadingThreads}
inbox={{
isOpen: isInboxSidebarOpen,
onOpenChange: setIsInboxSidebarOpen,

View file

@ -74,6 +74,7 @@ interface LayoutShellProps {
className?: string;
// Inbox props
inbox?: InboxProps;
isLoadingChats?: boolean;
}
export function LayoutShell({
@ -110,6 +111,7 @@ export function LayoutShell({
children,
className,
inbox,
isLoadingChats = false,
}: LayoutShellProps) {
const isMobile = useIsMobile();
const [mobileMenuOpen, setMobileMenuOpen] = useState(false);
@ -162,6 +164,7 @@ export function LayoutShell({
pageUsage={pageUsage}
theme={theme}
setTheme={setTheme}
isLoadingChats={isLoadingChats}
/>
<main className={cn("flex-1", isChatPage ? "overflow-hidden" : "overflow-auto")}>
@ -232,6 +235,7 @@ export function LayoutShell({
theme={theme}
setTheme={setTheme}
className="hidden md:flex border-r shrink-0"
isLoadingChats={isLoadingChats}
/>
{/* Docked Inbox Sidebar - renders as flex sibling between sidebar and content */}

View file

@ -37,6 +37,7 @@ interface MobileSidebarProps {
pageUsage?: PageUsage;
theme?: string;
setTheme?: (theme: "light" | "dark" | "system") => void;
isLoadingChats?: boolean;
}
export function MobileSidebarTrigger({ onClick }: { onClick: () => void }) {
@ -78,6 +79,7 @@ export function MobileSidebar({
pageUsage,
theme,
setTheme,
isLoadingChats = false,
}: MobileSidebarProps) {
const handleSearchSpaceSelect = (id: number) => {
onSearchSpaceSelect(id);
@ -158,6 +160,7 @@ export function MobileSidebar({
theme={theme}
setTheme={setTheme}
className="w-full border-none"
isLoadingChats={isLoadingChats}
/>
</div>
</SheetContent>

View file

@ -3,6 +3,7 @@
import { FolderOpen, PenSquare } from "lucide-react";
import { useTranslations } from "next-intl";
import { Button } from "@/components/ui/button";
import { Skeleton } from "@/components/ui/skeleton";
import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
import { cn } from "@/lib/utils";
import type { ChatItem, NavItem, PageUsage, SearchSpace, User } from "../../types/layout.types";
@ -14,6 +15,15 @@ import { SidebarHeader } from "./SidebarHeader";
import { SidebarSection } from "./SidebarSection";
import { SidebarUserProfile } from "./SidebarUserProfile";
/**
 * Placeholder row for a chat list entry: a small square skeleton followed by
 * a wider skeleton bar, laid out in a single padded flex row.
 */
function ChatListItemSkeleton() {
	const iconPlaceholder = <Skeleton className="h-4 w-4 shrink-0 rounded" />;
	const titlePlaceholder = <Skeleton className="h-4 w-full max-w-[180px]" />;

	return (
		<div className="flex w-full items-center gap-2 rounded-md p-2">
			{iconPlaceholder}
			{titlePlaceholder}
		</div>
	);
}
interface SidebarProps {
searchSpace: SearchSpace | null;
isCollapsed?: boolean;
@ -39,6 +49,7 @@ interface SidebarProps {
theme?: string;
setTheme?: (theme: "light" | "dark" | "system") => void;
className?: string;
isLoadingChats?: boolean;
}
export function Sidebar({
@ -66,6 +77,7 @@ export function Sidebar({
theme,
setTheme,
className,
isLoadingChats = false,
}: SidebarProps) {
const t = useTranslations("sidebar");
@ -153,7 +165,15 @@ export function Sidebar({
) : undefined
}
>
{sharedChats.length > 0 ? (
{isLoadingChats ? (
<div className="flex flex-col gap-0.5">
<ChatListItemSkeleton />
<ChatListItemSkeleton />
<ChatListItemSkeleton />
<ChatListItemSkeleton />
<ChatListItemSkeleton />
</div>
) : sharedChats.length > 0 ? (
<div className="relative min-h-0 flex-1">
<div
className={`flex flex-col gap-0.5 max-h-full overflow-y-auto scrollbar-thin scrollbar-thumb-muted-foreground/20 scrollbar-track-transparent ${sharedChats.length > 4 ? "pb-8" : ""}`}
@ -206,7 +226,15 @@ export function Sidebar({
) : undefined
}
>
{chats.length > 0 ? (
{isLoadingChats ? (
<div className="flex flex-col gap-0.5">
<ChatListItemSkeleton />
<ChatListItemSkeleton />
<ChatListItemSkeleton />
<ChatListItemSkeleton />
<ChatListItemSkeleton />
</div>
) : chats.length > 0 ? (
<div className="relative flex-1 min-h-0">
<div
className={`flex flex-col gap-0.5 h-full overflow-y-auto scrollbar-thin scrollbar-thumb-muted-foreground/20 scrollbar-track-transparent ${chats.length > 4 ? "pb-8" : ""}`}

View file

@ -92,7 +92,7 @@ export const getConnectorIcon = (connectorType: EnumConnectorName | string, clas
case "FILE":
return <File {...iconProps} />;
case "GOOGLE_DRIVE_FILE":
return <File {...iconProps} />;
return <Image src="/connectors/google-drive.svg" alt="Google Drive" {...imgProps} />;
case "COMPOSIO_GOOGLE_DRIVE_CONNECTOR":
return <Image src="/connectors/google-drive.svg" alt="Google Drive" {...imgProps} />;
case "COMPOSIO_GMAIL_CONNECTOR":

View file

@ -23,6 +23,7 @@ export const documentTypeEnum = z.enum([
"ELASTICSEARCH_CONNECTOR",
"BOOKSTACK_CONNECTOR",
"CIRCLEBACK",
"OBSIDIAN_CONNECTOR",
"SURFSENSE_DOCS",
"NOTE",
"COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
@ -41,6 +42,8 @@ export const document = z.object({
created_at: z.string(),
updated_at: z.string().nullable(),
search_space_id: z.number(),
created_by_id: z.string().nullable().optional(),
created_by_name: z.string().nullable().optional(),
});
export const extensionDocumentContent = z.object({

View file

@ -1,185 +0,0 @@
"use client";
import { useEffect, useMemo, useRef, useState } from "react";
import type { SyncHandle } from "@/lib/electric/client";
import { useElectricClient } from "@/lib/electric/context";
// Row shape for the locally synced `documents` table, limited to the columns
// this hook selects (id, document_type, search_space_id, created_at).
interface Document {
	id: number;
	search_space_id: number;
	// Used as the grouping key when computing per-type counts.
	document_type: string;
	// Queries in this file order rows by this column, newest first.
	created_at: string;
}
/**
 * Hook for managing documents with Electric SQL real-time sync
 *
 * Uses the Electric client from context (provided by ElectricProvider)
 * instead of initializing its own - prevents race conditions and memory leaks
 *
 * @param searchSpaceId - Search space whose documents are synced; a falsy value
 *   clears the local document list and skips syncing.
 * @returns `documentTypeCounts` (number of synced documents per `document_type`),
 *   `loading`, and `error`.
 */
export function useDocumentsElectric(searchSpaceId: number | string | null) {
	// Get Electric client from context - ElectricProvider handles initialization
	const electricClient = useElectricClient();

	const [documents, setDocuments] = useState<Document[]>([]);
	const [loading, setLoading] = useState(true);
	const [error, setError] = useState<Error | null>(null);
	// Handles kept in refs so the effect cleanup can unsubscribe them
	const syncHandleRef = useRef<SyncHandle | null>(null);
	const liveQueryRef = useRef<{ unsubscribe: () => void } | null>(null);
	// Key of the sync currently in progress; used to skip duplicate subscriptions
	const syncKeyRef = useRef<string | null>(null);

	// Calculate document type counts from synced documents
	const documentTypeCounts = useMemo(() => {
		if (!documents.length) return {};

		const counts: Record<string, number> = {};
		for (const doc of documents) {
			counts[doc.document_type] = (counts[doc.document_type] || 0) + 1;
		}
		return counts;
	}, [documents]);

	// Start syncing when Electric client is available
	useEffect(() => {
		// Wait for both searchSpaceId and Electric client to be available
		if (!searchSpaceId || !electricClient) {
			setLoading(!electricClient); // Still loading if waiting for Electric
			if (!searchSpaceId) {
				setDocuments([]);
			}
			return;
		}

		// Create a unique key for this sync to prevent duplicate subscriptions
		const syncKey = `documents_${searchSpaceId}`;
		if (syncKeyRef.current === syncKey) {
			// Already syncing for this search space
			return;
		}

		// Flipped to false by the cleanup; async steps check it before touching state
		let mounted = true;
		syncKeyRef.current = syncKey;

		// Starts the shape sync, then loads the rows once and subscribes to live changes.
		async function startSync() {
			try {
				console.log("[useDocumentsElectric] Starting sync for search space:", searchSpaceId);

				const handle = await electricClient.syncShape({
					table: "documents",
					where: `search_space_id = ${searchSpaceId}`,
					columns: ["id", "document_type", "search_space_id", "created_at"],
					primaryKey: ["id"],
				});

				console.log("[useDocumentsElectric] Sync started:", {
					isUpToDate: handle.isUpToDate,
				});

				// Wait for initial sync with timeout
				if (!handle.isUpToDate && handle.initialSyncPromise) {
					try {
						// Whichever settles first wins: the initial sync or the 2s timer.
						// NOTE(review): the timer is never cleared when the sync wins the race.
						await Promise.race([
							handle.initialSyncPromise,
							new Promise((resolve) => setTimeout(resolve, 2000)),
						]);
					} catch (syncErr) {
						// A failed initial sync is logged only; setup continues below
						console.error("[useDocumentsElectric] Initial sync failed:", syncErr);
					}
				}

				// Effect was cleaned up while awaiting: release the handle and stop
				if (!mounted) {
					handle.unsubscribe();
					return;
				}

				syncHandleRef.current = handle;
				setLoading(false);
				setError(null);

				// Fetch initial documents
				await fetchDocuments();

				// Set up live query for real-time updates
				await setupLiveQuery();
			} catch (err) {
				if (!mounted) return;
				console.error("[useDocumentsElectric] Failed to start sync:", err);
				setError(err instanceof Error ? err : new Error("Failed to sync documents"));
				setLoading(false);
			}
		}

		// One-off read of the local table; failures are logged and leave state unchanged.
		async function fetchDocuments() {
			try {
				const result = await electricClient.db.query<Document>(
					`SELECT id, document_type, search_space_id, created_at FROM documents WHERE search_space_id = $1 ORDER BY created_at DESC`,
					[searchSpaceId]
				);
				if (mounted) {
					setDocuments(result.rows || []);
				}
			} catch (err) {
				console.error("[useDocumentsElectric] Failed to fetch:", err);
			}
		}

		// Subscribes to the same query via `db.live.query` when that API is present.
		async function setupLiveQuery() {
			try {
				// eslint-disable-next-line @typescript-eslint/no-explicit-any
				const db = electricClient.db as any;

				// Feature-detect the live query API; without it only the one-off fetch applies
				if (db.live?.query && typeof db.live.query === "function") {
					const liveQuery = await db.live.query(
						`SELECT id, document_type, search_space_id, created_at FROM documents WHERE search_space_id = $1 ORDER BY created_at DESC`,
						[searchSpaceId]
					);

					if (!mounted) {
						liveQuery.unsubscribe?.();
						return;
					}

					// Set initial results
					if (liveQuery.initialResults?.rows) {
						setDocuments(liveQuery.initialResults.rows);
					} else if (liveQuery.rows) {
						setDocuments(liveQuery.rows);
					}

					// Subscribe to changes
					if (typeof liveQuery.subscribe === "function") {
						liveQuery.subscribe((result: { rows: Document[] }) => {
							if (mounted && result.rows) {
								setDocuments(result.rows);
							}
						});
					}

					// Only keep the handle if it can actually be unsubscribed during cleanup
					if (typeof liveQuery.unsubscribe === "function") {
						liveQueryRef.current = liveQuery;
					}
				}
			} catch (liveErr) {
				console.error("[useDocumentsElectric] Failed to set up live query:", liveErr);
			}
		}

		startSync();

		// Cleanup: mark unmounted, reset the duplicate-sync guard, drop both subscriptions
		return () => {
			mounted = false;
			syncKeyRef.current = null;

			if (syncHandleRef.current) {
				syncHandleRef.current.unsubscribe();
				syncHandleRef.current = null;
			}
			if (liveQueryRef.current) {
				liveQueryRef.current.unsubscribe();
				liveQueryRef.current = null;
			}
		};
	}, [searchSpaceId, electricClient]);

	return { documentTypeCounts, loading, error };
}

View file

@ -0,0 +1,449 @@
"use client";
import { useCallback, useEffect, useMemo, useRef, useState } from "react";
import type { DocumentTypeEnum } from "@/contracts/types/document.types";
import { documentsApiService } from "@/lib/apis/documents-api.service";
import type { SyncHandle } from "@/lib/electric/client";
import { useElectricClient } from "@/lib/electric/context";
// Stable empty array to prevent infinite re-renders when no typeFilter is provided
const EMPTY_TYPE_FILTER: DocumentTypeEnum[] = [];
// Document status type (matches backend DocumentStatus JSONB)
export interface DocumentStatusType {
	// One of the four allowed state strings.
	state: "ready" | "pending" | "processing" | "failed";
	// Optional free-text detail; presumably populated for failures — confirm against backend.
	reason?: string;
}
// Document from Electric sync (lightweight table columns - NO content/metadata)
// Fields mirror the columns requested in the shape sync and live query.
interface DocumentElectric {
	id: number;
	search_space_id: number;
	document_type: string;
	title: string;
	// Creator's user id; may be null. Used as the key into the user-name cache.
	created_by_id: string | null;
	created_at: string;
	// May be null; consumers fall back to { state: "ready" } when it is.
	status: DocumentStatusType | null;
}
// Document for display (with resolved user name)
export interface DocumentDisplay {
	id: number;
	search_space_id: number;
	document_type: string;
	title: string;
	created_by_id: string | null;
	// Display name resolved for created_by_id; null when it is unknown or there is no creator.
	created_by_name: string | null;
	created_at: string;
	// Always present here (non-nullable), unlike the synced row's status.
	status: DocumentStatusType;
}
/**
 * Collapse items sharing an `id` and return them ordered by `created_at`, newest first.
 *
 * When an id appears more than once, the entry with the strictly later
 * `created_at` is kept; on a tie the earlier occurrence wins.
 */
function deduplicateAndSort<T extends { id: number; created_at: string }>(items: T[]): T[] {
	const toMillis = (entry: T): number => new Date(entry.created_at).getTime();
	const newestById = new Map<number, T>();

	items.forEach((candidate) => {
		const kept = newestById.get(candidate.id);
		// Skip the candidate unless it is the first for its id or strictly newer
		if (kept && !(toMillis(candidate) > toMillis(kept))) {
			return;
		}
		newestById.set(candidate.id, candidate);
	});

	const unique = Array.from(newestById.values());
	unique.sort((left, right) => toMillis(right) - toMillis(left));
	return unique;
}
/**
 * Report whether a synced row is complete enough to use:
 * it needs a non-null `id` and a non-null, non-empty `title`.
 */
function isValidDocument(doc: DocumentElectric): boolean {
	if (doc.id == null) return false;
	if (doc.title == null) return false;
	return doc.title !== "";
}
/**
 * Real-time documents hook backed by the REST API plus Electric SQL.
 *
 * How the two sources are combined:
 * 1. The API is loaded first and treated as the source of truth; live updates
 *    are ignored until that load has completed.
 * 2. An Electric shape sync + live query then delivers additions, updates and deletions.
 * 3. Live results may only REMOVE documents when `syncHandle.isUpToDate` is true;
 *    while the sync is partial they can only add or update rows.
 *
 * @param searchSpaceId - The search space ID to filter documents (falsy disables loading)
 * @param typeFilter - Optional document types to filter by; pass a referentially
 *   stable array, since it is an effect dependency
 * @returns `documents`, per-type `typeCounts`, `total` (documents.length), `loading`, `error`
 */
export function useDocuments(
	searchSpaceId: number | null,
	typeFilter: DocumentTypeEnum[] = EMPTY_TYPE_FILTER
) {
	const electricClient = useElectricClient();

	const [documents, setDocuments] = useState<DocumentDisplay[]>([]);
	const [loading, setLoading] = useState(true);
	const [error, setError] = useState<Error | null>(null);

	// Track if initial API load is complete (source of truth)
	const apiLoadedRef = useRef(false);

	// User cache: userId → displayName
	const userCacheRef = useRef<Map<string, string>>(new Map());

	// Electric sync refs
	const syncHandleRef = useRef<SyncHandle | null>(null);
	const liveQueryRef = useRef<{ unsubscribe?: () => void } | null>(null);

	// Real-time type counts
	const typeCounts = useMemo(() => {
		const counts: Record<string, number> = {};
		for (const doc of documents) {
			counts[doc.document_type] = (counts[doc.document_type] || 0) + 1;
		}
		return counts;
	}, [documents]);

	// Populate user cache from API response
	const populateUserCache = useCallback(
		(items: Array<{ created_by_id?: string | null; created_by_name?: string | null }>) => {
			for (const item of items) {
				if (item.created_by_id && item.created_by_name) {
					userCacheRef.current.set(item.created_by_id, item.created_by_name);
				}
			}
		},
		[]
	);

	// Convert API item to display doc
	const apiToDisplayDoc = useCallback(
		(item: {
			id: number;
			search_space_id: number;
			document_type: string;
			title: string;
			created_by_id?: string | null;
			created_by_name?: string | null;
			created_at: string;
			status?: DocumentStatusType | null;
		}): DocumentDisplay => ({
			id: item.id,
			search_space_id: item.search_space_id,
			document_type: item.document_type,
			title: item.title,
			created_by_id: item.created_by_id ?? null,
			created_by_name: item.created_by_name ?? null,
			created_at: item.created_at,
			status: item.status ?? { state: "ready" },
		}),
		[]
	);

	// Convert Electric doc to display doc (creator name comes from the user cache)
	const electricToDisplayDoc = useCallback(
		(doc: DocumentElectric): DocumentDisplay => ({
			...doc,
			created_by_name: doc.created_by_id
				? (userCacheRef.current.get(doc.created_by_id) ?? null)
				: null,
			status: doc.status ?? { state: "ready" },
		}),
		[]
	);

	// EFFECT 1: Load from API (PRIMARY source of truth)
	useEffect(() => {
		if (!searchSpaceId) {
			setLoading(false);
			return;
		}

		// Capture validated value for async closure
		const spaceId = searchSpaceId;
		const currentTypeFilter = typeFilter;

		let mounted = true;
		apiLoadedRef.current = false;

		async function loadFromApi() {
			try {
				setLoading(true);
				console.log("[useDocuments] Loading from API (source of truth):", spaceId);

				const response = await documentsApiService.getDocuments({
					queryParams: {
						search_space_id: spaceId,
						page: 0,
						page_size: -1, // Fetch all documents
						...(currentTypeFilter.length > 0 && { document_types: currentTypeFilter }),
					},
				});

				if (!mounted) return;

				populateUserCache(response.items);
				const docs = response.items.map(apiToDisplayDoc);
				setDocuments(docs);
				apiLoadedRef.current = true;
				setError(null);
				console.log("[useDocuments] API loaded", docs.length, "documents");
			} catch (err) {
				if (!mounted) return;
				console.error("[useDocuments] API load failed:", err);
				setError(err instanceof Error ? err : new Error("Failed to load documents"));
			} finally {
				if (mounted) setLoading(false);
			}
		}

		loadFromApi();

		return () => {
			mounted = false;
		};
	}, [searchSpaceId, typeFilter, populateUserCache, apiToDisplayDoc]);

	// EFFECT 2: Start Electric sync + live query for real-time updates
	useEffect(() => {
		if (!searchSpaceId || !electricClient) return;

		// Capture validated values for async closure
		const spaceId = searchSpaceId;
		const client = electricClient;
		const currentTypeFilter = typeFilter;

		let mounted = true;

		async function setupElectricRealtime() {
			// Cleanup previous subscriptions
			if (syncHandleRef.current) {
				syncHandleRef.current.unsubscribe();
				syncHandleRef.current = null;
			}
			if (liveQueryRef.current) {
				liveQueryRef.current.unsubscribe?.();
				liveQueryRef.current = null;
			}

			try {
				console.log("[useDocuments] Starting Electric sync for real-time updates");

				// Start Electric sync
				const handle = await client.syncShape({
					table: "documents",
					where: `search_space_id = ${spaceId}`,
					columns: [
						"id",
						"document_type",
						"search_space_id",
						"title",
						"created_by_id",
						"created_at",
						"status",
					],
					primaryKey: ["id"],
				});

				if (!mounted) {
					handle.unsubscribe();
					return;
				}

				syncHandleRef.current = handle;
				console.log("[useDocuments] Sync started, isUpToDate:", handle.isUpToDate);

				// Wait for initial sync (with timeout)
				if (!handle.isUpToDate && handle.initialSyncPromise) {
					// Hold the timer id so it can be cleared once the race settles;
					// otherwise a 5s timer lingers after every fast sync.
					let timeoutId: ReturnType<typeof setTimeout> | undefined;
					try {
						await Promise.race([
							handle.initialSyncPromise,
							new Promise((resolve) => {
								timeoutId = setTimeout(resolve, 5000);
							}),
						]);
					} finally {
						clearTimeout(timeoutId);
					}
					console.log("[useDocuments] Initial sync complete, isUpToDate:", handle.isUpToDate);
				}

				if (!mounted) return;

				// Set up live query
				const db = client.db as {
					live?: {
						query: <T>(
							sql: string,
							params?: (number | string)[]
						) => Promise<{
							subscribe: (cb: (result: { rows: T[] }) => void) => void;
							unsubscribe?: () => void;
						}>;
					};
				};

				if (!db.live?.query) {
					console.warn("[useDocuments] Live queries not available");
					return;
				}

				let query = `SELECT id, document_type, search_space_id, title, created_by_id, created_at, status
FROM documents
WHERE search_space_id = $1`;

				const params: (number | string)[] = [spaceId];

				// $1 is the search space id, so type placeholders start at $2
				if (currentTypeFilter.length > 0) {
					const placeholders = currentTypeFilter.map((_, i) => `$${i + 2}`).join(", ");
					query += ` AND document_type IN (${placeholders})`;
					params.push(...currentTypeFilter);
				}

				query += ` ORDER BY created_at DESC`;

				const liveQuery = await db.live.query<DocumentElectric>(query, params);

				if (!mounted) {
					liveQuery.unsubscribe?.();
					return;
				}

				console.log("[useDocuments] Live query subscribed");

				liveQuery.subscribe((result: { rows: DocumentElectric[] }) => {
					if (!mounted || !result.rows) return;

					// DEBUG: Log first few raw documents to see what's coming from Electric
					console.log("[useDocuments] Raw data sample:", result.rows.slice(0, 3));

					const validItems = result.rows.filter(isValidDocument);
					const isFullySynced = syncHandleRef.current?.isUpToDate ?? false;

					console.log(
						`[useDocuments] Live update: ${result.rows.length} raw, ${validItems.length} valid, synced: ${isFullySynced}`
					);

					// Fetch user names for new users (non-blocking)
					// NOTE(review): this request is re-issued on every live update for as long
					// as any creator id is missing from the cache.
					const unknownUserIds = validItems
						.filter(
							(doc): doc is DocumentElectric & { created_by_id: string } =>
								doc.created_by_id !== null && !userCacheRef.current.has(doc.created_by_id)
						)
						.map((doc) => doc.created_by_id);

					if (unknownUserIds.length > 0) {
						documentsApiService
							.getDocuments({
								queryParams: { search_space_id: spaceId, page: 0, page_size: 20 },
							})
							.then((response) => {
								populateUserCache(response.items);
								if (mounted) {
									setDocuments((prev) =>
										prev.map((doc) => ({
											...doc,
											created_by_name: doc.created_by_id
												? (userCacheRef.current.get(doc.created_by_id) ?? null)
												: null,
										}))
									);
								}
							})
							.catch((lookupErr) => {
								// Best-effort: names stay unresolved, but the failure is visible
								console.warn("[useDocuments] Failed to resolve creator names:", lookupErr);
							});
					}

					// Smart update logic based on sync state
					setDocuments((prev) => {
						// Don't process if API hasn't loaded yet
						if (!apiLoadedRef.current) {
							console.log("[useDocuments] Waiting for API load, skipping live update");
							return prev;
						}

						// Case 1: Live query is empty
						if (validItems.length === 0) {
							if (isFullySynced && prev.length > 0) {
								// Electric is fully synced and says 0 items - trust it (all deleted)
								console.log("[useDocuments] All documents deleted (Electric synced)");
								return [];
							}
							// Partial sync or error - keep existing
							console.log("[useDocuments] Empty live result, keeping existing");
							return prev;
						}

						// Case 2: Electric is fully synced - TRUST IT COMPLETELY (handles bulk deletes)
						if (isFullySynced) {
							const liveDocs = deduplicateAndSort(validItems.map(electricToDisplayDoc));
							console.log(
								`[useDocuments] Synced update: ${liveDocs.length} docs (was ${prev.length})`
							);
							return liveDocs;
						}

						// Case 3: Partial sync - only ADD new items, don't remove any
						const existingIds = new Set(prev.map((d) => d.id));

						// Index live rows by id once (first occurrence wins) so the update pass
						// below is a single O(n) sweep instead of a linear search per document.
						const liveById = new Map<number, DocumentElectric>();
						for (const item of validItems) {
							if (!liveById.has(item.id)) {
								liveById.set(item.id, item);
							}
						}

						// Find new items (in live but not in prev)
						const newItems = validItems
							.filter((item) => !existingIds.has(item.id))
							.map(electricToDisplayDoc);

						// Find updated items (in both, update with latest data)
						const updatedPrev = prev.map((doc) => {
							const liveItem = liveById.get(doc.id);
							return liveItem ? electricToDisplayDoc(liveItem) : doc;
						});

						if (newItems.length > 0) {
							console.log(`[useDocuments] Adding ${newItems.length} new items (partial sync)`);
							return deduplicateAndSort([...newItems, ...updatedPrev]);
						}

						return updatedPrev;
					});
				});

				liveQueryRef.current = liveQuery;
			} catch (err) {
				console.error("[useDocuments] Electric setup failed:", err);
				// Don't set error - API data is already loaded
			}
		}

		setupElectricRealtime();

		return () => {
			mounted = false;
			if (syncHandleRef.current) {
				syncHandleRef.current.unsubscribe();
				syncHandleRef.current = null;
			}
			if (liveQueryRef.current) {
				liveQueryRef.current.unsubscribe?.();
				liveQueryRef.current = null;
			}
		};
	}, [searchSpaceId, electricClient, typeFilter, electricToDisplayDoc, populateUserCache]);

	// Track previous searchSpaceId to detect actual changes
	const prevSearchSpaceIdRef = useRef<number | null>(null);

	// Reset on search space change (not on initial mount)
	useEffect(() => {
		if (prevSearchSpaceIdRef.current !== null && prevSearchSpaceIdRef.current !== searchSpaceId) {
			setDocuments([]);
			apiLoadedRef.current = false;
			userCacheRef.current.clear();
		}
		prevSearchSpaceIdRef.current = searchSpaceId;
	}, [searchSpaceId]);

	return {
		documents,
		typeCounts,
		total: documents.length,
		loading,
		error,
	};
}

View file

@ -38,10 +38,14 @@ function deduplicateAndSort(items: InboxItem[]): InboxItem[] {
/**
 * Calculate the cutoff date for sync window
 * IMPORTANT: Rounds to the start of the day (midnight UTC) to ensure stable values
 * across re-renders. Without this, millisecond differences cause multiple syncs!
 *
 * The day subtraction is also done in UTC, so the result depends only on the
 * current UTC date and not on the browser's time zone or DST transitions.
 *
 * @returns ISO-8601 timestamp for midnight UTC, SYNC_WINDOW_DAYS days before today (UTC)
 */
function getSyncCutoffDate(): string {
	const cutoff = new Date();
	// Use the UTC date fields: local-time setDate()/getDate() followed by a UTC
	// truncation can land on a different UTC day around DST changes.
	cutoff.setUTCDate(cutoff.getUTCDate() - SYNC_WINDOW_DAYS);
	// Round to start of day to prevent millisecond differences causing duplicate syncs
	cutoff.setUTCHours(0, 0, 0, 0);
	return cutoff.toISOString();
}

View file

@ -12,10 +12,21 @@
* 3. Works even if logout cleanup fails
*/
import { PGlite } from "@electric-sql/pglite";
import { PGlite, type Transaction } from "@electric-sql/pglite";
import { live } from "@electric-sql/pglite/live";
import { electricSync } from "@electric-sql/pglite-sync";
// Debug logging - only logs in development, silent in production
const IS_DEV = process.env.NODE_ENV === "development";
// Forwards its arguments to console.log only when IS_DEV is true; otherwise does nothing.
function debugLog(...args: unknown[]) {
	if (!IS_DEV) {
		return;
	}
	console.log(...args);
}
// Forwards its arguments to console.warn only when IS_DEV is true; otherwise does nothing.
function debugWarn(...args: unknown[]) {
	if (!IS_DEV) {
		return;
	}
	console.warn(...args);
}
// Types
export interface ElectricClient {
db: PGlite;
@ -56,7 +67,14 @@ const pendingSyncs = new Map<string, Promise<SyncHandle>>();
// v2: user-specific database architecture
// v3: consistent cutoff date for sync+queries, visibility refresh support
// v4: heartbeat-based stale notification detection with updated_at tracking
const SYNC_VERSION = 4;
// v5: fixed duplicate key errors (root cause: unstable cutoff dates in use-inbox.ts)
// - added onMustRefetch handler for server-side refetch scenarios
// - fixed getSyncCutoffDate to use stable midnight UTC timestamps
// v6: real-time documents table - added title and created_by_id columns for live document display
// v7: removed use-documents-electric.ts - consolidated to single documents sync to prevent conflicts
// v8: added status column for real-time document processing status (ready/processing/failed)
// v9: added pending state for accurate document queue visibility
const SYNC_VERSION = 11;
// Database name prefix for identifying SurfSense databases
const DB_PREFIX = "surfsense-";
@ -77,7 +95,7 @@ function getDbName(userId: string): string {
}
/**
* Clean up databases from OTHER users (not the current user)
* Clean up databases from OTHER users AND old versions
* This is called on login to ensure clean state
*/
async function cleanupOtherUserDatabases(currentUserId: string): Promise<void> {
@ -85,6 +103,10 @@ async function cleanupOtherUserDatabases(currentUserId: string): Promise<void> {
return;
}
// The exact database identifier we want to keep (current user + current version)
// Format: "surfsense-{userId}-v{version}"
const currentDbIdentifier = `${DB_PREFIX}${currentUserId}-v${SYNC_VERSION}`;
try {
// Try to list all databases (not supported in all browsers)
if (typeof window.indexedDB.databases === "function") {
@ -95,26 +117,27 @@ async function cleanupOtherUserDatabases(currentUserId: string): Promise<void> {
if (!dbName) continue;
// Check if this is a SurfSense database
if (dbName.startsWith(DB_PREFIX) || dbName.includes("surfsense")) {
// Don't delete current user's database
if (dbName.includes(currentUserId)) {
console.log(`[Electric] Keeping current user's database: ${dbName}`);
if (dbName.includes("surfsense")) {
// Check if this is the current database
// PGlite stores with "/pglite/" prefix, so we check if the name ENDS WITH our identifier
if (dbName.endsWith(currentDbIdentifier)) {
debugLog(`[Electric] Keeping current database: ${dbName}`);
continue;
}
// Delete databases from other users
// Delete ALL other databases (other users OR old versions of current user)
try {
console.log(`[Electric] Deleting stale database: ${dbName}`);
debugLog(`[Electric] Deleting stale database: ${dbName}`);
window.indexedDB.deleteDatabase(dbName);
} catch (deleteErr) {
console.warn(`[Electric] Failed to delete database ${dbName}:`, deleteErr);
debugWarn(`[Electric] Failed to delete database ${dbName}:`, deleteErr);
}
}
}
}
} catch (err) {
// indexedDB.databases() not supported - that's okay, login cleanup is best-effort
console.warn("[Electric] Could not enumerate databases for cleanup:", err);
debugWarn("[Electric] Could not enumerate databases for cleanup:", err);
}
}
@ -140,7 +163,7 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
// If initialized for a different user, close the old client first
if (electricClient && currentUserId !== userId) {
console.log(`[Electric] User changed from ${currentUserId} to ${userId}, reinitializing...`);
debugLog(`[Electric] User changed from ${currentUserId} to ${userId}, reinitializing...`);
await cleanupElectric();
}
@ -155,12 +178,12 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
initPromise = (async () => {
try {
// STEP 1: Clean up databases from other users (login-time cleanup)
console.log("[Electric] Cleaning up databases from other users...");
debugLog("[Electric] Cleaning up databases from other users...");
await cleanupOtherUserDatabases(userId);
// STEP 2: Create user-specific PGlite database
const dbName = getDbName(userId);
console.log(`[Electric] Initializing database: ${dbName}`);
debugLog(`[Electric] Initializing database: ${dbName}`);
const db = await PGlite.create({
dataDir: dbName,
@ -216,18 +239,22 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
`);
// Create the documents table schema in PGlite
// Only sync minimal fields needed for type counts: id, document_type, search_space_id
// Sync columns needed for real-time table display (lightweight - no content/metadata)
await db.exec(`
CREATE TABLE IF NOT EXISTS documents (
id INTEGER PRIMARY KEY,
search_space_id INTEGER NOT NULL,
document_type TEXT NOT NULL,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
title TEXT NOT NULL DEFAULT '',
created_by_id TEXT,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
status JSONB DEFAULT '{"state": "ready"}'::jsonb
);
CREATE INDEX IF NOT EXISTS idx_documents_search_space_id ON documents(search_space_id);
CREATE INDEX IF NOT EXISTS idx_documents_type ON documents(document_type);
CREATE INDEX IF NOT EXISTS idx_documents_search_space_type ON documents(search_space_id, document_type);
CREATE INDEX IF NOT EXISTS idx_documents_status ON documents((status->>'state'));
`);
await db.exec(`
@ -290,14 +317,14 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
// Check if we already have an active sync for this shape (memory optimization)
const existingHandle = activeSyncHandles.get(cacheKey);
if (existingHandle) {
console.log(`[Electric] Reusing existing sync handle for: ${cacheKey}`);
debugLog(`[Electric] Reusing existing sync handle for: ${cacheKey}`);
return existingHandle;
}
// Check if there's already a pending sync for this shape (prevent race condition)
const pendingSync = pendingSyncs.get(cacheKey);
if (pendingSync) {
console.log(`[Electric] Waiting for pending sync to complete: ${cacheKey}`);
debugLog(`[Electric] Waiting for pending sync to complete: ${cacheKey}`);
return pendingSync;
}
@ -323,7 +350,7 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
if (singleQuoteCount % 2 !== 0) {
// Odd number of quotes means unterminated string literal
console.warn("Where clause has unmatched quotes, fixing:", where);
debugWarn("Where clause has unmatched quotes, fixing:", where);
// Add closing quote at the end
validatedWhere = `${where}'`;
params.where = validatedWhere;
@ -337,15 +364,15 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
if (columns) params.columns = columns.join(",");
console.log("[Electric] Syncing shape with params:", params);
console.log("[Electric] Electric URL:", `${electricUrl}/v1/shape`);
console.log("[Electric] Where clause:", where, "Validated:", validatedWhere);
debugLog("[Electric] Syncing shape with params:", params);
debugLog("[Electric] Electric URL:", `${electricUrl}/v1/shape`);
debugLog("[Electric] Where clause:", where, "Validated:", validatedWhere);
try {
// Debug: Test Electric SQL connection directly first (DEV ONLY - skipped in production)
if (process.env.NODE_ENV === "development") {
const testUrl = `${electricUrl}/v1/shape?table=${table}&offset=-1${validatedWhere ? `&where=${encodeURIComponent(validatedWhere)}` : ""}`;
console.log("[Electric] Testing Electric SQL directly:", testUrl);
debugLog("[Electric] Testing Electric SQL directly:", testUrl);
try {
const testResponse = await fetch(testUrl);
const testHeaders = {
@ -353,9 +380,9 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
offset: testResponse.headers.get("electric-offset"),
upToDate: testResponse.headers.get("electric-up-to-date"),
};
console.log("[Electric] Direct Electric SQL response headers:", testHeaders);
debugLog("[Electric] Direct Electric SQL response headers:", testHeaders);
const testData = await testResponse.json();
console.log(
debugLog(
"[Electric] Direct Electric SQL data count:",
Array.isArray(testData) ? testData.length : "not array",
testData
@ -396,14 +423,14 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
// Shorter timeout (5 seconds) as fallback
setTimeout(() => {
if (!syncResolved) {
console.warn(
debugWarn(
`[Electric] ⚠️ Sync timeout for ${table} - checking isUpToDate one more time...`
);
// Check isUpToDate one more time before resolving
// This will be checked after shape is created
setTimeout(() => {
if (!syncResolved) {
console.warn(
debugWarn(
`[Electric] ⚠️ Sync timeout for ${table} - resolving anyway after 5s`
);
resolveInitialSync();
@ -413,7 +440,22 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
}, 5000);
});
// Include userId in shapeKey for user-specific sync state
// ROOT CAUSE FIX: The duplicate key errors were caused by unstable cutoff dates
// in use-inbox.ts generating different sync keys on each render.
// That's now fixed (rounded to midnight UTC in getSyncCutoffDate).
// We can safely use shapeKey for fast incremental sync.
const shapeKey = `${userId}_v${SYNC_VERSION}_${table}_${where?.replace(/[^a-zA-Z0-9]/g, "_") || "all"}`;
// Type assertion to PGlite with electric extension
const pgWithElectric = db as unknown as {
electric: {
syncShapeToTable: (
config: Record<string, unknown>
) => Promise<{ unsubscribe: () => void; isUpToDate: boolean; stream: unknown }>;
};
};
const shapeConfig = {
shape: {
url: `${electricUrl}/v1/shape`,
@ -425,9 +467,9 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
},
table,
primaryKey,
shapeKey: `${userId}_v${SYNC_VERSION}_${table}_${where?.replace(/[^a-zA-Z0-9]/g, "_") || "all"}`, // User-specific versioned key
shapeKey, // Re-enabled for fast incremental sync (root cause in use-inbox.ts is fixed)
onInitialSync: () => {
console.log(
debugLog(
`[Electric] ✅ Initial sync complete for ${table} - data should now be in PGlite`
);
resolveInitialSync();
@ -440,21 +482,37 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
);
rejectInitialSync(error);
},
// Must-refetch hook: wipe this shape's locally synced rows inside the supplied
// transaction before Electric re-inserts them from scratch. Without the wipe,
// the re-inserted rows collide with stale ones ("duplicate key" errors) once
// the shape has been invalidated.
onMustRefetch: async (tx: Transaction) => {
	debugLog(`[Electric] ⚠️ Must refetch triggered for ${table} - clearing existing data`);
	try {
		if (!validatedWhere) {
			// No filter: this shape covers the whole table, so empty it entirely.
			await tx.exec(`DELETE FROM ${table}`);
			debugLog(`[Electric] 🗑️ Cleared all rows from ${table}`);
			return;
		}
		// Reuse the shape's validated WHERE clause verbatim as the DELETE predicate
		// so that only rows belonging to this shape are removed.
		await tx.exec(`DELETE FROM ${table} WHERE ${validatedWhere}`);
		debugLog(`[Electric] 🗑️ Cleared ${table} rows matching: ${validatedWhere}`);
	} catch (cleanupError) {
		// Always surface the failure, then propagate it so Electric can handle the error.
		console.error(`[Electric] ❌ Failed to clear ${table} during must-refetch:`, cleanupError);
		throw cleanupError;
	}
},
};
console.log(
"[Electric] syncShapeToTable config:",
JSON.stringify(shapeConfig, null, 2)
);
// Type assertion to PGlite with electric extension
const pgWithElectric = db as PGlite & {
electric: {
syncShapeToTable: (
config: typeof shapeConfig
) => Promise<{ unsubscribe: () => void; isUpToDate: boolean; stream: unknown }>;
};
};
debugLog("[Electric] syncShapeToTable config:", JSON.stringify(shapeConfig, null, 2));
let shape: { unsubscribe: () => void; isUpToDate: boolean; stream: unknown };
try {
@ -464,7 +522,7 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
const errorMessage =
syncError instanceof Error ? syncError.message : String(syncError);
if (errorMessage.includes("Already syncing")) {
console.warn(
debugWarn(
`[Electric] Already syncing ${table}, waiting for existing sync to settle...`
);
@ -474,12 +532,12 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
// Check if an active handle now exists (another sync might have completed)
const existingHandle = activeSyncHandles.get(cacheKey);
if (existingHandle) {
console.log(`[Electric] Found existing handle after waiting: ${cacheKey}`);
debugLog(`[Electric] Found existing handle after waiting: ${cacheKey}`);
return existingHandle;
}
// Retry once after waiting
console.log(`[Electric] Retrying sync for ${table}...`);
debugLog(`[Electric] Retrying sync for ${table}...`);
try {
shape = await pgWithElectric.electric.syncShapeToTable(shapeConfig);
} catch (retryError) {
@ -487,12 +545,10 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
retryError instanceof Error ? retryError.message : String(retryError);
if (retryMessage.includes("Already syncing")) {
// Still syncing - create a placeholder handle that indicates the table is being synced
console.warn(
`[Electric] ${table} still syncing, creating placeholder handle`
);
debugWarn(`[Electric] ${table} still syncing, creating placeholder handle`);
const placeholderHandle: SyncHandle = {
unsubscribe: () => {
console.log(`[Electric] Placeholder unsubscribe for: ${cacheKey}`);
debugLog(`[Electric] Placeholder unsubscribe for: ${cacheKey}`);
activeSyncHandles.delete(cacheKey);
},
get isUpToDate() {
@ -516,7 +572,7 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
}
// Log the actual shape result structure
console.log("[Electric] Shape sync result (initial):", {
debugLog("[Electric] Shape sync result (initial):", {
hasUnsubscribe: typeof shape?.unsubscribe === "function",
isUpToDate: shape?.isUpToDate,
hasStream: !!shape?.stream,
@ -525,7 +581,7 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
// Recommended Approach Step 1: Check isUpToDate immediately
if (shape.isUpToDate) {
console.log(
debugLog(
`[Electric] ✅ Sync already up-to-date for ${table} (resuming from previous state)`
);
resolveInitialSync();
@ -533,7 +589,7 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
// Recommended Approach Step 2: Subscribe to stream and watch for "up-to-date" message
if (shape?.stream) {
const stream = shape.stream as any;
console.log("[Electric] Shape stream details:", {
debugLog("[Electric] Shape stream details:", {
shapeHandle: stream?.shapeHandle,
lastOffset: stream?.lastOffset,
isUpToDate: stream?.isUpToDate,
@ -546,14 +602,14 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
// NOTE: We keep this subscription active - don't unsubscribe!
// The stream is what Electric SQL uses for real-time updates
if (typeof stream?.subscribe === "function") {
console.log(
debugLog(
"[Electric] Subscribing to shape stream to watch for up-to-date message..."
);
// Subscribe but don't store unsubscribe - we want it to stay active
stream.subscribe((messages: unknown[]) => {
// Continue receiving updates even after sync is resolved
if (!syncResolved) {
console.log(
debugLog(
"[Electric] 🔵 Shape stream received messages:",
messages?.length || 0
);
@ -570,14 +626,14 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
(typeof msg === "object" && "up-to-date" in msg)
) {
if (!syncResolved) {
console.log(`[Electric] ✅ Received up-to-date message for ${table}`);
debugLog(`[Electric] ✅ Received up-to-date message for ${table}`);
resolveInitialSync();
}
// Continue listening for real-time updates - don't return!
}
}
if (!syncResolved && messages.length > 0) {
console.log(
debugLog(
"[Electric] First message:",
JSON.stringify(messages[0], null, 2)
);
@ -586,16 +642,14 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
// Also check stream's isUpToDate property after receiving messages
if (!syncResolved && stream?.isUpToDate) {
console.log(`[Electric] ✅ Stream isUpToDate is true for ${table}`);
debugLog(`[Electric] ✅ Stream isUpToDate is true for ${table}`);
resolveInitialSync();
}
});
// Also check stream's isUpToDate property immediately
if (stream?.isUpToDate) {
console.log(
`[Electric] ✅ Stream isUpToDate is true immediately for ${table}`
);
debugLog(`[Electric] ✅ Stream isUpToDate is true immediately for ${table}`);
resolveInitialSync();
}
}
@ -608,9 +662,7 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
}
if (shape.isUpToDate || stream?.isUpToDate) {
console.log(
`[Electric] ✅ Sync completed (detected via polling) for ${table}`
);
debugLog(`[Electric] ✅ Sync completed (detected via polling) for ${table}`);
clearInterval(pollInterval);
resolveInitialSync();
}
@ -621,7 +673,7 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
clearInterval(pollInterval);
});
} else {
console.warn(
debugWarn(
`[Electric] ⚠️ No stream available for ${table}, relying on callback and timeout`
);
}
@ -630,7 +682,7 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
// Create the sync handle with proper cleanup
const syncHandle: SyncHandle = {
unsubscribe: () => {
console.log(`[Electric] Unsubscribing from: ${cacheKey}`);
debugLog(`[Electric] Unsubscribing from: ${cacheKey}`);
// Remove from cache first
activeSyncHandles.delete(cacheKey);
// Then unsubscribe from the shape
@ -648,7 +700,7 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
// Cache the sync handle for reuse (memory optimization)
activeSyncHandles.set(cacheKey, syncHandle);
console.log(
debugLog(
`[Electric] Cached sync handle for: ${cacheKey} (total cached: ${activeSyncHandles.size})`
);
@ -660,7 +712,7 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
const response = await fetch(`${electricUrl}/v1/shape?table=${table}&offset=-1`, {
method: "GET",
});
console.log(
debugLog(
"[Electric] Electric SQL server response:",
response.status,
response.statusText
@ -682,14 +734,14 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
// Clean up the pending sync when done (whether success or failure)
syncPromise.finally(() => {
pendingSyncs.delete(cacheKey);
console.log(`[Electric] Pending sync removed for: ${cacheKey}`);
debugLog(`[Electric] Pending sync removed for: ${cacheKey}`);
});
return syncPromise;
},
};
console.log(`[Electric] ✅ Initialized successfully for user: ${userId}`);
debugLog(`[Electric] ✅ Initialized successfully for user: ${userId}`);
return electricClient;
} catch (error) {
console.error("[Electric] Failed to initialize:", error);
@ -715,10 +767,10 @@ export async function cleanupElectric(): Promise<void> {
}
const userIdToClean = currentUserId;
console.log(`[Electric] Cleaning up for user: ${userIdToClean}`);
debugLog(`[Electric] Cleaning up for user: ${userIdToClean}`);
// Unsubscribe from all active sync handles first (memory cleanup)
console.log(`[Electric] Unsubscribing from ${activeSyncHandles.size} active sync handles`);
debugLog(`[Electric] Unsubscribing from ${activeSyncHandles.size} active sync handles`);
// Copy keys to array to avoid mutation during iteration
const handleKeys = Array.from(activeSyncHandles.keys());
for (const key of handleKeys) {
@ -727,7 +779,7 @@ export async function cleanupElectric(): Promise<void> {
try {
handle.unsubscribe();
} catch (err) {
console.warn(`[Electric] Failed to unsubscribe from ${key}:`, err);
debugWarn(`[Electric] Failed to unsubscribe from ${key}:`, err);
}
}
}
@ -738,7 +790,7 @@ export async function cleanupElectric(): Promise<void> {
try {
// Close the PGlite database connection
await electricClient.db.close();
console.log("[Electric] Database closed");
debugLog("[Electric] Database closed");
} catch (error) {
console.error("[Electric] Error closing database:", error);
}
@ -754,13 +806,13 @@ export async function cleanupElectric(): Promise<void> {
try {
const dbName = `${DB_PREFIX}${userIdToClean}-v${SYNC_VERSION}`;
window.indexedDB.deleteDatabase(dbName);
console.log(`[Electric] Deleted database: ${dbName}`);
debugLog(`[Electric] Deleted database: ${dbName}`);
} catch (err) {
console.warn("[Electric] Failed to delete database:", err);
debugWarn("[Electric] Failed to delete database:", err);
}
}
console.log("[Electric] Cleanup complete");
debugLog("[Electric] Cleanup complete");
}
/**

View file

@ -308,6 +308,7 @@
"no_rows_selected": "No rows selected",
"delete_success_count": "Successfully deleted {count} document(s)",
"delete_partial_failed": "Some documents could not be deleted",
"delete_success": "Document deleted successfully",
"delete_error": "Error deleting documents",
"filter_by_title": "Filter by title...",
"bulk_delete": "Delete Selected",
@ -328,7 +329,6 @@
"filter_placeholder": "Filter by title...",
"rows_per_page": "Rows per page",
"refresh": "Refresh",
"refresh_success": "Documents refreshed",
"upload_documents": "Upload Documents",
"create_shared_note": "Create Shared Note",
"processing_documents": "Processing documents...",

View file

@ -313,7 +313,6 @@
"filter_placeholder": "按标题筛选...",
"rows_per_page": "每页行数",
"refresh": "刷新",
"refresh_success": "文档已刷新",
"upload_documents": "上传文档",
"create_shared_note": "创建共享笔记",
"processing_documents": "正在处理文档...",