diff --git a/surfsense_backend/alembic/versions/66_add_notifications_table_and_electric_replication.py b/surfsense_backend/alembic/versions/66_add_notifications_table_and_electric_replication.py index dc25a1edd..182bf981c 100644 --- a/surfsense_backend/alembic/versions/66_add_notifications_table_and_electric_replication.py +++ b/surfsense_backend/alembic/versions/66_add_notifications_table_and_electric_replication.py @@ -17,13 +17,6 @@ from collections.abc import Sequence from alembic import context, op -# Get Electric SQL user credentials from env.py configuration -_config = context.config -ELECTRIC_DB_USER = _config.get_main_option("electric_db_user", "electric") -ELECTRIC_DB_PASSWORD = _config.get_main_option( - "electric_db_password", "electric_password" -) - # revision identifiers, used by Alembic. revision: str = "66" down_revision: str | None = "65" @@ -31,8 +24,21 @@ branch_labels: str | Sequence[str] | None = None depends_on: str | Sequence[str] | None = None +def _get_electric_credentials() -> tuple[str, str]: + """Get Electric SQL credentials from Alembic config. + + Must be called inside upgrade()/downgrade(), not at module level, + because context.config is only available during migration execution. + """ + _config = context.config + user = _config.get_main_option("electric_db_user", "electric") + password = _config.get_main_option("electric_db_password", "electric_password") + return user, password + + def upgrade() -> None: """Upgrade schema - add notifications table and Electric SQL replication.""" + electric_db_user, electric_db_password = _get_electric_credentials() # Create notifications table op.execute( """ @@ -74,8 +80,8 @@ def upgrade() -> None: f""" DO $$ BEGIN - IF NOT EXISTS (SELECT FROM pg_user WHERE usename = '{ELECTRIC_DB_USER}') THEN - CREATE USER {ELECTRIC_DB_USER} WITH REPLICATION PASSWORD '{ELECTRIC_DB_PASSWORD}'; + IF NOT EXISTS (SELECT FROM pg_user WHERE usename = '{electric_db_user}') THEN + CREATE USER {electric_db_user} WITH REPLICATION PASSWORD '{electric_db_password}'; END IF; END $$; @@ -89,19 +95,19 @@ def upgrade() -> None: DECLARE db_name TEXT := current_database(); BEGIN - EXECUTE format('GRANT CONNECT ON DATABASE %I TO {ELECTRIC_DB_USER}', db_name); + EXECUTE format('GRANT CONNECT ON DATABASE %I TO {electric_db_user}', db_name); END $$; """ ) - op.execute(f"GRANT USAGE ON SCHEMA public TO {ELECTRIC_DB_USER};") - op.execute(f"GRANT SELECT ON ALL TABLES IN SCHEMA public TO {ELECTRIC_DB_USER};") - op.execute(f"GRANT SELECT ON ALL SEQUENCES IN SCHEMA public TO {ELECTRIC_DB_USER};") + op.execute(f"GRANT USAGE ON SCHEMA public TO {electric_db_user};") + op.execute(f"GRANT SELECT ON ALL TABLES IN SCHEMA public TO {electric_db_user};") + op.execute(f"GRANT SELECT ON ALL SEQUENCES IN SCHEMA public TO {electric_db_user};") op.execute( - f"ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO {ELECTRIC_DB_USER};" + f"ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO {electric_db_user};" ) op.execute( - f"ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON SEQUENCES TO {ELECTRIC_DB_USER};" + f"ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON SEQUENCES TO {electric_db_user};" ) # Create the publication if not exists diff --git a/surfsense_backend/alembic/versions/94_add_access_token_to_image_generations.py b/surfsense_backend/alembic/versions/94_add_access_token_to_image_generations.py index 09bea2c19..92f027e00 100644 --- a/surfsense_backend/alembic/versions/94_add_access_token_to_image_generations.py +++ b/surfsense_backend/alembic/versions/94_add_access_token_to_image_generations.py @@ -10,8 +10,6 @@ SECRET_KEY rotation. from collections.abc import Sequence -import sqlalchemy as sa - from alembic import op # revision identifiers, used by Alembic. @@ -23,17 +21,45 @@ depends_on: str | Sequence[str] | None = None def upgrade() -> None: # Add access_token column (nullable so existing rows are unaffected) - op.add_column( - "image_generations", - sa.Column("access_token", sa.String(64), nullable=True), - ) - op.create_index( - "ix_image_generations_access_token", - "image_generations", - ["access_token"], + # Guard: skip entirely if image_generations table doesn't exist + op.execute( + """ + DO $$ + BEGIN + IF EXISTS ( + SELECT 1 FROM information_schema.tables + WHERE table_name = 'image_generations' + ) THEN + -- Add column if not exists + IF NOT EXISTS ( + SELECT 1 FROM information_schema.columns + WHERE table_name = 'image_generations' AND column_name = 'access_token' + ) THEN + ALTER TABLE image_generations + ADD COLUMN access_token VARCHAR(64); + END IF; + + -- Create index if not exists + CREATE INDEX IF NOT EXISTS ix_image_generations_access_token + ON image_generations (access_token); + END IF; + END$$; + """ ) def downgrade() -> None: - op.drop_index("ix_image_generations_access_token", table_name="image_generations") - op.drop_column("image_generations", "access_token") + op.execute("DROP INDEX IF EXISTS ix_image_generations_access_token") + op.execute( + """ + DO $$ + BEGIN + IF EXISTS ( + SELECT 1 FROM information_schema.columns + WHERE table_name = 'image_generations' AND column_name = 'access_token' + ) THEN + ALTER TABLE image_generations DROP COLUMN access_token; + END IF; + END$$; + """ + ) diff --git a/surfsense_backend/alembic/versions/95_add_document_status_column.py b/surfsense_backend/alembic/versions/95_add_document_status_column.py new file mode 100644 index 000000000..f5a6fa65d --- /dev/null +++ b/surfsense_backend/alembic/versions/95_add_document_status_column.py @@ -0,0 +1,77 @@ +"""Add status column to documents table for per-document processing status + +Revision ID: 95 +Revises: 94 +Create Date: 2026-02-05 + +Changes: +1. Add status column (JSONB) to documents table +2. Default value is {"state": "ready"} for backward compatibility +3. Existing documents are set to ready status +4. Index created for efficient status filtering +""" + +from collections.abc import Sequence + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "95" +down_revision: str | None = "94" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + """Add status column to documents with default ready state.""" + + # 1. Add status column with default value for new rows + op.execute( + """ + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM information_schema.columns + WHERE table_name = 'documents' AND column_name = 'status' + ) THEN + ALTER TABLE documents + ADD COLUMN status JSONB NOT NULL DEFAULT '{"state": "ready"}'::jsonb; + END IF; + END$$; + """ + ) + + # 2. Create index on status for efficient filtering by state + op.execute( + """ + CREATE INDEX IF NOT EXISTS ix_documents_status + ON documents ((status->>'state')); + """ + ) + + +def downgrade() -> None: + """Remove status column from documents.""" + + # Drop index + op.execute( + """ + DROP INDEX IF EXISTS ix_documents_status; + """ + ) + + # Drop column + op.execute( + """ + DO $$ + BEGIN + IF EXISTS ( + SELECT 1 FROM information_schema.columns + WHERE table_name = 'documents' AND column_name = 'status' + ) THEN + ALTER TABLE documents + DROP COLUMN status; + END IF; + END$$; + """ + ) diff --git a/surfsense_backend/app/connectors/composio_gmail_connector.py b/surfsense_backend/app/connectors/composio_gmail_connector.py index 1964a4d45..4764a0a41 100644 --- a/surfsense_backend/app/connectors/composio_gmail_connector.py +++ b/surfsense_backend/app/connectors/composio_gmail_connector.py @@ -16,11 +16,15 @@ from sqlalchemy.orm import selectinload from app.config import config from app.connectors.composio_connector import ComposioConnector -from app.db import Document, DocumentType +from app.db import Document, DocumentStatus, DocumentType from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService -from app.tasks.connector_indexers.base import calculate_date_range +from app.tasks.connector_indexers.base import ( + calculate_date_range, + check_duplicate_document_by_hash, + safe_set_chunks, +) from app.utils.document_converters import ( create_document_chunks, generate_content_hash, @@ -206,26 +210,24 @@ class ComposioGmailConnector(ComposioConnector): # ============ Indexer Functions ============ -async def _process_gmail_message_batch( +async def _analyze_gmail_messages_phase1( session: AsyncSession, messages: list[dict[str, Any]], composio_connector: ComposioGmailConnector, connector_id: int, search_space_id: int, user_id: str, - total_documents_indexed: int = 0, -) -> tuple[int, int]: +) -> tuple[list[dict[str, Any]], int, int]: """ - Process a batch of Gmail messages and index them. - - Args: - total_documents_indexed: Running total of documents indexed so far (for batch commits). + Phase 1: Analyze all messages, create pending documents. + Makes ALL documents visible in the UI immediately with pending status. Returns: - Tuple of (documents_indexed, documents_skipped) + Tuple of (messages_to_process, documents_skipped, duplicate_content_count) """ - documents_indexed = 0 + messages_to_process = [] documents_skipped = 0 + duplicate_content_count = 0 for message in messages: try: @@ -235,11 +237,7 @@ async def _process_gmail_message_batch( documents_skipped += 1 continue - # Composio's GMAIL_FETCH_EMAILS already returns full message content - # No need for a separate detail API call - # Extract message info from Composio response - # Composio structure: messageId, messageText, messageTimestamp, payload.headers, labelIds payload = message.get("payload", {}) headers = payload.get("headers", []) @@ -262,7 +260,7 @@ async def _process_gmail_message_batch( message ) - # Check for empty content (defensive parsing per Composio best practices) + # Check for empty content if not markdown_content.strip(): logger.warning(f"Skipping Gmail message with no content: {subject}") documents_skipped += 1 @@ -280,102 +278,58 @@ async def _process_gmail_message_batch( session, unique_identifier_hash ) - # Get label IDs from Composio response + # Get label IDs and thread_id from Composio response label_ids = message.get("labelIds", []) - # Extract thread_id if available (for consistency with non-Composio implementation) thread_id = message.get("threadId", "") or message.get("thread_id", "") if existing_document: if existing_document.content_hash == content_hash: + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): + existing_document.status = DocumentStatus.ready() documents_skipped += 1 continue - # Update existing - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { + # Queue existing document for update (will be set to processing in Phase 2) + messages_to_process.append( + { + "document": existing_document, + "is_new": False, + "markdown_content": markdown_content, + "content_hash": content_hash, "message_id": message_id, "thread_id": thread_id, "subject": subject, "sender": sender, - "document_type": "Gmail Message (Composio)", + "date_str": date_str, + "label_ids": label_ids, } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = ( - f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}" - ) - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(markdown_content) - - existing_document.title = f"Gmail: {subject}" - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "message_id": message_id, - "thread_id": thread_id, - "subject": subject, - "sender": sender, - "date": date_str, - "labels": label_ids, - "connector_id": connector_id, - "source": "composio", - } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - - # Batch commit every 10 documents - current_total = total_documents_indexed + documents_indexed - if current_total % 10 == 0: - logger.info( - f"Committing batch: {current_total} Gmail messages processed so far" - ) - await session.commit() + ) continue - # Create new document - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "message_id": message_id, - "thread_id": thread_id, - "subject": subject, - "sender": sender, - "document_type": "Gmail Message (Composio)", - } - summary_content, summary_embedding = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = ( - f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}" - ) - summary_embedding = config.embedding_model_instance.embed( - summary_content + # Document doesn't exist by unique_identifier_hash + # Check if a document with the same content_hash exists (from standard connector) + with session.no_autoflush: + duplicate_by_content = await check_duplicate_document_by_hash( + session, content_hash ) - chunks = await create_document_chunks(markdown_content) + if duplicate_by_content: + logger.info( + f"Message {subject} already indexed by another connector " + f"(existing document ID: {duplicate_by_content.id}, " + f"type: {duplicate_by_content.document_type}). Skipping." + ) + duplicate_content_count += 1 + documents_skipped += 1 + continue + # Create new document with PENDING status (visible in UI immediately) document = Document( search_space_id=search_space_id, - title=f"Gmail: {subject}", + title=subject, document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["gmail"]), document_metadata={ "message_id": message_id, @@ -388,39 +342,140 @@ async def _process_gmail_message_batch( "toolkit_id": "gmail", "source": "composio", }, - content=summary_content, - content_hash=content_hash, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector_id, ) session.add(document) + + messages_to_process.append( + { + "document": document, + "is_new": True, + "markdown_content": markdown_content, + "content_hash": content_hash, + "message_id": message_id, + "thread_id": thread_id, + "subject": subject, + "sender": sender, + "date_str": date_str, + "label_ids": label_ids, + } + ) + + except Exception as e: + logger.error(f"Error in Phase 1 for message: {e!s}", exc_info=True) + documents_skipped += 1 + continue + + return messages_to_process, documents_skipped, duplicate_content_count + + +async def _process_gmail_messages_phase2( + session: AsyncSession, + messages_to_process: list[dict[str, Any]], + connector_id: int, + search_space_id: int, + user_id: str, + on_heartbeat_callback: HeartbeatCallbackType | None = None, +) -> tuple[int, int]: + """ + Phase 2: Process each document one by one. + Each document transitions: pending → processing → ready/failed + + Returns: + Tuple of (documents_indexed, documents_failed) + """ + documents_indexed = 0 + documents_failed = 0 + last_heartbeat_time = time.time() + + for item in messages_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item["document"] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (LLM, embeddings, chunks) + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata_for_summary = { + "message_id": item["message_id"], + "thread_id": item["thread_id"], + "subject": item["subject"], + "sender": item["sender"], + "document_type": "Gmail Message (Composio)", + } + summary_content, summary_embedding = await generate_document_summary( + item["markdown_content"], user_llm, document_metadata_for_summary + ) + else: + summary_content = f"Gmail: {item['subject']}\n\nFrom: {item['sender']}\nDate: {item['date_str']}" + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(item["markdown_content"]) + + # Update document to READY with actual content + document.title = item["subject"] + document.content = summary_content + document.content_hash = item["content_hash"] + document.embedding = summary_embedding + document.document_metadata = { + "message_id": item["message_id"], + "thread_id": item["thread_id"], + "subject": item["subject"], + "sender": item["sender"], + "date": item["date_str"], + "labels": item["label_ids"], + "connector_id": connector_id, + "source": "composio", + } + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + documents_indexed += 1 - # Batch commit every 10 documents - current_total = total_documents_indexed + documents_indexed - if current_total % 10 == 0: + # Batch commit every 10 documents (for ready status updates) + if documents_indexed % 10 == 0: logger.info( - f"Committing batch: {current_total} Gmail messages processed so far" + f"Committing batch: {documents_indexed} Gmail messages processed so far" ) await session.commit() except Exception as e: logger.error(f"Error processing Gmail message: {e!s}", exc_info=True) - documents_skipped += 1 - # Rollback on error to avoid partial state (per Composio best practices) + # Mark document as failed with reason (visible in UI) try: - await session.rollback() - except Exception as rollback_error: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: logger.error( - f"Error during rollback: {rollback_error!s}", exc_info=True + f"Failed to update document status to failed: {status_error}" ) + documents_failed += 1 continue - return documents_indexed, documents_skipped + return documents_indexed, documents_failed async def index_composio_gmail( @@ -437,7 +492,7 @@ async def index_composio_gmail( max_items: int = 1000, on_heartbeat_callback: HeartbeatCallbackType | None = None, ) -> tuple[int, str]: - """Index Gmail messages via Composio with pagination and incremental processing.""" + """Index Gmail messages via Composio with real-time document status updates.""" try: composio_connector = ComposioGmailConnector(session, connector_id) @@ -448,14 +503,10 @@ async def index_composio_gmail( end_date = None # Use provided dates directly if both are provided, otherwise calculate from last_indexed_at - # This ensures user-selected dates are respected (matching non-Composio Gmail connector behavior) if start_date is not None and end_date is not None: - # User provided both dates - use them directly start_date_str = start_date end_date_str = end_date else: - # Calculate date range with defaults (uses last_indexed_at or 365 days back) - # This ensures indexing works even when user doesn't specify dates start_date_str, end_date_str = calculate_date_range( connector, start_date, end_date, default_days_back=365 ) @@ -473,48 +524,32 @@ async def index_composio_gmail( f"(start_date={start_date_str}, end_date={end_date_str})" ) - # Use smaller batch size to avoid 413 payload too large errors + await task_logger.log_task_progress( + log_entry, + f"Fetching Gmail messages via Composio for connector {connector_id}", + {"stage": "fetching_messages"}, + ) + + # ======================================================================= + # FETCH ALL MESSAGES FIRST + # ======================================================================= batch_size = 50 page_token = None - total_documents_indexed = 0 - total_documents_skipped = 0 - total_messages_fetched = 0 - result_size_estimate = None # Will be set from first API response + all_messages = [] + result_size_estimate = None last_heartbeat_time = time.time() - while total_messages_fetched < max_items: - # Send heartbeat periodically to indicate task is still alive + while len(all_messages) < max_items: + # Send heartbeat periodically if on_heartbeat_callback: current_time = time.time() if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: - await on_heartbeat_callback(total_documents_indexed) + await on_heartbeat_callback(len(all_messages)) last_heartbeat_time = current_time - # Calculate how many messages to fetch in this batch - remaining = max_items - total_messages_fetched + remaining = max_items - len(all_messages) current_batch_size = min(batch_size, remaining) - # Use result_size_estimate if available, otherwise fall back to max_items - estimated_total = ( - result_size_estimate if result_size_estimate is not None else max_items - ) - # Cap estimated_total at max_items to avoid showing misleading progress - estimated_total = min(estimated_total, max_items) - - await task_logger.log_task_progress( - log_entry, - f"Fetching Gmail messages batch via Composio for connector {connector_id} " - f"({total_messages_fetched}/{estimated_total} fetched, {total_documents_indexed} indexed)", - { - "stage": "fetching_messages", - "batch_size": current_batch_size, - "total_fetched": total_messages_fetched, - "total_indexed": total_documents_indexed, - "estimated_total": estimated_total, - }, - ) - - # Fetch batch of messages ( messages, next_token, @@ -533,97 +568,136 @@ async def index_composio_gmail( return 0, f"Failed to fetch Gmail messages: {error}" if not messages: - # No more messages available break - # Update result_size_estimate from first response (Gmail provides this estimate) if result_size_estimate is None and result_size_estimate_batch is not None: result_size_estimate = result_size_estimate_batch logger.info( f"Gmail API estimated {result_size_estimate} total messages for query: '{query}'" ) - total_messages_fetched += len(messages) - # Recalculate estimated_total after potentially updating result_size_estimate - estimated_total = ( - result_size_estimate if result_size_estimate is not None else max_items - ) - estimated_total = min(estimated_total, max_items) - + all_messages.extend(messages) logger.info( - f"Fetched batch of {len(messages)} Gmail messages " - f"(total: {total_messages_fetched}/{estimated_total})" + f"Fetched {len(messages)} messages (total: {len(all_messages)})" ) - # Process batch incrementally - batch_indexed, batch_skipped = await _process_gmail_message_batch( - session=session, - messages=messages, - composio_connector=composio_connector, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - total_documents_indexed=total_documents_indexed, - ) - - total_documents_indexed += batch_indexed - total_documents_skipped += batch_skipped - - logger.info( - f"Processed batch: {batch_indexed} indexed, {batch_skipped} skipped " - f"(total: {total_documents_indexed} indexed, {total_documents_skipped} skipped)" - ) - - # Batch commits happen in _process_gmail_message_batch every 10 documents - # This ensures progress is saved incrementally, preventing data loss on crashes - - # Check if we should continue - if not next_token: - # No more pages available + if not next_token or len(messages) < current_batch_size: break - if len(messages) < current_batch_size: - # Last page had fewer items than requested, we're done - break - - # Continue with next page page_token = next_token - if total_messages_fetched == 0: + if not all_messages: success_msg = "No Gmail messages found in the specified date range" await task_logger.log_task_success( log_entry, success_msg, {"messages_count": 0} ) - # CRITICAL: Update timestamp even when no messages found so Electric SQL syncs and UI shows indexed status await update_connector_last_indexed(session, connector, update_last_indexed) await session.commit() - return 0, None # Return None (not error) when no items found + return ( + 0, + None, + ) # Return None (not error) when no items found - this is success with 0 items - # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs - # This ensures the UI shows "Last indexed" instead of "Never indexed" + logger.info(f"Found {len(all_messages)} Gmail messages to index via Composio") + + # ======================================================================= + # PHASE 1: Analyze all messages, create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= + await task_logger.log_task_progress( + log_entry, + f"Phase 1: Creating pending documents for {len(all_messages)} messages", + {"stage": "phase1_pending"}, + ) + + ( + messages_to_process, + documents_skipped, + duplicate_content_count, + ) = await _analyze_gmail_messages_phase1( + session=session, + messages=all_messages, + composio_connector=composio_connector, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + ) + + # Commit all pending documents - they all appear in UI now + new_documents_count = len([m for m in messages_to_process if m["is_new"]]) + if new_documents_count > 0: + logger.info(f"Phase 1: Committing {new_documents_count} pending documents") + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(messages_to_process)} documents") + await task_logger.log_task_progress( + log_entry, + f"Phase 2: Processing {len(messages_to_process)} documents", + {"stage": "phase2_processing"}, + ) + + documents_indexed, documents_failed = await _process_gmail_messages_phase2( + session=session, + messages_to_process=messages_to_process, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + on_heartbeat_callback=on_heartbeat_callback, + ) + + # CRITICAL: Always update timestamp so Electric SQL syncs await update_connector_last_indexed(session, connector, update_last_indexed) - # Final commit to ensure all documents are persisted (safety net) - # This matches the pattern used in non-Composio Gmail indexer - logger.info( - f"Final commit: Total {total_documents_indexed} Gmail messages processed" - ) - await session.commit() - logger.info( - "Successfully committed all Composio Gmail document changes to database" - ) + # Final commit to ensure all documents are persisted + logger.info(f"Final commit: Total {documents_indexed} Gmail messages processed") + try: + await session.commit() + logger.info( + "Successfully committed all Composio Gmail document changes to database" + ) + except Exception as e: + # Handle any remaining integrity errors gracefully + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + else: + raise + + # Build warning message if there were issues + warning_parts = [] + if duplicate_content_count > 0: + warning_parts.append(f"{duplicate_content_count} duplicate") + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None await task_logger.log_task_success( log_entry, f"Successfully completed Gmail indexing via Composio for connector {connector_id}", { - "documents_indexed": total_documents_indexed, - "documents_skipped": total_documents_skipped, - "messages_fetched": total_messages_fetched, + "documents_indexed": documents_indexed, + "documents_skipped": documents_skipped, + "documents_failed": documents_failed, + "duplicate_content_count": duplicate_content_count, }, ) - return total_documents_indexed, None + logger.info( + f"Composio Gmail indexing completed: {documents_indexed} ready, " + f"{documents_skipped} skipped, {documents_failed} failed " + f"({duplicate_content_count} duplicate content)" + ) + return documents_indexed, warning_message except Exception as e: logger.error(f"Failed to index Gmail via Composio: {e!s}", exc_info=True) diff --git a/surfsense_backend/app/connectors/composio_google_calendar_connector.py b/surfsense_backend/app/connectors/composio_google_calendar_connector.py index 78ff360ca..6593721a1 100644 --- a/surfsense_backend/app/connectors/composio_google_calendar_connector.py +++ b/surfsense_backend/app/connectors/composio_google_calendar_connector.py @@ -16,13 +16,14 @@ from sqlalchemy.orm import selectinload from app.config import config from app.connectors.composio_connector import ComposioConnector -from app.db import Document, DocumentType +from app.db import Document, DocumentStatus, DocumentType from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.tasks.connector_indexers.base import ( calculate_date_range, check_duplicate_document_by_hash, + safe_set_chunks, ) from app.utils.document_converters import ( create_document_chunks, @@ -266,18 +267,20 @@ async def index_composio_google_calendar( documents_indexed = 0 documents_skipped = 0 + documents_failed = 0 # Track events that failed processing duplicate_content_count = ( 0 # Track events skipped due to duplicate content_hash ) last_heartbeat_time = time.time() + # ======================================================================= + # PHASE 1: Analyze all events, create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= + events_to_process = [] # List of dicts with document and event data + new_documents_created = False + for event in events: - # Send heartbeat periodically to indicate task is still alive - if on_heartbeat_callback: - current_time = time.time() - if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = current_time try: # Handle both standard Google API and potential Composio variations event_id = event.get("id", "") or event.get("eventId", "") @@ -315,61 +318,28 @@ async def index_composio_google_calendar( if existing_document: if existing_document.content_hash == content_hash: + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): + existing_document.status = DocumentStatus.ready() documents_skipped += 1 continue - # Update existing - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { + # Queue existing document for update (will be set to processing in Phase 2) + events_to_process.append( + { + "document": existing_document, + "is_new": False, + "markdown_content": markdown_content, + "content_hash": content_hash, "event_id": event_id, "summary": summary, "start_time": start_time, - "document_type": "Google Calendar Event (Composio)", + "end_time": end_time, + "location": location, } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}" - if location: - summary_content += f"\nLocation: {location}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(markdown_content) - - existing_document.title = f"Calendar: {summary}" - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "event_id": event_id, - "summary": summary, - "start_time": start_time, - "end_time": end_time, - "location": location, - "connector_id": connector_id, - "source": "composio", - } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - - # Batch commit every 10 documents - if documents_indexed % 10 == 0: - logger.info( - f"Committing batch: {documents_indexed} Google Calendar events processed so far" - ) - await session.commit() + ) continue # Document doesn't exist by unique_identifier_hash @@ -380,49 +350,19 @@ async def index_composio_google_calendar( ) if duplicate_by_content: - # A document with the same content already exists (likely from standard connector) logger.info( f"Event {summary} already indexed by another connector " f"(existing document ID: {duplicate_by_content.id}, " - f"type: {duplicate_by_content.document_type}). Skipping to avoid duplicate content." + f"type: {duplicate_by_content.document_type}). Skipping." ) duplicate_content_count += 1 documents_skipped += 1 continue - # Create new document - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "event_id": event_id, - "summary": summary, - "start_time": start_time, - "document_type": "Google Calendar Event (Composio)", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = ( - f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}" - ) - if location: - summary_content += f"\nLocation: {location}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(markdown_content) - + # Create new document with PENDING status (visible in UI immediately) document = Document( search_space_id=search_space_id, - title=f"Calendar: {summary}", + title=summary, document_type=DocumentType( TOOLKIT_TO_DOCUMENT_TYPE["googlecalendar"] ), @@ -436,19 +376,116 @@ async def index_composio_google_calendar( "toolkit_id": "googlecalendar", "source": "composio", }, - content=summary_content, - content_hash=content_hash, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector_id, ) session.add(document) + new_documents_created = True + + events_to_process.append( + { + "document": document, + "is_new": True, + "markdown_content": markdown_content, + "content_hash": content_hash, + "event_id": event_id, + "summary": summary, + "start_time": start_time, + "end_time": end_time, + "location": location, + } + ) + + except Exception as e: + logger.error(f"Error in Phase 1 for event: {e!s}", exc_info=True) + documents_failed += 1 + continue + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info( + f"Phase 1: Committing {len([e for e in events_to_process if e['is_new']])} pending documents" + ) + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(events_to_process)} documents") + + for item in events_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item["document"] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (LLM, embeddings, chunks) + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata_for_summary = { + "event_id": item["event_id"], + "summary": item["summary"], + "start_time": item["start_time"], + "document_type": "Google Calendar Event (Composio)", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + item["markdown_content"], + user_llm, + document_metadata_for_summary, + ) + else: + summary_content = f"Calendar: {item['summary']}\n\nStart: {item['start_time']}\nEnd: {item['end_time']}" + if item["location"]: + summary_content += f"\nLocation: {item['location']}" + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(item["markdown_content"]) + + # Update document to READY with actual content + document.title = item["summary"] + document.content = summary_content + document.content_hash = item["content_hash"] + document.embedding = summary_embedding + document.document_metadata = { + "event_id": item["event_id"], + "summary": item["summary"], + "start_time": item["start_time"], + "end_time": item["end_time"], + "location": item["location"], + "connector_id": connector_id, + "source": "composio", + } + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + documents_indexed += 1 - # Batch commit every 10 documents + # Batch commit every 10 documents (for ready status updates) if documents_indexed % 10 == 0: logger.info( f"Committing batch: {documents_indexed} Google Calendar events processed so far" @@ -457,7 +494,15 @@ async def index_composio_google_calendar( except Exception as e: logger.error(f"Error processing Calendar event: {e!s}", exc_info=True) - documents_skipped += 1 + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error( + f"Failed to update document status to failed: {status_error}" + ) + documents_failed += 1 continue # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs @@ -490,10 +535,13 @@ async def index_composio_google_calendar( else: raise - # Build warning message if duplicates were found - warning_message = None + # Build warning message if there were issues + warning_parts = [] if duplicate_content_count > 0: - warning_message = f"{duplicate_content_count} skipped (duplicate)" + warning_parts.append(f"{duplicate_content_count} duplicate") + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None await task_logger.log_task_success( log_entry, @@ -501,13 +549,15 @@ async def index_composio_google_calendar( { "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, + "documents_failed": documents_failed, "duplicate_content_count": duplicate_content_count, }, ) logger.info( - f"Composio Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped " - f"({duplicate_content_count} due to duplicate content from other connectors)" + f"Composio Google Calendar indexing completed: {documents_indexed} ready, " + f"{documents_skipped} skipped, {documents_failed} failed " + f"({duplicate_content_count} duplicate content)" ) return documents_indexed, warning_message diff --git a/surfsense_backend/app/connectors/composio_google_drive_connector.py b/surfsense_backend/app/connectors/composio_google_drive_connector.py index 66669e4e0..4ccd195e6 100644 --- a/surfsense_backend/app/connectors/composio_google_drive_connector.py +++ b/surfsense_backend/app/connectors/composio_google_drive_connector.py @@ -21,10 +21,14 @@ from sqlalchemy.orm.attributes import flag_modified from app.config import config from app.connectors.composio_connector import ComposioConnector -from app.db import Document, DocumentType, Log +from app.db import Document, DocumentStatus, DocumentType, Log from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService +from app.tasks.connector_indexers.base import ( + check_duplicate_document_by_hash, + safe_set_chunks, +) from app.utils.document_converters import ( create_document_chunks, generate_content_hash, @@ -537,22 +541,6 @@ async def check_document_by_unique_identifier( return existing_doc_result.scalars().first() -async def check_document_by_content_hash( - session: AsyncSession, content_hash: str -) -> Document | None: - """Check if a document with the given content hash already exists. - - This is used to prevent duplicate content from being indexed, regardless - of which connector originally indexed it. - """ - from sqlalchemy.future import select - - existing_doc_result = await session.execute( - select(Document).where(Document.content_hash == content_hash) - ) - return existing_doc_result.scalars().first() - - async def check_document_by_google_drive_file_id( session: AsyncSession, file_id: str, search_space_id: int ) -> Document | None: @@ -843,14 +831,16 @@ async def _index_composio_drive_delta_sync( log_entry, on_heartbeat_callback: HeartbeatCallbackType | None = None, ) -> tuple[int, int, list[str]]: - """Index Google Drive files using delta sync (only changed files). + """Index Google Drive files using delta sync with real-time document status updates. Uses GOOGLEDRIVE_LIST_CHANGES to fetch only files that changed since last sync. Handles: new files, modified files, and deleted files. """ documents_indexed = 0 documents_skipped = 0 + documents_failed = 0 processing_errors = [] + duplicate_content_count = 0 last_heartbeat_time = time.time() # Fetch all changes with pagination @@ -881,14 +871,13 @@ async def _index_composio_drive_delta_sync( logger.info(f"Processing {len(all_changes)} changes from delta sync") - for change in all_changes[:max_items]: - # Send heartbeat periodically to indicate task is still alive - if on_heartbeat_callback: - current_time = time.time() - if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = current_time + # ======================================================================= + # PHASE 1: Analyze all changes, handle deletions, create pending documents + # ======================================================================= + files_to_process = [] + new_documents_created = False + for change in all_changes[:max_items]: try: # Handle removed files is_removed = change.get("removed", False) @@ -899,9 +888,8 @@ async def _index_composio_drive_delta_sync( documents_skipped += 1 continue - # Check if file was trashed or removed + # Check if file was trashed or removed - handle deletions immediately if is_removed or file_info.get("trashed", False): - # Remove document from database document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]) unique_identifier_hash = generate_unique_identifier_hash( document_type, f"drive_{file_id}", search_space_id @@ -923,37 +911,233 @@ async def _index_composio_drive_delta_sync( if mime_type == "application/vnd.google-apps.folder": continue - # Process the file - indexed, skipped, errors = await _process_single_drive_file( - session=session, - composio_connector=composio_connector, - file_id=file_id, - file_name=file_name, - mime_type=mime_type, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - task_logger=task_logger, - log_entry=log_entry, + # Check for existing document by file ID (from any connector) + existing_by_file_id = await check_document_by_google_drive_file_id( + session, file_id, search_space_id ) - documents_indexed += indexed - documents_skipped += skipped - processing_errors.extend(errors) + # Generate unique identifier hash + document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]) + unique_identifier_hash = generate_unique_identifier_hash( + document_type, f"drive_{file_id}", search_space_id + ) + + # Check if document exists by unique identifier + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + + if existing_by_file_id and not existing_document: + # File already indexed by different connector - skip + logger.info( + f"Skipping file {file_name} (file_id={file_id}): already indexed " + f"by {existing_by_file_id.document_type.value}" + ) + documents_skipped += 1 + continue + + if existing_document: + # Queue existing document for update + files_to_process.append( + { + "document": existing_document, + "is_new": False, + "file_id": file_id, + "file_name": file_name, + "mime_type": mime_type, + } + ) + continue + + # Create new document with PENDING status + document = Document( + search_space_id=search_space_id, + title=file_name, + document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]), + document_metadata={ + "file_id": file_id, + "file_name": file_name, + "FILE_NAME": file_name, + "mime_type": mime_type, + "connector_id": connector_id, + "toolkit_id": "googledrive", + "source": "composio", + }, + content="Pending...", + content_hash=unique_identifier_hash, + unique_identifier_hash=unique_identifier_hash, + embedding=None, + chunks=[], + status=DocumentStatus.pending(), + updated_at=get_current_timestamp(), + created_by_id=user_id, + connector_id=connector_id, + ) + session.add(document) + new_documents_created = True + + files_to_process.append( + { + "document": document, + "is_new": True, + "file_id": file_id, + "file_name": file_name, + "mime_type": mime_type, + } + ) + + except Exception as e: + logger.error(f"Error in Phase 1 for change: {e!s}", exc_info=True) + documents_skipped += 1 + continue + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info( + f"Phase 1: Committing {len([f for f in files_to_process if f['is_new']])} pending documents" + ) + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # ======================================================================= + logger.info(f"Phase 2: Processing {len(files_to_process)} documents") + + for item in files_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item["document"] + try: + # Set to PROCESSING and commit + document.status = DocumentStatus.processing() + await session.commit() + + # Get file content + content, content_error = await composio_connector.get_drive_file_content( + item["file_id"], original_mime_type=item["mime_type"] + ) + + if content_error or not content: + logger.warning( + f"Could not get content for file {item['file_name']}: {content_error}" + ) + markdown_content = f"# {item['file_name']}\n\n" + markdown_content += f"**File ID:** {item['file_id']}\n" + markdown_content += f"**Type:** {item['mime_type']}\n" + elif isinstance(content, dict): + error_msg = f"Unexpected dict content format for file {item['file_name']}: {list(content.keys())}" + logger.error(error_msg) + processing_errors.append(error_msg) + markdown_content = f"# {item['file_name']}\n\n" + markdown_content += f"**File ID:** {item['file_id']}\n" + markdown_content += f"**Type:** {item['mime_type']}\n" + else: + markdown_content = await _process_file_content( + content=content, + file_name=item["file_name"], + file_id=item["file_id"], + mime_type=item["mime_type"], + search_space_id=search_space_id, + user_id=user_id, + session=session, + task_logger=task_logger, + log_entry=log_entry, + processing_errors=processing_errors, + ) + + content_hash = generate_content_hash(markdown_content, search_space_id) + + # For existing documents, check if content changed + if not item["is_new"] and document.content_hash == content_hash: + if not DocumentStatus.is_state(document.status, DocumentStatus.READY): + document.status = DocumentStatus.ready() + documents_skipped += 1 + continue + + # Check for duplicate content hash (for new documents) + if item["is_new"]: + with session.no_autoflush: + duplicate_by_content = await check_duplicate_document_by_hash( + session, content_hash + ) + if duplicate_by_content: + logger.info( + f"File {item['file_name']} already indexed by another connector. Skipping." + ) + await session.delete(document) + duplicate_content_count += 1 + documents_skipped += 1 + continue + + # Heavy processing (LLM, embeddings, chunks) + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata_for_summary = { + "file_id": item["file_id"], + "file_name": item["file_name"], + "mime_type": item["mime_type"], + "document_type": "Google Drive File (Composio)", + } + summary_content, summary_embedding = await generate_document_summary( + markdown_content, user_llm, document_metadata_for_summary + ) + else: + summary_content = f"Google Drive File: {item['file_name']}\n\nType: {item['mime_type']}" + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(markdown_content) + + # Update document to READY + document.title = item["file_name"] + document.content = summary_content + document.content_hash = content_hash + document.embedding = summary_embedding + document.document_metadata = { + "file_id": item["file_id"], + "file_name": item["file_name"], + "FILE_NAME": item["file_name"], + "mime_type": item["mime_type"], + "connector_id": connector_id, + "source": "composio", + } + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + + documents_indexed += 1 # Batch commit every 10 documents - if documents_indexed > 0 and documents_indexed % 10 == 0: + if documents_indexed % 10 == 0: await session.commit() logger.info(f"Committed batch: {documents_indexed} changes processed") except Exception as e: - error_msg = f"Error processing change for file {file_id}: {e!s}" + error_msg = f"Error processing change for file {item['file_id']}: {e!s}" logger.error(error_msg, exc_info=True) processing_errors.append(error_msg) - documents_skipped += 1 + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error( + f"Failed to update document status to failed: {status_error}" + ) + documents_failed += 1 + continue logger.info( - f"Delta sync complete: {documents_indexed} indexed, {documents_skipped} skipped" + f"Delta sync complete: {documents_indexed} indexed, {documents_skipped} skipped, " + f"{documents_failed} failed ({duplicate_content_count} duplicate content)" ) return documents_indexed, documents_skipped, processing_errors @@ -973,10 +1157,12 @@ async def _index_composio_drive_full_scan( log_entry, on_heartbeat_callback: HeartbeatCallbackType | None = None, ) -> tuple[int, int, list[str]]: - """Index Google Drive files using full scan (first sync or when no delta token).""" + """Index Google Drive files using full scan with real-time document status updates.""" documents_indexed = 0 documents_skipped = 0 + documents_failed = 0 processing_errors = [] + duplicate_content_count = 0 last_heartbeat_time = time.time() all_files = [] @@ -1108,14 +1294,14 @@ async def _index_composio_drive_full_scan( f"Found {len(all_files)} Google Drive files to index via Composio (full scan)" ) - for file_info in all_files: - # Send heartbeat periodically to indicate task is still alive - if on_heartbeat_callback: - current_time = time.time() - if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = current_time + # ======================================================================= + # PHASE 1: Analyze all files, create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= + files_to_process = [] # List of dicts with document and file data + new_documents_created = False + for file_info in all_files: try: # Handle both standard Google API and potential Composio variations file_id = file_info.get("id", "") or file_info.get("fileId", "") @@ -1132,227 +1318,242 @@ async def _index_composio_drive_full_scan( if mime_type == "application/vnd.google-apps.folder": continue - # Process the file - indexed, skipped, errors = await _process_single_drive_file( - session=session, - composio_connector=composio_connector, - file_id=file_id, - file_name=file_name, - mime_type=mime_type, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - task_logger=task_logger, - log_entry=log_entry, + # ========== EARLY DUPLICATE CHECK BY FILE ID ========== + existing_by_file_id = await check_document_by_google_drive_file_id( + session, file_id, search_space_id + ) + if existing_by_file_id: + logger.info( + f"Skipping file {file_name} (file_id={file_id}): already indexed " + f"by {existing_by_file_id.document_type.value}" + ) + documents_skipped += 1 + continue + + # Generate unique identifier hash + document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]) + unique_identifier_hash = generate_unique_identifier_hash( + document_type, f"drive_{file_id}", search_space_id ) - documents_indexed += indexed - documents_skipped += skipped - processing_errors.extend(errors) + # Check if document exists by unique identifier + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + + if existing_document: + # Queue existing document for update (will be set to processing in Phase 2) + files_to_process.append( + { + "document": existing_document, + "is_new": False, + "file_id": file_id, + "file_name": file_name, + "mime_type": mime_type, + } + ) + continue + + # Create new document with PENDING status (visible in UI immediately) + document = Document( + search_space_id=search_space_id, + title=file_name, + document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]), + document_metadata={ + "file_id": file_id, + "file_name": file_name, + "FILE_NAME": file_name, + "mime_type": mime_type, + "connector_id": connector_id, + "toolkit_id": "googledrive", + "source": "composio", + }, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready + unique_identifier_hash=unique_identifier_hash, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts + updated_at=get_current_timestamp(), + created_by_id=user_id, + connector_id=connector_id, + ) + session.add(document) + new_documents_created = True + + files_to_process.append( + { + "document": document, + "is_new": True, + "file_id": file_id, + "file_name": file_name, + "mime_type": mime_type, + } + ) + + except Exception as e: + logger.error(f"Error in Phase 1 for file: {e!s}", exc_info=True) + documents_skipped += 1 + continue + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info( + f"Phase 1: Committing {len([f for f in files_to_process if f['is_new']])} pending documents" + ) + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(files_to_process)} documents") + + for item in files_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item["document"] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Get file content (pass mime_type for Google Workspace export handling) + content, content_error = await composio_connector.get_drive_file_content( + item["file_id"], original_mime_type=item["mime_type"] + ) + + if content_error or not content: + logger.warning( + f"Could not get content for file {item['file_name']}: {content_error}" + ) + markdown_content = f"# {item['file_name']}\n\n" + markdown_content += f"**File ID:** {item['file_id']}\n" + markdown_content += f"**Type:** {item['mime_type']}\n" + elif isinstance(content, dict): + error_msg = f"Unexpected dict content format for file {item['file_name']}: {list(content.keys())}" + logger.error(error_msg) + processing_errors.append(error_msg) + markdown_content = f"# {item['file_name']}\n\n" + markdown_content += f"**File ID:** {item['file_id']}\n" + markdown_content += f"**Type:** {item['mime_type']}\n" + else: + # Process content based on file type + markdown_content = await _process_file_content( + content=content, + file_name=item["file_name"], + file_id=item["file_id"], + mime_type=item["mime_type"], + search_space_id=search_space_id, + user_id=user_id, + session=session, + task_logger=task_logger, + log_entry=log_entry, + processing_errors=processing_errors, + ) + + content_hash = generate_content_hash(markdown_content, search_space_id) + + # For existing documents, check if content changed + if not item["is_new"] and document.content_hash == content_hash: + # Ensure status is ready + if not DocumentStatus.is_state(document.status, DocumentStatus.READY): + document.status = DocumentStatus.ready() + documents_skipped += 1 + continue + + # Check for duplicate content hash (for new documents) + if item["is_new"]: + with session.no_autoflush: + duplicate_by_content = await check_duplicate_document_by_hash( + session, content_hash + ) + if duplicate_by_content: + logger.info( + f"File {item['file_name']} already indexed by another connector. Skipping." + ) + # Remove the pending document we created + await session.delete(document) + duplicate_content_count += 1 + documents_skipped += 1 + continue + + # Heavy processing (LLM, embeddings, chunks) + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata_for_summary = { + "file_id": item["file_id"], + "file_name": item["file_name"], + "mime_type": item["mime_type"], + "document_type": "Google Drive File (Composio)", + } + summary_content, summary_embedding = await generate_document_summary( + markdown_content, user_llm, document_metadata_for_summary + ) + else: + summary_content = f"Google Drive File: {item['file_name']}\n\nType: {item['mime_type']}" + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(markdown_content) + + # Update document to READY with actual content + document.title = item["file_name"] + document.content = summary_content + document.content_hash = content_hash + document.embedding = summary_embedding + document.document_metadata = { + "file_id": item["file_id"], + "file_name": item["file_name"], + "FILE_NAME": item["file_name"], + "mime_type": item["mime_type"], + "connector_id": connector_id, + "source": "composio", + } + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + + documents_indexed += 1 # Batch commit every 10 documents - if documents_indexed > 0 and documents_indexed % 10 == 0: + if documents_indexed % 10 == 0: logger.info( f"Committing batch: {documents_indexed} Google Drive files processed so far" ) await session.commit() except Exception as e: - error_msg = f"Error processing Drive file {file_name or 'unknown'}: {e!s}" + error_msg = f"Error processing Drive file {item['file_name']}: {e!s}" logger.error(error_msg, exc_info=True) processing_errors.append(error_msg) - documents_skipped += 1 + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error( + f"Failed to update document status to failed: {status_error}" + ) + documents_failed += 1 + continue logger.info( - f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped" + f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped, " + f"{documents_failed} failed ({duplicate_content_count} duplicate content)" ) return documents_indexed, documents_skipped, processing_errors -async def _process_single_drive_file( - session: AsyncSession, - composio_connector: ComposioGoogleDriveConnector, - file_id: str, - file_name: str, - mime_type: str, - connector_id: int, - search_space_id: int, - user_id: str, - task_logger: TaskLoggingService, - log_entry, -) -> tuple[int, int, list[str]]: - """Process a single Google Drive file for indexing. - - Returns: - Tuple of (documents_indexed, documents_skipped, processing_errors) - """ - processing_errors = [] - - # ========== EARLY DUPLICATE CHECK BY FILE ID ========== - # Check if this Google Drive file was already indexed by ANY connector - # This happens BEFORE download/ETL to save expensive API calls - existing_by_file_id = await check_document_by_google_drive_file_id( - session, file_id, search_space_id - ) - if existing_by_file_id: - logger.info( - f"Skipping file {file_name} (file_id={file_id}): already indexed " - f"by {existing_by_file_id.document_type.value} as '{existing_by_file_id.title}' " - f"(saved download & ETL cost)" - ) - return 0, 1, processing_errors # Skip - NO download, NO ETL! - # ====================================================== - - # Generate unique identifier hash - document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]) - unique_identifier_hash = generate_unique_identifier_hash( - document_type, f"drive_{file_id}", search_space_id - ) - - # Check if document exists by unique identifier (same connector, same file) - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - - # Get file content (pass mime_type for Google Workspace export handling) - content, content_error = await composio_connector.get_drive_file_content( - file_id, original_mime_type=mime_type - ) - - if content_error or not content: - logger.warning(f"Could not get content for file {file_name}: {content_error}") - # Use metadata as content fallback - markdown_content = f"# {file_name}\n\n" - markdown_content += f"**File ID:** {file_id}\n" - markdown_content += f"**Type:** {mime_type}\n" - elif isinstance(content, dict): - # Safety check: if content is still a dict, log error and use fallback - error_msg = f"Unexpected dict content format for file {file_name}: {list(content.keys())}" - logger.error(error_msg) - processing_errors.append(error_msg) - markdown_content = f"# {file_name}\n\n" - markdown_content += f"**File ID:** {file_id}\n" - markdown_content += f"**Type:** {mime_type}\n" - else: - # Process content based on file type - markdown_content = await _process_file_content( - content=content, - file_name=file_name, - file_id=file_id, - mime_type=mime_type, - search_space_id=search_space_id, - user_id=user_id, - session=session, - task_logger=task_logger, - log_entry=log_entry, - processing_errors=processing_errors, - ) - - content_hash = generate_content_hash(markdown_content, search_space_id) - - if existing_document: - if existing_document.content_hash == content_hash: - return 0, 1, processing_errors # Skipped - unchanged - - # Update existing document - user_llm = await get_user_long_context_llm(session, user_id, search_space_id) - - if user_llm: - document_metadata = { - "file_id": file_id, - "file_name": file_name, - "mime_type": mime_type, - "document_type": "Google Drive File (Composio)", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = f"Google Drive File: {file_name}\n\nType: {mime_type}" - summary_embedding = config.embedding_model_instance.embed(summary_content) - - chunks = await create_document_chunks(markdown_content) - - existing_document.title = f"Drive: {file_name}" - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "file_id": file_id, - "file_name": file_name, - "FILE_NAME": file_name, # For compatibility - "mime_type": mime_type, - "connector_id": connector_id, - "source": "composio", - } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - return 1, 0, processing_errors # Indexed - updated - - # Check if content_hash already exists (from any connector) - # This prevents duplicate content and avoids IntegrityError on unique constraint - existing_by_content_hash = await check_document_by_content_hash( - session, content_hash - ) - if existing_by_content_hash: - logger.info( - f"Skipping file {file_name} (file_id={file_id}): identical content " - f"already indexed as '{existing_by_content_hash.title}'" - ) - return 0, 1, processing_errors # Skipped - duplicate content - - # Create new document - user_llm = await get_user_long_context_llm(session, user_id, search_space_id) - - if user_llm: - document_metadata = { - "file_id": file_id, - "file_name": file_name, - "mime_type": mime_type, - "document_type": "Google Drive File (Composio)", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = f"Google Drive File: {file_name}\n\nType: {mime_type}" - summary_embedding = config.embedding_model_instance.embed(summary_content) - - chunks = await create_document_chunks(markdown_content) - - document = Document( - search_space_id=search_space_id, - title=f"Drive: {file_name}", - document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]), - document_metadata={ - "file_id": file_id, - "file_name": file_name, - "FILE_NAME": file_name, # For compatibility - "mime_type": mime_type, - "toolkit_id": "googledrive", - "source": "composio", - }, - content=summary_content, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, - updated_at=get_current_timestamp(), - created_by_id=user_id, - connector_id=connector_id, - ) - session.add(document) - - return 1, 0, processing_errors # Indexed - new - - async def _fetch_folder_files_recursively( composio_connector: ComposioGoogleDriveConnector, folder_id: str, diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index a82c18470..0cab2820b 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -100,6 +100,83 @@ class PodcastStatus(str, Enum): FAILED = "failed" +class DocumentStatus: + """ + Helper class for document processing status (stored as JSONB). + + Status values: + - {"state": "ready"} - Document is fully processed and searchable + - {"state": "pending"} - Document is queued, waiting to be processed + - {"state": "processing"} - Document is currently being processed (only 1 at a time) + - {"state": "failed", "reason": "..."} - Processing failed with reason + + Usage: + document.status = DocumentStatus.pending() + document.status = DocumentStatus.processing() + document.status = DocumentStatus.ready() + document.status = DocumentStatus.failed("LLM rate limit exceeded") + """ + + # State constants + READY = "ready" + PENDING = "pending" + PROCESSING = "processing" + FAILED = "failed" + + @staticmethod + def ready() -> dict: + """Return status dict for a ready/searchable document.""" + return {"state": DocumentStatus.READY} + + @staticmethod + def pending() -> dict: + """Return status dict for a document waiting to be processed.""" + return {"state": DocumentStatus.PENDING} + + @staticmethod + def processing() -> dict: + """Return status dict for a document being processed.""" + return {"state": DocumentStatus.PROCESSING} + + @staticmethod + def failed(reason: str, **extra_details) -> dict: + """ + Return status dict for a failed document. + + Args: + reason: Human-readable failure reason + **extra_details: Optional additional details (duplicate_of, error_code, etc.) + """ + status = { + "state": DocumentStatus.FAILED, + "reason": reason[:500], + } # Truncate long reasons + if extra_details: + status.update(extra_details) + return status + + @staticmethod + def get_state(status: dict | None) -> str | None: + """Extract state from status dict, returns None if invalid.""" + if status is None: + return None + return status.get("state") if isinstance(status, dict) else None + + @staticmethod + def is_state(status: dict | None, state: str) -> bool: + """Check if status matches a given state.""" + return DocumentStatus.get_state(status) == state + + @staticmethod + def get_failure_reason(status: dict | None) -> str | None: + """Extract failure reason from status dict.""" + if status is None or not isinstance(status, dict): + return None + if status.get("state") == DocumentStatus.FAILED: + return status.get("reason") + return None + + class LiteLLMProvider(str, Enum): """ Enum for LLM providers supported by LiteLLM. @@ -813,6 +890,17 @@ class Document(BaseModel, TimestampMixin): index=True, ) + # Processing status for real-time visibility (JSONB) + # Format: {"state": "ready"} or {"state": "processing"} or {"state": "failed", "reason": "..."} + # Default to {"state": "ready"} for backward compatibility with existing documents + status = Column( + JSONB, + nullable=False, + default=DocumentStatus.ready, + server_default=text('\'{"state": "ready"}\'::jsonb'), + index=True, + ) + # Relationships search_space = relationship("SearchSpace", back_populates="documents") created_by = relationship("User", back_populates="documents") diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index be90df459..b20f8cd9c 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -19,6 +19,7 @@ from app.db import ( from app.schemas import ( DocumentRead, DocumentsCreate, + DocumentStatusSchema, DocumentTitleRead, DocumentTitleSearchResponse, DocumentUpdate, @@ -112,9 +113,23 @@ async def create_documents_file_upload( user: User = Depends(current_active_user), ): """ - Upload files as documents. + Upload files as documents with real-time status tracking. + + Implements 2-phase document status updates for real-time UI feedback: + - Phase 1: Create all documents with 'pending' status (visible in UI immediately via ElectricSQL) + - Phase 2: Celery processes each file: pending → processing → ready/failed + Requires DOCUMENTS_CREATE permission. """ + from datetime import datetime + + from app.db import DocumentStatus + from app.tasks.document_processors.base import ( + check_document_by_unique_identifier, + get_current_timestamp, + ) + from app.utils.document_converters import generate_unique_identifier_hash + try: # Check permission await check_permission( @@ -128,38 +143,105 @@ async def create_documents_file_upload( if not files: raise HTTPException(status_code=400, detail="No files provided") + created_documents: list[Document] = [] + files_to_process: list[ + tuple[Document, str, str] + ] = [] # (document, temp_path, filename) + skipped_duplicates = 0 + + # ===== PHASE 1: Create pending documents for all files ===== + # This makes ALL documents visible in the UI immediately with pending status for file in files: try: - # Save file to a temporary location to avoid stream issues import os import tempfile - # Create temp file + # Save file to temp location with tempfile.NamedTemporaryFile( - delete=False, suffix=os.path.splitext(file.filename)[1] + delete=False, suffix=os.path.splitext(file.filename or "")[1] ) as temp_file: temp_path = temp_file.name - # Write uploaded file to temp file content = await file.read() with open(temp_path, "wb") as f: f.write(content) - from app.tasks.celery_tasks.document_tasks import ( - process_file_upload_task, + file_size = len(content) + + # Generate unique identifier for deduplication check + unique_identifier_hash = generate_unique_identifier_hash( + DocumentType.FILE, file.filename or "unknown", search_space_id ) - process_file_upload_task.delay( - temp_path, file.filename, search_space_id, str(user.id) + # Check if document already exists (by unique identifier) + existing = await check_document_by_unique_identifier( + session, unique_identifier_hash ) + if existing: + # Clean up temp file for duplicates + os.unlink(temp_path) + skipped_duplicates += 1 + continue + + # Create pending document (visible immediately in UI via ElectricSQL) + document = Document( + search_space_id=search_space_id, + title=file.filename or "Uploaded File", + document_type=DocumentType.FILE, + document_metadata={ + "FILE_NAME": file.filename, + "file_size": file_size, + "upload_time": datetime.now().isoformat(), + }, + content="Processing...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary, updated when ready + unique_identifier_hash=unique_identifier_hash, + embedding=None, + status=DocumentStatus.pending(), # Shows "pending" in UI + updated_at=get_current_timestamp(), + created_by_id=str(user.id), + ) + session.add(document) + created_documents.append(document) + files_to_process.append( + (document, temp_path, file.filename or "unknown") + ) + except Exception as e: raise HTTPException( status_code=422, detail=f"Failed to process file {file.filename}: {e!s}", ) from e - await session.commit() - return {"message": "Files uploaded for processing"} + # Commit all pending documents - they appear in UI immediately via ElectricSQL + if created_documents: + await session.commit() + # Refresh to get generated IDs + for doc in created_documents: + await session.refresh(doc) + + # ===== PHASE 2: Dispatch Celery tasks for each file ===== + # Each task will update document status: pending → processing → ready/failed + from app.tasks.celery_tasks.document_tasks import ( + process_file_upload_with_document_task, + ) + + for document, temp_path, filename in files_to_process: + process_file_upload_with_document_task.delay( + document_id=document.id, + temp_path=temp_path, + filename=filename, + search_space_id=search_space_id, + user_id=str(user.id), + ) + + return { + "message": "Files uploaded for processing", + "document_ids": [doc.id for doc in created_documents], + "total_files": len(files), + "pending_files": len(files_to_process), + "skipped_duplicates": skipped_duplicates, + } except HTTPException: raise except Exception as e: @@ -211,7 +293,11 @@ async def read_documents( Permission.DOCUMENTS_READ.value, "You don't have permission to read documents in this search space", ) - query = select(Document).filter(Document.search_space_id == search_space_id) + query = ( + select(Document) + .options(selectinload(Document.created_by)) + .filter(Document.search_space_id == search_space_id) + ) count_query = ( select(func.count()) .select_from(Document) @@ -221,6 +307,7 @@ async def read_documents( # Get documents from all search spaces user has membership in query = ( select(Document) + .options(selectinload(Document.created_by)) .join(SearchSpace) .join(SearchSpaceMembership) .filter(SearchSpaceMembership.user_id == user.id) @@ -261,6 +348,19 @@ async def read_documents( # Convert database objects to API-friendly format api_documents = [] for doc in db_documents: + # Get user name (display_name or email fallback) + created_by_name = None + if doc.created_by: + created_by_name = doc.created_by.display_name or doc.created_by.email + + # Parse status from JSONB + status_data = None + if hasattr(doc, "status") and doc.status: + status_data = DocumentStatusSchema( + state=doc.status.get("state", "ready"), + reason=doc.status.get("reason"), + ) + api_documents.append( DocumentRead( id=doc.id, @@ -273,6 +373,9 @@ async def read_documents( created_at=doc.created_at, updated_at=doc.updated_at, search_space_id=doc.search_space_id, + created_by_id=doc.created_by_id, + created_by_name=created_by_name, + status=status_data, ) ) @@ -341,7 +444,11 @@ async def search_documents( Permission.DOCUMENTS_READ.value, "You don't have permission to read documents in this search space", ) - query = select(Document).filter(Document.search_space_id == search_space_id) + query = ( + select(Document) + .options(selectinload(Document.created_by)) + .filter(Document.search_space_id == search_space_id) + ) count_query = ( select(func.count()) .select_from(Document) @@ -351,6 +458,7 @@ async def search_documents( # Get documents from all search spaces user has membership in query = ( select(Document) + .options(selectinload(Document.created_by)) .join(SearchSpace) .join(SearchSpaceMembership) .filter(SearchSpaceMembership.user_id == user.id) @@ -395,6 +503,19 @@ async def search_documents( # Convert database objects to API-friendly format api_documents = [] for doc in db_documents: + # Get user name (display_name or email fallback) + created_by_name = None + if doc.created_by: + created_by_name = doc.created_by.display_name or doc.created_by.email + + # Parse status from JSONB + status_data = None + if hasattr(doc, "status") and doc.status: + status_data = DocumentStatusSchema( + state=doc.status.get("state", "ready"), + reason=doc.status.get("reason"), + ) + api_documents.append( DocumentRead( id=doc.id, @@ -407,6 +528,9 @@ async def search_documents( created_at=doc.created_at, updated_at=doc.updated_at, search_space_id=doc.search_space_id, + created_by_id=doc.created_by_id, + created_by_name=created_by_name, + status=status_data, ) ) @@ -782,6 +906,7 @@ async def delete_document( """ Delete a document. Requires DOCUMENTS_DELETE permission for the search space. + Documents in "processing" state cannot be deleted. """ try: result = await session.execute( @@ -794,6 +919,14 @@ async def delete_document( status_code=404, detail=f"Document with id {document_id} not found" ) + # Check if document is pending or currently being processed + doc_state = document.status.get("state") if document.status else None + if doc_state in ("pending", "processing"): + raise HTTPException( + status_code=409, # Conflict + detail="Cannot delete document while it is pending or being processed. Please wait for processing to complete.", + ) + # Check permission for the search space await check_permission( session, diff --git a/surfsense_backend/app/routes/notes_routes.py b/surfsense_backend/app/routes/notes_routes.py index 928cd462a..47cf96d04 100644 --- a/surfsense_backend/app/routes/notes_routes.py +++ b/surfsense_backend/app/routes/notes_routes.py @@ -230,6 +230,14 @@ async def delete_note( if not document: raise HTTPException(status_code=404, detail="Note not found") + # Check if note is pending or currently being processed + doc_state = document.status.get("state") if document.status else None + if doc_state in ("pending", "processing"): + raise HTTPException( + status_code=409, + detail="Cannot delete note while it is pending or being processed. Please wait for processing to complete.", + ) + # Delete document (chunks will be cascade deleted) await session.delete(document) await session.commit() diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index 70e8f28f9..747e02834 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -2127,6 +2127,7 @@ async def run_google_gmail_indexing( start_date: str | None, end_date: str | None, update_last_indexed: bool, + on_heartbeat_callback=None, ) -> tuple[int, str | None]: # Use a reasonable default for max_messages max_messages = 1000 @@ -2139,6 +2140,7 @@ async def run_google_gmail_indexing( end_date=end_date, update_last_indexed=update_last_indexed, max_messages=max_messages, + on_heartbeat_callback=on_heartbeat_callback, ) # index_google_gmail_messages returns (int, str) but we need (int, str | None) return indexed_count, error_message if error_message else None diff --git a/surfsense_backend/app/schemas/__init__.py b/surfsense_backend/app/schemas/__init__.py index ad5abf777..c6d66149f 100644 --- a/surfsense_backend/app/schemas/__init__.py +++ b/surfsense_backend/app/schemas/__init__.py @@ -11,6 +11,7 @@ from .documents import ( DocumentBase, DocumentRead, DocumentsCreate, + DocumentStatusSchema, DocumentTitleRead, DocumentTitleSearchResponse, DocumentUpdate, @@ -104,6 +105,7 @@ __all__ = [ # Document schemas "DocumentBase", "DocumentRead", + "DocumentStatusSchema", "DocumentTitleRead", "DocumentTitleSearchResponse", "DocumentUpdate", diff --git a/surfsense_backend/app/schemas/documents.py b/surfsense_backend/app/schemas/documents.py index 1f82ae9ce..4cedc7d93 100644 --- a/surfsense_backend/app/schemas/documents.py +++ b/surfsense_backend/app/schemas/documents.py @@ -41,6 +41,13 @@ class DocumentUpdate(DocumentBase): pass +class DocumentStatusSchema(BaseModel): + """Document processing status.""" + + state: str # "ready", "processing", "failed" + reason: str | None = None + + class DocumentRead(BaseModel): id: int title: str @@ -53,6 +60,12 @@ class DocumentRead(BaseModel): updated_at: datetime | None search_space_id: int created_by_id: UUID | None = None # User who created/uploaded this document + created_by_name: str | None = ( + None # Display name or email of the user who created this document + ) + status: DocumentStatusSchema | None = ( + None # Processing status (ready, processing, failed) + ) model_config = ConfigDict(from_attributes=True) diff --git a/surfsense_backend/app/services/connector_service.py b/surfsense_backend/app/services/connector_service.py index 4c5599815..251241e96 100644 --- a/surfsense_backend/app/services/connector_service.py +++ b/surfsense_backend/app/services/connector_service.py @@ -982,7 +982,7 @@ class ConnectorService: def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: channel_name = metadata.get("channel_name", "Unknown Channel") message_date = metadata.get("start_date", "") - title = f"Slack: {channel_name}" + title = channel_name if message_date: title += f" ({message_date})" return title @@ -1056,7 +1056,7 @@ class ConnectorService: def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: page_title = metadata.get("page_title", "Untitled Page") indexed_at = metadata.get("indexed_at", "") - title = f"Notion: {page_title}" + title = page_title if indexed_at: title += f" (indexed: {indexed_at})" return title @@ -1366,9 +1366,9 @@ class ConnectorService: issue_title = metadata.get("issue_title", "Untitled Issue") issue_state = metadata.get("state", "") title = ( - f"Linear: {issue_identifier} - {issue_title}" + f"{issue_identifier} - {issue_title}" if issue_identifier - else f"Linear: {issue_title}" + else issue_title ) if issue_state: title += f" ({issue_state})" @@ -1465,11 +1465,7 @@ class ConnectorService: issue_key = metadata.get("issue_key", "") issue_title = metadata.get("issue_title", "Untitled Issue") status = metadata.get("status", "") - title = ( - f"Jira: {issue_key} - {issue_title}" - if issue_key - else f"Jira: {issue_title}" - ) + title = f"{issue_key} - {issue_title}" if issue_key else issue_title if status: title += f" ({status})" return title @@ -1570,7 +1566,7 @@ class ConnectorService: def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: event_summary = metadata.get("event_summary", "Untitled Event") start_time = metadata.get("start_time", "") - title = f"Calendar: {event_summary}" + title = event_summary if start_time: title += f" ({start_time})" return title @@ -1675,7 +1671,7 @@ class ConnectorService: def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: record_id = metadata.get("record_id", "") - return f"Airtable Record: {record_id}" if record_id else "Airtable Record" + return record_id if record_id else "Airtable Record" def _description_fn( _chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any] @@ -1952,7 +1948,7 @@ class ConnectorService: def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: page_title = metadata.get("page_title", "Untitled Page") space_key = metadata.get("space_key", "") - title = f"Confluence: {page_title}" + title = page_title if space_key: title += f" ({space_key})" return title @@ -2238,7 +2234,7 @@ class ConnectorService: def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: channel_name = metadata.get("channel_name", "Unknown Channel") message_date = metadata.get("start_date", "") - title = f"Discord: {channel_name}" + title = channel_name if message_date: title += f" ({message_date})" return title @@ -2314,7 +2310,7 @@ class ConnectorService: team_name = metadata.get("team_name", "Unknown Team") channel_name = metadata.get("channel_name", "Unknown Channel") message_date = metadata.get("start_date", "") - title = f"Teams: {team_name} - {channel_name}" + title = f"{team_name} - {channel_name}" if message_date: title += f" ({message_date})" return title @@ -2387,11 +2383,7 @@ class ConnectorService: def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: event_name = metadata.get("event_name", "Untitled Event") start_time = metadata.get("start_time", "") - return ( - f"Luma: {event_name} ({start_time})" - if start_time - else f"Luma: {event_name}" - ) + return f"{event_name} ({start_time})" if start_time else event_name def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: return metadata.get("event_url", "") or "" @@ -2651,7 +2643,7 @@ class ConnectorService: def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: page_name = metadata.get("page_name", "Untitled Page") - return f"BookStack: {page_name}" + return page_name def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: page_slug = metadata.get("page_slug", "") diff --git a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py index f310bb03e..6dfcbff46 100644 --- a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py @@ -537,6 +537,304 @@ async def _process_file_upload( raise +@celery_app.task(name="process_file_upload_with_document", bind=True) +def process_file_upload_with_document_task( + self, + document_id: int, + temp_path: str, + filename: str, + search_space_id: int, + user_id: str, +): + """ + Celery task to process uploaded file with existing pending document. + + This task is used by the 2-phase document upload flow: + - Phase 1 (API): Creates pending document (visible in UI immediately) + - Phase 2 (this task): Updates document status: pending → processing → ready/failed + + Args: + document_id: ID of the pending document created in Phase 1 + temp_path: Path to the uploaded file + filename: Original filename + search_space_id: ID of the search space + user_id: ID of the user + """ + import asyncio + import os + import traceback + + logger.info( + f"[process_file_upload_with_document] Task started - document_id: {document_id}, " + f"file: {filename}, search_space_id: {search_space_id}" + ) + + # Check if file exists and is accessible + if not os.path.exists(temp_path): + logger.error( + f"[process_file_upload_with_document] File does not exist: {temp_path}. " + "The temp file may have been cleaned up before the task ran." + ) + # Mark document as failed since file is missing + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + loop.run_until_complete( + _mark_document_failed( + document_id, + "File not found - temp file may have been cleaned up", + ) + ) + finally: + loop.close() + return + + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + try: + loop.run_until_complete( + _process_file_with_document( + document_id, temp_path, filename, search_space_id, user_id + ) + ) + logger.info( + f"[process_file_upload_with_document] Task completed successfully for: {filename}" + ) + except Exception as e: + logger.error( + f"[process_file_upload_with_document] Task failed for {filename}: {e}\n" + f"Traceback:\n{traceback.format_exc()}" + ) + raise + finally: + loop.close() + + +async def _mark_document_failed(document_id: int, reason: str): + """Mark a document as failed when task cannot proceed.""" + from app.db import Document, DocumentStatus + from app.tasks.document_processors.base import get_current_timestamp + + async with get_celery_session_maker()() as session: + document = await session.get(Document, document_id) + if document: + document.status = DocumentStatus.failed(reason) + document.updated_at = get_current_timestamp() + await session.commit() + logger.info(f"Marked document {document_id} as failed: {reason}") + + +async def _process_file_with_document( + document_id: int, + temp_path: str, + filename: str, + search_space_id: int, + user_id: str, +): + """ + Process file and update existing pending document status. + + This function implements Phase 2 of the 2-phase document upload: + - Sets document status to 'processing' (shows spinner in UI) + - Processes the file (parsing, embedding, chunking) + - Updates document to 'ready' on success or 'failed' on error + """ + import os + + from app.db import Document, DocumentStatus + from app.tasks.document_processors.base import get_current_timestamp + from app.tasks.document_processors.file_processors import ( + process_file_in_background_with_document, + ) + + logger.info( + f"[_process_file_with_document] Starting async processing for: {filename}" + ) + + async with get_celery_session_maker()() as session: + logger.info( + f"[_process_file_with_document] Database session created for: {filename}" + ) + task_logger = TaskLoggingService(session, search_space_id) + + # Get the document + document = await session.get(Document, document_id) + if not document: + logger.error(f"Document {document_id} not found") + return + + # Get file size for notification metadata + try: + file_size = os.path.getsize(temp_path) + logger.info(f"[_process_file_with_document] File size: {file_size} bytes") + except Exception as e: + logger.warning( + f"[_process_file_with_document] Could not get file size: {e}" + ) + file_size = None + + # Create notification for document processing + logger.info( + f"[_process_file_with_document] Creating notification for: {filename}" + ) + notification = ( + await NotificationService.document_processing.notify_processing_started( + session=session, + user_id=UUID(user_id), + document_type="FILE", + document_name=filename, + search_space_id=search_space_id, + file_size=file_size, + ) + ) + + log_entry = await task_logger.log_task_start( + task_name="process_file_upload_with_document", + source="document_processor", + message=f"Starting file processing for: {filename} (document_id: {document_id})", + metadata={ + "document_type": "FILE", + "document_id": document_id, + "filename": filename, + "file_path": temp_path, + "user_id": user_id, + }, + ) + + try: + # Set status to PROCESSING (shows spinner in UI via ElectricSQL) + document.status = DocumentStatus.processing() + await session.commit() + logger.info( + f"[_process_file_with_document] Document {document_id} status set to 'processing'" + ) + + # Process the file and update document + result = await process_file_in_background_with_document( + document=document, + file_path=temp_path, + filename=filename, + search_space_id=search_space_id, + user_id=user_id, + session=session, + task_logger=task_logger, + log_entry=log_entry, + notification=notification, + ) + + # Update notification on success + if result: + await ( + NotificationService.document_processing.notify_processing_completed( + session=session, + notification=notification, + document_id=result.id, + chunks_count=None, + ) + ) + logger.info( + f"[_process_file_with_document] Successfully processed document {document_id}" + ) + else: + # Duplicate detected - mark as failed + document.status = DocumentStatus.failed("Duplicate content detected") + document.updated_at = get_current_timestamp() + await session.commit() + await ( + NotificationService.document_processing.notify_processing_completed( + session=session, + notification=notification, + error_message="Document already exists (duplicate)", + ) + ) + + except Exception as e: + # Import here to avoid circular dependencies + from fastapi import HTTPException + + from app.services.page_limit_service import PageLimitExceededError + + # Check if this is a page limit error + page_limit_error: PageLimitExceededError | None = None + if isinstance(e, PageLimitExceededError): + page_limit_error = e + elif ( + isinstance(e, HTTPException) + and e.__cause__ + and isinstance(e.__cause__, PageLimitExceededError) + ): + page_limit_error = e.__cause__ + + # Mark document as failed (shows error in UI via ElectricSQL) + error_message = str(e)[:500] + document.status = DocumentStatus.failed(error_message) + document.updated_at = get_current_timestamp() + await session.commit() + logger.info( + f"[_process_file_with_document] Document {document_id} marked as failed: {error_message[:100]}" + ) + + # Handle page limit errors with dedicated notification + if page_limit_error is not None: + try: + await session.refresh(notification) + await NotificationService.document_processing.notify_processing_completed( + session=session, + notification=notification, + error_message="Page limit exceeded", + ) + await NotificationService.page_limit.notify_page_limit_exceeded( + session=session, + user_id=UUID(user_id), + document_name=filename, + document_type="FILE", + search_space_id=search_space_id, + pages_used=page_limit_error.pages_used, + pages_limit=page_limit_error.pages_limit, + pages_to_add=page_limit_error.pages_to_add, + ) + except Exception as notif_error: + logger.error( + f"Failed to create page limit notification: {notif_error!s}" + ) + else: + # Update notification on failure + try: + await session.refresh(notification) + await NotificationService.document_processing.notify_processing_completed( + session=session, + notification=notification, + error_message=str(e)[:100], + ) + except Exception as notif_error: + logger.error( + f"Failed to update notification on failure: {notif_error!s}" + ) + + await task_logger.log_task_failure( + log_entry, + error_message[:100], + str(e), + {"error_type": type(e).__name__, "document_id": document_id}, + ) + logger.error(f"Error processing file {filename}: {e!s}") + raise + + finally: + # Clean up temp file + if os.path.exists(temp_path): + try: + os.unlink(temp_path) + logger.info( + f"[_process_file_with_document] Cleaned up temp file: {temp_path}" + ) + except Exception as cleanup_error: + logger.warning( + f"[_process_file_with_document] Failed to clean up temp file: {cleanup_error}" + ) + + @celery_app.task(name="process_circleback_meeting", bind=True) def process_circleback_meeting_task( self, diff --git a/surfsense_backend/app/tasks/celery_tasks/stale_notification_cleanup_task.py b/surfsense_backend/app/tasks/celery_tasks/stale_notification_cleanup_task.py index 9041655b0..ef3a30e43 100644 --- a/surfsense_backend/app/tasks/celery_tasks/stale_notification_cleanup_task.py +++ b/surfsense_backend/app/tasks/celery_tasks/stale_notification_cleanup_task.py @@ -4,33 +4,41 @@ This task runs periodically (every 5 minutes by default) to find notifications that are stuck in "in_progress" status but don't have an active Redis heartbeat key. These are marked as "failed" to prevent the frontend from showing a perpetual "syncing" state. +Additionally, it cleans up documents stuck in pending/processing state that belong +to connectors with stale notifications. + Detection mechanism: - Active indexing tasks set a Redis key with TTL (2 minutes) as a heartbeat - If the task crashes, the Redis key expires automatically - This cleanup task checks for in-progress notifications without a Redis heartbeat key - Such notifications are marked as failed with O(1) batch UPDATE +- Documents with pending/processing status for those connectors are also marked as failed """ +import contextlib import json import logging import os from datetime import UTC, datetime import redis -from sqlalchemy import and_, text +from sqlalchemy import and_, or_, text from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine from sqlalchemy.future import select from sqlalchemy.pool import NullPool from app.celery_app import celery_app from app.config import config -from app.db import Notification +from app.db import Document, DocumentStatus, Notification logger = logging.getLogger(__name__) # Redis client for checking heartbeats _redis_client: redis.Redis | None = None +# Error message shown to users when sync is interrupted +STALE_SYNC_ERROR_MESSAGE = "Sync was interrupted unexpectedly. Please retry." + def get_redis_client() -> redis.Redis: """Get or create Redis client for heartbeat checking.""" @@ -70,6 +78,7 @@ def cleanup_stale_indexing_notifications_task(): - Do NOT have a corresponding Redis heartbeat key (meaning task crashed) And marks them as failed with O(1) batch UPDATE. + Also marks associated pending/processing documents as failed. """ import asyncio @@ -86,15 +95,20 @@ async def _cleanup_stale_notifications(): """Find and mark stale connector indexing notifications as failed. Uses Redis TTL-based detection: - 1. Find all in-progress notifications + 1. Find all in-progress notifications with their connector_id 2. Check which ones are missing their Redis heartbeat key 3. Mark those as failed with O(1) batch UPDATE using JSONB || operator + 4. Mark associated documents (pending/processing) as failed """ async with get_celery_session_maker()() as session: try: # Find all in-progress connector indexing notifications + # Fetch full metadata to properly extract connector_id result = await session.execute( - select(Notification.id).where( + select( + Notification.id, + Notification.notification_metadata, + ).where( and_( Notification.type == "connector_indexing", Notification.notification_metadata["status"].astext @@ -102,24 +116,37 @@ async def _cleanup_stale_notifications(): ) ) ) - in_progress_ids = [row[0] for row in result.fetchall()] + in_progress_rows = result.fetchall() - if not in_progress_ids: + if not in_progress_rows: logger.debug("No in-progress connector indexing notifications found") return # Check which ones are missing heartbeat keys in Redis redis_client = get_redis_client() stale_notification_ids = [] + stale_connector_ids = [] - for notification_id in in_progress_ids: + for row in in_progress_rows: + notification_id = row[0] + metadata = row[1] # Full metadata dict heartbeat_key = _get_heartbeat_key(notification_id) if not redis_client.exists(heartbeat_key): stale_notification_ids.append(notification_id) + # Extract connector_id from metadata dict for document cleanup + if metadata and isinstance(metadata, dict): + connector_id = metadata.get("connector_id") + logger.debug( + f"Notification {notification_id} metadata: {metadata}, " + f"connector_id: {connector_id}" + ) + if connector_id is not None: + with contextlib.suppress(ValueError, TypeError): + stale_connector_ids.append(int(connector_id)) if not stale_notification_ids: logger.debug( - f"All {len(in_progress_ids)} in-progress notifications have active Redis heartbeats" + f"All {len(in_progress_rows)} in-progress notifications have active Redis heartbeats" ) return @@ -127,18 +154,15 @@ async def _cleanup_stale_notifications(): f"Found {len(stale_notification_ids)} stale connector indexing notifications " f"(no Redis heartbeat key): {stale_notification_ids}" ) + logger.info(f"Connector IDs for document cleanup: {stale_connector_ids}") - # O(1) Batch UPDATE using JSONB || operator + # O(1) Batch UPDATE notifications using JSONB || operator # This merges the update data into existing notification_metadata # Also updates title and message for proper UI display - error_message = ( - "Something went wrong while syncing your content. Please retry." - ) - update_data = { "status": "failed", "completed_at": datetime.now(UTC).isoformat(), - "error_message": error_message, + "error_message": STALE_SYNC_ERROR_MESSAGE, "sync_stage": "failed", } @@ -152,16 +176,96 @@ async def _cleanup_stale_notifications(): """), { "update_json": json.dumps(update_data), - "display_message": f"{error_message}", + "display_message": STALE_SYNC_ERROR_MESSAGE, "ids": stale_notification_ids, }, ) - await session.commit() logger.info( - f"Successfully marked {len(stale_notification_ids)} stale notifications as failed (batch UPDATE)" + f"Successfully marked {len(stale_notification_ids)} stale notifications as failed" ) + # ===== Clean up stuck documents for stale connectors ===== + if stale_connector_ids: + await _cleanup_stuck_documents(session, stale_connector_ids) + + await session.commit() + except Exception as e: logger.error(f"Error cleaning up stale notifications: {e!s}", exc_info=True) await session.rollback() + + +async def _cleanup_stuck_documents(session, connector_ids: list[int]): + """ + Mark documents stuck in pending/processing state as failed for given connectors. + + This ensures that when a connector sync is interrupted, all partially-processed + documents are marked with a clear error state instead of being stuck indefinitely. + + Args: + session: Database session + connector_ids: List of connector IDs whose documents should be cleaned up + """ + if not connector_ids: + return + + try: + # Count documents that will be affected (for logging) + count_result = await session.execute( + select(Document.id).where( + and_( + Document.connector_id.in_(connector_ids), + or_( + Document.status["state"].astext == DocumentStatus.PENDING, + Document.status["state"].astext == DocumentStatus.PROCESSING, + ), + ) + ) + ) + stuck_doc_ids = [row[0] for row in count_result.fetchall()] + + if not stuck_doc_ids: + logger.debug(f"No stuck documents found for connector IDs: {connector_ids}") + return + + logger.warning( + f"Found {len(stuck_doc_ids)} stuck documents (pending/processing) " + f"for connector IDs {connector_ids}: {stuck_doc_ids[:20]}..." # Log first 20 + ) + + # O(1) Batch UPDATE: Mark all stuck documents as failed using JSONB + # The error message matches what we show in notifications + failed_status = DocumentStatus.failed(STALE_SYNC_ERROR_MESSAGE) + + await session.execute( + text(""" + UPDATE documents + SET status = CAST(:failed_status AS jsonb), + updated_at = :now + WHERE connector_id = ANY(:connector_ids) + AND ( + status->>'state' = :pending_state + OR status->>'state' = :processing_state + ) + """), + { + "failed_status": json.dumps(failed_status), + "now": datetime.now(UTC), + "connector_ids": connector_ids, + "pending_state": DocumentStatus.PENDING, + "processing_state": DocumentStatus.PROCESSING, + }, + ) + + logger.info( + f"Successfully marked {len(stuck_doc_ids)} stuck documents as failed " + f"for connector IDs: {connector_ids}" + ) + + except Exception as e: + logger.error( + f"Error cleaning up stuck documents for connectors {connector_ids}: {e!s}", + exc_info=True, + ) + # Don't raise - let the notification cleanup continue even if document cleanup fails diff --git a/surfsense_backend/app/tasks/connector_indexers/airtable_indexer.py b/surfsense_backend/app/tasks/connector_indexers/airtable_indexer.py index 029c4a87c..46cd069c9 100644 --- a/surfsense_backend/app/tasks/connector_indexers/airtable_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/airtable_indexer.py @@ -1,5 +1,9 @@ """ Airtable connector indexer. + +Implements real-time document status updates using a two-phase approach: +- Phase 1: Create all documents with PENDING status (visible in UI immediately) +- Phase 2: Process each document one by one (pending → processing → ready/failed) """ import time @@ -10,7 +14,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.config import config from app.connectors.airtable_history import AirtableHistoryConnector -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -27,6 +31,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -134,24 +139,32 @@ async def index_airtable_records( await task_logger.log_task_success( log_entry, success_msg, {"bases_count": 0} ) - return 0, success_msg + # CRITICAL: Update timestamp even when no bases found so Electric SQL syncs + await update_connector_last_indexed( + session, connector, update_last_indexed + ) + await session.commit() + return 0, None # Return None (not error) when no items found logger.info(f"Found {len(bases)} Airtable bases to process") # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() - total_documents_indexed = 0 - # Process each base + # Track overall statistics + documents_indexed = 0 + documents_skipped = 0 + documents_failed = 0 + duplicate_content_count = 0 + + # ======================================================================= + # PHASE 1: Collect all records and create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= + records_to_process = [] # List of dicts with document and record data + new_documents_created = False + for base in bases: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) - >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(total_documents_indexed) - last_heartbeat_time = time.time() base_id = base.get("id") base_name = base.get("name", "Unknown Base") @@ -201,7 +214,6 @@ async def index_airtable_records( max_records=max_records, ) ) - else: # Fetch all records records, records_error = airtable_connector.get_all_records( @@ -222,21 +234,14 @@ async def index_airtable_records( logger.info(f"Found {len(records)} records in table {table_name}") - documents_indexed = 0 - skipped_messages = [] - documents_skipped = 0 - # Process each record + # Phase 1: Analyze each record and create pending documents for record in records: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) - >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(total_documents_indexed) - last_heartbeat_time = time.time() - try: + record_id = record.get("id", "") + if not record_id: + documents_skipped += 1 + continue + # Generate markdown content markdown_content = ( airtable_connector.format_record_to_markdown( @@ -246,16 +251,11 @@ async def index_airtable_records( if not markdown_content.strip(): logger.warning( - f"Skipping message with no content: {record.get('id')}" - ) - skipped_messages.append( - f"{record.get('id')} (no content)" + f"Skipping record with no content: {record_id}" ) documents_skipped += 1 continue - record_id = record.get("id", "Unknown") - # Generate unique identifier hash for this Airtable record unique_identifier_hash = generate_unique_identifier_hash( DocumentType.AIRTABLE_CONNECTOR, @@ -278,77 +278,30 @@ async def index_airtable_records( if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: - logger.info( - f"Document for Airtable record {record_id} unchanged. Skipping." - ) + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): + existing_document.status = ( + DocumentStatus.ready() + ) documents_skipped += 1 continue - else: - # Content has changed - update the existing document - logger.info( - f"Content changed for Airtable record {record_id}. Updating document." - ) - # Generate document summary - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "record_id": record_id, - "created_time": record.get( - "CREATED_TIME()", "" - ), - "document_type": "Airtable Record", - "connector_type": "Airtable", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, - user_llm, - document_metadata, - ) - else: - summary_content = ( - f"Airtable Record: {record_id}\n\n" - ) - summary_embedding = ( - config.embedding_model_instance.embed( - summary_content - ) - ) - - # Process chunks - chunks = await create_document_chunks( - markdown_content - ) - - # Update existing document - existing_document.title = ( - f"Airtable Record: {record_id}" - ) - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { + # Queue existing document for update (will be set to processing in Phase 2) + records_to_process.append( + { + "document": existing_document, + "is_new": False, + "markdown_content": markdown_content, + "content_hash": content_hash, "record_id": record_id, - "created_time": record.get( - "CREATED_TIME()", "" - ), + "record": record, + "base_name": base_name, + "table_name": table_name, } - existing_document.chunks = chunks - existing_document.updated_at = ( - get_current_timestamp() - ) - - documents_indexed += 1 - logger.info( - f"Successfully updated Airtable record {record_id}" - ) - continue + ) + continue # Document doesn't exist by unique_identifier_hash # Check if a document with the same content_hash exists (from another connector) @@ -365,123 +318,210 @@ async def index_airtable_records( f"(existing document ID: {duplicate_by_content.id}, " f"type: {duplicate_by_content.document_type}). Skipping." ) + duplicate_content_count += 1 documents_skipped += 1 continue - # Document doesn't exist - create new one - # Generate document summary - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "record_id": record_id, - "created_time": record.get("CREATED_TIME()", ""), - "document_type": "Airtable Record", - "connector_type": "Airtable", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - # Fallback to simple summary if no LLM configured - summary_content = f"Airtable Record: {record_id}\n\n" - summary_embedding = ( - config.embedding_model_instance.embed( - summary_content - ) - ) - - # Process chunks - chunks = await create_document_chunks(markdown_content) - - # Create and store new document - logger.info( - f"Creating new document for Airtable record: {record_id}" - ) + # Create new document with PENDING status (visible in UI immediately) document = Document( search_space_id=search_space_id, - title=f"Airtable Record: {record_id}", + title=record_id, document_type=DocumentType.AIRTABLE_CONNECTOR, document_metadata={ "record_id": record_id, "created_time": record.get("CREATED_TIME()", ""), + "base_name": base_name, + "table_name": table_name, + "connector_id": connector_id, }, - content=summary_content, - content_hash=content_hash, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector_id, ) - session.add(document) - documents_indexed += 1 - logger.info( - f"Successfully indexed new Airtable record {summary_content}" - ) + new_documents_created = True - # Batch commit every 10 documents - if documents_indexed % 10 == 0: - logger.info( - f"Committing batch: {documents_indexed} Airtable records processed so far" - ) - await session.commit() + records_to_process.append( + { + "document": document, + "is_new": True, + "markdown_content": markdown_content, + "content_hash": content_hash, + "record_id": record_id, + "record": record, + "base_name": base_name, + "table_name": table_name, + } + ) except Exception as e: logger.error( - f"Error processing the Airtable record {record.get('id', 'Unknown')}: {e!s}", - exc_info=True, + f"Error in Phase 1 for record: {e!s}", exc_info=True ) - skipped_messages.append( - f"{record.get('id', 'Unknown')} (processing error)" - ) - documents_skipped += 1 - continue # Skip this message and continue with others + documents_failed += 1 + continue - # Accumulate total processed across all tables - total_processed += documents_indexed + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info( + f"Phase 1: Committing {len([r for r in records_to_process if r['is_new']])} pending documents" + ) + await session.commit() - # Final commit for any remaining documents not yet committed in batches - if documents_indexed > 0: + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(records_to_process)} documents") + + for item in records_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item["document"] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (LLM, embeddings, chunks) + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata_for_summary = { + "record_id": item["record_id"], + "created_time": item["record"].get("CREATED_TIME()", ""), + "document_type": "Airtable Record", + "connector_type": "Airtable", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + item["markdown_content"], + user_llm, + document_metadata_for_summary, + ) + else: + # Fallback to simple summary if no LLM configured + summary_content = f"Airtable Record: {item['record_id']}\n\n" + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(item["markdown_content"]) + + # Update document to READY with actual content + document.title = item["record_id"] + document.content = summary_content + document.content_hash = item["content_hash"] + document.embedding = summary_embedding + document.document_metadata = { + "record_id": item["record_id"], + "created_time": item["record"].get("CREATED_TIME()", ""), + "base_name": item["base_name"], + "table_name": item["table_name"], + "connector_id": connector_id, + } + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + + documents_indexed += 1 + + # Batch commit every 10 documents (for ready status updates) + if documents_indexed % 10 == 0: logger.info( - f"Final commit for table {table_name}: {documents_indexed} Airtable records processed" + f"Committing batch: {documents_indexed} Airtable records processed so far" ) await session.commit() - logger.info( - f"Successfully committed all Airtable document changes for table {table_name}" - ) - # Update the last_indexed_at timestamp for the connector only if requested - # (after all tables in all bases are processed) - if total_processed > 0: - await update_connector_last_indexed( - session, connector, update_last_indexed + except Exception as e: + logger.error( + f"Error processing Airtable record: {e!s}", exc_info=True + ) + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error( + f"Failed to update document status to failed: {status_error}" + ) + documents_failed += 1 + continue + + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + await update_connector_last_indexed(session, connector, update_last_indexed) + + total_processed = documents_indexed + + # Final commit to ensure all documents are persisted (safety net) + logger.info( + f"Final commit: Total {documents_indexed} Airtable records processed" + ) + try: + await session.commit() + logger.info( + "Successfully committed all Airtable document changes to database" ) + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same record was indexed by multiple connectors. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + # Don't fail the entire task - some documents may have been successfully indexed + else: + raise + + # Build warning message if there were issues + warning_parts = [] + if duplicate_content_count > 0: + warning_parts.append(f"{duplicate_content_count} duplicate") + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None # Log success after processing all bases and tables await task_logger.log_task_success( log_entry, f"Successfully completed Airtable indexing for connector {connector_id}", { - "events_processed": total_processed, - "documents_indexed": total_processed, + "documents_indexed": documents_indexed, + "documents_skipped": documents_skipped, + "documents_failed": documents_failed, + "duplicate_content_count": duplicate_content_count, }, ) logger.info( - f"Airtable indexing completed: {total_processed} total records processed" + f"Airtable indexing completed: {documents_indexed} ready, " + f"{documents_skipped} skipped, {documents_failed} failed " + f"({duplicate_content_count} duplicate content)" ) return ( total_processed, - None, - ) # Return None as the error message to indicate success + warning_message, + ) except Exception as e: logger.error( diff --git a/surfsense_backend/app/tasks/connector_indexers/base.py b/surfsense_backend/app/tasks/connector_indexers/base.py index 5dc438b9b..6a1226230 100644 --- a/surfsense_backend/app/tasks/connector_indexers/base.py +++ b/surfsense_backend/app/tasks/connector_indexers/base.py @@ -28,6 +28,35 @@ def get_current_timestamp() -> datetime: return datetime.now(UTC) +def safe_set_chunks(document: Document, chunks: list) -> None: + """ + Safely assign chunks to a document without triggering lazy loading. + + ALWAYS use this instead of `document.chunks = chunks` to avoid + SQLAlchemy async errors (MissingGreenlet / greenlet_spawn). + + Why this is needed: + - Direct assignment `document.chunks = chunks` triggers SQLAlchemy to + load the OLD chunks first (for comparison/orphan detection) + - This lazy loading fails in async context with asyncpg driver + - set_committed_value bypasses this by setting the value directly + + This function is safe regardless of how the document was loaded + (with or without selectinload). + + Args: + document: The Document object to update + chunks: List of Chunk objects to assign + + Example: + # Instead of: document.chunks = chunks (DANGEROUS!) + safe_set_chunks(document, chunks) # Always safe + """ + from sqlalchemy.orm.attributes import set_committed_value + + set_committed_value(document, "chunks", chunks) + + def parse_date_flexible(date_str: str) -> datetime: """ Parse date from multiple common formats. diff --git a/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py b/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py index fe608a8c9..d60884539 100644 --- a/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py @@ -1,5 +1,9 @@ """ BookStack connector indexer. + +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Collect all pages and create pending documents (visible in UI immediately) +- Phase 2: Process each page: pending → processing → ready/failed """ import time @@ -11,7 +15,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.config import config from app.connectors.bookstack_connector import BookStackConnector -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -28,6 +32,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -184,22 +189,22 @@ async def index_bookstack_pages( logger.error(f"Error fetching BookStack pages: {e!s}", exc_info=True) return 0, f"Error fetching BookStack pages: {e!s}" - # Process and index each page + # ======================================================================= + # PHASE 1: Analyze all pages, create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= documents_indexed = 0 skipped_pages = [] documents_skipped = 0 + documents_failed = 0 # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() + pages_to_process = [] # List of dicts with document and page data + new_documents_created = False + for page in pages: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() try: page_id = page.get("id") page_name = page.get("name", "") @@ -218,7 +223,7 @@ async def index_bookstack_pages( # Fetch full page content (Markdown preferred) try: - page_detail, page_content = bookstack_client.get_page_with_content( + _, page_content = bookstack_client.get_page_with_content( page_id, use_markdown=True ) except Exception as e: @@ -252,82 +257,38 @@ async def index_bookstack_pages( # Build page URL page_url = f"{bookstack_base_url}/books/{book_slug}/page/{page_slug}" - # Build document metadata - doc_metadata = { - "page_id": page_id, - "page_name": page_name, - "page_slug": page_slug, - "book_id": book_id, - "book_slug": book_slug, - "chapter_id": chapter_id, - "base_url": bookstack_base_url, - "page_url": page_url, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - } - if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): + existing_document.status = DocumentStatus.ready() logger.info( f"Document for BookStack page {page_name} unchanged. Skipping." ) documents_skipped += 1 continue - else: - # Content has changed - update the existing document - logger.info( - f"Content changed for BookStack page {page_name}. Updating document." - ) - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - summary_metadata = { - "page_name": page_name, - "page_id": page_id, - "book_id": book_id, - "document_type": "BookStack Page", - "connector_type": "BookStack", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - full_content, user_llm, summary_metadata - ) - else: - summary_content = ( - f"BookStack Page: {page_name}\n\nBook ID: {book_id}\n\n" - ) - if page_content: - content_preview = page_content[:1000] - if len(page_content) > 1000: - content_preview += "..." - summary_content += ( - f"Content Preview: {content_preview}\n\n" - ) - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - chunks = await create_document_chunks(full_content) - - # Update existing document - existing_document.title = f"BookStack - {page_name}" - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = doc_metadata - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - logger.info(f"Successfully updated BookStack page {page_name}") - continue + # Queue existing document for update (will be set to processing in Phase 2) + pages_to_process.append( + { + "document": existing_document, + "is_new": False, + "page_id": page_id, + "page_name": page_name, + "page_slug": page_slug, + "book_id": book_id, + "book_slug": book_slug, + "chapter_id": chapter_id, + "page_url": page_url, + "page_content": page_content, + "full_content": full_content, + "content_hash": content_hash, + } + ) + continue # Document doesn't exist by unique_identifier_hash # Check if a document with the same content_hash exists (from another connector) @@ -345,17 +306,108 @@ async def index_bookstack_pages( documents_skipped += 1 continue - # Document doesn't exist - create new one - # Generate summary with metadata + # Create new document with PENDING status (visible in UI immediately) + document = Document( + search_space_id=search_space_id, + title=page_name, + document_type=DocumentType.BOOKSTACK_CONNECTOR, + document_metadata={ + "page_id": page_id, + "page_name": page_name, + "page_slug": page_slug, + "book_id": book_id, + "book_slug": book_slug, + "chapter_id": chapter_id, + "base_url": bookstack_base_url, + "page_url": page_url, + "connector_id": connector_id, + }, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready + unique_identifier_hash=unique_identifier_hash, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts + updated_at=get_current_timestamp(), + created_by_id=user_id, + connector_id=connector_id, + ) + session.add(document) + new_documents_created = True + + pages_to_process.append( + { + "document": document, + "is_new": True, + "page_id": page_id, + "page_name": page_name, + "page_slug": page_slug, + "book_id": book_id, + "book_slug": book_slug, + "chapter_id": chapter_id, + "page_url": page_url, + "page_content": page_content, + "full_content": full_content, + "content_hash": content_hash, + } + ) + + except Exception as e: + logger.error(f"Error in Phase 1 for page: {e!s}", exc_info=True) + documents_failed += 1 + continue + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info( + f"Phase 1: Committing {len([p for p in pages_to_process if p['is_new']])} pending documents" + ) + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(pages_to_process)} documents") + + for item in pages_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item["document"] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (LLM, embeddings, chunks) user_llm = await get_user_long_context_llm( session, user_id, search_space_id ) + # Build document metadata + doc_metadata = { + "page_id": item["page_id"], + "page_name": item["page_name"], + "page_slug": item["page_slug"], + "book_id": item["book_id"], + "book_slug": item["book_slug"], + "chapter_id": item["chapter_id"], + "base_url": bookstack_base_url, + "page_url": item["page_url"], + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, + } + if user_llm: summary_metadata = { - "page_name": page_name, - "page_id": page_id, - "book_id": book_id, + "page_name": item["page_name"], + "page_id": item["page_id"], + "book_id": item["book_id"], "document_type": "BookStack Page", "connector_type": "BookStack", } @@ -363,17 +415,15 @@ async def index_bookstack_pages( summary_content, summary_embedding, ) = await generate_document_summary( - full_content, user_llm, summary_metadata + item["full_content"], user_llm, summary_metadata ) else: # Fallback to simple summary if no LLM configured - summary_content = ( - f"BookStack Page: {page_name}\n\nBook ID: {book_id}\n\n" - ) - if page_content: + summary_content = f"BookStack Page: {item['page_name']}\n\nBook ID: {item['book_id']}\n\n" + if item["page_content"]: # Take first 1000 characters of content for summary - content_preview = page_content[:1000] - if len(page_content) > 1000: + content_preview = item["page_content"][:1000] + if len(item["page_content"]) > 1000: content_preview += "..." summary_content += f"Content Preview: {content_preview}\n\n" summary_embedding = config.embedding_model_instance.embed( @@ -381,30 +431,21 @@ async def index_bookstack_pages( ) # Process chunks - using the full page content - chunks = await create_document_chunks(full_content) + chunks = await create_document_chunks(item["full_content"]) - # Create and store new document - logger.info(f"Creating new document for page {page_name}") - document = Document( - search_space_id=search_space_id, - title=f"BookStack - {page_name}", - document_type=DocumentType.BOOKSTACK_CONNECTOR, - document_metadata=doc_metadata, - content=summary_content, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, - updated_at=get_current_timestamp(), - created_by_id=user_id, - connector_id=connector_id, - ) + # Update document to READY with actual content + document.title = item["page_name"] + document.content = summary_content + document.content_hash = item["content_hash"] + document.embedding = summary_embedding + document.document_metadata = doc_metadata + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() - session.add(document) documents_indexed += 1 - logger.info(f"Successfully indexed new page {page_name}") - # Batch commit every 10 documents + # Batch commit every 10 documents (for ready status updates) if documents_indexed % 10 == 0: logger.info( f"Committing batch: {documents_indexed} BookStack pages processed so far" @@ -413,46 +454,76 @@ async def index_bookstack_pages( except Exception as e: logger.error( - f"Error processing page {page.get('name', 'Unknown')}: {e!s}", + f"Error processing page {item.get('page_name', 'Unknown')}: {e!s}", exc_info=True, ) + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error( + f"Failed to update document status to failed: {status_error}" + ) skipped_pages.append( - f"{page.get('name', 'Unknown')} (processing error)" + f"{item.get('page_name', 'Unknown')} (processing error)" ) - documents_skipped += 1 - continue # Skip this page and continue with others + documents_failed += 1 + continue - # Update the last_indexed_at timestamp for the connector only if requested - total_processed = documents_indexed - if update_last_indexed: - await update_connector_last_indexed(session, connector, update_last_indexed) + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + # This ensures the UI shows "Last indexed" instead of "Never indexed" + await update_connector_last_indexed(session, connector, update_last_indexed) # Final commit for any remaining documents not yet committed in batches logger.info( f"Final commit: Total {documents_indexed} BookStack pages processed" ) - await session.commit() - logger.info("Successfully committed all BookStack document changes to database") + try: + await session.commit() + logger.info( + "Successfully committed all BookStack document changes to database" + ) + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same page was indexed by multiple connectors. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + # Don't fail the entire task - some documents may have been successfully indexed + else: + raise + + # Build warning message if there were issues + warning_parts = [] + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None # Log success await task_logger.log_task_success( log_entry, f"Successfully completed BookStack indexing for connector {connector_id}", { - "pages_processed": total_processed, + "pages_processed": documents_indexed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, + "documents_failed": documents_failed, "skipped_pages_count": len(skipped_pages), }, ) logger.info( - f"BookStack indexing completed: {documents_indexed} new pages, {documents_skipped} skipped" + f"BookStack indexing completed: {documents_indexed} ready, " + f"{documents_skipped} skipped, {documents_failed} failed" ) - return ( - total_processed, - None, - ) # Return None as the error message to indicate success + return documents_indexed, warning_message except SQLAlchemyError as db_error: await session.rollback() diff --git a/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py b/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py index a8991647c..47c5d8b3b 100644 --- a/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py @@ -1,5 +1,9 @@ """ ClickUp connector indexer. + +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Create all documents with 'pending' status (visible in UI immediately) +- Phase 2: Process each document: pending → processing → ready/failed """ import contextlib @@ -12,7 +16,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.config import config from app.connectors.clickup_history import ClickUpHistoryConnector -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -28,6 +32,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -141,10 +146,18 @@ async def index_clickup_tasks( documents_indexed = 0 documents_skipped = 0 + documents_failed = 0 # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() + # ======================================================================= + # PHASE 1: Collect all tasks and create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= + tasks_to_process = [] # List of dicts with document and task data + new_documents_created = False + # Iterate workspaces and fetch tasks for workspace in workspaces: workspace_id = workspace.get("id") @@ -183,15 +196,6 @@ async def index_clickup_tasks( ) for task in tasks: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) - >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() - try: task_id = task.get("id") task_name = task.get("name", "Untitled Task") @@ -255,73 +259,38 @@ async def index_clickup_tasks( if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): + existing_document.status = DocumentStatus.ready() logger.info( f"Document for ClickUp task {task_name} unchanged. Skipping." ) documents_skipped += 1 continue else: - # Content has changed - update the existing document + # Queue existing document for update (will be set to processing in Phase 2) logger.info( - f"Content changed for ClickUp task {task_name}. Updating document." + f"Content changed for ClickUp task {task_name}. Queuing for update." ) - - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { + tasks_to_process.append( + { + "document": existing_document, + "is_new": False, + "task_content": task_content, + "content_hash": content_hash, "task_id": task_id, "task_name": task_name, "task_status": task_status, "task_priority": task_priority, - "task_list": task_list_name, - "task_space": task_space_name, - "assignees": len(task_assignees), - "document_type": "ClickUp Task", - "connector_type": "ClickUp", + "task_list_name": task_list_name, + "task_space_name": task_space_name, + "task_assignees": task_assignees, + "task_due_date": task_due_date, + "task_created": task_created, + "task_updated": task_updated, } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - task_content, user_llm, document_metadata - ) - else: - summary_content = task_content - summary_embedding = ( - config.embedding_model_instance.embed(task_content) - ) - - # Process chunks - chunks = await create_document_chunks(task_content) - - # Update existing document - existing_document.title = f"Task - {task_name}" - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "task_id": task_id, - "task_name": task_name, - "task_status": task_status, - "task_priority": task_priority, - "task_assignees": task_assignees, - "task_due_date": task_due_date, - "task_created": task_created, - "task_updated": task_updated, - "indexed_at": datetime.now().strftime( - "%Y-%m-%d %H:%M:%S" - ), - } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - logger.info( - f"Successfully updated ClickUp task {task_name}" ) continue @@ -341,42 +310,10 @@ async def index_clickup_tasks( documents_skipped += 1 continue - # Document doesn't exist - create new one - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "task_id": task_id, - "task_name": task_name, - "task_status": task_status, - "task_priority": task_priority, - "task_list": task_list_name, - "task_space": task_space_name, - "assignees": len(task_assignees), - "document_type": "ClickUp Task", - "connector_type": "ClickUp", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - task_content, user_llm, document_metadata - ) - else: - # Fallback to simple summary if no LLM configured - summary_content = task_content - summary_embedding = config.embedding_model_instance.embed( - task_content - ) - - chunks = await create_document_chunks(task_content) - + # Create new document with PENDING status (visible in UI immediately) document = Document( search_space_id=search_space_id, - title=f"Task - {task_name}", + title=task_name, document_type=DocumentType.CLICKUP_CONNECTOR, document_metadata={ "task_id": task_id, @@ -387,44 +324,180 @@ async def index_clickup_tasks( "task_due_date": task_due_date, "task_created": task_created, "task_updated": task_updated, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, }, - content=summary_content, - content_hash=content_hash, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector_id, ) - session.add(document) - documents_indexed += 1 - logger.info(f"Successfully indexed new task {task_name}") + new_documents_created = True - # Batch commit every 10 documents - if documents_indexed % 10 == 0: - logger.info( - f"Committing batch: {documents_indexed} ClickUp tasks processed so far" - ) - await session.commit() + tasks_to_process.append( + { + "document": document, + "is_new": True, + "task_content": task_content, + "content_hash": content_hash, + "task_id": task_id, + "task_name": task_name, + "task_status": task_status, + "task_priority": task_priority, + "task_list_name": task_list_name, + "task_space_name": task_space_name, + "task_assignees": task_assignees, + "task_due_date": task_due_date, + "task_created": task_created, + "task_updated": task_updated, + } + ) except Exception as e: logger.error( - f"Error processing task {task.get('name', 'Unknown')}: {e!s}", + f"Error in Phase 1 for task {task.get('name', 'Unknown')}: {e!s}", exc_info=True, ) - documents_skipped += 1 + documents_failed += 1 + continue + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info( + f"Phase 1: Committing {len([t for t in tasks_to_process if t['is_new']])} pending documents" + ) + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(tasks_to_process)} documents") + + for item in tasks_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item["document"] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (LLM, embeddings, chunks) + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata_for_summary = { + "task_id": item["task_id"], + "task_name": item["task_name"], + "task_status": item["task_status"], + "task_priority": item["task_priority"], + "task_list": item["task_list_name"], + "task_space": item["task_space_name"], + "assignees": len(item["task_assignees"]), + "document_type": "ClickUp Task", + "connector_type": "ClickUp", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + item["task_content"], user_llm, document_metadata_for_summary + ) + else: + summary_content = item["task_content"] + summary_embedding = config.embedding_model_instance.embed( + item["task_content"] + ) + + chunks = await create_document_chunks(item["task_content"]) + + # Update document to READY with actual content + document.title = item["task_name"] + document.content = summary_content + document.content_hash = item["content_hash"] + document.embedding = summary_embedding + document.document_metadata = { + "task_id": item["task_id"], + "task_name": item["task_name"], + "task_status": item["task_status"], + "task_priority": item["task_priority"], + "task_assignees": item["task_assignees"], + "task_due_date": item["task_due_date"], + "task_created": item["task_created"], + "task_updated": item["task_updated"], + "connector_id": connector_id, + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + } + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + + documents_indexed += 1 + + # Batch commit every 10 documents (for ready status updates) + if documents_indexed % 10 == 0: + logger.info( + f"Committing batch: {documents_indexed} ClickUp tasks processed so far" + ) + await session.commit() + + except Exception as e: + logger.error( + f"Error processing task {item.get('task_name', 'Unknown')}: {e!s}", + exc_info=True, + ) + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error( + f"Failed to update document status to failed: {status_error}" + ) + documents_failed += 1 + continue total_processed = documents_indexed - if total_processed > 0: - await update_connector_last_indexed(session, connector, update_last_indexed) + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + # This ensures the UI shows "Last indexed" instead of "Never indexed" + await update_connector_last_indexed(session, connector, update_last_indexed) # Final commit for any remaining documents not yet committed in batches logger.info(f"Final commit: Total {documents_indexed} ClickUp tasks processed") - await session.commit() + try: + await session.commit() + logger.info( + "Successfully committed all ClickUp document changes to database" + ) + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same task was indexed by multiple connectors. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + # Don't fail the entire task - some documents may have been successfully indexed + else: + raise await task_logger.log_task_success( log_entry, @@ -433,11 +506,12 @@ async def index_clickup_tasks( "pages_processed": total_processed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, + "documents_failed": documents_failed, }, ) logger.info( - f"clickup indexing completed: {documents_indexed} new tasks, {documents_skipped} skipped" + f"clickup indexing completed: {documents_indexed} ready, {documents_skipped} skipped, {documents_failed} failed" ) # Close client connection diff --git a/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py index 24859e685..a3a059d4e 100644 --- a/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py @@ -1,5 +1,9 @@ """ Confluence connector indexer. + +Provides real-time document status updates during indexing using a two-phase approach: +- Phase 1: Create all documents with PENDING status (visible in UI immediately) +- Phase 2: Process each document one by one (PENDING → PROCESSING → READY/FAILED) """ import contextlib @@ -12,7 +16,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.config import config from app.connectors.confluence_history import ConfluenceHistoryConnector -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -29,6 +33,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -180,22 +185,22 @@ async def index_confluence_pages( await confluence_client.close() return 0, f"Error fetching Confluence pages: {e!s}" - # Process and index each page + # ======================================================================= + # PHASE 1: Analyze all pages, create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= documents_indexed = 0 - skipped_pages = [] documents_skipped = 0 + documents_failed = 0 + duplicate_content_count = 0 # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() + pages_to_process = [] # List of dicts with document and page data + new_documents_created = False + for page in pages: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() try: page_id = page.get("id") page_title = page.get("title", "") @@ -205,7 +210,6 @@ async def index_confluence_pages( logger.warning( f"Skipping page with missing ID or title: {page_id or 'Unknown'}" ) - skipped_pages.append(f"{page_title or 'Unknown'} (missing data)") documents_skipped += 1 continue @@ -236,7 +240,6 @@ async def index_confluence_pages( if not full_content.strip(): logger.warning(f"Skipping page with no content: {page_title}") - skipped_pages.append(f"{page_title} (no content)") documents_skipped += 1 continue @@ -258,74 +261,29 @@ async def index_confluence_pages( if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: - logger.info( - f"Document for Confluence page {page_title} unchanged. Skipping." - ) + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): + existing_document.status = DocumentStatus.ready() documents_skipped += 1 continue - else: - # Content has changed - update the existing document - logger.info( - f"Content changed for Confluence page {page_title}. Updating document." - ) - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "page_title": page_title, - "page_id": page_id, - "space_id": space_id, - "comment_count": comment_count, - "document_type": "Confluence Page", - "connector_type": "Confluence", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - full_content, user_llm, document_metadata - ) - else: - summary_content = f"Confluence Page: {page_title}\n\nSpace ID: {space_id}\n\n" - if page_content: - content_preview = page_content[:1000] - if len(page_content) > 1000: - content_preview += "..." - summary_content += ( - f"Content Preview: {content_preview}\n\n" - ) - summary_content += f"Comments: {comment_count}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - chunks = await create_document_chunks(full_content) - - # Update existing document - existing_document.title = f"Confluence - {page_title}" - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { + # Queue existing document for update (will be set to processing in Phase 2) + pages_to_process.append( + { + "document": existing_document, + "is_new": False, + "full_content": full_content, + "page_content": page_content, + "content_hash": content_hash, "page_id": page_id, "page_title": page_title, "space_id": space_id, "comment_count": comment_count, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - logger.info( - f"Successfully updated Confluence page {page_title}" - ) - continue + ) + continue # Document doesn't exist by unique_identifier_hash # Check if a document with the same content_hash exists (from another connector) @@ -340,21 +298,92 @@ async def index_confluence_pages( f"(existing document ID: {duplicate_by_content.id}, " f"type: {duplicate_by_content.document_type}). Skipping." ) + duplicate_content_count += 1 documents_skipped += 1 continue - # Document doesn't exist - create new one - # Generate summary with metadata + # Create new document with PENDING status (visible in UI immediately) + document = Document( + search_space_id=search_space_id, + title=page_title, + document_type=DocumentType.CONFLUENCE_CONNECTOR, + document_metadata={ + "page_id": page_id, + "page_title": page_title, + "space_id": space_id, + "comment_count": comment_count, + "connector_id": connector_id, + }, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready + unique_identifier_hash=unique_identifier_hash, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts + updated_at=get_current_timestamp(), + created_by_id=user_id, + connector_id=connector_id, + ) + session.add(document) + new_documents_created = True + + pages_to_process.append( + { + "document": document, + "is_new": True, + "full_content": full_content, + "page_content": page_content, + "content_hash": content_hash, + "page_id": page_id, + "page_title": page_title, + "space_id": space_id, + "comment_count": comment_count, + } + ) + + except Exception as e: + logger.error(f"Error in Phase 1 for page: {e!s}", exc_info=True) + documents_failed += 1 + continue + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info( + f"Phase 1: Committing {len([p for p in pages_to_process if p['is_new']])} pending documents" + ) + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(pages_to_process)} documents") + + for item in pages_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item["document"] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (LLM, embeddings, chunks) user_llm = await get_user_long_context_llm( session, user_id, search_space_id ) if user_llm: document_metadata = { - "page_title": page_title, - "page_id": page_id, - "space_id": space_id, - "comment_count": comment_count, + "page_title": item["page_title"], + "page_id": item["page_id"], + "space_id": item["space_id"], + "comment_count": item["comment_count"], "document_type": "Confluence Page", "connector_type": "Confluence", } @@ -362,55 +391,45 @@ async def index_confluence_pages( summary_content, summary_embedding, ) = await generate_document_summary( - full_content, user_llm, document_metadata + item["full_content"], user_llm, document_metadata ) else: # Fallback to simple summary if no LLM configured - summary_content = ( - f"Confluence Page: {page_title}\n\nSpace ID: {space_id}\n\n" - ) - if page_content: - # Take first 500 characters of content for summary - content_preview = page_content[:1000] - if len(page_content) > 1000: + summary_content = f"Confluence Page: {item['page_title']}\n\nSpace ID: {item['space_id']}\n\n" + if item["page_content"]: + # Take first 1000 characters of content for summary + content_preview = item["page_content"][:1000] + if len(item["page_content"]) > 1000: content_preview += "..." summary_content += f"Content Preview: {content_preview}\n\n" - summary_content += f"Comments: {comment_count}" + summary_content += f"Comments: {item['comment_count']}" summary_embedding = config.embedding_model_instance.embed( summary_content ) # Process chunks - using the full page content with comments - chunks = await create_document_chunks(full_content) + chunks = await create_document_chunks(item["full_content"]) - # Create and store new document - logger.info(f"Creating new document for page {page_title}") - document = Document( - search_space_id=search_space_id, - title=f"Confluence - {page_title}", - document_type=DocumentType.CONFLUENCE_CONNECTOR, - document_metadata={ - "page_id": page_id, - "page_title": page_title, - "space_id": space_id, - "comment_count": comment_count, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - }, - content=summary_content, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, - updated_at=get_current_timestamp(), - created_by_id=user_id, - connector_id=connector_id, - ) + # Update document to READY with actual content + document.title = item["page_title"] + document.content = summary_content + document.content_hash = item["content_hash"] + document.embedding = summary_embedding + document.document_metadata = { + "page_id": item["page_id"], + "page_title": item["page_title"], + "space_id": item["space_id"], + "comment_count": item["comment_count"], + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, + } + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() - session.add(document) documents_indexed += 1 - logger.info(f"Successfully indexed new page {page_title}") - # Batch commit every 10 documents + # Batch commit every 10 documents (for ready status updates) if documents_indexed % 10 == 0: logger.info( f"Committing batch: {documents_indexed} Confluence pages processed so far" @@ -419,53 +438,80 @@ async def index_confluence_pages( except Exception as e: logger.error( - f"Error processing page {page.get('title', 'Unknown')}: {e!s}", + f"Error processing page {item.get('page_title', 'Unknown')}: {e!s}", exc_info=True, ) - skipped_pages.append( - f"{page.get('title', 'Unknown')} (processing error)" - ) - documents_skipped += 1 + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error( + f"Failed to update document status to failed: {status_error}" + ) + documents_failed += 1 continue # Skip this page and continue with others - # Update the last_indexed_at timestamp for the connector only if requested - total_processed = documents_indexed - if update_last_indexed: - await update_connector_last_indexed(session, connector, update_last_indexed) + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + # This ensures the UI shows "Last indexed" instead of "Never indexed" + await update_connector_last_indexed(session, connector, update_last_indexed) - # Final commit for any remaining documents not yet committed in batches + # Final commit to ensure all documents are persisted (safety net) logger.info( f"Final commit: Total {documents_indexed} Confluence pages processed" ) - await session.commit() - logger.info( - "Successfully committed all Confluence document changes to database" - ) + try: + await session.commit() + logger.info( + "Successfully committed all Confluence document changes to database" + ) + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same page was indexed by multiple connectors. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + # Don't fail the entire task - some documents may have been successfully indexed + else: + raise + + # Build warning message if there were issues + warning_parts = [] + if duplicate_content_count > 0: + warning_parts.append(f"{duplicate_content_count} duplicate") + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None # Log success await task_logger.log_task_success( log_entry, f"Successfully completed Confluence indexing for connector {connector_id}", { - "pages_processed": total_processed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, - "skipped_pages_count": len(skipped_pages), + "documents_failed": documents_failed, + "duplicate_content_count": duplicate_content_count, }, ) logger.info( - f"Confluence indexing completed: {documents_indexed} new pages, {documents_skipped} skipped" + f"Confluence indexing completed: {documents_indexed} ready, " + f"{documents_skipped} skipped, {documents_failed} failed " + f"({duplicate_content_count} duplicate content)" ) # Close the client connection if confluence_client: await confluence_client.close() - return ( - total_processed, - None, - ) # Return None as the error message to indicate success + return documents_indexed, warning_message except SQLAlchemyError as db_error: await session.rollback() diff --git a/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py b/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py index 4999ba6d4..1595897a0 100644 --- a/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py @@ -1,5 +1,9 @@ """ Discord connector indexer. + +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Create all documents with 'pending' status (visible in UI immediately) +- Phase 2: Process each document: pending → processing → ready/failed """ import asyncio @@ -12,7 +16,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.config import config from app.connectors.discord_connector import DiscordConnector -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, @@ -27,6 +31,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -48,7 +53,11 @@ async def index_discord_messages( on_heartbeat_callback: HeartbeatCallbackType | None = None, ) -> tuple[int, str | None]: """ - Index Discord messages from all accessible channels. + Index Discord messages from the configured guild's channels. + + Implements 2-phase document status updates for real-time UI feedback: + - Phase 1: Create all documents with 'pending' status (visible in UI immediately) + - Phase 2: Process each document: pending → processing → ready/failed Args: session: Database session @@ -113,6 +122,37 @@ async def index_discord_messages( logger.info(f"Starting Discord indexing for connector {connector_id}") + # ======================================================================= + # GUILD FILTERING: Only index the specific guild configured for this connector + # ======================================================================= + # Extract guild_id from connector config (set during OAuth flow) + configured_guild_id = connector.config.get("guild_id") + configured_guild_name = connector.config.get("guild_name") + + # Legacy connector check - if no guild_id, we need to warn and handle gracefully + is_legacy_connector = configured_guild_id is None + + if is_legacy_connector: + logger.warning( + f"Discord connector {connector_id} has no guild_id configured. " + "This is a legacy connector. Please reconnect the Discord server to fix this. " + "For now, indexing will be skipped to prevent indexing unwanted servers." + ) + await task_logger.log_task_failure( + log_entry, + f"Legacy Discord connector {connector_id} missing guild_id", + "No guild_id configured. Please reconnect this Discord server.", + {"error_type": "MissingGuildId", "is_legacy": True}, + ) + return ( + 0, + "This Discord connector needs to be reconnected. Please disconnect and reconnect your Discord server to enable indexing.", + ) + + logger.info( + f"Configured to index guild: {configured_guild_name} ({configured_guild_id})" + ) + # Initialize Discord client with OAuth credentials support await task_logger.log_task_progress( log_entry, @@ -255,77 +295,66 @@ async def index_discord_messages( try: await task_logger.log_task_progress( log_entry, - f"Starting Discord bot and fetching guilds for connector {connector_id}", - {"stage": "fetch_guilds"}, + f"Starting Discord bot for connector {connector_id}", + {"stage": "bot_initialization"}, ) - logger.info("Starting Discord bot to fetch guilds") + logger.info("Starting Discord bot") discord_client._bot_task = asyncio.create_task(discord_client.start_bot()) await discord_client._wait_until_ready() - logger.info("Fetching Discord guilds") - guilds = await discord_client.get_guilds() - logger.info(f"Found {len(guilds)} guilds") + # We only process the configured guild, not all guilds + logger.info( + f"Processing configured guild only: {configured_guild_name} ({configured_guild_id})" + ) + except Exception as e: await task_logger.log_task_failure( log_entry, - f"Failed to get Discord guilds for connector {connector_id}", + f"Failed to start Discord bot for connector {connector_id}", str(e), - {"error_type": "GuildFetchError"}, + {"error_type": "BotStartError"}, ) - logger.error(f"Failed to get Discord guilds: {e!s}", exc_info=True) + logger.error(f"Failed to start Discord bot: {e!s}", exc_info=True) await discord_client.close_bot() - return 0, f"Failed to get Discord guilds: {e!s}" - - if not guilds: - await task_logger.log_task_success( - log_entry, - f"No Discord guilds found for connector {connector_id}", - {"guilds_found": 0}, - ) - logger.info("No Discord guilds found to index") - await discord_client.close_bot() - return 0, "No Discord guilds found" + return 0, f"Failed to start Discord bot: {e!s}" # Track results documents_indexed = 0 documents_skipped = 0 + documents_failed = 0 + duplicate_content_count = 0 skipped_channels: list[str] = [] # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() - # Process each guild and channel + # Use the configured guild info + guild_id = configured_guild_id + guild_name = configured_guild_name or "Unknown Guild" + await task_logger.log_task_progress( log_entry, - f"Starting to process {len(guilds)} Discord guilds", - {"stage": "process_guilds", "total_guilds": len(guilds)}, + f"Processing Discord guild: {guild_name}", + {"stage": "process_guild", "guild_id": guild_id, "guild_name": guild_name}, ) + # ======================================================================= + # PHASE 1: Collect all messages and create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= + messages_to_process = [] # List of dicts with document and message data + new_documents_created = False + try: - for guild in guilds: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) - >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() - guild_id = guild["id"] - guild_name = guild["name"] - logger.info(f"Processing guild: {guild_name} ({guild_id})") - - try: - channels = await discord_client.get_text_channels(guild_id) - if not channels: - logger.info( - f"No channels found in guild {guild_name}. Skipping." - ) - skipped_channels.append(f"{guild_name} (no channels)") - documents_skipped += 1 - continue + logger.info(f"Processing guild: {guild_name} ({guild_id})") + try: + channels = await discord_client.get_text_channels(guild_id) + if not channels: + logger.info(f"No channels found in guild {guild_name}. Skipping.") + skipped_channels.append(f"{guild_name} (no channels)") + else: for channel in channels: channel_id = channel["id"] channel_name = channel["name"] @@ -343,14 +372,12 @@ async def index_discord_messages( skipped_channels.append( f"{guild_name}#{channel_name} (fetch error)" ) - documents_skipped += 1 continue if not messages: logger.info( f"No messages found in channel {channel_name} for the specified date range." ) - documents_skipped += 1 continue # Filter/format messages @@ -365,7 +392,6 @@ async def index_discord_messages( logger.info( f"No valid messages found in channel {channel_name} after filtering." ) - documents_skipped += 1 continue # Process each message as an individual document (like Slack) @@ -427,32 +453,23 @@ async def index_discord_messages( if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: - logger.info( - f"Document for Discord message {msg_id} in {guild_name}#{channel_name} unchanged. Skipping." - ) + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): + existing_document.status = ( + DocumentStatus.ready() + ) documents_skipped += 1 continue - else: - # Content has changed - update the existing document - logger.info( - f"Content changed for Discord message {msg_id} in {guild_name}#{channel_name}. Updating document." - ) - # Update chunks and embedding - chunks = await create_document_chunks( - combined_document_string - ) - doc_embedding = ( - config.embedding_model_instance.embed( - combined_document_string - ) - ) - - # Update existing document - existing_document.content = combined_document_string - existing_document.content_hash = content_hash - existing_document.embedding = doc_embedding - existing_document.document_metadata = { + # Queue existing document for update (will be set to processing in Phase 2) + messages_to_process.append( + { + "document": existing_document, + "is_new": False, + "combined_document_string": combined_document_string, + "content_hash": content_hash, "guild_name": guild_name, "guild_id": guild_id, "channel_name": channel_name, @@ -460,22 +477,9 @@ async def index_discord_messages( "message_id": msg_id, "message_timestamp": msg_timestamp, "message_user_name": msg_user_name, - "indexed_at": datetime.now(UTC).strftime( - "%Y-%m-%d %H:%M:%S" - ), } - - # Delete old chunks and add new ones - existing_document.chunks = chunks - existing_document.updated_at = ( - get_current_timestamp() - ) - - documents_indexed += 1 - logger.info( - f"Successfully updated Discord message {msg_id}" - ) - continue + ) + continue # Document doesn't exist by unique_identifier_hash # Check if a document with the same content_hash exists (from another connector) @@ -492,22 +496,14 @@ async def index_discord_messages( f"(existing document ID: {duplicate_by_content.id}, " f"type: {duplicate_by_content.document_type}). Skipping." ) + duplicate_content_count += 1 documents_skipped += 1 continue - # Document doesn't exist - create new one - # Process chunks - chunks = await create_document_chunks( - combined_document_string - ) - doc_embedding = config.embedding_model_instance.embed( - combined_document_string - ) - - # Create and store new document + # Create new document with PENDING status (visible in UI immediately) document = Document( search_space_id=search_space_id, - title=f"Discord - {guild_name}#{channel_name}", + title=f"{guild_name}#{channel_name}", document_type=DocumentType.DISCORD_CONNECTOR, document_metadata={ "guild_name": guild_name, @@ -515,87 +511,177 @@ async def index_discord_messages( "channel_name": channel_name, "channel_id": channel_id, "message_id": msg_id, - "message_timestamp": msg_timestamp, - "message_user_name": msg_user_name, - "indexed_at": datetime.now(UTC).strftime( - "%Y-%m-%d %H:%M:%S" - ), + "connector_id": connector_id, }, - content=combined_document_string, - embedding=doc_embedding, - chunks=chunks, - content_hash=content_hash, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready unique_identifier_hash=unique_identifier_hash, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector_id, ) - session.add(document) - documents_indexed += 1 + new_documents_created = True - # Batch commit every 10 documents - if documents_indexed % 10 == 0: - logger.info( - f"Committing batch: {documents_indexed} Discord messages processed so far" - ) - await session.commit() + messages_to_process.append( + { + "document": document, + "is_new": True, + "combined_document_string": combined_document_string, + "content_hash": content_hash, + "guild_name": guild_name, + "guild_id": guild_id, + "channel_name": channel_name, + "channel_id": channel_id, + "message_id": msg_id, + "message_timestamp": msg_timestamp, + "message_user_name": msg_user_name, + } + ) - logger.info( - f"Successfully indexed channel {guild_name}#{channel_name} with {len(formatted_messages)} messages" - ) + except Exception as e: + logger.error( + f"Error processing guild {guild_name}: {e!s}", exc_info=True + ) + skipped_channels.append(f"{guild_name} (processing error)") - except Exception as e: - logger.error( - f"Error processing guild {guild_name}: {e!s}", exc_info=True - ) - skipped_channels.append(f"{guild_name} (processing error)") - documents_skipped += 1 - continue finally: await discord_client.close_bot() - # Update last_indexed_at only if we indexed at least one - if documents_indexed > 0: - await update_connector_last_indexed(session, connector, update_last_indexed) + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info( + f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents" + ) + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(messages_to_process)} documents") + + for item in messages_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item["document"] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (embeddings, chunks) + chunks = await create_document_chunks(item["combined_document_string"]) + doc_embedding = config.embedding_model_instance.embed( + item["combined_document_string"] + ) + + # Update document to READY with actual content + document.title = f"{item['guild_name']}#{item['channel_name']}" + document.content = item["combined_document_string"] + document.content_hash = item["content_hash"] + document.embedding = doc_embedding + document.document_metadata = { + "guild_name": item["guild_name"], + "guild_id": item["guild_id"], + "channel_name": item["channel_name"], + "channel_id": item["channel_id"], + "message_id": item["message_id"], + "message_timestamp": item["message_timestamp"], + "message_user_name": item["message_user_name"], + "indexed_at": datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, + } + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + + documents_indexed += 1 + + # Batch commit every 10 documents (for ready status updates) + if documents_indexed % 10 == 0: + logger.info( + f"Committing batch: {documents_indexed} Discord messages processed so far" + ) + await session.commit() + + except Exception as e: + logger.error(f"Error processing Discord message: {e!s}", exc_info=True) + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error( + f"Failed to update document status to failed: {status_error}" + ) + documents_failed += 1 + continue + + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + await update_connector_last_indexed(session, connector, update_last_indexed) # Final commit for any remaining documents not yet committed in batches logger.info( f"Final commit: Total {documents_indexed} Discord messages processed" ) - await session.commit() - - # Prepare result message - result_message = None - if skipped_channels: - result_message = ( - f"Processed {documents_indexed} messages. Skipped {len(skipped_channels)} channels: " - + ", ".join(skipped_channels) + try: + await session.commit() + logger.info( + "Successfully committed all Discord document changes to database" ) - else: - result_message = f"Processed {documents_indexed} messages." + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + else: + raise + + # Build warning message if there were issues + warning_parts = [] + if duplicate_content_count > 0: + warning_parts.append(f"{duplicate_content_count} duplicate") + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") + if skipped_channels: + warning_parts.append(f"{len(skipped_channels)} channels skipped") + warning_message = ", ".join(warning_parts) if warning_parts else None # Log success await task_logger.log_task_success( log_entry, f"Successfully completed Discord indexing for connector {connector_id}", { - "messages_processed": documents_indexed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, + "documents_failed": documents_failed, + "duplicate_content_count": duplicate_content_count, "skipped_channels_count": len(skipped_channels), - "guilds_processed": len(guilds), - "result_message": result_message, + "guild_id": guild_id, + "guild_name": guild_name, }, ) logger.info( - f"Discord indexing completed: {documents_indexed} new messages, {documents_skipped} skipped" + f"Discord indexing completed for guild {guild_name}: {documents_indexed} ready, {documents_skipped} skipped, " + f"{documents_failed} failed ({duplicate_content_count} duplicate content)" ) - return ( - documents_indexed, - None, - ) # Return None on success (result_message is for logging only) + return documents_indexed, warning_message except SQLAlchemyError as db_error: await session.rollback() diff --git a/surfsense_backend/app/tasks/connector_indexers/elasticsearch_indexer.py b/surfsense_backend/app/tasks/connector_indexers/elasticsearch_indexer.py index fb6487474..212afff39 100644 --- a/surfsense_backend/app/tasks/connector_indexers/elasticsearch_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/elasticsearch_indexer.py @@ -1,5 +1,9 @@ """ Elasticsearch indexer for SurfSense + +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Collect all documents and create pending documents (visible in UI immediately) +- Phase 2: Process each document: pending → processing → ready/failed """ import json @@ -13,7 +17,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select from app.connectors.elasticsearch_connector import ElasticsearchConnector -from app.db import Document, DocumentType, SearchSourceConnector +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnector from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, @@ -25,6 +29,7 @@ from .base import ( check_document_by_unique_identifier, check_duplicate_document_by_hash, get_current_timestamp, + safe_set_chunks, ) # Type hint for heartbeat callback @@ -164,6 +169,8 @@ async def index_elasticsearch_documents( ) documents_processed = 0 + documents_skipped = 0 + documents_failed = 0 # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() @@ -178,23 +185,22 @@ async def index_elasticsearch_documents( "max_documents": max_documents, }, ) - # Use scroll search for large result sets + + # ======================================================================= + # PHASE 1: Collect all documents from Elasticsearch and create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= + docs_to_process = [] # List of dicts with document and ES data + new_documents_created = False + hits_collected = 0 + async for hit in es_connector.scroll_search( index=index_name, query=query, size=min(max_documents, 100), # Scroll in batches fields=config.get("ELASTICSEARCH_FIELDS"), ): - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) - >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_processed) - last_heartbeat_time = time.time() - - if documents_processed >= max_documents: + if hits_collected >= max_documents: break try: @@ -220,26 +226,12 @@ async def index_elasticsearch_documents( if not content.strip(): logger.warning(f"Skipping document {doc_id} - no content found") + documents_skipped += 1 continue # Create content hash content_hash = generate_content_hash(content, search_space_id) - # Build metadata - metadata = { - "elasticsearch_id": doc_id, - "elasticsearch_index": hit.get("_index", index_name), - "elasticsearch_score": hit.get("_score"), - "indexed_at": datetime.now().isoformat(), - "source": "ELASTICSEARCH_CONNECTOR", - } - - # Add any additional metadata fields specified in config - if "ELASTICSEARCH_METADATA_FIELDS" in config: - for field in config["ELASTICSEARCH_METADATA_FIELDS"]: - if field in source: - metadata[f"es_{field}"] = source[field] - # Build source-unique identifier and hash (prefer source id dedupe) source_identifier = f"{hit.get('_index', index_name)}:{doc_id}" unique_identifier_hash = generate_unique_identifier_hash( @@ -258,98 +250,223 @@ async def index_elasticsearch_documents( ) if existing_doc: - # If content is unchanged, skip. Otherwise update the existing document. + # If content is unchanged, skip. Otherwise queue for update. if existing_doc.content_hash == content_hash: + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state( + existing_doc.status, DocumentStatus.READY + ): + existing_doc.status = DocumentStatus.ready() logger.info( f"Skipping ES doc {doc_id} — already indexed (doc id {existing_doc.id})" ) - continue - else: - logger.info( - f"Updating existing document {existing_doc.id} for ES doc {doc_id}" - ) - existing_doc.title = title - existing_doc.content = content - existing_doc.content_hash = content_hash - existing_doc.document_metadata = metadata - existing_doc.unique_identifier_hash = unique_identifier_hash - chunks = await create_document_chunks(content) - existing_doc.chunks = chunks - existing_doc.updated_at = get_current_timestamp() - await session.flush() - documents_processed += 1 - if documents_processed % 10 == 0: - await session.commit() + documents_skipped += 1 continue - # Create document + # Queue existing document for update (will be set to processing in Phase 2) + docs_to_process.append( + { + "document": existing_doc, + "is_new": False, + "doc_id": doc_id, + "title": title, + "content": content, + "content_hash": content_hash, + "unique_identifier_hash": unique_identifier_hash, + "hit": hit, + "source": source, + } + ) + hits_collected += 1 + continue + + # Build metadata for new document + metadata = { + "elasticsearch_id": doc_id, + "elasticsearch_index": hit.get("_index", index_name), + "elasticsearch_score": hit.get("_score"), + "source": "ELASTICSEARCH_CONNECTOR", + "connector_id": connector_id, + } + + # Add any additional metadata fields specified in config + if "ELASTICSEARCH_METADATA_FIELDS" in config: + for field in config["ELASTICSEARCH_METADATA_FIELDS"]: + if field in source: + metadata[f"es_{field}"] = source[field] + + # Create new document with PENDING status (visible in UI immediately) document = Document( title=title, - content=content, - content_hash=content_hash, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready unique_identifier_hash=unique_identifier_hash, document_type=DocumentType.ELASTICSEARCH_CONNECTOR, document_metadata=metadata, search_space_id=search_space_id, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector_id, ) - - # Create chunks and attach to document (persist via relationship) - chunks = await create_document_chunks(content) - document.chunks = chunks session.add(document) - await session.flush() + new_documents_created = True + + docs_to_process.append( + { + "document": document, + "is_new": True, + "doc_id": doc_id, + "title": title, + "content": content, + "content_hash": content_hash, + "unique_identifier_hash": unique_identifier_hash, + "hit": hit, + "source": source, + } + ) + hits_collected += 1 + + except Exception as e: + logger.error(f"Error in Phase 1 for ES doc: {e!s}", exc_info=True) + documents_failed += 1 + continue + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info( + f"Phase 1: Committing {len([d for d in docs_to_process if d['is_new']])} pending documents" + ) + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(docs_to_process)} documents") + + for item in docs_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_processed) + last_heartbeat_time = current_time + + document = item["document"] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Build metadata + metadata = { + "elasticsearch_id": item["doc_id"], + "elasticsearch_index": item["hit"].get("_index", index_name), + "elasticsearch_score": item["hit"].get("_score"), + "indexed_at": datetime.now().isoformat(), + "source": "ELASTICSEARCH_CONNECTOR", + "connector_id": connector_id, + } + + # Add any additional metadata fields specified in config + if "ELASTICSEARCH_METADATA_FIELDS" in config: + for field in config["ELASTICSEARCH_METADATA_FIELDS"]: + if field in item["source"]: + metadata[f"es_{field}"] = item["source"][field] + + # Create chunks + chunks = await create_document_chunks(item["content"]) + + # Update document to READY with actual content + document.title = item["title"] + document.content = item["content"] + document.content_hash = item["content_hash"] + document.unique_identifier_hash = item["unique_identifier_hash"] + document.document_metadata = metadata + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() documents_processed += 1 + # Batch commit every 10 documents (for ready status updates) if documents_processed % 10 == 0: logger.info( - f"Processed {documents_processed} Elasticsearch documents" + f"Committing batch: {documents_processed} Elasticsearch documents processed so far" ) await session.commit() except Exception as e: - msg = f"Error processing Elasticsearch document {hit.get('_id', 'unknown')}: {e}" + msg = f"Error processing Elasticsearch document {item.get('doc_id', 'unknown')}: {e}" logger.error(msg) - await task_logger.log_task_failure( - log_entry, - "Document processing error", - msg, - { - "document_id": hit.get("_id", "unknown"), - "error_type": type(e).__name__, - }, - ) + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error( + f"Failed to update document status to failed: {status_error}" + ) + documents_failed += 1 continue - # Final commit - await session.commit() + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + # This ensures the UI shows "Last indexed" instead of "Never indexed" + if update_last_indexed: + connector.last_indexed_at = ( + datetime.now(UTC).isoformat().replace("+00:00", "Z") + ) + + # Final commit for any remaining documents not yet committed in batches + logger.info( + f"Final commit: Total {documents_processed} Elasticsearch documents processed" + ) + try: + await session.commit() + logger.info( + "Successfully committed all Elasticsearch document changes to database" + ) + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same document was indexed by multiple connectors. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + # Don't fail the entire task - some documents may have been successfully indexed + else: + raise + + # Build warning message if there were issues + warning_parts = [] + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None await task_logger.log_task_success( log_entry, f"Successfully indexed {documents_processed} documents from Elasticsearch", - {"documents_indexed": documents_processed, "index": index_name}, + { + "documents_indexed": documents_processed, + "documents_skipped": documents_skipped, + "documents_failed": documents_failed, + "index": index_name, + }, ) logger.info( - f"Successfully indexed {documents_processed} documents from Elasticsearch" + f"Elasticsearch indexing completed: {documents_processed} ready, " + f"{documents_skipped} skipped, {documents_failed} failed" ) - # Update last indexed timestamp if requested - if update_last_indexed and documents_processed > 0: - # connector.last_indexed_at = datetime.now() - connector.last_indexed_at = ( - datetime.now(UTC).isoformat().replace("+00:00", "Z") - ) - await session.commit() - await task_logger.log_task_progress( - log_entry, - "Updated connector.last_indexed_at", - {"last_indexed_at": connector.last_indexed_at}, - ) - - return documents_processed, None + return documents_processed, warning_message finally: # Clean up Elasticsearch connection diff --git a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py index d82f18944..e1a1ddd4d 100644 --- a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py @@ -3,6 +3,10 @@ GitHub connector indexer using gitingest. This indexer processes entire repository digests in one pass, dramatically reducing LLM API calls compared to the previous file-by-file approach. + +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Create all documents with 'pending' status (visible in UI immediately) +- Phase 2: Process each document: pending → processing → ready/failed """ import time @@ -13,8 +17,8 @@ from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession from app.config import config -from app.connectors.github_connector import GitHubConnector, RepositoryDigest -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.connectors.github_connector import GitHubConnector +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -30,6 +34,8 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, + update_connector_last_indexed, ) # Type hint for heartbeat callback @@ -164,7 +170,7 @@ async def index_github_repos( ) return 0, f"Failed to initialize GitHub client: {e!s}" - # 4. Process each repository with gitingest + # 4. Process each repository with gitingest using 2-phase approach await task_logger.log_task_progress( log_entry, f"Starting gitingest processing for {len(repo_full_names_to_index)} repositories", @@ -181,24 +187,25 @@ async def index_github_repos( # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() documents_indexed = 0 + documents_skipped = 0 + documents_failed = 0 + + # ======================================================================= + # PHASE 1: Analyze all repos and create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= + repos_to_process = [] # List of dicts with document and digest data + new_documents_created = False for repo_full_name in repo_full_names_to_index: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() if not repo_full_name or not isinstance(repo_full_name, str): logger.warning(f"Skipping invalid repository entry: {repo_full_name}") continue - logger.info(f"Ingesting repository: {repo_full_name}") - try: + logger.info(f"Phase 1: Analyzing repository: {repo_full_name}") + # Run gitingest via subprocess (isolated from event loop) - # Using to_thread to not block the async database operations import asyncio digest = await asyncio.to_thread( @@ -212,30 +219,266 @@ async def index_github_repos( errors.append(f"No digest for {repo_full_name}") continue - # Process the digest and create documents - docs_created = await _process_repository_digest( - session=session, - digest=digest, - search_space_id=search_space_id, - user_id=user_id, - task_logger=task_logger, - log_entry=log_entry, - connector_id=connector_id, + # Generate unique identifier based on repo name + unique_identifier_hash = generate_unique_identifier_hash( + DocumentType.GITHUB_CONNECTOR, repo_full_name, search_space_id ) - documents_processed += docs_created - logger.info( - f"Created {docs_created} documents from repository: {repo_full_name}" + # Generate content hash from digest + full_content = digest.full_digest + content_hash = generate_content_hash(full_content, search_space_id) + + # Check if document with this unique identifier already exists + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + + if existing_document: + # Document exists - check if content has changed + if existing_document.content_hash == content_hash: + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): + existing_document.status = DocumentStatus.ready() + logger.info(f"Repository {repo_full_name} unchanged. Skipping.") + documents_skipped += 1 + continue + + # Queue existing document for update (will be set to processing in Phase 2) + logger.info( + f"Content changed for repository {repo_full_name}. Queuing for update." + ) + repos_to_process.append( + { + "document": existing_document, + "is_new": False, + "digest": digest, + "content_hash": content_hash, + "repo_full_name": repo_full_name, + "unique_identifier_hash": unique_identifier_hash, + } + ) + continue + + # Document doesn't exist by unique_identifier_hash + # Check if a document with the same content_hash exists (from another connector) + with session.no_autoflush: + duplicate_by_content = await check_duplicate_document_by_hash( + session, content_hash + ) + + if duplicate_by_content: + logger.info( + f"Repository {repo_full_name} already indexed by another connector " + f"(existing document ID: {duplicate_by_content.id}, " + f"type: {duplicate_by_content.document_type}). Skipping." + ) + documents_skipped += 1 + continue + + # Create new document with PENDING status (visible in UI immediately) + document = Document( + search_space_id=search_space_id, + title=repo_full_name, + document_type=DocumentType.GITHUB_CONNECTOR, + document_metadata={ + "repository_full_name": repo_full_name, + "url": f"https://github.com/{repo_full_name}", + "branch": digest.branch, + "ingestion_method": "gitingest", + "connector_id": connector_id, + }, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready + unique_identifier_hash=unique_identifier_hash, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts + updated_at=get_current_timestamp(), + created_by_id=user_id, + connector_id=connector_id, + ) + session.add(document) + new_documents_created = True + + repos_to_process.append( + { + "document": document, + "is_new": True, + "digest": digest, + "content_hash": content_hash, + "repo_full_name": repo_full_name, + "unique_identifier_hash": unique_identifier_hash, + } ) except Exception as repo_err: logger.error( - f"Failed to process repository {repo_full_name}: {repo_err}" + f"Error in Phase 1 for repository {repo_full_name}: {repo_err}", + exc_info=True, ) + errors.append(f"Phase 1 error for {repo_full_name}: {repo_err}") + documents_failed += 1 + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info( + f"Phase 1: Committing {len([r for r in repos_to_process if r['is_new']])} pending documents" + ) + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(repos_to_process)} documents") + + for item in repos_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item["document"] + digest = item["digest"] + repo_full_name = item["repo_full_name"] + + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (LLM, embeddings, chunks) + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + document_metadata_for_summary = { + "repository": repo_full_name, + "document_type": "GitHub Repository", + "connector_type": "GitHub", + "ingestion_method": "gitingest", + "file_tree": digest.tree[:2000] + if len(digest.tree) > 2000 + else digest.tree, + "estimated_tokens": digest.estimated_tokens, + } + + if user_llm: + # Prepare content for summarization + summary_content = digest.full_digest + if len(summary_content) > MAX_DIGEST_CHARS: + summary_content = ( + f"# Repository: {repo_full_name}\n\n" + f"## File Structure\n\n{digest.tree}\n\n" + f"## File Contents (truncated)\n\n{digest.content[: MAX_DIGEST_CHARS - len(digest.tree) - 200]}..." + ) + + summary_text, summary_embedding = await generate_document_summary( + summary_content, user_llm, document_metadata_for_summary + ) + else: + # Fallback to simple summary if no LLM configured + summary_text = ( + f"# GitHub Repository: {repo_full_name}\n\n" + f"## Summary\n{digest.summary}\n\n" + f"## File Structure\n{digest.tree[:3000]}" + ) + summary_embedding = config.embedding_model_instance.embed( + summary_text + ) + + # Chunk the full digest content for granular search + try: + chunks_data = await create_document_chunks(digest.content) + except Exception as chunk_err: + logger.error( + f"Failed to chunk repository {repo_full_name}: {chunk_err}" + ) + chunks_data = await _simple_chunk_content(digest.content) + + # Update document to READY with actual content + doc_metadata = { + "repository_full_name": repo_full_name, + "url": f"https://github.com/{repo_full_name}", + "branch": digest.branch, + "ingestion_method": "gitingest", + "file_tree": digest.tree, + "gitingest_summary": digest.summary, + "estimated_tokens": digest.estimated_tokens, + "connector_id": connector_id, + "indexed_at": datetime.now(UTC).isoformat(), + } + + document.title = repo_full_name + document.content = summary_text + document.content_hash = item["content_hash"] + document.embedding = summary_embedding + document.document_metadata = doc_metadata + safe_set_chunks(document, chunks_data) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + + documents_processed += 1 + documents_indexed += 1 + + logger.info( + f"Created document for repository {repo_full_name} " + f"with {len(chunks_data)} chunks" + ) + + # Batch commit every 5 documents (repositories are large) + if documents_indexed % 5 == 0: + logger.info( + f"Committing batch: {documents_indexed} GitHub repos processed so far" + ) + await session.commit() + + except Exception as repo_err: + logger.error( + f"Error processing repository {repo_full_name}: {repo_err}", + exc_info=True, + ) + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(repo_err)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error( + f"Failed to update document status to failed: {status_error}" + ) errors.append(f"Failed processing {repo_full_name}: {repo_err}") + documents_failed += 1 + continue + + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + await update_connector_last_indexed(session, connector, update_last_indexed) # Final commit - await session.commit() + logger.info( + f"Final commit: Total {documents_processed} GitHub repositories processed" + ) + try: + await session.commit() + logger.info( + "Successfully committed all GitHub document changes to database" + ) + except Exception as e: + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + else: + raise + logger.info( f"Finished GitHub indexing for connector {connector_id}. " f"Created {documents_processed} documents." @@ -247,6 +490,8 @@ async def index_github_repos( f"Successfully completed GitHub indexing for connector {connector_id}", { "documents_processed": documents_processed, + "documents_skipped": documents_skipped, + "documents_failed": documents_failed, "errors_count": len(errors), "repo_count": len(repo_full_names_to_index), "method": "gitingest", @@ -286,163 +531,6 @@ async def index_github_repos( return documents_processed, error_message -async def _process_repository_digest( - session: AsyncSession, - digest: RepositoryDigest, - search_space_id: int, - user_id: str, - task_logger: TaskLoggingService, - log_entry, - connector_id: int, -) -> int: - """ - Process a repository digest and create documents. - - For each repository, we create: - 1. One main document with the repository summary - 2. Chunks from the full digest content for granular search - - Args: - session: Database session - digest: The repository digest from gitingest - search_space_id: ID of the search space - user_id: ID of the user - task_logger: Task logging service - log_entry: Current log entry - - Returns: - Number of documents created - """ - repo_full_name = digest.repo_full_name - documents_created = 0 - - # Generate unique identifier based on repo name and content hash - # This allows updates when repo content changes - full_content = digest.full_digest - content_hash = generate_content_hash(full_content, search_space_id) - - # Use repo name as the unique identifier (one document per repo) - unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.GITHUB_CONNECTOR, repo_full_name, search_space_id - ) - - # Check if document with this unique identifier already exists - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - - if existing_document: - # Document exists - check if content has changed - if existing_document.content_hash == content_hash: - logger.info(f"Repository {repo_full_name} unchanged. Skipping.") - return 0 - else: - logger.info( - f"Content changed for repository {repo_full_name}. Updating document." - ) - # Delete existing document to replace with new one - await session.delete(existing_document) - await session.flush() - else: - # Document doesn't exist by unique_identifier_hash - # Check if a document with the same content_hash exists (from another connector) - with session.no_autoflush: - duplicate_by_content = await check_duplicate_document_by_hash( - session, content_hash - ) - - if duplicate_by_content: - logger.info( - f"Repository {repo_full_name} already indexed by another connector " - f"(existing document ID: {duplicate_by_content.id}, " - f"type: {duplicate_by_content.document_type}). Skipping." - ) - return 0 - - # Generate summary using LLM (ONE call per repository!) - user_llm = await get_user_long_context_llm(session, user_id, search_space_id) - - document_metadata = { - "repository": repo_full_name, - "document_type": "GitHub Repository", - "connector_type": "GitHub", - "ingestion_method": "gitingest", - "file_tree": digest.tree[:2000] if len(digest.tree) > 2000 else digest.tree, - "estimated_tokens": digest.estimated_tokens, - } - - if user_llm: - # Prepare content for summarization - # Include tree structure and truncated content if too large - summary_content = digest.full_digest - if len(summary_content) > MAX_DIGEST_CHARS: - # Truncate but keep the tree and beginning of content - summary_content = ( - f"# Repository: {repo_full_name}\n\n" - f"## File Structure\n\n{digest.tree}\n\n" - f"## File Contents (truncated)\n\n{digest.content[: MAX_DIGEST_CHARS - len(digest.tree) - 200]}..." - ) - - summary_text, summary_embedding = await generate_document_summary( - summary_content, user_llm, document_metadata - ) - else: - # Fallback to simple summary if no LLM configured - summary_text = ( - f"# GitHub Repository: {repo_full_name}\n\n" - f"## Summary\n{digest.summary}\n\n" - f"## File Structure\n{digest.tree[:3000]}" - ) - summary_embedding = config.embedding_model_instance.embed(summary_text) - - # Chunk the full digest content for granular search - try: - # Use the content (not the summary) for chunking - # This preserves file-level granularity in search - chunks_data = await create_document_chunks(digest.content) - except Exception as chunk_err: - logger.error(f"Failed to chunk repository {repo_full_name}: {chunk_err}") - # Fall back to a simpler chunking approach - chunks_data = await _simple_chunk_content(digest.content) - - # Create the document - doc_metadata = { - "repository_full_name": repo_full_name, - "url": f"https://github.com/{repo_full_name}", - "branch": digest.branch, - "ingestion_method": "gitingest", - "file_tree": digest.tree, - "gitingest_summary": digest.summary, - "estimated_tokens": digest.estimated_tokens, - "indexed_at": datetime.now(UTC).isoformat(), - } - - document = Document( - title=f"GitHub Repository: {repo_full_name}", - document_type=DocumentType.GITHUB_CONNECTOR, - document_metadata=doc_metadata, - content=summary_text, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - search_space_id=search_space_id, - chunks=chunks_data, - updated_at=get_current_timestamp(), - created_by_id=user_id, - connector_id=connector_id, - ) - - session.add(document) - documents_created += 1 - - logger.info( - f"Created document for repository {repo_full_name} " - f"with {len(chunks_data)} chunks" - ) - - return documents_created - - async def _simple_chunk_content(content: str, chunk_size: int = 4000) -> list: """ Simple fallback chunking when the regular chunker fails. diff --git a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py index 0b773025f..1476f3f40 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py @@ -1,5 +1,9 @@ """ Google Calendar connector indexer. + +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Create all documents with 'pending' status (visible in UI immediately) +- Phase 2: Process each document: pending → processing → ready/failed """ import time @@ -11,7 +15,7 @@ from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession from app.connectors.google_calendar_connector import GoogleCalendarConnector -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -28,6 +32,7 @@ from .base import ( get_current_timestamp, logger, parse_date_flexible, + safe_set_chunks, update_connector_last_indexed, ) @@ -305,7 +310,7 @@ async def index_google_calendar_events( documents_indexed = 0 documents_skipped = 0 - skipped_events = [] + documents_failed = 0 # Track events that failed processing duplicate_content_count = ( 0 # Track events skipped due to duplicate content_hash ) @@ -313,14 +318,14 @@ async def index_google_calendar_events( # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() + # ======================================================================= + # PHASE 1: Analyze all events, create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= + events_to_process = [] # List of dicts with document and event data + new_documents_created = False + for event in events: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() try: event_id = event.get("id") event_summary = event.get("summary", "No Title") @@ -328,14 +333,12 @@ async def index_google_calendar_events( if not event_id: logger.warning(f"Skipping event with missing ID: {event_summary}") - skipped_events.append(f"{event_summary} (missing ID)") documents_skipped += 1 continue event_markdown = calendar_client.format_event_to_markdown(event) if not event_markdown.strip(): logger.warning(f"Skipping event with no content: {event_summary}") - skipped_events.append(f"{event_summary} (no content)") documents_skipped += 1 continue @@ -362,82 +365,31 @@ async def index_google_calendar_events( if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: - logger.info( - f"Document for Google Calendar event {event_summary} unchanged. Skipping." - ) + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): + existing_document.status = DocumentStatus.ready() documents_skipped += 1 continue - else: - # Content has changed - update the existing document - logger.info( - f"Content changed for Google Calendar event {event_summary}. Updating document." - ) - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "event_id": event_id, - "event_summary": event_summary, - "calendar_id": calendar_id, - "start_time": start_time, - "end_time": end_time, - "location": location or "No location", - "document_type": "Google Calendar Event", - "connector_type": "Google Calendar", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - event_markdown, user_llm, document_metadata - ) - else: - summary_content = ( - f"Google Calendar Event: {event_summary}\n\n" - ) - summary_content += f"Calendar: {calendar_id}\n" - summary_content += f"Start: {start_time}\n" - summary_content += f"End: {end_time}\n" - if location: - summary_content += f"Location: {location}\n" - if description: - desc_preview = description[:1000] - if len(description) > 1000: - desc_preview += "..." - summary_content += f"Description: {desc_preview}\n" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - chunks = await create_document_chunks(event_markdown) - - # Update existing document - existing_document.title = f"Calendar Event - {event_summary}" - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { + # Queue existing document for update (will be set to processing in Phase 2) + events_to_process.append( + { + "document": existing_document, + "is_new": False, + "event_markdown": event_markdown, + "content_hash": content_hash, "event_id": event_id, "event_summary": event_summary, "calendar_id": calendar_id, "start_time": start_time, "end_time": end_time, "location": location, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "description": description, } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - logger.info( - f"Successfully updated Google Calendar event {event_summary}" - ) - continue + ) + continue # Document doesn't exist by unique_identifier_hash # Check if a document with the same content_hash exists (from another connector) @@ -455,55 +407,12 @@ async def index_google_calendar_events( ) duplicate_content_count += 1 documents_skipped += 1 - skipped_events.append( - f"{event_summary} (already indexed by another connector)" - ) continue - # Document doesn't exist - create new one - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "event_id": event_id, - "event_summary": event_summary, - "calendar_id": calendar_id, - "start_time": start_time, - "end_time": end_time, - "location": location or "No location", - "document_type": "Google Calendar Event", - "connector_type": "Google Calendar", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - event_markdown, user_llm, document_metadata - ) - else: - # Fallback to simple summary if no LLM configured - summary_content = f"Google Calendar Event: {event_summary}\n\n" - summary_content += f"Calendar: {calendar_id}\n" - summary_content += f"Start: {start_time}\n" - summary_content += f"End: {end_time}\n" - if location: - summary_content += f"Location: {location}\n" - if description: - desc_preview = description[:1000] - if len(description) > 1000: - desc_preview += "..." - summary_content += f"Description: {desc_preview}\n" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - chunks = await create_document_chunks(event_markdown) - + # Create new document with PENDING status (visible in UI immediately) document = Document( search_space_id=search_space_id, - title=f"Calendar Event - {event_summary}", + title=event_summary, document_type=DocumentType.GOOGLE_CALENDAR_CONNECTOR, document_metadata={ "event_id": event_id, @@ -512,23 +421,133 @@ async def index_google_calendar_events( "start_time": start_time, "end_time": end_time, "location": location, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, }, - content=summary_content, - content_hash=content_hash, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector_id, ) - session.add(document) - documents_indexed += 1 - logger.info(f"Successfully indexed new event {event_summary}") + new_documents_created = True - # Batch commit every 10 documents + events_to_process.append( + { + "document": document, + "is_new": True, + "event_markdown": event_markdown, + "content_hash": content_hash, + "event_id": event_id, + "event_summary": event_summary, + "calendar_id": calendar_id, + "start_time": start_time, + "end_time": end_time, + "location": location, + "description": description, + } + ) + + except Exception as e: + logger.error(f"Error in Phase 1 for event: {e!s}", exc_info=True) + documents_failed += 1 + continue + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info( + f"Phase 1: Committing {len([e for e in events_to_process if e['is_new']])} pending documents" + ) + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(events_to_process)} documents") + + for item in events_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item["document"] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (LLM, embeddings, chunks) + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata_for_summary = { + "event_id": item["event_id"], + "event_summary": item["event_summary"], + "calendar_id": item["calendar_id"], + "start_time": item["start_time"], + "end_time": item["end_time"], + "location": item["location"] or "No location", + "document_type": "Google Calendar Event", + "connector_type": "Google Calendar", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + item["event_markdown"], user_llm, document_metadata_for_summary + ) + else: + summary_content = ( + f"Google Calendar Event: {item['event_summary']}\n\n" + ) + summary_content += f"Calendar: {item['calendar_id']}\n" + summary_content += f"Start: {item['start_time']}\n" + summary_content += f"End: {item['end_time']}\n" + if item["location"]: + summary_content += f"Location: {item['location']}\n" + if item["description"]: + desc_preview = item["description"][:1000] + if len(item["description"]) > 1000: + desc_preview += "..." + summary_content += f"Description: {desc_preview}\n" + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(item["event_markdown"]) + + # Update document to READY with actual content + document.title = item["event_summary"] + document.content = summary_content + document.content_hash = item["content_hash"] + document.embedding = summary_embedding + document.document_metadata = { + "event_id": item["event_id"], + "event_summary": item["event_summary"], + "calendar_id": item["calendar_id"], + "start_time": item["start_time"], + "end_time": item["end_time"], + "location": item["location"], + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, + } + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + + documents_indexed += 1 + + # Batch commit every 10 documents (for ready status updates) if documents_indexed % 10 == 0: logger.info( f"Committing batch: {documents_indexed} Google Calendar events processed so far" @@ -536,19 +555,20 @@ async def index_google_calendar_events( await session.commit() except Exception as e: - logger.error( - f"Error processing event {event.get('summary', 'Unknown')}: {e!s}", - exc_info=True, - ) - skipped_events.append( - f"{event.get('summary', 'Unknown')} (processing error)" - ) - documents_skipped += 1 + logger.error(f"Error processing Calendar event: {e!s}", exc_info=True) + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error( + f"Failed to update document status to failed: {status_error}" + ) + documents_failed += 1 continue - total_processed = documents_indexed - if total_processed > 0: - await update_connector_last_indexed(session, connector, update_last_indexed) + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + await update_connector_last_indexed(session, connector, update_last_indexed) # Final commit for any remaining documents not yet committed in batches logger.info( @@ -556,6 +576,9 @@ async def index_google_calendar_events( ) try: await session.commit() + logger.info( + "Successfully committed all Google Calendar document changes to database" + ) except Exception as e: # Handle any remaining integrity errors gracefully (race conditions, etc.) if ( @@ -572,10 +595,15 @@ async def index_google_calendar_events( else: raise - # Build warning message if duplicates were found - warning_message = None + # Build warning message if there were issues + warning_parts = [] if duplicate_content_count > 0: - warning_message = f"{duplicate_content_count} skipped (duplicate)" + warning_parts.append(f"{duplicate_content_count} duplicate") + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None + + total_processed = documents_indexed await task_logger.log_task_success( log_entry, @@ -584,14 +612,15 @@ async def index_google_calendar_events( "events_processed": total_processed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, + "documents_failed": documents_failed, "duplicate_content_count": duplicate_content_count, - "skipped_events_count": len(skipped_events), }, ) logger.info( - f"Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped " - f"({duplicate_content_count} due to duplicate content from other connectors)" + f"Google Calendar indexing completed: {documents_indexed} ready, " + f"{documents_skipped} skipped, {documents_failed} failed " + f"({duplicate_content_count} duplicate content)" ) return total_processed, warning_message diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py index 151c1abbc..f7624cffe 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py @@ -1,4 +1,9 @@ -"""Google Drive indexer using Surfsense file processors.""" +"""Google Drive indexer using Surfsense file processors. + +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Create all documents with 'pending' status (visible in UI immediately) +- Phase 2: Process each document: pending → processing → ready/failed +""" import logging import time @@ -17,11 +22,12 @@ from app.connectors.google_drive import ( get_files_in_folder, get_start_page_token, ) -from app.db import DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.task_logging_service import TaskLoggingService from app.tasks.connector_indexers.base import ( check_document_by_unique_identifier, get_connector_by_id, + get_current_timestamp, update_connector_last_indexed, ) from app.utils.document_converters import generate_unique_identifier_hash @@ -324,8 +330,29 @@ async def index_google_drive_single_file( display_name = file_name or file.get("name", "Unknown") logger.info(f"Indexing Google Drive file: {display_name} ({file_id})") + # Create pending document for status visibility + pending_doc, should_skip = await _create_pending_document_for_file( + session=session, + file=file, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + ) + + if should_skip: + await task_logger.log_task_progress( + log_entry, + f"File {display_name} is unchanged or not indexable", + {"status": "skipped"}, + ) + return 0, None + + # Commit pending document so it appears in UI + if pending_doc and pending_doc.id is None: + await session.commit() + # Process the file - indexed, skipped = await _process_single_file( + indexed, skipped, failed = await _process_single_file( drive_client=drive_client, session=session, file=file, @@ -334,6 +361,7 @@ async def index_google_drive_single_file( user_id=user_id, task_logger=task_logger, log_entry=log_entry, + pending_document=pending_doc, ) await session.commit() @@ -341,6 +369,15 @@ async def index_google_drive_single_file( "Successfully committed Google Drive file indexing changes to database" ) + if failed > 0: + error_msg = f"Failed to index file {display_name}" + await task_logger.log_task_failure( + log_entry, + error_msg, + {"file_name": display_name, "file_id": file_id}, + ) + return 0, error_msg + if indexed > 0: await task_logger.log_task_success( log_entry, @@ -397,7 +434,12 @@ async def _index_full_scan( include_subfolders: bool = False, on_heartbeat_callback: HeartbeatCallbackType | None = None, ) -> tuple[int, int]: - """Perform full scan indexing of a folder.""" + """Perform full scan indexing of a folder. + + Implements 2-phase document status updates for real-time UI feedback: + - Phase 1: Collect all files and create pending documents (visible in UI immediately) + - Phase 2: Process each file: pending → processing → ready/failed + """ await task_logger.log_task_progress( log_entry, f"Starting full scan of folder: {folder_name} (include_subfolders={include_subfolders})", @@ -410,29 +452,31 @@ async def _index_full_scan( documents_indexed = 0 documents_skipped = 0 + documents_failed = 0 files_processed = 0 # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() + # ======================================================================= + # PHASE 1: Collect all files and create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= + files_to_process = [] # List of (file, pending_document or None) + new_documents_created = False + # Queue of folders to process: (folder_id, folder_name) folders_to_process = [(folder_id, folder_name)] + logger.info("Phase 1: Collecting files and creating pending documents") + while folders_to_process and files_processed < max_files: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() current_folder_id, current_folder_name = folders_to_process.pop(0) - logger.info(f"Processing folder: {current_folder_name} ({current_folder_id})") + logger.info(f"Scanning folder: {current_folder_name} ({current_folder_id})") page_token = None while files_processed < max_files: # Get files and folders in current folder - # include_subfolders=True here so we get folder items to queue them files, next_token, error = await get_files_in_folder( drive_client, current_folder_id, @@ -462,35 +506,74 @@ async def _index_full_scan( logger.debug(f"Queued subfolder: {file.get('name', 'Unknown')}") continue - # Process the file files_processed += 1 - indexed, skipped = await _process_single_file( - drive_client=drive_client, + # Create pending document for this file + pending_doc, should_skip = await _create_pending_document_for_file( session=session, file=file, connector_id=connector_id, search_space_id=search_space_id, user_id=user_id, - task_logger=task_logger, - log_entry=log_entry, ) - documents_indexed += indexed - documents_skipped += skipped + if should_skip: + documents_skipped += 1 + continue - if documents_indexed % 10 == 0 and documents_indexed > 0: - await session.commit() - logger.info( - f"Committed batch: {documents_indexed} files indexed so far" - ) + if pending_doc and pending_doc.id is None: + # New document was created + new_documents_created = True + + files_to_process.append((file, pending_doc)) page_token = next_token if not page_token: break + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info( + f"Phase 1: Committing {len([f for f in files_to_process if f[1] and f[1].id is None])} pending documents" + ) + await session.commit() + + # ======================================================================= + # PHASE 2: Process each file one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(files_to_process)} files") + + for file, pending_doc in files_to_process: + # Check if it's time for a heartbeat update + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + indexed, skipped, failed = await _process_single_file( + drive_client=drive_client, + session=session, + file=file, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + task_logger=task_logger, + log_entry=log_entry, + pending_document=pending_doc, + ) + + documents_indexed += indexed + documents_skipped += skipped + documents_failed += failed + + if documents_indexed % 10 == 0 and documents_indexed > 0: + await session.commit() + logger.info(f"Committed batch: {documents_indexed} files indexed so far") + logger.info( - f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped" + f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped, {documents_failed} failed" ) return documents_indexed, documents_skipped @@ -514,6 +597,10 @@ async def _index_with_delta_sync( Note: include_subfolders is accepted for API consistency but delta sync automatically tracks changes across all folders including subfolders. + + Implements 2-phase document status updates for real-time UI feedback: + - Phase 1: Collect all changes and create pending documents (visible in UI immediately) + - Phase 2: Process each file: pending → processing → ready/failed """ await task_logger.log_task_progress( log_entry, @@ -537,19 +624,21 @@ async def _index_with_delta_sync( documents_indexed = 0 documents_skipped = 0 + documents_failed = 0 files_processed = 0 # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() + # ======================================================================= + # PHASE 1: Analyze changes and create pending documents for new/modified files + # ======================================================================= + changes_to_process = [] # List of (change, file, pending_document or None) + new_documents_created = False + + logger.info("Phase 1: Analyzing changes and creating pending documents") + for change in changes: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() if files_processed >= max_files: break @@ -566,7 +655,45 @@ async def _index_with_delta_sync( if not file: continue - indexed, skipped = await _process_single_file( + # Create pending document for this file + pending_doc, should_skip = await _create_pending_document_for_file( + session=session, + file=file, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + ) + + if should_skip: + documents_skipped += 1 + continue + + if pending_doc and pending_doc.id is None: + # New document was created + new_documents_created = True + + changes_to_process.append((change, file, pending_doc)) + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info("Phase 1: Committing pending documents") + await session.commit() + + # ======================================================================= + # PHASE 2: Process each file one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(changes_to_process)} changes") + + for _, file, pending_doc in changes_to_process: + # Check if it's time for a heartbeat update + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + indexed, skipped, failed = await _process_single_file( drive_client=drive_client, session=session, file=file, @@ -575,21 +702,125 @@ async def _index_with_delta_sync( user_id=user_id, task_logger=task_logger, log_entry=log_entry, + pending_document=pending_doc, ) documents_indexed += indexed documents_skipped += skipped + documents_failed += failed if documents_indexed % 10 == 0 and documents_indexed > 0: await session.commit() logger.info(f"Committed batch: {documents_indexed} changes processed") logger.info( - f"Delta sync complete: {documents_indexed} indexed, {documents_skipped} skipped" + f"Delta sync complete: {documents_indexed} indexed, {documents_skipped} skipped, {documents_failed} failed" ) return documents_indexed, documents_skipped +async def _create_pending_document_for_file( + session: AsyncSession, + file: dict, + connector_id: int, + search_space_id: int, + user_id: str, +) -> tuple[Document | None, bool]: + """ + Create a pending document for a Google Drive file if it doesn't exist. + + This is Phase 1 of the 2-phase document status update pattern. + Creates documents with 'pending' status so they appear in UI immediately. + + Args: + session: Database session + file: File metadata from Google Drive API + connector_id: ID of the Drive connector + search_space_id: ID of the search space + user_id: ID of the user + + Returns: + Tuple of (document, should_skip): + - (existing_doc, False): Existing document that needs update + - (new_pending_doc, False): New pending document created + - (None, True): File should be skipped (unchanged, rename-only, or folder) + """ + from app.connectors.google_drive.file_types import should_skip_file + + file_id = file.get("id") + file_name = file.get("name", "Unknown") + mime_type = file.get("mimeType", "") + + # Skip folders and shortcuts + if should_skip_file(mime_type): + return None, True + + if not file_id: + return None, True + + # Generate unique identifier hash for this file + unique_identifier_hash = generate_unique_identifier_hash( + DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id + ) + + # Check if document exists + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + + if existing_document: + # Check if this is a rename-only update (content unchanged) + incoming_md5 = file.get("md5Checksum") + incoming_modified_time = file.get("modifiedTime") + doc_metadata = existing_document.document_metadata or {} + stored_md5 = doc_metadata.get("md5_checksum") + stored_modified_time = doc_metadata.get("modified_time") + + # Determine if content changed + content_unchanged = False + if incoming_md5 and stored_md5: + content_unchanged = incoming_md5 == stored_md5 + elif not incoming_md5 and incoming_modified_time and stored_modified_time: + # Google Workspace file - use modifiedTime as fallback + content_unchanged = incoming_modified_time == stored_modified_time + + if content_unchanged: + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): + existing_document.status = DocumentStatus.ready() + return None, True + + # Content changed - return existing document for update + return existing_document, False + + # Create new pending document + document = Document( + search_space_id=search_space_id, + title=file_name, + document_type=DocumentType.GOOGLE_DRIVE_FILE, + document_metadata={ + "google_drive_file_id": file_id, + "google_drive_file_name": file_name, + "google_drive_mime_type": mime_type, + "connector_id": connector_id, + }, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready + unique_identifier_hash=unique_identifier_hash, + embedding=None, + chunks=[], # Empty at creation + status=DocumentStatus.pending(), # Pending until processing starts + updated_at=get_current_timestamp(), + created_by_id=user_id, + connector_id=connector_id, + ) + session.add(document) + + return document, False + + async def _check_rename_only_update( session: AsyncSession, file: dict, @@ -725,15 +956,31 @@ async def _process_single_file( user_id: str, task_logger: TaskLoggingService, log_entry: any, -) -> tuple[int, int]: + pending_document: Document | None = None, +) -> tuple[int, int, int]: """ Process a single file by downloading and using Surfsense's file processor. + Implements Phase 2 of the 2-phase document status update pattern. + Updates document status: pending → processing → ready/failed + + Args: + drive_client: Google Drive client + session: Database session + file: File metadata from Google Drive API + connector_id: ID of the connector + search_space_id: ID of the search space + user_id: ID of the user + task_logger: Task logging service + log_entry: Log entry for tracking + pending_document: Optional pending document created in Phase 1 + Returns: - Tuple of (indexed_count, skipped_count) + Tuple of (indexed_count, skipped_count, failed_count) """ file_name = file.get("name", "Unknown") mime_type = file.get("mimeType", "") + file_id = file.get("id") try: logger.info(f"Processing file: {file_name} ({mime_type})") @@ -756,10 +1003,15 @@ async def _process_single_file( # Return 1 for renamed files (they are "indexed" in the sense that they're updated) # Return 0 for unchanged files if "renamed" in (rename_message or "").lower(): - return 1, 0 - return 0, 1 + return 1, 0, 0 + return 0, 1, 0 - _, error, _ = await download_and_process_file( + # Set document to PROCESSING status if we have a pending document + if pending_document: + pending_document.status = DocumentStatus.processing() + await session.commit() + + _, error, metadata = await download_and_process_file( client=drive_client, file=file, search_space_id=search_space_id, @@ -776,14 +1028,46 @@ async def _process_single_file( f"Skipped {file_name}: {error}", {"status": "skipped", "reason": error}, ) - return 0, 1 + # Mark pending document as failed if it exists + if pending_document: + pending_document.status = DocumentStatus.failed(error) + pending_document.updated_at = get_current_timestamp() + await session.commit() + return 0, 1, 0 + + # The document was created/updated by download_and_process_file + # Find the document and ensure it has READY status + if file_id: + unique_identifier_hash = generate_unique_identifier_hash( + DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id + ) + processed_doc = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + # Ensure status is READY + if processed_doc and not DocumentStatus.is_state( + processed_doc.status, DocumentStatus.READY + ): + processed_doc.status = DocumentStatus.ready() + processed_doc.updated_at = get_current_timestamp() + await session.commit() logger.info(f"Successfully indexed Google Drive file: {file_name}") - return 1, 0 + return 1, 0, 0 except Exception as e: logger.error(f"Error processing file {file_name}: {e!s}", exc_info=True) - return 0, 1 + # Mark pending document as failed if it exists + if pending_document: + try: + pending_document.status = DocumentStatus.failed(str(e)) + pending_document.updated_at = get_current_timestamp() + await session.commit() + except Exception as status_error: + logger.error( + f"Failed to update document status to failed: {status_error}" + ) + return 0, 0, 1 async def _remove_document(session: AsyncSession, file_id: str, search_space_id: int): diff --git a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py index 34d06d796..c7caee4da 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py @@ -1,5 +1,9 @@ """ Google Gmail connector indexer. + +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Create all documents with 'pending' status (visible in UI immediately) +- Phase 2: Process each document: pending → processing → ready/failed """ import time @@ -13,6 +17,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.connectors.google_gmail_connector import GoogleGmailConnector from app.db import ( Document, + DocumentStatus, DocumentType, SearchSourceConnectorType, ) @@ -32,6 +37,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -220,20 +226,23 @@ async def index_google_gmail_messages( logger.info(f"Found {len(messages)} Google gmail messages to index") documents_indexed = 0 - skipped_messages = [] documents_skipped = 0 + documents_failed = 0 # Track messages that failed processing + duplicate_content_count = ( + 0 # Track messages skipped due to duplicate content_hash + ) # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() + # ======================================================================= + # PHASE 1: Analyze all messages, create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= + messages_to_process = [] # List of dicts with document and message data + new_documents_created = False + for message in messages: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() try: # Extract message information message_id = message.get("id", "") @@ -259,7 +268,6 @@ async def index_google_gmail_messages( if not message_id: logger.warning(f"Skipping message with missing ID: {subject}") - skipped_messages.append(f"{subject} (missing ID)") documents_skipped += 1 continue @@ -268,7 +276,6 @@ async def index_google_gmail_messages( if not markdown_content.strip(): logger.warning(f"Skipping message with no content: {subject}") - skipped_messages.append(f"{subject} (no content)") documents_skipped += 1 continue @@ -288,68 +295,29 @@ async def index_google_gmail_messages( if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: - logger.info( - f"Document for Gmail message {subject} unchanged. Skipping." - ) + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): + existing_document.status = DocumentStatus.ready() documents_skipped += 1 continue - else: - # Content has changed - update the existing document - logger.info( - f"Content changed for Gmail message {subject}. Updating document." - ) - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "message_id": message_id, - "thread_id": thread_id, - "subject": subject, - "sender": sender, - "date": date_str, - "document_type": "Gmail Message", - "connector_type": "Google Gmail", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - summary_content = f"Google Gmail Message: {subject}\n\n" - summary_content += f"Sender: {sender}\n" - summary_content += f"Date: {date_str}\n" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - chunks = await create_document_chunks(markdown_content) - - # Update existing document - existing_document.title = f"Gmail: {subject}" - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { + # Queue existing document for update (will be set to processing in Phase 2) + messages_to_process.append( + { + "document": existing_document, + "is_new": False, + "markdown_content": markdown_content, + "content_hash": content_hash, "message_id": message_id, "thread_id": thread_id, "subject": subject, "sender": sender, - "date": date_str, - "connector_id": connector_id, + "date_str": date_str, } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - logger.info(f"Successfully updated Gmail message {subject}") - continue + ) + continue # Document doesn't exist by unique_identifier_hash # Check if a document with the same content_hash exists (from another connector) @@ -364,48 +332,14 @@ async def index_google_gmail_messages( f"(existing document ID: {duplicate_by_content.id}, " f"type: {duplicate_by_content.document_type}). Skipping." ) + duplicate_content_count += 1 documents_skipped += 1 continue - # Document doesn't exist - create new one - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "message_id": message_id, - "thread_id": thread_id, - "subject": subject, - "sender": sender, - "date": date_str, - "document_type": "Gmail Message", - "connector_type": "Google Gmail", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - else: - # Fallback to simple summary if no LLM configured - summary_content = f"Google Gmail Message: {subject}\n\n" - summary_content += f"Sender: {sender}\n" - summary_content += f"Date: {date_str}\n" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - chunks = await create_document_chunks(markdown_content) - - # Create and store new document - logger.info(f"Creating new document for Gmail message: {subject}") + # Create new document with PENDING status (visible in UI immediately) document = Document( search_space_id=search_space_id, - title=f"Gmail: {subject}", + title=subject, document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR, document_metadata={ "message_id": message_id, @@ -413,21 +347,120 @@ async def index_google_gmail_messages( "subject": subject, "sender": sender, "date": date_str, + "connector_id": connector_id, }, - content=summary_content, - content_hash=content_hash, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector_id, ) session.add(document) - documents_indexed += 1 - logger.info(f"Successfully indexed new email {summary_content}") + new_documents_created = True - # Batch commit every 10 documents + messages_to_process.append( + { + "document": document, + "is_new": True, + "markdown_content": markdown_content, + "content_hash": content_hash, + "message_id": message_id, + "thread_id": thread_id, + "subject": subject, + "sender": sender, + "date_str": date_str, + } + ) + + except Exception as e: + logger.error(f"Error in Phase 1 for message: {e!s}", exc_info=True) + documents_failed += 1 + continue + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info( + f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents" + ) + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(messages_to_process)} documents") + + for item in messages_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item["document"] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (LLM, embeddings, chunks) + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata_for_summary = { + "message_id": item["message_id"], + "thread_id": item["thread_id"], + "subject": item["subject"], + "sender": item["sender"], + "date": item["date_str"], + "document_type": "Gmail Message", + "connector_type": "Google Gmail", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + item["markdown_content"], + user_llm, + document_metadata_for_summary, + ) + else: + summary_content = f"Google Gmail Message: {item['subject']}\n\n" + summary_content += f"Sender: {item['sender']}\n" + summary_content += f"Date: {item['date_str']}\n" + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(item["markdown_content"]) + + # Update document to READY with actual content + document.title = item["subject"] + document.content = summary_content + document.content_hash = item["content_hash"] + document.embedding = summary_embedding + document.document_metadata = { + "message_id": item["message_id"], + "thread_id": item["thread_id"], + "subject": item["subject"], + "sender": item["sender"], + "date": item["date_str"], + "connector_id": connector_id, + } + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + + documents_indexed += 1 + + # Batch commit every 10 documents (for ready status updates) if documents_indexed % 10 == 0: logger.info( f"Committing batch: {documents_indexed} Gmail messages processed so far" @@ -435,45 +468,76 @@ async def index_google_gmail_messages( await session.commit() except Exception as e: - logger.error( - f"Error processing the email {message_id}: {e!s}", - exc_info=True, - ) - skipped_messages.append(f"{subject} (processing error)") - documents_skipped += 1 - continue # Skip this message and continue with others + logger.error(f"Error processing Gmail message: {e!s}", exc_info=True) + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error( + f"Failed to update document status to failed: {status_error}" + ) + documents_failed += 1 + continue - # Update the last_indexed_at timestamp for the connector only if requested - total_processed = documents_indexed - if total_processed > 0: - await update_connector_last_indexed(session, connector, update_last_indexed) + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + await update_connector_last_indexed(session, connector, update_last_indexed) # Final commit for any remaining documents not yet committed in batches logger.info(f"Final commit: Total {documents_indexed} Gmail messages processed") - await session.commit() - logger.info( - "Successfully committed all Google gmail document changes to database" - ) + try: + await session.commit() + logger.info( + "Successfully committed all Google Gmail document changes to database" + ) + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same message was indexed by multiple connectors. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + # Don't fail the entire task - some documents may have been successfully indexed + else: + raise + + # Build warning message if there were issues + warning_parts = [] + if duplicate_content_count > 0: + warning_parts.append(f"{duplicate_content_count} duplicate") + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None + + total_processed = documents_indexed # Log success await task_logger.log_task_success( log_entry, - f"Successfully completed Google gmail indexing for connector {connector_id}", + f"Successfully completed Google Gmail indexing for connector {connector_id}", { "events_processed": total_processed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, - "skipped_messages_count": len(skipped_messages), + "documents_failed": documents_failed, + "duplicate_content_count": duplicate_content_count, }, ) logger.info( - f"Google gmail indexing completed: {documents_indexed} new emails, {documents_skipped} skipped" + f"Google Gmail indexing completed: {documents_indexed} ready, " + f"{documents_skipped} skipped, {documents_failed} failed " + f"({duplicate_content_count} duplicate content)" ) return ( total_processed, - None, - ) # Return None as the error message to indicate success + warning_message, + ) # Return warning_message (None on success) except SQLAlchemyError as db_error: await session.rollback() diff --git a/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py b/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py index 6971703c1..65f56ce46 100644 --- a/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py @@ -1,5 +1,9 @@ """ Jira connector indexer. + +Provides real-time document status updates during indexing using a two-phase approach: +- Phase 1: Create all documents with PENDING status (visible in UI immediately) +- Phase 2: Process each document one by one (PENDING → PROCESSING → READY/FAILED) """ import contextlib @@ -12,7 +16,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.config import config from app.connectors.jira_history import JiraHistoryConnector -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -29,6 +33,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -174,22 +179,22 @@ async def index_jira_issues( logger.error(f"Error fetching Jira issues: {e!s}", exc_info=True) return 0, f"Error fetching Jira issues: {e!s}" - # Process and index each issue + # ======================================================================= + # PHASE 1: Analyze all issues, create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= documents_indexed = 0 - skipped_issues = [] documents_skipped = 0 + documents_failed = 0 + duplicate_content_count = 0 # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() + issues_to_process = [] # List of dicts with document and issue data + new_documents_created = False + for issue in issues: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() try: issue_id = issue.get("key") issue_identifier = issue.get("key", "") @@ -199,9 +204,6 @@ async def index_jira_issues( logger.warning( f"Skipping issue with missing ID or title: {issue_id or 'Unknown'}" ) - skipped_issues.append( - f"{issue_identifier or 'Unknown'} (missing data)" - ) documents_skipped += 1 continue @@ -215,7 +217,6 @@ async def index_jira_issues( logger.warning( f"Skipping issue with no content: {issue_identifier} - {issue_title}" ) - skipped_issues.append(f"{issue_identifier} (no content)") documents_skipped += 1 continue @@ -237,73 +238,29 @@ async def index_jira_issues( if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: - logger.info( - f"Document for Jira issue {issue_identifier} unchanged. Skipping." - ) + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): + existing_document.status = DocumentStatus.ready() documents_skipped += 1 continue - else: - # Content has changed - update the existing document - logger.info( - f"Content changed for Jira issue {issue_identifier}. Updating document." - ) - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "issue_key": issue_identifier, - "issue_title": issue_title, - "status": formatted_issue.get("status", "Unknown"), - "priority": formatted_issue.get("priority", "Unknown"), - "comment_count": comment_count, - "document_type": "Jira Issue", - "connector_type": "Jira", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - issue_content, user_llm, document_metadata - ) - else: - summary_content = f"Jira Issue {issue_identifier}: {issue_title}\n\nStatus: {formatted_issue.get('status', 'Unknown')}\n\n" - if formatted_issue.get("description"): - summary_content += f"Description: {formatted_issue.get('description')}\n\n" - summary_content += f"Comments: {comment_count}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - chunks = await create_document_chunks(issue_content) - - # Update existing document - existing_document.title = ( - f"Jira - {issue_identifier}: {issue_title}" - ) - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { + # Queue existing document for update (will be set to processing in Phase 2) + issues_to_process.append( + { + "document": existing_document, + "is_new": False, + "issue_content": issue_content, + "content_hash": content_hash, "issue_id": issue_id, "issue_identifier": issue_identifier, "issue_title": issue_title, - "state": formatted_issue.get("status", "Unknown"), + "formatted_issue": formatted_issue, "comment_count": comment_count, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - logger.info( - f"Successfully updated Jira issue {issue_identifier}" - ) - continue + ) + continue # Document doesn't exist by unique_identifier_hash # Check if a document with the same content_hash exists (from another connector) @@ -318,53 +275,14 @@ async def index_jira_issues( f"(existing document ID: {duplicate_by_content.id}, " f"type: {duplicate_by_content.document_type}). Skipping." ) + duplicate_content_count += 1 documents_skipped += 1 continue - # Document doesn't exist - create new one - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "issue_key": issue_identifier, - "issue_title": issue_title, - "status": formatted_issue.get("status", "Unknown"), - "priority": formatted_issue.get("priority", "Unknown"), - "comment_count": comment_count, - "document_type": "Jira Issue", - "connector_type": "Jira", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - issue_content, user_llm, document_metadata - ) - else: - # Fallback to simple summary if no LLM configured - summary_content = f"Jira Issue {issue_identifier}: {issue_title}\n\nStatus: {formatted_issue.get('status', 'Unknown')}\n\n" - if formatted_issue.get("description"): - summary_content += ( - f"Description: {formatted_issue.get('description')}\n\n" - ) - summary_content += f"Comments: {comment_count}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - using the full issue content with comments - chunks = await create_document_chunks(issue_content) - - # Create and store new document - logger.info( - f"Creating new document for issue {issue_identifier} - {issue_title}" - ) + # Create new document with PENDING status (visible in UI immediately) document = Document( search_space_id=search_space_id, - title=f"Jira - {issue_identifier}: {issue_title}", + title=f"{issue_identifier}: {issue_title}", document_type=DocumentType.JIRA_CONNECTOR, document_metadata={ "issue_id": issue_id, @@ -372,25 +290,122 @@ async def index_jira_issues( "issue_title": issue_title, "state": formatted_issue.get("status", "Unknown"), "comment_count": comment_count, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, }, - content=summary_content, - content_hash=content_hash, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector_id, ) - session.add(document) - documents_indexed += 1 - logger.info( - f"Successfully indexed new issue {issue_identifier} - {issue_title}" + new_documents_created = True + + issues_to_process.append( + { + "document": document, + "is_new": True, + "issue_content": issue_content, + "content_hash": content_hash, + "issue_id": issue_id, + "issue_identifier": issue_identifier, + "issue_title": issue_title, + "formatted_issue": formatted_issue, + "comment_count": comment_count, + } ) - # Batch commit every 10 documents + except Exception as e: + logger.error(f"Error in Phase 1 for issue: {e!s}", exc_info=True) + documents_failed += 1 + continue + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info( + f"Phase 1: Committing {len([i for i in issues_to_process if i['is_new']])} pending documents" + ) + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(issues_to_process)} documents") + + for item in issues_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item["document"] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (LLM, embeddings, chunks) + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata = { + "issue_key": item["issue_identifier"], + "issue_title": item["issue_title"], + "status": item["formatted_issue"].get("status", "Unknown"), + "priority": item["formatted_issue"].get("priority", "Unknown"), + "comment_count": item["comment_count"], + "document_type": "Jira Issue", + "connector_type": "Jira", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + item["issue_content"], user_llm, document_metadata + ) + else: + # Fallback to simple summary if no LLM configured + summary_content = f"Jira Issue {item['issue_identifier']}: {item['issue_title']}\n\nStatus: {item['formatted_issue'].get('status', 'Unknown')}\n\n" + if item["formatted_issue"].get("description"): + summary_content += f"Description: {item['formatted_issue'].get('description')}\n\n" + summary_content += f"Comments: {item['comment_count']}" + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + # Process chunks - using the full issue content with comments + chunks = await create_document_chunks(item["issue_content"]) + + # Update document to READY with actual content + document.title = f"{item['issue_identifier']}: {item['issue_title']}" + document.content = summary_content + document.content_hash = item["content_hash"] + document.embedding = summary_embedding + document.document_metadata = { + "issue_id": item["issue_id"], + "issue_identifier": item["issue_identifier"], + "issue_title": item["issue_title"], + "state": item["formatted_issue"].get("status", "Unknown"), + "comment_count": item["comment_count"], + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, + } + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + + documents_indexed += 1 + + # Batch commit every 10 documents (for ready status updates) if documents_indexed % 10 == 0: logger.info( f"Committing batch: {documents_indexed} Jira issues processed so far" @@ -399,48 +414,75 @@ async def index_jira_issues( except Exception as e: logger.error( - f"Error processing issue {issue.get('identifier', 'Unknown')}: {e!s}", + f"Error processing issue {item.get('issue_identifier', 'Unknown')}: {e!s}", exc_info=True, ) - skipped_issues.append( - f"{issue.get('identifier', 'Unknown')} (processing error)" - ) - documents_skipped += 1 + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error( + f"Failed to update document status to failed: {status_error}" + ) + documents_failed += 1 continue # Skip this issue and continue with others - # Update the last_indexed_at timestamp for the connector only if requested - total_processed = documents_indexed - if update_last_indexed: - await update_connector_last_indexed(session, connector, update_last_indexed) + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + # This ensures the UI shows "Last indexed" instead of "Never indexed" + await update_connector_last_indexed(session, connector, update_last_indexed) - # Final commit for any remaining documents not yet committed in batches + # Final commit to ensure all documents are persisted (safety net) logger.info(f"Final commit: Total {documents_indexed} Jira issues processed") - await session.commit() - logger.info("Successfully committed all JIRA document changes to database") + try: + await session.commit() + logger.info("Successfully committed all JIRA document changes to database") + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same issue was indexed by multiple connectors. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + # Don't fail the entire task - some documents may have been successfully indexed + else: + raise + + # Build warning message if there were issues + warning_parts = [] + if duplicate_content_count > 0: + warning_parts.append(f"{duplicate_content_count} duplicate") + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None # Log success await task_logger.log_task_success( log_entry, f"Successfully completed JIRA indexing for connector {connector_id}", { - "issues_processed": total_processed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, - "skipped_issues_count": len(skipped_issues), + "documents_failed": documents_failed, + "duplicate_content_count": duplicate_content_count, }, ) logger.info( - f"JIRA indexing completed: {documents_indexed} new issues, {documents_skipped} skipped" + f"JIRA indexing completed: {documents_indexed} ready, " + f"{documents_skipped} skipped, {documents_failed} failed " + f"({duplicate_content_count} duplicate content)" ) # Clean up the connector await jira_client.close() - return ( - total_processed, - None, - ) # Return None as the error message to indicate success + return documents_indexed, warning_message except SQLAlchemyError as db_error: await session.rollback() diff --git a/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py b/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py index a94420bc2..87bafb3c0 100644 --- a/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py @@ -1,5 +1,9 @@ """ Linear connector indexer. + +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Create all documents with 'pending' status (visible in UI immediately) +- Phase 2: Process each document: pending → processing → ready/failed """ import time @@ -11,7 +15,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.config import config from app.connectors.linear_connector import LinearConnector -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -28,6 +32,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -196,6 +201,7 @@ async def index_linear_issues( # Track the number of documents indexed documents_indexed = 0 documents_skipped = 0 + documents_failed = 0 # Track issues that failed processing skipped_issues = [] # Heartbeat tracking - update notification periodically to prevent appearing stuck @@ -207,16 +213,14 @@ async def index_linear_issues( {"stage": "process_issues", "total_issues": len(issues)}, ) - # Process each issue - for issue in issues: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() + # ======================================================================= + # PHASE 1: Analyze all issues, create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= + issues_to_process = [] # List of dicts with document and issue data + new_documents_created = False + for issue in issues: try: issue_id = issue.get("id", "") issue_identifier = issue.get("identifier", "") @@ -262,80 +266,39 @@ async def index_linear_issues( state = formatted_issue.get("state", "Unknown") description = formatted_issue.get("description", "") comment_count = len(formatted_issue.get("comments", [])) + priority = formatted_issue.get("priority", "Unknown") if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): + existing_document.status = DocumentStatus.ready() logger.info( f"Document for Linear issue {issue_identifier} unchanged. Skipping." ) documents_skipped += 1 continue - else: - # Content has changed - update the existing document - logger.info( - f"Content changed for Linear issue {issue_identifier}. Updating document." - ) - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "issue_id": issue_identifier, - "issue_title": issue_title, - "state": state, - "priority": formatted_issue.get("priority", "Unknown"), - "comment_count": comment_count, - "document_type": "Linear Issue", - "connector_type": "Linear", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - issue_content, user_llm, document_metadata - ) - else: - # Fallback to simple summary if no LLM configured - if description and len(description) > 1000: - description = description[:997] + "..." - summary_content = f"Linear Issue {issue_identifier}: {issue_title}\n\nStatus: {state}\n\n" - if description: - summary_content += f"Description: {description}\n\n" - summary_content += f"Comments: {comment_count}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - chunks = await create_document_chunks(issue_content) - - # Update existing document - existing_document.title = ( - f"Linear - {issue_identifier}: {issue_title}" - ) - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { + # Queue existing document for update (will be set to processing in Phase 2) + issues_to_process.append( + { + "document": existing_document, + "is_new": False, + "issue_content": issue_content, + "content_hash": content_hash, "issue_id": issue_id, "issue_identifier": issue_identifier, "issue_title": issue_title, "state": state, + "description": description, "comment_count": comment_count, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "priority": priority, } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - logger.info( - f"Successfully updated Linear issue {issue_identifier}" - ) - continue + ) + continue # Document doesn't exist by unique_identifier_hash # Check if a document with the same content_hash exists (from another connector) @@ -353,51 +316,10 @@ async def index_linear_issues( documents_skipped += 1 continue - # Document doesn't exist - create new one - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "issue_id": issue_identifier, - "issue_title": issue_title, - "state": state, - "priority": formatted_issue.get("priority", "Unknown"), - "comment_count": comment_count, - "document_type": "Linear Issue", - "connector_type": "Linear", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - issue_content, user_llm, document_metadata - ) - else: - # Fallback to simple summary if no LLM configured - # Truncate description if it's too long for the summary - if description and len(description) > 1000: - description = description[:997] + "..." - summary_content = f"Linear Issue {issue_identifier}: {issue_title}\n\nStatus: {state}\n\n" - if description: - summary_content += f"Description: {description}\n\n" - summary_content += f"Comments: {comment_count}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - using the full issue content with comments - chunks = await create_document_chunks(issue_content) - - # Create and store new document - logger.info( - f"Creating new document for issue {issue_identifier} - {issue_title}" - ) + # Create new document with PENDING status (visible in UI immediately) document = Document( search_space_id=search_space_id, - title=f"Linear - {issue_identifier}: {issue_title}", + title=f"{issue_identifier}: {issue_title}", document_type=DocumentType.LINEAR_CONNECTOR, document_metadata={ "issue_id": issue_id, @@ -405,25 +327,126 @@ async def index_linear_issues( "issue_title": issue_title, "state": state, "comment_count": comment_count, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, }, - content=summary_content, - content_hash=content_hash, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector_id, ) - session.add(document) - documents_indexed += 1 - logger.info( - f"Successfully indexed new issue {issue_identifier} - {issue_title}" + new_documents_created = True + + issues_to_process.append( + { + "document": document, + "is_new": True, + "issue_content": issue_content, + "content_hash": content_hash, + "issue_id": issue_id, + "issue_identifier": issue_identifier, + "issue_title": issue_title, + "state": state, + "description": description, + "comment_count": comment_count, + "priority": priority, + } ) - # Batch commit every 10 documents + except Exception as e: + logger.error(f"Error in Phase 1 for issue: {e!s}", exc_info=True) + documents_failed += 1 + continue + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info( + f"Phase 1: Committing {len([i for i in issues_to_process if i['is_new']])} pending documents" + ) + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(issues_to_process)} documents") + + for item in issues_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item["document"] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (LLM, embeddings, chunks) + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata_for_summary = { + "issue_id": item["issue_identifier"], + "issue_title": item["issue_title"], + "state": item["state"], + "priority": item["priority"], + "comment_count": item["comment_count"], + "document_type": "Linear Issue", + "connector_type": "Linear", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + item["issue_content"], user_llm, document_metadata_for_summary + ) + else: + # Fallback to simple summary if no LLM configured + description = item["description"] + if description and len(description) > 1000: + description = description[:997] + "..." + summary_content = f"Linear Issue {item['issue_identifier']}: {item['issue_title']}\n\nStatus: {item['state']}\n\n" + if description: + summary_content += f"Description: {description}\n\n" + summary_content += f"Comments: {item['comment_count']}" + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(item["issue_content"]) + + # Update document to READY with actual content + document.title = f"{item['issue_identifier']}: {item['issue_title']}" + document.content = summary_content + document.content_hash = item["content_hash"] + document.embedding = summary_embedding + document.document_metadata = { + "issue_id": item["issue_id"], + "issue_identifier": item["issue_identifier"], + "issue_title": item["issue_title"], + "state": item["state"], + "comment_count": item["comment_count"], + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, + } + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + + documents_indexed += 1 + + # Batch commit every 10 documents (for ready status updates) if documents_indexed % 10 == 0: logger.info( f"Committing batch: {documents_indexed} Linear issues processed so far" @@ -432,44 +455,72 @@ async def index_linear_issues( except Exception as e: logger.error( - f"Error processing issue {issue.get('identifier', 'Unknown')}: {e!s}", + f"Error processing issue {item.get('issue_identifier', 'Unknown')}: {e!s}", exc_info=True, ) + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error( + f"Failed to update document status to failed: {status_error}" + ) skipped_issues.append( - f"{issue.get('identifier', 'Unknown')} (processing error)" + f"{item.get('issue_identifier', 'Unknown')} (processing error)" ) - documents_skipped += 1 - continue # Skip this issue and continue with others + documents_failed += 1 + continue - # Update the last_indexed_at timestamp for the connector only if requested - total_processed = documents_indexed - if update_last_indexed: - await update_connector_last_indexed(session, connector, update_last_indexed) + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + await update_connector_last_indexed(session, connector, update_last_indexed) # Final commit for any remaining documents not yet committed in batches logger.info(f"Final commit: Total {documents_indexed} Linear issues processed") - await session.commit() - logger.info("Successfully committed all Linear document changes to database") + try: + await session.commit() + logger.info( + "Successfully committed all Linear document changes to database" + ) + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same issue was indexed by multiple connectors. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + else: + raise + + # Build warning message if there were issues + warning_parts = [] + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None # Log success await task_logger.log_task_success( log_entry, f"Successfully completed Linear indexing for connector {connector_id}", { - "issues_processed": total_processed, + "issues_processed": documents_indexed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, + "documents_failed": documents_failed, "skipped_issues_count": len(skipped_issues), }, ) logger.info( - f"Linear indexing completed: {documents_indexed} new issues, {documents_skipped} skipped" + f"Linear indexing completed: {documents_indexed} ready, " + f"{documents_skipped} skipped, {documents_failed} failed" ) - return ( - total_processed, - None, - ) # Return None as the error message to indicate success + return documents_indexed, warning_message except SQLAlchemyError as db_error: await session.rollback() diff --git a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py index c0eb58d1d..04af80e53 100644 --- a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py @@ -1,5 +1,9 @@ """ Luma connector indexer. + +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Collect all events and create pending documents (visible in UI immediately) +- Phase 2: Process each event: pending → processing → ready/failed """ import time @@ -11,7 +15,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.config import config from app.connectors.luma_connector import LumaConnector -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -27,6 +31,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -227,21 +232,22 @@ async def index_luma_events( logger.error(f"Error fetching Luma events: {e!s}", exc_info=True) return 0, f"Error fetching Luma events: {e!s}" + # ======================================================================= + # PHASE 1: Analyze all events, create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= documents_indexed = 0 documents_skipped = 0 + documents_failed = 0 skipped_events = [] # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() + events_to_process = [] # List of dicts with document and event data + new_documents_created = False + for event in events: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() try: # Luma event structure fields - events have nested 'event' field event_data = event.get("event", {}) @@ -298,91 +304,38 @@ async def index_luma_events( if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): + existing_document.status = DocumentStatus.ready() logger.info( f"Document for Luma event {event_name} unchanged. Skipping." ) documents_skipped += 1 continue - else: - # Content has changed - update the existing document - logger.info( - f"Content changed for Luma event {event_name}. Updating document." - ) - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "event_id": event_id, - "event_name": event_name, - "event_url": event_url, - "start_at": start_at, - "end_at": end_at, - "timezone": timezone, - "location": location or "No location", - "city": city, - "hosts": host_names, - "document_type": "Luma Event", - "connector_type": "Luma", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - event_markdown, user_llm, document_metadata - ) - else: - summary_content = f"Luma Event: {event_name}\n\n" - if event_url: - summary_content += f"URL: {event_url}\n" - summary_content += f"Start: {start_at}\n" - summary_content += f"End: {end_at}\n" - if timezone: - summary_content += f"Timezone: {timezone}\n" - if location: - summary_content += f"Location: {location}\n" - if city: - summary_content += f"City: {city}\n" - if host_names: - summary_content += f"Hosts: {host_names}\n" - if description: - desc_preview = description[:1000] - if len(description) > 1000: - desc_preview += "..." - summary_content += f"Description: {desc_preview}\n" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - chunks = await create_document_chunks(event_markdown) - - # Update existing document - existing_document.title = f"Luma Event - {event_name}" - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { + # Queue existing document for update (will be set to processing in Phase 2) + events_to_process.append( + { + "document": existing_document, + "is_new": False, "event_id": event_id, "event_name": event_name, "event_url": event_url, + "event_markdown": event_markdown, + "content_hash": content_hash, "start_at": start_at, "end_at": end_at, "timezone": timezone, "location": location, "city": city, - "hosts": host_names, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "host_names": host_names, + "description": description, + "cover_url": cover_url, } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - logger.info(f"Successfully updated Luma event {event_name}") - continue + ) + continue # Document doesn't exist by unique_identifier_hash # Check if a document with the same content_hash exists (from another connector) @@ -400,62 +353,10 @@ async def index_luma_events( documents_skipped += 1 continue - # Document doesn't exist - create new one - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "event_id": event_id, - "event_name": event_name, - "event_url": event_url, - "start_at": start_at, - "end_at": end_at, - "timezone": timezone, - "location": location or "No location", - "city": city, - "hosts": host_names, - "document_type": "Luma Event", - "connector_type": "Luma", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - event_markdown, user_llm, document_metadata - ) - else: - # Fallback to simple summary if no LLM configured - summary_content = f"Luma Event: {event_name}\n\n" - if event_url: - summary_content += f"URL: {event_url}\n" - summary_content += f"Start: {start_at}\n" - summary_content += f"End: {end_at}\n" - if timezone: - summary_content += f"Timezone: {timezone}\n" - if location: - summary_content += f"Location: {location}\n" - if city: - summary_content += f"City: {city}\n" - if host_names: - summary_content += f"Hosts: {host_names}\n" - if description: - desc_preview = description[:1000] - if len(description) > 1000: - desc_preview += "..." - summary_content += f"Description: {desc_preview}\n" - - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(event_markdown) - + # Create new document with PENDING status (visible in UI immediately) document = Document( search_space_id=search_space_id, - title=f"Luma Event - {event_name}", + title=event_name, document_type=DocumentType.LUMA_CONNECTOR, document_metadata={ "event_id": event_id, @@ -468,23 +369,151 @@ async def index_luma_events( "city": city, "hosts": host_names, "cover_url": cover_url, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, }, - content=summary_content, - content_hash=content_hash, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector_id, ) - session.add(document) - documents_indexed += 1 - logger.info(f"Successfully indexed new event {event_name}") + new_documents_created = True - # Batch commit every 10 documents + events_to_process.append( + { + "document": document, + "is_new": True, + "event_id": event_id, + "event_name": event_name, + "event_url": event_url, + "event_markdown": event_markdown, + "content_hash": content_hash, + "start_at": start_at, + "end_at": end_at, + "timezone": timezone, + "location": location, + "city": city, + "host_names": host_names, + "description": description, + "cover_url": cover_url, + } + ) + + except Exception as e: + logger.error(f"Error in Phase 1 for event: {e!s}", exc_info=True) + documents_failed += 1 + continue + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info( + f"Phase 1: Committing {len([e for e in events_to_process if e['is_new']])} pending documents" + ) + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(events_to_process)} documents") + + for item in events_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item["document"] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (LLM, embeddings, chunks) + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata_for_summary = { + "event_id": item["event_id"], + "event_name": item["event_name"], + "event_url": item["event_url"], + "start_at": item["start_at"], + "end_at": item["end_at"], + "timezone": item["timezone"], + "location": item["location"] or "No location", + "city": item["city"], + "hosts": item["host_names"], + "document_type": "Luma Event", + "connector_type": "Luma", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + item["event_markdown"], user_llm, document_metadata_for_summary + ) + else: + # Fallback to simple summary if no LLM configured + summary_content = f"Luma Event: {item['event_name']}\n\n" + if item["event_url"]: + summary_content += f"URL: {item['event_url']}\n" + summary_content += f"Start: {item['start_at']}\n" + summary_content += f"End: {item['end_at']}\n" + if item["timezone"]: + summary_content += f"Timezone: {item['timezone']}\n" + if item["location"]: + summary_content += f"Location: {item['location']}\n" + if item["city"]: + summary_content += f"City: {item['city']}\n" + if item["host_names"]: + summary_content += f"Hosts: {item['host_names']}\n" + if item["description"]: + desc_preview = item["description"][:1000] + if len(item["description"]) > 1000: + desc_preview += "..." + summary_content += f"Description: {desc_preview}\n" + + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(item["event_markdown"]) + + # Update document to READY with actual content + document.title = item["event_name"] + document.content = summary_content + document.content_hash = item["content_hash"] + document.embedding = summary_embedding + document.document_metadata = { + "event_id": item["event_id"], + "event_name": item["event_name"], + "event_url": item["event_url"], + "start_at": item["start_at"], + "end_at": item["end_at"], + "timezone": item["timezone"], + "location": item["location"], + "city": item["city"], + "hosts": item["host_names"], + "cover_url": item["cover_url"], + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, + } + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + + documents_indexed += 1 + + # Batch commit every 10 documents (for ready status updates) if documents_indexed % 10 == 0: logger.info( f"Committing batch: {documents_indexed} Luma events processed so far" @@ -493,38 +522,71 @@ async def index_luma_events( except Exception as e: logger.error( - f"Error processing event {event.get('name', 'Unknown')}: {e!s}", + f"Error processing event {item.get('event_name', 'Unknown')}: {e!s}", exc_info=True, ) + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error( + f"Failed to update document status to failed: {status_error}" + ) skipped_events.append( - f"{event.get('name', 'Unknown')} (processing error)" + f"{item.get('event_name', 'Unknown')} (processing error)" ) - documents_skipped += 1 + documents_failed += 1 continue - total_processed = documents_indexed - if total_processed > 0: - await update_connector_last_indexed(session, connector, update_last_indexed) + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + # This ensures the UI shows "Last indexed" instead of "Never indexed" + await update_connector_last_indexed(session, connector, update_last_indexed) # Final commit for any remaining documents not yet committed in batches logger.info(f"Final commit: Total {documents_indexed} Luma events processed") - await session.commit() + try: + await session.commit() + logger.info("Successfully committed all Luma document changes to database") + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same event was indexed by multiple connectors. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + # Don't fail the entire task - some documents may have been successfully indexed + else: + raise + + # Build warning message if there were issues + warning_parts = [] + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None await task_logger.log_task_success( log_entry, f"Successfully completed Luma indexing for connector {connector_id}", { - "events_processed": total_processed, + "events_processed": documents_indexed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, + "documents_failed": documents_failed, "skipped_events_count": len(skipped_events), }, ) logger.info( - f"Luma indexing completed: {documents_indexed} new events, {documents_skipped} skipped" + f"Luma indexing completed: {documents_indexed} ready, " + f"{documents_skipped} skipped, {documents_failed} failed" ) - return total_processed, None + return documents_indexed, warning_message except SQLAlchemyError as db_error: await session.rollback() diff --git a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py index ba494bb9f..1a67ee7fc 100644 --- a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py @@ -1,5 +1,9 @@ """ Notion connector indexer. + +Implements real-time document status updates using a two-phase approach: +- Phase 1: Create all documents with PENDING status (visible in UI immediately) +- Phase 2: Process each document one by one (pending → processing → ready/failed) """ import time @@ -9,8 +13,9 @@ from datetime import datetime from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession +from app.config import config from app.connectors.notion_history import NotionHistoryConnector -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -28,6 +33,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -245,12 +251,17 @@ async def index_notion_pages( {"pages_found": 0}, ) logger.info("No Notion pages found to index") + # CRITICAL: Update timestamp even when no pages found so Electric SQL syncs + await update_connector_last_indexed(session, connector, update_last_indexed) + await session.commit() await notion_client.close() return 0, None # Success with 0 pages, not an error # Track the number of documents indexed documents_indexed = 0 documents_skipped = 0 + documents_failed = 0 + duplicate_content_count = 0 skipped_pages = [] # Heartbeat tracking - update notification periodically to prevent appearing stuck @@ -262,22 +273,69 @@ async def index_notion_pages( {"stage": "process_pages", "total_pages": len(pages)}, ) - # Process each page - for page in pages: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() + # ======================================================================= + # PHASE 1: Analyze all pages, create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= + pages_to_process = [] # List of dicts with document and page data + new_documents_created = False + # Helper function to convert page content to markdown + def process_blocks(blocks, level=0): + result = "" + for block in blocks: + block_type = block.get("type") + block_content = block.get("content", "") + children = block.get("children", []) + + # Add indentation based on level + indent = " " * level + + # Format based on block type + if block_type in ["paragraph", "text"]: + result += f"{indent}{block_content}\n\n" + elif block_type in ["heading_1", "header"]: + result += f"{indent}# {block_content}\n\n" + elif block_type == "heading_2": + result += f"{indent}## {block_content}\n\n" + elif block_type == "heading_3": + result += f"{indent}### {block_content}\n\n" + elif block_type == "bulleted_list_item": + result += f"{indent}* {block_content}\n" + elif block_type == "numbered_list_item": + result += f"{indent}1. {block_content}\n" + elif block_type == "to_do": + result += f"{indent}- [ ] {block_content}\n" + elif block_type == "toggle": + result += f"{indent}> {block_content}\n" + elif block_type == "code": + result += f"{indent}```\n{block_content}\n```\n\n" + elif block_type == "quote": + result += f"{indent}> {block_content}\n\n" + elif block_type == "callout": + result += f"{indent}> **Note:** {block_content}\n\n" + elif block_type == "image": + result += f"{indent}![Image]({block_content})\n\n" + else: + # Default for other block types + if block_content: + result += f"{indent}{block_content}\n\n" + + # Process children recursively + if children: + result += process_blocks(children, level + 1) + + return result + + for page in pages: try: page_id = page.get("page_id") page_title = page.get("title", f"Untitled page ({page_id})") page_content = page.get("content", []) - logger.info(f"Processing Notion page: {page_title} ({page_id})") + if not page_id: + documents_skipped += 1 + continue if not page_content: logger.info(f"No content found in page {page_title}. Skipping.") @@ -287,57 +345,6 @@ async def index_notion_pages( # Convert page content to markdown format markdown_content = f"# Notion Page: {page_title}\n\n" - - # Process blocks recursively - def process_blocks(blocks, level=0): - result = "" - for block in blocks: - block_type = block.get("type") - block_content = block.get("content", "") - children = block.get("children", []) - - # Add indentation based on level - indent = " " * level - - # Format based on block type - if block_type in ["paragraph", "text"]: - result += f"{indent}{block_content}\n\n" - elif block_type in ["heading_1", "header"]: - result += f"{indent}# {block_content}\n\n" - elif block_type == "heading_2": - result += f"{indent}## {block_content}\n\n" - elif block_type == "heading_3": - result += f"{indent}### {block_content}\n\n" - elif block_type == "bulleted_list_item": - result += f"{indent}* {block_content}\n" - elif block_type == "numbered_list_item": - result += f"{indent}1. {block_content}\n" - elif block_type == "to_do": - result += f"{indent}- [ ] {block_content}\n" - elif block_type == "toggle": - result += f"{indent}> {block_content}\n" - elif block_type == "code": - result += f"{indent}```\n{block_content}\n```\n\n" - elif block_type == "quote": - result += f"{indent}> {block_content}\n\n" - elif block_type == "callout": - result += f"{indent}> **Note:** {block_content}\n\n" - elif block_type == "image": - result += f"{indent}![Image]({block_content})\n\n" - else: - # Default for other block types - if block_content: - result += f"{indent}{block_content}\n\n" - - # Process children recursively - if children: - result += process_blocks(children, level + 1) - - return result - - logger.debug( - f"Converting {len(page_content)} blocks to markdown for page {page_title}" - ) markdown_content += process_blocks(page_content) # Format document metadata @@ -377,71 +384,26 @@ async def index_notion_pages( if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: - logger.info( - f"Document for Notion page {page_title} unchanged. Skipping." - ) + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): + existing_document.status = DocumentStatus.ready() documents_skipped += 1 continue - else: - # Content has changed - update the existing document - logger.info( - f"Content changed for Notion page {page_title}. Updating document." - ) - # Get user's long context LLM - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - if not user_llm: - logger.error( - f"No long context LLM configured for user {user_id}" - ) - skipped_pages.append(f"{page_title} (no LLM configured)") - documents_skipped += 1 - continue - - # Generate summary with metadata - document_metadata = { - "page_title": page_title, + # Queue existing document for update (will be set to processing in Phase 2) + pages_to_process.append( + { + "document": existing_document, + "is_new": False, + "markdown_content": markdown_content, + "content_hash": content_hash, "page_id": page_id, - "document_type": "Notion Page", - "connector_type": "Notion", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - - # Process chunks - chunks = await create_document_chunks(markdown_content) - - # Update existing document - existing_document.title = f"Notion - {page_title}" - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { "page_title": page_title, - "page_id": page_id, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - existing_document.connector_id = connector_id - - documents_indexed += 1 - logger.info(f"Successfully updated Notion page: {page_title}") - - # Batch commit every 10 documents - if documents_indexed % 10 == 0: - logger.info( - f"Committing batch: {documents_indexed} documents processed so far" - ) - await session.commit() - - continue + ) + continue # Document doesn't exist by unique_identifier_hash # Check if a document with the same content_hash exists (from another connector) @@ -456,91 +418,182 @@ async def index_notion_pages( f"(existing document ID: {duplicate_by_content.id}, " f"type: {duplicate_by_content.document_type}). Skipping." ) + duplicate_content_count += 1 documents_skipped += 1 continue - # Document doesn't exist - create new one - # Get user's long context LLM - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - if not user_llm: - logger.error(f"No long context LLM configured for user {user_id}") - skipped_pages.append(f"{page_title} (no LLM configured)") - documents_skipped += 1 - continue - - # Generate summary with metadata - logger.debug(f"Generating summary for page {page_title}") - document_metadata = { - "page_title": page_title, - "page_id": page_id, - "document_type": "Notion Page", - "connector_type": "Notion", - } - summary_content, summary_embedding = await generate_document_summary( - markdown_content, user_llm, document_metadata - ) - - # Process chunks - logger.debug(f"Chunking content for page {page_title}") - chunks = await create_document_chunks(markdown_content) - - # Create and store new document + # Create new document with PENDING status (visible in UI immediately) document = Document( search_space_id=search_space_id, - title=f"Notion - {page_title}", + title=page_title, document_type=DocumentType.NOTION_CONNECTOR, document_metadata={ "page_title": page_title, "page_id": page_id, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, }, - content=summary_content, - content_hash=content_hash, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector_id, ) - session.add(document) - documents_indexed += 1 - logger.info(f"Successfully indexed new Notion page: {page_title}") + new_documents_created = True - # Batch commit every 10 documents + pages_to_process.append( + { + "document": document, + "is_new": True, + "markdown_content": markdown_content, + "content_hash": content_hash, + "page_id": page_id, + "page_title": page_title, + } + ) + + except Exception as e: + logger.error(f"Error in Phase 1 for page: {e!s}", exc_info=True) + documents_failed += 1 + continue + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info( + f"Phase 1: Committing {len([p for p in pages_to_process if p['is_new']])} pending documents" + ) + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(pages_to_process)} documents") + + for item in pages_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item["document"] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (LLM, embeddings, chunks) + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata_for_summary = { + "page_title": item["page_title"], + "page_id": item["page_id"], + "document_type": "Notion Page", + "connector_type": "Notion", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + item["markdown_content"], + user_llm, + document_metadata_for_summary, + ) + else: + # Fallback to simple summary if no LLM configured + summary_content = f"Notion Page: {item['page_title']}\n\n{item['markdown_content'][:500]}..." + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(item["markdown_content"]) + + # Update document to READY with actual content + document.title = item["page_title"] + document.content = summary_content + document.content_hash = item["content_hash"] + document.embedding = summary_embedding + document.document_metadata = { + "page_title": item["page_title"], + "page_id": item["page_id"], + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, + } + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + + documents_indexed += 1 + + # Batch commit every 10 documents (for ready status updates) if documents_indexed % 10 == 0: logger.info( - f"Committing batch: {documents_indexed} documents processed so far" + f"Committing batch: {documents_indexed} Notion pages processed so far" ) await session.commit() except Exception as e: - logger.error( - f"Error processing Notion page {page.get('title', 'Unknown')}: {e!s}", - exc_info=True, - ) - skipped_pages.append( - f"{page.get('title', 'Unknown')} (processing error)" - ) - documents_skipped += 1 - continue # Skip this page and continue with others + logger.error(f"Error processing Notion page: {e!s}", exc_info=True) + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error( + f"Failed to update document status to failed: {status_error}" + ) + skipped_pages.append(f"{item['page_title']} (processing error)") + documents_failed += 1 + continue + + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + await update_connector_last_indexed(session, connector, update_last_indexed) - # Update the last_indexed_at timestamp for the connector only if requested - # and if we successfully indexed at least one page total_processed = documents_indexed - if total_processed > 0: - await update_connector_last_indexed(session, connector, update_last_indexed) - # Final commit for any remaining documents not yet committed in batches + # Final commit to ensure all documents are persisted (safety net) logger.info(f"Final commit: Total {documents_indexed} documents processed") - await session.commit() + try: + await session.commit() + logger.info( + "Successfully committed all Notion document changes to database" + ) + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same page was indexed by multiple connectors. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + # Don't fail the entire task - some documents may have been successfully indexed + else: + raise # Get final count of pages with skipped Notion AI content pages_with_skipped_ai_content = notion_client.get_skipped_content_count() + # Build warning message if there were issues + warning_parts = [] + if duplicate_content_count > 0: + warning_parts.append(f"{duplicate_content_count} duplicate") + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None + # Prepare result message with user-friendly notification about skipped content result_message = None if skipped_pages: @@ -563,6 +616,8 @@ async def index_notion_pages( "pages_processed": total_processed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, + "documents_failed": documents_failed, + "duplicate_content_count": duplicate_content_count, "skipped_pages_count": len(skipped_pages), "pages_with_skipped_ai_content": pages_with_skipped_ai_content, "result_message": result_message, @@ -570,7 +625,9 @@ async def index_notion_pages( ) logger.info( - f"Notion indexing completed: {documents_indexed} new pages, {documents_skipped} skipped" + f"Notion indexing completed: {documents_indexed} ready, " + f"{documents_skipped} skipped, {documents_failed} failed " + f"({duplicate_content_count} duplicate content)" ) # Clean up the async client @@ -590,6 +647,10 @@ async def index_notion_pages( "Using legacy token. Reconnect with OAuth for better reliability." ) + # Include warning message if there were issues + if warning_message: + notification_parts.append(warning_message) + user_notification_message = ( " ".join(notification_parts) if notification_parts else None ) diff --git a/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py b/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py index cfc321df1..6dea1a730 100644 --- a/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py @@ -3,6 +3,10 @@ Obsidian connector indexer. Indexes markdown notes from a local Obsidian vault. This connector is only available in self-hosted mode. + +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Create all documents with 'pending' status (visible in UI immediately) +- Phase 2: Process each document: pending → processing → ready/failed """ import os @@ -17,7 +21,7 @@ from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession from app.config import config -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -34,6 +38,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -307,25 +312,22 @@ async def index_obsidian_vault( logger.info(f"Processing {len(files)} files after date filtering") - # Get LLM for summarization - long_context_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - indexed_count = 0 skipped_count = 0 + failed_count = 0 + duplicate_content_count = 0 # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() + # ======================================================================= + # PHASE 1: Analyze all files, create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= + files_to_process = [] # List of dicts with document and file data + new_documents_created = False + for file_info in files: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(indexed_count) - last_heartbeat_time = time.time() try: file_path = file_info["path"] relative_path = file_info["relative_path"] @@ -368,13 +370,151 @@ async def index_obsidian_vault( search_space_id, ) + # Generate content hash + content_hash = generate_content_hash(content, search_space_id) + # Check for existing document existing_document = await check_document_by_unique_identifier( session, unique_identifier_hash ) - # Generate content hash - content_hash = generate_content_hash(content, search_space_id) + if existing_document: + # Document exists - check if content has changed + if existing_document.content_hash == content_hash: + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): + existing_document.status = DocumentStatus.ready() + logger.debug(f"Note {title} unchanged, skipping") + skipped_count += 1 + continue + + # Queue existing document for update (will be set to processing in Phase 2) + files_to_process.append( + { + "document": existing_document, + "is_new": False, + "file_info": file_info, + "content": content, + "body_content": body_content, + "frontmatter": frontmatter, + "wiki_links": wiki_links, + "tags": tags, + "title": title, + "relative_path": relative_path, + "content_hash": content_hash, + "unique_identifier_hash": unique_identifier_hash, + } + ) + continue + + # Document doesn't exist by unique_identifier_hash + # Check if a document with the same content_hash exists (from another connector) + with session.no_autoflush: + duplicate_by_content = await check_duplicate_document_by_hash( + session, content_hash + ) + + if duplicate_by_content: + logger.info( + f"Obsidian note {title} already indexed by another connector " + f"(existing document ID: {duplicate_by_content.id}, " + f"type: {duplicate_by_content.document_type}). Skipping." + ) + duplicate_content_count += 1 + skipped_count += 1 + continue + + # Create new document with PENDING status (visible in UI immediately) + document = Document( + search_space_id=search_space_id, + title=title, + document_type=DocumentType.OBSIDIAN_CONNECTOR, + document_metadata={ + "vault_name": vault_name, + "file_path": relative_path, + "connector_id": connector_id, + }, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready + unique_identifier_hash=unique_identifier_hash, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts + updated_at=get_current_timestamp(), + created_by_id=user_id, + connector_id=connector_id, + ) + session.add(document) + new_documents_created = True + + files_to_process.append( + { + "document": document, + "is_new": True, + "file_info": file_info, + "content": content, + "body_content": body_content, + "frontmatter": frontmatter, + "wiki_links": wiki_links, + "tags": tags, + "title": title, + "relative_path": relative_path, + "content_hash": content_hash, + "unique_identifier_hash": unique_identifier_hash, + } + ) + + except Exception as e: + logger.exception( + f"Error in Phase 1 for file {file_info.get('path', 'unknown')}: {e}" + ) + failed_count += 1 + continue + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info( + f"Phase 1: Committing {len([f for f in files_to_process if f['is_new']])} pending documents" + ) + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(files_to_process)} documents") + + # Get LLM for summarization + long_context_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + for item in files_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(indexed_count) + last_heartbeat_time = current_time + + document = item["document"] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Extract data from item + title = item["title"] + relative_path = item["relative_path"] + content = item["content"] + body_content = item["body_content"] + frontmatter = item["frontmatter"] + wiki_links = item["wiki_links"] + tags = item["tags"] + content_hash = item["content_hash"] + file_info = item["file_info"] # Build metadata document_metadata = { @@ -404,134 +544,114 @@ async def index_obsidian_vault( ] document_string = build_document_metadata_string(metadata_sections) - if existing_document: - # Check if content has changed - if existing_document.content_hash == content_hash: - logger.debug(f"Note {title} unchanged, skipping") - skipped_count += 1 - continue - - # Update existing document - logger.info(f"Updating note: {title}") - - # Generate new summary if content changed - if long_context_llm: - new_summary, _ = await generate_document_summary( - document_string, - long_context_llm, - document_metadata, - ) - # Store summary in metadata - document_metadata["summary"] = new_summary - - # Add URL and connector_id to metadata - document_metadata["url"] = ( - f"obsidian://{vault_name}/{relative_path}" - ) - document_metadata["connector_id"] = connector_id - - existing_document.content = document_string - existing_document.content_hash = content_hash - existing_document.document_metadata = document_metadata - existing_document.updated_at = get_current_timestamp() - - # Update embedding - embedding = config.embedding_model_instance.embed(document_string) - existing_document.embedding = embedding - - # Update chunks - delete old and create new - existing_document.chunks.clear() - new_chunks = await create_document_chunks(document_string) - existing_document.chunks = new_chunks - - indexed_count += 1 - - else: - # Document doesn't exist by unique_identifier_hash - # Check if a document with the same content_hash exists (from another connector) - with session.no_autoflush: - duplicate_by_content = await check_duplicate_document_by_hash( - session, content_hash - ) - - if duplicate_by_content: - logger.info( - f"Obsidian note {title} already indexed by another connector " - f"(existing document ID: {duplicate_by_content.id}, " - f"type: {duplicate_by_content.document_type}). Skipping." - ) - skipped_count += 1 - continue - - # Create new document - logger.info(f"Indexing new note: {title}") - - # Generate summary - summary_content = "" - if long_context_llm: - summary_content, _ = await generate_document_summary( - document_string, - long_context_llm, - document_metadata, - ) - - # Generate embedding - embedding = config.embedding_model_instance.embed(document_string) - - # Add URL and summary to metadata - document_metadata["url"] = ( - f"obsidian://{vault_name}/{relative_path}" - ) - document_metadata["summary"] = summary_content - document_metadata["connector_id"] = connector_id - - # Create chunks - chunks = await create_document_chunks(document_string) - - # Create document - new_document = Document( - search_space_id=search_space_id, - title=title, - document_type=DocumentType.OBSIDIAN_CONNECTOR, - content=document_string, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - document_metadata=document_metadata, - embedding=embedding, - chunks=chunks, - updated_at=get_current_timestamp(), - created_by_id=user_id, - connector_id=connector_id, + # Generate summary + summary_content = "" + if long_context_llm: + summary_content, _ = await generate_document_summary( + document_string, + long_context_llm, + document_metadata, ) - session.add(new_document) + # Generate embedding + embedding = config.embedding_model_instance.embed(document_string) - indexed_count += 1 + # Add URL and summary to metadata + document_metadata["url"] = f"obsidian://{vault_name}/{relative_path}" + document_metadata["summary"] = summary_content + document_metadata["connector_id"] = connector_id + + # Create chunks + chunks = await create_document_chunks(document_string) + + # Update document to READY with actual content + document.title = title + document.content = document_string + document.content_hash = content_hash + document.embedding = embedding + document.document_metadata = document_metadata + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + + indexed_count += 1 + + # Batch commit every 10 documents (for ready status updates) + if indexed_count % 10 == 0: + logger.info( + f"Committing batch: {indexed_count} Obsidian notes processed so far" + ) + await session.commit() except Exception as e: logger.exception( - f"Error processing file {file_info.get('path', 'unknown')}: {e}" + f"Error processing file {item.get('file_info', {}).get('path', 'unknown')}: {e}" ) - skipped_count += 1 + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error( + f"Failed to update document status to failed: {status_error}" + ) + failed_count += 1 continue - # Update connector's last indexed timestamp + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs await update_connector_last_indexed(session, connector, update_last_indexed) - # Commit all changes - await session.commit() + # Final commit for any remaining documents not yet committed in batches + logger.info(f"Final commit: Total {indexed_count} Obsidian notes processed") + try: + await session.commit() + logger.info( + "Successfully committed all Obsidian document changes to database" + ) + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same note was indexed by multiple connectors. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + # Don't fail the entire task - some documents may have been successfully indexed + else: + raise + + # Build warning message if there were issues + warning_parts = [] + if duplicate_content_count > 0: + warning_parts.append(f"{duplicate_content_count} duplicate") + if failed_count > 0: + warning_parts.append(f"{failed_count} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None + + total_processed = indexed_count await task_logger.log_task_success( log_entry, - f"Successfully indexed {indexed_count} Obsidian notes (skipped {skipped_count})", + f"Successfully completed Obsidian vault indexing for connector {connector_id}", { - "indexed_count": indexed_count, - "skipped_count": skipped_count, - "total_files": len(files), + "notes_processed": total_processed, + "documents_indexed": indexed_count, + "documents_skipped": skipped_count, + "documents_failed": failed_count, + "duplicate_content_count": duplicate_content_count, }, ) - return indexed_count, None + logger.info( + f"Obsidian vault indexing completed: {indexed_count} ready, " + f"{skipped_count} skipped, {failed_count} failed " + f"({duplicate_content_count} duplicate content)" + ) + return total_processed, warning_message except SQLAlchemyError as e: logger.exception(f"Database error during Obsidian indexing: {e}") diff --git a/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py b/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py index 3cb4e3c85..111552fa6 100644 --- a/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py @@ -1,5 +1,9 @@ """ Slack connector indexer. + +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Create all documents with 'pending' status (visible in UI immediately) +- Phase 2: Process each document: pending → processing → ready/failed """ import time @@ -12,7 +16,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.config import config from app.connectors.slack_history import SlackHistory -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, @@ -28,6 +32,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -168,11 +173,15 @@ async def index_slack_messages( f"No Slack channels found for connector {connector_id}", {"channels_found": 0}, ) - return 0, "No Slack channels found" + # CRITICAL: Update timestamp even when no channels found so Electric SQL syncs + await update_connector_last_indexed(session, connector, update_last_indexed) + await session.commit() + return 0, None # Return None (not error) when no channels found # Track the number of documents indexed documents_indexed = 0 documents_skipped = 0 + documents_failed = 0 # Track messages that failed processing skipped_channels = [] # Heartbeat tracking - update notification periodically to prevent appearing stuck @@ -184,15 +193,14 @@ async def index_slack_messages( {"stage": "process_channels", "total_channels": len(channels)}, ) - # Process each channel + # ======================================================================= + # PHASE 1: Collect all messages from all channels, create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= + messages_to_process = [] # List of dicts with document and message data + new_documents_created = False + for channel_obj in channels: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() channel_id = channel_obj["id"] channel_name = channel_obj["name"] is_private = channel_obj["is_private"] @@ -305,47 +313,33 @@ async def index_slack_messages( if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): + existing_document.status = DocumentStatus.ready() logger.info( f"Document for Slack message {msg_ts} in channel {channel_name} unchanged. Skipping." ) documents_skipped += 1 continue - else: - # Content has changed - update the existing document - logger.info( - f"Content changed for Slack message {msg_ts} in channel {channel_name}. Updating document." - ) - # Update chunks and embedding - chunks = await create_document_chunks( - combined_document_string - ) - doc_embedding = config.embedding_model_instance.embed( - combined_document_string - ) - - # Update existing document - existing_document.content = combined_document_string - existing_document.content_hash = content_hash - existing_document.embedding = doc_embedding - existing_document.document_metadata = { + # Queue existing document for update (will be set to processing in Phase 2) + messages_to_process.append( + { + "document": existing_document, + "is_new": False, + "combined_document_string": combined_document_string, + "content_hash": content_hash, "channel_name": channel_name, "channel_id": channel_id, + "msg_ts": msg_ts, "start_date": start_date_str, "end_date": end_date_str, "message_count": len(formatted_messages), - "indexed_at": datetime.now().strftime( - "%Y-%m-%d %H:%M:%S" - ), } - - # Delete old chunks and add new ones - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - logger.info(f"Successfully updated Slack message {msg_ts}") - continue + ) + continue # Document doesn't exist by unique_identifier_hash # Check if a document with the same content_hash exists (from another connector) @@ -363,48 +357,47 @@ async def index_slack_messages( documents_skipped += 1 continue - # Document doesn't exist - create new one - # Process chunks - chunks = await create_document_chunks(combined_document_string) - doc_embedding = config.embedding_model_instance.embed( - combined_document_string - ) - - # Create and store new document + # Create new document with PENDING status (visible in UI immediately) document = Document( search_space_id=search_space_id, - title=f"Slack - {channel_name}", + title=channel_name, document_type=DocumentType.SLACK_CONNECTOR, document_metadata={ "channel_name": channel_name, "channel_id": channel_id, - "start_date": start_date_str, - "end_date": end_date_str, - "message_count": len(formatted_messages), - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "msg_ts": msg_ts, + "connector_id": connector_id, }, - content=combined_document_string, - embedding=doc_embedding, - chunks=chunks, - content_hash=content_hash, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready unique_identifier_hash=unique_identifier_hash, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector_id, ) - session.add(document) - documents_indexed += 1 + new_documents_created = True - # Batch commit every 10 documents - if documents_indexed % 10 == 0: - logger.info( - f"Committing batch: {documents_indexed} Slack channels processed so far" - ) - await session.commit() + messages_to_process.append( + { + "document": document, + "is_new": True, + "combined_document_string": combined_document_string, + "content_hash": content_hash, + "channel_name": channel_name, + "channel_id": channel_id, + "msg_ts": msg_ts, + "start_date": start_date_str, + "end_date": end_date_str, + "message_count": len(formatted_messages), + } + ) logger.info( - f"Successfully indexed new channel {channel_name} with {len(formatted_messages)} messages" + f"Phase 1: Collected {len(formatted_messages)} messages from channel {channel_name}" ) except SlackApiError as slack_error: @@ -420,43 +413,129 @@ async def index_slack_messages( documents_skipped += 1 continue # Skip this channel and continue with others - # Update the last_indexed_at timestamp for the connector only if requested - # and if we successfully indexed at least one channel - total_processed = documents_indexed - if total_processed > 0: - await update_connector_last_indexed(session, connector, update_last_indexed) + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info( + f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents" + ) + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(messages_to_process)} documents") + + for item in messages_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item["document"] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (embeddings, chunks) + chunks = await create_document_chunks(item["combined_document_string"]) + doc_embedding = config.embedding_model_instance.embed( + item["combined_document_string"] + ) + + # Update document to READY with actual content + document.title = item["channel_name"] + document.content = item["combined_document_string"] + document.content_hash = item["content_hash"] + document.embedding = doc_embedding + document.document_metadata = { + "channel_name": item["channel_name"], + "channel_id": item["channel_id"], + "start_date": item["start_date"], + "end_date": item["end_date"], + "message_count": item["message_count"], + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, + } + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + + documents_indexed += 1 + + # Batch commit every 10 documents (for ready status updates) + if documents_indexed % 10 == 0: + logger.info( + f"Committing batch: {documents_indexed} Slack messages processed so far" + ) + await session.commit() + + except Exception as e: + logger.error( + f"Error processing Slack message {item.get('msg_ts', 'Unknown')}: {e!s}", + exc_info=True, + ) + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error( + f"Failed to update document status to failed: {status_error}" + ) + documents_failed += 1 + continue + + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + await update_connector_last_indexed(session, connector, update_last_indexed) # Final commit for any remaining documents not yet committed in batches - logger.info(f"Final commit: Total {documents_indexed} Slack channels processed") - await session.commit() + logger.info(f"Final commit: Total {documents_indexed} Slack messages processed") + try: + await session.commit() + logger.info("Successfully committed all Slack document changes to database") + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same message was indexed by multiple connectors. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + else: + raise - # Prepare result message - result_message = None - if skipped_channels: - result_message = f"Processed {total_processed} channels. Skipped {len(skipped_channels)} channels: {', '.join(skipped_channels)}" - else: - result_message = f"Processed {total_processed} channels." + # Build warning message if there were issues + warning_parts = [] + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None # Log success await task_logger.log_task_success( log_entry, f"Successfully completed Slack indexing for connector {connector_id}", { - "channels_processed": total_processed, + "channels_processed": len(channels), "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, + "documents_failed": documents_failed, "skipped_channels_count": len(skipped_channels), - "result_message": result_message, }, ) logger.info( - f"Slack indexing completed: {documents_indexed} new channels, {documents_skipped} skipped" + f"Slack indexing completed: {documents_indexed} ready, " + f"{documents_skipped} skipped, {documents_failed} failed" ) - return ( - total_processed, - None, - ) # Return None on success (result_message is for logging only) + return documents_indexed, warning_message except SQLAlchemyError as db_error: await session.rollback() diff --git a/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py b/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py index 1e26fbc42..1b13a2c37 100644 --- a/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py @@ -1,17 +1,21 @@ """ Microsoft Teams connector indexer. + +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Create all documents with 'pending' status (visible in UI immediately) +- Phase 2: Process each document: pending → processing → ready/failed """ import time from collections.abc import Awaitable, Callable -from datetime import UTC +from datetime import UTC, datetime from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession from app.config import config from app.connectors.teams_history import TeamsHistory -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, @@ -27,6 +31,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -50,6 +55,10 @@ async def index_teams_messages( """ Index Microsoft Teams messages from all accessible teams and channels. + Implements 2-phase document status updates for real-time UI feedback: + - Phase 1: Create all documents with 'pending' status (visible in UI immediately) + - Phase 2: Process each document: pending → processing → ready/failed + Args: session: Database session connector_id: ID of the Teams connector @@ -165,11 +174,16 @@ async def index_teams_messages( f"No Teams found for connector {connector_id}", {"teams_found": 0}, ) - return 0, "No Teams found" + # CRITICAL: Update timestamp even when no teams found so Electric SQL syncs + await update_connector_last_indexed(session, connector, update_last_indexed) + await session.commit() + return 0, None # Return None (not error) when no items found # Track the number of documents indexed documents_indexed = 0 documents_skipped = 0 + documents_failed = 0 + duplicate_content_count = 0 skipped_channels = [] # Heartbeat tracking - update notification periodically to prevent appearing stuck @@ -182,8 +196,6 @@ async def index_teams_messages( ) # Convert date strings to datetime objects for filtering - from datetime import datetime - start_datetime = None end_datetime = None if start_date_str: @@ -197,16 +209,14 @@ async def index_teams_messages( hour=23, minute=59, second=59, tzinfo=UTC ) - # Process each team - for team in teams: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() + # ======================================================================= + # PHASE 1: Collect all messages and create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= + messages_to_process = [] # List of dicts with document and message data + new_documents_created = False + for team in teams: team_id = team.get("id") team_name = team.get("displayName", "Unknown Team") @@ -239,7 +249,6 @@ async def index_teams_messages( channel_name, team_name, ) - documents_skipped += 1 continue # Process each message @@ -322,60 +331,33 @@ async def index_teams_messages( if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: - logger.info( - "Document for Teams message %s in channel %s unchanged. Skipping.", - message_id, - channel_name, - ) + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): + existing_document.status = ( + DocumentStatus.ready() + ) documents_skipped += 1 continue - else: - # Content has changed - update the existing document - logger.info( - "Content changed for Teams message %s in channel %s. Updating document.", - message_id, - channel_name, - ) - # Update chunks and embedding - chunks = await create_document_chunks( - combined_document_string - ) - doc_embedding = ( - config.embedding_model_instance.embed( - combined_document_string - ) - ) - - # Update existing document - existing_document.content = combined_document_string - existing_document.content_hash = content_hash - existing_document.embedding = doc_embedding - existing_document.document_metadata = { + # Queue existing document for update (will be set to processing in Phase 2) + messages_to_process.append( + { + "document": existing_document, + "is_new": False, + "combined_document_string": combined_document_string, + "content_hash": content_hash, "team_name": team_name, "team_id": team_id, "channel_name": channel_name, "channel_id": channel_id, + "message_id": message_id, "start_date": start_date_str, "end_date": end_date_str, - "message_count": len(messages), - "indexed_at": datetime.now().strftime( - "%Y-%m-%d %H:%M:%S" - ), } - - # Delete old chunks and add new ones - existing_document.chunks = chunks - existing_document.updated_at = ( - get_current_timestamp() - ) - - documents_indexed += 1 - logger.info( - "Successfully updated Teams message %s", - message_id, - ) - continue + ) + continue # Document doesn't exist by unique_identifier_hash # Check if a document with the same content_hash exists (from another connector) @@ -395,62 +377,50 @@ async def index_teams_messages( duplicate_by_content.id, duplicate_by_content.document_type, ) + duplicate_content_count += 1 documents_skipped += 1 continue - # Document doesn't exist - create new one - # Process chunks - chunks = await create_document_chunks( - combined_document_string - ) - doc_embedding = config.embedding_model_instance.embed( - combined_document_string - ) - - # Create and store new document + # Create new document with PENDING status (visible in UI immediately) document = Document( search_space_id=search_space_id, - title=f"Teams - {team_name} - {channel_name}", + title=f"{team_name} - {channel_name}", document_type=DocumentType.TEAMS_CONNECTOR, document_metadata={ "team_name": team_name, "team_id": team_id, "channel_name": channel_name, "channel_id": channel_id, - "start_date": start_date_str, - "end_date": end_date_str, - "message_count": len(messages), - "indexed_at": datetime.now().strftime( - "%Y-%m-%d %H:%M:%S" - ), + "connector_id": connector_id, }, - content=combined_document_string, - embedding=doc_embedding, - chunks=chunks, - content_hash=content_hash, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready unique_identifier_hash=unique_identifier_hash, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector_id, ) - session.add(document) - documents_indexed += 1 + new_documents_created = True - # Batch commit every 10 documents - if documents_indexed % 10 == 0: - logger.info( - "Committing batch: %s Teams messages processed so far", - documents_indexed, - ) - await session.commit() - - logger.info( - "Successfully indexed channel %s in team %s with %s messages", - channel_name, - team_name, - len(messages), - ) + messages_to_process.append( + { + "document": document, + "is_new": True, + "combined_document_string": combined_document_string, + "content_hash": content_hash, + "team_name": team_name, + "team_id": team_id, + "channel_name": channel_name, + "channel_id": channel_id, + "message_id": message_id, + "start_date": start_date_str, + "end_date": end_date_str, + } + ) except Exception as e: logger.error( @@ -462,54 +432,143 @@ async def index_teams_messages( skipped_channels.append( f"{team_name}/{channel_name} (processing error)" ) - documents_skipped += 1 continue except Exception as e: logger.error("Error processing team %s: %s", team_name, str(e)) continue - # Update the last_indexed_at timestamp for the connector only if requested - # and if we successfully indexed at least one document - total_processed = documents_indexed - if total_processed > 0: - await update_connector_last_indexed(session, connector, update_last_indexed) + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info( + f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents" + ) + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(messages_to_process)} documents") + + for item in messages_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item["document"] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (embeddings, chunks) + chunks = await create_document_chunks(item["combined_document_string"]) + doc_embedding = config.embedding_model_instance.embed( + item["combined_document_string"] + ) + + # Update document to READY with actual content + document.title = f"{item['team_name']} - {item['channel_name']}" + document.content = item["combined_document_string"] + document.content_hash = item["content_hash"] + document.embedding = doc_embedding + document.document_metadata = { + "team_name": item["team_name"], + "team_id": item["team_id"], + "channel_name": item["channel_name"], + "channel_id": item["channel_id"], + "start_date": item["start_date"], + "end_date": item["end_date"], + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, + } + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + + documents_indexed += 1 + + # Batch commit every 10 documents (for ready status updates) + if documents_indexed % 10 == 0: + logger.info( + "Committing batch: %s Teams messages processed so far", + documents_indexed, + ) + await session.commit() + + except Exception as e: + logger.error(f"Error processing Teams message: {e!s}", exc_info=True) + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error( + f"Failed to update document status to failed: {status_error}" + ) + documents_failed += 1 + continue + + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + await update_connector_last_indexed(session, connector, update_last_indexed) # Final commit for any remaining documents not yet committed in batches logger.info( "Final commit: Total %s Teams messages processed", documents_indexed ) - await session.commit() + try: + await session.commit() + logger.info("Successfully committed all Teams document changes to database") + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + else: + raise - # Prepare result message - result_message = None + # Build warning message if there were issues + warning_parts = [] + if duplicate_content_count > 0: + warning_parts.append(f"{duplicate_content_count} duplicate") + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") if skipped_channels: - result_message = f"Processed {total_processed} messages. Skipped {len(skipped_channels)} channels: {', '.join(skipped_channels)}" - else: - result_message = f"Processed {total_processed} messages." + warning_parts.append(f"{len(skipped_channels)} channels skipped") + warning_message = ", ".join(warning_parts) if warning_parts else None # Log success await task_logger.log_task_success( log_entry, f"Successfully completed Teams indexing for connector {connector_id}", { - "messages_processed": total_processed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, + "documents_failed": documents_failed, + "duplicate_content_count": duplicate_content_count, "skipped_channels_count": len(skipped_channels), - "result_message": result_message, }, ) logger.info( - "Teams indexing completed: %s new messages, %s skipped", + "Teams indexing completed: %s ready, %s skipped, %s failed " + "(%s duplicate content)", documents_indexed, documents_skipped, + documents_failed, + duplicate_content_count, ) - return ( - total_processed, - None, - ) # Return None on success (result_message is for logging only) + return documents_indexed, warning_message except SQLAlchemyError as db_error: await session.rollback() diff --git a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py index 82ef8870d..8b6005b54 100644 --- a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py @@ -1,5 +1,9 @@ """ Webcrawler connector indexer. + +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Create all documents with 'pending' status (visible in UI immediately) +- Phase 2: Process each document: pending → processing → ready/failed """ import time @@ -11,7 +15,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.config import config from app.connectors.webcrawler_connector import WebCrawlerConnector -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -28,6 +32,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -49,7 +54,11 @@ async def index_crawled_urls( on_heartbeat_callback: HeartbeatCallbackType | None = None, ) -> tuple[int, str | None]: """ - Index web page URLs. + Index web page URLs with real-time document status updates. + + Implements 2-phase approach for real-time UI feedback: + - Phase 1: Create all documents with 'pending' status (visible in UI immediately) + - Phase 2: Process each document: pending → processing → ready/failed Args: session: Database session @@ -150,9 +159,9 @@ async def index_crawled_urls( await task_logger.log_task_progress( log_entry, - f"Starting to crawl {len(urls)} URLs", + f"Starting to process {len(urls)} URLs", { - "stage": "crawling", + "stage": "processing", "total_urls": len(urls), }, ) @@ -160,28 +169,128 @@ async def index_crawled_urls( documents_indexed = 0 documents_updated = 0 documents_skipped = 0 - failed_urls = [] + documents_failed = 0 + duplicate_content_count = 0 # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() - for idx, url in enumerate(urls, 1): - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() + # ======================================================================= + # PHASE 1: Analyze all URLs, create pending documents for new ones + # This makes ALL new documents visible in the UI immediately with pending status + # ======================================================================= + urls_to_process = [] # List of dicts with document and URL data + new_documents_created = False + + for url in urls: try: - logger.info(f"Processing URL {idx}/{len(urls)}: {url}") + # Generate unique identifier hash for this URL + unique_identifier_hash = generate_unique_identifier_hash( + DocumentType.CRAWLED_URL, url, search_space_id + ) + + # Check if document with this unique identifier already exists + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + + if existing_document: + # Document exists - check if it's already being processed + if DocumentStatus.is_state( + existing_document.status, DocumentStatus.PENDING + ): + logger.info(f"URL {url} already pending. Skipping.") + documents_skipped += 1 + continue + if DocumentStatus.is_state( + existing_document.status, DocumentStatus.PROCESSING + ): + logger.info(f"URL {url} already processing. Skipping.") + documents_skipped += 1 + continue + + # Queue existing document for potential update check + urls_to_process.append( + { + "document": existing_document, + "is_new": False, + "url": url, + "unique_identifier_hash": unique_identifier_hash, + } + ) + continue + + # Create new document with PENDING status (visible in UI immediately) + document = Document( + search_space_id=search_space_id, + title=url[:100], # Placeholder - URL as title (truncated) + document_type=DocumentType.CRAWLED_URL, + document_metadata={ + "url": url, + "connector_id": connector_id, + }, + content="Pending crawl...", # Placeholder content + content_hash=unique_identifier_hash, # Temporary unique value + unique_identifier_hash=unique_identifier_hash, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # PENDING status - visible in UI + updated_at=get_current_timestamp(), + created_by_id=user_id, + connector_id=connector_id, + ) + session.add(document) + new_documents_created = True + + urls_to_process.append( + { + "document": document, + "is_new": True, + "url": url, + "unique_identifier_hash": unique_identifier_hash, + } + ) + + except Exception as e: + logger.error(f"Error in Phase 1 for URL {url}: {e!s}", exc_info=True) + documents_failed += 1 + continue + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info( + f"Phase 1: Committing {len([u for u in urls_to_process if u['is_new']])} pending documents" + ) + await session.commit() + + # ======================================================================= + # PHASE 2: Process each URL one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(urls_to_process)} URLs") + + for item in urls_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed + documents_updated) + last_heartbeat_time = current_time + + document = item["document"] + url = item["url"] + is_new = item["is_new"] + + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() await task_logger.log_task_progress( log_entry, - f"Crawling URL {idx}/{len(urls)}: {url}", + f"Crawling URL: {url}", { "stage": "crawling_url", - "url_index": idx, "url": url, }, ) @@ -191,7 +300,10 @@ async def index_crawled_urls( if error or not crawl_result: logger.warning(f"Failed to crawl URL {url}: {error}") - failed_urls.append((url, error or "Unknown error")) + document.status = DocumentStatus.failed(error or "Crawl failed") + document.updated_at = get_current_timestamp() + await session.commit() + documents_failed += 1 continue # Extract content and metadata @@ -201,23 +313,18 @@ async def index_crawled_urls( if not content.strip(): logger.warning(f"Skipping URL with no content: {url}") - failed_urls.append((url, "No content extracted")) - documents_skipped += 1 + document.status = DocumentStatus.failed("No content extracted") + document.updated_at = get_current_timestamp() + await session.commit() + documents_failed += 1 continue - # Format content as structured document for summary generation (includes all metadata) + # Format content as structured document for summary generation structured_document = crawler.format_to_structured_document( crawl_result ) - # Generate unique identifier hash for this URL - unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.CRAWLED_URL, url, search_space_id - ) - # Generate content hash using a version WITHOUT metadata - # This ensures the hash only changes when actual content changes, - # not when metadata (which contains dynamic fields like timestamps, IDs, etc.) changes structured_document_for_hash = crawler.format_to_structured_document( crawl_result, exclude_metadata=True ) @@ -225,114 +332,53 @@ async def index_crawled_urls( structured_document_for_hash, search_space_id ) - # Check if document with this unique identifier already exists - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - # Extract useful metadata title = metadata.get("title", url) description = metadata.get("description", "") language = metadata.get("language", "") - if existing_document: - # Document exists - check if content has changed - if existing_document.content_hash == content_hash: - logger.info(f"Document for URL {url} unchanged. Skipping.") - documents_skipped += 1 - continue - else: - # Content has changed - update the existing document - logger.info( - f"Content changed for URL {url}. Updating document." - ) + # Update title immediately for better UX + document.title = title + await session.commit() - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "url": url, - "title": title, - "description": description, - "language": language, - "document_type": "Crawled URL", - "crawler_type": crawler_type, - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - structured_document, user_llm, document_metadata - ) - else: - # Fallback to simple summary if no LLM configured - summary_content = f"Crawled URL: {title}\n\n" - summary_content += f"URL: {url}\n" - if description: - summary_content += f"Description: {description}\n" - if language: - summary_content += f"Language: {language}\n" - summary_content += f"Crawler: {crawler_type}\n\n" - - # Add content preview - content_preview = content[:1000] - if len(content) > 1000: - content_preview += "..." - summary_content += f"Content Preview:\n{content_preview}\n" - - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - chunks = await create_document_chunks(content) - - # Update existing document - existing_document.title = title - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - **metadata, - "crawler_type": crawler_type, - "last_crawled_at": datetime.now().strftime( - "%Y-%m-%d %H:%M:%S" - ), - } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_updated += 1 - logger.info(f"Successfully updated URL {url}") - continue - - # Document doesn't exist by unique_identifier_hash - # Check if a document with the same content_hash exists (from another connector) - with session.no_autoflush: - duplicate_by_content = await check_duplicate_document_by_hash( - session, content_hash - ) - - if duplicate_by_content: - logger.info( - f"URL {url} already indexed by another connector " - f"(existing document ID: {duplicate_by_content.id}, " - f"type: {duplicate_by_content.document_type}). Skipping." - ) + # For existing documents, check if content has changed + if not is_new and document.content_hash == content_hash: + logger.info(f"Document for URL {url} unchanged. Marking as ready.") + # Ensure status is ready (might have been stuck) + document.status = DocumentStatus.ready() + await session.commit() documents_skipped += 1 continue - # Document doesn't exist - create new one - # Generate summary with metadata + # For new documents, check if duplicate content exists elsewhere + if is_new: + with session.no_autoflush: + duplicate_by_content = await check_duplicate_document_by_hash( + session, content_hash + ) + + if duplicate_by_content: + logger.info( + f"URL {url} already indexed by another connector " + f"(existing document ID: {duplicate_by_content.id}). " + f"Marking as failed." + ) + document.status = DocumentStatus.failed( + "Duplicate content exists" + ) + document.updated_at = get_current_timestamp() + await session.commit() + duplicate_content_count += 1 + documents_skipped += 1 + continue + + # Generate summary with LLM user_llm = await get_user_long_context_llm( session, user_id, search_space_id ) if user_llm: - document_metadata = { + document_metadata_for_summary = { "url": url, "title": title, "description": description, @@ -344,7 +390,7 @@ async def index_crawled_urls( summary_content, summary_embedding, ) = await generate_document_summary( - structured_document, user_llm, document_metadata + structured_document, user_llm, document_metadata_for_summary ) else: # Fallback to simple summary if no LLM configured @@ -366,32 +412,32 @@ async def index_crawled_urls( summary_content ) + # Process chunks chunks = await create_document_chunks(content) - document = Document( - search_space_id=search_space_id, - title=title, - document_type=DocumentType.CRAWLED_URL, - document_metadata={ - **metadata, - "crawler_type": crawler_type, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - }, - content=summary_content, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, - updated_at=get_current_timestamp(), - created_by_id=user_id, - connector_id=connector_id, - ) + # Update document to READY with actual content + document.title = title + document.content = summary_content + document.content_hash = content_hash + document.embedding = summary_embedding + document.document_metadata = { + **metadata, + "crawler_type": crawler_type, + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, + } + safe_set_chunks(document, chunks) + document.status = DocumentStatus.ready() # READY status + document.updated_at = get_current_timestamp() - session.add(document) - documents_indexed += 1 - logger.info(f"Successfully indexed new URL {url}") + if is_new: + documents_indexed += 1 + else: + documents_updated += 1 - # Batch commit every 10 documents + logger.info(f"Successfully processed URL {url}") + + # Batch commit every 10 documents (for ready status updates) if (documents_indexed + documents_updated) % 10 == 0: logger.info( f"Committing batch: {documents_indexed + documents_updated} URLs processed so far" @@ -399,32 +445,51 @@ async def index_crawled_urls( await session.commit() except Exception as e: - logger.error( - f"Error processing URL {url}: {e!s}", - exc_info=True, - ) - failed_urls.append((url, str(e))) + logger.error(f"Error processing URL {url}: {e!s}", exc_info=True) + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)[:200]) + document.updated_at = get_current_timestamp() + await session.commit() + except Exception as status_error: + logger.error( + f"Failed to update document status to failed: {status_error}" + ) + documents_failed += 1 continue total_processed = documents_indexed + documents_updated - if total_processed > 0: - await update_connector_last_indexed(session, connector, update_last_indexed) + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + await update_connector_last_indexed(session, connector, update_last_indexed) # Final commit for any remaining documents not yet committed in batches logger.info( f"Final commit: Total {documents_indexed} new, {documents_updated} updated URLs processed" ) - await session.commit() - - # Log failed URLs if any (for debugging purposes) - if failed_urls: - failed_summary = "; ".join( - [f"{url}: {error}" for url, error in failed_urls[:5]] + try: + await session.commit() + logger.info( + "Successfully committed all webcrawler document changes to database" ) - if len(failed_urls) > 5: - failed_summary += f" (and {len(failed_urls) - 5} more)" - logger.warning(f"Some URLs failed to index: {failed_summary}") + except Exception as e: + # Handle any remaining integrity errors gracefully + if "duplicate key value violates unique constraint" in str(e).lower(): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + else: + raise + + # Build warning message if there were issues + warning_parts = [] + if duplicate_content_count > 0: + warning_parts.append(f"{duplicate_content_count} duplicate") + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None await task_logger.log_task_success( log_entry, @@ -434,19 +499,21 @@ async def index_crawled_urls( "documents_indexed": documents_indexed, "documents_updated": documents_updated, "documents_skipped": documents_skipped, - "failed_urls_count": len(failed_urls), + "documents_failed": documents_failed, + "duplicate_content_count": duplicate_content_count, }, ) logger.info( f"Web page indexing completed: {documents_indexed} new, " f"{documents_updated} updated, {documents_skipped} skipped, " - f"{len(failed_urls)} failed" + f"{documents_failed} failed" ) - return ( - total_processed, - None, - ) # Return None on success (result_message is for logging only) + + if warning_message: + return total_processed, f"Completed with issues: {warning_message}" + + return total_processed, None except SQLAlchemyError as db_error: await session.rollback() @@ -494,9 +561,7 @@ async def get_crawled_url_documents( ) if connector_id: - # Filter by connector if needed - you might need to add a connector_id field to Document - # or filter by some other means depending on your schema - pass + query = query.filter(Document.connector_id == connector_id) result = await session.execute(query) documents = result.scalars().all() diff --git a/surfsense_backend/app/tasks/document_processors/base.py b/surfsense_backend/app/tasks/document_processors/base.py index f29207448..2047ec63d 100644 --- a/surfsense_backend/app/tasks/document_processors/base.py +++ b/surfsense_backend/app/tasks/document_processors/base.py @@ -14,6 +14,35 @@ from app.db import Document md = MarkdownifyTransformer() +def safe_set_chunks(document: Document, chunks: list) -> None: + """ + Safely assign chunks to a document without triggering lazy loading. + + ALWAYS use this instead of `document.chunks = chunks` to avoid + SQLAlchemy async errors (MissingGreenlet / greenlet_spawn). + + Why this is needed: + - Direct assignment `document.chunks = chunks` triggers SQLAlchemy to + load the OLD chunks first (for comparison/orphan detection) + - This lazy loading fails in async context with asyncpg driver + - set_committed_value bypasses this by setting the value directly + + This function is safe regardless of how the document was loaded + (with or without selectinload). + + Args: + document: The Document object to update + chunks: List of Chunk objects to assign + + Example: + # Instead of: document.chunks = chunks (DANGEROUS!) + safe_set_chunks(document, chunks) # Always safe + """ + from sqlalchemy.orm.attributes import set_committed_value + + set_committed_value(document, "chunks", chunks) + + def get_current_timestamp() -> datetime: """ Get the current timestamp with timezone for updated_at field. diff --git a/surfsense_backend/app/tasks/document_processors/circleback_processor.py b/surfsense_backend/app/tasks/document_processors/circleback_processor.py index f412b51dd..a513bcaf0 100644 --- a/surfsense_backend/app/tasks/document_processors/circleback_processor.py +++ b/surfsense_backend/app/tasks/document_processors/circleback_processor.py @@ -3,6 +3,11 @@ Circleback meeting document processor. This module processes meeting data received from Circleback webhooks and stores it as searchable documents in the database. + +Implements real-time document status updates for UI feedback: +- Create document with 'pending' status (visible in UI immediately) +- Set to 'processing' while processing content +- Set to 'ready' or 'failed' when complete """ import logging @@ -14,6 +19,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.db import ( Document, + DocumentStatus, DocumentType, SearchSourceConnector, SearchSourceConnectorType, @@ -30,6 +36,7 @@ from app.utils.document_converters import ( from .base import ( check_document_by_unique_identifier, get_current_timestamp, + safe_set_chunks, ) logger = logging.getLogger(__name__) @@ -47,6 +54,11 @@ async def add_circleback_meeting_document( """ Process and store a Circleback meeting document. + Implements real-time document status updates: + - Phase 1: Create document with 'pending' status (visible in UI immediately) + - Phase 2: Set to 'processing' while processing content + - Phase 3: Set to 'ready' or 'failed' when complete + Args: session: Database session meeting_id: Circleback meeting ID @@ -59,6 +71,7 @@ async def add_circleback_meeting_document( Returns: Document object if successful, None if failed or duplicate """ + document = None try: # Generate unique identifier hash using Circleback meeting ID unique_identifier = f"circleback_{meeting_id}" @@ -77,6 +90,12 @@ async def add_circleback_meeting_document( if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): + existing_document.status = DocumentStatus.ready() + await session.commit() logger.info(f"Circleback meeting {meeting_id} unchanged. Skipping.") return existing_document else: @@ -84,6 +103,78 @@ async def add_circleback_meeting_document( logger.info( f"Content changed for Circleback meeting {meeting_id}. Updating document." ) + document = existing_document + # Set to PROCESSING status and commit - shows "processing" in UI + document.status = DocumentStatus.processing() + await session.commit() + else: + # ======================================================================= + # PHASE 1: Create document with PENDING status + # This makes the document visible in the UI immediately + # ======================================================================= + + # Fetch the user who set up the Circleback connector (preferred) + # or fall back to search space owner if no connector found + created_by_user_id = None + + # Try to find the Circleback connector for this search space + connector_result = await session.execute( + select(SearchSourceConnector.user_id).where( + SearchSourceConnector.search_space_id == search_space_id, + SearchSourceConnector.connector_type + == SearchSourceConnectorType.CIRCLEBACK_CONNECTOR, + ) + ) + connector_user = connector_result.scalar_one_or_none() + + if connector_user: + # Use the user who set up the Circleback connector + created_by_user_id = connector_user + else: + # Fallback: use search space owner if no connector found + search_space_result = await session.execute( + select(SearchSpace.user_id).where(SearchSpace.id == search_space_id) + ) + created_by_user_id = search_space_result.scalar_one_or_none() + + # Create new document with PENDING status (visible in UI immediately) + document = Document( + search_space_id=search_space_id, + title=meeting_name, + document_type=DocumentType.CIRCLEBACK, + document_metadata={ + "CIRCLEBACK_MEETING_ID": meeting_id, + "MEETING_NAME": meeting_name, + "SOURCE": "CIRCLEBACK_WEBHOOK", + "connector_id": connector_id, + }, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready + unique_identifier_hash=unique_identifier_hash, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts + content_needs_reindexing=False, + updated_at=get_current_timestamp(), + created_by_id=created_by_user_id, + connector_id=connector_id, + ) + session.add(document) + # Commit immediately so document appears in UI with pending status + await session.commit() + logger.info( + f"Created pending Circleback meeting document {meeting_id} in search space {search_space_id}" + ) + + # ======================================================================= + # PHASE 2: Set to PROCESSING status + # ======================================================================= + document.status = DocumentStatus.processing() + await session.commit() + + # ======================================================================= + # PHASE 3: Process the document content + # ======================================================================= # Get LLM for generating summary llm = await get_document_summary_llm(session, search_space_id) @@ -100,7 +191,7 @@ async def add_circleback_meeting_document( summary_embedding = None else: # Generate summary with metadata - document_metadata = { + summary_metadata = { "meeting_name": meeting_name, "meeting_id": meeting_id, "document_type": "Circleback Meeting", @@ -111,7 +202,7 @@ async def add_circleback_meeting_document( }, } summary_content, summary_embedding = await generate_document_summary( - markdown_content, llm, document_metadata + markdown_content, llm, summary_metadata ) # Process chunks @@ -126,7 +217,7 @@ async def add_circleback_meeting_document( f"Failed to convert Circleback meeting {meeting_id} to BlockNote JSON, document will not be editable" ) - # Prepare document metadata + # Prepare final document metadata document_metadata = { "CIRCLEBACK_MEETING_ID": meeting_id, "MEETING_NAME": meeting_name, @@ -134,77 +225,34 @@ async def add_circleback_meeting_document( **metadata, } - # Fetch the user who set up the Circleback connector (preferred) - # or fall back to search space owner if no connector found - created_by_user_id = None + # ======================================================================= + # PHASE 4: Update document to READY status with actual content + # ======================================================================= + document.title = meeting_name + document.content = summary_content + document.content_hash = content_hash + if summary_embedding is not None: + document.embedding = summary_embedding + document.document_metadata = document_metadata + safe_set_chunks(document, chunks) + document.blocknote_document = blocknote_json + document.content_needs_reindexing = False + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + # Ensure connector_id is set (backfill for documents created before this field) + if connector_id is not None: + document.connector_id = connector_id - # Try to find the Circleback connector for this search space - connector_result = await session.execute( - select(SearchSourceConnector.user_id).where( - SearchSourceConnector.search_space_id == search_space_id, - SearchSourceConnector.connector_type - == SearchSourceConnectorType.CIRCLEBACK_CONNECTOR, - ) - ) - connector_user = connector_result.scalar_one_or_none() + await session.commit() + await session.refresh(document) - if connector_user: - # Use the user who set up the Circleback connector - created_by_user_id = connector_user - else: - # Fallback: use search space owner if no connector found - search_space_result = await session.execute( - select(SearchSpace.user_id).where(SearchSpace.id == search_space_id) - ) - created_by_user_id = search_space_result.scalar_one_or_none() - - # Update or create document if existing_document: - # Update existing document - existing_document.title = meeting_name - existing_document.content = summary_content - existing_document.content_hash = content_hash - if summary_embedding is not None: - existing_document.embedding = summary_embedding - existing_document.document_metadata = document_metadata - existing_document.chunks = chunks - existing_document.blocknote_document = blocknote_json - existing_document.content_needs_reindexing = False - existing_document.updated_at = get_current_timestamp() - # Ensure connector_id is set (backfill for documents created before this field) - if connector_id is not None: - existing_document.connector_id = connector_id - - await session.commit() - await session.refresh(existing_document) - document = existing_document logger.info( f"Updated Circleback meeting document {meeting_id} in search space {search_space_id}" ) else: - # Create new document - document = Document( - search_space_id=search_space_id, - title=meeting_name, - document_type=DocumentType.CIRCLEBACK, - document_metadata=document_metadata, - content=summary_content, - embedding=summary_embedding, - chunks=chunks, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - blocknote_document=blocknote_json, - content_needs_reindexing=False, - updated_at=get_current_timestamp(), - created_by_id=created_by_user_id, - connector_id=connector_id, - ) - - session.add(document) - await session.commit() - await session.refresh(document) logger.info( - f"Created new Circleback meeting document {meeting_id} in search space {search_space_id}" + f"Processed Circleback meeting document {meeting_id} in search space {search_space_id} - now ready" ) return document @@ -214,8 +262,28 @@ async def add_circleback_meeting_document( logger.error( f"Database error processing Circleback meeting {meeting_id}: {db_error}" ) + # Mark document as failed if it was created + if document is not None: + try: + document.status = DocumentStatus.failed(str(db_error)) + document.updated_at = get_current_timestamp() + await session.commit() + except Exception as status_error: + logger.error( + f"Failed to update document status to failed: {status_error}" + ) raise db_error except Exception as e: await session.rollback() logger.error(f"Failed to process Circleback meeting {meeting_id}: {e!s}") + # Mark document as failed if it was created + if document is not None: + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + await session.commit() + except Exception as status_error: + logger.error( + f"Failed to update document status to failed: {status_error}" + ) raise RuntimeError(f"Failed to process Circleback meeting: {e!s}") from e diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py index 674773463..3fa57e998 100644 --- a/surfsense_backend/app/tasks/document_processors/file_processors.py +++ b/surfsense_backend/app/tasks/document_processors/file_processors.py @@ -17,7 +17,7 @@ from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession from app.config import config as app_config -from app.db import Document, DocumentType, Log, Notification +from app.db import Document, DocumentStatus, DocumentType, Log, Notification from app.services.llm_service import get_user_long_context_llm from app.services.notification_service import NotificationService from app.services.task_logging_service import TaskLoggingService @@ -33,6 +33,7 @@ from .base import ( check_document_by_unique_identifier, check_duplicate_document, get_current_timestamp, + safe_set_chunks, ) from .markdown_processor import add_received_markdown_file_document @@ -499,6 +500,7 @@ async def add_received_file_document_using_unstructured( existing_document.blocknote_document = blocknote_json existing_document.content_needs_reindexing = False existing_document.updated_at = get_current_timestamp() + existing_document.status = DocumentStatus.ready() # Mark as ready await session.commit() await session.refresh(existing_document) @@ -528,6 +530,7 @@ async def add_received_file_document_using_unstructured( updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector.get("connector_id") if connector else None, + status=DocumentStatus.ready(), # Mark as ready ) session.add(document) @@ -640,6 +643,7 @@ async def add_received_file_document_using_llamacloud( existing_document.blocknote_document = blocknote_json existing_document.content_needs_reindexing = False existing_document.updated_at = get_current_timestamp() + existing_document.status = DocumentStatus.ready() # Mark as ready await session.commit() await session.refresh(existing_document) @@ -669,6 +673,7 @@ async def add_received_file_document_using_llamacloud( updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector.get("connector_id") if connector else None, + status=DocumentStatus.ready(), # Mark as ready ) session.add(document) @@ -806,6 +811,7 @@ async def add_received_file_document_using_docling( existing_document.blocknote_document = blocknote_json existing_document.content_needs_reindexing = False existing_document.updated_at = get_current_timestamp() + existing_document.status = DocumentStatus.ready() # Mark as ready await session.commit() await session.refresh(existing_document) @@ -835,6 +841,7 @@ async def add_received_file_document_using_docling( updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector.get("connector_id") if connector else None, + status=DocumentStatus.ready(), # Mark as ready ) session.add(document) @@ -1606,3 +1613,372 @@ async def process_file_in_background( logging.error(f"Error processing file in background: {error_message}") raise # Re-raise so the wrapper can also handle it + + +async def process_file_in_background_with_document( + document: Document, + file_path: str, + filename: str, + search_space_id: int, + user_id: str, + session: AsyncSession, + task_logger: TaskLoggingService, + log_entry: Log, + connector: dict | None = None, + notification: Notification | None = None, +) -> Document | None: + """ + Process file and update existing pending document (2-phase pattern). + + This function is Phase 2 of the real-time document status updates: + - Phase 1 (API): Created document with pending status + - Phase 2 (this): Process file and update document to ready/failed + + The document already exists with pending status. This function: + 1. Parses the file content (markdown, audio, or ETL services) + 2. Updates the document with content, embeddings, and chunks + 3. Sets status to 'ready' on success + + Args: + document: Existing document with pending status + file_path: Path to the uploaded file + filename: Original filename + search_space_id: ID of the search space + user_id: ID of the user + session: Database session + task_logger: Task logging service + log_entry: Log entry for this task + connector: Optional connector info for Google Drive files + notification: Optional notification for progress updates + + Returns: + Updated Document object if successful, None if duplicate content detected + """ + import os + + from app.config import config as app_config + from app.services.llm_service import get_user_long_context_llm + from app.utils.blocknote_converter import convert_markdown_to_blocknote + + try: + markdown_content = None + etl_service = None + + # ===== STEP 1: Parse file content based on type ===== + + # Check if the file is a markdown or text file + if filename.lower().endswith((".md", ".markdown", ".txt")): + # Update notification: parsing stage + if notification: + await ( + NotificationService.document_processing.notify_processing_progress( + session, + notification, + stage="parsing", + stage_message="Reading file", + ) + ) + + await task_logger.log_task_progress( + log_entry, + f"Processing markdown/text file: {filename}", + {"file_type": "markdown", "processing_stage": "reading_file"}, + ) + + # Read markdown content directly + with open(file_path, encoding="utf-8") as f: + markdown_content = f.read() + etl_service = "MARKDOWN" + + # Clean up temp file + with contextlib.suppress(Exception): + os.unlink(file_path) + + # Check if the file is an audio file + elif filename.lower().endswith( + (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm") + ): + # Update notification: parsing stage (transcription) + if notification: + await ( + NotificationService.document_processing.notify_processing_progress( + session, + notification, + stage="parsing", + stage_message="Transcribing audio", + ) + ) + + await task_logger.log_task_progress( + log_entry, + f"Processing audio file for transcription: {filename}", + {"file_type": "audio", "processing_stage": "starting_transcription"}, + ) + + # Transcribe audio + stt_service_type = ( + "local" + if app_config.STT_SERVICE + and app_config.STT_SERVICE.startswith("local/") + else "external" + ) + + if stt_service_type == "local": + from app.services.stt_service import stt_service + + result = stt_service.transcribe_file(file_path) + transcribed_text = result.get("text", "") + if not transcribed_text: + raise ValueError("Transcription returned empty text") + markdown_content = ( + f"# Transcription of {filename}\n\n{transcribed_text}" + ) + else: + with open(file_path, "rb") as audio_file: + transcription_kwargs = { + "model": app_config.STT_SERVICE, + "file": audio_file, + "api_key": app_config.STT_SERVICE_API_KEY, + } + if app_config.STT_SERVICE_API_BASE: + transcription_kwargs["api_base"] = ( + app_config.STT_SERVICE_API_BASE + ) + transcription_response = await atranscription( + **transcription_kwargs + ) + transcribed_text = transcription_response.get("text", "") + if not transcribed_text: + raise ValueError("Transcription returned empty text") + markdown_content = ( + f"# Transcription of {filename}\n\n{transcribed_text}" + ) + + etl_service = "AUDIO_TRANSCRIPTION" + # Clean up temp file + with contextlib.suppress(Exception): + os.unlink(file_path) + + else: + # Document files - use ETL service + from app.services.page_limit_service import ( + PageLimitExceededError, + PageLimitService, + ) + + page_limit_service = PageLimitService(session) + + # Estimate page count + try: + estimated_pages = page_limit_service.estimate_pages_before_processing( + file_path + ) + except Exception: + file_size = os.path.getsize(file_path) + estimated_pages = max(1, file_size // (80 * 1024)) + + # Check page limit + await page_limit_service.check_page_limit(user_id, estimated_pages) + + if app_config.ETL_SERVICE == "UNSTRUCTURED": + if notification: + await NotificationService.document_processing.notify_processing_progress( + session, + notification, + stage="parsing", + stage_message="Extracting content", + ) + + from langchain_unstructured import UnstructuredLoader + + loader = UnstructuredLoader( + file_path, + mode="elements", + post_processors=[], + languages=["eng"], + include_orig_elements=False, + include_metadata=False, + strategy="auto", + ) + docs = await loader.aload() + markdown_content = await convert_document_to_markdown(docs) + actual_pages = page_limit_service.estimate_pages_from_elements(docs) + final_page_count = max(estimated_pages, actual_pages) + etl_service = "UNSTRUCTURED" + + # Update page usage + await page_limit_service.update_page_usage( + user_id, final_page_count, allow_exceed=True + ) + + elif app_config.ETL_SERVICE == "LLAMACLOUD": + if notification: + await NotificationService.document_processing.notify_processing_progress( + session, + notification, + stage="parsing", + stage_message="Extracting content", + ) + + result = await parse_with_llamacloud_retry( + file_path=file_path, + estimated_pages=estimated_pages, + task_logger=task_logger, + log_entry=log_entry, + ) + markdown_documents = await result.aget_markdown_documents( + split_by_page=False + ) + if not markdown_documents: + raise RuntimeError( + f"LlamaCloud parsing returned no documents: {filename}" + ) + markdown_content = markdown_documents[0].text + etl_service = "LLAMACLOUD" + + # Update page usage + await page_limit_service.update_page_usage( + user_id, estimated_pages, allow_exceed=True + ) + + elif app_config.ETL_SERVICE == "DOCLING": + if notification: + await NotificationService.document_processing.notify_processing_progress( + session, + notification, + stage="parsing", + stage_message="Extracting content", + ) + + # Suppress logging during Docling import + getLogger("docling.pipeline.base_pipeline").setLevel(ERROR) + getLogger("docling.document_converter").setLevel(ERROR) + getLogger( + "docling_core.transforms.chunker.hierarchical_chunker" + ).setLevel(ERROR) + + from docling.document_converter import DocumentConverter + + converter = DocumentConverter() + result = converter.convert(file_path) + markdown_content = result.document.export_to_markdown() + etl_service = "DOCLING" + + # Update page usage + await page_limit_service.update_page_usage( + user_id, estimated_pages, allow_exceed=True + ) + + else: + raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}") + + # Clean up temp file + with contextlib.suppress(Exception): + os.unlink(file_path) + + if not markdown_content: + raise RuntimeError(f"Failed to extract content from file: {filename}") + + # ===== STEP 2: Check for duplicate content ===== + content_hash = generate_content_hash(markdown_content, search_space_id) + + existing_by_content = await check_duplicate_document(session, content_hash) + if existing_by_content and existing_by_content.id != document.id: + # Duplicate content found - mark this document as failed + logging.info( + f"Duplicate content detected for {filename}, " + f"matches document {existing_by_content.id}" + ) + return None + + # ===== STEP 3: Generate embeddings and chunks ===== + if notification: + await NotificationService.document_processing.notify_processing_progress( + session, notification, stage="chunking" + ) + + user_llm = await get_user_long_context_llm(session, user_id, search_space_id) + + if user_llm: + document_metadata = { + "file_name": filename, + "etl_service": etl_service, + "document_type": "File Document", + } + summary_content, summary_embedding = await generate_document_summary( + markdown_content, user_llm, document_metadata + ) + else: + # Fallback: use truncated content as summary + summary_content = markdown_content[:4000] + from app.config import config + + summary_embedding = config.embedding_model_instance.embed(summary_content) + + chunks = await create_document_chunks(markdown_content) + + # Convert to BlockNote for editing + blocknote_json = await convert_markdown_to_blocknote(markdown_content) + + # ===== STEP 4: Update document to READY ===== + from sqlalchemy.orm.attributes import flag_modified + + document.title = filename + document.content = summary_content + document.content_hash = content_hash + document.embedding = summary_embedding + document.document_metadata = { + "FILE_NAME": filename, + "ETL_SERVICE": etl_service or "UNKNOWN", + **(document.document_metadata or {}), + } + flag_modified(document, "document_metadata") + + # Use safe_set_chunks to avoid async issues + safe_set_chunks(document, chunks) + + document.blocknote_document = blocknote_json + document.content_needs_reindexing = False + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() # Shows checkmark in UI + + await session.commit() + await session.refresh(document) + + await task_logger.log_task_success( + log_entry, + f"Successfully processed file: {filename}", + { + "document_id": document.id, + "content_hash": content_hash, + "file_type": etl_service, + "chunks_count": len(chunks), + }, + ) + + return document + + except Exception as e: + await session.rollback() + + from app.services.page_limit_service import PageLimitExceededError + + if isinstance(e, PageLimitExceededError): + error_message = str(e) + elif isinstance(e, HTTPException) and "page limit" in str(e.detail).lower(): + error_message = str(e.detail) + else: + error_message = f"Failed to process file: {filename}" + + await task_logger.log_task_failure( + log_entry, + error_message, + str(e), + { + "error_type": type(e).__name__, + "filename": filename, + "document_id": document.id, + }, + ) + logging.error(f"Error processing file with document: {error_message}") + raise diff --git a/surfsense_backend/app/tasks/document_processors/markdown_processor.py b/surfsense_backend/app/tasks/document_processors/markdown_processor.py index ff85d962e..8ecbb1370 100644 --- a/surfsense_backend/app/tasks/document_processors/markdown_processor.py +++ b/surfsense_backend/app/tasks/document_processors/markdown_processor.py @@ -7,7 +7,7 @@ import logging from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession -from app.db import Document, DocumentType +from app.db import Document, DocumentStatus, DocumentType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -270,6 +270,7 @@ async def add_received_markdown_file_document( existing_document.chunks = chunks existing_document.blocknote_document = blocknote_json existing_document.updated_at = get_current_timestamp() + existing_document.status = DocumentStatus.ready() # Mark as ready await session.commit() await session.refresh(existing_document) @@ -297,6 +298,7 @@ async def add_received_markdown_file_document( updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector.get("connector_id") if connector else None, + status=DocumentStatus.ready(), # Mark as ready ) session.add(document) diff --git a/surfsense_backend/app/tasks/document_processors/youtube_processor.py b/surfsense_backend/app/tasks/document_processors/youtube_processor.py index e5599e78b..9dac6d554 100644 --- a/surfsense_backend/app/tasks/document_processors/youtube_processor.py +++ b/surfsense_backend/app/tasks/document_processors/youtube_processor.py @@ -1,5 +1,9 @@ """ YouTube video document processor. + +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Create document with 'pending' status (visible in UI immediately) +- Phase 2: Process document: pending → processing → ready/failed """ import logging @@ -12,7 +16,7 @@ from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession from youtube_transcript_api import YouTubeTranscriptApi -from app.db import Document, DocumentType +from app.db import Document, DocumentStatus, DocumentType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -26,6 +30,7 @@ from app.utils.proxy_config import get_requests_proxies from .base import ( check_document_by_unique_identifier, get_current_timestamp, + safe_set_chunks, ) @@ -61,6 +66,10 @@ async def add_youtube_video_document( """ Process a YouTube video URL, extract transcripts, and store as a document. + Implements 2-phase document status updates for real-time UI feedback: + - Phase 1: Create document with 'pending' status (visible in UI immediately) + - Phase 2: Process document: pending → processing → ready/failed + Args: session: Database session for storing the document url: YouTube video URL (supports standard, shortened, and embed formats) @@ -85,15 +94,18 @@ async def add_youtube_video_document( metadata={"url": url, "user_id": str(user_id)}, ) + document = None + video_id = None + is_new_document = False + try: - # Extract video ID from URL + # Extract video ID from URL (lightweight operation) await task_logger.log_task_progress( log_entry, f"Extracting video ID from URL: {url}", {"stage": "video_id_extraction"}, ) - # Get video ID video_id = get_youtube_video_id(url) if not video_id: raise ValueError(f"Could not extract video ID from URL: {url}") @@ -104,13 +116,87 @@ async def add_youtube_video_document( {"stage": "video_id_extracted", "video_id": video_id}, ) - # Get video metadata + # Generate unique identifier hash for this YouTube video + unique_identifier_hash = generate_unique_identifier_hash( + DocumentType.YOUTUBE_VIDEO, video_id, search_space_id + ) + + # Check if document with this unique identifier already exists + await task_logger.log_task_progress( + log_entry, + f"Checking for existing video: {video_id}", + {"stage": "duplicate_check", "video_id": video_id}, + ) + + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + + # ======================================================================= + # PHASE 1: Create pending document or prepare existing for update + # ======================================================================= + if existing_document: + document = existing_document + is_new_document = False + # Check if already being processed + if DocumentStatus.is_state( + existing_document.status, DocumentStatus.PENDING + ): + logging.info( + f"YouTube video {video_id} already pending. Returning existing." + ) + return existing_document + if DocumentStatus.is_state( + existing_document.status, DocumentStatus.PROCESSING + ): + logging.info( + f"YouTube video {video_id} already processing. Returning existing." + ) + return existing_document + else: + # Create new document with PENDING status (visible in UI immediately) + await task_logger.log_task_progress( + log_entry, + f"Creating pending document for video: {video_id}", + {"stage": "pending_document_creation"}, + ) + + document = Document( + title=f"YouTube Video: {video_id}", # Placeholder title + document_type=DocumentType.YOUTUBE_VIDEO, + document_metadata={ + "url": url, + "video_id": video_id, + }, + content="Processing video...", # Placeholder content + content_hash=unique_identifier_hash, # Temporary unique value + unique_identifier_hash=unique_identifier_hash, + embedding=None, + chunks=[], # Empty at creation + status=DocumentStatus.pending(), # PENDING status - visible in UI + search_space_id=search_space_id, + updated_at=get_current_timestamp(), + created_by_id=user_id, + ) + session.add(document) + await session.commit() # Document visible in UI now with pending status! + is_new_document = True + + logging.info(f"Created pending document for YouTube video {video_id}") + + # ======================================================================= + # PHASE 2: Set to PROCESSING and do heavy work + # ======================================================================= + document.status = DocumentStatus.processing() + await session.commit() # UI shows "processing" status + await task_logger.log_task_progress( log_entry, f"Fetching video metadata for: {video_id}", {"stage": "metadata_fetch"}, ) + # Fetch video metadata params = { "format": "json", "url": f"https://www.youtube.com/watch?v={video_id}", @@ -130,6 +216,10 @@ async def add_youtube_video_document( ): video_data = await response.json() + # Update title immediately for better UX (user sees actual title sooner) + document.title = video_data.get("title", f"YouTube Video: {video_id}") + await session.commit() + await task_logger.log_task_progress( log_entry, f"Video metadata fetched: {video_data.get('title', 'Unknown')}", @@ -219,53 +309,28 @@ async def add_youtube_video_document( document_parts.append("") combined_document_string = "\n".join(document_parts) - # Generate unique identifier hash for this YouTube video - unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.YOUTUBE_VIDEO, video_id, search_space_id - ) - # Generate content hash content_hash = generate_content_hash(combined_document_string, search_space_id) - # Check if document with this unique identifier already exists - await task_logger.log_task_progress( - log_entry, - f"Checking for existing video: {video_id}", - {"stage": "duplicate_check", "video_id": video_id}, - ) + # For existing documents, check if content has changed + if not is_new_document and existing_document.content_hash == content_hash: + await task_logger.log_task_success( + log_entry, + f"YouTube video document unchanged: {video_data.get('title', 'YouTube Video')}", + { + "duplicate_detected": True, + "existing_document_id": existing_document.id, + "video_id": video_id, + }, + ) + logging.info( + f"Document for YouTube video {video_id} unchanged. Marking as ready." + ) + document.status = DocumentStatus.ready() + await session.commit() + return document - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - - if existing_document: - # Document exists - check if content has changed - if existing_document.content_hash == content_hash: - await task_logger.log_task_success( - log_entry, - f"YouTube video document unchanged: {video_data.get('title', 'YouTube Video')}", - { - "duplicate_detected": True, - "existing_document_id": existing_document.id, - "video_id": video_id, - }, - ) - logging.info( - f"Document for YouTube video {video_id} unchanged. Skipping." - ) - return existing_document - else: - # Content has changed - update the existing document - logging.info( - f"Content changed for YouTube video {video_id}. Updating document." - ) - await task_logger.log_task_progress( - log_entry, - f"Updating YouTube video document: {video_data.get('title', 'YouTube Video')}", - {"stage": "document_update", "video_id": video_id}, - ) - - # Get LLM for summary generation (needed for both create and update) + # Get LLM for summary generation await task_logger.log_task_progress( log_entry, f"Preparing for summary generation: {video_data.get('title', 'YouTube Video')}", @@ -287,7 +352,7 @@ async def add_youtube_video_document( ) # Generate summary with metadata - document_metadata = { + document_metadata_for_summary = { "url": url, "video_id": video_id, "title": video_data.get("title", "YouTube Video"), @@ -297,7 +362,7 @@ async def add_youtube_video_document( "has_transcript": "No captions available" not in transcript_text, } summary_content, summary_embedding = await generate_document_summary( - combined_document_string, user_llm, document_metadata + combined_document_string, user_llm, document_metadata_for_summary ) # Process chunks @@ -319,65 +384,33 @@ async def add_youtube_video_document( chunks = await create_document_chunks(combined_document_string) - # Update or create document - if existing_document: - # Update existing document - await task_logger.log_task_progress( - log_entry, - f"Updating YouTube video document in database: {video_data.get('title', 'YouTube Video')}", - {"stage": "document_update", "chunks_count": len(chunks)}, - ) + # ======================================================================= + # PHASE 3: Update document to READY with all content + # ======================================================================= + await task_logger.log_task_progress( + log_entry, + f"Finalizing document: {video_data.get('title', 'YouTube Video')}", + {"stage": "document_finalization", "chunks_count": len(chunks)}, + ) - existing_document.title = video_data.get("title", "YouTube Video") - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "url": url, - "video_id": video_id, - "video_title": video_data.get("title", "YouTube Video"), - "author": video_data.get("author_name", "Unknown"), - "thumbnail": video_data.get("thumbnail_url", ""), - } - existing_document.chunks = chunks - existing_document.blocknote_document = blocknote_json - existing_document.updated_at = get_current_timestamp() + document.title = video_data.get("title", "YouTube Video") + document.content = summary_content + document.content_hash = content_hash + document.embedding = summary_embedding + document.document_metadata = { + "url": url, + "video_id": video_id, + "video_title": video_data.get("title", "YouTube Video"), + "author": video_data.get("author_name", "Unknown"), + "thumbnail": video_data.get("thumbnail_url", ""), + } + safe_set_chunks(document, chunks) + document.blocknote_document = blocknote_json + document.status = DocumentStatus.ready() # READY status - fully processed + document.updated_at = get_current_timestamp() - await session.commit() - await session.refresh(existing_document) - document = existing_document - else: - # Create new document - await task_logger.log_task_progress( - log_entry, - f"Creating YouTube video document in database: {video_data.get('title', 'YouTube Video')}", - {"stage": "document_creation", "chunks_count": len(chunks)}, - ) - - document = Document( - title=video_data.get("title", "YouTube Video"), - document_type=DocumentType.YOUTUBE_VIDEO, - document_metadata={ - "url": url, - "video_id": video_id, - "video_title": video_data.get("title", "YouTube Video"), - "author": video_data.get("author_name", "Unknown"), - "thumbnail": video_data.get("thumbnail_url", ""), - }, - content=summary_content, - embedding=summary_embedding, - chunks=chunks, - search_space_id=search_space_id, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - blocknote_document=blocknote_json, - updated_at=get_current_timestamp(), - created_by_id=user_id, - ) - - session.add(document) - await session.commit() - await session.refresh(document) + await session.commit() + await session.refresh(document) # Log success await task_logger.log_task_success( @@ -395,27 +428,51 @@ async def add_youtube_video_document( ) return document + except SQLAlchemyError as db_error: - await session.rollback() + # Mark document as failed if it exists + if document: + try: + document.status = DocumentStatus.failed( + f"Database error: {str(db_error)[:150]}" + ) + document.updated_at = get_current_timestamp() + await session.commit() + except Exception: + await session.rollback() + else: + await session.rollback() + await task_logger.log_task_failure( log_entry, f"Database error while processing YouTube video: {url}", str(db_error), { "error_type": "SQLAlchemyError", - "video_id": video_id if "video_id" in locals() else None, + "video_id": video_id, }, ) raise db_error + except Exception as e: - await session.rollback() + # Mark document as failed if it exists + if document: + try: + document.status = DocumentStatus.failed(str(e)[:200]) + document.updated_at = get_current_timestamp() + await session.commit() + except Exception: + await session.rollback() + else: + await session.rollback() + await task_logger.log_task_failure( log_entry, f"Failed to process YouTube video: {url}", str(e), { "error_type": type(e).__name__, - "video_id": video_id if "video_id" in locals() else None, + "video_id": video_id, }, ) logging.error(f"Failed to process YouTube video: {e!s}") diff --git a/surfsense_web/app/dashboard/[search_space_id]/client-layout.tsx b/surfsense_web/app/dashboard/[search_space_id]/client-layout.tsx index b9ddb9b74..83a579970 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/client-layout.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/client-layout.tsx @@ -13,6 +13,7 @@ import { llmPreferencesAtom, } from "@/atoms/new-llm-config/new-llm-config-query.atoms"; import { activeSearchSpaceIdAtom } from "@/atoms/search-spaces/search-space-query.atoms"; +import { ConnectorIndicator } from "@/components/assistant-ui/connector-popup"; import { DocumentUploadDialogProvider } from "@/components/assistant-ui/document-upload-popup"; import { DashboardBreadcrumb } from "@/components/dashboard-breadcrumb"; import { LayoutDataProvider } from "@/components/layout"; @@ -192,6 +193,8 @@ export function DashboardClientLayout({ }> {children} + {/* Global connector dialog - triggered from documents page */} + ); } diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon.tsx index e483dea12..b214c96be 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon.tsx @@ -1,10 +1,12 @@ "use client"; import type React from "react"; +import { useRef, useState, useEffect } from "react"; import { getConnectorIcon } from "@/contracts/enums/connectorIcons"; +import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip"; -export function getDocumentTypeIcon(type: string): React.ReactNode { - return getConnectorIcon(type); +export function getDocumentTypeIcon(type: string, className?: string): React.ReactNode { + return getConnectorIcon(type, className); } export function getDocumentTypeLabel(type: string): string { @@ -15,16 +17,43 @@ export function getDocumentTypeLabel(type: string): string { } export function DocumentTypeChip({ type, className }: { type: string; className?: string }) { - const icon = getDocumentTypeIcon(type); - return ( - (null); + const [isTruncated, setIsTruncated] = useState(false); + + useEffect(() => { + const checkTruncation = () => { + if (textRef.current) { + setIsTruncated(textRef.current.scrollWidth > textRef.current.clientWidth); } + }; + checkTruncation(); + window.addEventListener("resize", checkTruncation); + return () => window.removeEventListener("resize", checkTruncation); + }, []); + + const chip = ( + - {icon} - {getDocumentTypeLabel(type)} + {icon} + + {fullLabel} + ); + + if (isTruncated) { + return ( + + {chip} + +

{fullLabel}

+
+
+ ); + } + + return chip; } diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx index 67413d6f0..ebdf431e4 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx @@ -1,9 +1,21 @@ "use client"; -import { CircleAlert, CircleX, Columns3, Filter, ListFilter, Trash } from "lucide-react"; -import { AnimatePresence, motion, type Variants } from "motion/react"; +import { useSetAtom } from "jotai"; +import { + CircleAlert, + CircleX, + FilePlus2, + FileType, + ListFilter, + Search, + SlidersHorizontal, + Trash, +} from "lucide-react"; +import { motion } from "motion/react"; import { useTranslations } from "next-intl"; -import React, { useMemo, useRef } from "react"; +import React, { useMemo, useRef, useState } from "react"; +import { connectorDialogOpenAtom } from "@/atoms/connector-dialog/connector-dialog.atoms"; +import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup"; import { AlertDialog, AlertDialogAction, @@ -17,24 +29,10 @@ import { } from "@/components/ui/alert-dialog"; import { Button } from "@/components/ui/button"; import { Checkbox } from "@/components/ui/checkbox"; -import { - DropdownMenu, - DropdownMenuCheckboxItem, - DropdownMenuContent, - DropdownMenuLabel, - DropdownMenuTrigger, -} from "@/components/ui/dropdown-menu"; import { Input } from "@/components/ui/input"; -import { Label } from "@/components/ui/label"; import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover"; import type { DocumentTypeEnum } from "@/contracts/types/document.types"; -import type { ColumnVisibility } from "./types"; - -const fadeInScale: Variants = { - hidden: { opacity: 0, scale: 0.95 }, - visible: { opacity: 1, scale: 1, transition: { type: "spring", stiffness: 300, damping: 30 } }, - exit: { opacity: 0, scale: 0.95, transition: { duration: 0.15 } }, -}; +import { getDocumentTypeIcon, getDocumentTypeLabel } from "./DocumentTypeIcon"; export function DocumentsFilters({ typeCounts: typeCountsRecord, @@ -44,8 +42,6 @@ export function DocumentsFilters({ onBulkDelete, onToggleType, activeTypes, - columnVisibility, - onToggleColumn, }: { typeCounts: Partial>; selectedIds: Set; @@ -54,17 +50,27 @@ export function DocumentsFilters({ onBulkDelete: () => Promise; onToggleType: (type: DocumentTypeEnum, checked: boolean) => void; activeTypes: DocumentTypeEnum[]; - columnVisibility: ColumnVisibility; - onToggleColumn: (id: keyof ColumnVisibility, checked: boolean) => void; }) { const t = useTranslations("documents"); const id = React.useId(); const inputRef = useRef(null); + // Dialog hooks for action buttons + const { openDialog: openUploadDialog } = useDocumentUploadDialog(); + const setConnectorDialogOpen = useSetAtom(connectorDialogOpenAtom); + + const [typeSearchQuery, setTypeSearchQuery] = useState(""); + const uniqueTypes = useMemo(() => { return Object.keys(typeCountsRecord).sort() as DocumentTypeEnum[]; }, [typeCountsRecord]); + const filteredTypes = useMemo(() => { + if (!typeSearchQuery.trim()) return uniqueTypes; + const query = typeSearchQuery.toLowerCase(); + return uniqueTypes.filter((type) => getDocumentTypeLabel(type).toLowerCase().includes(query)); + }, [uniqueTypes, typeSearchQuery]); + const typeCounts = useMemo(() => { const map = new Map(); for (const [type, count] of Object.entries(typeCountsRecord)) { @@ -75,202 +81,233 @@ export function DocumentsFilters({ return ( -
+ {/* Main toolbar row */} +
+ {/* Action Buttons - Left Side */} +
+ + +
+ + {/* Spacer */} +
+ + {/* Search Input */} +
+
onSearch(e.target.value)} - placeholder={t("filter_placeholder")} + placeholder="Filter by title" type="text" aria-label={t("filter_placeholder")} /> - - {Boolean(searchValue) && ( { onSearch(""); inputRef.current?.focus(); }} - initial={{ opacity: 0, rotate: -90 }} - animate={{ opacity: 1, rotate: 0 }} - exit={{ opacity: 0, rotate: 90 }} + initial={{ opacity: 0, scale: 0.8 }} + animate={{ opacity: 1, scale: 1 }} + exit={{ opacity: 0, scale: 0.8 }} whileHover={{ scale: 1.1 }} whileTap={{ scale: 0.9 }} > - )}
- - - - - - - - -
-
Filters
-
- - {uniqueTypes.map((value: DocumentTypeEnum, i) => ( - + +
+ {/* Search input */} +
+
+ + setTypeSearchQuery(e.target.value)} + className="h-6 pl-6 text-sm bg-transparent border-0 focus-visible:ring-0" + /> +
+
+ +
+ {filteredTypes.length === 0 ? ( +
+ No types found +
+ ) : ( + filteredTypes.map((value: DocumentTypeEnum, i) => ( +
onToggleType(value, !activeTypes.includes(value))} + onKeyDown={(e) => { + if (e.key === "Enter" || e.key === " ") { + e.preventDefault(); + onToggleType(value, !activeTypes.includes(value)); + } + }} > + {/* Icon */} +
+ {getDocumentTypeIcon(value, "h-4 w-4")} +
+ {/* Text content */} +
+ + {getDocumentTypeLabel(value)} + + + {typeCounts.get(value)} document + {(typeCounts.get(value) ?? 0) !== 1 ? "s" : ""} + +
+ {/* Checkbox */} onToggleType(value, !!checked)} + className="h-4 w-4 shrink-0 rounded border-muted-foreground/30 data-[state=checked]:bg-primary data-[state=checked]:border-primary" /> - - - ))} - +
+ )) + )}
+ {activeTypes.length > 0 && ( +
+ +
+ )}
-
- - + + - - - - - - - - Toggle columns - {( - [ - ["title", "Title"], - ["document_type", "Type"], - ["content", "Content"], - ["created_at", "Created At"], - ] as Array<[keyof ColumnVisibility, string]> - ).map(([key, label]) => ( - onToggleColumn(key, !!v)} - onSelect={(e) => e.preventDefault()} - > - {label} - - ))} - - -
- -
- {selectedIds.size > 0 && ( - - - - - -
- - - Cancel - Delete - - - - )} + + Cancel + + Delete + + + + + )} +
); diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx index d9908f46c..d5ee00dfb 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx @@ -1,14 +1,30 @@ "use client"; -import { ChevronDown, ChevronUp, FileX, Plus } from "lucide-react"; +import { formatDistanceToNow } from "date-fns"; +import { + AlertCircle, + Calendar, + CheckCircle2, + ChevronDown, + ChevronUp, + Clock, + FileText, + FileX, + Loader2, + Network, + Plus, + User, +} from "lucide-react"; import { motion } from "motion/react"; -import { useParams } from "next/navigation"; import { useTranslations } from "next-intl"; -import React from "react"; +import React, { useRef, useState, useEffect, useCallback } from "react"; import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup"; -import { DocumentViewer } from "@/components/document-viewer"; +import { JsonMetadataViewer } from "@/components/json-metadata-viewer"; +import { MarkdownViewer } from "@/components/markdown-viewer"; import { Button } from "@/components/ui/button"; import { Checkbox } from "@/components/ui/checkbox"; +import { Dialog, DialogContent, DialogHeader, DialogTitle } from "@/components/ui/dialog"; +import { Skeleton } from "@/components/ui/skeleton"; import { Spinner } from "@/components/ui/spinner"; import { Table, @@ -19,9 +35,64 @@ import { TableRow, } from "@/components/ui/table"; import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip"; -import { DocumentTypeChip, getDocumentTypeIcon } from "./DocumentTypeIcon"; +import { documentsApiService } from "@/lib/apis/documents-api.service"; +import { DocumentTypeChip } from "./DocumentTypeIcon"; import { RowActions } from "./RowActions"; -import type { ColumnVisibility, Document } from "./types"; +import type { ColumnVisibility, Document, DocumentStatus } from "./types"; + +// Status indicator component for document processing status +function StatusIndicator({ status }: { status?: DocumentStatus }) { + const state = status?.state ?? "ready"; + + switch (state) { + case "pending": + return ( + + +
+ +
+
+ Pending - waiting to be synced +
+ ); + case "processing": + return ( + + +
+ +
+
+ Syncing +
+ ); + case "failed": + return ( + + +
+ +
+
+ + {status?.reason || "Processing failed"} + +
+ ); + case "ready": + return ( + + +
+ +
+
+ Ready +
+ ); + } +} export type SortKey = keyof Pick; @@ -36,57 +107,215 @@ function sortDocuments(docs: Document[], key: SortKey, desc: boolean): Document[ return desc ? sorted.reverse() : sorted; } -function truncate(text: string, len = 150): string { - const plain = text - .replace(/[#*_`>\-[\]()]+/g, " ") - .replace(/\s+/g, " ") - .trim(); - if (plain.length <= len) return plain; - return `${plain.slice(0, len)}...`; +function formatRelativeDate(dateStr: string): string { + return formatDistanceToNow(new Date(dateStr), { addSuffix: true }); +} + +function formatAbsoluteDate(dateStr: string): string { + const date = new Date(dateStr); + return date.toLocaleString("en-US", { + year: "numeric", + month: "long", + day: "numeric", + hour: "2-digit", + minute: "2-digit", + hour12: false, + }); +} + +function TruncatedText({ text, className }: { text: string; className?: string }) { + const textRef = useRef(null); + const [isTruncated, setIsTruncated] = useState(false); + + useEffect(() => { + const checkTruncation = () => { + if (textRef.current) { + setIsTruncated(textRef.current.scrollWidth > textRef.current.clientWidth); + } + }; + checkTruncation(); + window.addEventListener("resize", checkTruncation); + return () => window.removeEventListener("resize", checkTruncation); + }, []); + + if (isTruncated) { + return ( + + + + {text} + + + +

{text}

+
+
+ ); + } + + return ( + + {text} + + ); +} + +function SortableHeader({ + children, + sortKey, + currentSortKey, + sortDesc, + onSort, + icon, +}: { + children: React.ReactNode; + sortKey: SortKey; + currentSortKey: SortKey; + sortDesc: boolean; + onSort: (key: SortKey) => void; + icon?: React.ReactNode; +}) { + const isActive = currentSortKey === sortKey; + return ( + + ); } export function DocumentsTableShell({ documents, loading, error, - onRefresh, selectedIds, setSelectedIds, columnVisibility, - deleteDocument, sortKey, sortDesc, onSortChange, + deleteDocument, + searchSpaceId, }: { documents: Document[]; loading: boolean; error: boolean; - onRefresh: () => Promise; selectedIds: Set; setSelectedIds: (update: Set) => void; columnVisibility: ColumnVisibility; - deleteDocument: (id: number) => Promise; sortKey: SortKey; sortDesc: boolean; onSortChange: (key: SortKey) => void; + deleteDocument: (id: number) => Promise; + searchSpaceId: string; }) { const t = useTranslations("documents"); - const params = useParams(); - const searchSpaceId = params.search_space_id; const { openDialog } = useDocumentUploadDialog(); + // State for metadata viewer (opened via Ctrl/Cmd+Click) + // Real-time documents don't sync metadata - we fetch on-demand when viewing + const [metadataDoc, setMetadataDoc] = useState(null); + const [metadataContent, setMetadataContent] = useState(null); + const [metadataLoading, setMetadataLoading] = useState(false); + + // State for lazy document content viewer + // Real-time documents don't sync content - we fetch on-demand when viewing + const [viewingDoc, setViewingDoc] = useState(null); + const [viewingContent, setViewingContent] = useState(""); + const [viewingLoading, setViewingLoading] = useState(false); + + // Fetch document metadata on-demand when metadata viewer is opened + const handleViewMetadata = useCallback(async (doc: Document) => { + setMetadataDoc(doc); + + // If metadata is already available (from API/search), use it directly + if (doc.document_metadata) { + setMetadataContent(doc.document_metadata); + return; + } + + // Otherwise, fetch from API (lazy loading for real-time synced documents) + setMetadataLoading(true); + try { + const fullDoc = await documentsApiService.getDocument({ id: doc.id }); + setMetadataContent(fullDoc.document_metadata); + } catch (err) { + console.error("[DocumentsTableShell] Failed to fetch document metadata:", err); + setMetadataContent(null); + } finally { + setMetadataLoading(false); + } + }, []); + + // Close metadata viewer + const handleCloseMetadata = useCallback(() => { + setMetadataDoc(null); + setMetadataContent(null); + setMetadataLoading(false); + }, []); + + // Fetch document content on-demand when viewer is opened + const handleViewDocument = useCallback(async (doc: Document) => { + setViewingDoc(doc); + + // If content is already available (from API/search), use it directly + if (doc.content) { + setViewingContent(doc.content); + return; + } + + // Otherwise, fetch from API (lazy loading for real-time synced documents) + setViewingLoading(true); + try { + const fullDoc = await documentsApiService.getDocument({ id: doc.id }); + setViewingContent(fullDoc.content); + } catch (err) { + console.error("[DocumentsTableShell] Failed to fetch document content:", err); + setViewingContent("Failed to load document content."); + } finally { + setViewingLoading(false); + } + }, []); + + // Close document viewer + const handleCloseViewer = useCallback(() => { + setViewingDoc(null); + setViewingContent(""); + setViewingLoading(false); + }, []); + const sorted = React.useMemo( () => sortDocuments(documents, sortKey, sortDesc), [documents, sortKey, sortDesc] ); - const allSelectedOnPage = sorted.length > 0 && sorted.every((d) => selectedIds.has(d.id)); - const someSelectedOnPage = sorted.some((d) => selectedIds.has(d.id)) && !allSelectedOnPage; + // Helper: check if document can be selected (not processing/pending) + const isSelectable = (doc: Document) => { + const state = doc.status?.state; + return state !== "pending" && state !== "processing"; + }; + + // Only consider selectable documents for "select all" logic + const selectableDocs = sorted.filter(isSelectable); + const allSelectedOnPage = + selectableDocs.length > 0 && selectableDocs.every((d) => selectedIds.has(d.id)); + const someSelectedOnPage = + selectableDocs.some((d) => selectedIds.has(d.id)) && !allSelectedOnPage; const toggleAll = (checked: boolean) => { const next = new Set(selectedIds); if (checked) - sorted.forEach((d) => { + // Only select documents that are not processing/pending + selectableDocs.forEach((d) => { next.add(d.id); }); else @@ -107,39 +336,139 @@ export function DocumentsTableShell({ return ( {loading ? ( -
-
- -

{t("loading")}

+ <> + {/* Desktop Skeleton View */} +
+ + + + +
+ +
+
+ + + + {columnVisibility.document_type && ( + + + + )} + {columnVisibility.created_by && ( + + + + )} + {columnVisibility.created_at && ( + + + + )} + {columnVisibility.status && ( + + + + )} + + Actions + +
+
+
+
+ + + {[65, 80, 45, 72, 55, 88, 40, 60, 50, 75].map((widthPercent, index) => ( + + +
+ +
+
+ + + + {columnVisibility.document_type && ( + + + + )} + {columnVisibility.created_by && ( + + + + )} + {columnVisibility.created_at && ( + + + + )} + {columnVisibility.status && ( + + + + )} + + + +
+ ))} +
+
+
-
+ {/* Mobile Skeleton View */} +
+ {[70, 85, 55, 78, 62, 90].map((widthPercent, index) => ( +
+
+ +
+ +
+ + {columnVisibility.created_by && } + {columnVisibility.created_at && } +
+
+
+ {columnVisibility.status && } + +
+
+
+ ))} +
+ ) : error ? ( -
-
+
+
+

{t("error_loading")}

-
) : sorted.length === 0 ? ( -
+
-
- +
+
-
+

{t("no_documents")}

Get started by uploading your first document. @@ -153,234 +482,301 @@ export function DocumentsTableShell({

) : ( <> -
+ {/* Desktop Table View - Notion Style */} +
+ {/* Fixed Header */} - - - - toggleAll(!!v)} - aria-label="Select all" - /> + + + +
+ toggleAll(!!v)} + aria-label="Select all" + className="border-foreground data-[state=checked]:bg-primary data-[state=checked]:border-primary" + /> +
+
+ + } + > + Document + - {columnVisibility.title && ( - - - - )} {columnVisibility.document_type && ( - - + Source + )} - {columnVisibility.content && ( - {t("content_summary")} + {columnVisibility.created_by && ( + + + + User + + )} {columnVisibility.created_at && ( - - + Created + )} - + {columnVisibility.status && ( + + Status + + )} + Actions
- - {sorted.map((doc, index) => { - const icon = getDocumentTypeIcon(doc.document_type); - const title = doc.title; - const truncatedTitle = title.length > 30 ? `${title.slice(0, 30)}...` : title; - return ( - - - toggleOne(doc.id, !!v)} - aria-label="Select row" - /> - - {columnVisibility.title && ( - - + {/* Scrollable Body */} +
+
+ + {sorted.map((doc, index) => { + const title = doc.title; + const isSelected = selectedIds.has(doc.id); + const canSelect = isSelectable(doc); + return ( + + +
+ canSelect && toggleOne(doc.id, !!v)} + disabled={!canSelect} + aria-label={ + canSelect ? "Select row" : "Cannot select while processing" + } + className={`border-foreground data-[state=checked]:bg-primary data-[state=checked]:border-primary ${!canSelect ? "opacity-40 cursor-not-allowed" : ""}`} + /> +
+
+ + + + {columnVisibility.document_type && ( + + + + )} + {columnVisibility.created_by && ( + + {doc.created_by_name || "—"} + + )} + {columnVisibility.created_at && ( + - - {icon} - {truncatedTitle} + + {formatRelativeDate(doc.created_at)} - -

{title}

+ + {formatAbsoluteDate(doc.created_at)}
- +
+ )} + {columnVisibility.status && ( + + + + )} + + - )} - {columnVisibility.document_type && ( - -
- -
-
- )} - {columnVisibility.content && ( - -
-
- {truncate(doc.content)} -
- - {t("view_full")} - - } - /> -
-
- )} - {columnVisibility.created_at && ( - - {new Date(doc.created_at).toLocaleDateString()} - - )} - - { - await onRefresh(); - }} - searchSpaceId={searchSpaceId as string} - /> - -
- ); - })} -
-
+ + ); + })} + + +
-
- {sorted.map((doc) => { - const icon = getDocumentTypeIcon(doc.document_type); + + {/* Mobile Card View - Notion Style */} +
+ {sorted.map((doc, index) => { + const isSelected = selectedIds.has(doc.id); + const canSelect = isSelectable(doc); return ( -
+
toggleOne(doc.id, !!v)} - aria-label="Select row" + checked={isSelected} + onCheckedChange={(v) => canSelect && toggleOne(doc.id, !!v)} + disabled={!canSelect} + aria-label={canSelect ? "Select row" : "Cannot select while processing"} + className={`border-foreground data-[state=checked]:bg-primary data-[state=checked]:border-primary ${!canSelect ? "opacity-40 cursor-not-allowed" : ""}`} /> -
-
- {icon} -
{doc.title}
-
-
+
+ +
- - {new Date(doc.created_at).toLocaleDateString()} - + {columnVisibility.created_by && doc.created_by_name && ( + {doc.created_by_name} + )} + {columnVisibility.created_at && ( + + + + {formatRelativeDate(doc.created_at)} + + + + {formatAbsoluteDate(doc.created_at)} + + + )}
- {columnVisibility.content && ( -
- {truncate(doc.content)} -
- - {t("view_full")} - - } - /> -
-
- )}
- { - await onRefresh(); - }} - searchSpaceId={searchSpaceId as string} - /> +
+ {columnVisibility.status && } + +
-
+ ); })}
)} + + {/* Metadata Viewer - opened via Ctrl/Cmd+Click on document title */} + {/* Lazy loads metadata from API for real-time synced documents */} + { + if (!open) handleCloseMetadata(); + }} + /> + + {/* Document Content Viewer - lazy loads content on-demand */} + !open && handleCloseViewer()}> + + + {viewingDoc?.title} + +
+ {viewingLoading ? ( +
+ +
+ ) : ( + + )} +
+
+
); } diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/PaginationControls.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/PaginationControls.tsx index d87fa2dc9..bd8a9f1cc 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/PaginationControls.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/PaginationControls.tsx @@ -2,164 +2,89 @@ import { ChevronFirst, ChevronLast, ChevronLeft, ChevronRight } from "lucide-react"; import { motion } from "motion/react"; -import { useTranslations } from "next-intl"; import { Button } from "@/components/ui/button"; -import { Label } from "@/components/ui/label"; -import { Pagination, PaginationContent, PaginationItem } from "@/components/ui/pagination"; -import { - Select, - SelectContent, - SelectItem, - SelectTrigger, - SelectValue, -} from "@/components/ui/select"; + +const PAGE_SIZE = 50; export function PaginationControls({ pageIndex, - pageSize, total, - onPageSizeChange, onFirst, onPrev, onNext, onLast, canPrev, canNext, - id, }: { pageIndex: number; - pageSize: number; total: number; - onPageSizeChange: (size: number) => void; onFirst: () => void; onPrev: () => void; onNext: () => void; onLast: () => void; canPrev: boolean; canNext: boolean; - id: string; }) { - const t = useTranslations("documents"); - const start = total === 0 ? 0 : pageIndex * pageSize + 1; - const end = Math.min((pageIndex + 1) * pageSize, total); + const start = pageIndex * PAGE_SIZE + 1; + const end = Math.min((pageIndex + 1) * PAGE_SIZE, total); return ( -
- - - - + + {/* Range indicator */} + + {start}-{end} of {total} + - -

- - {start}-{end} - {" "} - of {total} -

-
- -
- - - - - - - - - - - - - - - - - - - - - - - - + {/* Navigation buttons */} +
+ + + +
-
+
); } + +export { PAGE_SIZE }; diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx index d9a894e5a..ec355f576 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx @@ -1,11 +1,9 @@ "use client"; -import { FileText, MoreHorizontal, Pencil, Trash2 } from "lucide-react"; -import { motion } from "motion/react"; +import { MoreHorizontal, Pencil, Trash2 } from "lucide-react"; import { useRouter } from "next/navigation"; import { useState } from "react"; import { toast } from "sonner"; -import { JsonMetadataViewer } from "@/components/json-metadata-viewer"; import { AlertDialog, AlertDialogAction, @@ -22,7 +20,6 @@ import { DropdownMenuItem, DropdownMenuTrigger, } from "@/components/ui/dropdown-menu"; -import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip"; import type { Document } from "./types"; // Only FILE and NOTE document types can be edited @@ -34,16 +31,13 @@ const NON_DELETABLE_DOCUMENT_TYPES = ["SURFSENSE_DOCS"] as const; export function RowActions({ document, deleteDocument, - refreshDocuments, searchSpaceId, }: { document: Document; deleteDocument: (id: number) => Promise; - refreshDocuments: () => Promise; searchSpaceId: string; }) { const [isDeleteOpen, setIsDeleteOpen] = useState(false); - const [isMetadataOpen, setIsMetadataOpen] = useState(false); const [isDeleting, setIsDeleting] = useState(false); const router = useRouter(); @@ -51,20 +45,37 @@ export function RowActions({ document.document_type as (typeof EDITABLE_DOCUMENT_TYPES)[number] ); - const isDeletable = !NON_DELETABLE_DOCUMENT_TYPES.includes( + // Documents in "pending" or "processing" state should show disabled delete + const isBeingProcessed = + document.status?.state === "pending" || document.status?.state === "processing"; + + // SURFSENSE_DOCS are system-managed and should not show delete at all + const shouldShowDelete = !NON_DELETABLE_DOCUMENT_TYPES.includes( document.document_type as (typeof NON_DELETABLE_DOCUMENT_TYPES)[number] ); + // Edit and Delete are disabled while processing + const isEditDisabled = isBeingProcessed; + const isDeleteDisabled = isBeingProcessed; + const handleDelete = async () => { setIsDeleting(true); try { const ok = await deleteDocument(document.id); - if (ok) toast.success("Document deleted successfully"); - else toast.error("Failed to delete document"); - await refreshDocuments(); - } catch (error) { + if (!ok) toast.error("Failed to delete document"); + // Note: Success toast is handled by the mutation atom's onSuccess callback + // Cache is updated optimistically by the mutation, no need to refresh + } catch (error: unknown) { console.error("Error deleting document:", error); - toast.error("Failed to delete document"); + // Check for 409 Conflict (document started processing after UI loaded) + const status = + (error as { response?: { status?: number } })?.response?.status ?? + (error as { status?: number })?.status; + if (status === 409) { + toast.error("Document is now being processed. Please try again later."); + } else { + toast.error("Failed to delete document"); + } } finally { setIsDeleting(false); setIsDeleteOpen(false); @@ -76,124 +87,121 @@ export function RowActions({ }; return ( -
+ <> {/* Desktop Actions */} -
- {isEditable && ( - - - - - - - -

Edit Document

-
-
- )} - - - - +
+ {isEditable ? ( + // Editable documents: show 3-dot dropdown with edit + delete + + - - - -

View Metadata

-
- - - {isDeletable && ( - - - + + !isEditDisabled && handleEdit()} + disabled={isEditDisabled} + className={ + isEditDisabled ? "text-muted-foreground cursor-not-allowed opacity-50" : "" + } > - - - - -

Delete

-
-
+ + Delete + + )} + +
+ ) : ( + // Non-editable documents: show only delete button directly + shouldShowDelete && ( + + ) )}
{/* Mobile Actions Dropdown */} -
- - - - - - {isEditable && ( - +
+ {isEditable ? ( + // Editable documents: show 3-dot dropdown + + + + + + !isEditDisabled && handleEdit()} + disabled={isEditDisabled} + className={ + isEditDisabled ? "text-muted-foreground cursor-not-allowed opacity-50" : "" + } + > Edit - )} - setIsMetadataOpen(true)}> - - Metadata - - {isDeletable && ( - setIsDeleteOpen(true)} - className="text-destructive focus:text-destructive" - > - - Delete - - )} - - + {shouldShowDelete && ( + !isDeleteDisabled && setIsDeleteOpen(true)} + disabled={isDeleteDisabled} + className={ + isDeleteDisabled + ? "text-muted-foreground cursor-not-allowed opacity-50" + : "text-destructive focus:text-destructive" + } + > + + Delete + + )} + + + ) : ( + // Non-editable documents: show only delete button directly + shouldShowDelete && ( + + ) + )}
- - @@ -214,6 +222,6 @@ export function RowActions({ -
+ ); } diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts index 73b68b588..9dcf0ef00 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts @@ -1,18 +1,27 @@ export type DocumentType = string; +export type DocumentStatus = { + state: "ready" | "pending" | "processing" | "failed"; + reason?: string; +}; + export type Document = { id: number; title: string; document_type: DocumentType; - document_metadata: any; - content: string; + // Optional: Only needed when viewing document details (lazy loaded) + document_metadata?: any; + content?: string; created_at: string; search_space_id: number; + created_by_id?: string | null; + created_by_name?: string | null; + status?: DocumentStatus; }; export type ColumnVisibility = { - title: boolean; document_type: boolean; - content: boolean; + created_by: boolean; created_at: boolean; + status: boolean; }; diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx index 52eb3546c..8cf2fe8da 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx @@ -2,22 +2,19 @@ import { useQuery } from "@tanstack/react-query"; import { useAtomValue } from "jotai"; -import { RefreshCw, SquarePlus, Upload } from "lucide-react"; import { motion } from "motion/react"; -import { useParams, useRouter } from "next/navigation"; +import { useParams } from "next/navigation"; import { useTranslations } from "next-intl"; -import { useCallback, useEffect, useId, useMemo, useState } from "react"; +import { useCallback, useEffect, useMemo, useState } from "react"; import { toast } from "sonner"; import { deleteDocumentMutationAtom } from "@/atoms/documents/document-mutation.atoms"; -import { documentTypeCountsAtom } from "@/atoms/documents/document-query.atoms"; -import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup"; -import { Button } from "@/components/ui/button"; import type { DocumentTypeEnum } from "@/contracts/types/document.types"; +import { useDocuments } from "@/hooks/use-documents"; import { documentsApiService } from "@/lib/apis/documents-api.service"; import { cacheKeys } from "@/lib/query-client/cache-keys"; import { DocumentsFilters } from "./components/DocumentsFilters"; import { DocumentsTableShell, type SortKey } from "./components/DocumentsTableShell"; -import { PaginationControls } from "./components/PaginationControls"; +import { PAGE_SIZE, PaginationControls } from "./components/PaginationControls"; import type { ColumnVisibility } from "./components/types"; function useDebounced(value: T, delay = 250) { @@ -31,70 +28,48 @@ function useDebounced(value: T, delay = 250) { export default function DocumentsTable() { const t = useTranslations("documents"); - const id = useId(); const params = useParams(); - const router = useRouter(); const searchSpaceId = Number(params.search_space_id); - const { openDialog: openUploadDialog } = useDocumentUploadDialog(); - - const handleNewNote = useCallback(() => { - router.push(`/dashboard/${searchSpaceId}/editor/new`); - }, [router, searchSpaceId]); const [search, setSearch] = useState(""); const debouncedSearch = useDebounced(search, 250); const [activeTypes, setActiveTypes] = useState([]); const [columnVisibility, setColumnVisibility] = useState({ - title: true, document_type: true, - content: true, + created_by: true, created_at: true, + status: true, }); const [pageIndex, setPageIndex] = useState(0); - const [pageSize, setPageSize] = useState(50); - const [sortKey, setSortKey] = useState("title"); - const [sortDesc, setSortDesc] = useState(false); + const [sortKey, setSortKey] = useState("created_at"); + const [sortDesc, setSortDesc] = useState(true); const [selectedIds, setSelectedIds] = useState>(new Set()); - const { data: rawTypeCounts } = useAtomValue(documentTypeCountsAtom); const { mutateAsync: deleteDocumentMutation } = useAtomValue(deleteDocumentMutationAtom); - // Build query parameters for fetching documents - const queryParams = useMemo( - () => ({ - search_space_id: searchSpaceId, - page: pageIndex, - page_size: pageSize, - ...(activeTypes.length > 0 && { document_types: activeTypes }), - }), - [searchSpaceId, pageIndex, pageSize, activeTypes] - ); + // REAL-TIME: Use Electric SQL hook for live document updates (when not searching) + const { + documents: realtimeDocuments, + typeCounts: realtimeTypeCounts, + loading: realtimeLoading, + error: realtimeError, + } = useDocuments(searchSpaceId, activeTypes); - // Build search query parameters + // Check if we're in search mode + const isSearchMode = !!debouncedSearch.trim(); + + // Build search query parameters (only used when searching) const searchQueryParams = useMemo( () => ({ search_space_id: searchSpaceId, page: pageIndex, - page_size: pageSize, + page_size: PAGE_SIZE, title: debouncedSearch.trim(), ...(activeTypes.length > 0 && { document_types: activeTypes }), }), - [searchSpaceId, pageIndex, pageSize, activeTypes, debouncedSearch] + [searchSpaceId, pageIndex, activeTypes, debouncedSearch] ); - // Use query for fetching documents - const { - data: documentsResponse, - isLoading: isDocumentsLoading, - refetch: refetchDocuments, - error: documentsError, - } = useQuery({ - queryKey: cacheKeys.documents.globalQueryParams(queryParams), - queryFn: () => documentsApiService.getDocuments({ queryParams }), - staleTime: 3 * 60 * 1000, // 3 minutes - enabled: !!searchSpaceId && !debouncedSearch.trim(), - }); - - // Use query for searching documents + // API search query (only enabled when searching - Electric doesn't do full-text search) const { data: searchResponse, isLoading: isSearchLoading, @@ -103,134 +78,135 @@ export default function DocumentsTable() { } = useQuery({ queryKey: cacheKeys.documents.globalQueryParams(searchQueryParams), queryFn: () => documentsApiService.searchDocuments({ queryParams: searchQueryParams }), - staleTime: 3 * 60 * 1000, // 3 minutes - enabled: !!searchSpaceId && !!debouncedSearch.trim(), + staleTime: 30 * 1000, // 30 seconds for search (shorter since it's on-demand) + enabled: !!searchSpaceId && isSearchMode, }); - // Determine if we should show SurfSense docs (when no type filter or SURFSENSE_DOCS is selected) - const showSurfsenseDocs = - activeTypes.length === 0 || activeTypes.includes("SURFSENSE_DOCS" as DocumentTypeEnum); + // Client-side sorting for real-time documents + const sortedRealtimeDocuments = useMemo(() => { + const docs = [...realtimeDocuments]; + docs.sort((a, b) => { + const av = a[sortKey] ?? ""; + const bv = b[sortKey] ?? ""; + let cmp: number; + if (sortKey === "created_at") { + cmp = new Date(av as string).getTime() - new Date(bv as string).getTime(); + } else { + cmp = String(av).localeCompare(String(bv)); + } + return sortDesc ? -cmp : cmp; + }); + return docs; + }, [realtimeDocuments, sortKey, sortDesc]); - // Use query for fetching SurfSense docs - const { - data: surfsenseDocsResponse, - isLoading: isSurfsenseDocsLoading, - refetch: refetchSurfsenseDocs, - } = useQuery({ - queryKey: ["surfsense-docs", debouncedSearch, pageIndex, pageSize], - queryFn: () => - documentsApiService.getSurfsenseDocs({ - queryParams: { - page: pageIndex, - page_size: pageSize, - title: debouncedSearch.trim() || undefined, - }, - }), - staleTime: 3 * 60 * 1000, // 3 minutes - enabled: showSurfsenseDocs, - }); + // Client-side pagination for real-time documents + const paginatedRealtimeDocuments = useMemo(() => { + const start = pageIndex * PAGE_SIZE; + const end = start + PAGE_SIZE; + return sortedRealtimeDocuments.slice(start, end); + }, [sortedRealtimeDocuments, pageIndex]); - // Transform SurfSense docs to match the Document type - const surfsenseDocsAsDocuments: Document[] = useMemo(() => { - if (!surfsenseDocsResponse?.items) return []; - return surfsenseDocsResponse.items.map((doc) => ({ - id: doc.id, - title: doc.title, - document_type: "SURFSENSE_DOCS", - document_metadata: { source: doc.source }, - content: doc.content, - created_at: new Date().toISOString(), - search_space_id: -1, // Special value for global docs - })); - }, [surfsenseDocsResponse]); + // Determine what to display based on search mode + const displayDocs = isSearchMode + ? (searchResponse?.items || []).map((item) => ({ + id: item.id, + search_space_id: item.search_space_id, + document_type: item.document_type, + title: item.title, + created_by_id: item.created_by_id ?? null, + created_by_name: item.created_by_name ?? null, + created_at: item.created_at, + status: ( + item as { + status?: { state: "ready" | "pending" | "processing" | "failed"; reason?: string }; + } + ).status ?? { state: "ready" as const }, + })) + : paginatedRealtimeDocuments; - // Merge type counts with SURFSENSE_DOCS count - const typeCounts = useMemo(() => { - const counts = { ...(rawTypeCounts || {}) }; - if (surfsenseDocsResponse?.total) { - counts.SURFSENSE_DOCS = surfsenseDocsResponse.total; - } - return counts; - }, [rawTypeCounts, surfsenseDocsResponse?.total]); + const displayTotal = isSearchMode ? searchResponse?.total || 0 : sortedRealtimeDocuments.length; - // Extract documents and total based on search state - const documents = debouncedSearch.trim() - ? searchResponse?.items || [] - : documentsResponse?.items || []; - const total = debouncedSearch.trim() ? searchResponse?.total || 0 : documentsResponse?.total || 0; + const loading = isSearchMode ? isSearchLoading : realtimeLoading; + const error = isSearchMode ? searchError : realtimeError; - const loading = debouncedSearch.trim() ? isSearchLoading : isDocumentsLoading; - const error = debouncedSearch.trim() ? searchError : documentsError; - - // Display results directly - const displayDocs = documents; - const displayTotal = total; - const pageStart = pageIndex * pageSize; - const pageEnd = Math.min(pageStart + pageSize, displayTotal); + const pageEnd = Math.min((pageIndex + 1) * PAGE_SIZE, displayTotal); const onToggleType = (type: DocumentTypeEnum, checked: boolean) => { - setActiveTypes((prev) => (checked ? [...prev, type] : prev.filter((t) => t !== type))); + setActiveTypes((prev) => { + if (checked) { + return prev.includes(type) ? prev : [...prev, type]; + } else { + return prev.filter((t) => t !== type); + } + }); setPageIndex(0); }; - const onToggleColumn = (id: keyof ColumnVisibility, checked: boolean) => { - setColumnVisibility((prev) => ({ ...prev, [id]: checked })); - }; - - const [isRefreshing, setIsRefreshing] = useState(false); - - const refreshCurrentView = useCallback(async () => { - if (isRefreshing) return; - setIsRefreshing(true); - try { - if (debouncedSearch.trim()) { - await refetchSearch(); - } else { - await refetchDocuments(); - } - toast.success(t("refresh_success") || "Documents refreshed"); - } finally { - setIsRefreshing(false); - } - }, [debouncedSearch, refetchSearch, refetchDocuments, t, isRefreshing]); - - // Create a delete function for single document deletion - const deleteDocument = useCallback( - async (id: number) => { - try { - await deleteDocumentMutation({ id }); - return true; - } catch (error) { - console.error("Failed to delete document:", error); - return false; - } - }, - [deleteDocumentMutation] - ); - const onBulkDelete = async () => { if (selectedIds.size === 0) { toast.error(t("no_rows_selected")); return; } + + // Filter out pending/processing documents - they cannot be deleted + // For real-time mode, use sortedRealtimeDocuments (which has status) + // For search mode, use searchResponse items (need to safely access status) + const allDocs = isSearchMode + ? (searchResponse?.items || []).map((item) => ({ + id: item.id, + status: (item as { status?: { state: string } }).status, + })) + : sortedRealtimeDocuments.map((doc) => ({ id: doc.id, status: doc.status })); + + const selectedDocs = allDocs.filter((doc) => selectedIds.has(doc.id)); + const deletableIds = selectedDocs + .filter((doc) => doc.status?.state !== "pending" && doc.status?.state !== "processing") + .map((doc) => doc.id); + const inProgressCount = selectedIds.size - deletableIds.length; + + if (inProgressCount > 0) { + toast.warning( + `${inProgressCount} document(s) are pending or processing and cannot be deleted.` + ); + } + + if (deletableIds.length === 0) { + return; + } + try { // Delete documents one by one using the mutation + // Track 409 conflicts separately (document started processing after UI loaded) + let conflictCount = 0; const results = await Promise.all( - Array.from(selectedIds).map(async (id) => { + deletableIds.map(async (id) => { try { await deleteDocumentMutation({ id }); return true; - } catch { + } catch (error: unknown) { + const status = + (error as { response?: { status?: number } })?.response?.status ?? + (error as { status?: number })?.status; + if (status === 409) conflictCount++; return false; } }) ); const okCount = results.filter((r) => r === true).length; - if (okCount === selectedIds.size) + if (okCount === deletableIds.length) { toast.success(t("delete_success_count", { count: okCount })); - else toast.error(t("delete_partial_failed")); - // Refetch the current page with appropriate method - await refreshCurrentView(); + } else if (conflictCount > 0) { + toast.error(`${conflictCount} document(s) started processing. Please try again later.`); + } else { + toast.error(t("delete_partial_failed")); + } + + // If in search mode, refetch search results to reflect deletion + if (isSearchMode) { + await refetchSearch(); + } + // Real-time mode: Electric will sync the deletion automatically + setSelectedIds(new Set()); } catch (e) { console.error(e); @@ -238,10 +214,47 @@ export default function DocumentsTable() { } }; + // Single document delete handler for RowActions + const handleDeleteDocument = useCallback( + async (id: number): Promise => { + try { + await deleteDocumentMutation({ id }); + toast.success(t("delete_success") || "Document deleted"); + // If in search mode, refetch search results to reflect deletion + if (isSearchMode) { + await refetchSearch(); + } + // Real-time mode: Electric will sync the deletion automatically + return true; + } catch (e) { + console.error("Error deleting document:", e); + return false; + } + }, + [deleteDocumentMutation, isSearchMode, refetchSearch, t] + ); + + const handleSortChange = useCallback((key: SortKey) => { + setSortKey((currentKey) => { + if (currentKey === key) { + setSortDesc((v) => !v); + return currentKey; + } + setSortDesc(false); + return key; + }); + }, []); + + // Reset page when search changes (type filter already resets via onToggleType) + // biome-ignore lint/correctness/useExhaustiveDependencies: Intentionally reset page on search change + useEffect(() => { + setPageIndex(0); + }, [debouncedSearch]); + useEffect(() => { const mq = window.matchMedia("(max-width: 768px)"); const apply = (isSmall: boolean) => { - setColumnVisibility((prev) => ({ ...prev, content: !isSmall, created_at: !isSmall })); + setColumnVisibility((prev) => ({ ...prev, created_by: !isSmall, created_at: !isSmall })); }; apply(mq.matches); const onChange = (e: MediaQueryListEvent) => apply(e.matches); @@ -254,81 +267,44 @@ export default function DocumentsTable() { initial={{ opacity: 0, y: 20 }} animate={{ opacity: 1, y: 0 }} transition={{ duration: 0.3 }} - className="w-full px-6 py-4 space-y-6 min-h-[calc(100vh-64px)]" + className="w-full max-w-7xl mx-auto px-6 pt-17 pb-6 space-y-6 min-h-[calc(100vh-64px)]" > - -
-

{t("title")}

-

{t("subtitle")}

-
-
- - - -
-
- + {/* Filters - use real-time type counts */} + {/* Table */} { - if (sortKey === key) setSortDesc((v) => !v); - else { - setSortKey(key); - setSortDesc(false); - } - }} + onSortChange={handleSortChange} + deleteDocument={handleDeleteDocument} + searchSpaceId={String(searchSpaceId)} /> + {/* Pagination */} { - setPageSize(s); - setPageIndex(0); - }} onFirst={() => setPageIndex(0)} onPrev={() => setPageIndex((i) => Math.max(0, i - 1))} onNext={() => setPageIndex((i) => (pageEnd < displayTotal ? i + 1 : i))} - onLast={() => setPageIndex(Math.max(0, Math.ceil(displayTotal / pageSize) - 1))} + onLast={() => setPageIndex(Math.max(0, Math.ceil(displayTotal / PAGE_SIZE) - 1))} canPrev={pageIndex > 0} canNext={pageEnd < displayTotal} - id={id} />
); diff --git a/surfsense_web/atoms/connector-dialog/connector-dialog.atoms.ts b/surfsense_web/atoms/connector-dialog/connector-dialog.atoms.ts new file mode 100644 index 000000000..cbdf17244 --- /dev/null +++ b/surfsense_web/atoms/connector-dialog/connector-dialog.atoms.ts @@ -0,0 +1,4 @@ +import { atom } from "jotai"; + +// Atom to control the connector dialog open state from anywhere in the app +export const connectorDialogOpenAtom = atom(false); diff --git a/surfsense_web/atoms/connectors/connector-mutation.atoms.ts b/surfsense_web/atoms/connectors/connector-mutation.atoms.ts index 70b5b0322..b928f8631 100644 --- a/surfsense_web/atoms/connectors/connector-mutation.atoms.ts +++ b/surfsense_web/atoms/connectors/connector-mutation.atoms.ts @@ -1,5 +1,4 @@ import { atomWithMutation } from "jotai-tanstack-query"; -import { toast } from "sonner"; import type { CreateConnectorRequest, DeleteConnectorRequest, @@ -17,15 +16,16 @@ export const createConnectorMutationAtom = atomWithMutation((get) => { const searchSpaceId = get(activeSearchSpaceIdAtom); return { - mutationKey: cacheKeys.connectors.all(searchSpaceId!), + mutationKey: cacheKeys.connectors.all(searchSpaceId ?? ""), enabled: !!searchSpaceId, mutationFn: async (request: CreateConnectorRequest) => { return connectorsApiService.createConnector(request); }, onSuccess: () => { + if (!searchSpaceId) return; queryClient.invalidateQueries({ - queryKey: cacheKeys.connectors.all(searchSpaceId!), + queryKey: cacheKeys.connectors.all(searchSpaceId), }); }, }; @@ -35,15 +35,16 @@ export const updateConnectorMutationAtom = atomWithMutation((get) => { const searchSpaceId = get(activeSearchSpaceIdAtom); return { - mutationKey: cacheKeys.connectors.all(searchSpaceId!), + mutationKey: cacheKeys.connectors.all(searchSpaceId ?? ""), enabled: !!searchSpaceId, mutationFn: async (request: UpdateConnectorRequest) => { return connectorsApiService.updateConnector(request); }, onSuccess: (_, request: UpdateConnectorRequest) => { + if (!searchSpaceId) return; queryClient.invalidateQueries({ - queryKey: cacheKeys.connectors.all(searchSpaceId!), + queryKey: cacheKeys.connectors.all(searchSpaceId), }); queryClient.invalidateQueries({ queryKey: cacheKeys.connectors.byId(String(request.id)), @@ -56,15 +57,16 @@ export const deleteConnectorMutationAtom = atomWithMutation((get) => { const searchSpaceId = get(activeSearchSpaceIdAtom); return { - mutationKey: cacheKeys.connectors.all(searchSpaceId!), + mutationKey: cacheKeys.connectors.all(searchSpaceId ?? ""), enabled: !!searchSpaceId, mutationFn: async (request: DeleteConnectorRequest) => { return connectorsApiService.deleteConnector(request); }, onSuccess: (_, request: DeleteConnectorRequest) => { + if (!searchSpaceId) return; queryClient.setQueryData( - cacheKeys.connectors.all(searchSpaceId!), + cacheKeys.connectors.all(searchSpaceId), (oldData: GetConnectorsResponse | undefined) => { if (!oldData) return oldData; return oldData.filter((connector) => connector.id !== request.id); @@ -88,9 +90,9 @@ export const indexConnectorMutationAtom = atomWithMutation((get) => { }, onSuccess: (response: IndexConnectorResponse) => { - toast.success(response.message); + if (!searchSpaceId) return; queryClient.invalidateQueries({ - queryKey: cacheKeys.connectors.all(searchSpaceId!), + queryKey: cacheKeys.connectors.all(searchSpaceId), }); queryClient.invalidateQueries({ queryKey: cacheKeys.connectors.byId(String(response.connector_id)), diff --git a/surfsense_web/atoms/documents/document-mutation.atoms.ts b/surfsense_web/atoms/documents/document-mutation.atoms.ts index 09e127735..8089bacd4 100644 --- a/surfsense_web/atoms/documents/document-mutation.atoms.ts +++ b/surfsense_web/atoms/documents/document-mutation.atoms.ts @@ -48,7 +48,7 @@ export const uploadDocumentMutationAtom = atomWithMutation((get) => { }, onSuccess: () => { - toast.success("Files uploaded for processing"); + // Note: Toast notification is handled by the caller (DocumentUploadTab) to use i18n // Invalidate logs summary to show new processing tasks immediately on documents page queryClient.invalidateQueries({ queryKey: cacheKeys.logs.summary(searchSpaceId ?? undefined), @@ -95,7 +95,7 @@ export const deleteDocumentMutationAtom = atomWithMutation((get) => { }, onSuccess: (_, request: DeleteDocumentRequest) => { - toast.success("Document deleted successfully"); + // Note: Toast is handled by the caller (page.tsx onBulkDelete) to show count info queryClient.setQueryData( cacheKeys.documents.globalQueryParams(documentsQueryParams), (oldData: GetDocumentsResponse | undefined) => { diff --git a/surfsense_web/components/assistant-ui/connector-popup.tsx b/surfsense_web/components/assistant-ui/connector-popup.tsx index 9b201e96b..e597770ee 100644 --- a/surfsense_web/components/assistant-ui/connector-popup.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup.tsx @@ -19,7 +19,7 @@ import { Spinner } from "@/components/ui/spinner"; import { Tabs, TabsContent } from "@/components/ui/tabs"; import type { SearchSourceConnector } from "@/contracts/types/connector.types"; import { useConnectorsElectric } from "@/hooks/use-connectors-electric"; -import { useDocumentsElectric } from "@/hooks/use-documents-electric"; +import { useDocuments } from "@/hooks/use-documents"; import { useInbox } from "@/hooks/use-inbox"; import { cn } from "@/lib/utils"; import { ConnectorDialogHeader } from "./connector-popup/components/connector-dialog-header"; @@ -37,7 +37,7 @@ import { AllConnectorsTab } from "./connector-popup/tabs/all-connectors-tab"; import { ConnectorAccountsListView } from "./connector-popup/views/connector-accounts-list-view"; import { YouTubeCrawlerView } from "./connector-popup/views/youtube-crawler-view"; -export const ConnectorIndicator: FC = () => { +export const ConnectorIndicator: FC<{ hideTrigger?: boolean }> = ({ hideTrigger = false }) => { const searchSpaceId = useAtomValue(activeSearchSpaceIdAtom); const searchParams = useSearchParams(); const { data: currentUser } = useAtomValue(currentUserAtom); @@ -63,7 +63,9 @@ export const ConnectorIndicator: FC = () => { const llmConfigLoading = preferencesLoading || globalConfigsLoading; // Fetch document type counts using Electric SQL + PGlite for real-time updates - const { documentTypeCounts, loading: documentTypesLoading } = useDocumentsElectric(searchSpaceId); + const { typeCounts: documentTypeCounts, loading: documentTypesLoading } = useDocuments( + searchSpaceId ? Number(searchSpaceId) : null + ); // Fetch notifications to detect indexing failures const { inboxItems = [] } = useInbox( @@ -186,34 +188,38 @@ export const ConnectorIndicator: FC = () => { return ( - handleOpenChange(true)} - > - {isLoading ? ( - - ) : ( - <> - - {activeConnectorsCount > 0 && ( - - {activeConnectorsCount > 99 ? "99+" : activeConnectorsCount} - - )} - - )} - + {!hideTrigger && ( + handleOpenChange(true)} + > + {isLoading ? ( + + ) : ( + <> + + {activeConnectorsCount > 0 && ( + + {activeConnectorsCount > 99 ? "99+" : activeConnectorsCount} + + )} + + )} + + )} Manage Connectors diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts index 639d0f7ed..69c1b797d 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts @@ -1,8 +1,9 @@ import { format } from "date-fns"; -import { useAtomValue } from "jotai"; +import { useAtom, useAtomValue } from "jotai"; import { useRouter, useSearchParams } from "next/navigation"; import { useCallback, useEffect, useRef, useState } from "react"; import { toast } from "sonner"; +import { connectorDialogOpenAtom } from "@/atoms/connector-dialog/connector-dialog.atoms"; import { createConnectorMutationAtom, deleteConnectorMutationAtom, @@ -49,7 +50,8 @@ export const useConnectorDialog = () => { const { mutateAsync: deleteConnector } = useAtomValue(deleteConnectorMutationAtom); const { mutateAsync: createConnector } = useAtomValue(createConnectorMutationAtom); - const [isOpen, setIsOpen] = useState(false); + // Use global atom for dialog open state so it can be controlled from anywhere + const [isOpen, setIsOpen] = useAtom(connectorDialogOpenAtom); const [activeTab, setActiveTab] = useState("all"); const [connectingId, setConnectingId] = useState(null); const [isScrolled, setIsScrolled] = useState(false); @@ -293,6 +295,7 @@ export const useConnectorDialog = () => { connectingConnectorType, viewingAccountsType, viewingMCPList, + setIsOpen, ]); // Detect OAuth success / Failure and transition to config view @@ -345,9 +348,10 @@ export const useConnectorDialog = () => { // If we found the connector, find the matching OAuth/Composio connector by type if (newConnector) { + const connectorType = newConnector.connector_type; oauthConnector = - OAUTH_CONNECTORS.find((c) => c.connectorType === newConnector!.connector_type) || - COMPOSIO_CONNECTORS.find((c) => c.connectorType === newConnector!.connector_type); + OAUTH_CONNECTORS.find((c) => c.connectorType === connectorType) || + COMPOSIO_CONNECTORS.find((c) => c.connectorType === connectorType); } } @@ -358,8 +362,9 @@ export const useConnectorDialog = () => { COMPOSIO_CONNECTORS.find((c) => c.id === params.connector); if (oauthConnector) { + const oauthConnectorType = oauthConnector.connectorType; newConnector = result.data.find( - (c: SearchSourceConnector) => c.connector_type === oauthConnector!.connectorType + (c: SearchSourceConnector) => c.connector_type === oauthConnectorType ); } } @@ -399,7 +404,7 @@ export const useConnectorDialog = () => { // Invalid query params - log but don't crash console.warn("Invalid connector popup query params in OAuth success handler:", error); } - }, [searchParams, searchSpaceId, refetchAllConnectors]); + }, [searchParams, searchSpaceId, refetchAllConnectors, setIsOpen]); // Handle OAuth connection const handleConnectOAuth = useCallback( @@ -514,7 +519,7 @@ export const useConnectorDialog = () => { } finally { setConnectingId(null); } - }, [searchSpaceId, createConnector, refetchAllConnectors]); + }, [searchSpaceId, createConnector, refetchAllConnectors, setIsOpen]); // Handle connecting non-OAuth connectors (like Tavily API) const handleConnectNonOAuth = useCallback( @@ -677,12 +682,8 @@ export const useConnectorDialog = () => { const successMessage = currentConnectorType === "MCP_CONNECTOR" ? `${connector.name} added successfully` - : `${connectorTitle} connected and indexing started!`; - toast.success(successMessage, { - description: periodicEnabledForIndexing - ? `Periodic sync enabled every ${getFrequencyLabel(frequencyMinutesForIndexing)}.` - : "You can continue working while we sync your data.", - }); + : `${connectorTitle} connected and syncing started!`; + toast.success(successMessage); const url = new URL(window.location.href); url.searchParams.delete("modal"); @@ -782,7 +783,6 @@ export const useConnectorDialog = () => { updateConnector, indexConnector, router, - getFrequencyLabel, ] ); @@ -1010,11 +1010,7 @@ export const useConnectorDialog = () => { ); } - toast.success(`${indexingConfig.connectorTitle} indexing started`, { - description: periodicEnabled - ? `Periodic sync enabled every ${getFrequencyLabel(frequencyMinutes)}.` - : "You can continue working while we sync your data.", - }); + toast.success(`${indexingConfig.connectorTitle} indexing started`); // Update URL - the effect will handle closing the modal and clearing state const url = new URL(window.location.href); @@ -1045,7 +1041,6 @@ export const useConnectorDialog = () => { updateConnector, periodicEnabled, frequencyMinutes, - getFrequencyLabel, router, indexingConnectorConfig, ] @@ -1426,9 +1421,7 @@ export const useConnectorDialog = () => { end_date: endDateStr, }, }); - toast.success("Indexing started", { - description: "You can continue working while we sync your data.", - }); + toast.success("Indexing started"); // Invalidate queries to refresh data queryClient.invalidateQueries({ @@ -1445,7 +1438,7 @@ export const useConnectorDialog = () => { } } }, - [searchSpaceId, indexConnector, queryClient] + [searchSpaceId, indexConnector] ); // Handle going back from edit view @@ -1527,7 +1520,7 @@ export const useConnectorDialog = () => { } } }, - [activeTab, isStartingIndexing, isDisconnecting, isSaving, isCreatingConnector] + [activeTab, isStartingIndexing, isDisconnecting, isSaving, isCreatingConnector, setIsOpen] ); // Handle tab change diff --git a/surfsense_web/components/json-metadata-viewer.tsx b/surfsense_web/components/json-metadata-viewer.tsx index 982d16786..faab000ad 100644 --- a/surfsense_web/components/json-metadata-viewer.tsx +++ b/surfsense_web/components/json-metadata-viewer.tsx @@ -1,4 +1,4 @@ -import { FileJson } from "lucide-react"; +import { FileJson, Loader2 } from "lucide-react"; import React from "react"; import { defaultStyles, JsonView } from "react-json-view-lite"; import { Button } from "@/components/ui/button"; @@ -17,6 +17,7 @@ interface JsonMetadataViewerProps { trigger?: React.ReactNode; open?: boolean; onOpenChange?: (open: boolean) => void; + loading?: boolean; } export function JsonMetadataViewer({ @@ -25,6 +26,7 @@ export function JsonMetadataViewer({ trigger, open, onOpenChange, + loading, }: JsonMetadataViewerProps) { // Ensure metadata is a valid object const jsonData = React.useMemo(() => { @@ -54,7 +56,13 @@ export function JsonMetadataViewer({
- + {loading ? ( +
+ +
+ ) : ( + + )}
diff --git a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx index 2c2af7d46..66d2f419a 100644 --- a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx +++ b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx @@ -90,7 +90,7 @@ export function LayoutDataProvider({ }); // Fetch threads (40 total to allow up to 20 per section - shared/private) - const { data: threadsData } = useQuery({ + const { data: threadsData, isPending: isLoadingThreads } = useQuery({ queryKey: ["threads", searchSpaceId, { limit: 40 }], queryFn: () => fetchThreads(Number(searchSpaceId), 40), enabled: !!searchSpaceId, @@ -585,6 +585,7 @@ export function LayoutDataProvider({ theme={theme} setTheme={setTheme} isChatPage={isChatPage} + isLoadingChats={isLoadingThreads} inbox={{ isOpen: isInboxSidebarOpen, onOpenChange: setIsInboxSidebarOpen, diff --git a/surfsense_web/components/layout/ui/shell/LayoutShell.tsx b/surfsense_web/components/layout/ui/shell/LayoutShell.tsx index 8eae99b03..3a8255e7a 100644 --- a/surfsense_web/components/layout/ui/shell/LayoutShell.tsx +++ b/surfsense_web/components/layout/ui/shell/LayoutShell.tsx @@ -74,6 +74,7 @@ interface LayoutShellProps { className?: string; // Inbox props inbox?: InboxProps; + isLoadingChats?: boolean; } export function LayoutShell({ @@ -110,6 +111,7 @@ export function LayoutShell({ children, className, inbox, + isLoadingChats = false, }: LayoutShellProps) { const isMobile = useIsMobile(); const [mobileMenuOpen, setMobileMenuOpen] = useState(false); @@ -162,6 +164,7 @@ export function LayoutShell({ pageUsage={pageUsage} theme={theme} setTheme={setTheme} + isLoadingChats={isLoadingChats} />
@@ -232,6 +235,7 @@ export function LayoutShell({ theme={theme} setTheme={setTheme} className="hidden md:flex border-r shrink-0" + isLoadingChats={isLoadingChats} /> {/* Docked Inbox Sidebar - renders as flex sibling between sidebar and content */} diff --git a/surfsense_web/components/layout/ui/sidebar/MobileSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/MobileSidebar.tsx index 3ed2f9cca..71d85f600 100644 --- a/surfsense_web/components/layout/ui/sidebar/MobileSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/MobileSidebar.tsx @@ -37,6 +37,7 @@ interface MobileSidebarProps { pageUsage?: PageUsage; theme?: string; setTheme?: (theme: "light" | "dark" | "system") => void; + isLoadingChats?: boolean; } export function MobileSidebarTrigger({ onClick }: { onClick: () => void }) { @@ -78,6 +79,7 @@ export function MobileSidebar({ pageUsage, theme, setTheme, + isLoadingChats = false, }: MobileSidebarProps) { const handleSearchSpaceSelect = (id: number) => { onSearchSpaceSelect(id); @@ -158,6 +160,7 @@ export function MobileSidebar({ theme={theme} setTheme={setTheme} className="w-full border-none" + isLoadingChats={isLoadingChats} />
diff --git a/surfsense_web/components/layout/ui/sidebar/Sidebar.tsx b/surfsense_web/components/layout/ui/sidebar/Sidebar.tsx index 8763056ed..7b53fdc6a 100644 --- a/surfsense_web/components/layout/ui/sidebar/Sidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/Sidebar.tsx @@ -3,6 +3,7 @@ import { FolderOpen, PenSquare } from "lucide-react"; import { useTranslations } from "next-intl"; import { Button } from "@/components/ui/button"; +import { Skeleton } from "@/components/ui/skeleton"; import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip"; import { cn } from "@/lib/utils"; import type { ChatItem, NavItem, PageUsage, SearchSpace, User } from "../../types/layout.types"; @@ -14,6 +15,15 @@ import { SidebarHeader } from "./SidebarHeader"; import { SidebarSection } from "./SidebarSection"; import { SidebarUserProfile } from "./SidebarUserProfile"; +function ChatListItemSkeleton() { + return ( +
+ + +
+ ); +} + interface SidebarProps { searchSpace: SearchSpace | null; isCollapsed?: boolean; @@ -39,6 +49,7 @@ interface SidebarProps { theme?: string; setTheme?: (theme: "light" | "dark" | "system") => void; className?: string; + isLoadingChats?: boolean; } export function Sidebar({ @@ -66,6 +77,7 @@ export function Sidebar({ theme, setTheme, className, + isLoadingChats = false, }: SidebarProps) { const t = useTranslations("sidebar"); @@ -153,7 +165,15 @@ export function Sidebar({ ) : undefined } > - {sharedChats.length > 0 ? ( + {isLoadingChats ? ( +
+ + + + + +
+ ) : sharedChats.length > 0 ? (
4 ? "pb-8" : ""}`} @@ -206,7 +226,15 @@ export function Sidebar({ ) : undefined } > - {chats.length > 0 ? ( + {isLoadingChats ? ( +
+ + + + + +
+ ) : chats.length > 0 ? (
4 ? "pb-8" : ""}`} diff --git a/surfsense_web/contracts/enums/connectorIcons.tsx b/surfsense_web/contracts/enums/connectorIcons.tsx index aaf476215..18a872d94 100644 --- a/surfsense_web/contracts/enums/connectorIcons.tsx +++ b/surfsense_web/contracts/enums/connectorIcons.tsx @@ -92,7 +92,7 @@ export const getConnectorIcon = (connectorType: EnumConnectorName | string, clas case "FILE": return ; case "GOOGLE_DRIVE_FILE": - return ; + return Google Drive; case "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": return Google Drive; case "COMPOSIO_GMAIL_CONNECTOR": diff --git a/surfsense_web/contracts/types/document.types.ts b/surfsense_web/contracts/types/document.types.ts index 01a58173e..b7a2d2cf8 100644 --- a/surfsense_web/contracts/types/document.types.ts +++ b/surfsense_web/contracts/types/document.types.ts @@ -23,6 +23,7 @@ export const documentTypeEnum = z.enum([ "ELASTICSEARCH_CONNECTOR", "BOOKSTACK_CONNECTOR", "CIRCLEBACK", + "OBSIDIAN_CONNECTOR", "SURFSENSE_DOCS", "NOTE", "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", @@ -41,6 +42,8 @@ export const document = z.object({ created_at: z.string(), updated_at: z.string().nullable(), search_space_id: z.number(), + created_by_id: z.string().nullable().optional(), + created_by_name: z.string().nullable().optional(), }); export const extensionDocumentContent = z.object({ diff --git a/surfsense_web/hooks/use-documents-electric.ts b/surfsense_web/hooks/use-documents-electric.ts deleted file mode 100644 index 43809499e..000000000 --- a/surfsense_web/hooks/use-documents-electric.ts +++ /dev/null @@ -1,185 +0,0 @@ -"use client"; - -import { useEffect, useMemo, useRef, useState } from "react"; -import type { SyncHandle } from "@/lib/electric/client"; -import { useElectricClient } from "@/lib/electric/context"; - -interface Document { - id: number; - search_space_id: number; - document_type: string; - created_at: string; -} - -/** - * Hook for managing documents with Electric SQL real-time sync - * - * Uses the Electric client from context (provided by ElectricProvider) - * instead of initializing its own - prevents race conditions and memory leaks - */ -export function useDocumentsElectric(searchSpaceId: number | string | null) { - // Get Electric client from context - ElectricProvider handles initialization - const electricClient = useElectricClient(); - - const [documents, setDocuments] = useState([]); - const [loading, setLoading] = useState(true); - const [error, setError] = useState(null); - const syncHandleRef = useRef(null); - const liveQueryRef = useRef<{ unsubscribe: () => void } | null>(null); - const syncKeyRef = useRef(null); - - // Calculate document type counts from synced documents - const documentTypeCounts = useMemo(() => { - if (!documents.length) return {}; - - const counts: Record = {}; - for (const doc of documents) { - counts[doc.document_type] = (counts[doc.document_type] || 0) + 1; - } - return counts; - }, [documents]); - - // Start syncing when Electric client is available - useEffect(() => { - // Wait for both searchSpaceId and Electric client to be available - if (!searchSpaceId || !electricClient) { - setLoading(!electricClient); // Still loading if waiting for Electric - if (!searchSpaceId) { - setDocuments([]); - } - return; - } - - // Create a unique key for this sync to prevent duplicate subscriptions - const syncKey = `documents_${searchSpaceId}`; - if (syncKeyRef.current === syncKey) { - // Already syncing for this search space - return; - } - - let mounted = true; - syncKeyRef.current = syncKey; - - async function startSync() { - try { - console.log("[useDocumentsElectric] Starting sync for search space:", searchSpaceId); - - const handle = await electricClient.syncShape({ - table: "documents", - where: `search_space_id = ${searchSpaceId}`, - columns: ["id", "document_type", "search_space_id", "created_at"], - primaryKey: ["id"], - }); - - console.log("[useDocumentsElectric] Sync started:", { - isUpToDate: handle.isUpToDate, - }); - - // Wait for initial sync with timeout - if (!handle.isUpToDate && handle.initialSyncPromise) { - try { - await Promise.race([ - handle.initialSyncPromise, - new Promise((resolve) => setTimeout(resolve, 2000)), - ]); - } catch (syncErr) { - console.error("[useDocumentsElectric] Initial sync failed:", syncErr); - } - } - - if (!mounted) { - handle.unsubscribe(); - return; - } - - syncHandleRef.current = handle; - setLoading(false); - setError(null); - - // Fetch initial documents - await fetchDocuments(); - - // Set up live query for real-time updates - await setupLiveQuery(); - } catch (err) { - if (!mounted) return; - console.error("[useDocumentsElectric] Failed to start sync:", err); - setError(err instanceof Error ? err : new Error("Failed to sync documents")); - setLoading(false); - } - } - - async function fetchDocuments() { - try { - const result = await electricClient.db.query( - `SELECT id, document_type, search_space_id, created_at FROM documents WHERE search_space_id = $1 ORDER BY created_at DESC`, - [searchSpaceId] - ); - if (mounted) { - setDocuments(result.rows || []); - } - } catch (err) { - console.error("[useDocumentsElectric] Failed to fetch:", err); - } - } - - async function setupLiveQuery() { - try { - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const db = electricClient.db as any; - - if (db.live?.query && typeof db.live.query === "function") { - const liveQuery = await db.live.query( - `SELECT id, document_type, search_space_id, created_at FROM documents WHERE search_space_id = $1 ORDER BY created_at DESC`, - [searchSpaceId] - ); - - if (!mounted) { - liveQuery.unsubscribe?.(); - return; - } - - // Set initial results - if (liveQuery.initialResults?.rows) { - setDocuments(liveQuery.initialResults.rows); - } else if (liveQuery.rows) { - setDocuments(liveQuery.rows); - } - - // Subscribe to changes - if (typeof liveQuery.subscribe === "function") { - liveQuery.subscribe((result: { rows: Document[] }) => { - if (mounted && result.rows) { - setDocuments(result.rows); - } - }); - } - - if (typeof liveQuery.unsubscribe === "function") { - liveQueryRef.current = liveQuery; - } - } - } catch (liveErr) { - console.error("[useDocumentsElectric] Failed to set up live query:", liveErr); - } - } - - startSync(); - - return () => { - mounted = false; - syncKeyRef.current = null; - - if (syncHandleRef.current) { - syncHandleRef.current.unsubscribe(); - syncHandleRef.current = null; - } - if (liveQueryRef.current) { - liveQueryRef.current.unsubscribe(); - liveQueryRef.current = null; - } - }; - }, [searchSpaceId, electricClient]); - - return { documentTypeCounts, loading, error }; -} diff --git a/surfsense_web/hooks/use-documents.ts b/surfsense_web/hooks/use-documents.ts new file mode 100644 index 000000000..369cc7b41 --- /dev/null +++ b/surfsense_web/hooks/use-documents.ts @@ -0,0 +1,449 @@ +"use client"; + +import { useCallback, useEffect, useMemo, useRef, useState } from "react"; +import type { DocumentTypeEnum } from "@/contracts/types/document.types"; +import { documentsApiService } from "@/lib/apis/documents-api.service"; +import type { SyncHandle } from "@/lib/electric/client"; +import { useElectricClient } from "@/lib/electric/context"; + +// Stable empty array to prevent infinite re-renders when no typeFilter is provided +const EMPTY_TYPE_FILTER: DocumentTypeEnum[] = []; + +// Document status type (matches backend DocumentStatus JSONB) +export interface DocumentStatusType { + state: "ready" | "pending" | "processing" | "failed"; + reason?: string; +} + +// Document from Electric sync (lightweight table columns - NO content/metadata) +interface DocumentElectric { + id: number; + search_space_id: number; + document_type: string; + title: string; + created_by_id: string | null; + created_at: string; + status: DocumentStatusType | null; +} + +// Document for display (with resolved user name) +export interface DocumentDisplay { + id: number; + search_space_id: number; + document_type: string; + title: string; + created_by_id: string | null; + created_by_name: string | null; + created_at: string; + status: DocumentStatusType; +} + +/** + * Deduplicate by ID and sort by created_at descending (newest first) + */ +function deduplicateAndSort(items: T[]): T[] { + const seen = new Map(); + for (const item of items) { + // Keep the most recent version if duplicate + const existing = seen.get(item.id); + if (!existing || new Date(item.created_at) > new Date(existing.created_at)) { + seen.set(item.id, item); + } + } + return Array.from(seen.values()).sort( + (a, b) => new Date(b.created_at).getTime() - new Date(a.created_at).getTime() + ); +} + +/** + * Check if a document has valid/complete data + */ +function isValidDocument(doc: DocumentElectric): boolean { + return doc.id != null && doc.title != null && doc.title !== ""; +} + +/** + * Real-time documents hook with Electric SQL + * + * Architecture (100% Reliable): + * 1. API is the PRIMARY source of truth - always loads first + * 2. Electric provides REAL-TIME updates for additions and deletions + * 3. Use syncHandle.isUpToDate to determine if deletions can be trusted + * 4. Handles bulk deletions correctly by checking sync state + * + * @param searchSpaceId - The search space ID to filter documents + * @param typeFilter - Optional document types to filter by + */ +export function useDocuments( + searchSpaceId: number | null, + typeFilter: DocumentTypeEnum[] = EMPTY_TYPE_FILTER +) { + const electricClient = useElectricClient(); + + const [documents, setDocuments] = useState([]); + const [loading, setLoading] = useState(true); + const [error, setError] = useState(null); + + // Track if initial API load is complete (source of truth) + const apiLoadedRef = useRef(false); + + // User cache: userId → displayName + const userCacheRef = useRef>(new Map()); + + // Electric sync refs + const syncHandleRef = useRef(null); + const liveQueryRef = useRef<{ unsubscribe?: () => void } | null>(null); + + // Real-time type counts + const typeCounts = useMemo(() => { + const counts: Record = {}; + for (const doc of documents) { + counts[doc.document_type] = (counts[doc.document_type] || 0) + 1; + } + return counts; + }, [documents]); + + // Populate user cache from API response + const populateUserCache = useCallback( + (items: Array<{ created_by_id?: string | null; created_by_name?: string | null }>) => { + for (const item of items) { + if (item.created_by_id && item.created_by_name) { + userCacheRef.current.set(item.created_by_id, item.created_by_name); + } + } + }, + [] + ); + + // Convert API item to display doc + const apiToDisplayDoc = useCallback( + (item: { + id: number; + search_space_id: number; + document_type: string; + title: string; + created_by_id?: string | null; + created_by_name?: string | null; + created_at: string; + status?: DocumentStatusType | null; + }): DocumentDisplay => ({ + id: item.id, + search_space_id: item.search_space_id, + document_type: item.document_type, + title: item.title, + created_by_id: item.created_by_id ?? null, + created_by_name: item.created_by_name ?? null, + created_at: item.created_at, + status: item.status ?? { state: "ready" }, + }), + [] + ); + + // Convert Electric doc to display doc + const electricToDisplayDoc = useCallback( + (doc: DocumentElectric): DocumentDisplay => ({ + ...doc, + created_by_name: doc.created_by_id + ? (userCacheRef.current.get(doc.created_by_id) ?? null) + : null, + status: doc.status ?? { state: "ready" }, + }), + [] + ); + + // EFFECT 1: Load from API (PRIMARY source of truth) + useEffect(() => { + if (!searchSpaceId) { + setLoading(false); + return; + } + + // Capture validated value for async closure + const spaceId = searchSpaceId; + const currentTypeFilter = typeFilter; + + let mounted = true; + apiLoadedRef.current = false; + + async function loadFromApi() { + try { + setLoading(true); + console.log("[useDocuments] Loading from API (source of truth):", spaceId); + + const response = await documentsApiService.getDocuments({ + queryParams: { + search_space_id: spaceId, + page: 0, + page_size: -1, // Fetch all documents + ...(currentTypeFilter.length > 0 && { document_types: currentTypeFilter }), + }, + }); + + if (!mounted) return; + + populateUserCache(response.items); + const docs = response.items.map(apiToDisplayDoc); + setDocuments(docs); + apiLoadedRef.current = true; + setError(null); + console.log("[useDocuments] API loaded", docs.length, "documents"); + } catch (err) { + if (!mounted) return; + console.error("[useDocuments] API load failed:", err); + setError(err instanceof Error ? err : new Error("Failed to load documents")); + } finally { + if (mounted) setLoading(false); + } + } + + loadFromApi(); + + return () => { + mounted = false; + }; + }, [searchSpaceId, typeFilter, populateUserCache, apiToDisplayDoc]); + + // EFFECT 2: Start Electric sync + live query for real-time updates + useEffect(() => { + if (!searchSpaceId || !electricClient) return; + + // Capture validated values for async closure + const spaceId = searchSpaceId; + const client = electricClient; + const currentTypeFilter = typeFilter; + + let mounted = true; + + async function setupElectricRealtime() { + // Cleanup previous subscriptions + if (syncHandleRef.current) { + syncHandleRef.current.unsubscribe(); + syncHandleRef.current = null; + } + if (liveQueryRef.current) { + liveQueryRef.current.unsubscribe?.(); + liveQueryRef.current = null; + } + + try { + console.log("[useDocuments] Starting Electric sync for real-time updates"); + + // Start Electric sync + const handle = await client.syncShape({ + table: "documents", + where: `search_space_id = ${spaceId}`, + columns: [ + "id", + "document_type", + "search_space_id", + "title", + "created_by_id", + "created_at", + "status", + ], + primaryKey: ["id"], + }); + + if (!mounted) { + handle.unsubscribe(); + return; + } + + syncHandleRef.current = handle; + console.log("[useDocuments] Sync started, isUpToDate:", handle.isUpToDate); + + // Wait for initial sync (with timeout) + if (!handle.isUpToDate && handle.initialSyncPromise) { + await Promise.race([ + handle.initialSyncPromise, + new Promise((resolve) => setTimeout(resolve, 5000)), + ]); + console.log("[useDocuments] Initial sync complete, isUpToDate:", handle.isUpToDate); + } + + if (!mounted) return; + + // Set up live query + const db = client.db as { + live?: { + query: ( + sql: string, + params?: (number | string)[] + ) => Promise<{ + subscribe: (cb: (result: { rows: T[] }) => void) => void; + unsubscribe?: () => void; + }>; + }; + }; + + if (!db.live?.query) { + console.warn("[useDocuments] Live queries not available"); + return; + } + + let query = `SELECT id, document_type, search_space_id, title, created_by_id, created_at, status + FROM documents + WHERE search_space_id = $1`; + + const params: (number | string)[] = [spaceId]; + + if (currentTypeFilter.length > 0) { + const placeholders = currentTypeFilter.map((_, i) => `$${i + 2}`).join(", "); + query += ` AND document_type IN (${placeholders})`; + params.push(...currentTypeFilter); + } + + query += ` ORDER BY created_at DESC`; + + const liveQuery = await db.live.query(query, params); + + if (!mounted) { + liveQuery.unsubscribe?.(); + return; + } + + console.log("[useDocuments] Live query subscribed"); + + liveQuery.subscribe((result: { rows: DocumentElectric[] }) => { + if (!mounted || !result.rows) return; + + // DEBUG: Log first few raw documents to see what's coming from Electric + console.log("[useDocuments] Raw data sample:", result.rows.slice(0, 3)); + + const validItems = result.rows.filter(isValidDocument); + const isFullySynced = syncHandleRef.current?.isUpToDate ?? false; + + console.log( + `[useDocuments] Live update: ${result.rows.length} raw, ${validItems.length} valid, synced: ${isFullySynced}` + ); + + // Fetch user names for new users (non-blocking) + const unknownUserIds = validItems + .filter( + (doc): doc is DocumentElectric & { created_by_id: string } => + doc.created_by_id !== null && !userCacheRef.current.has(doc.created_by_id) + ) + .map((doc) => doc.created_by_id); + + if (unknownUserIds.length > 0) { + documentsApiService + .getDocuments({ + queryParams: { search_space_id: spaceId, page: 0, page_size: 20 }, + }) + .then((response) => { + populateUserCache(response.items); + if (mounted) { + setDocuments((prev) => + prev.map((doc) => ({ + ...doc, + created_by_name: doc.created_by_id + ? (userCacheRef.current.get(doc.created_by_id) ?? null) + : null, + })) + ); + } + }) + .catch(() => {}); + } + + // Smart update logic based on sync state + setDocuments((prev) => { + // Don't process if API hasn't loaded yet + if (!apiLoadedRef.current) { + console.log("[useDocuments] Waiting for API load, skipping live update"); + return prev; + } + + // Case 1: Live query is empty + if (validItems.length === 0) { + if (isFullySynced && prev.length > 0) { + // Electric is fully synced and says 0 items - trust it (all deleted) + console.log("[useDocuments] All documents deleted (Electric synced)"); + return []; + } + // Partial sync or error - keep existing + console.log("[useDocuments] Empty live result, keeping existing"); + return prev; + } + + // Case 2: Electric is fully synced - TRUST IT COMPLETELY (handles bulk deletes) + if (isFullySynced) { + const liveDocs = deduplicateAndSort(validItems.map(electricToDisplayDoc)); + console.log( + `[useDocuments] Synced update: ${liveDocs.length} docs (was ${prev.length})` + ); + return liveDocs; + } + + // Case 3: Partial sync - only ADD new items, don't remove any + const existingIds = new Set(prev.map((d) => d.id)); + const liveIds = new Set(validItems.map((d) => d.id)); + + // Find new items (in live but not in prev) + const newItems = validItems + .filter((item) => !existingIds.has(item.id)) + .map(electricToDisplayDoc); + + // Find updated items (in both, update with latest data) + const updatedPrev = prev.map((doc) => { + if (liveIds.has(doc.id)) { + const liveItem = validItems.find((v) => v.id === doc.id); + if (liveItem) { + return electricToDisplayDoc(liveItem); + } + } + return doc; + }); + + if (newItems.length > 0) { + console.log(`[useDocuments] Adding ${newItems.length} new items (partial sync)`); + return deduplicateAndSort([...newItems, ...updatedPrev]); + } + + return updatedPrev; + }); + }); + + liveQueryRef.current = liveQuery; + } catch (err) { + console.error("[useDocuments] Electric setup failed:", err); + // Don't set error - API data is already loaded + } + } + + setupElectricRealtime(); + + return () => { + mounted = false; + if (syncHandleRef.current) { + syncHandleRef.current.unsubscribe(); + syncHandleRef.current = null; + } + if (liveQueryRef.current) { + liveQueryRef.current.unsubscribe?.(); + liveQueryRef.current = null; + } + }; + }, [searchSpaceId, electricClient, typeFilter, electricToDisplayDoc, populateUserCache]); + + // Track previous searchSpaceId to detect actual changes + const prevSearchSpaceIdRef = useRef(null); + + // Reset on search space change (not on initial mount) + useEffect(() => { + if (prevSearchSpaceIdRef.current !== null && prevSearchSpaceIdRef.current !== searchSpaceId) { + setDocuments([]); + apiLoadedRef.current = false; + userCacheRef.current.clear(); + } + prevSearchSpaceIdRef.current = searchSpaceId; + }, [searchSpaceId]); + + return { + documents, + typeCounts, + total: documents.length, + loading, + error, + }; +} diff --git a/surfsense_web/hooks/use-inbox.ts b/surfsense_web/hooks/use-inbox.ts index 362feb747..56ddb46a4 100644 --- a/surfsense_web/hooks/use-inbox.ts +++ b/surfsense_web/hooks/use-inbox.ts @@ -38,10 +38,14 @@ function deduplicateAndSort(items: InboxItem[]): InboxItem[] { /** * Calculate the cutoff date for sync window + * IMPORTANT: Rounds to the start of the day (midnight UTC) to ensure stable values + * across re-renders. Without this, millisecond differences cause multiple syncs! */ function getSyncCutoffDate(): string { const cutoff = new Date(); cutoff.setDate(cutoff.getDate() - SYNC_WINDOW_DAYS); + // Round to start of day to prevent millisecond differences causing duplicate syncs + cutoff.setUTCHours(0, 0, 0, 0); return cutoff.toISOString(); } diff --git a/surfsense_web/lib/electric/client.ts b/surfsense_web/lib/electric/client.ts index 177a66d28..9d596a261 100644 --- a/surfsense_web/lib/electric/client.ts +++ b/surfsense_web/lib/electric/client.ts @@ -12,10 +12,21 @@ * 3. Works even if logout cleanup fails */ -import { PGlite } from "@electric-sql/pglite"; +import { PGlite, type Transaction } from "@electric-sql/pglite"; import { live } from "@electric-sql/pglite/live"; import { electricSync } from "@electric-sql/pglite-sync"; +// Debug logging - only logs in development, silent in production +const IS_DEV = process.env.NODE_ENV === "development"; + +function debugLog(...args: unknown[]) { + if (IS_DEV) console.log(...args); +} + +function debugWarn(...args: unknown[]) { + if (IS_DEV) console.warn(...args); +} + // Types export interface ElectricClient { db: PGlite; @@ -56,7 +67,14 @@ const pendingSyncs = new Map>(); // v2: user-specific database architecture // v3: consistent cutoff date for sync+queries, visibility refresh support // v4: heartbeat-based stale notification detection with updated_at tracking -const SYNC_VERSION = 4; +// v5: fixed duplicate key errors (root cause: unstable cutoff dates in use-inbox.ts) +// - added onMustRefetch handler for server-side refetch scenarios +// - fixed getSyncCutoffDate to use stable midnight UTC timestamps +// v6: real-time documents table - added title and created_by_id columns for live document display +// v7: removed use-documents-electric.ts - consolidated to single documents sync to prevent conflicts +// v8: added status column for real-time document processing status (ready/processing/failed) +// v9: added pending state for accurate document queue visibility +const SYNC_VERSION = 11; // Database name prefix for identifying SurfSense databases const DB_PREFIX = "surfsense-"; @@ -77,7 +95,7 @@ function getDbName(userId: string): string { } /** - * Clean up databases from OTHER users (not the current user) + * Clean up databases from OTHER users AND old versions * This is called on login to ensure clean state */ async function cleanupOtherUserDatabases(currentUserId: string): Promise { @@ -85,6 +103,10 @@ async function cleanupOtherUserDatabases(currentUserId: string): Promise { return; } + // The exact database identifier we want to keep (current user + current version) + // Format: "surfsense-{userId}-v{version}" + const currentDbIdentifier = `${DB_PREFIX}${currentUserId}-v${SYNC_VERSION}`; + try { // Try to list all databases (not supported in all browsers) if (typeof window.indexedDB.databases === "function") { @@ -95,26 +117,27 @@ async function cleanupOtherUserDatabases(currentUserId: string): Promise { if (!dbName) continue; // Check if this is a SurfSense database - if (dbName.startsWith(DB_PREFIX) || dbName.includes("surfsense")) { - // Don't delete current user's database - if (dbName.includes(currentUserId)) { - console.log(`[Electric] Keeping current user's database: ${dbName}`); + if (dbName.includes("surfsense")) { + // Check if this is the current database + // PGlite stores with "/pglite/" prefix, so we check if the name ENDS WITH our identifier + if (dbName.endsWith(currentDbIdentifier)) { + debugLog(`[Electric] Keeping current database: ${dbName}`); continue; } - // Delete databases from other users + // Delete ALL other databases (other users OR old versions of current user) try { - console.log(`[Electric] Deleting stale database: ${dbName}`); + debugLog(`[Electric] Deleting stale database: ${dbName}`); window.indexedDB.deleteDatabase(dbName); } catch (deleteErr) { - console.warn(`[Electric] Failed to delete database ${dbName}:`, deleteErr); + debugWarn(`[Electric] Failed to delete database ${dbName}:`, deleteErr); } } } } } catch (err) { // indexedDB.databases() not supported - that's okay, login cleanup is best-effort - console.warn("[Electric] Could not enumerate databases for cleanup:", err); + debugWarn("[Electric] Could not enumerate databases for cleanup:", err); } } @@ -140,7 +163,7 @@ export async function initElectric(userId: string): Promise { // If initialized for a different user, close the old client first if (electricClient && currentUserId !== userId) { - console.log(`[Electric] User changed from ${currentUserId} to ${userId}, reinitializing...`); + debugLog(`[Electric] User changed from ${currentUserId} to ${userId}, reinitializing...`); await cleanupElectric(); } @@ -155,12 +178,12 @@ export async function initElectric(userId: string): Promise { initPromise = (async () => { try { // STEP 1: Clean up databases from other users (login-time cleanup) - console.log("[Electric] Cleaning up databases from other users..."); + debugLog("[Electric] Cleaning up databases from other users..."); await cleanupOtherUserDatabases(userId); // STEP 2: Create user-specific PGlite database const dbName = getDbName(userId); - console.log(`[Electric] Initializing database: ${dbName}`); + debugLog(`[Electric] Initializing database: ${dbName}`); const db = await PGlite.create({ dataDir: dbName, @@ -216,18 +239,22 @@ export async function initElectric(userId: string): Promise { `); // Create the documents table schema in PGlite - // Only sync minimal fields needed for type counts: id, document_type, search_space_id + // Sync columns needed for real-time table display (lightweight - no content/metadata) await db.exec(` CREATE TABLE IF NOT EXISTS documents ( id INTEGER PRIMARY KEY, search_space_id INTEGER NOT NULL, document_type TEXT NOT NULL, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() + title TEXT NOT NULL DEFAULT '', + created_by_id TEXT, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + status JSONB DEFAULT '{"state": "ready"}'::jsonb ); CREATE INDEX IF NOT EXISTS idx_documents_search_space_id ON documents(search_space_id); CREATE INDEX IF NOT EXISTS idx_documents_type ON documents(document_type); CREATE INDEX IF NOT EXISTS idx_documents_search_space_type ON documents(search_space_id, document_type); + CREATE INDEX IF NOT EXISTS idx_documents_status ON documents((status->>'state')); `); await db.exec(` @@ -290,14 +317,14 @@ export async function initElectric(userId: string): Promise { // Check if we already have an active sync for this shape (memory optimization) const existingHandle = activeSyncHandles.get(cacheKey); if (existingHandle) { - console.log(`[Electric] Reusing existing sync handle for: ${cacheKey}`); + debugLog(`[Electric] Reusing existing sync handle for: ${cacheKey}`); return existingHandle; } // Check if there's already a pending sync for this shape (prevent race condition) const pendingSync = pendingSyncs.get(cacheKey); if (pendingSync) { - console.log(`[Electric] Waiting for pending sync to complete: ${cacheKey}`); + debugLog(`[Electric] Waiting for pending sync to complete: ${cacheKey}`); return pendingSync; } @@ -323,7 +350,7 @@ export async function initElectric(userId: string): Promise { if (singleQuoteCount % 2 !== 0) { // Odd number of quotes means unterminated string literal - console.warn("Where clause has unmatched quotes, fixing:", where); + debugWarn("Where clause has unmatched quotes, fixing:", where); // Add closing quote at the end validatedWhere = `${where}'`; params.where = validatedWhere; @@ -337,15 +364,15 @@ export async function initElectric(userId: string): Promise { if (columns) params.columns = columns.join(","); - console.log("[Electric] Syncing shape with params:", params); - console.log("[Electric] Electric URL:", `${electricUrl}/v1/shape`); - console.log("[Electric] Where clause:", where, "Validated:", validatedWhere); + debugLog("[Electric] Syncing shape with params:", params); + debugLog("[Electric] Electric URL:", `${electricUrl}/v1/shape`); + debugLog("[Electric] Where clause:", where, "Validated:", validatedWhere); try { // Debug: Test Electric SQL connection directly first (DEV ONLY - skipped in production) if (process.env.NODE_ENV === "development") { const testUrl = `${electricUrl}/v1/shape?table=${table}&offset=-1${validatedWhere ? `&where=${encodeURIComponent(validatedWhere)}` : ""}`; - console.log("[Electric] Testing Electric SQL directly:", testUrl); + debugLog("[Electric] Testing Electric SQL directly:", testUrl); try { const testResponse = await fetch(testUrl); const testHeaders = { @@ -353,9 +380,9 @@ export async function initElectric(userId: string): Promise { offset: testResponse.headers.get("electric-offset"), upToDate: testResponse.headers.get("electric-up-to-date"), }; - console.log("[Electric] Direct Electric SQL response headers:", testHeaders); + debugLog("[Electric] Direct Electric SQL response headers:", testHeaders); const testData = await testResponse.json(); - console.log( + debugLog( "[Electric] Direct Electric SQL data count:", Array.isArray(testData) ? testData.length : "not array", testData @@ -396,14 +423,14 @@ export async function initElectric(userId: string): Promise { // Shorter timeout (5 seconds) as fallback setTimeout(() => { if (!syncResolved) { - console.warn( + debugWarn( `[Electric] ⚠️ Sync timeout for ${table} - checking isUpToDate one more time...` ); // Check isUpToDate one more time before resolving // This will be checked after shape is created setTimeout(() => { if (!syncResolved) { - console.warn( + debugWarn( `[Electric] ⚠️ Sync timeout for ${table} - resolving anyway after 5s` ); resolveInitialSync(); @@ -413,7 +440,22 @@ export async function initElectric(userId: string): Promise { }, 5000); }); - // Include userId in shapeKey for user-specific sync state + // ROOT CAUSE FIX: The duplicate key errors were caused by unstable cutoff dates + // in use-inbox.ts generating different sync keys on each render. + // That's now fixed (rounded to midnight UTC in getSyncCutoffDate). + // We can safely use shapeKey for fast incremental sync. + + const shapeKey = `${userId}_v${SYNC_VERSION}_${table}_${where?.replace(/[^a-zA-Z0-9]/g, "_") || "all"}`; + + // Type assertion to PGlite with electric extension + const pgWithElectric = db as unknown as { + electric: { + syncShapeToTable: ( + config: Record + ) => Promise<{ unsubscribe: () => void; isUpToDate: boolean; stream: unknown }>; + }; + }; + const shapeConfig = { shape: { url: `${electricUrl}/v1/shape`, @@ -425,9 +467,9 @@ export async function initElectric(userId: string): Promise { }, table, primaryKey, - shapeKey: `${userId}_v${SYNC_VERSION}_${table}_${where?.replace(/[^a-zA-Z0-9]/g, "_") || "all"}`, // User-specific versioned key + shapeKey, // Re-enabled for fast incremental sync (root cause in use-inbox.ts is fixed) onInitialSync: () => { - console.log( + debugLog( `[Electric] ✅ Initial sync complete for ${table} - data should now be in PGlite` ); resolveInitialSync(); @@ -440,21 +482,37 @@ export async function initElectric(userId: string): Promise { ); rejectInitialSync(error); }, + // Handle must-refetch: clear table data before Electric re-inserts from scratch + // This prevents "duplicate key" errors when the shape is invalidated + onMustRefetch: async (tx: Transaction) => { + debugLog( + `[Electric] ⚠️ Must refetch triggered for ${table} - clearing existing data` + ); + try { + // Delete rows matching the shape's WHERE clause + // If no WHERE clause, delete all rows from the table + if (validatedWhere) { + // Parse the WHERE clause to build a DELETE statement + // The WHERE clause is already validated and formatted + await tx.exec(`DELETE FROM ${table} WHERE ${validatedWhere}`); + debugLog(`[Electric] 🗑️ Cleared ${table} rows matching: ${validatedWhere}`); + } else { + // No WHERE clause means we're syncing the entire table + await tx.exec(`DELETE FROM ${table}`); + debugLog(`[Electric] 🗑️ Cleared all rows from ${table}`); + } + } catch (cleanupError) { + console.error( + `[Electric] ❌ Failed to clear ${table} during must-refetch:`, + cleanupError + ); + // Re-throw to let Electric handle the error + throw cleanupError; + } + }, }; - console.log( - "[Electric] syncShapeToTable config:", - JSON.stringify(shapeConfig, null, 2) - ); - - // Type assertion to PGlite with electric extension - const pgWithElectric = db as PGlite & { - electric: { - syncShapeToTable: ( - config: typeof shapeConfig - ) => Promise<{ unsubscribe: () => void; isUpToDate: boolean; stream: unknown }>; - }; - }; + debugLog("[Electric] syncShapeToTable config:", JSON.stringify(shapeConfig, null, 2)); let shape: { unsubscribe: () => void; isUpToDate: boolean; stream: unknown }; try { @@ -464,7 +522,7 @@ export async function initElectric(userId: string): Promise { const errorMessage = syncError instanceof Error ? syncError.message : String(syncError); if (errorMessage.includes("Already syncing")) { - console.warn( + debugWarn( `[Electric] Already syncing ${table}, waiting for existing sync to settle...` ); @@ -474,12 +532,12 @@ export async function initElectric(userId: string): Promise { // Check if an active handle now exists (another sync might have completed) const existingHandle = activeSyncHandles.get(cacheKey); if (existingHandle) { - console.log(`[Electric] Found existing handle after waiting: ${cacheKey}`); + debugLog(`[Electric] Found existing handle after waiting: ${cacheKey}`); return existingHandle; } // Retry once after waiting - console.log(`[Electric] Retrying sync for ${table}...`); + debugLog(`[Electric] Retrying sync for ${table}...`); try { shape = await pgWithElectric.electric.syncShapeToTable(shapeConfig); } catch (retryError) { @@ -487,12 +545,10 @@ export async function initElectric(userId: string): Promise { retryError instanceof Error ? retryError.message : String(retryError); if (retryMessage.includes("Already syncing")) { // Still syncing - create a placeholder handle that indicates the table is being synced - console.warn( - `[Electric] ${table} still syncing, creating placeholder handle` - ); + debugWarn(`[Electric] ${table} still syncing, creating placeholder handle`); const placeholderHandle: SyncHandle = { unsubscribe: () => { - console.log(`[Electric] Placeholder unsubscribe for: ${cacheKey}`); + debugLog(`[Electric] Placeholder unsubscribe for: ${cacheKey}`); activeSyncHandles.delete(cacheKey); }, get isUpToDate() { @@ -516,7 +572,7 @@ export async function initElectric(userId: string): Promise { } // Log the actual shape result structure - console.log("[Electric] Shape sync result (initial):", { + debugLog("[Electric] Shape sync result (initial):", { hasUnsubscribe: typeof shape?.unsubscribe === "function", isUpToDate: shape?.isUpToDate, hasStream: !!shape?.stream, @@ -525,7 +581,7 @@ export async function initElectric(userId: string): Promise { // Recommended Approach Step 1: Check isUpToDate immediately if (shape.isUpToDate) { - console.log( + debugLog( `[Electric] ✅ Sync already up-to-date for ${table} (resuming from previous state)` ); resolveInitialSync(); @@ -533,7 +589,7 @@ export async function initElectric(userId: string): Promise { // Recommended Approach Step 2: Subscribe to stream and watch for "up-to-date" message if (shape?.stream) { const stream = shape.stream as any; - console.log("[Electric] Shape stream details:", { + debugLog("[Electric] Shape stream details:", { shapeHandle: stream?.shapeHandle, lastOffset: stream?.lastOffset, isUpToDate: stream?.isUpToDate, @@ -546,14 +602,14 @@ export async function initElectric(userId: string): Promise { // NOTE: We keep this subscription active - don't unsubscribe! // The stream is what Electric SQL uses for real-time updates if (typeof stream?.subscribe === "function") { - console.log( + debugLog( "[Electric] Subscribing to shape stream to watch for up-to-date message..." ); // Subscribe but don't store unsubscribe - we want it to stay active stream.subscribe((messages: unknown[]) => { // Continue receiving updates even after sync is resolved if (!syncResolved) { - console.log( + debugLog( "[Electric] 🔵 Shape stream received messages:", messages?.length || 0 ); @@ -570,14 +626,14 @@ export async function initElectric(userId: string): Promise { (typeof msg === "object" && "up-to-date" in msg) ) { if (!syncResolved) { - console.log(`[Electric] ✅ Received up-to-date message for ${table}`); + debugLog(`[Electric] ✅ Received up-to-date message for ${table}`); resolveInitialSync(); } // Continue listening for real-time updates - don't return! } } if (!syncResolved && messages.length > 0) { - console.log( + debugLog( "[Electric] First message:", JSON.stringify(messages[0], null, 2) ); @@ -586,16 +642,14 @@ export async function initElectric(userId: string): Promise { // Also check stream's isUpToDate property after receiving messages if (!syncResolved && stream?.isUpToDate) { - console.log(`[Electric] ✅ Stream isUpToDate is true for ${table}`); + debugLog(`[Electric] ✅ Stream isUpToDate is true for ${table}`); resolveInitialSync(); } }); // Also check stream's isUpToDate property immediately if (stream?.isUpToDate) { - console.log( - `[Electric] ✅ Stream isUpToDate is true immediately for ${table}` - ); + debugLog(`[Electric] ✅ Stream isUpToDate is true immediately for ${table}`); resolveInitialSync(); } } @@ -608,9 +662,7 @@ export async function initElectric(userId: string): Promise { } if (shape.isUpToDate || stream?.isUpToDate) { - console.log( - `[Electric] ✅ Sync completed (detected via polling) for ${table}` - ); + debugLog(`[Electric] ✅ Sync completed (detected via polling) for ${table}`); clearInterval(pollInterval); resolveInitialSync(); } @@ -621,7 +673,7 @@ export async function initElectric(userId: string): Promise { clearInterval(pollInterval); }); } else { - console.warn( + debugWarn( `[Electric] ⚠️ No stream available for ${table}, relying on callback and timeout` ); } @@ -630,7 +682,7 @@ export async function initElectric(userId: string): Promise { // Create the sync handle with proper cleanup const syncHandle: SyncHandle = { unsubscribe: () => { - console.log(`[Electric] Unsubscribing from: ${cacheKey}`); + debugLog(`[Electric] Unsubscribing from: ${cacheKey}`); // Remove from cache first activeSyncHandles.delete(cacheKey); // Then unsubscribe from the shape @@ -648,7 +700,7 @@ export async function initElectric(userId: string): Promise { // Cache the sync handle for reuse (memory optimization) activeSyncHandles.set(cacheKey, syncHandle); - console.log( + debugLog( `[Electric] Cached sync handle for: ${cacheKey} (total cached: ${activeSyncHandles.size})` ); @@ -660,7 +712,7 @@ export async function initElectric(userId: string): Promise { const response = await fetch(`${electricUrl}/v1/shape?table=${table}&offset=-1`, { method: "GET", }); - console.log( + debugLog( "[Electric] Electric SQL server response:", response.status, response.statusText @@ -682,14 +734,14 @@ export async function initElectric(userId: string): Promise { // Clean up the pending sync when done (whether success or failure) syncPromise.finally(() => { pendingSyncs.delete(cacheKey); - console.log(`[Electric] Pending sync removed for: ${cacheKey}`); + debugLog(`[Electric] Pending sync removed for: ${cacheKey}`); }); return syncPromise; }, }; - console.log(`[Electric] ✅ Initialized successfully for user: ${userId}`); + debugLog(`[Electric] ✅ Initialized successfully for user: ${userId}`); return electricClient; } catch (error) { console.error("[Electric] Failed to initialize:", error); @@ -715,10 +767,10 @@ export async function cleanupElectric(): Promise { } const userIdToClean = currentUserId; - console.log(`[Electric] Cleaning up for user: ${userIdToClean}`); + debugLog(`[Electric] Cleaning up for user: ${userIdToClean}`); // Unsubscribe from all active sync handles first (memory cleanup) - console.log(`[Electric] Unsubscribing from ${activeSyncHandles.size} active sync handles`); + debugLog(`[Electric] Unsubscribing from ${activeSyncHandles.size} active sync handles`); // Copy keys to array to avoid mutation during iteration const handleKeys = Array.from(activeSyncHandles.keys()); for (const key of handleKeys) { @@ -727,7 +779,7 @@ export async function cleanupElectric(): Promise { try { handle.unsubscribe(); } catch (err) { - console.warn(`[Electric] Failed to unsubscribe from ${key}:`, err); + debugWarn(`[Electric] Failed to unsubscribe from ${key}:`, err); } } } @@ -738,7 +790,7 @@ export async function cleanupElectric(): Promise { try { // Close the PGlite database connection await electricClient.db.close(); - console.log("[Electric] Database closed"); + debugLog("[Electric] Database closed"); } catch (error) { console.error("[Electric] Error closing database:", error); } @@ -754,13 +806,13 @@ export async function cleanupElectric(): Promise { try { const dbName = `${DB_PREFIX}${userIdToClean}-v${SYNC_VERSION}`; window.indexedDB.deleteDatabase(dbName); - console.log(`[Electric] Deleted database: ${dbName}`); + debugLog(`[Electric] Deleted database: ${dbName}`); } catch (err) { - console.warn("[Electric] Failed to delete database:", err); + debugWarn("[Electric] Failed to delete database:", err); } } - console.log("[Electric] Cleanup complete"); + debugLog("[Electric] Cleanup complete"); } /** diff --git a/surfsense_web/messages/en.json b/surfsense_web/messages/en.json index 5a18f80c3..fae4c7265 100644 --- a/surfsense_web/messages/en.json +++ b/surfsense_web/messages/en.json @@ -308,6 +308,7 @@ "no_rows_selected": "No rows selected", "delete_success_count": "Successfully deleted {count} document(s)", "delete_partial_failed": "Some documents could not be deleted", + "delete_success": "Document deleted successfully", "delete_error": "Error deleting documents", "filter_by_title": "Filter by title...", "bulk_delete": "Delete Selected", @@ -328,7 +329,6 @@ "filter_placeholder": "Filter by title...", "rows_per_page": "Rows per page", "refresh": "Refresh", - "refresh_success": "Documents refreshed", "upload_documents": "Upload Documents", "create_shared_note": "Create Shared Note", "processing_documents": "Processing documents...", diff --git a/surfsense_web/messages/zh.json b/surfsense_web/messages/zh.json index 1046b7296..2667a06d1 100644 --- a/surfsense_web/messages/zh.json +++ b/surfsense_web/messages/zh.json @@ -313,7 +313,6 @@ "filter_placeholder": "按标题筛选...", "rows_per_page": "每页行数", "refresh": "刷新", - "refresh_success": "文档已刷新", "upload_documents": "上传文档", "create_shared_note": "创建共享笔记", "processing_documents": "正在处理文档...",