mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-08 20:25:19 +02:00
commit
d97068882a
65 changed files with 8215 additions and 4388 deletions
|
|
@ -17,13 +17,6 @@ from collections.abc import Sequence
|
|||
|
||||
from alembic import context, op
|
||||
|
||||
# Get Electric SQL user credentials from env.py configuration
|
||||
_config = context.config
|
||||
ELECTRIC_DB_USER = _config.get_main_option("electric_db_user", "electric")
|
||||
ELECTRIC_DB_PASSWORD = _config.get_main_option(
|
||||
"electric_db_password", "electric_password"
|
||||
)
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = "66"
|
||||
down_revision: str | None = "65"
|
||||
|
|
@ -31,8 +24,21 @@ branch_labels: str | Sequence[str] | None = None
|
|||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
|
||||
def _get_electric_credentials() -> tuple[str, str]:
    """Read the Electric SQL role name and password from the Alembic config.

    Call this from within upgrade()/downgrade() only, never at import time:
    context.config is only available while a migration is executing.

    Returns:
        A (user, password) pair taken from the "electric_db_user" and
        "electric_db_password" main options, with "electric" and
        "electric_password" passed as the respective fallback values.
    """
    read_option = context.config.get_main_option
    return (
        read_option("electric_db_user", "electric"),
        read_option("electric_db_password", "electric_password"),
    )
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
"""Upgrade schema - add notifications table and Electric SQL replication."""
|
||||
electric_db_user, electric_db_password = _get_electric_credentials()
|
||||
# Create notifications table
|
||||
op.execute(
|
||||
"""
|
||||
|
|
@ -74,8 +80,8 @@ def upgrade() -> None:
|
|||
f"""
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (SELECT FROM pg_user WHERE usename = '{ELECTRIC_DB_USER}') THEN
|
||||
CREATE USER {ELECTRIC_DB_USER} WITH REPLICATION PASSWORD '{ELECTRIC_DB_PASSWORD}';
|
||||
IF NOT EXISTS (SELECT FROM pg_user WHERE usename = '{electric_db_user}') THEN
|
||||
CREATE USER {electric_db_user} WITH REPLICATION PASSWORD '{electric_db_password}';
|
||||
END IF;
|
||||
END
|
||||
$$;
|
||||
|
|
@ -89,19 +95,19 @@ def upgrade() -> None:
|
|||
DECLARE
|
||||
db_name TEXT := current_database();
|
||||
BEGIN
|
||||
EXECUTE format('GRANT CONNECT ON DATABASE %I TO {ELECTRIC_DB_USER}', db_name);
|
||||
EXECUTE format('GRANT CONNECT ON DATABASE %I TO {electric_db_user}', db_name);
|
||||
END
|
||||
$$;
|
||||
"""
|
||||
)
|
||||
op.execute(f"GRANT USAGE ON SCHEMA public TO {ELECTRIC_DB_USER};")
|
||||
op.execute(f"GRANT SELECT ON ALL TABLES IN SCHEMA public TO {ELECTRIC_DB_USER};")
|
||||
op.execute(f"GRANT SELECT ON ALL SEQUENCES IN SCHEMA public TO {ELECTRIC_DB_USER};")
|
||||
op.execute(f"GRANT USAGE ON SCHEMA public TO {electric_db_user};")
|
||||
op.execute(f"GRANT SELECT ON ALL TABLES IN SCHEMA public TO {electric_db_user};")
|
||||
op.execute(f"GRANT SELECT ON ALL SEQUENCES IN SCHEMA public TO {electric_db_user};")
|
||||
op.execute(
|
||||
f"ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO {ELECTRIC_DB_USER};"
|
||||
f"ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO {electric_db_user};"
|
||||
)
|
||||
op.execute(
|
||||
f"ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON SEQUENCES TO {ELECTRIC_DB_USER};"
|
||||
f"ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON SEQUENCES TO {electric_db_user};"
|
||||
)
|
||||
|
||||
# Create the publication if not exists
|
||||
|
|
|
|||
|
|
@ -10,8 +10,6 @@ SECRET_KEY rotation.
|
|||
|
||||
from collections.abc import Sequence
|
||||
|
||||
import sqlalchemy as sa
|
||||
|
||||
from alembic import op
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
|
|
@ -23,17 +21,45 @@ depends_on: str | Sequence[str] | None = None
|
|||
|
||||
def upgrade() -> None:
|
||||
# Add access_token column (nullable so existing rows are unaffected)
|
||||
op.add_column(
|
||||
"image_generations",
|
||||
sa.Column("access_token", sa.String(64), nullable=True),
|
||||
)
|
||||
op.create_index(
|
||||
"ix_image_generations_access_token",
|
||||
"image_generations",
|
||||
["access_token"],
|
||||
# Guard: skip entirely if image_generations table doesn't exist
|
||||
op.execute(
|
||||
"""
|
||||
DO $$
|
||||
BEGIN
|
||||
IF EXISTS (
|
||||
SELECT 1 FROM information_schema.tables
|
||||
WHERE table_name = 'image_generations'
|
||||
) THEN
|
||||
-- Add column if not exists
|
||||
IF NOT EXISTS (
|
||||
SELECT 1 FROM information_schema.columns
|
||||
WHERE table_name = 'image_generations' AND column_name = 'access_token'
|
||||
) THEN
|
||||
ALTER TABLE image_generations
|
||||
ADD COLUMN access_token VARCHAR(64);
|
||||
END IF;
|
||||
|
||||
-- Create index if not exists
|
||||
CREATE INDEX IF NOT EXISTS ix_image_generations_access_token
|
||||
ON image_generations (access_token);
|
||||
END IF;
|
||||
END$$;
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_index("ix_image_generations_access_token", table_name="image_generations")
|
||||
op.drop_column("image_generations", "access_token")
|
||||
op.execute("DROP INDEX IF EXISTS ix_image_generations_access_token")
|
||||
op.execute(
|
||||
"""
|
||||
DO $$
|
||||
BEGIN
|
||||
IF EXISTS (
|
||||
SELECT 1 FROM information_schema.columns
|
||||
WHERE table_name = 'image_generations' AND column_name = 'access_token'
|
||||
) THEN
|
||||
ALTER TABLE image_generations DROP COLUMN access_token;
|
||||
END IF;
|
||||
END$$;
|
||||
"""
|
||||
)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,77 @@
|
|||
"""Add status column to documents table for per-document processing status
|
||||
|
||||
Revision ID: 95
|
||||
Revises: 94
|
||||
Create Date: 2026-02-05
|
||||
|
||||
Changes:
|
||||
1. Add status column (JSONB) to documents table
|
||||
2. Default value is {"state": "ready"} for backward compatibility
|
||||
3. Existing documents are set to ready status
|
||||
4. Index created for efficient status filtering
|
||||
"""
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
from alembic import op
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = "95"
|
||||
down_revision: str | None = "94"
|
||||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Add status column to documents with default ready state.

    Both statements are idempotent (IF NOT EXISTS), so re-running the
    migration against a database that already has the column or the index
    is a no-op.
    """

    # 1. Add status column with default value for new rows.
    #    ADD COLUMN IF NOT EXISTS resolves "documents" through the current
    #    search_path, unlike a lookup in information_schema.columns filtered
    #    only by table_name, which would also match a same-named table in a
    #    different schema and wrongly skip the ALTER.
    #    NOT NULL + DEFAULT also gives every existing row the ready state.
    op.execute(
        """
        ALTER TABLE documents
        ADD COLUMN IF NOT EXISTS status JSONB NOT NULL DEFAULT '{"state": "ready"}'::jsonb;
        """
    )

    # 2. Create index on status for efficient filtering by state
    #    (expression index over the extracted 'state' text value).
    op.execute(
        """
        CREATE INDEX IF NOT EXISTS ix_documents_status
        ON documents ((status->>'state'));
        """
    )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Remove status column from documents.

    Both statements are idempotent: they do nothing when the index, the
    column, or the documents table itself is already absent.
    """

    # Drop index
    op.execute(
        """
        DROP INDEX IF EXISTS ix_documents_status;
        """
    )

    # Drop column.
    # IF EXISTS on both the table and the column keeps this a no-op when
    # either is missing, and resolves "documents" through the current
    # search_path instead of matching any schema's table by name via
    # information_schema.columns.
    op.execute(
        """
        ALTER TABLE IF EXISTS documents
        DROP COLUMN IF EXISTS status;
        """
    )
|
||||
|
|
@ -16,11 +16,15 @@ from sqlalchemy.orm import selectinload
|
|||
|
||||
from app.config import config
|
||||
from app.connectors.composio_connector import ComposioConnector
|
||||
from app.db import Document, DocumentType
|
||||
from app.db import Document, DocumentStatus, DocumentType
|
||||
from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.tasks.connector_indexers.base import calculate_date_range
|
||||
from app.tasks.connector_indexers.base import (
|
||||
calculate_date_range,
|
||||
check_duplicate_document_by_hash,
|
||||
safe_set_chunks,
|
||||
)
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
generate_content_hash,
|
||||
|
|
@ -206,26 +210,24 @@ class ComposioGmailConnector(ComposioConnector):
|
|||
# ============ Indexer Functions ============
|
||||
|
||||
|
||||
async def _process_gmail_message_batch(
|
||||
async def _analyze_gmail_messages_phase1(
|
||||
session: AsyncSession,
|
||||
messages: list[dict[str, Any]],
|
||||
composio_connector: ComposioGmailConnector,
|
||||
connector_id: int,
|
||||
search_space_id: int,
|
||||
user_id: str,
|
||||
total_documents_indexed: int = 0,
|
||||
) -> tuple[int, int]:
|
||||
) -> tuple[list[dict[str, Any]], int, int]:
|
||||
"""
|
||||
Process a batch of Gmail messages and index them.
|
||||
|
||||
Args:
|
||||
total_documents_indexed: Running total of documents indexed so far (for batch commits).
|
||||
Phase 1: Analyze all messages, create pending documents.
|
||||
Makes ALL documents visible in the UI immediately with pending status.
|
||||
|
||||
Returns:
|
||||
Tuple of (documents_indexed, documents_skipped)
|
||||
Tuple of (messages_to_process, documents_skipped, duplicate_content_count)
|
||||
"""
|
||||
documents_indexed = 0
|
||||
messages_to_process = []
|
||||
documents_skipped = 0
|
||||
duplicate_content_count = 0
|
||||
|
||||
for message in messages:
|
||||
try:
|
||||
|
|
@ -235,11 +237,7 @@ async def _process_gmail_message_batch(
|
|||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Composio's GMAIL_FETCH_EMAILS already returns full message content
|
||||
# No need for a separate detail API call
|
||||
|
||||
# Extract message info from Composio response
|
||||
# Composio structure: messageId, messageText, messageTimestamp, payload.headers, labelIds
|
||||
payload = message.get("payload", {})
|
||||
headers = payload.get("headers", [])
|
||||
|
||||
|
|
@ -262,7 +260,7 @@ async def _process_gmail_message_batch(
|
|||
message
|
||||
)
|
||||
|
||||
# Check for empty content (defensive parsing per Composio best practices)
|
||||
# Check for empty content
|
||||
if not markdown_content.strip():
|
||||
logger.warning(f"Skipping Gmail message with no content: {subject}")
|
||||
documents_skipped += 1
|
||||
|
|
@ -280,102 +278,58 @@ async def _process_gmail_message_batch(
|
|||
session, unique_identifier_hash
|
||||
)
|
||||
|
||||
# Get label IDs from Composio response
|
||||
# Get label IDs and thread_id from Composio response
|
||||
label_ids = message.get("labelIds", [])
|
||||
# Extract thread_id if available (for consistency with non-Composio implementation)
|
||||
thread_id = message.get("threadId", "") or message.get("thread_id", "")
|
||||
|
||||
if existing_document:
|
||||
if existing_document.content_hash == content_hash:
|
||||
# Ensure status is ready (might have been stuck in processing/pending)
|
||||
if not DocumentStatus.is_state(
|
||||
existing_document.status, DocumentStatus.READY
|
||||
):
|
||||
existing_document.status = DocumentStatus.ready()
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Update existing
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
# Queue existing document for update (will be set to processing in Phase 2)
|
||||
messages_to_process.append(
|
||||
{
|
||||
"document": existing_document,
|
||||
"is_new": False,
|
||||
"markdown_content": markdown_content,
|
||||
"content_hash": content_hash,
|
||||
"message_id": message_id,
|
||||
"thread_id": thread_id,
|
||||
"subject": subject,
|
||||
"sender": sender,
|
||||
"document_type": "Gmail Message (Composio)",
|
||||
"date_str": date_str,
|
||||
"label_ids": label_ids,
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
markdown_content, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
summary_content = (
|
||||
f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}"
|
||||
)
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
chunks = await create_document_chunks(markdown_content)
|
||||
|
||||
existing_document.title = f"Gmail: {subject}"
|
||||
existing_document.content = summary_content
|
||||
existing_document.content_hash = content_hash
|
||||
existing_document.embedding = summary_embedding
|
||||
existing_document.document_metadata = {
|
||||
"message_id": message_id,
|
||||
"thread_id": thread_id,
|
||||
"subject": subject,
|
||||
"sender": sender,
|
||||
"date": date_str,
|
||||
"labels": label_ids,
|
||||
"connector_id": connector_id,
|
||||
"source": "composio",
|
||||
}
|
||||
existing_document.chunks = chunks
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
|
||||
documents_indexed += 1
|
||||
|
||||
# Batch commit every 10 documents
|
||||
current_total = total_documents_indexed + documents_indexed
|
||||
if current_total % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {current_total} Gmail messages processed so far"
|
||||
)
|
||||
await session.commit()
|
||||
)
|
||||
continue
|
||||
|
||||
# Create new document
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
"message_id": message_id,
|
||||
"thread_id": thread_id,
|
||||
"subject": subject,
|
||||
"sender": sender,
|
||||
"document_type": "Gmail Message (Composio)",
|
||||
}
|
||||
summary_content, summary_embedding = await generate_document_summary(
|
||||
markdown_content, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
summary_content = (
|
||||
f"Gmail: {subject}\n\nFrom: {sender}\nDate: {date_str}"
|
||||
)
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from standard connector)
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
|
||||
chunks = await create_document_chunks(markdown_content)
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
f"Message {subject} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
duplicate_content_count += 1
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=f"Gmail: {subject}",
|
||||
title=subject,
|
||||
document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["gmail"]),
|
||||
document_metadata={
|
||||
"message_id": message_id,
|
||||
|
|
@ -388,39 +342,140 @@ async def _process_gmail_message_batch(
|
|||
"toolkit_id": "gmail",
|
||||
"source": "composio",
|
||||
},
|
||||
content=summary_content,
|
||||
content_hash=content_hash,
|
||||
content="Pending...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation - safe for async
|
||||
status=DocumentStatus.pending(), # Pending until processing starts
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
session.add(document)
|
||||
|
||||
messages_to_process.append(
|
||||
{
|
||||
"document": document,
|
||||
"is_new": True,
|
||||
"markdown_content": markdown_content,
|
||||
"content_hash": content_hash,
|
||||
"message_id": message_id,
|
||||
"thread_id": thread_id,
|
||||
"subject": subject,
|
||||
"sender": sender,
|
||||
"date_str": date_str,
|
||||
"label_ids": label_ids,
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Phase 1 for message: {e!s}", exc_info=True)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
return messages_to_process, documents_skipped, duplicate_content_count
|
||||
|
||||
|
||||
async def _process_gmail_messages_phase2(
|
||||
session: AsyncSession,
|
||||
messages_to_process: list[dict[str, Any]],
|
||||
connector_id: int,
|
||||
search_space_id: int,
|
||||
user_id: str,
|
||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||
) -> tuple[int, int]:
|
||||
"""
|
||||
Phase 2: Process each document one by one.
|
||||
Each document transitions: pending → processing → ready/failed
|
||||
|
||||
Returns:
|
||||
Tuple of (documents_indexed, documents_failed)
|
||||
"""
|
||||
documents_indexed = 0
|
||||
documents_failed = 0
|
||||
last_heartbeat_time = time.time()
|
||||
|
||||
for item in messages_to_process:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
document = item["document"]
|
||||
try:
|
||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
# Heavy processing (LLM, embeddings, chunks)
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata_for_summary = {
|
||||
"message_id": item["message_id"],
|
||||
"thread_id": item["thread_id"],
|
||||
"subject": item["subject"],
|
||||
"sender": item["sender"],
|
||||
"document_type": "Gmail Message (Composio)",
|
||||
}
|
||||
summary_content, summary_embedding = await generate_document_summary(
|
||||
item["markdown_content"], user_llm, document_metadata_for_summary
|
||||
)
|
||||
else:
|
||||
summary_content = f"Gmail: {item['subject']}\n\nFrom: {item['sender']}\nDate: {item['date_str']}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
chunks = await create_document_chunks(item["markdown_content"])
|
||||
|
||||
# Update document to READY with actual content
|
||||
document.title = item["subject"]
|
||||
document.content = summary_content
|
||||
document.content_hash = item["content_hash"]
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = {
|
||||
"message_id": item["message_id"],
|
||||
"thread_id": item["thread_id"],
|
||||
"subject": item["subject"],
|
||||
"sender": item["sender"],
|
||||
"date": item["date_str"],
|
||||
"labels": item["label_ids"],
|
||||
"connector_id": connector_id,
|
||||
"source": "composio",
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
documents_indexed += 1
|
||||
|
||||
# Batch commit every 10 documents
|
||||
current_total = total_documents_indexed + documents_indexed
|
||||
if current_total % 10 == 0:
|
||||
# Batch commit every 10 documents (for ready status updates)
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {current_total} Gmail messages processed so far"
|
||||
f"Committing batch: {documents_indexed} Gmail messages processed so far"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing Gmail message: {e!s}", exc_info=True)
|
||||
documents_skipped += 1
|
||||
# Rollback on error to avoid partial state (per Composio best practices)
|
||||
# Mark document as failed with reason (visible in UI)
|
||||
try:
|
||||
await session.rollback()
|
||||
except Exception as rollback_error:
|
||||
document.status = DocumentStatus.failed(str(e))
|
||||
document.updated_at = get_current_timestamp()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Error during rollback: {rollback_error!s}", exc_info=True
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
return documents_indexed, documents_skipped
|
||||
return documents_indexed, documents_failed
|
||||
|
||||
|
||||
async def index_composio_gmail(
|
||||
|
|
@ -437,7 +492,7 @@ async def index_composio_gmail(
|
|||
max_items: int = 1000,
|
||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||
) -> tuple[int, str]:
|
||||
"""Index Gmail messages via Composio with pagination and incremental processing."""
|
||||
"""Index Gmail messages via Composio with real-time document status updates."""
|
||||
try:
|
||||
composio_connector = ComposioGmailConnector(session, connector_id)
|
||||
|
||||
|
|
@ -448,14 +503,10 @@ async def index_composio_gmail(
|
|||
end_date = None
|
||||
|
||||
# Use provided dates directly if both are provided, otherwise calculate from last_indexed_at
|
||||
# This ensures user-selected dates are respected (matching non-Composio Gmail connector behavior)
|
||||
if start_date is not None and end_date is not None:
|
||||
# User provided both dates - use them directly
|
||||
start_date_str = start_date
|
||||
end_date_str = end_date
|
||||
else:
|
||||
# Calculate date range with defaults (uses last_indexed_at or 365 days back)
|
||||
# This ensures indexing works even when user doesn't specify dates
|
||||
start_date_str, end_date_str = calculate_date_range(
|
||||
connector, start_date, end_date, default_days_back=365
|
||||
)
|
||||
|
|
@ -473,48 +524,32 @@ async def index_composio_gmail(
|
|||
f"(start_date={start_date_str}, end_date={end_date_str})"
|
||||
)
|
||||
|
||||
# Use smaller batch size to avoid 413 payload too large errors
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Fetching Gmail messages via Composio for connector {connector_id}",
|
||||
{"stage": "fetching_messages"},
|
||||
)
|
||||
|
||||
# =======================================================================
|
||||
# FETCH ALL MESSAGES FIRST
|
||||
# =======================================================================
|
||||
batch_size = 50
|
||||
page_token = None
|
||||
total_documents_indexed = 0
|
||||
total_documents_skipped = 0
|
||||
total_messages_fetched = 0
|
||||
result_size_estimate = None # Will be set from first API response
|
||||
all_messages = []
|
||||
result_size_estimate = None
|
||||
last_heartbeat_time = time.time()
|
||||
|
||||
while total_messages_fetched < max_items:
|
||||
# Send heartbeat periodically to indicate task is still alive
|
||||
while len(all_messages) < max_items:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(total_documents_indexed)
|
||||
await on_heartbeat_callback(len(all_messages))
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
# Calculate how many messages to fetch in this batch
|
||||
remaining = max_items - total_messages_fetched
|
||||
remaining = max_items - len(all_messages)
|
||||
current_batch_size = min(batch_size, remaining)
|
||||
|
||||
# Use result_size_estimate if available, otherwise fall back to max_items
|
||||
estimated_total = (
|
||||
result_size_estimate if result_size_estimate is not None else max_items
|
||||
)
|
||||
# Cap estimated_total at max_items to avoid showing misleading progress
|
||||
estimated_total = min(estimated_total, max_items)
|
||||
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Fetching Gmail messages batch via Composio for connector {connector_id} "
|
||||
f"({total_messages_fetched}/{estimated_total} fetched, {total_documents_indexed} indexed)",
|
||||
{
|
||||
"stage": "fetching_messages",
|
||||
"batch_size": current_batch_size,
|
||||
"total_fetched": total_messages_fetched,
|
||||
"total_indexed": total_documents_indexed,
|
||||
"estimated_total": estimated_total,
|
||||
},
|
||||
)
|
||||
|
||||
# Fetch batch of messages
|
||||
(
|
||||
messages,
|
||||
next_token,
|
||||
|
|
@ -533,97 +568,136 @@ async def index_composio_gmail(
|
|||
return 0, f"Failed to fetch Gmail messages: {error}"
|
||||
|
||||
if not messages:
|
||||
# No more messages available
|
||||
break
|
||||
|
||||
# Update result_size_estimate from first response (Gmail provides this estimate)
|
||||
if result_size_estimate is None and result_size_estimate_batch is not None:
|
||||
result_size_estimate = result_size_estimate_batch
|
||||
logger.info(
|
||||
f"Gmail API estimated {result_size_estimate} total messages for query: '{query}'"
|
||||
)
|
||||
|
||||
total_messages_fetched += len(messages)
|
||||
# Recalculate estimated_total after potentially updating result_size_estimate
|
||||
estimated_total = (
|
||||
result_size_estimate if result_size_estimate is not None else max_items
|
||||
)
|
||||
estimated_total = min(estimated_total, max_items)
|
||||
|
||||
all_messages.extend(messages)
|
||||
logger.info(
|
||||
f"Fetched batch of {len(messages)} Gmail messages "
|
||||
f"(total: {total_messages_fetched}/{estimated_total})"
|
||||
f"Fetched {len(messages)} messages (total: {len(all_messages)})"
|
||||
)
|
||||
|
||||
# Process batch incrementally
|
||||
batch_indexed, batch_skipped = await _process_gmail_message_batch(
|
||||
session=session,
|
||||
messages=messages,
|
||||
composio_connector=composio_connector,
|
||||
connector_id=connector_id,
|
||||
search_space_id=search_space_id,
|
||||
user_id=user_id,
|
||||
total_documents_indexed=total_documents_indexed,
|
||||
)
|
||||
|
||||
total_documents_indexed += batch_indexed
|
||||
total_documents_skipped += batch_skipped
|
||||
|
||||
logger.info(
|
||||
f"Processed batch: {batch_indexed} indexed, {batch_skipped} skipped "
|
||||
f"(total: {total_documents_indexed} indexed, {total_documents_skipped} skipped)"
|
||||
)
|
||||
|
||||
# Batch commits happen in _process_gmail_message_batch every 10 documents
|
||||
# This ensures progress is saved incrementally, preventing data loss on crashes
|
||||
|
||||
# Check if we should continue
|
||||
if not next_token:
|
||||
# No more pages available
|
||||
if not next_token or len(messages) < current_batch_size:
|
||||
break
|
||||
|
||||
if len(messages) < current_batch_size:
|
||||
# Last page had fewer items than requested, we're done
|
||||
break
|
||||
|
||||
# Continue with next page
|
||||
page_token = next_token
|
||||
|
||||
if total_messages_fetched == 0:
|
||||
if not all_messages:
|
||||
success_msg = "No Gmail messages found in the specified date range"
|
||||
await task_logger.log_task_success(
|
||||
log_entry, success_msg, {"messages_count": 0}
|
||||
)
|
||||
# CRITICAL: Update timestamp even when no messages found so Electric SQL syncs and UI shows indexed status
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
await session.commit()
|
||||
return 0, None # Return None (not error) when no items found
|
||||
return (
|
||||
0,
|
||||
None,
|
||||
) # Return None (not error) when no items found - this is success with 0 items
|
||||
|
||||
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
|
||||
# This ensures the UI shows "Last indexed" instead of "Never indexed"
|
||||
logger.info(f"Found {len(all_messages)} Gmail messages to index via Composio")
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 1: Analyze all messages, create pending documents
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
# =======================================================================
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Phase 1: Creating pending documents for {len(all_messages)} messages",
|
||||
{"stage": "phase1_pending"},
|
||||
)
|
||||
|
||||
(
|
||||
messages_to_process,
|
||||
documents_skipped,
|
||||
duplicate_content_count,
|
||||
) = await _analyze_gmail_messages_phase1(
|
||||
session=session,
|
||||
messages=all_messages,
|
||||
composio_connector=composio_connector,
|
||||
connector_id=connector_id,
|
||||
search_space_id=search_space_id,
|
||||
user_id=user_id,
|
||||
)
|
||||
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
new_documents_count = len([m for m in messages_to_process if m["is_new"]])
|
||||
if new_documents_count > 0:
|
||||
logger.info(f"Phase 1: Committing {new_documents_count} pending documents")
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each document one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(messages_to_process)} documents")
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Phase 2: Processing {len(messages_to_process)} documents",
|
||||
{"stage": "phase2_processing"},
|
||||
)
|
||||
|
||||
documents_indexed, documents_failed = await _process_gmail_messages_phase2(
|
||||
session=session,
|
||||
messages_to_process=messages_to_process,
|
||||
connector_id=connector_id,
|
||||
search_space_id=search_space_id,
|
||||
user_id=user_id,
|
||||
on_heartbeat_callback=on_heartbeat_callback,
|
||||
)
|
||||
|
||||
# CRITICAL: Always update timestamp so Electric SQL syncs
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
|
||||
# Final commit to ensure all documents are persisted (safety net)
|
||||
# This matches the pattern used in non-Composio Gmail indexer
|
||||
logger.info(
|
||||
f"Final commit: Total {total_documents_indexed} Gmail messages processed"
|
||||
)
|
||||
await session.commit()
|
||||
logger.info(
|
||||
"Successfully committed all Composio Gmail document changes to database"
|
||||
)
|
||||
# Final commit to ensure all documents are persisted
|
||||
logger.info(f"Final commit: Total {documents_indexed} Gmail messages processed")
|
||||
try:
|
||||
await session.commit()
|
||||
logger.info(
|
||||
"Successfully committed all Composio Gmail document changes to database"
|
||||
)
|
||||
except Exception as e:
|
||||
# Handle any remaining integrity errors gracefully
|
||||
if (
|
||||
"duplicate key value violates unique constraint" in str(e).lower()
|
||||
or "uniqueviolationerror" in str(e).lower()
|
||||
):
|
||||
logger.warning(
|
||||
f"Duplicate content_hash detected during final commit. "
|
||||
f"Rolling back and continuing. Error: {e!s}"
|
||||
)
|
||||
await session.rollback()
|
||||
else:
|
||||
raise
|
||||
|
||||
# Build warning message if there were issues
|
||||
warning_parts = []
|
||||
if duplicate_content_count > 0:
|
||||
warning_parts.append(f"{duplicate_content_count} duplicate")
|
||||
if documents_failed > 0:
|
||||
warning_parts.append(f"{documents_failed} failed")
|
||||
warning_message = ", ".join(warning_parts) if warning_parts else None
|
||||
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Successfully completed Gmail indexing via Composio for connector {connector_id}",
|
||||
{
|
||||
"documents_indexed": total_documents_indexed,
|
||||
"documents_skipped": total_documents_skipped,
|
||||
"messages_fetched": total_messages_fetched,
|
||||
"documents_indexed": documents_indexed,
|
||||
"documents_skipped": documents_skipped,
|
||||
"documents_failed": documents_failed,
|
||||
"duplicate_content_count": duplicate_content_count,
|
||||
},
|
||||
)
|
||||
|
||||
return total_documents_indexed, None
|
||||
logger.info(
|
||||
f"Composio Gmail indexing completed: {documents_indexed} ready, "
|
||||
f"{documents_skipped} skipped, {documents_failed} failed "
|
||||
f"({duplicate_content_count} duplicate content)"
|
||||
)
|
||||
return documents_indexed, warning_message
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to index Gmail via Composio: {e!s}", exc_info=True)
|
||||
|
|
|
|||
|
|
@ -16,13 +16,14 @@ from sqlalchemy.orm import selectinload
|
|||
|
||||
from app.config import config
|
||||
from app.connectors.composio_connector import ComposioConnector
|
||||
from app.db import Document, DocumentType
|
||||
from app.db import Document, DocumentStatus, DocumentType
|
||||
from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.tasks.connector_indexers.base import (
|
||||
calculate_date_range,
|
||||
check_duplicate_document_by_hash,
|
||||
safe_set_chunks,
|
||||
)
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
|
|
@ -266,18 +267,20 @@ async def index_composio_google_calendar(
|
|||
|
||||
documents_indexed = 0
|
||||
documents_skipped = 0
|
||||
documents_failed = 0 # Track events that failed processing
|
||||
duplicate_content_count = (
|
||||
0 # Track events skipped due to duplicate content_hash
|
||||
)
|
||||
last_heartbeat_time = time.time()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 1: Analyze all events, create pending documents
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
# =======================================================================
|
||||
events_to_process = [] # List of dicts with document and event data
|
||||
new_documents_created = False
|
||||
|
||||
for event in events:
|
||||
# Send heartbeat periodically to indicate task is still alive
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
try:
|
||||
# Handle both standard Google API and potential Composio variations
|
||||
event_id = event.get("id", "") or event.get("eventId", "")
|
||||
|
|
@ -315,61 +318,28 @@ async def index_composio_google_calendar(
|
|||
|
||||
if existing_document:
|
||||
if existing_document.content_hash == content_hash:
|
||||
# Ensure status is ready (might have been stuck in processing/pending)
|
||||
if not DocumentStatus.is_state(
|
||||
existing_document.status, DocumentStatus.READY
|
||||
):
|
||||
existing_document.status = DocumentStatus.ready()
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Update existing
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
# Queue existing document for update (will be set to processing in Phase 2)
|
||||
events_to_process.append(
|
||||
{
|
||||
"document": existing_document,
|
||||
"is_new": False,
|
||||
"markdown_content": markdown_content,
|
||||
"content_hash": content_hash,
|
||||
"event_id": event_id,
|
||||
"summary": summary,
|
||||
"start_time": start_time,
|
||||
"document_type": "Google Calendar Event (Composio)",
|
||||
"end_time": end_time,
|
||||
"location": location,
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
markdown_content, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
summary_content = f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}"
|
||||
if location:
|
||||
summary_content += f"\nLocation: {location}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
chunks = await create_document_chunks(markdown_content)
|
||||
|
||||
existing_document.title = f"Calendar: {summary}"
|
||||
existing_document.content = summary_content
|
||||
existing_document.content_hash = content_hash
|
||||
existing_document.embedding = summary_embedding
|
||||
existing_document.document_metadata = {
|
||||
"event_id": event_id,
|
||||
"summary": summary,
|
||||
"start_time": start_time,
|
||||
"end_time": end_time,
|
||||
"location": location,
|
||||
"connector_id": connector_id,
|
||||
"source": "composio",
|
||||
}
|
||||
existing_document.chunks = chunks
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
|
||||
documents_indexed += 1
|
||||
|
||||
# Batch commit every 10 documents
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed} Google Calendar events processed so far"
|
||||
)
|
||||
await session.commit()
|
||||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
|
|
@ -380,49 +350,19 @@ async def index_composio_google_calendar(
|
|||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
# A document with the same content already exists (likely from standard connector)
|
||||
logger.info(
|
||||
f"Event {summary} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping to avoid duplicate content."
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
duplicate_content_count += 1
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Create new document
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
"event_id": event_id,
|
||||
"summary": summary,
|
||||
"start_time": start_time,
|
||||
"document_type": "Google Calendar Event (Composio)",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
markdown_content, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
summary_content = (
|
||||
f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}"
|
||||
)
|
||||
if location:
|
||||
summary_content += f"\nLocation: {location}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
chunks = await create_document_chunks(markdown_content)
|
||||
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=f"Calendar: {summary}",
|
||||
title=summary,
|
||||
document_type=DocumentType(
|
||||
TOOLKIT_TO_DOCUMENT_TYPE["googlecalendar"]
|
||||
),
|
||||
|
|
@ -436,19 +376,116 @@ async def index_composio_google_calendar(
|
|||
"toolkit_id": "googlecalendar",
|
||||
"source": "composio",
|
||||
},
|
||||
content=summary_content,
|
||||
content_hash=content_hash,
|
||||
content="Pending...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation - safe for async
|
||||
status=DocumentStatus.pending(), # Pending until processing starts
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
session.add(document)
|
||||
new_documents_created = True
|
||||
|
||||
events_to_process.append(
|
||||
{
|
||||
"document": document,
|
||||
"is_new": True,
|
||||
"markdown_content": markdown_content,
|
||||
"content_hash": content_hash,
|
||||
"event_id": event_id,
|
||||
"summary": summary,
|
||||
"start_time": start_time,
|
||||
"end_time": end_time,
|
||||
"location": location,
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Phase 1 for event: {e!s}", exc_info=True)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info(
|
||||
f"Phase 1: Committing {len([e for e in events_to_process if e['is_new']])} pending documents"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each document one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(events_to_process)} documents")
|
||||
|
||||
for item in events_to_process:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
document = item["document"]
|
||||
try:
|
||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
# Heavy processing (LLM, embeddings, chunks)
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata_for_summary = {
|
||||
"event_id": item["event_id"],
|
||||
"summary": item["summary"],
|
||||
"start_time": item["start_time"],
|
||||
"document_type": "Google Calendar Event (Composio)",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
item["markdown_content"],
|
||||
user_llm,
|
||||
document_metadata_for_summary,
|
||||
)
|
||||
else:
|
||||
summary_content = f"Calendar: {item['summary']}\n\nStart: {item['start_time']}\nEnd: {item['end_time']}"
|
||||
if item["location"]:
|
||||
summary_content += f"\nLocation: {item['location']}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
chunks = await create_document_chunks(item["markdown_content"])
|
||||
|
||||
# Update document to READY with actual content
|
||||
document.title = item["summary"]
|
||||
document.content = summary_content
|
||||
document.content_hash = item["content_hash"]
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = {
|
||||
"event_id": item["event_id"],
|
||||
"summary": item["summary"],
|
||||
"start_time": item["start_time"],
|
||||
"end_time": item["end_time"],
|
||||
"location": item["location"],
|
||||
"connector_id": connector_id,
|
||||
"source": "composio",
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
documents_indexed += 1
|
||||
|
||||
# Batch commit every 10 documents
|
||||
# Batch commit every 10 documents (for ready status updates)
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed} Google Calendar events processed so far"
|
||||
|
|
@ -457,7 +494,15 @@ async def index_composio_google_calendar(
|
|||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing Calendar event: {e!s}", exc_info=True)
|
||||
documents_skipped += 1
|
||||
# Mark document as failed with reason (visible in UI)
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(e))
|
||||
document.updated_at = get_current_timestamp()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
|
||||
|
|
@ -490,10 +535,13 @@ async def index_composio_google_calendar(
|
|||
else:
|
||||
raise
|
||||
|
||||
# Build warning message if duplicates were found
|
||||
warning_message = None
|
||||
# Build warning message if there were issues
|
||||
warning_parts = []
|
||||
if duplicate_content_count > 0:
|
||||
warning_message = f"{duplicate_content_count} skipped (duplicate)"
|
||||
warning_parts.append(f"{duplicate_content_count} duplicate")
|
||||
if documents_failed > 0:
|
||||
warning_parts.append(f"{documents_failed} failed")
|
||||
warning_message = ", ".join(warning_parts) if warning_parts else None
|
||||
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
|
|
@ -501,13 +549,15 @@ async def index_composio_google_calendar(
|
|||
{
|
||||
"documents_indexed": documents_indexed,
|
||||
"documents_skipped": documents_skipped,
|
||||
"documents_failed": documents_failed,
|
||||
"duplicate_content_count": duplicate_content_count,
|
||||
},
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Composio Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped "
|
||||
f"({duplicate_content_count} due to duplicate content from other connectors)"
|
||||
f"Composio Google Calendar indexing completed: {documents_indexed} ready, "
|
||||
f"{documents_skipped} skipped, {documents_failed} failed "
|
||||
f"({duplicate_content_count} duplicate content)"
|
||||
)
|
||||
return documents_indexed, warning_message
|
||||
|
||||
|
|
|
|||
|
|
@ -21,10 +21,14 @@ from sqlalchemy.orm.attributes import flag_modified
|
|||
|
||||
from app.config import config
|
||||
from app.connectors.composio_connector import ComposioConnector
|
||||
from app.db import Document, DocumentType, Log
|
||||
from app.db import Document, DocumentStatus, DocumentType, Log
|
||||
from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.tasks.connector_indexers.base import (
|
||||
check_duplicate_document_by_hash,
|
||||
safe_set_chunks,
|
||||
)
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
generate_content_hash,
|
||||
|
|
@ -537,22 +541,6 @@ async def check_document_by_unique_identifier(
|
|||
return existing_doc_result.scalars().first()
|
||||
|
||||
|
||||
async def check_document_by_content_hash(
    session: AsyncSession, content_hash: str
) -> Document | None:
    """Look up a document by its content hash.

    Selects ``Document`` rows whose ``content_hash`` column equals the given
    value and returns the first match, or ``None`` when nothing matches.
    The query filters on the hash only, so a match may have been indexed by
    any connector; callers use this to avoid indexing duplicate content.

    Args:
        session: Async database session used to run the query.
        content_hash: Hash value to match against ``Document.content_hash``.

    Returns:
        The first matching ``Document``, or ``None`` if no row matches.
    """
    # Function-scope import, kept local to this helper.
    from sqlalchemy.future import select

    hash_query = select(Document).where(Document.content_hash == content_hash)
    query_result = await session.execute(hash_query)
    matching_documents = query_result.scalars()
    return matching_documents.first()
|
||||
|
||||
|
||||
async def check_document_by_google_drive_file_id(
|
||||
session: AsyncSession, file_id: str, search_space_id: int
|
||||
) -> Document | None:
|
||||
|
|
@ -843,14 +831,16 @@ async def _index_composio_drive_delta_sync(
|
|||
log_entry,
|
||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||
) -> tuple[int, int, list[str]]:
|
||||
"""Index Google Drive files using delta sync (only changed files).
|
||||
"""Index Google Drive files using delta sync with real-time document status updates.
|
||||
|
||||
Uses GOOGLEDRIVE_LIST_CHANGES to fetch only files that changed since last sync.
|
||||
Handles: new files, modified files, and deleted files.
|
||||
"""
|
||||
documents_indexed = 0
|
||||
documents_skipped = 0
|
||||
documents_failed = 0
|
||||
processing_errors = []
|
||||
duplicate_content_count = 0
|
||||
last_heartbeat_time = time.time()
|
||||
|
||||
# Fetch all changes with pagination
|
||||
|
|
@ -881,14 +871,13 @@ async def _index_composio_drive_delta_sync(
|
|||
|
||||
logger.info(f"Processing {len(all_changes)} changes from delta sync")
|
||||
|
||||
for change in all_changes[:max_items]:
|
||||
# Send heartbeat periodically to indicate task is still alive
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
# =======================================================================
|
||||
# PHASE 1: Analyze all changes, handle deletions, create pending documents
|
||||
# =======================================================================
|
||||
files_to_process = []
|
||||
new_documents_created = False
|
||||
|
||||
for change in all_changes[:max_items]:
|
||||
try:
|
||||
# Handle removed files
|
||||
is_removed = change.get("removed", False)
|
||||
|
|
@ -899,9 +888,8 @@ async def _index_composio_drive_delta_sync(
|
|||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Check if file was trashed or removed
|
||||
# Check if file was trashed or removed - handle deletions immediately
|
||||
if is_removed or file_info.get("trashed", False):
|
||||
# Remove document from database
|
||||
document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"])
|
||||
unique_identifier_hash = generate_unique_identifier_hash(
|
||||
document_type, f"drive_{file_id}", search_space_id
|
||||
|
|
@ -923,37 +911,233 @@ async def _index_composio_drive_delta_sync(
|
|||
if mime_type == "application/vnd.google-apps.folder":
|
||||
continue
|
||||
|
||||
# Process the file
|
||||
indexed, skipped, errors = await _process_single_drive_file(
|
||||
session=session,
|
||||
composio_connector=composio_connector,
|
||||
file_id=file_id,
|
||||
file_name=file_name,
|
||||
mime_type=mime_type,
|
||||
connector_id=connector_id,
|
||||
search_space_id=search_space_id,
|
||||
user_id=user_id,
|
||||
task_logger=task_logger,
|
||||
log_entry=log_entry,
|
||||
# Check for existing document by file ID (from any connector)
|
||||
existing_by_file_id = await check_document_by_google_drive_file_id(
|
||||
session, file_id, search_space_id
|
||||
)
|
||||
|
||||
documents_indexed += indexed
|
||||
documents_skipped += skipped
|
||||
processing_errors.extend(errors)
|
||||
# Generate unique identifier hash
|
||||
document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"])
|
||||
unique_identifier_hash = generate_unique_identifier_hash(
|
||||
document_type, f"drive_{file_id}", search_space_id
|
||||
)
|
||||
|
||||
# Check if document exists by unique identifier
|
||||
existing_document = await check_document_by_unique_identifier(
|
||||
session, unique_identifier_hash
|
||||
)
|
||||
|
||||
if existing_by_file_id and not existing_document:
|
||||
# File already indexed by different connector - skip
|
||||
logger.info(
|
||||
f"Skipping file {file_name} (file_id={file_id}): already indexed "
|
||||
f"by {existing_by_file_id.document_type.value}"
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
if existing_document:
|
||||
# Queue existing document for update
|
||||
files_to_process.append(
|
||||
{
|
||||
"document": existing_document,
|
||||
"is_new": False,
|
||||
"file_id": file_id,
|
||||
"file_name": file_name,
|
||||
"mime_type": mime_type,
|
||||
}
|
||||
)
|
||||
continue
|
||||
|
||||
# Create new document with PENDING status
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=file_name,
|
||||
document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]),
|
||||
document_metadata={
|
||||
"file_id": file_id,
|
||||
"file_name": file_name,
|
||||
"FILE_NAME": file_name,
|
||||
"mime_type": mime_type,
|
||||
"connector_id": connector_id,
|
||||
"toolkit_id": "googledrive",
|
||||
"source": "composio",
|
||||
},
|
||||
content="Pending...",
|
||||
content_hash=unique_identifier_hash,
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=None,
|
||||
chunks=[],
|
||||
status=DocumentStatus.pending(),
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
session.add(document)
|
||||
new_documents_created = True
|
||||
|
||||
files_to_process.append(
|
||||
{
|
||||
"document": document,
|
||||
"is_new": True,
|
||||
"file_id": file_id,
|
||||
"file_name": file_name,
|
||||
"mime_type": mime_type,
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Phase 1 for change: {e!s}", exc_info=True)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info(
|
||||
f"Phase 1: Committing {len([f for f in files_to_process if f['is_new']])} pending documents"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each document one by one
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(files_to_process)} documents")
|
||||
|
||||
for item in files_to_process:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
document = item["document"]
|
||||
try:
|
||||
# Set to PROCESSING and commit
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
# Get file content
|
||||
content, content_error = await composio_connector.get_drive_file_content(
|
||||
item["file_id"], original_mime_type=item["mime_type"]
|
||||
)
|
||||
|
||||
if content_error or not content:
|
||||
logger.warning(
|
||||
f"Could not get content for file {item['file_name']}: {content_error}"
|
||||
)
|
||||
markdown_content = f"# {item['file_name']}\n\n"
|
||||
markdown_content += f"**File ID:** {item['file_id']}\n"
|
||||
markdown_content += f"**Type:** {item['mime_type']}\n"
|
||||
elif isinstance(content, dict):
|
||||
error_msg = f"Unexpected dict content format for file {item['file_name']}: {list(content.keys())}"
|
||||
logger.error(error_msg)
|
||||
processing_errors.append(error_msg)
|
||||
markdown_content = f"# {item['file_name']}\n\n"
|
||||
markdown_content += f"**File ID:** {item['file_id']}\n"
|
||||
markdown_content += f"**Type:** {item['mime_type']}\n"
|
||||
else:
|
||||
markdown_content = await _process_file_content(
|
||||
content=content,
|
||||
file_name=item["file_name"],
|
||||
file_id=item["file_id"],
|
||||
mime_type=item["mime_type"],
|
||||
search_space_id=search_space_id,
|
||||
user_id=user_id,
|
||||
session=session,
|
||||
task_logger=task_logger,
|
||||
log_entry=log_entry,
|
||||
processing_errors=processing_errors,
|
||||
)
|
||||
|
||||
content_hash = generate_content_hash(markdown_content, search_space_id)
|
||||
|
||||
# For existing documents, check if content changed
|
||||
if not item["is_new"] and document.content_hash == content_hash:
|
||||
if not DocumentStatus.is_state(document.status, DocumentStatus.READY):
|
||||
document.status = DocumentStatus.ready()
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Check for duplicate content hash (for new documents)
|
||||
if item["is_new"]:
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
f"File {item['file_name']} already indexed by another connector. Skipping."
|
||||
)
|
||||
await session.delete(document)
|
||||
duplicate_content_count += 1
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Heavy processing (LLM, embeddings, chunks)
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata_for_summary = {
|
||||
"file_id": item["file_id"],
|
||||
"file_name": item["file_name"],
|
||||
"mime_type": item["mime_type"],
|
||||
"document_type": "Google Drive File (Composio)",
|
||||
}
|
||||
summary_content, summary_embedding = await generate_document_summary(
|
||||
markdown_content, user_llm, document_metadata_for_summary
|
||||
)
|
||||
else:
|
||||
summary_content = f"Google Drive File: {item['file_name']}\n\nType: {item['mime_type']}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
chunks = await create_document_chunks(markdown_content)
|
||||
|
||||
# Update document to READY
|
||||
document.title = item["file_name"]
|
||||
document.content = summary_content
|
||||
document.content_hash = content_hash
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = {
|
||||
"file_id": item["file_id"],
|
||||
"file_name": item["file_name"],
|
||||
"FILE_NAME": item["file_name"],
|
||||
"mime_type": item["mime_type"],
|
||||
"connector_id": connector_id,
|
||||
"source": "composio",
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
documents_indexed += 1
|
||||
|
||||
# Batch commit every 10 documents
|
||||
if documents_indexed > 0 and documents_indexed % 10 == 0:
|
||||
if documents_indexed % 10 == 0:
|
||||
await session.commit()
|
||||
logger.info(f"Committed batch: {documents_indexed} changes processed")
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Error processing change for file {file_id}: {e!s}"
|
||||
error_msg = f"Error processing change for file {item['file_id']}: {e!s}"
|
||||
logger.error(error_msg, exc_info=True)
|
||||
processing_errors.append(error_msg)
|
||||
documents_skipped += 1
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(e))
|
||||
document.updated_at = get_current_timestamp()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
logger.info(
|
||||
f"Delta sync complete: {documents_indexed} indexed, {documents_skipped} skipped"
|
||||
f"Delta sync complete: {documents_indexed} indexed, {documents_skipped} skipped, "
|
||||
f"{documents_failed} failed ({duplicate_content_count} duplicate content)"
|
||||
)
|
||||
return documents_indexed, documents_skipped, processing_errors
|
||||
|
||||
|
|
@ -973,10 +1157,12 @@ async def _index_composio_drive_full_scan(
|
|||
log_entry,
|
||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||
) -> tuple[int, int, list[str]]:
|
||||
"""Index Google Drive files using full scan (first sync or when no delta token)."""
|
||||
"""Index Google Drive files using full scan with real-time document status updates."""
|
||||
documents_indexed = 0
|
||||
documents_skipped = 0
|
||||
documents_failed = 0
|
||||
processing_errors = []
|
||||
duplicate_content_count = 0
|
||||
last_heartbeat_time = time.time()
|
||||
|
||||
all_files = []
|
||||
|
|
@ -1108,14 +1294,14 @@ async def _index_composio_drive_full_scan(
|
|||
f"Found {len(all_files)} Google Drive files to index via Composio (full scan)"
|
||||
)
|
||||
|
||||
for file_info in all_files:
|
||||
# Send heartbeat periodically to indicate task is still alive
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
# =======================================================================
|
||||
# PHASE 1: Analyze all files, create pending documents
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
# =======================================================================
|
||||
files_to_process = [] # List of dicts with document and file data
|
||||
new_documents_created = False
|
||||
|
||||
for file_info in all_files:
|
||||
try:
|
||||
# Handle both standard Google API and potential Composio variations
|
||||
file_id = file_info.get("id", "") or file_info.get("fileId", "")
|
||||
|
|
@ -1132,227 +1318,242 @@ async def _index_composio_drive_full_scan(
|
|||
if mime_type == "application/vnd.google-apps.folder":
|
||||
continue
|
||||
|
||||
# Process the file
|
||||
indexed, skipped, errors = await _process_single_drive_file(
|
||||
session=session,
|
||||
composio_connector=composio_connector,
|
||||
file_id=file_id,
|
||||
file_name=file_name,
|
||||
mime_type=mime_type,
|
||||
connector_id=connector_id,
|
||||
search_space_id=search_space_id,
|
||||
user_id=user_id,
|
||||
task_logger=task_logger,
|
||||
log_entry=log_entry,
|
||||
# ========== EARLY DUPLICATE CHECK BY FILE ID ==========
|
||||
existing_by_file_id = await check_document_by_google_drive_file_id(
|
||||
session, file_id, search_space_id
|
||||
)
|
||||
if existing_by_file_id:
|
||||
logger.info(
|
||||
f"Skipping file {file_name} (file_id={file_id}): already indexed "
|
||||
f"by {existing_by_file_id.document_type.value}"
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Generate unique identifier hash
|
||||
document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"])
|
||||
unique_identifier_hash = generate_unique_identifier_hash(
|
||||
document_type, f"drive_{file_id}", search_space_id
|
||||
)
|
||||
|
||||
documents_indexed += indexed
|
||||
documents_skipped += skipped
|
||||
processing_errors.extend(errors)
|
||||
# Check if document exists by unique identifier
|
||||
existing_document = await check_document_by_unique_identifier(
|
||||
session, unique_identifier_hash
|
||||
)
|
||||
|
||||
if existing_document:
|
||||
# Queue existing document for update (will be set to processing in Phase 2)
|
||||
files_to_process.append(
|
||||
{
|
||||
"document": existing_document,
|
||||
"is_new": False,
|
||||
"file_id": file_id,
|
||||
"file_name": file_name,
|
||||
"mime_type": mime_type,
|
||||
}
|
||||
)
|
||||
continue
|
||||
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=file_name,
|
||||
document_type=DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"]),
|
||||
document_metadata={
|
||||
"file_id": file_id,
|
||||
"file_name": file_name,
|
||||
"FILE_NAME": file_name,
|
||||
"mime_type": mime_type,
|
||||
"connector_id": connector_id,
|
||||
"toolkit_id": "googledrive",
|
||||
"source": "composio",
|
||||
},
|
||||
content="Pending...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation - safe for async
|
||||
status=DocumentStatus.pending(), # Pending until processing starts
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
session.add(document)
|
||||
new_documents_created = True
|
||||
|
||||
files_to_process.append(
|
||||
{
|
||||
"document": document,
|
||||
"is_new": True,
|
||||
"file_id": file_id,
|
||||
"file_name": file_name,
|
||||
"mime_type": mime_type,
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Phase 1 for file: {e!s}", exc_info=True)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info(
|
||||
f"Phase 1: Committing {len([f for f in files_to_process if f['is_new']])} pending documents"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each document one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(files_to_process)} documents")
|
||||
|
||||
for item in files_to_process:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
document = item["document"]
|
||||
try:
|
||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
# Get file content (pass mime_type for Google Workspace export handling)
|
||||
content, content_error = await composio_connector.get_drive_file_content(
|
||||
item["file_id"], original_mime_type=item["mime_type"]
|
||||
)
|
||||
|
||||
if content_error or not content:
|
||||
logger.warning(
|
||||
f"Could not get content for file {item['file_name']}: {content_error}"
|
||||
)
|
||||
markdown_content = f"# {item['file_name']}\n\n"
|
||||
markdown_content += f"**File ID:** {item['file_id']}\n"
|
||||
markdown_content += f"**Type:** {item['mime_type']}\n"
|
||||
elif isinstance(content, dict):
|
||||
error_msg = f"Unexpected dict content format for file {item['file_name']}: {list(content.keys())}"
|
||||
logger.error(error_msg)
|
||||
processing_errors.append(error_msg)
|
||||
markdown_content = f"# {item['file_name']}\n\n"
|
||||
markdown_content += f"**File ID:** {item['file_id']}\n"
|
||||
markdown_content += f"**Type:** {item['mime_type']}\n"
|
||||
else:
|
||||
# Process content based on file type
|
||||
markdown_content = await _process_file_content(
|
||||
content=content,
|
||||
file_name=item["file_name"],
|
||||
file_id=item["file_id"],
|
||||
mime_type=item["mime_type"],
|
||||
search_space_id=search_space_id,
|
||||
user_id=user_id,
|
||||
session=session,
|
||||
task_logger=task_logger,
|
||||
log_entry=log_entry,
|
||||
processing_errors=processing_errors,
|
||||
)
|
||||
|
||||
content_hash = generate_content_hash(markdown_content, search_space_id)
|
||||
|
||||
# For existing documents, check if content changed
|
||||
if not item["is_new"] and document.content_hash == content_hash:
|
||||
# Ensure status is ready
|
||||
if not DocumentStatus.is_state(document.status, DocumentStatus.READY):
|
||||
document.status = DocumentStatus.ready()
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Check for duplicate content hash (for new documents)
|
||||
if item["is_new"]:
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
f"File {item['file_name']} already indexed by another connector. Skipping."
|
||||
)
|
||||
# Remove the pending document we created
|
||||
await session.delete(document)
|
||||
duplicate_content_count += 1
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Heavy processing (LLM, embeddings, chunks)
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata_for_summary = {
|
||||
"file_id": item["file_id"],
|
||||
"file_name": item["file_name"],
|
||||
"mime_type": item["mime_type"],
|
||||
"document_type": "Google Drive File (Composio)",
|
||||
}
|
||||
summary_content, summary_embedding = await generate_document_summary(
|
||||
markdown_content, user_llm, document_metadata_for_summary
|
||||
)
|
||||
else:
|
||||
summary_content = f"Google Drive File: {item['file_name']}\n\nType: {item['mime_type']}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
chunks = await create_document_chunks(markdown_content)
|
||||
|
||||
# Update document to READY with actual content
|
||||
document.title = item["file_name"]
|
||||
document.content = summary_content
|
||||
document.content_hash = content_hash
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = {
|
||||
"file_id": item["file_id"],
|
||||
"file_name": item["file_name"],
|
||||
"FILE_NAME": item["file_name"],
|
||||
"mime_type": item["mime_type"],
|
||||
"connector_id": connector_id,
|
||||
"source": "composio",
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
documents_indexed += 1
|
||||
|
||||
# Batch commit every 10 documents
|
||||
if documents_indexed > 0 and documents_indexed % 10 == 0:
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed} Google Drive files processed so far"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Error processing Drive file {file_name or 'unknown'}: {e!s}"
|
||||
error_msg = f"Error processing Drive file {item['file_name']}: {e!s}"
|
||||
logger.error(error_msg, exc_info=True)
|
||||
processing_errors.append(error_msg)
|
||||
documents_skipped += 1
|
||||
# Mark document as failed with reason (visible in UI)
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(e))
|
||||
document.updated_at = get_current_timestamp()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
logger.info(
|
||||
f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped"
|
||||
f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped, "
|
||||
f"{documents_failed} failed ({duplicate_content_count} duplicate content)"
|
||||
)
|
||||
return documents_indexed, documents_skipped, processing_errors
|
||||
|
||||
|
||||
async def _process_single_drive_file(
    session: AsyncSession,
    composio_connector: ComposioGoogleDriveConnector,
    file_id: str,
    file_name: str,
    mime_type: str,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    task_logger: TaskLoggingService,
    log_entry,
) -> tuple[int, int, list[str]]:
    """Index one Google Drive file, creating or refreshing its document.

    The file is skipped (counted as skipped, nothing written) when:
      * a document with the same Drive file id already exists,
      * the document found by unique identifier has an unchanged content hash,
      * no document exists for this file but another document already has the
        same content hash.

    Otherwise the content is summarised, embedded and chunked, and the result
    is written either onto the existing document or into a new ``Document``
    that is added to ``session``. This function does not commit.

    Returns:
        Tuple of (documents_indexed, documents_skipped, processing_errors)
    """
    processing_errors = []

    # Look the file id up first so that an already-indexed file costs neither
    # a download nor any content processing.
    already_indexed = await check_document_by_google_drive_file_id(
        session, file_id, search_space_id
    )
    if already_indexed:
        logger.info(
            f"Skipping file {file_name} (file_id={file_id}): already indexed "
            f"by {already_indexed.document_type.value} as '{already_indexed.title}' "
            f"(saved download & ETL cost)"
        )
        return 0, 1, processing_errors

    document_type = DocumentType(TOOLKIT_TO_DOCUMENT_TYPE["googledrive"])
    unique_identifier_hash = generate_unique_identifier_hash(
        document_type, f"drive_{file_id}", search_space_id
    )
    existing_document = await check_document_by_unique_identifier(
        session, unique_identifier_hash
    )
    is_update = bool(existing_document)

    # mime_type is forwarded so the connector can handle Google Workspace exports.
    content, content_error = await composio_connector.get_drive_file_content(
        file_id, original_mime_type=mime_type
    )

    # Metadata-only markdown used whenever the real content is unusable.
    metadata_markdown = (
        f"# {file_name}\n\n"
        f"**File ID:** {file_id}\n"
        f"**Type:** {mime_type}\n"
    )

    if content_error or not content:
        logger.warning(f"Could not get content for file {file_name}: {content_error}")
        markdown_content = metadata_markdown
    elif isinstance(content, dict):
        # A dict here means the payload was never unwrapped; record it and
        # fall back to metadata instead of failing the file.
        error_msg = f"Unexpected dict content format for file {file_name}: {list(content.keys())}"
        logger.error(error_msg)
        processing_errors.append(error_msg)
        markdown_content = metadata_markdown
    else:
        markdown_content = await _process_file_content(
            content=content,
            file_name=file_name,
            file_id=file_id,
            mime_type=mime_type,
            search_space_id=search_space_id,
            user_id=user_id,
            session=session,
            task_logger=task_logger,
            log_entry=log_entry,
            processing_errors=processing_errors,
        )

    content_hash = generate_content_hash(markdown_content, search_space_id)

    if is_update:
        if existing_document.content_hash == content_hash:
            return 0, 1, processing_errors  # Skipped - unchanged
    else:
        # Identical content from any connector is skipped before a new row is built.
        duplicate = await check_document_by_content_hash(session, content_hash)
        if duplicate:
            logger.info(
                f"Skipping file {file_name} (file_id={file_id}): identical content "
                f"already indexed as '{duplicate.title}'"
            )
            return 0, 1, processing_errors  # Skipped - duplicate content

    # Summary + embedding: LLM-generated when the user has a long-context LLM
    # configured, otherwise a minimal static summary embedded directly.
    user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
    if user_llm:
        summary_metadata = {
            "file_id": file_id,
            "file_name": file_name,
            "mime_type": mime_type,
            "document_type": "Google Drive File (Composio)",
        }
        summary_content, summary_embedding = await generate_document_summary(
            markdown_content, user_llm, summary_metadata
        )
    else:
        summary_content = f"Google Drive File: {file_name}\n\nType: {mime_type}"
        summary_embedding = config.embedding_model_instance.embed(summary_content)

    chunks = await create_document_chunks(markdown_content)

    if is_update:
        existing_document.title = f"Drive: {file_name}"
        existing_document.content = summary_content
        existing_document.content_hash = content_hash
        existing_document.embedding = summary_embedding
        existing_document.document_metadata = {
            "file_id": file_id,
            "file_name": file_name,
            "FILE_NAME": file_name,  # For compatibility
            "mime_type": mime_type,
            "connector_id": connector_id,
            "source": "composio",
        }
        existing_document.chunks = chunks
        existing_document.updated_at = get_current_timestamp()
        return 1, 0, processing_errors  # Indexed - updated

    session.add(
        Document(
            search_space_id=search_space_id,
            title=f"Drive: {file_name}",
            document_type=document_type,
            document_metadata={
                "file_id": file_id,
                "file_name": file_name,
                "FILE_NAME": file_name,  # For compatibility
                "mime_type": mime_type,
                "toolkit_id": "googledrive",
                "source": "composio",
            },
            content=summary_content,
            content_hash=content_hash,
            unique_identifier_hash=unique_identifier_hash,
            embedding=summary_embedding,
            chunks=chunks,
            updated_at=get_current_timestamp(),
            created_by_id=user_id,
            connector_id=connector_id,
        )
    )
    return 1, 0, processing_errors  # Indexed - new
|
||||
|
||||
|
||||
async def _fetch_folder_files_recursively(
|
||||
composio_connector: ComposioGoogleDriveConnector,
|
||||
folder_id: str,
|
||||
|
|
|
|||
|
|
@ -100,6 +100,83 @@ class PodcastStatus(str, Enum):
|
|||
FAILED = "failed"
|
||||
|
||||
|
||||
class DocumentStatus:
|
||||
"""
|
||||
Helper class for document processing status (stored as JSONB).
|
||||
|
||||
Status values:
|
||||
- {"state": "ready"} - Document is fully processed and searchable
|
||||
- {"state": "pending"} - Document is queued, waiting to be processed
|
||||
- {"state": "processing"} - Document is currently being processed (only 1 at a time)
|
||||
- {"state": "failed", "reason": "..."} - Processing failed with reason
|
||||
|
||||
Usage:
|
||||
document.status = DocumentStatus.pending()
|
||||
document.status = DocumentStatus.processing()
|
||||
document.status = DocumentStatus.ready()
|
||||
document.status = DocumentStatus.failed("LLM rate limit exceeded")
|
||||
"""
|
||||
|
||||
# State constants
|
||||
READY = "ready"
|
||||
PENDING = "pending"
|
||||
PROCESSING = "processing"
|
||||
FAILED = "failed"
|
||||
|
||||
@staticmethod
|
||||
def ready() -> dict:
|
||||
"""Return status dict for a ready/searchable document."""
|
||||
return {"state": DocumentStatus.READY}
|
||||
|
||||
@staticmethod
|
||||
def pending() -> dict:
|
||||
"""Return status dict for a document waiting to be processed."""
|
||||
return {"state": DocumentStatus.PENDING}
|
||||
|
||||
@staticmethod
|
||||
def processing() -> dict:
|
||||
"""Return status dict for a document being processed."""
|
||||
return {"state": DocumentStatus.PROCESSING}
|
||||
|
||||
@staticmethod
|
||||
def failed(reason: str, **extra_details) -> dict:
|
||||
"""
|
||||
Return status dict for a failed document.
|
||||
|
||||
Args:
|
||||
reason: Human-readable failure reason
|
||||
**extra_details: Optional additional details (duplicate_of, error_code, etc.)
|
||||
"""
|
||||
status = {
|
||||
"state": DocumentStatus.FAILED,
|
||||
"reason": reason[:500],
|
||||
} # Truncate long reasons
|
||||
if extra_details:
|
||||
status.update(extra_details)
|
||||
return status
|
||||
|
||||
@staticmethod
|
||||
def get_state(status: dict | None) -> str | None:
|
||||
"""Extract state from status dict, returns None if invalid."""
|
||||
if status is None:
|
||||
return None
|
||||
return status.get("state") if isinstance(status, dict) else None
|
||||
|
||||
@staticmethod
|
||||
def is_state(status: dict | None, state: str) -> bool:
|
||||
"""Check if status matches a given state."""
|
||||
return DocumentStatus.get_state(status) == state
|
||||
|
||||
@staticmethod
|
||||
def get_failure_reason(status: dict | None) -> str | None:
|
||||
"""Extract failure reason from status dict."""
|
||||
if status is None or not isinstance(status, dict):
|
||||
return None
|
||||
if status.get("state") == DocumentStatus.FAILED:
|
||||
return status.get("reason")
|
||||
return None
|
||||
|
||||
|
||||
class LiteLLMProvider(str, Enum):
|
||||
"""
|
||||
Enum for LLM providers supported by LiteLLM.
|
||||
|
|
@ -813,6 +890,17 @@ class Document(BaseModel, TimestampMixin):
|
|||
index=True,
|
||||
)
|
||||
|
||||
# Processing status for real-time visibility (JSONB)
|
||||
# Format: {"state": "ready"}, {"state": "pending"}, {"state": "processing"}, or {"state": "failed", "reason": "..."}
|
||||
# Default to {"state": "ready"} for backward compatibility with existing documents
|
||||
status = Column(
|
||||
JSONB,
|
||||
nullable=False,
|
||||
default=DocumentStatus.ready,
|
||||
server_default=text('\'{"state": "ready"}\'::jsonb'),
|
||||
index=True,
|
||||
)
|
||||
|
||||
# Relationships
|
||||
search_space = relationship("SearchSpace", back_populates="documents")
|
||||
created_by = relationship("User", back_populates="documents")
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ from app.db import (
|
|||
from app.schemas import (
|
||||
DocumentRead,
|
||||
DocumentsCreate,
|
||||
DocumentStatusSchema,
|
||||
DocumentTitleRead,
|
||||
DocumentTitleSearchResponse,
|
||||
DocumentUpdate,
|
||||
|
|
@ -112,9 +113,23 @@ async def create_documents_file_upload(
|
|||
user: User = Depends(current_active_user),
|
||||
):
|
||||
"""
|
||||
Upload files as documents.
|
||||
Upload files as documents with real-time status tracking.
|
||||
|
||||
Implements 2-phase document status updates for real-time UI feedback:
|
||||
- Phase 1: Create all documents with 'pending' status (visible in UI immediately via ElectricSQL)
|
||||
- Phase 2: Celery processes each file: pending → processing → ready/failed
|
||||
|
||||
Requires DOCUMENTS_CREATE permission.
|
||||
"""
|
||||
from datetime import datetime
|
||||
|
||||
from app.db import DocumentStatus
|
||||
from app.tasks.document_processors.base import (
|
||||
check_document_by_unique_identifier,
|
||||
get_current_timestamp,
|
||||
)
|
||||
from app.utils.document_converters import generate_unique_identifier_hash
|
||||
|
||||
try:
|
||||
# Check permission
|
||||
await check_permission(
|
||||
|
|
@ -128,38 +143,105 @@ async def create_documents_file_upload(
|
|||
if not files:
|
||||
raise HTTPException(status_code=400, detail="No files provided")
|
||||
|
||||
created_documents: list[Document] = []
|
||||
files_to_process: list[
|
||||
tuple[Document, str, str]
|
||||
] = [] # (document, temp_path, filename)
|
||||
skipped_duplicates = 0
|
||||
|
||||
# ===== PHASE 1: Create pending documents for all files =====
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
for file in files:
|
||||
try:
|
||||
# Save file to a temporary location to avoid stream issues
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
# Create temp file
|
||||
# Save file to temp location
|
||||
with tempfile.NamedTemporaryFile(
|
||||
delete=False, suffix=os.path.splitext(file.filename)[1]
|
||||
delete=False, suffix=os.path.splitext(file.filename or "")[1]
|
||||
) as temp_file:
|
||||
temp_path = temp_file.name
|
||||
|
||||
# Write uploaded file to temp file
|
||||
content = await file.read()
|
||||
with open(temp_path, "wb") as f:
|
||||
f.write(content)
|
||||
|
||||
from app.tasks.celery_tasks.document_tasks import (
|
||||
process_file_upload_task,
|
||||
file_size = len(content)
|
||||
|
||||
# Generate unique identifier for deduplication check
|
||||
unique_identifier_hash = generate_unique_identifier_hash(
|
||||
DocumentType.FILE, file.filename or "unknown", search_space_id
|
||||
)
|
||||
|
||||
process_file_upload_task.delay(
|
||||
temp_path, file.filename, search_space_id, str(user.id)
|
||||
# Check if document already exists (by unique identifier)
|
||||
existing = await check_document_by_unique_identifier(
|
||||
session, unique_identifier_hash
|
||||
)
|
||||
if existing:
|
||||
# Clean up temp file for duplicates
|
||||
os.unlink(temp_path)
|
||||
skipped_duplicates += 1
|
||||
continue
|
||||
|
||||
# Create pending document (visible immediately in UI via ElectricSQL)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=file.filename or "Uploaded File",
|
||||
document_type=DocumentType.FILE,
|
||||
document_metadata={
|
||||
"FILE_NAME": file.filename,
|
||||
"file_size": file_size,
|
||||
"upload_time": datetime.now().isoformat(),
|
||||
},
|
||||
content="Processing...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary, updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=None,
|
||||
status=DocumentStatus.pending(), # Shows "pending" in UI
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=str(user.id),
|
||||
)
|
||||
session.add(document)
|
||||
created_documents.append(document)
|
||||
files_to_process.append(
|
||||
(document, temp_path, file.filename or "unknown")
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=422,
|
||||
detail=f"Failed to process file {file.filename}: {e!s}",
|
||||
) from e
|
||||
|
||||
await session.commit()
|
||||
return {"message": "Files uploaded for processing"}
|
||||
# Commit all pending documents - they appear in UI immediately via ElectricSQL
|
||||
if created_documents:
|
||||
await session.commit()
|
||||
# Refresh to get generated IDs
|
||||
for doc in created_documents:
|
||||
await session.refresh(doc)
|
||||
|
||||
# ===== PHASE 2: Dispatch Celery tasks for each file =====
|
||||
# Each task will update document status: pending → processing → ready/failed
|
||||
from app.tasks.celery_tasks.document_tasks import (
|
||||
process_file_upload_with_document_task,
|
||||
)
|
||||
|
||||
for document, temp_path, filename in files_to_process:
|
||||
process_file_upload_with_document_task.delay(
|
||||
document_id=document.id,
|
||||
temp_path=temp_path,
|
||||
filename=filename,
|
||||
search_space_id=search_space_id,
|
||||
user_id=str(user.id),
|
||||
)
|
||||
|
||||
return {
|
||||
"message": "Files uploaded for processing",
|
||||
"document_ids": [doc.id for doc in created_documents],
|
||||
"total_files": len(files),
|
||||
"pending_files": len(files_to_process),
|
||||
"skipped_duplicates": skipped_duplicates,
|
||||
}
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
|
|
@ -211,7 +293,11 @@ async def read_documents(
|
|||
Permission.DOCUMENTS_READ.value,
|
||||
"You don't have permission to read documents in this search space",
|
||||
)
|
||||
query = select(Document).filter(Document.search_space_id == search_space_id)
|
||||
query = (
|
||||
select(Document)
|
||||
.options(selectinload(Document.created_by))
|
||||
.filter(Document.search_space_id == search_space_id)
|
||||
)
|
||||
count_query = (
|
||||
select(func.count())
|
||||
.select_from(Document)
|
||||
|
|
@ -221,6 +307,7 @@ async def read_documents(
|
|||
# Get documents from all search spaces user has membership in
|
||||
query = (
|
||||
select(Document)
|
||||
.options(selectinload(Document.created_by))
|
||||
.join(SearchSpace)
|
||||
.join(SearchSpaceMembership)
|
||||
.filter(SearchSpaceMembership.user_id == user.id)
|
||||
|
|
@ -261,6 +348,19 @@ async def read_documents(
|
|||
# Convert database objects to API-friendly format
|
||||
api_documents = []
|
||||
for doc in db_documents:
|
||||
# Get user name (display_name or email fallback)
|
||||
created_by_name = None
|
||||
if doc.created_by:
|
||||
created_by_name = doc.created_by.display_name or doc.created_by.email
|
||||
|
||||
# Parse status from JSONB
|
||||
status_data = None
|
||||
if hasattr(doc, "status") and doc.status:
|
||||
status_data = DocumentStatusSchema(
|
||||
state=doc.status.get("state", "ready"),
|
||||
reason=doc.status.get("reason"),
|
||||
)
|
||||
|
||||
api_documents.append(
|
||||
DocumentRead(
|
||||
id=doc.id,
|
||||
|
|
@ -273,6 +373,9 @@ async def read_documents(
|
|||
created_at=doc.created_at,
|
||||
updated_at=doc.updated_at,
|
||||
search_space_id=doc.search_space_id,
|
||||
created_by_id=doc.created_by_id,
|
||||
created_by_name=created_by_name,
|
||||
status=status_data,
|
||||
)
|
||||
)
|
||||
|
||||
|
|
@ -341,7 +444,11 @@ async def search_documents(
|
|||
Permission.DOCUMENTS_READ.value,
|
||||
"You don't have permission to read documents in this search space",
|
||||
)
|
||||
query = select(Document).filter(Document.search_space_id == search_space_id)
|
||||
query = (
|
||||
select(Document)
|
||||
.options(selectinload(Document.created_by))
|
||||
.filter(Document.search_space_id == search_space_id)
|
||||
)
|
||||
count_query = (
|
||||
select(func.count())
|
||||
.select_from(Document)
|
||||
|
|
@ -351,6 +458,7 @@ async def search_documents(
|
|||
# Get documents from all search spaces user has membership in
|
||||
query = (
|
||||
select(Document)
|
||||
.options(selectinload(Document.created_by))
|
||||
.join(SearchSpace)
|
||||
.join(SearchSpaceMembership)
|
||||
.filter(SearchSpaceMembership.user_id == user.id)
|
||||
|
|
@ -395,6 +503,19 @@ async def search_documents(
|
|||
# Convert database objects to API-friendly format
|
||||
api_documents = []
|
||||
for doc in db_documents:
|
||||
# Get user name (display_name or email fallback)
|
||||
created_by_name = None
|
||||
if doc.created_by:
|
||||
created_by_name = doc.created_by.display_name or doc.created_by.email
|
||||
|
||||
# Parse status from JSONB
|
||||
status_data = None
|
||||
if hasattr(doc, "status") and doc.status:
|
||||
status_data = DocumentStatusSchema(
|
||||
state=doc.status.get("state", "ready"),
|
||||
reason=doc.status.get("reason"),
|
||||
)
|
||||
|
||||
api_documents.append(
|
||||
DocumentRead(
|
||||
id=doc.id,
|
||||
|
|
@ -407,6 +528,9 @@ async def search_documents(
|
|||
created_at=doc.created_at,
|
||||
updated_at=doc.updated_at,
|
||||
search_space_id=doc.search_space_id,
|
||||
created_by_id=doc.created_by_id,
|
||||
created_by_name=created_by_name,
|
||||
status=status_data,
|
||||
)
|
||||
)
|
||||
|
||||
|
|
@ -782,6 +906,7 @@ async def delete_document(
|
|||
"""
|
||||
Delete a document.
|
||||
Requires DOCUMENTS_DELETE permission for the search space.
|
||||
Documents in "pending" or "processing" state cannot be deleted.
|
||||
"""
|
||||
try:
|
||||
result = await session.execute(
|
||||
|
|
@ -794,6 +919,14 @@ async def delete_document(
|
|||
status_code=404, detail=f"Document with id {document_id} not found"
|
||||
)
|
||||
|
||||
# Check if document is pending or currently being processed
|
||||
doc_state = document.status.get("state") if document.status else None
|
||||
if doc_state in ("pending", "processing"):
|
||||
raise HTTPException(
|
||||
status_code=409, # Conflict
|
||||
detail="Cannot delete document while it is pending or being processed. Please wait for processing to complete.",
|
||||
)
|
||||
|
||||
# Check permission for the search space
|
||||
await check_permission(
|
||||
session,
|
||||
|
|
|
|||
|
|
@ -230,6 +230,14 @@ async def delete_note(
|
|||
if not document:
|
||||
raise HTTPException(status_code=404, detail="Note not found")
|
||||
|
||||
# Check if note is pending or currently being processed
|
||||
doc_state = document.status.get("state") if document.status else None
|
||||
if doc_state in ("pending", "processing"):
|
||||
raise HTTPException(
|
||||
status_code=409,
|
||||
detail="Cannot delete note while it is pending or being processed. Please wait for processing to complete.",
|
||||
)
|
||||
|
||||
# Delete document (chunks will be cascade deleted)
|
||||
await session.delete(document)
|
||||
await session.commit()
|
||||
|
|
|
|||
|
|
@ -2127,6 +2127,7 @@ async def run_google_gmail_indexing(
|
|||
start_date: str | None,
|
||||
end_date: str | None,
|
||||
update_last_indexed: bool,
|
||||
on_heartbeat_callback=None,
|
||||
) -> tuple[int, str | None]:
|
||||
# Use a reasonable default for max_messages
|
||||
max_messages = 1000
|
||||
|
|
@ -2139,6 +2140,7 @@ async def run_google_gmail_indexing(
|
|||
end_date=end_date,
|
||||
update_last_indexed=update_last_indexed,
|
||||
max_messages=max_messages,
|
||||
on_heartbeat_callback=on_heartbeat_callback,
|
||||
)
|
||||
# index_google_gmail_messages returns (int, str) but we need (int, str | None)
|
||||
return indexed_count, error_message if error_message else None
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ from .documents import (
|
|||
DocumentBase,
|
||||
DocumentRead,
|
||||
DocumentsCreate,
|
||||
DocumentStatusSchema,
|
||||
DocumentTitleRead,
|
||||
DocumentTitleSearchResponse,
|
||||
DocumentUpdate,
|
||||
|
|
@ -104,6 +105,7 @@ __all__ = [
|
|||
# Document schemas
|
||||
"DocumentBase",
|
||||
"DocumentRead",
|
||||
"DocumentStatusSchema",
|
||||
"DocumentTitleRead",
|
||||
"DocumentTitleSearchResponse",
|
||||
"DocumentUpdate",
|
||||
|
|
|
|||
|
|
@ -41,6 +41,13 @@ class DocumentUpdate(DocumentBase):
|
|||
pass
|
||||
|
||||
|
||||
class DocumentStatusSchema(BaseModel):
    """Document processing status.

    API representation of a document's ``status`` value: a ``state`` string
    plus an optional ``reason``.
    """

    # Lifecycle state: "pending", "processing", "ready" or "failed".
    state: str
    # Optional explanation for the state; presumably set when the state is
    # "failed" (DocumentStatus.failed(reason) takes one) — confirm.
    reason: str | None = None
|
||||
|
||||
|
||||
class DocumentRead(BaseModel):
|
||||
id: int
|
||||
title: str
|
||||
|
|
@ -53,6 +60,12 @@ class DocumentRead(BaseModel):
|
|||
updated_at: datetime | None
|
||||
search_space_id: int
|
||||
created_by_id: UUID | None = None # User who created/uploaded this document
|
||||
created_by_name: str | None = (
|
||||
None # Display name or email of the user who created this document
|
||||
)
|
||||
status: DocumentStatusSchema | None = (
|
||||
None # Processing status (ready, processing, failed)
|
||||
)
|
||||
|
||||
model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
|
|
|
|||
|
|
@ -982,7 +982,7 @@ class ConnectorService:
|
|||
def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
    """Build the display title for a Slack channel result.

    Args:
        _doc_info: Unused; kept for the title-callback signature.
        metadata: Result metadata; reads ``channel_name`` and ``start_date``.

    Returns:
        The channel name (``"Unknown Channel"`` when absent), followed by the
        start date in parentheses when one is present.
    """
    channel_name = metadata.get("channel_name", "Unknown Channel")
    message_date = metadata.get("start_date", "")
    # No connector prefix: the title is the bare channel name.
    title = channel_name
    if message_date:
        title += f" ({message_date})"
    return title
|
||||
|
|
@ -1056,7 +1056,7 @@ class ConnectorService:
|
|||
def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
    """Build the display title for a Notion page result.

    Args:
        _doc_info: Unused; kept for the title-callback signature.
        metadata: Result metadata; reads ``page_title`` and ``indexed_at``.

    Returns:
        The page title (``"Untitled Page"`` when absent), followed by
        ``" (indexed: <indexed_at>)"`` when an indexing timestamp is present.
    """
    page_title = metadata.get("page_title", "Untitled Page")
    indexed_at = metadata.get("indexed_at", "")
    # No connector prefix: the title is the bare page title.
    title = page_title
    if indexed_at:
        title += f" (indexed: {indexed_at})"
    return title
|
||||
|
|
@ -1366,9 +1366,9 @@ class ConnectorService:
|
|||
issue_title = metadata.get("issue_title", "Untitled Issue")
|
||||
issue_state = metadata.get("state", "")
|
||||
title = (
|
||||
f"Linear: {issue_identifier} - {issue_title}"
|
||||
f"{issue_identifier} - {issue_title}"
|
||||
if issue_identifier
|
||||
else f"Linear: {issue_title}"
|
||||
else issue_title
|
||||
)
|
||||
if issue_state:
|
||||
title += f" ({issue_state})"
|
||||
|
|
@ -1465,11 +1465,7 @@ class ConnectorService:
|
|||
issue_key = metadata.get("issue_key", "")
|
||||
issue_title = metadata.get("issue_title", "Untitled Issue")
|
||||
status = metadata.get("status", "")
|
||||
title = (
|
||||
f"Jira: {issue_key} - {issue_title}"
|
||||
if issue_key
|
||||
else f"Jira: {issue_title}"
|
||||
)
|
||||
title = f"{issue_key} - {issue_title}" if issue_key else issue_title
|
||||
if status:
|
||||
title += f" ({status})"
|
||||
return title
|
||||
|
|
@ -1570,7 +1566,7 @@ class ConnectorService:
|
|||
def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
    """Build the display title for a calendar event result.

    Args:
        _doc_info: Unused; kept for the title-callback signature.
        metadata: Result metadata; reads ``event_summary`` and ``start_time``.

    Returns:
        The event summary (``"Untitled Event"`` when absent), followed by the
        start time in parentheses when one is present.
    """
    event_summary = metadata.get("event_summary", "Untitled Event")
    start_time = metadata.get("start_time", "")
    # No connector prefix: the title is the bare event summary.
    title = event_summary
    if start_time:
        title += f" ({start_time})"
    return title
|
||||
|
|
@ -1675,7 +1671,7 @@ class ConnectorService:
|
|||
|
||||
def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
    """Build the display title for an Airtable record result.

    Args:
        _doc_info: Unused; kept for the title-callback signature.
        metadata: Result metadata; reads ``record_id``.

    Returns:
        The record id, or ``"Airtable Record"`` when the id is missing or
        empty.
    """
    record_id = metadata.get("record_id", "")
    # Single return: the bare record id, consistent with the other connector
    # title builders, which no longer prepend the connector name.
    return record_id if record_id else "Airtable Record"
|
||||
|
||||
def _description_fn(
|
||||
_chunk: dict[str, Any], _doc_info: dict[str, Any], metadata: dict[str, Any]
|
||||
|
|
@ -1952,7 +1948,7 @@ class ConnectorService:
|
|||
def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
    """Build the display title for a Confluence page result.

    Args:
        _doc_info: Unused; kept for the title-callback signature.
        metadata: Result metadata; reads ``page_title`` and ``space_key``.

    Returns:
        The page title (``"Untitled Page"`` when absent), followed by the
        space key in parentheses when one is present.
    """
    page_title = metadata.get("page_title", "Untitled Page")
    space_key = metadata.get("space_key", "")
    # No connector prefix: the title is the bare page title.
    title = page_title
    if space_key:
        title += f" ({space_key})"
    return title
|
||||
|
|
@ -2238,7 +2234,7 @@ class ConnectorService:
|
|||
def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
    """Build the display title for a Discord channel result.

    Args:
        _doc_info: Unused; kept for the title-callback signature.
        metadata: Result metadata; reads ``channel_name`` and ``start_date``.

    Returns:
        The channel name (``"Unknown Channel"`` when absent), followed by the
        start date in parentheses when one is present.
    """
    channel_name = metadata.get("channel_name", "Unknown Channel")
    message_date = metadata.get("start_date", "")
    # No connector prefix: the title is the bare channel name.
    title = channel_name
    if message_date:
        title += f" ({message_date})"
    return title
|
||||
|
|
@ -2314,7 +2310,7 @@ class ConnectorService:
|
|||
team_name = metadata.get("team_name", "Unknown Team")
|
||||
channel_name = metadata.get("channel_name", "Unknown Channel")
|
||||
message_date = metadata.get("start_date", "")
|
||||
title = f"Teams: {team_name} - {channel_name}"
|
||||
title = f"{team_name} - {channel_name}"
|
||||
if message_date:
|
||||
title += f" ({message_date})"
|
||||
return title
|
||||
|
|
@ -2387,11 +2383,7 @@ class ConnectorService:
|
|||
def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
    """Build the display title for a Luma event result.

    Args:
        _doc_info: Unused; kept for the title-callback signature.
        metadata: Result metadata; reads ``event_name`` and ``start_time``.

    Returns:
        The event name (``"Untitled Event"`` when absent), followed by the
        start time in parentheses when one is present.
    """
    event_name = metadata.get("event_name", "Untitled Event")
    start_time = metadata.get("start_time", "")
    # Single return: the bare event name, consistent with the other connector
    # title builders, which no longer prepend the connector name.
    return f"{event_name} ({start_time})" if start_time else event_name
|
||||
|
||||
def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
    """Return the event URL stored in *metadata*, or an empty string.

    ``_doc_info`` is unused. A missing, ``None`` or otherwise falsy
    ``event_url`` value yields ``""``.
    """
    event_url = metadata.get("event_url", "")
    if event_url:
        return event_url
    return ""
|
||||
|
|
@ -2651,7 +2643,7 @@ class ConnectorService:
|
|||
|
||||
def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
    """Build the display title for a BookStack page result.

    Args:
        _doc_info: Unused; kept for the title-callback signature.
        metadata: Result metadata; reads ``page_name``.

    Returns:
        The page name, or ``"Untitled Page"`` when the key is absent.
    """
    # Single return: the bare page name, consistent with the other connector
    # title builders, which no longer prepend the connector name.
    return metadata.get("page_name", "Untitled Page")
|
||||
|
||||
def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str:
|
||||
page_slug = metadata.get("page_slug", "")
|
||||
|
|
|
|||
|
|
@ -537,6 +537,304 @@ async def _process_file_upload(
|
|||
raise
|
||||
|
||||
|
||||
@celery_app.task(name="process_file_upload_with_document", bind=True)
def process_file_upload_with_document_task(
    self,
    document_id: int,
    temp_path: str,
    filename: str,
    search_space_id: int,
    user_id: str,
):
    """
    Celery entry point for phase 2 of the two-phase document upload.

    Phase 1 (the API) has already created a pending document; this task runs
    the async processing coroutine for it on a fresh event loop. If the
    uploaded temp file is no longer on disk, the document is marked as failed
    and the task returns without processing.

    Args:
        document_id: ID of the pending document created in phase 1
        temp_path: Path to the uploaded file
        filename: Original filename
        search_space_id: ID of the search space
        user_id: ID of the user

    Raises:
        Exception: Any error raised by the processing coroutine is logged
            with its traceback and re-raised.
    """
    import asyncio
    import os
    import traceback

    logger.info(
        f"[process_file_upload_with_document] Task started - document_id: {document_id}, "
        f"file: {filename}, search_space_id: {search_space_id}"
    )

    # Decide up front which coroutine this run will drive.
    file_missing = not os.path.exists(temp_path)
    if file_missing:
        logger.error(
            f"[process_file_upload_with_document] File does not exist: {temp_path}. "
            "The temp file may have been cleaned up before the task ran."
        )

    # Each invocation gets its own event loop, closed before returning.
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)

    if file_missing:
        # Nothing to process: record the failure on the document and stop.
        try:
            event_loop.run_until_complete(
                _mark_document_failed(
                    document_id,
                    "File not found - temp file may have been cleaned up",
                )
            )
        finally:
            event_loop.close()
        return

    try:
        event_loop.run_until_complete(
            _process_file_with_document(
                document_id, temp_path, filename, search_space_id, user_id
            )
        )
        logger.info(
            f"[process_file_upload_with_document] Task completed successfully for: {filename}"
        )
    except Exception as e:
        logger.error(
            f"[process_file_upload_with_document] Task failed for {filename}: {e}\n"
            f"Traceback:\n{traceback.format_exc()}"
        )
        raise
    finally:
        event_loop.close()
|
||||
|
||||
|
||||
async def _mark_document_failed(document_id: int, reason: str):
    """Mark a document as failed when its task cannot proceed.

    Opens a Celery-side database session, loads the document by primary key
    and, if it exists, stores a failed status carrying ``reason`` together
    with a fresh ``updated_at`` timestamp.

    Args:
        document_id: Primary key of the document to update.
        reason: Failure reason stored in the document status.
    """
    from app.db import Document, DocumentStatus
    from app.tasks.document_processors.base import get_current_timestamp

    async with get_celery_session_maker()() as session:
        document = await session.get(Document, document_id)
        if not document:
            # Previously a silent no-op; surface it so a missing row is
            # visible in the worker logs.
            logger.warning(
                f"Cannot mark document {document_id} as failed: document not found"
            )
            return

        document.status = DocumentStatus.failed(reason)
        document.updated_at = get_current_timestamp()
        await session.commit()
        logger.info(f"Marked document {document_id} as failed: {reason}")
|
||||
|
||||
|
||||
async def _process_file_with_document(
    document_id: int,
    temp_path: str,
    filename: str,
    search_space_id: int,
    user_id: str,
):
    """
    Process an uploaded file and update its existing pending document.

    This function implements Phase 2 of the 2-phase document upload:
    - Sets document status to 'processing'
    - Processes the file via process_file_in_background_with_document
    - On a falsy result, marks the document failed as a duplicate
    - On any exception, marks the document failed, updates notifications and
      the task log, and re-raises

    The temp file is removed on every exit path once the database session
    has been opened, including the early return when the document no longer
    exists and failures while creating the notification or task log entry.

    Args:
        document_id: ID of the pending document created in Phase 1
        temp_path: Path to the uploaded temp file (deleted before returning)
        filename: Original filename, used in logs and notifications
        search_space_id: ID of the search space
        user_id: ID of the user as a string; parsed with UUID()

    Raises:
        Exception: Any error raised during processing is re-raised after the
            document has been marked as failed.
    """
    import os

    from app.db import Document, DocumentStatus
    from app.tasks.document_processors.base import get_current_timestamp
    from app.tasks.document_processors.file_processors import (
        process_file_in_background_with_document,
    )

    logger.info(
        f"[_process_file_with_document] Starting async processing for: {filename}"
    )

    async with get_celery_session_maker()() as session:
        # Outer try/finally exists only to guarantee temp-file cleanup on
        # every exit path.
        try:
            logger.info(
                f"[_process_file_with_document] Database session created for: {filename}"
            )
            task_logger = TaskLoggingService(session, search_space_id)

            # Get the document
            document = await session.get(Document, document_id)
            if not document:
                logger.error(f"Document {document_id} not found")
                return

            # Get file size for notification metadata
            try:
                file_size = os.path.getsize(temp_path)
                logger.info(
                    f"[_process_file_with_document] File size: {file_size} bytes"
                )
            except Exception as e:
                logger.warning(
                    f"[_process_file_with_document] Could not get file size: {e}"
                )
                file_size = None

            # Create notification for document processing
            logger.info(
                f"[_process_file_with_document] Creating notification for: {filename}"
            )
            notification = (
                await NotificationService.document_processing.notify_processing_started(
                    session=session,
                    user_id=UUID(user_id),
                    document_type="FILE",
                    document_name=filename,
                    search_space_id=search_space_id,
                    file_size=file_size,
                )
            )

            log_entry = await task_logger.log_task_start(
                task_name="process_file_upload_with_document",
                source="document_processor",
                message=f"Starting file processing for: {filename} (document_id: {document_id})",
                metadata={
                    "document_type": "FILE",
                    "document_id": document_id,
                    "filename": filename,
                    "file_path": temp_path,
                    "user_id": user_id,
                },
            )

            try:
                # Set status to PROCESSING (shows spinner in UI via ElectricSQL)
                document.status = DocumentStatus.processing()
                await session.commit()
                logger.info(
                    f"[_process_file_with_document] Document {document_id} status set to 'processing'"
                )

                # Process the file and update document
                result = await process_file_in_background_with_document(
                    document=document,
                    file_path=temp_path,
                    filename=filename,
                    search_space_id=search_space_id,
                    user_id=user_id,
                    session=session,
                    task_logger=task_logger,
                    log_entry=log_entry,
                    notification=notification,
                )

                if result:
                    # Update notification on success
                    await (
                        NotificationService.document_processing.notify_processing_completed(
                            session=session,
                            notification=notification,
                            document_id=result.id,
                            chunks_count=None,
                        )
                    )
                    logger.info(
                        f"[_process_file_with_document] Successfully processed document {document_id}"
                    )
                else:
                    # Duplicate detected - mark as failed
                    document.status = DocumentStatus.failed("Duplicate content detected")
                    document.updated_at = get_current_timestamp()
                    await session.commit()
                    await (
                        NotificationService.document_processing.notify_processing_completed(
                            session=session,
                            notification=notification,
                            error_message="Document already exists (duplicate)",
                        )
                    )

            except Exception as e:
                # Import here to avoid circular dependencies
                from fastapi import HTTPException

                from app.services.page_limit_service import PageLimitExceededError

                # A page limit error may arrive directly or as the cause of
                # an HTTPException.
                page_limit_error: PageLimitExceededError | None = None
                if isinstance(e, PageLimitExceededError):
                    page_limit_error = e
                elif (
                    isinstance(e, HTTPException)
                    and e.__cause__
                    and isinstance(e.__cause__, PageLimitExceededError)
                ):
                    page_limit_error = e.__cause__

                # Mark document as failed (shows error in UI via ElectricSQL)
                error_message = str(e)[:500]
                document.status = DocumentStatus.failed(error_message)
                document.updated_at = get_current_timestamp()
                await session.commit()
                logger.info(
                    f"[_process_file_with_document] Document {document_id} marked as failed: {error_message[:100]}"
                )

                if page_limit_error is not None:
                    # Handle page limit errors with dedicated notification
                    try:
                        await session.refresh(notification)
                        await NotificationService.document_processing.notify_processing_completed(
                            session=session,
                            notification=notification,
                            error_message="Page limit exceeded",
                        )
                        await NotificationService.page_limit.notify_page_limit_exceeded(
                            session=session,
                            user_id=UUID(user_id),
                            document_name=filename,
                            document_type="FILE",
                            search_space_id=search_space_id,
                            pages_used=page_limit_error.pages_used,
                            pages_limit=page_limit_error.pages_limit,
                            pages_to_add=page_limit_error.pages_to_add,
                        )
                    except Exception as notif_error:
                        # Best effort: a notification failure must not mask
                        # the original processing error.
                        logger.error(
                            f"Failed to create page limit notification: {notif_error!s}"
                        )
                else:
                    # Update notification on failure
                    try:
                        await session.refresh(notification)
                        await NotificationService.document_processing.notify_processing_completed(
                            session=session,
                            notification=notification,
                            error_message=str(e)[:100],
                        )
                    except Exception as notif_error:
                        logger.error(
                            f"Failed to update notification on failure: {notif_error!s}"
                        )

                await task_logger.log_task_failure(
                    log_entry,
                    error_message[:100],
                    str(e),
                    {"error_type": type(e).__name__, "document_id": document_id},
                )
                logger.error(f"Error processing file {filename}: {e!s}")
                raise

        finally:
            # Clean up temp file
            if os.path.exists(temp_path):
                try:
                    os.unlink(temp_path)
                    logger.info(
                        f"[_process_file_with_document] Cleaned up temp file: {temp_path}"
                    )
                except Exception as cleanup_error:
                    logger.warning(
                        f"[_process_file_with_document] Failed to clean up temp file: {cleanup_error}"
                    )
|
||||
|
||||
|
||||
@celery_app.task(name="process_circleback_meeting", bind=True)
|
||||
def process_circleback_meeting_task(
|
||||
self,
|
||||
|
|
|
|||
|
|
@ -4,33 +4,41 @@ This task runs periodically (every 5 minutes by default) to find notifications
|
|||
that are stuck in "in_progress" status but don't have an active Redis heartbeat key.
|
||||
These are marked as "failed" to prevent the frontend from showing a perpetual "syncing" state.
|
||||
|
||||
Additionally, it cleans up documents stuck in pending/processing state that belong
|
||||
to connectors with stale notifications.
|
||||
|
||||
Detection mechanism:
|
||||
- Active indexing tasks set a Redis key with TTL (2 minutes) as a heartbeat
|
||||
- If the task crashes, the Redis key expires automatically
|
||||
- This cleanup task checks for in-progress notifications without a Redis heartbeat key
|
||||
- Such notifications are marked as failed with O(1) batch UPDATE
|
||||
- Documents with pending/processing status for those connectors are also marked as failed
|
||||
"""
|
||||
|
||||
import contextlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from datetime import UTC, datetime
|
||||
|
||||
import redis
|
||||
from sqlalchemy import and_, text
|
||||
from sqlalchemy import and_, or_, text
|
||||
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
|
||||
from sqlalchemy.future import select
|
||||
from sqlalchemy.pool import NullPool
|
||||
|
||||
from app.celery_app import celery_app
|
||||
from app.config import config
|
||||
from app.db import Notification
|
||||
from app.db import Document, DocumentStatus, Notification
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Redis client for checking heartbeats
|
||||
_redis_client: redis.Redis | None = None
|
||||
|
||||
# Error message shown to users when sync is interrupted
|
||||
STALE_SYNC_ERROR_MESSAGE = "Sync was interrupted unexpectedly. Please retry."
|
||||
|
||||
|
||||
def get_redis_client() -> redis.Redis:
|
||||
"""Get or create Redis client for heartbeat checking."""
|
||||
|
|
@ -70,6 +78,7 @@ def cleanup_stale_indexing_notifications_task():
|
|||
- Do NOT have a corresponding Redis heartbeat key (meaning task crashed)
|
||||
|
||||
And marks them as failed with O(1) batch UPDATE.
|
||||
Also marks associated pending/processing documents as failed.
|
||||
"""
|
||||
import asyncio
|
||||
|
||||
|
|
@ -86,15 +95,20 @@ async def _cleanup_stale_notifications():
|
|||
"""Find and mark stale connector indexing notifications as failed.
|
||||
|
||||
Uses Redis TTL-based detection:
|
||||
1. Find all in-progress notifications
|
||||
1. Find all in-progress notifications with their connector_id
|
||||
2. Check which ones are missing their Redis heartbeat key
|
||||
3. Mark those as failed with O(1) batch UPDATE using JSONB || operator
|
||||
4. Mark associated documents (pending/processing) as failed
|
||||
"""
|
||||
async with get_celery_session_maker()() as session:
|
||||
try:
|
||||
# Find all in-progress connector indexing notifications
|
||||
# Fetch full metadata to properly extract connector_id
|
||||
result = await session.execute(
|
||||
select(Notification.id).where(
|
||||
select(
|
||||
Notification.id,
|
||||
Notification.notification_metadata,
|
||||
).where(
|
||||
and_(
|
||||
Notification.type == "connector_indexing",
|
||||
Notification.notification_metadata["status"].astext
|
||||
|
|
@ -102,24 +116,37 @@ async def _cleanup_stale_notifications():
|
|||
)
|
||||
)
|
||||
)
|
||||
in_progress_ids = [row[0] for row in result.fetchall()]
|
||||
in_progress_rows = result.fetchall()
|
||||
|
||||
if not in_progress_ids:
|
||||
if not in_progress_rows:
|
||||
logger.debug("No in-progress connector indexing notifications found")
|
||||
return
|
||||
|
||||
# Check which ones are missing heartbeat keys in Redis
|
||||
redis_client = get_redis_client()
|
||||
stale_notification_ids = []
|
||||
stale_connector_ids = []
|
||||
|
||||
for notification_id in in_progress_ids:
|
||||
for row in in_progress_rows:
|
||||
notification_id = row[0]
|
||||
metadata = row[1] # Full metadata dict
|
||||
heartbeat_key = _get_heartbeat_key(notification_id)
|
||||
if not redis_client.exists(heartbeat_key):
|
||||
stale_notification_ids.append(notification_id)
|
||||
# Extract connector_id from metadata dict for document cleanup
|
||||
if metadata and isinstance(metadata, dict):
|
||||
connector_id = metadata.get("connector_id")
|
||||
logger.debug(
|
||||
f"Notification {notification_id} metadata: {metadata}, "
|
||||
f"connector_id: {connector_id}"
|
||||
)
|
||||
if connector_id is not None:
|
||||
with contextlib.suppress(ValueError, TypeError):
|
||||
stale_connector_ids.append(int(connector_id))
|
||||
|
||||
if not stale_notification_ids:
|
||||
logger.debug(
|
||||
f"All {len(in_progress_ids)} in-progress notifications have active Redis heartbeats"
|
||||
f"All {len(in_progress_rows)} in-progress notifications have active Redis heartbeats"
|
||||
)
|
||||
return
|
||||
|
||||
|
|
@ -127,18 +154,15 @@ async def _cleanup_stale_notifications():
|
|||
f"Found {len(stale_notification_ids)} stale connector indexing notifications "
|
||||
f"(no Redis heartbeat key): {stale_notification_ids}"
|
||||
)
|
||||
logger.info(f"Connector IDs for document cleanup: {stale_connector_ids}")
|
||||
|
||||
# O(1) Batch UPDATE using JSONB || operator
|
||||
# O(1) Batch UPDATE notifications using JSONB || operator
|
||||
# This merges the update data into existing notification_metadata
|
||||
# Also updates title and message for proper UI display
|
||||
error_message = (
|
||||
"Something went wrong while syncing your content. Please retry."
|
||||
)
|
||||
|
||||
update_data = {
|
||||
"status": "failed",
|
||||
"completed_at": datetime.now(UTC).isoformat(),
|
||||
"error_message": error_message,
|
||||
"error_message": STALE_SYNC_ERROR_MESSAGE,
|
||||
"sync_stage": "failed",
|
||||
}
|
||||
|
||||
|
|
@ -152,16 +176,96 @@ async def _cleanup_stale_notifications():
|
|||
"""),
|
||||
{
|
||||
"update_json": json.dumps(update_data),
|
||||
"display_message": f"{error_message}",
|
||||
"display_message": STALE_SYNC_ERROR_MESSAGE,
|
||||
"ids": stale_notification_ids,
|
||||
},
|
||||
)
|
||||
|
||||
await session.commit()
|
||||
logger.info(
|
||||
f"Successfully marked {len(stale_notification_ids)} stale notifications as failed (batch UPDATE)"
|
||||
f"Successfully marked {len(stale_notification_ids)} stale notifications as failed"
|
||||
)
|
||||
|
||||
# ===== Clean up stuck documents for stale connectors =====
|
||||
if stale_connector_ids:
|
||||
await _cleanup_stuck_documents(session, stale_connector_ids)
|
||||
|
||||
await session.commit()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error cleaning up stale notifications: {e!s}", exc_info=True)
|
||||
await session.rollback()
|
||||
|
||||
|
||||
async def _cleanup_stuck_documents(session, connector_ids: list[int]):
    """
    Mark documents stuck in pending/processing state as failed for given connectors.

    Selects the ids of documents belonging to ``connector_ids`` whose status
    state is pending or processing (used for logging and to skip the update
    when there is nothing to do), then marks them all as failed with a single
    UPDATE statement using STALE_SYNC_ERROR_MESSAGE as the reason.

    The update is not committed here; the caller commits. Errors are logged
    and swallowed so the surrounding notification cleanup can continue; the
    session is rolled back on failure so the caller's later commit starts
    from a clean transaction.

    Args:
        session: Database session
        connector_ids: List of connector IDs whose documents should be cleaned up
    """
    if not connector_ids:
        return

    try:
        # Fetch the ids of affected documents (for logging and early exit)
        count_result = await session.execute(
            select(Document.id).where(
                and_(
                    Document.connector_id.in_(connector_ids),
                    or_(
                        Document.status["state"].astext == DocumentStatus.PENDING,
                        Document.status["state"].astext == DocumentStatus.PROCESSING,
                    ),
                )
            )
        )
        stuck_doc_ids = [row[0] for row in count_result.fetchall()]

        if not stuck_doc_ids:
            logger.debug(f"No stuck documents found for connector IDs: {connector_ids}")
            return

        logger.warning(
            f"Found {len(stuck_doc_ids)} stuck documents (pending/processing) "
            f"for connector IDs {connector_ids}: {stuck_doc_ids[:20]}..."  # Log first 20
        )

        # Single batch UPDATE: mark all stuck documents as failed using JSONB
        # The error message matches what we show in notifications
        failed_status = DocumentStatus.failed(STALE_SYNC_ERROR_MESSAGE)

        await session.execute(
            text("""
                UPDATE documents
                SET status = CAST(:failed_status AS jsonb),
                    updated_at = :now
                WHERE connector_id = ANY(:connector_ids)
                  AND (
                      status->>'state' = :pending_state
                      OR status->>'state' = :processing_state
                  )
            """),
            {
                "failed_status": json.dumps(failed_status),
                "now": datetime.now(UTC),
                "connector_ids": connector_ids,
                "pending_state": DocumentStatus.PENDING,
                "processing_state": DocumentStatus.PROCESSING,
            },
        )

        logger.info(
            f"Successfully marked {len(stuck_doc_ids)} stuck documents as failed "
            f"for connector IDs: {connector_ids}"
        )

    except Exception as e:
        logger.error(
            f"Error cleaning up stuck documents for connectors {connector_ids}: {e!s}",
            exc_info=True,
        )
        # Don't raise - let the notification cleanup continue even if document cleanup fails.
        # Roll back so the caller's subsequent commit does not run on a
        # transaction left in a failed state by the statement above.
        try:
            await session.rollback()
        except Exception as rollback_error:
            logger.error(
                f"Failed to roll back after stuck document cleanup error: {rollback_error!s}"
            )
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
"""
|
||||
Airtable connector indexer.
|
||||
|
||||
Implements real-time document status updates using a two-phase approach:
|
||||
- Phase 1: Create all documents with PENDING status (visible in UI immediately)
|
||||
- Phase 2: Process each document one by one (pending → processing → ready/failed)
|
||||
"""
|
||||
|
||||
import time
|
||||
|
|
@ -10,7 +14,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
|
||||
from app.config import config
|
||||
from app.connectors.airtable_history import AirtableHistoryConnector
|
||||
from app.db import Document, DocumentType, SearchSourceConnectorType
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
|
|
@ -27,6 +31,7 @@ from .base import (
|
|||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
safe_set_chunks,
|
||||
update_connector_last_indexed,
|
||||
)
|
||||
|
||||
|
|
@ -134,24 +139,32 @@ async def index_airtable_records(
|
|||
await task_logger.log_task_success(
|
||||
log_entry, success_msg, {"bases_count": 0}
|
||||
)
|
||||
return 0, success_msg
|
||||
# CRITICAL: Update timestamp even when no bases found so Electric SQL syncs
|
||||
await update_connector_last_indexed(
|
||||
session, connector, update_last_indexed
|
||||
)
|
||||
await session.commit()
|
||||
return 0, None # Return None (not error) when no items found
|
||||
|
||||
logger.info(f"Found {len(bases)} Airtable bases to process")
|
||||
|
||||
# Heartbeat tracking - update notification periodically to prevent appearing stuck
|
||||
last_heartbeat_time = time.time()
|
||||
total_documents_indexed = 0
|
||||
|
||||
# Process each base
|
||||
# Track overall statistics
|
||||
documents_indexed = 0
|
||||
documents_skipped = 0
|
||||
documents_failed = 0
|
||||
duplicate_content_count = 0
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 1: Collect all records and create pending documents
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
# =======================================================================
|
||||
records_to_process = [] # List of dicts with document and record data
|
||||
new_documents_created = False
|
||||
|
||||
for base in bases:
|
||||
# Check if it's time for a heartbeat update
|
||||
if (
|
||||
on_heartbeat_callback
|
||||
and (time.time() - last_heartbeat_time)
|
||||
>= HEARTBEAT_INTERVAL_SECONDS
|
||||
):
|
||||
await on_heartbeat_callback(total_documents_indexed)
|
||||
last_heartbeat_time = time.time()
|
||||
base_id = base.get("id")
|
||||
base_name = base.get("name", "Unknown Base")
|
||||
|
||||
|
|
@ -201,7 +214,6 @@ async def index_airtable_records(
|
|||
max_records=max_records,
|
||||
)
|
||||
)
|
||||
|
||||
else:
|
||||
# Fetch all records
|
||||
records, records_error = airtable_connector.get_all_records(
|
||||
|
|
@ -222,21 +234,14 @@ async def index_airtable_records(
|
|||
|
||||
logger.info(f"Found {len(records)} records in table {table_name}")
|
||||
|
||||
documents_indexed = 0
|
||||
skipped_messages = []
|
||||
documents_skipped = 0
|
||||
# Process each record
|
||||
# Phase 1: Analyze each record and create pending documents
|
||||
for record in records:
|
||||
# Check if it's time for a heartbeat update
|
||||
if (
|
||||
on_heartbeat_callback
|
||||
and (time.time() - last_heartbeat_time)
|
||||
>= HEARTBEAT_INTERVAL_SECONDS
|
||||
):
|
||||
await on_heartbeat_callback(total_documents_indexed)
|
||||
last_heartbeat_time = time.time()
|
||||
|
||||
try:
|
||||
record_id = record.get("id", "")
|
||||
if not record_id:
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Generate markdown content
|
||||
markdown_content = (
|
||||
airtable_connector.format_record_to_markdown(
|
||||
|
|
@ -246,16 +251,11 @@ async def index_airtable_records(
|
|||
|
||||
if not markdown_content.strip():
|
||||
logger.warning(
|
||||
f"Skipping message with no content: {record.get('id')}"
|
||||
)
|
||||
skipped_messages.append(
|
||||
f"{record.get('id')} (no content)"
|
||||
f"Skipping record with no content: {record_id}"
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
record_id = record.get("id", "Unknown")
|
||||
|
||||
# Generate unique identifier hash for this Airtable record
|
||||
unique_identifier_hash = generate_unique_identifier_hash(
|
||||
DocumentType.AIRTABLE_CONNECTOR,
|
||||
|
|
@ -278,77 +278,30 @@ async def index_airtable_records(
|
|||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
logger.info(
|
||||
f"Document for Airtable record {record_id} unchanged. Skipping."
|
||||
)
|
||||
# Ensure status is ready (might have been stuck in processing/pending)
|
||||
if not DocumentStatus.is_state(
|
||||
existing_document.status, DocumentStatus.READY
|
||||
):
|
||||
existing_document.status = (
|
||||
DocumentStatus.ready()
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
else:
|
||||
# Content has changed - update the existing document
|
||||
logger.info(
|
||||
f"Content changed for Airtable record {record_id}. Updating document."
|
||||
)
|
||||
|
||||
# Generate document summary
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
"record_id": record_id,
|
||||
"created_time": record.get(
|
||||
"CREATED_TIME()", ""
|
||||
),
|
||||
"document_type": "Airtable Record",
|
||||
"connector_type": "Airtable",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
markdown_content,
|
||||
user_llm,
|
||||
document_metadata,
|
||||
)
|
||||
else:
|
||||
summary_content = (
|
||||
f"Airtable Record: {record_id}\n\n"
|
||||
)
|
||||
summary_embedding = (
|
||||
config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
)
|
||||
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(
|
||||
markdown_content
|
||||
)
|
||||
|
||||
# Update existing document
|
||||
existing_document.title = (
|
||||
f"Airtable Record: {record_id}"
|
||||
)
|
||||
existing_document.content = summary_content
|
||||
existing_document.content_hash = content_hash
|
||||
existing_document.embedding = summary_embedding
|
||||
existing_document.document_metadata = {
|
||||
# Queue existing document for update (will be set to processing in Phase 2)
|
||||
records_to_process.append(
|
||||
{
|
||||
"document": existing_document,
|
||||
"is_new": False,
|
||||
"markdown_content": markdown_content,
|
||||
"content_hash": content_hash,
|
||||
"record_id": record_id,
|
||||
"created_time": record.get(
|
||||
"CREATED_TIME()", ""
|
||||
),
|
||||
"record": record,
|
||||
"base_name": base_name,
|
||||
"table_name": table_name,
|
||||
}
|
||||
existing_document.chunks = chunks
|
||||
existing_document.updated_at = (
|
||||
get_current_timestamp()
|
||||
)
|
||||
|
||||
documents_indexed += 1
|
||||
logger.info(
|
||||
f"Successfully updated Airtable record {record_id}"
|
||||
)
|
||||
continue
|
||||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
|
|
@ -365,123 +318,210 @@ async def index_airtable_records(
|
|||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
duplicate_content_count += 1
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Generate document summary
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
"record_id": record_id,
|
||||
"created_time": record.get("CREATED_TIME()", ""),
|
||||
"document_type": "Airtable Record",
|
||||
"connector_type": "Airtable",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
markdown_content, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_content = f"Airtable Record: {record_id}\n\n"
|
||||
summary_embedding = (
|
||||
config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
)
|
||||
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(markdown_content)
|
||||
|
||||
# Create and store new document
|
||||
logger.info(
|
||||
f"Creating new document for Airtable record: {record_id}"
|
||||
)
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=f"Airtable Record: {record_id}",
|
||||
title=record_id,
|
||||
document_type=DocumentType.AIRTABLE_CONNECTOR,
|
||||
document_metadata={
|
||||
"record_id": record_id,
|
||||
"created_time": record.get("CREATED_TIME()", ""),
|
||||
"base_name": base_name,
|
||||
"table_name": table_name,
|
||||
"connector_id": connector_id,
|
||||
},
|
||||
content=summary_content,
|
||||
content_hash=content_hash,
|
||||
content="Pending...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation - safe for async
|
||||
status=DocumentStatus.pending(), # Pending until processing starts
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
documents_indexed += 1
|
||||
logger.info(
|
||||
f"Successfully indexed new Airtable record {summary_content}"
|
||||
)
|
||||
new_documents_created = True
|
||||
|
||||
# Batch commit every 10 documents
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed} Airtable records processed so far"
|
||||
)
|
||||
await session.commit()
|
||||
records_to_process.append(
|
||||
{
|
||||
"document": document,
|
||||
"is_new": True,
|
||||
"markdown_content": markdown_content,
|
||||
"content_hash": content_hash,
|
||||
"record_id": record_id,
|
||||
"record": record,
|
||||
"base_name": base_name,
|
||||
"table_name": table_name,
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error processing the Airtable record {record.get('id', 'Unknown')}: {e!s}",
|
||||
exc_info=True,
|
||||
f"Error in Phase 1 for record: {e!s}", exc_info=True
|
||||
)
|
||||
skipped_messages.append(
|
||||
f"{record.get('id', 'Unknown')} (processing error)"
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue # Skip this message and continue with others
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# Accumulate total processed across all tables
|
||||
total_processed += documents_indexed
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info(
|
||||
f"Phase 1: Committing {len([r for r in records_to_process if r['is_new']])} pending documents"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# Final commit for any remaining documents not yet committed in batches
|
||||
if documents_indexed > 0:
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each document one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(records_to_process)} documents")
|
||||
|
||||
for item in records_to_process:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
document = item["document"]
|
||||
try:
|
||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
# Heavy processing (LLM, embeddings, chunks)
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata_for_summary = {
|
||||
"record_id": item["record_id"],
|
||||
"created_time": item["record"].get("CREATED_TIME()", ""),
|
||||
"document_type": "Airtable Record",
|
||||
"connector_type": "Airtable",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
item["markdown_content"],
|
||||
user_llm,
|
||||
document_metadata_for_summary,
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_content = f"Airtable Record: {item['record_id']}\n\n"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
chunks = await create_document_chunks(item["markdown_content"])
|
||||
|
||||
# Update document to READY with actual content
|
||||
document.title = item["record_id"]
|
||||
document.content = summary_content
|
||||
document.content_hash = item["content_hash"]
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = {
|
||||
"record_id": item["record_id"],
|
||||
"created_time": item["record"].get("CREATED_TIME()", ""),
|
||||
"base_name": item["base_name"],
|
||||
"table_name": item["table_name"],
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
documents_indexed += 1
|
||||
|
||||
# Batch commit every 10 documents (for ready status updates)
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
f"Final commit for table {table_name}: {documents_indexed} Airtable records processed"
|
||||
f"Committing batch: {documents_indexed} Airtable records processed so far"
|
||||
)
|
||||
await session.commit()
|
||||
logger.info(
|
||||
f"Successfully committed all Airtable document changes for table {table_name}"
|
||||
)
|
||||
|
||||
# Update the last_indexed_at timestamp for the connector only if requested
|
||||
# (after all tables in all bases are processed)
|
||||
if total_processed > 0:
|
||||
await update_connector_last_indexed(
|
||||
session, connector, update_last_indexed
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error processing Airtable record: {e!s}", exc_info=True
|
||||
)
|
||||
# Mark document as failed with reason (visible in UI)
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(e))
|
||||
document.updated_at = get_current_timestamp()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
|
||||
total_processed = documents_indexed
|
||||
|
||||
# Final commit to ensure all documents are persisted (safety net)
|
||||
logger.info(
|
||||
f"Final commit: Total {documents_indexed} Airtable records processed"
|
||||
)
|
||||
try:
|
||||
await session.commit()
|
||||
logger.info(
|
||||
"Successfully committed all Airtable document changes to database"
|
||||
)
|
||||
except Exception as e:
|
||||
# Handle any remaining integrity errors gracefully (race conditions, etc.)
|
||||
if (
|
||||
"duplicate key value violates unique constraint" in str(e).lower()
|
||||
or "uniqueviolationerror" in str(e).lower()
|
||||
):
|
||||
logger.warning(
|
||||
f"Duplicate content_hash detected during final commit. "
|
||||
f"This may occur if the same record was indexed by multiple connectors. "
|
||||
f"Rolling back and continuing. Error: {e!s}"
|
||||
)
|
||||
await session.rollback()
|
||||
# Don't fail the entire task - some documents may have been successfully indexed
|
||||
else:
|
||||
raise
|
||||
|
||||
# Build warning message if there were issues
|
||||
warning_parts = []
|
||||
if duplicate_content_count > 0:
|
||||
warning_parts.append(f"{duplicate_content_count} duplicate")
|
||||
if documents_failed > 0:
|
||||
warning_parts.append(f"{documents_failed} failed")
|
||||
warning_message = ", ".join(warning_parts) if warning_parts else None
|
||||
|
||||
# Log success after processing all bases and tables
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Successfully completed Airtable indexing for connector {connector_id}",
|
||||
{
|
||||
"events_processed": total_processed,
|
||||
"documents_indexed": total_processed,
|
||||
"documents_indexed": documents_indexed,
|
||||
"documents_skipped": documents_skipped,
|
||||
"documents_failed": documents_failed,
|
||||
"duplicate_content_count": duplicate_content_count,
|
||||
},
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Airtable indexing completed: {total_processed} total records processed"
|
||||
f"Airtable indexing completed: {documents_indexed} ready, "
|
||||
f"{documents_skipped} skipped, {documents_failed} failed "
|
||||
f"({duplicate_content_count} duplicate content)"
|
||||
)
|
||||
return (
|
||||
total_processed,
|
||||
None,
|
||||
) # Return None as the error message to indicate success
|
||||
warning_message,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
|
|
|
|||
|
|
@ -28,6 +28,35 @@ def get_current_timestamp() -> datetime:
|
|||
return datetime.now(UTC)
|
||||
|
||||
|
||||
def safe_set_chunks(document: Document, chunks: list) -> None:
    """
    Assign ``chunks`` to ``document.chunks`` without a lazy load of the old value.

    Use this helper in place of a plain ``document.chunks = chunks``
    assignment. Per the original authors' notes, a direct assignment makes
    SQLAlchemy fetch the existing chunks first (for comparison / orphan
    detection), and that implicit load fails under the async asyncpg driver
    with MissingGreenlet / greenlet_spawn errors.

    The value is written through SQLAlchemy's ``set_committed_value``, so it
    does not matter whether the document was loaded with or without
    ``selectinload``.

    NOTE(review): ``set_committed_value`` installs the value as committed
    state rather than as a pending change - confirm the new chunks are still
    persisted as expected by the callers' flush/commit.

    Args:
        document: The Document object whose chunks are being replaced.
        chunks: List of Chunk objects to attach to the document.

    Example:
        # Avoid: document.chunks = chunks
        safe_set_chunks(document, chunks)
    """
    # Imported at call time, exactly as the original did.
    from sqlalchemy.orm import attributes as orm_attributes

    orm_attributes.set_committed_value(document, "chunks", chunks)
|
||||
|
||||
|
||||
def parse_date_flexible(date_str: str) -> datetime:
|
||||
"""
|
||||
Parse date from multiple common formats.
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
"""
|
||||
BookStack connector indexer.
|
||||
|
||||
Implements 2-phase document status updates for real-time UI feedback:
|
||||
- Phase 1: Collect all pages and create pending documents (visible in UI immediately)
|
||||
- Phase 2: Process each page: pending → processing → ready/failed
|
||||
"""
|
||||
|
||||
import time
|
||||
|
|
@ -11,7 +15,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
|
||||
from app.config import config
|
||||
from app.connectors.bookstack_connector import BookStackConnector
|
||||
from app.db import Document, DocumentType, SearchSourceConnectorType
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
|
|
@ -28,6 +32,7 @@ from .base import (
|
|||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
safe_set_chunks,
|
||||
update_connector_last_indexed,
|
||||
)
|
||||
|
||||
|
|
@ -184,22 +189,22 @@ async def index_bookstack_pages(
|
|||
logger.error(f"Error fetching BookStack pages: {e!s}", exc_info=True)
|
||||
return 0, f"Error fetching BookStack pages: {e!s}"
|
||||
|
||||
# Process and index each page
|
||||
# =======================================================================
|
||||
# PHASE 1: Analyze all pages, create pending documents
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
# =======================================================================
|
||||
documents_indexed = 0
|
||||
skipped_pages = []
|
||||
documents_skipped = 0
|
||||
documents_failed = 0
|
||||
|
||||
# Heartbeat tracking - update notification periodically to prevent appearing stuck
|
||||
last_heartbeat_time = time.time()
|
||||
|
||||
pages_to_process = [] # List of dicts with document and page data
|
||||
new_documents_created = False
|
||||
|
||||
for page in pages:
|
||||
# Check if it's time for a heartbeat update
|
||||
if (
|
||||
on_heartbeat_callback
|
||||
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
|
||||
):
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = time.time()
|
||||
try:
|
||||
page_id = page.get("id")
|
||||
page_name = page.get("name", "")
|
||||
|
|
@ -218,7 +223,7 @@ async def index_bookstack_pages(
|
|||
|
||||
# Fetch full page content (Markdown preferred)
|
||||
try:
|
||||
page_detail, page_content = bookstack_client.get_page_with_content(
|
||||
_, page_content = bookstack_client.get_page_with_content(
|
||||
page_id, use_markdown=True
|
||||
)
|
||||
except Exception as e:
|
||||
|
|
@ -252,82 +257,38 @@ async def index_bookstack_pages(
|
|||
# Build page URL
|
||||
page_url = f"{bookstack_base_url}/books/{book_slug}/page/{page_slug}"
|
||||
|
||||
# Build document metadata
|
||||
doc_metadata = {
|
||||
"page_id": page_id,
|
||||
"page_name": page_name,
|
||||
"page_slug": page_slug,
|
||||
"book_id": book_id,
|
||||
"book_slug": book_slug,
|
||||
"chapter_id": chapter_id,
|
||||
"base_url": bookstack_base_url,
|
||||
"page_url": page_url,
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
}
|
||||
|
||||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
# Ensure status is ready (might have been stuck in processing/pending)
|
||||
if not DocumentStatus.is_state(
|
||||
existing_document.status, DocumentStatus.READY
|
||||
):
|
||||
existing_document.status = DocumentStatus.ready()
|
||||
logger.info(
|
||||
f"Document for BookStack page {page_name} unchanged. Skipping."
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
else:
|
||||
# Content has changed - update the existing document
|
||||
logger.info(
|
||||
f"Content changed for BookStack page {page_name}. Updating document."
|
||||
)
|
||||
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
summary_metadata = {
|
||||
"page_name": page_name,
|
||||
"page_id": page_id,
|
||||
"book_id": book_id,
|
||||
"document_type": "BookStack Page",
|
||||
"connector_type": "BookStack",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
full_content, user_llm, summary_metadata
|
||||
)
|
||||
else:
|
||||
summary_content = (
|
||||
f"BookStack Page: {page_name}\n\nBook ID: {book_id}\n\n"
|
||||
)
|
||||
if page_content:
|
||||
content_preview = page_content[:1000]
|
||||
if len(page_content) > 1000:
|
||||
content_preview += "..."
|
||||
summary_content += (
|
||||
f"Content Preview: {content_preview}\n\n"
|
||||
)
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(full_content)
|
||||
|
||||
# Update existing document
|
||||
existing_document.title = f"BookStack - {page_name}"
|
||||
existing_document.content = summary_content
|
||||
existing_document.content_hash = content_hash
|
||||
existing_document.embedding = summary_embedding
|
||||
existing_document.document_metadata = doc_metadata
|
||||
existing_document.chunks = chunks
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
|
||||
documents_indexed += 1
|
||||
logger.info(f"Successfully updated BookStack page {page_name}")
|
||||
continue
|
||||
# Queue existing document for update (will be set to processing in Phase 2)
|
||||
pages_to_process.append(
|
||||
{
|
||||
"document": existing_document,
|
||||
"is_new": False,
|
||||
"page_id": page_id,
|
||||
"page_name": page_name,
|
||||
"page_slug": page_slug,
|
||||
"book_id": book_id,
|
||||
"book_slug": book_slug,
|
||||
"chapter_id": chapter_id,
|
||||
"page_url": page_url,
|
||||
"page_content": page_content,
|
||||
"full_content": full_content,
|
||||
"content_hash": content_hash,
|
||||
}
|
||||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
|
|
@ -345,17 +306,108 @@ async def index_bookstack_pages(
|
|||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Generate summary with metadata
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=page_name,
|
||||
document_type=DocumentType.BOOKSTACK_CONNECTOR,
|
||||
document_metadata={
|
||||
"page_id": page_id,
|
||||
"page_name": page_name,
|
||||
"page_slug": page_slug,
|
||||
"book_id": book_id,
|
||||
"book_slug": book_slug,
|
||||
"chapter_id": chapter_id,
|
||||
"base_url": bookstack_base_url,
|
||||
"page_url": page_url,
|
||||
"connector_id": connector_id,
|
||||
},
|
||||
content="Pending...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation - safe for async
|
||||
status=DocumentStatus.pending(), # Pending until processing starts
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
session.add(document)
|
||||
new_documents_created = True
|
||||
|
||||
pages_to_process.append(
|
||||
{
|
||||
"document": document,
|
||||
"is_new": True,
|
||||
"page_id": page_id,
|
||||
"page_name": page_name,
|
||||
"page_slug": page_slug,
|
||||
"book_id": book_id,
|
||||
"book_slug": book_slug,
|
||||
"chapter_id": chapter_id,
|
||||
"page_url": page_url,
|
||||
"page_content": page_content,
|
||||
"full_content": full_content,
|
||||
"content_hash": content_hash,
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Phase 1 for page: {e!s}", exc_info=True)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info(
|
||||
f"Phase 1: Committing {len([p for p in pages_to_process if p['is_new']])} pending documents"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each document one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(pages_to_process)} documents")
|
||||
|
||||
for item in pages_to_process:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
document = item["document"]
|
||||
try:
|
||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
# Heavy processing (LLM, embeddings, chunks)
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
# Build document metadata
|
||||
doc_metadata = {
|
||||
"page_id": item["page_id"],
|
||||
"page_name": item["page_name"],
|
||||
"page_slug": item["page_slug"],
|
||||
"book_id": item["book_id"],
|
||||
"book_slug": item["book_slug"],
|
||||
"chapter_id": item["chapter_id"],
|
||||
"base_url": bookstack_base_url,
|
||||
"page_url": item["page_url"],
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
|
||||
if user_llm:
|
||||
summary_metadata = {
|
||||
"page_name": page_name,
|
||||
"page_id": page_id,
|
||||
"book_id": book_id,
|
||||
"page_name": item["page_name"],
|
||||
"page_id": item["page_id"],
|
||||
"book_id": item["book_id"],
|
||||
"document_type": "BookStack Page",
|
||||
"connector_type": "BookStack",
|
||||
}
|
||||
|
|
@ -363,17 +415,15 @@ async def index_bookstack_pages(
|
|||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
full_content, user_llm, summary_metadata
|
||||
item["full_content"], user_llm, summary_metadata
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_content = (
|
||||
f"BookStack Page: {page_name}\n\nBook ID: {book_id}\n\n"
|
||||
)
|
||||
if page_content:
|
||||
summary_content = f"BookStack Page: {item['page_name']}\n\nBook ID: {item['book_id']}\n\n"
|
||||
if item["page_content"]:
|
||||
# Take first 1000 characters of content for summary
|
||||
content_preview = page_content[:1000]
|
||||
if len(page_content) > 1000:
|
||||
content_preview = item["page_content"][:1000]
|
||||
if len(item["page_content"]) > 1000:
|
||||
content_preview += "..."
|
||||
summary_content += f"Content Preview: {content_preview}\n\n"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
|
|
@ -381,30 +431,21 @@ async def index_bookstack_pages(
|
|||
)
|
||||
|
||||
# Process chunks - using the full page content
|
||||
chunks = await create_document_chunks(full_content)
|
||||
chunks = await create_document_chunks(item["full_content"])
|
||||
|
||||
# Create and store new document
|
||||
logger.info(f"Creating new document for page {page_name}")
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=f"BookStack - {page_name}",
|
||||
document_type=DocumentType.BOOKSTACK_CONNECTOR,
|
||||
document_metadata=doc_metadata,
|
||||
content=summary_content,
|
||||
content_hash=content_hash,
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
# Update document to READY with actual content
|
||||
document.title = item["page_name"]
|
||||
document.content = summary_content
|
||||
document.content_hash = item["content_hash"]
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = doc_metadata
|
||||
safe_set_chunks(document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
session.add(document)
|
||||
documents_indexed += 1
|
||||
logger.info(f"Successfully indexed new page {page_name}")
|
||||
|
||||
# Batch commit every 10 documents
|
||||
# Batch commit every 10 documents (for ready status updates)
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed} BookStack pages processed so far"
|
||||
|
|
@ -413,46 +454,76 @@ async def index_bookstack_pages(
|
|||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error processing page {page.get('name', 'Unknown')}: {e!s}",
|
||||
f"Error processing page {item.get('page_name', 'Unknown')}: {e!s}",
|
||||
exc_info=True,
|
||||
)
|
||||
# Mark document as failed with reason (visible in UI)
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(e))
|
||||
document.updated_at = get_current_timestamp()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
skipped_pages.append(
|
||||
f"{page.get('name', 'Unknown')} (processing error)"
|
||||
f"{item.get('page_name', 'Unknown')} (processing error)"
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue # Skip this page and continue with others
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# Update the last_indexed_at timestamp for the connector only if requested
|
||||
total_processed = documents_indexed
|
||||
if update_last_indexed:
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
|
||||
# This ensures the UI shows "Last indexed" instead of "Never indexed"
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
|
||||
# Final commit for any remaining documents not yet committed in batches
|
||||
logger.info(
|
||||
f"Final commit: Total {documents_indexed} BookStack pages processed"
|
||||
)
|
||||
await session.commit()
|
||||
logger.info("Successfully committed all BookStack document changes to database")
|
||||
try:
|
||||
await session.commit()
|
||||
logger.info(
|
||||
"Successfully committed all BookStack document changes to database"
|
||||
)
|
||||
except Exception as e:
|
||||
# Handle any remaining integrity errors gracefully (race conditions, etc.)
|
||||
if (
|
||||
"duplicate key value violates unique constraint" in str(e).lower()
|
||||
or "uniqueviolationerror" in str(e).lower()
|
||||
):
|
||||
logger.warning(
|
||||
f"Duplicate content_hash detected during final commit. "
|
||||
f"This may occur if the same page was indexed by multiple connectors. "
|
||||
f"Rolling back and continuing. Error: {e!s}"
|
||||
)
|
||||
await session.rollback()
|
||||
# Don't fail the entire task - some documents may have been successfully indexed
|
||||
else:
|
||||
raise
|
||||
|
||||
# Build warning message if there were issues
|
||||
warning_parts = []
|
||||
if documents_failed > 0:
|
||||
warning_parts.append(f"{documents_failed} failed")
|
||||
warning_message = ", ".join(warning_parts) if warning_parts else None
|
||||
|
||||
# Log success
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Successfully completed BookStack indexing for connector {connector_id}",
|
||||
{
|
||||
"pages_processed": total_processed,
|
||||
"pages_processed": documents_indexed,
|
||||
"documents_indexed": documents_indexed,
|
||||
"documents_skipped": documents_skipped,
|
||||
"documents_failed": documents_failed,
|
||||
"skipped_pages_count": len(skipped_pages),
|
||||
},
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"BookStack indexing completed: {documents_indexed} new pages, {documents_skipped} skipped"
|
||||
f"BookStack indexing completed: {documents_indexed} ready, "
|
||||
f"{documents_skipped} skipped, {documents_failed} failed"
|
||||
)
|
||||
return (
|
||||
total_processed,
|
||||
None,
|
||||
) # Return None as the error message to indicate success
|
||||
return documents_indexed, warning_message
|
||||
|
||||
except SQLAlchemyError as db_error:
|
||||
await session.rollback()
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
"""
|
||||
ClickUp connector indexer.
|
||||
|
||||
Implements 2-phase document status updates for real-time UI feedback:
|
||||
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
|
||||
- Phase 2: Process each document: pending → processing → ready/failed
|
||||
"""
|
||||
|
||||
import contextlib
|
||||
|
|
@ -12,7 +16,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
|
||||
from app.config import config
|
||||
from app.connectors.clickup_history import ClickUpHistoryConnector
|
||||
from app.db import Document, DocumentType, SearchSourceConnectorType
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
|
|
@ -28,6 +32,7 @@ from .base import (
|
|||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
safe_set_chunks,
|
||||
update_connector_last_indexed,
|
||||
)
|
||||
|
||||
|
|
@ -141,10 +146,18 @@ async def index_clickup_tasks(
|
|||
|
||||
documents_indexed = 0
|
||||
documents_skipped = 0
|
||||
documents_failed = 0
|
||||
|
||||
# Heartbeat tracking - update notification periodically to prevent appearing stuck
|
||||
last_heartbeat_time = time.time()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 1: Collect all tasks and create pending documents
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
# =======================================================================
|
||||
tasks_to_process = [] # List of dicts with document and task data
|
||||
new_documents_created = False
|
||||
|
||||
# Iterate workspaces and fetch tasks
|
||||
for workspace in workspaces:
|
||||
workspace_id = workspace.get("id")
|
||||
|
|
@ -183,15 +196,6 @@ async def index_clickup_tasks(
|
|||
)
|
||||
|
||||
for task in tasks:
|
||||
# Check if it's time for a heartbeat update
|
||||
if (
|
||||
on_heartbeat_callback
|
||||
and (time.time() - last_heartbeat_time)
|
||||
>= HEARTBEAT_INTERVAL_SECONDS
|
||||
):
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = time.time()
|
||||
|
||||
try:
|
||||
task_id = task.get("id")
|
||||
task_name = task.get("name", "Untitled Task")
|
||||
|
|
@ -255,73 +259,38 @@ async def index_clickup_tasks(
|
|||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
# Ensure status is ready (might have been stuck in processing/pending)
|
||||
if not DocumentStatus.is_state(
|
||||
existing_document.status, DocumentStatus.READY
|
||||
):
|
||||
existing_document.status = DocumentStatus.ready()
|
||||
logger.info(
|
||||
f"Document for ClickUp task {task_name} unchanged. Skipping."
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
else:
|
||||
# Content has changed - update the existing document
|
||||
# Queue existing document for update (will be set to processing in Phase 2)
|
||||
logger.info(
|
||||
f"Content changed for ClickUp task {task_name}. Updating document."
|
||||
f"Content changed for ClickUp task {task_name}. Queuing for update."
|
||||
)
|
||||
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
tasks_to_process.append(
|
||||
{
|
||||
"document": existing_document,
|
||||
"is_new": False,
|
||||
"task_content": task_content,
|
||||
"content_hash": content_hash,
|
||||
"task_id": task_id,
|
||||
"task_name": task_name,
|
||||
"task_status": task_status,
|
||||
"task_priority": task_priority,
|
||||
"task_list": task_list_name,
|
||||
"task_space": task_space_name,
|
||||
"assignees": len(task_assignees),
|
||||
"document_type": "ClickUp Task",
|
||||
"connector_type": "ClickUp",
|
||||
"task_list_name": task_list_name,
|
||||
"task_space_name": task_space_name,
|
||||
"task_assignees": task_assignees,
|
||||
"task_due_date": task_due_date,
|
||||
"task_created": task_created,
|
||||
"task_updated": task_updated,
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
task_content, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
summary_content = task_content
|
||||
summary_embedding = (
|
||||
config.embedding_model_instance.embed(task_content)
|
||||
)
|
||||
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(task_content)
|
||||
|
||||
# Update existing document
|
||||
existing_document.title = f"Task - {task_name}"
|
||||
existing_document.content = summary_content
|
||||
existing_document.content_hash = content_hash
|
||||
existing_document.embedding = summary_embedding
|
||||
existing_document.document_metadata = {
|
||||
"task_id": task_id,
|
||||
"task_name": task_name,
|
||||
"task_status": task_status,
|
||||
"task_priority": task_priority,
|
||||
"task_assignees": task_assignees,
|
||||
"task_due_date": task_due_date,
|
||||
"task_created": task_created,
|
||||
"task_updated": task_updated,
|
||||
"indexed_at": datetime.now().strftime(
|
||||
"%Y-%m-%d %H:%M:%S"
|
||||
),
|
||||
}
|
||||
existing_document.chunks = chunks
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
|
||||
documents_indexed += 1
|
||||
logger.info(
|
||||
f"Successfully updated ClickUp task {task_name}"
|
||||
)
|
||||
continue
|
||||
|
||||
|
|
@ -341,42 +310,10 @@ async def index_clickup_tasks(
|
|||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
"task_id": task_id,
|
||||
"task_name": task_name,
|
||||
"task_status": task_status,
|
||||
"task_priority": task_priority,
|
||||
"task_list": task_list_name,
|
||||
"task_space": task_space_name,
|
||||
"assignees": len(task_assignees),
|
||||
"document_type": "ClickUp Task",
|
||||
"connector_type": "ClickUp",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
task_content, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_content = task_content
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
task_content
|
||||
)
|
||||
|
||||
chunks = await create_document_chunks(task_content)
|
||||
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=f"Task - {task_name}",
|
||||
title=task_name,
|
||||
document_type=DocumentType.CLICKUP_CONNECTOR,
|
||||
document_metadata={
|
||||
"task_id": task_id,
|
||||
|
|
@ -387,44 +324,180 @@ async def index_clickup_tasks(
|
|||
"task_due_date": task_due_date,
|
||||
"task_created": task_created,
|
||||
"task_updated": task_updated,
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
},
|
||||
content=summary_content,
|
||||
content_hash=content_hash,
|
||||
content="Pending...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation - safe for async
|
||||
status=DocumentStatus.pending(), # Pending until processing starts
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
documents_indexed += 1
|
||||
logger.info(f"Successfully indexed new task {task_name}")
|
||||
new_documents_created = True
|
||||
|
||||
# Batch commit every 10 documents
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed} ClickUp tasks processed so far"
|
||||
)
|
||||
await session.commit()
|
||||
tasks_to_process.append(
|
||||
{
|
||||
"document": document,
|
||||
"is_new": True,
|
||||
"task_content": task_content,
|
||||
"content_hash": content_hash,
|
||||
"task_id": task_id,
|
||||
"task_name": task_name,
|
||||
"task_status": task_status,
|
||||
"task_priority": task_priority,
|
||||
"task_list_name": task_list_name,
|
||||
"task_space_name": task_space_name,
|
||||
"task_assignees": task_assignees,
|
||||
"task_due_date": task_due_date,
|
||||
"task_created": task_created,
|
||||
"task_updated": task_updated,
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error processing task {task.get('name', 'Unknown')}: {e!s}",
|
||||
f"Error in Phase 1 for task {task.get('name', 'Unknown')}: {e!s}",
|
||||
exc_info=True,
|
||||
)
|
||||
documents_skipped += 1
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info(
|
||||
f"Phase 1: Committing {len([t for t in tasks_to_process if t['is_new']])} pending documents"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each document one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(tasks_to_process)} documents")
|
||||
|
||||
for item in tasks_to_process:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
document = item["document"]
|
||||
try:
|
||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
# Heavy processing (LLM, embeddings, chunks)
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata_for_summary = {
|
||||
"task_id": item["task_id"],
|
||||
"task_name": item["task_name"],
|
||||
"task_status": item["task_status"],
|
||||
"task_priority": item["task_priority"],
|
||||
"task_list": item["task_list_name"],
|
||||
"task_space": item["task_space_name"],
|
||||
"assignees": len(item["task_assignees"]),
|
||||
"document_type": "ClickUp Task",
|
||||
"connector_type": "ClickUp",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
item["task_content"], user_llm, document_metadata_for_summary
|
||||
)
|
||||
else:
|
||||
summary_content = item["task_content"]
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
item["task_content"]
|
||||
)
|
||||
|
||||
chunks = await create_document_chunks(item["task_content"])
|
||||
|
||||
# Update document to READY with actual content
|
||||
document.title = item["task_name"]
|
||||
document.content = summary_content
|
||||
document.content_hash = item["content_hash"]
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = {
|
||||
"task_id": item["task_id"],
|
||||
"task_name": item["task_name"],
|
||||
"task_status": item["task_status"],
|
||||
"task_priority": item["task_priority"],
|
||||
"task_assignees": item["task_assignees"],
|
||||
"task_due_date": item["task_due_date"],
|
||||
"task_created": item["task_created"],
|
||||
"task_updated": item["task_updated"],
|
||||
"connector_id": connector_id,
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
documents_indexed += 1
|
||||
|
||||
# Batch commit every 10 documents (for ready status updates)
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed} ClickUp tasks processed so far"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error processing task {item.get('task_name', 'Unknown')}: {e!s}",
|
||||
exc_info=True,
|
||||
)
|
||||
# Mark document as failed with reason (visible in UI)
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(e))
|
||||
document.updated_at = get_current_timestamp()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
total_processed = documents_indexed
|
||||
|
||||
if total_processed > 0:
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
|
||||
# This ensures the UI shows "Last indexed" instead of "Never indexed"
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
|
||||
# Final commit for any remaining documents not yet committed in batches
|
||||
logger.info(f"Final commit: Total {documents_indexed} ClickUp tasks processed")
|
||||
await session.commit()
|
||||
try:
|
||||
await session.commit()
|
||||
logger.info(
|
||||
"Successfully committed all ClickUp document changes to database"
|
||||
)
|
||||
except Exception as e:
|
||||
# Handle any remaining integrity errors gracefully (race conditions, etc.)
|
||||
if (
|
||||
"duplicate key value violates unique constraint" in str(e).lower()
|
||||
or "uniqueviolationerror" in str(e).lower()
|
||||
):
|
||||
logger.warning(
|
||||
f"Duplicate content_hash detected during final commit. "
|
||||
f"This may occur if the same task was indexed by multiple connectors. "
|
||||
f"Rolling back and continuing. Error: {e!s}"
|
||||
)
|
||||
await session.rollback()
|
||||
# Don't fail the entire task - some documents may have been successfully indexed
|
||||
else:
|
||||
raise
|
||||
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
|
|
@ -433,11 +506,12 @@ async def index_clickup_tasks(
|
|||
"pages_processed": total_processed,
|
||||
"documents_indexed": documents_indexed,
|
||||
"documents_skipped": documents_skipped,
|
||||
"documents_failed": documents_failed,
|
||||
},
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"clickup indexing completed: {documents_indexed} new tasks, {documents_skipped} skipped"
|
||||
f"clickup indexing completed: {documents_indexed} ready, {documents_skipped} skipped, {documents_failed} failed"
|
||||
)
|
||||
|
||||
# Close client connection
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
"""
|
||||
Confluence connector indexer.
|
||||
|
||||
Provides real-time document status updates during indexing using a two-phase approach:
|
||||
- Phase 1: Create all documents with PENDING status (visible in UI immediately)
|
||||
- Phase 2: Process each document one by one (PENDING → PROCESSING → READY/FAILED)
|
||||
"""
|
||||
|
||||
import contextlib
|
||||
|
|
@ -12,7 +16,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
|
||||
from app.config import config
|
||||
from app.connectors.confluence_history import ConfluenceHistoryConnector
|
||||
from app.db import Document, DocumentType, SearchSourceConnectorType
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
|
|
@ -29,6 +33,7 @@ from .base import (
|
|||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
safe_set_chunks,
|
||||
update_connector_last_indexed,
|
||||
)
|
||||
|
||||
|
|
@ -180,22 +185,22 @@ async def index_confluence_pages(
|
|||
await confluence_client.close()
|
||||
return 0, f"Error fetching Confluence pages: {e!s}"
|
||||
|
||||
# Process and index each page
|
||||
# =======================================================================
|
||||
# PHASE 1: Analyze all pages, create pending documents
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
# =======================================================================
|
||||
documents_indexed = 0
|
||||
skipped_pages = []
|
||||
documents_skipped = 0
|
||||
documents_failed = 0
|
||||
duplicate_content_count = 0
|
||||
|
||||
# Heartbeat tracking - update notification periodically to prevent appearing stuck
|
||||
last_heartbeat_time = time.time()
|
||||
|
||||
pages_to_process = [] # List of dicts with document and page data
|
||||
new_documents_created = False
|
||||
|
||||
for page in pages:
|
||||
# Check if it's time for a heartbeat update
|
||||
if (
|
||||
on_heartbeat_callback
|
||||
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
|
||||
):
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = time.time()
|
||||
try:
|
||||
page_id = page.get("id")
|
||||
page_title = page.get("title", "")
|
||||
|
|
@ -205,7 +210,6 @@ async def index_confluence_pages(
|
|||
logger.warning(
|
||||
f"Skipping page with missing ID or title: {page_id or 'Unknown'}"
|
||||
)
|
||||
skipped_pages.append(f"{page_title or 'Unknown'} (missing data)")
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
|
|
@ -236,7 +240,6 @@ async def index_confluence_pages(
|
|||
|
||||
if not full_content.strip():
|
||||
logger.warning(f"Skipping page with no content: {page_title}")
|
||||
skipped_pages.append(f"{page_title} (no content)")
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
|
|
@ -258,74 +261,29 @@ async def index_confluence_pages(
|
|||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
logger.info(
|
||||
f"Document for Confluence page {page_title} unchanged. Skipping."
|
||||
)
|
||||
# Ensure status is ready (might have been stuck in processing/pending)
|
||||
if not DocumentStatus.is_state(
|
||||
existing_document.status, DocumentStatus.READY
|
||||
):
|
||||
existing_document.status = DocumentStatus.ready()
|
||||
documents_skipped += 1
|
||||
continue
|
||||
else:
|
||||
# Content has changed - update the existing document
|
||||
logger.info(
|
||||
f"Content changed for Confluence page {page_title}. Updating document."
|
||||
)
|
||||
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
"page_title": page_title,
|
||||
"page_id": page_id,
|
||||
"space_id": space_id,
|
||||
"comment_count": comment_count,
|
||||
"document_type": "Confluence Page",
|
||||
"connector_type": "Confluence",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
full_content, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
summary_content = f"Confluence Page: {page_title}\n\nSpace ID: {space_id}\n\n"
|
||||
if page_content:
|
||||
content_preview = page_content[:1000]
|
||||
if len(page_content) > 1000:
|
||||
content_preview += "..."
|
||||
summary_content += (
|
||||
f"Content Preview: {content_preview}\n\n"
|
||||
)
|
||||
summary_content += f"Comments: {comment_count}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(full_content)
|
||||
|
||||
# Update existing document
|
||||
existing_document.title = f"Confluence - {page_title}"
|
||||
existing_document.content = summary_content
|
||||
existing_document.content_hash = content_hash
|
||||
existing_document.embedding = summary_embedding
|
||||
existing_document.document_metadata = {
|
||||
# Queue existing document for update (will be set to processing in Phase 2)
|
||||
pages_to_process.append(
|
||||
{
|
||||
"document": existing_document,
|
||||
"is_new": False,
|
||||
"full_content": full_content,
|
||||
"page_content": page_content,
|
||||
"content_hash": content_hash,
|
||||
"page_id": page_id,
|
||||
"page_title": page_title,
|
||||
"space_id": space_id,
|
||||
"comment_count": comment_count,
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
}
|
||||
existing_document.chunks = chunks
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
|
||||
documents_indexed += 1
|
||||
logger.info(
|
||||
f"Successfully updated Confluence page {page_title}"
|
||||
)
|
||||
continue
|
||||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
|
|
@ -340,21 +298,92 @@ async def index_confluence_pages(
|
|||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
duplicate_content_count += 1
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Generate summary with metadata
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=page_title,
|
||||
document_type=DocumentType.CONFLUENCE_CONNECTOR,
|
||||
document_metadata={
|
||||
"page_id": page_id,
|
||||
"page_title": page_title,
|
||||
"space_id": space_id,
|
||||
"comment_count": comment_count,
|
||||
"connector_id": connector_id,
|
||||
},
|
||||
content="Pending...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation - safe for async
|
||||
status=DocumentStatus.pending(), # Pending until processing starts
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
session.add(document)
|
||||
new_documents_created = True
|
||||
|
||||
pages_to_process.append(
|
||||
{
|
||||
"document": document,
|
||||
"is_new": True,
|
||||
"full_content": full_content,
|
||||
"page_content": page_content,
|
||||
"content_hash": content_hash,
|
||||
"page_id": page_id,
|
||||
"page_title": page_title,
|
||||
"space_id": space_id,
|
||||
"comment_count": comment_count,
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Phase 1 for page: {e!s}", exc_info=True)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info(
|
||||
f"Phase 1: Committing {len([p for p in pages_to_process if p['is_new']])} pending documents"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each document one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(pages_to_process)} documents")
|
||||
|
||||
for item in pages_to_process:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
document = item["document"]
|
||||
try:
|
||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
# Heavy processing (LLM, embeddings, chunks)
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
"page_title": page_title,
|
||||
"page_id": page_id,
|
||||
"space_id": space_id,
|
||||
"comment_count": comment_count,
|
||||
"page_title": item["page_title"],
|
||||
"page_id": item["page_id"],
|
||||
"space_id": item["space_id"],
|
||||
"comment_count": item["comment_count"],
|
||||
"document_type": "Confluence Page",
|
||||
"connector_type": "Confluence",
|
||||
}
|
||||
|
|
@ -362,55 +391,45 @@ async def index_confluence_pages(
|
|||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
full_content, user_llm, document_metadata
|
||||
item["full_content"], user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_content = (
|
||||
f"Confluence Page: {page_title}\n\nSpace ID: {space_id}\n\n"
|
||||
)
|
||||
if page_content:
|
||||
# Take first 500 characters of content for summary
|
||||
content_preview = page_content[:1000]
|
||||
if len(page_content) > 1000:
|
||||
summary_content = f"Confluence Page: {item['page_title']}\n\nSpace ID: {item['space_id']}\n\n"
|
||||
if item["page_content"]:
|
||||
# Take first 1000 characters of content for summary
|
||||
content_preview = item["page_content"][:1000]
|
||||
if len(item["page_content"]) > 1000:
|
||||
content_preview += "..."
|
||||
summary_content += f"Content Preview: {content_preview}\n\n"
|
||||
summary_content += f"Comments: {comment_count}"
|
||||
summary_content += f"Comments: {item['comment_count']}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
# Process chunks - using the full page content with comments
|
||||
chunks = await create_document_chunks(full_content)
|
||||
chunks = await create_document_chunks(item["full_content"])
|
||||
|
||||
# Create and store new document
|
||||
logger.info(f"Creating new document for page {page_title}")
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=f"Confluence - {page_title}",
|
||||
document_type=DocumentType.CONFLUENCE_CONNECTOR,
|
||||
document_metadata={
|
||||
"page_id": page_id,
|
||||
"page_title": page_title,
|
||||
"space_id": space_id,
|
||||
"comment_count": comment_count,
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
},
|
||||
content=summary_content,
|
||||
content_hash=content_hash,
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
# Update document to READY with actual content
|
||||
document.title = item["page_title"]
|
||||
document.content = summary_content
|
||||
document.content_hash = item["content_hash"]
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = {
|
||||
"page_id": item["page_id"],
|
||||
"page_title": item["page_title"],
|
||||
"space_id": item["space_id"],
|
||||
"comment_count": item["comment_count"],
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
session.add(document)
|
||||
documents_indexed += 1
|
||||
logger.info(f"Successfully indexed new page {page_title}")
|
||||
|
||||
# Batch commit every 10 documents
|
||||
# Batch commit every 10 documents (for ready status updates)
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed} Confluence pages processed so far"
|
||||
|
|
@ -419,53 +438,80 @@ async def index_confluence_pages(
|
|||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error processing page {page.get('title', 'Unknown')}: {e!s}",
|
||||
f"Error processing page {item.get('page_title', 'Unknown')}: {e!s}",
|
||||
exc_info=True,
|
||||
)
|
||||
skipped_pages.append(
|
||||
f"{page.get('title', 'Unknown')} (processing error)"
|
||||
)
|
||||
documents_skipped += 1
|
||||
# Mark document as failed with reason (visible in UI)
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(e))
|
||||
document.updated_at = get_current_timestamp()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
documents_failed += 1
|
||||
continue # Skip this page and continue with others
|
||||
|
||||
# Update the last_indexed_at timestamp for the connector only if requested
|
||||
total_processed = documents_indexed
|
||||
if update_last_indexed:
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
|
||||
# This ensures the UI shows "Last indexed" instead of "Never indexed"
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
|
||||
# Final commit for any remaining documents not yet committed in batches
|
||||
# Final commit to ensure all documents are persisted (safety net)
|
||||
logger.info(
|
||||
f"Final commit: Total {documents_indexed} Confluence pages processed"
|
||||
)
|
||||
await session.commit()
|
||||
logger.info(
|
||||
"Successfully committed all Confluence document changes to database"
|
||||
)
|
||||
try:
|
||||
await session.commit()
|
||||
logger.info(
|
||||
"Successfully committed all Confluence document changes to database"
|
||||
)
|
||||
except Exception as e:
|
||||
# Handle any remaining integrity errors gracefully (race conditions, etc.)
|
||||
if (
|
||||
"duplicate key value violates unique constraint" in str(e).lower()
|
||||
or "uniqueviolationerror" in str(e).lower()
|
||||
):
|
||||
logger.warning(
|
||||
f"Duplicate content_hash detected during final commit. "
|
||||
f"This may occur if the same page was indexed by multiple connectors. "
|
||||
f"Rolling back and continuing. Error: {e!s}"
|
||||
)
|
||||
await session.rollback()
|
||||
# Don't fail the entire task - some documents may have been successfully indexed
|
||||
else:
|
||||
raise
|
||||
|
||||
# Build warning message if there were issues
|
||||
warning_parts = []
|
||||
if duplicate_content_count > 0:
|
||||
warning_parts.append(f"{duplicate_content_count} duplicate")
|
||||
if documents_failed > 0:
|
||||
warning_parts.append(f"{documents_failed} failed")
|
||||
warning_message = ", ".join(warning_parts) if warning_parts else None
|
||||
|
||||
# Log success
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Successfully completed Confluence indexing for connector {connector_id}",
|
||||
{
|
||||
"pages_processed": total_processed,
|
||||
"documents_indexed": documents_indexed,
|
||||
"documents_skipped": documents_skipped,
|
||||
"skipped_pages_count": len(skipped_pages),
|
||||
"documents_failed": documents_failed,
|
||||
"duplicate_content_count": duplicate_content_count,
|
||||
},
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Confluence indexing completed: {documents_indexed} new pages, {documents_skipped} skipped"
|
||||
f"Confluence indexing completed: {documents_indexed} ready, "
|
||||
f"{documents_skipped} skipped, {documents_failed} failed "
|
||||
f"({duplicate_content_count} duplicate content)"
|
||||
)
|
||||
|
||||
# Close the client connection
|
||||
if confluence_client:
|
||||
await confluence_client.close()
|
||||
|
||||
return (
|
||||
total_processed,
|
||||
None,
|
||||
) # Return None as the error message to indicate success
|
||||
return documents_indexed, warning_message
|
||||
|
||||
except SQLAlchemyError as db_error:
|
||||
await session.rollback()
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
"""
|
||||
Discord connector indexer.
|
||||
|
||||
Implements 2-phase document status updates for real-time UI feedback:
|
||||
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
|
||||
- Phase 2: Process each document: pending → processing → ready/failed
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
|
|
@ -12,7 +16,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
|
||||
from app.config import config
|
||||
from app.connectors.discord_connector import DiscordConnector
|
||||
from app.db import Document, DocumentType, SearchSourceConnectorType
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
|
|
@ -27,6 +31,7 @@ from .base import (
|
|||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
safe_set_chunks,
|
||||
update_connector_last_indexed,
|
||||
)
|
||||
|
||||
|
|
@ -48,7 +53,11 @@ async def index_discord_messages(
|
|||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||
) -> tuple[int, str | None]:
|
||||
"""
|
||||
Index Discord messages from all accessible channels.
|
||||
Index Discord messages from the configured guild's channels.
|
||||
|
||||
Implements 2-phase document status updates for real-time UI feedback:
|
||||
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
|
||||
- Phase 2: Process each document: pending → processing → ready/failed
|
||||
|
||||
Args:
|
||||
session: Database session
|
||||
|
|
@ -113,6 +122,37 @@ async def index_discord_messages(
|
|||
|
||||
logger.info(f"Starting Discord indexing for connector {connector_id}")
|
||||
|
||||
# =======================================================================
|
||||
# GUILD FILTERING: Only index the specific guild configured for this connector
|
||||
# =======================================================================
|
||||
# Extract guild_id from connector config (set during OAuth flow)
|
||||
configured_guild_id = connector.config.get("guild_id")
|
||||
configured_guild_name = connector.config.get("guild_name")
|
||||
|
||||
# Legacy connector check - if no guild_id, we need to warn and handle gracefully
|
||||
is_legacy_connector = configured_guild_id is None
|
||||
|
||||
if is_legacy_connector:
|
||||
logger.warning(
|
||||
f"Discord connector {connector_id} has no guild_id configured. "
|
||||
"This is a legacy connector. Please reconnect the Discord server to fix this. "
|
||||
"For now, indexing will be skipped to prevent indexing unwanted servers."
|
||||
)
|
||||
await task_logger.log_task_failure(
|
||||
log_entry,
|
||||
f"Legacy Discord connector {connector_id} missing guild_id",
|
||||
"No guild_id configured. Please reconnect this Discord server.",
|
||||
{"error_type": "MissingGuildId", "is_legacy": True},
|
||||
)
|
||||
return (
|
||||
0,
|
||||
"This Discord connector needs to be reconnected. Please disconnect and reconnect your Discord server to enable indexing.",
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Configured to index guild: {configured_guild_name} ({configured_guild_id})"
|
||||
)
|
||||
|
||||
# Initialize Discord client with OAuth credentials support
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
|
|
@ -255,77 +295,66 @@ async def index_discord_messages(
|
|||
try:
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Starting Discord bot and fetching guilds for connector {connector_id}",
|
||||
{"stage": "fetch_guilds"},
|
||||
f"Starting Discord bot for connector {connector_id}",
|
||||
{"stage": "bot_initialization"},
|
||||
)
|
||||
|
||||
logger.info("Starting Discord bot to fetch guilds")
|
||||
logger.info("Starting Discord bot")
|
||||
discord_client._bot_task = asyncio.create_task(discord_client.start_bot())
|
||||
await discord_client._wait_until_ready()
|
||||
|
||||
logger.info("Fetching Discord guilds")
|
||||
guilds = await discord_client.get_guilds()
|
||||
logger.info(f"Found {len(guilds)} guilds")
|
||||
# We only process the configured guild, not all guilds
|
||||
logger.info(
|
||||
f"Processing configured guild only: {configured_guild_name} ({configured_guild_id})"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
await task_logger.log_task_failure(
|
||||
log_entry,
|
||||
f"Failed to get Discord guilds for connector {connector_id}",
|
||||
f"Failed to start Discord bot for connector {connector_id}",
|
||||
str(e),
|
||||
{"error_type": "GuildFetchError"},
|
||||
{"error_type": "BotStartError"},
|
||||
)
|
||||
logger.error(f"Failed to get Discord guilds: {e!s}", exc_info=True)
|
||||
logger.error(f"Failed to start Discord bot: {e!s}", exc_info=True)
|
||||
await discord_client.close_bot()
|
||||
return 0, f"Failed to get Discord guilds: {e!s}"
|
||||
|
||||
if not guilds:
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"No Discord guilds found for connector {connector_id}",
|
||||
{"guilds_found": 0},
|
||||
)
|
||||
logger.info("No Discord guilds found to index")
|
||||
await discord_client.close_bot()
|
||||
return 0, "No Discord guilds found"
|
||||
return 0, f"Failed to start Discord bot: {e!s}"
|
||||
|
||||
# Track results
|
||||
documents_indexed = 0
|
||||
documents_skipped = 0
|
||||
documents_failed = 0
|
||||
duplicate_content_count = 0
|
||||
skipped_channels: list[str] = []
|
||||
|
||||
# Heartbeat tracking - update notification periodically to prevent appearing stuck
|
||||
last_heartbeat_time = time.time()
|
||||
|
||||
# Process each guild and channel
|
||||
# Use the configured guild info
|
||||
guild_id = configured_guild_id
|
||||
guild_name = configured_guild_name or "Unknown Guild"
|
||||
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Starting to process {len(guilds)} Discord guilds",
|
||||
{"stage": "process_guilds", "total_guilds": len(guilds)},
|
||||
f"Processing Discord guild: {guild_name}",
|
||||
{"stage": "process_guild", "guild_id": guild_id, "guild_name": guild_name},
|
||||
)
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 1: Collect all messages and create pending documents
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
# =======================================================================
|
||||
messages_to_process = [] # List of dicts with document and message data
|
||||
new_documents_created = False
|
||||
|
||||
try:
|
||||
for guild in guilds:
|
||||
# Check if it's time for a heartbeat update
|
||||
if (
|
||||
on_heartbeat_callback
|
||||
and (time.time() - last_heartbeat_time)
|
||||
>= HEARTBEAT_INTERVAL_SECONDS
|
||||
):
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = time.time()
|
||||
guild_id = guild["id"]
|
||||
guild_name = guild["name"]
|
||||
logger.info(f"Processing guild: {guild_name} ({guild_id})")
|
||||
|
||||
try:
|
||||
channels = await discord_client.get_text_channels(guild_id)
|
||||
if not channels:
|
||||
logger.info(
|
||||
f"No channels found in guild {guild_name}. Skipping."
|
||||
)
|
||||
skipped_channels.append(f"{guild_name} (no channels)")
|
||||
documents_skipped += 1
|
||||
continue
|
||||
logger.info(f"Processing guild: {guild_name} ({guild_id})")
|
||||
|
||||
try:
|
||||
channels = await discord_client.get_text_channels(guild_id)
|
||||
if not channels:
|
||||
logger.info(f"No channels found in guild {guild_name}. Skipping.")
|
||||
skipped_channels.append(f"{guild_name} (no channels)")
|
||||
else:
|
||||
for channel in channels:
|
||||
channel_id = channel["id"]
|
||||
channel_name = channel["name"]
|
||||
|
|
@ -343,14 +372,12 @@ async def index_discord_messages(
|
|||
skipped_channels.append(
|
||||
f"{guild_name}#{channel_name} (fetch error)"
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
if not messages:
|
||||
logger.info(
|
||||
f"No messages found in channel {channel_name} for the specified date range."
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Filter/format messages
|
||||
|
|
@ -365,7 +392,6 @@ async def index_discord_messages(
|
|||
logger.info(
|
||||
f"No valid messages found in channel {channel_name} after filtering."
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Process each message as an individual document (like Slack)
|
||||
|
|
@ -427,32 +453,23 @@ async def index_discord_messages(
|
|||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
logger.info(
|
||||
f"Document for Discord message {msg_id} in {guild_name}#{channel_name} unchanged. Skipping."
|
||||
)
|
||||
# Ensure status is ready (might have been stuck in processing/pending)
|
||||
if not DocumentStatus.is_state(
|
||||
existing_document.status, DocumentStatus.READY
|
||||
):
|
||||
existing_document.status = (
|
||||
DocumentStatus.ready()
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
else:
|
||||
# Content has changed - update the existing document
|
||||
logger.info(
|
||||
f"Content changed for Discord message {msg_id} in {guild_name}#{channel_name}. Updating document."
|
||||
)
|
||||
|
||||
# Update chunks and embedding
|
||||
chunks = await create_document_chunks(
|
||||
combined_document_string
|
||||
)
|
||||
doc_embedding = (
|
||||
config.embedding_model_instance.embed(
|
||||
combined_document_string
|
||||
)
|
||||
)
|
||||
|
||||
# Update existing document
|
||||
existing_document.content = combined_document_string
|
||||
existing_document.content_hash = content_hash
|
||||
existing_document.embedding = doc_embedding
|
||||
existing_document.document_metadata = {
|
||||
# Queue existing document for update (will be set to processing in Phase 2)
|
||||
messages_to_process.append(
|
||||
{
|
||||
"document": existing_document,
|
||||
"is_new": False,
|
||||
"combined_document_string": combined_document_string,
|
||||
"content_hash": content_hash,
|
||||
"guild_name": guild_name,
|
||||
"guild_id": guild_id,
|
||||
"channel_name": channel_name,
|
||||
|
|
@ -460,22 +477,9 @@ async def index_discord_messages(
|
|||
"message_id": msg_id,
|
||||
"message_timestamp": msg_timestamp,
|
||||
"message_user_name": msg_user_name,
|
||||
"indexed_at": datetime.now(UTC).strftime(
|
||||
"%Y-%m-%d %H:%M:%S"
|
||||
),
|
||||
}
|
||||
|
||||
# Delete old chunks and add new ones
|
||||
existing_document.chunks = chunks
|
||||
existing_document.updated_at = (
|
||||
get_current_timestamp()
|
||||
)
|
||||
|
||||
documents_indexed += 1
|
||||
logger.info(
|
||||
f"Successfully updated Discord message {msg_id}"
|
||||
)
|
||||
continue
|
||||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
|
|
@ -492,22 +496,14 @@ async def index_discord_messages(
|
|||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
duplicate_content_count += 1
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(
|
||||
combined_document_string
|
||||
)
|
||||
doc_embedding = config.embedding_model_instance.embed(
|
||||
combined_document_string
|
||||
)
|
||||
|
||||
# Create and store new document
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=f"Discord - {guild_name}#{channel_name}",
|
||||
title=f"{guild_name}#{channel_name}",
|
||||
document_type=DocumentType.DISCORD_CONNECTOR,
|
||||
document_metadata={
|
||||
"guild_name": guild_name,
|
||||
|
|
@ -515,87 +511,177 @@ async def index_discord_messages(
|
|||
"channel_name": channel_name,
|
||||
"channel_id": channel_id,
|
||||
"message_id": msg_id,
|
||||
"message_timestamp": msg_timestamp,
|
||||
"message_user_name": msg_user_name,
|
||||
"indexed_at": datetime.now(UTC).strftime(
|
||||
"%Y-%m-%d %H:%M:%S"
|
||||
),
|
||||
"connector_id": connector_id,
|
||||
},
|
||||
content=combined_document_string,
|
||||
embedding=doc_embedding,
|
||||
chunks=chunks,
|
||||
content_hash=content_hash,
|
||||
content="Pending...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation - safe for async
|
||||
status=DocumentStatus.pending(), # Pending until processing starts
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
documents_indexed += 1
|
||||
new_documents_created = True
|
||||
|
||||
# Batch commit every 10 documents
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed} Discord messages processed so far"
|
||||
)
|
||||
await session.commit()
|
||||
messages_to_process.append(
|
||||
{
|
||||
"document": document,
|
||||
"is_new": True,
|
||||
"combined_document_string": combined_document_string,
|
||||
"content_hash": content_hash,
|
||||
"guild_name": guild_name,
|
||||
"guild_id": guild_id,
|
||||
"channel_name": channel_name,
|
||||
"channel_id": channel_id,
|
||||
"message_id": msg_id,
|
||||
"message_timestamp": msg_timestamp,
|
||||
"message_user_name": msg_user_name,
|
||||
}
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Successfully indexed channel {guild_name}#{channel_name} with {len(formatted_messages)} messages"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error processing guild {guild_name}: {e!s}", exc_info=True
|
||||
)
|
||||
skipped_channels.append(f"{guild_name} (processing error)")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error processing guild {guild_name}: {e!s}", exc_info=True
|
||||
)
|
||||
skipped_channels.append(f"{guild_name} (processing error)")
|
||||
documents_skipped += 1
|
||||
continue
|
||||
finally:
|
||||
await discord_client.close_bot()
|
||||
|
||||
# Update last_indexed_at only if we indexed at least one
|
||||
if documents_indexed > 0:
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info(
|
||||
f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each document one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(messages_to_process)} documents")
|
||||
|
||||
for item in messages_to_process:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
document = item["document"]
|
||||
try:
|
||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
# Heavy processing (embeddings, chunks)
|
||||
chunks = await create_document_chunks(item["combined_document_string"])
|
||||
doc_embedding = config.embedding_model_instance.embed(
|
||||
item["combined_document_string"]
|
||||
)
|
||||
|
||||
# Update document to READY with actual content
|
||||
document.title = f"{item['guild_name']}#{item['channel_name']}"
|
||||
document.content = item["combined_document_string"]
|
||||
document.content_hash = item["content_hash"]
|
||||
document.embedding = doc_embedding
|
||||
document.document_metadata = {
|
||||
"guild_name": item["guild_name"],
|
||||
"guild_id": item["guild_id"],
|
||||
"channel_name": item["channel_name"],
|
||||
"channel_id": item["channel_id"],
|
||||
"message_id": item["message_id"],
|
||||
"message_timestamp": item["message_timestamp"],
|
||||
"message_user_name": item["message_user_name"],
|
||||
"indexed_at": datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
documents_indexed += 1
|
||||
|
||||
# Batch commit every 10 documents (for ready status updates)
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed} Discord messages processed so far"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing Discord message: {e!s}", exc_info=True)
|
||||
# Mark document as failed with reason (visible in UI)
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(e))
|
||||
document.updated_at = get_current_timestamp()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
|
||||
# Final commit for any remaining documents not yet committed in batches
|
||||
logger.info(
|
||||
f"Final commit: Total {documents_indexed} Discord messages processed"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# Prepare result message
|
||||
result_message = None
|
||||
if skipped_channels:
|
||||
result_message = (
|
||||
f"Processed {documents_indexed} messages. Skipped {len(skipped_channels)} channels: "
|
||||
+ ", ".join(skipped_channels)
|
||||
try:
|
||||
await session.commit()
|
||||
logger.info(
|
||||
"Successfully committed all Discord document changes to database"
|
||||
)
|
||||
else:
|
||||
result_message = f"Processed {documents_indexed} messages."
|
||||
except Exception as e:
|
||||
# Handle any remaining integrity errors gracefully (race conditions, etc.)
|
||||
if (
|
||||
"duplicate key value violates unique constraint" in str(e).lower()
|
||||
or "uniqueviolationerror" in str(e).lower()
|
||||
):
|
||||
logger.warning(
|
||||
f"Duplicate content_hash detected during final commit. "
|
||||
f"Rolling back and continuing. Error: {e!s}"
|
||||
)
|
||||
await session.rollback()
|
||||
else:
|
||||
raise
|
||||
|
||||
# Build warning message if there were issues
|
||||
warning_parts = []
|
||||
if duplicate_content_count > 0:
|
||||
warning_parts.append(f"{duplicate_content_count} duplicate")
|
||||
if documents_failed > 0:
|
||||
warning_parts.append(f"{documents_failed} failed")
|
||||
if skipped_channels:
|
||||
warning_parts.append(f"{len(skipped_channels)} channels skipped")
|
||||
warning_message = ", ".join(warning_parts) if warning_parts else None
|
||||
|
||||
# Log success
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Successfully completed Discord indexing for connector {connector_id}",
|
||||
{
|
||||
"messages_processed": documents_indexed,
|
||||
"documents_indexed": documents_indexed,
|
||||
"documents_skipped": documents_skipped,
|
||||
"documents_failed": documents_failed,
|
||||
"duplicate_content_count": duplicate_content_count,
|
||||
"skipped_channels_count": len(skipped_channels),
|
||||
"guilds_processed": len(guilds),
|
||||
"result_message": result_message,
|
||||
"guild_id": guild_id,
|
||||
"guild_name": guild_name,
|
||||
},
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Discord indexing completed: {documents_indexed} new messages, {documents_skipped} skipped"
|
||||
f"Discord indexing completed for guild {guild_name}: {documents_indexed} ready, {documents_skipped} skipped, "
|
||||
f"{documents_failed} failed ({duplicate_content_count} duplicate content)"
|
||||
)
|
||||
return (
|
||||
documents_indexed,
|
||||
None,
|
||||
) # Return None on success (result_message is for logging only)
|
||||
return documents_indexed, warning_message
|
||||
|
||||
except SQLAlchemyError as db_error:
|
||||
await session.rollback()
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
"""
|
||||
Elasticsearch indexer for SurfSense
|
||||
|
||||
Implements 2-phase document status updates for real-time UI feedback:
|
||||
- Phase 1: Collect all documents and create pending documents (visible in UI immediately)
|
||||
- Phase 2: Process each document: pending → processing → ready/failed
|
||||
"""
|
||||
|
||||
import json
|
||||
|
|
@ -13,7 +17,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
from sqlalchemy.future import select
|
||||
|
||||
from app.connectors.elasticsearch_connector import ElasticsearchConnector
|
||||
from app.db import Document, DocumentType, SearchSourceConnector
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnector
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
|
|
@ -25,6 +29,7 @@ from .base import (
|
|||
check_document_by_unique_identifier,
|
||||
check_duplicate_document_by_hash,
|
||||
get_current_timestamp,
|
||||
safe_set_chunks,
|
||||
)
|
||||
|
||||
# Type hint for heartbeat callback
|
||||
|
|
@ -164,6 +169,8 @@ async def index_elasticsearch_documents(
|
|||
)
|
||||
|
||||
documents_processed = 0
|
||||
documents_skipped = 0
|
||||
documents_failed = 0
|
||||
|
||||
# Heartbeat tracking - update notification periodically to prevent appearing stuck
|
||||
last_heartbeat_time = time.time()
|
||||
|
|
@ -178,23 +185,22 @@ async def index_elasticsearch_documents(
|
|||
"max_documents": max_documents,
|
||||
},
|
||||
)
|
||||
# Use scroll search for large result sets
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 1: Collect all documents from Elasticsearch and create pending documents
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
# =======================================================================
|
||||
docs_to_process = [] # List of dicts with document and ES data
|
||||
new_documents_created = False
|
||||
hits_collected = 0
|
||||
|
||||
async for hit in es_connector.scroll_search(
|
||||
index=index_name,
|
||||
query=query,
|
||||
size=min(max_documents, 100), # Scroll in batches
|
||||
fields=config.get("ELASTICSEARCH_FIELDS"),
|
||||
):
|
||||
# Check if it's time for a heartbeat update
|
||||
if (
|
||||
on_heartbeat_callback
|
||||
and (time.time() - last_heartbeat_time)
|
||||
>= HEARTBEAT_INTERVAL_SECONDS
|
||||
):
|
||||
await on_heartbeat_callback(documents_processed)
|
||||
last_heartbeat_time = time.time()
|
||||
|
||||
if documents_processed >= max_documents:
|
||||
if hits_collected >= max_documents:
|
||||
break
|
||||
|
||||
try:
|
||||
|
|
@ -220,26 +226,12 @@ async def index_elasticsearch_documents(
|
|||
|
||||
if not content.strip():
|
||||
logger.warning(f"Skipping document {doc_id} - no content found")
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Create content hash
|
||||
content_hash = generate_content_hash(content, search_space_id)
|
||||
|
||||
# Build metadata
|
||||
metadata = {
|
||||
"elasticsearch_id": doc_id,
|
||||
"elasticsearch_index": hit.get("_index", index_name),
|
||||
"elasticsearch_score": hit.get("_score"),
|
||||
"indexed_at": datetime.now().isoformat(),
|
||||
"source": "ELASTICSEARCH_CONNECTOR",
|
||||
}
|
||||
|
||||
# Add any additional metadata fields specified in config
|
||||
if "ELASTICSEARCH_METADATA_FIELDS" in config:
|
||||
for field in config["ELASTICSEARCH_METADATA_FIELDS"]:
|
||||
if field in source:
|
||||
metadata[f"es_{field}"] = source[field]
|
||||
|
||||
# Build source-unique identifier and hash (prefer source id dedupe)
|
||||
source_identifier = f"{hit.get('_index', index_name)}:{doc_id}"
|
||||
unique_identifier_hash = generate_unique_identifier_hash(
|
||||
|
|
@ -258,98 +250,223 @@ async def index_elasticsearch_documents(
|
|||
)
|
||||
|
||||
if existing_doc:
|
||||
# If content is unchanged, skip. Otherwise update the existing document.
|
||||
# If content is unchanged, skip. Otherwise queue for update.
|
||||
if existing_doc.content_hash == content_hash:
|
||||
# Ensure status is ready (might have been stuck in processing/pending)
|
||||
if not DocumentStatus.is_state(
|
||||
existing_doc.status, DocumentStatus.READY
|
||||
):
|
||||
existing_doc.status = DocumentStatus.ready()
|
||||
logger.info(
|
||||
f"Skipping ES doc {doc_id} — already indexed (doc id {existing_doc.id})"
|
||||
)
|
||||
continue
|
||||
else:
|
||||
logger.info(
|
||||
f"Updating existing document {existing_doc.id} for ES doc {doc_id}"
|
||||
)
|
||||
existing_doc.title = title
|
||||
existing_doc.content = content
|
||||
existing_doc.content_hash = content_hash
|
||||
existing_doc.document_metadata = metadata
|
||||
existing_doc.unique_identifier_hash = unique_identifier_hash
|
||||
chunks = await create_document_chunks(content)
|
||||
existing_doc.chunks = chunks
|
||||
existing_doc.updated_at = get_current_timestamp()
|
||||
await session.flush()
|
||||
documents_processed += 1
|
||||
if documents_processed % 10 == 0:
|
||||
await session.commit()
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Create document
|
||||
# Queue existing document for update (will be set to processing in Phase 2)
|
||||
docs_to_process.append(
|
||||
{
|
||||
"document": existing_doc,
|
||||
"is_new": False,
|
||||
"doc_id": doc_id,
|
||||
"title": title,
|
||||
"content": content,
|
||||
"content_hash": content_hash,
|
||||
"unique_identifier_hash": unique_identifier_hash,
|
||||
"hit": hit,
|
||||
"source": source,
|
||||
}
|
||||
)
|
||||
hits_collected += 1
|
||||
continue
|
||||
|
||||
# Build metadata for new document
|
||||
metadata = {
|
||||
"elasticsearch_id": doc_id,
|
||||
"elasticsearch_index": hit.get("_index", index_name),
|
||||
"elasticsearch_score": hit.get("_score"),
|
||||
"source": "ELASTICSEARCH_CONNECTOR",
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
|
||||
# Add any additional metadata fields specified in config
|
||||
if "ELASTICSEARCH_METADATA_FIELDS" in config:
|
||||
for field in config["ELASTICSEARCH_METADATA_FIELDS"]:
|
||||
if field in source:
|
||||
metadata[f"es_{field}"] = source[field]
|
||||
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
document = Document(
|
||||
title=title,
|
||||
content=content,
|
||||
content_hash=content_hash,
|
||||
content="Pending...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
document_type=DocumentType.ELASTICSEARCH_CONNECTOR,
|
||||
document_metadata=metadata,
|
||||
search_space_id=search_space_id,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation - safe for async
|
||||
status=DocumentStatus.pending(), # Pending until processing starts
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
# Create chunks and attach to document (persist via relationship)
|
||||
chunks = await create_document_chunks(content)
|
||||
document.chunks = chunks
|
||||
session.add(document)
|
||||
await session.flush()
|
||||
new_documents_created = True
|
||||
|
||||
docs_to_process.append(
|
||||
{
|
||||
"document": document,
|
||||
"is_new": True,
|
||||
"doc_id": doc_id,
|
||||
"title": title,
|
||||
"content": content,
|
||||
"content_hash": content_hash,
|
||||
"unique_identifier_hash": unique_identifier_hash,
|
||||
"hit": hit,
|
||||
"source": source,
|
||||
}
|
||||
)
|
||||
hits_collected += 1
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Phase 1 for ES doc: {e!s}", exc_info=True)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info(
|
||||
f"Phase 1: Committing {len([d for d in docs_to_process if d['is_new']])} pending documents"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each document one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(docs_to_process)} documents")
|
||||
|
||||
for item in docs_to_process:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_processed)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
document = item["document"]
|
||||
try:
|
||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
# Build metadata
|
||||
metadata = {
|
||||
"elasticsearch_id": item["doc_id"],
|
||||
"elasticsearch_index": item["hit"].get("_index", index_name),
|
||||
"elasticsearch_score": item["hit"].get("_score"),
|
||||
"indexed_at": datetime.now().isoformat(),
|
||||
"source": "ELASTICSEARCH_CONNECTOR",
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
|
||||
# Add any additional metadata fields specified in config
|
||||
if "ELASTICSEARCH_METADATA_FIELDS" in config:
|
||||
for field in config["ELASTICSEARCH_METADATA_FIELDS"]:
|
||||
if field in item["source"]:
|
||||
metadata[f"es_{field}"] = item["source"][field]
|
||||
|
||||
# Create chunks
|
||||
chunks = await create_document_chunks(item["content"])
|
||||
|
||||
# Update document to READY with actual content
|
||||
document.title = item["title"]
|
||||
document.content = item["content"]
|
||||
document.content_hash = item["content_hash"]
|
||||
document.unique_identifier_hash = item["unique_identifier_hash"]
|
||||
document.document_metadata = metadata
|
||||
safe_set_chunks(document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
documents_processed += 1
|
||||
|
||||
# Batch commit every 10 documents (for ready status updates)
|
||||
if documents_processed % 10 == 0:
|
||||
logger.info(
|
||||
f"Processed {documents_processed} Elasticsearch documents"
|
||||
f"Committing batch: {documents_processed} Elasticsearch documents processed so far"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
except Exception as e:
|
||||
msg = f"Error processing Elasticsearch document {hit.get('_id', 'unknown')}: {e}"
|
||||
msg = f"Error processing Elasticsearch document {item.get('doc_id', 'unknown')}: {e}"
|
||||
logger.error(msg)
|
||||
await task_logger.log_task_failure(
|
||||
log_entry,
|
||||
"Document processing error",
|
||||
msg,
|
||||
{
|
||||
"document_id": hit.get("_id", "unknown"),
|
||||
"error_type": type(e).__name__,
|
||||
},
|
||||
)
|
||||
# Mark document as failed with reason (visible in UI)
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(e))
|
||||
document.updated_at = get_current_timestamp()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# Final commit
|
||||
await session.commit()
|
||||
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
|
||||
# This ensures the UI shows "Last indexed" instead of "Never indexed"
|
||||
if update_last_indexed:
|
||||
connector.last_indexed_at = (
|
||||
datetime.now(UTC).isoformat().replace("+00:00", "Z")
|
||||
)
|
||||
|
||||
# Final commit for any remaining documents not yet committed in batches
|
||||
logger.info(
|
||||
f"Final commit: Total {documents_processed} Elasticsearch documents processed"
|
||||
)
|
||||
try:
|
||||
await session.commit()
|
||||
logger.info(
|
||||
"Successfully committed all Elasticsearch document changes to database"
|
||||
)
|
||||
except Exception as e:
|
||||
# Handle any remaining integrity errors gracefully (race conditions, etc.)
|
||||
if (
|
||||
"duplicate key value violates unique constraint" in str(e).lower()
|
||||
or "uniqueviolationerror" in str(e).lower()
|
||||
):
|
||||
logger.warning(
|
||||
f"Duplicate content_hash detected during final commit. "
|
||||
f"This may occur if the same document was indexed by multiple connectors. "
|
||||
f"Rolling back and continuing. Error: {e!s}"
|
||||
)
|
||||
await session.rollback()
|
||||
# Don't fail the entire task - some documents may have been successfully indexed
|
||||
else:
|
||||
raise
|
||||
|
||||
# Build warning message if there were issues
|
||||
warning_parts = []
|
||||
if documents_failed > 0:
|
||||
warning_parts.append(f"{documents_failed} failed")
|
||||
warning_message = ", ".join(warning_parts) if warning_parts else None
|
||||
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Successfully indexed {documents_processed} documents from Elasticsearch",
|
||||
{"documents_indexed": documents_processed, "index": index_name},
|
||||
{
|
||||
"documents_indexed": documents_processed,
|
||||
"documents_skipped": documents_skipped,
|
||||
"documents_failed": documents_failed,
|
||||
"index": index_name,
|
||||
},
|
||||
)
|
||||
logger.info(
|
||||
f"Successfully indexed {documents_processed} documents from Elasticsearch"
|
||||
f"Elasticsearch indexing completed: {documents_processed} ready, "
|
||||
f"{documents_skipped} skipped, {documents_failed} failed"
|
||||
)
|
||||
|
||||
# Update last indexed timestamp if requested
|
||||
if update_last_indexed and documents_processed > 0:
|
||||
# connector.last_indexed_at = datetime.now()
|
||||
connector.last_indexed_at = (
|
||||
datetime.now(UTC).isoformat().replace("+00:00", "Z")
|
||||
)
|
||||
await session.commit()
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
"Updated connector.last_indexed_at",
|
||||
{"last_indexed_at": connector.last_indexed_at},
|
||||
)
|
||||
|
||||
return documents_processed, None
|
||||
return documents_processed, warning_message
|
||||
|
||||
finally:
|
||||
# Clean up Elasticsearch connection
|
||||
|
|
|
|||
|
|
@ -3,6 +3,10 @@ GitHub connector indexer using gitingest.
|
|||
|
||||
This indexer processes entire repository digests in one pass, dramatically
|
||||
reducing LLM API calls compared to the previous file-by-file approach.
|
||||
|
||||
Implements 2-phase document status updates for real-time UI feedback:
|
||||
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
|
||||
- Phase 2: Process each document: pending → processing → ready/failed
|
||||
"""
|
||||
|
||||
import time
|
||||
|
|
@ -13,8 +17,8 @@ from sqlalchemy.exc import SQLAlchemyError
|
|||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.connectors.github_connector import GitHubConnector, RepositoryDigest
|
||||
from app.db import Document, DocumentType, SearchSourceConnectorType
|
||||
from app.connectors.github_connector import GitHubConnector
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
|
|
@ -30,6 +34,8 @@ from .base import (
|
|||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
safe_set_chunks,
|
||||
update_connector_last_indexed,
|
||||
)
|
||||
|
||||
# Type hint for heartbeat callback
|
||||
|
|
@ -164,7 +170,7 @@ async def index_github_repos(
|
|||
)
|
||||
return 0, f"Failed to initialize GitHub client: {e!s}"
|
||||
|
||||
# 4. Process each repository with gitingest
|
||||
# 4. Process each repository with gitingest using 2-phase approach
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Starting gitingest processing for {len(repo_full_names_to_index)} repositories",
|
||||
|
|
@ -181,24 +187,25 @@ async def index_github_repos(
|
|||
# Heartbeat tracking - update notification periodically to prevent appearing stuck
|
||||
last_heartbeat_time = time.time()
|
||||
documents_indexed = 0
|
||||
documents_skipped = 0
|
||||
documents_failed = 0
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 1: Analyze all repos and create pending documents
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
# =======================================================================
|
||||
repos_to_process = [] # List of dicts with document and digest data
|
||||
new_documents_created = False
|
||||
|
||||
for repo_full_name in repo_full_names_to_index:
|
||||
# Check if it's time for a heartbeat update
|
||||
if (
|
||||
on_heartbeat_callback
|
||||
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
|
||||
):
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = time.time()
|
||||
if not repo_full_name or not isinstance(repo_full_name, str):
|
||||
logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
|
||||
continue
|
||||
|
||||
logger.info(f"Ingesting repository: {repo_full_name}")
|
||||
|
||||
try:
|
||||
logger.info(f"Phase 1: Analyzing repository: {repo_full_name}")
|
||||
|
||||
# Run gitingest via subprocess (isolated from event loop)
|
||||
# Using to_thread to not block the async database operations
|
||||
import asyncio
|
||||
|
||||
digest = await asyncio.to_thread(
|
||||
|
|
@ -212,30 +219,266 @@ async def index_github_repos(
|
|||
errors.append(f"No digest for {repo_full_name}")
|
||||
continue
|
||||
|
||||
# Process the digest and create documents
|
||||
docs_created = await _process_repository_digest(
|
||||
session=session,
|
||||
digest=digest,
|
||||
search_space_id=search_space_id,
|
||||
user_id=user_id,
|
||||
task_logger=task_logger,
|
||||
log_entry=log_entry,
|
||||
connector_id=connector_id,
|
||||
# Generate unique identifier based on repo name
|
||||
unique_identifier_hash = generate_unique_identifier_hash(
|
||||
DocumentType.GITHUB_CONNECTOR, repo_full_name, search_space_id
|
||||
)
|
||||
|
||||
documents_processed += docs_created
|
||||
logger.info(
|
||||
f"Created {docs_created} documents from repository: {repo_full_name}"
|
||||
# Generate content hash from digest
|
||||
full_content = digest.full_digest
|
||||
content_hash = generate_content_hash(full_content, search_space_id)
|
||||
|
||||
# Check if document with this unique identifier already exists
|
||||
existing_document = await check_document_by_unique_identifier(
|
||||
session, unique_identifier_hash
|
||||
)
|
||||
|
||||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
# Ensure status is ready (might have been stuck in processing/pending)
|
||||
if not DocumentStatus.is_state(
|
||||
existing_document.status, DocumentStatus.READY
|
||||
):
|
||||
existing_document.status = DocumentStatus.ready()
|
||||
logger.info(f"Repository {repo_full_name} unchanged. Skipping.")
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Queue existing document for update (will be set to processing in Phase 2)
|
||||
logger.info(
|
||||
f"Content changed for repository {repo_full_name}. Queuing for update."
|
||||
)
|
||||
repos_to_process.append(
|
||||
{
|
||||
"document": existing_document,
|
||||
"is_new": False,
|
||||
"digest": digest,
|
||||
"content_hash": content_hash,
|
||||
"repo_full_name": repo_full_name,
|
||||
"unique_identifier_hash": unique_identifier_hash,
|
||||
}
|
||||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
f"Repository {repo_full_name} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=repo_full_name,
|
||||
document_type=DocumentType.GITHUB_CONNECTOR,
|
||||
document_metadata={
|
||||
"repository_full_name": repo_full_name,
|
||||
"url": f"https://github.com/{repo_full_name}",
|
||||
"branch": digest.branch,
|
||||
"ingestion_method": "gitingest",
|
||||
"connector_id": connector_id,
|
||||
},
|
||||
content="Pending...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation - safe for async
|
||||
status=DocumentStatus.pending(), # Pending until processing starts
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
session.add(document)
|
||||
new_documents_created = True
|
||||
|
||||
repos_to_process.append(
|
||||
{
|
||||
"document": document,
|
||||
"is_new": True,
|
||||
"digest": digest,
|
||||
"content_hash": content_hash,
|
||||
"repo_full_name": repo_full_name,
|
||||
"unique_identifier_hash": unique_identifier_hash,
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as repo_err:
|
||||
logger.error(
|
||||
f"Failed to process repository {repo_full_name}: {repo_err}"
|
||||
f"Error in Phase 1 for repository {repo_full_name}: {repo_err}",
|
||||
exc_info=True,
|
||||
)
|
||||
errors.append(f"Phase 1 error for {repo_full_name}: {repo_err}")
|
||||
documents_failed += 1
|
||||
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info(
|
||||
f"Phase 1: Committing {len([r for r in repos_to_process if r['is_new']])} pending documents"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each document one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(repos_to_process)} documents")
|
||||
|
||||
for item in repos_to_process:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
document = item["document"]
|
||||
digest = item["digest"]
|
||||
repo_full_name = item["repo_full_name"]
|
||||
|
||||
try:
|
||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
# Heavy processing (LLM, embeddings, chunks)
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
document_metadata_for_summary = {
|
||||
"repository": repo_full_name,
|
||||
"document_type": "GitHub Repository",
|
||||
"connector_type": "GitHub",
|
||||
"ingestion_method": "gitingest",
|
||||
"file_tree": digest.tree[:2000]
|
||||
if len(digest.tree) > 2000
|
||||
else digest.tree,
|
||||
"estimated_tokens": digest.estimated_tokens,
|
||||
}
|
||||
|
||||
if user_llm:
|
||||
# Prepare content for summarization
|
||||
summary_content = digest.full_digest
|
||||
if len(summary_content) > MAX_DIGEST_CHARS:
|
||||
summary_content = (
|
||||
f"# Repository: {repo_full_name}\n\n"
|
||||
f"## File Structure\n\n{digest.tree}\n\n"
|
||||
f"## File Contents (truncated)\n\n{digest.content[: MAX_DIGEST_CHARS - len(digest.tree) - 200]}..."
|
||||
)
|
||||
|
||||
summary_text, summary_embedding = await generate_document_summary(
|
||||
summary_content, user_llm, document_metadata_for_summary
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_text = (
|
||||
f"# GitHub Repository: {repo_full_name}\n\n"
|
||||
f"## Summary\n{digest.summary}\n\n"
|
||||
f"## File Structure\n{digest.tree[:3000]}"
|
||||
)
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_text
|
||||
)
|
||||
|
||||
# Chunk the full digest content for granular search
|
||||
try:
|
||||
chunks_data = await create_document_chunks(digest.content)
|
||||
except Exception as chunk_err:
|
||||
logger.error(
|
||||
f"Failed to chunk repository {repo_full_name}: {chunk_err}"
|
||||
)
|
||||
chunks_data = await _simple_chunk_content(digest.content)
|
||||
|
||||
# Update document to READY with actual content
|
||||
doc_metadata = {
|
||||
"repository_full_name": repo_full_name,
|
||||
"url": f"https://github.com/{repo_full_name}",
|
||||
"branch": digest.branch,
|
||||
"ingestion_method": "gitingest",
|
||||
"file_tree": digest.tree,
|
||||
"gitingest_summary": digest.summary,
|
||||
"estimated_tokens": digest.estimated_tokens,
|
||||
"connector_id": connector_id,
|
||||
"indexed_at": datetime.now(UTC).isoformat(),
|
||||
}
|
||||
|
||||
document.title = repo_full_name
|
||||
document.content = summary_text
|
||||
document.content_hash = item["content_hash"]
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = doc_metadata
|
||||
safe_set_chunks(document, chunks_data)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
documents_processed += 1
|
||||
documents_indexed += 1
|
||||
|
||||
logger.info(
|
||||
f"Created document for repository {repo_full_name} "
|
||||
f"with {len(chunks_data)} chunks"
|
||||
)
|
||||
|
||||
# Batch commit every 5 documents (repositories are large)
|
||||
if documents_indexed % 5 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed} GitHub repos processed so far"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
except Exception as repo_err:
|
||||
logger.error(
|
||||
f"Error processing repository {repo_full_name}: {repo_err}",
|
||||
exc_info=True,
|
||||
)
|
||||
# Mark document as failed with reason (visible in UI)
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(repo_err))
|
||||
document.updated_at = get_current_timestamp()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
errors.append(f"Failed processing {repo_full_name}: {repo_err}")
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
|
||||
# Final commit
|
||||
await session.commit()
|
||||
logger.info(
|
||||
f"Final commit: Total {documents_processed} GitHub repositories processed"
|
||||
)
|
||||
try:
|
||||
await session.commit()
|
||||
logger.info(
|
||||
"Successfully committed all GitHub document changes to database"
|
||||
)
|
||||
except Exception as e:
|
||||
if (
|
||||
"duplicate key value violates unique constraint" in str(e).lower()
|
||||
or "uniqueviolationerror" in str(e).lower()
|
||||
):
|
||||
logger.warning(
|
||||
f"Duplicate content_hash detected during final commit. "
|
||||
f"Rolling back and continuing. Error: {e!s}"
|
||||
)
|
||||
await session.rollback()
|
||||
else:
|
||||
raise
|
||||
|
||||
logger.info(
|
||||
f"Finished GitHub indexing for connector {connector_id}. "
|
||||
f"Created {documents_processed} documents."
|
||||
|
|
@ -247,6 +490,8 @@ async def index_github_repos(
|
|||
f"Successfully completed GitHub indexing for connector {connector_id}",
|
||||
{
|
||||
"documents_processed": documents_processed,
|
||||
"documents_skipped": documents_skipped,
|
||||
"documents_failed": documents_failed,
|
||||
"errors_count": len(errors),
|
||||
"repo_count": len(repo_full_names_to_index),
|
||||
"method": "gitingest",
|
||||
|
|
@ -286,163 +531,6 @@ async def index_github_repos(
|
|||
return documents_processed, error_message
|
||||
|
||||
|
||||
async def _process_repository_digest(
|
||||
session: AsyncSession,
|
||||
digest: RepositoryDigest,
|
||||
search_space_id: int,
|
||||
user_id: str,
|
||||
task_logger: TaskLoggingService,
|
||||
log_entry,
|
||||
connector_id: int,
|
||||
) -> int:
|
||||
"""
|
||||
Process a repository digest and create documents.
|
||||
|
||||
For each repository, we create:
|
||||
1. One main document with the repository summary
|
||||
2. Chunks from the full digest content for granular search
|
||||
|
||||
Args:
|
||||
session: Database session
|
||||
digest: The repository digest from gitingest
|
||||
search_space_id: ID of the search space
|
||||
user_id: ID of the user
|
||||
task_logger: Task logging service
|
||||
log_entry: Current log entry
|
||||
|
||||
Returns:
|
||||
Number of documents created
|
||||
"""
|
||||
repo_full_name = digest.repo_full_name
|
||||
documents_created = 0
|
||||
|
||||
# Generate unique identifier based on repo name and content hash
|
||||
# This allows updates when repo content changes
|
||||
full_content = digest.full_digest
|
||||
content_hash = generate_content_hash(full_content, search_space_id)
|
||||
|
||||
# Use repo name as the unique identifier (one document per repo)
|
||||
unique_identifier_hash = generate_unique_identifier_hash(
|
||||
DocumentType.GITHUB_CONNECTOR, repo_full_name, search_space_id
|
||||
)
|
||||
|
||||
# Check if document with this unique identifier already exists
|
||||
existing_document = await check_document_by_unique_identifier(
|
||||
session, unique_identifier_hash
|
||||
)
|
||||
|
||||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
logger.info(f"Repository {repo_full_name} unchanged. Skipping.")
|
||||
return 0
|
||||
else:
|
||||
logger.info(
|
||||
f"Content changed for repository {repo_full_name}. Updating document."
|
||||
)
|
||||
# Delete existing document to replace with new one
|
||||
await session.delete(existing_document)
|
||||
await session.flush()
|
||||
else:
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
f"Repository {repo_full_name} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
return 0
|
||||
|
||||
# Generate summary using LLM (ONE call per repository!)
|
||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||
|
||||
document_metadata = {
|
||||
"repository": repo_full_name,
|
||||
"document_type": "GitHub Repository",
|
||||
"connector_type": "GitHub",
|
||||
"ingestion_method": "gitingest",
|
||||
"file_tree": digest.tree[:2000] if len(digest.tree) > 2000 else digest.tree,
|
||||
"estimated_tokens": digest.estimated_tokens,
|
||||
}
|
||||
|
||||
if user_llm:
|
||||
# Prepare content for summarization
|
||||
# Include tree structure and truncated content if too large
|
||||
summary_content = digest.full_digest
|
||||
if len(summary_content) > MAX_DIGEST_CHARS:
|
||||
# Truncate but keep the tree and beginning of content
|
||||
summary_content = (
|
||||
f"# Repository: {repo_full_name}\n\n"
|
||||
f"## File Structure\n\n{digest.tree}\n\n"
|
||||
f"## File Contents (truncated)\n\n{digest.content[: MAX_DIGEST_CHARS - len(digest.tree) - 200]}..."
|
||||
)
|
||||
|
||||
summary_text, summary_embedding = await generate_document_summary(
|
||||
summary_content, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_text = (
|
||||
f"# GitHub Repository: {repo_full_name}\n\n"
|
||||
f"## Summary\n{digest.summary}\n\n"
|
||||
f"## File Structure\n{digest.tree[:3000]}"
|
||||
)
|
||||
summary_embedding = config.embedding_model_instance.embed(summary_text)
|
||||
|
||||
# Chunk the full digest content for granular search
|
||||
try:
|
||||
# Use the content (not the summary) for chunking
|
||||
# This preserves file-level granularity in search
|
||||
chunks_data = await create_document_chunks(digest.content)
|
||||
except Exception as chunk_err:
|
||||
logger.error(f"Failed to chunk repository {repo_full_name}: {chunk_err}")
|
||||
# Fall back to a simpler chunking approach
|
||||
chunks_data = await _simple_chunk_content(digest.content)
|
||||
|
||||
# Create the document
|
||||
doc_metadata = {
|
||||
"repository_full_name": repo_full_name,
|
||||
"url": f"https://github.com/{repo_full_name}",
|
||||
"branch": digest.branch,
|
||||
"ingestion_method": "gitingest",
|
||||
"file_tree": digest.tree,
|
||||
"gitingest_summary": digest.summary,
|
||||
"estimated_tokens": digest.estimated_tokens,
|
||||
"indexed_at": datetime.now(UTC).isoformat(),
|
||||
}
|
||||
|
||||
document = Document(
|
||||
title=f"GitHub Repository: {repo_full_name}",
|
||||
document_type=DocumentType.GITHUB_CONNECTOR,
|
||||
document_metadata=doc_metadata,
|
||||
content=summary_text,
|
||||
content_hash=content_hash,
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=summary_embedding,
|
||||
search_space_id=search_space_id,
|
||||
chunks=chunks_data,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
documents_created += 1
|
||||
|
||||
logger.info(
|
||||
f"Created document for repository {repo_full_name} "
|
||||
f"with {len(chunks_data)} chunks"
|
||||
)
|
||||
|
||||
return documents_created
|
||||
|
||||
|
||||
async def _simple_chunk_content(content: str, chunk_size: int = 4000) -> list:
|
||||
"""
|
||||
Simple fallback chunking when the regular chunker fails.
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
"""
|
||||
Google Calendar connector indexer.
|
||||
|
||||
Implements 2-phase document status updates for real-time UI feedback:
|
||||
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
|
||||
- Phase 2: Process each document: pending → processing → ready/failed
|
||||
"""
|
||||
|
||||
import time
|
||||
|
|
@ -11,7 +15,7 @@ from sqlalchemy.exc import SQLAlchemyError
|
|||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.connectors.google_calendar_connector import GoogleCalendarConnector
|
||||
from app.db import Document, DocumentType, SearchSourceConnectorType
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
|
|
@ -28,6 +32,7 @@ from .base import (
|
|||
get_current_timestamp,
|
||||
logger,
|
||||
parse_date_flexible,
|
||||
safe_set_chunks,
|
||||
update_connector_last_indexed,
|
||||
)
|
||||
|
||||
|
|
@ -305,7 +310,7 @@ async def index_google_calendar_events(
|
|||
|
||||
documents_indexed = 0
|
||||
documents_skipped = 0
|
||||
skipped_events = []
|
||||
documents_failed = 0 # Track events that failed processing
|
||||
duplicate_content_count = (
|
||||
0 # Track events skipped due to duplicate content_hash
|
||||
)
|
||||
|
|
@ -313,14 +318,14 @@ async def index_google_calendar_events(
|
|||
# Heartbeat tracking - update notification periodically to prevent appearing stuck
|
||||
last_heartbeat_time = time.time()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 1: Analyze all events, create pending documents
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
# =======================================================================
|
||||
events_to_process = [] # List of dicts with document and event data
|
||||
new_documents_created = False
|
||||
|
||||
for event in events:
|
||||
# Check if it's time for a heartbeat update
|
||||
if (
|
||||
on_heartbeat_callback
|
||||
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
|
||||
):
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = time.time()
|
||||
try:
|
||||
event_id = event.get("id")
|
||||
event_summary = event.get("summary", "No Title")
|
||||
|
|
@ -328,14 +333,12 @@ async def index_google_calendar_events(
|
|||
|
||||
if not event_id:
|
||||
logger.warning(f"Skipping event with missing ID: {event_summary}")
|
||||
skipped_events.append(f"{event_summary} (missing ID)")
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
event_markdown = calendar_client.format_event_to_markdown(event)
|
||||
if not event_markdown.strip():
|
||||
logger.warning(f"Skipping event with no content: {event_summary}")
|
||||
skipped_events.append(f"{event_summary} (no content)")
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
|
|
@ -362,82 +365,31 @@ async def index_google_calendar_events(
|
|||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
logger.info(
|
||||
f"Document for Google Calendar event {event_summary} unchanged. Skipping."
|
||||
)
|
||||
# Ensure status is ready (might have been stuck in processing/pending)
|
||||
if not DocumentStatus.is_state(
|
||||
existing_document.status, DocumentStatus.READY
|
||||
):
|
||||
existing_document.status = DocumentStatus.ready()
|
||||
documents_skipped += 1
|
||||
continue
|
||||
else:
|
||||
# Content has changed - update the existing document
|
||||
logger.info(
|
||||
f"Content changed for Google Calendar event {event_summary}. Updating document."
|
||||
)
|
||||
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
"event_id": event_id,
|
||||
"event_summary": event_summary,
|
||||
"calendar_id": calendar_id,
|
||||
"start_time": start_time,
|
||||
"end_time": end_time,
|
||||
"location": location or "No location",
|
||||
"document_type": "Google Calendar Event",
|
||||
"connector_type": "Google Calendar",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
event_markdown, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
summary_content = (
|
||||
f"Google Calendar Event: {event_summary}\n\n"
|
||||
)
|
||||
summary_content += f"Calendar: {calendar_id}\n"
|
||||
summary_content += f"Start: {start_time}\n"
|
||||
summary_content += f"End: {end_time}\n"
|
||||
if location:
|
||||
summary_content += f"Location: {location}\n"
|
||||
if description:
|
||||
desc_preview = description[:1000]
|
||||
if len(description) > 1000:
|
||||
desc_preview += "..."
|
||||
summary_content += f"Description: {desc_preview}\n"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(event_markdown)
|
||||
|
||||
# Update existing document
|
||||
existing_document.title = f"Calendar Event - {event_summary}"
|
||||
existing_document.content = summary_content
|
||||
existing_document.content_hash = content_hash
|
||||
existing_document.embedding = summary_embedding
|
||||
existing_document.document_metadata = {
|
||||
# Queue existing document for update (will be set to processing in Phase 2)
|
||||
events_to_process.append(
|
||||
{
|
||||
"document": existing_document,
|
||||
"is_new": False,
|
||||
"event_markdown": event_markdown,
|
||||
"content_hash": content_hash,
|
||||
"event_id": event_id,
|
||||
"event_summary": event_summary,
|
||||
"calendar_id": calendar_id,
|
||||
"start_time": start_time,
|
||||
"end_time": end_time,
|
||||
"location": location,
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"description": description,
|
||||
}
|
||||
existing_document.chunks = chunks
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
|
||||
documents_indexed += 1
|
||||
logger.info(
|
||||
f"Successfully updated Google Calendar event {event_summary}"
|
||||
)
|
||||
continue
|
||||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
|
|
@ -455,55 +407,12 @@ async def index_google_calendar_events(
|
|||
)
|
||||
duplicate_content_count += 1
|
||||
documents_skipped += 1
|
||||
skipped_events.append(
|
||||
f"{event_summary} (already indexed by another connector)"
|
||||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
"event_id": event_id,
|
||||
"event_summary": event_summary,
|
||||
"calendar_id": calendar_id,
|
||||
"start_time": start_time,
|
||||
"end_time": end_time,
|
||||
"location": location or "No location",
|
||||
"document_type": "Google Calendar Event",
|
||||
"connector_type": "Google Calendar",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
event_markdown, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_content = f"Google Calendar Event: {event_summary}\n\n"
|
||||
summary_content += f"Calendar: {calendar_id}\n"
|
||||
summary_content += f"Start: {start_time}\n"
|
||||
summary_content += f"End: {end_time}\n"
|
||||
if location:
|
||||
summary_content += f"Location: {location}\n"
|
||||
if description:
|
||||
desc_preview = description[:1000]
|
||||
if len(description) > 1000:
|
||||
desc_preview += "..."
|
||||
summary_content += f"Description: {desc_preview}\n"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
chunks = await create_document_chunks(event_markdown)
|
||||
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=f"Calendar Event - {event_summary}",
|
||||
title=event_summary,
|
||||
document_type=DocumentType.GOOGLE_CALENDAR_CONNECTOR,
|
||||
document_metadata={
|
||||
"event_id": event_id,
|
||||
|
|
@ -512,23 +421,133 @@ async def index_google_calendar_events(
|
|||
"start_time": start_time,
|
||||
"end_time": end_time,
|
||||
"location": location,
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
},
|
||||
content=summary_content,
|
||||
content_hash=content_hash,
|
||||
content="Pending...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation - safe for async
|
||||
status=DocumentStatus.pending(), # Pending until processing starts
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
documents_indexed += 1
|
||||
logger.info(f"Successfully indexed new event {event_summary}")
|
||||
new_documents_created = True
|
||||
|
||||
# Batch commit every 10 documents
|
||||
events_to_process.append(
|
||||
{
|
||||
"document": document,
|
||||
"is_new": True,
|
||||
"event_markdown": event_markdown,
|
||||
"content_hash": content_hash,
|
||||
"event_id": event_id,
|
||||
"event_summary": event_summary,
|
||||
"calendar_id": calendar_id,
|
||||
"start_time": start_time,
|
||||
"end_time": end_time,
|
||||
"location": location,
|
||||
"description": description,
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Phase 1 for event: {e!s}", exc_info=True)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info(
|
||||
f"Phase 1: Committing {len([e for e in events_to_process if e['is_new']])} pending documents"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each document one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(events_to_process)} documents")
|
||||
|
||||
for item in events_to_process:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
document = item["document"]
|
||||
try:
|
||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
# Heavy processing (LLM, embeddings, chunks)
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata_for_summary = {
|
||||
"event_id": item["event_id"],
|
||||
"event_summary": item["event_summary"],
|
||||
"calendar_id": item["calendar_id"],
|
||||
"start_time": item["start_time"],
|
||||
"end_time": item["end_time"],
|
||||
"location": item["location"] or "No location",
|
||||
"document_type": "Google Calendar Event",
|
||||
"connector_type": "Google Calendar",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
item["event_markdown"], user_llm, document_metadata_for_summary
|
||||
)
|
||||
else:
|
||||
summary_content = (
|
||||
f"Google Calendar Event: {item['event_summary']}\n\n"
|
||||
)
|
||||
summary_content += f"Calendar: {item['calendar_id']}\n"
|
||||
summary_content += f"Start: {item['start_time']}\n"
|
||||
summary_content += f"End: {item['end_time']}\n"
|
||||
if item["location"]:
|
||||
summary_content += f"Location: {item['location']}\n"
|
||||
if item["description"]:
|
||||
desc_preview = item["description"][:1000]
|
||||
if len(item["description"]) > 1000:
|
||||
desc_preview += "..."
|
||||
summary_content += f"Description: {desc_preview}\n"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
chunks = await create_document_chunks(item["event_markdown"])
|
||||
|
||||
# Update document to READY with actual content
|
||||
document.title = item["event_summary"]
|
||||
document.content = summary_content
|
||||
document.content_hash = item["content_hash"]
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = {
|
||||
"event_id": item["event_id"],
|
||||
"event_summary": item["event_summary"],
|
||||
"calendar_id": item["calendar_id"],
|
||||
"start_time": item["start_time"],
|
||||
"end_time": item["end_time"],
|
||||
"location": item["location"],
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
documents_indexed += 1
|
||||
|
||||
# Batch commit every 10 documents (for ready status updates)
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed} Google Calendar events processed so far"
|
||||
|
|
@ -536,19 +555,20 @@ async def index_google_calendar_events(
|
|||
await session.commit()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error processing event {event.get('summary', 'Unknown')}: {e!s}",
|
||||
exc_info=True,
|
||||
)
|
||||
skipped_events.append(
|
||||
f"{event.get('summary', 'Unknown')} (processing error)"
|
||||
)
|
||||
documents_skipped += 1
|
||||
logger.error(f"Error processing Calendar event: {e!s}", exc_info=True)
|
||||
# Mark document as failed with reason (visible in UI)
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(e))
|
||||
document.updated_at = get_current_timestamp()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
total_processed = documents_indexed
|
||||
if total_processed > 0:
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
|
||||
# Final commit for any remaining documents not yet committed in batches
|
||||
logger.info(
|
||||
|
|
@ -556,6 +576,9 @@ async def index_google_calendar_events(
|
|||
)
|
||||
try:
|
||||
await session.commit()
|
||||
logger.info(
|
||||
"Successfully committed all Google Calendar document changes to database"
|
||||
)
|
||||
except Exception as e:
|
||||
# Handle any remaining integrity errors gracefully (race conditions, etc.)
|
||||
if (
|
||||
|
|
@ -572,10 +595,15 @@ async def index_google_calendar_events(
|
|||
else:
|
||||
raise
|
||||
|
||||
# Build warning message if duplicates were found
|
||||
warning_message = None
|
||||
# Build warning message if there were issues
|
||||
warning_parts = []
|
||||
if duplicate_content_count > 0:
|
||||
warning_message = f"{duplicate_content_count} skipped (duplicate)"
|
||||
warning_parts.append(f"{duplicate_content_count} duplicate")
|
||||
if documents_failed > 0:
|
||||
warning_parts.append(f"{documents_failed} failed")
|
||||
warning_message = ", ".join(warning_parts) if warning_parts else None
|
||||
|
||||
total_processed = documents_indexed
|
||||
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
|
|
@ -584,14 +612,15 @@ async def index_google_calendar_events(
|
|||
"events_processed": total_processed,
|
||||
"documents_indexed": documents_indexed,
|
||||
"documents_skipped": documents_skipped,
|
||||
"documents_failed": documents_failed,
|
||||
"duplicate_content_count": duplicate_content_count,
|
||||
"skipped_events_count": len(skipped_events),
|
||||
},
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped "
|
||||
f"({duplicate_content_count} due to duplicate content from other connectors)"
|
||||
f"Google Calendar indexing completed: {documents_indexed} ready, "
|
||||
f"{documents_skipped} skipped, {documents_failed} failed "
|
||||
f"({duplicate_content_count} duplicate content)"
|
||||
)
|
||||
return total_processed, warning_message
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,9 @@
|
|||
"""Google Drive indexer using Surfsense file processors."""
|
||||
"""Google Drive indexer using Surfsense file processors.
|
||||
|
||||
Implements 2-phase document status updates for real-time UI feedback:
|
||||
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
|
||||
- Phase 2: Process each document: pending → processing → ready/failed
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
|
|
@ -17,11 +22,12 @@ from app.connectors.google_drive import (
|
|||
get_files_in_folder,
|
||||
get_start_page_token,
|
||||
)
|
||||
from app.db import DocumentType, SearchSourceConnectorType
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.tasks.connector_indexers.base import (
|
||||
check_document_by_unique_identifier,
|
||||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
update_connector_last_indexed,
|
||||
)
|
||||
from app.utils.document_converters import generate_unique_identifier_hash
|
||||
|
|
@ -324,8 +330,29 @@ async def index_google_drive_single_file(
|
|||
display_name = file_name or file.get("name", "Unknown")
|
||||
logger.info(f"Indexing Google Drive file: {display_name} ({file_id})")
|
||||
|
||||
# Create pending document for status visibility
|
||||
pending_doc, should_skip = await _create_pending_document_for_file(
|
||||
session=session,
|
||||
file=file,
|
||||
connector_id=connector_id,
|
||||
search_space_id=search_space_id,
|
||||
user_id=user_id,
|
||||
)
|
||||
|
||||
if should_skip:
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"File {display_name} is unchanged or not indexable",
|
||||
{"status": "skipped"},
|
||||
)
|
||||
return 0, None
|
||||
|
||||
# Commit pending document so it appears in UI
|
||||
if pending_doc and pending_doc.id is None:
|
||||
await session.commit()
|
||||
|
||||
# Process the file
|
||||
indexed, skipped = await _process_single_file(
|
||||
indexed, skipped, failed = await _process_single_file(
|
||||
drive_client=drive_client,
|
||||
session=session,
|
||||
file=file,
|
||||
|
|
@ -334,6 +361,7 @@ async def index_google_drive_single_file(
|
|||
user_id=user_id,
|
||||
task_logger=task_logger,
|
||||
log_entry=log_entry,
|
||||
pending_document=pending_doc,
|
||||
)
|
||||
|
||||
await session.commit()
|
||||
|
|
@ -341,6 +369,15 @@ async def index_google_drive_single_file(
|
|||
"Successfully committed Google Drive file indexing changes to database"
|
||||
)
|
||||
|
||||
if failed > 0:
|
||||
error_msg = f"Failed to index file {display_name}"
|
||||
await task_logger.log_task_failure(
|
||||
log_entry,
|
||||
error_msg,
|
||||
{"file_name": display_name, "file_id": file_id},
|
||||
)
|
||||
return 0, error_msg
|
||||
|
||||
if indexed > 0:
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
|
|
@ -397,7 +434,12 @@ async def _index_full_scan(
|
|||
include_subfolders: bool = False,
|
||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||
) -> tuple[int, int]:
|
||||
"""Perform full scan indexing of a folder."""
|
||||
"""Perform full scan indexing of a folder.
|
||||
|
||||
Implements 2-phase document status updates for real-time UI feedback:
|
||||
- Phase 1: Collect all files and create pending documents (visible in UI immediately)
|
||||
- Phase 2: Process each file: pending → processing → ready/failed
|
||||
"""
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Starting full scan of folder: {folder_name} (include_subfolders={include_subfolders})",
|
||||
|
|
@ -410,29 +452,31 @@ async def _index_full_scan(
|
|||
|
||||
documents_indexed = 0
|
||||
documents_skipped = 0
|
||||
documents_failed = 0
|
||||
files_processed = 0
|
||||
|
||||
# Heartbeat tracking - update notification periodically to prevent appearing stuck
|
||||
last_heartbeat_time = time.time()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 1: Collect all files and create pending documents
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
# =======================================================================
|
||||
files_to_process = [] # List of (file, pending_document or None)
|
||||
new_documents_created = False
|
||||
|
||||
# Queue of folders to process: (folder_id, folder_name)
|
||||
folders_to_process = [(folder_id, folder_name)]
|
||||
|
||||
logger.info("Phase 1: Collecting files and creating pending documents")
|
||||
|
||||
while folders_to_process and files_processed < max_files:
|
||||
# Check if it's time for a heartbeat update
|
||||
if (
|
||||
on_heartbeat_callback
|
||||
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
|
||||
):
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = time.time()
|
||||
current_folder_id, current_folder_name = folders_to_process.pop(0)
|
||||
logger.info(f"Processing folder: {current_folder_name} ({current_folder_id})")
|
||||
logger.info(f"Scanning folder: {current_folder_name} ({current_folder_id})")
|
||||
page_token = None
|
||||
|
||||
while files_processed < max_files:
|
||||
# Get files and folders in current folder
|
||||
# include_subfolders=True here so we get folder items to queue them
|
||||
files, next_token, error = await get_files_in_folder(
|
||||
drive_client,
|
||||
current_folder_id,
|
||||
|
|
@ -462,35 +506,74 @@ async def _index_full_scan(
|
|||
logger.debug(f"Queued subfolder: {file.get('name', 'Unknown')}")
|
||||
continue
|
||||
|
||||
# Process the file
|
||||
files_processed += 1
|
||||
|
||||
indexed, skipped = await _process_single_file(
|
||||
drive_client=drive_client,
|
||||
# Create pending document for this file
|
||||
pending_doc, should_skip = await _create_pending_document_for_file(
|
||||
session=session,
|
||||
file=file,
|
||||
connector_id=connector_id,
|
||||
search_space_id=search_space_id,
|
||||
user_id=user_id,
|
||||
task_logger=task_logger,
|
||||
log_entry=log_entry,
|
||||
)
|
||||
|
||||
documents_indexed += indexed
|
||||
documents_skipped += skipped
|
||||
if should_skip:
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
if documents_indexed % 10 == 0 and documents_indexed > 0:
|
||||
await session.commit()
|
||||
logger.info(
|
||||
f"Committed batch: {documents_indexed} files indexed so far"
|
||||
)
|
||||
if pending_doc and pending_doc.id is None:
|
||||
# New document was created
|
||||
new_documents_created = True
|
||||
|
||||
files_to_process.append((file, pending_doc))
|
||||
|
||||
page_token = next_token
|
||||
if not page_token:
|
||||
break
|
||||
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info(
|
||||
f"Phase 1: Committing {len([f for f in files_to_process if f[1] and f[1].id is None])} pending documents"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each file one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(files_to_process)} files")
|
||||
|
||||
for file, pending_doc in files_to_process:
|
||||
# Check if it's time for a heartbeat update
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
indexed, skipped, failed = await _process_single_file(
|
||||
drive_client=drive_client,
|
||||
session=session,
|
||||
file=file,
|
||||
connector_id=connector_id,
|
||||
search_space_id=search_space_id,
|
||||
user_id=user_id,
|
||||
task_logger=task_logger,
|
||||
log_entry=log_entry,
|
||||
pending_document=pending_doc,
|
||||
)
|
||||
|
||||
documents_indexed += indexed
|
||||
documents_skipped += skipped
|
||||
documents_failed += failed
|
||||
|
||||
if documents_indexed % 10 == 0 and documents_indexed > 0:
|
||||
await session.commit()
|
||||
logger.info(f"Committed batch: {documents_indexed} files indexed so far")
|
||||
|
||||
logger.info(
|
||||
f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped"
|
||||
f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped, {documents_failed} failed"
|
||||
)
|
||||
return documents_indexed, documents_skipped
|
||||
|
||||
|
|
@ -514,6 +597,10 @@ async def _index_with_delta_sync(
|
|||
|
||||
Note: include_subfolders is accepted for API consistency but delta sync
|
||||
automatically tracks changes across all folders including subfolders.
|
||||
|
||||
Implements 2-phase document status updates for real-time UI feedback:
|
||||
- Phase 1: Collect all changes and create pending documents (visible in UI immediately)
|
||||
- Phase 2: Process each file: pending → processing → ready/failed
|
||||
"""
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
|
|
@ -537,19 +624,21 @@ async def _index_with_delta_sync(
|
|||
|
||||
documents_indexed = 0
|
||||
documents_skipped = 0
|
||||
documents_failed = 0
|
||||
files_processed = 0
|
||||
|
||||
# Heartbeat tracking - update notification periodically to prevent appearing stuck
|
||||
last_heartbeat_time = time.time()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 1: Analyze changes and create pending documents for new/modified files
|
||||
# =======================================================================
|
||||
changes_to_process = [] # List of (change, file, pending_document or None)
|
||||
new_documents_created = False
|
||||
|
||||
logger.info("Phase 1: Analyzing changes and creating pending documents")
|
||||
|
||||
for change in changes:
|
||||
# Check if it's time for a heartbeat update
|
||||
if (
|
||||
on_heartbeat_callback
|
||||
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
|
||||
):
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = time.time()
|
||||
if files_processed >= max_files:
|
||||
break
|
||||
|
||||
|
|
@ -566,7 +655,45 @@ async def _index_with_delta_sync(
|
|||
if not file:
|
||||
continue
|
||||
|
||||
indexed, skipped = await _process_single_file(
|
||||
# Create pending document for this file
|
||||
pending_doc, should_skip = await _create_pending_document_for_file(
|
||||
session=session,
|
||||
file=file,
|
||||
connector_id=connector_id,
|
||||
search_space_id=search_space_id,
|
||||
user_id=user_id,
|
||||
)
|
||||
|
||||
if should_skip:
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
if pending_doc and pending_doc.id is None:
|
||||
# New document was created
|
||||
new_documents_created = True
|
||||
|
||||
changes_to_process.append((change, file, pending_doc))
|
||||
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info("Phase 1: Committing pending documents")
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each file one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(changes_to_process)} changes")
|
||||
|
||||
for _, file, pending_doc in changes_to_process:
|
||||
# Check if it's time for a heartbeat update
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
indexed, skipped, failed = await _process_single_file(
|
||||
drive_client=drive_client,
|
||||
session=session,
|
||||
file=file,
|
||||
|
|
@ -575,21 +702,125 @@ async def _index_with_delta_sync(
|
|||
user_id=user_id,
|
||||
task_logger=task_logger,
|
||||
log_entry=log_entry,
|
||||
pending_document=pending_doc,
|
||||
)
|
||||
|
||||
documents_indexed += indexed
|
||||
documents_skipped += skipped
|
||||
documents_failed += failed
|
||||
|
||||
if documents_indexed % 10 == 0 and documents_indexed > 0:
|
||||
await session.commit()
|
||||
logger.info(f"Committed batch: {documents_indexed} changes processed")
|
||||
|
||||
logger.info(
|
||||
f"Delta sync complete: {documents_indexed} indexed, {documents_skipped} skipped"
|
||||
f"Delta sync complete: {documents_indexed} indexed, {documents_skipped} skipped, {documents_failed} failed"
|
||||
)
|
||||
return documents_indexed, documents_skipped
|
||||
|
||||
|
||||
async def _create_pending_document_for_file(
    session: AsyncSession,
    file: dict,
    connector_id: int,
    search_space_id: int,
    user_id: str,
) -> tuple[Document | None, bool]:
    """
    Decide how a Google Drive file enters the 2-phase indexing pipeline.

    This is the Phase 1 step: a file that has no document yet gets a
    placeholder ``Document`` with pending status added to the session, so
    that it can be shown before any content is processed. The session is
    only added to here, never committed; committing is left to the caller.

    Args:
        session: Database session used for the lookup and for ``session.add``.
        file: File metadata dict from the Google Drive API. The keys read are
            ``id``, ``name``, ``mimeType``, ``md5Checksum`` and
            ``modifiedTime``.
        connector_id: ID of the Drive connector, stored on the new document.
        search_space_id: ID of the search space the document belongs to.
        user_id: ID of the user recorded as creator of the new document.

    Returns:
        A ``(document, should_skip)`` tuple:
        - ``(None, True)`` when the MIME type is rejected by
          ``should_skip_file``, the file has no ``id``, or an existing
          document's stored checksum / modified time matches the incoming one.
        - ``(existing_document, False)`` when a document already exists and
          its content is considered changed.
        - ``(new_document, False)`` when a new pending placeholder was created.
    """
    from app.connectors.google_drive.file_types import should_skip_file

    drive_file_id = file.get("id")
    drive_file_name = file.get("name", "Unknown")
    drive_mime_type = file.get("mimeType", "")

    # Non-indexable MIME types are rejected first, then files without an ID.
    if should_skip_file(drive_mime_type) or not drive_file_id:
        return None, True

    identifier_hash = generate_unique_identifier_hash(
        DocumentType.GOOGLE_DRIVE_FILE, drive_file_id, search_space_id
    )
    stored_doc = await check_document_by_unique_identifier(session, identifier_hash)

    if not stored_doc:
        # First time this file is seen: register a pending placeholder.
        placeholder_metadata = {
            "google_drive_file_id": drive_file_id,
            "google_drive_file_name": drive_file_name,
            "google_drive_mime_type": drive_mime_type,
            "connector_id": connector_id,
        }
        new_doc = Document(
            search_space_id=search_space_id,
            title=drive_file_name,
            document_type=DocumentType.GOOGLE_DRIVE_FILE,
            document_metadata=placeholder_metadata,
            content="Pending...",  # Placeholder text until processing fills it in
            # The identifier hash doubles as a temporary unique content hash;
            # it is replaced once the document is ready.
            content_hash=identifier_hash,
            unique_identifier_hash=identifier_hash,
            embedding=None,
            chunks=[],  # No chunks exist yet at creation time
            status=DocumentStatus.pending(),
            updated_at=get_current_timestamp(),
            created_by_id=user_id,
            connector_id=connector_id,
        )
        session.add(new_doc)
        return new_doc, False

    # A document already exists: compare incoming metadata with what is stored.
    saved_metadata = stored_doc.document_metadata or {}
    new_md5 = file.get("md5Checksum")
    new_mtime = file.get("modifiedTime")
    old_md5 = saved_metadata.get("md5_checksum")
    old_mtime = saved_metadata.get("modified_time")

    if new_md5:
        # Checksum available: unchanged only if a stored checksum exists and matches.
        same_content = bool(old_md5) and new_md5 == old_md5
    else:
        # No checksum on the incoming file (the original comment attributes this
        # to Google Workspace files): fall back to comparing modifiedTime.
        same_content = bool(new_mtime and old_mtime) and new_mtime == old_mtime

    if not same_content:
        # Content changed - hand the existing document back for an update.
        return stored_doc, False

    # Unchanged content: repair a status left in a non-ready state, then skip.
    already_ready = DocumentStatus.is_state(stored_doc.status, DocumentStatus.READY)
    if not already_ready:
        stored_doc.status = DocumentStatus.ready()
    return None, True
|
||||
|
||||
|
||||
async def _check_rename_only_update(
|
||||
session: AsyncSession,
|
||||
file: dict,
|
||||
|
|
@ -725,15 +956,31 @@ async def _process_single_file(
|
|||
user_id: str,
|
||||
task_logger: TaskLoggingService,
|
||||
log_entry: any,
|
||||
) -> tuple[int, int]:
|
||||
pending_document: Document | None = None,
|
||||
) -> tuple[int, int, int]:
|
||||
"""
|
||||
Process a single file by downloading and using Surfsense's file processor.
|
||||
|
||||
Implements Phase 2 of the 2-phase document status update pattern.
|
||||
Updates document status: pending → processing → ready/failed
|
||||
|
||||
Args:
|
||||
drive_client: Google Drive client
|
||||
session: Database session
|
||||
file: File metadata from Google Drive API
|
||||
connector_id: ID of the connector
|
||||
search_space_id: ID of the search space
|
||||
user_id: ID of the user
|
||||
task_logger: Task logging service
|
||||
log_entry: Log entry for tracking
|
||||
pending_document: Optional pending document created in Phase 1
|
||||
|
||||
Returns:
|
||||
Tuple of (indexed_count, skipped_count)
|
||||
Tuple of (indexed_count, skipped_count, failed_count)
|
||||
"""
|
||||
file_name = file.get("name", "Unknown")
|
||||
mime_type = file.get("mimeType", "")
|
||||
file_id = file.get("id")
|
||||
|
||||
try:
|
||||
logger.info(f"Processing file: {file_name} ({mime_type})")
|
||||
|
|
@ -756,10 +1003,15 @@ async def _process_single_file(
|
|||
# Return 1 for renamed files (they are "indexed" in the sense that they're updated)
|
||||
# Return 0 for unchanged files
|
||||
if "renamed" in (rename_message or "").lower():
|
||||
return 1, 0
|
||||
return 0, 1
|
||||
return 1, 0, 0
|
||||
return 0, 1, 0
|
||||
|
||||
_, error, _ = await download_and_process_file(
|
||||
# Set document to PROCESSING status if we have a pending document
|
||||
if pending_document:
|
||||
pending_document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
_, error, metadata = await download_and_process_file(
|
||||
client=drive_client,
|
||||
file=file,
|
||||
search_space_id=search_space_id,
|
||||
|
|
@ -776,14 +1028,46 @@ async def _process_single_file(
|
|||
f"Skipped {file_name}: {error}",
|
||||
{"status": "skipped", "reason": error},
|
||||
)
|
||||
return 0, 1
|
||||
# Mark pending document as failed if it exists
|
||||
if pending_document:
|
||||
pending_document.status = DocumentStatus.failed(error)
|
||||
pending_document.updated_at = get_current_timestamp()
|
||||
await session.commit()
|
||||
return 0, 1, 0
|
||||
|
||||
# The document was created/updated by download_and_process_file
|
||||
# Find the document and ensure it has READY status
|
||||
if file_id:
|
||||
unique_identifier_hash = generate_unique_identifier_hash(
|
||||
DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
|
||||
)
|
||||
processed_doc = await check_document_by_unique_identifier(
|
||||
session, unique_identifier_hash
|
||||
)
|
||||
# Ensure status is READY
|
||||
if processed_doc and not DocumentStatus.is_state(
|
||||
processed_doc.status, DocumentStatus.READY
|
||||
):
|
||||
processed_doc.status = DocumentStatus.ready()
|
||||
processed_doc.updated_at = get_current_timestamp()
|
||||
await session.commit()
|
||||
|
||||
logger.info(f"Successfully indexed Google Drive file: {file_name}")
|
||||
return 1, 0
|
||||
return 1, 0, 0
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing file {file_name}: {e!s}", exc_info=True)
|
||||
return 0, 1
|
||||
# Mark pending document as failed if it exists
|
||||
if pending_document:
|
||||
try:
|
||||
pending_document.status = DocumentStatus.failed(str(e))
|
||||
pending_document.updated_at = get_current_timestamp()
|
||||
await session.commit()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
return 0, 0, 1
|
||||
|
||||
|
||||
async def _remove_document(session: AsyncSession, file_id: str, search_space_id: int):
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
"""
|
||||
Google Gmail connector indexer.
|
||||
|
||||
Implements 2-phase document status updates for real-time UI feedback:
|
||||
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
|
||||
- Phase 2: Process each document: pending → processing → ready/failed
|
||||
"""
|
||||
|
||||
import time
|
||||
|
|
@ -13,6 +17,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
from app.connectors.google_gmail_connector import GoogleGmailConnector
|
||||
from app.db import (
|
||||
Document,
|
||||
DocumentStatus,
|
||||
DocumentType,
|
||||
SearchSourceConnectorType,
|
||||
)
|
||||
|
|
@ -32,6 +37,7 @@ from .base import (
|
|||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
safe_set_chunks,
|
||||
update_connector_last_indexed,
|
||||
)
|
||||
|
||||
|
|
@ -220,20 +226,23 @@ async def index_google_gmail_messages(
|
|||
logger.info(f"Found {len(messages)} Google gmail messages to index")
|
||||
|
||||
documents_indexed = 0
|
||||
skipped_messages = []
|
||||
documents_skipped = 0
|
||||
documents_failed = 0 # Track messages that failed processing
|
||||
duplicate_content_count = (
|
||||
0 # Track messages skipped due to duplicate content_hash
|
||||
)
|
||||
|
||||
# Heartbeat tracking - update notification periodically to prevent appearing stuck
|
||||
last_heartbeat_time = time.time()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 1: Analyze all messages, create pending documents
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
# =======================================================================
|
||||
messages_to_process = [] # List of dicts with document and message data
|
||||
new_documents_created = False
|
||||
|
||||
for message in messages:
|
||||
# Check if it's time for a heartbeat update
|
||||
if (
|
||||
on_heartbeat_callback
|
||||
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
|
||||
):
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = time.time()
|
||||
try:
|
||||
# Extract message information
|
||||
message_id = message.get("id", "")
|
||||
|
|
@ -259,7 +268,6 @@ async def index_google_gmail_messages(
|
|||
|
||||
if not message_id:
|
||||
logger.warning(f"Skipping message with missing ID: {subject}")
|
||||
skipped_messages.append(f"{subject} (missing ID)")
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
|
|
@ -268,7 +276,6 @@ async def index_google_gmail_messages(
|
|||
|
||||
if not markdown_content.strip():
|
||||
logger.warning(f"Skipping message with no content: {subject}")
|
||||
skipped_messages.append(f"{subject} (no content)")
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
|
|
@ -288,68 +295,29 @@ async def index_google_gmail_messages(
|
|||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
logger.info(
|
||||
f"Document for Gmail message {subject} unchanged. Skipping."
|
||||
)
|
||||
# Ensure status is ready (might have been stuck in processing/pending)
|
||||
if not DocumentStatus.is_state(
|
||||
existing_document.status, DocumentStatus.READY
|
||||
):
|
||||
existing_document.status = DocumentStatus.ready()
|
||||
documents_skipped += 1
|
||||
continue
|
||||
else:
|
||||
# Content has changed - update the existing document
|
||||
logger.info(
|
||||
f"Content changed for Gmail message {subject}. Updating document."
|
||||
)
|
||||
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
"message_id": message_id,
|
||||
"thread_id": thread_id,
|
||||
"subject": subject,
|
||||
"sender": sender,
|
||||
"date": date_str,
|
||||
"document_type": "Gmail Message",
|
||||
"connector_type": "Google Gmail",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
markdown_content, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
summary_content = f"Google Gmail Message: {subject}\n\n"
|
||||
summary_content += f"Sender: {sender}\n"
|
||||
summary_content += f"Date: {date_str}\n"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(markdown_content)
|
||||
|
||||
# Update existing document
|
||||
existing_document.title = f"Gmail: {subject}"
|
||||
existing_document.content = summary_content
|
||||
existing_document.content_hash = content_hash
|
||||
existing_document.embedding = summary_embedding
|
||||
existing_document.document_metadata = {
|
||||
# Queue existing document for update (will be set to processing in Phase 2)
|
||||
messages_to_process.append(
|
||||
{
|
||||
"document": existing_document,
|
||||
"is_new": False,
|
||||
"markdown_content": markdown_content,
|
||||
"content_hash": content_hash,
|
||||
"message_id": message_id,
|
||||
"thread_id": thread_id,
|
||||
"subject": subject,
|
||||
"sender": sender,
|
||||
"date": date_str,
|
||||
"connector_id": connector_id,
|
||||
"date_str": date_str,
|
||||
}
|
||||
existing_document.chunks = chunks
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
|
||||
documents_indexed += 1
|
||||
logger.info(f"Successfully updated Gmail message {subject}")
|
||||
continue
|
||||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
|
|
@ -364,48 +332,14 @@ async def index_google_gmail_messages(
|
|||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
duplicate_content_count += 1
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
"message_id": message_id,
|
||||
"thread_id": thread_id,
|
||||
"subject": subject,
|
||||
"sender": sender,
|
||||
"date": date_str,
|
||||
"document_type": "Gmail Message",
|
||||
"connector_type": "Google Gmail",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
markdown_content, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_content = f"Google Gmail Message: {subject}\n\n"
|
||||
summary_content += f"Sender: {sender}\n"
|
||||
summary_content += f"Date: {date_str}\n"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(markdown_content)
|
||||
|
||||
# Create and store new document
|
||||
logger.info(f"Creating new document for Gmail message: {subject}")
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=f"Gmail: {subject}",
|
||||
title=subject,
|
||||
document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
|
||||
document_metadata={
|
||||
"message_id": message_id,
|
||||
|
|
@ -413,21 +347,120 @@ async def index_google_gmail_messages(
|
|||
"subject": subject,
|
||||
"sender": sender,
|
||||
"date": date_str,
|
||||
"connector_id": connector_id,
|
||||
},
|
||||
content=summary_content,
|
||||
content_hash=content_hash,
|
||||
content="Pending...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation - safe for async
|
||||
status=DocumentStatus.pending(), # Pending until processing starts
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
session.add(document)
|
||||
documents_indexed += 1
|
||||
logger.info(f"Successfully indexed new email {summary_content}")
|
||||
new_documents_created = True
|
||||
|
||||
# Batch commit every 10 documents
|
||||
messages_to_process.append(
|
||||
{
|
||||
"document": document,
|
||||
"is_new": True,
|
||||
"markdown_content": markdown_content,
|
||||
"content_hash": content_hash,
|
||||
"message_id": message_id,
|
||||
"thread_id": thread_id,
|
||||
"subject": subject,
|
||||
"sender": sender,
|
||||
"date_str": date_str,
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Phase 1 for message: {e!s}", exc_info=True)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info(
|
||||
f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each document one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(messages_to_process)} documents")
|
||||
|
||||
for item in messages_to_process:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
document = item["document"]
|
||||
try:
|
||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
# Heavy processing (LLM, embeddings, chunks)
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata_for_summary = {
|
||||
"message_id": item["message_id"],
|
||||
"thread_id": item["thread_id"],
|
||||
"subject": item["subject"],
|
||||
"sender": item["sender"],
|
||||
"date": item["date_str"],
|
||||
"document_type": "Gmail Message",
|
||||
"connector_type": "Google Gmail",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
item["markdown_content"],
|
||||
user_llm,
|
||||
document_metadata_for_summary,
|
||||
)
|
||||
else:
|
||||
summary_content = f"Google Gmail Message: {item['subject']}\n\n"
|
||||
summary_content += f"Sender: {item['sender']}\n"
|
||||
summary_content += f"Date: {item['date_str']}\n"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
chunks = await create_document_chunks(item["markdown_content"])
|
||||
|
||||
# Update document to READY with actual content
|
||||
document.title = item["subject"]
|
||||
document.content = summary_content
|
||||
document.content_hash = item["content_hash"]
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = {
|
||||
"message_id": item["message_id"],
|
||||
"thread_id": item["thread_id"],
|
||||
"subject": item["subject"],
|
||||
"sender": item["sender"],
|
||||
"date": item["date_str"],
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
documents_indexed += 1
|
||||
|
||||
# Batch commit every 10 documents (for ready status updates)
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed} Gmail messages processed so far"
|
||||
|
|
@ -435,45 +468,76 @@ async def index_google_gmail_messages(
|
|||
await session.commit()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error processing the email {message_id}: {e!s}",
|
||||
exc_info=True,
|
||||
)
|
||||
skipped_messages.append(f"{subject} (processing error)")
|
||||
documents_skipped += 1
|
||||
continue # Skip this message and continue with others
|
||||
logger.error(f"Error processing Gmail message: {e!s}", exc_info=True)
|
||||
# Mark document as failed with reason (visible in UI)
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(e))
|
||||
document.updated_at = get_current_timestamp()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# Update the last_indexed_at timestamp for the connector only if requested
|
||||
total_processed = documents_indexed
|
||||
if total_processed > 0:
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
|
||||
# Final commit for any remaining documents not yet committed in batches
|
||||
logger.info(f"Final commit: Total {documents_indexed} Gmail messages processed")
|
||||
await session.commit()
|
||||
logger.info(
|
||||
"Successfully committed all Google gmail document changes to database"
|
||||
)
|
||||
try:
|
||||
await session.commit()
|
||||
logger.info(
|
||||
"Successfully committed all Google Gmail document changes to database"
|
||||
)
|
||||
except Exception as e:
|
||||
# Handle any remaining integrity errors gracefully (race conditions, etc.)
|
||||
if (
|
||||
"duplicate key value violates unique constraint" in str(e).lower()
|
||||
or "uniqueviolationerror" in str(e).lower()
|
||||
):
|
||||
logger.warning(
|
||||
f"Duplicate content_hash detected during final commit. "
|
||||
f"This may occur if the same message was indexed by multiple connectors. "
|
||||
f"Rolling back and continuing. Error: {e!s}"
|
||||
)
|
||||
await session.rollback()
|
||||
# Don't fail the entire task - some documents may have been successfully indexed
|
||||
else:
|
||||
raise
|
||||
|
||||
# Build warning message if there were issues
|
||||
warning_parts = []
|
||||
if duplicate_content_count > 0:
|
||||
warning_parts.append(f"{duplicate_content_count} duplicate")
|
||||
if documents_failed > 0:
|
||||
warning_parts.append(f"{documents_failed} failed")
|
||||
warning_message = ", ".join(warning_parts) if warning_parts else None
|
||||
|
||||
total_processed = documents_indexed
|
||||
|
||||
# Log success
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Successfully completed Google gmail indexing for connector {connector_id}",
|
||||
f"Successfully completed Google Gmail indexing for connector {connector_id}",
|
||||
{
|
||||
"events_processed": total_processed,
|
||||
"documents_indexed": documents_indexed,
|
||||
"documents_skipped": documents_skipped,
|
||||
"skipped_messages_count": len(skipped_messages),
|
||||
"documents_failed": documents_failed,
|
||||
"duplicate_content_count": duplicate_content_count,
|
||||
},
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Google gmail indexing completed: {documents_indexed} new emails, {documents_skipped} skipped"
|
||||
f"Google Gmail indexing completed: {documents_indexed} ready, "
|
||||
f"{documents_skipped} skipped, {documents_failed} failed "
|
||||
f"({duplicate_content_count} duplicate content)"
|
||||
)
|
||||
return (
|
||||
total_processed,
|
||||
None,
|
||||
) # Return None as the error message to indicate success
|
||||
warning_message,
|
||||
) # Return warning_message (None on success)
|
||||
|
||||
except SQLAlchemyError as db_error:
|
||||
await session.rollback()
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
"""
|
||||
Jira connector indexer.
|
||||
|
||||
Provides real-time document status updates during indexing using a two-phase approach:
|
||||
- Phase 1: Create all documents with PENDING status (visible in UI immediately)
|
||||
- Phase 2: Process each document one by one (PENDING → PROCESSING → READY/FAILED)
|
||||
"""
|
||||
|
||||
import contextlib
|
||||
|
|
@ -12,7 +16,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
|
||||
from app.config import config
|
||||
from app.connectors.jira_history import JiraHistoryConnector
|
||||
from app.db import Document, DocumentType, SearchSourceConnectorType
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
|
|
@ -29,6 +33,7 @@ from .base import (
|
|||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
safe_set_chunks,
|
||||
update_connector_last_indexed,
|
||||
)
|
||||
|
||||
|
|
@ -174,22 +179,22 @@ async def index_jira_issues(
|
|||
logger.error(f"Error fetching Jira issues: {e!s}", exc_info=True)
|
||||
return 0, f"Error fetching Jira issues: {e!s}"
|
||||
|
||||
# Process and index each issue
|
||||
# =======================================================================
|
||||
# PHASE 1: Analyze all issues, create pending documents
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
# =======================================================================
|
||||
documents_indexed = 0
|
||||
skipped_issues = []
|
||||
documents_skipped = 0
|
||||
documents_failed = 0
|
||||
duplicate_content_count = 0
|
||||
|
||||
# Heartbeat tracking - update notification periodically to prevent appearing stuck
|
||||
last_heartbeat_time = time.time()
|
||||
|
||||
issues_to_process = [] # List of dicts with document and issue data
|
||||
new_documents_created = False
|
||||
|
||||
for issue in issues:
|
||||
# Check if it's time for a heartbeat update
|
||||
if (
|
||||
on_heartbeat_callback
|
||||
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
|
||||
):
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = time.time()
|
||||
try:
|
||||
issue_id = issue.get("key")
|
||||
issue_identifier = issue.get("key", "")
|
||||
|
|
@ -199,9 +204,6 @@ async def index_jira_issues(
|
|||
logger.warning(
|
||||
f"Skipping issue with missing ID or title: {issue_id or 'Unknown'}"
|
||||
)
|
||||
skipped_issues.append(
|
||||
f"{issue_identifier or 'Unknown'} (missing data)"
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
|
|
@ -215,7 +217,6 @@ async def index_jira_issues(
|
|||
logger.warning(
|
||||
f"Skipping issue with no content: {issue_identifier} - {issue_title}"
|
||||
)
|
||||
skipped_issues.append(f"{issue_identifier} (no content)")
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
|
|
@ -237,73 +238,29 @@ async def index_jira_issues(
|
|||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
logger.info(
|
||||
f"Document for Jira issue {issue_identifier} unchanged. Skipping."
|
||||
)
|
||||
# Ensure status is ready (might have been stuck in processing/pending)
|
||||
if not DocumentStatus.is_state(
|
||||
existing_document.status, DocumentStatus.READY
|
||||
):
|
||||
existing_document.status = DocumentStatus.ready()
|
||||
documents_skipped += 1
|
||||
continue
|
||||
else:
|
||||
# Content has changed - update the existing document
|
||||
logger.info(
|
||||
f"Content changed for Jira issue {issue_identifier}. Updating document."
|
||||
)
|
||||
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
"issue_key": issue_identifier,
|
||||
"issue_title": issue_title,
|
||||
"status": formatted_issue.get("status", "Unknown"),
|
||||
"priority": formatted_issue.get("priority", "Unknown"),
|
||||
"comment_count": comment_count,
|
||||
"document_type": "Jira Issue",
|
||||
"connector_type": "Jira",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
issue_content, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
summary_content = f"Jira Issue {issue_identifier}: {issue_title}\n\nStatus: {formatted_issue.get('status', 'Unknown')}\n\n"
|
||||
if formatted_issue.get("description"):
|
||||
summary_content += f"Description: {formatted_issue.get('description')}\n\n"
|
||||
summary_content += f"Comments: {comment_count}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(issue_content)
|
||||
|
||||
# Update existing document
|
||||
existing_document.title = (
|
||||
f"Jira - {issue_identifier}: {issue_title}"
|
||||
)
|
||||
existing_document.content = summary_content
|
||||
existing_document.content_hash = content_hash
|
||||
existing_document.embedding = summary_embedding
|
||||
existing_document.document_metadata = {
|
||||
# Queue existing document for update (will be set to processing in Phase 2)
|
||||
issues_to_process.append(
|
||||
{
|
||||
"document": existing_document,
|
||||
"is_new": False,
|
||||
"issue_content": issue_content,
|
||||
"content_hash": content_hash,
|
||||
"issue_id": issue_id,
|
||||
"issue_identifier": issue_identifier,
|
||||
"issue_title": issue_title,
|
||||
"state": formatted_issue.get("status", "Unknown"),
|
||||
"formatted_issue": formatted_issue,
|
||||
"comment_count": comment_count,
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
}
|
||||
existing_document.chunks = chunks
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
|
||||
documents_indexed += 1
|
||||
logger.info(
|
||||
f"Successfully updated Jira issue {issue_identifier}"
|
||||
)
|
||||
continue
|
||||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
|
|
@ -318,53 +275,14 @@ async def index_jira_issues(
|
|||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
duplicate_content_count += 1
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
"issue_key": issue_identifier,
|
||||
"issue_title": issue_title,
|
||||
"status": formatted_issue.get("status", "Unknown"),
|
||||
"priority": formatted_issue.get("priority", "Unknown"),
|
||||
"comment_count": comment_count,
|
||||
"document_type": "Jira Issue",
|
||||
"connector_type": "Jira",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
issue_content, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_content = f"Jira Issue {issue_identifier}: {issue_title}\n\nStatus: {formatted_issue.get('status', 'Unknown')}\n\n"
|
||||
if formatted_issue.get("description"):
|
||||
summary_content += (
|
||||
f"Description: {formatted_issue.get('description')}\n\n"
|
||||
)
|
||||
summary_content += f"Comments: {comment_count}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
# Process chunks - using the full issue content with comments
|
||||
chunks = await create_document_chunks(issue_content)
|
||||
|
||||
# Create and store new document
|
||||
logger.info(
|
||||
f"Creating new document for issue {issue_identifier} - {issue_title}"
|
||||
)
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=f"Jira - {issue_identifier}: {issue_title}",
|
||||
title=f"{issue_identifier}: {issue_title}",
|
||||
document_type=DocumentType.JIRA_CONNECTOR,
|
||||
document_metadata={
|
||||
"issue_id": issue_id,
|
||||
|
|
@ -372,25 +290,122 @@ async def index_jira_issues(
|
|||
"issue_title": issue_title,
|
||||
"state": formatted_issue.get("status", "Unknown"),
|
||||
"comment_count": comment_count,
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
},
|
||||
content=summary_content,
|
||||
content_hash=content_hash,
|
||||
content="Pending...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation - safe for async
|
||||
status=DocumentStatus.pending(), # Pending until processing starts
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
documents_indexed += 1
|
||||
logger.info(
|
||||
f"Successfully indexed new issue {issue_identifier} - {issue_title}"
|
||||
new_documents_created = True
|
||||
|
||||
issues_to_process.append(
|
||||
{
|
||||
"document": document,
|
||||
"is_new": True,
|
||||
"issue_content": issue_content,
|
||||
"content_hash": content_hash,
|
||||
"issue_id": issue_id,
|
||||
"issue_identifier": issue_identifier,
|
||||
"issue_title": issue_title,
|
||||
"formatted_issue": formatted_issue,
|
||||
"comment_count": comment_count,
|
||||
}
|
||||
)
|
||||
|
||||
# Batch commit every 10 documents
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Phase 1 for issue: {e!s}", exc_info=True)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info(
|
||||
f"Phase 1: Committing {len([i for i in issues_to_process if i['is_new']])} pending documents"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each document one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(issues_to_process)} documents")
|
||||
|
||||
for item in issues_to_process:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
document = item["document"]
|
||||
try:
|
||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
# Heavy processing (LLM, embeddings, chunks)
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
"issue_key": item["issue_identifier"],
|
||||
"issue_title": item["issue_title"],
|
||||
"status": item["formatted_issue"].get("status", "Unknown"),
|
||||
"priority": item["formatted_issue"].get("priority", "Unknown"),
|
||||
"comment_count": item["comment_count"],
|
||||
"document_type": "Jira Issue",
|
||||
"connector_type": "Jira",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
item["issue_content"], user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_content = f"Jira Issue {item['issue_identifier']}: {item['issue_title']}\n\nStatus: {item['formatted_issue'].get('status', 'Unknown')}\n\n"
|
||||
if item["formatted_issue"].get("description"):
|
||||
summary_content += f"Description: {item['formatted_issue'].get('description')}\n\n"
|
||||
summary_content += f"Comments: {item['comment_count']}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
# Process chunks - using the full issue content with comments
|
||||
chunks = await create_document_chunks(item["issue_content"])
|
||||
|
||||
# Update document to READY with actual content
|
||||
document.title = f"{item['issue_identifier']}: {item['issue_title']}"
|
||||
document.content = summary_content
|
||||
document.content_hash = item["content_hash"]
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = {
|
||||
"issue_id": item["issue_id"],
|
||||
"issue_identifier": item["issue_identifier"],
|
||||
"issue_title": item["issue_title"],
|
||||
"state": item["formatted_issue"].get("status", "Unknown"),
|
||||
"comment_count": item["comment_count"],
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
documents_indexed += 1
|
||||
|
||||
# Batch commit every 10 documents (for ready status updates)
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed} Jira issues processed so far"
|
||||
|
|
@ -399,48 +414,75 @@ async def index_jira_issues(
|
|||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error processing issue {issue.get('identifier', 'Unknown')}: {e!s}",
|
||||
f"Error processing issue {item.get('issue_identifier', 'Unknown')}: {e!s}",
|
||||
exc_info=True,
|
||||
)
|
||||
skipped_issues.append(
|
||||
f"{issue.get('identifier', 'Unknown')} (processing error)"
|
||||
)
|
||||
documents_skipped += 1
|
||||
# Mark document as failed with reason (visible in UI)
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(e))
|
||||
document.updated_at = get_current_timestamp()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
documents_failed += 1
|
||||
continue # Skip this issue and continue with others
|
||||
|
||||
# Update the last_indexed_at timestamp for the connector only if requested
|
||||
total_processed = documents_indexed
|
||||
if update_last_indexed:
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
|
||||
# This ensures the UI shows "Last indexed" instead of "Never indexed"
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
|
||||
# Final commit for any remaining documents not yet committed in batches
|
||||
# Final commit to ensure all documents are persisted (safety net)
|
||||
logger.info(f"Final commit: Total {documents_indexed} Jira issues processed")
|
||||
await session.commit()
|
||||
logger.info("Successfully committed all JIRA document changes to database")
|
||||
try:
|
||||
await session.commit()
|
||||
logger.info("Successfully committed all JIRA document changes to database")
|
||||
except Exception as e:
|
||||
# Handle any remaining integrity errors gracefully (race conditions, etc.)
|
||||
if (
|
||||
"duplicate key value violates unique constraint" in str(e).lower()
|
||||
or "uniqueviolationerror" in str(e).lower()
|
||||
):
|
||||
logger.warning(
|
||||
f"Duplicate content_hash detected during final commit. "
|
||||
f"This may occur if the same issue was indexed by multiple connectors. "
|
||||
f"Rolling back and continuing. Error: {e!s}"
|
||||
)
|
||||
await session.rollback()
|
||||
# Don't fail the entire task - some documents may have been successfully indexed
|
||||
else:
|
||||
raise
|
||||
|
||||
# Build warning message if there were issues
|
||||
warning_parts = []
|
||||
if duplicate_content_count > 0:
|
||||
warning_parts.append(f"{duplicate_content_count} duplicate")
|
||||
if documents_failed > 0:
|
||||
warning_parts.append(f"{documents_failed} failed")
|
||||
warning_message = ", ".join(warning_parts) if warning_parts else None
|
||||
|
||||
# Log success
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Successfully completed JIRA indexing for connector {connector_id}",
|
||||
{
|
||||
"issues_processed": total_processed,
|
||||
"documents_indexed": documents_indexed,
|
||||
"documents_skipped": documents_skipped,
|
||||
"skipped_issues_count": len(skipped_issues),
|
||||
"documents_failed": documents_failed,
|
||||
"duplicate_content_count": duplicate_content_count,
|
||||
},
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"JIRA indexing completed: {documents_indexed} new issues, {documents_skipped} skipped"
|
||||
f"JIRA indexing completed: {documents_indexed} ready, "
|
||||
f"{documents_skipped} skipped, {documents_failed} failed "
|
||||
f"({duplicate_content_count} duplicate content)"
|
||||
)
|
||||
|
||||
# Clean up the connector
|
||||
await jira_client.close()
|
||||
|
||||
return (
|
||||
total_processed,
|
||||
None,
|
||||
) # Return None as the error message to indicate success
|
||||
return documents_indexed, warning_message
|
||||
|
||||
except SQLAlchemyError as db_error:
|
||||
await session.rollback()
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
"""
|
||||
Linear connector indexer.
|
||||
|
||||
Implements 2-phase document status updates for real-time UI feedback:
|
||||
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
|
||||
- Phase 2: Process each document: pending → processing → ready/failed
|
||||
"""
|
||||
|
||||
import time
|
||||
|
|
@ -11,7 +15,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
|
||||
from app.config import config
|
||||
from app.connectors.linear_connector import LinearConnector
|
||||
from app.db import Document, DocumentType, SearchSourceConnectorType
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
|
|
@ -28,6 +32,7 @@ from .base import (
|
|||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
safe_set_chunks,
|
||||
update_connector_last_indexed,
|
||||
)
|
||||
|
||||
|
|
@ -196,6 +201,7 @@ async def index_linear_issues(
|
|||
# Track the number of documents indexed
|
||||
documents_indexed = 0
|
||||
documents_skipped = 0
|
||||
documents_failed = 0 # Track issues that failed processing
|
||||
skipped_issues = []
|
||||
|
||||
# Heartbeat tracking - update notification periodically to prevent appearing stuck
|
||||
|
|
@ -207,16 +213,14 @@ async def index_linear_issues(
|
|||
{"stage": "process_issues", "total_issues": len(issues)},
|
||||
)
|
||||
|
||||
# Process each issue
|
||||
for issue in issues:
|
||||
# Check if it's time for a heartbeat update
|
||||
if (
|
||||
on_heartbeat_callback
|
||||
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
|
||||
):
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = time.time()
|
||||
# =======================================================================
|
||||
# PHASE 1: Analyze all issues, create pending documents
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
# =======================================================================
|
||||
issues_to_process = [] # List of dicts with document and issue data
|
||||
new_documents_created = False
|
||||
|
||||
for issue in issues:
|
||||
try:
|
||||
issue_id = issue.get("id", "")
|
||||
issue_identifier = issue.get("identifier", "")
|
||||
|
|
@ -262,80 +266,39 @@ async def index_linear_issues(
|
|||
state = formatted_issue.get("state", "Unknown")
|
||||
description = formatted_issue.get("description", "")
|
||||
comment_count = len(formatted_issue.get("comments", []))
|
||||
priority = formatted_issue.get("priority", "Unknown")
|
||||
|
||||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
# Ensure status is ready (might have been stuck in processing/pending)
|
||||
if not DocumentStatus.is_state(
|
||||
existing_document.status, DocumentStatus.READY
|
||||
):
|
||||
existing_document.status = DocumentStatus.ready()
|
||||
logger.info(
|
||||
f"Document for Linear issue {issue_identifier} unchanged. Skipping."
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
else:
|
||||
# Content has changed - update the existing document
|
||||
logger.info(
|
||||
f"Content changed for Linear issue {issue_identifier}. Updating document."
|
||||
)
|
||||
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
"issue_id": issue_identifier,
|
||||
"issue_title": issue_title,
|
||||
"state": state,
|
||||
"priority": formatted_issue.get("priority", "Unknown"),
|
||||
"comment_count": comment_count,
|
||||
"document_type": "Linear Issue",
|
||||
"connector_type": "Linear",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
issue_content, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
if description and len(description) > 1000:
|
||||
description = description[:997] + "..."
|
||||
summary_content = f"Linear Issue {issue_identifier}: {issue_title}\n\nStatus: {state}\n\n"
|
||||
if description:
|
||||
summary_content += f"Description: {description}\n\n"
|
||||
summary_content += f"Comments: {comment_count}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(issue_content)
|
||||
|
||||
# Update existing document
|
||||
existing_document.title = (
|
||||
f"Linear - {issue_identifier}: {issue_title}"
|
||||
)
|
||||
existing_document.content = summary_content
|
||||
existing_document.content_hash = content_hash
|
||||
existing_document.embedding = summary_embedding
|
||||
existing_document.document_metadata = {
|
||||
# Queue existing document for update (will be set to processing in Phase 2)
|
||||
issues_to_process.append(
|
||||
{
|
||||
"document": existing_document,
|
||||
"is_new": False,
|
||||
"issue_content": issue_content,
|
||||
"content_hash": content_hash,
|
||||
"issue_id": issue_id,
|
||||
"issue_identifier": issue_identifier,
|
||||
"issue_title": issue_title,
|
||||
"state": state,
|
||||
"description": description,
|
||||
"comment_count": comment_count,
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"priority": priority,
|
||||
}
|
||||
existing_document.chunks = chunks
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
|
||||
documents_indexed += 1
|
||||
logger.info(
|
||||
f"Successfully updated Linear issue {issue_identifier}"
|
||||
)
|
||||
continue
|
||||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
|
|
@ -353,51 +316,10 @@ async def index_linear_issues(
|
|||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
"issue_id": issue_identifier,
|
||||
"issue_title": issue_title,
|
||||
"state": state,
|
||||
"priority": formatted_issue.get("priority", "Unknown"),
|
||||
"comment_count": comment_count,
|
||||
"document_type": "Linear Issue",
|
||||
"connector_type": "Linear",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
issue_content, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
# Truncate description if it's too long for the summary
|
||||
if description and len(description) > 1000:
|
||||
description = description[:997] + "..."
|
||||
summary_content = f"Linear Issue {issue_identifier}: {issue_title}\n\nStatus: {state}\n\n"
|
||||
if description:
|
||||
summary_content += f"Description: {description}\n\n"
|
||||
summary_content += f"Comments: {comment_count}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
# Process chunks - using the full issue content with comments
|
||||
chunks = await create_document_chunks(issue_content)
|
||||
|
||||
# Create and store new document
|
||||
logger.info(
|
||||
f"Creating new document for issue {issue_identifier} - {issue_title}"
|
||||
)
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=f"Linear - {issue_identifier}: {issue_title}",
|
||||
title=f"{issue_identifier}: {issue_title}",
|
||||
document_type=DocumentType.LINEAR_CONNECTOR,
|
||||
document_metadata={
|
||||
"issue_id": issue_id,
|
||||
|
|
@ -405,25 +327,126 @@ async def index_linear_issues(
|
|||
"issue_title": issue_title,
|
||||
"state": state,
|
||||
"comment_count": comment_count,
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
},
|
||||
content=summary_content,
|
||||
content_hash=content_hash,
|
||||
content="Pending...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation - safe for async
|
||||
status=DocumentStatus.pending(), # Pending until processing starts
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
documents_indexed += 1
|
||||
logger.info(
|
||||
f"Successfully indexed new issue {issue_identifier} - {issue_title}"
|
||||
new_documents_created = True
|
||||
|
||||
issues_to_process.append(
|
||||
{
|
||||
"document": document,
|
||||
"is_new": True,
|
||||
"issue_content": issue_content,
|
||||
"content_hash": content_hash,
|
||||
"issue_id": issue_id,
|
||||
"issue_identifier": issue_identifier,
|
||||
"issue_title": issue_title,
|
||||
"state": state,
|
||||
"description": description,
|
||||
"comment_count": comment_count,
|
||||
"priority": priority,
|
||||
}
|
||||
)
|
||||
|
||||
# Batch commit every 10 documents
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Phase 1 for issue: {e!s}", exc_info=True)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info(
|
||||
f"Phase 1: Committing {len([i for i in issues_to_process if i['is_new']])} pending documents"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each document one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(issues_to_process)} documents")
|
||||
|
||||
for item in issues_to_process:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
document = item["document"]
|
||||
try:
|
||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
# Heavy processing (LLM, embeddings, chunks)
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata_for_summary = {
|
||||
"issue_id": item["issue_identifier"],
|
||||
"issue_title": item["issue_title"],
|
||||
"state": item["state"],
|
||||
"priority": item["priority"],
|
||||
"comment_count": item["comment_count"],
|
||||
"document_type": "Linear Issue",
|
||||
"connector_type": "Linear",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
item["issue_content"], user_llm, document_metadata_for_summary
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
description = item["description"]
|
||||
if description and len(description) > 1000:
|
||||
description = description[:997] + "..."
|
||||
summary_content = f"Linear Issue {item['issue_identifier']}: {item['issue_title']}\n\nStatus: {item['state']}\n\n"
|
||||
if description:
|
||||
summary_content += f"Description: {description}\n\n"
|
||||
summary_content += f"Comments: {item['comment_count']}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
chunks = await create_document_chunks(item["issue_content"])
|
||||
|
||||
# Update document to READY with actual content
|
||||
document.title = f"{item['issue_identifier']}: {item['issue_title']}"
|
||||
document.content = summary_content
|
||||
document.content_hash = item["content_hash"]
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = {
|
||||
"issue_id": item["issue_id"],
|
||||
"issue_identifier": item["issue_identifier"],
|
||||
"issue_title": item["issue_title"],
|
||||
"state": item["state"],
|
||||
"comment_count": item["comment_count"],
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
documents_indexed += 1
|
||||
|
||||
# Batch commit every 10 documents (for ready status updates)
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed} Linear issues processed so far"
|
||||
|
|
@ -432,44 +455,72 @@ async def index_linear_issues(
|
|||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error processing issue {issue.get('identifier', 'Unknown')}: {e!s}",
|
||||
f"Error processing issue {item.get('issue_identifier', 'Unknown')}: {e!s}",
|
||||
exc_info=True,
|
||||
)
|
||||
# Mark document as failed with reason (visible in UI)
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(e))
|
||||
document.updated_at = get_current_timestamp()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
skipped_issues.append(
|
||||
f"{issue.get('identifier', 'Unknown')} (processing error)"
|
||||
f"{item.get('issue_identifier', 'Unknown')} (processing error)"
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue # Skip this issue and continue with others
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# Update the last_indexed_at timestamp for the connector only if requested
|
||||
total_processed = documents_indexed
|
||||
if update_last_indexed:
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
|
||||
# Final commit for any remaining documents not yet committed in batches
|
||||
logger.info(f"Final commit: Total {documents_indexed} Linear issues processed")
|
||||
await session.commit()
|
||||
logger.info("Successfully committed all Linear document changes to database")
|
||||
try:
|
||||
await session.commit()
|
||||
logger.info(
|
||||
"Successfully committed all Linear document changes to database"
|
||||
)
|
||||
except Exception as e:
|
||||
# Handle any remaining integrity errors gracefully (race conditions, etc.)
|
||||
if (
|
||||
"duplicate key value violates unique constraint" in str(e).lower()
|
||||
or "uniqueviolationerror" in str(e).lower()
|
||||
):
|
||||
logger.warning(
|
||||
f"Duplicate content_hash detected during final commit. "
|
||||
f"This may occur if the same issue was indexed by multiple connectors. "
|
||||
f"Rolling back and continuing. Error: {e!s}"
|
||||
)
|
||||
await session.rollback()
|
||||
else:
|
||||
raise
|
||||
|
||||
# Build warning message if there were issues
|
||||
warning_parts = []
|
||||
if documents_failed > 0:
|
||||
warning_parts.append(f"{documents_failed} failed")
|
||||
warning_message = ", ".join(warning_parts) if warning_parts else None
|
||||
|
||||
# Log success
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Successfully completed Linear indexing for connector {connector_id}",
|
||||
{
|
||||
"issues_processed": total_processed,
|
||||
"issues_processed": documents_indexed,
|
||||
"documents_indexed": documents_indexed,
|
||||
"documents_skipped": documents_skipped,
|
||||
"documents_failed": documents_failed,
|
||||
"skipped_issues_count": len(skipped_issues),
|
||||
},
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Linear indexing completed: {documents_indexed} new issues, {documents_skipped} skipped"
|
||||
f"Linear indexing completed: {documents_indexed} ready, "
|
||||
f"{documents_skipped} skipped, {documents_failed} failed"
|
||||
)
|
||||
return (
|
||||
total_processed,
|
||||
None,
|
||||
) # Return None as the error message to indicate success
|
||||
return documents_indexed, warning_message
|
||||
|
||||
except SQLAlchemyError as db_error:
|
||||
await session.rollback()
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
"""
|
||||
Luma connector indexer.
|
||||
|
||||
Implements 2-phase document status updates for real-time UI feedback:
|
||||
- Phase 1: Collect all events and create pending documents (visible in UI immediately)
|
||||
- Phase 2: Process each event: pending → processing → ready/failed
|
||||
"""
|
||||
|
||||
import time
|
||||
|
|
@ -11,7 +15,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
|
||||
from app.config import config
|
||||
from app.connectors.luma_connector import LumaConnector
|
||||
from app.db import Document, DocumentType, SearchSourceConnectorType
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
|
|
@ -27,6 +31,7 @@ from .base import (
|
|||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
safe_set_chunks,
|
||||
update_connector_last_indexed,
|
||||
)
|
||||
|
||||
|
|
@ -227,21 +232,22 @@ async def index_luma_events(
|
|||
logger.error(f"Error fetching Luma events: {e!s}", exc_info=True)
|
||||
return 0, f"Error fetching Luma events: {e!s}"
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 1: Analyze all events, create pending documents
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
# =======================================================================
|
||||
documents_indexed = 0
|
||||
documents_skipped = 0
|
||||
documents_failed = 0
|
||||
skipped_events = []
|
||||
|
||||
# Heartbeat tracking - update notification periodically to prevent appearing stuck
|
||||
last_heartbeat_time = time.time()
|
||||
|
||||
events_to_process = [] # List of dicts with document and event data
|
||||
new_documents_created = False
|
||||
|
||||
for event in events:
|
||||
# Check if it's time for a heartbeat update
|
||||
if (
|
||||
on_heartbeat_callback
|
||||
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
|
||||
):
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = time.time()
|
||||
try:
|
||||
# Luma event structure fields - events have nested 'event' field
|
||||
event_data = event.get("event", {})
|
||||
|
|
@ -298,91 +304,38 @@ async def index_luma_events(
|
|||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
# Ensure status is ready (might have been stuck in processing/pending)
|
||||
if not DocumentStatus.is_state(
|
||||
existing_document.status, DocumentStatus.READY
|
||||
):
|
||||
existing_document.status = DocumentStatus.ready()
|
||||
logger.info(
|
||||
f"Document for Luma event {event_name} unchanged. Skipping."
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
else:
|
||||
# Content has changed - update the existing document
|
||||
logger.info(
|
||||
f"Content changed for Luma event {event_name}. Updating document."
|
||||
)
|
||||
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
"event_id": event_id,
|
||||
"event_name": event_name,
|
||||
"event_url": event_url,
|
||||
"start_at": start_at,
|
||||
"end_at": end_at,
|
||||
"timezone": timezone,
|
||||
"location": location or "No location",
|
||||
"city": city,
|
||||
"hosts": host_names,
|
||||
"document_type": "Luma Event",
|
||||
"connector_type": "Luma",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
event_markdown, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
summary_content = f"Luma Event: {event_name}\n\n"
|
||||
if event_url:
|
||||
summary_content += f"URL: {event_url}\n"
|
||||
summary_content += f"Start: {start_at}\n"
|
||||
summary_content += f"End: {end_at}\n"
|
||||
if timezone:
|
||||
summary_content += f"Timezone: {timezone}\n"
|
||||
if location:
|
||||
summary_content += f"Location: {location}\n"
|
||||
if city:
|
||||
summary_content += f"City: {city}\n"
|
||||
if host_names:
|
||||
summary_content += f"Hosts: {host_names}\n"
|
||||
if description:
|
||||
desc_preview = description[:1000]
|
||||
if len(description) > 1000:
|
||||
desc_preview += "..."
|
||||
summary_content += f"Description: {desc_preview}\n"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(event_markdown)
|
||||
|
||||
# Update existing document
|
||||
existing_document.title = f"Luma Event - {event_name}"
|
||||
existing_document.content = summary_content
|
||||
existing_document.content_hash = content_hash
|
||||
existing_document.embedding = summary_embedding
|
||||
existing_document.document_metadata = {
|
||||
# Queue existing document for update (will be set to processing in Phase 2)
|
||||
events_to_process.append(
|
||||
{
|
||||
"document": existing_document,
|
||||
"is_new": False,
|
||||
"event_id": event_id,
|
||||
"event_name": event_name,
|
||||
"event_url": event_url,
|
||||
"event_markdown": event_markdown,
|
||||
"content_hash": content_hash,
|
||||
"start_at": start_at,
|
||||
"end_at": end_at,
|
||||
"timezone": timezone,
|
||||
"location": location,
|
||||
"city": city,
|
||||
"hosts": host_names,
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"host_names": host_names,
|
||||
"description": description,
|
||||
"cover_url": cover_url,
|
||||
}
|
||||
existing_document.chunks = chunks
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
|
||||
documents_indexed += 1
|
||||
logger.info(f"Successfully updated Luma event {event_name}")
|
||||
continue
|
||||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
|
|
@ -400,62 +353,10 @@ async def index_luma_events(
|
|||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
"event_id": event_id,
|
||||
"event_name": event_name,
|
||||
"event_url": event_url,
|
||||
"start_at": start_at,
|
||||
"end_at": end_at,
|
||||
"timezone": timezone,
|
||||
"location": location or "No location",
|
||||
"city": city,
|
||||
"hosts": host_names,
|
||||
"document_type": "Luma Event",
|
||||
"connector_type": "Luma",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
event_markdown, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_content = f"Luma Event: {event_name}\n\n"
|
||||
if event_url:
|
||||
summary_content += f"URL: {event_url}\n"
|
||||
summary_content += f"Start: {start_at}\n"
|
||||
summary_content += f"End: {end_at}\n"
|
||||
if timezone:
|
||||
summary_content += f"Timezone: {timezone}\n"
|
||||
if location:
|
||||
summary_content += f"Location: {location}\n"
|
||||
if city:
|
||||
summary_content += f"City: {city}\n"
|
||||
if host_names:
|
||||
summary_content += f"Hosts: {host_names}\n"
|
||||
if description:
|
||||
desc_preview = description[:1000]
|
||||
if len(description) > 1000:
|
||||
desc_preview += "..."
|
||||
summary_content += f"Description: {desc_preview}\n"
|
||||
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
chunks = await create_document_chunks(event_markdown)
|
||||
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=f"Luma Event - {event_name}",
|
||||
title=event_name,
|
||||
document_type=DocumentType.LUMA_CONNECTOR,
|
||||
document_metadata={
|
||||
"event_id": event_id,
|
||||
|
|
@ -468,23 +369,151 @@ async def index_luma_events(
|
|||
"city": city,
|
||||
"hosts": host_names,
|
||||
"cover_url": cover_url,
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
},
|
||||
content=summary_content,
|
||||
content_hash=content_hash,
|
||||
content="Pending...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation - safe for async
|
||||
status=DocumentStatus.pending(), # Pending until processing starts
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
documents_indexed += 1
|
||||
logger.info(f"Successfully indexed new event {event_name}")
|
||||
new_documents_created = True
|
||||
|
||||
# Batch commit every 10 documents
|
||||
events_to_process.append(
|
||||
{
|
||||
"document": document,
|
||||
"is_new": True,
|
||||
"event_id": event_id,
|
||||
"event_name": event_name,
|
||||
"event_url": event_url,
|
||||
"event_markdown": event_markdown,
|
||||
"content_hash": content_hash,
|
||||
"start_at": start_at,
|
||||
"end_at": end_at,
|
||||
"timezone": timezone,
|
||||
"location": location,
|
||||
"city": city,
|
||||
"host_names": host_names,
|
||||
"description": description,
|
||||
"cover_url": cover_url,
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Phase 1 for event: {e!s}", exc_info=True)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info(
|
||||
f"Phase 1: Committing {len([e for e in events_to_process if e['is_new']])} pending documents"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each document one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(events_to_process)} documents")
|
||||
|
||||
for item in events_to_process:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
document = item["document"]
|
||||
try:
|
||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
# Heavy processing (LLM, embeddings, chunks)
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata_for_summary = {
|
||||
"event_id": item["event_id"],
|
||||
"event_name": item["event_name"],
|
||||
"event_url": item["event_url"],
|
||||
"start_at": item["start_at"],
|
||||
"end_at": item["end_at"],
|
||||
"timezone": item["timezone"],
|
||||
"location": item["location"] or "No location",
|
||||
"city": item["city"],
|
||||
"hosts": item["host_names"],
|
||||
"document_type": "Luma Event",
|
||||
"connector_type": "Luma",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
item["event_markdown"], user_llm, document_metadata_for_summary
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_content = f"Luma Event: {item['event_name']}\n\n"
|
||||
if item["event_url"]:
|
||||
summary_content += f"URL: {item['event_url']}\n"
|
||||
summary_content += f"Start: {item['start_at']}\n"
|
||||
summary_content += f"End: {item['end_at']}\n"
|
||||
if item["timezone"]:
|
||||
summary_content += f"Timezone: {item['timezone']}\n"
|
||||
if item["location"]:
|
||||
summary_content += f"Location: {item['location']}\n"
|
||||
if item["city"]:
|
||||
summary_content += f"City: {item['city']}\n"
|
||||
if item["host_names"]:
|
||||
summary_content += f"Hosts: {item['host_names']}\n"
|
||||
if item["description"]:
|
||||
desc_preview = item["description"][:1000]
|
||||
if len(item["description"]) > 1000:
|
||||
desc_preview += "..."
|
||||
summary_content += f"Description: {desc_preview}\n"
|
||||
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
chunks = await create_document_chunks(item["event_markdown"])
|
||||
|
||||
# Update document to READY with actual content
|
||||
document.title = item["event_name"]
|
||||
document.content = summary_content
|
||||
document.content_hash = item["content_hash"]
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = {
|
||||
"event_id": item["event_id"],
|
||||
"event_name": item["event_name"],
|
||||
"event_url": item["event_url"],
|
||||
"start_at": item["start_at"],
|
||||
"end_at": item["end_at"],
|
||||
"timezone": item["timezone"],
|
||||
"location": item["location"],
|
||||
"city": item["city"],
|
||||
"hosts": item["host_names"],
|
||||
"cover_url": item["cover_url"],
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
documents_indexed += 1
|
||||
|
||||
# Batch commit every 10 documents (for ready status updates)
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed} Luma events processed so far"
|
||||
|
|
@ -493,38 +522,71 @@ async def index_luma_events(
|
|||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error processing event {event.get('name', 'Unknown')}: {e!s}",
|
||||
f"Error processing event {item.get('event_name', 'Unknown')}: {e!s}",
|
||||
exc_info=True,
|
||||
)
|
||||
# Mark document as failed with reason (visible in UI)
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(e))
|
||||
document.updated_at = get_current_timestamp()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
skipped_events.append(
|
||||
f"{event.get('name', 'Unknown')} (processing error)"
|
||||
f"{item.get('event_name', 'Unknown')} (processing error)"
|
||||
)
|
||||
documents_skipped += 1
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
total_processed = documents_indexed
|
||||
if total_processed > 0:
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
|
||||
# This ensures the UI shows "Last indexed" instead of "Never indexed"
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
|
||||
# Final commit for any remaining documents not yet committed in batches
|
||||
logger.info(f"Final commit: Total {documents_indexed} Luma events processed")
|
||||
await session.commit()
|
||||
try:
|
||||
await session.commit()
|
||||
logger.info("Successfully committed all Luma document changes to database")
|
||||
except Exception as e:
|
||||
# Handle any remaining integrity errors gracefully (race conditions, etc.)
|
||||
if (
|
||||
"duplicate key value violates unique constraint" in str(e).lower()
|
||||
or "uniqueviolationerror" in str(e).lower()
|
||||
):
|
||||
logger.warning(
|
||||
f"Duplicate content_hash detected during final commit. "
|
||||
f"This may occur if the same event was indexed by multiple connectors. "
|
||||
f"Rolling back and continuing. Error: {e!s}"
|
||||
)
|
||||
await session.rollback()
|
||||
# Don't fail the entire task - some documents may have been successfully indexed
|
||||
else:
|
||||
raise
|
||||
|
||||
# Build warning message if there were issues
|
||||
warning_parts = []
|
||||
if documents_failed > 0:
|
||||
warning_parts.append(f"{documents_failed} failed")
|
||||
warning_message = ", ".join(warning_parts) if warning_parts else None
|
||||
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Successfully completed Luma indexing for connector {connector_id}",
|
||||
{
|
||||
"events_processed": total_processed,
|
||||
"events_processed": documents_indexed,
|
||||
"documents_indexed": documents_indexed,
|
||||
"documents_skipped": documents_skipped,
|
||||
"documents_failed": documents_failed,
|
||||
"skipped_events_count": len(skipped_events),
|
||||
},
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Luma indexing completed: {documents_indexed} new events, {documents_skipped} skipped"
|
||||
f"Luma indexing completed: {documents_indexed} ready, "
|
||||
f"{documents_skipped} skipped, {documents_failed} failed"
|
||||
)
|
||||
return total_processed, None
|
||||
return documents_indexed, warning_message
|
||||
|
||||
except SQLAlchemyError as db_error:
|
||||
await session.rollback()
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
"""
|
||||
Notion connector indexer.
|
||||
|
||||
Implements real-time document status updates using a two-phase approach:
|
||||
- Phase 1: Create all documents with PENDING status (visible in UI immediately)
|
||||
- Phase 2: Process each document one by one (pending → processing → ready/failed)
|
||||
"""
|
||||
|
||||
import time
|
||||
|
|
@ -9,8 +13,9 @@ from datetime import datetime
|
|||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.connectors.notion_history import NotionHistoryConnector
|
||||
from app.db import Document, DocumentType, SearchSourceConnectorType
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
|
|
@ -28,6 +33,7 @@ from .base import (
|
|||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
safe_set_chunks,
|
||||
update_connector_last_indexed,
|
||||
)
|
||||
|
||||
|
|
@ -245,12 +251,17 @@ async def index_notion_pages(
|
|||
{"pages_found": 0},
|
||||
)
|
||||
logger.info("No Notion pages found to index")
|
||||
# CRITICAL: Update timestamp even when no pages found so Electric SQL syncs
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
await session.commit()
|
||||
await notion_client.close()
|
||||
return 0, None # Success with 0 pages, not an error
|
||||
|
||||
# Track the number of documents indexed
|
||||
documents_indexed = 0
|
||||
documents_skipped = 0
|
||||
documents_failed = 0
|
||||
duplicate_content_count = 0
|
||||
skipped_pages = []
|
||||
|
||||
# Heartbeat tracking - update notification periodically to prevent appearing stuck
|
||||
|
|
@ -262,22 +273,69 @@ async def index_notion_pages(
|
|||
{"stage": "process_pages", "total_pages": len(pages)},
|
||||
)
|
||||
|
||||
# Process each page
|
||||
for page in pages:
|
||||
# Check if it's time for a heartbeat update
|
||||
if (
|
||||
on_heartbeat_callback
|
||||
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
|
||||
):
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = time.time()
|
||||
# =======================================================================
|
||||
# PHASE 1: Analyze all pages, create pending documents
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
# =======================================================================
|
||||
pages_to_process = [] # List of dicts with document and page data
|
||||
new_documents_created = False
|
||||
|
||||
# Helper function to convert page content to markdown
|
||||
def process_blocks(blocks, level=0):
    """Render a tree of Notion block dicts as Markdown text.

    Each block is a mapping read through ``type``, ``content`` and
    ``children`` keys. A block's children are rendered recursively, one
    nesting level deeper, immediately after the block itself.

    Args:
        blocks: Iterable of block dicts. ``None`` or an empty iterable
            yields an empty string.
        level: Current nesting depth; each level adds one space of indent.

    Returns:
        The Markdown rendering of ``blocks`` as a single string.
    """
    # (prefix, suffix) wrapped around the block text for each simple type.
    formats = {
        "paragraph": ("", "\n\n"),
        "text": ("", "\n\n"),
        "heading_1": ("# ", "\n\n"),
        "header": ("# ", "\n\n"),
        "heading_2": ("## ", "\n\n"),
        "heading_3": ("### ", "\n\n"),
        "bulleted_list_item": ("* ", "\n"),
        "numbered_list_item": ("1. ", "\n"),
        "to_do": ("- [ ] ", "\n"),
        "toggle": ("> ", "\n"),
        "quote": ("> ", "\n\n"),
        "callout": ("> **Note:** ", "\n\n"),
    }

    # Indentation depends only on the nesting level, so compute it once.
    indent = " " * level

    # Collect fragments and join once instead of growing a string with +=.
    parts = []
    for block in blocks or ():
        block_type = block.get("type")
        block_content = block.get("content", "")
        if block_content is None:
            # An explicit None must not be rendered as the text "None".
            block_content = ""
        children = block.get("children") or []

        if block_type in formats:
            prefix, suffix = formats[block_type]
            parts.append(f"{indent}{prefix}{block_content}{suffix}")
        elif block_type == "code":
            parts.append(f"{indent}```\n{block_content}\n```\n\n")
        elif block_type == "image":
            # NOTE(review): only the indent and a blank line are emitted for
            # images; no image reference is written - confirm this is intended.
            parts.append(f"{indent}\n\n")
        elif block_content:
            # Default for other block types: emit the raw text when present.
            parts.append(f"{indent}{block_content}\n\n")

        # Process children recursively
        if children:
            parts.append(process_blocks(children, level + 1))

    return "".join(parts)
|
||||
|
||||
for page in pages:
|
||||
try:
|
||||
page_id = page.get("page_id")
|
||||
page_title = page.get("title", f"Untitled page ({page_id})")
|
||||
page_content = page.get("content", [])
|
||||
|
||||
logger.info(f"Processing Notion page: {page_title} ({page_id})")
|
||||
if not page_id:
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
if not page_content:
|
||||
logger.info(f"No content found in page {page_title}. Skipping.")
|
||||
|
|
@ -287,57 +345,6 @@ async def index_notion_pages(
|
|||
|
||||
# Convert page content to markdown format
|
||||
markdown_content = f"# Notion Page: {page_title}\n\n"
|
||||
|
||||
# Process blocks recursively
|
||||
def process_blocks(blocks, level=0):
    """Render a tree of Notion block dicts as Markdown text.

    Each block is a mapping read through ``type``, ``content`` and
    ``children`` keys. A block's children are rendered recursively, one
    nesting level deeper, immediately after the block itself.

    Args:
        blocks: Iterable of block dicts. ``None`` or an empty iterable
            yields an empty string.
        level: Current nesting depth; each level adds one space of indent.

    Returns:
        The Markdown rendering of ``blocks`` as a single string.
    """
    # (prefix, suffix) wrapped around the block text for each simple type.
    formats = {
        "paragraph": ("", "\n\n"),
        "text": ("", "\n\n"),
        "heading_1": ("# ", "\n\n"),
        "header": ("# ", "\n\n"),
        "heading_2": ("## ", "\n\n"),
        "heading_3": ("### ", "\n\n"),
        "bulleted_list_item": ("* ", "\n"),
        "numbered_list_item": ("1. ", "\n"),
        "to_do": ("- [ ] ", "\n"),
        "toggle": ("> ", "\n"),
        "quote": ("> ", "\n\n"),
        "callout": ("> **Note:** ", "\n\n"),
    }

    # Indentation depends only on the nesting level, so compute it once.
    indent = " " * level

    # Collect fragments and join once instead of growing a string with +=.
    parts = []
    for block in blocks or ():
        block_type = block.get("type")
        block_content = block.get("content", "")
        if block_content is None:
            # An explicit None must not be rendered as the text "None".
            block_content = ""
        children = block.get("children") or []

        if block_type in formats:
            prefix, suffix = formats[block_type]
            parts.append(f"{indent}{prefix}{block_content}{suffix}")
        elif block_type == "code":
            parts.append(f"{indent}```\n{block_content}\n```\n\n")
        elif block_type == "image":
            # NOTE(review): only the indent and a blank line are emitted for
            # images; no image reference is written - confirm this is intended.
            parts.append(f"{indent}\n\n")
        elif block_content:
            # Default for other block types: emit the raw text when present.
            parts.append(f"{indent}{block_content}\n\n")

        # Process children recursively
        if children:
            parts.append(process_blocks(children, level + 1))

    return "".join(parts)
|
||||
|
||||
logger.debug(
|
||||
f"Converting {len(page_content)} blocks to markdown for page {page_title}"
|
||||
)
|
||||
markdown_content += process_blocks(page_content)
|
||||
|
||||
# Format document metadata
|
||||
|
|
@ -377,71 +384,26 @@ async def index_notion_pages(
|
|||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
logger.info(
|
||||
f"Document for Notion page {page_title} unchanged. Skipping."
|
||||
)
|
||||
# Ensure status is ready (might have been stuck in processing/pending)
|
||||
if not DocumentStatus.is_state(
|
||||
existing_document.status, DocumentStatus.READY
|
||||
):
|
||||
existing_document.status = DocumentStatus.ready()
|
||||
documents_skipped += 1
|
||||
continue
|
||||
else:
|
||||
# Content has changed - update the existing document
|
||||
logger.info(
|
||||
f"Content changed for Notion page {page_title}. Updating document."
|
||||
)
|
||||
|
||||
# Get user's long context LLM
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
if not user_llm:
|
||||
logger.error(
|
||||
f"No long context LLM configured for user {user_id}"
|
||||
)
|
||||
skipped_pages.append(f"{page_title} (no LLM configured)")
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Generate summary with metadata
|
||||
document_metadata = {
|
||||
"page_title": page_title,
|
||||
# Queue existing document for update (will be set to processing in Phase 2)
|
||||
pages_to_process.append(
|
||||
{
|
||||
"document": existing_document,
|
||||
"is_new": False,
|
||||
"markdown_content": markdown_content,
|
||||
"content_hash": content_hash,
|
||||
"page_id": page_id,
|
||||
"document_type": "Notion Page",
|
||||
"connector_type": "Notion",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
markdown_content, user_llm, document_metadata
|
||||
)
|
||||
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(markdown_content)
|
||||
|
||||
# Update existing document
|
||||
existing_document.title = f"Notion - {page_title}"
|
||||
existing_document.content = summary_content
|
||||
existing_document.content_hash = content_hash
|
||||
existing_document.embedding = summary_embedding
|
||||
existing_document.document_metadata = {
|
||||
"page_title": page_title,
|
||||
"page_id": page_id,
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
}
|
||||
existing_document.chunks = chunks
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
existing_document.connector_id = connector_id
|
||||
|
||||
documents_indexed += 1
|
||||
logger.info(f"Successfully updated Notion page: {page_title}")
|
||||
|
||||
# Batch commit every 10 documents
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed} documents processed so far"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
continue
|
||||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
|
|
@ -456,91 +418,182 @@ async def index_notion_pages(
|
|||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
duplicate_content_count += 1
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Get user's long context LLM
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
if not user_llm:
|
||||
logger.error(f"No long context LLM configured for user {user_id}")
|
||||
skipped_pages.append(f"{page_title} (no LLM configured)")
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Generate summary with metadata
|
||||
logger.debug(f"Generating summary for page {page_title}")
|
||||
document_metadata = {
|
||||
"page_title": page_title,
|
||||
"page_id": page_id,
|
||||
"document_type": "Notion Page",
|
||||
"connector_type": "Notion",
|
||||
}
|
||||
summary_content, summary_embedding = await generate_document_summary(
|
||||
markdown_content, user_llm, document_metadata
|
||||
)
|
||||
|
||||
# Process chunks
|
||||
logger.debug(f"Chunking content for page {page_title}")
|
||||
chunks = await create_document_chunks(markdown_content)
|
||||
|
||||
# Create and store new document
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=f"Notion - {page_title}",
|
||||
title=page_title,
|
||||
document_type=DocumentType.NOTION_CONNECTOR,
|
||||
document_metadata={
|
||||
"page_title": page_title,
|
||||
"page_id": page_id,
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
},
|
||||
content=summary_content,
|
||||
content_hash=content_hash,
|
||||
content="Pending...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation - safe for async
|
||||
status=DocumentStatus.pending(), # Pending until processing starts
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
documents_indexed += 1
|
||||
logger.info(f"Successfully indexed new Notion page: {page_title}")
|
||||
new_documents_created = True
|
||||
|
||||
# Batch commit every 10 documents
|
||||
pages_to_process.append(
|
||||
{
|
||||
"document": document,
|
||||
"is_new": True,
|
||||
"markdown_content": markdown_content,
|
||||
"content_hash": content_hash,
|
||||
"page_id": page_id,
|
||||
"page_title": page_title,
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Phase 1 for page: {e!s}", exc_info=True)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info(
|
||||
f"Phase 1: Committing {len([p for p in pages_to_process if p['is_new']])} pending documents"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each document one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(pages_to_process)} documents")
|
||||
|
||||
for item in pages_to_process:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
document = item["document"]
|
||||
try:
|
||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
# Heavy processing (LLM, embeddings, chunks)
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata_for_summary = {
|
||||
"page_title": item["page_title"],
|
||||
"page_id": item["page_id"],
|
||||
"document_type": "Notion Page",
|
||||
"connector_type": "Notion",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
item["markdown_content"],
|
||||
user_llm,
|
||||
document_metadata_for_summary,
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_content = f"Notion Page: {item['page_title']}\n\n{item['markdown_content'][:500]}..."
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
chunks = await create_document_chunks(item["markdown_content"])
|
||||
|
||||
# Update document to READY with actual content
|
||||
document.title = item["page_title"]
|
||||
document.content = summary_content
|
||||
document.content_hash = item["content_hash"]
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = {
|
||||
"page_title": item["page_title"],
|
||||
"page_id": item["page_id"],
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
documents_indexed += 1
|
||||
|
||||
# Batch commit every 10 documents (for ready status updates)
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed} documents processed so far"
|
||||
f"Committing batch: {documents_indexed} Notion pages processed so far"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error processing Notion page {page.get('title', 'Unknown')}: {e!s}",
|
||||
exc_info=True,
|
||||
)
|
||||
skipped_pages.append(
|
||||
f"{page.get('title', 'Unknown')} (processing error)"
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue # Skip this page and continue with others
|
||||
logger.error(f"Error processing Notion page: {e!s}", exc_info=True)
|
||||
# Mark document as failed with reason (visible in UI)
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(e))
|
||||
document.updated_at = get_current_timestamp()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
skipped_pages.append(f"{item['page_title']} (processing error)")
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
|
||||
# Update the last_indexed_at timestamp for the connector only if requested
|
||||
# and if we successfully indexed at least one page
|
||||
total_processed = documents_indexed
|
||||
if total_processed > 0:
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
|
||||
# Final commit for any remaining documents not yet committed in batches
|
||||
# Final commit to ensure all documents are persisted (safety net)
|
||||
logger.info(f"Final commit: Total {documents_indexed} documents processed")
|
||||
await session.commit()
|
||||
try:
|
||||
await session.commit()
|
||||
logger.info(
|
||||
"Successfully committed all Notion document changes to database"
|
||||
)
|
||||
except Exception as e:
|
||||
# Handle any remaining integrity errors gracefully (race conditions, etc.)
|
||||
if (
|
||||
"duplicate key value violates unique constraint" in str(e).lower()
|
||||
or "uniqueviolationerror" in str(e).lower()
|
||||
):
|
||||
logger.warning(
|
||||
f"Duplicate content_hash detected during final commit. "
|
||||
f"This may occur if the same page was indexed by multiple connectors. "
|
||||
f"Rolling back and continuing. Error: {e!s}"
|
||||
)
|
||||
await session.rollback()
|
||||
# Don't fail the entire task - some documents may have been successfully indexed
|
||||
else:
|
||||
raise
|
||||
|
||||
# Get final count of pages with skipped Notion AI content
|
||||
pages_with_skipped_ai_content = notion_client.get_skipped_content_count()
|
||||
|
||||
# Build warning message if there were issues
|
||||
warning_parts = []
|
||||
if duplicate_content_count > 0:
|
||||
warning_parts.append(f"{duplicate_content_count} duplicate")
|
||||
if documents_failed > 0:
|
||||
warning_parts.append(f"{documents_failed} failed")
|
||||
warning_message = ", ".join(warning_parts) if warning_parts else None
|
||||
|
||||
# Prepare result message with user-friendly notification about skipped content
|
||||
result_message = None
|
||||
if skipped_pages:
|
||||
|
|
@ -563,6 +616,8 @@ async def index_notion_pages(
|
|||
"pages_processed": total_processed,
|
||||
"documents_indexed": documents_indexed,
|
||||
"documents_skipped": documents_skipped,
|
||||
"documents_failed": documents_failed,
|
||||
"duplicate_content_count": duplicate_content_count,
|
||||
"skipped_pages_count": len(skipped_pages),
|
||||
"pages_with_skipped_ai_content": pages_with_skipped_ai_content,
|
||||
"result_message": result_message,
|
||||
|
|
@ -570,7 +625,9 @@ async def index_notion_pages(
|
|||
)
|
||||
|
||||
logger.info(
|
||||
f"Notion indexing completed: {documents_indexed} new pages, {documents_skipped} skipped"
|
||||
f"Notion indexing completed: {documents_indexed} ready, "
|
||||
f"{documents_skipped} skipped, {documents_failed} failed "
|
||||
f"({duplicate_content_count} duplicate content)"
|
||||
)
|
||||
|
||||
# Clean up the async client
|
||||
|
|
@ -590,6 +647,10 @@ async def index_notion_pages(
|
|||
"Using legacy token. Reconnect with OAuth for better reliability."
|
||||
)
|
||||
|
||||
# Include warning message if there were issues
|
||||
if warning_message:
|
||||
notification_parts.append(warning_message)
|
||||
|
||||
user_notification_message = (
|
||||
" ".join(notification_parts) if notification_parts else None
|
||||
)
|
||||
|
|
|
|||
|
|
@ -3,6 +3,10 @@ Obsidian connector indexer.
|
|||
|
||||
Indexes markdown notes from a local Obsidian vault.
|
||||
This connector is only available in self-hosted mode.
|
||||
|
||||
Implements 2-phase document status updates for real-time UI feedback:
|
||||
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
|
||||
- Phase 2: Process each document: pending → processing → ready/failed
|
||||
"""
|
||||
|
||||
import os
|
||||
|
|
@ -17,7 +21,7 @@ from sqlalchemy.exc import SQLAlchemyError
|
|||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.db import Document, DocumentType, SearchSourceConnectorType
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
|
|
@ -34,6 +38,7 @@ from .base import (
|
|||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
safe_set_chunks,
|
||||
update_connector_last_indexed,
|
||||
)
|
||||
|
||||
|
|
@ -307,25 +312,22 @@ async def index_obsidian_vault(
|
|||
|
||||
logger.info(f"Processing {len(files)} files after date filtering")
|
||||
|
||||
# Get LLM for summarization
|
||||
long_context_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
indexed_count = 0
|
||||
skipped_count = 0
|
||||
failed_count = 0
|
||||
duplicate_content_count = 0
|
||||
|
||||
# Heartbeat tracking - update notification periodically to prevent appearing stuck
|
||||
last_heartbeat_time = time.time()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 1: Analyze all files, create pending documents
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
# =======================================================================
|
||||
files_to_process = [] # List of dicts with document and file data
|
||||
new_documents_created = False
|
||||
|
||||
for file_info in files:
|
||||
# Check if it's time for a heartbeat update
|
||||
if (
|
||||
on_heartbeat_callback
|
||||
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
|
||||
):
|
||||
await on_heartbeat_callback(indexed_count)
|
||||
last_heartbeat_time = time.time()
|
||||
try:
|
||||
file_path = file_info["path"]
|
||||
relative_path = file_info["relative_path"]
|
||||
|
|
@ -368,13 +370,151 @@ async def index_obsidian_vault(
|
|||
search_space_id,
|
||||
)
|
||||
|
||||
# Generate content hash
|
||||
content_hash = generate_content_hash(content, search_space_id)
|
||||
|
||||
# Check for existing document
|
||||
existing_document = await check_document_by_unique_identifier(
|
||||
session, unique_identifier_hash
|
||||
)
|
||||
|
||||
# Generate content hash
|
||||
content_hash = generate_content_hash(content, search_space_id)
|
||||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
# Ensure status is ready (might have been stuck in processing/pending)
|
||||
if not DocumentStatus.is_state(
|
||||
existing_document.status, DocumentStatus.READY
|
||||
):
|
||||
existing_document.status = DocumentStatus.ready()
|
||||
logger.debug(f"Note {title} unchanged, skipping")
|
||||
skipped_count += 1
|
||||
continue
|
||||
|
||||
# Queue existing document for update (will be set to processing in Phase 2)
|
||||
files_to_process.append(
|
||||
{
|
||||
"document": existing_document,
|
||||
"is_new": False,
|
||||
"file_info": file_info,
|
||||
"content": content,
|
||||
"body_content": body_content,
|
||||
"frontmatter": frontmatter,
|
||||
"wiki_links": wiki_links,
|
||||
"tags": tags,
|
||||
"title": title,
|
||||
"relative_path": relative_path,
|
||||
"content_hash": content_hash,
|
||||
"unique_identifier_hash": unique_identifier_hash,
|
||||
}
|
||||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
f"Obsidian note {title} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
duplicate_content_count += 1
|
||||
skipped_count += 1
|
||||
continue
|
||||
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=title,
|
||||
document_type=DocumentType.OBSIDIAN_CONNECTOR,
|
||||
document_metadata={
|
||||
"vault_name": vault_name,
|
||||
"file_path": relative_path,
|
||||
"connector_id": connector_id,
|
||||
},
|
||||
content="Pending...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation - safe for async
|
||||
status=DocumentStatus.pending(), # Pending until processing starts
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
session.add(document)
|
||||
new_documents_created = True
|
||||
|
||||
files_to_process.append(
|
||||
{
|
||||
"document": document,
|
||||
"is_new": True,
|
||||
"file_info": file_info,
|
||||
"content": content,
|
||||
"body_content": body_content,
|
||||
"frontmatter": frontmatter,
|
||||
"wiki_links": wiki_links,
|
||||
"tags": tags,
|
||||
"title": title,
|
||||
"relative_path": relative_path,
|
||||
"content_hash": content_hash,
|
||||
"unique_identifier_hash": unique_identifier_hash,
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(
|
||||
f"Error in Phase 1 for file {file_info.get('path', 'unknown')}: {e}"
|
||||
)
|
||||
failed_count += 1
|
||||
continue
|
||||
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info(
|
||||
f"Phase 1: Committing {len([f for f in files_to_process if f['is_new']])} pending documents"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each document one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(files_to_process)} documents")
|
||||
|
||||
# Get LLM for summarization
|
||||
long_context_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
for item in files_to_process:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(indexed_count)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
document = item["document"]
|
||||
try:
|
||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
# Extract data from item
|
||||
title = item["title"]
|
||||
relative_path = item["relative_path"]
|
||||
content = item["content"]
|
||||
body_content = item["body_content"]
|
||||
frontmatter = item["frontmatter"]
|
||||
wiki_links = item["wiki_links"]
|
||||
tags = item["tags"]
|
||||
content_hash = item["content_hash"]
|
||||
file_info = item["file_info"]
|
||||
|
||||
# Build metadata
|
||||
document_metadata = {
|
||||
|
|
@ -404,134 +544,114 @@ async def index_obsidian_vault(
|
|||
]
|
||||
document_string = build_document_metadata_string(metadata_sections)
|
||||
|
||||
if existing_document:
|
||||
# Check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
logger.debug(f"Note {title} unchanged, skipping")
|
||||
skipped_count += 1
|
||||
continue
|
||||
|
||||
# Update existing document
|
||||
logger.info(f"Updating note: {title}")
|
||||
|
||||
# Generate new summary if content changed
|
||||
if long_context_llm:
|
||||
new_summary, _ = await generate_document_summary(
|
||||
document_string,
|
||||
long_context_llm,
|
||||
document_metadata,
|
||||
)
|
||||
# Store summary in metadata
|
||||
document_metadata["summary"] = new_summary
|
||||
|
||||
# Add URL and connector_id to metadata
|
||||
document_metadata["url"] = (
|
||||
f"obsidian://{vault_name}/{relative_path}"
|
||||
)
|
||||
document_metadata["connector_id"] = connector_id
|
||||
|
||||
existing_document.content = document_string
|
||||
existing_document.content_hash = content_hash
|
||||
existing_document.document_metadata = document_metadata
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
|
||||
# Update embedding
|
||||
embedding = config.embedding_model_instance.embed(document_string)
|
||||
existing_document.embedding = embedding
|
||||
|
||||
# Update chunks - delete old and create new
|
||||
existing_document.chunks.clear()
|
||||
new_chunks = await create_document_chunks(document_string)
|
||||
existing_document.chunks = new_chunks
|
||||
|
||||
indexed_count += 1
|
||||
|
||||
else:
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
f"Obsidian note {title} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
skipped_count += 1
|
||||
continue
|
||||
|
||||
# Create new document
|
||||
logger.info(f"Indexing new note: {title}")
|
||||
|
||||
# Generate summary
|
||||
summary_content = ""
|
||||
if long_context_llm:
|
||||
summary_content, _ = await generate_document_summary(
|
||||
document_string,
|
||||
long_context_llm,
|
||||
document_metadata,
|
||||
)
|
||||
|
||||
# Generate embedding
|
||||
embedding = config.embedding_model_instance.embed(document_string)
|
||||
|
||||
# Add URL and summary to metadata
|
||||
document_metadata["url"] = (
|
||||
f"obsidian://{vault_name}/{relative_path}"
|
||||
)
|
||||
document_metadata["summary"] = summary_content
|
||||
document_metadata["connector_id"] = connector_id
|
||||
|
||||
# Create chunks
|
||||
chunks = await create_document_chunks(document_string)
|
||||
|
||||
# Create document
|
||||
new_document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=title,
|
||||
document_type=DocumentType.OBSIDIAN_CONNECTOR,
|
||||
content=document_string,
|
||||
content_hash=content_hash,
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
document_metadata=document_metadata,
|
||||
embedding=embedding,
|
||||
chunks=chunks,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
# Generate summary
|
||||
summary_content = ""
|
||||
if long_context_llm:
|
||||
summary_content, _ = await generate_document_summary(
|
||||
document_string,
|
||||
long_context_llm,
|
||||
document_metadata,
|
||||
)
|
||||
|
||||
session.add(new_document)
|
||||
# Generate embedding
|
||||
embedding = config.embedding_model_instance.embed(document_string)
|
||||
|
||||
indexed_count += 1
|
||||
# Add URL and summary to metadata
|
||||
document_metadata["url"] = f"obsidian://{vault_name}/{relative_path}"
|
||||
document_metadata["summary"] = summary_content
|
||||
document_metadata["connector_id"] = connector_id
|
||||
|
||||
# Create chunks
|
||||
chunks = await create_document_chunks(document_string)
|
||||
|
||||
# Update document to READY with actual content
|
||||
document.title = title
|
||||
document.content = document_string
|
||||
document.content_hash = content_hash
|
||||
document.embedding = embedding
|
||||
document.document_metadata = document_metadata
|
||||
safe_set_chunks(document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
indexed_count += 1
|
||||
|
||||
# Batch commit every 10 documents (for ready status updates)
|
||||
if indexed_count % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {indexed_count} Obsidian notes processed so far"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(
|
||||
f"Error processing file {file_info.get('path', 'unknown')}: {e}"
|
||||
f"Error processing file {item.get('file_info', {}).get('path', 'unknown')}: {e}"
|
||||
)
|
||||
skipped_count += 1
|
||||
# Mark document as failed with reason (visible in UI)
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(e))
|
||||
document.updated_at = get_current_timestamp()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
failed_count += 1
|
||||
continue
|
||||
|
||||
# Update connector's last indexed timestamp
|
||||
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
|
||||
# Commit all changes
|
||||
await session.commit()
|
||||
# Final commit for any remaining documents not yet committed in batches
|
||||
logger.info(f"Final commit: Total {indexed_count} Obsidian notes processed")
|
||||
try:
|
||||
await session.commit()
|
||||
logger.info(
|
||||
"Successfully committed all Obsidian document changes to database"
|
||||
)
|
||||
except Exception as e:
|
||||
# Handle any remaining integrity errors gracefully (race conditions, etc.)
|
||||
if (
|
||||
"duplicate key value violates unique constraint" in str(e).lower()
|
||||
or "uniqueviolationerror" in str(e).lower()
|
||||
):
|
||||
logger.warning(
|
||||
f"Duplicate content_hash detected during final commit. "
|
||||
f"This may occur if the same note was indexed by multiple connectors. "
|
||||
f"Rolling back and continuing. Error: {e!s}"
|
||||
)
|
||||
await session.rollback()
|
||||
# Don't fail the entire task - some documents may have been successfully indexed
|
||||
else:
|
||||
raise
|
||||
|
||||
# Build warning message if there were issues
|
||||
warning_parts = []
|
||||
if duplicate_content_count > 0:
|
||||
warning_parts.append(f"{duplicate_content_count} duplicate")
|
||||
if failed_count > 0:
|
||||
warning_parts.append(f"{failed_count} failed")
|
||||
warning_message = ", ".join(warning_parts) if warning_parts else None
|
||||
|
||||
total_processed = indexed_count
|
||||
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Successfully indexed {indexed_count} Obsidian notes (skipped {skipped_count})",
|
||||
f"Successfully completed Obsidian vault indexing for connector {connector_id}",
|
||||
{
|
||||
"indexed_count": indexed_count,
|
||||
"skipped_count": skipped_count,
|
||||
"total_files": len(files),
|
||||
"notes_processed": total_processed,
|
||||
"documents_indexed": indexed_count,
|
||||
"documents_skipped": skipped_count,
|
||||
"documents_failed": failed_count,
|
||||
"duplicate_content_count": duplicate_content_count,
|
||||
},
|
||||
)
|
||||
|
||||
return indexed_count, None
|
||||
logger.info(
|
||||
f"Obsidian vault indexing completed: {indexed_count} ready, "
|
||||
f"{skipped_count} skipped, {failed_count} failed "
|
||||
f"({duplicate_content_count} duplicate content)"
|
||||
)
|
||||
return total_processed, warning_message
|
||||
|
||||
except SQLAlchemyError as e:
|
||||
logger.exception(f"Database error during Obsidian indexing: {e}")
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
"""
|
||||
Slack connector indexer.
|
||||
|
||||
Implements 2-phase document status updates for real-time UI feedback:
|
||||
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
|
||||
- Phase 2: Process each document: pending → processing → ready/failed
|
||||
"""
|
||||
|
||||
import time
|
||||
|
|
@ -12,7 +16,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
|
||||
from app.config import config
|
||||
from app.connectors.slack_history import SlackHistory
|
||||
from app.db import Document, DocumentType, SearchSourceConnectorType
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
|
|
@ -28,6 +32,7 @@ from .base import (
|
|||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
safe_set_chunks,
|
||||
update_connector_last_indexed,
|
||||
)
|
||||
|
||||
|
|
@ -168,11 +173,15 @@ async def index_slack_messages(
|
|||
f"No Slack channels found for connector {connector_id}",
|
||||
{"channels_found": 0},
|
||||
)
|
||||
return 0, "No Slack channels found"
|
||||
# CRITICAL: Update timestamp even when no channels found so Electric SQL syncs
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
await session.commit()
|
||||
return 0, None # Return None (not error) when no channels found
|
||||
|
||||
# Track the number of documents indexed
|
||||
documents_indexed = 0
|
||||
documents_skipped = 0
|
||||
documents_failed = 0 # Track messages that failed processing
|
||||
skipped_channels = []
|
||||
|
||||
# Heartbeat tracking - update notification periodically to prevent appearing stuck
|
||||
|
|
@ -184,15 +193,14 @@ async def index_slack_messages(
|
|||
{"stage": "process_channels", "total_channels": len(channels)},
|
||||
)
|
||||
|
||||
# Process each channel
|
||||
# =======================================================================
|
||||
# PHASE 1: Collect all messages from all channels, create pending documents
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
# =======================================================================
|
||||
messages_to_process = [] # List of dicts with document and message data
|
||||
new_documents_created = False
|
||||
|
||||
for channel_obj in channels:
|
||||
# Check if it's time for a heartbeat update
|
||||
if (
|
||||
on_heartbeat_callback
|
||||
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
|
||||
):
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = time.time()
|
||||
channel_id = channel_obj["id"]
|
||||
channel_name = channel_obj["name"]
|
||||
is_private = channel_obj["is_private"]
|
||||
|
|
@ -305,47 +313,33 @@ async def index_slack_messages(
|
|||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
# Ensure status is ready (might have been stuck in processing/pending)
|
||||
if not DocumentStatus.is_state(
|
||||
existing_document.status, DocumentStatus.READY
|
||||
):
|
||||
existing_document.status = DocumentStatus.ready()
|
||||
logger.info(
|
||||
f"Document for Slack message {msg_ts} in channel {channel_name} unchanged. Skipping."
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
else:
|
||||
# Content has changed - update the existing document
|
||||
logger.info(
|
||||
f"Content changed for Slack message {msg_ts} in channel {channel_name}. Updating document."
|
||||
)
|
||||
|
||||
# Update chunks and embedding
|
||||
chunks = await create_document_chunks(
|
||||
combined_document_string
|
||||
)
|
||||
doc_embedding = config.embedding_model_instance.embed(
|
||||
combined_document_string
|
||||
)
|
||||
|
||||
# Update existing document
|
||||
existing_document.content = combined_document_string
|
||||
existing_document.content_hash = content_hash
|
||||
existing_document.embedding = doc_embedding
|
||||
existing_document.document_metadata = {
|
||||
# Queue existing document for update (will be set to processing in Phase 2)
|
||||
messages_to_process.append(
|
||||
{
|
||||
"document": existing_document,
|
||||
"is_new": False,
|
||||
"combined_document_string": combined_document_string,
|
||||
"content_hash": content_hash,
|
||||
"channel_name": channel_name,
|
||||
"channel_id": channel_id,
|
||||
"msg_ts": msg_ts,
|
||||
"start_date": start_date_str,
|
||||
"end_date": end_date_str,
|
||||
"message_count": len(formatted_messages),
|
||||
"indexed_at": datetime.now().strftime(
|
||||
"%Y-%m-%d %H:%M:%S"
|
||||
),
|
||||
}
|
||||
|
||||
# Delete old chunks and add new ones
|
||||
existing_document.chunks = chunks
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
|
||||
documents_indexed += 1
|
||||
logger.info(f"Successfully updated Slack message {msg_ts}")
|
||||
continue
|
||||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
|
|
@ -363,48 +357,47 @@ async def index_slack_messages(
|
|||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(combined_document_string)
|
||||
doc_embedding = config.embedding_model_instance.embed(
|
||||
combined_document_string
|
||||
)
|
||||
|
||||
# Create and store new document
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=f"Slack - {channel_name}",
|
||||
title=channel_name,
|
||||
document_type=DocumentType.SLACK_CONNECTOR,
|
||||
document_metadata={
|
||||
"channel_name": channel_name,
|
||||
"channel_id": channel_id,
|
||||
"start_date": start_date_str,
|
||||
"end_date": end_date_str,
|
||||
"message_count": len(formatted_messages),
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"msg_ts": msg_ts,
|
||||
"connector_id": connector_id,
|
||||
},
|
||||
content=combined_document_string,
|
||||
embedding=doc_embedding,
|
||||
chunks=chunks,
|
||||
content_hash=content_hash,
|
||||
content="Pending...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation - safe for async
|
||||
status=DocumentStatus.pending(), # Pending until processing starts
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
documents_indexed += 1
|
||||
new_documents_created = True
|
||||
|
||||
# Batch commit every 10 documents
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed} Slack channels processed so far"
|
||||
)
|
||||
await session.commit()
|
||||
messages_to_process.append(
|
||||
{
|
||||
"document": document,
|
||||
"is_new": True,
|
||||
"combined_document_string": combined_document_string,
|
||||
"content_hash": content_hash,
|
||||
"channel_name": channel_name,
|
||||
"channel_id": channel_id,
|
||||
"msg_ts": msg_ts,
|
||||
"start_date": start_date_str,
|
||||
"end_date": end_date_str,
|
||||
"message_count": len(formatted_messages),
|
||||
}
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Successfully indexed new channel {channel_name} with {len(formatted_messages)} messages"
|
||||
f"Phase 1: Collected {len(formatted_messages)} messages from channel {channel_name}"
|
||||
)
|
||||
|
||||
except SlackApiError as slack_error:
|
||||
|
|
@ -420,43 +413,129 @@ async def index_slack_messages(
|
|||
documents_skipped += 1
|
||||
continue # Skip this channel and continue with others
|
||||
|
||||
# Update the last_indexed_at timestamp for the connector only if requested
|
||||
# and if we successfully indexed at least one channel
|
||||
total_processed = documents_indexed
|
||||
if total_processed > 0:
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info(
|
||||
f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each document one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(messages_to_process)} documents")
|
||||
|
||||
for item in messages_to_process:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
document = item["document"]
|
||||
try:
|
||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
# Heavy processing (embeddings, chunks)
|
||||
chunks = await create_document_chunks(item["combined_document_string"])
|
||||
doc_embedding = config.embedding_model_instance.embed(
|
||||
item["combined_document_string"]
|
||||
)
|
||||
|
||||
# Update document to READY with actual content
|
||||
document.title = item["channel_name"]
|
||||
document.content = item["combined_document_string"]
|
||||
document.content_hash = item["content_hash"]
|
||||
document.embedding = doc_embedding
|
||||
document.document_metadata = {
|
||||
"channel_name": item["channel_name"],
|
||||
"channel_id": item["channel_id"],
|
||||
"start_date": item["start_date"],
|
||||
"end_date": item["end_date"],
|
||||
"message_count": item["message_count"],
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
documents_indexed += 1
|
||||
|
||||
# Batch commit every 10 documents (for ready status updates)
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed} Slack messages processed so far"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error processing Slack message {item.get('msg_ts', 'Unknown')}: {e!s}",
|
||||
exc_info=True,
|
||||
)
|
||||
# Mark document as failed with reason (visible in UI)
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(e))
|
||||
document.updated_at = get_current_timestamp()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
|
||||
# Final commit for any remaining documents not yet committed in batches
|
||||
logger.info(f"Final commit: Total {documents_indexed} Slack channels processed")
|
||||
await session.commit()
|
||||
logger.info(f"Final commit: Total {documents_indexed} Slack messages processed")
|
||||
try:
|
||||
await session.commit()
|
||||
logger.info("Successfully committed all Slack document changes to database")
|
||||
except Exception as e:
|
||||
# Handle any remaining integrity errors gracefully (race conditions, etc.)
|
||||
if (
|
||||
"duplicate key value violates unique constraint" in str(e).lower()
|
||||
or "uniqueviolationerror" in str(e).lower()
|
||||
):
|
||||
logger.warning(
|
||||
f"Duplicate content_hash detected during final commit. "
|
||||
f"This may occur if the same message was indexed by multiple connectors. "
|
||||
f"Rolling back and continuing. Error: {e!s}"
|
||||
)
|
||||
await session.rollback()
|
||||
else:
|
||||
raise
|
||||
|
||||
# Prepare result message
|
||||
result_message = None
|
||||
if skipped_channels:
|
||||
result_message = f"Processed {total_processed} channels. Skipped {len(skipped_channels)} channels: {', '.join(skipped_channels)}"
|
||||
else:
|
||||
result_message = f"Processed {total_processed} channels."
|
||||
# Build warning message if there were issues
|
||||
warning_parts = []
|
||||
if documents_failed > 0:
|
||||
warning_parts.append(f"{documents_failed} failed")
|
||||
warning_message = ", ".join(warning_parts) if warning_parts else None
|
||||
|
||||
# Log success
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Successfully completed Slack indexing for connector {connector_id}",
|
||||
{
|
||||
"channels_processed": total_processed,
|
||||
"channels_processed": len(channels),
|
||||
"documents_indexed": documents_indexed,
|
||||
"documents_skipped": documents_skipped,
|
||||
"documents_failed": documents_failed,
|
||||
"skipped_channels_count": len(skipped_channels),
|
||||
"result_message": result_message,
|
||||
},
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Slack indexing completed: {documents_indexed} new channels, {documents_skipped} skipped"
|
||||
f"Slack indexing completed: {documents_indexed} ready, "
|
||||
f"{documents_skipped} skipped, {documents_failed} failed"
|
||||
)
|
||||
return (
|
||||
total_processed,
|
||||
None,
|
||||
) # Return None on success (result_message is for logging only)
|
||||
return documents_indexed, warning_message
|
||||
|
||||
except SQLAlchemyError as db_error:
|
||||
await session.rollback()
|
||||
|
|
|
|||
|
|
@ -1,17 +1,21 @@
|
|||
"""
|
||||
Microsoft Teams connector indexer.
|
||||
|
||||
Implements 2-phase document status updates for real-time UI feedback:
|
||||
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
|
||||
- Phase 2: Process each document: pending → processing → ready/failed
|
||||
"""
|
||||
|
||||
import time
|
||||
from collections.abc import Awaitable, Callable
|
||||
from datetime import UTC
|
||||
from datetime import UTC, datetime
|
||||
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config
|
||||
from app.connectors.teams_history import TeamsHistory
|
||||
from app.db import Document, DocumentType, SearchSourceConnectorType
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
|
|
@ -27,6 +31,7 @@ from .base import (
|
|||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
safe_set_chunks,
|
||||
update_connector_last_indexed,
|
||||
)
|
||||
|
||||
|
|
@ -50,6 +55,10 @@ async def index_teams_messages(
|
|||
"""
|
||||
Index Microsoft Teams messages from all accessible teams and channels.
|
||||
|
||||
Implements 2-phase document status updates for real-time UI feedback:
|
||||
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
|
||||
- Phase 2: Process each document: pending → processing → ready/failed
|
||||
|
||||
Args:
|
||||
session: Database session
|
||||
connector_id: ID of the Teams connector
|
||||
|
|
@ -165,11 +174,16 @@ async def index_teams_messages(
|
|||
f"No Teams found for connector {connector_id}",
|
||||
{"teams_found": 0},
|
||||
)
|
||||
return 0, "No Teams found"
|
||||
# CRITICAL: Update timestamp even when no teams found so Electric SQL syncs
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
await session.commit()
|
||||
return 0, None # Return None (not error) when no items found
|
||||
|
||||
# Track the number of documents indexed
|
||||
documents_indexed = 0
|
||||
documents_skipped = 0
|
||||
documents_failed = 0
|
||||
duplicate_content_count = 0
|
||||
skipped_channels = []
|
||||
|
||||
# Heartbeat tracking - update notification periodically to prevent appearing stuck
|
||||
|
|
@ -182,8 +196,6 @@ async def index_teams_messages(
|
|||
)
|
||||
|
||||
# Convert date strings to datetime objects for filtering
|
||||
from datetime import datetime
|
||||
|
||||
start_datetime = None
|
||||
end_datetime = None
|
||||
if start_date_str:
|
||||
|
|
@ -197,16 +209,14 @@ async def index_teams_messages(
|
|||
hour=23, minute=59, second=59, tzinfo=UTC
|
||||
)
|
||||
|
||||
# Process each team
|
||||
for team in teams:
|
||||
# Check if it's time for a heartbeat update
|
||||
if (
|
||||
on_heartbeat_callback
|
||||
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
|
||||
):
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = time.time()
|
||||
# =======================================================================
|
||||
# PHASE 1: Collect all messages and create pending documents
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
# =======================================================================
|
||||
messages_to_process = [] # List of dicts with document and message data
|
||||
new_documents_created = False
|
||||
|
||||
for team in teams:
|
||||
team_id = team.get("id")
|
||||
team_name = team.get("displayName", "Unknown Team")
|
||||
|
||||
|
|
@ -239,7 +249,6 @@ async def index_teams_messages(
|
|||
channel_name,
|
||||
team_name,
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Process each message
|
||||
|
|
@ -322,60 +331,33 @@ async def index_teams_messages(
|
|||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
logger.info(
|
||||
"Document for Teams message %s in channel %s unchanged. Skipping.",
|
||||
message_id,
|
||||
channel_name,
|
||||
)
|
||||
# Ensure status is ready (might have been stuck in processing/pending)
|
||||
if not DocumentStatus.is_state(
|
||||
existing_document.status, DocumentStatus.READY
|
||||
):
|
||||
existing_document.status = (
|
||||
DocumentStatus.ready()
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
else:
|
||||
# Content has changed - update the existing document
|
||||
logger.info(
|
||||
"Content changed for Teams message %s in channel %s. Updating document.",
|
||||
message_id,
|
||||
channel_name,
|
||||
)
|
||||
|
||||
# Update chunks and embedding
|
||||
chunks = await create_document_chunks(
|
||||
combined_document_string
|
||||
)
|
||||
doc_embedding = (
|
||||
config.embedding_model_instance.embed(
|
||||
combined_document_string
|
||||
)
|
||||
)
|
||||
|
||||
# Update existing document
|
||||
existing_document.content = combined_document_string
|
||||
existing_document.content_hash = content_hash
|
||||
existing_document.embedding = doc_embedding
|
||||
existing_document.document_metadata = {
|
||||
# Queue existing document for update (will be set to processing in Phase 2)
|
||||
messages_to_process.append(
|
||||
{
|
||||
"document": existing_document,
|
||||
"is_new": False,
|
||||
"combined_document_string": combined_document_string,
|
||||
"content_hash": content_hash,
|
||||
"team_name": team_name,
|
||||
"team_id": team_id,
|
||||
"channel_name": channel_name,
|
||||
"channel_id": channel_id,
|
||||
"message_id": message_id,
|
||||
"start_date": start_date_str,
|
||||
"end_date": end_date_str,
|
||||
"message_count": len(messages),
|
||||
"indexed_at": datetime.now().strftime(
|
||||
"%Y-%m-%d %H:%M:%S"
|
||||
),
|
||||
}
|
||||
|
||||
# Delete old chunks and add new ones
|
||||
existing_document.chunks = chunks
|
||||
existing_document.updated_at = (
|
||||
get_current_timestamp()
|
||||
)
|
||||
|
||||
documents_indexed += 1
|
||||
logger.info(
|
||||
"Successfully updated Teams message %s",
|
||||
message_id,
|
||||
)
|
||||
continue
|
||||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
|
|
@ -395,62 +377,50 @@ async def index_teams_messages(
|
|||
duplicate_by_content.id,
|
||||
duplicate_by_content.document_type,
|
||||
)
|
||||
duplicate_content_count += 1
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(
|
||||
combined_document_string
|
||||
)
|
||||
doc_embedding = config.embedding_model_instance.embed(
|
||||
combined_document_string
|
||||
)
|
||||
|
||||
# Create and store new document
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=f"Teams - {team_name} - {channel_name}",
|
||||
title=f"{team_name} - {channel_name}",
|
||||
document_type=DocumentType.TEAMS_CONNECTOR,
|
||||
document_metadata={
|
||||
"team_name": team_name,
|
||||
"team_id": team_id,
|
||||
"channel_name": channel_name,
|
||||
"channel_id": channel_id,
|
||||
"start_date": start_date_str,
|
||||
"end_date": end_date_str,
|
||||
"message_count": len(messages),
|
||||
"indexed_at": datetime.now().strftime(
|
||||
"%Y-%m-%d %H:%M:%S"
|
||||
),
|
||||
"connector_id": connector_id,
|
||||
},
|
||||
content=combined_document_string,
|
||||
embedding=doc_embedding,
|
||||
chunks=chunks,
|
||||
content_hash=content_hash,
|
||||
content="Pending...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation - safe for async
|
||||
status=DocumentStatus.pending(), # Pending until processing starts
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
documents_indexed += 1
|
||||
new_documents_created = True
|
||||
|
||||
# Batch commit every 10 documents
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
"Committing batch: %s Teams messages processed so far",
|
||||
documents_indexed,
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
logger.info(
|
||||
"Successfully indexed channel %s in team %s with %s messages",
|
||||
channel_name,
|
||||
team_name,
|
||||
len(messages),
|
||||
)
|
||||
messages_to_process.append(
|
||||
{
|
||||
"document": document,
|
||||
"is_new": True,
|
||||
"combined_document_string": combined_document_string,
|
||||
"content_hash": content_hash,
|
||||
"team_name": team_name,
|
||||
"team_id": team_id,
|
||||
"channel_name": channel_name,
|
||||
"channel_id": channel_id,
|
||||
"message_id": message_id,
|
||||
"start_date": start_date_str,
|
||||
"end_date": end_date_str,
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
|
|
@ -462,54 +432,143 @@ async def index_teams_messages(
|
|||
skipped_channels.append(
|
||||
f"{team_name}/{channel_name} (processing error)"
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Error processing team %s: %s", team_name, str(e))
|
||||
continue
|
||||
|
||||
# Update the last_indexed_at timestamp for the connector only if requested
|
||||
# and if we successfully indexed at least one document
|
||||
total_processed = documents_indexed
|
||||
if total_processed > 0:
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info(
|
||||
f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each document one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(messages_to_process)} documents")
|
||||
|
||||
for item in messages_to_process:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
document = item["document"]
|
||||
try:
|
||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
# Heavy processing (embeddings, chunks)
|
||||
chunks = await create_document_chunks(item["combined_document_string"])
|
||||
doc_embedding = config.embedding_model_instance.embed(
|
||||
item["combined_document_string"]
|
||||
)
|
||||
|
||||
# Update document to READY with actual content
|
||||
document.title = f"{item['team_name']} - {item['channel_name']}"
|
||||
document.content = item["combined_document_string"]
|
||||
document.content_hash = item["content_hash"]
|
||||
document.embedding = doc_embedding
|
||||
document.document_metadata = {
|
||||
"team_name": item["team_name"],
|
||||
"team_id": item["team_id"],
|
||||
"channel_name": item["channel_name"],
|
||||
"channel_id": item["channel_id"],
|
||||
"start_date": item["start_date"],
|
||||
"end_date": item["end_date"],
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
documents_indexed += 1
|
||||
|
||||
# Batch commit every 10 documents (for ready status updates)
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
"Committing batch: %s Teams messages processed so far",
|
||||
documents_indexed,
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing Teams message: {e!s}", exc_info=True)
|
||||
# Mark document as failed with reason (visible in UI)
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(e))
|
||||
document.updated_at = get_current_timestamp()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
|
||||
# Final commit for any remaining documents not yet committed in batches
|
||||
logger.info(
|
||||
"Final commit: Total %s Teams messages processed", documents_indexed
|
||||
)
|
||||
await session.commit()
|
||||
try:
|
||||
await session.commit()
|
||||
logger.info("Successfully committed all Teams document changes to database")
|
||||
except Exception as e:
|
||||
# Handle any remaining integrity errors gracefully (race conditions, etc.)
|
||||
if (
|
||||
"duplicate key value violates unique constraint" in str(e).lower()
|
||||
or "uniqueviolationerror" in str(e).lower()
|
||||
):
|
||||
logger.warning(
|
||||
f"Duplicate content_hash detected during final commit. "
|
||||
f"Rolling back and continuing. Error: {e!s}"
|
||||
)
|
||||
await session.rollback()
|
||||
else:
|
||||
raise
|
||||
|
||||
# Prepare result message
|
||||
result_message = None
|
||||
# Build warning message if there were issues
|
||||
warning_parts = []
|
||||
if duplicate_content_count > 0:
|
||||
warning_parts.append(f"{duplicate_content_count} duplicate")
|
||||
if documents_failed > 0:
|
||||
warning_parts.append(f"{documents_failed} failed")
|
||||
if skipped_channels:
|
||||
result_message = f"Processed {total_processed} messages. Skipped {len(skipped_channels)} channels: {', '.join(skipped_channels)}"
|
||||
else:
|
||||
result_message = f"Processed {total_processed} messages."
|
||||
warning_parts.append(f"{len(skipped_channels)} channels skipped")
|
||||
warning_message = ", ".join(warning_parts) if warning_parts else None
|
||||
|
||||
# Log success
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Successfully completed Teams indexing for connector {connector_id}",
|
||||
{
|
||||
"messages_processed": total_processed,
|
||||
"documents_indexed": documents_indexed,
|
||||
"documents_skipped": documents_skipped,
|
||||
"documents_failed": documents_failed,
|
||||
"duplicate_content_count": duplicate_content_count,
|
||||
"skipped_channels_count": len(skipped_channels),
|
||||
"result_message": result_message,
|
||||
},
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"Teams indexing completed: %s new messages, %s skipped",
|
||||
"Teams indexing completed: %s ready, %s skipped, %s failed "
|
||||
"(%s duplicate content)",
|
||||
documents_indexed,
|
||||
documents_skipped,
|
||||
documents_failed,
|
||||
duplicate_content_count,
|
||||
)
|
||||
return (
|
||||
total_processed,
|
||||
None,
|
||||
) # Return None on success (result_message is for logging only)
|
||||
return documents_indexed, warning_message
|
||||
|
||||
except SQLAlchemyError as db_error:
|
||||
await session.rollback()
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
"""
|
||||
Webcrawler connector indexer.
|
||||
|
||||
Implements 2-phase document status updates for real-time UI feedback:
|
||||
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
|
||||
- Phase 2: Process each document: pending → processing → ready/failed
|
||||
"""
|
||||
|
||||
import time
|
||||
|
|
@ -11,7 +15,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
|
||||
from app.config import config
|
||||
from app.connectors.webcrawler_connector import WebCrawlerConnector
|
||||
from app.db import Document, DocumentType, SearchSourceConnectorType
|
||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
|
|
@ -28,6 +32,7 @@ from .base import (
|
|||
get_connector_by_id,
|
||||
get_current_timestamp,
|
||||
logger,
|
||||
safe_set_chunks,
|
||||
update_connector_last_indexed,
|
||||
)
|
||||
|
||||
|
|
@ -49,7 +54,11 @@ async def index_crawled_urls(
|
|||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||
) -> tuple[int, str | None]:
|
||||
"""
|
||||
Index web page URLs.
|
||||
Index web page URLs with real-time document status updates.
|
||||
|
||||
Implements 2-phase approach for real-time UI feedback:
|
||||
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
|
||||
- Phase 2: Process each document: pending → processing → ready/failed
|
||||
|
||||
Args:
|
||||
session: Database session
|
||||
|
|
@ -150,9 +159,9 @@ async def index_crawled_urls(
|
|||
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Starting to crawl {len(urls)} URLs",
|
||||
f"Starting to process {len(urls)} URLs",
|
||||
{
|
||||
"stage": "crawling",
|
||||
"stage": "processing",
|
||||
"total_urls": len(urls),
|
||||
},
|
||||
)
|
||||
|
|
@ -160,28 +169,128 @@ async def index_crawled_urls(
|
|||
documents_indexed = 0
|
||||
documents_updated = 0
|
||||
documents_skipped = 0
|
||||
failed_urls = []
|
||||
documents_failed = 0
|
||||
duplicate_content_count = 0
|
||||
|
||||
# Heartbeat tracking - update notification periodically to prevent appearing stuck
|
||||
last_heartbeat_time = time.time()
|
||||
|
||||
for idx, url in enumerate(urls, 1):
|
||||
# Check if it's time for a heartbeat update
|
||||
if (
|
||||
on_heartbeat_callback
|
||||
and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
|
||||
):
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = time.time()
|
||||
# =======================================================================
|
||||
# PHASE 1: Analyze all URLs, create pending documents for new ones
|
||||
# This makes ALL new documents visible in the UI immediately with pending status
|
||||
# =======================================================================
|
||||
urls_to_process = [] # List of dicts with document and URL data
|
||||
new_documents_created = False
|
||||
|
||||
for url in urls:
|
||||
try:
|
||||
logger.info(f"Processing URL {idx}/{len(urls)}: {url}")
|
||||
# Generate unique identifier hash for this URL
|
||||
unique_identifier_hash = generate_unique_identifier_hash(
|
||||
DocumentType.CRAWLED_URL, url, search_space_id
|
||||
)
|
||||
|
||||
# Check if document with this unique identifier already exists
|
||||
existing_document = await check_document_by_unique_identifier(
|
||||
session, unique_identifier_hash
|
||||
)
|
||||
|
||||
if existing_document:
|
||||
# Document exists - check if it's already being processed
|
||||
if DocumentStatus.is_state(
|
||||
existing_document.status, DocumentStatus.PENDING
|
||||
):
|
||||
logger.info(f"URL {url} already pending. Skipping.")
|
||||
documents_skipped += 1
|
||||
continue
|
||||
if DocumentStatus.is_state(
|
||||
existing_document.status, DocumentStatus.PROCESSING
|
||||
):
|
||||
logger.info(f"URL {url} already processing. Skipping.")
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Queue existing document for potential update check
|
||||
urls_to_process.append(
|
||||
{
|
||||
"document": existing_document,
|
||||
"is_new": False,
|
||||
"url": url,
|
||||
"unique_identifier_hash": unique_identifier_hash,
|
||||
}
|
||||
)
|
||||
continue
|
||||
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=url[:100], # Placeholder - URL as title (truncated)
|
||||
document_type=DocumentType.CRAWLED_URL,
|
||||
document_metadata={
|
||||
"url": url,
|
||||
"connector_id": connector_id,
|
||||
},
|
||||
content="Pending crawl...", # Placeholder content
|
||||
content_hash=unique_identifier_hash, # Temporary unique value
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation - safe for async
|
||||
status=DocumentStatus.pending(), # PENDING status - visible in UI
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
session.add(document)
|
||||
new_documents_created = True
|
||||
|
||||
urls_to_process.append(
|
||||
{
|
||||
"document": document,
|
||||
"is_new": True,
|
||||
"url": url,
|
||||
"unique_identifier_hash": unique_identifier_hash,
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Phase 1 for URL {url}: {e!s}", exc_info=True)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info(
|
||||
f"Phase 1: Committing {len([u for u in urls_to_process if u['is_new']])} pending documents"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each URL one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(urls_to_process)} URLs")
|
||||
|
||||
for item in urls_to_process:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed + documents_updated)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
document = item["document"]
|
||||
url = item["url"]
|
||||
is_new = item["is_new"]
|
||||
|
||||
try:
|
||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Crawling URL {idx}/{len(urls)}: {url}",
|
||||
f"Crawling URL: {url}",
|
||||
{
|
||||
"stage": "crawling_url",
|
||||
"url_index": idx,
|
||||
"url": url,
|
||||
},
|
||||
)
|
||||
|
|
@ -191,7 +300,10 @@ async def index_crawled_urls(
|
|||
|
||||
if error or not crawl_result:
|
||||
logger.warning(f"Failed to crawl URL {url}: {error}")
|
||||
failed_urls.append((url, error or "Unknown error"))
|
||||
document.status = DocumentStatus.failed(error or "Crawl failed")
|
||||
document.updated_at = get_current_timestamp()
|
||||
await session.commit()
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# Extract content and metadata
|
||||
|
|
@ -201,23 +313,18 @@ async def index_crawled_urls(
|
|||
|
||||
if not content.strip():
|
||||
logger.warning(f"Skipping URL with no content: {url}")
|
||||
failed_urls.append((url, "No content extracted"))
|
||||
documents_skipped += 1
|
||||
document.status = DocumentStatus.failed("No content extracted")
|
||||
document.updated_at = get_current_timestamp()
|
||||
await session.commit()
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# Format content as structured document for summary generation (includes all metadata)
|
||||
# Format content as structured document for summary generation
|
||||
structured_document = crawler.format_to_structured_document(
|
||||
crawl_result
|
||||
)
|
||||
|
||||
# Generate unique identifier hash for this URL
|
||||
unique_identifier_hash = generate_unique_identifier_hash(
|
||||
DocumentType.CRAWLED_URL, url, search_space_id
|
||||
)
|
||||
|
||||
# Generate content hash using a version WITHOUT metadata
|
||||
# This ensures the hash only changes when actual content changes,
|
||||
# not when metadata (which contains dynamic fields like timestamps, IDs, etc.) changes
|
||||
structured_document_for_hash = crawler.format_to_structured_document(
|
||||
crawl_result, exclude_metadata=True
|
||||
)
|
||||
|
|
@ -225,114 +332,53 @@ async def index_crawled_urls(
|
|||
structured_document_for_hash, search_space_id
|
||||
)
|
||||
|
||||
# Check if document with this unique identifier already exists
|
||||
existing_document = await check_document_by_unique_identifier(
|
||||
session, unique_identifier_hash
|
||||
)
|
||||
|
||||
# Extract useful metadata
|
||||
title = metadata.get("title", url)
|
||||
description = metadata.get("description", "")
|
||||
language = metadata.get("language", "")
|
||||
|
||||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
logger.info(f"Document for URL {url} unchanged. Skipping.")
|
||||
documents_skipped += 1
|
||||
continue
|
||||
else:
|
||||
# Content has changed - update the existing document
|
||||
logger.info(
|
||||
f"Content changed for URL {url}. Updating document."
|
||||
)
|
||||
# Update title immediately for better UX
|
||||
document.title = title
|
||||
await session.commit()
|
||||
|
||||
# Generate summary with metadata
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
"url": url,
|
||||
"title": title,
|
||||
"description": description,
|
||||
"language": language,
|
||||
"document_type": "Crawled URL",
|
||||
"crawler_type": crawler_type,
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
structured_document, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
summary_content = f"Crawled URL: {title}\n\n"
|
||||
summary_content += f"URL: {url}\n"
|
||||
if description:
|
||||
summary_content += f"Description: {description}\n"
|
||||
if language:
|
||||
summary_content += f"Language: {language}\n"
|
||||
summary_content += f"Crawler: {crawler_type}\n\n"
|
||||
|
||||
# Add content preview
|
||||
content_preview = content[:1000]
|
||||
if len(content) > 1000:
|
||||
content_preview += "..."
|
||||
summary_content += f"Content Preview:\n{content_preview}\n"
|
||||
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(content)
|
||||
|
||||
# Update existing document
|
||||
existing_document.title = title
|
||||
existing_document.content = summary_content
|
||||
existing_document.content_hash = content_hash
|
||||
existing_document.embedding = summary_embedding
|
||||
existing_document.document_metadata = {
|
||||
**metadata,
|
||||
"crawler_type": crawler_type,
|
||||
"last_crawled_at": datetime.now().strftime(
|
||||
"%Y-%m-%d %H:%M:%S"
|
||||
),
|
||||
}
|
||||
existing_document.chunks = chunks
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
|
||||
documents_updated += 1
|
||||
logger.info(f"Successfully updated URL {url}")
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
f"URL {url} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
# For existing documents, check if content has changed
|
||||
if not is_new and document.content_hash == content_hash:
|
||||
logger.info(f"Document for URL {url} unchanged. Marking as ready.")
|
||||
# Ensure status is ready (might have been stuck)
|
||||
document.status = DocumentStatus.ready()
|
||||
await session.commit()
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist - create new one
|
||||
# Generate summary with metadata
|
||||
# For new documents, check if duplicate content exists elsewhere
|
||||
if is_new:
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
f"URL {url} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}). "
|
||||
f"Marking as failed."
|
||||
)
|
||||
document.status = DocumentStatus.failed(
|
||||
"Duplicate content exists"
|
||||
)
|
||||
document.updated_at = get_current_timestamp()
|
||||
await session.commit()
|
||||
duplicate_content_count += 1
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Generate summary with LLM
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
document_metadata_for_summary = {
|
||||
"url": url,
|
||||
"title": title,
|
||||
"description": description,
|
||||
|
|
@ -344,7 +390,7 @@ async def index_crawled_urls(
|
|||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
structured_document, user_llm, document_metadata
|
||||
structured_document, user_llm, document_metadata_for_summary
|
||||
)
|
||||
else:
|
||||
# Fallback to simple summary if no LLM configured
|
||||
|
|
@ -366,32 +412,32 @@ async def index_crawled_urls(
|
|||
summary_content
|
||||
)
|
||||
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(content)
|
||||
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=title,
|
||||
document_type=DocumentType.CRAWLED_URL,
|
||||
document_metadata={
|
||||
**metadata,
|
||||
"crawler_type": crawler_type,
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
},
|
||||
content=summary_content,
|
||||
content_hash=content_hash,
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
# Update document to READY with actual content
|
||||
document.title = title
|
||||
document.content = summary_content
|
||||
document.content_hash = content_hash
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = {
|
||||
**metadata,
|
||||
"crawler_type": crawler_type,
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
document.status = DocumentStatus.ready() # READY status
|
||||
document.updated_at = get_current_timestamp()
|
||||
|
||||
session.add(document)
|
||||
documents_indexed += 1
|
||||
logger.info(f"Successfully indexed new URL {url}")
|
||||
if is_new:
|
||||
documents_indexed += 1
|
||||
else:
|
||||
documents_updated += 1
|
||||
|
||||
# Batch commit every 10 documents
|
||||
logger.info(f"Successfully processed URL {url}")
|
||||
|
||||
# Batch commit every 10 documents (for ready status updates)
|
||||
if (documents_indexed + documents_updated) % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed + documents_updated} URLs processed so far"
|
||||
|
|
@ -399,32 +445,51 @@ async def index_crawled_urls(
|
|||
await session.commit()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error processing URL {url}: {e!s}",
|
||||
exc_info=True,
|
||||
)
|
||||
failed_urls.append((url, str(e)))
|
||||
logger.error(f"Error processing URL {url}: {e!s}", exc_info=True)
|
||||
# Mark document as failed with reason (visible in UI)
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(e)[:200])
|
||||
document.updated_at = get_current_timestamp()
|
||||
await session.commit()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
total_processed = documents_indexed + documents_updated
|
||||
|
||||
if total_processed > 0:
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
|
||||
# Final commit for any remaining documents not yet committed in batches
|
||||
logger.info(
|
||||
f"Final commit: Total {documents_indexed} new, {documents_updated} updated URLs processed"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# Log failed URLs if any (for debugging purposes)
|
||||
if failed_urls:
|
||||
failed_summary = "; ".join(
|
||||
[f"{url}: {error}" for url, error in failed_urls[:5]]
|
||||
try:
|
||||
await session.commit()
|
||||
logger.info(
|
||||
"Successfully committed all webcrawler document changes to database"
|
||||
)
|
||||
if len(failed_urls) > 5:
|
||||
failed_summary += f" (and {len(failed_urls) - 5} more)"
|
||||
logger.warning(f"Some URLs failed to index: {failed_summary}")
|
||||
except Exception as e:
|
||||
# Handle any remaining integrity errors gracefully
|
||||
if "duplicate key value violates unique constraint" in str(e).lower():
|
||||
logger.warning(
|
||||
f"Duplicate content_hash detected during final commit. "
|
||||
f"Rolling back and continuing. Error: {e!s}"
|
||||
)
|
||||
await session.rollback()
|
||||
else:
|
||||
raise
|
||||
|
||||
# Build warning message if there were issues
|
||||
warning_parts = []
|
||||
if duplicate_content_count > 0:
|
||||
warning_parts.append(f"{duplicate_content_count} duplicate")
|
||||
if documents_failed > 0:
|
||||
warning_parts.append(f"{documents_failed} failed")
|
||||
warning_message = ", ".join(warning_parts) if warning_parts else None
|
||||
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
|
|
@ -434,19 +499,21 @@ async def index_crawled_urls(
|
|||
"documents_indexed": documents_indexed,
|
||||
"documents_updated": documents_updated,
|
||||
"documents_skipped": documents_skipped,
|
||||
"failed_urls_count": len(failed_urls),
|
||||
"documents_failed": documents_failed,
|
||||
"duplicate_content_count": duplicate_content_count,
|
||||
},
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Web page indexing completed: {documents_indexed} new, "
|
||||
f"{documents_updated} updated, {documents_skipped} skipped, "
|
||||
f"{len(failed_urls)} failed"
|
||||
f"{documents_failed} failed"
|
||||
)
|
||||
return (
|
||||
total_processed,
|
||||
None,
|
||||
) # Return None on success (result_message is for logging only)
|
||||
|
||||
if warning_message:
|
||||
return total_processed, f"Completed with issues: {warning_message}"
|
||||
|
||||
return total_processed, None
|
||||
|
||||
except SQLAlchemyError as db_error:
|
||||
await session.rollback()
|
||||
|
|
@ -494,9 +561,7 @@ async def get_crawled_url_documents(
|
|||
)
|
||||
|
||||
if connector_id:
|
||||
# Filter by connector if needed - you might need to add a connector_id field to Document
|
||||
# or filter by some other means depending on your schema
|
||||
pass
|
||||
query = query.filter(Document.connector_id == connector_id)
|
||||
|
||||
result = await session.execute(query)
|
||||
documents = result.scalars().all()
|
||||
|
|
|
|||
|
|
@ -14,6 +14,35 @@ from app.db import Document
|
|||
md = MarkdownifyTransformer()
|
||||
|
||||
|
||||
def safe_set_chunks(document: Document, chunks: list) -> None:
    """
    Install ``chunks`` as the ``chunks`` collection of ``document`` without
    performing a regular attribute assignment.

    Call this wherever ``document.chunks = chunks`` would otherwise be written.

    Rationale (as stated by the authors of this helper):
    - A plain ``document.chunks = chunks`` makes SQLAlchemy load the previous
      chunks first, for comparison / orphan detection.
    - In async code using the asyncpg driver that implicit lazy load fails
      with MissingGreenlet / greenlet_spawn errors.
    - ``set_committed_value`` writes the new value directly, so the lazy
      load is not attempted, whether or not the document was loaded with
      ``selectinload``.

    Args:
        document: The Document object whose ``chunks`` attribute is replaced.
        chunks: List of Chunk objects to store on the document.

    Example:
        # Instead of: document.chunks = chunks
        safe_set_chunks(document, chunks)
    """
    # Function-local import, same as the original: SQLAlchemy's attribute
    # helpers are only needed when this function is actually called.
    from sqlalchemy.orm import attributes as orm_attributes

    # NOTE(review): set_committed_value records the value with no change
    # history (it is treated like freshly loaded state). Confirm that new
    # chunks still reach the database on flush and that the document's
    # previous chunk rows are cleaned up - nothing in this function does either.
    orm_attributes.set_committed_value(document, "chunks", chunks)
|
||||
|
||||
|
||||
def get_current_timestamp() -> datetime:
|
||||
"""
|
||||
Get the current timestamp with timezone for updated_at field.
|
||||
|
|
|
|||
|
|
@ -3,6 +3,11 @@ Circleback meeting document processor.
|
|||
|
||||
This module processes meeting data received from Circleback webhooks
|
||||
and stores it as searchable documents in the database.
|
||||
|
||||
Implements real-time document status updates for UI feedback:
|
||||
- Create document with 'pending' status (visible in UI immediately)
|
||||
- Set to 'processing' while processing content
|
||||
- Set to 'ready' or 'failed' when complete
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
|
@ -14,6 +19,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
|
||||
from app.db import (
|
||||
Document,
|
||||
DocumentStatus,
|
||||
DocumentType,
|
||||
SearchSourceConnector,
|
||||
SearchSourceConnectorType,
|
||||
|
|
@ -30,6 +36,7 @@ from app.utils.document_converters import (
|
|||
from .base import (
|
||||
check_document_by_unique_identifier,
|
||||
get_current_timestamp,
|
||||
safe_set_chunks,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
|
@ -47,6 +54,11 @@ async def add_circleback_meeting_document(
|
|||
"""
|
||||
Process and store a Circleback meeting document.
|
||||
|
||||
Implements real-time document status updates:
|
||||
- Phase 1: Create document with 'pending' status (visible in UI immediately)
|
||||
- Phase 2: Set to 'processing' while processing content
|
||||
- Phase 3: Set to 'ready' or 'failed' when complete
|
||||
|
||||
Args:
|
||||
session: Database session
|
||||
meeting_id: Circleback meeting ID
|
||||
|
|
@ -59,6 +71,7 @@ async def add_circleback_meeting_document(
|
|||
Returns:
|
||||
Document object if successful, None if failed or duplicate
|
||||
"""
|
||||
document = None
|
||||
try:
|
||||
# Generate unique identifier hash using Circleback meeting ID
|
||||
unique_identifier = f"circleback_{meeting_id}"
|
||||
|
|
@ -77,6 +90,12 @@ async def add_circleback_meeting_document(
|
|||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
# Ensure status is ready (might have been stuck in processing/pending)
|
||||
if not DocumentStatus.is_state(
|
||||
existing_document.status, DocumentStatus.READY
|
||||
):
|
||||
existing_document.status = DocumentStatus.ready()
|
||||
await session.commit()
|
||||
logger.info(f"Circleback meeting {meeting_id} unchanged. Skipping.")
|
||||
return existing_document
|
||||
else:
|
||||
|
|
@ -84,6 +103,78 @@ async def add_circleback_meeting_document(
|
|||
logger.info(
|
||||
f"Content changed for Circleback meeting {meeting_id}. Updating document."
|
||||
)
|
||||
document = existing_document
|
||||
# Set to PROCESSING status and commit - shows "processing" in UI
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
else:
|
||||
# =======================================================================
|
||||
# PHASE 1: Create document with PENDING status
|
||||
# This makes the document visible in the UI immediately
|
||||
# =======================================================================
|
||||
|
||||
# Fetch the user who set up the Circleback connector (preferred)
|
||||
# or fall back to search space owner if no connector found
|
||||
created_by_user_id = None
|
||||
|
||||
# Try to find the Circleback connector for this search space
|
||||
connector_result = await session.execute(
|
||||
select(SearchSourceConnector.user_id).where(
|
||||
SearchSourceConnector.search_space_id == search_space_id,
|
||||
SearchSourceConnector.connector_type
|
||||
== SearchSourceConnectorType.CIRCLEBACK_CONNECTOR,
|
||||
)
|
||||
)
|
||||
connector_user = connector_result.scalar_one_or_none()
|
||||
|
||||
if connector_user:
|
||||
# Use the user who set up the Circleback connector
|
||||
created_by_user_id = connector_user
|
||||
else:
|
||||
# Fallback: use search space owner if no connector found
|
||||
search_space_result = await session.execute(
|
||||
select(SearchSpace.user_id).where(SearchSpace.id == search_space_id)
|
||||
)
|
||||
created_by_user_id = search_space_result.scalar_one_or_none()
|
||||
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=meeting_name,
|
||||
document_type=DocumentType.CIRCLEBACK,
|
||||
document_metadata={
|
||||
"CIRCLEBACK_MEETING_ID": meeting_id,
|
||||
"MEETING_NAME": meeting_name,
|
||||
"SOURCE": "CIRCLEBACK_WEBHOOK",
|
||||
"connector_id": connector_id,
|
||||
},
|
||||
content="Pending...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation - safe for async
|
||||
status=DocumentStatus.pending(), # Pending until processing starts
|
||||
content_needs_reindexing=False,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=created_by_user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
session.add(document)
|
||||
# Commit immediately so document appears in UI with pending status
|
||||
await session.commit()
|
||||
logger.info(
|
||||
f"Created pending Circleback meeting document {meeting_id} in search space {search_space_id}"
|
||||
)
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Set to PROCESSING status
|
||||
# =======================================================================
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 3: Process the document content
|
||||
# =======================================================================
|
||||
|
||||
# Get LLM for generating summary
|
||||
llm = await get_document_summary_llm(session, search_space_id)
|
||||
|
|
@ -100,7 +191,7 @@ async def add_circleback_meeting_document(
|
|||
summary_embedding = None
|
||||
else:
|
||||
# Generate summary with metadata
|
||||
document_metadata = {
|
||||
summary_metadata = {
|
||||
"meeting_name": meeting_name,
|
||||
"meeting_id": meeting_id,
|
||||
"document_type": "Circleback Meeting",
|
||||
|
|
@ -111,7 +202,7 @@ async def add_circleback_meeting_document(
|
|||
},
|
||||
}
|
||||
summary_content, summary_embedding = await generate_document_summary(
|
||||
markdown_content, llm, document_metadata
|
||||
markdown_content, llm, summary_metadata
|
||||
)
|
||||
|
||||
# Process chunks
|
||||
|
|
@ -126,7 +217,7 @@ async def add_circleback_meeting_document(
|
|||
f"Failed to convert Circleback meeting {meeting_id} to BlockNote JSON, document will not be editable"
|
||||
)
|
||||
|
||||
# Prepare document metadata
|
||||
# Prepare final document metadata
|
||||
document_metadata = {
|
||||
"CIRCLEBACK_MEETING_ID": meeting_id,
|
||||
"MEETING_NAME": meeting_name,
|
||||
|
|
@ -134,77 +225,34 @@ async def add_circleback_meeting_document(
|
|||
**metadata,
|
||||
}
|
||||
|
||||
# Fetch the user who set up the Circleback connector (preferred)
|
||||
# or fall back to search space owner if no connector found
|
||||
created_by_user_id = None
|
||||
# =======================================================================
|
||||
# PHASE 4: Update document to READY status with actual content
|
||||
# =======================================================================
|
||||
document.title = meeting_name
|
||||
document.content = summary_content
|
||||
document.content_hash = content_hash
|
||||
if summary_embedding is not None:
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = document_metadata
|
||||
safe_set_chunks(document, chunks)
|
||||
document.blocknote_document = blocknote_json
|
||||
document.content_needs_reindexing = False
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
# Ensure connector_id is set (backfill for documents created before this field)
|
||||
if connector_id is not None:
|
||||
document.connector_id = connector_id
|
||||
|
||||
# Try to find the Circleback connector for this search space
|
||||
connector_result = await session.execute(
|
||||
select(SearchSourceConnector.user_id).where(
|
||||
SearchSourceConnector.search_space_id == search_space_id,
|
||||
SearchSourceConnector.connector_type
|
||||
== SearchSourceConnectorType.CIRCLEBACK_CONNECTOR,
|
||||
)
|
||||
)
|
||||
connector_user = connector_result.scalar_one_or_none()
|
||||
await session.commit()
|
||||
await session.refresh(document)
|
||||
|
||||
if connector_user:
|
||||
# Use the user who set up the Circleback connector
|
||||
created_by_user_id = connector_user
|
||||
else:
|
||||
# Fallback: use search space owner if no connector found
|
||||
search_space_result = await session.execute(
|
||||
select(SearchSpace.user_id).where(SearchSpace.id == search_space_id)
|
||||
)
|
||||
created_by_user_id = search_space_result.scalar_one_or_none()
|
||||
|
||||
# Update or create document
|
||||
if existing_document:
|
||||
# Update existing document
|
||||
existing_document.title = meeting_name
|
||||
existing_document.content = summary_content
|
||||
existing_document.content_hash = content_hash
|
||||
if summary_embedding is not None:
|
||||
existing_document.embedding = summary_embedding
|
||||
existing_document.document_metadata = document_metadata
|
||||
existing_document.chunks = chunks
|
||||
existing_document.blocknote_document = blocknote_json
|
||||
existing_document.content_needs_reindexing = False
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
# Ensure connector_id is set (backfill for documents created before this field)
|
||||
if connector_id is not None:
|
||||
existing_document.connector_id = connector_id
|
||||
|
||||
await session.commit()
|
||||
await session.refresh(existing_document)
|
||||
document = existing_document
|
||||
logger.info(
|
||||
f"Updated Circleback meeting document {meeting_id} in search space {search_space_id}"
|
||||
)
|
||||
else:
|
||||
# Create new document
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=meeting_name,
|
||||
document_type=DocumentType.CIRCLEBACK,
|
||||
document_metadata=document_metadata,
|
||||
content=summary_content,
|
||||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
content_hash=content_hash,
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
blocknote_document=blocknote_json,
|
||||
content_needs_reindexing=False,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=created_by_user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
await session.commit()
|
||||
await session.refresh(document)
|
||||
logger.info(
|
||||
f"Created new Circleback meeting document {meeting_id} in search space {search_space_id}"
|
||||
f"Processed Circleback meeting document {meeting_id} in search space {search_space_id} - now ready"
|
||||
)
|
||||
|
||||
return document
|
||||
|
|
@ -214,8 +262,28 @@ async def add_circleback_meeting_document(
|
|||
logger.error(
|
||||
f"Database error processing Circleback meeting {meeting_id}: {db_error}"
|
||||
)
|
||||
# Mark document as failed if it was created
|
||||
if document is not None:
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(db_error))
|
||||
document.updated_at = get_current_timestamp()
|
||||
await session.commit()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
raise db_error
|
||||
except Exception as e:
|
||||
await session.rollback()
|
||||
logger.error(f"Failed to process Circleback meeting {meeting_id}: {e!s}")
|
||||
# Mark document as failed if it was created
|
||||
if document is not None:
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(e))
|
||||
document.updated_at = get_current_timestamp()
|
||||
await session.commit()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
raise RuntimeError(f"Failed to process Circleback meeting: {e!s}") from e
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ from sqlalchemy.exc import SQLAlchemyError
|
|||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config as app_config
|
||||
from app.db import Document, DocumentType, Log, Notification
|
||||
from app.db import Document, DocumentStatus, DocumentType, Log, Notification
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.notification_service import NotificationService
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
|
|
@ -33,6 +33,7 @@ from .base import (
|
|||
check_document_by_unique_identifier,
|
||||
check_duplicate_document,
|
||||
get_current_timestamp,
|
||||
safe_set_chunks,
|
||||
)
|
||||
from .markdown_processor import add_received_markdown_file_document
|
||||
|
||||
|
|
@ -499,6 +500,7 @@ async def add_received_file_document_using_unstructured(
|
|||
existing_document.blocknote_document = blocknote_json
|
||||
existing_document.content_needs_reindexing = False
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
existing_document.status = DocumentStatus.ready() # Mark as ready
|
||||
|
||||
await session.commit()
|
||||
await session.refresh(existing_document)
|
||||
|
|
@ -528,6 +530,7 @@ async def add_received_file_document_using_unstructured(
|
|||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector.get("connector_id") if connector else None,
|
||||
status=DocumentStatus.ready(), # Mark as ready
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
@ -640,6 +643,7 @@ async def add_received_file_document_using_llamacloud(
|
|||
existing_document.blocknote_document = blocknote_json
|
||||
existing_document.content_needs_reindexing = False
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
existing_document.status = DocumentStatus.ready() # Mark as ready
|
||||
|
||||
await session.commit()
|
||||
await session.refresh(existing_document)
|
||||
|
|
@ -669,6 +673,7 @@ async def add_received_file_document_using_llamacloud(
|
|||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector.get("connector_id") if connector else None,
|
||||
status=DocumentStatus.ready(), # Mark as ready
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
@ -806,6 +811,7 @@ async def add_received_file_document_using_docling(
|
|||
existing_document.blocknote_document = blocknote_json
|
||||
existing_document.content_needs_reindexing = False
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
existing_document.status = DocumentStatus.ready() # Mark as ready
|
||||
|
||||
await session.commit()
|
||||
await session.refresh(existing_document)
|
||||
|
|
@ -835,6 +841,7 @@ async def add_received_file_document_using_docling(
|
|||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector.get("connector_id") if connector else None,
|
||||
status=DocumentStatus.ready(), # Mark as ready
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
@ -1606,3 +1613,372 @@ async def process_file_in_background(
|
|||
|
||||
logging.error(f"Error processing file in background: {error_message}")
|
||||
raise # Re-raise so the wrapper can also handle it
|
||||
|
||||
|
||||
async def _notify_parsing_progress(
    session: AsyncSession,
    notification: Notification | None,
    stage_message: str,
) -> None:
    """Publish a "parsing" stage progress update when a notification is attached."""
    if notification:
        await NotificationService.document_processing.notify_processing_progress(
            session,
            notification,
            stage="parsing",
            stage_message=stage_message,
        )


async def _read_markdown_or_text_file(
    file_path: str,
    filename: str,
    session: AsyncSession,
    task_logger: TaskLoggingService,
    log_entry: Log,
    notification: Notification | None,
) -> tuple[str, str]:
    """Read a markdown/text upload verbatim and return ``(content, "MARKDOWN")``."""
    await _notify_parsing_progress(session, notification, "Reading file")
    await task_logger.log_task_progress(
        log_entry,
        f"Processing markdown/text file: {filename}",
        {"file_type": "markdown", "processing_stage": "reading_file"},
    )

    with open(file_path, encoding="utf-8") as f:
        return f.read(), "MARKDOWN"


async def _transcribe_audio_file(
    file_path: str,
    filename: str,
    session: AsyncSession,
    task_logger: TaskLoggingService,
    log_entry: Log,
    notification: Notification | None,
) -> tuple[str, str]:
    """Transcribe an audio upload and return ``(markdown, "AUDIO_TRANSCRIPTION")``.

    Raises:
        ValueError: If the STT backend returns no text.
    """
    await _notify_parsing_progress(session, notification, "Transcribing audio")
    await task_logger.log_task_progress(
        log_entry,
        f"Processing audio file for transcription: {filename}",
        {"file_type": "audio", "processing_stage": "starting_transcription"},
    )

    # A STT_SERVICE value prefixed with "local/" selects the in-process service;
    # anything else goes through `atranscription`.
    if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/"):
        from app.services.stt_service import stt_service

        result = stt_service.transcribe_file(file_path)
    else:
        with open(file_path, "rb") as audio_file:
            transcription_kwargs = {
                "model": app_config.STT_SERVICE,
                "file": audio_file,
                "api_key": app_config.STT_SERVICE_API_KEY,
            }
            if app_config.STT_SERVICE_API_BASE:
                transcription_kwargs["api_base"] = app_config.STT_SERVICE_API_BASE
            result = await atranscription(**transcription_kwargs)

    transcribed_text = result.get("text", "")
    if not transcribed_text:
        raise ValueError("Transcription returned empty text")

    return (
        f"# Transcription of {filename}\n\n{transcribed_text}",
        "AUDIO_TRANSCRIPTION",
    )


async def _parse_with_etl_service(
    file_path: str,
    filename: str,
    user_id: str,
    session: AsyncSession,
    task_logger: TaskLoggingService,
    log_entry: Log,
    notification: Notification | None,
) -> tuple[str | None, str]:
    """Extract markdown from a document file via the configured ETL service.

    Checks the user's page limit before parsing and records page usage after a
    successful parse. Returns ``(markdown_content, etl_service_name)``.

    Raises:
        RuntimeError: If ``ETL_SERVICE`` is not one of UNSTRUCTURED, LLAMACLOUD
            or DOCLING, or if LlamaCloud returns no documents.
    """
    import os

    from app.services.page_limit_service import PageLimitService

    page_limit_service = PageLimitService(session)

    # Estimate page count; fall back to a size-based heuristic (1 page per 80 KiB).
    try:
        estimated_pages = page_limit_service.estimate_pages_before_processing(
            file_path
        )
    except Exception:
        file_size = os.path.getsize(file_path)
        estimated_pages = max(1, file_size // (80 * 1024))

    # Raises if the user would exceed their page limit.
    await page_limit_service.check_page_limit(user_id, estimated_pages)

    if app_config.ETL_SERVICE == "UNSTRUCTURED":
        await _notify_parsing_progress(session, notification, "Extracting content")

        from langchain_unstructured import UnstructuredLoader

        loader = UnstructuredLoader(
            file_path,
            mode="elements",
            post_processors=[],
            languages=["eng"],
            include_orig_elements=False,
            include_metadata=False,
            strategy="auto",
        )
        docs = await loader.aload()
        markdown_content = await convert_document_to_markdown(docs)
        actual_pages = page_limit_service.estimate_pages_from_elements(docs)
        # Charge the larger of the pre-parse estimate and the post-parse count.
        pages_to_charge = max(estimated_pages, actual_pages)
        etl_service = "UNSTRUCTURED"

    elif app_config.ETL_SERVICE == "LLAMACLOUD":
        await _notify_parsing_progress(session, notification, "Extracting content")

        result = await parse_with_llamacloud_retry(
            file_path=file_path,
            estimated_pages=estimated_pages,
            task_logger=task_logger,
            log_entry=log_entry,
        )
        markdown_documents = await result.aget_markdown_documents(split_by_page=False)
        if not markdown_documents:
            raise RuntimeError(f"LlamaCloud parsing returned no documents: {filename}")
        markdown_content = markdown_documents[0].text
        pages_to_charge = estimated_pages
        etl_service = "LLAMACLOUD"

    elif app_config.ETL_SERVICE == "DOCLING":
        await _notify_parsing_progress(session, notification, "Extracting content")

        # Quiet Docling's loggers before importing/using it.
        getLogger("docling.pipeline.base_pipeline").setLevel(ERROR)
        getLogger("docling.document_converter").setLevel(ERROR)
        getLogger("docling_core.transforms.chunker.hierarchical_chunker").setLevel(
            ERROR
        )

        from docling.document_converter import DocumentConverter

        converter = DocumentConverter()
        result = converter.convert(file_path)
        markdown_content = result.document.export_to_markdown()
        pages_to_charge = estimated_pages
        etl_service = "DOCLING"

    else:
        raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}")

    # Update page usage
    await page_limit_service.update_page_usage(
        user_id, pages_to_charge, allow_exceed=True
    )
    return markdown_content, etl_service


async def process_file_in_background_with_document(
    document: Document,
    file_path: str,
    filename: str,
    search_space_id: int,
    user_id: str,
    session: AsyncSession,
    task_logger: TaskLoggingService,
    log_entry: Log,
    connector: dict | None = None,
    notification: Notification | None = None,
) -> Document | None:
    """
    Process file and update existing pending document (2-phase pattern).

    This function is Phase 2 of the real-time document status updates:
    - Phase 1 (API): Created document with pending status
    - Phase 2 (this): Process file and update document to ready

    The document already exists with pending status. This function:
    1. Parses the file content (markdown, audio, or ETL services)
    2. Updates the document with content, embeddings, and chunks
    3. Sets status to 'ready' on success

    The uploaded file at ``file_path`` is removed once parsing finishes,
    whether parsing succeeded or raised.

    This function never sets a failed status itself: on duplicate content it
    returns None, and on error it rolls back, logs the failure and re-raises.

    Args:
        document: Existing document with pending status
        file_path: Path to the uploaded file
        filename: Original filename
        search_space_id: ID of the search space
        user_id: ID of the user
        session: Database session
        task_logger: Task logging service
        log_entry: Log entry for this task
        connector: Optional connector info for Google Drive files
            (accepted for interface compatibility; not read by this function)
        notification: Optional notification for progress updates

    Returns:
        Updated Document object if successful, None if duplicate content detected

    Raises:
        Exception: Any error raised while parsing, summarising or persisting is
            logged via ``task_logger.log_task_failure`` and re-raised unchanged.
    """
    import os

    from app.utils.blocknote_converter import convert_markdown_to_blocknote

    # Captured before any rollback can happen: session.rollback() expires ORM
    # instances, and reading an expired attribute (document.id) afterwards would
    # need an implicit lazy load, which AsyncSession does not allow. Reading it
    # in the error handler could therefore mask the original exception.
    document_id = None

    try:
        document_id = document.id

        # ===== STEP 1: Parse file content based on type =====
        lowered_name = filename.lower()
        try:
            if lowered_name.endswith((".md", ".markdown", ".txt")):
                markdown_content, etl_service = await _read_markdown_or_text_file(
                    file_path, filename, session, task_logger, log_entry, notification
                )
            elif lowered_name.endswith(
                (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")
            ):
                markdown_content, etl_service = await _transcribe_audio_file(
                    file_path, filename, session, task_logger, log_entry, notification
                )
            else:
                # Document files - use ETL service
                markdown_content, etl_service = await _parse_with_etl_service(
                    file_path,
                    filename,
                    user_id,
                    session,
                    task_logger,
                    log_entry,
                    notification,
                )
        finally:
            # Clean up temp file on success *and* failure so that a failed parse
            # does not leave the upload behind. Best-effort by design.
            with contextlib.suppress(Exception):
                os.unlink(file_path)

        if not markdown_content:
            raise RuntimeError(f"Failed to extract content from file: {filename}")

        # ===== STEP 2: Check for duplicate content =====
        content_hash = generate_content_hash(markdown_content, search_space_id)

        existing_by_content = await check_duplicate_document(session, content_hash)
        if existing_by_content and existing_by_content.id != document_id:
            # Duplicate content found - return None without modifying `document`.
            # NOTE(review): the status is not changed to failed here; presumably
            # the caller handles a None result - confirm against the caller.
            logging.info(
                f"Duplicate content detected for {filename}, "
                f"matches document {existing_by_content.id}"
            )
            return None

        # ===== STEP 3: Generate embeddings and chunks =====
        if notification:
            await NotificationService.document_processing.notify_processing_progress(
                session, notification, stage="chunking"
            )

        user_llm = await get_user_long_context_llm(session, user_id, search_space_id)

        if user_llm:
            document_metadata = {
                "file_name": filename,
                "etl_service": etl_service,
                "document_type": "File Document",
            }
            summary_content, summary_embedding = await generate_document_summary(
                markdown_content, user_llm, document_metadata
            )
        else:
            # Fallback: use truncated content as summary
            summary_content = markdown_content[:4000]
            summary_embedding = app_config.embedding_model_instance.embed(
                summary_content
            )

        chunks = await create_document_chunks(markdown_content)

        # Convert to BlockNote for editing
        blocknote_json = await convert_markdown_to_blocknote(markdown_content)

        # ===== STEP 4: Update document to READY =====
        from sqlalchemy.orm.attributes import flag_modified

        document.title = filename
        document.content = summary_content
        document.content_hash = content_hash
        document.embedding = summary_embedding
        # Existing metadata is spread last, so keys already stored on the
        # document take precedence over the two defaults set here.
        document.document_metadata = {
            "FILE_NAME": filename,
            "ETL_SERVICE": etl_service or "UNKNOWN",
            **(document.document_metadata or {}),
        }
        flag_modified(document, "document_metadata")

        # Use safe_set_chunks to avoid async issues
        safe_set_chunks(document, chunks)

        document.blocknote_document = blocknote_json
        document.content_needs_reindexing = False
        document.updated_at = get_current_timestamp()
        document.status = DocumentStatus.ready()  # Shows checkmark in UI

        await session.commit()
        await session.refresh(document)

        await task_logger.log_task_success(
            log_entry,
            f"Successfully processed file: {filename}",
            {
                "document_id": document_id,
                "content_hash": content_hash,
                "file_type": etl_service,
                "chunks_count": len(chunks),
            },
        )

        return document

    except Exception as e:
        await session.rollback()

        from app.services.page_limit_service import PageLimitExceededError

        # Surface page-limit errors verbatim; everything else gets a generic
        # message (the full error text is still passed to the task log below).
        if isinstance(e, PageLimitExceededError):
            error_message = str(e)
        elif isinstance(e, HTTPException) and "page limit" in str(e.detail).lower():
            error_message = str(e.detail)
        else:
            error_message = f"Failed to process file: {filename}"

        await task_logger.log_task_failure(
            log_entry,
            error_message,
            str(e),
            {
                "error_type": type(e).__name__,
                "filename": filename,
                # Uses the id captured before rollback, not the expired instance.
                "document_id": document_id,
            },
        )
        logging.error(f"Error processing file with document: {error_message}")
        raise
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ import logging
|
|||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.db import Document, DocumentType
|
||||
from app.db import Document, DocumentStatus, DocumentType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
|
|
@ -270,6 +270,7 @@ async def add_received_markdown_file_document(
|
|||
existing_document.chunks = chunks
|
||||
existing_document.blocknote_document = blocknote_json
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
existing_document.status = DocumentStatus.ready() # Mark as ready
|
||||
|
||||
await session.commit()
|
||||
await session.refresh(existing_document)
|
||||
|
|
@ -297,6 +298,7 @@ async def add_received_markdown_file_document(
|
|||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector.get("connector_id") if connector else None,
|
||||
status=DocumentStatus.ready(), # Mark as ready
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
"""
|
||||
YouTube video document processor.
|
||||
|
||||
Implements 2-phase document status updates for real-time UI feedback:
|
||||
- Phase 1: Create document with 'pending' status (visible in UI immediately)
|
||||
- Phase 2: Process document: pending → processing → ready/failed
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
|
@ -12,7 +16,7 @@ from sqlalchemy.exc import SQLAlchemyError
|
|||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from youtube_transcript_api import YouTubeTranscriptApi
|
||||
|
||||
from app.db import Document, DocumentType
|
||||
from app.db import Document, DocumentStatus, DocumentType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
|
|
@ -26,6 +30,7 @@ from app.utils.proxy_config import get_requests_proxies
|
|||
from .base import (
|
||||
check_document_by_unique_identifier,
|
||||
get_current_timestamp,
|
||||
safe_set_chunks,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -61,6 +66,10 @@ async def add_youtube_video_document(
|
|||
"""
|
||||
Process a YouTube video URL, extract transcripts, and store as a document.
|
||||
|
||||
Implements 2-phase document status updates for real-time UI feedback:
|
||||
- Phase 1: Create document with 'pending' status (visible in UI immediately)
|
||||
- Phase 2: Process document: pending → processing → ready/failed
|
||||
|
||||
Args:
|
||||
session: Database session for storing the document
|
||||
url: YouTube video URL (supports standard, shortened, and embed formats)
|
||||
|
|
@ -85,15 +94,18 @@ async def add_youtube_video_document(
|
|||
metadata={"url": url, "user_id": str(user_id)},
|
||||
)
|
||||
|
||||
document = None
|
||||
video_id = None
|
||||
is_new_document = False
|
||||
|
||||
try:
|
||||
# Extract video ID from URL
|
||||
# Extract video ID from URL (lightweight operation)
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Extracting video ID from URL: {url}",
|
||||
{"stage": "video_id_extraction"},
|
||||
)
|
||||
|
||||
# Get video ID
|
||||
video_id = get_youtube_video_id(url)
|
||||
if not video_id:
|
||||
raise ValueError(f"Could not extract video ID from URL: {url}")
|
||||
|
|
@ -104,13 +116,87 @@ async def add_youtube_video_document(
|
|||
{"stage": "video_id_extracted", "video_id": video_id},
|
||||
)
|
||||
|
||||
# Get video metadata
|
||||
# Generate unique identifier hash for this YouTube video
|
||||
unique_identifier_hash = generate_unique_identifier_hash(
|
||||
DocumentType.YOUTUBE_VIDEO, video_id, search_space_id
|
||||
)
|
||||
|
||||
# Check if document with this unique identifier already exists
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Checking for existing video: {video_id}",
|
||||
{"stage": "duplicate_check", "video_id": video_id},
|
||||
)
|
||||
|
||||
existing_document = await check_document_by_unique_identifier(
|
||||
session, unique_identifier_hash
|
||||
)
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 1: Create pending document or prepare existing for update
|
||||
# =======================================================================
|
||||
if existing_document:
|
||||
document = existing_document
|
||||
is_new_document = False
|
||||
# Check if already being processed
|
||||
if DocumentStatus.is_state(
|
||||
existing_document.status, DocumentStatus.PENDING
|
||||
):
|
||||
logging.info(
|
||||
f"YouTube video {video_id} already pending. Returning existing."
|
||||
)
|
||||
return existing_document
|
||||
if DocumentStatus.is_state(
|
||||
existing_document.status, DocumentStatus.PROCESSING
|
||||
):
|
||||
logging.info(
|
||||
f"YouTube video {video_id} already processing. Returning existing."
|
||||
)
|
||||
return existing_document
|
||||
else:
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Creating pending document for video: {video_id}",
|
||||
{"stage": "pending_document_creation"},
|
||||
)
|
||||
|
||||
document = Document(
|
||||
title=f"YouTube Video: {video_id}", # Placeholder title
|
||||
document_type=DocumentType.YOUTUBE_VIDEO,
|
||||
document_metadata={
|
||||
"url": url,
|
||||
"video_id": video_id,
|
||||
},
|
||||
content="Processing video...", # Placeholder content
|
||||
content_hash=unique_identifier_hash, # Temporary unique value
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation
|
||||
status=DocumentStatus.pending(), # PENDING status - visible in UI
|
||||
search_space_id=search_space_id,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
)
|
||||
session.add(document)
|
||||
await session.commit() # Document visible in UI now with pending status!
|
||||
is_new_document = True
|
||||
|
||||
logging.info(f"Created pending document for YouTube video {video_id}")
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Set to PROCESSING and do heavy work
|
||||
# =======================================================================
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit() # UI shows "processing" status
|
||||
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Fetching video metadata for: {video_id}",
|
||||
{"stage": "metadata_fetch"},
|
||||
)
|
||||
|
||||
# Fetch video metadata
|
||||
params = {
|
||||
"format": "json",
|
||||
"url": f"https://www.youtube.com/watch?v={video_id}",
|
||||
|
|
@ -130,6 +216,10 @@ async def add_youtube_video_document(
|
|||
):
|
||||
video_data = await response.json()
|
||||
|
||||
# Update title immediately for better UX (user sees actual title sooner)
|
||||
document.title = video_data.get("title", f"YouTube Video: {video_id}")
|
||||
await session.commit()
|
||||
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Video metadata fetched: {video_data.get('title', 'Unknown')}",
|
||||
|
|
@ -219,53 +309,28 @@ async def add_youtube_video_document(
|
|||
document_parts.append("</DOCUMENT>")
|
||||
combined_document_string = "\n".join(document_parts)
|
||||
|
||||
# Generate unique identifier hash for this YouTube video
|
||||
unique_identifier_hash = generate_unique_identifier_hash(
|
||||
DocumentType.YOUTUBE_VIDEO, video_id, search_space_id
|
||||
)
|
||||
|
||||
# Generate content hash
|
||||
content_hash = generate_content_hash(combined_document_string, search_space_id)
|
||||
|
||||
# Check if document with this unique identifier already exists
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Checking for existing video: {video_id}",
|
||||
{"stage": "duplicate_check", "video_id": video_id},
|
||||
)
|
||||
# For existing documents, check if content has changed
|
||||
if not is_new_document and existing_document.content_hash == content_hash:
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"YouTube video document unchanged: {video_data.get('title', 'YouTube Video')}",
|
||||
{
|
||||
"duplicate_detected": True,
|
||||
"existing_document_id": existing_document.id,
|
||||
"video_id": video_id,
|
||||
},
|
||||
)
|
||||
logging.info(
|
||||
f"Document for YouTube video {video_id} unchanged. Marking as ready."
|
||||
)
|
||||
document.status = DocumentStatus.ready()
|
||||
await session.commit()
|
||||
return document
|
||||
|
||||
existing_document = await check_document_by_unique_identifier(
|
||||
session, unique_identifier_hash
|
||||
)
|
||||
|
||||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"YouTube video document unchanged: {video_data.get('title', 'YouTube Video')}",
|
||||
{
|
||||
"duplicate_detected": True,
|
||||
"existing_document_id": existing_document.id,
|
||||
"video_id": video_id,
|
||||
},
|
||||
)
|
||||
logging.info(
|
||||
f"Document for YouTube video {video_id} unchanged. Skipping."
|
||||
)
|
||||
return existing_document
|
||||
else:
|
||||
# Content has changed - update the existing document
|
||||
logging.info(
|
||||
f"Content changed for YouTube video {video_id}. Updating document."
|
||||
)
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Updating YouTube video document: {video_data.get('title', 'YouTube Video')}",
|
||||
{"stage": "document_update", "video_id": video_id},
|
||||
)
|
||||
|
||||
# Get LLM for summary generation (needed for both create and update)
|
||||
# Get LLM for summary generation
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Preparing for summary generation: {video_data.get('title', 'YouTube Video')}",
|
||||
|
|
@ -287,7 +352,7 @@ async def add_youtube_video_document(
|
|||
)
|
||||
|
||||
# Generate summary with metadata
|
||||
document_metadata = {
|
||||
document_metadata_for_summary = {
|
||||
"url": url,
|
||||
"video_id": video_id,
|
||||
"title": video_data.get("title", "YouTube Video"),
|
||||
|
|
@ -297,7 +362,7 @@ async def add_youtube_video_document(
|
|||
"has_transcript": "No captions available" not in transcript_text,
|
||||
}
|
||||
summary_content, summary_embedding = await generate_document_summary(
|
||||
combined_document_string, user_llm, document_metadata
|
||||
combined_document_string, user_llm, document_metadata_for_summary
|
||||
)
|
||||
|
||||
# Process chunks
|
||||
|
|
@ -319,65 +384,33 @@ async def add_youtube_video_document(
|
|||
|
||||
chunks = await create_document_chunks(combined_document_string)
|
||||
|
||||
# Update or create document
|
||||
if existing_document:
|
||||
# Update existing document
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Updating YouTube video document in database: {video_data.get('title', 'YouTube Video')}",
|
||||
{"stage": "document_update", "chunks_count": len(chunks)},
|
||||
)
|
||||
# =======================================================================
|
||||
# PHASE 3: Update document to READY with all content
|
||||
# =======================================================================
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Finalizing document: {video_data.get('title', 'YouTube Video')}",
|
||||
{"stage": "document_finalization", "chunks_count": len(chunks)},
|
||||
)
|
||||
|
||||
existing_document.title = video_data.get("title", "YouTube Video")
|
||||
existing_document.content = summary_content
|
||||
existing_document.content_hash = content_hash
|
||||
existing_document.embedding = summary_embedding
|
||||
existing_document.document_metadata = {
|
||||
"url": url,
|
||||
"video_id": video_id,
|
||||
"video_title": video_data.get("title", "YouTube Video"),
|
||||
"author": video_data.get("author_name", "Unknown"),
|
||||
"thumbnail": video_data.get("thumbnail_url", ""),
|
||||
}
|
||||
existing_document.chunks = chunks
|
||||
existing_document.blocknote_document = blocknote_json
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
document.title = video_data.get("title", "YouTube Video")
|
||||
document.content = summary_content
|
||||
document.content_hash = content_hash
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = {
|
||||
"url": url,
|
||||
"video_id": video_id,
|
||||
"video_title": video_data.get("title", "YouTube Video"),
|
||||
"author": video_data.get("author_name", "Unknown"),
|
||||
"thumbnail": video_data.get("thumbnail_url", ""),
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
document.blocknote_document = blocknote_json
|
||||
document.status = DocumentStatus.ready() # READY status - fully processed
|
||||
document.updated_at = get_current_timestamp()
|
||||
|
||||
await session.commit()
|
||||
await session.refresh(existing_document)
|
||||
document = existing_document
|
||||
else:
|
||||
# Create new document
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Creating YouTube video document in database: {video_data.get('title', 'YouTube Video')}",
|
||||
{"stage": "document_creation", "chunks_count": len(chunks)},
|
||||
)
|
||||
|
||||
document = Document(
|
||||
title=video_data.get("title", "YouTube Video"),
|
||||
document_type=DocumentType.YOUTUBE_VIDEO,
|
||||
document_metadata={
|
||||
"url": url,
|
||||
"video_id": video_id,
|
||||
"video_title": video_data.get("title", "YouTube Video"),
|
||||
"author": video_data.get("author_name", "Unknown"),
|
||||
"thumbnail": video_data.get("thumbnail_url", ""),
|
||||
},
|
||||
content=summary_content,
|
||||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
search_space_id=search_space_id,
|
||||
content_hash=content_hash,
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
blocknote_document=blocknote_json,
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
await session.commit()
|
||||
await session.refresh(document)
|
||||
await session.commit()
|
||||
await session.refresh(document)
|
||||
|
||||
# Log success
|
||||
await task_logger.log_task_success(
|
||||
|
|
@ -395,27 +428,51 @@ async def add_youtube_video_document(
|
|||
)
|
||||
|
||||
return document
|
||||
|
||||
except SQLAlchemyError as db_error:
|
||||
await session.rollback()
|
||||
# Mark document as failed if it exists
|
||||
if document:
|
||||
try:
|
||||
document.status = DocumentStatus.failed(
|
||||
f"Database error: {str(db_error)[:150]}"
|
||||
)
|
||||
document.updated_at = get_current_timestamp()
|
||||
await session.commit()
|
||||
except Exception:
|
||||
await session.rollback()
|
||||
else:
|
||||
await session.rollback()
|
||||
|
||||
await task_logger.log_task_failure(
|
||||
log_entry,
|
||||
f"Database error while processing YouTube video: {url}",
|
||||
str(db_error),
|
||||
{
|
||||
"error_type": "SQLAlchemyError",
|
||||
"video_id": video_id if "video_id" in locals() else None,
|
||||
"video_id": video_id,
|
||||
},
|
||||
)
|
||||
raise db_error
|
||||
|
||||
except Exception as e:
|
||||
await session.rollback()
|
||||
# Mark document as failed if it exists
|
||||
if document:
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(e)[:200])
|
||||
document.updated_at = get_current_timestamp()
|
||||
await session.commit()
|
||||
except Exception:
|
||||
await session.rollback()
|
||||
else:
|
||||
await session.rollback()
|
||||
|
||||
await task_logger.log_task_failure(
|
||||
log_entry,
|
||||
f"Failed to process YouTube video: {url}",
|
||||
str(e),
|
||||
{
|
||||
"error_type": type(e).__name__,
|
||||
"video_id": video_id if "video_id" in locals() else None,
|
||||
"video_id": video_id,
|
||||
},
|
||||
)
|
||||
logging.error(f"Failed to process YouTube video: {e!s}")
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ import {
|
|||
llmPreferencesAtom,
|
||||
} from "@/atoms/new-llm-config/new-llm-config-query.atoms";
|
||||
import { activeSearchSpaceIdAtom } from "@/atoms/search-spaces/search-space-query.atoms";
|
||||
import { ConnectorIndicator } from "@/components/assistant-ui/connector-popup";
|
||||
import { DocumentUploadDialogProvider } from "@/components/assistant-ui/document-upload-popup";
|
||||
import { DashboardBreadcrumb } from "@/components/dashboard-breadcrumb";
|
||||
import { LayoutDataProvider } from "@/components/layout";
|
||||
|
|
@ -192,6 +193,8 @@ export function DashboardClientLayout({
|
|||
<LayoutDataProvider searchSpaceId={searchSpaceId} breadcrumb={<DashboardBreadcrumb />}>
|
||||
{children}
|
||||
</LayoutDataProvider>
|
||||
{/* Global connector dialog - triggered from documents page */}
|
||||
<ConnectorIndicator hideTrigger />
|
||||
</DocumentUploadDialogProvider>
|
||||
);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,10 +1,12 @@
|
|||
"use client";
|
||||
|
||||
import type React from "react";
|
||||
import { useRef, useState, useEffect } from "react";
|
||||
import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
|
||||
import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
|
||||
|
||||
export function getDocumentTypeIcon(type: string): React.ReactNode {
|
||||
return getConnectorIcon(type);
|
||||
export function getDocumentTypeIcon(type: string, className?: string): React.ReactNode {
|
||||
return getConnectorIcon(type, className);
|
||||
}
|
||||
|
||||
export function getDocumentTypeLabel(type: string): string {
|
||||
|
|
@ -15,16 +17,43 @@ export function getDocumentTypeLabel(type: string): string {
|
|||
}
|
||||
|
||||
export function DocumentTypeChip({ type, className }: { type: string; className?: string }) {
|
||||
const icon = getDocumentTypeIcon(type);
|
||||
return (
|
||||
<span
|
||||
className={
|
||||
"inline-flex items-center gap-1.5 rounded-full border border-border bg-primary/5 px-2 py-1 text-xs font-medium " +
|
||||
(className ?? "")
|
||||
const icon = getDocumentTypeIcon(type, "h-4 w-4");
|
||||
const fullLabel = getDocumentTypeLabel(type);
|
||||
const textRef = useRef<HTMLSpanElement>(null);
|
||||
const [isTruncated, setIsTruncated] = useState(false);
|
||||
|
||||
useEffect(() => {
|
||||
const checkTruncation = () => {
|
||||
if (textRef.current) {
|
||||
setIsTruncated(textRef.current.scrollWidth > textRef.current.clientWidth);
|
||||
}
|
||||
};
|
||||
checkTruncation();
|
||||
window.addEventListener("resize", checkTruncation);
|
||||
return () => window.removeEventListener("resize", checkTruncation);
|
||||
}, []);
|
||||
|
||||
const chip = (
|
||||
<span
|
||||
className={`inline-flex items-center gap-1.5 rounded bg-muted/40 px-2 py-1 text-xs text-muted-foreground max-w-full overflow-hidden ${className ?? ""}`}
|
||||
>
|
||||
<span className="text-primary">{icon}</span>
|
||||
{getDocumentTypeLabel(type)}
|
||||
<span className="opacity-80 flex-shrink-0">{icon}</span>
|
||||
<span ref={textRef} className="truncate min-w-0">
|
||||
{fullLabel}
|
||||
</span>
|
||||
</span>
|
||||
);
|
||||
|
||||
if (isTruncated) {
|
||||
return (
|
||||
<Tooltip>
|
||||
<TooltipTrigger asChild>{chip}</TooltipTrigger>
|
||||
<TooltipContent side="top" className="max-w-xs">
|
||||
<p>{fullLabel}</p>
|
||||
</TooltipContent>
|
||||
</Tooltip>
|
||||
);
|
||||
}
|
||||
|
||||
return chip;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,9 +1,21 @@
|
|||
"use client";
|
||||
|
||||
import { CircleAlert, CircleX, Columns3, Filter, ListFilter, Trash } from "lucide-react";
|
||||
import { AnimatePresence, motion, type Variants } from "motion/react";
|
||||
import { useSetAtom } from "jotai";
|
||||
import {
|
||||
CircleAlert,
|
||||
CircleX,
|
||||
FilePlus2,
|
||||
FileType,
|
||||
ListFilter,
|
||||
Search,
|
||||
SlidersHorizontal,
|
||||
Trash,
|
||||
} from "lucide-react";
|
||||
import { motion } from "motion/react";
|
||||
import { useTranslations } from "next-intl";
|
||||
import React, { useMemo, useRef } from "react";
|
||||
import React, { useMemo, useRef, useState } from "react";
|
||||
import { connectorDialogOpenAtom } from "@/atoms/connector-dialog/connector-dialog.atoms";
|
||||
import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup";
|
||||
import {
|
||||
AlertDialog,
|
||||
AlertDialogAction,
|
||||
|
|
@ -17,24 +29,10 @@ import {
|
|||
} from "@/components/ui/alert-dialog";
|
||||
import { Button } from "@/components/ui/button";
|
||||
import { Checkbox } from "@/components/ui/checkbox";
|
||||
import {
|
||||
DropdownMenu,
|
||||
DropdownMenuCheckboxItem,
|
||||
DropdownMenuContent,
|
||||
DropdownMenuLabel,
|
||||
DropdownMenuTrigger,
|
||||
} from "@/components/ui/dropdown-menu";
|
||||
import { Input } from "@/components/ui/input";
|
||||
import { Label } from "@/components/ui/label";
|
||||
import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover";
|
||||
import type { DocumentTypeEnum } from "@/contracts/types/document.types";
|
||||
import type { ColumnVisibility } from "./types";
|
||||
|
||||
const fadeInScale: Variants = {
|
||||
hidden: { opacity: 0, scale: 0.95 },
|
||||
visible: { opacity: 1, scale: 1, transition: { type: "spring", stiffness: 300, damping: 30 } },
|
||||
exit: { opacity: 0, scale: 0.95, transition: { duration: 0.15 } },
|
||||
};
|
||||
import { getDocumentTypeIcon, getDocumentTypeLabel } from "./DocumentTypeIcon";
|
||||
|
||||
export function DocumentsFilters({
|
||||
typeCounts: typeCountsRecord,
|
||||
|
|
@ -44,8 +42,6 @@ export function DocumentsFilters({
|
|||
onBulkDelete,
|
||||
onToggleType,
|
||||
activeTypes,
|
||||
columnVisibility,
|
||||
onToggleColumn,
|
||||
}: {
|
||||
typeCounts: Partial<Record<DocumentTypeEnum, number>>;
|
||||
selectedIds: Set<number>;
|
||||
|
|
@ -54,17 +50,27 @@ export function DocumentsFilters({
|
|||
onBulkDelete: () => Promise<void>;
|
||||
onToggleType: (type: DocumentTypeEnum, checked: boolean) => void;
|
||||
activeTypes: DocumentTypeEnum[];
|
||||
columnVisibility: ColumnVisibility;
|
||||
onToggleColumn: (id: keyof ColumnVisibility, checked: boolean) => void;
|
||||
}) {
|
||||
const t = useTranslations("documents");
|
||||
const id = React.useId();
|
||||
const inputRef = useRef<HTMLInputElement>(null);
|
||||
|
||||
// Dialog hooks for action buttons
|
||||
const { openDialog: openUploadDialog } = useDocumentUploadDialog();
|
||||
const setConnectorDialogOpen = useSetAtom(connectorDialogOpenAtom);
|
||||
|
||||
const [typeSearchQuery, setTypeSearchQuery] = useState("");
|
||||
|
||||
const uniqueTypes = useMemo(() => {
|
||||
return Object.keys(typeCountsRecord).sort() as DocumentTypeEnum[];
|
||||
}, [typeCountsRecord]);
|
||||
|
||||
const filteredTypes = useMemo(() => {
|
||||
if (!typeSearchQuery.trim()) return uniqueTypes;
|
||||
const query = typeSearchQuery.toLowerCase();
|
||||
return uniqueTypes.filter((type) => getDocumentTypeLabel(type).toLowerCase().includes(query));
|
||||
}, [uniqueTypes, typeSearchQuery]);
|
||||
|
||||
const typeCounts = useMemo(() => {
|
||||
const map = new Map<string, number>();
|
||||
for (const [type, count] of Object.entries(typeCountsRecord)) {
|
||||
|
|
@ -75,202 +81,233 @@ export function DocumentsFilters({
|
|||
|
||||
return (
|
||||
<motion.div
|
||||
className="flex flex-wrap items-center justify-start gap-3 w-full"
|
||||
className="flex flex-col gap-4"
|
||||
initial={{ opacity: 0, y: 10 }}
|
||||
animate={{ opacity: 1, y: 0 }}
|
||||
transition={{ type: "spring", stiffness: 300, damping: 30, delay: 0.1 }}
|
||||
>
|
||||
<div className="flex items-center gap-3 flex-wrap w-full sm:w-auto">
|
||||
{/* Main toolbar row */}
|
||||
<div className="flex flex-wrap items-center gap-3">
|
||||
{/* Action Buttons - Left Side */}
|
||||
<div className="flex items-center gap-2">
|
||||
<Button
|
||||
onClick={openUploadDialog}
|
||||
variant="outline"
|
||||
size="sm"
|
||||
className="h-9 gap-2 bg-white text-gray-700 border-white hover:bg-gray-50 dark:bg-white dark:text-gray-800 dark:hover:bg-gray-100"
|
||||
>
|
||||
<FilePlus2 size={16} />
|
||||
<span>Upload documents</span>
|
||||
</Button>
|
||||
<Button
|
||||
onClick={() => setConnectorDialogOpen(true)}
|
||||
variant="outline"
|
||||
size="sm"
|
||||
className="h-9 gap-2 bg-white text-gray-700 border-white hover:bg-gray-50 dark:bg-white dark:text-gray-800 dark:hover:bg-gray-100"
|
||||
>
|
||||
<SlidersHorizontal size={16} />
|
||||
<span>Manage connectors</span>
|
||||
</Button>
|
||||
</div>
|
||||
|
||||
{/* Spacer */}
|
||||
<div className="flex-1" />
|
||||
|
||||
{/* Search Input */}
|
||||
<motion.div
|
||||
className="relative w-full sm:w-auto"
|
||||
className="relative w-[180px]"
|
||||
initial={{ opacity: 0, y: -10 }}
|
||||
animate={{ opacity: 1, y: 0 }}
|
||||
transition={{ type: "spring", stiffness: 300, damping: 30 }}
|
||||
>
|
||||
<div className="pointer-events-none absolute inset-y-0 left-0 flex items-center pl-3 text-muted-foreground">
|
||||
<ListFilter size={14} aria-hidden="true" />
|
||||
</div>
|
||||
<Input
|
||||
id={`${id}-input`}
|
||||
ref={inputRef}
|
||||
className="peer w-full sm:min-w-60 ps-9"
|
||||
className="peer h-9 w-full pl-9 pr-9 text-sm bg-background border-border/60 focus-visible:ring-1 focus-visible:ring-ring/30"
|
||||
value={searchValue}
|
||||
onChange={(e) => onSearch(e.target.value)}
|
||||
placeholder={t("filter_placeholder")}
|
||||
placeholder="Filter by title"
|
||||
type="text"
|
||||
aria-label={t("filter_placeholder")}
|
||||
/>
|
||||
<motion.div
|
||||
className="pointer-events-none absolute inset-y-0 start-0 flex items-center justify-center ps-3 text-muted-foreground/80 peer-disabled:opacity-50"
|
||||
initial={{ scale: 0.8 }}
|
||||
animate={{ scale: 1 }}
|
||||
transition={{ delay: 0.1 }}
|
||||
>
|
||||
<ListFilter size={16} strokeWidth={2} aria-hidden="true" />
|
||||
</motion.div>
|
||||
{Boolean(searchValue) && (
|
||||
<motion.button
|
||||
className="absolute inset-y-0 end-0 flex h-full w-9 items-center justify-center rounded-e-lg text-muted-foreground/80 outline-offset-2 transition-colors hover:text-foreground focus:z-10 focus-visible:outline focus-visible:outline-ring/70"
|
||||
className="absolute inset-y-0 right-0 flex h-full w-9 items-center justify-center rounded-r-md text-muted-foreground/60 hover:text-foreground transition-colors"
|
||||
aria-label="Clear filter"
|
||||
onClick={() => {
|
||||
onSearch("");
|
||||
inputRef.current?.focus();
|
||||
}}
|
||||
initial={{ opacity: 0, rotate: -90 }}
|
||||
animate={{ opacity: 1, rotate: 0 }}
|
||||
exit={{ opacity: 0, rotate: 90 }}
|
||||
initial={{ opacity: 0, scale: 0.8 }}
|
||||
animate={{ opacity: 1, scale: 1 }}
|
||||
exit={{ opacity: 0, scale: 0.8 }}
|
||||
whileHover={{ scale: 1.1 }}
|
||||
whileTap={{ scale: 0.9 }}
|
||||
>
|
||||
<CircleX size={16} strokeWidth={2} aria-hidden="true" />
|
||||
<CircleX size={14} strokeWidth={2} aria-hidden="true" />
|
||||
</motion.button>
|
||||
)}
|
||||
</motion.div>
|
||||
|
||||
<Popover>
|
||||
<PopoverTrigger asChild>
|
||||
<motion.div
|
||||
whileHover={{ scale: 1.05 }}
|
||||
whileTap={{ scale: 0.95 }}
|
||||
transition={{ type: "spring", stiffness: 400, damping: 17 }}
|
||||
>
|
||||
<Button variant="outline">
|
||||
<Filter
|
||||
className="-ms-1 me-2 opacity-60"
|
||||
size={16}
|
||||
strokeWidth={2}
|
||||
aria-hidden="true"
|
||||
/>
|
||||
Type
|
||||
{/* Filter Buttons Group */}
|
||||
<div className="flex items-center gap-2 flex-wrap">
|
||||
{/* Type Filter */}
|
||||
<Popover>
|
||||
<PopoverTrigger asChild>
|
||||
<Button
|
||||
variant="outline"
|
||||
size="sm"
|
||||
className="h-9 gap-2 border-dashed border-border/60 text-muted-foreground hover:text-foreground hover:border-border"
|
||||
>
|
||||
<FileType size={14} className="text-muted-foreground" />
|
||||
<span className="hidden sm:inline">Type</span>
|
||||
{activeTypes.length > 0 && (
|
||||
<motion.span
|
||||
initial={{ scale: 0.8 }}
|
||||
animate={{ scale: 1 }}
|
||||
className="-me-1 ms-3 inline-flex h-5 max-h-full items-center rounded border border-border bg-background px-1 text-[0.625rem] font-medium text-muted-foreground/70"
|
||||
>
|
||||
<span className="flex h-5 w-5 items-center justify-center rounded-full bg-primary text-[10px] font-medium text-primary-foreground">
|
||||
{activeTypes.length}
|
||||
</motion.span>
|
||||
</span>
|
||||
)}
|
||||
</Button>
|
||||
</motion.div>
|
||||
</PopoverTrigger>
|
||||
<PopoverContent className="min-w-36 p-3" align="start">
|
||||
<motion.div initial="hidden" animate="visible" exit="exit" variants={fadeInScale}>
|
||||
<div className="space-y-3">
|
||||
<div className="text-xs font-medium text-muted-foreground">Filters</div>
|
||||
<div className="space-y-3">
|
||||
<AnimatePresence>
|
||||
{uniqueTypes.map((value: DocumentTypeEnum, i) => (
|
||||
<motion.div
|
||||
</PopoverTrigger>
|
||||
<PopoverContent className="w-64 !p-0 overflow-hidden" align="end">
|
||||
<div>
|
||||
{/* Search input */}
|
||||
<div className="p-2 border-b border-border/50">
|
||||
<div className="relative">
|
||||
<Search className="absolute left-0.5 top-1/2 -translate-y-1/2 h-4 w-4 text-muted-foreground" />
|
||||
<Input
|
||||
placeholder="Search types..."
|
||||
value={typeSearchQuery}
|
||||
onChange={(e) => setTypeSearchQuery(e.target.value)}
|
||||
className="h-6 pl-6 text-sm bg-transparent border-0 focus-visible:ring-0"
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div className="max-h-[300px] overflow-y-auto overflow-x-hidden py-1.5 px-1.5">
|
||||
{filteredTypes.length === 0 ? (
|
||||
<div className="py-6 text-center text-sm text-muted-foreground">
|
||||
No types found
|
||||
</div>
|
||||
) : (
|
||||
filteredTypes.map((value: DocumentTypeEnum, i) => (
|
||||
<div
|
||||
key={value}
|
||||
className="flex items-center gap-2"
|
||||
initial={{ opacity: 0, y: -5 }}
|
||||
animate={{ opacity: 1, y: 0 }}
|
||||
exit={{ opacity: 0, y: 5 }}
|
||||
transition={{ delay: i * 0.05 }}
|
||||
role="button"
|
||||
tabIndex={0}
|
||||
className="flex w-full items-center gap-2.5 py-2 px-3 rounded-md hover:bg-muted/50 transition-colors cursor-pointer text-left"
|
||||
onClick={() => onToggleType(value, !activeTypes.includes(value))}
|
||||
onKeyDown={(e) => {
|
||||
if (e.key === "Enter" || e.key === " ") {
|
||||
e.preventDefault();
|
||||
onToggleType(value, !activeTypes.includes(value));
|
||||
}
|
||||
}}
|
||||
>
|
||||
{/* Icon */}
|
||||
<div className="flex h-7 w-7 shrink-0 items-center justify-center rounded-md bg-muted/50 text-foreground/80">
|
||||
{getDocumentTypeIcon(value, "h-4 w-4")}
|
||||
</div>
|
||||
{/* Text content */}
|
||||
<div className="flex flex-col min-w-0 flex-1 gap-0.5">
|
||||
<span className="text-[13px] font-medium text-foreground truncate leading-tight">
|
||||
{getDocumentTypeLabel(value)}
|
||||
</span>
|
||||
<span className="text-[11px] text-muted-foreground leading-tight">
|
||||
{typeCounts.get(value)} document
|
||||
{(typeCounts.get(value) ?? 0) !== 1 ? "s" : ""}
|
||||
</span>
|
||||
</div>
|
||||
{/* Checkbox */}
|
||||
<Checkbox
|
||||
id={`${id}-${i}`}
|
||||
checked={activeTypes.includes(value)}
|
||||
onCheckedChange={(checked: boolean) => onToggleType(value, !!checked)}
|
||||
className="h-4 w-4 shrink-0 rounded border-muted-foreground/30 data-[state=checked]:bg-primary data-[state=checked]:border-primary"
|
||||
/>
|
||||
<Label
|
||||
htmlFor={`${id}-${i}`}
|
||||
className="flex grow justify-between gap-2 font-normal"
|
||||
>
|
||||
{value}{" "}
|
||||
<span className="ms-2 text-xs text-muted-foreground">
|
||||
{typeCounts.get(value)}
|
||||
</span>
|
||||
</Label>
|
||||
</motion.div>
|
||||
))}
|
||||
</AnimatePresence>
|
||||
</div>
|
||||
))
|
||||
)}
|
||||
</div>
|
||||
{activeTypes.length > 0 && (
|
||||
<div className="px-3 pt-1.5 pb-1.5 border-t border-border/50">
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="sm"
|
||||
className="w-full h-7 text-[11px] text-muted-foreground hover:text-foreground"
|
||||
onClick={() => {
|
||||
activeTypes.forEach((t) => {
|
||||
onToggleType(t, false);
|
||||
});
|
||||
}}
|
||||
>
|
||||
Clear filters
|
||||
</Button>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</motion.div>
|
||||
</PopoverContent>
|
||||
</Popover>
|
||||
</PopoverContent>
|
||||
</Popover>
|
||||
|
||||
<DropdownMenu>
|
||||
<DropdownMenuTrigger asChild>
|
||||
<motion.div
|
||||
whileHover={{ scale: 1.05 }}
|
||||
whileTap={{ scale: 0.95 }}
|
||||
transition={{ type: "spring", stiffness: 400, damping: 17 }}
|
||||
>
|
||||
<Button variant="outline">
|
||||
<Columns3
|
||||
className="-ms-1 me-2 opacity-60"
|
||||
size={16}
|
||||
strokeWidth={2}
|
||||
aria-hidden="true"
|
||||
/>
|
||||
View
|
||||
</Button>
|
||||
</motion.div>
|
||||
</DropdownMenuTrigger>
|
||||
<DropdownMenuContent align="end">
|
||||
<DropdownMenuLabel>Toggle columns</DropdownMenuLabel>
|
||||
{(
|
||||
[
|
||||
["title", "Title"],
|
||||
["document_type", "Type"],
|
||||
["content", "Content"],
|
||||
["created_at", "Created At"],
|
||||
] as Array<[keyof ColumnVisibility, string]>
|
||||
).map(([key, label]) => (
|
||||
<DropdownMenuCheckboxItem
|
||||
key={key}
|
||||
className="capitalize"
|
||||
checked={columnVisibility[key]}
|
||||
onCheckedChange={(v) => onToggleColumn(key, !!v)}
|
||||
onSelect={(e) => e.preventDefault()}
|
||||
>
|
||||
{label}
|
||||
</DropdownMenuCheckboxItem>
|
||||
))}
|
||||
</DropdownMenuContent>
|
||||
</DropdownMenu>
|
||||
</div>
|
||||
|
||||
<div className="flex items-center gap-3 w-full sm:w-auto sm:ml-auto">
|
||||
{selectedIds.size > 0 && (
|
||||
<AlertDialog>
|
||||
<AlertDialogTrigger asChild>
|
||||
<Button className="w-full sm:w-auto" variant="outline">
|
||||
<Trash
|
||||
className="-ms-1 me-2 opacity-60"
|
||||
size={16}
|
||||
strokeWidth={2}
|
||||
aria-hidden="true"
|
||||
/>
|
||||
Delete
|
||||
<span className="-me-1 ms-3 inline-flex h-5 max-h-full items-center rounded border border-border bg-background px-1 text-[0.625rem] font-medium text-muted-foreground/70">
|
||||
{selectedIds.size}
|
||||
</span>
|
||||
</Button>
|
||||
</AlertDialogTrigger>
|
||||
<AlertDialogContent>
|
||||
<div className="flex flex-col gap-2 max-sm:items-center sm:flex-row sm:gap-4">
|
||||
<div
|
||||
className="flex size-9 shrink-0 items-center justify-center rounded-full border border-border"
|
||||
aria-hidden="true"
|
||||
{/* Bulk Delete Button */}
|
||||
{selectedIds.size > 0 && (
|
||||
<AlertDialog>
|
||||
<AlertDialogTrigger asChild>
|
||||
<motion.div
|
||||
initial={{ opacity: 0, scale: 0.9 }}
|
||||
animate={{ opacity: 1, scale: 1 }}
|
||||
exit={{ opacity: 0, scale: 0.9 }}
|
||||
>
|
||||
<CircleAlert className="opacity-80" size={16} strokeWidth={2} />
|
||||
{/* Mobile: icon with count */}
|
||||
<Button variant="destructive" size="sm" className="h-9 gap-1.5 px-2.5 md:hidden">
|
||||
<Trash size={14} />
|
||||
<span className="flex h-5 w-5 items-center justify-center rounded-full bg-destructive-foreground/20 text-[10px] font-medium">
|
||||
{selectedIds.size}
|
||||
</span>
|
||||
</Button>
|
||||
{/* Desktop: full button */}
|
||||
<Button variant="destructive" size="sm" className="h-9 gap-2 hidden md:flex">
|
||||
<Trash size={14} />
|
||||
Delete
|
||||
<span className="flex h-5 w-5 items-center justify-center rounded-full bg-destructive-foreground/20 text-[10px] font-medium">
|
||||
{selectedIds.size}
|
||||
</span>
|
||||
</Button>
|
||||
</motion.div>
|
||||
</AlertDialogTrigger>
|
||||
<AlertDialogContent className="max-w-md">
|
||||
<div className="flex flex-col gap-2 sm:flex-row sm:gap-4">
|
||||
<div
|
||||
className="flex size-10 shrink-0 items-center justify-center rounded-full bg-destructive/10 text-destructive"
|
||||
aria-hidden="true"
|
||||
>
|
||||
<CircleAlert size={18} strokeWidth={2} />
|
||||
</div>
|
||||
<AlertDialogHeader className="flex-1">
|
||||
<AlertDialogTitle>
|
||||
Delete {selectedIds.size} document{selectedIds.size !== 1 ? "s" : ""}?
|
||||
</AlertDialogTitle>
|
||||
<AlertDialogDescription>
|
||||
This action cannot be undone. This will permanently delete the selected{" "}
|
||||
{selectedIds.size === 1 ? "document" : "documents"} from your search space.
|
||||
</AlertDialogDescription>
|
||||
</AlertDialogHeader>
|
||||
</div>
|
||||
<AlertDialogHeader>
|
||||
<AlertDialogTitle>Are you absolutely sure?</AlertDialogTitle>
|
||||
<AlertDialogDescription>
|
||||
This action cannot be undone. This will permanently delete {selectedIds.size}{" "}
|
||||
selected {selectedIds.size === 1 ? "row" : "rows"}.
|
||||
</AlertDialogDescription>
|
||||
</AlertDialogHeader>
|
||||
</div>
|
||||
<AlertDialogFooter>
|
||||
<AlertDialogCancel>Cancel</AlertDialogCancel>
|
||||
<AlertDialogAction onClick={onBulkDelete}>Delete</AlertDialogAction>
|
||||
</AlertDialogFooter>
|
||||
</AlertDialogContent>
|
||||
</AlertDialog>
|
||||
)}
|
||||
<AlertDialogFooter>
|
||||
<AlertDialogCancel>Cancel</AlertDialogCancel>
|
||||
<AlertDialogAction
|
||||
onClick={onBulkDelete}
|
||||
className="bg-destructive text-destructive-foreground hover:bg-destructive/90"
|
||||
>
|
||||
Delete
|
||||
</AlertDialogAction>
|
||||
</AlertDialogFooter>
|
||||
</AlertDialogContent>
|
||||
</AlertDialog>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
</motion.div>
|
||||
);
|
||||
|
|
|
|||
|
|
@ -1,14 +1,30 @@
|
|||
"use client";
|
||||
|
||||
import { ChevronDown, ChevronUp, FileX, Plus } from "lucide-react";
|
||||
import { formatDistanceToNow } from "date-fns";
|
||||
import {
|
||||
AlertCircle,
|
||||
Calendar,
|
||||
CheckCircle2,
|
||||
ChevronDown,
|
||||
ChevronUp,
|
||||
Clock,
|
||||
FileText,
|
||||
FileX,
|
||||
Loader2,
|
||||
Network,
|
||||
Plus,
|
||||
User,
|
||||
} from "lucide-react";
|
||||
import { motion } from "motion/react";
|
||||
import { useParams } from "next/navigation";
|
||||
import { useTranslations } from "next-intl";
|
||||
import React from "react";
|
||||
import React, { useRef, useState, useEffect, useCallback } from "react";
|
||||
import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup";
|
||||
import { DocumentViewer } from "@/components/document-viewer";
|
||||
import { JsonMetadataViewer } from "@/components/json-metadata-viewer";
|
||||
import { MarkdownViewer } from "@/components/markdown-viewer";
|
||||
import { Button } from "@/components/ui/button";
|
||||
import { Checkbox } from "@/components/ui/checkbox";
|
||||
import { Dialog, DialogContent, DialogHeader, DialogTitle } from "@/components/ui/dialog";
|
||||
import { Skeleton } from "@/components/ui/skeleton";
|
||||
import { Spinner } from "@/components/ui/spinner";
|
||||
import {
|
||||
Table,
|
||||
|
|
@ -19,9 +35,64 @@ import {
|
|||
TableRow,
|
||||
} from "@/components/ui/table";
|
||||
import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
|
||||
import { DocumentTypeChip, getDocumentTypeIcon } from "./DocumentTypeIcon";
|
||||
import { documentsApiService } from "@/lib/apis/documents-api.service";
|
||||
import { DocumentTypeChip } from "./DocumentTypeIcon";
|
||||
import { RowActions } from "./RowActions";
|
||||
import type { ColumnVisibility, Document } from "./types";
|
||||
import type { ColumnVisibility, Document, DocumentStatus } from "./types";
|
||||
|
||||
// Status indicator component for document processing status
|
||||
/**
 * Icon-with-tooltip badge describing a document's processing state.
 *
 * A missing `status` (or missing `state`) is treated as "ready". Each known
 * state renders one centered icon whose tooltip explains it; for "failed"
 * the tooltip shows `status.reason` when present.
 */
function StatusIndicator({ status }: { status?: DocumentStatus }) {
	// Shared wrapper: every state renders the same Tooltip skeleton and only
	// the glyph, the hint text and (for failures) the hint width differ.
	const renderBadge = (glyph: React.ReactNode, hint: React.ReactNode, hintClassName?: string) => (
		<Tooltip>
			<TooltipTrigger asChild>
				<div className="flex items-center justify-center">{glyph}</div>
			</TooltipTrigger>
			<TooltipContent side="top" className={hintClassName}>
				{hint}
			</TooltipContent>
		</Tooltip>
	);

	switch (status?.state ?? "ready") {
		case "pending":
			return renderBadge(
				<Clock className="h-5 w-5 text-muted-foreground/60" />,
				"Pending - waiting to be synced"
			);
		case "processing":
			return renderBadge(<Spinner size="sm" className="text-primary" />, "Syncing");
		case "failed":
			// Failure reasons can be long, so the tooltip width is capped.
			return renderBadge(
				<AlertCircle className="h-5 w-5 text-destructive" />,
				status?.reason || "Processing failed",
				"max-w-xs"
			);
		case "ready":
			return renderBadge(<CheckCircle2 className="h-5 w-5 text-muted-foreground/60" />, "Ready");
	}
}
|
||||
|
||||
export type SortKey = keyof Pick<Document, "title" | "document_type" | "created_at">;
|
||||
|
||||
|
|
@ -36,57 +107,215 @@ function sortDocuments(docs: Document[], key: SortKey, desc: boolean): Document[
|
|||
return desc ? sorted.reverse() : sorted;
|
||||
}
|
||||
|
||||
function truncate(text: string, len = 150): string {
|
||||
const plain = text
|
||||
.replace(/[#*_`>\-[\]()]+/g, " ")
|
||||
.replace(/\s+/g, " ")
|
||||
.trim();
|
||||
if (plain.length <= len) return plain;
|
||||
return `${plain.slice(0, len)}...`;
|
||||
function formatRelativeDate(dateStr: string): string {
|
||||
return formatDistanceToNow(new Date(dateStr), { addSuffix: true });
|
||||
}
|
||||
|
||||
/**
 * Format a date string as a long, absolute en-US timestamp with a 24-hour
 * clock (month name, day, year, hour and minute), in the runtime's local
 * time zone.
 *
 * @param dateStr - Any string accepted by the `Date` constructor.
 * @returns The formatted timestamp. The exact separators are chosen by the
 *          runtime's locale data; an unparseable input yields "Invalid Date".
 */
function formatAbsoluteDate(dateStr: string): string {
	const date = new Date(dateStr);
	return date.toLocaleString("en-US", {
		year: "numeric",
		month: "long",
		day: "numeric",
		hour: "2-digit",
		minute: "2-digit",
		// `hour12: false` lets en-US resolve to the h24 cycle on some engines,
		// which prints midnight as "24:xx". Pin the 00-23 cycle explicitly.
		hourCycle: "h23",
	});
}
|
||||
|
||||
/**
 * Render `text` in a span and, only when the span is visually clipped,
 * wrap it in a tooltip that shows the full text.
 *
 * Clipping is detected by comparing the span's `scrollWidth` with its
 * `clientWidth`, so the caller's `className` is expected to apply the
 * truncation styling.
 *
 * @param text - The text to display.
 * @param className - Optional classes forwarded to the span.
 */
function TruncatedText({ text, className }: { text: string; className?: string }) {
	const textRef = useRef<HTMLSpanElement>(null);
	const [isTruncated, setIsTruncated] = useState(false);

	// `text` and `className` are dependencies on purpose: either one can change
	// the rendered width, and with an empty dependency list the measurement
	// taken at mount would go stale when a row is re-used for different content.
	useEffect(() => {
		const checkTruncation = () => {
			const el = textRef.current;
			if (el) {
				setIsTruncated(el.scrollWidth > el.clientWidth);
			}
		};
		checkTruncation();
		// The available width also changes with the viewport.
		window.addEventListener("resize", checkTruncation);
		return () => window.removeEventListener("resize", checkTruncation);
	}, [text, className]);

	if (isTruncated) {
		return (
			<Tooltip>
				<TooltipTrigger asChild>
					<span ref={textRef} className={className}>
						{text}
					</span>
				</TooltipTrigger>
				<TooltipContent side="top" className="max-w-xs">
					<p className="break-words">{text}</p>
				</TooltipContent>
			</Tooltip>
		);
	}

	return (
		<span ref={textRef} className={className}>
			{text}
		</span>
	);
}
|
||||
|
||||
/**
 * Clickable column header that requests sorting by `sortKey`.
 *
 * Shows an optional leading icon, the header content, and a chevron. The
 * chevron is fully visible on the active column (pointing down when sorting
 * descending, up otherwise) and appears faded on hover for inactive columns.
 */
function SortableHeader({
	children,
	sortKey,
	currentSortKey,
	sortDesc,
	onSort,
	icon,
}: {
	children: React.ReactNode;
	sortKey: SortKey;
	currentSortKey: SortKey;
	sortDesc: boolean;
	onSort: (key: SortKey) => void;
	icon?: React.ReactNode;
}) {
	const active = currentSortKey === sortKey;
	// Only the active column in descending order points down.
	const Chevron = active && sortDesc ? ChevronDown : ChevronUp;
	const chevronVisibility = active ? "opacity-100" : "opacity-0 group-hover:opacity-50";
	const requestSort = () => onSort(sortKey);

	return (
		<button
			type="button"
			onClick={requestSort}
			className="flex items-center gap-1.5 text-left text-sm font-medium text-muted-foreground/70 hover:text-muted-foreground transition-colors group"
		>
			{icon && <span className="opacity-60">{icon}</span>}
			{children}
			<span className={`transition-opacity ${chevronVisibility}`}>
				<Chevron size={14} />
			</span>
		</button>
	);
}
|
||||
|
||||
export function DocumentsTableShell({
|
||||
documents,
|
||||
loading,
|
||||
error,
|
||||
onRefresh,
|
||||
selectedIds,
|
||||
setSelectedIds,
|
||||
columnVisibility,
|
||||
deleteDocument,
|
||||
sortKey,
|
||||
sortDesc,
|
||||
onSortChange,
|
||||
deleteDocument,
|
||||
searchSpaceId,
|
||||
}: {
|
||||
documents: Document[];
|
||||
loading: boolean;
|
||||
error: boolean;
|
||||
onRefresh: () => Promise<void>;
|
||||
selectedIds: Set<number>;
|
||||
setSelectedIds: (update: Set<number>) => void;
|
||||
columnVisibility: ColumnVisibility;
|
||||
deleteDocument: (id: number) => Promise<boolean>;
|
||||
sortKey: SortKey;
|
||||
sortDesc: boolean;
|
||||
onSortChange: (key: SortKey) => void;
|
||||
deleteDocument: (id: number) => Promise<boolean>;
|
||||
searchSpaceId: string;
|
||||
}) {
|
||||
const t = useTranslations("documents");
|
||||
const params = useParams();
|
||||
const searchSpaceId = params.search_space_id;
|
||||
const { openDialog } = useDocumentUploadDialog();
|
||||
|
||||
// State for metadata viewer (opened via Ctrl/Cmd+Click)
|
||||
// Real-time documents don't sync metadata - we fetch on-demand when viewing
|
||||
const [metadataDoc, setMetadataDoc] = useState<Document | null>(null);
|
||||
const [metadataContent, setMetadataContent] = useState<any>(null);
|
||||
const [metadataLoading, setMetadataLoading] = useState(false);
|
||||
|
||||
// State for lazy document content viewer
|
||||
// Real-time documents don't sync content - we fetch on-demand when viewing
|
||||
const [viewingDoc, setViewingDoc] = useState<Document | null>(null);
|
||||
const [viewingContent, setViewingContent] = useState<string>("");
|
||||
const [viewingLoading, setViewingLoading] = useState(false);
|
||||
|
||||
// Fetch document metadata on-demand when metadata viewer is opened
|
||||
const handleViewMetadata = useCallback(async (doc: Document) => {
|
||||
setMetadataDoc(doc);
|
||||
|
||||
// If metadata is already available (from API/search), use it directly
|
||||
if (doc.document_metadata) {
|
||||
setMetadataContent(doc.document_metadata);
|
||||
return;
|
||||
}
|
||||
|
||||
// Otherwise, fetch from API (lazy loading for real-time synced documents)
|
||||
setMetadataLoading(true);
|
||||
try {
|
||||
const fullDoc = await documentsApiService.getDocument({ id: doc.id });
|
||||
setMetadataContent(fullDoc.document_metadata);
|
||||
} catch (err) {
|
||||
console.error("[DocumentsTableShell] Failed to fetch document metadata:", err);
|
||||
setMetadataContent(null);
|
||||
} finally {
|
||||
setMetadataLoading(false);
|
||||
}
|
||||
}, []);
|
||||
|
||||
// Close metadata viewer
|
||||
const handleCloseMetadata = useCallback(() => {
|
||||
setMetadataDoc(null);
|
||||
setMetadataContent(null);
|
||||
setMetadataLoading(false);
|
||||
}, []);
|
||||
|
||||
// Fetch document content on-demand when viewer is opened
|
||||
const handleViewDocument = useCallback(async (doc: Document) => {
|
||||
setViewingDoc(doc);
|
||||
|
||||
// If content is already available (from API/search), use it directly
|
||||
if (doc.content) {
|
||||
setViewingContent(doc.content);
|
||||
return;
|
||||
}
|
||||
|
||||
// Otherwise, fetch from API (lazy loading for real-time synced documents)
|
||||
setViewingLoading(true);
|
||||
try {
|
||||
const fullDoc = await documentsApiService.getDocument({ id: doc.id });
|
||||
setViewingContent(fullDoc.content);
|
||||
} catch (err) {
|
||||
console.error("[DocumentsTableShell] Failed to fetch document content:", err);
|
||||
setViewingContent("Failed to load document content.");
|
||||
} finally {
|
||||
setViewingLoading(false);
|
||||
}
|
||||
}, []);
|
||||
|
||||
// Close document viewer
|
||||
const handleCloseViewer = useCallback(() => {
|
||||
setViewingDoc(null);
|
||||
setViewingContent("");
|
||||
setViewingLoading(false);
|
||||
}, []);
|
||||
|
||||
const sorted = React.useMemo(
|
||||
() => sortDocuments(documents, sortKey, sortDesc),
|
||||
[documents, sortKey, sortDesc]
|
||||
);
|
||||
|
||||
const allSelectedOnPage = sorted.length > 0 && sorted.every((d) => selectedIds.has(d.id));
|
||||
const someSelectedOnPage = sorted.some((d) => selectedIds.has(d.id)) && !allSelectedOnPage;
|
||||
// Helper: check if document can be selected (not processing/pending)
|
||||
const isSelectable = (doc: Document) => {
|
||||
const state = doc.status?.state;
|
||||
return state !== "pending" && state !== "processing";
|
||||
};
|
||||
|
||||
// Only consider selectable documents for "select all" logic
|
||||
const selectableDocs = sorted.filter(isSelectable);
|
||||
const allSelectedOnPage =
|
||||
selectableDocs.length > 0 && selectableDocs.every((d) => selectedIds.has(d.id));
|
||||
const someSelectedOnPage =
|
||||
selectableDocs.some((d) => selectedIds.has(d.id)) && !allSelectedOnPage;
|
||||
|
||||
const toggleAll = (checked: boolean) => {
|
||||
const next = new Set(selectedIds);
|
||||
if (checked)
|
||||
sorted.forEach((d) => {
|
||||
// Only select documents that are not processing/pending
|
||||
selectableDocs.forEach((d) => {
|
||||
next.add(d.id);
|
||||
});
|
||||
else
|
||||
|
|
@ -107,39 +336,139 @@ export function DocumentsTableShell({
|
|||
|
||||
return (
|
||||
<motion.div
|
||||
className="rounded-md border mt-6 overflow-hidden"
|
||||
className="rounded-lg border border-border/40 bg-background overflow-hidden"
|
||||
initial={{ opacity: 0, y: 20 }}
|
||||
animate={{ opacity: 1, y: 0 }}
|
||||
transition={{ type: "spring", stiffness: 300, damping: 30, delay: 0.2 }}
|
||||
>
|
||||
{loading ? (
|
||||
<div className="flex h-[400px] w-full items-center justify-center">
|
||||
<div className="flex flex-col items-center gap-2">
|
||||
<Spinner size="lg" className="text-primary" />
|
||||
<p className="text-sm text-muted-foreground">{t("loading")}</p>
|
||||
<>
|
||||
{/* Desktop Skeleton View */}
|
||||
<div className="hidden md:flex md:flex-col">
|
||||
<Table className="table-fixed w-full">
|
||||
<TableHeader>
|
||||
<TableRow className="hover:bg-transparent border-b border-border/40">
|
||||
<TableHead className="w-8 px-0 text-center">
|
||||
<div className="flex items-center justify-center h-full">
|
||||
<Skeleton className="h-4 w-4 rounded" />
|
||||
</div>
|
||||
</TableHead>
|
||||
<TableHead className="w-[35%] max-w-0 border-r border-border/40">
|
||||
<Skeleton className="h-3 w-20" />
|
||||
</TableHead>
|
||||
{columnVisibility.document_type && (
|
||||
<TableHead className="w-[20%] min-w-[120px] max-w-[200px] border-r border-border/40">
|
||||
<Skeleton className="h-3 w-14" />
|
||||
</TableHead>
|
||||
)}
|
||||
{columnVisibility.created_by && (
|
||||
<TableHead className="w-36 border-r border-border/40">
|
||||
<Skeleton className="h-3 w-10" />
|
||||
</TableHead>
|
||||
)}
|
||||
{columnVisibility.created_at && (
|
||||
<TableHead className="w-32 border-r border-border/40">
|
||||
<Skeleton className="h-3 w-16" />
|
||||
</TableHead>
|
||||
)}
|
||||
{columnVisibility.status && (
|
||||
<TableHead className="w-20 text-center">
|
||||
<Skeleton className="h-3 w-12 mx-auto" />
|
||||
</TableHead>
|
||||
)}
|
||||
<TableHead className="w-10">
|
||||
<span className="sr-only">Actions</span>
|
||||
</TableHead>
|
||||
</TableRow>
|
||||
</TableHeader>
|
||||
</Table>
|
||||
<div className="h-[50vh] overflow-auto">
|
||||
<Table className="table-fixed w-full">
|
||||
<TableBody>
|
||||
{[65, 80, 45, 72, 55, 88, 40, 60, 50, 75].map((widthPercent, index) => (
|
||||
<TableRow
|
||||
key={`skeleton-${index}`}
|
||||
className="border-b border-border/40 hover:bg-transparent"
|
||||
>
|
||||
<TableCell className="w-8 px-0 py-2.5 text-center">
|
||||
<div className="flex items-center justify-center h-full">
|
||||
<Skeleton className="h-4 w-4 rounded" />
|
||||
</div>
|
||||
</TableCell>
|
||||
<TableCell className="w-[35%] py-2.5 max-w-0 border-r border-border/40">
|
||||
<Skeleton className="h-4" style={{ width: `${widthPercent}%` }} />
|
||||
</TableCell>
|
||||
{columnVisibility.document_type && (
|
||||
<TableCell className="w-[20%] min-w-[120px] max-w-[200px] py-2.5 border-r border-border/40 overflow-hidden">
|
||||
<Skeleton className="h-5 w-24 rounded" />
|
||||
</TableCell>
|
||||
)}
|
||||
{columnVisibility.created_by && (
|
||||
<TableCell className="w-36 py-2.5 truncate border-r border-border/40">
|
||||
<Skeleton className="h-4 w-20" />
|
||||
</TableCell>
|
||||
)}
|
||||
{columnVisibility.created_at && (
|
||||
<TableCell className="w-32 py-2.5 border-r border-border/40">
|
||||
<Skeleton className="h-4 w-20" />
|
||||
</TableCell>
|
||||
)}
|
||||
{columnVisibility.status && (
|
||||
<TableCell className="w-20 py-2.5 text-center">
|
||||
<Skeleton className="h-5 w-5 mx-auto rounded-full" />
|
||||
</TableCell>
|
||||
)}
|
||||
<TableCell className="w-10 py-2.5 text-center">
|
||||
<Skeleton className="h-6 w-6 mx-auto rounded" />
|
||||
</TableCell>
|
||||
</TableRow>
|
||||
))}
|
||||
</TableBody>
|
||||
</Table>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{/* Mobile Skeleton View */}
|
||||
<div className="md:hidden divide-y divide-border/30 h-[50vh] overflow-auto">
|
||||
{[70, 85, 55, 78, 62, 90].map((widthPercent, index) => (
|
||||
<div key={`skeleton-mobile-${index}`} className="px-4 py-3">
|
||||
<div className="flex items-start gap-3">
|
||||
<Skeleton className="h-4 w-4 mt-0.5 rounded" />
|
||||
<div className="flex-1 min-w-0 space-y-2">
|
||||
<Skeleton className="h-4" style={{ width: `${widthPercent}%` }} />
|
||||
<div className="flex flex-wrap items-center gap-2">
|
||||
<Skeleton className="h-5 w-20 rounded" />
|
||||
{columnVisibility.created_by && <Skeleton className="h-3 w-14" />}
|
||||
{columnVisibility.created_at && <Skeleton className="h-3 w-20" />}
|
||||
</div>
|
||||
</div>
|
||||
<div className="flex items-center gap-2">
|
||||
{columnVisibility.status && <Skeleton className="h-5 w-5 rounded-full" />}
|
||||
<Skeleton className="h-7 w-7 rounded" />
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</>
|
||||
) : error ? (
|
||||
<div className="flex h-[400px] w-full items-center justify-center">
|
||||
<div className="flex flex-col items-center gap-2">
|
||||
<div className="flex h-[50vh] w-full items-center justify-center">
|
||||
<div className="flex flex-col items-center gap-3">
|
||||
<AlertCircle className="h-8 w-8 text-destructive/60" />
|
||||
<p className="text-sm text-destructive">{t("error_loading")}</p>
|
||||
<Button variant="outline" size="sm" onClick={() => onRefresh()} className="mt-2">
|
||||
{t("retry")}
|
||||
</Button>
|
||||
</div>
|
||||
</div>
|
||||
) : sorted.length === 0 ? (
|
||||
<div className="flex h-[400px] w-full items-center justify-center">
|
||||
<div className="flex h-[50vh] w-full items-center justify-center">
|
||||
<motion.div
|
||||
initial={{ opacity: 0, y: 20 }}
|
||||
animate={{ opacity: 1, y: 0 }}
|
||||
transition={{ duration: 0.4 }}
|
||||
className="flex flex-col items-center gap-4 max-w-md px-4 text-center"
|
||||
>
|
||||
<div className="rounded-full bg-muted p-4">
|
||||
<FileX className="h-8 w-8 text-muted-foreground" />
|
||||
<div className="rounded-full bg-muted/50 p-4">
|
||||
<FileX className="h-8 w-8 text-muted-foreground/60" />
|
||||
</div>
|
||||
<div className="space-y-2">
|
||||
<div className="space-y-1.5">
|
||||
<h3 className="text-lg font-semibold">{t("no_documents")}</h3>
|
||||
<p className="text-sm text-muted-foreground">
|
||||
Get started by uploading your first document.
|
||||
|
|
@ -153,234 +482,301 @@ export function DocumentsTableShell({
|
|||
</div>
|
||||
) : (
|
||||
<>
|
||||
<div className="hidden md:block max-h-[60vh] overflow-auto">
|
||||
{/* Desktop Table View - Notion Style */}
|
||||
<div className="hidden md:flex md:flex-col">
|
||||
{/* Fixed Header */}
|
||||
<Table className="table-fixed w-full">
|
||||
<TableHeader className="sticky top-0 bg-background">
|
||||
<TableRow className="hover:bg-transparent">
|
||||
<TableHead style={{ width: 28 }}>
|
||||
<Checkbox
|
||||
checked={allSelectedOnPage || (someSelectedOnPage && "indeterminate")}
|
||||
onCheckedChange={(v) => toggleAll(!!v)}
|
||||
aria-label="Select all"
|
||||
/>
|
||||
<TableHeader>
|
||||
<TableRow className="hover:bg-transparent border-b border-border/40">
|
||||
<TableHead className="w-8 px-0 text-center">
|
||||
<div className="flex items-center justify-center h-full">
|
||||
<Checkbox
|
||||
checked={allSelectedOnPage || (someSelectedOnPage && "indeterminate")}
|
||||
onCheckedChange={(v) => toggleAll(!!v)}
|
||||
aria-label="Select all"
|
||||
className="border-foreground data-[state=checked]:bg-primary data-[state=checked]:border-primary"
|
||||
/>
|
||||
</div>
|
||||
</TableHead>
|
||||
<TableHead className="w-[35%] border-r border-border/40">
|
||||
<SortableHeader
|
||||
sortKey="title"
|
||||
currentSortKey={sortKey}
|
||||
sortDesc={sortDesc}
|
||||
onSort={onSortHeader}
|
||||
icon={<FileText size={14} className="text-muted-foreground" />}
|
||||
>
|
||||
Document
|
||||
</SortableHeader>
|
||||
</TableHead>
|
||||
{columnVisibility.title && (
|
||||
<TableHead style={{ width: 250 }}>
|
||||
<Button
|
||||
variant="ghost"
|
||||
className="flex h-full w-full cursor-pointer select-none items-center justify-between gap-2"
|
||||
onClick={() => onSortHeader("title")}
|
||||
>
|
||||
{t("title")}
|
||||
{sortKey === "title" ? (
|
||||
sortDesc ? (
|
||||
<ChevronDown className="shrink-0 opacity-60" size={16} />
|
||||
) : (
|
||||
<ChevronUp className="shrink-0 opacity-60" size={16} />
|
||||
)
|
||||
) : null}
|
||||
</Button>
|
||||
</TableHead>
|
||||
)}
|
||||
{columnVisibility.document_type && (
|
||||
<TableHead style={{ width: 180 }}>
|
||||
<Button
|
||||
variant="ghost"
|
||||
className="flex h-full w-full cursor-pointer select-none items-center justify-between gap-2"
|
||||
onClick={() => onSortHeader("document_type")}
|
||||
<TableHead className="w-[20%] min-w-[120px] max-w-[200px] border-r border-border/40">
|
||||
<SortableHeader
|
||||
sortKey="document_type"
|
||||
currentSortKey={sortKey}
|
||||
sortDesc={sortDesc}
|
||||
onSort={onSortHeader}
|
||||
icon={<Network size={14} className="text-muted-foreground" />}
|
||||
>
|
||||
{t("type")}
|
||||
{sortKey === "document_type" ? (
|
||||
sortDesc ? (
|
||||
<ChevronDown className="shrink-0 opacity-60" size={16} />
|
||||
) : (
|
||||
<ChevronUp className="shrink-0 opacity-60" size={16} />
|
||||
)
|
||||
) : null}
|
||||
</Button>
|
||||
Source
|
||||
</SortableHeader>
|
||||
</TableHead>
|
||||
)}
|
||||
{columnVisibility.content && (
|
||||
<TableHead style={{ width: 300 }}>{t("content_summary")}</TableHead>
|
||||
{columnVisibility.created_by && (
|
||||
<TableHead className="w-36 border-r border-border/40">
|
||||
<span className="flex items-center gap-1.5 text-sm font-medium text-muted-foreground/70">
|
||||
<User size={14} className="opacity-60 text-muted-foreground" />
|
||||
User
|
||||
</span>
|
||||
</TableHead>
|
||||
)}
|
||||
{columnVisibility.created_at && (
|
||||
<TableHead style={{ width: 120 }}>
|
||||
<Button
|
||||
variant="ghost"
|
||||
className="flex h-full w-full cursor-pointer select-none items-center justify-between gap-2"
|
||||
onClick={() => onSortHeader("created_at")}
|
||||
<TableHead className="w-32 border-r border-border/40">
|
||||
<SortableHeader
|
||||
sortKey="created_at"
|
||||
currentSortKey={sortKey}
|
||||
sortDesc={sortDesc}
|
||||
onSort={onSortHeader}
|
||||
icon={<Calendar size={14} className="text-muted-foreground" />}
|
||||
>
|
||||
Created At
|
||||
{sortKey === "created_at" ? (
|
||||
sortDesc ? (
|
||||
<ChevronDown className="shrink-0 opacity-60" size={16} />
|
||||
) : (
|
||||
<ChevronUp className="shrink-0 opacity-60" size={16} />
|
||||
)
|
||||
) : null}
|
||||
</Button>
|
||||
Created
|
||||
</SortableHeader>
|
||||
</TableHead>
|
||||
)}
|
||||
<TableHead style={{ width: 60 }}>
|
||||
{columnVisibility.status && (
|
||||
<TableHead className="w-20 text-center">
|
||||
<span className="text-sm font-medium text-muted-foreground/70">Status</span>
|
||||
</TableHead>
|
||||
)}
|
||||
<TableHead className="w-10">
|
||||
<span className="sr-only">Actions</span>
|
||||
</TableHead>
|
||||
</TableRow>
|
||||
</TableHeader>
|
||||
<TableBody>
|
||||
{sorted.map((doc, index) => {
|
||||
const icon = getDocumentTypeIcon(doc.document_type);
|
||||
const title = doc.title;
|
||||
const truncatedTitle = title.length > 30 ? `${title.slice(0, 30)}...` : title;
|
||||
return (
|
||||
<motion.tr
|
||||
key={doc.id}
|
||||
initial={{ opacity: 0, y: 10 }}
|
||||
animate={{
|
||||
opacity: 1,
|
||||
y: 0,
|
||||
transition: {
|
||||
type: "spring",
|
||||
stiffness: 300,
|
||||
damping: 30,
|
||||
delay: index * 0.03,
|
||||
},
|
||||
}}
|
||||
exit={{ opacity: 0, y: -10 }}
|
||||
className="border-b transition-colors hover:bg-muted/50"
|
||||
>
|
||||
<TableCell className="px-4 py-3">
|
||||
<Checkbox
|
||||
checked={selectedIds.has(doc.id)}
|
||||
onCheckedChange={(v) => toggleOne(doc.id, !!v)}
|
||||
aria-label="Select row"
|
||||
/>
|
||||
</TableCell>
|
||||
{columnVisibility.title && (
|
||||
<TableCell className="px-4 py-3">
|
||||
<motion.div
|
||||
className="flex items-center gap-2 font-medium"
|
||||
whileHover={{ scale: 1.02 }}
|
||||
transition={{ type: "spring", stiffness: 300 }}
|
||||
style={{ display: "flex" }}
|
||||
</Table>
|
||||
{/* Scrollable Body */}
|
||||
<div className="h-[50vh] overflow-auto">
|
||||
<Table className="table-fixed w-full">
|
||||
<TableBody>
|
||||
{sorted.map((doc, index) => {
|
||||
const title = doc.title;
|
||||
const isSelected = selectedIds.has(doc.id);
|
||||
const canSelect = isSelectable(doc);
|
||||
return (
|
||||
<motion.tr
|
||||
key={doc.id}
|
||||
initial={{ opacity: 0 }}
|
||||
animate={{
|
||||
opacity: 1,
|
||||
transition: {
|
||||
duration: 0.2,
|
||||
delay: index * 0.02,
|
||||
},
|
||||
}}
|
||||
className={`border-b border-border/40 transition-colors ${
|
||||
isSelected ? "bg-primary/5 hover:bg-primary/8" : "hover:bg-muted/30"
|
||||
}`}
|
||||
>
|
||||
<TableCell className="w-8 px-0 py-2.5 text-center">
|
||||
<div className="flex items-center justify-center h-full">
|
||||
<Checkbox
|
||||
checked={isSelected}
|
||||
onCheckedChange={(v) => canSelect && toggleOne(doc.id, !!v)}
|
||||
disabled={!canSelect}
|
||||
aria-label={
|
||||
canSelect ? "Select row" : "Cannot select while processing"
|
||||
}
|
||||
className={`border-foreground data-[state=checked]:bg-primary data-[state=checked]:border-primary ${!canSelect ? "opacity-40 cursor-not-allowed" : ""}`}
|
||||
/>
|
||||
</div>
|
||||
</TableCell>
|
||||
<TableCell className="w-[35%] py-2.5 max-w-0 border-r border-border/40">
|
||||
<button
|
||||
type="button"
|
||||
className="block w-full text-left text-sm text-foreground hover:text-foreground transition-colors cursor-pointer bg-transparent border-0 p-0 truncate"
|
||||
onClick={(e) => {
|
||||
// Ctrl (Win/Linux) or Cmd (Mac) + Click opens metadata
|
||||
if (e.ctrlKey || e.metaKey) {
|
||||
e.preventDefault();
|
||||
e.stopPropagation();
|
||||
handleViewMetadata(doc);
|
||||
} else {
|
||||
// Normal click opens document viewer (lazy loads content)
|
||||
handleViewDocument(doc);
|
||||
}
|
||||
}}
|
||||
onKeyDown={(e) => {
|
||||
// Ctrl/Cmd + Enter opens metadata
|
||||
if ((e.ctrlKey || e.metaKey) && e.key === "Enter") {
|
||||
e.preventDefault();
|
||||
handleViewMetadata(doc);
|
||||
} else if (e.key === "Enter") {
|
||||
// Enter opens document viewer
|
||||
handleViewDocument(doc);
|
||||
}
|
||||
}}
|
||||
>
|
||||
<TruncatedText text={title} className="truncate block" />
|
||||
</button>
|
||||
</TableCell>
|
||||
{columnVisibility.document_type && (
|
||||
<TableCell className="w-[20%] min-w-[120px] max-w-[200px] py-2.5 border-r border-border/40 overflow-hidden">
|
||||
<DocumentTypeChip type={doc.document_type} />
|
||||
</TableCell>
|
||||
)}
|
||||
{columnVisibility.created_by && (
|
||||
<TableCell className="w-36 py-2.5 text-sm text-foreground truncate border-r border-border/40">
|
||||
{doc.created_by_name || "—"}
|
||||
</TableCell>
|
||||
)}
|
||||
{columnVisibility.created_at && (
|
||||
<TableCell className="w-32 py-2.5 text-sm text-foreground border-r border-border/40">
|
||||
<Tooltip>
|
||||
<TooltipTrigger asChild>
|
||||
<span className="flex items-center gap-2">
|
||||
<span className="text-muted-foreground shrink-0">{icon}</span>
|
||||
<span>{truncatedTitle}</span>
|
||||
<span className="cursor-default">
|
||||
{formatRelativeDate(doc.created_at)}
|
||||
</span>
|
||||
</TooltipTrigger>
|
||||
<TooltipContent>
|
||||
<p>{title}</p>
|
||||
<TooltipContent side="top">
|
||||
{formatAbsoluteDate(doc.created_at)}
|
||||
</TooltipContent>
|
||||
</Tooltip>
|
||||
</motion.div>
|
||||
</TableCell>
|
||||
)}
|
||||
{columnVisibility.status && (
|
||||
<TableCell className="w-20 py-2.5 text-center">
|
||||
<StatusIndicator status={doc.status} />
|
||||
</TableCell>
|
||||
)}
|
||||
<TableCell className="w-10 py-2.5 text-center">
|
||||
<RowActions
|
||||
document={doc}
|
||||
deleteDocument={deleteDocument}
|
||||
searchSpaceId={searchSpaceId}
|
||||
/>
|
||||
</TableCell>
|
||||
)}
|
||||
{columnVisibility.document_type && (
|
||||
<TableCell className="px-4 py-3">
|
||||
<div className="flex items-center gap-2">
|
||||
<DocumentTypeChip type={doc.document_type} />
|
||||
</div>
|
||||
</TableCell>
|
||||
)}
|
||||
{columnVisibility.content && (
|
||||
<TableCell className="px-4 py-3">
|
||||
<div className="flex flex-col gap-2">
|
||||
<div className="max-w-[300px] max-h-[60px] overflow-hidden text-sm text-muted-foreground">
|
||||
{truncate(doc.content)}
|
||||
</div>
|
||||
<DocumentViewer
|
||||
title={doc.title}
|
||||
content={doc.content}
|
||||
trigger={
|
||||
<Button variant="ghost" size="sm" className="w-fit text-xs">
|
||||
{t("view_full")}
|
||||
</Button>
|
||||
}
|
||||
/>
|
||||
</div>
|
||||
</TableCell>
|
||||
)}
|
||||
{columnVisibility.created_at && (
|
||||
<TableCell className="px-4 py-3">
|
||||
{new Date(doc.created_at).toLocaleDateString()}
|
||||
</TableCell>
|
||||
)}
|
||||
<TableCell className="px-4 py-3">
|
||||
<RowActions
|
||||
document={doc}
|
||||
deleteDocument={deleteDocument}
|
||||
refreshDocuments={async () => {
|
||||
await onRefresh();
|
||||
}}
|
||||
searchSpaceId={searchSpaceId as string}
|
||||
/>
|
||||
</TableCell>
|
||||
</motion.tr>
|
||||
);
|
||||
})}
|
||||
</TableBody>
|
||||
</Table>
|
||||
</motion.tr>
|
||||
);
|
||||
})}
|
||||
</TableBody>
|
||||
</Table>
|
||||
</div>
|
||||
</div>
|
||||
<div className="md:hidden divide-y">
|
||||
{sorted.map((doc) => {
|
||||
const icon = getDocumentTypeIcon(doc.document_type);
|
||||
|
||||
{/* Mobile Card View - Notion Style */}
|
||||
<div className="md:hidden divide-y divide-border/40 h-[50vh] overflow-auto">
|
||||
{sorted.map((doc, index) => {
|
||||
const isSelected = selectedIds.has(doc.id);
|
||||
const canSelect = isSelectable(doc);
|
||||
return (
|
||||
<div key={doc.id} className="p-3">
|
||||
<motion.div
|
||||
key={doc.id}
|
||||
initial={{ opacity: 0 }}
|
||||
animate={{ opacity: 1, transition: { delay: index * 0.03 } }}
|
||||
className={`px-4 py-3 transition-colors ${
|
||||
isSelected ? "bg-primary/5" : "hover:bg-muted/20"
|
||||
}`}
|
||||
>
|
||||
<div className="flex items-center gap-3">
|
||||
<Checkbox
|
||||
checked={selectedIds.has(doc.id)}
|
||||
onCheckedChange={(v) => toggleOne(doc.id, !!v)}
|
||||
aria-label="Select row"
|
||||
checked={isSelected}
|
||||
onCheckedChange={(v) => canSelect && toggleOne(doc.id, !!v)}
|
||||
disabled={!canSelect}
|
||||
aria-label={canSelect ? "Select row" : "Cannot select while processing"}
|
||||
className={`border-foreground data-[state=checked]:bg-primary data-[state=checked]:border-primary ${!canSelect ? "opacity-40 cursor-not-allowed" : ""}`}
|
||||
/>
|
||||
<div className="flex-1 min-w-0">
|
||||
<div className="flex items-center gap-2 min-w-0">
|
||||
<span className="text-muted-foreground shrink-0">{icon}</span>
|
||||
<div className="font-medium truncate">{doc.title}</div>
|
||||
</div>
|
||||
<div className="mt-1 flex flex-wrap items-center gap-2">
|
||||
<div className="flex-1 min-w-0 space-y-1.5">
|
||||
<button
|
||||
type="button"
|
||||
className="text-left text-sm text-foreground hover:text-foreground transition-colors cursor-pointer truncate block w-full bg-transparent border-0 p-0"
|
||||
onClick={(e) => {
|
||||
// Ctrl (Win/Linux) or Cmd (Mac) + Click opens metadata
|
||||
if (e.ctrlKey || e.metaKey) {
|
||||
e.preventDefault();
|
||||
e.stopPropagation();
|
||||
handleViewMetadata(doc);
|
||||
} else {
|
||||
// Normal click opens document viewer (lazy loads content)
|
||||
handleViewDocument(doc);
|
||||
}
|
||||
}}
|
||||
onKeyDown={(e) => {
|
||||
// Ctrl/Cmd + Enter opens metadata
|
||||
if ((e.ctrlKey || e.metaKey) && e.key === "Enter") {
|
||||
e.preventDefault();
|
||||
handleViewMetadata(doc);
|
||||
} else if (e.key === "Enter") {
|
||||
// Enter opens document viewer
|
||||
handleViewDocument(doc);
|
||||
}
|
||||
}}
|
||||
>
|
||||
{doc.title}
|
||||
</button>
|
||||
<div className="flex flex-wrap items-center gap-2">
|
||||
<DocumentTypeChip type={doc.document_type} />
|
||||
<span className="text-xs text-muted-foreground">
|
||||
{new Date(doc.created_at).toLocaleDateString()}
|
||||
</span>
|
||||
{columnVisibility.created_by && doc.created_by_name && (
|
||||
<span className="text-xs text-foreground">{doc.created_by_name}</span>
|
||||
)}
|
||||
{columnVisibility.created_at && (
|
||||
<Tooltip>
|
||||
<TooltipTrigger asChild>
|
||||
<span className="text-xs text-foreground cursor-default">
|
||||
{formatRelativeDate(doc.created_at)}
|
||||
</span>
|
||||
</TooltipTrigger>
|
||||
<TooltipContent side="top">
|
||||
{formatAbsoluteDate(doc.created_at)}
|
||||
</TooltipContent>
|
||||
</Tooltip>
|
||||
)}
|
||||
</div>
|
||||
{columnVisibility.content && (
|
||||
<div className="mt-2 text-sm text-muted-foreground">
|
||||
{truncate(doc.content)}
|
||||
<div className="mt-1">
|
||||
<DocumentViewer
|
||||
title={doc.title}
|
||||
content={doc.content}
|
||||
trigger={
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="sm"
|
||||
className="w-fit text-xs p-0 h-auto"
|
||||
>
|
||||
{t("view_full")}
|
||||
</Button>
|
||||
}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
<RowActions
|
||||
document={doc}
|
||||
deleteDocument={deleteDocument}
|
||||
refreshDocuments={async () => {
|
||||
await onRefresh();
|
||||
}}
|
||||
searchSpaceId={searchSpaceId as string}
|
||||
/>
|
||||
<div className="flex items-center gap-2">
|
||||
{columnVisibility.status && <StatusIndicator status={doc.status} />}
|
||||
<RowActions
|
||||
document={doc}
|
||||
deleteDocument={deleteDocument}
|
||||
searchSpaceId={searchSpaceId}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</motion.div>
|
||||
);
|
||||
})}
|
||||
</div>
|
||||
</>
|
||||
)}
|
||||
|
||||
{/* Metadata Viewer - opened via Ctrl/Cmd+Click on document title */}
|
||||
{/* Lazy loads metadata from API for real-time synced documents */}
|
||||
<JsonMetadataViewer
|
||||
title={metadataDoc?.title ?? ""}
|
||||
metadata={metadataContent}
|
||||
loading={metadataLoading}
|
||||
open={!!metadataDoc}
|
||||
onOpenChange={(open) => {
|
||||
if (!open) handleCloseMetadata();
|
||||
}}
|
||||
/>
|
||||
|
||||
{/* Document Content Viewer - lazy loads content on-demand */}
|
||||
<Dialog open={!!viewingDoc} onOpenChange={(open) => !open && handleCloseViewer()}>
|
||||
<DialogContent className="max-w-4xl max-h-[80vh] overflow-y-auto">
|
||||
<DialogHeader>
|
||||
<DialogTitle>{viewingDoc?.title}</DialogTitle>
|
||||
</DialogHeader>
|
||||
<div className="mt-4">
|
||||
{viewingLoading ? (
|
||||
<div className="flex items-center justify-center py-12">
|
||||
<Loader2 className="h-8 w-8 animate-spin text-muted-foreground" />
|
||||
</div>
|
||||
) : (
|
||||
<MarkdownViewer content={viewingContent} />
|
||||
)}
|
||||
</div>
|
||||
</DialogContent>
|
||||
</Dialog>
|
||||
</motion.div>
|
||||
);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,164 +2,89 @@
|
|||
|
||||
import { ChevronFirst, ChevronLast, ChevronLeft, ChevronRight } from "lucide-react";
|
||||
import { motion } from "motion/react";
|
||||
import { useTranslations } from "next-intl";
|
||||
import { Button } from "@/components/ui/button";
|
||||
import { Label } from "@/components/ui/label";
|
||||
import { Pagination, PaginationContent, PaginationItem } from "@/components/ui/pagination";
|
||||
import {
|
||||
Select,
|
||||
SelectContent,
|
||||
SelectItem,
|
||||
SelectTrigger,
|
||||
SelectValue,
|
||||
} from "@/components/ui/select";
|
||||
|
||||
const PAGE_SIZE = 50;
|
||||
|
||||
export function PaginationControls({
|
||||
pageIndex,
|
||||
pageSize,
|
||||
total,
|
||||
onPageSizeChange,
|
||||
onFirst,
|
||||
onPrev,
|
||||
onNext,
|
||||
onLast,
|
||||
canPrev,
|
||||
canNext,
|
||||
id,
|
||||
}: {
|
||||
pageIndex: number;
|
||||
pageSize: number;
|
||||
total: number;
|
||||
onPageSizeChange: (size: number) => void;
|
||||
onFirst: () => void;
|
||||
onPrev: () => void;
|
||||
onNext: () => void;
|
||||
onLast: () => void;
|
||||
canPrev: boolean;
|
||||
canNext: boolean;
|
||||
id: string;
|
||||
}) {
|
||||
const t = useTranslations("documents");
|
||||
const start = total === 0 ? 0 : pageIndex * pageSize + 1;
|
||||
const end = Math.min((pageIndex + 1) * pageSize, total);
|
||||
const start = pageIndex * PAGE_SIZE + 1;
|
||||
const end = Math.min((pageIndex + 1) * PAGE_SIZE, total);
|
||||
|
||||
return (
|
||||
<div className="flex items-center justify-between gap-8 mt-6">
|
||||
<motion.div
|
||||
className="flex items-center gap-3"
|
||||
initial={{ opacity: 0, x: -20 }}
|
||||
animate={{ opacity: 1, x: 0 }}
|
||||
transition={{ type: "spring", stiffness: 300, damping: 30 }}
|
||||
>
|
||||
<Label htmlFor={id} className="max-sm:sr-only">
|
||||
{t("rows_per_page")}
|
||||
</Label>
|
||||
<Select value={String(pageSize)} onValueChange={(v) => onPageSizeChange(Number(v))}>
|
||||
<SelectTrigger id={id} className="w-fit whitespace-nowrap">
|
||||
<SelectValue placeholder="Select number of results" />
|
||||
</SelectTrigger>
|
||||
<SelectContent>
|
||||
{[5, 10, 25, 50].map((s) => (
|
||||
<SelectItem key={s} value={String(s)}>
|
||||
{s}
|
||||
</SelectItem>
|
||||
))}
|
||||
</SelectContent>
|
||||
</Select>
|
||||
</motion.div>
|
||||
<motion.div
|
||||
className="flex items-center justify-end gap-3 py-3 px-2"
|
||||
initial={{ opacity: 0, y: 10 }}
|
||||
animate={{ opacity: 1, y: 0 }}
|
||||
transition={{ type: "spring", stiffness: 300, damping: 30, delay: 0.3 }}
|
||||
>
|
||||
{/* Range indicator */}
|
||||
<span className="text-sm text-muted-foreground tabular-nums">
|
||||
{start}-{end} of {total}
|
||||
</span>
|
||||
|
||||
<motion.div
|
||||
className="flex grow justify-end whitespace-nowrap text-sm text-muted-foreground"
|
||||
initial={{ opacity: 0 }}
|
||||
animate={{ opacity: 1 }}
|
||||
transition={{ delay: 0.2 }}
|
||||
>
|
||||
<p className="whitespace-nowrap text-sm text-muted-foreground" aria-live="polite">
|
||||
<span className="text-foreground">
|
||||
{start}-{end}
|
||||
</span>{" "}
|
||||
of <span className="text-foreground">{total}</span>
|
||||
</p>
|
||||
</motion.div>
|
||||
|
||||
<div>
|
||||
<Pagination>
|
||||
<PaginationContent>
|
||||
<PaginationItem>
|
||||
<motion.div
|
||||
whileHover={{ scale: 1.05 }}
|
||||
whileTap={{ scale: 0.95 }}
|
||||
transition={{ type: "spring", stiffness: 400, damping: 17 }}
|
||||
>
|
||||
<Button
|
||||
size="icon"
|
||||
variant="outline"
|
||||
className="disabled:pointer-events-none disabled:opacity-50"
|
||||
onClick={onFirst}
|
||||
disabled={!canPrev}
|
||||
aria-label="Go to first page"
|
||||
>
|
||||
<ChevronFirst size={16} strokeWidth={2} aria-hidden="true" />
|
||||
</Button>
|
||||
</motion.div>
|
||||
</PaginationItem>
|
||||
<PaginationItem>
|
||||
<motion.div
|
||||
whileHover={{ scale: 1.05 }}
|
||||
whileTap={{ scale: 0.95 }}
|
||||
transition={{ type: "spring", stiffness: 400, damping: 17 }}
|
||||
>
|
||||
<Button
|
||||
size="icon"
|
||||
variant="outline"
|
||||
className="disabled:pointer-events-none disabled:opacity-50"
|
||||
onClick={onPrev}
|
||||
disabled={!canPrev}
|
||||
aria-label="Go to previous page"
|
||||
>
|
||||
<ChevronLeft size={16} strokeWidth={2} aria-hidden="true" />
|
||||
</Button>
|
||||
</motion.div>
|
||||
</PaginationItem>
|
||||
<PaginationItem>
|
||||
<motion.div
|
||||
whileHover={{ scale: 1.05 }}
|
||||
whileTap={{ scale: 0.95 }}
|
||||
transition={{ type: "spring", stiffness: 400, damping: 17 }}
|
||||
>
|
||||
<Button
|
||||
size="icon"
|
||||
variant="outline"
|
||||
className="disabled:pointer-events-none disabled:opacity-50"
|
||||
onClick={onNext}
|
||||
disabled={!canNext}
|
||||
aria-label="Go to next page"
|
||||
>
|
||||
<ChevronRight size={16} strokeWidth={2} aria-hidden="true" />
|
||||
</Button>
|
||||
</motion.div>
|
||||
</PaginationItem>
|
||||
<PaginationItem>
|
||||
<motion.div
|
||||
whileHover={{ scale: 1.05 }}
|
||||
whileTap={{ scale: 0.95 }}
|
||||
transition={{ type: "spring", stiffness: 400, damping: 17 }}
|
||||
>
|
||||
<Button
|
||||
size="icon"
|
||||
variant="outline"
|
||||
className="disabled:pointer-events-none disabled:opacity-50"
|
||||
onClick={onLast}
|
||||
disabled={!canNext}
|
||||
aria-label="Go to last page"
|
||||
>
|
||||
<ChevronLast size={16} strokeWidth={2} aria-hidden="true" />
|
||||
</Button>
|
||||
</motion.div>
|
||||
</PaginationItem>
|
||||
</PaginationContent>
|
||||
</Pagination>
|
||||
{/* Navigation buttons */}
|
||||
<div className="flex items-center gap-1">
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="icon"
|
||||
className="h-8 w-8 disabled:opacity-40"
|
||||
onClick={onFirst}
|
||||
disabled={!canPrev}
|
||||
aria-label="Go to first page"
|
||||
>
|
||||
<ChevronFirst size={18} strokeWidth={2} />
|
||||
</Button>
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="icon"
|
||||
className="h-8 w-8 disabled:opacity-40"
|
||||
onClick={onPrev}
|
||||
disabled={!canPrev}
|
||||
aria-label="Go to previous page"
|
||||
>
|
||||
<ChevronLeft size={18} strokeWidth={2} />
|
||||
</Button>
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="icon"
|
||||
className="h-8 w-8 disabled:opacity-40"
|
||||
onClick={onNext}
|
||||
disabled={!canNext}
|
||||
aria-label="Go to next page"
|
||||
>
|
||||
<ChevronRight size={18} strokeWidth={2} />
|
||||
</Button>
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="icon"
|
||||
className="h-8 w-8 disabled:opacity-40"
|
||||
onClick={onLast}
|
||||
disabled={!canNext}
|
||||
aria-label="Go to last page"
|
||||
>
|
||||
<ChevronLast size={18} strokeWidth={2} />
|
||||
</Button>
|
||||
</div>
|
||||
</div>
|
||||
</motion.div>
|
||||
);
|
||||
}
|
||||
|
||||
export { PAGE_SIZE };
|
||||
|
|
|
|||
|
|
@ -1,11 +1,9 @@
|
|||
"use client";
|
||||
|
||||
import { FileText, MoreHorizontal, Pencil, Trash2 } from "lucide-react";
|
||||
import { motion } from "motion/react";
|
||||
import { MoreHorizontal, Pencil, Trash2 } from "lucide-react";
|
||||
import { useRouter } from "next/navigation";
|
||||
import { useState } from "react";
|
||||
import { toast } from "sonner";
|
||||
import { JsonMetadataViewer } from "@/components/json-metadata-viewer";
|
||||
import {
|
||||
AlertDialog,
|
||||
AlertDialogAction,
|
||||
|
|
@ -22,7 +20,6 @@ import {
|
|||
DropdownMenuItem,
|
||||
DropdownMenuTrigger,
|
||||
} from "@/components/ui/dropdown-menu";
|
||||
import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
|
||||
import type { Document } from "./types";
|
||||
|
||||
// Only FILE and NOTE document types can be edited
|
||||
|
|
@ -34,16 +31,13 @@ const NON_DELETABLE_DOCUMENT_TYPES = ["SURFSENSE_DOCS"] as const;
|
|||
export function RowActions({
|
||||
document,
|
||||
deleteDocument,
|
||||
refreshDocuments,
|
||||
searchSpaceId,
|
||||
}: {
|
||||
document: Document;
|
||||
deleteDocument: (id: number) => Promise<boolean>;
|
||||
refreshDocuments: () => Promise<void>;
|
||||
searchSpaceId: string;
|
||||
}) {
|
||||
const [isDeleteOpen, setIsDeleteOpen] = useState(false);
|
||||
const [isMetadataOpen, setIsMetadataOpen] = useState(false);
|
||||
const [isDeleting, setIsDeleting] = useState(false);
|
||||
const router = useRouter();
|
||||
|
||||
|
|
@ -51,20 +45,37 @@ export function RowActions({
|
|||
document.document_type as (typeof EDITABLE_DOCUMENT_TYPES)[number]
|
||||
);
|
||||
|
||||
const isDeletable = !NON_DELETABLE_DOCUMENT_TYPES.includes(
|
||||
// Documents in "pending" or "processing" state should show disabled delete
|
||||
const isBeingProcessed =
|
||||
document.status?.state === "pending" || document.status?.state === "processing";
|
||||
|
||||
// SURFSENSE_DOCS are system-managed and should not show delete at all
|
||||
const shouldShowDelete = !NON_DELETABLE_DOCUMENT_TYPES.includes(
|
||||
document.document_type as (typeof NON_DELETABLE_DOCUMENT_TYPES)[number]
|
||||
);
|
||||
|
||||
// Edit and Delete are disabled while processing
|
||||
const isEditDisabled = isBeingProcessed;
|
||||
const isDeleteDisabled = isBeingProcessed;
|
||||
|
||||
const handleDelete = async () => {
|
||||
setIsDeleting(true);
|
||||
try {
|
||||
const ok = await deleteDocument(document.id);
|
||||
if (ok) toast.success("Document deleted successfully");
|
||||
else toast.error("Failed to delete document");
|
||||
await refreshDocuments();
|
||||
} catch (error) {
|
||||
if (!ok) toast.error("Failed to delete document");
|
||||
// Note: Success toast is handled by the mutation atom's onSuccess callback
|
||||
// Cache is updated optimistically by the mutation, no need to refresh
|
||||
} catch (error: unknown) {
|
||||
console.error("Error deleting document:", error);
|
||||
toast.error("Failed to delete document");
|
||||
// Check for 409 Conflict (document started processing after UI loaded)
|
||||
const status =
|
||||
(error as { response?: { status?: number } })?.response?.status ??
|
||||
(error as { status?: number })?.status;
|
||||
if (status === 409) {
|
||||
toast.error("Document is now being processed. Please try again later.");
|
||||
} else {
|
||||
toast.error("Failed to delete document");
|
||||
}
|
||||
} finally {
|
||||
setIsDeleting(false);
|
||||
setIsDeleteOpen(false);
|
||||
|
|
@ -76,124 +87,121 @@ export function RowActions({
|
|||
};
|
||||
|
||||
return (
|
||||
<div className="flex items-center justify-end gap-1">
|
||||
<>
|
||||
{/* Desktop Actions */}
|
||||
<div className="hidden md:flex items-center gap-1">
|
||||
{isEditable && (
|
||||
<Tooltip>
|
||||
<TooltipTrigger asChild>
|
||||
<motion.div
|
||||
whileHover={{ scale: 1.1 }}
|
||||
whileTap={{ scale: 0.95 }}
|
||||
transition={{ type: "spring", stiffness: 400, damping: 17 }}
|
||||
>
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="icon"
|
||||
className="h-8 w-8 text-muted-foreground hover:text-foreground hover:bg-muted/80"
|
||||
onClick={handleEdit}
|
||||
>
|
||||
<Pencil className="h-4 w-4" />
|
||||
<span className="sr-only">Edit Document</span>
|
||||
</Button>
|
||||
</motion.div>
|
||||
</TooltipTrigger>
|
||||
<TooltipContent side="top">
|
||||
<p>Edit Document</p>
|
||||
</TooltipContent>
|
||||
</Tooltip>
|
||||
)}
|
||||
|
||||
<Tooltip>
|
||||
<TooltipTrigger asChild>
|
||||
<motion.div
|
||||
whileHover={{ scale: 1.1 }}
|
||||
whileTap={{ scale: 0.95 }}
|
||||
transition={{ type: "spring", stiffness: 400, damping: 17 }}
|
||||
>
|
||||
<div className="hidden md:inline-flex items-center justify-center">
|
||||
{isEditable ? (
|
||||
// Editable documents: show 3-dot dropdown with edit + delete
|
||||
<DropdownMenu>
|
||||
<DropdownMenuTrigger asChild>
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="icon"
|
||||
className="h-8 w-8 text-muted-foreground hover:text-foreground hover:bg-muted/80"
|
||||
onClick={() => setIsMetadataOpen(true)}
|
||||
>
|
||||
<FileText className="h-4 w-4" />
|
||||
<span className="sr-only">View Metadata</span>
|
||||
<MoreHorizontal className="h-4 w-4" />
|
||||
<span className="sr-only">Open menu</span>
|
||||
</Button>
|
||||
</motion.div>
|
||||
</TooltipTrigger>
|
||||
<TooltipContent side="top">
|
||||
<p>View Metadata</p>
|
||||
</TooltipContent>
|
||||
</Tooltip>
|
||||
|
||||
{isDeletable && (
|
||||
<Tooltip>
|
||||
<TooltipTrigger asChild>
|
||||
<motion.div
|
||||
whileHover={{ scale: 1.1 }}
|
||||
whileTap={{ scale: 0.95 }}
|
||||
transition={{ type: "spring", stiffness: 400, damping: 17 }}
|
||||
</DropdownMenuTrigger>
|
||||
<DropdownMenuContent align="end" className="w-40">
|
||||
<DropdownMenuItem
|
||||
onClick={() => !isEditDisabled && handleEdit()}
|
||||
disabled={isEditDisabled}
|
||||
className={
|
||||
isEditDisabled ? "text-muted-foreground cursor-not-allowed opacity-50" : ""
|
||||
}
|
||||
>
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="icon"
|
||||
className="h-8 w-8 text-muted-foreground hover:text-destructive hover:bg-destructive/10"
|
||||
onClick={() => setIsDeleteOpen(true)}
|
||||
disabled={isDeleting}
|
||||
<Pencil className="mr-2 h-4 w-4" />
|
||||
<span>Edit</span>
|
||||
</DropdownMenuItem>
|
||||
{shouldShowDelete && (
|
||||
<DropdownMenuItem
|
||||
onClick={() => !isDeleteDisabled && setIsDeleteOpen(true)}
|
||||
disabled={isDeleteDisabled}
|
||||
className={
|
||||
isDeleteDisabled
|
||||
? "text-muted-foreground cursor-not-allowed opacity-50"
|
||||
: "text-destructive focus:text-destructive"
|
||||
}
|
||||
>
|
||||
<Trash2 className="h-4 w-4" />
|
||||
<span className="sr-only">Delete</span>
|
||||
</Button>
|
||||
</motion.div>
|
||||
</TooltipTrigger>
|
||||
<TooltipContent side="top">
|
||||
<p>Delete</p>
|
||||
</TooltipContent>
|
||||
</Tooltip>
|
||||
<Trash2 className="mr-2 h-4 w-4" />
|
||||
<span>Delete</span>
|
||||
</DropdownMenuItem>
|
||||
)}
|
||||
</DropdownMenuContent>
|
||||
</DropdownMenu>
|
||||
) : (
|
||||
// Non-editable documents: show only delete button directly
|
||||
shouldShowDelete && (
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="icon"
|
||||
className={`h-8 w-8 ${isDeleteDisabled ? "text-muted-foreground cursor-not-allowed" : "text-muted-foreground hover:text-destructive hover:bg-destructive/10"}`}
|
||||
onClick={() => !isDeleteDisabled && setIsDeleteOpen(true)}
|
||||
disabled={isDeleting || isDeleteDisabled}
|
||||
>
|
||||
<Trash2 className="h-4 w-4" />
|
||||
<span className="sr-only">Delete</span>
|
||||
</Button>
|
||||
)
|
||||
)}
|
||||
</div>
|
||||
|
||||
{/* Mobile Actions Dropdown */}
|
||||
<div className="flex md:hidden">
|
||||
<DropdownMenu>
|
||||
<DropdownMenuTrigger asChild>
|
||||
<Button variant="ghost" size="icon" className="h-8 w-8 text-muted-foreground">
|
||||
<MoreHorizontal className="h-4 w-4" />
|
||||
<span className="sr-only">Open menu</span>
|
||||
</Button>
|
||||
</DropdownMenuTrigger>
|
||||
<DropdownMenuContent align="end" className="w-40">
|
||||
{isEditable && (
|
||||
<DropdownMenuItem onClick={handleEdit}>
|
||||
<div className="inline-flex md:hidden items-center justify-center">
|
||||
{isEditable ? (
|
||||
// Editable documents: show 3-dot dropdown
|
||||
<DropdownMenu>
|
||||
<DropdownMenuTrigger asChild>
|
||||
<Button variant="ghost" size="icon" className="h-8 w-8 text-muted-foreground">
|
||||
<MoreHorizontal className="h-4 w-4" />
|
||||
<span className="sr-only">Open menu</span>
|
||||
</Button>
|
||||
</DropdownMenuTrigger>
|
||||
<DropdownMenuContent align="end" className="w-40">
|
||||
<DropdownMenuItem
|
||||
onClick={() => !isEditDisabled && handleEdit()}
|
||||
disabled={isEditDisabled}
|
||||
className={
|
||||
isEditDisabled ? "text-muted-foreground cursor-not-allowed opacity-50" : ""
|
||||
}
|
||||
>
|
||||
<Pencil className="mr-2 h-4 w-4" />
|
||||
<span>Edit</span>
|
||||
</DropdownMenuItem>
|
||||
)}
|
||||
<DropdownMenuItem onClick={() => setIsMetadataOpen(true)}>
|
||||
<FileText className="mr-2 h-4 w-4" />
|
||||
<span>Metadata</span>
|
||||
</DropdownMenuItem>
|
||||
{isDeletable && (
|
||||
<DropdownMenuItem
|
||||
onClick={() => setIsDeleteOpen(true)}
|
||||
className="text-destructive focus:text-destructive"
|
||||
>
|
||||
<Trash2 className="mr-2 h-4 w-4" />
|
||||
<span>Delete</span>
|
||||
</DropdownMenuItem>
|
||||
)}
|
||||
</DropdownMenuContent>
|
||||
</DropdownMenu>
|
||||
{shouldShowDelete && (
|
||||
<DropdownMenuItem
|
||||
onClick={() => !isDeleteDisabled && setIsDeleteOpen(true)}
|
||||
disabled={isDeleteDisabled}
|
||||
className={
|
||||
isDeleteDisabled
|
||||
? "text-muted-foreground cursor-not-allowed opacity-50"
|
||||
: "text-destructive focus:text-destructive"
|
||||
}
|
||||
>
|
||||
<Trash2 className="mr-2 h-4 w-4" />
|
||||
<span>Delete</span>
|
||||
</DropdownMenuItem>
|
||||
)}
|
||||
</DropdownMenuContent>
|
||||
</DropdownMenu>
|
||||
) : (
|
||||
// Non-editable documents: show only delete button directly
|
||||
shouldShowDelete && (
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="icon"
|
||||
className={`h-8 w-8 ${isDeleteDisabled ? "text-muted-foreground cursor-not-allowed" : "text-muted-foreground hover:text-destructive hover:bg-destructive/10"}`}
|
||||
onClick={() => !isDeleteDisabled && setIsDeleteOpen(true)}
|
||||
disabled={isDeleting || isDeleteDisabled}
|
||||
>
|
||||
<Trash2 className="h-4 w-4" />
|
||||
<span className="sr-only">Delete</span>
|
||||
</Button>
|
||||
)
|
||||
)}
|
||||
</div>
|
||||
|
||||
<JsonMetadataViewer
|
||||
title={document.title}
|
||||
metadata={document.document_metadata}
|
||||
open={isMetadataOpen}
|
||||
onOpenChange={setIsMetadataOpen}
|
||||
/>
|
||||
|
||||
<AlertDialog open={isDeleteOpen} onOpenChange={setIsDeleteOpen}>
|
||||
<AlertDialogContent>
|
||||
<AlertDialogHeader>
|
||||
|
|
@ -214,6 +222,6 @@ export function RowActions({
|
|||
</AlertDialogFooter>
|
||||
</AlertDialogContent>
|
||||
</AlertDialog>
|
||||
</div>
|
||||
</>
|
||||
);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,18 +1,27 @@
|
|||
export type DocumentType = string;
|
||||
|
||||
export type DocumentStatus = {
|
||||
state: "ready" | "pending" | "processing" | "failed";
|
||||
reason?: string;
|
||||
};
|
||||
|
||||
export type Document = {
|
||||
id: number;
|
||||
title: string;
|
||||
document_type: DocumentType;
|
||||
document_metadata: any;
|
||||
content: string;
|
||||
// Optional: Only needed when viewing document details (lazy loaded)
|
||||
document_metadata?: any;
|
||||
content?: string;
|
||||
created_at: string;
|
||||
search_space_id: number;
|
||||
created_by_id?: string | null;
|
||||
created_by_name?: string | null;
|
||||
status?: DocumentStatus;
|
||||
};
|
||||
|
||||
export type ColumnVisibility = {
|
||||
title: boolean;
|
||||
document_type: boolean;
|
||||
content: boolean;
|
||||
created_by: boolean;
|
||||
created_at: boolean;
|
||||
status: boolean;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -2,22 +2,19 @@
|
|||
|
||||
import { useQuery } from "@tanstack/react-query";
|
||||
import { useAtomValue } from "jotai";
|
||||
import { RefreshCw, SquarePlus, Upload } from "lucide-react";
|
||||
import { motion } from "motion/react";
|
||||
import { useParams, useRouter } from "next/navigation";
|
||||
import { useParams } from "next/navigation";
|
||||
import { useTranslations } from "next-intl";
|
||||
import { useCallback, useEffect, useId, useMemo, useState } from "react";
|
||||
import { useCallback, useEffect, useMemo, useState } from "react";
|
||||
import { toast } from "sonner";
|
||||
import { deleteDocumentMutationAtom } from "@/atoms/documents/document-mutation.atoms";
|
||||
import { documentTypeCountsAtom } from "@/atoms/documents/document-query.atoms";
|
||||
import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup";
|
||||
import { Button } from "@/components/ui/button";
|
||||
import type { DocumentTypeEnum } from "@/contracts/types/document.types";
|
||||
import { useDocuments } from "@/hooks/use-documents";
|
||||
import { documentsApiService } from "@/lib/apis/documents-api.service";
|
||||
import { cacheKeys } from "@/lib/query-client/cache-keys";
|
||||
import { DocumentsFilters } from "./components/DocumentsFilters";
|
||||
import { DocumentsTableShell, type SortKey } from "./components/DocumentsTableShell";
|
||||
import { PaginationControls } from "./components/PaginationControls";
|
||||
import { PAGE_SIZE, PaginationControls } from "./components/PaginationControls";
|
||||
import type { ColumnVisibility } from "./components/types";
|
||||
|
||||
function useDebounced<T>(value: T, delay = 250) {
|
||||
|
|
@ -31,70 +28,48 @@ function useDebounced<T>(value: T, delay = 250) {
|
|||
|
||||
export default function DocumentsTable() {
|
||||
const t = useTranslations("documents");
|
||||
const id = useId();
|
||||
const params = useParams();
|
||||
const router = useRouter();
|
||||
const searchSpaceId = Number(params.search_space_id);
|
||||
const { openDialog: openUploadDialog } = useDocumentUploadDialog();
|
||||
|
||||
const handleNewNote = useCallback(() => {
|
||||
router.push(`/dashboard/${searchSpaceId}/editor/new`);
|
||||
}, [router, searchSpaceId]);
|
||||
|
||||
const [search, setSearch] = useState("");
|
||||
const debouncedSearch = useDebounced(search, 250);
|
||||
const [activeTypes, setActiveTypes] = useState<DocumentTypeEnum[]>([]);
|
||||
const [columnVisibility, setColumnVisibility] = useState<ColumnVisibility>({
|
||||
title: true,
|
||||
document_type: true,
|
||||
content: true,
|
||||
created_by: true,
|
||||
created_at: true,
|
||||
status: true,
|
||||
});
|
||||
const [pageIndex, setPageIndex] = useState(0);
|
||||
const [pageSize, setPageSize] = useState(50);
|
||||
const [sortKey, setSortKey] = useState<SortKey>("title");
|
||||
const [sortDesc, setSortDesc] = useState(false);
|
||||
const [sortKey, setSortKey] = useState<SortKey>("created_at");
|
||||
const [sortDesc, setSortDesc] = useState(true);
|
||||
const [selectedIds, setSelectedIds] = useState<Set<number>>(new Set());
|
||||
const { data: rawTypeCounts } = useAtomValue(documentTypeCountsAtom);
|
||||
const { mutateAsync: deleteDocumentMutation } = useAtomValue(deleteDocumentMutationAtom);
|
||||
|
||||
// Build query parameters for fetching documents
|
||||
const queryParams = useMemo(
|
||||
() => ({
|
||||
search_space_id: searchSpaceId,
|
||||
page: pageIndex,
|
||||
page_size: pageSize,
|
||||
...(activeTypes.length > 0 && { document_types: activeTypes }),
|
||||
}),
|
||||
[searchSpaceId, pageIndex, pageSize, activeTypes]
|
||||
);
|
||||
// REAL-TIME: Use Electric SQL hook for live document updates (when not searching)
|
||||
const {
|
||||
documents: realtimeDocuments,
|
||||
typeCounts: realtimeTypeCounts,
|
||||
loading: realtimeLoading,
|
||||
error: realtimeError,
|
||||
} = useDocuments(searchSpaceId, activeTypes);
|
||||
|
||||
// Build search query parameters
|
||||
// Check if we're in search mode
|
||||
const isSearchMode = !!debouncedSearch.trim();
|
||||
|
||||
// Build search query parameters (only used when searching)
|
||||
const searchQueryParams = useMemo(
|
||||
() => ({
|
||||
search_space_id: searchSpaceId,
|
||||
page: pageIndex,
|
||||
page_size: pageSize,
|
||||
page_size: PAGE_SIZE,
|
||||
title: debouncedSearch.trim(),
|
||||
...(activeTypes.length > 0 && { document_types: activeTypes }),
|
||||
}),
|
||||
[searchSpaceId, pageIndex, pageSize, activeTypes, debouncedSearch]
|
||||
[searchSpaceId, pageIndex, activeTypes, debouncedSearch]
|
||||
);
|
||||
|
||||
// Use query for fetching documents
|
||||
const {
|
||||
data: documentsResponse,
|
||||
isLoading: isDocumentsLoading,
|
||||
refetch: refetchDocuments,
|
||||
error: documentsError,
|
||||
} = useQuery({
|
||||
queryKey: cacheKeys.documents.globalQueryParams(queryParams),
|
||||
queryFn: () => documentsApiService.getDocuments({ queryParams }),
|
||||
staleTime: 3 * 60 * 1000, // 3 minutes
|
||||
enabled: !!searchSpaceId && !debouncedSearch.trim(),
|
||||
});
|
||||
|
||||
// Use query for searching documents
|
||||
// API search query (only enabled when searching - Electric doesn't do full-text search)
|
||||
const {
|
||||
data: searchResponse,
|
||||
isLoading: isSearchLoading,
|
||||
|
|
@ -103,134 +78,135 @@ export default function DocumentsTable() {
|
|||
} = useQuery({
|
||||
queryKey: cacheKeys.documents.globalQueryParams(searchQueryParams),
|
||||
queryFn: () => documentsApiService.searchDocuments({ queryParams: searchQueryParams }),
|
||||
staleTime: 3 * 60 * 1000, // 3 minutes
|
||||
enabled: !!searchSpaceId && !!debouncedSearch.trim(),
|
||||
staleTime: 30 * 1000, // 30 seconds for search (shorter since it's on-demand)
|
||||
enabled: !!searchSpaceId && isSearchMode,
|
||||
});
|
||||
|
||||
// Determine if we should show SurfSense docs (when no type filter or SURFSENSE_DOCS is selected)
|
||||
const showSurfsenseDocs =
|
||||
activeTypes.length === 0 || activeTypes.includes("SURFSENSE_DOCS" as DocumentTypeEnum);
|
||||
// Client-side sorting for real-time documents
|
||||
const sortedRealtimeDocuments = useMemo(() => {
|
||||
const docs = [...realtimeDocuments];
|
||||
docs.sort((a, b) => {
|
||||
const av = a[sortKey] ?? "";
|
||||
const bv = b[sortKey] ?? "";
|
||||
let cmp: number;
|
||||
if (sortKey === "created_at") {
|
||||
cmp = new Date(av as string).getTime() - new Date(bv as string).getTime();
|
||||
} else {
|
||||
cmp = String(av).localeCompare(String(bv));
|
||||
}
|
||||
return sortDesc ? -cmp : cmp;
|
||||
});
|
||||
return docs;
|
||||
}, [realtimeDocuments, sortKey, sortDesc]);
|
||||
|
||||
// Use query for fetching SurfSense docs
|
||||
const {
|
||||
data: surfsenseDocsResponse,
|
||||
isLoading: isSurfsenseDocsLoading,
|
||||
refetch: refetchSurfsenseDocs,
|
||||
} = useQuery({
|
||||
queryKey: ["surfsense-docs", debouncedSearch, pageIndex, pageSize],
|
||||
queryFn: () =>
|
||||
documentsApiService.getSurfsenseDocs({
|
||||
queryParams: {
|
||||
page: pageIndex,
|
||||
page_size: pageSize,
|
||||
title: debouncedSearch.trim() || undefined,
|
||||
},
|
||||
}),
|
||||
staleTime: 3 * 60 * 1000, // 3 minutes
|
||||
enabled: showSurfsenseDocs,
|
||||
});
|
||||
// Client-side pagination for real-time documents
|
||||
const paginatedRealtimeDocuments = useMemo(() => {
|
||||
const start = pageIndex * PAGE_SIZE;
|
||||
const end = start + PAGE_SIZE;
|
||||
return sortedRealtimeDocuments.slice(start, end);
|
||||
}, [sortedRealtimeDocuments, pageIndex]);
|
||||
|
||||
// Transform SurfSense docs to match the Document type
|
||||
const surfsenseDocsAsDocuments: Document[] = useMemo(() => {
|
||||
if (!surfsenseDocsResponse?.items) return [];
|
||||
return surfsenseDocsResponse.items.map((doc) => ({
|
||||
id: doc.id,
|
||||
title: doc.title,
|
||||
document_type: "SURFSENSE_DOCS",
|
||||
document_metadata: { source: doc.source },
|
||||
content: doc.content,
|
||||
created_at: new Date().toISOString(),
|
||||
search_space_id: -1, // Special value for global docs
|
||||
}));
|
||||
}, [surfsenseDocsResponse]);
|
||||
// Determine what to display based on search mode
|
||||
const displayDocs = isSearchMode
|
||||
? (searchResponse?.items || []).map((item) => ({
|
||||
id: item.id,
|
||||
search_space_id: item.search_space_id,
|
||||
document_type: item.document_type,
|
||||
title: item.title,
|
||||
created_by_id: item.created_by_id ?? null,
|
||||
created_by_name: item.created_by_name ?? null,
|
||||
created_at: item.created_at,
|
||||
status: (
|
||||
item as {
|
||||
status?: { state: "ready" | "pending" | "processing" | "failed"; reason?: string };
|
||||
}
|
||||
).status ?? { state: "ready" as const },
|
||||
}))
|
||||
: paginatedRealtimeDocuments;
|
||||
|
||||
// Merge type counts with SURFSENSE_DOCS count
|
||||
const typeCounts = useMemo(() => {
|
||||
const counts = { ...(rawTypeCounts || {}) };
|
||||
if (surfsenseDocsResponse?.total) {
|
||||
counts.SURFSENSE_DOCS = surfsenseDocsResponse.total;
|
||||
}
|
||||
return counts;
|
||||
}, [rawTypeCounts, surfsenseDocsResponse?.total]);
|
||||
const displayTotal = isSearchMode ? searchResponse?.total || 0 : sortedRealtimeDocuments.length;
|
||||
|
||||
// Extract documents and total based on search state
|
||||
const documents = debouncedSearch.trim()
|
||||
? searchResponse?.items || []
|
||||
: documentsResponse?.items || [];
|
||||
const total = debouncedSearch.trim() ? searchResponse?.total || 0 : documentsResponse?.total || 0;
|
||||
const loading = isSearchMode ? isSearchLoading : realtimeLoading;
|
||||
const error = isSearchMode ? searchError : realtimeError;
|
||||
|
||||
const loading = debouncedSearch.trim() ? isSearchLoading : isDocumentsLoading;
|
||||
const error = debouncedSearch.trim() ? searchError : documentsError;
|
||||
|
||||
// Display results directly
|
||||
const displayDocs = documents;
|
||||
const displayTotal = total;
|
||||
const pageStart = pageIndex * pageSize;
|
||||
const pageEnd = Math.min(pageStart + pageSize, displayTotal);
|
||||
const pageEnd = Math.min((pageIndex + 1) * PAGE_SIZE, displayTotal);
|
||||
|
||||
const onToggleType = (type: DocumentTypeEnum, checked: boolean) => {
|
||||
setActiveTypes((prev) => (checked ? [...prev, type] : prev.filter((t) => t !== type)));
|
||||
setActiveTypes((prev) => {
|
||||
if (checked) {
|
||||
return prev.includes(type) ? prev : [...prev, type];
|
||||
} else {
|
||||
return prev.filter((t) => t !== type);
|
||||
}
|
||||
});
|
||||
setPageIndex(0);
|
||||
};
|
||||
|
||||
const onToggleColumn = (id: keyof ColumnVisibility, checked: boolean) => {
|
||||
setColumnVisibility((prev) => ({ ...prev, [id]: checked }));
|
||||
};
|
||||
|
||||
const [isRefreshing, setIsRefreshing] = useState(false);
|
||||
|
||||
const refreshCurrentView = useCallback(async () => {
|
||||
if (isRefreshing) return;
|
||||
setIsRefreshing(true);
|
||||
try {
|
||||
if (debouncedSearch.trim()) {
|
||||
await refetchSearch();
|
||||
} else {
|
||||
await refetchDocuments();
|
||||
}
|
||||
toast.success(t("refresh_success") || "Documents refreshed");
|
||||
} finally {
|
||||
setIsRefreshing(false);
|
||||
}
|
||||
}, [debouncedSearch, refetchSearch, refetchDocuments, t, isRefreshing]);
|
||||
|
||||
// Create a delete function for single document deletion
|
||||
const deleteDocument = useCallback(
|
||||
async (id: number) => {
|
||||
try {
|
||||
await deleteDocumentMutation({ id });
|
||||
return true;
|
||||
} catch (error) {
|
||||
console.error("Failed to delete document:", error);
|
||||
return false;
|
||||
}
|
||||
},
|
||||
[deleteDocumentMutation]
|
||||
);
|
||||
|
||||
const onBulkDelete = async () => {
|
||||
if (selectedIds.size === 0) {
|
||||
toast.error(t("no_rows_selected"));
|
||||
return;
|
||||
}
|
||||
|
||||
// Filter out pending/processing documents - they cannot be deleted
|
||||
// For real-time mode, use sortedRealtimeDocuments (which has status)
|
||||
// For search mode, use searchResponse items (need to safely access status)
|
||||
const allDocs = isSearchMode
|
||||
? (searchResponse?.items || []).map((item) => ({
|
||||
id: item.id,
|
||||
status: (item as { status?: { state: string } }).status,
|
||||
}))
|
||||
: sortedRealtimeDocuments.map((doc) => ({ id: doc.id, status: doc.status }));
|
||||
|
||||
const selectedDocs = allDocs.filter((doc) => selectedIds.has(doc.id));
|
||||
const deletableIds = selectedDocs
|
||||
.filter((doc) => doc.status?.state !== "pending" && doc.status?.state !== "processing")
|
||||
.map((doc) => doc.id);
|
||||
const inProgressCount = selectedIds.size - deletableIds.length;
|
||||
|
||||
if (inProgressCount > 0) {
|
||||
toast.warning(
|
||||
`${inProgressCount} document(s) are pending or processing and cannot be deleted.`
|
||||
);
|
||||
}
|
||||
|
||||
if (deletableIds.length === 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
// Delete documents one by one using the mutation
|
||||
// Track 409 conflicts separately (document started processing after UI loaded)
|
||||
let conflictCount = 0;
|
||||
const results = await Promise.all(
|
||||
Array.from(selectedIds).map(async (id) => {
|
||||
deletableIds.map(async (id) => {
|
||||
try {
|
||||
await deleteDocumentMutation({ id });
|
||||
return true;
|
||||
} catch {
|
||||
} catch (error: unknown) {
|
||||
const status =
|
||||
(error as { response?: { status?: number } })?.response?.status ??
|
||||
(error as { status?: number })?.status;
|
||||
if (status === 409) conflictCount++;
|
||||
return false;
|
||||
}
|
||||
})
|
||||
);
|
||||
const okCount = results.filter((r) => r === true).length;
|
||||
if (okCount === selectedIds.size)
|
||||
if (okCount === deletableIds.length) {
|
||||
toast.success(t("delete_success_count", { count: okCount }));
|
||||
else toast.error(t("delete_partial_failed"));
|
||||
// Refetch the current page with appropriate method
|
||||
await refreshCurrentView();
|
||||
} else if (conflictCount > 0) {
|
||||
toast.error(`${conflictCount} document(s) started processing. Please try again later.`);
|
||||
} else {
|
||||
toast.error(t("delete_partial_failed"));
|
||||
}
|
||||
|
||||
// If in search mode, refetch search results to reflect deletion
|
||||
if (isSearchMode) {
|
||||
await refetchSearch();
|
||||
}
|
||||
// Real-time mode: Electric will sync the deletion automatically
|
||||
|
||||
setSelectedIds(new Set());
|
||||
} catch (e) {
|
||||
console.error(e);
|
||||
|
|
@ -238,10 +214,47 @@ export default function DocumentsTable() {
|
|||
}
|
||||
};
|
||||
|
||||
// Single document delete handler for RowActions
|
||||
const handleDeleteDocument = useCallback(
|
||||
async (id: number): Promise<boolean> => {
|
||||
try {
|
||||
await deleteDocumentMutation({ id });
|
||||
toast.success(t("delete_success") || "Document deleted");
|
||||
// If in search mode, refetch search results to reflect deletion
|
||||
if (isSearchMode) {
|
||||
await refetchSearch();
|
||||
}
|
||||
// Real-time mode: Electric will sync the deletion automatically
|
||||
return true;
|
||||
} catch (e) {
|
||||
console.error("Error deleting document:", e);
|
||||
return false;
|
||||
}
|
||||
},
|
||||
[deleteDocumentMutation, isSearchMode, refetchSearch, t]
|
||||
);
|
||||
|
||||
const handleSortChange = useCallback((key: SortKey) => {
|
||||
setSortKey((currentKey) => {
|
||||
if (currentKey === key) {
|
||||
setSortDesc((v) => !v);
|
||||
return currentKey;
|
||||
}
|
||||
setSortDesc(false);
|
||||
return key;
|
||||
});
|
||||
}, []);
|
||||
|
||||
// Reset page when search changes (type filter already resets via onToggleType)
|
||||
// biome-ignore lint/correctness/useExhaustiveDependencies: Intentionally reset page on search change
|
||||
useEffect(() => {
|
||||
setPageIndex(0);
|
||||
}, [debouncedSearch]);
|
||||
|
||||
useEffect(() => {
|
||||
const mq = window.matchMedia("(max-width: 768px)");
|
||||
const apply = (isSmall: boolean) => {
|
||||
setColumnVisibility((prev) => ({ ...prev, content: !isSmall, created_at: !isSmall }));
|
||||
setColumnVisibility((prev) => ({ ...prev, created_by: !isSmall, created_at: !isSmall }));
|
||||
};
|
||||
apply(mq.matches);
|
||||
const onChange = (e: MediaQueryListEvent) => apply(e.matches);
|
||||
|
|
@ -254,81 +267,44 @@ export default function DocumentsTable() {
|
|||
initial={{ opacity: 0, y: 20 }}
|
||||
animate={{ opacity: 1, y: 0 }}
|
||||
transition={{ duration: 0.3 }}
|
||||
className="w-full px-6 py-4 space-y-6 min-h-[calc(100vh-64px)]"
|
||||
className="w-full max-w-7xl mx-auto px-6 pt-17 pb-6 space-y-6 min-h-[calc(100vh-64px)]"
|
||||
>
|
||||
<motion.div
|
||||
className="flex items-center justify-between"
|
||||
initial={{ opacity: 0, y: 10 }}
|
||||
animate={{ opacity: 1, y: 0 }}
|
||||
transition={{ delay: 0.1 }}
|
||||
>
|
||||
<div>
|
||||
<h2 className="text-xl md:text-2xl font-bold tracking-tight">{t("title")}</h2>
|
||||
<p className="text-xs md:text-sm text-muted-foreground">{t("subtitle")}</p>
|
||||
</div>
|
||||
<div className="flex items-center gap-2">
|
||||
<Button onClick={openUploadDialog} variant="default" size="sm">
|
||||
<Upload className="w-4 h-4 mr-2" />
|
||||
{t("upload_documents")}
|
||||
</Button>
|
||||
<Button onClick={handleNewNote} variant="outline" size="sm">
|
||||
<SquarePlus className="w-4 h-4 mr-2" />
|
||||
{t("create_shared_note")}
|
||||
</Button>
|
||||
<Button onClick={refreshCurrentView} variant="outline" size="sm" disabled={isRefreshing}>
|
||||
<RefreshCw className={`w-4 h-4 mr-2 ${isRefreshing ? "animate-spin" : ""}`} />
|
||||
{t("refresh")}
|
||||
</Button>
|
||||
</div>
|
||||
</motion.div>
|
||||
|
||||
{/* Filters - use real-time type counts */}
|
||||
<DocumentsFilters
|
||||
typeCounts={rawTypeCounts ?? {}}
|
||||
typeCounts={realtimeTypeCounts}
|
||||
selectedIds={selectedIds}
|
||||
onSearch={setSearch}
|
||||
searchValue={search}
|
||||
onBulkDelete={onBulkDelete}
|
||||
onToggleType={onToggleType}
|
||||
activeTypes={activeTypes}
|
||||
columnVisibility={columnVisibility}
|
||||
onToggleColumn={onToggleColumn}
|
||||
/>
|
||||
|
||||
{/* Table */}
|
||||
<DocumentsTableShell
|
||||
documents={displayDocs}
|
||||
loading={!!loading}
|
||||
error={!!error}
|
||||
onRefresh={refreshCurrentView}
|
||||
selectedIds={selectedIds}
|
||||
setSelectedIds={setSelectedIds}
|
||||
columnVisibility={columnVisibility}
|
||||
deleteDocument={deleteDocument}
|
||||
sortKey={sortKey}
|
||||
sortDesc={sortDesc}
|
||||
onSortChange={(key) => {
|
||||
if (sortKey === key) setSortDesc((v) => !v);
|
||||
else {
|
||||
setSortKey(key);
|
||||
setSortDesc(false);
|
||||
}
|
||||
}}
|
||||
onSortChange={handleSortChange}
|
||||
deleteDocument={handleDeleteDocument}
|
||||
searchSpaceId={String(searchSpaceId)}
|
||||
/>
|
||||
|
||||
{/* Pagination */}
|
||||
<PaginationControls
|
||||
pageIndex={pageIndex}
|
||||
pageSize={pageSize}
|
||||
total={displayTotal}
|
||||
onPageSizeChange={(s) => {
|
||||
setPageSize(s);
|
||||
setPageIndex(0);
|
||||
}}
|
||||
onFirst={() => setPageIndex(0)}
|
||||
onPrev={() => setPageIndex((i) => Math.max(0, i - 1))}
|
||||
onNext={() => setPageIndex((i) => (pageEnd < displayTotal ? i + 1 : i))}
|
||||
onLast={() => setPageIndex(Math.max(0, Math.ceil(displayTotal / pageSize) - 1))}
|
||||
onLast={() => setPageIndex(Math.max(0, Math.ceil(displayTotal / PAGE_SIZE) - 1))}
|
||||
canPrev={pageIndex > 0}
|
||||
canNext={pageEnd < displayTotal}
|
||||
id={id}
|
||||
/>
|
||||
</motion.div>
|
||||
);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,4 @@
|
|||
import { atom } from "jotai";
|
||||
|
||||
// Atom to control the connector dialog open state from anywhere in the app
|
||||
export const connectorDialogOpenAtom = atom(false);
|
||||
|
|
@ -1,5 +1,4 @@
|
|||
import { atomWithMutation } from "jotai-tanstack-query";
|
||||
import { toast } from "sonner";
|
||||
import type {
|
||||
CreateConnectorRequest,
|
||||
DeleteConnectorRequest,
|
||||
|
|
@ -17,15 +16,16 @@ export const createConnectorMutationAtom = atomWithMutation((get) => {
|
|||
const searchSpaceId = get(activeSearchSpaceIdAtom);
|
||||
|
||||
return {
|
||||
mutationKey: cacheKeys.connectors.all(searchSpaceId!),
|
||||
mutationKey: cacheKeys.connectors.all(searchSpaceId ?? ""),
|
||||
enabled: !!searchSpaceId,
|
||||
mutationFn: async (request: CreateConnectorRequest) => {
|
||||
return connectorsApiService.createConnector(request);
|
||||
},
|
||||
|
||||
onSuccess: () => {
|
||||
if (!searchSpaceId) return;
|
||||
queryClient.invalidateQueries({
|
||||
queryKey: cacheKeys.connectors.all(searchSpaceId!),
|
||||
queryKey: cacheKeys.connectors.all(searchSpaceId),
|
||||
});
|
||||
},
|
||||
};
|
||||
|
|
@ -35,15 +35,16 @@ export const updateConnectorMutationAtom = atomWithMutation((get) => {
|
|||
const searchSpaceId = get(activeSearchSpaceIdAtom);
|
||||
|
||||
return {
|
||||
mutationKey: cacheKeys.connectors.all(searchSpaceId!),
|
||||
mutationKey: cacheKeys.connectors.all(searchSpaceId ?? ""),
|
||||
enabled: !!searchSpaceId,
|
||||
mutationFn: async (request: UpdateConnectorRequest) => {
|
||||
return connectorsApiService.updateConnector(request);
|
||||
},
|
||||
|
||||
onSuccess: (_, request: UpdateConnectorRequest) => {
|
||||
if (!searchSpaceId) return;
|
||||
queryClient.invalidateQueries({
|
||||
queryKey: cacheKeys.connectors.all(searchSpaceId!),
|
||||
queryKey: cacheKeys.connectors.all(searchSpaceId),
|
||||
});
|
||||
queryClient.invalidateQueries({
|
||||
queryKey: cacheKeys.connectors.byId(String(request.id)),
|
||||
|
|
@ -56,15 +57,16 @@ export const deleteConnectorMutationAtom = atomWithMutation((get) => {
|
|||
const searchSpaceId = get(activeSearchSpaceIdAtom);
|
||||
|
||||
return {
|
||||
mutationKey: cacheKeys.connectors.all(searchSpaceId!),
|
||||
mutationKey: cacheKeys.connectors.all(searchSpaceId ?? ""),
|
||||
enabled: !!searchSpaceId,
|
||||
mutationFn: async (request: DeleteConnectorRequest) => {
|
||||
return connectorsApiService.deleteConnector(request);
|
||||
},
|
||||
|
||||
onSuccess: (_, request: DeleteConnectorRequest) => {
|
||||
if (!searchSpaceId) return;
|
||||
queryClient.setQueryData(
|
||||
cacheKeys.connectors.all(searchSpaceId!),
|
||||
cacheKeys.connectors.all(searchSpaceId),
|
||||
(oldData: GetConnectorsResponse | undefined) => {
|
||||
if (!oldData) return oldData;
|
||||
return oldData.filter((connector) => connector.id !== request.id);
|
||||
|
|
@ -88,9 +90,9 @@ export const indexConnectorMutationAtom = atomWithMutation((get) => {
|
|||
},
|
||||
|
||||
onSuccess: (response: IndexConnectorResponse) => {
|
||||
toast.success(response.message);
|
||||
if (!searchSpaceId) return;
|
||||
queryClient.invalidateQueries({
|
||||
queryKey: cacheKeys.connectors.all(searchSpaceId!),
|
||||
queryKey: cacheKeys.connectors.all(searchSpaceId),
|
||||
});
|
||||
queryClient.invalidateQueries({
|
||||
queryKey: cacheKeys.connectors.byId(String(response.connector_id)),
|
||||
|
|
|
|||
|
|
@ -48,7 +48,7 @@ export const uploadDocumentMutationAtom = atomWithMutation((get) => {
|
|||
},
|
||||
|
||||
onSuccess: () => {
|
||||
toast.success("Files uploaded for processing");
|
||||
// Note: Toast notification is handled by the caller (DocumentUploadTab) to use i18n
|
||||
// Invalidate logs summary to show new processing tasks immediately on documents page
|
||||
queryClient.invalidateQueries({
|
||||
queryKey: cacheKeys.logs.summary(searchSpaceId ?? undefined),
|
||||
|
|
@ -95,7 +95,7 @@ export const deleteDocumentMutationAtom = atomWithMutation((get) => {
|
|||
},
|
||||
|
||||
onSuccess: (_, request: DeleteDocumentRequest) => {
|
||||
toast.success("Document deleted successfully");
|
||||
// Note: Toast is handled by the caller (page.tsx onBulkDelete) to show count info
|
||||
queryClient.setQueryData(
|
||||
cacheKeys.documents.globalQueryParams(documentsQueryParams),
|
||||
(oldData: GetDocumentsResponse | undefined) => {
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ import { Spinner } from "@/components/ui/spinner";
|
|||
import { Tabs, TabsContent } from "@/components/ui/tabs";
|
||||
import type { SearchSourceConnector } from "@/contracts/types/connector.types";
|
||||
import { useConnectorsElectric } from "@/hooks/use-connectors-electric";
|
||||
import { useDocumentsElectric } from "@/hooks/use-documents-electric";
|
||||
import { useDocuments } from "@/hooks/use-documents";
|
||||
import { useInbox } from "@/hooks/use-inbox";
|
||||
import { cn } from "@/lib/utils";
|
||||
import { ConnectorDialogHeader } from "./connector-popup/components/connector-dialog-header";
|
||||
|
|
@ -37,7 +37,7 @@ import { AllConnectorsTab } from "./connector-popup/tabs/all-connectors-tab";
|
|||
import { ConnectorAccountsListView } from "./connector-popup/views/connector-accounts-list-view";
|
||||
import { YouTubeCrawlerView } from "./connector-popup/views/youtube-crawler-view";
|
||||
|
||||
export const ConnectorIndicator: FC = () => {
|
||||
export const ConnectorIndicator: FC<{ hideTrigger?: boolean }> = ({ hideTrigger = false }) => {
|
||||
const searchSpaceId = useAtomValue(activeSearchSpaceIdAtom);
|
||||
const searchParams = useSearchParams();
|
||||
const { data: currentUser } = useAtomValue(currentUserAtom);
|
||||
|
|
@ -63,7 +63,9 @@ export const ConnectorIndicator: FC = () => {
|
|||
const llmConfigLoading = preferencesLoading || globalConfigsLoading;
|
||||
|
||||
// Fetch document type counts using Electric SQL + PGlite for real-time updates
|
||||
const { documentTypeCounts, loading: documentTypesLoading } = useDocumentsElectric(searchSpaceId);
|
||||
const { typeCounts: documentTypeCounts, loading: documentTypesLoading } = useDocuments(
|
||||
searchSpaceId ? Number(searchSpaceId) : null
|
||||
);
|
||||
|
||||
// Fetch notifications to detect indexing failures
|
||||
const { inboxItems = [] } = useInbox(
|
||||
|
|
@ -186,34 +188,38 @@ export const ConnectorIndicator: FC = () => {
|
|||
|
||||
return (
|
||||
<Dialog open={isOpen} onOpenChange={handleOpenChange}>
|
||||
<TooltipIconButton
|
||||
data-joyride="connector-icon"
|
||||
tooltip={hasConnectors ? `Manage ${activeConnectorsCount} connectors` : "Connect your data"}
|
||||
side="bottom"
|
||||
className={cn(
|
||||
"size-[34px] rounded-full p-1 flex items-center justify-center transition-colors relative",
|
||||
"hover:bg-muted-foreground/15 dark:hover:bg-muted-foreground/30",
|
||||
"outline-none focus:outline-none focus-visible:outline-none font-semibold text-xs",
|
||||
"border-0 ring-0 focus:ring-0 shadow-none focus:shadow-none"
|
||||
)}
|
||||
aria-label={
|
||||
hasConnectors ? `View ${activeConnectorsCount} connectors` : "Add your first connector"
|
||||
}
|
||||
onClick={() => handleOpenChange(true)}
|
||||
>
|
||||
{isLoading ? (
|
||||
<Spinner size="sm" />
|
||||
) : (
|
||||
<>
|
||||
<Cable className="size-4 stroke-[1.5px]" />
|
||||
{activeConnectorsCount > 0 && (
|
||||
<span className="absolute -top-0.5 right-0 flex items-center justify-center min-w-[16px] h-4 px-1 text-[10px] font-medium rounded-full bg-primary text-primary-foreground shadow-sm">
|
||||
{activeConnectorsCount > 99 ? "99+" : activeConnectorsCount}
|
||||
</span>
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
</TooltipIconButton>
|
||||
{!hideTrigger && (
|
||||
<TooltipIconButton
|
||||
data-joyride="connector-icon"
|
||||
tooltip={
|
||||
hasConnectors ? `Manage ${activeConnectorsCount} connectors` : "Connect your data"
|
||||
}
|
||||
side="bottom"
|
||||
className={cn(
|
||||
"size-[34px] rounded-full p-1 flex items-center justify-center transition-colors relative",
|
||||
"hover:bg-muted-foreground/15 dark:hover:bg-muted-foreground/30",
|
||||
"outline-none focus:outline-none focus-visible:outline-none font-semibold text-xs",
|
||||
"border-0 ring-0 focus:ring-0 shadow-none focus:shadow-none"
|
||||
)}
|
||||
aria-label={
|
||||
hasConnectors ? `View ${activeConnectorsCount} connectors` : "Add your first connector"
|
||||
}
|
||||
onClick={() => handleOpenChange(true)}
|
||||
>
|
||||
{isLoading ? (
|
||||
<Spinner size="sm" />
|
||||
) : (
|
||||
<>
|
||||
<Cable className="size-4 stroke-[1.5px]" />
|
||||
{activeConnectorsCount > 0 && (
|
||||
<span className="absolute -top-0.5 right-0 flex items-center justify-center min-w-[16px] h-4 px-1 text-[10px] font-medium rounded-full bg-primary text-primary-foreground shadow-sm">
|
||||
{activeConnectorsCount > 99 ? "99+" : activeConnectorsCount}
|
||||
</span>
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
</TooltipIconButton>
|
||||
)}
|
||||
|
||||
<DialogContent className="max-w-3xl w-[95vw] sm:w-full h-[75vh] sm:h-[85vh] flex flex-col p-0 gap-0 overflow-hidden border border-border bg-muted text-foreground focus:outline-none focus:ring-0 focus-visible:outline-none focus-visible:ring-0 [&>button]:right-4 sm:[&>button]:right-12 [&>button]:top-6 sm:[&>button]:top-10 [&>button]:opacity-80 hover:[&>button]:opacity-100 [&>button_svg]:size-5">
|
||||
<DialogTitle className="sr-only">Manage Connectors</DialogTitle>
|
||||
|
|
|
|||
|
|
@ -1,8 +1,9 @@
|
|||
import { format } from "date-fns";
|
||||
import { useAtomValue } from "jotai";
|
||||
import { useAtom, useAtomValue } from "jotai";
|
||||
import { useRouter, useSearchParams } from "next/navigation";
|
||||
import { useCallback, useEffect, useRef, useState } from "react";
|
||||
import { toast } from "sonner";
|
||||
import { connectorDialogOpenAtom } from "@/atoms/connector-dialog/connector-dialog.atoms";
|
||||
import {
|
||||
createConnectorMutationAtom,
|
||||
deleteConnectorMutationAtom,
|
||||
|
|
@ -49,7 +50,8 @@ export const useConnectorDialog = () => {
|
|||
const { mutateAsync: deleteConnector } = useAtomValue(deleteConnectorMutationAtom);
|
||||
const { mutateAsync: createConnector } = useAtomValue(createConnectorMutationAtom);
|
||||
|
||||
const [isOpen, setIsOpen] = useState(false);
|
||||
// Use global atom for dialog open state so it can be controlled from anywhere
|
||||
const [isOpen, setIsOpen] = useAtom(connectorDialogOpenAtom);
|
||||
const [activeTab, setActiveTab] = useState("all");
|
||||
const [connectingId, setConnectingId] = useState<string | null>(null);
|
||||
const [isScrolled, setIsScrolled] = useState(false);
|
||||
|
|
@ -293,6 +295,7 @@ export const useConnectorDialog = () => {
|
|||
connectingConnectorType,
|
||||
viewingAccountsType,
|
||||
viewingMCPList,
|
||||
setIsOpen,
|
||||
]);
|
||||
|
||||
// Detect OAuth success / Failure and transition to config view
|
||||
|
|
@ -345,9 +348,10 @@ export const useConnectorDialog = () => {
|
|||
|
||||
// If we found the connector, find the matching OAuth/Composio connector by type
|
||||
if (newConnector) {
|
||||
const connectorType = newConnector.connector_type;
|
||||
oauthConnector =
|
||||
OAUTH_CONNECTORS.find((c) => c.connectorType === newConnector!.connector_type) ||
|
||||
COMPOSIO_CONNECTORS.find((c) => c.connectorType === newConnector!.connector_type);
|
||||
OAUTH_CONNECTORS.find((c) => c.connectorType === connectorType) ||
|
||||
COMPOSIO_CONNECTORS.find((c) => c.connectorType === connectorType);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -358,8 +362,9 @@ export const useConnectorDialog = () => {
|
|||
COMPOSIO_CONNECTORS.find((c) => c.id === params.connector);
|
||||
|
||||
if (oauthConnector) {
|
||||
const oauthConnectorType = oauthConnector.connectorType;
|
||||
newConnector = result.data.find(
|
||||
(c: SearchSourceConnector) => c.connector_type === oauthConnector!.connectorType
|
||||
(c: SearchSourceConnector) => c.connector_type === oauthConnectorType
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
@ -399,7 +404,7 @@ export const useConnectorDialog = () => {
|
|||
// Invalid query params - log but don't crash
|
||||
console.warn("Invalid connector popup query params in OAuth success handler:", error);
|
||||
}
|
||||
}, [searchParams, searchSpaceId, refetchAllConnectors]);
|
||||
}, [searchParams, searchSpaceId, refetchAllConnectors, setIsOpen]);
|
||||
|
||||
// Handle OAuth connection
|
||||
const handleConnectOAuth = useCallback(
|
||||
|
|
@ -514,7 +519,7 @@ export const useConnectorDialog = () => {
|
|||
} finally {
|
||||
setConnectingId(null);
|
||||
}
|
||||
}, [searchSpaceId, createConnector, refetchAllConnectors]);
|
||||
}, [searchSpaceId, createConnector, refetchAllConnectors, setIsOpen]);
|
||||
|
||||
// Handle connecting non-OAuth connectors (like Tavily API)
|
||||
const handleConnectNonOAuth = useCallback(
|
||||
|
|
@ -677,12 +682,8 @@ export const useConnectorDialog = () => {
|
|||
const successMessage =
|
||||
currentConnectorType === "MCP_CONNECTOR"
|
||||
? `${connector.name} added successfully`
|
||||
: `${connectorTitle} connected and indexing started!`;
|
||||
toast.success(successMessage, {
|
||||
description: periodicEnabledForIndexing
|
||||
? `Periodic sync enabled every ${getFrequencyLabel(frequencyMinutesForIndexing)}.`
|
||||
: "You can continue working while we sync your data.",
|
||||
});
|
||||
: `${connectorTitle} connected and syncing started!`;
|
||||
toast.success(successMessage);
|
||||
|
||||
const url = new URL(window.location.href);
|
||||
url.searchParams.delete("modal");
|
||||
|
|
@ -782,7 +783,6 @@ export const useConnectorDialog = () => {
|
|||
updateConnector,
|
||||
indexConnector,
|
||||
router,
|
||||
getFrequencyLabel,
|
||||
]
|
||||
);
|
||||
|
||||
|
|
@ -1010,11 +1010,7 @@ export const useConnectorDialog = () => {
|
|||
);
|
||||
}
|
||||
|
||||
toast.success(`${indexingConfig.connectorTitle} indexing started`, {
|
||||
description: periodicEnabled
|
||||
? `Periodic sync enabled every ${getFrequencyLabel(frequencyMinutes)}.`
|
||||
: "You can continue working while we sync your data.",
|
||||
});
|
||||
toast.success(`${indexingConfig.connectorTitle} indexing started`);
|
||||
|
||||
// Update URL - the effect will handle closing the modal and clearing state
|
||||
const url = new URL(window.location.href);
|
||||
|
|
@ -1045,7 +1041,6 @@ export const useConnectorDialog = () => {
|
|||
updateConnector,
|
||||
periodicEnabled,
|
||||
frequencyMinutes,
|
||||
getFrequencyLabel,
|
||||
router,
|
||||
indexingConnectorConfig,
|
||||
]
|
||||
|
|
@ -1426,9 +1421,7 @@ export const useConnectorDialog = () => {
|
|||
end_date: endDateStr,
|
||||
},
|
||||
});
|
||||
toast.success("Indexing started", {
|
||||
description: "You can continue working while we sync your data.",
|
||||
});
|
||||
toast.success("Indexing started");
|
||||
|
||||
// Invalidate queries to refresh data
|
||||
queryClient.invalidateQueries({
|
||||
|
|
@ -1445,7 +1438,7 @@ export const useConnectorDialog = () => {
|
|||
}
|
||||
}
|
||||
},
|
||||
[searchSpaceId, indexConnector, queryClient]
|
||||
[searchSpaceId, indexConnector]
|
||||
);
|
||||
|
||||
// Handle going back from edit view
|
||||
|
|
@ -1527,7 +1520,7 @@ export const useConnectorDialog = () => {
|
|||
}
|
||||
}
|
||||
},
|
||||
[activeTab, isStartingIndexing, isDisconnecting, isSaving, isCreatingConnector]
|
||||
[activeTab, isStartingIndexing, isDisconnecting, isSaving, isCreatingConnector, setIsOpen]
|
||||
);
|
||||
|
||||
// Handle tab change
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
import { FileJson } from "lucide-react";
|
||||
import { FileJson, Loader2 } from "lucide-react";
|
||||
import React from "react";
|
||||
import { defaultStyles, JsonView } from "react-json-view-lite";
|
||||
import { Button } from "@/components/ui/button";
|
||||
|
|
@ -17,6 +17,7 @@ interface JsonMetadataViewerProps {
|
|||
trigger?: React.ReactNode;
|
||||
open?: boolean;
|
||||
onOpenChange?: (open: boolean) => void;
|
||||
loading?: boolean;
|
||||
}
|
||||
|
||||
export function JsonMetadataViewer({
|
||||
|
|
@ -25,6 +26,7 @@ export function JsonMetadataViewer({
|
|||
trigger,
|
||||
open,
|
||||
onOpenChange,
|
||||
loading,
|
||||
}: JsonMetadataViewerProps) {
|
||||
// Ensure metadata is a valid object
|
||||
const jsonData = React.useMemo(() => {
|
||||
|
|
@ -54,7 +56,13 @@ export function JsonMetadataViewer({
|
|||
</DialogTitle>
|
||||
</DialogHeader>
|
||||
<div className="mt-2 sm:mt-4 p-2 sm:p-4 bg-muted/30 rounded-md text-xs sm:text-sm">
|
||||
<JsonView data={jsonData} style={defaultStyles} />
|
||||
{loading ? (
|
||||
<div className="flex items-center justify-center py-12">
|
||||
<Loader2 className="h-8 w-8 animate-spin text-muted-foreground" />
|
||||
</div>
|
||||
) : (
|
||||
<JsonView data={jsonData} style={defaultStyles} />
|
||||
)}
|
||||
</div>
|
||||
</DialogContent>
|
||||
</Dialog>
|
||||
|
|
|
|||
|
|
@ -90,7 +90,7 @@ export function LayoutDataProvider({
|
|||
});
|
||||
|
||||
// Fetch threads (40 total to allow up to 20 per section - shared/private)
|
||||
const { data: threadsData } = useQuery({
|
||||
const { data: threadsData, isPending: isLoadingThreads } = useQuery({
|
||||
queryKey: ["threads", searchSpaceId, { limit: 40 }],
|
||||
queryFn: () => fetchThreads(Number(searchSpaceId), 40),
|
||||
enabled: !!searchSpaceId,
|
||||
|
|
@ -585,6 +585,7 @@ export function LayoutDataProvider({
|
|||
theme={theme}
|
||||
setTheme={setTheme}
|
||||
isChatPage={isChatPage}
|
||||
isLoadingChats={isLoadingThreads}
|
||||
inbox={{
|
||||
isOpen: isInboxSidebarOpen,
|
||||
onOpenChange: setIsInboxSidebarOpen,
|
||||
|
|
|
|||
|
|
@ -74,6 +74,7 @@ interface LayoutShellProps {
|
|||
className?: string;
|
||||
// Inbox props
|
||||
inbox?: InboxProps;
|
||||
isLoadingChats?: boolean;
|
||||
}
|
||||
|
||||
export function LayoutShell({
|
||||
|
|
@ -110,6 +111,7 @@ export function LayoutShell({
|
|||
children,
|
||||
className,
|
||||
inbox,
|
||||
isLoadingChats = false,
|
||||
}: LayoutShellProps) {
|
||||
const isMobile = useIsMobile();
|
||||
const [mobileMenuOpen, setMobileMenuOpen] = useState(false);
|
||||
|
|
@ -162,6 +164,7 @@ export function LayoutShell({
|
|||
pageUsage={pageUsage}
|
||||
theme={theme}
|
||||
setTheme={setTheme}
|
||||
isLoadingChats={isLoadingChats}
|
||||
/>
|
||||
|
||||
<main className={cn("flex-1", isChatPage ? "overflow-hidden" : "overflow-auto")}>
|
||||
|
|
@ -232,6 +235,7 @@ export function LayoutShell({
|
|||
theme={theme}
|
||||
setTheme={setTheme}
|
||||
className="hidden md:flex border-r shrink-0"
|
||||
isLoadingChats={isLoadingChats}
|
||||
/>
|
||||
|
||||
{/* Docked Inbox Sidebar - renders as flex sibling between sidebar and content */}
|
||||
|
|
|
|||
|
|
@ -37,6 +37,7 @@ interface MobileSidebarProps {
|
|||
pageUsage?: PageUsage;
|
||||
theme?: string;
|
||||
setTheme?: (theme: "light" | "dark" | "system") => void;
|
||||
isLoadingChats?: boolean;
|
||||
}
|
||||
|
||||
export function MobileSidebarTrigger({ onClick }: { onClick: () => void }) {
|
||||
|
|
@ -78,6 +79,7 @@ export function MobileSidebar({
|
|||
pageUsage,
|
||||
theme,
|
||||
setTheme,
|
||||
isLoadingChats = false,
|
||||
}: MobileSidebarProps) {
|
||||
const handleSearchSpaceSelect = (id: number) => {
|
||||
onSearchSpaceSelect(id);
|
||||
|
|
@ -158,6 +160,7 @@ export function MobileSidebar({
|
|||
theme={theme}
|
||||
setTheme={setTheme}
|
||||
className="w-full border-none"
|
||||
isLoadingChats={isLoadingChats}
|
||||
/>
|
||||
</div>
|
||||
</SheetContent>
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
import { FolderOpen, PenSquare } from "lucide-react";
|
||||
import { useTranslations } from "next-intl";
|
||||
import { Button } from "@/components/ui/button";
|
||||
import { Skeleton } from "@/components/ui/skeleton";
|
||||
import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
|
||||
import { cn } from "@/lib/utils";
|
||||
import type { ChatItem, NavItem, PageUsage, SearchSpace, User } from "../../types/layout.types";
|
||||
|
|
@ -14,6 +15,15 @@ import { SidebarHeader } from "./SidebarHeader";
|
|||
import { SidebarSection } from "./SidebarSection";
|
||||
import { SidebarUserProfile } from "./SidebarUserProfile";
|
||||
|
||||
/**
 * Placeholder row for one chat list entry: a small square skeleton followed
 * by a wider skeleton bar, laid out in a padded flex row.
 */
function ChatListItemSkeleton() {
	const squarePlaceholder = <Skeleton className="h-4 w-4 shrink-0 rounded" />;
	const barPlaceholder = <Skeleton className="h-4 w-full max-w-[180px]" />;

	return (
		<div className="flex w-full items-center gap-2 rounded-md p-2">
			{squarePlaceholder}
			{barPlaceholder}
		</div>
	);
}
|
||||
|
||||
interface SidebarProps {
|
||||
searchSpace: SearchSpace | null;
|
||||
isCollapsed?: boolean;
|
||||
|
|
@ -39,6 +49,7 @@ interface SidebarProps {
|
|||
theme?: string;
|
||||
setTheme?: (theme: "light" | "dark" | "system") => void;
|
||||
className?: string;
|
||||
isLoadingChats?: boolean;
|
||||
}
|
||||
|
||||
export function Sidebar({
|
||||
|
|
@ -66,6 +77,7 @@ export function Sidebar({
|
|||
theme,
|
||||
setTheme,
|
||||
className,
|
||||
isLoadingChats = false,
|
||||
}: SidebarProps) {
|
||||
const t = useTranslations("sidebar");
|
||||
|
||||
|
|
@ -153,7 +165,15 @@ export function Sidebar({
|
|||
) : undefined
|
||||
}
|
||||
>
|
||||
{sharedChats.length > 0 ? (
|
||||
{isLoadingChats ? (
|
||||
<div className="flex flex-col gap-0.5">
|
||||
<ChatListItemSkeleton />
|
||||
<ChatListItemSkeleton />
|
||||
<ChatListItemSkeleton />
|
||||
<ChatListItemSkeleton />
|
||||
<ChatListItemSkeleton />
|
||||
</div>
|
||||
) : sharedChats.length > 0 ? (
|
||||
<div className="relative min-h-0 flex-1">
|
||||
<div
|
||||
className={`flex flex-col gap-0.5 max-h-full overflow-y-auto scrollbar-thin scrollbar-thumb-muted-foreground/20 scrollbar-track-transparent ${sharedChats.length > 4 ? "pb-8" : ""}`}
|
||||
|
|
@ -206,7 +226,15 @@ export function Sidebar({
|
|||
) : undefined
|
||||
}
|
||||
>
|
||||
{chats.length > 0 ? (
|
||||
{isLoadingChats ? (
|
||||
<div className="flex flex-col gap-0.5">
|
||||
<ChatListItemSkeleton />
|
||||
<ChatListItemSkeleton />
|
||||
<ChatListItemSkeleton />
|
||||
<ChatListItemSkeleton />
|
||||
<ChatListItemSkeleton />
|
||||
</div>
|
||||
) : chats.length > 0 ? (
|
||||
<div className="relative flex-1 min-h-0">
|
||||
<div
|
||||
className={`flex flex-col gap-0.5 h-full overflow-y-auto scrollbar-thin scrollbar-thumb-muted-foreground/20 scrollbar-track-transparent ${chats.length > 4 ? "pb-8" : ""}`}
|
||||
|
|
|
|||
|
|
@ -92,7 +92,7 @@ export const getConnectorIcon = (connectorType: EnumConnectorName | string, clas
|
|||
case "FILE":
|
||||
return <File {...iconProps} />;
|
||||
case "GOOGLE_DRIVE_FILE":
|
||||
return <File {...iconProps} />;
|
||||
return <Image src="/connectors/google-drive.svg" alt="Google Drive" {...imgProps} />;
|
||||
case "COMPOSIO_GOOGLE_DRIVE_CONNECTOR":
|
||||
return <Image src="/connectors/google-drive.svg" alt="Google Drive" {...imgProps} />;
|
||||
case "COMPOSIO_GMAIL_CONNECTOR":
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ export const documentTypeEnum = z.enum([
|
|||
"ELASTICSEARCH_CONNECTOR",
|
||||
"BOOKSTACK_CONNECTOR",
|
||||
"CIRCLEBACK",
|
||||
"OBSIDIAN_CONNECTOR",
|
||||
"SURFSENSE_DOCS",
|
||||
"NOTE",
|
||||
"COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
|
||||
|
|
@ -41,6 +42,8 @@ export const document = z.object({
|
|||
created_at: z.string(),
|
||||
updated_at: z.string().nullable(),
|
||||
search_space_id: z.number(),
|
||||
created_by_id: z.string().nullable().optional(),
|
||||
created_by_name: z.string().nullable().optional(),
|
||||
});
|
||||
|
||||
export const extensionDocumentContent = z.object({
|
||||
|
|
|
|||
|
|
@ -1,185 +0,0 @@
|
|||
"use client";
|
||||
|
||||
import { useEffect, useMemo, useRef, useState } from "react";
|
||||
import type { SyncHandle } from "@/lib/electric/client";
|
||||
import { useElectricClient } from "@/lib/electric/context";
|
||||
|
||||
/** Row shape for the columns this hook reads from the `documents` table. */
type Document = {
	// Primary key of the document row
	id: number;
	// Search space the document belongs to
	search_space_id: number;
	// Type label used as the grouping key when counting documents
	document_type: string;
	// Creation timestamp, kept as the string the query returns
	created_at: string;
};
|
||||
|
||||
/**
 * Hook for managing documents with Electric SQL real-time sync.
 *
 * Uses the Electric client from context (provided by ElectricProvider)
 * instead of initializing its own - prevents race conditions and memory leaks.
 *
 * @param searchSpaceId - Search space whose documents are synced. A falsy
 *   value clears the document list and starts no sync. A value that is not a
 *   safe integer is rejected with an error instead of being sent to Electric.
 * @returns `documentTypeCounts` (document_type -> number of synced rows),
 *   `loading`, and `error`.
 */
export function useDocumentsElectric(searchSpaceId: number | string | null) {
	// Get Electric client from context - ElectricProvider handles initialization
	const electricClient = useElectricClient();

	const [documents, setDocuments] = useState<Document[]>([]);
	const [loading, setLoading] = useState(true);
	const [error, setError] = useState<Error | null>(null);
	const syncHandleRef = useRef<SyncHandle | null>(null);
	const liveQueryRef = useRef<{ unsubscribe: () => void } | null>(null);
	const syncKeyRef = useRef<string | null>(null);

	// Calculate document type counts from synced documents
	const documentTypeCounts = useMemo(() => {
		if (!documents.length) return {};

		const counts: Record<string, number> = {};
		for (const doc of documents) {
			counts[doc.document_type] = (counts[doc.document_type] || 0) + 1;
		}
		return counts;
	}, [documents]);

	// Start syncing when Electric client is available
	useEffect(() => {
		// Wait for both searchSpaceId and Electric client to be available
		if (!searchSpaceId || !electricClient) {
			setLoading(!electricClient); // Still loading if waiting for Electric
			if (!searchSpaceId) {
				setDocuments([]);
			}
			return;
		}

		// The id is interpolated into the shape `where` clause below, so only a
		// real integer may pass; anything else is reported rather than sent on.
		const spaceId = Number(searchSpaceId);
		if (!Number.isSafeInteger(spaceId)) {
			setDocuments([]);
			setError(new Error(`Invalid search space id: ${String(searchSpaceId)}`));
			setLoading(false);
			return;
		}

		// Create a unique key for this sync to prevent duplicate subscriptions
		const syncKey = `documents_${searchSpaceId}`;
		if (syncKeyRef.current === syncKey) {
			// Already syncing for this search space
			return;
		}

		// Capture the non-null client so the async closures below keep the narrowed type
		const client = electricClient;
		// Single definition of the statement shared by the one-off fetch and the live query
		const selectSql = `SELECT id, document_type, search_space_id, created_at FROM documents WHERE search_space_id = $1 ORDER BY created_at DESC`;

		let mounted = true;
		syncKeyRef.current = syncKey;

		async function startSync() {
			try {
				console.log("[useDocumentsElectric] Starting sync for search space:", searchSpaceId);

				const handle = await client.syncShape({
					table: "documents",
					where: `search_space_id = ${spaceId}`,
					columns: ["id", "document_type", "search_space_id", "created_at"],
					primaryKey: ["id"],
				});

				console.log("[useDocumentsElectric] Sync started:", {
					isUpToDate: handle.isUpToDate,
				});

				// Wait for initial sync with timeout
				if (!handle.isUpToDate && handle.initialSyncPromise) {
					let timeoutId: ReturnType<typeof setTimeout> | undefined;
					try {
						await Promise.race([
							handle.initialSyncPromise,
							new Promise((resolve) => {
								timeoutId = setTimeout(resolve, 2000);
							}),
						]);
					} catch (syncErr) {
						console.error("[useDocumentsElectric] Initial sync failed:", syncErr);
					} finally {
						// Do not leave the fallback timer pending once the race has settled
						if (timeoutId !== undefined) {
							clearTimeout(timeoutId);
						}
					}
				}

				if (!mounted) {
					handle.unsubscribe();
					return;
				}

				syncHandleRef.current = handle;
				setLoading(false);
				setError(null);

				// Fetch initial documents
				await fetchDocuments();

				// Set up live query for real-time updates
				await setupLiveQuery();
			} catch (err) {
				if (!mounted) return;
				console.error("[useDocumentsElectric] Failed to start sync:", err);
				setError(err instanceof Error ? err : new Error("Failed to sync documents"));
				setLoading(false);
			}
		}

		async function fetchDocuments() {
			try {
				const result = await client.db.query<Document>(selectSql, [spaceId]);
				if (mounted) {
					setDocuments(result.rows || []);
				}
			} catch (err) {
				console.error("[useDocumentsElectric] Failed to fetch:", err);
			}
		}

		async function setupLiveQuery() {
			try {
				// eslint-disable-next-line @typescript-eslint/no-explicit-any
				const db = client.db as any;

				if (db.live?.query && typeof db.live.query === "function") {
					const liveQuery = await db.live.query(selectSql, [spaceId]);

					if (!mounted) {
						liveQuery.unsubscribe?.();
						return;
					}

					// Set initial results
					if (liveQuery.initialResults?.rows) {
						setDocuments(liveQuery.initialResults.rows);
					} else if (liveQuery.rows) {
						setDocuments(liveQuery.rows);
					}

					// Subscribe to changes
					if (typeof liveQuery.subscribe === "function") {
						liveQuery.subscribe((result: { rows: Document[] }) => {
							if (mounted && result.rows) {
								setDocuments(result.rows);
							}
						});
					}

					// Only keep the handle when it can actually be torn down in cleanup
					if (typeof liveQuery.unsubscribe === "function") {
						liveQueryRef.current = liveQuery;
					}
				}
			} catch (liveErr) {
				console.error("[useDocumentsElectric] Failed to set up live query:", liveErr);
			}
		}

		startSync();

		return () => {
			mounted = false;
			syncKeyRef.current = null;

			if (syncHandleRef.current) {
				syncHandleRef.current.unsubscribe();
				syncHandleRef.current = null;
			}
			if (liveQueryRef.current) {
				liveQueryRef.current.unsubscribe();
				liveQueryRef.current = null;
			}
		};
	}, [searchSpaceId, electricClient]);

	return { documentTypeCounts, loading, error };
}
|
||||
449
surfsense_web/hooks/use-documents.ts
Normal file
449
surfsense_web/hooks/use-documents.ts
Normal file
|
|
@ -0,0 +1,449 @@
|
|||
"use client";
|
||||
|
||||
import { useCallback, useEffect, useMemo, useRef, useState } from "react";
|
||||
import type { DocumentTypeEnum } from "@/contracts/types/document.types";
|
||||
import { documentsApiService } from "@/lib/apis/documents-api.service";
|
||||
import type { SyncHandle } from "@/lib/electric/client";
|
||||
import { useElectricClient } from "@/lib/electric/context";
|
||||
|
||||
// Shared default for `typeFilter`. One stable array instance is reused so that
// omitting the filter does not cause infinite re-renders.
const EMPTY_TYPE_FILTER: Array<DocumentTypeEnum> = [];
|
||||
|
||||
/** Document status (matches the backend DocumentStatus JSONB). */
export interface DocumentStatusType {
	// One of the four states a document can report
	state:
		| "ready"
		| "pending"
		| "processing"
		| "failed";
	// Optional explanation accompanying the state
	reason?: string;
}
|
||||
|
||||
/**
 * Document row as delivered by Electric sync.
 * Lightweight table columns only - NO content/metadata.
 */
interface DocumentElectric {
	// Identity and ownership
	id: number;
	search_space_id: number;
	created_by_id: string | null;

	// Descriptive columns
	title: string;
	document_type: string;
	created_at: string;

	// May be null on a synced row
	status: DocumentStatusType | null;
}
|
||||
|
||||
/**
 * Document for display (with resolved user name).
 *
 * Carries every synced column of `DocumentElectric`, except that `status` is
 * always present here, plus the creator's display name.
 */
export interface DocumentDisplay extends Omit<DocumentElectric, "status"> {
	// Display name for `created_by_id`, or null when none is available
	created_by_name: string | null;
	// Never null, unlike the synced row
	status: DocumentStatusType;
}
|
||||
|
||||
/**
 * Collapse items sharing an `id` into a single entry, then order the result
 * by `created_at` descending (newest first).
 *
 * For duplicates, the item with the strictly later `created_at` wins; on a
 * tie the first one encountered is kept.
 */
function deduplicateAndSort<T extends { id: number; created_at: string }>(items: T[]): T[] {
	const timestampOf = (entry: T): number => new Date(entry.created_at).getTime();
	const latestById = new Map<number, T>();

	items.forEach((candidate) => {
		const current = latestById.get(candidate.id);
		if (current !== undefined && !(timestampOf(candidate) > timestampOf(current))) {
			// An equal-or-newer version of this id is already stored
			return;
		}
		latestById.set(candidate.id, candidate);
	});

	const unique = Array.from(latestById.values());
	unique.sort((left, right) => timestampOf(right) - timestampOf(left));
	return unique;
}
|
||||
|
||||
/**
 * Report whether a synced row has valid/complete data: a non-null `id` and a
 * non-null, non-empty `title`.
 */
function isValidDocument(doc: DocumentElectric): boolean {
	if (doc.id == null) return false;

	const title = doc.title;
	return !(title == null || title === "");
}
|
||||
|
||||
/**
 * Real-time documents hook with Electric SQL
 *
 * Architecture:
 * 1. API is the PRIMARY source of truth - always loads first
 * 2. Electric provides REAL-TIME updates for additions and deletions
 * 3. Use syncHandle.isUpToDate to determine if deletions can be trusted
 * 4. Handles bulk deletions correctly by checking sync state
 *
 * Live updates are ignored until the API load for the current search space
 * has completed. Once it has, a fully synced Electric result replaces the
 * list wholesale; a partially synced result may only add or update rows,
 * never remove them.
 *
 * @param searchSpaceId - The search space ID to filter documents; when falsy
 *   no API load or Electric sync is started
 * @param typeFilter - Optional document types to filter by
 * @returns `documents` (display rows), `typeCounts` (rows per document type),
 *   `total` (length of `documents`), `loading` and `error` of the API load
 */
export function useDocuments(
	searchSpaceId: number | null,
	typeFilter: DocumentTypeEnum[] = EMPTY_TYPE_FILTER
) {
	const electricClient = useElectricClient();

	const [documents, setDocuments] = useState<DocumentDisplay[]>([]);
	const [loading, setLoading] = useState(true);
	const [error, setError] = useState<Error | null>(null);

	// Track if initial API load is complete (source of truth)
	const apiLoadedRef = useRef(false);

	// User cache: userId → displayName
	const userCacheRef = useRef<Map<string, string>>(new Map());

	// Electric sync refs
	const syncHandleRef = useRef<SyncHandle | null>(null);
	const liveQueryRef = useRef<{ unsubscribe?: () => void } | null>(null);

	// Real-time type counts
	const typeCounts = useMemo(() => {
		const counts: Record<string, number> = {};
		for (const doc of documents) {
			counts[doc.document_type] = (counts[doc.document_type] || 0) + 1;
		}
		return counts;
	}, [documents]);

	// Populate user cache from API response
	const populateUserCache = useCallback(
		(items: Array<{ created_by_id?: string | null; created_by_name?: string | null }>) => {
			for (const item of items) {
				if (item.created_by_id && item.created_by_name) {
					userCacheRef.current.set(item.created_by_id, item.created_by_name);
				}
			}
		},
		[]
	);

	// Convert API item to display doc
	const apiToDisplayDoc = useCallback(
		(item: {
			id: number;
			search_space_id: number;
			document_type: string;
			title: string;
			created_by_id?: string | null;
			created_by_name?: string | null;
			created_at: string;
			status?: DocumentStatusType | null;
		}): DocumentDisplay => ({
			id: item.id,
			search_space_id: item.search_space_id,
			document_type: item.document_type,
			title: item.title,
			created_by_id: item.created_by_id ?? null,
			created_by_name: item.created_by_name ?? null,
			created_at: item.created_at,
			// A missing status is treated as "ready"
			status: item.status ?? { state: "ready" },
		}),
		[]
	);

	// Convert Electric doc to display doc.
	// Electric rows carry no creator name, so it is resolved from the user cache.
	const electricToDisplayDoc = useCallback(
		(doc: DocumentElectric): DocumentDisplay => ({
			...doc,
			created_by_name: doc.created_by_id
				? (userCacheRef.current.get(doc.created_by_id) ?? null)
				: null,
			status: doc.status ?? { state: "ready" },
		}),
		[]
	);

	// EFFECT 1: Load from API (PRIMARY source of truth)
	useEffect(() => {
		if (!searchSpaceId) {
			setLoading(false);
			return;
		}

		// Capture validated value for async closure
		const spaceId = searchSpaceId;
		const currentTypeFilter = typeFilter;

		// Flipped by the cleanup so a stale response cannot write state
		let mounted = true;
		apiLoadedRef.current = false;

		async function loadFromApi() {
			try {
				setLoading(true);
				console.log("[useDocuments] Loading from API (source of truth):", spaceId);

				const response = await documentsApiService.getDocuments({
					queryParams: {
						search_space_id: spaceId,
						page: 0,
						page_size: -1, // Fetch all documents
						...(currentTypeFilter.length > 0 && { document_types: currentTypeFilter }),
					},
				});

				if (!mounted) return;

				populateUserCache(response.items);
				const docs = response.items.map(apiToDisplayDoc);
				setDocuments(docs);
				// From here on live updates are allowed to modify the list
				apiLoadedRef.current = true;
				setError(null);
				console.log("[useDocuments] API loaded", docs.length, "documents");
			} catch (err) {
				if (!mounted) return;
				console.error("[useDocuments] API load failed:", err);
				setError(err instanceof Error ? err : new Error("Failed to load documents"));
			} finally {
				if (mounted) setLoading(false);
			}
		}

		loadFromApi();

		return () => {
			mounted = false;
		};
	}, [searchSpaceId, typeFilter, populateUserCache, apiToDisplayDoc]);

	// EFFECT 2: Start Electric sync + live query for real-time updates
	useEffect(() => {
		if (!searchSpaceId || !electricClient) return;

		// Capture validated values for async closure
		const spaceId = searchSpaceId;
		const client = electricClient;
		const currentTypeFilter = typeFilter;

		let mounted = true;

		async function setupElectricRealtime() {
			// Cleanup previous subscriptions
			if (syncHandleRef.current) {
				syncHandleRef.current.unsubscribe();
				syncHandleRef.current = null;
			}
			if (liveQueryRef.current) {
				liveQueryRef.current.unsubscribe?.();
				liveQueryRef.current = null;
			}

			try {
				console.log("[useDocuments] Starting Electric sync for real-time updates");

				// Start Electric sync
				const handle = await client.syncShape({
					table: "documents",
					where: `search_space_id = ${spaceId}`,
					columns: [
						"id",
						"document_type",
						"search_space_id",
						"title",
						"created_by_id",
						"created_at",
						"status",
					],
					primaryKey: ["id"],
				});

				// The effect was torn down while the sync was starting
				if (!mounted) {
					handle.unsubscribe();
					return;
				}

				syncHandleRef.current = handle;
				console.log("[useDocuments] Sync started, isUpToDate:", handle.isUpToDate);

				// Wait for initial sync (with timeout)
				if (!handle.isUpToDate && handle.initialSyncPromise) {
					let timeoutId: ReturnType<typeof setTimeout> | undefined;
					try {
						await Promise.race([
							handle.initialSyncPromise,
							new Promise((resolve) => {
								timeoutId = setTimeout(resolve, 5000);
							}),
						]);
					} finally {
						// Do not leave the fallback timer pending once the race has settled
						if (timeoutId !== undefined) clearTimeout(timeoutId);
					}
					console.log("[useDocuments] Initial sync complete, isUpToDate:", handle.isUpToDate);
				}

				if (!mounted) return;

				// Set up live query
				const db = client.db as {
					live?: {
						query: <T>(
							sql: string,
							params?: (number | string)[]
						) => Promise<{
							subscribe: (cb: (result: { rows: T[] }) => void) => void;
							unsubscribe?: () => void;
						}>;
					};
				};

				if (!db.live?.query) {
					console.warn("[useDocuments] Live queries not available");
					return;
				}

				// NOTE: the continuation lines below are part of the SQL template literal
				let query = `SELECT id, document_type, search_space_id, title, created_by_id, created_at, status
FROM documents
WHERE search_space_id = $1`;

				const params: (number | string)[] = [spaceId];

				if (currentTypeFilter.length > 0) {
					// $1 is the search space id, so type placeholders start at $2
					const placeholders = currentTypeFilter.map((_, i) => `$${i + 2}`).join(", ");
					query += ` AND document_type IN (${placeholders})`;
					params.push(...currentTypeFilter);
				}

				query += ` ORDER BY created_at DESC`;

				const liveQuery = await db.live.query<DocumentElectric>(query, params);

				if (!mounted) {
					liveQuery.unsubscribe?.();
					return;
				}

				console.log("[useDocuments] Live query subscribed");

				liveQuery.subscribe((result: { rows: DocumentElectric[] }) => {
					if (!mounted || !result.rows) return;

					// DEBUG: Log first few raw documents to see what's coming from Electric
					console.log("[useDocuments] Raw data sample:", result.rows.slice(0, 3));

					const validItems = result.rows.filter(isValidDocument);
					const isFullySynced = syncHandleRef.current?.isUpToDate ?? false;

					console.log(
						`[useDocuments] Live update: ${result.rows.length} raw, ${validItems.length} valid, synced: ${isFullySynced}`
					);

					// Fetch user names for new users (non-blocking)
					const unknownUserIds = validItems
						.filter(
							(doc): doc is DocumentElectric & { created_by_id: string } =>
								doc.created_by_id !== null && !userCacheRef.current.has(doc.created_by_id)
						)
						.map((doc) => doc.created_by_id);

					if (unknownUserIds.length > 0) {
						// NOTE(review): this only requests the first page of 20 documents, so
						// it presumably resolves names of recent creators only - confirm.
						documentsApiService
							.getDocuments({
								queryParams: { search_space_id: spaceId, page: 0, page_size: 20 },
							})
							.then((response) => {
								populateUserCache(response.items);
								if (mounted) {
									// Re-resolve names for every row now that the cache has grown
									setDocuments((prev) =>
										prev.map((doc) => ({
											...doc,
											created_by_name: doc.created_by_id
												? (userCacheRef.current.get(doc.created_by_id) ?? null)
												: null,
										}))
									);
								}
							})
							.catch((err) => {
								// Name resolution is best-effort; report instead of hiding the failure
								console.warn("[useDocuments] Failed to refresh user names:", err);
							});
					}

					// Smart update logic based on sync state
					setDocuments((prev) => {
						// Don't process if API hasn't loaded yet
						if (!apiLoadedRef.current) {
							console.log("[useDocuments] Waiting for API load, skipping live update");
							return prev;
						}

						// Case 1: Live query is empty
						if (validItems.length === 0) {
							if (isFullySynced && prev.length > 0) {
								// Electric is fully synced and says 0 items - trust it (all deleted)
								console.log("[useDocuments] All documents deleted (Electric synced)");
								return [];
							}
							// Partial sync or error - keep existing
							console.log("[useDocuments] Empty live result, keeping existing");
							return prev;
						}

						// Case 2: Electric is fully synced - TRUST IT COMPLETELY (handles bulk deletes)
						if (isFullySynced) {
							const liveDocs = deduplicateAndSort(validItems.map(electricToDisplayDoc));
							console.log(
								`[useDocuments] Synced update: ${liveDocs.length} docs (was ${prev.length})`
							);
							return liveDocs;
						}

						// Case 3: Partial sync - only ADD new items, don't remove any
						const existingIds = new Set(prev.map((d) => d.id));

						// Index live rows by id once so the merge below is a constant-time
						// lookup per row; the first row for an id wins.
						const liveById = new Map<DocumentDisplay["id"], DocumentElectric>();
						for (const item of validItems) {
							if (!liveById.has(item.id)) liveById.set(item.id, item);
						}

						// Find new items (in live but not in prev)
						const newItems = validItems
							.filter((item) => !existingIds.has(item.id))
							.map(electricToDisplayDoc);

						// Find updated items (in both, update with latest data)
						const updatedPrev = prev.map((doc) => {
							const liveItem = liveById.get(doc.id);
							return liveItem ? electricToDisplayDoc(liveItem) : doc;
						});

						if (newItems.length > 0) {
							console.log(`[useDocuments] Adding ${newItems.length} new items (partial sync)`);
							return deduplicateAndSort([...newItems, ...updatedPrev]);
						}

						return updatedPrev;
					});
				});

				liveQueryRef.current = liveQuery;
			} catch (err) {
				console.error("[useDocuments] Electric setup failed:", err);
				// Don't set error - API data is already loaded
			}
		}

		setupElectricRealtime();

		return () => {
			mounted = false;
			if (syncHandleRef.current) {
				syncHandleRef.current.unsubscribe();
				syncHandleRef.current = null;
			}
			if (liveQueryRef.current) {
				liveQueryRef.current.unsubscribe?.();
				liveQueryRef.current = null;
			}
		};
	}, [searchSpaceId, electricClient, typeFilter, electricToDisplayDoc, populateUserCache]);

	// Track previous searchSpaceId to detect actual changes
	const prevSearchSpaceIdRef = useRef<number | null>(null);

	// Reset on search space change (not on initial mount)
	useEffect(() => {
		if (prevSearchSpaceIdRef.current !== null && prevSearchSpaceIdRef.current !== searchSpaceId) {
			setDocuments([]);
			apiLoadedRef.current = false;
			userCacheRef.current.clear();
		}
		prevSearchSpaceIdRef.current = searchSpaceId;
	}, [searchSpaceId]);

	return {
		documents,
		typeCounts,
		total: documents.length,
		loading,
		error,
	};
}
|
||||
|
|
@ -38,10 +38,14 @@ function deduplicateAndSort(items: InboxItem[]): InboxItem[] {
|
|||
|
||||
/**
|
||||
* Calculate the cutoff date for sync window
|
||||
* IMPORTANT: Rounds to the start of the day (midnight UTC) to ensure stable values
|
||||
* across re-renders. Without this, millisecond differences cause multiple syncs!
|
||||
*/
|
||||
function getSyncCutoffDate(): string {
|
||||
const cutoff = new Date();
|
||||
cutoff.setDate(cutoff.getDate() - SYNC_WINDOW_DAYS);
|
||||
// Round to start of day to prevent millisecond differences causing duplicate syncs
|
||||
cutoff.setUTCHours(0, 0, 0, 0);
|
||||
return cutoff.toISOString();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -12,10 +12,21 @@
|
|||
* 3. Works even if logout cleanup fails
|
||||
*/
|
||||
|
||||
import { PGlite } from "@electric-sql/pglite";
|
||||
import { PGlite, type Transaction } from "@electric-sql/pglite";
|
||||
import { live } from "@electric-sql/pglite/live";
|
||||
import { electricSync } from "@electric-sql/pglite-sync";
|
||||
|
||||
// Debug logging - only logs in development, silent in production
|
||||
const IS_DEV = process.env.NODE_ENV === "development";
|
||||
|
||||
/**
 * Forward the arguments to `console.log`, but only when IS_DEV is set.
 *
 * @param args - Values passed through unchanged to `console.log`
 */
function debugLog(...args: unknown[]) {
	if (!IS_DEV) return;
	console.log(...args);
}
|
||||
|
||||
/**
 * Forward the arguments to `console.warn`, but only when IS_DEV is set.
 *
 * @param args - Values passed through unchanged to `console.warn`
 */
function debugWarn(...args: unknown[]) {
	if (!IS_DEV) return;
	console.warn(...args);
}
|
||||
|
||||
// Types
|
||||
export interface ElectricClient {
|
||||
db: PGlite;
|
||||
|
|
@ -56,7 +67,14 @@ const pendingSyncs = new Map<string, Promise<SyncHandle>>();
|
|||
// v2: user-specific database architecture
|
||||
// v3: consistent cutoff date for sync+queries, visibility refresh support
|
||||
// v4: heartbeat-based stale notification detection with updated_at tracking
|
||||
const SYNC_VERSION = 4;
|
||||
// v5: fixed duplicate key errors (root cause: unstable cutoff dates in use-inbox.ts)
|
||||
// - added onMustRefetch handler for server-side refetch scenarios
|
||||
// - fixed getSyncCutoffDate to use stable midnight UTC timestamps
|
||||
// v6: real-time documents table - added title and created_by_id columns for live document display
|
||||
// v7: removed use-documents-electric.ts - consolidated to single documents sync to prevent conflicts
|
||||
// v8: added status column for real-time document processing status (ready/processing/failed)
|
||||
// v9: added pending state for accurate document queue visibility
|
||||
const SYNC_VERSION = 11;
|
||||
|
||||
// Database name prefix for identifying SurfSense databases
|
||||
const DB_PREFIX = "surfsense-";
|
||||
|
|
@ -77,7 +95,7 @@ function getDbName(userId: string): string {
|
|||
}
|
||||
|
||||
/**
|
||||
* Clean up databases from OTHER users (not the current user)
|
||||
* Clean up databases from OTHER users AND old versions
|
||||
* This is called on login to ensure clean state
|
||||
*/
|
||||
async function cleanupOtherUserDatabases(currentUserId: string): Promise<void> {
|
||||
|
|
@ -85,6 +103,10 @@ async function cleanupOtherUserDatabases(currentUserId: string): Promise<void> {
|
|||
return;
|
||||
}
|
||||
|
||||
// The exact database identifier we want to keep (current user + current version)
|
||||
// Format: "surfsense-{userId}-v{version}"
|
||||
const currentDbIdentifier = `${DB_PREFIX}${currentUserId}-v${SYNC_VERSION}`;
|
||||
|
||||
try {
|
||||
// Try to list all databases (not supported in all browsers)
|
||||
if (typeof window.indexedDB.databases === "function") {
|
||||
|
|
@ -95,26 +117,27 @@ async function cleanupOtherUserDatabases(currentUserId: string): Promise<void> {
|
|||
if (!dbName) continue;
|
||||
|
||||
// Check if this is a SurfSense database
|
||||
if (dbName.startsWith(DB_PREFIX) || dbName.includes("surfsense")) {
|
||||
// Don't delete current user's database
|
||||
if (dbName.includes(currentUserId)) {
|
||||
console.log(`[Electric] Keeping current user's database: ${dbName}`);
|
||||
if (dbName.includes("surfsense")) {
|
||||
// Check if this is the current database
|
||||
// PGlite stores with "/pglite/" prefix, so we check if the name ENDS WITH our identifier
|
||||
if (dbName.endsWith(currentDbIdentifier)) {
|
||||
debugLog(`[Electric] Keeping current database: ${dbName}`);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Delete databases from other users
|
||||
// Delete ALL other databases (other users OR old versions of current user)
|
||||
try {
|
||||
console.log(`[Electric] Deleting stale database: ${dbName}`);
|
||||
debugLog(`[Electric] Deleting stale database: ${dbName}`);
|
||||
window.indexedDB.deleteDatabase(dbName);
|
||||
} catch (deleteErr) {
|
||||
console.warn(`[Electric] Failed to delete database ${dbName}:`, deleteErr);
|
||||
debugWarn(`[Electric] Failed to delete database ${dbName}:`, deleteErr);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (err) {
|
||||
// indexedDB.databases() not supported - that's okay, login cleanup is best-effort
|
||||
console.warn("[Electric] Could not enumerate databases for cleanup:", err);
|
||||
debugWarn("[Electric] Could not enumerate databases for cleanup:", err);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -140,7 +163,7 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
|
||||
// If initialized for a different user, close the old client first
|
||||
if (electricClient && currentUserId !== userId) {
|
||||
console.log(`[Electric] User changed from ${currentUserId} to ${userId}, reinitializing...`);
|
||||
debugLog(`[Electric] User changed from ${currentUserId} to ${userId}, reinitializing...`);
|
||||
await cleanupElectric();
|
||||
}
|
||||
|
||||
|
|
@ -155,12 +178,12 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
initPromise = (async () => {
|
||||
try {
|
||||
// STEP 1: Clean up databases from other users (login-time cleanup)
|
||||
console.log("[Electric] Cleaning up databases from other users...");
|
||||
debugLog("[Electric] Cleaning up databases from other users...");
|
||||
await cleanupOtherUserDatabases(userId);
|
||||
|
||||
// STEP 2: Create user-specific PGlite database
|
||||
const dbName = getDbName(userId);
|
||||
console.log(`[Electric] Initializing database: ${dbName}`);
|
||||
debugLog(`[Electric] Initializing database: ${dbName}`);
|
||||
|
||||
const db = await PGlite.create({
|
||||
dataDir: dbName,
|
||||
|
|
@ -216,18 +239,22 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
`);
|
||||
|
||||
// Create the documents table schema in PGlite
|
||||
// Only sync minimal fields needed for type counts: id, document_type, search_space_id
|
||||
// Sync columns needed for real-time table display (lightweight - no content/metadata)
|
||||
await db.exec(`
|
||||
CREATE TABLE IF NOT EXISTS documents (
|
||||
id INTEGER PRIMARY KEY,
|
||||
search_space_id INTEGER NOT NULL,
|
||||
document_type TEXT NOT NULL,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
title TEXT NOT NULL DEFAULT '',
|
||||
created_by_id TEXT,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
status JSONB DEFAULT '{"state": "ready"}'::jsonb
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_documents_search_space_id ON documents(search_space_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_documents_type ON documents(document_type);
|
||||
CREATE INDEX IF NOT EXISTS idx_documents_search_space_type ON documents(search_space_id, document_type);
|
||||
CREATE INDEX IF NOT EXISTS idx_documents_status ON documents((status->>'state'));
|
||||
`);
|
||||
|
||||
await db.exec(`
|
||||
|
|
@ -290,14 +317,14 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
// Check if we already have an active sync for this shape (memory optimization)
|
||||
const existingHandle = activeSyncHandles.get(cacheKey);
|
||||
if (existingHandle) {
|
||||
console.log(`[Electric] Reusing existing sync handle for: ${cacheKey}`);
|
||||
debugLog(`[Electric] Reusing existing sync handle for: ${cacheKey}`);
|
||||
return existingHandle;
|
||||
}
|
||||
|
||||
// Check if there's already a pending sync for this shape (prevent race condition)
|
||||
const pendingSync = pendingSyncs.get(cacheKey);
|
||||
if (pendingSync) {
|
||||
console.log(`[Electric] Waiting for pending sync to complete: ${cacheKey}`);
|
||||
debugLog(`[Electric] Waiting for pending sync to complete: ${cacheKey}`);
|
||||
return pendingSync;
|
||||
}
|
||||
|
||||
|
|
@ -323,7 +350,7 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
|
||||
if (singleQuoteCount % 2 !== 0) {
|
||||
// Odd number of quotes means unterminated string literal
|
||||
console.warn("Where clause has unmatched quotes, fixing:", where);
|
||||
debugWarn("Where clause has unmatched quotes, fixing:", where);
|
||||
// Add closing quote at the end
|
||||
validatedWhere = `${where}'`;
|
||||
params.where = validatedWhere;
|
||||
|
|
@ -337,15 +364,15 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
|
||||
if (columns) params.columns = columns.join(",");
|
||||
|
||||
console.log("[Electric] Syncing shape with params:", params);
|
||||
console.log("[Electric] Electric URL:", `${electricUrl}/v1/shape`);
|
||||
console.log("[Electric] Where clause:", where, "Validated:", validatedWhere);
|
||||
debugLog("[Electric] Syncing shape with params:", params);
|
||||
debugLog("[Electric] Electric URL:", `${electricUrl}/v1/shape`);
|
||||
debugLog("[Electric] Where clause:", where, "Validated:", validatedWhere);
|
||||
|
||||
try {
|
||||
// Debug: Test Electric SQL connection directly first (DEV ONLY - skipped in production)
|
||||
if (process.env.NODE_ENV === "development") {
|
||||
const testUrl = `${electricUrl}/v1/shape?table=${table}&offset=-1${validatedWhere ? `&where=${encodeURIComponent(validatedWhere)}` : ""}`;
|
||||
console.log("[Electric] Testing Electric SQL directly:", testUrl);
|
||||
debugLog("[Electric] Testing Electric SQL directly:", testUrl);
|
||||
try {
|
||||
const testResponse = await fetch(testUrl);
|
||||
const testHeaders = {
|
||||
|
|
@ -353,9 +380,9 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
offset: testResponse.headers.get("electric-offset"),
|
||||
upToDate: testResponse.headers.get("electric-up-to-date"),
|
||||
};
|
||||
console.log("[Electric] Direct Electric SQL response headers:", testHeaders);
|
||||
debugLog("[Electric] Direct Electric SQL response headers:", testHeaders);
|
||||
const testData = await testResponse.json();
|
||||
console.log(
|
||||
debugLog(
|
||||
"[Electric] Direct Electric SQL data count:",
|
||||
Array.isArray(testData) ? testData.length : "not array",
|
||||
testData
|
||||
|
|
@ -396,14 +423,14 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
// Shorter timeout (5 seconds) as fallback
|
||||
setTimeout(() => {
|
||||
if (!syncResolved) {
|
||||
console.warn(
|
||||
debugWarn(
|
||||
`[Electric] ⚠️ Sync timeout for ${table} - checking isUpToDate one more time...`
|
||||
);
|
||||
// Check isUpToDate one more time before resolving
|
||||
// This will be checked after shape is created
|
||||
setTimeout(() => {
|
||||
if (!syncResolved) {
|
||||
console.warn(
|
||||
debugWarn(
|
||||
`[Electric] ⚠️ Sync timeout for ${table} - resolving anyway after 5s`
|
||||
);
|
||||
resolveInitialSync();
|
||||
|
|
@ -413,7 +440,22 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
}, 5000);
|
||||
});
|
||||
|
||||
// Include userId in shapeKey for user-specific sync state
|
||||
// ROOT CAUSE FIX: The duplicate key errors were caused by unstable cutoff dates
|
||||
// in use-inbox.ts generating different sync keys on each render.
|
||||
// That's now fixed (rounded to midnight UTC in getSyncCutoffDate).
|
||||
// We can safely use shapeKey for fast incremental sync.
|
||||
|
||||
const shapeKey = `${userId}_v${SYNC_VERSION}_${table}_${where?.replace(/[^a-zA-Z0-9]/g, "_") || "all"}`;
|
||||
|
||||
// Type assertion to PGlite with electric extension
|
||||
const pgWithElectric = db as unknown as {
|
||||
electric: {
|
||||
syncShapeToTable: (
|
||||
config: Record<string, unknown>
|
||||
) => Promise<{ unsubscribe: () => void; isUpToDate: boolean; stream: unknown }>;
|
||||
};
|
||||
};
|
||||
|
||||
const shapeConfig = {
|
||||
shape: {
|
||||
url: `${electricUrl}/v1/shape`,
|
||||
|
|
@ -425,9 +467,9 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
},
|
||||
table,
|
||||
primaryKey,
|
||||
shapeKey: `${userId}_v${SYNC_VERSION}_${table}_${where?.replace(/[^a-zA-Z0-9]/g, "_") || "all"}`, // User-specific versioned key
|
||||
shapeKey, // Re-enabled for fast incremental sync (root cause in use-inbox.ts is fixed)
|
||||
onInitialSync: () => {
|
||||
console.log(
|
||||
debugLog(
|
||||
`[Electric] ✅ Initial sync complete for ${table} - data should now be in PGlite`
|
||||
);
|
||||
resolveInitialSync();
|
||||
|
|
@ -440,21 +482,37 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
);
|
||||
rejectInitialSync(error);
|
||||
},
|
||||
// Handle must-refetch: clear table data before Electric re-inserts from scratch
|
||||
// This prevents "duplicate key" errors when the shape is invalidated
|
||||
onMustRefetch: async (tx: Transaction) => {
|
||||
debugLog(
|
||||
`[Electric] ⚠️ Must refetch triggered for ${table} - clearing existing data`
|
||||
);
|
||||
try {
|
||||
// Delete rows matching the shape's WHERE clause
|
||||
// If no WHERE clause, delete all rows from the table
|
||||
if (validatedWhere) {
|
||||
// Parse the WHERE clause to build a DELETE statement
|
||||
// The WHERE clause is already validated and formatted
|
||||
await tx.exec(`DELETE FROM ${table} WHERE ${validatedWhere}`);
|
||||
debugLog(`[Electric] 🗑️ Cleared ${table} rows matching: ${validatedWhere}`);
|
||||
} else {
|
||||
// No WHERE clause means we're syncing the entire table
|
||||
await tx.exec(`DELETE FROM ${table}`);
|
||||
debugLog(`[Electric] 🗑️ Cleared all rows from ${table}`);
|
||||
}
|
||||
} catch (cleanupError) {
|
||||
console.error(
|
||||
`[Electric] ❌ Failed to clear ${table} during must-refetch:`,
|
||||
cleanupError
|
||||
);
|
||||
// Re-throw to let Electric handle the error
|
||||
throw cleanupError;
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
console.log(
|
||||
"[Electric] syncShapeToTable config:",
|
||||
JSON.stringify(shapeConfig, null, 2)
|
||||
);
|
||||
|
||||
// Type assertion to PGlite with electric extension
|
||||
const pgWithElectric = db as PGlite & {
|
||||
electric: {
|
||||
syncShapeToTable: (
|
||||
config: typeof shapeConfig
|
||||
) => Promise<{ unsubscribe: () => void; isUpToDate: boolean; stream: unknown }>;
|
||||
};
|
||||
};
|
||||
debugLog("[Electric] syncShapeToTable config:", JSON.stringify(shapeConfig, null, 2));
|
||||
|
||||
let shape: { unsubscribe: () => void; isUpToDate: boolean; stream: unknown };
|
||||
try {
|
||||
|
|
@ -464,7 +522,7 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
const errorMessage =
|
||||
syncError instanceof Error ? syncError.message : String(syncError);
|
||||
if (errorMessage.includes("Already syncing")) {
|
||||
console.warn(
|
||||
debugWarn(
|
||||
`[Electric] Already syncing ${table}, waiting for existing sync to settle...`
|
||||
);
|
||||
|
||||
|
|
@ -474,12 +532,12 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
// Check if an active handle now exists (another sync might have completed)
|
||||
const existingHandle = activeSyncHandles.get(cacheKey);
|
||||
if (existingHandle) {
|
||||
console.log(`[Electric] Found existing handle after waiting: ${cacheKey}`);
|
||||
debugLog(`[Electric] Found existing handle after waiting: ${cacheKey}`);
|
||||
return existingHandle;
|
||||
}
|
||||
|
||||
// Retry once after waiting
|
||||
console.log(`[Electric] Retrying sync for ${table}...`);
|
||||
debugLog(`[Electric] Retrying sync for ${table}...`);
|
||||
try {
|
||||
shape = await pgWithElectric.electric.syncShapeToTable(shapeConfig);
|
||||
} catch (retryError) {
|
||||
|
|
@ -487,12 +545,10 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
retryError instanceof Error ? retryError.message : String(retryError);
|
||||
if (retryMessage.includes("Already syncing")) {
|
||||
// Still syncing - create a placeholder handle that indicates the table is being synced
|
||||
console.warn(
|
||||
`[Electric] ${table} still syncing, creating placeholder handle`
|
||||
);
|
||||
debugWarn(`[Electric] ${table} still syncing, creating placeholder handle`);
|
||||
const placeholderHandle: SyncHandle = {
|
||||
unsubscribe: () => {
|
||||
console.log(`[Electric] Placeholder unsubscribe for: ${cacheKey}`);
|
||||
debugLog(`[Electric] Placeholder unsubscribe for: ${cacheKey}`);
|
||||
activeSyncHandles.delete(cacheKey);
|
||||
},
|
||||
get isUpToDate() {
|
||||
|
|
@ -516,7 +572,7 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
}
|
||||
|
||||
// Log the actual shape result structure
|
||||
console.log("[Electric] Shape sync result (initial):", {
|
||||
debugLog("[Electric] Shape sync result (initial):", {
|
||||
hasUnsubscribe: typeof shape?.unsubscribe === "function",
|
||||
isUpToDate: shape?.isUpToDate,
|
||||
hasStream: !!shape?.stream,
|
||||
|
|
@ -525,7 +581,7 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
|
||||
// Recommended Approach Step 1: Check isUpToDate immediately
|
||||
if (shape.isUpToDate) {
|
||||
console.log(
|
||||
debugLog(
|
||||
`[Electric] ✅ Sync already up-to-date for ${table} (resuming from previous state)`
|
||||
);
|
||||
resolveInitialSync();
|
||||
|
|
@ -533,7 +589,7 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
// Recommended Approach Step 2: Subscribe to stream and watch for "up-to-date" message
|
||||
if (shape?.stream) {
|
||||
const stream = shape.stream as any;
|
||||
console.log("[Electric] Shape stream details:", {
|
||||
debugLog("[Electric] Shape stream details:", {
|
||||
shapeHandle: stream?.shapeHandle,
|
||||
lastOffset: stream?.lastOffset,
|
||||
isUpToDate: stream?.isUpToDate,
|
||||
|
|
@ -546,14 +602,14 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
// NOTE: We keep this subscription active - don't unsubscribe!
|
||||
// The stream is what Electric SQL uses for real-time updates
|
||||
if (typeof stream?.subscribe === "function") {
|
||||
console.log(
|
||||
debugLog(
|
||||
"[Electric] Subscribing to shape stream to watch for up-to-date message..."
|
||||
);
|
||||
// Subscribe but don't store unsubscribe - we want it to stay active
|
||||
stream.subscribe((messages: unknown[]) => {
|
||||
// Continue receiving updates even after sync is resolved
|
||||
if (!syncResolved) {
|
||||
console.log(
|
||||
debugLog(
|
||||
"[Electric] 🔵 Shape stream received messages:",
|
||||
messages?.length || 0
|
||||
);
|
||||
|
|
@ -570,14 +626,14 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
(typeof msg === "object" && "up-to-date" in msg)
|
||||
) {
|
||||
if (!syncResolved) {
|
||||
console.log(`[Electric] ✅ Received up-to-date message for ${table}`);
|
||||
debugLog(`[Electric] ✅ Received up-to-date message for ${table}`);
|
||||
resolveInitialSync();
|
||||
}
|
||||
// Continue listening for real-time updates - don't return!
|
||||
}
|
||||
}
|
||||
if (!syncResolved && messages.length > 0) {
|
||||
console.log(
|
||||
debugLog(
|
||||
"[Electric] First message:",
|
||||
JSON.stringify(messages[0], null, 2)
|
||||
);
|
||||
|
|
@ -586,16 +642,14 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
|
||||
// Also check stream's isUpToDate property after receiving messages
|
||||
if (!syncResolved && stream?.isUpToDate) {
|
||||
console.log(`[Electric] ✅ Stream isUpToDate is true for ${table}`);
|
||||
debugLog(`[Electric] ✅ Stream isUpToDate is true for ${table}`);
|
||||
resolveInitialSync();
|
||||
}
|
||||
});
|
||||
|
||||
// Also check stream's isUpToDate property immediately
|
||||
if (stream?.isUpToDate) {
|
||||
console.log(
|
||||
`[Electric] ✅ Stream isUpToDate is true immediately for ${table}`
|
||||
);
|
||||
debugLog(`[Electric] ✅ Stream isUpToDate is true immediately for ${table}`);
|
||||
resolveInitialSync();
|
||||
}
|
||||
}
|
||||
|
|
@ -608,9 +662,7 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
}
|
||||
|
||||
if (shape.isUpToDate || stream?.isUpToDate) {
|
||||
console.log(
|
||||
`[Electric] ✅ Sync completed (detected via polling) for ${table}`
|
||||
);
|
||||
debugLog(`[Electric] ✅ Sync completed (detected via polling) for ${table}`);
|
||||
clearInterval(pollInterval);
|
||||
resolveInitialSync();
|
||||
}
|
||||
|
|
@ -621,7 +673,7 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
clearInterval(pollInterval);
|
||||
});
|
||||
} else {
|
||||
console.warn(
|
||||
debugWarn(
|
||||
`[Electric] ⚠️ No stream available for ${table}, relying on callback and timeout`
|
||||
);
|
||||
}
|
||||
|
|
@ -630,7 +682,7 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
// Create the sync handle with proper cleanup
|
||||
const syncHandle: SyncHandle = {
|
||||
unsubscribe: () => {
|
||||
console.log(`[Electric] Unsubscribing from: ${cacheKey}`);
|
||||
debugLog(`[Electric] Unsubscribing from: ${cacheKey}`);
|
||||
// Remove from cache first
|
||||
activeSyncHandles.delete(cacheKey);
|
||||
// Then unsubscribe from the shape
|
||||
|
|
@ -648,7 +700,7 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
|
||||
// Cache the sync handle for reuse (memory optimization)
|
||||
activeSyncHandles.set(cacheKey, syncHandle);
|
||||
console.log(
|
||||
debugLog(
|
||||
`[Electric] Cached sync handle for: ${cacheKey} (total cached: ${activeSyncHandles.size})`
|
||||
);
|
||||
|
||||
|
|
@ -660,7 +712,7 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
const response = await fetch(`${electricUrl}/v1/shape?table=${table}&offset=-1`, {
|
||||
method: "GET",
|
||||
});
|
||||
console.log(
|
||||
debugLog(
|
||||
"[Electric] Electric SQL server response:",
|
||||
response.status,
|
||||
response.statusText
|
||||
|
|
@ -682,14 +734,14 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
// Clean up the pending sync when done (whether success or failure)
|
||||
syncPromise.finally(() => {
|
||||
pendingSyncs.delete(cacheKey);
|
||||
console.log(`[Electric] Pending sync removed for: ${cacheKey}`);
|
||||
debugLog(`[Electric] Pending sync removed for: ${cacheKey}`);
|
||||
});
|
||||
|
||||
return syncPromise;
|
||||
},
|
||||
};
|
||||
|
||||
console.log(`[Electric] ✅ Initialized successfully for user: ${userId}`);
|
||||
debugLog(`[Electric] ✅ Initialized successfully for user: ${userId}`);
|
||||
return electricClient;
|
||||
} catch (error) {
|
||||
console.error("[Electric] Failed to initialize:", error);
|
||||
|
|
@ -715,10 +767,10 @@ export async function cleanupElectric(): Promise<void> {
|
|||
}
|
||||
|
||||
const userIdToClean = currentUserId;
|
||||
console.log(`[Electric] Cleaning up for user: ${userIdToClean}`);
|
||||
debugLog(`[Electric] Cleaning up for user: ${userIdToClean}`);
|
||||
|
||||
// Unsubscribe from all active sync handles first (memory cleanup)
|
||||
console.log(`[Electric] Unsubscribing from ${activeSyncHandles.size} active sync handles`);
|
||||
debugLog(`[Electric] Unsubscribing from ${activeSyncHandles.size} active sync handles`);
|
||||
// Copy keys to array to avoid mutation during iteration
|
||||
const handleKeys = Array.from(activeSyncHandles.keys());
|
||||
for (const key of handleKeys) {
|
||||
|
|
@ -727,7 +779,7 @@ export async function cleanupElectric(): Promise<void> {
|
|||
try {
|
||||
handle.unsubscribe();
|
||||
} catch (err) {
|
||||
console.warn(`[Electric] Failed to unsubscribe from ${key}:`, err);
|
||||
debugWarn(`[Electric] Failed to unsubscribe from ${key}:`, err);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -738,7 +790,7 @@ export async function cleanupElectric(): Promise<void> {
|
|||
try {
|
||||
// Close the PGlite database connection
|
||||
await electricClient.db.close();
|
||||
console.log("[Electric] Database closed");
|
||||
debugLog("[Electric] Database closed");
|
||||
} catch (error) {
|
||||
console.error("[Electric] Error closing database:", error);
|
||||
}
|
||||
|
|
@ -754,13 +806,13 @@ export async function cleanupElectric(): Promise<void> {
|
|||
try {
|
||||
const dbName = `${DB_PREFIX}${userIdToClean}-v${SYNC_VERSION}`;
|
||||
window.indexedDB.deleteDatabase(dbName);
|
||||
console.log(`[Electric] Deleted database: ${dbName}`);
|
||||
debugLog(`[Electric] Deleted database: ${dbName}`);
|
||||
} catch (err) {
|
||||
console.warn("[Electric] Failed to delete database:", err);
|
||||
debugWarn("[Electric] Failed to delete database:", err);
|
||||
}
|
||||
}
|
||||
|
||||
console.log("[Electric] Cleanup complete");
|
||||
debugLog("[Electric] Cleanup complete");
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -308,6 +308,7 @@
|
|||
"no_rows_selected": "No rows selected",
|
||||
"delete_success_count": "Successfully deleted {count} document(s)",
|
||||
"delete_partial_failed": "Some documents could not be deleted",
|
||||
"delete_success": "Document deleted successfully",
|
||||
"delete_error": "Error deleting documents",
|
||||
"filter_by_title": "Filter by title...",
|
||||
"bulk_delete": "Delete Selected",
|
||||
|
|
@ -328,7 +329,6 @@
|
|||
"filter_placeholder": "Filter by title...",
|
||||
"rows_per_page": "Rows per page",
|
||||
"refresh": "Refresh",
|
||||
"refresh_success": "Documents refreshed",
|
||||
"upload_documents": "Upload Documents",
|
||||
"create_shared_note": "Create Shared Note",
|
||||
"processing_documents": "Processing documents...",
|
||||
|
|
|
|||
|
|
@ -313,7 +313,6 @@
|
|||
"filter_placeholder": "按标题筛选...",
|
||||
"rows_per_page": "每页行数",
|
||||
"refresh": "刷新",
|
||||
"refresh_success": "文档已刷新",
|
||||
"upload_documents": "上传文档",
|
||||
"create_shared_note": "创建共享笔记",
|
||||
"processing_documents": "正在处理文档...",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue