mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 00:36:31 +02:00
feat: add document status management with JSONB column for processing states in documents
This commit is contained in:
parent
04884caeef
commit
aef59d04eb
13 changed files with 526 additions and 135 deletions
|
|
@ -0,0 +1,80 @@
|
|||
"""Add status column to documents table for per-document processing status
|
||||
|
||||
Revision ID: 92
|
||||
Revises: 91
|
||||
Create Date: 2026-02-05
|
||||
|
||||
Changes:
|
||||
1. Add status column (JSONB) to documents table
|
||||
2. Default value is {"state": "ready"} for backward compatibility
|
||||
3. Existing documents are set to ready status
|
||||
4. Index created for efficient status filtering
|
||||
"""
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
import sqlalchemy as sa
|
||||
|
||||
from alembic import op
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = "92"
|
||||
down_revision: str | None = "91"
|
||||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Add the ``status`` JSONB column to ``documents`` and index its state.

    Both statements are idempotent, so re-running the migration against a
    database that already has the column and/or the index is a no-op.
    """

    # 1. Add the status column. NOT NULL + DEFAULT makes PostgreSQL backfill
    #    every existing row with {"state": "ready"} and applies the same
    #    default to new rows.
    #
    #    ADD COLUMN IF NOT EXISTS is used instead of probing
    #    information_schema.columns by table name only: that probe is not
    #    schema-qualified, so a "documents.status" column in any other schema
    #    would make it skip the ALTER and the CREATE INDEX below would fail.
    op.execute(
        """
        ALTER TABLE documents
        ADD COLUMN IF NOT EXISTS status JSONB NOT NULL
        DEFAULT '{"state": "ready"}'::jsonb;
        """
    )

    # 2. Expression index on the extracted state so filtering by
    #    status->>'state' does not need to scan the whole table.
    #    NOTE(review): the ORM column is also declared with index=True, which
    #    would conventionally be named ix_documents_status as well - confirm
    #    the two definitions cannot collide in autogenerate/create_all.
    op.execute(
        """
        CREATE INDEX IF NOT EXISTS ix_documents_status
        ON documents ((status->>'state'));
        """
    )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Remove the ``status`` column (and its index) from ``documents``.

    Both statements are idempotent, so the downgrade can be re-run safely.
    """

    # Drop the expression index first so the intent is explicit, even though
    # dropping the column would remove dependent indexes as well.
    op.execute(
        """
        DROP INDEX IF EXISTS ix_documents_status;
        """
    )

    # DROP COLUMN IF EXISTS targets the actual table instead of probing
    # information_schema.columns by table name only, which is not
    # schema-qualified and can match a "documents" table in another schema.
    op.execute(
        """
        ALTER TABLE documents
        DROP COLUMN IF EXISTS status;
        """
    )
|
||||
|
||||
|
|
@ -16,13 +16,14 @@ from sqlalchemy.orm import selectinload
|
|||
|
||||
from app.config import config
|
||||
from app.connectors.composio_connector import ComposioConnector
|
||||
from app.db import Document, DocumentType
|
||||
from app.db import Document, DocumentStatus, DocumentType
|
||||
from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.tasks.connector_indexers.base import (
|
||||
calculate_date_range,
|
||||
check_duplicate_document_by_hash,
|
||||
safe_set_chunks,
|
||||
)
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
|
|
@ -266,18 +267,18 @@ async def index_composio_google_calendar(
|
|||
|
||||
documents_indexed = 0
|
||||
documents_skipped = 0
|
||||
duplicate_content_count = (
|
||||
0 # Track events skipped due to duplicate content_hash
|
||||
)
|
||||
documents_failed = 0 # Track events that failed processing
|
||||
duplicate_content_count = 0 # Track events skipped due to duplicate content_hash
|
||||
last_heartbeat_time = time.time()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 1: Analyze all events, create pending documents
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
# =======================================================================
|
||||
events_to_process = [] # List of dicts with document and event data
|
||||
new_documents_created = False
|
||||
|
||||
for event in events:
|
||||
# Send heartbeat periodically to indicate task is still alive
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
try:
|
||||
# Handle both standard Google API and potential Composio variations
|
||||
event_id = event.get("id", "") or event.get("eventId", "")
|
||||
|
|
@ -315,61 +316,24 @@ async def index_composio_google_calendar(
|
|||
|
||||
if existing_document:
|
||||
if existing_document.content_hash == content_hash:
|
||||
# Ensure status is ready (might have been stuck in processing/pending)
|
||||
if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY):
|
||||
existing_document.status = DocumentStatus.ready()
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Update existing
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
"event_id": event_id,
|
||||
"summary": summary,
|
||||
"start_time": start_time,
|
||||
"document_type": "Google Calendar Event (Composio)",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
markdown_content, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
summary_content = f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}"
|
||||
if location:
|
||||
summary_content += f"\nLocation: {location}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
chunks = await create_document_chunks(markdown_content)
|
||||
|
||||
existing_document.title = summary
|
||||
existing_document.content = summary_content
|
||||
existing_document.content_hash = content_hash
|
||||
existing_document.embedding = summary_embedding
|
||||
existing_document.document_metadata = {
|
||||
"event_id": event_id,
|
||||
"summary": summary,
|
||||
"start_time": start_time,
|
||||
"end_time": end_time,
|
||||
"location": location,
|
||||
"connector_id": connector_id,
|
||||
"source": "composio",
|
||||
}
|
||||
existing_document.chunks = chunks
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
|
||||
documents_indexed += 1
|
||||
|
||||
# Batch commit every 10 documents
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed} Google Calendar events processed so far"
|
||||
)
|
||||
await session.commit()
|
||||
# Queue existing document for update (will be set to processing in Phase 2)
|
||||
events_to_process.append({
|
||||
'document': existing_document,
|
||||
'is_new': False,
|
||||
'markdown_content': markdown_content,
|
||||
'content_hash': content_hash,
|
||||
'event_id': event_id,
|
||||
'summary': summary,
|
||||
'start_time': start_time,
|
||||
'end_time': end_time,
|
||||
'location': location,
|
||||
})
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
|
|
@ -380,46 +344,16 @@ async def index_composio_google_calendar(
|
|||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
# A document with the same content already exists (likely from standard connector)
|
||||
logger.info(
|
||||
f"Event {summary} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping to avoid duplicate content."
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
duplicate_content_count += 1
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Create new document
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata = {
|
||||
"event_id": event_id,
|
||||
"summary": summary,
|
||||
"start_time": start_time,
|
||||
"document_type": "Google Calendar Event (Composio)",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
markdown_content, user_llm, document_metadata
|
||||
)
|
||||
else:
|
||||
summary_content = (
|
||||
f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}"
|
||||
)
|
||||
if location:
|
||||
summary_content += f"\nLocation: {location}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
chunks = await create_document_chunks(markdown_content)
|
||||
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=summary,
|
||||
|
|
@ -436,19 +370,107 @@ async def index_composio_google_calendar(
|
|||
"toolkit_id": "googlecalendar",
|
||||
"source": "composio",
|
||||
},
|
||||
content=summary_content,
|
||||
content_hash=content_hash,
|
||||
content="Pending...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation - safe for async
|
||||
status=DocumentStatus.pending(), # Pending until processing starts
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
session.add(document)
|
||||
new_documents_created = True
|
||||
|
||||
events_to_process.append({
|
||||
'document': document,
|
||||
'is_new': True,
|
||||
'markdown_content': markdown_content,
|
||||
'content_hash': content_hash,
|
||||
'event_id': event_id,
|
||||
'summary': summary,
|
||||
'start_time': start_time,
|
||||
'end_time': end_time,
|
||||
'location': location,
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Phase 1 for event: {e!s}", exc_info=True)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info(f"Phase 1: Committing {len([e for e in events_to_process if e['is_new']])} pending documents")
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each document one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(events_to_process)} documents")
|
||||
|
||||
for item in events_to_process:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
document = item['document']
|
||||
try:
|
||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
# Heavy processing (LLM, embeddings, chunks)
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm:
|
||||
document_metadata_for_summary = {
|
||||
"event_id": item['event_id'],
|
||||
"summary": item['summary'],
|
||||
"start_time": item['start_time'],
|
||||
"document_type": "Google Calendar Event (Composio)",
|
||||
}
|
||||
summary_content, summary_embedding = await generate_document_summary(
|
||||
item['markdown_content'], user_llm, document_metadata_for_summary
|
||||
)
|
||||
else:
|
||||
summary_content = f"Calendar: {item['summary']}\n\nStart: {item['start_time']}\nEnd: {item['end_time']}"
|
||||
if item['location']:
|
||||
summary_content += f"\nLocation: {item['location']}"
|
||||
summary_embedding = config.embedding_model_instance.embed(
|
||||
summary_content
|
||||
)
|
||||
|
||||
chunks = await create_document_chunks(item['markdown_content'])
|
||||
|
||||
# Update document to READY with actual content
|
||||
document.title = item['summary']
|
||||
document.content = summary_content
|
||||
document.content_hash = item['content_hash']
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = {
|
||||
"event_id": item['event_id'],
|
||||
"summary": item['summary'],
|
||||
"start_time": item['start_time'],
|
||||
"end_time": item['end_time'],
|
||||
"location": item['location'],
|
||||
"connector_id": connector_id,
|
||||
"source": "composio",
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
documents_indexed += 1
|
||||
|
||||
# Batch commit every 10 documents
|
||||
# Batch commit every 10 documents (for ready status updates)
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed} Google Calendar events processed so far"
|
||||
|
|
@ -457,7 +479,13 @@ async def index_composio_google_calendar(
|
|||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing Calendar event: {e!s}", exc_info=True)
|
||||
documents_skipped += 1
|
||||
# Mark document as failed with reason (visible in UI)
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(e))
|
||||
document.updated_at = get_current_timestamp()
|
||||
except Exception as status_error:
|
||||
logger.error(f"Failed to update document status to failed: {status_error}")
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
|
||||
|
|
@ -490,10 +518,13 @@ async def index_composio_google_calendar(
|
|||
else:
|
||||
raise
|
||||
|
||||
# Build warning message if duplicates were found
|
||||
warning_message = None
|
||||
# Build warning message if there were issues
|
||||
warning_parts = []
|
||||
if duplicate_content_count > 0:
|
||||
warning_message = f"{duplicate_content_count} skipped (duplicate)"
|
||||
warning_parts.append(f"{duplicate_content_count} duplicate")
|
||||
if documents_failed > 0:
|
||||
warning_parts.append(f"{documents_failed} failed")
|
||||
warning_message = ", ".join(warning_parts) if warning_parts else None
|
||||
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
|
|
@ -501,13 +532,15 @@ async def index_composio_google_calendar(
|
|||
{
|
||||
"documents_indexed": documents_indexed,
|
||||
"documents_skipped": documents_skipped,
|
||||
"documents_failed": documents_failed,
|
||||
"duplicate_content_count": duplicate_content_count,
|
||||
},
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Composio Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped "
|
||||
f"({duplicate_content_count} due to duplicate content from other connectors)"
|
||||
f"Composio Google Calendar indexing completed: {documents_indexed} ready, "
|
||||
f"{documents_skipped} skipped, {documents_failed} failed "
|
||||
f"({duplicate_content_count} duplicate content)"
|
||||
)
|
||||
return documents_indexed, warning_message
|
||||
|
||||
|
|
|
|||
|
|
@ -100,6 +100,80 @@ class PodcastStatus(str, Enum):
|
|||
FAILED = "failed"
|
||||
|
||||
|
||||
class DocumentStatus:
|
||||
"""
|
||||
Helper class for document processing status (stored as JSONB).
|
||||
|
||||
Status values:
|
||||
- {"state": "ready"} - Document is fully processed and searchable
|
||||
- {"state": "pending"} - Document is queued, waiting to be processed
|
||||
- {"state": "processing"} - Document is currently being processed (only 1 at a time)
|
||||
- {"state": "failed", "reason": "..."} - Processing failed with reason
|
||||
|
||||
Usage:
|
||||
document.status = DocumentStatus.pending()
|
||||
document.status = DocumentStatus.processing()
|
||||
document.status = DocumentStatus.ready()
|
||||
document.status = DocumentStatus.failed("LLM rate limit exceeded")
|
||||
"""
|
||||
|
||||
# State constants
|
||||
READY = "ready"
|
||||
PENDING = "pending"
|
||||
PROCESSING = "processing"
|
||||
FAILED = "failed"
|
||||
|
||||
@staticmethod
|
||||
def ready() -> dict:
|
||||
"""Return status dict for a ready/searchable document."""
|
||||
return {"state": DocumentStatus.READY}
|
||||
|
||||
@staticmethod
|
||||
def pending() -> dict:
|
||||
"""Return status dict for a document waiting to be processed."""
|
||||
return {"state": DocumentStatus.PENDING}
|
||||
|
||||
@staticmethod
|
||||
def processing() -> dict:
|
||||
"""Return status dict for a document being processed."""
|
||||
return {"state": DocumentStatus.PROCESSING}
|
||||
|
||||
@staticmethod
|
||||
def failed(reason: str, **extra_details) -> dict:
|
||||
"""
|
||||
Return status dict for a failed document.
|
||||
|
||||
Args:
|
||||
reason: Human-readable failure reason
|
||||
**extra_details: Optional additional details (duplicate_of, error_code, etc.)
|
||||
"""
|
||||
status = {"state": DocumentStatus.FAILED, "reason": reason[:500]} # Truncate long reasons
|
||||
if extra_details:
|
||||
status.update(extra_details)
|
||||
return status
|
||||
|
||||
@staticmethod
|
||||
def get_state(status: dict | None) -> str | None:
|
||||
"""Extract state from status dict, returns None if invalid."""
|
||||
if status is None:
|
||||
return None
|
||||
return status.get("state") if isinstance(status, dict) else None
|
||||
|
||||
@staticmethod
|
||||
def is_state(status: dict | None, state: str) -> bool:
|
||||
"""Check if status matches a given state."""
|
||||
return DocumentStatus.get_state(status) == state
|
||||
|
||||
@staticmethod
|
||||
def get_failure_reason(status: dict | None) -> str | None:
|
||||
"""Extract failure reason from status dict."""
|
||||
if status is None or not isinstance(status, dict):
|
||||
return None
|
||||
if status.get("state") == DocumentStatus.FAILED:
|
||||
return status.get("reason")
|
||||
return None
|
||||
|
||||
|
||||
class LiteLLMProvider(str, Enum):
|
||||
"""
|
||||
Enum for LLM providers supported by LiteLLM.
|
||||
|
|
@ -785,6 +859,17 @@ class Document(BaseModel, TimestampMixin):
|
|||
index=True,
|
||||
)
|
||||
|
||||
# Processing status for real-time visibility (JSONB)
|
||||
# Format: {"state": "ready"} or {"state": "processing"} or {"state": "failed", "reason": "..."}
|
||||
# Default to {"state": "ready"} for backward compatibility with existing documents
|
||||
status = Column(
|
||||
JSONB,
|
||||
nullable=False,
|
||||
default=DocumentStatus.ready,
|
||||
server_default=text("'{\"state\": \"ready\"}'::jsonb"),
|
||||
index=True,
|
||||
)
|
||||
|
||||
# Relationships
|
||||
search_space = relationship("SearchSpace", back_populates="documents")
|
||||
created_by = relationship("User", back_populates="documents")
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ from app.db import (
|
|||
from app.schemas import (
|
||||
DocumentRead,
|
||||
DocumentsCreate,
|
||||
DocumentStatusSchema,
|
||||
DocumentTitleRead,
|
||||
DocumentTitleSearchResponse,
|
||||
DocumentUpdate,
|
||||
|
|
@ -271,6 +272,14 @@ async def read_documents(
|
|||
if doc.created_by:
|
||||
created_by_name = doc.created_by.display_name or doc.created_by.email
|
||||
|
||||
# Parse status from JSONB
|
||||
status_data = None
|
||||
if hasattr(doc, 'status') and doc.status:
|
||||
status_data = DocumentStatusSchema(
|
||||
state=doc.status.get("state", "ready"),
|
||||
reason=doc.status.get("reason"),
|
||||
)
|
||||
|
||||
api_documents.append(
|
||||
DocumentRead(
|
||||
id=doc.id,
|
||||
|
|
@ -285,6 +294,7 @@ async def read_documents(
|
|||
search_space_id=doc.search_space_id,
|
||||
created_by_id=doc.created_by_id,
|
||||
created_by_name=created_by_name,
|
||||
status=status_data,
|
||||
)
|
||||
)
|
||||
|
||||
|
|
@ -417,6 +427,14 @@ async def search_documents(
|
|||
if doc.created_by:
|
||||
created_by_name = doc.created_by.display_name or doc.created_by.email
|
||||
|
||||
# Parse status from JSONB
|
||||
status_data = None
|
||||
if hasattr(doc, 'status') and doc.status:
|
||||
status_data = DocumentStatusSchema(
|
||||
state=doc.status.get("state", "ready"),
|
||||
reason=doc.status.get("reason"),
|
||||
)
|
||||
|
||||
api_documents.append(
|
||||
DocumentRead(
|
||||
id=doc.id,
|
||||
|
|
@ -431,6 +449,7 @@ async def search_documents(
|
|||
search_space_id=doc.search_space_id,
|
||||
created_by_id=doc.created_by_id,
|
||||
created_by_name=created_by_name,
|
||||
status=status_data,
|
||||
)
|
||||
)
|
||||
|
||||
|
|
@ -806,6 +825,7 @@ async def delete_document(
|
|||
"""
|
||||
Delete a document.
|
||||
Requires DOCUMENTS_DELETE permission for the search space.
|
||||
Documents in "processing" state cannot be deleted.
|
||||
"""
|
||||
try:
|
||||
result = await session.execute(
|
||||
|
|
@ -818,6 +838,14 @@ async def delete_document(
|
|||
status_code=404, detail=f"Document with id {document_id} not found"
|
||||
)
|
||||
|
||||
# Check if document is pending or currently being processed
|
||||
doc_state = document.status.get("state") if document.status else None
|
||||
if doc_state in ("pending", "processing"):
|
||||
raise HTTPException(
|
||||
status_code=409, # Conflict
|
||||
detail="Cannot delete document while it is pending or being processed. Please wait for processing to complete.",
|
||||
)
|
||||
|
||||
# Check permission for the search space
|
||||
await check_permission(
|
||||
session,
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ from .documents import (
|
|||
DocumentBase,
|
||||
DocumentRead,
|
||||
DocumentsCreate,
|
||||
DocumentStatusSchema,
|
||||
DocumentTitleRead,
|
||||
DocumentTitleSearchResponse,
|
||||
DocumentUpdate,
|
||||
|
|
@ -87,6 +88,7 @@ __all__ = [
|
|||
# Document schemas
|
||||
"DocumentBase",
|
||||
"DocumentRead",
|
||||
"DocumentStatusSchema",
|
||||
"DocumentTitleRead",
|
||||
"DocumentTitleSearchResponse",
|
||||
"DocumentUpdate",
|
||||
|
|
|
|||
|
|
@ -41,6 +41,12 @@ class DocumentUpdate(DocumentBase):
|
|||
pass
|
||||
|
||||
|
||||
class DocumentStatusSchema(BaseModel):
    """Document processing status as returned by the documents API."""

    # One of "ready", "pending", "processing" or "failed".
    state: str
    # Failure reason text; presumably only set when state is "failed".
    reason: str | None = None
|
||||
|
||||
|
||||
class DocumentRead(BaseModel):
|
||||
id: int
|
||||
title: str
|
||||
|
|
@ -54,6 +60,7 @@ class DocumentRead(BaseModel):
|
|||
search_space_id: int
|
||||
created_by_id: UUID | None = None # User who created/uploaded this document
|
||||
created_by_name: str | None = None # Display name or email of the user who created this document
|
||||
status: DocumentStatusSchema | None = None # Processing status (ready, processing, failed)
|
||||
|
||||
model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
|
|
|
|||
|
|
@ -28,6 +28,34 @@ def get_current_timestamp() -> datetime:
|
|||
return datetime.now(UTC)
|
||||
|
||||
|
||||
def safe_set_chunks(document: Document, chunks: list) -> None:
    """
    Assign chunks to a document without triggering a lazy load.

    Plain assignment (`document.chunks = chunks`) makes SQLAlchemy load the
    existing collection first so it can compute the change set. In an async
    session that implicit load raises MissingGreenlet / greenlet_spawn
    errors. `set_committed_value` writes the collection directly and emits
    no attribute events, so no load happens regardless of how the document
    was fetched (with or without selectinload).

    NOTE(review): because the value is installed as "committed" state, no
    change history is recorded for the collection. Confirm that new chunks
    are actually persisted and that previously stored chunks are removed
    when this is called on a document that is already attached to a session.

    Args:
        document: The Document object to update
        chunks: List of Chunk objects to assign

    Example:
        # Instead of: document.chunks = chunks
        safe_set_chunks(document, chunks)
    """
    # Imported locally so this helper adds no module-level dependency.
    from sqlalchemy.orm.attributes import set_committed_value

    set_committed_value(document, 'chunks', chunks)
|
||||
|
||||
|
||||
async def check_duplicate_document_by_hash(
|
||||
session: AsyncSession, content_hash: str
|
||||
) -> Document | None:
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
"use client";
|
||||
|
||||
import { formatDistanceToNow } from "date-fns";
|
||||
import { Calendar, ChevronDown, ChevronUp, FileText, FileX, Loader2, Network, Plus, User } from "lucide-react";
|
||||
import { AlertCircle, Calendar, CheckCircle2, ChevronDown, ChevronUp, Clock, FileText, FileX, Loader2, Network, Plus, User } from "lucide-react";
|
||||
import { motion } from "motion/react";
|
||||
import { useTranslations } from "next-intl";
|
||||
import React, { useRef, useState, useEffect, useCallback } from "react";
|
||||
|
|
@ -17,6 +17,7 @@ import {
|
|||
DialogTitle,
|
||||
} from "@/components/ui/dialog";
|
||||
import { Skeleton } from "@/components/ui/skeleton";
|
||||
import { Spinner } from "@/components/ui/spinner";
|
||||
import {
|
||||
Table,
|
||||
TableBody,
|
||||
|
|
@ -29,7 +30,61 @@ import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip
|
|||
import { documentsApiService } from "@/lib/apis/documents-api.service";
|
||||
import { DocumentTypeChip } from "./DocumentTypeIcon";
|
||||
import { RowActions } from "./RowActions";
|
||||
import type { ColumnVisibility, Document } from "./types";
|
||||
import type { ColumnVisibility, Document, DocumentStatus } from "./types";
|
||||
|
||||
// Status indicator component for document processing status
|
||||
function StatusIndicator({ status }: { status?: DocumentStatus }) {
	// Documents without a status are treated as ready.
	const state = status?.state ?? "ready";

	// Per-state icon and tooltip content; the surrounding tooltip markup is shared.
	let icon: React.ReactNode;
	let label: React.ReactNode;
	let tooltipClassName: string | undefined;

	switch (state) {
		case "pending":
			icon = <Clock className="h-5 w-5 text-muted-foreground" />;
			label = "Pending - waiting to be processed";
			break;
		case "processing":
			icon = <Spinner size="sm" className="text-primary" />;
			label = "Processing...";
			break;
		case "failed":
			icon = <AlertCircle className="h-5 w-5 text-destructive" />;
			label = status?.reason || "Processing failed";
			// Failure reasons can be long, so cap the tooltip width.
			tooltipClassName = "max-w-xs";
			break;
		case "ready":
			icon = <CheckCircle2 className="h-5 w-5 text-muted-foreground/60" />;
			label = "Ready";
			break;
		default:
			// The value comes from the API at runtime; render nothing for a
			// state this client does not know instead of returning undefined.
			return null;
	}

	return (
		<Tooltip>
			<TooltipTrigger asChild>
				<div className="flex items-center justify-center">{icon}</div>
			</TooltipTrigger>
			<TooltipContent side="top" className={tooltipClassName}>
				{label}
			</TooltipContent>
		</Tooltip>
	);
}
|
||||
|
||||
export type SortKey = keyof Pick<Document, "title" | "document_type" | "created_at">;
|
||||
|
||||
|
|
@ -460,7 +515,7 @@ export function DocumentsTableShell({
|
|||
</TableHead>
|
||||
)}
|
||||
{columnVisibility.created_at && (
|
||||
<TableHead className="w-32">
|
||||
<TableHead className="w-32 border-r border-border/40">
|
||||
<SortableHeader
|
||||
sortKey="created_at"
|
||||
currentSortKey={sortKey}
|
||||
|
|
@ -472,6 +527,13 @@ export function DocumentsTableShell({
|
|||
</SortableHeader>
|
||||
</TableHead>
|
||||
)}
|
||||
{columnVisibility.status && (
|
||||
<TableHead className="w-20 text-center">
|
||||
<span className="text-sm font-medium text-muted-foreground/70">
|
||||
Status
|
||||
</span>
|
||||
</TableHead>
|
||||
)}
|
||||
<TableHead className="w-10">
|
||||
<span className="sr-only">Actions</span>
|
||||
</TableHead>
|
||||
|
|
@ -552,7 +614,7 @@ export function DocumentsTableShell({
|
|||
</TableCell>
|
||||
)}
|
||||
{columnVisibility.created_at && (
|
||||
<TableCell className="w-32 py-2.5 text-sm text-foreground">
|
||||
<TableCell className="w-32 py-2.5 text-sm text-foreground border-r border-border/40">
|
||||
<Tooltip>
|
||||
<TooltipTrigger asChild>
|
||||
<span className="cursor-default">{formatRelativeDate(doc.created_at)}</span>
|
||||
|
|
@ -563,6 +625,11 @@ export function DocumentsTableShell({
|
|||
</Tooltip>
|
||||
</TableCell>
|
||||
)}
|
||||
{columnVisibility.status && (
|
||||
<TableCell className="w-20 py-2.5 text-center">
|
||||
<StatusIndicator status={doc.status} />
|
||||
</TableCell>
|
||||
)}
|
||||
<TableCell className="w-10 py-2.5 text-center">
|
||||
<RowActions
|
||||
document={doc}
|
||||
|
|
@ -647,11 +714,14 @@ export function DocumentsTableShell({
|
|||
)}
|
||||
</div>
|
||||
</div>
|
||||
<RowActions
|
||||
document={doc}
|
||||
deleteDocument={deleteDocument}
|
||||
searchSpaceId={searchSpaceId}
|
||||
/>
|
||||
<div className="flex items-center gap-2">
|
||||
{columnVisibility.status && <StatusIndicator status={doc.status} />}
|
||||
<RowActions
|
||||
document={doc}
|
||||
deleteDocument={deleteDocument}
|
||||
searchSpaceId={searchSpaceId}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
</motion.div>
|
||||
);
|
||||
|
|
|
|||
|
|
@ -45,10 +45,17 @@ export function RowActions({
|
|||
document.document_type as (typeof EDITABLE_DOCUMENT_TYPES)[number]
|
||||
);
|
||||
|
||||
const isDeletable = !NON_DELETABLE_DOCUMENT_TYPES.includes(
|
||||
// Documents in "pending" or "processing" state should show disabled delete
|
||||
const isBeingProcessed = document.status?.state === "pending" || document.status?.state === "processing";
|
||||
|
||||
// SURFSENSE_DOCS are system-managed and should not show delete at all
|
||||
const shouldShowDelete = !NON_DELETABLE_DOCUMENT_TYPES.includes(
|
||||
document.document_type as (typeof NON_DELETABLE_DOCUMENT_TYPES)[number]
|
||||
);
|
||||
|
||||
// Delete is disabled while processing
|
||||
const isDeleteDisabled = isBeingProcessed;
|
||||
|
||||
const handleDelete = async () => {
|
||||
setIsDeleting(true);
|
||||
try {
|
||||
|
|
@ -87,10 +94,11 @@ export function RowActions({
|
|||
<Pencil className="mr-2 h-4 w-4" />
|
||||
<span>Edit</span>
|
||||
</DropdownMenuItem>
|
||||
{isDeletable && (
|
||||
{shouldShowDelete && (
|
||||
<DropdownMenuItem
|
||||
onClick={() => setIsDeleteOpen(true)}
|
||||
className="text-destructive focus:text-destructive"
|
||||
onClick={() => !isDeleteDisabled && setIsDeleteOpen(true)}
|
||||
disabled={isDeleteDisabled}
|
||||
className={isDeleteDisabled ? "text-muted-foreground cursor-not-allowed opacity-50" : "text-destructive focus:text-destructive"}
|
||||
>
|
||||
<Trash2 className="mr-2 h-4 w-4" />
|
||||
<span>Delete</span>
|
||||
|
|
@ -100,13 +108,13 @@ export function RowActions({
|
|||
</DropdownMenu>
|
||||
) : (
|
||||
// Non-editable documents: show only delete button directly
|
||||
isDeletable && (
|
||||
shouldShowDelete && (
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="icon"
|
||||
className="h-8 w-8 text-muted-foreground hover:text-destructive hover:bg-destructive/10"
|
||||
onClick={() => setIsDeleteOpen(true)}
|
||||
disabled={isDeleting}
|
||||
className={`h-8 w-8 ${isDeleteDisabled ? "text-muted-foreground/50 cursor-not-allowed" : "text-muted-foreground hover:text-destructive hover:bg-destructive/10"}`}
|
||||
onClick={() => !isDeleteDisabled && setIsDeleteOpen(true)}
|
||||
disabled={isDeleting || isDeleteDisabled}
|
||||
>
|
||||
<Trash2 className="h-4 w-4" />
|
||||
<span className="sr-only">Delete</span>
|
||||
|
|
@ -131,10 +139,11 @@ export function RowActions({
|
|||
<Pencil className="mr-2 h-4 w-4" />
|
||||
<span>Edit</span>
|
||||
</DropdownMenuItem>
|
||||
{isDeletable && (
|
||||
{shouldShowDelete && (
|
||||
<DropdownMenuItem
|
||||
onClick={() => setIsDeleteOpen(true)}
|
||||
className="text-destructive focus:text-destructive"
|
||||
onClick={() => !isDeleteDisabled && setIsDeleteOpen(true)}
|
||||
disabled={isDeleteDisabled}
|
||||
className={isDeleteDisabled ? "text-muted-foreground cursor-not-allowed opacity-50" : "text-destructive focus:text-destructive"}
|
||||
>
|
||||
<Trash2 className="mr-2 h-4 w-4" />
|
||||
<span>Delete</span>
|
||||
|
|
@ -144,13 +153,13 @@ export function RowActions({
|
|||
</DropdownMenu>
|
||||
) : (
|
||||
// Non-editable documents: show only delete button directly
|
||||
isDeletable && (
|
||||
shouldShowDelete && (
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="icon"
|
||||
className="h-8 w-8 text-muted-foreground hover:text-destructive hover:bg-destructive/10"
|
||||
onClick={() => setIsDeleteOpen(true)}
|
||||
disabled={isDeleting}
|
||||
className={`h-8 w-8 ${isDeleteDisabled ? "text-muted-foreground/50 cursor-not-allowed" : "text-muted-foreground hover:text-destructive hover:bg-destructive/10"}`}
|
||||
onClick={() => !isDeleteDisabled && setIsDeleteOpen(true)}
|
||||
disabled={isDeleting || isDeleteDisabled}
|
||||
>
|
||||
<Trash2 className="h-4 w-4" />
|
||||
<span className="sr-only">Delete</span>
|
||||
|
|
|
|||
|
|
@ -1,5 +1,10 @@
|
|||
export type DocumentType = string;
|
||||
|
||||
/**
 * Processing status attached to a document.
 *
 * `state` is restricted to the four string literals listed below.
 * `reason` is an optional free-form string (presumably an explanation for a
 * "failed" state — TODO confirm against the backend).
 */
export type DocumentStatus = {
|
||||
state: "ready" | "pending" | "processing" | "failed";
|
||||
reason?: string;
|
||||
};
|
||||
|
||||
export type Document = {
|
||||
id: number;
|
||||
title: string;
|
||||
|
|
@ -11,10 +16,12 @@ export type Document = {
|
|||
search_space_id: number;
|
||||
created_by_id?: string | null;
|
||||
created_by_name?: string | null;
|
||||
status?: DocumentStatus;
|
||||
};
|
||||
|
||||
/**
 * One boolean flag per documents-table column key
 * (`document_type`, `created_by`, `created_at`, `status`).
 * Presumably `true` means the column is shown — TODO confirm against the table component.
 */
export type ColumnVisibility = {
|
||||
document_type: boolean;
|
||||
created_by: boolean;
|
||||
created_at: boolean;
|
||||
status: boolean;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -38,6 +38,7 @@ export default function DocumentsTable() {
|
|||
document_type: true,
|
||||
created_by: true,
|
||||
created_at: true,
|
||||
status: true,
|
||||
});
|
||||
const [pageIndex, setPageIndex] = useState(0);
|
||||
const [sortKey, setSortKey] = useState<SortKey>("created_at");
|
||||
|
|
@ -115,6 +116,7 @@ export default function DocumentsTable() {
|
|||
created_by_id: item.created_by_id ?? null,
|
||||
created_by_name: item.created_by_name ?? null,
|
||||
created_at: item.created_at,
|
||||
status: (item as { status?: { state: "ready" | "pending" | "processing" | "failed"; reason?: string } }).status ?? { state: "ready" as const },
|
||||
}))
|
||||
: paginatedRealtimeDocuments;
|
||||
|
||||
|
|
@ -159,10 +161,35 @@ export default function DocumentsTable() {
|
|||
toast.error(t("no_rows_selected"));
|
||||
return;
|
||||
}
|
||||
|
||||
// Filter out pending/processing documents - they cannot be deleted
|
||||
// For real-time mode, use sortedRealtimeDocuments (which has status)
|
||||
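// For search mode, use searchResponse items (need to safely access status)
// NOTE(review): inProgressCount below is derived from selectedIds.size, so any selected ID that is
// missing from allDocs is also counted as "pending or processing" — confirm this is intended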
|
||||
const allDocs = isSearchMode
|
||||
? (searchResponse?.items || []).map(item => ({
|
||||
id: item.id,
|
||||
status: (item as { status?: { state: string } }).status,
|
||||
}))
|
||||
: sortedRealtimeDocuments.map(doc => ({ id: doc.id, status: doc.status }));
|
||||
|
||||
const selectedDocs = allDocs.filter((doc) => selectedIds.has(doc.id));
|
||||
const deletableIds = selectedDocs
|
||||
.filter((doc) => doc.status?.state !== "pending" && doc.status?.state !== "processing")
|
||||
.map((doc) => doc.id);
|
||||
const inProgressCount = selectedIds.size - deletableIds.length;
|
||||
|
||||
if (inProgressCount > 0) {
|
||||
toast.warning(`${inProgressCount} document(s) are pending or processing and cannot be deleted.`);
|
||||
}
|
||||
|
||||
if (deletableIds.length === 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
// Delete each document with its own mutation call; the calls run concurrently via Promise.all
|
||||
const results = await Promise.all(
|
||||
Array.from(selectedIds).map(async (id) => {
|
||||
deletableIds.map(async (id) => {
|
||||
try {
|
||||
await deleteDocumentMutation({ id });
|
||||
return true;
|
||||
|
|
@ -172,7 +199,7 @@ export default function DocumentsTable() {
|
|||
})
|
||||
);
|
||||
const okCount = results.filter((r) => r === true).length;
|
||||
if (okCount === selectedIds.size)
|
||||
if (okCount === deletableIds.length)
|
||||
toast.success(t("delete_success_count", { count: okCount }));
|
||||
else toast.error(t("delete_partial_failed"));
|
||||
|
||||
|
|
|
|||
|
|
@ -9,6 +9,12 @@ import { useElectricClient } from "@/lib/electric/context";
|
|||
// Stable empty array to prevent infinite re-renders when no typeFilter is provided
|
||||
const EMPTY_TYPE_FILTER: DocumentTypeEnum[] = [];
|
||||
|
||||
// Document status type (matches backend DocumentStatus JSONB).
// `state` is one of the four literals below; `reason` is an optional string
// (presumably populated for the "failed" state — TODO confirm).
|
||||
export interface DocumentStatusType {
|
||||
state: "ready" | "pending" | "processing" | "failed";
|
||||
reason?: string;
|
||||
}
|
||||
|
||||
// Document from Electric sync (lightweight table columns - NO content/metadata)
|
||||
interface DocumentElectric {
|
||||
id: number;
|
||||
|
|
@ -17,6 +23,7 @@ interface DocumentElectric {
|
|||
title: string;
|
||||
created_by_id: string | null;
|
||||
created_at: string;
|
||||
status: DocumentStatusType | null;
|
||||
}
|
||||
|
||||
// Document for display (with resolved user name)
|
||||
|
|
@ -28,6 +35,7 @@ export interface DocumentDisplay {
|
|||
created_by_id: string | null;
|
||||
created_by_name: string | null;
|
||||
created_at: string;
|
||||
status: DocumentStatusType;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -117,6 +125,7 @@ export function useDocuments(
|
|||
created_by_id?: string | null;
|
||||
created_by_name?: string | null;
|
||||
created_at: string;
|
||||
status?: DocumentStatusType | null;
|
||||
}): DocumentDisplay => ({
|
||||
id: item.id,
|
||||
search_space_id: item.search_space_id,
|
||||
|
|
@ -125,6 +134,7 @@ export function useDocuments(
|
|||
created_by_id: item.created_by_id ?? null,
|
||||
created_by_name: item.created_by_name ?? null,
|
||||
created_at: item.created_at,
|
||||
status: item.status ?? { state: "ready" },
|
||||
}),
|
||||
[]
|
||||
);
|
||||
|
|
@ -136,6 +146,7 @@ export function useDocuments(
|
|||
created_by_name: doc.created_by_id
|
||||
? userCacheRef.current.get(doc.created_by_id) ?? null
|
||||
: null,
|
||||
status: doc.status ?? { state: "ready" },
|
||||
}),
|
||||
[]
|
||||
);
|
||||
|
|
@ -221,7 +232,7 @@ export function useDocuments(
|
|||
const handle = await client.syncShape({
|
||||
table: "documents",
|
||||
where: `search_space_id = ${spaceId}`,
|
||||
columns: ["id", "document_type", "search_space_id", "title", "created_by_id", "created_at"],
|
||||
columns: ["id", "document_type", "search_space_id", "title", "created_by_id", "created_at", "status"],
|
||||
primaryKey: ["id"],
|
||||
});
|
||||
|
||||
|
|
@ -259,7 +270,7 @@ export function useDocuments(
|
|||
return;
|
||||
}
|
||||
|
||||
let query = `SELECT id, document_type, search_space_id, title, created_by_id, created_at
|
||||
let query = `SELECT id, document_type, search_space_id, title, created_by_id, created_at, status
|
||||
FROM documents
|
||||
WHERE search_space_id = $1`;
|
||||
|
||||
|
|
|
|||
|
|
@ -72,7 +72,9 @@ const pendingSyncs = new Map<string, Promise<SyncHandle>>();
|
|||
// - fixed getSyncCutoffDate to use stable midnight UTC timestamps
|
||||
// v6: real-time documents table - added title and created_by_id columns for live document display
|
||||
// v7: removed use-documents-electric.ts - consolidated to single documents sync to prevent conflicts
|
||||
const SYNC_VERSION = 7;
|
||||
// v8: added status column for real-time document processing status (ready/processing/failed)
|
||||
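// v9: added pending state for accurate document queue visibility
// NOTE(review): SYNC_VERSION below is 11, but no v10 or v11 entry is recorded in this changelog — TODO document those bumps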
|
||||
const SYNC_VERSION = 11;
|
||||
|
||||
// Database name prefix for identifying SurfSense databases
|
||||
const DB_PREFIX = "surfsense-";
|
||||
|
|
@ -245,12 +247,14 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
|
|||
document_type TEXT NOT NULL,
|
||||
title TEXT NOT NULL DEFAULT '',
|
||||
created_by_id TEXT,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
status JSONB DEFAULT '{"state": "ready"}'::jsonb
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_documents_search_space_id ON documents(search_space_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_documents_type ON documents(document_type);
|
||||
CREATE INDEX IF NOT EXISTS idx_documents_search_space_type ON documents(search_space_id, document_type);
|
||||
CREATE INDEX IF NOT EXISTS idx_documents_status ON documents((status->>'state'));
|
||||
`);
|
||||
|
||||
await db.exec(`
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue