feat: implement two-phase document indexing for Obsidian and Circleback connectors with real-time status updates

This commit is contained in:
Anish Sarkar 2026-02-06 04:35:13 +05:30
parent 0f61a249c0
commit 629f6f9cf5
3 changed files with 394 additions and 192 deletions

View file

@ -14,6 +14,34 @@ from app.db import Document
md = MarkdownifyTransformer()
def safe_set_chunks(document: Document, chunks: list) -> None:
    """
    Attach ``chunks`` to ``document`` without triggering a lazy load.

    Prefer this helper over a plain ``document.chunks = chunks`` assignment
    everywhere, so that SQLAlchemy async errors (MissingGreenlet /
    greenlet_spawn) are avoided.

    Background:
        - A plain ``document.chunks = chunks`` makes SQLAlchemy fetch the
          PREVIOUS chunks first (for comparison / orphan detection).
        - That implicit lazy load fails in an async context with the
          asyncpg driver.
        - ``set_committed_value`` sidesteps the load by writing the value
          directly onto the instance.

    The helper can be used no matter how the document was loaded
    (with or without ``selectinload``).

    NOTE(review): ``set_committed_value`` stores the value with no change
    history; confirm callers do not rely on the ORM detecting the replaced
    collection.

    Args:
        document: The Document object whose ``chunks`` attribute is set.
        chunks: List of Chunk objects to assign.

    Example:
        # Rather than: document.chunks = chunks (DANGEROUS!)
        safe_set_chunks(document, chunks)  # Always safe
    """
    # Imported at call time, mirroring the original function-local import.
    from sqlalchemy.orm import attributes as orm_attributes

    orm_attributes.set_committed_value(document, "chunks", chunks)
def get_current_timestamp() -> datetime:
"""
Get the current timestamp with timezone for updated_at field.

View file

@ -3,6 +3,11 @@ Circleback meeting document processor.
This module processes meeting data received from Circleback webhooks
and stores it as searchable documents in the database.
Implements real-time document status updates for UI feedback:
- Create document with 'pending' status (visible in UI immediately)
- Set to 'processing' while processing content
- Set to 'ready' or 'failed' when complete
"""
import logging
@ -14,6 +19,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.db import (
Document,
DocumentStatus,
DocumentType,
SearchSourceConnector,
SearchSourceConnectorType,
@ -30,6 +36,7 @@ from app.utils.document_converters import (
from .base import (
check_document_by_unique_identifier,
get_current_timestamp,
safe_set_chunks,
)
logger = logging.getLogger(__name__)
@ -47,6 +54,11 @@ async def add_circleback_meeting_document(
"""
Process and store a Circleback meeting document.
Implements real-time document status updates:
- Phase 1: Create document with 'pending' status (visible in UI immediately)
- Phase 2: Set to 'processing' while processing content
- Phase 3: Set to 'ready' or 'failed' when complete
Args:
session: Database session
meeting_id: Circleback meeting ID
@ -59,6 +71,7 @@ async def add_circleback_meeting_document(
Returns:
Document object if successful, None if failed or duplicate
"""
document = None
try:
# Generate unique identifier hash using Circleback meeting ID
unique_identifier = f"circleback_{meeting_id}"
@ -77,6 +90,10 @@ async def add_circleback_meeting_document(
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
# Ensure status is ready (might have been stuck in processing/pending)
if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY):
existing_document.status = DocumentStatus.ready()
await session.commit()
logger.info(f"Circleback meeting {meeting_id} unchanged. Skipping.")
return existing_document
else:
@ -84,7 +101,79 @@ async def add_circleback_meeting_document(
logger.info(
f"Content changed for Circleback meeting {meeting_id}. Updating document."
)
document = existing_document
# Set to PROCESSING status and commit - shows "processing" in UI
document.status = DocumentStatus.processing()
await session.commit()
else:
# =======================================================================
# PHASE 1: Create document with PENDING status
# This makes the document visible in the UI immediately
# =======================================================================
# Fetch the user who set up the Circleback connector (preferred)
# or fall back to search space owner if no connector found
created_by_user_id = None
# Try to find the Circleback connector for this search space
connector_result = await session.execute(
select(SearchSourceConnector.user_id).where(
SearchSourceConnector.search_space_id == search_space_id,
SearchSourceConnector.connector_type
== SearchSourceConnectorType.CIRCLEBACK_CONNECTOR,
)
)
connector_user = connector_result.scalar_one_or_none()
if connector_user:
# Use the user who set up the Circleback connector
created_by_user_id = connector_user
else:
# Fallback: use search space owner if no connector found
search_space_result = await session.execute(
select(SearchSpace.user_id).where(SearchSpace.id == search_space_id)
)
created_by_user_id = search_space_result.scalar_one_or_none()
# Create new document with PENDING status (visible in UI immediately)
document = Document(
search_space_id=search_space_id,
title=meeting_name,
document_type=DocumentType.CIRCLEBACK,
document_metadata={
"CIRCLEBACK_MEETING_ID": meeting_id,
"MEETING_NAME": meeting_name,
"SOURCE": "CIRCLEBACK_WEBHOOK",
"connector_id": connector_id,
},
content="Pending...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=None,
chunks=[], # Empty at creation - safe for async
status=DocumentStatus.pending(), # Pending until processing starts
content_needs_reindexing=False,
updated_at=get_current_timestamp(),
created_by_id=created_by_user_id,
connector_id=connector_id,
)
session.add(document)
# Commit immediately so document appears in UI with pending status
await session.commit()
logger.info(
f"Created pending Circleback meeting document {meeting_id} in search space {search_space_id}"
)
# =======================================================================
# PHASE 2: Set to PROCESSING status
# =======================================================================
document.status = DocumentStatus.processing()
await session.commit()
# =======================================================================
# PHASE 3: Process the document content
# =======================================================================
# Get LLM for generating summary
llm = await get_document_summary_llm(session, search_space_id)
if not llm:
@ -100,7 +189,7 @@ async def add_circleback_meeting_document(
summary_embedding = None
else:
# Generate summary with metadata
document_metadata = {
summary_metadata = {
"meeting_name": meeting_name,
"meeting_id": meeting_id,
"document_type": "Circleback Meeting",
@ -111,7 +200,7 @@ async def add_circleback_meeting_document(
},
}
summary_content, summary_embedding = await generate_document_summary(
markdown_content, llm, document_metadata
markdown_content, llm, summary_metadata
)
# Process chunks
@ -126,7 +215,7 @@ async def add_circleback_meeting_document(
f"Failed to convert Circleback meeting {meeting_id} to BlockNote JSON, document will not be editable"
)
# Prepare document metadata
# Prepare final document metadata
document_metadata = {
"CIRCLEBACK_MEETING_ID": meeting_id,
"MEETING_NAME": meeting_name,
@ -134,77 +223,34 @@ async def add_circleback_meeting_document(
**metadata,
}
# Fetch the user who set up the Circleback connector (preferred)
# or fall back to search space owner if no connector found
created_by_user_id = None
# =======================================================================
# PHASE 4: Update document to READY status with actual content
# =======================================================================
document.title = meeting_name
document.content = summary_content
document.content_hash = content_hash
if summary_embedding is not None:
document.embedding = summary_embedding
document.document_metadata = document_metadata
safe_set_chunks(document, chunks)
document.blocknote_document = blocknote_json
document.content_needs_reindexing = False
document.updated_at = get_current_timestamp()
document.status = DocumentStatus.ready()
# Ensure connector_id is set (backfill for documents created before this field)
if connector_id is not None:
document.connector_id = connector_id
# Try to find the Circleback connector for this search space
connector_result = await session.execute(
select(SearchSourceConnector.user_id).where(
SearchSourceConnector.search_space_id == search_space_id,
SearchSourceConnector.connector_type
== SearchSourceConnectorType.CIRCLEBACK_CONNECTOR,
)
)
connector_user = connector_result.scalar_one_or_none()
if connector_user:
# Use the user who set up the Circleback connector
created_by_user_id = connector_user
else:
# Fallback: use search space owner if no connector found
search_space_result = await session.execute(
select(SearchSpace.user_id).where(SearchSpace.id == search_space_id)
)
created_by_user_id = search_space_result.scalar_one_or_none()
# Update or create document
await session.commit()
await session.refresh(document)
if existing_document:
# Update existing document
existing_document.title = meeting_name
existing_document.content = summary_content
existing_document.content_hash = content_hash
if summary_embedding is not None:
existing_document.embedding = summary_embedding
existing_document.document_metadata = document_metadata
existing_document.chunks = chunks
existing_document.blocknote_document = blocknote_json
existing_document.content_needs_reindexing = False
existing_document.updated_at = get_current_timestamp()
# Ensure connector_id is set (backfill for documents created before this field)
if connector_id is not None:
existing_document.connector_id = connector_id
await session.commit()
await session.refresh(existing_document)
document = existing_document
logger.info(
f"Updated Circleback meeting document {meeting_id} in search space {search_space_id}"
)
else:
# Create new document
document = Document(
search_space_id=search_space_id,
title=meeting_name,
document_type=DocumentType.CIRCLEBACK,
document_metadata=document_metadata,
content=summary_content,
embedding=summary_embedding,
chunks=chunks,
content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash,
blocknote_document=blocknote_json,
content_needs_reindexing=False,
updated_at=get_current_timestamp(),
created_by_id=created_by_user_id,
connector_id=connector_id,
)
session.add(document)
await session.commit()
await session.refresh(document)
logger.info(
f"Created new Circleback meeting document {meeting_id} in search space {search_space_id}"
f"Processed Circleback meeting document {meeting_id} in search space {search_space_id} - now ready"
)
return document
@ -214,8 +260,24 @@ async def add_circleback_meeting_document(
logger.error(
f"Database error processing Circleback meeting {meeting_id}: {db_error}"
)
# Mark document as failed if it was created
if document is not None:
try:
document.status = DocumentStatus.failed(str(db_error))
document.updated_at = get_current_timestamp()
await session.commit()
except Exception as status_error:
logger.error(f"Failed to update document status to failed: {status_error}")
raise db_error
except Exception as e:
await session.rollback()
logger.error(f"Failed to process Circleback meeting {meeting_id}: {e!s}")
# Mark document as failed if it was created
if document is not None:
try:
document.status = DocumentStatus.failed(str(e))
document.updated_at = get_current_timestamp()
await session.commit()
except Exception as status_error:
logger.error(f"Failed to update document status to failed: {status_error}")
raise RuntimeError(f"Failed to process Circleback meeting: {e!s}") from e