feat: migrate Linear and Notion indexers to unified parallel pipeline
- Refactored the Linear and Notion indexers to use the shared IndexingPipelineService for document deduplication, summarization, chunking, and embedding with bounded parallel indexing.
- Updated the `_build_connector_doc` function in both indexers to create ConnectorDocument instances with enhanced metadata and fallback summaries.
- Modified `index_linear_issues` and `index_notion_pages` to return a tuple of (indexed_count, skipped_count, warning_or_error_message) for better error handling and reporting.
- Added unit tests for both indexers to validate the new parallel processing logic and ensure correct document creation and indexing behavior.
Parent: da6bbcfe39
Commit: db6dd058dd
4 changed files with 944 additions and 615 deletions
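The most visible caller-facing change in this commit is the widened return value: both `index_linear_issues` and `index_notion_pages` now return `(indexed_count, skipped_count, warning_or_error_message)` instead of a two-element tuple. A minimal sketch of how a calling task might consume the new shape follows; the `notify` helper and the surrounding task wiring are assumptions for illustration, not part of this diff.

# Hypothetical caller -- only the (indexed, skipped, warning) tuple shape comes
# from this commit; notify() and where this coroutine lives are assumptions.
async def run_linear_sync(session, connector_id, search_space_id, user_id, notify):
    indexed, skipped, warning = await index_linear_issues(
        session,
        connector_id=connector_id,
        search_space_id=search_space_id,
        user_id=user_id,
        start_date="2024-01-01",
        end_date="2024-12-31",
    )
    if indexed == 0 and warning:
        # Nothing was indexed and a message came back: surface it as a failure.
        await notify(f"Linear indexing failed: {warning}")
    else:
        # Duplicates and per-document failures arrive as a warning string,
        # while the indexed/skipped counts stay meaningful.
        suffix = f" ({warning})" if warning else ""
        await notify(f"Linear indexing done: {indexed} indexed, {skipped} skipped{suffix}")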
@@ -1,48 +1,84 @@
"""
Linear connector indexer.

Implements 2-phase document status updates for real-time UI feedback:
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
- Phase 2: Process each document: pending → processing → ready/failed
Uses the shared IndexingPipelineService for document deduplication,
summarization, chunking, and embedding with bounded parallel indexing.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession

from app.connectors.linear_connector import LinearConnector
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.db import DocumentType, SearchSourceConnectorType
from app.indexing_pipeline.connector_document import ConnectorDocument
from app.indexing_pipeline.document_hashing import compute_content_hash
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
    create_document_chunks,
    embed_text,
    generate_content_hash,
    generate_document_summary,
    generate_unique_identifier_hash,
)

from .base import (
    calculate_date_range,
    check_document_by_unique_identifier,
    check_duplicate_document_by_hash,
    get_connector_by_id,
    get_current_timestamp,
    logger,
    safe_set_chunks,
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds - update notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30

def _build_connector_doc(
    issue: dict,
    formatted_issue: dict,
    issue_content: str,
    *,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    enable_summary: bool,
) -> ConnectorDocument:
    """Map a raw Linear issue dict to a ConnectorDocument."""
    issue_id = issue.get("id", "")
    issue_identifier = issue.get("identifier", "")
    issue_title = issue.get("title", "")
    state = formatted_issue.get("state", "Unknown")
    priority = formatted_issue.get("priority", "Unknown")
    comment_count = len(formatted_issue.get("comments", []))

    metadata = {
        "issue_id": issue_id,
        "issue_identifier": issue_identifier,
        "issue_title": issue_title,
        "state": state,
        "priority": priority,
        "comment_count": comment_count,
        "connector_id": connector_id,
        "document_type": "Linear Issue",
        "connector_type": "Linear",
    }

    fallback_summary = (
        f"Linear Issue {issue_identifier}: {issue_title}\n\n"
        f"Status: {state}\n\n{issue_content}"
    )

    return ConnectorDocument(
        title=f"{issue_identifier}: {issue_title}",
        source_markdown=issue_content,
        unique_id=issue_id,
        document_type=DocumentType.LINEAR_CONNECTOR,
        search_space_id=search_space_id,
        connector_id=connector_id,
        created_by_id=user_id,
        should_summarize=enable_summary,
        fallback_summary=fallback_summary,
        metadata=metadata,
    )


async def index_linear_issues(
    session: AsyncSession,
    connector_id: int,
@@ -52,26 +88,15 @@ async def index_linear_issues(
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
) -> tuple[int, int, str | None]:
    """
    Index Linear issues and comments.

    Args:
        session: Database session
        connector_id: ID of the Linear connector
        search_space_id: ID of the search space to store documents in
        user_id: ID of the user
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
        Tuple of (indexed_count, skipped_count, warning_or_error_message)
    """
task_logger = TaskLoggingService(session, search_space_id)
|
||||
|
||||
# Log task start
|
||||
log_entry = await task_logger.log_task_start(
|
||||
task_name="linear_issues_indexing",
|
||||
source="connector_indexing_task",
|
||||
|
|
@ -85,7 +110,7 @@ async def index_linear_issues(
|
|||
)
|
||||
|
||||
try:
|
||||
# Get the connector
|
||||
# ── Connector lookup ──────────────────────────────────────────
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Retrieving Linear connector {connector_id} from database",
|
||||
|
|
@ -104,11 +129,11 @@ async def index_linear_issues(
|
|||
{"error_type": "ConnectorNotFound"},
|
||||
)
|
||||
return (
|
||||
0,
|
||||
0,
|
||||
f"Connector with ID {connector_id} not found or is not a Linear connector",
|
||||
)
|
||||
|
||||
# Check if access_token exists (support both new OAuth format and old API key format)
|
||||
if not connector.config.get("access_token") and not connector.config.get(
|
||||
"LINEAR_API_KEY"
|
||||
):
|
||||
|
|
@ -118,26 +143,22 @@ async def index_linear_issues(
|
|||
"Missing Linear access token",
|
||||
{"error_type": "MissingToken"},
|
||||
)
|
||||
return 0, "Linear access token not found in connector config"
|
||||
return 0, 0, "Linear access token not found in connector config"
|
||||
|
||||
# Initialize Linear client with internal refresh capability
|
||||
# ── Client init ───────────────────────────────────────────────
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Initializing Linear client for connector {connector_id}",
|
||||
{"stage": "client_initialization"},
|
||||
)
|
||||
|
||||
# Create connector with session and connector_id for internal refresh
|
||||
# Token refresh will happen automatically when needed
|
||||
linear_client = LinearConnector(session=session, connector_id=connector_id)
|
||||
|
||||
# Handle 'undefined' string from frontend (treat as None)
|
||||
if start_date == "undefined" or start_date == "":
|
||||
start_date = None
|
||||
if end_date == "undefined" or end_date == "":
|
||||
end_date = None
|
||||
|
||||
# Calculate date range
|
||||
start_date_str, end_date_str = calculate_date_range(
|
||||
connector, start_date, end_date, default_days_back=365
|
||||
)
|
||||
|
|
@ -154,37 +175,34 @@ async def index_linear_issues(
|
|||
},
|
||||
)
|
||||
|
||||
# Get issues within date range
|
||||
# ── Fetch issues ──────────────────────────────────────────────
|
||||
try:
|
||||
issues, error = await linear_client.get_issues_by_date_range(
|
||||
start_date=start_date_str, end_date=end_date_str, include_comments=True
|
||||
start_date=start_date_str,
|
||||
end_date=end_date_str,
|
||||
include_comments=True,
|
||||
)
|
||||
|
||||
if error:
|
||||
# Don't treat "No issues found" as an error that should stop indexing
|
||||
if "No issues found" in error:
|
||||
logger.info(f"No Linear issues found: {error}")
|
||||
logger.info(
|
||||
"No issues found is not a critical error, continuing with update"
|
||||
)
|
||||
if update_last_indexed:
|
||||
await update_connector_last_indexed(
|
||||
session, connector, update_last_indexed
|
||||
)
|
||||
await session.commit()
|
||||
logger.info(
|
||||
f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found"
|
||||
)
|
||||
return 0, None
|
||||
return 0, 0, None
|
||||
else:
|
||||
logger.error(f"Failed to get Linear issues: {error}")
|
||||
return 0, f"Failed to get Linear issues: {error}"
|
||||
return 0, 0, f"Failed to get Linear issues: {error}"
|
||||
|
||||
logger.info(f"Retrieved {len(issues)} issues from Linear API")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Exception when calling Linear API: {e!s}", exc_info=True)
|
||||
return 0, f"Failed to get Linear issues: {e!s}"
|
||||
logger.error(
|
||||
f"Exception when calling Linear API: {e!s}", exc_info=True
|
||||
)
|
||||
return 0, 0, f"Failed to get Linear issues: {e!s}"
|
||||
|
||||
if not issues:
|
||||
logger.info("No Linear issues found for the specified date range")
|
||||
|
|
@ -193,19 +211,12 @@ async def index_linear_issues(
|
|||
session, connector, update_last_indexed
|
||||
)
|
||||
await session.commit()
|
||||
logger.info(
|
||||
f"Updated last_indexed_at to {connector.last_indexed_at} despite no issues found"
|
||||
)
|
||||
return 0, None # Return None instead of error message when no issues found
|
||||
return 0, 0, None
|
||||
|
||||
# Track the number of documents indexed
|
||||
documents_indexed = 0
|
||||
# ── Build ConnectorDocuments ──────────────────────────────────
|
||||
connector_docs: list[ConnectorDocument] = []
|
||||
documents_skipped = 0
|
||||
documents_failed = 0 # Track issues that failed processing
|
||||
skipped_issues = []
|
||||
|
||||
# Heartbeat tracking - update notification periodically to prevent appearing stuck
|
||||
last_heartbeat_time = time.time()
|
||||
duplicate_content_count = 0
|
||||
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
|
|
@ -213,13 +224,6 @@ async def index_linear_issues(
|
|||
{"stage": "process_issues", "total_issues": len(issues)},
|
||||
)
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 1: Analyze all issues, create pending documents
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
# =======================================================================
|
||||
issues_to_process = [] # List of dicts with document and issue data
|
||||
new_documents_created = False
|
||||
|
||||
for issue in issues:
|
||||
try:
|
||||
issue_id = issue.get("id", "")
|
||||
|
|
@ -230,271 +234,102 @@ async def index_linear_issues(
|
|||
logger.warning(
|
||||
f"Skipping issue with missing ID or title: {issue_id or 'Unknown'}"
|
||||
)
|
||||
skipped_issues.append(
|
||||
f"{issue_identifier or 'Unknown'} (missing data)"
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Format the issue first to get well-structured data
|
||||
formatted_issue = linear_client.format_issue(issue)
|
||||
|
||||
# Convert issue to markdown format
|
||||
issue_content = linear_client.format_issue_to_markdown(formatted_issue)
|
||||
issue_content = linear_client.format_issue_to_markdown(
|
||||
formatted_issue
|
||||
)
|
||||
|
||||
if not issue_content:
|
||||
logger.warning(
|
||||
f"Skipping issue with no content: {issue_identifier} - {issue_title}"
|
||||
)
|
||||
skipped_issues.append(f"{issue_identifier} (no content)")
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Generate unique identifier hash for this Linear issue
|
||||
unique_identifier_hash = generate_unique_identifier_hash(
|
||||
DocumentType.LINEAR_CONNECTOR, issue_id, search_space_id
|
||||
)
|
||||
|
||||
# Generate content hash
|
||||
content_hash = generate_content_hash(issue_content, search_space_id)
|
||||
|
||||
# Check if document with this unique identifier already exists
|
||||
existing_document = await check_document_by_unique_identifier(
|
||||
session, unique_identifier_hash
|
||||
)
|
||||
|
||||
state = formatted_issue.get("state", "Unknown")
|
||||
description = formatted_issue.get("description", "")
|
||||
comment_count = len(formatted_issue.get("comments", []))
|
||||
priority = formatted_issue.get("priority", "Unknown")
|
||||
|
||||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
# Ensure status is ready (might have been stuck in processing/pending)
|
||||
if not DocumentStatus.is_state(
|
||||
existing_document.status, DocumentStatus.READY
|
||||
):
|
||||
existing_document.status = DocumentStatus.ready()
|
||||
logger.info(
|
||||
f"Document for Linear issue {issue_identifier} unchanged. Skipping."
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Queue existing document for update (will be set to processing in Phase 2)
|
||||
issues_to_process.append(
|
||||
{
|
||||
"document": existing_document,
|
||||
"is_new": False,
|
||||
"issue_content": issue_content,
|
||||
"content_hash": content_hash,
|
||||
"issue_id": issue_id,
|
||||
"issue_identifier": issue_identifier,
|
||||
"issue_title": issue_title,
|
||||
"state": state,
|
||||
"description": description,
|
||||
"comment_count": comment_count,
|
||||
"priority": priority,
|
||||
}
|
||||
)
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
logger.info(
|
||||
f"Linear issue {issue_identifier} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=f"{issue_identifier}: {issue_title}",
|
||||
document_type=DocumentType.LINEAR_CONNECTOR,
|
||||
document_metadata={
|
||||
"issue_id": issue_id,
|
||||
"issue_identifier": issue_identifier,
|
||||
"issue_title": issue_title,
|
||||
"state": state,
|
||||
"comment_count": comment_count,
|
||||
"connector_id": connector_id,
|
||||
},
|
||||
content="Pending...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation - safe for async
|
||||
status=DocumentStatus.pending(), # Pending until processing starts
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
doc = _build_connector_doc(
|
||||
issue,
|
||||
formatted_issue,
|
||||
issue_content,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
session.add(document)
|
||||
new_documents_created = True
|
||||
|
||||
issues_to_process.append(
|
||||
{
|
||||
"document": document,
|
||||
"is_new": True,
|
||||
"issue_content": issue_content,
|
||||
"content_hash": content_hash,
|
||||
"issue_id": issue_id,
|
||||
"issue_identifier": issue_identifier,
|
||||
"issue_title": issue_title,
|
||||
"state": state,
|
||||
"description": description,
|
||||
"comment_count": comment_count,
|
||||
"priority": priority,
|
||||
}
|
||||
search_space_id=search_space_id,
|
||||
user_id=user_id,
|
||||
enable_summary=connector.enable_summary,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Phase 1 for issue: {e!s}", exc_info=True)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info(
|
||||
f"Phase 1: Committing {len([i for i in issues_to_process if i['is_new']])} pending documents"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each document one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(issues_to_process)} documents")
|
||||
|
||||
for item in issues_to_process:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
document = item["document"]
|
||||
try:
|
||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
# Heavy processing (LLM, embeddings, chunks)
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
)
|
||||
|
||||
if user_llm and connector.enable_summary:
|
||||
document_metadata_for_summary = {
|
||||
"issue_id": item["issue_identifier"],
|
||||
"issue_title": item["issue_title"],
|
||||
"state": item["state"],
|
||||
"priority": item["priority"],
|
||||
"comment_count": item["comment_count"],
|
||||
"document_type": "Linear Issue",
|
||||
"connector_type": "Linear",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
item["issue_content"], user_llm, document_metadata_for_summary
|
||||
with session.no_autoflush:
|
||||
duplicate = await check_duplicate_document_by_hash(
|
||||
session, compute_content_hash(doc)
|
||||
)
|
||||
else:
|
||||
summary_content = f"Linear Issue {item['issue_identifier']}: {item['issue_title']}\n\nStatus: {item['state']}\n\n{item['issue_content']}"
|
||||
summary_embedding = embed_text(summary_content)
|
||||
|
||||
chunks = await create_document_chunks(item["issue_content"])
|
||||
|
||||
# Update document to READY with actual content
|
||||
document.title = f"{item['issue_identifier']}: {item['issue_title']}"
|
||||
document.content = summary_content
|
||||
document.content_hash = item["content_hash"]
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = {
|
||||
"issue_id": item["issue_id"],
|
||||
"issue_identifier": item["issue_identifier"],
|
||||
"issue_title": item["issue_title"],
|
||||
"state": item["state"],
|
||||
"comment_count": item["comment_count"],
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
documents_indexed += 1
|
||||
|
||||
# Batch commit every 10 documents (for ready status updates)
|
||||
if documents_indexed % 10 == 0:
|
||||
if duplicate:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed} Linear issues processed so far"
|
||||
f"Linear issue {doc.title} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate.id}, "
|
||||
f"type: {duplicate.document_type}). Skipping."
|
||||
)
|
||||
await session.commit()
|
||||
duplicate_content_count += 1
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
connector_docs.append(doc)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error processing issue {item.get('issue_identifier', 'Unknown')}: {e!s}",
|
||||
f"Error building ConnectorDocument for issue: {e!s}",
|
||||
exc_info=True,
|
||||
)
|
||||
# Mark document as failed with reason (visible in UI)
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(e))
|
||||
document.updated_at = get_current_timestamp()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
skipped_issues.append(
|
||||
f"{item.get('issue_identifier', 'Unknown')} (processing error)"
|
||||
)
|
||||
documents_failed += 1
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
        # CRITICAL: Always update timestamp (even if 0 documents indexed) so Zero syncs
        # ── Pipeline: migrate legacy docs + parallel index ────────────
        pipeline = IndexingPipelineService(session)

        await pipeline.migrate_legacy_docs(connector_docs)

        async def _get_llm(s):
            return await get_user_long_context_llm(s, user_id, search_space_id)

        _, documents_indexed, documents_failed = await pipeline.index_batch_parallel(
            connector_docs,
            _get_llm,
            max_concurrency=3,
            on_heartbeat=on_heartbeat_callback,
            heartbeat_interval=HEARTBEAT_INTERVAL_SECONDS,
        )

        # ── Finalize ──────────────────────────────────────────────────
        await update_connector_last_indexed(session, connector, update_last_indexed)

# Final commit for any remaining documents not yet committed in batches
|
||||
logger.info(f"Final commit: Total {documents_indexed} Linear issues processed")
|
||||
logger.info(
|
||||
f"Final commit: Total {documents_indexed} Linear issues processed"
|
||||
)
|
||||
try:
|
||||
await session.commit()
|
||||
logger.info(
|
||||
"Successfully committed all Linear document changes to database"
|
||||
)
|
||||
except Exception as e:
|
||||
# Handle any remaining integrity errors gracefully (race conditions, etc.)
|
||||
if (
|
||||
"duplicate key value violates unique constraint" in str(e).lower()
|
||||
or "uniqueviolationerror" in str(e).lower()
|
||||
):
|
||||
logger.warning(
|
||||
f"Duplicate content_hash detected during final commit. "
|
||||
f"This may occur if the same issue was indexed by multiple connectors. "
|
||||
f"Rolling back and continuing. Error: {e!s}"
|
||||
)
|
||||
await session.rollback()
|
||||
else:
|
||||
raise
|
||||
|
||||
# Build warning message if there were issues
|
||||
warning_parts = []
|
||||
warning_parts: list[str] = []
|
||||
if duplicate_content_count > 0:
|
||||
warning_parts.append(f"{duplicate_content_count} duplicate")
|
||||
if documents_failed > 0:
|
||||
warning_parts.append(f"{documents_failed} failed")
|
||||
warning_message = ", ".join(warning_parts) if warning_parts else None
|
||||
|
||||
# Log success
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Successfully completed Linear indexing for connector {connector_id}",
|
||||
|
|
@ -503,7 +338,7 @@ async def index_linear_issues(
|
|||
"documents_indexed": documents_indexed,
|
||||
"documents_skipped": documents_skipped,
|
||||
"documents_failed": documents_failed,
|
||||
"skipped_issues_count": len(skipped_issues),
|
||||
"duplicate_content_count": duplicate_content_count,
|
||||
},
|
||||
)
|
||||
|
||||
|
|
@ -511,7 +346,7 @@ async def index_linear_issues(
|
|||
f"Linear indexing completed: {documents_indexed} ready, "
|
||||
f"{documents_skipped} skipped, {documents_failed} failed"
|
||||
)
|
||||
return documents_indexed, warning_message
|
||||
return documents_indexed, documents_skipped, warning_message
|
||||
|
||||
except SQLAlchemyError as db_error:
|
||||
await session.rollback()
|
||||
|
|
@ -522,7 +357,7 @@ async def index_linear_issues(
|
|||
{"error_type": "SQLAlchemyError"},
|
||||
)
|
||||
logger.error(f"Database error: {db_error!s}", exc_info=True)
|
||||
return 0, f"Database error: {db_error!s}"
|
||||
return 0, 0, f"Database error: {db_error!s}"
|
||||
except Exception as e:
|
||||
await session.rollback()
|
||||
await task_logger.log_task_failure(
|
||||
|
|
@ -532,4 +367,4 @@ async def index_linear_issues(
|
|||
{"error_type": type(e).__name__},
|
||||
)
|
||||
logger.error(f"Failed to index Linear issues: {e!s}", exc_info=True)
|
||||
return 0, f"Failed to index Linear issues: {e!s}"
|
||||
return 0, 0, f"Failed to index Linear issues: {e!s}"
|
||||
|
|
|
|||
|
|
@@ -1,12 +1,10 @@
"""
Notion connector indexer.

Implements real-time document status updates using a two-phase approach:
- Phase 1: Create all documents with PENDING status (visible in UI immediately)
- Phase 2: Process each document one by one (pending → processing → ready/failed)
Uses the shared IndexingPipelineService for document deduplication,
summarization, chunking, and embedding with bounded parallel indexing.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime

@@ -14,42 +12,64 @@ from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession

from app.connectors.notion_history import NotionHistoryConnector
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.db import DocumentType, SearchSourceConnectorType
from app.indexing_pipeline.connector_document import ConnectorDocument
from app.indexing_pipeline.document_hashing import compute_content_hash
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
    create_document_chunks,
    embed_text,
    generate_content_hash,
    generate_document_summary,
    generate_unique_identifier_hash,
)
from app.utils.notion_utils import process_blocks

from .base import (
    build_document_metadata_string,
    calculate_date_range,
    check_document_by_unique_identifier,
    check_duplicate_document_by_hash,
    get_connector_by_id,
    get_current_timestamp,
    logger,
    safe_set_chunks,
    update_connector_last_indexed,
)

# Type alias for retry callback
# Signature: async callback(retry_reason, attempt, max_attempts, wait_seconds) -> None
RetryCallbackType = Callable[[str, int, int, float], Awaitable[None]]

# Type alias for heartbeat callback
# Signature: async callback(indexed_count) -> None
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds - update notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30

def _build_connector_doc(
    page: dict,
    markdown_content: str,
    *,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    enable_summary: bool,
) -> ConnectorDocument:
    """Map a raw Notion page dict to a ConnectorDocument."""
    page_id = page.get("page_id", "")
    page_title = page.get("title", f"Untitled page ({page_id})")

    metadata = {
        "page_title": page_title,
        "page_id": page_id,
        "connector_id": connector_id,
        "document_type": "Notion Page",
        "connector_type": "Notion",
    }

    fallback_summary = f"Notion Page: {page_title}\n\n{markdown_content}"

    return ConnectorDocument(
        title=page_title,
        source_markdown=markdown_content,
        unique_id=page_id,
        document_type=DocumentType.NOTION_CONNECTOR,
        search_space_id=search_space_id,
        connector_id=connector_id,
        created_by_id=user_id,
        should_summarize=enable_summary,
        fallback_summary=fallback_summary,
        metadata=metadata,
    )


async def index_notion_pages(
    session: AsyncSession,
    connector_id: int,
@ -60,30 +80,15 @@ async def index_notion_pages(
|
|||
update_last_indexed: bool = True,
|
||||
on_retry_callback: RetryCallbackType | None = None,
|
||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||
) -> tuple[int, str | None]:
|
||||
) -> tuple[int, int, str | None]:
|
||||
"""
|
||||
Index Notion pages from all accessible pages.
|
||||
|
||||
Args:
|
||||
session: Database session
|
||||
connector_id: ID of the Notion connector
|
||||
search_space_id: ID of the search space to store documents in
|
||||
user_id: ID of the user
|
||||
start_date: Start date for indexing (YYYY-MM-DD format)
|
||||
end_date: End date for indexing (YYYY-MM-DD format)
|
||||
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
|
||||
on_retry_callback: Optional callback for retry progress notifications.
|
||||
Signature: async callback(retry_reason, attempt, max_attempts, wait_seconds)
|
||||
retry_reason is one of: 'rate_limit', 'server_error', 'timeout'
|
||||
on_heartbeat_callback: Optional callback to update notification during long-running indexing.
|
||||
Called periodically with (indexed_count) to prevent task appearing stuck.
|
||||
|
||||
Returns:
|
||||
Tuple containing (number of documents indexed, error message or None)
|
||||
Tuple of (indexed_count, skipped_count, warning_or_error_message)
|
||||
"""
|
||||
task_logger = TaskLoggingService(session, search_space_id)
|
||||
|
||||
# Log task start
|
||||
log_entry = await task_logger.log_task_start(
|
||||
task_name="notion_pages_indexing",
|
||||
source="connector_indexing_task",
|
||||
|
|
@ -97,7 +102,7 @@ async def index_notion_pages(
|
|||
)
|
||||
|
||||
try:
|
||||
# Get the connector
|
||||
# ── Connector lookup ──────────────────────────────────────────
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Retrieving Notion connector {connector_id} from database",
|
||||
|
|
@ -116,11 +121,11 @@ async def index_notion_pages(
|
|||
{"error_type": "ConnectorNotFound"},
|
||||
)
|
||||
return (
|
||||
0,
|
||||
0,
|
||||
f"Connector with ID {connector_id} not found or is not a Notion connector",
|
||||
)
|
||||
|
||||
# Check if access_token exists (support both new OAuth format and old integration token format)
|
||||
if not connector.config.get("access_token") and not connector.config.get(
|
||||
"NOTION_INTEGRATION_TOKEN"
|
||||
):
|
||||
|
|
@ -130,9 +135,9 @@ async def index_notion_pages(
|
|||
"Missing Notion access token",
|
||||
{"error_type": "MissingToken"},
|
||||
)
|
||||
return 0, "Notion access token not found in connector config"
|
||||
return 0, 0, "Notion access token not found in connector config"
|
||||
|
||||
# Initialize Notion client with internal refresh capability
|
||||
# ── Client init ───────────────────────────────────────────────
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Initializing Notion client for connector {connector_id}",
|
||||
|
|
@ -141,18 +146,15 @@ async def index_notion_pages(
|
|||
|
||||
logger.info(f"Initializing Notion client for connector {connector_id}")
|
||||
|
||||
# Handle 'undefined' string from frontend (treat as None)
|
||||
if start_date == "undefined" or start_date == "":
|
||||
start_date = None
|
||||
if end_date == "undefined" or end_date == "":
|
||||
end_date = None
|
||||
|
||||
# Calculate date range using the shared utility function
|
||||
start_date_str, end_date_str = calculate_date_range(
|
||||
connector, start_date, end_date, default_days_back=365
|
||||
)
|
||||
|
||||
# Convert YYYY-MM-DD to ISO format for Notion API
|
||||
start_date_iso = datetime.strptime(start_date_str, "%Y-%m-%d").strftime(
|
||||
"%Y-%m-%dT%H:%M:%SZ"
|
||||
)
|
||||
|
|
@ -160,13 +162,10 @@ async def index_notion_pages(
|
|||
"%Y-%m-%dT%H:%M:%SZ"
|
||||
)
|
||||
|
||||
# Create connector with session and connector_id for internal refresh
|
||||
# Token refresh will happen automatically when needed
|
||||
notion_client = NotionHistoryConnector(
|
||||
session=session, connector_id=connector_id
|
||||
)
|
||||
|
||||
# Set retry callback if provided (for user notifications during rate limits)
|
||||
if on_retry_callback:
|
||||
notion_client.set_retry_callback(on_retry_callback)
|
||||
|
||||
|
|
@ -182,21 +181,19 @@ async def index_notion_pages(
|
|||
},
|
||||
)
|
||||
|
||||
# Get all pages
|
||||
# ── Fetch pages ───────────────────────────────────────────────
|
||||
try:
|
||||
pages = await notion_client.get_all_pages(
|
||||
start_date=start_date_iso, end_date=end_date_iso
|
||||
)
|
||||
logger.info(f"Found {len(pages)} Notion pages")
|
||||
|
||||
# Get count of pages that had unsupported content skipped
|
||||
pages_with_skipped_content = notion_client.get_skipped_content_count()
|
||||
if pages_with_skipped_content > 0:
|
||||
logger.info(
|
||||
f"{pages_with_skipped_content} pages had Notion AI content skipped (not available via API)"
|
||||
)
|
||||
|
||||
# Check if using legacy integration token and log warning
|
||||
if notion_client.is_using_legacy_token():
|
||||
logger.warning(
|
||||
f"Connector {connector_id} is using legacy integration token. "
|
||||
|
|
@ -204,8 +201,6 @@ async def index_notion_pages(
|
|||
)
|
||||
except Exception as e:
|
||||
error_str = str(e)
|
||||
# Check if this is an unsupported block type error (transcription, ai_block, etc.)
|
||||
# These are known Notion API limitations and should be logged as warnings, not errors
|
||||
unsupported_block_errors = [
|
||||
"transcription is not supported",
|
||||
"ai_block is not supported",
|
||||
|
|
@ -216,7 +211,6 @@ async def index_notion_pages(
|
|||
)
|
||||
|
||||
if is_unsupported_block_error:
|
||||
# Log as warning since this is a known Notion API limitation
|
||||
logger.warning(
|
||||
f"Notion API limitation for connector {connector_id}: {error_str}. "
|
||||
"This is a known issue with Notion AI blocks (transcription, ai_block) "
|
||||
|
|
@ -229,7 +223,6 @@ async def index_notion_pages(
|
|||
{"error_type": "UnsupportedBlockType", "is_known_limitation": True},
|
||||
)
|
||||
else:
|
||||
# Log as error for other failures
|
||||
logger.error(
|
||||
f"Error fetching Notion pages for connector {connector_id}: {error_str}",
|
||||
exc_info=True,
|
||||
|
|
@ -242,7 +235,7 @@ async def index_notion_pages(
|
|||
)
|
||||
|
||||
await notion_client.close()
|
||||
return 0, f"Failed to get Notion pages: {e!s}"
|
||||
return 0, 0, f"Failed to get Notion pages: {e!s}"
|
||||
|
||||
if not pages:
|
||||
await task_logger.log_task_success(
|
||||
|
|
@ -252,21 +245,17 @@ async def index_notion_pages(
|
|||
{"pages_found": 0},
|
||||
)
|
||||
logger.info("No Notion pages found to index")
|
||||
# CRITICAL: Update timestamp even when no pages found so Zero syncs
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
await update_connector_last_indexed(
|
||||
session, connector, update_last_indexed
|
||||
)
|
||||
await session.commit()
|
||||
await notion_client.close()
|
||||
return 0, None # Success with 0 pages, not an error
|
||||
return 0, 0, None
|
||||
|
||||
# Track the number of documents indexed
|
||||
documents_indexed = 0
|
||||
# ── Build ConnectorDocuments ──────────────────────────────────
|
||||
connector_docs: list[ConnectorDocument] = []
|
||||
documents_skipped = 0
|
||||
documents_failed = 0
|
||||
duplicate_content_count = 0
|
||||
skipped_pages = []
|
||||
|
||||
# Heartbeat tracking - update notification periodically to prevent appearing stuck
|
||||
last_heartbeat_time = time.time()
|
||||
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
|
|
@ -274,13 +263,6 @@ async def index_notion_pages(
|
|||
{"stage": "process_pages", "total_pages": len(pages)},
|
||||
)
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 1: Analyze all pages, create pending documents
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
# =======================================================================
|
||||
pages_to_process = [] # List of dicts with document and page data
|
||||
new_documents_created = False
|
||||
|
||||
for page in pages:
|
||||
try:
|
||||
page_id = page.get("page_id")
|
||||
|
|
@ -293,225 +275,71 @@ async def index_notion_pages(
|
|||
|
||||
if not page_content:
|
||||
logger.info(f"No content found in page {page_title}. Skipping.")
|
||||
skipped_pages.append(f"{page_title} (no content)")
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Convert page content to markdown format
|
||||
markdown_content = f"# Notion Page: {page_title}\n\n"
|
||||
markdown_content += process_blocks(page_content)
|
||||
|
||||
# Format document metadata
|
||||
metadata_sections = [
|
||||
("METADATA", [f"PAGE_TITLE: {page_title}", f"PAGE_ID: {page_id}"]),
|
||||
(
|
||||
"CONTENT",
|
||||
[
|
||||
"FORMAT: markdown",
|
||||
"TEXT_START",
|
||||
markdown_content,
|
||||
"TEXT_END",
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
# Build the document string
|
||||
combined_document_string = build_document_metadata_string(
|
||||
metadata_sections
|
||||
)
|
||||
|
||||
# Generate unique identifier hash for this Notion page
|
||||
unique_identifier_hash = generate_unique_identifier_hash(
|
||||
DocumentType.NOTION_CONNECTOR, page_id, search_space_id
|
||||
)
|
||||
|
||||
# Generate content hash
|
||||
content_hash = generate_content_hash(
|
||||
combined_document_string, search_space_id
|
||||
)
|
||||
|
||||
# Check if document with this unique identifier already exists
|
||||
existing_document = await check_document_by_unique_identifier(
|
||||
session, unique_identifier_hash
|
||||
)
|
||||
|
||||
if existing_document:
|
||||
# Document exists - check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
# Ensure status is ready (might have been stuck in processing/pending)
|
||||
if not DocumentStatus.is_state(
|
||||
existing_document.status, DocumentStatus.READY
|
||||
):
|
||||
existing_document.status = DocumentStatus.ready()
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Queue existing document for update (will be set to processing in Phase 2)
|
||||
pages_to_process.append(
|
||||
{
|
||||
"document": existing_document,
|
||||
"is_new": False,
|
||||
"markdown_content": markdown_content,
|
||||
"content_hash": content_hash,
|
||||
"page_id": page_id,
|
||||
"page_title": page_title,
|
||||
}
|
||||
if not markdown_content.strip():
|
||||
logger.warning(
|
||||
f"Skipping page with empty markdown: {page_title}"
|
||||
)
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Document doesn't exist by unique_identifier_hash
|
||||
# Check if a document with the same content_hash exists (from another connector)
|
||||
with session.no_autoflush:
|
||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||
session, content_hash
|
||||
)
|
||||
doc = _build_connector_doc(
|
||||
page,
|
||||
markdown_content,
|
||||
connector_id=connector_id,
|
||||
search_space_id=search_space_id,
|
||||
user_id=user_id,
|
||||
enable_summary=connector.enable_summary,
|
||||
)
|
||||
|
||||
if duplicate_by_content:
|
||||
with session.no_autoflush:
|
||||
duplicate = await check_duplicate_document_by_hash(
|
||||
session, compute_content_hash(doc)
|
||||
)
|
||||
if duplicate:
|
||||
logger.info(
|
||||
f"Notion page {page_title} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate_by_content.id}, "
|
||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
||||
f"Notion page {doc.title} already indexed by another connector "
|
||||
f"(existing document ID: {duplicate.id}, "
|
||||
f"type: {duplicate.document_type}). Skipping."
|
||||
)
|
||||
duplicate_content_count += 1
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# Create new document with PENDING status (visible in UI immediately)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=page_title,
|
||||
document_type=DocumentType.NOTION_CONNECTOR,
|
||||
document_metadata={
|
||||
"page_title": page_title,
|
||||
"page_id": page_id,
|
||||
"connector_id": connector_id,
|
||||
},
|
||||
content="Pending...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=None,
|
||||
chunks=[], # Empty at creation - safe for async
|
||||
status=DocumentStatus.pending(), # Pending until processing starts
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=user_id,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
session.add(document)
|
||||
new_documents_created = True
|
||||
|
||||
pages_to_process.append(
|
||||
{
|
||||
"document": document,
|
||||
"is_new": True,
|
||||
"markdown_content": markdown_content,
|
||||
"content_hash": content_hash,
|
||||
"page_id": page_id,
|
||||
"page_title": page_title,
|
||||
}
|
||||
)
|
||||
connector_docs.append(doc)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Phase 1 for page: {e!s}", exc_info=True)
|
||||
documents_failed += 1
|
||||
continue
|
||||
|
||||
# Commit all pending documents - they all appear in UI now
|
||||
if new_documents_created:
|
||||
logger.info(
|
||||
f"Phase 1: Committing {len([p for p in pages_to_process if p['is_new']])} pending documents"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
# =======================================================================
|
||||
# PHASE 2: Process each document one by one
|
||||
# Each document transitions: pending → processing → ready/failed
|
||||
# =======================================================================
|
||||
logger.info(f"Phase 2: Processing {len(pages_to_process)} documents")
|
||||
|
||||
for item in pages_to_process:
|
||||
# Send heartbeat periodically
|
||||
if on_heartbeat_callback:
|
||||
current_time = time.time()
|
||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||
await on_heartbeat_callback(documents_indexed)
|
||||
last_heartbeat_time = current_time
|
||||
|
||||
document = item["document"]
|
||||
try:
|
||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
||||
document.status = DocumentStatus.processing()
|
||||
await session.commit()
|
||||
|
||||
# Heavy processing (LLM, embeddings, chunks)
|
||||
user_llm = await get_user_long_context_llm(
|
||||
session, user_id, search_space_id
|
||||
logger.error(
|
||||
f"Error building ConnectorDocument for page: {e!s}",
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
if user_llm and connector.enable_summary:
|
||||
document_metadata_for_summary = {
|
||||
"page_title": item["page_title"],
|
||||
"page_id": item["page_id"],
|
||||
"document_type": "Notion Page",
|
||||
"connector_type": "Notion",
|
||||
}
|
||||
(
|
||||
summary_content,
|
||||
summary_embedding,
|
||||
) = await generate_document_summary(
|
||||
item["markdown_content"],
|
||||
user_llm,
|
||||
document_metadata_for_summary,
|
||||
)
|
||||
else:
|
||||
summary_content = f"Notion Page: {item['page_title']}\n\n{item['markdown_content']}"
|
||||
summary_embedding = embed_text(summary_content)
|
||||
|
||||
chunks = await create_document_chunks(item["markdown_content"])
|
||||
|
||||
# Update document to READY with actual content
|
||||
document.title = item["page_title"]
|
||||
document.content = summary_content
|
||||
document.content_hash = item["content_hash"]
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = {
|
||||
"page_title": item["page_title"],
|
||||
"page_id": item["page_id"],
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
documents_indexed += 1
|
||||
|
||||
# Batch commit every 10 documents (for ready status updates)
|
||||
if documents_indexed % 10 == 0:
|
||||
logger.info(
|
||||
f"Committing batch: {documents_indexed} Notion pages processed so far"
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing Notion page: {e!s}", exc_info=True)
|
||||
# Mark document as failed with reason (visible in UI)
|
||||
try:
|
||||
document.status = DocumentStatus.failed(str(e))
|
||||
document.updated_at = get_current_timestamp()
|
||||
except Exception as status_error:
|
||||
logger.error(
|
||||
f"Failed to update document status to failed: {status_error}"
|
||||
)
|
||||
skipped_pages.append(f"{item['page_title']} (processing error)")
|
||||
documents_failed += 1
|
||||
documents_skipped += 1
|
||||
continue
|
||||
|
||||
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Zero syncs
|
||||
# ── Pipeline: migrate legacy docs + parallel index ────────────
|
||||
pipeline = IndexingPipelineService(session)
|
||||
|
||||
await pipeline.migrate_legacy_docs(connector_docs)
|
||||
|
||||
async def _get_llm(s):
|
||||
return await get_user_long_context_llm(s, user_id, search_space_id)
|
||||
|
||||
_, documents_indexed, documents_failed = await pipeline.index_batch_parallel(
|
||||
connector_docs,
|
||||
_get_llm,
|
||||
max_concurrency=3,
|
||||
on_heartbeat=on_heartbeat_callback,
|
||||
heartbeat_interval=HEARTBEAT_INTERVAL_SECONDS,
|
||||
)
|
||||
|
||||
# ── Finalize ──────────────────────────────────────────────────
|
||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||
|
||||
total_processed = documents_indexed
|
||||
|
||||
# Final commit to ensure all documents are persisted (safety net)
|
||||
logger.info(f"Final commit: Total {documents_indexed} documents processed")
|
||||
try:
|
||||
await session.commit()
|
||||
|
|
@ -519,59 +347,53 @@ async def index_notion_pages(
|
|||
"Successfully committed all Notion document changes to database"
|
||||
)
|
||||
except Exception as e:
|
||||
# Handle any remaining integrity errors gracefully (race conditions, etc.)
|
||||
if (
|
||||
"duplicate key value violates unique constraint" in str(e).lower()
|
||||
or "uniqueviolationerror" in str(e).lower()
|
||||
):
|
||||
logger.warning(
|
||||
f"Duplicate content_hash detected during final commit. "
|
||||
f"This may occur if the same page was indexed by multiple connectors. "
|
||||
f"Rolling back and continuing. Error: {e!s}"
|
||||
)
|
||||
await session.rollback()
|
||||
# Don't fail the entire task - some documents may have been successfully indexed
|
||||
else:
|
||||
raise
|
||||
|
||||
# Get final count of pages with skipped Notion AI content
|
||||
# ── Build warning / notification message ──────────────────────
|
||||
pages_with_skipped_ai_content = notion_client.get_skipped_content_count()
|
||||
|
||||
# Build warning message if there were issues
|
||||
warning_parts = []
|
||||
warning_parts: list[str] = []
|
||||
if duplicate_content_count > 0:
|
||||
warning_parts.append(f"{duplicate_content_count} duplicate")
|
||||
if documents_failed > 0:
|
||||
warning_parts.append(f"{documents_failed} failed")
|
||||
warning_message = ", ".join(warning_parts) if warning_parts else None
|
||||
|
||||
# Prepare result message with user-friendly notification about skipped content
|
||||
result_message = None
|
||||
if skipped_pages:
|
||||
result_message = f"Processed {total_processed} pages. Skipped {len(skipped_pages)} pages: {', '.join(skipped_pages)}"
|
||||
else:
|
||||
result_message = f"Processed {total_processed} pages."
|
||||
|
||||
# Add user-friendly message about skipped Notion AI content
|
||||
notification_parts: list[str] = []
|
||||
if pages_with_skipped_ai_content > 0:
|
||||
result_message += (
|
||||
" Audio transcriptions and AI summaries from Notion aren't accessible "
|
||||
"via their API - all other content was saved."
|
||||
notification_parts.append(
|
||||
"Some Notion AI content couldn't be synced (API limitation)"
|
||||
)
|
||||
if notion_client.is_using_legacy_token():
|
||||
notification_parts.append(
|
||||
"Using legacy token. Reconnect with OAuth for better reliability."
|
||||
)
|
||||
if warning_parts:
|
||||
notification_parts.append(", ".join(warning_parts))
|
||||
|
||||
user_notification_message = (
|
||||
" ".join(notification_parts) if notification_parts else None
|
||||
)
|
||||
|
||||
# Log success
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Successfully completed Notion indexing for connector {connector_id}",
|
||||
{
|
||||
"pages_processed": total_processed,
|
||||
"pages_processed": documents_indexed,
|
||||
"documents_indexed": documents_indexed,
|
||||
"documents_skipped": documents_skipped,
|
||||
"documents_failed": documents_failed,
|
||||
"duplicate_content_count": duplicate_content_count,
|
||||
"skipped_pages_count": len(skipped_pages),
|
||||
"pages_with_skipped_ai_content": pages_with_skipped_ai_content,
|
||||
"result_message": result_message,
|
||||
},
|
||||
)
|
||||
|
||||
|
|
@ -581,35 +403,9 @@ async def index_notion_pages(
|
|||
f"({duplicate_content_count} duplicate content)"
|
||||
)
|
||||
|
||||
# Clean up the async client
|
||||
await notion_client.close()
|
||||
|
||||
# Build user-friendly notification messages
|
||||
# This will be shown in the notification to inform users
|
||||
notification_parts = []
|
||||
|
||||
if pages_with_skipped_ai_content > 0:
|
||||
notification_parts.append(
|
||||
"Some Notion AI content couldn't be synced (API limitation)"
|
||||
)
|
||||
|
||||
if notion_client.is_using_legacy_token():
|
||||
notification_parts.append(
|
||||
"Using legacy token. Reconnect with OAuth for better reliability."
|
||||
)
|
||||
|
||||
# Include warning message if there were issues
|
||||
if warning_message:
|
||||
notification_parts.append(warning_message)
|
||||
|
||||
user_notification_message = (
|
||||
" ".join(notification_parts) if notification_parts else None
|
||||
)
|
||||
|
||||
return (
|
||||
total_processed,
|
||||
user_notification_message,
|
||||
)
|
||||
return documents_indexed, documents_skipped, user_notification_message
|
||||
|
||||
except SQLAlchemyError as db_error:
|
||||
await session.rollback()
|
||||
|
|
@ -622,10 +418,9 @@ async def index_notion_pages(
|
|||
logger.error(
|
||||
f"Database error during Notion indexing: {db_error!s}", exc_info=True
|
||||
)
|
||||
# Clean up the async client in case of error
|
||||
if "notion_client" in locals():
|
||||
await notion_client.close()
|
||||
return 0, f"Database error: {db_error!s}"
|
||||
return 0, 0, f"Database error: {db_error!s}"
|
||||
except Exception as e:
|
||||
await session.rollback()
|
||||
await task_logger.log_task_failure(
|
||||
|
|
@ -635,7 +430,6 @@ async def index_notion_pages(
|
|||
{"error_type": type(e).__name__},
|
||||
)
|
||||
logger.error(f"Failed to index Notion pages: {e!s}", exc_info=True)
|
||||
# Clean up the async client in case of error
|
||||
if "notion_client" in locals():
|
||||
await notion_client.close()
|
||||
return 0, f"Failed to index Notion pages: {e!s}"
|
||||
return 0, 0, f"Failed to index Notion pages: {e!s}"
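The new unit tests referenced in the commit message are among the 4 changed files but are not shown in this excerpt. As a rough, hedged illustration of what they might check, here is a pytest-style sketch for the Notion `_build_connector_doc` helper shown above; the import path and the assumption that ConnectorDocument exposes its constructor arguments as plain attributes are guesses, not taken from the diff.

# Hypothetical test sketch -- the indexer module path and attribute access are assumed.
from app.indexing_pipeline.connector_document import ConnectorDocument

from app.connectors.indexers.notion_indexer import _build_connector_doc  # path assumed


def test_build_connector_doc_maps_page_fields():
    page = {"page_id": "abc123", "title": "Roadmap"}
    markdown = "# Notion Page: Roadmap\n\nSome content"

    doc = _build_connector_doc(
        page,
        markdown,
        connector_id=7,
        search_space_id=42,
        user_id="user-1",
        enable_summary=False,
    )

    assert isinstance(doc, ConnectorDocument)
    assert doc.title == "Roadmap"  # assumes attributes mirror constructor kwargs
    assert doc.unique_id == "abc123"
    assert doc.source_markdown == markdown
    assert doc.should_summarize is False
    assert doc.metadata["connector_type"] == "Notion"
    assert doc.fallback_summary.startswith("Notion Page: Roadmap")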