2026-02-25 19:56:59 +02:00
|
|
|
import contextlib
|
2026-02-25 02:20:44 +02:00
|
|
|
from datetime import UTC, datetime
|
|
|
|
|
|
2026-02-25 12:03:00 +02:00
|
|
|
from sqlalchemy import delete, select
|
2026-02-26 03:05:20 +05:30
|
|
|
from sqlalchemy.exc import IntegrityError
|
2026-02-25 00:06:34 +02:00
|
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
2026-02-25 00:30:11 +02:00
|
|
|
|
2026-02-25 01:40:30 +02:00
|
|
|
from app.db import Chunk, Document, DocumentStatus
|
2026-02-25 00:06:34 +02:00
|
|
|
from app.indexing_pipeline.connector_document import ConnectorDocument
|
2026-02-25 01:40:30 +02:00
|
|
|
from app.indexing_pipeline.document_chunker import chunk_text
|
|
|
|
|
from app.indexing_pipeline.document_embedder import embed_text
|
2026-02-25 19:56:59 +02:00
|
|
|
from app.indexing_pipeline.document_hashing import (
|
|
|
|
|
compute_content_hash,
|
|
|
|
|
compute_unique_identifier_hash,
|
|
|
|
|
)
|
|
|
|
|
from app.indexing_pipeline.document_persistence import (
|
|
|
|
|
attach_chunks_to_document,
|
|
|
|
|
rollback_and_persist_failure,
|
|
|
|
|
)
|
2026-02-25 01:40:30 +02:00
|
|
|
from app.indexing_pipeline.document_summarizer import summarize_document
|
2026-02-25 15:26:04 +02:00
|
|
|
from app.indexing_pipeline.exceptions import (
|
|
|
|
|
EMBEDDING_ERRORS,
|
|
|
|
|
PERMANENT_LLM_ERRORS,
|
|
|
|
|
RETRYABLE_LLM_ERRORS,
|
|
|
|
|
PipelineMessages,
|
|
|
|
|
embedding_message,
|
|
|
|
|
llm_permanent_message,
|
|
|
|
|
llm_retryable_message,
|
2026-02-25 16:04:35 +02:00
|
|
|
safe_exception_message,
|
|
|
|
|
)
|
|
|
|
|
from app.indexing_pipeline.pipeline_logger import (
|
|
|
|
|
PipelineLogContext,
|
2026-02-25 17:44:35 +02:00
|
|
|
log_batch_aborted,
|
2026-02-25 16:04:35 +02:00
|
|
|
log_chunking_overflow,
|
|
|
|
|
log_doc_skipped_unknown,
|
|
|
|
|
log_document_queued,
|
|
|
|
|
log_document_requeued,
|
|
|
|
|
log_document_updated,
|
|
|
|
|
log_embedding_error,
|
|
|
|
|
log_index_started,
|
|
|
|
|
log_index_success,
|
|
|
|
|
log_permanent_llm_error,
|
|
|
|
|
log_race_condition,
|
|
|
|
|
log_retryable_llm_error,
|
|
|
|
|
log_unexpected_error,
|
2026-02-25 15:26:04 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
2026-02-25 00:06:34 +02:00
|
|
|
class IndexingPipelineService:
    """Single pipeline for indexing connector documents. All connectors use this service.

    Two-phase flow:
      1. ``prepare_for_indexing`` — persist/refresh Document rows and decide
         which ones actually need (re)indexing.
      2. ``index`` — run summarization, embedding, and chunking for one
         document and persist the result, mapping every failure category to a
         persisted failure status instead of raising.
    """

    def __init__(self, session: AsyncSession) -> None:
        # All DB work in this service goes through this single async session;
        # commits/rollbacks here therefore affect any pending state the caller
        # has attached to the same session.
        self.session = session

    async def prepare_for_indexing(
        self, connector_docs: list[ConnectorDocument]
    ) -> list[Document]:
        """
        Persist new documents and detect changes, returning only those that need indexing.

        For each incoming connector document (deduplicated in-batch by
        unique-identifier hash):
          * unchanged content + READY status  -> skipped entirely
            (title drift is still synced, but does not trigger reindexing);
          * unchanged content + non-READY     -> requeued (status reset to pending);
          * changed content                   -> updated in place and requeued;
          * brand new, but content_hash already exists on some other document
            -> skipped as a duplicate;
          * brand new otherwise               -> inserted with a "Pending..."
            placeholder body and pending status.

        The whole batch is committed once at the end. On ``IntegrityError``
        (a concurrent worker won the insert race) or any other commit failure,
        everything is rolled back and an empty list is returned, deferring the
        work to the next sync run.

        Args:
            connector_docs: Batch of normalized documents from a connector.

        Returns:
            The Document rows (new or existing) that should be passed to
            ``index``. Empty if the batch commit failed.
        """
        documents: list[Document] = []
        # Tracks unique-identifier hashes already handled in THIS batch so a
        # connector that yields the same item twice doesn't process it twice.
        seen_hashes: set[str] = set()
        # Batch-level log context; uses the first doc's ids as representative
        # values (0 when the batch is empty).
        batch_ctx = PipelineLogContext(
            connector_id=connector_docs[0].connector_id if connector_docs else 0,
            search_space_id=connector_docs[0].search_space_id if connector_docs else 0,
            unique_id="batch",
        )

        for connector_doc in connector_docs:
            ctx = PipelineLogContext(
                connector_id=connector_doc.connector_id,
                search_space_id=connector_doc.search_space_id,
                unique_id=connector_doc.unique_id,
            )
            # Per-document try: one bad document is logged and skipped rather
            # than aborting the whole batch.
            try:
                unique_identifier_hash = compute_unique_identifier_hash(connector_doc)
                content_hash = compute_content_hash(connector_doc)

                # In-batch dedup by identity hash.
                if unique_identifier_hash in seen_hashes:
                    continue
                seen_hashes.add(unique_identifier_hash)

                # Look up an existing row for this logical document.
                result = await self.session.execute(
                    select(Document).filter(
                        Document.unique_identifier_hash == unique_identifier_hash
                    )
                )
                existing = result.scalars().first()

                if existing is not None:
                    if existing.content_hash == content_hash:
                        # Content unchanged — sync a drifted title without
                        # triggering a reindex.
                        if existing.title != connector_doc.title:
                            existing.title = connector_doc.title
                            existing.updated_at = datetime.now(UTC)
                        # If a previous run left the doc in a non-READY state
                        # (failed/stuck), requeue it even though content is
                        # unchanged.
                        if not DocumentStatus.is_state(
                            existing.status, DocumentStatus.READY
                        ):
                            existing.status = DocumentStatus.pending()
                            existing.updated_at = datetime.now(UTC)
                            documents.append(existing)
                            log_document_requeued(ctx)
                        continue

                    # Content changed — refresh the stored copy and requeue.
                    existing.title = connector_doc.title
                    existing.content_hash = content_hash
                    existing.source_markdown = connector_doc.source_markdown
                    existing.document_metadata = connector_doc.metadata
                    existing.updated_at = datetime.now(UTC)
                    existing.status = DocumentStatus.pending()
                    documents.append(existing)
                    log_document_updated(ctx)
                    continue

                # New identity, but identical content may already exist under a
                # different identifier — skip such duplicates.
                # NOTE(review): this check is not filtered by search_space_id,
                # so it appears to dedup content globally — confirm intended.
                duplicate = await self.session.execute(
                    select(Document).filter(Document.content_hash == content_hash)
                )
                if duplicate.scalars().first() is not None:
                    continue

                # Brand-new document: insert with a placeholder body; the real
                # content is produced later by index().
                document = Document(
                    title=connector_doc.title,
                    document_type=connector_doc.document_type,
                    content="Pending...",
                    content_hash=content_hash,
                    unique_identifier_hash=unique_identifier_hash,
                    source_markdown=connector_doc.source_markdown,
                    document_metadata=connector_doc.metadata,
                    search_space_id=connector_doc.search_space_id,
                    connector_id=connector_doc.connector_id,
                    created_by_id=connector_doc.created_by_id,
                    updated_at=datetime.now(UTC),
                    status=DocumentStatus.pending(),
                )
                self.session.add(document)
                documents.append(document)
                log_document_queued(ctx)

            except Exception as e:
                # Skip only this document; the rest of the batch proceeds.
                log_doc_skipped_unknown(ctx, e)

        try:
            # Single commit for the whole batch.
            await self.session.commit()
            return documents
        except IntegrityError:
            # A concurrent worker committed a document with the same content_hash
            # or unique_identifier_hash between our check and our INSERT.
            # The document already exists — roll back and let the next sync run handle it.
            log_race_condition(batch_ctx)
            await self.session.rollback()
            return []
        except Exception as e:
            # Any other commit failure aborts the batch entirely.
            log_batch_aborted(batch_ctx, e)
            await self.session.rollback()
            return []

    async def index(
        self, document: Document, connector_doc: ConnectorDocument, llm
    ) -> Document:
        """
        Run summarization, embedding, and chunking for a document and persist the results.

        Steps (each failure category is persisted as a failure status via
        ``rollback_and_persist_failure`` rather than raised to the caller):
          1. Mark the document PROCESSING and commit immediately, so the state
             is visible to other workers before the slow LLM work starts.
          2. Pick the document body: LLM summary when requested and an ``llm``
             is available; else the connector's fallback summary; else the raw
             source markdown.
          3. Embed the chosen body, delete any stale chunks, and rebuild chunks
             (each individually embedded) from the raw source markdown.
          4. Persist content/embedding/chunks, stamp updated_at, mark READY,
             and commit.

        Args:
            document: The Document row to index (already persisted).
            connector_doc: The connector payload the row was built from.
            llm: LLM client passed through to ``summarize_document``; may be
                None to skip summarization. (Untyped here — presumably the
                project's LLM client interface; confirm at the call site.)

        Returns:
            The same ``document``, refreshed from the DB on a best-effort
            basis. Its status reflects success (READY) or the persisted
            failure state.
        """
        ctx = PipelineLogContext(
            connector_id=connector_doc.connector_id,
            search_space_id=connector_doc.search_space_id,
            unique_id=connector_doc.unique_id,
            doc_id=document.id,
        )
        try:
            log_index_started(ctx)
            # Commit PROCESSING up front so the transition is durable before
            # the long-running LLM/embedding work begins.
            document.status = DocumentStatus.processing()
            await self.session.commit()

            # Choose the document body: summary, fallback summary, or raw text.
            if connector_doc.should_summarize and llm is not None:
                content = await summarize_document(
                    connector_doc.source_markdown, llm, connector_doc.metadata
                )
            elif connector_doc.should_summarize and connector_doc.fallback_summary:
                content = connector_doc.fallback_summary
            else:
                content = connector_doc.source_markdown

            embedding = embed_text(content)

            # Drop stale chunks before rebuilding (re-index path).
            await self.session.execute(
                delete(Chunk).where(Chunk.document_id == document.id)
            )

            # Chunks are always built from the RAW source markdown (not the
            # summary), each with its own embedding.
            chunks = [
                Chunk(content=text, embedding=embed_text(text))
                for text in chunk_text(
                    connector_doc.source_markdown,
                    use_code_chunker=connector_doc.should_use_code_chunker,
                )
            ]

            document.content = content
            document.embedding = embedding
            attach_chunks_to_document(document, chunks)
            document.updated_at = datetime.now(UTC)
            document.status = DocumentStatus.ready()
            await self.session.commit()
            log_index_success(ctx, chunk_count=len(chunks))

        # Exception clause order matters: the specific project-defined
        # categories must be tried before the catch-all Exception.
        except RETRYABLE_LLM_ERRORS as e:
            # Transient LLM failure — persisted message signals a retry.
            log_retryable_llm_error(ctx, e)
            await rollback_and_persist_failure(
                self.session, document, llm_retryable_message(e)
            )

        except PERMANENT_LLM_ERRORS as e:
            # Non-recoverable LLM failure — no point retrying.
            log_permanent_llm_error(ctx, e)
            await rollback_and_persist_failure(
                self.session, document, llm_permanent_message(e)
            )

        except RecursionError as e:
            # chunk_text recursed past the interpreter limit on this input.
            log_chunking_overflow(ctx, e)
            await rollback_and_persist_failure(
                self.session, document, PipelineMessages.CHUNKING_OVERFLOW
            )

        except EMBEDDING_ERRORS as e:
            log_embedding_error(ctx, e)
            await rollback_and_persist_failure(
                self.session, document, embedding_message(e)
            )

        except Exception as e:
            # Anything else: record a sanitized message; never raise out.
            log_unexpected_error(ctx, e)
            await rollback_and_persist_failure(
                self.session, document, safe_exception_message(e)
            )

        # Best-effort refresh so the caller sees the persisted state; a
        # refresh failure must not mask the (already handled) outcome.
        with contextlib.suppress(Exception):
            await self.session.refresh(document)

        return document
|