# SurfSense/surfsense_backend/app/tasks/document_processors/base.py

"""
Base functionality and shared imports for document processors.
"""

from datetime import UTC, datetime

from langchain_community.document_transformers import MarkdownifyTransformer
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select

from app.db import Document

# Module-level MarkdownifyTransformer instance, constructed once at import
# time so other modules can import and reuse it instead of building their own.
# NOTE(review): presumably used by the document processors to convert HTML
# content to Markdown — confirm against callers.
md = MarkdownifyTransformer()


async def safe_set_chunks(
    session: "AsyncSession", document: Document, chunks: list
) -> None:
    """
    Replace a document's chunks without touching the lazy-loaded relationship.

    Assigning ``document.chunks = chunks`` directly makes SQLAlchemy load the
    existing collection first, which raises MissingGreenlet in async code.
    ``set_committed_value`` avoids that load, but it also bypasses the
    delete-orphan cascade, so the previously stored chunks are removed here
    with an explicit DELETE; otherwise they would pile up on every re-index.

    When ``document.id`` is ``None`` there are no stored rows to remove, so
    both the DELETE and the ``document_id`` assignment are skipped.

    This function neither flushes nor commits; the new chunks are only added
    to the session.

    Args:
        session: The current async database session.
        document: The Document object whose chunks are being replaced.
        chunks: List of Chunk objects to attach to the document.
    """
    from sqlalchemy import delete
    from sqlalchemy.orm.attributes import set_committed_value

    from app.db import Chunk

    doc_id = document.id
    if doc_id is not None:
        # Remove the rows left over from the previous indexing run.
        purge_old_chunks = delete(Chunk).where(Chunk.document_id == doc_id)
        await session.execute(purge_old_chunks)

        # Point every new chunk at the document via its foreign key.
        for new_chunk in chunks:
            new_chunk.document_id = doc_id

    # Install the collection as already-committed state (no lazy load is
    # triggered), then register the new chunks with the session.
    set_committed_value(document, "chunks", chunks)
    session.add_all(chunks)


def get_current_timestamp() -> datetime:
    """
    Get the current timestamp with timezone for updated_at field.

    Returns:
        Current datetime with UTC timezone
    """
    return datetime.now(UTC)


async def check_duplicate_document(
    session: AsyncSession, content_hash: str
) -> Document | None:
    """
    Look up a document by the hash of its content.

    Args:
        session: Database session used to run the query.
        content_hash: Hash of the document content to search for.

    Returns:
        The first Document whose ``content_hash`` matches, or None when no
        such document exists.
    """
    by_content_hash = select(Document).where(Document.content_hash == content_hash)
    query_result = await session.execute(by_content_hash)
    matches = query_result.scalars()
    return matches.first()


async def check_document_by_unique_identifier(
    session: AsyncSession, unique_identifier_hash: str
) -> Document | None:
    """
    Look up a document by its unique identifier hash.

    The document's chunks are loaded eagerly in the same call so that later
    updates do not hit lazy-loading problems.

    Args:
        session: Database session used to run the query.
        unique_identifier_hash: Hash of the unique identifier from the source.

    Returns:
        The first Document whose ``unique_identifier_hash`` matches, or None
        when no such document exists.
    """
    from sqlalchemy.orm import selectinload

    # Eager-load the chunks relationship alongside the document itself.
    with_chunks = select(Document).options(selectinload(Document.chunks))
    by_identifier = with_chunks.where(
        Document.unique_identifier_hash == unique_identifier_hash
    )

    query_result = await session.execute(by_identifier)
    matches = query_result.scalars()
    return matches.first()