# SurfSense/surfsense_backend/app/tasks/document_processors/base.py

"""
Base functionality and shared imports for document processors.
"""

from datetime import UTC, datetime

from langchain_community.document_transformers import MarkdownifyTransformer
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select

from app.db import Document

# Module-level MarkdownifyTransformer instance, constructed once when this
# module is imported. Presumably shared by the individual document
# processors that import `md` from here -- verify against callers.
md = MarkdownifyTransformer()


def safe_set_chunks(document: Document, chunks: list) -> None:
    """
    Attach chunks to a document without touching the relationship loader.

    Use this helper in place of a plain `document.chunks = chunks`
    assignment, which can raise SQLAlchemy async errors
    (MissingGreenlet / greenlet_spawn).

    Background:
    - A plain assignment makes SQLAlchemy load the PREVIOUS chunks first
      (for comparison/orphan detection).
    - That implicit lazy load fails in an async context with the asyncpg
      driver.
    - set_committed_value writes the value directly and skips that load.

    The helper works whether or not the document was loaded with
    selectinload.

    NOTE(review): per the SQLAlchemy docs, set_committed_value records no
    history events for the attribute -- confirm callers do not rely on
    change tracking for this assignment.

    Args:
        document: The Document object whose chunks are being replaced
        chunks: List of Chunk objects to store on the document

    Example:
        # Avoid: document.chunks = chunks
        safe_set_chunks(document, chunks)
    """
    # Imported locally, mirroring the original function-scope import.
    from sqlalchemy.orm import attributes as orm_attributes

    orm_attributes.set_committed_value(document, "chunks", chunks)


def get_current_timestamp() -> datetime:
    """
    Get the current timestamp with timezone for updated_at field.

    Returns:
        Current datetime with UTC timezone
    """
    return datetime.now(UTC)


async def check_duplicate_document(
    session: AsyncSession, content_hash: str
) -> Document | None:
    """
    Look up a document by the hash of its content.

    Args:
        session: Database session used to run the query
        content_hash: Hash of the document content to search for

    Returns:
        The first document whose content_hash matches, or None when
        no row matches
    """
    # Build the statement first, then execute it on the async session.
    query = select(Document).where(Document.content_hash == content_hash)
    result = await session.execute(query)
    return result.scalars().first()


async def check_document_by_unique_identifier(
    session: AsyncSession, unique_identifier_hash: str
) -> Document | None:
    """
    Look up a document by its unique identifier hash.

    The query eagerly loads the document's chunks (via selectinload) so
    that later updates do not run into lazy loading issues.

    Args:
        session: Database session used to run the query
        unique_identifier_hash: Hash of the unique identifier from the source

    Returns:
        The first document whose unique_identifier_hash matches, or None
        when no row matches
    """
    from sqlalchemy.orm import selectinload

    # Filter on the hash and request the chunks collection in the same
    # round of loading.
    query = (
        select(Document)
        .where(Document.unique_identifier_hash == unique_identifier_hash)
        .options(selectinload(Document.chunks))
    )
    result = await session.execute(query)
    return result.scalars().first()
$DESKTOP-RTLN3BA\$punk$ refactor: refactored background_tasks & indexing_tasks 2025-08-12 15:28:13 -07:00			`"""`
			`Base functionality and shared imports for document processors.`
			`"""`

$DESKTOP-RTLN3BA\$punk$ feat: update document tracking to use 'updated_at' timestamp instead of 'last_edited_at' 2025-12-12 01:32:14 -08:00			`from datetime import UTC, datetime`

$DESKTOP-RTLN3BA\$punk$ refactor: refactored background_tasks & indexing_tasks 2025-08-12 15:28:13 -07:00			`from langchain_community.document_transformers import MarkdownifyTransformer`
			`from sqlalchemy.ext.asyncio import AsyncSession`
			`from sqlalchemy.future import select`

$DESKTOP-RTLN3BA\$punk$ feat: Fixed Document Summary Content across connectors and processors 2025-08-18 20:51:48 -07:00			`from app.db import Document`
$DESKTOP-RTLN3BA\$punk$ refactor: refactored background_tasks & indexing_tasks 2025-08-12 15:28:13 -07:00
			`# Initialize markdown transformer`
			`md = MarkdownifyTransformer()`


feat: implement two-phase document indexing for Obsidian and Circleback connectors with real-time status updates 2026-02-06 04:35:13 +05:30			`def safe_set_chunks(document: Document, chunks: list) -> None:`
			`"""`
			`Safely assign chunks to a document without triggering lazy loading.`
chore: ran linting 2026-02-06 05:35:15 +05:30
feat: implement two-phase document indexing for Obsidian and Circleback connectors with real-time status updates 2026-02-06 04:35:13 +05:30			ALWAYS use this instead of `document.chunks = chunks` to avoid
			`SQLAlchemy async errors (MissingGreenlet / greenlet_spawn).`
chore: ran linting 2026-02-06 05:35:15 +05:30
feat: implement two-phase document indexing for Obsidian and Circleback connectors with real-time status updates 2026-02-06 04:35:13 +05:30			`Why this is needed:`
			- Direct assignment `document.chunks = chunks` triggers SQLAlchemy to
			`load the OLD chunks first (for comparison/orphan detection)`
			`- This lazy loading fails in async context with asyncpg driver`
			`- set_committed_value bypasses this by setting the value directly`
chore: ran linting 2026-02-06 05:35:15 +05:30
feat: implement two-phase document indexing for Obsidian and Circleback connectors with real-time status updates 2026-02-06 04:35:13 +05:30			`This function is safe regardless of how the document was loaded`
			`(with or without selectinload).`
chore: ran linting 2026-02-06 05:35:15 +05:30
feat: implement two-phase document indexing for Obsidian and Circleback connectors with real-time status updates 2026-02-06 04:35:13 +05:30			`Args:`
			`document: The Document object to update`
			`chunks: List of Chunk objects to assign`
chore: ran linting 2026-02-06 05:35:15 +05:30
feat: implement two-phase document indexing for Obsidian and Circleback connectors with real-time status updates 2026-02-06 04:35:13 +05:30			`Example:`
			`# Instead of: document.chunks = chunks (DANGEROUS!)`
			`safe_set_chunks(document, chunks) # Always safe`
			`"""`
			`from sqlalchemy.orm.attributes import set_committed_value`
chore: ran linting 2026-02-06 05:35:15 +05:30
			`set_committed_value(document, "chunks", chunks)`
feat: implement two-phase document indexing for Obsidian and Circleback connectors with real-time status updates 2026-02-06 04:35:13 +05:30

$DESKTOP-RTLN3BA\$punk$ feat: update document tracking to use 'updated_at' timestamp instead of 'last_edited_at' 2025-12-12 01:32:14 -08:00			`def get_current_timestamp() -> datetime:`
			`"""`
			`Get the current timestamp with timezone for updated_at field.`

			`Returns:`
			`Current datetime with UTC timezone`
			`"""`
			`return datetime.now(UTC)`


$DESKTOP-RTLN3BA\$punk$ refactor: refactored background_tasks & indexing_tasks 2025-08-12 15:28:13 -07:00			`async def check_duplicate_document(`
			`session: AsyncSession, content_hash: str`
			`) -> Document \| None:`
			`"""`
			`Check if a document with the given content hash already exists.`

			`Args:`
			`session: Database session`
			`content_hash: Hash of the document content`

			`Returns:`
			`Existing document if found, None otherwise`
			`"""`
			`existing_doc_result = await session.execute(`
			`select(Document).where(Document.content_hash == content_hash)`
			`)`
			`return existing_doc_result.scalars().first()`
$DESKTOP-RTLN3BA\$punk$ feat: add unique identifier hash for documents to prevent duplicates across various connectors 2025-10-14 21:09:11 -07:00

			`async def check_document_by_unique_identifier(`
			`session: AsyncSession, unique_identifier_hash: str`
			`) -> Document \| None:`
			`"""`
			`Check if a document with the given unique identifier hash already exists.`
			`Eagerly loads chunks to avoid lazy loading issues during updates.`

			`Args:`
			`session: Database session`
			`unique_identifier_hash: Hash of the unique identifier from the source`

			`Returns:`
			`Existing document if found, None otherwise`
			`"""`
			`from sqlalchemy.orm import selectinload`

			`existing_doc_result = await session.execute(`
			`select(Document)`
			`.options(selectinload(Document.chunks))`
			`.where(Document.unique_identifier_hash == unique_identifier_hash)`
			`)`
			`return existing_doc_result.scalars().first()`