SurfSense/surfsense_backend/app/tasks/document_processors/base.py

"""
Base functionality and shared imports for document processors.
"""

from datetime import UTC, datetime

from langchain_community.document_transformers import MarkdownifyTransformer
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select

from app.db import Document

# Initialize markdown transformer
md = MarkdownifyTransformer()


def get_current_timestamp() -> datetime:
    """
    Get the current timestamp with timezone for updated_at field.

    Returns:
        Current datetime with UTC timezone
    """
    return datetime.now(UTC)


async def check_duplicate_document(
    session: AsyncSession, content_hash: str
) -> Document | None:
    """
    Look up a document whose stored content hash equals the given hash.

    Args:
        session: Database session used to run the query
        content_hash: Hash of the document content to match against

    Returns:
        The first matching document, or None when nothing matches
    """
    # Build the statement first, then run it on the provided session.
    by_content_hash = select(Document).where(Document.content_hash == content_hash)
    query_result = await session.execute(by_content_hash)

    matches = query_result.scalars()
    return matches.first()


async def check_document_by_unique_identifier(
    session: AsyncSession, unique_identifier_hash: str
) -> Document | None:
    """
    Look up a document by the hash of its source-side unique identifier.

    The query attaches a selectinload option for Document.chunks so the
    chunks come back together with the document, avoiding lazy loading
    issues during updates.

    Args:
        session: Database session used to run the query
        unique_identifier_hash: Hash of the unique identifier from the source

    Returns:
        The first matching document, or None when nothing matches
    """
    from sqlalchemy.orm import selectinload

    # Compose the statement step by step: base select, eager-load option,
    # then the filter on the identifier hash.
    with_chunks = selectinload(Document.chunks)
    lookup = select(Document).options(with_chunks)
    lookup = lookup.where(Document.unique_identifier_hash == unique_identifier_hash)

    query_result = await session.execute(lookup)
    matches = query_result.scalars()
    return matches.first()
$DESKTOP-RTLN3BA\$punk$ refactor: refactored background_tasks & indexing_tasks 2025-08-12 15:28:13 -07:00			`"""`
			`Base functionality and shared imports for document processors.`
			`"""`

$DESKTOP-RTLN3BA\$punk$ feat: update document tracking to use 'updated_at' timestamp instead of 'last_edited_at' 2025-12-12 01:32:14 -08:00			`from datetime import UTC, datetime`

$DESKTOP-RTLN3BA\$punk$ refactor: refactored background_tasks & indexing_tasks 2025-08-12 15:28:13 -07:00			`from langchain_community.document_transformers import MarkdownifyTransformer`
			`from sqlalchemy.ext.asyncio import AsyncSession`
			`from sqlalchemy.future import select`

$DESKTOP-RTLN3BA\$punk$ feat: Fixed Document Summary Content across connectors and processors 2025-08-18 20:51:48 -07:00			`from app.db import Document`
$DESKTOP-RTLN3BA\$punk$ refactor: refactored background_tasks & indexing_tasks 2025-08-12 15:28:13 -07:00
			`# Initialize markdown transformer`
			`md = MarkdownifyTransformer()`


$DESKTOP-RTLN3BA\$punk$ feat: update document tracking to use 'updated_at' timestamp instead of 'last_edited_at' 2025-12-12 01:32:14 -08:00			`def get_current_timestamp() -> datetime:`
			`"""`
			`Get the current timestamp with timezone for updated_at field.`

			`Returns:`
			`Current datetime with UTC timezone`
			`"""`
			`return datetime.now(UTC)`


$DESKTOP-RTLN3BA\$punk$ refactor: refactored background_tasks & indexing_tasks 2025-08-12 15:28:13 -07:00			`async def check_duplicate_document(`
			`session: AsyncSession, content_hash: str`
			`) -> Document \| None:`
			`"""`
			`Check if a document with the given content hash already exists.`

			`Args:`
			`session: Database session`
			`content_hash: Hash of the document content`

			`Returns:`
			`Existing document if found, None otherwise`
			`"""`
			`existing_doc_result = await session.execute(`
			`select(Document).where(Document.content_hash == content_hash)`
			`)`
			`return existing_doc_result.scalars().first()`
$DESKTOP-RTLN3BA\$punk$ feat: add unique identifier hash for documents to prevent duplicates across various connectors 2025-10-14 21:09:11 -07:00

			`async def check_document_by_unique_identifier(`
			`session: AsyncSession, unique_identifier_hash: str`
			`) -> Document \| None:`
			`"""`
			`Check if a document with the given unique identifier hash already exists.`
			`Eagerly loads chunks to avoid lazy loading issues during updates.`

			`Args:`
			`session: Database session`
			`unique_identifier_hash: Hash of the unique identifier from the source`

			`Returns:`
			`Existing document if found, None otherwise`
			`"""`
			`from sqlalchemy.orm import selectinload`

			`existing_doc_result = await session.execute(`
			`select(Document)`
			`.options(selectinload(Document.chunks))`
			`.where(Document.unique_identifier_hash == unique_identifier_hash)`
			`)`
			`return existing_doc_result.scalars().first()`