2025-08-12 15:28:13 -07:00
|
|
|
"""
|
|
|
|
|
Base functionality and shared imports for document processors.
|
|
|
|
|
"""
|
|
|
|
|
|
2025-12-12 01:32:14 -08:00
|
|
|
from datetime import UTC, datetime
|
|
|
|
|
|
2025-08-12 15:28:13 -07:00
|
|
|
from langchain_community.document_transformers import MarkdownifyTransformer
|
|
|
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
|
from sqlalchemy.future import select
|
|
|
|
|
|
2025-08-18 20:51:48 -07:00
|
|
|
from app.db import Document
|
2025-08-12 15:28:13 -07:00
|
|
|
|
|
|
|
|
# Module-level Markdownify transformer shared by the document processors
# in this package (presumably used to convert HTML documents to Markdown
# — confirm against the processors that reference it).
md = MarkdownifyTransformer()
|
|
|
|
|
|
|
|
|
|
|
2026-02-06 04:35:13 +05:30
|
|
|
def safe_set_chunks(document: Document, chunks: list) -> None:
    """Assign ``chunks`` to ``document`` without triggering lazy loading.

    ALWAYS use this instead of ``document.chunks = chunks``, which makes
    SQLAlchemy load the OLD chunk collection first (for comparison and
    orphan detection). That lazy load fails in an async context with the
    asyncpg driver (MissingGreenlet / greenlet_spawn errors).

    ``set_committed_value`` writes the attribute directly, skipping the
    comparison step, so it is safe regardless of how the document was
    loaded (with or without selectinload).

    Args:
        document: The Document object to update.
        chunks: List of Chunk objects to assign.

    Example:
        # Instead of: document.chunks = chunks (DANGEROUS!)
        safe_set_chunks(document, chunks)  # Always safe
    """
    # Imported locally to keep the module's top-level imports unchanged.
    from sqlalchemy.orm.attributes import set_committed_value

    set_committed_value(document, 'chunks', chunks)
|
|
|
|
|
|
|
|
|
|
|
2025-12-12 01:32:14 -08:00
|
|
|
def get_current_timestamp() -> datetime:
|
|
|
|
|
"""
|
|
|
|
|
Get the current timestamp with timezone for updated_at field.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
Current datetime with UTC timezone
|
|
|
|
|
"""
|
|
|
|
|
return datetime.now(UTC)
|
|
|
|
|
|
|
|
|
|
|
2025-08-12 15:28:13 -07:00
|
|
|
async def check_duplicate_document(
    session: AsyncSession, content_hash: str
) -> Document | None:
    """Look up an existing document by its content hash.

    Args:
        session: Database session.
        content_hash: Hash of the document content.

    Returns:
        The matching Document if one exists, otherwise None.
    """
    query = select(Document).where(Document.content_hash == content_hash)
    result = await session.execute(query)
    return result.scalars().first()
|
2025-10-14 21:09:11 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
async def check_document_by_unique_identifier(
    session: AsyncSession, unique_identifier_hash: str
) -> Document | None:
    """Look up an existing document by its unique-identifier hash.

    Chunks are eagerly loaded via ``selectinload`` so that a subsequent
    update can touch the relationship without triggering async lazy
    loading.

    Args:
        session: Database session.
        unique_identifier_hash: Hash of the unique identifier from the source.

    Returns:
        The matching Document if one exists, otherwise None.
    """
    # Imported locally to keep the module's top-level imports unchanged.
    from sqlalchemy.orm import selectinload

    query = (
        select(Document)
        .options(selectinload(Document.chunks))
        .where(Document.unique_identifier_hash == unique_identifier_hash)
    )
    result = await session.execute(query)
    return result.scalars().first()
|