mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-26 01:06:23 +02:00
refactor: update safe_set_chunks function to be asynchronous and modify all connector and document processor files to use the new async implementation
This commit is contained in:
parent
49d8f41b09
commit
2b33dfe728
30 changed files with 102 additions and 106 deletions
|
|
@ -14,45 +14,37 @@ from app.db import Document
|
|||
md = MarkdownifyTransformer()
|
||||
|
||||
|
||||
def safe_set_chunks(document: Document, chunks: list) -> None:
|
||||
async def safe_set_chunks(
|
||||
session: "AsyncSession", document: Document, chunks: list
|
||||
) -> None:
|
||||
"""
|
||||
Safely assign chunks to a document without triggering lazy loading.
|
||||
Delete old chunks and assign new ones to a document.
|
||||
|
||||
ALWAYS use this instead of `document.chunks = chunks` to avoid
|
||||
SQLAlchemy async errors (MissingGreenlet / greenlet_spawn).
|
||||
|
||||
Why this is needed:
|
||||
- Direct assignment `document.chunks = chunks` triggers SQLAlchemy to
|
||||
load the OLD chunks first (for comparison/orphan detection)
|
||||
- This lazy loading fails in async context with asyncpg driver
|
||||
- set_committed_value bypasses this by setting the value directly
|
||||
|
||||
This function is safe regardless of how the document was loaded
|
||||
(with or without selectinload).
|
||||
This replaces direct ``document.chunks = chunks`` which triggers lazy
|
||||
loading (and MissingGreenlet errors in async contexts). It also
|
||||
explicitly deletes pre-existing chunks so they don't accumulate across
|
||||
repeated re-indexes — ``set_committed_value`` bypasses SQLAlchemy's
|
||||
delete-orphan cascade.
|
||||
|
||||
Args:
|
||||
document: The Document object to update
|
||||
chunks: List of Chunk objects to assign
|
||||
|
||||
Example:
|
||||
# Instead of: document.chunks = chunks (DANGEROUS!)
|
||||
safe_set_chunks(document, chunks) # Always safe
|
||||
session: The current async database session.
|
||||
document: The Document object to update.
|
||||
chunks: List of Chunk objects to assign.
|
||||
"""
|
||||
from sqlalchemy.orm import object_session
|
||||
from sqlalchemy import delete
|
||||
from sqlalchemy.orm.attributes import set_committed_value
|
||||
|
||||
# Keep relationship assignment lazy-load-safe.
|
||||
set_committed_value(document, "chunks", chunks)
|
||||
from app.db import Chunk
|
||||
|
||||
# Ensure chunk rows are actually persisted.
|
||||
# set_committed_value bypasses normal unit-of-work tracking, so we need to
|
||||
# explicitly attach chunk objects to the current session.
|
||||
session = object_session(document)
|
||||
if session is not None:
|
||||
if document.id is not None:
|
||||
for chunk in chunks:
|
||||
chunk.document_id = document.id
|
||||
session.add_all(chunks)
|
||||
if document.id is not None:
|
||||
await session.execute(
|
||||
delete(Chunk).where(Chunk.document_id == document.id)
|
||||
)
|
||||
for chunk in chunks:
|
||||
chunk.document_id = document.id
|
||||
|
||||
set_committed_value(document, "chunks", chunks)
|
||||
session.add_all(chunks)
|
||||
|
||||
|
||||
def get_current_timestamp() -> datetime:
|
||||
|
|
|
|||
|
|
@ -227,7 +227,7 @@ async def add_circleback_meeting_document(
|
|||
if summary_embedding is not None:
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = document_metadata
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.source_markdown = markdown_content
|
||||
document.content_needs_reindexing = False
|
||||
document.updated_at = get_current_timestamp()
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ from app.utils.document_converters import (
|
|||
from .base import (
|
||||
check_document_by_unique_identifier,
|
||||
get_current_timestamp,
|
||||
safe_set_chunks,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -154,7 +155,7 @@ async def add_extension_received_document(
|
|||
existing_document.content_hash = content_hash
|
||||
existing_document.embedding = summary_embedding
|
||||
existing_document.document_metadata = content.metadata.model_dump()
|
||||
existing_document.chunks = chunks
|
||||
await safe_set_chunks(session, existing_document, chunks)
|
||||
existing_document.source_markdown = combined_document_string
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@ from .base import (
|
|||
check_document_by_unique_identifier,
|
||||
check_duplicate_document,
|
||||
get_current_timestamp,
|
||||
safe_set_chunks,
|
||||
)
|
||||
from .markdown_processor import add_received_markdown_file_document
|
||||
|
||||
|
|
@ -488,7 +489,7 @@ async def add_received_file_document_using_unstructured(
|
|||
"FILE_NAME": file_name,
|
||||
"ETL_SERVICE": "UNSTRUCTURED",
|
||||
}
|
||||
existing_document.chunks = chunks
|
||||
await safe_set_chunks(session, existing_document, chunks)
|
||||
existing_document.source_markdown = file_in_markdown
|
||||
existing_document.content_needs_reindexing = False
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
|
|
@ -622,7 +623,7 @@ async def add_received_file_document_using_llamacloud(
|
|||
"FILE_NAME": file_name,
|
||||
"ETL_SERVICE": "LLAMACLOUD",
|
||||
}
|
||||
existing_document.chunks = chunks
|
||||
await safe_set_chunks(session, existing_document, chunks)
|
||||
existing_document.source_markdown = file_in_markdown
|
||||
existing_document.content_needs_reindexing = False
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
|
|
@ -777,7 +778,7 @@ async def add_received_file_document_using_docling(
|
|||
"FILE_NAME": file_name,
|
||||
"ETL_SERVICE": "DOCLING",
|
||||
}
|
||||
existing_document.chunks = chunks
|
||||
await safe_set_chunks(session, existing_document, chunks)
|
||||
existing_document.source_markdown = file_in_markdown
|
||||
existing_document.content_needs_reindexing = False
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ from .base import (
|
|||
check_document_by_unique_identifier,
|
||||
check_duplicate_document,
|
||||
get_current_timestamp,
|
||||
safe_set_chunks,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -258,7 +259,7 @@ async def add_received_markdown_file_document(
|
|||
existing_document.document_metadata = {
|
||||
"FILE_NAME": file_name,
|
||||
}
|
||||
existing_document.chunks = chunks
|
||||
await safe_set_chunks(session, existing_document, chunks)
|
||||
existing_document.source_markdown = file_in_markdown
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
existing_document.status = DocumentStatus.ready() # Mark as ready
|
||||
|
|
|
|||
|
|
@ -419,7 +419,7 @@ async def add_youtube_video_document(
|
|||
"author": video_data.get("author_name", "Unknown"),
|
||||
"thumbnail": video_data.get("thumbnail_url", ""),
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.source_markdown = combined_document_string
|
||||
document.status = DocumentStatus.ready() # READY status - fully processed
|
||||
document.updated_at = get_current_timestamp()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue