feat: optimize document upload process and enhance memory management

- Increased maximum file upload limit from 10 to 50 to improve user experience.
- Implemented batch processing for document uploads to avoid proxy timeouts, splitting files into manageable chunks.
- Enhanced garbage collection in chat streaming functions to prevent memory leaks and improve performance.
- Added memory delta tracking in system snapshots for better monitoring of resource usage.
- Updated LLM router and service configurations to prevent unbounded internal accumulation and improve efficiency.
This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-02-28 17:22:34 -08:00
parent cc64e18501
commit d959a6a6c8
16 changed files with 219 additions and 187 deletions

View file

@@ -1 +1,28 @@
"""Celery tasks package."""
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from sqlalchemy.pool import NullPool
from app.config import config
_celery_engine = None
_celery_session_maker = None
def get_celery_session_maker() -> async_sessionmaker:
    """Lazily build and cache the async session maker used by Celery tasks.

    The NullPool engine is constructed once per worker process on first
    use and then reused by every subsequent task invocation, so engine
    objects are never leaked.
    """
    global _celery_engine, _celery_session_maker
    # Fast path: the maker already exists for this worker process.
    if _celery_session_maker is not None:
        return _celery_session_maker
    _celery_engine = create_async_engine(
        config.DATABASE_URL,
        poolclass=NullPool,
        echo=False,
    )
    _celery_session_maker = async_sessionmaker(
        _celery_engine, expire_on_commit=False
    )
    return _celery_session_maker

View file

@@ -3,11 +3,8 @@
import logging
import traceback
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from sqlalchemy.pool import NullPool
from app.celery_app import celery_app
from app.config import config
from app.tasks.celery_tasks import get_celery_session_maker
logger = logging.getLogger(__name__)
@@ -42,20 +39,6 @@ def _handle_greenlet_error(e: Exception, task_name: str, connector_id: int) -> N
)
def get_celery_session_maker():
    """Return an async session maker for Celery tasks.

    Celery tasks run in a new event loop, so they cannot reuse the main
    app's session maker.  The engine is created once and cached on the
    function object: the previous implementation built a fresh engine on
    every call, leaking engine objects across task invocations.
    """
    maker = getattr(get_celery_session_maker, "_cached_maker", None)
    if maker is None:
        engine = create_async_engine(
            config.DATABASE_URL,
            poolclass=NullPool,  # Don't use connection pooling for Celery tasks
            echo=False,
        )
        maker = async_sessionmaker(engine, expire_on_commit=False)
        get_celery_session_maker._cached_maker = maker
    return maker
@celery_app.task(name="index_slack_messages", bind=True)
def index_slack_messages_task(
self,

View file

@@ -4,15 +4,13 @@ import logging
from sqlalchemy import delete, select
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from sqlalchemy.orm import selectinload
from sqlalchemy.pool import NullPool
from app.celery_app import celery_app
from app.config import config
from app.db import Document
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.tasks.celery_tasks import get_celery_session_maker
from app.utils.document_converters import (
create_document_chunks,
generate_document_summary,
@@ -21,16 +19,6 @@ from app.utils.document_converters import (
logger = logging.getLogger(__name__)
def get_celery_session_maker():
    """Create (once) and return the async session maker for Celery tasks.

    The engine is cached on the function object; building a new engine on
    every call — as the previous implementation did — leaks engine
    objects across task invocations.
    """
    maker = getattr(get_celery_session_maker, "_cached_maker", None)
    if maker is None:
        engine = create_async_engine(
            config.DATABASE_URL,
            poolclass=NullPool,  # no pooling: Celery tasks run in their own loop
            echo=False,
        )
        maker = async_sessionmaker(engine, expire_on_commit=False)
        get_celery_session_maker._cached_maker = maker
    return maker
@celery_app.task(name="reindex_document", bind=True)
def reindex_document_task(self, document_id: int, user_id: str):
"""

View file

@@ -5,13 +5,11 @@ import logging
import os
from uuid import UUID
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from sqlalchemy.pool import NullPool
from app.celery_app import celery_app
from app.config import config
from app.services.notification_service import NotificationService
from app.services.task_logging_service import TaskLoggingService
from app.tasks.celery_tasks import get_celery_session_maker
from app.tasks.document_processors import (
add_extension_received_document,
add_youtube_video_document,
@@ -91,20 +89,6 @@ async def _run_heartbeat_loop(notification_id: int):
pass # Normal cancellation when task completes
def get_celery_session_maker():
    """Return an async session maker for Celery tasks.

    Celery tasks run in a new event loop, so they cannot reuse the main
    app's session maker.  The engine is created once and cached on the
    function object: the previous implementation built a fresh engine on
    every call, leaking engine objects across task invocations.
    """
    maker = getattr(get_celery_session_maker, "_cached_maker", None)
    if maker is None:
        engine = create_async_engine(
            config.DATABASE_URL,
            poolclass=NullPool,  # Don't use connection pooling for Celery tasks
            echo=False,
        )
        maker = async_sessionmaker(engine, expire_on_commit=False)
        get_celery_session_maker._cached_maker = maker
    return maker
@celery_app.task(name="process_extension_document", bind=True)
def process_extension_document_task(
self, individual_document_dict, search_space_id: int, user_id: str

View file

@@ -5,14 +5,13 @@ import logging
import sys
from sqlalchemy import select
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from sqlalchemy.pool import NullPool
from app.agents.podcaster.graph import graph as podcaster_graph
from app.agents.podcaster.state import State as PodcasterState
from app.celery_app import celery_app
from app.config import config
from app.db import Podcast, PodcastStatus
from app.tasks.celery_tasks import get_celery_session_maker
logger = logging.getLogger(__name__)
@@ -25,20 +24,6 @@ if sys.platform.startswith("win"):
)
def get_celery_session_maker():
    """Return an async session maker for Celery tasks.

    Celery tasks run in a new event loop, so they cannot reuse the main
    app's session maker.  The engine is created once and cached on the
    function object: the previous implementation built a fresh engine on
    every call, leaking engine objects across task invocations.
    """
    maker = getattr(get_celery_session_maker, "_cached_maker", None)
    if maker is None:
        engine = create_async_engine(
            config.DATABASE_URL,
            poolclass=NullPool,  # Don't use connection pooling for Celery tasks
            echo=False,
        )
        maker = async_sessionmaker(engine, expire_on_commit=False)
        get_celery_session_maker._cached_maker = maker
    return maker
# =============================================================================
# Content-based podcast generation (for new-chat)
# =============================================================================

View file

@@ -3,28 +3,16 @@
import logging
from datetime import UTC, datetime
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from sqlalchemy.future import select
from sqlalchemy.pool import NullPool
from app.celery_app import celery_app
from app.config import config
from app.db import Notification, SearchSourceConnector, SearchSourceConnectorType
from app.tasks.celery_tasks import get_celery_session_maker
from app.utils.indexing_locks import is_connector_indexing_locked
logger = logging.getLogger(__name__)
def get_celery_session_maker():
    """Create (once) and return the async session maker for Celery tasks.

    The engine is cached on the function object; building a new engine on
    every call — as the previous implementation did — leaks engine
    objects across task invocations.
    """
    maker = getattr(get_celery_session_maker, "_cached_maker", None)
    if maker is None:
        engine = create_async_engine(
            config.DATABASE_URL,
            poolclass=NullPool,  # no pooling: Celery tasks run in their own loop
            echo=False,
        )
        maker = async_sessionmaker(engine, expire_on_commit=False)
        get_celery_session_maker._cached_maker = maker
    return maker
@celery_app.task(name="check_periodic_schedules")
def check_periodic_schedules_task():
"""

View file

@@ -29,20 +29,17 @@ from datetime import UTC, datetime
import redis
from sqlalchemy import and_, or_, text
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from sqlalchemy.future import select
from sqlalchemy.pool import NullPool
from app.celery_app import celery_app
from app.config import config
from app.db import Document, DocumentStatus, Notification
from app.tasks.celery_tasks import get_celery_session_maker
logger = logging.getLogger(__name__)
# Redis client for checking heartbeats
_redis_client: redis.Redis | None = None
# Error messages shown to users when tasks are interrupted
STALE_SYNC_ERROR_MESSAGE = "Sync was interrupted unexpectedly. Please retry."
STALE_PROCESSING_ERROR_MESSAGE = "Syncing was interrupted unexpectedly. Please retry."
@@ -60,16 +57,6 @@ def _get_heartbeat_key(notification_id: int) -> str:
return f"indexing:heartbeat:{notification_id}"
def get_celery_session_maker():
    """Create (once) and return the async session maker for Celery tasks.

    The engine is cached on the function object; building a new engine on
    every call — as the previous implementation did — leaks engine
    objects across task invocations.
    """
    maker = getattr(get_celery_session_maker, "_cached_maker", None)
    if maker is None:
        engine = create_async_engine(
            config.DATABASE_URL,
            poolclass=NullPool,  # no pooling: Celery tasks run in their own loop
            echo=False,
        )
        maker = async_sessionmaker(engine, expire_on_commit=False)
        get_celery_session_maker._cached_maker = maker
    return maker
@celery_app.task(name="cleanup_stale_indexing_notifications")
def cleanup_stale_indexing_notifications_task():
"""