diff --git a/surfsense_backend/app/agents/new_chat/tools/podcast.py b/surfsense_backend/app/agents/new_chat/tools/podcast.py index 424b04f77..1048ed881 100644 --- a/surfsense_backend/app/agents/new_chat/tools/podcast.py +++ b/surfsense_backend/app/agents/new_chat/tools/podcast.py @@ -54,7 +54,9 @@ def set_generating_podcast(search_space_id: int, podcast_id: int) -> None: client = get_redis_client() client.setex(_redis_key(search_space_id), 1800, str(podcast_id)) except Exception as e: - print(f"[generate_podcast] Warning: Could not set generating podcast in Redis: {e}") + print( + f"[generate_podcast] Warning: Could not set generating podcast in Redis: {e}" + ) def create_generate_podcast_tool( diff --git a/surfsense_backend/app/routes/new_chat_routes.py b/surfsense_backend/app/routes/new_chat_routes.py index 541e25a75..38352d348 100644 --- a/surfsense_backend/app/routes/new_chat_routes.py +++ b/surfsense_backend/app/routes/new_chat_routes.py @@ -670,7 +670,9 @@ async def delete_thread( ) from None -@router.post("/threads/{thread_id}/complete-clone", response_model=CompleteCloneResponse) +@router.post( + "/threads/{thread_id}/complete-clone", response_model=CompleteCloneResponse +) async def complete_clone( thread_id: int, session: AsyncSession = Depends(get_async_session), @@ -702,7 +704,9 @@ async def complete_clone( raise HTTPException(status_code=400, detail="Clone already completed") if not thread.cloned_from_thread_id: - raise HTTPException(status_code=400, detail="No source thread to clone from") + raise HTTPException( + status_code=400, detail="No source thread to clone from" + ) message_count = await complete_clone_content( session=session, diff --git a/surfsense_backend/app/routes/public_chat_routes.py b/surfsense_backend/app/routes/public_chat_routes.py index 8b4f42559..4676f2ad0 100644 --- a/surfsense_backend/app/routes/public_chat_routes.py +++ b/surfsense_backend/app/routes/public_chat_routes.py @@ -53,7 +53,9 @@ async def clone_public_chat_endpoint( source_thread = await get_thread_by_share_token(session, share_token) if not source_thread: - raise HTTPException(status_code=404, detail="Chat not found or no longer public") + raise HTTPException( + status_code=404, detail="Chat not found or no longer public" + ) target_search_space_id = await get_user_default_search_space(session, user.id) diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index 6ba67fb69..edb1760f3 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -941,7 +941,11 @@ async def index_connector_content( f"Triggering web pages indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}" ) index_crawled_urls_task.delay( - connector_id, search_space_id, str(user.id), indexing_from, indexing_to + connector_id, + search_space_id, + str(user.id), + indexing_from, + indexing_to, ) response_message = "Web page indexing started in the background." diff --git a/surfsense_backend/app/schemas/new_chat.py b/surfsense_backend/app/schemas/new_chat.py index b420b1b91..ab6be9c9f 100644 --- a/surfsense_backend/app/schemas/new_chat.py +++ b/surfsense_backend/app/schemas/new_chat.py @@ -257,14 +257,11 @@ class PublicChatResponse(BaseModel): class CloneInitResponse(BaseModel): - - thread_id: int search_space_id: int share_token: str class CompleteCloneResponse(BaseModel): - status: str message_count: int diff --git a/surfsense_backend/app/schemas/podcasts.py b/surfsense_backend/app/schemas/podcasts.py index 9e5cb0262..60f9d7dc0 100644 --- a/surfsense_backend/app/schemas/podcasts.py +++ b/surfsense_backend/app/schemas/podcasts.py @@ -59,6 +59,8 @@ class PodcastRead(PodcastBase): "search_space_id": obj.search_space_id, "status": obj.status, "created_at": obj.created_at, - "transcript_entries": len(obj.podcast_transcript) if obj.podcast_transcript else None, + "transcript_entries": len(obj.podcast_transcript) + if obj.podcast_transcript + else None, } return cls(**data) diff --git a/surfsense_backend/app/tasks/celery_tasks/podcast_tasks.py b/surfsense_backend/app/tasks/celery_tasks/podcast_tasks.py index 0ce714cdc..2ce8716e0 100644 --- a/surfsense_backend/app/tasks/celery_tasks/podcast_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/podcast_tasks.py @@ -55,7 +55,9 @@ def _clear_generating_podcast(search_space_id: int) -> None: client = redis.from_url(redis_url, decode_responses=True) key = f"podcast:generating:{search_space_id}" client.delete(key) - logger.info(f"Cleared generating podcast key for search_space_id={search_space_id}") + logger.info( + f"Cleared generating podcast key for search_space_id={search_space_id}" + ) except Exception as e: logger.warning(f"Could not clear generating podcast key: {e}") @@ -119,9 +121,7 @@ async def _generate_content_podcast( ) -> dict: """Generate content-based podcast and update existing record.""" async with get_celery_session_maker()() as session: - result = await session.execute( - select(Podcast).filter(Podcast.id == podcast_id) - ) + result = await session.execute(select(Podcast).filter(Podcast.id == podcast_id)) podcast = result.scalars().first() if not podcast: diff --git a/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py b/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py index 22d45af21..b33e25170 100644 --- a/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py +++ b/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py @@ -165,7 +165,9 @@ async def _check_and_trigger_schedules(): from app.utils.webcrawler_utils import parse_webcrawler_urls connector_config = connector.config or {} - urls = parse_webcrawler_urls(connector_config.get("INITIAL_URLS")) + urls = parse_webcrawler_urls( + connector_config.get("INITIAL_URLS") + ) if urls: task.delay( diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py index 5161fb569..6c4be0cb8 100644 --- a/surfsense_backend/app/tasks/document_processors/file_processors.py +++ b/surfsense_backend/app/tasks/document_processors/file_processors.py @@ -55,7 +55,9 @@ LLAMACLOUD_RETRYABLE_EXCEPTIONS = ( ) # Timeout calculation constants -UPLOAD_BYTES_PER_SECOND_SLOW = 100 * 1024 # 100 KB/s (conservative for slow connections) +UPLOAD_BYTES_PER_SECOND_SLOW = ( + 100 * 1024 +) # 100 KB/s (conservative for slow connections) MIN_UPLOAD_TIMEOUT = 120 # Minimum 2 minutes for any file MAX_UPLOAD_TIMEOUT = 1800 # Maximum 30 minutes for very large files BASE_JOB_TIMEOUT = 600 # 10 minutes base for job processing @@ -219,19 +221,19 @@ async def find_existing_document_with_migration( def calculate_upload_timeout(file_size_bytes: int) -> float: """ Calculate appropriate upload timeout based on file size. - + Assumes a conservative slow connection speed to handle worst-case scenarios. - + Args: file_size_bytes: Size of the file in bytes - + Returns: Timeout in seconds """ # Calculate time needed at slow connection speed # Add 50% buffer for network variability and SSL overhead estimated_time = (file_size_bytes / UPLOAD_BYTES_PER_SECOND_SLOW) * 1.5 - + # Clamp to reasonable bounds return max(MIN_UPLOAD_TIMEOUT, min(estimated_time, MAX_UPLOAD_TIMEOUT)) @@ -239,21 +241,21 @@ def calculate_upload_timeout(file_size_bytes: int) -> float: def calculate_job_timeout(estimated_pages: int, file_size_bytes: int) -> float: """ Calculate job processing timeout based on page count and file size. - + Args: estimated_pages: Estimated number of pages file_size_bytes: Size of the file in bytes - + Returns: Timeout in seconds """ # Base timeout + time per page page_based_timeout = BASE_JOB_TIMEOUT + (estimated_pages * PER_PAGE_JOB_TIMEOUT) - + # Also consider file size (large images take longer to process) # ~1 minute per 10MB of file size size_based_timeout = BASE_JOB_TIMEOUT + (file_size_bytes / (10 * 1024 * 1024)) * 60 - + # Use the larger of the two estimates return max(page_based_timeout, size_based_timeout) @@ -284,18 +286,18 @@ async def parse_with_llamacloud_retry( """ import os import random - + from llama_cloud_services import LlamaParse from llama_cloud_services.parse.utils import ResultType # Get file size for timeout calculations file_size_bytes = os.path.getsize(file_path) file_size_mb = file_size_bytes / (1024 * 1024) - + # Calculate dynamic timeouts based on file size and page count upload_timeout = calculate_upload_timeout(file_size_bytes) job_timeout = calculate_job_timeout(estimated_pages, file_size_bytes) - + # HTTP client timeouts - scaled based on file size # Write timeout is critical for large file uploads custom_timeout = httpx.Timeout( @@ -304,7 +306,7 @@ async def parse_with_llamacloud_retry( write=upload_timeout, # Dynamic based on file size (upload time) pool=120.0, # 2 minutes to acquire connection from pool ) - + logging.info( f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, " f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, " @@ -335,14 +337,14 @@ async def parse_with_llamacloud_retry( # Parse the file asynchronously result = await parser.aparse(file_path) - + # Success - log if we had previous failures if attempt > 1: logging.info( f"LlamaCloud upload succeeded on attempt {attempt} after " f"{len(attempt_errors)} failures" ) - + return result except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e: @@ -355,8 +357,7 @@ async def parse_with_llamacloud_retry( # Calculate exponential backoff with jitter # Base delay doubles each attempt, capped at max delay base_delay = min( - LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)), - LLAMACLOUD_MAX_DELAY + LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)), LLAMACLOUD_MAX_DELAY ) # Add random jitter (±25%) to prevent thundering herd jitter = base_delay * 0.25 * (2 * random.random() - 1) diff --git a/surfsense_backend/app/utils/webcrawler_utils.py b/surfsense_backend/app/utils/webcrawler_utils.py index 05633de4f..31d2ebe50 100644 --- a/surfsense_backend/app/utils/webcrawler_utils.py +++ b/surfsense_backend/app/utils/webcrawler_utils.py @@ -21,6 +21,8 @@ def parse_webcrawler_urls(initial_urls: str | list | None) -> list[str]: if isinstance(initial_urls, str): return [url.strip() for url in initial_urls.split("\n") if url.strip()] elif isinstance(initial_urls, list): - return [url.strip() for url in initial_urls if isinstance(url, str) and url.strip()] + return [ + url.strip() for url in initial_urls if isinstance(url, str) and url.strip() + ] else: return []