mirror of https://github.com/MODSetter/SurfSense.git
synced 2026-04-30 11:26:24 +02:00
feat: Implement LLM configuration validation in create and update routes
- Added `validate_llm_config` function to `llm_service.py` for validating LLM configurations via test API calls.
- Integrated validation in `create_llm_config` and `update_llm_config` routes in `llm_config_routes.py`, raising HTTP exceptions for invalid configurations.
- Enhanced error handling to provide detailed feedback on configuration issues.
parent 666dba7150
commit 9466bf595c
9 changed files with 235 additions and 52 deletions
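Note: the hunks rendered below show only formatting cleanups in the background file-processing and URL-crawling tasks; the validation changes described in the commit message live in `llm_service.py` and `llm_config_routes.py`, which are not among the rendered hunks. A minimal sketch of what the described flow could look like, assuming a FastAPI route and a LiteLLM-style test completion (the function body, parameter names, and `litellm` usage are assumptions, not code from this commit):

```python
# Hypothetical sketch of validate_llm_config as described in the commit
# message; the internals (LiteLLM test call) are assumed, not from the diff.
import litellm
from fastapi import HTTPException


async def validate_llm_config(
    provider: str,
    model_name: str,
    api_key: str,
    api_base: str | None = None,
) -> tuple[bool, str | None]:
    """Validate an LLM configuration by issuing a cheap test API call."""
    try:
        await litellm.acompletion(
            model=f"{provider}/{model_name}",
            messages=[{"role": "user", "content": "ping"}],
            api_key=api_key,
            api_base=api_base,
            max_tokens=1,
        )
        return True, None
    except Exception as exc:
        # Surface the provider's error message as detailed feedback
        return False, str(exc)


# Hypothetical route-side integration (simplified, decorator omitted):
# reject invalid configs with an HTTP error instead of persisting them.
async def create_llm_config(config):  # config: request model (assumed)
    is_valid, error = await validate_llm_config(
        config.provider, config.model_name, config.api_key, config.api_base
    )
    if not is_valid:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid LLM configuration: {error}",
        )
    ...  # persist the config as before
```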
```diff
@@ -898,7 +898,7 @@ async def process_file_in_background(
         # Suppress both Python warnings and logging warnings from pdfminer
         pdfminer_logger = getLogger("pdfminer")
         original_level = pdfminer_logger.level
-
+
         with warnings.catch_warnings():
             warnings.filterwarnings(
                 "ignore", category=UserWarning, module="pdfminer"
@@ -907,16 +907,16 @@ async def process_file_in_background(
                 "ignore",
                 message=".*Cannot set gray non-stroke color.*",
             )
-            warnings.filterwarnings(
-                "ignore", message=".*invalid float value.*"
-            )
-
+            warnings.filterwarnings("ignore", message=".*invalid float value.*")
+
             # Temporarily suppress pdfminer logging warnings
             pdfminer_logger.setLevel(ERROR)
-
+
             try:
                 # Process the document
-                result = await docling_service.process_document(file_path, filename)
+                result = await docling_service.process_document(
+                    file_path, filename
+                )
             finally:
                 # Restore original logging level
                 pdfminer_logger.setLevel(original_level)
```
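A standalone sketch of the suppress-and-restore pattern in the hunks above (`run_quietly` is a hypothetical wrapper, not a function in this repo):

```python
# Minimal sketch of the pattern above: silence pdfminer's warnings and
# log output only while processing, then always restore the old level.
import warnings
from logging import ERROR, getLogger


async def run_quietly(process_document, file_path, filename):
    pdfminer_logger = getLogger("pdfminer")
    original_level = pdfminer_logger.level

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning, module="pdfminer")
        warnings.filterwarnings("ignore", message=".*Cannot set gray non-stroke color.*")
        warnings.filterwarnings("ignore", message=".*invalid float value.*")

        # Raise the logger threshold so WARNING records are dropped
        pdfminer_logger.setLevel(ERROR)
        try:
            return await process_document(file_path, filename)
        finally:
            # Restore in a finally block so a processing error cannot
            # leave pdfminer permanently silenced
            pdfminer_logger.setLevel(original_level)
```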
```diff
@@ -73,7 +73,7 @@ async def add_crawled_url_document(
         )
 
         use_firecrawl = bool(config.FIRECRAWL_API_KEY)
-
+
         if use_firecrawl:
             # Use Firecrawl SDK directly
             firecrawl_app = AsyncFirecrawlApp(api_key=config.FIRECRAWL_API_KEY)
```
```diff
@@ -84,40 +84,50 @@ async def add_crawled_url_document(
         await task_logger.log_task_progress(
             log_entry,
             f"Crawling URL content: {url}",
-            {"stage": "crawling", "crawler_type": "AsyncFirecrawlApp" if use_firecrawl else "AsyncChromiumLoader"},
+            {
+                "stage": "crawling",
+                "crawler_type": "AsyncFirecrawlApp"
+                if use_firecrawl
+                else "AsyncChromiumLoader",
+            },
         )
 
         if use_firecrawl:
             # Use async Firecrawl SDK with v1 API - properly awaited
             scrape_result = await firecrawl_app.scrape_url(
-                url=url,
-                formats=['markdown']
+                url=url, formats=["markdown"]
             )
-
+
             # scrape_result is a Pydantic ScrapeResponse object
             # Access attributes directly
             if scrape_result and scrape_result.success:
                 # Extract markdown content
-                markdown_content = scrape_result.markdown or ''
-
+                markdown_content = scrape_result.markdown or ""
+
                 # Extract metadata - this is a DICT
                 metadata = scrape_result.metadata if scrape_result.metadata else {}
-
+
                 # Convert to LangChain Document format
-                url_crawled = [LangchainDocument(
-                    page_content=markdown_content,
-                    metadata={
-                        'source': url,
-                        'title': metadata.get('title', url),
-                        'description': metadata.get('description', ''),
-                        'language': metadata.get('language', ''),
-                        'sourceURL': metadata.get('sourceURL', url),
-                        **metadata # Include all other metadata fields
-                    }
-                )]
+                url_crawled = [
+                    LangchainDocument(
+                        page_content=markdown_content,
+                        metadata={
+                            "source": url,
+                            "title": metadata.get("title", url),
+                            "description": metadata.get("description", ""),
+                            "language": metadata.get("language", ""),
+                            "sourceURL": metadata.get("sourceURL", url),
+                            **metadata,  # Include all other metadata fields
+                        },
+                    )
+                ]
                 content_in_markdown = url_crawled[0].page_content
             else:
-                error_msg = scrape_result.error if scrape_result and hasattr(scrape_result, 'error') else "Unknown error"
+                error_msg = (
+                    scrape_result.error
+                    if scrape_result and hasattr(scrape_result, "error")
+                    else "Unknown error"
+                )
                 raise ValueError(f"Firecrawl failed to scrape URL: {error_msg}")
         else:
             # Use AsyncChromiumLoader as fallback
```
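One detail in the hunk above: because `**metadata` is spread after the explicit keys, provider-supplied fields override the defaults. A small self-contained illustration (sample values, not from the commit):

```python
# Dict-literal precedence in the metadata block above: keys from
# **metadata override the explicit defaults that precede them.
url = "https://example.com"
metadata = {"title": "Example Domain", "statusCode": 200}  # sample values

merged = {
    "source": url,
    "title": metadata.get("title", url),  # fallback if no title scraped
    **metadata,  # spread last, so a scraped "title" wins
}
assert merged["title"] == "Example Domain"
assert merged["source"] == url  # "source" is not in metadata, so it survives
```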
```diff
@@ -249,7 +259,9 @@ async def add_crawled_url_document(
             {"stage": "document_update", "chunks_count": len(chunks)},
         )
 
-        existing_document.title = url_crawled[0].metadata.get('title', url_crawled[0].metadata.get('source', url))
+        existing_document.title = url_crawled[0].metadata.get(
+            "title", url_crawled[0].metadata.get("source", url)
+        )
         existing_document.content = summary_content
         existing_document.content_hash = content_hash
         existing_document.embedding = summary_embedding
```
```diff
@@ -267,7 +279,9 @@ async def add_crawled_url_document(
 
         document = Document(
             search_space_id=search_space_id,
-            title=url_crawled[0].metadata.get('title', url_crawled[0].metadata.get('source', url)),
+            title=url_crawled[0].metadata.get(
+                "title", url_crawled[0].metadata.get("source", url)
+            ),
             document_type=DocumentType.CRAWLED_URL,
             document_metadata=url_crawled[0].metadata,
             content=summary_content,
```
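Both hunks above reformat the same nested-`get` title fallback. A quick illustration of its three-step resolution (sample values, not from the commit):

```python
# The nested .get() reformatted above resolves a title in three steps:
# scraped title -> "source" from metadata -> the requested url itself.
url = "https://example.com/page"
meta = {"source": "https://example.com"}  # no "title" key in this sample

title = meta.get("title", meta.get("source", url))
assert title == "https://example.com"  # falls back to "source"
assert {}.get("title", {}.get("source", url)) == url  # falls back to url
```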