feat: Implement LLM configuration validation in create and update routes

- Added `validate_llm_config` function to `llm_service.py` for validating LLM configurations via test API calls (see the sketch after this summary).
- Integrated validation in `create_llm_config` and `update_llm_config` routes in `llm_config_routes.py`, raising HTTP exceptions for invalid configurations.
- Enhanced error handling to provide detailed feedback on configuration issues.
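The `llm_service.py` and `llm_config_routes.py` hunks are not among those shown below, so the following is a hypothetical sketch of the described flow, not the committed code. It assumes a LiteLLM-style test completion and an `LLMConfig` object with `provider`, `model_name`, `api_key`, and `api_base` attributes; none of these names are confirmed by this diff.

    # Hypothetical sketch of the validation flow described above; not the
    # committed code. Assumes a LiteLLM-style test call and a config object
    # with provider / model_name / api_key / api_base attributes.
    import litellm
    from fastapi import HTTPException


    async def validate_llm_config(config) -> tuple[bool, str | None]:
        """Validate an LLM configuration by issuing a minimal test completion."""
        try:
            await litellm.acompletion(
                model=f"{config.provider}/{config.model_name}",
                messages=[{"role": "user", "content": "ping"}],
                api_key=config.api_key,
                api_base=config.api_base,
                max_tokens=1,
            )
            return True, None
        except Exception as exc:
            # Surface the provider error so the route can report why it failed
            return False, str(exc)


    # In create_llm_config / update_llm_config the result would gate the save:
    async def reject_if_invalid(config) -> None:
        is_valid, error = await validate_llm_config(config)
        if not is_valid:
            raise HTTPException(
                status_code=400, detail=f"Invalid LLM configuration: {error}"
            )

Probing with a one-token completion keeps the validation call cheap while still exercising the credentials, endpoint, and model name end to end.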
author DESKTOP-RTLN3BA\$punk 2025-11-05 12:15:05 -08:00
parent 666dba7150
commit 9466bf595c

9 changed files with 235 additions and 52 deletions


@@ -898,7 +898,7 @@ async def process_file_in_background(
             # Suppress both Python warnings and logging warnings from pdfminer
             pdfminer_logger = getLogger("pdfminer")
             original_level = pdfminer_logger.level
             with warnings.catch_warnings():
                 warnings.filterwarnings(
                     "ignore", category=UserWarning, module="pdfminer"
@@ -907,16 +907,16 @@ async def process_file_in_background(
                     "ignore",
                     message=".*Cannot set gray non-stroke color.*",
                 )
-                warnings.filterwarnings(
-                    "ignore", message=".*invalid float value.*"
-                )
+                warnings.filterwarnings("ignore", message=".*invalid float value.*")
                 # Temporarily suppress pdfminer logging warnings
                 pdfminer_logger.setLevel(ERROR)
                 try:
                     # Process the document
-                    result = await docling_service.process_document(file_path, filename)
+                    result = await docling_service.process_document(
+                        file_path, filename
+                    )
                 finally:
                     # Restore original logging level
                     pdfminer_logger.setLevel(original_level)
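The hunk above only reflows this code, but the suppression pattern is worth isolating. A minimal standalone sketch of the same pattern; `quiet_pdfminer_call` is a hypothetical helper name, not part of the commit:

    # Standalone sketch of the suppression pattern shown in the hunk above.
    import warnings
    from logging import ERROR, getLogger


    def quiet_pdfminer_call(fn, *args, **kwargs):
        """Run fn with pdfminer warnings and log output suppressed, then restore."""
        pdfminer_logger = getLogger("pdfminer")
        original_level = pdfminer_logger.level
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=UserWarning, module="pdfminer")
            warnings.filterwarnings("ignore", message=".*invalid float value.*")
            pdfminer_logger.setLevel(ERROR)  # mute log records, not just warnings
            try:
                return fn(*args, **kwargs)
            finally:
                pdfminer_logger.setLevel(original_level)  # always restore

The try/finally matters: `warnings.catch_warnings()` restores the filter state automatically, but the logger level change would otherwise leak past the call.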


@@ -73,7 +73,7 @@ async def add_crawled_url_document(
         )
         use_firecrawl = bool(config.FIRECRAWL_API_KEY)
         if use_firecrawl:
             # Use Firecrawl SDK directly
             firecrawl_app = AsyncFirecrawlApp(api_key=config.FIRECRAWL_API_KEY)
@@ -84,40 +84,50 @@ async def add_crawled_url_document(
         await task_logger.log_task_progress(
             log_entry,
             f"Crawling URL content: {url}",
-            {"stage": "crawling", "crawler_type": "AsyncFirecrawlApp" if use_firecrawl else "AsyncChromiumLoader"},
+            {
+                "stage": "crawling",
+                "crawler_type": "AsyncFirecrawlApp"
+                if use_firecrawl
+                else "AsyncChromiumLoader",
+            },
         )
         if use_firecrawl:
             # Use async Firecrawl SDK with v1 API - properly awaited
             scrape_result = await firecrawl_app.scrape_url(
-                url=url,
-                formats=['markdown']
+                url=url, formats=["markdown"]
             )
             # scrape_result is a Pydantic ScrapeResponse object
             # Access attributes directly
             if scrape_result and scrape_result.success:
                 # Extract markdown content
-                markdown_content = scrape_result.markdown or ''
+                markdown_content = scrape_result.markdown or ""
                 # Extract metadata - this is a DICT
                 metadata = scrape_result.metadata if scrape_result.metadata else {}
                 # Convert to LangChain Document format
-                url_crawled = [LangchainDocument(
-                    page_content=markdown_content,
-                    metadata={
-                        'source': url,
-                        'title': metadata.get('title', url),
-                        'description': metadata.get('description', ''),
-                        'language': metadata.get('language', ''),
-                        'sourceURL': metadata.get('sourceURL', url),
-                        **metadata # Include all other metadata fields
-                    }
-                )]
+                url_crawled = [
+                    LangchainDocument(
+                        page_content=markdown_content,
+                        metadata={
+                            "source": url,
+                            "title": metadata.get("title", url),
+                            "description": metadata.get("description", ""),
+                            "language": metadata.get("language", ""),
+                            "sourceURL": metadata.get("sourceURL", url),
+                            **metadata,  # Include all other metadata fields
+                        },
+                    )
+                ]
                 content_in_markdown = url_crawled[0].page_content
             else:
-                error_msg = scrape_result.error if scrape_result and hasattr(scrape_result, 'error') else "Unknown error"
+                error_msg = (
+                    scrape_result.error
+                    if scrape_result and hasattr(scrape_result, "error")
+                    else "Unknown error"
+                )
                 raise ValueError(f"Firecrawl failed to scrape URL: {error_msg}")
         else:
             # Use AsyncChromiumLoader as fallback
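The fallback branch is truncated in this diff. For orientation only, one plausible shape of an AsyncChromiumLoader fallback; this is an assumption about the elided code, and `crawl_without_firecrawl` is a hypothetical name:

    # Hypothetical sketch of the truncated fallback branch. Assumes the common
    # langchain_community pairing of AsyncChromiumLoader (raw HTML via headless
    # Chromium/Playwright) with Html2TextTransformer (requires html2text).
    from langchain_community.document_loaders import AsyncChromiumLoader
    from langchain_community.document_transformers import Html2TextTransformer


    async def crawl_without_firecrawl(url: str):
        loader = AsyncChromiumLoader([url])
        html_docs = await loader.aload()  # aload() comes from the BaseLoader interface
        # Strip markup so downstream chunking sees plain text
        return Html2TextTransformer().transform_documents(html_docs)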
@@ -249,7 +259,9 @@ async def add_crawled_url_document(
             {"stage": "document_update", "chunks_count": len(chunks)},
         )
-        existing_document.title = url_crawled[0].metadata.get('title', url_crawled[0].metadata.get('source', url))
+        existing_document.title = url_crawled[0].metadata.get(
+            "title", url_crawled[0].metadata.get("source", url)
+        )
         existing_document.content = summary_content
         existing_document.content_hash = content_hash
         existing_document.embedding = summary_embedding
@@ -267,7 +279,9 @@ async def add_crawled_url_document(
         document = Document(
             search_space_id=search_space_id,
-            title=url_crawled[0].metadata.get('title', url_crawled[0].metadata.get('source', url)),
+            title=url_crawled[0].metadata.get(
+                "title", url_crawled[0].metadata.get("source", url)
+            ),
             document_type=DocumentType.CRAWLED_URL,
             document_metadata=url_crawled[0].metadata,
             content=summary_content,