mirror of https://github.com/MODSetter/SurfSense.git
synced 2026-04-30 11:26:24 +02:00
feat: Implement LLM configuration validation in create and update routes
- Added `validate_llm_config` function to `llm_service.py` for validating LLM configurations via test API calls.
- Integrated validation in `create_llm_config` and `update_llm_config` routes in `llm_config_routes.py`, raising HTTP exceptions for invalid configurations.
- Enhanced error handling to provide detailed feedback on configuration issues.
parent 666dba7150
commit 9466bf595c
9 changed files with 235 additions and 52 deletions
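Note: the hunks rendered below show only formatting cleanups in the background file-processing and URL-crawling tasks; the validation changes described in the commit message live in `llm_service.py` and `llm_config_routes.py`, which are not among the rendered hunks. A minimal sketch of what the described flow could look like, assuming a FastAPI route and a LiteLLM-style test completion (the function body, parameter names, and `litellm` usage are assumptions, not code from this commit):

```python
# Hypothetical sketch of validate_llm_config as described in the commit
# message; the internals (LiteLLM test call) are assumed, not from the diff.
import litellm
from fastapi import HTTPException


async def validate_llm_config(
    provider: str,
    model_name: str,
    api_key: str,
    api_base: str | None = None,
) -> tuple[bool, str | None]:
    """Validate an LLM configuration by issuing a cheap test API call."""
    try:
        await litellm.acompletion(
            model=f"{provider}/{model_name}",
            messages=[{"role": "user", "content": "ping"}],
            api_key=api_key,
            api_base=api_base,
            max_tokens=1,
        )
        return True, None
    except Exception as exc:
        # Surface the provider's error message as detailed feedback
        return False, str(exc)


# Hypothetical route-side integration (simplified, decorator omitted):
# reject invalid configs with an HTTP error instead of persisting them.
async def create_llm_config(config):  # config: request model (assumed)
    is_valid, error = await validate_llm_config(
        config.provider, config.model_name, config.api_key, config.api_base
    )
    if not is_valid:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid LLM configuration: {error}",
        )
    ...  # persist the config as before
```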
```diff
@@ -898,7 +898,7 @@ async def process_file_in_background(
         # Suppress both Python warnings and logging warnings from pdfminer
         pdfminer_logger = getLogger("pdfminer")
         original_level = pdfminer_logger.level
-
+
         with warnings.catch_warnings():
             warnings.filterwarnings(
                 "ignore", category=UserWarning, module="pdfminer"
@@ -907,16 +907,16 @@ async def process_file_in_background(
                 "ignore",
                 message=".*Cannot set gray non-stroke color.*",
             )
-            warnings.filterwarnings(
-                "ignore", message=".*invalid float value.*"
-            )
-
+            warnings.filterwarnings("ignore", message=".*invalid float value.*")
+
             # Temporarily suppress pdfminer logging warnings
             pdfminer_logger.setLevel(ERROR)
-
+
             try:
                 # Process the document
-                result = await docling_service.process_document(file_path, filename)
+                result = await docling_service.process_document(
+                    file_path, filename
+                )
             finally:
                 # Restore original logging level
                 pdfminer_logger.setLevel(original_level)
```
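A standalone sketch of the suppress-and-restore pattern in the hunks above (`run_quietly` is a hypothetical wrapper, not a function in this repo):

```python
# Minimal sketch of the pattern above: silence pdfminer's warnings and
# log output only while processing, then always restore the old level.
import warnings
from logging import ERROR, getLogger


async def run_quietly(process_document, file_path, filename):
    pdfminer_logger = getLogger("pdfminer")
    original_level = pdfminer_logger.level

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning, module="pdfminer")
        warnings.filterwarnings("ignore", message=".*Cannot set gray non-stroke color.*")
        warnings.filterwarnings("ignore", message=".*invalid float value.*")

        # Raise the logger threshold so WARNING records are dropped
        pdfminer_logger.setLevel(ERROR)
        try:
            return await process_document(file_path, filename)
        finally:
            # Restore in a finally block so a processing error cannot
            # leave pdfminer permanently silenced
            pdfminer_logger.setLevel(original_level)
```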
```diff
@@ -73,7 +73,7 @@ async def add_crawled_url_document(
         )
 
         use_firecrawl = bool(config.FIRECRAWL_API_KEY)
-
+
         if use_firecrawl:
             # Use Firecrawl SDK directly
             firecrawl_app = AsyncFirecrawlApp(api_key=config.FIRECRAWL_API_KEY)
```
```diff
@@ -84,40 +84,50 @@ async def add_crawled_url_document(
         await task_logger.log_task_progress(
             log_entry,
             f"Crawling URL content: {url}",
-            {"stage": "crawling", "crawler_type": "AsyncFirecrawlApp" if use_firecrawl else "AsyncChromiumLoader"},
+            {
+                "stage": "crawling",
+                "crawler_type": "AsyncFirecrawlApp"
+                if use_firecrawl
+                else "AsyncChromiumLoader",
+            },
         )
 
         if use_firecrawl:
             # Use async Firecrawl SDK with v1 API - properly awaited
             scrape_result = await firecrawl_app.scrape_url(
-                url=url,
-                formats=['markdown']
+                url=url, formats=["markdown"]
             )
-
+
             # scrape_result is a Pydantic ScrapeResponse object
             # Access attributes directly
             if scrape_result and scrape_result.success:
                 # Extract markdown content
-                markdown_content = scrape_result.markdown or ''
-
+                markdown_content = scrape_result.markdown or ""
+
                 # Extract metadata - this is a DICT
                 metadata = scrape_result.metadata if scrape_result.metadata else {}
-
+
                 # Convert to LangChain Document format
-                url_crawled = [LangchainDocument(
-                    page_content=markdown_content,
-                    metadata={
-                        'source': url,
-                        'title': metadata.get('title', url),
-                        'description': metadata.get('description', ''),
-                        'language': metadata.get('language', ''),
-                        'sourceURL': metadata.get('sourceURL', url),
-                        **metadata # Include all other metadata fields
-                    }
-                )]
+                url_crawled = [
+                    LangchainDocument(
+                        page_content=markdown_content,
+                        metadata={
+                            "source": url,
+                            "title": metadata.get("title", url),
+                            "description": metadata.get("description", ""),
+                            "language": metadata.get("language", ""),
+                            "sourceURL": metadata.get("sourceURL", url),
+                            **metadata,  # Include all other metadata fields
+                        },
+                    )
+                ]
                 content_in_markdown = url_crawled[0].page_content
             else:
-                error_msg = scrape_result.error if scrape_result and hasattr(scrape_result, 'error') else "Unknown error"
+                error_msg = (
+                    scrape_result.error
+                    if scrape_result and hasattr(scrape_result, "error")
+                    else "Unknown error"
+                )
                 raise ValueError(f"Firecrawl failed to scrape URL: {error_msg}")
         else:
             # Use AsyncChromiumLoader as fallback
```
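One detail in the hunk above: because `**metadata` is spread after the explicit keys, provider-supplied fields override the defaults. A small self-contained illustration (sample values, not from the commit):

```python
# Dict-literal precedence in the metadata block above: keys from
# **metadata override the explicit defaults that precede them.
url = "https://example.com"
metadata = {"title": "Example Domain", "statusCode": 200}  # sample values

merged = {
    "source": url,
    "title": metadata.get("title", url),  # fallback if no title scraped
    **metadata,  # spread last, so a scraped "title" wins
}
assert merged["title"] == "Example Domain"
assert merged["source"] == url  # "source" is not in metadata, so it survives
```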
```diff
@@ -249,7 +259,9 @@ async def add_crawled_url_document(
             {"stage": "document_update", "chunks_count": len(chunks)},
         )
 
-        existing_document.title = url_crawled[0].metadata.get('title', url_crawled[0].metadata.get('source', url))
+        existing_document.title = url_crawled[0].metadata.get(
+            "title", url_crawled[0].metadata.get("source", url)
+        )
         existing_document.content = summary_content
         existing_document.content_hash = content_hash
         existing_document.embedding = summary_embedding
```
```diff
@@ -267,7 +279,9 @@ async def add_crawled_url_document(
 
         document = Document(
             search_space_id=search_space_id,
-            title=url_crawled[0].metadata.get('title', url_crawled[0].metadata.get('source', url)),
+            title=url_crawled[0].metadata.get(
+                "title", url_crawled[0].metadata.get("source", url)
+            ),
             document_type=DocumentType.CRAWLED_URL,
             document_metadata=url_crawled[0].metadata,
             content=summary_content,
```
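Both hunks above reformat the same nested-`get` title fallback. A quick illustration of its three-step resolution (sample values, not from the commit):

```python
# The nested .get() reformatted above resolves a title in three steps:
# scraped title -> "source" from metadata -> the requested url itself.
url = "https://example.com/page"
meta = {"source": "https://example.com"}  # no "title" key in this sample

title = meta.get("title", meta.get("source", url))
assert title == "https://example.com"  # falls back to "source"
assert {}.get("title", {}.get("source", url)) == url  # falls back to url
```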