mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-08 20:25:19 +02:00
refactor(webcrawler): update scraping logic to use v2 API and improve error handling
This commit is contained in:
parent
5742f83ca8
commit
0b1ca97acf
1 changed file with 8 additions and 11 deletions
|
|
@@ -93,21 +93,18 @@ class WebCrawlerConnector:
|
|||
if formats is None:
|
||||
formats = ["markdown"]
|
||||
|
||||
scrape_result = await firecrawl_app.scrape_url(url=url, formats=formats)
|
||||
# v2 API returns Document directly and raises an exception on failure
|
||||
scrape_result = await firecrawl_app.scrape(url, formats=formats)
|
||||
|
||||
if not scrape_result or not scrape_result.success:
|
||||
error_msg = (
|
||||
scrape_result.error
|
||||
if scrape_result and hasattr(scrape_result, "error")
|
||||
else "Unknown error"
|
||||
)
|
||||
raise ValueError(f"Firecrawl failed to scrape URL: {error_msg}")
|
||||
if not scrape_result:
|
||||
raise ValueError("Firecrawl returned no result")
|
||||
|
||||
# Extract content based on format
|
||||
content = scrape_result.markdown or scrape_result.html or ""
|
||||
|
||||
# Extract metadata
|
||||
metadata = scrape_result.metadata if scrape_result.metadata else {}
|
||||
# Extract metadata - v2 returns DocumentMetadata object
|
||||
metadata_obj = scrape_result.metadata
|
||||
metadata = metadata_obj.model_dump() if metadata_obj else {}
|
||||
|
||||
return {
|
||||
"content": content,
|
||||
|
|
@@ -116,7 +113,7 @@ class WebCrawlerConnector:
|
|||
"title": metadata.get("title", url),
|
||||
"description": metadata.get("description", ""),
|
||||
"language": metadata.get("language", ""),
|
||||
"sourceURL": metadata.get("sourceURL", url),
|
||||
"sourceURL": metadata.get("source_url", url),
|
||||
**metadata,
|
||||
},
|
||||
"crawler_type": "firecrawl",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue