From 0b1ca97acfd3461cff9699252d45bc067355d66c Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Wed, 26 Nov 2025 14:30:08 -0800 Subject: [PATCH] refactor(webcrawler): update scraping logic to use v2 API and improve error handling --- .../app/connectors/webcrawler_connector.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/surfsense_backend/app/connectors/webcrawler_connector.py b/surfsense_backend/app/connectors/webcrawler_connector.py index 9bb72f1ce..edd7f8800 100644 --- a/surfsense_backend/app/connectors/webcrawler_connector.py +++ b/surfsense_backend/app/connectors/webcrawler_connector.py @@ -93,21 +93,18 @@ class WebCrawlerConnector: if formats is None: formats = ["markdown"] - scrape_result = await firecrawl_app.scrape_url(url=url, formats=formats) + # v2 API returns Document directly and raises an exception on failure + scrape_result = await firecrawl_app.scrape(url, formats=formats) - if not scrape_result or not scrape_result.success: - error_msg = ( - scrape_result.error - if scrape_result and hasattr(scrape_result, "error") - else "Unknown error" - ) - raise ValueError(f"Firecrawl failed to scrape URL: {error_msg}") + if not scrape_result: + raise ValueError("Firecrawl returned no result") # Extract content based on format content = scrape_result.markdown or scrape_result.html or "" - # Extract metadata - metadata = scrape_result.metadata if scrape_result.metadata else {} + # Extract metadata - v2 returns DocumentMetadata object + metadata_obj = scrape_result.metadata + metadata = metadata_obj.model_dump() if metadata_obj else {} return { "content": content, @@ -116,7 +113,7 @@ class WebCrawlerConnector: "title": metadata.get("title", url), "description": metadata.get("description", ""), "language": metadata.get("language", ""), - "sourceURL": metadata.get("sourceURL", url), + "sourceURL": metadata.get("source_url", url), **metadata, }, "crawler_type": "firecrawl",