mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-08 20:25:19 +02:00
Merge pull request #503 from MODSetter/dev
refactor(webcrawler): update scraping logic to use v2 API and improve error handling
This commit is contained in:
commit
49d42694f7
3 changed files with 14 additions and 16 deletions
|
|
@@ -93,21 +93,18 @@ class WebCrawlerConnector:
|
|||
if formats is None:
|
||||
formats = ["markdown"]
|
||||
|
||||
scrape_result = await firecrawl_app.scrape_url(url=url, formats=formats)
|
||||
# v2 API returns Document directly and raises an exception on failure
|
||||
scrape_result = await firecrawl_app.scrape(url, formats=formats)
|
||||
|
||||
if not scrape_result or not scrape_result.success:
|
||||
error_msg = (
|
||||
scrape_result.error
|
||||
if scrape_result and hasattr(scrape_result, "error")
|
||||
else "Unknown error"
|
||||
)
|
||||
raise ValueError(f"Firecrawl failed to scrape URL: {error_msg}")
|
||||
if not scrape_result:
|
||||
raise ValueError("Firecrawl returned no result")
|
||||
|
||||
# Extract content based on format
|
||||
content = scrape_result.markdown or scrape_result.html or ""
|
||||
|
||||
# Extract metadata
|
||||
metadata = scrape_result.metadata if scrape_result.metadata else {}
|
||||
# Extract metadata - v2 returns DocumentMetadata object
|
||||
metadata_obj = scrape_result.metadata
|
||||
metadata = metadata_obj.model_dump() if metadata_obj else {}
|
||||
|
||||
return {
|
||||
"content": content,
|
||||
|
|
@@ -116,7 +113,7 @@ class WebCrawlerConnector:
|
|||
"title": metadata.get("title", url),
|
||||
"description": metadata.get("description", ""),
|
||||
"language": metadata.get("language", ""),
|
||||
"sourceURL": metadata.get("sourceURL", url),
|
||||
"sourceURL": metadata.get("source_url", url),
|
||||
**metadata,
|
||||
},
|
||||
"crawler_type": "firecrawl",
|
||||
|
|
|
|||
|
|
@@ -11,7 +11,6 @@ dependencies = [
|
|||
"docling>=2.15.0",
|
||||
"fastapi>=0.115.8",
|
||||
"fastapi-users[oauth,sqlalchemy]>=14.0.1",
|
||||
"firecrawl-py>=1.12.0",
|
||||
"github3.py==4.0.1",
|
||||
"google-api-python-client>=2.156.0",
|
||||
"google-auth-oauthlib>=1.2.1",
|
||||
|
|
@@ -49,6 +48,7 @@ dependencies = [
|
|||
"flower>=2.0.1",
|
||||
"redis>=5.2.1",
|
||||
"chonkie[all]>=1.4.0",
|
||||
"firecrawl-py>=4.9.0",
|
||||
]
|
||||
|
||||
[dependency-groups]
|
||||
|
|
|
|||
9
surfsense_backend/uv.lock
generated
9
surfsense_backend/uv.lock
generated
|
|
@@ -1541,19 +1541,20 @@ wheels = [
|
|||
|
||||
[[package]]
|
||||
name = "firecrawl-py"
|
||||
version = "2.8.0"
|
||||
version = "4.9.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "aiohttp" },
|
||||
{ name = "httpx" },
|
||||
{ name = "nest-asyncio" },
|
||||
{ name = "pydantic" },
|
||||
{ name = "python-dotenv" },
|
||||
{ name = "requests" },
|
||||
{ name = "websockets" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/11/83/64127a0faafb027c2870c3919aae13fd6f8f8066d000bea93c880ab9772a/firecrawl_py-2.8.0.tar.gz", hash = "sha256:657795b6ddd63f0bd38b38bf0571187e0a66becda23d97c032801895257403c9", size = 37941 }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/a5/2e/e4112ebd229bc03202584f5ad2ece81c26cb2a7bad0cd4773b8705d996e9/firecrawl_py-4.9.0.tar.gz", hash = "sha256:8e5740ed923c89e6066dfd63b0449f049bbd274652dfac3d735c9ae0572c4b0c", size = 153395 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/74/e6/e69bd2156856f2b1849244ca3b1d993676175b16acbf704ad85580ebaa3c/firecrawl_py-2.8.0-py3-none-any.whl", hash = "sha256:f2e148086aa1ca42f603a56009577b4f66a2c23893eaa71f7c9c0082b4fdcf60", size = 173118 },
|
||||
{ url = "https://files.pythonhosted.org/packages/3a/cf/99848233303ca9c9d84cf22de08adc1051e8b6df672aeed14f32272df86b/firecrawl_py-4.9.0-py3-none-any.whl", hash = "sha256:adb027ed8bdda712201dc9727ead1a051dc3d114c2a0051de1f159c420703684", size = 190971 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@@ -5926,7 +5927,7 @@ requires-dist = [
|
|||
{ name = "fastapi", specifier = ">=0.115.8" },
|
||||
{ name = "fastapi-users", extras = ["oauth", "sqlalchemy"], specifier = ">=14.0.1" },
|
||||
{ name = "faster-whisper", specifier = ">=1.1.0" },
|
||||
{ name = "firecrawl-py", specifier = ">=1.12.0" },
|
||||
{ name = "firecrawl-py", specifier = ">=4.9.0" },
|
||||
{ name = "flower", specifier = ">=2.0.1" },
|
||||
{ name = "github3-py", specifier = "==4.0.1" },
|
||||
{ name = "google-api-python-client", specifier = ">=2.156.0" },
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue