mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-10 20:35:17 +02:00
add user agent to AsyncChromiumLoader
This commit is contained in:
parent
c6cb754aac
commit
1f60d1c22f
3 changed files with 2924 additions and 2904 deletions
|
|
@@ -8,6 +8,7 @@ Provides a unified interface for web scraping.
|
|||
from typing import Any
|
||||
|
||||
import validators
|
||||
from fake_useragent import UserAgent
|
||||
from firecrawl import AsyncFirecrawlApp
|
||||
from langchain_community.document_loaders import AsyncChromiumLoader
|
||||
|
||||
|
|
@@ -121,7 +122,7 @@ class WebCrawlerConnector:
|
|||
|
||||
async def _crawl_with_chromium(self, url: str) -> dict[str, Any]:
|
||||
"""
|
||||
Crawl URL using AsyncChromiumLoader.
|
||||
Crawl URL using AsyncChromiumLoader with realistic User-Agent.
|
||||
|
||||
Args:
|
||||
url: URL to crawl
|
||||
|
|
@@ -132,7 +133,14 @@ class WebCrawlerConnector:
|
|||
Raises:
|
||||
Exception: If crawling fails
|
||||
"""
|
||||
crawl_loader = AsyncChromiumLoader(urls=[url], headless=True)
|
||||
# Generate a realistic User-Agent to avoid bot detection
|
||||
ua = UserAgent()
|
||||
user_agent = ua.random
|
||||
|
||||
# Pass User-Agent to AsyncChromiumLoader
|
||||
crawl_loader = AsyncChromiumLoader(
|
||||
urls=[url], headless=True, user_agent=user_agent
|
||||
)
|
||||
documents = await crawl_loader.aload()
|
||||
|
||||
if not documents:
|
||||
|
|
|
|||
|
|
@@ -51,6 +51,7 @@ dependencies = [
|
|||
"litellm>=1.80.10",
|
||||
"langchain-litellm>=0.3.5",
|
||||
"langgraph>=1.0.5",
|
||||
"fake-useragent>=2.2.0",
|
||||
]
|
||||
|
||||
[dependency-groups]
|
||||
|
|
|
|||
5815
surfsense_backend/uv.lock
generated
5815
surfsense_backend/uv.lock
generated
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue