Add User-Agent to AsyncChromiumLoader

This commit is contained in:
CREDO23 2025-12-17 19:43:54 +02:00
parent c6cb754aac
commit 1f60d1c22f
3 changed files with 2924 additions and 2904 deletions

View file

@ -8,6 +8,7 @@ Provides a unified interface for web scraping.
from typing import Any
import validators
from fake_useragent import UserAgent
from firecrawl import AsyncFirecrawlApp
from langchain_community.document_loaders import AsyncChromiumLoader
@ -121,7 +122,7 @@ class WebCrawlerConnector:
async def _crawl_with_chromium(self, url: str) -> dict[str, Any]:
"""
Crawl URL using AsyncChromiumLoader.
Crawl URL using AsyncChromiumLoader with realistic User-Agent.
Args:
url: URL to crawl
@ -132,7 +133,14 @@ class WebCrawlerConnector:
Raises:
Exception: If crawling fails
"""
crawl_loader = AsyncChromiumLoader(urls=[url], headless=True)
# Generate a realistic User-Agent to avoid bot detection
ua = UserAgent()
user_agent = ua.random
# Pass User-Agent to AsyncChromiumLoader
crawl_loader = AsyncChromiumLoader(
urls=[url], headless=True, user_agent=user_agent
)
documents = await crawl_loader.aload()
if not documents:

View file

@ -51,6 +51,7 @@ dependencies = [
"litellm>=1.80.10",
"langchain-litellm>=0.3.5",
"langgraph>=1.0.5",
"fake-useragent>=2.2.0",
]
[dependency-groups]

5815
surfsense_backend/uv.lock generated

File diff suppressed because it is too large. Load diff