mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-07-02 22:01:05 +02:00
refactor: enhance web crawling functionality with Firecrawl integration
- Updated WebCrawlerConnector to prioritize Firecrawl API for crawling if an API key is provided, falling back to Chromium if Firecrawl fails. - Improved error handling to log failures from both Firecrawl and Chromium. - Enhanced link preview tool to use a random User-Agent for better compatibility with web servers. - Passed Firecrawl API key to the stream_new_chat function for improved configuration management.
This commit is contained in:
parent
ad5a49c2c6
commit
d9df63f57e
3 changed files with 41 additions and 26 deletions
|
|
@ -280,15 +280,18 @@ def create_link_preview_tool():
|
|||
url = f"https://{url}"
|
||||
|
||||
try:
|
||||
# Generate a random User-Agent to avoid bot detection
|
||||
ua = UserAgent()
|
||||
user_agent = ua.random
|
||||
|
||||
# Use a browser-like User-Agent to fetch Open Graph metadata.
|
||||
# This is the same approach used by Slack, Discord, Twitter, etc. for link previews.
|
||||
# We're only fetching publicly available metadata (title, description, thumbnail)
|
||||
# that websites intentionally expose via OG tags for link preview purposes.
|
||||
async with httpx.AsyncClient(
|
||||
timeout=10.0,
|
||||
follow_redirects=True,
|
||||
headers={
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
"User-Agent": user_agent,
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.9",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue