refactor: remove link_preview tool and associated components to streamline agent functionality
This commit is contained in:
  parent 6c507989d2
  commit a009cae62a

16 changed files with 5 additions and 1202 deletions
@@ -10,7 +10,6 @@ Available tools:
 - generate_podcast: Generate audio podcasts from content
 - generate_video_presentation: Generate video presentations with slides and narration
 - generate_image: Generate images from text descriptions using AI models
-- link_preview: Fetch rich previews for URLs
 - scrape_webpage: Extract content from webpages
 - save_memory: Store facts/preferences about the user
 - recall_memory: Retrieve relevant user memories
|
@ -25,7 +24,6 @@ from .knowledge_base import (
|
|||
format_documents_for_context,
|
||||
search_knowledge_base_async,
|
||||
)
|
||||
from .link_preview import create_link_preview_tool
|
||||
from .podcast import create_generate_podcast_tool
|
||||
from .registry import (
|
||||
BUILTIN_TOOLS,
|
||||
|
|
@@ -51,7 +49,6 @@ __all__ = [
     "create_generate_image_tool",
     "create_generate_podcast_tool",
     "create_generate_video_presentation_tool",
-    "create_link_preview_tool",
     "create_recall_memory_tool",
     "create_save_memory_tool",
     "create_scrape_webpage_tool",
@@ -1,465 +0,0 @@
"""
Link preview tool for the SurfSense agent.

This module provides a tool for fetching URL metadata (title, description,
Open Graph image, etc.) to display rich link previews in the chat UI.
"""

import asyncio
import hashlib
import logging
import re
from typing import Any
from urllib.parse import urlparse

import httpx
import trafilatura
from fake_useragent import UserAgent
from langchain_core.tools import tool
from playwright.sync_api import sync_playwright

from app.utils.proxy_config import get_playwright_proxy, get_residential_proxy_url

logger = logging.getLogger(__name__)


def extract_domain(url: str) -> str:
    """Extract the domain from a URL."""
    try:
        parsed = urlparse(url)
        domain = parsed.netloc
        # Remove 'www.' prefix if present
        if domain.startswith("www."):
            domain = domain[4:]
        return domain
    except Exception:
        return ""


def extract_og_content(html: str, property_name: str) -> str | None:
    """Extract Open Graph meta content from HTML."""
    # Try og:property first
    pattern = rf'<meta[^>]+property=["\']og:{property_name}["\'][^>]+content=["\']([^"\']+)["\']'
    match = re.search(pattern, html, re.IGNORECASE)
    if match:
        return match.group(1)

    # Try content before property
    pattern = rf'<meta[^>]+content=["\']([^"\']+)["\'][^>]+property=["\']og:{property_name}["\']'
    match = re.search(pattern, html, re.IGNORECASE)
    if match:
        return match.group(1)

    return None


def extract_twitter_content(html: str, name: str) -> str | None:
    """Extract Twitter Card meta content from HTML."""
    pattern = (
        rf'<meta[^>]+name=["\']twitter:{name}["\'][^>]+content=["\']([^"\']+)["\']'
    )
    match = re.search(pattern, html, re.IGNORECASE)
    if match:
        return match.group(1)

    # Try content before name
    pattern = (
        rf'<meta[^>]+content=["\']([^"\']+)["\'][^>]+name=["\']twitter:{name}["\']'
    )
    match = re.search(pattern, html, re.IGNORECASE)
    if match:
        return match.group(1)

    return None


def extract_meta_description(html: str) -> str | None:
    """Extract meta description from HTML."""
    pattern = r'<meta[^>]+name=["\']description["\'][^>]+content=["\']([^"\']+)["\']'
    match = re.search(pattern, html, re.IGNORECASE)
    if match:
        return match.group(1)

    # Try content before name
    pattern = r'<meta[^>]+content=["\']([^"\']+)["\'][^>]+name=["\']description["\']'
    match = re.search(pattern, html, re.IGNORECASE)
    if match:
        return match.group(1)

    return None


def extract_title(html: str) -> str | None:
    """Extract title from HTML."""
    # Try og:title first
    og_title = extract_og_content(html, "title")
    if og_title:
        return og_title

    # Try twitter:title
    twitter_title = extract_twitter_content(html, "title")
    if twitter_title:
        return twitter_title

    # Fall back to <title> tag
    pattern = r"<title[^>]*>([^<]+)</title>"
    match = re.search(pattern, html, re.IGNORECASE)
    if match:
        return match.group(1).strip()

    return None


def extract_description(html: str) -> str | None:
    """Extract description from HTML."""
    # Try og:description first
    og_desc = extract_og_content(html, "description")
    if og_desc:
        return og_desc

    # Try twitter:description
    twitter_desc = extract_twitter_content(html, "description")
    if twitter_desc:
        return twitter_desc

    # Fall back to meta description
    return extract_meta_description(html)


def extract_image(html: str) -> str | None:
    """Extract image URL from HTML."""
    # Try og:image first
    og_image = extract_og_content(html, "image")
    if og_image:
        return og_image

    # Try twitter:image
    twitter_image = extract_twitter_content(html, "image")
    if twitter_image:
        return twitter_image

    return None


def generate_preview_id(url: str) -> str:
    """Generate a unique ID for a link preview."""
    hash_val = hashlib.md5(url.encode()).hexdigest()[:12]
    return f"link-preview-{hash_val}"
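
# Illustrative example (not part of the original file): exercising the
# extractors above on invented sample markup, to show the og/twitter/fallback
# precedence in action.
def _demo_extractors() -> None:
    sample_html = (
        "<html><head>"
        "<title>Fallback Title</title>"
        '<meta property="og:title" content="OG Title">'
        '<meta name="twitter:description" content="A short summary.">'
        '<meta property="og:image" content="/img/cover.png">'
        "</head></html>"
    )
    assert extract_title(sample_html) == "OG Title"  # og:title beats <title>
    assert extract_description(sample_html) == "A short summary."  # via twitter card
    assert extract_image(sample_html) == "/img/cover.png"  # relative; made absolute later
    assert generate_preview_id("https://example.com").startswith("link-preview-")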
def _unescape_html(text: str) -> str:
    """Unescape common HTML entities."""
    return (
        text.replace("&amp;", "&")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", '"')
        .replace("&#39;", "'")
        .replace("&#x27;", "'")
    )
def _make_absolute_url(image_url: str, base_url: str) -> str:
    """Convert a relative image URL to an absolute URL."""
    if image_url.startswith(("http://", "https://")):
        return image_url
    if image_url.startswith("//"):
        return f"https:{image_url}"
    if image_url.startswith("/"):
        parsed = urlparse(base_url)
        return f"{parsed.scheme}://{parsed.netloc}{image_url}"
    return image_url
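
# Illustrative example (not part of the original file): the URL and entity
# helpers above on invented sample values.
def _demo_url_helpers() -> None:
    base = "https://example.com/articles/post"
    assert _make_absolute_url("//cdn.example.com/a.png", base) == "https://cdn.example.com/a.png"
    assert _make_absolute_url("/img/cover.png", base) == "https://example.com/img/cover.png"
    assert _unescape_html("Tom &amp; Jerry &quot;Live&quot;") == 'Tom & Jerry "Live"'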
async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
    """
    Fetch page content using headless Chromium browser via Playwright.
    Used as a fallback when simple HTTP requests are blocked (403, etc.).

    Runs the sync Playwright API in a thread so it works on any event
    loop, including Windows ``SelectorEventLoop``.

    Args:
        url: URL to fetch

    Returns:
        Dict with title, description, image, and raw_html, or None if failed
    """
    try:
        return await asyncio.to_thread(_fetch_with_chromium_sync, url)
    except Exception as e:
        logger.error(f"[link_preview] Chromium fallback failed for {url}: {e}")
        return None


def _fetch_with_chromium_sync(url: str) -> dict[str, Any] | None:
    """Synchronous Playwright fetch executed in a worker thread."""
    logger.info(f"[link_preview] Falling back to Chromium for {url}")

    ua = UserAgent()
    user_agent = ua.random

    playwright_proxy = get_playwright_proxy()

    with sync_playwright() as p:
        launch_kwargs: dict = {"headless": True}
        if playwright_proxy:
            launch_kwargs["proxy"] = playwright_proxy
        browser = p.chromium.launch(**launch_kwargs)
        context = browser.new_context(user_agent=user_agent)
        page = context.new_page()

        try:
            page.goto(url, wait_until="domcontentloaded", timeout=30000)
            raw_html = page.content()
        finally:
            browser.close()

    if not raw_html or len(raw_html.strip()) == 0:
        logger.warning(f"[link_preview] Chromium returned empty content for {url}")
        return None

    trafilatura_metadata = trafilatura.extract_metadata(raw_html)

    image = extract_image(raw_html)

    result: dict[str, Any] = {
        "title": None,
        "description": None,
        "image": image,
        "raw_html": raw_html,
    }

    if trafilatura_metadata:
        result["title"] = trafilatura_metadata.title
        result["description"] = trafilatura_metadata.description

    if not result["title"]:
        result["title"] = extract_title(raw_html)
    if not result["description"]:
        result["description"] = extract_description(raw_html)

    logger.info(f"[link_preview] Successfully fetched {url} via Chromium")
    return result
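
# Illustrative aside (not part of the original file): asyncio.to_thread is the
# general pattern fetch_with_chromium uses above for awaiting any blocking
# callable without tying up the event loop. `slow_fetch` is invented.
def _demo_blocking_call() -> None:
    import time

    def slow_fetch(u: str) -> str:
        time.sleep(1)  # stands in for a blocking Playwright session
        return f"<html>fetched {u}</html>"

    async def main() -> None:
        html = await asyncio.to_thread(slow_fetch, "https://example.com")
        assert "fetched" in html

    asyncio.run(main())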
def create_link_preview_tool():
    """
    Factory function to create the link_preview tool.

    Returns:
        A configured tool function for fetching link previews.
    """

    @tool
    async def link_preview(url: str) -> dict[str, Any]:
        """
        Fetch metadata for a URL to display a rich link preview.

        Use this tool when the user shares a URL or asks about a specific webpage.
        This tool fetches the page's Open Graph metadata (title, description, image)
        to display a nice preview card in the chat.

        Common triggers include:
        - User shares a URL in the chat
        - User asks "What's this link about?" or similar
        - User says "Show me a preview of this page"
        - User wants to preview an article or webpage

        Args:
            url: The URL to fetch metadata for. Must be a valid HTTP/HTTPS URL.

        Returns:
            A dictionary containing:
            - id: Unique identifier for this preview
            - assetId: The URL itself (for deduplication)
            - kind: "link" (type of media card)
            - href: The URL to open when clicked
            - title: Page title
            - description: Page description (if available)
            - thumb: Thumbnail/preview image URL (if available)
            - domain: The domain name
            - error: Error message (if fetch failed)
        """
        preview_id = generate_preview_id(url)
        domain = extract_domain(url)

        # Ensure the URL has a scheme
        if not url.startswith(("http://", "https://")):
            url = f"https://{url}"

        try:
            # Generate a random User-Agent to avoid bot detection
            ua = UserAgent()
            user_agent = ua.random

            # Use residential proxy if configured
            proxy_url = get_residential_proxy_url()

            # Use a browser-like User-Agent to fetch Open Graph metadata.
            # We're only fetching publicly available metadata (title, description, thumbnail)
            # that websites intentionally expose via OG tags for link preview purposes.
            async with httpx.AsyncClient(
                timeout=10.0,
                follow_redirects=True,
                proxy=proxy_url,
                headers={
                    "User-Agent": user_agent,
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
                    "Accept-Language": "en-US,en;q=0.9",
                    "Accept-Encoding": "gzip, deflate, br",
                    "Cache-Control": "no-cache",
                    "Pragma": "no-cache",
                },
            ) as client:
                response = await client.get(url)
                response.raise_for_status()

                # Get content type to ensure it's HTML
                content_type = response.headers.get("content-type", "")
                if "text/html" not in content_type.lower():
                    # Not an HTML page, return basic info
                    return {
                        "id": preview_id,
                        "assetId": url,
                        "kind": "link",
                        "href": url,
                        "title": url.split("/")[-1] or domain,
                        "description": f"File from {domain}",
                        "domain": domain,
                    }

                html = response.text

            # Extract metadata
            title = extract_title(html) or domain
            description = extract_description(html)
            image = extract_image(html)

            # Make sure image URL is absolute
            if image:
                image = _make_absolute_url(image, url)

            # Clean up title and description (unescape HTML entities)
            if title:
                title = _unescape_html(title)
            if description:
                description = _unescape_html(description)
                # Truncate long descriptions
                if len(description) > 200:
                    description = description[:197] + "..."

            return {
                "id": preview_id,
                "assetId": url,
                "kind": "link",
                "href": url,
                "title": title,
                "description": description,
                "thumb": image,
                "domain": domain,
            }

        except httpx.TimeoutException:
            # Timeout - try Chromium fallback
            logger.warning(
                f"[link_preview] Timeout for {url}, trying Chromium fallback"
            )
            chromium_result = await fetch_with_chromium(url)
            if chromium_result:
                title = chromium_result.get("title") or domain
                description = chromium_result.get("description")
                image = chromium_result.get("image")

                # Clean up and truncate
                if title:
                    title = _unescape_html(title)
                if description:
                    description = _unescape_html(description)
                    if len(description) > 200:
                        description = description[:197] + "..."

                # Make sure image URL is absolute
                if image:
                    image = _make_absolute_url(image, url)

                return {
                    "id": preview_id,
                    "assetId": url,
                    "kind": "link",
                    "href": url,
                    "title": title,
                    "description": description,
                    "thumb": image,
                    "domain": domain,
                }

            return {
                "id": preview_id,
                "assetId": url,
                "kind": "link",
                "href": url,
                "title": domain or "Link",
                "domain": domain,
                "error": "Request timed out",
            }
        except httpx.HTTPStatusError as e:
            status_code = e.response.status_code

            # For 403 (Forbidden) and similar bot-detection errors, try Chromium fallback
            if status_code in (403, 401, 406, 429):
                logger.warning(
                    f"[link_preview] HTTP {status_code} for {url}, trying Chromium fallback"
                )
                chromium_result = await fetch_with_chromium(url)
                if chromium_result:
                    title = chromium_result.get("title") or domain
                    description = chromium_result.get("description")
                    image = chromium_result.get("image")

                    # Clean up and truncate
                    if title:
                        title = _unescape_html(title)
                    if description:
                        description = _unescape_html(description)
                        if len(description) > 200:
                            description = description[:197] + "..."

                    # Make sure image URL is absolute
                    if image:
                        image = _make_absolute_url(image, url)

                    return {
                        "id": preview_id,
                        "assetId": url,
                        "kind": "link",
                        "href": url,
                        "title": title,
                        "description": description,
                        "thumb": image,
                        "domain": domain,
                    }

            return {
                "id": preview_id,
                "assetId": url,
                "kind": "link",
                "href": url,
                "title": domain or "Link",
                "domain": domain,
                "error": f"HTTP {status_code}",
            }
        except Exception as e:
            error_message = str(e)
            logger.error(f"[link_preview] Error fetching {url}: {error_message}")
            return {
                "id": preview_id,
                "assetId": url,
                "kind": "link",
                "href": url,
                "title": domain or "Link",
                "domain": domain,
                "error": f"Failed to fetch: {error_message[:50]}",
            }

    return link_preview
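For reference, a successful link_preview call produced a media-card dict shaped like the following. This is a sketch with invented values; the keys match the tool's docstring above.

example_payload = {
    "id": "link-preview-3f2a9c1b7d4e",  # invented hash suffix
    "assetId": "https://example.com/post",
    "kind": "link",
    "href": "https://example.com/post",
    "title": "Example Post",
    "description": "A short summary of the page.",
    "thumb": "https://example.com/img/cover.png",
    "domain": "example.com",
}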
@@ -77,7 +77,6 @@ from .linear import (
     create_delete_linear_issue_tool,
     create_update_linear_issue_tool,
 )
-from .link_preview import create_link_preview_tool
 from .mcp_tool import load_mcp_tools
 from .notion import (
     create_create_notion_page_tool,
@@ -186,13 +185,6 @@ BUILTIN_TOOLS: list[ToolDefinition] = [
         # are optional — when missing, source_strategy="kb_search" degrades
         # gracefully to "provided"
     ),
-    # Link preview tool - fetches Open Graph metadata for URLs
-    ToolDefinition(
-        name="link_preview",
-        description="Fetch metadata for a URL to display a rich preview card",
-        factory=lambda deps: create_link_preview_tool(),
-        requires=[],
-    ),
     # Generate image tool - creates images using AI models (DALL-E, GPT Image, etc.)
     ToolDefinition(
         name="generate_image",
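The removed entry above shows the registry's factory pattern. As a hedged sketch of how any dependency-free tool plugs in (the tool name and factory below are hypothetical, not from this repo):

ToolDefinition(
    name="my_tool",  # hypothetical
    description="One-line description the agent sees when choosing tools",
    factory=lambda deps: create_my_tool(),  # hypothetical factory; deps unused here
    requires=[],  # empty: no injected dependencies needed
)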
@@ -559,7 +551,7 @@ def build_tools(
     tools = build_tools(deps)

     # Use only specific tools
-    tools = build_tools(deps, enabled_tools=["search_knowledge_base", "link_preview"])
+    tools = build_tools(deps, enabled_tools=["search_knowledge_base"])

     # Use defaults but disable podcast
     tools = build_tools(deps, disabled_tools=["generate_podcast"])
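The docstring examples above imply that build_tools filters BUILTIN_TOOLS by name before instantiating factories. A hedged sketch of that presumed behaviour follows; the real implementation is not shown in this diff, so treat the function below as an assumption, not the actual code.

def build_tools_sketch(deps, enabled_tools=None, disabled_tools=None):
    # Presumed filtering: an allow-list takes precedence, then an opt-out list.
    selected = []
    for definition in BUILTIN_TOOLS:
        if enabled_tools is not None and definition.name not in enabled_tools:
            continue  # allow-list mode: keep only the named tools
        if disabled_tools and definition.name in disabled_tools:
            continue  # opt-out mode: drop the named tools
        selected.append(definition.factory(deps))
    return selected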