refactor: remove link_preview tool and associated components to streamline agent functionality

Anish Sarkar 2026-03-24 17:15:29 +05:30
parent 6c507989d2
commit a009cae62a
16 changed files with 5 additions and 1202 deletions


@@ -10,7 +10,6 @@ Available tools:
- generate_podcast: Generate audio podcasts from content
- generate_video_presentation: Generate video presentations with slides and narration
- generate_image: Generate images from text descriptions using AI models
- link_preview: Fetch rich previews for URLs
- scrape_webpage: Extract content from webpages
- save_memory: Store facts/preferences about the user
- recall_memory: Retrieve relevant user memories
@@ -25,7 +24,6 @@ from .knowledge_base import (
    format_documents_for_context,
    search_knowledge_base_async,
)
from .link_preview import create_link_preview_tool
from .podcast import create_generate_podcast_tool
from .registry import (
    BUILTIN_TOOLS,
@@ -51,7 +49,6 @@ __all__ = [
    "create_generate_image_tool",
    "create_generate_podcast_tool",
    "create_generate_video_presentation_tool",
    "create_link_preview_tool",
    "create_recall_memory_tool",
    "create_save_memory_tool",
    "create_scrape_webpage_tool",


@@ -1,465 +0,0 @@
"""
Link preview tool for the SurfSense agent.
This module provides a tool for fetching URL metadata (title, description,
Open Graph image, etc.) to display rich link previews in the chat UI.
"""
import asyncio
import hashlib
import logging
import re
from typing import Any
from urllib.parse import urlparse
import httpx
import trafilatura
from fake_useragent import UserAgent
from langchain_core.tools import tool
from playwright.sync_api import sync_playwright
from app.utils.proxy_config import get_playwright_proxy, get_residential_proxy_url
logger = logging.getLogger(__name__)
def extract_domain(url: str) -> str:
"""Extract the domain from a URL."""
try:
parsed = urlparse(url)
domain = parsed.netloc
# Remove 'www.' prefix if present
if domain.startswith("www."):
domain = domain[4:]
return domain
except Exception:
return ""
def extract_og_content(html: str, property_name: str) -> str | None:
"""Extract Open Graph meta content from HTML."""
# Try og:property first
pattern = rf'<meta[^>]+property=["\']og:{property_name}["\'][^>]+content=["\']([^"\']+)["\']'
match = re.search(pattern, html, re.IGNORECASE)
if match:
return match.group(1)
# Try content before property
pattern = rf'<meta[^>]+content=["\']([^"\']+)["\'][^>]+property=["\']og:{property_name}["\']'
match = re.search(pattern, html, re.IGNORECASE)
if match:
return match.group(1)
return None
def extract_twitter_content(html: str, name: str) -> str | None:
"""Extract Twitter Card meta content from HTML."""
pattern = (
rf'<meta[^>]+name=["\']twitter:{name}["\'][^>]+content=["\']([^"\']+)["\']'
)
match = re.search(pattern, html, re.IGNORECASE)
if match:
return match.group(1)
# Try content before name
pattern = (
rf'<meta[^>]+content=["\']([^"\']+)["\'][^>]+name=["\']twitter:{name}["\']'
)
match = re.search(pattern, html, re.IGNORECASE)
if match:
return match.group(1)
return None
def extract_meta_description(html: str) -> str | None:
"""Extract meta description from HTML."""
pattern = r'<meta[^>]+name=["\']description["\'][^>]+content=["\']([^"\']+)["\']'
match = re.search(pattern, html, re.IGNORECASE)
if match:
return match.group(1)
# Try content before name
pattern = r'<meta[^>]+content=["\']([^"\']+)["\'][^>]+name=["\']description["\']'
match = re.search(pattern, html, re.IGNORECASE)
if match:
return match.group(1)
return None
def extract_title(html: str) -> str | None:
"""Extract title from HTML."""
# Try og:title first
og_title = extract_og_content(html, "title")
if og_title:
return og_title
# Try twitter:title
twitter_title = extract_twitter_content(html, "title")
if twitter_title:
return twitter_title
# Fall back to <title> tag
pattern = r"<title[^>]*>([^<]+)</title>"
match = re.search(pattern, html, re.IGNORECASE)
if match:
return match.group(1).strip()
return None
def extract_description(html: str) -> str | None:
"""Extract description from HTML."""
# Try og:description first
og_desc = extract_og_content(html, "description")
if og_desc:
return og_desc
# Try twitter:description
twitter_desc = extract_twitter_content(html, "description")
if twitter_desc:
return twitter_desc
# Fall back to meta description
return extract_meta_description(html)
def extract_image(html: str) -> str | None:
"""Extract image URL from HTML."""
# Try og:image first
og_image = extract_og_content(html, "image")
if og_image:
return og_image
# Try twitter:image
twitter_image = extract_twitter_content(html, "image")
if twitter_image:
return twitter_image
return None
def generate_preview_id(url: str) -> str:
"""Generate a unique ID for a link preview."""
hash_val = hashlib.md5(url.encode()).hexdigest()[:12]
return f"link-preview-{hash_val}"
def _unescape_html(text: str) -> str:
"""Unescape common HTML entities."""
return (
text.replace("&amp;", "&")
.replace("&lt;", "<")
.replace("&gt;", ">")
.replace("&quot;", '"')
.replace("&#39;", "'")
.replace("&apos;", "'")
)
def _make_absolute_url(image_url: str, base_url: str) -> str:
"""Convert a relative image URL to an absolute URL."""
if image_url.startswith(("http://", "https://")):
return image_url
if image_url.startswith("//"):
return f"https:{image_url}"
if image_url.startswith("/"):
parsed = urlparse(base_url)
return f"{parsed.scheme}://{parsed.netloc}{image_url}"
return image_url
async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
"""
Fetch page content using headless Chromium browser via Playwright.
Used as a fallback when simple HTTP requests are blocked (403, etc.).
Runs the sync Playwright API in a thread so it works on any event
loop, including Windows ``SelectorEventLoop``.
Args:
url: URL to fetch
Returns:
Dict with title, description, image, and raw_html, or None if failed
"""
try:
return await asyncio.to_thread(_fetch_with_chromium_sync, url)
except Exception as e:
logger.error(f"[link_preview] Chromium fallback failed for {url}: {e}")
return None
def _fetch_with_chromium_sync(url: str) -> dict[str, Any] | None:
"""Synchronous Playwright fetch executed in a worker thread."""
logger.info(f"[link_preview] Falling back to Chromium for {url}")
ua = UserAgent()
user_agent = ua.random
playwright_proxy = get_playwright_proxy()
with sync_playwright() as p:
launch_kwargs: dict = {"headless": True}
if playwright_proxy:
launch_kwargs["proxy"] = playwright_proxy
browser = p.chromium.launch(**launch_kwargs)
context = browser.new_context(user_agent=user_agent)
page = context.new_page()
try:
page.goto(url, wait_until="domcontentloaded", timeout=30000)
raw_html = page.content()
finally:
browser.close()
if not raw_html or len(raw_html.strip()) == 0:
logger.warning(f"[link_preview] Chromium returned empty content for {url}")
return None
trafilatura_metadata = trafilatura.extract_metadata(raw_html)
image = extract_image(raw_html)
result: dict[str, Any] = {
"title": None,
"description": None,
"image": image,
"raw_html": raw_html,
}
if trafilatura_metadata:
result["title"] = trafilatura_metadata.title
result["description"] = trafilatura_metadata.description
if not result["title"]:
result["title"] = extract_title(raw_html)
if not result["description"]:
result["description"] = extract_description(raw_html)
logger.info(f"[link_preview] Successfully fetched {url} via Chromium")
return result
def create_link_preview_tool():
"""
Factory function to create the link_preview tool.
Returns:
A configured tool function for fetching link previews.
"""
@tool
async def link_preview(url: str) -> dict[str, Any]:
"""
Fetch metadata for a URL to display a rich link preview.
Use this tool when the user shares a URL or asks about a specific webpage.
This tool fetches the page's Open Graph metadata (title, description, image)
to display a nice preview card in the chat.
Common triggers include:
- User shares a URL in the chat
- User asks "What's this link about?" or similar
- User says "Show me a preview of this page"
- User wants to preview an article or webpage
Args:
url: The URL to fetch metadata for. Must be a valid HTTP/HTTPS URL.
Returns:
A dictionary containing:
- id: Unique identifier for this preview
- assetId: The URL itself (for deduplication)
- kind: "link" (type of media card)
- href: The URL to open when clicked
- title: Page title
- description: Page description (if available)
- thumb: Thumbnail/preview image URL (if available)
- domain: The domain name
- error: Error message (if fetch failed)
"""
preview_id = generate_preview_id(url)
domain = extract_domain(url)
# Validate URL
if not url.startswith(("http://", "https://")):
url = f"https://{url}"
try:
# Generate a random User-Agent to avoid bot detection
ua = UserAgent()
user_agent = ua.random
# Use residential proxy if configured
proxy_url = get_residential_proxy_url()
# Use a browser-like User-Agent to fetch Open Graph metadata.
# We're only fetching publicly available metadata (title, description, thumbnail)
# that websites intentionally expose via OG tags for link preview purposes.
async with httpx.AsyncClient(
timeout=10.0,
follow_redirects=True,
proxy=proxy_url,
headers={
"User-Agent": user_agent,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.9",
"Accept-Encoding": "gzip, deflate, br",
"Cache-Control": "no-cache",
"Pragma": "no-cache",
},
) as client:
response = await client.get(url)
response.raise_for_status()
# Get content type to ensure it's HTML
content_type = response.headers.get("content-type", "")
if "text/html" not in content_type.lower():
# Not an HTML page, return basic info
return {
"id": preview_id,
"assetId": url,
"kind": "link",
"href": url,
"title": url.split("/")[-1] or domain,
"description": f"File from {domain}",
"domain": domain,
}
html = response.text
# Extract metadata
title = extract_title(html) or domain
description = extract_description(html)
image = extract_image(html)
# Make sure image URL is absolute
if image:
image = _make_absolute_url(image, url)
# Clean up title and description (unescape HTML entities)
if title:
title = _unescape_html(title)
if description:
description = _unescape_html(description)
# Truncate long descriptions
if len(description) > 200:
description = description[:197] + "..."
return {
"id": preview_id,
"assetId": url,
"kind": "link",
"href": url,
"title": title,
"description": description,
"thumb": image,
"domain": domain,
}
except httpx.TimeoutException:
# Timeout - try Chromium fallback
logger.warning(
f"[link_preview] Timeout for {url}, trying Chromium fallback"
)
chromium_result = await fetch_with_chromium(url)
if chromium_result:
title = chromium_result.get("title") or domain
description = chromium_result.get("description")
image = chromium_result.get("image")
# Clean up and truncate
if title:
title = _unescape_html(title)
if description:
description = _unescape_html(description)
if len(description) > 200:
description = description[:197] + "..."
# Make sure image URL is absolute
if image:
image = _make_absolute_url(image, url)
return {
"id": preview_id,
"assetId": url,
"kind": "link",
"href": url,
"title": title,
"description": description,
"thumb": image,
"domain": domain,
}
return {
"id": preview_id,
"assetId": url,
"kind": "link",
"href": url,
"title": domain or "Link",
"domain": domain,
"error": "Request timed out",
}
except httpx.HTTPStatusError as e:
status_code = e.response.status_code
# For 403 (Forbidden) and similar bot-detection errors, try Chromium fallback
if status_code in (403, 401, 406, 429):
logger.warning(
f"[link_preview] HTTP {status_code} for {url}, trying Chromium fallback"
)
chromium_result = await fetch_with_chromium(url)
if chromium_result:
title = chromium_result.get("title") or domain
description = chromium_result.get("description")
image = chromium_result.get("image")
# Clean up and truncate
if title:
title = _unescape_html(title)
if description:
description = _unescape_html(description)
if len(description) > 200:
description = description[:197] + "..."
# Make sure image URL is absolute
if image:
image = _make_absolute_url(image, url)
return {
"id": preview_id,
"assetId": url,
"kind": "link",
"href": url,
"title": title,
"description": description,
"thumb": image,
"domain": domain,
}
return {
"id": preview_id,
"assetId": url,
"kind": "link",
"href": url,
"title": domain or "Link",
"domain": domain,
"error": f"HTTP {status_code}",
}
except Exception as e:
error_message = str(e)
logger.error(f"[link_preview] Error fetching {url}: {error_message}")
return {
"id": preview_id,
"assetId": url,
"kind": "link",
"href": url,
"title": domain or "Link",
"domain": domain,
"error": f"Failed to fetch: {error_message[:50]}",
}
return link_preview


@@ -77,7 +77,6 @@ from .linear import (
    create_delete_linear_issue_tool,
    create_update_linear_issue_tool,
)
from .link_preview import create_link_preview_tool
from .mcp_tool import load_mcp_tools
from .notion import (
    create_create_notion_page_tool,
@@ -186,13 +185,6 @@ BUILTIN_TOOLS: list[ToolDefinition] = [
        # are optional — when missing, source_strategy="kb_search" degrades
        # gracefully to "provided"
    ),
    # Link preview tool - fetches Open Graph metadata for URLs
    ToolDefinition(
        name="link_preview",
        description="Fetch metadata for a URL to display a rich preview card",
        factory=lambda deps: create_link_preview_tool(),
        requires=[],
    ),
    # Generate image tool - creates images using AI models (DALL-E, GPT Image, etc.)
    ToolDefinition(
        name="generate_image",
@@ -559,7 +551,7 @@ def build_tools(
        tools = build_tools(deps)

        # Use only specific tools
        tools = build_tools(deps, enabled_tools=["search_knowledge_base", "link_preview"])
        tools = build_tools(deps, enabled_tools=["search_knowledge_base"])

        # Use defaults but disable podcast
        tools = build_tools(deps, disabled_tools=["generate_podcast"])
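For downstream code that opted into this tool by name, the migration is the same one the docstring hunk above shows: drop "link_preview" from any enabled_tools list. A minimal sketch of such a caller, assuming a deps object the caller already constructs (the surrounding code is hypothetical; build_tools, enabled_tools, and the scrape_webpage tool name come from this diff):

# Hypothetical caller outside this commit. "link_preview" is no longer a
# registered ToolDefinition, so passing it would select nothing for that entry;
# scrape_webpage remains the built-in for fetching page content.
tools = build_tools(deps, enabled_tools=["search_knowledge_base", "scrape_webpage"])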