refactor: remove link_preview tool and associated components to streamline agent functionality
This commit is contained in:
  parent 6c507989d2
  commit a009cae62a

16 changed files with 5 additions and 1202 deletions
@@ -10,7 +10,6 @@ Available tools:
 - generate_podcast: Generate audio podcasts from content
 - generate_video_presentation: Generate video presentations with slides and narration
 - generate_image: Generate images from text descriptions using AI models
-- link_preview: Fetch rich previews for URLs
 - scrape_webpage: Extract content from webpages
 - save_memory: Store facts/preferences about the user
 - recall_memory: Retrieve relevant user memories
|
@ -25,7 +24,6 @@ from .knowledge_base import (
|
|||
format_documents_for_context,
|
||||
search_knowledge_base_async,
|
||||
)
|
||||
from .link_preview import create_link_preview_tool
|
||||
from .podcast import create_generate_podcast_tool
|
||||
from .registry import (
|
||||
BUILTIN_TOOLS,
|
||||
|
|
@@ -51,7 +49,6 @@ __all__ = [
     "create_generate_image_tool",
     "create_generate_podcast_tool",
     "create_generate_video_presentation_tool",
-    "create_link_preview_tool",
     "create_recall_memory_tool",
     "create_save_memory_tool",
     "create_scrape_webpage_tool",
@@ -1,465 +0,0 @@
"""
Link preview tool for the SurfSense agent.

This module provides a tool for fetching URL metadata (title, description,
Open Graph image, etc.) to display rich link previews in the chat UI.
"""

import asyncio
import hashlib
import logging
import re
from typing import Any
from urllib.parse import urlparse

import httpx
import trafilatura
from fake_useragent import UserAgent
from langchain_core.tools import tool
from playwright.sync_api import sync_playwright

from app.utils.proxy_config import get_playwright_proxy, get_residential_proxy_url

logger = logging.getLogger(__name__)


def extract_domain(url: str) -> str:
    """Extract the domain from a URL."""
    try:
        parsed = urlparse(url)
        domain = parsed.netloc
        # Remove 'www.' prefix if present
        if domain.startswith("www."):
            domain = domain[4:]
        return domain
    except Exception:
        return ""


def extract_og_content(html: str, property_name: str) -> str | None:
    """Extract Open Graph meta content from HTML."""
    # Try og:property first
    pattern = rf'<meta[^>]+property=["\']og:{property_name}["\'][^>]+content=["\']([^"\']+)["\']'
    match = re.search(pattern, html, re.IGNORECASE)
    if match:
        return match.group(1)

    # Try content before property
    pattern = rf'<meta[^>]+content=["\']([^"\']+)["\'][^>]+property=["\']og:{property_name}["\']'
    match = re.search(pattern, html, re.IGNORECASE)
    if match:
        return match.group(1)

    return None


def extract_twitter_content(html: str, name: str) -> str | None:
    """Extract Twitter Card meta content from HTML."""
    pattern = (
        rf'<meta[^>]+name=["\']twitter:{name}["\'][^>]+content=["\']([^"\']+)["\']'
    )
    match = re.search(pattern, html, re.IGNORECASE)
    if match:
        return match.group(1)

    # Try content before name
    pattern = (
        rf'<meta[^>]+content=["\']([^"\']+)["\'][^>]+name=["\']twitter:{name}["\']'
    )
    match = re.search(pattern, html, re.IGNORECASE)
    if match:
        return match.group(1)

    return None


def extract_meta_description(html: str) -> str | None:
    """Extract meta description from HTML."""
    pattern = r'<meta[^>]+name=["\']description["\'][^>]+content=["\']([^"\']+)["\']'
    match = re.search(pattern, html, re.IGNORECASE)
    if match:
        return match.group(1)

    # Try content before name
    pattern = r'<meta[^>]+content=["\']([^"\']+)["\'][^>]+name=["\']description["\']'
    match = re.search(pattern, html, re.IGNORECASE)
    if match:
        return match.group(1)

    return None


def extract_title(html: str) -> str | None:
    """Extract title from HTML."""
    # Try og:title first
    og_title = extract_og_content(html, "title")
    if og_title:
        return og_title

    # Try twitter:title
    twitter_title = extract_twitter_content(html, "title")
    if twitter_title:
        return twitter_title

    # Fall back to <title> tag
    pattern = r"<title[^>]*>([^<]+)</title>"
    match = re.search(pattern, html, re.IGNORECASE)
    if match:
        return match.group(1).strip()

    return None


def extract_description(html: str) -> str | None:
    """Extract description from HTML."""
    # Try og:description first
    og_desc = extract_og_content(html, "description")
    if og_desc:
        return og_desc

    # Try twitter:description
    twitter_desc = extract_twitter_content(html, "description")
    if twitter_desc:
        return twitter_desc

    # Fall back to meta description
    return extract_meta_description(html)


def extract_image(html: str) -> str | None:
    """Extract image URL from HTML."""
    # Try og:image first
    og_image = extract_og_content(html, "image")
    if og_image:
        return og_image

    # Try twitter:image
    twitter_image = extract_twitter_content(html, "image")
    if twitter_image:
        return twitter_image

    return None


def generate_preview_id(url: str) -> str:
    """Generate a unique ID for a link preview."""
    hash_val = hashlib.md5(url.encode()).hexdigest()[:12]
    return f"link-preview-{hash_val}"
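
# Illustrative example (not part of the original file): exercising the
# extractors above on invented sample markup, to show the og/twitter/fallback
# precedence in action.
def _demo_extractors() -> None:
    sample_html = (
        "<html><head>"
        "<title>Fallback Title</title>"
        '<meta property="og:title" content="OG Title">'
        '<meta name="twitter:description" content="A short summary.">'
        '<meta property="og:image" content="/img/cover.png">'
        "</head></html>"
    )
    assert extract_title(sample_html) == "OG Title"  # og:title beats <title>
    assert extract_description(sample_html) == "A short summary."  # via twitter card
    assert extract_image(sample_html) == "/img/cover.png"  # relative; made absolute later
    assert generate_preview_id("https://example.com").startswith("link-preview-")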
def _unescape_html(text: str) -> str:
    """Unescape common HTML entities."""
    return (
        text.replace("&amp;", "&")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", '"')
        .replace("&#39;", "'")
        .replace("&#x27;", "'")
    )
def _make_absolute_url(image_url: str, base_url: str) -> str:
    """Convert a relative image URL to an absolute URL."""
    if image_url.startswith(("http://", "https://")):
        return image_url
    if image_url.startswith("//"):
        return f"https:{image_url}"
    if image_url.startswith("/"):
        parsed = urlparse(base_url)
        return f"{parsed.scheme}://{parsed.netloc}{image_url}"
    return image_url
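
# Illustrative example (not part of the original file): the URL and entity
# helpers above on invented sample values.
def _demo_url_helpers() -> None:
    base = "https://example.com/articles/post"
    assert _make_absolute_url("//cdn.example.com/a.png", base) == "https://cdn.example.com/a.png"
    assert _make_absolute_url("/img/cover.png", base) == "https://example.com/img/cover.png"
    assert _unescape_html("Tom &amp; Jerry &quot;Live&quot;") == 'Tom & Jerry "Live"'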
async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
    """
    Fetch page content using headless Chromium browser via Playwright.
    Used as a fallback when simple HTTP requests are blocked (403, etc.).

    Runs the sync Playwright API in a thread so it works on any event
    loop, including Windows ``SelectorEventLoop``.

    Args:
        url: URL to fetch

    Returns:
        Dict with title, description, image, and raw_html, or None if failed
    """
    try:
        return await asyncio.to_thread(_fetch_with_chromium_sync, url)
    except Exception as e:
        logger.error(f"[link_preview] Chromium fallback failed for {url}: {e}")
        return None


def _fetch_with_chromium_sync(url: str) -> dict[str, Any] | None:
    """Synchronous Playwright fetch executed in a worker thread."""
    logger.info(f"[link_preview] Falling back to Chromium for {url}")

    ua = UserAgent()
    user_agent = ua.random

    playwright_proxy = get_playwright_proxy()

    with sync_playwright() as p:
        launch_kwargs: dict = {"headless": True}
        if playwright_proxy:
            launch_kwargs["proxy"] = playwright_proxy
        browser = p.chromium.launch(**launch_kwargs)
        context = browser.new_context(user_agent=user_agent)
        page = context.new_page()

        try:
            page.goto(url, wait_until="domcontentloaded", timeout=30000)
            raw_html = page.content()
        finally:
            browser.close()

    if not raw_html or len(raw_html.strip()) == 0:
        logger.warning(f"[link_preview] Chromium returned empty content for {url}")
        return None

    trafilatura_metadata = trafilatura.extract_metadata(raw_html)

    image = extract_image(raw_html)

    result: dict[str, Any] = {
        "title": None,
        "description": None,
        "image": image,
        "raw_html": raw_html,
    }

    if trafilatura_metadata:
        result["title"] = trafilatura_metadata.title
        result["description"] = trafilatura_metadata.description

    if not result["title"]:
        result["title"] = extract_title(raw_html)
    if not result["description"]:
        result["description"] = extract_description(raw_html)

    logger.info(f"[link_preview] Successfully fetched {url} via Chromium")
    return result
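
# Illustrative aside (not part of the original file): asyncio.to_thread is the
# general pattern fetch_with_chromium uses above for awaiting any blocking
# callable without tying up the event loop. `slow_fetch` is invented.
def _demo_blocking_call() -> None:
    import time

    def slow_fetch(u: str) -> str:
        time.sleep(1)  # stands in for a blocking Playwright session
        return f"<html>fetched {u}</html>"

    async def main() -> None:
        html = await asyncio.to_thread(slow_fetch, "https://example.com")
        assert "fetched" in html

    asyncio.run(main())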
def create_link_preview_tool():
    """
    Factory function to create the link_preview tool.

    Returns:
        A configured tool function for fetching link previews.
    """

    @tool
    async def link_preview(url: str) -> dict[str, Any]:
        """
        Fetch metadata for a URL to display a rich link preview.

        Use this tool when the user shares a URL or asks about a specific webpage.
        This tool fetches the page's Open Graph metadata (title, description, image)
        to display a nice preview card in the chat.

        Common triggers include:
        - User shares a URL in the chat
        - User asks "What's this link about?" or similar
        - User says "Show me a preview of this page"
        - User wants to preview an article or webpage

        Args:
            url: The URL to fetch metadata for. Must be a valid HTTP/HTTPS URL.

        Returns:
            A dictionary containing:
            - id: Unique identifier for this preview
            - assetId: The URL itself (for deduplication)
            - kind: "link" (type of media card)
            - href: The URL to open when clicked
            - title: Page title
            - description: Page description (if available)
            - thumb: Thumbnail/preview image URL (if available)
            - domain: The domain name
            - error: Error message (if fetch failed)
        """
        preview_id = generate_preview_id(url)
        domain = extract_domain(url)

        # Ensure the URL has a scheme
        if not url.startswith(("http://", "https://")):
            url = f"https://{url}"

        try:
            # Generate a random User-Agent to avoid bot detection
            ua = UserAgent()
            user_agent = ua.random

            # Use residential proxy if configured
            proxy_url = get_residential_proxy_url()

            # Use a browser-like User-Agent to fetch Open Graph metadata.
            # We're only fetching publicly available metadata (title, description, thumbnail)
            # that websites intentionally expose via OG tags for link preview purposes.
            async with httpx.AsyncClient(
                timeout=10.0,
                follow_redirects=True,
                proxy=proxy_url,
                headers={
                    "User-Agent": user_agent,
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
                    "Accept-Language": "en-US,en;q=0.9",
                    "Accept-Encoding": "gzip, deflate, br",
                    "Cache-Control": "no-cache",
                    "Pragma": "no-cache",
                },
            ) as client:
                response = await client.get(url)
                response.raise_for_status()

                # Get content type to ensure it's HTML
                content_type = response.headers.get("content-type", "")
                if "text/html" not in content_type.lower():
                    # Not an HTML page, return basic info
                    return {
                        "id": preview_id,
                        "assetId": url,
                        "kind": "link",
                        "href": url,
                        "title": url.split("/")[-1] or domain,
                        "description": f"File from {domain}",
                        "domain": domain,
                    }

                html = response.text

            # Extract metadata
            title = extract_title(html) or domain
            description = extract_description(html)
            image = extract_image(html)

            # Make sure image URL is absolute
            if image:
                image = _make_absolute_url(image, url)

            # Clean up title and description (unescape HTML entities)
            if title:
                title = _unescape_html(title)
            if description:
                description = _unescape_html(description)
                # Truncate long descriptions
                if len(description) > 200:
                    description = description[:197] + "..."

            return {
                "id": preview_id,
                "assetId": url,
                "kind": "link",
                "href": url,
                "title": title,
                "description": description,
                "thumb": image,
                "domain": domain,
            }

        except httpx.TimeoutException:
            # Timeout - try Chromium fallback
            logger.warning(
                f"[link_preview] Timeout for {url}, trying Chromium fallback"
            )
            chromium_result = await fetch_with_chromium(url)
            if chromium_result:
                title = chromium_result.get("title") or domain
                description = chromium_result.get("description")
                image = chromium_result.get("image")

                # Clean up and truncate
                if title:
                    title = _unescape_html(title)
                if description:
                    description = _unescape_html(description)
                    if len(description) > 200:
                        description = description[:197] + "..."

                # Make sure image URL is absolute
                if image:
                    image = _make_absolute_url(image, url)

                return {
                    "id": preview_id,
                    "assetId": url,
                    "kind": "link",
                    "href": url,
                    "title": title,
                    "description": description,
                    "thumb": image,
                    "domain": domain,
                }

            return {
                "id": preview_id,
                "assetId": url,
                "kind": "link",
                "href": url,
                "title": domain or "Link",
                "domain": domain,
                "error": "Request timed out",
            }
        except httpx.HTTPStatusError as e:
            status_code = e.response.status_code

            # For 403 (Forbidden) and similar bot-detection errors, try Chromium fallback
            if status_code in (403, 401, 406, 429):
                logger.warning(
                    f"[link_preview] HTTP {status_code} for {url}, trying Chromium fallback"
                )
                chromium_result = await fetch_with_chromium(url)
                if chromium_result:
                    title = chromium_result.get("title") or domain
                    description = chromium_result.get("description")
                    image = chromium_result.get("image")

                    # Clean up and truncate
                    if title:
                        title = _unescape_html(title)
                    if description:
                        description = _unescape_html(description)
                        if len(description) > 200:
                            description = description[:197] + "..."

                    # Make sure image URL is absolute
                    if image:
                        image = _make_absolute_url(image, url)

                    return {
                        "id": preview_id,
                        "assetId": url,
                        "kind": "link",
                        "href": url,
                        "title": title,
                        "description": description,
                        "thumb": image,
                        "domain": domain,
                    }

            return {
                "id": preview_id,
                "assetId": url,
                "kind": "link",
                "href": url,
                "title": domain or "Link",
                "domain": domain,
                "error": f"HTTP {status_code}",
            }
        except Exception as e:
            error_message = str(e)
            logger.error(f"[link_preview] Error fetching {url}: {error_message}")
            return {
                "id": preview_id,
                "assetId": url,
                "kind": "link",
                "href": url,
                "title": domain or "Link",
                "domain": domain,
                "error": f"Failed to fetch: {error_message[:50]}",
            }

    return link_preview
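For reference, a successful link_preview call produced a media-card dict shaped like the following. This is a sketch with invented values; the keys match the tool's docstring above.

example_payload = {
    "id": "link-preview-3f2a9c1b7d4e",  # invented hash suffix
    "assetId": "https://example.com/post",
    "kind": "link",
    "href": "https://example.com/post",
    "title": "Example Post",
    "description": "A short summary of the page.",
    "thumb": "https://example.com/img/cover.png",
    "domain": "example.com",
}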
@@ -77,7 +77,6 @@ from .linear import (
     create_delete_linear_issue_tool,
     create_update_linear_issue_tool,
 )
-from .link_preview import create_link_preview_tool
 from .mcp_tool import load_mcp_tools
 from .notion import (
     create_create_notion_page_tool,
@@ -186,13 +185,6 @@ BUILTIN_TOOLS: list[ToolDefinition] = [
         # are optional — when missing, source_strategy="kb_search" degrades
         # gracefully to "provided"
     ),
-    # Link preview tool - fetches Open Graph metadata for URLs
-    ToolDefinition(
-        name="link_preview",
-        description="Fetch metadata for a URL to display a rich preview card",
-        factory=lambda deps: create_link_preview_tool(),
-        requires=[],
-    ),
     # Generate image tool - creates images using AI models (DALL-E, GPT Image, etc.)
     ToolDefinition(
         name="generate_image",
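The removed entry above shows the registry's factory pattern. As a hedged sketch of how any dependency-free tool plugs in (the tool name and factory below are hypothetical, not from this repo):

ToolDefinition(
    name="my_tool",  # hypothetical
    description="One-line description the agent sees when choosing tools",
    factory=lambda deps: create_my_tool(),  # hypothetical factory; deps unused here
    requires=[],  # empty: no injected dependencies needed
)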
@@ -559,7 +551,7 @@ def build_tools(
     tools = build_tools(deps)

     # Use only specific tools
-    tools = build_tools(deps, enabled_tools=["search_knowledge_base", "link_preview"])
+    tools = build_tools(deps, enabled_tools=["search_knowledge_base"])

     # Use defaults but disable podcast
     tools = build_tools(deps, disabled_tools=["generate_podcast"])
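The docstring examples above imply that build_tools filters BUILTIN_TOOLS by name before instantiating factories. A hedged sketch of that presumed behaviour follows; the real implementation is not shown in this diff, so treat the function below as an assumption, not the actual code.

def build_tools_sketch(deps, enabled_tools=None, disabled_tools=None):
    # Presumed filtering: an allow-list takes precedence, then an opt-out list.
    selected = []
    for definition in BUILTIN_TOOLS:
        if enabled_tools is not None and definition.name not in enabled_tools:
            continue  # allow-list mode: keep only the named tools
        if disabled_tools and definition.name in disabled_tools:
            continue  # opt-out mode: drop the named tools
        selected.append(definition.factory(deps))
    return selected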