feat: add link preview tool for enhanced URL metadata display in chat

2026-05-10 08:12:40 +02:00 · 2025-12-23 00:58:27 +05:30 · 2025-12-23 00:58:27 +05:30 · 4b69fdf214
commit 4b69fdf214
parent 28985e6af4
8 changed files with 1035 additions and 3 deletions
--- a/surfsense_backend/app/agents/new_chat/link_preview.py
+++ b/surfsense_backend/app/agents/new_chat/link_preview.py
@ -0,0 +1,292 @@
+"""
+Link preview tool for the new chat agent.
+
+This module provides a tool for fetching URL metadata (title, description,
+Open Graph image, etc.) to display rich link previews in the chat UI.
+"""
+
+import hashlib
+import re
+from typing import Any
+from urllib.parse import urlparse
+
+import httpx
+from langchain_core.tools import tool
+
+
+def extract_domain(url: str) -> str:
+    """Extract the display domain (host, plus port if one is given) from a URL.
+
+    Scheme-less input such as ``example.com/page`` is supported, credentials
+    embedded in the URL (``user:pass@host``) are dropped, and a leading
+    ``www.`` is removed regardless of letter case.
+
+    Args:
+        url: The URL, or bare host/path, to inspect.
+
+    Returns:
+        The domain without a ``www.`` prefix, or ``""`` when none can be
+        determined (including non-string or malformed input).
+    """
+    try:
+        netloc = urlparse(url).netloc
+        if not netloc and "://" not in url:
+            # Without a scheme urlparse() files the host under ``path`` (or
+            # mistakes "host:port" for "scheme:path"); a "//" prefix forces
+            # the text to be read as the network location.
+            netloc = urlparse(f"//{url}").netloc
+
+        # Never surface embedded credentials in a user-facing preview.
+        domain = netloc.rpartition("@")[2]
+        if domain[:4].lower() == "www.":
+            domain = domain[4:]
+        return domain
+    except (AttributeError, TypeError, ValueError):
+        # ValueError: malformed netloc (e.g. an unbalanced IPv6 bracket).
+        # TypeError/AttributeError: non-string input. Best effort by design.
+        return ""
+
+
+# Matches a non-empty quoted ``content`` attribute. The closing quote must be
+# the same kind as the opening one, so an apostrophe inside a double-quoted
+# value (content="It's here") no longer truncates the capture. The lookbehind
+# stops look-alike attributes such as ``data-content`` from matching.
+_META_CONTENT_RE = r"""(?<![\w-])content\s*=\s*(?:"([^"]+)"|'([^']+)')"""
+
+
+def _extract_meta_content(html: str, attr: str, value: str) -> str | None:
+    """Return the ``content`` of the first ``<meta>`` tag where *attr* == *value*.
+
+    Args:
+        html: Raw HTML to search.
+        attr: Name of the identifying attribute (``"property"`` or ``"name"``).
+        value: Required value of that attribute; matched literally and
+            case-insensitively.
+
+    Returns:
+        The raw (still entity-encoded) content string, or ``None`` when no tag
+        matches or the matching tag's content is empty.
+    """
+    key = rf"""(?<![\w-]){attr}\s*=\s*["']{re.escape(value)}["']"""
+    patterns = (
+        # Identifying attribute first, content second ...
+        rf"<meta\b[^>]*?{key}[^>]*?{_META_CONTENT_RE}",
+        # ... or the other way round; HTML does not fix attribute order.
+        rf"<meta\b[^>]*?{_META_CONTENT_RE}[^>]*?{key}",
+    )
+    for pattern in patterns:
+        match = re.search(pattern, html, re.IGNORECASE)
+        if match:
+            # Exactly one of the two quote-specific groups participates.
+            return match.group(1) or match.group(2)
+    return None
+
+
+def extract_og_content(html: str, property_name: str) -> str | None:
+    """Extract Open Graph meta content from HTML.
+
+    Args:
+        html: Raw HTML to search.
+        property_name: Open Graph property without the ``og:`` prefix,
+            e.g. ``"title"`` or ``"image"``.
+
+    Returns:
+        The content of ``<meta property="og:{property_name}">``, or ``None``
+        if the tag is absent or its content is empty.
+    """
+    return _extract_meta_content(html, "property", f"og:{property_name}")
+
+
+def extract_twitter_content(html: str, name: str) -> str | None:
+    """Extract Twitter Card meta content from HTML.
+
+    Args:
+        html: Raw HTML to search.
+        name: Twitter Card field without the ``twitter:`` prefix,
+            e.g. ``"title"`` or ``"image"``.
+
+    Returns:
+        The content of ``<meta name="twitter:{name}">``, or ``None`` if the
+        tag is absent or its content is empty.
+    """
+    return _extract_meta_content(html, "name", f"twitter:{name}")
+
+
+def extract_meta_description(html: str) -> str | None:
+    """Extract the standard ``<meta name="description">`` content from HTML.
+
+    Returns:
+        The description text, or ``None`` if the tag is absent or empty.
+    """
+    return _extract_meta_content(html, "name", "description")
+
+
+def extract_title(html: str) -> str | None:
+    """Return the page title, preferring social-card metadata.
+
+    Lookup order: ``og:title``, then ``twitter:title``, then the
+    whitespace-trimmed text of the ``<title>`` element. Returns ``None`` when
+    none of the three is found.
+    """
+    social_title = extract_og_content(html, "title") or extract_twitter_content(
+        html, "title"
+    )
+    if social_title:
+        return social_title
+
+    # Last resort: the plain <title> element.
+    tag_match = re.search(r"<title[^>]*>([^<]+)</title>", html, re.IGNORECASE)
+    return tag_match.group(1).strip() if tag_match else None
+
+
+def extract_description(html: str) -> str | None:
+    """Return the page description from the first source that provides one.
+
+    Sources are consulted lazily in this order: ``og:description``,
+    ``twitter:description``, then the standard meta description. The result
+    of the last lookup is returned as-is, so ``None`` means nothing matched.
+    """
+    return (
+        extract_og_content(html, "description")
+        or extract_twitter_content(html, "description")
+        or extract_meta_description(html)
+    )
+
+
+def extract_image(html: str) -> str | None:
+    """Return the preview image URL declared by the page, if any.
+
+    ``og:image`` takes precedence over ``twitter:image``; ``None`` is
+    returned when neither tag yields a value.
+    """
+    for lookup in (extract_og_content, extract_twitter_content):
+        image_url = lookup(html, "image")
+        if image_url:
+            return image_url
+    return None
+
+
+def generate_preview_id(url: str) -> str:
+    """Generate a deterministic ID for a link preview.
+
+    The same URL always maps to the same ID, so repeated previews of one link
+    can be matched up.
+
+    Args:
+        url: The URL being previewed.
+
+    Returns:
+        ``"link-preview-"`` followed by the first 12 hex digits of the URL's
+        MD5 digest.
+    """
+    # MD5 is only a cheap fingerprint here, not a security control. Declaring
+    # that keeps the call working on FIPS-restricted Python builds, where a
+    # plain hashlib.md5() raises.
+    digest = hashlib.md5(url.encode(), usedforsecurity=False).hexdigest()
+    return f"link-preview-{digest[:12]}"
+
+
+def _link_preview_error(
+    preview_id: str, url: str, domain: str, error: str
+) -> dict[str, Any]:
+    """Build the fallback card returned when a page could not be fetched."""
+    return {
+        "id": preview_id,
+        "assetId": url,
+        "kind": "link",
+        "href": url,
+        "title": domain or "Link",
+        "domain": domain,
+        "error": error,
+    }
+
+
+def create_link_preview_tool():
+    """
+    Factory function to create the link_preview tool.
+
+    The returned tool does not raise on fetch problems: timeouts, HTTP error
+    statuses and any other failure while fetching or parsing are reported
+    through an ``error`` key in the returned payload instead.
+
+    Returns:
+        A configured tool function for fetching link previews.
+    """
+    # Function-scope imports keep this block self-contained; ``unescape`` is
+    # imported by name because ``html`` is a common local name for page markup.
+    import logging
+    from html import unescape
+    from urllib.parse import urljoin
+
+    logger = logging.getLogger(__name__)
+
+    @tool
+    async def link_preview(url: str) -> dict[str, Any]:
+        """
+        Fetch metadata for a URL to display a rich link preview.
+
+        Use this tool when the user shares a URL or asks about a specific webpage.
+        This tool fetches the page's Open Graph metadata (title, description, image)
+        to display a nice preview card in the chat.
+
+        Common triggers include:
+        - User shares a URL in the chat
+        - User asks "What's this link about?" or similar
+        - User says "Show me a preview of this page"
+        - User wants to preview an article or webpage
+
+        Args:
+            url: The URL to fetch metadata for. Must be a valid HTTP/HTTPS URL.
+
+        Returns:
+            A dictionary containing:
+            - id: Unique identifier for this preview
+            - assetId: The URL itself (for deduplication)
+            - kind: "link" (type of media card)
+            - href: The URL to open when clicked
+            - title: Page title
+            - description: Page description (if available)
+            - thumb: Thumbnail/preview image URL (if available)
+            - domain: The domain name
+            - error: Error message (if fetch failed)
+        """
+        # Normalise first, so the ID and the domain are derived from the URL
+        # that is actually fetched and returned as ``assetId``/``href``.
+        url = url.strip()
+        if not url.lower().startswith(("http://", "https://")):
+            url = f"https://{url}"
+
+        preview_id = generate_preview_id(url)
+        domain = extract_domain(url)
+
+        # NOTE(security): ``url`` is caller-supplied and redirects are
+        # followed, so this request can reach loopback/internal addresses
+        # (SSRF). Restrict outbound hosts at the network layer, or validate
+        # resolved addresses, before exposing this tool to untrusted users.
+        try:
+            async with httpx.AsyncClient(
+                timeout=10.0,
+                follow_redirects=True,
+                headers={
+                    "User-Agent": "Mozilla/5.0 (compatible; SurfSenseBot/1.0; +https://surfsense.net)",
+                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+                    "Accept-Language": "en-US,en;q=0.5",
+                },
+            ) as client:
+                response = await client.get(url)
+                response.raise_for_status()
+
+                # Only (X)HTML documents carry the meta tags parsed below.
+                content_type = response.headers.get("content-type", "").lower()
+                is_markup = (
+                    "text/html" in content_type
+                    or "application/xhtml+xml" in content_type
+                )
+                if not is_markup:
+                    # Not a web page: describe it by file name. Using the URL
+                    # path keeps query strings/fragments out of the title.
+                    file_name = urlparse(url).path.rsplit("/", 1)[-1]
+                    return {
+                        "id": preview_id,
+                        "assetId": url,
+                        "kind": "link",
+                        "href": url,
+                        "title": file_name or domain,
+                        "description": f"File from {domain}",
+                        "domain": domain,
+                    }
+
+                page = response.text
+
+                # Attribute values and <title> text are entity-encoded in the
+                # markup; unescape() decodes each entity exactly once and also
+                # covers numeric and named entities.
+                title = unescape(extract_title(page) or domain)
+                description = extract_description(page)
+                image = extract_image(page)
+
+                if image:
+                    # Resolve relative, root-relative and protocol-relative
+                    # references against the final (post-redirect) page URL;
+                    # absolute URLs pass through unchanged.
+                    image = urljoin(str(response.url), unescape(image).strip())
+
+                if description:
+                    description = unescape(description)
+                    # Truncate long descriptions so the card stays compact.
+                    if len(description) > 200:
+                        description = description[:197] + "..."
+
+                return {
+                    "id": preview_id,
+                    "assetId": url,
+                    "kind": "link",
+                    "href": url,
+                    "title": title,
+                    "description": description,
+                    "thumb": image,
+                    "domain": domain,
+                }
+
+        except httpx.TimeoutException:
+            return _link_preview_error(preview_id, url, domain, "Request timed out")
+        except httpx.HTTPStatusError as e:
+            return _link_preview_error(
+                preview_id, url, domain, f"HTTP {e.response.status_code}"
+            )
+        except Exception as e:
+            # Tool boundary: the agent must always get a card back, so any
+            # other failure is logged and reported rather than raised.
+            error_message = str(e)
+            logger.warning(
+                "[link_preview] Error fetching %s: %s", url, error_message
+            )
+            return _link_preview_error(
+                preview_id, url, domain, f"Failed to fetch: {error_message[:50]}"
+            )
+
+    return link_preview
+