refactor: enhance link preview functionality with Chromium fallback

- Added a fallback mechanism using headless Chromium to fetch page content when standard HTTP requests fail.
- Introduced utility functions for unescaping HTML entities and converting relative URLs to absolute.
- Updated HTTP request headers to mimic a browser for better compatibility with web servers.
- Improved error handling and logging for better debugging and user feedback.
- Made various properties in Zod schemas nullable for better type safety and flexibility in handling optional data.
2026-04-25 00:36:31 +02:00 · 2025-12-26 00:07:45 +05:30 · 2025-12-26 00:07:45 +05:30 · bea18960a4
commit bea18960a4
parent 4c2de73694
7 changed files with 271 additions and 86 deletions
--- a/surfsense_backend/app/agents/new_chat/tools/link_preview.py
+++ b/surfsense_backend/app/agents/new_chat/tools/link_preview.py
@ -6,13 +6,19 @@ Open Graph image, etc.) to display rich link previews in the chat UI.
 """

 import hashlib
+import logging
 import re
 from typing import Any
 from urllib.parse import urlparse

 import httpx
+import trafilatura
+from fake_useragent import UserAgent
+from langchain_community.document_loaders import AsyncChromiumLoader
 from langchain_core.tools import tool

+logger = logging.getLogger(__name__)
+

 def extract_domain(url: str) -> str:
    """Extract the domain from a URL."""
@ -138,6 +144,96 @@ def generate_preview_id(url: str) -> str:
    return f"link-preview-{hash_val}"


+def _unescape_html(text: str) -> str:
+    """
+    Decode HTML character references in a string.
+
+    Decoding happens in a single pass, so already-escaped text is only
+    unescaped once (``&amp;lt;`` becomes ``&lt;``, not ``<``). All named and
+    numeric references are handled (``&amp;``, ``&#39;``, ``&#x27;``,
+    ``&nbsp;``, ``&mdash;``, ...), not just a fixed handful.
+
+    Args:
+        text: Text that may contain HTML character references
+
+    Returns:
+        The text with character references replaced by the characters they denote
+    """
+    # Function-scope import keeps this helper self-contained.
+    from html import unescape
+
+    return unescape(text)
+
+
+def _make_absolute_url(image_url: str, base_url: str) -> str:
+    """
+    Convert a relative image URL to an absolute URL.
+
+    Args:
+        image_url: Image URL as found in the page; may be absolute,
+            protocol-relative (``//host/x.png``), root-relative (``/x.png``)
+            or document-relative (``img/x.png``, ``../x.png``)
+        base_url: URL of the page the image URL was found on
+
+    Returns:
+        ``image_url`` unchanged when it is empty or already an http(s) URL,
+        otherwise the URL resolved against ``base_url``
+    """
+    # Function-scope import: the module itself only imports urlparse.
+    from urllib.parse import urljoin
+
+    if not image_url or image_url.startswith(("http://", "https://")):
+        return image_url
+    if image_url.startswith("//"):
+        # Protocol-relative URL: default to https regardless of the page scheme.
+        return f"https:{image_url}"
+    # Resolves root-relative paths as well as document-relative ones such as
+    # "img/og.png" or "../og.png"; URLs with other schemes (e.g. data:) are
+    # returned unchanged by urljoin.
+    return urljoin(base_url, image_url)
+
+
+async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
+    """
+    Fetch page content using headless Chromium browser.
+    Used as a fallback when simple HTTP requests are blocked (403, etc.)
+    or time out.
+
+    This function never raises: any exception is logged together with its
+    traceback and reported to the caller as ``None``.
+
+    Args:
+        url: URL to fetch
+
+    Returns:
+        Dict with title, description, image, and raw_html, or None if failed
+    """
+    try:
+        logger.info("[link_preview] Falling back to Chromium for %s", url)
+
+        # Generate a realistic User-Agent to avoid bot detection
+        user_agent = UserAgent().random
+
+        # Use AsyncChromiumLoader to fetch the page
+        crawl_loader = AsyncChromiumLoader(
+            urls=[url], headless=True, user_agent=user_agent
+        )
+        documents = await crawl_loader.aload()
+
+        if not documents:
+            logger.warning("[link_preview] Chromium returned no documents for %s", url)
+            return None
+
+        raw_html = documents[0].page_content
+
+        # Treat None, "" and whitespace-only content alike as "nothing fetched"
+        if not raw_html or not raw_html.strip():
+            logger.warning("[link_preview] Chromium returned empty content for %s", url)
+            return None
+
+        # Extract metadata using Trafilatura
+        trafilatura_metadata = trafilatura.extract_metadata(raw_html)
+
+        # Extract OG image from raw HTML (trafilatura doesn't extract this)
+        image = extract_image(raw_html)
+
+        result = {
+            "title": None,
+            "description": None,
+            "image": image,
+            "raw_html": raw_html,
+        }
+
+        if trafilatura_metadata:
+            result["title"] = trafilatura_metadata.title
+            result["description"] = trafilatura_metadata.description
+
+        # If trafilatura didn't get the title/description, fall back to the
+        # module's own HTML extractors
+        if not result["title"]:
+            result["title"] = extract_title(raw_html)
+        if not result["description"]:
+            result["description"] = extract_description(raw_html)
+
+        logger.info("[link_preview] Successfully fetched %s via Chromium", url)
+        return result
+
+    except Exception as e:
+        # Deliberate catch-all: the fallback is best-effort and must not break
+        # the preview tool. logger.exception keeps the traceback for debugging.
+        logger.exception("[link_preview] Chromium fallback failed for %s: %s", url, e)
+        return None
+
+
 def create_link_preview_tool():
    """
    Factory function to create the link_preview tool.
@ -184,13 +280,20 @@ def create_link_preview_tool():
            url = f"https://{url}"

        try:
+            # Use a browser-like User-Agent to fetch Open Graph metadata.
+            # This is the same approach used by Slack, Discord, Twitter, etc. for link previews.
+            # We're only fetching publicly available metadata (title, description, thumbnail)
+            # that websites intentionally expose via OG tags for link preview purposes.
            async with httpx.AsyncClient(
                timeout=10.0,
                follow_redirects=True,
                headers={
-                    "User-Agent": "Mozilla/5.0 (compatible; SurfSenseBot/1.0; +https://surfsense.net)",
-                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-                    "Accept-Language": "en-US,en;q=0.5",
+                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
+                    "Accept-Language": "en-US,en;q=0.9",
+                    "Accept-Encoding": "gzip, deflate, br",
+                    "Cache-Control": "no-cache",
+                    "Pragma": "no-cache",
                },
            ) as client:
                response = await client.get(url)
@ -218,32 +321,14 @@ def create_link_preview_tool():
                image = extract_image(html)

                # Make sure image URL is absolute
-                if image and not image.startswith(("http://", "https://")):
-                    if image.startswith("//"):
-                        image = f"https:{image}"
-                    elif image.startswith("/"):
-                        parsed = urlparse(url)
-                        image = f"{parsed.scheme}://{parsed.netloc}{image}"
+                if image:
+                    image = _make_absolute_url(image, url)

                # Clean up title and description (unescape HTML entities)
                if title:
-                    title = (
-                        title.replace("&amp;", "&")
-                        .replace("&lt;", "<")
-                        .replace("&gt;", ">")
-                        .replace("&quot;", '"')
-                        .replace("&#39;", "'")
-                        .replace("&apos;", "'")
-                    )
+                    title = _unescape_html(title)
                if description:
-                    description = (
-                        description.replace("&amp;", "&")
-                        .replace("&lt;", "<")
-                        .replace("&gt;", ">")
-                        .replace("&quot;", '"')
-                        .replace("&#39;", "'")
-                        .replace("&apos;", "'")
-                    )
+                    description = _unescape_html(description)
                    # Truncate long descriptions
                    if len(description) > 200:
                        description = description[:197] + "..."
@ -260,6 +345,37 @@ def create_link_preview_tool():
                }

        except httpx.TimeoutException:
+            # Timeout - try Chromium fallback
+            logger.warning(f"[link_preview] Timeout for {url}, trying Chromium fallback")
+            chromium_result = await fetch_with_chromium(url)
+            if chromium_result:
+                title = chromium_result.get("title") or domain
+                description = chromium_result.get("description")
+                image = chromium_result.get("image")
+                
+                # Clean up and truncate
+                if title:
+                    title = _unescape_html(title)
+                if description:
+                    description = _unescape_html(description)
+                    if len(description) > 200:
+                        description = description[:197] + "..."
+                
+                # Make sure image URL is absolute
+                if image:
+                    image = _make_absolute_url(image, url)
+                
+                return {
+                    "id": preview_id,
+                    "assetId": url,
+                    "kind": "link",
+                    "href": url,
+                    "title": title,
+                    "description": description,
+                    "thumb": image,
+                    "domain": domain,
+                }
+            
            return {
                "id": preview_id,
                "assetId": url,
@ -270,6 +386,42 @@ def create_link_preview_tool():
                "error": "Request timed out",
            }
        except httpx.HTTPStatusError as e:
+            status_code = e.response.status_code
+            
+            # For 403 (Forbidden) and similar bot-detection errors, try Chromium fallback
+            if status_code in (403, 401, 406, 429):
+                logger.warning(
+                    f"[link_preview] HTTP {status_code} for {url}, trying Chromium fallback"
+                )
+                chromium_result = await fetch_with_chromium(url)
+                if chromium_result:
+                    title = chromium_result.get("title") or domain
+                    description = chromium_result.get("description")
+                    image = chromium_result.get("image")
+                    
+                    # Clean up and truncate
+                    if title:
+                        title = _unescape_html(title)
+                    if description:
+                        description = _unescape_html(description)
+                        if len(description) > 200:
+                            description = description[:197] + "..."
+                    
+                    # Make sure image URL is absolute
+                    if image:
+                        image = _make_absolute_url(image, url)
+                    
+                    return {
+                        "id": preview_id,
+                        "assetId": url,
+                        "kind": "link",
+                        "href": url,
+                        "title": title,
+                        "description": description,
+                        "thumb": image,
+                        "domain": domain,
+                    }
+            
            return {
                "id": preview_id,
                "assetId": url,
@ -277,11 +429,11 @@ def create_link_preview_tool():
                "href": url,
                "title": domain or "Link",
                "domain": domain,
-                "error": f"HTTP {e.response.status_code}",
+                "error": f"HTTP {status_code}",
            }
        except Exception as e:
            error_message = str(e)
-            print(f"[link_preview] Error fetching {url}: {error_message}")
+            logger.error(f"[link_preview] Error fetching {url}: {error_message}")
            return {
                "id": preview_id,
                "assetId": url,
--- a/surfsense_web/components/tool-ui/article/index.tsx
+++ b/surfsense_web/components/tool-ui/article/index.tsx
@ -19,20 +19,20 @@ import { cn } from "@/lib/utils";
 */
 const SerializableArticleSchema = z.object({
 	id: z.string().default("article-unknown"),
-	assetId: z.string().optional(),
-	kind: z.literal("article").optional(),
+	assetId: z.string().nullish(),
+	kind: z.literal("article").nullish(),
 	title: z.string().default("Untitled Article"),
-	description: z.string().optional(),
-	content: z.string().optional(),
-	href: z.string().url().optional(),
-	domain: z.string().optional(),
-	author: z.string().optional(),
-	date: z.string().optional(),
-	word_count: z.number().optional(),
-	wordCount: z.number().optional(),
-	was_truncated: z.boolean().optional(),
-	wasTruncated: z.boolean().optional(),
-	error: z.string().optional(),
+	description: z.string().nullish(),
+	content: z.string().nullish(),
+	href: z.string().url().nullish(),
+	domain: z.string().nullish(),
+	author: z.string().nullish(),
+	date: z.string().nullish(),
+	word_count: z.number().nullish(),
+	wordCount: z.number().nullish(),
+	was_truncated: z.boolean().nullish(),
+	wasTruncated: z.boolean().nullish(),
+	error: z.string().nullish(),
 });

 /**
--- a/surfsense_web/components/tool-ui/deepagent-thinking.tsx
+++ b/surfsense_web/components/tool-ui/deepagent-thinking.tsx
@ -70,12 +70,12 @@ const ThinkingStepSchema = z.object({
 });

 const DeepAgentThinkingArgsSchema = z.object({
-	query: z.string().optional(),
-	context: z.string().optional(),
+	query: z.string().nullish(),
+	context: z.string().nullish(),
 });

 const DeepAgentThinkingResultSchema = z.object({
-	steps: z.array(ThinkingStepSchema).optional(),
+	steps: z.array(ThinkingStepSchema).nullish(),
 	status: z
 		.enum([
 			THINKING_STATUS.THINKING,
@ -83,8 +83,8 @@ const DeepAgentThinkingResultSchema = z.object({
 			THINKING_STATUS.SYNTHESIZING,
 			THINKING_STATUS.COMPLETED,
 		])
-		.optional(),
-	summary: z.string().optional(),
+		.nullish(),
+	summary: z.string().nullish(),
 });

 /** Types derived from Zod schemas */
@ -325,7 +325,7 @@ export const DeepAgentThinkingToolUI = makeAssistantToolUI<
 	render: function DeepAgentThinkingUI({ result, status }) {
 		// Loading state - tool is still running
 		if (status.type === "running" || status.type === "requires-action") {
-			return <ThinkingLoadingState status={result?.status} />;
+			return <ThinkingLoadingState status={result?.status ?? undefined} />;
 		}

 		// Incomplete/cancelled state
--- a/surfsense_web/components/tool-ui/display-image.tsx
+++ b/surfsense_web/components/tool-ui/display-image.tsx
@ -23,7 +23,7 @@ interface DisplayImageResult {
 	id: string;
 	assetId: string;
 	src: string;
-	alt: string;
+	alt?: string;  // Made optional - parseSerializableImage provides fallback
 	title?: string;
 	description?: string;
 	domain?: string;
--- a/surfsense_web/components/tool-ui/generate-podcast.tsx
+++ b/surfsense_web/components/tool-ui/generate-podcast.tsx
@ -14,27 +14,27 @@ import { clearActivePodcastTaskId, setActivePodcastTaskId } from "@/lib/chat/pod
 */
 const GeneratePodcastArgsSchema = z.object({
 	source_content: z.string(),
-	podcast_title: z.string().optional(),
-	user_prompt: z.string().optional(),
+	podcast_title: z.string().nullish(),
+	user_prompt: z.string().nullish(),
 });

 const GeneratePodcastResultSchema = z.object({
 	status: z.enum(["processing", "already_generating", "success", "error"]),
-	task_id: z.string().optional(),
-	podcast_id: z.number().optional(),
-	title: z.string().optional(),
-	transcript_entries: z.number().optional(),
-	message: z.string().optional(),
-	error: z.string().optional(),
+	task_id: z.string().nullish(),
+	podcast_id: z.number().nullish(),
+	title: z.string().nullish(),
+	transcript_entries: z.number().nullish(),
+	message: z.string().nullish(),
+	error: z.string().nullish(),
 });

 const TaskStatusResponseSchema = z.object({
 	status: z.enum(["processing", "success", "error"]),
-	podcast_id: z.number().optional(),
-	title: z.string().optional(),
-	transcript_entries: z.number().optional(),
-	state: z.string().optional(),
-	error: z.string().optional(),
+	podcast_id: z.number().nullish(),
+	title: z.string().nullish(),
+	transcript_entries: z.number().nullish(),
+	state: z.string().nullish(),
+	error: z.string().nullish(),
 });

 const PodcastTranscriptEntrySchema = z.object({
@ -43,7 +43,7 @@ const PodcastTranscriptEntrySchema = z.object({
 });

 const PodcastDetailsSchema = z.object({
-	podcast_transcript: z.array(PodcastTranscriptEntrySchema).optional(),
+	podcast_transcript: z.array(PodcastTranscriptEntrySchema).nullish(),
 });

 /**
@ -75,7 +75,9 @@ function parsePodcastDetails(data: unknown): { podcast_transcript?: PodcastTrans
 		console.warn("Invalid podcast details:", result.error.issues);
 		return {};
 	}
-	return result.data;
+	return {
+		podcast_transcript: result.data.podcast_transcript ?? undefined,
+	};
 }

 /**
--- a/surfsense_web/components/tool-ui/image/index.tsx
+++ b/surfsense_web/components/tool-ui/image/index.tsx
@ -11,26 +11,26 @@ import { cn } from "@/lib/utils";
 /**
 * Zod schemas for runtime validation
 */
-const AspectRatioSchema = z.enum(["1:1", "4:3", "16:9", "9:16", "auto"]);
+const AspectRatioSchema = z.enum(["1:1", "4:3", "16:9", "9:16", "21:9", "auto"]);
 const ImageFitSchema = z.enum(["cover", "contain"]);

 const ImageSourceSchema = z.object({
 	label: z.string(),
-	iconUrl: z.string().optional(),
-	url: z.string().optional(),
+	iconUrl: z.string().nullish(),
+	url: z.string().nullish(),
 });

 const SerializableImageSchema = z.object({
 	id: z.string(),
 	assetId: z.string(),
 	src: z.string(),
-	alt: z.string(),
-	title: z.string().optional(),
-	description: z.string().optional(),
-	href: z.string().optional(),
-	domain: z.string().optional(),
-	ratio: AspectRatioSchema.optional(),
-	source: ImageSourceSchema.optional(),
+	alt: z.string().nullish(),  // Made optional - will use fallback if missing
+	title: z.string().nullish(),
+	description: z.string().nullish(),
+	href: z.string().nullish(),
+	domain: z.string().nullish(),
+	ratio: AspectRatioSchema.nullish(),
+	source: ImageSourceSchema.nullish(),
 });

 /**
@ -48,7 +48,7 @@ export interface ImageProps {
 	id: string;
 	assetId: string;
 	src: string;
-	alt: string;
+	alt?: string;  // Optional with default fallback
 	title?: string;
 	description?: string;
 	href?: string;
@ -62,18 +62,45 @@ export interface ImageProps {

 /**
 * Parse and validate serializable image from tool result
+ * Returns a valid SerializableImage with fallback values for missing optional fields
 */
-export function parseSerializableImage(result: unknown): SerializableImage {
+export function parseSerializableImage(result: unknown): SerializableImage & { alt: string } {
 	const parsed = SerializableImageSchema.safeParse(result);

 	if (!parsed.success) {
 		console.warn("Invalid image data:", parsed.error.issues);
-		// Try to extract basic info for error display
+		
+		// Try to extract basic info and return a fallback object
 		const obj = (result && typeof result === "object" ? result : {}) as Record<string, unknown>;
+		
+		// If we have at least id, assetId, and src, we can still render the image
+		if (
+			typeof obj.id === "string" &&
+			typeof obj.assetId === "string" &&
+			typeof obj.src === "string"
+		) {
+			return {
+				id: obj.id,
+				assetId: obj.assetId,
+				src: obj.src,
+				alt: typeof obj.alt === "string" ? obj.alt : "Image",
+				title: typeof obj.title === "string" ? obj.title : undefined,
+				description: typeof obj.description === "string" ? obj.description : undefined,
+				href: typeof obj.href === "string" ? obj.href : undefined,
+				domain: typeof obj.domain === "string" ? obj.domain : undefined,
+				ratio: undefined, // Use default ratio
+				source: undefined,
+			};
+		}
+		
 		throw new Error(`Invalid image: ${parsed.error.issues.map((i) => i.message).join(", ")}`);
 	}

-	return parsed.data;
+	// Provide fallback for alt if it's null/undefined
+	return {
+		...parsed.data,
+		alt: parsed.data.alt ?? "Image",
+	};
 }

 /**
@ -89,6 +116,8 @@ function getAspectRatioClass(ratio?: AspectRatio): string {
 			return "aspect-video";
 		case "9:16":
 			return "aspect-[9/16]";
+		case "21:9":
+			return "aspect-[21/9]";
 		case "auto":
 		default:
 			return "aspect-[4/3]";
@ -172,7 +201,7 @@ export function ImageLoading({ title = "Loading image..." }: { title?: string })
 export function Image({
 	id,
 	src,
-	alt,
+	alt = "Image",
 	title,
 	description,
 	href,
--- a/surfsense_web/components/tool-ui/media-card/index.tsx
+++ b/surfsense_web/components/tool-ui/media-card/index.tsx
@ -13,27 +13,27 @@ import { cn } from "@/lib/utils";
 /**
 * Zod schemas for runtime validation
 */
-const AspectRatioSchema = z.enum(["1:1", "4:3", "16:9", "21:9", "auto"]);
+const AspectRatioSchema = z.enum(["1:1", "4:3", "16:9", "9:16", "21:9", "auto"]);
 const MediaCardKindSchema = z.enum(["link", "image", "video", "audio"]);

 const ResponseActionSchema = z.object({
 	id: z.string(),
 	label: z.string(),
-	variant: z.enum(["default", "secondary", "outline", "destructive", "ghost"]).optional(),
-	confirmLabel: z.string().optional(),
+	variant: z.enum(["default", "secondary", "outline", "destructive", "ghost"]).nullish(),
+	confirmLabel: z.string().nullish(),
 });

 const SerializableMediaCardSchema = z.object({
 	id: z.string(),
 	assetId: z.string(),
 	kind: MediaCardKindSchema,
-	href: z.string().optional(),
-	src: z.string().optional(),
+	href: z.string().nullish(),
+	src: z.string().nullish(),
 	title: z.string(),
-	description: z.string().optional(),
-	thumb: z.string().optional(),
-	ratio: AspectRatioSchema.optional(),
-	domain: z.string().optional(),
+	description: z.string().nullish(),
+	thumb: z.string().nullish(),
+	ratio: AspectRatioSchema.nullish(),
+	domain: z.string().nullish(),
 });

 /**
@ -90,6 +90,8 @@ function getAspectRatioClass(ratio?: AspectRatio): string {
 			return "aspect-[4/3]";
 		case "16:9":
 			return "aspect-video";
+		case "9:16":
+			return "aspect-[9/16]";
 		case "21:9":
 			return "aspect-[21/9]";
 		case "auto":