refactor: enhance link preview functionality with Chromium fallback

- Added a fallback mechanism using headless Chromium to fetch page content when standard HTTP requests fail.
- Introduced utility functions for unescaping HTML entities and converting relative URLs to absolute.
- Updated HTTP request headers to mimic a browser for better compatibility with web servers.
- Improved error handling and logging for better debugging and user feedback.
- Made various properties in Zod schemas nullable for better type safety and flexibility in handling optional data.
2026-04-30 03:16:25 +02:00 · 2025-12-26 00:07:45 +05:30 · 2025-12-26 00:07:45 +05:30 · bea18960a4
commit bea18960a4
parent 4c2de73694
7 changed files with 271 additions and 86 deletions
--- a/surfsense_backend/app/agents/new_chat/tools/link_preview.py
+++ b/surfsense_backend/app/agents/new_chat/tools/link_preview.py
@ -6,13 +6,19 @@ Open Graph image, etc.) to display rich link previews in the chat UI.
 """
 import hashlib
 import logging
 import re
 from typing import Any
 from urllib.parse import urlparse
 import httpx
 import trafilatura
 from fake_useragent import UserAgent
 from langchain_community.document_loaders import AsyncChromiumLoader
 from langchain_core.tools import tool
 logger = logging.getLogger(__name__)
 def extract_domain(url: str) -> str:
    """Extract the domain from a URL."""
@ -138,6 +144,96 @@ def generate_preview_id(url: str) -> str:
    return f"link-preview-{hash_val}"
 def _unescape_html(text: str) -> str:
    """Unescape common HTML entities."""
    return (
        text.replace("&amp;", "&")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", '"')
        .replace("&#39;", "'")
        .replace("&apos;", "'")
    )
 def _make_absolute_url(image_url: str, base_url: str) -> str:
    """Convert a relative image URL to an absolute URL."""
    if image_url.startswith(("http://", "https://")):
        return image_url
    if image_url.startswith("//"):
        return f"https:{image_url}"
    if image_url.startswith("/"):
        parsed = urlparse(base_url)
        return f"{parsed.scheme}://{parsed.netloc}{image_url}"
    return image_url
 async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
    """
    Load a page through headless Chromium and pull out its preview metadata.
    Intended as the fallback path for when a plain HTTP request is rejected
    or times out. The page is loaded with a randomly chosen User-Agent, then
    title/description are taken from Trafilatura's metadata and, where those
    are missing, from the module's own HTML extractors.
    Args:
        url: URL to fetch
    Returns:
        Dict with the keys ``title``, ``description``, ``image`` and
        ``raw_html``; ``None`` when the loader yields no document, the
        document is blank, or any exception is raised along the way
    """
    try:
        logger.info(f"[link_preview] Falling back to Chromium for {url}")
        # A randomized, realistic User-Agent lowers the odds of bot detection
        loader = AsyncChromiumLoader(
            urls=[url], headless=True, user_agent=UserAgent().random
        )
        pages = await loader.aload()
        if not pages:
            logger.warning(f"[link_preview] Chromium returned no documents for {url}")
            return None
        page_html = pages[0].page_content
        # Treat a missing or whitespace-only body as a failed fetch
        if not (page_html and page_html.strip()):
            logger.warning(f"[link_preview] Chromium returned empty content for {url}")
            return None
        # Trafilatura supplies title/description; the OG image is not taken
        # from it, so that comes from the raw HTML instead
        meta = trafilatura.extract_metadata(page_html)
        og_image = extract_image(page_html)
        page_title = meta.title if meta else None
        page_description = meta.description if meta else None
        # Fall back to the HTML extractors for anything Trafilatura left blank
        if not page_title:
            page_title = extract_title(page_html)
        if not page_description:
            page_description = extract_description(page_html)
        logger.info(f"[link_preview] Successfully fetched {url} via Chromium")
        return {
            "title": page_title,
            "description": page_description,
            "image": og_image,
            "raw_html": page_html,
        }
    except Exception as e:
        # Best-effort fallback: report the failure and let the caller degrade
        logger.error(f"[link_preview] Chromium fallback failed for {url}: {e}")
        return None
 def create_link_preview_tool():
    """
    Factory function to create the link_preview tool.
@ -184,13 +280,20 @@ def create_link_preview_tool():
            url = f"https://{url}"
        try:
            # Use a browser-like User-Agent to fetch Open Graph metadata.
            # This is the same approach used by Slack, Discord, Twitter, etc. for link previews.
            # We're only fetching publicly available metadata (title, description, thumbnail)
            # that websites intentionally expose via OG tags for link preview purposes.
            async with httpx.AsyncClient(
                timeout=10.0,
                follow_redirects=True,
                headers={
-                    "User-Agent": "Mozilla/5.0 (compatible; SurfSenseBot/1.0; +https://surfsense.net)",
+                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
-                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
-                    "Accept-Language": "en-US,en;q=0.5",
+                    "Accept-Language": "en-US,en;q=0.9",
                    "Accept-Encoding": "gzip, deflate, br",
                    "Cache-Control": "no-cache",
                    "Pragma": "no-cache",
                },
            ) as client:
                response = await client.get(url)
@ -218,32 +321,14 @@ def create_link_preview_tool():
                image = extract_image(html)
                # Make sure image URL is absolute
-                if image and not image.startswith(("http://", "https://")):
+                if image:
-                    if image.startswith("//"):
+                    image = _make_absolute_url(image, url)
                        image = f"https:{image}"
                    elif image.startswith("/"):
                        parsed = urlparse(url)
                        image = f"{parsed.scheme}://{parsed.netloc}{image}"
                # Clean up title and description (unescape HTML entities)
                if title:
-                    title = (
+                    title = _unescape_html(title)
                        title.replace("&amp;", "&")
                        .replace("&lt;", "<")
                        .replace("&gt;", ">")
                        .replace("&quot;", '"')
                        .replace("&#39;", "'")
                        .replace("&apos;", "'")
                    )
                if description:
-                    description = (
+                    description = _unescape_html(description)
                        description.replace("&amp;", "&")
                        .replace("&lt;", "<")
                        .replace("&gt;", ">")
                        .replace("&quot;", '"')
                        .replace("&#39;", "'")
                        .replace("&apos;", "'")
                    )
                    # Truncate long descriptions
                    if len(description) > 200:
                        description = description[:197] + "..."
@ -260,6 +345,37 @@ def create_link_preview_tool():
                }
        except httpx.TimeoutException:
            # Timeout - try Chromium fallback
            logger.warning(f"[link_preview] Timeout for {url}, trying Chromium fallback")
            chromium_result = await fetch_with_chromium(url)
            if chromium_result:
                title = chromium_result.get("title") or domain
                description = chromium_result.get("description")
                image = chromium_result.get("image")
                # Clean up and truncate
                if title:
                    title = _unescape_html(title)
                if description:
                    description = _unescape_html(description)
                    if len(description) > 200:
                        description = description[:197] + "..."
                # Make sure image URL is absolute
                if image:
                    image = _make_absolute_url(image, url)
                return {
                    "id": preview_id,
                    "assetId": url,
                    "kind": "link",
                    "href": url,
                    "title": title,
                    "description": description,
                    "thumb": image,
                    "domain": domain,
                }
            return {
                "id": preview_id,
                "assetId": url,
@ -270,6 +386,42 @@ def create_link_preview_tool():
                "error": "Request timed out",
            }
        except httpx.HTTPStatusError as e:
            status_code = e.response.status_code
            # For 403 (Forbidden) and similar bot-detection errors, try Chromium fallback
            if status_code in (403, 401, 406, 429):
                logger.warning(
                    f"[link_preview] HTTP {status_code} for {url}, trying Chromium fallback"
                )
                chromium_result = await fetch_with_chromium(url)
                if chromium_result:
                    title = chromium_result.get("title") or domain
                    description = chromium_result.get("description")
                    image = chromium_result.get("image")
                    # Clean up and truncate
                    if title:
                        title = _unescape_html(title)
                    if description:
                        description = _unescape_html(description)
                        if len(description) > 200:
                            description = description[:197] + "..."
                    # Make sure image URL is absolute
                    if image:
                        image = _make_absolute_url(image, url)
                    return {
                        "id": preview_id,
                        "assetId": url,
                        "kind": "link",
                        "href": url,
                        "title": title,
                        "description": description,
                        "thumb": image,
                        "domain": domain,
                    }
            return {
                "id": preview_id,
                "assetId": url,
@ -277,11 +429,11 @@ def create_link_preview_tool():
                "href": url,
                "title": domain or "Link",
                "domain": domain,
-                "error": f"HTTP {e.response.status_code}",
+                "error": f"HTTP {status_code}",
            }
        except Exception as e:
            error_message = str(e)
-            print(f"[link_preview] Error fetching {url}: {error_message}")
+            logger.error(f"[link_preview] Error fetching {url}: {error_message}")
            return {
                "id": preview_id,
                "assetId": url,
--- a/surfsense_web/components/tool-ui/article/index.tsx
+++ b/surfsense_web/components/tool-ui/article/index.tsx
@ -19,20 +19,20 @@ import { cn } from "@/lib/utils";
 */
 const SerializableArticleSchema = z.object({
 	id: z.string().default("article-unknown"),
-	assetId: z.string().optional(),
+	assetId: z.string().nullish(),
-	kind: z.literal("article").optional(),
+	kind: z.literal("article").nullish(),
 	title: z.string().default("Untitled Article"),
-	description: z.string().optional(),
+	description: z.string().nullish(),
-	content: z.string().optional(),
+	content: z.string().nullish(),
-	href: z.string().url().optional(),
+	href: z.string().url().nullish(),
-	domain: z.string().optional(),
+	domain: z.string().nullish(),
-	author: z.string().optional(),
+	author: z.string().nullish(),
-	date: z.string().optional(),
+	date: z.string().nullish(),
-	word_count: z.number().optional(),
+	word_count: z.number().nullish(),
-	wordCount: z.number().optional(),
+	wordCount: z.number().nullish(),
-	was_truncated: z.boolean().optional(),
+	was_truncated: z.boolean().nullish(),
-	wasTruncated: z.boolean().optional(),
+	wasTruncated: z.boolean().nullish(),
-	error: z.string().optional(),
+	error: z.string().nullish(),
 });
 /**
--- a/surfsense_web/components/tool-ui/deepagent-thinking.tsx
+++ b/surfsense_web/components/tool-ui/deepagent-thinking.tsx
@ -70,12 +70,12 @@ const ThinkingStepSchema = z.object({
 });
 const DeepAgentThinkingArgsSchema = z.object({
-	query: z.string().optional(),
+	query: z.string().nullish(),
-	context: z.string().optional(),
+	context: z.string().nullish(),
 });
 const DeepAgentThinkingResultSchema = z.object({
-	steps: z.array(ThinkingStepSchema).optional(),
+	steps: z.array(ThinkingStepSchema).nullish(),
 	status: z
 		.enum([
 			THINKING_STATUS.THINKING,
@ -83,8 +83,8 @@ const DeepAgentThinkingResultSchema = z.object({
 			THINKING_STATUS.SYNTHESIZING,
 			THINKING_STATUS.COMPLETED,
 		])
-		.optional(),
+		.nullish(),
-	summary: z.string().optional(),
+	summary: z.string().nullish(),
 });
 /** Types derived from Zod schemas */
@ -325,7 +325,7 @@ export const DeepAgentThinkingToolUI = makeAssistantToolUI<
 	render: function DeepAgentThinkingUI({ result, status }) {
 		// Loading state - tool is still running
 		if (status.type === "running" || status.type === "requires-action") {
-			return <ThinkingLoadingState status={result?.status} />;
+			return <ThinkingLoadingState status={result?.status ?? undefined} />;
 		}
 		// Incomplete/cancelled state
--- a/surfsense_web/components/tool-ui/display-image.tsx
+++ b/surfsense_web/components/tool-ui/display-image.tsx
@ -23,7 +23,7 @@ interface DisplayImageResult {
 	id: string;
 	assetId: string;
 	src: string;
-	alt: string;
+	alt?: string;  // Made optional - parseSerializableImage provides fallback
 	title?: string;
 	description?: string;
 	domain?: string;
--- a/surfsense_web/components/tool-ui/generate-podcast.tsx
+++ b/surfsense_web/components/tool-ui/generate-podcast.tsx
@ -14,27 +14,27 @@ import { clearActivePodcastTaskId, setActivePodcastTaskId } from "@/lib/chat/pod
 */
 const GeneratePodcastArgsSchema = z.object({
 	source_content: z.string(),
-	podcast_title: z.string().optional(),
+	podcast_title: z.string().nullish(),
-	user_prompt: z.string().optional(),
+	user_prompt: z.string().nullish(),
 });
 const GeneratePodcastResultSchema = z.object({
 	status: z.enum(["processing", "already_generating", "success", "error"]),
-	task_id: z.string().optional(),
+	task_id: z.string().nullish(),
-	podcast_id: z.number().optional(),
+	podcast_id: z.number().nullish(),
-	title: z.string().optional(),
+	title: z.string().nullish(),
-	transcript_entries: z.number().optional(),
+	transcript_entries: z.number().nullish(),
-	message: z.string().optional(),
+	message: z.string().nullish(),
-	error: z.string().optional(),
+	error: z.string().nullish(),
 });
 const TaskStatusResponseSchema = z.object({
 	status: z.enum(["processing", "success", "error"]),
-	podcast_id: z.number().optional(),
+	podcast_id: z.number().nullish(),
-	title: z.string().optional(),
+	title: z.string().nullish(),
-	transcript_entries: z.number().optional(),
+	transcript_entries: z.number().nullish(),
-	state: z.string().optional(),
+	state: z.string().nullish(),
-	error: z.string().optional(),
+	error: z.string().nullish(),
 });
 const PodcastTranscriptEntrySchema = z.object({
@ -43,7 +43,7 @@ const PodcastTranscriptEntrySchema = z.object({
 });
 const PodcastDetailsSchema = z.object({
-	podcast_transcript: z.array(PodcastTranscriptEntrySchema).optional(),
+	podcast_transcript: z.array(PodcastTranscriptEntrySchema).nullish(),
 });
 /**
@ -75,7 +75,9 @@ function parsePodcastDetails(data: unknown): { podcast_transcript?: PodcastTrans
 		console.warn("Invalid podcast details:", result.error.issues);
 		return {};
 	}
-	return result.data;
+	return {
 		podcast_transcript: result.data.podcast_transcript ?? undefined,
 	};
 }
 /**
--- a/surfsense_web/components/tool-ui/image/index.tsx
+++ b/surfsense_web/components/tool-ui/image/index.tsx
@ -11,26 +11,26 @@ import { cn } from "@/lib/utils";
 /**
 * Zod schemas for runtime validation
 */
-const AspectRatioSchema = z.enum(["1:1", "4:3", "16:9", "9:16", "auto"]);
+const AspectRatioSchema = z.enum(["1:1", "4:3", "16:9", "9:16", "21:9", "auto"]);
 const ImageFitSchema = z.enum(["cover", "contain"]);
 const ImageSourceSchema = z.object({
 	label: z.string(),
-	iconUrl: z.string().optional(),
+	iconUrl: z.string().nullish(),
-	url: z.string().optional(),
+	url: z.string().nullish(),
 });
 const SerializableImageSchema = z.object({
 	id: z.string(),
 	assetId: z.string(),
 	src: z.string(),
-	alt: z.string(),
+	alt: z.string().nullish(),  // Made optional - will use fallback if missing
-	title: z.string().optional(),
+	title: z.string().nullish(),
-	description: z.string().optional(),
+	description: z.string().nullish(),
-	href: z.string().optional(),
+	href: z.string().nullish(),
-	domain: z.string().optional(),
+	domain: z.string().nullish(),
-	ratio: AspectRatioSchema.optional(),
+	ratio: AspectRatioSchema.nullish(),
-	source: ImageSourceSchema.optional(),
+	source: ImageSourceSchema.nullish(),
 });
 /**
@ -48,7 +48,7 @@ export interface ImageProps {
 	id: string;
 	assetId: string;
 	src: string;
-	alt: string;
+	alt?: string;  // Optional with default fallback
 	title?: string;
 	description?: string;
 	href?: string;
@ -62,18 +62,45 @@ export interface ImageProps {
 /**
 * Parse and validate serializable image from tool result
 * Returns a valid SerializableImage with fallback values for missing optional fields
 */
-export function parseSerializableImage(result: unknown): SerializableImage {
+export function parseSerializableImage(result: unknown): SerializableImage & { alt: string } {
 	const parsed = SerializableImageSchema.safeParse(result);
 	if (!parsed.success) {
 		console.warn("Invalid image data:", parsed.error.issues);
-		// Try to extract basic info for error display
+		
 		// Try to extract basic info and return a fallback object
 		const obj = (result && typeof result === "object" ? result : {}) as Record<string, unknown>;
 		// If we have at least id, assetId, and src, we can still render the image
 		if (
 			typeof obj.id === "string" &&
 			typeof obj.assetId === "string" &&
 			typeof obj.src === "string"
 		) {
 			return {
 				id: obj.id,
 				assetId: obj.assetId,
 				src: obj.src,
 				alt: typeof obj.alt === "string" ? obj.alt : "Image",
 				title: typeof obj.title === "string" ? obj.title : undefined,
 				description: typeof obj.description === "string" ? obj.description : undefined,
 				href: typeof obj.href === "string" ? obj.href : undefined,
 				domain: typeof obj.domain === "string" ? obj.domain : undefined,
 				ratio: undefined, // Use default ratio
 				source: undefined,
 			};
 		}
 		throw new Error(`Invalid image: ${parsed.error.issues.map((i) => i.message).join(", ")}`);
 	}
-	return parsed.data;
+	// Provide fallback for alt if it's null/undefined
 	return {
 		...parsed.data,
 		alt: parsed.data.alt ?? "Image",
 	};
 }
 /**
@ -89,6 +116,8 @@ function getAspectRatioClass(ratio?: AspectRatio): string {
 			return "aspect-video";
 		case "9:16":
 			return "aspect-[9/16]";
 		case "21:9":
 			return "aspect-[21/9]";
 		case "auto":
 		default:
 			return "aspect-[4/3]";
@ -172,7 +201,7 @@ export function ImageLoading({ title = "Loading image..." }: { title?: string })
 export function Image({
 	id,
 	src,
-	alt,
+	alt = "Image",
 	title,
 	description,
 	href,
--- a/surfsense_web/components/tool-ui/media-card/index.tsx
+++ b/surfsense_web/components/tool-ui/media-card/index.tsx
@ -13,27 +13,27 @@ import { cn } from "@/lib/utils";
 /**
 * Zod schemas for runtime validation
 */
-const AspectRatioSchema = z.enum(["1:1", "4:3", "16:9", "21:9", "auto"]);
+const AspectRatioSchema = z.enum(["1:1", "4:3", "16:9", "9:16", "21:9", "auto"]);
 const MediaCardKindSchema = z.enum(["link", "image", "video", "audio"]);
 const ResponseActionSchema = z.object({
 	id: z.string(),
 	label: z.string(),
-	variant: z.enum(["default", "secondary", "outline", "destructive", "ghost"]).optional(),
+	variant: z.enum(["default", "secondary", "outline", "destructive", "ghost"]).nullish(),
-	confirmLabel: z.string().optional(),
+	confirmLabel: z.string().nullish(),
 });
 const SerializableMediaCardSchema = z.object({
 	id: z.string(),
 	assetId: z.string(),
 	kind: MediaCardKindSchema,
-	href: z.string().optional(),
+	href: z.string().nullish(),
-	src: z.string().optional(),
+	src: z.string().nullish(),
 	title: z.string(),
-	description: z.string().optional(),
+	description: z.string().nullish(),
-	thumb: z.string().optional(),
+	thumb: z.string().nullish(),
-	ratio: AspectRatioSchema.optional(),
+	ratio: AspectRatioSchema.nullish(),
-	domain: z.string().optional(),
+	domain: z.string().nullish(),
 });
 /**
@ -90,6 +90,8 @@ function getAspectRatioClass(ratio?: AspectRatio): string {
 			return "aspect-[4/3]";
 		case "16:9":
 			return "aspect-video";
 		case "9:16":
 			return "aspect-[9/16]";
 		case "21:9":
 			return "aspect-[21/9]";
 		case "auto":