diff --git a/surfsense_backend/app/agents/new_chat/tools/link_preview.py b/surfsense_backend/app/agents/new_chat/tools/link_preview.py
index 188863015..90b5da1d7 100644
--- a/surfsense_backend/app/agents/new_chat/tools/link_preview.py
+++ b/surfsense_backend/app/agents/new_chat/tools/link_preview.py
@@ -6,13 +6,19 @@ Open Graph image, etc.) to display rich link previews in the chat UI.
"""
import hashlib
+import logging
import re
from typing import Any
from urllib.parse import urlparse
import httpx
+import trafilatura
+from fake_useragent import UserAgent
+from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_core.tools import tool
+logger = logging.getLogger(__name__)
+
def extract_domain(url: str) -> str:
"""Extract the domain from a URL."""
@@ -138,6 +144,96 @@ def generate_preview_id(url: str) -> str:
return f"link-preview-{hash_val}"
+def _unescape_html(text: str) -> str:
+    """Unescape HTML character references in ``text``.
+
+    Delegates to the standard library so every named and numeric reference
+    is decoded in a single pass. A hand-written chain of ``str.replace``
+    calls only covers a handful of entities and, because it decodes the
+    ampersand first, turns an escaped reference such as ``&amp;lt;`` into
+    ``<`` instead of the literal text ``&lt;``.
+
+    Args:
+        text: String that may contain HTML character references.
+
+    Returns:
+        The text with each character reference decoded exactly once.
+    """
+    # Local import keeps this helper self-contained.
+    import html
+
+    return html.unescape(text)
+
+
+def _make_absolute_url(image_url: str, base_url: str) -> str:
+    """Resolve an image URL found on a page into an absolute URL.
+
+    Args:
+        image_url: URL taken from the page metadata. It may be absolute,
+            protocol-relative (``//host/path``), root-relative (``/path``)
+            or relative to the page (``images/a.png``).
+        base_url: URL of the page the image reference was found on.
+
+    Returns:
+        The absolute URL. ``http(s)`` URLs and empty strings are returned
+        untouched, protocol-relative URLs are given the ``https`` scheme,
+        and everything else is resolved against ``base_url``.
+    """
+    # Local import keeps this helper self-contained.
+    from urllib.parse import urljoin
+
+    if not image_url or image_url.startswith(("http://", "https://")):
+        return image_url
+    if image_url.startswith("//"):
+        # Protocol-relative URLs are always upgraded to https, whatever the
+        # scheme of the page they were found on.
+        return f"https:{image_url}"
+    # urljoin handles root-relative paths as well as page-relative ones such
+    # as "images/a.png" or "../a.png", which would otherwise stay unresolved.
+    return urljoin(base_url, image_url)
+
+
+async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
+    """
+    Load a page in headless Chromium and extract link-preview metadata.
+
+    Intended as the fallback path for pages that a plain HTTP request
+    cannot retrieve. Any exception raised while loading or parsing is
+    logged and reported to the caller as ``None``.
+
+    Args:
+        url: URL to fetch
+
+    Returns:
+        Dict with the keys title, description, image and raw_html, or None
+        when Chromium yields no usable content or an error occurs
+    """
+    try:
+        logger.info(f"[link_preview] Falling back to Chromium for {url}")
+
+        # A randomly chosen browser User-Agent makes the request look less
+        # like an automated client.
+        loader = AsyncChromiumLoader(
+            urls=[url], headless=True, user_agent=UserAgent().random
+        )
+        loaded_docs = await loader.aload()
+        if not loaded_docs:
+            logger.warning(f"[link_preview] Chromium returned no documents for {url}")
+            return None
+
+        page_html = loaded_docs[0].page_content
+        if not (page_html and page_html.strip()):
+            logger.warning(f"[link_preview] Chromium returned empty content for {url}")
+            return None
+
+        # Trafilatura supplies title/description; the image is read from the
+        # raw HTML by the module's own extractor.
+        meta = trafilatura.extract_metadata(page_html)
+        og_image = extract_image(page_html)
+
+        title = meta.title if meta else None
+        description = meta.description if meta else None
+
+        # Fall back to the module's HTML extractors for anything still missing.
+        title = title or extract_title(page_html)
+        description = description or extract_description(page_html)
+
+        logger.info(f"[link_preview] Successfully fetched {url} via Chromium")
+        return {
+            "title": title,
+            "description": description,
+            "image": og_image,
+            "raw_html": page_html,
+        }
+
+    except Exception as e:
+        logger.error(f"[link_preview] Chromium fallback failed for {url}: {e}")
+        return None
+
+
def create_link_preview_tool():
"""
Factory function to create the link_preview tool.
@@ -184,13 +280,20 @@ def create_link_preview_tool():
url = f"https://{url}"
try:
+ # Use a browser-like User-Agent to fetch Open Graph metadata.
+ # This is the same approach used by Slack, Discord, Twitter, etc. for link previews.
+ # We're only fetching publicly available metadata (title, description, thumbnail)
+ # that websites intentionally expose via OG tags for link preview purposes.
async with httpx.AsyncClient(
timeout=10.0,
follow_redirects=True,
headers={
- "User-Agent": "Mozilla/5.0 (compatible; SurfSenseBot/1.0; +https://surfsense.net)",
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
- "Accept-Language": "en-US,en;q=0.5",
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
+ "Accept-Language": "en-US,en;q=0.9",
+ "Accept-Encoding": "gzip, deflate, br",
+ "Cache-Control": "no-cache",
+ "Pragma": "no-cache",
},
) as client:
response = await client.get(url)
@@ -218,32 +321,14 @@ def create_link_preview_tool():
image = extract_image(html)
# Make sure image URL is absolute
- if image and not image.startswith(("http://", "https://")):
- if image.startswith("//"):
- image = f"https:{image}"
- elif image.startswith("/"):
- parsed = urlparse(url)
- image = f"{parsed.scheme}://{parsed.netloc}{image}"
+ if image:
+ image = _make_absolute_url(image, url)
# Clean up title and description (unescape HTML entities)
if title:
- title = (
- title.replace("&", "&")
- .replace("<", "<")
- .replace(">", ">")
- .replace(""", '"')
- .replace("'", "'")
- .replace("'", "'")
- )
+ title = _unescape_html(title)
if description:
- description = (
- description.replace("&", "&")
- .replace("<", "<")
- .replace(">", ">")
- .replace(""", '"')
- .replace("'", "'")
- .replace("'", "'")
- )
+ description = _unescape_html(description)
# Truncate long descriptions
if len(description) > 200:
description = description[:197] + "..."
@@ -260,6 +345,37 @@ def create_link_preview_tool():
}
except httpx.TimeoutException:
+ # Timeout - try Chromium fallback
+ logger.warning(f"[link_preview] Timeout for {url}, trying Chromium fallback")
+ chromium_result = await fetch_with_chromium(url)
+ if chromium_result:
+ title = chromium_result.get("title") or domain
+ description = chromium_result.get("description")
+ image = chromium_result.get("image")
+
+ # Clean up and truncate
+ if title:
+ title = _unescape_html(title)
+ if description:
+ description = _unescape_html(description)
+ if len(description) > 200:
+ description = description[:197] + "..."
+
+ # Make sure image URL is absolute
+ if image:
+ image = _make_absolute_url(image, url)
+
+ return {
+ "id": preview_id,
+ "assetId": url,
+ "kind": "link",
+ "href": url,
+ "title": title,
+ "description": description,
+ "thumb": image,
+ "domain": domain,
+ }
+
return {
"id": preview_id,
"assetId": url,
@@ -270,6 +386,42 @@ def create_link_preview_tool():
"error": "Request timed out",
}
except httpx.HTTPStatusError as e:
+ status_code = e.response.status_code
+
+ # For 403 (Forbidden) and similar bot-detection errors, try Chromium fallback
+ if status_code in (403, 401, 406, 429):
+ logger.warning(
+ f"[link_preview] HTTP {status_code} for {url}, trying Chromium fallback"
+ )
+ chromium_result = await fetch_with_chromium(url)
+ if chromium_result:
+ title = chromium_result.get("title") or domain
+ description = chromium_result.get("description")
+ image = chromium_result.get("image")
+
+ # Clean up and truncate
+ if title:
+ title = _unescape_html(title)
+ if description:
+ description = _unescape_html(description)
+ if len(description) > 200:
+ description = description[:197] + "..."
+
+ # Make sure image URL is absolute
+ if image:
+ image = _make_absolute_url(image, url)
+
+ return {
+ "id": preview_id,
+ "assetId": url,
+ "kind": "link",
+ "href": url,
+ "title": title,
+ "description": description,
+ "thumb": image,
+ "domain": domain,
+ }
+
return {
"id": preview_id,
"assetId": url,
@@ -277,11 +429,11 @@ def create_link_preview_tool():
"href": url,
"title": domain or "Link",
"domain": domain,
- "error": f"HTTP {e.response.status_code}",
+ "error": f"HTTP {status_code}",
}
except Exception as e:
error_message = str(e)
- print(f"[link_preview] Error fetching {url}: {error_message}")
+ logger.error(f"[link_preview] Error fetching {url}: {error_message}")
return {
"id": preview_id,
"assetId": url,
diff --git a/surfsense_web/components/tool-ui/article/index.tsx b/surfsense_web/components/tool-ui/article/index.tsx
index fd73d993d..5669ea832 100644
--- a/surfsense_web/components/tool-ui/article/index.tsx
+++ b/surfsense_web/components/tool-ui/article/index.tsx
@@ -19,20 +19,20 @@ import { cn } from "@/lib/utils";
*/
const SerializableArticleSchema = z.object({
id: z.string().default("article-unknown"),
- assetId: z.string().optional(),
- kind: z.literal("article").optional(),
+ assetId: z.string().nullish(),
+ kind: z.literal("article").nullish(),
title: z.string().default("Untitled Article"),
- description: z.string().optional(),
- content: z.string().optional(),
- href: z.string().url().optional(),
- domain: z.string().optional(),
- author: z.string().optional(),
- date: z.string().optional(),
- word_count: z.number().optional(),
- wordCount: z.number().optional(),
- was_truncated: z.boolean().optional(),
- wasTruncated: z.boolean().optional(),
- error: z.string().optional(),
+ description: z.string().nullish(),
+ content: z.string().nullish(),
+ href: z.string().url().nullish(),
+ domain: z.string().nullish(),
+ author: z.string().nullish(),
+ date: z.string().nullish(),
+ word_count: z.number().nullish(),
+ wordCount: z.number().nullish(),
+ was_truncated: z.boolean().nullish(),
+ wasTruncated: z.boolean().nullish(),
+ error: z.string().nullish(),
});
/**
diff --git a/surfsense_web/components/tool-ui/deepagent-thinking.tsx b/surfsense_web/components/tool-ui/deepagent-thinking.tsx
index 5694035bc..3e6f668a8 100644
--- a/surfsense_web/components/tool-ui/deepagent-thinking.tsx
+++ b/surfsense_web/components/tool-ui/deepagent-thinking.tsx
@@ -70,12 +70,12 @@ const ThinkingStepSchema = z.object({
});
const DeepAgentThinkingArgsSchema = z.object({
- query: z.string().optional(),
- context: z.string().optional(),
+ query: z.string().nullish(),
+ context: z.string().nullish(),
});
const DeepAgentThinkingResultSchema = z.object({
- steps: z.array(ThinkingStepSchema).optional(),
+ steps: z.array(ThinkingStepSchema).nullish(),
status: z
.enum([
THINKING_STATUS.THINKING,
@@ -83,8 +83,8 @@ const DeepAgentThinkingResultSchema = z.object({
THINKING_STATUS.SYNTHESIZING,
THINKING_STATUS.COMPLETED,
])
- .optional(),
- summary: z.string().optional(),
+ .nullish(),
+ summary: z.string().nullish(),
});
/** Types derived from Zod schemas */
@@ -325,7 +325,7 @@ export const DeepAgentThinkingToolUI = makeAssistantToolUI<
render: function DeepAgentThinkingUI({ result, status }) {
// Loading state - tool is still running
if (status.type === "running" || status.type === "requires-action") {
- return ;
+ return ;
}
// Incomplete/cancelled state
diff --git a/surfsense_web/components/tool-ui/display-image.tsx b/surfsense_web/components/tool-ui/display-image.tsx
index 28900840e..cd1c14241 100644
--- a/surfsense_web/components/tool-ui/display-image.tsx
+++ b/surfsense_web/components/tool-ui/display-image.tsx
@@ -23,7 +23,7 @@ interface DisplayImageResult {
id: string;
assetId: string;
src: string;
- alt: string;
+ alt?: string; // Made optional - parseSerializableImage provides fallback
title?: string;
description?: string;
domain?: string;
diff --git a/surfsense_web/components/tool-ui/generate-podcast.tsx b/surfsense_web/components/tool-ui/generate-podcast.tsx
index 6ab598bf1..166d95e47 100644
--- a/surfsense_web/components/tool-ui/generate-podcast.tsx
+++ b/surfsense_web/components/tool-ui/generate-podcast.tsx
@@ -14,27 +14,27 @@ import { clearActivePodcastTaskId, setActivePodcastTaskId } from "@/lib/chat/pod
*/
const GeneratePodcastArgsSchema = z.object({
source_content: z.string(),
- podcast_title: z.string().optional(),
- user_prompt: z.string().optional(),
+ podcast_title: z.string().nullish(),
+ user_prompt: z.string().nullish(),
});
const GeneratePodcastResultSchema = z.object({
status: z.enum(["processing", "already_generating", "success", "error"]),
- task_id: z.string().optional(),
- podcast_id: z.number().optional(),
- title: z.string().optional(),
- transcript_entries: z.number().optional(),
- message: z.string().optional(),
- error: z.string().optional(),
+ task_id: z.string().nullish(),
+ podcast_id: z.number().nullish(),
+ title: z.string().nullish(),
+ transcript_entries: z.number().nullish(),
+ message: z.string().nullish(),
+ error: z.string().nullish(),
});
const TaskStatusResponseSchema = z.object({
status: z.enum(["processing", "success", "error"]),
- podcast_id: z.number().optional(),
- title: z.string().optional(),
- transcript_entries: z.number().optional(),
- state: z.string().optional(),
- error: z.string().optional(),
+ podcast_id: z.number().nullish(),
+ title: z.string().nullish(),
+ transcript_entries: z.number().nullish(),
+ state: z.string().nullish(),
+ error: z.string().nullish(),
});
const PodcastTranscriptEntrySchema = z.object({
@@ -43,7 +43,7 @@ const PodcastTranscriptEntrySchema = z.object({
});
const PodcastDetailsSchema = z.object({
- podcast_transcript: z.array(PodcastTranscriptEntrySchema).optional(),
+ podcast_transcript: z.array(PodcastTranscriptEntrySchema).nullish(),
});
/**
@@ -75,7 +75,9 @@ function parsePodcastDetails(data: unknown): { podcast_transcript?: PodcastTrans
console.warn("Invalid podcast details:", result.error.issues);
return {};
}
- return result.data;
+ return {
+ podcast_transcript: result.data.podcast_transcript ?? undefined,
+ };
}
/**
diff --git a/surfsense_web/components/tool-ui/image/index.tsx b/surfsense_web/components/tool-ui/image/index.tsx
index 79f1c5a10..1d28490a3 100644
--- a/surfsense_web/components/tool-ui/image/index.tsx
+++ b/surfsense_web/components/tool-ui/image/index.tsx
@@ -11,26 +11,26 @@ import { cn } from "@/lib/utils";
/**
* Zod schemas for runtime validation
*/
-const AspectRatioSchema = z.enum(["1:1", "4:3", "16:9", "9:16", "auto"]);
+const AspectRatioSchema = z.enum(["1:1", "4:3", "16:9", "9:16", "21:9", "auto"]);
const ImageFitSchema = z.enum(["cover", "contain"]);
const ImageSourceSchema = z.object({
label: z.string(),
- iconUrl: z.string().optional(),
- url: z.string().optional(),
+ iconUrl: z.string().nullish(),
+ url: z.string().nullish(),
});
const SerializableImageSchema = z.object({
id: z.string(),
assetId: z.string(),
src: z.string(),
- alt: z.string(),
- title: z.string().optional(),
- description: z.string().optional(),
- href: z.string().optional(),
- domain: z.string().optional(),
- ratio: AspectRatioSchema.optional(),
- source: ImageSourceSchema.optional(),
+ alt: z.string().nullish(), // Made optional - will use fallback if missing
+ title: z.string().nullish(),
+ description: z.string().nullish(),
+ href: z.string().nullish(),
+ domain: z.string().nullish(),
+ ratio: AspectRatioSchema.nullish(),
+ source: ImageSourceSchema.nullish(),
});
/**
@@ -48,7 +48,7 @@ export interface ImageProps {
id: string;
assetId: string;
src: string;
- alt: string;
+ alt?: string; // Optional with default fallback
title?: string;
description?: string;
href?: string;
@@ -62,18 +62,45 @@ export interface ImageProps {
/**
* Parse and validate serializable image from tool result
+ * Returns a valid SerializableImage with fallback values for missing optional fields
*/
-export function parseSerializableImage(result: unknown): SerializableImage {
+export function parseSerializableImage(result: unknown): SerializableImage & { alt: string } {
const parsed = SerializableImageSchema.safeParse(result);
if (!parsed.success) {
console.warn("Invalid image data:", parsed.error.issues);
- // Try to extract basic info for error display
+
+ // Try to extract basic info and return a fallback object
const obj = (result && typeof result === "object" ? result : {}) as Record;
+
+ // If we have at least id, assetId, and src, we can still render the image
+ if (
+ typeof obj.id === "string" &&
+ typeof obj.assetId === "string" &&
+ typeof obj.src === "string"
+ ) {
+ return {
+ id: obj.id,
+ assetId: obj.assetId,
+ src: obj.src,
+ alt: typeof obj.alt === "string" ? obj.alt : "Image",
+ title: typeof obj.title === "string" ? obj.title : undefined,
+ description: typeof obj.description === "string" ? obj.description : undefined,
+ href: typeof obj.href === "string" ? obj.href : undefined,
+ domain: typeof obj.domain === "string" ? obj.domain : undefined,
+ ratio: undefined, // Use default ratio
+ source: undefined,
+ };
+ }
+
throw new Error(`Invalid image: ${parsed.error.issues.map((i) => i.message).join(", ")}`);
}
- return parsed.data;
+ // Provide fallback for alt if it's null/undefined
+ return {
+ ...parsed.data,
+ alt: parsed.data.alt ?? "Image",
+ };
}
/**
@@ -89,6 +116,8 @@ function getAspectRatioClass(ratio?: AspectRatio): string {
return "aspect-video";
case "9:16":
return "aspect-[9/16]";
+ case "21:9":
+ return "aspect-[21/9]";
case "auto":
default:
return "aspect-[4/3]";
@@ -172,7 +201,7 @@ export function ImageLoading({ title = "Loading image..." }: { title?: string })
export function Image({
id,
src,
- alt,
+ alt = "Image",
title,
description,
href,
diff --git a/surfsense_web/components/tool-ui/media-card/index.tsx b/surfsense_web/components/tool-ui/media-card/index.tsx
index b773ef4a3..d4fe0c7c0 100644
--- a/surfsense_web/components/tool-ui/media-card/index.tsx
+++ b/surfsense_web/components/tool-ui/media-card/index.tsx
@@ -13,27 +13,27 @@ import { cn } from "@/lib/utils";
/**
* Zod schemas for runtime validation
*/
-const AspectRatioSchema = z.enum(["1:1", "4:3", "16:9", "21:9", "auto"]);
+const AspectRatioSchema = z.enum(["1:1", "4:3", "16:9", "9:16", "21:9", "auto"]);
const MediaCardKindSchema = z.enum(["link", "image", "video", "audio"]);
const ResponseActionSchema = z.object({
id: z.string(),
label: z.string(),
- variant: z.enum(["default", "secondary", "outline", "destructive", "ghost"]).optional(),
- confirmLabel: z.string().optional(),
+ variant: z.enum(["default", "secondary", "outline", "destructive", "ghost"]).nullish(),
+ confirmLabel: z.string().nullish(),
});
const SerializableMediaCardSchema = z.object({
id: z.string(),
assetId: z.string(),
kind: MediaCardKindSchema,
- href: z.string().optional(),
- src: z.string().optional(),
+ href: z.string().nullish(),
+ src: z.string().nullish(),
title: z.string(),
- description: z.string().optional(),
- thumb: z.string().optional(),
- ratio: AspectRatioSchema.optional(),
- domain: z.string().optional(),
+ description: z.string().nullish(),
+ thumb: z.string().nullish(),
+ ratio: AspectRatioSchema.nullish(),
+ domain: z.string().nullish(),
});
/**
@@ -90,6 +90,8 @@ function getAspectRatioClass(ratio?: AspectRatio): string {
return "aspect-[4/3]";
case "16:9":
return "aspect-video";
+ case "9:16":
+ return "aspect-[9/16]";
case "21:9":
return "aspect-[21/9]";
case "auto":