diff --git a/surfsense_backend/app/agents/new_chat/tools/link_preview.py b/surfsense_backend/app/agents/new_chat/tools/link_preview.py index 188863015..90b5da1d7 100644 --- a/surfsense_backend/app/agents/new_chat/tools/link_preview.py +++ b/surfsense_backend/app/agents/new_chat/tools/link_preview.py @@ -6,13 +6,19 @@ Open Graph image, etc.) to display rich link previews in the chat UI. """ import hashlib +import logging import re from typing import Any from urllib.parse import urlparse import httpx +import trafilatura +from fake_useragent import UserAgent +from langchain_community.document_loaders import AsyncChromiumLoader from langchain_core.tools import tool +logger = logging.getLogger(__name__) + def extract_domain(url: str) -> str: """Extract the domain from a URL.""" @@ -138,6 +144,96 @@ def generate_preview_id(url: str) -> str: return f"link-preview-{hash_val}" +def _unescape_html(text: str) -> str: + """Unescape common HTML entities.""" + return ( + text.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace(""", '"') + .replace("'", "'") + .replace("'", "'") + ) + + +def _make_absolute_url(image_url: str, base_url: str) -> str: + """Convert a relative image URL to an absolute URL.""" + if image_url.startswith(("http://", "https://")): + return image_url + if image_url.startswith("//"): + return f"https:{image_url}" + if image_url.startswith("/"): + parsed = urlparse(base_url) + return f"{parsed.scheme}://{parsed.netloc}{image_url}" + return image_url + + +async def fetch_with_chromium(url: str) -> dict[str, Any] | None: + """ + Fetch page content using headless Chromium browser. + Used as a fallback when simple HTTP requests are blocked (403, etc.). + + Args: + url: URL to fetch + + Returns: + Dict with title, description, image, and raw_html, or None if failed + """ + try: + logger.info(f"[link_preview] Falling back to Chromium for {url}") + + # Generate a realistic User-Agent to avoid bot detection + ua = UserAgent() + user_agent = ua.random + + # Use AsyncChromiumLoader to fetch the page + crawl_loader = AsyncChromiumLoader( + urls=[url], headless=True, user_agent=user_agent + ) + documents = await crawl_loader.aload() + + if not documents: + logger.warning(f"[link_preview] Chromium returned no documents for {url}") + return None + + doc = documents[0] + raw_html = doc.page_content + + if not raw_html or len(raw_html.strip()) == 0: + logger.warning(f"[link_preview] Chromium returned empty content for {url}") + return None + + # Extract metadata using Trafilatura + trafilatura_metadata = trafilatura.extract_metadata(raw_html) + + # Extract OG image from raw HTML (trafilatura doesn't extract this) + image = extract_image(raw_html) + + result = { + "title": None, + "description": None, + "image": image, + "raw_html": raw_html, + } + + if trafilatura_metadata: + result["title"] = trafilatura_metadata.title + result["description"] = trafilatura_metadata.description + + # If trafilatura didn't get the title/description, try OG tags + if not result["title"]: + result["title"] = extract_title(raw_html) + if not result["description"]: + result["description"] = extract_description(raw_html) + + logger.info(f"[link_preview] Successfully fetched {url} via Chromium") + return result + + except Exception as e: + logger.error(f"[link_preview] Chromium fallback failed for {url}: {e}") + return None + + def create_link_preview_tool(): """ Factory function to create the link_preview tool. @@ -184,13 +280,20 @@ def create_link_preview_tool(): url = f"https://{url}" try: + # Use a browser-like User-Agent to fetch Open Graph metadata. + # This is the same approach used by Slack, Discord, Twitter, etc. for link previews. + # We're only fetching publicly available metadata (title, description, thumbnail) + # that websites intentionally expose via OG tags for link preview purposes. async with httpx.AsyncClient( timeout=10.0, follow_redirects=True, headers={ - "User-Agent": "Mozilla/5.0 (compatible; SurfSenseBot/1.0; +https://surfsense.net)", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.5", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", + "Accept-Encoding": "gzip, deflate, br", + "Cache-Control": "no-cache", + "Pragma": "no-cache", }, ) as client: response = await client.get(url) @@ -218,32 +321,14 @@ def create_link_preview_tool(): image = extract_image(html) # Make sure image URL is absolute - if image and not image.startswith(("http://", "https://")): - if image.startswith("//"): - image = f"https:{image}" - elif image.startswith("/"): - parsed = urlparse(url) - image = f"{parsed.scheme}://{parsed.netloc}{image}" + if image: + image = _make_absolute_url(image, url) # Clean up title and description (unescape HTML entities) if title: - title = ( - title.replace("&", "&") - .replace("<", "<") - .replace(">", ">") - .replace(""", '"') - .replace("'", "'") - .replace("'", "'") - ) + title = _unescape_html(title) if description: - description = ( - description.replace("&", "&") - .replace("<", "<") - .replace(">", ">") - .replace(""", '"') - .replace("'", "'") - .replace("'", "'") - ) + description = _unescape_html(description) # Truncate long descriptions if len(description) > 200: description = description[:197] + "..." @@ -260,6 +345,37 @@ def create_link_preview_tool(): } except httpx.TimeoutException: + # Timeout - try Chromium fallback + logger.warning(f"[link_preview] Timeout for {url}, trying Chromium fallback") + chromium_result = await fetch_with_chromium(url) + if chromium_result: + title = chromium_result.get("title") or domain + description = chromium_result.get("description") + image = chromium_result.get("image") + + # Clean up and truncate + if title: + title = _unescape_html(title) + if description: + description = _unescape_html(description) + if len(description) > 200: + description = description[:197] + "..." + + # Make sure image URL is absolute + if image: + image = _make_absolute_url(image, url) + + return { + "id": preview_id, + "assetId": url, + "kind": "link", + "href": url, + "title": title, + "description": description, + "thumb": image, + "domain": domain, + } + return { "id": preview_id, "assetId": url, @@ -270,6 +386,42 @@ def create_link_preview_tool(): "error": "Request timed out", } except httpx.HTTPStatusError as e: + status_code = e.response.status_code + + # For 403 (Forbidden) and similar bot-detection errors, try Chromium fallback + if status_code in (403, 401, 406, 429): + logger.warning( + f"[link_preview] HTTP {status_code} for {url}, trying Chromium fallback" + ) + chromium_result = await fetch_with_chromium(url) + if chromium_result: + title = chromium_result.get("title") or domain + description = chromium_result.get("description") + image = chromium_result.get("image") + + # Clean up and truncate + if title: + title = _unescape_html(title) + if description: + description = _unescape_html(description) + if len(description) > 200: + description = description[:197] + "..." + + # Make sure image URL is absolute + if image: + image = _make_absolute_url(image, url) + + return { + "id": preview_id, + "assetId": url, + "kind": "link", + "href": url, + "title": title, + "description": description, + "thumb": image, + "domain": domain, + } + return { "id": preview_id, "assetId": url, @@ -277,11 +429,11 @@ def create_link_preview_tool(): "href": url, "title": domain or "Link", "domain": domain, - "error": f"HTTP {e.response.status_code}", + "error": f"HTTP {status_code}", } except Exception as e: error_message = str(e) - print(f"[link_preview] Error fetching {url}: {error_message}") + logger.error(f"[link_preview] Error fetching {url}: {error_message}") return { "id": preview_id, "assetId": url, diff --git a/surfsense_web/components/tool-ui/article/index.tsx b/surfsense_web/components/tool-ui/article/index.tsx index fd73d993d..5669ea832 100644 --- a/surfsense_web/components/tool-ui/article/index.tsx +++ b/surfsense_web/components/tool-ui/article/index.tsx @@ -19,20 +19,20 @@ import { cn } from "@/lib/utils"; */ const SerializableArticleSchema = z.object({ id: z.string().default("article-unknown"), - assetId: z.string().optional(), - kind: z.literal("article").optional(), + assetId: z.string().nullish(), + kind: z.literal("article").nullish(), title: z.string().default("Untitled Article"), - description: z.string().optional(), - content: z.string().optional(), - href: z.string().url().optional(), - domain: z.string().optional(), - author: z.string().optional(), - date: z.string().optional(), - word_count: z.number().optional(), - wordCount: z.number().optional(), - was_truncated: z.boolean().optional(), - wasTruncated: z.boolean().optional(), - error: z.string().optional(), + description: z.string().nullish(), + content: z.string().nullish(), + href: z.string().url().nullish(), + domain: z.string().nullish(), + author: z.string().nullish(), + date: z.string().nullish(), + word_count: z.number().nullish(), + wordCount: z.number().nullish(), + was_truncated: z.boolean().nullish(), + wasTruncated: z.boolean().nullish(), + error: z.string().nullish(), }); /** diff --git a/surfsense_web/components/tool-ui/deepagent-thinking.tsx b/surfsense_web/components/tool-ui/deepagent-thinking.tsx index 5694035bc..3e6f668a8 100644 --- a/surfsense_web/components/tool-ui/deepagent-thinking.tsx +++ b/surfsense_web/components/tool-ui/deepagent-thinking.tsx @@ -70,12 +70,12 @@ const ThinkingStepSchema = z.object({ }); const DeepAgentThinkingArgsSchema = z.object({ - query: z.string().optional(), - context: z.string().optional(), + query: z.string().nullish(), + context: z.string().nullish(), }); const DeepAgentThinkingResultSchema = z.object({ - steps: z.array(ThinkingStepSchema).optional(), + steps: z.array(ThinkingStepSchema).nullish(), status: z .enum([ THINKING_STATUS.THINKING, @@ -83,8 +83,8 @@ const DeepAgentThinkingResultSchema = z.object({ THINKING_STATUS.SYNTHESIZING, THINKING_STATUS.COMPLETED, ]) - .optional(), - summary: z.string().optional(), + .nullish(), + summary: z.string().nullish(), }); /** Types derived from Zod schemas */ @@ -325,7 +325,7 @@ export const DeepAgentThinkingToolUI = makeAssistantToolUI< render: function DeepAgentThinkingUI({ result, status }) { // Loading state - tool is still running if (status.type === "running" || status.type === "requires-action") { - return ; + return ; } // Incomplete/cancelled state diff --git a/surfsense_web/components/tool-ui/display-image.tsx b/surfsense_web/components/tool-ui/display-image.tsx index 28900840e..cd1c14241 100644 --- a/surfsense_web/components/tool-ui/display-image.tsx +++ b/surfsense_web/components/tool-ui/display-image.tsx @@ -23,7 +23,7 @@ interface DisplayImageResult { id: string; assetId: string; src: string; - alt: string; + alt?: string; // Made optional - parseSerializableImage provides fallback title?: string; description?: string; domain?: string; diff --git a/surfsense_web/components/tool-ui/generate-podcast.tsx b/surfsense_web/components/tool-ui/generate-podcast.tsx index 6ab598bf1..166d95e47 100644 --- a/surfsense_web/components/tool-ui/generate-podcast.tsx +++ b/surfsense_web/components/tool-ui/generate-podcast.tsx @@ -14,27 +14,27 @@ import { clearActivePodcastTaskId, setActivePodcastTaskId } from "@/lib/chat/pod */ const GeneratePodcastArgsSchema = z.object({ source_content: z.string(), - podcast_title: z.string().optional(), - user_prompt: z.string().optional(), + podcast_title: z.string().nullish(), + user_prompt: z.string().nullish(), }); const GeneratePodcastResultSchema = z.object({ status: z.enum(["processing", "already_generating", "success", "error"]), - task_id: z.string().optional(), - podcast_id: z.number().optional(), - title: z.string().optional(), - transcript_entries: z.number().optional(), - message: z.string().optional(), - error: z.string().optional(), + task_id: z.string().nullish(), + podcast_id: z.number().nullish(), + title: z.string().nullish(), + transcript_entries: z.number().nullish(), + message: z.string().nullish(), + error: z.string().nullish(), }); const TaskStatusResponseSchema = z.object({ status: z.enum(["processing", "success", "error"]), - podcast_id: z.number().optional(), - title: z.string().optional(), - transcript_entries: z.number().optional(), - state: z.string().optional(), - error: z.string().optional(), + podcast_id: z.number().nullish(), + title: z.string().nullish(), + transcript_entries: z.number().nullish(), + state: z.string().nullish(), + error: z.string().nullish(), }); const PodcastTranscriptEntrySchema = z.object({ @@ -43,7 +43,7 @@ const PodcastTranscriptEntrySchema = z.object({ }); const PodcastDetailsSchema = z.object({ - podcast_transcript: z.array(PodcastTranscriptEntrySchema).optional(), + podcast_transcript: z.array(PodcastTranscriptEntrySchema).nullish(), }); /** @@ -75,7 +75,9 @@ function parsePodcastDetails(data: unknown): { podcast_transcript?: PodcastTrans console.warn("Invalid podcast details:", result.error.issues); return {}; } - return result.data; + return { + podcast_transcript: result.data.podcast_transcript ?? undefined, + }; } /** diff --git a/surfsense_web/components/tool-ui/image/index.tsx b/surfsense_web/components/tool-ui/image/index.tsx index 79f1c5a10..1d28490a3 100644 --- a/surfsense_web/components/tool-ui/image/index.tsx +++ b/surfsense_web/components/tool-ui/image/index.tsx @@ -11,26 +11,26 @@ import { cn } from "@/lib/utils"; /** * Zod schemas for runtime validation */ -const AspectRatioSchema = z.enum(["1:1", "4:3", "16:9", "9:16", "auto"]); +const AspectRatioSchema = z.enum(["1:1", "4:3", "16:9", "9:16", "21:9", "auto"]); const ImageFitSchema = z.enum(["cover", "contain"]); const ImageSourceSchema = z.object({ label: z.string(), - iconUrl: z.string().optional(), - url: z.string().optional(), + iconUrl: z.string().nullish(), + url: z.string().nullish(), }); const SerializableImageSchema = z.object({ id: z.string(), assetId: z.string(), src: z.string(), - alt: z.string(), - title: z.string().optional(), - description: z.string().optional(), - href: z.string().optional(), - domain: z.string().optional(), - ratio: AspectRatioSchema.optional(), - source: ImageSourceSchema.optional(), + alt: z.string().nullish(), // Made optional - will use fallback if missing + title: z.string().nullish(), + description: z.string().nullish(), + href: z.string().nullish(), + domain: z.string().nullish(), + ratio: AspectRatioSchema.nullish(), + source: ImageSourceSchema.nullish(), }); /** @@ -48,7 +48,7 @@ export interface ImageProps { id: string; assetId: string; src: string; - alt: string; + alt?: string; // Optional with default fallback title?: string; description?: string; href?: string; @@ -62,18 +62,45 @@ export interface ImageProps { /** * Parse and validate serializable image from tool result + * Returns a valid SerializableImage with fallback values for missing optional fields */ -export function parseSerializableImage(result: unknown): SerializableImage { +export function parseSerializableImage(result: unknown): SerializableImage & { alt: string } { const parsed = SerializableImageSchema.safeParse(result); if (!parsed.success) { console.warn("Invalid image data:", parsed.error.issues); - // Try to extract basic info for error display + + // Try to extract basic info and return a fallback object const obj = (result && typeof result === "object" ? result : {}) as Record; + + // If we have at least id, assetId, and src, we can still render the image + if ( + typeof obj.id === "string" && + typeof obj.assetId === "string" && + typeof obj.src === "string" + ) { + return { + id: obj.id, + assetId: obj.assetId, + src: obj.src, + alt: typeof obj.alt === "string" ? obj.alt : "Image", + title: typeof obj.title === "string" ? obj.title : undefined, + description: typeof obj.description === "string" ? obj.description : undefined, + href: typeof obj.href === "string" ? obj.href : undefined, + domain: typeof obj.domain === "string" ? obj.domain : undefined, + ratio: undefined, // Use default ratio + source: undefined, + }; + } + throw new Error(`Invalid image: ${parsed.error.issues.map((i) => i.message).join(", ")}`); } - return parsed.data; + // Provide fallback for alt if it's null/undefined + return { + ...parsed.data, + alt: parsed.data.alt ?? "Image", + }; } /** @@ -89,6 +116,8 @@ function getAspectRatioClass(ratio?: AspectRatio): string { return "aspect-video"; case "9:16": return "aspect-[9/16]"; + case "21:9": + return "aspect-[21/9]"; case "auto": default: return "aspect-[4/3]"; @@ -172,7 +201,7 @@ export function ImageLoading({ title = "Loading image..." }: { title?: string }) export function Image({ id, src, - alt, + alt = "Image", title, description, href, diff --git a/surfsense_web/components/tool-ui/media-card/index.tsx b/surfsense_web/components/tool-ui/media-card/index.tsx index b773ef4a3..d4fe0c7c0 100644 --- a/surfsense_web/components/tool-ui/media-card/index.tsx +++ b/surfsense_web/components/tool-ui/media-card/index.tsx @@ -13,27 +13,27 @@ import { cn } from "@/lib/utils"; /** * Zod schemas for runtime validation */ -const AspectRatioSchema = z.enum(["1:1", "4:3", "16:9", "21:9", "auto"]); +const AspectRatioSchema = z.enum(["1:1", "4:3", "16:9", "9:16", "21:9", "auto"]); const MediaCardKindSchema = z.enum(["link", "image", "video", "audio"]); const ResponseActionSchema = z.object({ id: z.string(), label: z.string(), - variant: z.enum(["default", "secondary", "outline", "destructive", "ghost"]).optional(), - confirmLabel: z.string().optional(), + variant: z.enum(["default", "secondary", "outline", "destructive", "ghost"]).nullish(), + confirmLabel: z.string().nullish(), }); const SerializableMediaCardSchema = z.object({ id: z.string(), assetId: z.string(), kind: MediaCardKindSchema, - href: z.string().optional(), - src: z.string().optional(), + href: z.string().nullish(), + src: z.string().nullish(), title: z.string(), - description: z.string().optional(), - thumb: z.string().optional(), - ratio: AspectRatioSchema.optional(), - domain: z.string().optional(), + description: z.string().nullish(), + thumb: z.string().nullish(), + ratio: AspectRatioSchema.nullish(), + domain: z.string().nullish(), }); /** @@ -90,6 +90,8 @@ function getAspectRatioClass(ratio?: AspectRatio): string { return "aspect-[4/3]"; case "16:9": return "aspect-video"; + case "9:16": + return "aspect-[9/16]"; case "21:9": return "aspect-[21/9]"; case "auto":