refactor: enhance link preview functionality with Chromium fallback

- Added a fallback mechanism using headless Chromium to fetch page content when standard HTTP requests fail.
- Introduced utility functions for unescaping HTML entities and converting relative URLs to absolute.
- Updated HTTP request headers to mimic a browser for better compatibility with web servers.
- Improved error handling and logging for better debugging and user feedback.
- Made various properties in Zod schemas nullable for better type safety and flexibility in handling optional data.
This commit is contained in:
Anish Sarkar 2025-12-26 00:07:45 +05:30
parent 4c2de73694
commit bea18960a4
7 changed files with 271 additions and 86 deletions

View file

@ -6,13 +6,19 @@ Open Graph image, etc.) to display rich link previews in the chat UI.
"""
import hashlib
import logging
import re
from typing import Any
from urllib.parse import urlparse
import httpx
import trafilatura
from fake_useragent import UserAgent
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_core.tools import tool
logger = logging.getLogger(__name__)
def extract_domain(url: str) -> str:
"""Extract the domain from a URL."""
@ -138,6 +144,96 @@ def generate_preview_id(url: str) -> str:
return f"link-preview-{hash_val}"
def _unescape_html(text: str) -> str:
"""Unescape common HTML entities."""
return (
text.replace("&", "&")
.replace("&lt;", "<")
.replace("&gt;", ">")
.replace("&quot;", '"')
.replace("&#39;", "'")
.replace("&apos;", "'")
)
def _make_absolute_url(image_url: str, base_url: str) -> str:
"""Convert a relative image URL to an absolute URL."""
if image_url.startswith(("http://", "https://")):
return image_url
if image_url.startswith("//"):
return f"https:{image_url}"
if image_url.startswith("/"):
parsed = urlparse(base_url)
return f"{parsed.scheme}://{parsed.netloc}{image_url}"
return image_url
async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
    """
    Load a page through a headless Chromium browser and pull preview metadata.

    Intended as a fallback path for when a plain HTTP request does not succeed.

    Args:
        url: URL to fetch

    Returns:
        Dict with the keys ``title``, ``description``, ``image`` and
        ``raw_html``, or None when the loader yields no usable content or any
        exception is raised along the way.
    """
    try:
        logger.info(f"[link_preview] Falling back to Chromium for {url}")

        # A randomly chosen User-Agent string is handed to the browser loader.
        loader = AsyncChromiumLoader(
            urls=[url],
            headless=True,
            user_agent=UserAgent().random,
        )
        loaded_docs = await loader.aload()

        if not loaded_docs:
            logger.warning(f"[link_preview] Chromium returned no documents for {url}")
            return None

        page_html = loaded_docs[0].page_content
        # Treat a missing or whitespace-only body as "nothing fetched".
        if not page_html or not page_html.strip():
            logger.warning(f"[link_preview] Chromium returned empty content for {url}")
            return None

        # Trafilatura supplies title/description; the image comes from the
        # module's own extractor working on the raw HTML.
        metadata = trafilatura.extract_metadata(page_html)
        preview_image = extract_image(page_html)

        page_title = None
        page_description = None
        if metadata:
            page_title = metadata.title
            page_description = metadata.description

        # Fall back to the module's own extractors for anything still missing.
        if not page_title:
            page_title = extract_title(page_html)
        if not page_description:
            page_description = extract_description(page_html)

        logger.info(f"[link_preview] Successfully fetched {url} via Chromium")
        return {
            "title": page_title,
            "description": page_description,
            "image": preview_image,
            "raw_html": page_html,
        }

    except Exception as e:
        # Best-effort fallback: any failure is logged and reported as None.
        logger.error(f"[link_preview] Chromium fallback failed for {url}: {e}")
        return None
def create_link_preview_tool():
"""
Factory function to create the link_preview tool.
@ -184,13 +280,20 @@ def create_link_preview_tool():
url = f"https://{url}"
try:
# Use a browser-like User-Agent to fetch Open Graph metadata.
# This is the same approach used by Slack, Discord, Twitter, etc. for link previews.
# We're only fetching publicly available metadata (title, description, thumbnail)
# that websites intentionally expose via OG tags for link preview purposes.
async with httpx.AsyncClient(
timeout=10.0,
follow_redirects=True,
headers={
"User-Agent": "Mozilla/5.0 (compatible; SurfSenseBot/1.0; +https://surfsense.net)",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.9",
"Accept-Encoding": "gzip, deflate, br",
"Cache-Control": "no-cache",
"Pragma": "no-cache",
},
) as client:
response = await client.get(url)
@ -218,32 +321,14 @@ def create_link_preview_tool():
image = extract_image(html)
# Make sure image URL is absolute
if image and not image.startswith(("http://", "https://")):
if image.startswith("//"):
image = f"https:{image}"
elif image.startswith("/"):
parsed = urlparse(url)
image = f"{parsed.scheme}://{parsed.netloc}{image}"
if image:
image = _make_absolute_url(image, url)
# Clean up title and description (unescape HTML entities)
if title:
title = (
title.replace("&amp;", "&")
.replace("&lt;", "<")
.replace("&gt;", ">")
.replace("&quot;", '"')
.replace("&#39;", "'")
.replace("&apos;", "'")
)
title = _unescape_html(title)
if description:
description = (
description.replace("&amp;", "&")
.replace("&lt;", "<")
.replace("&gt;", ">")
.replace("&quot;", '"')
.replace("&#39;", "'")
.replace("&apos;", "'")
)
description = _unescape_html(description)
# Truncate long descriptions
if len(description) > 200:
description = description[:197] + "..."
@ -260,6 +345,37 @@ def create_link_preview_tool():
}
except httpx.TimeoutException:
# Timeout - try Chromium fallback
logger.warning(f"[link_preview] Timeout for {url}, trying Chromium fallback")
chromium_result = await fetch_with_chromium(url)
if chromium_result:
title = chromium_result.get("title") or domain
description = chromium_result.get("description")
image = chromium_result.get("image")
# Clean up and truncate
if title:
title = _unescape_html(title)
if description:
description = _unescape_html(description)
if len(description) > 200:
description = description[:197] + "..."
# Make sure image URL is absolute
if image:
image = _make_absolute_url(image, url)
return {
"id": preview_id,
"assetId": url,
"kind": "link",
"href": url,
"title": title,
"description": description,
"thumb": image,
"domain": domain,
}
return {
"id": preview_id,
"assetId": url,
@ -270,6 +386,42 @@ def create_link_preview_tool():
"error": "Request timed out",
}
except httpx.HTTPStatusError as e:
status_code = e.response.status_code
# For 403 (Forbidden) and similar bot-detection errors, try Chromium fallback
if status_code in (403, 401, 406, 429):
logger.warning(
f"[link_preview] HTTP {status_code} for {url}, trying Chromium fallback"
)
chromium_result = await fetch_with_chromium(url)
if chromium_result:
title = chromium_result.get("title") or domain
description = chromium_result.get("description")
image = chromium_result.get("image")
# Clean up and truncate
if title:
title = _unescape_html(title)
if description:
description = _unescape_html(description)
if len(description) > 200:
description = description[:197] + "..."
# Make sure image URL is absolute
if image:
image = _make_absolute_url(image, url)
return {
"id": preview_id,
"assetId": url,
"kind": "link",
"href": url,
"title": title,
"description": description,
"thumb": image,
"domain": domain,
}
return {
"id": preview_id,
"assetId": url,
@ -277,11 +429,11 @@ def create_link_preview_tool():
"href": url,
"title": domain or "Link",
"domain": domain,
"error": f"HTTP {e.response.status_code}",
"error": f"HTTP {status_code}",
}
except Exception as e:
error_message = str(e)
print(f"[link_preview] Error fetching {url}: {error_message}")
logger.error(f"[link_preview] Error fetching {url}: {error_message}")
return {
"id": preview_id,
"assetId": url,

View file

@ -19,20 +19,20 @@ import { cn } from "@/lib/utils";
*/
const SerializableArticleSchema = z.object({
id: z.string().default("article-unknown"),
assetId: z.string().optional(),
kind: z.literal("article").optional(),
assetId: z.string().nullish(),
kind: z.literal("article").nullish(),
title: z.string().default("Untitled Article"),
description: z.string().optional(),
content: z.string().optional(),
href: z.string().url().optional(),
domain: z.string().optional(),
author: z.string().optional(),
date: z.string().optional(),
word_count: z.number().optional(),
wordCount: z.number().optional(),
was_truncated: z.boolean().optional(),
wasTruncated: z.boolean().optional(),
error: z.string().optional(),
description: z.string().nullish(),
content: z.string().nullish(),
href: z.string().url().nullish(),
domain: z.string().nullish(),
author: z.string().nullish(),
date: z.string().nullish(),
word_count: z.number().nullish(),
wordCount: z.number().nullish(),
was_truncated: z.boolean().nullish(),
wasTruncated: z.boolean().nullish(),
error: z.string().nullish(),
});
/**

View file

@ -70,12 +70,12 @@ const ThinkingStepSchema = z.object({
});
const DeepAgentThinkingArgsSchema = z.object({
query: z.string().optional(),
context: z.string().optional(),
query: z.string().nullish(),
context: z.string().nullish(),
});
const DeepAgentThinkingResultSchema = z.object({
steps: z.array(ThinkingStepSchema).optional(),
steps: z.array(ThinkingStepSchema).nullish(),
status: z
.enum([
THINKING_STATUS.THINKING,
@ -83,8 +83,8 @@ const DeepAgentThinkingResultSchema = z.object({
THINKING_STATUS.SYNTHESIZING,
THINKING_STATUS.COMPLETED,
])
.optional(),
summary: z.string().optional(),
.nullish(),
summary: z.string().nullish(),
});
/** Types derived from Zod schemas */
@ -325,7 +325,7 @@ export const DeepAgentThinkingToolUI = makeAssistantToolUI<
render: function DeepAgentThinkingUI({ result, status }) {
// Loading state - tool is still running
if (status.type === "running" || status.type === "requires-action") {
return <ThinkingLoadingState status={result?.status} />;
return <ThinkingLoadingState status={result?.status ?? undefined} />;
}
// Incomplete/cancelled state

View file

@ -23,7 +23,7 @@ interface DisplayImageResult {
id: string;
assetId: string;
src: string;
alt: string;
alt?: string; // Made optional - parseSerializableImage provides fallback
title?: string;
description?: string;
domain?: string;

View file

@ -14,27 +14,27 @@ import { clearActivePodcastTaskId, setActivePodcastTaskId } from "@/lib/chat/pod
*/
const GeneratePodcastArgsSchema = z.object({
source_content: z.string(),
podcast_title: z.string().optional(),
user_prompt: z.string().optional(),
podcast_title: z.string().nullish(),
user_prompt: z.string().nullish(),
});
const GeneratePodcastResultSchema = z.object({
status: z.enum(["processing", "already_generating", "success", "error"]),
task_id: z.string().optional(),
podcast_id: z.number().optional(),
title: z.string().optional(),
transcript_entries: z.number().optional(),
message: z.string().optional(),
error: z.string().optional(),
task_id: z.string().nullish(),
podcast_id: z.number().nullish(),
title: z.string().nullish(),
transcript_entries: z.number().nullish(),
message: z.string().nullish(),
error: z.string().nullish(),
});
const TaskStatusResponseSchema = z.object({
status: z.enum(["processing", "success", "error"]),
podcast_id: z.number().optional(),
title: z.string().optional(),
transcript_entries: z.number().optional(),
state: z.string().optional(),
error: z.string().optional(),
podcast_id: z.number().nullish(),
title: z.string().nullish(),
transcript_entries: z.number().nullish(),
state: z.string().nullish(),
error: z.string().nullish(),
});
const PodcastTranscriptEntrySchema = z.object({
@ -43,7 +43,7 @@ const PodcastTranscriptEntrySchema = z.object({
});
const PodcastDetailsSchema = z.object({
podcast_transcript: z.array(PodcastTranscriptEntrySchema).optional(),
podcast_transcript: z.array(PodcastTranscriptEntrySchema).nullish(),
});
/**
@ -75,7 +75,9 @@ function parsePodcastDetails(data: unknown): { podcast_transcript?: PodcastTrans
console.warn("Invalid podcast details:", result.error.issues);
return {};
}
return result.data;
return {
podcast_transcript: result.data.podcast_transcript ?? undefined,
};
}
/**

View file

@ -11,26 +11,26 @@ import { cn } from "@/lib/utils";
/**
* Zod schemas for runtime validation
*/
const AspectRatioSchema = z.enum(["1:1", "4:3", "16:9", "9:16", "auto"]);
const AspectRatioSchema = z.enum(["1:1", "4:3", "16:9", "9:16", "21:9", "auto"]);
const ImageFitSchema = z.enum(["cover", "contain"]);
const ImageSourceSchema = z.object({
label: z.string(),
iconUrl: z.string().optional(),
url: z.string().optional(),
iconUrl: z.string().nullish(),
url: z.string().nullish(),
});
const SerializableImageSchema = z.object({
id: z.string(),
assetId: z.string(),
src: z.string(),
alt: z.string(),
title: z.string().optional(),
description: z.string().optional(),
href: z.string().optional(),
domain: z.string().optional(),
ratio: AspectRatioSchema.optional(),
source: ImageSourceSchema.optional(),
alt: z.string().nullish(), // Made optional - will use fallback if missing
title: z.string().nullish(),
description: z.string().nullish(),
href: z.string().nullish(),
domain: z.string().nullish(),
ratio: AspectRatioSchema.nullish(),
source: ImageSourceSchema.nullish(),
});
/**
@ -48,7 +48,7 @@ export interface ImageProps {
id: string;
assetId: string;
src: string;
alt: string;
alt?: string; // Optional with default fallback
title?: string;
description?: string;
href?: string;
@ -62,18 +62,45 @@ export interface ImageProps {
/**
* Parse and validate serializable image from tool result
* Returns a valid SerializableImage with fallback values for missing optional fields
*/
export function parseSerializableImage(result: unknown): SerializableImage {
export function parseSerializableImage(result: unknown): SerializableImage & { alt: string } {
const parsed = SerializableImageSchema.safeParse(result);
if (!parsed.success) {
console.warn("Invalid image data:", parsed.error.issues);
// Try to extract basic info for error display
// Try to extract basic info and return a fallback object
const obj = (result && typeof result === "object" ? result : {}) as Record<string, unknown>;
// If we have at least id, assetId, and src, we can still render the image
if (
typeof obj.id === "string" &&
typeof obj.assetId === "string" &&
typeof obj.src === "string"
) {
return {
id: obj.id,
assetId: obj.assetId,
src: obj.src,
alt: typeof obj.alt === "string" ? obj.alt : "Image",
title: typeof obj.title === "string" ? obj.title : undefined,
description: typeof obj.description === "string" ? obj.description : undefined,
href: typeof obj.href === "string" ? obj.href : undefined,
domain: typeof obj.domain === "string" ? obj.domain : undefined,
ratio: undefined, // Use default ratio
source: undefined,
};
}
throw new Error(`Invalid image: ${parsed.error.issues.map((i) => i.message).join(", ")}`);
}
return parsed.data;
// Provide fallback for alt if it's null/undefined
return {
...parsed.data,
alt: parsed.data.alt ?? "Image",
};
}
/**
@ -89,6 +116,8 @@ function getAspectRatioClass(ratio?: AspectRatio): string {
return "aspect-video";
case "9:16":
return "aspect-[9/16]";
case "21:9":
return "aspect-[21/9]";
case "auto":
default:
return "aspect-[4/3]";
@ -172,7 +201,7 @@ export function ImageLoading({ title = "Loading image..." }: { title?: string })
export function Image({
id,
src,
alt,
alt = "Image",
title,
description,
href,

View file

@ -13,27 +13,27 @@ import { cn } from "@/lib/utils";
/**
* Zod schemas for runtime validation
*/
const AspectRatioSchema = z.enum(["1:1", "4:3", "16:9", "21:9", "auto"]);
const AspectRatioSchema = z.enum(["1:1", "4:3", "16:9", "9:16", "21:9", "auto"]);
const MediaCardKindSchema = z.enum(["link", "image", "video", "audio"]);
const ResponseActionSchema = z.object({
id: z.string(),
label: z.string(),
variant: z.enum(["default", "secondary", "outline", "destructive", "ghost"]).optional(),
confirmLabel: z.string().optional(),
variant: z.enum(["default", "secondary", "outline", "destructive", "ghost"]).nullish(),
confirmLabel: z.string().nullish(),
});
const SerializableMediaCardSchema = z.object({
id: z.string(),
assetId: z.string(),
kind: MediaCardKindSchema,
href: z.string().optional(),
src: z.string().optional(),
href: z.string().nullish(),
src: z.string().nullish(),
title: z.string(),
description: z.string().optional(),
thumb: z.string().optional(),
ratio: AspectRatioSchema.optional(),
domain: z.string().optional(),
description: z.string().nullish(),
thumb: z.string().nullish(),
ratio: AspectRatioSchema.nullish(),
domain: z.string().nullish(),
});
/**
@ -90,6 +90,8 @@ function getAspectRatioClass(ratio?: AspectRatio): string {
return "aspect-[4/3]";
case "16:9":
return "aspect-video";
case "9:16":
return "aspect-[9/16]";
case "21:9":
return "aspect-[21/9]";
case "auto":