feat: add web scraping tool to chat agent for extracting and summarizing webpage content

This commit is contained in:
Anish Sarkar 2025-12-23 01:49:29 +05:30
parent da7cb81252
commit 24dd52ed99
9 changed files with 1018 additions and 76 deletions

View file

@ -18,6 +18,7 @@ from app.agents.new_chat.display_image import create_display_image_tool
from app.agents.new_chat.knowledge_base import create_search_knowledge_base_tool
from app.agents.new_chat.link_preview import create_link_preview_tool
from app.agents.new_chat.podcast import create_generate_podcast_tool
from app.agents.new_chat.scrape_webpage import create_scrape_webpage_tool
from app.agents.new_chat.system_prompt import build_surfsense_system_prompt
from app.services.connector_service import ConnectorService
@ -38,6 +39,8 @@ def create_surfsense_deep_agent(
enable_podcast: bool = True,
enable_link_preview: bool = True,
enable_display_image: bool = True,
enable_scrape_webpage: bool = True,
firecrawl_api_key: str | None = None,
additional_tools: Sequence[BaseTool] | None = None,
):
"""
@ -61,6 +64,10 @@ def create_surfsense_deep_agent(
When True, the agent can fetch and display rich link previews.
enable_display_image: Whether to include the display image tool (default: True).
When True, the agent can display images with metadata.
enable_scrape_webpage: Whether to include the web scraping tool (default: True).
When True, the agent can scrape and read webpage content.
firecrawl_api_key: Optional Firecrawl API key for premium web scraping.
Falls back to Chromium/Trafilatura if not provided.
additional_tools: Optional sequence of additional tools to inject into the agent.
The search_knowledge_base tool will always be included.
@ -96,6 +103,11 @@ def create_surfsense_deep_agent(
display_image_tool = create_display_image_tool()
tools.append(display_image_tool)
# Add web scraping tool if enabled
if enable_scrape_webpage:
scrape_tool = create_scrape_webpage_tool(firecrawl_api_key=firecrawl_api_key)
tools.append(scrape_tool)
if additional_tools:
tools.extend(additional_tools)

View file

@ -86,9 +86,7 @@ def create_display_image_tool():
ratio = "16:9" # Default
if "unsplash.com" in src or "pexels.com" in src:
ratio = "16:9"
elif "imgur.com" in src:
ratio = "auto"
elif "github.com" in src or "githubusercontent.com" in src:
elif "imgur.com" in src or "github.com" in src or "githubusercontent.com" in src:
ratio = "auto"
return {

View file

@ -0,0 +1,197 @@
"""
Web scraping tool for the new chat agent.
This module provides a tool for scraping and extracting content from webpages
using the existing WebCrawlerConnector. The scraped content can be used by
the agent to answer questions about web pages.
"""
import hashlib
from typing import Any
from urllib.parse import urlparse
from langchain_core.tools import tool
from app.connectors.webcrawler_connector import WebCrawlerConnector
def extract_domain(url: str) -> str:
    """
    Return the network-location part of ``url`` without a leading ``www.``.

    The value is taken from ``urlparse(url).netloc``, so a URL without a
    scheme (for example ``"example.com/page"``) yields an empty string.
    Any exception raised while parsing or stripping also yields ``""``.
    """
    try:
        # removeprefix() strips only an exact leading "www." and leaves every
        # other host (including "sub.www.example.com") untouched.
        return urlparse(url).netloc.removeprefix("www.")
    except Exception:
        return ""
def generate_scrape_id(url: str) -> str:
    """
    Build a deterministic identifier for a scraped webpage.

    Args:
        url: The URL the identifier is derived from.

    Returns:
        ``"scrape-"`` followed by the first 12 hex characters of the MD5
        digest of the UTF-8 encoded URL. The same URL always produces the
        same identifier.
    """
    # MD5 is used purely as a short, stable fingerprint here, not for
    # security. Declaring that keeps the call working on Python builds that
    # restrict MD5 (e.g. FIPS mode) without changing the produced value.
    digest = hashlib.md5(url.encode(), usedforsecurity=False).hexdigest()
    return f"scrape-{digest[:12]}"
def truncate_content(content: str, max_length: int = 50000) -> tuple[str, bool]:
    """
    Truncate content to a maximum length.

    When the content is longer than ``max_length`` it is cut at
    ``max_length`` characters, or at the last ``"."`` / blank line before
    that point if that boundary lies within the final 20% of the allowed
    length. A ``"[Content truncated...]"`` marker is then appended, so the
    returned string can be slightly longer than ``max_length``.

    Args:
        content: The text to truncate.
        max_length: Maximum number of characters of ``content`` to keep.
            Negative values are treated as 0.

    Returns:
        Tuple of (truncated_content, was_truncated)
    """
    # A negative limit would make the slice below count from the END of the
    # string and keep almost all of the content; treat it as "keep nothing".
    max_length = max(max_length, 0)

    if len(content) <= max_length:
        return content, False

    truncated = content[:max_length]

    # Prefer ending on a sentence or paragraph boundary, whichever is later.
    boundary = max(truncated.rfind("."), truncated.rfind("\n\n"))
    if boundary > max_length * 0.8:  # Only use boundary if it's not too far back
        truncated = content[: boundary + 1]

    return truncated + "\n\n[Content truncated...]", True
def _scrape_error_result(
    scrape_id: str, url: str, domain: str, message: str
) -> dict[str, Any]:
    """Build the payload returned by scrape_webpage when a scrape fails."""
    return {
        "id": scrape_id,
        "assetId": url,
        "kind": "article",
        "href": url,
        "title": domain or "Webpage",
        "domain": domain,
        "error": message,
    }


def create_scrape_webpage_tool(firecrawl_api_key: str | None = None):
    """
    Factory function to create the scrape_webpage tool.

    Args:
        firecrawl_api_key: Optional Firecrawl API key for premium web scraping.
            Falls back to Chromium/Trafilatura if not provided.

    Returns:
        A configured tool function for scraping webpages.
    """

    @tool
    async def scrape_webpage(
        url: str,
        max_length: int = 50000,
    ) -> dict[str, Any]:
        """
        Scrape and extract the main content from a webpage.
        Use this tool when the user wants you to read, summarize, or answer
        questions about a specific webpage's content. This tool actually
        fetches and reads the full page content.
        Common triggers:
        - "Read this article and summarize it"
        - "What does this page say about X?"
        - "Summarize this blog post for me"
        - "Tell me the key points from this article"
        - "What's in this webpage?"
        Args:
        url: The URL of the webpage to scrape (must be HTTP/HTTPS)
        max_length: Maximum content length to return (default: 50000 chars)
        Returns:
        A dictionary containing:
        - id: Unique identifier for this scrape
        - assetId: The URL (for deduplication)
        - kind: "article" (type of content)
        - href: The URL to open when clicked
        - title: Page title
        - description: Brief description or excerpt
        - content: The extracted main content (markdown format)
        - domain: The domain name
        - word_count: Approximate word count
        - was_truncated: Whether content was truncated
        - error: Error message (if scraping failed)
        """
        # Normalize the URL FIRST. The id and domain must be derived from the
        # same URL that is reported as assetId/href; deriving them from a
        # scheme-less input ("example.com/page") would give an empty domain
        # and an id that does not match the crawled URL.
        url = url.strip()
        if not url.startswith(("http://", "https://")):
            url = f"https://{url}"

        scrape_id = generate_scrape_id(url)
        domain = extract_domain(url)

        try:
            # Create webcrawler connector
            connector = WebCrawlerConnector(firecrawl_api_key=firecrawl_api_key)

            # Crawl the URL
            result, error = await connector.crawl_url(url, formats=["markdown"])

            if error:
                return _scrape_error_result(scrape_id, url, domain, error)
            if not result:
                return _scrape_error_result(
                    scrape_id, url, domain, "No content returned from crawler"
                )

            # `or` (rather than a .get default) also covers keys that are
            # present but explicitly None.
            content = result.get("content") or ""
            metadata = result.get("metadata") or {}

            # Title: metadata first, then domain, then last URL segment.
            title = (
                metadata.get("title") or domain or url.split("/")[-1] or "Webpage"
            )

            # Description: metadata first, else the first paragraph (max 300 chars).
            description = metadata.get("description") or ""
            if not description and content:
                first_para = content.split("\n\n")[0]
                description = (
                    first_para[:300] + "..." if len(first_para) > 300 else first_para
                )

            # Truncate content if needed
            content, was_truncated = truncate_content(content, max_length)

            # Word count is computed on the (possibly truncated) content.
            word_count = len(content.split())

            return {
                "id": scrape_id,
                "assetId": url,
                "kind": "article",
                "href": url,
                "title": title,
                "description": description,
                "content": content,
                "domain": domain,
                "word_count": word_count,
                "was_truncated": was_truncated,
                "crawler_type": result.get("crawler_type", "unknown"),
                "author": metadata.get("author"),
                "date": metadata.get("date"),
            }
        except Exception as e:
            # Tool boundary: never raise into the agent, report the failure
            # as a structured error result instead.
            # NOTE(review): consider routing this through the logging module.
            error_message = str(e)
            print(f"[scrape_webpage] Error scraping {url}: {error_message}")
            return _scrape_error_result(
                scrape_id, url, domain, f"Failed to scrape: {error_message[:100]}"
            )

    return scrape_webpage

View file

@ -173,6 +173,29 @@ You have access to the following tools:
- description: Optional description providing context about the image
- Returns: An image card with the image, title, and description
- The image will automatically be displayed in the chat.
5. scrape_webpage: Scrape and extract the main content from a webpage.
- Use this when the user wants you to READ and UNDERSTAND the actual content of a webpage.
- IMPORTANT: This is different from link_preview:
* link_preview: Only fetches metadata (title, description, thumbnail) for display
* scrape_webpage: Actually reads the FULL page content so you can analyze/summarize it
- Trigger scenarios:
* "Read this article and summarize it"
* "What does this page say about X?"
* "Summarize this blog post for me"
* "Tell me the key points from this article"
* "What's in this webpage?"
* "Can you analyze this article?"
- Args:
- url: The URL of the webpage to scrape (must be HTTP/HTTPS)
- max_length: Maximum content length to return (default: 50000 chars)
- Returns: The page title, description, full content (in markdown), word count, and metadata
- After scraping, you will have the full article text and can analyze, summarize, or answer questions about it.
- IMAGES: The scraped content may contain image URLs in markdown format like `![alt text](image_url)`.
* When you find relevant/important images in the scraped content, use the `display_image` tool to show them to the user.
* This makes your response more visual and engaging.
* Prioritize showing: diagrams, charts, infographics, key illustrations, or images that help explain the content.
* Don't show every image - just the most relevant 1-3 images that enhance understanding.
</tools>
<tool_call_examples>
- User: "Fetch all my notes and what's in them?"
@ -205,6 +228,24 @@ You have access to the following tools:
- User: "Can you display a diagram of a neural network?"
- Call: `display_image(src="https://example.com/neural-network.png", alt="Neural network diagram", title="Neural Network Architecture", description="A visual representation of a neural network with input, hidden, and output layers")`
- User: "Read this article and summarize it for me: https://example.com/blog/ai-trends"
- Call: `scrape_webpage(url="https://example.com/blog/ai-trends")`
- After getting the content, provide a summary based on the scraped text
- User: "What does this page say about machine learning? https://docs.example.com/ml-guide"
- Call: `scrape_webpage(url="https://docs.example.com/ml-guide")`
- Then answer the question using the extracted content
- User: "Summarize this blog post: https://medium.com/some-article"
- Call: `scrape_webpage(url="https://medium.com/some-article")`
- Provide a comprehensive summary of the article content
- User: "Read this tutorial and explain it: https://example.com/ml-tutorial"
- First: `scrape_webpage(url="https://example.com/ml-tutorial")`
- Then, if the content contains useful diagrams/images like `![Neural Network Diagram](https://example.com/nn-diagram.png)`:
- Call: `display_image(src="https://example.com/nn-diagram.png", alt="Neural Network Diagram", title="Neural Network Architecture")`
- Then provide your explanation, referencing the displayed image
</tool_call_examples>{citation_section}
"""

View file

@ -319,6 +319,20 @@ async def stream_new_chat(
status="in_progress",
items=last_active_step_items,
)
elif tool_name == "scrape_webpage":
url = (
tool_input.get("url", "")
if isinstance(tool_input, dict)
else str(tool_input)
)
last_active_step_title = "Scraping webpage"
last_active_step_items = [f"URL: {url[:80]}{'...' if len(url) > 80 else ''}"]
yield streaming_service.format_thinking_step(
step_id=tool_step_id,
title="Scraping webpage",
status="in_progress",
items=last_active_step_items,
)
elif tool_name == "generate_podcast":
podcast_title = (
tool_input.get("podcast_title", "SurfSense Podcast")
@ -398,6 +412,16 @@ async def stream_new_chat(
f"Displaying image: {src[:60]}{'...' if len(src) > 60 else ''}",
"info",
)
elif tool_name == "scrape_webpage":
url = (
tool_input.get("url", "")
if isinstance(tool_input, dict)
else str(tool_input)
)
yield streaming_service.format_terminal_info(
f"Scraping webpage: {url[:70]}{'...' if len(url) > 70 else ''}",
"info",
)
elif tool_name == "generate_podcast":
title = (
tool_input.get("podcast_title", "SurfSense Podcast")
@ -502,6 +526,31 @@ async def stream_new_chat(
status="completed",
items=completed_items,
)
elif tool_name == "scrape_webpage":
# Build completion items for webpage scraping
if isinstance(tool_output, dict):
title = tool_output.get("title", "Webpage")
word_count = tool_output.get("word_count", 0)
has_error = "error" in tool_output
if has_error:
completed_items = [
*last_active_step_items,
f"Error: {tool_output.get('error', 'Failed to scrape')[:50]}",
]
else:
completed_items = [
*last_active_step_items,
f"Title: {title[:50]}{'...' if len(title) > 50 else ''}",
f"Extracted: {word_count:,} words",
]
else:
completed_items = [*last_active_step_items, "Content extracted"]
yield streaming_service.format_thinking_step(
step_id=original_step_id,
title="Scraping webpage",
status="completed",
items=completed_items,
)
elif tool_name == "generate_podcast":
# Build detailed completion items based on podcast status
podcast_status = (
@ -630,6 +679,47 @@ async def stream_new_chat(
f"Image displayed: {title[:40]}{'...' if len(title) > 40 else ''}",
"success",
)
elif tool_name == "scrape_webpage":
# Stream the scrape result so frontend can render the Article component
# Note: We send metadata for display, but content goes to LLM for processing
if isinstance(tool_output, dict):
# Create a display-friendly output (without full content for the card)
display_output = {
k: v for k, v in tool_output.items() if k != "content"
}
# But keep a truncated content preview
if "content" in tool_output:
content = tool_output.get("content", "")
display_output["content_preview"] = (
content[:500] + "..." if len(content) > 500 else content
)
yield streaming_service.format_tool_output_available(
tool_call_id,
display_output,
)
else:
yield streaming_service.format_tool_output_available(
tool_call_id,
{"result": tool_output},
)
# Send terminal message
if isinstance(tool_output, dict) and "error" not in tool_output:
title = tool_output.get("title", "Webpage")
word_count = tool_output.get("word_count", 0)
yield streaming_service.format_terminal_info(
f"Scraped: {title[:40]}{'...' if len(title) > 40 else ''} ({word_count:,} words)",
"success",
)
else:
error_msg = (
tool_output.get("error", "Failed to scrape")
if isinstance(tool_output, dict)
else "Failed to scrape"
)
yield streaming_service.format_terminal_info(
f"Scrape failed: {error_msg}",
"error",
)
elif tool_name == "search_knowledge_base":
# Don't stream the full output for search (can be very large), just acknowledge
yield streaming_service.format_tool_output_available(

View file

@ -13,6 +13,7 @@ import { Thread } from "@/components/assistant-ui/thread";
import { GeneratePodcastToolUI } from "@/components/tool-ui/generate-podcast";
import { LinkPreviewToolUI } from "@/components/tool-ui/link-preview";
import { DisplayImageToolUI } from "@/components/tool-ui/display-image";
import { ScrapeWebpageToolUI } from "@/components/tool-ui/scrape-webpage";
import type { ThinkingStep } from "@/components/tool-ui/deepagent-thinking";
import { getBearerToken } from "@/lib/auth-utils";
import { createAttachmentAdapter, extractAttachmentContent } from "@/lib/chat/attachment-adapter";
@ -81,7 +82,7 @@ function convertToThreadMessage(msg: MessageRecord): ThreadMessageLike {
/**
* Tools that should render custom UI in the chat.
*/
const TOOLS_WITH_UI = new Set(["generate_podcast", "link_preview", "display_image"]);
const TOOLS_WITH_UI = new Set(["generate_podcast", "link_preview", "display_image", "scrape_webpage"]);
/**
* Type for thinking step data from the backend
@ -245,47 +246,74 @@ export default function NewChatPage() {
// Prepare assistant message
const assistantMsgId = `msg-assistant-${Date.now()}`;
let accumulatedText = "";
const currentThinkingSteps = new Map<string, ThinkingStepData>();
const toolCalls = new Map<
string,
{
toolCallId: string;
toolName: string;
args: Record<string, unknown>;
result?: unknown;
// Ordered content parts to preserve inline tool call positions
// Each part is either a text segment or a tool call
type ContentPart =
| { type: "text"; text: string }
| {
type: "tool-call";
toolCallId: string;
toolName: string;
args: Record<string, unknown>;
result?: unknown;
};
const contentParts: ContentPart[] = [];
// Track the current text segment index (for appending text deltas)
let currentTextPartIndex = -1;
// Map to track tool call indices for updating results
const toolCallIndices = new Map<string, number>();
// Helper to get or create the current text part for appending text
const appendText = (delta: string) => {
if (currentTextPartIndex >= 0 && contentParts[currentTextPartIndex]?.type === "text") {
// Append to existing text part
(contentParts[currentTextPartIndex] as { type: "text"; text: string }).text += delta;
} else {
// Create new text part
contentParts.push({ type: "text", text: delta });
currentTextPartIndex = contentParts.length - 1;
}
>();
};
// Helper to add a tool call (this "breaks" the current text segment)
const addToolCall = (toolCallId: string, toolName: string, args: Record<string, unknown>) => {
if (TOOLS_WITH_UI.has(toolName)) {
contentParts.push({
type: "tool-call",
toolCallId,
toolName,
args,
});
toolCallIndices.set(toolCallId, contentParts.length - 1);
// Reset text part index so next text creates a new segment
currentTextPartIndex = -1;
}
};
// Helper to update a tool call's args or result
const updateToolCall = (toolCallId: string, update: { args?: Record<string, unknown>; result?: unknown }) => {
const index = toolCallIndices.get(toolCallId);
if (index !== undefined && contentParts[index]?.type === "tool-call") {
const tc = contentParts[index] as ContentPart & { type: "tool-call" };
if (update.args) tc.args = update.args;
if (update.result !== undefined) tc.result = update.result;
}
};
// Helper to build content for UI (without thinking-steps)
const buildContentForUI = (): ThreadMessageLike["content"] => {
const parts: Array<
| { type: "text"; text: string }
| {
type: "tool-call";
toolCallId: string;
toolName: string;
args: Record<string, unknown>;
result?: unknown;
}
> = [];
if (accumulatedText) {
parts.push({ type: "text", text: accumulatedText });
}
for (const toolCall of toolCalls.values()) {
if (TOOLS_WITH_UI.has(toolCall.toolName)) {
parts.push({
type: "tool-call",
toolCallId: toolCall.toolCallId,
toolName: toolCall.toolName,
args: toolCall.args,
result: toolCall.result,
});
}
}
return parts.length > 0
? (parts as ThreadMessageLike["content"])
// Filter to only include text parts with content and tool-calls with UI
const filtered = contentParts.filter((part) => {
if (part.type === "text") return part.text.length > 0;
if (part.type === "tool-call") return TOOLS_WITH_UI.has(part.toolName);
return false;
});
return filtered.length > 0
? (filtered as ThreadMessageLike["content"])
: [{ type: "text", text: "" }];
};
@ -301,20 +329,15 @@ export default function NewChatPage() {
});
}
if (accumulatedText) {
parts.push({ type: "text", text: accumulatedText });
}
for (const toolCall of toolCalls.values()) {
if (TOOLS_WITH_UI.has(toolCall.toolName)) {
parts.push({
type: "tool-call",
toolCallId: toolCall.toolCallId,
toolName: toolCall.toolName,
args: toolCall.args,
result: toolCall.result,
});
// Add content parts (filtered)
for (const part of contentParts) {
if (part.type === "text" && part.text.length > 0) {
parts.push(part);
} else if (part.type === "tool-call" && TOOLS_WITH_UI.has(part.toolName)) {
parts.push(part);
}
}
return parts.length > 0 ? parts : [{ type: "text", text: "" }];
};
@ -399,7 +422,7 @@ export default function NewChatPage() {
switch (parsed.type) {
case "text-delta":
accumulatedText += parsed.delta;
appendText(parsed.delta);
setMessages((prev) =>
prev.map((m) =>
m.id === assistantMsgId ? { ...m, content: buildContentForUI() } : m
@ -408,11 +431,8 @@ export default function NewChatPage() {
break;
case "tool-input-start":
toolCalls.set(parsed.toolCallId, {
toolCallId: parsed.toolCallId,
toolName: parsed.toolName,
args: {},
});
// Add tool call inline - this breaks the current text segment
addToolCall(parsed.toolCallId, parsed.toolName, {});
setMessages((prev) =>
prev.map((m) =>
m.id === assistantMsgId ? { ...m, content: buildContentForUI() } : m
@ -421,14 +441,12 @@ export default function NewChatPage() {
break;
case "tool-input-available": {
const tc = toolCalls.get(parsed.toolCallId);
if (tc) tc.args = parsed.input || {};
else
toolCalls.set(parsed.toolCallId, {
toolCallId: parsed.toolCallId,
toolName: parsed.toolName,
args: parsed.input || {},
});
// Update existing tool call's args, or add if not exists
if (toolCallIndices.has(parsed.toolCallId)) {
updateToolCall(parsed.toolCallId, { args: parsed.input || {} });
} else {
addToolCall(parsed.toolCallId, parsed.toolName, parsed.input || {});
}
setMessages((prev) =>
prev.map((m) =>
m.id === assistantMsgId ? { ...m, content: buildContentForUI() } : m
@ -438,15 +456,17 @@ export default function NewChatPage() {
}
case "tool-output-available": {
const tc = toolCalls.get(parsed.toolCallId);
if (tc) {
tc.result = parsed.output;
if (
tc.toolName === "generate_podcast" &&
parsed.output?.status === "processing" &&
parsed.output?.task_id
) {
setActivePodcastTaskId(parsed.output.task_id);
// Update the tool call with its result
updateToolCall(parsed.toolCallId, { result: parsed.output });
// Handle podcast-specific logic
if (parsed.output?.status === "processing" && parsed.output?.task_id) {
// Check if this is a podcast tool by looking at the content part
const idx = toolCallIndices.get(parsed.toolCallId);
if (idx !== undefined) {
const part = contentParts[idx];
if (part?.type === "tool-call" && part.toolName === "generate_podcast") {
setActivePodcastTaskId(parsed.output.task_id);
}
}
}
setMessages((prev) =>
@ -491,7 +511,7 @@ export default function NewChatPage() {
// Persist assistant message (with thinking steps for restoration on refresh)
const finalContent = buildContentForPersistence();
if (accumulatedText || toolCalls.size > 0) {
if (contentParts.length > 0) {
appendMessage(threadId, {
role: "assistant",
content: finalContent,
@ -593,6 +613,7 @@ export default function NewChatPage() {
<GeneratePodcastToolUI />
<LinkPreviewToolUI />
<DisplayImageToolUI />
<ScrapeWebpageToolUI />
<div className="h-[calc(100vh-64px)] max-h-[calc(100vh-64px)] overflow-hidden">
<Thread messageThinkingSteps={messageThinkingSteps} />
</div>

View file

@ -0,0 +1,406 @@
"use client";
import { Card, CardContent } from "@/components/ui/card";
import {
Tooltip,
TooltipContent,
TooltipProvider,
TooltipTrigger,
} from "@/components/ui/tooltip";
import { cn } from "@/lib/utils";
import {
AlertCircleIcon,
BookOpenIcon,
CalendarIcon,
ExternalLinkIcon,
FileTextIcon,
UserIcon,
} from "lucide-react";
import { Component, type ReactNode, useCallback } from "react";
/**
* Props accepted by the Article card component.
*
* Only `id` and `title` are required. The Article component renders the
* optional pieces (description, domain, author, date, word count, actions)
* only when the corresponding value is present.
*/
export interface ArticleProps {
/** Unique identifier for the article; Article passes it to the Card as its `id` */
id: string;
/** Asset identifier (usually the URL); not read by the Article component itself */
assetId?: string;
/** Article title */
title: string;
/** Brief description or excerpt */
description?: string;
/** Full content of the article (markdown); accepted as a prop but not rendered by Article */
content?: string;
/** URL to the original article; when set, the card is clickable and opens it in a new tab */
href?: string;
/** Domain of the article source */
domain?: string;
/** Author name */
author?: string;
/** Publication date, rendered as-is (no formatting is applied) */
date?: string;
/** Word count */
wordCount?: number;
/** Whether content was truncated; shown as "(truncated)" next to the word count */
wasTruncated?: boolean;
/** Optional max width, applied as the card's inline `maxWidth` style (Article defaults it to "100%") */
maxWidth?: string;
/** Optional error message; when set, Article renders its error card instead of the article card */
error?: string;
/** Optional className merged into the card's class list */
className?: string;
/** Response actions, rendered as buttons below the article metadata */
responseActions?: Array<{
id: string;
label: string;
// "outline" gets the bordered style; any other value gets the primary style
variant?: "default" | "outline";
}>;
/** Response action handler; called with the `id` of the clicked action */
onResponseAction?: (actionId: string) => void;
}
/**
* Serializable article data type (from backend).
*
* Describes the untyped payload that parseSerializableArticle converts into
* ArticleProps. Word count and truncation flag are declared in both
* snake_case and camelCase because the parser accepts either spelling.
*/
export interface SerializableArticle {
// Unique identifier of the article
id: string;
// Asset identifier (usually the URL)
assetId?: string;
// Discriminator; the only declared value is "article"
kind?: "article";
// Article title
title: string;
// Brief description or excerpt
description?: string;
// Full article content
content?: string;
// URL of the original article
href?: string;
// Domain of the article source
domain?: string;
// Author name
author?: string;
// Publication date
date?: string;
// Word count, snake_case spelling (checked first by the parser)
word_count?: number;
// Word count, camelCase spelling (used when word_count is falsy)
wordCount?: number;
// Truncation flag, snake_case spelling (checked first by the parser)
was_truncated?: boolean;
// Truncation flag, camelCase spelling (used when was_truncated is falsy)
wasTruncated?: boolean;
// Error message, if any
error?: string;
}
/**
 * Parse serializable article data to ArticleProps.
 *
 * The input is untrusted JSON, so every field is validated instead of cast:
 * - a non-object input (null, undefined, string, ...) is treated as an empty
 *   object and yields the fallback id/title instead of throwing;
 * - optional string fields that are not strings (including JSON `null`)
 *   become `undefined`, matching the declared `string | undefined` type;
 * - word count and truncation flag accept snake_case or camelCase keys, with
 *   snake_case taking precedence.
 *
 * @param data - Raw article payload.
 * @returns Props that can be spread onto the Article component.
 */
export function parseSerializableArticle(data: unknown): ArticleProps {
	const obj: Record<string, unknown> =
		typeof data === "object" && data !== null ? (data as Record<string, unknown>) : {};

	const optionalString = (value: unknown): string | undefined =>
		typeof value === "string" ? value : undefined;

	// First positive numeric candidate wins; 0 and non-numbers are treated as
	// "no word count", which is what the previous `||` chain produced as well.
	const wordCount = [obj.word_count, obj.wordCount].find(
		(value): value is number => typeof value === "number" && value > 0
	);

	const wasTruncated = obj.was_truncated === true || obj.wasTruncated === true ? true : undefined;

	return {
		id: String(obj.id || "article-unknown"),
		assetId: optionalString(obj.assetId),
		title: String(obj.title || "Untitled Article"),
		description: optionalString(obj.description),
		content: optionalString(obj.content),
		href: optionalString(obj.href),
		domain: optionalString(obj.domain),
		author: optionalString(obj.author),
		date: optionalString(obj.date),
		wordCount,
		wasTruncated,
		error: optionalString(obj.error),
	};
}
/**
 * Format word count for display.
 *
 * Counts of 1000 or more are shown in thousands with one decimal
 * (e.g. 1500 -> "1.5k words"); smaller counts are shown verbatim, using the
 * singular "word" for exactly 1.
 *
 * @param count - Number of words.
 * @returns Human-readable label.
 */
function formatWordCount(count: number): string {
	if (count >= 1000) {
		return `${(count / 1000).toFixed(1)}k words`;
	}
	// Avoid the ungrammatical "1 words".
	return count === 1 ? "1 word" : `${count} words`;
}
/**
 * Article card component for displaying scraped webpage content.
 *
 * Renders one of two cards:
 * - when `error` is set: a destructive-styled card with the error text and,
 *   if available, the URL;
 * - otherwise: a card with title, optional description, a metadata row
 *   (domain, author, date, word count) and optional response-action buttons.
 *
 * When `href` is set the whole card behaves like a link: clicking it, or
 * pressing Enter/Space while the card itself is focused, opens the URL in a
 * new tab. `content` is part of ArticleProps but is not rendered here.
 */
export function Article({
	id,
	title,
	description,
	href,
	domain,
	author,
	date,
	wordCount,
	wasTruncated,
	maxWidth = "100%",
	error,
	className,
	responseActions,
	onResponseAction,
}: ArticleProps) {
	const handleCardClick = useCallback(() => {
		if (href) {
			window.open(href, "_blank", "noopener,noreferrer");
		}
	}, [href]);

	// Error state
	if (error) {
		return (
			<Card
				id={id}
				className={cn("overflow-hidden border-destructive/20 bg-destructive/5", className)}
				style={{ maxWidth }}
			>
				<CardContent className="p-4">
					<div className="flex items-center gap-3">
						<div className="flex size-10 shrink-0 items-center justify-center rounded-lg bg-destructive/10">
							<AlertCircleIcon className="size-5 text-destructive" />
						</div>
						<div className="flex-1 min-w-0">
							<p className="font-medium text-destructive text-sm">Failed to scrape webpage</p>
							{href && <p className="text-muted-foreground text-xs mt-0.5 truncate">{href}</p>}
							<p className="text-muted-foreground text-xs mt-1">{error}</p>
						</div>
					</div>
				</CardContent>
			</Card>
		);
	}

	// `wordCount && <X />` would make React render a literal "0" when the
	// count is 0, so reduce it to a real boolean before using it in JSX.
	const hasWordCount = typeof wordCount === "number" && wordCount > 0;

	return (
		<TooltipProvider>
			<Card
				id={id}
				className={cn(
					"group relative overflow-hidden transition-all duration-200",
					"hover:shadow-lg hover:border-primary/20",
					href && "cursor-pointer",
					className
				)}
				style={{ maxWidth }}
				onClick={href ? handleCardClick : undefined}
				role={href ? "link" : undefined}
				tabIndex={href ? 0 : undefined}
				onKeyDown={(e) => {
					// Key presses bubble up from nested controls (the response-action
					// buttons). Only react when the card itself is focused, otherwise
					// Enter/Space on a button would be hijacked by the card.
					if (e.target !== e.currentTarget) return;
					if (href && (e.key === "Enter" || e.key === " ")) {
						e.preventDefault();
						handleCardClick();
					}
				}}
			>
				{/* Header */}
				<CardContent className="p-4">
					<div className="flex items-start gap-3">
						{/* Icon */}
						<div className="flex size-10 shrink-0 items-center justify-center rounded-lg bg-primary/10">
							<BookOpenIcon className="size-5 text-primary" />
						</div>

						{/* Content */}
						<div className="flex-1 min-w-0">
							{/* Title */}
							<h3 className="font-semibold text-sm line-clamp-2 group-hover:text-primary transition-colors">
								{title}
							</h3>

							{/* Description */}
							{description && (
								<p className="text-muted-foreground text-xs mt-1 line-clamp-2">{description}</p>
							)}

							{/* Metadata row */}
							<div className="flex flex-wrap items-center gap-x-3 gap-y-1 mt-2 text-xs text-muted-foreground">
								{domain && (
									<Tooltip>
										<TooltipTrigger asChild>
											<span className="flex items-center gap-1">
												<ExternalLinkIcon className="size-3" />
												<span className="truncate max-w-[120px]">{domain}</span>
											</span>
										</TooltipTrigger>
										<TooltipContent>
											<p>Source: {domain}</p>
										</TooltipContent>
									</Tooltip>
								)}

								{author && (
									<Tooltip>
										<TooltipTrigger asChild>
											<span className="flex items-center gap-1">
												<UserIcon className="size-3" />
												<span className="truncate max-w-[100px]">{author}</span>
											</span>
										</TooltipTrigger>
										<TooltipContent>
											<p>Author: {author}</p>
										</TooltipContent>
									</Tooltip>
								)}

								{date && (
									<span className="flex items-center gap-1">
										<CalendarIcon className="size-3" />
										<span>{date}</span>
									</span>
								)}

								{hasWordCount && (
									<Tooltip>
										<TooltipTrigger asChild>
											<span className="flex items-center gap-1">
												<FileTextIcon className="size-3" />
												<span>{formatWordCount(wordCount)}</span>
												{wasTruncated && <span className="text-warning">(truncated)</span>}
											</span>
										</TooltipTrigger>
										<TooltipContent>
											<p>
												{wasTruncated
													? "Content was truncated due to length"
													: "Full article content available"}
											</p>
										</TooltipContent>
									</Tooltip>
								)}
							</div>
						</div>

						{/* External link indicator */}
						{href && (
							<div className="flex-shrink-0 opacity-0 group-hover:opacity-100 transition-opacity">
								<ExternalLinkIcon className="size-4 text-muted-foreground" />
							</div>
						)}
					</div>

					{/* Response actions */}
					{responseActions && responseActions.length > 0 && (
						<div className="flex gap-2 mt-3 pt-3 border-t">
							{responseActions.map((action) => (
								<button
									key={action.id}
									type="button"
									onClick={(e) => {
										// Keep the click from also triggering the card's onClick.
										e.stopPropagation();
										onResponseAction?.(action.id);
									}}
									className={cn(
										"px-3 py-1.5 text-xs font-medium rounded-md transition-colors",
										action.variant === "outline"
											? "border border-input bg-background hover:bg-accent hover:text-accent-foreground"
											: "bg-primary text-primary-foreground hover:bg-primary/90"
									)}
								>
									{action.label}
								</button>
							))}
						</div>
					)}
				</CardContent>
			</Card>
		</TooltipProvider>
	);
}
/**
 * Pulsing placeholder card shown while an article is being fetched.
 *
 * @param props.title - Caption rendered under the placeholder bars
 *   (defaults to "Loading article...").
 */
export function ArticleLoading(props: { title?: string }) {
	const { title = "Loading article..." } = props;

	return (
		<Card className="overflow-hidden animate-pulse">
			<CardContent className="p-4">
				<div className="flex items-start gap-3">
					{/* Icon placeholder */}
					<div className="size-10 rounded-lg bg-muted" />
					{/* Title and two text-line placeholders */}
					<div className="flex-1 space-y-2">
						<div className="h-4 bg-muted rounded w-3/4" />
						<div className="h-3 bg-muted rounded w-full" />
						<div className="h-3 bg-muted rounded w-1/2" />
					</div>
				</div>
				<p className="text-xs text-muted-foreground mt-3">{title}</p>
			</CardContent>
		</Card>
	);
}
/**
 * Static-size skeleton for the article card: an icon placeholder next to
 * three pulsing bars, without any caption.
 */
export function ArticleSkeleton() {
	const placeholderRow = (
		<div className="flex items-start gap-3 animate-pulse">
			<div className="size-10 rounded-lg bg-muted" />
			<div className="flex-1 space-y-2">
				<div className="h-4 bg-muted rounded w-3/4" />
				<div className="h-3 bg-muted rounded w-full" />
				<div className="h-3 bg-muted rounded w-2/3" />
			</div>
		</div>
	);

	return (
		<Card className="overflow-hidden">
			<CardContent className="p-4">{placeholderRow}</CardContent>
		</Card>
	);
}
/**
 * Props of ArticleErrorBoundary.
 */
interface ErrorBoundaryProps {
	/** Subtree to protect */
	children: ReactNode;
	/** Optional replacement UI rendered after a child has thrown */
	fallback?: ReactNode;
}

/**
 * State of ArticleErrorBoundary.
 */
interface ErrorBoundaryState {
	/** True once a descendant has thrown during rendering */
	hasError: boolean;
}

/**
 * Error boundary for the article component.
 *
 * Renders its children until one of them throws; from then on it renders
 * `fallback` if one was provided, otherwise a small built-in
 * "Failed to render article" card.
 */
export class ArticleErrorBoundary extends Component<ErrorBoundaryProps, ErrorBoundaryState> {
	state: ErrorBoundaryState = { hasError: false };

	static getDerivedStateFromError(): ErrorBoundaryState {
		return { hasError: true };
	}

	render() {
		if (!this.state.hasError) {
			return this.props.children;
		}

		return (
			this.props.fallback || (
				<Card className="overflow-hidden border-destructive/20 bg-destructive/5">
					<CardContent className="p-4">
						<div className="flex items-center gap-3">
							<AlertCircleIcon className="size-5 text-destructive" />
							<p className="text-sm text-destructive">Failed to render article</p>
						</div>
					</CardContent>
				</Card>
			)
		);
	}
}

View file

@ -46,3 +46,17 @@ export {
type DisplayImageArgs,
type DisplayImageResult,
} from "./display-image";
export {
Article,
ArticleErrorBoundary,
ArticleLoading,
ArticleSkeleton,
parseSerializableArticle,
type ArticleProps,
type SerializableArticle,
} from "./article";
export {
ScrapeWebpageToolUI,
type ScrapeWebpageArgs,
type ScrapeWebpageResult,
} from "./scrape-webpage";

View file

@ -0,0 +1,163 @@
"use client";
import { makeAssistantToolUI } from "@assistant-ui/react";
import { AlertCircleIcon, FileTextIcon } from "lucide-react";
import {
Article,
ArticleErrorBoundary,
ArticleLoading,
parseSerializableArticle,
} from "@/components/tool-ui/article";
/**
* Type definitions for the scrape_webpage tool.
*
* ScrapeWebpageArgs describes the arguments of a scrape_webpage tool call as
* they are handed to the tool UI.
*/
interface ScrapeWebpageArgs {
// URL of the webpage to scrape
url: string;
// Optional cap on the returned content length
max_length?: number;
}
// Result payload of the scrape_webpage tool as consumed by ScrapeWebpageToolUI.
// Field names keep the backend's snake_case where applicable.
interface ScrapeWebpageResult {
// Identifier of this scrape
id: string;
// Asset identifier (the scraped URL)
assetId: string;
// Discriminator; always "article"
kind: "article";
// URL to open when the card is clicked
href: string;
// Page title
title: string;
// Brief description or excerpt
description?: string;
// Extracted page content.
// NOTE(review): the streaming code in this commit strips `content` and sends a
// `content_preview` field instead — confirm whether this field is ever set here.
content?: string;
// Domain of the scraped page
domain?: string;
// Author, if the page metadata provided one
author?: string;
// Publication date, if the page metadata provided one
date?: string;
// Approximate word count of the extracted content
word_count?: number;
// Whether the content was truncated
was_truncated?: boolean;
// Which crawler produced the result
crawler_type?: string;
// Error message; when set, the tool UI renders the error state
error?: string;
}
/**
 * Card shown when scraping a webpage failed.
 *
 * @param props.url - The URL that was being scraped.
 * @param props.error - Human-readable failure reason.
 */
function ScrapeErrorState(props: { url: string; error: string }) {
	const { url, error } = props;

	return (
		<div className="my-4 overflow-hidden rounded-xl border border-destructive/20 bg-destructive/5 p-4 max-w-md">
			<div className="flex items-center gap-4">
				{/* Icon badge */}
				<div className="flex size-12 shrink-0 items-center justify-center rounded-lg bg-destructive/10">
					<AlertCircleIcon className="size-6 text-destructive" />
				</div>
				{/* Headline, URL and reason */}
				<div className="flex-1 min-w-0">
					<p className="font-medium text-destructive text-sm">Failed to scrape webpage</p>
					<p className="text-muted-foreground text-xs mt-0.5 truncate">{url}</p>
					<p className="text-muted-foreground text-xs mt-1">{error}</p>
				</div>
			</div>
		</div>
	);
}
/**
 * Muted, struck-through row shown when a scrape was cancelled.
 *
 * @param props.url - The URL whose scrape was cancelled.
 */
function ScrapeCancelledState(props: { url: string }) {
	const { url } = props;

	return (
		<div className="my-4 rounded-xl border border-muted p-4 text-muted-foreground max-w-md">
			<p className="flex items-center gap-2">
				<FileTextIcon className="size-4" />
				<span className="line-through truncate">Scraping: {url}</span>
			</p>
		</div>
	);
}
/**
 * Parses a raw tool result and renders it as an Article card with a single
 * "Open Source" action that opens the article URL in a new tab.
 */
function ParsedArticle({ result }: { result: unknown }) {
	const article = parseSerializableArticle(result);

	// Only the "open" action exists, and it needs a URL to do anything.
	const handleAction = (actionId: string) => {
		if (actionId !== "open" || !article.href) {
			return;
		}
		window.open(article.href, "_blank", "noopener,noreferrer");
	};

	return (
		<Article
			{...article}
			maxWidth="480px"
			responseActions={[{ id: "open", label: "Open Source", variant: "default" }]}
			onResponseAction={handleAction}
		/>
	);
}
/**
 * Scrape Webpage Tool UI Component
 *
 * This component is registered with assistant-ui to render an article card
 * when the scrape_webpage tool is called by the agent.
 *
 * It displays scraped webpage content including:
 * - Title and description
 * - Author and date (if available)
 * - Word count
 * - Link to original source
 *
 * Rendering by state:
 * - running / requires-action: loading placeholder
 * - incomplete + cancelled: cancelled row
 * - incomplete + error, or incomplete for any other reason without a result:
 *   error card
 * - no result yet: loading placeholder
 * - result with `error`: error card
 * - otherwise: article card wrapped in an error boundary
 */
export const ScrapeWebpageToolUI = makeAssistantToolUI<ScrapeWebpageArgs, ScrapeWebpageResult>({
	toolName: "scrape_webpage",
	render: function ScrapeWebpageUI({ args, result, status }) {
		const url = args.url || "Unknown URL";

		// Loading state - tool is still running
		if (status.type === "running" || status.type === "requires-action") {
			return (
				<div className="my-4">
					<ArticleLoading title={`Scraping ${url}...`} />
				</div>
			);
		}

		// Incomplete/cancelled state
		if (status.type === "incomplete") {
			if (status.reason === "cancelled") {
				return <ScrapeCancelledState url={url} />;
			}
			// An explicit error always shows the error card. Any other incomplete
			// reason without a result must be terminal too; otherwise it would
			// fall through to the "no result yet" placeholder and load forever.
			if (status.reason === "error" || !result) {
				return (
					<ScrapeErrorState
						url={url}
						error={typeof status.error === "string" ? status.error : "An error occurred"}
					/>
				);
			}
		}

		// No result yet
		if (!result) {
			return (
				<div className="my-4">
					<ArticleLoading title={`Extracting content from ${url}...`} />
				</div>
			);
		}

		// Error result from the tool
		if (result.error) {
			return <ScrapeErrorState url={url} error={result.error} />;
		}

		// Success - render the article card
		return (
			<div className="my-4">
				<ArticleErrorBoundary>
					<ParsedArticle result={result} />
				</ArticleErrorBoundary>
			</div>
		);
	},
});
export type { ScrapeWebpageArgs, ScrapeWebpageResult };