feat: add web scraping tool to chat agent for extracting and summarizing webpage content

2026-07-26 23:51:14 +02:00 · 2025-12-23 01:49:29 +05:30 · 2025-12-23 01:49:29 +05:30 · 24dd52ed99
commit 24dd52ed99
parent da7cb81252
9 changed files with 1018 additions and 76 deletions
--- a/surfsense_backend/app/agents/new_chat/chat_deepagent.py
+++ b/surfsense_backend/app/agents/new_chat/chat_deepagent.py
@ -18,6 +18,7 @@ from app.agents.new_chat.display_image import create_display_image_tool
 from app.agents.new_chat.knowledge_base import create_search_knowledge_base_tool
 from app.agents.new_chat.link_preview import create_link_preview_tool
 from app.agents.new_chat.podcast import create_generate_podcast_tool
+from app.agents.new_chat.scrape_webpage import create_scrape_webpage_tool
 from app.agents.new_chat.system_prompt import build_surfsense_system_prompt
 from app.services.connector_service import ConnectorService

@ -38,6 +39,8 @@ def create_surfsense_deep_agent(
    enable_podcast: bool = True,
    enable_link_preview: bool = True,
    enable_display_image: bool = True,
+    enable_scrape_webpage: bool = True,
+    firecrawl_api_key: str | None = None,
    additional_tools: Sequence[BaseTool] | None = None,
 ):
    """
@ -61,6 +64,10 @@ def create_surfsense_deep_agent(
                            When True, the agent can fetch and display rich link previews.
        enable_display_image: Whether to include the display image tool (default: True).
                             When True, the agent can display images with metadata.
+        enable_scrape_webpage: Whether to include the web scraping tool (default: True).
+                              When True, the agent can scrape and read webpage content.
+        firecrawl_api_key: Optional Firecrawl API key for premium web scraping.
+                          Falls back to Chromium/Trafilatura if not provided.
        additional_tools: Optional sequence of additional tools to inject into the agent.
                         The search_knowledge_base tool will always be included.

@ -96,6 +103,11 @@ def create_surfsense_deep_agent(
        display_image_tool = create_display_image_tool()
        tools.append(display_image_tool)

+    # Add web scraping tool if enabled
+    if enable_scrape_webpage:
+        scrape_tool = create_scrape_webpage_tool(firecrawl_api_key=firecrawl_api_key)
+        tools.append(scrape_tool)
+
    if additional_tools:
        tools.extend(additional_tools)

--- a/surfsense_backend/app/agents/new_chat/display_image.py
+++ b/surfsense_backend/app/agents/new_chat/display_image.py
@ -86,9 +86,7 @@ def create_display_image_tool():
        ratio = "16:9"  # Default
        if "unsplash.com" in src or "pexels.com" in src:
            ratio = "16:9"
-        elif "imgur.com" in src:
-            ratio = "auto"
-        elif "github.com" in src or "githubusercontent.com" in src:
+        elif "imgur.com" in src or "github.com" in src or "githubusercontent.com" in src:
            ratio = "auto"
        
        return {
--- a/surfsense_backend/app/agents/new_chat/scrape_webpage.py
+++ b/surfsense_backend/app/agents/new_chat/scrape_webpage.py
@ -0,0 +1,197 @@
+"""
+Web scraping tool for the new chat agent.
+
+This module provides a tool for scraping and extracting content from webpages
+using the existing WebCrawlerConnector. The scraped content can be used by
+the agent to answer questions about web pages.
+"""
+
+import hashlib
+from typing import Any
+from urllib.parse import urlparse
+
+from langchain_core.tools import tool
+
+from app.connectors.webcrawler_connector import WebCrawlerConnector
+
+
+def extract_domain(url: str) -> str:
+    """
+    Return the host part of *url* for display, without a leading ``www.``.
+
+    The parsed hostname is used rather than the raw network location so that
+    credentials (``user:pass@``) or a port (``:8080``) embedded in the URL
+    never end up in the domain label.
+
+    Args:
+        url: Absolute URL, e.g. ``https://www.example.com/page``.
+
+    Returns:
+        The lower-cased hostname (``example.com``), or an empty string when
+        the URL has no host or cannot be parsed.
+    """
+    try:
+        # .hostname is None when the URL carries no network location at all
+        # (for example when the scheme is missing).
+        host = urlparse(url).hostname or ""
+        return host.removeprefix("www.")
+    except (ValueError, TypeError, AttributeError):
+        # urlparse raises ValueError for malformed hosts such as an unclosed
+        # IPv6 bracket; the other two cover non-string input.
+        return ""
+
+
+def generate_scrape_id(url: str) -> str:
+    """
+    Build a stable identifier for a scraped page from its URL.
+
+    The ID is ``scrape-`` followed by the first 12 hex characters of the URL's
+    MD5 digest, so the same URL string always maps to the same ID.
+
+    Args:
+        url: The URL the scrape refers to.
+
+    Returns:
+        An identifier of the form ``scrape-<12 hex chars>``.
+    """
+    # MD5 is only a cheap fingerprint here, not a security control; declaring
+    # that keeps the call working on FIPS-restricted Python builds.
+    digest = hashlib.md5(url.encode(), usedforsecurity=False).hexdigest()
+    return f"scrape-{digest[:12]}"
+
+
+def truncate_content(content: str, max_length: int = 50000) -> tuple[str, bool]:
+    """
+    Cap *content* at roughly ``max_length`` characters.
+
+    When the content is too long it is cut at ``max_length`` and, if a sentence
+    end (``.``) or a paragraph break (blank line) lies within the last 20% of
+    that window, pulled back to that boundary. A ``[Content truncated...]``
+    marker is then appended, so the returned string can be slightly longer
+    than ``max_length``.
+
+    Args:
+        content: Text to shorten.
+        max_length: Maximum number of characters of *content* to keep.
+            Negative values are treated as 0.
+
+    Returns:
+        Tuple of (truncated_content, was_truncated).
+    """
+    # A negative limit would make the slice below count from the end of the
+    # string and keep almost everything, so clamp it.
+    max_length = max(max_length, 0)
+    if len(content) <= max_length:
+        return content, False
+
+    truncated = content[:max_length]
+    # Latest sentence end or paragraph break inside the window (-1 if none)
+    boundary = max(truncated.rfind("."), truncated.rfind("\n\n"))
+    if boundary > max_length * 0.8:  # Only use boundary if it's not too far back
+        truncated = content[: boundary + 1]
+
+    return truncated + "\n\n[Content truncated...]", True
+
+
+def create_scrape_webpage_tool(firecrawl_api_key: str | None = None):
+    """
+    Factory function to create the scrape_webpage tool.
+
+    The returned tool never raises: crawler errors, empty results and
+    unexpected exceptions are all reported through an ``error`` key on the
+    returned dictionary.
+
+    Args:
+        firecrawl_api_key: Optional Firecrawl API key for premium web scraping.
+                          Falls back to Chromium/Trafilatura if not provided.
+
+    Returns:
+        A configured tool function for scraping webpages.
+    """
+
+    @tool
+    async def scrape_webpage(
+        url: str,
+        max_length: int = 50000,
+    ) -> dict[str, Any]:
+        """
+        Scrape and extract the main content from a webpage.
+
+        Use this tool when the user wants you to read, summarize, or answer
+        questions about a specific webpage's content. This tool actually
+        fetches and reads the full page content.
+
+        Common triggers:
+        - "Read this article and summarize it"
+        - "What does this page say about X?"
+        - "Summarize this blog post for me"
+        - "Tell me the key points from this article"
+        - "What's in this webpage?"
+
+        Args:
+            url: The URL of the webpage to scrape (must be HTTP/HTTPS)
+            max_length: Maximum content length to return (default: 50000 chars)
+
+        Returns:
+            A dictionary containing:
+            - id: Unique identifier for this scrape
+            - assetId: The URL (for deduplication)
+            - kind: "article" (type of content)
+            - href: The URL to open when clicked
+            - title: Page title
+            - description: Brief description or excerpt
+            - content: The extracted main content (markdown format)
+            - domain: The domain name
+            - word_count: Approximate word count
+            - was_truncated: Whether content was truncated
+            - error: Error message (if scraping failed)
+        """
+        # Normalize the URL *before* deriving the ID and domain from it.
+        # Otherwise a scheme-less input such as "example.com/post" yields an
+        # empty domain and an ID that differs from the same page given with a
+        # scheme. URL schemes are case-insensitive, hence the lower() check.
+        url = url.strip()
+        if not url.lower().startswith(("http://", "https://")):
+            url = f"https://{url}"
+
+        scrape_id = generate_scrape_id(url)
+        domain = extract_domain(url)
+
+        def failure(message: str) -> dict[str, Any]:
+            # Minimal card returned on every failure path
+            return {
+                "id": scrape_id,
+                "assetId": url,
+                "kind": "article",
+                "href": url,
+                "title": domain or "Webpage",
+                "domain": domain,
+                "error": message,
+            }
+
+        try:
+            # Create webcrawler connector
+            connector = WebCrawlerConnector(firecrawl_api_key=firecrawl_api_key)
+
+            # Crawl the URL
+            result, error = await connector.crawl_url(url, formats=["markdown"])
+
+            if error:
+                return failure(error)
+            if not result:
+                return failure("No content returned from crawler")
+
+            # `or` (rather than a .get default) also covers keys that are
+            # present but set to None.
+            content = result.get("content") or ""
+            metadata = result.get("metadata") or {}
+
+            # Title: metadata first, then domain, then last URL path segment
+            title = metadata.get("title", "")
+            if not title:
+                title = domain or url.split("/")[-1] or "Webpage"
+
+            # Description: metadata first, else the first paragraph (capped)
+            description = metadata.get("description", "")
+            if not description and content:
+                first_para = content.split("\n\n")[0]
+                if len(first_para) > 300:
+                    description = first_para[:300] + "..."
+                else:
+                    description = first_para
+
+            # Truncate content if needed
+            content, was_truncated = truncate_content(content, max_length)
+
+            # Word count is taken after truncation, i.e. of what is returned
+            word_count = len(content.split())
+
+            return {
+                "id": scrape_id,
+                "assetId": url,
+                "kind": "article",
+                "href": url,
+                "title": title,
+                "description": description,
+                "content": content,
+                "domain": domain,
+                "word_count": word_count,
+                "was_truncated": was_truncated,
+                "crawler_type": result.get("crawler_type", "unknown"),
+                "author": metadata.get("author"),
+                "date": metadata.get("date"),
+            }
+
+        except Exception as e:
+            # Deliberate catch-all: a tool failure must come back to the agent
+            # as data instead of aborting the whole chat turn.
+            error_message = str(e)
+            print(f"[scrape_webpage] Error scraping {url}: {error_message}")
+            return failure(f"Failed to scrape: {error_message[:100]}")
+
+    return scrape_webpage
+
--- a/surfsense_backend/app/agents/new_chat/system_prompt.py
+++ b/surfsense_backend/app/agents/new_chat/system_prompt.py
@ -173,6 +173,29 @@ You have access to the following tools:
    - description: Optional description providing context about the image
  - Returns: An image card with the image, title, and description
  - The image will automatically be displayed in the chat.
+
+5. scrape_webpage: Scrape and extract the main content from a webpage.
+  - Use this when the user wants you to READ and UNDERSTAND the actual content of a webpage.
+  - IMPORTANT: This is different from link_preview:
+    * link_preview: Only fetches metadata (title, description, thumbnail) for display
+    * scrape_webpage: Actually reads the FULL page content so you can analyze/summarize it
+  - Trigger scenarios:
+    * "Read this article and summarize it"
+    * "What does this page say about X?"
+    * "Summarize this blog post for me"
+    * "Tell me the key points from this article"
+    * "What's in this webpage?"
+    * "Can you analyze this article?"
+  - Args:
+    - url: The URL of the webpage to scrape (must be HTTP/HTTPS)
+    - max_length: Maximum content length to return (default: 50000 chars)
+  - Returns: The page title, description, full content (in markdown), word count, and metadata
+  - After scraping, you will have the full article text and can analyze, summarize, or answer questions about it.
+  - IMAGES: The scraped content may contain image URLs in markdown format like `![alt text](image_url)`.
+    * When you find relevant/important images in the scraped content, use the `display_image` tool to show them to the user.
+    * This makes your response more visual and engaging.
+    * Prioritize showing: diagrams, charts, infographics, key illustrations, or images that help explain the content.
+    * Don't show every image - just the most relevant 1-3 images that enhance understanding.
 </tools>
 <tool_call_examples>
 - User: "Fetch all my notes and what's in them?"
@ -205,6 +228,24 @@ You have access to the following tools:

 - User: "Can you display a diagram of a neural network?"
  - Call: `display_image(src="https://example.com/neural-network.png", alt="Neural network diagram", title="Neural Network Architecture", description="A visual representation of a neural network with input, hidden, and output layers")`
+
+- User: "Read this article and summarize it for me: https://example.com/blog/ai-trends"
+  - Call: `scrape_webpage(url="https://example.com/blog/ai-trends")`
+  - After getting the content, provide a summary based on the scraped text
+
+- User: "What does this page say about machine learning? https://docs.example.com/ml-guide"
+  - Call: `scrape_webpage(url="https://docs.example.com/ml-guide")`
+  - Then answer the question using the extracted content
+
+- User: "Summarize this blog post: https://medium.com/some-article"
+  - Call: `scrape_webpage(url="https://medium.com/some-article")`
+  - Provide a comprehensive summary of the article content
+
+- User: "Read this tutorial and explain it: https://example.com/ml-tutorial"
+  - First: `scrape_webpage(url="https://example.com/ml-tutorial")`
+  - Then, if the content contains useful diagrams/images like `![Neural Network Diagram](https://example.com/nn-diagram.png)`:
+    - Call: `display_image(src="https://example.com/nn-diagram.png", alt="Neural Network Diagram", title="Neural Network Architecture")`
+  - Then provide your explanation, referencing the displayed image
 </tool_call_examples>{citation_section}
 """

--- a/surfsense_backend/app/tasks/chat/stream_new_chat.py
+++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py
@ -319,6 +319,20 @@ async def stream_new_chat(
                        status="in_progress",
                        items=last_active_step_items,
                    )
+                elif tool_name == "scrape_webpage":
+                    url = (
+                        tool_input.get("url", "")
+                        if isinstance(tool_input, dict)
+                        else str(tool_input)
+                    )
+                    last_active_step_title = "Scraping webpage"
+                    last_active_step_items = [f"URL: {url[:80]}{'...' if len(url) > 80 else ''}"]
+                    yield streaming_service.format_thinking_step(
+                        step_id=tool_step_id,
+                        title="Scraping webpage",
+                        status="in_progress",
+                        items=last_active_step_items,
+                    )
                elif tool_name == "generate_podcast":
                    podcast_title = (
                        tool_input.get("podcast_title", "SurfSense Podcast")
@ -398,6 +412,16 @@ async def stream_new_chat(
                        f"Displaying image: {src[:60]}{'...' if len(src) > 60 else ''}",
                        "info",
                    )
+                elif tool_name == "scrape_webpage":
+                    url = (
+                        tool_input.get("url", "")
+                        if isinstance(tool_input, dict)
+                        else str(tool_input)
+                    )
+                    yield streaming_service.format_terminal_info(
+                        f"Scraping webpage: {url[:70]}{'...' if len(url) > 70 else ''}",
+                        "info",
+                    )
                elif tool_name == "generate_podcast":
                    title = (
                        tool_input.get("podcast_title", "SurfSense Podcast")
@ -502,6 +526,31 @@ async def stream_new_chat(
                        status="completed",
                        items=completed_items,
                    )
+                elif tool_name == "scrape_webpage":
+                    # Build completion items for webpage scraping
+                    if isinstance(tool_output, dict):
+                        title = tool_output.get("title", "Webpage")
+                        word_count = tool_output.get("word_count", 0)
+                        has_error = "error" in tool_output
+                        if has_error:
+                            completed_items = [
+                                *last_active_step_items,
+                                f"Error: {tool_output.get('error', 'Failed to scrape')[:50]}",
+                            ]
+                        else:
+                            completed_items = [
+                                *last_active_step_items,
+                                f"Title: {title[:50]}{'...' if len(title) > 50 else ''}",
+                                f"Extracted: {word_count:,} words",
+                            ]
+                    else:
+                        completed_items = [*last_active_step_items, "Content extracted"]
+                    yield streaming_service.format_thinking_step(
+                        step_id=original_step_id,
+                        title="Scraping webpage",
+                        status="completed",
+                        items=completed_items,
+                    )
                elif tool_name == "generate_podcast":
                    # Build detailed completion items based on podcast status
                    podcast_status = (
@ -630,6 +679,47 @@ async def stream_new_chat(
                            f"Image displayed: {title[:40]}{'...' if len(title) > 40 else ''}",
                            "success",
                        )
+                elif tool_name == "scrape_webpage":
+                    # Stream the scrape result so frontend can render the Article component
+                    # Note: We send metadata for display, but content goes to LLM for processing
+                    if isinstance(tool_output, dict):
+                        # Create a display-friendly output (without full content for the card)
+                        display_output = {
+                            k: v for k, v in tool_output.items() if k != "content"
+                        }
+                        # But keep a truncated content preview
+                        if "content" in tool_output:
+                            content = tool_output.get("content", "")
+                            display_output["content_preview"] = (
+                                content[:500] + "..." if len(content) > 500 else content
+                            )
+                        yield streaming_service.format_tool_output_available(
+                            tool_call_id,
+                            display_output,
+                        )
+                    else:
+                        yield streaming_service.format_tool_output_available(
+                            tool_call_id,
+                            {"result": tool_output},
+                        )
+                    # Send terminal message
+                    if isinstance(tool_output, dict) and "error" not in tool_output:
+                        title = tool_output.get("title", "Webpage")
+                        word_count = tool_output.get("word_count", 0)
+                        yield streaming_service.format_terminal_info(
+                            f"Scraped: {title[:40]}{'...' if len(title) > 40 else ''} ({word_count:,} words)",
+                            "success",
+                        )
+                    else:
+                        error_msg = (
+                            tool_output.get("error", "Failed to scrape")
+                            if isinstance(tool_output, dict)
+                            else "Failed to scrape"
+                        )
+                        yield streaming_service.format_terminal_info(
+                            f"Scrape failed: {error_msg}",
+                            "error",
+                        )
                elif tool_name == "search_knowledge_base":
                    # Don't stream the full output for search (can be very large), just acknowledge
                    yield streaming_service.format_tool_output_available(
--- a/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx
@ -13,6 +13,7 @@ import { Thread } from "@/components/assistant-ui/thread";
 import { GeneratePodcastToolUI } from "@/components/tool-ui/generate-podcast";
 import { LinkPreviewToolUI } from "@/components/tool-ui/link-preview";
 import { DisplayImageToolUI } from "@/components/tool-ui/display-image";
+import { ScrapeWebpageToolUI } from "@/components/tool-ui/scrape-webpage";
 import type { ThinkingStep } from "@/components/tool-ui/deepagent-thinking";
 import { getBearerToken } from "@/lib/auth-utils";
 import { createAttachmentAdapter, extractAttachmentContent } from "@/lib/chat/attachment-adapter";
@ -81,7 +82,7 @@ function convertToThreadMessage(msg: MessageRecord): ThreadMessageLike {
 /**
 * Tools that should render custom UI in the chat.
 */
-const TOOLS_WITH_UI = new Set(["generate_podcast", "link_preview", "display_image"]);
+const TOOLS_WITH_UI = new Set(["generate_podcast", "link_preview", "display_image", "scrape_webpage"]);

 /**
 * Type for thinking step data from the backend
@ -245,47 +246,74 @@ export default function NewChatPage() {

 			// Prepare assistant message
 			const assistantMsgId = `msg-assistant-${Date.now()}`;
-			let accumulatedText = "";
 			const currentThinkingSteps = new Map<string, ThinkingStepData>();
-			const toolCalls = new Map<
-				string,
-				{
-					toolCallId: string;
-					toolName: string;
-					args: Record<string, unknown>;
-					result?: unknown;
+			
+			// Ordered content parts to preserve inline tool call positions
+			// Each part is either a text segment or a tool call
+			type ContentPart = 
+				| { type: "text"; text: string }
+				| {
+						type: "tool-call";
+						toolCallId: string;
+						toolName: string;
+						args: Record<string, unknown>;
+						result?: unknown;
+				  };
+			const contentParts: ContentPart[] = [];
+			
+			// Track the current text segment index (for appending text deltas)
+			let currentTextPartIndex = -1;
+			
+			// Map to track tool call indices for updating results
+			const toolCallIndices = new Map<string, number>();
+			
+			// Helper to get or create the current text part for appending text
+			const appendText = (delta: string) => {
+				if (currentTextPartIndex >= 0 && contentParts[currentTextPartIndex]?.type === "text") {
+					// Append to existing text part
+					(contentParts[currentTextPartIndex] as { type: "text"; text: string }).text += delta;
+				} else {
+					// Create new text part
+					contentParts.push({ type: "text", text: delta });
+					currentTextPartIndex = contentParts.length - 1;
 				}
-			>();
+			};
+			
+			// Helper to add a tool call (this "breaks" the current text segment)
+			const addToolCall = (toolCallId: string, toolName: string, args: Record<string, unknown>) => {
+				if (TOOLS_WITH_UI.has(toolName)) {
+					contentParts.push({
+						type: "tool-call",
+						toolCallId,
+						toolName,
+						args,
+					});
+					toolCallIndices.set(toolCallId, contentParts.length - 1);
+					// Reset text part index so next text creates a new segment
+					currentTextPartIndex = -1;
+				}
+			};
+			
+			// Helper to update a tool call's args or result
+			const updateToolCall = (toolCallId: string, update: { args?: Record<string, unknown>; result?: unknown }) => {
+				const index = toolCallIndices.get(toolCallId);
+				if (index !== undefined && contentParts[index]?.type === "tool-call") {
+					const tc = contentParts[index] as ContentPart & { type: "tool-call" };
+					if (update.args) tc.args = update.args;
+					if (update.result !== undefined) tc.result = update.result;
+				}
+			};

 			// Helper to build content for UI (without thinking-steps)
 			const buildContentForUI = (): ThreadMessageLike["content"] => {
-				const parts: Array<
-					| { type: "text"; text: string }
-					| {
-							type: "tool-call";
-							toolCallId: string;
-							toolName: string;
-							args: Record<string, unknown>;
-							result?: unknown;
-					  }
-				> = [];
-				
-				if (accumulatedText) {
-					parts.push({ type: "text", text: accumulatedText });
-				}
-				for (const toolCall of toolCalls.values()) {
-					if (TOOLS_WITH_UI.has(toolCall.toolName)) {
-						parts.push({
-							type: "tool-call",
-							toolCallId: toolCall.toolCallId,
-							toolName: toolCall.toolName,
-							args: toolCall.args,
-							result: toolCall.result,
-						});
-					}
-				}
-				return parts.length > 0
-					? (parts as ThreadMessageLike["content"])
+				// Filter to only include text parts with content and tool-calls with UI
+				const filtered = contentParts.filter((part) => {
+					if (part.type === "text") return part.text.length > 0;
+					if (part.type === "tool-call") return TOOLS_WITH_UI.has(part.toolName);
+					return false;
+				});
+				return filtered.length > 0
+					? (filtered as ThreadMessageLike["content"])
 					: [{ type: "text", text: "" }];
 			};

@ -301,20 +329,15 @@ export default function NewChatPage() {
 					});
 				}
 				
-				if (accumulatedText) {
-					parts.push({ type: "text", text: accumulatedText });
-				}
-				for (const toolCall of toolCalls.values()) {
-					if (TOOLS_WITH_UI.has(toolCall.toolName)) {
-						parts.push({
-							type: "tool-call",
-							toolCallId: toolCall.toolCallId,
-							toolName: toolCall.toolName,
-							args: toolCall.args,
-							result: toolCall.result,
-						});
+				// Add content parts (filtered)
+				for (const part of contentParts) {
+					if (part.type === "text" && part.text.length > 0) {
+						parts.push(part);
+					} else if (part.type === "tool-call" && TOOLS_WITH_UI.has(part.toolName)) {
+						parts.push(part);
 					}
 				}
+				
 				return parts.length > 0 ? parts : [{ type: "text", text: "" }];
 			};

@ -399,7 +422,7 @@ export default function NewChatPage() {

 									switch (parsed.type) {
 										case "text-delta":
-											accumulatedText += parsed.delta;
+											appendText(parsed.delta);
 											setMessages((prev) =>
 												prev.map((m) =>
 													m.id === assistantMsgId ? { ...m, content: buildContentForUI() } : m
@ -408,11 +431,8 @@ export default function NewChatPage() {
 											break;

 										case "tool-input-start":
-											toolCalls.set(parsed.toolCallId, {
-												toolCallId: parsed.toolCallId,
-												toolName: parsed.toolName,
-												args: {},
-											});
+											// Add tool call inline - this breaks the current text segment
+											addToolCall(parsed.toolCallId, parsed.toolName, {});
 											setMessages((prev) =>
 												prev.map((m) =>
 													m.id === assistantMsgId ? { ...m, content: buildContentForUI() } : m
@ -421,14 +441,12 @@ export default function NewChatPage() {
 											break;

 										case "tool-input-available": {
-											const tc = toolCalls.get(parsed.toolCallId);
-											if (tc) tc.args = parsed.input || {};
-											else
-												toolCalls.set(parsed.toolCallId, {
-													toolCallId: parsed.toolCallId,
-													toolName: parsed.toolName,
-													args: parsed.input || {},
-												});
+											// Update existing tool call's args, or add if not exists
+											if (toolCallIndices.has(parsed.toolCallId)) {
+												updateToolCall(parsed.toolCallId, { args: parsed.input || {} });
+											} else {
+												addToolCall(parsed.toolCallId, parsed.toolName, parsed.input || {});
+											}
 											setMessages((prev) =>
 												prev.map((m) =>
 													m.id === assistantMsgId ? { ...m, content: buildContentForUI() } : m
@ -438,15 +456,17 @@ export default function NewChatPage() {
 										}

 										case "tool-output-available": {
-											const tc = toolCalls.get(parsed.toolCallId);
-											if (tc) {
-												tc.result = parsed.output;
-												if (
-													tc.toolName === "generate_podcast" &&
-													parsed.output?.status === "processing" &&
-													parsed.output?.task_id
-												) {
-													setActivePodcastTaskId(parsed.output.task_id);
+											// Update the tool call with its result
+											updateToolCall(parsed.toolCallId, { result: parsed.output });
+											// Handle podcast-specific logic
+											if (parsed.output?.status === "processing" && parsed.output?.task_id) {
+												// Check if this is a podcast tool by looking at the content part
+												const idx = toolCallIndices.get(parsed.toolCallId);
+												if (idx !== undefined) {
+													const part = contentParts[idx];
+													if (part?.type === "tool-call" && part.toolName === "generate_podcast") {
+														setActivePodcastTaskId(parsed.output.task_id);
+													}
 												}
 											}
 											setMessages((prev) =>
@ -491,7 +511,7 @@ export default function NewChatPage() {

 				// Persist assistant message (with thinking steps for restoration on refresh)
 				const finalContent = buildContentForPersistence();
-				if (accumulatedText || toolCalls.size > 0) {
+				if (contentParts.length > 0) {
 					appendMessage(threadId, {
 						role: "assistant",
 						content: finalContent,
@ -593,6 +613,7 @@ export default function NewChatPage() {
 			<GeneratePodcastToolUI />
 			<LinkPreviewToolUI />
 			<DisplayImageToolUI />
+			<ScrapeWebpageToolUI />
 			<div className="h-[calc(100vh-64px)] max-h-[calc(100vh-64px)] overflow-hidden">
 				<Thread messageThinkingSteps={messageThinkingSteps} />
 			</div>
--- a/surfsense_web/components/tool-ui/article/index.tsx
+++ b/surfsense_web/components/tool-ui/article/index.tsx
@ -0,0 +1,406 @@
+"use client";
+
+import { Card, CardContent } from "@/components/ui/card";
+import {
+	Tooltip,
+	TooltipContent,
+	TooltipProvider,
+	TooltipTrigger,
+} from "@/components/ui/tooltip";
+import { cn } from "@/lib/utils";
+import {
+	AlertCircleIcon,
+	BookOpenIcon,
+	CalendarIcon,
+	ExternalLinkIcon,
+	FileTextIcon,
+	UserIcon,
+} from "lucide-react";
+import { Component, type ReactNode, useCallback } from "react";
+
+/**
+ * Props accepted by the Article card component.
+ */
+export interface ArticleProps {
+	/** Unique identifier for the article; applied as the `id` of the rendered card */
+	id: string;
+	/** Asset identifier (usually the URL); not read by the Article component itself */
+	assetId?: string;
+	/** Article title */
+	title: string;
+	/** Brief description or excerpt, shown under the title */
+	description?: string;
+	/** Full content of the article (markdown); accepted but not rendered by the card */
+	content?: string;
+	/** URL to the original article; when set, the whole card opens it in a new tab */
+	href?: string;
+	/** Domain of the article source */
+	domain?: string;
+	/** Author name */
+	author?: string;
+	/** Publication date, rendered as given (no formatting is applied) */
+	date?: string;
+	/** Word count, shown in the metadata row */
+	wordCount?: number;
+	/** Whether content was truncated; adds a "(truncated)" marker next to the word count */
+	wasTruncated?: boolean;
+	/** Optional max width, used as the card's CSS `maxWidth` (defaults to "100%") */
+	maxWidth?: string;
+	/** Optional error message; when set, an error card is rendered instead of the article */
+	error?: string;
+	/** Optional className merged into the card's classes */
+	className?: string;
+	/** Response actions, rendered as buttons at the bottom of the card */
+	responseActions?: Array<{
+		id: string;
+		label: string;
+		variant?: "default" | "outline";
+	}>;
+	/** Response action handler, called with the `id` of the clicked action */
+	onResponseAction?: (actionId: string) => void;
+}
+
+/**
+ * Serializable article data type (from backend).
+ *
+ * The word-count and truncation fields are declared in both snake_case and
+ * camelCase; parseSerializableArticle reads either spelling.
+ */
+export interface SerializableArticle {
+	id: string;
+	assetId?: string;
+	kind?: "article";
+	title: string;
+	description?: string;
+	content?: string;
+	href?: string;
+	domain?: string;
+	author?: string;
+	date?: string;
+	// snake_case / camelCase variants of the same value
+	word_count?: number;
+	wordCount?: number;
+	// snake_case / camelCase variants of the same flag
+	was_truncated?: boolean;
+	wasTruncated?: boolean;
+	error?: string;
+}
+
+/**
+ * Convert an untyped article payload into ArticleProps.
+ *
+ * Accepts either snake_case (`word_count`, `was_truncated`) or camelCase
+ * (`wordCount`, `wasTruncated`) keys. A missing or falsy `id` / `title` is
+ * replaced with a placeholder. The remaining optional fields are passed
+ * through with a type cast only; their runtime types are not validated.
+ *
+ * @param data - Raw payload. Anything that is not a non-null object is treated
+ *   as an empty object, so `null` / `undefined` yield placeholder props
+ *   instead of throwing.
+ * @returns Props that can be spread onto the Article component.
+ */
+export function parseSerializableArticle(data: unknown): ArticleProps {
+	// Property access on null/undefined would throw a TypeError, so fall back
+	// to an empty record for non-object input.
+	const obj: Record<string, unknown> =
+		typeof data === "object" && data !== null ? (data as Record<string, unknown>) : {};
+	return {
+		id: String(obj.id || "article-unknown"),
+		assetId: obj.assetId as string | undefined,
+		title: String(obj.title || "Untitled Article"),
+		description: obj.description as string | undefined,
+		content: obj.content as string | undefined,
+		href: obj.href as string | undefined,
+		domain: obj.domain as string | undefined,
+		author: obj.author as string | undefined,
+		date: obj.date as string | undefined,
+		wordCount: (obj.word_count || obj.wordCount) as number | undefined,
+		wasTruncated: (obj.was_truncated || obj.wasTruncated) as boolean | undefined,
+		error: obj.error as string | undefined,
+	};
+}
+
+/**
+ * Format a word count for display.
+ *
+ * Counts of 1000 or more are abbreviated to one decimal place with a "k"
+ * suffix (e.g. 1500 -> "1.5k words"). Smaller counts are shown as-is, using
+ * the singular "word" for exactly 1.
+ */
+function formatWordCount(count: number): string {
+	if (count >= 1000) {
+		return `${(count / 1000).toFixed(1)}k words`;
+	}
+	// Avoid the ungrammatical "1 words".
+	return count === 1 ? "1 word" : `${count} words`;
+}
+
+/**
+ * Article card component for displaying scraped webpage content.
+ *
+ * Renders one of two layouts:
+ * - when `error` is set, a compact error card showing the message (and `href`
+ *   if present);
+ * - otherwise a card with the title, optional description, a metadata row
+ *   (domain, author, date, word count) and optional response-action buttons.
+ *
+ * When `href` is set the whole card behaves as a link: clicking it, or
+ * pressing Enter/Space while the card itself has focus, opens `href` in a new
+ * tab. The `content` and `assetId` props are accepted but not rendered here.
+ */
+export function Article({
+	id,
+	title,
+	description,
+	href,
+	domain,
+	author,
+	date,
+	wordCount,
+	wasTruncated,
+	maxWidth = "100%",
+	error,
+	className,
+	responseActions,
+	onResponseAction,
+}: ArticleProps) {
+	const handleCardClick = useCallback(() => {
+		if (href) {
+			window.open(href, "_blank", "noopener,noreferrer");
+		}
+	}, [href]);
+
+	// Error state
+	if (error) {
+		return (
+			<Card
+				id={id}
+				className={cn(
+					"overflow-hidden border-destructive/20 bg-destructive/5",
+					className
+				)}
+				style={{ maxWidth }}
+			>
+				<CardContent className="p-4">
+					<div className="flex items-center gap-3">
+						<div className="flex size-10 shrink-0 items-center justify-center rounded-lg bg-destructive/10">
+							<AlertCircleIcon className="size-5 text-destructive" />
+						</div>
+						<div className="flex-1 min-w-0">
+							<p className="font-medium text-destructive text-sm">
+								Failed to scrape webpage
+							</p>
+							{href && (
+								<p className="text-muted-foreground text-xs mt-0.5 truncate">
+									{href}
+								</p>
+							)}
+							<p className="text-muted-foreground text-xs mt-1">{error}</p>
+						</div>
+					</div>
+				</CardContent>
+			</Card>
+		);
+	}
+
+	return (
+		<TooltipProvider>
+			<Card
+				id={id}
+				className={cn(
+					"group relative overflow-hidden transition-all duration-200",
+					"hover:shadow-lg hover:border-primary/20",
+					href && "cursor-pointer",
+					className
+				)}
+				style={{ maxWidth }}
+				onClick={href ? handleCardClick : undefined}
+				role={href ? "link" : undefined}
+				tabIndex={href ? 0 : undefined}
+				onKeyDown={(e) => {
+					// Only react to keys pressed on the card itself. Key events
+					// bubbling up from nested controls (the response-action
+					// buttons) must keep their own default activation.
+					if (!href || e.target !== e.currentTarget) {
+						return;
+					}
+					if (e.key === "Enter" || e.key === " ") {
+						e.preventDefault();
+						handleCardClick();
+					}
+				}}
+			>
+				{/* Header */}
+				<CardContent className="p-4">
+					<div className="flex items-start gap-3">
+						{/* Icon */}
+						<div className="flex size-10 shrink-0 items-center justify-center rounded-lg bg-primary/10">
+							<BookOpenIcon className="size-5 text-primary" />
+						</div>
+
+						{/* Content */}
+						<div className="flex-1 min-w-0">
+							{/* Title */}
+							<h3 className="font-semibold text-sm line-clamp-2 group-hover:text-primary transition-colors">
+								{title}
+							</h3>
+
+							{/* Description */}
+							{description && (
+								<p className="text-muted-foreground text-xs mt-1 line-clamp-2">
+									{description}
+								</p>
+							)}
+
+							{/* Metadata row */}
+							<div className="flex flex-wrap items-center gap-x-3 gap-y-1 mt-2 text-xs text-muted-foreground">
+								{domain && (
+									<Tooltip>
+										<TooltipTrigger asChild>
+											<span className="flex items-center gap-1">
+												<ExternalLinkIcon className="size-3" />
+												<span className="truncate max-w-[120px]">{domain}</span>
+											</span>
+										</TooltipTrigger>
+										<TooltipContent>
+											<p>Source: {domain}</p>
+										</TooltipContent>
+									</Tooltip>
+								)}
+
+								{author && (
+									<Tooltip>
+										<TooltipTrigger asChild>
+											<span className="flex items-center gap-1">
+												<UserIcon className="size-3" />
+												<span className="truncate max-w-[100px]">{author}</span>
+											</span>
+										</TooltipTrigger>
+										<TooltipContent>
+											<p>Author: {author}</p>
+										</TooltipContent>
+									</Tooltip>
+								)}
+
+								{date && (
+									<span className="flex items-center gap-1">
+										<CalendarIcon className="size-3" />
+										<span>{date}</span>
+									</span>
+								)}
+
+								{/* Explicit boolean test: `wordCount && ...` would render a literal "0" (or "NaN") */}
+								{typeof wordCount === "number" && wordCount > 0 && (
+									<Tooltip>
+										<TooltipTrigger asChild>
+											<span className="flex items-center gap-1">
+												<FileTextIcon className="size-3" />
+												<span>{formatWordCount(wordCount)}</span>
+												{wasTruncated && (
+													<span className="text-warning">(truncated)</span>
+												)}
+											</span>
+										</TooltipTrigger>
+										<TooltipContent>
+											<p>
+												{wasTruncated
+													? "Content was truncated due to length"
+													: "Full article content available"}
+											</p>
+										</TooltipContent>
+									</Tooltip>
+								)}
+							</div>
+						</div>
+
+						{/* External link indicator */}
+						{href && (
+							<div className="flex-shrink-0 opacity-0 group-hover:opacity-100 transition-opacity">
+								<ExternalLinkIcon className="size-4 text-muted-foreground" />
+							</div>
+						)}
+					</div>
+
+					{/* Response actions */}
+					{responseActions && responseActions.length > 0 && (
+						<div className="flex gap-2 mt-3 pt-3 border-t">
+							{responseActions.map((action) => (
+								<button
+									key={action.id}
+									type="button"
+									onClick={(e) => {
+										// Keep the click from also triggering the card's own link behaviour.
+										e.stopPropagation();
+										onResponseAction?.(action.id);
+									}}
+									className={cn(
+										"px-3 py-1.5 text-xs font-medium rounded-md transition-colors",
+										action.variant === "outline"
+											? "border border-input bg-background hover:bg-accent hover:text-accent-foreground"
+											: "bg-primary text-primary-foreground hover:bg-primary/90"
+									)}
+								>
+									{action.label}
+								</button>
+							))}
+						</div>
+					)}
+				</CardContent>
+			</Card>
+		</TooltipProvider>
+	);
+}
+
+/**
+ * Pulsing placeholder card shown while an article is being fetched.
+ *
+ * @param title - Caption rendered under the placeholder bars
+ *   (defaults to "Loading article...").
+ */
+export function ArticleLoading({
+	title = "Loading article...",
+}: { title?: string }) {
+	// Class names of the three placeholder text bars, top to bottom.
+	const barClasses = [
+		"h-4 bg-muted rounded w-3/4",
+		"h-3 bg-muted rounded w-full",
+		"h-3 bg-muted rounded w-1/2",
+	];
+
+	return (
+		<Card className="overflow-hidden animate-pulse">
+			<CardContent className="p-4">
+				<div className="flex items-start gap-3">
+					<div className="size-10 rounded-lg bg-muted" />
+					<div className="flex-1 space-y-2">
+						{barClasses.map((barClass) => (
+							<div key={barClass} className={barClass} />
+						))}
+					</div>
+				</div>
+				<p className="text-xs text-muted-foreground mt-3">{title}</p>
+			</CardContent>
+		</Card>
+	);
+}
+
+/**
+ * Caption-less skeleton placeholder for the article card.
+ */
+export function ArticleSkeleton() {
+	// Class names of the three placeholder text bars, top to bottom.
+	const barClasses = [
+		"h-4 bg-muted rounded w-3/4",
+		"h-3 bg-muted rounded w-full",
+		"h-3 bg-muted rounded w-2/3",
+	];
+
+	return (
+		<Card className="overflow-hidden">
+			<CardContent className="p-4">
+				<div className="flex items-start gap-3 animate-pulse">
+					<div className="size-10 rounded-lg bg-muted" />
+					<div className="flex-1 space-y-2">
+						{barClasses.map((barClass) => (
+							<div key={barClass} className={barClass} />
+						))}
+					</div>
+				</div>
+			</CardContent>
+		</Card>
+	);
+}
+
+/**
+ * Props for ArticleErrorBoundary.
+ */
+interface ErrorBoundaryProps {
+	/** Subtree rendered while no error has been caught */
+	children: ReactNode;
+	/** Rendered in place of `children` after an error; a default error card is used when omitted or falsy */
+	fallback?: ReactNode;
+}
+
+/**
+ * State for ArticleErrorBoundary.
+ */
+interface ErrorBoundaryState {
+	/** Set to true by getDerivedStateFromError; never reset by the boundary */
+	hasError: boolean;
+}
+
+/**
+ * Error boundary wrapping the article card.
+ *
+ * Renders its children until a descendant throws during rendering; from then
+ * on it renders `fallback` if one was supplied, otherwise a default
+ * "Failed to render article" card.
+ */
+export class ArticleErrorBoundary extends Component<
+	ErrorBoundaryProps,
+	ErrorBoundaryState
+> {
+	state: ErrorBoundaryState = { hasError: false };
+
+	static getDerivedStateFromError(): ErrorBoundaryState {
+		return { hasError: true };
+	}
+
+	render() {
+		const { children, fallback } = this.props;
+
+		// Happy path: nothing has thrown, render the wrapped subtree.
+		if (!this.state.hasError) {
+			return children;
+		}
+
+		// Prefer the caller-supplied fallback when it is truthy.
+		if (fallback) {
+			return fallback;
+		}
+
+		return (
+			<Card className="overflow-hidden border-destructive/20 bg-destructive/5">
+				<CardContent className="p-4">
+					<div className="flex items-center gap-3">
+						<AlertCircleIcon className="size-5 text-destructive" />
+						<p className="text-sm text-destructive">
+							Failed to render article
+						</p>
+					</div>
+				</CardContent>
+			</Card>
+		);
+	}
+}
+
--- a/surfsense_web/components/tool-ui/index.ts
+++ b/surfsense_web/components/tool-ui/index.ts
@ -46,3 +46,17 @@ export {
  type DisplayImageArgs,
  type DisplayImageResult,
 } from "./display-image";
+export {
+  Article,
+  ArticleErrorBoundary,
+  ArticleLoading,
+  ArticleSkeleton,
+  parseSerializableArticle,
+  type ArticleProps,
+  type SerializableArticle,
+} from "./article";
+export {
+  ScrapeWebpageToolUI,
+  type ScrapeWebpageArgs,
+  type ScrapeWebpageResult,
+} from "./scrape-webpage";
--- a/surfsense_web/components/tool-ui/scrape-webpage.tsx
+++ b/surfsense_web/components/tool-ui/scrape-webpage.tsx
@ -0,0 +1,163 @@
+"use client";
+
+import { makeAssistantToolUI } from "@assistant-ui/react";
+import { AlertCircleIcon, FileTextIcon } from "lucide-react";
+import {
+	Article,
+	ArticleErrorBoundary,
+	ArticleLoading,
+	parseSerializableArticle,
+} from "@/components/tool-ui/article";
+
+/**
+ * Arguments of the scrape_webpage tool call as seen by this UI.
+ */
+interface ScrapeWebpageArgs {
+	/** URL being scraped; shown in the loading, cancelled and error states */
+	url: string;
+	/** Not read by this component — presumably a cap on returned content length; confirm against the backend tool */
+	max_length?: number;
+}
+
+/**
+ * Result payload of the scrape_webpage tool as consumed by this UI.
+ *
+ * On success the whole object is handed to parseSerializableArticle; when
+ * `error` is set the UI shows an error state instead.
+ */
+interface ScrapeWebpageResult {
+	id: string;
+	assetId: string;
+	kind: "article";
+	href: string;
+	title: string;
+	description?: string;
+	content?: string;
+	domain?: string;
+	author?: string;
+	date?: string;
+	word_count?: number;
+	was_truncated?: boolean;
+	// Not read by this component.
+	crawler_type?: string;
+	// When truthy, an error state is rendered instead of the article card.
+	error?: string;
+}
+
+/**
+ * Error panel rendered when scraping a webpage fails.
+ *
+ * Shows a fixed headline, the URL that was being scraped, and the error text.
+ */
+function ScrapeErrorState(props: { url: string; error: string }) {
+	const { url, error } = props;
+
+	const icon = (
+		<div className="flex size-12 shrink-0 items-center justify-center rounded-lg bg-destructive/10">
+			<AlertCircleIcon className="size-6 text-destructive" />
+		</div>
+	);
+
+	const details = (
+		<div className="flex-1 min-w-0">
+			<p className="font-medium text-destructive text-sm">Failed to scrape webpage</p>
+			<p className="text-muted-foreground text-xs mt-0.5 truncate">{url}</p>
+			<p className="text-muted-foreground text-xs mt-1">{error}</p>
+		</div>
+	);
+
+	return (
+		<div className="my-4 overflow-hidden rounded-xl border border-destructive/20 bg-destructive/5 p-4 max-w-md">
+			<div className="flex items-center gap-4">
+				{icon}
+				{details}
+			</div>
+		</div>
+	);
+}
+
+/**
+ * Muted panel rendered when a scrape was cancelled; the URL is struck through.
+ */
+function ScrapeCancelledState(props: { url: string }) {
+	const struckLabel = (
+		<span className="line-through truncate">Scraping: {props.url}</span>
+	);
+
+	return (
+		<div className="my-4 rounded-xl border border-muted p-4 text-muted-foreground max-w-md">
+			<p className="flex items-center gap-2">
+				<FileTextIcon className="size-4" />
+				{struckLabel}
+			</p>
+		</div>
+	);
+}
+
+/**
+ * Parse a raw tool result and render it as an Article card.
+ *
+ * The result is normalised with parseSerializableArticle and rendered at a
+ * fixed max width of 480px. An "Open Source" action button is offered only
+ * when the parsed article has an `href`, since the button has nothing to open
+ * otherwise. This component does no error handling of its own; callers wrap
+ * it in ArticleErrorBoundary.
+ */
+function ParsedArticle({ result }: { result: unknown }) {
+	const article = parseSerializableArticle(result);
+	const sourceUrl = article.href;
+
+	return (
+		<Article
+			{...article}
+			maxWidth="480px"
+			responseActions={
+				// Without a source URL the button would be a dead control.
+				sourceUrl
+					? [{ id: "open", label: "Open Source", variant: "default" }]
+					: undefined
+			}
+			onResponseAction={(id) => {
+				if (id === "open" && sourceUrl) {
+					window.open(sourceUrl, "_blank", "noopener,noreferrer");
+				}
+			}}
+		/>
+	);
+}
+
+/**
+ * Scrape Webpage Tool UI Component
+ *
+ * This component is registered with assistant-ui to render an article card
+ * when the scrape_webpage tool is called by the agent.
+ *
+ * It displays scraped webpage content including:
+ * - Title and description
+ * - Author and date (if available)
+ * - Word count
+ * - Link to original source
+ *
+ * Rendering by tool-call status:
+ * - "running" / "requires-action": loading placeholder
+ * - "incomplete": cancelled panel, or an error panel (also used for any other
+ *   incomplete reason when no result was produced, so the UI does not stay on
+ *   the loading placeholder forever)
+ * - otherwise: loading placeholder until a result exists, then either the
+ *   tool's own error or the article card wrapped in an error boundary
+ */
+export const ScrapeWebpageToolUI = makeAssistantToolUI<
+	ScrapeWebpageArgs,
+	ScrapeWebpageResult
+>({
+	toolName: "scrape_webpage",
+	render: function ScrapeWebpageUI({ args, result, status }) {
+		const url = args.url || "Unknown URL";
+
+		// Loading state - tool is still running
+		if (status.type === "running" || status.type === "requires-action") {
+			return (
+				<div className="my-4">
+					<ArticleLoading title={`Scraping ${url}...`} />
+				</div>
+			);
+		}
+
+		// Incomplete/cancelled state
+		if (status.type === "incomplete") {
+			if (status.reason === "cancelled") {
+				return <ScrapeCancelledState url={url} />;
+			}
+			if (status.reason === "error") {
+				return (
+					<ScrapeErrorState
+						url={url}
+						error={typeof status.error === "string" ? status.error : "An error occurred"}
+					/>
+				);
+			}
+			// Any other incomplete reason: the call has ended, so with no
+			// result there is nothing left to wait for.
+			if (!result) {
+				return <ScrapeErrorState url={url} error="An error occurred" />;
+			}
+		}
+
+		// No result yet
+		if (!result) {
+			return (
+				<div className="my-4">
+					<ArticleLoading title={`Extracting content from ${url}...`} />
+				</div>
+			);
+		}
+
+		// Error result from the tool
+		if (result.error) {
+			return <ScrapeErrorState url={url} error={result.error} />;
+		}
+
+		// Success - render the article card
+		return (
+			<div className="my-4">
+				<ArticleErrorBoundary>
+					<ParsedArticle result={result} />
+				</ArticleErrorBoundary>
+			</div>
+		);
+	},
+});
+
+export type { ScrapeWebpageArgs, ScrapeWebpageResult };
+