diff --git a/surfsense_backend/app/agents/new_chat/chat_deepagent.py b/surfsense_backend/app/agents/new_chat/chat_deepagent.py index e6b2aca06..aa1950bff 100644 --- a/surfsense_backend/app/agents/new_chat/chat_deepagent.py +++ b/surfsense_backend/app/agents/new_chat/chat_deepagent.py @@ -18,6 +18,7 @@ from app.agents.new_chat.display_image import create_display_image_tool from app.agents.new_chat.knowledge_base import create_search_knowledge_base_tool from app.agents.new_chat.link_preview import create_link_preview_tool from app.agents.new_chat.podcast import create_generate_podcast_tool +from app.agents.new_chat.scrape_webpage import create_scrape_webpage_tool from app.agents.new_chat.system_prompt import build_surfsense_system_prompt from app.services.connector_service import ConnectorService @@ -38,6 +39,8 @@ def create_surfsense_deep_agent( enable_podcast: bool = True, enable_link_preview: bool = True, enable_display_image: bool = True, + enable_scrape_webpage: bool = True, + firecrawl_api_key: str | None = None, additional_tools: Sequence[BaseTool] | None = None, ): """ @@ -61,6 +64,10 @@ def create_surfsense_deep_agent( When True, the agent can fetch and display rich link previews. enable_display_image: Whether to include the display image tool (default: True). When True, the agent can display images with metadata. + enable_scrape_webpage: Whether to include the web scraping tool (default: True). + When True, the agent can scrape and read webpage content. + firecrawl_api_key: Optional Firecrawl API key for premium web scraping. + Falls back to Chromium/Trafilatura if not provided. additional_tools: Optional sequence of additional tools to inject into the agent. The search_knowledge_base tool will always be included. @@ -96,6 +103,11 @@ def create_surfsense_deep_agent( display_image_tool = create_display_image_tool() tools.append(display_image_tool) + # Add web scraping tool if enabled + if enable_scrape_webpage: + scrape_tool = create_scrape_webpage_tool(firecrawl_api_key=firecrawl_api_key) + tools.append(scrape_tool) + if additional_tools: tools.extend(additional_tools) diff --git a/surfsense_backend/app/agents/new_chat/display_image.py b/surfsense_backend/app/agents/new_chat/display_image.py index 03153300a..0cd05b523 100644 --- a/surfsense_backend/app/agents/new_chat/display_image.py +++ b/surfsense_backend/app/agents/new_chat/display_image.py @@ -86,9 +86,7 @@ def create_display_image_tool(): ratio = "16:9" # Default if "unsplash.com" in src or "pexels.com" in src: ratio = "16:9" - elif "imgur.com" in src: - ratio = "auto" - elif "github.com" in src or "githubusercontent.com" in src: + elif "imgur.com" in src or "github.com" in src or "githubusercontent.com" in src: ratio = "auto" return { diff --git a/surfsense_backend/app/agents/new_chat/scrape_webpage.py b/surfsense_backend/app/agents/new_chat/scrape_webpage.py new file mode 100644 index 000000000..40a9c917f --- /dev/null +++ b/surfsense_backend/app/agents/new_chat/scrape_webpage.py @@ -0,0 +1,197 @@ +""" +Web scraping tool for the new chat agent. + +This module provides a tool for scraping and extracting content from webpages +using the existing WebCrawlerConnector. The scraped content can be used by +the agent to answer questions about web pages. 
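+
+A minimal usage sketch (docstring-only illustration; the agent normally
+invokes the tool itself, and `ainvoke` is the standard LangChain entry point):
+
+    tool = create_scrape_webpage_tool(firecrawl_api_key=None)
+    result = await tool.ainvoke({"url": "https://example.com/post"})
+    # result["content"] holds extracted markdown; result["error"] is set on failure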
+""" + +import hashlib +from typing import Any +from urllib.parse import urlparse + +from langchain_core.tools import tool + +from app.connectors.webcrawler_connector import WebCrawlerConnector + + +def extract_domain(url: str) -> str: + """Extract the domain from a URL.""" + try: + parsed = urlparse(url) + domain = parsed.netloc + # Remove 'www.' prefix if present + if domain.startswith("www."): + domain = domain[4:] + return domain + except Exception: + return "" + + +def generate_scrape_id(url: str) -> str: + """Generate a unique ID for a scraped webpage.""" + hash_val = hashlib.md5(url.encode()).hexdigest()[:12] + return f"scrape-{hash_val}" + + +def truncate_content(content: str, max_length: int = 50000) -> tuple[str, bool]: + """ + Truncate content to a maximum length. + + Returns: + Tuple of (truncated_content, was_truncated) + """ + if len(content) <= max_length: + return content, False + + # Try to truncate at a sentence boundary + truncated = content[:max_length] + last_period = truncated.rfind(".") + last_newline = truncated.rfind("\n\n") + + # Use the later of the two boundaries, or just truncate + boundary = max(last_period, last_newline) + if boundary > max_length * 0.8: # Only use boundary if it's not too far back + truncated = content[: boundary + 1] + + return truncated + "\n\n[Content truncated...]", True + + +def create_scrape_webpage_tool(firecrawl_api_key: str | None = None): + """ + Factory function to create the scrape_webpage tool. + + Args: + firecrawl_api_key: Optional Firecrawl API key for premium web scraping. + Falls back to Chromium/Trafilatura if not provided. + + Returns: + A configured tool function for scraping webpages. + """ + + @tool + async def scrape_webpage( + url: str, + max_length: int = 50000, + ) -> dict[str, Any]: + """ + Scrape and extract the main content from a webpage. + + Use this tool when the user wants you to read, summarize, or answer + questions about a specific webpage's content. This tool actually + fetches and reads the full page content. + + Common triggers: + - "Read this article and summarize it" + - "What does this page say about X?" + - "Summarize this blog post for me" + - "Tell me the key points from this article" + - "What's in this webpage?" 
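+
+        Example return shape (values are illustrative, not real output):
+            {"id": "scrape-1a2b3c4d5e6f", "kind": "article", "title": "...",
+             "content": "...markdown...", "word_count": 1234, "was_truncated": False}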
+
+        Args:
+            url: The URL of the webpage to scrape (must be HTTP/HTTPS)
+            max_length: Maximum content length to return (default: 50000 chars)
+
+        Returns:
+            A dictionary containing:
+            - id: Unique identifier for this scrape
+            - assetId: The URL (for deduplication)
+            - kind: "article" (type of content)
+            - href: The URL to open when clicked
+            - title: Page title
+            - description: Brief description or excerpt
+            - content: The extracted main content (markdown format)
+            - domain: The domain name
+            - word_count: Approximate word count
+            - was_truncated: Whether content was truncated
+            - error: Error message (if scraping failed)
+        """
+        # Validate and normalize the URL first so the scrape ID and domain
+        # are derived from the URL that is actually crawled
+        if not url.startswith(("http://", "https://")):
+            url = f"https://{url}"
+
+        scrape_id = generate_scrape_id(url)
+        domain = extract_domain(url)
+
+        try:
+            # Create webcrawler connector
+            connector = WebCrawlerConnector(firecrawl_api_key=firecrawl_api_key)
+
+            # Crawl the URL
+            result, error = await connector.crawl_url(url, formats=["markdown"])
+
+            if error:
+                return {
+                    "id": scrape_id,
+                    "assetId": url,
+                    "kind": "article",
+                    "href": url,
+                    "title": domain or "Webpage",
+                    "domain": domain,
+                    "error": error,
+                }
+
+            if not result:
+                return {
+                    "id": scrape_id,
+                    "assetId": url,
+                    "kind": "article",
+                    "href": url,
+                    "title": domain or "Webpage",
+                    "domain": domain,
+                    "error": "No content returned from crawler",
+                }
+
+            # Extract content and metadata
+            content = result.get("content", "")
+            metadata = result.get("metadata", {})
+
+            # Get title from metadata
+            title = metadata.get("title", "")
+            if not title:
+                title = domain or url.split("/")[-1] or "Webpage"
+
+            # Get description from metadata
+            description = metadata.get("description", "")
+            if not description and content:
+                # Use first paragraph as description
+                first_para = content.split("\n\n")[0] if content else ""
+                description = first_para[:300] + "..." if len(first_para) > 300 else first_para
+
+            # Truncate content if needed
+            content, was_truncated = truncate_content(content, max_length)
+
+            # Calculate word count
+            word_count = len(content.split())
+
+            return {
+                "id": scrape_id,
+                "assetId": url,
+                "kind": "article",
+                "href": url,
+                "title": title,
+                "description": description,
+                "content": content,
+                "domain": domain,
+                "word_count": word_count,
+                "was_truncated": was_truncated,
+                "crawler_type": result.get("crawler_type", "unknown"),
+                "author": metadata.get("author"),
+                "date": metadata.get("date"),
+            }
+
+        except Exception as e:
+            error_message = str(e)
+            print(f"[scrape_webpage] Error scraping {url}: {error_message}")
+            return {
+                "id": scrape_id,
+                "assetId": url,
+                "kind": "article",
+                "href": url,
+                "title": domain or "Webpage",
+                "domain": domain,
+                "error": f"Failed to scrape: {error_message[:100]}",
+            }
+
+    return scrape_webpage
+
diff --git a/surfsense_backend/app/agents/new_chat/system_prompt.py b/surfsense_backend/app/agents/new_chat/system_prompt.py
index 74abadd38..2677b21fd 100644
--- a/surfsense_backend/app/agents/new_chat/system_prompt.py
+++ b/surfsense_backend/app/agents/new_chat/system_prompt.py
@@ -173,6 +173,29 @@ You have access to the following tools:
    - description: Optional description providing context about the image
    - Returns: An image card with the image, title, and description
    - The image will automatically be displayed in the chat.
+
+5. scrape_webpage: Scrape and extract the main content from a webpage.
+   - Use this when the user wants you to READ and UNDERSTAND the actual content of a webpage.
+   - IMPORTANT: This is different from link_preview:
+     * link_preview: Only fetches metadata (title, description, thumbnail) for display
+     * scrape_webpage: Actually reads the FULL page content so you can analyze/summarize it
+   - Trigger scenarios:
+     * "Read this article and summarize it"
+     * "What does this page say about X?"
+     * "Summarize this blog post for me"
+     * "Tell me the key points from this article"
+     * "What's in this webpage?"
+     * "Can you analyze this article?"
+   - Args:
+     - url: The URL of the webpage to scrape (must be HTTP/HTTPS)
+     - max_length: Maximum content length to return (default: 50000 chars)
+   - Returns: The page title, description, full content (in markdown), word count, and metadata
+   - After scraping, you will have the full article text and can analyze, summarize, or answer questions about it.
+   - IMAGES: The scraped content may contain image URLs in markdown format like `![alt text](image_url)`.
+     * When you find relevant/important images in the scraped content, use the `display_image` tool to show them to the user.
+     * This makes your response more visual and engaging.
+     * Prioritize showing: diagrams, charts, infographics, key illustrations, or images that help explain the content.
+     * Don't show every image - just the most relevant 1-3 images that enhance understanding.
 
 - User: "Fetch all my notes and what's in them?"
@@ -205,6 +228,24 @@ You have access to the following tools:
 - User: "Can you display a diagram of a neural network?"
   - Call: `display_image(src="https://example.com/neural-network.png", alt="Neural network diagram", title="Neural Network Architecture", description="A visual representation of a neural network with input, hidden, and output layers")`
+
+- User: "Read this article and summarize it for me: https://example.com/blog/ai-trends"
+  - Call: `scrape_webpage(url="https://example.com/blog/ai-trends")`
+  - After getting the content, provide a summary based on the scraped text
+
+- User: "What does this page say about machine learning? https://docs.example.com/ml-guide"
+  - Call: `scrape_webpage(url="https://docs.example.com/ml-guide")`
+  - Then answer the question using the extracted content
+
+- User: "Summarize this blog post: https://medium.com/some-article"
+  - Call: `scrape_webpage(url="https://medium.com/some-article")`
+  - Provide a comprehensive summary of the article content
+
+- User: "Read this tutorial and explain it: https://example.com/ml-tutorial"
+  - First: `scrape_webpage(url="https://example.com/ml-tutorial")`
+  - Then, if the content contains useful diagrams/images like `![Neural Network Diagram](https://example.com/nn-diagram.png)`:
+    - Call: `display_image(src="https://example.com/nn-diagram.png", alt="Neural Network Diagram", title="Neural Network Architecture")`
+  - Then provide your explanation, referencing the displayed image
 
 {citation_section}
 """
diff --git a/surfsense_backend/app/tasks/chat/stream_new_chat.py b/surfsense_backend/app/tasks/chat/stream_new_chat.py
index 6e2b30c6a..d465e23f6 100644
--- a/surfsense_backend/app/tasks/chat/stream_new_chat.py
+++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py
@@ -319,6 +319,20 @@ async def stream_new_chat(
                         status="in_progress",
                         items=last_active_step_items,
                     )
+                elif tool_name == "scrape_webpage":
+                    url = (
+                        tool_input.get("url", "")
+                        if isinstance(tool_input, dict)
+                        else str(tool_input)
+                    )
+                    last_active_step_title = "Scraping webpage"
+                    last_active_step_items = [f"URL: {url[:80]}{'...' if len(url) > 80 else ''}"]
+                    yield streaming_service.format_thinking_step(
+                        step_id=tool_step_id,
+                        title="Scraping webpage",
+                        status="in_progress",
+                        items=last_active_step_items,
+                    )
                 elif tool_name == "generate_podcast":
                     podcast_title = (
                         tool_input.get("podcast_title", "SurfSense Podcast")
@@ -398,6 +412,16 @@ async def stream_new_chat(
                         f"Displaying image: {src[:60]}{'...' if len(src) > 60 else ''}",
                         "info",
                     )
+                elif tool_name == "scrape_webpage":
+                    url = (
+                        tool_input.get("url", "")
+                        if isinstance(tool_input, dict)
+                        else str(tool_input)
+                    )
+                    yield streaming_service.format_terminal_info(
+                        f"Scraping webpage: {url[:70]}{'...' if len(url) > 70 else ''}",
+                        "info",
+                    )
                 elif tool_name == "generate_podcast":
                     title = (
                         tool_input.get("podcast_title", "SurfSense Podcast")
@@ -502,6 +526,31 @@ async def stream_new_chat(
                         status="completed",
                         items=completed_items,
                     )
+                elif tool_name == "scrape_webpage":
+                    # Build completion items for webpage scraping
+                    if isinstance(tool_output, dict):
+                        title = tool_output.get("title", "Webpage")
+                        word_count = tool_output.get("word_count", 0)
+                        has_error = "error" in tool_output
+                        if has_error:
+                            completed_items = [
+                                *last_active_step_items,
+                                f"Error: {tool_output.get('error', 'Failed to scrape')[:50]}",
+                            ]
+                        else:
+                            completed_items = [
+                                *last_active_step_items,
+                                f"Title: {title[:50]}{'...' if len(title) > 50 else ''}",
+                                f"Extracted: {word_count:,} words",
+                            ]
+                    else:
+                        completed_items = [*last_active_step_items, "Content extracted"]
+                    yield streaming_service.format_thinking_step(
+                        step_id=original_step_id,
+                        title="Scraping webpage",
+                        status="completed",
+                        items=completed_items,
+                    )
                 elif tool_name == "generate_podcast":
                     # Build detailed completion items based on podcast status
                     podcast_status = (
@@ -630,6 +679,47 @@ async def stream_new_chat(
                         f"Image displayed: {title[:40]}{'...' if len(title) > 40 else ''}",
                         "success",
                     )
+                elif tool_name == "scrape_webpage":
+                    # Stream the scrape result so frontend can render the Article component
+                    # Note: We send metadata for display, but content goes to LLM for processing
+                    if isinstance(tool_output, dict):
+                        # Create a display-friendly output (without full content for the card)
+                        display_output = {
+                            k: v for k, v in tool_output.items() if k != "content"
+                        }
+                        # But keep a truncated content preview
+                        if "content" in tool_output:
+                            content = tool_output.get("content", "")
+                            display_output["content_preview"] = (
+                                content[:500] + "..." if len(content) > 500 else content
+                            )
+                        yield streaming_service.format_tool_output_available(
+                            tool_call_id,
+                            display_output,
+                        )
+                    else:
+                        yield streaming_service.format_tool_output_available(
+                            tool_call_id,
+                            {"result": tool_output},
+                        )
+                    # Send terminal message
+                    if isinstance(tool_output, dict) and "error" not in tool_output:
+                        title = tool_output.get("title", "Webpage")
+                        word_count = tool_output.get("word_count", 0)
+                        yield streaming_service.format_terminal_info(
+                            f"Scraped: {title[:40]}{'...' if len(title) > 40 else ''} ({word_count:,} words)",
+                            "success",
+                        )
+                    else:
+                        error_msg = (
+                            tool_output.get("error", "Failed to scrape")
+                            if isinstance(tool_output, dict)
+                            else "Failed to scrape"
+                        )
+                        yield streaming_service.format_terminal_info(
+                            f"Scrape failed: {error_msg}",
+                            "error",
+                        )
                 elif tool_name == "search_knowledge_base":
                     # Don't stream the full output for search (can be very large), just acknowledge
                     yield streaming_service.format_tool_output_available(
diff --git a/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx
index 7c6bd34c1..9c7e3cb4a 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx
@@ -13,6 +13,7 @@ import { Thread } from "@/components/assistant-ui/thread";
 import { GeneratePodcastToolUI } from "@/components/tool-ui/generate-podcast";
 import { LinkPreviewToolUI } from "@/components/tool-ui/link-preview";
 import { DisplayImageToolUI } from "@/components/tool-ui/display-image";
+import { ScrapeWebpageToolUI } from "@/components/tool-ui/scrape-webpage";
 import type { ThinkingStep } from "@/components/tool-ui/deepagent-thinking";
 import { getBearerToken } from "@/lib/auth-utils";
 import { createAttachmentAdapter, extractAttachmentContent } from "@/lib/chat/attachment-adapter";
@@ -81,7 +82,7 @@ function convertToThreadMessage(msg: MessageRecord): ThreadMessageLike {
 /**
  * Tools that should render custom UI in the chat.
  */
-const TOOLS_WITH_UI = new Set(["generate_podcast", "link_preview", "display_image"]);
+const TOOLS_WITH_UI = new Set(["generate_podcast", "link_preview", "display_image", "scrape_webpage"]);
 
 /**
  * Type for thinking step data from the backend
@@ -245,47 +246,74 @@ export default function NewChatPage() {
 		// Prepare assistant message
 		const assistantMsgId = `msg-assistant-${Date.now()}`;
-		let accumulatedText = "";
 		const currentThinkingSteps = new Map();
-		const toolCalls = new Map<
-			string,
-			{
-				toolCallId: string;
-				toolName: string;
-				args: Record<string, unknown>;
-				result?: unknown;
+
+		// Ordered content parts to preserve inline tool call positions
+		// Each part is either a text segment or a tool call
+		type ContentPart =
+			| { type: "text"; text: string }
+			| {
+					type: "tool-call";
+					toolCallId: string;
+					toolName: string;
+					args: Record<string, unknown>;
+					result?: unknown;
+			  };
+		const contentParts: ContentPart[] = [];
+
+		// Track the current text segment index (for appending text deltas)
+		let currentTextPartIndex = -1;
+
+		// Map to track tool call indices for updating results
+		const toolCallIndices = new Map();
+
+		// Helper to get or create the current text part for appending text
+		const appendText = (delta: string) => {
+			if (currentTextPartIndex >= 0 && contentParts[currentTextPartIndex]?.type === "text") {
+				// Append to existing text part
+				(contentParts[currentTextPartIndex] as { type: "text"; text: string }).text += delta;
+			} else {
+				// Create new text part
+				contentParts.push({ type: "text", text: delta });
+				currentTextPartIndex = contentParts.length - 1;
 			}
-		>();
+		};
+
+		// Helper to add a tool call (this "breaks" the current text segment)
+		const addToolCall = (toolCallId: string, toolName: string, args: Record<string, unknown>) => {
+			if (TOOLS_WITH_UI.has(toolName)) {
+				contentParts.push({
+					type: "tool-call",
+					toolCallId,
+					toolName,
+					args,
+				});
+				toolCallIndices.set(toolCallId, contentParts.length - 1);
+				// Reset text part index so next text creates a new segment
+				currentTextPartIndex = -1;
+			}
+		};
+
+		// Helper to update a tool call's args or result
+		const updateToolCall = (toolCallId: string, update: { args?: Record<string, unknown>; result?: unknown }) => {
+			const index = toolCallIndices.get(toolCallId);
+			if (index !== undefined && contentParts[index]?.type === "tool-call") {
+				const tc = contentParts[index] as ContentPart & { type: "tool-call" };
+				if (update.args) tc.args = update.args;
+				if (update.result !== undefined) tc.result = update.result;
+			}
+		};
 
 		// Helper to build content for UI (without thinking-steps)
 		const buildContentForUI = (): ThreadMessageLike["content"] => {
-			const parts: Array<
-				| { type: "text"; text: string }
-				| {
-						type: "tool-call";
-						toolCallId: string;
-						toolName: string;
-						args: Record<string, unknown>;
-						result?: unknown;
-				  }
-			> = [];
-
-			if (accumulatedText) {
-				parts.push({ type: "text", text: accumulatedText });
-			}
-			for (const toolCall of toolCalls.values()) {
-				if (TOOLS_WITH_UI.has(toolCall.toolName)) {
-					parts.push({
-						type: "tool-call",
-						toolCallId: toolCall.toolCallId,
-						toolName: toolCall.toolName,
-						args: toolCall.args,
-						result: toolCall.result,
-					});
-				}
-			}
-			return parts.length > 0
-				? (parts as ThreadMessageLike["content"])
+			// Filter to only include text parts with content and tool-calls with UI
+			const filtered = contentParts.filter((part) => {
+				if (part.type === "text") return part.text.length > 0;
+				if (part.type === "tool-call") return TOOLS_WITH_UI.has(part.toolName);
+				return false;
+			});
+			return filtered.length > 0
+				? (filtered as ThreadMessageLike["content"])
 				: [{ type: "text", text: "" }];
 		};
@@ -301,20 +329,15 @@ export default function NewChatPage() {
 			});
 		}
 
-		if (accumulatedText) {
-			parts.push({ type: "text", text: accumulatedText });
-		}
-		for (const toolCall of toolCalls.values()) {
-			if (TOOLS_WITH_UI.has(toolCall.toolName)) {
-				parts.push({
-					type: "tool-call",
-					toolCallId: toolCall.toolCallId,
-					toolName: toolCall.toolName,
-					args: toolCall.args,
-					result: toolCall.result,
-				});
+		// Add content parts (filtered)
+		for (const part of contentParts) {
+			if (part.type === "text" && part.text.length > 0) {
+				parts.push(part);
+			} else if (part.type === "tool-call" && TOOLS_WITH_UI.has(part.toolName)) {
+				parts.push(part);
 			}
 		}
+
 		return parts.length > 0 ? parts : [{ type: "text", text: "" }];
 	};
@@ -399,7 +422,7 @@ export default function NewChatPage() {
 				switch (parsed.type) {
 					case "text-delta":
-						accumulatedText += parsed.delta;
+						appendText(parsed.delta);
 						setMessages((prev) =>
 							prev.map((m) =>
 								m.id === assistantMsgId ? { ...m, content: buildContentForUI() } : m
 							)
 						);
 						break;
 
 					case "tool-input-start":
-						toolCalls.set(parsed.toolCallId, {
-							toolCallId: parsed.toolCallId,
-							toolName: parsed.toolName,
-							args: {},
-						});
+						// Add tool call inline - this breaks the current text segment
+						addToolCall(parsed.toolCallId, parsed.toolName, {});
 						setMessages((prev) =>
 							prev.map((m) =>
 								m.id === assistantMsgId ? { ...m, content: buildContentForUI() } : m
 							)
 						);
 						break;
 
 					case "tool-input-available": {
-						const tc = toolCalls.get(parsed.toolCallId);
-						if (tc) tc.args = parsed.input || {};
-						else
-							toolCalls.set(parsed.toolCallId, {
-								toolCallId: parsed.toolCallId,
-								toolName: parsed.toolName,
-								args: parsed.input || {},
-							});
+						// Update existing tool call's args, or add if not exists
+						if (toolCallIndices.has(parsed.toolCallId)) {
+							updateToolCall(parsed.toolCallId, { args: parsed.input || {} });
+						} else {
+							addToolCall(parsed.toolCallId, parsed.toolName, parsed.input || {});
+						}
 						setMessages((prev) =>
 							prev.map((m) =>
 								m.id === assistantMsgId ? { ...m, content: buildContentForUI() } : m
@@ -438,15 +456,17 @@ export default function NewChatPage() {
 					case "tool-output-available": {
-						const tc = toolCalls.get(parsed.toolCallId);
-						if (tc) {
-							tc.result = parsed.output;
-							if (
-								tc.toolName === "generate_podcast" &&
-								parsed.output?.status === "processing" &&
-								parsed.output?.task_id
-							) {
-								setActivePodcastTaskId(parsed.output.task_id);
+						// Update the tool call with its result
+						updateToolCall(parsed.toolCallId, { result: parsed.output });
+						// Handle podcast-specific logic
+						if (parsed.output?.status === "processing" && parsed.output?.task_id) {
+							// Check if this is a podcast tool by looking at the content part
+							const idx = toolCallIndices.get(parsed.toolCallId);
+							if (idx !== undefined) {
+								const part = contentParts[idx];
+								if (part?.type === "tool-call" && part.toolName === "generate_podcast") {
+									setActivePodcastTaskId(parsed.output.task_id);
+								}
 							}
 						}
 						setMessages((prev) =>
@@ -491,7 +511,7 @@ export default function NewChatPage() {
 			// Persist assistant message (with thinking steps for restoration on refresh)
 			const finalContent = buildContentForPersistence();
-			if (accumulatedText || toolCalls.size > 0) {
+			if (contentParts.length > 0) {
 				appendMessage(threadId, {
 					role: "assistant",
 					content: finalContent,
@@ -593,6 +613,7 @@ export default function NewChatPage() {
 					<GeneratePodcastToolUI />
 					<LinkPreviewToolUI />
 					<DisplayImageToolUI />
+					<ScrapeWebpageToolUI />
diff --git a/surfsense_web/components/tool-ui/article/index.tsx b/surfsense_web/components/tool-ui/article/index.tsx new file mode 100644 index 000000000..bf8e83411 --- /dev/null +++ b/surfsense_web/components/tool-ui/article/index.tsx @@ -0,0 +1,406 @@ +"use client"; + +import { Card, CardContent } from "@/components/ui/card"; +import { + Tooltip, + TooltipContent, + TooltipProvider, + TooltipTrigger, +} from "@/components/ui/tooltip"; +import { cn } from "@/lib/utils"; +import { + AlertCircleIcon, + BookOpenIcon, + CalendarIcon, + ExternalLinkIcon, + FileTextIcon, + UserIcon, +} from "lucide-react"; +import { Component, type ReactNode, useCallback } from "react"; + +/** + * Article component props + */ +export interface ArticleProps { + /** Unique identifier for the article */ + id: string; + /** Asset identifier (usually the URL) */ + assetId?: string; + /** Article title */ + title: string; + /** Brief description or excerpt */ + description?: string; + /** Full content of the article (markdown) */ + content?: string; + /** URL to the original article */ + href?: string; + /** Domain of the article source */ + domain?: string; + /** Author name */ + author?: string; + /** Publication date */ + date?: string; + /** Word count */ + wordCount?: number; + /** Whether content was truncated */ + wasTruncated?: boolean; + /** Optional max width */ + maxWidth?: string; + /** Optional error message */ + error?: string; + /** Optional className */ + className?: string; + /** Response actions */ + responseActions?: Array<{ + id: string; + label: string; + variant?: "default" | "outline"; + }>; + /** Response action handler */ + onResponseAction?: (actionId: string) => void; +} + +/** + * Serializable article data type (from backend) + */ +export interface SerializableArticle { + id: string; + assetId?: string; + kind?: "article"; + title: string; + description?: string; + content?: string; + href?: string; + domain?: string; + author?: string; + date?: string; + word_count?: number; + wordCount?: number; + was_truncated?: boolean; + wasTruncated?: boolean; + error?: string; +} + +/** + * Parse serializable article data to ArticleProps + */ +export function parseSerializableArticle(data: unknown): ArticleProps { + const obj = data as Record; + return { + id: String(obj.id || "article-unknown"), + assetId: obj.assetId as string | undefined, + title: String(obj.title || "Untitled Article"), + description: obj.description as string | undefined, + content: obj.content as string | undefined, + href: obj.href as string | undefined, + domain: obj.domain as string | undefined, + author: obj.author as string | undefined, + date: obj.date as string | undefined, + wordCount: (obj.word_count || obj.wordCount) as number | undefined, + wasTruncated: (obj.was_truncated || obj.wasTruncated) as boolean | undefined, + error: obj.error as string | undefined, + }; +} + +/** + * Format word count for display + */ +function formatWordCount(count: number): string { + if (count >= 1000) { + return `${(count / 1000).toFixed(1)}k words`; + } + return `${count} words`; +} + +/** + * Article card component for displaying scraped webpage content + */ +export function Article({ + id, + title, + description, + content, + href, + domain, + author, + date, + wordCount, + wasTruncated, + maxWidth = "100%", + error, + className, + responseActions, + onResponseAction, +}: ArticleProps) { + const handleCardClick = useCallback(() => { + if (href) { + window.open(href, "_blank", "noopener,noreferrer"); + } + }, [href]); + + // Error 
+	if (error) {
+		return (
+			<Card className={cn("w-full overflow-hidden", className)} style={{ maxWidth }}>
+				<CardContent className="flex items-start gap-3 p-4">
+					<div className="flex size-10 shrink-0 items-center justify-center rounded-md bg-destructive/10">
+						<AlertCircleIcon className="size-5 text-destructive" />
+					</div>
+					<div className="min-w-0 flex-1">
+						<p className="text-sm font-medium text-destructive">
+							Failed to scrape webpage
+						</p>
+						{href && (
+							<p className="truncate text-xs text-muted-foreground">
+								{href}
+							</p>
+						)}
+						<p className="mt-1 text-xs text-muted-foreground">{error}</p>
+					</div>
+				</CardContent>
+			</Card>
+		);
+	}
+
+	return (
+		<TooltipProvider>
+			<Card
+				className={cn(
+					"w-full overflow-hidden transition-colors",
+					href && "cursor-pointer hover:bg-accent/50",
+					className,
+				)}
+				style={{ maxWidth }}
+				onClick={handleCardClick}
+				tabIndex={href ? 0 : undefined}
+				onKeyDown={(e) => {
+					if (href && (e.key === "Enter" || e.key === " ")) {
+						e.preventDefault();
+						handleCardClick();
+					}
+				}}
+			>
+				{/* Header */}
+				<CardContent className="flex items-start gap-3 p-4">
+					{/* Icon */}
+					<div className="flex size-10 shrink-0 items-center justify-center rounded-md bg-muted">
+						<BookOpenIcon className="size-5 text-muted-foreground" />
+					</div>
+
+					{/* Content */}
+					<div className="min-w-0 flex-1">
+						{/* Title */}
+						<h3 className="line-clamp-2 text-sm font-semibold">
+							{title}
+						</h3>
+
+						{/* Description */}
+						{description && (
+							<p className="mt-1 line-clamp-2 text-xs text-muted-foreground">
+								{description}
+							</p>
+						)}
+
+						{/* Metadata row */}
+						<div className="mt-2 flex flex-wrap items-center gap-3 text-xs text-muted-foreground">
+							{domain && (
+								<Tooltip>
+									<TooltipTrigger asChild>
+										<span className="truncate">
+											{domain}
+										</span>
+									</TooltipTrigger>
+									<TooltipContent>
+										<p>Source: {domain}</p>
+									</TooltipContent>
+								</Tooltip>
+							)}
+
+							{author && (
+								<Tooltip>
+									<TooltipTrigger asChild>
+										<span className="flex items-center gap-1">
+											<UserIcon className="size-3" />
+											{author}
+										</span>
+									</TooltipTrigger>
+									<TooltipContent>
+										<p>Author: {author}</p>
+									</TooltipContent>
+								</Tooltip>
+							)}
+
+							{date && (
+								<span className="flex items-center gap-1">
+									<CalendarIcon className="size-3" />
+									{date}
+								</span>
+							)}
+
+							{!!wordCount && (
+								<Tooltip>
+									<TooltipTrigger asChild>
+										<span className="flex items-center gap-1">
+											<FileTextIcon className="size-3" />
+											{formatWordCount(wordCount)}
+											{wasTruncated && (
+												<span className="italic">(truncated)</span>
+											)}
+										</span>
+									</TooltipTrigger>
+									<TooltipContent>
+										<p>
+											{wasTruncated
+												? "Content was truncated due to length"
+												: "Full article content available"}
+										</p>
+									</TooltipContent>
+								</Tooltip>
+							)}
+						</div>
+					</div>
+
+					{/* External link indicator */}
+					{href && (
+						<div className="shrink-0">
+							<ExternalLinkIcon className="size-4 text-muted-foreground" />
+						</div>
+					)}
+				</CardContent>
+
+				{/* Response actions */}
+				{responseActions && responseActions.length > 0 && (
+					<div className="flex gap-2 border-t px-4 py-3">
+						{responseActions.map((action) => (
+							<button
+								key={action.id}
+								type="button"
+								className={cn(
+									"rounded-md px-3 py-1.5 text-xs font-medium transition-colors",
+									action.variant === "outline"
+										? "border hover:bg-accent"
+										: "bg-primary text-primary-foreground hover:bg-primary/90",
+								)}
+								onClick={(e) => {
+									e.stopPropagation();
+									onResponseAction?.(action.id);
+								}}
+							>
+								{action.label}
+							</button>
+						))}
+					</div>
+				)}
+			</Card>
+		</TooltipProvider>
+	);
+}
+
+/**
+ * Loading state for article component
+ */
+export function ArticleLoading({
+	title = "Loading article...",
+}: { title?: string }) {
+	return (
+		<Card className="w-full overflow-hidden">
+			<CardContent className="flex items-start gap-3 p-4">
+				<div className="flex size-10 shrink-0 animate-pulse items-center justify-center rounded-md bg-muted">
+					<BookOpenIcon className="size-5 text-muted-foreground" />
+				</div>
+				<div className="min-w-0 flex-1 space-y-2">
+					<div className="h-4 w-2/3 animate-pulse rounded bg-muted" />
+					<div className="h-3 w-full animate-pulse rounded bg-muted" />
+					<p className="text-xs text-muted-foreground">{title}</p>
+				</div>
+			</CardContent>
+		</Card>
+	);
+}
+
+/**
+ * Skeleton for article component
+ */
+export function ArticleSkeleton() {
+	return (
+		<Card className="w-full overflow-hidden">
+			<CardContent className="flex items-start gap-3 p-4">
+				<div className="size-10 shrink-0 animate-pulse rounded-md bg-muted" />
+				<div className="min-w-0 flex-1 space-y-2">
+					<div className="h-4 w-2/3 animate-pulse rounded bg-muted" />
+					<div className="h-3 w-full animate-pulse rounded bg-muted" />
+					<div className="h-3 w-1/2 animate-pulse rounded bg-muted" />
+				</div>
+			</CardContent>
+		</Card>
+	);
+}
+
+/**
+ * Error boundary props
+ */
+interface ErrorBoundaryProps {
+	children: ReactNode;
+	fallback?: ReactNode;
+}
+
+interface ErrorBoundaryState {
+	hasError: boolean;
+}
+
+/**
+ * Error boundary for article component
+ */
+export class ArticleErrorBoundary extends Component<
+	ErrorBoundaryProps,
+	ErrorBoundaryState
+> {
+	constructor(props: ErrorBoundaryProps) {
+		super(props);
+		this.state = { hasError: false };
+	}
+
+	static getDerivedStateFromError(): ErrorBoundaryState {
+		return { hasError: true };
+	}
+
+	render() {
+		if (this.state.hasError) {
+			return (
+				this.props.fallback || (
+					<Card className="w-full">
+						<CardContent className="flex items-center gap-2 p-4">
+							<AlertCircleIcon className="size-4 text-destructive" />
+							<p className="text-sm text-muted-foreground">
+								Failed to render article
+							</p>
+						</CardContent>
+					</Card>
+ ) + ); + } + + return this.props.children; + } +} + diff --git a/surfsense_web/components/tool-ui/index.ts b/surfsense_web/components/tool-ui/index.ts index b9500e7cd..163d279a9 100644 --- a/surfsense_web/components/tool-ui/index.ts +++ b/surfsense_web/components/tool-ui/index.ts @@ -46,3 +46,17 @@ export { type DisplayImageArgs, type DisplayImageResult, } from "./display-image"; +export { + Article, + ArticleErrorBoundary, + ArticleLoading, + ArticleSkeleton, + parseSerializableArticle, + type ArticleProps, + type SerializableArticle, +} from "./article"; +export { + ScrapeWebpageToolUI, + type ScrapeWebpageArgs, + type ScrapeWebpageResult, +} from "./scrape-webpage"; diff --git a/surfsense_web/components/tool-ui/scrape-webpage.tsx b/surfsense_web/components/tool-ui/scrape-webpage.tsx new file mode 100644 index 000000000..c9ced3d80 --- /dev/null +++ b/surfsense_web/components/tool-ui/scrape-webpage.tsx @@ -0,0 +1,163 @@ +"use client"; + +import { makeAssistantToolUI } from "@assistant-ui/react"; +import { AlertCircleIcon, FileTextIcon } from "lucide-react"; +import { + Article, + ArticleErrorBoundary, + ArticleLoading, + parseSerializableArticle, +} from "@/components/tool-ui/article"; + +/** + * Type definitions for the scrape_webpage tool + */ +interface ScrapeWebpageArgs { + url: string; + max_length?: number; +} + +interface ScrapeWebpageResult { + id: string; + assetId: string; + kind: "article"; + href: string; + title: string; + description?: string; + content?: string; + domain?: string; + author?: string; + date?: string; + word_count?: number; + was_truncated?: boolean; + crawler_type?: string; + error?: string; +} + +/** + * Error state component shown when webpage scraping fails + */ +function ScrapeErrorState({ url, error }: { url: string; error: string }) { + return ( +
+		<div className="flex w-full max-w-md items-start gap-3 rounded-lg border border-destructive/30 bg-destructive/5 p-4">
+			<AlertCircleIcon className="size-5 shrink-0 text-destructive" />
+			<div className="min-w-0 flex-1">
+				<p className="text-sm font-medium text-destructive">Failed to scrape webpage</p>
+				<p className="truncate text-xs text-muted-foreground">{url}</p>
+				<p className="mt-1 text-xs text-muted-foreground">{error}</p>
+			</div>
+		</div>
+	);
+}
+
+/**
+ * Cancelled state component
+ */
+function ScrapeCancelledState({ url }: { url: string }) {
+	return (
+		<div className="flex w-full max-w-md items-center gap-2 rounded-lg border border-dashed p-3 text-muted-foreground">
+			<FileTextIcon className="size-4 shrink-0" />
+			<span className="truncate text-sm">Scraping: {url}</span>
+		</div>
+	);
+}
+
+/**
+ * Parsed Article component with error handling
+ */
+function ParsedArticle({ result }: { result: unknown }) {
+	const article = parseSerializableArticle(result);
+
+	return (
+		<Article
+			{...article}
+			responseActions={[{ id: "open", label: "Open original", variant: "outline" }]}
+			onResponseAction={(id) => {
+				if (id === "open" && article.href) {
+					window.open(article.href, "_blank", "noopener,noreferrer");
+				}
+			}}
+		/>
+	);
+}
+
+/**
+ * Scrape Webpage Tool UI Component
+ *
+ * This component is registered with assistant-ui to render an article card
+ * when the scrape_webpage tool is called by the agent.
+ *
+ * It displays scraped webpage content including:
+ * - Title and description
+ * - Author and date (if available)
+ * - Word count
+ * - Link to original source
+ */
+export const ScrapeWebpageToolUI = makeAssistantToolUI<
+	ScrapeWebpageArgs,
+	ScrapeWebpageResult
+>({
+	toolName: "scrape_webpage",
+	render: function ScrapeWebpageUI({ args, result, status }) {
+		const url = args.url || "Unknown URL";
+
+		// Loading state - tool is still running
+		if (status.type === "running" || status.type === "requires-action") {
+			return (
+				<div className="my-2">
+					<ArticleLoading />
+				</div>
+			);
+		}
+
+		// Incomplete/cancelled state
+		if (status.type === "incomplete") {
+			if (status.reason === "cancelled") {
+				return <ScrapeCancelledState url={url} />;
+			}
+			if (status.reason === "error") {
+				return (
+					<ScrapeErrorState url={url} error="The scrape tool ran into an error." />
+				);
+			}
+		}
+
+		// No result yet
+		if (!result) {
+			return (
+				<div className="my-2">
+					<ArticleLoading />
+				</div>
+			);
+		}
+
+		// Error result from the tool
+		if (result.error) {
+			return <ScrapeErrorState url={url} error={result.error} />;
+		}
+
+		// Success - render the article card
+		return (
+			<div className="my-2">
+				<ArticleErrorBoundary>
+					<ParsedArticle result={result} />
+				</ArticleErrorBoundary>
+			</div>
+		);
+	},
+});
+
+export type { ScrapeWebpageArgs, ScrapeWebpageResult };