From 3f4e1a7dfd45181beb7fcc41e050438d3c53d4e5 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Tue, 24 Mar 2026 18:55:06 +0530 Subject: [PATCH] refactor: remove frontend of `scrape_webpage` tool --- .../app/agents/new_chat/system_prompt.py | 12 +- .../app/services/public_chat_service.py | 1 - .../new-chat/[[...chat_id]]/page.tsx | 1 - .../assistant-ui/assistant-message.tsx | 2 - .../components/assistant-ui/tool-fallback.tsx | 151 +++++-- .../components/public-chat/public-thread.tsx | 2 - .../components/tool-ui/article/index.tsx | 425 ------------------ surfsense_web/components/tool-ui/index.ts | 16 - .../components/tool-ui/scrape-webpage.tsx | 163 ------- 9 files changed, 118 insertions(+), 655 deletions(-) delete mode 100644 surfsense_web/components/tool-ui/article/index.tsx delete mode 100644 surfsense_web/components/tool-ui/scrape-webpage.tsx diff --git a/surfsense_backend/app/agents/new_chat/system_prompt.py b/surfsense_backend/app/agents/new_chat/system_prompt.py index 77df3acfd..b53251a1d 100644 --- a/surfsense_backend/app/agents/new_chat/system_prompt.py +++ b/surfsense_backend/app/agents/new_chat/system_prompt.py @@ -220,7 +220,8 @@ _TOOL_INSTRUCTIONS["scrape_webpage"] = """ - url: The URL of the webpage to scrape (must be HTTP/HTTPS) - max_length: Maximum content length to return (default: 50000 chars) - Returns: The page title, description, full content (in markdown), word count, and metadata - - After scraping, you will have the full article text and can analyze, summarize, or answer questions about it. + - After scraping, provide a comprehensive, well-structured summary with key takeaways using headings or bullet points. + - Reference the source using markdown links [descriptive text](url) — never bare URLs. - IMAGES: The scraped content may contain image URLs in markdown format like `![alt text](image_url)`. * When you find relevant/important images in the scraped content, include them in your response using standard markdown image syntax: `![alt text](image_url)`. * This makes your response more visual and engaging. @@ -244,6 +245,8 @@ _TOOL_INSTRUCTIONS["web_search"] = """ - Args: - query: The search query - use specific, descriptive terms - top_k: Number of results to retrieve (default: 10, max: 50) + - If search snippets are insufficient for the user's question, use `scrape_webpage` on the most relevant result URL for full content. + - When presenting results, reference sources as markdown links [descriptive text](url) — never bare URLs. """ # Memory tool instructions have private and shared variants. @@ -429,13 +432,16 @@ _TOOL_EXAMPLES["generate_report"] = """ _TOOL_EXAMPLES["scrape_webpage"] = """ - User: "Check out https://dev.to/some-article" - Call: `scrape_webpage(url="https://dev.to/some-article")` - - Then provide your analysis of the content. + - Respond with a structured analysis — key points, takeaways. - User: "Read this article and summarize it for me: https://example.com/blog/ai-trends" - Call: `scrape_webpage(url="https://example.com/blog/ai-trends")` - - Then provide a summary based on the scraped text. + - Respond with a thorough summary using headings and bullet points. - User: (after discussing https://example.com/stats) "Can you get the live data from that page?" - Call: `scrape_webpage(url="https://example.com/stats")` - IMPORTANT: Always attempt scraping first. Never refuse before trying the tool. +- User: "https://example.com/blog/weekend-recipes" + - Call: `scrape_webpage(url="https://example.com/blog/weekend-recipes")` + - When a user sends just a URL with no instructions, scrape it and provide a concise summary of the content. """ _TOOL_EXAMPLES["generate_image"] = """ diff --git a/surfsense_backend/app/services/public_chat_service.py b/surfsense_backend/app/services/public_chat_service.py index 763ae64c3..376db974f 100644 --- a/surfsense_backend/app/services/public_chat_service.py +++ b/surfsense_backend/app/services/public_chat_service.py @@ -42,7 +42,6 @@ UI_TOOLS = { "generate_podcast", "generate_report", "generate_video_presentation", - "scrape_webpage", } diff --git a/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx index b3cc4fa6c..3f6893169 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx @@ -134,7 +134,6 @@ const TOOLS_WITH_UI = new Set([ "display_image", "generate_image", "delete_notion_page", - "scrape_webpage", "create_notion_page", "update_notion_page", "create_linear_issue", diff --git a/surfsense_web/components/assistant-ui/assistant-message.tsx b/surfsense_web/components/assistant-ui/assistant-message.tsx index fa3aec45a..14fb18bf9 100644 --- a/surfsense_web/components/assistant-ui/assistant-message.tsx +++ b/surfsense_web/components/assistant-ui/assistant-message.tsx @@ -29,7 +29,6 @@ import { CreateJiraIssueToolUI, DeleteJiraIssueToolUI, UpdateJiraIssueToolUI } f import { CreateLinearIssueToolUI, DeleteLinearIssueToolUI, UpdateLinearIssueToolUI } from "@/components/tool-ui/linear"; import { CreateNotionPageToolUI, DeleteNotionPageToolUI, UpdateNotionPageToolUI } from "@/components/tool-ui/notion"; import { SandboxExecuteToolUI } from "@/components/tool-ui/sandbox-execute"; -import { ScrapeWebpageToolUI } from "@/components/tool-ui/scrape-webpage"; import { RecallMemoryToolUI, SaveMemoryToolUI } from "@/components/tool-ui/user-memory"; import { useComments } from "@/hooks/use-comments"; import { useMediaQuery } from "@/hooks/use-media-query"; @@ -59,7 +58,6 @@ const AssistantMessageInner: FC = () => { generate_video_presentation: GenerateVideoPresentationToolUI, display_image: DisplayImageToolUI, generate_image: GenerateImageToolUI, - scrape_webpage: ScrapeWebpageToolUI, save_memory: SaveMemoryToolUI, recall_memory: RecallMemoryToolUI, execute: SandboxExecuteToolUI, diff --git a/surfsense_web/components/assistant-ui/tool-fallback.tsx b/surfsense_web/components/assistant-ui/tool-fallback.tsx index 636b43c36..d12ffb5d6 100644 --- a/surfsense_web/components/assistant-ui/tool-fallback.tsx +++ b/surfsense_web/components/assistant-ui/tool-fallback.tsx @@ -1,8 +1,14 @@ import type { ToolCallMessagePartComponent } from "@assistant-ui/react"; import { CheckIcon, ChevronDownIcon, ChevronUpIcon, XCircleIcon } from "lucide-react"; import { useState } from "react"; -import { Button } from "@/components/ui/button"; import { cn } from "@/lib/utils"; +import { getToolIcon } from "@/contracts/enums/toolIcons"; + +function formatToolName(name: string): string { + return name + .replace(/_/g, " ") + .replace(/\b\w/g, (c) => c.toUpperCase()); +} export const ToolFallback: ToolCallMessagePartComponent = ({ toolName, @@ -10,66 +16,127 @@ export const ToolFallback: ToolCallMessagePartComponent = ({ result, status, }) => { - const [isCollapsed, setIsCollapsed] = useState(true); + const [isExpanded, setIsExpanded] = useState(false); const isCancelled = status?.type === "incomplete" && status.reason === "cancelled"; + const isError = status?.type === "incomplete" && status.reason === "error"; + const isRunning = status?.type === "running" || status?.type === "requires-action"; const cancelledReason = isCancelled && status.error ? typeof status.error === "string" ? status.error : JSON.stringify(status.error) : null; + const errorReason = + isError && status.error + ? typeof status.error === "string" + ? status.error + : JSON.stringify(status.error) + : null; + + const Icon = getToolIcon(toolName); + const displayName = formatToolName(toolName); return (
-
- {isCancelled ? ( - - ) : ( - - )} -

setIsExpanded(!isExpanded)} + className="flex w-full items-center gap-3 px-5 py-4 text-left transition-colors hover:bg-muted/50 focus:outline-none focus-visible:outline-none" + > +

- {isCancelled ? "Cancelled tool: " : "Used tool: "} - {toolName} -

- -
- {!isCollapsed && ( -
- {cancelledReason && ( -
-

- Cancelled reason: -

-

- {cancelledReason} -

-
- )} -
-
{argsText}
-
- {!isCancelled && result !== undefined && ( -
-

Result:

-
-								{typeof result === "string" ? result : JSON.stringify(result, null, 2)}
-							
-
+ {isError ? ( + + ) : isCancelled ? ( + + ) : isRunning ? ( + + ) : ( + )}
+ +
+

+ {isRunning + ? displayName + : isCancelled + ? `Cancelled: ${displayName}` + : isError + ? `Failed: ${displayName}` + : displayName} +

+ {isRunning && ( +

Running...

+ )} + {cancelledReason && ( +

{cancelledReason}

+ )} + {errorReason && ( +

{errorReason}

+ )} +
+ + {!isRunning && ( +
+ {isExpanded ? ( + + ) : ( + + )} +
+ )} + + + {isExpanded && !isRunning && ( + <> +
+
+ {argsText && ( +
+

Arguments

+
+									{argsText}
+								
+
+ )} + {!isCancelled && result !== undefined && ( + <> +
+
+

Result

+
+										{typeof result === "string" ? result : JSON.stringify(result, null, 2)}
+									
+
+ + )} +
+ )}
); diff --git a/surfsense_web/components/public-chat/public-thread.tsx b/surfsense_web/components/public-chat/public-thread.tsx index 8076188c0..9b1fe7e49 100644 --- a/surfsense_web/components/public-chat/public-thread.tsx +++ b/surfsense_web/components/public-chat/public-thread.tsx @@ -17,7 +17,6 @@ import { GenerateImageToolUI } from "@/components/tool-ui/generate-image"; import { GeneratePodcastToolUI } from "@/components/tool-ui/generate-podcast"; import { GenerateReportToolUI } from "@/components/tool-ui/generate-report"; import { GenerateVideoPresentationToolUI } from "@/components/tool-ui/video-presentation"; -import { ScrapeWebpageToolUI } from "@/components/tool-ui/scrape-webpage"; interface PublicThreadProps { footer?: ReactNode; @@ -152,7 +151,6 @@ const PublicAssistantMessage: FC = () => { generate_video_presentation: GenerateVideoPresentationToolUI, display_image: DisplayImageToolUI, generate_image: GenerateImageToolUI, - scrape_webpage: ScrapeWebpageToolUI, }, Fallback: ToolFallback, }, diff --git a/surfsense_web/components/tool-ui/article/index.tsx b/surfsense_web/components/tool-ui/article/index.tsx deleted file mode 100644 index 43ea7c4c9..000000000 --- a/surfsense_web/components/tool-ui/article/index.tsx +++ /dev/null @@ -1,425 +0,0 @@ -"use client"; - -import { - AlertCircleIcon, - BookOpenIcon, - CalendarIcon, - ExternalLinkIcon, - FileTextIcon, - UserIcon, -} from "lucide-react"; -import Image from "next/image"; -import { Component, type ReactNode, useCallback, useState } from "react"; -import { z } from "zod"; -import { Card, CardContent } from "@/components/ui/card"; -import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip"; -import { cn } from "@/lib/utils"; - -/** - * Zod schema for serializable article data (from backend) - */ -const SerializableArticleSchema = z.object({ - id: z.string().default("article-unknown"), - assetId: z.string().nullish(), - kind: z.literal("article").nullish(), - title: z.string().default("Untitled Article"), - description: z.string().nullish(), - content: z.string().nullish(), - href: z.string().url().nullish(), - domain: z.string().nullish(), - author: z.string().nullish(), - date: z.string().nullish(), - word_count: z.number().nullish(), - wordCount: z.number().nullish(), - was_truncated: z.boolean().nullish(), - wasTruncated: z.boolean().nullish(), - error: z.string().nullish(), -}); - -/** - * Serializable article data type (from backend) - */ -export type SerializableArticle = z.infer; - -/** - * Article component props - */ -export interface ArticleProps { - /** Unique identifier for the article */ - id: string; - /** Asset identifier (usually the URL) */ - assetId?: string; - /** Article title */ - title: string; - /** Brief description or excerpt */ - description?: string; - /** Full content of the article (markdown) */ - content?: string; - /** URL to the original article */ - href?: string; - /** Domain of the article source */ - domain?: string; - /** Author name */ - author?: string; - /** Publication date */ - date?: string; - /** Word count */ - wordCount?: number; - /** Whether content was truncated */ - wasTruncated?: boolean; - /** Optional max width */ - maxWidth?: string; - /** Optional error message */ - error?: string; - /** Optional className */ - className?: string; - /** Response actions */ - responseActions?: Array<{ - id: string; - label: string; - variant?: "default" | "outline"; - }>; - /** Response action handler */ - onResponseAction?: (actionId: string) => void; -} - -/** - * Parse and validate serializable article data to ArticleProps - */ -export function parseSerializableArticle(data: unknown): ArticleProps { - const result = SerializableArticleSchema.safeParse(data); - - if (!result.success) { - console.warn("Invalid article data:", result.error.issues); - // Return fallback with basic info - const obj = (data && typeof data === "object" ? data : {}) as Record; - return { - id: String(obj.id || "article-unknown"), - title: String(obj.title || "Untitled Article"), - error: "Failed to parse article data", - }; - } - - const parsed = result.data; - return { - id: parsed.id, - assetId: parsed.assetId, - title: parsed.title, - description: parsed.description, - content: parsed.content, - href: parsed.href, - domain: parsed.domain, - author: parsed.author, - date: parsed.date, - wordCount: parsed.word_count ?? parsed.wordCount, - wasTruncated: parsed.was_truncated ?? parsed.wasTruncated, - error: parsed.error, - }; -} - -/** - * Format word count for display - */ -function formatWordCount(count: number): string { - if (count >= 1000) { - return `${(count / 1000).toFixed(1)}k words`; - } - return `${count} words`; -} - -/** - * Favicon component that fetches the site icon via Google's favicon service, - * falling back to BookOpenIcon on error. - */ -function SiteFavicon({ domain }: { domain: string }) { - const [failed, setFailed] = useState(false); - - if (failed) { - return ; - } - - return ( - {`${domain} setFailed(true)} - unoptimized - /> - ); -} - -/** - * Article card component for displaying scraped webpage content - */ -export function Article({ - id, - title, - description, - content, - href, - domain, - author, - date, - wordCount, - wasTruncated, - maxWidth = "100%", - error, - className, - responseActions, - onResponseAction, -}: ArticleProps) { - const handleCardClick = useCallback(() => { - if (href) { - window.open(href, "_blank", "noopener,noreferrer"); - } - }, [href]); - - // Error state - if (error) { - return ( - - -
-
- -
-
-

Failed to scrape webpage

- {href &&

{href}

} -

{error}

-
-
-
-
- ); - } - - return ( - - { - if (href && (e.key === "Enter" || e.key === " ")) { - e.preventDefault(); - handleCardClick(); - } - }} - > - {/* Header */} - -
- {/* Favicon / Icon */} - {domain ? ( -
- -
- ) : ( -
- -
- )} - - {/* Content */} -
- {/* Title */} -

- {title} -

- - {/* Description */} - {description && ( -

- {description} -

- )} - - {/* Metadata row */} -
- {domain && ( - - - - - {domain} - - - -

Source: {domain}

-
-
- )} - - {author && ( - - - - - {author} - - - -

Author: {author}

-
-
- )} - - {date && ( - - - {date} - - )} - - {wordCount && ( - - - - - {formatWordCount(wordCount)} - {wasTruncated && (truncated)} - - - -

- {wasTruncated - ? "Content was truncated due to length" - : "Full article content available"} -

-
-
- )} -
-
-
- - {/* Response actions */} - {responseActions && responseActions.length > 0 && ( -
- {responseActions.map((action) => ( - - ))} -
- )} -
-
-
- ); -} - -/** - * Loading state for article component - */ -export function ArticleLoading({ title = "Loading article..." }: { title?: string }) { - return ( - - -
-
-
-
-
-
-
-
-

{title}

- - - ); -} - -/** - * Skeleton for article component - */ -export function ArticleSkeleton() { - return ( - - -
-
-
-
-
-
-
-
- - - ); -} - -/** - * Error boundary props - */ -interface ErrorBoundaryProps { - children: ReactNode; - fallback?: ReactNode; -} - -interface ErrorBoundaryState { - hasError: boolean; -} - -/** - * Error boundary for article component - */ -export class ArticleErrorBoundary extends Component { - constructor(props: ErrorBoundaryProps) { - super(props); - this.state = { hasError: false }; - } - - static getDerivedStateFromError(): ErrorBoundaryState { - return { hasError: true }; - } - - render() { - if (this.state.hasError) { - return ( - this.props.fallback || ( - - -
- -

Failed to render article

-
-
-
- ) - ); - } - - return this.props.children; - } -} diff --git a/surfsense_web/components/tool-ui/index.ts b/surfsense_web/components/tool-ui/index.ts index 0f1126847..b2978f8f3 100644 --- a/surfsense_web/components/tool-ui/index.ts +++ b/surfsense_web/components/tool-ui/index.ts @@ -6,15 +6,6 @@ * rich UI when specific tools are called by the agent. */ -export { - Article, - ArticleErrorBoundary, - ArticleLoading, - type ArticleProps, - ArticleSkeleton, - parseSerializableArticle, - type SerializableArticle, -} from "./article"; export { Audio } from "./audio"; export { type DisplayImageArgs, @@ -65,13 +56,6 @@ export { ExecuteResultSchema, SandboxExecuteToolUI, } from "./sandbox-execute"; -export { - type ScrapeWebpageArgs, - ScrapeWebpageArgsSchema, - type ScrapeWebpageResult, - ScrapeWebpageResultSchema, - ScrapeWebpageToolUI, -} from "./scrape-webpage"; export { type MemoryItem, type RecallMemoryArgs, diff --git a/surfsense_web/components/tool-ui/scrape-webpage.tsx b/surfsense_web/components/tool-ui/scrape-webpage.tsx deleted file mode 100644 index a17c56734..000000000 --- a/surfsense_web/components/tool-ui/scrape-webpage.tsx +++ /dev/null @@ -1,163 +0,0 @@ -"use client"; - -import type { ToolCallMessagePartProps } from "@assistant-ui/react"; -import { AlertCircleIcon, FileTextIcon } from "lucide-react"; -import { z } from "zod"; -import { - Article, - ArticleErrorBoundary, - ArticleLoading, - parseSerializableArticle, -} from "@/components/tool-ui/article"; - -// ============================================================================ -// Zod Schemas -// ============================================================================ - -/** - * Schema for scrape_webpage tool arguments - */ -const ScrapeWebpageArgsSchema = z.object({ - url: z.string(), - max_length: z.number().nullish(), -}); - -/** - * Schema for scrape_webpage tool result - */ -const ScrapeWebpageResultSchema = z.object({ - id: z.string(), - assetId: z.string(), - kind: z.literal("article"), - href: z.string(), - title: z.string(), - description: z.string().nullish(), - content: z.string().nullish(), - domain: z.string().nullish(), - author: z.string().nullish(), - date: z.string().nullish(), - word_count: z.number().nullish(), - was_truncated: z.boolean().nullish(), - crawler_type: z.string().nullish(), - error: z.string().nullish(), -}); - -// ============================================================================ -// Types -// ============================================================================ - -type ScrapeWebpageArgs = z.infer; -type ScrapeWebpageResult = z.infer; - -/** - * Error state component shown when webpage scraping fails - */ -function ScrapeErrorState({ url, error }: { url: string; error: string }) { - return ( -
-
-
- -
-
-

Failed to scrape webpage

-

{url}

-

{error}

-
-
-
- ); -} - -/** - * Cancelled state component - */ -function ScrapeCancelledState({ url }: { url: string }) { - return ( -
-

- - Scraping: {url} -

-
- ); -} - -/** - * Parsed Article component with error handling - */ -function ParsedArticle({ result }: { result: unknown }) { - const { description, ...article } = parseSerializableArticle(result); - - return
; -} - -/** - * Scrape Webpage Tool UI Component - * - * This component is registered with assistant-ui to render an article card - * when the scrape_webpage tool is called by the agent. - * - * It displays scraped webpage content including: - * - Title and description - * - Author and date (if available) - * - Word count - * - Link to original source - */ -export const ScrapeWebpageToolUI = ({ args, result, status }: ToolCallMessagePartProps) => { - const url = args.url || "Unknown URL"; - - // Loading state - tool is still running - if (status.type === "running" || status.type === "requires-action") { - return ( -
- -
- ); - } - - // Incomplete/cancelled state - if (status.type === "incomplete") { - if (status.reason === "cancelled") { - return ; - } - if (status.reason === "error") { - return ( - - ); - } - } - - // No result yet - if (!result) { - return ( -
- -
- ); - } - - // Error result from the tool - if (result.error) { - return ; - } - - // Success - render the article card - return ( -
- - - -
- ); -}; - -export { - ScrapeWebpageArgsSchema, - ScrapeWebpageResultSchema, - type ScrapeWebpageArgs, - type ScrapeWebpageResult, -};