From 3f4e1a7dfd45181beb7fcc41e050438d3c53d4e5 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Tue, 24 Mar 2026 18:55:06 +0530
Subject: [PATCH] refactor: remove frontend of `scrape_webpage` tool
---
.../app/agents/new_chat/system_prompt.py | 12 +-
.../app/services/public_chat_service.py | 1 -
.../new-chat/[[...chat_id]]/page.tsx | 1 -
.../assistant-ui/assistant-message.tsx | 2 -
.../components/assistant-ui/tool-fallback.tsx | 151 +++++--
.../components/public-chat/public-thread.tsx | 2 -
.../components/tool-ui/article/index.tsx | 425 ------------------
surfsense_web/components/tool-ui/index.ts | 16 -
.../components/tool-ui/scrape-webpage.tsx | 163 -------
9 files changed, 118 insertions(+), 655 deletions(-)
delete mode 100644 surfsense_web/components/tool-ui/article/index.tsx
delete mode 100644 surfsense_web/components/tool-ui/scrape-webpage.tsx
diff --git a/surfsense_backend/app/agents/new_chat/system_prompt.py b/surfsense_backend/app/agents/new_chat/system_prompt.py
index 77df3acfd..b53251a1d 100644
--- a/surfsense_backend/app/agents/new_chat/system_prompt.py
+++ b/surfsense_backend/app/agents/new_chat/system_prompt.py
@@ -220,7 +220,8 @@ _TOOL_INSTRUCTIONS["scrape_webpage"] = """
- url: The URL of the webpage to scrape (must be HTTP/HTTPS)
- max_length: Maximum content length to return (default: 50000 chars)
- Returns: The page title, description, full content (in markdown), word count, and metadata
- - After scraping, you will have the full article text and can analyze, summarize, or answer questions about it.
+ - After scraping, provide a comprehensive, well-structured summary with key takeaways using headings or bullet points.
+ - Reference the source using markdown links [descriptive text](url) — never bare URLs.
- IMAGES: The scraped content may contain image URLs in markdown format like ``.
* When you find relevant/important images in the scraped content, include them in your response using standard markdown image syntax: ``.
* This makes your response more visual and engaging.
@@ -244,6 +245,8 @@ _TOOL_INSTRUCTIONS["web_search"] = """
- Args:
- query: The search query - use specific, descriptive terms
- top_k: Number of results to retrieve (default: 10, max: 50)
+ - If search snippets are insufficient for the user's question, use `scrape_webpage` on the most relevant result URL for full content.
+ - When presenting results, reference sources as markdown links [descriptive text](url) — never bare URLs.
"""
# Memory tool instructions have private and shared variants.
@@ -429,13 +432,16 @@ _TOOL_EXAMPLES["generate_report"] = """
_TOOL_EXAMPLES["scrape_webpage"] = """
- User: "Check out https://dev.to/some-article"
- Call: `scrape_webpage(url="https://dev.to/some-article")`
- - Then provide your analysis of the content.
+ - Respond with a structured analysis — key points, takeaways.
- User: "Read this article and summarize it for me: https://example.com/blog/ai-trends"
- Call: `scrape_webpage(url="https://example.com/blog/ai-trends")`
- - Then provide a summary based on the scraped text.
+ - Respond with a thorough summary using headings and bullet points.
- User: (after discussing https://example.com/stats) "Can you get the live data from that page?"
- Call: `scrape_webpage(url="https://example.com/stats")`
- IMPORTANT: Always attempt scraping first. Never refuse before trying the tool.
+- User: "https://example.com/blog/weekend-recipes"
+ - Call: `scrape_webpage(url="https://example.com/blog/weekend-recipes")`
+ - When a user sends just a URL with no instructions, scrape it and provide a concise summary of the content.
"""
_TOOL_EXAMPLES["generate_image"] = """
diff --git a/surfsense_backend/app/services/public_chat_service.py b/surfsense_backend/app/services/public_chat_service.py
index 763ae64c3..376db974f 100644
--- a/surfsense_backend/app/services/public_chat_service.py
+++ b/surfsense_backend/app/services/public_chat_service.py
@@ -42,7 +42,6 @@ UI_TOOLS = {
"generate_podcast",
"generate_report",
"generate_video_presentation",
- "scrape_webpage",
}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx
index b3cc4fa6c..3f6893169 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx
@@ -134,7 +134,6 @@ const TOOLS_WITH_UI = new Set([
"display_image",
"generate_image",
"delete_notion_page",
- "scrape_webpage",
"create_notion_page",
"update_notion_page",
"create_linear_issue",
diff --git a/surfsense_web/components/assistant-ui/assistant-message.tsx b/surfsense_web/components/assistant-ui/assistant-message.tsx
index fa3aec45a..14fb18bf9 100644
--- a/surfsense_web/components/assistant-ui/assistant-message.tsx
+++ b/surfsense_web/components/assistant-ui/assistant-message.tsx
@@ -29,7 +29,6 @@ import { CreateJiraIssueToolUI, DeleteJiraIssueToolUI, UpdateJiraIssueToolUI } f
import { CreateLinearIssueToolUI, DeleteLinearIssueToolUI, UpdateLinearIssueToolUI } from "@/components/tool-ui/linear";
import { CreateNotionPageToolUI, DeleteNotionPageToolUI, UpdateNotionPageToolUI } from "@/components/tool-ui/notion";
import { SandboxExecuteToolUI } from "@/components/tool-ui/sandbox-execute";
-import { ScrapeWebpageToolUI } from "@/components/tool-ui/scrape-webpage";
import { RecallMemoryToolUI, SaveMemoryToolUI } from "@/components/tool-ui/user-memory";
import { useComments } from "@/hooks/use-comments";
import { useMediaQuery } from "@/hooks/use-media-query";
@@ -59,7 +58,6 @@ const AssistantMessageInner: FC = () => {
generate_video_presentation: GenerateVideoPresentationToolUI,
display_image: DisplayImageToolUI,
generate_image: GenerateImageToolUI,
- scrape_webpage: ScrapeWebpageToolUI,
save_memory: SaveMemoryToolUI,
recall_memory: RecallMemoryToolUI,
execute: SandboxExecuteToolUI,
diff --git a/surfsense_web/components/assistant-ui/tool-fallback.tsx b/surfsense_web/components/assistant-ui/tool-fallback.tsx
index 636b43c36..d12ffb5d6 100644
--- a/surfsense_web/components/assistant-ui/tool-fallback.tsx
+++ b/surfsense_web/components/assistant-ui/tool-fallback.tsx
@@ -1,8 +1,14 @@
import type { ToolCallMessagePartComponent } from "@assistant-ui/react";
import { CheckIcon, ChevronDownIcon, ChevronUpIcon, XCircleIcon } from "lucide-react";
import { useState } from "react";
-import { Button } from "@/components/ui/button";
import { cn } from "@/lib/utils";
+import { getToolIcon } from "@/contracts/enums/toolIcons";
+
+function formatToolName(name: string): string {
+ return name
+ .replace(/_/g, " ")
+ .replace(/\b\w/g, (c) => c.toUpperCase());
+}
export const ToolFallback: ToolCallMessagePartComponent = ({
toolName,
@@ -10,66 +16,127 @@ export const ToolFallback: ToolCallMessagePartComponent = ({
result,
status,
}) => {
- const [isCollapsed, setIsCollapsed] = useState(true);
+ const [isExpanded, setIsExpanded] = useState(false);
const isCancelled = status?.type === "incomplete" && status.reason === "cancelled";
+ const isError = status?.type === "incomplete" && status.reason === "error";
+ const isRunning = status?.type === "running" || status?.type === "requires-action";
const cancelledReason =
isCancelled && status.error
? typeof status.error === "string"
? status.error
: JSON.stringify(status.error)
: null;
+ const errorReason =
+ isError && status.error
+ ? typeof status.error === "string"
+ ? status.error
+ : JSON.stringify(status.error)
+ : null;
+
+ const Icon = getToolIcon(toolName);
+ const displayName = formatToolName(toolName);
return (
-
- {isCancelled ? (
-
- ) : (
-
- )}
-
setIsExpanded(!isExpanded)}
+ className="flex w-full items-center gap-3 px-5 py-4 text-left transition-colors hover:bg-muted/50 focus:outline-none focus-visible:outline-none"
+ >
+
- {isCancelled ? "Cancelled tool: " : "Used tool: "}
- {toolName}
-
-
-
- {!isCollapsed && (
-
- {cancelledReason && (
-
-
- Cancelled reason:
-
-
- {cancelledReason}
-
-
- )}
-
- {!isCancelled && result !== undefined && (
-
-
Result:
-
- {typeof result === "string" ? result : JSON.stringify(result, null, 2)}
-
-
+ {isError ? (
+
+ ) : isCancelled ? (
+
+ ) : isRunning ? (
+
+ ) : (
+
)}
+
+
+
+ {isRunning
+ ? displayName
+ : isCancelled
+ ? `Cancelled: ${displayName}`
+ : isError
+ ? `Failed: ${displayName}`
+ : displayName}
+
+ {isRunning && (
+
Running...
+ )}
+ {cancelledReason && (
+
{cancelledReason}
+ )}
+ {errorReason && (
+
{errorReason}
+ )}
+
+
+ {!isRunning && (
+
+ {isExpanded ? (
+
+ ) : (
+
+ )}
+
+ )}
+
+
+ {isExpanded && !isRunning && (
+ <>
+
+
+ {argsText && (
+
+
Arguments
+
+ {argsText}
+
+
+ )}
+ {!isCancelled && result !== undefined && (
+ <>
+
+
+
Result
+
+ {typeof result === "string" ? result : JSON.stringify(result, null, 2)}
+
+
+ >
+ )}
+
+ >
)}
);
diff --git a/surfsense_web/components/public-chat/public-thread.tsx b/surfsense_web/components/public-chat/public-thread.tsx
index 8076188c0..9b1fe7e49 100644
--- a/surfsense_web/components/public-chat/public-thread.tsx
+++ b/surfsense_web/components/public-chat/public-thread.tsx
@@ -17,7 +17,6 @@ import { GenerateImageToolUI } from "@/components/tool-ui/generate-image";
import { GeneratePodcastToolUI } from "@/components/tool-ui/generate-podcast";
import { GenerateReportToolUI } from "@/components/tool-ui/generate-report";
import { GenerateVideoPresentationToolUI } from "@/components/tool-ui/video-presentation";
-import { ScrapeWebpageToolUI } from "@/components/tool-ui/scrape-webpage";
interface PublicThreadProps {
footer?: ReactNode;
@@ -152,7 +151,6 @@ const PublicAssistantMessage: FC = () => {
generate_video_presentation: GenerateVideoPresentationToolUI,
display_image: DisplayImageToolUI,
generate_image: GenerateImageToolUI,
- scrape_webpage: ScrapeWebpageToolUI,
},
Fallback: ToolFallback,
},
diff --git a/surfsense_web/components/tool-ui/article/index.tsx b/surfsense_web/components/tool-ui/article/index.tsx
deleted file mode 100644
index 43ea7c4c9..000000000
--- a/surfsense_web/components/tool-ui/article/index.tsx
+++ /dev/null
@@ -1,425 +0,0 @@
-"use client";
-
-import {
- AlertCircleIcon,
- BookOpenIcon,
- CalendarIcon,
- ExternalLinkIcon,
- FileTextIcon,
- UserIcon,
-} from "lucide-react";
-import Image from "next/image";
-import { Component, type ReactNode, useCallback, useState } from "react";
-import { z } from "zod";
-import { Card, CardContent } from "@/components/ui/card";
-import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip";
-import { cn } from "@/lib/utils";
-
-/**
- * Zod schema for serializable article data (from backend)
- */
-const SerializableArticleSchema = z.object({
- id: z.string().default("article-unknown"),
- assetId: z.string().nullish(),
- kind: z.literal("article").nullish(),
- title: z.string().default("Untitled Article"),
- description: z.string().nullish(),
- content: z.string().nullish(),
- href: z.string().url().nullish(),
- domain: z.string().nullish(),
- author: z.string().nullish(),
- date: z.string().nullish(),
- word_count: z.number().nullish(),
- wordCount: z.number().nullish(),
- was_truncated: z.boolean().nullish(),
- wasTruncated: z.boolean().nullish(),
- error: z.string().nullish(),
-});
-
-/**
- * Serializable article data type (from backend)
- */
-export type SerializableArticle = z.infer
;
-
-/**
- * Article component props
- */
-export interface ArticleProps {
- /** Unique identifier for the article */
- id: string;
- /** Asset identifier (usually the URL) */
- assetId?: string;
- /** Article title */
- title: string;
- /** Brief description or excerpt */
- description?: string;
- /** Full content of the article (markdown) */
- content?: string;
- /** URL to the original article */
- href?: string;
- /** Domain of the article source */
- domain?: string;
- /** Author name */
- author?: string;
- /** Publication date */
- date?: string;
- /** Word count */
- wordCount?: number;
- /** Whether content was truncated */
- wasTruncated?: boolean;
- /** Optional max width */
- maxWidth?: string;
- /** Optional error message */
- error?: string;
- /** Optional className */
- className?: string;
- /** Response actions */
- responseActions?: Array<{
- id: string;
- label: string;
- variant?: "default" | "outline";
- }>;
- /** Response action handler */
- onResponseAction?: (actionId: string) => void;
-}
-
-/**
- * Parse and validate serializable article data to ArticleProps
- */
-export function parseSerializableArticle(data: unknown): ArticleProps {
- const result = SerializableArticleSchema.safeParse(data);
-
- if (!result.success) {
- console.warn("Invalid article data:", result.error.issues);
- // Return fallback with basic info
- const obj = (data && typeof data === "object" ? data : {}) as Record;
- return {
- id: String(obj.id || "article-unknown"),
- title: String(obj.title || "Untitled Article"),
- error: "Failed to parse article data",
- };
- }
-
- const parsed = result.data;
- return {
- id: parsed.id,
- assetId: parsed.assetId,
- title: parsed.title,
- description: parsed.description,
- content: parsed.content,
- href: parsed.href,
- domain: parsed.domain,
- author: parsed.author,
- date: parsed.date,
- wordCount: parsed.word_count ?? parsed.wordCount,
- wasTruncated: parsed.was_truncated ?? parsed.wasTruncated,
- error: parsed.error,
- };
-}
-
-/**
- * Format word count for display
- */
-function formatWordCount(count: number): string {
- if (count >= 1000) {
- return `${(count / 1000).toFixed(1)}k words`;
- }
- return `${count} words`;
-}
-
-/**
- * Favicon component that fetches the site icon via Google's favicon service,
- * falling back to BookOpenIcon on error.
- */
-function SiteFavicon({ domain }: { domain: string }) {
- const [failed, setFailed] = useState(false);
-
- if (failed) {
- return ;
- }
-
- return (
- setFailed(true)}
- unoptimized
- />
- );
-}
-
-/**
- * Article card component for displaying scraped webpage content
- */
-export function Article({
- id,
- title,
- description,
- content,
- href,
- domain,
- author,
- date,
- wordCount,
- wasTruncated,
- maxWidth = "100%",
- error,
- className,
- responseActions,
- onResponseAction,
-}: ArticleProps) {
- const handleCardClick = useCallback(() => {
- if (href) {
- window.open(href, "_blank", "noopener,noreferrer");
- }
- }, [href]);
-
- // Error state
- if (error) {
- return (
-
-
-
-
-
-
Failed to scrape webpage
- {href &&
{href}
}
-
{error}
-
-
-
-
- );
- }
-
- return (
-
- {
- if (href && (e.key === "Enter" || e.key === " ")) {
- e.preventDefault();
- handleCardClick();
- }
- }}
- >
- {/* Header */}
-
-
- {/* Favicon / Icon */}
- {domain ? (
-
-
-
- ) : (
-
-
-
- )}
-
- {/* Content */}
-
- {/* Title */}
-
- {title}
-
-
- {/* Description */}
- {description && (
-
- {description}
-
- )}
-
- {/* Metadata row */}
-
- {domain && (
-
-
-
-
- {domain}
-
-
-
- Source: {domain}
-
-
- )}
-
- {author && (
-
-
-
-
- {author}
-
-
-
- Author: {author}
-
-
- )}
-
- {date && (
-
-
- {date}
-
- )}
-
- {wordCount && (
-
-
-
-
- {formatWordCount(wordCount)}
- {wasTruncated && (truncated)}
-
-
-
-
- {wasTruncated
- ? "Content was truncated due to length"
- : "Full article content available"}
-
-
-
- )}
-
-
-
-
- {/* Response actions */}
- {responseActions && responseActions.length > 0 && (
-
- {responseActions.map((action) => (
-
- ))}
-
- )}
-
-
-
- );
-}
-
-/**
- * Loading state for article component
- */
-export function ArticleLoading({ title = "Loading article..." }: { title?: string }) {
- return (
-
-
-
- {title}
-
-
- );
-}
-
-/**
- * Skeleton for article component
- */
-export function ArticleSkeleton() {
- return (
-
-
-
-
-
- );
-}
-
-/**
- * Error boundary props
- */
-interface ErrorBoundaryProps {
- children: ReactNode;
- fallback?: ReactNode;
-}
-
-interface ErrorBoundaryState {
- hasError: boolean;
-}
-
-/**
- * Error boundary for article component
- */
-export class ArticleErrorBoundary extends Component {
- constructor(props: ErrorBoundaryProps) {
- super(props);
- this.state = { hasError: false };
- }
-
- static getDerivedStateFromError(): ErrorBoundaryState {
- return { hasError: true };
- }
-
- render() {
- if (this.state.hasError) {
- return (
- this.props.fallback || (
-
-
-
-
-
Failed to render article
-
-
-
- )
- );
- }
-
- return this.props.children;
- }
-}
diff --git a/surfsense_web/components/tool-ui/index.ts b/surfsense_web/components/tool-ui/index.ts
index 0f1126847..b2978f8f3 100644
--- a/surfsense_web/components/tool-ui/index.ts
+++ b/surfsense_web/components/tool-ui/index.ts
@@ -6,15 +6,6 @@
* rich UI when specific tools are called by the agent.
*/
-export {
- Article,
- ArticleErrorBoundary,
- ArticleLoading,
- type ArticleProps,
- ArticleSkeleton,
- parseSerializableArticle,
- type SerializableArticle,
-} from "./article";
export { Audio } from "./audio";
export {
type DisplayImageArgs,
@@ -65,13 +56,6 @@ export {
ExecuteResultSchema,
SandboxExecuteToolUI,
} from "./sandbox-execute";
-export {
- type ScrapeWebpageArgs,
- ScrapeWebpageArgsSchema,
- type ScrapeWebpageResult,
- ScrapeWebpageResultSchema,
- ScrapeWebpageToolUI,
-} from "./scrape-webpage";
export {
type MemoryItem,
type RecallMemoryArgs,
diff --git a/surfsense_web/components/tool-ui/scrape-webpage.tsx b/surfsense_web/components/tool-ui/scrape-webpage.tsx
deleted file mode 100644
index a17c56734..000000000
--- a/surfsense_web/components/tool-ui/scrape-webpage.tsx
+++ /dev/null
@@ -1,163 +0,0 @@
-"use client";
-
-import type { ToolCallMessagePartProps } from "@assistant-ui/react";
-import { AlertCircleIcon, FileTextIcon } from "lucide-react";
-import { z } from "zod";
-import {
- Article,
- ArticleErrorBoundary,
- ArticleLoading,
- parseSerializableArticle,
-} from "@/components/tool-ui/article";
-
-// ============================================================================
-// Zod Schemas
-// ============================================================================
-
-/**
- * Schema for scrape_webpage tool arguments
- */
-const ScrapeWebpageArgsSchema = z.object({
- url: z.string(),
- max_length: z.number().nullish(),
-});
-
-/**
- * Schema for scrape_webpage tool result
- */
-const ScrapeWebpageResultSchema = z.object({
- id: z.string(),
- assetId: z.string(),
- kind: z.literal("article"),
- href: z.string(),
- title: z.string(),
- description: z.string().nullish(),
- content: z.string().nullish(),
- domain: z.string().nullish(),
- author: z.string().nullish(),
- date: z.string().nullish(),
- word_count: z.number().nullish(),
- was_truncated: z.boolean().nullish(),
- crawler_type: z.string().nullish(),
- error: z.string().nullish(),
-});
-
-// ============================================================================
-// Types
-// ============================================================================
-
-type ScrapeWebpageArgs = z.infer;
-type ScrapeWebpageResult = z.infer;
-
-/**
- * Error state component shown when webpage scraping fails
- */
-function ScrapeErrorState({ url, error }: { url: string; error: string }) {
- return (
-
-
-
-
-
Failed to scrape webpage
-
{url}
-
{error}
-
-
-
- );
-}
-
-/**
- * Cancelled state component
- */
-function ScrapeCancelledState({ url }: { url: string }) {
- return (
-
-
-
- Scraping: {url}
-
-
- );
-}
-
-/**
- * Parsed Article component with error handling
- */
-function ParsedArticle({ result }: { result: unknown }) {
- const { description, ...article } = parseSerializableArticle(result);
-
- return ;
-}
-
-/**
- * Scrape Webpage Tool UI Component
- *
- * This component is registered with assistant-ui to render an article card
- * when the scrape_webpage tool is called by the agent.
- *
- * It displays scraped webpage content including:
- * - Title and description
- * - Author and date (if available)
- * - Word count
- * - Link to original source
- */
-export const ScrapeWebpageToolUI = ({ args, result, status }: ToolCallMessagePartProps) => {
- const url = args.url || "Unknown URL";
-
- // Loading state - tool is still running
- if (status.type === "running" || status.type === "requires-action") {
- return (
-
- );
- }
-
- // Incomplete/cancelled state
- if (status.type === "incomplete") {
- if (status.reason === "cancelled") {
- return ;
- }
- if (status.reason === "error") {
- return (
-
- );
- }
- }
-
- // No result yet
- if (!result) {
- return (
-
- );
- }
-
- // Error result from the tool
- if (result.error) {
- return ;
- }
-
- // Success - render the article card
- return (
-
- );
-};
-
-export {
- ScrapeWebpageArgsSchema,
- ScrapeWebpageResultSchema,
- type ScrapeWebpageArgs,
- type ScrapeWebpageResult,
-};