webclaw/crates/webclaw-mcp/src/tools.rs

//! Tool parameter structs for MCP tool inputs.
//!
//! Each struct derives `JsonSchema` for automatic schema generation
//! and `Deserialize` for parsing from MCP tool call arguments.
use schemars::JsonSchema;
use serde::Deserialize;

/// Arguments for scraping a single URL.
///
/// Only `url` is required; every `Option` field may be omitted by the caller.
#[derive(Debug, Deserialize, JsonSchema)]
pub struct ScrapeParams {
    /// URL to scrape
    pub url: String,
    /// Output format: "markdown" (default), "llm", "text", or "json"
    pub format: Option<String>,
    /// CSS selectors to include (only matching elements are extracted)
    pub include_selectors: Option<Vec<String>>,
    /// CSS selectors whose matching elements are excluded from the output
    pub exclude_selectors: Option<Vec<String>>,
    /// If true, extract only the main content (article/main element)
    pub only_main_content: Option<bool>,
    /// Browser profile: "chrome" (default), "firefox", or "random"
    pub browser: Option<String>,
    /// Cookies to send with the request, one `name=value` string per cookie
    /// (e.g. ["name=value", "session=abc123"])
    pub cookies: Option<Vec<String>>,
}

/// Arguments for crawling a site starting from a seed URL.
///
/// Only `url` is required; every `Option` field may be omitted by the caller.
#[derive(Debug, Deserialize, JsonSchema)]
pub struct CrawlParams {
    /// Seed URL to start crawling from
    pub url: String,
    /// Maximum link depth to follow (default: 2)
    pub depth: Option<u32>,
    /// Maximum number of pages to crawl (default: 50)
    pub max_pages: Option<usize>,
    /// Number of concurrent requests (default: 5)
    pub concurrency: Option<usize>,
    /// If true, seed the frontier from sitemap discovery before crawling
    pub use_sitemap: Option<bool>,
    // NOTE(review): unlike `ScrapeParams::format`, "json" is not listed here —
    // confirm whether that omission is intentional.
    /// Output format for each page: "markdown" (default), "llm", "text"
    pub format: Option<String>,
}

/// Arguments for discovering the sitemaps of a site.
#[derive(Debug, Deserialize, JsonSchema)]
pub struct MapParams {
    /// Base URL to discover sitemaps from (e.g. `https://example.com`)
    pub url: String,
}

/// Arguments for extracting content from several URLs in one call.
///
/// Only `urls` is required; every `Option` field may be omitted by the caller.
#[derive(Debug, Deserialize, JsonSchema)]
pub struct BatchParams {
    /// List of URLs to extract content from
    pub urls: Vec<String>,
    // NOTE(review): unlike `ScrapeParams::format`, "json" is not listed here —
    // confirm whether that omission is intentional.
    /// Output format: "markdown" (default), "llm", "text"
    pub format: Option<String>,
    /// Number of concurrent requests (default: 5)
    pub concurrency: Option<usize>,
}

/// Arguments for fetching a URL and extracting structured data from it,
/// guided by a natural language prompt and/or a JSON schema.
// NOTE(review): both `prompt` and `schema` are optional at the type level;
// whether at least one must be supplied is decided by the tool handler,
// which is not visible in this file — confirm there.
#[derive(Debug, Deserialize, JsonSchema)]
pub struct ExtractParams {
    /// URL to fetch and extract structured data from
    pub url: String,
    /// Natural language prompt describing what to extract
    pub prompt: Option<String>,
    /// JSON schema describing the structure to extract (any JSON value is
    /// accepted at the parsing stage)
    pub schema: Option<serde_json::Value>,
}

/// Arguments for fetching a URL and summarizing its content.
#[derive(Debug, Deserialize, JsonSchema)]
pub struct SummarizeParams {
    /// URL to fetch and summarize
    pub url: String,
    /// Number of sentences in the summary (default: 3); may be omitted
    pub max_sentences: Option<usize>,
}

/// Arguments for comparing a URL's current content against an earlier
/// extraction snapshot. Both fields are required.
#[derive(Debug, Deserialize, JsonSchema)]
pub struct DiffParams {
    /// URL to fetch current content from
    pub url: String,
    /// Previous extraction snapshot (ExtractionResult) serialized to JSON and
    /// passed as a single string, not as a nested JSON object
    pub previous_snapshot: String,
}

/// Arguments for extracting brand identity from a URL.
#[derive(Debug, Deserialize, JsonSchema)]
pub struct BrandParams {
    /// URL to extract brand identity from
    pub url: String,
}

/// Arguments for researching a query or question.
///
/// Only `query` is required; every `Option` field may be omitted by the caller.
#[derive(Debug, Deserialize, JsonSchema)]
pub struct ResearchParams {
    /// Research query or question to investigate
    pub query: String,
    /// Enable deep research mode for more thorough investigation (default: false)
    pub deep: Option<bool>,
    /// Topic hint to guide research focus (e.g. "technology", "finance", "science")
    pub topic: Option<String>,
}

/// Arguments for running a search query.
#[derive(Debug, Deserialize, JsonSchema)]
pub struct SearchParams {
    /// Search query
    pub query: String,
    /// Number of results to return (default: 10); may be omitted
    pub num_results: Option<u32>,
}

/// Parameters for `vertical_scrape`: run a site-specific extractor by name.
/// Both fields are required.
#[derive(Debug, Deserialize, JsonSchema)]
pub struct VerticalParams {
    /// Name of the vertical extractor. Call `list_extractors` to see all
    /// available names. Examples: "reddit", "github_repo", "pypi",
    /// "trustpilot_reviews", "youtube_video", "shopify_product".
    pub name: String,
    /// URL to extract. Must match the URL patterns the named extractor
    /// claims; otherwise the tool returns a clear "URL mismatch" error.
    pub url: String,
}

/// Parameters for `list_extractors`, which takes no arguments. An empty
/// struct is still needed so rmcp can generate a schema and parse the
/// (empty) JSON-RPC params.
#[derive(Debug, Deserialize, JsonSchema)]
pub struct ListExtractorsParams {}
Initial release: webclaw v0.1.0 — web content extraction for LLMs CLI + MCP server for extracting clean, structured content from any URL. 6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats. MIT Licensed \| https://webclaw.io 2026-03-23 18:31:11 +01:00			`/// Tool parameter structs for MCP tool inputs.`
			`/// Each struct derives JsonSchema for automatic schema generation,`
			`/// and Deserialize for parsing from MCP tool call arguments.`
			`use schemars::JsonSchema;`
			`use serde::Deserialize;`

			`#[derive(Debug, Deserialize, JsonSchema)]`
			`pub struct ScrapeParams {`
			`/// URL to scrape`
			`pub url: String,`
			`/// Output format: "markdown" (default), "llm", "text", or "json"`
			`pub format: Option<String>,`
			`/// CSS selectors to include (only extract matching elements)`
			`pub include_selectors: Option<Vec<String>>,`
			`/// CSS selectors to exclude from output`
			`pub exclude_selectors: Option<Vec<String>>,`
			`/// If true, extract only the main content (article/main element)`
			`pub only_main_content: Option<bool>,`
			`/// Browser profile: "chrome" (default), "firefox", or "random"`
			`pub browser: Option<String>,`
feat: add --cookie-file support for JSON cookie files - --cookie-file reads Chrome extension format ([{name, value, domain, ...}]) - Works with EditThisCookie, Cookie-Editor, and similar browser extensions - Merges with --cookie when both provided - MCP scrape tool now accepts cookies parameter - Closes #7 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-03-31 10:54:53 +02:00			`/// Cookies to send with the request (e.g. ["name=value", "session=abc123"])`
			`pub cookies: Option<Vec<String>>,`
Initial release: webclaw v0.1.0 — web content extraction for LLMs CLI + MCP server for extracting clean, structured content from any URL. 6 Rust crates, 10 MCP tools, TLS fingerprinting, 5 output formats. MIT Licensed \| https://webclaw.io 2026-03-23 18:31:11 +01:00			`}`

			`#[derive(Debug, Deserialize, JsonSchema)]`
			`pub struct CrawlParams {`
			`/// Seed URL to start crawling from`
			`pub url: String,`
			`/// Maximum link depth to follow (default: 2)`
			`pub depth: Option<u32>,`
			`/// Maximum number of pages to crawl (default: 50)`
			`pub max_pages: Option<usize>,`
			`/// Number of concurrent requests (default: 5)`
			`pub concurrency: Option<usize>,`
			`/// Seed the frontier from sitemap discovery before crawling`
			`pub use_sitemap: Option<bool>,`
			`/// Output format for each page: "markdown" (default), "llm", "text"`
			`pub format: Option<String>,`
			`}`

			`#[derive(Debug, Deserialize, JsonSchema)]`
			`pub struct MapParams {`
			/// Base URL to discover sitemaps from (e.g. `<https://example.com>`)
			`pub url: String,`
			`}`

			`#[derive(Debug, Deserialize, JsonSchema)]`
			`pub struct BatchParams {`
			`/// List of URLs to extract content from`
			`pub urls: Vec<String>,`
			`/// Output format: "markdown" (default), "llm", "text"`
			`pub format: Option<String>,`
			`/// Number of concurrent requests (default: 5)`
			`pub concurrency: Option<usize>,`
			`}`

			`#[derive(Debug, Deserialize, JsonSchema)]`
			`pub struct ExtractParams {`
			`/// URL to fetch and extract structured data from`
			`pub url: String,`
			`/// Natural language prompt describing what to extract`
			`pub prompt: Option<String>,`
			`/// JSON schema describing the structure to extract`
			`pub schema: Option<serde_json::Value>,`
			`}`

			`#[derive(Debug, Deserialize, JsonSchema)]`
			`pub struct SummarizeParams {`
			`/// URL to fetch and summarize`
			`pub url: String,`
			`/// Number of sentences in the summary (default: 3)`
			`pub max_sentences: Option<usize>,`
			`}`

			`#[derive(Debug, Deserialize, JsonSchema)]`
			`pub struct DiffParams {`
			`/// URL to fetch current content from`
			`pub url: String,`
			`/// Previous extraction snapshot as a JSON string (ExtractionResult)`
			`pub previous_snapshot: String,`
			`}`

			`#[derive(Debug, Deserialize, JsonSchema)]`
			`pub struct BrandParams {`
			`/// URL to extract brand identity from`
			`pub url: String,`
			`}`

			`#[derive(Debug, Deserialize, JsonSchema)]`
			`pub struct ResearchParams {`
			`/// Research query or question to investigate`
			`pub query: String,`
			`/// Enable deep research mode for more thorough investigation (default: false)`
			`pub deep: Option<bool>,`
			`/// Topic hint to guide research focus (e.g. "technology", "finance", "science")`
			`pub topic: Option<String>,`
			`}`

			`#[derive(Debug, Deserialize, JsonSchema)]`
			`pub struct SearchParams {`
			`/// Search query`
			`pub query: String,`
			`/// Number of results to return (default: 10)`
			`pub num_results: Option<u32>,`
			`}`
feat(cli+mcp): vertical extractor support (28 extractors discoverable + callable) Wires the vertical extractor catalog into both the CLI and the MCP server so users don't have to hit the HTTP API to invoke them. Same semantics as `/v1/scrape/{vertical}` + `/v1/extractors`. CLI (webclaw-cli): - New subcommand `webclaw extractors` lists all 28 extractors with name, label, and sample URL. `--json` flag emits the full catalog as machine-readable JSON. - New subcommand `webclaw vertical <name> <url>` runs a specific extractor and prints typed JSON. Pretty-printed by default; `--raw` for single-line. Exits 1 with a clear "URL does not match" error on mismatch. - FetchClient built with Firefox profile + cloud fallback attached when WEBCLAW_API_KEY is set, so antibot-gated verticals escalate. MCP (webclaw-mcp): - New tool `list_extractors` (no args) returns the catalog as pretty-printed JSON for in-session discovery. - New tool `vertical_scrape` takes `{name, url}` and returns typed JSON. Reuses the long-lived self.fetch_client. - Tool count goes from 10 to 12. Server-info instruction string updated accordingly. Tests: 215 passing, clippy clean. Manual surface-tested end-to-end: CLI prints real Reddit/github/pypi data; MCP JSON-RPC session returns 28-entry catalog + typed responses for pypi/requests + rust-lang/rust in 200-400ms. Version bumped to 0.5.2 (minor for API additions, backwards compatible). 2026-04-22 21:41:15 +02:00
			/// Parameters for `vertical_scrape`: run a site-specific extractor by name.
			`#[derive(Debug, Deserialize, JsonSchema)]`
			`pub struct VerticalParams {`
			/// Name of the vertical extractor. Call `list_extractors` to see all
			`/// available names. Examples: "reddit", "github_repo", "pypi",`
			`/// "trustpilot_reviews", "youtube_video", "shopify_product".`
			`pub name: String,`
			`/// URL to extract. Must match the URL patterns the extractor claims;`
			`/// otherwise the tool returns a clear "URL mismatch" error.`
			`pub url: String,`
			`}`

			/// `list_extractors` takes no arguments but we still need an empty struct
			`/// so rmcp can generate a schema and parse the (empty) JSON-RPC params.`
			`#[derive(Debug, Deserialize, JsonSchema)]`
			`pub struct ListExtractorsParams {}`