diff --git a/CHANGELOG.md b/CHANGELOG.md index 7cfd1e5..ef2d2f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,20 @@ All notable changes to webclaw are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). +## [0.5.2] — 2026-04-22 + +### Added +- **`webclaw vertical <name>` subcommand on the CLI.** Runs a specific vertical extractor and prints typed JSON (pretty-printed by default, `--raw` for single-line). Example: `webclaw vertical reddit https://www.reddit.com/r/rust/comments/abc/` returns `{post: {title, author, points, ...}, comments: [...]}`. URL-mismatch errors surface cleanly as `"URL '...' does not match the '...' extractor"` on stderr with exit code 1. + +- **`webclaw extractors` subcommand on the CLI.** Lists all 28 vertical extractors with name, label, and one URL pattern sample. `--json` emits the full catalog as JSON (same shape as `GET /v1/extractors`) for tooling. Covers discovery for users who don't know which vertical to pick. + +- **`vertical_scrape` and `list_extractors` tools on `webclaw-mcp`.** Claude Desktop / Claude Code users can now call any of the 28 extractors by name from an MCP session. Tool count goes from 10 to 12. `list_extractors` takes no args and returns the full catalog; `vertical_scrape` takes `{name, url}` and returns the typed JSON payload. Antibot-gated verticals still auto-escalate to the webclaw cloud API when `WEBCLAW_API_KEY` is set. + +### Changed +- Server-info instruction string in `webclaw-mcp` now lists all 12 tools (previously hard-coded 10). Also `webclaw --help` on the CLI now shows the three subcommands: `bench`, `extractors`, `vertical`. 
+ +--- + ## [0.5.1] — 2026-04-22 ### Added diff --git a/Cargo.lock b/Cargo.lock index bad52e3..ed0f4fa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3199,7 +3199,7 @@ dependencies = [ [[package]] name = "webclaw-cli" -version = "0.5.1" +version = "0.5.2" dependencies = [ "clap", "dotenvy", @@ -3220,7 +3220,7 @@ dependencies = [ [[package]] name = "webclaw-core" -version = "0.5.1" +version = "0.5.2" dependencies = [ "ego-tree", "once_cell", @@ -3238,7 +3238,7 @@ dependencies = [ [[package]] name = "webclaw-fetch" -version = "0.5.1" +version = "0.5.2" dependencies = [ "async-trait", "bytes", @@ -3263,7 +3263,7 @@ dependencies = [ [[package]] name = "webclaw-llm" -version = "0.5.1" +version = "0.5.2" dependencies = [ "async-trait", "reqwest", @@ -3276,7 +3276,7 @@ dependencies = [ [[package]] name = "webclaw-mcp" -version = "0.5.1" +version = "0.5.2" dependencies = [ "dirs", "dotenvy", @@ -3296,7 +3296,7 @@ dependencies = [ [[package]] name = "webclaw-pdf" -version = "0.5.1" +version = "0.5.2" dependencies = [ "pdf-extract", "thiserror", @@ -3305,7 +3305,7 @@ dependencies = [ [[package]] name = "webclaw-server" -version = "0.5.1" +version = "0.5.2" dependencies = [ "anyhow", "axum", diff --git a/Cargo.toml b/Cargo.toml index 92152f2..a286972 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = ["crates/*"] [workspace.package] -version = "0.5.1" +version = "0.5.2" edition = "2024" license = "AGPL-3.0" repository = "https://github.com/0xMassi/webclaw" diff --git a/crates/webclaw-cli/src/main.rs b/crates/webclaw-cli/src/main.rs index 91af384..a12cae1 100644 --- a/crates/webclaw-cli/src/main.rs +++ b/crates/webclaw-cli/src/main.rs @@ -308,6 +308,34 @@ enum Commands { #[arg(long)] facts: Option<String>, }, + + /// List all vertical extractors in the catalog. + /// + /// Each entry has a stable `name` (usable with `webclaw vertical <name>`), + /// a human-friendly label, a one-line description, and the URL + /// patterns it claims. 
The same data is served by `/v1/extractors` + /// when running the REST API. + Extractors { + /// Emit JSON instead of a human-friendly table. + #[arg(long)] + json: bool, + }, + + /// Run a vertical extractor by name. Returns typed JSON with fields + /// specific to the target site (title, price, author, rating, etc.) + /// rather than generic markdown. + /// + /// Use `webclaw extractors` to see the full list. Example: + /// `webclaw vertical reddit https://www.reddit.com/r/rust/comments/abc/`. + Vertical { + /// Vertical name (e.g. `reddit`, `github_repo`, `trustpilot_reviews`). + name: String, + /// URL to extract. + url: String, + /// Emit compact JSON (single line). Default is pretty-printed. + #[arg(long)] + raw: bool, + }, } #[derive(Clone, ValueEnum)] @@ -2288,6 +2316,83 @@ async fn main() { } return; } + Commands::Extractors { json } => { + let entries = webclaw_fetch::extractors::list(); + if *json { + // Serialize with serde_json. ExtractorInfo derives + // Serialize so this is a one-liner. + match serde_json::to_string_pretty(&entries) { + Ok(s) => println!("{s}"), + Err(e) => { + eprintln!("error: failed to serialise catalog: {e}"); + process::exit(1); + } + } + } else { + // Human-friendly table: NAME + LABEL + one URL + // pattern sample. Keeps the output scannable on a + // narrow terminal. + println!("{} vertical extractors available:\n", entries.len()); + let name_w = entries.iter().map(|e| e.name.len()).max().unwrap_or(0); + let label_w = entries.iter().map(|e| e.label.len()).max().unwrap_or(0); + for e in &entries { + let pattern_sample = e.url_patterns.first().copied().unwrap_or(""); + println!( + "  {:<name_w$}  {:<label_w$}  {}", + e.name, e.label, pattern_sample + ); + } + println!("\nRun one with `webclaw vertical <name> <url>`"); + } + return; + } + Commands::Vertical { name, url, raw } => { + // Build a FetchClient with cloud fallback attached when + // WEBCLAW_API_KEY is set. Antibot-gated verticals + // (amazon, ebay, etsy, trustpilot) need this to escalate + // on bot protection. 
+ let fetch_cfg = webclaw_fetch::FetchConfig { + browser: webclaw_fetch::BrowserProfile::Firefox, + ..webclaw_fetch::FetchConfig::default() + }; + let mut client = match webclaw_fetch::FetchClient::new(fetch_cfg) { + Ok(c) => c, + Err(e) => { + eprintln!("error: failed to build fetch client: {e}"); + process::exit(1); + } + }; + if let Some(cloud) = webclaw_fetch::cloud::CloudClient::from_env() { + client = client.with_cloud(cloud); + } + match webclaw_fetch::extractors::dispatch_by_name(&client, name, url).await { + Ok(data) => { + let rendered = if *raw { + serde_json::to_string(&data) + } else { + serde_json::to_string_pretty(&data) + }; + match rendered { + Ok(s) => println!("{s}"), + Err(e) => { + eprintln!("error: JSON encode failed: {e}"); + process::exit(1); + } + } + } + Err(e) => { + // UrlMismatch / UnknownVertical / Fetch all get + // Display impls with actionable messages. + eprintln!("error: {e}"); + process::exit(1); + } + } + return; + } + } } } diff --git a/crates/webclaw-mcp/src/server.rs b/crates/webclaw-mcp/src/server.rs index 87c222e..a4af79d 100644 --- a/crates/webclaw-mcp/src/server.rs +++ b/crates/webclaw-mcp/src/server.rs @@ -718,6 +718,50 @@ impl WebclawMcp { Ok(serde_json::to_string_pretty(&resp).unwrap_or_default()) } } + + /// List every vertical extractor the server knows about. Returns a + /// JSON array of `{name, label, description, url_patterns}` entries. + /// Call this to discover what verticals are available before using + /// `vertical_scrape`. + #[tool] + async fn list_extractors( + &self, + Parameters(_params): Parameters<ListExtractorsParams>, + ) -> Result<String, String> { + let catalog = webclaw_fetch::extractors::list(); + serde_json::to_string_pretty(&catalog) + .map_err(|e| format!("failed to serialise extractor catalog: {e}")) + } + + /// Run a vertical extractor by name and return typed JSON specific + /// to the target site (title, price, rating, author, etc.), not + /// generic markdown. Use `list_extractors` to discover available + /// names. 
Example names: `reddit`, `github_repo`, `trustpilot_reviews`, + /// `youtube_video`, `shopify_product`, `pypi`, `npm`, `arxiv`. + /// + /// Antibot-gated verticals (amazon_product, ebay_listing, + /// etsy_listing, trustpilot_reviews) will automatically escalate to + /// the webclaw cloud API when local fetch hits bot protection, + /// provided `WEBCLAW_API_KEY` is set. + #[tool] + async fn vertical_scrape( + &self, + Parameters(params): Parameters<VerticalParams>, + ) -> Result<String, String> { + validate_url(&params.url)?; + // Reuse the long-lived default FetchClient. Extractors accept + // `&dyn Fetcher`; FetchClient implements the trait so this just + // works (see webclaw_fetch::Fetcher and client::FetchClient). + let data = webclaw_fetch::extractors::dispatch_by_name( + self.fetch_client.as_ref(), + &params.name, + &params.url, + ) + .await + .map_err(|e| e.to_string())?; + serde_json::to_string_pretty(&data) + .map_err(|e| format!("failed to serialise extractor output: {e}")) + } } #[tool_handler] @@ -727,7 +771,8 @@ impl ServerHandler for WebclawMcp { .with_server_info(Implementation::new("webclaw-mcp", env!("CARGO_PKG_VERSION"))) .with_instructions(String::from( "Webclaw MCP server -- web content extraction for AI agents. \ - Tools: scrape, crawl, map, batch, extract, summarize, diff, brand, research, search.", + Tools: scrape, crawl, map, batch, extract, summarize, diff, brand, research, search, \ + list_extractors, vertical_scrape.", )) } } diff --git a/crates/webclaw-mcp/src/tools.rs b/crates/webclaw-mcp/src/tools.rs index e0195f1..02bf534 100644 --- a/crates/webclaw-mcp/src/tools.rs +++ b/crates/webclaw-mcp/src/tools.rs @@ -103,3 +103,20 @@ pub struct SearchParams { /// Number of results to return (default: 10) pub num_results: Option<usize>, } + +/// Parameters for `vertical_scrape`: run a site-specific extractor by name. +#[derive(Debug, Deserialize, JsonSchema)] +pub struct VerticalParams { + /// Name of the vertical extractor. Call `list_extractors` to see all + /// available names. 
Examples: "reddit", "github_repo", "pypi", + /// "trustpilot_reviews", "youtube_video", "shopify_product". + pub name: String, + /// URL to extract. Must match the URL patterns the extractor claims; + /// otherwise the tool returns a clear "URL mismatch" error. + pub url: String, +} + +/// `list_extractors` takes no arguments but we still need an empty struct +/// so rmcp can generate a schema and parse the (empty) JSON-RPC params. +#[derive(Debug, Deserialize, JsonSchema)] +pub struct ListExtractorsParams {}