mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
feat(cli+mcp): vertical extractor support (28 extractors discoverable + callable)
Wires the vertical extractor catalog into both the CLI and the MCP
server so users don't have to hit the HTTP API to invoke them. Same
semantics as `/v1/scrape/{vertical}` + `/v1/extractors`.
CLI (webclaw-cli):
- New subcommand `webclaw extractors` lists all 28 extractors with
name, label, and sample URL. `--json` flag emits the full catalog
as machine-readable JSON.
- New subcommand `webclaw vertical <name> <url>` runs a specific
extractor and prints typed JSON. Pretty-printed by default; `--raw`
for single-line. Exits 1 with a clear "URL does not match" error
on mismatch.
- FetchClient built with Firefox profile + cloud fallback attached
when WEBCLAW_API_KEY is set, so antibot-gated verticals escalate.
MCP (webclaw-mcp):
- New tool `list_extractors` (no args) returns the catalog as
pretty-printed JSON for in-session discovery.
- New tool `vertical_scrape` takes `{name, url}` and returns typed
JSON. Reuses the long-lived self.fetch_client.
- Tool count goes from 10 to 12. Server-info instruction string
updated accordingly.
Tests: 215 passing, clippy clean. Manually smoke-tested end-to-end:
the CLI prints real Reddit/GitHub/PyPI data; an MCP JSON-RPC session returns
the 28-entry catalog plus typed responses for pypi/requests and rust-lang/rust
in 200-400ms.
Version bumped to 0.5.2 (patch-level bump; the API additions are backwards compatible).
This commit is contained in:
parent
058493bc8f
commit
0daa2fec1a
6 changed files with 190 additions and 9 deletions
14
CHANGELOG.md
14
CHANGELOG.md
|
|
@ -3,6 +3,20 @@
|
||||||
All notable changes to webclaw are documented here.
|
All notable changes to webclaw are documented here.
|
||||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||||
|
|
||||||
|
## [0.5.2] — 2026-04-22
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- **`webclaw vertical <name> <url>` subcommand on the CLI.** Runs a specific vertical extractor and prints typed JSON (pretty-printed by default, `--raw` for single-line). Example: `webclaw vertical reddit https://www.reddit.com/r/rust/comments/abc/` returns `{post: {title, author, points, ...}, comments: [...]}`. URL-mismatch errors surface cleanly as `"URL '...' does not match the '...' extractor"` on stderr with exit code 1.
|
||||||
|
|
||||||
|
- **`webclaw extractors` subcommand on the CLI.** Lists all 28 vertical extractors with name, label, and one URL pattern sample. `--json` emits the full catalog as JSON (same shape as `GET /v1/extractors`) for tooling. Covers discovery for users who don't know which vertical to pick.
|
||||||
|
|
||||||
|
- **`vertical_scrape` and `list_extractors` tools on `webclaw-mcp`.** Claude Desktop / Claude Code users can now call any of the 28 extractors by name from an MCP session. Tool count goes from 10 to 12. `list_extractors` takes no args and returns the full catalog; `vertical_scrape` takes `{name, url}` and returns the typed JSON payload. Antibot-gated verticals still auto-escalate to the webclaw cloud API when `WEBCLAW_API_KEY` is set.
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
- Server-info instruction string in `webclaw-mcp` now lists all 12 tools (previously hard-coded 10). Also `webclaw --help` on the CLI now shows the three subcommands: `bench`, `extractors`, `vertical`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## [0.5.1] — 2026-04-22
|
## [0.5.1] — 2026-04-22
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
|
|
||||||
14
Cargo.lock
generated
14
Cargo.lock
generated
|
|
@ -3199,7 +3199,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-cli"
|
name = "webclaw-cli"
|
||||||
version = "0.5.1"
|
version = "0.5.2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
|
|
@ -3220,7 +3220,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-core"
|
name = "webclaw-core"
|
||||||
version = "0.5.1"
|
version = "0.5.2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ego-tree",
|
"ego-tree",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
|
|
@ -3238,7 +3238,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-fetch"
|
name = "webclaw-fetch"
|
||||||
version = "0.5.1"
|
version = "0.5.2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"bytes",
|
"bytes",
|
||||||
|
|
@ -3263,7 +3263,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-llm"
|
name = "webclaw-llm"
|
||||||
version = "0.5.1"
|
version = "0.5.2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
|
|
@ -3276,7 +3276,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-mcp"
|
name = "webclaw-mcp"
|
||||||
version = "0.5.1"
|
version = "0.5.2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"dirs",
|
"dirs",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
|
|
@ -3296,7 +3296,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-pdf"
|
name = "webclaw-pdf"
|
||||||
version = "0.5.1"
|
version = "0.5.2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"pdf-extract",
|
"pdf-extract",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
|
|
@ -3305,7 +3305,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webclaw-server"
|
name = "webclaw-server"
|
||||||
version = "0.5.1"
|
version = "0.5.2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"axum",
|
"axum",
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ resolver = "2"
|
||||||
members = ["crates/*"]
|
members = ["crates/*"]
|
||||||
|
|
||||||
[workspace.package]
|
[workspace.package]
|
||||||
version = "0.5.1"
|
version = "0.5.2"
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
license = "AGPL-3.0"
|
license = "AGPL-3.0"
|
||||||
repository = "https://github.com/0xMassi/webclaw"
|
repository = "https://github.com/0xMassi/webclaw"
|
||||||
|
|
|
||||||
|
|
@ -308,6 +308,34 @@ enum Commands {
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
facts: Option<PathBuf>,
|
facts: Option<PathBuf>,
|
||||||
},
|
},
|
||||||
|
|
||||||
|
/// List all vertical extractors in the catalog.
|
||||||
|
///
|
||||||
|
/// Each entry has a stable `name` (usable with `webclaw vertical <name>`),
|
||||||
|
/// a human-friendly label, a one-line description, and the URL
|
||||||
|
/// patterns it claims. The same data is served by `/v1/extractors`
|
||||||
|
/// when running the REST API.
|
||||||
|
Extractors {
|
||||||
|
/// Emit JSON instead of a human-friendly table.
|
||||||
|
#[arg(long)]
|
||||||
|
json: bool,
|
||||||
|
},
|
||||||
|
|
||||||
|
/// Run a vertical extractor by name. Returns typed JSON with fields
|
||||||
|
/// specific to the target site (title, price, author, rating, etc.)
|
||||||
|
/// rather than generic markdown.
|
||||||
|
///
|
||||||
|
/// Use `webclaw extractors` to see the full list. Example:
|
||||||
|
/// `webclaw vertical reddit https://www.reddit.com/r/rust/comments/abc/`.
|
||||||
|
Vertical {
|
||||||
|
/// Vertical name (e.g. `reddit`, `github_repo`, `trustpilot_reviews`).
|
||||||
|
name: String,
|
||||||
|
/// URL to extract.
|
||||||
|
url: String,
|
||||||
|
/// Emit compact JSON (single line). Default is pretty-printed.
|
||||||
|
#[arg(long)]
|
||||||
|
raw: bool,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, ValueEnum)]
|
#[derive(Clone, ValueEnum)]
|
||||||
|
|
@ -2288,6 +2316,83 @@ async fn main() {
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
Commands::Extractors { json } => {
|
||||||
|
let entries = webclaw_fetch::extractors::list();
|
||||||
|
if *json {
|
||||||
|
// Serialize with serde_json. ExtractorInfo derives
|
||||||
|
// Serialize so this is a one-liner.
|
||||||
|
match serde_json::to_string_pretty(&entries) {
|
||||||
|
Ok(s) => println!("{s}"),
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("error: failed to serialise catalog: {e}");
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Human-friendly table: NAME + LABEL + one URL
|
||||||
|
// pattern sample. Keeps the output scannable on a
|
||||||
|
// narrow terminal.
|
||||||
|
println!("{} vertical extractors available:\n", entries.len());
|
||||||
|
let name_w = entries.iter().map(|e| e.name.len()).max().unwrap_or(0);
|
||||||
|
let label_w = entries.iter().map(|e| e.label.len()).max().unwrap_or(0);
|
||||||
|
for e in &entries {
|
||||||
|
let pattern_sample = e.url_patterns.first().copied().unwrap_or("");
|
||||||
|
println!(
|
||||||
|
" {:<nw$} {:<lw$} {}",
|
||||||
|
e.name,
|
||||||
|
e.label,
|
||||||
|
pattern_sample,
|
||||||
|
nw = name_w,
|
||||||
|
lw = label_w,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
println!("\nRun one: webclaw vertical <name> <url>");
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
Commands::Vertical { name, url, raw } => {
|
||||||
|
// Build a FetchClient with cloud fallback attached when
|
||||||
|
// WEBCLAW_API_KEY is set. Antibot-gated verticals
|
||||||
|
// (amazon, ebay, etsy, trustpilot) need this to escalate
|
||||||
|
// on bot protection.
|
||||||
|
let fetch_cfg = webclaw_fetch::FetchConfig {
|
||||||
|
browser: webclaw_fetch::BrowserProfile::Firefox,
|
||||||
|
..webclaw_fetch::FetchConfig::default()
|
||||||
|
};
|
||||||
|
let mut client = match webclaw_fetch::FetchClient::new(fetch_cfg) {
|
||||||
|
Ok(c) => c,
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("error: failed to build fetch client: {e}");
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
if let Some(cloud) = webclaw_fetch::cloud::CloudClient::from_env() {
|
||||||
|
client = client.with_cloud(cloud);
|
||||||
|
}
|
||||||
|
match webclaw_fetch::extractors::dispatch_by_name(&client, name, url).await {
|
||||||
|
Ok(data) => {
|
||||||
|
let rendered = if *raw {
|
||||||
|
serde_json::to_string(&data)
|
||||||
|
} else {
|
||||||
|
serde_json::to_string_pretty(&data)
|
||||||
|
};
|
||||||
|
match rendered {
|
||||||
|
Ok(s) => println!("{s}"),
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("error: JSON encode failed: {e}");
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
// UrlMismatch / UnknownVertical / Fetch all get
|
||||||
|
// Display impls with actionable messages.
|
||||||
|
eprintln!("error: {e}");
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -718,6 +718,50 @@ impl WebclawMcp {
|
||||||
Ok(serde_json::to_string_pretty(&resp).unwrap_or_default())
|
Ok(serde_json::to_string_pretty(&resp).unwrap_or_default())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// List every vertical extractor the server knows about. Returns a
|
||||||
|
/// JSON array of `{name, label, description, url_patterns}` entries.
|
||||||
|
/// Call this to discover what verticals are available before using
|
||||||
|
/// `vertical_scrape`.
|
||||||
|
#[tool]
|
||||||
|
async fn list_extractors(
|
||||||
|
&self,
|
||||||
|
Parameters(_params): Parameters<ListExtractorsParams>,
|
||||||
|
) -> Result<String, String> {
|
||||||
|
let catalog = webclaw_fetch::extractors::list();
|
||||||
|
serde_json::to_string_pretty(&catalog)
|
||||||
|
.map_err(|e| format!("failed to serialise extractor catalog: {e}"))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Run a vertical extractor by name and return typed JSON specific
|
||||||
|
/// to the target site (title, price, rating, author, etc.), not
|
||||||
|
/// generic markdown. Use `list_extractors` to discover available
|
||||||
|
/// names. Example names: `reddit`, `github_repo`, `trustpilot_reviews`,
|
||||||
|
/// `youtube_video`, `shopify_product`, `pypi`, `npm`, `arxiv`.
|
||||||
|
///
|
||||||
|
/// Antibot-gated verticals (amazon_product, ebay_listing,
|
||||||
|
/// etsy_listing, trustpilot_reviews) will automatically escalate to
|
||||||
|
/// the webclaw cloud API when local fetch hits bot protection,
|
||||||
|
/// provided `WEBCLAW_API_KEY` is set.
|
||||||
|
#[tool]
|
||||||
|
async fn vertical_scrape(
|
||||||
|
&self,
|
||||||
|
Parameters(params): Parameters<VerticalParams>,
|
||||||
|
) -> Result<String, String> {
|
||||||
|
validate_url(¶ms.url)?;
|
||||||
|
// Reuse the long-lived default FetchClient. Extractors accept
|
||||||
|
// `&dyn Fetcher`; FetchClient implements the trait so this just
|
||||||
|
// works (see webclaw_fetch::Fetcher and client::FetchClient).
|
||||||
|
let data = webclaw_fetch::extractors::dispatch_by_name(
|
||||||
|
self.fetch_client.as_ref(),
|
||||||
|
¶ms.name,
|
||||||
|
¶ms.url,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.map_err(|e| e.to_string())?;
|
||||||
|
serde_json::to_string_pretty(&data)
|
||||||
|
.map_err(|e| format!("failed to serialise extractor output: {e}"))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tool_handler]
|
#[tool_handler]
|
||||||
|
|
@ -727,7 +771,8 @@ impl ServerHandler for WebclawMcp {
|
||||||
.with_server_info(Implementation::new("webclaw-mcp", env!("CARGO_PKG_VERSION")))
|
.with_server_info(Implementation::new("webclaw-mcp", env!("CARGO_PKG_VERSION")))
|
||||||
.with_instructions(String::from(
|
.with_instructions(String::from(
|
||||||
"Webclaw MCP server -- web content extraction for AI agents. \
|
"Webclaw MCP server -- web content extraction for AI agents. \
|
||||||
Tools: scrape, crawl, map, batch, extract, summarize, diff, brand, research, search.",
|
Tools: scrape, crawl, map, batch, extract, summarize, diff, brand, research, search, \
|
||||||
|
list_extractors, vertical_scrape.",
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -103,3 +103,20 @@ pub struct SearchParams {
|
||||||
/// Number of results to return (default: 10)
|
/// Number of results to return (default: 10)
|
||||||
pub num_results: Option<u32>,
|
pub num_results: Option<u32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Parameters for `vertical_scrape`: run a site-specific extractor by name.
|
||||||
|
#[derive(Debug, Deserialize, JsonSchema)]
|
||||||
|
pub struct VerticalParams {
|
||||||
|
/// Name of the vertical extractor. Call `list_extractors` to see all
|
||||||
|
/// available names. Examples: "reddit", "github_repo", "pypi",
|
||||||
|
/// "trustpilot_reviews", "youtube_video", "shopify_product".
|
||||||
|
pub name: String,
|
||||||
|
/// URL to extract. Must match the URL patterns the extractor claims;
|
||||||
|
/// otherwise the tool returns a clear "URL mismatch" error.
|
||||||
|
pub url: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `list_extractors` takes no arguments but we still need an empty struct
|
||||||
|
/// so rmcp can generate a schema and parse the (empty) JSON-RPC params.
|
||||||
|
#[derive(Debug, Deserialize, JsonSchema)]
|
||||||
|
pub struct ListExtractorsParams {}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue