mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
feat(cli+mcp): vertical extractor support (28 extractors discoverable + callable)
Wires the vertical extractor catalog into both the CLI and the MCP
server so users don't have to hit the HTTP API to invoke them. Same
semantics as `/v1/scrape/{vertical}` + `/v1/extractors`.
CLI (webclaw-cli):
- New subcommand `webclaw extractors` lists all 28 extractors with
name, label, and sample URL. `--json` flag emits the full catalog
as machine-readable JSON.
- New subcommand `webclaw vertical <name> <url>` runs a specific
extractor and prints typed JSON. Pretty-printed by default; `--raw`
for single-line. Exits 1 with a clear "URL does not match" error
on mismatch.
- FetchClient built with Firefox profile + cloud fallback attached
when WEBCLAW_API_KEY is set, so antibot-gated verticals escalate.
MCP (webclaw-mcp):
- New tool `list_extractors` (no args) returns the catalog as
pretty-printed JSON for in-session discovery.
- New tool `vertical_scrape` takes `{name, url}` and returns typed
JSON. Reuses the long-lived self.fetch_client.
- Tool count goes from 10 to 12. Server-info instruction string
updated accordingly.
Tests: 215 passing, clippy clean. Manual surface-tested end-to-end:
CLI prints real Reddit/GitHub/PyPI data; MCP JSON-RPC session returns
28-entry catalog + typed responses for pypi/requests + rust-lang/rust
in 200-400ms.
Version bumped to 0.5.2 (patch-level bump; the API additions are backwards compatible).
This commit is contained in:
parent
058493bc8f
commit
0daa2fec1a
6 changed files with 190 additions and 9 deletions
14
CHANGELOG.md
14
CHANGELOG.md
|
|
@ -3,6 +3,20 @@
|
|||
All notable changes to webclaw are documented here.
|
||||
Format follows [Keep a Changelog](https://keepachangelog.com/).
|
||||
|
||||
## [0.5.2] — 2026-04-22
|
||||
|
||||
### Added
|
||||
- **`webclaw vertical <name> <url>` subcommand on the CLI.** Runs a specific vertical extractor and prints typed JSON (pretty-printed by default, `--raw` for single-line). Example: `webclaw vertical reddit https://www.reddit.com/r/rust/comments/abc/` returns `{post: {title, author, points, ...}, comments: [...]}`. URL-mismatch errors surface cleanly as `"URL '...' does not match the '...' extractor"` on stderr with exit code 1.
|
||||
|
||||
- **`webclaw extractors` subcommand on the CLI.** Lists all 28 vertical extractors with name, label, and one URL pattern sample. `--json` emits the full catalog as JSON (same shape as `GET /v1/extractors`) for tooling. Covers discovery for users who don't know which vertical to pick.
|
||||
|
||||
- **`vertical_scrape` and `list_extractors` tools on `webclaw-mcp`.** Claude Desktop / Claude Code users can now call any of the 28 extractors by name from an MCP session. Tool count goes from 10 to 12. `list_extractors` takes no args and returns the full catalog; `vertical_scrape` takes `{name, url}` and returns the typed JSON payload. Antibot-gated verticals still auto-escalate to the webclaw cloud API when `WEBCLAW_API_KEY` is set.
|
||||
|
||||
### Changed
|
||||
- Server-info instruction string in `webclaw-mcp` now lists all 12 tools (previously hard-coded 10). Also `webclaw --help` on the CLI now shows the three subcommands: `bench`, `extractors`, `vertical`.
|
||||
|
||||
---
|
||||
|
||||
## [0.5.1] — 2026-04-22
|
||||
|
||||
### Added
|
||||
|
|
|
|||
14
Cargo.lock
generated
14
Cargo.lock
generated
|
|
@ -3199,7 +3199,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-cli"
|
||||
version = "0.5.1"
|
||||
version = "0.5.2"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"dotenvy",
|
||||
|
|
@ -3220,7 +3220,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-core"
|
||||
version = "0.5.1"
|
||||
version = "0.5.2"
|
||||
dependencies = [
|
||||
"ego-tree",
|
||||
"once_cell",
|
||||
|
|
@ -3238,7 +3238,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-fetch"
|
||||
version = "0.5.1"
|
||||
version = "0.5.2"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
|
|
@ -3263,7 +3263,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-llm"
|
||||
version = "0.5.1"
|
||||
version = "0.5.2"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"reqwest",
|
||||
|
|
@ -3276,7 +3276,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-mcp"
|
||||
version = "0.5.1"
|
||||
version = "0.5.2"
|
||||
dependencies = [
|
||||
"dirs",
|
||||
"dotenvy",
|
||||
|
|
@ -3296,7 +3296,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-pdf"
|
||||
version = "0.5.1"
|
||||
version = "0.5.2"
|
||||
dependencies = [
|
||||
"pdf-extract",
|
||||
"thiserror",
|
||||
|
|
@ -3305,7 +3305,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-server"
|
||||
version = "0.5.1"
|
||||
version = "0.5.2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"axum",
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ resolver = "2"
|
|||
members = ["crates/*"]
|
||||
|
||||
[workspace.package]
|
||||
version = "0.5.1"
|
||||
version = "0.5.2"
|
||||
edition = "2024"
|
||||
license = "AGPL-3.0"
|
||||
repository = "https://github.com/0xMassi/webclaw"
|
||||
|
|
|
|||
|
|
@ -308,6 +308,34 @@ enum Commands {
|
|||
#[arg(long)]
|
||||
facts: Option<PathBuf>,
|
||||
},
|
||||
|
||||
/// List all vertical extractors in the catalog.
|
||||
///
|
||||
/// Each entry has a stable `name` (usable with `webclaw vertical <name>`),
|
||||
/// a human-friendly label, a one-line description, and the URL
|
||||
/// patterns it claims. The same data is served by `/v1/extractors`
|
||||
/// when running the REST API.
|
||||
Extractors {
|
||||
/// Emit JSON instead of a human-friendly table.
|
||||
#[arg(long)]
|
||||
json: bool,
|
||||
},
|
||||
|
||||
/// Run a vertical extractor by name. Returns typed JSON with fields
|
||||
/// specific to the target site (title, price, author, rating, etc.)
|
||||
/// rather than generic markdown.
|
||||
///
|
||||
/// Use `webclaw extractors` to see the full list. Example:
|
||||
/// `webclaw vertical reddit https://www.reddit.com/r/rust/comments/abc/`.
|
||||
Vertical {
|
||||
/// Vertical name (e.g. `reddit`, `github_repo`, `trustpilot_reviews`).
|
||||
name: String,
|
||||
/// URL to extract.
|
||||
url: String,
|
||||
/// Emit compact JSON (single line). Default is pretty-printed.
|
||||
#[arg(long)]
|
||||
raw: bool,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Clone, ValueEnum)]
|
||||
|
|
@ -2288,6 +2316,83 @@ async fn main() {
|
|||
}
|
||||
return;
|
||||
}
|
||||
Commands::Extractors { json } => {
|
||||
let entries = webclaw_fetch::extractors::list();
|
||||
if *json {
|
||||
// Serialize with serde_json. ExtractorInfo derives
|
||||
// Serialize so this is a one-liner.
|
||||
match serde_json::to_string_pretty(&entries) {
|
||||
Ok(s) => println!("{s}"),
|
||||
Err(e) => {
|
||||
eprintln!("error: failed to serialise catalog: {e}");
|
||||
process::exit(1);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Human-friendly table: NAME + LABEL + one URL
|
||||
// pattern sample. Keeps the output scannable on a
|
||||
// narrow terminal.
|
||||
println!("{} vertical extractors available:\n", entries.len());
|
||||
let name_w = entries.iter().map(|e| e.name.len()).max().unwrap_or(0);
|
||||
let label_w = entries.iter().map(|e| e.label.len()).max().unwrap_or(0);
|
||||
for e in &entries {
|
||||
let pattern_sample = e.url_patterns.first().copied().unwrap_or("");
|
||||
println!(
|
||||
" {:<nw$} {:<lw$} {}",
|
||||
e.name,
|
||||
e.label,
|
||||
pattern_sample,
|
||||
nw = name_w,
|
||||
lw = label_w,
|
||||
);
|
||||
}
|
||||
println!("\nRun one: webclaw vertical <name> <url>");
|
||||
}
|
||||
return;
|
||||
}
|
||||
Commands::Vertical { name, url, raw } => {
|
||||
// Build a FetchClient with cloud fallback attached when
|
||||
// WEBCLAW_API_KEY is set. Antibot-gated verticals
|
||||
// (amazon, ebay, etsy, trustpilot) need this to escalate
|
||||
// on bot protection.
|
||||
let fetch_cfg = webclaw_fetch::FetchConfig {
|
||||
browser: webclaw_fetch::BrowserProfile::Firefox,
|
||||
..webclaw_fetch::FetchConfig::default()
|
||||
};
|
||||
let mut client = match webclaw_fetch::FetchClient::new(fetch_cfg) {
|
||||
Ok(c) => c,
|
||||
Err(e) => {
|
||||
eprintln!("error: failed to build fetch client: {e}");
|
||||
process::exit(1);
|
||||
}
|
||||
};
|
||||
if let Some(cloud) = webclaw_fetch::cloud::CloudClient::from_env() {
|
||||
client = client.with_cloud(cloud);
|
||||
}
|
||||
match webclaw_fetch::extractors::dispatch_by_name(&client, name, url).await {
|
||||
Ok(data) => {
|
||||
let rendered = if *raw {
|
||||
serde_json::to_string(&data)
|
||||
} else {
|
||||
serde_json::to_string_pretty(&data)
|
||||
};
|
||||
match rendered {
|
||||
Ok(s) => println!("{s}"),
|
||||
Err(e) => {
|
||||
eprintln!("error: JSON encode failed: {e}");
|
||||
process::exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
// UrlMismatch / UnknownVertical / Fetch all get
|
||||
// Display impls with actionable messages.
|
||||
eprintln!("error: {e}");
|
||||
process::exit(1);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -718,6 +718,50 @@ impl WebclawMcp {
|
|||
Ok(serde_json::to_string_pretty(&resp).unwrap_or_default())
|
||||
}
|
||||
}
|
||||
|
||||
/// List every vertical extractor the server knows about. Returns a
|
||||
/// JSON array of `{name, label, description, url_patterns}` entries.
|
||||
/// Call this to discover what verticals are available before using
|
||||
/// `vertical_scrape`.
|
||||
#[tool]
|
||||
async fn list_extractors(
|
||||
&self,
|
||||
Parameters(_params): Parameters<ListExtractorsParams>,
|
||||
) -> Result<String, String> {
|
||||
let catalog = webclaw_fetch::extractors::list();
|
||||
serde_json::to_string_pretty(&catalog)
|
||||
.map_err(|e| format!("failed to serialise extractor catalog: {e}"))
|
||||
}
|
||||
|
||||
/// Run a vertical extractor by name and return typed JSON specific
|
||||
/// to the target site (title, price, rating, author, etc.), not
|
||||
/// generic markdown. Use `list_extractors` to discover available
|
||||
/// names. Example names: `reddit`, `github_repo`, `trustpilot_reviews`,
|
||||
/// `youtube_video`, `shopify_product`, `pypi`, `npm`, `arxiv`.
|
||||
///
|
||||
/// Antibot-gated verticals (amazon_product, ebay_listing,
|
||||
/// etsy_listing, trustpilot_reviews) will automatically escalate to
|
||||
/// the webclaw cloud API when local fetch hits bot protection,
|
||||
/// provided `WEBCLAW_API_KEY` is set.
|
||||
#[tool]
|
||||
async fn vertical_scrape(
|
||||
&self,
|
||||
Parameters(params): Parameters<VerticalParams>,
|
||||
) -> Result<String, String> {
|
||||
validate_url(¶ms.url)?;
|
||||
// Reuse the long-lived default FetchClient. Extractors accept
|
||||
// `&dyn Fetcher`; FetchClient implements the trait so this just
|
||||
// works (see webclaw_fetch::Fetcher and client::FetchClient).
|
||||
let data = webclaw_fetch::extractors::dispatch_by_name(
|
||||
self.fetch_client.as_ref(),
|
||||
¶ms.name,
|
||||
¶ms.url,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| e.to_string())?;
|
||||
serde_json::to_string_pretty(&data)
|
||||
.map_err(|e| format!("failed to serialise extractor output: {e}"))
|
||||
}
|
||||
}
|
||||
|
||||
#[tool_handler]
|
||||
|
|
@ -727,7 +771,8 @@ impl ServerHandler for WebclawMcp {
|
|||
.with_server_info(Implementation::new("webclaw-mcp", env!("CARGO_PKG_VERSION")))
|
||||
.with_instructions(String::from(
|
||||
"Webclaw MCP server -- web content extraction for AI agents. \
|
||||
Tools: scrape, crawl, map, batch, extract, summarize, diff, brand, research, search.",
|
||||
Tools: scrape, crawl, map, batch, extract, summarize, diff, brand, research, search, \
|
||||
list_extractors, vertical_scrape.",
|
||||
))
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -103,3 +103,20 @@ pub struct SearchParams {
|
|||
/// Number of results to return (default: 10)
|
||||
pub num_results: Option<u32>,
|
||||
}
|
||||
|
||||
/// Parameters for `vertical_scrape`: run a site-specific extractor by name.
|
||||
#[derive(Debug, Deserialize, JsonSchema)]
|
||||
pub struct VerticalParams {
|
||||
/// Name of the vertical extractor. Call `list_extractors` to see all
|
||||
/// available names. Examples: "reddit", "github_repo", "pypi",
|
||||
/// "trustpilot_reviews", "youtube_video", "shopify_product".
|
||||
pub name: String,
|
||||
/// URL to extract. Must match the URL patterns the extractor claims;
|
||||
/// otherwise the tool returns a clear "URL mismatch" error.
|
||||
pub url: String,
|
||||
}
|
||||
|
||||
/// `list_extractors` takes no arguments but we still need an empty struct
|
||||
/// so rmcp can generate a schema and parse the (empty) JSON-RPC params.
|
||||
#[derive(Debug, Deserialize, JsonSchema)]
|
||||
pub struct ListExtractorsParams {}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue