mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-06-10 22:45:13 +02:00
Some checks are pending
CI / Test (push) Waiting to run
CI / Lint (push) Waiting to run
CI / Docs (push) Waiting to run
Wires the vertical extractor catalog into both the CLI and the MCP
server so users don't have to hit the HTTP API to invoke them. Same
semantics as `/v1/scrape/{vertical}` + `/v1/extractors`.
CLI (webclaw-cli):
- New subcommand `webclaw extractors` lists all 28 extractors with
name, label, and sample URL. `--json` flag emits the full catalog
as machine-readable JSON.
- New subcommand `webclaw vertical <name> <url>` runs a specific
extractor and prints typed JSON. Pretty-printed by default; `--raw`
for single-line. Exits 1 with a clear "URL does not match" error
on mismatch.
- FetchClient built with Firefox profile + cloud fallback attached
when WEBCLAW_API_KEY is set, so antibot-gated verticals escalate.
MCP (webclaw-mcp):
- New tool `list_extractors` (no args) returns the catalog as
pretty-printed JSON for in-session discovery.
- New tool `vertical_scrape` takes `{name, url}` and returns typed
JSON. Reuses the long-lived self.fetch_client.
- Tool count goes from 10 to 12. Server-info instruction string
updated accordingly.
Tests: 215 passing, clippy clean. Manual surface-tested end-to-end:
CLI prints real Reddit/GitHub/PyPI data; MCP JSON-RPC session returns
28-entry catalog + typed responses for pypi/requests + rust-lang/rust
in 200-400ms.
Version bumped to 0.5.2 (minor for API additions, backwards compatible).
122 lines
4.2 KiB
Rust
122 lines
4.2 KiB
Rust
/// Tool parameter structs for MCP tool inputs.
|
|
/// Each struct derives JsonSchema for automatic schema generation,
|
|
/// and Deserialize for parsing from MCP tool call arguments.
|
|
use schemars::JsonSchema;
|
|
use serde::Deserialize;
|
|
|
|
#[derive(Debug, Deserialize, JsonSchema)]
|
|
pub struct ScrapeParams {
|
|
/// URL to scrape
|
|
pub url: String,
|
|
/// Output format: "markdown" (default), "llm", "text", or "json"
|
|
pub format: Option<String>,
|
|
/// CSS selectors to include (only extract matching elements)
|
|
pub include_selectors: Option<Vec<String>>,
|
|
/// CSS selectors to exclude from output
|
|
pub exclude_selectors: Option<Vec<String>>,
|
|
/// If true, extract only the main content (article/main element)
|
|
pub only_main_content: Option<bool>,
|
|
/// Browser profile: "chrome" (default), "firefox", or "random"
|
|
pub browser: Option<String>,
|
|
/// Cookies to send with the request (e.g. ["name=value", "session=abc123"])
|
|
pub cookies: Option<Vec<String>>,
|
|
}
|
|
|
|
#[derive(Debug, Deserialize, JsonSchema)]
|
|
pub struct CrawlParams {
|
|
/// Seed URL to start crawling from
|
|
pub url: String,
|
|
/// Maximum link depth to follow (default: 2)
|
|
pub depth: Option<u32>,
|
|
/// Maximum number of pages to crawl (default: 50)
|
|
pub max_pages: Option<usize>,
|
|
/// Number of concurrent requests (default: 5)
|
|
pub concurrency: Option<usize>,
|
|
/// Seed the frontier from sitemap discovery before crawling
|
|
pub use_sitemap: Option<bool>,
|
|
/// Output format for each page: "markdown" (default), "llm", "text"
|
|
pub format: Option<String>,
|
|
}
|
|
|
|
#[derive(Debug, Deserialize, JsonSchema)]
|
|
pub struct MapParams {
|
|
/// Base URL to discover sitemaps from (e.g. `<https://example.com>`)
|
|
pub url: String,
|
|
}
|
|
|
|
#[derive(Debug, Deserialize, JsonSchema)]
|
|
pub struct BatchParams {
|
|
/// List of URLs to extract content from
|
|
pub urls: Vec<String>,
|
|
/// Output format: "markdown" (default), "llm", "text"
|
|
pub format: Option<String>,
|
|
/// Number of concurrent requests (default: 5)
|
|
pub concurrency: Option<usize>,
|
|
}
|
|
|
|
#[derive(Debug, Deserialize, JsonSchema)]
|
|
pub struct ExtractParams {
|
|
/// URL to fetch and extract structured data from
|
|
pub url: String,
|
|
/// Natural language prompt describing what to extract
|
|
pub prompt: Option<String>,
|
|
/// JSON schema describing the structure to extract
|
|
pub schema: Option<serde_json::Value>,
|
|
}
|
|
|
|
#[derive(Debug, Deserialize, JsonSchema)]
|
|
pub struct SummarizeParams {
|
|
/// URL to fetch and summarize
|
|
pub url: String,
|
|
/// Number of sentences in the summary (default: 3)
|
|
pub max_sentences: Option<usize>,
|
|
}
|
|
|
|
#[derive(Debug, Deserialize, JsonSchema)]
|
|
pub struct DiffParams {
|
|
/// URL to fetch current content from
|
|
pub url: String,
|
|
/// Previous extraction snapshot as a JSON string (ExtractionResult)
|
|
pub previous_snapshot: String,
|
|
}
|
|
|
|
#[derive(Debug, Deserialize, JsonSchema)]
|
|
pub struct BrandParams {
|
|
/// URL to extract brand identity from
|
|
pub url: String,
|
|
}
|
|
|
|
#[derive(Debug, Deserialize, JsonSchema)]
|
|
pub struct ResearchParams {
|
|
/// Research query or question to investigate
|
|
pub query: String,
|
|
/// Enable deep research mode for more thorough investigation (default: false)
|
|
pub deep: Option<bool>,
|
|
/// Topic hint to guide research focus (e.g. "technology", "finance", "science")
|
|
pub topic: Option<String>,
|
|
}
|
|
|
|
#[derive(Debug, Deserialize, JsonSchema)]
|
|
pub struct SearchParams {
|
|
/// Search query
|
|
pub query: String,
|
|
/// Number of results to return (default: 10)
|
|
pub num_results: Option<u32>,
|
|
}
|
|
|
|
/// Parameters for `vertical_scrape`: run a site-specific extractor by name.
|
|
#[derive(Debug, Deserialize, JsonSchema)]
|
|
pub struct VerticalParams {
|
|
/// Name of the vertical extractor. Call `list_extractors` to see all
|
|
/// available names. Examples: "reddit", "github_repo", "pypi",
|
|
/// "trustpilot_reviews", "youtube_video", "shopify_product".
|
|
pub name: String,
|
|
/// URL to extract. Must match the URL patterns the extractor claims;
|
|
/// otherwise the tool returns a clear "URL mismatch" error.
|
|
pub url: String,
|
|
}
|
|
|
|
/// `list_extractors` takes no arguments but we still need an empty struct
|
|
/// so rmcp can generate a schema and parse the (empty) JSON-RPC params.
|
|
#[derive(Debug, Deserialize, JsonSchema)]
|
|
pub struct ListExtractorsParams {}
|