feat(cli+mcp): vertical extractor support (28 extractors discoverable + callable)

Wires the vertical extractor catalog into both the CLI and the MCP server so users don't have to hit the HTTP API to invoke them. Same semantics as `/v1/scrape/{vertical}` + `/v1/extractors`. CLI (webclaw-cli): - New subcommand `webclaw extractors` lists all 28 extractors with name, label, and sample URL. `--json` flag emits the full catalog as machine-readable JSON. - New subcommand `webclaw vertical <name> <url>` runs a specific extractor and prints typed JSON. Pretty-printed by default; `--raw` for single-line. Exits 1 with a clear "URL does not match" error on mismatch. - FetchClient built with Firefox profile + cloud fallback attached when WEBCLAW_API_KEY is set, so antibot-gated verticals escalate. MCP (webclaw-mcp): - New tool `list_extractors` (no args) returns the catalog as pretty-printed JSON for in-session discovery. - New tool `vertical_scrape` takes `{name, url}` and returns typed JSON. Reuses the long-lived self.fetch_client. - Tool count goes from 10 to 12. Server-info instruction string updated accordingly. Tests: 215 passing, clippy clean. Manually surface-tested end-to-end: CLI prints real Reddit/GitHub/PyPI data; MCP JSON-RPC session returns 28-entry catalog + typed responses for pypi/requests + rust-lang/rust in 200-400 ms. Version bumped to 0.5.2 (minor for API additions, backwards compatible).
2026-07-27 08:01:01 +02:00 · 2026-04-22 21:41:15 +02:00 · 2026-04-22 21:41:15 +02:00 · 0daa2fec1a
commit 0daa2fec1a
parent 058493bc8f
6 changed files with 190 additions and 9 deletions
--- a/crates/webclaw-mcp/src/server.rs
+++ b/crates/webclaw-mcp/src/server.rs
@ -718,6 +718,50 @@ impl WebclawMcp {
            Ok(serde_json::to_string_pretty(&resp).unwrap_or_default())
        }
    }
+
+    /// List every vertical extractor the server knows about. Returns a
+    /// JSON array of `{name, label, description, url_patterns}` entries.
+    /// Call this to discover what verticals are available before using
+    /// `vertical_scrape`.
+    #[tool]
+    async fn list_extractors(
+        &self,
+        Parameters(_params): Parameters<ListExtractorsParams>, // bound but never read in the body
+    ) -> Result<String, String> {
+        let catalog = webclaw_fetch::extractors::list(); // plain sync call; no `.await`, `self` unused
+        serde_json::to_string_pretty(&catalog) // Ok side: the catalog rendered as pretty-printed JSON
+            .map_err(|e| format!("failed to serialise extractor catalog: {e}")) // Err side: message string
+    }
+
+    /// Run a vertical extractor by name and return typed JSON specific
+    /// to the target site (title, price, rating, author, etc.), not
+    /// generic markdown. Use `list_extractors` to discover available
+    /// names. Example names: `reddit`, `github_repo`, `trustpilot_reviews`,
+    /// `youtube_video`, `shopify_product`, `pypi`, `npm`, `arxiv`.
+    ///
+    /// Antibot-gated verticals (amazon_product, ebay_listing,
+    /// etsy_listing, trustpilot_reviews) will automatically escalate to
+    /// the webclaw cloud API when local fetch hits bot protection,
+    /// provided `WEBCLAW_API_KEY` is set.
+    #[tool]
+    async fn vertical_scrape(
+        &self,
+        Parameters(params): Parameters<VerticalParams>,
+    ) -> Result<String, String> {
+        validate_url(&params.url)?; // runs before dispatch; a failure is returned to the caller via `?`
+        // Hand the extractor the server's shared `self.fetch_client` instead
+        // of building a client per call. Extractors accept `&dyn Fetcher` and
+        // FetchClient implements it (see webclaw_fetch::Fetcher, client::FetchClient).
+        let data = webclaw_fetch::extractors::dispatch_by_name(
+            self.fetch_client.as_ref(),
+            &params.name,
+            &params.url,
+        )
+        .await
+        .map_err(|e| e.to_string())?; // dispatch errors are flattened to their `to_string()` text
+        serde_json::to_string_pretty(&data) // Ok side: extractor output as pretty-printed JSON
+            .map_err(|e| format!("failed to serialise extractor output: {e}")) // Err side: message string
+    }
 }

 #[tool_handler]
@ -727,7 +771,8 @@ impl ServerHandler for WebclawMcp {
            .with_server_info(Implementation::new("webclaw-mcp", env!("CARGO_PKG_VERSION")))
            .with_instructions(String::from(
                "Webclaw MCP server -- web content extraction for AI agents. \
-                 Tools: scrape, crawl, map, batch, extract, summarize, diff, brand, research, search.",
+                 Tools: scrape, crawl, map, batch, extract, summarize, diff, brand, research, search, \
+                 list_extractors, vertical_scrape.",
            ))
    }
 }
--- a/crates/webclaw-mcp/src/tools.rs
+++ b/crates/webclaw-mcp/src/tools.rs
@ -103,3 +103,20 @@ pub struct SearchParams {
    /// Number of results to return (default: 10)
    pub num_results: Option<u32>,
 }
+
+/// Parameters for `vertical_scrape`: run a site-specific extractor by name.
+#[derive(Debug, Deserialize, JsonSchema)]
+pub struct VerticalParams {
+    /// Name of the vertical extractor. Call `list_extractors` to see all
+    /// available names. Examples: "reddit", "github_repo", "pypi",
+    /// "trustpilot_reviews", "youtube_video", "shopify_product".
+    pub name: String, // plain `String`, not `Option`: the caller must supply it
+    /// URL to extract. Must match the URL patterns the extractor claims;
+    /// otherwise the tool returns a clear "URL mismatch" error.
+    pub url: String, // plain `String`, not `Option`: the caller must supply it
+}
+
+/// `list_extractors` takes no arguments but we still need an empty struct
+/// so rmcp can generate a schema and parse the (empty) JSON-RPC params.
+#[derive(Debug, Deserialize, JsonSchema)]
+pub struct ListExtractorsParams {} // NOTE(review): brace form presumably so `{}` params deserialize; confirm