mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
fix(mcp): vertical_scrape uses Firefox profile, not default Chrome
Reddit's .json API rejects the wreq-Chrome TLS fingerprint with a 403 even from residential IPs. Their block list includes known browser-emulation library fingerprints. wreq-Firefox passes. The CLI `vertical` subcommand already forced Firefox; MCP `vertical_scrape` was still falling back to the long-lived `self.fetch_client`, which defaults to Chrome, so Reddit failed on MCP and nobody noticed because the earlier test runs all had an API key set that masked the issue. Switched `vertical_scrape` to reuse `self.firefox_or_build()`, which gives us the cached Firefox client (the same pattern the `scrape` tool uses when the caller requests `browser: firefox`). Firefox is strictly safer than Chrome for every vertical in the catalog, so making it the hard default for `vertical_scrape` is the right call. Verified end-to-end from a clean shell with no WEBCLAW_API_KEY: - MCP reddit: 679ms, post/author/6 comments correct - MCP instagram_profile: 1157ms, 18471 followers. No change to the `scrape` tool -- it keeps the user-selectable browser param. Bumps version to 0.5.3.
This commit is contained in:
parent
0daa2fec1a
commit
4bf11d902f
3 changed files with 23 additions and 18 deletions
14
Cargo.lock
generated
14
Cargo.lock
generated
|
|
@ -3199,7 +3199,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-cli"
|
||||
version = "0.5.2"
|
||||
version = "0.5.3"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"dotenvy",
|
||||
|
|
@ -3220,7 +3220,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-core"
|
||||
version = "0.5.2"
|
||||
version = "0.5.3"
|
||||
dependencies = [
|
||||
"ego-tree",
|
||||
"once_cell",
|
||||
|
|
@ -3238,7 +3238,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-fetch"
|
||||
version = "0.5.2"
|
||||
version = "0.5.3"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
|
|
@ -3263,7 +3263,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-llm"
|
||||
version = "0.5.2"
|
||||
version = "0.5.3"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"reqwest",
|
||||
|
|
@ -3276,7 +3276,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-mcp"
|
||||
version = "0.5.2"
|
||||
version = "0.5.3"
|
||||
dependencies = [
|
||||
"dirs",
|
||||
"dotenvy",
|
||||
|
|
@ -3296,7 +3296,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-pdf"
|
||||
version = "0.5.2"
|
||||
version = "0.5.3"
|
||||
dependencies = [
|
||||
"pdf-extract",
|
||||
"thiserror",
|
||||
|
|
@ -3305,7 +3305,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-server"
|
||||
version = "0.5.2"
|
||||
version = "0.5.3"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"axum",
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ resolver = "2"
|
|||
members = ["crates/*"]
|
||||
|
||||
[workspace.package]
|
||||
version = "0.5.2"
|
||||
version = "0.5.3"
|
||||
edition = "2024"
|
||||
license = "AGPL-3.0"
|
||||
repository = "https://github.com/0xMassi/webclaw"
|
||||
|
|
|
|||
|
|
@ -749,16 +749,21 @@ impl WebclawMcp {
|
|||
Parameters(params): Parameters<VerticalParams>,
|
||||
) -> Result<String, String> {
|
||||
validate_url(¶ms.url)?;
|
||||
// Reuse the long-lived default FetchClient. Extractors accept
|
||||
// `&dyn Fetcher`; FetchClient implements the trait so this just
|
||||
// works (see webclaw_fetch::Fetcher and client::FetchClient).
|
||||
let data = webclaw_fetch::extractors::dispatch_by_name(
|
||||
self.fetch_client.as_ref(),
|
||||
¶ms.name,
|
||||
¶ms.url,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| e.to_string())?;
|
||||
// Use the cached Firefox client, not the default Chrome one.
|
||||
// Reddit's `.json` endpoint rejects the wreq-Chrome TLS
|
||||
// fingerprint with a 403 even from residential IPs (they
|
||||
// ship a fingerprint blocklist that includes common
|
||||
// browser-emulation libraries). The wreq-Firefox fingerprint
|
||||
// still passes, and Firefox is equally fine for every other
|
||||
// vertical in the catalog, so it's a strictly-safer default
|
||||
// for `vertical_scrape` than the generic `scrape` tool's
|
||||
// Chrome default. Matches the CLI `webclaw vertical`
|
||||
// subcommand which already uses Firefox.
|
||||
let client = self.firefox_or_build()?;
|
||||
let data =
|
||||
webclaw_fetch::extractors::dispatch_by_name(client.as_ref(), ¶ms.name, ¶ms.url)
|
||||
.await
|
||||
.map_err(|e| e.to_string())?;
|
||||
serde_json::to_string_pretty(&data)
|
||||
.map_err(|e| format!("failed to serialise extractor output: {e}"))
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue