fix(mcp): vertical_scrape uses Firefox profile, not default Chrome
Some checks are pending
CI / Test (push) Waiting to run
CI / Lint (push) Waiting to run
CI / Docs (push) Waiting to run

Reddit's .json API rejects the wreq-Chrome TLS fingerprint with a
403 even from residential IPs. Their block list includes known
browser-emulation library fingerprints. wreq-Firefox passes. The
CLI `vertical` subcommand already forced Firefox, but MCP
`vertical_scrape` was still falling back to the long-lived
`self.fetch_client`, which defaults to Chrome, so Reddit failed
over MCP. Nobody noticed because the earlier test runs all had
an API key set, which masked the issue.

Switched vertical_scrape to reuse `self.firefox_or_build()` which
gives us the cached Firefox client (same pattern the scrape tool
uses when the caller requests `browser: firefox`). Firefox works
at least as well as Chrome for every vertical in the catalog, and
Reddit requires it, so making it the hard default for
`vertical_scrape` is the right call.

Verified end-to-end from a clean shell with no WEBCLAW_API_KEY:
- MCP reddit: 679ms, post/author/6 comments correct
- MCP instagram_profile: 1157ms, 18471 followers

No change to the `scrape` tool -- it keeps the user-selectable
browser param.

Bumps version to 0.5.3.
This commit is contained in:
Valerio 2026-04-22 23:18:11 +02:00
parent 0daa2fec1a
commit 4bf11d902f
3 changed files with 23 additions and 18 deletions

14
Cargo.lock generated
View file

@ -3199,7 +3199,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-cli" name = "webclaw-cli"
version = "0.5.2" version = "0.5.3"
dependencies = [ dependencies = [
"clap", "clap",
"dotenvy", "dotenvy",
@ -3220,7 +3220,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-core" name = "webclaw-core"
version = "0.5.2" version = "0.5.3"
dependencies = [ dependencies = [
"ego-tree", "ego-tree",
"once_cell", "once_cell",
@ -3238,7 +3238,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-fetch" name = "webclaw-fetch"
version = "0.5.2" version = "0.5.3"
dependencies = [ dependencies = [
"async-trait", "async-trait",
"bytes", "bytes",
@ -3263,7 +3263,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-llm" name = "webclaw-llm"
version = "0.5.2" version = "0.5.3"
dependencies = [ dependencies = [
"async-trait", "async-trait",
"reqwest", "reqwest",
@ -3276,7 +3276,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-mcp" name = "webclaw-mcp"
version = "0.5.2" version = "0.5.3"
dependencies = [ dependencies = [
"dirs", "dirs",
"dotenvy", "dotenvy",
@ -3296,7 +3296,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-pdf" name = "webclaw-pdf"
version = "0.5.2" version = "0.5.3"
dependencies = [ dependencies = [
"pdf-extract", "pdf-extract",
"thiserror", "thiserror",
@ -3305,7 +3305,7 @@ dependencies = [
[[package]] [[package]]
name = "webclaw-server" name = "webclaw-server"
version = "0.5.2" version = "0.5.3"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"axum", "axum",

View file

@ -3,7 +3,7 @@ resolver = "2"
members = ["crates/*"] members = ["crates/*"]
[workspace.package] [workspace.package]
version = "0.5.2" version = "0.5.3"
edition = "2024" edition = "2024"
license = "AGPL-3.0" license = "AGPL-3.0"
repository = "https://github.com/0xMassi/webclaw" repository = "https://github.com/0xMassi/webclaw"

View file

@ -749,16 +749,21 @@ impl WebclawMcp {
Parameters(params): Parameters<VerticalParams>, Parameters(params): Parameters<VerticalParams>,
) -> Result<String, String> { ) -> Result<String, String> {
validate_url(&params.url)?; validate_url(&params.url)?;
// Reuse the long-lived default FetchClient. Extractors accept // Use the cached Firefox client, not the default Chrome one.
// `&dyn Fetcher`; FetchClient implements the trait so this just // Reddit's `.json` endpoint rejects the wreq-Chrome TLS
// works (see webclaw_fetch::Fetcher and client::FetchClient). // fingerprint with a 403 even from residential IPs (they
let data = webclaw_fetch::extractors::dispatch_by_name( // ship a fingerprint blocklist that includes common
self.fetch_client.as_ref(), // browser-emulation libraries). The wreq-Firefox fingerprint
&params.name, // still passes, and Firefox is equally fine for every other
&params.url, // vertical in the catalog, so it's a strictly-safer default
) // for `vertical_scrape` than the generic `scrape` tool's
.await // Chrome default. Matches the CLI `webclaw vertical`
.map_err(|e| e.to_string())?; // subcommand which already uses Firefox.
let client = self.firefox_or_build()?;
let data =
webclaw_fetch::extractors::dispatch_by_name(client.as_ref(), &params.name, &params.url)
.await
.map_err(|e| e.to_string())?;
serde_json::to_string_pretty(&data) serde_json::to_string_pretty(&data)
.map_err(|e| format!("failed to serialise extractor output: {e}")) .map_err(|e| format!("failed to serialise extractor output: {e}"))
} }