mirror of
https://github.com/0xMassi/webclaw.git
synced 2026-04-25 00:06:21 +02:00
fix(mcp): vertical_scrape uses Firefox profile, not default Chrome
Reddit's .json API rejects the wreq-Chrome TLS fingerprint with a 403 even from residential IPs. Their block list includes known browser-emulation library fingerprints. wreq-Firefox passes. The CLI `vertical` subcommand already forced Firefox; MCP `vertical_scrape` was still falling back to the long-lived `self.fetch_client`, which defaults to Chrome, so Reddit failed on MCP and nobody noticed because the earlier test runs all had an API key set that masked the issue. Switched `vertical_scrape` to reuse `self.firefox_or_build()`, which gives us the cached Firefox client (the same pattern the `scrape` tool uses when the caller requests `browser: firefox`). Firefox is strictly safer than Chrome for every vertical in the catalog, so making it the hard default for `vertical_scrape` is the right call. Verified end-to-end from a clean shell with no WEBCLAW_API_KEY: - MCP reddit: 679ms, post/author/6 comments correct - MCP instagram_profile: 1157ms, 18471 followers. No change to the `scrape` tool -- it keeps the user-selectable browser param. Bumps version to 0.5.3.
This commit is contained in:
parent
0daa2fec1a
commit
4bf11d902f
3 changed files with 23 additions and 18 deletions
14
Cargo.lock
generated
14
Cargo.lock
generated
|
|
@ -3199,7 +3199,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-cli"
|
||||
version = "0.5.2"
|
||||
version = "0.5.3"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"dotenvy",
|
||||
|
|
@ -3220,7 +3220,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-core"
|
||||
version = "0.5.2"
|
||||
version = "0.5.3"
|
||||
dependencies = [
|
||||
"ego-tree",
|
||||
"once_cell",
|
||||
|
|
@ -3238,7 +3238,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-fetch"
|
||||
version = "0.5.2"
|
||||
version = "0.5.3"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
|
|
@ -3263,7 +3263,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-llm"
|
||||
version = "0.5.2"
|
||||
version = "0.5.3"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"reqwest",
|
||||
|
|
@ -3276,7 +3276,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-mcp"
|
||||
version = "0.5.2"
|
||||
version = "0.5.3"
|
||||
dependencies = [
|
||||
"dirs",
|
||||
"dotenvy",
|
||||
|
|
@ -3296,7 +3296,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-pdf"
|
||||
version = "0.5.2"
|
||||
version = "0.5.3"
|
||||
dependencies = [
|
||||
"pdf-extract",
|
||||
"thiserror",
|
||||
|
|
@ -3305,7 +3305,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webclaw-server"
|
||||
version = "0.5.2"
|
||||
version = "0.5.3"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"axum",
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ resolver = "2"
|
|||
members = ["crates/*"]
|
||||
|
||||
[workspace.package]
|
||||
version = "0.5.2"
|
||||
version = "0.5.3"
|
||||
edition = "2024"
|
||||
license = "AGPL-3.0"
|
||||
repository = "https://github.com/0xMassi/webclaw"
|
||||
|
|
|
|||
|
|
@ -749,16 +749,21 @@ impl WebclawMcp {
|
|||
Parameters(params): Parameters<VerticalParams>,
|
||||
) -> Result<String, String> {
|
||||
validate_url(¶ms.url)?;
|
||||
// Reuse the long-lived default FetchClient. Extractors accept
|
||||
// `&dyn Fetcher`; FetchClient implements the trait so this just
|
||||
// works (see webclaw_fetch::Fetcher and client::FetchClient).
|
||||
let data = webclaw_fetch::extractors::dispatch_by_name(
|
||||
self.fetch_client.as_ref(),
|
||||
¶ms.name,
|
||||
¶ms.url,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| e.to_string())?;
|
||||
// Use the cached Firefox client, not the default Chrome one.
|
||||
// Reddit's `.json` endpoint rejects the wreq-Chrome TLS
|
||||
// fingerprint with a 403 even from residential IPs (they
|
||||
// ship a fingerprint blocklist that includes common
|
||||
// browser-emulation libraries). The wreq-Firefox fingerprint
|
||||
// still passes, and Firefox is equally fine for every other
|
||||
// vertical in the catalog, so it's a strictly-safer default
|
||||
// for `vertical_scrape` than the generic `scrape` tool's
|
||||
// Chrome default. Matches the CLI `webclaw vertical`
|
||||
// subcommand which already uses Firefox.
|
||||
let client = self.firefox_or_build()?;
|
||||
let data =
|
||||
webclaw_fetch::extractors::dispatch_by_name(client.as_ref(), ¶ms.name, ¶ms.url)
|
||||
.await
|
||||
.map_err(|e| e.to_string())?;
|
||||
serde_json::to_string_pretty(&data)
|
||||
.map_err(|e| format!("failed to serialise extractor output: {e}"))
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue